diff --git a/.jenkins/caffe2/README.md b/.ci/caffe2/README.md
similarity index 100%
rename from .jenkins/caffe2/README.md
rename to .ci/caffe2/README.md
diff --git a/.jenkins/caffe2/common.sh b/.ci/caffe2/common.sh
similarity index 95%
rename from .jenkins/caffe2/common.sh
rename to .ci/caffe2/common.sh
index 087055536564..e4c7218068e1 100644
--- a/.jenkins/caffe2/common.sh
+++ b/.ci/caffe2/common.sh
@@ -28,7 +28,7 @@ fi
 
 # /usr/local/caffe2 is where the cpp bits are installed to in cmake-only
 # builds. In +python builds the cpp tests are copied to /usr/local/caffe2 so
-# that the test code in .jenkins/test.sh is the same
+# that the test code in .ci/test.sh is the same
 INSTALL_PREFIX="/usr/local/caffe2"
 
 mkdir -p "$gtest_reports_dir" || true
diff --git a/.jenkins/caffe2/test.sh b/.ci/caffe2/test.sh
similarity index 100%
rename from .jenkins/caffe2/test.sh
rename to .ci/caffe2/test.sh
diff --git a/.circleci/docker/README.md b/.ci/docker/README.md
similarity index 100%
rename from .circleci/docker/README.md
rename to .ci/docker/README.md
diff --git a/.circleci/docker/android/AndroidManifest.xml b/.ci/docker/android/AndroidManifest.xml
similarity index 100%
rename from .circleci/docker/android/AndroidManifest.xml
rename to .ci/docker/android/AndroidManifest.xml
diff --git a/.circleci/docker/android/build.gradle b/.ci/docker/android/build.gradle
similarity index 96%
rename from .circleci/docker/android/build.gradle
rename to .ci/docker/android/build.gradle
index 66b936326b72..d7c946719c1d 100644
--- a/.circleci/docker/android/build.gradle
+++ b/.ci/docker/android/build.gradle
@@ -53,7 +53,7 @@ dependencies {
     implementation 'androidx.appcompat:appcompat:1.0.0'
     implementation 'com.facebook.fbjni:fbjni-java-only:0.2.2'
     implementation 'com.google.code.findbugs:jsr305:3.0.1'
-    implementation 'com.facebook.soloader:nativeloader:0.10.4'
+    implementation 'com.facebook.soloader:nativeloader:0.10.5'
 
     implementation 'junit:junit:' + rootProject.junitVersion
     implementation 'androidx.test:core:' + rootProject.coreVersion
diff --git a/.circleci/docker/build.sh b/.ci/docker/build.sh
similarity index 85%
rename from .circleci/docker/build.sh
rename to .ci/docker/build.sh
index f4e2121010d3..914ece6d5bfd 100755
--- a/.circleci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -73,6 +73,9 @@ if [[ "$image" == *cuda* && "$UBUNTU_VERSION" != "22.04" ]]; then
   DOCKERFILE="${OS}-cuda/Dockerfile"
 elif [[ "$image" == *rocm* ]]; then
   DOCKERFILE="${OS}-rocm/Dockerfile"
+elif [[ "$image" == *linter* ]]; then
+  # Use a separate Dockerfile for linter to keep a small image size
+  DOCKERFILE="linter/Dockerfile"
 fi
 
 # CMake 3.18 is needed to support CUDA17 language variant
@@ -97,6 +100,7 @@ case "$image" in
     UCX_COMMIT=${_UCX_COMMIT}
     UCC_COMMIT=${_UCC_COMMIT}
     CONDA_CMAKE=yes
+    TRITON=yes
     ;;
   pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7)
     CUDA_VERSION=11.7.0
@@ -110,17 +114,33 @@ case "$image" in
     UCX_COMMIT=${_UCX_COMMIT}
     UCC_COMMIT=${_UCC_COMMIT}
     CONDA_CMAKE=yes
+    TRITON=yes
+    ;;
+  pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7)
+    CUDA_VERSION=11.8.0
+    CUDNN_VERSION=8
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=7
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
     ;;
   pytorch-linux-focal-py3-clang7-asan)
-    ANACONDA_PYTHON_VERSION=3.7
+    ANACONDA_PYTHON_VERSION=3.9
     CLANG_VERSION=7
     PROTOBUF=yes
     DB=yes
     VISION=yes
     CONDA_CMAKE=yes
+    TRITON=yes
     ;;
   pytorch-linux-focal-py3-clang10-onnx)
-    ANACONDA_PYTHON_VERSION=3.7
+    ANACONDA_PYTHON_VERSION=3.8
     CLANG_VERSION=10
     PROTOBUF=yes
     DB=yes
@@ -137,8 +157,8 @@ case "$image" in
     GRADLE_VERSION=6.8.3
     NINJA_VERSION=1.9.0
     ;;
-  pytorch-linux-bionic-py3.7-clang9)
-    ANACONDA_PYTHON_VERSION=3.7
+  pytorch-linux-bionic-py3.8-clang9)
+    ANACONDA_PYTHON_VERSION=3.8
     CLANG_VERSION=9
     PROTOBUF=yes
     DB=yes
@@ -146,6 +166,18 @@ case "$image" in
     VULKAN_SDK_VERSION=1.2.162.1
     SWIFTSHADER=yes
     CONDA_CMAKE=yes
+    TRITON=yes
+    ;;
+  pytorch-linux-bionic-py3.11-clang9)
+    ANACONDA_PYTHON_VERSION=3.11
+    CLANG_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    VULKAN_SDK_VERSION=1.2.162.1
+    SWIFTSHADER=yes
+    CONDA_CMAKE=yes
+    TRITON=yes
     ;;
   pytorch-linux-bionic-py3.8-gcc9)
     ANACONDA_PYTHON_VERSION=3.8
@@ -154,35 +186,37 @@ case "$image" in
     DB=yes
     VISION=yes
     CONDA_CMAKE=yes
+    TRITON=yes
     ;;
-  pytorch-linux-focal-rocm5.2-py3.8)
+  pytorch-linux-focal-rocm-n-1-py3)
     ANACONDA_PYTHON_VERSION=3.8
     GCC_VERSION=9
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    ROCM_VERSION=5.2
+    ROCM_VERSION=5.3
     NINJA_VERSION=1.9.0
     CONDA_CMAKE=yes
     ;;
-  pytorch-linux-focal-rocm5.3-py3.8)
+  pytorch-linux-focal-rocm-n-py3)
     ANACONDA_PYTHON_VERSION=3.8
     GCC_VERSION=9
     PROTOBUF=yes
     DB=yes
     VISION=yes
-    ROCM_VERSION=5.3
+    ROCM_VERSION=5.4.2
     NINJA_VERSION=1.9.0
     CONDA_CMAKE=yes
     ;;
-  pytorch-linux-focal-py3.7-gcc7)
-    ANACONDA_PYTHON_VERSION=3.7
+  pytorch-linux-focal-py3.8-gcc7)
+    ANACONDA_PYTHON_VERSION=3.8
     GCC_VERSION=7
     PROTOBUF=yes
     DB=yes
     VISION=yes
     KATEX=yes
     CONDA_CMAKE=yes
+    TRITON=yes
     ;;
   pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12)
     ANACONDA_PYTHON_VERSION=3.8
@@ -192,6 +226,7 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
+    TRITON=yes
     ;;
   pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12)
     ANACONDA_PYTHON_VERSION=3.8
@@ -201,6 +236,24 @@ case "$image" in
     PROTOBUF=yes
     DB=yes
     VISION=yes
+    TRITON=yes
+    ;;
+  pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12)
+    ANACONDA_PYTHON_VERSION=3.8
+    CUDA_VERSION=11.8
+    CUDNN_VERSION=8
+    CLANG_VERSION=12
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    TRITON=yes
+    ;;
+  pytorch-linux-focal-linter)
+    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
+    # We will need to update mypy version eventually, but that's for another day. The task
+    # would be to upgrade mypy to 1.0.0 with Python 3.11
+    ANACONDA_PYTHON_VERSION=3.9
+    CONDA_CMAKE=yes
     ;;
   *)
     # Catch-all for builds that are not hardcoded.
@@ -292,6 +345,7 @@ docker build \
        --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
        --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
        --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
+       --build-arg "TRITON=${TRITON}" \
        -f $(dirname ${DOCKERFILE})/${DOCKERFILE_NAME} \
        -t "$tmp_tag" \
        "$@" \
diff --git a/.circleci/docker/build_docker.sh b/.ci/docker/build_docker.sh
similarity index 86%
rename from .circleci/docker/build_docker.sh
rename to .ci/docker/build_docker.sh
index bd3b30e7d50e..c033a7acc022 100755
--- a/.circleci/docker/build_docker.sh
+++ b/.ci/docker/build_docker.sh
@@ -18,7 +18,6 @@ tag="${DOCKER_TAG}"
 
 registry="308535385114.dkr.ecr.us-east-1.amazonaws.com"
 image="${registry}/pytorch/${IMAGE_NAME}"
-ghcr_image="ghcr.io/pytorch/ci-image"
 
 login() {
   aws ecr get-authorization-token --region us-east-1 --output text --query 'authorizationData[].authorizationToken' |
@@ -52,13 +51,6 @@ if [ "${DOCKER_SKIP_PUSH:-true}" = "false" ]; then
   if ! docker manifest inspect "${image}:${tag}" >/dev/null 2>/dev/null; then
     docker push "${image}:${tag}"
   fi
-
-  if [ "${PUSH_GHCR_IMAGE:-}" = "true" ]; then
-    # Push docker image to the ghcr.io
-    echo $GHCR_PAT | docker login ghcr.io -u pytorch --password-stdin
-    docker tag "${image}:${tag}" "${ghcr_image}:${IMAGE_NAME}-${tag}"
-    docker push "${ghcr_image}:${IMAGE_NAME}-${tag}"
-  fi
 fi
 
 if [ -z "${DOCKER_SKIP_S3_UPLOAD:-}" ]; then
diff --git a/.circleci/docker/centos-rocm/Dockerfile b/.ci/docker/centos-rocm/Dockerfile
similarity index 89%
rename from .circleci/docker/centos-rocm/Dockerfile
rename to .ci/docker/centos-rocm/Dockerfile
index af7073f87ad4..3bd2ff66df33 100644
--- a/.circleci/docker/centos-rocm/Dockerfile
+++ b/.ci/docker/centos-rocm/Dockerfile
@@ -17,7 +17,9 @@ RUN bash ./install_base.sh && rm install_base.sh
 # Update CentOS git version
 RUN yum -y remove git
 RUN yum -y remove git-*
-RUN yum -y install https://packages.endpoint.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm
+RUN yum -y install https://packages.endpoint.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm || \
+    (yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm && \
+    sed -i "s/packages.endpoint/packages.endpointdev/" /etc/yum.repos.d/endpoint.repo)
 RUN yum install -y git
 
 # Install devtoolset
@@ -43,8 +45,8 @@ ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
-RUN bash ./install_conda.sh && rm install_conda.sh
-RUN rm /opt/conda/requirements-ci.txt
+COPY ./common/common_utils.sh common_utils.sh
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
 
 # (optional) Install protobuf for ONNX
 ARG PROTOBUF
diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt
new file mode 100644
index 000000000000..d3ca0816018a
--- /dev/null
+++ b/.ci/docker/ci_commit_pins/triton.txt
@@ -0,0 +1 @@
+b8b470bc597c1c5bd03682c09fe3e6b7c53787fd
diff --git a/.ci/docker/common/common_utils.sh b/.ci/docker/common/common_utils.sh
new file mode 100644
index 000000000000..27c1b815a0ea
--- /dev/null
+++ b/.ci/docker/common/common_utils.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Work around bug where devtoolset replaces sudo and breaks it.
+if [ -n "$DEVTOOLSET_VERSION" ]; then
+  export SUDO=/bin/sudo
+else
+  export SUDO=sudo
+fi
+
+as_jenkins() {
+  # NB: unsetting the environment variables works around a conda bug
+  # https://github.com/conda/conda/issues/6576
+  # NB: Pass on PATH and LD_LIBRARY_PATH to sudo invocation
+  # NB: This must be run from a directory that jenkins has access to,
+  # works around https://github.com/conda/conda-package-handling/pull/34
+  $SUDO -E -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $*
+}
+
+conda_install() {
+  # Ensure that the install command don't upgrade/downgrade Python
+  # This should be called as
+  #   conda_install pkg1 pkg2 ... [-c channel]
+  as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
+}
+
+conda_run() {
+  as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $*
+}
+
+pip_install() {
+  as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $*
+}
+
+get_pinned_commit() {
+  cat "${1}".txt
+}
diff --git a/.circleci/docker/common/install_android.sh b/.ci/docker/common/install_android.sh
similarity index 100%
rename from .circleci/docker/common/install_android.sh
rename to .ci/docker/common/install_android.sh
diff --git a/.circleci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh
similarity index 100%
rename from .circleci/docker/common/install_base.sh
rename to .ci/docker/common/install_base.sh
diff --git a/.circleci/docker/common/install_cache.sh b/.ci/docker/common/install_cache.sh
similarity index 100%
rename from .circleci/docker/common/install_cache.sh
rename to .ci/docker/common/install_cache.sh
diff --git a/.circleci/docker/common/install_clang.sh b/.ci/docker/common/install_clang.sh
similarity index 100%
rename from .circleci/docker/common/install_clang.sh
rename to .ci/docker/common/install_clang.sh
diff --git a/.circleci/docker/common/install_cmake.sh b/.ci/docker/common/install_cmake.sh
similarity index 100%
rename from .circleci/docker/common/install_cmake.sh
rename to .ci/docker/common/install_cmake.sh
diff --git a/.circleci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh
similarity index 73%
rename from .circleci/docker/common/install_conda.sh
rename to .ci/docker/common/install_conda.sh
index 64d6f4dd0ee9..4e6efc118180 100755
--- a/.circleci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@@ -24,21 +24,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
   mkdir -p /opt/conda
   chown jenkins:jenkins /opt/conda
 
-  # Work around bug where devtoolset replaces sudo and breaks it.
-  if [ -n "$DEVTOOLSET_VERSION" ]; then
-    SUDO=/bin/sudo
-  else
-    SUDO=sudo
-  fi
-
-  as_jenkins() {
-    # NB: unsetting the environment variables works around a conda bug
-    # https://github.com/conda/conda/issues/6576
-    # NB: Pass on PATH and LD_LIBRARY_PATH to sudo invocation
-    # NB: This must be run from a directory that jenkins has access to,
-    # works around https://github.com/conda/conda-package-handling/pull/34
-    $SUDO -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $*
-  }
+  source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
 
   pushd /tmp
   wget -q "${BASE_URL}/${CONDA_FILE}"
@@ -68,20 +54,13 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
   # Install correct Python version
   as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION"
 
-  conda_install() {
-    # Ensure that the install command don't upgrade/downgrade Python
-    # This should be called as
-    #   conda_install pkg1 pkg2 ... [-c channel]
-    as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
-  }
-
-  pip_install() {
-    as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $*
-  }
-
   # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
-  CONDA_COMMON_DEPS="astunparse pyyaml mkl=2022.0.1 mkl-include=2022.0.1 setuptools six"
-  if [ "$ANACONDA_PYTHON_VERSION" = "3.10" ]; then
+  CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
+  if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ]; then
+    # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
+    # TODO: Stop using `-c malfet`
+    conda_install numpy=1.23.5 ${CONDA_COMMON_DEPS} llvmdev=8.0.0 -c malfet
+  elif [ "$ANACONDA_PYTHON_VERSION" = "3.10" ]; then
     # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
     conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS} llvmdev=8.0.0
   elif [ "$ANACONDA_PYTHON_VERSION" = "3.9" ]; then
@@ -91,8 +70,8 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
     # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
     conda_install numpy=1.18.5 ${CONDA_COMMON_DEPS} llvmdev=8.0.0
   else
-    # Install `typing_extensions` for 3.7
-    conda_install numpy=1.18.5 ${CONDA_COMMON_DEPS} typing_extensions
+    # Install `typing-extensions` for 3.7
+    conda_install numpy=1.18.5 ${CONDA_COMMON_DEPS} typing-extensions
   fi
 
   # Use conda cmake in some cases. Conda cmake will be newer than our supported
diff --git a/.circleci/docker/common/install_cudnn.sh b/.ci/docker/common/install_cudnn.sh
similarity index 67%
rename from .circleci/docker/common/install_cudnn.sh
rename to .ci/docker/common/install_cudnn.sh
index f68fc6946c2e..94d494b07973 100644
--- a/.circleci/docker/common/install_cudnn.sh
+++ b/.ci/docker/common/install_cudnn.sh
@@ -7,8 +7,11 @@ if [[ ${CUDNN_VERSION} == 8 ]]; then
     if [[ ${CUDA_VERSION:0:4} == "11.7" ]]; then
         CUDNN_NAME="cudnn-linux-x86_64-8.5.0.96_cuda11-archive"
         curl --retry 3 -OLs https://ossci-linux.s3.amazonaws.com/${CUDNN_NAME}.tar.xz
+    elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
+        CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
+        curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz
     else
-        curl --retry 3 -OLs  https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/${CUDNN_NAME}.tar.xz
+        curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/${CUDNN_NAME}.tar.xz
     fi
 
     tar xf ${CUDNN_NAME}.tar.xz
diff --git a/.circleci/docker/common/install_db.sh b/.ci/docker/common/install_db.sh
similarity index 100%
rename from .circleci/docker/common/install_db.sh
rename to .ci/docker/common/install_db.sh
diff --git a/.circleci/docker/common/install_devtoolset.sh b/.ci/docker/common/install_devtoolset.sh
similarity index 100%
rename from .circleci/docker/common/install_devtoolset.sh
rename to .ci/docker/common/install_devtoolset.sh
diff --git a/.circleci/docker/common/install_docs_reqs.sh b/.ci/docker/common/install_docs_reqs.sh
similarity index 100%
rename from .circleci/docker/common/install_docs_reqs.sh
rename to .ci/docker/common/install_docs_reqs.sh
diff --git a/.circleci/docker/common/install_gcc.sh b/.ci/docker/common/install_gcc.sh
similarity index 100%
rename from .circleci/docker/common/install_gcc.sh
rename to .ci/docker/common/install_gcc.sh
diff --git a/.circleci/docker/common/install_glibc.sh b/.ci/docker/common/install_glibc.sh
similarity index 100%
rename from .circleci/docker/common/install_glibc.sh
rename to .ci/docker/common/install_glibc.sh
diff --git a/.circleci/docker/common/install_jni.sh b/.ci/docker/common/install_jni.sh
similarity index 100%
rename from .circleci/docker/common/install_jni.sh
rename to .ci/docker/common/install_jni.sh
diff --git a/.circleci/docker/common/install_lcov.sh b/.ci/docker/common/install_lcov.sh
similarity index 100%
rename from .circleci/docker/common/install_lcov.sh
rename to .ci/docker/common/install_lcov.sh
diff --git a/.ci/docker/common/install_linter.sh b/.ci/docker/common/install_linter.sh
new file mode 100644
index 000000000000..a7f008fb735d
--- /dev/null
+++ b/.ci/docker/common/install_linter.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+set -ex
+
+source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
+
+if [ -n "${UBUNTU_VERSION}" ]; then
+  apt update
+  apt-get install -y clang doxygen git graphviz nodejs npm libtinfo5
+fi
+
+# Do shallow clone of PyTorch so that we can init lintrunner in Docker build context
+git clone https://github.com/pytorch/pytorch.git --depth 1
+chown -R jenkins pytorch
+
+pushd pytorch
+# Install all linter dependencies
+pip_install -r requirements.txt
+conda_run lintrunner init
+
+# Cache .lintbin directory as part of the Docker image
+cp -r .lintbin /tmp
+popd
+
+# Node dependencies required by toc linter job
+npm install -g markdown-toc
+
+# Cleaning up
+rm -rf pytorch
diff --git a/.circleci/docker/common/install_ninja.sh b/.ci/docker/common/install_ninja.sh
similarity index 100%
rename from .circleci/docker/common/install_ninja.sh
rename to .ci/docker/common/install_ninja.sh
diff --git a/.circleci/docker/common/install_openmpi.sh b/.ci/docker/common/install_openmpi.sh
similarity index 100%
rename from .circleci/docker/common/install_openmpi.sh
rename to .ci/docker/common/install_openmpi.sh
diff --git a/.circleci/docker/common/install_openssl.sh b/.ci/docker/common/install_openssl.sh
similarity index 100%
rename from .circleci/docker/common/install_openssl.sh
rename to .ci/docker/common/install_openssl.sh
diff --git a/.circleci/docker/common/install_protobuf.sh b/.ci/docker/common/install_protobuf.sh
similarity index 100%
rename from .circleci/docker/common/install_protobuf.sh
rename to .ci/docker/common/install_protobuf.sh
diff --git a/.circleci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh
similarity index 100%
rename from .circleci/docker/common/install_rocm.sh
rename to .ci/docker/common/install_rocm.sh
diff --git a/.circleci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh
similarity index 100%
rename from .circleci/docker/common/install_rocm_magma.sh
rename to .ci/docker/common/install_rocm_magma.sh
diff --git a/.circleci/docker/common/install_swiftshader.sh b/.ci/docker/common/install_swiftshader.sh
similarity index 100%
rename from .circleci/docker/common/install_swiftshader.sh
rename to .ci/docker/common/install_swiftshader.sh
diff --git a/.circleci/docker/common/install_thrift.sh b/.ci/docker/common/install_thrift.sh
similarity index 100%
rename from .circleci/docker/common/install_thrift.sh
rename to .ci/docker/common/install_thrift.sh
diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh
new file mode 100755
index 000000000000..4926b817bd2f
--- /dev/null
+++ b/.ci/docker/common/install_triton.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+set -ex
+
+source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
+
+get_conda_version() {
+  as_jenkins conda list -n py_$ANACONDA_PYTHON_VERSION | grep -w $* | head -n 1 | awk '{print $2}'
+}
+
+conda_reinstall() {
+  as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
+}
+
+# The logic here is copied from .ci/pytorch/common_utils.sh
+TRITON_PINNED_COMMIT=$(get_pinned_commit triton)
+
+apt update
+apt-get install -y gpg-agent
+
+if [ -n "${CONDA_CMAKE}" ]; then
+  # Keep the current cmake and numpy version here, so we can reinstall them later
+  CMAKE_VERSION=$(get_conda_version cmake)
+  NUMPY_VERSION=$(get_conda_version numpy)
+fi
+
+if [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" == "7" ]]; then
+  # Triton needs at least gcc-9 to build
+  apt-get install -y g++-9
+
+  CXX=g++-9 pip_install "git+https://github.com/openai/triton@${TRITON_PINNED_COMMIT}#subdirectory=python"
+elif [ -n "${CLANG_VERSION}" ]; then
+  # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
+  add-apt-repository -y ppa:ubuntu-toolchain-r/test
+  apt-get install -y g++-9
+
+  CXX=g++-9 pip_install "git+https://github.com/openai/triton@${TRITON_PINNED_COMMIT}#subdirectory=python"
+else
+  pip_install "git+https://github.com/openai/triton@${TRITON_PINNED_COMMIT}#subdirectory=python"
+fi
+
+if [ -n "${CONDA_CMAKE}" ]; then
+  # TODO: This is to make sure that the same cmake and numpy version from install conda
+  # script is used. Without this step, the newer cmake version (3.25.2) downloaded by
+  # triton build step via pip will fail to detect conda MKL. Once that issue is fixed,
+  # this can be removed.
+  #
+  # The correct numpy version also needs to be set here because conda claims that it
+  # causes inconsistent environment.  Without this, conda will attempt to install the
+  # latest numpy version, which fails ASAN tests with the following import error: Numba
+  # needs NumPy 1.20 or less.
+  conda_reinstall cmake="${CMAKE_VERSION}"
+  conda_reinstall numpy="${NUMPY_VERSION}"
+fi
diff --git a/.circleci/docker/common/install_ucc.sh b/.ci/docker/common/install_ucc.sh
similarity index 100%
rename from .circleci/docker/common/install_ucc.sh
rename to .ci/docker/common/install_ucc.sh
diff --git a/.circleci/docker/common/install_user.sh b/.ci/docker/common/install_user.sh
similarity index 79%
rename from .circleci/docker/common/install_user.sh
rename to .ci/docker/common/install_user.sh
index 93a436cbfc78..29d69edd3c43 100755
--- a/.circleci/docker/common/install_user.sh
+++ b/.ci/docker/common/install_user.sh
@@ -22,5 +22,12 @@ chown jenkins:jenkins /usr/local
 # TODO: Maybe we shouldn't
 echo 'jenkins ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/jenkins
 
+# Work around bug where devtoolset replaces sudo and breaks it.
+if [ -n "$DEVTOOLSET_VERSION" ]; then
+  SUDO=/bin/sudo
+else
+  SUDO=sudo
+fi
+
 # Test that sudo works
-sudo -u jenkins sudo -v
+$SUDO -u jenkins $SUDO -v
diff --git a/.circleci/docker/common/install_vision.sh b/.ci/docker/common/install_vision.sh
similarity index 100%
rename from .circleci/docker/common/install_vision.sh
rename to .ci/docker/common/install_vision.sh
diff --git a/.circleci/docker/common/install_vulkan_sdk.sh b/.ci/docker/common/install_vulkan_sdk.sh
similarity index 100%
rename from .circleci/docker/common/install_vulkan_sdk.sh
rename to .ci/docker/common/install_vulkan_sdk.sh
diff --git a/.circleci/docker/java/jni.h b/.ci/docker/java/jni.h
similarity index 100%
rename from .circleci/docker/java/jni.h
rename to .ci/docker/java/jni.h
diff --git a/.ci/docker/linter/Dockerfile b/.ci/docker/linter/Dockerfile
new file mode 100644
index 000000000000..968918a3617c
--- /dev/null
+++ b/.ci/docker/linter/Dockerfile
@@ -0,0 +1,34 @@
+ARG UBUNTU_VERSION
+
+FROM ubuntu:${UBUNTU_VERSION}
+
+ARG UBUNTU_VERSION
+
+ENV DEBIAN_FRONTEND noninteractive
+
+# Install common dependencies (so that this step can be cached separately)
+COPY ./common/install_base.sh install_base.sh
+RUN bash ./install_base.sh && rm install_base.sh
+
+# Install user
+COPY ./common/install_user.sh install_user.sh
+RUN bash ./install_user.sh && rm install_user.sh
+
+# Install conda and other packages (e.g., numpy, pytest)
+ARG ANACONDA_PYTHON_VERSION
+ARG CONDA_CMAKE
+ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
+ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+COPY requirements-ci.txt /opt/conda/requirements-ci.txt
+COPY ./common/install_conda.sh install_conda.sh
+COPY ./common/common_utils.sh common_utils.sh
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
+
+# Note that Docker build forbids copying file outside the build context
+COPY ./common/install_linter.sh install_linter.sh
+COPY ./common/common_utils.sh common_utils.sh
+RUN bash ./install_linter.sh
+RUN rm install_linter.sh common_utils.sh
+
+USER jenkins
+CMD ["bash"]
diff --git a/.circleci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
similarity index 94%
rename from .circleci/docker/requirements-ci.txt
rename to .ci/docker/requirements-ci.txt
index 890ead22a740..2196c92fe99a 100644
--- a/.circleci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -36,11 +36,6 @@ flatbuffers==2.0
 #Pinned versions: 2.0
 #test that import:
 
-#future #this breaks linux-bionic-rocm4.5-py3.7
-#Description: compatibility layer between python 2 and python 3
-#Pinned versions:
-#test that import:
-
 hypothesis==5.35.1
 # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
 #Description: advanced library for generating parametrized tests
@@ -52,7 +47,7 @@ junitparser==2.1.1
 #Pinned versions: 2.1.1
 #test that import:
 
-librosa>=0.6.2
+librosa>=0.6.2 ; python_version < "3.11"
 #Description: A python package for music and audio analysis
 #Pinned versions: >=0.6.2
 #test that import: test_spectral_ops.py
@@ -216,6 +211,7 @@ scikit-image
 
 scipy==1.6.3 ; python_version < "3.10"
 scipy==1.8.1 ; python_version == "3.10"
+scipy==1.9.3 ; python_version == "3.11"
 # Pin SciPy because of failing distribution tests (see #60347)
 #Description: scientific python
 #Pinned versions: 1.6.3
@@ -247,3 +243,23 @@ unittest-xml-reporting<=3.2.0,>=2.0.0
 #Description: saves unit test results to xml
 #Pinned versions:
 #test that import:
+
+lintrunner==0.9.2
+#Description: all about linters
+#Pinned versions: 0.9.2
+#test that import:
+
+rockset==1.0.3
+#Description: queries Rockset
+#Pinned versions: 1.0.3
+#test that import:
+
+ghstack==0.7.1
+#Description: ghstack tool
+#Pinned versions: 0.7.1
+#test that import:
+
+jinja2==3.1.2
+#Description: jinja2 template engine
+#Pinned versions: 3.1.2
+#test that import:
diff --git a/.circleci/docker/ubuntu-cuda/Dockerfile b/.ci/docker/ubuntu-cuda/Dockerfile
similarity index 87%
rename from .circleci/docker/ubuntu-cuda/Dockerfile
rename to .ci/docker/ubuntu-cuda/Dockerfile
index b64e3ee39303..0e294838f90f 100644
--- a/.circleci/docker/ubuntu-cuda/Dockerfile
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@@ -29,8 +29,8 @@ ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
 ARG CONDA_CMAKE
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
-RUN bash ./install_conda.sh && rm install_conda.sh
-RUN rm /opt/conda/requirements-ci.txt
+COPY ./common/common_utils.sh common_utils.sh
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
 
 # Install gcc
 ARG GCC_VERSION
@@ -85,6 +85,15 @@ COPY ./common/install_cmake.sh install_cmake.sh
 RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
 RUN rm install_cmake.sh
 
+ARG TRITON
+# Install triton, this needs to be done before sccache because the latter will
+# try to reach out to S3, which docker build runners don't have access
+COPY ./common/install_triton.sh install_triton.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/triton.txt triton.txt
+RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
+RUN rm install_triton.sh common_utils.sh triton.txt
+
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
diff --git a/.circleci/docker/ubuntu-rocm/.gitignore b/.ci/docker/ubuntu-rocm/.gitignore
similarity index 100%
rename from .circleci/docker/ubuntu-rocm/.gitignore
rename to .ci/docker/ubuntu-rocm/.gitignore
diff --git a/.circleci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile
similarity index 95%
rename from .circleci/docker/ubuntu-rocm/Dockerfile
rename to .ci/docker/ubuntu-rocm/Dockerfile
index 3d3cbf7a0502..42956546ee71 100644
--- a/.circleci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@@ -31,8 +31,8 @@ ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
-RUN bash ./install_conda.sh && rm install_conda.sh
-RUN rm /opt/conda/requirements-ci.txt
+COPY ./common/common_utils.sh common_utils.sh
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
 
 # Install gcc
 ARG GCC_VERSION
diff --git a/.circleci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
similarity index 90%
rename from .circleci/docker/ubuntu/Dockerfile
rename to .ci/docker/ubuntu/Dockerfile
index bce7d487941b..fd0e3a4fdfba 100644
--- a/.circleci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -40,8 +40,8 @@ ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
-RUN bash ./install_conda.sh && rm install_conda.sh
-RUN rm /opt/conda/requirements-ci.txt
+COPY ./common/common_utils.sh common_utils.sh
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
 
 # Install gcc
 ARG GCC_VERSION
@@ -134,13 +134,18 @@ ENV OPENSSL_ROOT_DIR /opt/openssl
 ENV OPENSSL_DIR /opt/openssl
 RUN rm install_openssl.sh
 
+ARG TRITON
+# Install triton, this needs to be done before sccache because the latter will
+# try to reach out to S3, which docker build runners don't have access
+COPY ./common/install_triton.sh install_triton.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/triton.txt triton.txt
+RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
+RUN rm install_triton.sh common_utils.sh triton.txt
+
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
-# See https://github.com/pytorch/pytorch/issues/82174
-# TODO(sdym@fb.com):
-# check if this is needed after full off Xenial migration
-ENV CARGO_NET_GIT_FETCH_WITH_CLI true
 RUN bash ./install_cache.sh && rm install_cache.sh
 
 # Add jni.h for java host build
diff --git a/.jenkins/onnx/README.md b/.ci/onnx/README.md
similarity index 100%
rename from .jenkins/onnx/README.md
rename to .ci/onnx/README.md
diff --git a/.jenkins/onnx/common.sh b/.ci/onnx/common.sh
similarity index 100%
rename from .jenkins/onnx/common.sh
rename to .ci/onnx/common.sh
diff --git a/.jenkins/onnx/test.sh b/.ci/onnx/test.sh
similarity index 87%
rename from .jenkins/onnx/test.sh
rename to .ci/onnx/test.sh
index e214ac11eedd..f29188c6fd50 100755
--- a/.jenkins/onnx/test.sh
+++ b/.ci/onnx/test.sh
@@ -52,16 +52,19 @@ $MAYBE_SUDO pip -q uninstall -y coverage
 # CircleCI, so we host a copy on S3 instead
 $MAYBE_SUDO pip -q install attrs==18.1.0 -f https://s3.amazonaws.com/ossci-linux/wheels/attrs-18.1.0-py2.py3-none-any.whl
 $MAYBE_SUDO pip -q install coverage==4.5.1 -f https://s3.amazonaws.com/ossci-linux/wheels/coverage-4.5.1-cp36-cp36m-macosx_10_12_x86_64.whl
-$MAYBE_SUDO pip -q install hypothesis==3.44.6 -f https://s3.amazonaws.com/ossci-linux/wheels/hypothesis-3.44.6-py3-none-any.whl
+$MAYBE_SUDO pip -q install hypothesis==4.57.1
 
 ##############
 # ONNX tests #
 ##############
 if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
   pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
-  pip install -q --user ninja flatbuffers==2.0 numpy==1.21.5 onnxruntime==1.12.1 beartype==0.10.4 onnx==1.12.0
+  pip install -q --user transformers==4.25.1
+  pip install -q --user ninja flatbuffers==2.0 numpy==1.22.4 onnxruntime==1.14.0 beartype==0.10.4
+  # TODO: change this when onnx 1.13.1 is released.
+  pip install --no-use-pep517 'onnx @ git+https://github.com/onnx/onnx@e192ba01e438d22ca2dedd7956e28e3551626c91'
   # TODO: change this when onnx-script is on testPypi
-  pip install 'onnx-script @ git+https://github.com/microsoft/onnx-script@4f3ff0d806d0d0f30cecdfd3e8b094b1e492d44a'
+  pip install 'onnx-script @ git+https://github.com/microsoft/onnx-script@0298154caf6b46fc4e30abba034095c1290c26e3'
   # numba requires numpy <= 1.20, onnxruntime requires numpy >= 1.21.
   # We don't actually need it for our tests, but it's imported if it's present, so uninstall.
   pip uninstall -q --yes numba
diff --git a/.jenkins/pytorch/.shellcheckrc b/.ci/pytorch/.shellcheckrc
similarity index 100%
rename from .jenkins/pytorch/.shellcheckrc
rename to .ci/pytorch/.shellcheckrc
diff --git a/.jenkins/pytorch/README.md b/.ci/pytorch/README.md
similarity index 91%
rename from .jenkins/pytorch/README.md
rename to .ci/pytorch/README.md
index 9fd68ecf7f15..15e3a58dbc90 100644
--- a/.jenkins/pytorch/README.md
+++ b/.ci/pytorch/README.md
@@ -10,7 +10,7 @@ it is very easy to run these tests yourself:
    ``registry.pytorch.org/pytorch/pytorch-$BUILD_ENVIRONMENT:$DOCKER_VERSION``,
    where ``$BUILD_ENVIRONMENT`` is one of the build environments
    enumerated in
-   [pytorch-dockerfiles](https://github.com/pytorch/pytorch/blob/master/.circleci/docker/build.sh). The dockerfile used by jenkins can be found under the `.circle` [directory](https://github.com/pytorch/pytorch/blob/master/.circleci/docker)
+   [pytorch-dockerfiles](https://github.com/pytorch/pytorch/blob/master/.ci/docker/build.sh). The dockerfile used by jenkins can be found under the `.ci` [directory](https://github.com/pytorch/pytorch/blob/master/.ci/docker)
 
 2. Run ``docker run -it -u jenkins $DOCKER_IMAGE``, clone PyTorch and
    run one of the scripts in this directory.
diff --git a/.jenkins/pytorch/build-asan.sh b/.ci/pytorch/build-asan.sh
similarity index 100%
rename from .jenkins/pytorch/build-asan.sh
rename to .ci/pytorch/build-asan.sh
diff --git a/.jenkins/pytorch/build-mobile.sh b/.ci/pytorch/build-mobile.sh
similarity index 100%
rename from .jenkins/pytorch/build-mobile.sh
rename to .ci/pytorch/build-mobile.sh
diff --git a/.jenkins/pytorch/build-tsan.sh b/.ci/pytorch/build-tsan.sh
similarity index 100%
rename from .jenkins/pytorch/build-tsan.sh
rename to .ci/pytorch/build-tsan.sh
diff --git a/.jenkins/pytorch/build.sh b/.ci/pytorch/build.sh
similarity index 97%
rename from .jenkins/pytorch/build.sh
rename to .ci/pytorch/build.sh
index e6f76308a4fa..cfca6fad834c 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@@ -191,6 +191,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
   set -e
 
   get_bazel
+  install_sccache_nvcc_for_bazel
 
   # Leave 1 CPU free and use only up to 80% of memory to reduce the change of crashing
   # the runner
@@ -292,6 +293,13 @@ else
   else
     # Test no-Python build
     echo "Building libtorch"
+
+    # This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
+    # is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
+    # 16 CPUs
+    MAX_JOBS=$(nproc --ignore=4)
+    export MAX_JOBS
+
     # NB: Install outside of source directory (at the same level as the root
     # pytorch folder) so that it doesn't get cleaned away prior to docker push.
     BUILD_LIBTORCH_PY=$PWD/tools/build_libtorch.py
diff --git a/.jenkins/pytorch/codegen-test.sh b/.ci/pytorch/codegen-test.sh
similarity index 93%
rename from .jenkins/pytorch/codegen-test.sh
rename to .ci/pytorch/codegen-test.sh
index 4794dc48eb89..719a9ca6232b 100755
--- a/.jenkins/pytorch/codegen-test.sh
+++ b/.ci/pytorch/codegen-test.sh
@@ -3,8 +3,8 @@
 # This script can also be used to test whether your diff changes any codegen output.
 #
 # Run it before and after your change:
-#   .jenkins/pytorch/codegen-test.sh <baseline_output_dir>
-#   .jenkins/pytorch/codegen-test.sh <test_output_dir>
+#   .ci/pytorch/codegen-test.sh <baseline_output_dir>
+#   .ci/pytorch/codegen-test.sh <test_output_dir>
 #
 # Then run diff to compare the generated files:
 #   diff -Naur <baseline_output_dir> <test_output_dir>
diff --git a/.jenkins/pytorch/common-build.sh b/.ci/pytorch/common-build.sh
similarity index 100%
rename from .jenkins/pytorch/common-build.sh
rename to .ci/pytorch/common-build.sh
diff --git a/.jenkins/pytorch/common.sh b/.ci/pytorch/common.sh
similarity index 56%
rename from .jenkins/pytorch/common.sh
rename to .ci/pytorch/common.sh
index c31b853dbcdd..23719dceb448 100644
--- a/.jenkins/pytorch/common.sh
+++ b/.ci/pytorch/common.sh
@@ -33,28 +33,6 @@ BUILD_TEST_LIBTORCH=0
 # TODO: Reenable nvfuser when issues with gfx908 resolved
 PYTORCH_JIT_ENABLE_NVFUSER=0
 
-# Use conda cmake in some CI build. Conda cmake will be newer than our supported
-# min version (3.5 for xenial and 3.10 for bionic),
-# so we only do it in four builds that we know should use conda.
-# Linux bionic cannot find conda mkl with cmake 3.10, so we need a cmake from conda.
-# Alternatively we could point cmake to the right place
-# export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
-if [[ "${TEST_CONFIG:-}" == *xla* ]] || \
-   [[ "$BUILD_ENVIRONMENT" == *centos* ]] || \
-   [[ "$BUILD_ENVIRONMENT" == *linux-bionic* ]] || \
-   [[ "$BUILD_ENVIRONMENT" == *linux-focal* ]]; then
-  if ! which conda; then
-    echo "Expected ${BUILD_ENVIRONMENT} to use conda, but 'which conda' returns empty"
-    exit 1
-  else
-    conda install -q -y cmake
-  fi
-  if [[ "$BUILD_ENVIRONMENT" == *centos* ]]; then
-    # cmake3 package will conflict with conda cmake
-    sudo yum -y remove cmake3 || true
-  fi
-fi
-
 retry () {
   "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@")
 }
diff --git a/.jenkins/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh
similarity index 80%
rename from .jenkins/pytorch/common_utils.sh
rename to .ci/pytorch/common_utils.sh
index e4290064595a..501919c338c3 100644
--- a/.jenkins/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@@ -108,6 +108,26 @@ function get_bazel() {
   chmod +x tools/bazel
 }
 
+# This function is bazel specific because of the bug
+# in the bazel that requires some special paths massaging
+# as a workaround. See
+# https://github.com/bazelbuild/bazel/issues/10167
+function install_sccache_nvcc_for_bazel() {
+  sudo mv /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc-real
+
+  # Write the `/usr/local/cuda/bin/nvcc`
+  cat << EOF | sudo tee /usr/local/cuda/bin/nvcc
+#!/bin/sh
+if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
+  exec sccache /usr/local/cuda/bin/nvcc "\$@"
+else
+  exec external/local_cuda/cuda/bin/nvcc-real "\$@"
+fi
+EOF
+
+  sudo chmod +x /usr/local/cuda/bin/nvcc
+}
+
 function install_monkeytype {
   # Install MonkeyType
   pip_install MonkeyType
@@ -152,7 +172,18 @@ function install_triton() {
     echo "skipping triton due to rocm"
   else
     commit=$(get_pinned_commit triton)
-    pip_install --user "git+https://github.com/openai/triton@${commit}#subdirectory=python"
+    if [[ "${BUILD_ENVIRONMENT}" == *gcc7* ]]; then
+      # Trition needs gcc-9 to build
+      sudo apt-get install -y g++-9
+      CXX=g++-9 pip_install --user "git+https://github.com/openai/triton@${commit}#subdirectory=python"
+    elif [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
+      # Trition needs <filesystem> which surprisingly is not available with clang-9 toolchain
+      sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+      sudo apt-get install -y g++-9
+      CXX=g++-9 pip_install --user "git+https://github.com/openai/triton@${commit}#subdirectory=python"
+    else
+      pip_install --user "git+https://github.com/openai/triton@${commit}#subdirectory=python"
+    fi
     pip_install --user jinja2
   fi
 }
@@ -211,9 +242,14 @@ function checkout_install_torchbench() {
   git clone https://github.com/pytorch/benchmark torchbench
   pushd torchbench
   git checkout no_torchaudio
-  # Occasionally the installation may fail on one model but it is ok to continue
-  # to install and test other models
-  python install.py --continue_on_fail
+
+  if [ "$1" ]; then
+    python install.py --continue_on_fail models "$@"
+  else
+    # Occasionally the installation may fail on one model but it is ok to continue
+    # to install and test other models
+    python install.py --continue_on_fail
+  fi
   popd
 }
 
diff --git a/.jenkins/pytorch/create_test_cert.py b/.ci/pytorch/create_test_cert.py
similarity index 100%
rename from .jenkins/pytorch/create_test_cert.py
rename to .ci/pytorch/create_test_cert.py
diff --git a/.jenkins/pytorch/docker-build-test.sh b/.ci/pytorch/docker-build-test.sh
similarity index 100%
rename from .jenkins/pytorch/docker-build-test.sh
rename to .ci/pytorch/docker-build-test.sh
diff --git a/.jenkins/pytorch/docs-test.sh b/.ci/pytorch/docs-test.sh
similarity index 100%
rename from .jenkins/pytorch/docs-test.sh
rename to .ci/pytorch/docs-test.sh
diff --git a/.jenkins/pytorch/fake_numpy/numpy.py b/.ci/pytorch/fake_numpy/numpy.py
similarity index 100%
rename from .jenkins/pytorch/fake_numpy/numpy.py
rename to .ci/pytorch/fake_numpy/numpy.py
diff --git a/.jenkins/pytorch/macos-build-test.sh b/.ci/pytorch/macos-build-test.sh
similarity index 100%
rename from .jenkins/pytorch/macos-build-test.sh
rename to .ci/pytorch/macos-build-test.sh
diff --git a/.jenkins/pytorch/macos-build.sh b/.ci/pytorch/macos-build.sh
similarity index 85%
rename from .jenkins/pytorch/macos-build.sh
rename to .ci/pytorch/macos-build.sh
index dbba68081d3e..7edbc3ca363b 100755
--- a/.jenkins/pytorch/macos-build.sh
+++ b/.ci/pytorch/macos-build.sh
@@ -40,6 +40,16 @@ cross_compile_arm64() {
   USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
 }
 
+compile_arm64() {
+  # Compilation for arm64
+  # TODO: Compile with OpenMP support (but this causes CI regressions as cross-compilation were done with OpenMP disabled)
+  USE_DISTRIBUTED=0 USE_OPENMP=0 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
+}
+
+compile_x86_64() {
+  USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel
+}
+
 compile_x86_64() {
   USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel
 }
@@ -63,7 +73,11 @@ build_lite_interpreter() {
 }
 
 if [[ ${BUILD_ENVIRONMENT} = *arm64* ]]; then
-  cross_compile_arm64
+  if [[ $(uname -m) == "arm64" ]]; then
+    compile_arm64
+  else
+    cross_compile_arm64
+  fi
 elif [[ ${BUILD_ENVIRONMENT} = *lite-interpreter* ]]; then
   export BUILD_LITE_INTERPRETER=1
   build_lite_interpreter
diff --git a/.jenkins/pytorch/macos-common.sh b/.ci/pytorch/macos-common.sh
similarity index 100%
rename from .jenkins/pytorch/macos-common.sh
rename to .ci/pytorch/macos-common.sh
diff --git a/.jenkins/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh
similarity index 86%
rename from .jenkins/pytorch/macos-test.sh
rename to .ci/pytorch/macos-test.sh
index ebdba69613ee..a5111b62e833 100755
--- a/.jenkins/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@@ -89,6 +89,16 @@ print_cmake_info() {
   CONDA_INSTALLATION_DIR=$(dirname "$CMAKE_EXEC")
   # Print all libraries under cmake rpath for debugging
   ls -la "$CONDA_INSTALLATION_DIR/../lib"
+
+  export CMAKE_EXEC
+  # Explicitly add conda env lib folder to cmake rpath to address the flaky issue
+  # where cmake dependencies couldn't be found. This seems to point to how conda
+  # links $CMAKE_EXEC to its package cache when cloning a new environment
+  install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true
+  # Adding the rpath will invalidate cmake signature, so signing it again here
+  # to trust the executable. EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid))
+  # with an exit code 137 otherwise
+  codesign -f -s - "${CMAKE_EXEC}" || true
 }
 
 test_custom_backend() {
@@ -99,7 +109,7 @@ test_custom_backend() {
   rm -rf build && mkdir build
   pushd build
   SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-  CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake ..
+  CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" "${CMAKE_EXEC}" ..
   make VERBOSE=1
   popd
 
@@ -122,7 +132,7 @@ test_custom_script_ops() {
   rm -rf build && mkdir build
   pushd build
   SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-  CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake ..
+  CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" "${CMAKE_EXEC}" ..
   make VERBOSE=1
   popd
 
@@ -144,7 +154,7 @@ test_jit_hooks() {
   rm -rf build && mkdir build
   pushd build
   SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-  CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake ..
+  CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" "${CMAKE_EXEC}" ..
   make VERBOSE=1
   popd
 
diff --git a/.jenkins/pytorch/multigpu-test.sh b/.ci/pytorch/multigpu-test.sh
similarity index 97%
rename from .jenkins/pytorch/multigpu-test.sh
rename to .ci/pytorch/multigpu-test.sh
index 32f947b53c58..1eaa612a8ab8 100755
--- a/.jenkins/pytorch/multigpu-test.sh
+++ b/.ci/pytorch/multigpu-test.sh
@@ -42,7 +42,6 @@ time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/ops/
 time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/ops/test_softmax
 time python test/run_test.py --verbose -i distributed/_shard/sharded_optim/test_sharded_optim
 time python test/run_test.py --verbose -i distributed/_shard/test_partial_tensor
-time python test/run_test.py --verbose -i distributed/_shard/test_replicated_tensor
 # Other tests
 time python test/run_test.py --verbose -i test_cuda_primary_ctx
 time python test/run_test.py --verbose -i test_optim -- -k optimizers_with_varying_tensors
diff --git a/.jenkins/pytorch/perf_test/common.sh b/.ci/pytorch/perf_test/common.sh
similarity index 100%
rename from .jenkins/pytorch/perf_test/common.sh
rename to .ci/pytorch/perf_test/common.sh
diff --git a/.jenkins/pytorch/perf_test/compare_with_baseline.py b/.ci/pytorch/perf_test/compare_with_baseline.py
similarity index 97%
rename from .jenkins/pytorch/perf_test/compare_with_baseline.py
rename to .ci/pytorch/perf_test/compare_with_baseline.py
index 95f60edd4bca..6d2839ac1db4 100644
--- a/.jenkins/pytorch/perf_test/compare_with_baseline.py
+++ b/.ci/pytorch/perf_test/compare_with_baseline.py
@@ -62,7 +62,7 @@
     raise Exception('''\n
 z-value >= 3, there is high chance of perf regression.\n
 To reproduce this regression, run
-`cd .jenkins/pytorch/perf_test/ && bash {}.sh` on your local machine
+`cd .ci/pytorch/perf_test/ && bash {}.sh` on your local machine
 and compare the runtime before/after your code change.
 '''.format(test_name))
 else:
diff --git a/.jenkins/pytorch/perf_test/get_stats.py b/.ci/pytorch/perf_test/get_stats.py
similarity index 100%
rename from .jenkins/pytorch/perf_test/get_stats.py
rename to .ci/pytorch/perf_test/get_stats.py
diff --git a/.jenkins/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh b/.ci/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh
similarity index 100%
rename from .jenkins/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh
rename to .ci/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh
diff --git a/.jenkins/pytorch/perf_test/test_cpu_speed_mnist.sh b/.ci/pytorch/perf_test/test_cpu_speed_mnist.sh
similarity index 100%
rename from .jenkins/pytorch/perf_test/test_cpu_speed_mnist.sh
rename to .ci/pytorch/perf_test/test_cpu_speed_mnist.sh
diff --git a/.jenkins/pytorch/perf_test/test_cpu_speed_torch.sh b/.ci/pytorch/perf_test/test_cpu_speed_torch.sh
similarity index 78%
rename from .jenkins/pytorch/perf_test/test_cpu_speed_torch.sh
rename to .ci/pytorch/perf_test/test_cpu_speed_torch.sh
index 0f639aec5338..77b86e77a26f 100644
--- a/.jenkins/pytorch/perf_test/test_cpu_speed_torch.sh
+++ b/.ci/pytorch/perf_test/test_cpu_speed_torch.sh
@@ -19,7 +19,7 @@ test_cpu_speed_torch () {
   fi
 
   if ! python perf-tests/modules/test_cpu_torch.py "${ARGS[@]}"; then
-    echo "To reproduce this regression, run \`cd .jenkins/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
+    echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
     exit 1
   fi
 }
diff --git a/.jenkins/pytorch/perf_test/test_cpu_speed_torch_tensor.sh b/.ci/pytorch/perf_test/test_cpu_speed_torch_tensor.sh
similarity index 79%
rename from .jenkins/pytorch/perf_test/test_cpu_speed_torch_tensor.sh
rename to .ci/pytorch/perf_test/test_cpu_speed_torch_tensor.sh
index e45b8adf7c7c..fc8ede36c90e 100644
--- a/.jenkins/pytorch/perf_test/test_cpu_speed_torch_tensor.sh
+++ b/.ci/pytorch/perf_test/test_cpu_speed_torch_tensor.sh
@@ -19,7 +19,7 @@ test_cpu_speed_torch_tensor () {
   fi
 
   if ! python perf-tests/modules/test_cpu_torch_tensor.py "${ARGS[@]}"; then
-    echo "To reproduce this regression, run \`cd .jenkins/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
+    echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
     exit 1
   fi
 }
diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh b/.ci/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh
similarity index 100%
rename from .jenkins/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh
rename to .ci/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh
diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_lstm.sh b/.ci/pytorch/perf_test/test_gpu_speed_lstm.sh
similarity index 100%
rename from .jenkins/pytorch/perf_test/test_gpu_speed_lstm.sh
rename to .ci/pytorch/perf_test/test_gpu_speed_lstm.sh
diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_mlstm.sh b/.ci/pytorch/perf_test/test_gpu_speed_mlstm.sh
similarity index 100%
rename from .jenkins/pytorch/perf_test/test_gpu_speed_mlstm.sh
rename to .ci/pytorch/perf_test/test_gpu_speed_mlstm.sh
diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_mnist.sh b/.ci/pytorch/perf_test/test_gpu_speed_mnist.sh
similarity index 100%
rename from .jenkins/pytorch/perf_test/test_gpu_speed_mnist.sh
rename to .ci/pytorch/perf_test/test_gpu_speed_mnist.sh
diff --git a/.jenkins/pytorch/perf_test/test_gpu_speed_word_language_model.sh b/.ci/pytorch/perf_test/test_gpu_speed_word_language_model.sh
similarity index 100%
rename from .jenkins/pytorch/perf_test/test_gpu_speed_word_language_model.sh
rename to .ci/pytorch/perf_test/test_gpu_speed_word_language_model.sh
diff --git a/.jenkins/pytorch/perf_test/update_commit_hash.py b/.ci/pytorch/perf_test/update_commit_hash.py
similarity index 100%
rename from .jenkins/pytorch/perf_test/update_commit_hash.py
rename to .ci/pytorch/perf_test/update_commit_hash.py
diff --git a/.jenkins/pytorch/print_sccache_log.py b/.ci/pytorch/print_sccache_log.py
similarity index 100%
rename from .jenkins/pytorch/print_sccache_log.py
rename to .ci/pytorch/print_sccache_log.py
diff --git a/.jenkins/pytorch/run_glootls_test.sh b/.ci/pytorch/run_glootls_test.sh
similarity index 100%
rename from .jenkins/pytorch/run_glootls_test.sh
rename to .ci/pytorch/run_glootls_test.sh
diff --git a/.jenkins/pytorch/short-perf-test-cpu.sh b/.ci/pytorch/short-perf-test-cpu.sh
similarity index 97%
rename from .jenkins/pytorch/short-perf-test-cpu.sh
rename to .ci/pytorch/short-perf-test-cpu.sh
index 7cb4608a75f7..41f0a493b55f 100755
--- a/.jenkins/pytorch/short-perf-test-cpu.sh
+++ b/.ci/pytorch/short-perf-test-cpu.sh
@@ -2,10 +2,10 @@
 
 SCRIPT_PARENT_DIR=$(dirname "${BASH_SOURCE[0]}")
 
-# shellcheck source=.jenkins/pytorch/common.sh
+# shellcheck source=.ci/pytorch/common.sh
 source "$SCRIPT_PARENT_DIR/common.sh"
 
-cd .jenkins/pytorch/perf_test
+cd .ci/pytorch/perf_test
 
 echo "Running CPU perf test for PyTorch..."
 
diff --git a/.jenkins/pytorch/short-perf-test-gpu.sh b/.ci/pytorch/short-perf-test-gpu.sh
similarity index 98%
rename from .jenkins/pytorch/short-perf-test-gpu.sh
rename to .ci/pytorch/short-perf-test-gpu.sh
index d7a49cb18842..5fc897fefb7d 100755
--- a/.jenkins/pytorch/short-perf-test-gpu.sh
+++ b/.ci/pytorch/short-perf-test-gpu.sh
@@ -3,7 +3,7 @@
 # shellcheck source=./common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
 
-pushd .jenkins/pytorch/perf_test
+pushd .ci/pytorch/perf_test
 
 echo "Running GPU perf test for PyTorch..."
 
diff --git a/.jenkins/pytorch/test.sh b/.ci/pytorch/test.sh
similarity index 85%
rename from .jenkins/pytorch/test.sh
rename to .ci/pytorch/test.sh
index 7bb6bca5064c..428cb4bf88eb 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@@ -19,7 +19,7 @@ BUILD_RENAMED_DIR="build_renamed"
 BUILD_BIN_DIR="$BUILD_DIR"/bin
 
 export VALGRIND=ON
-export TORCH_INDUCTOR_INSTALL_GXX=ON
+# export TORCH_INDUCTOR_INSTALL_GXX=ON
 if [[ "$BUILD_ENVIRONMENT" == *clang9* ]]; then
   # clang9 appears to miscompile code involving c10::optional<c10::SymInt>,
   # such that valgrind complains along these lines:
@@ -92,6 +92,9 @@ fi
 
 if [[ "$BUILD_ENVIRONMENT" == *slow-gradcheck* ]]; then
   export PYTORCH_TEST_WITH_SLOW_GRADCHECK=1
+  # TODO: slow gradcheck tests run out of memory a lot recently, so setting this
+  # to run them sequentially with only one process to mitigate the issue
+  export PYTORCH_TEST_CUDA_MEM_LEAK_CHECK=1
 fi
 
 if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then
@@ -105,14 +108,6 @@ if [[ "$TEST_CONFIG" == *crossref* ]]; then
   export PYTORCH_TEST_WITH_CROSSREF=1
 fi
 
-if [[ "$TEST_CONFIG" == *dynamo* ]]; then
-  export PYTORCH_TEST_WITH_DYNAMO=1
-fi
-
-if [[ "$TEST_CONFIG" == *inductor* ]]; then
-  export PYTORCH_TEST_WITH_INDUCTOR=1
-fi
-
 if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
   # Print GPU info
   rocminfo
@@ -222,7 +217,7 @@ test_dynamo_shard() {
   python tools/dynamo/verify_dynamo.py
   # Temporarily disable test_fx for dynamo pending the investigation on TTS
   # regression in https://github.com/pytorch/torchdynamo/issues/784
-  time python test/run_test.py \
+  time python test/run_test.py --dynamo \
     --exclude-jit-executor \
     --exclude-distributed-tests \
     --exclude \
@@ -249,14 +244,15 @@ test_dynamo_shard() {
 test_inductor_distributed() {
   # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
   # with if required # gpus aren't available
-  PYTORCH_TEST_WITH_INDUCTOR=0 PYTORCH_TEST_WITH_INDUCTOR=0 python test/run_test.py --include distributed/test_dynamo_distributed --verbose
+  python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_traceable_collectives --verbose
   assert_git_not_dirty
 }
 
 test_inductor() {
   python tools/dynamo/verify_dynamo.py
-  python test/run_test.py --include test_modules test_ops test_ops_gradients test_torch --verbose
-  PYTORCH_TEST_WITH_INDUCTOR=0 python test/run_test.py --include inductor/test_torchinductor --include inductor/test_torchinductor_opinfo --verbose
+  python test/run_test.py --inductor --include test_modules test_ops test_ops_gradients test_torch --verbose
+  # Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state
+  python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo --verbose
 }
 
 test_single_dynamo_benchmark() {
@@ -284,7 +280,7 @@ test_single_dynamo_benchmark() {
   # Feel free to remove --device cuda if you ever decide to need to
   # test CPU as well in CI
   python "benchmarks/dynamo/$suite.py" \
-    --ci --accuracy --device cuda \
+    --ci --accuracy --timing --explain \
     "$@" "${partition_flags[@]}" \
     --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
   python benchmarks/dynamo/check_csv.py \
@@ -297,10 +293,10 @@ test_aot_eager_benchmark() {
   local exit_status=0
 
   # Check inference with --float32
-  test_single_dynamo_benchmark "aot_eager_inference" "$@" --backend aot_eager || exit_status=$?
+  test_single_dynamo_benchmark "aot_eager_inference" "$@" --backend aot_eager --device cuda || exit_status=$?
 
   # Check training with --amp
-  test_single_dynamo_benchmark "aot_eager_training" "$@" --backend aot_eager --training --amp || exit_status=$?
+  test_single_dynamo_benchmark "aot_eager_training" "$@" --backend aot_eager  --device cuda --training --amp || exit_status=$?
 
   if [[ $exit_status -ne 0 ]]; then
     echo "Some benchmarks failed; scroll up for details"
@@ -311,15 +307,22 @@ test_aot_eager_benchmark() {
 test_inductor_benchmark() {
   # Usage: test_dynamo_benchmark huggingface 0
 
-  # Check inference with --float32
-  test_single_dynamo_benchmark "inductor_inference" "$@" --inductor
+  local device="$1"
+  shift
 
-  # Check training with --amp
-  test_single_dynamo_benchmark "inductor_training" "$@" --inductor --training --amp
+  if [[ $device == "cpu" ]]; then
+    # TODO: Add training and dynamic shape test
+    test_single_dynamo_benchmark "inductor_inference" "$@" --inductor --float32 --device cpu
+  else
+    # Check inference with --float32
+    test_single_dynamo_benchmark "inductor_inference" "$@" --inductor --device cuda
+
+    # Check training with --amp
+    test_single_dynamo_benchmark "inductor_training" "$@" --inductor --training --amp --device cuda
 
-  # Check training with symbolic shapes (not actually inductor)
-  test_single_dynamo_benchmark "dynamic_aot_eager_training" "$@" \
-    --backend aot_eager --dynamic-shapes --training
+    # Check inference with --dynamic-shapes
+    test_single_dynamo_benchmark "dynamic_inductor-inference" "$@" --inductor --dynamic-shapes --device cuda
+  fi
 }
 
 test_inductor_benchmark_perf() {
@@ -336,12 +339,23 @@ test_inductor_benchmark_perf() {
   # Not checking accuracy for perf test for now
   # shellcheck disable=SC2086
   if [[ "$1" == *smoketest* ]]; then
-    python benchmarks/dynamo/torchbench.py --performance --backend inductor --float16 --training \
+    python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
       --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
       --output "$TEST_REPORTS_DIR"/inductor_training_$1.csv
     # the reference speedup value is hardcoded in check_hf_bert_perf_csv.py
     # this value needs to be actively maintained to make this check useful
     python benchmarks/dynamo/check_hf_bert_perf_csv.py -f "$TEST_REPORTS_DIR"/inductor_training_$1.csv
+
+    # Check memory compression ratio for a few models
+    for test in hf_Albert timm_efficientdet timm_vision_transformer; do
+      python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
+        --disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
+        --only $test --output "$TEST_REPORTS_DIR"/inductor_training_$1_$test.csv
+      cat "$TEST_REPORTS_DIR"/inductor_training_$1_$test.csv
+      python benchmarks/dynamo/check_memory_compression_ratio.py --actual \
+        "$TEST_REPORTS_DIR"/inductor_training_$1_$test.csv \
+        --expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
+    done
   else
     python benchmarks/dynamo/$1.py --ci --training --performance --disable-cudagraphs\
       --device cuda --inductor --amp $PARTITION_FLAGS  --output "$TEST_REPORTS_DIR"/inductor_training_$1.csv
@@ -351,9 +365,9 @@ test_inductor_benchmark_perf() {
 # No sharding for the periodic job, we don't care if latency is bad
 test_aot_eager_all() {
   local exit_status=0
-  PYTHONPATH=$(pwd)/torchbench test_aot_eager_benchmark torchbench "" || exit_status=$?
-  test_aot_eager_benchmark huggingface "" || exit_status=$?
-  test_aot_eager_benchmark timm_models "" || exit_status=$?
+  PYTHONPATH=$(pwd)/torchbench test_aot_eager_benchmark torchbench "" "$@" || exit_status=$?
+  test_aot_eager_benchmark huggingface "" "$@" || exit_status=$?
+  test_aot_eager_benchmark timm_models "" "$@" || exit_status=$?
   if [[ $exit_status -ne 0 ]]; then
     echo "Some benchmarks failed; scroll up for details"
   fi
@@ -361,7 +375,9 @@ test_aot_eager_all() {
 }
 
 test_inductor_huggingface() {
-  test_inductor_benchmark huggingface ""
+  local device=$1
+  shift
+  test_inductor_benchmark "$device" huggingface ""
 }
 
 test_inductor_huggingface_perf() {
@@ -373,7 +389,9 @@ test_inductor_timm_shard() {
     echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
     exit 1
   fi
-  test_inductor_benchmark timm_models "$1"
+  local device=$1
+  shift
+  test_inductor_benchmark "$device" timm_models "$1"
 }
 
 test_inductor_timm_perf_shard() {
@@ -385,7 +403,9 @@ test_inductor_timm_perf_shard() {
 }
 
 test_inductor_torchbench() {
-  PYTHONPATH=$(pwd)/torchbench test_inductor_benchmark torchbench ""
+  local device=$1
+  shift
+  PYTHONPATH=$(pwd)/torchbench test_inductor_benchmark "$device" torchbench ""
 }
 
 test_inductor_torchbench_perf() {
@@ -474,13 +494,14 @@ test_libtorch() {
     ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR"
     ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
     ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
+    ln -sf "$TORCH_LIB_DIR"/libnvfuser* "$TORCH_BIN_DIR"
 
     # Start background download
     python tools/download_mnist.py --quiet -d test/cpp/api/mnist &
 
     # Make test_reports directory
     # NB: the ending test_libtorch must match the current function name for the current
-    # test reporting process (in print_test_stats.py) to function as expected.
+    # test reporting process to function as expected.
     TEST_REPORTS_DIR=test/test-reports/cpp-unittest/test_libtorch
     mkdir -p $TEST_REPORTS_DIR
 
@@ -491,6 +512,7 @@ test_libtorch() {
 
     if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
       "$TORCH_BIN_DIR"/test_jit  --gtest_output=xml:$TEST_REPORTS_DIR/test_jit.xml
+      "$TORCH_BIN_DIR"/nvfuser_tests --gtest_output=xml:$TEST_REPORTS_DIR/nvfuser_tests.xml
     else
       "$TORCH_BIN_DIR"/test_jit  --gtest_filter='-*CUDA' --gtest_output=xml:$TEST_REPORTS_DIR/test_jit.xml
     fi
@@ -527,7 +549,7 @@ test_aot_compilation() {
 
   # Make test_reports directory
   # NB: the ending test_libtorch must match the current function name for the current
-  # test reporting process (in print_test_stats.py) to function as expected.
+  # test reporting process to function as expected.
   TEST_REPORTS_DIR=test/test-reports/cpp-unittest/test_aot_compilation
   mkdir -p $TEST_REPORTS_DIR
   if [ -f "$TORCH_BIN_DIR"/test_mobile_nnc ]; then "$TORCH_BIN_DIR"/test_mobile_nnc --gtest_output=xml:$TEST_REPORTS_DIR/test_mobile_nnc.xml; fi
@@ -541,7 +563,7 @@ test_vulkan() {
     ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_TEST_DIR"
     export VK_ICD_FILENAMES=/var/lib/jenkins/swiftshader/swiftshader/build/Linux/vk_swiftshader_icd.json
     # NB: the ending test_vulkan must match the current function name for the current
-    # test reporting process (in print_test_stats.py) to function as expected.
+    # test reporting process to function as expected.
     TEST_REPORTS_DIR=test/test-reports/cpp-vulkan/test_vulkan
     mkdir -p $TEST_REPORTS_DIR
     LD_LIBRARY_PATH=/var/lib/jenkins/swiftshader/swiftshader/build/Linux/ "$TORCH_TEST_DIR"/vulkan_api_test --gtest_output=xml:$TEST_REPORTS_DIR/vulkan_test.xml
@@ -558,7 +580,7 @@ test_distributed() {
     ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
     ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
     # NB: the ending test_distributed must match the current function name for the current
-    # test reporting process (in print_test_stats.py) to function as expected.
+    # test reporting process to function as expected.
     TEST_REPORTS_DIR=test/test-reports/cpp-distributed/test_distributed
     mkdir -p $TEST_REPORTS_DIR
     "$TORCH_BIN_DIR"/FileStoreTest --gtest_output=xml:$TEST_REPORTS_DIR/FileStoreTest.xml
@@ -582,7 +604,7 @@ test_rpc() {
   if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
     echo "Testing RPC C++ tests"
     # NB: the ending test_rpc must match the current function name for the current
-    # test reporting process (in print_test_stats.py) to function as expected.
+    # test reporting process to function as expected.
     ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
     ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
     ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
@@ -711,6 +733,7 @@ test_forward_backward_compatibility() {
 
   # build torch at the base commit to generate a base function schema for comparison
   git reset --hard "${SHA_TO_COMPARE}"
+  git submodule sync && git submodule update --init --recursive
   echo "::group::Installing Torch From Base Commit"
   pip install -r requirements.txt
   # shellcheck source=./common-build.sh
@@ -724,6 +747,7 @@ test_forward_backward_compatibility() {
   python dump_all_function_schemas.py --filename nightly_schemas.txt
 
   git reset --hard "${SHA1}"
+  git submodule sync && git submodule update --init --recursive
   # FC: verify new model can be load with old code.
   if ! python ../load_torchscript_model.py /tmp/model_new.pt; then
       echo "FC check failed: new model cannot be load in old code"
@@ -804,7 +828,7 @@ test_vec256() {
 }
 
 test_docs_test() {
-  .jenkins/pytorch/docs-test.sh
+  .ci/pytorch/docs-test.sh
 }
 
 test_executorch() {
@@ -864,40 +888,93 @@ elif [[ "${TEST_CONFIG}" == *aot_eager_all* ]]; then
   checkout_install_torchbench
   install_huggingface
   install_timm
-  test_aot_eager_all
+  if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
+    # NB: This code path is currently dead because dynamic shapes takes
+    # too long to run unsharded
+    test_aot_eager_all --dynamic-shapes
+  else
+    test_aot_eager_all
+  fi
+elif [[ "${TEST_CONFIG}" == *aot_eager_huggingface* ]]; then
+  install_torchvision
+  install_filelock
+  install_huggingface
+  if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
+    test_aot_eager_benchmark huggingface "" --dynamic-shapes
+  else
+    test_aot_eager_benchmark huggingface ""
+  fi
+elif [[ "${TEST_CONFIG}" == *aot_eager_timm* && $NUM_TEST_SHARDS -gt 1 ]]; then
+  install_torchvision
+  install_filelock
+  install_timm
+  id=$((SHARD_NUMBER-1))
+  if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
+    test_aot_eager_benchmark timm_models "$id" --dynamic-shapes
+  else
+    test_aot_eager_benchmark timm_models "$id"
+  fi
+elif [[ "${TEST_CONFIG}" == *aot_eager_torchbench* ]]; then
+  install_torchtext
+  install_torchvision
+  install_filelock
+  checkout_install_torchbench
+  if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
+    PYTHONPATH=$(pwd)/torchbench test_aot_eager_benchmark torchbench "" --dynamic-shapes
+  else
+    PYTHONPATH=$(pwd)/torchbench test_aot_eager_benchmark torchbench ""
+  fi
 elif [[ "${TEST_CONFIG}" == *inductor_huggingface* ]]; then
   install_torchvision
   install_filelock
-  install_triton
+  if [[ "${TEST_CONFIG}" != *inductor_huggingface_cpu_accuracy* ]]; then
+    # Cpp backend does not depend on triton
+    install_triton
+  fi
   install_huggingface
   if [[ "${TEST_CONFIG}" == *inductor_huggingface_perf* ]]; then
     test_inductor_huggingface_perf
+  elif [[ "${TEST_CONFIG}" == *inductor_huggingface_cpu_accuracy* ]]; then
+    test_inductor_huggingface cpu
   else
-    test_inductor_huggingface
+    test_inductor_huggingface cuda
   fi
 elif [[ "${TEST_CONFIG}" == *inductor_timm* && $NUM_TEST_SHARDS -gt 1 ]]; then
   install_torchvision
   install_filelock
-  install_triton
+  if [[ "${TEST_CONFIG}" != *inductor_timm_cpu_accuracy* ]]; then
+    # Cpp backend does not depend on triton
+    install_triton
+  fi
   install_timm
   id=$((SHARD_NUMBER-1))
   if [[ "${TEST_CONFIG}" == *inductor_timm_perf* && $NUM_TEST_SHARDS -gt 1 ]]; then
     test_inductor_timm_perf_shard $id
+  elif [[ "${TEST_CONFIG}" == *inductor_timm_cpu_accuracy* && $NUM_TEST_SHARDS -gt 1 ]]; then
+    test_inductor_timm_shard cpu $id
   else
-    test_inductor_timm_shard $id
+    test_inductor_timm_shard cuda $id
   fi
 elif [[ "${TEST_CONFIG}" == *inductor_torchbench* ]]; then
   install_torchtext
   install_torchvision
   install_filelock
-  install_triton
-  checkout_install_torchbench
+  if [[ "${TEST_CONFIG}" != *inductor_torchbench_cpu_accuracy* ]]; then
+    # Cpp backend does not depend on triton
+    install_triton
+  fi
   if [[ "${TEST_CONFIG}" == *inductor_torchbench_perf* ]]; then
+    checkout_install_torchbench
     test_inductor_torchbench_perf
+  elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_accuracy* ]]; then
+    checkout_install_torchbench
+    test_inductor_torchbench cpu
   elif [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
+    checkout_install_torchbench hf_Bert hf_Albert timm_efficientdet timm_vision_transformer
     test_inductor_torchbench_smoketest_perf
   else
-    test_inductor_torchbench
+    checkout_install_torchbench
+    test_inductor_torchbench cuda
   fi
 elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
   install_torchvision
@@ -922,6 +999,7 @@ elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
   test_torch_function_benchmark
 elif [[ "${SHARD_NUMBER}" -gt 2 ]]; then
   # Handle arbitrary number of shards
+  install_torchvision
   install_triton
   test_python_shard "$SHARD_NUMBER"
 elif [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then
diff --git a/.jenkins/pytorch/win-build.sh b/.ci/pytorch/win-build.sh
similarity index 85%
rename from .jenkins/pytorch/win-build.sh
rename to .ci/pytorch/win-build.sh
index c518630c908e..0c7700a07cad 100755
--- a/.jenkins/pytorch/win-build.sh
+++ b/.ci/pytorch/win-build.sh
@@ -15,13 +15,6 @@ source "$SCRIPT_PARENT_DIR/common.sh"
 # shellcheck source=./common-build.sh
 source "$SCRIPT_PARENT_DIR/common-build.sh"
 
-IMAGE_COMMIT_ID=$(git rev-parse HEAD)
-export IMAGE_COMMIT_ID
-export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID}
-if [[ ${JOB_NAME} == *"develop"* ]]; then
-  export IMAGE_COMMIT_TAG=develop-${IMAGE_COMMIT_TAG}
-fi
-
 export TMP_DIR="${PWD}/build/win_tmp"
 TMP_DIR_WIN=$(cygpath -w "${TMP_DIR}")
 export TMP_DIR_WIN
@@ -59,7 +52,4 @@ set -ex
 
 assert_git_not_dirty
 
-if [ ! -f "${TMP_DIR}"/"${IMAGE_COMMIT_TAG}".7z ] && [ ! "${BUILD_ENVIRONMENT}" == "" ]; then
-    exit 1
-fi
 echo "BUILD PASSED"
diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat
similarity index 93%
rename from .jenkins/pytorch/win-test-helpers/build_pytorch.bat
rename to .ci/pytorch/win-test-helpers/build_pytorch.bat
index 54167c0b0da0..1c6d834ce4f2 100644
--- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@@ -80,7 +80,8 @@ set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
 set DISTUTILS_USE_SDK=1
 set PATH=%TMP_DIR_WIN%\bin;%PATH%
 
-if "%TORCH_CUDA_ARCH_LIST%" == "" set TORCH_CUDA_ARCH_LIST=7.0
+:: The latest Windows CUDA test is running on AWS G5 runner with A10G GPU
+if "%TORCH_CUDA_ARCH_LIST%" == "" set TORCH_CUDA_ARCH_LIST=8.6
 
 :: The default sccache idle timeout is 600, which is too short and leads to intermittent build errors.
 set SCCACHE_IDLE_TIMEOUT=0
@@ -137,9 +138,7 @@ python -c "import os, glob; os.system('python -mpip install --no-index --no-deps
   if "%BUILD_ENVIRONMENT%"=="" (
     echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
   ) else (
-    7z a %TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torchgen %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\functorch && copy /Y "%TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z" "%PYTORCH_FINAL_PACKAGE_DIR%\"
-    if errorlevel 1 exit /b
-    if not errorlevel 0 exit /b
+    copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%"
 
     :: export test times so that potential sharded tests that'll branch off this build will use consistent data
     python tools/stats/export_test_times.py
diff --git a/.jenkins/pytorch/win-test-helpers/choose_runtime_cuda_version.bat b/.ci/pytorch/win-test-helpers/choose_runtime_cuda_version.bat
similarity index 100%
rename from .jenkins/pytorch/win-test-helpers/choose_runtime_cuda_version.bat
rename to .ci/pytorch/win-test-helpers/choose_runtime_cuda_version.bat
diff --git a/.jenkins/pytorch/win-test-helpers/install_test_functorch.bat b/.ci/pytorch/win-test-helpers/install_test_functorch.bat
similarity index 100%
rename from .jenkins/pytorch/win-test-helpers/install_test_functorch.bat
rename to .ci/pytorch/win-test-helpers/install_test_functorch.bat
diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
similarity index 100%
rename from .jenkins/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
rename to .ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat b/.ci/pytorch/win-test-helpers/installation-helpers/install_magma.bat
similarity index 100%
rename from .jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat
rename to .ci/pytorch/win-test-helpers/installation-helpers/install_magma.bat
diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_mkl.bat b/.ci/pytorch/win-test-helpers/installation-helpers/install_mkl.bat
similarity index 100%
rename from .jenkins/pytorch/win-test-helpers/installation-helpers/install_mkl.bat
rename to .ci/pytorch/win-test-helpers/installation-helpers/install_mkl.bat
diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_sccache.bat b/.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat
similarity index 100%
rename from .jenkins/pytorch/win-test-helpers/installation-helpers/install_sccache.bat
rename to .ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat
diff --git a/.jenkins/pytorch/win-test-helpers/run_python_nn_smoketests.py b/.ci/pytorch/win-test-helpers/run_python_nn_smoketests.py
similarity index 100%
rename from .jenkins/pytorch/win-test-helpers/run_python_nn_smoketests.py
rename to .ci/pytorch/win-test-helpers/run_python_nn_smoketests.py
diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
similarity index 83%
rename from .jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
rename to .ci/pytorch/win-test-helpers/setup_pytorch_env.bat
index 043d67f843c1..2b71b649b0d3 100644
--- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
+++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
@@ -14,6 +14,13 @@ call %INSTALLER_DIR%\activate_miniconda3.bat
 if errorlevel 1 exit /b
 if not errorlevel 0 exit /b
 
+:: PyTorch is now installed using the standard wheel on Windows into the conda environment.
+:: However, the test scripts are still frequently referring to the workspace temp directory
+:: build\torch. Rather than changing all these references, making a copy of torch folder
+:: from conda to the current workspace is easier. The workspace will be cleaned up after
+:: the job anyway
+xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
+
 pushd .
 if "%VC_VERSION%" == "" (
     call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64
@@ -48,16 +55,6 @@ set NUMBAPRO_NVVM=%CUDA_PATH%\nvvm\bin\nvvm64_32_0.dll
 
 set PYTHONPATH=%TMP_DIR_WIN%\build;%PYTHONPATH%
 
-if NOT "%BUILD_ENVIRONMENT%"=="" (
-    pushd %TMP_DIR_WIN%\build
-    copy /Y %PYTORCH_FINAL_PACKAGE_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z %TMP_DIR_WIN%\
-    :: 7z: -aos skips if exists because this .bat can be called multiple times
-    7z x %TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z -aos
-    popd
-) else (
-    xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
-)
-
 @echo off
 echo @echo off >> %TMP_DIR_WIN%/ci_scripts/pytorch_env_restore.bat
 for /f "usebackq tokens=*" %%i in (`set`) do echo set "%%i" >> %TMP_DIR_WIN%/ci_scripts/pytorch_env_restore.bat
diff --git a/.jenkins/pytorch/win-test-helpers/test_custom_backend.bat b/.ci/pytorch/win-test-helpers/test_custom_backend.bat
similarity index 100%
rename from .jenkins/pytorch/win-test-helpers/test_custom_backend.bat
rename to .ci/pytorch/win-test-helpers/test_custom_backend.bat
diff --git a/.jenkins/pytorch/win-test-helpers/test_custom_script_ops.bat b/.ci/pytorch/win-test-helpers/test_custom_script_ops.bat
similarity index 100%
rename from .jenkins/pytorch/win-test-helpers/test_custom_script_ops.bat
rename to .ci/pytorch/win-test-helpers/test_custom_script_ops.bat
diff --git a/.jenkins/pytorch/win-test-helpers/test_distributed.bat b/.ci/pytorch/win-test-helpers/test_distributed.bat
similarity index 100%
rename from .jenkins/pytorch/win-test-helpers/test_distributed.bat
rename to .ci/pytorch/win-test-helpers/test_distributed.bat
diff --git a/.jenkins/pytorch/win-test-helpers/test_libtorch.bat b/.ci/pytorch/win-test-helpers/test_libtorch.bat
similarity index 100%
rename from .jenkins/pytorch/win-test-helpers/test_libtorch.bat
rename to .ci/pytorch/win-test-helpers/test_libtorch.bat
diff --git a/.jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat b/.ci/pytorch/win-test-helpers/test_python_jit_legacy.bat
similarity index 100%
rename from .jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat
rename to .ci/pytorch/win-test-helpers/test_python_jit_legacy.bat
diff --git a/.jenkins/pytorch/win-test-helpers/test_python_shard.bat b/.ci/pytorch/win-test-helpers/test_python_shard.bat
similarity index 100%
rename from .jenkins/pytorch/win-test-helpers/test_python_shard.bat
rename to .ci/pytorch/win-test-helpers/test_python_shard.bat
diff --git a/.jenkins/pytorch/win-test.sh b/.ci/pytorch/win-test.sh
similarity index 90%
rename from .jenkins/pytorch/win-test.sh
rename to .ci/pytorch/win-test.sh
index 560b039dbf67..8bf85f89c213 100755
--- a/.jenkins/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@@ -5,13 +5,6 @@ SCRIPT_PARENT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
 # shellcheck source=./common.sh
 source "$SCRIPT_PARENT_DIR/common.sh"
 
-IMAGE_COMMIT_ID=$(git rev-parse HEAD)
-export IMAGE_COMMIT_ID
-export IMAGE_COMMIT_TAG=${BUILD_ENVIRONMENT}-${IMAGE_COMMIT_ID}
-if [[ ${JOB_NAME} == *"develop"* ]]; then
-  export IMAGE_COMMIT_TAG=develop-${IMAGE_COMMIT_TAG}
-fi
-
 export TMP_DIR="${PWD}/build/win_tmp"
 TMP_DIR_WIN=$(cygpath -w "${TMP_DIR}")
 export TMP_DIR_WIN
@@ -21,13 +14,12 @@ export PROJECT_DIR_WIN
 export TEST_DIR="${PWD}/test"
 TEST_DIR_WIN=$(cygpath -w "${TEST_DIR}")
 export TEST_DIR_WIN
-export PYTORCH_FINAL_PACKAGE_DIR="${PYTORCH_FINAL_PACKAGE_DIR:-/c/users/circleci/workspace/build-results}"
+export PYTORCH_FINAL_PACKAGE_DIR="${PYTORCH_FINAL_PACKAGE_DIR:-/c/w/build-results}"
 PYTORCH_FINAL_PACKAGE_DIR_WIN=$(cygpath -w "${PYTORCH_FINAL_PACKAGE_DIR}")
 export PYTORCH_FINAL_PACKAGE_DIR_WIN
 
 mkdir -p "$TMP_DIR"/build/torch
 
-
 # This directory is used only to hold "pytorch_env_restore.bat", called via "setup_pytorch_env.bat"
 CI_SCRIPTS_DIR=$TMP_DIR/ci_scripts
 mkdir -p "$CI_SCRIPTS_DIR"
@@ -36,7 +28,6 @@ if [ -n "$(ls "$CI_SCRIPTS_DIR"/*)" ]; then
     rm "$CI_SCRIPTS_DIR"/*
 fi
 
-
 export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers
 
 if [[ "$TEST_CONFIG" = "force_on_cpu" ]]; then
diff --git a/.circleci/cimodel/data/binary_build_data.py b/.circleci/cimodel/data/binary_build_data.py
index 5df203b6ce39..23191a6f5508 100644
--- a/.circleci/cimodel/data/binary_build_data.py
+++ b/.circleci/cimodel/data/binary_build_data.py
@@ -57,7 +57,7 @@ def get_processor_arch_name(gpu_version):
 
 class TopLevelNode(ConfigNode):
     def __init__(self, node_name, config_tree_data, smoke):
-        super(TopLevelNode, self).__init__(None, node_name)
+        super().__init__(None, node_name)
 
         self.config_tree_data = config_tree_data
         self.props["smoke"] = smoke
@@ -68,7 +68,7 @@ def get_children(self):
 
 class OSConfigNode(ConfigNode):
     def __init__(self, parent, os_name, gpu_versions, py_tree):
-        super(OSConfigNode, self).__init__(parent, os_name)
+        super().__init__(parent, os_name)
 
         self.py_tree = py_tree
         self.props["os_name"] = os_name
@@ -80,7 +80,7 @@ def get_children(self):
 
 class PackageFormatConfigNode(ConfigNode):
     def __init__(self, parent, package_format, python_versions):
-        super(PackageFormatConfigNode, self).__init__(parent, package_format)
+        super().__init__(parent, package_format)
 
         self.props["python_versions"] = python_versions
         self.props["package_format"] = package_format
@@ -97,7 +97,7 @@ def get_children(self):
 
 class LinuxGccConfigNode(ConfigNode):
     def __init__(self, parent, gcc_config_variant):
-        super(LinuxGccConfigNode, self).__init__(parent, "GCC_CONFIG_VARIANT=" + str(gcc_config_variant))
+        super().__init__(parent, "GCC_CONFIG_VARIANT=" + str(gcc_config_variant))
 
         self.props["gcc_config_variant"] = gcc_config_variant
 
@@ -122,7 +122,7 @@ def get_children(self):
 
 class WindowsLibtorchConfigNode(ConfigNode):
     def __init__(self, parent, libtorch_config_variant):
-        super(WindowsLibtorchConfigNode, self).__init__(parent, "LIBTORCH_CONFIG_VARIANT=" + str(libtorch_config_variant))
+        super().__init__(parent, "LIBTORCH_CONFIG_VARIANT=" + str(libtorch_config_variant))
 
         self.props["libtorch_config_variant"] = libtorch_config_variant
 
@@ -132,7 +132,7 @@ def get_children(self):
 
 class ArchConfigNode(ConfigNode):
     def __init__(self, parent, gpu):
-        super(ArchConfigNode, self).__init__(parent, get_processor_arch_name(gpu))
+        super().__init__(parent, get_processor_arch_name(gpu))
 
         self.props["gpu"] = gpu
 
@@ -142,7 +142,7 @@ def get_children(self):
 
 class PyVersionConfigNode(ConfigNode):
     def __init__(self, parent, pyver):
-        super(PyVersionConfigNode, self).__init__(parent, pyver)
+        super().__init__(parent, pyver)
 
         self.props["pyver"] = pyver
 
@@ -158,7 +158,7 @@ def get_children(self):
 
 class LinkingVariantConfigNode(ConfigNode):
     def __init__(self, parent, linking_variant):
-        super(LinkingVariantConfigNode, self).__init__(parent, linking_variant)
+        super().__init__(parent, linking_variant)
 
     def get_children(self):
         return [DependencyInclusionConfigNode(self, v) for v in DEPS_INCLUSION_DIMENSIONS]
@@ -166,6 +166,6 @@ def get_children(self):
 
 class DependencyInclusionConfigNode(ConfigNode):
     def __init__(self, parent, deps_variant):
-        super(DependencyInclusionConfigNode, self).__init__(parent, deps_variant)
+        super().__init__(parent, deps_variant)
 
         self.props["libtorch_variant"] = "-".join([self.parent.get_label(), self.get_label()])
diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py
index 4ea80ab4f79d..ebd6e0a38187 100644
--- a/.circleci/cimodel/data/pytorch_build_data.py
+++ b/.circleci/cimodel/data/pytorch_build_data.py
@@ -12,7 +12,7 @@ def get_major_pyver(dotted_version):
 
 class TreeConfigNode(ConfigNode):
     def __init__(self, parent, node_name, subtree):
-        super(TreeConfigNode, self).__init__(parent, self.modify_label(node_name))
+        super().__init__(parent, self.modify_label(node_name))
         self.subtree = subtree
         self.init2(node_name)
 
@@ -28,7 +28,7 @@ def get_children(self):
 
 class TopLevelNode(TreeConfigNode):
     def __init__(self, node_name, subtree):
-        super(TopLevelNode, self).__init__(None, node_name, subtree)
+        super().__init__(None, node_name, subtree)
 
     # noinspection PyMethodMayBeStatic
     def child_constructor(self):
diff --git a/.circleci/config.yml b/.circleci/config.yml
index ccbf0dc5720f..5cb89ac2c140 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -47,7 +47,7 @@ commands:
       - run:
           name: "Calculate docker image hash"
           command: |
-            DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
+            DOCKER_TAG=$(git rev-parse HEAD:.ci/docker)
             echo "DOCKER_TAG=${DOCKER_TAG}" >> "${BASH_ENV}"
 
   designate_upload_channel:
@@ -526,8 +526,8 @@ jobs:
             export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}
             set -x
 
-            chmod a+x .jenkins/pytorch/macos-build.sh
-            unbuffer .jenkins/pytorch/macos-build.sh 2>&1 | ts
+            chmod a+x .ci/pytorch/macos-build.sh
+            unbuffer .ci/pytorch/macos-build.sh 2>&1 | ts
 
       - persist_to_workspace:
           root: /Users/distiller/workspace/
@@ -562,8 +562,8 @@ jobs:
             export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}
             set -x
 
-            chmod a+x .jenkins/pytorch/macos-build.sh
-            unbuffer .jenkins/pytorch/macos-build.sh 2>&1 | ts
+            chmod a+x .ci/pytorch/macos-build.sh
+            unbuffer .ci/pytorch/macos-build.sh 2>&1 | ts
 
       - persist_to_workspace:
           root: /Users/distiller/workspace/
@@ -644,7 +644,7 @@ jobs:
             brew link --force libomp
 
             echo "export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${BASH_ENV}"
-            .jenkins/pytorch/macos-build.sh
+            .ci/pytorch/macos-build.sh
 
       - when:
           condition: << parameters.build-generates-artifacts >>
@@ -727,7 +727,7 @@ jobs:
             export COMMIT_MESSAGES="${COMMIT_MESSAGES//[\'\"]}"
 
             python3 -mpip install dist/*.whl
-            .jenkins/pytorch/macos-test.sh
+            .ci/pytorch/macos-test.sh
       - run:
           name: Copy files for uploading test stats
           command: |
@@ -757,7 +757,7 @@ jobs:
               exit 0
             fi
             cp -r ~/workspace/test-reports/* ~/project
-            pip3 install requests==2.26 rockset==0.8.3 boto3==1.19.12 six==1.16.0
+            pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
             export AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_FOR_OSSCI_ARTIFACT_UPLOAD}
             export AWS_SECRET_ACCESS_KEY=${AWS_SECRET_KEY_FOR_OSSCI_ARTIFACT_UPLOAD}
             # i dont know how to get the run attempt number for reruns so default to 1
@@ -779,23 +779,8 @@ jobs:
             set -e
             export JOB_BASE_NAME=$CIRCLE_JOB
 
-            chmod a+x .jenkins/pytorch/macos-test.sh
-            unbuffer .jenkins/pytorch/macos-test.sh 2>&1 | ts
-      - run:
-          name: Report results
-          no_output_timeout: "5m"
-          command: |
-            set -ex
-            source /Users/distiller/workspace/miniconda3/bin/activate
-            python3 -m pip install boto3==1.19.12
-
-            export JOB_BASE_NAME=$CIRCLE_JOB
-
-            # Using the same IAM user to write stats to our OSS bucket
-            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4}
-            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}
-            python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
-          when: always
+            chmod a+x .ci/pytorch/macos-test.sh
+            unbuffer .ci/pytorch/macos-test.sh 2>&1 | ts
       - store_test_results:
           path: test/test-reports
 
@@ -816,8 +801,8 @@ jobs:
             set -e
             export BUILD_LITE_INTERPRETER=1
             export JOB_BASE_NAME=$CIRCLE_JOB
-            chmod a+x ${HOME}/project/.jenkins/pytorch/macos-lite-interpreter-build-test.sh
-            unbuffer ${HOME}/project/.jenkins/pytorch/macos-lite-interpreter-build-test.sh 2>&1 | ts
+            chmod a+x ${HOME}/project/.ci/pytorch/macos-lite-interpreter-build-test.sh
+            unbuffer ${HOME}/project/.ci/pytorch/macos-lite-interpreter-build-test.sh 2>&1 | ts
       - store_test_results:
           path: test/test-reports
 
@@ -1052,7 +1037,7 @@ jobs:
                 $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
             }
 
-            retry conda install numpy ninja pyyaml mkl mkl-include setuptools cmake requests typing_extensions --yes
+            retry conda install numpy ninja pyyaml mkl mkl-include setuptools cmake requests typing-extensions --yes
 
             # sync submodules
             cd ${PROJ_ROOT}
@@ -1116,7 +1101,7 @@ jobs:
             cd ${PROJ_ROOT}/ios/TestApp/benchmark
             mkdir -p ../models
             if [ ${USE_COREML_DELEGATE} == 1 ]; then
-              pip install coremltools==5.0b5 protobuf==3.20.1 six==1.16.0
+              pip install coremltools==5.0b5 protobuf==3.20.1
               python coreml_backend.py
             else
               cd "${PROJ_ROOT}"
@@ -1166,7 +1151,7 @@ jobs:
 
           docker cp /home/circleci/project/. $id:/var/lib/jenkins/workspace
 
-          export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/build.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+          export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .ci/pytorch/build.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
 
           echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
 
@@ -1212,9 +1197,9 @@ jobs:
           trap "retrieve_test_reports" ERR
 
           if [[ ${BUILD_ENVIRONMENT} == *"multigpu"* ]]; then
-            export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/multigpu-test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+            export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .ci/pytorch/multigpu-test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
           else
-            export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+            export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .ci/pytorch/test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
           fi
           echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
 
@@ -1337,12 +1322,12 @@ jobs:
                 exit 0
               fi
               # Covers the case where a previous tag doesn't exist for the tree
-              # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
-              if ! git rev-parse "$(git merge-base HEAD << pipeline.git.base_revision >>):.circleci/docker"; then
-                echo "Directory '.circleci/docker' not found in tree << pipeline.git.base_revision >>, you should probably rebase onto a more recent commit"
+              # this is only really applicable on trees that don't have `.ci/docker` at its merge base, i.e. nightly
+              if ! git rev-parse "$(git merge-base HEAD << pipeline.git.base_revision >>):.ci/docker"; then
+                echo "Directory '.ci/docker' not found in tree << pipeline.git.base_revision >>, you should probably rebase onto a more recent commit"
                 exit 1
               fi
-              PREVIOUS_DOCKER_TAG=$(git rev-parse "$(git merge-base HEAD << pipeline.git.base_revision >>):.circleci/docker")
+              PREVIOUS_DOCKER_TAG=$(git rev-parse "$(git merge-base HEAD << pipeline.git.base_revision >>):ci/docker")
               # If no image exists but the hash is the same as the previous hash then we should error out here
               if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
                 echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
@@ -1357,7 +1342,7 @@ jobs:
               export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_DOCKER_BUILDER_V1}
               export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_DOCKER_BUILDER_V1}
               set -x
-              cd .circleci/docker && ./build_docker.sh
+              cd .ci/docker && ./build_docker.sh
 ##############################################################################
 # Workflows
 ##############################################################################
diff --git a/.circleci/scripts/binary_ios_build.sh b/.circleci/scripts/binary_ios_build.sh
index d07a1be55127..43d8bb41499d 100644
--- a/.circleci/scripts/binary_ios_build.sh
+++ b/.circleci/scripts/binary_ios_build.sh
@@ -15,7 +15,7 @@ export PATH="~/anaconda/bin:${PATH}"
 source ~/anaconda/bin/activate
 
 # Install dependencies
-conda install numpy ninja pyyaml mkl mkl-include setuptools cmake requests typing_extensions --yes
+conda install numpy ninja pyyaml mkl mkl-include setuptools cmake requests typing-extensions --yes
 conda install -c conda-forge valgrind --yes
 export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
 
diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh
index 67f97aa80ec2..632b51a2ff0c 100755
--- a/.circleci/scripts/binary_linux_test.sh
+++ b/.circleci/scripts/binary_linux_test.sh
@@ -38,8 +38,12 @@ fi
 EXTRA_CONDA_FLAGS=""
 NUMPY_PIN=""
 PROTOBUF_PACKAGE="defaults::protobuf"
+if [[ "\$python_nodot" = *311* ]]; then
+  # Numpy is yet not avaiable on default conda channel
+  EXTRA_CONDA_FLAGS="-c=malfet"
+fi
+
 if [[ "\$python_nodot" = *310* ]]; then
-  EXTRA_CONDA_FLAGS="-c=conda-forge"
   # There's an issue with conda channel priority where it'll randomly pick 1.19 over 1.20
   # we set a lower boundary here just to be safe
   NUMPY_PIN=">=1.21.2"
@@ -47,7 +51,6 @@ if [[ "\$python_nodot" = *310* ]]; then
 fi
 
 if [[ "\$python_nodot" = *39*  ]]; then
-  EXTRA_CONDA_FLAGS="-c=conda-forge"
   # There's an issue with conda channel priority where it'll randomly pick 1.19 over 1.20
   # we set a lower boundary here just to be safe
   NUMPY_PIN=">=1.20"
@@ -79,25 +82,24 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
       mkl>=2018 \
       ninja \
       typing-extensions \
-      ${PROTOBUF_PACKAGE} \
-      six
+      ${PROTOBUF_PACKAGE}
     if [[ "$DESIRED_CUDA" == 'cpu' ]]; then
       retry conda install -c pytorch -y cpuonly
     else
 
       cu_ver="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4}"
-      CUDA_PACKAGE="cudatoolkit"
-      if [[ "$DESIRED_CUDA" == "cu116" || "$DESIRED_CUDA" == "cu117" || "$DESIRED_CUDA" == "cu118" ]]; then
-        CUDA_PACKAGE="cuda"
+      CUDA_PACKAGE="pytorch-cuda"
+      PYTORCH_CHANNEL="pytorch"
+      if [[ "\${TORCH_CONDA_BUILD_FOLDER}" == "pytorch-nightly" ]]; then
+              PYTORCH_CHANNEL="pytorch-nightly"
       fi
-
-      retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch "\${CUDA_PACKAGE}=\${cu_ver}"
+      retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c "\${PYTORCH_CHANNEL}" "pytorch-cuda=\${cu_ver}"
     fi
     conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
   )
 elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
   pip install "\$pkg" --extra-index-url "https://download.pytorch.org/whl/nightly/${DESIRED_CUDA}"
-  retry pip install -q future numpy protobuf typing-extensions six
+  retry pip install -q numpy protobuf typing-extensions
 fi
 if [[ "$PACKAGE_TYPE" == libtorch ]]; then
   pkg="\$(ls /final_pkgs/*-latest.zip)"
diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh
index 7714371e2642..41dae4013594 100755
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@@ -92,11 +92,11 @@ if [[ "$PACKAGE_TYPE" == libtorch ]]; then
   POSSIBLE_JAVA_HOMES+=(/usr/lib/jvm/java-8-openjdk-amd64)
   POSSIBLE_JAVA_HOMES+=(/Library/Java/JavaVirtualMachines/*.jdk/Contents/Home)
   # Add the Windows-specific JNI path
-  POSSIBLE_JAVA_HOMES+=("$PWD/.circleci/windows-jni/")
+  POSSIBLE_JAVA_HOMES+=("$PWD/pytorch/.circleci/windows-jni/")
   for JH in "${POSSIBLE_JAVA_HOMES[@]}" ; do
     if [[ -e "$JH/include/jni.h" ]] ; then
       # Skip if we're not on Windows but haven't found a JAVA_HOME
-      if [[ "$JH" == "$PWD/.circleci/windows-jni/" && "$OSTYPE" != "msys" ]] ; then
+      if [[ "$JH" == "$PWD/pytorch/.circleci/windows-jni/" && "$OSTYPE" != "msys" ]] ; then
         break
       fi
       echo "Found jni.h under $JH"
diff --git a/.circleci/scripts/binary_windows_build.sh b/.circleci/scripts/binary_windows_build.sh
index be77e6483b7e..2394ee8b6c81 100644
--- a/.circleci/scripts/binary_windows_build.sh
+++ b/.circleci/scripts/binary_windows_build.sh
@@ -8,7 +8,7 @@ export CUDA_VERSION="${DESIRED_CUDA/cu/}"
 export USE_SCCACHE=1
 export SCCACHE_BUCKET=ossci-compiler-cache
 export SCCACHE_IGNORE_SERVER_IO_ERROR=1
-export VC_YEAR=2019
+export VC_YEAR=2022
 
 if [[ "${DESIRED_CUDA}" == *"cu11"* ]]; then
     export BUILD_SPLIT_CUDA=ON
diff --git a/.circleci/scripts/binary_windows_test.sh b/.circleci/scripts/binary_windows_test.sh
index bbf0efbb5e52..f8bebe234fb1 100644
--- a/.circleci/scripts/binary_windows_test.sh
+++ b/.circleci/scripts/binary_windows_test.sh
@@ -4,7 +4,7 @@ set -eux -o pipefail
 source "${BINARY_ENV_FILE:-/c/w/env}"
 
 export CUDA_VERSION="${DESIRED_CUDA/cu/}"
-export VC_YEAR=2019
+export VC_YEAR=2022
 
 pushd "$BUILDER_ROOT"
 
diff --git a/.circleci/scripts/functorch_doc_push_script.sh b/.circleci/scripts/functorch_doc_push_script.sh
index aed2a1c451b9..3a688568ce6f 100755
--- a/.circleci/scripts/functorch_doc_push_script.sh
+++ b/.circleci/scripts/functorch_doc_push_script.sh
@@ -7,7 +7,7 @@ sudo apt-get -y install expect-dev
 
 # This is where the local pytorch install in the docker image is located
 pt_checkout="/var/lib/jenkins/workspace"
-source "$pt_checkout/.jenkins/pytorch/common_utils.sh"
+source "$pt_checkout/.ci/pytorch/common_utils.sh"
 echo "functorch_doc_push_script.sh: Invoked with $*"
 
 set -ex
diff --git a/.circleci/scripts/python_doc_push_script.sh b/.circleci/scripts/python_doc_push_script.sh
index 07db737e0bc7..e0e6a4ec948d 100755
--- a/.circleci/scripts/python_doc_push_script.sh
+++ b/.circleci/scripts/python_doc_push_script.sh
@@ -7,7 +7,7 @@ sudo apt-get -y install expect-dev
 # This is where the local pytorch install in the docker image is located
 pt_checkout="/var/lib/jenkins/workspace"
 
-source "$pt_checkout/.jenkins/pytorch/common_utils.sh"
+source "$pt_checkout/.ci/pytorch/common_utils.sh"
 
 echo "python_doc_push_script.sh: Invoked with $*"
 
@@ -140,6 +140,7 @@ git status
 if [[ "${WITH_PUSH:-}" == true ]]; then
   # push to a temp branch first to trigger CLA check and satisfy branch protections
   git push -u origin HEAD:pytorchbot/temp-branch-py -f
+  git push -u origin HEAD^:pytorchbot/base -f
   sleep 30
   git push -u origin "${branch}"
 fi
diff --git a/.circleci/verbatim-sources/commands.yml b/.circleci/verbatim-sources/commands.yml
index 1263c4996c62..edc8f8ece1a6 100644
--- a/.circleci/verbatim-sources/commands.yml
+++ b/.circleci/verbatim-sources/commands.yml
@@ -6,7 +6,7 @@ commands:
       - run:
           name: "Calculate docker image hash"
           command: |
-            DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
+            DOCKER_TAG=$(git rev-parse HEAD:.ci/docker)
             echo "DOCKER_TAG=${DOCKER_TAG}" >> "${BASH_ENV}"
 
   designate_upload_channel:
diff --git a/.circleci/verbatim-sources/job-specs/docker_jobs.yml b/.circleci/verbatim-sources/job-specs/docker_jobs.yml
index 843986367c22..a4abd92fcac8 100644
--- a/.circleci/verbatim-sources/job-specs/docker_jobs.yml
+++ b/.circleci/verbatim-sources/job-specs/docker_jobs.yml
@@ -33,12 +33,12 @@
                 exit 0
               fi
               # Covers the case where a previous tag doesn't exist for the tree
-              # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
-              if ! git rev-parse "$(git merge-base HEAD << pipeline.git.base_revision >>):.circleci/docker"; then
-                echo "Directory '.circleci/docker' not found in tree << pipeline.git.base_revision >>, you should probably rebase onto a more recent commit"
+              # this is only really applicable on trees that don't have `.ci/docker` at its merge base, i.e. nightly
+              if ! git rev-parse "$(git merge-base HEAD << pipeline.git.base_revision >>):.ci/docker"; then
+                echo "Directory '.ci/docker' not found in tree << pipeline.git.base_revision >>, you should probably rebase onto a more recent commit"
                 exit 1
               fi
-              PREVIOUS_DOCKER_TAG=$(git rev-parse "$(git merge-base HEAD << pipeline.git.base_revision >>):.circleci/docker")
+              PREVIOUS_DOCKER_TAG=$(git rev-parse "$(git merge-base HEAD << pipeline.git.base_revision >>):ci/docker")
               # If no image exists but the hash is the same as the previous hash then we should error out here
               if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
                 echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
@@ -53,4 +53,4 @@
               export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_DOCKER_BUILDER_V1}
               export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_DOCKER_BUILDER_V1}
               set -x
-              cd .circleci/docker && ./build_docker.sh
+              cd .ci/docker && ./build_docker.sh
diff --git a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml
index 9271cd57de1a..f03e173ccece 100644
--- a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml
+++ b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml
@@ -51,8 +51,8 @@
             export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}
             set -x
 
-            chmod a+x .jenkins/pytorch/macos-build.sh
-            unbuffer .jenkins/pytorch/macos-build.sh 2>&1 | ts
+            chmod a+x .ci/pytorch/macos-build.sh
+            unbuffer .ci/pytorch/macos-build.sh 2>&1 | ts
 
       - persist_to_workspace:
           root: /Users/distiller/workspace/
@@ -87,8 +87,8 @@
             export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}
             set -x
 
-            chmod a+x .jenkins/pytorch/macos-build.sh
-            unbuffer .jenkins/pytorch/macos-build.sh 2>&1 | ts
+            chmod a+x .ci/pytorch/macos-build.sh
+            unbuffer .ci/pytorch/macos-build.sh 2>&1 | ts
 
       - persist_to_workspace:
           root: /Users/distiller/workspace/
@@ -169,7 +169,7 @@
             brew link --force libomp
 
             echo "export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${BASH_ENV}"
-            .jenkins/pytorch/macos-build.sh
+            .ci/pytorch/macos-build.sh
 
       - when:
           condition: << parameters.build-generates-artifacts >>
@@ -252,7 +252,7 @@
             export COMMIT_MESSAGES="${COMMIT_MESSAGES//[\'\"]}"
 
             python3 -mpip install dist/*.whl
-            .jenkins/pytorch/macos-test.sh
+            .ci/pytorch/macos-test.sh
       - run:
           name: Copy files for uploading test stats
           command: |
@@ -282,7 +282,7 @@
               exit 0
             fi
             cp -r ~/workspace/test-reports/* ~/project
-            pip3 install requests==2.26 rockset==0.8.3 boto3==1.19.12 six==1.16.0
+            pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
             export AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_FOR_OSSCI_ARTIFACT_UPLOAD}
             export AWS_SECRET_ACCESS_KEY=${AWS_SECRET_KEY_FOR_OSSCI_ARTIFACT_UPLOAD}
             # i dont know how to get the run attempt number for reruns so default to 1
@@ -304,23 +304,8 @@
             set -e
             export JOB_BASE_NAME=$CIRCLE_JOB
 
-            chmod a+x .jenkins/pytorch/macos-test.sh
-            unbuffer .jenkins/pytorch/macos-test.sh 2>&1 | ts
-      - run:
-          name: Report results
-          no_output_timeout: "5m"
-          command: |
-            set -ex
-            source /Users/distiller/workspace/miniconda3/bin/activate
-            python3 -m pip install boto3==1.19.12
-
-            export JOB_BASE_NAME=$CIRCLE_JOB
-
-            # Using the same IAM user to write stats to our OSS bucket
-            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4}
-            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}
-            python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
-          when: always
+            chmod a+x .ci/pytorch/macos-test.sh
+            unbuffer .ci/pytorch/macos-test.sh 2>&1 | ts
       - store_test_results:
           path: test/test-reports
 
@@ -341,8 +326,8 @@
             set -e
             export BUILD_LITE_INTERPRETER=1
             export JOB_BASE_NAME=$CIRCLE_JOB
-            chmod a+x ${HOME}/project/.jenkins/pytorch/macos-lite-interpreter-build-test.sh
-            unbuffer ${HOME}/project/.jenkins/pytorch/macos-lite-interpreter-build-test.sh 2>&1 | ts
+            chmod a+x ${HOME}/project/.ci/pytorch/macos-lite-interpreter-build-test.sh
+            unbuffer ${HOME}/project/.ci/pytorch/macos-lite-interpreter-build-test.sh 2>&1 | ts
       - store_test_results:
           path: test/test-reports
 
@@ -577,7 +562,7 @@
                 $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
             }
 
-            retry conda install numpy ninja pyyaml mkl mkl-include setuptools cmake requests typing_extensions --yes
+            retry conda install numpy ninja pyyaml mkl mkl-include setuptools cmake requests typing-extensions --yes
 
             # sync submodules
             cd ${PROJ_ROOT}
@@ -641,7 +626,7 @@
             cd ${PROJ_ROOT}/ios/TestApp/benchmark
             mkdir -p ../models
             if [ ${USE_COREML_DELEGATE} == 1 ]; then
-              pip install coremltools==5.0b5 protobuf==3.20.1 six==1.16.0
+              pip install coremltools==5.0b5 protobuf==3.20.1
               python coreml_backend.py
             else
               cd "${PROJ_ROOT}"
@@ -691,7 +676,7 @@
 
           docker cp /home/circleci/project/. $id:/var/lib/jenkins/workspace
 
-          export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/build.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+          export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .ci/pytorch/build.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
 
           echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
 
@@ -737,9 +722,9 @@
           trap "retrieve_test_reports" ERR
 
           if [[ ${BUILD_ENVIRONMENT} == *"multigpu"* ]]; then
-            export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/multigpu-test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+            export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .ci/pytorch/multigpu-test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
           else
-            export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .jenkins/pytorch/test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+            export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .ci/pytorch/test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
           fi
           echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
 
diff --git a/.clang-tidy b/.clang-tidy
index ec43eca88f2e..8c4a341b5185 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -3,11 +3,14 @@
 InheritParentConfig: true
 Checks: '
 bugprone-*,
+-bugprone-easily-swappable-parameters,
 -bugprone-forward-declaration-namespace,
 -bugprone-macro-parentheses,
 -bugprone-lambda-function-name,
 -bugprone-reserved-identifier,
+-bugprone-swapped-arguments,
 cppcoreguidelines-*,
+-cppcoreguidelines-avoid-do-while,
 -cppcoreguidelines-avoid-magic-numbers,
 -cppcoreguidelines-avoid-non-const-global-variables,
 -cppcoreguidelines-interfaces-global-init,
@@ -26,8 +29,11 @@ cppcoreguidelines-*,
 -facebook-hte-RelativeInclude,
 hicpp-exception-baseclass,
 hicpp-avoid-goto,
+misc-unused-alias-decls,
+misc-unused-using-decls,
 modernize-*,
 -modernize-concat-nested-namespaces,
+-modernize-macro-to-enum,
 -modernize-return-braced-init-list,
 -modernize-use-auto,
 -modernize-use-default-member-init,
@@ -37,9 +43,9 @@ modernize-*,
 performance-*,
 -performance-noexcept-move-constructor,
 -performance-unnecessary-value-param,
+readability-container-size-empty,
 '
 HeaderFilterRegex: '^(c10/(?!test)|torch/csrc/(?!deploy/interpreter/cpython)).*$'
 AnalyzeTemporaryDtors: false
 WarningsAsErrors: '*'
-CheckOptions:
 ...
diff --git a/.flake8 b/.flake8
index a16d89827371..2fcff14109b5 100644
--- a/.flake8
+++ b/.flake8
@@ -1,5 +1,6 @@
 [flake8]
-select = B,C,E,F,P,T4,W,B9
+enable-extensions = G
+select = B,C,E,F,G,P,T4,W,B9
 max-line-length = 120
 # C408 ignored because we like the dict keyword argument syntax
 # E501 is not flexible enough, we're using B950 instead
@@ -11,7 +12,9 @@ ignore =
     # these ignores are from flake8-bugbear; please fix!
     B007,B008,
     # these ignores are from flake8-comprehensions; please fix!
-    C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415
+    C407
+    # these ignores are from flake8-logging-format; please fix!
+    G001,G002,G003,G004,G100,G101,G200,G201,G202
 per-file-ignores =
     __init__.py: F401
     torch/utils/cpp_extension.py: B950
diff --git a/.github/ISSUE_TEMPLATE/pt2-bug-report.yml b/.github/ISSUE_TEMPLATE/pt2-bug-report.yml
new file mode 100644
index 000000000000..36fe4b592aec
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/pt2-bug-report.yml
@@ -0,0 +1,61 @@
+name: 🐛 torch.compile Bug Report
+description: Create a report to help us reproduce and fix the bug
+labels: ["oncall: pt2"]
+
+body:
+  - type: markdown
+    attributes:
+      value: >
+        #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the
+        existing and past issues](https://github.com/pytorch/pytorch/issues)
+        It's likely that your bug will be resolved by checking our FAQ or troubleshooting guide [documentation](https://pytorch.org/docs/master/dynamo/index.html)
+  - type: textarea
+    attributes:
+      label: 🐛 Describe the bug
+      description: |
+        Please provide a clear and concise description of what the bug is.
+      placeholder: |
+        A clear and concise description of what the bug is.
+    validations:
+      required: false
+
+  - type: textarea
+    attributes:
+      label: Error logs
+      description: |
+        Please provide the error you're seeing
+      placeholder: |
+        Error...
+    validations:
+      required: false
+  - type: textarea
+    attributes:
+      label: Minified repro
+      description: |
+        Please run the minifier on your example and paste the minified code below
+        Learn more here https://pytorch.org/docs/master/dynamo/troubleshooting.html
+      placeholder: |
+        env TORCHDYNAMO_REPRO_AFTER="aot" python your_model.py
+        or
+        env TORCHDYNAMO_REPRO_AFTER="dynamo" python your_model.py
+
+        import torch
+        ...
+
+        # torch version: 2.0.....
+
+        class Repro(torch.nn.Module)
+    validations:
+      required: false
+  - type: textarea
+    attributes:
+      label: Versions
+      description: |
+        Please run the following and paste the output below.
+        ```sh
+        wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py
+        # For security purposes, please check the contents of collect_env.py before running it.
+        python collect_env.py
+        ```
+    validations:
+      required: true
diff --git a/.github/actions/build-android/action.yml b/.github/actions/build-android/action.yml
index b5f2018831ce..3bfe28e4c7bb 100644
--- a/.github/actions/build-android/action.yml
+++ b/.github/actions/build-android/action.yml
@@ -68,7 +68,7 @@ runs:
         )
         git submodule sync && git submodule update -q --init --recursive --depth 1
         docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace"
-        (echo "sudo chown -R jenkins . && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete" | docker exec -u jenkins -i "${container_name}" bash) 2>&1
+        (echo "sudo chown -R jenkins . && .ci/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete" | docker exec -u jenkins -i "${container_name}" bash) 2>&1
 
         # Copy install binaries back
         mkdir -p "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}"
diff --git a/.github/actions/calculate-docker-image/action.yml b/.github/actions/calculate-docker-image/action.yml
index 7ddfdfa1ef0b..b7531cb182b5 100644
--- a/.github/actions/calculate-docker-image/action.yml
+++ b/.github/actions/calculate-docker-image/action.yml
@@ -24,9 +24,6 @@ inputs:
   force_push:
     description: If set to any value, always run the push
     required: false
-  push-ghcr-image:
-    description: If set to any value, push docker image to the ghcr.io.
-    required: false
 
 outputs:
   docker-image:
@@ -41,16 +38,16 @@ runs:
       id: calculate-tag
       env:
         IS_XLA: ${{ inputs.xla == 'true' && 'true' || '' }}
-        XLA_IMAGE_TAG: v0.6
+        XLA_IMAGE_TAG: v1.0
         DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ inputs.docker-image-name }}
       run: |
         if [ -n "${IS_XLA}" ]; then
           echo "XLA workflow uses pre-built test image at ${XLA_IMAGE_TAG}"
-          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
+          DOCKER_TAG=$(git rev-parse HEAD:.ci/docker)
           echo "docker-tag=${DOCKER_TAG}" >> "${GITHUB_OUTPUT}"
           echo "docker-image=${DOCKER_IMAGE_BASE}:${XLA_IMAGE_TAG}" >> "${GITHUB_OUTPUT}"
         else
-          DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
+          DOCKER_TAG=$(git rev-parse HEAD:.ci/docker)
           echo "docker-tag=${DOCKER_TAG}" >> "${GITHUB_OUTPUT}"
           echo "docker-image=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_OUTPUT}"
         fi
@@ -78,12 +75,12 @@ runs:
           MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION")
         fi
         # Covers the case where a previous tag doesn't exist for the tree
-        # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly
-        if ! git rev-parse "$MERGE_BASE:.circleci/docker"; then
-          echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
+        # this is only really applicable on trees that don't have `.ci/docker` at its merge base, i.e. nightly
+        if ! git rev-parse "$MERGE_BASE:.ci/docker"; then
+          echo "Directory '.ci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit"
           exit 1
         fi
-        PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker")
+        PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.ci/docker")
         # If no image exists but the hash is the same as the previous hash then we should error out here
         if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
           echo "WARNING: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
@@ -106,9 +103,7 @@ runs:
         # Skip push if we don't need it, or if specified in the inputs
         DOCKER_SKIP_PUSH: ${{ steps.check.outputs.skip_push || inputs.skip_push }}
         DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker-tag }}
-        PUSH_GHCR_IMAGE: ${{ inputs.push-ghcr-image }}
-        GHCR_PAT: ${{ env.GHCR_PAT }}
-      working-directory: .circleci/docker
+      working-directory: .ci/docker
       shell: bash
       run: |
         ./build_docker.sh
diff --git a/.github/actions/diskspace-cleanup/action.yml b/.github/actions/diskspace-cleanup/action.yml
new file mode 100644
index 000000000000..9b7ea7992331
--- /dev/null
+++ b/.github/actions/diskspace-cleanup/action.yml
@@ -0,0 +1,31 @@
+name: Cleans up diskspace
+
+description: Cleans up diskspace if the root directory has used more than seventy percent of your diskspace.
+
+inputs:
+    diskspace-cutoff:
+        description: The percent amount after which docker prune is run.
+        required: true
+        default: 70
+
+runs:
+  using: composite
+  steps:
+    - name: Cleans up diskspace
+      shell: bash
+      run: |
+        diskspace_cutoff=${{ inputs.diskspace-cutoff }}
+        diskspace=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
+        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
+        if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then
+            docker system prune -af
+            diskspace_new=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
+            if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
+                echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
+                echo "$msg"
+                exit 1
+            else
+                difference=$((diskspace - diskspace_new))
+                echo "Diskspace saved: $difference percent"
+            fi
+        fi
diff --git a/.github/actions/filter-test-configs/action.yml b/.github/actions/filter-test-configs/action.yml
index fbdc5c8761b2..4c607313ddf1 100644
--- a/.github/actions/filter-test-configs/action.yml
+++ b/.github/actions/filter-test-configs/action.yml
@@ -46,13 +46,43 @@ runs:
       id: parse-ref
       run: .github/scripts/parse_ref.py
 
+    - name: Get the job name
+      id: get-job-name
+      continue-on-error: true
+      shell: bash
+      run: |
+        set -x
+
+        # TODO: This is a very hacky way to get the job name. GitHub runner has the info
+        # but doesn't expose it in anyway. The job name is part of the job message the
+        # runner receives, so it's there and printed out to the diag log. Below is the
+        # code responsible for printing it. Need to check with GitHub to see if they can
+        # expose this variable as part of GitHub context.
+        # https://github.com/actions/runner/blob/main/src/Runner.Worker/JobExtension.cs#L345
+        pushd "${{ runner.workspace }}/../../_diag"
+        pwd
+
+        LOG_FILE=$(grep -l -r "${{ github.sha }}" *.log | tail -n 1)
+        if [ -n "${LOG_FILE}" ]; then
+          JOB_NAME=$(grep -r "\"jobDisplayName\"" "${LOG_FILE}" | awk -F '[:,]' '{print $2}' | sed 's/"//g' | xargs)
+          echo "job-name=${JOB_NAME}" >> "${GITHUB_OUTPUT}"
+        fi
+
+        popd
+
     - name: Select all requested test configurations
       shell: bash
       env:
         GITHUB_TOKEN: ${{ inputs.github-token }}
+        JOB_NAME: ${{ steps.get-job-name.outputs.job-name }}
       id: filter
       run: |
+        echo "Workflow: ${GITHUB_WORKFLOW}"
+        echo "Job name: ${JOB_NAME}"
+
         .github/scripts/filter_test_configs.py \
+          --workflow "${GITHUB_WORKFLOW}" \
+          --job-name "${JOB_NAME}" \
           --test-matrix "${{ inputs.test-matrix }}" \
           --pr-number "${{ github.event.pull_request.number }}" \
           --tag "${{ steps.parse-ref.outputs.tag }}" \
diff --git a/.github/actions/get-workflow-job-id/action.yml b/.github/actions/get-workflow-job-id/action.yml
index b57ce8993acc..be202c960c77 100644
--- a/.github/actions/get-workflow-job-id/action.yml
+++ b/.github/actions/get-workflow-job-id/action.yml
@@ -19,6 +19,7 @@ runs:
       # timeout-minutes is unsupported for composite workflows, see https://github.com/actions/runner/issues/1979
       # timeout-minutes: 10
       shell: bash
+      id: get-job-id
       run: |
         set -eux
         GHA_WORKFLOW_JOB_ID=$(python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}")
diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml
index 38b5444d987f..5ff2b9a9a59b 100644
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@@ -30,16 +30,20 @@ runs:
         fi
 
     - name: Log in to ECR
-      shell: bash
+      uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
       env:
         AWS_RETRY_MODE: standard
         AWS_MAX_ATTEMPTS: "5"
         AWS_DEFAULT_REGION: us-east-1
-      run: |
-        AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
-        retry () { "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@") }
-        retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
-            --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
+      with:
+        shell: bash
+        timeout_minutes: 5
+        max_attempts: 3
+        retry_wait_seconds: 30
+        command: |
+          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
+          aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
+              --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
 
     - name: Preserve github env variables for use in docker
       shell: bash
diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml
index 70bccf648539..b9833480954b 100644
--- a/.github/actions/setup-rocm/action.yml
+++ b/.github/actions/setup-rocm/action.yml
@@ -57,6 +57,10 @@ runs:
             exit 1
         fi
 
+    - name: Runner diskspace health check
+      uses: ./.github/actions/diskspace-cleanup
+      if: always()
+
     - name: Runner health check disconnect on failure
       if: ${{ failure() }}
       shell: bash
diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml
index e1b17a3e8c35..84537811c8ba 100644
--- a/.github/actions/setup-win/action.yml
+++ b/.github/actions/setup-win/action.yml
@@ -59,7 +59,9 @@ runs:
 
         if [[ "${EXIT_CODE}" == "0" ]]; then
           echo "Found Python3 at ${PYTHON3}, adding it into GITHUB_PATH"
-          echo "${PYTHON3}" >> "${GITHUB_PATH}"
+
+          PYTHON_PATH=$(dirname "${PYTHON3}")
+          echo "${PYTHON_PATH}" >> "${GITHUB_PATH}"
         else
           # According to https://docs.conda.io/en/latest/miniconda.html, we are using the Miniconda3
           # installation, which is Python 3 based. Its Python is default to Python 3. Further, there
diff --git a/.github/actions/teardown-rocm/action.yml b/.github/actions/teardown-rocm/action.yml
index f2eca13b124f..3d674a35bfd0 100644
--- a/.github/actions/teardown-rocm/action.yml
+++ b/.github/actions/teardown-rocm/action.yml
@@ -14,13 +14,6 @@ runs:
         docker stop $(docker ps -q) || true
         # Prune all stopped containers.
         docker container prune -f
-        # Prune everything docker if there are more than 10 images (~200GB).
-        # This is easier than using a time filter, e.g., "until=24h".
-        # Might fail if a prune is already in progress by another runner.
-        image_count=$(docker images | wc -l)
-        if [[ ${image_count} -gt 10 ]]; then
-            echo "Purging all docker caches"
-            docker system prune -af || true
-        else
-            echo "Will not purge docker, only ${image_count} images found"
-        fi
+    - name: Runner diskspace health check
+      uses: ./.github/actions/diskspace-cleanup
+      if: always()
diff --git a/.github/actions/upload-test-artifacts/action.yml b/.github/actions/upload-test-artifacts/action.yml
index d33316491194..abb4de6c015e 100644
--- a/.github/actions/upload-test-artifacts/action.yml
+++ b/.github/actions/upload-test-artifacts/action.yml
@@ -74,6 +74,7 @@ runs:
 
     - name: Zip usage log for upload
       if: runner.os == 'Windows' && !inputs.use-gha
+      continue-on-error: true
       shell: powershell
       env:
         FILE_SUFFIX: ${{ inputs.file-suffix }}
@@ -105,6 +106,7 @@ runs:
     - name: Store Usage Logs on S3
       uses: seemethere/upload-artifact-s3@v5
       if: ${{ !inputs.use-gha }}
+      continue-on-error: true
       with:
         s3-prefix: |
           ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
@@ -116,6 +118,7 @@ runs:
     - name: Store Test Downloaded JSONs on Github
       uses: actions/upload-artifact@v3
       if: inputs.use-gha
+      continue-on-error: true
       with:
         # Add the run attempt, see [Artifact run attempt]
         name: test-jsons-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip
@@ -126,6 +129,7 @@ runs:
     - name: Store Test Reports on Github
       uses: actions/upload-artifact@v3
       if: inputs.use-gha
+      continue-on-error: true
       with:
         # Add the run attempt, see [Artifact run attempt]
         name: test-reports-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip
@@ -139,6 +143,7 @@ runs:
     - name: Store Usage Logs on Github
       uses: actions/upload-artifact@v3
       if: inputs.use-gha
+      continue-on-error: true
       with:
         # Add the run attempt, see [Artifact run attempt]
         name: usage-log-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip
@@ -147,4 +152,3 @@ runs:
         path: |
           usage_log.txt
           test/**/*.log
-      continue-on-error: true
diff --git a/.github/auto_request_review.yml b/.github/auto_request_review.yml
index 76bb77c2cc74..765fd1715e89 100644
--- a/.github/auto_request_review.yml
+++ b/.github/auto_request_review.yml
@@ -8,7 +8,6 @@ reviewers:
       - miladm
       - bdhirsh
       - voznesenskym
-      - SherlockNoMad
       - jbschlosser
 
   per_author:
@@ -16,6 +15,7 @@ reviewers:
       - symbolic-shapes
       - antoniojkim
       - wconstab
+      - SherlockNoMad
 
 files:
   # none yet, TODO: migrate CODEOWNERS here
diff --git a/.github/ci_commit_pins/triton.txt b/.github/ci_commit_pins/triton.txt
deleted file mode 100644
index 7c5e80098f7b..000000000000
--- a/.github/ci_commit_pins/triton.txt
+++ /dev/null
@@ -1 +0,0 @@
-0d7e7532279e45672555e344646f5c19c3972331
diff --git a/.github/ci_commit_pins/triton.txt b/.github/ci_commit_pins/triton.txt
new file mode 120000
index 000000000000..7b62e01173b3
--- /dev/null
+++ b/.github/ci_commit_pins/triton.txt
@@ -0,0 +1 @@
+../../.ci/docker/ci_commit_pins/triton.txt
\ No newline at end of file
diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index 7cd9c0f239a2..79d951495603 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-d2d448c71b4cb054d160000a0f63eecad7867bdb
+120e7af6466190b754cf3026c685a5d31561da90
diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 90134683361a..98bb737eb271 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-eac4e547138ab22a9b41c6f96208613fd7dd19d5
+503401a24e532a9019ef140199319221294045ee
diff --git a/.github/labeler.yml b/.github/labeler.yml
index 14f176546256..def7fb42441d 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -16,6 +16,7 @@
 - torch/_subclasses/fake_utils.py
 - torch/_subclasses/meta_utils.py
 - test/distributed/test_dynamo_distributed.py
+- test/distributed/test_traceable_collectives.py
 - functorch/_src/partitioners.py
 - functorch/_src/aot_autograd.py
 
diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml
index 7e775da32ead..b1e267f3b24d 100644
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@@ -1,7 +1,7 @@
 - name: ONNX exporter
   patterns:
-  - .jenkins/caffe2/*
-  - .jenkins/onnx/*
+  - .ci/caffe2/*
+  - .ci/onnx/*
   - aten/src/ATen/core/interned_strings.h
   - docs/source/onnx.rst
   - docs/source/onnx*
@@ -52,7 +52,7 @@
   patterns:
   - .github/**
   - .circleci/**
-  - .jenkins/**
+  - .ci/**
   - scripts/**
   - tools/**
   approved_by:
@@ -191,6 +191,7 @@
   - alband
   - malfet
   - razarmehr
+  - DenisVieriu97
   mandatory_checks_name:
   - EasyCLA
   - Lint
diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
index 627b2648ad42..dafa081dabb2 100644
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@@ -1,2 +1,14 @@
 tracking_issue: 24422
 ciflow_tracking_issue: 64124
+ciflow_push_tags:
+- ciflow/binaries
+- ciflow/binaries_conda
+- ciflow/binaries_libtorch
+- ciflow/binaries_wheel
+- ciflow/inductor
+- ciflow/inductor-perf-test-nightly
+- ciflow/mps
+- ciflow/nightly
+- ciflow/periodic
+- ciflow/trunk
+- ciflow/unstable
diff --git a/.github/requirements-gha-cache.txt b/.github/requirements-gha-cache.txt
index 7b45c61c1815..822a6fdde457 100644
--- a/.github/requirements-gha-cache.txt
+++ b/.github/requirements-gha-cache.txt
@@ -4,13 +4,13 @@
 #   docs/requirements.txt
 #   docs/cpp/requirements.txt
 #   functorch/docs/requirements.txt
-#   .circleci/docker/requirements-ci.txt
+#   .ci/docker/requirements-ci.txt
 boto3==1.19.12
 jinja2==3.0.1
 lintrunner==0.9.2
 ninja==1.10.0.post1
-pynvml==11.4.1
+nvidia-ml-py==11.525.84
 pyyaml==6.0
 requests==2.26
 rich==10.9.0
-rockset==0.8.10
+rockset==1.0.3
diff --git a/.github/requirements/conda-env-Linux-X64 b/.github/requirements/conda-env-Linux-X64
index 8ab2a4211972..43afafcd2601 100644
--- a/.github/requirements/conda-env-Linux-X64
+++ b/.github/requirements/conda-env-Linux-X64
@@ -6,4 +6,4 @@ numpy=1.23.3
 pyyaml=6.0
 requests=2.28.1
 setuptools=65.5.0
-typing_extensions=4.3.0
+typing-extensions=4.3.0
diff --git a/.github/requirements/conda-env-iOS b/.github/requirements/conda-env-iOS
index b38dcc77a30f..722e1fe11b60 100644
--- a/.github/requirements/conda-env-iOS
+++ b/.github/requirements/conda-env-iOS
@@ -7,4 +7,4 @@ numpy=1.23.3
 pyyaml=6.0
 requests=2.28.1
 setuptools=63.4.1
-typing_extensions=4.3.0
+typing-extensions=4.3.0
diff --git a/.github/requirements/conda-env-macOS-ARM64 b/.github/requirements/conda-env-macOS-ARM64
index dbcd3647d97c..b467a7b04bca 100644
--- a/.github/requirements/conda-env-macOS-ARM64
+++ b/.github/requirements/conda-env-macOS-ARM64
@@ -2,10 +2,9 @@ numpy=1.22.3
 pyyaml=6.0
 setuptools=61.2.0
 cmake=3.22.*
-typing_extensions=4.3.0
+typing-extensions=4.3.0
 dataclasses=0.8
 pip=22.2.2
-six=1.16.0
 pillow=9.2.0
 pkg-config=0.29.2
 wheel=0.37.1
diff --git a/.github/requirements/conda-env-macOS-X64 b/.github/requirements/conda-env-macOS-X64
index 2bddda13e17d..a22e6c4f3d86 100644
--- a/.github/requirements/conda-env-macOS-X64
+++ b/.github/requirements/conda-env-macOS-X64
@@ -4,10 +4,9 @@ numpy=1.18.5
 pyyaml=5.3
 setuptools=46.0.0
 cmake=3.22.*
-typing_extensions=4.3.0
+typing-extensions=4.3.0
 dataclasses=0.8
 pip=22.2.2
-six=1.16.0
 pillow=9.2.0
 libuv=1.40.0
 pkg-config=0.29.2
diff --git a/.github/requirements/pip-requirements-iOS.txt b/.github/requirements/pip-requirements-iOS.txt
index 773be0edd9fa..0befad884283 100644
--- a/.github/requirements/pip-requirements-iOS.txt
+++ b/.github/requirements/pip-requirements-iOS.txt
@@ -1,4 +1,3 @@
 # iOS simulator requirements
 coremltools==5.0b5
 protobuf==3.20.2
-six==1.16.0
diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt
index d101e584d35b..c82ff53e0cea 100644
--- a/.github/requirements/pip-requirements-macOS.txt
+++ b/.github/requirements/pip-requirements-macOS.txt
@@ -9,7 +9,7 @@ numba==0.56.0; platform_machine == "arm64"
 numba<=0.49.1; platform_machine != "arm64"
 opt-einsum>=3.3
 psutil==5.9.1
-pynvml==11.4.1
+nvidia-ml-py==11.525.84
 pygments==2.12.0
 pytest==7.2.0
 pytest-xdist==3.0.2
@@ -20,3 +20,4 @@ scipy==1.9.0
 sympy==1.11.1
 unittest-xml-reporting<=3.2.0,>=2.0.0
 xdoctest==1.1.0
+filelock==3.6.0
diff --git a/.github/requirements/regenerate-requirements.txt b/.github/requirements/regenerate-requirements.txt
index 3265f34b3181..a7557e90a856 100644
--- a/.github/requirements/regenerate-requirements.txt
+++ b/.github/requirements/regenerate-requirements.txt
@@ -1,2 +1,2 @@
-typing_extensions
+typing-extensions
 jinja2
diff --git a/.github/requirements/triton-requirements-rocm.txt b/.github/requirements/triton-requirements-rocm.txt
new file mode 100644
index 000000000000..031e933f2434
--- /dev/null
+++ b/.github/requirements/triton-requirements-rocm.txt
@@ -0,0 +1 @@
+pytorch-triton-rocm>=2.0.0.dev
\ No newline at end of file
diff --git a/.github/scripts/README.md b/.github/scripts/README.md
index 0d62609f4682..bc7dc87ac9e5 100644
--- a/.github/scripts/README.md
+++ b/.github/scripts/README.md
@@ -61,5 +61,6 @@ New runner types can be added by committing changes to `.github/scale-config.yml
 
 In order to test changes to the builder scripts:
 
-1. Specify your builder PR's branch and repo as `builder_repo` and  `builder_branch` in [`.github/templates/common.yml.j2`](https://github.com/pytorch/pytorch/blob/32356aaee6a77e0ae424435a7e9da3d99e7a4ca5/.github/templates/common.yml.j2#LL10C26-L10C32). 2. Regenerate workflow files with `.github/regenerate.sh` (see above).
+1. Specify your builder PR's branch and repo as `builder_repo` and  `builder_branch` in [`.github/templates/common.yml.j2`](https://github.com/pytorch/pytorch/blob/32356aaee6a77e0ae424435a7e9da3d99e7a4ca5/.github/templates/common.yml.j2#LL10C26-L10C32).
+2. Regenerate workflow files with `.github/regenerate.sh` (see above).
 3. Submit fake PR to PyTorch. If changing binaries build, add an appropriate label like `ciflow/binaries` to trigger the builds.
diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py
index 5380b5ffee38..f305dc4105d3 100644
--- a/.github/scripts/build_triton_wheel.py
+++ b/.github/scripts/build_triton_wheel.py
@@ -49,7 +49,8 @@ def build_triton(commit_hash: str, build_conda: bool = False, py_version : Optio
 
             if py_version is None:
                 py_version = f"{sys.version_info.major}.{sys.version_info.minor}"
-            check_call(["conda", "build", "--python", py_version, "--output-folder", tmpdir, "."], cwd=triton_basedir)
+            check_call(["conda", "build", "--python", py_version,
+                        "-c", "pytorch-nightly", "--output-folder", tmpdir, "."], cwd=triton_basedir)
             conda_path = list(Path(tmpdir).glob("linux-64/torchtriton*.bz2"))[0]
             shutil.copy(conda_path, Path.cwd())
             return Path.cwd() / conda_path.name
diff --git a/.github/scripts/check_labels.py b/.github/scripts/check_labels.py
index 2d4a216daf94..df2e1ef1c451 100755
--- a/.github/scripts/check_labels.py
+++ b/.github/scripts/check_labels.py
@@ -1,61 +1,34 @@
 #!/usr/bin/env python3
-"""check_labels.py"""
+"""Check whether a PR has required labels."""
 
-from typing import Any, List
+from typing import Any
 
-from export_pytorch_labels import get_pytorch_labels
 from gitutils import (
     get_git_remote_name,
     get_git_repo_dir,
     GitRepo,
 )
-from trymerge import (
-    _fetch_url,
+from trymerge import GitHubPR
+from github_utils import (
+    gh_delete_comment,
     gh_post_pr_comment,
-    GitHubPR,
 )
-
-
-BOT_AUTHORS = ["github-actions", "pytorchmergebot", "pytorch-bot"]
-
-ERR_MSG_TITLE = "This PR needs a label"
-ERR_MSG = (
-    f"# {ERR_MSG_TITLE}\n"
-    "If your changes are user facing and intended to be a part of release notes, please use a label starting with `release notes:`.\n\n"  # noqa: E501  pylint: disable=line-too-long
-    "If not, please add the `topic: not user facing` label.\n\n"
-    "For more information, see https://github.com/pytorch/pytorch/wiki/PyTorch-AutoLabel-Bot#why-categorize-for-release-notes-and-how-does-it-work."  # noqa: E501  pylint: disable=line-too-long
+from label_utils import (
+    LABEL_ERR_MSG,
+    is_label_err_comment,
+    has_required_labels,
 )
 
-
-def get_release_notes_labels() -> List[str]:
-    return [label for label in get_pytorch_labels() if label.lstrip().startswith("release notes:")]
-
-
-def delete_comment(comment_id: int) -> None:
-    url = f"https://api.github.com/repos/pytorch/pytorch/issues/comments/{comment_id}"
-    _fetch_url(url, method="DELETE")
-
-
-def has_required_labels(pr: GitHubPR) -> bool:
-    pr_labels = pr.get_labels()
-    # Check if PR is not user facing
-    is_not_user_facing_pr = any(label.strip() == "topic: not user facing" for label in pr_labels)
-    return is_not_user_facing_pr or any(label.strip() in get_release_notes_labels() for label in pr_labels)
-
-
-def delete_comments(pr: GitHubPR) -> None:
-    # Delete all previous comments
+def delete_all_label_err_comments(pr: "GitHubPR") -> None:
     for comment in pr.get_comments():
-        if comment.body_text.lstrip(" #").startswith(ERR_MSG_TITLE) and comment.author_login in BOT_AUTHORS:
-            delete_comment(comment.database_id)
+        if is_label_err_comment(comment):
+            gh_delete_comment(pr.org, pr.project, comment.database_id)
 
 
-def add_comment(pr: GitHubPR) -> None:
+def add_label_err_comment(pr: "GitHubPR") -> None:
     # Only make a comment if one doesn't exist already
-    for comment in pr.get_comments():
-        if comment.body_text.lstrip(" #").startswith(ERR_MSG_TITLE) and comment.author_login in BOT_AUTHORS:
-            return
-    gh_post_pr_comment(pr.org, pr.project, pr.pr_num, ERR_MSG)
+    if not any(is_label_err_comment(comment) for comment in pr.get_comments()):
+        gh_post_pr_comment(pr.org, pr.project, pr.pr_num, LABEL_ERR_MSG)
 
 
 def parse_args() -> Any:
@@ -72,16 +45,19 @@ def main() -> None:
     org, project = repo.gh_owner_and_name()
     pr = GitHubPR(org, project, args.pr_num)
 
+    exit_code = 0
     try:
         if not has_required_labels(pr):
-            print(ERR_MSG)
-            add_comment(pr)
-            exit(1)
+            exit_code = 1
+            print(LABEL_ERR_MSG)
+            add_label_err_comment(pr)
         else:
-            delete_comments(pr)
+            delete_all_label_err_comments(pr)
     except Exception as e:
         pass
 
+    exit(exit_code)
+
 
 if __name__ == "__main__":
     main()
diff --git a/.github/scripts/collect_ciflow_labels.py b/.github/scripts/collect_ciflow_labels.py
new file mode 100755
index 000000000000..16cf1f3d5503
--- /dev/null
+++ b/.github/scripts/collect_ciflow_labels.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+from pathlib import Path
+from typing import Any, Dict, List, Set, cast
+import yaml
+import sys
+
+GITHUB_DIR = Path(__file__).parent.parent
+
+def get_workflows_push_tags() -> Set[str]:
+    "Extract all known push tags from workflows"
+    rc: Set[str] = set()
+    for fname in (GITHUB_DIR / "workflows").glob("*.yml"):
+        with fname.open("r") as f:
+            wf_yml = yaml.safe_load(f)
+        # "on" is alias to True in yaml
+        on_tag = wf_yml.get(True, None)
+        push_tag = on_tag.get("push", None) if isinstance(on_tag, dict) else None
+        tags_tag = push_tag.get("tags", None) if isinstance(push_tag, dict) else None
+        if isinstance(tags_tag, list):
+            rc.update(tags_tag)
+    return rc
+
+
+def filter_ciflow_tags(tags: Set[str]) -> List[str]:
+    " Return sorted list of ciflow tags"
+    return sorted(tag[:-2] for tag in tags if tag.startswith("ciflow/") and tag.endswith("/*"))
+
+
+def read_probot_config() -> Dict[str, Any]:
+    with (GITHUB_DIR / "pytorch-probot.yml").open("r") as f:
+        return cast(Dict[str, Any], yaml.safe_load(f))
+
+
+def update_probot_config(labels: Set[str]) -> None:
+    orig = read_probot_config()
+    orig["ciflow_push_tags"] = filter_ciflow_tags(labels)
+    with (GITHUB_DIR / "pytorch-probot.yml").open("w") as f:
+        yaml.dump(orig, f, indent=4, sort_keys=False)
+
+
+if __name__ == "__main__":
+    from argparse import ArgumentParser
+    parser = ArgumentParser("Validate or update list of tags")
+    parser.add_argument("--validate-tags", action="store_true")
+    args = parser.parse_args()
+    pushtags = get_workflows_push_tags()
+    if args.validate_tags:
+        config = read_probot_config()
+        ciflow_tags = set(filter_ciflow_tags(pushtags))
+        config_tags = set(config["ciflow_push_tags"])
+        if config_tags != ciflow_tags:
+            print("Tags mismatch!")
+            if ciflow_tags.difference(config_tags):
+                print("Reference in workflows but not in config", ciflow_tags.difference(config_tags))
+            if config_tags.difference(ciflow_tags):
+                print("Reference in config, but not in workflows", config_tags.difference(ciflow_tags))
+            print(f"Please run {__file__} to remediate the difference")
+            sys.exit(-1)
+        print("All tags are listed in pytorch-probot.yml")
+    else:
+        update_probot_config(pushtags)
diff --git a/.github/scripts/comment_on_pr.py b/.github/scripts/comment_on_pr.py
index 06b2eefe0988..49b4c47d95b6 100644
--- a/.github/scripts/comment_on_pr.py
+++ b/.github/scripts/comment_on_pr.py
@@ -1,5 +1,5 @@
 from typing import Any
-from trymerge import gh_post_pr_comment
+from github_utils import gh_post_pr_comment
 from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
 from trymerge_explainer import BOT_COMMANDS_WIKI
 import os
diff --git a/.github/scripts/export_pytorch_labels.py b/.github/scripts/export_pytorch_labels.py
index 4e49514d7136..0a45c4f46d64 100755
--- a/.github/scripts/export_pytorch_labels.py
+++ b/.github/scripts/export_pytorch_labels.py
@@ -12,59 +12,26 @@
 
 import boto3  # type: ignore[import]
 import json
-from functools import lru_cache
-from typing import List, Any
-from urllib.request import urlopen, Request
 
-# Modified from https://github.com/pytorch/pytorch/blob/b00206d4737d1f1e7a442c9f8a1cadccd272a386/torch/hub.py#L129
-def _read_url(url: Any) -> Any:
-    with urlopen(url) as r:
-        return r.headers, r.read().decode(r.headers.get_content_charset('utf-8'))
+from label_utils import gh_get_labels
+from typing import Any
 
 
-def request_for_labels(url: str) -> Any:
-    headers = {'Accept': 'application/vnd.github.v3+json'}
-    return _read_url(Request(url, headers=headers))
+def parse_args() -> Any:
+    from argparse import ArgumentParser
+    parser = ArgumentParser("Export PR labels")
+    parser.add_argument("org", type=str)
+    parser.add_argument("repo", type=str)
 
+    return parser.parse_args()
 
-def get_last_page(header: Any) -> int:
-    # Link info looks like: <https://api.github.com/repositories/65600975/labels?per_page=100&page=2>;
-    # rel="next", <https://api.github.com/repositories/65600975/labels?per_page=100&page=3>; rel="last"
-    link_info = header['link']
-    prefix = "&page="
-    suffix = ">;"
-    return int(link_info[link_info.rindex(prefix) + len(prefix):link_info.rindex(suffix)])
 
-
-def update_labels(labels: List[str], info: str) -> None:
-    labels_json = json.loads(info)
-    labels.extend([x["name"] for x in labels_json])
-
-
-@lru_cache()
-def get_pytorch_labels() -> List[str]:
-    prefix = "https://api.github.com/repos/pytorch/pytorch/labels?per_page=100"
-    header, info = request_for_labels(prefix + "&page=1")
-    labels: List[str] = []
-    update_labels(labels, info)
-
-    last_page = get_last_page(header)
-    assert last_page > 0, "Error reading header info to determine total number of pages of labels"
-    for page_number in range(2, last_page + 1):  # skip page 1
-        _, info = request_for_labels(prefix + f"&page={page_number}")
-        update_labels(labels, info)
-
-    return labels
-
-
-def send_labels_to_S3(labels: List[str]) -> None:
+def main() -> None:
+    args = parse_args()
+    print(f"Exporting labels for {args.org}/{args.repo}")
     labels_file_name = "pytorch_labels.json"
     obj = boto3.resource('s3').Object('ossci-metrics', labels_file_name)
-    obj.put(Body=json.dumps(labels).encode())
-
-
-def main() -> None:
-    send_labels_to_S3(get_pytorch_labels())
+    obj.put(Body=json.dumps(gh_get_labels(args.org, args.repo)).encode())
 
 
 if __name__ == '__main__':
diff --git a/.github/scripts/fetch_latest_green_commit.py b/.github/scripts/fetch_latest_green_commit.py
index 447b76b2dd8b..36301c9fab56 100644
--- a/.github/scripts/fetch_latest_green_commit.py
+++ b/.github/scripts/fetch_latest_green_commit.py
@@ -1,5 +1,5 @@
 import sys
-from typing import Any, Dict, List, NamedTuple, Tuple
+from typing import Any, Dict, List, NamedTuple, Tuple, cast
 from gitutils import _check_output
 
 import rockset  # type: ignore[import]
@@ -39,12 +39,23 @@ def get_latest_commits() -> List[str]:
 
     return commits
 
-def query_commits(commits: List[str], qlambda: Any) -> Any:
-    params = rockset.ParamDict()
-    params['shas'] = ",".join(commits)
-    results = qlambda.execute(parameters=params)
+def query_commits(commits: List[str]) -> List[Dict[str, Any]]:
+    rs = rockset.RocksetClient(
+        host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
+    )
+    params = [{
+        "name": "shas",
+        "type": "string",
+        "value": ",".join(commits)
+    }]
+    res = rs.QueryLambdas.execute_query_lambda(
+        query_lambda='commit_jobs_batch_query',
+        version='8003fdfd18b64696',
+        workspace='commons',
+        parameters=params
+    )
 
-    return results
+    return cast(List[Dict[str, Any]], res.results)
 
 def print_commit_status(commit: str, results: Dict[str, Any]) -> None:
     print(commit)
@@ -52,9 +63,9 @@ def print_commit_status(commit: str, results: Dict[str, Any]) -> None:
         if check['sha'] == commit:
             print(f"\t{check['conclusion']:>10}: {check['name']}")
 
-def get_commit_results(commit: str, results: Dict[str, Any]) -> List[Dict[str, Any]]:
+def get_commit_results(commit: str, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     workflow_checks = []
-    for check in results['results']:
+    for check in results:
         if check['sha'] == commit:
             workflow_checks.append(WorkflowCheck(
                 workflowName=check['workflowName'],
@@ -64,7 +75,7 @@ def get_commit_results(commit: str, results: Dict[str, Any]) -> List[Dict[str, A
             )._asdict())
     return workflow_checks
 
-def isGreen(commit: str, results: Dict[str, Any]) -> Tuple[bool, str]:
+def isGreen(commit: str, results: List[Dict[str, Any]]) -> Tuple[bool, str]:
     workflow_checks = get_commit_results(commit, results)
 
     regex = {
@@ -91,7 +102,7 @@ def isGreen(commit: str, results: Dict[str, Any]) -> Tuple[bool, str]:
 
     return (True, "")
 
-def get_latest_green_commit(commits: List[str], results: Dict[str, Any]) -> Any:
+def get_latest_green_commit(commits: List[str], results: List[Dict[str, Any]]) -> Any:
     for commit in commits:
         eprint(f"Checking {commit}")
         is_green, msg = isGreen(commit, results)
@@ -103,16 +114,9 @@ def get_latest_green_commit(commits: List[str], results: Dict[str, Any]) -> Any:
     return None
 
 def main() -> None:
-    rs = rockset.Client(
-        api_server="api.rs2.usw2.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
-    )
-    qlambda = rs.QueryLambda.retrieve(
-        'commit_jobs_batch_query',
-        version='8003fdfd18b64696',
-        workspace='commons')
 
     commits = get_latest_commits()
-    results = query_commits(commits, qlambda)
+    results = query_commits(commits)
 
     latest_viable_commit = get_latest_green_commit(commits, results)
     print(latest_viable_commit)
diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py
index 3f5217592829..9d99c0eef7b8 100755
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@@ -1,40 +1,45 @@
 #!/usr/bin/env python3
 
-import sys
-import re
 import json
 import os
+import re
+import sys
+import warnings
+from typing import Any, Dict, List, Set
+from urllib.request import urlopen
+
 import requests
-from typing import Any, Dict, Set, List
 import yaml
-import warnings
 
 PREFIX = "test-config/"
 
 # Same as shard names
-VALID_TEST_CONFIG_LABELS = {f"{PREFIX}{label}" for label in {
-    "backwards_compat",
-    "crossref",
-    "default",
-    "deploy",
-    "distributed",
-    "docs_tests",
-    "dynamo",
-    "force_on_cpu",
-    "functorch",
-    "inductor",
-    "inductor_distributed",
-    "inductor_huggingface",
-    "inductor_timm",
-    "inductor_torchbench",
-    "jit_legacy",
-    "multigpu",
-    "nogpu_AVX512",
-    "nogpu_NO_AVX2",
-    "slow",
-    "tsan",
-    "xla",
-}}
+VALID_TEST_CONFIG_LABELS = {
+    f"{PREFIX}{label}"
+    for label in {
+        "backwards_compat",
+        "crossref",
+        "default",
+        "deploy",
+        "distributed",
+        "docs_tests",
+        "dynamo",
+        "force_on_cpu",
+        "functorch",
+        "inductor",
+        "inductor_distributed",
+        "inductor_huggingface",
+        "inductor_timm",
+        "inductor_torchbench",
+        "jit_legacy",
+        "multigpu",
+        "nogpu_AVX512",
+        "nogpu_NO_AVX2",
+        "slow",
+        "tsan",
+        "xla",
+    }
+}
 
 # Supported modes when running periodically
 SUPPORTED_PERIODICAL_MODES = {
@@ -42,15 +47,43 @@
     "rerun_disabled_tests",
 }
 
+# The link to the published list of disabled jobs
+DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json"
+# Some constants used to remove disabled jobs
+JOB_NAME_SEP = "/"
+BUILD_JOB_NAME = "build"
+TEST_JOB_NAME = "test"
+BUILD_AND_TEST_JOB_NAME = "build-and-test"
+JOB_NAME_CFG_REGEX = re.compile(r"(?P<job>[\w-]+)\s+\((?P<cfg>[\w-]+)\)")
+
 
 def parse_args() -> Any:
     from argparse import ArgumentParser
-    parser = ArgumentParser("Filter all test configurations and keep only requested ones")
-    parser.add_argument("--test-matrix", type=str, required=True, help="the original test matrix")
+
+    parser = ArgumentParser(
+        "Filter all test configurations and keep only requested ones"
+    )
+    parser.add_argument(
+        "--test-matrix", type=str, required=True, help="the original test matrix"
+    )
+    parser.add_argument(
+        "--workflow", type=str, help="the name of the current workflow, i.e. pull"
+    )
+    parser.add_argument(
+        "--job-name",
+        type=str,
+        help="the name of the current job, i.e. linux-focal-py3.8-gcc7 / build",
+    )
     parser.add_argument("--pr-number", type=str, help="the pull request number")
     parser.add_argument("--tag", type=str, help="the associated tag if it exists")
-    parser.add_argument("--event-name", type=str, help="name of the event that triggered the job (pull, schedule, etc)")
-    parser.add_argument("--schedule", type=str, help="cron schedule that triggered the job")
+    parser.add_argument(
+        "--event-name",
+        type=str,
+        help="name of the event that triggered the job (pull, schedule, etc)",
+    )
+    parser.add_argument(
+        "--schedule", type=str, help="cron schedule that triggered the job"
+    )
     return parser.parse_args()
 
 
@@ -74,7 +107,9 @@ def get_labels(pr_number: int) -> Set[str]:
     )
 
     if response.status_code != requests.codes.ok:
-        warnings.warn(f"Failed to get the labels for #{pr_number} (status code {response.status_code})")
+        warnings.warn(
+            f"Failed to get the labels for #{pr_number} (status code {response.status_code})"
+        )
         return set()
 
     return {label.get("name") for label in response.json() if label.get("name")}
@@ -93,9 +128,7 @@ def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, Lis
     If the PR has none of the test-config label, all tests are run as usual.
     """
 
-    filtered_test_matrix: Dict[str, List[Any]] = {
-        "include": []
-    }
+    filtered_test_matrix: Dict[str, List[Any]] = {"include": []}
 
     for entry in test_matrix.get("include", []):
         config_name = entry.get("config", "")
@@ -104,7 +137,9 @@ def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, Lis
 
         label = f"{PREFIX}{config_name.strip()}"
         if label in labels:
-            print(f"Select {config_name} because label {label} is presented in the pull request by the time the test starts")
+            print(
+                f"Select {config_name} because label {label} is presented in the pull request by the time the test starts"
+            )
             filtered_test_matrix["include"].append(entry)
 
     valid_test_config_labels = labels.intersection(VALID_TEST_CONFIG_LABELS)
@@ -136,6 +171,133 @@ def set_periodic_modes(test_matrix: Dict[str, List[Any]]) -> Dict[str, List[Any]
     return scheduled_test_matrix
 
 
+def remove_disabled_jobs(
+    workflow: str, job_name: str, test_matrix: Dict[str, List[Any]]
+) -> Dict[str, List[Any]]:
+    """
+    Check the list of disabled jobs, remove the current job and all its dependents
+    if it exists in the list. The list of disabled jobs is as follows:
+
+    {
+        "WORKFLOW / PLATFORM / JOB (CONFIG)": [
+            AUTHOR,
+            ISSUE_NUMBER,
+            ISSUE_URL,
+            WORKFLOW,
+            PLATFORM,
+            JOB (CONFIG),
+        ],
+        "pull / linux-bionic-py3.8-clang9 / test (dynamo)": [
+            "pytorchbot",
+            "94861",
+            "https://github.com/pytorch/pytorch/issues/94861",
+            "pull",
+            "linux-bionic-py3.8-clang9",
+            "test (dynamo)",
+        ],
+    }
+    """
+    try:
+        # The job name from github is in the PLATFORM / JOB (CONFIG) format, so breaking
+        # it into its two components first
+        current_platform, _ = [n.strip() for n in job_name.split(JOB_NAME_SEP, 1) if n]
+    except ValueError as error:
+        warnings.warn(f"Invalid job name {job_name}, returning")
+        return test_matrix
+
+    # The result will be stored here
+    filtered_test_matrix: Dict[str, List[Any]] = {"include": []}
+
+    for _, record in download_json(DISABLED_JOBS_URL).items():
+        (
+            author,
+            _,
+            disabled_url,
+            disabled_workflow,
+            disabled_platform,
+            disabled_job_cfg,
+        ) = record
+
+        if disabled_workflow != workflow or disabled_platform != current_platform:
+            # The current workflow or platform is not disabled by this record
+            continue
+
+        # The logic after this is fairly complicated:
+        #
+        # - If the disabled record doesn't have the optional job (config) name,
+        #   i.e. pull / linux-bionic-py3.8-clang9, all build and test jobs will
+        #   be skipped
+        #
+        # - If the disabled record has the job name and it's a build job, i.e.
+        #   pull / linux-bionic-py3.8-clang9 / build, all build and test jobs
+        #   will be skipped, because the latter requires the former
+        #
+        # - If the disabled record has the job name and it's a test job without
+        #   the config part, i.e. pull / linux-bionic-py3.8-clang9 / test, all
+        #   test jobs will be skipped. TODO: At the moment, the script uses the
+        #   short-circuiting logic to skip the build job automatically when there
+        #   is no test job assuming that it would be a waste of effort building
+        #   for nothing. This might not be the desirable behavior, and could be
+        #   fixed later if needed
+        #
+        # - If the disabled record has the job (config) name, only that test config
+        #   will be skipped, i.e. pull / linux-bionic-py3.8-clang9 / test (dynamo)
+        if not disabled_job_cfg:
+            print(
+                f"Issue {disabled_url} created by {author} has disabled all CI jobs for {workflow} / {job_name}"
+            )
+            return filtered_test_matrix
+
+        if disabled_job_cfg == BUILD_JOB_NAME:
+            print(
+                f"Issue {disabled_url} created by {author} has disabled the build job for {workflow} / {job_name}"
+            )
+            return filtered_test_matrix
+
+        if (
+            disabled_job_cfg == TEST_JOB_NAME
+            or disabled_job_cfg == BUILD_AND_TEST_JOB_NAME
+        ):
+            print(
+                f"Issue {disabled_url} created by {author} has disabled all the test jobs for {workflow} / {job_name}"
+            )
+            return filtered_test_matrix
+
+        m = JOB_NAME_CFG_REGEX.match(disabled_job_cfg)
+        if m:
+            disabled_job = m.group("job")
+            # Make sure that the job name is a valid test job name first before checking the config
+            if disabled_job == TEST_JOB_NAME or disabled_job == BUILD_AND_TEST_JOB_NAME:
+                disabled_cfg = m.group("cfg")
+                # Remove the disabled config from the test matrix
+                filtered_test_matrix["include"] = [
+                    r
+                    for r in test_matrix["include"]
+                    if r.get("config", "") != disabled_cfg
+                ]
+                return filtered_test_matrix
+
+        warnings.warn(
+            f"Found a matching disabled issue {disabled_url} for {workflow} / {job_name}, "
+            f"but the name {disabled_job_cfg} is invalid"
+        )
+
+    # Found no matching disabled issue, return the same input test matrix
+    return test_matrix
+
+
+def download_json(url: str, num_retries: int = 3) -> Any:
+    for _ in range(num_retries):
+        try:
+            content = urlopen(url, timeout=5).read().decode("utf-8")
+            return json.loads(content)
+        except Exception as e:
+            warnings.warn(f"Could not download {url}: {e}")
+
+    warnings.warn(f"All {num_retries} retries exhausted, downloading {url} failed")
+    return {}
+
+
 def set_output(name: str, val: Any) -> None:
     if os.getenv("GITHUB_OUTPUT"):
         with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
@@ -190,11 +352,18 @@ def main() -> None:
         # No PR number, no tag, we can just return the test matrix as it is
         filtered_test_matrix = test_matrix
 
-    if args.event_name == "schedule" and args.schedule == '29 8 * * *':
+    if args.event_name == "schedule" and args.schedule == "29 8 * * *":
         # we don't want to run the mem leack check or disabled tests on normal
         # periodically scheduled jobs, only the ones at this time
         filtered_test_matrix = set_periodic_modes(filtered_test_matrix)
 
+    if args.workflow and args.job_name:
+        # If both workflow and job name are available, we will check if the current job
+        # is disabled and remove it and all its dependants from the test matrix
+        filtered_test_matrix = remove_disabled_jobs(
+            args.workflow, args.job_name, filtered_test_matrix
+        )
+
     # Set the filtered test matrix as the output
     set_output("test-matrix", json.dumps(filtered_test_matrix))
 
diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py
index 04ae5c7cedb7..7a5b0a86104e 100644
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@@ -13,17 +13,21 @@
 from typing import Dict, List, Tuple, Optional
 
 
-CUDA_ARCHES = ["11.6", "11.7", "11.8"]
+CUDA_ARCHES = ["11.7", "11.8"]
 
 
-ROCM_ARCHES = ["5.2", "5.3"]
+ROCM_ARCHES = ["5.3", "5.4.2"]
 
 
+CPU_CXX11_ABI_ARCH = ['cpu-cxx11-abi']
+
 def arch_type(arch_version: str) -> str:
     if arch_version in CUDA_ARCHES:
         return "cuda"
     elif arch_version in ROCM_ARCHES:
         return "rocm"
+    elif arch_version in CPU_CXX11_ABI_ARCH:
+        return "cpu-cxx11-abi"
     else:  # arch_version should always be "cpu" in this case
         return "cpu"
 
@@ -38,6 +42,7 @@ def arch_type(arch_version: str) -> str:
         for gpu_arch in ROCM_ARCHES
     },
     "cpu": "pytorch/manylinux-builder:cpu",
+    "cpu-cxx11-abi": "pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi",
 }
 
 CONDA_CONTAINER_IMAGES = {
@@ -71,12 +76,13 @@ def arch_type(arch_version: str) -> str:
     ("cpu", CXX11_ABI): "pytorch/libtorch-cxx11-builder:cpu",
 }
 
-FULL_PYTHON_VERSIONS = ["3.7", "3.8", "3.9", "3.10"]
+FULL_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11"]
 
 
 def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
     return {
         "cpu": "cpu",
+        "cpu-cxx11-abi": "cpu-cxx11-abi",
         "cuda": f"cu{gpu_arch_version.replace('.', '')}",
         "rocm": f"rocm{gpu_arch_version}",
     }.get(gpu_arch_type, gpu_arch_version)
@@ -92,8 +98,6 @@ def generate_conda_matrix(os: str) -> List[Dict[str, str]]:
     python_versions = FULL_PYTHON_VERSIONS
     if os == "linux" or os == "windows":
         arches += CUDA_ARCHES
-    elif os == "macos-arm64":
-        python_versions = list_without(python_versions, ["3.7"])
     for python_version in python_versions:
         # We don't currently build conda packages for rocm
         for arch_version in arches:
@@ -178,21 +182,13 @@ def generate_wheels_matrix(os: str,
         package_type = "manywheel"
 
     if python_versions is None:
-        # Define default python version
-        python_versions = list(FULL_PYTHON_VERSIONS)
-        if os == "macos-arm64":
-            python_versions = list_without(python_versions, ["3.7"])
-
-        if os == "linux":
-            # NOTE: We only build 3.11 wheel on linux as 3.11 is not
-            # available on conda right now
-            python_versions.append("3.11")
+        python_versions = FULL_PYTHON_VERSIONS
 
     if arches is None:
         # Define default compute archivectures
         arches = ["cpu"]
         if os == "linux":
-            arches += CUDA_ARCHES + ROCM_ARCHES
+            arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES
         elif os == "windows":
             arches += CUDA_ARCHES
 
@@ -200,7 +196,7 @@ def generate_wheels_matrix(os: str,
     for python_version in python_versions:
         for arch_version in arches:
             gpu_arch_type = arch_type(arch_version)
-            gpu_arch_version = "" if arch_version == "cpu" else arch_version
+            gpu_arch_version = "" if arch_version == "cpu" or arch_version == "cpu-cxx11-abi" else arch_version
             # Skip rocm 3.11 binaries for now as the docker image are not correct
             if python_version == "3.11" and gpu_arch_type == "rocm":
                 continue
@@ -216,20 +212,21 @@ def generate_wheels_matrix(os: str,
                         "desired_cuda": translate_desired_cuda(
                             gpu_arch_type, gpu_arch_version
                         ),
+                        "devtoolset": "",
                         "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                         "package_type": package_type,
                         "pytorch_extra_install_requirements":
-                        "nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' | "
-                        "nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' | "
-                        "nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' | "
-                        "nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' | "
-                        "nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' | "
-                        "nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' | "
-                        "nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' | "
-                        "nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' | "
-                        "nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' | "
-                        "nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' | "
-                        "nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux'",
+                        "nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                        "nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                        "nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                        "nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                        "nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                        "nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                        "nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                        "nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                        "nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                        "nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+                        "nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'",
                         "build_name":
                         f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-with-pypi-cudnn"
                         .replace(
@@ -246,6 +243,7 @@ def generate_wheels_matrix(os: str,
                     "desired_cuda": translate_desired_cuda(
                         gpu_arch_type, gpu_arch_version
                     ),
+                    "devtoolset": "cxx11-abi" if arch_version == "cpu-cxx11-abi" else "",
                     "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                     "package_type": package_type,
                     "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index 35680e30ee6a..221e4e1fe4c0 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -2,13 +2,13 @@
 
 from dataclasses import asdict, dataclass, field
 from pathlib import Path
-from typing import Dict, Set, List, Iterable
+from typing import Dict, Set, List, Literal, Iterable
 
 import jinja2
 
 import os
 import sys
-from typing_extensions import Literal, TypedDict
+from typing_extensions import TypedDict  # Python 3.11+
 
 import generate_binary_build_matrix  # type: ignore[import]
 
@@ -134,8 +134,17 @@ class OperatingSystem:
         package_type="manywheel",
         build_configs=generate_binary_build_matrix.generate_wheels_matrix(
             OperatingSystem.LINUX,
-            arches=["11.6"],
-            python_versions=["3.7"]),
+            arches=["11.8"],
+            python_versions=["3.8"]),
+        branches="master",
+    ),
+    BinaryBuildWorkflow(
+        os=OperatingSystem.LINUX,
+        package_type="manywheel",
+        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+            OperatingSystem.LINUX,
+            arches=["11.7"],
+            python_versions=["3.8"]),
         branches="master",
     ),
     BinaryBuildWorkflow(
diff --git a/.github/scripts/get_workflow_job_id.py b/.github/scripts/get_workflow_job_id.py
index 7eb3dbf9390d..9f41321f50ef 100644
--- a/.github/scripts/get_workflow_job_id.py
+++ b/.github/scripts/get_workflow_job_id.py
@@ -7,6 +7,7 @@
 import os
 import re
 import sys
+import time
 import urllib
 import urllib.parse
 
@@ -34,13 +35,18 @@ def parse_json_and_links(conn: Any) -> Tuple[Any, Dict[str, Dict[str, str]]]:
 
 def fetch_url(url: str, *,
               headers: Optional[Dict[str, str]] = None,
-              reader: Callable[[Any], Any] = lambda x: x.read()) -> Any:
+              reader: Callable[[Any], Any] = lambda x: x.read(),
+              retries: Optional[int] = 3,
+              backoff_timeout: float = .5) -> Any:
     if headers is None:
         headers = {}
     try:
         with urlopen(Request(url, headers=headers)) as conn:
             return reader(conn)
     except urllib.error.HTTPError as err:
+        if isinstance(retries, (int, float)) and retries > 0:
+            time.sleep(backoff_timeout)
+            return fetch_url(url, headers=headers, reader=reader, retries=retries - 1, backoff_timeout=backoff_timeout)
         exception_message = (
             "Is github alright?",
             f"Recieved status code '{err.code}' when attempting to retrieve {url}:\n",
diff --git a/.github/scripts/github_utils.py b/.github/scripts/github_utils.py
new file mode 100644
index 000000000000..354cfa12af10
--- /dev/null
+++ b/.github/scripts/github_utils.py
@@ -0,0 +1,103 @@
+"""GitHub Utilities"""
+
+import json
+import os
+
+from dataclasses import dataclass
+from typing import Any, Callable, cast, Dict, List, Optional
+from urllib.error import HTTPError
+from urllib.parse import quote
+from urllib.request import Request, urlopen
+
+
+@dataclass
+class GitHubComment:
+    body_text: str
+    created_at: str
+    author_login: str
+    author_association: str
+    editor_login: Optional[str]
+    database_id: int
+
+
+def gh_fetch_url(
+    url: str, *,
+    headers: Optional[Dict[str, str]] = None,
+    data: Optional[Dict[str, Any]] = None,
+    method: Optional[str] = None,
+    reader: Callable[[Any], Any] = lambda x: x.read()
+) -> Any:
+    if headers is None:
+        headers = {}
+    token = os.environ.get("GITHUB_TOKEN")
+    if token is not None and url.startswith('https://api.github.com/'):
+        headers['Authorization'] = f'token {token}'
+    data_ = json.dumps(data).encode() if data is not None else None
+    try:
+        with urlopen(Request(url, headers=headers, data=data_, method=method)) as conn:
+            return reader(conn)
+    except HTTPError as err:
+        if err.code == 403 and all(key in err.headers for key in ['X-RateLimit-Limit', 'X-RateLimit-Used']):
+            print(f"""Rate limit exceeded:
+                Used: {err.headers['X-RateLimit-Used']}
+                Limit: {err.headers['X-RateLimit-Limit']}
+                Remaining: {err.headers['X-RateLimit-Remaining']}
+                Resets at: {err.headers['x-RateLimit-Reset']}""")
+        raise
+
+
+def gh_fetch_json(
+    url: str,
+    params: Optional[Dict[str, Any]] = None,
+    data: Optional[Dict[str, Any]] = None
+) -> List[Dict[str, Any]]:
+    headers = {'Accept': 'application/vnd.github.v3+json'}
+    if params is not None and len(params) > 0:
+        url += '?' + '&'.join(f"{name}={quote(str(val))}" for name, val in params.items())
+    return cast(List[Dict[str, Any]], gh_fetch_url(url, headers=headers, data=data, reader=json.load))
+
+def _gh_fetch_json_any(
+    url: str,
+    params: Optional[Dict[str, Any]] = None,
+    data: Optional[Dict[str, Any]] = None
+) -> Any:
+    headers = {'Accept': 'application/vnd.github.v3+json'}
+    if params is not None and len(params) > 0:
+        url += '?' + '&'.join(f"{name}={quote(str(val))}" for name, val in params.items())
+    return gh_fetch_url(url, headers=headers, data=data, reader=json.load)
+
+
+def gh_fetch_json_list(
+    url: str,
+    params: Optional[Dict[str, Any]] = None,
+    data: Optional[Dict[str, Any]] = None
+) -> List[Dict[str, Any]]:
+    return cast(List[Dict[str, Any]], _gh_fetch_json_any(url, params, data))
+
+
+def gh_fetch_json_dict(
+    url: str,
+    params: Optional[Dict[str, Any]] = None,
+    data: Optional[Dict[str, Any]] = None
+) -> Dict[str, Any] :
+    return cast(Dict[str, Any], _gh_fetch_json_any(url, params, data))
+
+
+def _gh_post_comment(url: str, comment: str, dry_run: bool = False) -> List[Dict[str, Any]]:
+    if dry_run:
+        print(comment)
+        return []
+    return gh_fetch_json_list(url, data={"body": comment})
+
+
+def gh_post_pr_comment(org: str, repo: str, pr_num: int, comment: str, dry_run: bool = False) -> List[Dict[str, Any]]:
+    return _gh_post_comment(f'https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/comments', comment, dry_run)
+
+
+def gh_post_commit_comment(org: str, repo: str, sha: str, comment: str, dry_run: bool = False) -> List[Dict[str, Any]]:
+    return _gh_post_comment(f'https://api.github.com/repos/{org}/{repo}/commits/{sha}/comments', comment, dry_run)
+
+
+def gh_delete_comment(org: str, repo: str, comment_id: int) -> None:
+    url = f"https://api.github.com/repos/{org}/{repo}/issues/comments/{comment_id}"
+    gh_fetch_url(url, method="DELETE")
diff --git a/.github/scripts/gitutils.py b/.github/scripts/gitutils.py
index aa64fe15387e..f97c2f6c4403 100644
--- a/.github/scripts/gitutils.py
+++ b/.github/scripts/gitutils.py
@@ -273,6 +273,11 @@ def commit_message(self, ref: str) -> str:
     def amend_commit_message(self, msg: str) -> None:
         self._run_git("commit", "--amend", "-m", msg)
 
+    def diff(self, from_ref: str, to_ref: Optional[str] = None) -> str:
+        if to_ref is None:
+            return self._run_git("diff", f"{from_ref}^!")
+        return self._run_git("diff", f"{from_ref}..{to_ref}")
+
 
 def clone_repo(username: str, password: str, org: str, project: str) -> GitRepo:
     path = tempfile.mkdtemp()
@@ -331,3 +336,18 @@ def patterns_to_regex(allowed_patterns: List[str]) -> Any:
                 rc += c
     rc += ")"
     return re.compile(rc)
+
+def _shasum(value: str) -> str:
+    import hashlib
+    m = hashlib.sha256()
+    m.update(value.encode("utf-8"))
+    return m.hexdigest()
+
+
+def are_ghstack_branches_in_sync(repo: GitRepo, head_ref: str) -> bool:
+    """ Checks that diff between base and head is the same as diff between orig and its parent """
+    orig_ref = re.sub(r'/head$', '/orig', head_ref)
+    base_ref = re.sub(r'/head$', '/base', head_ref)
+    orig_diff_sha = _shasum(repo.diff(f"{repo.remote}/{orig_ref}"))
+    head_diff_sha = _shasum(repo.diff(f"{repo.remote}/{base_ref}", f"{repo.remote}/{head_ref}"))
+    return orig_diff_sha == head_diff_sha
diff --git a/.github/scripts/gql_mocks.json b/.github/scripts/gql_mocks.json
index 9dcbfe6b6e19..101ca39c5c6a 100644
--- a/.github/scripts/gql_mocks.json
+++ b/.github/scripts/gql_mocks.json
@@ -1,20 +1,20 @@
 {
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=71759 owner=pytorch": {
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=92863 owner=pytorch": {
     "data": {
       "repository": {
         "pullRequest": {
           "closed": true,
-          "isCrossRepository": true,
+          "isCrossRepository": false,
           "author": {
-            "login": "coolteemf"
+            "login": "soulitzer"
           },
-          "title": "Optimize grid sample 3d",
-          "body": "Fixes #71415\r\nI have implemented the changes that replicate what @to-mi did in this [PR](https://github.com/pytorch/pytorch/pull/65986#issue-1012959443) for the 3D case :\r\n\r\n> Fixes #64977\r\n> \r\n> Avoids creating a tensor for and calculating `input` gradient if it's not needed in the backward pass of `grid_sample` (2d case, native CPU & CUDA kernels). Especially the tensor creation seemed time consuming (see #64977).\r\n> \r\n> Brief description of the changes:\r\n> \r\n>     * I have tried to go with rather minimal changes. It would probably be possible to make a more elegant version with a bit larger refactoring (or possibly with better understanding of PyTorch internals and C++ functionalities).\r\n> \r\n>     * Changed the `native_functions.yaml` and `derivatives.yaml` so that the gradient input mask is passed to the functions.\r\n> \r\n>     * Changed the CPU kernels:\r\n>       (1) added `bool input_requires_grad` template parameter to the `backward` function,\r\n>       (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n>       (3) feed in `TensorAccessor<scalar_t, 3>* gInp_slice_ptr` instead of `TensorAccessor<scalar_t, 3>& gInp_slice` so that I can pass a `nullptr` in case gradient for `input` is not requested. (A bit inelegant perhaps, but allows to keep one signature for `backward` function and not require breaking it to smaller pieces. Perhaps there's a more elegant way to achieve this?)\r\n> \r\n>     * Changed CUDA kernel:\r\n>       (1) added ~`bool input_requires_grad` template parameter~ `const bool input_requires_grad` argument to the `backward` function,\r\n>       (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n>       (3) feed in `TensorInfo<scalar_t, index_t>()` instead of `getTensorInfo<scalar_t, index_t>(grad_input)` in case gradient for `input` is not requested.\r\n> \r\n>     * Modified tests in `test/test_nn.py` so that they run also cases with no `input` gradient needed.\r\n> \r\n>     * Have not touched the CPU fallback kernel.\r\n\r\nNote: the changes number (3) are N/A in this case.\r\n\r\n",
-          "headRefName": "optimize_grid_sample_3d",
+          "title": "Revert #92688 and #92348 (aot autograd explicitly errors on double backward)",
+          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):\r\n* #92604\r\n* #92734\r\n* __->__ #92863\r\n\r\n\r\ncc @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx @desertfire",
+          "headRefName": "gh/soulitzer/173/head",
           "headRepository": {
-            "nameWithOwner": "coolteemf/pytorch"
+            "nameWithOwner": "pytorch/pytorch"
           },
-          "baseRefName": "master",
+          "baseRefName": "gh/soulitzer/173/base",
           "baseRepository": {
             "nameWithOwner": "pytorch/pytorch",
             "isPrivate": false,
@@ -25,174 +25,24 @@
           "mergeCommit": null,
           "commits_with_authors": {
             "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "e0b0d1e695aeddceaf265da602c4704592053e9e"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "563ec73747ad53b63b36736c47c4342f962c2a09"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "51abe41a132d9dd5b1c0551bdca902aacc028ff8"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "be9898205992034a00e8ace8a55c2ecdcee2c2f8"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "2929c60b64384c2deae0f7dea8bab94ad4bc9ec8"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "9241b737e7e2b257905cc74ad9c50b737d7f9d0a"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "64d6b795d0636928a8aa2fd3da01302fb5f5f7af"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "4503577e53760a0006f1e80ca6bfe04d2be90470"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "b16f4b11ffbbbf2ca2098f9702af4ef6b6fc5e1f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "7ffc23368a604afdc92d2818747f730ce31a2bb5"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "b85292604b9ad6c31706b76b5a5498c4f6d94309"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "9d81d7bae8ad91aaa24b3ceab83e3138894dbc69"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "e79f6a2202512b294c55bf4bfb2e0524fafd4c48"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "f683e8aec7aea76097a264eec01511e704c31154"
-                }
-              },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "coolteemf"
+                      "login": "soulitzer"
                     },
-                    "email": "67541941+coolteemf@users.noreply.github.com",
-                    "name": "Fran\u00e7ois Lecomte"
-                  },
-                  "oid": "b932e9e286c22aaf352375186df851ef060b295a"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
+                    "email": "soulitzer@gmail.com",
+                    "name": "soulitzer"
                   },
-                  "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22"
+                  "oid": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6"
                 }
               }
             ],
             "pageInfo": {
-              "endCursor": "MTY",
+              "endCursor": "MQ",
               "hasNextPage": false
             },
-            "totalCount": 16
+            "totalCount": 1
           },
           "commits": {
             "nodes": [
@@ -200,30 +50,6 @@
                 "commit": {
                   "checkSuites": {
                     "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGYqY=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_T6g="
-                      },
                       {
                         "node": {
                           "app": {
@@ -232,36 +58,26 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-xenial-py3.7-clang7-onnx"
+                              "name": "Labeler"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754066"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3991169362"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663109808"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663214802"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "name": "triage",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663214856"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169362/jobs/6845670588"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIob0=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAoXWnxQ=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ubk="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQie2A="
                       },
                       {
                         "node": {
@@ -271,26 +87,26 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-build"
+                              "name": "Auto Request Review"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754064"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3991169390"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
+                                "name": "Auto Request Review",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754064/jobs/2663109676"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169390/jobs/6845670628"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1E=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAoXWn0c=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ubw="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQie7c="
                       },
                       {
                         "node": {
@@ -300,41 +116,66 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-bionic-rocm4.5-py3.7"
+                              "name": "Lint"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754065"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3991169394"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
+                                "name": "Test tools",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663109684"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845670645"
                               },
                               {
-                                "name": "test (default, 2, 2, linux.rocm.gpu)",
+                                "name": "toc",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401083"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845670735"
                               },
                               {
-                                "name": "test (default, 1, 2, linux.rocm.gpu)",
+                                "name": "Test collect_env (with_torch)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401143"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845670831"
                               },
                               {
-                                "name": "test (distributed, 1, 1, linux.rocm.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401186"
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845670917"
+                              },
+                              {
+                                "name": "Test collect_env (older_python_version)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845671001"
+                              },
+                              {
+                                "name": "lintrunner",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845671075"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845671156"
+                              },
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845671269"
+                              },
+                              {
+                                "name": "pr-sanity-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845671367"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwMsZY=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAoXWo1M=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "FAILURE"
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ub0="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQie7s="
                       },
                       {
                         "node": {
@@ -344,41 +185,55 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "win-vs2019-cuda11.3-py3"
+                              "name": "Check Labels"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754068"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3991169391"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663109680"
-                              },
-                              {
-                                "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995756"
-                              },
-                              {
-                                "name": "test (force_on_cpu, 1, 1, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995819"
-                              },
+                                "name": "Check labels",
+                                "conclusion": "CANCELLED",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169391/jobs/6845670642"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAoXWn1k=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQie74="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3991169396"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995900"
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169396/jobs/6845670670"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwZbzg=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAoXWn34=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "FAILURE"
+                          "conclusion": "SKIPPED"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ub8="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQie78="
                       },
                       {
                         "node": {
@@ -388,296 +243,382 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "Lint"
+                              "name": "pull"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754069"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3991169410"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "mypy",
+                                "name": "linux-bionic-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109683"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845670888"
                               },
                               {
-                                "name": "shellcheck",
+                                "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109827"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845670982"
                               },
                               {
-                                "name": "py2-setup-validate-errormsg",
+                                "name": "win-vs2019-cuda11.6-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109962"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671067"
                               },
                               {
-                                "name": "clang-format",
+                                "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110044"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671153"
                               },
                               {
-                                "name": "cmakelint",
+                                "name": "linux-focal-py3.7-clang7-asan / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110132"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671251"
                               },
                               {
-                                "name": "toc",
+                                "name": "linux-focal-py3-clang7-mobile-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110233"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671341"
                               },
                               {
-                                "name": "quick-checks",
+                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110320"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671421"
                               },
                               {
-                                "name": "clang-tidy",
+                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110461"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671504"
                               },
                               {
-                                "name": "flake8-py3",
+                                "name": "linux-focal-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110575"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGbAQ=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcA="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-clang7-asan"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754070"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671612"
+                              },
                               {
-                                "name": "build",
+                                "name": "win-vs2019-cpu-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663109804"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671699"
                               },
                               {
-                                "name": "test (default, 3, 3, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233675"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671779"
                               },
                               {
-                                "name": "test (default, 1, 3, linux.2xlarge)",
+                                "name": "linux-focal-py3-clang7-mobile-custom-build-static / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233731"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671874"
                               },
                               {
-                                "name": "test (default, 2, 3, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7-no-ops / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233805"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwJC4U=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcE="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754076"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671946"
+                              },
                               {
-                                "name": "build-and-test",
+                                "name": "linux-focal-py3.7-clang10-onnx / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754076/jobs/2663109810"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ_w=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcY="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc5.4"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754078"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845672034"
+                              },
                               {
-                                "name": "build",
+                                "name": "linux-focal-rocm5.3-py3.8 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663109777"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845672136"
                               },
                               {
-                                "name": "test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7-pch / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201383"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845672239"
                               },
                               {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7-bazel-test / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201458"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845672322"
                               },
                               {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201512"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845672419"
                               },
                               {
-                                "name": "test (distributed, 1, 1, linux.2xlarge)",
+                                "name": "linux-bionic-py3_7-clang8-xla / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201580"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845672509"
                               },
                               {
-                                "name": "test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201672"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845803829"
                               },
                               {
-                                "name": "test (docs_test, 1, 1, linux.2xlarge)",
+                                "name": "linux-docs / build-docs-cpp-false",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201839"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIWu4=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uco="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754079"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845803990"
+                              },
                               {
-                                "name": "build-and-test",
+                                "name": "linux-docs / build-docs-python-false",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754079/jobs/2663109681"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1k=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uc0="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  },
-                  "status": {
-                    "contexts": [
-                      {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017798?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017799?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017816?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845804069"
+                              },
+                              {
+                                "name": "linux-docs / build-docs-functorch-false",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845804156"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845804734"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845808552"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845808668"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)",
+                                "conclusion": "CANCELLED",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845808750"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845808838"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845808933"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809050"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809146"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809280"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809596"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809712"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809828"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809924"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845810034"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845810121"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845810227"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845810589"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845812809"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845814609"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845817702"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845817778"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845849131"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845854824"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845854914"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845855028"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845855123"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845855197"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAoXadxU=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQie-c="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Meta Internal-Only Changes Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://opensource.facebook.com/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAoXWn4Y=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQifN4="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Netlify",
+                            "databaseId": 13473
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQifQA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Azure Pipelines",
+                            "databaseId": 9426
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQifQk="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Dependabot",
+                            "databaseId": 29110
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQifRo="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  },
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "EasyCLA",
                         "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017800?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "targetUrl": "https://easycla.lfx.linuxfoundation.org/#/?version=2"
                       }
                     ]
                   },
-                  "pushedDate": "2022-02-23T10:39:30Z",
-                  "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22"
+                  "pushedDate": "2023-01-23T22:36:13Z",
+                  "oid": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6"
                 }
               }
             ]
           },
-          "changedFiles": 9,
+          "changedFiles": 2,
           "files": {
             "nodes": [
               {
-                "path": "aten/src/ATen/native/GridSampler.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/cpu/GridSamplerKernel.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/GridSampler.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/GridSampler.cu"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/GridSampler.h"
-              },
-              {
-                "path": "aten/src/ATen/native/native_functions.yaml"
-              },
-              {
-                "path": "test/forward_backward_compatibility/check_forward_backward_compatibility.py"
+                "path": "test/dynamo/test_aot_autograd.py"
               },
               {
-                "path": "test/test_nn.py"
-              },
-              {
-                "path": "tools/autograd/derivatives.yaml"
+                "path": "torch/_functorch/aot_autograd.py"
               }
             ],
             "pageInfo": {
-              "endCursor": "OQ",
+              "endCursor": "Mg",
               "hasNextPage": false
             }
           },
@@ -685,296 +626,113 @@
             "nodes": [
               {
                 "author": {
-                  "login": "albanD"
+                  "login": "eellison"
                 },
-                "state": "COMMENTED"
-              },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMy0wMS0yM1QxNjo0MDo0NS0wODowMLkyMDIzLTAxLTIzVDE2OjQwOjQ1LTA4OjAwzkt_hPI=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
               {
+                "bodyText": "\ud83d\udd17 Helpful Links\n\ud83e\uddea See artifacts and rendered test results at hud.pytorch.org/pr/92863\n\n\ud83d\udcc4 Preview Python docs built from this PR\n\ud83d\udcc4 Preview C++ docs built from this PR\n\u2753 Need help or want to give feedback on the CI? Visit the bot commands wiki or our office hours\n\nNote: Links to docs will display an error until the docs builds have been completed.\n\u274c 2 Failures\nAs of commit 030a6d3:\nNEW FAILURES - The following jobs have failed:\n\nlinux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)\n\n\nBROKEN TRUNK - The following jobs failed but were present on the merge base 8972a9f:\n\nlinux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.4xlarge)\n\n\nThis comment was automatically generated by Dr. CI and updates every 15 minutes.",
+                "createdAt": "2023-01-23T22:36:11Z",
                 "author": {
-                  "login": "coolteemf"
+                  "login": "pytorch-bot"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "pytorch-bot"
+                },
+                "databaseId": 1401102837
               },
               {
+                "bodyText": "@pytorchbot merge -f \"Unrelated failure\"",
+                "createdAt": "2023-01-24T02:59:49Z",
                 "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "coolteemf"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "coolteemf"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "coolteemf"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "coolteemf"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "coolteemf"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "coolteemf"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "APPROVED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "APPROVED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMS0yNVQwODoyODoxMC0wODowMLkyMDIyLTAxLTI1VDA3OjU0OjA1LTA4OjAwzjNooqI=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "Merge failed due to 'NoneType' object is not subscriptable\nRaised by https://github.com/pytorch/pytorch/actions/runs/1887945630",
-                "createdAt": "2022-02-23T14:55:36Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1048868910
-              },
-              {
-                "bodyText": "Thanks for the update! The windows failure is not your fault, you can ignore it!\n\nThank you very much for all of your feedback and sorry for the delay !",
-                "createdAt": "2022-02-23T16:44:36Z",
-                "author": {
-                  "login": "coolteemf"
-                },
-                "authorAssociation": "CONTRIBUTOR",
-                "editor": null,
-                "databaseId": 1048983572
-              },
-              {
-                "bodyText": "@coolteemf can you please send either me or @albanD an email? (or I can send you and invite to collab on private repo)",
-                "createdAt": "2022-02-23T17:49:55Z",
-                "author": {
-                  "login": "malfet"
+                  "login": "soulitzer"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1049048119
+                "databaseId": 1401333258
               },
               {
-                "bodyText": "@pytorchbot merge this please",
-                "createdAt": "2022-02-23T19:23:55Z",
+                "bodyText": "Merge started\nYour change will be merged immediately since you used the force (-f) flag, bypassing any CI checks (ETA: 1-5 minutes).\nLearn more about merging in the wiki.\nQuestions? Feedback? Please reach out to the PyTorch DevX TeamAdvanced Debugging\nCheck the merge workflow status\nhere",
+                "createdAt": "2023-01-24T03:04:02Z",
                 "author": {
-                  "login": "albanD"
+                  "login": "pytorchmergebot"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1049131992
-              },
-              {
-                "bodyText": "Hey @coolteemf.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-02-23T19:26:51Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1049134520
+                "databaseId": 1401335638
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOPoR4Lg==",
-              "hasPreviousPage": true
+              "startCursor": "Y3Vyc29yOnYyOpHOU4Mh9Q==",
+              "hasPreviousPage": false
             }
           },
           "labels": {
             "edges": [
               {
                 "node": {
-                  "name": "triaged"
-                }
-              },
-              {
-                "node": {
-                  "name": "open source"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
+                  "name": "Merged"
                 }
               },
               {
                 "node": {
-                  "name": "release notes: nn"
+                  "name": "module: dynamo"
                 }
               },
               {
                 "node": {
-                  "name": "topic: performance"
+                  "name": "release notes: AO frontend"
                 }
               }
             ]
-          },
-          "headRef": null
+          }
         }
       }
     }
   },
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=74649 owner=pytorch": {
+  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAnQifRo= name=pytorch number=92863 owner=pytorch": {
     "data": {
       "repository": {
         "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "malfet"
-          },
-          "title": "This should fail flake8",
-          "body": "Test issue for GHF mandatory checks",
-          "headRefName": "malfet-patch-8",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "57c86ff1c5ab948888fd329986c9d55796680e33"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "Mg",
-              "hasNextPage": false
-            },
-            "totalCount": 2
-          },
           "commits": {
             "nodes": [
               {
                 "commit": {
+                  "oid": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
                   "checkSuites": {
                     "edges": [
                       {
                         "node": {
                           "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
+                            "name": "Codecov",
+                            "databaseId": 254
                           },
                           "workflowRun": null,
                           "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              }
-                            ],
+                            "nodes": [],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsK3w=",
+                              "endCursor": null,
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "SUCCESS"
+                          "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1E="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQifS0="
                       },
                       {
                         "node": {
                           "app": {
-                            "name": "Netlify",
-                            "databaseId": 13473
+                            "name": "PyTorch Bot",
+                            "databaseId": 40112
                           },
                           "workflowRun": null,
                           "checkRuns": {
@@ -986,13 +744,13 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1M="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQifVE="
                       },
                       {
                         "node": {
                           "app": {
-                            "name": "Azure Pipelines",
-                            "databaseId": 9426
+                            "name": "CircleCI Checks",
+                            "databaseId": 18001
                           },
                           "workflowRun": null,
                           "checkRuns": {
@@ -1004,62 +762,270 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1Q="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQifYQ="
                       },
                       {
                         "node": {
                           "app": {
-                            "name": "Dependabot",
-                            "databaseId": 29110
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Check Labels"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3991169600"
                           },
-                          "workflowRun": null,
                           "checkRuns": {
-                            "nodes": [],
+                            "nodes": [
+                              {
+                                "name": "Check labels",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169600/jobs/6845671155"
+                              }
+                            ],
                             "pageInfo": {
-                              "endCursor": null,
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAoXWoiQ=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": null
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1Y="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnQifgA="
                       },
                       {
                         "node": {
                           "app": {
-                            "name": "Codecov",
-                            "databaseId": 254
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Check Labels"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3992628517"
                           },
-                          "workflowRun": null,
                           "checkRuns": {
-                            "nodes": [],
+                            "nodes": [
+                              {
+                                "name": "Check labels",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3992628517/jobs/6848645507"
+                              }
+                            ],
                             "pageInfo": {
-                              "endCursor": null,
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAoYR8No=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": null
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1s="
-                      },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAnRVjj8="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  }
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAoXadxU= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAnQie78= name=pytorch number=92863 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "oid": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+                  "checkSuites": {
+                    "nodes": [
                       {
-                        "node": {
-                          "app": {
-                            "name": "PyTorch Bot",
-                            "databaseId": 40112
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
+                        "checkRuns": {
+                          "nodes": [
+                            {
+                              "name": "linux-focal-py3.7-clang7-asan / test (functorch, 1, 1, linux.2xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845855276"
+                            },
+                            {
+                              "name": "linux-bionic-py3_7-clang8-xla / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845868475"
+                            },
+                            {
+                              "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.4xlarge)",
+                              "conclusion": "FAILURE",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845872827"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845946929"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845950678"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845950759"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845950836"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845950938"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845951052"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 2, 3, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845951169"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 3, 3, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845951282"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845951414"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845951561"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6846274479"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6846294540"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6846294653"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (functorch, 1, 1, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6846294751"
                             }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj14="
-                      },
+                          ],
+                          "pageInfo": {
+                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAoXjZPc=",
+                            "hasNextPage": false
+                          }
+                        }
+                      }
+                    ]
+                  }
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=82169 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "ezyang"
+          },
+          "title": "Move test_dtypes so it runs later",
+          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):\n* __->__ #82169\n\nThe error messages it gives are very unhelpful (because a failure\ngets translated into \"dtype was not supported\" rather than the\nactual backtrace), so I'd rather get error messages about this after\nI've tested basic functionality.\n\nSigned-off-by: Edward Z. Yang <ezyang@fb.com>",
+          "headRefName": "gh/ezyang/1279/head",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "gh/ezyang/1279/base",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "cef34da55a59da5a32494bff218ccd4978b659d3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "83ad7e73a07111ac1d85e931d14360cc22c01edd"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "28140e4008289251b695385acfb48ac7a47cd49c"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mw",
+              "hasNextPage": false
+            },
+            "totalCount": 3
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
                       {
                         "node": {
                           "app": {
@@ -1070,79 +1036,59 @@
                             "workflow": {
                               "name": "Lint"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2031576283"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747823981"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "clang-format",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925132"
-                              },
-                              {
-                                "name": "clang-tidy",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925189"
-                              },
-                              {
-                                "name": "cmakelint",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925230"
-                              },
-                              {
-                                "name": "flake8-py3",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925307"
-                              },
-                              {
-                                "name": "mypy",
+                                "name": "lintrunner",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925365"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310707890"
                               },
                               {
                                 "name": "Test collect_env (with_torch)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925427"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708140"
                               },
                               {
                                 "name": "Test collect_env (without_torch)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925449"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708223"
                               },
                               {
-                                "name": "Test tools",
+                                "name": "Test collect_env (older_python_version)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925537"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708332"
                               },
                               {
-                                "name": "py2-setup-validate-errormsg",
+                                "name": "quick-checks",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925644"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708496"
                               },
                               {
-                                "name": "quick-checks",
+                                "name": "toc",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925688"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708710"
                               },
                               {
-                                "name": "toc",
+                                "name": "Test tools",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925809"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708937"
                               },
                               {
-                                "name": "shellcheck",
+                                "name": "workflow-checks",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925945"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310709169"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsMiY=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGj1lc=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "FAILURE"
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkFA="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc8k="
                       },
                       {
                         "node": {
@@ -1154,24 +1100,93 @@
                             "workflow": {
                               "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2031576288"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747823979"
                           },
                           "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576288/jobs/2928925134"
-                              }
-                            ],
+                            "nodes": [],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsLW0=",
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc8s="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747823982"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823982/jobs/4310707884"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGjz0w=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SKIPPED"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkFs="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc9A="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747823980"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc9Q="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747824002"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdAs="
                       },
                       {
                         "node": {
@@ -1183,278 +1198,356 @@
                             "workflow": {
                               "name": "pull"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2031576300"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747824048"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935743"
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708487"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935775"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708713"
                               },
                               {
                                 "name": "linux-bionic-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935850"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708942"
                               },
                               {
-                                "name": "linux-bionic-rocm4.5-py3.7 / build",
+                                "name": "linux-focal-py3.7-clang7-asan / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935994"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709174"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "name": "linux-bionic-py3_7-clang8-xla / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936064"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709340"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "name": "linux-focal-py3.7-gcc7-no-ops / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936179"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709579"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936265"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709844"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936309"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710003"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936353"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710175"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "name": "win-vs2019-cuda11.6-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936395"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710516"
                               },
                               {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "name": "linux-focal-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936426"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710716"
                               },
                               {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "name": "win-vs2019-cpu-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936483"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710890"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936516"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711097"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / build",
+                                "name": "linux-focal-py3.7-clang10-onnx / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936558"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711234"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936633"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711429"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "name": "linux-focal-rocm5.2-py3.7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936705"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711603"
                               },
                               {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936736"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711765"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936756"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711946"
                               },
                               {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936796"
+                                "name": "linux-xenial-cuda11_3-py3_7-gcc7-deploy / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310712129"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936823"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310712276"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928990551"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194495"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928990588"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194591"
                               },
                               {
-                                "name": "linux-docs / build-docs (cpp)",
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992832"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194659"
                               },
                               {
-                                "name": "linux-docs / build-docs (python)",
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992868"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194749"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992932"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194858"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992965"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194934"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993011"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311195003"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993042"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311220458"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993086"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311220540"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "name": "linux-docs / build-docs (cpp)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993128"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311222725"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-docs / build-docs (python)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928995802"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311222869"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928995853"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223128"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928995889"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223225"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928997626"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223324"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928999058"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223396"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928999075"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223496"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929012407"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223569"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929012438"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223690"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
+                                "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929012469"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311224360"
                               },
                               {
-                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929034328"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311230050"
                               },
                               {
-                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929034340"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311301930"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929040801"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302152"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929045939"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302303"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929046016"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302433"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929046063"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302531"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929082254"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491082"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929082275"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491172"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929157614"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491232"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929157635"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491289"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491348"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcG0YME=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdIQ="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929157656"
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHxIT4=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGjyQg=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkGU="
-                      }
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdMA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Netlify",
+                            "databaseId": 13473
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdeE="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Azure Pipelines",
+                            "databaseId": 9426
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdfU="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Dependabot",
+                            "databaseId": 29110
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdgg="
+                      }
                     ],
                     "pageInfo": {
-                      "hasNextPage": false
+                      "hasNextPage": true
                     }
                   },
                   "status": null,
-                  "pushedDate": "2022-03-24T00:42:33Z",
-                  "oid": "6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4"
+                  "pushedDate": "2022-07-27T15:34:17Z",
+                  "oid": "28140e4008289251b695385acfb48ac7a47cd49c"
                 }
               }
             ]
@@ -1463,7 +1556,7 @@
           "files": {
             "nodes": [
               {
-                "path": "torch/nn/cpp.py"
+                "path": "test/test_ops.py"
               }
             ],
             "pageInfo": {
@@ -1475,179 +1568,109 @@
             "nodes": [
               {
                 "author": {
-                  "login": "seemethere"
+                  "login": "zou3519"
+                },
+                "state": "APPROVED"
+              },
+              {
+                "author": {
+                  "login": "Chillee"
                 },
                 "state": "APPROVED"
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMy0yM1QxNTo1MDo0NS0wNzowMLkyMDIyLTAzLTIzVDE1OjUwOjQ1LTA3OjAwzjbPEDg=",
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNy0yNVQxNDo0NTozNS0wNzowMLkyMDIyLTA3LTI1VDE0OjQ1OjM1LTA3OjAwzj6XYmg=",
               "hasPreviousPage": false
             }
           },
           "comments": {
             "nodes": [
               {
-                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/74649\n\u21a9\ufe0f \u00a0[fb-only] Re-run with SSH instructions\nNeed help or want to give feedback on the CI? Visit our office hours\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 6c3c3de (more details on the Dr. CI page):\n\n\n1/1 failures introduced in this PR\n\n\n1 failure not recognized by patterns:\n\n\n\nJob\nStep\nAction\n\n\n\n\n Lint / flake8-py3\nFail if there were any warnings\n\ud83d\udd01 rerun\n\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
-                "createdAt": "2022-03-23T22:40:51Z",
+                "bodyText": "@pytorchbot merge -f FORCE",
+                "createdAt": "2022-07-27T17:56:43Z",
                 "author": {
-                  "login": "facebook-github-bot"
+                  "login": "malfet"
                 },
                 "authorAssociation": "MEMBER",
-                "editor": {
-                  "login": "facebook-github-bot"
+                "editor": null,
+                "databaseId": 1197107402
+              },
+              {
+                "bodyText": "You need to provide a reason for using force merge, in the format @pytorchbot merge -f '[CATEGORY] Explanation'. With [CATEGORY] being one the following:\nEMERGENCY - an emergency fix to quickly address an issue\nMINOR - a minor fix such as cleaning locally unused variables, which shouldn't break anything\nPRE_TESTED - a previous CI run tested everything and you've only added minor changes like fixing lint\nOTHER - something not covered above",
+                "createdAt": "2022-07-27T17:56:45Z",
+                "author": {
+                  "login": "pytorch-bot"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1197107439
+              },
+              {
+                "bodyText": "@pytorchbot merge -f \"[OTHER] normal land failed twice already\"",
+                "createdAt": "2022-07-27T17:57:28Z",
+                "author": {
+                  "login": "malfet"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1197108130
+              },
+              {
+                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here",
+                "createdAt": "2022-07-27T18:08:13Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1197119348
+              },
+              {
+                "bodyText": "Hey @ezyang.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-07-27T18:08:58Z",
+                "author": {
+                  "login": "github-actions"
                 },
-                "databaseId": 1076891218
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1197120095
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOQDAOUg==",
-              "hasPreviousPage": false
+              "startCursor": "Y3Vyc29yOnYyOpHOR1poyg==",
+              "hasPreviousPage": true
             }
           },
           "labels": {
             "edges": [
+              {
+                "node": {
+                  "name": "Merged"
+                }
+              },
               {
                 "node": {
                   "name": "cla signed"
                 }
               }
             ]
-          },
-          "headRef": null
+          }
         }
       }
     }
   },
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=77700 owner=pytorch": {
+  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAcHRdgg= name=pytorch number=82169 owner=pytorch": {
     "data": {
       "repository": {
         "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "kit1980"
-          },
-          "title": "Move pull linux-docs job to Ubuntu 20.04",
-          "body": "",
-          "headRefName": "sdym/pull-xenial-focal-linux-docs",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kit1980"
-                    },
-                    "email": "sdym@fb.com",
-                    "name": "Sergii Dymchenko"
-                  },
-                  "oid": "81261599614423baa17df72300b8e109677b6799"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MQ",
-              "hasNextPage": false
-            },
-            "totalCount": 1
-          },
           "commits": {
             "nodes": [
               {
                 "commit": {
+                  "oid": "28140e4008289251b695385acfb48ac7a47cd49c",
                   "checkSuites": {
                     "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.facebook.com/cla/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNmNqE=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuMI="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Netlify",
-                            "databaseId": 13473
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuM4="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Azure Pipelines",
-                            "databaseId": 9426
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuNU="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Dependabot",
-                            "databaseId": 29110
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuOI="
-                      },
                       {
                         "node": {
                           "app": {
@@ -1664,7 +1687,7 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuPI="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdhg="
                       },
                       {
                         "node": {
@@ -1682,66 +1705,168 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuQQ="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdic="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  }
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAcG0YME= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAcHRdAs= name=pytorch number=82169 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "oid": "28140e4008289251b695385acfb48ac7a47cd49c",
+                  "checkSuites": {
+                    "nodes": [
+                      {
+                        "checkRuns": {
+                          "nodes": [
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491405"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2348867841"
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491484"
+                            },
+                            {
+                              "name": "linux-xenial-cuda11_3-py3_7-gcc7-deploy / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491703"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311551941"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311552010"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (functorch, 1, 1, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311552076"
+                            }
+                          ],
+                          "pageInfo": {
+                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcG1sTc=",
+                            "hasNextPage": false
+                          }
+                        }
+                      }
+                    ]
+                  }
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=73811 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "seemethere"
+          },
+          "title": "ci: Migrate metrics credentials to managed IAM",
+          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* __->__ #73811\n\r\nMigrates our credentials to upload metrics statistics to managed IAM\r\ncredentials in order to make it easier to know where the credentials are\r\ncoming from and to make it easier to add more permissions / less\r\npermissions later on.\r\n\r\nRelates to work done in [D34535827](https://www.internalfb.com/diff/D34535827)\r\n\r\nSigned-off-by: Eli Uriegas <eliuriegas@fb.com>",
+          "headRefName": "gh/seemethere/215/head",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "gh/seemethere/215/base",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "seemethere"
+                    },
+                    "email": "eliuriegas@fb.com",
+                    "name": "Eli Uriegas"
+                  },
+                  "oid": "13c44d16a876a56bca479b4cf30715d21fa16e99"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "seemethere"
+                    },
+                    "email": "eliuriegas@fb.com",
+                    "name": "Eli Uriegas"
+                  },
+                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mg",
+              "hasNextPage": false
+            },
+            "totalCount": 2
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
                           },
+                          "workflowRun": null,
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "lintrunner",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528127876"
-                              },
-                              {
-                                "name": "workflow-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128023"
-                              },
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128196"
-                              },
-                              {
-                                "name": "Test collect_env (with_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128519"
-                              },
-                              {
-                                "name": "Test collect_env (without_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128575"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128663"
-                              },
-                              {
-                                "name": "Test tools",
+                                "name": "Facebook CLA Check",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128857"
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdYVY=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOaHA=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuzA="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcBs="
                       },
                       {
                         "node": {
@@ -1751,26 +1876,20 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                              "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2348867843"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602960"
                           },
                           "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867843/jobs/3528127882"
-                              }
-                            ],
+                            "nodes": [],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdXEg=",
+                              "endCursor": null,
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "SKIPPED"
+                          "conclusion": "CANCELLED"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuzg="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcPo="
                       },
                       {
                         "node": {
@@ -1780,96 +1899,141 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "docker-builds"
+                              "name": "win-vs2019-cpu-py3"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2348867844"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602961"
                           },
                           "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "docker-build (pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528127883"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528127945"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128001"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-bionic-py3.7-clang9)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128067"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-bionic-rocm5.0-py3.7)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128124"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-bionic-rocm5.1-py3.7)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128191"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128259"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128321"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-xenial-py3-clang5-android-ndk-r19c)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128365"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-xenial-py3-clang5-asan)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128446"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-xenial-py3-clang7-asan)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128507"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-xenial-py3-clang7-onnx)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128563"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-xenial-py3.7-gcc5.4)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128639"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-xenial-py3.7-gcc7)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128687"
-                              },
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcPw="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3-clang5-mobile-build"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602963"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcP4="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602964"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcP8="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602965"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQE="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602967"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc7-no-ops"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602966"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "docker-build (pytorch-linux-focal-py3.7-gcc7)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128741"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602966/jobs/2839950629"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdYLI=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObRM=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduu0A="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQM="
                       },
                       {
                         "node": {
@@ -1879,280 +2043,70 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "pull"
+                              "name": "Test tools"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2348867849"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602968"
                           },
                           "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528150762"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528150903"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151086"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151258"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151511"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.1-py3.7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151776"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151896"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152014"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152139"
-                              },
-                              {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152216"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152378"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152516"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152599"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152723"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152802"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152913"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152969"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153005"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153062"
-                              },
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153125"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153207"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528242483"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528242528"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245875"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245914"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245964"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528246008"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528248520"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528255086"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528255128"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274064"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274097"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274133"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274173"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274209"
-                              },
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528277014"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528308958"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309747"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309810"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309837"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309864"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309895"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309925"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528310044"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528310101"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384337"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384379"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384408"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384441"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384471"
-                              }
-                            ],
+                            "nodes": [],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNi1Nc=",
-                              "hasNextPage": true
+                              "endCursor": null,
+                              "hasNextPage": false
                             }
                           },
-                          "conclusion": "SUCCESS"
+                          "conclusion": "CANCELLED"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduu1E="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQQ="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-clang7-asan"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602970"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQU="
                       }
                     ],
                     "pageInfo": {
-                      "hasNextPage": false
+                      "hasNextPage": true
                     }
                   },
-                  "status": null,
-                  "pushedDate": "2022-05-19T00:02:11Z",
-                  "oid": "81261599614423baa17df72300b8e109677b6799"
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17044969?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17045014?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17044975?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      }
+                    ]
+                  },
+                  "pushedDate": "2022-03-14T23:01:55Z",
+                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7"
                 }
               }
             ]
@@ -2161,13 +2115,13 @@
           "files": {
             "nodes": [
               {
-                "path": ".circleci/docker/build.sh"
+                "path": ".github/templates/common.yml.j2"
               },
               {
-                "path": ".circleci/docker/common/install_katex.sh"
+                "path": ".github/workflows/generated-macos-11-py3-x86-64.yml"
               },
               {
-                "path": ".github/workflows/pull.yml"
+                "path": ".github/workflows/update_pytorch_labels.yml"
               }
             ],
             "pageInfo": {
@@ -2177,17 +2131,11 @@
           },
           "reviews": {
             "nodes": [
-              {
-                "author": {
-                  "login": "suo"
-                },
-                "state": "COMMENTED"
-              },
               {
                 "author": {
                   "login": "kit1980"
                 },
-                "state": "COMMENTED"
+                "state": "APPROVED"
               },
               {
                 "author": {
@@ -2197,110 +2145,82 @@
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNS0xOFQxMjo0MTowNS0wNzowMLkyMDIyLTA1LTE4VDEyOjQxOjA0LTA3OjAwzjpD7es=",
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMy0wNFQxNDoyNDo0OC0wODowMLkyMDIyLTAzLTA0VDE0OjI0OjQ4LTA4OjAwzjWwwqA=",
               "hasPreviousPage": false
             }
           },
           "comments": {
             "nodes": [
               {
-                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/77700\n\ud83d\udcc4 \u00a0Preview Python docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\u2753Need help or want to give feedback on the CI? Visit our office hours\n\n\u2705 No Failures (0 Pending)\nAs of commit 8126159 (more details on the Dr. CI page):\nExpand to see more\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
-                "createdAt": "2022-05-17T23:01:48Z",
+                "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1988337976",
+                "createdAt": "2022-03-15T17:43:28Z",
                 "author": {
-                  "login": "facebook-github-bot"
+                  "login": "pytorchmergebot"
                 },
                 "authorAssociation": "MEMBER",
-                "editor": {
-                  "login": "facebook-github-bot"
-                },
-                "databaseId": 1129400934
+                "editor": null,
+                "databaseId": 1068270969
               },
               {
-                "bodyText": "@pytorchbot merge",
-                "createdAt": "2022-05-19T15:39:05Z",
+                "bodyText": "@pytorchbot force merge this",
+                "createdAt": "2022-03-15T20:26:36Z",
                 "author": {
-                  "login": "kit1980"
+                  "login": "seemethere"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1131884232
+                "databaseId": 1068436128
               },
               {
-                "bodyText": "Merge failed due to Refusing to merge as mandatory check(s) linux-docs / build-docs (cpp), linux-docs / build-docs (python) are pending/not yet run for rule OSS CI\nRaised by https://github.com/pytorch/pytorch/actions/runs/2353067846",
-                "createdAt": "2022-05-19T15:40:59Z",
+                "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1989076952",
+                "createdAt": "2022-03-15T20:27:47Z",
                 "author": {
                   "login": "pytorchmergebot"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1131886153
+                "databaseId": 1068437098
               },
               {
-                "bodyText": "@pytorchbot merge -f",
-                "createdAt": "2022-05-19T16:41:29Z",
+                "bodyText": "@pytorchbot merge this",
+                "createdAt": "2022-03-15T21:18:55Z",
                 "author": {
-                  "login": "kit1980"
+                  "login": "seemethere"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1131945610
+                "databaseId": 1068482921
               },
               {
-                "bodyText": "Hey @kit1980.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-05-19T16:43:37Z",
+                "bodyText": "Hey @seemethere.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-03-15T21:20:40Z",
                 "author": {
                   "login": "github-actions"
                 },
                 "authorAssociation": "NONE",
                 "editor": null,
-                "databaseId": 1131947473
+                "databaseId": 1068484404
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOQ1FKZg==",
-              "hasPreviousPage": false
+              "startCursor": "Y3Vyc29yOnYyOpHOP6yFeQ==",
+              "hasPreviousPage": true
             }
           },
           "labels": {
             "edges": [
-              {
-                "node": {
-                  "name": "Merged"
-                }
-              },
               {
                 "node": {
                   "name": "cla signed"
                 }
               }
             ]
-          },
-          "headRef": {
-            "compare": {
-              "commits": {
-                "edges": [
-                  {
-                    "node": {
-                      "parents": {
-                        "edges": [
-                          {
-                            "node": {
-                              "oid": "6afe341276f9ffa660446c5fa15b68558791869a"
-                            }
-                          }
-                        ]
-                      }
-                    }
-                  }
-                ]
-              }
-            }
           }
         }
       }
     }
   },
-  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAYNi1Nc= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAYduu0A= name=pytorch number=77700 owner=pytorch": {
+  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcQU= name=pytorch number=73811 owner=pytorch": {
     "data": {
       "repository": {
         "pullRequest": {
@@ -2308,20500 +2228,309 @@
             "nodes": [
               {
                 "commit": {
-                  "oid": "81261599614423baa17df72300b8e109677b6799",
+                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7",
                   "checkSuites": {
-                    "nodes": [
+                    "edges": [
                       {
-                        "checkRuns": {
-                          "nodes": [
-                            {
-                              "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384494"
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
                             },
-                            {
-                              "name": "linux-docs / build-docs (cpp)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528477548"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602969"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQY="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-bionic-py3.7-clang9"
                             },
-                            {
-                              "name": "linux-docs / build-docs (python)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528477578"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602971"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQc="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-clang7-onnx"
                             },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528728152"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602972"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQg="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc7"
                             },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528728187"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602973"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602973/jobs/2839950664"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602973/jobs/2840019714"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602973/jobs/2840019747"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602973/jobs/2840019794"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqP89A=",
+                              "hasNextPage": false
                             }
-                          ],
-                          "pageInfo": {
-                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNqJcE=",
-                            "hasNextPage": false
-                          }
-                        }
-                      }
-                    ]
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=None name=metamates org=pytorch": {
-    "data": {
-      "organization": {
-        "team": {
-          "members": {
-            "nodes": [
-              {
-                "login": "dreiss"
-              },
-              {
-                "login": "kumpera"
-              },
-              {
-                "login": "zpao"
-              },
-              {
-                "login": "ezyang"
-              },
-              {
-                "login": "stephenroller"
-              },
-              {
-                "login": "swolchok"
-              },
-              {
-                "login": "hyuen"
-              },
-              {
-                "login": "orionr"
-              },
-              {
-                "login": "dhruvbird"
-              },
-              {
-                "login": "likethesky"
-              },
-              {
-                "login": "lw"
-              },
-              {
-                "login": "raziel"
-              },
-              {
-                "login": "simpkins"
-              },
-              {
-                "login": "ebyrne"
-              },
-              {
-                "login": "Babar"
-              },
-              {
-                "login": "kostmo"
-              },
-              {
-                "login": "bhosmer"
-              },
-              {
-                "login": "digantdesai"
-              },
-              {
-                "login": "zdevito"
-              },
-              {
-                "login": "bugra"
-              },
-              {
-                "login": "kunalb"
-              },
-              {
-                "login": "kit1980"
-              },
-              {
-                "login": "shoumikhin"
-              },
-              {
-                "login": "huydhn"
-              },
-              {
-                "login": "teytaud"
-              },
-              {
-                "login": "xuzhao9"
-              },
-              {
-                "login": "jansel"
-              },
-              {
-                "login": "abhinavarora"
-              },
-              {
-                "login": "djthorne"
-              },
-              {
-                "login": "Mortimerp9"
-              },
-              {
-                "login": "dadkins20"
-              },
-              {
-                "login": "colesbury"
-              },
-              {
-                "login": "laurencer"
-              },
-              {
-                "login": "nickgg"
-              },
-              {
-                "login": "yzhao30"
-              },
-              {
-                "login": "rmaz"
-              },
-              {
-                "login": "bearzx"
-              },
-              {
-                "login": "mattjgalloway"
-              },
-              {
-                "login": "chenyang78"
-              },
-              {
-                "login": "yns88"
-              },
-              {
-                "login": "lc0"
-              },
-              {
-                "login": "wenleix"
-              },
-              {
-                "login": "jingsh"
-              },
-              {
-                "login": "mthrok"
-              },
-              {
-                "login": "drdarshan"
-              },
-              {
-                "login": "d4l3k"
-              },
-              {
-                "login": "jamiemccrindle"
-              },
-              {
-                "login": "kazhang"
-              },
-              {
-                "login": "simonhollis"
-              },
-              {
-                "login": "govardhan"
-              },
-              {
-                "login": "yinghai"
-              },
-              {
-                "login": "zyan0"
-              },
-              {
-                "login": "ajtulloch"
-              },
-              {
-                "login": "smeenai"
-              },
-              {
-                "login": "vtlam"
-              },
-              {
-                "login": "khabinov"
-              },
-              {
-                "login": "NicolasHug"
-              },
-              {
-                "login": "jfix71"
-              },
-              {
-                "login": "atuljangra"
-              },
-              {
-                "login": "rshraga"
-              },
-              {
-                "login": "idning"
-              },
-              {
-                "login": "soumith"
-              },
-              {
-                "login": "nimin98"
-              },
-              {
-                "login": "chaekit"
-              },
-              {
-                "login": "xunnanxu"
-              },
-              {
-                "login": "mergennachin"
-              },
-              {
-                "login": "javier-m"
-              },
-              {
-                "login": "mostafaelhoushi"
-              },
-              {
-                "login": "brianjo"
-              },
-              {
-                "login": "suo"
-              },
-              {
-                "login": "vkuzo"
-              },
-              {
-                "login": "seemethere"
-              },
-              {
-                "login": "cpuhrsch"
-              },
-              {
-                "login": "qihqi"
-              },
-              {
-                "login": "jackm321"
-              },
-              {
-                "login": "linbinyu"
-              },
-              {
-                "login": "neerajprad"
-              },
-              {
-                "login": "rsemenov"
-              },
-              {
-                "login": "ziky90"
-              },
-              {
-                "login": "gmagogsfm"
-              },
-              {
-                "login": "zzzwen"
-              },
-              {
-                "login": "yanboliang"
-              },
-              {
-                "login": "andrewor14"
-              },
-              {
-                "login": "jianyuh"
-              },
-              {
-                "login": "cykustcc"
-              },
-              {
-                "login": "highker"
-              },
-              {
-                "login": "jeffreyksmithjr"
-              },
-              {
-                "login": "smessmer"
-              },
-              {
-                "login": "ananthsub"
-              },
-              {
-                "login": "malfet"
-              },
-              {
-                "login": "fegin"
-              },
-              {
-                "login": "zanqi"
-              },
-              {
-                "login": "supriyar"
-              },
-              {
-                "login": "kausv"
-              },
-              {
-                "login": "dagitses"
-              },
-              {
-                "login": "yhcharles"
-              },
-              {
-                "login": "bilgeacun"
-              },
-              {
-                "login": "caogao"
-              },
-              {
-                "login": "miguelmartin75"
-              },
-              {
-                "login": "penguinwu"
-              }
-            ],
-            "pageInfo": {
-              "hasNextPage": true,
-              "endCursor": "Y3Vyc29yOnYyOpHOADBnlQ=="
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOADBnlQ== name=metamates org=pytorch": {
-    "data": {
-      "organization": {
-        "team": {
-          "members": {
-            "nodes": [
-              {
-                "login": "shz117"
-              },
-              {
-                "login": "ajliu"
-              },
-              {
-                "login": "msaroufim"
-              },
-              {
-                "login": "davides"
-              },
-              {
-                "login": "alannnna"
-              },
-              {
-                "login": "hlin09"
-              },
-              {
-                "login": "hudeven"
-              },
-              {
-                "login": "terrychenism"
-              },
-              {
-                "login": "xiaomengy"
-              },
-              {
-                "login": "jisaacso"
-              },
-              {
-                "login": "fkhan1337"
-              },
-              {
-                "login": "xing-liu"
-              },
-              {
-                "login": "alanadakotashine"
-              },
-              {
-                "login": "desertfire"
-              },
-              {
-                "login": "YosuaMichael"
-              },
-              {
-                "login": "banitag1"
-              },
-              {
-                "login": "gchanan"
-              },
-              {
-                "login": "dbort"
-              },
-              {
-                "login": "DanilBaibak"
-              },
-              {
-                "login": "serhaty"
-              },
-              {
-                "login": "yf225"
-              },
-              {
-                "login": "mlazos"
-              },
-              {
-                "login": "yifuwang"
-              },
-              {
-                "login": "z-a-f"
-              },
-              {
-                "login": "tenpercent"
-              },
-              {
-                "login": "bertmaher"
-              },
-              {
-                "login": "chauhang"
-              },
-              {
-                "login": "ZainRizvi"
-              },
-              {
-                "login": "jiayisuse"
-              },
-              {
-                "login": "bochko"
-              },
-              {
-                "login": "jeanschmidt"
-              },
-              {
-                "login": "bradleyhd"
-              },
-              {
-                "login": "voznesenskym"
-              },
-              {
-                "login": "bwasti"
-              },
-              {
-                "login": "NivekT"
-              },
-              {
-                "login": "zhxchen17"
-              },
-              {
-                "login": "jerryzh168"
-              },
-              {
-                "login": "wconstab"
-              },
-              {
-                "login": "Hangjun"
-              },
-              {
-                "login": "davidberard98"
-              },
-              {
-                "login": "CamiWilliams"
-              },
-              {
-                "login": "avikchaudhuri"
-              },
-              {
-                "login": "datumbox"
-              },
-              {
-                "login": "aartibasant"
-              },
-              {
-                "login": "xta0"
-              },
-              {
-                "login": "zou3519"
-              },
-              {
-                "login": "xman1979"
-              },
-              {
-                "login": "suraj813"
-              },
-              {
-                "login": "gqchen"
-              },
-              {
-                "login": "abhikrish"
-              },
-              {
-                "login": "zhangguanheng66"
-              },
-              {
-                "login": "mikeiovine"
-              },
-              {
-                "login": "Chillee"
-              },
-              {
-                "login": "albanD"
-              },
-              {
-                "login": "bigfootjon"
-              },
-              {
-                "login": "robotal"
-              },
-              {
-                "login": "MarcioPorto"
-              },
-              {
-                "login": "srsuryadev"
-              },
-              {
-                "login": "IvanKobzarev"
-              },
-              {
-                "login": "eprivezentsev"
-              },
-              {
-                "login": "kwen2501"
-              },
-              {
-                "login": "chandlerzuo"
-              },
-              {
-                "login": "otsneh"
-              },
-              {
-                "login": "husthyc"
-              },
-              {
-                "login": "briancoutinho"
-              },
-              {
-                "login": "fduwjj"
-              },
-              {
-                "login": "frank-wei"
-              },
-              {
-                "login": "QuentinDuval"
-              },
-              {
-                "login": "atalman"
-              },
-              {
-                "login": "xush6528"
-              },
-              {
-                "login": "dracifer"
-              },
-              {
-                "login": "SS-JIA"
-              },
-              {
-                "login": "helunwencser"
-              },
-              {
-                "login": "xw285cornell"
-              },
-              {
-                "login": "hhbyyh"
-              },
-              {
-                "login": "rohan-varma"
-              },
-              {
-                "login": "jcaip"
-              },
-              {
-                "login": "teng-li"
-              },
-              {
-                "login": "larryliu0820"
-              },
-              {
-                "login": "lyoka"
-              },
-              {
-                "login": "cbalioglu"
-              },
-              {
-                "login": "hl475"
-              },
-              {
-                "login": "hwangjeff"
-              },
-              {
-                "login": "Jack-Khuu"
-              },
-              {
-                "login": "mehtanirav"
-              },
-              {
-                "login": "nateanl"
-              },
-              {
-                "login": "fuqianz"
-              },
-              {
-                "login": "boyuantan"
-              },
-              {
-                "login": "muntaqim"
-              },
-              {
-                "login": "fmassa"
-              },
-              {
-                "login": "esantorella"
-              },
-              {
-                "login": "HamidShojanazeri"
-              },
-              {
-                "login": "jubinchheda"
-              },
-              {
-                "login": "mehdimashayekhi"
-              },
-              {
-                "login": "rkindi"
-              },
-              {
-                "login": "wanchaol"
-              },
-              {
-                "login": "zephirefaith"
-              },
-              {
-                "login": "kapilsh"
-              },
-              {
-                "login": "plahera"
-              },
-              {
-                "login": "SherlockNoMad"
-              }
-            ],
-            "pageInfo": {
-              "hasNextPage": true,
-              "endCursor": "Y3Vyc29yOnYyOpHOAJcqOQ=="
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOAJcqOQ== name=metamates org=pytorch": {
-    "data": {
-      "organization": {
-        "team": {
-          "members": {
-            "nodes": [
-              {
-                "login": "pritamdamania87"
-              },
-              {
-                "login": "iseeyuan"
-              },
-              {
-                "login": "protonu"
-              },
-              {
-                "login": "terhuhf"
-              },
-              {
-                "login": "aruntonic"
-              },
-              {
-                "login": "gcatron"
-              },
-              {
-                "login": "yingrliu"
-              },
-              {
-                "login": "alexanderguzhva"
-              },
-              {
-                "login": "angelayi"
-              },
-              {
-                "login": "zhaoalex"
-              },
-              {
-                "login": "vivekmig"
-              },
-              {
-                "login": "sangongs"
-              },
-              {
-                "login": "akshaypandian"
-              },
-              {
-                "login": "drej82"
-              },
-              {
-                "login": "tktrungna"
-              },
-              {
-                "login": "eellison"
-              },
-              {
-                "login": "ydwu4"
-              },
-              {
-                "login": "NarineK"
-              },
-              {
-                "login": "andrewconnors"
-              },
-              {
-                "login": "wenwei202"
-              },
-              {
-                "login": "jg2912"
-              },
-              {
-                "login": "XilunWu"
-              },
-              {
-                "login": "robieta"
-              },
-              {
-                "login": "mreso"
-              },
-              {
-                "login": "soulitzer"
-              },
-              {
-                "login": "PaliC"
-              },
-              {
-                "login": "anijain2305"
-              },
-              {
-                "login": "pvtuan10"
-              },
-              {
-                "login": "osalpekar"
-              },
-              {
-                "login": "xiaohui-zhang"
-              },
-              {
-                "login": "jerry39213gh"
-              },
-              {
-                "login": "jarodhou"
-              },
-              {
-                "login": "hlu1"
-              },
-              {
-                "login": "H-Huang"
-              },
-              {
-                "login": "vtsyvina"
-              },
-              {
-                "login": "PratsBhatt"
-              },
-              {
-                "login": "Nitrokitty"
-              },
-              {
-                "login": "satgera"
-              },
-              {
-                "login": "ngimel"
-              },
-              {
-                "login": "markkm"
-              },
-              {
-                "login": "EscapeZero"
-              },
-              {
-                "login": "bdhirsh"
-              },
-              {
-                "login": "cccclai"
-              },
-              {
-                "login": "carolineechen"
-              },
-              {
-                "login": "tugsbayasgalan"
-              },
-              {
-                "login": "agunapal"
-              },
-              {
-                "login": "frankseide"
-              },
-              {
-                "login": "YazhiGao"
-              },
-              {
-                "login": "mrshenli"
-              },
-              {
-                "login": "bashnick"
-              },
-              {
-                "login": "lena-kashtelyan"
-              },
-              {
-                "login": "brad-mengchi"
-              },
-              {
-                "login": "kimishpatel"
-              },
-              {
-                "login": "aaronenyeshi"
-              },
-              {
-                "login": "shajrawi"
-              },
-              {
-                "login": "samdow"
-              },
-              {
-                "login": "great-way"
-              },
-              {
-                "login": "ashkan-software"
-              },
-              {
-                "login": "mortzur"
-              },
-              {
-                "login": "jbitton"
-              },
-              {
-                "login": "jdsgomes"
-              },
-              {
-                "login": "hatala91"
-              },
-              {
-                "login": "zhangxy988"
-              },
-              {
-                "login": "samlurye"
-              },
-              {
-                "login": "anjali411"
-              },
-              {
-                "login": "williamwen42"
-              },
-              {
-                "login": "joecummings"
-              },
-              {
-                "login": "842974287"
-              },
-              {
-                "login": "JacobSzwejbka"
-              },
-              {
-                "login": "nishantpdce"
-              },
-              {
-                "login": "srinivas212"
-              },
-              {
-                "login": "shreyanb98"
-              },
-              {
-                "login": "naveedgol"
-              },
-              {
-                "login": "Nayef211"
-              },
-              {
-                "login": "HengruiX"
-              },
-              {
-                "login": "sgrigory"
-              },
-              {
-                "login": "chekangliang"
-              },
-              {
-                "login": "ebsmothers"
-              },
-              {
-                "login": "anshuljain1"
-              },
-              {
-                "login": "salilsdesai"
-              },
-              {
-                "login": "vmoens"
-              },
-              {
-                "login": "yoavnavon"
-              },
-              {
-                "login": "printfoo"
-              },
-              {
-                "login": "ErikaLal"
-              },
-              {
-                "login": "xinyang0"
-              },
-              {
-                "login": "kauterry"
-              },
-              {
-                "login": "anirbanraywork"
-              },
-              {
-                "login": "houseroad"
-              },
-              {
-                "login": "erichan1"
-              },
-              {
-                "login": "hsrussell"
-              },
-              {
-                "login": "ilia-cher"
-              },
-              {
-                "login": "ajitmaths"
-              },
-              {
-                "login": "awgu"
-              },
-              {
-                "login": "wz337"
-              },
-              {
-                "login": "qxy11"
-              },
-              {
-                "login": "janeyx99"
-              },
-              {
-                "login": "glaringlee"
-              },
-              {
-                "login": "anj-s"
-              },
-              {
-                "login": "drisspg"
-              },
-              {
-                "login": "kmh4321"
-              }
-            ],
-            "pageInfo": {
-              "hasNextPage": true,
-              "endCursor": "Y3Vyc29yOnYyOpHOAfXMcA=="
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOAfXMcA== name=metamates org=pytorch": {
-    "data": {
-      "organization": {
-        "team": {
-          "members": {
-            "nodes": [
-              {
-                "login": "RdoubleA"
-              },
-              {
-                "login": "jramseyer"
-              },
-              {
-                "login": "jianingfu"
-              },
-              {
-                "login": "gtarjun"
-              },
-              {
-                "login": "mikaylagawarecki"
-              },
-              {
-                "login": "xianxl"
-              },
-              {
-                "login": "aazzolini"
-              },
-              {
-                "login": "Xirider"
-              },
-              {
-                "login": "HDCharles"
-              },
-              {
-                "login": "mcr229"
-              },
-              {
-                "login": "manuelcandales"
-              },
-              {
-                "login": "guangy10"
-              },
-              {
-                "login": "mengwa41"
-              },
-              {
-                "login": "YulunW"
-              },
-              {
-                "login": "danthe3rd"
-              },
-              {
-                "login": "hx89"
-              },
-              {
-                "login": "itang00"
-              },
-              {
-                "login": "hanhsienhuang"
-              },
-              {
-                "login": "clee2000"
-              },
-              {
-                "login": "lhuang04"
-              },
-              {
-                "login": "gottbrath"
-              },
-              {
-                "login": "lessw2020"
-              },
-              {
-                "login": "taivu1998"
-              },
-              {
-                "login": "danrecoskie"
-              },
-              {
-                "login": "zhaojuanmao"
-              },
-              {
-                "login": "johncalab"
-              },
-              {
-                "login": "dhthompson"
-              },
-              {
-                "login": "superwizard2019"
-              },
-              {
-                "login": "TovlyFB"
-              },
-              {
-                "login": "shunting314"
-              },
-              {
-                "login": "xcheng16"
-              },
-              {
-                "login": "adamomainz"
-              },
-              {
-                "login": "sluks"
-              },
-              {
-                "login": "SebastianAment"
-              },
-              {
-                "login": "ansley"
-              },
-              {
-                "login": "cheetah2216"
-              },
-              {
-                "login": "mikekgfb"
-              },
-              {
-                "login": "pinaki-mukerji"
-              },
-              {
-                "login": "kyulee-com"
-              },
-              {
-                "login": "sstsai-adl"
-              },
-              {
-                "login": "dahsh"
-              },
-              {
-                "login": "szewaiyuen7"
-              },
-              {
-                "login": "byterover"
-              },
-              {
-                "login": "wmao533"
-              },
-              {
-                "login": "ejguan"
-              },
-              {
-                "login": "nimaelyasi"
-              },
-              {
-                "login": "qxu-fb"
-              },
-              {
-                "login": "sshawnwu"
-              },
-              {
-                "login": "iramazanli"
-              },
-              {
-                "login": "jnkwok1"
-              },
-              {
-                "login": "kurman"
-              },
-              {
-                "login": "jbschlosser"
-              },
-              {
-                "login": "haichuan-fb"
-              },
-              {
-                "login": "wwang84"
-              },
-              {
-                "login": "JustinPinero"
-              },
-              {
-                "login": "gcramer23"
-              },
-              {
-                "login": "yuguo68"
-              },
-              {
-                "login": "c-odrin"
-              },
-              {
-                "login": "chowarfb"
-              },
-              {
-                "login": "priyaramani"
-              },
-              {
-                "login": "asalioufb"
-              },
-              {
-                "login": "four4fish"
-              },
-              {
-                "login": "kkosik20"
-              },
-              {
-                "login": "KZFB"
-              },
-              {
-                "login": "henryliu-bluehills"
-              },
-              {
-                "login": "minjungkim85"
-              },
-              {
-                "login": "muchulee8"
-              },
-              {
-                "login": "kirklandsign"
-              },
-              {
-                "login": "jiawenliu64"
-              },
-              {
-                "login": "izaitsevfb"
-              },
-              {
-                "login": "ashramac"
-              },
-              {
-                "login": "weiwangmeta"
-              },
-              {
-                "login": "andysamfb"
-              },
-              {
-                "login": "yulin0077"
-              },
-              {
-                "login": "l-kirsch"
-              }
-            ],
-            "pageInfo": {
-              "hasNextPage": false,
-              "endCursor": "Y3Vyc29yOnYyOpHOBvyzkQ=="
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=75095 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "mruberry"
-          },
-          "title": "Initial prims, references, and test architecture for them",
-          "body": "This PR adds an initial set of experimental primitive operations and Python references that reimplement existing PyTorch operations using them. See https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-0/577 for additional context.\r\n\r\nThe following experimental primitives are added:\r\n\r\n- Elementwise unary prims -- abs, acos, acosh, asin, atan, cos, cosh, bessel_i0e, bessel_i1e, cbrt, ceil, digamma, erf, erf_inv, erfc, exp, expm1, floor, igamma, igammac, is_finite, lgamma, log, log1p, neg, reciprocal, round, sign, sinh, sqrt, square, tan. \r\n- Elementwise binary prims -- add, atan2, bitwise_and, bitwise_not, bitwise_or, bitwise_xor, div, eq, ge, gt, le, lt, max, min, mul, ne, nextafter, pow, rsqrt, shift_left, shift_right_arithmetic\r\n- View prims -- brodcast_in_dim, collapse_view, split_dim, squeeze\r\n- Shape prims -- collapse, concatenate, reshape\r\n- Conditional prims -- select\r\n- Data conversion & movement prims -- convert_element_type, device_put\r\n- Inplace prims -- copy_to, resize\r\n\r\nThese primitives do not add any new functionality to PyTorch, but are intended to be the semantic building blocks for reference operators. We have tried to make them consistent with the operations in [jax.lax](https://jax.readthedocs.io/en/latest/jax.lax.html) where possible (because PyTorch prefers being consistent with other frameworks), although there are key differences between these prims and operations in jax.lax. Most notably is that these prims model view semantics and inplace operations.\r\n\r\nIn addition to these primitives the following elementwise binary Python references are added:\r\n\r\n- Elementwise binary Python references -- add, atan2, bitwise_and, bitwise_left_shift, bitwise_or, bitwise_right_shift, bitwise_xor, eq, float_power, ge, gt, le, lt, maximum, minimum, mul, ne, nextafter, pow, sub, true_divide\r\n- Conditional Python references - where\r\n- Data conversion & movement references - copy_to\r\n\r\nA Python reference implements the same behavior as its corresponding PyTorch operator (excepting slight numerical differences, bug fixes, and in some cases additional features). \r\n\r\nThe start of an OpInfo-based test architecture for these references is also included in this PR. A new list, `python_ref_db`, is added to `common_methods_invocations.py`. This list introduces the new `ElementwiseBinaryPythonRefInfo`, which inherits input arguments from the original operators' OpInfo, allows them to be overridden, and then constructs the OpInfo for the Python reference using the (potentially modified) arguments. OpInfo-based tests can opt-into testing references by including this new list in the Sequence passed to the `@ops` decorator. \r\n\r\ncc @ngimel @csarofeen @kevinstephano @Lezcano ",
-          "headRefName": "prims_and_references",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "a790467c650be92775103cde5e866c90b56f5376"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "bd6fcf50692e208ebecdc2eaa517a2bfcdcd35cf"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "4a119c8f21529fe1375e7e8789b91f41a3df80c5"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "ea6750dc34d66be759fdfe84b09fb0e23ee59c79"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "2eef8a55fe0227e1921b51bf1f56f9d0a29b49ac"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "b886ed6c20dd1785fd31ed6fa6a8c5b6d0d0b16c"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "9ad9b63d09aa4f7a8549bcf1d88ea4ff0674299c"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "63fdd580118477416ae160e0670ae722ea248090"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "0ccf7dc292af1d40d0a094eb2b2fb0c7ab4ccc70"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "e8a8a4d1fbe35f20eb88e1a43cf5a653883638e5"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "186634dfdd25645c05b58a212f9e8d77c4125fc0"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "f5b4741312b5c42a79f6c8a1d3930b79db38ed8f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "23d50391bb0fd12111fd3171591c4235ffb2fc1a"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "bac9d45422d58f513b60b4b854441cfdc253d4c5"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "13240ae0b4a0332c3167b65ac026a3172da90cb7"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "1ee34468cb1db3dc6cbae204669f4fec20e2a466"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "561d132bc686d00e8911f7feb3da5901b2bdc574"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ngimel"
-                    },
-                    "email": "ngimel@fb.com",
-                    "name": "Natalia Gimelshein"
-                  },
-                  "oid": "ac42bedc84b7c96256376ad09917263bb020b2c3"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ngimel"
-                    },
-                    "email": "ngimel@fb.com",
-                    "name": "Natalia Gimelshein"
-                  },
-                  "oid": "7f7d5ba40a0b5e10526d90b018b30b54673d12d8"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "37a6b4a8b1adb712d5777c7c3479866c27fb3c4e"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ngimel"
-                    },
-                    "email": "ngimel@fb.com",
-                    "name": "Natalia Gimelshein"
-                  },
-                  "oid": "65b613868c44e519c1777af79b9fd3498c5a7e58"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ngimel"
-                    },
-                    "email": "ngimel@fb.com",
-                    "name": "Natalia Gimelshein"
-                  },
-                  "oid": "442c405e9da0d66744ef03e379224c41eedf5b57"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "031ac49ae9c192989385986b6707fa781e3229e0"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "9a6c3b00039c0c985c1c9cb59490012d1c0b38ba"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "d5c30e408af1889b90012d2e09f6ec3cda333bcb"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "db355d55655bb252a699cd532441bb98e52b98d5"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MjY",
-              "hasNextPage": false
-            },
-            "totalCount": 26
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              },
-                              {
-                                "name": "Meta Internal-Only Changes Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://opensource.facebook.com/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6ux14=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC2o="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Netlify",
-                            "databaseId": 13473
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC2w="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Azure Pipelines",
-                            "databaseId": 9426
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC3U="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Dependabot",
-                            "databaseId": 29110
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC3o="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Codecov",
-                            "databaseId": 254
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC34="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "PyTorch Bot",
-                            "databaseId": 40112
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC4E="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2217622865"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622865/jobs/3270915028"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-c8=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDNo="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2217622869"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915027"
-                              },
-                              {
-                                "name": "lintrunner",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915071"
-                              },
-                              {
-                                "name": "Test tools",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915141"
-                              },
-                              {
-                                "name": "Test collect_env (with_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915194"
-                              },
-                              {
-                                "name": "Test collect_env (without_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915229"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915283"
-                              },
-                              {
-                                "name": "workflow-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915321"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-zM=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDOY="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pull"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2217622878"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927344"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.0-py3.7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927442"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927507"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927567"
-                              },
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927674"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927727"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927802"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927853"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927948"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927996"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928061"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928116"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928198"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928256"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928291"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928317"
-                              },
-                              {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928338"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928367"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928410"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928445"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991071"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991125"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991162"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991195"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991233"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991261"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (cpp)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991305"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (python)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991349"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996024"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996068"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996092"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996505"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270998987"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270999027"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271006886"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271006941"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018097"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018135"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018162"
-                              },
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271021143"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.0-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271034041"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.0-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271034072"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271048218"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049553"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049587"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049616"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271068293"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271068336"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271149276"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271149321"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6jVK8=",
-                              "hasNextPage": true
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDQA="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": false
-                    }
-                  },
-                  "status": null,
-                  "pushedDate": "2022-04-25T02:30:31Z",
-                  "oid": "db355d55655bb252a699cd532441bb98e52b98d5"
-                }
-              }
-            ]
-          },
-          "changedFiles": 5,
-          "files": {
-            "nodes": [
-              {
-                "path": "test/test_ops.py"
-              },
-              {
-                "path": "torch/_prims/__init__.py"
-              },
-              {
-                "path": "torch/_prims/utils.py"
-              },
-              {
-                "path": "torch/_refs/__init__.py"
-              },
-              {
-                "path": "torch/testing/_internal/common_methods_invocations.py"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "NQ",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
-              {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ngimel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ngimel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "zou3519"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "peterbell10"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ngimel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ngimel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ngimel"
-                },
-                "state": "APPROVED"
-              },
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0wNlQxMjo1NjoyNC0wNzowMLkyMDIyLTA0LTA2VDA4OjQwOjM4LTA3OjAwzjenO6Y=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "Ref implementations by themselves can handle any shapes (and broadcast ops by themselves don't bake in any shapes). The question is can we decide if a particular trace is applicable for a different input, but that depends on the tracing technology and what we are caching on, so out of scope for initial PR.",
-                "createdAt": "2022-04-21T19:00:28Z",
-                "author": {
-                  "login": "ngimel"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1105643418
-              },
-              {
-                "bodyText": "@pytorchbot merge this please",
-                "createdAt": "2022-04-25T04:42:29Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1108072887
-              },
-              {
-                "bodyText": "Merge failed due to 'mruberry'\nRaised by https://github.com/pytorch/pytorch/actions/runs/2218044244",
-                "createdAt": "2022-04-25T04:43:54Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1108073536
-              },
-              {
-                "bodyText": "@mruberry has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2022-04-25T04:51:11Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1108075965
-              },
-              {
-                "bodyText": "Hey @mruberry.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-04-25T09:57:56Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1108351107
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOQebHmg==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              },
-              {
-                "node": {
-                  "name": "topic: not user facing"
-                }
-              },
-              {
-                "node": {
-                  "name": "module: primTorch"
-                }
-              }
-            ]
-          },
-          "headRef": null
-        }
-      }
-    }
-  },
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=73099 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "BowenBao"
-          },
-          "title": "[ONNX] Make graph name spec-compliant (#71961)",
-          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* #73104\n* #73103\n* #73102\n* #73101\n* #73100\n* __->__ #73099\n\n[According to the ONNX spec](https://github.com/onnx/onnx/blob/main/docs/IR.md#names-within-a-graph),\nall names must adhere to C90 identifier syntax rules, which means no\ndashes.\n\nFixes: #30952",
-          "headRefName": "gh/BowenBao/138/head",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "gh/BowenBao/138/base",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "BowenBao"
-                    },
-                    "email": "bowbao@microsoft.com",
-                    "name": "BowenBao"
-                  },
-                  "oid": "3038b939eb2069653305c419326a0f47d2598e39"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MQ",
-              "hasNextPage": false
-            },
-            "totalCount": 1
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041786"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041786/jobs/2626264278"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNn9o=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7k="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-cuda11.3-py3.7-gcc7"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041785"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626264385"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417658"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417743"
-                              },
-                              {
-                                "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417885"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkRE_E=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7o="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc7-no-ops"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041789"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041789/jobs/2626264416"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJE=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7s="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-build"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041787"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041787/jobs/2626264407"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoIY=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7w="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041788"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041788/jobs/2626264422"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJs=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS74="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-bionic-py3.7-clang9"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041790"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626264414"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349405"
-                              },
-                              {
-                                "name": "test (noarch, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349522"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349618"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiwA=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS78="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-vulkan-bionic-py3.7-clang9"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041793"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041793/jobs/2626264431"
-                              },
-                              {
-                                "name": "test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041793/jobs/2626359364"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPxgQ=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8A="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-custom-build-static"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041792"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041792/jobs/2626264427"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoKA=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8I="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "win-vs2019-cpu-py3"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041791"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626264386"
-                              },
-                              {
-                                "name": "test (default, 1, 2, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626722677"
-                              },
-                              {
-                                "name": "test (default, 2, 2, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626722710"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkX070=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8M="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc7"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041803"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626264401"
-                              },
-                              {
-                                "name": "test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349045"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349141"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349272"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiQA=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8Q="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  },
-                  "status": {
-                    "contexts": [
-                      {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010288?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010289?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010488?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010326?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      }
-                    ]
-                  },
-                  "pushedDate": "2022-02-18T18:46:28Z",
-                  "oid": "3038b939eb2069653305c419326a0f47d2598e39"
-                }
-              }
-            ]
-          },
-          "changedFiles": 162,
-          "files": {
-            "nodes": [
-              {
-                "path": "test/onnx/expect/TestOperators.test_acos.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_add_broadcast.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_add_left_broadcast.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_add_size1_broadcast.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_addconstant.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_addmm.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_arange_dynamic.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_argmax.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_asin.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_at_op.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_atan.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_aten_embedding_1.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_aten_embedding_2.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_avg_pool2d.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_baddbmm.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_basic.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_batchnorm.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_batchnorm_1d.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_batchnorm_training.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_bitshift.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_c2_op.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_chunk.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_clip.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_clip_max.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_clip_min.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_concat2.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_conv.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_convtranspose.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_cos.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_cumsum.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_det.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dict.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dict_str.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dim.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dropout.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dropout_default.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dropout_opset12.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dropout_training.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dropout_training_opset12.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_elu.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_embedding_bags.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_empty_like.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_empty_like_opset7.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_equal.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_erf.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_exp.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_expand.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_flatten.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_flatten2D.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_fmod.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_frobenius_norm.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_full.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_full_like.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_gather.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_gather_opset11.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_ge.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_gelu.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_gt.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_hardtanh.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_implicit_expand.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_index.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_isnan.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_layer_norm_aten.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_le.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_linear.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_log_sigmoid.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_logsoftmax.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_lt.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_master_opset.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_max.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_maxpool.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_maxpool_dilations.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_maxpool_indices.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_mean.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_mean_dtype.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_meshgrid.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_min.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_mm.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_narrow.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_ne.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_nonzero.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_norm_p1.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_norm_p2.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_ones_like.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_pad.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_params.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_params_onnx_irv4.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_permute2.expect"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MTAw",
-              "hasNextPage": true
-            }
-          },
-          "reviews": {
-            "nodes": [
-              {
-                "author": {
-                  "login": "garymm"
-                },
-                "state": "APPROVED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMi0xOFQxNzoxODo0NC0wODowMLkyMDIyLTAyLTE4VDE3OjE4OjQ0LTA4OjAwzjTr0H0=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "This PR cannot be merged by bot due to changing > 100 files. @malfet \n  \n    \n      pytorch/.github/scripts/trymerge.py\n    \n    \n         Line 63\n      in\n      932adf2\n    \n  \n  \n    \n\n        \n          \n                 files(last: 100) { \n        \n    \n  \n\n Can this be relaxed? If not please import.",
-                "createdAt": "2022-02-22T18:22:40Z",
-                "author": {
-                  "login": "BowenBao"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1048084569
-              },
-              {
-                "bodyText": "This PR cannot be merged by bot due to changing > 100 files. @malfet\nCan this be relaxed? If not please import.\n\nWow, you've hit a really interesting problem. 100 is a limitation enforced by GitHub, see https://docs.github.com/en/graphql/overview/resource-limitations, but I can implement a pagination. Do you mind keeping it like that for a bit, want to land a fix soonish.",
-                "createdAt": "2022-02-22T18:27:29Z",
-                "author": {
-                  "login": "malfet"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1048088691
-              },
-              {
-                "bodyText": "@malfet Thank you for info. Sure, I have separated the rest of stack from this one, we'll wait for the fix to try again.",
-                "createdAt": "2022-02-22T18:29:48Z",
-                "author": {
-                  "login": "BowenBao"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1048090640
-              },
-              {
-                "bodyText": "@pytorchbot merge this",
-                "createdAt": "2022-02-24T21:42:36Z",
-                "author": {
-                  "login": "BowenBao"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1050293881
-              },
-              {
-                "bodyText": "Hey @BowenBao.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-02-24T21:44:39Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1050295451
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOPniAWQ==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "oncall: jit"
-                }
-              },
-              {
-                "node": {
-                  "name": "open source"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              },
-              {
-                "node": {
-                  "name": "release notes: onnx"
-                }
-              },
-              {
-                "node": {
-                  "name": "topic: bug fixes"
-                }
-              }
-            ]
-          },
-          "headRef": null
-        }
-      }
-    }
-  },
-  "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MTAw name=pytorch number=73099 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "files": {
-            "nodes": [
-              {
-                "path": "test/onnx/expect/TestOperators.test_pixel_shuffle.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_pow.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_prelu.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_prod.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_prod_dtype.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_rand.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_randn.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_reduce_sum_negative_indices.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_reduced_mean.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_reduced_mean_dtype.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_reduced_prod.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_reduced_prod_dtype.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_reduced_sum.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_reduced_sum_dtype.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_reducemax.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_reducemin.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_remainder.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_repeat.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_round.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_rrelu.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_rsqrt.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_rsub.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_scatter_add.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_scatter_add_opset11.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_selu.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_shape_value_map.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_sign.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_sin.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_slice.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_slice_dynamic.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_ignore_index.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_split.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_split_with_sizes.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_sqrt.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_std.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_sum.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_sum_dtype.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_tan.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_topk.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_topk_smallest_unsorted.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_transpose.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_type_as.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_unfold.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_unique.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_unsqueeze.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_upsample_nearest_size.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_view.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_view_flatten.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_zeros_like.expect"
-              },
-              {
-                "path": "torch/csrc/jit/serialization/export.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/serialization/export.h"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MTYy",
-              "hasNextPage": false
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=73969 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": true,
-          "author": {
-            "login": "malfet"
-          },
-          "title": "Dummy change",
-          "body": "Test Plan: None at all\n\nDifferential Revision: D34753911\n\n",
-          "headRefName": "export-D34753911",
-          "headRepository": {
-            "nameWithOwner": "malfet/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "4746da707a9912356f5179625da89616b228dc21"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MQ",
-              "hasNextPage": false
-            },
-            "totalCount": 1
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-vulkan-bionic-py3.7-clang9"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280134"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280134/jobs/2794078044"
-                              },
-                              {
-                                "name": "test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280134/jobs/2794189060"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRQMQ=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QM="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280135"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280135/jobs/2794078023"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aM=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QU="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-bionic-rocm4.5-py3.7"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280132"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794078060"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292071"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292205"
-                              },
-                              {
-                                "name": "test (distributed, 1, 1, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292306"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbTiXw=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QY="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "win-vs2019-cuda11.3-py3"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280139"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794078053"
-                              },
-                              {
-                                "name": "test (force_on_cpu, 1, 1, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794536907"
-                              },
-                              {
-                                "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794536998"
-                              },
-                              {
-                                "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794537089"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbY_vU=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qc="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280136"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280136/jobs/2794078031"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2ao=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qk="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-docs"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280138"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794078055"
-                              },
-                              {
-                                "name": "build-docs (cpp)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794183768"
-                              },
-                              {
-                                "name": "build-docs (python)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794183828"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRIt0=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qo="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc7"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280140"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794078017"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181109"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181305"
-                              },
-                              {
-                                "name": "test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181488"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRFm4=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qs="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-build"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280143"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280143/jobs/2794078025"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aw=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Q4="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280145"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "shellcheck",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078028"
-                              },
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078196"
-                              },
-                              {
-                                "name": "clang-tidy",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078407"
-                              },
-                              {
-                                "name": "clang-format",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078610"
-                              },
-                              {
-                                "name": "cmakelint",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078760"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078898"
-                              },
-                              {
-                                "name": "py2-setup-validate-errormsg",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078999"
-                              },
-                              {
-                                "name": "flake8-py3",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794079087"
-                              },
-                              {
-                                "name": "mypy",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794079199"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO4Es=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Q8="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280146"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280146/jobs/2794078040"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2b0=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RA="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  },
-                  "status": {
-                    "contexts": [
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040614?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040643?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040615?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      }
-                    ]
-                  },
-                  "pushedDate": "2022-03-09T15:57:16Z",
-                  "oid": "4746da707a9912356f5179625da89616b228dc21"
-                }
-              }
-            ]
-          },
-          "changedFiles": 1,
-          "files": {
-            "nodes": [
-              {
-                "path": "tools/build_variables.bzl"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MQ",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [],
-            "pageInfo": {
-              "startCursor": null,
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/malfet/pytorch/blob/4746da707a9912356f5179625da89616b228dc21/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\nAdd ciflow labels to this PR to trigger more builds:\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nlinux-binary-libtorch-cxx11-abi\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-binary-libtorch-pre-cxx11\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-binary-manywheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-rocm4.5-py3.7\nciflow/all, ciflow/default, ciflow/linux, ciflow/rocm, ciflow/trunk\n\u2705 triggered\n\n\nlinux-docs\nciflow/all, ciflow/cpu, ciflow/default, ciflow/docs, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-vulkan-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk, ciflow/vulkan\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-build\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-custom-build-static\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-onnx\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/onnx, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build\nciflow/all, ciflow/cpu, ciflow/default, ciflow/libtorch, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7-no-ops\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nmacos-arm64-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-arm64-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-cxx11-abi\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-pre-cxx11\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwindows-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nwindows-binary-libtorch-debug\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-libtorch-release\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-wheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\ncaffe2-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\ndocker-builds\nciflow/all, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-custom-ops\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-metal\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda10.2-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-docs-push\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-no-ops\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-arm64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-lite-interpreter-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-11-py3-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled, ciflow/slow, ciflow/slow-gradcheck\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.3-py3.7-gcc7-debug\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.5-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-build\nciflow/all, ciflow/android, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\npytorch-xla-linux-bionic-py3.7-clang8\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk, ciflow/xla\n\ud83d\udeab skipped",
-                "createdAt": "2022-03-09T15:57:11Z",
-                "author": {
-                  "login": "pytorch-bot"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1063079053
-              },
-              {
-                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/73969\n\ud83d\udcc4 \u00a0Preview docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 4746da7 (more details on the Dr. CI page):\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
-                "createdAt": "2022-03-09T15:57:12Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": {
-                  "login": "facebook-github-bot"
-                },
-                "databaseId": 1063079113
-              },
-              {
-                "bodyText": "This pull request was exported from Phabricator. Differential Revision: D34753911",
-                "createdAt": "2022-03-09T15:57:34Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1063079731
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOP11MjQ==",
-              "hasPreviousPage": false
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "fb-exported"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              }
-            ]
-          },
-          "headRef": null
-        }
-      }
-    }
-  },
-  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAU2F-RA= name=pytorch number=73969 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "4746da707a9912356f5179625da89616b228dc21",
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280141"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280141/jobs/2794078056"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2c8=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RE="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Test tools"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280142"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280142/jobs/2794078033"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2as=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RI="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-cuda11.3-py3.7-gcc7"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280144"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280144/jobs/2794078046"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280144/jobs/2794338293"
-                              },
-                              {
-                                "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280144/jobs/2794338408"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280144/jobs/2794338568"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbUkMA=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RQ="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc7-no-ops"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280148"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280148/jobs/2794078065"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2d8=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RU="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "win-vs2019-cpu-py3"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280149"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280149/jobs/2794078067"
-                              },
-                              {
-                                "name": "test (default, 2, 2, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280149/jobs/2794407041"
-                              },
-                              {
-                                "name": "test (default, 1, 2, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280149/jobs/2794407168"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbWDX8=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RY="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280150"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280150/jobs/2794078029"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aQ=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Rc="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-clang7-asan"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280151"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280151/jobs/2794078062"
-                              },
-                              {
-                                "name": "test (default, 3, 3, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280151/jobs/2794225603"
-                              },
-                              {
-                                "name": "test (default, 1, 3, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280151/jobs/2794225793"
-                              },
-                              {
-                                "name": "test (default, 2, 3, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280151/jobs/2794226005"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbSD-k=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Rk="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              },
-                              {
-                                "name": "Meta Internal-Only Changes Check",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://opensource.facebook.com/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO574=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Ro="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pytorch-xla-linux-bionic-py3.7-clang8"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280152"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280152/jobs/2794078032"
-                              },
-                              {
-                                "name": "test (xla, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280152/jobs/2794227475"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbSGAM=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Rs="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc5.4"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280160"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794078054"
-                              },
-                              {
-                                "name": "test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794203297"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794203553"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794203717"
-                              },
-                              {
-                                "name": "test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794203878"
-                              },
-                              {
-                                "name": "test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794203982"
-                              },
-                              {
-                                "name": "test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794204149"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRlJs=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-SU="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAU2F-SU= name=pytorch number=73969 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "4746da707a9912356f5179625da89616b228dc21",
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-bionic-py3.7-clang9"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280162"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280162/jobs/2794078019"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280162/jobs/2794187280"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280162/jobs/2794187423"
-                              },
-                              {
-                                "name": "test (noarch, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280162/jobs/2794187582"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRN_c=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Sk="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-clang7-onnx"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280164"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280164/jobs/2794078039"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280164/jobs/2794213425"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280164/jobs/2794213615"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRySo=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-TY="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-custom-build-static"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280168"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280168/jobs/2794078064"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2d0=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-UI="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": false
-                    }
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=73811 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "seemethere"
-          },
-          "title": "ci: Migrate metrics credentials to managed IAM",
-          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* __->__ #73811\n\r\nMigrates our credentials to upload metrics statistics to managed IAM\r\ncredentials in order to make it easier to know where the credentials are\r\ncoming from and to make it easier to add more permissions / less\r\npermissions later on.\r\n\r\nRelates to work done in [D34535827](https://www.internalfb.com/diff/D34535827)\r\n\r\nSigned-off-by: Eli Uriegas <eliuriegas@fb.com>",
-          "headRefName": "gh/seemethere/215/head",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "gh/seemethere/215/base",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "seemethere"
-                    },
-                    "email": "eliuriegas@fb.com",
-                    "name": "Eli Uriegas"
-                  },
-                  "oid": "13c44d16a876a56bca479b4cf30715d21fa16e99"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "seemethere"
-                    },
-                    "email": "eliuriegas@fb.com",
-                    "name": "Eli Uriegas"
-                  },
-                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "Mg",
-              "hasNextPage": false
-            },
-            "totalCount": 2
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOaHA=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcBs="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602960"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcPo="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "win-vs2019-cpu-py3"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602961"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcPw="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-build"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602963"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcP4="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602964"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcP8="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602965"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQE="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602967"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQI="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc7-no-ops"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602966"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602966/jobs/2839950629"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObRM=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQM="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Test tools"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602968"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQQ="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-clang7-asan"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602970"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQU="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  },
-                  "status": {
-                    "contexts": [
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17044969?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17045014?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17044975?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      }
-                    ]
-                  },
-                  "pushedDate": "2022-03-14T23:01:55Z",
-                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7"
-                }
-              }
-            ]
-          },
-          "changedFiles": 3,
-          "files": {
-            "nodes": [
-              {
-                "path": ".github/templates/common.yml.j2"
-              },
-              {
-                "path": ".github/workflows/generated-macos-11-py3-x86-64.yml"
-              },
-              {
-                "path": ".github/workflows/update_pytorch_labels.yml"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "Mw",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
-              {
-                "author": {
-                  "login": "kit1980"
-                },
-                "state": "APPROVED"
-              },
-              {
-                "author": {
-                  "login": "janeyx99"
-                },
-                "state": "APPROVED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMy0wNFQxNDoyNDo0OC0wODowMLkyMDIyLTAzLTA0VDE0OjI0OjQ4LTA4OjAwzjWwwqA=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1988337976",
-                "createdAt": "2022-03-15T17:43:28Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1068270969
-              },
-              {
-                "bodyText": "@pytorchbot force merge this",
-                "createdAt": "2022-03-15T20:26:36Z",
-                "author": {
-                  "login": "seemethere"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1068436128
-              },
-              {
-                "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1989076952",
-                "createdAt": "2022-03-15T20:27:47Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1068437098
-              },
-              {
-                "bodyText": "@pytorchbot merge this",
-                "createdAt": "2022-03-15T21:18:55Z",
-                "author": {
-                  "login": "seemethere"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1068482921
-              },
-              {
-                "bodyText": "Hey @seemethere.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-03-15T21:20:40Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1068484404
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOP6yFeQ==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              }
-            ]
-          },
-          "headRef": null
-        }
-      }
-    }
-  },
-  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcQU= name=pytorch number=73811 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7",
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602969"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQY="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-bionic-py3.7-clang9"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602971"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQc="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-clang7-onnx"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602972"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQg="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc7"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602973"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602973/jobs/2839950664"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602973/jobs/2840019714"
-                              },
-                              {
-                                "name": "test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602973/jobs/2840019747"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602973/jobs/2840019794"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqP89A=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQk="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-custom-build-static"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602974"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQo="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602977"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602977/jobs/2839950658"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObTk=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQs="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-docs"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602976"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQ4="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "win-vs2019-cuda11.3-py3"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602978"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQ8="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-cuda11.3-py3.7-gcc7"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602979"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602979/jobs/2839950630"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602979/jobs/2840213785"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602979/jobs/2840213832"
-                              },
-                              {
-                                "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602979/jobs/2840213866"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqUJII=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRA="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pytorch-xla-linux-bionic-py3.7-clang8"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602981"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRI="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcRI= name=pytorch number=73811 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7",
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc5.4"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602982"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRM="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-vulkan-bionic-py3.7-clang9"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602983"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRU="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-bionic-py3.7-clang9"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602984"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602984/jobs/2839950624"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602984/jobs/2840021854"
-                              },
-                              {
-                                "name": "test (noarch, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602984/jobs/2840021946"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602984/jobs/2840021988"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqP_28=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRc="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc7-no-ops"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602985"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRo="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-clang7-onnx"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602988"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602988/jobs/2839950656"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602988/jobs/2840031185"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602988/jobs/2840031288"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQMyA=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRs="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-clang7-asan"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602989"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602989/jobs/2839950625"
-                              },
-                              {
-                                "name": "test (default, 3, 3, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602989/jobs/2840042498"
-                              },
-                              {
-                                "name": "test (default, 1, 3, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602989/jobs/2840042534"
-                              },
-                              {
-                                "name": "test (default, 2, 3, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602989/jobs/2840042646"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQcpA=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRw="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602990"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "cmakelint",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839950650"
-                              },
-                              {
-                                "name": "clang-format",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839950743"
-                              },
-                              {
-                                "name": "clang-tidy",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839950808"
-                              },
-                              {
-                                "name": "flake8-py3",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839950884"
-                              },
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839950992"
-                              },
-                              {
-                                "name": "mypy",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839951037"
-                              },
-                              {
-                                "name": "py2-setup-validate-errormsg",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839951085"
-                              },
-                              {
-                                "name": "shellcheck",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839951170"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839951266"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcU4=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcR4="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602993"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602993/jobs/2839950562"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObKc=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcR8="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-cuda11.3-py3.7-gcc7"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602992"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSE="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602991"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSI="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcSI= name=pytorch number=73811 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7",
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pytorch-xla-linux-bionic-py3.7-clang8"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602994"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602994/jobs/2839950655"
-                              },
-                              {
-                                "name": "test (xla, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602994/jobs/2840047401"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQjCM=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSM="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "win-vs2019-cuda11.3-py3"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602996"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602996/jobs/2839950632"
-                              },
-                              {
-                                "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602996/jobs/2840239369"
-                              },
-                              {
-                                "name": "test (force_on_cpu, 1, 1, windows.4xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602996/jobs/2840239408"
-                              },
-                              {
-                                "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602996/jobs/2840239445"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqUs2w=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSQ="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602998"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602998/jobs/2839950621"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObQs=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSU="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602997"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602997/jobs/2839950665"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObUI=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSY="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603001"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603001/jobs/2839950648"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObSk=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSc="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-vulkan-bionic-py3.7-clang9"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603002"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603002/jobs/2839950741"
-                              },
-                              {
-                                "name": "test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603002/jobs/2840029810"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQKq4=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSg="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-docs"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603000"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603000/jobs/2839950661"
-                              },
-                              {
-                                "name": "build-docs (cpp)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603000/jobs/2840023513"
-                              },
-                              {
-                                "name": "build-docs (python)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603000/jobs/2840023552"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQCGQ=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSk="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-bionic-rocm4.5-py3.7"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603003"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603003/jobs/2839950637"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603003/jobs/2840068586"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.rocm.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603003/jobs/2840068671"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqRADE=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSw="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-build"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603004"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603004/jobs/2839950560"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObKU=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcS0="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "win-vs2019-cpu-py3"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603005"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603005/jobs/2839950626"
-                              },
-                              {
-                                "name": "test (default, 2, 2, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603005/jobs/2840145642"
-                              },
-                              {
-                                "name": "test (default, 1, 2, windows.4xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603005/jobs/2840145755"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqSq34=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcS8="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcS8= name=pytorch number=73811 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7",
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc5.4"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603007"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2839950666"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2840025927"
-                              },
-                              {
-                                "name": "test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2840025995"
-                              },
-                              {
-                                "name": "test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2840026086"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2840026134"
-                              },
-                              {
-                                "name": "test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2840026235"
-                              },
-                              {
-                                "name": "test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2840026282"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQFvU=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcTE="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc7"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603009"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcTU="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-bionic-rocm4.5-py3.7"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603010"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcTg="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Test tools"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603012"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603012/jobs/2839950623"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObQ4=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcT0="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-custom-build-static"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603013"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603013/jobs/2839950631"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObRg=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcT8="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "macos-10-15-py3-arm64"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603251"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603251/jobs/2839951040"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcA8=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCc_k="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "ios-12-5-1-arm64-coreml"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603253"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603253/jobs/2839951038"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcA4=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCc_w="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "ios-12-5-1-arm64"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603254"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603254/jobs/2839951030"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcAc=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCc_0="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "macos-11-py3-x86-64"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603255"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603255/jobs/2839951034"
-                              },
-                              {
-                                "name": "test (default, 1, 2, macos-11)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603255/jobs/2840127016"
-                              },
-                              {
-                                "name": "test (default, 2, 2, macos-11)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603255/jobs/2840127073"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqSQ2M=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCc_4="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "ios-12-5-1-arm64-custom-ops"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603256"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603256/jobs/2839951041"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcBE=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdAA="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCdAA= name=pytorch number=73811 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7",
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "ios-12-5-1-x86-64-coreml"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603259"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603259/jobs/2839951039"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcA0=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdAM="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "ios-12-5-1-arm64-metal"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603261"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603261/jobs/2839951042"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcBA=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdAU="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "macos-10-15-py3-lite-interpreter-x86-64"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603264"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603264/jobs/2839951036"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcAs=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdAk="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "ios-12-5-1-x86-64"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603269"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603269/jobs/2839951029"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcAQ=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdBE="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Netlify",
-                            "databaseId": 13473
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCddM="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Azure Pipelines",
-                            "databaseId": 9426
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCddw="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Dependabot",
-                            "databaseId": 29110
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdeI="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Codecov",
-                            "databaseId": 254
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdeY="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "PyTorch Bot",
-                            "databaseId": 40112
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdes="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": false
-                    }
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=31093 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": true,
-          "author": {
-            "login": "mingxiaoh"
-          },
-          "title": "improve mkldnn convolution test coverage",
-          "body": "This pr will improve the test coverage of mkldnn convolution.\r\n1.test input: specific sensitive numbers\r\n2.pass criteria: output of mkldnn convolution matches output of thnn convolution\r\n3.coverage: by using coverage tool, we found out the following sensitive parameters. Overall the case will test 4352 patterns, takes 8.8s on my machine.\r\n\r\nto run the test case:\r\n\r\npython test_mkldnn_conv2d_ext.py\r\nor\r\npython run_test.py -i mkldnn_conv2d_ext\r\n\r\nIn case of failure, the pattern will be printed in the log for further debugging.\r\n\r\nactually, this PR is created to replace and improve that PR we created before(https://github.com/pytorch/pytorch/pull/25085) ",
-          "headRefName": "master",
-          "headRepository": {
-            "nameWithOwner": "mingxiaoh/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "11pikachu"
-                    },
-                    "email": "junx.du@intel.com",
-                    "name": "dujun"
-                  },
-                  "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MQ",
-              "hasNextPage": false
-            },
-            "totalCount": 1
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [],
-                    "pageInfo": {
-                      "hasNextPage": false
-                    }
-                  },
-                  "status": {
-                    "contexts": [
-                      {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406538?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406947?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406544?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406931?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_debug_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406550?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_debug_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406887?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_release_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406526?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_release_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406707?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: caffe2_onnx_main_py3_6_clang7_ubuntu16_04_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406533?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: caffe2_onnx_main_py3_6_clang7_ubuntu16_04_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407256?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: caffe2_onnx_ort1_py3_6_clang7_ubuntu16_04_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407254?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: caffe2_onnx_ort2_py3_6_clang7_ubuntu16_04_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407255?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406556?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda10.2-cudnn7-py3.8-gcc9",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406532?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda11.0-cudnn8-py3.6-gcc9",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406527?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda11.0-cudnn8-py3.8-gcc9",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406553?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-py3.6-clang9",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406537?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-py3.8-gcc9",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406529?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-rocm3.5.1-py3.6",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406554?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-rocm3.7-py3.6",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406545?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406543?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406536?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406552?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406535?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406540?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406528?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406541?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-asan",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406549?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-clang7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406555?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc4.8",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406546?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc5.4",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406531?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406534?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc7.2",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406523?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.8",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406539?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-rocm3.3-py3.6",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406547?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-rocm3.5.1-py3.6",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406551?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407209?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406611?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_bazel_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406607?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_bazel_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406984?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_cpp_doc_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407013?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_doc_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407011?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_ios_11_2_1_x86_64_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406548?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406563?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408680?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_backward_compatibility_check_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407014?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_bionic_py3_6_clang9_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406567?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_bionic_py3_6_clang9_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406945?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_bionic_py3_8_gcc9_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406561?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_bionic_py3_8_gcc9_coverage_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407422?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_bionic_rocm3_7_py3_6_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406562?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406612?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408107?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_legacy_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408111?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_profiling_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408101?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc5_4_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406613?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406565?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_legacy_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407017?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_profiling_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407019?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407012?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407016?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_vulkan_x86_32_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406608?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406609?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406606?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_test1",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407435?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_test2",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407436?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_mobile_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406605?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_mobile_custom_build_dynamic",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406610?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_macos_10_13_py3_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406525?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_macos_10_13_py3_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407415?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_python_doc_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407018?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_vulkan_linux_bionic_py3_6_clang9_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406566?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_vulkan_linux_bionic_py3_6_clang9_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406946?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cpu_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406542?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406530?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_test1",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407028?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_test2",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407027?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda11.0_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406524?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_xla_linux_bionic_py3_6_clang9_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406572?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_xla_linux_bionic_py3_6_clang9_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407253?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "codecov/patch",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://codecov.io/gh/pytorch/pytorch/compare/69f6d94caa3559d4f50745c26af5df041b83fee8...29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
-                      },
-                      {
-                        "context": "codecov/project",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://codecov.io/gh/pytorch/pytorch/compare/69f6d94caa3559d4f50745c26af5df041b83fee8...29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
-                      },
-                      {
-                        "context": "pr/caffe2-pytorch-linux-bionic-rocm3.7-py3.6-test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://ci.pytorch.org/jenkins/job/caffe2-builds/job/pytorch-linux-bionic-rocm3.7-py3.6-trigger-test/2319/"
-                      },
-                      {
-                        "context": "pr/pytorch-linux-bionic-rocm3.7-py3.6",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://ci.pytorch.org/jenkins/job/pytorch-builds/job/pytorch-linux-bionic-rocm3.7-py3.6-trigger/2325/"
-                      }
-                    ]
-                  },
-                  "pushedDate": "2020-09-11T01:58:24Z",
-                  "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
-                }
-              }
-            ]
-          },
-          "changedFiles": 5,
-          "files": {
-            "nodes": [
-              {
-                "path": "test/math_libraries/convolutions.py"
-              },
-              {
-                "path": "test/math_libraries/convolutions_cases/shapes_googlenet_v3.json"
-              },
-              {
-                "path": "test/math_libraries/convolutions_cases/shapes_maskrcnn_p1.json"
-              },
-              {
-                "path": "test/math_libraries/convolutions_cases/shapes_mobilenet.json"
-              },
-              {
-                "path": "test/math_libraries/convolutions_cases/shapes_resnet_50.json"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "NQ",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "CHANGES_REQUESTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "CHANGES_REQUESTED"
-              },
-              {
-                "author": {
-                  "login": "ailzhang"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ngimel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "VitalyFedyunin"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ngimel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "VitalyFedyunin"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "VitalyFedyunin"
-                },
-                "state": "APPROVED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAxOS0xMi0zMFQxMDoxOToxMS0wODowMLkyMDE5LTEyLTMwVDEwOjE5OjExLTA4OjAwzhQZLuY=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "I cloned your repo and ran the tests:\n~/pytorch/test/math_libraries$ python convolutions.py\nFFFF\n======================================================================\nFAIL: test_conv2d_ext_cpu_float32 (__main__.TestConvExtCPU)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n----------------------------------------------------------------------\nRan 4 tests in 33.838s\n\nFAILED (failures=4)\n\nStill fails.\n\n@mruberry  It is suggested by @VitalyFedyunin that, we need to display fail test to avoid invalid inputs, I guess we should set it as expected failures under the pytest test framework, right? we will change it as expected failure cases under pytest test framework. The result will looks like be low, is it ok?\n2500 passed, 136 skipped, 0 failed, 0 errors, 2 expected failures, 0 unexpected passes",
-                "createdAt": "2020-08-14T01:36:20Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": {
-                  "login": "mingxiaoh"
-                },
-                "databaseId": 673816925
-              },
-              {
-                "bodyText": "Displaying tests that fail is fine, but I don't think @VitalyFedyunin meant that it was OK if the tests didn't pass. If these are expected failures then yes, you can use with self.assertRaises(RuntimeError):... when testing them. If you also want to report that the test has test cases with these properties you can print or warn, which will appear in the test output.",
-                "createdAt": "2020-08-14T03:09:37Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 673858224
-              },
-              {
-                "bodyText": "Codecov Report\n\nMerging #31093 into master will not change coverage.\nThe diff coverage is n/a.\n\n\n@@           Coverage Diff           @@\n##           master   #31093   +/-   ##\n=======================================\n  Coverage   68.00%   68.00%           \n=======================================\n  Files         382      382           \n  Lines       49527    49527           \n=======================================\n  Hits        33679    33679           \n  Misses      15848    15848           \n\nContinue to review full report at Codecov.\n\nLegend - Click here to learn more\n\u0394 = absolute <relative> (impact), \u00f8 = not affected, ? = missing data\nPowered by Codecov. Last update 69f6d94...29f6aa6. Read the comment docs.",
-                "createdAt": "2020-09-04T05:41:01Z",
-                "author": {
-                  "login": "codecov"
-                },
-                "authorAssociation": "NONE",
-                "editor": {
-                  "login": "codecov"
-                },
-                "databaseId": 686921371
-              },
-              {
-                "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale.  Feel free to remove the Stale label if you feel this was a mistake.  If you are unable to remove the Stale label please contact a maintainer in order to do so.  Stale pull requests will automatically be closed 30 days after being marked Stale",
-                "createdAt": "2022-04-12T02:35:37Z",
-                "author": {
-                  "login": "pytorchbot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1095860944
-              },
-              {
-                "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale. Feel free to remove the Stale label if you feel this was a mistake. If you are unable to remove the Stale label please contact a maintainer in order to do so. If you want the bot to never mark this PR stale again, add the no-stale label.Stale pull requests will automatically be closed after 30 days of inactivity.",
-                "createdAt": "2022-06-11T04:40:16Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1152854802
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOKCmhXQ==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "triaged"
-                }
-              },
-              {
-                "node": {
-                  "name": "open source"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              },
-              {
-                "node": {
-                  "name": "Stale"
-                }
-              }
-            ]
-          },
-          "headRef": {
-            "compare": {
-              "commits": {
-                "edges": []
-              }
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOKCmhXQ== name=pytorch number=31093 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "Hi, @mingfeima  @soumith  @Jianhui-Li\nthis will improve the test coverage of mkldnn convolution, would you please review it?\nThe current code is forward only, do we need to cover backward, if yes, we can add backward.",
-                "createdAt": "2019-12-12T01:19:02Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 564806270
-              },
-              {
-                "bodyText": "@mingxiaoh, what is the value in testing DNNL as part of Pytorch validation for the Pytorch developers? Shouldn't having these tests run in DNNL validation be enough?",
-                "createdAt": "2019-12-12T01:28:32Z",
-                "author": {
-                  "login": "vpirogov"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 564808528
-              },
-              {
-                "bodyText": "@vpirogov  The main value is to serve as a blind test to DNNL. If DNNL adds these test to DNNL test sets, it lost the value as a blind test.  The spirit of validation is to cross check.\n@gottbrath @gchanan  The test was developed per the request of Pytorch team. Mingxiao made an effort to reduce the execution time to a few second but still with good coverage.  Although the test today is focused on DNNL, it could be easily extended to be blind test for any conv implementation used in Pytorch.",
-                "createdAt": "2019-12-20T07:44:30Z",
-                "author": {
-                  "login": "Jianhui-Li"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 567826907
-              },
-              {
-                "bodyText": "@mruberry thanks for the comment. As for the chainer dependency, we import it is because we would like to use its testing function for pytest test cases combinations, other wise we need to write much more code to achieve same effect. So, can we use it?",
-                "createdAt": "2020-01-15T09:04:34Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 574563012
-              },
-              {
-                "bodyText": "@mingxiaoh You cannot import chainer. Looking at the code you should be able to achieve the same effect without it.",
-                "createdAt": "2020-01-16T17:59:46Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 575272358
-              },
-              {
-                "bodyText": "@mruberry ok, we will change it according to your requirement. Thanks",
-                "createdAt": "2020-02-10T00:59:34Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 583917522
-              },
-              {
-                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/31093\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 29f6aa6 (more details on the Dr. CI page):\n\nCommit 29f6aa6 was recently pushed. Waiting for builds...\n\nThis comment was automatically generated by Dr. CI (expand for details).Follow this link to opt-out of these comments for your Pull Requests.\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
-                "createdAt": "2020-05-14T08:04:30Z",
-                "author": {
-                  "login": "dr-ci"
-                },
-                "authorAssociation": "NONE",
-                "editor": {
-                  "login": "facebook-github-bot"
-                },
-                "databaseId": 628466876
-              },
-              {
-                "bodyText": "@mruberry how about those cudnn UT error? we add check for it but it should be NV to fix cudnn bugs.",
-                "createdAt": "2020-05-18T05:34:11Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 629955767
-              },
-              {
-                "bodyText": "Hey @mingxiaoh! You're right, of course, that you shouldn't have to fix cuDNN bugs. Would you please:\n\nAssert that the test case fails, so we know it's failing and if someone fixes it they'll know what test to update.\nFile a new issue explaining the behavior and providing a short PyTorch program to reproduce the issue.\n\nThen we can ping NVIDIA on that issue.",
-                "createdAt": "2020-05-18T07:27:08Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 629997129
-              },
-              {
-                "bodyText": "about the suggestion 'Assert that the test case fails, so we know it's failing and if someone fixes it they'll know what test to update. ',  if we only assert it and continue the following test, I guess users might always ignore them in later test. Anyway, any similar example case for reference?",
-                "createdAt": "2020-05-18T07:55:08Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 630010734
-              },
-              {
-                "bodyText": "In this recent PR https://github.com/pytorch/pytorch/pull/38505/files, for example, you can see that the construction of bool tensors wasn't working properly, so the test author cited the relevant issue and asserted that the incorrect behavior happened, as expected. You can also see how these lines are being removed by https://github.com/pytorch/pytorch/pull/38392/files, which fixes the issue.\nAnother common pattern is to use with self.assertRaises(RuntimeError/AssertionError/etc.):.",
-                "createdAt": "2020-05-18T08:02:13Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 630014823
-              },
-              {
-                "bodyText": "@mruberry the failed UT case is not introduced by our modification, how to handle this issue?",
-                "createdAt": "2020-05-20T01:59:13Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 631187735
-              },
-              {
-                "bodyText": "@mingxiaoh You mean the failures on ROCm? You may ignore them. Be sure to re-request review when you're ready.",
-                "createdAt": "2020-05-20T02:12:58Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 631191425
-              },
-              {
-                "bodyText": "@mruberry  we already skipped those ROCm errors, but there are stil somel error caused by the original code, they are not introduced by our modification.",
-                "createdAt": "2020-05-21T05:18:07Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 631886529
-              },
-              {
-                "bodyText": "I understand. Let me know when you're ready for me to review.",
-                "createdAt": "2020-05-21T06:24:15Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 631908011
-              },
-              {
-                "bodyText": "@mruberry thanks, we are ready for review now.",
-                "createdAt": "2020-05-21T06:28:11Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 631909442
-              },
-              {
-                "bodyText": "@mingxiaoh Great! I'll take a look ASAP.",
-                "createdAt": "2020-05-21T06:31:10Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 631910556
-              },
-              {
-                "bodyText": "@mruberry we just pull the latest code and updated the patch according to your comment, may you please help double check it? BTW, the new failed case in preci is not introduced by our modification.",
-                "createdAt": "2020-05-25T07:44:58Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 633430458
-              },
-              {
-                "bodyText": "@ailzhang would you please check the comment below? Thanks.\nIs there a reason why this TestConv2dExt is a new class instead a test inside TestNN?\n//comment: it is actually suggested by Tongzhou Wang in another thread before.\nAlthough this test sits in generic testing framework, it's actually comparing thnn/mkldnn/cudnn results specially. I feel it's better to make it truly generic so that it compares any device result with CPU result. Alternatively you can mark this test only run when torch.backends.mkldnn.is_available()=True\n//comment: but our goal is to compare the result with that of thnn. Anyway, if you insist, we can start to compare it with cpu.",
-                "createdAt": "2020-05-27T05:11:08Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": {
-                  "login": "mingxiaoh"
-                },
-                "databaseId": 634432326
-              },
-              {
-                "bodyText": "Pruning reviewers. @ngimel, @VitalyFedyunin, this PR is looking pretty good from a test framework perspective. Would one of you like to review?",
-                "createdAt": "2020-05-27T09:58:42Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 634557563
-              },
-              {
-                "bodyText": "@mruberry  Thanks, would you please help review it again. BTW: failed case is not introduced by our modification.",
-                "createdAt": "2020-05-28T10:26:32Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 635256214
-              },
-              {
-                "bodyText": "@mruberry  we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code",
-                "createdAt": "2020-06-02T08:00:01Z",
-                "author": {
-                  "login": "1pikachu"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 637364148
-              },
-              {
-                "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code\n\n@ngimel will follow-up on the test itself sometime this week or early next week.",
-                "createdAt": "2020-06-02T10:23:47Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 637444457
-              },
-              {
-                "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code\n\n@ngimel will follow-up on the test itself sometime this week or early next week.\n\n@mruberry  thank you",
-                "createdAt": "2020-06-02T11:32:06Z",
-                "author": {
-                  "login": "1pikachu"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 637479226
-              },
-              {
-                "bodyText": "Improving test coverage of math libraries is certainly a good goal and this PR is moving towards it. I have some doubts about implementation decisions made, and about running this PR as part of regular pytorch CI.\nIf the primary goal of this PR is to test correctness of the convolution implementations in the vendor library, then it does not serve this purpose. The absolute majority of the 4000+ test cases come from group 1, where different kernel sizes/strides/dilations are used to produce the output of size 1x1. This can test whether pytorch correctly passes convolution parameters to the backends (although there are cheaper ways to do that), but as actual library correctness check it is almost useless - libraries use very different kernels depending in the input/output sizes, and tests with toy sizes like this don't invoke the real bread-and-butter kernels.\nAlso, if this test suite is meant as primary a means of testing vendor libraries (which is a good goal!) it does not have a place as a part of pytorch regular CI, and should be run when the corresponding vendor libraries are updated. I'd suggest moving this test out into a separate file (maybe even outside of torch/test directory) and have it as a part of library update/qualification process rather than regular CI.\nAlso, if the primary goal is to enable easier testing of vendor libraries correctness, perhaps we should rethink the mechanism of the generation of test cases. It should be easy to add a test case with a particular set of parameters that was found to be buggy. Also, running a cross-product of cases in a multi-dimensional space (as this PR does) is rarely an efficient way of getting a signal, some forms of random sampling usually provide a way to get better correctness signal why using less resources.\nAlso, when testing libraries it is important to test both forward and backward functions, whereas this PR does forward only. I'm openminded on whether convTransposed should be tested or not - if we are testing vendor libraries, then it's not necessary, convTransposed calls the same underlying functions, if we are testing pytorch, then it makes sense to test it separately because it takes different codepaths.",
-                "createdAt": "2020-06-02T21:56:33Z",
-                "author": {
-                  "login": "ngimel"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 637827507
-              },
-              {
-                "bodyText": "@mruberry ngimel is quite responsible, but it seems that she is not familiar with the background of this pull-request, since this pull-request is pending for so such a long time, each time we are almost done, then reviewer changes, each reviewer has different idea, it is good, but, would it be better if you help review it or ask the same reviewer to review it considering that you are more familiar with the background/change history? Thanks in advance.",
-                "createdAt": "2020-06-03T02:16:07Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 637912105
-              },
-              {
-                "bodyText": "@mruberry ngimel is quite responsible, but it seems that she is not familiar with the background of this pull-request, since this pull-request is pending for so such a long time, each time we are almost done, then reviewer changes, each reviewer has different idea, it is good, but, would it be better if you help review it or ask the same reviewer to review it considering that you are more familiar with the background/change history? Thanks in advance.\n\nWe know this PR has been open for awhile and we respect that your time is valuable, but we want to make sure we're making the right change here, and I think @ngimel's comments reflect that and should not be too difficult to address. As I understand, her points are:\n\nThis is a good PR with an exciting idea. To let it run longer and test more cases maybe it should run outside the regular PyTorch CI.\nTo remedy this, let's create a test/math_libraries folder and put this test there: test/math_libaries/convolutions.py. Yes, this is different from our requests in the past, which is our mistake, but it should be an easy change.\nTo make the test more interesting it'd be good for the test cases to resemble convolutions used in practice. The current test cases seem like similar \"toy\" examples. Without time pressure we should be able to run larger, more computationally intensive convolutions.\nLet's change the test cases to include some practical convolutions, make it easy to add test cases, and think about how we might generate other interesting cases. (We should also test backwards once we have more time!)\n\nAnd I think these are good points. Maybe the PR doesn't create a new way to generate interesting convolutions to start and instead only runs a few representative convolutions, but @ngimel is positioning the work for success so that it's useful and we can continue to improve on it in the future.\nDoes that make sense?",
-                "createdAt": "2020-06-03T03:04:55Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 637924703
-              },
-              {
-                "bodyText": "@mruberry we were required to finish the test in limited time long long before, at that time, jianhui discussed this issue with you, and you are all agreed with the current test scope and test case number and test time, so you meant you change your mind now? you are not care about the test time currently? Sorry, this issue is pending so long, we are struggling with it now and would like to finish it asap.  Given this, it would be be better if you raise all the requirement at a time,  considering that we have many tasks at hand, we are hoping so eagerly that we can finish this PR and use it for further test for bugs finding.",
-                "createdAt": "2020-06-03T05:22:43Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": {
-                  "login": "mingxiaoh"
-                },
-                "databaseId": 637960626
-              },
-              {
-                "bodyText": "@mruberry we were required to finish the test in limited time long long before, at that time, jianhui discussed this issue with you, and you are all agreed with the current test scope and test case number and test time, so you meant you change your mind now? you are not care about the test time currently? Sorry, this issue is pending so long, we are struggling with it now and would like to finish it asap. Given this, it would be be better if you raise all the requirement at a time, considering that we have many tasks at hand, we are hoping so eagerly that we can finish this PR and use it for further test for bugs finding.\n\nI'm sorry, I don't think I've talked to @Jianhui-Li before. It's true that the team we expressed a concern about timing if the test was to be run in the CI initially, but I think now that we understand what the test is trying to do better we're not sure the CI is the best place for it. The PR was also closed after a lengthy period of inactivity, and we assumed it had simply been abandoned.\nDo you know who @Jianhui-Li spoke with about this issue originally? Maybe I can follow-up with them for more context.",
-                "createdAt": "2020-06-03T05:42:28Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 637967153
-              },
-              {
-                "bodyText": "@mruberry  it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?",
-                "createdAt": "2020-06-03T06:13:14Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 637978356
-              },
-              {
-                "bodyText": "@mruberry it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?\n\nI think this will be easier to discuss at the regular Intel-FB meeting.",
-                "createdAt": "2020-06-03T20:34:05Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 638446723
-              },
-              {
-                "bodyText": "@mruberry it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?\n\nI think this will be easier to discuss at the regular Intel-FB meeting.\n\nLet me sync with Mingxiao and follow up with this. Thanks.",
-                "createdAt": "2020-06-03T20:44:44Z",
-                "author": {
-                  "login": "Jianhui-Li"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 638451670
-              },
-              {
-                "bodyText": "@mruberry would you please help review it again?",
-                "createdAt": "2020-07-02T14:09:23Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 653028208
-              },
-              {
-                "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?",
-                "createdAt": "2020-07-06T20:15:04Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 654443242
-              },
-              {
-                "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks",
-                "createdAt": "2020-07-09T11:04:06Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 656062287
-              },
-              {
-                "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks\n\n@mruberry  the code is ready for review now, would you please take time for it? Thanks.",
-                "createdAt": "2020-07-14T09:16:48Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 658071151
-              },
-              {
-                "bodyText": "super nit: renaming files to .json will make it more IDE friendly.",
-                "createdAt": "2020-07-14T23:38:37Z",
-                "author": {
-                  "login": "VitalyFedyunin"
-                },
-                "authorAssociation": "CONTRIBUTOR",
-                "editor": null,
-                "databaseId": 658464685
-              },
-              {
-                "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks\n\n@mruberry the code is ready for review now, would you please take time for it? Thanks.\n\nCool! I took a look with @ngimel, once these issues are addressed I think we're good to go!",
-                "createdAt": "2020-07-16T05:17:29Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 659164401
-              },
-              {
-                "bodyText": "@ngimel  & @VitalyFedyunin We have changed the code according to your suggestions, would you please review it again? Thanks.",
-                "createdAt": "2020-07-20T08:30:01Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 660884305
-              },
-              {
-                "bodyText": "@ngimel & @VitalyFedyunin We have changed the code according to your suggestions, would you please review it again? Thanks.\n\nUpdated: one more question about tolerances, one code cleanup recommendation, and one task leftover from the last review.",
-                "createdAt": "2020-07-22T20:26:42Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 662678464
-              },
-              {
-                "bodyText": "Updated: one more question about tolerances, one code cleanup recommendation, and one task leftover from the last review.\n@mruberry we have finished the modification according to your comment, would you please review it again? Thanks.",
-                "createdAt": "2020-07-23T10:24:26Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 662930687
-              },
-              {
-                "bodyText": "The code looks good, but I tried running the test suite and hit the following failures:\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n    result = test(self, device_arg, dtype)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 102, in test_conv2d_ext\n    msg=msg\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 1085, in assertEqual\n    self.assertTrue(result, msg=msg)\nAssertionError: False is not true : device:cuda:0, dtype:torch.float16, group:1, batchsize:22input channel:448, output channel:384, bias:False, padding:[1, 1], dilation:[1, 1], stride:[1, 1], kernel:[3, 3]\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n    result = test(self, device_arg, dtype)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 102, in test_conv2d_ext\n    msg=msg\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 1085, in assertEqual\n    self.assertTrue(result, msg=msg)\nAssertionError: False is not true : device:cuda:0, dtype:torch.float32, group:1, batchsize:22input channel:80, output channel:192, bias:False, padding:[0, 0], dilation:[1, 1], stride:[1, 1], kernel:[3, 3]\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n    result = test(self, device_arg, dtype)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 106, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\nLooking at the first invalid convolution, for example, it's:\n    {\n        \"case_name\":\"masknet_p1:conv33\",\n        \"mb\":1,\n        \"g\":1,\n        \"ic\":512,\n        \"ih\":64,\n        \"iw\":64,\n        \"oc\":12,\n        \"kh\":1,\n        \"kw\":1,\n        \"sh\":1,\n        \"sw\":1,\n        \"ph\":0,\n        \"pw\":0,\n        \"dh\":0,\n        \"dw\":0,\n        \"bias\":\"False\"\n    },\n\nwhich has a dh and dw of zero, causing it to be added to invalid cases here:\ndh, dw = case['dh'], case['dw']\n            has_bias = case['bias']\n            if dh == 0 or dw == 0:\n                invalid_cases.append(case_name)",
-                "createdAt": "2020-07-23T21:25:19Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": {
-                  "login": "mruberry"
-                },
-                "databaseId": 663240268
-              },
-              {
-                "bodyText": "@mruberry the failure was not detected is because we did not export the cudnn path. Yes, you are right, we need to a large atol of 1e-2 . Would you please help review it again? Thanks.",
-                "createdAt": "2020-07-27T12:43:44Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 664373079
-              },
-              {
-                "bodyText": "@mruberry the failure was not detected is because we did not export the cudnn path. Yes, you are right, we need to a large atol of 1e-2 . Would you please help review it again? Thanks.\n\nBefore I run these tests again, is an atol of 1e-2 needed for all types or just half? Also, how does 1e-2 compare to the values that are being compared?",
-                "createdAt": "2020-07-27T18:39:27Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 664569507
-              },
-              {
-                "bodyText": "@mruberry 1e-2 is experimental result, details see below, random means it might be failed sometimes.\n\n\n\natol,rtol\n1e-2,1e-2\n1e-2,1e-3\n1e-3,1e-2\n1e-3,1e-3\n1e-4,1e-3\n1e-3,1e-4\n1e-4,1e-4\n1e-4,1e-5\n1e-5,1e-4\n\n\n\n\nCuda float16\npass\npass\npass\npass\npass\nfail\nFail\nFail\nfail\n\n\nCuda float32\npass\nrandom\nrandom\nrandom\nrandom\nrandom\nrandom\nrandom\nfail",
-                "createdAt": "2020-07-31T03:33:27Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 666894774
-              },
-              {
-                "bodyText": "@mruberry  would you please find time to review it again? Thanks.",
-                "createdAt": "2020-08-04T05:01:20Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 668380451
-              },
-              {
-                "bodyText": "@mruberry would you please find time to review it again? Thanks.\n\nI was just about to try and run this again locally but it looks like the files describing the convolutions are missing?",
-                "createdAt": "2020-08-07T03:49:44Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 670306210
-              },
-              {
-                "bodyText": "@mruberry sorry but what is missing actually?",
-                "createdAt": "2020-08-07T05:00:20Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 670322557
-              },
-              {
-                "bodyText": "@mruberry sorry but what is missing actually?\n\nThe JSON files.",
-                "createdAt": "2020-08-07T16:06:41Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 670591170
-              },
-              {
-                "bodyText": "@mruberry sorry but what is missing actually?\n\nThe JSON files.\n\n@mruberry sorry, we add them now, would you please check it again? Thanks.",
-                "createdAt": "2020-08-13T10:40:11Z",
-                "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 673402901
-              },
-              {
-                "bodyText": "I cloned your repo and ran the tests:\n~/pytorch/test/math_libraries$ python convolutions.py\nFFFF\n======================================================================\nFAIL: test_conv2d_ext_cpu_float32 (__main__.TestConvExtCPU)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n----------------------------------------------------------------------\nRan 4 tests in 33.838s\n\nFAILED (failures=4)\n\nStill fails.",
-                "createdAt": "2020-08-13T23:35:00Z",
-                "author": {
-                  "login": "mruberry"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 673760580
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOIapCfg==",
-              "hasPreviousPage": false
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=68111 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": true,
-          "author": {
-            "login": "chunyuan-w"
-          },
-          "title": "Add JIT graph fuser for oneDNN Graph API (Preview4)",
-          "body": "## Description\r\nPreview4 PR of this [RFC](https://github.com/pytorch/pytorch/issues/49444).\r\n\r\nOn the basis of https://github.com/pytorch/pytorch/pull/50256, the below improvements are included:\r\n\r\n- The [preview4 release branch](https://github.com/oneapi-src/oneDNN/releases/tag/graph-v0.4.1) of the oneDNN Graph API is used\r\n- The fuser now works with the profiling graph executor. We have inserted type check nodes to guard the profiled tensor properties.\r\n\r\n### User API:\r\nThe optimization pass is disabled by default. Users could enable it by:\r\n```\r\ntorch.jit.enable_onednn_fusion(True)\r\n```\r\n\r\n### Performance:\r\n[pytorch/benchmark](https://github.com/pytorch/benchmark) tool is used to compare the performance:\r\n- SkyLake 8180 (1 socket of 28 cores):\r\n\r\n  ![image](https://user-images.githubusercontent.com/65992142/151162305-05e44425-a24e-4d5e-94e1-743b40b87a8c.png)\r\n\r\n- SkyLake 8180 (single thread):\r\n\r\n  ![image](https://user-images.githubusercontent.com/65992142/151162528-69f90b79-d08d-46b8-8775-d80a6ccbce8a.png)\r\n \\* By mapping hardswish to oneDNN Graph, it\u2019s 8% faster than PyTorch JIT (NNC + OFI)\r\n  \\** We expect performance gain after mapping transpose, contiguous & view to oneDNN graph ops\r\n\r\n\r\n### Directory structure of the integration code\r\nFuser-related code are placed under:\r\n```\r\ntorch/csrc/jit/codegen/onednn/\r\n```\r\n\r\nOptimization pass registration is done in:\r\n```\r\ntorch/csrc/jit/passes/onednn_graph_fuser.h\r\n```\r\n\r\nCMake for the integration code is:\r\n```\r\ncaffe2/CMakeLists.txt\r\n```\r\n\r\n## Limitations\r\n\r\n- In this PR, we have only supported the optimization on Linux platform. The support on Windows and MacOS will be enabled as the next step.\r\n- We have only optimized the inference use case.",
-          "headRefName": "chunyuan/llga_preview2",
-          "headRepository": {
-            "nameWithOwner": "chunyuan-w/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "0096fcc49f277fd8e006fcb42e0cb28a1422ec98"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "7bcc4de26a5472f1d252735dd425b46794b0844f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "3a2a588bfe6bbf9bf74d88d441cd22affda207da"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "ca7df12fbfaa3ddbabeca39b76300d17f4a33f2f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "81d44f35b8bc043c38837d0694e5bc072203b832"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "14fd5d1bfc2c58a71379f778871e3fca0a8e79b2"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "954dc23663125897f4b199eb2a8607dc5fca3274"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "9f77a0b476accc678b6f0569e4ff33fa6bbe97fc"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "fbf3b23bc1288697e1aec539a7c4ee3dc0bcb84c"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "f8b8e78f786586c3cdf3966fd83ffa124d3eda70"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "6fffa2f7453ee7e0f8d8e2f73ea8a65230539589"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "849385404e6f3cd1cf7cef19f931ecf4fa28afdb"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "adbae7b77f8c0dbc59fccf15207d97ba86cfade2"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "6dcf2a4981aff24fa16fc7461ae4ec29690f956f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "54f3e05ad524cffd0911ee93be3c50f589b51f58"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "edbfc640ea79a0af85757d9e73796dcc90231519"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "67654db7cba562809d1b4a44cdda58af5cc9daaf"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "9c9d99b930b11af9ff03f52d45bf49c652df758d"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "ffb25119cd9ce815cc4d9d14a2317fcbbfa9ea86"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "ab9eee84512ca1bdfbc81e25c6eb67b29d0f302a"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "62a4642cf3330524990a69ac29e002c97812320a"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "ca9b1223be4af2c8b4929303d498eafd71793128"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "6f4a23d24514a02954d2ec792830085f612223c9"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "b2a9a9c0926b02d0b2e87722ed61450f224a61d0"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "e88b492be733f24b6aa395829c76add67d0901e7"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "c44336d7a914952bfb78e012e08d9a6d6dde5937"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "5157930f7b3921d41a586260582b574c915f6ca1"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "04cb8353813f6bbd0d913a994923cc7e1e291406"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "62991eaad0e638bb0bced327e03f932f66f68732"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "7496bf1588050191595d833d23b8972b2f22655e"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "d9d35f23cca0cd29c78a845731b24826152dcf1c"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "f74ec134f18a65a7c72455bdf44f72e3ebb27105"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "eb32cc65a975361160948bfc3d6a577991ea262e"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "c7665f8d695b680c54db0bad2b7b7df46d886b50"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "e6321ad8f59ea01130568c202d186448bb9cb9d0"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "a72cd0d02693f45e5354a70654581ad514581ec7"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "b3cd3028b4ed31805e82f7eaf02217ab74ca59b9"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "49a592d9788d08e6cd0593882f867e129057c1cc"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "0575766b2144b13f6a38227c4e2b8d22ec8db80f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "b5c9b10ff87d622350e8ca64fae3a476eb70d5aa"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "66bc652a30ccc329adb929870a4ac726bb98b38c"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "72b9ca9c8e2dac98cbb7199b3dfac7c7305b80c5"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "a7892ed7373207d96406c8b5734a089643c5cdbd"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "d54cb084e1daad8a08c3f8de0ad3f7afb5b05ac1"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "aef71d692a8a159e0ca56be363e2cc1225ce7647"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "bf618e205ec31cff962dcc8ab478e0a699a9572d"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "e4a331f1088448f7d7d86256ce71e0e71da006b0"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "0b743523d1430fec759d5fefbb687f17c89335a5"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "e80a351a62d98b810ec8985c4b25257af1d6c5bb"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "c189eca154b6691919d0e21489d1c322c7435c0b"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "e080a067c75d7b888a8a362682a2d5ba70e0c3a8"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "028561fbf8f3ed90e074e6e0e3a4ca4dd7ffa2a8"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "d550cf14037badd4caa2f52202e2f20bc4db8432"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "574159ebadd1dec24daaf883879ffeca8d9e71b7"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "9eb3ee98ea756067ed1c8f52f309f6d3e211a904"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "29929f48be03dcdd1bbfade572de7feafa825547"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "8a7358ca8da547b40ea1a99ddc57ebed19959684"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "6606637d2c5525b43e294a8b366a85052e1be0c6"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "5ecfd1f28b87045deb8bc8ffe33b3d8b906f3264"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "be2d4345c65442c4cfbe8afdfb2ae0893945da42"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "b5b89d3644a43e2dbda841cafb71b32edbe07c8a"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nikita.shulga@gmail.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "73881411e2bfb3aaa2e89926a82390b4c587ad75"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "NjI",
-              "hasNextPage": false
-            },
-            "totalCount": 62
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              },
-                              {
-                                "name": "Meta Internal-Only Changes Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://opensource.facebook.com/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NXnc=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYwzI="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2018440028"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "clang-format",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895825"
-                              },
-                              {
-                                "name": "py2-setup-validate-errormsg",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895911"
-                              },
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895963"
-                              },
-                              {
-                                "name": "shellcheck",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896134"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896253"
-                              },
-                              {
-                                "name": "clang-tidy",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896371"
-                              },
-                              {
-                                "name": "cmakelint",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896525"
-                              },
-                              {
-                                "name": "flake8-py3",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896658"
-                              },
-                              {
-                                "name": "Test collect_env (with_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896771"
-                              },
-                              {
-                                "name": "Test collect_env (without_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896795"
-                              },
-                              {
-                                "name": "Test tools",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896838"
-                              },
-                              {
-                                "name": "mypy",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896897"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NZqw=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxPI="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2018440031"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440031/jobs/2903895828"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NYIw=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxPc="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pull"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2018440039"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896014"
-                              },
-                              {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896165"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896394"
-                              },
-                              {
-                                "name": "linux-bionic-rocm4.5-py3.7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896572"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896666"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896778"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896837"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896896"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896936"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897025"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897161"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897213"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897280"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897368"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897431"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897476"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897578"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897630"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897699"
-                              },
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897733"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (cpp)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327787"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (python)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327838"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327956"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327997"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328035"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328093"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328131"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328177"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904333962"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904334006"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430419"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430459"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430508"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430573"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443663"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443723"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443787"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904454239"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904454303"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904554602"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904554698"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588855"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588886"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588924"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904655702"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656104"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656150"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656192"
-                              },
-                              {
-                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904706520"
-                              },
-                              {
-                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904706565"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_fN1g=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxQs="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": false
-                    }
-                  },
-                  "status": {
-                    "contexts": [
-                      {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048428?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048429?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048431?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048430?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      }
-                    ]
-                  },
-                  "pushedDate": "2022-03-21T19:58:52Z",
-                  "oid": "73881411e2bfb3aaa2e89926a82390b4c587ad75"
-                }
-              }
-            ]
-          },
-          "changedFiles": 37,
-          "files": {
-            "nodes": [
-              {
-                "path": "aten/src/ATen/core/interned_strings.h"
-              },
-              {
-                "path": "caffe2/CMakeLists.txt"
-              },
-              {
-                "path": "cmake/Dependencies.cmake"
-              },
-              {
-                "path": "cmake/Modules/FindMKLDNN.cmake"
-              },
-              {
-                "path": "cmake/public/mkldnn.cmake"
-              },
-              {
-                "path": "docs/source/jit.rst"
-              },
-              {
-                "path": "test/test_jit_llga_fuser.py"
-              },
-              {
-                "path": "torch/_C/__init__.pyi.in"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/README.md"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/defer_size_check.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/defer_size_check.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/graph_fuser.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/graph_fuser.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/graph_helper.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/graph_helper.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/graph_rewriter.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/guard_shape.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/guard_shape.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/interface.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/interface.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/kernel.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/kernel.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/layout_propagation.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/layout_propagation.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/operator.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/prepare_binary.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/prepare_binary.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/register_interface.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/ir/alias_analysis.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/ir/ir.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/passes/inline_autodiff_subgraphs.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/passes/onednn_graph_fuser.h"
-              },
-              {
-                "path": "torch/csrc/jit/python/init.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/operator.cpp"
-              },
-              {
-                "path": "torch/jit/__init__.py"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "Mzc",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
-              {
-                "author": {
-                  "login": "pinzhenx"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "pinzhenx"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "pinzhenx"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "chunyuan-w"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "eellison"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "wukong1992"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "eellison"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "eellison"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "eellison"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "eellison"
-                },
-                "state": "APPROVED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "eellison"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "malfet"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "malfet"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "malfet"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMS0xMi0xMFQwOToyNDoxOS0wODowMLkyMDIxLTEyLTEwVDA5OjI0OjE5LTA4OjAwzjFryLE=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.",
-                "createdAt": "2022-03-21T22:51:38Z",
-                "author": {
-                  "login": "suo"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1074498483
-              },
-              {
-                "bodyText": "@pytorchbot revert this",
-                "createdAt": "2022-03-21T22:51:44Z",
-                "author": {
-                  "login": "suo"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1074498550
-              },
-              {
-                "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.\n\nOops! Will fix it ASAP.",
-                "createdAt": "2022-03-21T22:53:34Z",
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1074499668
-              },
-              {
-                "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).",
-                "createdAt": "2022-03-21T23:07:23Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1074508608
-              },
-              {
-                "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).",
-                "createdAt": "2022-03-30T00:53:50Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1082508130
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOQAuLsw==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "oncall: jit"
-                }
-              },
-              {
-                "node": {
-                  "name": "triaged"
-                }
-              },
-              {
-                "node": {
-                  "name": "open source"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              },
-              {
-                "node": {
-                  "name": "Reverted"
-                }
-              },
-              {
-                "node": {
-                  "name": "intel priority"
-                }
-              }
-            ]
-          },
-          "headRef": {
-            "compare": {
-              "commits": {
-                "edges": [
-                  {
-                    "node": {
-                      "parents": {
-                        "edges": [
-                          {
-                            "node": {
-                              "oid": "3cfc61b84659cea435411a546eca6a891584247f"
-                            }
-                          }
-                        ]
-                      }
-                    }
-                  }
-                ]
-              }
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOQAuLsw== name=pytorch number=68111 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/chunyuan-w/pytorch/blob/7496bf1588050191595d833d23b8972b2f22655e/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/trunk\n\u2705 triggered\n\n\nlinux-docs\nciflow/all, ciflow/cpu, ciflow/default, ciflow/docs, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-vulkan-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk, ciflow/vulkan\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-build\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-custom-build-static\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-onnx\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/onnx, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7-no-ops\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\ncaffe2-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\ndocker-builds\nciflow/all, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-custom-ops\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-full-jit\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-metal\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-full-jit\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda10.2-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-binary-conda\nciflow/binaries, ciflow/binaries/conda\n\ud83d\udeab skipped\n\n\nlinux-binary-libtorch-cxx11-abi\nciflow/binaries, ciflow/binaries/libtorch\n\ud83d\udeab skipped\n\n\nlinux-binary-libtorch-pre-cxx11\nciflow/binaries, ciflow/binaries/libtorch\n\ud83d\udeab skipped\n\n\nlinux-binary-manywheel\nciflow/binaries, ciflow/binaries/wheel\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-docs-push\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-no-ops\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-arm64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-lite-interpreter-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-11-py3-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled, ciflow/slow, ciflow/slow-gradcheck\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.1-py3.7-gcc7-debug\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.1-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.5-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-build\nciflow/all, ciflow/android, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\n\n\nYou can add a comment to the PR and tag @pytorchbot with the following commands:\n\n# ciflow rerun, \"ciflow/default\" will always be added automatically\n@pytorchbot ciflow rerun\n\n# ciflow rerun with additional labels \"-l <ciflow/label_name>\", which is equivalent to adding these labels manually and trigger the rerun\n@pytorchbot ciflow rerun -l ciflow/scheduled -l ciflow/slow\n\nFor more information, please take a look at the CI Flow Wiki.",
-                "createdAt": "2021-11-10T08:42:49Z",
-                "author": {
-                  "login": "pytorch-probot"
-                },
-                "authorAssociation": "NONE",
-                "editor": {
-                  "login": "pytorch-probot"
-                },
-                "databaseId": 964902865
-              },
-              {
-                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/68111\nNeed help or want to give feedback on the CI? Visit our office hours\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 7388141 (more details on the Dr. CI page):\n\n\n29/29 failures introduced in this PR\n\n\n\ud83d\udd75\ufe0f 29 new failures recognized by patterns\nThe following CI failures do not appear to be due to upstream breakages:\n pull / linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge) (1/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:31:38.6978776Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:31:38.3001628Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:31:38.5169168Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:31:38.5362923Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:31:38.5413452Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:31:38.5458747Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:31:38.5484014Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:31:38.5497924Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:31:38.5656491Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:31:38.5678893Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:31:38.6888479Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0f6488c20adb4dca4\n2022-03-21T21:31:38.6978776Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:31:38.6992648Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:31:38.7003010Z ##[error]Process completed with exit code 2.\n2022-03-21T21:31:38.7044027Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:31:38.7044261Z with:\n2022-03-21T21:31:38.7044413Z env:\n2022-03-21T21:31:38.7044565Z   IN_CI: 1\n2022-03-21T21:31:38.7044709Z   IS_GHA: 1\n2022-03-21T21:31:38.7044885Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:31:38.7045067Z ##[endgroup]\n2022-03-21T21:31:38.7060958Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge) (2/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:35:19.2635222Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:35:18.9028722Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:35:19.1132721Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:35:19.1310590Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:35:19.1360251Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:35:19.1386865Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:35:19.1429182Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:35:19.1441925Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:35:19.1468280Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:35:19.1617667Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:35:19.2545368Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-098be2985e0392130\n2022-03-21T21:35:19.2635222Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:35:19.2648463Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:35:19.2658727Z ##[error]Process completed with exit code 2.\n2022-03-21T21:35:19.2706355Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:35:19.2706591Z with:\n2022-03-21T21:35:19.2706748Z env:\n2022-03-21T21:35:19.2706908Z   IN_CI: 1\n2022-03-21T21:35:19.2707061Z   IS_GHA: 1\n2022-03-21T21:35:19.2707246Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:35:19.2707438Z ##[endgroup]\n2022-03-21T21:35:19.2724554Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge) (3/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:11:57.5531419Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:11:52.7662022Z   Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T23:11:53.1213298Z      ---------------------------------------- 8.1/8.1 MB 23.6 MB/s eta 0:00:00\n2022-03-21T23:11:53.1644665Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:11:53.2218699Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T23:11:53.2389674Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T23:11:53.2787295Z      -------------------------------------- 247.7/247.7 KB 7.4 MB/s eta 0:00:00\n2022-03-21T23:11:53.3761842Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:11:53.5457622Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T23:11:57.4175080Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T23:11:57.5296815Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0105d4db093574f40\n2022-03-21T23:11:57.5531419Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:11:57.5564814Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:11:57.5587712Z ##[error]Process completed with exit code 2.\n2022-03-21T23:11:57.5790311Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T23:11:57.5790832Z with:\n2022-03-21T23:11:57.5791104Z env:\n2022-03-21T23:11:57.5791358Z   IN_CI: 1\n2022-03-21T23:11:57.5791620Z   IS_GHA: 1\n2022-03-21T23:11:57.5791939Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T23:11:57.5792425Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T23:11:57.5792884Z ##[endgroup]\n\n\n pull / linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu) (4/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T02:17:12.6257577Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T02:17:11.9280556Z   Using cached https://files.pythonhosted.org/packages/7b/9c/f51775ebe7df5a7aa4e7c79ed671bde94e154bd968aca8d65bb24aba0c8c/s3transfer-0.5.2-py3-none-any.whl\n2022-03-22T02:17:11.9335199Z Collecting urllib3<1.27,>=1.25.4 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:11.9682045Z   Using cached https://files.pythonhosted.org/packages/ec/03/062e6444ce4baf1eac17a6a0ebfe36bb1ad05e1df0e20b110de59c278498/urllib3-1.26.9-py2.py3-none-any.whl\n2022-03-22T02:17:11.9850357Z Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:12.0403171Z   Using cached https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl\n2022-03-22T02:17:12.0468875Z Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:12.0590000Z   Using cached https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl\n2022-03-22T02:17:12.0607093Z Installing collected packages: jmespath, urllib3, six, python-dateutil, botocore, s3transfer, boto3\n2022-03-22T02:17:12.5273459Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2 six-1.16.0 urllib3-1.26.9\n2022-03-22T02:17:12.6032812Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 worker-rocm-amd-114\n2022-03-22T02:17:12.6257577Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T02:17:12.6259543Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T02:17:12.6291924Z ##[error]Process completed with exit code 2.\n2022-03-22T02:17:12.6387977Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T02:17:12.6388298Z with:\n2022-03-22T02:17:12.6388521Z   wait-ssh: false\n2022-03-22T02:17:12.6388727Z env:\n2022-03-22T02:17:12.6388932Z   IN_CI: 1\n2022-03-22T02:17:12.6389143Z   IS_GHA: 1\n2022-03-22T02:17:12.6389368Z   GIT_DEFAULT_BRANCH: master\n2022-03-22T02:17:12.6389669Z   DOCKER_HOST: unix:///run/user/1121/docker.sock\n\n\n pull / linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge) (5/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:19:24.4890693Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:19:24.0962005Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:19:24.3152253Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:19:24.3341183Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:19:24.3391374Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:19:24.3436392Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:19:24.3448982Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:19:24.3474092Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:19:24.3502003Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:19:24.3655072Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:19:24.4799309Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0bc9250521f338cae\n2022-03-21T22:19:24.4890693Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:19:24.4903625Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:19:24.4913841Z ##[error]Process completed with exit code 2.\n2022-03-21T22:19:24.4957338Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:19:24.4957575Z with:\n2022-03-21T22:19:24.4957735Z env:\n2022-03-21T22:19:24.4957900Z   IN_CI: 1\n2022-03-21T22:19:24.4958055Z   IS_GHA: 1\n2022-03-21T22:19:24.4958246Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:19:24.4958437Z ##[endgroup]\n2022-03-21T22:19:24.4989649Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu) (6/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T01:05:07.6983899Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T01:05:06.8364546Z   Using cached https://files.pythonhosted.org/packages/7b/9c/f51775ebe7df5a7aa4e7c79ed671bde94e154bd968aca8d65bb24aba0c8c/s3transfer-0.5.2-py3-none-any.whl\n2022-03-22T01:05:06.8431763Z Collecting urllib3<1.27,>=1.25.4 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:06.8949391Z   Using cached https://files.pythonhosted.org/packages/ec/03/062e6444ce4baf1eac17a6a0ebfe36bb1ad05e1df0e20b110de59c278498/urllib3-1.26.9-py2.py3-none-any.whl\n2022-03-22T01:05:06.9180079Z Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:06.9803351Z   Using cached https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl\n2022-03-22T01:05:06.9882133Z Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:07.0067062Z   Using cached https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl\n2022-03-22T01:05:07.0088676Z Installing collected packages: urllib3, jmespath, six, python-dateutil, botocore, s3transfer, boto3\n2022-03-22T01:05:07.5819667Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2 six-1.16.0 urllib3-1.26.9\n2022-03-22T01:05:07.6774717Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 worker-rocm-amd-60\n2022-03-22T01:05:07.6983899Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T01:05:07.6988652Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T01:05:07.7023073Z ##[error]Process completed with exit code 2.\n2022-03-22T01:05:07.7102087Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T01:05:07.7102389Z with:\n2022-03-22T01:05:07.7102603Z   wait-ssh: false\n2022-03-22T01:05:07.7102820Z env:\n2022-03-22T01:05:07.7103015Z   IN_CI: 1\n2022-03-22T01:05:07.7103224Z   IS_GHA: 1\n2022-03-22T01:05:07.7103458Z   GIT_DEFAULT_BRANCH: master\n2022-03-22T01:05:07.7103737Z   DOCKER_HOST: unix:///run/user/1502/docker.sock\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge) (7/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:51:39.3637996Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:51:39.2041249Z   Attempting uninstall: s3transfer\n2022-03-21T20:51:39.2043010Z     Found existing installation: s3transfer 0.3.7\n2022-03-21T20:51:39.2083799Z     Uninstalling s3transfer-0.3.7:\n2022-03-21T20:51:39.2089675Z       Successfully uninstalled s3transfer-0.3.7\n2022-03-21T20:51:39.2480546Z   Attempting uninstall: boto3\n2022-03-21T20:51:39.2482953Z     Found existing installation: boto3 1.16.34\n2022-03-21T20:51:39.2584292Z     Uninstalling boto3-1.16.34:\n2022-03-21T20:51:39.2599474Z       Successfully uninstalled boto3-1.16.34\n2022-03-21T20:51:39.3130921Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T20:51:39.3550598Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-03ef7efc3078e3da5\n2022-03-21T20:51:39.3637996Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:51:39.3650651Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:51:39.3660484Z ##[error]Process completed with exit code 2.\n2022-03-21T20:51:39.3696465Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:51:39.3696693Z with:\n2022-03-21T20:51:39.3696850Z env:\n2022-03-21T20:51:39.3697012Z   IN_CI: 1\n2022-03-21T20:51:39.3697161Z   IS_GHA: 1\n2022-03-21T20:51:39.3697342Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T20:51:39.3697528Z ##[endgroup]\n2022-03-21T20:51:39.3730420Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge) (8/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:03:36.3916860Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:03:36.0096309Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:03:36.2278560Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:03:36.2461618Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:03:36.2513260Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:03:36.2541524Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:03:36.2554899Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:03:36.2598277Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:03:36.2758299Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:03:36.2780690Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:03:36.3825021Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0a4a552890e6ef7d3\n2022-03-21T21:03:36.3916860Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:03:36.3930343Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:03:36.3941263Z ##[error]Process completed with exit code 2.\n2022-03-21T21:03:36.3979258Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:03:36.3979496Z with:\n2022-03-21T21:03:36.3979654Z env:\n2022-03-21T21:03:36.3979814Z   IN_CI: 1\n2022-03-21T21:03:36.3979968Z   IS_GHA: 1\n2022-03-21T21:03:36.3980157Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:03:36.3980360Z ##[endgroup]\n2022-03-21T21:03:36.3996257Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu) (9/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T00:41:15.5325784Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T00:41:10.3015614Z   Downloading s3transfer-0.5.2-py3-none-any.whl (79 kB)\n2022-03-22T00:41:10.3625659Z      ---------------------------------------- 79.5/79.5 KB 1.1 MB/s eta 0:00:00\n2022-03-22T00:41:10.4120236Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-22T00:41:10.4170155Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-22T00:41:10.4722115Z      -------------------------------------- 247.7/247.7 KB 5.2 MB/s eta 0:00:00\n2022-03-22T00:41:10.4843512Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-22T00:41:10.6596108Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-22T00:41:10.8733354Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-22T00:41:15.3745408Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-22T00:41:15.4987162Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-09cacc848abc3dd32\n2022-03-22T00:41:15.5325784Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T00:41:15.5373630Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T00:41:15.5404353Z ##[error]Process completed with exit code 2.\n2022-03-22T00:41:15.5790508Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-22T00:41:15.5791192Z with:\n2022-03-22T00:41:15.5791530Z env:\n2022-03-22T00:41:15.5791849Z   IN_CI: 1\n2022-03-22T00:41:15.5792186Z   IS_GHA: 1\n2022-03-22T00:41:15.5792599Z   GIT_DEFAULT_BRANCH: master\n2022-03-22T00:41:15.5793237Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-22T00:41:15.5793831Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge) (10/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:50:32.9799307Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:50:32.8167560Z   Attempting uninstall: s3transfer\n2022-03-21T20:50:32.8169351Z     Found existing installation: s3transfer 0.3.7\n2022-03-21T20:50:32.8213295Z     Uninstalling s3transfer-0.3.7:\n2022-03-21T20:50:32.8219209Z       Successfully uninstalled s3transfer-0.3.7\n2022-03-21T20:50:32.8602320Z   Attempting uninstall: boto3\n2022-03-21T20:50:32.8603289Z     Found existing installation: boto3 1.16.34\n2022-03-21T20:50:32.8704535Z     Uninstalling boto3-1.16.34:\n2022-03-21T20:50:32.8719403Z       Successfully uninstalled boto3-1.16.34\n2022-03-21T20:50:32.9244278Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T20:50:32.9710449Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0c568461a276d4a71\n2022-03-21T20:50:32.9799307Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:50:32.9812238Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:50:32.9823052Z ##[error]Process completed with exit code 2.\n2022-03-21T20:50:32.9859290Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:50:32.9859527Z with:\n2022-03-21T20:50:32.9859664Z env:\n2022-03-21T20:50:32.9859817Z   IN_CI: 1\n2022-03-21T20:50:32.9859977Z   IS_GHA: 1\n2022-03-21T20:50:32.9860144Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T20:50:32.9860327Z ##[endgroup]\n2022-03-21T20:50:32.9893642Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge) (11/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:05:00.7163042Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:05:00.6660824Z     #10 0x55fc8a3ea801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:05:00.6661768Z     #11 0x55fc8a3f57a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:05:00.6662455Z     #12 0x55fc8a3f580b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:05:00.6663570Z     #13 0x55fc8a3f5908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:05:00.6663952Z     #14 0x55fc8a3f5908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:05:00.6664431Z     #15 0x55fc8a3f5908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:05:00.6665304Z     #16 0x55fc8a3f5ccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:05:00.7162113Z     #17 0x7f940d00f83f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:05:00.7162534Z     #18 0x55fc8a39a554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:05:00.7162711Z \n2022-03-21T21:05:00.7163042Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:05:00.7334595Z + retcode=1\n2022-03-21T21:05:00.7334954Z + set -e\n2022-03-21T21:05:00.7335215Z + return 1\n2022-03-21T21:05:00.7338688Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:05:00.7339232Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:05:00.7340113Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:05:00.7340612Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:05:00.7341187Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:05:00.7341668Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:05:00.7344466Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge) (12/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:06:03.4437430Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:06:03.0752199Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:06:03.2853252Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:06:03.3032326Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:06:03.3081589Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:06:03.3093911Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:06:03.3120244Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:06:03.3162406Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:06:03.3188431Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:06:03.3337181Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:06:03.4348072Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0ee48c8811fafc444\n2022-03-21T22:06:03.4437430Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:06:03.4450920Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:06:03.4461263Z ##[error]Process completed with exit code 2.\n2022-03-21T22:06:03.4502346Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:06:03.4502576Z with:\n2022-03-21T22:06:03.4502730Z env:\n2022-03-21T22:06:03.4502888Z   IN_CI: 1\n2022-03-21T22:06:03.4503038Z   IS_GHA: 1\n2022-03-21T22:06:03.4503302Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:06:03.4503492Z ##[endgroup]\n2022-03-21T22:06:03.4519156Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge) (13/29)\nStep: \"Test\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:50:13.2205634Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:50:12.8679322Z + python3 -m pip install boto3==1.19.12\n2022-03-21T20:50:13.0744228Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T20:50:13.0916284Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T20:50:13.0964264Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T20:50:13.1005656Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T20:50:13.1017299Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T20:50:13.1041042Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T20:50:13.1189450Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T20:50:13.1208751Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T20:50:13.2119445Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d02da60fd18c22f5\n2022-03-21T20:50:13.2205634Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:50:13.2217939Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:50:13.2220259Z ##[error]Process completed with exit code 2.\n2022-03-21T20:50:13.2248664Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:50:13.2249012Z with:\n2022-03-21T20:50:13.2249260Z env:\n2022-03-21T20:50:13.2249500Z   IN_CI: 1\n2022-03-21T20:50:13.2249738Z   IS_GHA: 1\n2022-03-21T20:50:13.2250025Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T20:50:13.2250329Z ##[endgroup]\n2022-03-21T20:50:13.2272735Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu) (14/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:47:38.0451999Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:47:37.5554508Z + python3 -m pip install boto3==1.19.12\n2022-03-21T23:47:37.8411473Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T23:47:37.8631484Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T23:47:37.8699561Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T23:47:37.8737037Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T23:47:37.8754443Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T23:47:37.8814393Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T23:47:37.8849540Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:47:37.9059579Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:47:38.0336298Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0b44f47f4292089a2\n2022-03-21T23:47:38.0451999Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:47:38.0469471Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:47:38.0484106Z ##[error]Process completed with exit code 2.\n2022-03-21T23:47:38.0532678Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T23:47:38.0533007Z with:\n2022-03-21T23:47:38.0533223Z env:\n2022-03-21T23:47:38.0533440Z   IN_CI: 1\n2022-03-21T23:47:38.0533649Z   IS_GHA: 1\n2022-03-21T23:47:38.0533902Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T23:47:38.0534170Z   GPU_FLAG: --gpus all\n2022-03-21T23:47:38.0534401Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge) (15/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:04:59.3115800Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:04:59.2595213Z     #10 0x55a7f39a4801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:04:59.2595707Z     #11 0x55a7f39af7a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:04:59.2597203Z     #12 0x55a7f39af80b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:04:59.2598205Z     #13 0x55a7f39af908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:04:59.2598697Z     #14 0x55a7f39af908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:04:59.2599178Z     #15 0x55a7f39af908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:04:59.2599747Z     #16 0x55a7f39afccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:04:59.3114751Z     #17 0x7f3b3822383f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:04:59.3115277Z     #18 0x55a7f3954554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:04:59.3115468Z \n2022-03-21T21:04:59.3115800Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:04:59.3292385Z + retcode=1\n2022-03-21T21:04:59.3292781Z + set -e\n2022-03-21T21:04:59.3293062Z + return 1\n2022-03-21T21:04:59.3295462Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:04:59.3295802Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:04:59.3296394Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:04:59.3296700Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:04:59.3297055Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:04:59.3297416Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:04:59.3299623Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge) (16/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:14:31.7846086Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:14:25.5525714Z Collecting jmespath<1.0.0,>=0.7.1\n2022-03-21T22:14:25.5568155Z   Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)\n2022-03-21T22:14:25.5952617Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T22:14:25.6169392Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T22:14:25.6629996Z      -------------------------------------- 247.7/247.7 KB 5.1 MB/s eta 0:00:00\n2022-03-21T22:14:25.6710247Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:14:25.8284354Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:14:25.9816751Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T22:14:31.6672236Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T22:14:31.7630473Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0ed0915ecee5d2424\n2022-03-21T22:14:31.7846086Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:14:31.7876742Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:14:31.7897140Z ##[error]Process completed with exit code 2.\n2022-03-21T22:14:31.8195621Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T22:14:31.8196110Z with:\n2022-03-21T22:14:31.8196356Z env:\n2022-03-21T22:14:31.8196614Z   IN_CI: 1\n2022-03-21T22:14:31.8196876Z   IS_GHA: 1\n2022-03-21T22:14:31.8197169Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:14:31.8197652Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T22:14:31.8198093Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge) (17/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:19:15.8845728Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:19:15.5116060Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:19:15.7231476Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:19:15.7409711Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:19:15.7458478Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:19:15.7470508Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:19:15.7496799Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:19:15.7538362Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:19:15.7566161Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:19:15.7711630Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:19:15.8753543Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0e2b3b4ddb246ff2a\n2022-03-21T21:19:15.8845728Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:19:15.8859814Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:19:15.8870165Z ##[error]Process completed with exit code 2.\n2022-03-21T21:19:15.8917039Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:19:15.8917279Z with:\n2022-03-21T21:19:15.8917433Z env:\n2022-03-21T21:19:15.8917586Z   IN_CI: 1\n2022-03-21T21:19:15.8917734Z   IS_GHA: 1\n2022-03-21T21:19:15.8917917Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:19:15.8918102Z ##[endgroup]\n2022-03-21T21:19:15.8934572Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu) (18/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:19:48.5900162Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:19:48.0742254Z + python3 -m pip install boto3==1.19.12\n2022-03-21T23:19:48.3742563Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T23:19:48.3976536Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T23:19:48.4048700Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T23:19:48.4065374Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T23:19:48.4128076Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T23:19:48.4164273Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T23:19:48.4202610Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:19:48.4416723Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:19:48.5773033Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-07ab7a3c4a5402af2\n2022-03-21T23:19:48.5900162Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:19:48.5919822Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:19:48.5936087Z ##[error]Process completed with exit code 2.\n2022-03-21T23:19:48.6007930Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T23:19:48.6008268Z with:\n2022-03-21T23:19:48.6008483Z env:\n2022-03-21T23:19:48.6008701Z   IN_CI: 1\n2022-03-21T23:19:48.6008920Z   IS_GHA: 1\n2022-03-21T23:19:48.6009170Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T23:19:48.6009440Z   GPU_FLAG: --gpus all\n2022-03-21T23:19:48.6009671Z ##[endgroup]\n\n\n pull / win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu) (19/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:54:04.2844259Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:53:59.0889659Z   Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T22:53:59.6881416Z      ---------------------------------------- 8.1/8.1 MB 14.0 MB/s eta 0:00:00\n2022-03-21T22:53:59.7427779Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:53:59.7691882Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T22:53:59.7779847Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T22:53:59.8281663Z      -------------------------------------- 247.7/247.7 KB 5.1 MB/s eta 0:00:00\n2022-03-21T22:54:00.0185115Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:54:00.2359770Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T22:54:04.1208891Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T22:54:04.2505862Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-03b4fbe63be8ef4b0\n2022-03-21T22:54:04.2844259Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:54:04.2891082Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:54:04.2919900Z ##[error]Process completed with exit code 2.\n2022-03-21T22:54:04.3377901Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T22:54:04.3378575Z with:\n2022-03-21T22:54:04.3378930Z env:\n2022-03-21T22:54:04.3379275Z   IN_CI: 1\n2022-03-21T22:54:04.3379600Z   IS_GHA: 1\n2022-03-21T22:54:04.3380023Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:54:04.3380691Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T22:54:04.3381278Z ##[endgroup]\n\n\n pull / linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge) (20/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:09:34.0074610Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:09:33.6365531Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:09:33.8475619Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:09:33.8655152Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:09:33.8704395Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:09:33.8716774Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:09:33.8760145Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:09:33.8785000Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:09:33.8811316Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:09:33.8960134Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:09:33.9984866Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d325eb9fd156146f\n2022-03-21T22:09:34.0074610Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:09:34.0087465Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:09:34.0101743Z ##[error]Process completed with exit code 2.\n2022-03-21T22:09:34.0154014Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:09:34.0154246Z with:\n2022-03-21T22:09:34.0154412Z env:\n2022-03-21T22:09:34.0154574Z   IN_CI: 1\n2022-03-21T22:09:34.0154728Z   IS_GHA: 1\n2022-03-21T22:09:34.0154917Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:09:34.0155112Z ##[endgroup]\n2022-03-21T22:09:34.0191047Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge) (21/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:03:17.8502655Z [E request_callbac...yUniqueId(created_on=0, local_id=0) to be created.\n\n2022-03-21T21:03:14.4669960Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpxgdsmeer\n2022-03-21T21:03:14.4671407Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpxgdsmeer/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.4973023Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmp1i2hfmpc\n2022-03-21T21:03:14.4973800Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmp1i2hfmpc/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.5532339Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpgx4da7b0\n2022-03-21T21:03:14.5533064Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpgx4da7b0/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.7050673Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 0\n2022-03-21T21:03:14.7097127Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 3\n2022-03-21T21:03:14.7398339Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 2\n2022-03-21T21:03:14.7922283Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 1\n2022-03-21T21:03:17.8502655Z [E request_callback_no_python.cpp:559] Received error while processing request type 261: false INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/torch/csrc/distributed/rpc/rref_context.cpp\":387, please report a bug to PyTorch. Expected OwnerRRef with id GloballyUniqueId(created_on=0, local_id=0) to be created.\n2022-03-21T21:03:17.8503603Z Exception raised from getOwnerRRef at /var/lib/jenkins/workspace/torch/csrc/distributed/rpc/rref_context.cpp:387 (most recent call first):\n2022-03-21T21:03:17.8504385Z frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x69 (0x7f180df19e19 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8505131Z frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xd2 (0x7f180df160e2 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8505927Z frame #2: c10::detail::torchInternalAssertFail(char const*, char const*, unsigned int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x4e (0x7f180df17a7e in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8506674Z frame #3: torch::distributed::rpc::RRefContext::getOwnerRRef(torch::distributed::rpc::GloballyUniqueId const&, bool) + 0x4b4 (0x7f18118b7b64 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8507642Z frame #4: torch::distributed::rpc::RequestCallbackNoPython::assignOwnerRRef(torch::distributed::rpc::GloballyUniqueId const&, torch::distributed::rpc::GloballyUniqueId const&, c10::intrusive_ptr<c10::ivalue::Future, c10::detail::intrusive_target_default_null_type<c10::ivalue::Future> >) const + 0x70 (0x7f18118a7bf0 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8508613Z frame #5: torch::distributed::rpc::RequestCallbackImpl::processPythonRemoteCall(torch::distributed::rpc::RpcCommandBase&, std::vector<c10::Stream, std::allocator<c10::Stream> >) const + 0xc8 (0x7f1819736208 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)\n2022-03-21T21:03:17.8509749Z frame #6: torch::distributed::rpc::RequestCallbackNoPython::processRpc(torch::distributed::rpc::RpcCommandBase&, torch::distributed::rpc::MessageType const&, std::vector<c10::Stream, std::allocator<c10::Stream> >) const + 0x194 (0x7f18118ac914 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8510708Z frame #7: torch::distributed::rpc::RequestCallbackImpl::processRpcWithErrors(torch::distributed::rpc::RpcCommandBase&, torch::distributed::rpc::MessageType const&, std::vector<c10::Stream, std::allocator<c10::Stream> >) const + 0x65 (0x7f1819735865 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)\n2022-03-21T21:03:17.8511369Z frame #8: <unknown function> + 0x375249a (0x7f18118a949a in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test (22/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:01:07.7015580Z \ufffd[36;1m  echo \"ERR...t available for the merge-base of your branch\"\ufffd[0m\n\n2022-03-21T20:01:07.7012399Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7012634Z \ufffd[36;1m# Covers the case where a previous tag doesn't exist for the tree\ufffd[0m\n2022-03-21T20:01:07.7012992Z \ufffd[36;1m# this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly\ufffd[0m\n2022-03-21T20:01:07.7013373Z \ufffd[36;1mif ! git rev-parse \"$MERGE_BASE:.circleci/docker\"; then\ufffd[0m\n2022-03-21T20:01:07.7013784Z \ufffd[36;1m  echo \"Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit\"\ufffd[0m\n2022-03-21T20:01:07.7014149Z \ufffd[36;1m  exit 1\ufffd[0m\n2022-03-21T20:01:07.7014325Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7014573Z \ufffd[36;1mPREVIOUS_DOCKER_TAG=$(git rev-parse \"$MERGE_BASE:.circleci/docker\")\ufffd[0m\n2022-03-21T20:01:07.7014907Z \ufffd[36;1m# If no image exists but the hash is the same as the previous hash then we should error out here\ufffd[0m\n2022-03-21T20:01:07.7015231Z \ufffd[36;1mif [[ \"${PREVIOUS_DOCKER_TAG}\" = \"${DOCKER_TAG}\" ]]; then\ufffd[0m\n2022-03-21T20:01:07.7015580Z \ufffd[36;1m  echo \"ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch\"\ufffd[0m\n2022-03-21T20:01:07.7015931Z \ufffd[36;1m  echo \"       contact the PyTorch team to restore the original images\"\ufffd[0m\n2022-03-21T20:01:07.7016225Z \ufffd[36;1m  exit 1\ufffd[0m\n2022-03-21T20:01:07.7016400Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7016608Z \ufffd[36;1mecho ::set-output name=rebuild::yes\ufffd[0m\n2022-03-21T20:01:07.7027605Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}\n2022-03-21T20:01:07.7027837Z env:\n2022-03-21T20:01:07.7028006Z   IN_CI: 1\n2022-03-21T20:01:07.7028159Z   IS_GHA: 1\n2022-03-21T20:01:07.7028346Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T20:01:07.7028589Z   BASE_REVISION: 6643522db9ff595f564b8081de58b3a33c546178\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu) (23/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T00:49:54.2949572Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T00:49:53.8049151Z + python3 -m pip install boto3==1.19.12\n2022-03-22T00:49:54.0981629Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-22T00:49:54.1207562Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-22T00:49:54.1277146Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-22T00:49:54.1315027Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-22T00:49:54.1331813Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-22T00:49:54.1391622Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-22T00:49:54.1609217Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-22T00:49:54.1637417Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-22T00:49:54.2830197Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0f7c32fe13be12fea\n2022-03-22T00:49:54.2949572Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T00:49:54.2966933Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T00:49:54.2982588Z ##[error]Process completed with exit code 2.\n2022-03-22T00:49:54.3031464Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T00:49:54.3031794Z with:\n2022-03-22T00:49:54.3032012Z env:\n2022-03-22T00:49:54.3032227Z   IN_CI: 1\n2022-03-22T00:49:54.3032434Z   IS_GHA: 1\n2022-03-22T00:49:54.3032681Z   GIT_DEFAULT_BRANCH: master\n2022-03-22T00:49:54.3033084Z   GPU_FLAG: --gpus all\n2022-03-22T00:49:54.3033312Z ##[endgroup]\n\n\n pull / win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge) (24/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:56:12.5872636Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:56:07.3365589Z   Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T21:56:07.7926584Z      ---------------------------------------- 8.1/8.1 MB 17.3 MB/s eta 0:00:00\n2022-03-21T21:56:07.9319362Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T21:56:07.9366132Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T21:56:08.0077590Z      -------------------------------------- 247.7/247.7 KB 3.0 MB/s eta 0:00:00\n2022-03-21T21:56:08.0164070Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:56:08.1775537Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:56:08.3393469Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T21:56:12.4576766Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T21:56:12.5641959Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0afad69838118af0e\n2022-03-21T21:56:12.5872636Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:56:12.5905611Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:56:12.5927729Z ##[error]Process completed with exit code 2.\n2022-03-21T21:56:12.6239531Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T21:56:12.6240039Z with:\n2022-03-21T21:56:12.6240299Z env:\n2022-03-21T21:56:12.6240557Z   IN_CI: 1\n2022-03-21T21:56:12.6240805Z   IS_GHA: 1\n2022-03-21T21:56:12.6241118Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:56:12.6241613Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T21:56:12.6242052Z ##[endgroup]\n\n\n pull / linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge) (25/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:46:39.5474616Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:46:39.1884210Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:46:39.3928976Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:46:39.4105069Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:46:39.4152571Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:46:39.4194931Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:46:39.4218947Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:46:39.4230812Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:46:39.4380089Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:46:39.4399461Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:46:39.5387703Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0888bed1149cca415\n2022-03-21T21:46:39.5474616Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:46:39.5487145Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:46:39.5497480Z ##[error]Process completed with exit code 2.\n2022-03-21T21:46:39.5541319Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:46:39.5541544Z with:\n2022-03-21T21:46:39.5541698Z env:\n2022-03-21T21:46:39.5541851Z   IN_CI: 1\n2022-03-21T21:46:39.5541997Z   IS_GHA: 1\n2022-03-21T21:46:39.5542176Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:46:39.5542361Z ##[endgroup]\n2022-03-21T21:46:39.5557878Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge) (26/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:34:57.0623859Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:34:56.9039884Z   Attempting uninstall: s3transfer\n2022-03-21T21:34:56.9041446Z     Found existing installation: s3transfer 0.3.7\n2022-03-21T21:34:56.9090783Z     Uninstalling s3transfer-0.3.7:\n2022-03-21T21:34:56.9095968Z       Successfully uninstalled s3transfer-0.3.7\n2022-03-21T21:34:56.9453014Z   Attempting uninstall: boto3\n2022-03-21T21:34:56.9454356Z     Found existing installation: boto3 1.16.34\n2022-03-21T21:34:56.9564320Z     Uninstalling boto3-1.16.34:\n2022-03-21T21:34:56.9578035Z       Successfully uninstalled boto3-1.16.34\n2022-03-21T21:34:57.0091363Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T21:34:57.0536230Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-034a3afd5d80b91fd\n2022-03-21T21:34:57.0623859Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:34:57.0637167Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:34:57.0647396Z ##[error]Process completed with exit code 2.\n2022-03-21T21:34:57.0688237Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:34:57.0688481Z with:\n2022-03-21T21:34:57.0688631Z env:\n2022-03-21T21:34:57.0688769Z   IN_CI: 1\n2022-03-21T21:34:57.0688930Z   IS_GHA: 1\n2022-03-21T21:34:57.0689109Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:34:57.0689462Z ##[endgroup]\n2022-03-21T21:34:57.0704768Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge) (27/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:05:00.7896545Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:05:00.7395504Z     #10 0x5597fd5a9801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:05:00.7396330Z     #11 0x5597fd5b47a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:05:00.7396688Z     #12 0x5597fd5b480b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:05:00.7398664Z     #13 0x5597fd5b4908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:05:00.7399177Z     #14 0x5597fd5b4908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:05:00.7399663Z     #15 0x5597fd5b4908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:05:00.7399986Z     #16 0x5597fd5b4ccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:05:00.7895241Z     #17 0x7f0a5905983f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:05:00.7895772Z     #18 0x5597fd559554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:05:00.7896033Z \n2022-03-21T21:05:00.7896545Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:05:00.8063448Z + retcode=1\n2022-03-21T21:05:00.8063787Z + set -e\n2022-03-21T21:05:00.8064058Z + return 1\n2022-03-21T21:05:00.8067638Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:05:00.8068127Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:05:00.8069018Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:05:00.8069500Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:05:00.8070105Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:05:00.8070580Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:05:00.8072640Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu) (28/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:48:17.3384813Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:48:16.8599645Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:48:17.1464241Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:48:17.1685222Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:48:17.1754164Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:48:17.1771662Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:48:17.1808722Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:48:17.1868636Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:48:17.1903889Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:48:17.2113746Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:48:17.3267404Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-01fe178c405417375\n2022-03-21T22:48:17.3384813Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:48:17.3402286Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:48:17.3418376Z ##[error]Process completed with exit code 2.\n2022-03-21T22:48:17.3470528Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:48:17.3470874Z with:\n2022-03-21T22:48:17.3471096Z env:\n2022-03-21T22:48:17.3471327Z   IN_CI: 1\n2022-03-21T22:48:17.3471538Z   IS_GHA: 1\n2022-03-21T22:48:17.3471802Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:48:17.3472083Z   GPU_FLAG: --gpus all\n2022-03-21T22:48:17.3472322Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge) (29/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:16:38.9646300Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:16:38.7995969Z   Attempting uninstall: s3transfer\n2022-03-21T21:16:38.7998039Z     Found existing installation: s3transfer 0.3.7\n2022-03-21T21:16:38.8066994Z     Uninstalling s3transfer-0.3.7:\n2022-03-21T21:16:38.8072844Z       Successfully uninstalled s3transfer-0.3.7\n2022-03-21T21:16:38.8449275Z   Attempting uninstall: boto3\n2022-03-21T21:16:38.8451430Z     Found existing installation: boto3 1.16.34\n2022-03-21T21:16:38.8559828Z     Uninstalling boto3-1.16.34:\n2022-03-21T21:16:38.8574290Z       Successfully uninstalled boto3-1.16.34\n2022-03-21T21:16:38.9100438Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T21:16:38.9558098Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d779c59d277d32ee\n2022-03-21T21:16:38.9646300Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:16:38.9658894Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:16:38.9673240Z ##[error]Process completed with exit code 2.\n2022-03-21T21:16:38.9720106Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:16:38.9720333Z with:\n2022-03-21T21:16:38.9720485Z env:\n2022-03-21T21:16:38.9720645Z   IN_CI: 1\n2022-03-21T21:16:38.9720793Z   IS_GHA: 1\n2022-03-21T21:16:38.9720970Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:16:38.9721151Z ##[endgroup]\n2022-03-21T21:16:38.9736762Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
-                "createdAt": "2021-11-10T08:42:52Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": {
-                  "login": "facebook-github-bot"
-                },
-                "databaseId": 964902894
-              },
-              {
-                "bodyText": "@vitaly-fedyunin @gottbrath  FYI that this is the oneDNN Graph API integration. It depends on the #63748.",
-                "createdAt": "2021-11-16T16:36:52Z",
-                "author": {
-                  "login": "Jianhui-Li"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 970451860
-              },
-              {
-                "bodyText": "CI failures are currently being caused by some issues in the CI infra, and are also occurring with other PRs.",
-                "createdAt": "2021-12-10T05:59:17Z",
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 990641309
-              },
-              {
-                "bodyText": "CI failures are unrelated.",
-                "createdAt": "2021-12-10T20:44:09Z",
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 991281407
-              },
-              {
-                "bodyText": "The CI failure is unrelated.",
-                "createdAt": "2021-12-16T02:45:59Z",
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 995389295
-              },
-              {
-                "bodyText": "Hi, thank you for the PR!\nDo you mind running a larger amount of torchbench and reporting numbers ? You can look at Jason's post here for what models are supported in script. Initially just the vision models would be useful. @Krovatkin also did some benchmarking of a traced Bert model and found on average a ~16% speedup with this PR.",
-                "createdAt": "2022-01-18T18:22:34Z",
-                "author": {
-                  "login": "eellison"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1015689390
-              },
-              {
-                "bodyText": "Thanks a lot for reviewing, @eellison & @Krovatkin!\nWe just wanted to let you know that we're working on the benchmarking & will get back to you in a day, or two.\nUPDATE (Jan 21): While running some TorchBench models, we discovered some composability issues, and are working to ensure that oneDNN Graph would complement PyTorch's existing fusion capabilities, not hinder them.\nUPDATE (Jan 24): We've resolved the issues & will update this PR later today. Thanks!",
-                "createdAt": "2022-01-20T00:31:01Z",
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": {
-                  "login": "sanchitintel"
-                },
-                "databaseId": 1016996190
-              },
-              {
-                "bodyText": "Hello @eellison,\nWe used this TorchBench branch for comparison. compare_llga.sh can be run for comparison.\nFor benchmarking mobilenet_v3_large with hardswish support in oneDNN Graph, this oneDNN Graph branch can be used in third_party/ideep/mkl-dnn. It delivers a speedup over PyTorch JIT (NNC + OFI) because 21 additional reorders are prevented (the major factor here), and fusion with conv also helps further.\nThe next release of oneDNN Graph would have hardswish support.\nWe're also exploring adding a hardsigmoid op in oneDNN Graph.\nThank you!",
-                "createdAt": "2022-01-26T23:51:38Z",
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": {
-                  "login": "sanchitintel"
-                },
-                "databaseId": 1022709513
-              },
-              {
-                "bodyText": "Please note that this PR should be merged after #71546, as #71546 changes the  third_party/ideep commit (this PR also uses that ideep commit, but it'd probably be better to merge #71546 first, so that oneDNN v2.5.2 upgrade would be in a separate PR). Thank you!",
-                "createdAt": "2022-01-31T23:57:21Z",
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1026330085
-              },
-              {
-                "bodyText": "@sanchitintel mind rebasing and i'll land ?",
-                "createdAt": "2022-03-01T20:07:57Z",
-                "author": {
-                  "login": "eellison"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1055813984
-              },
-              {
-                "bodyText": "@eellison has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2022-03-02T17:44:47Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1057203495
-              },
-              {
-                "bodyText": "Thanks a lot for taking a look, @eellison! To fix this error, we would enable Bazel build for oneDNN Graph.",
-                "createdAt": "2022-03-07T23:03:45Z",
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": {
-                  "login": "sanchitintel"
-                },
-                "databaseId": 1061230087
-              },
-              {
-                "bodyText": "@eellison has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2022-03-09T19:24:13Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1063276600
-              },
-              {
-                "bodyText": "@malfet has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2022-03-21T19:59:41Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1074355779
-              },
-              {
-                "bodyText": "And graph_rewriter.cpp is full of DOS newlines...",
-                "createdAt": "2022-03-21T20:53:40Z",
-                "author": {
-                  "login": "malfet"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1074407452
-              },
-              {
-                "bodyText": "Hey @chunyuan-w.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-03-21T22:12:51Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1074471758
-              },
-              {
-                "bodyText": "Thanks a ton for your help, @malfet & @eellison! :)\nWe'll incorporate your suggestions in subsequent PR(s).",
-                "createdAt": "2022-03-21T22:41:25Z",
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": {
-                  "login": "sanchitintel"
-                },
-                "databaseId": 1074492365
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOOYM_0Q==",
-              "hasPreviousPage": false
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=None name=pytorch-dev-infra org=pytorch": {
-    "data": {
-      "organization": {
-        "team": {
-          "members": {
-            "nodes": [
-              {
-                "login": "kit1980"
-              },
-              {
-                "login": "huydhn"
-              },
-              {
-                "login": "seemethere"
-              },
-              {
-                "login": "malfet"
-              },
-              {
-                "login": "DanilBaibak"
-              },
-              {
-                "login": "ZainRizvi"
-              },
-              {
-                "login": "jeanschmidt"
-              },
-              {
-                "login": "atalman"
-              },
-              {
-                "login": "mehtanirav"
-              },
-              {
-                "login": "osalpekar"
-              },
-              {
-                "login": "clee2000"
-              },
-              {
-                "login": "izaitsevfb"
-              },
-              {
-                "login": "weiwangmeta"
-              }
-            ],
-            "pageInfo": {
-              "hasNextPage": false,
-              "endCursor": "Y3Vyc29yOnYyOpHOBoQSVA=="
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=None name=qwertyuiop org=pytorch": {
-    "data": {
-      "organization": {
-        "team": null
-      }
-    }
-  },
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=76118 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "malfet"
-          },
-          "title": "Dummy change with lots of commits",
-          "body": "Draft PR with 100+ commits, to test mergebot ",
-          "headRefName": "malfet/pr-with-lots-of-commits",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "3067f2240afc7a29dc348000aa19eccbd9772303"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "andrewor14"
-                    },
-                    "email": "andrewor@fb.com",
-                    "name": "Andrew Or"
-                  },
-                  "oid": "2f655b71f70c496c4e645f6cdb27d7bb7e825701"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "0c6dcaa7f58a19c42a530f4ee14bb6f0f03ca9fb"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "cad11c563d41ebcffb1683fe1f1288b8157413b3"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "jwtan@fb.com",
-                    "name": "Jiewen Tan"
-                  },
-                  "oid": "4dfd0875a68d87fccb5ad0d81692db480043b86e"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "2d37e74690582a4a26890e4c8b98f1f80e589c82"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "jwtan@fb.com",
-                    "name": "Jiewen Tan"
-                  },
-                  "oid": "d4aee60947e1a3ef23c7c42990621e0746fdd0a8"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "peterbell10"
-                    },
-                    "email": "peterbell10@live.co.uk",
-                    "name": "Peter Bell"
-                  },
-                  "oid": "aac6204bf710beb5e50a383d426ae6222396335a"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "4b0362cab884584c24f5834b3874f5f357f56b5d"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "7536df613cbc645a9e68e6a3b0a8450753260fd1"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "20a50cb966d28d7bf82924adf781cf72a01ef90e"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "486387e8644afb46edff5aa5925b55c8119f67f0"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "acb9d78b9b732d3667b881727e6ed9f92a8c549f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "683bb7959a5b973f8470c081ad02e8fc508e784a"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "qihqi"
-                    },
-                    "email": "qihan@fb.com",
-                    "name": "Han Qi"
-                  },
-                  "oid": "a870cb40af65adf0b77d55f6b554d7093d284d7a"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "Krovatkin"
-                    },
-                    "email": "korovaikon@gmail.com",
-                    "name": "Nikolay Korovaiko"
-                  },
-                  "oid": "70793b9f328ddf52cc86336104c3a064c8582ef4"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "suo"
-                    },
-                    "email": "suo@fb.com",
-                    "name": "Michael Suo"
-                  },
-                  "oid": "f70b31f62b1c5159eef2725484b175983517c88c"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mikeyd@fb.com",
-                    "name": "Michael Andreas Dagitses"
-                  },
-                  "oid": "04d3ec1db60defe1c6904bf77e9f8dfa87dc0b63"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "46b754a55b63e3168ad5854ad412c124934b675d"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "robieta"
-                    },
-                    "email": "taylorrobie@fb.com",
-                    "name": "Taylor Robie"
-                  },
-                  "oid": "13df69e13ee571fdd716139419a00aec47ade7d6"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "70642e911ec80a47cdbf4a50aac475c11aa129b6"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
-                  },
-                  "oid": "59bb7c39384bf3e0b284a037adef8b3caa53c1c4"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "007cfb97b55d70ff63e1ed71d1a674638f847376"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
-                  },
-                  "oid": "0a7b858a5af1393fa3cf2853f92eca0e1d408dde"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "qihqi"
-                    },
-                    "email": "qihan@fb.com",
-                    "name": "Han Qi"
-                  },
-                  "oid": "7917d789f0a523715041ade5177d271082628236"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kit1980"
-                    },
-                    "email": "sdym@fb.com",
-                    "name": "Sergii Dymchenko (Meta Employee)"
-                  },
-                  "oid": "91eb6017f0fb8a1b29e8cb48fac93bc9709f73b3"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mikeyd@fb.com",
-                    "name": "Michael Andreas Dagitses"
-                  },
-                  "oid": "bd04dca5fabb0c2a51ac87063a515f256ef274fa"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mikeyd@fb.com",
-                    "name": "Michael Andreas Dagitses"
-                  },
-                  "oid": "1f805a5defda7dabc49d0059edb9ccb06bc29352"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@fb.com",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "4982c0a8db8f23d15ec4bfcbca4ce939afc04954"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pearu"
-                    },
-                    "email": "pearu.peterson@gmail.com",
-                    "name": "Pearu Peterson"
-                  },
-                  "oid": "28502265cb5925cb7db8dcb2dd2334963092714a"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "e03fcaedb1342e6d65c7f7f20243000938ba60b2"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pritamdamania"
-                    },
-                    "email": "pritam.damania@fb.com",
-                    "name": "pritam"
-                  },
-                  "oid": "efb28f5a1a5d18aa96bd668ab2ab5c651be359f3"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "MagiaSN"
-                    },
-                    "email": "magialiao@tencent.com",
-                    "name": "magialiao"
-                  },
-                  "oid": "52cc1b9994f861ebdd3908759ed1ab11cba1f8de"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
-                  },
-                  "oid": "3cd99f23d1acd6a5bedf6f3b02be79d64350a5b6"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "awgu"
-                    },
-                    "email": "andgu@fb.com",
-                    "name": "Andrew Gu"
-                  },
-                  "oid": "b00502c634a5146f4d996bd90e84d317f049e7b0"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "davidberard98"
-                    },
-                    "email": "dberard@fb.com",
-                    "name": "David Berard"
-                  },
-                  "oid": "976eb7cee799dddfbe6a4122b249aaee1b6c8854"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ngimel"
-                    },
-                    "email": "ngimel@fb.com",
-                    "name": "Natalia Gimelshein"
-                  },
-                  "oid": "9608ab28744d5cae32f371490557b248c9549c66"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "4e119f0c39eb5ff0777f0e71561e6b633d85fb34"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "rohan-varma"
-                    },
-                    "email": "rvarm1@fb.com",
-                    "name": "Rohan Varma"
-                  },
-                  "oid": "447580dc565f3660eddb2c996c6ed25b88338684"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "2bc8f43e9233008ea23053fab87b83ab36fca5e3"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "c13a8e891c3e3e714f60649ca1e3b082e090e9fe"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "fddc861b7ee473f57d3c2161e4618a2663a237e8"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "jiyuanzFB"
-                    },
-                    "email": "jiyuanz@fb.com",
-                    "name": "Jiyuan Zhang"
-                  },
-                  "oid": "e2336dbc539d6c021720cbe43c92c9e4c8463299"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@fb.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "26e2759d1ad59aac12168b74d1ca55e42ba9455c"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@fb.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "ad7aa914ee3b3d1252e31514f010ba96c40aae87"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@fb.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "f113c5d78065aafbe7b1c0e611945bfe9f67b3c0"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@fb.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "a366fd01136292544b7862968ae92feba4b6d8fe"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "seemethere"
-                    },
-                    "email": "eliuriegas@fb.com",
-                    "name": "Eli Uriegas"
-                  },
-                  "oid": "afeba0773749da5883c378a2e6ac066e1ce62ca0"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@fb.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "d306c99addc543908f64666baeecacbd0749f4a7"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "awgu"
-                    },
-                    "email": "andgu@fb.com",
-                    "name": "Andrew Gu"
-                  },
-                  "oid": "c2456ea658f41f64ea054a422edf22a9c977399f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "awgu"
-                    },
-                    "email": "andgu@fb.com",
-                    "name": "Andrew Gu"
-                  },
-                  "oid": "a8b0a1b681c9fe41e0d553c962a5c93e81d92503"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "anjali411"
-                    },
-                    "email": "chourdiaanjali123@gmail.com",
-                    "name": "anjali411"
-                  },
-                  "oid": "af761d9a5d058c9188f16589bae4f307d35185be"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "clee2000"
-                    },
-                    "email": "csl@fb.com",
-                    "name": "Catherine Lee"
-                  },
-                  "oid": "beceb417baef35b15c2716e23178fb49f7fd6f9d"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "1516554e22136db89d0aeba43a1a1a987e995d68"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "qihqi"
-                    },
-                    "email": "qihan@fb.com",
-                    "name": "Han Qi"
-                  },
-                  "oid": "68eb1fa8374eff6cbdcf0be5e37ed6775d22e722"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "janeyx99"
-                    },
-                    "email": "janeyx@fb.com",
-                    "name": "Jane Xu"
-                  },
-                  "oid": "3c7bcb99b5c0c879c2610f427880b03881f82f38"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "janeyx99"
-                    },
-                    "email": "janeyx@fb.com",
-                    "name": "Jane Xu"
-                  },
-                  "oid": "38c1a2028090353e40a019c673c9ab16b39e4825"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "albanD"
-                    },
-                    "email": "albandes@fb.com",
-                    "name": "Alban Desmaison"
-                  },
-                  "oid": "8091cbea2c95ed2c4c406b3c61547a27c6319bae"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "d81f59121969a47c8b2213a88e02cf9be0219be9"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "20d798b319cd107a767fe220f7a3027c18a1c844"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "eb35381a770b58c1cd41e935910cb4df2f3d8f14"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
-                  },
-                  "oid": "e6498a657b9aa47546dcd92d1b4ffb2e1a50ebdb"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "7f821382db5ad08efe5b09a145c606852b8a9272"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "albanD"
-                    },
-                    "email": "albandes@fb.com",
-                    "name": "Alban Desmaison"
-                  },
-                  "oid": "995c0e11a97d854ff969962bd81d7341e46ecb07"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "davidberard98"
-                    },
-                    "email": "dberard@fb.com",
-                    "name": "David Berard"
-                  },
-                  "oid": "28d6258e62c9fc361a18689877c962c69889dc23"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "HarborYuan"
-                    },
-                    "email": "yuanhaobo@whu.edu.cn",
-                    "name": "Haobo Yuan"
-                  },
-                  "oid": "2350fad8391367ebf81c7236a2c883644b4ff622"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "zou3519"
-                    },
-                    "email": "zou3519@gmail.com",
-                    "name": "Richard Zou"
-                  },
-                  "oid": "3f789c9ccecdd7e2e52269453646e992a68c6b92"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "jeffdaily"
-                    },
-                    "email": "jeff.daily@amd.com",
-                    "name": "Jeff Daily"
-                  },
-                  "oid": "20f79f610c1a3314da96d49515bbfbee9442e4f8"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "peterbell10"
-                    },
-                    "email": "peterbell10@live.co.uk",
-                    "name": "Peter Bell"
-                  },
-                  "oid": "5823958f047f3b71a5dc8c52a20eb8ae3291bd3e"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "peterbell10"
-                    },
-                    "email": "peterbell10@live.co.uk",
-                    "name": "Peter Bell"
-                  },
-                  "oid": "a0b15c49ecf3844daf2c0dcaef44f0214259db20"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "4afc38c25ca2ca126ba4987a419a58a5c572223b"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "b606f58d4a36683fbe0a7d02adfdde7d5cc694c2"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "albanD"
-                    },
-                    "email": "albandes@fb.com",
-                    "name": "Alban Desmaison"
-                  },
-                  "oid": "2d61b4d630f6482a6c3cc7437091fad6d27c347e"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "george-qi"
-                    },
-                    "email": "georgeqi94@gmail.com",
-                    "name": "George Qi"
-                  },
-                  "oid": "bc5384c47036a6cda94129f3e2f9e43c43393698"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "60fc3277634365b64465712b13db2acb76d6c890"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
-                  },
-                  "oid": "1b8762e95bc38d1847fe99ed3230546c8b800bfd"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "jerryzh168"
-                    },
-                    "email": "jerryzh168@gmail.com",
-                    "name": "Jerry Zhang"
-                  },
-                  "oid": "6acf60f95f59ecbc6e8ce830dea0abba7d3ec763"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ysiraichi"
-                    },
-                    "email": "yukio.siraichi@gmail.com",
-                    "name": "Yukio Siraichi"
-                  },
-                  "oid": "8fb0276561fdd530c5a06ea195e930e0584f8705"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "albanD"
-                    },
-                    "email": "albandes@fb.com",
-                    "name": "Alban Desmaison"
-                  },
-                  "oid": "1da7aed95a8700406671425eac1e4bbc2c7a24b5"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "thiagocrepaldi"
-                    },
-                    "email": "thiago.crepaldi@microsoft.com",
-                    "name": "Thiago Crepaldi"
-                  },
-                  "oid": "83208e7dee4503c1bee1df9f6632794694dffa01"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
-                  },
-                  "oid": "1a46cf08dcd3d3564604c17b2c02d7e4eb45a7ff"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "b7f9b6689445f826c83694652fea5f7cfc7070d7"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "fatcat-z"
-                    },
-                    "email": "jiz@microsoft.com",
-                    "name": "Jay Zhang"
-                  },
-                  "oid": "f273961c1696b156e35f8c76f7ad37934031050d"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pavithranrao"
-                    },
-                    "email": "pavithran@fb.com",
-                    "name": "Pavithran Ramachandran"
-                  },
-                  "oid": "eb410a51fcbc716873fd80a970eb932d4aaaea61"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ngimel"
-                    },
-                    "email": "ngimel@fb.com",
-                    "name": "Natalia Gimelshein"
-                  },
-                  "oid": "7dbb12cdc02332fa64264ed0df576511a5070d7e"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
-                  },
-                  "oid": "43675665fa6b5154de8b25125dd03d7be35c884f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "albanD"
-                    },
-                    "email": "albandes@fb.com",
-                    "name": "Alban Desmaison"
-                  },
-                  "oid": "6c4d23c402c413667463770d9a2fa801f493d3c5"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
-                  },
-                  "oid": "cf3778a35129a40dee14366515201b7ed2c0f346"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "9d00a051373cb81f79cb6375942cf3ec9fff2fe6"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
-                  },
-                  "oid": "1eae67cf404aa8dffb80b8e85180f943878d52a6"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "janeyx99"
-                    },
-                    "email": "janeyx@fb.com",
-                    "name": "Jane Xu"
-                  },
-                  "oid": "ce0e69dcda0fe41a6e964d6ac70ce8016979c71a"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "swolchok"
-                    },
-                    "email": "swolchok@fb.com",
-                    "name": "Scott Wolchok"
-                  },
-                  "oid": "6faba554f6e49777f24911928edb3061b6ed0e3d"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "IvanYashchuk"
-                    },
-                    "email": "ivan.yashchuk@aalto.fi",
-                    "name": "Ivan Yashchuk"
-                  },
-                  "oid": "d1d0e03f57a359f8f95331f9a34b8bed3e7cc845"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "Chillee"
-                    },
-                    "email": "chilli@fb.com",
-                    "name": "Horace He"
-                  },
-                  "oid": "bb46bd9233a9fc631802a902cb48a4c13c2722ca"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "mehtanirav"
-                    },
-                    "email": "niravmehta@fb.com",
-                    "name": "Nirav Mehta"
-                  },
-                  "oid": "3b1007fe4be12e483f2620fbac67cae42e703efc"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "mehtanirav"
-                    },
-                    "email": "niravmehta@fb.com",
-                    "name": "Nirav Mehta"
-                  },
-                  "oid": "b4b65228dd0c109f5fdf17c7d9e56f60a98e398b"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "albanD"
-                    },
-                    "email": "albandes@fb.com",
-                    "name": "Alban Desmaison"
-                  },
-                  "oid": "d629e300705196d3ae0bac5ed983b197101fa2ee"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bigfootjon"
-                    },
-                    "email": "jonjanzen@fb.com",
-                    "name": "Jon Janzen"
-                  },
-                  "oid": "52754b9e515f378f8476ad44d75b0a692bad8cde"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "samdow"
-                    },
-                    "email": "samdow@fb.com",
-                    "name": "samdow"
-                  },
-                  "oid": "128c3ad747093f4970329a82c7c4720420faeff2"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "arindamroy-eng"
-                    },
-                    "email": "61168652+arindamroy-eng@users.noreply.github.com",
-                    "name": "arindamroy-eng"
-                  },
-                  "oid": "2a0bda7d32a5bcc9827f7254a7b77cceb16ba973"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MTAw",
-              "hasNextPage": true
-            },
-            "totalCount": 131
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNRg4=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRAI="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Netlify",
-                            "databaseId": 13473
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRBA="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Azure Pipelines",
-                            "databaseId": 9426
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRB0="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Dependabot",
-                            "databaseId": 29110
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRC0="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Codecov",
-                            "databaseId": 254
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsREE="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "PyTorch Bot",
-                            "databaseId": 40112
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRE4="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2197192463"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192463/jobs/3232430975"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNR-Y=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRsw="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2197192461"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Test collect_env (with_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461134"
-                              },
-                              {
-                                "name": "Test collect_env (without_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461211"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461301"
-                              },
-                              {
-                                "name": "Test tools",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461386"
-                              },
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461521"
-                              },
-                              {
-                                "name": "lintrunner",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461634"
-                              },
-                              {
-                                "name": "workflow-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461717"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuN84s=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRtE="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pull"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2197192471"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232460797"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232460951"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461088"
-                              },
-                              {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461294"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461410"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461543"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461628"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.0-py3.7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461719"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461789"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461869"
-                              },
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461946"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462044"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462112"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462244"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462360"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462432"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462521"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462621"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462683"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462738"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232545510"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232545571"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (cpp)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547522"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (python)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547612"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547714"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547764"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547824"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547869"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547909"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547973"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553452"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553558"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553605"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553650"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232563716"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232563763"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582650"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582703"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582741"
-                              },
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232590204"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.0-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232608872"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.0-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232608976"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637097"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637199"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637259"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232639932"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232687012"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232687074"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232785088"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232785153"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuVD9M=",
-                              "hasNextPage": true
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRuc="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": false
-                    }
-                  },
-                  "status": null,
-                  "pushedDate": "2022-04-20T17:10:41Z",
-                  "oid": "5696e8357cf38f852ef3d680381513e26f202371"
-                }
-              }
-            ]
-          },
-          "changedFiles": 348,
-          "files": {
-            "nodes": [
-              {
-                "path": ".circleci/cimodel/data/pytorch_build_data.py"
-              },
-              {
-                "path": ".circleci/cimodel/data/pytorch_build_definitions.py"
-              },
-              {
-                "path": ".circleci/scripts/cpp_doc_push_script.sh"
-              },
-              {
-                "path": ".circleci/scripts/python_doc_push_script.sh"
-              },
-              {
-                "path": ".github/actions/checkout-pytorch/action.yml"
-              },
-              {
-                "path": ".github/merge_rules.json"
-              },
-              {
-                "path": ".github/scripts/gitutils.py"
-              },
-              {
-                "path": ".github/scripts/gql_mocks.json"
-              },
-              {
-                "path": ".github/scripts/trymerge.py"
-              },
-              {
-                "path": ".github/workflows/_bazel-build-test.yml"
-              },
-              {
-                "path": ".github/workflows/_linux-build.yml"
-              },
-              {
-                "path": ".github/workflows/_linux-test.yml"
-              },
-              {
-                "path": ".github/workflows/_mac-test.yml"
-              },
-              {
-                "path": ".github/workflows/_rocm-test.yml"
-              },
-              {
-                "path": ".github/workflows/_win-test.yml"
-              },
-              {
-                "path": ".github/workflows/buck_build_test.yml"
-              },
-              {
-                "path": ".github/workflows/lint.yml"
-              },
-              {
-                "path": ".github/workflows/periodic.yml"
-              },
-              {
-                "path": ".github/workflows/pull.yml"
-              },
-              {
-                "path": ".github/workflows/trunk.yml"
-              },
-              {
-                "path": ".jenkins/pytorch/macos-test.sh"
-              },
-              {
-                "path": ".jenkins/pytorch/test.sh"
-              },
-              {
-                "path": ".jenkins/pytorch/win-test.sh"
-              },
-              {
-                "path": ".lintrunner.toml"
-              },
-              {
-                "path": "BUILD.bazel"
-              },
-              {
-                "path": "CODEOWNERS"
-              },
-              {
-                "path": "README.md"
-              },
-              {
-                "path": "aten/src/ATen/BatchingRegistrations.cpp"
-              },
-              {
-                "path": "aten/src/ATen/Dispatch.h"
-              },
-              {
-                "path": "aten/src/ATen/ExpandUtils.h"
-              },
-              {
-                "path": "aten/src/ATen/FunctionalInverses.cpp"
-              },
-              {
-                "path": "aten/src/ATen/FunctionalStorageImpl.cpp"
-              },
-              {
-                "path": "aten/src/ATen/FunctionalStorageImpl.h"
-              },
-              {
-                "path": "aten/src/ATen/FunctionalTensorWrapper.cpp"
-              },
-              {
-                "path": "aten/src/ATen/FunctionalTensorWrapper.h"
-              },
-              {
-                "path": "aten/src/ATen/FunctionalizeFallbackKernel.cpp"
-              },
-              {
-                "path": "aten/src/ATen/NestedTensorImpl.cpp"
-              },
-              {
-                "path": "aten/src/ATen/OpMathType.h"
-              },
-              {
-                "path": "aten/src/ATen/SparseCsrTensorUtils.h"
-              },
-              {
-                "path": "aten/src/ATen/ThreadLocalState.cpp"
-              },
-              {
-                "path": "aten/src/ATen/ThreadLocalState.h"
-              },
-              {
-                "path": "aten/src/ATen/autocast_mode.cpp"
-              },
-              {
-                "path": "aten/src/ATen/autocast_mode.h"
-              },
-              {
-                "path": "aten/src/ATen/core/SymIntArrayRef.cpp"
-              },
-              {
-                "path": "aten/src/ATen/core/SymIntArrayRef.h"
-              },
-              {
-                "path": "aten/src/ATen/core/TensorBase.h"
-              },
-              {
-                "path": "aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h"
-              },
-              {
-                "path": "aten/src/ATen/core/dispatch/Dispatcher.h"
-              },
-              {
-                "path": "aten/src/ATen/core/interned_strings.h"
-              },
-              {
-                "path": "aten/src/ATen/core/ivalue.cpp"
-              },
-              {
-                "path": "aten/src/ATen/core/ivalue.h"
-              },
-              {
-                "path": "aten/src/ATen/core/ivalue_inl.h"
-              },
-              {
-                "path": "aten/src/ATen/core/jit_type.h"
-              },
-              {
-                "path": "aten/src/ATen/core/jit_type_base.h"
-              },
-              {
-                "path": "aten/src/ATen/core/type.cpp"
-              },
-              {
-                "path": "aten/src/ATen/cuda/CUDASparse.h"
-              },
-              {
-                "path": "aten/src/ATen/cuda/llvm_complex.cpp"
-              },
-              {
-                "path": "aten/src/ATen/cuda/llvm_jit_strings.h"
-              },
-              {
-                "path": "aten/src/ATen/native/Blas.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/Itertools.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/LinearAlgebra.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/SoftMax.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/TensorConversions.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/TensorShape.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/TensorShape.h"
-              },
-              {
-                "path": "aten/src/ATen/native/Unique.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/CUDAJitLoops.cuh"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/JitLoops.cuh"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/Lerp.cu"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/PersistentSoftmax.cuh"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/SoftMax.cu"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/Unique.cu"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/jit_utils.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/jit_utils.h"
-              },
-              {
-                "path": "aten/src/ATen/native/native_functions.yaml"
-              },
-              {
-                "path": "aten/src/ATen/native/nested/NestedTensorMath.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/quantized/cpu/qsoftmax.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/quantized/cudnn/Linear.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/quantized/cudnn/utils.h"
-              },
-              {
-                "path": "aten/src/ATen/native/sparse/SparseCsrTensor.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/ts_native_functions.yaml"
-              },
-              {
-                "path": "aten/src/ATen/record_function.cpp"
-              },
-              {
-                "path": "aten/src/ATen/record_function.h"
-              },
-              {
-                "path": "aten/src/ATen/templates/Operators.h"
-              },
-              {
-                "path": "aten/src/ATen/templates/RegisterFunctionalization.cpp"
-              },
-              {
-                "path": "aten/src/ATen/test/basic.cpp"
-              },
-              {
-                "path": "aten/src/ATen/test/vmap_test.cpp"
-              },
-              {
-                "path": "binaries/record_function_benchmark.cc"
-              },
-              {
-                "path": "c10/core/DispatchKey.cpp"
-              },
-              {
-                "path": "c10/core/DispatchKey.h"
-              },
-              {
-                "path": "c10/core/DispatchKeySet.h"
-              },
-              {
-                "path": "c10/test/core/DispatchKeySet_test.cpp"
-              },
-              {
-                "path": "c10/util/ArrayRef.h"
-              },
-              {
-                "path": "caffe2/core/tensor.h"
-              },
-              {
-                "path": "docs/source/conf.py"
-              },
-              {
-                "path": "docs/source/fx.rst"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MTAw",
-              "hasNextPage": true
-            }
-          },
-          "reviews": {
-            "nodes": [],
-            "pageInfo": {
-              "startCursor": null,
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "Merge failed due to Matched rule superuser, but it was not reviewed yet by any of:zou3519,abhikrish,mehtanirav,wconstab,lc0, ...",
-                "createdAt": "2022-04-20T17:26:18Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1104215370
-              },
-              {
-                "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet",
-                "createdAt": "2022-04-20T17:31:26Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1104220908
-              },
-              {
-                "bodyText": "@pytorchbot merge this",
-                "createdAt": "2022-04-20T19:30:50Z",
-                "author": {
-                  "login": "malfet"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1104378397
-              },
-              {
-                "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet\nRaised by https://github.com/pytorch/pytorch/actions/runs/2197877090",
-                "createdAt": "2022-04-20T19:32:10Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1104379712
-              },
-              {
-                "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale. Feel free to remove the Stale label if you feel this was a mistake. If you are unable to remove the Stale label please contact a maintainer in order to do so. If you want the bot to never mark this PR stale again, add the no-stale label.Stale pull requests will automatically be closed after 30 days of inactivity.",
-                "createdAt": "2022-06-20T16:44:05Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1160658699
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOQdD9Sg==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              },
-              {
-                "node": {
-                  "name": "Stale"
-                }
-              }
-            ]
-          },
-          "headRef": {
-            "compare": {
-              "commits": {
-                "edges": [
-                  {
-                    "node": {
-                      "parents": {
-                        "edges": [
-                          {
-                            "node": {
-                              "oid": "6afe341276f9ffa660446c5fa15b68558791869a"
-                            }
-                          }
-                        ]
-                      }
-                    }
-                  }
-                ]
-              }
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=74bd29fe945c49fde4818e873fa62bc60b55b4ef6ae3f2bb719bab6cddbaa7ce cursor=MTAw name=pytorch number=76118 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "clee2000"
-                    },
-                    "email": "csl@fb.com",
-                    "name": "Catherine Lee"
-                  },
-                  "oid": "7f560351ae04ea43e58fbfda885bcf216aa26cde"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
-                  },
-                  "oid": "e8677ed168a036bc7e590d800fe98dd15f10581b"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "robieta"
-                    },
-                    "email": "taylorrobie@fb.com",
-                    "name": "Taylor Robie"
-                  },
-                  "oid": "ac5611caa13642ef8dbe0db453b283b42cbd900b"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "robieta"
-                    },
-                    "email": "taylorrobie@fb.com",
-                    "name": "Taylor Robie"
-                  },
-                  "oid": "1184afbd3bfde0f46133aef09e55e18d3bfb3c3e"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "minsii"
-                    },
-                    "email": "msi@fb.com",
-                    "name": "Min Si"
-                  },
-                  "oid": "1c05604f3d049c67dc678d0295c0add470bff3dc"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "eellison@devfair044.h1.fair",
-                    "name": "Elias Ellison"
-                  },
-                  "oid": "76ab5101bd36e8d73637d31bbea125240b7b27f0"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "eellison@devfair044.h1.fair",
-                    "name": "Elias Ellison"
-                  },
-                  "oid": "c774050e92c3d8e52968e1eb635dd3e9491104b3"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "guoyejun"
-                    },
-                    "email": "yejun.guo@intel.com",
-                    "name": "Guo Yejun"
-                  },
-                  "oid": "8981595c5361f07186f4534f3be71f1d829a3046"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "BowenBao"
-                    },
-                    "email": "bowbao@microsoft.com",
-                    "name": "BowenBao"
-                  },
-                  "oid": "036f362904024ac9481248965009f312bec6656b"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "janeyx99"
-                    },
-                    "email": "janeyx@fb.com",
-                    "name": "Jane Xu"
-                  },
-                  "oid": "457d994933f164a9fd70da5ca2733dd6c046a28b"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "janeyx99"
-                    },
-                    "email": "janeyx@fb.com",
-                    "name": "Jane Xu"
-                  },
-                  "oid": "f49ebc77520774e71722111d554a0215a26956df"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "mikeiovine"
-                    },
-                    "email": "mikeiovine@fb.com",
-                    "name": "Mike Iovine"
-                  },
-                  "oid": "f069e1a4a5f98d3fe961e4fc562ede59f59b4026"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "salilsdesai"
-                    },
-                    "email": "salilsdesai@fb.com",
-                    "name": "Salil Desai"
-                  },
-                  "oid": "30bccf58393b288412a0f5a2423a1a41ffce258e"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "angelayi"
-                    },
-                    "email": "angelayi@fb.com",
-                    "name": "Angela Yi"
-                  },
-                  "oid": "f4ba440fe8a632c1ee88e01f7746a8a92c8f3902"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "shirong@fb.com",
-                    "name": "Shirong Wu"
-                  },
-                  "oid": "d203346c93ba96d626c6c02910888198c789ba69"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "jamesreed@fb.com",
-                    "name": "James Reed"
-                  },
-                  "oid": "73a4e34963e212b799a191fd031d2fa31d17e0ac"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "Krovatkin"
-                    },
-                    "email": "korovaikon@gmail.com",
-                    "name": "Nikolay Korovaiko"
-                  },
-                  "oid": "b9d5206dfb46f09f953aba3ffb0e1e33a99032ee"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ngimel"
-                    },
-                    "email": "ngimel@fb.com",
-                    "name": "Natalia Gimelshein"
-                  },
-                  "oid": "12114e6937573fead54e11ae6cdebe5b31dee302"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "s4ayub"
-                    },
-                    "email": "shababayub@fb.com",
-                    "name": "Shabab Ayub"
-                  },
-                  "oid": "f2323f76ad6f7f590285bf9c6d20c14a79542563"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "jaglinux"
-                    },
-                    "email": "jagdish.krishna@gmail.com",
-                    "name": "Jagadish Krishnamoorthy"
-                  },
-                  "oid": "acd4b5abe2739c09c1a02524eceda46ff93fd385"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "cccclai"
-                    },
-                    "email": "chenlai@fb.com",
-                    "name": "Chen Lai"
-                  },
-                  "oid": "04179f533283132fa334a9f91a070b1712f7323d"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "zaxtax"
-                    },
-                    "email": "rob@zinkov.com",
-                    "name": "Rob Zinkov"
-                  },
-                  "oid": "5097cdcd6994ad82b3cec942b70e75dbeaee8ca4"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "5015ecb5a2b86943f457d71f5a977444dd062732"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "1c42b7789d3966cd541b08fce359b9738fee69f6"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "albanD"
-                    },
-                    "email": "albandes@fb.com",
-                    "name": "Alban Desmaison"
-                  },
-                  "oid": "893ac3d334fd3e85e22423a06fe986ce453fe304"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "emcastillo"
-                    },
-                    "email": "ecastill@preferred.jp",
-                    "name": "Emilio Castillo"
-                  },
-                  "oid": "aa5d1b6b031ee2b8bb85f793a842ac1327ae4a19"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "0707a1d00f33d7098f56de339cb30436e8c2ea44"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "NivekT"
-                    },
-                    "email": "ktse@fb.com",
-                    "name": "Kevin Tse"
-                  },
-                  "oid": "ccb082d42af99f6374183cf914cc712bac585f0f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ryandaryl"
-                    },
-                    "email": "ryandarylmills@gmail.com",
-                    "name": "ryandaryl"
-                  },
-                  "oid": "4f2909cc8747808786a1871b0a6825cc4566f48c"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "clee2000"
-                    },
-                    "email": "csl@fb.com",
-                    "name": "Catherine Lee"
-                  },
-                  "oid": "f764010648a29223d9ed4b955073d9d2fb1b2f43"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "5696e8357cf38f852ef3d680381513e26f202371"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MTMx",
-              "hasNextPage": false
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MTAw name=pytorch number=76118 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "files": {
-            "nodes": [
-              {
-                "path": "docs/source/quantization.rst"
-              },
-              {
-                "path": "docs/source/scripts/build_quantization_configs.py"
-              },
-              {
-                "path": "test/allowlist_for_publicAPI.json"
-              },
-              {
-                "path": "test/cpp/jit/source_range_test.cpp"
-              },
-              {
-                "path": "test/cpp/jit/test_backend.cpp"
-              },
-              {
-                "path": "test/cpp/jit/test_flatbuffer.cpp"
-              },
-              {
-                "path": "test/cpp/jit/test_misc.cpp"
-              },
-              {
-                "path": "test/cpp/jit/test_utils.h"
-              },
-              {
-                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff"
-              },
-              {
-                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff"
-              },
-              {
-                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff"
-              },
-              {
-                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff"
-              },
-              {
-                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff"
-              },
-              {
-                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff"
-              },
-              {
-                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff"
-              },
-              {
-                "path": "test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff"
-              },
-              {
-                "path": "test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff"
-              },
-              {
-                "path": "test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff"
-              },
-              {
-                "path": "test/cpp/profiler/record_function.cpp"
-              },
-              {
-                "path": "test/distributed/_shard/sharded_tensor/test_sharded_tensor.py"
-              },
-              {
-                "path": "test/distributed/_shard/test_replicated_tensor.py"
-              },
-              {
-                "path": "test/distributed/fsdp/test_fsdp_comm.py"
-              },
-              {
-                "path": "test/distributed/fsdp/test_fsdp_optim_state.py"
-              },
-              {
-                "path": "test/distributed/optim/test_zero_redundancy_optimizer.py"
-              },
-              {
-                "path": "test/jit/test_export_modes.py"
-              },
-              {
-                "path": "test/jit/test_if_hoisting.py"
-              },
-              {
-                "path": "test/jit/test_tracer.py"
-              },
-              {
-                "path": "test/jit/test_upgraders.py"
-              },
-              {
-                "path": "test/mobile/test_lite_script_type.py"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_layer_norm_aten.expect"
-              },
-              {
-                "path": "test/onnx/test_operators.py"
-              },
-              {
-                "path": "test/onnx/test_pytorch_onnx_onnxruntime.py"
-              },
-              {
-                "path": "test/quantization/ao_migration/test_quantization_fx.py"
-              },
-              {
-                "path": "test/quantization/core/test_quantized_op.py"
-              },
-              {
-                "path": "test/quantization/core/test_quantized_tensor.py"
-              },
-              {
-                "path": "test/quantization/fx/test_numeric_suite_fx.py"
-              },
-              {
-                "path": "test/quantization/fx/test_quantize_fx.py"
-              },
-              {
-                "path": "test/test_autograd.py"
-              },
-              {
-                "path": "test/test_binary_ufuncs.py"
-              },
-              {
-                "path": "test/test_expanded_weights.py"
-              },
-              {
-                "path": "test/test_functionalization.py"
-              },
-              {
-                "path": "test/test_fx_experimental.py"
-              },
-              {
-                "path": "test/test_jit.py"
-              },
-              {
-                "path": "test/test_jit_cuda_fuser.py"
-              },
-              {
-                "path": "test/test_linalg.py"
-              },
-              {
-                "path": "test/test_nestedtensor.py"
-              },
-              {
-                "path": "test/test_nn.py"
-              },
-              {
-                "path": "test/test_ops.py"
-              },
-              {
-                "path": "test/test_ops_gradients.py"
-              },
-              {
-                "path": "test/test_ops_jit.py"
-              },
-              {
-                "path": "test/test_optim.py"
-              },
-              {
-                "path": "test/test_overrides.py"
-              },
-              {
-                "path": "test/test_profiler.py"
-              },
-              {
-                "path": "test/test_public_bindings.py"
-              },
-              {
-                "path": "test/test_pytree.py"
-              },
-              {
-                "path": "test/test_reductions.py"
-              },
-              {
-                "path": "test/test_sort_and_select.py"
-              },
-              {
-                "path": "test/test_sparse.py"
-              },
-              {
-                "path": "test/test_sparse_csr.py"
-              },
-              {
-                "path": "test/test_spectral_ops.py"
-              },
-              {
-                "path": "test/test_tensor_creation_ops.py"
-              },
-              {
-                "path": "test/test_tensorboard.py"
-              },
-              {
-                "path": "test/test_testing.py"
-              },
-              {
-                "path": "test/test_torch.py"
-              },
-              {
-                "path": "test/test_unary_ufuncs.py"
-              },
-              {
-                "path": "third_party/BUCK.github"
-              },
-              {
-                "path": "third_party/fbgemm"
-              },
-              {
-                "path": "tools/autograd/derivatives.yaml"
-              },
-              {
-                "path": "tools/autograd/gen_inplace_or_view_type.py"
-              },
-              {
-                "path": "tools/autograd/load_derivatives.py"
-              },
-              {
-                "path": "tools/build_variables.bzl"
-              },
-              {
-                "path": "tools/codegen/api/autograd.py"
-              },
-              {
-                "path": "tools/codegen/api/cpp.py"
-              },
-              {
-                "path": "tools/codegen/api/dispatcher.py"
-              },
-              {
-                "path": "tools/codegen/api/functionalization.py"
-              },
-              {
-                "path": "tools/codegen/api/lazy.py"
-              },
-              {
-                "path": "tools/codegen/api/meta.py"
-              },
-              {
-                "path": "tools/codegen/api/native.py"
-              },
-              {
-                "path": "tools/codegen/api/python.py"
-              },
-              {
-                "path": "tools/codegen/api/structured.py"
-              },
-              {
-                "path": "tools/codegen/api/translate.py"
-              },
-              {
-                "path": "tools/codegen/api/types.py"
-              },
-              {
-                "path": "tools/codegen/api/ufunc.py"
-              },
-              {
-                "path": "tools/codegen/api/unboxing.py"
-              },
-              {
-                "path": "tools/codegen/code_template.py"
-              },
-              {
-                "path": "tools/codegen/context.py"
-              },
-              {
-                "path": "tools/codegen/decompositions/gen_jit_decompositions.py"
-              },
-              {
-                "path": "tools/codegen/dest/__init__.py"
-              },
-              {
-                "path": "tools/codegen/dest/lazy_ir.py"
-              },
-              {
-                "path": "tools/codegen/dest/lazy_ts_lowering.py"
-              },
-              {
-                "path": "tools/codegen/dest/native_functions.py"
-              },
-              {
-                "path": "tools/codegen/dest/register_dispatch_key.py"
-              },
-              {
-                "path": "tools/codegen/dest/ufunc.py"
-              },
-              {
-                "path": "tools/codegen/gen.py"
-              },
-              {
-                "path": "tools/codegen/gen_backend_stubs.py"
-              },
-              {
-                "path": "tools/codegen/gen_functionalization_type.py"
-              },
-              {
-                "path": "tools/codegen/gen_lazy_tensor.py"
-              },
-              {
-                "path": "tools/codegen/local.py"
-              },
-              {
-                "path": "tools/codegen/model.py"
-              },
-              {
-                "path": "tools/codegen/operator_versions/gen_mobile_upgraders.py"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MjAw",
-              "hasNextPage": true
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MjAw name=pytorch number=76118 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "files": {
-            "nodes": [
-              {
-                "path": "tools/codegen/selective_build/operator.py"
-              },
-              {
-                "path": "tools/codegen/selective_build/selector.py"
-              },
-              {
-                "path": "tools/codegen/shape_functions/gen_jit_shape_functions.py"
-              },
-              {
-                "path": "tools/codegen/static_runtime/config.py"
-              },
-              {
-                "path": "tools/codegen/static_runtime/gen_static_runtime_ops.py"
-              },
-              {
-                "path": "tools/codegen/static_runtime/gen_structured.py"
-              },
-              {
-                "path": "tools/codegen/utils.py"
-              },
-              {
-                "path": "tools/linter/adapters/circleci_linter.py"
-              },
-              {
-                "path": "tools/linter/adapters/clangformat_linter.py"
-              },
-              {
-                "path": "tools/linter/adapters/grep_linter.py"
-              },
-              {
-                "path": "tools/linter/adapters/nativefunctions_linter.py"
-              },
-              {
-                "path": "tools/setup_helpers/BUILD.bazel"
-              },
-              {
-                "path": "tools/setup_helpers/generate_code.py"
-              },
-              {
-                "path": "torch/_C/__init__.pyi.in"
-              },
-              {
-                "path": "torch/amp/autocast_mode.py"
-              },
-              {
-                "path": "torch/ao/ns/fx/pattern_utils.py"
-              },
-              {
-                "path": "torch/ao/quantization/backend_config/README.md"
-              },
-              {
-                "path": "torch/ao/quantization/backend_config/__init__.py"
-              },
-              {
-                "path": "torch/ao/quantization/backend_config/native.py"
-              },
-              {
-                "path": "torch/ao/quantization/backend_config/observation_type.py"
-              },
-              {
-                "path": "torch/ao/quantization/backend_config/tensorrt.py"
-              },
-              {
-                "path": "torch/ao/quantization/backend_config/utils.py"
-              },
-              {
-                "path": "torch/ao/quantization/fx/__init__.py"
-              },
-              {
-                "path": "torch/ao/quantization/fx/backend_config/fuse_handler.py"
-              },
-              {
-                "path": "torch/ao/quantization/fx/backend_config/quantize_handler.py"
-              },
-              {
-                "path": "torch/ao/quantization/fx/backend_config_utils.py"
-              },
-              {
-                "path": "torch/ao/quantization/fx/convert.py"
-              },
-              {
-                "path": "torch/ao/quantization/fx/fuse.py"
-              },
-              {
-                "path": "torch/ao/quantization/fx/fusion_patterns.py"
-              },
-              {
-                "path": "torch/ao/quantization/fx/match_utils.py"
-              },
-              {
-                "path": "torch/ao/quantization/fx/pattern_utils.py"
-              },
-              {
-                "path": "torch/ao/quantization/fx/prepare.py"
-              },
-              {
-                "path": "torch/ao/quantization/fx/quantization_patterns.py"
-              },
-              {
-                "path": "torch/ao/quantization/qconfig.py"
-              },
-              {
-                "path": "torch/ao/quantization/quantization_types.py"
-              },
-              {
-                "path": "torch/ao/quantization/quantize_fx.py"
-              },
-              {
-                "path": "torch/autograd/__init__.py"
-              },
-              {
-                "path": "torch/csrc/Module.cpp"
-              },
-              {
-                "path": "torch/csrc/autograd/FunctionsManual.cpp"
-              },
-              {
-                "path": "torch/csrc/autograd/FunctionsManual.h"
-              },
-              {
-                "path": "torch/csrc/autograd/engine.cpp"
-              },
-              {
-                "path": "torch/csrc/autograd/function.h"
-              },
-              {
-                "path": "torch/csrc/autograd/functions/accumulate_grad.h"
-              },
-              {
-                "path": "torch/csrc/autograd/init.cpp"
-              },
-              {
-                "path": "torch/csrc/autograd/python_torch_functions_manual.cpp"
-              },
-              {
-                "path": "torch/csrc/autograd/python_variable.cpp"
-              },
-              {
-                "path": "torch/csrc/autograd/record_function_ops.h"
-              },
-              {
-                "path": "torch/csrc/autograd/utils/grad_layout_contract.h"
-              },
-              {
-                "path": "torch/csrc/deploy/CMakeLists.txt"
-              },
-              {
-                "path": "torch/csrc/distributed/c10d/logger.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/cuda/graph_fuser.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/cuda/parser.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/frontend/function_schema_parser.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/frontend/lexer.h"
-              },
-              {
-                "path": "torch/csrc/jit/frontend/parser.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/frontend/parser.h"
-              },
-              {
-                "path": "torch/csrc/jit/frontend/script_type_parser.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/frontend/source_range.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/frontend/source_range.h"
-              },
-              {
-                "path": "torch/csrc/jit/frontend/source_ref.h"
-              },
-              {
-                "path": "torch/csrc/jit/frontend/tracer.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/frontend/tracer.h"
-              },
-              {
-                "path": "torch/csrc/jit/mobile/debug_info.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/mobile/debug_info.h"
-              },
-              {
-                "path": "torch/csrc/jit/mobile/flatbuffer_loader.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/mobile/module.h"
-              },
-              {
-                "path": "torch/csrc/jit/passes/common_expression_hoisting.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/passes/common_expression_hoisting.h"
-              },
-              {
-                "path": "torch/csrc/jit/passes/frozen_graph_optimizations.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/python/init.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/python/python_tree_views.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/python/script_init.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/graph_executor.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/interpreter.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/script_profile.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/serialized_shape_function_registry.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/serialized_shape_function_registry.h"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/shape_function_registry.h"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/shape_functions.h"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/shape_functions_1.h"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/static/impl.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/static/passes.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/symbolic_shape_registry.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/runtime/symbolic_shape_registry.h"
-              },
-              {
-                "path": "torch/csrc/jit/serialization/export_module.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/serialization/flatbuffer_serializer.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/serialization/import.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/serialization/import_export_helpers.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/serialization/import_export_helpers.h"
-              },
-              {
-                "path": "torch/csrc/jit/serialization/import_source.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/serialization/import_source.h"
-              },
-              {
-                "path": "torch/csrc/jit/serialization/source_range_serialization.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/serialization/source_range_serialization.h"
-              },
-              {
-                "path": "torch/csrc/jit/testing/file_check.cpp"
-              },
-              {
-                "path": "torch/csrc/lazy/core/dynamic_ir.cpp"
-              },
-              {
-                "path": "torch/csrc/lazy/core/dynamic_ir.h"
-              },
-              {
-                "path": "torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MzAw",
-              "hasNextPage": true
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MzAw name=pytorch number=76118 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "files": {
-            "nodes": [
-              {
-                "path": "torch/csrc/lazy/ts_backend/ts_native_functions.cpp"
-              },
-              {
-                "path": "torch/csrc/utils/python_arg_parser.cpp"
-              },
-              {
-                "path": "torch/csrc/utils/python_arg_parser.h"
-              },
-              {
-                "path": "torch/csrc/utils/tensor_list.cpp"
-              },
-              {
-                "path": "torch/csrc/utils/tensor_new.cpp"
-              },
-              {
-                "path": "torch/csrc/utils/tensor_new.h"
-              },
-              {
-                "path": "torch/distributed/_shard/__init__.py"
-              },
-              {
-                "path": "torch/distributed/_shard/api.py"
-              },
-              {
-                "path": "torch/distributed/_shard/replicated_tensor.py"
-              },
-              {
-                "path": "torch/distributed/_shard/sharded_tensor/__init__.py"
-              },
-              {
-                "path": "torch/distributed/_shard/sharded_tensor/api.py"
-              },
-              {
-                "path": "torch/distributed/_shard/sharded_tensor/utils.py"
-              },
-              {
-                "path": "torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py"
-              },
-              {
-                "path": "torch/distributed/algorithms/model_averaging/utils.py"
-              },
-              {
-                "path": "torch/distributed/fsdp/_optim_utils.py"
-              },
-              {
-                "path": "torch/distributed/fsdp/fully_sharded_data_parallel.py"
-              },
-              {
-                "path": "torch/distributed/nn/__init__.py"
-              },
-              {
-                "path": "torch/distributed/nn/functional.py"
-              },
-              {
-                "path": "torch/distributed/optim/functional_adagrad.py"
-              },
-              {
-                "path": "torch/fx/experimental/meta_tracer.py"
-              },
-              {
-                "path": "torch/fx/graph.py"
-              },
-              {
-                "path": "torch/jit/_shape_functions.py"
-              },
-              {
-                "path": "torch/nn/parallel/_replicated_tensor_ddp_interop.py"
-              },
-              {
-                "path": "torch/nn/parallel/_replicated_tensor_ddp_utils.py"
-              },
-              {
-                "path": "torch/nn/parallel/distributed.py"
-              },
-              {
-                "path": "torch/nn/utils/_expanded_weights/__init__.py"
-              },
-              {
-                "path": "torch/nn/utils/_expanded_weights/instance_norm_expanded_weights.py"
-              },
-              {
-                "path": "torch/onnx/symbolic_opset11.py"
-              },
-              {
-                "path": "torch/onnx/symbolic_opset12.py"
-              },
-              {
-                "path": "torch/onnx/symbolic_opset9.py"
-              },
-              {
-                "path": "torch/optim/adagrad.py"
-              },
-              {
-                "path": "torch/optim/lr_scheduler.py"
-              },
-              {
-                "path": "torch/overrides.py"
-              },
-              {
-                "path": "torch/quantization/fx/pattern_utils.py"
-              },
-              {
-                "path": "torch/quantization/fx/quantization_patterns.py"
-              },
-              {
-                "path": "torch/quantization/fx/quantization_types.py"
-              },
-              {
-                "path": "torch/return_types.py"
-              },
-              {
-                "path": "torch/testing/_internal/common_device_type.py"
-              },
-              {
-                "path": "torch/testing/_internal/common_distributed.py"
-              },
-              {
-                "path": "torch/testing/_internal/common_fx2trt.py"
-              },
-              {
-                "path": "torch/testing/_internal/common_methods_invocations.py"
-              },
-              {
-                "path": "torch/testing/_internal/common_utils.py"
-              },
-              {
-                "path": "torch/testing/_internal/composite_compliance.py"
-              },
-              {
-                "path": "torch/testing/_internal/distributed/distributed_test.py"
-              },
-              {
-                "path": "torch/testing/_internal/jit_metaprogramming_utils.py"
-              },
-              {
-                "path": "torch/utils/cpp_extension.py"
-              },
-              {
-                "path": "torch/utils/data/datapipes/_typing.py"
-              },
-              {
-                "path": "torch/utils/model_dump/__init__.py"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MzQ4",
-              "hasNextPage": false
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAWuVD9M= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAXEsRtE= name=pytorch number=76118 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "5696e8357cf38f852ef3d680381513e26f202371",
-                  "checkSuites": {
-                    "nodes": [
-                      {
-                        "checkRuns": {
-                          "nodes": [
-                            {
-                              "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232785220"
-                            }
-                          ],
-                          "pageInfo": {
-                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuVECw=",
-                            "hasNextPage": false
-                          }
-                        }
-                      }
-                    ]
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=82169 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "ezyang"
-          },
-          "title": "Move test_dtypes so it runs later",
-          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):\n* __->__ #82169\n\nThe error messages it gives are very unhelpful (because a failure\ngets translated into \"dtype was not supported\" rather than the\nactual backtrace), so I'd rather get error messages about this after\nI've tested basic functionality.\n\nSigned-off-by: Edward Z. Yang <ezyang@fb.com>",
-          "headRefName": "gh/ezyang/1279/head",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "gh/ezyang/1279/base",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "cef34da55a59da5a32494bff218ccd4978b659d3"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "83ad7e73a07111ac1d85e931d14360cc22c01edd"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "28140e4008289251b695385acfb48ac7a47cd49c"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "Mw",
-              "hasNextPage": false
-            },
-            "totalCount": 3
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747823981"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "lintrunner",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310707890"
-                              },
-                              {
-                                "name": "Test collect_env (with_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708140"
-                              },
-                              {
-                                "name": "Test collect_env (without_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708223"
-                              },
-                              {
-                                "name": "Test collect_env (older_python_version)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708332"
-                              },
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708496"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708710"
-                              },
-                              {
-                                "name": "Test tools",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708937"
-                              },
-                              {
-                                "name": "workflow-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310709169"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGj1lc=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc8k="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747823979"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc8s="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747823982"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823982/jobs/4310707884"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGjz0w=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc9A="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747823980"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc9Q="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pull"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747824002"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdAs="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pull"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747824048"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708487"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708713"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708942"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709174"
-                              },
-                              {
-                                "name": "linux-bionic-py3_7-clang8-xla / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709340"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709579"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709844"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710003"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710175"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710516"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710716"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710890"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711097"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711234"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711429"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.2-py3.7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711603"
-                              },
-                              {
-                                "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711765"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711946"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11_3-py3_7-gcc7-deploy / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310712129"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310712276"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194495"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194591"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194659"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194749"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194858"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194934"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311195003"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311220458"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311220540"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (cpp)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311222725"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (python)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311222869"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223128"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223225"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223324"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223396"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223496"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223569"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223690"
-                              },
-                              {
-                                "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311224360"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311230050"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311301930"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302152"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302303"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302433"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302531"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491082"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491172"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491232"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491289"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491348"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcG0YME=",
-                              "hasNextPage": true
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdIQ="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGjyQg=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdMA="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Netlify",
-                            "databaseId": 13473
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdeE="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Azure Pipelines",
-                            "databaseId": 9426
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdfU="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Dependabot",
-                            "databaseId": 29110
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdgg="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  },
-                  "status": null,
-                  "pushedDate": "2022-07-27T15:34:17Z",
-                  "oid": "28140e4008289251b695385acfb48ac7a47cd49c"
-                }
-              }
-            ]
-          },
-          "changedFiles": 1,
-          "files": {
-            "nodes": [
-              {
-                "path": "test/test_ops.py"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MQ",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
-              {
-                "author": {
-                  "login": "zou3519"
-                },
-                "state": "APPROVED"
-              },
-              {
-                "author": {
-                  "login": "Chillee"
-                },
-                "state": "APPROVED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNy0yNVQxNDo0NTozNS0wNzowMLkyMDIyLTA3LTI1VDE0OjQ1OjM1LTA3OjAwzj6XYmg=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "@pytorchbot merge -f FORCE",
-                "createdAt": "2022-07-27T17:56:43Z",
-                "author": {
-                  "login": "malfet"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1197107402
-              },
-              {
-                "bodyText": "You need to provide a reason for using force merge, in the format @pytorchbot merge -f '[CATEGORY] Explanation'. With [CATEGORY] being one the following:\nEMERGENCY - an emergency fix to quickly address an issue\nMINOR - a minor fix such as cleaning locally unused variables, which shouldn't break anything\nPRE_TESTED - a previous CI run tested everything and you've only added minor changes like fixing lint\nOTHER - something not covered above",
-                "createdAt": "2022-07-27T17:56:45Z",
-                "author": {
-                  "login": "pytorch-bot"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1197107439
-              },
-              {
-                "bodyText": "@pytorchbot merge -f \"[OTHER] normal land failed twice already\"",
-                "createdAt": "2022-07-27T17:57:28Z",
-                "author": {
-                  "login": "malfet"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1197108130
-              },
-              {
-                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here",
-                "createdAt": "2022-07-27T18:08:13Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1197119348
-              },
-              {
-                "bodyText": "Hey @ezyang.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-07-27T18:08:58Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1197120095
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOR1poyg==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "Merged"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              }
-            ]
-          },
-          "headRef": null
-        }
-      }
-    }
-  },
-  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAcG0YME= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAcHRdAs= name=pytorch number=82169 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "28140e4008289251b695385acfb48ac7a47cd49c",
-                  "checkSuites": {
-                    "nodes": [
-                      {
-                        "checkRuns": {
-                          "nodes": [
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491405"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491484"
-                            },
-                            {
-                              "name": "linux-xenial-cuda11_3-py3_7-gcc7-deploy / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491703"
-                            },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311551941"
-                            },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311552010"
-                            },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (functorch, 1, 1, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311552076"
-                            }
-                          ],
-                          "pageInfo": {
-                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcG1sTc=",
-                            "hasNextPage": false
-                          }
-                        }
-                      }
-                    ]
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAcHRdgg= name=pytorch number=82169 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "28140e4008289251b695385acfb48ac7a47cd49c",
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Codecov",
-                            "databaseId": 254
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdhg="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "PyTorch Bot",
-                            "databaseId": 40112
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdic="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": false
-                    }
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=eb979626157e70cf52d29cf16eaa852bedf0f29b1831e9021e1bf3e7457be7fd commit=6882717f73deffb692219ccd1fd6db258d8ed684 name=pytorch owner=pytorch": {
-    "data": {
-      "repository": {
-        "object": {
-          "checkSuites": {
-            "edges": [
-              {
-                "node": {
-                  "app": {
-                    "name": "Facebook GitHub Tools",
-                    "databaseId": 12274
-                  },
-                  "workflowRun": null,
-                  "checkRuns": {
-                    "nodes": [],
-                    "pageInfo": {
-                      "endCursor": null,
-                      "hasNextPage": false
-                    }
-                  },
-                  "conclusion": null
-                },
-                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hng="
-              },
-              {
-                "node": {
-                  "app": {
-                    "name": "Netlify",
-                    "databaseId": 13473
-                  },
-                  "workflowRun": null,
-                  "checkRuns": {
-                    "nodes": [],
-                    "pageInfo": {
-                      "endCursor": null,
-                      "hasNextPage": false
-                    }
-                  },
-                  "conclusion": null
-                },
-                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hpE="
-              },
-              {
-                "node": {
-                  "app": {
-                    "name": "Azure Pipelines",
-                    "databaseId": 9426
-                  },
-                  "workflowRun": null,
-                  "checkRuns": {
-                    "nodes": [],
-                    "pageInfo": {
-                      "endCursor": null,
-                      "hasNextPage": false
-                    }
-                  },
-                  "conclusion": null
-                },
-                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hpw="
-              },
-              {
-                "node": {
-                  "app": {
-                    "name": "Dependabot",
-                    "databaseId": 29110
-                  },
-                  "workflowRun": null,
-                  "checkRuns": {
-                    "nodes": [],
-                    "pageInfo": {
-                      "endCursor": null,
-                      "hasNextPage": false
-                    }
-                  },
-                  "conclusion": null
-                },
-                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hrA="
-              },
-              {
-                "node": {
-                  "app": {
-                    "name": "Codecov",
-                    "databaseId": 254
-                  },
-                  "workflowRun": null,
-                  "checkRuns": {
-                    "nodes": [],
-                    "pageInfo": {
-                      "endCursor": null,
-                      "hasNextPage": false
-                    }
-                  },
-                  "conclusion": null
-                },
-                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hsM="
-              },
-              {
-                "node": {
-                  "app": {
-                    "name": "PyTorch Bot",
-                    "databaseId": 40112
-                  },
-                  "workflowRun": null,
-                  "checkRuns": {
-                    "nodes": [],
-                    "pageInfo": {
-                      "endCursor": null,
-                      "hasNextPage": false
-                    }
-                  },
-                  "conclusion": null
-                },
-                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hs0="
-              },
-              {
-                "node": {
-                  "app": {
-                    "name": "GitHub Actions",
-                    "databaseId": 15368
-                  },
-                  "workflowRun": {
-                    "workflow": {
-                      "name": "Lint"
-                    },
-                    "url": "https://github.com/pytorch/pytorch/actions/runs/2638241883"
-                  },
-                  "checkRuns": {
-                    "nodes": [
-                      {
-                        "name": "workflow-checks",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095495959"
-                      },
-                      {
-                        "name": "quick-checks",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496003"
-                      },
-                      {
-                        "name": "Test tools",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496162"
-                      },
-                      {
-                        "name": "toc",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496320"
-                      },
-                      {
-                        "name": "Test collect_env (with_torch)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496465"
-                      },
-                      {
-                        "name": "Test collect_env (without_torch)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496523"
-                      },
-                      {
-                        "name": "Test collect_env (older_python_version)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496558"
-                      },
-                      {
-                        "name": "lintrunner",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496708"
-                      }
-                    ],
-                    "pageInfo": {
-                      "endCursor": "Y3Vyc29yOnYyOpHPAAAAAbCVA2Y=",
-                      "hasNextPage": false
-                    }
-                  },
-                  "conclusion": "SUCCESS"
-                },
-                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hzg="
-              },
-              {
-                "node": {
-                  "app": {
-                    "name": "GitHub Actions",
-                    "databaseId": 15368
-                  },
-                  "workflowRun": {
-                    "workflow": {
-                      "name": "trunk"
-                    },
-                    "url": "https://github.com/pytorch/pytorch/actions/runs/2638241915"
-                  },
-                  "checkRuns": {
-                    "nodes": [
-                      {
-                        "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095496376"
-                      },
-                      {
-                        "name": "android-emulator-build-test / build-and-test",
-                        "conclusion": "FAILURE",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095496525"
-                      },
-                      {
-                        "name": "linux-xenial-cuda11.3-py3.7-gcc7-no-ops / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095496611"
-                      },
-                      {
-                        "name": "macos-10-15-py3-arm64 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095496713"
-                      },
-                      {
-                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095496857"
-                      },
-                      {
-                        "name": "ios-12-5-1-x86-64 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095497178"
-                      },
-                      {
-                        "name": "libtorch-linux-bionic-cuda11.6-py3.7-gcc7 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095497392"
-                      },
-                      {
-                        "name": "win-vs2019-cuda11.6-py3 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095497580"
-                      },
-                      {
-                        "name": "libtorch-linux-xenial-cuda10.2-py3.7-gcc7 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095497781"
-                      },
-                      {
-                        "name": "linux-bionic-py3.7-clang9-slow / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095497886"
-                      },
-                      {
-                        "name": "linux-bionic-rocm5.1-py3.7 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095497997"
-                      },
-                      {
-                        "name": "macos-10-15-py3-lite-interpreter-x86-64 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095498146"
-                      },
-                      {
-                        "name": "macos-11-py3-x86-64 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095498338"
-                      },
-                      {
-                        "name": "caffe2-linux-focal-py3.7-gcc7 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095498448"
-                      },
-                      {
-                        "name": "parallelnative-linux-focal-py3.7-gcc7 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095498648"
-                      },
-                      {
-                        "name": "parallelnative-linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095659992"
-                      },
-                      {
-                        "name": "parallelnative-linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095660077"
-                      },
-                      {
-                        "name": "linux-bionic-py3.7-clang9-slow / test (slow, 1, 1, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095798458"
-                      },
-                      {
-                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840103"
-                      },
-                      {
-                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840227"
-                      },
-                      {
-                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (slow, 1, 1, linux.4xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840377"
-                      },
-                      {
-                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (nogpu_AVX512, 1, 1, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840521"
-                      },
-                      {
-                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (nogpu_NO_AVX2, 1, 1, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840605"
-                      },
-                      {
-                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (jit_legacy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840689"
-                      },
-                      {
-                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840741"
-                      },
-                      {
-                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840795"
-                      },
-                      {
-                        "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095874982"
-                      },
-                      {
-                        "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875042"
-                      },
-                      {
-                        "name": "win-vs2019-cuda11.6-py3 / test (default, 1, 5, windows.8xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875174"
-                      },
-                      {
-                        "name": "win-vs2019-cuda11.6-py3 / test (default, 2, 5, windows.8xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875221"
-                      },
-                      {
-                        "name": "win-vs2019-cuda11.6-py3 / test (default, 3, 5, windows.8xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875266"
-                      },
-                      {
-                        "name": "win-vs2019-cuda11.6-py3 / test (default, 4, 5, windows.8xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875320"
-                      },
-                      {
-                        "name": "win-vs2019-cuda11.6-py3 / test (default, 5, 5, windows.8xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875369"
-                      },
-                      {
-                        "name": "win-vs2019-cuda11.6-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875417"
-                      },
-                      {
-                        "name": "macos-12.3-py3.8-arm64-test / Run MPS tests",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4096110771"
-                      },
-                      {
-                        "name": "macos-11-py3-x86-64 / test (default, 1, 2, macos-12)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4096408234"
-                      },
-                      {
-                        "name": "macos-11-py3-x86-64 / test (default, 2, 2, macos-12)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4096408307"
-                      }
-                    ],
-                    "pageInfo": {
-                      "endCursor": "Y3Vyc29yOnYyOpHPAAAAAbCn27w=",
-                      "hasNextPage": false
-                    }
-                  },
-                  "conclusion": "FAILURE"
-                },
-                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1h5Q="
-              },
-              {
-                "node": {
-                  "app": {
-                    "name": "GitHub Actions",
-                    "databaseId": 15368
-                  },
-                  "workflowRun": {
-                    "workflow": {
-                      "name": "pull"
-                    },
-                    "url": "https://github.com/pytorch/pytorch/actions/runs/2638241914"
-                  },
-                  "checkRuns": {
-                    "nodes": [
-                      {
-                        "name": "linux-bionic-rocm5.1-py3.7",
-                        "conclusion": "NEUTRAL",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095496220"
-                      },
-                      {
-                        "name": "win-vs2019-cuda11.6-py3",
-                        "conclusion": "NEUTRAL",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095496344"
-                      },
-                      {
-                        "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095496466"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-clang10-onnx / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095496612"
-                      },
-                      {
-                        "name": "win-vs2019-cpu-py3 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095496726"
-                      },
-                      {
-                        "name": "linux-bionic-py3.7-clang9 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095496862"
-                      },
-                      {
-                        "name": "linux-bionic-py3_7-clang8-xla / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095497204"
-                      },
-                      {
-                        "name": "linux-xenial-cuda11_3-py3_7-gcc7-deploy / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095497405"
-                      },
-                      {
-                        "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095497578"
-                      },
-                      {
-                        "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095497784"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-clang7-asan / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095497875"
-                      },
-                      {
-                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498008"
-                      },
-                      {
-                        "name": "linux-xenial-py3.7-clang7-asan / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498155"
-                      },
-                      {
-                        "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498346"
-                      },
-                      {
-                        "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498440"
-                      },
-                      {
-                        "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498650"
-                      },
-                      {
-                        "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498724"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-gcc7 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498883"
-                      },
-                      {
-                        "name": "linux-xenial-py3-clang5-mobile-build / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095499064"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-gcc7-no-ops / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095499218"
-                      },
-                      {
-                        "name": "linux-xenial-py3.7-gcc7 / build",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095499360"
-                      },
-                      {
-                        "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095615833"
-                      },
-                      {
-                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095668105"
-                      },
-                      {
-                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095668215"
-                      },
-                      {
-                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095668293"
-                      },
-                      {
-                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095668402"
-                      },
-                      {
-                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095668480"
-                      },
-                      {
-                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095668571"
-                      },
-                      {
-                        "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095776890"
-                      },
-                      {
-                        "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095776922"
-                      },
-                      {
-                        "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095778975"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794308"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794370"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 1, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794452"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794502"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794566"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794652"
-                      },
-                      {
-                        "name": "linux-docs / build-docs (cpp)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794748"
-                      },
-                      {
-                        "name": "linux-docs / build-docs (python)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794836"
-                      },
-                      {
-                        "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095800591"
-                      },
-                      {
-                        "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095800638"
-                      },
-                      {
-                        "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095800676"
-                      },
-                      {
-                        "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095800723"
-                      },
-                      {
-                        "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095800762"
-                      },
-                      {
-                        "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095800805"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095813130"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095813208"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095858004"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095858063"
-                      },
-                      {
-                        "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
-                        "conclusion": "SUCCESS",
-                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095858127"
-                      }
-                    ],
-                    "pageInfo": {
-                      "endCursor": "Y3Vyc29yOnYyOpHPAAAAAbCcmdI=",
-                      "hasNextPage": true
-                    }
-                  },
-                  "conclusion": "SUCCESS"
-                },
-                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1h5U="
-              }
-            ],
-            "pageInfo": {
-              "hasNextPage": false
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=23d6a47e5fd875c42231779040ec1d35d0042b502c9142cb0d33d6f65d58fead commit=6882717f73deffb692219ccd1fd6db258d8ed684 cr_cursor=Y3Vyc29yOnYyOpHPAAAAAbCcmdI= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAbH1h5Q= name=pytorch owner=pytorch": {
-    "data": {
-      "repository": {
-        "object": {
-          "oid": "6882717f73deffb692219ccd1fd6db258d8ed684",
-          "checkSuites": {
-            "nodes": [
-              {
-                "checkRuns": {
-                  "nodes": [
-                    {
-                      "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
-                      "conclusion": "SUCCESS",
-                      "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095858194"
-                    },
-                    {
-                      "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
-                      "conclusion": "SUCCESS",
-                      "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095858272"
-                    },
-                    {
-                      "name": "linux-xenial-cuda11_3-py3_7-gcc7-deploy / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                      "conclusion": "SUCCESS",
-                      "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4096006884"
-                    }
-                  ],
-                  "pageInfo": {
-                    "endCursor": "Y3Vyc29yOnYyOpHPAAAAAbCfo8c=",
-                    "hasNextPage": false
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=76123 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": true,
-          "author": {
-            "login": "kumpera"
-          },
-          "title": "Introduce distributed checkpoint with ShardedTensor.",
-          "body": "Co-authored-by: Wen Zhang <zhangwen@fb.com>\r\nCo-authored-by: Yifu Wang <yifu@fb.com>\r\n\r\n",
-          "headRefName": "st_checkpoint",
-          "headRepository": {
-            "nameWithOwner": "kumpera/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kumpera"
-                    },
-                    "email": "kumpera@fb.com",
-                    "name": "Rodrigo Kumpera"
-                  },
-                  "oid": "6bf248bc20a71f248064b795f38276326fe43aae"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kumpera"
-                    },
-                    "email": "kumpera@fb.com",
-                    "name": "Rodrigo Kumpera"
-                  },
-                  "oid": "10f84fb90bf02d7062e565ebf2c1da6352b64db7"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kumpera"
-                    },
-                    "email": "kumpera@fb.com",
-                    "name": "Rodrigo Kumpera"
-                  },
-                  "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "Mw",
-              "hasNextPage": false
-            },
-            "totalCount": 3
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS2l4=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSmtI="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2273063614"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063614/jobs/3379894109"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2r3Q=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm0k="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2273063615"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894107"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894332"
-                              },
-                              {
-                                "name": "lintrunner",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894444"
-                              },
-                              {
-                                "name": "Test collect_env (with_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894520"
-                              },
-                              {
-                                "name": "Test collect_env (without_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894567"
-                              },
-                              {
-                                "name": "Test tools",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894616"
-                              },
-                              {
-                                "name": "workflow-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894672"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2shU=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm0o="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pull"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2273063632"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902301"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902363"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902507"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902560"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902579"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902603"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902637"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902685"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902740"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902761"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902794"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902874"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903006"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903111"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903193"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903284"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903357"
-                              },
-                              {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903446"
-                              },
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903512"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.1-py3.7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903546"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379944655"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379944695"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (cpp)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946308"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (python)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946337"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946359"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946391"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946423"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946453"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946496"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946529"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950041"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950137"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950165"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950192"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950646"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379951202"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379951230"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963877"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963928"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963976"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379964018"
-                              },
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379966372"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379996173"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379996218"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379997861"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998374"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998397"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998422"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998441"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3380042106"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd5yuY=",
-                              "hasNextPage": true
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm14="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2276796859"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "lintrunner",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419477"
-                              },
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419699"
-                              },
-                              {
-                                "name": "Test collect_env (with_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419923"
-                              },
-                              {
-                                "name": "Test collect_env (without_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419992"
-                              },
-                              {
-                                "name": "Test tools",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420129"
-                              },
-                              {
-                                "name": "workflow-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420208"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420309"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS3SE=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNGg="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2276796862"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796862/jobs/3387419465"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS1-o=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNIc="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pull"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2276796865"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "linux-bionic-rocm5.1-py3.7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387419999"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420164"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420316"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420477"
-                              },
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420675"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420934"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421278"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421672"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421888"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421982"
-                              },
-                              {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422191"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422303"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422476"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422715"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422963"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423092"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423234"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423421"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423622"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423739"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387545789"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387546032"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387546119"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553028"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553144"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553251"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553438"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553556"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553668"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (cpp)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387554002"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (python)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387554098"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387558927"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559016"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559071"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559139"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387563803"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387563894"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580868"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580936"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580993"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387581053"
-                              },
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387592286"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387631950"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387632035"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387649916"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387649974"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650084"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650151"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650373"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387753429"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgaCXo=",
-                              "hasNextPage": true
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNKQ="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": false
-                    }
-                  },
-                  "status": null,
-                  "pushedDate": "2022-05-05T00:34:26Z",
-                  "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747"
-                }
-              }
-            ]
-          },
-          "changedFiles": 11,
-          "files": {
-            "nodes": [
-              {
-                "path": "test/distributed/_shard/checkpoint/test_checkpoint.py"
-              },
-              {
-                "path": "test/distributed/_shard/checkpoint/test_file_system_checkpoint.py"
-              },
-              {
-                "path": "test/distributed/_shard/sharded_tensor/test_sharded_tensor.py"
-              },
-              {
-                "path": "torch/distributed/_shard/checkpoint/__init__.py"
-              },
-              {
-                "path": "torch/distributed/_shard/checkpoint/filesystem.py"
-              },
-              {
-                "path": "torch/distributed/_shard/checkpoint/metadata.py"
-              },
-              {
-                "path": "torch/distributed/_shard/checkpoint/resharding.py"
-              },
-              {
-                "path": "torch/distributed/_shard/checkpoint/state_dict_loader.py"
-              },
-              {
-                "path": "torch/distributed/_shard/checkpoint/state_dict_saver.py"
-              },
-              {
-                "path": "torch/distributed/_shard/checkpoint/storage.py"
-              },
-              {
-                "path": "torch/testing/_internal/distributed/_shard/sharded_tensor/_test_st_common.py"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MTE",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "zzzwen"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "zzzwen"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "wanchaol"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "zzzwen"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "zzzwen"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "simpkins"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "zzzwen"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "zzzwen"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "simpkins"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "simpkins"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "pritamdamania87"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "pritamdamania87"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "pritamdamania87"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "wilson100hong"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "wilson100hong"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "wilson100hong"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "xunnanxu"
-                },
-                "state": "DISMISSED"
-              },
-              {
-                "author": {
-                  "login": "xunnanxu"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "xunnanxu"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "xunnanxu"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "xunnanxu"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "xunnanxu"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "pritamdamania87"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "pritamdamania87"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "pritamdamania87"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "pritamdamania87"
-                },
-                "state": "APPROVED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0yNVQxMTozNTowMS0wNzowMLkyMDIyLTA0LTI1VDExOjM1OjAwLTA3OjAwzjjC2d0=",
-              "hasPreviousPage": true
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136",
-                "createdAt": "2022-05-05T12:35:49Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1118495479
-              },
-              {
-                "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136",
-                "createdAt": "2022-05-05T12:53:15Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1118511287
-              },
-              {
-                "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136",
-                "createdAt": "2022-05-05T15:00:08Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1118662274
-              },
-              {
-                "bodyText": "Merge failed due to Can't fetch all PR reviews Raised by https://github.com/pytorch/pytorch/actions/runs/2275691136\n\n@osalpekar @malfet This is failing because there are 109 review comments on this PR but we only fetch the first 100. This could be solved with a similar concept as how we fetch more comments/check_runs.",
-                "createdAt": "2022-05-05T15:20:46Z",
-                "author": {
-                  "login": "janeyx99"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1118689010
-              },
-              {
-                "bodyText": "On a side note, has the test_fsdp_clip_grad_norm_norm_type_2_0_nested_fsdp_False_cpu_offload_CPUOffload failure on the distributed test first shard of this PR been addressed?",
-                "createdAt": "2022-05-05T15:24:08Z",
-                "author": {
-                  "login": "janeyx99"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1118693497
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOQqri9w==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "oncall: distributed"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              }
-            ]
-          },
-          "headRef": {
-            "compare": {
-              "commits": {
-                "edges": [
-                  {
-                    "node": {
-                      "parents": {
-                        "edges": [
-                          {
-                            "node": {
-                              "oid": "a8b098859688a3f1993821eecc036be973a15605"
-                            }
-                          }
-                        ]
-                      }
-                    }
-                  }
-                ]
-              }
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=6a8ce6412a780d5804bfe180ed1dc807269e1eae2ae50de2346d56d1283884bc cursor=Y3Vyc29yOnYyOpO5MjAyMi0wNC0yNVQxMTozNTowMS0wNzowMLkyMDIyLTA0LTI1VDExOjM1OjAwLTA3OjAwzjjC2d0= name=pytorch number=76123 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "reviews": {
-            "nodes": [
-              {
-                "author": {
-                  "login": "pritamdamania87"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0yMlQyMDozNzo1NC0wNzowMLkyMDIyLTA0LTIyVDE2OjAyOjA5LTA3OjAwzjip7G8=",
-              "hasPreviousPage": false
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=79694 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": true,
-          "author": {
-            "login": "kshitij12345"
-          },
-          "title": "[complex] conv_transpose1d",
-          "body": "Reference: https://github.com/pytorch/pytorch/issues/71108",
-          "headRefName": "develop/complex/conv_transpose1d",
-          "headRepository": {
-            "nameWithOwner": "kshitij12345/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
-                  },
-                  "oid": "d1ea948e65ac6d31ad056287ab65d38ecc68b30d"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
-                  },
-                  "oid": "b4ba1db9a3a71bd8c03158dcd1b68711360633d8"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
-                  },
-                  "oid": "655a4220beae163bfe578f0318a130df01ec05d6"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "Kshiteej K"
-                  },
-                  "oid": "8181716be7a8005eb13ad5c3f2e1279ed1c60aff"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
-                  },
-                  "oid": "9e5ca3663e7471786eeebebfdf84aea5d761712f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
-                  },
-                  "oid": "9c110f39bcdc4e56386b6f9c4e2c082c8940ade6"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
-                  },
-                  "oid": "49315e79d0eee8008e2a74575c6fc0f6a9531ee4"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
-                  },
-                  "oid": "728752480760226270c374a0acc08e28b9b133f3"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
-                  },
-                  "oid": "ffe43399d6f60ef7844523a5f465c11d9a67062f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
-                  },
-                  "oid": "9672a2198472567bae4ac6f55d004f7e1fa8a9fa"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
-                  },
-                  "oid": "48a0ebf32b895286f036b36c871f671dc867e400"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
-                  },
-                  "oid": "52fbe80d5c8a94e03d816c0bd21fd82019dcd5ac"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
-                  },
-                  "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MTM",
-              "hasNextPage": false
-            },
-            "totalCount": 13
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.facebook.com/cla/"
-                              },
-                              {
-                                "name": "Meta Internal-Only Changes Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://opensource.facebook.com/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdtq8Hc=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqFo="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2907393316"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393316/jobs/4628529923"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqTEwk=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqXs="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2907393315"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "lintrunner",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628529910"
-                              },
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530162"
-                              },
-                              {
-                                "name": "Test collect_env (with_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530698"
-                              },
-                              {
-                                "name": "Test collect_env (without_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530867"
-                              },
-                              {
-                                "name": "Test collect_env (older_python_version)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530989"
-                              },
-                              {
-                                "name": "pr-sanity-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531151"
-                              },
-                              {
-                                "name": "workflow-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531475"
-                              },
-                              {
-                                "name": "Test tools",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531753"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531853"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqTHFY=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqXw="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pull"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2907393329"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531149"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531473"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531754"
-                              },
-                              {
-                                "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531857"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7-pch / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532179"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532543"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532694"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532918"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533033"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533181"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533420"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533630"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533825"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533959"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534129"
-                              },
-                              {
-                                "name": "linux-bionic-py3_7-clang8-xla / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534256"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.2-py3.7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534388"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534571"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11_6-py3_10-gcc7-deploy / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534714"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534989"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628535311"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639115"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639198"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639265"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639339"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639395"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639450"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639509"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (cpp)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639572"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (python)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639635"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647047"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647119"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647215"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647277"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647348"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647432"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647522"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647641"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647762"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628653797"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679376"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679431"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679469"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679519"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679594"
-                              },
-                              {
-                                "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628681226"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11_6-py3_10-gcc7-deploy / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628854932"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856434"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856501"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856575"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqZ2fA=",
-                              "hasNextPage": true
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqZs="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "windows-binary-libtorch-debug"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351637"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "libtorch-cpu-shared-with-deps-debug-build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351637/jobs/4634503587"
-                              },
-                              {
-                                "name": "libtorch-cpu-shared-with-deps-debug-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351637/jobs/4635312938"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsbsmM=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuA="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "windows-binary-wheel"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351640"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "wheel-py3_7-cuda11_3-build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351640/jobs/4634503571"
-                              },
-                              {
-                                "name": "wheel-py3_7-cuda11_3-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351640/jobs/4636146265"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsskcw=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuM="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "windows-binary-libtorch-release"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351643"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "libtorch-cpu-shared-with-deps-release-build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351643/jobs/4634503570"
-                              },
-                              {
-                                "name": "libtorch-cpu-shared-with-deps-release-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351643/jobs/4635003925"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsVbD8=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuU="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-binary-libtorch-cxx11-abi"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351698"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351698/jobs/4634504079"
-                              },
-                              {
-                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351698/jobs/4635072931"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsW5Aw=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2E="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-binary-libtorch-pre-cxx11"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351700"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351700/jobs/4634503897"
-                              },
-                              {
-                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351700/jobs/4635077148"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsW-jo=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2I="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-binary-manywheel"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351699"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "manywheel-py3_7-cuda10_2-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351699/jobs/4634503896"
-                              },
-                              {
-                                "name": "manywheel-py3_7-cuda10_2-test / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351699/jobs/4635934290"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsoMEA=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2M="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  },
-                  "status": null,
-                  "pushedDate": "2022-08-22T22:04:19Z",
-                  "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce"
-                }
-              }
-            ]
-          },
-          "changedFiles": 3,
-          "files": {
-            "nodes": [
-              {
-                "path": "aten/src/ATen/native/Convolution.cpp"
-              },
-              {
-                "path": "torch/testing/_internal/common_methods_invocations.py"
-              },
-              {
-                "path": "torch/testing/_internal/common_modules.py"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "Mw",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
-              {
-                "author": {
-                  "login": "ngimel"
-                },
-                "state": "APPROVED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNy0xOVQxMDowNzo1NC0wNzowMLkyMDIyLTA3LTE5VDEwOjA3OjU0LTA3OjAwzj43QcY=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "@pytorchbot merge -g\nAll is green internally!",
-                "createdAt": "2022-08-23T19:29:55Z",
-                "author": {
-                  "login": "albanD"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1224702749
-              },
-              {
-                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here.\nThe merge job was triggered with the green (-g) flag. This means that your change will be merged once all checks on your PR have passed (ETA: 0-4 Hours). If this is not the intended behavior, feel free to use some of the other merge options in the wiki.\nPlease reach out to the PyTorch DevX Team with feedback or questions!",
-                "createdAt": "2022-08-23T19:31:18Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1224705564
-              },
-              {
-                "bodyText": "Thanks for looking into it \ud83d\ude42 @albanD @jeanschmidt",
-                "createdAt": "2022-08-23T19:34:36Z",
-                "author": {
-                  "login": "kshitij12345"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1224712351
-              },
-              {
-                "bodyText": "Hey @kshitij12345.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-08-23T22:31:58Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1224956051
-              },
-              {
-                "bodyText": "Yeah, discussed with my manager and I got the required permissions to do so. Sorry for not responding promptly yesterday. But I am available from now on to provide assistance :)",
-                "createdAt": "2022-08-24T09:24:04Z",
-                "author": {
-                  "login": "jeanschmidt"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1225462612
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOSP97HQ==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "open source"
-                }
-              },
-              {
-                "node": {
-                  "name": "Merged"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              },
-              {
-                "node": {
-                  "name": "Reverted"
-                }
-              },
-              {
-                "node": {
-                  "name": "ciflow/trunk"
-                }
-              },
-              {
-                "node": {
-                  "name": "ciflow/periodic"
-                }
-              }
-            ]
-          },
-          "headRef": {
-            "compare": {
-              "commits": {
-                "edges": [
-                  {
-                    "node": {
-                      "parents": {
-                        "edges": [
-                          {
-                            "node": {
-                              "oid": "d3d163af8061e08097c3ae37079bf61535b81ff1"
-                            }
-                          }
-                        ]
-                      }
-                    }
-                  }
-                ]
-              }
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOSP97HQ== name=pytorch number=79694 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/79694\n\ud83d\udcc4 \u00a0Preview Python docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\u2753Need help or want to give feedback on the CI? Visit our office hours\n\n\u2705 No Failures (0 Pending)\nAs of commit 2fd08f1 (more details on the Dr. CI page):\nExpand to see more\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
-                "createdAt": "2022-06-16T09:43:16Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": {
-                  "login": "facebook-github-bot"
-                },
-                "databaseId": 1157454523
-              },
-              {
-                "bodyText": "Unable to reproduce jit failure locally (will skip the test)\nCI Failure : https://github.com/pytorch/pytorch/runs/6926187074?check_suite_focus=true#step:9:20230\npytest test/test_ops_jit.py -k test_variant_consistency_jit_nn_functional_conv_transpose1d_cpu_complex64 -v\n=============================================================== test session starts ===============================================================\nplatform linux -- Python 3.10.0, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 -- /home/kshiteej/.conda/envs/pytorch-cuda-dev/bin/python\ncachedir: .pytest_cache\nhypothesis profile 'default' -> database=DirectoryBasedExampleDatabase('/home/kshiteej/Pytorch/pytorch_complex_convolution.py/.hypothesis/examples')\nrootdir: /home/kshiteej/Pytorch/pytorch_complex_convolution.py, configfile: pytest.ini\nplugins: hypothesis-6.23.2, repeat-0.9.1\ncollected 1976 items / 1975 deselected / 1 selected                                                                                               \n\ntest/test_ops_jit.py::TestJitCPU::test_variant_consistency_jit_nn_functional_conv_transpose1d_cpu_complex64 PASSED                          [100%]\n\n================================================================ warnings summary =================================================================\n../../.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/testing/_internal/common_cuda.py:9\n  /home/kshiteej/.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/testing/_internal/common_cuda.py:9: DeprecationWarning: The distutils package is deprecated and slated for removal in Python 3.12. Use setuptools or check PEP 632 for potential alternatives\n    from distutils.version import LooseVersion\n\n../../.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/backends/cudnn/__init__.py:91\n  /home/kshiteej/.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/backends/cudnn/__init__.py:91: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.\n    warnings.warn(\n\n-- Docs: https://docs.pytest.org/en/stable/warnings.html\n================================================= 1 passed, 1975 deselected, 2 warnings in 4.90s =================================================",
-                "createdAt": "2022-07-18T09:05:35Z",
-                "author": {
-                  "login": "kshitij12345"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": {
-                  "login": "kshitij12345"
-                },
-                "databaseId": 1186949486
-              },
-              {
-                "bodyText": "@pytorchbot merge",
-                "createdAt": "2022-07-19T17:12:23Z",
-                "author": {
-                  "login": "ngimel"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1189347786
-              },
-              {
-                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here",
-                "createdAt": "2022-07-19T17:13:42Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1189350009
-              },
-              {
-                "bodyText": "Hey @kshitij12345.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-07-19T17:14:25Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1189350932
-              },
-              {
-                "bodyText": "@pytorchbot revert -m \"broke slow test https://github.com/pytorch/pytorch/runs/7414560957?check_suite_focus=true#step:9:31516\" -c \"nosignal\"",
-                "createdAt": "2022-07-19T19:15:41Z",
-                "author": {
-                  "login": "kshitij12345"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1189459845
-              },
-              {
-                "bodyText": "@pytorchbot successfully started a revert job. Check the current status here",
-                "createdAt": "2022-07-19T19:16:59Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1189460926
-              },
-              {
-                "bodyText": "Will not revert as @kshitij12345 is not a MEMBER, but COLLABORATOR",
-                "createdAt": "2022-07-19T19:17:00Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1189460942
-              },
-              {
-                "bodyText": "@pytorchbot revert -m \"broke slow test https://github.com/pytorch/pytorch/runs/7414560957?check_suite_focus=true#step:9:31516\" -c \"nosignal\"",
-                "createdAt": "2022-07-19T20:40:04Z",
-                "author": {
-                  "login": "anjali411"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1189529734
-              },
-              {
-                "bodyText": "@pytorchbot successfully started a revert job. Check the current status here",
-                "createdAt": "2022-07-19T20:41:20Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1189530756
-              },
-              {
-                "bodyText": "@kshitij12345 your PR has been successfully reverted.",
-                "createdAt": "2022-07-19T20:41:25Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1189530831
-              },
-              {
-                "bodyText": "@pytorchbot merge -g",
-                "createdAt": "2022-07-20T09:53:08Z",
-                "author": {
-                  "login": "kshitij12345"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1190070141
-              },
-              {
-                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here",
-                "createdAt": "2022-07-20T09:54:24Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1190071424
-              },
-              {
-                "bodyText": "Hey @kshitij12345.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-07-20T13:00:51Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1190258272
-              },
-              {
-                "bodyText": "commit is breaking internal builds/tests https://pastebin.com/HX4RUusH (pytorch/functorch/test:test_eager_transforms)",
-                "createdAt": "2022-07-21T10:39:01Z",
-                "author": {
-                  "login": "jeanschmidt"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1191327616
-              },
-              {
-                "bodyText": "@pytorchbot revert -m \"breaking internal builds\" -c \"ghfirst\"",
-                "createdAt": "2022-07-21T10:39:27Z",
-                "author": {
-                  "login": "jeanschmidt"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1191328013
-              },
-              {
-                "bodyText": "@pytorchbot revert -m \"breaking internal builds\" -c \"ghfirst\"",
-                "createdAt": "2022-07-21T10:41:23Z",
-                "author": {
-                  "login": "jeanschmidt"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1191329792
-              },
-              {
-                "bodyText": "@pytorchbot successfully started a revert job. Check the current status here",
-                "createdAt": "2022-07-21T10:42:16Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1191330586
-              },
-              {
-                "bodyText": "@kshitij12345 your PR has been successfully reverted.",
-                "createdAt": "2022-07-21T10:42:23Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1191330690
-              },
-              {
-                "bodyText": "@jeanschmidt which test is it failing on? I tried running the test_eager_transforms in functorch but couldn't reproduce it.",
-                "createdAt": "2022-07-25T07:11:19Z",
-                "author": {
-                  "login": "kshitij12345"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1193667568
-              },
-              {
-                "bodyText": "@jbschlosser have added a ref as discussed offline. Can you please take a look? And if it looks good, can you import the PR to check if it is breaking anything internally.\nThanks",
-                "createdAt": "2022-08-03T18:30:17Z",
-                "author": {
-                  "login": "kshitij12345"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1204329491
-              },
-              {
-                "bodyText": "@jbschlosser @jeanschmidt @albanD anything we can do to unblock this on our side?",
-                "createdAt": "2022-08-20T09:27:17Z",
-                "author": {
-                  "login": "lezcano"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1221266218
-              },
-              {
-                "bodyText": "Functorch tests should be running here now so can you rebase on top of master please?",
-                "createdAt": "2022-08-22T21:42:37Z",
-                "author": {
-                  "login": "albanD"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1223129944
-              },
-              {
-                "bodyText": "@albanD have rebased on latest master.",
-                "createdAt": "2022-08-23T08:49:10Z",
-                "author": {
-                  "login": "kshitij12345"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1223758571
-              },
-              {
-                "bodyText": "I triggered all the tests not to have any issues with slow tests again",
-                "createdAt": "2022-08-23T09:20:18Z",
-                "author": {
-                  "login": "lezcano"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1223796413
-              },
-              {
-                "bodyText": "Thanks @lezcano! However, last time it was reverted for internal failures. So it would be great if someone can import and verify that.\ncc: @albanD @jeanschmidt",
-                "createdAt": "2022-08-23T10:17:50Z",
-                "author": {
-                  "login": "kshitij12345"
-                },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 1223863075
-              },
-              {
-                "bodyText": "@albanD has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2022-08-23T14:43:02Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1224175731
-              },
-              {
-                "bodyText": "I am not the right person to provide assistence, as currently I am not based in a Tier 1 location, so my permissions to access are so restricted that I am not able to import this commit, run the tests and provide meaningful responses.",
-                "createdAt": "2022-08-23T15:57:48Z",
-                "author": {
-                  "login": "jeanschmidt"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1224272324
-              },
-              {
-                "bodyText": "@jeanschmidt has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2022-08-23T17:00:53Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1224351135
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHORP1auw==",
-              "hasPreviousPage": false
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAdqZ2fA= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAdioqXw= name=pytorch number=79694 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce",
-                  "checkSuites": {
-                    "nodes": [
-                      {
-                        "checkRuns": {
-                          "nodes": [
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856668"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856772"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856812"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856867"
-                            },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628858900"
-                            },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628858948"
-                            },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (functorch, 1, 1, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628859006"
-                            }
-                          ],
-                          "pageInfo": {
-                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqZ5lE=",
-                            "hasNextPage": false
-                          }
-                        }
-                      }
-                    ]
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAdkUS2M= name=pytorch number=79694 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce",
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "trunk"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351701"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "macos-12-py3-x86-64 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634504326"
-                              },
-                              {
-                                "name": "macos-12-py3-arm64 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634504522"
-                              },
-                              {
-                                "name": "parallelnative-linux-focal-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634504655"
-                              },
-                              {
-                                "name": "caffe2-linux-focal-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634504882"
-                              },
-                              {
-                                "name": "android-emulator-build-test / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634505033"
-                              },
-                              {
-                                "name": "ios-12-5-1-x86-64 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634505167"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9-slow / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634505347"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634505499"
-                              },
-                              {
-                                "name": "libtorch-linux-bionic-cuda11.6-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634505639"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634505767"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634506032"
-                              },
-                              {
-                                "name": "macos-12-py3-x86-64-lite-interpreter / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634506202"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.2-py3.7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634506357"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634506535"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9-slow / test (slow, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634664404"
-                              },
-                              {
-                                "name": "parallelnative-linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634669945"
-                              },
-                              {
-                                "name": "parallelnative-linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634670046"
-                              },
-                              {
-                                "name": "macos-12-py3-x86-64 / test (default, 1, 2, macos-12)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634734165"
-                              },
-                              {
-                                "name": "macos-12-py3-x86-64 / test (default, 2, 2, macos-12)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634734293"
-                              },
-                              {
-                                "name": "macos-12-py3-x86-64 / test (functorch, 1, 1, macos-12)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634734388"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.2-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634772323"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.2-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634772410"
-                              },
-                              {
-                                "name": "macos-12-py3-arm64 / test (default, 1, 2, macos-m1-12)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634812657"
-                              },
-                              {
-                                "name": "macos-12-py3-arm64 / test (default, 2, 2, macos-m1-12)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634812746"
-                              },
-                              {
-                                "name": "macos-12-py3-arm64-mps / Run MPS tests",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634812878"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634868761"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634868884"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869012"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869132"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869240"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (slow, 1, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869348"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (slow, 2, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869457"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (nogpu_AVX512, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869537"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (nogpu_NO_AVX2, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869649"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (jit_legacy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869743"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869861"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869984"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / test (default, 1, 5, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635049837"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / test (default, 2, 5, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635049935"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / test (default, 3, 5, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635050025"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / test (default, 4, 5, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635050129"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / test (default, 5, 5, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635050234"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / test (functorch, 1, 1, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635050323"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635050460"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsWbDg=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2g="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "periodic"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351759"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "ios-12-5-1-arm64-metal / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634504650"
-                              },
-                              {
-                                "name": "linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634504883"
-                              },
-                              {
-                                "name": "ios-12-5-1-arm64 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634505024"
-                              },
-                              {
-                                "name": "buck-build-test / buck-build-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634505165"
-                              },
-                              {
-                                "name": "ios-12-5-1-arm64-coreml / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634505316"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7-debug / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634505521"
-                              },
-                              {
-                                "name": "libtorch-linux-bionic-cuda11.7-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634505667"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.7-gcc7-debug / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634505786"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.2-py3.7-slow / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634506031"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634506209"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.2-py3.7-distributed / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634506353"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.7-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634506550"
-                              },
-                              {
-                                "name": "ios-12-5-1-x86-64-coreml / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634506968"
-                              },
-                              {
-                                "name": "ios-12-5-1-arm64-custom-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634507176"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.2-py3.7-distributed / test (distributed, 1, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634799214"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.2-py3.7-distributed / test (distributed, 2, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634799342"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.2-py3.7-slow / test (slow, 1, 1, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634800216"
-                              },
-                              {
-                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (multigpu, 1, 1, linux.16xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634896194"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7-debug / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634955955"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7-debug / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634956066"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7-debug / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634956160"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7-debug / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634956251"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.7-gcc7-debug / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634987167"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.7-gcc7-debug / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634987289"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.7-gcc7-debug / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634987406"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.7-gcc7-debug / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634987543"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.7-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4635020787"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.7-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4635020896"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.7-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4635021008"
-                              },
-                              {
-                                "name": "linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4635184380"
-                              },
-                              {
-                                "name": "linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4635184472"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsZHek=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS_k="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": false
-                    }
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=fa3b2971800534b98820337848107cc1b9096b7e0ca3711ec21c47ba1182099d name=pytorch number=90791 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "bdhirsh"
-          },
-          "title": "functionalization: check for undefined tensors in advanced indexing",
-          "body": "cc @wonjoolee95 - XLA folks were seeing an advanced indexing issue with undefined tensors.\r\n\r\nIt looks like running code like `a[:, tensor_idx] = b` can results in:\r\n\r\n(1) calling `index_put_()`\r\n(2) passing (potential undefined) tensors as the indices to index_put_().\r\n\r\n\r\nStack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):\n* #91001\n* __->__ #90791\n* #90722\n\r\n",
-          "headRefName": "gh/bdhirsh/356/head",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "gh/bdhirsh/356/base",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@meta.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "c9e8e71b8ba2ba62bfac29900e71dde3ab6589cb"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@meta.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "ed3eff87d5cc76ce6d8e5f1db901be21acc86cb6"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@meta.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "00ca22160d89060815e2be50e52f462f811c1087"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@meta.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "b00e14c4a90e33721a406772bf548fbfffb065d4"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@meta.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "70711ab89515aa4515ce60d3c29a04dbdba8e06e"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "NQ",
-              "hasNextPage": false
-            },
-            "totalCount": 5
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Meta Internal-Only Changes Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://opensource.facebook.com/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyP3Pw=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68rl0="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Netlify",
-                            "databaseId": 13473
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68rn4="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Azure Pipelines",
-                            "databaseId": 9426
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68rpY="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "CircleCI Checks",
-                            "databaseId": 18001
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://circleci.com/workflow-run/0456c68a-2cb2-4b5c-beff-42ff31937439?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-checks-link&utm_content=bottom"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyP7Hg=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68rrI="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Dependabot",
-                            "databaseId": 29110
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68rtI="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Codecov",
-                            "databaseId": 254
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68ruk="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "PyTorch Bot",
-                            "databaseId": 40112
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68rv8="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Check Labels"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3714206640"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Check labels",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206640/jobs/6297806113"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyP7rU=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk684e0="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3714206646"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206646/jobs/6297806176"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyP7vk=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk684fY="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3714206650"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "lintrunner",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297806783"
-                              },
-                              {
-                                "name": "Test tools",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297806967"
-                              },
-                              {
-                                "name": "pr-sanity-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297807120"
-                              },
-                              {
-                                "name": "workflow-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297807302"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297807451"
-                              },
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297807633"
-                              },
-                              {
-                                "name": "Test collect_env (with_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297807764"
-                              },
-                              {
-                                "name": "Test collect_env (without_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297807891"
-                              },
-                              {
-                                "name": "Test collect_env (older_python_version)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297808026"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyP-Fs=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk684gc="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  },
-                  "status": {
-                    "contexts": [
-                      {
-                        "context": "EasyCLA",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://easycla.lfx.linuxfoundation.org/#/?version=2"
-                      }
-                    ]
-                  },
-                  "pushedDate": "2022-12-16T15:04:35Z",
-                  "oid": "70711ab89515aa4515ce60d3c29a04dbdba8e06e"
-                }
-              }
-            ]
-          },
-          "changedFiles": 2,
-          "files": {
-            "nodes": [
-              {
-                "path": "aten/src/ATen/templates/RegisterFunctionalization.cpp"
-              },
-              {
-                "path": "test/test_functionalization.py"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "Mg",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
-              {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "APPROVED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0xMi0xM1QxNzo0NTo1Ny0wODowMLkyMDIyLTEyLTEzVDE3OjQ1OjU3LTA4OjAwzkiEx9E=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "\ud83d\udd17 Helpful Links\n\ud83e\uddea See artifacts and rendered test results at hud.pytorch.org/pr/90791\n\n\ud83d\udcc4 Preview Python docs built from this PR\n\ud83d\udcc4 Preview C++ docs built from this PR\n\u2753 Need help or want to give feedback on the CI? Visit the bot commands wiki or our office hours\n\nNote: Links to docs will display an error until the docs builds have been completed.\n\u274c 2 Failures\nAs of commit 70711ab:\nNEW FAILURES - The following jobs have failed:\n\nlintrunner\nTest tools\n\n\nThis comment was automatically generated by Dr. CI and updates every 15 minutes.",
-                "createdAt": "2022-12-13T20:48:29Z",
-                "author": {
-                  "login": "pytorch-bot"
-                },
-                "authorAssociation": "NONE",
-                "editor": {
-                  "login": "pytorch-bot"
-                },
-                "databaseId": 1349670291
-              },
-              {
-                "bodyText": "@pytorchbot merge -f \"lint tests are flaky\"",
-                "createdAt": "2022-12-19T16:09:30Z",
-                "author": {
-                  "login": "bdhirsh"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1357898146
-              },
-              {
-                "bodyText": "Merge started\nYour change will be merged immediately since you used the force (-f) flag, bypassing any CI checks (ETA: 1-5 minutes).\nLearn more about merging in the wiki.\nQuestions? Feedback? Please reach out to the PyTorch DevX TeamAdvanced Debugging\nCheck the merge workflow status\nhere",
-                "createdAt": "2022-12-19T16:11:00Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1357900127
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOUHJVkw==",
-              "hasPreviousPage": false
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "Merged"
-                }
-              },
-              {
-                "node": {
-                  "name": "release notes: composability"
-                }
-              }
-            ]
-          },
-          "headRef": {
-            "compare": {
-              "commits": {
-                "edges": [
-                  {
-                    "node": {
-                      "parents": {
-                        "edges": [
-                          {
-                            "node": {
-                              "oid": "634555d9817fd2047a3f4c2d8d26ce959f1f6662"
-                            }
-                          }
-                        ]
-                      }
-                    }
-                  }
-                ]
-              }
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAk684gc= name=pytorch number=90791 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "70711ab89515aa4515ce60d3c29a04dbdba8e06e",
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Labeler"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3714206652"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "triage",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206652/jobs/6297806231"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyP7z0=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk684gk="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pull"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3714206658"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297806627"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297806814"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.10-gcc7-bazel-test / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297807002"
-                              },
-                              {
-                                "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297807233"
-                              },
-                              {
-                                "name": "linux-focal-py3-clang7-mobile-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297807392"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297807527"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297807706"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297807915"
-                              },
-                              {
-                                "name": "linux-focal-py3-clang7-mobile-custom-build-static / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297808137"
-                              },
-                              {
-                                "name": "linux-bionic-py3_7-clang8-xla / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297808315"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297808528"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297808733"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7-pch / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297808911"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297809658"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297809822"
-                              },
-                              {
-                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297809996"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.3-py3.8 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297810168"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297810328"
-                              },
-                              {
-                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297810479"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298023287"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298028658"
-                              },
-                              {
-                                "name": "linux-docs / build-docs-cpp-false",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298028841"
-                              },
-                              {
-                                "name": "linux-docs / build-docs-python-false",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298028976"
-                              },
-                              {
-                                "name": "linux-docs / build-docs-functorch-false",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298029091"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298030237"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298030451"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298030577"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298030712"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298030845"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298030983"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298031137"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298031279"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298033927"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298035896"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036008"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036149"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036286"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036389"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036502"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036635"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036767"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036993"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298040119"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298040269"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298109574"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298116983"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298117143"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298117258"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298117401"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298117536"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyWETY=",
-                              "hasNextPage": true
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk684iI="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Check Labels"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3716423635"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Check labels",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3716423635/jobs/6302732322"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlzyfKM=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk8UBDA="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Check Labels"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3733139393"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Check labels",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3733139393/jobs/6333531377"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAl8pm1U=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAlEdVYM="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": false
-                    }
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAlyWETY= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAk684gk= name=pytorch number=90791 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "70711ab89515aa4515ce60d3c29a04dbdba8e06e",
-                  "checkSuites": {
-                    "nodes": [
-                      {
-                        "checkRuns": {
-                          "nodes": [
-                            {
-                              "name": "linux-focal-py3.7-clang7-asan / test (functorch, 1, 1, linux.2xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298117670"
-                            },
-                            {
-                              "name": "linux-bionic-py3_7-clang8-xla / filter",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298123873"
-                            },
-                            {
-                              "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298130231"
-                            },
-                            {
-                              "name": "win-vs2019-cpu-py3 / filter",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298216660"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / filter",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298218524"
-                            },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298223405"
-                            },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298223604"
-                            },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (functorch, 1, 1, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298223779"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225106"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225234"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225373"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225516"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225636"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 2, 3, linux.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225752"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 3, 3, linux.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225878"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298226024"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298226177"
-                            }
-                          ],
-                          "pageInfo": {
-                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyYNZQ=",
-                            "hasNextPage": false
-                          }
-                        }
-                      }
-                    ]
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=91340 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "tugsbayasgalan"
-          },
-          "title": "Symintify pytorch slicing logic",
-          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):\n* __->__ #91340\n\nDifferential Revision: [D42398023](https://our.internmc.facebook.com/intern/diff/D42398023)",
-          "headRefName": "gh/tugsbayasgalan/86/head",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "gh/tugsbayasgalan/86/base",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "ae8889feecb96f0ba0a7ad9888dae340f21487de"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "88ac30a6fbfc65012deeeb3662d8a9272e191cca"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "99540ebd8bb3f5bff0d90325c35f49290c35cd2d"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "85043a88f6847463a275633be1ccb07eacca93be"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "00ed45052b95d64051d0cca228cecad40f2e45ae"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "aeba29c8272975c0c25c40d395f5c8e9952f42a0"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "0691dc8b2a96860dadc6d5fd47487933ed69d13d"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "7052a80984320c7f74a26ab0cbeb683d71835f05"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "8555d264c5aa18a0e3f609bdb21889f3600de85d"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "4bd8ffe4d985250e0fb3f71dc7046859620386ca"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "a6d53387bb92ce42f002a270bac73468e7ad2b0d"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "842377100ffcb2ba4d69775f9d91812d6d4fce9f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "5db8aa548077f0a3e32150951aac8b7b2d910102"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "acdb2d71b7bcbc31f7192fb7025799009e406d1e"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "92e13828c1a6095a0e117f0a048201b84ccdb0dd"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "3d9bb36d7871dc528b4dd1d8526720768287327b"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "1cdcd7ea89a58bfee14d32e78ca2104e14124fb5"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "tugsbayasgalan"
-                    },
-                    "email": "tmanlaibaatar@fb.com",
-                    "name": "Tugsbayasgalan Manlaibaatar"
-                  },
-                  "oid": "18a466ebc23ed04879972cfd0a2fb3d85c3895f7"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MTg",
-              "hasNextPage": false
-            },
-            "totalCount": 18
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Meta Internal-Only Changes Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://opensource.facebook.com/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIk8lw=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6VI="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Netlify",
-                            "databaseId": 13473
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6Vg="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Azure Pipelines",
-                            "databaseId": 9426
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6Vw="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Dependabot",
-                            "databaseId": 29110
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6WM="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Codecov",
-                            "databaseId": 254
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6Wo="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "PyTorch Bot",
-                            "databaseId": 40112
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6XM="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "CircleCI Checks",
-                            "databaseId": 18001
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6Xc="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Labeler"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864512812"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "triage",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512812/jobs/6587338912"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIUHWY=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6no="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864512853"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512853/jobs/6587339023"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIUHf4=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6uw="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864512861"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "workflow-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587338996"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339034"
-                              },
-                              {
-                                "name": "Test tools",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339070"
-                              },
-                              {
-                                "name": "lintrunner",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339110"
-                              },
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339139"
-                              },
-                              {
-                                "name": "Test collect_env (with_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339176"
-                              },
-                              {
-                                "name": "Test collect_env (without_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339209"
-                              },
-                              {
-                                "name": "Test collect_env (older_python_version)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339236"
-                              },
-                              {
-                                "name": "pr-sanity-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339268"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIUH1c=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6u4="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  },
-                  "status": {
-                    "contexts": [
-                      {
-                        "context": "EasyCLA",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://easycla.lfx.linuxfoundation.org/#/?version=2"
-                      }
-                    ]
-                  },
-                  "pushedDate": "2023-01-08T00:07:00Z",
-                  "oid": "18a466ebc23ed04879972cfd0a2fb3d85c3895f7"
-                }
-              }
-            ]
-          },
-          "changedFiles": 4,
-          "files": {
-            "nodes": [
-              {
-                "path": "aten/src/ATen/TensorIndexing.h"
-              },
-              {
-                "path": "c10/core/SymInt.h"
-              },
-              {
-                "path": "torch/csrc/autograd/python_variable_indexing.cpp"
-              },
-              {
-                "path": "torch/csrc/autograd/python_variable_indexing.h"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "NA",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
-              {
-                "author": {
-                  "login": "Skylion007"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "Skylion007"
-                },
-                "state": "CHANGES_REQUESTED"
-              },
-              {
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "APPROVED"
-              },
-              {
-                "author": {
-                  "login": "Skylion007"
-                },
-                "state": "APPROVED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "Skylion007"
-                },
-                "state": "COMMENTED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0xMi0yM1QxMjoxOToxNy0wODowMLkyMDIyLTEyLTIzVDEyOjE5OjE2LTA4OjAwzklG9o4=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "@tugsbayasgalan your PR has been successfully reverted.",
-                "createdAt": "2023-01-05T17:14:54Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1372498362
-              },
-              {
-                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2023-01-07T01:57:54Z",
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1374346186
-              },
-              {
-                "bodyText": "Rebased gh/tugsbayasgalan/87/orig onto refs/remotes/origin/viable/strict because #91341 was rebased, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/91340)",
-                "createdAt": "2023-01-07T10:17:26Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1374432230
-              },
-              {
-                "bodyText": "@pytorchbot merge -f \"Landed internally\"",
-                "createdAt": "2023-01-08T22:50:06Z",
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1374948938
-              },
-              {
-                "bodyText": "Merge started\nYour change will be merged immediately since you used the force (-f) flag, bypassing any CI checks (ETA: 1-5 minutes).\nLearn more about merging in the wiki.\nQuestions? Feedback? Please reach out to the PyTorch DevX TeamAdvanced Debugging\nCheck the merge workflow status\nhere",
-                "createdAt": "2023-01-08T22:51:38Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1374949218
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOUc6pug==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "Merged"
-                }
-              },
-              {
-                "node": {
-                  "name": "Reverted"
-                }
-              },
-              {
-                "node": {
-                  "name": "ciflow/trunk"
-                }
-              },
-              {
-                "node": {
-                  "name": "topic: not user facing"
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOUc6pug== name=pytorch number=91340 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "\ud83d\udd17 Helpful Links\n\ud83e\uddea See artifacts and rendered test results at hud.pytorch.org/pr/91340\n\n\ud83d\udcc4 Preview Python docs built from this PR\n\ud83d\udcc4 Preview C++ docs built from this PR\n\u2753 Need help or want to give feedback on the CI? Visit the bot commands wiki or our office hours\n\nNote: Links to docs will display an error until the docs builds have been completed.\n\u2705 No Failures\nAs of commit 18a466e:\n\ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a\nThis comment was automatically generated by Dr. CI and updates every 15 minutes.",
-                "createdAt": "2022-12-23T00:37:54Z",
-                "author": {
-                  "login": "pytorch-bot"
-                },
-                "authorAssociation": "NONE",
-                "editor": {
-                  "login": "pytorch-bot"
-                },
-                "databaseId": 1363473085
-              },
-              {
-                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2022-12-23T00:40:19Z",
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1363474061
-              },
-              {
-                "bodyText": "@pytorchbot rebase",
-                "createdAt": "2022-12-23T07:30:45Z",
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1363693611
-              },
-              {
-                "bodyText": "@pytorchbot successfully started a rebase job. Check the current status here",
-                "createdAt": "2022-12-23T07:32:50Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1363694709
-              },
-              {
-                "bodyText": "Rebase failed due to\nRaised by https://github.com/pytorch/pytorch/actions/runs/3764003479",
-                "createdAt": "2022-12-23T07:33:01Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1363694807
-              },
-              {
-                "bodyText": "Rebased gh/tugsbayasgalan/87/orig onto refs/remotes/origin/viable/strict because #91341 was rebased, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/91340)",
-                "createdAt": "2022-12-23T07:33:06Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1363694844
-              },
-              {
-                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2022-12-26T05:57:30Z",
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1364912846
-              },
-              {
-                "bodyText": "Does this need testing changes? or new tests?",
-                "createdAt": "2023-01-03T19:01:39Z",
-                "author": {
-                  "login": "voznesenskym"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1370121847
-              },
-              {
-                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2023-01-03T19:52:38Z",
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1370165547
-              },
-              {
-                "bodyText": "@voznesenskym pytorch itself has very comprehensive testing suite for slicing logic, so i think as long as CI is green, it should be good.",
-                "createdAt": "2023-01-03T19:54:35Z",
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1370167103
-              },
-              {
-                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2023-01-03T23:45:05Z",
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1370335952
-              },
-              {
-                "bodyText": "@pytorchbot rebase",
-                "createdAt": "2023-01-04T01:28:56Z",
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1370391232
-              },
-              {
-                "bodyText": "@pytorchbot successfully started a rebase job. Check the current status here",
-                "createdAt": "2023-01-04T01:30:51Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1370391970
-              },
-              {
-                "bodyText": "Successfully rebased gh/tugsbayasgalan/86/orig onto refs/remotes/origin/viable/strict, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/91340)",
-                "createdAt": "2023-01-04T01:31:08Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1370392083
-              },
-              {
-                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2023-01-04T19:19:45Z",
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1371323220
-              },
-              {
-                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2023-01-04T20:27:49Z",
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1371385625
-              },
-              {
-                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2023-01-04T20:53:28Z",
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1371406675
-              },
-              {
-                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2023-01-04T22:11:06Z",
-                "author": {
-                  "login": "tugsbayasgalan"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1371489068
-              },
-              {
-                "bodyText": "@pytorchbot merge\n(Initiating merge automatically since Phabricator Diff has merged)",
-                "createdAt": "2023-01-05T10:30:00Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1372040514
-              },
-              {
-                "bodyText": "Merge started\nYour change will be merged once all checks pass (ETA 0-4 Hours).\nLearn more about merging in the wiki.\nQuestions? Feedback? Please reach out to the PyTorch DevX TeamAdvanced Debugging\nCheck the merge workflow status\nhere",
-                "createdAt": "2023-01-05T10:33:34Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1372044055
-              },
-              {
-                "bodyText": "@pytorchbot revert -m \"breaking mac builds https://hud.pytorch.org/pytorch/pytorch/commit/8c172fa98a52e95675e9425ac4b23f190f53f9ed https://github.com/pytorch/pytorch/actions/runs/3845932024/jobs/6550654339, marking this as weird because it was merged via codev?\" -c weird",
-                "createdAt": "2023-01-05T17:13:04Z",
-                "author": {
-                  "login": "clee2000"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1372496233
-              },
-              {
-                "bodyText": "@pytorchbot successfully started a revert job. Check the current status here.\nQuestions? Feedback? Please reach out to the PyTorch DevX Team",
-                "createdAt": "2023-01-05T17:14:44Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1372498188
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOUUTyvQ==",
-              "hasPreviousPage": false
-            }
-          }
-        }
-      }
-    }
-  },
-  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAmJq6u4= name=pytorch number=91340 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQk="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3-clang5-mobile-custom-build-static"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602974"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQo="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602977"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602977/jobs/2839950658"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObTk=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQs="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-docs"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602976"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQ4="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "win-vs2019-cuda11.3-py3"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602978"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQ8="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-cuda11.3-py3.7-gcc7"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602979"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602979/jobs/2839950630"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602979/jobs/2840213785"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602979/jobs/2840213832"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602979/jobs/2840213866"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqUJII=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pytorch-xla-linux-bionic-py3.7-clang8"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602981"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRI="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  }
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcRI= name=pytorch number=73811 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
+            "nodes": [
               {
                 "commit": {
-                  "oid": "18a466ebc23ed04879972cfd0a2fb3d85c3895f7",
+                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7",
                   "checkSuites": {
                     "edges": [
                       {
@@ -22812,26 +2541,110 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "Check Labels"
+                              "name": "linux-xenial-py3.7-gcc5.4"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602982"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRM="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-vulkan-bionic-py3.7-clang9"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602983"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRU="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-bionic-py3.7-clang9"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602984"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602984/jobs/2839950624"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602984/jobs/2840021854"
+                              },
+                              {
+                                "name": "test (noarch, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602984/jobs/2840021946"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602984/jobs/2840021988"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqP_28=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRc="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc7-no-ops"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864512856"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602985"
                           },
                           "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Check labels",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512856/jobs/6587338995"
-                              }
-                            ],
+                            "nodes": [],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIUHds=",
+                              "endCursor": null,
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "SUCCESS"
+                          "conclusion": "CANCELLED"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6u8="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRo="
                       },
                       {
                         "node": {
@@ -22841,271 +2654,149 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "pull"
+                              "name": "linux-xenial-py3.7-clang7-onnx"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864512865"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602988"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "linux-focal-py3-clang7-mobile-custom-build-static / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415492"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415532"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415589"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.10-gcc7-bazel-test / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415644"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415726"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415784"
-                              },
-                              {
-                                "name": "linux-bionic-py3_7-clang8-xla / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415826"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.3-py3.8 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415854"
-                              },
-                              {
-                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415903"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415937"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7-pch / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415960"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415997"
-                              },
-                              {
-                                "name": "linux-focal-py3-clang7-mobile-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416037"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416078"
-                              },
-                              {
-                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416114"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416153"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416206"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416247"
-                              },
-                              {
-                                "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416281"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416485"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416517"
-                              },
-                              {
-                                "name": "linux-bionic-py3_7-clang8-xla / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416556"
-                              },
-                              {
-                                "name": "linux-docs / build-docs-cpp-false",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416590"
-                              },
-                              {
-                                "name": "linux-docs / build-docs-python-false",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416626"
-                              },
-                              {
-                                "name": "linux-docs / build-docs-functorch-false",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416652"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416705"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416738"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416778"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416806"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416852"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416996"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417029"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417053"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417086"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417117"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602988/jobs/2839950656"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "name": "test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417151"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602988/jobs/2840031185"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "name": "test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417179"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602988/jobs/2840031288"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQMyA=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRs="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-clang7-asan"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602989"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417205"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602989/jobs/2839950625"
                               },
                               {
-                                "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417239"
+                                "name": "test (default, 3, 3, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602989/jobs/2840042498"
                               },
                               {
-                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "name": "test (default, 1, 3, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417275"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602989/jobs/2840042534"
                               },
                               {
-                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "name": "test (default, 2, 3, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417300"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602989/jobs/2840042646"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQcpA=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRw="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602990"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "cmakelint",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417337"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839950650"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "clang-format",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417365"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839950743"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "name": "clang-tidy",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417394"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839950808"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "name": "flake8-py3",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417410"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839950884"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
+                                "name": "quick-checks",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417443"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839950992"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
+                                "name": "mypy",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417475"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839951037"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
+                                "name": "py2-setup-validate-errormsg",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417521"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839951085"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "name": "shellcheck",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417564"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839951170"
                               },
                               {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
+                                "name": "toc",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417601"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602990/jobs/2839951266"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnInHI8=",
-                              "hasNextPage": true
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcU4=",
+                              "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6v0="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcR4="
                       },
                       {
                         "node": {
@@ -23115,31 +2806,26 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "windows-binary-libtorch-debug"
+                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864513095"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602993"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "libtorch-cpu-shared-with-deps-debug-build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513095/jobs/6587342116"
-                              },
-                              {
-                                "name": "libtorch-cpu-shared-with-deps-debug-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513095/jobs/6587939020"
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602993/jobs/2839950562"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIerac=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObKc=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "SUCCESS"
+                          "conclusion": "SKIPPED"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq7UQ="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcR8="
                       },
                       {
                         "node": {
@@ -23149,31 +2835,101 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "windows-binary-libtorch-release"
+                              "name": "linux-xenial-cuda11.3-py3.7-gcc7"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864513096"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602992"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSE="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602991"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSI="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  }
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcSI= name=pytorch number=73811 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7",
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pytorch-xla-linux-bionic-py3.7-clang8"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602994"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "libtorch-cpu-shared-with-deps-release-build",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513096/jobs/6587339456"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602994/jobs/2839950655"
                               },
                               {
-                                "name": "libtorch-cpu-shared-with-deps-release-test",
+                                "name": "test (xla, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513096/jobs/6587642833"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602994/jobs/2840047401"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIZcgM=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQjCM=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq7UU="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSM="
                       },
                       {
                         "node": {
@@ -23183,31 +2939,41 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-binary-manywheel"
+                              "name": "win-vs2019-cuda11.3-py3"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864513132"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602996"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "manywheel-py3_7-cuda11_6-build / build",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513132/jobs/6587344127"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602996/jobs/2839950632"
                               },
                               {
-                                "name": "manywheel-py3_7-cuda11_6-test / test",
+                                "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602996/jobs/2840239369"
+                              },
+                              {
+                                "name": "test (force_on_cpu, 1, 1, windows.4xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602996/jobs/2840239408"
+                              },
+                              {
+                                "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513132/jobs/6588050173"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602996/jobs/2840239445"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIgpUU=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqUs2w=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "SUCCESS"
+                          "conclusion": "FAILURE"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq7Ys="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSQ="
                       },
                       {
                         "node": {
@@ -23217,31 +2983,26 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-binary-libtorch-pre-cxx11"
+                              "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864513134"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602998"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "libtorch-cpu-shared-with-deps-pre-cxx11-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513134/jobs/6587339538"
-                              },
-                              {
-                                "name": "libtorch-cpu-shared-with-deps-pre-cxx11-test / test",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513134/jobs/6587614329"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602998/jobs/2839950621"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIY81E=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObQs=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq7Yw="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSU="
                       },
                       {
                         "node": {
@@ -23251,31 +3012,55 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-binary-libtorch-cxx11-abi"
+                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864513133"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602997"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build",
+                                "name": "build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513133/jobs/6587339544"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602997/jobs/2839950665"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObUI=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSY="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603001"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / test",
+                                "name": "build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513133/jobs/6587579045"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603001/jobs/2839950648"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIYVKs=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObSk=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq7Y0="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSc="
                       },
                       {
                         "node": {
@@ -23285,488 +3070,182 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "trunk"
+                              "name": "linux-vulkan-bionic-py3.7-clang9"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864513136"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603002"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "android-emulator-build-test / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587375890"
-                              },
-                              {
-                                "name": "libtorch-linux-bionic-cuda11.6-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587375971"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-tsan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376023"
-                              },
-                              {
-                                "name": "ios-12-5-1-x86-64 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376090"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.3-py3.8 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376141"
-                              },
-                              {
-                                "name": "pytorch-linux-focal-py3-clang7-android-ndk-r19c-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376183"
-                              },
-                              {
-                                "name": "macos-12-py3-x86-64-lite-interpreter / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376247"
-                              },
-                              {
-                                "name": "macos-12-py3-arm64 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376285"
-                              },
-                              {
-                                "name": "cuda11.6-py3.10-gcc7-sm86 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376325"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376368"
-                              },
-                              {
-                                "name": "parallelnative-linux-focal-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376420"
-                              },
-                              {
-                                "name": "macos-12-py3-x86-64 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376474"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376524"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376577"
-                              },
-                              {
-                                "name": "caffe2-linux-focal-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376647"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9-slow / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376697"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-tsan / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587466558"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9-slow / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587466800"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9-slow / test (slow, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587470226"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-tsan / test (tsan, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587472364"
-                              },
-                              {
-                                "name": "parallelnative-linux-focal-py3.7-gcc7 / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587514019"
-                              },
-                              {
-                                "name": "parallelnative-linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587516320"
-                              },
-                              {
-                                "name": "parallelnative-linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587516365"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.3-py3.8 / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587527524"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.3-py3.8 / test (default, 1, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587530460"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.3-py3.8 / test (default, 2, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587530531"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587540455"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542564"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542599"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542630"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542674"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542727"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (nogpu_AVX512, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542772"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (nogpu_NO_AVX2, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542805"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (jit_legacy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542846"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542879"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 2, 3, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542911"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 3, 3, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542950"
-                              },
-                              {
-                                "name": "cuda11.6-py3.10-gcc7-sm86 / filter",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587545736"
-                              },
-                              {
-                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (default, 1, 4, linux.g5.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548567"
-                              },
-                              {
-                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (default, 2, 4, linux.g5.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548593"
-                              },
-                              {
-                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (default, 3, 4, linux.g5.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548643"
-                              },
-                              {
-                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (default, 4, 4, linux.g5.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548672"
-                              },
-                              {
-                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (slow, 1, 2, linux.g5.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548710"
-                              },
-                              {
-                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (slow, 2, 2, linux.g5.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548730"
-                              },
-                              {
-                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (functorch, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548761"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603002/jobs/2839950741"
                               },
                               {
-                                "name": "macos-12-py3-arm64 / filter",
+                                "name": "test (default, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587781241"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603002/jobs/2840029810"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQKq4=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSg="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-docs"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603000"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "macos-12-py3-arm64-mps / Run MPS tests",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587781320"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603000/jobs/2839950661"
                               },
                               {
-                                "name": "macos-12-py3-arm64 / test (default, 1, 2, macos-m1-12)",
+                                "name": "build-docs (cpp)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587784438"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603000/jobs/2840023513"
                               },
                               {
-                                "name": "macos-12-py3-arm64 / test (default, 2, 2, macos-m1-12)",
+                                "name": "build-docs (python)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587784531"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603000/jobs/2840023552"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIb-Fc=",
-                              "hasNextPage": true
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQCGQ=",
+                              "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq7ZM="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": false
-                    }
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAnInHI8= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAmJq6u8= name=pytorch number=91340 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "18a466ebc23ed04879972cfd0a2fb3d85c3895f7",
-                  "checkSuites": {
-                    "nodes": [
-                      {
-                        "checkRuns": {
-                          "nodes": [
-                            {
-                              "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417631"
-                            },
-                            {
-                              "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417664"
-                            },
-                            {
-                              "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417705"
-                            },
-                            {
-                              "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417734"
-                            },
-                            {
-                              "name": "linux-focal-py3.7-clang7-asan / test (functorch, 1, 1, linux.2xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417775"
-                            },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417817"
-                            },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417859"
-                            },
-                            {
-                              "name": "win-vs2019-cpu-py3 / test (functorch, 1, 1, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417907"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418062"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418100"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418127"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418163"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418200"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 2, 3, linux.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418228"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 3, 3, linux.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418252"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418285"
-                            },
-                            {
-                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418317"
-                            }
-                          ],
-                          "pageInfo": {
-                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnInH7M=",
-                            "hasNextPage": false
-                          }
-                        }
-                      }
-                    ]
-                  }
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAnIb-Fc= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAmJq7Y0= name=pytorch number=91340 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "oid": "18a466ebc23ed04879972cfd0a2fb3d85c3895f7",
-                  "checkSuites": {
-                    "nodes": [
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSk="
+                      },
                       {
-                        "checkRuns": {
-                          "nodes": [
-                            {
-                              "name": "macos-12-py3-arm64 / test (functorch, 1, 1, macos-m1-12)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587784596"
-                            },
-                            {
-                              "name": "win-vs2019-cuda11.6-py3 / filter",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587796241"
-                            },
-                            {
-                              "name": "win-vs2019-cuda11.6-py3 / test (default, 1, 5, windows.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587798805"
-                            },
-                            {
-                              "name": "win-vs2019-cuda11.6-py3 / test (default, 2, 5, windows.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587798838"
-                            },
-                            {
-                              "name": "win-vs2019-cuda11.6-py3 / test (default, 3, 5, windows.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587798865"
-                            },
-                            {
-                              "name": "win-vs2019-cuda11.6-py3 / test (default, 4, 5, windows.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587798903"
-                            },
-                            {
-                              "name": "win-vs2019-cuda11.6-py3 / test (default, 5, 5, windows.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587798942"
-                            },
-                            {
-                              "name": "win-vs2019-cuda11.6-py3 / test (functorch, 1, 1, windows.8xlarge.nvidia.gpu)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587798976"
-                            },
-                            {
-                              "name": "win-vs2019-cuda11.6-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587799010"
-                            },
-                            {
-                              "name": "macos-12-py3-x86-64 / filter",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587834238"
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-bionic-rocm4.5-py3.7"
                             },
-                            {
-                              "name": "macos-12-py3-x86-64 / test (default, 1, 2, macos-12)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587836679"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603003"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603003/jobs/2839950637"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603003/jobs/2840068586"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603003/jobs/2840068671"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqRADE=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSw="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3-clang5-mobile-build"
                             },
-                            {
-                              "name": "macos-12-py3-x86-64 / test (default, 2, 2, macos-12)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587836820"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603004"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603004/jobs/2839950560"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObKU=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcS0="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "win-vs2019-cpu-py3"
                             },
-                            {
-                              "name": "macos-12-py3-x86-64 / test (functorch, 1, 1, macos-12)",
-                              "conclusion": "SUCCESS",
-                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587836879"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603005"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603005/jobs/2839950626"
+                              },
+                              {
+                                "name": "test (default, 2, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603005/jobs/2840145642"
+                              },
+                              {
+                                "name": "test (default, 1, 2, windows.4xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603005/jobs/2840145755"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqSq34=",
+                              "hasNextPage": false
                             }
-                          ],
-                          "pageInfo": {
-                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIc5ZE=",
-                            "hasNextPage": false
-                          }
-                        }
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcS8="
                       }
-                    ]
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
                   }
                 }
               }
@@ -23776,79 +3255,15 @@
       }
     }
   },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=82169 owner=pytorch": {
+  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcS8= name=pytorch number=73811 owner=pytorch": {
     "data": {
       "repository": {
         "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "ezyang"
-          },
-          "title": "Move test_dtypes so it runs later",
-          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):\n* __->__ #82169\n\nThe error messages it gives are very unhelpful (because a failure\ngets translated into \"dtype was not supported\" rather than the\nactual backtrace), so I'd rather get error messages about this after\nI've tested basic functionality.\n\nSigned-off-by: Edward Z. Yang <ezyang@fb.com>",
-          "headRefName": "gh/ezyang/1279/head",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "gh/ezyang/1279/base",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "cef34da55a59da5a32494bff218ccd4978b659d3"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "83ad7e73a07111ac1d85e931d14360cc22c01edd"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "28140e4008289251b695385acfb48ac7a47cd49c"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "Mw",
-              "hasNextPage": false
-            },
-            "totalCount": 3
-          },
           "commits": {
             "nodes": [
               {
                 "commit": {
+                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7",
                   "checkSuites": {
                     "edges": [
                       {
@@ -23859,61 +3274,56 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "Lint"
+                              "name": "linux-xenial-py3.7-gcc5.4"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747823981"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603007"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "lintrunner",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310707890"
-                              },
-                              {
-                                "name": "Test collect_env (with_torch)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708140"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2839950666"
                               },
                               {
-                                "name": "Test collect_env (without_torch)",
+                                "name": "test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708223"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2840025927"
                               },
                               {
-                                "name": "Test collect_env (older_python_version)",
+                                "name": "test (distributed, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708332"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2840025995"
                               },
                               {
-                                "name": "quick-checks",
+                                "name": "test (backwards_compat, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708496"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2840026086"
                               },
                               {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708710"
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2840026134"
                               },
                               {
-                                "name": "Test tools",
+                                "name": "test (jit_legacy, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708937"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2840026235"
                               },
                               {
-                                "name": "workflow-checks",
+                                "name": "test (docs_test, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310709169"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603007/jobs/2840026282"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGj1lc=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQFvU=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "SUCCESS"
+                          "conclusion": "FAILURE"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc8k="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcTE="
                       },
                       {
                         "node": {
@@ -23923,9 +3333,9 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                              "name": "linux-xenial-py3.7-gcc7"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747823979"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603009"
                           },
                           "checkRuns": {
                             "nodes": [],
@@ -23936,7 +3346,7 @@
                           },
                           "conclusion": "CANCELLED"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc8s="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcTU="
                       },
                       {
                         "node": {
@@ -23946,26 +3356,49 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                              "name": "linux-bionic-rocm4.5-py3.7"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747823982"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603010"
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcTg="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Test tools"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603012"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823982/jobs/4310707884"
+                                "name": "test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603012/jobs/2839950623"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGjz0w=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObQ4=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "SKIPPED"
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc9A="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcT0="
                       },
                       {
                         "node": {
@@ -23975,20 +3408,26 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "Lint"
+                              "name": "linux-xenial-py3-clang5-mobile-custom-build-static"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747823980"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603013"
                           },
                           "checkRuns": {
-                            "nodes": [],
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603013/jobs/2839950631"
+                              }
+                            ],
                             "pageInfo": {
-                              "endCursor": null,
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObRg=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "CANCELLED"
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc9Q="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcT8="
                       },
                       {
                         "node": {
@@ -23998,20 +3437,26 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "pull"
+                              "name": "macos-10-15-py3-arm64"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747824002"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603251"
                           },
                           "checkRuns": {
-                            "nodes": [],
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603251/jobs/2839951040"
+                              }
+                            ],
                             "pageInfo": {
-                              "endCursor": null,
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcA8=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "CANCELLED"
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdAs="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCc_k="
                       },
                       {
                         "node": {
@@ -24021,2922 +3466,7707 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "pull"
+                              "name": "ios-12-5-1-arm64-coreml"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2747824048"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603253"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708487"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708713"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708942"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709174"
-                              },
-                              {
-                                "name": "linux-bionic-py3_7-clang8-xla / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709340"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709579"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709844"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710003"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710175"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.6-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710516"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710716"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710890"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711097"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711234"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711429"
-                              },
-                              {
-                                "name": "linux-focal-rocm5.2-py3.7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711603"
-                              },
-                              {
-                                "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711765"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711946"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11_3-py3_7-gcc7-deploy / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310712129"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310712276"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194495"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194591"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194659"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194749"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194858"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194934"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311195003"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311220458"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311220540"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (cpp)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311222725"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (python)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311222869"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223128"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223225"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223324"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223396"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223496"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223569"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223690"
-                              },
-                              {
-                                "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311224360"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311230050"
-                              },
-                              {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311301930"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603253/jobs/2839951038"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcA4=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCc_w="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "ios-12-5-1-arm64"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603254"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302152"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603254/jobs/2839951030"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcAc=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCc_0="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "macos-11-py3-x86-64"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603255"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302303"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603255/jobs/2839951034"
                               },
                               {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
+                                "name": "test (default, 1, 2, macos-11)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302433"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603255/jobs/2840127016"
                               },
                               {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302531"
-                              },
+                                "name": "test (default, 2, 2, macos-11)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603255/jobs/2840127073"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqSQ2M=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCc_4="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "ios-12-5-1-arm64-custom-ops"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603256"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491082"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603256/jobs/2839951041"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcBE=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdAA="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  }
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCdAA= name=pytorch number=73811 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7",
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "ios-12-5-1-x86-64-coreml"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603259"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491172"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603259/jobs/2839951039"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcA0=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdAM="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "ios-12-5-1-arm64-metal"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603261"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491232"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603261/jobs/2839951042"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcBA=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdAU="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "macos-10-15-py3-lite-interpreter-x86-64"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603264"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491289"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603264/jobs/2839951036"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcAs=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdAk="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "ios-12-5-1-x86-64"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983603269"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491348"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983603269/jobs/2839951029"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcG0YME=",
-                              "hasNextPage": true
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcAQ=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdBE="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Netlify",
+                            "databaseId": 13473
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCddM="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Azure Pipelines",
+                            "databaseId": 9426
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
                             }
                           },
-                          "conclusion": "SUCCESS"
+                          "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdIQ="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCddw="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Dependabot",
+                            "databaseId": 29110
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdeI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Codecov",
+                            "databaseId": 254
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdeY="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "PyTorch Bot",
+                            "databaseId": 40112
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdes="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  }
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=31093 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": true,
+          "author": {
+            "login": "mingxiaoh"
+          },
+          "title": "improve mkldnn convolution test coverage",
+          "body": "This pr will improve the test coverage of mkldnn convolution.\r\n1.test input: specific sensitive numbers\r\n2.pass criteria: output of mkldnn convolution matches output of thnn convolution\r\n3.coverage: by using coverage tool, we found out the following sensitive parameters. Overall the case will test 4352 patterns, takes 8.8s on my machine.\r\n\r\nto run the test case:\r\n\r\npython test_mkldnn_conv2d_ext.py\r\nor\r\npython run_test.py -i mkldnn_conv2d_ext\r\n\r\nIn case of failure, the pattern will be printed in the log for further debugging.\r\n\r\nactually, this PR is created to replace and improve that PR we created before(https://github.com/pytorch/pytorch/pull/25085) ",
+          "headRefName": "master",
+          "headRepository": {
+            "nameWithOwner": "mingxiaoh/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "11pikachu"
+                    },
+                    "email": "junx.du@intel.com",
+                    "name": "dujun"
+                  },
+                  "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MQ",
+              "hasNextPage": false
+            },
+            "totalCount": 1
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  },
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406538?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406947?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406544?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406931?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_debug_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406550?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_debug_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406887?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_release_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406526?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_release_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406707?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: caffe2_onnx_main_py3_6_clang7_ubuntu16_04_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406533?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: caffe2_onnx_main_py3_6_clang7_ubuntu16_04_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407256?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: caffe2_onnx_ort1_py3_6_clang7_ubuntu16_04_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407254?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: caffe2_onnx_ort2_py3_6_clang7_ubuntu16_04_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407255?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406556?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda10.2-cudnn7-py3.8-gcc9",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406532?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda11.0-cudnn8-py3.6-gcc9",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406527?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda11.0-cudnn8-py3.8-gcc9",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406553?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-py3.6-clang9",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406537?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-py3.8-gcc9",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406529?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-rocm3.5.1-py3.6",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406554?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-rocm3.7-py3.6",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406545?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406543?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406536?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406552?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
                       },
                       {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGjyQg=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdMA="
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406535?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
                       },
                       {
-                        "node": {
-                          "app": {
-                            "name": "Netlify",
-                            "databaseId": 13473
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdeE="
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406540?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406528?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406541?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-asan",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406549?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-clang7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406555?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc4.8",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406546?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc5.4",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406531?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406534?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc7.2",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406523?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.8",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406539?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-rocm3.3-py3.6",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406547?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-rocm3.5.1-py3.6",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406551?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407209?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406611?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_bazel_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406607?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_bazel_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406984?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_cpp_doc_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407013?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_doc_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407011?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_ios_11_2_1_x86_64_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406548?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406563?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408680?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_backward_compatibility_check_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407014?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_bionic_py3_6_clang9_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406567?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_bionic_py3_6_clang9_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406945?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_bionic_py3_8_gcc9_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406561?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_bionic_py3_8_gcc9_coverage_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407422?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_bionic_rocm3_7_py3_6_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406562?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406612?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408107?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_legacy_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408111?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_profiling_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408101?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc5_4_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406613?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406565?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_legacy_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407017?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_profiling_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407019?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407012?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407016?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_vulkan_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406608?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406609?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406606?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_test1",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407435?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_test2",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407436?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_mobile_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406605?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
                       },
                       {
-                        "node": {
-                          "app": {
-                            "name": "Azure Pipelines",
-                            "databaseId": 9426
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdfU="
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_mobile_custom_build_dynamic",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406610?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
                       },
                       {
-                        "node": {
-                          "app": {
-                            "name": "Dependabot",
-                            "databaseId": 29110
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdgg="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  },
-                  "status": null,
-                  "pushedDate": "2022-07-27T15:34:17Z",
-                  "oid": "28140e4008289251b695385acfb48ac7a47cd49c"
-                }
-              }
-            ]
-          },
-          "changedFiles": 1,
-          "files": {
-            "nodes": [
-              {
-                "path": "test/test_ops.py"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MQ",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
-              {
-                "author": {
-                  "login": "zou3519"
-                },
-                "state": "APPROVED"
-              },
-              {
-                "author": {
-                  "login": "Chillee"
-                },
-                "state": "APPROVED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNy0yNVQxNDo0NTozNS0wNzowMLkyMDIyLTA3LTI1VDE0OjQ1OjM1LTA3OjAwzj6XYmg=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
-              {
-                "bodyText": "@pytorchbot merge -f FORCE",
-                "createdAt": "2022-07-27T17:56:43Z",
-                "author": {
-                  "login": "malfet"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1197107402
-              },
-              {
-                "bodyText": "You need to provide a reason for using force merge, in the format @pytorchbot merge -f '[CATEGORY] Explanation'. With [CATEGORY] being one the following:\nEMERGENCY - an emergency fix to quickly address an issue\nMINOR - a minor fix such as cleaning locally unused variables, which shouldn't break anything\nPRE_TESTED - a previous CI run tested everything and you've only added minor changes like fixing lint\nOTHER - something not covered above",
-                "createdAt": "2022-07-27T17:56:45Z",
-                "author": {
-                  "login": "pytorch-bot"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1197107439
-              },
-              {
-                "bodyText": "@pytorchbot merge -f \"[OTHER] normal land failed twice already\"",
-                "createdAt": "2022-07-27T17:57:28Z",
-                "author": {
-                  "login": "malfet"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1197108130
-              },
-              {
-                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here",
-                "createdAt": "2022-07-27T18:08:13Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1197119348
-              },
-              {
-                "bodyText": "Hey @ezyang.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-07-27T18:08:58Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1197120095
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOR1poyg==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "Merged"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=73811 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "seemethere"
-          },
-          "title": "ci: Migrate metrics credentials to managed IAM",
-          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* __->__ #73811\n\r\nMigrates our credentials to upload metrics statistics to managed IAM\r\ncredentials in order to make it easier to know where the credentials are\r\ncoming from and to make it easier to add more permissions / less\r\npermissions later on.\r\n\r\nRelates to work done in [D34535827](https://www.internalfb.com/diff/D34535827)\r\n\r\nSigned-off-by: Eli Uriegas <eliuriegas@fb.com>",
-          "headRefName": "gh/seemethere/215/head",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "gh/seemethere/215/base",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "seemethere"
-                    },
-                    "email": "eliuriegas@fb.com",
-                    "name": "Eli Uriegas"
-                  },
-                  "oid": "13c44d16a876a56bca479b4cf30715d21fa16e99"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "seemethere"
-                    },
-                    "email": "eliuriegas@fb.com",
-                    "name": "Eli Uriegas"
-                  },
-                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "Mg",
-              "hasNextPage": false
-            },
-            "totalCount": 2
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [
+                        "context": "ci/circleci: pytorch_macos_10_13_py3_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406525?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
                       {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOaHA=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcBs="
+                        "context": "ci/circleci: pytorch_macos_10_13_py3_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407415?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_python_doc_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407018?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_vulkan_linux_bionic_py3_6_clang9_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406566?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
                       },
                       {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602960"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcPo="
+                        "context": "ci/circleci: pytorch_vulkan_linux_bionic_py3_6_clang9_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406946?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
                       },
                       {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "win-vs2019-cpu-py3"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602961"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcPw="
+                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cpu_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406542?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
                       },
                       {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-build"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602963"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcP4="
+                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406530?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
                       },
                       {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602964"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcP8="
+                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_test1",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407028?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
                       },
                       {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602965"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQE="
+                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_test2",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407027?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
                       },
                       {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602967"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQI="
+                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda11.0_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406524?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
                       },
                       {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc7-no-ops"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602966"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602966/jobs/2839950629"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObRM=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQM="
+                        "context": "ci/circleci: pytorch_xla_linux_bionic_py3_6_clang9_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406572?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
                       },
                       {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Test tools"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602968"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQQ="
+                        "context": "ci/circleci: pytorch_xla_linux_bionic_py3_6_clang9_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407253?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
                       },
                       {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-clang7-asan"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1983602970"
-                          },
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "CANCELLED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQU="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  },
-                  "status": {
-                    "contexts": [
+                        "context": "codecov/patch",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://codecov.io/gh/pytorch/pytorch/compare/69f6d94caa3559d4f50745c26af5df041b83fee8...29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
+                      },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+                        "context": "codecov/project",
                         "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17044969?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "targetUrl": "https://codecov.io/gh/pytorch/pytorch/compare/69f6d94caa3559d4f50745c26af5df041b83fee8...29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
                       },
                       {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
+                        "context": "pr/caffe2-pytorch-linux-bionic-rocm3.7-py3.6-test",
                         "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17045014?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "targetUrl": "https://ci.pytorch.org/jenkins/job/caffe2-builds/job/pytorch-linux-bionic-rocm3.7-py3.6-trigger-test/2319/"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                        "context": "pr/pytorch-linux-bionic-rocm3.7-py3.6",
                         "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17044975?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "targetUrl": "https://ci.pytorch.org/jenkins/job/pytorch-builds/job/pytorch-linux-bionic-rocm3.7-py3.6-trigger/2325/"
                       }
                     ]
                   },
-                  "pushedDate": "2022-03-14T23:01:55Z",
-                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7"
+                  "pushedDate": "2020-09-11T01:58:24Z",
+                  "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
+                }
+              }
+            ]
+          },
+          "changedFiles": 5,
+          "files": {
+            "nodes": [
+              {
+                "path": "test/math_libraries/convolutions.py"
+              },
+              {
+                "path": "test/math_libraries/convolutions_cases/shapes_googlenet_v3.json"
+              },
+              {
+                "path": "test/math_libraries/convolutions_cases/shapes_maskrcnn_p1.json"
+              },
+              {
+                "path": "test/math_libraries/convolutions_cases/shapes_mobilenet.json"
+              },
+              {
+                "path": "test/math_libraries/convolutions_cases/shapes_resnet_50.json"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "NQ",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "CHANGES_REQUESTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "CHANGES_REQUESTED"
+              },
+              {
+                "author": {
+                  "login": "ailzhang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "VitalyFedyunin"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "VitalyFedyunin"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "VitalyFedyunin"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAxOS0xMi0zMFQxMDoxOToxMS0wODowMLkyMDE5LTEyLTMwVDEwOjE5OjExLTA4OjAwzhQZLuY=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "I cloned your repo and ran the tests:\n~/pytorch/test/math_libraries$ python convolutions.py\nFFFF\n======================================================================\nFAIL: test_conv2d_ext_cpu_float32 (__main__.TestConvExtCPU)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n----------------------------------------------------------------------\nRan 4 tests in 33.838s\n\nFAILED (failures=4)\n\nStill fails.\n\n@mruberry  It is suggested by @VitalyFedyunin that, we need to display fail test to avoid invalid inputs, I guess we should set it as expected failures under the pytest test framework, right? we will change it as expected failure cases under pytest test framework. The result will looks like be low, is it ok?\n2500 passed, 136 skipped, 0 failed, 0 errors, 2 expected failures, 0 unexpected passes",
+                "createdAt": "2020-08-14T01:36:20Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "mingxiaoh"
+                },
+                "databaseId": 673816925
+              },
+              {
+                "bodyText": "Displaying tests that fail is fine, but I don't think @VitalyFedyunin meant that it was OK if the tests didn't pass. If these are expected failures then yes, you can use with self.assertRaises(RuntimeError):... when testing them. If you also want to report that the test has test cases with these properties you can print or warn, which will appear in the test output.",
+                "createdAt": "2020-08-14T03:09:37Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 673858224
+              },
+              {
+                "bodyText": "Codecov Report\n\nMerging #31093 into master will not change coverage.\nThe diff coverage is n/a.\n\n\n@@           Coverage Diff           @@\n##           master   #31093   +/-   ##\n=======================================\n  Coverage   68.00%   68.00%           \n=======================================\n  Files         382      382           \n  Lines       49527    49527           \n=======================================\n  Hits        33679    33679           \n  Misses      15848    15848           \n\nContinue to review full report at Codecov.\n\nLegend - Click here to learn more\n\u0394 = absolute <relative> (impact), \u00f8 = not affected, ? = missing data\nPowered by Codecov. Last update 69f6d94...29f6aa6. Read the comment docs.",
+                "createdAt": "2020-09-04T05:41:01Z",
+                "author": {
+                  "login": "codecov"
+                },
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "codecov"
+                },
+                "databaseId": 686921371
+              },
+              {
+                "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale.  Feel free to remove the Stale label if you feel this was a mistake.  If you are unable to remove the Stale label please contact a maintainer in order to do so.  Stale pull requests will automatically be closed 30 days after being marked Stale",
+                "createdAt": "2022-04-12T02:35:37Z",
+                "author": {
+                  "login": "pytorchbot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1095860944
+              },
+              {
+                "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale. Feel free to remove the Stale label if you feel this was a mistake. If you are unable to remove the Stale label please contact a maintainer in order to do so. If you want the bot to never mark this PR stale again, add the no-stale label.Stale pull requests will automatically be closed after 30 days of inactivity.",
+                "createdAt": "2022-06-11T04:40:16Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1152854802
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOKCmhXQ==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "triaged"
+                }
+              },
+              {
+                "node": {
+                  "name": "open source"
+                }
+              },
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              },
+              {
+                "node": {
+                  "name": "Stale"
                 }
               }
             ]
-          },
-          "changedFiles": 3,
-          "files": {
+          }
+        }
+      }
+    }
+  },
+  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOKCmhXQ== name=pytorch number=31093 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "comments": {
             "nodes": [
               {
-                "path": ".github/templates/common.yml.j2"
+                "bodyText": "Hi, @mingfeima  @soumith  @Jianhui-Li\nthis will improve the test coverage of mkldnn convolution, would you please review it?\nThe current code is forward only, do we need to cover backward, if yes, we can add backward.",
+                "createdAt": "2019-12-12T01:19:02Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 564806270
+              },
+              {
+                "bodyText": "@mingxiaoh, what is the value in testing DNNL as part of Pytorch validation for the Pytorch developers? Shouldn't having these tests run in DNNL validation be enough?",
+                "createdAt": "2019-12-12T01:28:32Z",
+                "author": {
+                  "login": "vpirogov"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 564808528
+              },
+              {
+                "bodyText": "@vpirogov  The main value is to serve as a blind test to DNNL. If DNNL adds these test to DNNL test sets, it lost the value as a blind test.  The spirit of validation is to cross check.\n@gottbrath @gchanan  The test was developed per the request of Pytorch team. Mingxiao made an effort to reduce the execution time to a few second but still with good coverage.  Although the test today is focused on DNNL, it could be easily extended to be blind test for any conv implementation used in Pytorch.",
+                "createdAt": "2019-12-20T07:44:30Z",
+                "author": {
+                  "login": "Jianhui-Li"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 567826907
+              },
+              {
+                "bodyText": "@mruberry thanks for the comment. As for the chainer dependency, we import it is because we would like to use its testing function for pytest test cases combinations, other wise we need to write much more code to achieve same effect. So, can we use it?",
+                "createdAt": "2020-01-15T09:04:34Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 574563012
+              },
+              {
+                "bodyText": "@mingxiaoh You cannot import chainer. Looking at the code you should be able to achieve the same effect without it.",
+                "createdAt": "2020-01-16T17:59:46Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 575272358
+              },
+              {
+                "bodyText": "@mruberry ok, we will change it according to your requirement. Thanks",
+                "createdAt": "2020-02-10T00:59:34Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 583917522
+              },
+              {
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/31093\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 29f6aa6 (more details on the Dr. CI page):\n\nCommit 29f6aa6 was recently pushed. Waiting for builds...\n\nThis comment was automatically generated by Dr. CI (expand for details).Follow this link to opt-out of these comments for your Pull Requests.\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2020-05-14T08:04:30Z",
+                "author": {
+                  "login": "dr-ci"
+                },
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 628466876
+              },
+              {
+                "bodyText": "@mruberry how about those cudnn UT error? we add check for it but it should be NV to fix cudnn bugs.",
+                "createdAt": "2020-05-18T05:34:11Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 629955767
+              },
+              {
+                "bodyText": "Hey @mingxiaoh! You're right, of course, that you shouldn't have to fix cuDNN bugs. Would you please:\n\nAssert that the test case fails, so we know it's failing and if someone fixes it they'll know what test to update.\nFile a new issue explaining the behavior and providing a short PyTorch program to reproduce the issue.\n\nThen we can ping NVIDIA on that issue.",
+                "createdAt": "2020-05-18T07:27:08Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 629997129
+              },
+              {
+                "bodyText": "about the suggestion 'Assert that the test case fails, so we know it's failing and if someone fixes it they'll know what test to update. ',  if we only assert it and continue the following test, I guess users might always ignore them in later test. Anyway, any similar example case for reference?",
+                "createdAt": "2020-05-18T07:55:08Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 630010734
+              },
+              {
+                "bodyText": "In this recent PR https://github.com/pytorch/pytorch/pull/38505/files, for example, you can see that the construction of bool tensors wasn't working properly, so the test author cited the relevant issue and asserted that the incorrect behavior happened, as expected. You can also see how these lines are being removed by https://github.com/pytorch/pytorch/pull/38392/files, which fixes the issue.\nAnother common pattern is to use with self.assertRaises(RuntimeError/AssertionError/etc.):.",
+                "createdAt": "2020-05-18T08:02:13Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 630014823
+              },
+              {
+                "bodyText": "@mruberry the failed UT case is not introduced by our modification, how to handle this issue?",
+                "createdAt": "2020-05-20T01:59:13Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 631187735
+              },
+              {
+                "bodyText": "@mingxiaoh You mean the failures on ROCm? You may ignore them. Be sure to re-request review when you're ready.",
+                "createdAt": "2020-05-20T02:12:58Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 631191425
+              },
+              {
+                "bodyText": "@mruberry  we already skipped those ROCm errors, but there are stil somel error caused by the original code, they are not introduced by our modification.",
+                "createdAt": "2020-05-21T05:18:07Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 631886529
+              },
+              {
+                "bodyText": "I understand. Let me know when you're ready for me to review.",
+                "createdAt": "2020-05-21T06:24:15Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 631908011
+              },
+              {
+                "bodyText": "@mruberry thanks, we are ready for review now.",
+                "createdAt": "2020-05-21T06:28:11Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 631909442
+              },
+              {
+                "bodyText": "@mingxiaoh Great! I'll take a look ASAP.",
+                "createdAt": "2020-05-21T06:31:10Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 631910556
+              },
+              {
+                "bodyText": "@mruberry we just pull the latest code and updated the patch according to your comment, may you please help double check it? BTW, the new failed case in preci is not introduced by our modification.",
+                "createdAt": "2020-05-25T07:44:58Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 633430458
+              },
+              {
+                "bodyText": "@ailzhang would you please check the comment below? Thanks.\nIs there a reason why this TestConv2dExt is a new class instead a test inside TestNN?\n//comment: it is actually suggested by Tongzhou Wang in another thread before.\nAlthough this test sits in generic testing framework, it's actually comparing thnn/mkldnn/cudnn results specially. I feel it's better to make it truly generic so that it compares any device result with CPU result. Alternatively you can mark this test only run when torch.backends.mkldnn.is_available()=True\n//comment: but our goal is to compare the result with that of thnn. Anyway, if you insist, we can start to compare it with cpu.",
+                "createdAt": "2020-05-27T05:11:08Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "mingxiaoh"
+                },
+                "databaseId": 634432326
+              },
+              {
+                "bodyText": "Pruning reviewers. @ngimel, @VitalyFedyunin, this PR is looking pretty good from a test framework perspective. Would one of you like to review?",
+                "createdAt": "2020-05-27T09:58:42Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 634557563
               },
               {
-                "path": ".github/workflows/generated-macos-11-py3-x86-64.yml"
+                "bodyText": "@mruberry  Thanks, would you please help review it again. BTW: failed case is not introduced by our modification.",
+                "createdAt": "2020-05-28T10:26:32Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 635256214
               },
               {
-                "path": ".github/workflows/update_pytorch_labels.yml"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "Mw",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
+                "bodyText": "@mruberry  we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code",
+                "createdAt": "2020-06-02T08:00:01Z",
+                "author": {
+                  "login": "1pikachu"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 637364148
+              },
+              {
+                "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code\n\n@ngimel will follow-up on the test itself sometime this week or early next week.",
+                "createdAt": "2020-06-02T10:23:47Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 637444457
+              },
+              {
+                "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code\n\n@ngimel will follow-up on the test itself sometime this week or early next week.\n\n@mruberry  thank you",
+                "createdAt": "2020-06-02T11:32:06Z",
+                "author": {
+                  "login": "1pikachu"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 637479226
+              },
+              {
+                "bodyText": "Improving test coverage of math libraries is certainly a good goal and this PR is moving towards it. I have some doubts about implementation decisions made, and about running this PR as part of regular pytorch CI.\nIf the primary goal of this PR is to test correctness of the convolution implementations in the vendor library, then it does not serve this purpose. The absolute majority of the 4000+ test cases come from group 1, where different kernel sizes/strides/dilations are used to produce the output of size 1x1. This can test whether pytorch correctly passes convolution parameters to the backends (although there are cheaper ways to do that), but as actual library correctness check it is almost useless - libraries use very different kernels depending in the input/output sizes, and tests with toy sizes like this don't invoke the real bread-and-butter kernels.\nAlso, if this test suite is meant as primary a means of testing vendor libraries (which is a good goal!) it does not have a place as a part of pytorch regular CI, and should be run when the corresponding vendor libraries are updated. I'd suggest moving this test out into a separate file (maybe even outside of torch/test directory) and have it as a part of library update/qualification process rather than regular CI.\nAlso, if the primary goal is to enable easier testing of vendor libraries correctness, perhaps we should rethink the mechanism of the generation of test cases. It should be easy to add a test case with a particular set of parameters that was found to be buggy. Also, running a cross-product of cases in a multi-dimensional space (as this PR does) is rarely an efficient way of getting a signal, some forms of random sampling usually provide a way to get better correctness signal why using less resources.\nAlso, when testing libraries it is important to test both forward and backward functions, whereas this PR does forward only. I'm openminded on whether convTransposed should be tested or not - if we are testing vendor libraries, then it's not necessary, convTransposed calls the same underlying functions, if we are testing pytorch, then it makes sense to test it separately because it takes different codepaths.",
+                "createdAt": "2020-06-02T21:56:33Z",
+                "author": {
+                  "login": "ngimel"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 637827507
+              },
+              {
+                "bodyText": "@mruberry ngimel is quite responsible, but it seems that she is not familiar with the background of this pull-request, since this pull-request is pending for so such a long time, each time we are almost done, then reviewer changes, each reviewer has different idea, it is good, but, would it be better if you help review it or ask the same reviewer to review it considering that you are more familiar with the background/change history? Thanks in advance.",
+                "createdAt": "2020-06-03T02:16:07Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 637912105
+              },
+              {
+                "bodyText": "@mruberry ngimel is quite responsible, but it seems that she is not familiar with the background of this pull-request, since this pull-request is pending for so such a long time, each time we are almost done, then reviewer changes, each reviewer has different idea, it is good, but, would it be better if you help review it or ask the same reviewer to review it considering that you are more familiar with the background/change history? Thanks in advance.\n\nWe know this PR has been open for awhile and we respect that your time is valuable, but we want to make sure we're making the right change here, and I think @ngimel's comments reflect that and should not be too difficult to address. As I understand, her points are:\n\nThis is a good PR with an exciting idea. To let it run longer and test more cases maybe it should run outside the regular PyTorch CI.\nTo remedy this, let's create a test/math_libraries folder and put this test there: test/math_libaries/convolutions.py. Yes, this is different from our requests in the past, which is our mistake, but it should be an easy change.\nTo make the test more interesting it'd be good for the test cases to resemble convolutions used in practice. The current test cases seem like similar \"toy\" examples. Without time pressure we should be able to run larger, more computationally intensive convolutions.\nLet's change the test cases to include some practical convolutions, make it easy to add test cases, and think about how we might generate other interesting cases. (We should also test backwards once we have more time!)\n\nAnd I think these are good points. Maybe the PR doesn't create a new way to generate interesting convolutions to start and instead only runs a few representative convolutions, but @ngimel is positioning the work for success so that it's useful and we can continue to improve on it in the future.\nDoes that make sense?",
+                "createdAt": "2020-06-03T03:04:55Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 637924703
+              },
+              {
+                "bodyText": "@mruberry we were required to finish the test in limited time long long before, at that time, jianhui discussed this issue with you, and you are all agreed with the current test scope and test case number and test time, so you meant you change your mind now? you are not care about the test time currently? Sorry, this issue is pending so long, we are struggling with it now and would like to finish it asap.  Given this, it would be be better if you raise all the requirement at a time,  considering that we have many tasks at hand, we are hoping so eagerly that we can finish this PR and use it for further test for bugs finding.",
+                "createdAt": "2020-06-03T05:22:43Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "mingxiaoh"
+                },
+                "databaseId": 637960626
+              },
+              {
+                "bodyText": "@mruberry we were required to finish the test in limited time long long before, at that time, jianhui discussed this issue with you, and you are all agreed with the current test scope and test case number and test time, so you meant you change your mind now? you are not care about the test time currently? Sorry, this issue is pending so long, we are struggling with it now and would like to finish it asap. Given this, it would be be better if you raise all the requirement at a time, considering that we have many tasks at hand, we are hoping so eagerly that we can finish this PR and use it for further test for bugs finding.\n\nI'm sorry, I don't think I've talked to @Jianhui-Li before. It's true that the team we expressed a concern about timing if the test was to be run in the CI initially, but I think now that we understand what the test is trying to do better we're not sure the CI is the best place for it. The PR was also closed after a lengthy period of inactivity, and we assumed it had simply been abandoned.\nDo you know who @Jianhui-Li spoke with about this issue originally? Maybe I can follow-up with them for more context.",
+                "createdAt": "2020-06-03T05:42:28Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 637967153
+              },
+              {
+                "bodyText": "@mruberry  it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?",
+                "createdAt": "2020-06-03T06:13:14Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 637978356
+              },
+              {
+                "bodyText": "@mruberry it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?\n\nI think this will be easier to discuss at the regular Intel-FB meeting.",
+                "createdAt": "2020-06-03T20:34:05Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 638446723
+              },
+              {
+                "bodyText": "@mruberry it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?\n\nI think this will be easier to discuss at the regular Intel-FB meeting.\n\nLet me sync with Mingxiao and follow up with this. Thanks.",
+                "createdAt": "2020-06-03T20:44:44Z",
+                "author": {
+                  "login": "Jianhui-Li"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 638451670
+              },
+              {
+                "bodyText": "@mruberry would you please help review it again?",
+                "createdAt": "2020-07-02T14:09:23Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 653028208
+              },
+              {
+                "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?",
+                "createdAt": "2020-07-06T20:15:04Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 654443242
+              },
+              {
+                "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks",
+                "createdAt": "2020-07-09T11:04:06Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 656062287
+              },
+              {
+                "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks\n\n@mruberry  the code is ready for review now, would you please take time for it? Thanks.",
+                "createdAt": "2020-07-14T09:16:48Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 658071151
+              },
+              {
+                "bodyText": "super nit: renaming files to .json will make it more IDE friendly.",
+                "createdAt": "2020-07-14T23:38:37Z",
+                "author": {
+                  "login": "VitalyFedyunin"
+                },
+                "authorAssociation": "CONTRIBUTOR",
+                "editor": null,
+                "databaseId": 658464685
+              },
+              {
+                "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks\n\n@mruberry the code is ready for review now, would you please take time for it? Thanks.\n\nCool! I took a look with @ngimel, once these issues are addressed I think we're good to go!",
+                "createdAt": "2020-07-16T05:17:29Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 659164401
+              },
+              {
+                "bodyText": "@ngimel  & @VitalyFedyunin We have changed the code according to your suggestions, would you please review it again? Thanks.",
+                "createdAt": "2020-07-20T08:30:01Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 660884305
+              },
+              {
+                "bodyText": "@ngimel & @VitalyFedyunin We have changed the code according to your suggestions, would you please review it again? Thanks.\n\nUpdated: one more question about tolerances, one code cleanup recommendation, and one task leftover from the last review.",
+                "createdAt": "2020-07-22T20:26:42Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 662678464
+              },
+              {
+                "bodyText": "Updated: one more question about tolerances, one code cleanup recommendation, and one task leftover from the last review.\n@mruberry we have finished the modification according to your comment, would you please review it again? Thanks.",
+                "createdAt": "2020-07-23T10:24:26Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 662930687
+              },
+              {
+                "bodyText": "The code looks good, but I tried running the test suite and hit the following failures:\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n    result = test(self, device_arg, dtype)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 102, in test_conv2d_ext\n    msg=msg\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 1085, in assertEqual\n    self.assertTrue(result, msg=msg)\nAssertionError: False is not true : device:cuda:0, dtype:torch.float16, group:1, batchsize:22input channel:448, output channel:384, bias:False, padding:[1, 1], dilation:[1, 1], stride:[1, 1], kernel:[3, 3]\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n    result = test(self, device_arg, dtype)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 102, in test_conv2d_ext\n    msg=msg\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 1085, in assertEqual\n    self.assertTrue(result, msg=msg)\nAssertionError: False is not true : device:cuda:0, dtype:torch.float32, group:1, batchsize:22input channel:80, output channel:192, bias:False, padding:[0, 0], dilation:[1, 1], stride:[1, 1], kernel:[3, 3]\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n    result = test(self, device_arg, dtype)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 106, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\nLooking at the first invalid convolution, for example, it's:\n    {\n        \"case_name\":\"masknet_p1:conv33\",\n        \"mb\":1,\n        \"g\":1,\n        \"ic\":512,\n        \"ih\":64,\n        \"iw\":64,\n        \"oc\":12,\n        \"kh\":1,\n        \"kw\":1,\n        \"sh\":1,\n        \"sw\":1,\n        \"ph\":0,\n        \"pw\":0,\n        \"dh\":0,\n        \"dw\":0,\n        \"bias\":\"False\"\n    },\n\nwhich has a dh and dw of zero, causing it to be added to invalid cases here:\ndh, dw = case['dh'], case['dw']\n            has_bias = case['bias']\n            if dh == 0 or dw == 0:\n                invalid_cases.append(case_name)",
+                "createdAt": "2020-07-23T21:25:19Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": {
+                  "login": "mruberry"
+                },
+                "databaseId": 663240268
+              },
+              {
+                "bodyText": "@mruberry the failure was not detected is because we did not export the cudnn path. Yes, you are right, we need to a large atol of 1e-2 . Would you please help review it again? Thanks.",
+                "createdAt": "2020-07-27T12:43:44Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 664373079
+              },
               {
+                "bodyText": "@mruberry the failure was not detected is because we did not export the cudnn path. Yes, you are right, we need to a large atol of 1e-2 . Would you please help review it again? Thanks.\n\nBefore I run these tests again, is an atol of 1e-2 needed for all types or just half? Also, how does 1e-2 compare to the values that are being compared?",
+                "createdAt": "2020-07-27T18:39:27Z",
                 "author": {
-                  "login": "kit1980"
+                  "login": "mruberry"
                 },
-                "state": "APPROVED"
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 664569507
               },
               {
+                "bodyText": "@mruberry 1e-2 is experimental result, details see below, random means it might be failed sometimes.\n\n\n\natol,rtol\n1e-2,1e-2\n1e-2,1e-3\n1e-3,1e-2\n1e-3,1e-3\n1e-4,1e-3\n1e-3,1e-4\n1e-4,1e-4\n1e-4,1e-5\n1e-5,1e-4\n\n\n\n\nCuda float16\npass\npass\npass\npass\npass\nfail\nFail\nFail\nfail\n\n\nCuda float32\npass\nrandom\nrandom\nrandom\nrandom\nrandom\nrandom\nrandom\nfail",
+                "createdAt": "2020-07-31T03:33:27Z",
                 "author": {
-                  "login": "janeyx99"
+                  "login": "mingxiaoh"
                 },
-                "state": "APPROVED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMy0wNFQxNDoyNDo0OC0wODowMLkyMDIyLTAzLTA0VDE0OjI0OjQ4LTA4OjAwzjWwwqA=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 666894774
+              },
               {
-                "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1988337976",
-                "createdAt": "2022-03-15T17:43:28Z",
+                "bodyText": "@mruberry  would you please find time to review it again? Thanks.",
+                "createdAt": "2020-08-04T05:01:20Z",
                 "author": {
-                  "login": "pytorchmergebot"
+                  "login": "mingxiaoh"
                 },
-                "authorAssociation": "MEMBER",
+                "authorAssociation": "NONE",
                 "editor": null,
-                "databaseId": 1068270969
+                "databaseId": 668380451
               },
               {
-                "bodyText": "@pytorchbot force merge this",
-                "createdAt": "2022-03-15T20:26:36Z",
+                "bodyText": "@mruberry would you please find time to review it again? Thanks.\n\nI was just about to try and run this again locally but it looks like the files describing the convolutions are missing?",
+                "createdAt": "2020-08-07T03:49:44Z",
                 "author": {
-                  "login": "seemethere"
+                  "login": "mruberry"
                 },
-                "authorAssociation": "MEMBER",
+                "authorAssociation": "COLLABORATOR",
                 "editor": null,
-                "databaseId": 1068436128
+                "databaseId": 670306210
               },
               {
-                "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1989076952",
-                "createdAt": "2022-03-15T20:27:47Z",
+                "bodyText": "@mruberry sorry but what is missing actually?",
+                "createdAt": "2020-08-07T05:00:20Z",
                 "author": {
-                  "login": "pytorchmergebot"
+                  "login": "mingxiaoh"
                 },
-                "authorAssociation": "MEMBER",
+                "authorAssociation": "NONE",
                 "editor": null,
-                "databaseId": 1068437098
+                "databaseId": 670322557
               },
               {
-                "bodyText": "@pytorchbot merge this",
-                "createdAt": "2022-03-15T21:18:55Z",
+                "bodyText": "@mruberry sorry but what is missing actually?\n\nThe JSON files.",
+                "createdAt": "2020-08-07T16:06:41Z",
                 "author": {
-                  "login": "seemethere"
+                  "login": "mruberry"
                 },
-                "authorAssociation": "MEMBER",
+                "authorAssociation": "COLLABORATOR",
                 "editor": null,
-                "databaseId": 1068482921
+                "databaseId": 670591170
               },
               {
-                "bodyText": "Hey @seemethere.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-03-15T21:20:40Z",
+                "bodyText": "@mruberry sorry but what is missing actually?\n\nThe JSON files.\n\n@mruberry sorry, we add them now, would you please check it again? Thanks.",
+                "createdAt": "2020-08-13T10:40:11Z",
                 "author": {
-                  "login": "github-actions"
+                  "login": "mingxiaoh"
                 },
                 "authorAssociation": "NONE",
                 "editor": null,
-                "databaseId": 1068484404
+                "databaseId": 673402901
+              },
+              {
+                "bodyText": "I cloned your repo and ran the tests:\n~/pytorch/test/math_libraries$ python convolutions.py\nFFFF\n======================================================================\nFAIL: test_conv2d_ext_cpu_float32 (__main__.TestConvExtCPU)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n----------------------------------------------------------------------\nRan 4 tests in 33.838s\n\nFAILED (failures=4)\n\nStill fails.",
+                "createdAt": "2020-08-13T23:35:00Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 673760580
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOP6yFeQ==",
-              "hasPreviousPage": true
+              "startCursor": "Y3Vyc29yOnYyOpHOIapCfg==",
+              "hasPreviousPage": false
             }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              }
-            ]
           }
         }
       }
     }
   },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=31093 owner=pytorch": {
+  "query_sha=eb979626157e70cf52d29cf16eaa852bedf0f29b1831e9021e1bf3e7457be7fd commit=6882717f73deffb692219ccd1fd6db258d8ed684 name=pytorch owner=pytorch": {
     "data": {
       "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": true,
-          "author": {
-            "login": "mingxiaoh"
-          },
-          "title": "improve mkldnn convolution test coverage",
-          "body": "This pr will improve the test coverage of mkldnn convolution.\r\n1.test input: specific sensitive numbers\r\n2.pass criteria: output of mkldnn convolution matches output of thnn convolution\r\n3.coverage: by using coverage tool, we found out the following sensitive parameters. Overall the case will test 4352 patterns, takes 8.8s on my machine.\r\n\r\nto run the test case:\r\n\r\npython test_mkldnn_conv2d_ext.py\r\nor\r\npython run_test.py -i mkldnn_conv2d_ext\r\n\r\nIn case of failure, the pattern will be printed in the log for further debugging.\r\n\r\nactually, this PR is created to replace and improve that PR we created before(https://github.com/pytorch/pytorch/pull/25085) ",
-          "headRefName": "master",
-          "headRepository": {
-            "nameWithOwner": "mingxiaoh/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
+        "object": {
+          "checkSuites": {
+            "edges": [
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "11pikachu"
-                    },
-                    "email": "junx.du@intel.com",
-                    "name": "dujun"
+                "node": {
+                  "app": {
+                    "name": "Facebook GitHub Tools",
+                    "databaseId": 12274
                   },
-                  "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MQ",
-              "hasNextPage": false
-            },
-            "totalCount": 1
-          },
-          "commits": {
-            "nodes": [
+                  "workflowRun": null,
+                  "checkRuns": {
+                    "nodes": [],
+                    "pageInfo": {
+                      "endCursor": null,
+                      "hasNextPage": false
+                    }
+                  },
+                  "conclusion": null
+                },
+                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hng="
+              },
+              {
+                "node": {
+                  "app": {
+                    "name": "Netlify",
+                    "databaseId": 13473
+                  },
+                  "workflowRun": null,
+                  "checkRuns": {
+                    "nodes": [],
+                    "pageInfo": {
+                      "endCursor": null,
+                      "hasNextPage": false
+                    }
+                  },
+                  "conclusion": null
+                },
+                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hpE="
+              },
+              {
+                "node": {
+                  "app": {
+                    "name": "Azure Pipelines",
+                    "databaseId": 9426
+                  },
+                  "workflowRun": null,
+                  "checkRuns": {
+                    "nodes": [],
+                    "pageInfo": {
+                      "endCursor": null,
+                      "hasNextPage": false
+                    }
+                  },
+                  "conclusion": null
+                },
+                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hpw="
+              },
+              {
+                "node": {
+                  "app": {
+                    "name": "Dependabot",
+                    "databaseId": 29110
+                  },
+                  "workflowRun": null,
+                  "checkRuns": {
+                    "nodes": [],
+                    "pageInfo": {
+                      "endCursor": null,
+                      "hasNextPage": false
+                    }
+                  },
+                  "conclusion": null
+                },
+                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hrA="
+              },
+              {
+                "node": {
+                  "app": {
+                    "name": "Codecov",
+                    "databaseId": 254
+                  },
+                  "workflowRun": null,
+                  "checkRuns": {
+                    "nodes": [],
+                    "pageInfo": {
+                      "endCursor": null,
+                      "hasNextPage": false
+                    }
+                  },
+                  "conclusion": null
+                },
+                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hsM="
+              },
+              {
+                "node": {
+                  "app": {
+                    "name": "PyTorch Bot",
+                    "databaseId": 40112
+                  },
+                  "workflowRun": null,
+                  "checkRuns": {
+                    "nodes": [],
+                    "pageInfo": {
+                      "endCursor": null,
+                      "hasNextPage": false
+                    }
+                  },
+                  "conclusion": null
+                },
+                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hs0="
+              },
               {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [],
+                "node": {
+                  "app": {
+                    "name": "GitHub Actions",
+                    "databaseId": 15368
+                  },
+                  "workflowRun": {
+                    "workflow": {
+                      "name": "Lint"
+                    },
+                    "url": "https://github.com/pytorch/pytorch/actions/runs/2638241883"
+                  },
+                  "checkRuns": {
+                    "nodes": [
+                      {
+                        "name": "workflow-checks",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095495959"
+                      },
+                      {
+                        "name": "quick-checks",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496003"
+                      },
+                      {
+                        "name": "Test tools",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496162"
+                      },
+                      {
+                        "name": "toc",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496320"
+                      },
+                      {
+                        "name": "Test collect_env (with_torch)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496465"
+                      },
+                      {
+                        "name": "Test collect_env (without_torch)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496523"
+                      },
+                      {
+                        "name": "Test collect_env (older_python_version)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496558"
+                      },
+                      {
+                        "name": "lintrunner",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241883/jobs/4095496708"
+                      }
+                    ],
                     "pageInfo": {
+                      "endCursor": "Y3Vyc29yOnYyOpHPAAAAAbCVA2Y=",
                       "hasNextPage": false
                     }
                   },
-                  "status": {
-                    "contexts": [
+                  "conclusion": "SUCCESS"
+                },
+                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1hzg="
+              },
+              {
+                "node": {
+                  "app": {
+                    "name": "GitHub Actions",
+                    "databaseId": 15368
+                  },
+                  "workflowRun": {
+                    "workflow": {
+                      "name": "trunk"
+                    },
+                    "url": "https://github.com/pytorch/pytorch/actions/runs/2638241915"
+                  },
+                  "checkRuns": {
+                    "nodes": [
                       {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406538?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095496376"
                       },
                       {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406947?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "android-emulator-build-test / build-and-test",
+                        "conclusion": "FAILURE",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095496525"
                       },
                       {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406544?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-xenial-cuda11.3-py3.7-gcc7-no-ops / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095496611"
                       },
                       {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406931?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "macos-10-15-py3-arm64 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095496713"
                       },
                       {
-                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_debug_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406550?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095496857"
                       },
                       {
-                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_debug_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406887?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "ios-12-5-1-x86-64 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095497178"
                       },
                       {
-                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_release_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406526?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "libtorch-linux-bionic-cuda11.6-py3.7-gcc7 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095497392"
                       },
                       {
-                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_release_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406707?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "win-vs2019-cuda11.6-py3 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095497580"
                       },
                       {
-                        "context": "ci/circleci: caffe2_onnx_main_py3_6_clang7_ubuntu16_04_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406533?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "libtorch-linux-xenial-cuda10.2-py3.7-gcc7 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095497781"
                       },
                       {
-                        "context": "ci/circleci: caffe2_onnx_main_py3_6_clang7_ubuntu16_04_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407256?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-py3.7-clang9-slow / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095497886"
                       },
                       {
-                        "context": "ci/circleci: caffe2_onnx_ort1_py3_6_clang7_ubuntu16_04_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407254?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-rocm5.1-py3.7 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095497997"
                       },
                       {
-                        "context": "ci/circleci: caffe2_onnx_ort2_py3_6_clang7_ubuntu16_04_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407255?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "macos-10-15-py3-lite-interpreter-x86-64 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095498146"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406556?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "macos-11-py3-x86-64 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095498338"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda10.2-cudnn7-py3.8-gcc9",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406532?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "caffe2-linux-focal-py3.7-gcc7 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095498448"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda11.0-cudnn8-py3.6-gcc9",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406527?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "parallelnative-linux-focal-py3.7-gcc7 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095498648"
+                      },
+                      {
+                        "name": "parallelnative-linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095659992"
+                      },
+                      {
+                        "name": "parallelnative-linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095660077"
+                      },
+                      {
+                        "name": "linux-bionic-py3.7-clang9-slow / test (slow, 1, 1, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095798458"
+                      },
+                      {
+                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840103"
+                      },
+                      {
+                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840227"
+                      },
+                      {
+                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (slow, 1, 1, linux.4xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840377"
+                      },
+                      {
+                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (nogpu_AVX512, 1, 1, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840521"
+                      },
+                      {
+                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (nogpu_NO_AVX2, 1, 1, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840605"
+                      },
+                      {
+                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (jit_legacy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840689"
+                      },
+                      {
+                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840741"
+                      },
+                      {
+                        "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095840795"
+                      },
+                      {
+                        "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095874982"
+                      },
+                      {
+                        "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875042"
+                      },
+                      {
+                        "name": "win-vs2019-cuda11.6-py3 / test (default, 1, 5, windows.8xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875174"
+                      },
+                      {
+                        "name": "win-vs2019-cuda11.6-py3 / test (default, 2, 5, windows.8xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875221"
+                      },
+                      {
+                        "name": "win-vs2019-cuda11.6-py3 / test (default, 3, 5, windows.8xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875266"
+                      },
+                      {
+                        "name": "win-vs2019-cuda11.6-py3 / test (default, 4, 5, windows.8xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875320"
+                      },
+                      {
+                        "name": "win-vs2019-cuda11.6-py3 / test (default, 5, 5, windows.8xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875369"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda11.0-cudnn8-py3.8-gcc9",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406553?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "win-vs2019-cuda11.6-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4095875417"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-py3.6-clang9",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406537?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "macos-12.3-py3.8-arm64-test / Run MPS tests",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4096110771"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-py3.8-gcc9",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406529?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "macos-11-py3-x86-64 / test (default, 1, 2, macos-12)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4096408234"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-rocm3.5.1-py3.6",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406554?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
+                        "name": "macos-11-py3-x86-64 / test (default, 2, 2, macos-12)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241915/jobs/4096408307"
+                      }
+                    ],
+                    "pageInfo": {
+                      "endCursor": "Y3Vyc29yOnYyOpHPAAAAAbCn27w=",
+                      "hasNextPage": false
+                    }
+                  },
+                  "conclusion": "FAILURE"
+                },
+                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1h5Q="
+              },
+              {
+                "node": {
+                  "app": {
+                    "name": "GitHub Actions",
+                    "databaseId": 15368
+                  },
+                  "workflowRun": {
+                    "workflow": {
+                      "name": "pull"
+                    },
+                    "url": "https://github.com/pytorch/pytorch/actions/runs/2638241914"
+                  },
+                  "checkRuns": {
+                    "nodes": [
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-bionic-rocm3.7-py3.6",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406545?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-rocm5.1-py3.7",
+                        "conclusion": "NEUTRAL",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095496220"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406543?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "win-vs2019-cuda11.6-py3",
+                        "conclusion": "NEUTRAL",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095496344"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406536?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095496466"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406552?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-clang10-onnx / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095496612"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406535?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "win-vs2019-cpu-py3 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095496726"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406540?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-py3.7-clang9 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095496862"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406528?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-py3_7-clang8-xla / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095497204"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406541?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-xenial-cuda11_3-py3_7-gcc7-deploy / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095497405"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-asan",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406549?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095497578"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-clang7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406555?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095497784"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc4.8",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406546?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-clang7-asan / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095497875"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc5.4",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406531?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498008"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc7",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406534?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-xenial-py3.7-clang7-asan / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498155"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc7.2",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406523?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498346"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.8",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406539?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498440"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-rocm3.3-py3.6",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406547?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498650"
                       },
                       {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-rocm3.5.1-py3.6",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406551?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498724"
                       },
                       {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407209?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-gcc7 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095498883"
                       },
                       {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406611?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-xenial-py3-clang5-mobile-build / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095499064"
                       },
                       {
-                        "context": "ci/circleci: pytorch_bazel_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406607?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-gcc7-no-ops / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095499218"
                       },
                       {
-                        "context": "ci/circleci: pytorch_bazel_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406984?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-xenial-py3.7-gcc7 / build",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095499360"
                       },
                       {
-                        "context": "ci/circleci: pytorch_cpp_doc_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407013?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095615833"
                       },
                       {
-                        "context": "ci/circleci: pytorch_doc_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407011?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095668105"
                       },
                       {
-                        "context": "ci/circleci: pytorch_ios_11_2_1_x86_64_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406548?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095668215"
                       },
                       {
-                        "context": "ci/circleci: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406563?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095668293"
                       },
                       {
-                        "context": "ci/circleci: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408680?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095668402"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_backward_compatibility_check_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407014?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095668480"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_bionic_py3_6_clang9_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406567?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095668571"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_bionic_py3_6_clang9_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406945?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095776890"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_bionic_py3_8_gcc9_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406561?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095776922"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_bionic_py3_8_gcc9_coverage_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407422?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095778975"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_bionic_rocm3_7_py3_6_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406562?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794308"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406612?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794370"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408107?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 1, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794452"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_legacy_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408111?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794502"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_profiling_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408101?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794566"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc5_4_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406613?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794652"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406565?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-docs / build-docs (cpp)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794748"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_legacy_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407017?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-docs / build-docs (python)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095794836"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_profiling_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407019?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095800591"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407012?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095800638"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407016?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095800676"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_vulkan_x86_32_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406608?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095800723"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406609?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095800762"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406606?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095800805"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_test1",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407435?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095813130"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_test2",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407436?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095813208"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_mobile_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406605?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095858004"
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_mobile_custom_build_dynamic",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406610?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095858063"
                       },
                       {
-                        "context": "ci/circleci: pytorch_macos_10_13_py3_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406525?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
+                        "conclusion": "SUCCESS",
+                        "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095858127"
+                      }
+                    ],
+                    "pageInfo": {
+                      "endCursor": "Y3Vyc29yOnYyOpHPAAAAAbCcmdI=",
+                      "hasNextPage": true
+                    }
+                  },
+                  "conclusion": "SUCCESS"
+                },
+                "cursor": "Y3Vyc29yOnYyOpHPAAAAAbH1h5U="
+              }
+            ],
+            "pageInfo": {
+              "hasNextPage": false
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=23d6a47e5fd875c42231779040ec1d35d0042b502c9142cb0d33d6f65d58fead commit=6882717f73deffb692219ccd1fd6db258d8ed684 cr_cursor=Y3Vyc29yOnYyOpHPAAAAAbCcmdI= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAbH1h5Q= name=pytorch owner=pytorch": {
+    "data": {
+      "repository": {
+        "object": {
+          "oid": "6882717f73deffb692219ccd1fd6db258d8ed684",
+          "checkSuites": {
+            "nodes": [
+              {
+                "checkRuns": {
+                  "nodes": [
+                    {
+                      "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
+                      "conclusion": "SUCCESS",
+                      "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095858194"
+                    },
+                    {
+                      "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
+                      "conclusion": "SUCCESS",
+                      "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4095858272"
+                    },
+                    {
+                      "name": "linux-xenial-cuda11_3-py3_7-gcc7-deploy / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                      "conclusion": "SUCCESS",
+                      "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2638241914/jobs/4096006884"
+                    }
+                  ],
+                  "pageInfo": {
+                    "endCursor": "Y3Vyc29yOnYyOpHPAAAAAbCfo8c=",
+                    "hasNextPage": false
+                  }
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=76118 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "malfet"
+          },
+          "title": "Dummy change with lots of commits",
+          "body": "Draft PR with 100+ commits, to test mergebot ",
+          "headRefName": "malfet/pr-with-lots-of-commits",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "3067f2240afc7a29dc348000aa19eccbd9772303"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "andrewor14"
+                    },
+                    "email": "andrewor@fb.com",
+                    "name": "Andrew Or"
+                  },
+                  "oid": "2f655b71f70c496c4e645f6cdb27d7bb7e825701"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "0c6dcaa7f58a19c42a530f4ee14bb6f0f03ca9fb"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "cad11c563d41ebcffb1683fe1f1288b8157413b3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "jwtan@fb.com",
+                    "name": "Jiewen Tan"
+                  },
+                  "oid": "4dfd0875a68d87fccb5ad0d81692db480043b86e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "2d37e74690582a4a26890e4c8b98f1f80e589c82"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "jwtan@fb.com",
+                    "name": "Jiewen Tan"
+                  },
+                  "oid": "d4aee60947e1a3ef23c7c42990621e0746fdd0a8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "peterbell10"
+                    },
+                    "email": "peterbell10@live.co.uk",
+                    "name": "Peter Bell"
+                  },
+                  "oid": "aac6204bf710beb5e50a383d426ae6222396335a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "4b0362cab884584c24f5834b3874f5f357f56b5d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "7536df613cbc645a9e68e6a3b0a8450753260fd1"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "20a50cb966d28d7bf82924adf781cf72a01ef90e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "486387e8644afb46edff5aa5925b55c8119f67f0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "acb9d78b9b732d3667b881727e6ed9f92a8c549f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "683bb7959a5b973f8470c081ad02e8fc508e784a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "qihqi"
+                    },
+                    "email": "qihan@fb.com",
+                    "name": "Han Qi"
+                  },
+                  "oid": "a870cb40af65adf0b77d55f6b554d7093d284d7a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "Krovatkin"
+                    },
+                    "email": "korovaikon@gmail.com",
+                    "name": "Nikolay Korovaiko"
+                  },
+                  "oid": "70793b9f328ddf52cc86336104c3a064c8582ef4"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "suo"
+                    },
+                    "email": "suo@fb.com",
+                    "name": "Michael Suo"
+                  },
+                  "oid": "f70b31f62b1c5159eef2725484b175983517c88c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mikeyd@fb.com",
+                    "name": "Michael Andreas Dagitses"
+                  },
+                  "oid": "04d3ec1db60defe1c6904bf77e9f8dfa87dc0b63"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "46b754a55b63e3168ad5854ad412c124934b675d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "taylorrobie@fb.com",
+                    "name": "Taylor Robie"
+                  },
+                  "oid": "13df69e13ee571fdd716139419a00aec47ade7d6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "70642e911ec80a47cdbf4a50aac475c11aa129b6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "59bb7c39384bf3e0b284a037adef8b3caa53c1c4"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "007cfb97b55d70ff63e1ed71d1a674638f847376"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "0a7b858a5af1393fa3cf2853f92eca0e1d408dde"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "qihqi"
+                    },
+                    "email": "qihan@fb.com",
+                    "name": "Han Qi"
+                  },
+                  "oid": "7917d789f0a523715041ade5177d271082628236"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kit1980"
+                    },
+                    "email": "sdym@fb.com",
+                    "name": "Sergii Dymchenko (Meta Employee)"
+                  },
+                  "oid": "91eb6017f0fb8a1b29e8cb48fac93bc9709f73b3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mikeyd@fb.com",
+                    "name": "Michael Andreas Dagitses"
+                  },
+                  "oid": "bd04dca5fabb0c2a51ac87063a515f256ef274fa"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mikeyd@fb.com",
+                    "name": "Michael Andreas Dagitses"
+                  },
+                  "oid": "1f805a5defda7dabc49d0059edb9ccb06bc29352"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@fb.com",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "4982c0a8db8f23d15ec4bfcbca4ce939afc04954"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pearu"
+                    },
+                    "email": "pearu.peterson@gmail.com",
+                    "name": "Pearu Peterson"
+                  },
+                  "oid": "28502265cb5925cb7db8dcb2dd2334963092714a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "e03fcaedb1342e6d65c7f7f20243000938ba60b2"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pritamdamania"
+                    },
+                    "email": "pritam.damania@fb.com",
+                    "name": "pritam"
+                  },
+                  "oid": "efb28f5a1a5d18aa96bd668ab2ab5c651be359f3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "MagiaSN"
+                    },
+                    "email": "magialiao@tencent.com",
+                    "name": "magialiao"
+                  },
+                  "oid": "52cc1b9994f861ebdd3908759ed1ab11cba1f8de"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "3cd99f23d1acd6a5bedf6f3b02be79d64350a5b6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "awgu"
+                    },
+                    "email": "andgu@fb.com",
+                    "name": "Andrew Gu"
+                  },
+                  "oid": "b00502c634a5146f4d996bd90e84d317f049e7b0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "davidberard98"
+                    },
+                    "email": "dberard@fb.com",
+                    "name": "David Berard"
+                  },
+                  "oid": "976eb7cee799dddfbe6a4122b249aaee1b6c8854"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ngimel"
+                    },
+                    "email": "ngimel@fb.com",
+                    "name": "Natalia Gimelshein"
+                  },
+                  "oid": "9608ab28744d5cae32f371490557b248c9549c66"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "4e119f0c39eb5ff0777f0e71561e6b633d85fb34"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "rohan-varma"
+                    },
+                    "email": "rvarm1@fb.com",
+                    "name": "Rohan Varma"
+                  },
+                  "oid": "447580dc565f3660eddb2c996c6ed25b88338684"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "2bc8f43e9233008ea23053fab87b83ab36fca5e3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "c13a8e891c3e3e714f60649ca1e3b082e090e9fe"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "fddc861b7ee473f57d3c2161e4618a2663a237e8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "jiyuanzFB"
+                    },
+                    "email": "jiyuanz@fb.com",
+                    "name": "Jiyuan Zhang"
+                  },
+                  "oid": "e2336dbc539d6c021720cbe43c92c9e4c8463299"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@fb.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "26e2759d1ad59aac12168b74d1ca55e42ba9455c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@fb.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "ad7aa914ee3b3d1252e31514f010ba96c40aae87"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@fb.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "f113c5d78065aafbe7b1c0e611945bfe9f67b3c0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@fb.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "a366fd01136292544b7862968ae92feba4b6d8fe"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "seemethere"
+                    },
+                    "email": "eliuriegas@fb.com",
+                    "name": "Eli Uriegas"
+                  },
+                  "oid": "afeba0773749da5883c378a2e6ac066e1ce62ca0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@fb.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "d306c99addc543908f64666baeecacbd0749f4a7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "awgu"
+                    },
+                    "email": "andgu@fb.com",
+                    "name": "Andrew Gu"
+                  },
+                  "oid": "c2456ea658f41f64ea054a422edf22a9c977399f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "awgu"
+                    },
+                    "email": "andgu@fb.com",
+                    "name": "Andrew Gu"
+                  },
+                  "oid": "a8b0a1b681c9fe41e0d553c962a5c93e81d92503"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "anjali411"
+                    },
+                    "email": "chourdiaanjali123@gmail.com",
+                    "name": "anjali411"
+                  },
+                  "oid": "af761d9a5d058c9188f16589bae4f307d35185be"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "clee2000"
+                    },
+                    "email": "csl@fb.com",
+                    "name": "Catherine Lee"
+                  },
+                  "oid": "beceb417baef35b15c2716e23178fb49f7fd6f9d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "1516554e22136db89d0aeba43a1a1a987e995d68"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "qihqi"
+                    },
+                    "email": "qihan@fb.com",
+                    "name": "Han Qi"
+                  },
+                  "oid": "68eb1fa8374eff6cbdcf0be5e37ed6775d22e722"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "janeyx99"
+                    },
+                    "email": "janeyx@fb.com",
+                    "name": "Jane Xu"
+                  },
+                  "oid": "3c7bcb99b5c0c879c2610f427880b03881f82f38"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "janeyx99"
+                    },
+                    "email": "janeyx@fb.com",
+                    "name": "Jane Xu"
+                  },
+                  "oid": "38c1a2028090353e40a019c673c9ab16b39e4825"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "albanD"
+                    },
+                    "email": "albandes@fb.com",
+                    "name": "Alban Desmaison"
+                  },
+                  "oid": "8091cbea2c95ed2c4c406b3c61547a27c6319bae"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "d81f59121969a47c8b2213a88e02cf9be0219be9"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "20d798b319cd107a767fe220f7a3027c18a1c844"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "eb35381a770b58c1cd41e935910cb4df2f3d8f14"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "e6498a657b9aa47546dcd92d1b4ffb2e1a50ebdb"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "7f821382db5ad08efe5b09a145c606852b8a9272"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "albanD"
+                    },
+                    "email": "albandes@fb.com",
+                    "name": "Alban Desmaison"
+                  },
+                  "oid": "995c0e11a97d854ff969962bd81d7341e46ecb07"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "davidberard98"
+                    },
+                    "email": "dberard@fb.com",
+                    "name": "David Berard"
+                  },
+                  "oid": "28d6258e62c9fc361a18689877c962c69889dc23"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "HarborYuan"
+                    },
+                    "email": "yuanhaobo@whu.edu.cn",
+                    "name": "Haobo Yuan"
+                  },
+                  "oid": "2350fad8391367ebf81c7236a2c883644b4ff622"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "zou3519"
+                    },
+                    "email": "zou3519@gmail.com",
+                    "name": "Richard Zou"
+                  },
+                  "oid": "3f789c9ccecdd7e2e52269453646e992a68c6b92"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "jeffdaily"
+                    },
+                    "email": "jeff.daily@amd.com",
+                    "name": "Jeff Daily"
+                  },
+                  "oid": "20f79f610c1a3314da96d49515bbfbee9442e4f8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "peterbell10"
+                    },
+                    "email": "peterbell10@live.co.uk",
+                    "name": "Peter Bell"
+                  },
+                  "oid": "5823958f047f3b71a5dc8c52a20eb8ae3291bd3e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "peterbell10"
+                    },
+                    "email": "peterbell10@live.co.uk",
+                    "name": "Peter Bell"
+                  },
+                  "oid": "a0b15c49ecf3844daf2c0dcaef44f0214259db20"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "4afc38c25ca2ca126ba4987a419a58a5c572223b"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "b606f58d4a36683fbe0a7d02adfdde7d5cc694c2"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "albanD"
+                    },
+                    "email": "albandes@fb.com",
+                    "name": "Alban Desmaison"
+                  },
+                  "oid": "2d61b4d630f6482a6c3cc7437091fad6d27c347e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "george-qi"
+                    },
+                    "email": "georgeqi94@gmail.com",
+                    "name": "George Qi"
+                  },
+                  "oid": "bc5384c47036a6cda94129f3e2f9e43c43393698"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "60fc3277634365b64465712b13db2acb76d6c890"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "1b8762e95bc38d1847fe99ed3230546c8b800bfd"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "jerryzh168"
+                    },
+                    "email": "jerryzh168@gmail.com",
+                    "name": "Jerry Zhang"
+                  },
+                  "oid": "6acf60f95f59ecbc6e8ce830dea0abba7d3ec763"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ysiraichi"
+                    },
+                    "email": "yukio.siraichi@gmail.com",
+                    "name": "Yukio Siraichi"
+                  },
+                  "oid": "8fb0276561fdd530c5a06ea195e930e0584f8705"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "albanD"
+                    },
+                    "email": "albandes@fb.com",
+                    "name": "Alban Desmaison"
+                  },
+                  "oid": "1da7aed95a8700406671425eac1e4bbc2c7a24b5"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "thiagocrepaldi"
+                    },
+                    "email": "thiago.crepaldi@microsoft.com",
+                    "name": "Thiago Crepaldi"
+                  },
+                  "oid": "83208e7dee4503c1bee1df9f6632794694dffa01"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "1a46cf08dcd3d3564604c17b2c02d7e4eb45a7ff"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "b7f9b6689445f826c83694652fea5f7cfc7070d7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "fatcat-z"
+                    },
+                    "email": "jiz@microsoft.com",
+                    "name": "Jay Zhang"
+                  },
+                  "oid": "f273961c1696b156e35f8c76f7ad37934031050d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pavithranrao"
+                    },
+                    "email": "pavithran@fb.com",
+                    "name": "Pavithran Ramachandran"
+                  },
+                  "oid": "eb410a51fcbc716873fd80a970eb932d4aaaea61"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ngimel"
+                    },
+                    "email": "ngimel@fb.com",
+                    "name": "Natalia Gimelshein"
+                  },
+                  "oid": "7dbb12cdc02332fa64264ed0df576511a5070d7e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "43675665fa6b5154de8b25125dd03d7be35c884f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "albanD"
+                    },
+                    "email": "albandes@fb.com",
+                    "name": "Alban Desmaison"
+                  },
+                  "oid": "6c4d23c402c413667463770d9a2fa801f493d3c5"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "cf3778a35129a40dee14366515201b7ed2c0f346"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "9d00a051373cb81f79cb6375942cf3ec9fff2fe6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "1eae67cf404aa8dffb80b8e85180f943878d52a6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "janeyx99"
+                    },
+                    "email": "janeyx@fb.com",
+                    "name": "Jane Xu"
+                  },
+                  "oid": "ce0e69dcda0fe41a6e964d6ac70ce8016979c71a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "swolchok"
+                    },
+                    "email": "swolchok@fb.com",
+                    "name": "Scott Wolchok"
+                  },
+                  "oid": "6faba554f6e49777f24911928edb3061b6ed0e3d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "IvanYashchuk"
+                    },
+                    "email": "ivan.yashchuk@aalto.fi",
+                    "name": "Ivan Yashchuk"
+                  },
+                  "oid": "d1d0e03f57a359f8f95331f9a34b8bed3e7cc845"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "Chillee"
+                    },
+                    "email": "chilli@fb.com",
+                    "name": "Horace He"
+                  },
+                  "oid": "bb46bd9233a9fc631802a902cb48a4c13c2722ca"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "mehtanirav"
+                    },
+                    "email": "niravmehta@fb.com",
+                    "name": "Nirav Mehta"
+                  },
+                  "oid": "3b1007fe4be12e483f2620fbac67cae42e703efc"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "mehtanirav"
+                    },
+                    "email": "niravmehta@fb.com",
+                    "name": "Nirav Mehta"
+                  },
+                  "oid": "b4b65228dd0c109f5fdf17c7d9e56f60a98e398b"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "albanD"
+                    },
+                    "email": "albandes@fb.com",
+                    "name": "Alban Desmaison"
+                  },
+                  "oid": "d629e300705196d3ae0bac5ed983b197101fa2ee"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bigfootjon"
+                    },
+                    "email": "jonjanzen@fb.com",
+                    "name": "Jon Janzen"
+                  },
+                  "oid": "52754b9e515f378f8476ad44d75b0a692bad8cde"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "samdow"
+                    },
+                    "email": "samdow@fb.com",
+                    "name": "samdow"
+                  },
+                  "oid": "128c3ad747093f4970329a82c7c4720420faeff2"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "arindamroy-eng"
+                    },
+                    "email": "61168652+arindamroy-eng@users.noreply.github.com",
+                    "name": "arindamroy-eng"
+                  },
+                  "oid": "2a0bda7d32a5bcc9827f7254a7b77cceb16ba973"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTAw",
+              "hasNextPage": true
+            },
+            "totalCount": 131
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNRg4=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRAI="
                       },
                       {
-                        "context": "ci/circleci: pytorch_macos_10_13_py3_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407415?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "node": {
+                          "app": {
+                            "name": "Netlify",
+                            "databaseId": 13473
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRBA="
                       },
                       {
-                        "context": "ci/circleci: pytorch_python_doc_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407018?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "node": {
+                          "app": {
+                            "name": "Azure Pipelines",
+                            "databaseId": 9426
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRB0="
                       },
                       {
-                        "context": "ci/circleci: pytorch_vulkan_linux_bionic_py3_6_clang9_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406566?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "node": {
+                          "app": {
+                            "name": "Dependabot",
+                            "databaseId": 29110
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRC0="
                       },
                       {
-                        "context": "ci/circleci: pytorch_vulkan_linux_bionic_py3_6_clang9_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406946?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "node": {
+                          "app": {
+                            "name": "Codecov",
+                            "databaseId": 254
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsREE="
                       },
                       {
-                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cpu_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406542?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "node": {
+                          "app": {
+                            "name": "PyTorch Bot",
+                            "databaseId": 40112
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRE4="
                       },
                       {
-                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406530?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2197192463"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192463/jobs/3232430975"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNR-Y=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRsw="
                       },
                       {
-                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_test1",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407028?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2197192461"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461134"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461211"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461301"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461386"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461521"
+                              },
+                              {
+                                "name": "lintrunner",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461634"
+                              },
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461717"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuN84s=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRtE="
                       },
                       {
-                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_test2",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407027?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2197192471"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232460797"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232460951"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461088"
+                              },
+                              {
+                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461294"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461410"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461543"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461628"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.0-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461719"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461789"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461869"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461946"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462044"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462112"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462244"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462360"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462432"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462521"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462621"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462683"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462738"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232545510"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232545571"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547522"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547612"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547714"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547764"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547824"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547869"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547909"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547973"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553452"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553558"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553605"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553650"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232563716"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232563763"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582650"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582703"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582741"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232590204"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.0-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232608872"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.0-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232608976"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637097"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637199"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637259"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232639932"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232687012"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232687074"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232785088"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232785153"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuVD9M=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRuc="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  },
+                  "status": null,
+                  "pushedDate": "2022-04-20T17:10:41Z",
+                  "oid": "5696e8357cf38f852ef3d680381513e26f202371"
+                }
+              }
+            ]
+          },
+          "changedFiles": 348,
+          "files": {
+            "nodes": [
+              {
+                "path": ".circleci/cimodel/data/pytorch_build_data.py"
+              },
+              {
+                "path": ".circleci/cimodel/data/pytorch_build_definitions.py"
+              },
+              {
+                "path": ".circleci/scripts/cpp_doc_push_script.sh"
+              },
+              {
+                "path": ".circleci/scripts/python_doc_push_script.sh"
+              },
+              {
+                "path": ".github/actions/checkout-pytorch/action.yml"
+              },
+              {
+                "path": ".github/merge_rules.json"
+              },
+              {
+                "path": ".github/scripts/gitutils.py"
+              },
+              {
+                "path": ".github/scripts/gql_mocks.json"
+              },
+              {
+                "path": ".github/scripts/trymerge.py"
+              },
+              {
+                "path": ".github/workflows/_bazel-build-test.yml"
+              },
+              {
+                "path": ".github/workflows/_linux-build.yml"
+              },
+              {
+                "path": ".github/workflows/_linux-test.yml"
+              },
+              {
+                "path": ".github/workflows/_mac-test.yml"
+              },
+              {
+                "path": ".github/workflows/_rocm-test.yml"
+              },
+              {
+                "path": ".github/workflows/_win-test.yml"
+              },
+              {
+                "path": ".github/workflows/buck_build_test.yml"
+              },
+              {
+                "path": ".github/workflows/lint.yml"
+              },
+              {
+                "path": ".github/workflows/periodic.yml"
+              },
+              {
+                "path": ".github/workflows/pull.yml"
+              },
+              {
+                "path": ".github/workflows/trunk.yml"
+              },
+              {
+                "path": ".jenkins/pytorch/macos-test.sh"
+              },
+              {
+                "path": ".jenkins/pytorch/test.sh"
+              },
+              {
+                "path": ".jenkins/pytorch/win-test.sh"
+              },
+              {
+                "path": ".lintrunner.toml"
+              },
+              {
+                "path": "BUILD.bazel"
+              },
+              {
+                "path": "CODEOWNERS"
+              },
+              {
+                "path": "README.md"
+              },
+              {
+                "path": "aten/src/ATen/BatchingRegistrations.cpp"
+              },
+              {
+                "path": "aten/src/ATen/Dispatch.h"
+              },
+              {
+                "path": "aten/src/ATen/ExpandUtils.h"
+              },
+              {
+                "path": "aten/src/ATen/FunctionalInverses.cpp"
+              },
+              {
+                "path": "aten/src/ATen/FunctionalStorageImpl.cpp"
+              },
+              {
+                "path": "aten/src/ATen/FunctionalStorageImpl.h"
+              },
+              {
+                "path": "aten/src/ATen/FunctionalTensorWrapper.cpp"
+              },
+              {
+                "path": "aten/src/ATen/FunctionalTensorWrapper.h"
+              },
+              {
+                "path": "aten/src/ATen/FunctionalizeFallbackKernel.cpp"
+              },
+              {
+                "path": "aten/src/ATen/NestedTensorImpl.cpp"
+              },
+              {
+                "path": "aten/src/ATen/OpMathType.h"
+              },
+              {
+                "path": "aten/src/ATen/SparseCsrTensorUtils.h"
+              },
+              {
+                "path": "aten/src/ATen/ThreadLocalState.cpp"
+              },
+              {
+                "path": "aten/src/ATen/ThreadLocalState.h"
+              },
+              {
+                "path": "aten/src/ATen/autocast_mode.cpp"
+              },
+              {
+                "path": "aten/src/ATen/autocast_mode.h"
+              },
+              {
+                "path": "aten/src/ATen/core/SymIntArrayRef.cpp"
+              },
+              {
+                "path": "aten/src/ATen/core/SymIntArrayRef.h"
+              },
+              {
+                "path": "aten/src/ATen/core/TensorBase.h"
+              },
+              {
+                "path": "aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h"
+              },
+              {
+                "path": "aten/src/ATen/core/dispatch/Dispatcher.h"
+              },
+              {
+                "path": "aten/src/ATen/core/interned_strings.h"
+              },
+              {
+                "path": "aten/src/ATen/core/ivalue.cpp"
+              },
+              {
+                "path": "aten/src/ATen/core/ivalue.h"
+              },
+              {
+                "path": "aten/src/ATen/core/ivalue_inl.h"
+              },
+              {
+                "path": "aten/src/ATen/core/jit_type.h"
+              },
+              {
+                "path": "aten/src/ATen/core/jit_type_base.h"
+              },
+              {
+                "path": "aten/src/ATen/core/type.cpp"
+              },
+              {
+                "path": "aten/src/ATen/cuda/CUDASparse.h"
+              },
+              {
+                "path": "aten/src/ATen/cuda/llvm_complex.cpp"
+              },
+              {
+                "path": "aten/src/ATen/cuda/llvm_jit_strings.h"
+              },
+              {
+                "path": "aten/src/ATen/native/Blas.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/Itertools.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/LinearAlgebra.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/SoftMax.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/TensorConversions.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/TensorShape.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/TensorShape.h"
+              },
+              {
+                "path": "aten/src/ATen/native/Unique.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/CUDAJitLoops.cuh"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/JitLoops.cuh"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/Lerp.cu"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/PersistentSoftmax.cuh"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/SoftMax.cu"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/Unique.cu"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/jit_utils.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/jit_utils.h"
+              },
+              {
+                "path": "aten/src/ATen/native/native_functions.yaml"
+              },
+              {
+                "path": "aten/src/ATen/native/nested/NestedTensorMath.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/quantized/cpu/qsoftmax.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/quantized/cudnn/Linear.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/quantized/cudnn/utils.h"
+              },
+              {
+                "path": "aten/src/ATen/native/sparse/SparseCsrTensor.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/ts_native_functions.yaml"
+              },
+              {
+                "path": "aten/src/ATen/record_function.cpp"
+              },
+              {
+                "path": "aten/src/ATen/record_function.h"
+              },
+              {
+                "path": "aten/src/ATen/templates/Operators.h"
+              },
+              {
+                "path": "aten/src/ATen/templates/RegisterFunctionalization.cpp"
+              },
+              {
+                "path": "aten/src/ATen/test/basic.cpp"
+              },
+              {
+                "path": "aten/src/ATen/test/vmap_test.cpp"
+              },
+              {
+                "path": "binaries/record_function_benchmark.cc"
+              },
+              {
+                "path": "c10/core/DispatchKey.cpp"
+              },
+              {
+                "path": "c10/core/DispatchKey.h"
+              },
+              {
+                "path": "c10/core/DispatchKeySet.h"
+              },
+              {
+                "path": "c10/test/core/DispatchKeySet_test.cpp"
+              },
+              {
+                "path": "c10/util/ArrayRef.h"
+              },
+              {
+                "path": "caffe2/core/tensor.h"
+              },
+              {
+                "path": "docs/source/conf.py"
+              },
+              {
+                "path": "docs/source/fx.rst"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTAw",
+              "hasNextPage": true
+            }
+          },
+          "reviews": {
+            "nodes": [],
+            "pageInfo": {
+              "startCursor": null,
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "Merge failed due to Matched rule superuser, but it was not reviewed yet by any of:zou3519,abhikrish,mehtanirav,wconstab,lc0, ...",
+                "createdAt": "2022-04-20T17:26:18Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1104215370
+              },
+              {
+                "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet",
+                "createdAt": "2022-04-20T17:31:26Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1104220908
+              },
+              {
+                "bodyText": "@pytorchbot merge this",
+                "createdAt": "2022-04-20T19:30:50Z",
+                "author": {
+                  "login": "malfet"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1104378397
+              },
+              {
+                "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet\nRaised by https://github.com/pytorch/pytorch/actions/runs/2197877090",
+                "createdAt": "2022-04-20T19:32:10Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1104379712
+              },
+              {
+                "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale. Feel free to remove the Stale label if you feel this was a mistake. If you are unable to remove the Stale label please contact a maintainer in order to do so. If you want the bot to never mark this PR stale again, add the no-stale label.Stale pull requests will automatically be closed after 30 days of inactivity.",
+                "createdAt": "2022-06-20T16:44:05Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1160658699
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOQdD9Sg==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              },
+              {
+                "node": {
+                  "name": "Stale"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=74bd29fe945c49fde4818e873fa62bc60b55b4ef6ae3f2bb719bab6cddbaa7ce cursor=MTAw name=pytorch number=76118 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "clee2000"
+                    },
+                    "email": "csl@fb.com",
+                    "name": "Catherine Lee"
+                  },
+                  "oid": "7f560351ae04ea43e58fbfda885bcf216aa26cde"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "e8677ed168a036bc7e590d800fe98dd15f10581b"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "taylorrobie@fb.com",
+                    "name": "Taylor Robie"
+                  },
+                  "oid": "ac5611caa13642ef8dbe0db453b283b42cbd900b"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "taylorrobie@fb.com",
+                    "name": "Taylor Robie"
+                  },
+                  "oid": "1184afbd3bfde0f46133aef09e55e18d3bfb3c3e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "minsii"
+                    },
+                    "email": "msi@fb.com",
+                    "name": "Min Si"
+                  },
+                  "oid": "1c05604f3d049c67dc678d0295c0add470bff3dc"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "eellison@devfair044.h1.fair",
+                    "name": "Elias Ellison"
+                  },
+                  "oid": "76ab5101bd36e8d73637d31bbea125240b7b27f0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "eellison@devfair044.h1.fair",
+                    "name": "Elias Ellison"
+                  },
+                  "oid": "c774050e92c3d8e52968e1eb635dd3e9491104b3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "guoyejun"
+                    },
+                    "email": "yejun.guo@intel.com",
+                    "name": "Guo Yejun"
+                  },
+                  "oid": "8981595c5361f07186f4534f3be71f1d829a3046"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "BowenBao"
+                    },
+                    "email": "bowbao@microsoft.com",
+                    "name": "BowenBao"
+                  },
+                  "oid": "036f362904024ac9481248965009f312bec6656b"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "janeyx99"
+                    },
+                    "email": "janeyx@fb.com",
+                    "name": "Jane Xu"
+                  },
+                  "oid": "457d994933f164a9fd70da5ca2733dd6c046a28b"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "janeyx99"
+                    },
+                    "email": "janeyx@fb.com",
+                    "name": "Jane Xu"
+                  },
+                  "oid": "f49ebc77520774e71722111d554a0215a26956df"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "mikeiovine"
+                    },
+                    "email": "mikeiovine@fb.com",
+                    "name": "Mike Iovine"
+                  },
+                  "oid": "f069e1a4a5f98d3fe961e4fc562ede59f59b4026"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "salilsdesai"
+                    },
+                    "email": "salilsdesai@fb.com",
+                    "name": "Salil Desai"
+                  },
+                  "oid": "30bccf58393b288412a0f5a2423a1a41ffce258e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "angelayi"
+                    },
+                    "email": "angelayi@fb.com",
+                    "name": "Angela Yi"
+                  },
+                  "oid": "f4ba440fe8a632c1ee88e01f7746a8a92c8f3902"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "shirong@fb.com",
+                    "name": "Shirong Wu"
+                  },
+                  "oid": "d203346c93ba96d626c6c02910888198c789ba69"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "jamesreed@fb.com",
+                    "name": "James Reed"
+                  },
+                  "oid": "73a4e34963e212b799a191fd031d2fa31d17e0ac"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "Krovatkin"
+                    },
+                    "email": "korovaikon@gmail.com",
+                    "name": "Nikolay Korovaiko"
+                  },
+                  "oid": "b9d5206dfb46f09f953aba3ffb0e1e33a99032ee"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ngimel"
+                    },
+                    "email": "ngimel@fb.com",
+                    "name": "Natalia Gimelshein"
+                  },
+                  "oid": "12114e6937573fead54e11ae6cdebe5b31dee302"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "s4ayub"
+                    },
+                    "email": "shababayub@fb.com",
+                    "name": "Shabab Ayub"
+                  },
+                  "oid": "f2323f76ad6f7f590285bf9c6d20c14a79542563"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "jaglinux"
+                    },
+                    "email": "jagdish.krishna@gmail.com",
+                    "name": "Jagadish Krishnamoorthy"
+                  },
+                  "oid": "acd4b5abe2739c09c1a02524eceda46ff93fd385"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "cccclai"
+                    },
+                    "email": "chenlai@fb.com",
+                    "name": "Chen Lai"
+                  },
+                  "oid": "04179f533283132fa334a9f91a070b1712f7323d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "zaxtax"
+                    },
+                    "email": "rob@zinkov.com",
+                    "name": "Rob Zinkov"
+                  },
+                  "oid": "5097cdcd6994ad82b3cec942b70e75dbeaee8ca4"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "5015ecb5a2b86943f457d71f5a977444dd062732"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "1c42b7789d3966cd541b08fce359b9738fee69f6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "albanD"
+                    },
+                    "email": "albandes@fb.com",
+                    "name": "Alban Desmaison"
+                  },
+                  "oid": "893ac3d334fd3e85e22423a06fe986ce453fe304"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "emcastillo"
+                    },
+                    "email": "ecastill@preferred.jp",
+                    "name": "Emilio Castillo"
+                  },
+                  "oid": "aa5d1b6b031ee2b8bb85f793a842ac1327ae4a19"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "0707a1d00f33d7098f56de339cb30436e8c2ea44"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "NivekT"
+                    },
+                    "email": "ktse@fb.com",
+                    "name": "Kevin Tse"
+                  },
+                  "oid": "ccb082d42af99f6374183cf914cc712bac585f0f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ryandaryl"
+                    },
+                    "email": "ryandarylmills@gmail.com",
+                    "name": "ryandaryl"
+                  },
+                  "oid": "4f2909cc8747808786a1871b0a6825cc4566f48c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "clee2000"
+                    },
+                    "email": "csl@fb.com",
+                    "name": "Catherine Lee"
+                  },
+                  "oid": "f764010648a29223d9ed4b955073d9d2fb1b2f43"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "5696e8357cf38f852ef3d680381513e26f202371"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTMx",
+              "hasNextPage": false
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=76123 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": true,
+          "author": {
+            "login": "kumpera"
+          },
+          "title": "Introduce distributed checkpoint with ShardedTensor.",
+          "body": "Co-authored-by: Wen Zhang <zhangwen@fb.com>\r\nCo-authored-by: Yifu Wang <yifu@fb.com>\r\n\r\n",
+          "headRefName": "st_checkpoint",
+          "headRepository": {
+            "nameWithOwner": "kumpera/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kumpera"
+                    },
+                    "email": "kumpera@fb.com",
+                    "name": "Rodrigo Kumpera"
+                  },
+                  "oid": "6bf248bc20a71f248064b795f38276326fe43aae"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kumpera"
+                    },
+                    "email": "kumpera@fb.com",
+                    "name": "Rodrigo Kumpera"
+                  },
+                  "oid": "10f84fb90bf02d7062e565ebf2c1da6352b64db7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kumpera"
+                    },
+                    "email": "kumpera@fb.com",
+                    "name": "Rodrigo Kumpera"
+                  },
+                  "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mw",
+              "hasNextPage": false
+            },
+            "totalCount": 3
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
                       {
-                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda11.0_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406524?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS2l4=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSmtI="
                       },
                       {
-                        "context": "ci/circleci: pytorch_xla_linux_bionic_py3_6_clang9_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406572?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2273063614"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063614/jobs/3379894109"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2r3Q=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm0k="
                       },
                       {
-                        "context": "ci/circleci: pytorch_xla_linux_bionic_py3_6_clang9_test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407253?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2273063615"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894107"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894332"
+                              },
+                              {
+                                "name": "lintrunner",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894444"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894520"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894567"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894616"
+                              },
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894672"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2shU=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm0o="
                       },
                       {
-                        "context": "codecov/patch",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://codecov.io/gh/pytorch/pytorch/compare/69f6d94caa3559d4f50745c26af5df041b83fee8...29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2273063632"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902301"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902363"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902507"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902560"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902579"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902603"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902637"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902685"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902740"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902761"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902794"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902874"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903006"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903111"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903193"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903284"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903357"
+                              },
+                              {
+                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903446"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903512"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903546"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379944655"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379944695"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946308"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946337"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946359"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946391"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946423"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946453"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946496"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946529"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950041"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950137"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950165"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950192"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950646"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379951202"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379951230"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963877"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963928"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963976"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379964018"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379966372"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379996173"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379996218"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379997861"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998374"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998397"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998422"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998441"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3380042106"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd5yuY=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm14="
                       },
                       {
-                        "context": "codecov/project",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://codecov.io/gh/pytorch/pytorch/compare/69f6d94caa3559d4f50745c26af5df041b83fee8...29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2276796859"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "lintrunner",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419477"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419699"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419923"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419992"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420129"
+                              },
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420208"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420309"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS3SE=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNGg="
                       },
                       {
-                        "context": "pr/caffe2-pytorch-linux-bionic-rocm3.7-py3.6-test",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://ci.pytorch.org/jenkins/job/caffe2-builds/job/pytorch-linux-bionic-rocm3.7-py3.6-trigger-test/2319/"
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2276796862"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796862/jobs/3387419465"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS1-o=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNIc="
                       },
                       {
-                        "context": "pr/pytorch-linux-bionic-rocm3.7-py3.6",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://ci.pytorch.org/jenkins/job/pytorch-builds/job/pytorch-linux-bionic-rocm3.7-py3.6-trigger/2325/"
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2276796865"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387419999"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420164"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420316"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420477"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420675"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420934"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421278"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421672"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421888"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421982"
+                              },
+                              {
+                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422191"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422303"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422476"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422715"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422963"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423092"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423234"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423421"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423622"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423739"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387545789"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387546032"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387546119"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553028"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553144"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553251"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553438"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553556"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553668"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387554002"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387554098"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387558927"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559016"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559071"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559139"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387563803"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387563894"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580868"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580936"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580993"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387581053"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387592286"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387631950"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387632035"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387649916"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387649974"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650084"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650151"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650373"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387753429"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgaCXo=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNKQ="
                       }
-                    ]
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
                   },
-                  "pushedDate": "2020-09-11T01:58:24Z",
-                  "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
+                  "status": null,
+                  "pushedDate": "2022-05-05T00:34:26Z",
+                  "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747"
                 }
               }
             ]
           },
-          "changedFiles": 5,
+          "changedFiles": 11,
           "files": {
             "nodes": [
               {
-                "path": "test/math_libraries/convolutions.py"
+                "path": "test/distributed/_shard/checkpoint/test_checkpoint.py"
+              },
+              {
+                "path": "test/distributed/_shard/checkpoint/test_file_system_checkpoint.py"
+              },
+              {
+                "path": "test/distributed/_shard/sharded_tensor/test_sharded_tensor.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/__init__.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/filesystem.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/metadata.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/resharding.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/state_dict_loader.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/state_dict_saver.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/storage.py"
+              },
+              {
+                "path": "torch/testing/_internal/distributed/_shard/sharded_tensor/_test_st_common.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTE",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "zzzwen"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "zzzwen"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "wanchaol"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "test/math_libraries/convolutions_cases/shapes_googlenet_v3.json"
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "test/math_libraries/convolutions_cases/shapes_maskrcnn_p1.json"
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "test/math_libraries/convolutions_cases/shapes_mobilenet.json"
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "test/math_libraries/convolutions_cases/shapes_resnet_50.json"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "NQ",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "zzzwen"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "zzzwen"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "simpkins"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "zzzwen"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "zzzwen"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "simpkins"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "simpkins"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pritamdamania87"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pritamdamania87"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pritamdamania87"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "wilson100hong"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "wilson100hong"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "wilson100hong"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "xunnanxu"
+                },
+                "state": "DISMISSED"
+              },
+              {
+                "author": {
+                  "login": "xunnanxu"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "xunnanxu"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
-                "state": "CHANGES_REQUESTED"
+                "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "xunnanxu"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "xunnanxu"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "xunnanxu"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
-                "state": "CHANGES_REQUESTED"
+                "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "ailzhang"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "ngimel"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "VitalyFedyunin"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "ngimel"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mingxiaoh"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mingxiaoh"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "VitalyFedyunin"
+                  "login": "kumpera"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "VitalyFedyunin"
+                  "login": "pritamdamania87"
                 },
-                "state": "APPROVED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAxOS0xMi0zMFQxMDoxOToxMS0wODowMLkyMDE5LTEyLTMwVDEwOjE5OjExLTA4OjAwzhQZLuY=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
+                "state": "COMMENTED"
+              },
               {
-                "bodyText": "I cloned your repo and ran the tests:\n~/pytorch/test/math_libraries$ python convolutions.py\nFFFF\n======================================================================\nFAIL: test_conv2d_ext_cpu_float32 (__main__.TestConvExtCPU)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n----------------------------------------------------------------------\nRan 4 tests in 33.838s\n\nFAILED (failures=4)\n\nStill fails.\n\n@mruberry  It is suggested by @VitalyFedyunin that, we need to display fail test to avoid invalid inputs, I guess we should set it as expected failures under the pytest test framework, right? we will change it as expected failure cases under pytest test framework. The result will looks like be low, is it ok?\n2500 passed, 136 skipped, 0 failed, 0 errors, 2 expected failures, 0 unexpected passes",
-                "createdAt": "2020-08-14T01:36:20Z",
                 "author": {
-                  "login": "mingxiaoh"
-                },
-                "authorAssociation": "NONE",
-                "editor": {
-                  "login": "mingxiaoh"
+                  "login": "pritamdamania87"
                 },
-                "databaseId": 673816925
+                "state": "COMMENTED"
               },
               {
-                "bodyText": "Displaying tests that fail is fine, but I don't think @VitalyFedyunin meant that it was OK if the tests didn't pass. If these are expected failures then yes, you can use with self.assertRaises(RuntimeError):... when testing them. If you also want to report that the test has test cases with these properties you can print or warn, which will appear in the test output.",
-                "createdAt": "2020-08-14T03:09:37Z",
                 "author": {
-                  "login": "mruberry"
+                  "login": "kumpera"
                 },
-                "authorAssociation": "COLLABORATOR",
-                "editor": null,
-                "databaseId": 673858224
+                "state": "COMMENTED"
               },
               {
-                "bodyText": "Codecov Report\n\nMerging #31093 into master will not change coverage.\nThe diff coverage is n/a.\n\n\n@@           Coverage Diff           @@\n##           master   #31093   +/-   ##\n=======================================\n  Coverage   68.00%   68.00%           \n=======================================\n  Files         382      382           \n  Lines       49527    49527           \n=======================================\n  Hits        33679    33679           \n  Misses      15848    15848           \n\nContinue to review full report at Codecov.\n\nLegend - Click here to learn more\n\u0394 = absolute <relative> (impact), \u00f8 = not affected, ? = missing data\nPowered by Codecov. Last update 69f6d94...29f6aa6. Read the comment docs.",
-                "createdAt": "2020-09-04T05:41:01Z",
                 "author": {
-                  "login": "codecov"
-                },
-                "authorAssociation": "NONE",
-                "editor": {
-                  "login": "codecov"
+                  "login": "pritamdamania87"
                 },
-                "databaseId": 686921371
+                "state": "COMMENTED"
               },
               {
-                "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale.  Feel free to remove the Stale label if you feel this was a mistake.  If you are unable to remove the Stale label please contact a maintainer in order to do so.  Stale pull requests will automatically be closed 30 days after being marked Stale",
-                "createdAt": "2022-04-12T02:35:37Z",
                 "author": {
-                  "login": "pytorchbot"
+                  "login": "pritamdamania87"
                 },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1095860944
+                "state": "APPROVED"
               },
               {
-                "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale. Feel free to remove the Stale label if you feel this was a mistake. If you are unable to remove the Stale label please contact a maintainer in order to do so. If you want the bot to never mark this PR stale again, add the no-stale label.Stale pull requests will automatically be closed after 30 days of inactivity.",
-                "createdAt": "2022-06-11T04:40:16Z",
                 "author": {
-                  "login": "github-actions"
+                  "login": "kumpera"
                 },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1152854802
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOKCmhXQ==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "triaged"
-                }
-              },
-              {
-                "node": {
-                  "name": "open source"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              },
-              {
-                "node": {
-                  "name": "Stale"
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=76118 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "malfet"
-          },
-          "title": "Dummy change with lots of commits",
-          "body": "Draft PR with 100+ commits, to test mergebot ",
-          "headRefName": "malfet/pr-with-lots-of-commits",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "3067f2240afc7a29dc348000aa19eccbd9772303"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "andrewor14"
-                    },
-                    "email": "andrewor@fb.com",
-                    "name": "Andrew Or"
-                  },
-                  "oid": "2f655b71f70c496c4e645f6cdb27d7bb7e825701"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "0c6dcaa7f58a19c42a530f4ee14bb6f0f03ca9fb"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "cad11c563d41ebcffb1683fe1f1288b8157413b3"
-                }
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "jwtan@fb.com",
-                    "name": "Jiewen Tan"
-                  },
-                  "oid": "4dfd0875a68d87fccb5ad0d81692db480043b86e"
-                }
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "2d37e74690582a4a26890e4c8b98f1f80e589c82"
-                }
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "jwtan@fb.com",
-                    "name": "Jiewen Tan"
-                  },
-                  "oid": "d4aee60947e1a3ef23c7c42990621e0746fdd0a8"
-                }
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "peterbell10"
-                    },
-                    "email": "peterbell10@live.co.uk",
-                    "name": "Peter Bell"
-                  },
-                  "oid": "aac6204bf710beb5e50a383d426ae6222396335a"
-                }
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "4b0362cab884584c24f5834b3874f5f357f56b5d"
-                }
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "7536df613cbc645a9e68e6a3b0a8450753260fd1"
-                }
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "20a50cb966d28d7bf82924adf781cf72a01ef90e"
-                }
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0yNVQxMTozNTowMS0wNzowMLkyMDIyLTA0LTI1VDExOjM1OjAwLTA3OjAwzjjC2d0=",
+              "hasPreviousPage": true
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136",
+                "createdAt": "2022-05-05T12:35:49Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1118495479
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "486387e8644afb46edff5aa5925b55c8119f67f0"
-                }
+                "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136",
+                "createdAt": "2022-05-05T12:53:15Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1118511287
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "acb9d78b9b732d3667b881727e6ed9f92a8c549f"
-                }
+                "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136",
+                "createdAt": "2022-05-05T15:00:08Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1118662274
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "683bb7959a5b973f8470c081ad02e8fc508e784a"
-                }
+                "bodyText": "Merge failed due to Can't fetch all PR reviews Raised by https://github.com/pytorch/pytorch/actions/runs/2275691136\n\n@osalpekar @malfet This is failing because there are 109 review comments on this PR but we only fetch the first 100. This could be solved with a similar concept as how we fetch more comments/check_runs.",
+                "createdAt": "2022-05-05T15:20:46Z",
+                "author": {
+                  "login": "janeyx99"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1118689010
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "qihqi"
-                    },
-                    "email": "qihan@fb.com",
-                    "name": "Han Qi"
-                  },
-                  "oid": "a870cb40af65adf0b77d55f6b554d7093d284d7a"
+                "bodyText": "On a side note, has the test_fsdp_clip_grad_norm_norm_type_2_0_nested_fsdp_False_cpu_offload_CPUOffload failure on the distributed test first shard of this PR been addressed?",
+                "createdAt": "2022-05-05T15:24:08Z",
+                "author": {
+                  "login": "janeyx99"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1118693497
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOQqri9w==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "oncall: distributed"
                 }
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "Krovatkin"
-                    },
-                    "email": "korovaikon@gmail.com",
-                    "name": "Nikolay Korovaiko"
-                  },
-                  "oid": "70793b9f328ddf52cc86336104c3a064c8582ef4"
+                "node": {
+                  "name": "cla signed"
                 }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=6a8ce6412a780d5804bfe180ed1dc807269e1eae2ae50de2346d56d1283884bc cursor=Y3Vyc29yOnYyOpO5MjAyMi0wNC0yNVQxMTozNTowMS0wNzowMLkyMDIyLTA0LTI1VDExOjM1OjAwLTA3OjAwzjjC2d0= name=pytorch number=76123 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "pritamdamania87"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "suo"
-                    },
-                    "email": "suo@fb.com",
-                    "name": "Michael Suo"
-                  },
-                  "oid": "f70b31f62b1c5159eef2725484b175983517c88c"
-                }
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mikeyd@fb.com",
-                    "name": "Michael Andreas Dagitses"
-                  },
-                  "oid": "04d3ec1db60defe1c6904bf77e9f8dfa87dc0b63"
-                }
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "46b754a55b63e3168ad5854ad412c124934b675d"
-                }
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "robieta"
-                    },
-                    "email": "taylorrobie@fb.com",
-                    "name": "Taylor Robie"
-                  },
-                  "oid": "13df69e13ee571fdd716139419a00aec47ade7d6"
-                }
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "70642e911ec80a47cdbf4a50aac475c11aa129b6"
-                }
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
-                  },
-                  "oid": "59bb7c39384bf3e0b284a037adef8b3caa53c1c4"
-                }
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
               },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0yMlQyMDozNzo1NC0wNzowMLkyMDIyLTA0LTIyVDE2OjAyOjA5LTA3OjAwzjip7G8=",
+              "hasPreviousPage": false
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=71759 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": true,
+          "author": {
+            "login": "coolteemf"
+          },
+          "title": "Optimize grid sample 3d",
+          "body": "Fixes #71415\r\nI have implemented the changes that replicate what @to-mi did in this [PR](https://github.com/pytorch/pytorch/pull/65986#issue-1012959443) for the 3D case :\r\n\r\n> Fixes #64977\r\n> \r\n> Avoids creating a tensor for and calculating `input` gradient if it's not needed in the backward pass of `grid_sample` (2d case, native CPU & CUDA kernels). Especially the tensor creation seemed time consuming (see #64977).\r\n> \r\n> Brief description of the changes:\r\n> \r\n>     * I have tried to go with rather minimal changes. It would probably be possible to make a more elegant version with a bit larger refactoring (or possibly with better understanding of PyTorch internals and C++ functionalities).\r\n> \r\n>     * Changed the `native_functions.yaml` and `derivatives.yaml` so that the gradient input mask is passed to the functions.\r\n> \r\n>     * Changed the CPU kernels:\r\n>       (1) added `bool input_requires_grad` template parameter to the `backward` function,\r\n>       (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n>       (3) feed in `TensorAccessor<scalar_t, 3>* gInp_slice_ptr` instead of `TensorAccessor<scalar_t, 3>& gInp_slice` so that I can pass a `nullptr` in case gradient for `input` is not requested. (A bit inelegant perhaps, but allows to keep one signature for `backward` function and not require breaking it to smaller pieces. Perhaps there's a more elegant way to achieve this?)\r\n> \r\n>     * Changed CUDA kernel:\r\n>       (1) added ~`bool input_requires_grad` template parameter~ `const bool input_requires_grad` argument to the `backward` function,\r\n>       (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n>       (3) feed in `TensorInfo<scalar_t, index_t>()` instead of `getTensorInfo<scalar_t, index_t>(grad_input)` in case gradient for `input` is not requested.\r\n> \r\n>     * Modified tests in `test/test_nn.py` so that they run also cases with no `input` gradient needed.\r\n> \r\n>     * Have not touched the CPU fallback kernel.\r\n\r\nNote: the changes number (3) are N/A in this case.\r\n\r\n",
+          "headRefName": "optimize_grid_sample_3d",
+          "headRepository": {
+            "nameWithOwner": "coolteemf/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "007cfb97b55d70ff63e1ed71d1a674638f847376"
+                  "oid": "e0b0d1e695aeddceaf265da602c4704592053e9e"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "0a7b858a5af1393fa3cf2853f92eca0e1d408dde"
+                  "oid": "563ec73747ad53b63b36736c47c4342f962c2a09"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "qihqi"
-                    },
-                    "email": "qihan@fb.com",
-                    "name": "Han Qi"
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "7917d789f0a523715041ade5177d271082628236"
+                  "oid": "51abe41a132d9dd5b1c0551bdca902aacc028ff8"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "kit1980"
-                    },
-                    "email": "sdym@fb.com",
-                    "name": "Sergii Dymchenko (Meta Employee)"
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "91eb6017f0fb8a1b29e8cb48fac93bc9709f73b3"
+                  "oid": "be9898205992034a00e8ace8a55c2ecdcee2c2f8"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": null,
-                    "email": "mikeyd@fb.com",
-                    "name": "Michael Andreas Dagitses"
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "bd04dca5fabb0c2a51ac87063a515f256ef274fa"
+                  "oid": "2929c60b64384c2deae0f7dea8bab94ad4bc9ec8"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": null,
-                    "email": "mikeyd@fb.com",
-                    "name": "Michael Andreas Dagitses"
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "1f805a5defda7dabc49d0059edb9ccb06bc29352"
+                  "oid": "9241b737e7e2b257905cc74ad9c50b737d7f9d0a"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": null,
-                    "email": "mruberry@fb.com",
-                    "name": "Mike Ruberry"
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "4982c0a8db8f23d15ec4bfcbca4ce939afc04954"
+                  "oid": "64d6b795d0636928a8aa2fd3da01302fb5f5f7af"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "pearu"
-                    },
-                    "email": "pearu.peterson@gmail.com",
-                    "name": "Pearu Peterson"
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "28502265cb5925cb7db8dcb2dd2334963092714a"
+                  "oid": "4503577e53760a0006f1e80ca6bfe04d2be90470"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "e03fcaedb1342e6d65c7f7f20243000938ba60b2"
+                  "oid": "b16f4b11ffbbbf2ca2098f9702af4ef6b6fc5e1f"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "pritamdamania"
-                    },
-                    "email": "pritam.damania@fb.com",
-                    "name": "pritam"
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "efb28f5a1a5d18aa96bd668ab2ab5c651be359f3"
+                  "oid": "7ffc23368a604afdc92d2818747f730ce31a2bb5"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "MagiaSN"
-                    },
-                    "email": "magialiao@tencent.com",
-                    "name": "magialiao"
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "52cc1b9994f861ebdd3908759ed1ab11cba1f8de"
+                  "oid": "b85292604b9ad6c31706b76b5a5498c4f6d94309"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "3cd99f23d1acd6a5bedf6f3b02be79d64350a5b6"
+                  "oid": "9d81d7bae8ad91aaa24b3ceab83e3138894dbc69"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "awgu"
-                    },
-                    "email": "andgu@fb.com",
-                    "name": "Andrew Gu"
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "b00502c634a5146f4d996bd90e84d317f049e7b0"
+                  "oid": "e79f6a2202512b294c55bf4bfb2e0524fafd4c48"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "davidberard98"
-                    },
-                    "email": "dberard@fb.com",
-                    "name": "David Berard"
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "976eb7cee799dddfbe6a4122b249aaee1b6c8854"
+                  "oid": "f683e8aec7aea76097a264eec01511e704c31154"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "ngimel"
+                      "login": "coolteemf"
                     },
-                    "email": "ngimel@fb.com",
-                    "name": "Natalia Gimelshein"
+                    "email": "67541941+coolteemf@users.noreply.github.com",
+                    "name": "Fran\u00e7ois Lecomte"
                   },
-                  "oid": "9608ab28744d5cae32f371490557b248c9549c66"
+                  "oid": "b932e9e286c22aaf352375186df851ef060b295a"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
                   },
-                  "oid": "4e119f0c39eb5ff0777f0e71561e6b633d85fb34"
+                  "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22"
                 }
-              },
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTY",
+              "hasNextPage": false
+            },
+            "totalCount": 16
+          },
+          "commits": {
+            "nodes": [
               {
                 "commit": {
-                  "author": {
-                    "user": {
-                      "login": "rohan-varma"
-                    },
-                    "email": "rvarm1@fb.com",
-                    "name": "Rohan Varma"
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGYqY=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_T6g="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-clang7-onnx"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754066"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663109808"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663214802"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663214856"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIob0=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ubk="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3-clang5-mobile-build"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754064"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754064/jobs/2663109676"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1E=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ubw="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-bionic-rocm4.5-py3.7"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754065"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663109684"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401083"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401143"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.rocm.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401186"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwMsZY=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ub0="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "win-vs2019-cuda11.3-py3"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754068"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663109680"
+                              },
+                              {
+                                "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995756"
+                              },
+                              {
+                                "name": "test (force_on_cpu, 1, 1, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995819"
+                              },
+                              {
+                                "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995900"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwZbzg=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ub8="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754069"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "mypy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109683"
+                              },
+                              {
+                                "name": "shellcheck",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109827"
+                              },
+                              {
+                                "name": "py2-setup-validate-errormsg",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109962"
+                              },
+                              {
+                                "name": "clang-format",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110044"
+                              },
+                              {
+                                "name": "cmakelint",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110132"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110233"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110320"
+                              },
+                              {
+                                "name": "clang-tidy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110461"
+                              },
+                              {
+                                "name": "flake8-py3",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110575"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGbAQ=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-clang7-asan"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754070"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663109804"
+                              },
+                              {
+                                "name": "test (default, 3, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233675"
+                              },
+                              {
+                                "name": "test (default, 1, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233731"
+                              },
+                              {
+                                "name": "test (default, 2, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233805"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwJC4U=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcE="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754076"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754076/jobs/2663109810"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ_w=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcY="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc5.4"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754078"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663109777"
+                              },
+                              {
+                                "name": "test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201383"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201458"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201512"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201580"
+                              },
+                              {
+                                "name": "test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201672"
+                              },
+                              {
+                                "name": "test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201839"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIWu4=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uco="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754079"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754079/jobs/2663109681"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1k=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uc0="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
                   },
-                  "oid": "447580dc565f3660eddb2c996c6ed25b88338684"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017798?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017799?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017816?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017800?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      }
+                    ]
                   },
-                  "oid": "2bc8f43e9233008ea23053fab87b83ab36fca5e3"
+                  "pushedDate": "2022-02-23T10:39:30Z",
+                  "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22"
                 }
-              },
+              }
+            ]
+          },
+          "changedFiles": 9,
+          "files": {
+            "nodes": [
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "c13a8e891c3e3e714f60649ca1e3b082e090e9fe"
-                }
+                "path": "aten/src/ATen/native/GridSampler.cpp"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "fddc861b7ee473f57d3c2161e4618a2663a237e8"
-                }
+                "path": "aten/src/ATen/native/cpu/GridSamplerKernel.cpp"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "jiyuanzFB"
-                    },
-                    "email": "jiyuanz@fb.com",
-                    "name": "Jiyuan Zhang"
-                  },
-                  "oid": "e2336dbc539d6c021720cbe43c92c9e4c8463299"
-                }
+                "path": "aten/src/ATen/native/cuda/GridSampler.cpp"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@fb.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "26e2759d1ad59aac12168b74d1ca55e42ba9455c"
-                }
+                "path": "aten/src/ATen/native/cuda/GridSampler.cu"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@fb.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "ad7aa914ee3b3d1252e31514f010ba96c40aae87"
-                }
+                "path": "aten/src/ATen/native/cuda/GridSampler.h"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@fb.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "f113c5d78065aafbe7b1c0e611945bfe9f67b3c0"
-                }
+                "path": "aten/src/ATen/native/native_functions.yaml"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@fb.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "a366fd01136292544b7862968ae92feba4b6d8fe"
-                }
+                "path": "test/forward_backward_compatibility/check_forward_backward_compatibility.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "seemethere"
-                    },
-                    "email": "eliuriegas@fb.com",
-                    "name": "Eli Uriegas"
-                  },
-                  "oid": "afeba0773749da5883c378a2e6ac066e1ce62ca0"
-                }
+                "path": "test/test_nn.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "bdhirsh"
-                    },
-                    "email": "hirsheybar@fb.com",
-                    "name": "Brian Hirsh"
-                  },
-                  "oid": "d306c99addc543908f64666baeecacbd0749f4a7"
-                }
-              },
+                "path": "tools/autograd/derivatives.yaml"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "OQ",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "awgu"
-                    },
-                    "email": "andgu@fb.com",
-                    "name": "Andrew Gu"
-                  },
-                  "oid": "c2456ea658f41f64ea054a422edf22a9c977399f"
-                }
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "awgu"
-                    },
-                    "email": "andgu@fb.com",
-                    "name": "Andrew Gu"
-                  },
-                  "oid": "a8b0a1b681c9fe41e0d553c962a5c93e81d92503"
-                }
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "anjali411"
-                    },
-                    "email": "chourdiaanjali123@gmail.com",
-                    "name": "anjali411"
-                  },
-                  "oid": "af761d9a5d058c9188f16589bae4f307d35185be"
-                }
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "clee2000"
-                    },
-                    "email": "csl@fb.com",
-                    "name": "Catherine Lee"
-                  },
-                  "oid": "beceb417baef35b15c2716e23178fb49f7fd6f9d"
-                }
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "1516554e22136db89d0aeba43a1a1a987e995d68"
-                }
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "qihqi"
-                    },
-                    "email": "qihan@fb.com",
-                    "name": "Han Qi"
-                  },
-                  "oid": "68eb1fa8374eff6cbdcf0be5e37ed6775d22e722"
-                }
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "janeyx99"
-                    },
-                    "email": "janeyx@fb.com",
-                    "name": "Jane Xu"
-                  },
-                  "oid": "3c7bcb99b5c0c879c2610f427880b03881f82f38"
-                }
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "janeyx99"
-                    },
-                    "email": "janeyx@fb.com",
-                    "name": "Jane Xu"
-                  },
-                  "oid": "38c1a2028090353e40a019c673c9ab16b39e4825"
-                }
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "albanD"
-                    },
-                    "email": "albandes@fb.com",
-                    "name": "Alban Desmaison"
-                  },
-                  "oid": "8091cbea2c95ed2c4c406b3c61547a27c6319bae"
-                }
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "d81f59121969a47c8b2213a88e02cf9be0219be9"
-                }
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "20d798b319cd107a767fe220f7a3027c18a1c844"
-                }
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "eb35381a770b58c1cd41e935910cb4df2f3d8f14"
-                }
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
-                  },
-                  "oid": "e6498a657b9aa47546dcd92d1b4ffb2e1a50ebdb"
-                }
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "dzdang"
-                    },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
-                  },
-                  "oid": "7f821382db5ad08efe5b09a145c606852b8a9272"
-                }
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "albanD"
-                    },
-                    "email": "albandes@fb.com",
-                    "name": "Alban Desmaison"
-                  },
-                  "oid": "995c0e11a97d854ff969962bd81d7341e46ecb07"
-                }
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "davidberard98"
-                    },
-                    "email": "dberard@fb.com",
-                    "name": "David Berard"
-                  },
-                  "oid": "28d6258e62c9fc361a18689877c962c69889dc23"
-                }
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "APPROVED"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "HarborYuan"
-                    },
-                    "email": "yuanhaobo@whu.edu.cn",
-                    "name": "Haobo Yuan"
-                  },
-                  "oid": "2350fad8391367ebf81c7236a2c883644b4ff622"
-                }
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMS0yNVQwODoyODoxMC0wODowMLkyMDIyLTAxLTI1VDA3OjU0OjA1LTA4OjAwzjNooqI=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "Merge failed due to 'NoneType' object is not subscriptable\nRaised by https://github.com/pytorch/pytorch/actions/runs/1887945630",
+                "createdAt": "2022-02-23T14:55:36Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1048868910
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "zou3519"
-                    },
-                    "email": "zou3519@gmail.com",
-                    "name": "Richard Zou"
-                  },
-                  "oid": "3f789c9ccecdd7e2e52269453646e992a68c6b92"
-                }
+                "bodyText": "Thanks for the update! The windows failure is not your fault, you can ignore it!\n\nThank you very much for all of your feedback and sorry for the delay !",
+                "createdAt": "2022-02-23T16:44:36Z",
+                "author": {
+                  "login": "coolteemf"
+                },
+                "authorAssociation": "CONTRIBUTOR",
+                "editor": null,
+                "databaseId": 1048983572
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "jeffdaily"
-                    },
-                    "email": "jeff.daily@amd.com",
-                    "name": "Jeff Daily"
-                  },
-                  "oid": "20f79f610c1a3314da96d49515bbfbee9442e4f8"
-                }
+                "bodyText": "@coolteemf can you please send either me or @albanD an email? (or I can send you and invite to collab on private repo)",
+                "createdAt": "2022-02-23T17:49:55Z",
+                "author": {
+                  "login": "malfet"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1049048119
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "peterbell10"
-                    },
-                    "email": "peterbell10@live.co.uk",
-                    "name": "Peter Bell"
-                  },
-                  "oid": "5823958f047f3b71a5dc8c52a20eb8ae3291bd3e"
-                }
+                "bodyText": "@pytorchbot merge this please",
+                "createdAt": "2022-02-23T19:23:55Z",
+                "author": {
+                  "login": "albanD"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1049131992
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "peterbell10"
-                    },
-                    "email": "peterbell10@live.co.uk",
-                    "name": "Peter Bell"
-                  },
-                  "oid": "a0b15c49ecf3844daf2c0dcaef44f0214259db20"
+                "bodyText": "Hey @coolteemf.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-02-23T19:26:51Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1049134520
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOPoR4Lg==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "triaged"
                 }
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "4afc38c25ca2ca126ba4987a419a58a5c572223b"
+                "node": {
+                  "name": "open source"
                 }
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "b606f58d4a36683fbe0a7d02adfdde7d5cc694c2"
+                "node": {
+                  "name": "cla signed"
                 }
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "albanD"
-                    },
-                    "email": "albandes@fb.com",
-                    "name": "Alban Desmaison"
-                  },
-                  "oid": "2d61b4d630f6482a6c3cc7437091fad6d27c347e"
+                "node": {
+                  "name": "release notes: nn"
                 }
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "george-qi"
-                    },
-                    "email": "georgeqi94@gmail.com",
-                    "name": "George Qi"
-                  },
-                  "oid": "bc5384c47036a6cda94129f3e2f9e43c43393698"
+                "node": {
+                  "name": "topic: performance"
                 }
-              },
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=75095 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "mruberry"
+          },
+          "title": "Initial prims, references, and test architecture for them",
+          "body": "This PR adds an initial set of experimental primitive operations and Python references that reimplement existing PyTorch operations using them. See https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-0/577 for additional context.\r\n\r\nThe following experimental primitives are added:\r\n\r\n- Elementwise unary prims -- abs, acos, acosh, asin, atan, cos, cosh, bessel_i0e, bessel_i1e, cbrt, ceil, digamma, erf, erf_inv, erfc, exp, expm1, floor, igamma, igammac, is_finite, lgamma, log, log1p, neg, reciprocal, round, sign, sinh, sqrt, square, tan. \r\n- Elementwise binary prims -- add, atan2, bitwise_and, bitwise_not, bitwise_or, bitwise_xor, div, eq, ge, gt, le, lt, max, min, mul, ne, nextafter, pow, rsqrt, shift_left, shift_right_arithmetic\r\n- View prims -- brodcast_in_dim, collapse_view, split_dim, squeeze\r\n- Shape prims -- collapse, concatenate, reshape\r\n- Conditional prims -- select\r\n- Data conversion & movement prims -- convert_element_type, device_put\r\n- Inplace prims -- copy_to, resize\r\n\r\nThese primitives do not add any new functionality to PyTorch, but are intended to be the semantic building blocks for reference operators. We have tried to make them consistent with the operations in [jax.lax](https://jax.readthedocs.io/en/latest/jax.lax.html) where possible (because PyTorch prefers being consistent with other frameworks), although there are key differences between these prims and operations in jax.lax. Most notably is that these prims model view semantics and inplace operations.\r\n\r\nIn addition to these primitives the following elementwise binary Python references are added:\r\n\r\n- Elementwise binary Python references -- add, atan2, bitwise_and, bitwise_left_shift, bitwise_or, bitwise_right_shift, bitwise_xor, eq, float_power, ge, gt, le, lt, maximum, minimum, mul, ne, nextafter, pow, sub, true_divide\r\n- Conditional Python references - where\r\n- Data conversion & movement references - copy_to\r\n\r\nA Python reference implements the same behavior as its corresponding PyTorch operator (excepting slight numerical differences, bug fixes, and in some cases additional features). \r\n\r\nThe start of an OpInfo-based test architecture for these references is also included in this PR. A new list, `python_ref_db`, is added to `common_methods_invocations.py`. This list introduces the new `ElementwiseBinaryPythonRefInfo`, which inherits input arguments from the original operators' OpInfo, allows them to be overridden, and then constructs the OpInfo for the Python reference using the (potentially modified) arguments. OpInfo-based tests can opt-into testing references by including this new list in the Sequence passed to the `@ops` decorator. \r\n\r\ncc @ngimel @csarofeen @kevinstephano @Lezcano ",
+          "headRefName": "prims_and_references",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "60fc3277634365b64465712b13db2acb76d6c890"
+                  "oid": "a790467c650be92775103cde5e866c90b56f5376"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "1b8762e95bc38d1847fe99ed3230546c8b800bfd"
+                  "oid": "bd6fcf50692e208ebecdc2eaa517a2bfcdcd35cf"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "jerryzh168"
-                    },
-                    "email": "jerryzh168@gmail.com",
-                    "name": "Jerry Zhang"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "6acf60f95f59ecbc6e8ce830dea0abba7d3ec763"
+                  "oid": "4a119c8f21529fe1375e7e8789b91f41a3df80c5"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "ysiraichi"
-                    },
-                    "email": "yukio.siraichi@gmail.com",
-                    "name": "Yukio Siraichi"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "8fb0276561fdd530c5a06ea195e930e0584f8705"
+                  "oid": "ea6750dc34d66be759fdfe84b09fb0e23ee59c79"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "albanD"
-                    },
-                    "email": "albandes@fb.com",
-                    "name": "Alban Desmaison"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "1da7aed95a8700406671425eac1e4bbc2c7a24b5"
+                  "oid": "2eef8a55fe0227e1921b51bf1f56f9d0a29b49ac"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "thiagocrepaldi"
-                    },
-                    "email": "thiago.crepaldi@microsoft.com",
-                    "name": "Thiago Crepaldi"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "83208e7dee4503c1bee1df9f6632794694dffa01"
+                  "oid": "b886ed6c20dd1785fd31ed6fa6a8c5b6d0d0b16c"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "kshitij12345"
-                    },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "1a46cf08dcd3d3564604c17b2c02d7e4eb45a7ff"
+                  "oid": "9ad9b63d09aa4f7a8549bcf1d88ea4ff0674299c"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "b7f9b6689445f826c83694652fea5f7cfc7070d7"
+                  "oid": "63fdd580118477416ae160e0670ae722ea248090"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "fatcat-z"
-                    },
-                    "email": "jiz@microsoft.com",
-                    "name": "Jay Zhang"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "f273961c1696b156e35f8c76f7ad37934031050d"
+                  "oid": "0ccf7dc292af1d40d0a094eb2b2fb0c7ab4ccc70"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "pavithranrao"
-                    },
-                    "email": "pavithran@fb.com",
-                    "name": "Pavithran Ramachandran"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "eb410a51fcbc716873fd80a970eb932d4aaaea61"
+                  "oid": "e8a8a4d1fbe35f20eb88e1a43cf5a653883638e5"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "ngimel"
-                    },
-                    "email": "ngimel@fb.com",
-                    "name": "Natalia Gimelshein"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "7dbb12cdc02332fa64264ed0df576511a5070d7e"
+                  "oid": "186634dfdd25645c05b58a212f9e8d77c4125fc0"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "pytorchmergebot"
-                    },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "43675665fa6b5154de8b25125dd03d7be35c884f"
+                  "oid": "f5b4741312b5c42a79f6c8a1d3930b79db38ed8f"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "albanD"
+                      "login": "ezyang"
                     },
-                    "email": "albandes@fb.com",
-                    "name": "Alban Desmaison"
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
                   },
-                  "oid": "6c4d23c402c413667463770d9a2fa801f493d3c5"
+                  "oid": "23d50391bb0fd12111fd3171591c4235ffb2fc1a"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "pytorchmergebot"
+                      "login": "ezyang"
                     },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
                   },
-                  "oid": "cf3778a35129a40dee14366515201b7ed2c0f346"
+                  "oid": "bac9d45422d58f513b60b4b854441cfdc253d4c5"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "dzdang"
+                      "login": "ezyang"
                     },
-                    "email": "dzdang@umich.edu",
-                    "name": "dzdang"
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
                   },
-                  "oid": "9d00a051373cb81f79cb6375942cf3ec9fff2fe6"
+                  "oid": "13240ae0b4a0332c3167b65ac026a3172da90cb7"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "pytorchmergebot"
+                      "login": "ezyang"
                     },
-                    "email": "pytorchmergebot@users.noreply.github.com",
-                    "name": "PyTorch MergeBot"
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
                   },
-                  "oid": "1eae67cf404aa8dffb80b8e85180f943878d52a6"
+                  "oid": "1ee34468cb1db3dc6cbae204669f4fec20e2a466"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "janeyx99"
+                      "login": "ezyang"
                     },
-                    "email": "janeyx@fb.com",
-                    "name": "Jane Xu"
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
                   },
-                  "oid": "ce0e69dcda0fe41a6e964d6ac70ce8016979c71a"
+                  "oid": "561d132bc686d00e8911f7feb3da5901b2bdc574"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "swolchok"
+                      "login": "ngimel"
                     },
-                    "email": "swolchok@fb.com",
-                    "name": "Scott Wolchok"
+                    "email": "ngimel@fb.com",
+                    "name": "Natalia Gimelshein"
                   },
-                  "oid": "6faba554f6e49777f24911928edb3061b6ed0e3d"
+                  "oid": "ac42bedc84b7c96256376ad09917263bb020b2c3"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "IvanYashchuk"
+                      "login": "ngimel"
                     },
-                    "email": "ivan.yashchuk@aalto.fi",
-                    "name": "Ivan Yashchuk"
+                    "email": "ngimel@fb.com",
+                    "name": "Natalia Gimelshein"
                   },
-                  "oid": "d1d0e03f57a359f8f95331f9a34b8bed3e7cc845"
+                  "oid": "7f7d5ba40a0b5e10526d90b018b30b54673d12d8"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "Chillee"
-                    },
-                    "email": "chilli@fb.com",
-                    "name": "Horace He"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "bb46bd9233a9fc631802a902cb48a4c13c2722ca"
+                  "oid": "37a6b4a8b1adb712d5777c7c3479866c27fb3c4e"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "mehtanirav"
+                      "login": "ngimel"
                     },
-                    "email": "niravmehta@fb.com",
-                    "name": "Nirav Mehta"
+                    "email": "ngimel@fb.com",
+                    "name": "Natalia Gimelshein"
                   },
-                  "oid": "3b1007fe4be12e483f2620fbac67cae42e703efc"
+                  "oid": "65b613868c44e519c1777af79b9fd3498c5a7e58"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "mehtanirav"
+                      "login": "ngimel"
                     },
-                    "email": "niravmehta@fb.com",
-                    "name": "Nirav Mehta"
+                    "email": "ngimel@fb.com",
+                    "name": "Natalia Gimelshein"
                   },
-                  "oid": "b4b65228dd0c109f5fdf17c7d9e56f60a98e398b"
+                  "oid": "442c405e9da0d66744ef03e379224c41eedf5b57"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "albanD"
-                    },
-                    "email": "albandes@fb.com",
-                    "name": "Alban Desmaison"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "d629e300705196d3ae0bac5ed983b197101fa2ee"
+                  "oid": "031ac49ae9c192989385986b6707fa781e3229e0"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "bigfootjon"
-                    },
-                    "email": "jonjanzen@fb.com",
-                    "name": "Jon Janzen"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "52754b9e515f378f8476ad44d75b0a692bad8cde"
+                  "oid": "9a6c3b00039c0c985c1c9cb59490012d1c0b38ba"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "samdow"
-                    },
-                    "email": "samdow@fb.com",
-                    "name": "samdow"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "128c3ad747093f4970329a82c7c4720420faeff2"
+                  "oid": "d5c30e408af1889b90012d2e09f6ec3cda333bcb"
                 }
               },
               {
                 "commit": {
                   "author": {
-                    "user": {
-                      "login": "arindamroy-eng"
-                    },
-                    "email": "61168652+arindamroy-eng@users.noreply.github.com",
-                    "name": "arindamroy-eng"
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
                   },
-                  "oid": "2a0bda7d32a5bcc9827f7254a7b77cceb16ba973"
+                  "oid": "db355d55655bb252a699cd532441bb98e52b98d5"
                 }
               }
             ],
             "pageInfo": {
-              "endCursor": "MTAw",
-              "hasNextPage": true
+              "endCursor": "MjY",
+              "hasNextPage": false
             },
-            "totalCount": 131
+            "totalCount": 26
           },
           "commits": {
             "nodes": [
@@ -26957,16 +11187,21 @@
                                 "name": "Facebook CLA Check",
                                 "conclusion": "SUCCESS",
                                 "detailsUrl": "https://code.intern.facebook.com/cla/"
+                              },
+                              {
+                                "name": "Meta Internal-Only Changes Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://opensource.facebook.com/"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNRg4=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6ux14=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRAI="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC2o="
                       },
                       {
                         "node": {
@@ -26984,7 +11219,7 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRBA="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC2w="
                       },
                       {
                         "node": {
@@ -27002,7 +11237,7 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRB0="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC3U="
                       },
                       {
                         "node": {
@@ -27020,7 +11255,7 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRC0="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC3o="
                       },
                       {
                         "node": {
@@ -27038,7 +11273,7 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsREE="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC34="
                       },
                       {
                         "node": {
@@ -27056,7 +11291,7 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRE4="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC4E="
                       },
                       {
                         "node": {
@@ -27068,24 +11303,24 @@
                             "workflow": {
                               "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2197192463"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2217622865"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
                                 "name": "run-torchbench",
                                 "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192463/jobs/3232430975"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622865/jobs/3270915028"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNR-Y=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-c8=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SKIPPED"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRsw="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDNo="
                       },
                       {
                         "node": {
@@ -27097,54 +11332,54 @@
                             "workflow": {
                               "name": "Lint"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2197192461"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2217622869"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "Test collect_env (with_torch)",
+                                "name": "quick-checks",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461134"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915027"
                               },
                               {
-                                "name": "Test collect_env (without_torch)",
+                                "name": "lintrunner",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461211"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915071"
                               },
                               {
-                                "name": "toc",
+                                "name": "Test tools",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461301"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915141"
                               },
                               {
-                                "name": "Test tools",
+                                "name": "Test collect_env (with_torch)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461386"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915194"
                               },
                               {
-                                "name": "quick-checks",
+                                "name": "Test collect_env (without_torch)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461521"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915229"
                               },
                               {
-                                "name": "lintrunner",
+                                "name": "toc",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461634"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915283"
                               },
                               {
                                 "name": "workflow-checks",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461717"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915321"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuN84s=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-zM=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRtE="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDOY="
                       },
                       {
                         "node": {
@@ -27155,270 +11390,270 @@
                           "workflowRun": {
                             "workflow": {
                               "name": "pull"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2197192471"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232460797"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232460951"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461088"
-                              },
-                              {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461294"
-                              },
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2217622878"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461410"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927344"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "name": "linux-bionic-rocm5.0-py3.7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461543"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927442"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "name": "linux-xenial-py3.7-clang7-onnx / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461628"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927507"
                               },
                               {
-                                "name": "linux-bionic-rocm5.0-py3.7 / build",
+                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461719"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927567"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461789"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927674"
                               },
                               {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "name": "win-vs2019-cuda11.3-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461869"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927727"
                               },
                               {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
+                                "name": "linux-bionic-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461946"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927802"
                               },
                               {
                                 "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462044"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927853"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462112"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927948"
                               },
                               {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462244"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927996"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "name": "linux-xenial-py3.7-clang7-asan / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462360"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928061"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462432"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928116"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / build",
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462521"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928198"
                               },
                               {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "name": "linux-xenial-py3.7-gcc5.4 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462621"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928256"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462683"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928291"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "name": "win-vs2019-cpu-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462738"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928317"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232545510"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928338"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232545571"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928367"
                               },
                               {
-                                "name": "linux-docs / build-docs (cpp)",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547522"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928410"
                               },
                               {
-                                "name": "linux-docs / build-docs (python)",
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547612"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928445"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547714"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991071"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547764"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991125"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547824"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991162"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547869"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991195"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547909"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991233"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547973"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991261"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "name": "linux-docs / build-docs (cpp)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553452"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991305"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991349"
                               },
                               {
                                 "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553558"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996024"
                               },
                               {
                                 "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553605"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996068"
                               },
                               {
                                 "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553650"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996092"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996505"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270998987"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270999027"
                               },
                               {
                                 "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232563716"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271006886"
                               },
                               {
                                 "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232563763"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271006941"
                               },
                               {
                                 "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582650"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018097"
                               },
                               {
                                 "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582703"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018135"
                               },
                               {
                                 "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582741"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018162"
                               },
                               {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232590204"
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271021143"
                               },
                               {
                                 "name": "linux-bionic-rocm5.0-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232608872"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271034041"
                               },
                               {
                                 "name": "linux-bionic-rocm5.0-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232608976"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271034072"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637097"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271048218"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637199"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049553"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637259"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049587"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232639932"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049616"
                               },
                               {
                                 "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232687012"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271068293"
                               },
                               {
                                 "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232687074"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271068336"
                               },
                               {
                                 "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232785088"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271149276"
                               },
                               {
                                 "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232785153"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271149321"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuVD9M=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6jVK8=",
                               "hasNextPage": true
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRuc="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDQA="
                       }
                     ],
                     "pageInfo": {
@@ -27426,383 +11661,609 @@
                     }
                   },
                   "status": null,
-                  "pushedDate": "2022-04-20T17:10:41Z",
-                  "oid": "5696e8357cf38f852ef3d680381513e26f202371"
+                  "pushedDate": "2022-04-25T02:30:31Z",
+                  "oid": "db355d55655bb252a699cd532441bb98e52b98d5"
                 }
               }
             ]
           },
-          "changedFiles": 348,
+          "changedFiles": 5,
           "files": {
             "nodes": [
               {
-                "path": ".circleci/cimodel/data/pytorch_build_data.py"
-              },
-              {
-                "path": ".circleci/cimodel/data/pytorch_build_definitions.py"
-              },
-              {
-                "path": ".circleci/scripts/cpp_doc_push_script.sh"
-              },
-              {
-                "path": ".circleci/scripts/python_doc_push_script.sh"
-              },
-              {
-                "path": ".github/actions/checkout-pytorch/action.yml"
-              },
-              {
-                "path": ".github/merge_rules.json"
-              },
-              {
-                "path": ".github/scripts/gitutils.py"
-              },
-              {
-                "path": ".github/scripts/gql_mocks.json"
-              },
-              {
-                "path": ".github/scripts/trymerge.py"
-              },
-              {
-                "path": ".github/workflows/_bazel-build-test.yml"
-              },
-              {
-                "path": ".github/workflows/_linux-build.yml"
+                "path": "test/test_ops.py"
               },
               {
-                "path": ".github/workflows/_linux-test.yml"
+                "path": "torch/_prims/__init__.py"
               },
               {
-                "path": ".github/workflows/_mac-test.yml"
+                "path": "torch/_prims/utils.py"
               },
               {
-                "path": ".github/workflows/_rocm-test.yml"
+                "path": "torch/_refs/__init__.py"
               },
               {
-                "path": ".github/workflows/_win-test.yml"
-              },
+                "path": "torch/testing/_internal/common_methods_invocations.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "NQ",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
               {
-                "path": ".github/workflows/buck_build_test.yml"
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": ".github/workflows/lint.yml"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": ".github/workflows/periodic.yml"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": ".github/workflows/pull.yml"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": ".github/workflows/trunk.yml"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": ".jenkins/pytorch/macos-test.sh"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": ".jenkins/pytorch/test.sh"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": ".jenkins/pytorch/win-test.sh"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": ".lintrunner.toml"
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "BUILD.bazel"
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "CODEOWNERS"
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "README.md"
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/BatchingRegistrations.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/Dispatch.h"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/ExpandUtils.h"
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/FunctionalInverses.cpp"
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/FunctionalStorageImpl.cpp"
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/FunctionalStorageImpl.h"
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/FunctionalTensorWrapper.cpp"
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/FunctionalTensorWrapper.h"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/FunctionalizeFallbackKernel.cpp"
+                "author": {
+                  "login": "zou3519"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/NestedTensorImpl.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/OpMathType.h"
+                "author": {
+                  "login": "peterbell10"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/SparseCsrTensorUtils.h"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/ThreadLocalState.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/ThreadLocalState.h"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/autocast_mode.cpp"
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/autocast_mode.h"
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/core/SymIntArrayRef.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/core/SymIntArrayRef.h"
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/core/TensorBase.h"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/core/dispatch/Dispatcher.h"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/core/interned_strings.h"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/core/ivalue.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/core/ivalue.h"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/core/ivalue_inl.h"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/core/jit_type.h"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/core/jit_type_base.h"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/core/type.cpp"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/cuda/CUDASparse.h"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/cuda/llvm_complex.cpp"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/cuda/llvm_jit_strings.h"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/Blas.cpp"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/Itertools.cpp"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/LinearAlgebra.cpp"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/SoftMax.cpp"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/TensorConversions.cpp"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/TensorShape.cpp"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/TensorShape.h"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/Unique.cpp"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/cuda/CUDAJitLoops.cuh"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/cuda/JitLoops.cuh"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/cuda/Lerp.cu"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/cuda/PersistentSoftmax.cuh"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/cuda/SoftMax.cu"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu"
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/cuda/Unique.cu"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/cuda/jit_utils.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/cuda/jit_utils.h"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/native_functions.yaml"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/nested/NestedTensorMath.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/quantized/cpu/qsoftmax.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/quantized/cudnn/Linear.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/quantized/cudnn/utils.h"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/sparse/SparseCsrTensor.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/native/ts_native_functions.yaml"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/record_function.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/record_function.h"
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/templates/Operators.h"
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/templates/RegisterFunctionalization.cpp"
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/test/basic.cpp"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "aten/src/ATen/test/vmap_test.cpp"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "binaries/record_function_benchmark.cc"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "c10/core/DispatchKey.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "c10/core/DispatchKey.h"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "c10/core/DispatchKeySet.h"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "c10/test/core/DispatchKeySet_test.cpp"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "c10/util/ArrayRef.h"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "caffe2/core/tensor.h"
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "APPROVED"
               },
               {
-                "path": "docs/source/conf.py"
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "docs/source/fx.rst"
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
               }
             ],
             "pageInfo": {
-              "endCursor": "MTAw",
-              "hasNextPage": true
-            }
-          },
-          "reviews": {
-            "nodes": [],
-            "pageInfo": {
-              "startCursor": null,
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0wNlQxMjo1NjoyNC0wNzowMLkyMDIyLTA0LTA2VDA4OjQwOjM4LTA3OjAwzjenO6Y=",
               "hasPreviousPage": false
             }
           },
           "comments": {
             "nodes": [
               {
-                "bodyText": "Merge failed due to Matched rule superuser, but it was not reviewed yet by any of:zou3519,abhikrish,mehtanirav,wconstab,lc0, ...",
-                "createdAt": "2022-04-20T17:26:18Z",
+                "bodyText": "Ref implementations by themselves can handle any shapes (and broadcast ops by themselves don't bake in any shapes). The question is can we decide if a particular trace is applicable for a different input, but that depends on the tracing technology and what we are caching on, so out of scope for initial PR.",
+                "createdAt": "2022-04-21T19:00:28Z",
                 "author": {
-                  "login": "pytorchmergebot"
+                  "login": "ngimel"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1104215370
+                "databaseId": 1105643418
               },
               {
-                "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet",
-                "createdAt": "2022-04-20T17:31:26Z",
+                "bodyText": "@pytorchbot merge this please",
+                "createdAt": "2022-04-25T04:42:29Z",
                 "author": {
-                  "login": "pytorchmergebot"
+                  "login": "mruberry"
                 },
-                "authorAssociation": "MEMBER",
+                "authorAssociation": "COLLABORATOR",
                 "editor": null,
-                "databaseId": 1104220908
+                "databaseId": 1108072887
               },
               {
-                "bodyText": "@pytorchbot merge this",
-                "createdAt": "2022-04-20T19:30:50Z",
+                "bodyText": "Merge failed due to 'mruberry'\nRaised by https://github.com/pytorch/pytorch/actions/runs/2218044244",
+                "createdAt": "2022-04-25T04:43:54Z",
                 "author": {
-                  "login": "malfet"
+                  "login": "pytorchmergebot"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1104378397
+                "databaseId": 1108073536
               },
               {
-                "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet\nRaised by https://github.com/pytorch/pytorch/actions/runs/2197877090",
-                "createdAt": "2022-04-20T19:32:10Z",
+                "bodyText": "@mruberry has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-04-25T04:51:11Z",
                 "author": {
-                  "login": "pytorchmergebot"
+                  "login": "facebook-github-bot"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1104379712
+                "databaseId": 1108075965
               },
               {
-                "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale. Feel free to remove the Stale label if you feel this was a mistake. If you are unable to remove the Stale label please contact a maintainer in order to do so. If you want the bot to never mark this PR stale again, add the no-stale label.Stale pull requests will automatically be closed after 30 days of inactivity.",
-                "createdAt": "2022-06-20T16:44:05Z",
+                "bodyText": "Hey @mruberry.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-04-25T09:57:56Z",
                 "author": {
                   "login": "github-actions"
                 },
                 "authorAssociation": "NONE",
                 "editor": null,
-                "databaseId": 1160658699
+                "databaseId": 1108351107
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOQdD9Sg==",
+              "startCursor": "Y3Vyc29yOnYyOpHOQebHmg==",
               "hasPreviousPage": true
             }
           },
@@ -27815,7 +12276,12 @@
               },
               {
                 "node": {
-                  "name": "Stale"
+                  "name": "topic: not user facing"
+                }
+              },
+              {
+                "node": {
+                  "name": "module: primTorch"
                 }
               }
             ]
@@ -27824,20 +12290,20 @@
       }
     }
   },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=76123 owner=pytorch": {
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=77700 owner=pytorch": {
     "data": {
       "repository": {
         "pullRequest": {
           "closed": true,
-          "isCrossRepository": true,
+          "isCrossRepository": false,
           "author": {
-            "login": "kumpera"
+            "login": "kit1980"
           },
-          "title": "Introduce distributed checkpoint with ShardedTensor.",
-          "body": "Co-authored-by: Wen Zhang <zhangwen@fb.com>\r\nCo-authored-by: Yifu Wang <yifu@fb.com>\r\n\r\n",
-          "headRefName": "st_checkpoint",
+          "title": "Move pull linux-docs job to Ubuntu 20.04",
+          "body": "",
+          "headRefName": "sdym/pull-xenial-focal-linux-docs",
           "headRepository": {
-            "nameWithOwner": "kumpera/pytorch"
+            "nameWithOwner": "pytorch/pytorch"
           },
           "baseRefName": "master",
           "baseRepository": {
@@ -27854,44 +12320,20 @@
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kumpera"
-                    },
-                    "email": "kumpera@fb.com",
-                    "name": "Rodrigo Kumpera"
-                  },
-                  "oid": "6bf248bc20a71f248064b795f38276326fe43aae"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kumpera"
-                    },
-                    "email": "kumpera@fb.com",
-                    "name": "Rodrigo Kumpera"
-                  },
-                  "oid": "10f84fb90bf02d7062e565ebf2c1da6352b64db7"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "kumpera"
+                      "login": "kit1980"
                     },
-                    "email": "kumpera@fb.com",
-                    "name": "Rodrigo Kumpera"
+                    "email": "sdym@fb.com",
+                    "name": "Sergii Dymchenko"
                   },
-                  "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747"
+                  "oid": "81261599614423baa17df72300b8e109677b6799"
                 }
               }
             ],
             "pageInfo": {
-              "endCursor": "Mw",
+              "endCursor": "MQ",
               "hasNextPage": false
             },
-            "totalCount": 3
+            "totalCount": 1
           },
           "commits": {
             "nodes": [
@@ -27902,26 +12344,175 @@
                       {
                         "node": {
                           "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.facebook.com/cla/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNmNqE=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuMI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Netlify",
+                            "databaseId": 13473
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuM4="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Azure Pipelines",
+                            "databaseId": 9426
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuNU="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Dependabot",
+                            "databaseId": 29110
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuOI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Codecov",
+                            "databaseId": 254
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuPI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "PyTorch Bot",
+                            "databaseId": 40112
                           },
                           "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuQQ="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2348867841"
+                          },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "Facebook CLA Check",
+                                "name": "lintrunner",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528127876"
+                              },
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128023"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128196"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128519"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128575"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128663"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128857"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS2l4=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdYVY=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSmtI="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuzA="
                       },
                       {
                         "node": {
@@ -27933,24 +12524,24 @@
                             "workflow": {
                               "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2273063614"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2348867843"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
                                 "name": "run-torchbench",
                                 "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063614/jobs/3379894109"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867843/jobs/3528127882"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2r3Q=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdXEg=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SKIPPED"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm0k="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuzg="
                       },
                       {
                         "node": {
@@ -27960,56 +12551,96 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "Lint"
+                              "name": "docker-builds"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2273063615"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2348867844"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "quick-checks",
+                                "name": "docker-build (pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894107"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528127883"
                               },
                               {
-                                "name": "toc",
+                                "name": "docker-build (pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894332"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528127945"
                               },
                               {
-                                "name": "lintrunner",
+                                "name": "docker-build (pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894444"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128001"
                               },
                               {
-                                "name": "Test collect_env (with_torch)",
+                                "name": "docker-build (pytorch-linux-bionic-py3.7-clang9)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894520"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128067"
                               },
                               {
-                                "name": "Test collect_env (without_torch)",
+                                "name": "docker-build (pytorch-linux-bionic-rocm5.0-py3.7)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894567"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128124"
                               },
                               {
-                                "name": "Test tools",
+                                "name": "docker-build (pytorch-linux-bionic-rocm5.1-py3.7)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894616"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128191"
                               },
                               {
-                                "name": "workflow-checks",
+                                "name": "docker-build (pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894672"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128259"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128321"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-py3-clang5-android-ndk-r19c)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128365"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-py3-clang5-asan)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128446"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-py3-clang7-asan)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128507"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-py3-clang7-onnx)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128563"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-py3.7-gcc5.4)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128639"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-py3.7-gcc7)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128687"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-focal-py3.7-gcc7)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128741"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2shU=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdYLI=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm0o="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduu0A="
                       },
                       {
                         "node": {
@@ -28021,269 +12652,1271 @@
                             "workflow": {
                               "name": "pull"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2273063632"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2348867849"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "name": "linux-bionic-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902301"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528150762"
                               },
                               {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "name": "linux-focal-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902363"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528150903"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902507"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151086"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151258"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902560"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151511"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / build",
+                                "name": "linux-bionic-rocm5.1-py3.7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902579"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151776"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902603"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151896"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "name": "linux-xenial-py3.7-gcc5.4 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902637"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152014"
                               },
                               {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "name": "linux-xenial-py3.7-clang7-onnx / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902685"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152139"
                               },
                               {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902740"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152216"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "name": "win-vs2019-cuda11.3-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902761"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152378"
                               },
                               {
                                 "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902794"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152516"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902874"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152599"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "name": "linux-xenial-py3.7-clang7-asan / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903006"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152723"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903111"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152802"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903193"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152913"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903284"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152969"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903357"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153005"
                               },
                               {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903446"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153062"
                               },
                               {
                                 "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903512"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153125"
                               },
                               {
-                                "name": "linux-bionic-rocm5.1-py3.7 / build",
+                                "name": "win-vs2019-cpu-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903546"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153207"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379944655"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528242483"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379944695"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528242528"
                               },
                               {
-                                "name": "linux-docs / build-docs (cpp)",
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946308"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245875"
                               },
                               {
-                                "name": "linux-docs / build-docs (python)",
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946337"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245914"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946359"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245964"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946391"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528246008"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946423"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528248520"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946453"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528255086"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946496"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528255128"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946529"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274064"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950041"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274097"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950137"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274133"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950165"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274173"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950192"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274209"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950646"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528277014"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379951202"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528308958"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379951230"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309747"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963877"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309810"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963928"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309837"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963976"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309864"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379964018"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309895"
                               },
                               {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379966372"
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309925"
                               },
                               {
                                 "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379996173"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528310044"
                               },
                               {
                                 "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379996218"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528310101"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379997861"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384337"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998374"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384379"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998397"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384408"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998422"
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384441"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384471"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNi1Nc=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduu1E="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  },
+                  "status": null,
+                  "pushedDate": "2022-05-19T00:02:11Z",
+                  "oid": "81261599614423baa17df72300b8e109677b6799"
+                }
+              }
+            ]
+          },
+          "changedFiles": 3,
+          "files": {
+            "nodes": [
+              {
+                "path": ".circleci/docker/build.sh"
+              },
+              {
+                "path": ".circleci/docker/common/install_katex.sh"
+              },
+              {
+                "path": ".github/workflows/pull.yml"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mw",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "suo"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kit1980"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "janeyx99"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNS0xOFQxMjo0MTowNS0wNzowMLkyMDIyLTA1LTE4VDEyOjQxOjA0LTA3OjAwzjpD7es=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/77700\n\ud83d\udcc4 \u00a0Preview Python docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\u2753Need help or want to give feedback on the CI? Visit our office hours\n\n\u2705 No Failures (0 Pending)\nAs of commit 8126159 (more details on the Dr. CI page):\nExpand to see more\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2022-05-17T23:01:48Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 1129400934
+              },
+              {
+                "bodyText": "@pytorchbot merge",
+                "createdAt": "2022-05-19T15:39:05Z",
+                "author": {
+                  "login": "kit1980"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1131884232
+              },
+              {
+                "bodyText": "Merge failed due to Refusing to merge as mandatory check(s) linux-docs / build-docs (cpp), linux-docs / build-docs (python) are pending/not yet run for rule OSS CI\nRaised by https://github.com/pytorch/pytorch/actions/runs/2353067846",
+                "createdAt": "2022-05-19T15:40:59Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1131886153
+              },
+              {
+                "bodyText": "@pytorchbot merge -f",
+                "createdAt": "2022-05-19T16:41:29Z",
+                "author": {
+                  "login": "kit1980"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1131945610
+              },
+              {
+                "bodyText": "Hey @kit1980.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-05-19T16:43:37Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1131947473
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOQ1FKZg==",
+              "hasPreviousPage": false
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "Merged"
+                }
+              },
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAYNi1Nc= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAYduu0A= name=pytorch number=77700 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "oid": "81261599614423baa17df72300b8e109677b6799",
+                  "checkSuites": {
+                    "nodes": [
+                      {
+                        "checkRuns": {
+                          "nodes": [
+                            {
+                              "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384494"
+                            },
+                            {
+                              "name": "linux-docs / build-docs (cpp)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528477548"
+                            },
+                            {
+                              "name": "linux-docs / build-docs (python)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528477578"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528728152"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528728187"
+                            }
+                          ],
+                          "pageInfo": {
+                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNqJcE=",
+                            "hasNextPage": false
+                          }
+                        }
+                      }
+                    ]
+                  }
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=68111 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": true,
+          "author": {
+            "login": "chunyuan-w"
+          },
+          "title": "Add JIT graph fuser for oneDNN Graph API (Preview4)",
+          "body": "## Description\r\nPreview4 PR of this [RFC](https://github.com/pytorch/pytorch/issues/49444).\r\n\r\nOn the basis of https://github.com/pytorch/pytorch/pull/50256, the below improvements are included:\r\n\r\n- The [preview4 release branch](https://github.com/oneapi-src/oneDNN/releases/tag/graph-v0.4.1) of the oneDNN Graph API is used\r\n- The fuser now works with the profiling graph executor. We have inserted type check nodes to guard the profiled tensor properties.\r\n\r\n### User API:\r\nThe optimization pass is disabled by default. Users could enable it by:\r\n```\r\ntorch.jit.enable_onednn_fusion(True)\r\n```\r\n\r\n### Performance:\r\n[pytorch/benchmark](https://github.com/pytorch/benchmark) tool is used to compare the performance:\r\n- SkyLake 8180 (1 socket of 28 cores):\r\n\r\n  ![image](https://user-images.githubusercontent.com/65992142/151162305-05e44425-a24e-4d5e-94e1-743b40b87a8c.png)\r\n\r\n- SkyLake 8180 (single thread):\r\n\r\n  ![image](https://user-images.githubusercontent.com/65992142/151162528-69f90b79-d08d-46b8-8775-d80a6ccbce8a.png)\r\n \\* By mapping hardswish to oneDNN Graph, it\u2019s 8% faster than PyTorch JIT (NNC + OFI)\r\n  \\** We expect performance gain after mapping transpose, contiguous & view to oneDNN graph ops\r\n\r\n\r\n### Directory structure of the integration code\r\nFuser-related code are placed under:\r\n```\r\ntorch/csrc/jit/codegen/onednn/\r\n```\r\n\r\nOptimization pass registration is done in:\r\n```\r\ntorch/csrc/jit/passes/onednn_graph_fuser.h\r\n```\r\n\r\nCMake for the integration code is:\r\n```\r\ncaffe2/CMakeLists.txt\r\n```\r\n\r\n## Limitations\r\n\r\n- In this PR, we have only supported the optimization on Linux platform. The support on Windows and MacOS will be enabled as the next step.\r\n- We have only optimized the inference use case.",
+          "headRefName": "chunyuan/llga_preview2",
+          "headRepository": {
+            "nameWithOwner": "chunyuan-w/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "0096fcc49f277fd8e006fcb42e0cb28a1422ec98"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "7bcc4de26a5472f1d252735dd425b46794b0844f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "3a2a588bfe6bbf9bf74d88d441cd22affda207da"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "ca7df12fbfaa3ddbabeca39b76300d17f4a33f2f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "81d44f35b8bc043c38837d0694e5bc072203b832"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "14fd5d1bfc2c58a71379f778871e3fca0a8e79b2"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "954dc23663125897f4b199eb2a8607dc5fca3274"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "9f77a0b476accc678b6f0569e4ff33fa6bbe97fc"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "fbf3b23bc1288697e1aec539a7c4ee3dc0bcb84c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "f8b8e78f786586c3cdf3966fd83ffa124d3eda70"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "6fffa2f7453ee7e0f8d8e2f73ea8a65230539589"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "849385404e6f3cd1cf7cef19f931ecf4fa28afdb"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "adbae7b77f8c0dbc59fccf15207d97ba86cfade2"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "6dcf2a4981aff24fa16fc7461ae4ec29690f956f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "54f3e05ad524cffd0911ee93be3c50f589b51f58"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "edbfc640ea79a0af85757d9e73796dcc90231519"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "67654db7cba562809d1b4a44cdda58af5cc9daaf"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "9c9d99b930b11af9ff03f52d45bf49c652df758d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "ffb25119cd9ce815cc4d9d14a2317fcbbfa9ea86"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "ab9eee84512ca1bdfbc81e25c6eb67b29d0f302a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "62a4642cf3330524990a69ac29e002c97812320a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "ca9b1223be4af2c8b4929303d498eafd71793128"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "6f4a23d24514a02954d2ec792830085f612223c9"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "b2a9a9c0926b02d0b2e87722ed61450f224a61d0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "e88b492be733f24b6aa395829c76add67d0901e7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "c44336d7a914952bfb78e012e08d9a6d6dde5937"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "5157930f7b3921d41a586260582b574c915f6ca1"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "04cb8353813f6bbd0d913a994923cc7e1e291406"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "62991eaad0e638bb0bced327e03f932f66f68732"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "7496bf1588050191595d833d23b8972b2f22655e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "d9d35f23cca0cd29c78a845731b24826152dcf1c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "f74ec134f18a65a7c72455bdf44f72e3ebb27105"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "eb32cc65a975361160948bfc3d6a577991ea262e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "c7665f8d695b680c54db0bad2b7b7df46d886b50"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "e6321ad8f59ea01130568c202d186448bb9cb9d0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "a72cd0d02693f45e5354a70654581ad514581ec7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "b3cd3028b4ed31805e82f7eaf02217ab74ca59b9"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "49a592d9788d08e6cd0593882f867e129057c1cc"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "0575766b2144b13f6a38227c4e2b8d22ec8db80f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "b5c9b10ff87d622350e8ca64fae3a476eb70d5aa"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "66bc652a30ccc329adb929870a4ac726bb98b38c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "72b9ca9c8e2dac98cbb7199b3dfac7c7305b80c5"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "a7892ed7373207d96406c8b5734a089643c5cdbd"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "d54cb084e1daad8a08c3f8de0ad3f7afb5b05ac1"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "aef71d692a8a159e0ca56be363e2cc1225ce7647"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "bf618e205ec31cff962dcc8ab478e0a699a9572d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "e4a331f1088448f7d7d86256ce71e0e71da006b0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "0b743523d1430fec759d5fefbb687f17c89335a5"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "e80a351a62d98b810ec8985c4b25257af1d6c5bb"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "c189eca154b6691919d0e21489d1c322c7435c0b"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "e080a067c75d7b888a8a362682a2d5ba70e0c3a8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "028561fbf8f3ed90e074e6e0e3a4ca4dd7ffa2a8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "d550cf14037badd4caa2f52202e2f20bc4db8432"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "574159ebadd1dec24daaf883879ffeca8d9e71b7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "9eb3ee98ea756067ed1c8f52f309f6d3e211a904"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "29929f48be03dcdd1bbfade572de7feafa825547"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "8a7358ca8da547b40ea1a99ddc57ebed19959684"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "6606637d2c5525b43e294a8b366a85052e1be0c6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "5ecfd1f28b87045deb8bc8ffe33b3d8b906f3264"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "be2d4345c65442c4cfbe8afdfb2ae0893945da42"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "b5b89d3644a43e2dbda841cafb71b32edbe07c8a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nikita.shulga@gmail.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "73881411e2bfb3aaa2e89926a82390b4c587ad75"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "NjI",
+              "hasNextPage": false
+            },
+            "totalCount": 62
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998441"
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "name": "Meta Internal-Only Changes Check",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3380042106"
+                                "detailsUrl": "https://opensource.facebook.com/"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd5yuY=",
-                              "hasNextPage": true
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NXnc=",
+                              "hasNextPage": false
                             }
                           },
-                          "conclusion": "FAILURE"
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm14="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYwzI="
                       },
                       {
                         "node": {
@@ -28295,54 +13928,79 @@
                             "workflow": {
                               "name": "Lint"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2276796859"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2018440028"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "lintrunner",
+                                "name": "clang-format",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419477"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895825"
+                              },
+                              {
+                                "name": "py2-setup-validate-errormsg",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895911"
                               },
                               {
                                 "name": "quick-checks",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419699"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895963"
                               },
                               {
-                                "name": "Test collect_env (with_torch)",
+                                "name": "shellcheck",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419923"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896134"
                               },
                               {
-                                "name": "Test collect_env (without_torch)",
+                                "name": "toc",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419992"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896253"
                               },
                               {
-                                "name": "Test tools",
+                                "name": "clang-tidy",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420129"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896371"
                               },
                               {
-                                "name": "workflow-checks",
+                                "name": "cmakelint",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420208"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896525"
                               },
                               {
-                                "name": "toc",
+                                "name": "flake8-py3",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420309"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896658"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896771"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896795"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896838"
+                              },
+                              {
+                                "name": "mypy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896897"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS3SE=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NZqw=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNGg="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxPI="
                       },
                       {
                         "node": {
@@ -28354,24 +14012,24 @@
                             "workflow": {
                               "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2276796862"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2018440031"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
                                 "name": "run-torchbench",
                                 "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796862/jobs/3387419465"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440031/jobs/2903895828"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS1-o=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NYIw=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SKIPPED"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNIc="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxPc="
                       },
                       {
                         "node": {
@@ -28383,1233 +14041,1858 @@
                             "workflow": {
                               "name": "pull"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2276796865"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2018440039"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "linux-bionic-rocm5.1-py3.7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387419999"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420164"
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896014"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420316"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896165"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420477"
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896394"
                               },
                               {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
+                                "name": "linux-bionic-rocm4.5-py3.7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420675"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896572"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "name": "linux-xenial-py3.7-clang7-asan / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420934"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896666"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "name": "linux-xenial-py3.7-clang7-onnx / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421278"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896778"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "name": "linux-bionic-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421672"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896837"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421888"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896896"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "name": "linux-xenial-py3.7-gcc5.4 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421982"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896936"
                               },
                               {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422191"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897025"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422303"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897161"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "name": "linux-xenial-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422476"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897213"
                               },
                               {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422715"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897280"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "name": "win-vs2019-cpu-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422963"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897368"
                               },
                               {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "name": "win-vs2019-cuda11.3-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423092"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897431"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423234"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897476"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / build",
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423421"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897578"
                               },
                               {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423622"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897630"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423739"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897699"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387545789"
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897733"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-docs / build-docs (cpp)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387546032"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327787"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-docs / build-docs (python)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387546119"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327838"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553028"
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327956"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553144"
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327997"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553251"
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328035"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553438"
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328093"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
                                 "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553556"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328131"
                               },
                               {
                                 "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553668"
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328177"
                               },
                               {
-                                "name": "linux-docs / build-docs (cpp)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387554002"
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904333962"
                               },
                               {
-                                "name": "linux-docs / build-docs (python)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387554098"
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904334006"
                               },
                               {
                                 "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387558927"
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430419"
                               },
                               {
                                 "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559016"
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430459"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559071"
+                                "name": "linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430508"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559139"
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430573"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387563803"
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443663"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387563894"
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443723"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580868"
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443787"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580936"
+                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904454239"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580993"
+                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904454303"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387581053"
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904554602"
                               },
                               {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387592286"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904554698"
                               },
                               {
-                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387631950"
+                                "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588855"
                               },
                               {
-                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387632035"
+                                "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588886"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387649916"
+                                "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588924"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387649974"
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904655702"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650084"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656104"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650151"
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656150"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650373"
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656192"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387753429"
+                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904706520"
+                              },
+                              {
+                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904706565"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgaCXo=",
-                              "hasNextPage": true
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_fN1g=",
+                              "hasNextPage": false
                             }
                           },
                           "conclusion": "FAILURE"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNKQ="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxQs="
                       }
                     ],
                     "pageInfo": {
                       "hasNextPage": false
                     }
                   },
-                  "status": null,
-                  "pushedDate": "2022-05-05T00:34:26Z",
-                  "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747"
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048428?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048429?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048431?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048430?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      }
+                    ]
+                  },
+                  "pushedDate": "2022-03-21T19:58:52Z",
+                  "oid": "73881411e2bfb3aaa2e89926a82390b4c587ad75"
                 }
               }
-            ]
+            ]
+          },
+          "changedFiles": 37,
+          "files": {
+            "nodes": [
+              {
+                "path": "aten/src/ATen/core/interned_strings.h"
+              },
+              {
+                "path": "caffe2/CMakeLists.txt"
+              },
+              {
+                "path": "cmake/Dependencies.cmake"
+              },
+              {
+                "path": "cmake/Modules/FindMKLDNN.cmake"
+              },
+              {
+                "path": "cmake/public/mkldnn.cmake"
+              },
+              {
+                "path": "docs/source/jit.rst"
+              },
+              {
+                "path": "test/test_jit_llga_fuser.py"
+              },
+              {
+                "path": "torch/_C/__init__.pyi.in"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/README.md"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/defer_size_check.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/defer_size_check.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/graph_fuser.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/graph_fuser.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/graph_helper.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/graph_helper.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/graph_rewriter.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/guard_shape.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/guard_shape.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/interface.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/interface.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/kernel.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/kernel.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/layout_propagation.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/layout_propagation.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/operator.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/prepare_binary.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/prepare_binary.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/register_interface.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/ir/alias_analysis.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/ir/ir.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/passes/inline_autodiff_subgraphs.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/passes/onednn_graph_fuser.h"
+              },
+              {
+                "path": "torch/csrc/jit/python/init.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/runtime/operator.cpp"
+              },
+              {
+                "path": "torch/jit/__init__.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mzc",
+              "hasNextPage": false
+            }
           },
-          "changedFiles": 11,
-          "files": {
+          "reviews": {
             "nodes": [
               {
-                "path": "test/distributed/_shard/checkpoint/test_checkpoint.py"
+                "author": {
+                  "login": "pinzhenx"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pinzhenx"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pinzhenx"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "chunyuan-w"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "eellison"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "test/distributed/_shard/checkpoint/test_file_system_checkpoint.py"
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "test/distributed/_shard/sharded_tensor/test_sharded_tensor.py"
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "torch/distributed/_shard/checkpoint/__init__.py"
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "torch/distributed/_shard/checkpoint/filesystem.py"
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "torch/distributed/_shard/checkpoint/metadata.py"
+                "author": {
+                  "login": "wukong1992"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "torch/distributed/_shard/checkpoint/resharding.py"
+                "author": {
+                  "login": "eellison"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "torch/distributed/_shard/checkpoint/state_dict_loader.py"
+                "author": {
+                  "login": "eellison"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "torch/distributed/_shard/checkpoint/state_dict_saver.py"
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
               },
               {
-                "path": "torch/distributed/_shard/checkpoint/storage.py"
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
               },
-              {
-                "path": "torch/testing/_internal/distributed/_shard/sharded_tensor/_test_st_common.py"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MTE",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "eellison"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "zzzwen"
+                  "login": "sanchitintel"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "zzzwen"
+                  "login": "sanchitintel"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "eellison"
                 },
-                "state": "COMMENTED"
+                "state": "APPROVED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "eellison"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "malfet"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "malfet"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "malfet"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "wanchaol"
+                  "login": "sanchitintel"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
                 "state": "COMMENTED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMS0xMi0xMFQwOToyNDoxOS0wODowMLkyMDIxLTEyLTEwVDA5OjI0OjE5LTA4OjAwzjFryLE=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.",
+                "createdAt": "2022-03-21T22:51:38Z",
+                "author": {
+                  "login": "suo"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1074498483
               },
               {
+                "bodyText": "@pytorchbot revert this",
+                "createdAt": "2022-03-21T22:51:44Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "suo"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1074498550
               },
               {
+                "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.\n\nOops! Will fix it ASAP.",
+                "createdAt": "2022-03-21T22:53:34Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1074499668
+              },
+              {
+                "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).",
+                "createdAt": "2022-03-21T23:07:23Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1074508608
               },
               {
+                "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).",
+                "createdAt": "2022-03-30T00:53:50Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "facebook-github-bot"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1082508130
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOQAuLsw==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "oncall: jit"
+                }
+              },
+              {
+                "node": {
+                  "name": "triaged"
+                }
+              },
+              {
+                "node": {
+                  "name": "open source"
+                }
+              },
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              },
+              {
+                "node": {
+                  "name": "Reverted"
+                }
+              },
+              {
+                "node": {
+                  "name": "intel priority"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOQAuLsw== name=pytorch number=68111 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/chunyuan-w/pytorch/blob/7496bf1588050191595d833d23b8972b2f22655e/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/trunk\n\u2705 triggered\n\n\nlinux-docs\nciflow/all, ciflow/cpu, ciflow/default, ciflow/docs, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-vulkan-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk, ciflow/vulkan\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-build\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-custom-build-static\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-onnx\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/onnx, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7-no-ops\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\ncaffe2-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\ndocker-builds\nciflow/all, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-custom-ops\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-full-jit\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-metal\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-full-jit\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda10.2-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-binary-conda\nciflow/binaries, ciflow/binaries/conda\n\ud83d\udeab skipped\n\n\nlinux-binary-libtorch-cxx11-abi\nciflow/binaries, ciflow/binaries/libtorch\n\ud83d\udeab skipped\n\n\nlinux-binary-libtorch-pre-cxx11\nciflow/binaries, ciflow/binaries/libtorch\n\ud83d\udeab skipped\n\n\nlinux-binary-manywheel\nciflow/binaries, ciflow/binaries/wheel\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-docs-push\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-no-ops\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-arm64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-lite-interpreter-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-11-py3-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled, ciflow/slow, ciflow/slow-gradcheck\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.1-py3.7-gcc7-debug\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.1-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.5-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-build\nciflow/all, ciflow/android, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\n\n\nYou can add a comment to the PR and tag @pytorchbot with the following commands:\n\n# ciflow rerun, \"ciflow/default\" will always be added automatically\n@pytorchbot ciflow rerun\n\n# ciflow rerun with additional labels \"-l <ciflow/label_name>\", which is equivalent to adding these labels manually and trigger the rerun\n@pytorchbot ciflow rerun -l ciflow/scheduled -l ciflow/slow\n\nFor more information, please take a look at the CI Flow Wiki.",
+                "createdAt": "2021-11-10T08:42:49Z",
+                "author": {
+                  "login": "pytorch-probot"
+                },
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "pytorch-probot"
+                },
+                "databaseId": 964902865
               },
               {
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/68111\nNeed help or want to give feedback on the CI? Visit our office hours\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 7388141 (more details on the Dr. CI page):\n\n\n29/29 failures introduced in this PR\n\n\n\ud83d\udd75\ufe0f 29 new failures recognized by patterns\nThe following CI failures do not appear to be due to upstream breakages:\n pull / linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge) (1/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:31:38.6978776Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:31:38.3001628Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:31:38.5169168Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:31:38.5362923Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:31:38.5413452Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:31:38.5458747Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:31:38.5484014Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:31:38.5497924Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:31:38.5656491Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:31:38.5678893Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:31:38.6888479Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0f6488c20adb4dca4\n2022-03-21T21:31:38.6978776Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:31:38.6992648Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:31:38.7003010Z ##[error]Process completed with exit code 2.\n2022-03-21T21:31:38.7044027Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:31:38.7044261Z with:\n2022-03-21T21:31:38.7044413Z env:\n2022-03-21T21:31:38.7044565Z   IN_CI: 1\n2022-03-21T21:31:38.7044709Z   IS_GHA: 1\n2022-03-21T21:31:38.7044885Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:31:38.7045067Z ##[endgroup]\n2022-03-21T21:31:38.7060958Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge) (2/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:35:19.2635222Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:35:18.9028722Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:35:19.1132721Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:35:19.1310590Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:35:19.1360251Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:35:19.1386865Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:35:19.1429182Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:35:19.1441925Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:35:19.1468280Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:35:19.1617667Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:35:19.2545368Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-098be2985e0392130\n2022-03-21T21:35:19.2635222Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:35:19.2648463Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:35:19.2658727Z ##[error]Process completed with exit code 2.\n2022-03-21T21:35:19.2706355Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:35:19.2706591Z with:\n2022-03-21T21:35:19.2706748Z env:\n2022-03-21T21:35:19.2706908Z   IN_CI: 1\n2022-03-21T21:35:19.2707061Z   IS_GHA: 1\n2022-03-21T21:35:19.2707246Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:35:19.2707438Z ##[endgroup]\n2022-03-21T21:35:19.2724554Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge) (3/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:11:57.5531419Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:11:52.7662022Z   Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T23:11:53.1213298Z      ---------------------------------------- 8.1/8.1 MB 23.6 MB/s eta 0:00:00\n2022-03-21T23:11:53.1644665Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:11:53.2218699Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T23:11:53.2389674Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T23:11:53.2787295Z      -------------------------------------- 247.7/247.7 KB 7.4 MB/s eta 0:00:00\n2022-03-21T23:11:53.3761842Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:11:53.5457622Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T23:11:57.4175080Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T23:11:57.5296815Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0105d4db093574f40\n2022-03-21T23:11:57.5531419Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:11:57.5564814Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:11:57.5587712Z ##[error]Process completed with exit code 2.\n2022-03-21T23:11:57.5790311Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T23:11:57.5790832Z with:\n2022-03-21T23:11:57.5791104Z env:\n2022-03-21T23:11:57.5791358Z   IN_CI: 1\n2022-03-21T23:11:57.5791620Z   IS_GHA: 1\n2022-03-21T23:11:57.5791939Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T23:11:57.5792425Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T23:11:57.5792884Z ##[endgroup]\n\n\n pull / linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu) (4/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T02:17:12.6257577Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T02:17:11.9280556Z   Using cached https://files.pythonhosted.org/packages/7b/9c/f51775ebe7df5a7aa4e7c79ed671bde94e154bd968aca8d65bb24aba0c8c/s3transfer-0.5.2-py3-none-any.whl\n2022-03-22T02:17:11.9335199Z Collecting urllib3<1.27,>=1.25.4 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:11.9682045Z   Using cached https://files.pythonhosted.org/packages/ec/03/062e6444ce4baf1eac17a6a0ebfe36bb1ad05e1df0e20b110de59c278498/urllib3-1.26.9-py2.py3-none-any.whl\n2022-03-22T02:17:11.9850357Z Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:12.0403171Z   Using cached https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl\n2022-03-22T02:17:12.0468875Z Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:12.0590000Z   Using cached https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl\n2022-03-22T02:17:12.0607093Z Installing collected packages: jmespath, urllib3, six, python-dateutil, botocore, s3transfer, boto3\n2022-03-22T02:17:12.5273459Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2 six-1.16.0 urllib3-1.26.9\n2022-03-22T02:17:12.6032812Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 worker-rocm-amd-114\n2022-03-22T02:17:12.6257577Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T02:17:12.6259543Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T02:17:12.6291924Z ##[error]Process completed with exit code 2.\n2022-03-22T02:17:12.6387977Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T02:17:12.6388298Z with:\n2022-03-22T02:17:12.6388521Z   wait-ssh: false\n2022-03-22T02:17:12.6388727Z env:\n2022-03-22T02:17:12.6388932Z   IN_CI: 1\n2022-03-22T02:17:12.6389143Z   IS_GHA: 1\n2022-03-22T02:17:12.6389368Z   GIT_DEFAULT_BRANCH: master\n2022-03-22T02:17:12.6389669Z   DOCKER_HOST: unix:///run/user/1121/docker.sock\n\n\n pull / linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge) (5/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:19:24.4890693Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:19:24.0962005Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:19:24.3152253Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:19:24.3341183Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:19:24.3391374Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:19:24.3436392Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:19:24.3448982Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:19:24.3474092Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:19:24.3502003Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:19:24.3655072Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:19:24.4799309Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0bc9250521f338cae\n2022-03-21T22:19:24.4890693Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:19:24.4903625Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:19:24.4913841Z ##[error]Process completed with exit code 2.\n2022-03-21T22:19:24.4957338Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:19:24.4957575Z with:\n2022-03-21T22:19:24.4957735Z env:\n2022-03-21T22:19:24.4957900Z   IN_CI: 1\n2022-03-21T22:19:24.4958055Z   IS_GHA: 1\n2022-03-21T22:19:24.4958246Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:19:24.4958437Z ##[endgroup]\n2022-03-21T22:19:24.4989649Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu) (6/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T01:05:07.6983899Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T01:05:06.8364546Z   Using cached https://files.pythonhosted.org/packages/7b/9c/f51775ebe7df5a7aa4e7c79ed671bde94e154bd968aca8d65bb24aba0c8c/s3transfer-0.5.2-py3-none-any.whl\n2022-03-22T01:05:06.8431763Z Collecting urllib3<1.27,>=1.25.4 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:06.8949391Z   Using cached https://files.pythonhosted.org/packages/ec/03/062e6444ce4baf1eac17a6a0ebfe36bb1ad05e1df0e20b110de59c278498/urllib3-1.26.9-py2.py3-none-any.whl\n2022-03-22T01:05:06.9180079Z Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:06.9803351Z   Using cached https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl\n2022-03-22T01:05:06.9882133Z Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:07.0067062Z   Using cached https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl\n2022-03-22T01:05:07.0088676Z Installing collected packages: urllib3, jmespath, six, python-dateutil, botocore, s3transfer, boto3\n2022-03-22T01:05:07.5819667Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2 six-1.16.0 urllib3-1.26.9\n2022-03-22T01:05:07.6774717Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 worker-rocm-amd-60\n2022-03-22T01:05:07.6983899Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T01:05:07.6988652Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T01:05:07.7023073Z ##[error]Process completed with exit code 2.\n2022-03-22T01:05:07.7102087Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T01:05:07.7102389Z with:\n2022-03-22T01:05:07.7102603Z   wait-ssh: false\n2022-03-22T01:05:07.7102820Z env:\n2022-03-22T01:05:07.7103015Z   IN_CI: 1\n2022-03-22T01:05:07.7103224Z   IS_GHA: 1\n2022-03-22T01:05:07.7103458Z   GIT_DEFAULT_BRANCH: master\n2022-03-22T01:05:07.7103737Z   DOCKER_HOST: unix:///run/user/1502/docker.sock\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge) (7/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:51:39.3637996Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:51:39.2041249Z   Attempting uninstall: s3transfer\n2022-03-21T20:51:39.2043010Z     Found existing installation: s3transfer 0.3.7\n2022-03-21T20:51:39.2083799Z     Uninstalling s3transfer-0.3.7:\n2022-03-21T20:51:39.2089675Z       Successfully uninstalled s3transfer-0.3.7\n2022-03-21T20:51:39.2480546Z   Attempting uninstall: boto3\n2022-03-21T20:51:39.2482953Z     Found existing installation: boto3 1.16.34\n2022-03-21T20:51:39.2584292Z     Uninstalling boto3-1.16.34:\n2022-03-21T20:51:39.2599474Z       Successfully uninstalled boto3-1.16.34\n2022-03-21T20:51:39.3130921Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T20:51:39.3550598Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-03ef7efc3078e3da5\n2022-03-21T20:51:39.3637996Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:51:39.3650651Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:51:39.3660484Z ##[error]Process completed with exit code 2.\n2022-03-21T20:51:39.3696465Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:51:39.3696693Z with:\n2022-03-21T20:51:39.3696850Z env:\n2022-03-21T20:51:39.3697012Z   IN_CI: 1\n2022-03-21T20:51:39.3697161Z   IS_GHA: 1\n2022-03-21T20:51:39.3697342Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T20:51:39.3697528Z ##[endgroup]\n2022-03-21T20:51:39.3730420Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge) (8/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:03:36.3916860Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:03:36.0096309Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:03:36.2278560Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:03:36.2461618Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:03:36.2513260Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:03:36.2541524Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:03:36.2554899Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:03:36.2598277Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:03:36.2758299Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:03:36.2780690Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:03:36.3825021Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0a4a552890e6ef7d3\n2022-03-21T21:03:36.3916860Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:03:36.3930343Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:03:36.3941263Z ##[error]Process completed with exit code 2.\n2022-03-21T21:03:36.3979258Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:03:36.3979496Z with:\n2022-03-21T21:03:36.3979654Z env:\n2022-03-21T21:03:36.3979814Z   IN_CI: 1\n2022-03-21T21:03:36.3979968Z   IS_GHA: 1\n2022-03-21T21:03:36.3980157Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:03:36.3980360Z ##[endgroup]\n2022-03-21T21:03:36.3996257Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu) (9/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T00:41:15.5325784Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T00:41:10.3015614Z   Downloading s3transfer-0.5.2-py3-none-any.whl (79 kB)\n2022-03-22T00:41:10.3625659Z      ---------------------------------------- 79.5/79.5 KB 1.1 MB/s eta 0:00:00\n2022-03-22T00:41:10.4120236Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-22T00:41:10.4170155Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-22T00:41:10.4722115Z      -------------------------------------- 247.7/247.7 KB 5.2 MB/s eta 0:00:00\n2022-03-22T00:41:10.4843512Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-22T00:41:10.6596108Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-22T00:41:10.8733354Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-22T00:41:15.3745408Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-22T00:41:15.4987162Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-09cacc848abc3dd32\n2022-03-22T00:41:15.5325784Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T00:41:15.5373630Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T00:41:15.5404353Z ##[error]Process completed with exit code 2.\n2022-03-22T00:41:15.5790508Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-22T00:41:15.5791192Z with:\n2022-03-22T00:41:15.5791530Z env:\n2022-03-22T00:41:15.5791849Z   IN_CI: 1\n2022-03-22T00:41:15.5792186Z   IS_GHA: 1\n2022-03-22T00:41:15.5792599Z   GIT_DEFAULT_BRANCH: master\n2022-03-22T00:41:15.5793237Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-22T00:41:15.5793831Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge) (10/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:50:32.9799307Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:50:32.8167560Z   Attempting uninstall: s3transfer\n2022-03-21T20:50:32.8169351Z     Found existing installation: s3transfer 0.3.7\n2022-03-21T20:50:32.8213295Z     Uninstalling s3transfer-0.3.7:\n2022-03-21T20:50:32.8219209Z       Successfully uninstalled s3transfer-0.3.7\n2022-03-21T20:50:32.8602320Z   Attempting uninstall: boto3\n2022-03-21T20:50:32.8603289Z     Found existing installation: boto3 1.16.34\n2022-03-21T20:50:32.8704535Z     Uninstalling boto3-1.16.34:\n2022-03-21T20:50:32.8719403Z       Successfully uninstalled boto3-1.16.34\n2022-03-21T20:50:32.9244278Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T20:50:32.9710449Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0c568461a276d4a71\n2022-03-21T20:50:32.9799307Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:50:32.9812238Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:50:32.9823052Z ##[error]Process completed with exit code 2.\n2022-03-21T20:50:32.9859290Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:50:32.9859527Z with:\n2022-03-21T20:50:32.9859664Z env:\n2022-03-21T20:50:32.9859817Z   IN_CI: 1\n2022-03-21T20:50:32.9859977Z   IS_GHA: 1\n2022-03-21T20:50:32.9860144Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T20:50:32.9860327Z ##[endgroup]\n2022-03-21T20:50:32.9893642Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge) (11/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:05:00.7163042Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:05:00.6660824Z     #10 0x55fc8a3ea801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:05:00.6661768Z     #11 0x55fc8a3f57a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:05:00.6662455Z     #12 0x55fc8a3f580b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:05:00.6663570Z     #13 0x55fc8a3f5908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:05:00.6663952Z     #14 0x55fc8a3f5908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:05:00.6664431Z     #15 0x55fc8a3f5908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:05:00.6665304Z     #16 0x55fc8a3f5ccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:05:00.7162113Z     #17 0x7f940d00f83f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:05:00.7162534Z     #18 0x55fc8a39a554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:05:00.7162711Z \n2022-03-21T21:05:00.7163042Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:05:00.7334595Z + retcode=1\n2022-03-21T21:05:00.7334954Z + set -e\n2022-03-21T21:05:00.7335215Z + return 1\n2022-03-21T21:05:00.7338688Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:05:00.7339232Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:05:00.7340113Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:05:00.7340612Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:05:00.7341187Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:05:00.7341668Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:05:00.7344466Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge) (12/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:06:03.4437430Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:06:03.0752199Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:06:03.2853252Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:06:03.3032326Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:06:03.3081589Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:06:03.3093911Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:06:03.3120244Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:06:03.3162406Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:06:03.3188431Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:06:03.3337181Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:06:03.4348072Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0ee48c8811fafc444\n2022-03-21T22:06:03.4437430Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:06:03.4450920Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:06:03.4461263Z ##[error]Process completed with exit code 2.\n2022-03-21T22:06:03.4502346Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:06:03.4502576Z with:\n2022-03-21T22:06:03.4502730Z env:\n2022-03-21T22:06:03.4502888Z   IN_CI: 1\n2022-03-21T22:06:03.4503038Z   IS_GHA: 1\n2022-03-21T22:06:03.4503302Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:06:03.4503492Z ##[endgroup]\n2022-03-21T22:06:03.4519156Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge) (13/29)\nStep: \"Test\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:50:13.2205634Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:50:12.8679322Z + python3 -m pip install boto3==1.19.12\n2022-03-21T20:50:13.0744228Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T20:50:13.0916284Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T20:50:13.0964264Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T20:50:13.1005656Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T20:50:13.1017299Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T20:50:13.1041042Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T20:50:13.1189450Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T20:50:13.1208751Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T20:50:13.2119445Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d02da60fd18c22f5\n2022-03-21T20:50:13.2205634Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:50:13.2217939Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:50:13.2220259Z ##[error]Process completed with exit code 2.\n2022-03-21T20:50:13.2248664Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:50:13.2249012Z with:\n2022-03-21T20:50:13.2249260Z env:\n2022-03-21T20:50:13.2249500Z   IN_CI: 1\n2022-03-21T20:50:13.2249738Z   IS_GHA: 1\n2022-03-21T20:50:13.2250025Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T20:50:13.2250329Z ##[endgroup]\n2022-03-21T20:50:13.2272735Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu) (14/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:47:38.0451999Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:47:37.5554508Z + python3 -m pip install boto3==1.19.12\n2022-03-21T23:47:37.8411473Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T23:47:37.8631484Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T23:47:37.8699561Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T23:47:37.8737037Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T23:47:37.8754443Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T23:47:37.8814393Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T23:47:37.8849540Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:47:37.9059579Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:47:38.0336298Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0b44f47f4292089a2\n2022-03-21T23:47:38.0451999Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:47:38.0469471Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:47:38.0484106Z ##[error]Process completed with exit code 2.\n2022-03-21T23:47:38.0532678Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T23:47:38.0533007Z with:\n2022-03-21T23:47:38.0533223Z env:\n2022-03-21T23:47:38.0533440Z   IN_CI: 1\n2022-03-21T23:47:38.0533649Z   IS_GHA: 1\n2022-03-21T23:47:38.0533902Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T23:47:38.0534170Z   GPU_FLAG: --gpus all\n2022-03-21T23:47:38.0534401Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge) (15/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:04:59.3115800Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:04:59.2595213Z     #10 0x55a7f39a4801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:04:59.2595707Z     #11 0x55a7f39af7a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:04:59.2597203Z     #12 0x55a7f39af80b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:04:59.2598205Z     #13 0x55a7f39af908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:04:59.2598697Z     #14 0x55a7f39af908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:04:59.2599178Z     #15 0x55a7f39af908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:04:59.2599747Z     #16 0x55a7f39afccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:04:59.3114751Z     #17 0x7f3b3822383f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:04:59.3115277Z     #18 0x55a7f3954554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:04:59.3115468Z \n2022-03-21T21:04:59.3115800Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:04:59.3292385Z + retcode=1\n2022-03-21T21:04:59.3292781Z + set -e\n2022-03-21T21:04:59.3293062Z + return 1\n2022-03-21T21:04:59.3295462Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:04:59.3295802Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:04:59.3296394Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:04:59.3296700Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:04:59.3297055Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:04:59.3297416Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:04:59.3299623Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge) (16/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:14:31.7846086Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:14:25.5525714Z Collecting jmespath<1.0.0,>=0.7.1\n2022-03-21T22:14:25.5568155Z   Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)\n2022-03-21T22:14:25.5952617Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T22:14:25.6169392Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T22:14:25.6629996Z      -------------------------------------- 247.7/247.7 KB 5.1 MB/s eta 0:00:00\n2022-03-21T22:14:25.6710247Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:14:25.8284354Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:14:25.9816751Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T22:14:31.6672236Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T22:14:31.7630473Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0ed0915ecee5d2424\n2022-03-21T22:14:31.7846086Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:14:31.7876742Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:14:31.7897140Z ##[error]Process completed with exit code 2.\n2022-03-21T22:14:31.8195621Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T22:14:31.8196110Z with:\n2022-03-21T22:14:31.8196356Z env:\n2022-03-21T22:14:31.8196614Z   IN_CI: 1\n2022-03-21T22:14:31.8196876Z   IS_GHA: 1\n2022-03-21T22:14:31.8197169Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:14:31.8197652Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T22:14:31.8198093Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge) (17/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:19:15.8845728Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:19:15.5116060Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:19:15.7231476Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:19:15.7409711Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:19:15.7458478Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:19:15.7470508Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:19:15.7496799Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:19:15.7538362Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:19:15.7566161Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:19:15.7711630Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:19:15.8753543Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0e2b3b4ddb246ff2a\n2022-03-21T21:19:15.8845728Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:19:15.8859814Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:19:15.8870165Z ##[error]Process completed with exit code 2.\n2022-03-21T21:19:15.8917039Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:19:15.8917279Z with:\n2022-03-21T21:19:15.8917433Z env:\n2022-03-21T21:19:15.8917586Z   IN_CI: 1\n2022-03-21T21:19:15.8917734Z   IS_GHA: 1\n2022-03-21T21:19:15.8917917Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:19:15.8918102Z ##[endgroup]\n2022-03-21T21:19:15.8934572Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu) (18/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:19:48.5900162Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:19:48.0742254Z + python3 -m pip install boto3==1.19.12\n2022-03-21T23:19:48.3742563Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T23:19:48.3976536Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T23:19:48.4048700Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T23:19:48.4065374Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T23:19:48.4128076Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T23:19:48.4164273Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T23:19:48.4202610Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:19:48.4416723Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:19:48.5773033Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-07ab7a3c4a5402af2\n2022-03-21T23:19:48.5900162Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:19:48.5919822Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:19:48.5936087Z ##[error]Process completed with exit code 2.\n2022-03-21T23:19:48.6007930Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T23:19:48.6008268Z with:\n2022-03-21T23:19:48.6008483Z env:\n2022-03-21T23:19:48.6008701Z   IN_CI: 1\n2022-03-21T23:19:48.6008920Z   IS_GHA: 1\n2022-03-21T23:19:48.6009170Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T23:19:48.6009440Z   GPU_FLAG: --gpus all\n2022-03-21T23:19:48.6009671Z ##[endgroup]\n\n\n pull / win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu) (19/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:54:04.2844259Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:53:59.0889659Z   Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T22:53:59.6881416Z      ---------------------------------------- 8.1/8.1 MB 14.0 MB/s eta 0:00:00\n2022-03-21T22:53:59.7427779Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:53:59.7691882Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T22:53:59.7779847Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T22:53:59.8281663Z      -------------------------------------- 247.7/247.7 KB 5.1 MB/s eta 0:00:00\n2022-03-21T22:54:00.0185115Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:54:00.2359770Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T22:54:04.1208891Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T22:54:04.2505862Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-03b4fbe63be8ef4b0\n2022-03-21T22:54:04.2844259Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:54:04.2891082Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:54:04.2919900Z ##[error]Process completed with exit code 2.\n2022-03-21T22:54:04.3377901Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T22:54:04.3378575Z with:\n2022-03-21T22:54:04.3378930Z env:\n2022-03-21T22:54:04.3379275Z   IN_CI: 1\n2022-03-21T22:54:04.3379600Z   IS_GHA: 1\n2022-03-21T22:54:04.3380023Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:54:04.3380691Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T22:54:04.3381278Z ##[endgroup]\n\n\n pull / linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge) (20/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:09:34.0074610Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:09:33.6365531Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:09:33.8475619Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:09:33.8655152Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:09:33.8704395Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:09:33.8716774Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:09:33.8760145Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:09:33.8785000Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:09:33.8811316Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:09:33.8960134Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:09:33.9984866Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d325eb9fd156146f\n2022-03-21T22:09:34.0074610Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:09:34.0087465Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:09:34.0101743Z ##[error]Process completed with exit code 2.\n2022-03-21T22:09:34.0154014Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:09:34.0154246Z with:\n2022-03-21T22:09:34.0154412Z env:\n2022-03-21T22:09:34.0154574Z   IN_CI: 1\n2022-03-21T22:09:34.0154728Z   IS_GHA: 1\n2022-03-21T22:09:34.0154917Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:09:34.0155112Z ##[endgroup]\n2022-03-21T22:09:34.0191047Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge) (21/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:03:17.8502655Z [E request_callbac...yUniqueId(created_on=0, local_id=0) to be created.\n\n2022-03-21T21:03:14.4669960Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpxgdsmeer\n2022-03-21T21:03:14.4671407Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpxgdsmeer/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.4973023Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmp1i2hfmpc\n2022-03-21T21:03:14.4973800Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmp1i2hfmpc/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.5532339Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpgx4da7b0\n2022-03-21T21:03:14.5533064Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpgx4da7b0/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.7050673Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 0\n2022-03-21T21:03:14.7097127Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 3\n2022-03-21T21:03:14.7398339Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 2\n2022-03-21T21:03:14.7922283Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 1\n2022-03-21T21:03:17.8502655Z [E request_callback_no_python.cpp:559] Received error while processing request type 261: false INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/torch/csrc/distributed/rpc/rref_context.cpp\":387, please report a bug to PyTorch. Expected OwnerRRef with id GloballyUniqueId(created_on=0, local_id=0) to be created.\n2022-03-21T21:03:17.8503603Z Exception raised from getOwnerRRef at /var/lib/jenkins/workspace/torch/csrc/distributed/rpc/rref_context.cpp:387 (most recent call first):\n2022-03-21T21:03:17.8504385Z frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x69 (0x7f180df19e19 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8505131Z frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xd2 (0x7f180df160e2 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8505927Z frame #2: c10::detail::torchInternalAssertFail(char const*, char const*, unsigned int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x4e (0x7f180df17a7e in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8506674Z frame #3: torch::distributed::rpc::RRefContext::getOwnerRRef(torch::distributed::rpc::GloballyUniqueId const&, bool) + 0x4b4 (0x7f18118b7b64 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8507642Z frame #4: torch::distributed::rpc::RequestCallbackNoPython::assignOwnerRRef(torch::distributed::rpc::GloballyUniqueId const&, torch::distributed::rpc::GloballyUniqueId const&, c10::intrusive_ptr<c10::ivalue::Future, c10::detail::intrusive_target_default_null_type<c10::ivalue::Future> >) const + 0x70 (0x7f18118a7bf0 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8508613Z frame #5: torch::distributed::rpc::RequestCallbackImpl::processPythonRemoteCall(torch::distributed::rpc::RpcCommandBase&, std::vector<c10::Stream, std::allocator<c10::Stream> >) const + 0xc8 (0x7f1819736208 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)\n2022-03-21T21:03:17.8509749Z frame #6: torch::distributed::rpc::RequestCallbackNoPython::processRpc(torch::distributed::rpc::RpcCommandBase&, torch::distributed::rpc::MessageType const&, std::vector<c10::Stream, std::allocator<c10::Stream> >) const + 0x194 (0x7f18118ac914 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8510708Z frame #7: torch::distributed::rpc::RequestCallbackImpl::processRpcWithErrors(torch::distributed::rpc::RpcCommandBase&, torch::distributed::rpc::MessageType const&, std::vector<c10::Stream, std::allocator<c10::Stream> >) const + 0x65 (0x7f1819735865 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)\n2022-03-21T21:03:17.8511369Z frame #8: <unknown function> + 0x375249a (0x7f18118a949a in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test (22/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:01:07.7015580Z \ufffd[36;1m  echo \"ERR...t available for the merge-base of your branch\"\ufffd[0m\n\n2022-03-21T20:01:07.7012399Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7012634Z \ufffd[36;1m# Covers the case where a previous tag doesn't exist for the tree\ufffd[0m\n2022-03-21T20:01:07.7012992Z \ufffd[36;1m# this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly\ufffd[0m\n2022-03-21T20:01:07.7013373Z \ufffd[36;1mif ! git rev-parse \"$MERGE_BASE:.circleci/docker\"; then\ufffd[0m\n2022-03-21T20:01:07.7013784Z \ufffd[36;1m  echo \"Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit\"\ufffd[0m\n2022-03-21T20:01:07.7014149Z \ufffd[36;1m  exit 1\ufffd[0m\n2022-03-21T20:01:07.7014325Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7014573Z \ufffd[36;1mPREVIOUS_DOCKER_TAG=$(git rev-parse \"$MERGE_BASE:.circleci/docker\")\ufffd[0m\n2022-03-21T20:01:07.7014907Z \ufffd[36;1m# If no image exists but the hash is the same as the previous hash then we should error out here\ufffd[0m\n2022-03-21T20:01:07.7015231Z \ufffd[36;1mif [[ \"${PREVIOUS_DOCKER_TAG}\" = \"${DOCKER_TAG}\" ]]; then\ufffd[0m\n2022-03-21T20:01:07.7015580Z \ufffd[36;1m  echo \"ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch\"\ufffd[0m\n2022-03-21T20:01:07.7015931Z \ufffd[36;1m  echo \"       contact the PyTorch team to restore the original images\"\ufffd[0m\n2022-03-21T20:01:07.7016225Z \ufffd[36;1m  exit 1\ufffd[0m\n2022-03-21T20:01:07.7016400Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7016608Z \ufffd[36;1mecho ::set-output name=rebuild::yes\ufffd[0m\n2022-03-21T20:01:07.7027605Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}\n2022-03-21T20:01:07.7027837Z env:\n2022-03-21T20:01:07.7028006Z   IN_CI: 1\n2022-03-21T20:01:07.7028159Z   IS_GHA: 1\n2022-03-21T20:01:07.7028346Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T20:01:07.7028589Z   BASE_REVISION: 6643522db9ff595f564b8081de58b3a33c546178\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu) (23/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T00:49:54.2949572Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T00:49:53.8049151Z + python3 -m pip install boto3==1.19.12\n2022-03-22T00:49:54.0981629Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-22T00:49:54.1207562Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-22T00:49:54.1277146Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-22T00:49:54.1315027Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-22T00:49:54.1331813Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-22T00:49:54.1391622Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-22T00:49:54.1609217Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-22T00:49:54.1637417Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-22T00:49:54.2830197Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0f7c32fe13be12fea\n2022-03-22T00:49:54.2949572Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T00:49:54.2966933Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T00:49:54.2982588Z ##[error]Process completed with exit code 2.\n2022-03-22T00:49:54.3031464Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T00:49:54.3031794Z with:\n2022-03-22T00:49:54.3032012Z env:\n2022-03-22T00:49:54.3032227Z   IN_CI: 1\n2022-03-22T00:49:54.3032434Z   IS_GHA: 1\n2022-03-22T00:49:54.3032681Z   GIT_DEFAULT_BRANCH: master\n2022-03-22T00:49:54.3033084Z   GPU_FLAG: --gpus all\n2022-03-22T00:49:54.3033312Z ##[endgroup]\n\n\n pull / win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge) (24/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:56:12.5872636Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:56:07.3365589Z   Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T21:56:07.7926584Z      ---------------------------------------- 8.1/8.1 MB 17.3 MB/s eta 0:00:00\n2022-03-21T21:56:07.9319362Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T21:56:07.9366132Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T21:56:08.0077590Z      -------------------------------------- 247.7/247.7 KB 3.0 MB/s eta 0:00:00\n2022-03-21T21:56:08.0164070Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:56:08.1775537Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:56:08.3393469Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T21:56:12.4576766Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T21:56:12.5641959Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0afad69838118af0e\n2022-03-21T21:56:12.5872636Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:56:12.5905611Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:56:12.5927729Z ##[error]Process completed with exit code 2.\n2022-03-21T21:56:12.6239531Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T21:56:12.6240039Z with:\n2022-03-21T21:56:12.6240299Z env:\n2022-03-21T21:56:12.6240557Z   IN_CI: 1\n2022-03-21T21:56:12.6240805Z   IS_GHA: 1\n2022-03-21T21:56:12.6241118Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:56:12.6241613Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T21:56:12.6242052Z ##[endgroup]\n\n\n pull / linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge) (25/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:46:39.5474616Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:46:39.1884210Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:46:39.3928976Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:46:39.4105069Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:46:39.4152571Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:46:39.4194931Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:46:39.4218947Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:46:39.4230812Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:46:39.4380089Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:46:39.4399461Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:46:39.5387703Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0888bed1149cca415\n2022-03-21T21:46:39.5474616Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:46:39.5487145Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:46:39.5497480Z ##[error]Process completed with exit code 2.\n2022-03-21T21:46:39.5541319Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:46:39.5541544Z with:\n2022-03-21T21:46:39.5541698Z env:\n2022-03-21T21:46:39.5541851Z   IN_CI: 1\n2022-03-21T21:46:39.5541997Z   IS_GHA: 1\n2022-03-21T21:46:39.5542176Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:46:39.5542361Z ##[endgroup]\n2022-03-21T21:46:39.5557878Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge) (26/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:34:57.0623859Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:34:56.9039884Z   Attempting uninstall: s3transfer\n2022-03-21T21:34:56.9041446Z     Found existing installation: s3transfer 0.3.7\n2022-03-21T21:34:56.9090783Z     Uninstalling s3transfer-0.3.7:\n2022-03-21T21:34:56.9095968Z       Successfully uninstalled s3transfer-0.3.7\n2022-03-21T21:34:56.9453014Z   Attempting uninstall: boto3\n2022-03-21T21:34:56.9454356Z     Found existing installation: boto3 1.16.34\n2022-03-21T21:34:56.9564320Z     Uninstalling boto3-1.16.34:\n2022-03-21T21:34:56.9578035Z       Successfully uninstalled boto3-1.16.34\n2022-03-21T21:34:57.0091363Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T21:34:57.0536230Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-034a3afd5d80b91fd\n2022-03-21T21:34:57.0623859Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:34:57.0637167Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:34:57.0647396Z ##[error]Process completed with exit code 2.\n2022-03-21T21:34:57.0688237Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:34:57.0688481Z with:\n2022-03-21T21:34:57.0688631Z env:\n2022-03-21T21:34:57.0688769Z   IN_CI: 1\n2022-03-21T21:34:57.0688930Z   IS_GHA: 1\n2022-03-21T21:34:57.0689109Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:34:57.0689462Z ##[endgroup]\n2022-03-21T21:34:57.0704768Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge) (27/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:05:00.7896545Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:05:00.7395504Z     #10 0x5597fd5a9801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:05:00.7396330Z     #11 0x5597fd5b47a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:05:00.7396688Z     #12 0x5597fd5b480b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:05:00.7398664Z     #13 0x5597fd5b4908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:05:00.7399177Z     #14 0x5597fd5b4908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:05:00.7399663Z     #15 0x5597fd5b4908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:05:00.7399986Z     #16 0x5597fd5b4ccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:05:00.7895241Z     #17 0x7f0a5905983f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:05:00.7895772Z     #18 0x5597fd559554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:05:00.7896033Z \n2022-03-21T21:05:00.7896545Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:05:00.8063448Z + retcode=1\n2022-03-21T21:05:00.8063787Z + set -e\n2022-03-21T21:05:00.8064058Z + return 1\n2022-03-21T21:05:00.8067638Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:05:00.8068127Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:05:00.8069018Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:05:00.8069500Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:05:00.8070105Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:05:00.8070580Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:05:00.8072640Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu) (28/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:48:17.3384813Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:48:16.8599645Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:48:17.1464241Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:48:17.1685222Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:48:17.1754164Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:48:17.1771662Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:48:17.1808722Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:48:17.1868636Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:48:17.1903889Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:48:17.2113746Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:48:17.3267404Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-01fe178c405417375\n2022-03-21T22:48:17.3384813Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:48:17.3402286Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:48:17.3418376Z ##[error]Process completed with exit code 2.\n2022-03-21T22:48:17.3470528Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:48:17.3470874Z with:\n2022-03-21T22:48:17.3471096Z env:\n2022-03-21T22:48:17.3471327Z   IN_CI: 1\n2022-03-21T22:48:17.3471538Z   IS_GHA: 1\n2022-03-21T22:48:17.3471802Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:48:17.3472083Z   GPU_FLAG: --gpus all\n2022-03-21T22:48:17.3472322Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge) (29/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:16:38.9646300Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:16:38.7995969Z   Attempting uninstall: s3transfer\n2022-03-21T21:16:38.7998039Z     Found existing installation: s3transfer 0.3.7\n2022-03-21T21:16:38.8066994Z     Uninstalling s3transfer-0.3.7:\n2022-03-21T21:16:38.8072844Z       Successfully uninstalled s3transfer-0.3.7\n2022-03-21T21:16:38.8449275Z   Attempting uninstall: boto3\n2022-03-21T21:16:38.8451430Z     Found existing installation: boto3 1.16.34\n2022-03-21T21:16:38.8559828Z     Uninstalling boto3-1.16.34:\n2022-03-21T21:16:38.8574290Z       Successfully uninstalled boto3-1.16.34\n2022-03-21T21:16:38.9100438Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T21:16:38.9558098Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d779c59d277d32ee\n2022-03-21T21:16:38.9646300Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:16:38.9658894Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:16:38.9673240Z ##[error]Process completed with exit code 2.\n2022-03-21T21:16:38.9720106Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:16:38.9720333Z with:\n2022-03-21T21:16:38.9720485Z env:\n2022-03-21T21:16:38.9720645Z   IN_CI: 1\n2022-03-21T21:16:38.9720793Z   IS_GHA: 1\n2022-03-21T21:16:38.9720970Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:16:38.9721151Z ##[endgroup]\n2022-03-21T21:16:38.9736762Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2021-11-10T08:42:52Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "facebook-github-bot"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 964902894
               },
               {
+                "bodyText": "@vitaly-fedyunin @gottbrath  FYI that this is the oneDNN Graph API integration. It depends on the #63748.",
+                "createdAt": "2021-11-16T16:36:52Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "Jianhui-Li"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 970451860
               },
               {
+                "bodyText": "CI failures are currently being caused by some issues in the CI infra, and are also occurring with other PRs.",
+                "createdAt": "2021-12-10T05:59:17Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 990641309
               },
               {
+                "bodyText": "CI failures are unrelated.",
+                "createdAt": "2021-12-10T20:44:09Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 991281407
               },
               {
+                "bodyText": "The CI failure is unrelated.",
+                "createdAt": "2021-12-16T02:45:59Z",
                 "author": {
-                  "login": "zzzwen"
+                  "login": "sanchitintel"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 995389295
               },
               {
+                "bodyText": "Hi, thank you for the PR!\nDo you mind running a larger amount of torchbench and reporting numbers ? You can look at Jason's post here for what models are supported in script. Initially just the vision models would be useful. @Krovatkin also did some benchmarking of a traced Bert model and found on average a ~16% speedup with this PR.",
+                "createdAt": "2022-01-18T18:22:34Z",
                 "author": {
-                  "login": "zzzwen"
+                  "login": "eellison"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1015689390
               },
               {
+                "bodyText": "Thanks a lot for reviewing, @eellison & @Krovatkin!\nWe just wanted to let you know that we're working on the benchmarking & will get back to you in a day, or two.\nUPDATE (Jan 21): While running some TorchBench models, we discovered some composability issues, and are working to ensure that oneDNN Graph would complement PyTorch's existing fusion capabilities, not hinder them.\nUPDATE (Jan 24): We've resolved the issues & will update this PR later today. Thanks!",
+                "createdAt": "2022-01-20T00:31:01Z",
                 "author": {
-                  "login": "simpkins"
+                  "login": "sanchitintel"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "COLLABORATOR",
+                "editor": {
+                  "login": "sanchitintel"
+                },
+                "databaseId": 1016996190
               },
               {
+                "bodyText": "Hello @eellison,\nWe used this TorchBench branch for comparison. compare_llga.sh can be run for comparison.\nFor benchmarking mobilenet_v3_large with hardswish support in oneDNN Graph, this oneDNN Graph branch can be used in third_party/ideep/mkl-dnn. It delivers a speedup over PyTorch JIT (NNC + OFI) because 21 additional reorders are prevented (the major factor here), and fusion with conv also helps further.\nThe next release of oneDNN Graph would have hardswish support.\nWe're also exploring adding a hardsigmoid op in oneDNN Graph.\nThank you!",
+                "createdAt": "2022-01-26T23:51:38Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "COLLABORATOR",
+                "editor": {
+                  "login": "sanchitintel"
+                },
+                "databaseId": 1022709513
               },
               {
+                "bodyText": "Please note that this PR should be merged after #71546, as #71546 changes the  third_party/ideep commit (this PR also uses that ideep commit, but it'd probably be better to merge #71546 first, so that oneDNN v2.5.2 upgrade would be in a separate PR). Thank you!",
+                "createdAt": "2022-01-31T23:57:21Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1026330085
               },
               {
+                "bodyText": "@sanchitintel mind rebasing and i'll land ?",
+                "createdAt": "2022-03-01T20:07:57Z",
                 "author": {
-                  "login": "zzzwen"
+                  "login": "eellison"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1055813984
               },
               {
+                "bodyText": "@eellison has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-03-02T17:44:47Z",
                 "author": {
-                  "login": "zzzwen"
+                  "login": "facebook-github-bot"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1057203495
               },
               {
+                "bodyText": "Thanks a lot for taking a look, @eellison! To fix this error, we would enable Bazel build for oneDNN Graph.",
+                "createdAt": "2022-03-07T23:03:45Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "COLLABORATOR",
+                "editor": {
+                  "login": "sanchitintel"
+                },
+                "databaseId": 1061230087
               },
               {
+                "bodyText": "@eellison has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-03-09T19:24:13Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "facebook-github-bot"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1063276600
               },
               {
+                "bodyText": "@malfet has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-03-21T19:59:41Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "facebook-github-bot"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1074355779
               },
               {
+                "bodyText": "And graph_rewriter.cpp is full of DOS newlines...",
+                "createdAt": "2022-03-21T20:53:40Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "malfet"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1074407452
               },
               {
+                "bodyText": "Hey @chunyuan-w.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-03-21T22:12:51Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "github-actions"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1074471758
               },
               {
+                "bodyText": "Thanks a ton for your help, @malfet & @eellison! :)\nWe'll incorporate your suggestions in subsequent PR(s).",
+                "createdAt": "2022-03-21T22:41:25Z",
                 "author": {
-                  "login": "kumpera"
+                  "login": "sanchitintel"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "COLLABORATOR",
+                "editor": {
+                  "login": "sanchitintel"
+                },
+                "databaseId": 1074492365
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOOYM_0Q==",
+              "hasPreviousPage": false
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=94787 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "voznesenskym"
+          },
+          "title": "Fine grained dynamic shape controls",
+          "body": "https://docs.google.com/document/d/1aoIyYE8_6cYpWqS25thzVoIiKsT5aaUEOiiPwbIXt8k/edit\r\n\r\ncc @mlazos @soumith @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx @desertfire",
+          "headRefName": "voz/shape_api",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "voznesenskym"
+                    },
+                    "email": "voznesenskym@gmail.com",
+                    "name": "Michael Voznesensky"
+                  },
+                  "oid": "315f665336384c0ca116fd482f24567c9f40d38d"
+                }
               },
               {
-                "author": {
-                  "login": "simpkins"
-                },
-                "state": "COMMENTED"
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "voznesenskym"
+                    },
+                    "email": "voznesenskym@gmail.com",
+                    "name": "Michael Voznesensky"
+                  },
+                  "oid": "e9c00f7cfbde36beca8176464a4d78d8531c9153"
+                }
               },
               {
-                "author": {
-                  "login": "simpkins"
-                },
-                "state": "COMMENTED"
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "voznesenskym"
+                    },
+                    "email": "voznesenskym@gmail.com",
+                    "name": "Michael Voznesensky"
+                  },
+                  "oid": "9da26aca0323a2231136617f34dddb802d3be62e"
+                }
               },
               {
-                "author": {
-                  "login": "pritamdamania87"
-                },
-                "state": "COMMENTED"
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "voznesenskym"
+                    },
+                    "email": "voznesenskym@gmail.com",
+                    "name": "Michael Voznesensky"
+                  },
+                  "oid": "c3ccd4399f118e438810eb23d34c5d914b4d236e"
+                }
               },
               {
-                "author": {
-                  "login": "pritamdamania87"
-                },
-                "state": "COMMENTED"
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "voznesenskym"
+                    },
+                    "email": "voznesenskym@gmail.com",
+                    "name": "Michael Voznesensky"
+                  },
+                  "oid": "b8b59302a5acacd02cc6d73258958116ba2cd4bd"
+                }
               },
               {
-                "author": {
-                  "login": "pritamdamania87"
-                },
-                "state": "COMMENTED"
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "voznesenskym"
+                    },
+                    "email": "voznesenskym@gmail.com",
+                    "name": "Michael Voznesensky"
+                  },
+                  "oid": "72e76dac8ed31b6f1d415e54514fd554c9ec34fd"
+                }
               },
               {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "voznesenskym"
+                    },
+                    "email": "voznesenskym@gmail.com",
+                    "name": "Michael Voznesensky"
+                  },
+                  "oid": "72e3fbc065ccd06e4afeb72a5f350073006755ae"
+                }
               },
               {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "voznesenskym"
+                    },
+                    "email": "voznesenskym@gmail.com",
+                    "name": "Michael Voznesensky"
+                  },
+                  "oid": "e4ff378fb2bffb2f9095b01922d95fd775686682"
+                }
               },
               {
-                "author": {
-                  "login": "wilson100hong"
-                },
-                "state": "COMMENTED"
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "voznesenskym"
+                    },
+                    "email": "voznesenskym@gmail.com",
+                    "name": "Michael Voznesensky"
+                  },
+                  "oid": "c2bb93f588afe86479b34ab50ef9a98bcd28f2ff"
+                }
               },
               {
-                "author": {
-                  "login": "wilson100hong"
-                },
-                "state": "COMMENTED"
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "voznesenskym"
+                    },
+                    "email": "voznesenskym@gmail.com",
+                    "name": "Michael Voznesensky"
+                  },
+                  "oid": "3b6b0d356f41da5ededfb23841da0b87e347911e"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTA",
+              "hasNextPage": false
+            },
+            "totalCount": 10
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Meta Internal-Only Changes Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://opensource.facebook.com/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlG-LQ=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNfKo="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Netlify",
+                            "databaseId": 13473
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNfLQ="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Azure Pipelines",
+                            "databaseId": 9426
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNfMA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Dependabot",
+                            "databaseId": 29110
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNfNk="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Codecov",
+                            "databaseId": 254
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNfOs="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "PyTorch Bot",
+                            "databaseId": 40112
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNfQ8="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "CircleCI Checks",
+                            "databaseId": 18001
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNfTA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Labeler"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4208041537"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "triage",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041537/jobs/7303594092"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlG-ec=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNgQA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Check Labels"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4208041711"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Check labels",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041711/jobs/7303594470"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlG-7U=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNgmU="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4208041714"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041714/jobs/7303594537"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlG_As=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNgmk="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  },
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "EasyCLA",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://easycla.lfx.linuxfoundation.org/#/?version=2"
+                      }
+                    ]
+                  },
+                  "pushedDate": "2023-02-17T22:24:37Z",
+                  "oid": "3b6b0d356f41da5ededfb23841da0b87e347911e"
+                }
+              }
+            ]
+          },
+          "changedFiles": 10,
+          "files": {
+            "nodes": [
+              {
+                "path": "test/dynamo/test_dynamic_shapes.py"
               },
               {
-                "author": {
-                  "login": "wilson100hong"
-                },
-                "state": "COMMENTED"
+                "path": "test/dynamo/test_export.py"
               },
               {
-                "author": {
-                  "login": "xunnanxu"
-                },
-                "state": "DISMISSED"
+                "path": "test/dynamo/test_misc.py"
               },
               {
-                "author": {
-                  "login": "xunnanxu"
-                },
-                "state": "COMMENTED"
+                "path": "test/dynamo/test_subgraphs.py"
               },
               {
-                "author": {
-                  "login": "xunnanxu"
-                },
-                "state": "COMMENTED"
+                "path": "torch/_dynamo/__init__.py"
               },
               {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
+                "path": "torch/_dynamo/config.py"
               },
               {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
+                "path": "torch/_dynamo/output_graph.py"
               },
               {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
+                "path": "torch/_dynamo/symbolic_convert.py"
               },
               {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
+                "path": "torch/_dynamo/variables/builder.py"
               },
               {
-                "author": {
-                  "login": "kumpera"
-                },
-                "state": "COMMENTED"
-              },
+                "path": "torch/fx/experimental/symbolic_shapes.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTA",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "xunnanxu"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "xunnanxu"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "xunnanxu"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "pritamdamania87"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "pritamdamania87"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "pritamdamania87"
+                  "login": "ezyang"
                 },
-                "state": "COMMENTED"
+                "state": "APPROVED"
               },
               {
                 "author": {
-                  "login": "pritamdamania87"
+                  "login": "voznesenskym"
                 },
-                "state": "APPROVED"
+                "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "kumpera"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0yNVQxMTozNTowMS0wNzowMLkyMDIyLTA0LTI1VDExOjM1OjAwLTA3OjAwzjjC2d0=",
-              "hasPreviousPage": true
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMy0wMi0xNFQxMTo1NjozOS0wODowMLkyMDIzLTAyLTE0VDExOjU2OjM5LTA4OjAwzk1itV0=",
+              "hasPreviousPage": false
             }
           },
           "comments": {
             "nodes": [
               {
-                "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136",
-                "createdAt": "2022-05-05T12:35:49Z",
+                "bodyText": "@voznesenskym your PR has been successfully reverted.",
+                "createdAt": "2023-02-17T19:52:21Z",
                 "author": {
                   "login": "pytorchmergebot"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1118495479
+                "databaseId": 1435164065
               },
               {
-                "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136",
-                "createdAt": "2022-05-05T12:53:15Z",
+                "bodyText": "test_autocast_sdpa_dynamic_shapes_static_default\n\nThanks, this is a coverage bug, we probably just need to exclude this test.",
+                "createdAt": "2023-02-17T21:08:53Z",
                 "author": {
-                  "login": "pytorchmergebot"
+                  "login": "voznesenskym"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1118511287
+                "databaseId": 1435269902
               },
               {
-                "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136",
-                "createdAt": "2022-05-05T15:00:08Z",
+                "bodyText": "After this PR, test_autocast_sdpa_dynamic_shapes_static_default started to fail with RuntimeError: Cannot call sizes() on tensor with symbolic sizes/strides: https://github.com/pytorch/pytorch/actions/runs/4206176846/jobs/7299657478\n\nLooks like the test was skipped on the PR because of some other issue that was later fixed.\n\nI wonder if for large PRs that change public API, we can forward fix?",
+                "createdAt": "2023-02-17T21:33:20Z",
                 "author": {
-                  "login": "pytorchmergebot"
+                  "login": "voznesenskym"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1118662274
+                "databaseId": 1435295882
               },
               {
-                "bodyText": "Merge failed due to Can't fetch all PR reviews Raised by https://github.com/pytorch/pytorch/actions/runs/2275691136\n\n@osalpekar @malfet This is failing because there are 109 review comments on this PR but we only fetch the first 100. This could be solved with a similar concept as how we fetch more comments/check_runs.",
-                "createdAt": "2022-05-05T15:20:46Z",
+                "bodyText": "@pytorchbot merge -f \"I trust this is fine, it was passing all release CI but one spurious one yesterday. Now we have insanely flaky CI, 404s, ptxas not found, out of space on runners, etc etc\"",
+                "createdAt": "2023-02-17T22:25:51Z",
                 "author": {
-                  "login": "janeyx99"
+                  "login": "voznesenskym"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1118689010
+                "databaseId": 1435346423
               },
               {
-                "bodyText": "On a side note, has the test_fsdp_clip_grad_norm_norm_type_2_0_nested_fsdp_False_cpu_offload_CPUOffload failure on the distributed test first shard of this PR been addressed?",
-                "createdAt": "2022-05-05T15:24:08Z",
+                "bodyText": "Merge started\nYour change will be merged immediately since you used the force (-f) flag, bypassing any CI checks (ETA: 1-5 minutes).\nLearn more about merging in the wiki.\nQuestions? Feedback? Please reach out to the PyTorch DevX TeamAdvanced Debugging\nCheck the merge workflow status\nhere",
+                "createdAt": "2023-02-17T22:28:33Z",
                 "author": {
-                  "login": "janeyx99"
+                  "login": "pytorchmergebot"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1118693497
+                "databaseId": 1435348851
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOQqri9w==",
+              "startCursor": "Y3Vyc29yOnYyOpHOVYrdoQ==",
               "hasPreviousPage": true
             }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "oncall: distributed"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=71759 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": true,
-          "author": {
-            "login": "coolteemf"
-          },
-          "title": "Optimize grid sample 3d",
-          "body": "Fixes #71415\r\nI have implemented the changes that replicate what @to-mi did in this [PR](https://github.com/pytorch/pytorch/pull/65986#issue-1012959443) for the 3D case :\r\n\r\n> Fixes #64977\r\n> \r\n> Avoids creating a tensor for and calculating `input` gradient if it's not needed in the backward pass of `grid_sample` (2d case, native CPU & CUDA kernels). Especially the tensor creation seemed time consuming (see #64977).\r\n> \r\n> Brief description of the changes:\r\n> \r\n>     * I have tried to go with rather minimal changes. It would probably be possible to make a more elegant version with a bit larger refactoring (or possibly with better understanding of PyTorch internals and C++ functionalities).\r\n> \r\n>     * Changed the `native_functions.yaml` and `derivatives.yaml` so that the gradient input mask is passed to the functions.\r\n> \r\n>     * Changed the CPU kernels:\r\n>       (1) added `bool input_requires_grad` template parameter to the `backward` function,\r\n>       (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n>       (3) feed in `TensorAccessor<scalar_t, 3>* gInp_slice_ptr` instead of `TensorAccessor<scalar_t, 3>& gInp_slice` so that I can pass a `nullptr` in case gradient for `input` is not requested. (A bit inelegant perhaps, but allows to keep one signature for `backward` function and not require breaking it to smaller pieces. Perhaps there's a more elegant way to achieve this?)\r\n> \r\n>     * Changed CUDA kernel:\r\n>       (1) added ~`bool input_requires_grad` template parameter~ `const bool input_requires_grad` argument to the `backward` function,\r\n>       (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n>       (3) feed in `TensorInfo<scalar_t, index_t>()` instead of `getTensorInfo<scalar_t, index_t>(grad_input)` in case gradient for `input` is not requested.\r\n> \r\n>     * Modified tests in `test/test_nn.py` so that they run also cases with no `input` gradient needed.\r\n> \r\n>     * Have not touched the CPU fallback kernel.\r\n\r\nNote: the changes number (3) are N/A in this case.\r\n\r\n",
-          "headRefName": "optimize_grid_sample_3d",
-          "headRepository": {
-            "nameWithOwner": "coolteemf/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "e0b0d1e695aeddceaf265da602c4704592053e9e"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "563ec73747ad53b63b36736c47c4342f962c2a09"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "51abe41a132d9dd5b1c0551bdca902aacc028ff8"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "be9898205992034a00e8ace8a55c2ecdcee2c2f8"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "2929c60b64384c2deae0f7dea8bab94ad4bc9ec8"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "9241b737e7e2b257905cc74ad9c50b737d7f9d0a"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "64d6b795d0636928a8aa2fd3da01302fb5f5f7af"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "4503577e53760a0006f1e80ca6bfe04d2be90470"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "b16f4b11ffbbbf2ca2098f9702af4ef6b6fc5e1f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "7ffc23368a604afdc92d2818747f730ce31a2bb5"
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "Merged"
                 }
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "b85292604b9ad6c31706b76b5a5498c4f6d94309"
+                "node": {
+                  "name": "Reverted"
                 }
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "9d81d7bae8ad91aaa24b3ceab83e3138894dbc69"
+                "node": {
+                  "name": "ciflow/trunk"
                 }
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "e79f6a2202512b294c55bf4bfb2e0524fafd4c48"
+                "node": {
+                  "name": "release notes: fx"
                 }
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "f683e8aec7aea76097a264eec01511e704c31154"
+                "node": {
+                  "name": "module: dynamo"
                 }
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "coolteemf"
-                    },
-                    "email": "67541941+coolteemf@users.noreply.github.com",
-                    "name": "Fran\u00e7ois Lecomte"
-                  },
-                  "oid": "b932e9e286c22aaf352375186df851ef060b295a"
+                "node": {
+                  "name": "ciflow/inductor"
                 }
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "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",
-                    "name": "coolteemf"
-                  },
-                  "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22"
+                "node": {
+                  "name": "ciflow/inductor-perf-test-nightly"
                 }
               }
-            ],
-            "pageInfo": {
-              "endCursor": "MTY",
-              "hasNextPage": false
-            },
-            "totalCount": 16
-          },
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAApMNgmk= name=pytorch number=94787 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
           "commits": {
             "nodes": [
               {
                 "commit": {
+                  "oid": "3b6b0d356f41da5ededfb23841da0b87e347911e",
                   "checkSuites": {
                     "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGYqY=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_T6g="
-                      },
                       {
                         "node": {
                           "app": {
@@ -29618,36 +15901,271 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-xenial-py3.7-clang7-onnx"
+                              "name": "pull"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754066"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4208041735"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
+                                "name": "linux-bionic-py3.8-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663109808"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303701615"
                               },
                               {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-focal-py3-clang7-mobile-custom-build-static / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663214802"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303701747"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303701852"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7-pch / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303701925"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.11-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303702051"
+                              },
+                              {
+                                "name": "linux-focal-py3.9-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303702154"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303702271"
+                              },
+                              {
+                                "name": "linux-bionic-py3.11-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303702381"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303702460"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.7-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303702563"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7-sm86 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303702661"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-clang10-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303702808"
+                              },
+                              {
+                                "name": "linux-focal-rocm5.4.2-py3.8 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303702894"
+                              },
+                              {
+                                "name": "linux-bionic-py3_8-clang8-xla / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303702996"
+                              },
+                              {
+                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303703150"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303703293"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303703385"
+                              },
+                              {
+                                "name": "linux-focal-py3-clang7-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303703532"
+                              },
+                              {
+                                "name": "linux-jammy-cuda11.7-cudnn8-py3.8-clang12 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303703638"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303703737"
+                              },
+                              {
+                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303703829"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-clang10-onnx / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303828889"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303835049"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303835153"
+                              },
+                              {
+                                "name": "linux-docs / build-docs-cpp-false",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303837897"
+                              },
+                              {
+                                "name": "linux-docs / build-docs-python-false",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303838021"
+                              },
+                              {
+                                "name": "linux-docs / build-docs-functorch-false",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303838107"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303838199"
+                              },
+                              {
+                                "name": "linux-bionic-py3.11-clang9 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303838339"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303840161"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303842442"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303842542"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303842625"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303842711"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303842795"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303842882"
+                              },
+                              {
+                                "name": "linux-bionic-py3.11-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303843181"
+                              },
+                              {
+                                "name": "linux-bionic-py3.11-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303843267"
+                              },
+                              {
+                                "name": "linux-bionic-py3.11-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303843372"
+                              },
+                              {
+                                "name": "linux-bionic-py3.11-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303843487"
+                              },
+                              {
+                                "name": "linux-bionic-py3.11-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303843604"
+                              },
+                              {
+                                "name": "linux-bionic-py3.11-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303843698"
+                              },
+                              {
+                                "name": "linux-bionic-py3.11-clang9 / test (functorch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303843776"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303845232"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303845321"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303845404"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303845483"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303845574"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303845667"
                               },
                               {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-bionic-py3.8-clang9 / test (functorch, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663214856"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303845746"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIob0=",
-                              "hasNextPage": false
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlMAIw=",
+                              "hasNextPage": true
                             }
                           },
-                          "conclusion": "SUCCESS"
+                          "conclusion": "FAILURE"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ubk="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNgo8="
                       },
                       {
                         "node": {
@@ -29657,26 +16175,71 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-build"
+                              "name": "Lint"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754064"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4208041752"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
+                                "name": "pr-sanity-checks",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754064/jobs/2663109676"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041752/jobs/7303594654"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041752/jobs/7303594794"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041752/jobs/7303594903"
+                              },
+                              {
+                                "name": "Test collect_env (older_python_version)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041752/jobs/7303595028"
+                              },
+                              {
+                                "name": "docker-image / calculate-docker-image",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041752/jobs/7303595161"
+                              },
+                              {
+                                "name": "toc / linux-job",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041752/jobs/7303600397"
+                              },
+                              {
+                                "name": "Test tools / linux-job",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041752/jobs/7303600533"
+                              },
+                              {
+                                "name": "lintrunner / linux-job",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041752/jobs/7303600661"
+                              },
+                              {
+                                "name": "quick-checks / linux-job",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041752/jobs/7303600763"
+                              },
+                              {
+                                "name": "workflow-checks / linux-job",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041752/jobs/7303600890"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1E=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlHG-4=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ubw="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNgrs="
                       },
                       {
                         "node": {
@@ -29686,41 +16249,31 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-bionic-rocm4.5-py3.7"
+                              "name": "windows-binary-libtorch-debug"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754065"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4208042519"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663109684"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.rocm.gpu)",
+                                "name": "libtorch-cpu-shared-with-deps-debug-build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401083"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042519/jobs/7303603228"
                               },
                               {
-                                "name": "test (default, 1, 2, linux.rocm.gpu)",
+                                "name": "libtorch-cpu-shared-with-deps-debug-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401143"
-                              },
-                              {
-                                "name": "test (distributed, 1, 1, linux.rocm.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401186"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042519/jobs/7304861868"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwMsZY=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlgTXU=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "FAILURE"
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ub0="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNic0="
                       },
                       {
                         "node": {
@@ -29730,41 +16283,31 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "win-vs2019-cuda11.3-py3"
+                              "name": "windows-binary-libtorch-release"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754068"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4208042523"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663109680"
-                              },
-                              {
-                                "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "name": "libtorch-cpu-shared-with-deps-release-build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995756"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042523/jobs/7303605458"
                               },
                               {
-                                "name": "test (force_on_cpu, 1, 1, windows.4xlarge)",
+                                "name": "libtorch-cpu-shared-with-deps-release-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995819"
-                              },
-                              {
-                                "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995900"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042523/jobs/7304083009"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwZbzg=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlQmr8=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "FAILURE"
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ub8="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNido="
                       },
                       {
                         "node": {
@@ -29774,110 +16317,271 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "Lint"
+                              "name": "trunk"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754069"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4208042713"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "mypy",
+                                "name": "libtorch-linux-bionic-cuda11.7-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109683"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303704758"
                               },
                               {
-                                "name": "shellcheck",
+                                "name": "macos-12-py3-x86-64 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109827"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303704873"
                               },
                               {
-                                "name": "py2-setup-validate-errormsg",
+                                "name": "macos-12-py3-x86-64-lite-interpreter / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109962"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303704980"
                               },
                               {
-                                "name": "clang-format",
+                                "name": "win-vs2019-cuda11.7-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110044"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303705124"
                               },
                               {
-                                "name": "cmakelint",
+                                "name": "ios-12-5-1-x86-64 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110132"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303705267"
                               },
                               {
-                                "name": "toc",
+                                "name": "linux-bionic-cuda11.8-py3.10-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110233"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303705363"
                               },
                               {
-                                "name": "quick-checks",
+                                "name": "pytorch-linux-focal-py3-clang7-android-ndk-r19c-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110320"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303705455"
                               },
                               {
-                                "name": "clang-tidy",
+                                "name": "linux-focal-py3.9-clang7-tsan / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110461"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303705559"
                               },
                               {
-                                "name": "flake8-py3",
+                                "name": "linux-bionic-py3.8-clang9-slow / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110575"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGbAQ=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcA="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-clang7-asan"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754070"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303705667"
+                              },
+                              {
+                                "name": "linux-focal-rocm5.4.2-py3.8 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303705741"
+                              },
+                              {
+                                "name": "caffe2-linux-focal-py3.8-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303705843"
+                              },
+                              {
+                                "name": "macos-12-py3-arm64 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303705932"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303706054"
+                              },
+                              {
+                                "name": "android-emulator-build-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303706203"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303706302"
+                              },
+                              {
+                                "name": "linux-focal-py3.9-clang7-tsan / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303835916"
+                              },
+                              {
+                                "name": "linux-focal-py3.9-clang7-tsan / test (tsan, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303840642"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9-slow / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303843564"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9-slow / test (slow, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303848092"
+                              },
+                              {
+                                "name": "linux-focal-rocm5.4.2-py3.8 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303938113"
+                              },
+                              {
+                                "name": "linux-focal-rocm5.4.2-py3.8 / test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303942927"
+                              },
+                              {
+                                "name": "linux-focal-rocm5.4.2-py3.8 / test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303943019"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.8-py3.10-gcc7 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303970913"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303971592"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.8-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303974388"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.8-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303974458"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.8-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303974522"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.8-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303974601"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.8-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303974670"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.8-py3.10-gcc7 / test (nogpu_AVX512, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303974734"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.8-py3.10-gcc7 / test (nogpu_NO_AVX2, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303974815"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.8-py3.10-gcc7 / test (jit_legacy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303974888"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.8-py3.10-gcc7 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303974962"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.8-py3.10-gcc7 / test (distributed, 2, 3, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303975047"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.8-py3.10-gcc7 / test (distributed, 3, 3, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303975129"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (nogpu_AVX512, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303975250"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (nogpu_NO_AVX2, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303975351"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (jit_legacy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7303975457"
+                              },
+                              {
+                                "name": "macos-12-py3-arm64 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304247703"
+                              },
+                              {
+                                "name": "macos-12-py3-arm64-mps / Run MPS tests",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304247822"
+                              },
+                              {
+                                "name": "macos-12-py3-arm64 / test (default, 1, 2, macos-m1-12)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304251854"
+                              },
+                              {
+                                "name": "macos-12-py3-arm64 / test (default, 2, 2, macos-m1-12)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304251962"
+                              },
                               {
-                                "name": "build",
+                                "name": "macos-12-py3-arm64 / test (functorch, 1, 1, macos-m1-12)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663109804"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304252042"
                               },
                               {
-                                "name": "test (default, 3, 3, linux.2xlarge)",
+                                "name": "win-vs2019-cuda11.7-py3 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233675"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304425704"
                               },
                               {
-                                "name": "test (default, 1, 3, linux.2xlarge)",
+                                "name": "win-vs2019-cuda11.7-py3 / test (default, 1, 5, windows.g5.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233731"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304429501"
                               },
                               {
-                                "name": "test (default, 2, 3, linux.2xlarge)",
+                                "name": "win-vs2019-cuda11.7-py3 / test (default, 2, 5, windows.g5.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233805"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304429568"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.7-py3 / test (default, 3, 5, windows.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304429634"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.7-py3 / test (default, 4, 5, windows.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304429698"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.7-py3 / test (default, 5, 5, windows.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304429775"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.7-py3 / test (functorch, 1, 1, windows.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304429850"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwJC4U=",
-                              "hasNextPage": false
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlXirw=",
+                              "hasNextPage": true
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcE="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNi7c="
                       },
                       {
                         "node": {
@@ -29887,26 +16591,31 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit"
+                              "name": "linux-binary-libtorch-pre-cxx11"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754076"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4208042720"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build-and-test",
+                                "name": "libtorch-cpu-shared-with-deps-pre-cxx11-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754076/jobs/2663109810"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042720/jobs/7303603413"
+                              },
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-pre-cxx11-test / test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042720/jobs/7304107632"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ_w=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlREYs=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcY="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNi8c="
                       },
                       {
                         "node": {
@@ -29916,56 +16625,41 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-xenial-py3.7-gcc5.4"
+                              "name": "linux-binary-manywheel"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754078"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4208042724"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663109777"
-                              },
-                              {
-                                "name": "test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201383"
-                              },
-                              {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201458"
-                              },
-                              {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "name": "manywheel-py3_8-cuda11_7-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201512"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042724/jobs/7303702881"
                               },
                               {
-                                "name": "test (distributed, 1, 1, linux.2xlarge)",
+                                "name": "manywheel-py3_8-cuda11_7-with-pypi-cudnn-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201580"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042724/jobs/7303703005"
                               },
                               {
-                                "name": "test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "name": "manywheel-py3_8-cuda11_7-with-pypi-cudnn-test / test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201672"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042724/jobs/7304417906"
                               },
                               {
-                                "name": "test (docs_test, 1, 1, linux.2xlarge)",
+                                "name": "manywheel-py3_8-cuda11_7-test / test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201839"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042724/jobs/7304565223"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIWu4=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlaTiE=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uco="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNi9Q="
                       },
                       {
                         "node": {
@@ -29975,291 +16669,366 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
+                              "name": "linux-binary-libtorch-cxx11-abi"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1886754079"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4208042734"
                           },
                           "checkRuns": {
                             "nodes": [
-                              {
-                                "name": "build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754079/jobs/2663109681"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1k=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uc0="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  },
-                  "status": {
-                    "contexts": [
-                      {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017798?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017799?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017816?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017800?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      }
-                    ]
-                  },
-                  "pushedDate": "2022-02-23T10:39:30Z",
-                  "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22"
-                }
-              }
-            ]
-          },
-          "changedFiles": 9,
-          "files": {
-            "nodes": [
-              {
-                "path": "aten/src/ATen/native/GridSampler.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/cpu/GridSamplerKernel.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/GridSampler.cpp"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/GridSampler.cu"
-              },
-              {
-                "path": "aten/src/ATen/native/cuda/GridSampler.h"
-              },
-              {
-                "path": "aten/src/ATen/native/native_functions.yaml"
-              },
-              {
-                "path": "test/forward_backward_compatibility/check_forward_backward_compatibility.py"
-              },
-              {
-                "path": "test/test_nn.py"
-              },
-              {
-                "path": "tools/autograd/derivatives.yaml"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "OQ",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "coolteemf"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "coolteemf"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "coolteemf"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "coolteemf"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "coolteemf"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "coolteemf"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "coolteemf"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "APPROVED"
-              },
-              {
-                "author": {
-                  "login": "albanD"
-                },
-                "state": "APPROVED"
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042734/jobs/7303597291"
+                              },
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042734/jobs/7304014242"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlPTXg=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNi-Y="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "inductor"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4208042744"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm80 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042744/jobs/7303659293"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042744/jobs/7303659419"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm80 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042744/jobs/7303929219"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042744/jobs/7303931817"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm80 / test (inductor_torchbench_smoketest_perf, 1, 1, linux.gcp.a100)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042744/jobs/7303933247"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / test (inductor, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042744/jobs/7303936137"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / test (inductor_huggingface, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042744/jobs/7303936197"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / test (inductor_timm, 1, 2, linux.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042744/jobs/7303936265"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / test (inductor_timm, 2, 2, linux.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042744/jobs/7303936343"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / test (inductor_torchbench, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042744/jobs/7303936420"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / test (inductor_distributed, 1, 1, linux.g5.12xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042744/jobs/7303936482"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlNw7A=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNi_w="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "inductor-A100-perf"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4208043854"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm80 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208043854/jobs/7303599705"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm80 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208043854/jobs/7303875874"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm80 / test (inductor_huggingface_perf, 1, 1, linux.gcp.a100)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208043854/jobs/7303880431"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm80 / test (inductor_timm_perf, 1, 2, linux.gcp.a100)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208043854/jobs/7303880558"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm80 / test (inductor_timm_perf, 2, 2, linux.gcp.a100)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208043854/jobs/7303880671"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm80 / test (inductor_torchbench_perf, 1, 1, linux.gcp.a100)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208043854/jobs/7303880772"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlMqhE=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAApMNlcw="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  }
+                }
               }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMS0yNVQwODoyODoxMC0wODowMLkyMDIyLTAxLTI1VDA3OjU0OjA1LTA4OjAwzjNooqI=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAqlMAIw= cs_cursor=Y3Vyc29yOnYyOpHPAAAAApMNgmk= name=pytorch number=94787 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
             "nodes": [
               {
-                "bodyText": "Merge failed due to 'NoneType' object is not subscriptable\nRaised by https://github.com/pytorch/pytorch/actions/runs/1887945630",
-                "createdAt": "2022-02-23T14:55:36Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1048868910
-              },
-              {
-                "bodyText": "Thanks for the update! The windows failure is not your fault, you can ignore it!\n\nThank you very much for all of your feedback and sorry for the delay !",
-                "createdAt": "2022-02-23T16:44:36Z",
-                "author": {
-                  "login": "coolteemf"
-                },
-                "authorAssociation": "CONTRIBUTOR",
-                "editor": null,
-                "databaseId": 1048983572
-              },
-              {
-                "bodyText": "@coolteemf can you please send either me or @albanD an email? (or I can send you and invite to collab on private repo)",
-                "createdAt": "2022-02-23T17:49:55Z",
-                "author": {
-                  "login": "malfet"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1049048119
-              },
-              {
-                "bodyText": "@pytorchbot merge this please",
-                "createdAt": "2022-02-23T19:23:55Z",
-                "author": {
-                  "login": "albanD"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1049131992
-              },
-              {
-                "bodyText": "Hey @coolteemf.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-02-23T19:26:51Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1049134520
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOPoR4Lg==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "triaged"
-                }
-              },
-              {
-                "node": {
-                  "name": "open source"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              },
-              {
-                "node": {
-                  "name": "release notes: nn"
-                }
-              },
-              {
-                "node": {
-                  "name": "topic: performance"
+                "commit": {
+                  "oid": "3b6b0d356f41da5ededfb23841da0b87e347911e",
+                  "checkSuites": {
+                    "nodes": [
+                      {
+                        "checkRuns": {
+                          "nodes": [
+                            {
+                              "name": "linux-vulkan-bionic-py3.11-clang9 / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303846368"
+                            },
+                            {
+                              "name": "linux-vulkan-bionic-py3.11-clang9 / test (default, 1, 1, linux.2xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303851789"
+                            },
+                            {
+                              "name": "linux-bionic-py3_8-clang8-xla / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303889527"
+                            },
+                            {
+                              "name": "linux-bionic-py3_8-clang8-xla / test (xla, 1, 1, linux.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303893798"
+                            },
+                            {
+                              "name": "linux-focal-py3.9-clang7-asan / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303894483"
+                            },
+                            {
+                              "name": "linux-focal-py3.9-clang7-asan / test (default, 1, 5, linux.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303898484"
+                            },
+                            {
+                              "name": "linux-focal-py3.9-clang7-asan / test (default, 2, 5, linux.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303898598"
+                            },
+                            {
+                              "name": "linux-focal-py3.9-clang7-asan / test (default, 3, 5, linux.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303898734"
+                            },
+                            {
+                              "name": "linux-focal-py3.9-clang7-asan / test (default, 4, 5, linux.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303898877"
+                            },
+                            {
+                              "name": "linux-focal-py3.9-clang7-asan / test (default, 5, 5, linux.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303899024"
+                            },
+                            {
+                              "name": "linux-focal-py3.9-clang7-asan / test (functorch, 1, 1, linux.2xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303899113"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7 / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303972169"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7-sm86 / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303973854"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303975438"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303975525"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303975583"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303975657"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303975728"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 2, 3, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303975810"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 3, 3, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303975863"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303975906"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303975975"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7-sm86 / test (default, 1, 4, linux.g5.4xlarge.nvidia.gpu)",
+                              "conclusion": "FAILURE",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303977784"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7-sm86 / test (default, 2, 4, linux.g5.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303977848"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7-sm86 / test (default, 3, 4, linux.g5.4xlarge.nvidia.gpu)",
+                              "conclusion": "FAILURE",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303977907"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7-sm86 / test (default, 4, 4, linux.g5.4xlarge.nvidia.gpu)",
+                              "conclusion": "FAILURE",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303977968"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7-sm86 / test (slow, 1, 2, linux.g5.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303978026"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7-sm86 / test (slow, 2, 2, linux.g5.4xlarge.nvidia.gpu)",
+                              "conclusion": "FAILURE",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303978092"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.7-py3.10-gcc7-sm86 / test (functorch, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7303978145"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7304059664"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7304068409"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7304068629"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (functorch, 1, 1, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208041735/jobs/7304068814"
+                            }
+                          ],
+                          "pageInfo": {
+                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlQV9M=",
+                            "hasNextPage": false
+                          }
+                        }
+                      }
+                    ]
+                  }
                 }
               }
             ]
@@ -30268,441 +17037,1292 @@
       }
     }
   },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=75095 owner=pytorch": {
+  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAqlXirw= cs_cursor=Y3Vyc29yOnYyOpHPAAAAApMNido= name=pytorch number=94787 owner=pytorch": {
     "data": {
       "repository": {
         "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "mruberry"
-          },
-          "title": "Initial prims, references, and test architecture for them",
-          "body": "This PR adds an initial set of experimental primitive operations and Python references that reimplement existing PyTorch operations using them. See https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-0/577 for additional context.\r\n\r\nThe following experimental primitives are added:\r\n\r\n- Elementwise unary prims -- abs, acos, acosh, asin, atan, cos, cosh, bessel_i0e, bessel_i1e, cbrt, ceil, digamma, erf, erf_inv, erfc, exp, expm1, floor, igamma, igammac, is_finite, lgamma, log, log1p, neg, reciprocal, round, sign, sinh, sqrt, square, tan. \r\n- Elementwise binary prims -- add, atan2, bitwise_and, bitwise_not, bitwise_or, bitwise_xor, div, eq, ge, gt, le, lt, max, min, mul, ne, nextafter, pow, rsqrt, shift_left, shift_right_arithmetic\r\n- View prims -- brodcast_in_dim, collapse_view, split_dim, squeeze\r\n- Shape prims -- collapse, concatenate, reshape\r\n- Conditional prims -- select\r\n- Data conversion & movement prims -- convert_element_type, device_put\r\n- Inplace prims -- copy_to, resize\r\n\r\nThese primitives do not add any new functionality to PyTorch, but are intended to be the semantic building blocks for reference operators. We have tried to make them consistent with the operations in [jax.lax](https://jax.readthedocs.io/en/latest/jax.lax.html) where possible (because PyTorch prefers being consistent with other frameworks), although there are key differences between these prims and operations in jax.lax. Most notably is that these prims model view semantics and inplace operations.\r\n\r\nIn addition to these primitives the following elementwise binary Python references are added:\r\n\r\n- Elementwise binary Python references -- add, atan2, bitwise_and, bitwise_left_shift, bitwise_or, bitwise_right_shift, bitwise_xor, eq, float_power, ge, gt, le, lt, maximum, minimum, mul, ne, nextafter, pow, sub, true_divide\r\n- Conditional Python references - where\r\n- Data conversion & movement references - copy_to\r\n\r\nA Python reference implements the same behavior as its corresponding PyTorch operator (excepting slight numerical differences, bug fixes, and in some cases additional features). \r\n\r\nThe start of an OpInfo-based test architecture for these references is also included in this PR. A new list, `python_ref_db`, is added to `common_methods_invocations.py`. This list introduces the new `ElementwiseBinaryPythonRefInfo`, which inherits input arguments from the original operators' OpInfo, allows them to be overridden, and then constructs the OpInfo for the Python reference using the (potentially modified) arguments. OpInfo-based tests can opt-into testing references by including this new list in the Sequence passed to the `@ops` decorator. \r\n\r\ncc @ngimel @csarofeen @kevinstephano @Lezcano ",
-          "headRefName": "prims_and_references",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
+          "commits": {
             "nodes": [
               {
                 "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "a790467c650be92775103cde5e866c90b56f5376"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "bd6fcf50692e208ebecdc2eaa517a2bfcdcd35cf"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "4a119c8f21529fe1375e7e8789b91f41a3df80c5"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "ea6750dc34d66be759fdfe84b09fb0e23ee59c79"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "2eef8a55fe0227e1921b51bf1f56f9d0a29b49ac"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "b886ed6c20dd1785fd31ed6fa6a8c5b6d0d0b16c"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "9ad9b63d09aa4f7a8549bcf1d88ea4ff0674299c"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "63fdd580118477416ae160e0670ae722ea248090"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "0ccf7dc292af1d40d0a094eb2b2fb0c7ab4ccc70"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "e8a8a4d1fbe35f20eb88e1a43cf5a653883638e5"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "186634dfdd25645c05b58a212f9e8d77c4125fc0"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "f5b4741312b5c42a79f6c8a1d3930b79db38ed8f"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "23d50391bb0fd12111fd3171591c4235ffb2fc1a"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "bac9d45422d58f513b60b4b854441cfdc253d4c5"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "13240ae0b4a0332c3167b65ac026a3172da90cb7"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ezyang"
-                    },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
-                  },
-                  "oid": "1ee34468cb1db3dc6cbae204669f4fec20e2a466"
+                  "oid": "3b6b0d356f41da5ededfb23841da0b87e347911e",
+                  "checkSuites": {
+                    "nodes": [
+                      {
+                        "checkRuns": {
+                          "nodes": [
+                            {
+                              "name": "win-vs2019-cuda11.7-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304430807"
+                            },
+                            {
+                              "name": "macos-12-py3-x86-64 / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304458221"
+                            },
+                            {
+                              "name": "macos-12-py3-x86-64 / test (default, 1, 3, macos-12)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304461432"
+                            },
+                            {
+                              "name": "macos-12-py3-x86-64 / test (default, 2, 3, macos-12)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304461489"
+                            },
+                            {
+                              "name": "macos-12-py3-x86-64 / test (default, 3, 3, macos-12)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304461560"
+                            },
+                            {
+                              "name": "macos-12-py3-x86-64 / test (functorch, 1, 1, macos-12)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4208042713/jobs/7304461616"
+                            }
+                          ],
+                          "pageInfo": {
+                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAqlYLyw=",
+                            "hasNextPage": false
+                          }
+                        }
+                      }
+                    ]
+                  }
                 }
-              },
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=73969 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": true,
+          "author": {
+            "login": "malfet"
+          },
+          "title": "Dummy change",
+          "body": "Test Plan: None at all\n\nDifferential Revision: D34753911\n\n",
+          "headRefName": "export-D34753911",
+          "headRepository": {
+            "nameWithOwner": "malfet/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "ezyang"
+                      "login": "malfet"
                     },
-                    "email": "ezyang@fb.com",
-                    "name": "Edward Z. Yang"
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
                   },
-                  "oid": "561d132bc686d00e8911f7feb3da5901b2bdc574"
+                  "oid": "4746da707a9912356f5179625da89616b228dc21"
                 }
-              },
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MQ",
+              "hasNextPage": false
+            },
+            "totalCount": 1
+          },
+          "commits": {
+            "nodes": [
               {
                 "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ngimel"
-                    },
-                    "email": "ngimel@fb.com",
-                    "name": "Natalia Gimelshein"
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-vulkan-bionic-py3.7-clang9"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280134"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280134/jobs/2794078044"
+                              },
+                              {
+                                "name": "test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280134/jobs/2794189060"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRQMQ=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QM="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280135"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280135/jobs/2794078023"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aM=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QU="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-bionic-rocm4.5-py3.7"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280132"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794078060"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292071"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292205"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292306"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbTiXw=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QY="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "win-vs2019-cuda11.3-py3"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280139"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794078053"
+                              },
+                              {
+                                "name": "test (force_on_cpu, 1, 1, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794536907"
+                              },
+                              {
+                                "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794536998"
+                              },
+                              {
+                                "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794537089"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbY_vU=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qc="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280136"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280136/jobs/2794078031"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2ao=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qk="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-docs"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280138"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794078055"
+                              },
+                              {
+                                "name": "build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794183768"
+                              },
+                              {
+                                "name": "build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794183828"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRIt0=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qo="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc7"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280140"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794078017"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181109"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181305"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181488"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRFm4=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qs="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3-clang5-mobile-build"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280143"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280143/jobs/2794078025"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aw=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Q4="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280145"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "shellcheck",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078028"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078196"
+                              },
+                              {
+                                "name": "clang-tidy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078407"
+                              },
+                              {
+                                "name": "clang-format",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078610"
+                              },
+                              {
+                                "name": "cmakelint",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078760"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078898"
+                              },
+                              {
+                                "name": "py2-setup-validate-errormsg",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078999"
+                              },
+                              {
+                                "name": "flake8-py3",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794079087"
+                              },
+                              {
+                                "name": "mypy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794079199"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO4Es=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Q8="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280146"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280146/jobs/2794078040"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2b0=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RA="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
                   },
-                  "oid": "ac42bedc84b7c96256376ad09917263bb020b2c3"
-                }
-              },
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ngimel"
-                    },
-                    "email": "ngimel@fb.com",
-                    "name": "Natalia Gimelshein"
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040614?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040643?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040615?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      }
+                    ]
                   },
-                  "oid": "7f7d5ba40a0b5e10526d90b018b30b54673d12d8"
+                  "pushedDate": "2022-03-09T15:57:16Z",
+                  "oid": "4746da707a9912356f5179625da89616b228dc21"
                 }
-              },
+              }
+            ]
+          },
+          "changedFiles": 1,
+          "files": {
+            "nodes": [
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "37a6b4a8b1adb712d5777c7c3479866c27fb3c4e"
-                }
-              },
+                "path": "tools/build_variables.bzl"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MQ",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [],
+            "pageInfo": {
+              "startCursor": null,
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ngimel"
-                    },
-                    "email": "ngimel@fb.com",
-                    "name": "Natalia Gimelshein"
-                  },
-                  "oid": "65b613868c44e519c1777af79b9fd3498c5a7e58"
-                }
+                "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/malfet/pytorch/blob/4746da707a9912356f5179625da89616b228dc21/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\nAdd ciflow labels to this PR to trigger more builds:\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nlinux-binary-libtorch-cxx11-abi\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-binary-libtorch-pre-cxx11\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-binary-manywheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-rocm4.5-py3.7\nciflow/all, ciflow/default, ciflow/linux, ciflow/rocm, ciflow/trunk\n\u2705 triggered\n\n\nlinux-docs\nciflow/all, ciflow/cpu, ciflow/default, ciflow/docs, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-vulkan-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk, ciflow/vulkan\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-build\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-custom-build-static\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-onnx\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/onnx, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build\nciflow/all, ciflow/cpu, ciflow/default, ciflow/libtorch, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7-no-ops\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nmacos-arm64-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-arm64-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-cxx11-abi\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-pre-cxx11\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwindows-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nwindows-binary-libtorch-debug\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-libtorch-release\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-wheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\ncaffe2-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\ndocker-builds\nciflow/all, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-custom-ops\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-metal\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda10.2-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-docs-push\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-no-ops\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-arm64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-lite-interpreter-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-11-py3-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled, ciflow/slow, ciflow/slow-gradcheck\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.3-py3.7-gcc7-debug\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.5-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-build\nciflow/all, ciflow/android, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\npytorch-xla-linux-bionic-py3.7-clang8\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk, ciflow/xla\n\ud83d\udeab skipped",
+                "createdAt": "2022-03-09T15:57:11Z",
+                "author": {
+                  "login": "pytorch-bot"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1063079053
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "ngimel"
-                    },
-                    "email": "ngimel@fb.com",
-                    "name": "Natalia Gimelshein"
-                  },
-                  "oid": "442c405e9da0d66744ef03e379224c41eedf5b57"
-                }
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/73969\n\ud83d\udcc4 \u00a0Preview docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 4746da7 (more details on the Dr. CI page):\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2022-03-09T15:57:12Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 1063079113
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "031ac49ae9c192989385986b6707fa781e3229e0"
-                }
-              },
+                "bodyText": "This pull request was exported from Phabricator. Differential Revision: D34753911",
+                "createdAt": "2022-03-09T15:57:34Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1063079731
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOP11MjQ==",
+              "hasPreviousPage": false
+            }
+          },
+          "labels": {
+            "edges": [
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "9a6c3b00039c0c985c1c9cb59490012d1c0b38ba"
+                "node": {
+                  "name": "fb-exported"
                 }
               },
               {
-                "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "d5c30e408af1889b90012d2e09f6ec3cda333bcb"
+                "node": {
+                  "name": "cla signed"
                 }
-              },
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAU2F-RA= name=pytorch number=73969 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
+            "nodes": [
               {
                 "commit": {
-                  "author": {
-                    "user": null,
-                    "email": "mruberry@devfair044.h1.fair",
-                    "name": "Mike Ruberry"
-                  },
-                  "oid": "db355d55655bb252a699cd532441bb98e52b98d5"
+                  "oid": "4746da707a9912356f5179625da89616b228dc21",
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280141"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280141/jobs/2794078056"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2c8=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RE="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Test tools"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280142"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280142/jobs/2794078033"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2as=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-cuda11.3-py3.7-gcc7"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280144"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280144/jobs/2794078046"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280144/jobs/2794338293"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280144/jobs/2794338408"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280144/jobs/2794338568"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbUkMA=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RQ="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc7-no-ops"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280148"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280148/jobs/2794078065"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2d8=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RU="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "win-vs2019-cpu-py3"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280149"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280149/jobs/2794078067"
+                              },
+                              {
+                                "name": "test (default, 2, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280149/jobs/2794407041"
+                              },
+                              {
+                                "name": "test (default, 1, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280149/jobs/2794407168"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbWDX8=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RY="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280150"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280150/jobs/2794078029"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aQ=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Rc="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-clang7-asan"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280151"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280151/jobs/2794078062"
+                              },
+                              {
+                                "name": "test (default, 3, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280151/jobs/2794225603"
+                              },
+                              {
+                                "name": "test (default, 1, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280151/jobs/2794225793"
+                              },
+                              {
+                                "name": "test (default, 2, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280151/jobs/2794226005"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbSD-k=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Rk="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                              },
+                              {
+                                "name": "Meta Internal-Only Changes Check",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://opensource.facebook.com/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO574=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Ro="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pytorch-xla-linux-bionic-py3.7-clang8"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280152"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280152/jobs/2794078032"
+                              },
+                              {
+                                "name": "test (xla, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280152/jobs/2794227475"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbSGAM=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Rs="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc5.4"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280160"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794078054"
+                              },
+                              {
+                                "name": "test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794203297"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794203553"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794203717"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794203878"
+                              },
+                              {
+                                "name": "test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794203982"
+                              },
+                              {
+                                "name": "test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280160/jobs/2794204149"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRlJs=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-SU="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  }
                 }
               }
-            ],
-            "pageInfo": {
-              "endCursor": "MjY",
-              "hasNextPage": false
-            },
-            "totalCount": 26
-          },
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAU2F-SU= name=pytorch number=73969 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
           "commits": {
             "nodes": [
               {
                 "commit": {
+                  "oid": "4746da707a9912356f5179625da89616b228dc21",
                   "checkSuites": {
                     "edges": [
                       {
                         "node": {
                           "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-bionic-py3.7-clang9"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280162"
                           },
-                          "workflowRun": null,
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "Facebook CLA Check",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280162/jobs/2794078019"
                               },
                               {
-                                "name": "Meta Internal-Only Changes Check",
+                                "name": "test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://opensource.facebook.com/"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280162/jobs/2794187280"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280162/jobs/2794187423"
+                              },
+                              {
+                                "name": "test (noarch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280162/jobs/2794187582"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6ux14=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRN_c=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC2o="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Sk="
                       },
                       {
                         "node": {
                           "app": {
-                            "name": "Netlify",
-                            "databaseId": 13473
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-clang7-onnx"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280164"
                           },
-                          "workflowRun": null,
                           "checkRuns": {
-                            "nodes": [],
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280164/jobs/2794078039"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280164/jobs/2794213425"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280164/jobs/2794213615"
+                              }
+                            ],
                             "pageInfo": {
-                              "endCursor": null,
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRySo=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": null
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC2w="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-TY="
                       },
                       {
                         "node": {
                           "app": {
-                            "name": "Azure Pipelines",
-                            "databaseId": 9426
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3-clang5-mobile-custom-build-static"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280168"
                           },
-                          "workflowRun": null,
                           "checkRuns": {
-                            "nodes": [],
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280168/jobs/2794078064"
+                              }
+                            ],
                             "pageInfo": {
-                              "endCursor": null,
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2d0=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": null
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC3U="
-                      },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-UI="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  }
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=73099 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "BowenBao"
+          },
+          "title": "[ONNX] Make graph name spec-compliant (#71961)",
+          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* #73104\n* #73103\n* #73102\n* #73101\n* #73100\n* __->__ #73099\n\n[According to the ONNX spec](https://github.com/onnx/onnx/blob/main/docs/IR.md#names-within-a-graph),\nall names must adhere to C90 identifier syntax rules, which means no\ndashes.\n\nFixes: #30952",
+          "headRefName": "gh/BowenBao/138/head",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "gh/BowenBao/138/base",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "BowenBao"
+                    },
+                    "email": "bowbao@microsoft.com",
+                    "name": "BowenBao"
+                  },
+                  "oid": "3038b939eb2069653305c419326a0f47d2598e39"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MQ",
+              "hasNextPage": false
+            },
+            "totalCount": 1
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
                       {
                         "node": {
                           "app": {
-                            "name": "Dependabot",
-                            "databaseId": 29110
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041786"
                           },
-                          "workflowRun": null,
                           "checkRuns": {
-                            "nodes": [],
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041786/jobs/2626264278"
+                              }
+                            ],
                             "pageInfo": {
-                              "endCursor": null,
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNn9o=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": null
+                          "conclusion": "SKIPPED"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC3o="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7k="
                       },
                       {
                         "node": {
                           "app": {
-                            "name": "Codecov",
-                            "databaseId": 254
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-cuda11.3-py3.7-gcc7"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041785"
                           },
-                          "workflowRun": null,
                           "checkRuns": {
-                            "nodes": [],
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626264385"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417658"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417743"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417885"
+                              }
+                            ],
                             "pageInfo": {
-                              "endCursor": null,
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkRE_E=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": null
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC34="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7o="
                       },
                       {
                         "node": {
                           "app": {
-                            "name": "PyTorch Bot",
-                            "databaseId": 40112
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc7-no-ops"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041789"
                           },
-                          "workflowRun": null,
                           "checkRuns": {
-                            "nodes": [],
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041789/jobs/2626264416"
+                              }
+                            ],
                             "pageInfo": {
-                              "endCursor": null,
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJE=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": null
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC4E="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7s="
                       },
                       {
                         "node": {
@@ -30712,26 +18332,26 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                              "name": "linux-xenial-py3-clang5-mobile-build"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2217622865"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041787"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622865/jobs/3270915028"
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041787/jobs/2626264407"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-c8=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoIY=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "SKIPPED"
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDNo="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7w="
                       },
                       {
                         "node": {
@@ -30741,56 +18361,26 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "Lint"
+                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2217622869"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041788"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915027"
-                              },
-                              {
-                                "name": "lintrunner",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915071"
-                              },
-                              {
-                                "name": "Test tools",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915141"
-                              },
-                              {
-                                "name": "Test collect_env (with_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915194"
-                              },
-                              {
-                                "name": "Test collect_env (without_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915229"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915283"
-                              },
-                              {
-                                "name": "workflow-checks",
+                                "name": "build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915321"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041788/jobs/2626264422"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-zM=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJs=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDOY="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS74="
                       },
                       {
                         "node": {
@@ -30800,760 +18390,1593 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "pull"
+                              "name": "linux-bionic-py3.7-clang9"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2217622878"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041790"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927344"
-                              },
-                              {
-                                "name": "linux-bionic-rocm5.0-py3.7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927442"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927507"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927567"
-                              },
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927674"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927727"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927802"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927853"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927948"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927996"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928061"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928116"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928198"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928256"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928291"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928317"
-                              },
-                              {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928338"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928367"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928410"
-                              },
-                              {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928445"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991071"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991125"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991162"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991195"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991233"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991261"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (cpp)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991305"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (python)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991349"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996024"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996068"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996092"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996505"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270998987"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270999027"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271006886"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271006941"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018097"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626264414"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
+                                "name": "test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018135"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349405"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
+                                "name": "test (noarch, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018162"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349522"
                               },
                               {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271021143"
-                              },
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349618"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiwA=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS78="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-vulkan-bionic-py3.7-clang9"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041793"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-bionic-rocm5.0-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271034041"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041793/jobs/2626264431"
                               },
                               {
-                                "name": "linux-bionic-rocm5.0-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "name": "test (default, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271034072"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041793/jobs/2626359364"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPxgQ=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8A="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3-clang5-mobile-custom-build-static"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041792"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271048218"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041792/jobs/2626264427"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoKA=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8I="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "win-vs2019-cpu-py3"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041791"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049553"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626264386"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "name": "test (default, 1, 2, windows.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049587"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626722677"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "name": "test (default, 2, 2, windows.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049616"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626722710"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkX070=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8M="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc7"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041803"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "name": "build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271068293"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626264401"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                                "name": "test (distributed, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271068336"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349045"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "name": "test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271149276"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349141"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "name": "test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271149321"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349272"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6jVK8=",
-                              "hasNextPage": true
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiQA=",
+                              "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDQA="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8Q="
                       }
                     ],
                     "pageInfo": {
-                      "hasNextPage": false
+                      "hasNextPage": true
                     }
                   },
-                  "status": null,
-                  "pushedDate": "2022-04-25T02:30:31Z",
-                  "oid": "db355d55655bb252a699cd532441bb98e52b98d5"
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010288?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010289?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010488?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010326?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      }
+                    ]
+                  },
+                  "pushedDate": "2022-02-18T18:46:28Z",
+                  "oid": "3038b939eb2069653305c419326a0f47d2598e39"
                 }
               }
             ]
           },
-          "changedFiles": 5,
+          "changedFiles": 162,
           "files": {
             "nodes": [
               {
-                "path": "test/test_ops.py"
+                "path": "test/onnx/expect/TestOperators.test_acos.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_add_broadcast.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_add_left_broadcast.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_add_size1_broadcast.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_addconstant.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_addmm.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_arange_dynamic.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_argmax.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_asin.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_at_op.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_atan.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_aten_embedding_1.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_aten_embedding_2.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_avg_pool2d.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_baddbmm.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_basic.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_batchnorm.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_batchnorm_1d.expect"
               },
               {
-                "path": "torch/_prims/__init__.py"
+                "path": "test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect"
               },
               {
-                "path": "torch/_prims/utils.py"
+                "path": "test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect"
               },
               {
-                "path": "torch/_refs/__init__.py"
+                "path": "test/onnx/expect/TestOperators.test_batchnorm_training.expect"
               },
               {
-                "path": "torch/testing/_internal/common_methods_invocations.py"
+                "path": "test/onnx/expect/TestOperators.test_bitshift.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_c2_op.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_chunk.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_clip.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_clip_max.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_clip_min.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_concat2.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_conv.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_convtranspose.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_cos.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_cumsum.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_det.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dict.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dict_str.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dim.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dropout.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dropout_default.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dropout_opset12.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dropout_training.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dropout_training_opset12.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_elu.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_embedding_bags.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_empty_like.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_empty_like_opset7.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_equal.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_erf.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_exp.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_expand.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_flatten.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_flatten2D.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_fmod.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_frobenius_norm.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_full.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_full_like.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_gather.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_gather_opset11.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_ge.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_gelu.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_gt.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_hardtanh.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_implicit_expand.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_index.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_isnan.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_layer_norm_aten.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_le.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_linear.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_log_sigmoid.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_logsoftmax.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_lt.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_master_opset.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_max.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_maxpool.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_maxpool_dilations.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_maxpool_indices.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_mean.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_mean_dtype.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_meshgrid.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_min.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_mm.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_narrow.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_ne.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_nonzero.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_norm_p1.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_norm_p2.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_ones_like.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_pad.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_params.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_params_onnx_irv4.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_permute2.expect"
               }
             ],
             "pageInfo": {
-              "endCursor": "NQ",
-              "hasNextPage": false
+              "endCursor": "MTAw",
+              "hasNextPage": true
             }
           },
           "reviews": {
             "nodes": [
               {
                 "author": {
-                  "login": "lezcano"
+                  "login": "garymm"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMi0xOFQxNzoxODo0NC0wODowMLkyMDIyLTAyLTE4VDE3OjE4OjQ0LTA4OjAwzjTr0H0=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "This PR cannot be merged by bot due to changing > 100 files. @malfet \n  \n    \n      pytorch/.github/scripts/trymerge.py\n    \n    \n         Line 63\n      in\n      932adf2\n    \n  \n  \n    \n\n        \n          \n                 files(last: 100) { \n        \n    \n  \n\n Can this be relaxed? If not please import.",
+                "createdAt": "2022-02-22T18:22:40Z",
+                "author": {
+                  "login": "BowenBao"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1048084569
               },
               {
+                "bodyText": "This PR cannot be merged by bot due to changing > 100 files. @malfet\nCan this be relaxed? If not please import.\n\nWow, you've hit a really interesting problem. 100 is a limitation enforced by GitHub, see https://docs.github.com/en/graphql/overview/resource-limitations, but I can implement a pagination. Do you mind keeping it like that for a bit, want to land a fix soonish.",
+                "createdAt": "2022-02-22T18:27:29Z",
                 "author": {
-                  "login": "mruberry"
+                  "login": "malfet"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1048088691
               },
               {
+                "bodyText": "@malfet Thank you for info. Sure, I have separated the rest of stack from this one, we'll wait for the fix to try again.",
+                "createdAt": "2022-02-22T18:29:48Z",
                 "author": {
-                  "login": "mruberry"
+                  "login": "BowenBao"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1048090640
               },
               {
+                "bodyText": "@pytorchbot merge this",
+                "createdAt": "2022-02-24T21:42:36Z",
                 "author": {
-                  "login": "mruberry"
+                  "login": "BowenBao"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1050293881
               },
               {
+                "bodyText": "Hey @BowenBao.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-02-24T21:44:39Z",
                 "author": {
-                  "login": "mruberry"
+                  "login": "github-actions"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1050295451
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOPniAWQ==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "oncall: jit"
+                }
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "node": {
+                  "name": "open source"
+                }
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "node": {
+                  "name": "cla signed"
+                }
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "node": {
+                  "name": "release notes: onnx"
+                }
               },
               {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
+                "node": {
+                  "name": "topic: bug fixes"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MTAw name=pytorch number=73099 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "files": {
+            "nodes": [
+              {
+                "path": "test/onnx/expect/TestOperators.test_pixel_shuffle.expect"
               },
               {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_pow.expect"
               },
               {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_prelu.expect"
               },
               {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_prod.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_prod_dtype.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_rand.expect"
               },
               {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_randn.expect"
               },
               {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_reduce_sum_negative_indices.expect"
               },
               {
-                "author": {
-                  "login": "ngimel"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_reduced_mean.expect"
               },
               {
-                "author": {
-                  "login": "ngimel"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_reduced_mean_dtype.expect"
               },
               {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_reduced_prod.expect"
               },
               {
-                "author": {
-                  "login": "zou3519"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_reduced_prod_dtype.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect"
               },
               {
-                "author": {
-                  "login": "peterbell10"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_reduced_sum.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_reduced_sum_dtype.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_reducemax.expect"
               },
               {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_reducemin.expect"
               },
               {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_remainder.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_repeat.expect"
               },
               {
-                "author": {
-                  "login": "ngimel"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_round.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_rrelu.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_rsqrt.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_rsub.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_scatter_add.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_scatter_add_opset11.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_selu.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_shape_value_map.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_sign.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_sin.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_slice.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_slice_dynamic.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_ignore_index.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_split.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_split_with_sizes.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_sqrt.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_std.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_sum.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_sum_dtype.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_tan.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_topk.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_topk_smallest_unsorted.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_transpose.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_type_as.expect"
               },
               {
-                "author": {
-                  "login": "ngimel"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_unfold.expect"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_unique.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_unsqueeze.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_upsample_nearest_size.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_view.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_view_flatten.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "test/onnx/expect/TestOperators.test_zeros_like.expect"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "torch/csrc/jit/serialization/export.cpp"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "torch/csrc/jit/serialization/export.h"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTYy",
+              "hasNextPage": false
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=94146 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "voznesenskym"
+          },
+          "title": "Add benchmarks.py to run all benchmarks, add new file with all torchbench model names",
+          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):\n* __->__ #94146\n\n\n\ncc @mlazos @soumith @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx @desertfire",
+          "headRefName": "gh/voznesenskym/48/head",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "gh/voznesenskym/48/base",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "voznesenskym"
+                    },
+                    "email": "voznesenskym@gmail.com",
+                    "name": "Michael Voznesensky"
+                  },
+                  "oid": "fdc6de58a67f0a1544441700ca2b6d3eea3d7265"
+                }
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "voznesenskym"
+                    },
+                    "email": "voznesenskym@gmail.com",
+                    "name": "Michael Voznesensky"
+                  },
+                  "oid": "05820041836f94d9b0b58c1cd2e8e676897486ed"
+                }
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "voznesenskym"
+                    },
+                    "email": "voznesenskym@gmail.com",
+                    "name": "Michael Voznesensky"
+                  },
+                  "oid": "307120d6d3f7fcc3f92cfd26be891d360ad6a92a"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mw",
+              "hasNextPage": false
+            },
+            "totalCount": 3
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Meta Internal-Only Changes Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://opensource.facebook.com/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAApotJds=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAoX7JZo="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Labeler"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4117580328"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "triage",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580328/jobs/7109050767"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAApotKI8=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAoX7JgI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Check Labels"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4117580490"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Check labels",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580490/jobs/7109051146"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAApotKo8=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAoX7Jqo="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4117580484"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580484/jobs/7109051128"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580484/jobs/7109051412"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580484/jobs/7109051633"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580484/jobs/7109051825"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580484/jobs/7109052043"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580484/jobs/7109052171"
+                              },
+                              {
+                                "name": "Test collect_env (older_python_version)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580484/jobs/7109052311"
+                              },
+                              {
+                                "name": "lintrunner",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580484/jobs/7109052470"
+                              },
+                              {
+                                "name": "pr-sanity-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580484/jobs/7109052591"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAApotMiY=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAoX7Jq0="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4117580496"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580496/jobs/7109051218"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAApotKuk=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAoX7Jq4="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4117580543"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "linux-vulkan-bionic-py3.11-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109051516"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109051774"
+                              },
+                              {
+                                "name": "linux-bionic-py3.11-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109051945"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109052100"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7-pch / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109052238"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109052396"
+                              },
+                              {
+                                "name": "linux-bionic-py3_8-clang8-xla / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109052565"
+                              },
+                              {
+                                "name": "linux-focal-py3-clang7-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109052688"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109052812"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-clang10-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109052987"
+                              },
+                              {
+                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109053154"
+                              },
+                              {
+                                "name": "linux-jammy-cuda11.7-cudnn8-py3.8-clang12 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109053345"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.7-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109053509"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109053667"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109053856"
+                              },
+                              {
+                                "name": "linux-focal-py3-clang7-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109054063"
+                              },
+                              {
+                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109054232"
+                              },
+                              {
+                                "name": "linux-focal-rocm5.4.2-py3.8 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109054387"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109054522"
+                              },
+                              {
+                                "name": "linux-focal-py3.9-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109054720"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7-sm86 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109054850"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109226581"
+                              },
+                              {
+                                "name": "linux-bionic-py3.11-clang9 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109227335"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.11-clang9 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109229723"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109232328"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109232500"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109232642"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109232812"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109232971"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109233112"
+                              },
+                              {
+                                "name": "linux-bionic-py3.8-clang9 / test (functorch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109233226"
+                              },
+                              {
+                                "name": "linux-bionic-py3.11-clang9 / test (smoke, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109233581"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.11-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109235597"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-clang10-onnx / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109236990"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109243124"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109243245"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109248093"
+                              },
+                              {
+                                "name": "linux-docs / build-docs-cpp-false",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109248230"
+                              },
+                              {
+                                "name": "linux-docs / build-docs-python-false",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109248395"
+                              },
+                              {
+                                "name": "linux-docs / build-docs-functorch-false",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109248579"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109254734"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109255047"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (distributed, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109255258"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (distributed, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109255408"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109255603"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109255755"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109255917"
+                              },
+                              {
+                                "name": "linux-focal-py3.8-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109256077"
+                              },
+                              {
+                                "name": "linux-focal-py3.9-clang7-asan / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109318155"
+                              },
+                              {
+                                "name": "linux-focal-py3.9-clang7-asan / test (default, 1, 5, linux.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117580543/jobs/7109324085"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAApozDL8=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAoX7Jt0="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "inductor"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4117581803"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm80 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117581803/jobs/7109054078"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117581803/jobs/7109054225"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm80 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117581803/jobs/7109383782"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117581803/jobs/7109388657"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm80 / test (inductor_torchbench_smoketest_perf, 1, 1, linux.gcp.a100)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117581803/jobs/7109389546"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / test (inductor, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117581803/jobs/7109396942"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / test (inductor_huggingface, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117581803/jobs/7109397127"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / test (inductor_timm, 1, 2, linux.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117581803/jobs/7109397286"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / test (inductor_timm, 2, 2, linux.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117581803/jobs/7109397449"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / test (inductor_torchbench, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117581803/jobs/7109397660"
+                              },
+                              {
+                                "name": "cuda11.7-py3.10-gcc7-sm86 / test (inductor_distributed, 1, 1, linux.g5.12xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4117581803/jobs/7109397898"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAApo0pos=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAoX7LI0="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Check Labels"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4118244339"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Check labels",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4118244339/jobs/7110535231"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAppMOus=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAoYV920="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "windows-binary-libtorch-release"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4118245342"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-release-build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4118245342/jobs/7110537241"
+                              },
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-release-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4118245342/jobs/7111588299"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAApph-Pc=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAoYWAS4="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "windows-binary-libtorch-debug"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4118245343"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-debug-build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4118245343/jobs/7110537315"
+                              },
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-debug-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4118245343/jobs/7112221106"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAppvIsc=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAoYWATM="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  },
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "EasyCLA",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://easycla.lfx.linuxfoundation.org/#/?version=2"
+                      }
+                    ]
+                  },
+                  "pushedDate": null,
+                  "oid": "307120d6d3f7fcc3f92cfd26be891d360ad6a92a"
+                }
+              }
+            ]
+          },
+          "changedFiles": 6,
+          "files": {
+            "nodes": [
+              {
+                "path": "benchmarks/dynamo/all_torchbench_models_list.txt"
               },
               {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
+                "path": "benchmarks/dynamo/benchmarks.py"
               },
               {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
+                "path": "benchmarks/dynamo/huggingface.py"
               },
               {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
+                "path": "benchmarks/dynamo/run_all.sh"
               },
               {
-                "author": {
-                  "login": "lezcano"
-                },
-                "state": "COMMENTED"
+                "path": "benchmarks/dynamo/timm_models.py"
               },
               {
-                "author": {
-                  "login": "ezyang"
-                },
-                "state": "COMMENTED"
-              },
+                "path": "benchmarks/dynamo/torchbench.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Ng",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
               {
                 "author": {
                   "login": "ezyang"
@@ -31566,115 +19989,87 @@
                 },
                 "state": "COMMENTED"
               },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
               {
                 "author": {
                   "login": "ezyang"
                 },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "mruberry"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "ngimel"
-                },
                 "state": "APPROVED"
               },
               {
                 "author": {
-                  "login": "ezyang"
+                  "login": "voznesenskym"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "mruberry"
+                  "login": "ezyang"
                 },
                 "state": "COMMENTED"
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0wNlQxMjo1NjoyNC0wNzowMLkyMDIyLTA0LTA2VDA4OjQwOjM4LTA3OjAwzjenO6Y=",
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMy0wMi0wNFQxOTozOTo0NS0wODowMLkyMDIzLTAyLTA0VDE5OjM5OjQ1LTA4OjAwzkyKd3I=",
               "hasPreviousPage": false
             }
           },
           "comments": {
             "nodes": [
               {
-                "bodyText": "Ref implementations by themselves can handle any shapes (and broadcast ops by themselves don't bake in any shapes). The question is can we decide if a particular trace is applicable for a different input, but that depends on the tracing technology and what we are caching on, so out of scope for initial PR.",
-                "createdAt": "2022-04-21T19:00:28Z",
+                "bodyText": "Ok, so following graphql:\nquery {\n  repository(owner: \"pytorch\", name: \"pytorch\") {\n    pullRequest(number: 94146) {\n      commits(last:1) {\n        nodes {\n          commit {\n            oid\n            committedDate\n            pushedDate\n          }\n        }\n      }\n    }\n  }\n}\nreturns\n{\n  \"data\": {\n    \"repository\": {\n      \"pullRequest\": {\n        \"commits\": {\n          \"nodes\": [\n            {\n              \"commit\": {\n                \"oid\": \"307120d6d3f7fcc3f92cfd26be891d360ad6a92a\",\n                \"committedDate\": \"2023-02-07T19:37:26Z\",\n                \"pushedDate\": null\n              }\n            }\n          ]\n        }\n      }\n    }\n  }\n}",
+                "createdAt": "2023-02-07T23:37:08Z",
                 "author": {
-                  "login": "ngimel"
+                  "login": "malfet"
                 },
                 "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1105643418
+                "editor": {
+                  "login": "malfet"
+                },
+                "databaseId": 1421647117
               },
               {
-                "bodyText": "@pytorchbot merge this please",
-                "createdAt": "2022-04-25T04:42:29Z",
+                "bodyText": "#91134 looks sus\n\nI though the same, but no, that is not the case",
+                "createdAt": "2023-02-08T00:02:44Z",
                 "author": {
-                  "login": "mruberry"
+                  "login": "malfet"
                 },
-                "authorAssociation": "COLLABORATOR",
+                "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1108072887
+                "databaseId": 1421670890
               },
               {
-                "bodyText": "Merge failed due to 'mruberry'\nRaised by https://github.com/pytorch/pytorch/actions/runs/2218044244",
-                "createdAt": "2022-04-25T04:43:54Z",
+                "bodyText": "@malfet what shall we do?",
+                "createdAt": "2023-02-08T00:26:33Z",
                 "author": {
-                  "login": "pytorchmergebot"
+                  "login": "voznesenskym"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1108073536
+                "databaseId": 1421695330
               },
               {
-                "bodyText": "@mruberry has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
-                "createdAt": "2022-04-25T04:51:11Z",
+                "bodyText": "@pytorchbot merge -f \"Hopefully this avoid recency check\"",
+                "createdAt": "2023-02-08T01:16:51Z",
                 "author": {
-                  "login": "facebook-github-bot"
+                  "login": "malfet"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1108075965
+                "databaseId": 1421754796
               },
               {
-                "bodyText": "Hey @mruberry.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-04-25T09:57:56Z",
+                "bodyText": "Merge started\nYour change will be merged immediately since you used the force (-f) flag, bypassing any CI checks (ETA: 1-5 minutes).\nLearn more about merging in the wiki.\nQuestions? Feedback? Please reach out to the PyTorch DevX TeamAdvanced Debugging\nCheck the merge workflow status\nhere",
+                "createdAt": "2023-02-08T01:18:34Z",
                 "author": {
-                  "login": "github-actions"
+                  "login": "pytorchmergebot"
                 },
-                "authorAssociation": "NONE",
+                "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1108351107
+                "databaseId": 1421759377
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOQebHmg==",
+              "startCursor": "Y3Vyc29yOnYyOpHOVLydDQ==",
               "hasPreviousPage": true
             }
           },
@@ -31682,7 +20077,12 @@
             "edges": [
               {
                 "node": {
-                  "name": "cla signed"
+                  "name": "Merged"
+                }
+              },
+              {
+                "node": {
+                  "name": "ciflow/trunk"
                 }
               },
               {
@@ -31692,7 +20092,12 @@
               },
               {
                 "node": {
-                  "name": "module: primTorch"
+                  "name": "module: dynamo"
+                }
+              },
+              {
+                "node": {
+                  "name": "ciflow/inductor"
                 }
               }
             ]
@@ -31701,22 +20106,22 @@
       }
     }
   },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=77700 owner=pytorch": {
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=90791 owner=pytorch": {
     "data": {
       "repository": {
         "pullRequest": {
           "closed": true,
           "isCrossRepository": false,
           "author": {
-            "login": "kit1980"
+            "login": "bdhirsh"
           },
-          "title": "Move pull linux-docs job to Ubuntu 20.04",
-          "body": "",
-          "headRefName": "sdym/pull-xenial-focal-linux-docs",
+          "title": "functionalization: check for undefined tensors in advanced indexing",
+          "body": "cc @wonjoolee95 - XLA folks were seeing an advanced indexing issue with undefined tensors.\r\n\r\nIt looks like running code like `a[:, tensor_idx] = b` can results in:\r\n\r\n(1) calling `index_put_()`\r\n(2) passing (potential undefined) tensors as the indices to index_put_().\r\n\r\n\r\nStack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):\n* #91001\n* __->__ #90791\n* #90722\n\r\n",
+          "headRefName": "gh/bdhirsh/356/head",
           "headRepository": {
             "nameWithOwner": "pytorch/pytorch"
           },
-          "baseRefName": "master",
+          "baseRefName": "gh/bdhirsh/356/base",
           "baseRepository": {
             "nameWithOwner": "pytorch/pytorch",
             "isPrivate": false,
@@ -31731,20 +20136,68 @@
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kit1980"
+                      "login": "bdhirsh"
                     },
-                    "email": "sdym@fb.com",
-                    "name": "Sergii Dymchenko"
+                    "email": "hirsheybar@meta.com",
+                    "name": "Brian Hirsh"
                   },
-                  "oid": "81261599614423baa17df72300b8e109677b6799"
+                  "oid": "c9e8e71b8ba2ba62bfac29900e71dde3ab6589cb"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@meta.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "ed3eff87d5cc76ce6d8e5f1db901be21acc86cb6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@meta.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "00ca22160d89060815e2be50e52f462f811c1087"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@meta.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "b00e14c4a90e33721a406772bf548fbfffb065d4"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@meta.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "70711ab89515aa4515ce60d3c29a04dbdba8e06e"
                 }
               }
             ],
             "pageInfo": {
-              "endCursor": "MQ",
+              "endCursor": "NQ",
               "hasNextPage": false
             },
-            "totalCount": 1
+            "totalCount": 5
           },
           "commits": {
             "nodes": [
@@ -31762,19 +20215,19 @@
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "Facebook CLA Check",
+                                "name": "Meta Internal-Only Changes Check",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.facebook.com/cla/"
+                                "detailsUrl": "https://opensource.facebook.com/"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNmNqE=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyP3Pw=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuMI="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68rl0="
                       },
                       {
                         "node": {
@@ -31792,7 +20245,7 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuM4="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68rn4="
                       },
                       {
                         "node": {
@@ -31810,7 +20263,31 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuNU="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68rpY="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "CircleCI Checks",
+                            "databaseId": 18001
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://circleci.com/workflow-run/0456c68a-2cb2-4b5c-beff-42ff31937439?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-checks-link&utm_content=bottom"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyP7Hg=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68rrI="
                       },
                       {
                         "node": {
@@ -31828,7 +20305,7 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuOI="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68rtI="
                       },
                       {
                         "node": {
@@ -31846,7 +20323,7 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuPI="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68ruk="
                       },
                       {
                         "node": {
@@ -31864,7 +20341,7 @@
                           },
                           "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuQQ="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk68rv8="
                       },
                       {
                         "node": {
@@ -31874,56 +20351,26 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "Lint"
+                              "name": "Check Labels"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2348867841"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3714206640"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "lintrunner",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528127876"
-                              },
-                              {
-                                "name": "workflow-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128023"
-                              },
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128196"
-                              },
-                              {
-                                "name": "Test collect_env (with_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128519"
-                              },
-                              {
-                                "name": "Test collect_env (without_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128575"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128663"
-                              },
-                              {
-                                "name": "Test tools",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128857"
+                                "name": "Check labels",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206640/jobs/6297806113"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdYVY=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyP7rU=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "SUCCESS"
+                          "conclusion": "FAILURE"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuzA="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk684e0="
                       },
                       {
                         "node": {
@@ -31935,123 +20382,240 @@
                             "workflow": {
                               "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2348867843"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3714206646"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
                                 "name": "run-torchbench",
                                 "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867843/jobs/3528127882"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206646/jobs/6297806176"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdXEg=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyP7vk=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuzg="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "docker-builds"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2348867844"
-                          },
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "docker-build (pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528127883"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528127945"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128001"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-bionic-py3.7-clang9)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128067"
-                              },
-                              {
-                                "name": "docker-build (pytorch-linux-bionic-rocm5.0-py3.7)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128124"
-                              },
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk684fY="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3714206650"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "docker-build (pytorch-linux-bionic-rocm5.1-py3.7)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128191"
+                                "name": "lintrunner",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297806783"
                               },
                               {
-                                "name": "docker-build (pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128259"
+                                "name": "Test tools",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297806967"
                               },
                               {
-                                "name": "docker-build (pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7)",
+                                "name": "pr-sanity-checks",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128321"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297807120"
                               },
                               {
-                                "name": "docker-build (pytorch-linux-xenial-py3-clang5-android-ndk-r19c)",
+                                "name": "workflow-checks",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128365"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297807302"
                               },
                               {
-                                "name": "docker-build (pytorch-linux-xenial-py3-clang5-asan)",
+                                "name": "toc",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128446"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297807451"
                               },
                               {
-                                "name": "docker-build (pytorch-linux-xenial-py3-clang7-asan)",
+                                "name": "quick-checks",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128507"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297807633"
                               },
                               {
-                                "name": "docker-build (pytorch-linux-xenial-py3-clang7-onnx)",
+                                "name": "Test collect_env (with_torch)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128563"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297807764"
                               },
                               {
-                                "name": "docker-build (pytorch-linux-xenial-py3.7-gcc5.4)",
+                                "name": "Test collect_env (without_torch)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128639"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297807891"
                               },
                               {
-                                "name": "docker-build (pytorch-linux-xenial-py3.7-gcc7)",
+                                "name": "Test collect_env (older_python_version)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128687"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206650/jobs/6297808026"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyP-Fs=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "FAILURE"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk684gc="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  },
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "EasyCLA",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://easycla.lfx.linuxfoundation.org/#/?version=2"
+                      }
+                    ]
+                  },
+                  "pushedDate": "2022-12-16T15:04:35Z",
+                  "oid": "70711ab89515aa4515ce60d3c29a04dbdba8e06e"
+                }
+              }
+            ]
+          },
+          "changedFiles": 2,
+          "files": {
+            "nodes": [
+              {
+                "path": "aten/src/ATen/templates/RegisterFunctionalization.cpp"
+              },
+              {
+                "path": "test/test_functionalization.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mg",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0xMi0xM1QxNzo0NTo1Ny0wODowMLkyMDIyLTEyLTEzVDE3OjQ1OjU3LTA4OjAwzkiEx9E=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "\ud83d\udd17 Helpful Links\n\ud83e\uddea See artifacts and rendered test results at hud.pytorch.org/pr/90791\n\n\ud83d\udcc4 Preview Python docs built from this PR\n\ud83d\udcc4 Preview C++ docs built from this PR\n\u2753 Need help or want to give feedback on the CI? Visit the bot commands wiki or our office hours\n\nNote: Links to docs will display an error until the docs builds have been completed.\n\u274c 2 Failures\nAs of commit 70711ab:\nNEW FAILURES - The following jobs have failed:\n\nlintrunner\nTest tools\n\n\nThis comment was automatically generated by Dr. CI and updates every 15 minutes.",
+                "createdAt": "2022-12-13T20:48:29Z",
+                "author": {
+                  "login": "pytorch-bot"
+                },
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "pytorch-bot"
+                },
+                "databaseId": 1349670291
+              },
+              {
+                "bodyText": "@pytorchbot merge -f \"lint tests are flaky\"",
+                "createdAt": "2022-12-19T16:09:30Z",
+                "author": {
+                  "login": "bdhirsh"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1357898146
+              },
+              {
+                "bodyText": "Merge started\nYour change will be merged immediately since you used the force (-f) flag, bypassing any CI checks (ETA: 1-5 minutes).\nLearn more about merging in the wiki.\nQuestions? Feedback? Please reach out to the PyTorch DevX TeamAdvanced Debugging\nCheck the merge workflow status\nhere",
+                "createdAt": "2022-12-19T16:11:00Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1357900127
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOUHJVkw==",
+              "hasPreviousPage": false
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "Merged"
+                }
+              },
+              {
+                "node": {
+                  "name": "release notes: composability"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAk684gc= name=pytorch number=90791 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "oid": "70711ab89515aa4515ce60d3c29a04dbdba8e06e",
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Labeler"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3714206652"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "docker-build (pytorch-linux-focal-py3.7-gcc7)",
+                                "name": "triage",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128741"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206652/jobs/6297806231"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdYLI=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyP7z0=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduu0A="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk684gk="
                       },
                       {
                         "node": {
@@ -32063,1180 +20627,2701 @@
                             "workflow": {
                               "name": "pull"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2348867849"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3714206658"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
                                 "name": "linux-bionic-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528150762"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297806627"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7 / build",
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528150903"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297806814"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7-bazel-test / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151086"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297807002"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151258"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297807233"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "name": "linux-focal-py3-clang7-mobile-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151511"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297807392"
                               },
                               {
-                                "name": "linux-bionic-rocm5.1-py3.7 / build",
+                                "name": "linux-focal-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151776"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297807527"
                               },
                               {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "name": "win-vs2019-cpu-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151896"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297807706"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "name": "win-vs2019-cuda11.6-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152014"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297807915"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "name": "linux-focal-py3-clang7-mobile-custom-build-static / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152139"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297808137"
                               },
                               {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "name": "linux-bionic-py3_7-clang8-xla / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297808315"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297808528"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152216"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297808733"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "name": "linux-focal-py3.7-gcc7-pch / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152378"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297808911"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "name": "linux-focal-py3.7-clang7-asan / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152516"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297809658"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "name": "linux-focal-py3.7-gcc7-no-ops / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152599"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297809822"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152723"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297809996"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "name": "linux-focal-rocm5.3-py3.8 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152802"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297810168"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152913"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297810328"
                               },
                               {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152969"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6297810479"
                               },
                               {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "name": "linux-bionic-py3.7-clang9 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153005"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298023287"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "name": "linux-focal-py3.7-gcc7 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153062"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298028658"
                               },
                               {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
+                                "name": "linux-docs / build-docs-cpp-false",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153125"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298028841"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / build",
+                                "name": "linux-docs / build-docs-python-false",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153207"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298028976"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-docs / build-docs-functorch-false",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528242483"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298029091"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528242528"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298030237"
                               },
                               {
                                 "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245875"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298030451"
                               },
                               {
                                 "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245914"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298030577"
                               },
                               {
                                 "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245964"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298030712"
                               },
                               {
                                 "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528246008"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298030845"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528248520"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298030983"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528255086"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298031137"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528255128"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298031279"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-clang10-onnx / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274064"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298033927"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274097"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298035896"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274133"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036008"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274173"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036149"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274209"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036286"
                               },
                               {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528277014"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036389"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528308958"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036502"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309747"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036635"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309810"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036767"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309837"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298036993"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309864"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298040119"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309895"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298040269"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
+                                "name": "linux-focal-py3.7-clang7-asan / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309925"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298109574"
                               },
                               {
-                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528310044"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298116983"
                               },
                               {
-                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528310101"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298117143"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384337"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298117258"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384379"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298117401"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384408"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298117536"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyWETY=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk684iI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Check Labels"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3716423635"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "name": "Check labels",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384441"
-                              },
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3716423635/jobs/6302732322"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlzyfKM=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAk8UBDA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Check Labels"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3733139393"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "name": "Check labels",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384471"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3733139393/jobs/6333531377"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNi1Nc=",
-                              "hasNextPage": true
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAl8pm1U=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAlEdVYM="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  }
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAlyWETY= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAk684gk= name=pytorch number=90791 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "oid": "70711ab89515aa4515ce60d3c29a04dbdba8e06e",
+                  "checkSuites": {
+                    "nodes": [
+                      {
+                        "checkRuns": {
+                          "nodes": [
+                            {
+                              "name": "linux-focal-py3.7-clang7-asan / test (functorch, 1, 1, linux.2xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298117670"
+                            },
+                            {
+                              "name": "linux-bionic-py3_7-clang8-xla / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298123873"
+                            },
+                            {
+                              "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298130231"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298216660"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298218524"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298223405"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298223604"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (functorch, 1, 1, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298223779"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225106"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225234"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225373"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225516"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225636"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 2, 3, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225752"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 3, 3, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298225878"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298226024"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3714206658/jobs/6298226177"
                             }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduu1E="
+                          ],
+                          "pageInfo": {
+                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAlyYNZQ=",
+                            "hasNextPage": false
+                          }
+                        }
                       }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": false
-                    }
-                  },
-                  "status": null,
-                  "pushedDate": "2022-05-19T00:02:11Z",
-                  "oid": "81261599614423baa17df72300b8e109677b6799"
+                    ]
+                  }
                 }
               }
-            ]
-          },
-          "changedFiles": 3,
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=None name=metamates org=pytorch": {
+    "data": {
+      "organization": {
+        "team": {
+          "members": {
+            "nodes": [
+              {
+                "login": "dreiss"
+              },
+              {
+                "login": "kumpera"
+              },
+              {
+                "login": "zpao"
+              },
+              {
+                "login": "ezyang"
+              },
+              {
+                "login": "jad"
+              },
+              {
+                "login": "swolchok"
+              },
+              {
+                "login": "hyuen"
+              },
+              {
+                "login": "orionr"
+              },
+              {
+                "login": "dhruvbird"
+              },
+              {
+                "login": "likethesky"
+              },
+              {
+                "login": "lw"
+              },
+              {
+                "login": "raziel"
+              },
+              {
+                "login": "simpkins"
+              },
+              {
+                "login": "ebyrne"
+              },
+              {
+                "login": "Babar"
+              },
+              {
+                "login": "kostmo"
+              },
+              {
+                "login": "bhosmer"
+              },
+              {
+                "login": "digantdesai"
+              },
+              {
+                "login": "zdevito"
+              },
+              {
+                "login": "bugra"
+              },
+              {
+                "login": "kunalb"
+              },
+              {
+                "login": "kit1980"
+              },
+              {
+                "login": "shoumikhin"
+              },
+              {
+                "login": "huydhn"
+              },
+              {
+                "login": "teytaud"
+              },
+              {
+                "login": "xuzhao9"
+              },
+              {
+                "login": "jansel"
+              },
+              {
+                "login": "abhinavarora"
+              },
+              {
+                "login": "djthorne"
+              },
+              {
+                "login": "Mortimerp9"
+              },
+              {
+                "login": "dadkins20"
+              },
+              {
+                "login": "colesbury"
+              },
+              {
+                "login": "laurencer"
+              },
+              {
+                "login": "nickgg"
+              },
+              {
+                "login": "yzhao30"
+              },
+              {
+                "login": "rmaz"
+              },
+              {
+                "login": "bearzx"
+              },
+              {
+                "login": "mattjgalloway"
+              },
+              {
+                "login": "chenyang78"
+              },
+              {
+                "login": "yns88"
+              },
+              {
+                "login": "lc0"
+              },
+              {
+                "login": "michaelay"
+              },
+              {
+                "login": "wenleix"
+              },
+              {
+                "login": "jingsh"
+              },
+              {
+                "login": "mthrok"
+              },
+              {
+                "login": "drdarshan"
+              },
+              {
+                "login": "jamiemccrindle"
+              },
+              {
+                "login": "kazhang"
+              },
+              {
+                "login": "simonhollis"
+              },
+              {
+                "login": "govardhan"
+              },
+              {
+                "login": "yinghai"
+              },
+              {
+                "login": "zyan0"
+              },
+              {
+                "login": "ajtulloch"
+              },
+              {
+                "login": "smeenai"
+              },
+              {
+                "login": "vtlam"
+              },
+              {
+                "login": "khabinov"
+              },
+              {
+                "login": "NicolasHug"
+              },
+              {
+                "login": "jfix71"
+              },
+              {
+                "login": "atuljangra"
+              },
+              {
+                "login": "rshraga"
+              },
+              {
+                "login": "idning"
+              },
+              {
+                "login": "soumith"
+              },
+              {
+                "login": "nimin98"
+              },
+              {
+                "login": "chaekit"
+              },
+              {
+                "login": "xunnanxu"
+              },
+              {
+                "login": "mergennachin"
+              },
+              {
+                "login": "javier-m"
+              },
+              {
+                "login": "mostafaelhoushi"
+              },
+              {
+                "login": "brianjo"
+              },
+              {
+                "login": "suo"
+              },
+              {
+                "login": "vkuzo"
+              },
+              {
+                "login": "seemethere"
+              },
+              {
+                "login": "cpuhrsch"
+              },
+              {
+                "login": "qihqi"
+              },
+              {
+                "login": "jackm321"
+              },
+              {
+                "login": "linbinyu"
+              },
+              {
+                "login": "neerajprad"
+              },
+              {
+                "login": "rsemenov"
+              },
+              {
+                "login": "ziky90"
+              },
+              {
+                "login": "gmagogsfm"
+              },
+              {
+                "login": "zzzwen"
+              },
+              {
+                "login": "yanboliang"
+              },
+              {
+                "login": "andrewor14"
+              },
+              {
+                "login": "jianyuh"
+              },
+              {
+                "login": "cykustcc"
+              },
+              {
+                "login": "highker"
+              },
+              {
+                "login": "jeffreyksmithjr"
+              },
+              {
+                "login": "smessmer"
+              },
+              {
+                "login": "ananthsub"
+              },
+              {
+                "login": "malfet"
+              },
+              {
+                "login": "fegin"
+              },
+              {
+                "login": "zanqi"
+              },
+              {
+                "login": "supriyar"
+              },
+              {
+                "login": "kausv"
+              },
+              {
+                "login": "dagitses"
+              },
+              {
+                "login": "yhcharles"
+              },
+              {
+                "login": "bilgeacun"
+              },
+              {
+                "login": "caogao"
+              },
+              {
+                "login": "miguelmartin75"
+              },
+              {
+                "login": "penguinwu"
+              }
+            ],
+            "pageInfo": {
+              "hasNextPage": true,
+              "endCursor": "Y3Vyc29yOnYyOpHOADBnlQ=="
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOADBnlQ== name=metamates org=pytorch": {
+    "data": {
+      "organization": {
+        "team": {
+          "members": {
+            "nodes": [
+              {
+                "login": "shz117"
+              },
+              {
+                "login": "ajliu"
+              },
+              {
+                "login": "msaroufim"
+              },
+              {
+                "login": "davides"
+              },
+              {
+                "login": "alannnna"
+              },
+              {
+                "login": "hlin09"
+              },
+              {
+                "login": "terrychenism"
+              },
+              {
+                "login": "xiaomengy"
+              },
+              {
+                "login": "jisaacso"
+              },
+              {
+                "login": "fkhan1337"
+              },
+              {
+                "login": "xing-liu"
+              },
+              {
+                "login": "harshitkhaitan"
+              },
+              {
+                "login": "alanadakotashine"
+              },
+              {
+                "login": "desertfire"
+              },
+              {
+                "login": "banitag1"
+              },
+              {
+                "login": "gchanan"
+              },
+              {
+                "login": "dbort"
+              },
+              {
+                "login": "DanilBaibak"
+              },
+              {
+                "login": "serhaty"
+              },
+              {
+                "login": "yf225"
+              },
+              {
+                "login": "mlazos"
+              },
+              {
+                "login": "yifuwang"
+              },
+              {
+                "login": "tenpercent"
+              },
+              {
+                "login": "bertmaher"
+              },
+              {
+                "login": "chauhang"
+              },
+              {
+                "login": "ZainRizvi"
+              },
+              {
+                "login": "jiayisuse"
+              },
+              {
+                "login": "bochko"
+              },
+              {
+                "login": "jeanschmidt"
+              },
+              {
+                "login": "bradleyhd"
+              },
+              {
+                "login": "voznesenskym"
+              },
+              {
+                "login": "bwasti"
+              },
+              {
+                "login": "NivekT"
+              },
+              {
+                "login": "zhxchen17"
+              },
+              {
+                "login": "jerryzh168"
+              },
+              {
+                "login": "wconstab"
+              },
+              {
+                "login": "Hangjun"
+              },
+              {
+                "login": "davidberard98"
+              },
+              {
+                "login": "CamiWilliams"
+              },
+              {
+                "login": "avikchaudhuri"
+              },
+              {
+                "login": "aartibasant"
+              },
+              {
+                "login": "xta0"
+              },
+              {
+                "login": "8Keep"
+              },
+              {
+                "login": "zou3519"
+              },
+              {
+                "login": "xman1979"
+              },
+              {
+                "login": "suraj813"
+              },
+              {
+                "login": "gqchen"
+              },
+              {
+                "login": "abhikrish"
+              },
+              {
+                "login": "zhangguanheng66"
+              },
+              {
+                "login": "Chillee"
+              },
+              {
+                "login": "albanD"
+              },
+              {
+                "login": "bigfootjon"
+              },
+              {
+                "login": "robotal"
+              },
+              {
+                "login": "MarcioPorto"
+              },
+              {
+                "login": "srsuryadev"
+              },
+              {
+                "login": "IvanKobzarev"
+              },
+              {
+                "login": "eprivezentsev"
+              },
+              {
+                "login": "kwen2501"
+              },
+              {
+                "login": "chandlerzuo"
+              },
+              {
+                "login": "otsneh"
+              },
+              {
+                "login": "husthyc"
+              },
+              {
+                "login": "briancoutinho"
+              },
+              {
+                "login": "fduwjj"
+              },
+              {
+                "login": "frank-wei"
+              },
+              {
+                "login": "QuentinDuval"
+              },
+              {
+                "login": "atalman"
+              },
+              {
+                "login": "xush6528"
+              },
+              {
+                "login": "dracifer"
+              },
+              {
+                "login": "SS-JIA"
+              },
+              {
+                "login": "helunwencser"
+              },
+              {
+                "login": "xw285cornell"
+              },
+              {
+                "login": "hhbyyh"
+              },
+              {
+                "login": "dulinriley"
+              },
+              {
+                "login": "rohan-varma"
+              },
+              {
+                "login": "jcaip"
+              },
+              {
+                "login": "teng-li"
+              },
+              {
+                "login": "larryliu0820"
+              },
+              {
+                "login": "lyoka"
+              },
+              {
+                "login": "cbalioglu"
+              },
+              {
+                "login": "hl475"
+              },
+              {
+                "login": "hwangjeff"
+              },
+              {
+                "login": "Jack-Khuu"
+              },
+              {
+                "login": "nateanl"
+              },
+              {
+                "login": "kylesyoon"
+              },
+              {
+                "login": "fuqianz"
+              },
+              {
+                "login": "boyuantan"
+              },
+              {
+                "login": "muntaqim"
+              },
+              {
+                "login": "fmassa"
+              },
+              {
+                "login": "esantorella"
+              },
+              {
+                "login": "HamidShojanazeri"
+              },
+              {
+                "login": "jubinchheda"
+              },
+              {
+                "login": "mehdimashayekhi"
+              },
+              {
+                "login": "rkindi"
+              },
+              {
+                "login": "wanchaol"
+              },
+              {
+                "login": "zephirefaith"
+              },
+              {
+                "login": "kapilsh"
+              },
+              {
+                "login": "plahera"
+              },
+              {
+                "login": "SherlockNoMad"
+              },
+              {
+                "login": "iseeyuan"
+              },
+              {
+                "login": "protonu"
+              }
+            ],
+            "pageInfo": {
+              "hasNextPage": true,
+              "endCursor": "Y3Vyc29yOnYyOpHOAKJKeQ=="
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOAKJKeQ== name=metamates org=pytorch": {
+    "data": {
+      "organization": {
+        "team": {
+          "members": {
+            "nodes": [
+              {
+                "login": "terhuhf"
+              },
+              {
+                "login": "aruntonic"
+              },
+              {
+                "login": "gcatron"
+              },
+              {
+                "login": "yingrliu"
+              },
+              {
+                "login": "alexanderguzhva"
+              },
+              {
+                "login": "angelayi"
+              },
+              {
+                "login": "zhaoalex"
+              },
+              {
+                "login": "vivekmig"
+              },
+              {
+                "login": "sangongs"
+              },
+              {
+                "login": "akshaypandian"
+              },
+              {
+                "login": "drej82"
+              },
+              {
+                "login": "tktrungna"
+              },
+              {
+                "login": "eellison"
+              },
+              {
+                "login": "ydwu4"
+              },
+              {
+                "login": "NarineK"
+              },
+              {
+                "login": "andrewconnors"
+              },
+              {
+                "login": "wenwei202"
+              },
+              {
+                "login": "jg2912"
+              },
+              {
+                "login": "XilunWu"
+              },
+              {
+                "login": "mreso"
+              },
+              {
+                "login": "soulitzer"
+              },
+              {
+                "login": "tiandiao123"
+              },
+              {
+                "login": "PaliC"
+              },
+              {
+                "login": "anijain2305"
+              },
+              {
+                "login": "pvtuan10"
+              },
+              {
+                "login": "osalpekar"
+              },
+              {
+                "login": "xiaohui-zhang"
+              },
+              {
+                "login": "jerry39213gh"
+              },
+              {
+                "login": "jarodhou"
+              },
+              {
+                "login": "H-Huang"
+              },
+              {
+                "login": "vtsyvina"
+              },
+              {
+                "login": "PratsBhatt"
+              },
+              {
+                "login": "Nitrokitty"
+              },
+              {
+                "login": "satgera"
+              },
+              {
+                "login": "ngimel"
+              },
+              {
+                "login": "markkm"
+              },
+              {
+                "login": "EscapeZero"
+              },
+              {
+                "login": "bdhirsh"
+              },
+              {
+                "login": "cccclai"
+              },
+              {
+                "login": "tugsbayasgalan"
+              },
+              {
+                "login": "agunapal"
+              },
+              {
+                "login": "frankseide"
+              },
+              {
+                "login": "YazhiGao"
+              },
+              {
+                "login": "mrshenli"
+              },
+              {
+                "login": "bashnick"
+              },
+              {
+                "login": "lena-kashtelyan"
+              },
+              {
+                "login": "brad-mengchi"
+              },
+              {
+                "login": "kimishpatel"
+              },
+              {
+                "login": "aaronenyeshi"
+              },
+              {
+                "login": "shajrawi"
+              },
+              {
+                "login": "great-way"
+              },
+              {
+                "login": "ashkan-software"
+              },
+              {
+                "login": "mortzur"
+              },
+              {
+                "login": "jbitton"
+              },
+              {
+                "login": "hatala91"
+              },
+              {
+                "login": "zhangxy988"
+              },
+              {
+                "login": "samlurye"
+              },
+              {
+                "login": "anjali411"
+              },
+              {
+                "login": "williamwen42"
+              },
+              {
+                "login": "joecummings"
+              },
+              {
+                "login": "842974287"
+              },
+              {
+                "login": "JacobSzwejbka"
+              },
+              {
+                "login": "nishantpdce"
+              },
+              {
+                "login": "srinivas212"
+              },
+              {
+                "login": "shreyanb98"
+              },
+              {
+                "login": "naveedgol"
+              },
+              {
+                "login": "Nayef211"
+              },
+              {
+                "login": "HengruiX"
+              },
+              {
+                "login": "sgrigory"
+              },
+              {
+                "login": "chekangliang"
+              },
+              {
+                "login": "ebsmothers"
+              },
+              {
+                "login": "anshuljain1"
+              },
+              {
+                "login": "salilsdesai"
+              },
+              {
+                "login": "vmoens"
+              },
+              {
+                "login": "yoavnavon"
+              },
+              {
+                "login": "printfoo"
+              },
+              {
+                "login": "ErikaLal"
+              },
+              {
+                "login": "xinyang0"
+              },
+              {
+                "login": "kauterry"
+              },
+              {
+                "login": "anirbanraywork"
+              },
+              {
+                "login": "houseroad"
+              },
+              {
+                "login": "erichan1"
+              },
+              {
+                "login": "hsrussell"
+              },
+              {
+                "login": "ilia-cher"
+              },
+              {
+                "login": "ajitmaths"
+              },
+              {
+                "login": "awgu"
+              },
+              {
+                "login": "wz337"
+              },
+              {
+                "login": "qxy11"
+              },
+              {
+                "login": "janeyx99"
+              },
+              {
+                "login": "glaringlee"
+              },
+              {
+                "login": "anj-s"
+              },
+              {
+                "login": "drisspg"
+              },
+              {
+                "login": "kmh4321"
+              },
+              {
+                "login": "RdoubleA"
+              },
+              {
+                "login": "jramseyer"
+              },
+              {
+                "login": "jianingfu"
+              },
+              {
+                "login": "mikaylagawarecki"
+              },
+              {
+                "login": "xianxl"
+              },
+              {
+                "login": "aazzolini"
+              },
+              {
+                "login": "Xirider"
+              }
+            ],
+            "pageInfo": {
+              "hasNextPage": true,
+              "endCursor": "Y3Vyc29yOnYyOpHOAj2vcw=="
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOAj2vcw== name=metamates org=pytorch": {
+    "data": {
+      "organization": {
+        "team": {
+          "members": {
+            "nodes": [
+              {
+                "login": "HDCharles"
+              },
+              {
+                "login": "mcr229"
+              },
+              {
+                "login": "manuelcandales"
+              },
+              {
+                "login": "guangy10"
+              },
+              {
+                "login": "mengwa41"
+              },
+              {
+                "login": "YulunW"
+              },
+              {
+                "login": "danthe3rd"
+              },
+              {
+                "login": "hx89"
+              },
+              {
+                "login": "itang00"
+              },
+              {
+                "login": "hanhsienhuang"
+              },
+              {
+                "login": "clee2000"
+              },
+              {
+                "login": "lhuang04"
+              },
+              {
+                "login": "gottbrath"
+              },
+              {
+                "login": "lessw2020"
+              },
+              {
+                "login": "taivu1998"
+              },
+              {
+                "login": "danrecoskie"
+              },
+              {
+                "login": "zhaojuanmao"
+              },
+              {
+                "login": "johncalab"
+              },
+              {
+                "login": "dhthompson"
+              },
+              {
+                "login": "superwizard2019"
+              },
+              {
+                "login": "TovlyFB"
+              },
+              {
+                "login": "shunting314"
+              },
+              {
+                "login": "xcheng16"
+              },
+              {
+                "login": "adamomainz"
+              },
+              {
+                "login": "sluks"
+              },
+              {
+                "login": "SebastianAment"
+              },
+              {
+                "login": "ansley"
+              },
+              {
+                "login": "cheetah2216"
+              },
+              {
+                "login": "mikekgfb"
+              },
+              {
+                "login": "pinaki-mukerji"
+              },
+              {
+                "login": "kyulee-com"
+              },
+              {
+                "login": "dahsh"
+              },
+              {
+                "login": "byterover"
+              },
+              {
+                "login": "wmao533"
+              },
+              {
+                "login": "ejguan"
+              },
+              {
+                "login": "nimaelyasi"
+              },
+              {
+                "login": "qxu-fb"
+              },
+              {
+                "login": "sshawnwu"
+              },
+              {
+                "login": "iramazanli"
+              },
+              {
+                "login": "jnkwok1"
+              },
+              {
+                "login": "kurman"
+              },
+              {
+                "login": "jbschlosser"
+              },
+              {
+                "login": "haichuan-fb"
+              },
+              {
+                "login": "JustinPinero"
+              },
+              {
+                "login": "gcramer23"
+              },
+              {
+                "login": "yuguo68"
+              },
+              {
+                "login": "c-odrin"
+              },
+              {
+                "login": "chowarfb"
+              },
+              {
+                "login": "priyaramani"
+              },
+              {
+                "login": "asalioufb"
+              },
+              {
+                "login": "four4fish"
+              },
+              {
+                "login": "kkosik20"
+              },
+              {
+                "login": "KZFB"
+              },
+              {
+                "login": "henryliu-bluehills"
+              },
+              {
+                "login": "minjungkim85"
+              },
+              {
+                "login": "muchulee8"
+              },
+              {
+                "login": "kirklandsign"
+              },
+              {
+                "login": "jiawenliu64"
+              },
+              {
+                "login": "izaitsevfb"
+              },
+              {
+                "login": "ashramac"
+              },
+              {
+                "login": "weiwangmeta"
+              },
+              {
+                "login": "andysamfb"
+              },
+              {
+                "login": "nanoax"
+              },
+              {
+                "login": "yulin0077"
+              },
+              {
+                "login": "kwanghoon-meta"
+              },
+              {
+                "login": "l-kirsch"
+              },
+              {
+                "login": "YXIE14"
+              },
+              {
+                "login": "lzterpm"
+              }
+            ],
+            "pageInfo": {
+              "hasNextPage": false,
+              "endCursor": "Y3Vyc29yOnYyOpHOB32goQ=="
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MTAw name=pytorch number=76118 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "files": {
+            "nodes": [
+              {
+                "path": "docs/source/quantization.rst"
+              },
+              {
+                "path": "docs/source/scripts/build_quantization_configs.py"
+              },
+              {
+                "path": "test/allowlist_for_publicAPI.json"
+              },
+              {
+                "path": "test/cpp/jit/source_range_test.cpp"
+              },
+              {
+                "path": "test/cpp/jit/test_backend.cpp"
+              },
+              {
+                "path": "test/cpp/jit/test_flatbuffer.cpp"
+              },
+              {
+                "path": "test/cpp/jit/test_misc.cpp"
+              },
+              {
+                "path": "test/cpp/jit/test_utils.h"
+              },
+              {
+                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff"
+              },
+              {
+                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff"
+              },
+              {
+                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff"
+              },
+              {
+                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff"
+              },
+              {
+                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff"
+              },
+              {
+                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff"
+              },
+              {
+                "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff"
+              },
+              {
+                "path": "test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff"
+              },
+              {
+                "path": "test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff"
+              },
+              {
+                "path": "test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff"
+              },
+              {
+                "path": "test/cpp/profiler/record_function.cpp"
+              },
+              {
+                "path": "test/distributed/_shard/sharded_tensor/test_sharded_tensor.py"
+              },
+              {
+                "path": "test/distributed/_shard/test_replicated_tensor.py"
+              },
+              {
+                "path": "test/distributed/fsdp/test_fsdp_comm.py"
+              },
+              {
+                "path": "test/distributed/fsdp/test_fsdp_optim_state.py"
+              },
+              {
+                "path": "test/distributed/optim/test_zero_redundancy_optimizer.py"
+              },
+              {
+                "path": "test/jit/test_export_modes.py"
+              },
+              {
+                "path": "test/jit/test_if_hoisting.py"
+              },
+              {
+                "path": "test/jit/test_tracer.py"
+              },
+              {
+                "path": "test/jit/test_upgraders.py"
+              },
+              {
+                "path": "test/mobile/test_lite_script_type.py"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_layer_norm_aten.expect"
+              },
+              {
+                "path": "test/onnx/test_operators.py"
+              },
+              {
+                "path": "test/onnx/test_pytorch_onnx_onnxruntime.py"
+              },
+              {
+                "path": "test/quantization/ao_migration/test_quantization_fx.py"
+              },
+              {
+                "path": "test/quantization/core/test_quantized_op.py"
+              },
+              {
+                "path": "test/quantization/core/test_quantized_tensor.py"
+              },
+              {
+                "path": "test/quantization/fx/test_numeric_suite_fx.py"
+              },
+              {
+                "path": "test/quantization/fx/test_quantize_fx.py"
+              },
+              {
+                "path": "test/test_autograd.py"
+              },
+              {
+                "path": "test/test_binary_ufuncs.py"
+              },
+              {
+                "path": "test/test_expanded_weights.py"
+              },
+              {
+                "path": "test/test_functionalization.py"
+              },
+              {
+                "path": "test/test_fx_experimental.py"
+              },
+              {
+                "path": "test/test_jit.py"
+              },
+              {
+                "path": "test/test_jit_cuda_fuser.py"
+              },
+              {
+                "path": "test/test_linalg.py"
+              },
+              {
+                "path": "test/test_nestedtensor.py"
+              },
+              {
+                "path": "test/test_nn.py"
+              },
+              {
+                "path": "test/test_ops.py"
+              },
+              {
+                "path": "test/test_ops_gradients.py"
+              },
+              {
+                "path": "test/test_ops_jit.py"
+              },
+              {
+                "path": "test/test_optim.py"
+              },
+              {
+                "path": "test/test_overrides.py"
+              },
+              {
+                "path": "test/test_profiler.py"
+              },
+              {
+                "path": "test/test_public_bindings.py"
+              },
+              {
+                "path": "test/test_pytree.py"
+              },
+              {
+                "path": "test/test_reductions.py"
+              },
+              {
+                "path": "test/test_sort_and_select.py"
+              },
+              {
+                "path": "test/test_sparse.py"
+              },
+              {
+                "path": "test/test_sparse_csr.py"
+              },
+              {
+                "path": "test/test_spectral_ops.py"
+              },
+              {
+                "path": "test/test_tensor_creation_ops.py"
+              },
+              {
+                "path": "test/test_tensorboard.py"
+              },
+              {
+                "path": "test/test_testing.py"
+              },
+              {
+                "path": "test/test_torch.py"
+              },
+              {
+                "path": "test/test_unary_ufuncs.py"
+              },
+              {
+                "path": "third_party/BUCK.github"
+              },
+              {
+                "path": "third_party/fbgemm"
+              },
+              {
+                "path": "tools/autograd/derivatives.yaml"
+              },
+              {
+                "path": "tools/autograd/gen_inplace_or_view_type.py"
+              },
+              {
+                "path": "tools/autograd/load_derivatives.py"
+              },
+              {
+                "path": "tools/build_variables.bzl"
+              },
+              {
+                "path": "tools/codegen/api/autograd.py"
+              },
+              {
+                "path": "tools/codegen/api/cpp.py"
+              },
+              {
+                "path": "tools/codegen/api/dispatcher.py"
+              },
+              {
+                "path": "tools/codegen/api/functionalization.py"
+              },
+              {
+                "path": "tools/codegen/api/lazy.py"
+              },
+              {
+                "path": "tools/codegen/api/meta.py"
+              },
+              {
+                "path": "tools/codegen/api/native.py"
+              },
+              {
+                "path": "tools/codegen/api/python.py"
+              },
+              {
+                "path": "tools/codegen/api/structured.py"
+              },
+              {
+                "path": "tools/codegen/api/translate.py"
+              },
+              {
+                "path": "tools/codegen/api/types.py"
+              },
+              {
+                "path": "tools/codegen/api/ufunc.py"
+              },
+              {
+                "path": "tools/codegen/api/unboxing.py"
+              },
+              {
+                "path": "tools/codegen/code_template.py"
+              },
+              {
+                "path": "tools/codegen/context.py"
+              },
+              {
+                "path": "tools/codegen/decompositions/gen_jit_decompositions.py"
+              },
+              {
+                "path": "tools/codegen/dest/__init__.py"
+              },
+              {
+                "path": "tools/codegen/dest/lazy_ir.py"
+              },
+              {
+                "path": "tools/codegen/dest/lazy_ts_lowering.py"
+              },
+              {
+                "path": "tools/codegen/dest/native_functions.py"
+              },
+              {
+                "path": "tools/codegen/dest/register_dispatch_key.py"
+              },
+              {
+                "path": "tools/codegen/dest/ufunc.py"
+              },
+              {
+                "path": "tools/codegen/gen.py"
+              },
+              {
+                "path": "tools/codegen/gen_backend_stubs.py"
+              },
+              {
+                "path": "tools/codegen/gen_functionalization_type.py"
+              },
+              {
+                "path": "tools/codegen/gen_lazy_tensor.py"
+              },
+              {
+                "path": "tools/codegen/local.py"
+              },
+              {
+                "path": "tools/codegen/model.py"
+              },
+              {
+                "path": "tools/codegen/operator_versions/gen_mobile_upgraders.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MjAw",
+              "hasNextPage": true
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MjAw name=pytorch number=76118 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
           "files": {
             "nodes": [
               {
-                "path": ".circleci/docker/build.sh"
+                "path": "tools/codegen/selective_build/operator.py"
+              },
+              {
+                "path": "tools/codegen/selective_build/selector.py"
+              },
+              {
+                "path": "tools/codegen/shape_functions/gen_jit_shape_functions.py"
+              },
+              {
+                "path": "tools/codegen/static_runtime/config.py"
+              },
+              {
+                "path": "tools/codegen/static_runtime/gen_static_runtime_ops.py"
+              },
+              {
+                "path": "tools/codegen/static_runtime/gen_structured.py"
+              },
+              {
+                "path": "tools/codegen/utils.py"
+              },
+              {
+                "path": "tools/linter/adapters/circleci_linter.py"
+              },
+              {
+                "path": "tools/linter/adapters/clangformat_linter.py"
+              },
+              {
+                "path": "tools/linter/adapters/grep_linter.py"
+              },
+              {
+                "path": "tools/linter/adapters/nativefunctions_linter.py"
+              },
+              {
+                "path": "tools/setup_helpers/BUILD.bazel"
+              },
+              {
+                "path": "tools/setup_helpers/generate_code.py"
+              },
+              {
+                "path": "torch/_C/__init__.pyi.in"
+              },
+              {
+                "path": "torch/amp/autocast_mode.py"
+              },
+              {
+                "path": "torch/ao/ns/fx/pattern_utils.py"
+              },
+              {
+                "path": "torch/ao/quantization/backend_config/README.md"
+              },
+              {
+                "path": "torch/ao/quantization/backend_config/__init__.py"
+              },
+              {
+                "path": "torch/ao/quantization/backend_config/native.py"
+              },
+              {
+                "path": "torch/ao/quantization/backend_config/observation_type.py"
+              },
+              {
+                "path": "torch/ao/quantization/backend_config/tensorrt.py"
+              },
+              {
+                "path": "torch/ao/quantization/backend_config/utils.py"
+              },
+              {
+                "path": "torch/ao/quantization/fx/__init__.py"
+              },
+              {
+                "path": "torch/ao/quantization/fx/backend_config/fuse_handler.py"
+              },
+              {
+                "path": "torch/ao/quantization/fx/backend_config/quantize_handler.py"
+              },
+              {
+                "path": "torch/ao/quantization/fx/backend_config_utils.py"
+              },
+              {
+                "path": "torch/ao/quantization/fx/convert.py"
+              },
+              {
+                "path": "torch/ao/quantization/fx/fuse.py"
+              },
+              {
+                "path": "torch/ao/quantization/fx/fusion_patterns.py"
+              },
+              {
+                "path": "torch/ao/quantization/fx/match_utils.py"
+              },
+              {
+                "path": "torch/ao/quantization/fx/pattern_utils.py"
+              },
+              {
+                "path": "torch/ao/quantization/fx/prepare.py"
+              },
+              {
+                "path": "torch/ao/quantization/fx/quantization_patterns.py"
+              },
+              {
+                "path": "torch/ao/quantization/qconfig.py"
+              },
+              {
+                "path": "torch/ao/quantization/quantization_types.py"
+              },
+              {
+                "path": "torch/ao/quantization/quantize_fx.py"
+              },
+              {
+                "path": "torch/autograd/__init__.py"
+              },
+              {
+                "path": "torch/csrc/Module.cpp"
+              },
+              {
+                "path": "torch/csrc/autograd/FunctionsManual.cpp"
+              },
+              {
+                "path": "torch/csrc/autograd/FunctionsManual.h"
+              },
+              {
+                "path": "torch/csrc/autograd/engine.cpp"
+              },
+              {
+                "path": "torch/csrc/autograd/function.h"
+              },
+              {
+                "path": "torch/csrc/autograd/functions/accumulate_grad.h"
+              },
+              {
+                "path": "torch/csrc/autograd/init.cpp"
+              },
+              {
+                "path": "torch/csrc/autograd/python_torch_functions_manual.cpp"
+              },
+              {
+                "path": "torch/csrc/autograd/python_variable.cpp"
+              },
+              {
+                "path": "torch/csrc/autograd/record_function_ops.h"
+              },
+              {
+                "path": "torch/csrc/autograd/utils/grad_layout_contract.h"
+              },
+              {
+                "path": "torch/csrc/deploy/CMakeLists.txt"
+              },
+              {
+                "path": "torch/csrc/distributed/c10d/logger.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/cuda/graph_fuser.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/cuda/parser.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/frontend/function_schema_parser.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/frontend/lexer.h"
+              },
+              {
+                "path": "torch/csrc/jit/frontend/parser.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/frontend/parser.h"
+              },
+              {
+                "path": "torch/csrc/jit/frontend/script_type_parser.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/frontend/source_range.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/frontend/source_range.h"
+              },
+              {
+                "path": "torch/csrc/jit/frontend/source_ref.h"
+              },
+              {
+                "path": "torch/csrc/jit/frontend/tracer.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/frontend/tracer.h"
+              },
+              {
+                "path": "torch/csrc/jit/mobile/debug_info.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/mobile/debug_info.h"
+              },
+              {
+                "path": "torch/csrc/jit/mobile/flatbuffer_loader.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/mobile/module.h"
+              },
+              {
+                "path": "torch/csrc/jit/passes/common_expression_hoisting.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/passes/common_expression_hoisting.h"
+              },
+              {
+                "path": "torch/csrc/jit/passes/frozen_graph_optimizations.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/python/init.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/python/python_tree_views.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/python/script_init.cpp"
               },
               {
-                "path": ".circleci/docker/common/install_katex.sh"
+                "path": "torch/csrc/jit/runtime/graph_executor.cpp"
               },
               {
-                "path": ".github/workflows/pull.yml"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "Mw",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [
+                "path": "torch/csrc/jit/runtime/interpreter.cpp"
+              },
               {
-                "author": {
-                  "login": "suo"
-                },
-                "state": "COMMENTED"
+                "path": "torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp"
               },
               {
-                "author": {
-                  "login": "kit1980"
-                },
-                "state": "COMMENTED"
+                "path": "torch/csrc/jit/runtime/script_profile.cpp"
               },
               {
-                "author": {
-                  "login": "janeyx99"
-                },
-                "state": "APPROVED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNS0xOFQxMjo0MTowNS0wNzowMLkyMDIyLTA1LTE4VDEyOjQxOjA0LTA3OjAwzjpD7es=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
+                "path": "torch/csrc/jit/runtime/serialized_shape_function_registry.cpp"
+              },
               {
-                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/77700\n\ud83d\udcc4 \u00a0Preview Python docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\u2753Need help or want to give feedback on the CI? Visit our office hours\n\n\u2705 No Failures (0 Pending)\nAs of commit 8126159 (more details on the Dr. CI page):\nExpand to see more\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
-                "createdAt": "2022-05-17T23:01:48Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": {
-                  "login": "facebook-github-bot"
-                },
-                "databaseId": 1129400934
+                "path": "torch/csrc/jit/runtime/serialized_shape_function_registry.h"
               },
               {
-                "bodyText": "@pytorchbot merge",
-                "createdAt": "2022-05-19T15:39:05Z",
-                "author": {
-                  "login": "kit1980"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1131884232
+                "path": "torch/csrc/jit/runtime/shape_function_registry.h"
               },
               {
-                "bodyText": "Merge failed due to Refusing to merge as mandatory check(s) linux-docs / build-docs (cpp), linux-docs / build-docs (python) are pending/not yet run for rule OSS CI\nRaised by https://github.com/pytorch/pytorch/actions/runs/2353067846",
-                "createdAt": "2022-05-19T15:40:59Z",
-                "author": {
-                  "login": "pytorchmergebot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1131886153
+                "path": "torch/csrc/jit/runtime/shape_functions.h"
               },
               {
-                "bodyText": "@pytorchbot merge -f",
-                "createdAt": "2022-05-19T16:41:29Z",
-                "author": {
-                  "login": "kit1980"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1131945610
+                "path": "torch/csrc/jit/runtime/shape_functions_1.h"
               },
               {
-                "bodyText": "Hey @kit1980.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-05-19T16:43:37Z",
-                "author": {
-                  "login": "github-actions"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1131947473
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOQ1FKZg==",
-              "hasPreviousPage": false
-            }
-          },
-          "labels": {
-            "edges": [
+                "path": "torch/csrc/jit/runtime/static/impl.cpp"
+              },
               {
-                "node": {
-                  "name": "Merged"
-                }
+                "path": "torch/csrc/jit/runtime/static/passes.cpp"
               },
               {
-                "node": {
-                  "name": "cla signed"
-                }
+                "path": "torch/csrc/jit/runtime/symbolic_shape_registry.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/runtime/symbolic_shape_registry.h"
+              },
+              {
+                "path": "torch/csrc/jit/serialization/export_module.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/serialization/flatbuffer_serializer.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/serialization/import.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/serialization/import_export_helpers.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/serialization/import_export_helpers.h"
+              },
+              {
+                "path": "torch/csrc/jit/serialization/import_source.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/serialization/import_source.h"
+              },
+              {
+                "path": "torch/csrc/jit/serialization/source_range_serialization.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/serialization/source_range_serialization.h"
+              },
+              {
+                "path": "torch/csrc/jit/testing/file_check.cpp"
+              },
+              {
+                "path": "torch/csrc/lazy/core/dynamic_ir.cpp"
+              },
+              {
+                "path": "torch/csrc/lazy/core/dynamic_ir.h"
+              },
+              {
+                "path": "torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp"
               }
-            ]
+            ],
+            "pageInfo": {
+              "endCursor": "MzAw",
+              "hasNextPage": true
+            }
           }
         }
       }
     }
   },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=68111 owner=pytorch": {
+  "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MzAw name=pytorch number=76118 owner=pytorch": {
     "data": {
       "repository": {
         "pullRequest": {
-          "closed": true,
-          "isCrossRepository": true,
-          "author": {
-            "login": "chunyuan-w"
-          },
-          "title": "Add JIT graph fuser for oneDNN Graph API (Preview4)",
-          "body": "## Description\r\nPreview4 PR of this [RFC](https://github.com/pytorch/pytorch/issues/49444).\r\n\r\nOn the basis of https://github.com/pytorch/pytorch/pull/50256, the below improvements are included:\r\n\r\n- The [preview4 release branch](https://github.com/oneapi-src/oneDNN/releases/tag/graph-v0.4.1) of the oneDNN Graph API is used\r\n- The fuser now works with the profiling graph executor. We have inserted type check nodes to guard the profiled tensor properties.\r\n\r\n### User API:\r\nThe optimization pass is disabled by default. Users could enable it by:\r\n```\r\ntorch.jit.enable_onednn_fusion(True)\r\n```\r\n\r\n### Performance:\r\n[pytorch/benchmark](https://github.com/pytorch/benchmark) tool is used to compare the performance:\r\n- SkyLake 8180 (1 socket of 28 cores):\r\n\r\n  ![image](https://user-images.githubusercontent.com/65992142/151162305-05e44425-a24e-4d5e-94e1-743b40b87a8c.png)\r\n\r\n- SkyLake 8180 (single thread):\r\n\r\n  ![image](https://user-images.githubusercontent.com/65992142/151162528-69f90b79-d08d-46b8-8775-d80a6ccbce8a.png)\r\n \\* By mapping hardswish to oneDNN Graph, it\u2019s 8% faster than PyTorch JIT (NNC + OFI)\r\n  \\** We expect performance gain after mapping transpose, contiguous & view to oneDNN graph ops\r\n\r\n\r\n### Directory structure of the integration code\r\nFuser-related code are placed under:\r\n```\r\ntorch/csrc/jit/codegen/onednn/\r\n```\r\n\r\nOptimization pass registration is done in:\r\n```\r\ntorch/csrc/jit/passes/onednn_graph_fuser.h\r\n```\r\n\r\nCMake for the integration code is:\r\n```\r\ncaffe2/CMakeLists.txt\r\n```\r\n\r\n## Limitations\r\n\r\n- In this PR, we have only supported the optimization on Linux platform. The support on Windows and MacOS will be enabled as the next step.\r\n- We have only optimized the inference use case.",
-          "headRefName": "chunyuan/llga_preview2",
-          "headRepository": {
-            "nameWithOwner": "chunyuan-w/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
+          "files": {
             "nodes": [
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "0096fcc49f277fd8e006fcb42e0cb28a1422ec98"
-                }
+                "path": "torch/csrc/lazy/ts_backend/ts_native_functions.cpp"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "7bcc4de26a5472f1d252735dd425b46794b0844f"
-                }
+                "path": "torch/csrc/utils/python_arg_parser.cpp"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "3a2a588bfe6bbf9bf74d88d441cd22affda207da"
-                }
+                "path": "torch/csrc/utils/python_arg_parser.h"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "ca7df12fbfaa3ddbabeca39b76300d17f4a33f2f"
-                }
+                "path": "torch/csrc/utils/tensor_list.cpp"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "81d44f35b8bc043c38837d0694e5bc072203b832"
-                }
+                "path": "torch/csrc/utils/tensor_new.cpp"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "14fd5d1bfc2c58a71379f778871e3fca0a8e79b2"
-                }
+                "path": "torch/csrc/utils/tensor_new.h"
+              },
+              {
+                "path": "torch/distributed/_shard/__init__.py"
+              },
+              {
+                "path": "torch/distributed/_shard/api.py"
+              },
+              {
+                "path": "torch/distributed/_shard/replicated_tensor.py"
+              },
+              {
+                "path": "torch/distributed/_shard/sharded_tensor/__init__.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "954dc23663125897f4b199eb2a8607dc5fca3274"
-                }
+                "path": "torch/distributed/_shard/sharded_tensor/api.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "9f77a0b476accc678b6f0569e4ff33fa6bbe97fc"
-                }
+                "path": "torch/distributed/_shard/sharded_tensor/utils.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "fbf3b23bc1288697e1aec539a7c4ee3dc0bcb84c"
-                }
+                "path": "torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "f8b8e78f786586c3cdf3966fd83ffa124d3eda70"
-                }
+                "path": "torch/distributed/algorithms/model_averaging/utils.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "6fffa2f7453ee7e0f8d8e2f73ea8a65230539589"
-                }
+                "path": "torch/distributed/fsdp/_optim_utils.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "849385404e6f3cd1cf7cef19f931ecf4fa28afdb"
-                }
+                "path": "torch/distributed/fsdp/fully_sharded_data_parallel.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "adbae7b77f8c0dbc59fccf15207d97ba86cfade2"
-                }
+                "path": "torch/distributed/nn/__init__.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "6dcf2a4981aff24fa16fc7461ae4ec29690f956f"
-                }
+                "path": "torch/distributed/nn/functional.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "54f3e05ad524cffd0911ee93be3c50f589b51f58"
-                }
+                "path": "torch/distributed/optim/functional_adagrad.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "edbfc640ea79a0af85757d9e73796dcc90231519"
-                }
+                "path": "torch/fx/experimental/meta_tracer.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "chunyuan-w"
-                    },
-                    "email": "chunyuan.wu@intel.com",
-                    "name": "chunyuan"
-                  },
-                  "oid": "67654db7cba562809d1b4a44cdda58af5cc9daaf"
-                }
+                "path": "torch/fx/graph.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "9c9d99b930b11af9ff03f52d45bf49c652df758d"
-                }
+                "path": "torch/jit/_shape_functions.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "ffb25119cd9ce815cc4d9d14a2317fcbbfa9ea86"
-                }
+                "path": "torch/nn/parallel/_replicated_tensor_ddp_interop.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "ab9eee84512ca1bdfbc81e25c6eb67b29d0f302a"
-                }
+                "path": "torch/nn/parallel/_replicated_tensor_ddp_utils.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "62a4642cf3330524990a69ac29e002c97812320a"
-                }
+                "path": "torch/nn/parallel/distributed.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "ca9b1223be4af2c8b4929303d498eafd71793128"
-                }
+                "path": "torch/nn/utils/_expanded_weights/__init__.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "6f4a23d24514a02954d2ec792830085f612223c9"
-                }
+                "path": "torch/nn/utils/_expanded_weights/instance_norm_expanded_weights.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "b2a9a9c0926b02d0b2e87722ed61450f224a61d0"
-                }
+                "path": "torch/onnx/symbolic_opset11.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "e88b492be733f24b6aa395829c76add67d0901e7"
-                }
+                "path": "torch/onnx/symbolic_opset12.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "c44336d7a914952bfb78e012e08d9a6d6dde5937"
-                }
+                "path": "torch/onnx/symbolic_opset9.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "5157930f7b3921d41a586260582b574c915f6ca1"
-                }
+                "path": "torch/optim/adagrad.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "04cb8353813f6bbd0d913a994923cc7e1e291406"
-                }
+                "path": "torch/optim/lr_scheduler.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "62991eaad0e638bb0bced327e03f932f66f68732"
-                }
+                "path": "torch/overrides.py"
+              },
+              {
+                "path": "torch/quantization/fx/pattern_utils.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "7496bf1588050191595d833d23b8972b2f22655e"
-                }
+                "path": "torch/quantization/fx/quantization_patterns.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "d9d35f23cca0cd29c78a845731b24826152dcf1c"
-                }
+                "path": "torch/quantization/fx/quantization_types.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "f74ec134f18a65a7c72455bdf44f72e3ebb27105"
-                }
+                "path": "torch/return_types.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "eb32cc65a975361160948bfc3d6a577991ea262e"
-                }
+                "path": "torch/testing/_internal/common_device_type.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "c7665f8d695b680c54db0bad2b7b7df46d886b50"
-                }
+                "path": "torch/testing/_internal/common_distributed.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "e6321ad8f59ea01130568c202d186448bb9cb9d0"
-                }
+                "path": "torch/testing/_internal/common_fx2trt.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "a72cd0d02693f45e5354a70654581ad514581ec7"
-                }
+                "path": "torch/testing/_internal/common_methods_invocations.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "b3cd3028b4ed31805e82f7eaf02217ab74ca59b9"
-                }
+                "path": "torch/testing/_internal/common_utils.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "49a592d9788d08e6cd0593882f867e129057c1cc"
-                }
+                "path": "torch/testing/_internal/composite_compliance.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "0575766b2144b13f6a38227c4e2b8d22ec8db80f"
-                }
+                "path": "torch/testing/_internal/distributed/distributed_test.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "b5c9b10ff87d622350e8ca64fae3a476eb70d5aa"
-                }
+                "path": "torch/testing/_internal/jit_metaprogramming_utils.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "66bc652a30ccc329adb929870a4ac726bb98b38c"
-                }
+                "path": "torch/utils/cpp_extension.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "72b9ca9c8e2dac98cbb7199b3dfac7c7305b80c5"
-                }
+                "path": "torch/utils/data/datapipes/_typing.py"
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
-                  },
-                  "oid": "a7892ed7373207d96406c8b5734a089643c5cdbd"
-                }
-              },
+                "path": "torch/utils/model_dump/__init__.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MzQ4",
+              "hasNextPage": false
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAWuVD9M= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAXEsRtE= name=pytorch number=76118 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
+            "nodes": [
               {
                 "commit": {
-                  "author": {
-                    "user": {
-                      "login": "sanchitintel"
-                    },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
-                  },
-                  "oid": "d54cb084e1daad8a08c3f8de0ad3f7afb5b05ac1"
+                  "oid": "5696e8357cf38f852ef3d680381513e26f202371",
+                  "checkSuites": {
+                    "nodes": [
+                      {
+                        "checkRuns": {
+                          "nodes": [
+                            {
+                              "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232785220"
+                            }
+                          ],
+                          "pageInfo": {
+                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuVECw=",
+                            "hasNextPage": false
+                          }
+                        }
+                      }
+                    ]
+                  }
                 }
-              },
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=91340 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "tugsbayasgalan"
+          },
+          "title": "Symintify pytorch slicing logic",
+          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):\n* __->__ #91340\n\nDifferential Revision: [D42398023](https://our.internmc.facebook.com/intern/diff/D42398023)",
+          "headRefName": "gh/tugsbayasgalan/86/head",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "gh/tugsbayasgalan/86/base",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "aef71d692a8a159e0ca56be363e2cc1225ce7647"
+                  "oid": "ae8889feecb96f0ba0a7ad9888dae340f21487de"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "bf618e205ec31cff962dcc8ab478e0a699a9572d"
+                  "oid": "88ac30a6fbfc65012deeeb3662d8a9272e191cca"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "e4a331f1088448f7d7d86256ce71e0e71da006b0"
+                  "oid": "99540ebd8bb3f5bff0d90325c35f49290c35cd2d"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "0b743523d1430fec759d5fefbb687f17c89335a5"
+                  "oid": "85043a88f6847463a275633be1ccb07eacca93be"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "e80a351a62d98b810ec8985c4b25257af1d6c5bb"
+                  "oid": "00ed45052b95d64051d0cca228cecad40f2e45ae"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "c189eca154b6691919d0e21489d1c322c7435c0b"
+                  "oid": "aeba29c8272975c0c25c40d395f5c8e9952f42a0"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "e080a067c75d7b888a8a362682a2d5ba70e0c3a8"
+                  "oid": "0691dc8b2a96860dadc6d5fd47487933ed69d13d"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "028561fbf8f3ed90e074e6e0e3a4ca4dd7ffa2a8"
+                  "oid": "7052a80984320c7f74a26ab0cbeb683d71835f05"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "d550cf14037badd4caa2f52202e2f20bc4db8432"
+                  "oid": "8555d264c5aa18a0e3f609bdb21889f3600de85d"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "574159ebadd1dec24daaf883879ffeca8d9e71b7"
+                  "oid": "4bd8ffe4d985250e0fb3f71dc7046859620386ca"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "9eb3ee98ea756067ed1c8f52f309f6d3e211a904"
+                  "oid": "a6d53387bb92ce42f002a270bac73468e7ad2b0d"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "29929f48be03dcdd1bbfade572de7feafa825547"
+                  "oid": "842377100ffcb2ba4d69775f9d91812d6d4fce9f"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "8a7358ca8da547b40ea1a99ddc57ebed19959684"
+                  "oid": "5db8aa548077f0a3e32150951aac8b7b2d910102"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "6606637d2c5525b43e294a8b366a85052e1be0c6"
+                  "oid": "acdb2d71b7bcbc31f7192fb7025799009e406d1e"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "5ecfd1f28b87045deb8bc8ffe33b3d8b906f3264"
+                  "oid": "92e13828c1a6095a0e117f0a048201b84ccdb0dd"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchit.jain"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "be2d4345c65442c4cfbe8afdfb2ae0893945da42"
+                  "oid": "3d9bb36d7871dc528b4dd1d8526720768287327b"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "sanchitintel"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "sanchit.jain@intel.com",
-                    "name": "sanchitintel"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "b5b89d3644a43e2dbda841cafb71b32edbe07c8a"
+                  "oid": "1cdcd7ea89a58bfee14d32e78ca2104e14124fb5"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "malfet"
+                      "login": "tugsbayasgalan"
                     },
-                    "email": "nikita.shulga@gmail.com",
-                    "name": "Nikita Shulga"
+                    "email": "tmanlaibaatar@fb.com",
+                    "name": "Tugsbayasgalan Manlaibaatar"
                   },
-                  "oid": "73881411e2bfb3aaa2e89926a82390b4c587ad75"
+                  "oid": "18a466ebc23ed04879972cfd0a2fb3d85c3895f7"
                 }
               }
             ],
             "pageInfo": {
-              "endCursor": "NjI",
+              "endCursor": "MTg",
               "hasNextPage": false
             },
-            "totalCount": 62
+            "totalCount": 18
           },
           "commits": {
             "nodes": [
@@ -33253,11 +23338,6 @@
                           "workflowRun": null,
                           "checkRuns": {
                             "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              },
                               {
                                 "name": "Meta Internal-Only Changes Check",
                                 "conclusion": "SUCCESS",
@@ -33265,126 +23345,121 @@
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NXnc=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIk8lw=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYwzI="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6VI="
                       },
                       {
                         "node": {
                           "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
+                            "name": "Netlify",
+                            "databaseId": 13473
                           },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2018440028"
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6Vg="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Azure Pipelines",
+                            "databaseId": 9426
                           },
+                          "workflowRun": null,
                           "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "clang-format",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895825"
-                              },
-                              {
-                                "name": "py2-setup-validate-errormsg",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895911"
-                              },
-                              {
-                                "name": "quick-checks",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895963"
-                              },
-                              {
-                                "name": "shellcheck",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896134"
-                              },
-                              {
-                                "name": "toc",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896253"
-                              },
-                              {
-                                "name": "clang-tidy",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896371"
-                              },
-                              {
-                                "name": "cmakelint",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896525"
-                              },
-                              {
-                                "name": "flake8-py3",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896658"
-                              },
-                              {
-                                "name": "Test collect_env (with_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896771"
-                              },
-                              {
-                                "name": "Test collect_env (without_torch)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896795"
-                              },
-                              {
-                                "name": "Test tools",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896838"
-                              },
-                              {
-                                "name": "mypy",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896897"
-                              }
-                            ],
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6Vw="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Dependabot",
+                            "databaseId": 29110
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6WM="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Codecov",
+                            "databaseId": 254
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NZqw=",
+                              "endCursor": null,
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "SUCCESS"
+                          "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxPI="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6Wo="
                       },
                       {
                         "node": {
                           "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
+                            "name": "PyTorch Bot",
+                            "databaseId": 40112
                           },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2018440031"
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6XM="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "CircleCI Checks",
+                            "databaseId": 18001
                           },
+                          "workflowRun": null,
                           "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440031/jobs/2903895828"
-                              }
-                            ],
+                            "nodes": [],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NYIw=",
+                              "endCursor": null,
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "SKIPPED"
+                          "conclusion": null
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxPc="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6Xc="
                       },
                       {
                         "node": {
@@ -33394,424 +23469,163 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "pull"
+                              "name": "Labeler"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2018440039"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864512812"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896014"
-                              },
-                              {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896165"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896394"
-                              },
-                              {
-                                "name": "linux-bionic-rocm4.5-py3.7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896572"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896666"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896778"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896837"
-                              },
-                              {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896896"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896936"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897025"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897161"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897213"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897280"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897368"
-                              },
-                              {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897431"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897476"
-                              },
-                              {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897578"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897630"
-                              },
-                              {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897699"
-                              },
-                              {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897733"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (cpp)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327787"
-                              },
-                              {
-                                "name": "linux-docs / build-docs (python)",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327838"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327956"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327997"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328035"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328093"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328131"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328177"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904333962"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904334006"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430419"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430459"
-                              },
-                              {
-                                "name": "linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430508"
-                              },
-                              {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430573"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443663"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443723"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443787"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904454239"
-                              },
-                              {
-                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904454303"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904554602"
-                              },
-                              {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904554698"
-                              },
+                                "name": "triage",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512812/jobs/6587338912"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIUHWY=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6no="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864512853"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588855"
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512853/jobs/6587339023"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIUHf4=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6uw="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864512861"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587338996"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588886"
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339034"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588924"
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339070"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904655702"
+                                "name": "lintrunner",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339110"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656104"
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339139"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656150"
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339176"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656192"
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339209"
                               },
                               {
-                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904706520"
+                                "name": "Test collect_env (older_python_version)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339236"
                               },
                               {
-                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904706565"
+                                "name": "pr-sanity-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512861/jobs/6587339268"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_fN1g=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIUH1c=",
                               "hasNextPage": false
                             }
                           },
-                          "conclusion": "FAILURE"
+                          "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxQs="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6u4="
                       }
                     ],
                     "pageInfo": {
-                      "hasNextPage": false
+                      "hasNextPage": true
                     }
-                  },
-                  "status": {
-                    "contexts": [
-                      {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048428?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048429?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048431?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048430?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      }
-                    ]
-                  },
-                  "pushedDate": "2022-03-21T19:58:52Z",
-                  "oid": "73881411e2bfb3aaa2e89926a82390b4c587ad75"
-                }
-              }
-            ]
-          },
-          "changedFiles": 37,
-          "files": {
-            "nodes": [
-              {
-                "path": "aten/src/ATen/core/interned_strings.h"
-              },
-              {
-                "path": "caffe2/CMakeLists.txt"
-              },
-              {
-                "path": "cmake/Dependencies.cmake"
-              },
-              {
-                "path": "cmake/Modules/FindMKLDNN.cmake"
-              },
-              {
-                "path": "cmake/public/mkldnn.cmake"
-              },
-              {
-                "path": "docs/source/jit.rst"
-              },
-              {
-                "path": "test/test_jit_llga_fuser.py"
-              },
-              {
-                "path": "torch/_C/__init__.pyi.in"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/README.md"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/defer_size_check.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/defer_size_check.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/graph_fuser.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/graph_fuser.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/graph_helper.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/graph_helper.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/graph_rewriter.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/guard_shape.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/guard_shape.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/interface.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/interface.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/kernel.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/kernel.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/layout_propagation.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/layout_propagation.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/operator.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/prepare_binary.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/prepare_binary.h"
-              },
-              {
-                "path": "torch/csrc/jit/codegen/onednn/register_interface.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/ir/alias_analysis.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/ir/ir.cpp"
-              },
-              {
-                "path": "torch/csrc/jit/passes/inline_autodiff_subgraphs.cpp"
-              },
+                  },
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "EasyCLA",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://easycla.lfx.linuxfoundation.org/#/?version=2"
+                      }
+                    ]
+                  },
+                  "pushedDate": "2023-01-08T00:07:00Z",
+                  "oid": "18a466ebc23ed04879972cfd0a2fb3d85c3895f7"
+                }
+              }
+            ]
+          },
+          "changedFiles": 4,
+          "files": {
+            "nodes": [
               {
-                "path": "torch/csrc/jit/passes/onednn_graph_fuser.h"
+                "path": "aten/src/ATen/TensorIndexing.h"
               },
               {
-                "path": "torch/csrc/jit/python/init.cpp"
+                "path": "c10/core/SymInt.h"
               },
               {
-                "path": "torch/csrc/jit/runtime/operator.cpp"
+                "path": "torch/csrc/autograd/python_variable_indexing.cpp"
               },
               {
-                "path": "torch/jit/__init__.py"
+                "path": "torch/csrc/autograd/python_variable_indexing.h"
               }
             ],
             "pageInfo": {
-              "endCursor": "Mzc",
+              "endCursor": "NA",
               "hasNextPage": false
             }
           },
@@ -33819,449 +23633,415 @@
             "nodes": [
               {
                 "author": {
-                  "login": "pinzhenx"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "pinzhenx"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "pinzhenx"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "chunyuan-w"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "eellison"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "sanchitintel"
+                  "login": "Skylion007"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "albanD"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "Skylion007"
                 },
-                "state": "COMMENTED"
+                "state": "CHANGES_REQUESTED"
               },
               {
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "tugsbayasgalan"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "tugsbayasgalan"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "tugsbayasgalan"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "albanD"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "albanD"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "albanD"
                 },
-                "state": "COMMENTED"
+                "state": "APPROVED"
               },
               {
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "Skylion007"
                 },
-                "state": "COMMENTED"
+                "state": "APPROVED"
               },
               {
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "albanD"
                 },
                 "state": "COMMENTED"
               },
               {
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "Skylion007"
                 },
                 "state": "COMMENTED"
-              },
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0xMi0yM1QxMjoxOToxNy0wODowMLkyMDIyLTEyLTIzVDEyOjE5OjE2LTA4OjAwzklG9o4=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
               {
+                "bodyText": "@tugsbayasgalan your PR has been successfully reverted.",
+                "createdAt": "2023-01-05T17:14:54Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "pytorchmergebot"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1372498362
               },
               {
+                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2023-01-07T01:57:54Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "tugsbayasgalan"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1374346186
               },
               {
+                "bodyText": "Rebased gh/tugsbayasgalan/87/orig onto refs/remotes/origin/viable/strict because #91341 was rebased, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/91340)",
+                "createdAt": "2023-01-07T10:17:26Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "pytorchmergebot"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1374432230
               },
               {
+                "bodyText": "@pytorchbot merge -f \"Landed internally\"",
+                "createdAt": "2023-01-08T22:50:06Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "tugsbayasgalan"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1374948938
               },
               {
+                "bodyText": "Merge started\nYour change will be merged immediately since you used the force (-f) flag, bypassing any CI checks (ETA: 1-5 minutes).\nLearn more about merging in the wiki.\nQuestions? Feedback? Please reach out to the PyTorch DevX TeamAdvanced Debugging\nCheck the merge workflow status\nhere",
+                "createdAt": "2023-01-08T22:51:38Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "pytorchmergebot"
                 },
-                "state": "COMMENTED"
-              },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1374949218
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOUc6pug==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
               {
-                "author": {
-                  "login": "wukong1992"
-                },
-                "state": "COMMENTED"
+                "node": {
+                  "name": "Merged"
+                }
               },
               {
-                "author": {
-                  "login": "eellison"
-                },
-                "state": "COMMENTED"
+                "node": {
+                  "name": "Reverted"
+                }
               },
               {
-                "author": {
-                  "login": "eellison"
-                },
-                "state": "COMMENTED"
+                "node": {
+                  "name": "ciflow/trunk"
+                }
               },
               {
-                "author": {
-                  "login": "sanchitintel"
-                },
-                "state": "COMMENTED"
-              },
+                "node": {
+                  "name": "topic: not user facing"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOUc6pug== name=pytorch number=91340 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "comments": {
+            "nodes": [
               {
+                "bodyText": "\ud83d\udd17 Helpful Links\n\ud83e\uddea See artifacts and rendered test results at hud.pytorch.org/pr/91340\n\n\ud83d\udcc4 Preview Python docs built from this PR\n\ud83d\udcc4 Preview C++ docs built from this PR\n\u2753 Need help or want to give feedback on the CI? Visit the bot commands wiki or our office hours\n\nNote: Links to docs will display an error until the docs builds have been completed.\n\u2705 No Failures\nAs of commit 18a466e:\n\ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a\nThis comment was automatically generated by Dr. CI and updates every 15 minutes.",
+                "createdAt": "2022-12-23T00:37:54Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "pytorch-bot"
                 },
-                "state": "COMMENTED"
-              },
-              {
-                "author": {
-                  "login": "eellison"
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "pytorch-bot"
                 },
-                "state": "COMMENTED"
+                "databaseId": 1363473085
               },
               {
+                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-12-23T00:40:19Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "tugsbayasgalan"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1363474061
               },
               {
+                "bodyText": "@pytorchbot rebase",
+                "createdAt": "2022-12-23T07:30:45Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "tugsbayasgalan"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1363693611
               },
               {
+                "bodyText": "@pytorchbot successfully started a rebase job. Check the current status here",
+                "createdAt": "2022-12-23T07:32:50Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "pytorchmergebot"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1363694709
               },
               {
+                "bodyText": "Rebase failed due to\nRaised by https://github.com/pytorch/pytorch/actions/runs/3764003479",
+                "createdAt": "2022-12-23T07:33:01Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "pytorchmergebot"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1363694807
               },
               {
+                "bodyText": "Rebased gh/tugsbayasgalan/87/orig onto refs/remotes/origin/viable/strict because #91341 was rebased, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/91340)",
+                "createdAt": "2022-12-23T07:33:06Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "pytorchmergebot"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1363694844
               },
               {
+                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-12-26T05:57:30Z",
                 "author": {
-                  "login": "eellison"
+                  "login": "tugsbayasgalan"
                 },
-                "state": "APPROVED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1364912846
               },
               {
+                "bodyText": "Does this need testing changes? or new tests?",
+                "createdAt": "2023-01-03T19:01:39Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "voznesenskym"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1370121847
               },
               {
+                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2023-01-03T19:52:38Z",
                 "author": {
-                  "login": "eellison"
+                  "login": "tugsbayasgalan"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1370165547
               },
               {
+                "bodyText": "@voznesenskym pytorch itself has very comprehensive testing suite for slicing logic, so i think as long as CI is green, it should be good.",
+                "createdAt": "2023-01-03T19:54:35Z",
                 "author": {
-                  "login": "malfet"
+                  "login": "tugsbayasgalan"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1370167103
               },
               {
+                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2023-01-03T23:45:05Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "tugsbayasgalan"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1370335952
               },
               {
+                "bodyText": "@pytorchbot rebase",
+                "createdAt": "2023-01-04T01:28:56Z",
                 "author": {
-                  "login": "malfet"
+                  "login": "tugsbayasgalan"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1370391232
               },
               {
+                "bodyText": "@pytorchbot successfully started a rebase job. Check the current status here",
+                "createdAt": "2023-01-04T01:30:51Z",
                 "author": {
-                  "login": "malfet"
+                  "login": "pytorchmergebot"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1370391970
               },
               {
+                "bodyText": "Successfully rebased gh/tugsbayasgalan/86/orig onto refs/remotes/origin/viable/strict, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/91340)",
+                "createdAt": "2023-01-04T01:31:08Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "pytorchmergebot"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1370392083
               },
               {
+                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2023-01-04T19:19:45Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "tugsbayasgalan"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1371323220
               },
               {
+                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2023-01-04T20:27:49Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "tugsbayasgalan"
                 },
-                "state": "COMMENTED"
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1371385625
               },
               {
+                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2023-01-04T20:53:28Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "tugsbayasgalan"
                 },
-                "state": "COMMENTED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMS0xMi0xMFQwOToyNDoxOS0wODowMLkyMDIxLTEyLTEwVDA5OjI0OjE5LTA4OjAwzjFryLE=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1371406675
+              },
               {
-                "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.",
-                "createdAt": "2022-03-21T22:51:38Z",
+                "bodyText": "@tugsbayasgalan has imported this pull request.  If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2023-01-04T22:11:06Z",
                 "author": {
-                  "login": "suo"
+                  "login": "tugsbayasgalan"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1074498483
+                "databaseId": 1371489068
               },
               {
-                "bodyText": "@pytorchbot revert this",
-                "createdAt": "2022-03-21T22:51:44Z",
+                "bodyText": "@pytorchbot merge\n(Initiating merge automatically since Phabricator Diff has merged)",
+                "createdAt": "2023-01-05T10:30:00Z",
                 "author": {
-                  "login": "suo"
+                  "login": "facebook-github-bot"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1074498550
+                "databaseId": 1372040514
               },
               {
-                "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.\n\nOops! Will fix it ASAP.",
-                "createdAt": "2022-03-21T22:53:34Z",
+                "bodyText": "Merge started\nYour change will be merged once all checks pass (ETA 0-4 Hours).\nLearn more about merging in the wiki.\nQuestions? Feedback? Please reach out to the PyTorch DevX TeamAdvanced Debugging\nCheck the merge workflow status\nhere",
+                "createdAt": "2023-01-05T10:33:34Z",
                 "author": {
-                  "login": "sanchitintel"
+                  "login": "pytorchmergebot"
                 },
-                "authorAssociation": "COLLABORATOR",
+                "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1074499668
+                "databaseId": 1372044055
               },
               {
-                "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).",
-                "createdAt": "2022-03-21T23:07:23Z",
+                "bodyText": "@pytorchbot revert -m \"breaking mac builds https://hud.pytorch.org/pytorch/pytorch/commit/8c172fa98a52e95675e9425ac4b23f190f53f9ed https://github.com/pytorch/pytorch/actions/runs/3845932024/jobs/6550654339, marking this as weird because it was merged via codev?\" -c weird",
+                "createdAt": "2023-01-05T17:13:04Z",
                 "author": {
-                  "login": "facebook-github-bot"
+                  "login": "clee2000"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1074508608
+                "databaseId": 1372496233
               },
               {
-                "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).",
-                "createdAt": "2022-03-30T00:53:50Z",
+                "bodyText": "@pytorchbot successfully started a revert job. Check the current status here.\nQuestions? Feedback? Please reach out to the PyTorch DevX Team",
+                "createdAt": "2023-01-05T17:14:44Z",
                 "author": {
-                  "login": "facebook-github-bot"
+                  "login": "pytorchmergebot"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1082508130
+                "databaseId": 1372498188
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOQAuLsw==",
-              "hasPreviousPage": true
+              "startCursor": "Y3Vyc29yOnYyOpHOUUTyvQ==",
+              "hasPreviousPage": false
             }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "oncall: jit"
-                }
-              },
-              {
-                "node": {
-                  "name": "triaged"
-                }
-              },
-              {
-                "node": {
-                  "name": "open source"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              },
-              {
-                "node": {
-                  "name": "Reverted"
-                }
-              },
-              {
-                "node": {
-                  "name": "intel priority"
-                }
-              }
-            ]
           }
         }
       }
     }
   },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=73969 owner=pytorch": {
+  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAmJq6u4= name=pytorch number=91340 owner=pytorch": {
     "data": {
       "repository": {
         "pullRequest": {
-          "closed": true,
-          "isCrossRepository": true,
-          "author": {
-            "login": "malfet"
-          },
-          "title": "Dummy change",
-          "body": "Test Plan: None at all\n\nDifferential Revision: D34753911\n\n",
-          "headRefName": "export-D34753911",
-          "headRepository": {
-            "nameWithOwner": "malfet/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "4746da707a9912356f5179625da89616b228dc21"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MQ",
-              "hasNextPage": false
-            },
-            "totalCount": 1
-          },
           "commits": {
             "nodes": [
               {
                 "commit": {
+                  "oid": "18a466ebc23ed04879972cfd0a2fb3d85c3895f7",
                   "checkSuites": {
                     "edges": [
                       {
@@ -34272,31 +24052,334 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-vulkan-bionic-py3.7-clang9"
+                              "name": "Check Labels"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864512856"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Check labels",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512856/jobs/6587338995"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIUHds=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6u8="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864512865"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "linux-focal-py3-clang7-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415492"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415532"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415589"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415644"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415726"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415784"
+                              },
+                              {
+                                "name": "linux-bionic-py3_7-clang8-xla / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415826"
+                              },
+                              {
+                                "name": "linux-focal-rocm5.3-py3.8 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415854"
+                              },
+                              {
+                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415903"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415937"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7-pch / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415960"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588415997"
+                              },
+                              {
+                                "name": "linux-focal-py3-clang7-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416037"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416078"
+                              },
+                              {
+                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416114"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416153"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.6-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416206"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416247"
+                              },
+                              {
+                                "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416281"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416485"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416517"
+                              },
+                              {
+                                "name": "linux-bionic-py3_7-clang8-xla / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416556"
+                              },
+                              {
+                                "name": "linux-docs / build-docs-cpp-false",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416590"
+                              },
+                              {
+                                "name": "linux-docs / build-docs-python-false",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416626"
+                              },
+                              {
+                                "name": "linux-docs / build-docs-functorch-false",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416652"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416705"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416738"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416778"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416806"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416852"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588416996"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417029"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417053"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417086"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417117"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417151"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417179"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417205"
+                              },
+                              {
+                                "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417239"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417275"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417300"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417337"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417365"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417394"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417410"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417443"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417475"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417521"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417564"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417601"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnInHI8=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq6v0="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "windows-binary-libtorch-debug"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280134"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864513095"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
+                                "name": "libtorch-cpu-shared-with-deps-debug-build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280134/jobs/2794078044"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513095/jobs/6587342116"
                               },
                               {
-                                "name": "test (default, 1, 1, linux.2xlarge)",
+                                "name": "libtorch-cpu-shared-with-deps-debug-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280134/jobs/2794189060"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513095/jobs/6587939020"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRQMQ=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIerac=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QM="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq7UQ="
                       },
                       {
                         "node": {
@@ -34306,26 +24389,31 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build"
+                              "name": "windows-binary-libtorch-release"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280135"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864513096"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
+                                "name": "libtorch-cpu-shared-with-deps-release-build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280135/jobs/2794078023"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513096/jobs/6587339456"
+                              },
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-release-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513096/jobs/6587642833"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aM=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIZcgM=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QU="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq7UU="
                       },
                       {
                         "node": {
@@ -34335,41 +24423,99 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-bionic-rocm4.5-py3.7"
+                              "name": "linux-binary-manywheel"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280132"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864513132"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
+                                "name": "manywheel-py3_7-cuda11_6-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794078060"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513132/jobs/6587344127"
                               },
                               {
-                                "name": "test (default, 1, 2, linux.rocm.gpu)",
+                                "name": "manywheel-py3_7-cuda11_6-test / test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292071"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513132/jobs/6588050173"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIgpUU=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq7Ys="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-binary-libtorch-pre-cxx11"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864513134"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-pre-cxx11-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513134/jobs/6587339538"
                               },
                               {
-                                "name": "test (default, 2, 2, linux.rocm.gpu)",
+                                "name": "libtorch-cpu-shared-with-deps-pre-cxx11-test / test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292205"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513134/jobs/6587614329"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIY81E=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq7Yw="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-binary-libtorch-cxx11-abi"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864513133"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513133/jobs/6587339544"
                               },
                               {
-                                "name": "test (distributed, 1, 1, linux.rocm.gpu)",
+                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292306"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513133/jobs/6587579045"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbTiXw=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIYVKs=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QY="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq7Y0="
                       },
                       {
                         "node": {
@@ -34379,380 +24525,489 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "win-vs2019-cuda11.3-py3"
+                              "name": "trunk"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280139"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/3864513136"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
+                                "name": "android-emulator-build-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587375890"
+                              },
+                              {
+                                "name": "libtorch-linux-bionic-cuda11.6-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587375971"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-tsan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376023"
+                              },
+                              {
+                                "name": "ios-12-5-1-x86-64 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376090"
+                              },
+                              {
+                                "name": "linux-focal-rocm5.3-py3.8 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376141"
+                              },
+                              {
+                                "name": "pytorch-linux-focal-py3-clang7-android-ndk-r19c-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376183"
+                              },
+                              {
+                                "name": "macos-12-py3-x86-64-lite-interpreter / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376247"
+                              },
+                              {
+                                "name": "macos-12-py3-arm64 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376285"
+                              },
+                              {
+                                "name": "cuda11.6-py3.10-gcc7-sm86 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376325"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.6-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376368"
+                              },
+                              {
+                                "name": "parallelnative-linux-focal-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376420"
+                              },
+                              {
+                                "name": "macos-12-py3-x86-64 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376474"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376524"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376577"
+                              },
+                              {
+                                "name": "caffe2-linux-focal-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376647"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9-slow / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587376697"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-tsan / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587466558"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9-slow / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587466800"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9-slow / test (slow, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587470226"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-tsan / test (tsan, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587472364"
+                              },
+                              {
+                                "name": "parallelnative-linux-focal-py3.7-gcc7 / filter",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587514019"
+                              },
+                              {
+                                "name": "parallelnative-linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794078053"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587516320"
                               },
                               {
-                                "name": "test (force_on_cpu, 1, 1, windows.4xlarge)",
+                                "name": "parallelnative-linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794536907"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587516365"
                               },
                               {
-                                "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "name": "linux-focal-rocm5.3-py3.8 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794536998"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587527524"
                               },
                               {
-                                "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "name": "linux-focal-rocm5.3-py3.8 / test (default, 1, 2, linux.rocm.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794537089"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbY_vU=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qc="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280136"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587530460"
+                              },
                               {
-                                "name": "build-and-test",
+                                "name": "linux-focal-rocm5.3-py3.8 / test (default, 2, 2, linux.rocm.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280136/jobs/2794078031"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2ao=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qk="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-docs"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280138"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587530531"
+                              },
                               {
-                                "name": "build",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794078055"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587540455"
                               },
                               {
-                                "name": "build-docs (cpp)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794183768"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542564"
                               },
                               {
-                                "name": "build-docs (python)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794183828"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRIt0=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qo="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc7"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280140"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542599"
+                              },
                               {
-                                "name": "build",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794078017"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542630"
                               },
                               {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181109"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542674"
                               },
                               {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181305"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542727"
                               },
                               {
-                                "name": "test (distributed, 1, 1, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (nogpu_AVX512, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181488"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRFm4=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qs="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-build"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280143"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542772"
+                              },
                               {
-                                "name": "build",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (nogpu_NO_AVX2, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280143/jobs/2794078025"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aw=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Q4="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "Lint"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280145"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542805"
+                              },
                               {
-                                "name": "shellcheck",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (jit_legacy, 1, 1, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078028"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542846"
                               },
                               {
-                                "name": "quick-checks",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078196"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542879"
                               },
                               {
-                                "name": "clang-tidy",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 2, 3, linux.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078407"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542911"
                               },
                               {
-                                "name": "clang-format",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 3, 3, linux.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078610"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587542950"
                               },
                               {
-                                "name": "cmakelint",
+                                "name": "cuda11.6-py3.10-gcc7-sm86 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078760"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587545736"
                               },
                               {
-                                "name": "toc",
+                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (default, 1, 4, linux.g5.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078898"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548567"
                               },
                               {
-                                "name": "py2-setup-validate-errormsg",
+                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (default, 2, 4, linux.g5.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078999"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548593"
                               },
                               {
-                                "name": "flake8-py3",
+                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (default, 3, 4, linux.g5.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794079087"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548643"
                               },
                               {
-                                "name": "mypy",
+                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (default, 4, 4, linux.g5.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794079199"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO4Es=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Q8="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1958280146"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548672"
+                              },
+                              {
+                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (slow, 1, 2, linux.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548710"
+                              },
+                              {
+                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (slow, 2, 2, linux.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548730"
+                              },
+                              {
+                                "name": "cuda11.6-py3.10-gcc7-sm86 / test (functorch, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587548761"
+                              },
                               {
-                                "name": "build-and-test",
+                                "name": "macos-12-py3-arm64 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280146/jobs/2794078040"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587781241"
+                              },
+                              {
+                                "name": "macos-12-py3-arm64-mps / Run MPS tests",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587781320"
+                              },
+                              {
+                                "name": "macos-12-py3-arm64 / test (default, 1, 2, macos-m1-12)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587784438"
+                              },
+                              {
+                                "name": "macos-12-py3-arm64 / test (default, 2, 2, macos-m1-12)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587784531"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2b0=",
-                              "hasNextPage": false
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIb-Fc=",
+                              "hasNextPage": true
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RA="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAmJq7ZM="
                       }
                     ],
                     "pageInfo": {
-                      "hasNextPage": true
+                      "hasNextPage": false
                     }
-                  },
-                  "status": {
-                    "contexts": [
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040614?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040643?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040615?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      }
-                    ]
-                  },
-                  "pushedDate": "2022-03-09T15:57:16Z",
-                  "oid": "4746da707a9912356f5179625da89616b228dc21"
+                  }
                 }
               }
             ]
-          },
-          "changedFiles": 1,
-          "files": {
+          }
+        }
+      }
+    }
+  },
+  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAnInHI8= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAmJq6u8= name=pytorch number=91340 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
             "nodes": [
               {
-                "path": "tools/build_variables.bzl"
+                "commit": {
+                  "oid": "18a466ebc23ed04879972cfd0a2fb3d85c3895f7",
+                  "checkSuites": {
+                    "nodes": [
+                      {
+                        "checkRuns": {
+                          "nodes": [
+                            {
+                              "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417631"
+                            },
+                            {
+                              "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417664"
+                            },
+                            {
+                              "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417705"
+                            },
+                            {
+                              "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417734"
+                            },
+                            {
+                              "name": "linux-focal-py3.7-clang7-asan / test (functorch, 1, 1, linux.2xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417775"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417817"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417859"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (functorch, 1, 1, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588417907"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418062"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418100"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418127"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418163"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418200"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 2, 3, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418228"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 3, 3, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418252"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418285"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864512865/jobs/6588418317"
+                            }
+                          ],
+                          "pageInfo": {
+                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnInH7M=",
+                            "hasNextPage": false
+                          }
+                        }
+                      }
+                    ]
+                  }
+                }
               }
-            ],
-            "pageInfo": {
-              "endCursor": "MQ",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
-            "nodes": [],
-            "pageInfo": {
-              "startCursor": null,
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAnIb-Fc= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAmJq7Y0= name=pytorch number=91340 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
             "nodes": [
               {
-                "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/malfet/pytorch/blob/4746da707a9912356f5179625da89616b228dc21/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\nAdd ciflow labels to this PR to trigger more builds:\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nlinux-binary-libtorch-cxx11-abi\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-binary-libtorch-pre-cxx11\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-binary-manywheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-rocm4.5-py3.7\nciflow/all, ciflow/default, ciflow/linux, ciflow/rocm, ciflow/trunk\n\u2705 triggered\n\n\nlinux-docs\nciflow/all, ciflow/cpu, ciflow/default, ciflow/docs, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-vulkan-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk, ciflow/vulkan\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-build\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-custom-build-static\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-onnx\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/onnx, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build\nciflow/all, ciflow/cpu, ciflow/default, ciflow/libtorch, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7-no-ops\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nmacos-arm64-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-arm64-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-cxx11-abi\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-pre-cxx11\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwindows-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nwindows-binary-libtorch-debug\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-libtorch-release\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-wheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\ncaffe2-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\ndocker-builds\nciflow/all, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-custom-ops\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-metal\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda10.2-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-docs-push\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-no-ops\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-arm64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-lite-interpreter-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-11-py3-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled, ciflow/slow, ciflow/slow-gradcheck\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.3-py3.7-gcc7-debug\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.5-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-build\nciflow/all, ciflow/android, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\npytorch-xla-linux-bionic-py3.7-clang8\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk, ciflow/xla\n\ud83d\udeab skipped",
-                "createdAt": "2022-03-09T15:57:11Z",
-                "author": {
-                  "login": "pytorch-bot"
-                },
-                "authorAssociation": "NONE",
-                "editor": null,
-                "databaseId": 1063079053
-              },
-              {
-                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/73969\n\ud83d\udcc4 \u00a0Preview docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 4746da7 (more details on the Dr. CI page):\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
-                "createdAt": "2022-03-09T15:57:12Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": {
-                  "login": "facebook-github-bot"
-                },
-                "databaseId": 1063079113
-              },
-              {
-                "bodyText": "This pull request was exported from Phabricator. Differential Revision: D34753911",
-                "createdAt": "2022-03-09T15:57:34Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": null,
-                "databaseId": 1063079731
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOP11MjQ==",
-              "hasPreviousPage": false
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "fb-exported"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
+                "commit": {
+                  "oid": "18a466ebc23ed04879972cfd0a2fb3d85c3895f7",
+                  "checkSuites": {
+                    "nodes": [
+                      {
+                        "checkRuns": {
+                          "nodes": [
+                            {
+                              "name": "macos-12-py3-arm64 / test (functorch, 1, 1, macos-m1-12)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587784596"
+                            },
+                            {
+                              "name": "win-vs2019-cuda11.6-py3 / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587796241"
+                            },
+                            {
+                              "name": "win-vs2019-cuda11.6-py3 / test (default, 1, 5, windows.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587798805"
+                            },
+                            {
+                              "name": "win-vs2019-cuda11.6-py3 / test (default, 2, 5, windows.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587798838"
+                            },
+                            {
+                              "name": "win-vs2019-cuda11.6-py3 / test (default, 3, 5, windows.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587798865"
+                            },
+                            {
+                              "name": "win-vs2019-cuda11.6-py3 / test (default, 4, 5, windows.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587798903"
+                            },
+                            {
+                              "name": "win-vs2019-cuda11.6-py3 / test (default, 5, 5, windows.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587798942"
+                            },
+                            {
+                              "name": "win-vs2019-cuda11.6-py3 / test (functorch, 1, 1, windows.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587798976"
+                            },
+                            {
+                              "name": "win-vs2019-cuda11.6-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587799010"
+                            },
+                            {
+                              "name": "macos-12-py3-x86-64 / filter",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587834238"
+                            },
+                            {
+                              "name": "macos-12-py3-x86-64 / test (default, 1, 2, macos-12)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587836679"
+                            },
+                            {
+                              "name": "macos-12-py3-x86-64 / test (default, 2, 2, macos-12)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587836820"
+                            },
+                            {
+                              "name": "macos-12-py3-x86-64 / test (functorch, 1, 1, macos-12)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/3864513136/jobs/6587836879"
+                            }
+                          ],
+                          "pageInfo": {
+                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAnIc5ZE=",
+                            "hasNextPage": false
+                          }
+                        }
+                      }
+                    ]
+                  }
                 }
               }
             ]
@@ -34761,22 +25016,22 @@
       }
     }
   },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=73099 owner=pytorch": {
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=79694 owner=pytorch": {
     "data": {
       "repository": {
         "pullRequest": {
           "closed": true,
-          "isCrossRepository": false,
+          "isCrossRepository": true,
           "author": {
-            "login": "BowenBao"
+            "login": "kshitij12345"
           },
-          "title": "[ONNX] Make graph name spec-compliant (#71961)",
-          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* #73104\n* #73103\n* #73102\n* #73101\n* #73100\n* __->__ #73099\n\n[According to the ONNX spec](https://github.com/onnx/onnx/blob/main/docs/IR.md#names-within-a-graph),\nall names must adhere to C90 identifier syntax rules, which means no\ndashes.\n\nFixes: #30952",
-          "headRefName": "gh/BowenBao/138/head",
+          "title": "[complex] conv_transpose1d",
+          "body": "Reference: https://github.com/pytorch/pytorch/issues/71108",
+          "headRefName": "develop/complex/conv_transpose1d",
           "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
+            "nameWithOwner": "kshitij12345/pytorch"
           },
-          "baseRefName": "gh/BowenBao/138/base",
+          "baseRefName": "master",
           "baseRepository": {
             "nameWithOwner": "pytorch/pytorch",
             "isPrivate": false,
@@ -34791,20 +25046,164 @@
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "BowenBao"
+                      "login": "kshitij12345"
                     },
-                    "email": "bowbao@microsoft.com",
-                    "name": "BowenBao"
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
                   },
-                  "oid": "3038b939eb2069653305c419326a0f47d2598e39"
+                  "oid": "d1ea948e65ac6d31ad056287ab65d38ecc68b30d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "b4ba1db9a3a71bd8c03158dcd1b68711360633d8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "655a4220beae163bfe578f0318a130df01ec05d6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "Kshiteej K"
+                  },
+                  "oid": "8181716be7a8005eb13ad5c3f2e1279ed1c60aff"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "9e5ca3663e7471786eeebebfdf84aea5d761712f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "9c110f39bcdc4e56386b6f9c4e2c082c8940ade6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "49315e79d0eee8008e2a74575c6fc0f6a9531ee4"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "728752480760226270c374a0acc08e28b9b133f3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "ffe43399d6f60ef7844523a5f465c11d9a67062f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "9672a2198472567bae4ac6f55d004f7e1fa8a9fa"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "48a0ebf32b895286f036b36c871f671dc867e400"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "52fbe80d5c8a94e03d816c0bd21fd82019dcd5ac"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce"
                 }
               }
             ],
             "pageInfo": {
-              "endCursor": "MQ",
+              "endCursor": "MTM",
               "hasNextPage": false
             },
-            "totalCount": 1
+            "totalCount": 13
           },
           "commits": {
             "nodes": [
@@ -34812,6 +25211,35 @@
                 "commit": {
                   "checkSuites": {
                     "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.facebook.com/cla/"
+                              },
+                              {
+                                "name": "Meta Internal-Only Changes Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://opensource.facebook.com/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdtq8Hc=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqFo="
+                      },
                       {
                         "node": {
                           "app": {
@@ -34822,24 +25250,24 @@
                             "workflow": {
                               "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041786"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2907393316"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
                                 "name": "run-torchbench",
                                 "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041786/jobs/2626264278"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393316/jobs/4628529923"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNn9o=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqTEwk=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SKIPPED"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7k="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqXs="
                       },
                       {
                         "node": {
@@ -34849,70 +25277,66 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-xenial-cuda11.3-py3.7-gcc7"
+                              "name": "Lint"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041785"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2907393315"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
+                                "name": "lintrunner",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626264385"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628529910"
                               },
                               {
-                                "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "name": "quick-checks",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417658"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530162"
                               },
                               {
-                                "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "name": "Test collect_env (with_torch)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417743"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530698"
                               },
                               {
-                                "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "name": "Test collect_env (without_torch)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417885"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkRE_E=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7o="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc7-no-ops"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041789"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530867"
+                              },
                               {
-                                "name": "build",
+                                "name": "Test collect_env (older_python_version)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041789/jobs/2626264416"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530989"
+                              },
+                              {
+                                "name": "pr-sanity-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531151"
+                              },
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531475"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531753"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531853"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJE=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqTHFY=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7s="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqXw="
                       },
                       {
                         "node": {
@@ -34922,872 +25346,937 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-build"
+                              "name": "pull"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041787"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2907393329"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "build",
+                                "name": "linux-focal-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531149"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531473"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531754"
+                              },
+                              {
+                                "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531857"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7-pch / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532179"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532543"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532694"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532918"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533033"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533181"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533420"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533630"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533825"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533959"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534129"
+                              },
+                              {
+                                "name": "linux-bionic-py3_7-clang8-xla / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534256"
+                              },
+                              {
+                                "name": "linux-focal-rocm5.2-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534388"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534571"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11_6-py3_10-gcc7-deploy / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534714"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.6-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534989"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628535311"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639115"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639198"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639265"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639339"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639395"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639450"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639509"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639572"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639635"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647047"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647119"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041787/jobs/2626264407"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoIY=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7w="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041788"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647215"
+                              },
                               {
-                                "name": "build-and-test",
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041788/jobs/2626264422"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJs=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS74="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-bionic-py3.7-clang9"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041790"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647277"
+                              },
                               {
-                                "name": "build",
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626264414"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647348"
                               },
                               {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349405"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647432"
                               },
                               {
-                                "name": "test (noarch, 1, 1, linux.2xlarge)",
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349522"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647522"
                               },
                               {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349618"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiwA=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS78="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-vulkan-bionic-py3.7-clang9"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041793"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647641"
+                              },
                               {
-                                "name": "build",
+                                "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041793/jobs/2626264431"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647762"
                               },
                               {
-                                "name": "test (default, 1, 1, linux.2xlarge)",
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041793/jobs/2626359364"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPxgQ=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8A="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3-clang5-mobile-custom-build-static"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041792"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628653797"
+                              },
                               {
-                                "name": "build",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041792/jobs/2626264427"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoKA=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8I="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "win-vs2019-cpu-py3"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041791"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679376"
+                              },
                               {
-                                "name": "build",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626264386"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679431"
                               },
                               {
-                                "name": "test (default, 1, 2, windows.4xlarge)",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626722677"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679469"
                               },
                               {
-                                "name": "test (default, 2, 2, windows.4xlarge)",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626722710"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkX070=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8M="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-xenial-py3.7-gcc7"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/1866041803"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679519"
+                              },
                               {
-                                "name": "build",
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626264401"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679594"
                               },
                               {
-                                "name": "test (distributed, 1, 1, linux.2xlarge)",
+                                "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349045"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628681226"
                               },
                               {
-                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11_6-py3_10-gcc7-deploy / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349141"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628854932"
                               },
                               {
-                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349272"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiQA=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8Q="
-                      }
-                    ],
-                    "pageInfo": {
-                      "hasNextPage": true
-                    }
-                  },
-                  "status": {
-                    "contexts": [
-                      {
-                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010288?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010289?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      },
-                      {
-                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010488?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856434"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856501"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856575"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqZ2fA=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqZs="
                       },
                       {
-                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
-                        "state": "SUCCESS",
-                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010326?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
-                      }
-                    ]
-                  },
-                  "pushedDate": "2022-02-18T18:46:28Z",
-                  "oid": "3038b939eb2069653305c419326a0f47d2598e39"
-                }
-              }
-            ]
-          },
-          "changedFiles": 162,
-          "files": {
-            "nodes": [
-              {
-                "path": "test/onnx/expect/TestOperators.test_acos.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_add_broadcast.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_add_left_broadcast.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_add_size1_broadcast.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_addconstant.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_addmm.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_arange_dynamic.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_argmax.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_asin.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_at_op.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_atan.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_aten_embedding_1.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_aten_embedding_2.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_avg_pool2d.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_baddbmm.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_basic.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_batchnorm.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_batchnorm_1d.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_batchnorm_training.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_bitshift.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_c2_op.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_chunk.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_clip.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_clip_max.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_clip_min.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_concat2.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_conv.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_convtranspose.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_cos.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_cumsum.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_det.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dict.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dict_str.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dim.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dropout.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dropout_default.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dropout_opset12.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dropout_training.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dropout_training_opset12.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_elu.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_embedding_bags.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_empty_like.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_empty_like_opset7.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_equal.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_erf.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_exp.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_expand.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_flatten.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_flatten2D.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_fmod.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_frobenius_norm.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_full.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_full_like.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_gather.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_gather_opset11.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_ge.expect"
-              },
-              {
-                "path": "test/onnx/expect/TestOperators.test_gelu.expect"
-              },
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "windows-binary-libtorch-debug"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351637"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-debug-build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351637/jobs/4634503587"
+                              },
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-debug-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351637/jobs/4635312938"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsbsmM=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "windows-binary-wheel"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351640"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "wheel-py3_7-cuda11_3-build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351640/jobs/4634503571"
+                              },
+                              {
+                                "name": "wheel-py3_7-cuda11_3-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351640/jobs/4636146265"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsskcw=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuM="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "windows-binary-libtorch-release"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351643"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-release-build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351643/jobs/4634503570"
+                              },
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-release-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351643/jobs/4635003925"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsVbD8=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuU="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-binary-libtorch-cxx11-abi"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351698"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351698/jobs/4634504079"
+                              },
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351698/jobs/4635072931"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsW5Aw=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2E="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-binary-libtorch-pre-cxx11"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351700"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351700/jobs/4634503897"
+                              },
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351700/jobs/4635077148"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsW-jo=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2I="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-binary-manywheel"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351699"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "manywheel-py3_7-cuda10_2-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351699/jobs/4634503896"
+                              },
+                              {
+                                "name": "manywheel-py3_7-cuda10_2-test / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351699/jobs/4635934290"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsoMEA=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2M="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  },
+                  "status": null,
+                  "pushedDate": "2022-08-22T22:04:19Z",
+                  "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce"
+                }
+              }
+            ]
+          },
+          "changedFiles": 3,
+          "files": {
+            "nodes": [
               {
-                "path": "test/onnx/expect/TestOperators.test_gt.expect"
+                "path": "aten/src/ATen/native/Convolution.cpp"
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_hardtanh.expect"
+                "path": "torch/testing/_internal/common_methods_invocations.py"
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_implicit_expand.expect"
-              },
+                "path": "torch/testing/_internal/common_modules.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mw",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
               {
-                "path": "test/onnx/expect/TestOperators.test_index.expect"
-              },
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNy0xOVQxMDowNzo1NC0wNzowMLkyMDIyLTA3LTE5VDEwOjA3OjU0LTA3OjAwzj43QcY=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
               {
-                "path": "test/onnx/expect/TestOperators.test_isnan.expect"
+                "bodyText": "@pytorchbot merge -g\nAll is green internally!",
+                "createdAt": "2022-08-23T19:29:55Z",
+                "author": {
+                  "login": "albanD"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1224702749
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_layer_norm_aten.expect"
+                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here.\nThe merge job was triggered with the green (-g) flag. This means that your change will be merged once all checks on your PR have passed (ETA: 0-4 Hours). If this is not the intended behavior, feel free to use some of the other merge options in the wiki.\nPlease reach out to the PyTorch DevX Team with feedback or questions!",
+                "createdAt": "2022-08-23T19:31:18Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1224705564
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_le.expect"
+                "bodyText": "Thanks for looking into it \ud83d\ude42 @albanD @jeanschmidt",
+                "createdAt": "2022-08-23T19:34:36Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1224712351
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_linear.expect"
+                "bodyText": "Hey @kshitij12345.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-08-23T22:31:58Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1224956051
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_log_sigmoid.expect"
-              },
+                "bodyText": "Yeah, discussed with my manager and I got the required permissions to do so. Sorry for not responding promptly yesterday. But I am available from now on to provide assistance :)",
+                "createdAt": "2022-08-24T09:24:04Z",
+                "author": {
+                  "login": "jeanschmidt"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1225462612
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOSP97HQ==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
               {
-                "path": "test/onnx/expect/TestOperators.test_logsoftmax.expect"
+                "node": {
+                  "name": "open source"
+                }
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect"
+                "node": {
+                  "name": "Merged"
+                }
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_lt.expect"
+                "node": {
+                  "name": "cla signed"
+                }
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_master_opset.expect"
+                "node": {
+                  "name": "Reverted"
+                }
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_max.expect"
+                "node": {
+                  "name": "ciflow/trunk"
+                }
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_maxpool.expect"
-              },
+                "node": {
+                  "name": "ciflow/periodic"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOSP97HQ== name=pytorch number=79694 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "comments": {
+            "nodes": [
               {
-                "path": "test/onnx/expect/TestOperators.test_maxpool_dilations.expect"
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/79694\n\ud83d\udcc4 \u00a0Preview Python docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\u2753Need help or want to give feedback on the CI? Visit our office hours\n\n\u2705 No Failures (0 Pending)\nAs of commit 2fd08f1 (more details on the Dr. CI page):\nExpand to see more\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2022-06-16T09:43:16Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 1157454523
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_maxpool_indices.expect"
+                "bodyText": "Unable to reproduce jit failure locally (will skip the test)\nCI Failure : https://github.com/pytorch/pytorch/runs/6926187074?check_suite_focus=true#step:9:20230\npytest test/test_ops_jit.py -k test_variant_consistency_jit_nn_functional_conv_transpose1d_cpu_complex64 -v\n=============================================================== test session starts ===============================================================\nplatform linux -- Python 3.10.0, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 -- /home/kshiteej/.conda/envs/pytorch-cuda-dev/bin/python\ncachedir: .pytest_cache\nhypothesis profile 'default' -> database=DirectoryBasedExampleDatabase('/home/kshiteej/Pytorch/pytorch_complex_convolution.py/.hypothesis/examples')\nrootdir: /home/kshiteej/Pytorch/pytorch_complex_convolution.py, configfile: pytest.ini\nplugins: hypothesis-6.23.2, repeat-0.9.1\ncollected 1976 items / 1975 deselected / 1 selected                                                                                               \n\ntest/test_ops_jit.py::TestJitCPU::test_variant_consistency_jit_nn_functional_conv_transpose1d_cpu_complex64 PASSED                          [100%]\n\n================================================================ warnings summary =================================================================\n../../.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/testing/_internal/common_cuda.py:9\n  /home/kshiteej/.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/testing/_internal/common_cuda.py:9: DeprecationWarning: The distutils package is deprecated and slated for removal in Python 3.12. Use setuptools or check PEP 632 for potential alternatives\n    from distutils.version import LooseVersion\n\n../../.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/backends/cudnn/__init__.py:91\n  /home/kshiteej/.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/backends/cudnn/__init__.py:91: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.\n    warnings.warn(\n\n-- Docs: https://docs.pytest.org/en/stable/warnings.html\n================================================= 1 passed, 1975 deselected, 2 warnings in 4.90s =================================================",
+                "createdAt": "2022-07-18T09:05:35Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": {
+                  "login": "kshitij12345"
+                },
+                "databaseId": 1186949486
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_mean.expect"
+                "bodyText": "@pytorchbot merge",
+                "createdAt": "2022-07-19T17:12:23Z",
+                "author": {
+                  "login": "ngimel"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189347786
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_mean_dtype.expect"
+                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here",
+                "createdAt": "2022-07-19T17:13:42Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189350009
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_meshgrid.expect"
+                "bodyText": "Hey @kshitij12345.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-07-19T17:14:25Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1189350932
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_min.expect"
+                "bodyText": "@pytorchbot revert -m \"broke slow test https://github.com/pytorch/pytorch/runs/7414560957?check_suite_focus=true#step:9:31516\" -c \"nosignal\"",
+                "createdAt": "2022-07-19T19:15:41Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1189459845
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_mm.expect"
+                "bodyText": "@pytorchbot successfully started a revert job. Check the current status here",
+                "createdAt": "2022-07-19T19:16:59Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189460926
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_narrow.expect"
+                "bodyText": "Will not revert as @kshitij12345 is not a MEMBER, but COLLABORATOR",
+                "createdAt": "2022-07-19T19:17:00Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189460942
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_ne.expect"
+                "bodyText": "@pytorchbot revert -m \"broke slow test https://github.com/pytorch/pytorch/runs/7414560957?check_suite_focus=true#step:9:31516\" -c \"nosignal\"",
+                "createdAt": "2022-07-19T20:40:04Z",
+                "author": {
+                  "login": "anjali411"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189529734
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_nonzero.expect"
+                "bodyText": "@pytorchbot successfully started a revert job. Check the current status here",
+                "createdAt": "2022-07-19T20:41:20Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189530756
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_norm_p1.expect"
+                "bodyText": "@kshitij12345 your PR has been successfully reverted.",
+                "createdAt": "2022-07-19T20:41:25Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189530831
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_norm_p2.expect"
+                "bodyText": "@pytorchbot merge -g",
+                "createdAt": "2022-07-20T09:53:08Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1190070141
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_ones_like.expect"
+                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here",
+                "createdAt": "2022-07-20T09:54:24Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1190071424
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_pad.expect"
+                "bodyText": "Hey @kshitij12345.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-07-20T13:00:51Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1190258272
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_params.expect"
+                "bodyText": "commit is breaking internal builds/tests https://pastebin.com/HX4RUusH (pytorch/functorch/test:test_eager_transforms)",
+                "createdAt": "2022-07-21T10:39:01Z",
+                "author": {
+                  "login": "jeanschmidt"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1191327616
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_params_onnx_irv4.expect"
+                "bodyText": "@pytorchbot revert -m \"breaking internal builds\" -c \"ghfirst\"",
+                "createdAt": "2022-07-21T10:39:27Z",
+                "author": {
+                  "login": "jeanschmidt"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1191328013
               },
               {
-                "path": "test/onnx/expect/TestOperators.test_permute2.expect"
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "MTAw",
-              "hasNextPage": true
-            }
-          },
-          "reviews": {
-            "nodes": [
-              {
+                "bodyText": "@pytorchbot revert -m \"breaking internal builds\" -c \"ghfirst\"",
+                "createdAt": "2022-07-21T10:41:23Z",
                 "author": {
-                  "login": "garymm"
+                  "login": "jeanschmidt"
                 },
-                "state": "APPROVED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMi0xOFQxNzoxODo0NC0wODowMLkyMDIyLTAyLTE4VDE3OjE4OjQ0LTA4OjAwzjTr0H0=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1191329792
+              },
               {
-                "bodyText": "This PR cannot be merged by bot due to changing > 100 files. @malfet \n  \n    \n      pytorch/.github/scripts/trymerge.py\n    \n    \n         Line 63\n      in\n      932adf2\n    \n  \n  \n    \n\n        \n          \n                 files(last: 100) { \n        \n    \n  \n\n Can this be relaxed? If not please import.",
-                "createdAt": "2022-02-22T18:22:40Z",
+                "bodyText": "@pytorchbot successfully started a revert job. Check the current status here",
+                "createdAt": "2022-07-21T10:42:16Z",
                 "author": {
-                  "login": "BowenBao"
+                  "login": "pytorchmergebot"
                 },
-                "authorAssociation": "COLLABORATOR",
+                "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1048084569
+                "databaseId": 1191330586
               },
-              {
-                "bodyText": "This PR cannot be merged by bot due to changing > 100 files. @malfet\nCan this be relaxed? If not please import.\n\nWow, you've hit a really interesting problem. 100 is a limitation enforced by GitHub, see https://docs.github.com/en/graphql/overview/resource-limitations, but I can implement a pagination. Do you mind keeping it like that for a bit, want to land a fix soonish.",
-                "createdAt": "2022-02-22T18:27:29Z",
+              {
+                "bodyText": "@kshitij12345 your PR has been successfully reverted.",
+                "createdAt": "2022-07-21T10:42:23Z",
                 "author": {
-                  "login": "malfet"
+                  "login": "pytorchmergebot"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1048088691
+                "databaseId": 1191330690
               },
               {
-                "bodyText": "@malfet Thank you for info. Sure, I have separated the rest of stack from this one, we'll wait for the fix to try again.",
-                "createdAt": "2022-02-22T18:29:48Z",
+                "bodyText": "@jeanschmidt which test is it failing on? I tried running the test_eager_transforms in functorch but couldn't reproduce it.",
+                "createdAt": "2022-07-25T07:11:19Z",
                 "author": {
-                  "login": "BowenBao"
+                  "login": "kshitij12345"
                 },
                 "authorAssociation": "COLLABORATOR",
                 "editor": null,
-                "databaseId": 1048090640
+                "databaseId": 1193667568
               },
               {
-                "bodyText": "@pytorchbot merge this",
-                "createdAt": "2022-02-24T21:42:36Z",
+                "bodyText": "@jbschlosser have added a ref as discussed offline. Can you please take a look? And if it looks good, can you import the PR to check if it is breaking anything internally.\nThanks",
+                "createdAt": "2022-08-03T18:30:17Z",
                 "author": {
-                  "login": "BowenBao"
+                  "login": "kshitij12345"
                 },
                 "authorAssociation": "COLLABORATOR",
                 "editor": null,
-                "databaseId": 1050293881
+                "databaseId": 1204329491
               },
               {
-                "bodyText": "Hey @BowenBao.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-02-24T21:44:39Z",
+                "bodyText": "@jbschlosser @jeanschmidt @albanD anything we can do to unblock this on our side?",
+                "createdAt": "2022-08-20T09:27:17Z",
                 "author": {
-                  "login": "github-actions"
+                  "login": "lezcano"
                 },
-                "authorAssociation": "NONE",
+                "authorAssociation": "COLLABORATOR",
                 "editor": null,
-                "databaseId": 1050295451
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOPniAWQ==",
-              "hasPreviousPage": true
-            }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "oncall: jit"
-                }
-              },
-              {
-                "node": {
-                  "name": "open source"
-                }
+                "databaseId": 1221266218
               },
               {
-                "node": {
-                  "name": "cla signed"
-                }
+                "bodyText": "Functorch tests should be running here now so can you rebase on top of master please?",
+                "createdAt": "2022-08-22T21:42:37Z",
+                "author": {
+                  "login": "albanD"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1223129944
               },
               {
-                "node": {
-                  "name": "release notes: onnx"
-                }
+                "bodyText": "@albanD have rebased on latest master.",
+                "createdAt": "2022-08-23T08:49:10Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1223758571
               },
               {
-                "node": {
-                  "name": "topic: bug fixes"
-                }
-              }
-            ]
-          }
-        }
-      }
-    }
-  },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=74649 owner=pytorch": {
-    "data": {
-      "repository": {
-        "pullRequest": {
-          "closed": true,
-          "isCrossRepository": false,
-          "author": {
-            "login": "malfet"
-          },
-          "title": "This should fail flake8",
-          "body": "Test issue for GHF mandatory checks",
-          "headRefName": "malfet-patch-8",
-          "headRepository": {
-            "nameWithOwner": "pytorch/pytorch"
-          },
-          "baseRefName": "master",
-          "baseRepository": {
-            "nameWithOwner": "pytorch/pytorch",
-            "isPrivate": false,
-            "defaultBranchRef": {
-              "name": "master"
-            }
-          },
-          "mergeCommit": null,
-          "commits_with_authors": {
-            "nodes": [
-              {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "57c86ff1c5ab948888fd329986c9d55796680e33"
-                }
+                "bodyText": "I triggered all the tests not to have any issues with slow tests again",
+                "createdAt": "2022-08-23T09:20:18Z",
+                "author": {
+                  "login": "lezcano"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1223796413
               },
               {
-                "commit": {
-                  "author": {
-                    "user": {
-                      "login": "malfet"
-                    },
-                    "email": "nshulga@fb.com",
-                    "name": "Nikita Shulga"
-                  },
-                  "oid": "6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4"
-                }
-              }
-            ],
-            "pageInfo": {
-              "endCursor": "Mg",
-              "hasNextPage": false
-            },
-            "totalCount": 2
-          },
-          "commits": {
-            "nodes": [
-              {
-                "commit": {
-                  "checkSuites": {
-                    "edges": [
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Facebook GitHub Tools",
-                            "databaseId": 12274
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [
-                              {
-                                "name": "Facebook CLA Check",
-                                "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.intern.facebook.com/cla/"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsK3w=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1E="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Netlify",
-                            "databaseId": 13473
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1M="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Azure Pipelines",
-                            "databaseId": 9426
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1Q="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Dependabot",
-                            "databaseId": 29110
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1Y="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "Codecov",
-                            "databaseId": 254
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1s="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "PyTorch Bot",
-                            "databaseId": 40112
-                          },
-                          "workflowRun": null,
-                          "checkRuns": {
-                            "nodes": [],
-                            "pageInfo": {
-                              "endCursor": null,
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": null
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj14="
-                      },
+                "bodyText": "Thanks @lezcano! However, last time it was reverted for internal failures. So it would be great if someone can import and verify that.\ncc: @albanD @jeanschmidt",
+                "createdAt": "2022-08-23T10:17:50Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1223863075
+              },
+              {
+                "bodyText": "@albanD has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-08-23T14:43:02Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1224175731
+              },
+              {
+                "bodyText": "I am not the right person to provide assistence, as currently I am not based in a Tier 1 location, so my permissions to access are so restricted that I am not able to import this commit, run the tests and provide meaningful responses.",
+                "createdAt": "2022-08-23T15:57:48Z",
+                "author": {
+                  "login": "jeanschmidt"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1224272324
+              },
+              {
+                "bodyText": "@jeanschmidt has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-08-23T17:00:53Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1224351135
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHORP1auw==",
+              "hasPreviousPage": false
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=c3b8ce3ee21a8d1b76d5fecfd22eba5b3fe45380777556e3f6886b267fa65e79 cursor=Y3Vyc29yOnYyOpHPAAAAAdkUS2M= name=pytorch number=79694 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce",
+                  "checkSuites": {
+                    "edges": [
                       {
                         "node": {
                           "app": {
@@ -35796,470 +26285,573 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "Lint"
+                              "name": "trunk"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2031576283"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351701"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "clang-format",
+                                "name": "macos-12-py3-x86-64 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925132"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634504326"
                               },
                               {
-                                "name": "clang-tidy",
+                                "name": "macos-12-py3-arm64 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925189"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634504522"
                               },
                               {
-                                "name": "cmakelint",
+                                "name": "parallelnative-linux-focal-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925230"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634504655"
                               },
                               {
-                                "name": "flake8-py3",
-                                "conclusion": "FAILURE",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925307"
+                                "name": "caffe2-linux-focal-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634504882"
                               },
                               {
-                                "name": "mypy",
+                                "name": "android-emulator-build-test / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925365"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634505033"
                               },
                               {
-                                "name": "Test collect_env (with_torch)",
+                                "name": "ios-12-5-1-x86-64 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925427"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634505167"
                               },
                               {
-                                "name": "Test collect_env (without_torch)",
+                                "name": "linux-bionic-py3.7-clang9-slow / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925449"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634505347"
                               },
                               {
-                                "name": "Test tools",
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925537"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634505499"
                               },
                               {
-                                "name": "py2-setup-validate-errormsg",
+                                "name": "libtorch-linux-bionic-cuda11.6-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925644"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634505639"
                               },
                               {
-                                "name": "quick-checks",
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-no-ops / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925688"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634505767"
                               },
                               {
-                                "name": "toc",
+                                "name": "win-vs2019-cuda11.6-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925809"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634506032"
                               },
                               {
-                                "name": "shellcheck",
+                                "name": "macos-12-py3-x86-64-lite-interpreter / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925945"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsMiY=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "FAILURE"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkFA="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2031576288"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634506202"
+                              },
                               {
-                                "name": "run-torchbench",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576288/jobs/2928925134"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsLW0=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SKIPPED"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkFs="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "pull"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2031576300"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "name": "linux-focal-rocm5.2-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634506357"
+                              },
                               {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935743"
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634506535"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "name": "linux-bionic-py3.7-clang9-slow / test (slow, 1, 1, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935775"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634664404"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "name": "parallelnative-linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935850"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634669945"
                               },
                               {
-                                "name": "linux-bionic-rocm4.5-py3.7 / build",
+                                "name": "parallelnative-linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935994"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634670046"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "name": "macos-12-py3-x86-64 / test (default, 1, 2, macos-12)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936064"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634734165"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "name": "macos-12-py3-x86-64 / test (default, 2, 2, macos-12)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936179"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634734293"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "name": "macos-12-py3-x86-64 / test (functorch, 1, 1, macos-12)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936265"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634734388"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "name": "linux-focal-rocm5.2-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936309"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634772323"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "name": "linux-focal-rocm5.2-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936353"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634772410"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "name": "macos-12-py3-arm64 / test (default, 1, 2, macos-m1-12)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936395"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634812657"
                               },
                               {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "name": "macos-12-py3-arm64 / test (default, 2, 2, macos-m1-12)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936426"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634812746"
                               },
                               {
-                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "name": "macos-12-py3-arm64-mps / Run MPS tests",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936483"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634812878"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936516"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634868761"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / build",
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936558"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634868884"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869012"
+                              },
+                              {
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869132"
+                              },
+                              {
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869240"
+                              },
+                              {
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (slow, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869348"
+                              },
+                              {
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (slow, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869457"
+                              },
+                              {
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (nogpu_AVX512, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869537"
+                              },
+                              {
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (nogpu_NO_AVX2, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869649"
+                              },
+                              {
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (jit_legacy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869743"
+                              },
+                              {
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869861"
+                              },
+                              {
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936633"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4634869984"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.6-py3 / test (default, 1, 5, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635049837"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.6-py3 / test (default, 2, 5, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635049935"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.6-py3 / test (default, 3, 5, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635050025"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "name": "win-vs2019-cuda11.6-py3 / test (default, 4, 5, windows.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936705"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635050129"
                               },
                               {
-                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "name": "win-vs2019-cuda11.6-py3 / test (default, 5, 5, windows.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936736"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635050234"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "name": "win-vs2019-cuda11.6-py3 / test (functorch, 1, 1, windows.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936756"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635050323"
                               },
                               {
-                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
-                                "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936796"
-                              },
+                                "name": "win-vs2019-cuda11.6-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351701/jobs/4635050460"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsWbDg=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2g="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "periodic"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351759"
+                          },
+                          "checkRuns": {
+                            "nodes": [
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "name": "ios-12-5-1-arm64-metal / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936823"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634504650"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928990551"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634504883"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "ios-12-5-1-arm64 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928990588"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634505024"
                               },
                               {
-                                "name": "linux-docs / build-docs (cpp)",
+                                "name": "buck-build-test / buck-build-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992832"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634505165"
                               },
                               {
-                                "name": "linux-docs / build-docs (python)",
+                                "name": "ios-12-5-1-arm64-coreml / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992868"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634505316"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7-debug / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992932"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634505521"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "libtorch-linux-bionic-cuda11.7-py3.7-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992965"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634505667"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.7-py3.7-gcc7-debug / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993011"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634505786"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-rocm5.2-py3.7-slow / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993042"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634506031"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993086"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634506209"
                               },
                               {
-                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-rocm5.2-py3.7-distributed / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993128"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634506353"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "win-vs2019-cuda11.7-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928995802"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634506550"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "ios-12-5-1-x86-64-coreml / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928995853"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634506968"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge)",
+                                "name": "ios-12-5-1-arm64-custom-ops / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928995889"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634507176"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-rocm5.2-py3.7-distributed / test (distributed, 1, 2, linux.rocm.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928997626"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634799214"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-focal-rocm5.2-py3.7-distributed / test (distributed, 2, 2, linux.rocm.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928999058"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634799342"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-focal-rocm5.2-py3.7-slow / test (slow, 1, 1, linux.rocm.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928999075"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634800216"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
+                                "name": "linux-bionic-cuda10.2-py3.9-gcc7 / test (multigpu, 1, 1, linux.16xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929012407"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634896194"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7-debug / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929012438"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634955955"
                               },
                               {
-                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7-debug / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929012469"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634956066"
                               },
                               {
-                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7-debug / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929034328"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634956160"
                               },
                               {
-                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7-debug / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929034340"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634956251"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-bionic-cuda11.7-py3.7-gcc7-debug / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929040801"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634987167"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-bionic-cuda11.7-py3.7-gcc7-debug / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929045939"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634987289"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-bionic-cuda11.7-py3.7-gcc7-debug / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929046016"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634987406"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "name": "linux-bionic-cuda11.7-py3.7-gcc7-debug / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929046063"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4634987543"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "name": "win-vs2019-cuda11.7-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929082254"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4635020787"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                                "name": "win-vs2019-cuda11.7-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929082275"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4635020896"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "name": "win-vs2019-cuda11.7-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929157614"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4635021008"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "name": "linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929157635"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4635184380"
                               },
                               {
-                                "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
+                                "name": "linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929157656"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351759/jobs/4635184472"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHxIT4=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsZHek=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkGU="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS_k="
                       }
                     ],
                     "pageInfo": {
                       "hasNextPage": false
                     }
-                  },
-                  "status": null,
-                  "pushedDate": "2022-03-24T00:42:33Z",
-                  "oid": "6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4"
+                  }
                 }
               }
             ]
-          },
-          "changedFiles": 1,
-          "files": {
+          }
+        }
+      }
+    }
+  },
+  "query_sha=4c16925415d1fcc12ac0f5f7ce73b8e6122997d2f51c4c2757c2543e6493c60d cr_cursor=Y3Vyc29yOnYyOpHPAAAAAdqZ2fA= cs_cursor=Y3Vyc29yOnYyOpHPAAAAAdioqXw= name=pytorch number=79694 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "commits": {
             "nodes": [
               {
-                "path": "torch/nn/cpp.py"
+                "commit": {
+                  "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce",
+                  "checkSuites": {
+                    "nodes": [
+                      {
+                        "checkRuns": {
+                          "nodes": [
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856668"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856772"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856812"
+                            },
+                            {
+                              "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856867"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628858900"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628858948"
+                            },
+                            {
+                              "name": "win-vs2019-cpu-py3 / test (functorch, 1, 1, windows.4xlarge)",
+                              "conclusion": "SUCCESS",
+                              "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628859006"
+                            }
+                          ],
+                          "pageInfo": {
+                            "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqZ5lE=",
+                            "hasNextPage": false
+                          }
+                        }
+                      }
+                    ]
+                  }
+                }
               }
-            ],
-            "pageInfo": {
-              "endCursor": "MQ",
-              "hasNextPage": false
-            }
-          },
-          "reviews": {
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=None name=pytorch-dev-infra org=pytorch": {
+    "data": {
+      "organization": {
+        "team": {
+          "members": {
             "nodes": [
               {
-                "author": {
-                  "login": "seemethere"
-                },
-                "state": "APPROVED"
-              }
-            ],
-            "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMy0yM1QxNTo1MDo0NS0wNzowMLkyMDIyLTAzLTIzVDE1OjUwOjQ1LTA3OjAwzjbPEDg=",
-              "hasPreviousPage": false
-            }
-          },
-          "comments": {
-            "nodes": [
+                "login": "kit1980"
+              },
+              {
+                "login": "huydhn"
+              },
+              {
+                "login": "seemethere"
+              },
+              {
+                "login": "malfet"
+              },
+              {
+                "login": "DanilBaibak"
+              },
+              {
+                "login": "ZainRizvi"
+              },
+              {
+                "login": "jeanschmidt"
+              },
+              {
+                "login": "atalman"
+              },
               {
-                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/74649\n\u21a9\ufe0f \u00a0[fb-only] Re-run with SSH instructions\nNeed help or want to give feedback on the CI? Visit our office hours\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 6c3c3de (more details on the Dr. CI page):\n\n\n1/1 failures introduced in this PR\n\n\n1 failure not recognized by patterns:\n\n\n\nJob\nStep\nAction\n\n\n\n\n Lint / flake8-py3\nFail if there were any warnings\n\ud83d\udd01 rerun\n\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
-                "createdAt": "2022-03-23T22:40:51Z",
-                "author": {
-                  "login": "facebook-github-bot"
-                },
-                "authorAssociation": "MEMBER",
-                "editor": {
-                  "login": "facebook-github-bot"
-                },
-                "databaseId": 1076891218
+                "login": "osalpekar"
+              },
+              {
+                "login": "clee2000"
+              },
+              {
+                "login": "izaitsevfb"
+              },
+              {
+                "login": "weiwangmeta"
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOQDAOUg==",
-              "hasPreviousPage": false
+              "hasNextPage": false,
+              "endCursor": "Y3Vyc29yOnYyOpHOBoQSVA=="
             }
-          },
-          "labels": {
-            "edges": [
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              }
-            ]
           }
         }
       }
     }
   },
-  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=79694 owner=pytorch": {
+  "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=None name=qwertyuiop org=pytorch": {
+    "data": {
+      "organization": {
+        "team": null
+      }
+    }
+  },
+  "query_sha=41327b4a6ac68efcb80ddbf3f4155a15906baac6c9c304aa4674e7b60f0c046f name=pytorch number=95233 owner=pytorch": {
     "data": {
       "repository": {
         "pullRequest": {
-          "closed": true,
+          "closed": false,
           "isCrossRepository": true,
           "author": {
-            "login": "kshitij12345"
+            "login": "huydhn"
           },
-          "title": "[complex] conv_transpose1d",
-          "body": "Reference: https://github.com/pytorch/pytorch/issues/71108",
-          "headRefName": "develop/complex/conv_transpose1d",
+          "title": "Build Triton in Docker image",
+          "body": "See a bunch of timeout error when trying to clone and build Triton today https://hud.pytorch.org/pytorch/pytorch/commit/c6d8d10b3e974019dae7ec91a85c6192c6d511fa, so let's build triton as part of the Docker image.\r\n\r\n* The pinned commit file is moved to the Docker context at `.ci/docker/ci_commit_pins/triton.txt`, and `.github/ci_commit_pins/triton.txt` is now a soft link pointing to it\r\n* New Docker images are built whenever the pinned commit is updated\r\n* The build logic is in `.ci/docker/common/install_triton.sh` which copies `install_triton` step in the CI.  The latter can be removed in a separate PR after this one\r\n",
+          "headRefName": "build-triton-in-docker",
           "headRepository": {
-            "nameWithOwner": "kshitij12345/pytorch"
+            "nameWithOwner": "huydhn/pytorch"
           },
           "baseRefName": "master",
           "baseRepository": {
@@ -36276,164 +26868,332 @@
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kshitij12345"
+                      "login": "huydhn"
                     },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
                   },
-                  "oid": "d1ea948e65ac6d31ad056287ab65d38ecc68b30d"
+                  "oid": "a20c0fd79db9df12e9082bbed66bb43c5f51e725"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kshitij12345"
+                      "login": "huydhn"
                     },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
                   },
-                  "oid": "b4ba1db9a3a71bd8c03158dcd1b68711360633d8"
+                  "oid": "de62417694429836573f0272b1f3d3ffbde9ffea"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kshitij12345"
+                      "login": "huydhn"
                     },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
                   },
-                  "oid": "655a4220beae163bfe578f0318a130df01ec05d6"
+                  "oid": "d7fe0c2483cdb875bf4107bdc8db28174de823d9"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kshitij12345"
+                      "login": "huydhn"
                     },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "Kshiteej K"
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
                   },
-                  "oid": "8181716be7a8005eb13ad5c3f2e1279ed1c60aff"
+                  "oid": "99d30d1533d7ed648ab81c8c6d8270ae8a79d73b"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kshitij12345"
+                      "login": "huydhn"
                     },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
                   },
-                  "oid": "9e5ca3663e7471786eeebebfdf84aea5d761712f"
+                  "oid": "d5c163cc7b410e2ed5f255449f84104fa19fb6bf"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kshitij12345"
+                      "login": "huydhn"
                     },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
                   },
-                  "oid": "9c110f39bcdc4e56386b6f9c4e2c082c8940ade6"
+                  "oid": "217ffd8641ac09c5669e81f3a74c36a2170093d9"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kshitij12345"
+                      "login": "huydhn"
                     },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
                   },
-                  "oid": "49315e79d0eee8008e2a74575c6fc0f6a9531ee4"
+                  "oid": "76a7f199265fcd2aa73e5c911c8e1847e26e0f3c"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kshitij12345"
+                      "login": "huydhn"
                     },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
                   },
-                  "oid": "728752480760226270c374a0acc08e28b9b133f3"
+                  "oid": "b6ad1fffe50aadb6df70b47439d71911258c15c5"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kshitij12345"
+                      "login": "huydhn"
                     },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
                   },
-                  "oid": "ffe43399d6f60ef7844523a5f465c11d9a67062f"
+                  "oid": "ac94bacc65c7f4ee8bfa8a9c9d1b4ebd31eae97b"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kshitij12345"
+                      "login": "huydhn"
                     },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
                   },
-                  "oid": "9672a2198472567bae4ac6f55d004f7e1fa8a9fa"
+                  "oid": "f02e99dffa12e04b78bc85277dba9bc99422222f"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kshitij12345"
+                      "login": "huydhn"
                     },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
                   },
-                  "oid": "48a0ebf32b895286f036b36c871f671dc867e400"
+                  "oid": "94193a531033e4691ee26d799303e5472ac80870"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kshitij12345"
+                      "login": "huydhn"
                     },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
                   },
-                  "oid": "52fbe80d5c8a94e03d816c0bd21fd82019dcd5ac"
+                  "oid": "faff468b295a0383838452f880970c39e2b1451a"
                 }
               },
               {
                 "commit": {
                   "author": {
                     "user": {
-                      "login": "kshitij12345"
+                      "login": "huydhn"
                     },
-                    "email": "kshitijkalambarkar@gmail.com",
-                    "name": "kshitij12345"
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
                   },
-                  "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce"
+                  "oid": "a66b52be3e022e4c385c84dc20722a5c2cd8616d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "64ed5e93241b80dc95319ca40675fae77fddba3c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "6398ad76a3e515225794acadfb63c72e8dbd0f51"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "c3354a165fe1eb48a73db0dd763c4230d2996d0a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "83b23ec908153078b51f574372d69f26098d6ae7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "458847ccd8e84bc6ecd8ce27238d2b7ccda2de89"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "be589fa6b913133897b3e1f3a9c6d03ce168e756"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "ff8fb3e6c4b6205f1859fcbb9c7d2aab041fd4c2"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "f97ede4b15945a36c9c7a7747f8a9e5a2dbf3ea0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "69764ca7ed532c0f6c4c9db8b28a4510b1d651fd"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "ef904cdbeee52be86e68f8c2c80c3c99389c4864"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "f61b818c91d0373c9042f5cc1684742f5d94d2b3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "5ee361891099e21ee92618edbed481f6917b8354"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "3d010ea834253e386b2e068faa8cc4bcfe63b3df"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "huydhn"
+                    },
+                    "email": "huydhn@gmail.com",
+                    "name": "Huy Do"
+                  },
+                  "oid": "2b158d601260c45533fee8a0d270410cbb910a14"
                 }
               }
             ],
             "pageInfo": {
-              "endCursor": "MTM",
+              "endCursor": "Mjc",
               "hasNextPage": false
             },
-            "totalCount": 13
+            "totalCount": 27
           },
           "commits": {
             "nodes": [
@@ -36441,34 +27201,157 @@
                 "commit": {
                   "checkSuites": {
                     "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Labeler"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4287687107"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "triage",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687107/jobs/7468837675"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAArXNQ1U=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAp3xdEU="
+                      },
                       {
                         "node": {
                           "app": {
                             "name": "Facebook GitHub Tools",
                             "databaseId": 12274
                           },
-                          "workflowRun": null,
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Meta Internal-Only Changes Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://opensource.facebook.com/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAArXNQis=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAp3xdGU="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "docker-builds"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4287687247"
+                          },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "Facebook CLA Check",
+                                "name": "docker-build (pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://code.facebook.com/cla/"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469660514"
                               },
                               {
-                                "name": "Meta Internal-Only Changes Check",
+                                "name": "docker-build (pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://opensource.facebook.com/"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469660679"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469660861"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-bionic-py3.8-clang9)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469661044"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-bionic-py3.11-clang9)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469661228"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-focal-rocm-n-1-py3)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469661427"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-focal-rocm-n-py3)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469661686"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469661930"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469662096"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469662317"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-focal-py3-clang7-android-ndk-r19c)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469662567"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-focal-py3.8-gcc7)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469662734"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-focal-py3-clang7-asan)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469662890"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-focal-py3-clang10-onnx)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469663024"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-focal-linter)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687247/jobs/7469663177"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdtq8Hc=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAArXeS18=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqFo="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAp3xdeE="
                       },
                       {
                         "node": {
@@ -36480,24 +27363,24 @@
                             "workflow": {
                               "name": "TorchBench CI (pytorch-linux-py3.8-cu116)"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2907393316"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4287687250"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
                                 "name": "run-torchbench",
                                 "conclusion": "NEUTRAL",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393316/jobs/4628529923"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687250/jobs/7468837956"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqTEwk=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAArXNRQ4=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SKIPPED"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqXs="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAp3xdec="
                       },
                       {
                         "node": {
@@ -36507,66 +27390,95 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "Lint"
+                              "name": "Check Labels"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2907393315"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4287687254"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "lintrunner",
+                                "name": "Check labels",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628529910"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687254/jobs/7468837977"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAArXNRSI=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAp3xdes="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Build Triton wheels"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4287687251"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Build Triton Wheel (3.8)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687251/jobs/7468837960"
                               },
                               {
-                                "name": "quick-checks",
+                                "name": "Build Triton Wheel (3.9)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530162"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687251/jobs/7468838140"
                               },
                               {
-                                "name": "Test collect_env (with_torch)",
+                                "name": "Build Triton Wheel (3.10)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530698"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687251/jobs/7468838254"
                               },
                               {
-                                "name": "Test collect_env (without_torch)",
+                                "name": "Build Triton Wheel (3.11)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530867"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687251/jobs/7468838429"
                               },
                               {
-                                "name": "Test collect_env (older_python_version)",
+                                "name": "Build Triton Conda (3.8)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530989"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687251/jobs/7468838522"
                               },
                               {
-                                "name": "pr-sanity-checks",
+                                "name": "Build Triton Conda (3.9)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531151"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687251/jobs/7468838620"
                               },
                               {
-                                "name": "workflow-checks",
+                                "name": "Build Triton Conda (3.10)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531475"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687251/jobs/7468838728"
                               },
                               {
-                                "name": "Test tools",
+                                "name": "Build Triton Conda (3.11)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531753"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687251/jobs/7468838830"
                               },
                               {
-                                "name": "toc",
+                                "name": "upload-wheel",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531853"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687251/jobs/7468930304"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqTHFY=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAArXPMso=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqXw="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAp3xde0="
                       },
                       {
                         "node": {
@@ -36576,407 +27488,345 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "pull"
+                              "name": "Lint"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2907393329"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4287687315"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "linux-focal-py3.7-clang7-asan / build",
+                                "name": "pr-sanity-checks",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531149"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687315/jobs/7469173102"
                               },
                               {
-                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / build",
+                                "name": "Test collect_env (with_torch)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531473"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687315/jobs/7469173222"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "name": "Test collect_env (without_torch)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531754"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687315/jobs/7469173322"
                               },
                               {
-                                "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
+                                "name": "Test collect_env (older_python_version)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531857"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687315/jobs/7469173425"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7-pch / build",
+                                "name": "docker-image / calculate-docker-image",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532179"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687315/jobs/7469173540"
                               },
                               {
-                                "name": "linux-focal-py3.7-clang10-onnx / build",
+                                "name": "lintrunner / linux-job",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532543"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687315/jobs/7469178039"
                               },
                               {
-                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "name": "toc / linux-job",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532694"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687315/jobs/7469178234"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7 / build",
+                                "name": "Test tools / linux-job",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532918"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687315/jobs/7469178442"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "name": "workflow-checks / linux-job",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533033"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687315/jobs/7469178629"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7-no-ops / build",
+                                "name": "quick-checks / linux-job",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533181"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687315/jobs/7469178859"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAArXUWtg=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAp3xdmE="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            },
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4287687310"
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471234457"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "name": "win-vs2019-cpu-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533420"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471234651"
                               },
                               {
-                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "name": "linux-focal-py3-clang7-mobile-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533630"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471234779"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533825"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471234912"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "name": "win-vs2019-cuda11.7-py3 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533959"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471235053"
                               },
                               {
-                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "name": "linux-focal-py3-clang7-mobile-custom-build-static / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534129"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471235249"
                               },
                               {
-                                "name": "linux-bionic-py3_7-clang8-xla / build",
+                                "name": "linux-focal-py3.9-clang7-asan / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534256"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471235371"
                               },
                               {
-                                "name": "linux-focal-rocm5.2-py3.7 / build",
+                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534388"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471235486"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
+                                "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534571"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471235614"
                               },
                               {
-                                "name": "linux-bionic-cuda11_6-py3_10-gcc7-deploy / build",
+                                "name": "linux-bionic-py3.8-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534714"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471235722"
                               },
                               {
-                                "name": "win-vs2019-cuda11.6-py3 / build",
+                                "name": "linux-focal-py3.8-gcc7-no-ops / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534989"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471235818"
                               },
                               {
-                                "name": "win-vs2019-cpu-py3 / build",
+                                "name": "linux-focal-py3.8-gcc7-mobile-lightweight-dispatch-build / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628535311"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471235929"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-jammy-cuda11.7-cudnn8-py3.8-clang12 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639115"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471236032"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-bionic-py3.11-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639198"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471236175"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-py3.8-gcc7-pch / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639265"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471236290"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
+                                "name": "linux-vulkan-bionic-py3.11-clang9 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639339"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471236405"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-py3.8-gcc7 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639395"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471236490"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-rocm5.4.2-py3.8 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639450"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471236553"
                               },
                               {
-                                "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7-sm86 / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639509"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471236633"
                               },
                               {
-                                "name": "linux-docs / build-docs (cpp)",
+                                "name": "linux-bionic-py3_8-clang8-xla / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639572"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471236725"
                               },
                               {
-                                "name": "linux-docs / build-docs (python)",
+                                "name": "linux-focal-py3.8-clang10-onnx / build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639635"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471237108"
                               },
                               {
-                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647047"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471237385"
                               },
                               {
-                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "name": "win-vs2019-cpu-py3 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647119"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471237460"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "name": "linux-bionic-py3.8-clang9 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647215"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471237549"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "name": "linux-vulkan-bionic-py3.11-clang9 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647277"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471237660"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "name": "linux-docs / build-docs-cpp-false",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647348"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471237762"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "name": "linux-docs / build-docs-python-false",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647432"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471237814"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
+                                "name": "linux-docs / build-docs-functorch-false",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647522"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471237886"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
+                                "name": "linux-focal-py3.9-clang7-asan / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647641"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471237977"
                               },
                               {
-                                "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
+                                "name": "linux-focal-py3.8-gcc7 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647762"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471238039"
                               },
                               {
-                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "name": "linux-bionic-py3_8-clang8-xla / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628653797"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471238118"
                               },
                               {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7-sm86 / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679376"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471238229"
                               },
                               {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
+                                "name": "linux-focal-py3.8-clang10-onnx / filter",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679431"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471238358"
                               },
                               {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679469"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471238700"
                               },
                               {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679519"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471238775"
                               },
                               {
-                                "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679594"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471238847"
                               },
                               {
-                                "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628681226"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471238910"
                               },
                               {
-                                "name": "linux-bionic-cuda11_6-py3_10-gcc7-deploy / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628854932"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471238985"
                               },
                               {
-                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 2, 3, linux.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856434"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471239052"
                               },
                               {
-                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 3, 3, linux.8xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856501"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471239167"
                               },
                               {
-                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471239243"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856575"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqZ2fA=",
-                              "hasNextPage": true
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqZs="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "windows-binary-libtorch-debug"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351637"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471239302"
+                              },
                               {
-                                "name": "libtorch-cpu-shared-with-deps-debug-build",
+                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351637/jobs/4634503587"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471239371"
                               },
                               {
-                                "name": "libtorch-cpu-shared-with-deps-debug-test",
+                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351637/jobs/4635312938"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsbsmM=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuA="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "windows-binary-wheel"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351640"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471239429"
+                              },
                               {
-                                "name": "wheel-py3_7-cuda11_3-build",
+                                "name": "win-vs2019-cpu-py3 / test (functorch, 1, 1, windows.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351640/jobs/4634503571"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471239486"
                               },
                               {
-                                "name": "wheel-py3_7-cuda11_3-test",
+                                "name": "linux-focal-py3.9-clang7-asan / test (default, 1, 5, linux.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351640/jobs/4636146265"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsskcw=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuM="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "windows-binary-libtorch-release"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351643"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471239790"
+                              },
                               {
-                                "name": "libtorch-cpu-shared-with-deps-release-build",
+                                "name": "linux-focal-py3.9-clang7-asan / test (default, 2, 5, linux.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351643/jobs/4634503570"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471239835"
                               },
                               {
-                                "name": "libtorch-cpu-shared-with-deps-release-test",
+                                "name": "linux-focal-py3.9-clang7-asan / test (default, 3, 5, linux.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351643/jobs/4635003925"
-                              }
-                            ],
-                            "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsVbD8=",
-                              "hasNextPage": false
-                            }
-                          },
-                          "conclusion": "SUCCESS"
-                        },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuU="
-                      },
-                      {
-                        "node": {
-                          "app": {
-                            "name": "GitHub Actions",
-                            "databaseId": 15368
-                          },
-                          "workflowRun": {
-                            "workflow": {
-                              "name": "linux-binary-libtorch-cxx11-abi"
-                            },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351698"
-                          },
-                          "checkRuns": {
-                            "nodes": [
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471239931"
+                              },
                               {
-                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build",
+                                "name": "linux-focal-py3.9-clang7-asan / test (default, 4, 5, linux.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351698/jobs/4634504079"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471240007"
                               },
                               {
-                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / build",
+                                "name": "linux-focal-py3.9-clang7-asan / test (default, 5, 5, linux.4xlarge)",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351698/jobs/4635072931"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287687310/jobs/7471240084"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsW5Aw=",
-                              "hasNextPage": false
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAArX9PME=",
+                              "hasNextPage": true
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2E="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAp3xdmQ="
                       },
                       {
                         "node": {
@@ -36986,31 +27836,31 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-binary-libtorch-pre-cxx11"
+                              "name": "windows-binary-libtorch-release"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351700"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4287688037"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build",
+                                "name": "libtorch-cpu-shared-with-deps-release-build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351700/jobs/4634503897"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287688037/jobs/7468840010"
                               },
                               {
-                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / build",
+                                "name": "libtorch-cpu-shared-with-deps-release-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351700/jobs/4635077148"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287688037/jobs/7469438193"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsW-jo=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAArXZtk0=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2I="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAp3xf-4="
                       },
                       {
                         "node": {
@@ -37020,59 +27870,88 @@
                           },
                           "workflowRun": {
                             "workflow": {
-                              "name": "linux-binary-manywheel"
+                              "name": "windows-binary-libtorch-debug"
                             },
-                            "url": "https://github.com/pytorch/pytorch/actions/runs/2910351699"
+                            "url": "https://github.com/pytorch/pytorch/actions/runs/4287688040"
                           },
                           "checkRuns": {
                             "nodes": [
                               {
-                                "name": "manywheel-py3_7-cuda10_2-build / build",
+                                "name": "libtorch-cpu-shared-with-deps-debug-build",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351699/jobs/4634503896"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287688040/jobs/7468840035"
                               },
                               {
-                                "name": "manywheel-py3_7-cuda10_2-test / build",
+                                "name": "libtorch-cpu-shared-with-deps-debug-test",
                                 "conclusion": "SUCCESS",
-                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351699/jobs/4635934290"
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/4287688040/jobs/7469838838"
                               }
                             ],
                             "pageInfo": {
-                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsoMEA=",
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAArXh0IA=",
                               "hasNextPage": false
                             }
                           },
                           "conclusion": "SUCCESS"
                         },
-                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2M="
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAp3xf_g="
                       }
                     ],
                     "pageInfo": {
                       "hasNextPage": true
                     }
                   },
-                  "status": null,
-                  "pushedDate": "2022-08-22T22:04:19Z",
-                  "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce"
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "EasyCLA",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://easycla.lfx.linuxfoundation.org/#/?version=2"
+                      }
+                    ]
+                  },
+                  "pushedDate": "2023-02-27T23:07:01Z",
+                  "oid": "2b158d601260c45533fee8a0d270410cbb910a14"
                 }
               }
             ]
           },
-          "changedFiles": 3,
+          "changedFiles": 9,
           "files": {
             "nodes": [
               {
-                "path": "aten/src/ATen/native/Convolution.cpp"
+                "path": ".ci/docker/build.sh"
               },
               {
-                "path": "torch/testing/_internal/common_methods_invocations.py"
+                "path": ".ci/docker/ci_commit_pins/triton.txt"
               },
               {
-                "path": "torch/testing/_internal/common_modules.py"
+                "path": ".ci/docker/common/common_utils.sh"
+              },
+              {
+                "path": ".ci/docker/common/install_triton.sh"
+              },
+              {
+                "path": ".ci/docker/requirements-ci.txt"
+              },
+              {
+                "path": ".ci/docker/ubuntu-cuda/Dockerfile"
+              },
+              {
+                "path": ".ci/docker/ubuntu/Dockerfile"
+              },
+              {
+                "path": ".github/ci_commit_pins/triton.txt"
+              },
+              {
+                "path": ".github/ci_commit_pins/triton.txt"
+              },
+              {
+                "path": ".github/workflows/build-triton-wheel.yml"
               }
             ],
             "pageInfo": {
-              "endCursor": "Mw",
+              "endCursor": "MTA",
               "hasNextPage": false
             }
           },
@@ -37080,71 +27959,101 @@
             "nodes": [
               {
                 "author": {
-                  "login": "ngimel"
+                  "login": "weiwangmeta"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "weiwangmeta"
+                },
+                "state": "APPROVED"
+              },
+              {
+                "author": {
+                  "login": "huydhn"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "weiwangmeta"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "malfet"
                 },
                 "state": "APPROVED"
+              },
+              {
+                "author": {
+                  "login": "huydhn"
+                },
+                "state": "COMMENTED"
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNy0xOVQxMDowNzo1NC0wNzowMLkyMDIyLTA3LTE5VDEwOjA3OjU0LTA3OjAwzj43QcY=",
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMy0wMi0yM1QxMjoyOTo0Ny0wODowMLkyMDIzLTAyLTIzVDEyOjI5OjQ3LTA4OjAwzk40_3I=",
               "hasPreviousPage": false
             }
           },
           "comments": {
             "nodes": [
               {
-                "bodyText": "@pytorchbot merge -g\nAll is green internally!",
-                "createdAt": "2022-08-23T19:29:55Z",
+                "bodyText": "Per discussion with @weiwangmeta, this would be rolled out after finalize the RC on Feb 27th.",
+                "createdAt": "2023-02-23T22:43:41Z",
                 "author": {
-                  "login": "albanD"
+                  "login": "huydhn"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1224702749
+                "databaseId": 1442527935
               },
               {
-                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here.\nThe merge job was triggered with the green (-g) flag. This means that your change will be merged once all checks on your PR have passed (ETA: 0-4 Hours). If this is not the intended behavior, feel free to use some of the other merge options in the wiki.\nPlease reach out to the PyTorch DevX Team with feedback or questions!",
-                "createdAt": "2022-08-23T19:31:18Z",
+                "bodyText": "@pytorchbot merge",
+                "createdAt": "2023-02-28T16:59:45Z",
                 "author": {
-                  "login": "pytorchmergebot"
+                  "login": "huydhn"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1224705564
+                "databaseId": 1448528832
               },
               {
-                "bodyText": "Thanks for looking into it \ud83d\ude42 @albanD @jeanschmidt",
-                "createdAt": "2022-08-23T19:34:36Z",
+                "bodyText": "Merge failed\nReason: Changed file count mismatch\nDetails for Dev Infra team\nRaised by workflow job",
+                "createdAt": "2023-02-28T17:09:37Z",
                 "author": {
-                  "login": "kshitij12345"
+                  "login": "pytorchmergebot"
                 },
-                "authorAssociation": "COLLABORATOR",
+                "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1224712351
+                "databaseId": 1448547222
               },
               {
-                "bodyText": "Hey @kshitij12345.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
-                "createdAt": "2022-08-23T22:31:58Z",
+                "bodyText": "Merge failed\nReason: 'GitHubPR' object has no attribute 'changed_file'",
+                "createdAt": "2023-02-28T18:03:23Z",
                 "author": {
-                  "login": "github-actions"
+                  "login": "huydhn"
                 },
-                "authorAssociation": "NONE",
+                "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1224956051
+                "databaseId": 1448634513
               },
               {
-                "bodyText": "Yeah, discussed with my manager and I got the required permissions to do so. Sorry for not responding promptly yesterday. But I am available from now on to provide assistance :)",
-                "createdAt": "2022-08-24T09:24:04Z",
+                "bodyText": "Merge failed\nReason: 'GitHubPR' object has no attribute 'changed_file'",
+                "createdAt": "2023-02-28T18:04:33Z",
                 "author": {
-                  "login": "jeanschmidt"
+                  "login": "huydhn"
                 },
                 "authorAssociation": "MEMBER",
                 "editor": null,
-                "databaseId": 1225462612
+                "databaseId": 1448636158
               }
             ],
             "pageInfo": {
-              "startCursor": "Y3Vyc29yOnYyOpHOSP97HQ==",
+              "startCursor": "Y3Vyc29yOnYyOpHOVfs6vw==",
               "hasPreviousPage": true
             }
           },
@@ -37152,32 +28061,17 @@
             "edges": [
               {
                 "node": {
-                  "name": "open source"
-                }
-              },
-              {
-                "node": {
-                  "name": "Merged"
-                }
-              },
-              {
-                "node": {
-                  "name": "cla signed"
-                }
-              },
-              {
-                "node": {
-                  "name": "Reverted"
+                  "name": "ciflow/trunk"
                 }
               },
               {
                 "node": {
-                  "name": "ciflow/trunk"
+                  "name": "topic: not user facing"
                 }
               },
               {
                 "node": {
-                  "name": "ciflow/periodic"
+                  "name": "ciflow/inductor"
                 }
               }
             ]
diff --git a/.github/scripts/label_utils.py b/.github/scripts/label_utils.py
new file mode 100644
index 000000000000..1fd32eb5ff7a
--- /dev/null
+++ b/.github/scripts/label_utils.py
@@ -0,0 +1,93 @@
+"""GitHub Label Utilities."""
+
+import json
+
+from functools import lru_cache
+from typing import List, Any, Tuple, TYPE_CHECKING, Union
+from urllib.request import urlopen, Request
+
+from github_utils import (
+    GitHubComment,
+    gh_fetch_json,
+)
+
+# TODO: this is a temp workaround to avoid circular dependencies,
+#       and should be removed once GitHubPR is refactored out of trymerge script.
+if TYPE_CHECKING:
+    from trymerge import GitHubPR
+
+BOT_AUTHORS = ["github-actions", "pytorchmergebot", "pytorch-bot"]
+
+LABEL_ERR_MSG_TITLE = "This PR needs a label"
+LABEL_ERR_MSG = f"""# {LABEL_ERR_MSG_TITLE}
+    If your changes are user facing and intended to be a part of release notes, please use a label starting with `release notes:`.
+
+    If not, please add the `topic: not user facing` label.
+    For more information, see
+    https://github.com/pytorch/pytorch/wiki/PyTorch-AutoLabel-Bot#why-categorize-for-release-notes-and-how-does-it-work.
+"""
+
+# Modified from https://github.com/pytorch/pytorch/blob/b00206d4737d1f1e7a442c9f8a1cadccd272a386/torch/hub.py#L129
+def _read_url(url: Request) -> Tuple[Any, Any]:
+    with urlopen(url) as r:
+        return r.headers, r.read().decode(r.headers.get_content_charset('utf-8'))
+
+
+def request_for_labels(url: str) -> Tuple[Any, Any]:
+    headers = {'Accept': 'application/vnd.github.v3+json'}
+    return _read_url(Request(url, headers=headers))
+
+
+def update_labels(labels: List[str], info: str) -> None:
+    labels_json = json.loads(info)
+    labels.extend([x["name"] for x in labels_json])
+
+
+def get_last_page_num_from_header(header: Any) -> int:
+    # Link info looks like: <https://api.github.com/repositories/65600975/labels?per_page=100&page=2>;
+    # rel="next", <https://api.github.com/repositories/65600975/labels?per_page=100&page=3>; rel="last"
+    link_info = header['link']
+    prefix = "&page="
+    suffix = ">;"
+    return int(link_info[link_info.rindex(prefix) + len(prefix):link_info.rindex(suffix)])
+
+
+@lru_cache()
+def gh_get_labels(org: str, repo: str) -> List[str]:
+    prefix = f"https://api.github.com/repos/{org}/{repo}/labels?per_page=100"
+    header, info = request_for_labels(prefix + "&page=1")
+    labels: List[str] = []
+    update_labels(labels, info)
+
+    last_page = get_last_page_num_from_header(header)
+    assert last_page > 0, "Error reading header info to determine total number of pages of labels"
+    for page_number in range(2, last_page + 1):  # skip page 1
+        _, info = request_for_labels(prefix + f"&page={page_number}")
+        update_labels(labels, info)
+
+    return labels
+
+
+def gh_add_labels(org: str, repo: str, pr_num: int, labels: Union[str, List[str]]) -> None:
+    gh_fetch_json(
+        f'https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/labels',
+        data={"labels": labels},
+    )
+
+
+def get_release_notes_labels(org: str, repo: str) -> List[str]:
+    return [label for label in gh_get_labels(org, repo) if label.lstrip().startswith("release notes:")]
+
+
+def has_required_labels(pr: "GitHubPR") -> bool:
+    pr_labels = pr.get_labels()
+    # Check if PR is not user facing
+    is_not_user_facing_pr = any(label.strip() == "topic: not user facing" for label in pr_labels)
+    return (
+        is_not_user_facing_pr or
+        any(label.strip() in get_release_notes_labels(pr.org, pr.project) for label in pr_labels)
+    )
+
+
+def is_label_err_comment(comment: GitHubComment) -> bool:
+    return comment.body_text.lstrip(" #").startswith(LABEL_ERR_MSG_TITLE) and comment.author_login in BOT_AUTHORS
diff --git a/.github/scripts/rockset_mocks.json b/.github/scripts/rockset_mocks.json
new file mode 100644
index 000000000000..56dea53eae34
--- /dev/null
+++ b/.github/scripts/rockset_mocks.json
@@ -0,0 +1,3703 @@
+{
+  "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6 8972a9fe6aa8be8f8035c83094ed371973bfbe73": [
+    {
+      "workflow_name": "Lint",
+      "id": 10792635251,
+      "name": "workflow-checks",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:41:19Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873147/jobs/6811147335",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 11
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792782135,
+      "name": "Upload test stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }}",
+      "conclusion": "skipped",
+      "completed_at": "2023-01-21T03:00:54Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972943157/jobs/6811267740",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 0
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10792635109,
+      "name": "Test tools",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:43:38Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873147/jobs/6811147235",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 8
+    },
+    {
+      "workflow_name": "windows-binary-libtorch-release",
+      "id": 10792634843,
+      "name": "libtorch-cpu-shared-with-deps-release-build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:39:37Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873146/jobs/6811147030",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "sccache: error: couldn't connect to server"
+      ],
+      "steps": 18
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10792634869,
+      "name": "Test collect_env (without_torch)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:41:02Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873147/jobs/6811147054",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10792634832,
+      "name": "Test collect_env (with_torch)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:42:09Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873147/jobs/6811147021",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10792634981,
+      "name": "toc",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:41:12Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873147/jobs/6811147139",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 8
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792780797,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:00:53Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972943157/jobs/6811266701",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792673360,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:45:08Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972890028/jobs/6811179470",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792673308,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:45:13Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972890028/jobs/6811179424",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10792634920,
+      "name": "Test collect_env (older_python_version)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:41:06Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873147/jobs/6811147089",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "You are using pip version 20.3.4, however version 22.3.1 is available."
+      ],
+      "steps": 9
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10792635296,
+      "name": "lintrunner",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:51:13Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873147/jobs/6811147373",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 11
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792712764,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:50:47Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972908804/jobs/6811211788",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Update viable/strict",
+      "id": 10792724917,
+      "name": "do_update_viablestrict",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:54:29Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972915344/jobs/6811221940",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792868985,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:10:35Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972985741/jobs/6811341670",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792694550,
+      "name": "Upload test stats for 3954288986, attempt 2",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:52:08Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972899274/jobs/6811196744",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "Validate and merge PR",
+      "id": 10792835074,
+      "name": "try_merge_pr_92734",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:07:42Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972968262/jobs/6811313079",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: 1 mandatory check(s) failed (Rule `superuser`).  The first few are:"
+      ],
+      "steps": 10
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792740803,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:54:39Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972923948/jobs/6811235442",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792869037,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:10:39Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972985741/jobs/6811341713",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792651510,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:42:58Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972881313/jobs/6811160982",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792780712,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:00:50Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972943157/jobs/6811266641",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792653457,
+      "name": "Upload test stats for 3971997968, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:45:30Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972881313/jobs/6811162657",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792651433,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:42:48Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972881313/jobs/6811160916",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10792635341,
+      "name": "pr-sanity-checks",
+      "conclusion": "skipped",
+      "completed_at": "2023-01-21T02:40:31Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873147/jobs/6811147406",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 0
+    },
+    {
+      "workflow_name": "windows-binary-libtorch-debug",
+      "id": 10793266810,
+      "name": "libtorch-cpu-shared-with-deps-debug-test",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:21:00Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873154/jobs/6811674722",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 18
+    },
+    {
+      "workflow_name": "windows-binary-libtorch-debug",
+      "id": 10792634849,
+      "name": "libtorch-cpu-shared-with-deps-debug-build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:08:35Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873154/jobs/6811147035",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "sccache: error: couldn't connect to server"
+      ],
+      "steps": 18
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792740754,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:54:34Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972923948/jobs/6811235396",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792742112,
+      "name": "Upload test stats for 3972261064, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:58:33Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972923948/jobs/6811236521",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "windows-binary-libtorch-release",
+      "id": 10793081469,
+      "name": "libtorch-cpu-shared-with-deps-release-test",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:50:54Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873146/jobs/6811521006",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835753781,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:12:00Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845951561",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792930423,
+      "name": "Upload test stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }}",
+      "conclusion": "skipped",
+      "completed_at": "2023-01-21T03:18:38Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973016881/jobs/6811393665",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 0
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792714281,
+      "name": "Upload test stats for 3972331499, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:53:32Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972908804/jobs/6811213054",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792675148,
+      "name": "Upload test stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }}",
+      "conclusion": "skipped",
+      "completed_at": "2023-01-21T02:45:14Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972890028/jobs/6811180903",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 0
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835639218,
+      "name": "linux-bionic-py3_7-clang8-xla / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:53:13Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845868475",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10792635181,
+      "name": "quick-checks",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:41:42Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873147/jobs/6811147286",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 13
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792928838,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:18:32Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973016881/jobs/6811392256",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792870296,
+      "name": "Upload test stats for 3971869981, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:16:43Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972985741/jobs/6811342759",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835621236,
+      "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:42:12Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845854914",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: incorrect results of backend <torch._dynamo.output_graph.WrapperBackend object at 0x7f4719fe3290>"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792804560,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:03:09Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972953931/jobs/6811286740",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835621653,
+      "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.4xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:19:58Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845855197",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835558326,
+      "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)",
+      "conclusion": "cancelled",
+      "completed_at": "2023-01-24T02:48:29Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845808750",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "##[error]The operation was canceled."
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835370289,
+      "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:43:52Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845670982",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792693300,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:47:59Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972899274/jobs/6811195673",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792693264,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:48:52Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972899274/jobs/6811195641",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835559007,
+      "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:00:13Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809280",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "Auto Request Review",
+      "id": 10835369799,
+      "name": "Auto Request Review",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:36:23Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169390/jobs/6845670628",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835552197,
+      "name": "linux-docs / build-docs-python-false",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:05:10Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845804069",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 15
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835371644,
+      "name": "linux-focal-py3.7-gcc7-no-ops / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:13:53Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671946",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792950322,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:21:20Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973028094/jobs/6811410425",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792928907,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:18:37Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973016881/jobs/6811392317",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792862823,
+      "name": "Upload test stats for 3971766848, attempt 2",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:12:26Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972981482/jobs/6811336524",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792712702,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:50:54Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972908804/jobs/6811211734",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792868178,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:10:28Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972985345/jobs/6811341001",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "TorchBench CI (pytorch-linux-py3.8-cu116)",
+      "id": 10835369854,
+      "name": "run-torchbench",
+      "conclusion": "skipped",
+      "completed_at": "2023-01-23T22:36:13Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169396/jobs/6845670670",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 0
+    },
+    {
+      "workflow_name": "Labeler",
+      "id": 10835369748,
+      "name": "triage",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:36:24Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169362/jobs/6845670588",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Update S3 HTML indices for download.pytorch.org",
+      "id": 10792660242,
+      "name": "update-html (whl/lts/1.8)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:44:00Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972884968/jobs/6811168279",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 4
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835752788,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:41:53Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845950836",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: CUDA error: device-side assert triggered"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835558540,
+      "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:49:13Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845808933",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: Cannot call sizes() on tensor with symbolic sizes/strides"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835372060,
+      "name": "linux-focal-py3.7-gcc7-pch / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:48:31Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845672239",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835371292,
+      "name": "win-vs2019-cpu-py3 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:22:25Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671699",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 14
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10835370201,
+      "name": "Test collect_env (without_torch)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:38:19Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845670917",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835753101,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T01:05:14Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845951052",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "AttributeError: 'TestCollectivesWithBaseClass' object has no attribute '_tls'"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835559545,
+      "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:27:18Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809712",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: incorrect results of backend <torch._dynamo.output_graph.WrapperBackend object at 0x7f5a7928d9d0>"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835370407,
+      "name": "win-vs2019-cuda11.6-py3 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:51:02Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671067",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 14
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10835370320,
+      "name": "Test collect_env (older_python_version)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:38:33Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845671001",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "You are using pip version 20.3.4, however version 22.3.1 is available."
+      ],
+      "steps": 9
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10835370412,
+      "name": "lintrunner",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:47:18Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845671075",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 11
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835371543,
+      "name": "linux-focal-py3-clang7-mobile-custom-build-static / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:44:31Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671874",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792950269,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:21:24Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973028094/jobs/6811410386",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Update S3 HTML indices for download.pytorch.org",
+      "id": 10792660170,
+      "name": "update-html (whl/nightly)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:02:11Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972884968/jobs/6811168210",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 4
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792788563,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:01:39Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972946613/jobs/6811273129",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10835370093,
+      "name": "Test collect_env (with_torch)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:40:44Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845670831",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835753595,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:10:29Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845951414",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835621101,
+      "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:07:32Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845854824",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835370795,
+      "name": "linux-focal-py3-clang7-mobile-build / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:43:48Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671341",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792742173,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:54:47Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972924707/jobs/6811236568",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792797462,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:02:31Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972950976/jobs/6811280738",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835558225,
+      "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:22:51Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845808668",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: incorrect results of backend <torch._dynamo.output_graph.WrapperBackend object at 0x7f2f27264b50>"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10835369945,
+      "name": "toc",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:38:34Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845670735",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 8
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835752656,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:13:29Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845950759",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792799766,
+      "name": "Upload test stats for 3972185507, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:05:56Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972950976/jobs/6811282754",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835559684,
+      "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:54:22Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809828",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "inductor",
+      "id": 10792968823,
+      "name": "cuda11.6-py3.10-gcc7-sm86 / test (inductor_torchbench, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:23:52Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873195/jobs/6811425988",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792761975,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:57:41Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972935009/jobs/6811252953",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792731367,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:53:22Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972918996/jobs/6811227472",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Update S3 HTML indices for download.pytorch.org",
+      "id": 10792659998,
+      "name": "update-html (whl)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:46:28Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972884968/jobs/6811168058",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 4
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835621389,
+      "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:18:55Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845855028",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: !schema.hasAnyAliasInfo() INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/aten/src/ATen/FunctionalizeFallbackKernel.cpp\":32, please report a bug to PyTorch. mutating and aliasing ops should all have codegen'd kernels"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10793225159,
+      "name": "win-vs2019-cuda11.6-py3 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:04:12Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811638443",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792986303,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:35:13Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811440870",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Create Release",
+      "id": 10792634818,
+      "name": "Create Release",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:42:59Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873148/jobs/6811147007",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835560720,
+      "name": "linux-vulkan-bionic-py3.7-clang9 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:48:25Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845810589",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792966915,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7-sm86 / test (slow, 2, 2, linux.g5.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:01:42Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424317",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: Loader error"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792833728,
+      "name": "linux-focal-py3.7-clang10-onnx / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:06:18Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811311961",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635717,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7-no-ops / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:25:10Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147694",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792912663,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:16:24Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973008282/jobs/6811378463",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792951661,
+      "name": "Upload test stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }}",
+      "conclusion": "skipped",
+      "completed_at": "2023-01-21T03:21:25Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973028094/jobs/6811411524",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 0
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792852683,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:08:28Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972977077/jobs/6811328004",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Close stale pull requests",
+      "id": 10792658274,
+      "name": "stale",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:44:01Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972884251/jobs/6811166542",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "inductor-A100-perf-smoke-test",
+      "id": 10792634986,
+      "name": "cuda11.6-py3.10-gcc7-sm80 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:23:19Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873198/jobs/6811147137",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635498,
+      "name": "caffe2-linux-focal-py3.7-gcc7 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:09:40Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147526",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635326,
+      "name": "macos-12-py3-x86-64-lite-interpreter / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:09:44Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147395",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10836206561,
+      "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T01:11:42Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6846294540",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Float"
+      ],
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835645296,
+      "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.4xlarge)",
+      "conclusion": "failure",
+      "completed_at": "2023-01-24T00:12:47Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845872827",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "[  FAILED  ] AtenXlaTensorTest.TestFrobeniusNormInDims"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792743645,
+      "name": "Upload test stats for 3972353676, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:57:39Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972924707/jobs/6811237830",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792874342,
+      "name": "linux-bionic-py3_7-clang8-xla / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:11:35Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811346203",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10835369823,
+      "name": "Test tools",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:40:43Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845670645",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 8
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792761944,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:57:39Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972935009/jobs/6811252927",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10835370542,
+      "name": "quick-checks",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:39:47Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845671156",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 13
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835753414,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 3, 3, linux.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:52:44Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845951282",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "AttributeError: 'DistElementwiseOpsTest' object has no attribute '_tls'"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "inductor",
+      "id": 10792968470,
+      "name": "cuda11.6-py3.10-gcc7-sm86 / test (inductor_huggingface, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:04:14Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873195/jobs/6811425673",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "Check Labels",
+      "id": 10835370532,
+      "name": "Check labels",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:38:37Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169600/jobs/6845671155",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 8
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10793104496,
+      "name": "win-vs2019-cpu-py3 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:44:55Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811539514",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792983414,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:26:30Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811438353",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792863618,
+      "name": "linux-focal-py3.7-gcc7 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:10:06Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811337210",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635277,
+      "name": "libtorch-linux-bionic-cuda11.6-py3.7-gcc7 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:10:19Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147355",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792732782,
+      "name": "Upload test stats for 3971865391, attempt 2",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:56:05Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972918996/jobs/6811228710",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792804444,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:03:13Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972953931/jobs/6811286636",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "inductor",
+      "id": 10792968426,
+      "name": "cuda11.6-py3.10-gcc7-sm86 / test (inductor, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:25:02Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873195/jobs/6811425629",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "/tmp/torchinductor_jenkins/ve/cve6njq56azxp75wdavy2zq7yor4h4u7lif5gtf6xwk6lgnbji6s.cpp:35:27: error: no matching function for call to 'atomic_add(bfloat16* __restrict__, float&)'"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792861172,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:09:36Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972981482/jobs/6811335083",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792986250,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:55:53Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811440827",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792967683,
+      "name": "linux-focal-rocm5.3-py3.8 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:24:13Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424967",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792848712,
+      "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:12:30Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811324837",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: Cannot call sizes() on tensor with symbolic sizes/strides"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635866,
+      "name": "linux-focal-py3-clang7-mobile-custom-build-static / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:55:36Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147797",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792852613,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:08:53Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972977077/jobs/6811327941",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792788620,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:01:43Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972946613/jobs/6811273177",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10793106674,
+      "name": "win-vs2019-cpu-py3 / test (functorch, 1, 1, windows.4xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:06:35Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811541260",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: The tensor has a non-zero number of elements, but its data is not allocated yet. Caffe2 uses a lazy allocation, so you will need to call mutable_data() or raw_mutable_data() to actually allocate memory."
+      ],
+      "steps": 18
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792966942,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7-sm86 / test (functorch, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:08:11Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424340",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "inductor-A100-perf-smoke-test",
+      "id": 10792967219,
+      "name": "cuda11.6-py3.10-gcc7-sm80 / test (test_inductor_torchbench_smoketest_perf, 1, 1, linux.gcp.a100)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:59:31Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873198/jobs/6811424560",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "curl: (22) The requested URL returned error:"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792854342,
+      "name": "Upload test stats for 3972353706, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:12:46Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972977077/jobs/6811329375",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792895667,
+      "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:49:19Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811364272",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: !schema.hasAnyAliasInfo() INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/aten/src/ATen/FunctionalizeFallbackKernel.cpp\":32, please report a bug to PyTorch. mutating and aliasing ops should all have codegen'd kernels"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "linux-binary-manywheel",
+      "id": 10792634980,
+      "name": "manywheel-py3_7-cuda11_6-build / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:56:11Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873204/jobs/6811147132",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 22
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835560228,
+      "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:52:50Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845810227",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: Cannot call sizes() on tensor with symbolic sizes/strides"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792869481,
+      "name": "Upload test stats for 3971706031, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:16:02Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972985345/jobs/6811342079",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792967360,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 2, 3, linux.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:11:32Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424681",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: hello"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10835370835,
+      "name": "pr-sanity-checks",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:39:26Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845671367",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "linux-binary-libtorch-cxx11-abi",
+      "id": 10792634990,
+      "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:13:38Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873197/jobs/6811147142",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 22
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835372424,
+      "name": "linux-bionic-py3_7-clang8-xla / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:52:50Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845672509",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10793229653,
+      "name": "win-vs2019-cuda11.6-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:16:16Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811642435",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792986038,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:46:01Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811440638",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792966783,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7-sm86 / test (default, 2, 4, linux.g5.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:29:16Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424197",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792866891,
+      "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:46:24Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811339915",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635788,
+      "name": "linux-bionic-py3_7-clang8-xla / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:11:10Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147735",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10836179619,
+      "name": "win-vs2019-cpu-py3 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:24:04Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6846274479",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835570854,
+      "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:17:56Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845817778",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835563929,
+      "name": "linux-focal-py3.7-clang10-onnx / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:48:41Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845812809",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10793229456,
+      "name": "win-vs2019-cuda11.6-py3 / test (default, 3, 5, windows.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T06:10:17Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811642264",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 18
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792967317,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:19:35Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424646",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792843879,
+      "name": "linux-vulkan-bionic-py3.7-clang9 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:07:34Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811320835",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792816643,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:04:22Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972959537/jobs/6811297140",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635978,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7-bazel-test / build-and-test",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:01:09Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147887",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 19
+    },
+    {
+      "workflow_name": "Lint",
+      "id": 10835370690,
+      "name": "workflow-checks",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:38:52Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169394/jobs/6845671269",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 11
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10836206839,
+      "name": "win-vs2019-cpu-py3 / test (functorch, 1, 1, windows.4xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:50:32Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6846294751",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: The tensor has a non-zero number of elements, but its data is not allocated yet. Caffe2 uses a lazy allocation, so you will need to call mutable_data() or raw_mutable_data() to actually allocate memory."
+      ],
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835559951,
+      "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:58:27Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845810034",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "AttributeError: 'builtin_function_or_method' object has no attribute '__code__'"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835372180,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7-bazel-test / build-and-test",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:56:28Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845672322",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 19
+    },
+    {
+      "workflow_name": "inductor",
+      "id": 10792968872,
+      "name": "cuda11.6-py3.10-gcc7-sm86 / test (inductor_distributed, 1, 1, linux.g5.12xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:35:30Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873195/jobs/6811426035",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792964223,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7-sm86 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:23:37Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811422110",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792848547,
+      "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:31:30Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811324688",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792731408,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:53:18Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972918996/jobs/6811227502",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10836206711,
+      "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T01:24:31Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6846294653",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835371404,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:59:36Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671779",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835371172,
+      "name": "linux-focal-py3.7-gcc7 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:47:28Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671612",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635808,
+      "name": "macos-12-py3-x86-64 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:47:10Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147753",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "inductor-A100-perf-smoke-test",
+      "id": 10792964678,
+      "name": "cuda11.6-py3.10-gcc7-sm80 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:23:43Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873198/jobs/6811422499",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792797570,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:02:32Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972950976/jobs/6811280835",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Check Labels",
+      "id": 10835369817,
+      "name": "Check labels",
+      "conclusion": "cancelled",
+      "completed_at": "2023-01-23T22:36:16Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169391/jobs/6845670642",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 0
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792936266,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:19:35Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973020865/jobs/6811398630",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792914105,
+      "name": "Upload test stats for 3972015418, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:19:39Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973008282/jobs/6811379678",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10793122279,
+      "name": "macos-12-py3-x86-64 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:47:53Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811554250",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792937537,
+      "name": "Upload test stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }}",
+      "conclusion": "skipped",
+      "completed_at": "2023-01-21T03:19:38Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973020865/jobs/6811399718",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 0
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792964483,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:23:39Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811422326",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792762532,
+      "name": "Upload test stats for 3972238542, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:01:06Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972935009/jobs/6811253382",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "Update viable/strict",
+      "id": 10792956069,
+      "name": "do_update_viablestrict",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:24:13Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973031316/jobs/6811415319",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792877635,
+      "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.4xlarge)",
+      "conclusion": "failure",
+      "completed_at": "2023-01-21T04:30:32Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811349082",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "[  FAILED  ] AtenXlaTensorTest.TestFrobeniusNormInDims"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792848412,
+      "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:42:36Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811324581",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835621534,
+      "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.4xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:24:40Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845855123",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Float"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792912609,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:16:27Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973008282/jobs/6811378416",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10793229601,
+      "name": "win-vs2019-cuda11.6-py3 / test (functorch, 1, 1, windows.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:50:59Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811642391",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 18
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10793125475,
+      "name": "macos-12-py3-x86-64 / test (default, 2, 3, macos-12)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:30:04Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811556834",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Float"
+      ],
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10793106598,
+      "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:35:26Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811541202",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792986488,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:38:30Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811441059",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792967244,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (nogpu_NO_AVX2, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:32:13Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424578",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: !schema.hasAnyAliasInfo() INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/aten/src/ATen/FunctionalizeFallbackKernel.cpp\":32, please report a bug to PyTorch. mutating and aliasing ops should all have codegen'd kernels"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792967142,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:21:13Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424497",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "Update S3 HTML indices for download.pytorch.org",
+      "id": 10792831904,
+      "name": "update-html (whl/lts/1.8)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:06:26Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972966607/jobs/6811310357",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 4
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792826789,
+      "name": "Upload test stats for 3972398611, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:09:42Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972962158/jobs/6811305969",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835566456,
+      "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:54:55Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845814609",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835370674,
+      "name": "linux-focal-py3.7-clang7-asan / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:51:19Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671251",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "Update S3 HTML indices for download.pytorch.org",
+      "id": 10792660094,
+      "name": "update-html (whl/test)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:44:32Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972884968/jobs/6811168141",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 4
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792936328,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:19:37Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973020865/jobs/6811398675",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "linux-binary-libtorch-cxx11-abi",
+      "id": 10792893058,
+      "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / test",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:23:19Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873197/jobs/6811361989",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "undefined reference to `c10::detail::torchInternalAssertFail(char const*, char const*, unsigned int, char const*, std::string const&)'"
+      ],
+      "steps": 21
+    },
+    {
+      "workflow_name": "linux-binary-libtorch-pre-cxx11",
+      "id": 10792936651,
+      "name": "libtorch-cpu-shared-with-deps-pre-cxx11-test / test",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:30:13Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873199/jobs/6811398949",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 21
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635460,
+      "name": "linux-focal-py3.7-gcc7-pch / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:09:23Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147500",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635552,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:23:16Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147562",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835621768,
+      "name": "linux-focal-py3.7-clang7-asan / test (functorch, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:30:00Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845855276",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: Cannot call sizes() on tensor with symbolic sizes/strides"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835551861,
+      "name": "linux-focal-py3.7-gcc7 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:47:53Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845803829",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "inductor",
+      "id": 10792968531,
+      "name": "cuda11.6-py3.10-gcc7-sm86 / test (inductor_timm, 1, 2, linux.g5.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:57:15Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873195/jobs/6811425723",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10793229347,
+      "name": "win-vs2019-cuda11.6-py3 / test (default, 1, 5, windows.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T06:22:27Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811642166",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792895859,
+      "name": "linux-focal-py3.7-clang7-asan / test (functorch, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:53:57Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811364437",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: Cannot call sizes() on tensor with symbolic sizes/strides"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792836895,
+      "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:35:20Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811314645",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635153,
+      "name": "win-vs2019-cuda11.6-py3",
+      "conclusion": "skipped",
+      "completed_at": "2023-01-21T02:40:29Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147267",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 0
+    },
+    {
+      "workflow_name": "ossf-scorecard",
+      "id": 10792634781,
+      "name": "Scorecards analysis",
+      "conclusion": "skipped",
+      "completed_at": "2023-01-21T02:40:26Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873145/jobs/6811146983",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 0
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835371021,
+      "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:02:50Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671504",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 13
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792790756,
+      "name": "Upload test stats for 3972331494, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:05:44Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972946613/jobs/6811275037",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792742142,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:54:51Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972924707/jobs/6811236540",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10793229549,
+      "name": "win-vs2019-cuda11.6-py3 / test (default, 5, 5, windows.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:59:02Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811642344",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792986438,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (functorch, 1, 1, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:24:49Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811440992",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "Check Labels",
+      "id": 10839257306,
+      "name": "Check labels",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T03:05:12Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3992628517/jobs/6848645507",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 8
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835747044,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:00:00Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845946929",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792986390,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 3, 3, linux.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:22:43Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811440944",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "AttributeError: 'DistElementwiseOpsTest' object has no attribute '_tls'"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792967067,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:04:48Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424439",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: !schema.hasAnyAliasInfo() INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/aten/src/ATen/FunctionalizeFallbackKernel.cpp\":32, please report a bug to PyTorch. mutating and aliasing ops should all have codegen'd kernels"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792822366,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:07:05Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972962158/jobs/6811302034",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635391,
+      "name": "linux-bionic-py3.7-clang9-slow / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:07:15Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147445",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835370522,
+      "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:23:23Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671153",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "Update S3 HTML indices for download.pytorch.org",
+      "id": 10792831699,
+      "name": "update-html (whl/test)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:06:30Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972966607/jobs/6811310170",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 4
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635298,
+      "name": "linux-focal-py3.7-clang7-asan / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:13:35Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147374",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835552073,
+      "name": "linux-docs / build-docs-cpp-false",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:58:58Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845803990",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 15
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635441,
+      "name": "macos-12-py3-arm64 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:24:02Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147487",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835559809,
+      "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:12:34Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809924",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835553061,
+      "name": "linux-bionic-py3.7-clang9 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:47:57Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845804734",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "inductor",
+      "id": 10792634961,
+      "name": "cuda11.6-py3.10-gcc7-sm86 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:23:24Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873195/jobs/6811147118",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792986094,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:43:32Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811440697",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792966735,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7-sm86 / test (default, 1, 4, linux.g5.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:17:34Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424157",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635566,
+      "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:45:16Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147571",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835371918,
+      "name": "linux-focal-rocm5.3-py3.8 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:56:23Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845672136",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835558841,
+      "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:59:22Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809146",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835558690,
+      "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:53:10Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809050",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792848641,
+      "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:18:35Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811324775",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "AttributeError: 'builtin_function_or_method' object has no attribute '__code__'"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792806937,
+      "name": "Upload test stats for 3972290783, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:06:49Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972953931/jobs/6811288904",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792866678,
+      "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:15:49Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811339725",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: Cannot call sizes() on tensor with symbolic sizes/strides"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835370909,
+      "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:55:23Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845671421",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 13
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792868223,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:10:32Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972985345/jobs/6811341038",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792986347,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 2, 3, linux.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:26:20Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811440906",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "AttributeError: 'TestCollectivesWithBaseClass' object has no attribute '_tls'"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792848598,
+      "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:20:58Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811324736",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "AttributeError: '_CachedForward' object has no attribute '__getattr__'"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635741,
+      "name": "linux-focal-py3.7-clang10-onnx / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:05:55Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147700",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635220,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7-sm86 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:23:14Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147316",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835753262,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (distributed, 2, 3, linux.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:53:44Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845951169",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835558431,
+      "name": "linux-focal-py3.7-gcc7 / test (distributed, 2, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:37:47Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845808838",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "AttributeError: 'NoneType' object has no attribute '_free_weak_ref'"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "inductor",
+      "id": 10792968626,
+      "name": "cuda11.6-py3.10-gcc7-sm86 / test (inductor_timm, 2, 2, linux.g5.4xlarge.nvidia.gpu)",
+      "conclusion": "failure",
+      "completed_at": "2023-01-21T05:27:07Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873195/jobs/6811425836",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "sebotnet33ts_256",
+        "fail_accuracy"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792973102,
+      "name": "macos-12-py3-arm64 / test (functorch, 1, 1, macos-m1-12)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:54:26Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811429600",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: Cannot call sizes() on tensor with symbolic sizes/strides"
+      ],
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835752455,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:25:45Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845950678",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792893680,
+      "name": "linux-focal-py3.7-clang7-asan / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:13:58Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811362497",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792863841,
+      "name": "linux-docs / build-docs-functorch-false",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:14:21Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811337363",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 15
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835613396,
+      "name": "linux-focal-py3.7-clang7-asan / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:51:44Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845849131",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835372309,
+      "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:48:02Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845672419",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835370169,
+      "name": "linux-bionic-py3.7-clang9 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:47:33Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845670888",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "inductor",
+      "id": 10792965399,
+      "name": "cuda11.6-py3.10-gcc7-sm86 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:23:56Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873195/jobs/6811423056",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792970597,
+      "name": "linux-focal-rocm5.3-py3.8 / test (default, 2, 2, linux.rocm.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:55:25Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811427498",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 17
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792966820,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7-sm86 / test (default, 3, 4, linux.g5.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:46:58Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424231",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: CUDA error: device-side assert triggered"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635599,
+      "name": "win-vs2019-cuda11.6-py3 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:03:47Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147604",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 14
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635351,
+      "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:07:10Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147416",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835552307,
+      "name": "linux-docs / build-docs-functorch-false",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:52:42Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845804156",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 15
+    },
+    {
+      "workflow_name": "Validate and merge PR",
+      "id": 10792945471,
+      "name": "try_merge_pr_92664",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:22:20Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3973025704/jobs/6811406499",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 10
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792836806,
+      "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:25:29Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811314568",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835558085,
+      "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:14:00Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845808552",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: !schema.hasAnyAliasInfo() INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/aten/src/ATen/FunctionalizeFallbackKernel.cpp\":32, please report a bug to PyTorch. mutating and aliasing ops should all have codegen'd kernels"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792861264,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:09:34Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972981482/jobs/6811335166",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792830774,
+      "name": "check-api-rate",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:07:08Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972966024/jobs/6811309309",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635632,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:26:07Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147627",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "unstable",
+      "id": 10792634847,
+      "name": "introduction",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:40:36Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873143/jobs/6811147031",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835752946,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:26:53Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845950938",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: !schema.hasAnyAliasInfo() INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/aten/src/ATen/FunctionalizeFallbackKernel.cpp\":32, please report a bug to PyTorch. mutating and aliasing ops should all have codegen'd kernels"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635952,
+      "name": "android-emulator-build-test / build-and-test",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:27:05Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147867",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 8
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635704,
+      "name": "linux-focal-py3.7-gcc7-no-ops / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:09:28Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147672",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835570714,
+      "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T23:07:52Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845817702",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835560087,
+      "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:04:40Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845810121",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "AttributeError: '_CachedForward' object has no attribute '__getattr__'"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835559385,
+      "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-24T00:22:32Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845809596",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": [
+        "RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Float"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10835371755,
+      "name": "linux-focal-py3.7-clang10-onnx / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-23T22:48:16Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3991169410/jobs/6845672034",
+      "head_sha": "030a6d3fe98e46c82cdbae9b93a72ceab4febfd6",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10793125514,
+      "name": "macos-12-py3-x86-64 / test (default, 3, 3, macos-12)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:42:24Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811556869",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: incorrect results of backend <torch._dynamo.output_graph.WrapperBackend object at 0x7fd73e434fd0>"
+      ],
+      "steps": 18
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635994,
+      "name": "linux-focal-py3.7-clang7-tsan / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:04:23Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147903",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792818709,
+      "name": "Upload test stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }}",
+      "conclusion": "skipped",
+      "completed_at": "2023-01-21T03:04:27Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972959537/jobs/6811298973",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 0
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635834,
+      "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:59:36Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147771",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 13
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10793229408,
+      "name": "win-vs2019-cuda11.6-py3 / test (default, 2, 5, windows.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T06:19:47Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811642219",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10793106643,
+      "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:33:42Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811541238",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Float"
+      ],
+      "steps": 18
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792830680,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:05:52Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972966024/jobs/6811309233",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792866809,
+      "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:20:24Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811339847",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "linux-binary-libtorch-pre-cxx11",
+      "id": 10792634991,
+      "name": "libtorch-cpu-shared-with-deps-pre-cxx11-build / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:19:30Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873199/jobs/6811147143",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 22
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10793125434,
+      "name": "macos-12-py3-x86-64 / test (default, 1, 3, macos-12)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:25:19Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811556799",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792895612,
+      "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:54:39Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811364222",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635591,
+      "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:49:45Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147594",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10793229504,
+      "name": "win-vs2019-cuda11.6-py3 / test (default, 4, 5, windows.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T06:17:38Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811642305",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: CUDA error: device-side assert triggered"
+      ],
+      "steps": 18
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792967394,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (distributed, 3, 3, linux.8xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:11:21Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424711",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "AttributeError: 'TestCollectivesWithBaseClass' object has no attribute '_tls'"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792895732,
+      "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.4xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:48:06Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811364327",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: incorrect results of backend <torch._dynamo.output_graph.WrapperBackend object at 0x7ffabb977110>"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635911,
+      "name": "win-vs2019-cpu-py3 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:35:50Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147833",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 14
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792847605,
+      "name": "linux-bionic-py3.7-clang9-slow / test (slow, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:47:38Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811323909",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: Loader error"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792833252,
+      "name": "Upload test stats for 3972245592, attempt 1",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:10:36Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972966024/jobs/6811311559",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 9
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792863785,
+      "name": "linux-docs / build-docs-python-false",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:20:57Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811337317",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 15
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792973052,
+      "name": "macos-12-py3-arm64 / test (default, 2, 2, macos-m1-12)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:08:23Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811429557",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: incorrect results of backend <torch._dynamo.output_graph.WrapperBackend object at 0x126979550>"
+      ],
+      "steps": 18
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792970565,
+      "name": "linux-focal-rocm5.3-py3.8 / test (default, 1, 2, linux.rocm.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:10:21Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811427474",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 17
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792967033,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:33:26Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424408",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792848505,
+      "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:15:48Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811324657",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "Update S3 HTML indices for download.pytorch.org",
+      "id": 10792831606,
+      "name": "update-html (whl)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:07:36Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972966607/jobs/6811310085",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 4
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792966993,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:38:54Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424379",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792966854,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7-sm86 / test (default, 4, 4, linux.g5.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:34:08Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424265",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792972974,
+      "name": "macos-12-py3-arm64 / test (default, 1, 2, macos-m1-12)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:01:46Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811429494",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Float"
+      ],
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792866511,
+      "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:51:49Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811339582",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: incorrect results of backend <torch._dynamo.output_graph.WrapperBackend object at 0x7fee7072eb90>"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635670,
+      "name": "linux-focal-py3-clang7-mobile-build / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T02:47:45Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147651",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792986179,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:07:53Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811440758",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: CUDA error: device-side assert triggered"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792866734,
+      "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:15:34Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811339775",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792844539,
+      "name": "linux-bionic-py3.7-clang9-slow / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:07:40Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811321342",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "Update S3 HTML indices for download.pytorch.org",
+      "id": 10792831807,
+      "name": "update-html (whl/nightly)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:21:59Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972966607/jobs/6811310264",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 4
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792866625,
+      "name": "linux-focal-py3.7-gcc7 / test (distributed, 2, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:17:56Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811339680",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "AttributeError: 'Replicate' object has no attribute 'dim'"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792863698,
+      "name": "linux-docs / build-docs-cpp-false",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:18:29Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811337276",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 15
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635524,
+      "name": "linux-bionic-py3.7-clang9 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:07:20Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147541",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792967107,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:50:27Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424469",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: CUDA error: device-side assert triggered"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792822302,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:05:12Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972962158/jobs/6811301983",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792636035,
+      "name": "linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:06:55Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147941",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 13
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635905,
+      "name": "linux-focal-rocm5.3-py3.8 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:23:48Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147826",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635154,
+      "name": "ios-12-5-1-x86-64 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:50:25Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147268",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 13
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792846633,
+      "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:15:15Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811323053",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792819036,
+      "name": "linux-focal-py3.7-clang7-tsan / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:04:50Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811299271",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792895795,
+      "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.4xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:40:18Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811364382",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792866440,
+      "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:40:02Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811339525",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: !schema.hasAnyAliasInfo() INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/aten/src/ATen/FunctionalizeFallbackKernel.cpp\":32, please report a bug to PyTorch. mutating and aliasing ops should all have codegen'd kernels"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792848458,
+      "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:50:34Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811324614",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: !schema.hasAnyAliasInfo() INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/aten/src/ATen/FunctionalizeFallbackKernel.cpp\":32, please report a bug to PyTorch. mutating and aliasing ops should all have codegen'd kernels"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792822286,
+      "name": "linux-focal-py3.7-clang7-tsan / test (tsan, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:46:58Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811301966",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792635859,
+      "name": "pytorch-linux-focal-py3-clang7-android-ndk-r19c-build / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:45:21Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811147792",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 19
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792967204,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (nogpu_AVX512, 1, 1, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:31:53Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424545",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: !schema.hasAnyAliasInfo() INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/aten/src/ATen/FunctionalizeFallbackKernel.cpp\":32, please report a bug to PyTorch. mutating and aliasing ops should all have codegen'd kernels"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792966886,
+      "name": "linux-bionic-cuda11.6-py3.10-gcc7-sm86 / test (slow, 1, 2, linux.g5.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:05:51Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424292",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635250,
+      "name": "linux-focal-rocm5.3-py3.8",
+      "conclusion": "skipped",
+      "completed_at": "2023-01-21T02:40:30Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147336",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 0
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792970190,
+      "name": "macos-12-py3-arm64-mps / Run MPS tests",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:30:21Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811427149",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 14
+    },
+    {
+      "workflow_name": "Upload test stats",
+      "id": 10792816509,
+      "name": "get_workflow_conclusion",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:04:26Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972959537/jobs/6811297020",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 3
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792970116,
+      "name": "macos-12-py3-arm64 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:24:35Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811427083",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792895556,
+      "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:25:42Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811364170",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: where expected condition to be a boolean tensor, but got a tensor with dtype Float"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792635406,
+      "name": "linux-focal-py3.7-gcc7 / build",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:09:42Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811147454",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 16
+    },
+    {
+      "workflow_name": "linux-binary-manywheel",
+      "id": 10793564471,
+      "name": "manywheel-py3_7-cuda11_6-test / test",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:17:40Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873204/jobs/6811922172",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 21
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10793125544,
+      "name": "macos-12-py3-x86-64 / test (functorch, 1, 1, macos-12)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T05:47:41Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811556896",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "RuntimeError: Cannot call sizes() on tensor with symbolic sizes/strides"
+      ],
+      "steps": 18
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792866568,
+      "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T04:42:50Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811339634",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": [
+        "AttributeError: Can't get attribute 'foo_add' on <module 'torch.testing._internal.distributed.rpc.rpc_test' from '/opt/conda/envs/py_3.7/lib/python3.7/site-packages/torch/testing/_internal/distributed/rpc/rpc_test.py'> Default RPC pickler does not serialize"
+      ],
+      "steps": 20
+    },
+    {
+      "workflow_name": "pull",
+      "id": 10792845023,
+      "name": "linux-bionic-py3.7-clang9 / filter",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:07:45Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873201/jobs/6811321705",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 5
+    },
+    {
+      "workflow_name": "trunk",
+      "id": 10792967277,
+      "name": "linux-bionic-cuda11.7-py3.10-gcc7 / test (jit_legacy, 1, 1, linux.4xlarge.nvidia.gpu)",
+      "conclusion": "success",
+      "completed_at": "2023-01-21T03:38:30Z",
+      "html_url": "https://github.com/pytorch/pytorch/actions/runs/3972873205/jobs/6811424611",
+      "head_sha": "8972a9fe6aa8be8f8035c83094ed371973bfbe73",
+      "failure_captures": null,
+      "steps": 20
+    }
+  ]
+}
diff --git a/.github/scripts/test_check_labels.py b/.github/scripts/test_check_labels.py
index 64e91dcd8ecb..3ef9a30a4914 100644
--- a/.github/scripts/test_check_labels.py
+++ b/.github/scripts/test_check_labels.py
@@ -1,77 +1,124 @@
 """test_check_labels.py"""
 
-from typing import Any
+from typing import Any, List
 from unittest import TestCase, mock, main
 
+from check_labels import (
+    main as check_labels_main,
+    add_label_err_comment,
+    delete_all_label_err_comments,
+)
+from github_utils import GitHubComment
+from label_utils import BOT_AUTHORS, LABEL_ERR_MSG_TITLE
+from test_trymerge import mocked_gh_graphql, mock_gh_get_info
 from trymerge import GitHubPR
-from test_trymerge import mocked_gh_graphql
-from check_labels import has_required_labels
 
-release_notes_labels = [
-    "release notes: AO frontend",
-    "release notes: autograd",
-    "release notes: benchmark",
-    "release notes: build",
-    "release notes: complex",
-    "release notes: composability",
-    "release notes: cpp",
-    "release notes: cuda",
-    "release notes: cudnn",
-    "release notes: dataloader",
-    "release notes: distributed (c10d)",
-    "release notes: distributed (ddp)",
-    "release notes: distributed (fsdp)",
-    "release notes: distributed (pipeline)",
-    "release notes: distributed (rpc)",
-    "release notes: distributed (sharded)",
-    "release notes: foreach_frontend",
-    "release notes: functorch",
-    "release notes: fx",
-    "release notes: hub",
-    "release notes: jit",
-    "release notes: lazy",
-    "release notes: linalg_frontend",
-    "release notes: memory format",
-    "release notes: Meta API",
-    "release notes: mobile",
-    "release notes: mps",
-    "release notes: nested tensor",
-    "release notes: nn",
-    "release notes: onnx",
-    "release notes: package/deploy",
-    "release notes: performance_as_product",
-    "release notes: profiler",
-    "release notes: python_frontend",
-    "release notes: quantization",
-    "release notes: releng",
-    "release notes: rocm",
-    "release notes: sparse",
-    "release notes: visualization",
-    "release notes: vulkan",
-]
+def mock_parse_args() -> object:
+    class Object(object):
+        def __init__(self) -> None:
+            self.pr_num = 76123
+    return Object()
+
+def mock_add_label_err_comment(pr: "GitHubPR") -> None:
+    pass
+
+def mock_delete_all_label_err_comments(pr: "GitHubPR") -> None:
+    pass
+
+def mock_get_comments() -> List[GitHubComment]:
+    return [
+        # Case 1 - a non label err comment
+        GitHubComment(
+            body_text="mock_body_text",
+            created_at="",
+            author_login="",
+            author_association="",
+            editor_login=None,
+            database_id=1,
+        ),
+        # Case 2 - a label err comment
+        GitHubComment(
+            body_text=" #" + LABEL_ERR_MSG_TITLE,
+            created_at="",
+            author_login=BOT_AUTHORS[1],
+            author_association="",
+            editor_login=None,
+            database_id=2,
+        ),
+    ]
 
 
 class TestCheckLabels(TestCase):
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    @mock.patch('check_labels.get_release_notes_labels', return_value=release_notes_labels)
-    def test_pr_with_missing_labels(self, mocked_rn_labels: Any, mocked_gql: Any) -> None:
-        "Test PR with no 'release notes:' label or 'topic: not user facing' label"
-        pr = GitHubPR("pytorch", "pytorch", 82169)
-        self.assertFalse(has_required_labels(pr))
+    @mock.patch('trymerge.GitHubPR.get_comments', return_value=[mock_get_comments()[0]])
+    @mock.patch('check_labels.gh_post_pr_comment')
+    def test_correctly_add_label_err_comment(
+        self, mock_gh_post_pr_comment: Any, mock_get_comments: Any, mock_gh_grphql: Any
+    ) -> None:
+        "Test add label err comment when similar comments don't exist."
+        pr = GitHubPR("pytorch", "pytorch", 75095)
+        add_label_err_comment(pr)
+        mock_gh_post_pr_comment.assert_called_once()
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    @mock.patch('check_labels.get_release_notes_labels', return_value=release_notes_labels)
-    def test_pr_with_release_notes_label(self, mocked_rn_labels: Any, mocked_gql: Any) -> None:
-        "Test PR with 'release notes: nn' label"
-        pr = GitHubPR("pytorch", "pytorch", 71759)
-        self.assertTrue(has_required_labels(pr))
+    @mock.patch('trymerge.GitHubPR.get_comments', return_value=[mock_get_comments()[1]])
+    @mock.patch('check_labels.gh_post_pr_comment')
+    def test_not_add_label_err_comment(
+        self, mock_gh_post_pr_comment: Any, mock_get_comments: Any, mock_gh_grphql: Any
+    ) -> None:
+        "Test not add label err comment when similar comments exist."
+        pr = GitHubPR("pytorch", "pytorch", 75095)
+        add_label_err_comment(pr)
+        mock_gh_post_pr_comment.assert_not_called()
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    @mock.patch('check_labels.get_release_notes_labels', return_value=release_notes_labels)
-    def test_pr_with_not_user_facing_label(self, mocked_rn_labels: Any, mocked_gql: Any) -> None:
-        "Test PR with 'topic: not user facing' label"
+    @mock.patch('trymerge.GitHubPR.get_comments', return_value=mock_get_comments())
+    @mock.patch('check_labels.gh_delete_comment')
+    def test_correctly_delete_all_label_err_comments(
+        self, mock_gh_delete_comment: Any, mock_get_comments: Any, mock_gh_grphql: Any
+    ) -> None:
+        "Test only delete label err comment."
         pr = GitHubPR("pytorch", "pytorch", 75095)
-        self.assertTrue(has_required_labels(pr))
+        delete_all_label_err_comments(pr)
+        mock_gh_delete_comment.assert_called_once_with("pytorch", "pytorch", 2)
+
+    @mock.patch('trymerge.gh_get_pr_info', return_value=mock_gh_get_info())
+    @mock.patch('check_labels.parse_args', return_value=mock_parse_args())
+    @mock.patch('check_labels.has_required_labels', return_value=False)
+    @mock.patch('check_labels.delete_all_label_err_comments', side_effect=mock_delete_all_label_err_comments)
+    @mock.patch('check_labels.add_label_err_comment', side_effect=mock_add_label_err_comment)
+    def test_ci_fails_without_required_labels(
+        self,
+        mock_add_label_err_comment: Any,
+        mock_delete_all_label_err_comments: Any,
+        mock_has_required_labels: Any,
+        mock_parse_args: Any,
+        mock_gh_get_info: Any,
+    ) -> None:
+        with self.assertRaises(SystemExit) as sys_exit:
+            check_labels_main()
+        self.assertEqual(str(sys_exit.exception), "1")
+        mock_add_label_err_comment.assert_called_once()
+        mock_delete_all_label_err_comments.assert_not_called()
+
+    @mock.patch('trymerge.gh_get_pr_info', return_value=mock_gh_get_info())
+    @mock.patch('check_labels.parse_args', return_value=mock_parse_args())
+    @mock.patch('check_labels.has_required_labels', return_value=True)
+    @mock.patch('check_labels.delete_all_label_err_comments', side_effect=mock_delete_all_label_err_comments)
+    @mock.patch('check_labels.add_label_err_comment', side_effect=mock_add_label_err_comment)
+    def test_ci_success_with_required_labels(
+        self,
+        mock_add_label_err_comment: Any,
+        mock_delete_all_label_err_comments: Any,
+        mock_has_required_labels: Any,
+        mock_parse_args: Any,
+        mock_gh_get_info: Any,
+    ) -> None:
+        with self.assertRaises(SystemExit) as sys_exit:
+            check_labels_main()
+        self.assertEqual(str(sys_exit.exception), "0")
+        mock_add_label_err_comment.assert_not_called()
+        mock_delete_all_label_err_comments.assert_called_once()
 
 if __name__ == "__main__":
     main()
diff --git a/.github/scripts/test_filter_test_configs.py b/.github/scripts/test_filter_test_configs.py
index 55410e846c97..4bd91c13822c 100755
--- a/.github/scripts/test_filter_test_configs.py
+++ b/.github/scripts/test_filter_test_configs.py
@@ -1,20 +1,90 @@
 #!/usr/bin/env python3
 
+import json
 import os
+from typing import Any, Dict
+from unittest import main, mock, TestCase
+
+import requests
 import yaml
-import json
-from unittest import TestCase, main, mock
 from filter_test_configs import (
-    get_labels,
     filter,
-    set_periodic_modes,
+    get_labels,
     PREFIX,
+    remove_disabled_jobs,
+    set_periodic_modes,
+    SUPPORTED_PERIODICAL_MODES,
     VALID_TEST_CONFIG_LABELS,
-    SUPPORTED_PERIODICAL_MODES
 )
-import requests
 from requests.models import Response
-from typing import Any, Dict
+
+
+MOCKED_DISABLED_JOBS = {
+    "pull / mock-platform-1": [
+        "pytorchbot",
+        "1",
+        "https://github.com/pytorch/pytorch/issues/1",
+        "pull",
+        "mock-platform-1",
+        "",
+    ],
+    "trunk / mock-platform-2 / build": [
+        "pytorchbot",
+        "2",
+        "https://github.com/pytorch/pytorch/issues/2",
+        "trunk",
+        "mock-platform-2",
+        "build",
+    ],
+    "periodic / mock-platform-3 / test": [
+        "pytorchbot",
+        "3",
+        "https://github.com/pytorch/pytorch/issues/3",
+        "periodic",
+        "mock-platform-3",
+        "test",
+    ],
+    "pull / mock-platform-4 / build-and-test": [
+        "pytorchbot",
+        "4",
+        "https://github.com/pytorch/pytorch/issues/4",
+        "pull",
+        "mock-platform-4",
+        "build-and-test",
+    ],
+    "trunk / mock-platform-5 / test (backward_compat)": [
+        "pytorchbot",
+        "5",
+        "https://github.com/pytorch/pytorch/issues/5",
+        "trunk",
+        "mock-platform-5",
+        "test (backward_compat)",
+    ],
+    "periodic / mock-platform-6 / build-and-test (default)": [
+        "pytorchbot",
+        "6",
+        "https://github.com/pytorch/pytorch/issues/6",
+        "periodic",
+        "mock-platform-6",
+        "build-and-test (default)",
+    ],
+    "pull / mock-platform-7 / test [invalid syntax]": [
+        "pytorchbot",
+        "7",
+        "https://github.com/pytorch/pytorch/issues/7",
+        "pull",
+        "mock-platform-7",
+        "test [invalid syntax]",
+    ],
+    "trunk / mock-platform-8 / build (dynamo)": [
+        "pytorchbot",
+        "8",
+        "https://github.com/pytorch/pytorch/issues/8",
+        "trunk",
+        "mock-platform-8",
+        "build (dynamo)",
+    ],
+}
 
 
 def mocked_gh_get_labels_failed(url: str, headers: Dict[str, str]) -> Response:
@@ -31,7 +101,6 @@ def mocked_gh_get_labels(url: str, headers: Dict[str, str]) -> Response:
 
 
 class TestConfigFilter(TestCase):
-
     def setUp(self) -> None:
         os.environ["GITHUB_TOKEN"] = "GITHUB_TOKEN"
         if os.getenv("GITHUB_OUTPUT"):
@@ -42,7 +111,9 @@ def test_get_labels(self, mocked_gh: Any) -> None:
         labels = get_labels(pr_number=12345)
         self.assertSetEqual({"foo", "bar"}, labels)
 
-    @mock.patch("filter_test_configs.requests.get", side_effect=mocked_gh_get_labels_failed)
+    @mock.patch(
+        "filter_test_configs.requests.get", side_effect=mocked_gh_get_labels_failed
+    )
     def test_get_labels_failed(self, mocked_gh: Any) -> None:
         labels = get_labels(pr_number=54321)
         self.assertFalse(labels)
@@ -68,7 +139,9 @@ def test_filter(self) -> None:
         ]
 
         for case in testcases:
-            filtered_test_matrix = filter(yaml.safe_load(case["test_matrix"]), mocked_labels)
+            filtered_test_matrix = filter(
+                yaml.safe_load(case["test_matrix"]), mocked_labels
+            )
             self.assertEqual(case["expected"], json.dumps(filtered_test_matrix))
 
     def test_filter_with_valid_label(self) -> None:
@@ -89,10 +162,11 @@ def test_filter_with_valid_label(self) -> None:
         ]
 
         for case in testcases:
-            filtered_test_matrix = filter(yaml.safe_load(case["test_matrix"]), mocked_labels)
+            filtered_test_matrix = filter(
+                yaml.safe_load(case["test_matrix"]), mocked_labels
+            )
             self.assertEqual(case["expected"], json.dumps(filtered_test_matrix))
 
-
     def test_set_periodic_modes(self) -> None:
         testcases = [
             {
@@ -110,9 +184,101 @@ def test_set_periodic_modes(self) -> None:
             scheduled_test_matrix = set_periodic_modes(test_matrix)
             self.assertEqual(
                 len(test_matrix["include"]) * len(SUPPORTED_PERIODICAL_MODES),
-                len(scheduled_test_matrix["include"])
+                len(scheduled_test_matrix["include"]),
             )
 
+    @mock.patch("filter_test_configs.download_json")
+    def test_remove_disabled_jobs(self, mock_download_json: Any) -> None:
+        mock_download_json.return_value = MOCKED_DISABLED_JOBS
+
+        testcases = [
+            {
+                "workflow": "pull",
+                "job_name": "invalid job name",
+                "test_matrix": '{include: [{config: "default"}]}',
+                "expected": '{"include": [{"config": "default"}]}',
+                "description": "invalid job name",
+            },
+            {
+                "workflow": "pull",
+                "job_name": "mock-platform-1 / build",
+                "test_matrix": '{include: [{config: "default"}]}',
+                "expected": '{"include": []}',
+                "description": "disable build and test jobs",
+            },
+            {
+                "workflow": "trunk",
+                "job_name": "mock-platform-2 / build",
+                "test_matrix": '{include: [{config: "default"}]}',
+                "expected": '{"include": []}',
+                "description": "disable build job",
+            },
+            {
+                "workflow": "periodic",
+                "job_name": "mock-platform-3 / test",
+                "test_matrix": '{include: [{config: "default"}]}',
+                "expected": '{"include": []}',
+                "description": "disable test job",
+            },
+            {
+                "workflow": "pull",
+                "job_name": "mock-platform-4 / build-and-test",
+                "test_matrix": '{include: [{config: "default"}]}',
+                "expected": '{"include": []}',
+                "description": "disable build-and-test job",
+            },
+            {
+                "workflow": "trunk",
+                "job_name": "mock-platform-5 / test",
+                "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "backward_compat"}]}',
+                "expected": '{"include": [{"config": "default", "runner": "linux"}]}',
+                "description": "disable a test config",
+            },
+            {
+                "workflow": "periodic",
+                "job_name": "mock-platform-6 / build-and-test",
+                "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "backward_compat"}]}',
+                "expected": '{"include": [{"config": "backward_compat"}]}',
+                "description": "disable a build-and-test config",
+            },
+            {
+                "workflow": "pull",
+                "job_name": "mock-platform-7 / test",
+                "test_matrix": '{include: [{config: "default"}, {config: "backward_compat"}]}',
+                "expected": '{"include": [{"config": "default"}, {"config": "backward_compat"}]}',
+                "description": "include an invalid job name in the disabled issue",
+            },
+            {
+                "workflow": "trunk",
+                "job_name": "mock-platform-8 / build",
+                "test_matrix": '{include: [{config: "default"}, {config: "backward_compat"}]}',
+                "expected": '{"include": [{"config": "default"}, {"config": "backward_compat"}]}',
+                "description": "include an invalid combination of build and test config",
+            },
+            {
+                "workflow": "inductor",
+                "job_name": "mock-platform-8 / build",
+                "test_matrix": '{include: [{config: "default"}, {config: "backward_compat"}]}',
+                "expected": '{"include": [{"config": "default"}, {"config": "backward_compat"}]}',
+                "description": "not disabled on this workflow",
+            },
+            {
+                "workflow": "pull",
+                "job_name": "mock-platform-9 / build",
+                "test_matrix": '{include: [{config: "default"}, {config: "backward_compat"}]}',
+                "expected": '{"include": [{"config": "default"}, {"config": "backward_compat"}]}',
+                "description": "not disabled on this platform",
+            },
+        ]
+
+        for case in testcases:
+            workflow = case["workflow"]
+            job_name = case["job_name"]
+            test_matrix = yaml.safe_load(case["test_matrix"])
+
+            filtered_test_matrix = remove_disabled_jobs(workflow, job_name, test_matrix)
+            self.assertEqual(case["expected"], json.dumps(filtered_test_matrix))
+
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/.github/scripts/test_gitutils.py b/.github/scripts/test_gitutils.py
index 78696771d993..9987cdea9781 100644
--- a/.github/scripts/test_gitutils.py
+++ b/.github/scripts/test_gitutils.py
@@ -1,6 +1,11 @@
 #!/usr/bin/env python3
-from gitutils import PeekableIterator, patterns_to_regex
-from unittest import TestCase, main
+from gitutils import PeekableIterator, patterns_to_regex, GitRepo, are_ghstack_branches_in_sync, _shasum
+from unittest import TestCase, main, SkipTest
+from pathlib import Path
+
+
+BASE_DIR = Path(__file__).parent
+
 
 class TestPeekableIterator(TestCase):
     def test_iterator(self, input_: str = "abcdef") -> None:
@@ -35,5 +40,34 @@ def test_double_asterisks(self) -> None:
             self.assertTrue(patterns_re.match(filename))
 
 
+class TestGitRepo(TestCase):
+    def setUp(self) -> None:
+        repo_dir = BASE_DIR.parent.parent.absolute()
+        if not (repo_dir / ".git").is_dir():
+            raise SkipTest("Can't find git directory, make sure to run this test on real repo checkout")
+        self.repo = GitRepo(str(repo_dir))
+
+    def _skip_if_ref_does_not_exist(self, ref: str) -> None:
+        """ Skip test if ref is missing as stale branches are deleted with time """
+        try:
+            self.repo.show_ref(ref)
+        except RuntimeError as e:
+            raise SkipTest(f"Can't find head ref {ref} due to {str(e)}") from e
+
+    def test_compute_diff(self) -> None:
+        diff = self.repo.diff("HEAD")
+        sha = _shasum(diff)
+        self.assertEqual(len(sha), 64)
+
+    def test_ghstack_branches_in_sync(self) -> None:
+        head_ref = "gh/SS-JIA/206/head"
+        self._skip_if_ref_does_not_exist(head_ref)
+        self.assertTrue(are_ghstack_branches_in_sync(self.repo, head_ref))
+
+    def test_ghstack_branches_not_in_sync(self) -> None:
+        head_ref = "gh/clee2000/1/head"
+        self._skip_if_ref_does_not_exist(head_ref)
+        self.assertFalse(are_ghstack_branches_in_sync(self.repo, head_ref))
+
 if __name__ == '__main__':
     main()
diff --git a/.github/scripts/test_label_utils.py b/.github/scripts/test_label_utils.py
new file mode 100644
index 000000000000..e908ee03c3b3
--- /dev/null
+++ b/.github/scripts/test_label_utils.py
@@ -0,0 +1,75 @@
+from typing import Any
+from unittest import TestCase, mock, main
+
+from label_utils import (
+    get_last_page_num_from_header,
+    gh_get_labels,
+    has_required_labels,
+)
+from trymerge import GitHubPR
+from test_trymerge import mocked_gh_graphql
+
+
+release_notes_labels = [
+    "release notes: nn",
+]
+
+class TestLabelUtils(TestCase):
+    MOCK_HEADER_LINKS_TO_PAGE_NUMS = {
+        1: {"link": "<https://api.github.com/dummy/labels?per_page=10&page=1>; rel='last'"},
+        2: {"link": "<https://api.github.com/dummy/labels?per_page=1&page=2>;"},
+        3: {"link": "<https://api.github.com/dummy/labels?per_page=1&page=2&page=3>;"},
+    }
+
+    def test_get_last_page_num_from_header(self) -> None:
+        for expected_page_num, mock_header in self.MOCK_HEADER_LINKS_TO_PAGE_NUMS.items():
+            self.assertEqual(get_last_page_num_from_header(mock_header), expected_page_num)
+
+    MOCK_LABEL_INFO = '[{"name": "foo"}]'
+
+    @mock.patch("label_utils.get_last_page_num_from_header", return_value=3)
+    @mock.patch("label_utils.request_for_labels", return_value=(None, MOCK_LABEL_INFO))
+    def test_gh_get_labels(
+        self,
+        mock_request_for_labels: Any,
+        mock_get_last_page_num_from_header: Any,
+    ) -> None:
+        res = gh_get_labels("mock_org", "mock_repo")
+        mock_get_last_page_num_from_header.assert_called_once()
+        self.assertEqual(res, ["foo"] * 3)
+
+    @mock.patch("label_utils.get_last_page_num_from_header", return_value=0)
+    @mock.patch("label_utils.request_for_labels", return_value=(None, MOCK_LABEL_INFO))
+    def test_gh_get_labels_raises_with_no_pages(
+        self,
+        mock_request_for_labels: Any,
+        get_last_page_num_from_header: Any,
+    ) -> None:
+        with self.assertRaises(AssertionError) as err:
+            gh_get_labels("foo", "bar")
+        self.assertIn("number of pages of labels", str(err.exception))
+
+    @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
+    @mock.patch('label_utils.get_release_notes_labels', return_value=release_notes_labels)
+    def test_pr_with_missing_labels(self, mocked_rn_labels: Any, mocked_gql: Any) -> None:
+        "Test PR with no 'release notes:' label or 'topic: not user facing' label"
+        pr = GitHubPR("pytorch", "pytorch", 82169)
+        self.assertFalse(has_required_labels(pr))
+
+    @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
+    @mock.patch('label_utils.get_release_notes_labels', return_value=release_notes_labels)
+    def test_pr_with_release_notes_label(self, mocked_rn_labels: Any, mocked_gql: Any) -> None:
+        "Test PR with 'release notes: nn' label"
+        pr = GitHubPR("pytorch", "pytorch", 71759)
+        self.assertTrue(has_required_labels(pr))
+
+    @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
+    @mock.patch('label_utils.get_release_notes_labels', return_value=release_notes_labels)
+    def test_pr_with_not_user_facing_label(self, mocked_rn_labels: Any, mocked_gql: Any) -> None:
+        "Test PR with 'topic: not user facing' label"
+        pr = GitHubPR("pytorch", "pytorch", 75095)
+        self.assertTrue(has_required_labels(pr))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py
index b6224d829f33..8b89f4e09b97 100755
--- a/.github/scripts/test_trymerge.py
+++ b/.github/scripts/test_trymerge.py
@@ -11,28 +11,40 @@
 import os
 from hashlib import sha256
 
-from trymerge import (find_matching_merge_rule,
-                      get_land_checkrun_conclusions,
-                      validate_land_time_checks,
-                      gh_graphql,
-                      gh_get_team_members,
-                      read_merge_rules,
-                      validate_revert,
-                      GitHubPR,
-                      MergeRule,
-                      MandatoryChecksMissingError,
-                      PostCommentError,
-                      main as trymerge_main)
+from trymerge import (
+    find_matching_merge_rule,
+    get_land_checkrun_conclusions,
+    validate_land_time_checks,
+    gh_graphql,
+    gh_get_team_members,
+    read_merge_rules,
+    validate_revert,
+    GitHubPR,
+    MergeRule,
+    MandatoryChecksMissingError,
+    PostCommentError,
+    FlakyRule,
+    categorize_checks,
+    get_combined_checks_from_pr_and_land_validation,
+    get_rockset_results,
+    main as trymerge_main,
+    get_classifications,
+)
 from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
-from typing import Any, List, Optional
+from typing import Any, Dict, List, Optional
 from unittest import TestCase, main, mock
 from urllib.error import HTTPError
 
 if 'GIT_REMOTE_URL' not in os.environ:
     os.environ['GIT_REMOTE_URL'] = "https://github.com/pytorch/pytorch"
 
-def mocked_gh_graphql(query: str, **kwargs: Any) -> Any:
-    gql_db_fname = os.path.join(os.path.dirname(__file__), "gql_mocks.json")
+def mock_query(
+    fallback_function: Any,
+    file_name: str,
+    key_function: Any,
+    *args: Any,
+) -> Any:
+    gql_db_fname = os.path.join(os.path.dirname(__file__), file_name)
 
     def get_mocked_queries() -> Any:
         if not os.path.exists(gql_db_fname):
@@ -45,21 +57,25 @@ def save_mocked_queries(obj: Any) -> None:
             json.dump(obj, f, indent=2)
             f.write("\n")
 
-    key = f"query_sha={sha256(query.encode('utf-8')).hexdigest()} " + " ".join([f"{k}={kwargs[k]}" for k in sorted(kwargs.keys())])
+    key = key_function(*args)
     mocked_queries = get_mocked_queries()
 
     if key in mocked_queries:
         return mocked_queries[key]
 
     try:
-        rc = gh_graphql(query, **kwargs)
+        rc = fallback_function(*args)
     except HTTPError as err:
         if err.code == 401:
-            err_msg = "If you are seeing this message during workflow run, please make sure to update gql_mocks.json"
+            err_msg = f"If you are seeing this message during workflow run, please make sure to update {file_name}"
             err_msg += f" locally, by deleting it and running {os.path.basename(__file__)} with "
             err_msg += " GitHub Personal Access Token passed via GITHUB_TOKEN environment variable"
-            if os.getenv("GITHUB_TOKEN") is None:
-                err_msg = "Failed to update cached GraphQL queries as GITHUB_TOKEN is not defined." + err_msg
+            err_msg += " the rockset api key passed via ROCKSET_API_KEY environment variable"
+            if os.getenv("GITHUB_TOKEN") is None or os.getenv("ROCKSET_API_KEY") is None:
+                err_msg = (
+                    "Failed to update cached GraphQL queries as GITHUB_TOKEN or ROCKSET_API_KEY is not defined."
+                    + err_msg
+                )
             raise RuntimeError(err_msg) from err
     mocked_queries[key] = rc
 
@@ -67,8 +83,27 @@ def save_mocked_queries(obj: Any) -> None:
 
     return rc
 
-def mock_parse_args(revert: bool = False,
-                    force: bool = False) -> Any:
+
+def mocked_gh_graphql(query: str, **kwargs: Any) -> Any:
+    def key_function(query: str, kwargs: Any) -> str:
+        return f"query_sha={sha256(query.encode('utf-8')).hexdigest()} " + " ".join(
+            [f"{k}={kwargs[k]}" for k in sorted(kwargs.keys())]
+        )
+
+    def gh_graphql_wrapper(query: str, kwargs: Any) -> Any:
+        return gh_graphql(query, **kwargs)
+    return mock_query(gh_graphql_wrapper, "gql_mocks.json", key_function, query, kwargs)
+
+def mocked_rockset_results(head_sha: str, merge_base: str) -> Any:
+    return mock_query(
+        get_rockset_results,
+        "rockset_mocks.json",
+        lambda x, y: f"{x} {y}",
+        head_sha,
+        merge_base,
+    )
+
+def mock_parse_args(revert: bool = False, force: bool = False) -> Any:
     class Object(object):
         def __init__(self) -> None:
             self.revert = revert
@@ -132,6 +167,15 @@ def mocked_read_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule
 def mocked_read_merge_rules_raise(repo: Any, org: str, project: str) -> List[MergeRule]:
     raise RuntimeError("testing")
 
+def empty_flaky_rules() -> List[FlakyRule]:
+    return []
+
+def empty_rockset_results(head_sha: str, merge_base: str) -> List[Dict[str, Any]]:
+    return []
+
+def dummy_merge_base() -> str:
+    return "dummy"
+
 class DummyGitRepo(GitRepo):
     def __init__(self) -> None:
         super().__init__(get_git_repo_dir(), get_git_remote_name())
@@ -142,15 +186,20 @@ def commits_resolving_gh_pr(self, pr_num: int) -> List[str]:
     def commit_message(self, ref: str) -> str:
         return "super awsome commit message"
 
+
+@mock.patch("trymerge.read_flaky_rules", side_effect=empty_flaky_rules)
+@mock.patch("trymerge.get_rockset_results", side_effect=empty_rockset_results)
+@mock.patch("trymerge.GitHubPR.get_merge_base", side_effect=dummy_merge_base)
 class TestGitHubPR(TestCase):
-    def test_merge_rules_valid(self) -> None:
+    def test_merge_rules_valid(self, *args: Any) -> None:
         "Test that merge_rules.yaml can be parsed"
         repo = DummyGitRepo()
-        self.assertGreater(len(read_merge_rules(repo, "pytorch", "pytorch")), 1)
+        merge_rules = read_merge_rules(repo, "pytorch", "pytorch")
+        self.assertGreater(len(merge_rules), 1)
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
     @mock.patch('trymerge.read_merge_rules', side_effect=mocked_read_merge_rules)
-    def test_match_rules(self, mocked_gql: Any, mocked_rmr: Any) -> None:
+    def test_match_rules(self, mocked_gql: Any, mocked_rmr: Any, *args: Any) -> None:
         "Tests that PR passes merge rules"
         pr = GitHubPR("pytorch", "pytorch", 77700)
         repo = DummyGitRepo()
@@ -158,7 +207,7 @@ def test_match_rules(self, mocked_gql: Any, mocked_rmr: Any) -> None:
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
     @mock.patch('trymerge.read_merge_rules', side_effect=mocked_read_merge_rules_raise)
-    def test_read_merge_rules_fails(self, mocked_gql: Any, mocked_rmr: Any) -> None:
+    def test_read_merge_rules_fails(self, mocked_gql: Any, mocked_rmr: Any, *args: Any) -> None:
         "Tests that PR fails to read the merge rules"
         pr = GitHubPR("pytorch", "pytorch", 77700)
         repo = DummyGitRepo()
@@ -166,14 +215,14 @@ def test_read_merge_rules_fails(self, mocked_gql: Any, mocked_rmr: Any) -> None:
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
     @mock.patch('trymerge.read_merge_rules', side_effect=mocked_read_merge_rules)
-    def test_lint_fails(self, mocked_gql: Any, mocked_rmr: Any) -> None:
+    def test_lint_fails(self, mocked_gql: Any, mocked_rmr: Any, *args: Any) -> None:
         "Tests that PR fails mandatory lint check"
-        pr = GitHubPR("pytorch", "pytorch", 74649)
+        pr = GitHubPR("pytorch", "pytorch", 90791)
         repo = DummyGitRepo()
         self.assertRaises(RuntimeError, lambda: find_matching_merge_rule(pr, repo))
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_get_last_comment(self, mocked_gql: Any) -> None:
+    def test_get_last_comment(self, mocked_gql: Any, *args: Any) -> None:
         "Tests that last comment can be fetched"
         pr = GitHubPR("pytorch", "pytorch", 71759)
         comment = pr.get_last_comment()
@@ -182,7 +231,7 @@ def test_get_last_comment(self, mocked_gql: Any) -> None:
         self.assertTrue("You've committed this PR" in comment.body_text)
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_get_author_null(self, mocked_gql: Any) -> None:
+    def test_get_author_null(self, mocked_gql: Any, *args: Any) -> None:
         """ Tests that PR author can be computed
             If reply contains NULL
         """
@@ -199,7 +248,20 @@ def test_get_author_null(self, mocked_gql: Any) -> None:
         self.assertTrue(author is not None)
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_large_diff(self, mocked_gql: Any) -> None:
+    def test_last_pushed_at(self, mocked_gql: Any, *args: Any) -> None:
+        """ Tests that last_pushed_at will return None on merge commits.
+        """
+        pr = GitHubPR("pytorch", "pytorch", 71759)
+        self.assertIsNotNone(pr.last_pushed_at())
+
+        # 307120d6d3f7fcc3f92cfd26be891d360ad6a92a is merge commit
+        # and as such does not have a pushedDate
+        # See https://github.com/pytorch/pytorch/pull/94146#issuecomment-1421647117
+        pr = GitHubPR("pytorch", "pytorch", 94146)
+        self.assertIsNone(pr.last_pushed_at())
+
+    @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
+    def test_large_diff(self, mocked_gql: Any, *args: Any) -> None:
         "Tests that PR with 100+ files can be fetched"
         pr = GitHubPR("pytorch", "pytorch", 73099)
         self.assertTrue(pr.get_changed_files_count() > 100)
@@ -207,25 +269,25 @@ def test_large_diff(self, mocked_gql: Any) -> None:
         self.assertEqual(len(flist), pr.get_changed_files_count())
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_internal_changes(self, mocked_gql: Any) -> None:
+    def test_internal_changes(self, mocked_gql: Any, *args: Any) -> None:
         "Tests that PR with internal changes is detected"
         pr = GitHubPR("pytorch", "pytorch", 73969)
         self.assertTrue(pr.has_internal_changes())
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_checksuites_pagination(self, mocked_gql: Any) -> None:
+    def test_checksuites_pagination(self, mocked_gql: Any, *args: Any) -> None:
         "Tests that PR with lots of checksuits can be fetched"
         pr = GitHubPR("pytorch", "pytorch", 73811)
         self.assertEqual(len(pr.get_checkrun_conclusions()), 76)
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_comments_pagination(self, mocked_gql: Any) -> None:
+    def test_comments_pagination(self, mocked_gql: Any, *args: Any) -> None:
         "Tests that PR with 50+ comments can be fetched"
         pr = GitHubPR("pytorch", "pytorch", 31093)
         self.assertGreater(len(pr.get_comments()), 50)
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_gql_complexity(self, mocked_gql: Any) -> None:
+    def test_gql_complexity(self, mocked_gql: Any, *args: Any) -> None:
         "Fetch comments and conclusions for PR with 60 commits"
         # Previous version of GrapQL query used to cause HTTP/502 error
         # see https://gist.github.com/malfet/9b93bc7eeddeaf1d84546efc4f0c577f
@@ -235,7 +297,13 @@ def test_gql_complexity(self, mocked_gql: Any) -> None:
         self.assertGreater(pr.get_commit_count(), 60)
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_team_members(self, mocked_gql: Any) -> None:
+    def test_gql_retrieve_checksuites(self, mocked_gql: Any, *args: Any) -> None:
+        "Fetch comments and conclusions for PR with 60 commits"
+        pr = GitHubPR("pytorch", "pytorch", 94787)
+        self.assertEqual(len(pr.get_checkrun_conclusions()), 183)
+
+    @mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
+    def test_team_members(self, mocked_gql: Any, *args: Any) -> None:
         "Test fetching team members works"
         dev_infra_team = gh_get_team_members("pytorch", "pytorch-dev-infra")
         self.assertGreater(len(dev_infra_team), 2)
@@ -244,7 +312,7 @@ def test_team_members(self, mocked_gql: Any) -> None:
             self.assertEqual(len(non_existing_team), 0)
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_get_author_many_commits(self, mocked_gql: Any) -> None:
+    def test_get_author_many_commits(self, mocked_gql: Any, *args: Any) -> None:
         """ Tests that authors for all commits can be fetched
         """
         pr = GitHubPR("pytorch", "pytorch", 76118)
@@ -255,7 +323,7 @@ def test_get_author_many_commits(self, mocked_gql: Any) -> None:
 
     @mock.patch('trymerge.read_merge_rules', side_effect=mocked_read_merge_rules_NE)
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_pending_status_check(self, mocked_gql: Any, mocked_read_merge_rules: Any) -> None:
+    def test_pending_status_check(self, mocked_gql: Any, mocked_read_merge_rules: Any, *args: Any) -> None:
         """ Tests that PR with nonexistent/pending status checks fails with the right reason.
         """
         pr = GitHubPR("pytorch", "pytorch", 76118)
@@ -265,7 +333,7 @@ def test_pending_status_check(self, mocked_gql: Any, mocked_read_merge_rules: An
                                lambda: find_matching_merge_rule(pr, repo))
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_get_author_many_reviews(self, mocked_gql: Any) -> None:
+    def test_get_author_many_reviews(self, mocked_gql: Any, *args: Any) -> None:
         """ Tests that all reviews can be fetched
         """
         pr = GitHubPR("pytorch", "pytorch", 76123)
@@ -275,7 +343,7 @@ def test_get_author_many_reviews(self, mocked_gql: Any) -> None:
         self.assertGreater(len(pr._reviews), 100)
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_get_checkruns_many_runs(self, mocked_gql: Any) -> None:
+    def test_get_checkruns_many_runs(self, mocked_gql: Any, *args: Any) -> None:
         """ Tests that all checkruns can be fetched
         """
         pr = GitHubPR("pytorch", "pytorch", 77700)
@@ -284,7 +352,7 @@ def test_get_checkruns_many_runs(self, mocked_gql: Any) -> None:
         self.assertTrue("pull / linux-docs / build-docs (cpp)" in conclusions.keys())
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_cancelled_gets_ignored(self, mocked_gql: Any) -> None:
+    def test_cancelled_gets_ignored(self, mocked_gql: Any, *args: Any) -> None:
         """ Tests that cancelled workflow does not override existing successfull status
         """
         pr = GitHubPR("pytorch", "pytorch", 82169)
@@ -294,7 +362,7 @@ def test_cancelled_gets_ignored(self, mocked_gql: Any) -> None:
         self.assertTrue(all([conclusions[name].status == "SUCCESS" for name in lint_checks]))
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_get_many_land_checks(self, mocked_gql: Any) -> None:
+    def test_get_many_land_checks(self, mocked_gql: Any, *args: Any) -> None:
         """ Tests that all checkruns can be fetched for a commit
         """
         conclusions = get_land_checkrun_conclusions('pytorch', 'pytorch', '6882717f73deffb692219ccd1fd6db258d8ed684')
@@ -302,7 +370,7 @@ def test_get_many_land_checks(self, mocked_gql: Any) -> None:
         self.assertTrue("pull / linux-docs / build-docs (cpp)" in conclusions.keys())
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_failed_land_checks(self, mocked_gql: Any) -> None:
+    def test_failed_land_checks(self, mocked_gql: Any, *args: Any) -> None:
         """ Tests that PR with Land Checks fail with a RunTime error
         """
         self.assertRaisesRegex(RuntimeError,
@@ -312,14 +380,14 @@ def test_failed_land_checks(self, mocked_gql: Any) -> None:
     @mock.patch('trymerge.gh_get_pr_info', return_value=mock_gh_get_info())
     @mock.patch('trymerge.parse_args', return_value=mock_parse_args(True, False))
     @mock.patch('trymerge.try_revert', side_effect=mock_revert)
-    def test_main_revert(self, mock_revert: Any, mock_parse_args: Any, gh_get_pr_info: Any) -> None:
+    def test_main_revert(self, mock_revert: Any, mock_parse_args: Any, gh_get_pr_info: Any, *args: Any) -> None:
         trymerge_main()
         mock_revert.assert_called_once()
 
     @mock.patch('trymerge.gh_get_pr_info', return_value=mock_gh_get_info())
     @mock.patch('trymerge.parse_args', return_value=mock_parse_args(False, True))
     @mock.patch('trymerge.merge', side_effect=mock_merge)
-    def test_main_force(self, mock_merge: Any, mock_parse_args: Any, mock_gh_get_info: Any) -> None:
+    def test_main_force(self, mock_merge: Any, mock_parse_args: Any, mock_gh_get_info: Any, *args: Any) -> None:
         trymerge_main()
         mock_merge.assert_called_once_with(mock.ANY,
                                            mock.ANY,
@@ -333,7 +401,7 @@ def test_main_force(self, mock_merge: Any, mock_parse_args: Any, mock_gh_get_inf
     @mock.patch('trymerge.gh_get_pr_info', return_value=mock_gh_get_info())
     @mock.patch('trymerge.parse_args', return_value=mock_parse_args(False, False))
     @mock.patch('trymerge.merge', side_effect=mock_merge)
-    def test_main_merge(self, mock_merge: Any, mock_parse_args: Any, mock_gh_get_info: Any) -> None:
+    def test_main_merge(self, mock_merge: Any, mock_parse_args: Any, mock_gh_get_info: Any, *args: Any) -> None:
         trymerge_main()
         mock_merge.assert_called_once_with(mock.ANY,
                                            mock.ANY,
@@ -346,14 +414,27 @@ def test_main_merge(self, mock_merge: Any, mock_parse_args: Any, mock_gh_get_inf
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
     @mock.patch('trymerge.read_merge_rules', side_effect=mocked_read_merge_rules)
-    def test_revert_rules(self, mock_gql: Any, mock_mr: Any) -> None:
+    def test_revert_rules(self, mock_gql: Any, mock_mr: Any, *args: Any) -> None:
         """ Tests that reverts from collaborators are allowed """
         pr = GitHubPR("pytorch", "pytorch", 79694)
         repo = DummyGitRepo()
         self.assertIsNotNone(validate_revert(repo, pr, comment_id=1189459845))
 
     @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
-    def test_revert_codev_fails(self, mock_gql: Any) -> None:
+    def test_get_changed_files(self, mock_gql: Any, *args: Any) -> None:
+        """
+        Tests that the list changed files in a PR doesn't include duplicates
+        """
+        pr = GitHubPR("pytorch", "pytorch", 95233)
+        try:
+            changed_files = pr.get_changed_files()
+        except RuntimeError as error:
+            self.fail(f"get_changed_files throws an exception: {error}")
+
+        self.assertEqual(len(changed_files), pr.get_changed_files_count())
+
+    @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
+    def test_revert_codev_fails(self, mock_gql: Any, *args: Any) -> None:
         pr = GitHubPR("pytorch", "pytorch", 91340)
 
         class GitRepoCoDev(GitRepo):
@@ -369,5 +450,32 @@ def commit_message(self, ref: str) -> str:
         repo = GitRepoCoDev()
         self.assertRaisesRegex(PostCommentError, "landed via phabricator", lambda: validate_revert(repo, pr, comment_id=1372496233))
 
+@mock.patch("trymerge.get_rockset_results", side_effect=mocked_rockset_results)
+@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
+class TestBypassFailures(TestCase):
+    def test_get_classifications(self, *args: Any) -> None:
+        flaky_rules = [FlakyRule("distributed", ["##[error]The operation was canceled."])]
+        pr = GitHubPR("pytorch", "pytorch", 92863)
+        checks = get_combined_checks_from_pr_and_land_validation(pr, None)
+        checks = get_classifications(pr.last_commit()['oid'], pr.get_merge_base(), checks, flaky_rules)
+        self.assertTrue(
+            checks[
+                "pull / linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.4xlarge)"
+            ].classification
+            == "BROKEN_TRUNK"
+        )
+        self.assertTrue(
+            checks[
+                "pull / linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)"
+            ].classification
+            == "FLAKY"
+        )
+        pending, failed = categorize_checks(checks, list(checks.keys()), ok_failed_checks_threshold=2)
+        self.assertTrue(len(pending) == 0)
+        self.assertTrue(len(failed) == 0)
+        pending, failed = categorize_checks(checks, list(checks.keys()), ok_failed_checks_threshold=1)
+        self.assertTrue(len(pending) == 0)
+        self.assertTrue(len(failed) == 2)
+
 if __name__ == "__main__":
     main()
diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py
index 182d39e0f5de..0c70e022a718 100755
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@@ -1,5 +1,15 @@
 #!/usr/bin/env python3
 
+# NB: the following functions are used in Meta-internal workflows
+# (github_first_try_merge/my_handler.py) and thus have functionality limitations
+# (no `git` command access, no network access besides the strict allow list):
+#
+# find_matching_merge_rule
+# read_merge_rules
+#
+# Also any signature changes of these functions, as well as changes to the `GitHubPR`
+# class, will likely require corresponding changes for the internal workflows.
+
 import base64
 import json
 import os
@@ -15,33 +25,43 @@
     Callable,
     Dict,
     List,
-    NamedTuple,
     Optional,
     Pattern,
     Tuple,
-    Union,
     cast,
 )
-from urllib.error import HTTPError
-from urllib.request import Request, urlopen
 from warnings import warn
 from pathlib import Path
 
 from gitutils import (
     GitRepo,
+    are_ghstack_branches_in_sync,
     get_git_remote_name,
     get_git_repo_dir,
     patterns_to_regex,
 )
+from github_utils import (
+    GitHubComment,
+    gh_fetch_json_list,
+    gh_fetch_url,
+    gh_post_commit_comment,
+    gh_post_pr_comment,
+)
+from label_utils import gh_add_labels
 from trymerge_explainer import (
     TryMergeExplainer,
     get_revert_message,
 )
 
-class JobCheckState(NamedTuple):
-    name: str
-    url: str
-    status: Optional[str]
+class JobCheckState:
+    def __init__(self, name: str, url: str, status: Optional[str], classification: Optional[str] = None):
+        self.name = name
+        self.url = url
+        self.status = status
+        self.classification = classification
+
+    def __repr__(self) -> str:
+        return f"JobCheckState([{self.name},{self.url},{self.status},{self.classification}])"
 
 JobNameToStateDict = Dict[str, JobCheckState]
 
@@ -52,6 +72,18 @@ def __init__(self, name: str, url: str, status: Optional[str]):
         self.status: Optional[str] = status
         self.jobs: JobNameToStateDict = {}
 
+class FlakyRule:
+    def __init__(self, name: str, captures: List[str]):
+        self.name = name
+        self.captures = captures
+
+    def matches(self, job: Optional[Dict[str, Any]]) -> bool:
+        return (
+            job is not None
+            and self.name in job.get('name', '')
+            and job.get("failure_captures") is not None
+            and all([capture in job.get("failure_captures", []) for capture in self.captures])
+        )
 
 GH_PR_REVIEWS_FRAGMENT = """
 fragment PRReviews on PullRequestReviewConnection {
@@ -423,63 +455,8 @@ def __init__(self, name: str, url: str, status: Optional[str]):
 MERGE_RULE_PATH = Path(".github") / "merge_rules.yaml"
 
 
-def _fetch_url(url: str, *,
-               headers: Optional[Dict[str, str]] = None,
-               data: Optional[Dict[str, Any]] = None,
-               method: Optional[str] = None,
-               reader: Callable[[Any], Any] = lambda x: x.read()) -> Any:
-    if headers is None:
-        headers = {}
-    token = os.environ.get("GITHUB_TOKEN")
-    if token is not None and url.startswith('https://api.github.com/'):
-        headers['Authorization'] = f'token {token}'
-    data_ = json.dumps(data).encode() if data is not None else None
-    try:
-        with urlopen(Request(url, headers=headers, data=data_, method=method)) as conn:
-            return reader(conn)
-    except HTTPError as err:
-        if err.code == 403 and all(key in err.headers for key in ['X-RateLimit-Limit', 'X-RateLimit-Used']):
-            print(f"Rate limit exceeded: {err.headers['X-RateLimit-Used']}/{err.headers['X-RateLimit-Limit']}")
-        raise
-
-def fetch_json(url: str,
-               params: Optional[Dict[str, Any]] = None,
-               data: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
-    headers = {'Accept': 'application/vnd.github.v3+json'}
-    if params is not None and len(params) > 0:
-        url += '?' + '&'.join(f"{name}={urllib.parse.quote(str(val))}" for name, val in params.items())
-    return cast(List[Dict[str, Any]], _fetch_url(url, headers=headers, data=data, reader=json.load))
-
-def fetch_json_dict(url: str,
-                    params: Optional[Dict[str, Any]] = None,
-                    data: Optional[Dict[str, Any]] = None) -> Dict[str, Any] :
-    headers = {'Accept': 'application/vnd.github.v3+json'}
-    if params is not None and len(params) > 0:
-        url += '?' + '&'.join(f"{name}={urllib.parse.quote(str(val))}" for name, val in params.items())
-    return cast(Dict[str, Any], _fetch_url(url, headers=headers, data=data, reader=json.load))
-
-def _gh_post_comment(url: str, comment: str, dry_run: bool = False) -> List[Dict[str, Any]]:
-    if dry_run:
-        print(comment)
-        return []
-    return fetch_json(url, data={"body": comment})
-
-
-def gh_post_pr_comment(org: str, project: str, pr_num: int, comment: str, dry_run: bool = False) -> List[Dict[str, Any]]:
-    return _gh_post_comment(f'https://api.github.com/repos/{org}/{project}/issues/{pr_num}/comments', comment, dry_run)
-
-
-def gh_post_commit_comment(org: str, project: str, sha: str, comment: str, dry_run: bool = False) -> List[Dict[str, Any]]:
-    return _gh_post_comment(f'https://api.github.com/repos/{org}/{project}/commits/{sha}/comments', comment, dry_run)
-
-
-def gh_add_labels(org: str, project: str, pr_num: int, labels: Union[str, List[str]]) -> None:
-    fetch_json(f'https://api.github.com/repos/{org}/{project}/issues/{pr_num}/labels',
-               data={"labels": labels})
-
-
 def gh_graphql(query: str, **kwargs: Any) -> Dict[str, Any]:
-    rc = _fetch_url("https://api.github.com/graphql", data={"query": query, "variables": kwargs}, reader=json.load)
+    rc = gh_fetch_url("https://api.github.com/graphql", data={"query": query, "variables": kwargs}, reader=json.load)
     if "errors" in rc:
         raise RuntimeError(f"GraphQL query {query}, args {kwargs} failed: {rc['errors']}")
     return cast(Dict[str, Any], rc)
@@ -573,10 +550,12 @@ def add_conclusions(edges: Any) -> None:
                 else:
                     checkruns = None
 
-    add_conclusions(checksuites["edges"])
+    all_edges = checksuites["edges"].copy()
     while bool(checksuites["pageInfo"]["hasNextPage"]):
         checksuites = get_next_checksuites(checksuites)
-        add_conclusions(checksuites["edges"])
+        all_edges.extend(checksuites["edges"])
+
+    add_conclusions(all_edges)
 
     # Flatten the dictionaries.  If there exists jobs in the workflow run, put
     # the jobs in but don't put the workflow in.  We care more about the jobs in
@@ -619,6 +598,7 @@ def can_skip_internal_checks(pr: "GitHubPR", comment_id: Optional[int] = None) -
         return False
     return comment.author_login == "facebook-github-bot"
 
+
 def get_ghstack_prs(repo: GitRepo, pr: "GitHubPR") -> List[Tuple["GitHubPR", str]]:
     '''
     Get the open PRs in the stack that are below this PR.  Throws error if any of the PRs are out of sync.
@@ -646,9 +626,7 @@ def get_ghstack_prs(repo: GitRepo, pr: "GitHubPR") -> List[Tuple["GitHubPR", str
             entire_stack.append((pr, rev))
 
     for stacked_pr, rev in entire_stack:
-        commit_sha = stacked_pr.last_commit()['oid']
-        tree_sha = repo._run_git("rev-parse", commit_sha + "^{tree}")
-        if tree_sha not in repo.commit_message(rev):
+        if not are_ghstack_branches_in_sync(repo, stacked_pr.head_ref()):
             raise RuntimeError(
                 f"PR {stacked_pr.pr_num} is out of sync with the corresponding revision {rev} on " +
                 f"branch {orig_ref} that would be merged into master.  " +
@@ -657,15 +635,6 @@ def get_ghstack_prs(repo: GitRepo, pr: "GitHubPR") -> List[Tuple["GitHubPR", str
             )
     return entire_stack
 
-@dataclass
-class GitHubComment:
-    body_text: str
-    created_at: str
-    author_login: str
-    author_association: str
-    editor_login: Optional[str]
-    database_id: int
-
 
 class GitHubPR:
     def __init__(self, org: str, project: str, pr_num: int) -> None:
@@ -680,6 +649,7 @@ def __init__(self, org: str, project: str, pr_num: int) -> None:
         self.comments: Optional[List[GitHubComment]] = None
         self._authors: Optional[List[Tuple[str, str]]] = None
         self._reviews: Optional[List[Tuple[str, str]]] = None
+        self.merge_base: Optional[str] = None
 
     def is_closed(self) -> bool:
         return bool(self.info["closed"])
@@ -705,19 +675,42 @@ def is_base_repo_private(self) -> bool:
     def get_changed_files_count(self) -> int:
         return int(self.info["changedFiles"])
 
-    def last_pushed_at(self) -> datetime:
-        return datetime.fromisoformat(self.last_commit()['pushedDate'][:-1])
+    def last_pushed_at(self) -> Optional[datetime]:
+        pushed_date = self.last_commit()["pushedDate"]
+        if pushed_date is None:
+            return None
+        return datetime.fromisoformat(pushed_date[:-1])
 
     def last_commit(self) -> Any:
         return self.info["commits"]["nodes"][-1]["commit"]
 
+    def fetch(self, branch_name: Optional[str] = None) -> None:
+        repo = GitRepo(get_git_repo_dir(), get_git_remote_name())
+        if branch_name is None:
+            branch_name = f"__pull-request-{self.pr_num}__init__"
+        try:
+            r = repo._run_git("rev-parse", branch_name)
+            if r.strip() == self.last_commit()['oid']:
+                return
+        except Exception:
+            pass
+        repo.fetch(f"pull/{self.pr_num}/head", branch_name)
+
+    def get_merge_base(self) -> str:
+        if self.merge_base is not None:
+            return self.merge_base
+        self.fetch()
+        gitrepo = GitRepo(get_git_repo_dir(), get_git_remote_name())
+        self.merge_base = gitrepo.get_merge_base("origin/master", self.last_commit()['oid'])
+        return self.merge_base
+
     def get_changed_files(self) -> List[str]:
         if self.changed_files is None:
             info = self.info
-            self.changed_files = []
+            unique_changed_files = set()
             # Do not try to fetch more than 10K files
             for _ in range(100):
-                self.changed_files += [x["path"] for x in info["files"]["nodes"]]
+                unique_changed_files.update([x["path"] for x in info["files"]["nodes"]])
                 if not info["files"]["pageInfo"]["hasNextPage"]:
                     break
                 rc = gh_graphql(GH_GET_PR_NEXT_FILES_QUERY,
@@ -726,6 +719,7 @@ def get_changed_files(self) -> List[str]:
                                 number=self.pr_num,
                                 cursor=info["files"]["pageInfo"]["endCursor"])
                 info = rc["data"]["repository"]["pullRequest"]
+            self.changed_files = list(unique_changed_files)
 
         if len(self.changed_files) != self.get_changed_files_count():
             raise RuntimeError("Changed file count mismatch")
@@ -808,7 +802,7 @@ def get_checkrun_conclusions(self) -> JobNameToStateDict:
         """ Returns dict of checkrun -> [conclusion, url] """
         if self.conclusions is not None:
             return self.conclusions
-        orig_last_commit = self.info["commits"]["nodes"][-1]["commit"]
+        orig_last_commit = self.last_commit()
 
         def get_pr_next_check_runs(edges: List[Dict[str, Dict[str, Any]]], edge_idx: int, checkruns: Any) -> Any:
             rc = gh_graphql(GH_GET_PR_NEXT_CHECK_RUNS,
@@ -1020,7 +1014,7 @@ def merge_changes(self,
         if not self.is_ghstack_pr():
             msg = self.gen_commit_message()
             pr_branch_name = f"__pull-request-{self.pr_num}__init__"
-            repo.fetch(f"pull/{self.pr_num}/head", pr_branch_name)
+            self.fetch(pr_branch_name)
             repo._run_git("merge", "--squash", pr_branch_name)
             repo._run_git("commit", f"--author=\"{self.get_author()}\"", "-m", msg)
             return []
@@ -1063,11 +1057,14 @@ def delete_land_time_check_branch(self,
         repo._run_git('push', 'origin', '-d', land_check_branch)
 
 
-class MandatoryChecksMissingError(Exception):
+class MergeRuleFailedError(RuntimeError):
     def __init__(self, message: str, rule: Optional['MergeRule'] = None) -> None:
         super().__init__(message)
         self.rule = rule
 
+class MandatoryChecksMissingError(MergeRuleFailedError):
+    pass
+
 class PostCommentError(Exception):
     pass
 
@@ -1078,7 +1075,7 @@ class MergeRule:
     patterns: List[str]
     approved_by: List[str]
     mandatory_checks_name: Optional[List[str]]
-
+    ignore_flaky_failures: bool = True
 
 def gen_new_issue_link(
     org: str,
@@ -1093,9 +1090,14 @@ def gen_new_issue_link(
 
 
 def read_merge_rules(repo: Optional[GitRepo], org: str, project: str) -> List[MergeRule]:
+    """Returns the list of all merge rules for the repo or project.
+
+    NB: this function is used in Meta-internal workflows, see the comment
+    at the top of this file for details.
+    """
     repo_relative_rules_path = MERGE_RULE_PATH
     if repo is None:
-        json_data = _fetch_url(
+        json_data = gh_fetch_url(
             f"https://api.github.com/repos/{org}/{project}/contents/{repo_relative_rules_path}",
             headers={'Accept': 'application/vnd.github.v3+json'},
             reader=json.load,
@@ -1112,6 +1114,12 @@ def read_merge_rules(repo: Optional[GitRepo], org: str, project: str) -> List[Me
         return [MergeRule(**x) for x in rc]
 
 
+def read_flaky_rules() -> List[FlakyRule]:
+    # NOTE: This is currently hardcoded, can be extended to do per repo rules
+    FLAKY_RULES_URL = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/flaky-rules.json"
+    return _get_flaky_rules(FLAKY_RULES_URL)
+
+
 def find_matching_merge_rule(
     pr: GitHubPR,
     repo: Optional[GitRepo] = None,
@@ -1119,10 +1127,13 @@ def find_matching_merge_rule(
     skip_internal_checks: bool = False,
     land_check_commit: Optional[str] = None,
 ) -> MergeRule:
-    """Returns merge rule matching to this pr or raises an exception"""
+    """Returns merge rule matching to this pr or raises an exception.
+
+    NB: this function is used in Meta-internal workflows, see the comment
+    at the top of this file for details.
+    """
     changed_files = pr.get_changed_files()
     approved_by = set(pr.get_approved_by())
-    checks = get_combined_checks_from_pr_and_land_validation(pr, land_check_commit)
 
     issue_link = gen_new_issue_link(
         org=pr.org,
@@ -1132,9 +1143,22 @@ def find_matching_merge_rule(
     reject_reason = f"No rule found to match PR. Please [report]{issue_link} this issue to DevX team."
 
     rules = read_merge_rules(repo, pr.org, pr.project)
+    flaky_rules = read_flaky_rules()
     if not rules:
         reject_reason = f"Rejecting the merge as no rules are defined for the repository in {MERGE_RULE_PATH}"
         raise RuntimeError(reject_reason)
+    checks = get_combined_checks_from_pr_and_land_validation(pr, land_check_commit)
+    base_rev = None
+    try:
+        # is allowed to fail if git is not available
+        base_rev = pr.get_merge_base()
+    except Exception as e:
+        print(
+            f"Failed fetching base git revision for {pr.pr_num}. Skipping additional classifications.\n"
+            f"{type(e)}\n{e}"
+        )
+    if base_rev is not None:
+        checks = get_classifications(pr.last_commit()['oid'], base_rev, checks, flaky_rules)
 
     # PRs can fail multiple merge rules, but it only needs to pass one rule to be approved.
     # If it fails all rules, we need to find the rule that it came closest to passing and report
@@ -1173,7 +1197,7 @@ def find_matching_merge_rule(
         if len(rule.approved_by) > 0 and len(approved_by) == 0:
             if reject_reason_score < 10000:
                 reject_reason_score = 10000
-                reject_reason = f"PR #{pr.pr_num} has not been reviewed yet (Rule {rule_name})"
+                reject_reason = f"PR #{pr.pr_num} has not been reviewed yet"
             continue
 
         # Does the PR have the required approvals for this rule?
@@ -1190,7 +1214,7 @@ def find_matching_merge_rule(
             if reject_reason_score < 10000:
                 reject_reason_score = 10000
                 reject_reason = "\n".join((
-                    f"Approval needed from one of the following (Rule '{rule_name}'):",
+                    "Approval needed from one of the following:",
                     f"{', '.join(list(rule_approvers_set)[:5])}{', ...' if len(rule_approvers_set) > 5 else ''}"
                 ))
             continue
@@ -1198,14 +1222,18 @@ def find_matching_merge_rule(
         # Does the PR pass the checks required by this rule?
         mandatory_checks = rule.mandatory_checks_name if rule.mandatory_checks_name is not None else []
         required_checks = list(filter(lambda x: "EasyCLA" in x or not skip_mandatory_checks, mandatory_checks))
-        [pending_checks, failed_checks] = categorize_checks(checks, required_checks)
+        [pending_checks, failed_checks] = categorize_checks(
+            checks,
+            required_checks,
+            ok_failed_checks_threshold=3 if rule.ignore_flaky_failures else 0
+        )
 
         hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}"
         if len(failed_checks) > 0:
             if reject_reason_score < 30000:
                 reject_reason_score = 30000
                 reject_reason = "\n".join((
-                    f"{len(failed_checks)} mandatory check(s) failed (Rule `{rule_name}`).  The first few are:",
+                    f"{len(failed_checks)} mandatory check(s) failed.  The first few are:",
                     *checks_to_markdown_bullets(failed_checks),
                     "",
                     f"Dig deeper by [viewing the failures on hud]({hud_link})"
@@ -1215,7 +1243,7 @@ def find_matching_merge_rule(
             if reject_reason_score < 20000:
                 reject_reason_score = 20000
                 reject_reason = "\n".join((
-                    f"{len(pending_checks)} mandatory check(s) are pending/not yet run (Rule `{rule_name}`).  The first few are:",
+                    f"{len(pending_checks)} mandatory check(s) are pending/not yet run.  The first few are:",
                     *checks_to_markdown_bullets(pending_checks),
                     "",
                     f"Dig deeper by [viewing the pending checks on hud]({hud_link})"
@@ -1229,7 +1257,7 @@ def find_matching_merge_rule(
 
     if reject_reason_score == 20000:
         raise MandatoryChecksMissingError(reject_reason, rule)
-    raise RuntimeError(reject_reason)
+    raise MergeRuleFailedError(reject_reason, rule)
 
 
 def get_land_checkrun_conclusions(org: str, project: str, commit: str) -> JobNameToStateDict:
@@ -1265,6 +1293,92 @@ def checks_to_str(checks: List[Tuple[str, Optional[str]]]) -> str:
 def checks_to_markdown_bullets(checks: List[Tuple[str, Optional[str]]]) -> List[str]:
     return [f"- [{c[0]}]({c[1]})" if c[1] is not None else f"- {c[0]}" for c in checks[:5]]
 
+
+def _get_flaky_rules(url: str, num_retries: int = 3) -> List[FlakyRule]:
+    try:
+        return [FlakyRule(**rule) for rule in gh_fetch_json_list(url)]
+    except Exception as e:
+        print(f"Could not download {url} because: {e}.")
+        if num_retries > 0:
+            return _get_flaky_rules(url, num_retries=num_retries - 1)
+        return []
+
+
+def get_rockset_results(head_sha: str, merge_base: str, num_retries: int = 3) -> List[Dict[str, Any]]:
+    query = f"""
+SELECT
+    w.name as workflow_name,
+    j.id,
+    j.name,
+    j.conclusion,
+    j.completed_at,
+    j.html_url,
+    j.head_sha,
+    j.torchci_classification.captures as failure_captures,
+    LENGTH(j.steps) as steps,
+FROM
+    commons.workflow_job j join commons.workflow_run w on w.id = j.run_id
+where
+    j.head_sha in ('{head_sha}','{merge_base}')
+"""
+    try:
+        import rockset  # type: ignore[import]
+        res = rockset.RocksetClient(
+            host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
+        ).sql(query)
+        return cast(List[Dict[str, Any]], res.results)
+    except ModuleNotFoundError:
+        print("Could not use RockSet as rocket dependency is missing")
+        return []
+    except Exception as e:
+        print(f"Could not download rockset data because: {e}.")
+        if num_retries > 0:
+            return get_rockset_results(head_sha, merge_base, num_retries=num_retries - 1)
+        return []
+
+
+def get_classifications(
+    head_sha: str,
+    merge_base: str,
+    checks: Dict[str, JobCheckState],
+    flaky_rules: List[FlakyRule]
+) -> Dict[str, JobCheckState]:
+
+    rockset_results = get_rockset_results(head_sha, merge_base)
+    head_sha_jobs: Dict[str, Dict[str, Any]] = {}
+    merge_base_jobs: Dict[str, Dict[str, Any]] = {}
+
+    def insert(d: Dict[str, Dict[str, Any]], key: str, val: Dict[str, Any]) -> None:
+        if key not in d:
+            d[key] = val
+            return
+        if d[key]["id"] < val["id"]:
+            d[key] = val
+
+    for rockset_result in rockset_results:
+        name = f"{rockset_result['workflow_name']} / {rockset_result['name']}"
+        if rockset_result["head_sha"] == head_sha:
+            insert(head_sha_jobs, name, rockset_result)
+        else:
+            insert(merge_base_jobs, name, rockset_result)
+
+    for name, check in checks.items():
+        if check.status == "SUCCESS":
+            continue
+        head_sha_job = head_sha_jobs.get(name)
+        merge_base_job = merge_base_jobs.get(name)
+        if (
+            head_sha_job is not None
+            and merge_base_job is not None
+            and head_sha_job["conclusion"] == merge_base_job["conclusion"]
+            and head_sha_job["failure_captures"] == merge_base_job["failure_captures"]
+        ):
+            check.classification = "BROKEN_TRUNK"
+        elif any([rule.matches(head_sha_job) for rule in flaky_rules]):
+            check.classification = "FLAKY"
+    return checks
+
+
 def get_combined_checks_from_pr_and_land_validation(
     pr: GitHubPR,
     land_check_commit: Optional[str],
@@ -1367,7 +1481,7 @@ def check_for_sev(org: str, project: str, skip_mandatory_checks: bool) -> None:
         return
     response = cast(
         Dict[str, Any],
-        fetch_json(
+        gh_fetch_json_list(
             "https://api.github.com/search/issues",
             params={"q": f'repo:{org}/{project} is:open is:issue label:"ci: sev"'},
         ),
@@ -1400,9 +1514,11 @@ def has_label(labels: List[str], pattern: Pattern[str] = CIFLOW_LABEL) -> bool:
 def categorize_checks(
     check_runs: JobNameToStateDict,
     required_checks: List[str],
+    ok_failed_checks_threshold: int = 3
 ) -> Tuple[List[Tuple[str, Optional[str]]], List[Tuple[str, Optional[str]]]]:
     pending_checks: List[Tuple[str, Optional[str]]] = []
     failed_checks: List[Tuple[str, Optional[str]]] = []
+    ok_failed_checks: List[Tuple[str, Optional[str]]] = []
 
     relevant_checknames = [name for name in check_runs.keys() if any([x in name for x in required_checks])]
 
@@ -1413,7 +1529,23 @@ def categorize_checks(
         if check_runs[checkname].status is None:
             pending_checks.append((checkname, check_runs[checkname].url))
         elif not is_passing_status(check_runs[checkname].status):
-            failed_checks.append((checkname, check_runs[checkname].url))
+            if check_runs[checkname].classification in ('BROKEN_TRUNK', 'FLAKY'):
+                ok_failed_checks.append((checkname, check_runs[checkname].url))
+            else:
+                failed_checks.append((checkname, check_runs[checkname].url))
+
+    if ok_failed_checks:
+        print(
+            f"The following {len(ok_failed_checks)} checks failed but were likely due flakiness or broken trunk: " +
+            ", ".join([x[0] for x in ok_failed_checks]) +
+            (f" but this is greater than the threshold of {ok_failed_checks_threshold} so merge will fail"
+             if len(ok_failed_checks) > ok_failed_checks_threshold
+             else '')
+        )
+
+    if len(ok_failed_checks) > ok_failed_checks_threshold:
+        failed_checks = failed_checks + ok_failed_checks
+
     return (pending_checks, failed_checks)
 
 def merge(pr_num: int, repo: GitRepo,
@@ -1465,7 +1597,9 @@ def merge(pr_num: int, repo: GitRepo,
         )
 
     gh_post_pr_comment(org, project, pr.pr_num, explainer.get_merge_message(land_check_commit), dry_run=dry_run)
-    if (datetime.utcnow() - pr.last_pushed_at()).days > stale_pr_days:
+    if pr.last_pushed_at() is None:
+        print(f"Can't get commit {pr.last_commit()['oid']} pushed date. Is it merge commit by chance?")
+    elif (datetime.utcnow() - cast(datetime, pr.last_pushed_at())).days > stale_pr_days:
         if land_checks and not dry_run:
             pr.delete_land_time_check_branch(repo)
         raise RuntimeError(f"This PR is too stale; the last push date was more than {stale_pr_days} days ago. "
@@ -1475,6 +1609,7 @@ def merge(pr_num: int, repo: GitRepo,
     start_time = time.time()
     last_exception = ''
     elapsed_time = 0.0
+    flaky_rules = read_flaky_rules()
     while elapsed_time < timeout_minutes * 60:
         check_for_sev(org, project, skip_mandatory_checks)
         current_time = time.time()
@@ -1488,15 +1623,23 @@ def merge(pr_num: int, repo: GitRepo,
         try:
             required_checks = []
             failed_rule_message = None
+            ignore_flaky_failures = True
             try:
                 find_matching_merge_rule(pr, repo)
             except MandatoryChecksMissingError as ex:
-                if ex.rule is not None and ex.rule.mandatory_checks_name is not None:
-                    required_checks = ex.rule.mandatory_checks_name
+                if ex.rule is not None:
+                    ignore_flaky_failures = ex.rule.ignore_flaky_failures
+                    if ex.rule.mandatory_checks_name is not None:
+                        required_checks = ex.rule.mandatory_checks_name
                 failed_rule_message = ex
 
             checks = get_combined_checks_from_pr_and_land_validation(pr, land_check_commit)
-            pending, failing = categorize_checks(checks, required_checks + [x for x in checks.keys() if x not in required_checks])
+            checks = get_classifications(pr.last_commit()['oid'], pr.get_merge_base(), checks, flaky_rules)
+            pending, failing = categorize_checks(
+                checks,
+                required_checks + [x for x in checks.keys() if x not in required_checks],
+                ok_failed_checks_threshold=3 if ignore_flaky_failures else 0
+            )
             # HACK until GitHub will be better about surfacing those
             startup_failures = filter_checks_with_lambda(checks, lambda status: status == "STARTUP_FAILURE")
             if len(startup_failures) > 0:
@@ -1549,15 +1692,20 @@ def main() -> None:
     def handle_exception(e: Exception, title: str = "Merge failed") -> None:
         exception = f"**Reason**: {e}"
 
+        failing_rule = None
+        if (isinstance(e, MergeRuleFailedError)):
+            failing_rule = e.rule.name if e.rule else None
+
         internal_debugging = ""
         run_url = os.getenv("GH_RUN_URL")
         if run_url is not None:
             # Hide this behind a collapsed bullet since it's not helpful to most devs
-            internal_debugging = "\n".join((
+            internal_debugging = "\n".join(line for line in (
                 "<details><summary>Details for Dev Infra team</summary>",
-                f"Raised by <a href=\"{run_url}\">workflow job</a>",
+                f"Raised by <a href=\"{run_url}\">workflow job</a>\n",
+                f"Failing merge rule: {failing_rule}" if failing_rule else "",
                 "</details>"
-            ))
+            ) if line)  # ignore empty lines during the join
 
         msg = "\n".join((
             f"## {title}",
diff --git a/.github/scripts/tryrebase.py b/.github/scripts/tryrebase.py
index 9f088e3d48b6..6681ee629c5d 100755
--- a/.github/scripts/tryrebase.py
+++ b/.github/scripts/tryrebase.py
@@ -6,7 +6,8 @@
 import re
 from typing import Any
 from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
-from trymerge import gh_post_pr_comment as gh_post_comment, GitHubPR
+from github_utils import gh_post_pr_comment as gh_post_comment
+from trymerge import GitHubPR
 
 SAME_SHA_ERROR = (
     "\n```\nAborting rebase because rebasing the branch resulted in the same sha as the target branch.\n" +
diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2
index f62e90cc3c45..70a2bd42ae9b 100644
--- a/.github/templates/upload.yml.j2
+++ b/.github/templates/upload.yml.j2
@@ -26,6 +26,11 @@
 {%- if not is_windows %}
       DOCKER_IMAGE: !{{ config["container_image"] }}
 {%- endif %}
+{%- if config["package_type"] == "manywheel" %}
+  {%- if config["devtoolset"] %}
+      DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }}
+  {%- endif %}
+{%- endif %}
 {%- if config["package_type"] == "libtorch" %}
   {%- if config["libtorch_config"] %}
       LIBTORCH_CONFIG: !{{ config["libtorch_config"] }}
@@ -37,7 +42,7 @@
   {%- if is_windows %}
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
   {%- endif %}
 {%- else %}
       DESIRED_PYTHON: "!{{ config["python_version"] }}"
diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml
index 2df7c2cd59e3..76911c20600d 100644
--- a/.github/workflows/_bazel-build-test.yml
+++ b/.github/workflows/_bazel-build-test.yml
@@ -101,7 +101,7 @@ jobs:
             -w /var/lib/jenkins/workspace \
             "${DOCKER_IMAGE}"
           )
-          docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .jenkins/pytorch/build.sh'
+          docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .ci/pytorch/build.sh'
 
       # !{{ common_android.upload_android_binary_size("", "")}}
       - name: Test
@@ -162,7 +162,7 @@ jobs:
             -w /var/lib/jenkins/workspace \
             "${DOCKER_IMAGE}"
           )
-          docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .jenkins/pytorch/test.sh && cp -Lr ./bazel-testlogs ./test/test-reports'
+          docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .ci/pytorch/test.sh && cp -Lr ./bazel-testlogs ./test/test-reports'
 
       - name: Print remaining test logs
         shell: bash
@@ -180,28 +180,6 @@ jobs:
         with:
           file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }}
 
-      - name: Upload test statistics
-        if: always()
-        env:
-          AWS_DEFAULT_REGION: us-east-1
-          GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
-          BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-          PYTORCH_RETRY_TEST_CASES: 1
-          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
-          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          TAG: ${{ steps.parse-ref.outputs.tag }}
-          WORKFLOW_ID: ${{ github.run_id }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
-        shell: bash
-        run: |
-          set -x
-          python3 -m pip install -r requirements.txt
-          python3 -m pip install boto3==1.19.12
-          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
-
       - name: Teardown Linux
         uses: pytorch/test-infra/.github/actions/teardown-linux@main
         if: always()
diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml
index a8c533070c8b..70753356648c 100644
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@@ -68,7 +68,7 @@ on:
 jobs:
   build:
     runs-on: linux.12xlarge
-    timeout-minutes: 150
+    timeout-minutes: 180
     env:
       PYTORCH_ROOT: ${{ inputs.PYTORCH_ROOT }}
       BUILDER_ROOT: ${{ inputs.BUILDER_ROOT }}
diff --git a/.github/workflows/_calculate-docker-image.yml b/.github/workflows/_calculate-docker-image.yml
new file mode 100644
index 000000000000..6b3294e6fa8f
--- /dev/null
+++ b/.github/workflows/_calculate-docker-image.yml
@@ -0,0 +1,37 @@
+name: calculate-docker-image
+
+on:
+  workflow_call:
+    inputs:
+      docker-image-name:
+        required: true
+        type: string
+        description: Name of the base docker image to build with.
+
+    outputs:
+      docker-image:
+        value: ${{ jobs.calculate-docker-image.outputs.docker-image }}
+        description: The docker image containing the built PyTorch.
+
+jobs:
+  calculate-docker-image:
+    if: github.repository_owner == 'pytorch'
+    runs-on: [self-hosted, linux.large]
+    timeout-minutes: 15
+    outputs:
+      docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        with:
+          submodules: false
+          fetch-depth: 1
+
+      - name: Setup Linux
+        uses: ./.github/actions/setup-linux
+
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: ./.github/actions/calculate-docker-image
+        with:
+          docker-image-name: ${{ inputs.docker-image-name }}
diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml
index d7efa1e9198f..850cc887b430 100644
--- a/.github/workflows/_docs.yml
+++ b/.github/workflows/_docs.yml
@@ -40,6 +40,7 @@ jobs:
     if: github.repository_owner == 'pytorch'
     runs-on: ${{ matrix.runner }}
     strategy:
+      fail-fast: false
       matrix:
         include:
           - docs_type: cpp
diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml
index be3d2ce98c03..a1b55ad6b893 100644
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@@ -158,7 +158,7 @@ jobs:
             -w /var/lib/jenkins/workspace \
             "${DOCKER_IMAGE}"
           )
-          docker exec -t "${container_name}" sh -c '.jenkins/pytorch/build.sh'
+          docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'
 
       - name: Archive artifacts into zip
         if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index d0ad326634de..8b1ae777a01f 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -98,7 +98,7 @@ jobs:
         shell: bash
         continue-on-error: true
         run: |
-          python3 -m pip install psutil==5.9.1 pynvml==11.4.1
+          python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
           python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
           echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
 
@@ -111,8 +111,17 @@ jobs:
         id: parse-ref
         run: .github/scripts/parse_ref.py
 
+      - name: Set Test step time
+        id: test-timeout
+        shell: bash
+        env:
+          JOB_TIMEOUT: ${{ inputs.timeout-minutes }}
+        run: |
+          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
+
       - name: Test
         id: test
+        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
         env:
           BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
           PR_NUMBER: ${{ github.event.pull_request.number }}
@@ -138,11 +147,11 @@ jobs:
           set -x
 
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
-            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+            TEST_COMMAND=.ci/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
-            TEST_COMMAND=.jenkins/onnx/test.sh
+            TEST_COMMAND=.ci/onnx/test.sh
           else
-            TEST_COMMAND=.jenkins/pytorch/test.sh
+            TEST_COMMAND=.ci/pytorch/test.sh
           fi
 
           COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
@@ -251,30 +260,6 @@ jobs:
           if-no-files-found: ignore
           path: ./**/core.[1-9]*
 
-      - name: Upload test statistics
-        if: always()
-        env:
-          AWS_DEFAULT_REGION: us-east-1
-          GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
-          BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          TEST_CONFIG: ${{ matrix.config }}
-          SHARD_NUMBER: ${{ matrix.shard }}
-          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-          PYTORCH_RETRY_TEST_CASES: 1
-          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
-          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          TAG: ${{ steps.parse-ref.outputs.tag }}
-          WORKFLOW_ID: ${{ github.run_id }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
-        shell: bash
-        run: |
-          set -x
-          python3 -m pip install -r requirements.txt
-          python3 -m pip install boto3==1.19.12
-          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
-
       - name: Teardown Linux
         uses: pytorch/test-infra/.github/actions/teardown-linux@main
         if: always()
diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml
index 5ee909f02c22..f5f66ae5129b 100644
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@@ -74,6 +74,9 @@ jobs:
     outputs:
       build-outcome: ${{ steps.build.outcome }}
     steps:
+      - name: Clean up disk space before running MacOS workflow
+        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
@@ -143,7 +146,7 @@ jobs:
           OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
         run: |
           echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}"
-          ${CONDA_RUN} .jenkins/pytorch/macos-build.sh
+          ${CONDA_RUN} .ci/pytorch/macos-build.sh
 
       - name: Archive artifacts into zip
         if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
@@ -168,3 +171,8 @@ jobs:
           retention-days: 14
           if-no-files-found: warn
           path: sccache-stats-*.json
+
+      - name: Clean up disk space
+        if: always()
+        continue-on-error: true
+        uses: pytorch/test-infra/.github/actions/check-disk-space@main
diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml
index 5fac3126e20d..9748e3cc48d3 100644
--- a/.github/workflows/_mac-test-mps.yml
+++ b/.github/workflows/_mac-test-mps.yml
@@ -25,6 +25,11 @@ jobs:
     name: "Run MPS tests"
     runs-on: ${{ inputs.runs-on }}
     steps:
+      - name: Print runner OS/HW info
+        shell: arch -arch arm64 bash {0}
+        run: |
+          sysctl machdep.cpu.brand_string kern.osproductversion
+
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
         with:
@@ -74,12 +79,13 @@ jobs:
         id: test
         env:
           ENV_NAME: conda-test-env-${{ github.run_id }}
+          PR_BODY: ${{ github.event.pull_request.body }}
+          PYTORCH_RETRY_TEST_CASES: 1
+          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
         shell: arch -arch arm64 bash {0}
         run: |
           # shellcheck disable=SC1090
           set -ex
-          # TODO(https://github.com/pytorch/pytorch/issues/79293)
-
           ${CONDA_RUN} python3 test/run_test.py --mps --verbose
 
       - name: Print remaining test logs
diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
index 39236a0dd082..d8ede95f2958 100644
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@@ -23,7 +23,12 @@ on:
         type: string
         description: |
           Contains the architecture to run the tests with
-
+      timeout-minutes:
+        required: false
+        type: number
+        default: 270
+        description: |
+          Set the maximum (in minutes) how long the workflow should take to finish
     secrets:
       AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID:
         required: true
@@ -67,7 +72,7 @@ jobs:
       matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }}
       fail-fast: false
     runs-on: ${{ matrix.runner }}
-    timeout-minutes: 240
+    timeout-minutes: ${{ inputs.timeout-minutes }}
     env:
       GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
       BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
@@ -78,6 +83,17 @@ jobs:
       PYTORCH_RETRY_TEST_CASES: 1
       PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
     steps:
+      - name: Clean up leftover processes on MacOS pet runner
+        continue-on-error: true
+        run: |
+          for PROCESS in "python" "conda" "ninja" "clang"; do
+            echo "Cleaning up all remaining ${PROCESS} process"
+            pkill "${PROCESS}" || true
+          done
+
+      - name: Clean up disk space before running MacOS workflow
+        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
@@ -127,8 +143,17 @@ jobs:
           # As wheels are cross-compiled they are reported as x86_64 ones
           ORIG_WHLNAME=$(ls -1 dist/*.whl); ARM_WHLNAME=${ORIG_WHLNAME/x86_64/arm64}; mv "${ORIG_WHLNAME}" "${ARM_WHLNAME}"
 
+      - name: Set Test step time
+        id: test-timeout
+        shell: bash
+        env:
+          JOB_TIMEOUT: ${{ inputs.timeout-minutes }}
+        run: |
+          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
+
       - name: Test
         id: test
+        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
         env:
           PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
           PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
@@ -149,7 +174,7 @@ jobs:
           arch
 
           ${CONDA_RUN} python3 -mpip install --no-index --no-deps $(echo dist/*.whl)
-          ${CONDA_RUN} .jenkins/pytorch/macos-test.sh
+          ${CONDA_RUN} .ci/pytorch/macos-test.sh
 
       - name: Print remaining test logs
         shell: bash
@@ -179,25 +204,7 @@ jobs:
           use-gha: true
           file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
 
-      - name: Upload test statistics
+      - name: Clean up disk space
         if: always()
-        env:
-          AWS_DEFAULT_REGION: us-east-1
-          GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
-          BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          TEST_CONFIG: ${{ matrix.config }}
-          SHARD_NUMBER: ${{ matrix.shard }}
-          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-          PYTORCH_RETRY_TEST_CASES: 1
-          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
-          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          TAG: ${{ steps.parse-ref.outputs.tag }}
-          WORKFLOW_ID: ${{ github.run_id }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
-          GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
-        run: |
-          set -x
-          ${CONDA_RUN} python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+        continue-on-error: true
+        uses: pytorch/test-infra/.github/actions/check-disk-space@main
diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml
index 2af091651e3f..cb0b85bdca88 100644
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@@ -26,6 +26,12 @@ on:
         description: |
           If this is set, our linter will use this to make sure that every other
           job with the same `sync-tag` is identical.
+      timeout-minutes:
+        required: false
+        type: number
+        default: 300
+        description: |
+          Set the maximum (in minutes) how long the workflow should take to finish
 
     secrets:
       AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID:
@@ -64,7 +70,7 @@ jobs:
     needs: filter
     # Don't run on forked repos or empty test matrix
     if: github.repository_owner == 'pytorch' && needs.filter.outputs.is-test-matrix-empty == 'False'
-    timeout-minutes: 300
+    timeout-minutes: ${{ inputs.timeout-minutes }}
     strategy:
       matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }}
       fail-fast: false
@@ -89,7 +95,7 @@ jobs:
         shell: bash
         continue-on-error: true
         run: |
-          python3 -m pip install psutil==5.9.1 pynvml==11.4.1
+          python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
           python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
           echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
 
@@ -102,6 +108,14 @@ jobs:
         id: parse-ref
         run: .github/scripts/parse_ref.py
 
+      - name: Set Test step time
+        id: test-timeout
+        shell: bash
+        env:
+          JOB_TIMEOUT: ${{ inputs.timeout-minutes }}
+        run: |
+          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
+
       - name: Test
         id: test
         env:
@@ -120,16 +134,16 @@ jobs:
           XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
           PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
           PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
-        timeout-minutes: 270
+        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
         run: |
           set -x
 
           if [[ $TEST_CONFIG == 'multigpu' ]]; then
-            TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh
+            TEST_COMMAND=.ci/pytorch/multigpu-test.sh
           elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
-            TEST_COMMAND=.jenkins/caffe2/test.sh
+            TEST_COMMAND=.ci/caffe2/test.sh
           else
-            TEST_COMMAND=.jenkins/pytorch/test.sh
+            TEST_COMMAND=.ci/pytorch/test.sh
           fi
 
           COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
@@ -224,31 +238,20 @@ jobs:
           use-gha: true
           file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
 
-      - name: Upload test statistics
+      - name: Collect backtraces from coredumps (if any)
         if: always()
-        env:
-          AWS_DEFAULT_REGION: us-east-1
-          GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
-          BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          TEST_CONFIG: ${{ matrix.config }}
-          SHARD_NUMBER: ${{ matrix.shard }}
-          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-          PYTORCH_RETRY_TEST_CASES: 1
-          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
-          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          TAG: ${{ steps.parse-ref.outputs.tag }}
-          WORKFLOW_ID: ${{ github.run_id }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
-          GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
-        shell: bash
         run: |
-          set -x
-          python3 -m pip install -r requirements.txt
-          python3 -m pip install boto3==1.19.12
-          python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
+          # shellcheck disable=SC2156
+          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
+
+      - name: Store Core dumps on GitHub
+        uses: actions/upload-artifact@v3
+        if: failure()
+        with:
+          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
+          retention-days: 14
+          if-no-files-found: ignore
+          path: ./**/core.[1-9]*
 
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml
index b04dc7f6626c..21d2c3a2e305 100644
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@@ -60,6 +60,12 @@ jobs:
               call C:\Jenkins\Miniconda3\Scripts\activate.bat C:\Jenkins\Miniconda3
               call "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64
 
+      # Duplicated in win-test because this MUST go before a checkout
+      - name: Enable git symlinks on Windows
+        shell: bash
+        run: |
+          git config --global core.symlinks true
+
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
@@ -112,11 +118,11 @@ jobs:
           PR_NUMBER: ${{ github.event.pull_request.number }}
           SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
           DEBUG: ${{ inputs.build-with-debug && '1' || '0' }}
-          TORCH_CUDA_ARCH_LIST: "7.0"
+          TORCH_CUDA_ARCH_LIST: "8.6"
           USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
           OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
         run: |
-          .jenkins/pytorch/win-build.sh
+          .ci/pytorch/win-build.sh
 
       # Upload to github so that people can click and download artifacts
       - name: Upload artifacts to s3
diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml
index 100bd8cd006e..b74b82f37c64 100644
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@@ -63,6 +63,45 @@ jobs:
         run: |
           git config --global core.symlinks true
 
+      - name: Clean up leftover processes on non-ephemeral Windows runner
+        shell: powershell
+        continue-on-error: true
+        run: |
+          # This needs to be run before checking out PyTorch to avoid locking the working directory.
+          # Below is the list of commands that could lock $GITHUB_WORKSPACE gathered from sysinternals
+          # handle tool
+          $processes = "python", "ninja", "cl", "nvcc", "cmd"
+          Foreach ($process In $processes) {
+            Try {
+              # https://learn.microsoft.com/en-us/powershell/module/microsoft.powershell.management/stop-process
+              Get-Process -Name $process -ErrorAction Stop | Stop-Process -Force
+            }
+            Catch {
+              Write-Output "No leftover $process process, continuing"
+              Write-Output $_
+            }
+          }
+
+          # Try it again https://stackoverflow.com/questions/40585754/powershell-wont-terminate-hung-process
+          # for hung processes
+          Foreach ($process In $processes) {
+            Try {
+              (Get-WmiObject -Class Win32_Process -Filter "Name LIKE '${process}%'").terminate()
+            }
+            Catch {
+              Write-Output $_
+            }
+          }
+
+          Try {
+            # Print all the processes for debugging
+            Wmic Path Win32_Process Get Caption,Processid,Commandline | Format-List
+          }
+          Catch {
+            # Better to write out whatever exception thrown to help debugging any potential issue
+            Write-Output $_
+          }
+
       - name: Setup SSH (Click me for login details)
         uses: pytorch/test-infra/.github/actions/setup-ssh@main
         with:
@@ -133,7 +172,7 @@ jobs:
           NUM_TEST_SHARDS: ${{ matrix.num_shards }}
           TEST_CONFIG: ${{ matrix.config }}
           PR_BODY: ${{ github.event.pull_request.body }}
-          TORCH_CUDA_ARCH_LIST: "7.0"
+          TORCH_CUDA_ARCH_LIST: "8.6"
           PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
           PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         run: |
@@ -151,7 +190,12 @@ jobs:
           export COMMIT_MESSAGES="${COMMIT_MESSAGES//[\'\"]}"
           export PR_BODY="${PR_BODY//[\'\"]}"
 
-          .jenkins/pytorch/win-test.sh
+          pushd "${PYTORCH_FINAL_PACKAGE_DIR}"
+          # shellcheck disable=SC2046
+          python3 -mpip install $(echo *.whl)[opt-einsum]
+          popd
+
+          .ci/pytorch/win-test.sh
 
       - name: Print remaining test logs
         shell: bash
@@ -185,29 +229,6 @@ jobs:
         id: parse-ref
         run: .github/scripts/parse_ref.py
 
-      - name: Upload test statistics
-        if: always()
-        env:
-          AWS_DEFAULT_REGION: us-east-1
-          GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
-          BRANCH: ${{ steps.parse-ref.outputs.branch }}
-          TEST_CONFIG: ${{ matrix.config }}
-          SHARD_NUMBER: ${{ matrix.shard }}
-          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-          PYTORCH_RETRY_TEST_CASES: 1
-          PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
-          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-          TAG: ${{ steps.parse-ref.outputs.tag }}
-          WORKFLOW_ID: ${{ github.run_id }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
-        shell: bash
-        run: |
-          set -x
-          # Windows conda doesn't have python3 binary, only python, but it's python3
-          ${CONDA_RUN} python -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
-
       - name: Teardown Windows
         uses: ./.github/actions/teardown-win
         if: always()
diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml
index 171495c0322d..29bb67a04f2f 100644
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@@ -9,11 +9,13 @@ on:
       - .github/workflows/build-triton-wheel.yml
       - .github/scripts/build_triton_wheel.py
       - .github/ci_commit_pins/triton.txt
+      - .ci/docker/ci_commit_pins/triton.txt
   pull_request:
     paths:
       - .github/workflows/build-triton-wheel.yml
       - .github/scripts/build_triton_wheel.py
       - .github/ci_commit_pins/triton.txt
+      - .ci/docker/ci_commit_pins/triton.txt
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@@ -21,14 +23,15 @@ concurrency:
 
 jobs:
   build-wheel:
+    name: "Build Triton Wheel"
     runs-on: [self-hosted, linux.2xlarge]
     strategy:
       fail-fast: false
       matrix:
-        py_vers: [ "3.7", "3.8", "3.9", "3.10", "3.11" ]
+        py_vers: [ "3.8", "3.9", "3.10", "3.11" ]
     timeout-minutes: 40
     env:
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
+      DOCKER_IMAGE: pytorch/manylinux-builder:cpu
       PY_VERS: ${{ matrix.py_vers }}
     steps:
       - name: Setup SSH (Click me for login details)
@@ -64,9 +67,6 @@ jobs:
 
           # Determine python executable for given version
           case $PY_VERS in
-          3.7)
-            PYTHON_EXECUTABLE=/opt/python/cp37-cp37m/bin/python
-            ;;
           3.8)
             PYTHON_EXECUTABLE=/opt/python/cp38-cp38/bin/python
             ;;
@@ -85,7 +85,8 @@ jobs:
             ;;
           esac
 
-          docker exec -t "${container_name}" yum install -y llvm11 llvm11-devel llvm11-static llvm11-libs zlib-devel
+          docker exec -t "${container_name}" yum install -y zlib-devel
+          docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}"  -m pip install -U setuptools==67.4.0
           docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" /pytorch/.github/scripts/build_triton_wheel.py
           docker exec -t "${container_name}" chown -R 1000.1000 /artifacts
 
@@ -107,11 +108,6 @@ jobs:
     env:
       GITHUB_TOKEN: ${{ secrets.github-token }}
     steps:
-      - name: Download Build Artifacts (3.7)
-        uses: actions/download-artifact@v3
-        with:
-          name: "pytorch-triton-wheel-3.7"
-          path: "${{ runner.temp }}/artifacts/"
       - name: Download Build Artifacts (3.8)
         uses: actions/download-artifact@v3
         with:
@@ -148,14 +144,15 @@ jobs:
               aws s3 cp --no-progress --acl public-read "${pkg}" "${s3_dir}"
              done
   build-conda:
+    name: "Build Triton Conda"
     runs-on: [self-hosted, linux.2xlarge]
     strategy:
       fail-fast: false
       matrix:
-        py_vers: [ "3.7", "3.8", "3.9", "3.10" ]
+        py_vers: [ "3.8", "3.9", "3.10", "3.11" ]
     timeout-minutes: 40
     env:
-      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
+      DOCKER_IMAGE: pytorch/conda-builder:cpu
       PY_VERS: ${{ matrix.py_vers }}
       ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     steps:
diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml
index f40f610fa2ad..59c2d871e31a 100644
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@@ -41,7 +41,7 @@ jobs:
             cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
             mv "/tmp/$PT_RELEASE_NAME" .
             # Cleanup
-            rm -rf "$PT_RELEASE_NAME"/{.circleci,.jenkins}
+            rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
             find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
             # Create archive
             tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index d7c5177898af..36f25345162d 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -4,7 +4,7 @@ on:
   workflow_dispatch:
   pull_request:
     paths:
-      - .circleci/docker/**
+      - .ci/docker/**
       - .github/workflows/docker-builds.yml
   push:
     branches:
@@ -13,7 +13,7 @@ on:
       - release/*
       - landchecks/*
     paths:
-      - .circleci/docker/**
+      - .ci/docker/**
       - .github/workflows/docker-builds.yml
   schedule:
     - cron: 1 3 * * 3
@@ -36,15 +36,19 @@ jobs:
         include:
           - docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
           - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-          - docker-image-name: pytorch-linux-bionic-py3.7-clang9
-          - docker-image-name: pytorch-linux-focal-rocm5.2-py3.8
-          - docker-image-name: pytorch-linux-focal-rocm5.3-py3.8
+          - docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
+          - docker-image-name: pytorch-linux-bionic-py3.8-clang9
+          - docker-image-name: pytorch-linux-bionic-py3.11-clang9
+          - docker-image-name: pytorch-linux-focal-rocm-n-1-py3
+          - docker-image-name: pytorch-linux-focal-rocm-n-py3
           - docker-image-name: pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12
           - docker-image-name: pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12
+          - docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12
           - docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
-          - docker-image-name: pytorch-linux-focal-py3.7-gcc7
+          - docker-image-name: pytorch-linux-focal-py3.8-gcc7
           - docker-image-name: pytorch-linux-focal-py3-clang7-asan
           - docker-image-name: pytorch-linux-focal-py3-clang10-onnx
+          - docker-image-name: pytorch-linux-focal-linter
     env:
       DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }}
     steps:
@@ -66,20 +70,38 @@ jobs:
       - name: Build docker image
         id: build-docker-image
         uses: ./.github/actions/calculate-docker-image
-        env:
-          GHCR_PAT: ${{ secrets.GHCR_PAT }}
         with:
           docker-image-name: ${{ matrix.docker-image-name }}
           always-rebuild: true
           skip_push: false
           force_push: true
-          push-ghcr-image: ${{ github.event_name == 'push' }}
 
       - name: Pull docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
           docker-image: ${{ steps.build-docker-image.outputs.docker-image }}
 
+      - uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
+        name: Push to https://https://ghcr.io/
+        id: push-to-ghcr-io
+        if: ${{ github.event_name == 'push' }}
+        env:
+          ECR_DOCKER_IMAGE: ${{ steps.build-docker-image.outputs.docker-image }}
+          GHCR_PAT: ${{ secrets.GHCR_PAT }}
+          IMAGE_NAME: ${{ matrix.docker-image-name }}
+        with:
+          shell: bash
+          timeout_minutes: 15
+          max_attempts: 5
+          retry_wait_seconds: 90
+          command: |
+            ghcr_image="ghcr.io/pytorch/ci-image"
+            tag=${ECR_DOCKER_IMAGE##*:}
+            # Push docker image to the ghcr.io
+            echo $GHCR_PAT | docker login ghcr.io -u pytorch --password-stdin
+            docker tag "${ECR_DOCKER_IMAGE}" "${ghcr_image}:${IMAGE_NAME}-${tag}"
+            docker push "${ghcr_image}:${IMAGE_NAME}-${tag}"
+
       - name: Chown workspace
         uses: ./.github/actions/chown-workspace
         if: always()
diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml
index e0d0ec825b8b..590df7f3fee3 100644
--- a/.github/workflows/docker-release.yml
+++ b/.github/workflows/docker-release.yml
@@ -72,6 +72,8 @@ jobs:
           QEMU_BINARY_PATH: ${{ runner.temp }}/bin
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
+        with:
+          version: v0.10.0
       - name: Setup job specific variables
         run: |
           set -eou pipefail
diff --git a/.github/workflows/generated-linux-binary-conda-nightly.yml b/.github/workflows/generated-linux-binary-conda-nightly.yml
index 6928e7fd3d53..1d0b6fa14b7b 100644
--- a/.github/workflows/generated-linux-binary-conda-nightly.yml
+++ b/.github/workflows/generated-linux-binary-conda-nightly.yml
@@ -36,7 +36,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  conda-py3_7-cpu-build:
+  conda-py3_8-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -48,15 +48,15 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cpu
+      DESIRED_PYTHON: "3.8"
+      build_name: conda-py3_8-cpu
       build_environment: linux-binary-conda
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  conda-py3_7-cpu-test:  # Testing
+  conda-py3_8-cpu-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cpu-build
+    needs: conda-py3_8-cpu-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -67,15 +67,15 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cpu
+      DESIRED_PYTHON: "3.8"
+      build_name: conda-py3_8-cpu
       build_environment: linux-binary-conda
       runs_on: linux.4xlarge
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_7-cpu-upload:  # Uploading
+  conda-py3_8-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cpu-test
+    needs: conda-py3_8-cpu-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -85,75 +85,15 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_7-cuda11_6-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cuda11_6
-      build_environment: linux-binary-conda
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  conda-py3_7-cuda11_6-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cuda11_6-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cuda11_6
-      build_environment: linux-binary-conda
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_7-cuda11_6-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cuda11_6-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cuda11_6
+      DESIRED_PYTHON: "3.8"
+      build_name: conda-py3_8-cpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_7-cuda11_7-build:
+  conda-py3_8-cuda11_7-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -166,15 +106,15 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.7
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cuda11_7
+      DESIRED_PYTHON: "3.8"
+      build_name: conda-py3_8-cuda11_7
       build_environment: linux-binary-conda
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  conda-py3_7-cuda11_7-test:  # Testing
+  conda-py3_8-cuda11_7-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cuda11_7-build
+    needs: conda-py3_8-cuda11_7-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -186,15 +126,15 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.7
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cuda11_7
+      DESIRED_PYTHON: "3.8"
+      build_name: conda-py3_8-cuda11_7
       build_environment: linux-binary-conda
       runs_on: linux.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_7-cuda11_7-upload:  # Uploading
+  conda-py3_8-cuda11_7-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cuda11_7-test
+    needs: conda-py3_8-cuda11_7-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -205,15 +145,15 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.7
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cuda11_7
+      DESIRED_PYTHON: "3.8"
+      build_name: conda-py3_8-cuda11_7
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_7-cuda11_8-build:
+  conda-py3_8-cuda11_8-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -226,15 +166,15 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cuda11_8
+      DESIRED_PYTHON: "3.8"
+      build_name: conda-py3_8-cuda11_8
       build_environment: linux-binary-conda
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  conda-py3_7-cuda11_8-test:  # Testing
+  conda-py3_8-cuda11_8-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cuda11_8-build
+    needs: conda-py3_8-cuda11_8-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -246,15 +186,15 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cuda11_8
+      DESIRED_PYTHON: "3.8"
+      build_name: conda-py3_8-cuda11_8
       build_environment: linux-binary-conda
       runs_on: linux.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_7-cuda11_8-upload:  # Uploading
+  conda-py3_8-cuda11_8-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cuda11_8-test
+    needs: conda-py3_8-cuda11_8-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -265,15 +205,15 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cuda11_8
+      DESIRED_PYTHON: "3.8"
+      build_name: conda-py3_8-cuda11_8
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_8-cpu-build:
+  conda-py3_9-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -285,15 +225,15 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cpu
+      DESIRED_PYTHON: "3.9"
+      build_name: conda-py3_9-cpu
       build_environment: linux-binary-conda
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  conda-py3_8-cpu-test:  # Testing
+  conda-py3_9-cpu-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cpu-build
+    needs: conda-py3_9-cpu-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -304,15 +244,15 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cpu
+      DESIRED_PYTHON: "3.9"
+      build_name: conda-py3_9-cpu
       build_environment: linux-binary-conda
       runs_on: linux.4xlarge
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_8-cpu-upload:  # Uploading
+  conda-py3_9-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cpu-test
+    needs: conda-py3_9-cpu-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -322,75 +262,15 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_8-cuda11_6-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cuda11_6
-      build_environment: linux-binary-conda
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  conda-py3_8-cuda11_6-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cuda11_6-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cuda11_6
-      build_environment: linux-binary-conda
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_8-cuda11_6-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cuda11_6-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cuda11_6
+      DESIRED_PYTHON: "3.9"
+      build_name: conda-py3_9-cpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_8-cuda11_7-build:
+  conda-py3_9-cuda11_7-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -403,15 +283,15 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.7
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cuda11_7
+      DESIRED_PYTHON: "3.9"
+      build_name: conda-py3_9-cuda11_7
       build_environment: linux-binary-conda
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  conda-py3_8-cuda11_7-test:  # Testing
+  conda-py3_9-cuda11_7-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cuda11_7-build
+    needs: conda-py3_9-cuda11_7-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -423,15 +303,15 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.7
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cuda11_7
+      DESIRED_PYTHON: "3.9"
+      build_name: conda-py3_9-cuda11_7
       build_environment: linux-binary-conda
       runs_on: linux.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_8-cuda11_7-upload:  # Uploading
+  conda-py3_9-cuda11_7-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cuda11_7-test
+    needs: conda-py3_9-cuda11_7-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -442,15 +322,15 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.7
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cuda11_7
+      DESIRED_PYTHON: "3.9"
+      build_name: conda-py3_9-cuda11_7
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_8-cuda11_8-build:
+  conda-py3_9-cuda11_8-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -463,15 +343,15 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cuda11_8
+      DESIRED_PYTHON: "3.9"
+      build_name: conda-py3_9-cuda11_8
       build_environment: linux-binary-conda
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  conda-py3_8-cuda11_8-test:  # Testing
+  conda-py3_9-cuda11_8-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cuda11_8-build
+    needs: conda-py3_9-cuda11_8-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -483,15 +363,15 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cuda11_8
+      DESIRED_PYTHON: "3.9"
+      build_name: conda-py3_9-cuda11_8
       build_environment: linux-binary-conda
       runs_on: linux.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_8-cuda11_8-upload:  # Uploading
+  conda-py3_9-cuda11_8-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cuda11_8-test
+    needs: conda-py3_9-cuda11_8-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -502,15 +382,15 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cuda11_8
+      DESIRED_PYTHON: "3.9"
+      build_name: conda-py3_9-cuda11_8
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_9-cpu-build:
+  conda-py3_10-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -522,15 +402,15 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cpu
+      DESIRED_PYTHON: "3.10"
+      build_name: conda-py3_10-cpu
       build_environment: linux-binary-conda
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  conda-py3_9-cpu-test:  # Testing
+  conda-py3_10-cpu-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cpu-build
+    needs: conda-py3_10-cpu-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -541,15 +421,15 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cpu
+      DESIRED_PYTHON: "3.10"
+      build_name: conda-py3_10-cpu
       build_environment: linux-binary-conda
       runs_on: linux.4xlarge
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_9-cpu-upload:  # Uploading
+  conda-py3_10-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cpu-test
+    needs: conda-py3_10-cpu-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -559,75 +439,15 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_9-cuda11_6-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cuda11_6
-      build_environment: linux-binary-conda
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  conda-py3_9-cuda11_6-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cuda11_6-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cuda11_6
-      build_environment: linux-binary-conda
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_9-cuda11_6-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cuda11_6-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cuda11_6
+      DESIRED_PYTHON: "3.10"
+      build_name: conda-py3_10-cpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_9-cuda11_7-build:
+  conda-py3_10-cuda11_7-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -640,15 +460,15 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.7
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cuda11_7
+      DESIRED_PYTHON: "3.10"
+      build_name: conda-py3_10-cuda11_7
       build_environment: linux-binary-conda
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  conda-py3_9-cuda11_7-test:  # Testing
+  conda-py3_10-cuda11_7-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cuda11_7-build
+    needs: conda-py3_10-cuda11_7-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -660,15 +480,15 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.7
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cuda11_7
+      DESIRED_PYTHON: "3.10"
+      build_name: conda-py3_10-cuda11_7
       build_environment: linux-binary-conda
       runs_on: linux.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_9-cuda11_7-upload:  # Uploading
+  conda-py3_10-cuda11_7-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cuda11_7-test
+    needs: conda-py3_10-cuda11_7-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -679,15 +499,15 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.7
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cuda11_7
+      DESIRED_PYTHON: "3.10"
+      build_name: conda-py3_10-cuda11_7
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_9-cuda11_8-build:
+  conda-py3_10-cuda11_8-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -700,15 +520,15 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cuda11_8
+      DESIRED_PYTHON: "3.10"
+      build_name: conda-py3_10-cuda11_8
       build_environment: linux-binary-conda
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  conda-py3_9-cuda11_8-test:  # Testing
+  conda-py3_10-cuda11_8-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cuda11_8-build
+    needs: conda-py3_10-cuda11_8-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -720,15 +540,15 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cuda11_8
+      DESIRED_PYTHON: "3.10"
+      build_name: conda-py3_10-cuda11_8
       build_environment: linux-binary-conda
       runs_on: linux.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_9-cuda11_8-upload:  # Uploading
+  conda-py3_10-cuda11_8-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cuda11_8-test
+    needs: conda-py3_10-cuda11_8-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -739,15 +559,15 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cuda11_8
+      DESIRED_PYTHON: "3.10"
+      build_name: conda-py3_10-cuda11_8
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_10-cpu-build:
+  conda-py3_11-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -759,15 +579,15 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cpu
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cpu
       build_environment: linux-binary-conda
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  conda-py3_10-cpu-test:  # Testing
+  conda-py3_11-cpu-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cpu-build
+    needs: conda-py3_11-cpu-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -778,15 +598,15 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cpu
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cpu
       build_environment: linux-binary-conda
       runs_on: linux.4xlarge
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_10-cpu-upload:  # Uploading
+  conda-py3_11-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cpu-test
+    needs: conda-py3_11-cpu-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -796,75 +616,15 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cpu
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_10-cuda11_6-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cuda11_6
-      build_environment: linux-binary-conda
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  conda-py3_10-cuda11_6-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cuda11_6-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cuda11_6
-      build_environment: linux-binary-conda
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_10-cuda11_6-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cuda11_6-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cuda11_6
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_10-cuda11_7-build:
+  conda-py3_11-cuda11_7-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -877,15 +637,15 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.7
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cuda11_7
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cuda11_7
       build_environment: linux-binary-conda
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  conda-py3_10-cuda11_7-test:  # Testing
+  conda-py3_11-cuda11_7-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cuda11_7-build
+    needs: conda-py3_11-cuda11_7-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -897,15 +657,15 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.7
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cuda11_7
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cuda11_7
       build_environment: linux-binary-conda
       runs_on: linux.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_10-cuda11_7-upload:  # Uploading
+  conda-py3_11-cuda11_7-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cuda11_7-test
+    needs: conda-py3_11-cuda11_7-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -916,15 +676,15 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.7
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cuda11_7
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cuda11_7
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_10-cuda11_8-build:
+  conda-py3_11-cuda11_8-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -937,15 +697,15 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cuda11_8
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cuda11_8
       build_environment: linux-binary-conda
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  conda-py3_10-cuda11_8-test:  # Testing
+  conda-py3_11-cuda11_8-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cuda11_8-build
+    needs: conda-py3_11-cuda11_8-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -957,15 +717,15 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cuda11_8
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cuda11_8
       build_environment: linux-binary-conda
       runs_on: linux.4xlarge.nvidia.gpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  conda-py3_10-cuda11_8-upload:  # Uploading
+  conda-py3_11-cuda11_8-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cuda11_8-test
+    needs: conda-py3_11-cuda11_8-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -976,8 +736,8 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DOCKER_IMAGE: pytorch/conda-builder:cuda11.8
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cuda11_8
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cuda11_8
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml
index d016f5d9b52a..81688881c92b 100644
--- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml
@@ -276,258 +276,6 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-shared-with-deps-cxx11-abi-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6
-      LIBTORCH_VARIANT: shared-with-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda11_6-shared-with-deps-cxx11-abi
-      build_environment: linux-binary-libtorch-cxx11-abi
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  libtorch-cuda11_6-shared-with-deps-cxx11-abi-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-with-deps-cxx11-abi-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6
-      LIBTORCH_VARIANT: shared-with-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda11_6-shared-with-deps-cxx11-abi
-      build_environment: linux-binary-libtorch-cxx11-abi
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-cuda11_6-shared-with-deps-cxx11-abi-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-with-deps-cxx11-abi-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6
-      LIBTORCH_VARIANT: shared-with-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda11_6-shared-with-deps-cxx11-abi
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-shared-without-deps-cxx11-abi-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6
-      LIBTORCH_VARIANT: shared-without-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda11_6-shared-without-deps-cxx11-abi
-      build_environment: linux-binary-libtorch-cxx11-abi
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  libtorch-cuda11_6-shared-without-deps-cxx11-abi-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-without-deps-cxx11-abi-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6
-      LIBTORCH_VARIANT: shared-without-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda11_6-shared-without-deps-cxx11-abi
-      build_environment: linux-binary-libtorch-cxx11-abi
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-cuda11_6-shared-without-deps-cxx11-abi-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-without-deps-cxx11-abi-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6
-      LIBTORCH_VARIANT: shared-without-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda11_6-shared-without-deps-cxx11-abi
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-static-with-deps-cxx11-abi-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6
-      LIBTORCH_VARIANT: static-with-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda11_6-static-with-deps-cxx11-abi
-      build_environment: linux-binary-libtorch-cxx11-abi
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  libtorch-cuda11_6-static-with-deps-cxx11-abi-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-with-deps-cxx11-abi-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6
-      LIBTORCH_VARIANT: static-with-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda11_6-static-with-deps-cxx11-abi
-      build_environment: linux-binary-libtorch-cxx11-abi
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-cuda11_6-static-with-deps-cxx11-abi-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-with-deps-cxx11-abi-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6
-      LIBTORCH_VARIANT: static-with-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda11_6-static-with-deps-cxx11-abi
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-static-without-deps-cxx11-abi-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6
-      LIBTORCH_VARIANT: static-without-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda11_6-static-without-deps-cxx11-abi
-      build_environment: linux-binary-libtorch-cxx11-abi
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  libtorch-cuda11_6-static-without-deps-cxx11-abi-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-without-deps-cxx11-abi-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6
-      LIBTORCH_VARIANT: static-without-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda11_6-static-without-deps-cxx11-abi
-      build_environment: linux-binary-libtorch-cxx11-abi
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-cuda11_6-static-without-deps-cxx11-abi-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-without-deps-cxx11-abi-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6
-      LIBTORCH_VARIANT: static-without-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda11_6-static-without-deps-cxx11-abi
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
   libtorch-cuda11_7-shared-with-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
@@ -1032,7 +780,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_2-shared-with-deps-cxx11-abi-build:
+  libtorch-rocm5_3-shared-with-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1041,20 +789,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_2-shared-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_3-shared-with-deps-cxx11-abi
       build_environment: linux-binary-libtorch-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_2-shared-with-deps-cxx11-abi-test:  # Testing
+  libtorch-rocm5_3-shared-with-deps-cxx11-abi-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-shared-with-deps-cxx11-abi-build
+    needs: libtorch-rocm5_3-shared-with-deps-cxx11-abi-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1063,11 +811,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
     steps:
@@ -1076,7 +824,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_2-shared-with-deps-cxx11-abi
+          name: libtorch-rocm5_3-shared-with-deps-cxx11-abi
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1109,34 +857,34 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/libtorch-cxx11-builder:rocm5.2
+          docker-image: pytorch/libtorch-cxx11-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  libtorch-rocm5_2-shared-with-deps-cxx11-abi-upload:  # Uploading
+  libtorch-rocm5_3-shared-with-deps-cxx11-abi-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-shared-with-deps-cxx11-abi-test
+    needs: libtorch-rocm5_3-shared-with-deps-cxx11-abi-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_2-shared-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_3-shared-with-deps-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_2-static-with-deps-cxx11-abi-build:
+  libtorch-rocm5_3-static-with-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1145,20 +893,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_2-static-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_3-static-with-deps-cxx11-abi
       build_environment: linux-binary-libtorch-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_2-static-with-deps-cxx11-abi-test:  # Testing
+  libtorch-rocm5_3-static-with-deps-cxx11-abi-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-static-with-deps-cxx11-abi-build
+    needs: libtorch-rocm5_3-static-with-deps-cxx11-abi-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1167,11 +915,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
     steps:
@@ -1180,7 +928,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_2-static-with-deps-cxx11-abi
+          name: libtorch-rocm5_3-static-with-deps-cxx11-abi
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1213,34 +961,34 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/libtorch-cxx11-builder:rocm5.2
+          docker-image: pytorch/libtorch-cxx11-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  libtorch-rocm5_2-static-with-deps-cxx11-abi-upload:  # Uploading
+  libtorch-rocm5_3-static-with-deps-cxx11-abi-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-static-with-deps-cxx11-abi-test
+    needs: libtorch-rocm5_3-static-with-deps-cxx11-abi-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_2-static-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_3-static-with-deps-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_3-shared-with-deps-cxx11-abi-build:
+  libtorch-rocm5_4_2-shared-with-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1249,20 +997,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.4.2
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_3-shared-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_4_2-shared-with-deps-cxx11-abi
       build_environment: linux-binary-libtorch-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_3-shared-with-deps-cxx11-abi-test:  # Testing
+  libtorch-rocm5_4_2-shared-with-deps-cxx11-abi-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_3-shared-with-deps-cxx11-abi-build
+    needs: libtorch-rocm5_4_2-shared-with-deps-cxx11-abi-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1271,11 +1019,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.4.2
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
     steps:
@@ -1284,7 +1032,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_3-shared-with-deps-cxx11-abi
+          name: libtorch-rocm5_4_2-shared-with-deps-cxx11-abi
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1317,34 +1065,34 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/libtorch-cxx11-builder:rocm5.3
+          docker-image: pytorch/libtorch-cxx11-builder:rocm5.4.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  libtorch-rocm5_3-shared-with-deps-cxx11-abi-upload:  # Uploading
+  libtorch-rocm5_4_2-shared-with-deps-cxx11-abi-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_3-shared-with-deps-cxx11-abi-test
+    needs: libtorch-rocm5_4_2-shared-with-deps-cxx11-abi-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.4.2
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_3-shared-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_4_2-shared-with-deps-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_3-static-with-deps-cxx11-abi-build:
+  libtorch-rocm5_4_2-static-with-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1353,20 +1101,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.4.2
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_3-static-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_4_2-static-with-deps-cxx11-abi
       build_environment: linux-binary-libtorch-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_3-static-with-deps-cxx11-abi-test:  # Testing
+  libtorch-rocm5_4_2-static-with-deps-cxx11-abi-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_3-static-with-deps-cxx11-abi-build
+    needs: libtorch-rocm5_4_2-static-with-deps-cxx11-abi-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1375,11 +1123,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.4.2
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
     steps:
@@ -1388,7 +1136,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_3-static-with-deps-cxx11-abi
+          name: libtorch-rocm5_4_2-static-with-deps-cxx11-abi
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1421,27 +1169,27 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/libtorch-cxx11-builder:rocm5.3
+          docker-image: pytorch/libtorch-cxx11-builder:rocm5.4.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  libtorch-rocm5_3-static-with-deps-cxx11-abi-upload:  # Uploading
+  libtorch-rocm5_4_2-static-with-deps-cxx11-abi-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_3-static-with-deps-cxx11-abi-test
+    needs: libtorch-rocm5_4_2-static-with-deps-cxx11-abi-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.4.2
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_3-static-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_4_2-static-with-deps-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml
index e4a1dbad98ef..ed2f1f08619b 100644
--- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml
@@ -276,258 +276,6 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-shared-with-deps-pre-cxx11-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      LIBTORCH_VARIANT: shared-with-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-cuda11_6-shared-with-deps-pre-cxx11
-      build_environment: linux-binary-libtorch-pre-cxx11
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  libtorch-cuda11_6-shared-with-deps-pre-cxx11-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-with-deps-pre-cxx11-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      LIBTORCH_VARIANT: shared-with-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-cuda11_6-shared-with-deps-pre-cxx11
-      build_environment: linux-binary-libtorch-pre-cxx11
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-cuda11_6-shared-with-deps-pre-cxx11-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-with-deps-pre-cxx11-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      LIBTORCH_VARIANT: shared-with-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-cuda11_6-shared-with-deps-pre-cxx11
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-shared-without-deps-pre-cxx11-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      LIBTORCH_VARIANT: shared-without-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-cuda11_6-shared-without-deps-pre-cxx11
-      build_environment: linux-binary-libtorch-pre-cxx11
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  libtorch-cuda11_6-shared-without-deps-pre-cxx11-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-without-deps-pre-cxx11-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      LIBTORCH_VARIANT: shared-without-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-cuda11_6-shared-without-deps-pre-cxx11
-      build_environment: linux-binary-libtorch-pre-cxx11
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-cuda11_6-shared-without-deps-pre-cxx11-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-without-deps-pre-cxx11-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      LIBTORCH_VARIANT: shared-without-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-cuda11_6-shared-without-deps-pre-cxx11
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-static-with-deps-pre-cxx11-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      LIBTORCH_VARIANT: static-with-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-cuda11_6-static-with-deps-pre-cxx11
-      build_environment: linux-binary-libtorch-pre-cxx11
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  libtorch-cuda11_6-static-with-deps-pre-cxx11-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-with-deps-pre-cxx11-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      LIBTORCH_VARIANT: static-with-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-cuda11_6-static-with-deps-pre-cxx11
-      build_environment: linux-binary-libtorch-pre-cxx11
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-cuda11_6-static-with-deps-pre-cxx11-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-with-deps-pre-cxx11-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      LIBTORCH_VARIANT: static-with-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-cuda11_6-static-with-deps-pre-cxx11
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-static-without-deps-pre-cxx11-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      LIBTORCH_VARIANT: static-without-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-cuda11_6-static-without-deps-pre-cxx11
-      build_environment: linux-binary-libtorch-pre-cxx11
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  libtorch-cuda11_6-static-without-deps-pre-cxx11-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-without-deps-pre-cxx11-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      LIBTORCH_VARIANT: static-without-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-cuda11_6-static-without-deps-pre-cxx11
-      build_environment: linux-binary-libtorch-pre-cxx11
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-cuda11_6-static-without-deps-pre-cxx11-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-without-deps-pre-cxx11-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      LIBTORCH_VARIANT: static-without-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-cuda11_6-static-without-deps-pre-cxx11
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
   libtorch-cuda11_7-shared-with-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
@@ -1032,7 +780,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_2-shared-with-deps-pre-cxx11-build:
+  libtorch-rocm5_3-shared-with-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1041,20 +789,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_2-shared-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_3-shared-with-deps-pre-cxx11
       build_environment: linux-binary-libtorch-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_2-shared-with-deps-pre-cxx11-test:  # Testing
+  libtorch-rocm5_3-shared-with-deps-pre-cxx11-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-shared-with-deps-pre-cxx11-build
+    needs: libtorch-rocm5_3-shared-with-deps-pre-cxx11-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1063,11 +811,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
     steps:
@@ -1076,7 +824,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_2-shared-with-deps-pre-cxx11
+          name: libtorch-rocm5_3-shared-with-deps-pre-cxx11
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1109,34 +857,34 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.2
+          docker-image: pytorch/manylinux-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  libtorch-rocm5_2-shared-with-deps-pre-cxx11-upload:  # Uploading
+  libtorch-rocm5_3-shared-with-deps-pre-cxx11-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-shared-with-deps-pre-cxx11-test
+    needs: libtorch-rocm5_3-shared-with-deps-pre-cxx11-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_2-shared-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_3-shared-with-deps-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_2-static-with-deps-pre-cxx11-build:
+  libtorch-rocm5_3-static-with-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1145,20 +893,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_2-static-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_3-static-with-deps-pre-cxx11
       build_environment: linux-binary-libtorch-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_2-static-with-deps-pre-cxx11-test:  # Testing
+  libtorch-rocm5_3-static-with-deps-pre-cxx11-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-static-with-deps-pre-cxx11-build
+    needs: libtorch-rocm5_3-static-with-deps-pre-cxx11-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1167,11 +915,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
     steps:
@@ -1180,7 +928,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_2-static-with-deps-pre-cxx11
+          name: libtorch-rocm5_3-static-with-deps-pre-cxx11
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1213,34 +961,34 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.2
+          docker-image: pytorch/manylinux-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  libtorch-rocm5_2-static-with-deps-pre-cxx11-upload:  # Uploading
+  libtorch-rocm5_3-static-with-deps-pre-cxx11-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-static-with-deps-pre-cxx11-test
+    needs: libtorch-rocm5_3-static-with-deps-pre-cxx11-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_2-static-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_3-static-with-deps-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_3-shared-with-deps-pre-cxx11-build:
+  libtorch-rocm5_4_2-shared-with-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1249,20 +997,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_3-shared-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_4_2-shared-with-deps-pre-cxx11
       build_environment: linux-binary-libtorch-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_3-shared-with-deps-pre-cxx11-test:  # Testing
+  libtorch-rocm5_4_2-shared-with-deps-pre-cxx11-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_3-shared-with-deps-pre-cxx11-build
+    needs: libtorch-rocm5_4_2-shared-with-deps-pre-cxx11-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1271,11 +1019,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
     steps:
@@ -1284,7 +1032,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_3-shared-with-deps-pre-cxx11
+          name: libtorch-rocm5_4_2-shared-with-deps-pre-cxx11
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1317,34 +1065,34 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.3
+          docker-image: pytorch/manylinux-builder:rocm5.4.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  libtorch-rocm5_3-shared-with-deps-pre-cxx11-upload:  # Uploading
+  libtorch-rocm5_4_2-shared-with-deps-pre-cxx11-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_3-shared-with-deps-pre-cxx11-test
+    needs: libtorch-rocm5_4_2-shared-with-deps-pre-cxx11-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_3-shared-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_4_2-shared-with-deps-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_3-static-with-deps-pre-cxx11-build:
+  libtorch-rocm5_4_2-static-with-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1353,20 +1101,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_3-static-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_4_2-static-with-deps-pre-cxx11
       build_environment: linux-binary-libtorch-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_3-static-with-deps-pre-cxx11-test:  # Testing
+  libtorch-rocm5_4_2-static-with-deps-pre-cxx11-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_3-static-with-deps-pre-cxx11-build
+    needs: libtorch-rocm5_4_2-static-with-deps-pre-cxx11-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1375,11 +1123,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
     steps:
@@ -1388,7 +1136,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_3-static-with-deps-pre-cxx11
+          name: libtorch-rocm5_4_2-static-with-deps-pre-cxx11
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1421,27 +1169,27 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.3
+          docker-image: pytorch/manylinux-builder:rocm5.4.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  libtorch-rocm5_3-static-with-deps-pre-cxx11-upload:  # Uploading
+  libtorch-rocm5_4_2-static-with-deps-pre-cxx11-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_3-static-with-deps-pre-cxx11-test
+    needs: libtorch-rocm5_4_2-static-with-deps-pre-cxx11-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_3-static-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_4_2-static-with-deps-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
diff --git a/.github/workflows/generated-linux-binary-manywheel-master.yml b/.github/workflows/generated-linux-binary-manywheel-master.yml
index e085fb5eb5fb..684cc8fe0fa5 100644
--- a/.github/workflows/generated-linux-binary-manywheel-master.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-master.yml
@@ -31,7 +31,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  manywheel-py3_7-cuda11_6-build:
+  manywheel-py3_8-cuda11_7-with-pypi-cudnn-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -40,19 +40,20 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_6
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.7
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn
       build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_7-cuda11_6-test:  # Testing
+  manywheel-py3_8-cuda11_7-with-pypi-cudnn-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-cuda11_6-build
+    needs: manywheel-py3_8-cuda11_7-with-pypi-cudnn-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -60,12 +61,51 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_6
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.7
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn
+      build_environment: linux-binary-manywheel
+      runs_on: linux.4xlarge.nvidia.gpu
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_8-cuda11_7-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.7
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cuda11_7
+      build_environment: linux-binary-manywheel
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+
+  manywheel-py3_8-cuda11_7-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: manywheel-py3_8-cuda11_7-build
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.7
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cuda11_7
       build_environment: linux-binary-manywheel
       runs_on: linux.4xlarge.nvidia.gpu
     secrets:
diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
index 4ab2014e1c56..42eb38910cfe 100644
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@@ -36,506 +36,6 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  manywheel-py3_7-cpu-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: pytorch/manylinux-builder:cpu
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cpu
-      build_environment: linux-binary-manywheel
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  manywheel-py3_7-cpu-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-cpu-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: pytorch/manylinux-builder:cpu
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cpu
-      build_environment: linux-binary-manywheel
-      runs_on: linux.4xlarge
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_7-cpu-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-cpu-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: pytorch/manylinux-builder:cpu
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_7-cuda11_6-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_6
-      build_environment: linux-binary-manywheel
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  manywheel-py3_7-cuda11_6-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-cuda11_6-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_6
-      build_environment: linux-binary-manywheel
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_7-cuda11_6-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-cuda11_6-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_6
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_7-cuda11_7-with-pypi-cudnn-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.7
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_7-with-pypi-cudnn
-      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux'
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  manywheel-py3_7-cuda11_7-with-pypi-cudnn-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-cuda11_7-with-pypi-cudnn-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.7
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_7-with-pypi-cudnn
-      build_environment: linux-binary-manywheel
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_7-cuda11_7-with-pypi-cudnn-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-cuda11_7-with-pypi-cudnn-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.7
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_7-with-pypi-cudnn
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_7-cuda11_7-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.7
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_7
-      build_environment: linux-binary-manywheel
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  manywheel-py3_7-cuda11_7-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-cuda11_7-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.7
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_7
-      build_environment: linux-binary-manywheel
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_7-cuda11_7-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-cuda11_7-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.7
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_7
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_7-cuda11_8-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_8
-      build_environment: linux-binary-manywheel
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  manywheel-py3_7-cuda11_8-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-cuda11_8-build
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_8
-      build_environment: linux-binary-manywheel
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_7-cuda11_8-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-cuda11_8-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-cuda11_8
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_7-rocm5_2-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-rocm5_2
-      build_environment: linux-binary-manywheel
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  manywheel-py3_7-rocm5_2-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-rocm5_2-build
-    runs-on: linux.rocm.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
-      GPU_ARCH_TYPE: rocm
-      SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Setup ROCm
-        uses: ./.github/actions/setup-rocm
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: manywheel-py3_7-rocm5_2
-          path: "${{ runner.temp }}/artifacts/"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: ROCm set GPU_FLAG
-        run: |
-          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
-      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: pytorch/manylinux-builder:rocm5.2
-      - name: Test Pytorch binary
-        uses: ./pytorch/.github/actions/test-pytorch-binary
-      - name: Teardown ROCm
-        uses: ./.github/actions/teardown-rocm
-  manywheel-py3_7-rocm5_2-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-rocm5_2-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-rocm5_2
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_7-rocm5_3-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-rocm5_3
-      build_environment: linux-binary-manywheel
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  manywheel-py3_7-rocm5_3-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-rocm5_3-build
-    runs-on: linux.rocm.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
-      GPU_ARCH_TYPE: rocm
-      SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Setup ROCm
-        uses: ./.github/actions/setup-rocm
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: manywheel-py3_7-rocm5_3
-          path: "${{ runner.temp }}/artifacts/"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: ROCm set GPU_FLAG
-        run: |
-          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
-      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: pytorch/manylinux-builder:rocm5.3
-      - name: Test Pytorch binary
-        uses: ./pytorch/.github/actions/test-pytorch-binary
-      - name: Teardown ROCm
-        uses: ./.github/actions/teardown-rocm
-  manywheel-py3_7-rocm5_3-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-rocm5_3-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
-      DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-rocm5_3
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
   manywheel-py3_8-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
@@ -593,7 +93,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_8-cuda11_6-build:
+  manywheel-py3_8-cpu-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -602,19 +102,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+      DESIRED_DEVTOOLSET: cxx11-abi
       DESIRED_PYTHON: "3.8"
-      build_name: manywheel-py3_8-cuda11_6
+      build_name: manywheel-py3_8-cpu-cxx11-abi
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_8-cuda11_6-test:  # Testing
+  manywheel-py3_8-cpu-cxx11-abi-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_8-cuda11_6-build
+    needs: manywheel-py3_8-cpu-cxx11-abi-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -622,31 +122,31 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+      DESIRED_DEVTOOLSET: cxx11-abi
       DESIRED_PYTHON: "3.8"
-      build_name: manywheel-py3_8-cuda11_6
+      build_name: manywheel-py3_8-cpu-cxx11-abi
       build_environment: linux-binary-manywheel
-      runs_on: linux.4xlarge.nvidia.gpu
+      runs_on: linux.4xlarge
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_8-cuda11_6-upload:  # Uploading
+  manywheel-py3_8-cpu-cxx11-abi-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_8-cuda11_6-test
+    needs: manywheel-py3_8-cpu-cxx11-abi-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+      DESIRED_DEVTOOLSET: cxx11-abi
       DESIRED_PYTHON: "3.8"
-      build_name: manywheel-py3_8-cuda11_6
+      build_name: manywheel-py3_8-cpu-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
@@ -669,7 +169,7 @@ jobs:
       DESIRED_PYTHON: "3.8"
       build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -834,7 +334,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_8-rocm5_2-build:
+  manywheel-py3_8-rocm5_3-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -843,19 +343,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.8"
-      build_name: manywheel-py3_8-rocm5_2
+      build_name: manywheel-py3_8-rocm5_3
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_8-rocm5_2-test:  # Testing
+  manywheel-py3_8-rocm5_3-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_8-rocm5_2-build
+    needs: manywheel-py3_8-rocm5_3-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -864,11 +364,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.8"
     steps:
       - name: Setup ROCm
@@ -876,7 +376,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_8-rocm5_2
+          name: manywheel-py3_8-rocm5_3
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -909,33 +409,33 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.2
+          docker-image: pytorch/manylinux-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  manywheel-py3_8-rocm5_2-upload:  # Uploading
+  manywheel-py3_8-rocm5_3-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_8-rocm5_2-test
+    needs: manywheel-py3_8-rocm5_3-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.8"
-      build_name: manywheel-py3_8-rocm5_2
+      build_name: manywheel-py3_8-rocm5_3
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_8-rocm5_3-build:
+  manywheel-py3_8-rocm5_4_2-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -944,19 +444,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       DESIRED_PYTHON: "3.8"
-      build_name: manywheel-py3_8-rocm5_3
+      build_name: manywheel-py3_8-rocm5_4_2
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_8-rocm5_3-test:  # Testing
+  manywheel-py3_8-rocm5_4_2-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_8-rocm5_3-build
+    needs: manywheel-py3_8-rocm5_4_2-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -965,11 +465,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       DESIRED_PYTHON: "3.8"
     steps:
       - name: Setup ROCm
@@ -977,7 +477,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_8-rocm5_3
+          name: manywheel-py3_8-rocm5_4_2
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1010,26 +510,26 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.3
+          docker-image: pytorch/manylinux-builder:rocm5.4.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  manywheel-py3_8-rocm5_3-upload:  # Uploading
+  manywheel-py3_8-rocm5_4_2-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_8-rocm5_3-test
+    needs: manywheel-py3_8-rocm5_4_2-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       DESIRED_PYTHON: "3.8"
-      build_name: manywheel-py3_8-rocm5_3
+      build_name: manywheel-py3_8-rocm5_4_2
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
@@ -1093,7 +593,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_9-cuda11_6-build:
+  manywheel-py3_9-cpu-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1102,19 +602,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+      DESIRED_DEVTOOLSET: cxx11-abi
       DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-cuda11_6
+      build_name: manywheel-py3_9-cpu-cxx11-abi
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_9-cuda11_6-test:  # Testing
+  manywheel-py3_9-cpu-cxx11-abi-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_9-cuda11_6-build
+    needs: manywheel-py3_9-cpu-cxx11-abi-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -1122,31 +622,31 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+      DESIRED_DEVTOOLSET: cxx11-abi
       DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-cuda11_6
+      build_name: manywheel-py3_9-cpu-cxx11-abi
       build_environment: linux-binary-manywheel
-      runs_on: linux.4xlarge.nvidia.gpu
+      runs_on: linux.4xlarge
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_9-cuda11_6-upload:  # Uploading
+  manywheel-py3_9-cpu-cxx11-abi-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_9-cuda11_6-test
+    needs: manywheel-py3_9-cpu-cxx11-abi-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+      DESIRED_DEVTOOLSET: cxx11-abi
       DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-cuda11_6
+      build_name: manywheel-py3_9-cpu-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
@@ -1169,7 +669,7 @@ jobs:
       DESIRED_PYTHON: "3.9"
       build_name: manywheel-py3_9-cuda11_7-with-pypi-cudnn
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -1334,7 +834,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_9-rocm5_2-build:
+  manywheel-py3_9-rocm5_3-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1343,19 +843,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-rocm5_2
+      build_name: manywheel-py3_9-rocm5_3
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_9-rocm5_2-test:  # Testing
+  manywheel-py3_9-rocm5_3-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_9-rocm5_2-build
+    needs: manywheel-py3_9-rocm5_3-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1364,11 +864,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.9"
     steps:
       - name: Setup ROCm
@@ -1376,7 +876,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_9-rocm5_2
+          name: manywheel-py3_9-rocm5_3
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1409,33 +909,33 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.2
+          docker-image: pytorch/manylinux-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  manywheel-py3_9-rocm5_2-upload:  # Uploading
+  manywheel-py3_9-rocm5_3-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_9-rocm5_2-test
+    needs: manywheel-py3_9-rocm5_3-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-rocm5_2
+      build_name: manywheel-py3_9-rocm5_3
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_9-rocm5_3-build:
+  manywheel-py3_9-rocm5_4_2-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1444,19 +944,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-rocm5_3
+      build_name: manywheel-py3_9-rocm5_4_2
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_9-rocm5_3-test:  # Testing
+  manywheel-py3_9-rocm5_4_2-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_9-rocm5_3-build
+    needs: manywheel-py3_9-rocm5_4_2-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1465,11 +965,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       DESIRED_PYTHON: "3.9"
     steps:
       - name: Setup ROCm
@@ -1477,7 +977,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_9-rocm5_3
+          name: manywheel-py3_9-rocm5_4_2
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1510,26 +1010,26 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.3
+          docker-image: pytorch/manylinux-builder:rocm5.4.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  manywheel-py3_9-rocm5_3-upload:  # Uploading
+  manywheel-py3_9-rocm5_4_2-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_9-rocm5_3-test
+    needs: manywheel-py3_9-rocm5_4_2-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-rocm5_3
+      build_name: manywheel-py3_9-rocm5_4_2
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
@@ -1593,7 +1093,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_10-cuda11_6-build:
+  manywheel-py3_10-cpu-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1602,19 +1102,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+      DESIRED_DEVTOOLSET: cxx11-abi
       DESIRED_PYTHON: "3.10"
-      build_name: manywheel-py3_10-cuda11_6
+      build_name: manywheel-py3_10-cpu-cxx11-abi
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_10-cuda11_6-test:  # Testing
+  manywheel-py3_10-cpu-cxx11-abi-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_10-cuda11_6-build
+    needs: manywheel-py3_10-cpu-cxx11-abi-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -1622,31 +1122,31 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+      DESIRED_DEVTOOLSET: cxx11-abi
       DESIRED_PYTHON: "3.10"
-      build_name: manywheel-py3_10-cuda11_6
+      build_name: manywheel-py3_10-cpu-cxx11-abi
       build_environment: linux-binary-manywheel
-      runs_on: linux.4xlarge.nvidia.gpu
+      runs_on: linux.4xlarge
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_10-cuda11_6-upload:  # Uploading
+  manywheel-py3_10-cpu-cxx11-abi-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_10-cuda11_6-test
+    needs: manywheel-py3_10-cpu-cxx11-abi-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+      DESIRED_DEVTOOLSET: cxx11-abi
       DESIRED_PYTHON: "3.10"
-      build_name: manywheel-py3_10-cuda11_6
+      build_name: manywheel-py3_10-cpu-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
@@ -1669,7 +1169,7 @@ jobs:
       DESIRED_PYTHON: "3.10"
       build_name: manywheel-py3_10-cuda11_7-with-pypi-cudnn
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -1834,7 +1334,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_10-rocm5_2-build:
+  manywheel-py3_10-rocm5_3-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1843,19 +1343,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.10"
-      build_name: manywheel-py3_10-rocm5_2
+      build_name: manywheel-py3_10-rocm5_3
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_10-rocm5_2-test:  # Testing
+  manywheel-py3_10-rocm5_3-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_10-rocm5_2-build
+    needs: manywheel-py3_10-rocm5_3-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1864,11 +1364,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.10"
     steps:
       - name: Setup ROCm
@@ -1876,7 +1376,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_10-rocm5_2
+          name: manywheel-py3_10-rocm5_3
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1909,33 +1409,33 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.2
+          docker-image: pytorch/manylinux-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  manywheel-py3_10-rocm5_2-upload:  # Uploading
+  manywheel-py3_10-rocm5_3-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_10-rocm5_2-test
+    needs: manywheel-py3_10-rocm5_3-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.10"
-      build_name: manywheel-py3_10-rocm5_2
+      build_name: manywheel-py3_10-rocm5_3
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_10-rocm5_3-build:
+  manywheel-py3_10-rocm5_4_2-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1944,19 +1444,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       DESIRED_PYTHON: "3.10"
-      build_name: manywheel-py3_10-rocm5_3
+      build_name: manywheel-py3_10-rocm5_4_2
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_10-rocm5_3-test:  # Testing
+  manywheel-py3_10-rocm5_4_2-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_10-rocm5_3-build
+    needs: manywheel-py3_10-rocm5_4_2-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1965,11 +1465,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       DESIRED_PYTHON: "3.10"
     steps:
       - name: Setup ROCm
@@ -1977,7 +1477,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_10-rocm5_3
+          name: manywheel-py3_10-rocm5_4_2
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -2010,26 +1510,26 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.3
+          docker-image: pytorch/manylinux-builder:rocm5.4.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Teardown ROCm
         uses: ./.github/actions/teardown-rocm
-  manywheel-py3_10-rocm5_3-upload:  # Uploading
+  manywheel-py3_10-rocm5_4_2-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_10-rocm5_3-test
+    needs: manywheel-py3_10-rocm5_4_2-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.3
-      GPU_ARCH_VERSION: 5.3
+      DESIRED_CUDA: rocm5.4.2
+      GPU_ARCH_VERSION: 5.4.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.4.2
       DESIRED_PYTHON: "3.10"
-      build_name: manywheel-py3_10-rocm5_3
+      build_name: manywheel-py3_10-rocm5_4_2
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
@@ -2093,7 +1593,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_11-cuda11_6-build:
+  manywheel-py3_11-cpu-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -2102,19 +1602,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+      DESIRED_DEVTOOLSET: cxx11-abi
       DESIRED_PYTHON: "3.11"
-      build_name: manywheel-py3_11-cuda11_6
+      build_name: manywheel-py3_11-cpu-cxx11-abi
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_11-cuda11_6-test:  # Testing
+  manywheel-py3_11-cpu-cxx11-abi-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_11-cuda11_6-build
+    needs: manywheel-py3_11-cpu-cxx11-abi-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -2122,31 +1622,31 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+      DESIRED_DEVTOOLSET: cxx11-abi
       DESIRED_PYTHON: "3.11"
-      build_name: manywheel-py3_11-cuda11_6
+      build_name: manywheel-py3_11-cpu-cxx11-abi
       build_environment: linux-binary-manywheel
-      runs_on: linux.4xlarge.nvidia.gpu
+      runs_on: linux.4xlarge
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_11-cuda11_6-upload:  # Uploading
+  manywheel-py3_11-cpu-cxx11-abi-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_11-cuda11_6-test
+    needs: manywheel-py3_11-cpu-cxx11-abi-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi
+      DESIRED_DEVTOOLSET: cxx11-abi
       DESIRED_PYTHON: "3.11"
-      build_name: manywheel-py3_11-cuda11_6
+      build_name: manywheel-py3_11-cpu-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
@@ -2169,7 +1669,7 @@ jobs:
       DESIRED_PYTHON: "3.11"
       build_name: manywheel-py3_11-cuda11_7-with-pypi-cudnn
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.7.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.7.101; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.2.10.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.0.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.4.91; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.14.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.7.91; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
diff --git a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
index 609e690a8989..4501bc027d83 100644
--- a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
@@ -370,3 +370,115 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
+  conda-py3_11-cpu-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: macos-12-xl
+    timeout-minutes: 240
+    env:
+      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
+      BUILDER_ROOT: ${{ github.workspace }}/builder
+      PACKAGE_TYPE: conda
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      SKIP_ALL_TESTS: 1
+      DESIRED_PYTHON: "3.11"
+      # For sccache access (only on non-forked PRs)
+      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
+    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          # shellcheck disable=SC2129
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          # shellcheck disable=SC2129
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          # shellcheck disable=SC2129
+          echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
+      - name: Install conda and dependencies
+        run: |
+          # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          chmod +x "${RUNNER_TEMP}/conda.sh"
+          /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
+          echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
+      - name: Checkout PyTorch
+        uses: malfet/checkout@silent-checkout
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          quiet-checkout: true
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: Checkout pytorch/builder
+        uses: malfet/checkout@silent-checkout
+        with:
+          ref: main
+          submodules: recursive
+          repository: pytorch/builder
+          path: builder
+          quiet-checkout: true
+      - name: Clean pytorch/builder checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: builder
+      - name: Install sccache (only for non-forked PRs, and pushes to trunk)
+        uses: nick-fields/retry@v2.8.2
+        if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo chmod +x /usr/local/bin/sccache
+            echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
+      - name: Populate binary env
+        run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+      - name: Build PyTorch binary
+        run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
+      - uses: actions/upload-artifact@v3
+        if: always()
+        with:
+          name: conda-py3_11-cpu
+          retention-days: 14
+          if-no-files-found: error
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+  conda-py3_11-cpu-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: conda-py3_11-cpu-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: conda
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DOCKER_IMAGE: pytorch/conda-builder:cpu
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cpu
+      use_s3: False
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
+      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
index eead92dd56df..5bc8184e4ef5 100644
--- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
@@ -370,3 +370,115 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
+  wheel-py3_11-cpu-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: macos-12-xl
+    timeout-minutes: 240
+    env:
+      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
+      BUILDER_ROOT: ${{ github.workspace }}/builder
+      PACKAGE_TYPE: wheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      SKIP_ALL_TESTS: 1
+      DESIRED_PYTHON: "3.11"
+      # For sccache access (only on non-forked PRs)
+      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
+    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          # shellcheck disable=SC2129
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          # shellcheck disable=SC2129
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          # shellcheck disable=SC2129
+          echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
+      - name: Install conda and dependencies
+        run: |
+          # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          chmod +x "${RUNNER_TEMP}/conda.sh"
+          /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
+          echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
+      - name: Checkout PyTorch
+        uses: malfet/checkout@silent-checkout
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          quiet-checkout: true
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: Checkout pytorch/builder
+        uses: malfet/checkout@silent-checkout
+        with:
+          ref: main
+          submodules: recursive
+          repository: pytorch/builder
+          path: builder
+          quiet-checkout: true
+      - name: Clean pytorch/builder checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: builder
+      - name: Install sccache (only for non-forked PRs, and pushes to trunk)
+        uses: nick-fields/retry@v2.8.2
+        if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo chmod +x /usr/local/bin/sccache
+            echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
+      - name: Populate binary env
+        run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+      - name: Build PyTorch binary
+        run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
+      - uses: actions/upload-artifact@v3
+        if: always()
+        with:
+          name: wheel-py3_11-cpu
+          retention-days: 14
+          if-no-files-found: error
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+  wheel-py3_11-cpu-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: wheel-py3_11-cpu-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: wheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DOCKER_IMAGE: pytorch/manylinux-builder:cpu
+      DESIRED_PYTHON: "3.11"
+      build_name: wheel-py3_11-cpu
+      use_s3: False
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
+      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
diff --git a/.github/workflows/generated-macos-binary-conda-nightly.yml b/.github/workflows/generated-macos-binary-conda-nightly.yml
index a1b64c7cb308..9bfc1f461bb0 100644
--- a/.github/workflows/generated-macos-binary-conda-nightly.yml
+++ b/.github/workflows/generated-macos-binary-conda-nightly.yml
@@ -32,7 +32,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  conda-py3_7-cpu-build:
+  conda-py3_8-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: macos-12-xl
     timeout-minutes: 240
@@ -45,7 +45,7 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -119,13 +119,13 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_7-cpu
+          name: conda-py3_8-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-  conda-py3_7-cpu-upload:  # Uploading
+  conda-py3_8-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cpu-build
+    needs: conda-py3_8-cpu-build
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -135,8 +135,8 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cpu
+      DESIRED_PYTHON: "3.8"
+      build_name: conda-py3_8-cpu
       use_s3: False
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -144,7 +144,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_8-cpu-build:
+  conda-py3_9-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: macos-12-xl
     timeout-minutes: 240
@@ -157,7 +157,7 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.8"
+      DESIRED_PYTHON: "3.9"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -231,13 +231,13 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_8-cpu
+          name: conda-py3_9-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-  conda-py3_8-cpu-upload:  # Uploading
+  conda-py3_9-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cpu-build
+    needs: conda-py3_9-cpu-build
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -247,8 +247,8 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cpu
+      DESIRED_PYTHON: "3.9"
+      build_name: conda-py3_9-cpu
       use_s3: False
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -256,7 +256,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_9-cpu-build:
+  conda-py3_10-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: macos-12-xl
     timeout-minutes: 240
@@ -269,7 +269,7 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -343,13 +343,13 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_9-cpu
+          name: conda-py3_10-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-  conda-py3_9-cpu-upload:  # Uploading
+  conda-py3_10-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cpu-build
+    needs: conda-py3_10-cpu-build
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -359,8 +359,8 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cpu
+      DESIRED_PYTHON: "3.10"
+      build_name: conda-py3_10-cpu
       use_s3: False
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -368,7 +368,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_10-cpu-build:
+  conda-py3_11-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: macos-12-xl
     timeout-minutes: 240
@@ -381,7 +381,7 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -455,13 +455,13 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_10-cpu
+          name: conda-py3_11-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-  conda-py3_10-cpu-upload:  # Uploading
+  conda-py3_11-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cpu-build
+    needs: conda-py3_11-cpu-build
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -471,8 +471,8 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/conda-builder:cpu
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cpu
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cpu
       use_s3: False
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml
index 5d65c959fe8a..a53a0aa3fb66 100644
--- a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml
+++ b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml
@@ -49,7 +49,7 @@ jobs:
       DESIRED_DEVTOOLSET: cxx11-abi
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -166,7 +166,7 @@ jobs:
       DESIRED_DEVTOOLSET: cxx11-abi
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -283,7 +283,7 @@ jobs:
       DESIRED_DEVTOOLSET: cxx11-abi
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -400,7 +400,7 @@ jobs:
       DESIRED_DEVTOOLSET: cxx11-abi
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
diff --git a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml
index a874bdf1fe69..26be90cd18a9 100644
--- a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml
+++ b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml
@@ -49,7 +49,7 @@ jobs:
       DESIRED_DEVTOOLSET: pre-cxx11
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -166,7 +166,7 @@ jobs:
       DESIRED_DEVTOOLSET: pre-cxx11
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -283,7 +283,7 @@ jobs:
       DESIRED_DEVTOOLSET: pre-cxx11
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -400,7 +400,7 @@ jobs:
       DESIRED_DEVTOOLSET: pre-cxx11
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
diff --git a/.github/workflows/generated-macos-binary-wheel-nightly.yml b/.github/workflows/generated-macos-binary-wheel-nightly.yml
index a84277169115..0448752786fc 100644
--- a/.github/workflows/generated-macos-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-binary-wheel-nightly.yml
@@ -32,7 +32,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  wheel-py3_7-cpu-build:
+  wheel-py3_8-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: macos-12-xl
     timeout-minutes: 240
@@ -45,7 +45,7 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -119,13 +119,13 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_7-cpu
+          name: wheel-py3_8-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-  wheel-py3_7-cpu-upload:  # Uploading
+  wheel-py3_8-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_7-cpu-build
+    needs: wheel-py3_8-cpu-build
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -135,8 +135,8 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/manylinux-builder:cpu
-      DESIRED_PYTHON: "3.7"
-      build_name: wheel-py3_7-cpu
+      DESIRED_PYTHON: "3.8"
+      build_name: wheel-py3_8-cpu
       use_s3: False
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -144,7 +144,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_8-cpu-build:
+  wheel-py3_9-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: macos-12-xl
     timeout-minutes: 240
@@ -157,7 +157,7 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.8"
+      DESIRED_PYTHON: "3.9"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -231,13 +231,13 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_8-cpu
+          name: wheel-py3_9-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-  wheel-py3_8-cpu-upload:  # Uploading
+  wheel-py3_9-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_8-cpu-build
+    needs: wheel-py3_9-cpu-build
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -247,8 +247,8 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/manylinux-builder:cpu
-      DESIRED_PYTHON: "3.8"
-      build_name: wheel-py3_8-cpu
+      DESIRED_PYTHON: "3.9"
+      build_name: wheel-py3_9-cpu
       use_s3: False
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -256,7 +256,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_9-cpu-build:
+  wheel-py3_10-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: macos-12-xl
     timeout-minutes: 240
@@ -269,7 +269,7 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -343,13 +343,13 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_9-cpu
+          name: wheel-py3_10-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-  wheel-py3_9-cpu-upload:  # Uploading
+  wheel-py3_10-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_9-cpu-build
+    needs: wheel-py3_10-cpu-build
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -359,8 +359,8 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/manylinux-builder:cpu
-      DESIRED_PYTHON: "3.9"
-      build_name: wheel-py3_9-cpu
+      DESIRED_PYTHON: "3.10"
+      build_name: wheel-py3_10-cpu
       use_s3: False
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -368,7 +368,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_10-cpu-build:
+  wheel-py3_11-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: macos-12-xl
     timeout-minutes: 240
@@ -381,7 +381,7 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -455,13 +455,13 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_10-cpu
+          name: wheel-py3_11-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-  wheel-py3_10-cpu-upload:  # Uploading
+  wheel-py3_11-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_10-cpu-build
+    needs: wheel-py3_11-cpu-build
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
@@ -471,8 +471,8 @@ jobs:
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
       DOCKER_IMAGE: pytorch/manylinux-builder:cpu
-      DESIRED_PYTHON: "3.10"
-      build_name: wheel-py3_10-cpu
+      DESIRED_PYTHON: "3.11"
+      build_name: wheel-py3_11-cpu
       use_s3: False
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/generated-windows-binary-conda-nightly.yml b/.github/workflows/generated-windows-binary-conda-nightly.yml
index bd706aaf9784..8a60d0536936 100644
--- a/.github/workflows/generated-windows-binary-conda-nightly.yml
+++ b/.github/workflows/generated-windows-binary-conda-nightly.yml
@@ -32,927 +32,6 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  conda-py3_7-cpu-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: conda-py3_7-cpu
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_7-cpu-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cpu-build
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: conda-py3_7-cpu
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_7-cpu-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cpu-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_7-cuda11_6-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: conda-py3_7-cuda11_6
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_7-cuda11_6-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cuda11_6-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: conda-py3_7-cuda11_6
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_7-cuda11_6-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cuda11_6-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cuda11_6
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_7-cuda11_7-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: conda-py3_7-cuda11_7
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_7-cuda11_7-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cuda11_7-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: conda-py3_7-cuda11_7
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_7-cuda11_7-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cuda11_7-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cuda11_7
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_7-cuda11_8-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: conda-py3_7-cuda11_8
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_7-cuda11_8-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cuda11_8-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: conda-py3_7-cuda11_8
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_7-cuda11_8-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_7-cuda11_8-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: conda
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.7"
-      build_name: conda-py3_7-cuda11_8
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
   conda-py3_8-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
@@ -1181,7 +260,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_8-cuda11_6-build:
+  conda-py3_8-cuda11_7-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -1191,8 +270,8 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.8"
@@ -1270,7 +349,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_8-cuda11_6
+          name: conda-py3_8-cuda11_7
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -1287,9 +366,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_8-cuda11_6-test:  # Testing
+  conda-py3_8-cuda11_7-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cuda11_6-build
+    needs: conda-py3_8-cuda11_7-build
     runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
@@ -1298,8 +377,8 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.8"
@@ -1344,7 +423,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: conda-py3_8-cuda11_6
+          name: conda-py3_8-cuda11_7
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1392,27 +471,27 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_8-cuda11_6-upload:  # Uploading
+  conda-py3_8-cuda11_7-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cuda11_6-test
+    needs: conda-py3_8-cuda11_7-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cuda11_6
+      build_name: conda-py3_8-cuda11_7
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_8-cuda11_7-build:
+  conda-py3_8-cuda11_8-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -1422,8 +501,8 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.8"
@@ -1501,7 +580,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_8-cuda11_7
+          name: conda-py3_8-cuda11_8
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -1518,9 +597,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_8-cuda11_7-test:  # Testing
+  conda-py3_8-cuda11_8-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cuda11_7-build
+    needs: conda-py3_8-cuda11_8-build
     runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
@@ -1529,8 +608,8 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.8"
@@ -1575,7 +654,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: conda-py3_8-cuda11_7
+          name: conda-py3_8-cuda11_8
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1623,27 +702,27 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_8-cuda11_7-upload:  # Uploading
+  conda-py3_8-cuda11_8-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cuda11_7-test
+    needs: conda-py3_8-cuda11_8-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cuda11_7
+      build_name: conda-py3_8-cuda11_8
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_8-cuda11_8-build:
+  conda-py3_9-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -1653,11 +732,10 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.8"
+      DESIRED_PYTHON: "3.9"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -1732,7 +810,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_8-cuda11_8
+          name: conda-py3_9-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -1749,10 +827,10 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_8-cuda11_8-test:  # Testing
+  conda-py3_9-cpu-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cuda11_8-build
-    runs-on: windows.8xlarge.nvidia.gpu
+    needs: conda-py3_9-cpu-build
+    runs-on: windows.4xlarge
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -1760,11 +838,10 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.8"
+      DESIRED_PYTHON: "3.9"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -1806,7 +883,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: conda-py3_8-cuda11_8
+          name: conda-py3_9-cpu
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1854,27 +931,26 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_8-cuda11_8-upload:  # Uploading
+  conda-py3_9-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_8-cuda11_8-test
+    needs: conda-py3_9-cpu-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.8"
-      build_name: conda-py3_8-cuda11_8
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DESIRED_PYTHON: "3.9"
+      build_name: conda-py3_9-cpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_9-cpu-build:
+  conda-py3_9-cuda11_7-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -1884,8 +960,9 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
+      GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
     steps:
@@ -1962,7 +1039,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_9-cpu
+          name: conda-py3_9-cuda11_7
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -1979,10 +1056,10 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_9-cpu-test:  # Testing
+  conda-py3_9-cuda11_7-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cpu-build
-    runs-on: windows.4xlarge
+    needs: conda-py3_9-cuda11_7-build
+    runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -1990,8 +1067,9 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
+      GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
     steps:
@@ -2035,7 +1113,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: conda-py3_9-cpu
+          name: conda-py3_9-cuda11_7
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -2083,26 +1161,27 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_9-cpu-upload:  # Uploading
+  conda-py3_9-cuda11_7-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cpu-test
+    needs: conda-py3_9-cuda11_7-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
+      GPU_ARCH_TYPE: cuda
       DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cpu
+      build_name: conda-py3_9-cuda11_7
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_9-cuda11_6-build:
+  conda-py3_9-cuda11_8-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -2112,8 +1191,8 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
@@ -2191,7 +1270,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_9-cuda11_6
+          name: conda-py3_9-cuda11_8
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -2208,9 +1287,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_9-cuda11_6-test:  # Testing
+  conda-py3_9-cuda11_8-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cuda11_6-build
+    needs: conda-py3_9-cuda11_8-build
     runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
@@ -2219,8 +1298,8 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
@@ -2265,7 +1344,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: conda-py3_9-cuda11_6
+          name: conda-py3_9-cuda11_8
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -2313,27 +1392,27 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_9-cuda11_6-upload:  # Uploading
+  conda-py3_9-cuda11_8-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cuda11_6-test
+    needs: conda-py3_9-cuda11_8-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cuda11_6
+      build_name: conda-py3_9-cuda11_8
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_9-cuda11_7-build:
+  conda-py3_10-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -2343,11 +1422,10 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2422,7 +1500,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_9-cuda11_7
+          name: conda-py3_10-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -2439,10 +1517,10 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_9-cuda11_7-test:  # Testing
+  conda-py3_10-cpu-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cuda11_7-build
-    runs-on: windows.8xlarge.nvidia.gpu
+    needs: conda-py3_10-cpu-build
+    runs-on: windows.4xlarge
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -2450,11 +1528,10 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2496,7 +1573,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: conda-py3_9-cuda11_7
+          name: conda-py3_10-cpu
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -2544,27 +1621,26 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_9-cuda11_7-upload:  # Uploading
+  conda-py3_10-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cuda11_7-test
+    needs: conda-py3_10-cpu-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cuda11_7
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DESIRED_PYTHON: "3.10"
+      build_name: conda-py3_10-cpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_9-cuda11_8-build:
+  conda-py3_10-cuda11_7-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -2574,11 +1650,11 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2653,7 +1729,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_9-cuda11_8
+          name: conda-py3_10-cuda11_7
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -2670,9 +1746,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_9-cuda11_8-test:  # Testing
+  conda-py3_10-cuda11_7-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cuda11_8-build
+    needs: conda-py3_10-cuda11_7-build
     runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
@@ -2681,11 +1757,11 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2727,7 +1803,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: conda-py3_9-cuda11_8
+          name: conda-py3_10-cuda11_7
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -2775,27 +1851,27 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_9-cuda11_8-upload:  # Uploading
+  conda-py3_10-cuda11_7-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_9-cuda11_8-test
+    needs: conda-py3_10-cuda11_7-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.9"
-      build_name: conda-py3_9-cuda11_8
+      DESIRED_PYTHON: "3.10"
+      build_name: conda-py3_10-cuda11_7
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_10-cpu-build:
+  conda-py3_10-cuda11_8-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -2805,8 +1881,9 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
+      GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.10"
     steps:
@@ -2883,7 +1960,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_10-cpu
+          name: conda-py3_10-cuda11_8
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -2900,10 +1977,10 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_10-cpu-test:  # Testing
+  conda-py3_10-cuda11_8-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cpu-build
-    runs-on: windows.4xlarge
+    needs: conda-py3_10-cuda11_8-build
+    runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -2911,8 +1988,9 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
+      GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.10"
     steps:
@@ -2956,7 +2034,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: conda-py3_10-cpu
+          name: conda-py3_10-cuda11_8
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -3004,26 +2082,27 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_10-cpu-upload:  # Uploading
+  conda-py3_10-cuda11_8-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cpu-test
+    needs: conda-py3_10-cuda11_8-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
+      GPU_ARCH_TYPE: cuda
       DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cpu
+      build_name: conda-py3_10-cuda11_8
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_10-cuda11_6-build:
+  conda-py3_11-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -3033,11 +2112,10 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3112,7 +2190,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_10-cuda11_6
+          name: conda-py3_11-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -3129,10 +2207,10 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_10-cuda11_6-test:  # Testing
+  conda-py3_11-cpu-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cuda11_6-build
-    runs-on: windows.8xlarge.nvidia.gpu
+    needs: conda-py3_11-cpu-build
+    runs-on: windows.4xlarge
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -3140,11 +2218,10 @@ jobs:
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3186,7 +2263,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: conda-py3_10-cuda11_6
+          name: conda-py3_11-cpu
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -3234,27 +2311,26 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_10-cuda11_6-upload:  # Uploading
+  conda-py3_11-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cuda11_6-test
+    needs: conda-py3_11-cpu-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: conda
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cuda11_6
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_10-cuda11_7-build:
+  conda-py3_11-cuda11_7-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -3268,7 +2344,7 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3343,7 +2419,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_10-cuda11_7
+          name: conda-py3_11-cuda11_7
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -3360,9 +2436,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_10-cuda11_7-test:  # Testing
+  conda-py3_11-cuda11_7-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cuda11_7-build
+    needs: conda-py3_11-cuda11_7-build
     runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
@@ -3375,7 +2451,7 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3417,7 +2493,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: conda-py3_10-cuda11_7
+          name: conda-py3_11-cuda11_7
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -3465,9 +2541,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_10-cuda11_7-upload:  # Uploading
+  conda-py3_11-cuda11_7-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cuda11_7-test
+    needs: conda-py3_11-cuda11_7-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
@@ -3477,15 +2553,15 @@ jobs:
       DESIRED_CUDA: cu117
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cuda11_7
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cuda11_7
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  conda-py3_10-cuda11_8-build:
+  conda-py3_11-cuda11_8-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -3499,7 +2575,7 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3574,7 +2650,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: conda-py3_10-cuda11_8
+          name: conda-py3_11-cuda11_8
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -3591,9 +2667,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_10-cuda11_8-test:  # Testing
+  conda-py3_11-cuda11_8-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cuda11_8-build
+    needs: conda-py3_11-cuda11_8-build
     runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
@@ -3606,7 +2682,7 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3648,7 +2724,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: conda-py3_10-cuda11_8
+          name: conda-py3_11-cuda11_8
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -3696,9 +2772,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  conda-py3_10-cuda11_8-upload:  # Uploading
+  conda-py3_11-cuda11_8-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: conda-py3_10-cuda11_8-test
+    needs: conda-py3_11-cuda11_8-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
@@ -3708,8 +2784,8 @@ jobs:
       DESIRED_CUDA: cu118
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.10"
-      build_name: conda-py3_10-cuda11_8
+      DESIRED_PYTHON: "3.11"
+      build_name: conda-py3_11-cuda11_8
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-master.yml b/.github/workflows/generated-windows-binary-libtorch-debug-master.yml
index 58816fd3d1ea..754705bdcbc0 100644
--- a/.github/workflows/generated-windows-binary-libtorch-debug-master.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-debug-master.yml
@@ -44,7 +44,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -154,7 +154,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
index 603010f83ffd..f83ca97fbce9 100644
--- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
@@ -49,7 +49,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -159,7 +159,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -264,7 +264,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cpu-shared-with-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -289,7 +289,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -399,7 +399,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -504,7 +504,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cpu-shared-without-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -529,7 +529,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -639,7 +639,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -744,7 +744,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cpu-static-with-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -769,7 +769,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -879,7 +879,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -984,7 +984,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cpu-static-without-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -992,978 +992,6 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-shared-with-deps-debug-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: debug
-      LIBTORCH_VARIANT: shared-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: libtorch-cuda11_6-shared-with-deps-debug
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-shared-with-deps-debug-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-with-deps-debug-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: debug
-      LIBTORCH_VARIANT: shared-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: libtorch-cuda11_6-shared-with-deps-debug
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-shared-with-deps-debug-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-with-deps-debug-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      LIBTORCH_CONFIG: debug
-      LIBTORCH_VARIANT: shared-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-      build_name: libtorch-cuda11_6-shared-with-deps-debug
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-shared-without-deps-debug-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: debug
-      LIBTORCH_VARIANT: shared-without-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: libtorch-cuda11_6-shared-without-deps-debug
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-shared-without-deps-debug-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-without-deps-debug-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: debug
-      LIBTORCH_VARIANT: shared-without-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: libtorch-cuda11_6-shared-without-deps-debug
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-shared-without-deps-debug-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-without-deps-debug-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      LIBTORCH_CONFIG: debug
-      LIBTORCH_VARIANT: shared-without-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-      build_name: libtorch-cuda11_6-shared-without-deps-debug
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-static-with-deps-debug-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: debug
-      LIBTORCH_VARIANT: static-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: libtorch-cuda11_6-static-with-deps-debug
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-static-with-deps-debug-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-with-deps-debug-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: debug
-      LIBTORCH_VARIANT: static-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: libtorch-cuda11_6-static-with-deps-debug
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-static-with-deps-debug-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-with-deps-debug-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      LIBTORCH_CONFIG: debug
-      LIBTORCH_VARIANT: static-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-      build_name: libtorch-cuda11_6-static-with-deps-debug
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-static-without-deps-debug-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: debug
-      LIBTORCH_VARIANT: static-without-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: libtorch-cuda11_6-static-without-deps-debug
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-static-without-deps-debug-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-without-deps-debug-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: debug
-      LIBTORCH_VARIANT: static-without-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: libtorch-cuda11_6-static-without-deps-debug
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-static-without-deps-debug-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-without-deps-debug-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      LIBTORCH_CONFIG: debug
-      LIBTORCH_VARIANT: static-without-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-      build_name: libtorch-cuda11_6-static-without-deps-debug
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
   libtorch-cuda11_7-shared-with-deps-debug-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
@@ -1982,7 +1010,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2093,7 +1121,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2199,7 +1227,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_7-shared-with-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2225,7 +1253,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2336,7 +1364,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2442,7 +1470,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_7-shared-without-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2468,7 +1496,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2579,7 +1607,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2685,7 +1713,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_7-static-with-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2711,7 +1739,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2822,7 +1850,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2928,7 +1956,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_7-static-without-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2954,7 +1982,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3065,7 +2093,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3171,7 +2199,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_8-shared-with-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -3197,7 +2225,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3308,7 +2336,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3414,7 +2442,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_8-shared-without-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -3440,7 +2468,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3551,7 +2579,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3657,7 +2685,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_8-static-with-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -3683,7 +2711,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3794,7 +2822,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3900,7 +2928,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_8-static-without-deps-debug
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/generated-windows-binary-libtorch-release-master.yml b/.github/workflows/generated-windows-binary-libtorch-release-master.yml
index 471600e77690..b004c66542dc 100644
--- a/.github/workflows/generated-windows-binary-libtorch-release-master.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-release-master.yml
@@ -44,7 +44,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -154,7 +154,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
index de2615cd866b..f29a5b60ae12 100644
--- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
@@ -49,7 +49,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -159,7 +159,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -264,7 +264,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cpu-shared-with-deps-release
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -289,7 +289,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -399,7 +399,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -504,7 +504,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cpu-shared-without-deps-release
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -529,7 +529,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -639,7 +639,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -744,7 +744,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cpu-static-with-deps-release
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -769,7 +769,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -879,7 +879,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -984,7 +984,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cpu-static-without-deps-release
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -992,978 +992,6 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-shared-with-deps-release-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: libtorch-cuda11_6-shared-with-deps-release
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-shared-with-deps-release-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-with-deps-release-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: libtorch-cuda11_6-shared-with-deps-release
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-shared-with-deps-release-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-with-deps-release-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-      build_name: libtorch-cuda11_6-shared-with-deps-release
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-shared-without-deps-release-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-without-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: libtorch-cuda11_6-shared-without-deps-release
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-shared-without-deps-release-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-without-deps-release-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-without-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: libtorch-cuda11_6-shared-without-deps-release
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-shared-without-deps-release-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-shared-without-deps-release-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-without-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-      build_name: libtorch-cuda11_6-shared-without-deps-release
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-static-with-deps-release-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: static-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: libtorch-cuda11_6-static-with-deps-release
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-static-with-deps-release-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-with-deps-release-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: static-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: libtorch-cuda11_6-static-with-deps-release
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-static-with-deps-release-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-with-deps-release-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: static-with-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-      build_name: libtorch-cuda11_6-static-with-deps-release
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  libtorch-cuda11_6-static-without-deps-release-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: static-without-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: libtorch-cuda11_6-static-without-deps-release
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-static-without-deps-release-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-without-deps-release-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: static-without-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: libtorch-cuda11_6-static-without-deps-release
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  libtorch-cuda11_6-static-without-deps-release-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cuda11_6-static-without-deps-release-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: static-without-deps
-      # This is a dummy value for libtorch to work correctly with our batch scripts
-      # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
-      build_name: libtorch-cuda11_6-static-without-deps-release
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
   libtorch-cuda11_7-shared-with-deps-release-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
@@ -1982,7 +1010,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2093,7 +1121,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2199,7 +1227,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_7-shared-with-deps-release
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2225,7 +1253,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2336,7 +1364,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2442,7 +1470,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_7-shared-without-deps-release
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2468,7 +1496,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2579,7 +1607,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2685,7 +1713,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_7-static-with-deps-release
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2711,7 +1739,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2822,7 +1850,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2928,7 +1956,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_7-static-without-deps-release
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -2954,7 +1982,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3065,7 +2093,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3171,7 +2199,7 @@ jobs:
       LIBTORCH_VARIANT: shared-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_8-shared-with-deps-release
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -3197,7 +2225,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3308,7 +2336,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3414,7 +2442,7 @@ jobs:
       LIBTORCH_VARIANT: shared-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_8-shared-without-deps-release
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -3440,7 +2468,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3551,7 +2579,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3657,7 +2685,7 @@ jobs:
       LIBTORCH_VARIANT: static-with-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_8-static-with-deps-release
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -3683,7 +2711,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3794,7 +2822,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3900,7 +2928,7 @@ jobs:
       LIBTORCH_VARIANT: static-without-deps
       # This is a dummy value for libtorch to work correctly with our batch scripts
       # without this value pip does not get installed for some reason
-      DESIRED_PYTHON: "3.7"
+      DESIRED_PYTHON: "3.8"
       build_name: libtorch-cuda11_8-static-without-deps-release
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml
index 5b6a453a7dbe..d0f3290c6698 100644
--- a/.github/workflows/generated-windows-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml
@@ -32,927 +32,6 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  wheel-py3_7-cpu-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: wheel-py3_7-cpu
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_7-cpu-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_7-cpu-build
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: wheel-py3_7-cpu
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_7-cpu-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_7-cpu-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      DESIRED_PYTHON: "3.7"
-      build_name: wheel-py3_7-cpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_7-cuda11_6-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: wheel-py3_7-cuda11_6
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_7-cuda11_6-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_7-cuda11_6-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: wheel-py3_7-cuda11_6
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_7-cuda11_6-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_7-cuda11_6-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.7"
-      build_name: wheel-py3_7-cuda11_6
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_7-cuda11_7-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: wheel-py3_7-cuda11_7
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_7-cuda11_7-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_7-cuda11_7-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: wheel-py3_7-cuda11_7
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_7-cuda11_7-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_7-cuda11_7-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.7"
-      build_name: wheel-py3_7-cuda11_7
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_7-cuda11_8-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: windows.4xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: wheel-py3_7-cuda11_8
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_7-cuda11_8-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_7-cuda11_8-build
-    runs-on: windows.8xlarge.nvidia.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-          echo "system info $(uname -a)"
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: seemethere/add-github-ssh-key@v1
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
-      - name: Enable long paths on Windows
-        shell: powershell
-        run: |
-          Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
-      # Since it's just a defensive command, the workflow should continue even the command fails
-      - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory.
-        shell: powershell
-        run: |
-          Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: wheel-py3_7-cuda11_8
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-      - name: Wait until all sessions have drained
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        timeout-minutes: 120
-        run: |
-          .github\scripts\wait_for_ssh_to_drain.ps1
-      - name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
-        shell: powershell
-        working-directory: pytorch
-        if: always()
-        run: |
-          .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_7-cuda11_8-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_7-cuda11_8-test
-    with:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.7"
-      build_name: wheel-py3_7-cuda11_8
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
   wheel-py3_8-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
@@ -1181,7 +260,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_8-cuda11_6-build:
+  wheel-py3_8-cuda11_7-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -1191,8 +270,8 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.8"
@@ -1270,7 +349,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_8-cuda11_6
+          name: wheel-py3_8-cuda11_7
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -1287,9 +366,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_8-cuda11_6-test:  # Testing
+  wheel-py3_8-cuda11_7-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_8-cuda11_6-build
+    needs: wheel-py3_8-cuda11_7-build
     runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
@@ -1298,8 +377,8 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.8"
@@ -1344,7 +423,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: wheel-py3_8-cuda11_6
+          name: wheel-py3_8-cuda11_7
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1392,27 +471,27 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_8-cuda11_6-upload:  # Uploading
+  wheel-py3_8-cuda11_7-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_8-cuda11_6-test
+    needs: wheel-py3_8-cuda11_7-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       DESIRED_PYTHON: "3.8"
-      build_name: wheel-py3_8-cuda11_6
+      build_name: wheel-py3_8-cuda11_7
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_8-cuda11_7-build:
+  wheel-py3_8-cuda11_8-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -1422,8 +501,8 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.8"
@@ -1501,7 +580,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_8-cuda11_7
+          name: wheel-py3_8-cuda11_8
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -1518,9 +597,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_8-cuda11_7-test:  # Testing
+  wheel-py3_8-cuda11_8-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_8-cuda11_7-build
+    needs: wheel-py3_8-cuda11_8-build
     runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
@@ -1529,8 +608,8 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.8"
@@ -1575,7 +654,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: wheel-py3_8-cuda11_7
+          name: wheel-py3_8-cuda11_8
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1623,27 +702,27 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_8-cuda11_7-upload:  # Uploading
+  wheel-py3_8-cuda11_8-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_8-cuda11_7-test
+    needs: wheel-py3_8-cuda11_8-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DESIRED_PYTHON: "3.8"
-      build_name: wheel-py3_8-cuda11_7
+      build_name: wheel-py3_8-cuda11_8
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_8-cuda11_8-build:
+  wheel-py3_9-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -1653,11 +732,10 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.8"
+      DESIRED_PYTHON: "3.9"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -1732,7 +810,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_8-cuda11_8
+          name: wheel-py3_9-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -1749,10 +827,10 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_8-cuda11_8-test:  # Testing
+  wheel-py3_9-cpu-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_8-cuda11_8-build
-    runs-on: windows.8xlarge.nvidia.gpu
+    needs: wheel-py3_9-cpu-build
+    runs-on: windows.4xlarge
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -1760,11 +838,10 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.8"
+      DESIRED_PYTHON: "3.9"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -1806,7 +883,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: wheel-py3_8-cuda11_8
+          name: wheel-py3_9-cpu
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -1854,27 +931,26 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_8-cuda11_8-upload:  # Uploading
+  wheel-py3_9-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_8-cuda11_8-test
+    needs: wheel-py3_9-cpu-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.8"
-      build_name: wheel-py3_8-cuda11_8
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DESIRED_PYTHON: "3.9"
+      build_name: wheel-py3_9-cpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_9-cpu-build:
+  wheel-py3_9-cuda11_7-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -1884,8 +960,9 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
+      GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
     steps:
@@ -1962,7 +1039,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_9-cpu
+          name: wheel-py3_9-cuda11_7
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -1979,10 +1056,10 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_9-cpu-test:  # Testing
+  wheel-py3_9-cuda11_7-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_9-cpu-build
-    runs-on: windows.4xlarge
+    needs: wheel-py3_9-cuda11_7-build
+    runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -1990,8 +1067,9 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
+      GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
     steps:
@@ -2035,7 +1113,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: wheel-py3_9-cpu
+          name: wheel-py3_9-cuda11_7
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -2083,26 +1161,27 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_9-cpu-upload:  # Uploading
+  wheel-py3_9-cuda11_7-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_9-cpu-test
+    needs: wheel-py3_9-cuda11_7-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
+      GPU_ARCH_TYPE: cuda
       DESIRED_PYTHON: "3.9"
-      build_name: wheel-py3_9-cpu
+      build_name: wheel-py3_9-cuda11_7
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_9-cuda11_6-build:
+  wheel-py3_9-cuda11_8-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -2112,8 +1191,8 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
@@ -2191,7 +1270,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_9-cuda11_6
+          name: wheel-py3_9-cuda11_8
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -2208,9 +1287,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_9-cuda11_6-test:  # Testing
+  wheel-py3_9-cuda11_8-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_9-cuda11_6-build
+    needs: wheel-py3_9-cuda11_8-build
     runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
@@ -2219,8 +1298,8 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
@@ -2265,7 +1344,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: wheel-py3_9-cuda11_6
+          name: wheel-py3_9-cuda11_8
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -2313,27 +1392,27 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_9-cuda11_6-upload:  # Uploading
+  wheel-py3_9-cuda11_8-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_9-cuda11_6-test
+    needs: wheel-py3_9-cuda11_8-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       DESIRED_PYTHON: "3.9"
-      build_name: wheel-py3_9-cuda11_6
+      build_name: wheel-py3_9-cuda11_8
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_9-cuda11_7-build:
+  wheel-py3_10-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -2343,11 +1422,10 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2422,7 +1500,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_9-cuda11_7
+          name: wheel-py3_10-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -2439,10 +1517,10 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_9-cuda11_7-test:  # Testing
+  wheel-py3_10-cpu-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_9-cuda11_7-build
-    runs-on: windows.8xlarge.nvidia.gpu
+    needs: wheel-py3_10-cpu-build
+    runs-on: windows.4xlarge
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -2450,11 +1528,10 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2496,7 +1573,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: wheel-py3_9-cuda11_7
+          name: wheel-py3_10-cpu
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -2544,27 +1621,26 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_9-cuda11_7-upload:  # Uploading
+  wheel-py3_10-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_9-cuda11_7-test
+    needs: wheel-py3_10-cpu-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu117
-      GPU_ARCH_VERSION: 11.7
-      GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.9"
-      build_name: wheel-py3_9-cuda11_7
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DESIRED_PYTHON: "3.10"
+      build_name: wheel-py3_10-cpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_9-cuda11_8-build:
+  wheel-py3_10-cuda11_7-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -2574,11 +1650,11 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2653,7 +1729,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_9-cuda11_8
+          name: wheel-py3_10-cuda11_7
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -2670,9 +1746,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_9-cuda11_8-test:  # Testing
+  wheel-py3_10-cuda11_7-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_9-cuda11_8-build
+    needs: wheel-py3_10-cuda11_7-build
     runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
@@ -2681,11 +1757,11 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2727,7 +1803,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: wheel-py3_9-cuda11_8
+          name: wheel-py3_10-cuda11_7
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -2775,27 +1851,27 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_9-cuda11_8-upload:  # Uploading
+  wheel-py3_10-cuda11_7-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_9-cuda11_8-test
+    needs: wheel-py3_10-cuda11_7-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
+      DESIRED_CUDA: cu117
+      GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.9"
-      build_name: wheel-py3_9-cuda11_8
+      DESIRED_PYTHON: "3.10"
+      build_name: wheel-py3_10-cuda11_7
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_10-cpu-build:
+  wheel-py3_10-cuda11_8-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -2805,8 +1881,9 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
+      GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.10"
     steps:
@@ -2883,7 +1960,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_10-cpu
+          name: wheel-py3_10-cuda11_8
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -2900,10 +1977,10 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_10-cpu-test:  # Testing
+  wheel-py3_10-cuda11_8-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_10-cpu-build
-    runs-on: windows.4xlarge
+    needs: wheel-py3_10-cuda11_8-build
+    runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -2911,8 +1988,9 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
+      GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.10"
     steps:
@@ -2956,7 +2034,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: wheel-py3_10-cpu
+          name: wheel-py3_10-cuda11_8
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -3004,26 +2082,27 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_10-cpu-upload:  # Uploading
+  wheel-py3_10-cuda11_8-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_10-cpu-test
+    needs: wheel-py3_10-cuda11_8-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
+      GPU_ARCH_TYPE: cuda
       DESIRED_PYTHON: "3.10"
-      build_name: wheel-py3_10-cpu
+      build_name: wheel-py3_10-cuda11_8
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_10-cuda11_6-build:
+  wheel-py3_11-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -3033,11 +2112,10 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3112,7 +2190,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_10-cuda11_6
+          name: wheel-py3_11-cpu
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -3129,10 +2207,10 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_10-cuda11_6-test:  # Testing
+  wheel-py3_11-cpu-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_10-cuda11_6-build
-    runs-on: windows.8xlarge.nvidia.gpu
+    needs: wheel-py3_11-cpu-build
+    runs-on: windows.4xlarge
     timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -3140,11 +2218,10 @@ jobs:
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3186,7 +2263,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: wheel-py3_10-cuda11_6
+          name: wheel-py3_11-cpu
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -3234,27 +2311,26 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_10-cuda11_6-upload:  # Uploading
+  wheel-py3_11-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_10-cuda11_6-test
+    needs: wheel-py3_11-cpu-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
       PACKAGE_TYPE: wheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu116
-      GPU_ARCH_VERSION: 11.6
-      GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.10"
-      build_name: wheel-py3_10-cuda11_6
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DESIRED_PYTHON: "3.11"
+      build_name: wheel-py3_11-cpu
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_10-cuda11_7-build:
+  wheel-py3_11-cuda11_7-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -3268,7 +2344,7 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3343,7 +2419,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_10-cuda11_7
+          name: wheel-py3_11-cuda11_7
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -3360,9 +2436,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_10-cuda11_7-test:  # Testing
+  wheel-py3_11-cuda11_7-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_10-cuda11_7-build
+    needs: wheel-py3_11-cuda11_7-build
     runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
@@ -3375,7 +2451,7 @@ jobs:
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3417,7 +2493,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: wheel-py3_10-cuda11_7
+          name: wheel-py3_11-cuda11_7
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -3465,9 +2541,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_10-cuda11_7-upload:  # Uploading
+  wheel-py3_11-cuda11_7-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_10-cuda11_7-test
+    needs: wheel-py3_11-cuda11_7-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
@@ -3477,15 +2553,15 @@ jobs:
       DESIRED_CUDA: cu117
       GPU_ARCH_VERSION: 11.7
       GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.10"
-      build_name: wheel-py3_10-cuda11_7
+      DESIRED_PYTHON: "3.11"
+      build_name: wheel-py3_11-cuda11_7
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_10-cuda11_8-build:
+  wheel-py3_11-cuda11_8-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: windows.4xlarge
     timeout-minutes: 240
@@ -3499,7 +2575,7 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3574,7 +2650,7 @@ jobs:
       - uses: actions/upload-artifact@v3
         if: always()
         with:
-          name: wheel-py3_10-cuda11_8
+          name: wheel-py3_11-cuda11_8
           retention-days: 14
           if-no-files-found: error
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
@@ -3591,9 +2667,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_10-cuda11_8-test:  # Testing
+  wheel-py3_11-cuda11_8-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_10-cuda11_8-build
+    needs: wheel-py3_11-cuda11_8-build
     runs-on: windows.8xlarge.nvidia.gpu
     timeout-minutes: 240
     env:
@@ -3606,7 +2682,7 @@ jobs:
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.10"
+      DESIRED_PYTHON: "3.11"
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3648,7 +2724,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: wheel-py3_10-cuda11_8
+          name: wheel-py3_11-cuda11_8
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -3696,9 +2772,9 @@ jobs:
         if: always()
         run: |
           .github\scripts\kill_active_ssh_sessions.ps1
-  wheel-py3_10-cuda11_8-upload:  # Uploading
+  wheel-py3_11-cuda11_8-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_10-cuda11_8-test
+    needs: wheel-py3_11-cuda11_8-test
     with:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
@@ -3708,8 +2784,8 @@ jobs:
       DESIRED_CUDA: cu118
       GPU_ARCH_VERSION: 11.8
       GPU_ARCH_TYPE: cuda
-      DESIRED_PYTHON: "3.10"
-      build_name: wheel-py3_10-cuda11_8
+      DESIRED_PYTHON: "3.11"
+      build_name: wheel-py3_11-cuda11_8
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
diff --git a/.github/workflows/inductor-perf-smoke-test.yml b/.github/workflows/inductor-perf-smoke-test.yml
deleted file mode 100644
index 8ba3a48ad7c7..000000000000
--- a/.github/workflows/inductor-perf-smoke-test.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-name: inductor-A100-perf-smoke-test
-
-on:
-  push:
-    branches:
-      - master
-      - main
-  pull_request:
-    paths:
-      - .github/workflows/inductor-perf-smoke-test.yml
-      - benchmarks/dynamo/check_hf_bert_perf_csv.py
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
-
-jobs:
-  linux-bionic-cuda11_6-py3_10-gcc7-inductor-build:
-    name: cuda11.6-py3.10-gcc7-sm80
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm80
-      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
-      cuda-arch-list: '8.0'
-      test-matrix: |
-        { include: [
-          { config: "test_inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
-        ]}
-
-  linux-bionic-cuda11_6-py3_10-gcc7-inductor-test:
-    name: cuda11.6-py3.10-gcc7-sm80
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_6-py3_10-gcc7-inductor-build
-    with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm80
-      docker-image: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-inductor-build.outputs.test-matrix }}
-      use-gha: anything-non-empty-to-use-gha
diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml
index 4967a70732cf..dabf74f872c2 100644
--- a/.github/workflows/inductor-perf-test-nightly.yml
+++ b/.github/workflows/inductor-perf-test-nightly.yml
@@ -2,10 +2,7 @@ name: inductor-A100-perf
 
 on:
   schedule:
-    - cron: 45 1,9,17 * * *
-  pull_request:
-    paths:
-      - .github/workflows/inductor-perf-test-nightly.yml
+    - cron: 45 1 * * *
   push:
     tags:
       - ciflow/inductor-perf-test-nightly/*
@@ -16,12 +13,12 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  linux-bionic-cuda11_6-py3_10-gcc7-inductor-build:
-    name: cuda11.6-py3.10-gcc7-sm80
+  linux-bionic-cuda11_7-py3_10-gcc7-inductor-build:
+    name: cuda11.7-py3.10-gcc7-sm80
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm80
-      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm80
+      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -31,12 +28,12 @@ jobs:
           { config: "inductor_torchbench_perf", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
         ]}
 
-  linux-bionic-cuda11_6-py3_10-gcc7-inductor-test:
-    name: cuda11.6-py3.10-gcc7-sm80
+  linux-bionic-cuda11_7-py3_10-gcc7-inductor-test:
+    name: cuda11.7-py3.10-gcc7-sm80
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_6-py3_10-gcc7-inductor-build
+    needs: linux-bionic-cuda11_7-py3_10-gcc7-inductor-build
     with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm80
-      docker-image: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-inductor-build.outputs.test-matrix }}
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm80
+      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-inductor-build.outputs.test-matrix }}
       use-gha: anything-non-empty-to-use-gha
diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml
index da01fb02adad..40ec9dfe6dc4 100644
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@@ -14,12 +14,12 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  linux-bionic-cuda11_6-py3_10-gcc7-inductor-build:
-    name: cuda11.6-py3.10-gcc7-sm86
+  linux-bionic-cuda11_7-py3_10-gcc7-inductor-build:
+    name: cuda11.7-py3.10-gcc7-sm86
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
-      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
+      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
       cuda-arch-list: '8.6'
       test-matrix: |
         { include: [
@@ -31,11 +31,56 @@ jobs:
           { config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
         ]}
 
-  linux-bionic-cuda11_6-py3_10-gcc7-inductor-test:
-    name: cuda11.6-py3.10-gcc7-sm86
+  linux-bionic-cuda11_7-py3_10-gcc7-inductor-test:
+    name: cuda11.7-py3.10-gcc7-sm86
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_6-py3_10-gcc7-inductor-build
+    needs: linux-bionic-cuda11_7-py3_10-gcc7-inductor-build
     with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
-      docker-image: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-inductor-build.outputs.test-matrix }}
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
+      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-inductor-build.outputs.test-matrix }}
+
+  linux-bionic-cuda11_7-py3_10-gcc7-inductor-build-gcp:
+    name: cuda11.7-py3.10-gcc7-sm80
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm80
+      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
+      cuda-arch-list: '8.0'
+      test-matrix: |
+        { include: [
+          { config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
+        ]}
+
+  linux-bionic-cuda11_7-py3_10-gcc7-inductor-test-gcp:
+    name: cuda11.7-py3.10-gcc7-sm80
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-bionic-cuda11_7-py3_10-gcc7-inductor-build-gcp
+    with:
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm80
+      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-inductor-build-gcp.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-inductor-build-gcp.outputs.test-matrix }}
+      use-gha: anything-non-empty-to-use-gha
+
+  linux-focal-cpu-py3_8-gcc7-inductor-build:
+    name: linux-focal-cpu-py3.8-gcc7-inductor
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-focal-py3_8-gcc7-build
+      docker-image-name: pytorch-linux-focal-py3.8-gcc7
+      test-matrix: |
+        { include: [
+          { config: "inductor_huggingface_cpu_accuracy", shard: 1, num_shards: 1, runner: "linux.4xlarge" },
+          { config: "inductor_timm_cpu_accuracy", shard: 1, num_shards: 2, runner: "linux.4xlarge" },
+          { config: "inductor_timm_cpu_accuracy", shard: 2, num_shards: 2, runner: "linux.4xlarge" },
+          { config: "inductor_torchbench_cpu_accuracy", shard: 1, num_shards: 1, runner: "linux.4xlarge" },
+        ]}
+
+  linux-focal-cpu-py3_8-gcc7-inductor-test:
+    name: linux-focal-cpu-py3.8-gcc7-inductor
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-focal-cpu-py3_8-gcc7-inductor-build
+    with:
+      build-environment: linux-focal-py3_8-gcc7-build
+      docker-image: ${{ needs.linux-focal-cpu-py3_8-gcc7-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cpu-py3_8-gcc7-inductor-build.outputs.test-matrix }}
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 330780677769..5dc152286e50 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -13,116 +13,96 @@ on:
 # The names of steps that actually test the code should be suffixed with `(nonretryable)`.
 # When any other step fails, it's job will be retried once by retryBot.
 jobs:
+  docker-image:
+    name: docker-image
+    uses: ./.github/workflows/_calculate-docker-image.yml
+    with:
+      docker-image-name: pytorch-linux-focal-linter
+
   lintrunner:
-    runs-on: linux.20_04.16x
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
+    needs: docker-image
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.2xlarge
+      docker-image: ${{ needs.docker-image.outputs.docker-image }}
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
 
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.8'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/.github/requirements-gha-cache.txt
+        CACHE_DIRECTORY="/tmp/.lintbin"
+        # Try to recover the cached binaries
+        if [[ -d "${CACHE_DIRECTORY}" ]]; then
+          # It's ok to fail this as lintrunner init would download these binaries
+          # again if they do not exist
+          cp -r "${CACHE_DIRECTORY}" . || true
+        fi
 
-      - name: Install requirements
-        run: |
-          pip install -r .github/requirements-gha-cache.txt --user
+        # This has already been cached in the docker image
+        lintrunner init 2> /dev/null
 
-      - name: Initialize lint dependencies
-        run: lintrunner init
+        # Do build steps necessary for linters
+        python3 -m tools.linter.clang_tidy.generate_build_files
+        python3 -m tools.generate_torch_version --is_debug=false
+        python3 -m tools.pyi.gen_pyi \
+          --native-functions-path aten/src/ATen/native/native_functions.yaml \
+          --tags-path aten/src/ATen/native/tags.yaml \
+          --deprecated-functions-path "tools/autograd/deprecated.yaml"
 
-      - name: Do build steps necessary for linters
-        run: |
-          python3 -m tools.linter.clang_tidy.generate_build_files
-          python3 -m tools.generate_torch_version --is_debug=false
-          python3 -m tools.pyi.gen_pyi \
-            --native-functions-path aten/src/ATen/native/native_functions.yaml \
-            --tags-path aten/src/ATen/native/tags.yaml \
-            --deprecated-functions-path "tools/autograd/deprecated.yaml"
+        RC=0
+        # Run lintrunner on all files
+        if ! lintrunner --force-color --all-files --tee-json=lint.json 2> /dev/null; then
+          echo ""
+          echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m"
+          echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"
+          RC=1
+        fi
 
-      - name: Run lintrunner on all files (nonretryable)
-        run: |
-          set +e
-          if ! lintrunner --force-color --all-files --tee-json=lint.json; then
-              echo ""
-              echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m"
-              echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"
-              exit 1
-          fi
+        # Use jq to massage the JSON lint output into GitHub Actions workflow commands.
+        jq --raw-output \
+          '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \
+          lint.json || true
 
-      - name: Store annotations
-        if: always() && github.event_name == 'pull_request'
-        # Don't show this as an error; the above step will have already failed.
-        continue-on-error: true
-        run: |
-          # Use jq to massage the JSON lint output into GitHub Actions workflow commands.
-          jq --raw-output \
-            '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \
-            lint.json
+        exit $RC
 
   quick-checks:
-    name: quick-checks
-    runs-on: linux.20_04.4x
-    steps:
-      # [see note: pytorch repo ref]
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.x'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/requirements.txt
-      - name: Install requirements
-        id: requirements
-        run: pip install -r requirements.txt --user
-      - name: Ensure no non-breaking spaces
-        if: always()
-        run: |
-          # NB: We use 'printf' below rather than '\u000a' since bash pre-4.2
-          # does not support the '\u000a' syntax (which is relevant for local linters)
-          (! git --no-pager grep -In "$(printf '\xC2\xA0')" -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false))
-      - name: Ensure cross-OS compatible file names (nonretryable)
-        if: always()
-        run: |
-          (! git ls-files | grep -E '([<>:"|?*]|[ .]$)' || (echo "The above file names are not valid across all operating systems. Please ensure they don't contain the characters '<>:""|?*' and don't end with a white space or a '.' "; false))
-      - name: Ensure no versionless Python shebangs (nonretryable)
-        if: always()
-        run: |
-          (! git --no-pager grep -In '#!.*python$' -- . || (echo "The above lines have versionless Python shebangs; please specify either python2 or python3"; false))
-      - name: C++ docs check (nonretryable)
-        if: ${{ always() && steps.requirements.outcome == 'success' }}
-        run: |
-          sudo apt-get install -y doxygen
-          cd docs/cpp/source && ./check-doxygen.sh
-      - name: CUDA kernel launch check (nonretryable)
-        if: ${{ always() && steps.requirements.outcome == 'success' }}
-        run: |
-          set -eux
-          python torch/testing/_internal/check_kernel_launches.py |& tee "${GITHUB_WORKSPACE}"/cuda_kernel_launch_checks.txt
+    needs: docker-image
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.2xlarge
+      docker-image: ${{ needs.docker-image.outputs.docker-image }}
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Ensure no non-breaking spaces
+        # NB: We use 'printf' below rather than '\u000a' since bash pre-4.2
+        # does not support the '\u000a' syntax (which is relevant for local linters)
+        (! git --no-pager grep -In "$(printf '\xC2\xA0')" -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false))
+
+        # Ensure cross-OS compatible file names
+        (! git ls-files | grep -E '([<>:"|?*]|[ .]$)' || (echo "The above file names are not valid across all operating systems. Please ensure they don't contain the characters '<>:""|?*' and don't end with a white space or a '.' "; false))
+
+        # Ensure no versionless Python shebangs
+        (! git --no-pager grep -In '#!.*python$' -- . || (echo "The above lines have versionless Python shebangs; please specify either python2 or python3"; false))
+
+        # Ensure ciflow tags mentioned in config
+        python3 .github/scripts/collect_ciflow_labels.py --validate-tags
+
+        # C++ docs check
+        pushd docs/cpp/source
+        ./check-doxygen.sh
+        popd
+
+        # CUDA kernel launch check
+        set -eux
+        python3 torch/testing/_internal/check_kernel_launches.py |& tee cuda_kernel_launch_checks.txt
 
   pr-sanity-checks:
     name: pr-sanity-checks
-    runs-on: linux.20_04.4x
-    # Only run this on pull requests
+    runs-on: [self-hosted, linux.large]
+    # Only run this on pull requests. This check is simple enough to be done without a Docker image
     if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks')
     steps:
       - name: Checkout PyTorch
@@ -130,6 +110,7 @@ jobs:
         with:
           submodules: false
           fetch-depth: -1
+
       - name: PR size check (nonretryable)
         env:
           BASE: ${{ github.event.pull_request.base.sha }}
@@ -138,136 +119,91 @@ jobs:
           bash .github/scripts/pr-sanity-check.sh
 
   workflow-checks:
-    name: workflow-checks
-    runs-on: linux.20_04.4x
-    steps:
-      # [see note: pytorch repo ref]
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.x'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/requirements.txt
-            **/.github/requirements-gha-cache.txt
-      - name: Install requirements
-        id: requirements
-        run: |
-          pip install -r requirements.txt --user
-      - name: Install Jinja2
-        run: |
-          pip install Jinja2==3.0.1 --user
-      - name: Regenerate workflows (nonretryable)
-        id: generate_workflows
-        run: .github/scripts/generate_ci_workflows.py
-      - name: Assert that regenerating the workflows didn't change them (nonretryable)
-        run: |
-          if ! .github/scripts/report_git_status.sh .github/workflows; then
-            echo
-            echo 'As shown by the above diff, the committed .github/workflows'
-            echo 'are not up to date according to .github/templates.'
-            echo 'Please run this command, commit, and push again to your PR:'
-            echo
-            echo '    .github/scripts/generate_ci_workflows.py'
-            echo
-            echo 'If running that command does nothing, you may need to rebase'
-            echo 'onto a more recent commit from the PyTorch master branch.'
-            false
-          fi
-      - name: Check that jobs will be cancelled (nonretryable)
-        if: ${{ always() && steps.generate_workflows.outcome == 'success' }}
-        run: |
-          .github/scripts/ensure_actions_will_cancel.py
+    needs: docker-image
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.2xlarge
+      docker-image: ${{ needs.docker-image.outputs.docker-image }}
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Regenerate workflows
+        .github/scripts/generate_ci_workflows.py
+
+        RC=0
+        # Assert that regenerating the workflows didn't change them
+        if ! .github/scripts/report_git_status.sh .github/workflows; then
+          echo
+          echo 'As shown by the above diff, the committed .github/workflows'
+          echo 'are not up to date according to .github/templates.'
+          echo 'Please run this command, commit, and push again to your PR:'
+          echo
+          echo '    .github/scripts/generate_ci_workflows.py'
+          echo
+          echo 'If running that command does nothing, you may need to rebase'
+          echo 'onto a more recent commit from the PyTorch master branch.'
+          RC=1
+        fi
+
+        # Check that jobs will be cancelled
+        .github/scripts/ensure_actions_will_cancel.py
+
+        exit $RC
 
   toc:
-    name: toc
-    runs-on: linux.20_04.4x
-    # https://github.com/actions/virtual-environments/issues/599#issuecomment-602754687
-    env:
-      NPM_CONFIG_PREFIX: ~/.npm-global
-    steps:
-      # [see note: pytorch repo ref]
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
-      # This is not a node project so there is no package-lock.json to cache
-      - name: Setup Node
-        uses: actions/setup-node@v3
-      - name: Install markdown-toc
-        run: npm install -g markdown-toc
-      - name: Regenerate ToCs and check that they didn't change (nonretryable)
-        run: |
-          set -eu
-          export PATH=~/.npm-global/bin:"$PATH"
-          for FILE in $(git grep -Il '<!-- toc -->' -- '**.md'); do
-            markdown-toc --bullets='-' -i "$FILE"
-          done
+    needs: docker-image
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.2xlarge
+      docker-image: ${{ needs.docker-image.outputs.docker-image }}
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Regenerate ToCs and check that they didn't change
+        set -eu
 
-          if ! .github/scripts/report_git_status.sh .; then
-            echo
-            echo 'As shown by the above diff, the table of contents in one or'
-            echo 'more Markdown files is not up to date with the file contents.'
-            echo 'You can either apply that Git diff directly to correct the'
-            echo 'table of contents, or if you have npm installed, you can'
-            echo 'install the npm package markdown-toc and run the following'
-            # shellcheck disable=SC2016
-            echo 'command (replacing $FILE with the filename for which you want'
-            echo 'to regenerate the table of contents):'
-            echo
-            # shellcheck disable=SC2016
-            echo "    markdown-toc --bullets='-' -i \"\$FILE\""
-            false
-          fi
+        export PATH=~/.npm-global/bin:"$PATH"
+        for FILE in $(git grep -Il '<!-- toc -->' -- '**.md'); do
+          markdown-toc --bullets='-' -i "$FILE"
+        done
+
+        if ! .github/scripts/report_git_status.sh .; then
+          echo
+          echo 'As shown by the above diff, the table of contents in one or'
+          echo 'more Markdown files is not up to date with the file contents.'
+          echo 'You can either apply that Git diff directly to correct the'
+          echo 'table of contents, or if you have npm installed, you can'
+          echo 'install the npm package markdown-toc and run the following'
+          # shellcheck disable=SC2016
+          echo 'command (replacing $FILE with the filename for which you want'
+          echo 'to regenerate the table of contents):'
+          echo
+          # shellcheck disable=SC2016
+          echo "    markdown-toc --bullets='-' -i \"\$FILE\""
+          false
+        fi
 
   test-tools:
     name: Test tools
     if: ${{ github.repository == 'pytorch/pytorch' }}
-    runs-on: linux.20_04.4x
-    steps:
-      # [see note: pytorch repo ref]
-      # deep clone (fetch-depth 0) required, to allow us to use git log
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.8'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/requirements.txt
-            **/requirements-flake8.txt
-            **/.circleci/docker/requirements-ci.txt
-            **/.github/requirements-gha-cache.txt
-      - name: Install dependencies
-        # mypy and boto3 versions copied from
-        # .circleci/docker/common/install_conda.sh
-        run: |
-          set -eux
-          pip install -r requirements.txt
-          pip install boto3==1.19.12
-          pip install typing-extensions==3.10 --user
-          pip install -r requirements-flake8.txt --user
-          pip install rockset==0.8.10 --user
-          pip install -r requirements.txt --user
-          pip install mypy==0.960 --user
-          make setup_lint
-      - name: Test tools (nonretryable)
-        run: |
-          python3 -m unittest discover -vs tools/test -p 'test_*.py'
-          python3 -m unittest discover -vs .github/scripts -p 'test_*.py'
+    needs: docker-image
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.2xlarge
+      docker-image: ${{ needs.docker-image.outputs.docker-image }}
+      fetch-depth: 0
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Test tools
+        python3 -m unittest discover -vs tools/test -p 'test_*.py'
+        python3 -m unittest discover -vs .github/scripts -p 'test_*.py'
 
   test_collect_env:
     if: ${{ github.repository == 'pytorch/pytorch' }}
diff --git a/.github/workflows/mac-mps.yml b/.github/workflows/mac-mps.yml
index 663eac84514f..bc76b1b796a2 100644
--- a/.github/workflows/mac-mps.yml
+++ b/.github/workflows/mac-mps.yml
@@ -17,8 +17,7 @@ jobs:
     with:
       sync-tag: macos-12-py3-arm64-build
       build-environment: macos-12-py3-arm64
-      xcode-version: "13.3.1"
-      runner-type: macos-12-xl
+      runner-type: macos-m1-12
       build-generates-artifacts: true
       # To match the one pre-installed in the m1 runners
       python_version: 3.9.12
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 5c1de3dac547..d0b362d34d5e 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -21,17 +21,17 @@ jobs:
     name: docs build
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-py3.7-gcc7
-      docker-image-name: pytorch-linux-focal-py3.7-gcc7
+      build-environment: linux-focal-py3.8-gcc7
+      docker-image-name: pytorch-linux-focal-py3.8-gcc7
 
   docs-push:
     name: docs push
     uses: ./.github/workflows/_docs.yml
     needs: docs-build
     with:
-      build-environment: linux-focal-py3.7-gcc7
+      build-environment: linux-focal-py3.8-gcc7
       docker-image: ${{ needs.docs-build.outputs.docker-image }}
-      push: true
+      push: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }}
       run-doxygen: true
     secrets:
       GH_PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN }}
diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index a76e37413f3b..a9b41e379650 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -14,126 +14,128 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  parallelnative-linux-focal-py3_7-gcc7-build:
-    name: parallelnative-linux-focal-py3.7-gcc7
+  parallelnative-linux-focal-py3_8-gcc7-build:
+    name: parallelnative-linux-focal-py3.8-gcc7
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: parallelnative-linux-focal-py3.7-gcc7
-      docker-image-name: pytorch-linux-focal-py3.7-gcc7
+      build-environment: parallelnative-linux-focal-py3.8-gcc7
+      docker-image-name: pytorch-linux-focal-py3.8-gcc7
       test-matrix: |
         { include: [
           { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
           { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
         ]}
 
-  parallelnative-linux-focal-py3_7-gcc7-test:
-    name: parallelnative-linux-focal-py3.7-gcc7
+  parallelnative-linux-focal-py3_8-gcc7-test:
+    name: parallelnative-linux-focal-py3.8-gcc7
     uses: ./.github/workflows/_linux-test.yml
-    needs: parallelnative-linux-focal-py3_7-gcc7-build
+    needs: parallelnative-linux-focal-py3_8-gcc7-build
     with:
-      build-environment: parallelnative-linux-focal-py3.7-gcc7
-      docker-image: ${{ needs.parallelnative-linux-focal-py3_7-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.parallelnative-linux-focal-py3_7-gcc7-build.outputs.test-matrix }}
+      build-environment: parallelnative-linux-focal-py3.8-gcc7
+      docker-image: ${{ needs.parallelnative-linux-focal-py3_8-gcc7-build.outputs.docker-image }}
+      test-matrix: ${{ needs.parallelnative-linux-focal-py3_8-gcc7-build.outputs.test-matrix }}
 
-  linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build:
-    name: linux-bionic-cuda11.6-py3-gcc7-slow-gradcheck
+  linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build:
+    name: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-cuda11.6-py3-gcc7-slow-gradcheck
-      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
+      build-environment: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
+      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
       test-matrix: |
         { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 1, num_shards: 3, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 3, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 3, runner: "linux.4xlarge.nvidia.gpu" },
         ]}
 
-  linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-test:
-    name: linux-bionic-cuda11.6-py3-gcc7-slow-gradcheck
+  linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-test:
+    name: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build
+    needs: linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build
     with:
-      build-environment: linux-bionic-cuda11.6-py3-gcc7-slow-gradcheck
-      docker-image: ${{ needs.linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build.outputs.test-matrix }}
+      build-environment: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
+      docker-image: ${{ needs.linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build.outputs.test-matrix }}
       timeout-minutes: 300
 
-  linux-bionic-cuda11_6-py3_10-gcc7-periodic-dynamo-benchmarks-build:
-    name: cuda11.6-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
+  linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build:
+    name: cuda11.7-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
-      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
+      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
       cuda-arch-list: '8.6'
       test-matrix: |
         { include: [
           { config: "aot_eager_all", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          # These jobs run too slowly so they must be sharded, unfortunately
+          { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
         ]}
 
-  linux-bionic-cuda11_6-py3_10-gcc7-periodic-dynamo-benchmarks-test:
-    name: cuda11.6-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
+  linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-test:
+    name: cuda11.7-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_6-py3_10-gcc7-periodic-dynamo-benchmarks-build
+    needs: linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build
     with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
-      docker-image: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
+      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
 
-  linux-focal-rocm5_3-py3_8-build:
-    name: linux-focal-rocm5.3-py3.8
+  linux-focal-rocm5_4_2-py3_8-build:
+    name: linux-focal-rocm5.4.2-py3.8
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-rocm5.3-py3.8
-      docker-image-name: pytorch-linux-focal-rocm5.3-py3.8
+      build-environment: linux-focal-rocm5.4.2-py3.8
+      docker-image-name: pytorch-linux-focal-rocm-n-py3
       test-matrix: |
         { include: [
           { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
+          { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
+          { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
         ]}
-      # test-matrix: |
-      #   { include: [
-      #     { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
-      #     { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
-      #     { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
-      #   ]}
-
-  linux-focal-rocm5_3-py3_8-test:
-    name: linux-focal-rocm5.3-py3.8
+
+  linux-focal-rocm5_4_2-py3_8-test:
+    name: linux-focal-rocm5.4.2-py3.8
     uses: ./.github/workflows/_rocm-test.yml
-    needs: linux-focal-rocm5_3-py3_8-build
+    needs: linux-focal-rocm5_4_2-py3_8-build
     with:
-      build-environment: linux-focal-rocm5.3-py3.8
-      docker-image: ${{ needs.linux-focal-rocm5_3-py3_8-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-rocm5_3-py3_8-build.outputs.test-matrix }}
+      build-environment: linux-focal-rocm5.4.2-py3.8
+      docker-image: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.test-matrix }}
     secrets:
       AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
       AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
 
-  linux-bionic-cuda11_6-py3_9-gcc7-build:
-    name: linux-bionic-cuda11.6-py3.9-gcc7
+  linux-bionic-cuda11_7-py3_9-gcc7-build:
+    name: linux-bionic-cuda11.7-py3.9-gcc7
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-cuda11.6-py3.9-gcc7
-      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
+      build-environment: linux-bionic-cuda11.7-py3.9-gcc7
+      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
       test-matrix: |
         { include: [
           { config: "multigpu", shard: 1, num_shards: 1, runner: "linux.16xlarge.nvidia.gpu" },
         ]}
       build-with-debug: false
 
-  linux-bionic-cuda11_6-py3_9-gcc7-test:
-    name: linux-bionic-cuda11.6-py3.9-gcc7
+  linux-bionic-cuda11_7-py3_9-gcc7-test:
+    name: linux-bionic-cuda11.7-py3.9-gcc7
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_6-py3_9-gcc7-build
+    needs: linux-bionic-cuda11_7-py3_9-gcc7-build
     with:
-      build-environment: linux-bionic-cuda11.6-py3.9-gcc7
-      docker-image: ${{ needs.linux-bionic-cuda11_6-py3_9-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_9-gcc7-build.outputs.test-matrix }}
+      build-environment: linux-bionic-cuda11.7-py3.9-gcc7
+      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_9-gcc7-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_9-gcc7-build.outputs.test-matrix }}
 
-  linux-bionic-cuda11_6-py3_7-gcc7-debug-build:
-    name: linux-bionic-cuda11.6-py3.7-gcc7-debug
+  linux-bionic-cuda11_7-py3_10-gcc7-debug-build:
+    name: linux-bionic-cuda11.7-py3.10-gcc7-debug
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-cuda11.6-py3.7-gcc7-debug
-      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-debug
+      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
       build-with-debug: true
       test-matrix: |
         { include: [
@@ -143,21 +145,21 @@ jobs:
           { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
         ]}
 
-  linux-bionic-cuda11_6-py3_7-gcc7-debug-test:
-    name: linux-bionic-cuda11.6-py3.7-gcc7-debug
+  linux-bionic-cuda11_7-py3_10-gcc7-debug-test:
+    name: linux-bionic-cuda11.7-py3.10-gcc7-debug
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_6-py3_7-gcc7-debug-build
+    needs: linux-bionic-cuda11_7-py3_10-gcc7-debug-build
     with:
-      build-environment: linux-bionic-cuda11.6-py3.7-gcc7-debug
-      docker-image: ${{ needs.linux-bionic-cuda11_6-py3_7-gcc7-debug-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_7-gcc7-debug-build.outputs.test-matrix }}
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-debug
+      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-debug-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-debug-build.outputs.test-matrix }}
 
-  linux-bionic-cuda11_7-py3_7-gcc7-debug-build:
-    name: linux-bionic-cuda11.7-py3.7-gcc7-debug
+  linux-bionic-cuda11_8-py3_8-gcc7-debug-build:
+    name: linux-bionic-cuda11.8-py3.8-gcc7-debug
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-cuda11.7-py3.7-gcc7-debug
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
+      build-environment: linux-bionic-cuda11.8-py3.8-gcc7-debug
+      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
       build-with-debug: true
       test-matrix: |
         { include: [
@@ -167,20 +169,51 @@ jobs:
           { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
         ]}
 
-  linux-bionic-cuda11_7-py3_7-gcc7-debug-test:
-    name: linux-bionic-cuda11.7-py3.7-gcc7-debug
+  linux-bionic-cuda11_8-py3_8-gcc7-debug-test:
+    name: linux-bionic-cuda11.8-py3.8-gcc7-debug
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_7-py3_7-gcc7-debug-build
+    needs: linux-bionic-cuda11_8-py3_8-gcc7-debug-build
+    with:
+      build-environment: linux-bionic-cuda11.8-py3.8-gcc7-debug
+      docker-image: ${{ needs.linux-bionic-cuda11_8-py3_8-gcc7-debug-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_8-gcc7-debug-build.outputs.test-matrix }}
+
+  libtorch-linux-bionic-cuda11_8-gcc7-build:
+    name: libtorch-linux-bionic-cuda11.8-gcc7
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: libtorch-linux-bionic-cuda11.8-gcc7
+      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
+      build-generates-artifacts: false
+
+  win-vs2019-cuda11_8-py3-build:
+    name: win-vs2019-cuda11.8-py3
+    uses: ./.github/workflows/_win-build.yml
+    with:
+      build-environment: win-vs2019-cuda11.8-py3
+      cuda-version: "11.8"
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
+        ]}
+
+  win-vs2019-cuda11_8-py3-test:
+    name: win-vs2019-cuda11.8-py3
+    uses: ./.github/workflows/_win-test.yml
+    needs: win-vs2019-cuda11_8-py3-build
     with:
-      build-environment: linux-bionic-cuda11.7-py3.7-gcc7-debug
-      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_7-gcc7-debug-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_7-gcc7-debug-build.outputs.test-matrix }}
+      build-environment: win-vs2019-cuda11.8-py3
+      cuda-version: "11.8"
+      test-matrix: ${{ needs.win-vs2019-cuda11_8-py3-build.outputs.test-matrix }}
 
-  libtorch-linux-bionic-cuda11_7-py3_7-gcc7-build:
-    name: libtorch-linux-bionic-cuda11.7-py3.7-gcc7
+  libtorch-linux-bionic-cuda11_7-gcc7-build:
+    name: libtorch-linux-bionic-cuda11.7-gcc7
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: libtorch-linux-bionic-cuda11.7-py3.7-gcc7
+      build-environment: libtorch-linux-bionic-cuda11.7-gcc7
       docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
       build-generates-artifacts: false
 
@@ -192,9 +225,9 @@ jobs:
       cuda-version: "11.7"
       test-matrix: |
         { include: [
-          { config: "default", shard: 1, num_shards: 3, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 3, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 3, runner: "windows.8xlarge.nvidia.gpu" },
+          { config: "default", shard: 1, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
           { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
         ]}
 
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 9b210716d10a..90259dc80d68 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -17,85 +17,83 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  linux-focal-py3_7-gcc7-build:
-    name: linux-focal-py3.7-gcc7
+  linux-focal-py3_8-gcc7-build:
+    name: linux-focal-py3.8-gcc7
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-py3.7-gcc7
-      docker-image-name: pytorch-linux-focal-py3.7-gcc7
+      build-environment: linux-focal-py3.8-gcc7
+      docker-image-name: pytorch-linux-focal-py3.8-gcc7
       test-matrix: |
         { include: [
           { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
           { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "distributed", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "distributed", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
           { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
           { config: "docs_test", shard: 1, num_shards: 1,  runner: "linux.2xlarge" },
           { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
           { config: "backwards_compat", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
         ]}
 
-  linux-focal-py3_7-gcc7-test:
-    name: linux-focal-py3.7-gcc7
+  linux-focal-py3_8-gcc7-test:
+    name: linux-focal-py3.8-gcc7
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-py3_7-gcc7-build
+    needs: linux-focal-py3_8-gcc7-build
     with:
-      build-environment: linux-focal-py3.7-gcc7
-      docker-image: ${{ needs.linux-focal-py3_7-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_7-gcc7-build.outputs.test-matrix }}
+      build-environment: linux-focal-py3.8-gcc7
+      docker-image: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.test-matrix }}
 
   linux-docs:
     name: linux-docs
     uses: ./.github/workflows/_docs.yml
-    needs: linux-focal-py3_7-gcc7-build
+    needs: linux-focal-py3_8-gcc7-build
     with:
-      build-environment: linux-focal-py3.7-gcc7
-      docker-image: ${{ needs.linux-focal-py3_7-gcc7-build.outputs.docker-image }}
+      build-environment: linux-focal-py3.8-gcc7
+      docker-image: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.docker-image }}
 
-  linux-focal-py3_7-gcc7-no-ops:
-    name: linux-focal-py3.7-gcc7-no-ops
+  linux-focal-py3_8-gcc7-no-ops:
+    name: linux-focal-py3.8-gcc7-no-ops
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-py3.7-gcc7-no-ops
-      docker-image-name: pytorch-linux-focal-py3.7-gcc7
+      build-environment: linux-focal-py3.8-gcc7-no-ops
+      docker-image-name: pytorch-linux-focal-py3.8-gcc7
 
-  linux-focal-py3_7-gcc7-pch:
-    name: linux-focal-py3.7-gcc7-pch
+  linux-focal-py3_8-gcc7-pch:
+    name: linux-focal-py3.8-gcc7-pch
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-py3.7-gcc7-pch
-      docker-image-name: pytorch-linux-focal-py3.7-gcc7
+      build-environment: linux-focal-py3.8-gcc7-pch
+      docker-image-name: pytorch-linux-focal-py3.8-gcc7
 
-  linux-focal-py3_7-clang7-asan-build:
-    name: linux-focal-py3.7-clang7-asan
+  linux-focal-py3_9-clang7-asan-build:
+    name: linux-focal-py3.9-clang7-asan
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-py3.7-clang7-asan
+      build-environment: linux-focal-py3.9-clang7-asan
       docker-image-name: pytorch-linux-focal-py3-clang7-asan
       test-matrix: |
         { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 5, runner: "linux.2xlarge" },
-          { config: "default", shard: 3, num_shards: 5, runner: "linux.2xlarge" },
+          { config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge" },
+          { config: "default", shard: 2, num_shards: 5, runner: "linux.4xlarge" },
+          { config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge" },
           { config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge" },
           { config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge" },
           { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
         ]}
 
-  linux-focal-py3_7-clang7-asan-test:
-    name: linux-focal-py3.7-clang7-asan
+  linux-focal-py3_9-clang7-asan-test:
+    name: linux-focal-py3.9-clang7-asan
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-py3_7-clang7-asan-build
+    needs: linux-focal-py3_9-clang7-asan-build
     with:
-      build-environment: linux-focal-py3.7-clang7-asan
-      docker-image: ${{ needs.linux-focal-py3_7-clang7-asan-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_7-clang7-asan-build.outputs.test-matrix }}
+      build-environment: linux-focal-py3.9-clang7-asan
+      docker-image: ${{ needs.linux-focal-py3_9-clang7-asan-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-py3_9-clang7-asan-build.outputs.test-matrix }}
 
-  linux-focal-py3_7-clang10-onnx-build:
-    name: linux-focal-py3.7-clang10-onnx
+  linux-focal-py3_8-clang10-onnx-build:
+    name: linux-focal-py3.8-clang10-onnx
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-py3.7-clang10-onnx
+      build-environment: linux-focal-py3.8-clang10-onnx
       docker-image-name: pytorch-linux-focal-py3-clang10-onnx
       test-matrix: |
         { include: [
@@ -103,21 +101,21 @@ jobs:
           { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
         ]}
 
-  linux-focal-py3_7-clang10-onnx-test:
-    name: linux-focal-py3.7-clang10-onnx
+  linux-focal-py3_8-clang10-onnx-test:
+    name: linux-focal-py3.8-clang10-onnx
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-py3_7-clang10-onnx-build
+    needs: linux-focal-py3_8-clang10-onnx-build
     with:
-      build-environment: linux-focal-py3.7-clang10-onnx
-      docker-image: ${{ needs.linux-focal-py3_7-clang10-onnx-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_7-clang10-onnx-build.outputs.test-matrix }}
+      build-environment: linux-focal-py3.8-clang10-onnx
+      docker-image: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.test-matrix }}
 
-  linux-bionic-py3_7-clang9-build:
-    name: linux-bionic-py3.7-clang9
+  linux-bionic-py3_8-clang9-build:
+    name: linux-bionic-py3.8-clang9
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-py3.7-clang9
-      docker-image-name: pytorch-linux-bionic-py3.7-clang9
+      build-environment: linux-bionic-py3.8-clang9
+      docker-image-name: pytorch-linux-bionic-py3.8-clang9
       test-matrix: |
         { include: [
           { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
@@ -129,41 +127,67 @@ jobs:
           { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
         ]}
 
-  linux-bionic-py3_7-clang9-test:
-    name: linux-bionic-py3.7-clang9
+  linux-bionic-py3_8-clang9-test:
+    name: linux-bionic-py3.8-clang9
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-py3_7-clang9-build
+    needs: linux-bionic-py3_8-clang9-build
     with:
-      build-environment: linux-bionic-py3.7-clang9
-      docker-image: ${{ needs.linux-bionic-py3_7-clang9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-py3_7-clang9-build.outputs.test-matrix }}
+      build-environment: linux-bionic-py3.8-clang9
+      docker-image: ${{ needs.linux-bionic-py3_8-clang9-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-py3_8-clang9-build.outputs.test-matrix }}
 
-  linux-vulkan-bionic-py3_7-clang9-build:
-    name: linux-vulkan-bionic-py3.7-clang9
+  linux-bionic-py3_11-clang9-build:
+    name: linux-bionic-py3.11-clang9
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-vulkan-bionic-py3.7-clang9
-      docker-image-name: pytorch-linux-bionic-py3.7-clang9
+      build-environment: linux-bionic-py3.11-clang9
+      docker-image-name: pytorch-linux-bionic-py3.11-clang9
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
+          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
+          { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
+          { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
+          { config: "dynamo", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
+          { config: "dynamo", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
+          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+        ]}
+
+  linux-bionic-py3_11-clang9-test:
+    name: linux-bionic-py3.11-clang9
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-bionic-py3_11-clang9-build
+    with:
+      build-environment: linux-bionic-py3.11-clang9
+      docker-image: ${{ needs.linux-bionic-py3_11-clang9-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-py3_11-clang9-build.outputs.test-matrix }}
+
+  linux-vulkan-bionic-py3_11-clang9-build:
+    name: linux-vulkan-bionic-py3.11-clang9
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-vulkan-bionic-py3.11-clang9
+      docker-image-name: pytorch-linux-bionic-py3.11-clang9
       test-matrix: |
         { include: [
           { config: "default", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
         ]}
 
-  linux-vulkan-bionic-py3_7-clang9-test:
-    name: linux-vulkan-bionic-py3.7-clang9
+  linux-vulkan-bionic-py3_11-clang9-test:
+    name: linux-vulkan-bionic-py3.11-clang9
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-vulkan-bionic-py3_7-clang9-build
+    needs: linux-vulkan-bionic-py3_11-clang9-build
     with:
-      build-environment: linux-vulkan-bionic-py3.7-clang9
-      docker-image: ${{ needs.linux-vulkan-bionic-py3_7-clang9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-vulkan-bionic-py3_7-clang9-build.outputs.test-matrix }}
+      build-environment: linux-vulkan-bionic-py3.11-clang9
+      docker-image: ${{ needs.linux-vulkan-bionic-py3_11-clang9-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-vulkan-bionic-py3_11-clang9-build.outputs.test-matrix }}
 
-  linux-bionic-cuda11_6-py3_10-gcc7-build:
-    name: linux-bionic-cuda11.6-py3.10-gcc7
+  linux-bionic-cuda11_7-py3_10-gcc7-build:
+    name: linux-bionic-cuda11.7-py3.10-gcc7
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7
-      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7
+      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
       test-matrix: |
         { include: [
           { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
@@ -177,14 +201,14 @@ jobs:
           { config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
         ]}
 
-  linux-bionic-cuda11_6-py3_10-gcc7-test:
-    name: linux-bionic-cuda11.6-py3.10-gcc7
+  linux-bionic-cuda11_7-py3_10-gcc7-test:
+    name: linux-bionic-cuda11.7-py3.10-gcc7
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_6-py3_10-gcc7-build
+    needs: linux-bionic-cuda11_7-py3_10-gcc7-build
     with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7
-      docker-image: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-build.outputs.test-matrix }}
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7
+      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-build.outputs.test-matrix }}
 
   linux-focal-py3-clang7-mobile-build:
     name: linux-focal-py3-clang7-mobile-build
@@ -194,12 +218,12 @@ jobs:
       docker-image-name: pytorch-linux-focal-py3-clang7-asan
       build-generates-artifacts: false
 
-  linux-jammy-cuda-11_6-cudnn8-py3_8-clang12-build:
-    name: linux-jammy-cuda11.6-cudnn8-py3.8-clang12
+  linux-jammy-cuda-11_7-cudnn8-py3_8-clang12-build:
+    name: linux-jammy-cuda11.7-cudnn8-py3.8-clang12
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-jammy-cuda11.6-cudnn8-py3.8-clang12
-      docker-image-name: pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12
+      build-environment: linux-jammy-cuda11.7-cudnn8-py3.8-clang12
+      docker-image-name: pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12
 
   linux-focal-py3-clang7-mobile-custom-build-static:
     name: linux-focal-py3-clang7-mobile-custom-build-static
@@ -209,6 +233,26 @@ jobs:
       docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
       build-generates-artifacts: false
 
+  linux-bionic-py3_8-clang8-xla-build:
+    name: linux-bionic-py3_8-clang8-xla
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-bionic-py3_8-clang8-xla
+      docker-image-name: xla_base
+      test-matrix: |
+        { include: [
+          { config: "xla", shard: 1, num_shards: 1, runner: "linux.4xlarge" },
+        ]}
+
+  linux-bionic-py3_8-clang8-xla-test:
+    name: linux-bionic-py3_8-clang8-xla
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-bionic-py3_8-clang8-xla-build
+    with:
+      build-environment: linux-bionic-py3_8-clang8-xla
+      docker-image: ${{ needs.linux-bionic-py3_8-clang8-xla-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-py3_8-clang8-xla-build.outputs.test-matrix }}
+
   win-vs2019-cpu-py3-build:
     name: win-vs2019-cpu-py3
     uses: ./.github/workflows/_win-build.yml
@@ -231,31 +275,31 @@ jobs:
       cuda-version: cpu
       test-matrix: ${{ needs.win-vs2019-cpu-py3-build.outputs.test-matrix }}
 
-  win-vs2019-cuda11_6-py3-build:
+  win-vs2019-cuda11_7-py3-build:
     if: github.event_name == 'pull_request'
-    name: win-vs2019-cuda11.6-py3
+    name: win-vs2019-cuda11.7-py3
     uses: ./.github/workflows/_win-build.yml
     with:
-      build-environment: win-vs2019-cuda11.6-py3
-      cuda-version: "11.6"
+      build-environment: win-vs2019-cuda11.7-py3
+      cuda-version: "11.7"
       sync-tag: win-cuda-build
       test-matrix: |
         { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 5, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "windows.8xlarge.nvidia.gpu" },
+          { config: "default", shard: 1, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "functorch", shard: 1, num_shards: 1, runner: "windows.g5.4xlarge.nvidia.gpu" },
           { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
         ]}
 
-  linux-bionic-cuda11_6-py3_10-gcc7-bazel-test:
-    name: linux-bionic-cuda11.6-py3.10-gcc7-bazel-test
+  linux-bionic-cuda11_7-py3_10-gcc7-bazel-test:
+    name: linux-bionic-cuda11.7-py3.10-gcc7-bazel-test
     uses: ./.github/workflows/_bazel-build-test.yml
     with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-bazel-test
-      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-bazel-test
+      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
 
   linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single:
     name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single
@@ -271,25 +315,52 @@ jobs:
       build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit
       docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
 
-  linux-focal-py3_7-gcc7-mobile-lightweight-dispatch-build:
-    name: linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build
+  linux-focal-py3_8-gcc7-mobile-lightweight-dispatch-build:
+    name: linux-focal-py3.8-gcc7-mobile-lightweight-dispatch-build
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build
-      docker-image-name: pytorch-linux-focal-py3.7-gcc7
+      build-environment: linux-focal-py3.8-gcc7-mobile-lightweight-dispatch-build
+      docker-image-name: pytorch-linux-focal-py3.8-gcc7
       build-generates-artifacts: false
 
-  linux-focal-rocm5_3-py3_8-build:
+  linux-focal-rocm5_4_2-py3_8-build:
     # don't run build twice on master
     if: github.event_name == 'pull_request'
-    name: linux-focal-rocm5.3-py3.8
+    name: linux-focal-rocm5.4.2-py3.8
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-rocm5.3-py3.8
-      docker-image-name: pytorch-linux-focal-rocm5.3-py3.8
+      build-environment: linux-focal-rocm5.4.2-py3.8
+      docker-image-name: pytorch-linux-focal-rocm-n-py3
       sync-tag: rocm-build
       test-matrix: |
         { include: [
           { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
           { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
         ]}
+
+  linux-bionic-cuda11_7-py3_10-gcc7-sm86-build:
+    name: linux-bionic-cuda11.7-py3.10-gcc7-sm86
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
+      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
+      cuda-arch-list: 8.6
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "slow", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "slow", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+        ]}
+
+  linux-bionic-cuda11_7-py3_10-gcc7-sm86-test:
+    name: linux-bionic-cuda11.7-py3.10-gcc7-sm86
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-bionic-cuda11_7-py3_10-gcc7-sm86-build
+    with:
+      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
+      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-sm86-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-sm86-build.outputs.test-matrix }}
diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml
index 6ffdf31d1da8..8d55f6a9479c 100644
--- a/.github/workflows/run_torchbench.yml
+++ b/.github/workflows/run_torchbench.yml
@@ -40,8 +40,8 @@ jobs:
           . "${SETUP_SCRIPT}"
           conda activate pr-ci
           conda install -y numpy="${NUMPY_VERSION}" requests ninja pyyaml mkl mkl-include \
-                           setuptools cmake=3.22.* typing_extensions boto3 \
-                           six pillow pytest tabulate gitpython git-lfs tqdm psutil
+                           setuptools cmake=3.22.* typing-extensions boto3 \
+                           pillow pytest tabulate gitpython git-lfs tqdm psutil
           pip install --pre torch torchvision torchtext -f https://download.pytorch.org/whl/nightly/cu116/torch_nightly.html
       - name: Setup TorchBench branch
         run: |
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 5f2339e3c7de..85683d41c145 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -19,13 +19,14 @@ concurrency:
 
 jobs:
   # Build PyTorch with BUILD_CAFFE2=ON
-  caffe2-linux-focal-py3_7-gcc7-build:
-    name: caffe2-linux-focal-py3.7-gcc7
+  caffe2-linux-focal-py3_8-gcc7-build:
+    name: caffe2-linux-focal-py3.8-gcc7
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: caffe2-linux-focal-py3.7-gcc7
-      docker-image-name: pytorch-linux-focal-py3.7-gcc7
+      build-environment: caffe2-linux-focal-py3.8-gcc7
+      docker-image-name: pytorch-linux-focal-py3.8-gcc7
 
+  # We only have the configs that are not already on the same pull job here
   linux-bionic-cuda11_7-py3_10-gcc7-build:
     name: linux-bionic-cuda11.7-py3.10-gcc7
     uses: ./.github/workflows/_linux-build.yml
@@ -34,17 +35,9 @@ jobs:
       docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
       test-matrix: |
         { include: [
-          { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
           { config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
           { config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
           { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
         ]}
 
   linux-bionic-cuda11_7-py3_10-gcc7-test:
@@ -56,39 +49,42 @@ jobs:
       docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-build.outputs.test-matrix }}
 
-  linux-bionic-cuda11_6-py3_10-gcc7-sm86-build:
-    name: linux-bionic-cuda11.6-py3.10-gcc7-sm86
+  linux-bionic-cuda11_8-py3_10-gcc7-build:
+    name: linux-bionic-cuda11.8-py3.10-gcc7
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
-      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
-      cuda-arch-list: 8.6
+      build-environment: linux-bionic-cuda11.8-py3.10-gcc7
+      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
       test-matrix: |
         { include: [
-          { config: "default", shard: 1, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "slow", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "slow", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+          { config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
         ]}
 
-  linux-bionic-cuda11_6-py3_10-gcc7-sm86-test:
-    name: linux-bionic-cuda11.6-py3.10-gcc7-sm86
+  linux-bionic-cuda11_8-py3_10-gcc7-test:
+    name: linux-bionic-cuda11.8-py3.10-gcc7
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_6-py3_10-gcc7-sm86-build
+    needs: linux-bionic-cuda11_8-py3_10-gcc7-build
     with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
-      docker-image: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-sm86-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-sm86-build.outputs.test-matrix }}
+      build-environment: linux-bionic-cuda11.8-py3.10-gcc7
+      docker-image: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-build.outputs.test-matrix }}
 
-  libtorch-linux-bionic-cuda11_6-py3_7-gcc7-build:
-    name: libtorch-linux-bionic-cuda11.6-py3.7-gcc7
+  libtorch-linux-bionic-cuda11_7-py3_7-gcc7-build:
+    name: libtorch-linux-bionic-cuda11.7-py3.7-gcc7
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: libtorch-linux-bionic-cuda11.6-py3.7-gcc7
-      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
+      build-environment: libtorch-linux-bionic-cuda11.7-py3.7-gcc7
+      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
       build-generates-artifacts: false
       runner: linux.4xlarge
 
@@ -107,45 +103,45 @@ jobs:
       build-environment: pytorch-linux-focal-py3-clang7-android-ndk-r19c-build
       docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
 
-  linux-bionic-py3_7-clang9-slow-build:
-    name: linux-bionic-py3.7-clang9-slow
+  linux-bionic-py3_8-clang9-slow-build:
+    name: linux-bionic-py3.8-clang9-slow
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-py3.7-clang9-slow
-      docker-image-name: pytorch-linux-bionic-py3.7-clang9
+      build-environment: linux-bionic-py3.8-clang9-slow
+      docker-image-name: pytorch-linux-bionic-py3.8-clang9
       test-matrix: |
         { include: [
           { config: "slow", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
         ]}
 
-  linux-bionic-py3_7-clang9-slow-test:
-    name: linux-bionic-py3.7-clang9-slow
+  linux-bionic-py3_8-clang9-slow-test:
+    name: linux-bionic-py3.8-clang9-slow
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-py3_7-clang9-slow-build
+    needs: linux-bionic-py3_8-clang9-slow-build
     with:
-      build-environment: linux-bionic-py3.7-clang9-slow
-      docker-image: ${{ needs.linux-bionic-py3_7-clang9-slow-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-py3_7-clang9-slow-build.outputs.test-matrix }}
+      build-environment: linux-bionic-py3.8-clang9-slow
+      docker-image: ${{ needs.linux-bionic-py3_8-clang9-slow-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-bionic-py3_8-clang9-slow-build.outputs.test-matrix }}
 
-  linux-focal-py3_7-clang7-tsan-build:
-    name: linux-focal-py3.7-clang7-tsan
+  linux-focal-py3_9-clang7-tsan-build:
+    name: linux-focal-py3.9-clang7-tsan
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-py3.7-clang7-tsan
+      build-environment: linux-focal-py3.9-clang7-tsan
       docker-image-name: pytorch-linux-focal-py3-clang7-asan
       test-matrix: |
         { include: [
           { config: "tsan", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
         ]}
 
-  linux-focal-py3_7-clang7-tsan-test:
-    name: linux-focal-py3.7-clang7-tsan
+  linux-focal-py3_9-clang7-tsan-test:
+    name: linux-focal-py3.9-clang7-tsan
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-py3_7-clang7-tsan-build
+    needs: linux-focal-py3_9-clang7-tsan-build
     with:
-      build-environment: linux-focal-py3.7-clang7-tsan
-      docker-image: ${{ needs.linux-focal-py3_7-clang7-tsan-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_7-clang7-tsan-build.outputs.test-matrix }}
+      build-environment: linux-focal-py3.9-clang7-tsan
+      docker-image: ${{ needs.linux-focal-py3_9-clang7-tsan-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-py3_9-clang7-tsan-build.outputs.test-matrix }}
 
   ios-12-5-1-x86-64:
     name: ios-12-5-1-x86-64
@@ -204,8 +200,7 @@ jobs:
     with:
       sync-tag: macos-12-py3-arm64-build
       build-environment: macos-12-py3-arm64
-      xcode-version: "13.3.1"
-      runner-type: macos-12-xl
+      runner-type: macos-m1-12
       build-generates-artifacts: true
       # To match the one pre-installed in the m1 runners
       python_version: 3.9.12
@@ -244,39 +239,39 @@ jobs:
       AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
       AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
 
-  win-vs2019-cuda11_6-py3-build:
-    name: win-vs2019-cuda11.6-py3
+  win-vs2019-cuda11_7-py3-build:
+    name: win-vs2019-cuda11.7-py3
     uses: ./.github/workflows/_win-build.yml
     with:
-      build-environment: win-vs2019-cuda11.6-py3
-      cuda-version: "11.6"
+      build-environment: win-vs2019-cuda11.7-py3
+      cuda-version: "11.7"
       sync-tag: win-cuda-build
       test-matrix: |
         { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 5, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "windows.8xlarge.nvidia.gpu" },
+          { config: "default", shard: 1, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
+          { config: "functorch", shard: 1, num_shards: 1, runner: "windows.g5.4xlarge.nvidia.gpu" },
           { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
         ]}
 
-  win-vs2019-cuda11_6-py3-test:
-    name: win-vs2019-cuda11.6-py3
+  win-vs2019-cuda11_7-py3-test:
+    name: win-vs2019-cuda11.7-py3
     uses: ./.github/workflows/_win-test.yml
-    needs: win-vs2019-cuda11_6-py3-build
+    needs: win-vs2019-cuda11_7-py3-build
     with:
-      build-environment: win-vs2019-cuda11.6-py3
-      cuda-version: "11.6"
-      test-matrix: ${{ needs.win-vs2019-cuda11_6-py3-build.outputs.test-matrix }}
+      build-environment: win-vs2019-cuda11.7-py3
+      cuda-version: "11.7"
+      test-matrix: ${{ needs.win-vs2019-cuda11_7-py3-build.outputs.test-matrix }}
 
-  linux-focal-rocm5_3-py3_8-build:
-    name: linux-focal-rocm5.3-py3.8
+  linux-focal-rocm5_4_2-py3_8-build:
+    name: linux-focal-rocm5.4.2-py3.8
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-rocm5.3-py3.8
-      docker-image-name: pytorch-linux-focal-rocm5.3-py3.8
+      build-environment: linux-focal-rocm5.4.2-py3.8
+      docker-image-name: pytorch-linux-focal-rocm-n-py3
       sync-tag: rocm-build
       test-matrix: |
         { include: [
@@ -284,14 +279,14 @@ jobs:
           { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
         ]}
 
-  linux-focal-rocm5_3-py3_8-test:
-    name: linux-focal-rocm5.3-py3.8
+  linux-focal-rocm5_4_2-py3_8-test:
+    name: linux-focal-rocm5.4.2-py3.8
     uses: ./.github/workflows/_rocm-test.yml
-    needs: linux-focal-rocm5_3-py3_8-build
+    needs: linux-focal-rocm5_4_2-py3_8-build
     with:
-      build-environment: linux-focal-rocm5.3-py3.8
-      docker-image: ${{ needs.linux-focal-rocm5_3-py3_8-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-rocm5_3-py3_8-build.outputs.test-matrix }}
+      build-environment: linux-focal-rocm5.4.2-py3.8
+      docker-image: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.test-matrix }}
     secrets:
       AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
       AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml
index 3d1d92967d88..9cdcd8a36ef0 100644
--- a/.github/workflows/trymerge.yml
+++ b/.github/workflows/trymerge.yml
@@ -25,7 +25,7 @@ jobs:
           check-latest: false
           cache: pip
           architecture: x64
-      - run: pip install pyyaml==6.0
+      - run: pip install pyyaml==6.0 rockset==1.0.3
 
       - name: Setup committer id
         run: |
@@ -40,6 +40,7 @@ jobs:
           LAND_CHECKS: ${{ github.event.client_payload.land_checks }}
           COMMENT_ID: ${{ github.event.client_payload.comment_id }}
           REBASE: ${{ github.event.client_payload.rebase }}
+          ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
         run: |
           set -ex
           if [ -n "${REBASE}" ]; then
diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml
index 1eeaf255c85f..39c157708392 100644
--- a/.github/workflows/unstable.yml
+++ b/.github/workflows/unstable.yml
@@ -29,25 +29,26 @@ jobs:
           echo " PR to trigger this workflow. That can be done either manually or"
           echo " automatically using PyTorch auto-label bot."
           echo
-          echo "Once the jobs are deemed stable enough (% red signal < 20% and TTS < 3h),"
+          echo "Once the jobs are deemed stable enough (% red signal < 5% and TTS < 3h),"
           echo " they can graduate and move back to pull or trunk."
 
-  linux-bionic-py3_7-clang8-xla-build:
-    name: linux-bionic-py3_7-clang8-xla
+  linux-focal-py3_8-gcc7-build:
+    name: linux-focal-py3.8-gcc7
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-bionic-py3_7-clang8-xla
-      docker-image-name: xla_base
+      build-environment: linux-focal-py3.8-gcc7
+      docker-image-name: pytorch-linux-focal-py3.8-gcc7
       test-matrix: |
         { include: [
-          { config: "xla", shard: 1, num_shards: 1, runner: "linux.4xlarge" },
+          { config: "distributed", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
+          { config: "distributed", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
         ]}
 
-  linux-bionic-py3_7-clang8-xla-test:
-    name: linux-bionic-py3_7-clang8-xla
+  linux-focal-py3_8-gcc7-test:
+    name: linux-focal-py3.8-gcc7
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-py3_7-clang8-xla-build
+    needs: linux-focal-py3_8-gcc7-build
     with:
-      build-environment: linux-bionic-py3_7-clang8-xla
-      docker-image: ${{ needs.linux-bionic-py3_7-clang8-xla-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-py3_7-clang8-xla-build.outputs.test-matrix }}
+      build-environment: linux-focal-py3.8-gcc7
+      docker-image: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.test-matrix }}
diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml
index 12bf4e271f92..86e47f33d88a 100644
--- a/.github/workflows/update-viablestrict.yml
+++ b/.github/workflows/update-viablestrict.yml
@@ -27,14 +27,13 @@ jobs:
           check-latest: false
           cache: pip
           cache-dependency-path: |
-            **/.circleci/docker/requirements-ci.txt
+            **/.ci/docker/requirements-ci.txt
             **/.github/requirements-gha-cache.txt
 
       - name: Install Python Packages
         run: |
-          pip3 install rockset==0.8.10
+          pip3 install rockset==1.0.3
           pip3 install boto3==1.19.12
-          pip3 install six==1.16.0
 
       - name: Get latest viable commit
         env:
diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml
index 9b5daff3df5d..5d5a05cc8927 100644
--- a/.github/workflows/update_pytorch_labels.yml
+++ b/.github/workflows/update_pytorch_labels.yml
@@ -24,4 +24,4 @@ jobs:
           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
         run: |
           python3 -m pip install boto3==1.19.12
-          .github/scripts/export_pytorch_labels.py
+          .github/scripts/export_pytorch_labels.py pytorch pytorch
diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml
index 3f3db80670d8..0f1a74a5d9e1 100644
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@@ -2,7 +2,7 @@ name: Upload test stats
 
 on:
   workflow_run:
-    workflows: [pull, trunk, periodic, inductor]
+    workflows: [pull, trunk, periodic, inductor, inductor-A100-perf]
     types:
       - completed
 
@@ -41,9 +41,8 @@ jobs:
 
       - run: |
           pip3 install requests==2.26
-          pip3 install rockset==0.8.3
+          pip3 install rockset==1.0.3
           pip3 install boto3==1.19.12
-          pip3 install six==1.16.0
 
       - name: Upload test stats
         env:
diff --git a/.gitignore b/.gitignore
index 316383260d9d..9f7128d495a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -75,6 +75,7 @@ torch/nn/functional.pyi
 torch/utils/data/datapipes/datapipe.pyi
 torch/csrc/autograd/generated/*
 torch/csrc/lazy/generated/*.[!m]*
+torch_compile_debug/
 # Listed manually because some files in this directory are not generated
 torch/testing/_internal/generated/annotated_fn_args.py
 torch/testing/_internal/data/*.pt
@@ -131,11 +132,24 @@ torchgen/packaged/*
 .ipynb_checkpoints
 
 # Editor temporaries
+*.swa
+*.swb
+*.swc
+*.swd
+*.swe
+*.swf
+*.swg
+*.swh
+*.swi
+*.swj
+*.swk
+*.swl
+*.swm
 *.swn
 *.swo
 *.swp
-*.swm
 *~
+.~lock.*
 
 # macOS dir files
 .DS_Store
@@ -342,3 +356,4 @@ venv/
 
 # Log files
 *.log
+sweep/
diff --git a/.lintrunner.toml b/.lintrunner.toml
index 6356a5b5bad3..940dea358dd2 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -35,6 +35,7 @@ init_command = [
     'flake8-bugbear==20.1.4',
     'flake8-comprehensions==3.3.0',
     'flake8-executable==2.0.4',
+    'flake8-logging-format==0.9.0',
     'flake8-pyi==20.5.0',
     'mccabe==0.6.1',
     'pycodestyle==2.6.0',
@@ -122,6 +123,13 @@ exclude_patterns = [
     'caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py',
     'test/test_numpy_interop.py',
     'torch/torch_version.py',
+    'torch/fx/proxy.py',
+    'torch/fx/passes/shape_prop.py',
+    'torch/fx/node.py',
+    'torch/fx/experimental/symbolic_shapes.py',
+    'torch/fx/experimental/proxy_tensor.py',
+    'torch/_subclasses/fake_utils.py',
+    'torch/_subclasses/fake_tensor.py',
 ]
 command = [
     'python3',
@@ -138,7 +146,6 @@ init_command = [
     'expecttest==0.1.3',
     'mypy==0.960',
     'types-requests==2.27.25',
-    'types-six==1.16.15',
     'types-PyYAML==6.0.7',
     'types-tabulate==0.8.8',
     'types-protobuf==3.19.18',
@@ -222,7 +229,6 @@ exclude_patterns = [
     # caffe2_pb.h, otherwise we'd have to build protos as part of this CI job.
     # FunctionsManual.cpp is excluded to keep this diff clean. It will be fixed
     # in a follow up PR.
-    # /torch/csrc/generic/*.cpp is excluded because those files aren't actually built.
     # that are not easily converted to accepted c++
     'c10/test/**/*.cpp',
     'torch/csrc/jit/passes/onnx/helper.cpp',
@@ -235,7 +241,6 @@ exclude_patterns = [
     'torch/csrc/cuda/nccl.*',
     'torch/csrc/cuda/python_nccl.cpp',
     'torch/csrc/autograd/FunctionsManual.cpp',
-    'torch/csrc/generic/*.cpp',
     'torch/csrc/jit/codegen/cuda/runtime/*',
     'torch/csrc/utils/disable_torch_function.cpp',
 ]
@@ -345,12 +350,24 @@ command = [
 ]
 is_formatter = true
 
+[[linter]]
+code = 'CONSTEXPR'
+include_patterns=['aten/src/ATen/native/cuda/*.cu']
+command = [
+    'python3',
+    'tools/linter/adapters/constexpr_linter.py',
+    '--',
+    '@{{PATHSFILE}}',
+]
+is_formatter = true
+
 [[linter]]
 code = 'SPACES'
 include_patterns = ['**']
 exclude_patterns = [
     '**/contrib/**',
     '**/*.diff',
+    '**/*.patch',
     'third_party/**',
     'aten/src/ATen/native/vulkan/api/vk_mem_alloc.h',
     'test/cpp/jit/upgrader_models/*.ptl',
@@ -680,7 +697,7 @@ init_command = [
 [[linter]]
 code = 'SHELLCHECK'
 include_patterns = [
-    '.jenkins/pytorch/**/*.sh'
+    '.ci/pytorch/**/*.sh'
 ]
 command = [
     'python3',
@@ -826,9 +843,10 @@ include_patterns = [
     'torch/_*.py',
     'torch/testing/_internal/opinfo/**/*.py',
     'torchgen/**/*.py',
-    'functorch/functorch/_src/aot_autograd.py',
-    'functorch/functorch/_src/compilers.py',
     'torch/_functorch/make_functional.py',
+    'torch/_functorch/functional_call.py',
+    'torch/nn/utils/_named_member_accessor.py',
+    'torch/nn/utils/stateless.py',
     'torch/testing/*.py',
     'torch/distributed/fsdp/**/*.py',
     'test/distributed/fsdp/**/*.py',
@@ -836,6 +854,9 @@ include_patterns = [
     'torch/distributed/_composable/**/*.py',
     'test/distributed/_composable/**/*.py',
     'torch/testing/_internal/common_dist_composable.py',
+    'test/test_value_ranges.py',
+    'torch/utils/_sympy/interp.py',
+    'torch/utils/_sympy/reference.py',
 ]
 command = [
     'python3',
@@ -875,3 +896,24 @@ command = [
     '--',
     '@{{PATHSFILE}}'
 ]
+
+[[linter]]
+code = 'BAZEL_LINTER'
+include_patterns = ['WORKSPACE']
+command = [
+    'python3',
+    'tools/linter/adapters/bazel_linter.py',
+    '--binary=.lintbin/bazel',
+    '--',
+    '@{{PATHSFILE}}'
+]
+init_command = [
+    'python3',
+    'tools/linter/adapters/s3_init.py',
+    '--config-json=tools/linter/adapters/s3_init_config.json',
+    '--linter=bazel',
+    '--dry-run={{DRYRUN}}',
+    '--output-dir=.lintbin',
+    '--output-name=bazel',
+]
+is_formatter = true
diff --git a/.vscode/settings_recommended.json b/.vscode/settings_recommended.json
index e9eae8ead3c9..db356b7d16fe 100644
--- a/.vscode/settings_recommended.json
+++ b/.vscode/settings_recommended.json
@@ -1,12 +1,16 @@
 {
-  "[python]": {
-    "editor.tabSize": 4
-  },
-  "files.eol": "\n",
-  "files.insertFinalNewline": true,
-  "files.trimFinalNewlines": true,
-  "files.trimTrailingWhitespace": true,
-  "python.formatting.provider": "none",
-  "python.linting.enabled": true,
-  "python.linting.flake8Enabled": true
+    "[python]": {
+        "editor.tabSize": 4
+    },
+    "files.associations": {
+        "*.py.in": "python",
+        "*.pyi.in": "python"
+    },
+    "files.eol": "\n",
+    "files.insertFinalNewline": true,
+    "files.trimFinalNewlines": true,
+    "files.trimTrailingWhitespace": true,
+    "python.formatting.provider": "none",
+    "python.linting.enabled": true,
+    "python.linting.flake8Enabled": true
 }
diff --git a/BUILD.bazel b/BUILD.bazel
index 887647b2363e..843b27a8f83d 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1573,25 +1573,7 @@ cc_library(
 )
 
 # torch
-py_binary(
-    name = "stringify_file",
-    srcs = ["torch/csrc/jit/codegen/cuda/tools/stringify_file.py"],
-)
-
-generated_nvfuser_hdrs = ["generated_" + hdr for hdr in libtorch_nvfuser_generated_headers]
-
-[
-    genrule(
-        name = name,
-        srcs = [src],
-        outs = ["nvfuser_resources/{}".format(hdr)],
-        cmd = "$(location :stringify_file) -i $< -o $@",
-        tools = [":stringify_file"],
-    )
-    for name, src, hdr in zip(generated_nvfuser_hdrs, libtorch_nvfuser_runtime_sources, libtorch_nvfuser_generated_headers)
-]
-
-torch_cuda_headers = glob(["torch/csrc/cuda/*.h"]) + generated_nvfuser_hdrs
+torch_cuda_headers = glob(["torch/csrc/cuda/*.h"])
 
 cc_library(
     name = "torch_headers",
@@ -1603,7 +1585,6 @@ cc_library(
             "torch/csrc/**/*.h",
             "torch/csrc/distributed/c10d/*.hpp",
             "torch/lib/libshm/*.h",
-            "torch/csrc/generic/*.cpp",
         ],
         exclude = [
             "torch/csrc/autograd/generated/VariableType.h",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7081ad868298..b9addcf005b3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,6 +8,12 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 cmake_policy(SET CMP0010 NEW)
 cmake_policy(SET CMP0025 NEW)
 
+# Enables CMake to set LTO on compilers other than Intel.
+cmake_policy(SET CMP0069 NEW)
+# Enable the policy for CMake subprojects.
+# protobuf currently causes issues
+#set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
+
 # Suppress warning flags in default MSVC configuration.  It's not
 # mandatory that we do this (and we don't if cmake is old), but it's
 # nice when it's possible, and it's possible on our Windows configs.
@@ -34,14 +40,19 @@ endif()
 set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard whose features are requested to build this target.")
 set(CMAKE_C_STANDARD   11 CACHE STRING "The C standard whose features are requested to build this target.")
 
-if(DEFINED GLIBCXX_USE_CXX11_ABI)
+# ---[ Utils
+include(cmake/public/utils.cmake)
+
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+  include(cmake/CheckAbi.cmake)
+  string(APPEND CMAKE_CXX_FLAGS " -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}")
   if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1)
     set(CXX_STANDARD_REQUIRED ON)
-    string(APPEND CMAKE_CXX_FLAGS " -D_GLIBCXX_USE_CXX11_ABI=1")
   else()
     # Please note this is required in order to ensure compatibility between gcc 9 and gcc 7
     # This could be removed when all Linux PyTorch binary builds are compiled by the same toolchain again
-    string(APPEND CMAKE_CXX_FLAGS " -fabi-version=11")
+    include(CheckCXXCompilerFlag)
+    append_cxx_flag_if_supported("-fabi-version=11" CMAKE_CXX_FLAGS)
   endif()
 endif()
 
@@ -183,6 +194,9 @@ option(USE_TSAN "Use Thread Sanitizer" OFF)
 option(USE_CUDA "Use CUDA" ON)
 cmake_dependent_option(
      BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
+cmake_dependent_option(
+    BUILD_NVFUSER "Build NVFUSER" ON
+    "USE_CUDA OR USE_ROCM" OFF)
 option(USE_FAST_NVCC "Use parallel NVCC build" OFF)
 cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF)
 option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)
@@ -465,21 +479,6 @@ if(MSVC)
         string(REGEX REPLACE "/Z[iI]" "/Z7" ${flag_var} "${${flag_var}}")
       endif(${flag_var} MATCHES "/Z[iI]")
     endif(MSVC_Z7_OVERRIDE)
-    # Turn off warnings on Windows.  In an ideal world we'd be warning
-    # clean on Windows too, but this is too much work for our
-    # non-Windows developers.
-    #
-    # NB: Technically, this is not necessary if CMP0092 was applied
-    # properly, but only cmake >= 3.15 has this policy, so we nail
-    # it one more time just be safe.
-    #
-    # NB2: This is NOT enough to prevent warnings from nvcc on MSVC.  At the
-    # moment only CMP0092 is enough to prevent those warnings too.
-    string(REPLACE "/W3" "" ${flag_var} "${${flag_var}}")
-
-    # Turn off warnings (Windows build is currently is extremely warning
-    # unclean and the warnings aren't telling us anything useful.)
-    string(APPEND ${flag_var} " /w")
 
     if(${CAFFE2_USE_MSVC_STATIC_RUNTIME})
       if(${flag_var} MATCHES "/MD")
@@ -630,9 +629,6 @@ if(INTERN_BUILD_MOBILE)
   set(INTERN_DISABLE_MOBILE_INTERP ON)
 endif()
 
-# ---[ Utils
-include(cmake/public/utils.cmake)
-
 # ---[ Version numbers for generated libraries
 file(READ version.txt TORCH_DEFAULT_VERSION)
 # Strip trailing newline
@@ -717,7 +713,7 @@ include(cmake/Dependencies.cmake)
 cmake_dependent_option(
   USE_FLASH_ATTENTION
   "Whether to build the flash_attention kernel for scaled dot product attention" ON
-  "USE_CUDA AND NOT ROCM AND NOT MSVC AND NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6" OFF)
+  "USE_CUDA AND NOT ROCM AND NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6" OFF)
 if(USE_FLASH_ATTENTION)
     ADD_DEFINITIONS(-DUSE_FLASH_ATTENTION)
 ENDIF()
@@ -777,7 +773,7 @@ if(USE_SOURCE_DEBUG_ON_MOBILE)
   string(APPEND CMAKE_CXX_FLAGS " -DSYMBOLICATE_MOBILE_DEBUG_HANDLE")
 endif()
 
-if(USE_LITE_INTERPRETER_PROFILER)
+if(BUILD_LITE_INTERPRETER AND USE_LITE_INTERPRETER_PROFILER)
   string(APPEND CMAKE_CXX_FLAGS " -DEDGE_PROFILER_USE_KINETO")
 endif()
 
@@ -806,13 +802,11 @@ if(NOT MSVC)
   append_cxx_flag_if_supported("-Werror=braced-scalar-init" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Werror=range-loop-construct" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Werror=bool-operation" CMAKE_CXX_FLAGS)
-  append_cxx_flag_if_supported("-Winconsistent-missing-override" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wnarrowing" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-missing-field-initializers" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-type-limits" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-array-bounds" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-unknown-pragmas" CMAKE_CXX_FLAGS)
-  append_cxx_flag_if_supported("-Wunused-local-typedefs" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-unused-parameter" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-unused-function" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-unused-result" CMAKE_CXX_FLAGS)
@@ -863,21 +857,16 @@ if(NOT MSVC)
   endif()
 
   append_cxx_flag_if_supported("-Wno-error=pedantic" CMAKE_CXX_FLAGS)
-  append_cxx_flag_if_supported("-Wno-error=redundant-decls" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-error=old-style-cast" CMAKE_CXX_FLAGS)
   # These flags are not available in GCC-4.8.5. Set only when using clang.
   # Compared against https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Option-Summary.html
   if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
     append_cxx_flag_if_supported("-Wconstant-conversion" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wno-invalid-partial-specialization" CMAKE_CXX_FLAGS)
-    append_cxx_flag_if_supported("-Wno-typedef-redefinition" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wno-unused-private-field" CMAKE_CXX_FLAGS)
-    append_cxx_flag_if_supported("-Wno-inconsistent-missing-override" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wno-aligned-allocation-unavailable" CMAKE_CXX_FLAGS)
-    append_cxx_flag_if_supported("-Wno-constexpr-not-const" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wno-missing-braces" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wunused-lambda-capture" CMAKE_CXX_FLAGS)
-    append_cxx_flag_if_supported("-Wunused-local-typedef" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Qunused-arguments" CMAKE_CXX_FLAGS)
     if(${USE_COLORIZE_OUTPUT})
     endif()
@@ -911,6 +900,35 @@ if(NOT MSVC)
   append_cxx_flag_if_supported("-fno-trapping-math" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Werror=format" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Werror=cast-function-type" CMAKE_CXX_FLAGS)
+else()
+  # skip unwanted includes from windows.h
+  add_compile_definitions(WIN32_LEAN_AND_MEAN)
+  # Windows SDK broke compatibility since version 25131, but introduced this
+  # define for backward compatibility.
+  add_compile_definitions(_UCRT_LEGACY_INFINITY)
+  # disable min/max macros
+  add_compile_definitions(NOMINMAX)
+  # The source code is in utf-8 encoding
+  append_cxx_flag_if_supported("/utf-8" CMAKE_CXX_FLAGS)
+  # Turn off these warnings on Windows.
+  # destructor was implicitly defined as delete
+  append_cxx_flag_if_supported("/wd4624" CMAKE_CXX_FLAGS)
+  # unknown pragma
+  append_cxx_flag_if_supported("/wd4068" CMAKE_CXX_FLAGS)
+  # unexpected tokens following preprocessor directive - expected a newline
+  append_cxx_flag_if_supported("/wd4067" CMAKE_CXX_FLAGS)
+  # conversion from 'size_t' to 'unsigned int', possible loss of data
+  append_cxx_flag_if_supported("/wd4267" CMAKE_CXX_FLAGS)
+  # no suitable definition provided for explicit template instantiation request
+  append_cxx_flag_if_supported("/wd4661" CMAKE_CXX_FLAGS)
+  # recursive on all control paths, function will cause runtime stack overflow
+  append_cxx_flag_if_supported("/wd4717" CMAKE_CXX_FLAGS)
+  # conversion from '_Ty' to '_Ty', possible loss of data
+  append_cxx_flag_if_supported("/wd4244" CMAKE_CXX_FLAGS)
+  # unsafe use of type 'bool' in operation
+  append_cxx_flag_if_supported("/wd4804" CMAKE_CXX_FLAGS)
+  # inconsistent dll linkage
+  append_cxx_flag_if_supported("/wd4273" CMAKE_CXX_FLAGS)
 endif()
 
 if(USE_ASAN)
@@ -978,7 +996,6 @@ if(APPLE)
     endif()
     append_cxx_flag_if_supported("-Wno-unused-private-field" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wno-missing-braces" CMAKE_CXX_FLAGS)
-    append_cxx_flag_if_supported("-Wno-constexpr-not-const" CMAKE_CXX_FLAGS)
 endif()
 
 if(EMSCRIPTEN)
@@ -1111,7 +1128,6 @@ if(BUILD_SHARED_LIBS)
       ${PROJECT_SOURCE_DIR}/cmake/public/mkl.cmake
       ${PROJECT_SOURCE_DIR}/cmake/public/mkldnn.cmake
       ${PROJECT_SOURCE_DIR}/cmake/public/protobuf.cmake
-      ${PROJECT_SOURCE_DIR}/cmake/public/threads.cmake
       ${PROJECT_SOURCE_DIR}/cmake/public/utils.cmake
       ${PROJECT_SOURCE_DIR}/cmake/public/LoadHIP.cmake
       DESTINATION share/cmake/Caffe2/public
@@ -1120,6 +1136,10 @@ if(BUILD_SHARED_LIBS)
       ${PROJECT_SOURCE_DIR}/cmake/Modules_CUDA_fix
       DESTINATION share/cmake/Caffe2/
       COMPONENT dev)
+  install(FILES
+      ${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUDAToolkit.cmake
+      DESTINATION share/cmake/Caffe2/
+      COMPONENT dev)
 
   install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2
       FILE Caffe2Targets.cmake
@@ -1156,6 +1176,19 @@ if(BUILD_JNI)
   add_subdirectory(android/pytorch_android)
 endif()
 
+if(NOT USE_CUDA AND NOT USE_ROCM)
+  set(BUILD_NVFUSER OFF CACHE BOOL "BUILD nvfuser" FORCE)
+endif()
+
+if(BUILD_NVFUSER)
+  if(DEFINED ENV{NVFUSER_SOURCE_DIR})
+    add_subdirectory($ENV{NVFUSER_SOURCE_DIR} nvfuser)
+  else()
+    add_subdirectory(third_party/nvfuser nvfuser)
+  endif()
+  add_compile_definitions(BUILD_NVFUSER)
+endif()
+
 include(cmake/Summary.cmake)
 caffe2_print_configuration_summary()
 
diff --git a/CODEOWNERS b/CODEOWNERS
index 1dcdfb161b74..46de19276d2f 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -45,16 +45,16 @@ nn/qat/ @jerryzh168
 # Distributed package
 # This list is mostly if you'd like to be tagged as reviewer, feel free to add
 # or remove yourself from it.
-/torch/csrc/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol
-/torch/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol
-/torch/distributed/_composable @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501 @yhcharles
-/torch/nn/parallel/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol
+/torch/csrc/distributed/ @mrshenli @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol @fegin
+/torch/distributed/ @mrshenli @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol @fegin
+/torch/distributed/_composable @mrshenli @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501 @yhcharles @fegin
+/torch/nn/parallel/ @mrshenli @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol @fegin
 
 # Distributed tests
 # This list is mostly if you'd like to be tagged as reviewer, feel free to add
 # or remove yourself from it.
-/test/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol
-/torch/testing/_internal/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol
+/test/distributed @mrshenli @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol @fegin
+/torch/testing/_internal/distributed @mrshenli @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol @fegin
 
 # ONNX Export
 /torch/csrc/jit/passes/onnx.h @bowenbao @abock
@@ -64,7 +64,7 @@ nn/qat/ @jerryzh168
 /test/onnx/ @bowenbao @abock
 
 # Docker
-/.circleci/docker/ @jeffdaily
+/.ci/docker/ @jeffdaily
 
 # Github Actions
 # This list is for people wanting to be notified every time there's a change
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a38b0ad60513..7afb4d5cce90 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -958,8 +958,8 @@ than Linux, which are worth keeping in mind when fixing these problems.
    transitive dependencies can be used to fulfill unresolved symbols.)
 
 3. If you have a Windows box (we have a few on EC2 which you can request access to) and
-   you want to run the build, the easiest way is to just run `.jenkins/pytorch/win-build.sh`.
-   If you need to rebuild, run `REBUILD=1 .jenkins/pytorch/win-build.sh` (this will avoid
+   you want to run the build, the easiest way is to just run `.ci/pytorch/win-build.sh`.
+   If you need to rebuild, run `REBUILD=1 .ci/pytorch/win-build.sh` (this will avoid
    blowing away your Conda environment.)
 
 Even if you don't know anything about MSVC, you can use cmake to build simple programs on
diff --git a/Dockerfile b/Dockerfile
index 36e6a57bc95c..e5bd901a33c9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,7 +11,7 @@ ARG BASE_IMAGE=ubuntu:18.04
 ARG PYTHON_VERSION=3.8
 
 FROM ${BASE_IMAGE} as dev-base
-RUN apt-get update && apt-get install -y --no-install-recommends \
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
         build-essential \
         ca-certificates \
         ccache \
@@ -36,8 +36,9 @@ RUN case ${TARGETPLATFORM} in \
     esac && \
     curl -fsSL -v -o ~/miniconda.sh -O  "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
 COPY requirements.txt .
+# Manually invoke bash on miniconda script per https://github.com/conda/conda/issues/10431
 RUN chmod +x ~/miniconda.sh && \
-    ~/miniconda.sh -b -p /opt/conda && \
+    bash ~/miniconda.sh -b -p /opt/conda && \
     rm ~/miniconda.sh && \
     /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
     /opt/conda/bin/python -mpip install -r requirements.txt && \
@@ -59,7 +60,7 @@ RUN --mount=type=cache,target=/opt/ccache \
 
 FROM conda as conda-installs
 ARG PYTHON_VERSION=3.8
-ARG CUDA_VERSION=11.6
+ARG CUDA_VERSION=11.7
 ARG CUDA_CHANNEL=nvidia
 ARG INSTALL_CHANNEL=pytorch-nightly
 # Automatically set by buildx
@@ -67,7 +68,7 @@ RUN /opt/conda/bin/conda update -y conda
 RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION}
 ARG TARGETPLATFORM
 
-# On arm64 we can only install wheel packages
+# On arm64 we can only install wheel packages.
 RUN case ${TARGETPLATFORM} in \
          "linux/arm64")  pip install --extra-index-url https://download.pytorch.org/whl/cpu/ torch torchvision torchaudio torchtext ;; \
          *)              /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch torchvision torchaudio torchtext "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
@@ -81,20 +82,16 @@ ARG TRITON_VERSION
 ARG TARGETPLATFORM
 ARG CUDA_VERSION
 LABEL com.nvidia.volumes.needed="nvidia_driver"
-RUN apt-get update && apt-get install -y --no-install-recommends \
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
         ca-certificates \
         libjpeg-dev \
-        libpng-dev
+        libpng-dev \
+        && rm -rf /var/lib/apt/lists/*
 COPY --from=conda-installs /opt/conda /opt/conda
 RUN if test -n "${TRITON_VERSION}" -a "${TARGETPLATFORM}" != "linux/arm64"; then \
-        apt install -y --no-install-recommends gcc; \
-        CU_VER=$(echo $CUDA_VERSION | cut -d'.' -f 1-2) && \
-        mkdir -p /usr/local/triton-min-cuda-${CU_VER} && \
-        ln -s /usr/local/triton-min-cuda-${CU_VER} /usr/local/cuda; \
-        mkdir -p /usr/local/cuda/bin; cp /opt/conda/bin/ptxas /usr/local/cuda/bin; \
-        mkdir -p /usr/local/cuda/include; cp /opt/conda/include/cuda.h /usr/local/cuda/include; \
+        DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends gcc; \
+        rm -rf /var/lib/apt/lists/*; \
     fi
-RUN rm -rf /var/lib/apt/lists/*
 ENV PATH /opt/conda/bin:$PATH
 ENV NVIDIA_VISIBLE_DEVICES all
 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
diff --git a/NOTICE b/NOTICE
index 5abaac479a75..6effb8b5d707 100644
--- a/NOTICE
+++ b/NOTICE
@@ -416,3 +416,41 @@ derivation and reference the following license:
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
+
+=======================================================================
+PILLOW-SIMD Software License
+=======================================================================
+
+Code derived from implementations in PILLOW-SIMD should mention its derivation
+and reference the following license:
+
+    The Python Imaging Library (PIL) is
+
+        Copyright © 1997-2011 by Secret Labs AB
+        Copyright © 1995-2011 by Fredrik Lundh
+
+    Pillow is the friendly PIL fork. It is
+
+        Copyright © 2010-2022 by Alex Clark and contributors
+
+    Like PIL, Pillow is licensed under the open source HPND License:
+
+    By obtaining, using, and/or copying this software and/or its associated
+    documentation, you agree that you have read, understood, and will comply
+    with the following terms and conditions:
+
+    Permission to use, copy, modify, and distribute this software and its
+    associated documentation for any purpose and without fee is hereby granted,
+    provided that the above copyright notice appears in all copies, and that
+    both that copyright notice and this permission notice appear in supporting
+    documentation, and that the name of Secret Labs AB or the author not be
+    used in advertising or publicity pertaining to distribution of the software
+    without specific, written prior permission.
+
+    SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+    SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
+    IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL,
+    INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+    LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+    OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+    PERFORMANCE OF THIS SOFTWARE.
diff --git a/README.md b/README.md
index fceaef190805..087171c76133 100644
--- a/README.md
+++ b/README.md
@@ -155,7 +155,7 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)
 
 #### Prerequisites
 If you are installing from source, you will need:
-- Python 3.7 or later (for Linux, Python 3.7.6+ or 3.8.1+ is needed)
+- Python 3.8 or later (for Linux, Python 3.8.1+ is needed)
 - A C++17 compatible compiler, such as clang
 
 We highly recommend installing an [Anaconda](https://www.anaconda.com/distribution/#download-section) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.
@@ -184,7 +184,9 @@ Other potentially useful environment variables may be found in `setup.py`.
 **Common**
 
 ```bash
-conda install astunparse numpy ninja pyyaml setuptools cmake typing_extensions six requests dataclasses
+conda install cmake ninja
+# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section below
+pip install -r requirements.txt
 ```
 
 **On Linux**
@@ -237,15 +239,15 @@ export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
 python setup.py develop
 ```
 
-Note that if you are using [Anaconda](https://www.anaconda.com/distribution/#download-section), you may experience an error caused by the linker:
-
-```plaintext
-build/temp.linux-x86_64-3.7/torch/csrc/stub.o: file not recognized: file format not recognized
-collect2: error: ld returned 1 exit status
-error: command 'g++' failed with exit status 1
-```
-
-This is caused by `ld` from the Conda environment shadowing the system `ld`. You should use a newer version of Python that fixes this issue. The recommended Python version is 3.7.6+ and 3.8.1+.
+> _Aside:_ If you are using [Anaconda](https://www.anaconda.com/distribution/#download-section), you may experience an error caused by the linker:
+>
+> ```plaintext
+> build/temp.linux-x86_64-3.7/torch/csrc/stub.o: file not recognized: file format not recognized
+> collect2: error: ld returned 1 exit status
+> error: command 'g++' failed with exit status 1
+> ```
+>
+> This is caused by `ld` from the Conda environment shadowing the system `ld`. You should use a newer version of Python that fixes this issue. The recommended Python version is 3.8.1+.
 
 **On macOS**
 
@@ -287,9 +289,9 @@ Currently, VS 2017 / 2019, and Ninja are supported as the generator of CMake. If
 <br/> If Ninja is selected as the generator, the latest MSVC will get selected as the underlying toolchain.
 
 Additional libraries such as
-[Magma](https://developer.nvidia.com/magma), [oneDNN, a.k.a MKLDNN or DNNL](https://github.com/oneapi-src/oneDNN), and [Sccache](https://github.com/mozilla/sccache) are often needed. Please refer to the [installation-helper](https://github.com/pytorch/pytorch/tree/master/.jenkins/pytorch/win-test-helpers/installation-helpers) to install them.
+[Magma](https://developer.nvidia.com/magma), [oneDNN, a.k.a MKLDNN or DNNL](https://github.com/oneapi-src/oneDNN), and [Sccache](https://github.com/mozilla/sccache) are often needed. Please refer to the [installation-helper](https://github.com/pytorch/pytorch/tree/master/.ci/pytorch/win-test-helpers/installation-helpers) to install them.
 
-You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob/master/.jenkins/pytorch/win-test-helpers/build_pytorch.bat) script for some other environment variables configurations
+You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob/master/.ci/pytorch/win-test-helpers/build_pytorch.bat) script for some other environment variables configurations
 
 
 ```cmd
diff --git a/RELEASE.md b/RELEASE.md
index 1c0255dcfb9b..f53ea80fc4c8 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -2,6 +2,7 @@
 
 <!-- toc -->
 
+  - [Release Compatibility Matrix](#release-compatibility-matrix)
   - [General Overview](#general-overview)
   - [Cutting a release branch preparations](#cutting-a-release-branch-preparations)
   - [Cutting release branches](#cutting-release-branches)
@@ -34,6 +35,16 @@
 
 <!-- tocstop -->
 
+## Release Compatibility Matrix
+
+Following is the Release Compatibility Matrix for PyTorch releases:
+
+| PyTorch version | Python | Stable CUDA | Experimental CUDA |
+| --- | --- | --- | --- |
+| 2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 |
+| 1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 |
+| 1.12 | >=3.7, <=3.10 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 |
+
 ## General Overview
 
 Releasing a new version of PyTorch generally entails 3 major steps:
@@ -95,6 +106,7 @@ them:
 * Update backwards compatibility tests to use RC binaries instead of nightlies
   * Example: https://github.com/pytorch/pytorch/pull/77983 and https://github.com/pytorch/pytorch/pull/77986
 * A release branches should also be created in [`pytorch/xla`](https://github.com/pytorch/xla) and [`pytorch/builder`](https://github.com/pytorch/builder) repos and pinned in `pytorch/pytorch`
+  * Example: https://github.com/pytorch/pytorch/pull/86290 and https://github.com/pytorch/pytorch/pull/90506
 
 These are examples of changes that should be made to the *default* branch after a release branch is cut
 
diff --git a/SECURITY.md b/SECURITY.md
index 5faa2fb1da47..0651f82b70c6 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -2,6 +2,8 @@
 
 If you believe you have found a security vulnerability in PyTorch, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
 
+Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new
+
 Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:
 
 https://www.facebook.com/whitehat
diff --git a/WORKSPACE b/WORKSPACE
index e8591f291abd..9272e448c50a 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -3,16 +3,27 @@ workspace(name = "pytorch")
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("//tools/rules:workspace.bzl", "new_patched_local_repository")
 
+http_archive(
+    name = "rules_cc",
+    strip_prefix = "rules_cc-40548a2974f1aea06215272d9c2b47a14a24e556",
+    patches = [
+        "//:tools/rules_cc/cuda_support.patch",
+    ],
+    urls = [
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_cc/archive/40548a2974f1aea06215272d9c2b47a14a24e556.tar.gz",
+        "https://github.com/bazelbuild/rules_cc/archive/40548a2974f1aea06215272d9c2b47a14a24e556.tar.gz",
+    ],
+)
+
 http_archive(
     name = "rules_cuda",
-    sha256 = "f80438bee9906e9ecb1a8a4ae2365374ac1e8a283897281a2db2fb7fcf746333",
     strip_prefix = "runtime-b1c7cce21ba4661c17ac72421c6a0e2015e7bef3/third_party/rules_cuda",
     urls = ["https://github.com/tensorflow/runtime/archive/b1c7cce21ba4661c17ac72421c6a0e2015e7bef3.tar.gz"],
 )
 
 load("@rules_cuda//cuda:dependencies.bzl", "rules_cuda_dependencies")
 
-rules_cuda_dependencies()
+rules_cuda_dependencies(with_rules_cc = False)
 
 load("@rules_cc//cc:repositories.bzl", "rules_cc_toolchains")
 
@@ -29,7 +40,6 @@ http_archive(
   name = "pybind11_bazel",
   strip_prefix = "pybind11_bazel-992381ced716ae12122360b0fbadbc3dda436dbf",
   urls = ["https://github.com/pybind/pybind11_bazel/archive/992381ced716ae12122360b0fbadbc3dda436dbf.zip"],
-  sha256 = "3dc6435bd41c058453efe102995ef084d0a86b0176fd6a67a6b7100a2e9a940e",
 )
 
 new_local_repository(
@@ -52,7 +62,6 @@ http_archive(
     urls = [
         "https://github.com/gflags/gflags/archive/v2.2.2.tar.gz",
     ],
-    sha256 = "34af2f15cf7367513b352bdcd2493ab14ce43692d2dcd9dfc499492966c64dcf",
 )
 
 new_local_repository(
@@ -258,6 +267,31 @@ local_repository(
     path = "third_party/fbgemm",
 )
 
+local_repository(
+    name = "unused_ftm_bazel",
+    path = "third_party/fmt/support/bazel",
+)
+
+local_repository(
+    name = "unused_kineto_fmt_bazel",
+    path = "third_party/kineto/libkineto/third_party/fmt/support/bazel",
+)
+
+local_repository(
+    name = "unused_kineto_dynolog_googletest",
+    path = "third_party/kineto/libkineto/third_party/dynolog/third_party/googletest",
+)
+
+local_repository(
+    name = "unused_kineto_dynolog_gflags",
+    path = "third_party/kineto/libkineto/third_party/dynolog/third_party/gflags",
+)
+
+local_repository(
+    name = "unused_kineto_dynolog_glog",
+    path = "third_party/kineto/libkineto/third_party/dynolog/third_party/glog",
+)
+
 local_repository(
     name = "unused_kineto_googletest",
     path = "third_party/kineto/libkineto/third_party/googletest",
diff --git a/android/README.md b/android/README.md
index 99ae265105f5..e13344aebe52 100644
--- a/android/README.md
+++ b/android/README.md
@@ -111,12 +111,12 @@ dependencies {
     implementation(name:'pytorch_android', ext:'aar')
     implementation(name:'pytorch_android_torchvision', ext:'aar')
     ...
-    implementation 'com.facebook.soloader:nativeloader:0.10.4'
+    implementation 'com.facebook.soloader:nativeloader:0.10.5'
     implementation 'com.facebook.fbjni:fbjni-java-only:0.2.2'
 }
 ```
 We also have to add all transitive dependencies of our aars.
-As `pytorch_android` [depends](https://github.com/pytorch/pytorch/blob/master/android/pytorch_android/build.gradle#L76-L77) on `'com.facebook.soloader:nativeloader:0.10.4'` and `'com.facebook.fbjni:fbjni-java-only:0.2.2'`, we need to add them.
+As `pytorch_android` [depends](https://github.com/pytorch/pytorch/blob/master/android/pytorch_android/build.gradle#L76-L77) on `'com.facebook.soloader:nativeloader:0.10.5'` and `'com.facebook.fbjni:fbjni-java-only:0.2.2'`, we need to add them.
 (In case of using maven dependencies they are added automatically from `pom.xml`).
 
 You can check out [test app example](https://github.com/pytorch/pytorch/blob/master/android/test_app/app/build.gradle) that uses aars directly.
diff --git a/android/build.gradle b/android/build.gradle
index cd3755883f92..d58faaff95fd 100644
--- a/android/build.gradle
+++ b/android/build.gradle
@@ -13,7 +13,7 @@ allprojects {
             junitVersion = "4.12"
 
             fbjniJavaOnlyVersion = "0.2.2"
-            soLoaderNativeLoaderVersion = "0.10.4"
+            soLoaderNativeLoaderVersion = "0.10.5"
         }
 
         repositories {
diff --git a/android/pytorch_android/generate_test_torchscripts.py b/android/pytorch_android/generate_test_torchscripts.py
index 909f824fb26d..897c430c01f1 100644
--- a/android/pytorch_android/generate_test_torchscripts.py
+++ b/android/pytorch_android/generate_test_torchscripts.py
@@ -15,9 +15,6 @@ def scriptAndSave(module, fileName):
     print('=' * 80)
 
 class Test(torch.jit.ScriptModule):
-    def __init__(self):
-        super(Test, self).__init__()
-
     @torch.jit.script_method
     def forward(self, input):
         return None
diff --git a/android/test_app/app/build.gradle b/android/test_app/app/build.gradle
index d726e6424d88..71c58d4a5b90 100644
--- a/android/test_app/app/build.gradle
+++ b/android/test_app/app/build.gradle
@@ -139,7 +139,7 @@ tasks.all { task ->
 
 dependencies {
     implementation 'com.android.support:appcompat-v7:28.0.0'
-    implementation 'com.facebook.soloader:nativeloader:0.10.4'
+    implementation 'com.facebook.soloader:nativeloader:0.10.5'
 
     localImplementation project(':pytorch_android')
     localImplementation project(':pytorch_android_torchvision')
@@ -154,7 +154,7 @@ dependencies {
 
     aarImplementation(name:'pytorch_android', ext:'aar')
     aarImplementation(name:'pytorch_android_torchvision', ext:'aar')
-    aarImplementation 'com.facebook.soloader:nativeloader:0.10.4'
+    aarImplementation 'com.facebook.soloader:nativeloader:0.10.5'
     aarImplementation 'com.facebook.fbjni:fbjni-java-only:0.2.2'
 
     def camerax_version = "1.0.0-alpha05"
diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt
index 9ba141c29e42..f8780c3e8c8c 100644
--- a/aten/CMakeLists.txt
+++ b/aten/CMakeLists.txt
@@ -43,6 +43,7 @@ set(ATen_PUBLIC_HIP_DEPENDENCY_LIBS)
 set(ATEN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "ATen install binary subdirectory")
 set(ATEN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "ATen install library subdirectory")
 set(ATEN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "ATen install include subdirectory")
+set(MEM_EFF_ATTENTION_CUDA_SOURCES)
 
 if(USE_CUDA)
   list(APPEND ATen_CUDA_INCLUDE ${CUDA_INCLUDE_DIRS})
@@ -93,8 +94,7 @@ else()
 endif()
 
 list(APPEND ATen_CPU_INCLUDE
-  ${CMAKE_CURRENT_SOURCE_DIR}/src
-  ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/catch/single_include)
+  ${CMAKE_CURRENT_SOURCE_DIR}/src)
 add_subdirectory(src/ATen)
 
 # Pass source, includes, and libs to parent
@@ -125,3 +125,4 @@ set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE)
 set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
 set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE)
 set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE)
+set(MEM_EFF_ATTENTION_CUDA_SOURCES ${MEM_EFF_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index 2ec08f43d2e8..b50f38d82e14 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -32,6 +32,7 @@ set_bool(AT_BLAS_F2C BLAS_F2C)
 set_bool(AT_BLAS_USE_CBLAS_DOT BLAS_USE_CBLAS_DOT)
 set_bool(AT_MAGMA_ENABLED USE_MAGMA)
 set_bool(CAFFE2_STATIC_LINK_CUDA_INT CAFFE2_STATIC_LINK_CUDA)
+set_bool(AT_CUDNN_ENABLED CAFFE2_USE_CUDNN)
 
 configure_file(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")
 # TODO: Do not generate CUDAConfig.h for ROCm BUILDS
@@ -83,7 +84,7 @@ file(GLOB native_cpp "native/*.cpp")
 file(GLOB native_mkl_cpp "native/mkl/*.cpp")
 file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp")
 file(GLOB vulkan_cpp "vulkan/*.cpp")
-file(GLOB native_vulkan_cpp "native/vulkan/*.cpp" "native/vulkan/api/*.cpp" "native/vulkan/ops/*.cpp")
+file(GLOB native_vulkan_cpp "native/vulkan/*.cpp" "native/vulkan/api/*.cpp" "native/vulkan/impl/*.cpp" "native/vulkan/ops/*.cpp")
 
 # Metal
 file(GLOB metal_h "metal/*.h")
@@ -172,6 +173,7 @@ if(USE_FLASH_ATTENTION)
   list(APPEND native_transformers_cuda_cu ${mem_eff_attention_cuda_cu})
   list(APPEND native_transformers_cuda_cu ${mem_eff_attention_cuda_kernels_cu})
   list(APPEND native_transformers_cuda_cpp ${mem_eff_attention_cuda_cpp})
+  list(APPEND MEM_EFF_ATTENTION_CUDA_SOURCES ${native_transformers_cuda_cu} ${mem_eff_attention_cuda_cu} ${mem_eff_attention_cuda_kernels_cu})
 endif()
 
 # XNNPACK
@@ -437,25 +439,26 @@ if(USE_CUDA AND NOT USE_ROCM)
   if($ENV{ATEN_STATIC_CUDA})
     list(APPEND ATen_CUDA_DEPENDENCY_LIBS
       ${CUDA_LIBRARIES}
-      ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusparse_static.a
-      ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand_static.a
-      ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcufft_static_nocallback.a
-      )
+      CUDA::cusparse_static
+      CUDA::curand_static
+      CUDA::cufft_static_nocallback
+    )
    if(NOT BUILD_LAZY_CUDA_LINALG)
      list(APPEND ATen_CUDA_DEPENDENCY_LIBS
-       ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusolver_static.a
-       ${CUDA_TOOLKIT_ROOT_DIR}/lib64/liblapack_static.a     # needed for libcusolver_static
-       )
+       CUDA::cusolver_static
+       ${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a     # needed for libcusolver_static
+     )
    endif()
   else()
     list(APPEND ATen_CUDA_DEPENDENCY_LIBS
       ${CUDA_LIBRARIES}
-      ${CUDA_cusparse_LIBRARY}
-      ${CUDA_curand_LIBRARY}
-      )
+      CUDA::cusparse
+      CUDA::curand
+      CUDA::cufft
+    )
    if(NOT BUILD_LAZY_CUDA_LINALG)
      list(APPEND ATen_CUDA_DEPENDENCY_LIBS
-       ${CUDA_cusolver_LIBRARY}
+       CUDA::cusolver
      )
    endif()
   endif()
@@ -464,8 +467,10 @@ if(USE_CUDA AND NOT USE_ROCM)
     list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${CUDNN_LIBRARIES})
   endif()
   if($ENV{ATEN_STATIC_CUDA})
-    list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a")
-    list(APPEND ATen_CUDA_DEPENDENCY_LIBS "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart_static.a")
+    list(APPEND ATen_CUDA_DEPENDENCY_LIBS
+      CUDA::culibos
+      CUDA::cudart_static
+    )
   endif($ENV{ATEN_STATIC_CUDA})
 endif()
 
@@ -621,3 +626,4 @@ set(ATen_VULKAN_INCLUDE ${ATen_VULKAN_INCLUDE} PARENT_SCOPE)
 set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE)
 set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
 set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE)
+set(MEM_EFF_ATTENTION_CUDA_SOURCES ${MEM_EFF_ATTENTION_CUDA_SOURCES} PARENT_SCOPE)
diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h
index d98e07527293..0e87b0916fed 100644
--- a/aten/src/ATen/CPUApplyUtils.h
+++ b/aten/src/ATen/CPUApplyUtils.h
@@ -106,7 +106,7 @@ struct strided_tensor_iter {
 };
 
 inline bool _all_equal_numel(at::ArrayRef<Tensor> tensors) {
-  if (tensors.size() == 0)
+  if (tensors.empty())
     return true;
   int64_t all_numel = tensors[0].numel();
   for (const auto i : c10::irange(1, tensors.size())) {
diff --git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp
index 5fd06c442750..02ed04cc4895 100644
--- a/aten/src/ATen/CPUGeneratorImpl.cpp
+++ b/aten/src/ATen/CPUGeneratorImpl.cpp
@@ -1,7 +1,6 @@
 #include <ATen/CPUGeneratorImpl.h>
 #include <ATen/Utils.h>
 #include <ATen/core/MT19937RNGEngine.h>
-#include <c10/util/C++17.h>
 #include <c10/util/MathConstants.h>
 #include <algorithm>
 
diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
index b6cda72cf1e9..1ec545dfc060 100644
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@@ -2,18 +2,12 @@
 
 #include <ATen/Context.h>
 
-#include <c10/core/TensorOptions.h>
 #include <c10/core/CPUAllocator.h>
 
 #include <algorithm>
 #include <cctype>
-#include <mutex>
-#include <sstream>
-#include <stdexcept>
 #include <string>
-#include <thread>
 
-#include <ATen/Tensor.h>
 #include <ATen/cpu/FlushDenormal.h>
 
 #ifdef USE_FBGEMM
diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h
index ef938399ae05..3cbf0d5e8675 100644
--- a/aten/src/ATen/Dispatch.h
+++ b/aten/src/ATen/Dispatch.h
@@ -281,6 +281,21 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
       AT_DISPATCH_CASE_FLOATING_TYPES_AND2(    \
           SCALARTYPE1, SCALARTYPE2, __VA_ARGS__))
 
+#define AT_DISPATCH_CASE_FLOATING_TYPES_AND3(   \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)  \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES_AND3(                    \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE,                                                 \
+      NAME,                                                 \
+      AT_DISPATCH_CASE_FLOATING_TYPES_AND3(                 \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
 #define AT_DISPATCH_CASE_COMPLEX_TYPES(...)                    \
   AT_DISPATCH_CASE(at::ScalarType::ComplexDouble, __VA_ARGS__) \
   AT_DISPATCH_CASE(at::ScalarType::ComplexFloat, __VA_ARGS__)
diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp
index 55cdc09268f0..49fb917d01bc 100644
--- a/aten/src/ATen/EmptyTensor.cpp
+++ b/aten/src/ATen/EmptyTensor.cpp
@@ -164,7 +164,8 @@ TensorBase _empty_generic(
   auto tensor = detail::make_tensor_base<TensorImpl>(
       std::move(storage_impl), ks, dtype);
   // Default TensorImpl has size [0]
-  if (size.size() != 1 || size[0] != 0) {
+  // NB: test for meta dispatch key to avoid guarding on zero-ness
+  if (ks.has(c10::DispatchKey::Meta) || size.size() != 1 || size[0] != 0) {
     tensor.unsafeGetTensorImpl()->generic_set_sizes_contiguous(size);
   }
 
@@ -242,8 +243,7 @@ TensorBase empty_cpu(
     c10::optional<Device> device_opt,
     c10::optional<bool> pin_memory_opt,
     c10::optional<c10::MemoryFormat> memory_format_opt) {
-  auto device = device_or_default(device_opt);
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::CPU);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
 
   auto pin_memory = pinned_memory_or_default(pin_memory_opt);
@@ -277,8 +277,7 @@ TensorBase empty_strided_cpu(
     c10::optional<Layout> layout_opt,
     c10::optional<Device> device_opt,
     c10::optional<bool> pin_memory_opt) {
-  auto device = device_or_default(device_opt);
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::CPU);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
 
   auto pin_memory = pinned_memory_or_default(pin_memory_opt);
@@ -335,8 +334,7 @@ TensorBase empty_meta(
   c10::optional<bool> pin_memory_opt,
   c10::optional<c10::MemoryFormat> memory_format_opt
 ) {
-  auto device = device_or_default(device_opt);
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == DeviceType::Meta);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta);
   // NB: because there is no SparseMeta (yet), non-strided layout is
   // exerciseable
   TORCH_CHECK_NOT_IMPLEMENTED(
@@ -388,8 +386,7 @@ TensorBase empty_strided_meta(
     c10::optional<Layout> layout_opt,
     c10::optional<Device> device_opt,
     c10::optional<bool> pin_memory_opt) {
-  auto device = device_or_default(device_opt);
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == DeviceType::Meta);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
 
   auto dtype = dtype_or_default(dtype_opt);
@@ -424,8 +421,7 @@ TensorBase empty_strided_symint_meta(
     c10::optional<Layout> layout_opt,
     c10::optional<Device> device_opt,
     c10::optional<bool> pin_memory_opt) {
-  auto device = device_or_default(device_opt);
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == DeviceType::Meta);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
 
   auto dtype = dtype_or_default(dtype_opt);
diff --git a/aten/src/ATen/FunctionalInverses.cpp b/aten/src/ATen/FunctionalInverses.cpp
index 8a68503df329..8f99a5df73ce 100644
--- a/aten/src/ATen/FunctionalInverses.cpp
+++ b/aten/src/ATen/FunctionalInverses.cpp
@@ -172,7 +172,7 @@ Tensor FunctionalInverses::_reshape_alias_copy_inverse(const Tensor& base, const
 
 Tensor FunctionalInverses::select_copy_int_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dim, c10::SymInt index) {
     // Pessimism: we can't reapply views for slice_scatter.
-    return base.select_scatter_symint(mutated_view, dim, index);
+    return base.select_scatter_symint(mutated_view, dim, std::move(index));
 }
 
 Tensor FunctionalInverses::detach_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) {
diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp
index 14edae650005..edbdd289c8a7 100644
--- a/aten/src/ATen/FunctionalStorageImpl.cpp
+++ b/aten/src/ATen/FunctionalStorageImpl.cpp
@@ -3,7 +3,6 @@
 #include <ATen/EmptyTensor.h>
 #include <ATen/FunctionalTensorWrapper.h>
 #include <ATen/core/LegacyTypeDispatch.h>
-#include <c10/core/CPUAllocator.h>
 #include <c10/util/Exception.h>
 #include <vector>
 
@@ -43,9 +42,10 @@ ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
 const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) {
   at::Tensor t = update.new_val;
   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
-  if (update.view_metas.size() == 0) return t;
+  if (update.view_metas.empty()) return t;
 
   std::vector<at::Tensor> tmp_values({base});
+  tmp_values.reserve(update.view_metas.size());
   for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
     at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
     // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided
@@ -113,7 +113,7 @@ bool FunctionalStorageImpl::apply_updates() {
   // It adds the Functionalize key into TLS before redispatching to the functionalization kernels,
   // which means that we need to explicitly exclude it here before doing any other work underneath the pass.
   at::AutoDispatchSkipFunctionalize guard;
-  bool any_updates = updates_.size() > 0;
+  bool any_updates = !updates_.empty();
   for (auto& update_data: updates_) {
     base_ = apply_update(update_data, base_);
   }
diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp
index 4c2023def8e0..0b71d435c32c 100644
--- a/aten/src/ATen/FunctionalTensorWrapper.cpp
+++ b/aten/src/ATen/FunctionalTensorWrapper.cpp
@@ -132,7 +132,7 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const
 {
   set_constructor_metadata();
   // Copy the original tensor's ViewMeta vector and push the current one.
-  if (base->view_metas_.size() > 0) {
+  if (!base->view_metas_.empty()) {
       view_metas_ = base->view_metas_;  // copy
   }
   view_metas_.push_back(meta);
@@ -238,7 +238,7 @@ void FunctionalTensorWrapper::maybe_replace_storage(const Tensor& other) {
   //
   // Given all of the above, for now we're just banning the above usage.
   TORCH_CHECK(storage().use_count() == 1, "Attempted to resize a view tensor to a larger size. This is not allowed in the functionalization pass");
-  TORCH_CHECK(view_metas_.size() == 0, "Attempted to resize a view tensor to a larger size. This is not allowed in the functionalization pass");
+  TORCH_CHECK(view_metas_.empty(), "Attempted to resize a view tensor to a larger size. This is not allowed in the functionalization pass");
   // If this tensor is not a view (and has no outstanding views taken out on it),
   // Then it's safe to throw out the old storage and replace it with the new, larger one.
   storage_ = c10::Storage(c10::make_intrusive<functionalization::FunctionalStorageImpl>(other));
@@ -343,7 +343,7 @@ int64_t FunctionalTensorWrapper::numel_custom() const {
   return value_.unsafeGetTensorImpl()->numel();
 }
 bool FunctionalTensorWrapper::is_contiguous_custom(at::MemoryFormat memory_format) const {
-  return value_.unsafeGetTensorImpl()->is_contiguous();
+  return value_.unsafeGetTensorImpl()->is_contiguous(memory_format);
 }
 c10::SymIntArrayRef FunctionalTensorWrapper::sym_sizes_custom() const {
   return value_.unsafeGetTensorImpl()->sym_sizes();
@@ -508,7 +508,7 @@ bool isFunctionalTensor(const c10::optional<Tensor>& t) {
 }
 
 bool isFunctionalTensor(const c10::List<c10::optional<Tensor>>& t_list) {
-  if (t_list.size() == 0) return false;
+  if (t_list.empty()) return false;
   auto functional_count = 0;
   for (const auto i : c10::irange(t_list.size())) {
     if (!t_list[i].has_value() || !t_list[i]->defined()) continue;
diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp
index 2702bf350239..48242bdc01d0 100644
--- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp
+++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp
@@ -6,6 +6,7 @@
 #include <ATen/TensorUtils.h>
 #include <torch/library.h>
 #include <c10/util/irange.h>
+#include <c10/util/strides.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/ATen.h>
@@ -29,7 +30,17 @@
 namespace {
   void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet, torch::jit::Stack* stack) {
     const auto& schema = op.schema();
-    TORCH_INTERNAL_ASSERT(!schema.hasAnyAliasInfo(), "mutating and aliasing ops should all have codegen'd kernels");
+    TORCH_CHECK(
+      !schema.hasAnyAliasInfo(),
+      "Found a custom (non-ATen) operator that either mutates or its inputs: ",
+      op.operator_name().name, ".", op.operator_name().overload_name,
+      ". Getting these operators to work with functionalization requires some extra work",
+      ". For mutable ops you need to register a corresponding out-of-place variant of the op,",
+      " and you also need to register a Functionalization kernel that performs some boilerplate,",
+      " telling functionalization to map from the mutable op to the out-of-place op",
+      ". See a more complete example of how to do this at ",
+      "https://gist.github.com/bdhirsh/7dadbf6296f8f7d1abcf4c482f438aaa.",
+      " Please file a GitHub issue if you run into any problems.");
     const auto num_arguments = schema.arguments().size();
     const auto arguments_begin = stack->size() - num_arguments;
     auto arguments = torch::jit::last(stack, num_arguments);
@@ -98,20 +109,6 @@ namespace {
   }
 }
 
-// Vanilla implementation to compute contiguous strides given some sizes.
-// Should probably refactor this into shared code (also used in TensorImpl.h)
-std::vector<int64_t> compute_contiguous_strides(c10::IntArrayRef sizes) {
-  auto n = sizes.size();
-  std::vector<int64_t> strides(n);
-  if (n == 0) return strides;
-
-  strides[n - 1] = 1;
-  for (int64_t i = n - 2; i >= 0; --i) {
-    strides[i] = strides[i+1] * sizes[i];
-  }
-  return strides;
-}
-
 // resize_() is special because:
 // - when we resize to a larger size, it acts as a mutation
 // - when we resize to a smaller size, it acts as a view
@@ -128,7 +125,7 @@ const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatchKeySet,
   // Case 1: arguments are not functional tensors, so we no-op and redispatch.
   if (!at::functionalization::impl::isFunctionalTensor(self)) {
      at::AutoDispatchSkipFunctionalize guard;
-     at::Tensor tmp_output = self_.resize_(size, memory_format);
+     self_.resize_(size, memory_format);
      return self;
   }
 
@@ -162,13 +159,13 @@ const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatchKeySet,
   at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
     [reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx) -> at::Tensor {
       if (reapply_views) {
-        return base.as_strided(size, compute_contiguous_strides(size));
+        return base.as_strided(size, c10::contiguous_strides(size));
       } else {
-        return at::as_strided_copy(base, size, compute_contiguous_strides(size));
+        return at::as_strided_copy(base, size, c10::contiguous_strides(size));
       }
     },
     [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx) -> at::Tensor {
-      return base.as_strided_scatter(mutated_view, size, compute_contiguous_strides(size));
+      return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size));
     }
   );
   at::functionalization::impl::mutate_view_meta(self, std::move(view_meta));
diff --git a/aten/src/ATen/LegacyBatchedFallback.cpp b/aten/src/ATen/LegacyBatchedFallback.cpp
index 83e95472a685..c53ee5c6204b 100644
--- a/aten/src/ATen/LegacyBatchedFallback.cpp
+++ b/aten/src/ATen/LegacyBatchedFallback.cpp
@@ -156,7 +156,7 @@ void batchedTensorInplaceForLoopFallback(const c10::OperatorHandle& op, torch::j
     batched_tensor_inputs.push_back(tensor);
     batched_tensor_inputs_position.push_back(idx);
   }
-  TORCH_INTERNAL_ASSERT(batched_tensor_inputs.size() > 0);
+  TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty());
 
   // MultiBatchVmapTransform the BatchedTensor arguments. This returns
   // VmapPhysicalViews that contain all of the batch dimensions.
@@ -290,7 +290,7 @@ void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Sta
     batched_tensor_inputs.push_back(tensor);
     batched_tensor_inputs_position.push_back(idx);
   }
-  TORCH_INTERNAL_ASSERT(batched_tensor_inputs.size() > 0);
+  TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty());
 
   // MultiBatchVmapTransform the BatchedTensor arguments. This returns
   // VmapPhysicalViews that contain all of the batch dimensions.
diff --git a/aten/src/ATen/LegacyBatchedTensorImpl.h b/aten/src/ATen/LegacyBatchedTensorImpl.h
index b832c34e3ac7..bbb599748d2e 100644
--- a/aten/src/ATen/LegacyBatchedTensorImpl.h
+++ b/aten/src/ATen/LegacyBatchedTensorImpl.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <bitset>
+#include <utility>
 
 #include <ATen/ArrayRef.h>
 #include <ATen/SmallVector.h>
@@ -120,7 +121,7 @@ inline BatchedTensorImpl* maybeGetBatchedImpl(Tensor tensor) {
   if (!isBatchedTensor(tensor)) {
     return nullptr;
   }
-  return unsafeGetBatchedImpl(tensor);
+  return unsafeGetBatchedImpl(std::move(tensor));
 }
 
 // Returns a bitset. If bit i is set, then that means dim i is a batchdim.
diff --git a/aten/src/ATen/LegacyBatchingRegistrations.cpp b/aten/src/ATen/LegacyBatchingRegistrations.cpp
index 77c64105f972..4be6f2890be9 100644
--- a/aten/src/ATen/LegacyBatchingRegistrations.cpp
+++ b/aten/src/ATen/LegacyBatchingRegistrations.cpp
@@ -69,7 +69,7 @@ Tensor sum_batching_rule(const Tensor& self, OptionalIntArrayRef opt_dims, bool
     // >>> x = torch.randn(B0)  # the per-examples are all scalars
     // >>> vmap(partial(torch.sum, dim=0), x)
     // then we replicate the behavior of sum(scalar_tensor, dim=0).
-    if (/*logical*/self.dim() == 0 && (dims.size() == 0 || (dims.size() == 1 && is_allowed_dim_on_scalar_tensor(dims[0])))) {
+    if (/*logical*/self.dim() == 0 && (dims.empty() || (dims.size() == 1 && is_allowed_dim_on_scalar_tensor(dims[0])))) {
       return self.clone();
     }
   }
@@ -477,7 +477,7 @@ Tensor view_batching_rule(const Tensor& self, IntArrayRef size) {
 Tensor view_as_complex_batching_rule(const Tensor& self) {
   // guard against the user passing in a batch of scalar tensors with batch
   // size equal to 2.
-  TORCH_CHECK(self.sizes().size() != 0, "Input tensor must have one or more dimensions");
+  TORCH_CHECK(!self.sizes().empty(), "Input tensor must have one or more dimensions");
   auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
   auto result = at::view_as_complex(self_physical.tensor());
   return self_physical.getPhysicalToLogicalMap().apply(result);
@@ -931,7 +931,7 @@ Tensor cat_batching_rule(const ITensorListRef& tensors, int64_t dim) {
   auto physical_tensors = fmap(
       physical_views, [](const VmapPhysicalView& view) -> Tensor { return view.tensor(); });
   TORCH_INTERNAL_ASSERT(
-      tensors.size() > 0, "The dispatcher should not have dispatched here otherwise.");
+      !tensors.empty(), "The dispatcher should not have dispatched here otherwise.");
   auto result = at::cat(physical_tensors, physical_views[0].getPhysicalDim(dim));
   return physical_views[0].getPhysicalToLogicalMap().apply(result);
 }
@@ -941,7 +941,7 @@ Tensor stack_batching_rule(TensorList tensors, int64_t dim) {
   auto physical_tensors = fmap(
       physical_views, [](const VmapPhysicalView& view) -> Tensor { return view.tensor(); });
   TORCH_INTERNAL_ASSERT(
-      tensors.size() > 0, "The dispatcher should not have dispatched here otherwise.");
+      !tensors.empty(), "The dispatcher should not have dispatched here otherwise.");
   // NB: stack wraps the dimensionality to (logical dim + 1), so we have to
   // manually handle that here.
   auto dim_physical =
diff --git a/aten/src/ATen/LegacyVmapTransforms.cpp b/aten/src/ATen/LegacyVmapTransforms.cpp
index 1457e572812a..ca43993ed7d3 100644
--- a/aten/src/ATen/LegacyVmapTransforms.cpp
+++ b/aten/src/ATen/LegacyVmapTransforms.cpp
@@ -61,7 +61,7 @@ VmapDimVector VmapPhysicalView::getPhysicalDims(OptionalIntArrayRef opt_logical_
   // NB: fmap doesn't have a SmallVector variant, so we don't use it here.
   VmapDimVector result;
   result.reserve(logical_ndim);
-  if (opt_logical_dims.has_value()) {
+  if (opt_logical_dims.has_value() && !opt_logical_dims.value().empty()) {
     auto logical_dims = opt_logical_dims.value();
     for (auto dim : logical_dims) {
       result.push_back(maybe_wrap_dim(dim, logical_ndim) + numBatchDims());
@@ -239,7 +239,7 @@ MultiBatchVmapTransform::logicalToPhysical(ITensorListRef logical_tensors) {
 
 static std::pair<std::bitset<kVmapNumLevels>,int64_t>
 getLevelsAndLargestLogicalDim(TensorList logical_tensors) {
-  TORCH_INTERNAL_ASSERT(logical_tensors.size() > 0);
+  TORCH_INTERNAL_ASSERT(!logical_tensors.empty());
   std::bitset<kVmapNumLevels> levels;
   int64_t largest_logical_dim = -1;
   for (const auto& tensor : logical_tensors) {
diff --git a/aten/src/ATen/MapAllocator.cpp b/aten/src/ATen/MapAllocator.cpp
index 56e840cadc2c..6d3c7058e8f9 100644
--- a/aten/src/ATen/MapAllocator.cpp
+++ b/aten/src/ATen/MapAllocator.cpp
@@ -236,12 +236,8 @@ MapAllocator::MapAllocator(WithFd, std::string filename, int fd, int flags, size
 #else /* _WIN32 */
   {
     /* open file */
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-    int fd;
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-    int flags; // shadow
-    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-    struct stat file_stat;
+    int fd{-1};
+    int flags{}; // shadow
 
     if (flags_ & (ALLOCATOR_MAPPED_SHARED | ALLOCATOR_MAPPED_SHAREDMEM)) {
       flags = O_RDWR | O_CREAT;
@@ -278,6 +274,7 @@ MapAllocator::MapAllocator(WithFd, std::string filename, int fd, int flags, size
       fd = fd_;
     }
 
+    struct stat file_stat;
     if (fstat(fd, &file_stat) == -1) {
       int last_err = errno;
       if (!(flags_ & ALLOCATOR_MAPPED_FROMFD)) {
@@ -471,6 +468,7 @@ RefcountedMapAllocator::RefcountedMapAllocator(WithFd, const char *filename, int
 }
 
 void RefcountedMapAllocator::initializeAlloc() {
+  TORCH_CHECK(base_ptr_, "base_ptr_ is null");
   MapInfo *map_info = (MapInfo*)base_ptr_;
 
 #ifdef _WIN32
diff --git a/aten/src/ATen/MapAllocator.h b/aten/src/ATen/MapAllocator.h
index 7f602935cba1..11ac1c9dac9a 100644
--- a/aten/src/ATen/MapAllocator.h
+++ b/aten/src/ATen/MapAllocator.h
@@ -121,7 +121,7 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
   int decref();
   void close() override;
 
-  virtual ~RefcountedMapAllocator() {
+  ~RefcountedMapAllocator() override {
     close();
   }
 
diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp
index 63a4f1d5668d..7195d04f0f4c 100644
--- a/aten/src/ATen/NamedTensorUtils.cpp
+++ b/aten/src/ATen/NamedTensorUtils.cpp
@@ -207,7 +207,7 @@ void propagate_names_for_reduction(const Tensor& result, const Tensor& src, IntA
     return;
   }
   // This actually means "full reduction"
-  if (reduced_dims.size() == 0) {
+  if (reduced_dims.empty()) {
     return;
   }
   propagate_names_except(result, src, reduced_dims);
@@ -303,7 +303,7 @@ static int64_t num_batch_dims(DimnameList names) {
 static std::vector<Dimname> compute_matmul_outnames(
     DimnameList self_names,
     DimnameList other_names) {
-  TORCH_CHECK(self_names.size() >= 1 && other_names.size() >= 1,
+  TORCH_CHECK(!self_names.empty() && !other_names.empty(),
       "both arguments to matmul need to be at least 1D, but they are ",
       self_names.size(), "D and ", other_names.size(), "D");
 
@@ -430,7 +430,7 @@ std::vector<Dimname> compute_cat_outnames(const MaterializedITensorListRef& tens
   std::vector<Dimname> result;
   for (const Tensor& tensor : tensors) {
     const auto tensor_names = tensor.names();
-    TORCH_CHECK(tensor_names.size() > 0, "zero-dimensional tensor cannot be concatenated");
+    TORCH_CHECK(!tensor_names.empty(), "zero-dimensional tensor cannot be concatenated");
     TORCH_CHECK(result.empty() || tensor_names.size() == result.size(),
         "Tensors must have same number of dimensions: got ", result.size(),
         " and ", tensor_names.size());
diff --git a/aten/src/ATen/NumericUtils.h b/aten/src/ATen/NumericUtils.h
index a26bbe75baff..4e1c08769c2c 100644
--- a/aten/src/ATen/NumericUtils.h
+++ b/aten/src/ATen/NumericUtils.h
@@ -39,7 +39,7 @@ inline C10_HOST_DEVICE bool _isnan(T val) {
 template <
     typename T,
     typename std::enable_if<c10::is_complex<T>::value, int>::type = 0>
-inline bool _isnan(T val) {
+inline C10_HOST_DEVICE bool _isnan(T val) {
   return std::isnan(val.real()) || std::isnan(val.imag());
 }
 
diff --git a/aten/src/ATen/PythonTorchFunctionTLS.cpp b/aten/src/ATen/PythonTorchFunctionTLS.cpp
index 00f372f370e6..9e0fdb88469a 100644
--- a/aten/src/ATen/PythonTorchFunctionTLS.cpp
+++ b/aten/src/ATen/PythonTorchFunctionTLS.cpp
@@ -11,8 +11,8 @@ void PythonTorchFunctionTLS::push_onto_stack(std::shared_ptr<SafePyObject> mode)
 }
 
 const std::shared_ptr<SafePyObject> PythonTorchFunctionTLS::pop_stack() {
-  TORCH_CHECK(pythonTorchFunctionState.stack_.size() > 0, "trying to pop from empty mode stack");
-  const auto out = pythonTorchFunctionState.stack_.back();
+  TORCH_CHECK(!pythonTorchFunctionState.stack_.empty(), "trying to pop from empty mode stack");
+  auto out = pythonTorchFunctionState.stack_.back();
   pythonTorchFunctionState.stack_.pop_back();
   return out;
 }
diff --git a/aten/src/ATen/SavedTensorHooks.cpp b/aten/src/ATen/SavedTensorHooks.cpp
index d1b210c36c3c..c1c963409f40 100644
--- a/aten/src/ATen/SavedTensorHooks.cpp
+++ b/aten/src/ATen/SavedTensorHooks.cpp
@@ -26,7 +26,7 @@ bool SavedTensorDefaultHooks::is_enabled() {
 
 void SavedTensorDefaultHooks::disable(const std::string& message) {
   tls.disabled_error_message = message;
-  if (tls.stack.size() > 0) {
+  if (!tls.stack.empty()) {
     assertSavedTensorHooksNotDisabled();
   }
 }
diff --git a/aten/src/ATen/SparseCsrTensorUtils.h b/aten/src/ATen/SparseCsrTensorUtils.h
index 7f4149a7d084..d060423c52bb 100644
--- a/aten/src/ATen/SparseCsrTensorUtils.h
+++ b/aten/src/ATen/SparseCsrTensorUtils.h
@@ -5,6 +5,14 @@
 #include <ATen/SparseTensorUtils.h>
 #include <ATen/core/Tensor.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/Operators.h>
+#else
+#include <ATen/ops/resize_as_sparse_native.h>
+#endif
+
 #define AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(LAYOUT, NAME, ...) \
   [&] {                                                              \
     const auto& the_layout = LAYOUT;                                 \
@@ -16,7 +24,7 @@
         return __VA_ARGS__();                                        \
       default:                                                       \
         AT_ERROR(                                                    \
-            #NAME,                                                   \
+            NAME,                                                    \
             " expected sparse compressed tensor layout but got ",    \
             the_layout);                                             \
     }                                                                \
@@ -35,7 +43,7 @@
         return (COLUMN_DIM_ACTION)();                             \
       default:                                                    \
         AT_ERROR(                                                 \
-            #NAME,                                                \
+            NAME,                                                 \
             " expected sparse compressed tensor layout but got ", \
             the_layout);                                          \
     }                                                             \
@@ -54,7 +62,7 @@
         return (BLOCK_ACTION)();                                  \
       default:                                                    \
         AT_ERROR(                                                 \
-            #NAME,                                                \
+            NAME,                                                 \
             " expected sparse compressed tensor layout but got ", \
             the_layout);                                          \
     }                                                             \
@@ -70,7 +78,7 @@
         return (ROW_DIM_ACTION)();                                    \
       default:                                                        \
         AT_ERROR(                                                     \
-            #NAME,                                                    \
+            NAME,                                                     \
             " expected sparse row compressed tensor layout but got ", \
             the_layout);                                              \
     }                                                                 \
@@ -86,7 +94,7 @@
         return (COL_DIM_ACTION)();                                       \
       default:                                                           \
         AT_ERROR(                                                        \
-            #NAME,                                                       \
+            NAME,                                                        \
             " expected sparse column compressed tensor layout but got ", \
             the_layout);                                                 \
     }                                                                    \
@@ -101,7 +109,7 @@
         return (ACTION)();                                                    \
       default:                                                                \
         AT_ERROR(                                                             \
-            #NAME,                                                            \
+            NAME,                                                             \
             " expected sparse compressed (non-block) tensor layout but got ", \
             the_layout);                                                      \
     }                                                                         \
@@ -116,7 +124,7 @@
         return (ACTION)();                                                \
       default:                                                            \
         AT_ERROR(                                                         \
-            #NAME,                                                        \
+            NAME,                                                         \
             " expected sparse compressed block tensor layout but got ",   \
             the_layout);                                                  \
     }                                                                     \
@@ -308,5 +316,56 @@ inline at::OptionalArray<at::SymInt> getSymIntBlockSize(Tensor const& self) {
   }
 }
 
+template <typename binary_op_t, typename binary_op_out_t>
+inline bool only_sparse_compressed_binary_op_trivial_cases(
+    const Tensor& self,
+    const Tensor& other,
+    const Scalar& alpha,
+    Tensor& out,
+    const binary_op_t& binary_op,
+    const binary_op_out_t& binary_op_out) {
+  // Only sparse compressed! Just like the name says :)
+  TORCH_INTERNAL_ASSERT(at::sparse_csr::is_sparse_compressed(self));
+  TORCH_INTERNAL_ASSERT(at::sparse_csr::is_sparse_compressed(other));
+  TORCH_INTERNAL_ASSERT(at::sparse_csr::is_sparse_compressed(out));
+
+  // Bypass BLAS if there are matches in (self, other, out)
+  if (self.is_same(out) && self.is_same(other)) {
+    binary_op_out(self.values(), other.values(), alpha);
+    return true;
+  }
+  if (self.is_same(other)) {
+    Tensor compressed_indices, plain_indices;
+    std::tie(compressed_indices, plain_indices) =
+        at::sparse_csr::getCompressedPlainIndices(self);
+    static_cast<SparseCsrTensorImpl*>(out.unsafeGetTensorImpl())
+        ->set_member_tensors(
+            compressed_indices,
+            plain_indices,
+            binary_op(self.values(), other.values(), alpha),
+            self.sizes());
+    return true;
+  }
+  return false;
+}
+
+inline bool only_sparse_compressed_add_trivial_cases(
+    const Tensor& self,
+    const Tensor& other,
+    const Scalar& alpha,
+    Tensor& out) {
+  return only_sparse_compressed_binary_op_trivial_cases(
+      self,
+      other,
+      alpha,
+      out,
+      [](const Tensor& v1, const Tensor& v2, const Scalar& alpha) {
+        return v1.add(v2, alpha);
+      },
+      [](const Tensor& v1, const Tensor& v2, const Scalar& alpha) {
+        return v1.add_(v2, alpha);
+      });
+}
+
 } // namespace sparse_csr
 } // namespace at
diff --git a/aten/src/ATen/StorageUtils.cpp b/aten/src/ATen/StorageUtils.cpp
new file mode 100644
index 000000000000..69045e7e3bc7
--- /dev/null
+++ b/aten/src/ATen/StorageUtils.cpp
@@ -0,0 +1,52 @@
+#include <ATen/Functions.h>
+#include <ATen/MapAllocator.h>
+#include <ATen/StorageUtils.h>
+#include <c10/core/TensorOptions.h>
+
+namespace at {
+
+C10_EXPORT c10::intrusive_ptr<c10::StorageImpl> new_shm_fd_storage(
+    size_t size) {
+  int flags = ALLOCATOR_MAPPED_SHAREDMEM | ALLOCATOR_MAPPED_EXCLUSIVE |
+      ALLOCATOR_MAPPED_KEEPFD | ALLOCATOR_MAPPED_UNLINK;
+  std::string handle = NewProcessWideShmHandle();
+  auto sptr = MapAllocator::makeDataPtr(
+      handle.c_str(), flags, size * sizeof(uint8_t), nullptr);
+  return c10::make_intrusive<StorageImpl>(
+      c10::StorageImpl::use_byte_size_t(),
+      size,
+      std::move(sptr),
+      /*allocator=*/nullptr,
+      /*resizable=*/false);
+}
+
+C10_EXPORT void storage_copy(
+    c10::Storage& dst,
+    const c10::Storage& src,
+    bool non_blocking) {
+  auto dst_options = c10::TensorOptions().device(dst.device()).dtype(at::kByte);
+  auto dst_t = at::empty({0}, {}, dst_options).set_(dst);
+
+  auto src_options = c10::TensorOptions().device(src.device()).dtype(at::kByte);
+  auto src_t = at::empty({0}, {}, src_options).set_(src);
+  dst_t.copy_(src_t, non_blocking);
+}
+
+C10_EXPORT void share_memory_(TensorBase& t) {
+  if (t.device() != at::kCPU) {
+    return;
+  }
+
+  const at::Storage& origStorage = t.storage();
+
+  if (MapAllocator::fromDataPtr(origStorage.data_ptr()) != nullptr) {
+    // already shared
+    return;
+  }
+  at::Storage newStorage(new_shm_fd_storage(origStorage.nbytes()));
+  storage_copy(newStorage, origStorage);
+  std::swap(
+      *origStorage.unsafeGetStorageImpl(), *newStorage.unsafeGetStorageImpl());
+}
+
+} // namespace at
diff --git a/aten/src/ATen/StorageUtils.h b/aten/src/ATen/StorageUtils.h
new file mode 100644
index 000000000000..f7a9fdab0cc7
--- /dev/null
+++ b/aten/src/ATen/StorageUtils.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <c10/core/Storage.h>
+#include <c10/core/StorageImpl.h>
+#include <c10/util/intrusive_ptr.h>
+
+namespace at {
+
+class TensorBase;
+
+// Here we define a series of utils to create/manipulate ATen backed
+// c10 storage implementations.
+
+/**
+ * Create a new shared memory storage impl managed by file descriptor
+ *
+ * @param size  size in bytes
+ */
+C10_EXPORT c10::intrusive_ptr<c10::StorageImpl> new_shm_fd_storage(size_t size);
+
+/**
+ * Copy src to dst
+ * Caller must guarantee the validness of the storage objects
+ * during the entire copy process, esp. when it's async.
+ *
+ * This can probably live in c10 namespace later if needed,
+ * but for now keep it in at to keep implementation simple.
+ *
+ * @param dst  dst tensor
+ * @param src  src tensor
+ * @param non_blocking  (default false) whether this operation blocks caller
+ */
+C10_EXPORT void storage_copy(
+    c10::Storage& dst,
+    const c10::Storage& src,
+    bool non_blocking = false);
+
+/**
+ * In place change the storage to shm based.
+ *
+ * This is only applicable to CPU tensors not already shared.
+ * Otherwise, it's a no op to mirror the THP tensor behavior:
+ * https://pytorch.org/docs/stable/generated/torch.Tensor.share_memory_.html
+ *
+ * @param t  a tensor
+ */
+C10_EXPORT void share_memory_(TensorBase& t);
+
+} // namespace at
diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h
index 110f2356c3a5..02e9954cf273 100644
--- a/aten/src/ATen/TensorGeometry.h
+++ b/aten/src/ATen/TensorGeometry.h
@@ -13,12 +13,11 @@ namespace at {
 TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides);
 
 struct TORCH_API TensorGeometry {
-  TensorGeometry() : storage_offset_(0) {}
+  TensorGeometry() = default;
 
   explicit TensorGeometry(c10::SymIntArrayRef sizes)
       : sizes_(sizes.vec()),
         strides_(sizes.size()),
-        storage_offset_(0),
         has_symbolic_sizes_strides_(
             !c10::asIntArrayRefSlowOpt(sizes).has_value()) {
     int64_t dim = sizes.size();
@@ -119,7 +118,7 @@ struct TORCH_API TensorGeometry {
   std::vector<c10::SymInt> strides_;
   c10::SymInt storage_offset_;
   c10::SymInt numel_;
-  bool has_symbolic_sizes_strides_;
+  bool has_symbolic_sizes_strides_{false};
 };
 
 } // namespace at
diff --git a/aten/src/ATen/TensorIndexing.cpp b/aten/src/ATen/TensorIndexing.cpp
index 95d70132f43f..bd50282b46ec 100644
--- a/aten/src/ATen/TensorIndexing.cpp
+++ b/aten/src/ATen/TensorIndexing.cpp
@@ -65,7 +65,7 @@ static inline void set_item(const Tensor& self, ArrayRef<TensorIndex> indices, c
 } // namespace indexing
 
 Tensor Tensor::index(ArrayRef<at::indexing::TensorIndex> indices) const {
-  TORCH_CHECK(indices.size() > 0, "Passing an empty index list to Tensor::index() is not valid syntax");
+  TORCH_CHECK(!indices.empty(), "Passing an empty index list to Tensor::index() is not valid syntax");
   OptionalDeviceGuard device_guard(device_of(*this));
   return at::indexing::get_item(*this, indices);
 }
@@ -74,13 +74,13 @@ Tensor Tensor::index(std::initializer_list<at::indexing::TensorIndex> indices) c
 }
 
 Tensor & Tensor::index_put_(ArrayRef<at::indexing::TensorIndex> indices, Tensor const & rhs) {
-  TORCH_CHECK(indices.size() > 0, "Passing an empty index list to Tensor::index_put_() is not valid syntax");
+  TORCH_CHECK(!indices.empty(), "Passing an empty index list to Tensor::index_put_() is not valid syntax");
   OptionalDeviceGuard device_guard(device_of(*this));
   at::indexing::set_item(*this, indices, rhs);
   return *this;
 }
 Tensor & Tensor::index_put_(ArrayRef<at::indexing::TensorIndex> indices, const Scalar& v) {
-  TORCH_CHECK(indices.size() > 0, "Passing an empty index list to Tensor::index_put_() is not valid syntax");
+  TORCH_CHECK(!indices.empty(), "Passing an empty index list to Tensor::index_put_() is not valid syntax");
   OptionalDeviceGuard device_guard(device_of(*this));
   at::indexing::set_item(*this, indices, v);
   return *this;
diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h
index c7296fbd909d..cc73c41af847 100644
--- a/aten/src/ATen/TensorIndexing.h
+++ b/aten/src/ATen/TensorIndexing.h
@@ -20,6 +20,8 @@
 
 #include <ATen/core/List.h>
 
+#include <utility>
+
 namespace at {
 namespace indexing {
 
@@ -109,15 +111,12 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice);
 // `torch.tensor([1, 2])`) | `torch::tensor({1, 2})`
 struct TORCH_API TensorIndex final {
   // Case 1: `at::indexing::None`
-  // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.UninitializedObject)
   TensorIndex(c10::nullopt_t) : type_(TensorIndexType::None) {}
 
   // Case 2: "..." / `at::indexing::Ellipsis`
-  // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.UninitializedObject)
   TensorIndex(at::indexing::EllipsisIndexType)
       : type_(TensorIndexType::Ellipsis) {}
   TensorIndex(const char* str) : TensorIndex(at::indexing::Ellipsis) {
-    // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.UninitializedObject)
     TORCH_CHECK_VALUE(
         strcmp(str, "...") == 0,
         "Expected \"...\" to represent an ellipsis index, but got \"",
@@ -126,26 +125,21 @@ struct TORCH_API TensorIndex final {
   }
 
   // Case 3: Integer value
-  // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.UninitializedObject)
   TensorIndex(int64_t integer)
       : integer_(integer), type_(TensorIndexType::Integer) {}
-  // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.UninitializedObject)
   TensorIndex(int integer) : TensorIndex((int64_t)integer) {}
 
   // Case 4: Boolean value
   template <
       class T,
       class = typename std::enable_if<std::is_same<bool, T>::value>::type>
-  // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.UninitializedObject)
   TensorIndex(T boolean) : boolean_(boolean), type_(TensorIndexType::Boolean) {}
 
   // Case 5: Slice represented in `at::indexing::Slice` form
-  // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.UninitializedObject)
   TensorIndex(Slice slice)
       : slice_(std::move(slice)), type_(TensorIndexType::Slice) {}
 
   // Case 6: Tensor value
-  // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.UninitializedObject)
   TensorIndex(Tensor tensor)
       : tensor_(std::move(tensor)), type_(TensorIndexType::Tensor) {}
 
@@ -230,7 +224,7 @@ static inline Tensor applySlice(
       return self;
     }
   }
-  return self.slice_symint(dim, start, stop, step);
+  return self.slice_symint(dim, start, stop, std::move(step));
 }
 
 static inline Tensor applySelect(
@@ -243,7 +237,7 @@ static inline Tensor applySelect(
   // See NOTE [nested tensor size for indexing]
   if (self_sizes.has_value()) {
     TORCH_CHECK_INDEX(
-        !(index == 0 && dim == 0 && self_sizes->size() == 0),
+        !(index == 0 && dim == 0 && self_sizes->empty()),
         "invalid index of a 0-dim tensor. ",
         "Use `tensor.item()` in Python or `tensor.item<T>()` in C++ to convert a 0-dim tensor to a number");
 
@@ -388,7 +382,10 @@ static inline Tensor scalarToTensor(
 static inline SymIntArrayRef slicePrefix1sSize(const SymIntArrayRef& sizes) {
   size_t first_non1_src = sizes.size();
   for (const auto i : c10::irange(sizes.size())) {
-    if (sizes[i] != 1) {
+    // Unbacked SymInt has different behavior, but this is sound because
+    // failing to slice will only ever cause an error, not divergent
+    // behavior
+    if (!sizes[i].has_hint() || sizes[i] != 1) {
       first_non1_src = i;
       break;
     }
diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp
index 7e86163f1ca4..5d7c7879f0b1 100644
--- a/aten/src/ATen/TensorIterator.cpp
+++ b/aten/src/ATen/TensorIterator.cpp
@@ -69,8 +69,8 @@ static OptionalTensorRef make_otr(const TensorBase &tensor) {
 namespace internal {
 
 OpaqueOptionalTensorRef::OpaqueOptionalTensorRef() {
-  static_assert(alignof(OptionalTensorRef) == alignof(TensorBase), "");
-  static_assert(sizeof(OptionalTensorRef) == sizeof(TensorBase), "");
+  static_assert(alignof(OptionalTensorRef) == alignof(TensorBase));
+  static_assert(sizeof(OptionalTensorRef) == sizeof(TensorBase));
   new (data_.data()) OptionalTensorRef();
 }
 
@@ -163,7 +163,7 @@ TensorIteratorConfig& TensorIteratorConfig::declare_static_shape(IntArrayRef sha
 
 TensorIteratorConfig& TensorIteratorConfig::declare_static_shape(IntArrayRef shape, IntArrayRef squash_dims) {
   declare_static_shape(shape);
-  if (!static_shape_->size()) return *this;
+  if (static_shape_->empty()) return *this;
   for (const auto& squash_dim : squash_dims) {
     TORCH_CHECK(squash_dim >= 0 && squash_dim < static_cast<int64_t>(static_shape_->size()),
                 "squash_dim ", squash_dim, " must be in [0, ", static_shape_->size(), ").");
@@ -715,7 +715,7 @@ void TensorIteratorBase::permute_dimensions(IntArrayRef perm) {
   // Update shape and strides
   shape_ = reorder(shape_);
   for (auto& op : operands_) {
-    if (op.stride_bytes.size() > 0) {
+    if (!op.stride_bytes.empty()) {
       op.stride_bytes = reorder(op.stride_bytes);
     }
   }
@@ -1221,8 +1221,11 @@ void TensorIteratorBase::compute_shape(const TensorIteratorConfig& config) {
     // the destination tensor.  If the output tensor is also an input, we'll
     // pick it up later in the operands.
     if (config.resize_outputs_ && op.is_output) continue;
+    TORCH_CHECK(!op.tensor_base().unsafeGetTensorImpl()->has_symbolic_sizes_strides(),
+      "TensorIterator does not support symbolic shapes; please implement this operator in torch/_refs "
+      "using the elementwise or reduction helpers (look at backtrace to find out what operator this is)");
     auto shape = op.tensor_base().sizes();
-    if (shape.size() == 0) {
+    if (shape.empty()) {
       has_scalars = true;
     } else {
       has_tensors = true;
@@ -1721,7 +1724,7 @@ void DimCounter::increment(const std::array<int64_t, 2>& step) {
 std::array<int64_t, 2> DimCounter::max_2d_step() const {
   int64_t step0 = std::min(shape[0] - values[0], range.end - offset);
   int64_t step1 = 1;
-  if (step0 == shape[0] && shape.size() >= 1) {
+  if (step0 == shape[0] && !shape.empty()) {
     step1 = std::min(shape[1] - values[1], (range.end - offset) / shape[0]);
   }
   return {step0, step1};
diff --git a/aten/src/ATen/ThreadLocalPythonObjects.cpp b/aten/src/ATen/ThreadLocalPythonObjects.cpp
new file mode 100644
index 000000000000..69fbade990bb
--- /dev/null
+++ b/aten/src/ATen/ThreadLocalPythonObjects.cpp
@@ -0,0 +1,36 @@
+#include <c10/core/TensorImpl.h>
+#include <ATen/ThreadLocalPythonObjects.h>
+#include <c10/util/Exception.h>
+
+#include <utility>
+
+namespace at {
+namespace impl {
+
+static thread_local ThreadLocalPythonObjects py_objects;
+
+
+void ThreadLocalPythonObjects::set(const std::string& key, std::shared_ptr<SafePyObject> value) {
+  py_objects.obj_dict_[key] = std::move(value);
+}
+
+const std::shared_ptr<SafePyObject>& ThreadLocalPythonObjects::get(const std::string& key) {
+  TORCH_CHECK(py_objects.obj_dict_.count(key));
+  return py_objects.obj_dict_[key];
+}
+
+bool ThreadLocalPythonObjects::contains(const std::string& key) {
+  return py_objects.obj_dict_.count(key);
+}
+
+void ThreadLocalPythonObjects::set_state(ThreadLocalPythonObjects state) {
+  py_objects = std::move(state);
+}
+
+const ThreadLocalPythonObjects& ThreadLocalPythonObjects::get_state() {
+  return py_objects;
+}
+
+
+}
+}
diff --git a/aten/src/ATen/ThreadLocalPythonObjects.h b/aten/src/ATen/ThreadLocalPythonObjects.h
new file mode 100644
index 000000000000..892d8a61f00a
--- /dev/null
+++ b/aten/src/ATen/ThreadLocalPythonObjects.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <c10/core/SafePyObject.h>
+#include <c10/macros/Macros.h>
+#include <unordered_map>
+
+namespace at {
+namespace impl {
+
+struct TORCH_API ThreadLocalPythonObjects {
+  static void set(const std::string& key, std::shared_ptr<SafePyObject> value);
+  static const std::shared_ptr<SafePyObject>& get(const std::string& key);
+  static bool contains(const std::string& key);
+
+  static const ThreadLocalPythonObjects& get_state();
+  static void set_state(ThreadLocalPythonObjects state);
+
+ private:
+  std::unordered_map<std::string, std::shared_ptr<c10::SafePyObject>> obj_dict_;
+};
+
+} // namespace impl
+} // namespace at
diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp
index c86cddb803e9..c22f07866f71 100644
--- a/aten/src/ATen/ThreadLocalState.cpp
+++ b/aten/src/ATen/ThreadLocalState.cpp
@@ -17,7 +17,8 @@ ThreadLocalState::ThreadLocalState()
       autograd_tls_(c10::AutogradState::get_tls_state()),
       torch_dispatch_mode_state_(c10::impl::TorchDispatchModeTLS::get_state()), python_dispatcher_state_(c10::impl::PythonDispatcherTLS::get_state()),
       python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()),
-      saved_tensors_default_hooks_state_(at::SavedTensorDefaultHooks::get_tls_state()), functionalization_reapply_views_state_(at::functionalization::impl::getFunctionalizationReapplyViewsTLS()) {}
+      saved_tensors_default_hooks_state_(at::SavedTensorDefaultHooks::get_tls_state()), functionalization_reapply_views_state_(at::functionalization::impl::getFunctionalizationReapplyViewsTLS()),
+      saved_objects_(at::impl::ThreadLocalPythonObjects::get_state()) {}
 
 void ThreadLocalState::set_grad_mode(bool enabled) {
   autograd_tls_.set_grad_mode(enabled);
@@ -51,6 +52,8 @@ void ThreadLocalState::setThreadLocalState(
   functorch::setFuncTorchTLS(state.functorch_tls_);
 
   at::functionalization::impl::setFunctionalizationReapplyViewsTLS(state.functionalization_reapply_views_state_);
+
+  at::impl::ThreadLocalPythonObjects::set_state(state.saved_objects_);
 }
 
 } // namespace at
diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h
index 0184cc9b82c4..7cae9997ab05 100644
--- a/aten/src/ATen/ThreadLocalState.h
+++ b/aten/src/ATen/ThreadLocalState.h
@@ -10,6 +10,7 @@
 #include <ATen/FuncTorchTLS.h>
 #include <ATen/PythonTorchFunctionTLS.h>
 #include <ATen/SavedTensorHooks.h>
+#include <ATen/ThreadLocalPythonObjects.h>
 #include <ATen/record_function.h>
 #include <c10/core/impl/PythonDispatcherTLS.h>
 #include <c10/core/impl/TorchDispatchModeTLS.h>
@@ -76,6 +77,9 @@ class TORCH_API ThreadLocalState {
 
   bool functionalization_reapply_views_state_;
 
+  // TLS for arbitrary python objects that is registered via hooks
+  at::impl::ThreadLocalPythonObjects saved_objects_;
+
   friend class ThreadLocalStateGuard;
 };
 
diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h
index b0bc583b90c2..142665b7c8b2 100644
--- a/aten/src/ATen/WrapDimUtils.h
+++ b/aten/src/ATen/WrapDimUtils.h
@@ -19,7 +19,7 @@ inline int64_t maybe_wrap_dim(int64_t dim, TensorImpl* tensor) {
 }
 
 inline int64_t maybe_wrap_dim(int64_t dim, TensorList tensors) {
-  if (tensors.size() == 0) {
+  if (tensors.empty()) {
     // can't wrap empty TensorList; rely on underlying implementation to throw
     // error if necessary.
     return dim;
@@ -30,7 +30,7 @@ inline int64_t maybe_wrap_dim(int64_t dim, TensorList tensors) {
 inline int64_t maybe_wrap_dim(
     int64_t dim,
     const std::vector<std::vector<int64_t>>& tensor_sizes) {
-  if (tensor_sizes.size() == 0) {
+  if (tensor_sizes.empty()) {
     // can't wrap empty list; rely on underlying implementation to throw error
     // if necessary
     return dim;
diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp
index 9b4220fb053a..178558dcc1b1 100644
--- a/aten/src/ATen/autocast_mode.cpp
+++ b/aten/src/ATen/autocast_mode.cpp
@@ -390,7 +390,7 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
   KERNEL(rnn_tanh_cell, lower_precision_fp)
   KERNEL(rnn_relu_cell, lower_precision_fp)
   KERNEL(_scaled_dot_product_flash_attention, lower_precision_fp)
-  KERNEL(_scaled_dot_product_attention, lower_precision_fp)
+  KERNEL(scaled_dot_product_attention, lower_precision_fp)
 
   // fp32
   KERNEL(acos, fp32)
@@ -507,11 +507,12 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) {
   KERNEL_CPU(matmul, lower_precision_fp)
   KERNEL_CPU(conv_tbc, lower_precision_fp)
   KERNEL_CPU(mkldnn_rnn_layer, lower_precision_fp)
+  KERNEL_CPU(conv_transpose1d, lower_precision_fp)
+  KERNEL_CPU2(conv_transpose2d, input, lower_precision_fp)
+  KERNEL_CPU2(conv_transpose3d, input, lower_precision_fp)
+  KERNEL_CPU(prelu, lower_precision_fp)
 
   // fp32 cast policy
-  KERNEL_CPU(conv_transpose1d, fp32)
-  KERNEL_CPU2(conv_transpose2d, input, fp32)
-  KERNEL_CPU2(conv_transpose3d, input, fp32)
   KERNEL_CPU(avg_pool3d, fp32)
   KERNEL_CPU(binary_cross_entropy, fp32)
   KERNEL_CPU(grid_sampler, fp32)
@@ -601,7 +602,6 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) {
   KERNEL_CPU(_lu_with_info, fp32)
   KERNEL_CPU(qr, fp32)
   KERNEL_CPU(svd, fp32)
-  KERNEL_CPU(symeig, fp32)
   KERNEL_CPU(triangular_solve, fp32)
   KERNEL_CPU(fractional_max_pool2d, fp32)
   KERNEL_CPU(fractional_max_pool3d, fp32)
diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h
index 3d57ac923116..1f834ad37b45 100644
--- a/aten/src/ATen/autocast_mode.h
+++ b/aten/src/ATen/autocast_mode.h
@@ -201,7 +201,7 @@ inline std::vector<Tensor> cached_cast(
   std::vector<Tensor> vec;
   vec.reserve(arg.size());
   for (const auto& t : arg) {
-    vec.push_back(cached_cast(to_type, t, device_type));
+    vec.emplace_back(cached_cast(to_type, t, device_type));
   }
   return vec;
 }
@@ -213,7 +213,7 @@ inline std::vector<Tensor> cached_cast(
   std::vector<Tensor> vec;
   vec.reserve(arg.size());
   for (const auto& t : arg) {
-    vec.push_back(cached_cast(to_type, t, device_type));
+    vec.emplace_back(cached_cast(to_type, t, device_type));
   }
   return vec;
 }
diff --git a/aten/src/ATen/code_template.h b/aten/src/ATen/code_template.h
index c84165e67ec3..41aff6c36536 100644
--- a/aten/src/ATen/code_template.h
+++ b/aten/src/ATen/code_template.h
@@ -18,9 +18,7 @@ namespace jit {
 // in the top level environment, and then recurses into a parent
 // environment if the key is not found.)
 struct TemplateEnv {
-  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-  TemplateEnv() : parent(nullptr) {}
-  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+  TemplateEnv() = default;
   TemplateEnv(TemplateEnv& parent) : parent(&parent) {}
 
   using string_list = std::vector<std::string>;
@@ -88,7 +86,7 @@ struct TemplateEnv {
 
   std::unordered_map<std::string, std::string> strings_;
   std::unordered_map<std::string, string_list> lists_;
-  TemplateEnv* parent;
+  TemplateEnv* parent{nullptr};
 };
 
 /*
@@ -194,14 +192,14 @@ struct CodeTemplate {
       const string_list& strings,
       bool comma_before,
       bool comma_after) const {
-    if (comma_before && strings.size() > 0)
+    if (comma_before && !strings.empty())
       out << ", ";
     for (const auto i : c10::irange(strings.size())) {
       if (i > 0)
         out << ", ";
       out << strings[i];
     }
-    if (comma_after && strings.size() > 0)
+    if (comma_after && !strings.empty())
       out << ", ";
   }
   // These indentation functions follow the convention that they never emit
diff --git a/aten/src/ATen/core/MT19937RNGEngine.h b/aten/src/ATen/core/MT19937RNGEngine.h
index 68b9c0c7e64c..b208d6ba7fac 100644
--- a/aten/src/ATen/core/MT19937RNGEngine.h
+++ b/aten/src/ATen/core/MT19937RNGEngine.h
@@ -118,7 +118,7 @@ class mt19937_engine {
     return data_;
   }
 
-  inline void set_data(mt19937_data_pod data) {
+  inline void set_data(const mt19937_data_pod& data) {
     data_ = data;
   }
 
diff --git a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h
index 00d3c635859a..9ff841390e35 100644
--- a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h
+++ b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h
@@ -1,3 +1,5 @@
+#pragma once
+
 #include <ATen/core/dispatch/Dispatcher.h>
 
 // TODO: this can probably live in c10
diff --git a/aten/src/ATen/core/QuantizerBase.h b/aten/src/ATen/core/QuantizerBase.h
index 922ea8a38f50..b6031f0d7798 100644
--- a/aten/src/ATen/core/QuantizerBase.h
+++ b/aten/src/ATen/core/QuantizerBase.h
@@ -39,7 +39,7 @@ using QuantizerPtr = c10::intrusive_ptr<Quantizer>;
 struct TORCH_API Quantizer : public c10::intrusive_ptr_target {
   const ScalarType scalar_type_;
   explicit Quantizer(ScalarType scalar_type) : scalar_type_(scalar_type) {}
-  virtual ~Quantizer();
+  ~Quantizer() override;
 
   // Copied from torch/csrc/jit/ir/scope.h
   QuantizerPtr intrusive_from_this() {
diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h
index 0a54cf0357cb..d0001a358b2e 100644
--- a/aten/src/ATen/core/TensorBase.h
+++ b/aten/src/ATen/core/TensorBase.h
@@ -6,6 +6,7 @@
 #include <c10/core/ScalarType.h>
 #include <c10/core/ScalarTypeToTypeMeta.h>
 #include <c10/core/Storage.h>
+#include <c10/core/SymIntArrayRef.h>
 #include <c10/core/TensorImpl.h>
 #include <c10/core/TensorOptions.h>
 #include <c10/core/UndefinedTensorImpl.h>
@@ -18,8 +19,8 @@
 
 #include <ATen/core/NamedTensor.h>
 #include <ATen/core/QuantizerBase.h>
-#include <c10/core/SymIntArrayRef.h>
 #include <ATen/core/TensorAccessor.h>
+#include <ATen/StorageUtils.h>
 
 namespace c10 {
 class Scalar;
@@ -341,6 +342,25 @@ class TORCH_API TensorBase {
     return impl_->storage().is_alias_of(other.storage());
   }
 
+  // Move the storage backend to shm based
+  // to enable memory sharing across processes.
+  //
+  // NB1: the ideal behavior of this API still requires further discussion
+  // but for now we are inclined to keep it consistent with existing THP behavior
+  // https://github.com/pytorch/pytorch/blob/4dca9bde0552afc67b5b74f4a0696fe6055709c4/torch/storage.py#L196-L212
+  // so we don't assert on anything here and rely on caller knowing
+  // what it's doing.
+  //
+  // NB2: this currently provides Linux fd based shm support only
+  // to simplify the storage lifetime management logic in ATen
+  // and similarly for now we are not adding support for file system based
+  // shm support like in THP due to additional GC manager support needed
+  // to prevent leaks.
+  // As such, calling this from non supported systems (e.g. Windows) would fail.
+  void share_memory_() {
+    at::share_memory_(*this);
+  }
+
   inline bool _is_zerotensor() const {
     return impl_->_is_zerotensor();
   }
@@ -858,7 +878,7 @@ auto TensorBase::register_hook(T&& hook) const -> TensorBase::hook_return_void_t
 
 template <typename T>
 auto TensorBase::register_hook(T&& hook) const -> TensorBase::hook_return_var_t<T> {
-  return _register_hook(std::move(hook));
+  return _register_hook(std::forward<T>(hook));
 }
 
 namespace detail {
diff --git a/aten/src/ATen/core/TransformationHelper.h b/aten/src/ATen/core/TransformationHelper.h
index f473082a5c5b..4147c3f74082 100644
--- a/aten/src/ATen/core/TransformationHelper.h
+++ b/aten/src/ATen/core/TransformationHelper.h
@@ -123,7 +123,7 @@ C10_HOST_DEVICE inline double cauchy(double val, double median, double sigma) {
  * exponentialy distributed with `lambda` parameter of the distribution.
  */
 template <typename T>
-C10_HOST_DEVICE __ubsan_ignore_float_divide_by_zero__ inline T exponential(T val, T lambda) {
+C10_HOST_DEVICE inline T exponential(T val, T lambda) {
   // https://en.wikipedia.org/wiki/Exponential_distribution#Generating_exponential_variates
   // Different implementations for CUDA and CPU to preserve original logic
   // TODO: must be investigated and unified!!!
diff --git a/aten/src/ATen/core/VariableFallbackKernel.cpp b/aten/src/ATen/core/VariableFallbackKernel.cpp
index ebc54d8e7cba..d2e82de512ee 100644
--- a/aten/src/ATen/core/VariableFallbackKernel.cpp
+++ b/aten/src/ATen/core/VariableFallbackKernel.cpp
@@ -19,12 +19,7 @@
 // TODO This whole file should be deleted and replaced with the mechanism
 //      described in https://github.com/pytorch/pytorch/issues/29548
 
-using c10::OperatorHandle;
 using c10::Stack;
-using c10::DispatchKey;
-using c10::DispatchKeySet;
-using c10::Dispatcher;
-using c10::KernelFunction;
 
 namespace {
 
@@ -60,6 +55,10 @@ TORCH_LIBRARY_IMPL(_, AutogradMPS, m) {
   m.fallback(torch::CppFunction::makeFallthrough());
 }
 
+TORCH_LIBRARY_IMPL(_, AutogradMeta, m) {
+  m.fallback(torch::CppFunction::makeFallthrough());
+}
+
 // see Note [ADInplaceOrView key]
 TORCH_LIBRARY_IMPL(_, ADInplaceOrView, m) {
       m.fallback(torch::CppFunction::makeFallthrough());
diff --git a/aten/src/ATen/core/Vitals.cpp b/aten/src/ATen/core/Vitals.cpp
index 68250be2daf5..6746540f43e1 100644
--- a/aten/src/ATen/core/Vitals.cpp
+++ b/aten/src/ATen/core/Vitals.cpp
@@ -44,7 +44,7 @@ bool torchVitalEnabled() {
   bool enabled = []() {
     auto e = getenv("TORCH_VITAL");
     if (e != nullptr) {
-      return strlen(e) > 0;
+      return e[0] != '\0';
     }
     return false;
   }();
diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h
index cc7a181a0b88..d7469ffe9f4f 100644
--- a/aten/src/ATen/core/blob.h
+++ b/aten/src/ATen/core/blob.h
@@ -27,7 +27,7 @@ class TORCH_API Blob final : public c10::intrusive_ptr_target {
    * Initializes an empty Blob.
    */
   Blob() noexcept : meta_(), pointer_(nullptr), has_ownership_(false) {}
-  ~Blob() {
+  ~Blob() override {
     Reset();
   }
 
diff --git a/aten/src/ATen/core/boxing/OperatorKernel.h b/aten/src/ATen/core/boxing/OperatorKernel.h
index ac4f06a91c47..82c68935540e 100644
--- a/aten/src/ATen/core/boxing/OperatorKernel.h
+++ b/aten/src/ATen/core/boxing/OperatorKernel.h
@@ -21,7 +21,7 @@ namespace c10 {
  * See below for how to register this kernel with PyTorch.
  */
 struct TORCH_API OperatorKernel : public c10::intrusive_ptr_target {
-  virtual ~OperatorKernel() = default;
+  ~OperatorKernel() override = default;
 };
 
 }  // namespace c10
diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h
index ccac9ebe8f61..571e8c5bff7b 100644
--- a/aten/src/ATen/core/boxing/impl/boxing.h
+++ b/aten/src/ATen/core/boxing/impl/boxing.h
@@ -234,7 +234,7 @@ struct BoxedKernelWrapper<
       [&] {
         // op returns void, boxed kernel has pushed nothing onto stack.
         TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-          stack.size() == 0,
+          stack.empty(),
           "Boxed kernel was expected to return no values on the stack, ",
           "but instead returned ", stack.size(), " values."
         );
diff --git a/aten/src/ATen/core/class_type.cpp b/aten/src/ATen/core/class_type.cpp
index 2478bde034bc..c7843f489c1c 100644
--- a/aten/src/ATen/core/class_type.cpp
+++ b/aten/src/ATen/core/class_type.cpp
@@ -152,7 +152,7 @@ void checkForwardHookInputArguments(
   if (forward_args.size() == 1) {
     // check for empty forward case
     TORCH_CHECK(
-        input_tuple_types.size() == 0,
+        input_tuple_types.empty(),
         hook_id,
         "was expecting Tuple[()] as the input type. Received type: '",
         input_arg.type()->annotation_str(),
@@ -213,7 +213,7 @@ void ClassType::checkForwardPreHookSchema(
   // or the contained single type if the input was a tuple containing a single
   // type.
   TORCH_CHECK(
-            pre_hook_schema.returns().size() != 0,
+            !pre_hook_schema.returns().empty(),
             hook_id,
             "is missing a return annotation. Return annotations are required, please add one.\n",
             pre_hook_err_msg
@@ -254,7 +254,7 @@ void ClassType::checkForwardPreHookSchema(
   // check for edge case of Tuple[()] for when forward has no arguments
   if (forward_args.size() == 1) {
     TORCH_CHECK(
-        return_tuple_types.size() == 0,
+        return_tuple_types.empty(),
         wrong_type_returned_err_msg,
         " Was expecting either 'None' or 'Tuple[()]' since forward had ",
         "no arguments.\n",
@@ -524,9 +524,9 @@ void ClassType::checkNotExist(const std::string& name, const std::string& what)
 }
 
 void ClassType::addAttribute(ClassAttribute classAttribute) {
-    attributes_.push_back(classAttribute);
-    attributeTypes_.push_back(classAttribute.getType());
     AT_ASSERT(attributes_.size() == attributeTypes_.size());
+    attributeTypes_.emplace_back(classAttribute.getType());
+    attributes_.emplace_back(std::move(classAttribute));
 }
 
 size_t ClassType::addAttribute(
diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp
index 28bbb48ded1d..38820f20a303 100644
--- a/aten/src/ATen/core/dispatch/Dispatcher.cpp
+++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp
@@ -329,7 +329,7 @@ RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, Ker
     backendFallbackKernels_[idx].debug, ", new registration ", debug
   );
   // NB: inferred function schema is always nullptr for fallbacks, as fallbacks
-  // cannot be unobxed
+  // cannot be unboxed
   backendFallbackKernels_[idx] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug));
 
   for (auto& op : operators_) {
@@ -403,7 +403,7 @@ std::vector<OperatorName> Dispatcher::getRegistrationsForDispatchKey(c10::option
 int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchKey) {
   int64_t seq_num = -1;
   // Setting sequence number in the Autograd case to associate
-  // the forward range with the coresponding Autograd's node
+  // the forward range with the corresponding Autograd's node
   if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) {
     seq_num = at::sequence_number::peek();
   }
@@ -416,7 +416,7 @@ void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction
 
 void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey) {
   // Setting sequence number in the Autograd case to associate
-  // the forward range with the coresponding Autograd's node
+  // the forward range with the corresponding Autograd's node
   guard.before(schema_ref, sequenceNumberForRunningRecordFunction(dispatchKey));
 }
 
diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
index cbc7ff8bf309..a5f154093df8 100644
--- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp
+++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
@@ -64,7 +64,7 @@ const AnnotatedKernel& OperatorEntry::ambiguousAutogradOtherKernel() const {
   return kernel;
 }
 
-void OperatorEntry::assertSignatureIsCorrect(const CppSignature call_signature, bool has_symint) const {
+void OperatorEntry::assertSignatureIsCorrect(const CppSignature& call_signature, bool has_symint) const {
   if (has_symint) {
     if (C10_UNLIKELY(sym_cpp_signature_.has_value() && (call_signature != sym_cpp_signature_->signature))) {
       reportSignatureError(call_signature, *sym_cpp_signature_);
@@ -145,12 +145,13 @@ OperatorEntry::AnnotatedKernelContainerIterator OperatorEntry::registerKernel(
 #ifdef C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY
   if (k[0].kernel.isValid()) {
 #else
-  if (k.size() > 0) {
+  if (!k.empty()) {
 #endif
     // Suppress the warning for Meta key as we are overriding C++ meta functions with python meta functions
     // for some ops
     if (dispatch_key != DispatchKey::Meta) {
-      TORCH_WARN("Overriding a previously registered kernel for the same operator and the same dispatch key\n",
+      TORCH_WARN_ONCE("Warning only once for all operators,  other operators may also be overrided.\n",
+            "  Overriding a previously registered kernel for the same operator and the same dispatch key\n",
             "  operator: ", (schema_.has_value() ? toString(schema_->schema) : toString(name_)), "\n",
             "    ", (this->schema_.has_value() ? this->schema_->debug : "no debug info"), "\n",
             "  dispatch key: ", toString(dispatch_key), "\n",
@@ -221,12 +222,12 @@ bool OperatorEntry::hasKernelForDispatchKey(DispatchKey k) const {
   TORCH_INTERNAL_ASSERT(kernels_.find(DispatchKey::Undefined) == kernels_.end());
   auto it = kernels_.find(k);
   if (it == kernels_.end()) return false;
-  return it->second.size() > 0;
+  return !it->second.empty();
 }
 
 const KernelFunction& OperatorEntry::kernelForDispatchKey(DispatchKey k) const {
   auto it = kernels_.find(k);
-  TORCH_CHECK(it != kernels_.end() && it->second.size(), "no kernel for ", k, " on ", name_);
+  TORCH_CHECK(it != kernels_.end() && !it->second.empty(), "no kernel for ", k, " on ", name_);
   auto jt = it->second.begin();
   TORCH_INTERNAL_ASSERT(jt->kernel.isValid())
   return jt->kernel;
@@ -462,7 +463,7 @@ void OperatorEntry::checkInvariants() const {
   }
   TORCH_INTERNAL_ASSERT(kernels_.find(DispatchKey::Undefined) == kernels_.end(), dumpState());
   for (const auto& kv : kernels_) {
-    TORCH_INTERNAL_ASSERT(kv.second.size() > 0, dumpState());
+    TORCH_INTERNAL_ASSERT(!kv.second.empty(), dumpState());
   }
   for (auto k : DispatchKeySet(DispatchKeySet::FULL)) {
     auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), k);
diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h
index f7fcbba70109..ea6d53a72e37 100644
--- a/aten/src/ATen/core/dispatch/OperatorEntry.h
+++ b/aten/src/ATen/core/dispatch/OperatorEntry.h
@@ -167,7 +167,7 @@ class TORCH_API OperatorEntry final {
     assertSignatureIsCorrect(CppSignature::make<FuncType>(), fn_has_symint<FuncType>::value);
   }
 
-  void assertSignatureIsCorrect(const CppSignature call_signature, bool has_symint) const;
+  void assertSignatureIsCorrect(const CppSignature& call_signature, bool has_symint) const;
 
   [[noreturn]] void reportError(DispatchKey dispatchKey) const;
 
diff --git a/aten/src/ATen/core/dynamic_type.cpp b/aten/src/ATen/core/dynamic_type.cpp
index e22edc14a8a0..128b06bcbb69 100644
--- a/aten/src/ATen/core/dynamic_type.cpp
+++ b/aten/src/ATen/core/dynamic_type.cpp
@@ -38,7 +38,7 @@ std::string DynamicType::str() const {
   std::string ret = "Dynamic<";
   ret += std::to_string(static_cast<DynamicTypeBits>(tag_));
   ret += ">";
-  if (tag_ != Tag::Class && arguments_.elems.size() > 0) {
+  if (tag_ != Tag::Class && !arguments_.elems.empty()) {
     ret += "[";
     for (const auto& arg : arguments_.elems) {
       if (arg.label) {
@@ -293,6 +293,8 @@ TypePtr DynamicType::fallback() const {
       return RRefType::create(arguments_.elems[0].ty->fallback());
     case Tag::Future:
       return FutureType::create(arguments_.elems[0].ty->fallback());
+    case Tag::Await:
+      return AwaitType::create(arguments_.elems[0].ty->fallback());
     case Tag::Any:
       return AnyType::get();
   }
diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h
index 1f649c8217cb..37ffd6224142 100644
--- a/aten/src/ATen/core/dynamic_type.h
+++ b/aten/src/ATen/core/dynamic_type.h
@@ -56,6 +56,7 @@ constexpr DynamicTypeBits kDynamicClassTypeBit = DYNAMIC_TYPE_BIT(10);
   _(AnyEnum, DYNAMIC_TYPE_BIT(20), 1)                                        \
   _(RRef, DYNAMIC_TYPE_BIT(21), 0)                                           \
   _(Future, DYNAMIC_TYPE_BIT(22), 0)                                         \
+  _(Await, DYNAMIC_TYPE_BIT(23), 0)                                          \
   _(Any, 0xffffffff, 1)
 
 #define FORALL_DYNAMIC_TYPES_FAKE(_) \
diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp
index 1c1101466f71..7463e283ea9f 100644
--- a/aten/src/ATen/core/function_schema.cpp
+++ b/aten/src/ATen/core/function_schema.cpp
@@ -109,7 +109,7 @@ c10::optional<AliasTypeSet> FunctionSchema::mapTypeToAliasTypeSet(const TypePtr&
               (*maybe_inner_types).end());
         }
       }
-      if (mutable_types.size() == 0) {
+      if (mutable_types.empty()) {
         return c10::nullopt;
       }
       return mutable_types;
@@ -130,7 +130,7 @@ c10::optional<AliasTypeSet> FunctionSchema::mapTypeToAliasTypeSet(const TypePtr&
               (*maybe_inner_types).end());
         }
       }
-      if (mutable_types.size() == 0) {
+      if (mutable_types.empty()) {
         return c10::nullopt;
       }
       return {AliasTypeSet{TupleType::create(std::move(mutable_types))}};
diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h
index 7b7faa7a62dd..3daefc1de2e5 100644
--- a/aten/src/ATen/core/function_schema_inl.h
+++ b/aten/src/ATen/core/function_schema_inl.h
@@ -11,7 +11,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)
   // it is simpler for now to work directly on this schema
 
   out << schema.name();
-  if (schema.overload_name() != "") {
+  if (!schema.overload_name().empty()) {
     out << "." << schema.overload_name();
   }
   out << "(";
@@ -27,7 +27,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)
   }
 
   if(schema.is_vararg()) {
-    if(schema.arguments().size() > 0)
+    if(!schema.arguments().empty())
       out << ", ";
     out << "...";
   }
@@ -51,7 +51,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)
    */
   bool need_paren = !(
     (returns.size() == 1 && !schema.is_varret()) ||
-    (returns.size() == 0 && schema.is_varret()));
+    (returns.empty() && schema.is_varret()));
 
   if (returns.size() == 1 && !schema.is_varret()) {
     std::stringstream return_ss;
@@ -69,7 +69,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)
     // PR (https://github.com/pytorch/pytorch/pull/23204) has more context about
     // this. test_serialize_and_deserialize (https://github.com/pytorch/pytorch/blob/master/test/test_function_schema.py#L15)
     // also covers this case.
-    if (return_str.size() > 0 && return_str.front() == '(') {
+    if (!return_str.empty() && return_str.front() == '(') {
       need_paren = true;
     }
   }
@@ -84,7 +84,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema)
     out << returns.at(i);
   }
   if (schema.is_varret()) {
-    if (returns.size() != 0) {
+    if (!returns.empty()) {
       out << ", ";
     }
     out << "...";
diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h
index 2abc6217516d..b3837a54485e 100644
--- a/aten/src/ATen/core/interned_strings.h
+++ b/aten/src/ATen/core/interned_strings.h
@@ -163,7 +163,11 @@ namespace c10 {
   _(aten, is_scripting)              \
   _(aten, _unwrap_optional)          \
   _(prim, fork)                      \
+  _(prim, awaitable)                 \
   _(prim, forkClosure)               \
+  _(prim, awaitableClosure)          \
+  _(prim, awaitable_nowait)          \
+  _(prim, awaitable_wait)            \
   _(prim, RaiseException)            \
   _(prim, Closure)                   \
   _(prim, CreateObject)              \
diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp
index 4062792695c8..22182f98395d 100644
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@@ -108,6 +108,8 @@ c10::TypePtr IValue::TagType<c10::Type>::get(const IValue& v) {
       }
       case Tag::GenericList:
         return ListType::create(v.toList().elementType());
+      case Tag::Await:
+        return AwaitType::create(v.toAwait()->elementType());
       case Tag::Future:
         return FutureType::create(v.toFuture()->elementType());
       case Tag::RRef:
@@ -235,6 +237,7 @@ void IValue::getSubValues(HashAliasedIValues& subValues) const {
       break;
     }
     case Tag::Future:
+    case Tag::Await:
     case Tag::Device:
     case Tag::Uninitialized:
     case Tag::Capsule:
@@ -325,6 +328,7 @@ IValue IValue::equals(const IValue& rhs) const {
       return rhs.isList() && lhs.toList() == rhs.toList();
     case Tag::Blob:
     case Tag::Future:
+    case Tag::Await:
     case Tag::RRef:
     case Tag::Object:
     case Tag::PyObject:
@@ -375,6 +379,7 @@ size_t IValue::hash(const IValue& v) {
     case Tag::GenericList:
     case Tag::Blob:
     case Tag::Future:
+    case Tag::Await:
     case Tag::RRef:
     case Tag::Object:
     case Tag::PyObject:
@@ -490,7 +495,7 @@ std::ostream& printMaybeAnnotatedList(
     const IValue& the_list,
     IValueFormatter formatter) {
   auto list_elem_type = the_list.type()->containedType(0);
-  if (the_list.toListRef().size() == 0 ||
+  if (the_list.toListRef().empty() ||
       !elementTypeCanBeInferredFromMembers(list_elem_type)) {
     out << "annotate(" << the_list.type<c10::Type>()->annotation_str() << ", ";
     printList(out, the_list.toListRef(), "[", "]", std::move(formatter));
@@ -531,7 +536,7 @@ std::ostream& printMaybeAnnotatedDict(
     const IValue& the_dict,
     IValueFormatter formatter) {
   auto value_type = the_dict.type()->castRaw<DictType>()->getValueType();
-  if (the_dict.toGenericDict().size() == 0 ||
+  if (the_dict.toGenericDict().empty() ||
       !elementTypeCanBeInferredFromMembers(value_type)) {
     out << "annotate(" << the_dict.type<c10::Type>()->annotation_str() << ",";
     printDict(out, the_dict.toGenericDict(), std::move(formatter)) << ")";
@@ -805,6 +810,8 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
       return out << "RRef";
     case IValue::Tag::Future:
       return out << "Future";
+    case IValue::Tag::Await:
+      return out << "Await";
     case IValue::Tag::Uninitialized:
       return out << "Uninitialized";
     case IValue::Tag::Device:
@@ -872,7 +879,7 @@ IValue IValue::deepcopy(
     case IValue::Tag::Tuple: {
       std::vector<IValue> copied_tuple;
       for (const auto& e : toTupleRef().elements()) {
-        copied_tuple.push_back(e.deepcopy(memo));
+        copied_tuple.emplace_back(e.deepcopy(memo));
       }
       copy = IValue(ivalue::Tuple::create(std::move(copied_tuple)));
     }
@@ -1060,11 +1067,11 @@ std::vector<c10::weak_intrusive_ptr<c10::StorageImpl>> ivalue::Future::extractSt
       if (tensor.is_sparse()) {
         // Sparse tensor is indices and values. Both are tensors
         // and contain storage.
-        weakStorageImpls.push_back(tensor.indices().storage().getWeakStorageImpl());
-        weakStorageImpls.push_back(tensor.values().storage().getWeakStorageImpl());
+        weakStorageImpls.emplace_back(tensor.indices().storage().getWeakStorageImpl());
+        weakStorageImpls.emplace_back(tensor.values().storage().getWeakStorageImpl());
       } else {
         // A dense/strided tensor contains 1 storage
-        weakStorageImpls.push_back(tensor.storage().getWeakStorageImpl());
+        weakStorageImpls.emplace_back(tensor.storage().getWeakStorageImpl());
       }
     }
   } else {
@@ -1074,7 +1081,7 @@ std::vector<c10::weak_intrusive_ptr<c10::StorageImpl>> ivalue::Future::extractSt
     value.getSubValues(sub_values);
     for (const at::IValue& sub_value : sub_values) {
       if (sub_value.isTensor()) {
-        weakStorageImpls.push_back(sub_value.toTensor().storage().getWeakStorageImpl());
+        weakStorageImpls.emplace_back(sub_value.toTensor().storage().getWeakStorageImpl());
       }
     }
   }
@@ -1098,7 +1105,7 @@ TORCH_API intrusive_ptr<ivalue::Future> collectAll(
   };
 
   auto ctx = std::make_shared<Ctx>(std::move(srcs));
-  if (ctx->srcFutures.size() == 0) {
+  if (ctx->srcFutures.empty()) {
     ctx->dstFuture->markCompleted(ctx->asIvalue);
   } else {
     auto typePtr = ctx->srcFutures.get(0)->elementType();
diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h
index 365b93d86797..82d99a0a8d6a 100644
--- a/aten/src/ATen/core/ivalue.h
+++ b/aten/src/ATen/core/ivalue.h
@@ -54,6 +54,7 @@ TORCH_API IValueComparator getGreaterThanComparator(const IValue& v);
 namespace ivalue {
 struct Tuple;
 struct Future;
+struct Await;
 struct ConstantString;
 struct GenericDict;
 struct Object;
@@ -79,7 +80,7 @@ struct StreamData3Holder : c10::intrusive_ptr_target {
     StreamData3Holder(struct c10::StreamData3 d) {
       val = d;
     }
-    StreamData3Holder() {}
+    StreamData3Holder() = delete;
     struct c10::StreamData3 val;
 };
 
@@ -168,6 +169,7 @@ struct Capsule {
   _(GenericList)             \
   _(GenericDict)             \
   _(Future)                  \
+  _(Await)                   \
   _(Device)                  \
   _(Stream)                  \
   _(Object)                  \
@@ -551,6 +553,13 @@ struct TORCH_API IValue final {
   c10::intrusive_ptr<ivalue::Future> toFuture() &&;
   c10::intrusive_ptr<ivalue::Future> toFuture() const&;
 
+  IValue(c10::intrusive_ptr<ivalue::Await> v);
+  bool isAwait() const {
+    return Tag::Await == tag;
+  }
+  c10::intrusive_ptr<ivalue::Await> toAwait() &&;
+  c10::intrusive_ptr<ivalue::Await> toAwait() const&;
+
   // RRef
   IValue(c10::intrusive_ptr<c10::RRefInterface> v);
   bool isRRef() const {
@@ -1176,10 +1185,12 @@ struct TORCH_API IValue final {
         return true;
       case Tag::Future:
         return true;
+      case Tag::Await:
+        return true;
       case Tag::Device:
         return false;
       case Tag::Stream:
-        return false;
+        return true;
       case Tag::Object:
         return true;
       case Tag::PyObject:
@@ -1250,12 +1261,12 @@ struct TORCH_API IValue final {
   friend MaybeOwnedTraits<IValue>;
 
   Payload payload;
-  Tag tag;
+  Tag tag{IValue::Tag::None};
   friend struct WeakIValue;
 };
 
 struct TORCH_API WeakIValue final {
-  WeakIValue() : tag(IValue::Tag::None), is_intrusive_ptr(false) {}
+  WeakIValue() = default;
 
   WeakIValue(const WeakIValue& rhs)
       : payload(rhs.payload),
@@ -1367,8 +1378,8 @@ struct TORCH_API WeakIValue final {
  private:
   using Payload = IValue::Payload::TriviallyCopyablePayload;
   Payload payload;
-  IValue::Tag tag;
-  bool is_intrusive_ptr;
+  IValue::Tag tag{IValue::Tag::None};
+  bool is_intrusive_ptr{false};
 };
 
 // An owning pointer to a type. When the type is class type, it requires a pair
diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h
index 47067516a6ae..c16ff79c978a 100644
--- a/aten/src/ATen/core/ivalue_inl.h
+++ b/aten/src/ATen/core/ivalue_inl.h
@@ -92,6 +92,14 @@ inline c10::intrusive_ptr<ivalue::Future> IValue::toFuture() const& {
   AT_ASSERT(isFuture(), "Expected Future but got ", tagKind());
   return toIntrusivePtr<ivalue::Future>();
 }
+inline c10::intrusive_ptr<ivalue::Await> IValue::toAwait() && {
+  AT_ASSERT(isAwait(), "Expected Await but got ", tagKind());
+  return moveToIntrusivePtr<ivalue::Await>();
+}
+inline c10::intrusive_ptr<ivalue::Await> IValue::toAwait() const& {
+  AT_ASSERT(isAwait(), "Expected Await but got ", tagKind());
+  return toIntrusivePtr<ivalue::Await>();
+}
 inline c10::intrusive_ptr<c10::RRefInterface> IValue::toRRef() && {
   AT_ASSERT(isRRef(), "Expected RRef but got ", tagKind());
   return moveToIntrusivePtr<c10::RRefInterface>();
@@ -944,7 +952,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
           "Skipping setting following error on the Future since "
           "it is already marked completed (this is not necessarily "
           "an error):\n",
-          tryRetrieveErrorMessageInternal(eptr));
+          tryRetrieveErrorMessageInternal(std::move(eptr)));
       if (eptr_) {
         msg += c10::str(
             ", \nOriginal exception:\n",
@@ -1199,7 +1207,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
   // Tries to retrieve the error message from std::exception_ptr.
   std::string tryRetrieveErrorMessageInternal(std::exception_ptr eptr) const {
     try {
-      std::rethrow_exception(eptr);
+      std::rethrow_exception(std::move(eptr));
     } catch (const std::exception& e) {
       return e.what();
     } catch (...) {
@@ -1364,6 +1372,78 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
   const std::vector<c10::Device> devices_;
 };
 
+struct C10_EXPORT ivalue::Await final : c10::intrusive_ptr_target {
+ private:
+  explicit Await(TypePtr elType, std::function<IValue()> fn)
+      : elType_(std::move(elType)), type_(AwaitType::create(elType_)), fn_(std::move(fn)) {}
+
+  explicit Await(TypePtr elType) : elType_(std::move(elType)), type_(AwaitType::create(elType_)) { }
+
+  friend c10::intrusive_ptr<Await>;
+
+ public:
+  Await(const Await&) = delete;
+  Await(Await&&) = delete;
+  Await& operator=(const Await&) = delete;
+  Await& operator=(Await&&) = delete;
+
+  IValue wait() {
+    if (!completed_) {
+      TORCH_CHECK(fn_, "Incompleted Await: fn can't be None");
+      value_ = fn_();
+      completed_ = true;
+      args_ = {};
+    }
+    return value_;
+  }
+
+  IValue value() {
+    TORCH_CHECK(completed_, "Await must be completed");
+    return value_;
+  }
+
+  void setFn(std::function<IValue()> fn) {
+    fn_ = std::move(fn);
+  }
+
+  bool completed() {
+    return completed_;
+  }
+
+  void markCompleted(IValue value) {
+    value_ = std::move(value);
+    completed_ = true;
+  }
+
+  TORCH_API friend std::ostream& operator<<(
+      std::ostream& out,
+      const Await& v);
+
+  TypePtr elementType() const {
+    return elType_;
+  }
+
+  TypePtr type() const {
+    return type_;
+  }
+
+  void setArgs(std::vector<IValue> args) {
+    args_ = std::move(args);
+  }
+
+  std::vector<IValue>& args() {
+    return args_;
+  }
+
+ private:
+  TypePtr elType_;
+  TypePtr type_;
+  std::vector<IValue> args_;
+  std::function<IValue()> fn_;
+  IValue value_;
+  bool completed_{};
+};
+
 // Input is a list of Futures with the same target type.
 // Output is a Future to the List of completed Futures.
 TORCH_API intrusive_ptr<ivalue::Future> collectAll(
@@ -1510,7 +1590,7 @@ struct ivalue::PyObjectHolder : c10::intrusive_ptr_target {
   virtual std::string toStr() = 0;
   virtual std::vector<at::Tensor> extractTensors() = 0;
 
-  virtual ~PyObjectHolder()= default;
+  ~PyObjectHolder() override = default;
 };
 
 struct ivalue::EnumHolder : c10::intrusive_ptr_target {
@@ -1621,6 +1701,7 @@ DEFINE_TO(c10::intrusive_ptr<ivalue::Tuple>, toTuple)
 DEFINE_TO(std::string, toStringRef)
 DEFINE_TO(c10::string_view, toStringView)
 DEFINE_TO(c10::intrusive_ptr<ivalue::Future>, toFuture)
+DEFINE_TO(c10::intrusive_ptr<ivalue::Await>, toAwait)
 DEFINE_TO(c10::intrusive_ptr<c10::RRefInterface>, toRRef)
 DEFINE_TO(c10::intrusive_ptr<at::Quantizer>, toQuantizer)
 DEFINE_TO(IValue, toIValue)
@@ -2182,6 +2263,11 @@ inline IValue::IValue(c10::intrusive_ptr<ivalue::Future> v)
   payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release());
 }
 
+inline IValue::IValue(c10::intrusive_ptr<ivalue::Await> v)
+    : tag(Tag::Await) {
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release());
+}
+
 inline IValue::IValue(c10::intrusive_ptr<c10::RRefInterface> v)
     : tag(Tag::RRef) {
   payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release());
diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h
index 7b81edce0848..b4d58b03f4c5 100644
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@@ -16,6 +16,7 @@
 #include <ostream>
 #include <sstream>
 #include <type_traits>
+#include <utility>
 
 namespace torch {
 namespace jit {
@@ -239,7 +240,7 @@ struct TORCH_API OptionalType : public UnionType {
 
   std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
     std::stringstream ss;
-    ss << "Optional[" << getElementType()->annotation_str(printer) << "]";
+    ss << "Optional[" << getElementType()->annotation_str(std::move(printer)) << "]";
     return ss.str();
   }
 };
@@ -906,7 +907,7 @@ struct TORCH_API ListType
 
   std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
     std::stringstream ss;
-    ss << "List[" << getElementType()->annotation_str(printer) << "]";
+    ss << "List[" << getElementType()->annotation_str(std::move(printer)) << "]";
     return ss.str();
   }
 };
@@ -1000,8 +1001,8 @@ struct TORCH_API DictType : public SharedType {
 
   std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
     std::stringstream ss;
-    ss << "Dict[" << getKeyType()->annotation_str(printer) << ", "
-       << getValueType()->annotation_str(printer) << "]";
+    ss << "Dict[" << getKeyType()->annotation_str(printer) << ", ";
+    ss << getValueType()->annotation_str(std::move(printer)) << "]";
     return ss.str();
   }
 
@@ -1046,7 +1047,49 @@ struct TORCH_API FutureType
 
   std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
     std::stringstream ss;
-    ss << "Future[" << getElementType()->annotation_str(printer) << "]";
+    ss << "Future[" << getElementType()->annotation_str(std::move(printer)) << "]";
+    return ss.str();
+  }
+};
+
+struct AwaitType;
+using AwaitTypePtr = std::shared_ptr<AwaitType>;
+
+struct TORCH_API AwaitType
+    : public SingleElementType<TypeKind::AwaitType, AwaitType> {
+  friend struct Type;
+  template <typename... T>
+  static AwaitTypePtr create(TypePtr elem) {
+    return AwaitTypePtr(
+        new AwaitType(std::move(elem))); // NOLINT(modernize-make-shared)
+  }
+
+  std::string str() const override {
+    std::stringstream ss;
+    ss << "Await(" << getElementType()->str() << ")";
+    return ss.str();
+  }
+  TypePtr createWithContained(
+      std::vector<TypePtr> contained_types) const override {
+    return create(std::move(contained_types.at(0)));
+  }
+
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override {
+    if (Type::isSubtypeOfExt(rhs, why_not)) {
+      return true;
+    }
+    if (auto rhs_ = rhs.castRaw<AwaitType>()) {
+      return getElementType()->isSubtypeOfExt(*rhs_->getElementType(), why_not);
+    }
+    return false;
+  }
+
+ private:
+  AwaitType(TypePtr elem) : SingleElementType(std::move(elem)) {}
+
+  std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
+    std::stringstream ss;
+    ss << "Await[" << getElementType()->annotation_str(printer) << "]";
     return ss.str();
   }
 };
@@ -1078,7 +1121,7 @@ struct TORCH_API RRefType
 
   std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
     std::stringstream ss;
-    ss << "RRef[" << getElementType()->annotation_str(printer) << "]";
+    ss << "RRef[" << getElementType()->annotation_str(std::move(printer)) << "]";
     return ss.str();
   }
 };
diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h
index beb553eb935a..c777bafa48a4 100644
--- a/aten/src/ATen/core/jit_type_base.h
+++ b/aten/src/ATen/core/jit_type_base.h
@@ -3,6 +3,7 @@
 #include <functional>
 #include <memory>
 #include <string>
+#include <utility>
 
 #include <ATen/core/qualified_name.h>
 #include <ATen/core/type_ptr.h>
@@ -29,6 +30,7 @@ namespace c10 {
   _(FloatType)              \
   _(ComplexType)            \
   _(FutureType)             \
+  _(AwaitType)              \
   _(RRefType)               \
   _(IntType)                \
   _(NoneType)               \
@@ -451,7 +453,7 @@ struct TORCH_API Type {
         return *renamed;
       }
     }
-    return annotation_str_impl(printer);
+    return annotation_str_impl(std::move(printer));
   }
   std::string annotation_str() const {
     // Overload instead of define a default value for `printer` to help
diff --git a/aten/src/ATen/core/op_registration/infer_schema.cpp b/aten/src/ATen/core/op_registration/infer_schema.cpp
index e9e93a2556e0..dd6851b2ba99 100644
--- a/aten/src/ATen/core/op_registration/infer_schema.cpp
+++ b/aten/src/ATen/core/op_registration/infer_schema.cpp
@@ -30,17 +30,17 @@ std::vector<Argument> createArgumentVector(c10::ArrayRef<ArgumentDef> args) {
 }
 // This is intentionally a separate function and in a .cpp file
 // because then the template is smaller and that benefits binary size
-C10_EXPORT FunctionSchema make_function_schema(std::string&& name, std::string&& overload_name, c10::ArrayRef<ArgumentDef> arguments, c10::ArrayRef<ArgumentDef> returns) {
+FunctionSchema make_function_schema(std::string&& name, std::string&& overload_name, c10::ArrayRef<ArgumentDef> arguments, c10::ArrayRef<ArgumentDef> returns) {
   return FunctionSchema(std::move(name), std::move(overload_name), createArgumentVector(arguments), createArgumentVector(returns));
 }
 
-C10_EXPORT FunctionSchema make_function_schema(c10::ArrayRef<ArgumentDef> arguments, c10::ArrayRef<ArgumentDef> returns) {
+FunctionSchema make_function_schema(c10::ArrayRef<ArgumentDef> arguments, c10::ArrayRef<ArgumentDef> returns) {
   return make_function_schema("", "", arguments, returns);
 }
 }
 }
 
-C10_EXPORT c10::optional<std::string> findSchemaDifferences(const FunctionSchema& lhs, const FunctionSchema& rhs) {
+c10::optional<std::string> findSchemaDifferences(const FunctionSchema& lhs, const FunctionSchema& rhs) {
   if (lhs.arguments().size() != rhs.arguments().size()) {
     return "The number of arguments is different. " + guts::to_string(lhs.arguments().size()) +
              " vs " + guts::to_string(rhs.arguments().size()) + ".";
diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h
index 2938e2a8d564..e4c7e0e12ce0 100644
--- a/aten/src/ATen/core/op_registration/infer_schema.h
+++ b/aten/src/ATen/core/op_registration/infer_schema.h
@@ -108,8 +108,8 @@ struct createSingleReturn {
   }
 };
 
-C10_API FunctionSchema make_function_schema(std::string&& name, std::string&& overload_name, c10::ArrayRef<ArgumentDef> arguments, c10::ArrayRef<ArgumentDef> returns);
-C10_API FunctionSchema make_function_schema(c10::ArrayRef<ArgumentDef> arguments, c10::ArrayRef<ArgumentDef> returns);
+TORCH_API FunctionSchema make_function_schema(std::string&& name, std::string&& overload_name, c10::ArrayRef<ArgumentDef> arguments, c10::ArrayRef<ArgumentDef> returns);
+TORCH_API FunctionSchema make_function_schema(c10::ArrayRef<ArgumentDef> arguments, c10::ArrayRef<ArgumentDef> returns);
 
 /// Creates a `FunctionSchema` object from a `FunctionTraits` type for a
 /// function. Flattens std::tuple returns into multiple return types
diff --git a/aten/src/ATen/core/op_registration/op_registration.cpp b/aten/src/ATen/core/op_registration/op_registration.cpp
index 252ed951a19d..bfce95da1c60 100644
--- a/aten/src/ATen/core/op_registration/op_registration.cpp
+++ b/aten/src/ATen/core/op_registration/op_registration.cpp
@@ -13,8 +13,10 @@ void build_feature_required_feature_not_available(const char* feature) {
 }
 }
 
-static_assert(std::is_nothrow_move_constructible<c10::optional<RegistrationHandleRAII>>::value, "");
-static_assert(std::is_nothrow_move_assignable<c10::optional<RegistrationHandleRAII>>::value, "");
+static_assert(std::is_nothrow_move_constructible<
+              c10::optional<RegistrationHandleRAII>>::value);
+static_assert(std::is_nothrow_move_assignable<
+              c10::optional<RegistrationHandleRAII>>::value);
 
 void RegisterOperators::checkSchemaAndRegisterOp_(Options&& options) {
   TORCH_CHECK(options.schemaOrName_.has_value(), "In operator registration: Tried to register an operator without specifying a schema or operator name.");
@@ -55,7 +57,7 @@ void RegisterOperators::checkSchemaAndRegisterOp_(Options&& options) {
 }
 
 c10::FunctionSchema RegisterOperators::inferSchemaFromKernels_(const OperatorName& opName, const RegisterOperators::Options& options) {
-  TORCH_CHECK(options.kernels.size() > 0, "Cannot infer operator schema in registration of operator ", opName, " because there is no kernel specified.");
+  TORCH_CHECK(!options.kernels.empty(), "Cannot infer operator schema in registration of operator ", opName, " because there is no kernel specified.");
 
   c10::optional<FunctionSchema> inferred_schema = c10::nullopt;
   for (const auto& kernel : options.kernels) {
diff --git a/aten/src/ATen/core/operator_name.cpp b/aten/src/ATen/core/operator_name.cpp
index 11057106f7a6..a340badbab76 100644
--- a/aten/src/ATen/core/operator_name.cpp
+++ b/aten/src/ATen/core/operator_name.cpp
@@ -12,7 +12,7 @@ std::string toString(const OperatorName& opName) {
 
 std::ostream& operator<<(std::ostream& os, const OperatorName& opName) {
   os << opName.name;
-  if (opName.overload_name.size() != 0) {
+  if (!opName.overload_name.empty()) {
     os << "." << opName.overload_name;
   }
   return os;
diff --git a/aten/src/ATen/core/qualified_name.h b/aten/src/ATen/core/qualified_name.h
index ee880e9306b6..324fbb73a1c1 100644
--- a/aten/src/ATen/core/qualified_name.h
+++ b/aten/src/ATen/core/qualified_name.h
@@ -22,7 +22,7 @@ struct QualifiedName {
     while (pos != std::string::npos) {
       auto atom = name.substr(startSearchFrom, pos - startSearchFrom);
       TORCH_INTERNAL_ASSERT(
-          atom.size() > 0, "Invalid name for qualified name: '", name, "'");
+          !atom.empty(), "Invalid name for qualified name: '", name, "'");
       atoms_.push_back(std::move(atom));
       startSearchFrom = pos + 1;
       pos = name.find(delimiter_, startSearchFrom);
@@ -30,7 +30,7 @@ struct QualifiedName {
 
     auto finalAtom = name.substr(startSearchFrom, pos - startSearchFrom);
     TORCH_INTERNAL_ASSERT(
-        finalAtom.size() > 0, "Invalid name for qualified name: '", name, "'");
+        !finalAtom.empty(), "Invalid name for qualified name: '", name, "'");
     atoms_.emplace_back(std::move(finalAtom));
 
     cacheAccessors();
@@ -134,7 +134,7 @@ struct QualifiedName {
       prefix_ = join(delimiter_, prefixView);
     }
 
-    if (atoms_.size() >= 1) {
+    if (!atoms_.empty()) {
       name_ = atoms_.back();
     }
   }
diff --git a/aten/src/ATen/core/rref_interface.h b/aten/src/ATen/core/rref_interface.h
index 95f7ff9e9e2f..cefb29c08ddc 100644
--- a/aten/src/ATen/core/rref_interface.h
+++ b/aten/src/ATen/core/rref_interface.h
@@ -19,7 +19,7 @@ class C10_EXPORT RRefInterface : public c10::intrusive_ptr_target {
   RRefInterface(RRefInterface&& other) = delete;
   RRefInterface& operator=(RRefInterface&& other) = delete;
 
-  virtual ~RRefInterface() = default;
+  ~RRefInterface() override = default;
 
   // returns the worker id of the owner
   virtual worker_id_t owner() const = 0;
diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp
index 34a43fa8ddc7..96f6c22de334 100644
--- a/aten/src/ATen/core/type.cpp
+++ b/aten/src/ATen/core/type.cpp
@@ -431,7 +431,7 @@ c10::optional<TypePtr> unifyTypeList(
     std::ostream& why_not,
     bool default_to_union,
     TypePtr type_hint) {
-  if (elements.size() == 0) {
+  if (elements.empty()) {
     why_not << "Cannot get unified type from empty list";
     return c10::nullopt;
   }
@@ -534,6 +534,19 @@ MatchTypeReturn matchTypeVariables(
       ss << "Cannot match a future to " << actual->repr_str();
       return ss.str();
     }
+  } else if (auto lt_formal = formal->castRaw<AwaitType>()) {
+    if (auto lt_actual = actual->castRaw<AwaitType>()) {
+      auto innerMatch = matchTypeVariables(
+          lt_formal->getElementType(), lt_actual->getElementType(), type_env);
+      if (!innerMatch.success()) {
+        return innerMatch;
+      }
+      return MatchTypeReturn::Success();
+    } else {
+      std::stringstream ss;
+      ss << "Cannot match an await to " << actual->repr_str();
+      return ss.str();
+    }
   } else if (auto lt_formal = formal->castRaw<RRefType>()) {
     if (auto lt_actual = actual->castRaw<RRefType>()) {
       auto innerMatch = matchTypeVariables(
@@ -879,7 +892,7 @@ std::string TupleType::annotation_str_impl(TypePrinter printer) const {
     ss << name()->qualifiedName();
   } else {
     ss << "Tuple[";
-    if (elements().size() == 0) {
+    if (elements().empty()) {
       // `typing.Tuple` special-cases the annotation syntax for empty tuple
       // with `typing.Tuple[()]`. See
       // https://docs.python.org/3/library/typing.html#typing.Tuple
diff --git a/aten/src/ATen/core/type_ptr.h b/aten/src/ATen/core/type_ptr.h
index cfe7d8dac251..d14c3b8a4564 100644
--- a/aten/src/ATen/core/type_ptr.h
+++ b/aten/src/ATen/core/type_ptr.h
@@ -38,7 +38,7 @@ class SingletonTypePtr {
   }
 
  private:
-  T* repr_;
+  T* repr_{nullptr};
 };
 
 template <typename T, typename U>
diff --git a/aten/src/ATen/core/union_type.cpp b/aten/src/ATen/core/union_type.cpp
index d36ac75a9728..ead162438fd4 100644
--- a/aten/src/ATen/core/union_type.cpp
+++ b/aten/src/ATen/core/union_type.cpp
@@ -48,7 +48,7 @@ c10::optional<TypePtr> subtractTypeSetFrom(std::vector<TypePtr>& to_subtract, Ar
                 return !should_subtract(t);
               });
 
-  if (types.size() == 0) {
+  if (types.empty()) {
     return c10::nullopt;
   } else if (types.size() == 1) {
     return types[0];
@@ -162,7 +162,7 @@ void standardizeVectorForUnion(std::vector<TypePtr>* to_flatten) {
                         "passed a `nullptr`");
   std::vector<TypePtr> to_fill;
   standardizeVectorForUnion(*to_flatten, &to_fill);
-  *to_flatten = to_fill;
+  *to_flatten = std::move(to_fill);
 }
 
 OptionalType::OptionalType(TypePtr contained)
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
index f2ad65cf0591..19107b1a2c2d 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
@@ -367,17 +367,24 @@ template <> Vectorized<c10::complex<double>> inline operator*(const Vectorized<c
 
 template <> Vectorized<c10::complex<double>> inline operator/(const Vectorized<c10::complex<double>> &a, const Vectorized<c10::complex<double>> &b) {
   //re + im*i = (a + bi)  / (c + di)
-  //re = (ac + bd)/abs_2()
-  //im = (bc - ad)/abs_2()
-  const __m256d sign_mask = _mm256_setr_pd(-0.0, 0.0, -0.0, 0.0);
-  auto ac_bd = _mm256_mul_pd(a, b);         //ac       bd
+  auto mask = _mm256_set1_pd(-0.f);
+  auto fabs_cd = _mm256_andnot_pd(mask, b);     // |c|    |d|
+  auto fabs_dc = _mm256_permute_pd(fabs_cd, 0x05);   // |d|    |c|
+  auto scale = _mm256_div_pd(_mm256_set1_pd(1.0f), _mm256_max_pd(fabs_cd, fabs_dc));  // 1/sc     1/sc
+  auto a2 = _mm256_mul_pd(a, scale);         // a/sc     b/sc
+  auto b2 = _mm256_mul_pd(b, scale);         // c/sc     d/sc
+  auto acbd2 = _mm256_mul_pd(a2, b2);
 
-  auto d_c = _mm256_permute_pd(b, 0x05);    //d        c
-  d_c = _mm256_xor_pd(sign_mask, d_c);      //-d       c
-  auto ad_bc = _mm256_mul_pd(a, d_c);       //-ad      bc
-
-  auto re_im = _mm256_hadd_pd(ac_bd, ad_bc);//ac + bd  bc - ad
-  return _mm256_div_pd(re_im, b.abs_2_());
+  const __m256d sign_mask = _mm256_setr_pd(-0.0, 0.0, -0.0, 0.0);
+  auto dc2 = _mm256_permute_pd(b2, 0x05);    // d/sc         c/sc
+  dc2 = _mm256_xor_pd(sign_mask, dc2);       // -d/|c,d|        c/sc
+  auto adbc2 = _mm256_mul_pd(a2, dc2);       //-ad/sc^2      bc/sc^2
+  auto res2 = _mm256_hadd_pd(acbd2, adbc2);  //(ac+bd)/sc^2  (bc-ad)/sc^2
+
+  // get the denominator
+  auto denom2 = Vectorized<c10::complex<double>>(b2).abs_2_();  // (c^2+d^2)/sc^2   (c^2+d^2)/sc^2
+  res2 = _mm256_div_pd(res2, denom2);
+  return res2;
 }
 
 // reciprocal. Implement this here so we can use multiplication.
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
index 8a865cad7501..d478214a5923 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
@@ -402,18 +402,25 @@ template <> Vectorized<c10::complex<float>> inline operator*(const Vectorized<c1
 
 template <> Vectorized<c10::complex<float>> inline operator/(const Vectorized<c10::complex<float>> &a, const Vectorized<c10::complex<float>> &b) {
   //re + im*i = (a + bi)  / (c + di)
-  //re = (ac + bd)/abs_2()
-  //im = (bc - ad)/abs_2()
-  const __m256 sign_mask = _mm256_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0);
-  auto ac_bd = _mm256_mul_ps(a, b);         //ac       bd
+  auto mask = _mm256_set1_ps(-0.f);
+  auto fabs_cd = _mm256_andnot_ps(mask, b);     // |c|    |d|
+  auto fabs_dc = _mm256_permute_ps(fabs_cd, 0xB1);   // |d|    |c|
+  auto scale = _mm256_rcp_ps(_mm256_max_ps(fabs_cd, fabs_dc));  // 1/sc     1/sc
+  auto a2 = _mm256_mul_ps(a, scale);         // a/sc     b/sc
+  auto b2 = _mm256_mul_ps(b, scale);         // c/sc     d/sc
+  auto acbd2 = _mm256_mul_ps(a2, b2);
 
-  auto d_c = _mm256_permute_ps(b, 0xB1);    //d        c
-  d_c = _mm256_xor_ps(sign_mask, d_c);      //-d       c
-  auto ad_bc = _mm256_mul_ps(a, d_c);       //-ad      bc
-
-  auto re_im = _mm256_hadd_ps(ac_bd, ad_bc);//ac + bd  bc - ad
-  re_im = _mm256_permute_ps(re_im, 0xD8);
-  return _mm256_div_ps(re_im, b.abs_2_());
+  const __m256 sign_mask = _mm256_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0);
+  auto dc2 = _mm256_permute_ps(b2, 0xB1);    // d/sc         c/sc
+  dc2 = _mm256_xor_ps(sign_mask, dc2);       // -d/|c,d|        c/sc
+  auto adbc2 = _mm256_mul_ps(a2, dc2);       //-ad/sc^2      bc/sc^2
+  auto res2 = _mm256_hadd_ps(acbd2, adbc2);  //(ac+bd)/sc^2  (bc-ad)/sc^2
+  res2 = _mm256_permute_ps(res2, 0xD8);
+
+  // get the denominator
+  auto denom2 = Vectorized<c10::complex<float>>(b2).abs_2_();  // (c^2+d^2)/sc^2   (c^2+d^2)/sc^2
+  res2 = _mm256_div_ps(res2, denom2);
+  return res2;
 }
 
 // reciprocal. Implement this here so we can use multiplication.
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h
index b3469571e99e..923ffa4e5d09 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h
@@ -219,13 +219,13 @@ template <> class Vectorized<float> {
   }
   Vectorized<float> frac() const;
   Vectorized<float> sin() const {
-    return Vectorized<float>(Sleef_sinf8_u10(values));
+    return Vectorized<float>(Sleef_sinf8_u35(values));
   }
   Vectorized<float> sinh() const {
     return Vectorized<float>(Sleef_sinhf8_u10(values));
   }
   Vectorized<float> cos() const {
-    return Vectorized<float>(Sleef_cosf8_u10(values));
+    return Vectorized<float>(Sleef_cosf8_u35(values));
   }
   Vectorized<float> cosh() const {
     return Vectorized<float>(Sleef_coshf8_u10(values));
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
index b5bdd14389d3..92947a07cca8 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
@@ -438,17 +438,24 @@ template <> Vectorized<c10::complex<double>> inline operator*(const Vectorized<c
 template <> Vectorized<c10::complex<double>> inline operator/(const Vectorized<c10::complex<double>> &a,
                                                              const Vectorized<c10::complex<double>> &b) {
   //re + im*i = (a + bi)  / (c + di)
-  //re = (ac + bd)/abs_2()
-  //im = (bc - ad)/abs_2()
-  const __m512d sign_mask = _mm512_setr_pd(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0);
-  auto ac_bd = _mm512_mul_pd(a, b);         //ac       bd
+  auto mask = _mm512_set1_pd(-0.f);
+  auto fabs_cd = _mm512_andnot_pd(mask, b);     // |c|    |d|
+  auto fabs_dc = _mm512_permute_pd(fabs_cd, 0x55);   // |d|    |c|
+  auto scale = _mm512_rcp14_pd(_mm512_max_pd(fabs_cd, fabs_dc));  // 1/sc     1/sc
+  auto a2 = _mm512_mul_pd(a, scale);         // a/sc     b/sc
+  auto b2 = _mm512_mul_pd(b, scale);         // c/sc     d/sc
+  auto acbd2 = _mm512_mul_pd(a2, b2);
 
-  auto d_c = _mm512_permute_pd(b, 0x55);    //d        c
-  d_c = _mm512_xor_pd(sign_mask, d_c);      //-d       c
-  auto ad_bc = _mm512_mul_pd(a, d_c);       //-ad      bc
-
-  auto re_im = Vectorized<c10::complex<double>>::hadd_pd(ac_bd, ad_bc);//ac + bd  bc - ad
-  return _mm512_div_pd(re_im, b.abs_2_());
+  const __m512d sign_mask = _mm512_setr_pd(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0);
+  auto dc2 = _mm512_permute_pd(b2, 0x55);    // d/sc         c/sc
+  dc2 = _mm512_xor_pd(sign_mask, dc2);       // -d/|c,d|        c/sc
+  auto adbc2 = _mm512_mul_pd(a2, dc2);       //-ad/sc^2      bc/sc^2
+  auto res2 = Vectorized<c10::complex<double>>::hadd_pd(acbd2, adbc2);  //(ac+bd)/sc^2  (bc-ad)/sc^2
+
+  // get the denominator
+  auto denom2 = Vectorized<c10::complex<double>>(b2).abs_2_();  // (c^2+d^2)/sc^2   (c^2+d^2)/sc^2
+  res2 = _mm512_div_pd(res2, denom2);
+  return res2;
 }
 
 // reciprocal. Implement this here so we can use multiplication.
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
index f43dbb5e2b76..564e2e2a0763 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
@@ -676,7 +676,7 @@ template <> class Vectorized<c10::complex<float>> {
   }
   __m512 abs_2_() const {
     auto val_2 = _mm512_mul_ps(values, values);     // a*a     b*b
-    auto ret = hadd_ps(val_2, val_2);        // a*a+b*b a*a+b*b
+    auto ret = hadd_ps(val_2, val_2);               // a*a+b*b a*a+b*b
     return ret;
   }
   __m512 abs_() const {
@@ -939,18 +939,25 @@ template <> Vectorized<c10::complex<float>> inline operator*(const Vectorized<c1
 template <> Vectorized<c10::complex<float>> inline operator/(const Vectorized<c10::complex<float>> &a,
                                                             const Vectorized<c10::complex<float>> &b) {
   //re + im*i = (a + bi)  / (c + di)
-  //re = (ac + bd)/abs_2()
-  //im = (bc - ad)/abs_2()
+  auto mask = _mm512_set1_ps(-0.f);
+  auto fabs_cd = _mm512_andnot_ps(mask, b);     // |c|    |d|
+  auto fabs_dc = _mm512_permute_ps(fabs_cd, 0xB1);   // |d|    |c|
+  auto scale = _mm512_rcp14_ps(_mm512_max_ps(fabs_cd, fabs_dc));  // 1/sc     1/sc
+  auto a2 = _mm512_mul_ps(a, scale);         // a/sc     b/sc
+  auto b2 = _mm512_mul_ps(b, scale);         // c/sc     d/sc
+  auto acbd2 = _mm512_mul_ps(a2, b2);
+
   const __m512 sign_mask = _mm512_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0,
                                           -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0);
-  auto ac_bd = _mm512_mul_ps(a, b);         //ac       bd
-
-  auto d_c = _mm512_permute_ps(b, 0xB1);    //d        c
-  d_c = _mm512_xor_ps(sign_mask, d_c);      //-d       c
-  auto ad_bc = _mm512_mul_ps(a, d_c);       //-ad      bc
+  auto dc2 = _mm512_permute_ps(b2, 0xB1);    // d/sc         c/sc
+  dc2 = _mm512_xor_ps(sign_mask, dc2);       // -d/|c,d|        c/sc
+  auto adbc2 = _mm512_mul_ps(a2, dc2);       //-ad/sc^2      bc/sc^2
+  auto res2 = Vectorized<c10::complex<float>>::hadd_ps(acbd2, adbc2);  //(ac+bd)/sc^2  (bc-ad)/sc^2
 
-  auto re_im = Vectorized<c10::complex<float>>::hadd_ps(ac_bd, ad_bc);//ac + bd  bc - ad
-  return _mm512_div_ps(re_im, b.abs_2_());
+  // get the denominator
+  auto denom2 = Vectorized<c10::complex<float>>(b2).abs_2_();  // (c^2+d^2)/sc^2   (c^2+d^2)/sc^2
+  res2 = _mm512_div_ps(res2, denom2);
+  return res2;
 }
 
 // reciprocal. Implement this here so we can use multiplication.
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h
index bc53ccd34387..41590b0684b7 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h
@@ -248,13 +248,13 @@ template <> class Vectorized<float> {
   }
   Vectorized<float> frac() const;
   Vectorized<float> sin() const {
-    return Vectorized<float>(Sleef_sinf16_u10(values));
+    return Vectorized<float>(Sleef_sinf16_u35(values));
   }
   Vectorized<float> sinh() const {
     return Vectorized<float>(Sleef_sinhf16_u10(values));
   }
   Vectorized<float> cos() const {
-    return Vectorized<float>(Sleef_cosf16_u10(values));
+    return Vectorized<float>(Sleef_cosf16_u35(values));
   }
   Vectorized<float> cosh() const {
     return Vectorized<float>(Sleef_coshf16_u10(values));
diff --git a/aten/src/ATen/cpu/vml.h b/aten/src/ATen/cpu/vml.h
index 66069bf2997f..7e330bf679c4 100644
--- a/aten/src/ATen/cpu/vml.h
+++ b/aten/src/ATen/cpu/vml.h
@@ -56,17 +56,12 @@ inline void vrsqrt(scalar_t* out, scalar_t* in, int64_t size) {
 
 // NB: We ignore numerical errors by convention and leave them to the user
 
-#define IMPLEMENT_VML(op)                                                         \
-  template <typename scalar_t>                                                    \
-  inline void v##op(scalar_t* out, const scalar_t* in, int64_t size) {            \
-    parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) {           \
-      using vecscalar_t = at::opmath_type<scalar_t>;                              \
-      map([](const Vectorized<vecscalar_t>& x) { return x.op(); },                \
-          out + begin,                                                            \
-          in + begin,                                                             \
-          end - begin);                                                           \
-    });                                                                           \
-  }
+#define IMPLEMENT_VML(op)                                               \
+  template <typename scalar_t>                                          \
+  inline void v##op(scalar_t* out, const scalar_t* in, int64_t size) {  \
+    using vec_t = Vectorized<vec_scalar_t<scalar_t>>;                   \
+    vec::map([](vec_t x) { return x.op(); }, out, in, size);            \
+  }                                                                     \
 
 IMPLEMENT_VML(abs)
 IMPLEMENT_VML(acos)
@@ -108,9 +103,9 @@ IMPLEMENT_VML(lgamma)
 static_assert(
     std::is_same<MKL_INT, int32_t>::value,
     "MKL_INT is assumed to be int32_t");
-#define IMPLEMENT_VML_MKL_STUB(op, mklop, type, mkltype)                    \
+#define IMPLEMENT_VML_MKL_STUB(op, mklop, type, mkltype)                \
   template <>                                                           \
-  inline void v##op(type * out, const type * in, int64_t size) {          \
+  inline void v##op(type * out, const type * in, int64_t size) {        \
     int64_t max_mkl_ind = std::numeric_limits<MKL_INT>::max();          \
     if (size <= static_cast<int64_t>(max_mkl_ind)) {                    \
       vm##mkltype##mklop(                                               \
@@ -140,7 +135,6 @@ static_assert(
 
 // NB: abs, cosh and sinh were temporarily disabled due to issues with Apple
 // NB: expm1 is disabled because on some configs it produces expm1(nan)=-1
-IMPLEMENT_VML_MKL(abs, Abs)
 IMPLEMENT_VML_MKL(acos, Acos)
 IMPLEMENT_VML_MKL(asin, Asin)
 IMPLEMENT_VML_MKL(atan, Atan)
@@ -153,7 +147,6 @@ IMPLEMENT_VML_MKL(exp, Exp)
 // IMPLEMENT_VML_MKL(expm1, Expm1)
 IMPLEMENT_VML_MKL(log, Ln)
 IMPLEMENT_VML_MKL(log10, Log10)
-IMPLEMENT_VML_MKL(log1p, Log1p)
 IMPLEMENT_VML_MKL(sin, Sin)
 // IMPLEMENT_VML_MKL(sinh, Sinh)
 IMPLEMENT_VML_MKL(sqrt, Sqrt)
@@ -161,6 +154,10 @@ IMPLEMENT_VML_MKL(tan, Tan)
 IMPLEMENT_VML_MKL(tanh, Tanh)
 IMPLEMENT_VML_MKL(trunc, Trunc)
 
+// Not vectorized in MKL version tested
+// IMPLEMENT_VML_MKL(abs, Abs)
+// IMPLEMENT_VML_MKL(log1p, Log1p)
+
 #if INTEL_MKL_VERSION >= 20180406
 IMPLEMENT_VML_MKL(log2, Log2)
 #endif
diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp
index 659ef114120d..9ca9ba5e7647 100644
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@@ -618,7 +618,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
 };
 } // namespace
 
-template <typename Dtype>
+template <typename Dtype, typename RDtype, typename BDtype>
 void gemm_and_bias(
     bool transpose_mat1,
     bool transpose_mat2,
@@ -630,12 +630,11 @@ void gemm_and_bias(
     int64_t mat1_ld,
     const Dtype* mat2_ptr,
     int64_t mat2_ld,
-    const Dtype* bias,
-    Dtype* result_ptr,
+    const BDtype* bias,
+    RDtype* result_ptr,
     int64_t result_ld,
-    GEMMAndBiasActivationEpilogue activation) {
-  using opmath_t = at::opmath_type<Dtype>;
-  opmath_t beta_val = 0; // bias is added in epilogue
+    GEMMAndBiasActivationEpilogue activation,
+    bool use_heuristic) {
 
   cudaDataType_t abcType = CUDA_R_32F;
   cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
@@ -654,6 +653,19 @@ void gemm_and_bias(
   } else if (std::is_same<Dtype, at::BFloat16>::value) {
     abcType = CUDA_R_16BF;
   }
+  cudaDataType_t abType = abcType;
+  cudaDataType_t cType = abcType;
+  if (std::is_same<Dtype, int8_t>::value) {
+    abType = CUDA_R_8I;
+    cType = CUDA_R_32I;
+    computeType = CUBLAS_COMPUTE_32I;
+    scaleType = CUDA_R_32I;
+    bool valid_rdtype = std::is_same<RDtype, int32_t>::value;
+    TORCH_CHECK(valid_rdtype, "Expected int32_t for result Tensor if given int8_t mat1, mat2.");
+  } else {
+    bool valid_rdtype = std::is_same<RDtype, Dtype>::value;
+    TORCH_CHECK(valid_rdtype, "Expected result and input dtypes to match.");
+  }
 
   CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType);
   cublasOperation_t transa = transpose_mat1 ? CUBLAS_OP_T : CUBLAS_OP_N;
@@ -668,64 +680,87 @@ void gemm_and_bias(
       CUBLASLT_MATMUL_DESC_TRANSB,
       &transb,
       sizeof(transb)));
-  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS;
-  if (activation == GEMMAndBiasActivationEpilogue::RELU) {
+
+  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;
+  if (activation == GEMMAndBiasActivationEpilogue::BIAS) {
+    epilogue = CUBLASLT_EPILOGUE_BIAS;
+  }
+  if (activation == GEMMAndBiasActivationEpilogue::BIAS_RELU) {
     epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
-  } else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
+  }
+  if (activation == GEMMAndBiasActivationEpilogue::BIAS_GELU) {
 #if CUDA_VERSION >= 11040
-    epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
+      epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
+#else
+      TORCH_CHECK(false, "CUBLASLT_EPILOGUE_GELU_BIAS is an unsupported feature for CUDA version ", CUDA_VERSION);
 #endif
   }
-  TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute(
-      computeDesc.descriptor(),
-      CUBLASLT_MATMUL_DESC_EPILOGUE,
-      &epilogue,
-      sizeof(epilogue)));
-  TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute(
-      computeDesc.descriptor(),
-      CUBLASLT_MATMUL_DESC_BIAS_POINTER,
-      &bias,
-      sizeof(Dtype*)));
+  if (activation == GEMMAndBiasActivationEpilogue::NONE) {
+    TORCH_CHECK(bias == nullptr, "Expected bias to be a nullptr.");
+  } else {
+    TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute(
+        computeDesc.descriptor(),
+        CUBLASLT_MATMUL_DESC_EPILOGUE,
+        &epilogue,
+        sizeof(epilogue)));
+    TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute(
+        computeDesc.descriptor(),
+        CUBLASLT_MATMUL_DESC_BIAS_POINTER,
+        &bias,
+        sizeof(Dtype*)));
+  }
 
   CuBlasLtMatrixLayout Adesc(
-      abcType, transpose_mat1 ? k : m, transpose_mat1 ? m : k, mat1_ld);
+      abType, transpose_mat1 ? k : m, transpose_mat1 ? m : k, mat1_ld);
   CuBlasLtMatrixLayout Bdesc(
-      abcType, transpose_mat2 ? n : k, transpose_mat2 ? k : n, mat2_ld);
-  CuBlasLtMatrixLayout Cdesc(abcType, m, n, result_ld);
+      abType, transpose_mat2 ? n : k, transpose_mat2 ? k : n, mat2_ld);
+  CuBlasLtMatrixLayout Cdesc(cType, m, n, result_ld);
 
   CuBlasLtMatmulPreference preference;
   // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind
   // setting this to 1M.
   size_t workspaceSize = 1024 * 1024;
-  TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(
-      preference.descriptor(),
-      CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
-      &workspaceSize,
-      sizeof(workspaceSize)));
+  void* workspace_data_ptr;
 
-  auto workspace = at::empty(
-      {static_cast<int64_t>(workspaceSize)},
-      at::device({at::kCUDA, at::cuda::current_device()}).dtype(at::kByte));
+  if (std::is_same<Dtype, int8_t>::value) {
+    workspaceSize = 0;
+  }
+  if (workspaceSize > 0) {
+    TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(
+        preference.descriptor(),
+        CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+        &workspaceSize,
+        sizeof(workspaceSize)));
+
+    auto workspace = at::empty(
+        {static_cast<int64_t>(workspaceSize)},
+        at::device({at::kCUDA, at::cuda::current_device()}).dtype(at::kByte));
+    workspace_data_ptr = workspace.data_ptr();
+  }
 
   cublasLtMatmulHeuristicResult_t heuristicResult = {};
-  int returnedResult = 0;
   cublasLtHandle_t ltHandle =
       reinterpret_cast<cublasLtHandle_t>(at::cuda::getCurrentCUDABlasHandle());
-  TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
-      ltHandle,
-      computeDesc.descriptor(),
-      Adesc.descriptor(),
-      Bdesc.descriptor(),
-      Cdesc.descriptor(),
-      Cdesc.descriptor(),
-      preference.descriptor(),
-      1,
-      &heuristicResult,
-      &returnedResult));
-  if (returnedResult == 0) {
-    TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED);
+  if (use_heuristic) {
+    int returnedResult = 0;
+    auto heuristic_return_value = cublasLtMatmulAlgoGetHeuristic(
+        ltHandle,
+        computeDesc.descriptor(),
+        Adesc.descriptor(),
+        Bdesc.descriptor(),
+        Cdesc.descriptor(),
+        Cdesc.descriptor(),
+        preference.descriptor(),
+        1,
+        &heuristicResult,
+        &returnedResult);
+    TORCH_CUDABLAS_CHECK(heuristic_return_value);
+    if (returnedResult == 0) {
+      TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED);
+    }
   }
 
+  std::conditional_t<std::is_same<BDtype, std::nullptr_t>::value, float, at::opmath_type<Dtype>> beta_val = 0;
   cublasStatus_t cublasStatus = cublasLtMatmul(
       ltHandle,
       computeDesc.descriptor(),
@@ -739,8 +774,8 @@ void gemm_and_bias(
       Cdesc.descriptor(),
       result_ptr,
       Cdesc.descriptor(),
-      &heuristicResult.algo,
-      workspace.data_ptr(),
+      use_heuristic ? &heuristicResult.algo : nullptr,
+      workspaceSize > 0 ? workspace_data_ptr : nullptr,
       workspaceSize,
       at::cuda::getCurrentCUDAStream());
   TORCH_CHECK(
@@ -763,8 +798,10 @@ void gemm_and_bias(
       mat2_ld,
       " result_ld ",
       result_ld,
-      " abcType ",
-      abcType,
+      " abType ",
+      abType,
+      " cType ",
+      cType,
       " computeType ",
       computeType,
       " scaleType ",
@@ -785,7 +822,8 @@ template void gemm_and_bias(
     const double* bias,
     double* result_ptr,
     int64_t result_ld,
-    GEMMAndBiasActivationEpilogue activation);
+    GEMMAndBiasActivationEpilogue activation,
+    bool use_heuristic);
 
 template void gemm_and_bias(
     bool transpose_mat1,
@@ -801,7 +839,8 @@ template void gemm_and_bias(
     const float* bias,
     float* result_ptr,
     int64_t result_ld,
-    GEMMAndBiasActivationEpilogue activation);
+    GEMMAndBiasActivationEpilogue activation,
+    bool use_heuristic);
 
 template void gemm_and_bias(
     bool transpose_mat1,
@@ -817,7 +856,8 @@ template void gemm_and_bias(
     const at::Half* bias,
     at::Half* result_ptr,
     int64_t result_ld,
-    GEMMAndBiasActivationEpilogue activation);
+    GEMMAndBiasActivationEpilogue activation,
+    bool use_heuristic);
 
 template void gemm_and_bias(
     bool transpose_mat1,
@@ -833,7 +873,25 @@ template void gemm_and_bias(
     const at::BFloat16* bias,
     at::BFloat16* result_ptr,
     int64_t result_ld,
-    GEMMAndBiasActivationEpilogue activation);
+    GEMMAndBiasActivationEpilogue activation,
+    bool use_heuristic);
+
+template void gemm_and_bias(
+    bool transpose_mat1,
+    bool transpose_mat2,
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    at::opmath_type<int8_t> alpha_val,
+    const int8_t* mat1_ptr,
+    int64_t mat1_ld,
+    const int8_t* mat2_ptr,
+    int64_t mat2_ld,
+    const std::nullptr_t* bias,
+    int32_t* result_ptr,
+    int64_t result_ld,
+    GEMMAndBiasActivationEpilogue activation,
+    bool use_heuristic);
 #endif // !defined(USE_ROCM) && !defined(_MSC_VER)
 
 template <>
diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h
index da01bbe3dcf9..c722390ad31c 100644
--- a/aten/src/ATen/cuda/CUDABlas.h
+++ b/aten/src/ATen/cuda/CUDABlas.h
@@ -70,14 +70,15 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
 
 #if !defined(USE_ROCM) && !defined(_MSC_VER)
 enum GEMMAndBiasActivationEpilogue {
-  None,
-  RELU,
-  GELU,
+  NONE,
+  BIAS,
+  BIAS_RELU,
+  BIAS_GELU,
 };
 
 // NOTE: GELU activation is not supported prior to CUDA 11.4 and will
 // do nothing if passed in that case.
-template <typename Dtype>
+template <typename Dtype, typename RDtype, typename BDtype>
 void gemm_and_bias(
     bool transpose_mat1,
     bool transpose_mat2,
@@ -89,10 +90,11 @@ void gemm_and_bias(
     int64_t mat1_ld,
     const Dtype* mat2_ptr,
     int64_t mat2_ld,
-    const Dtype* bias,
-    Dtype* result_ptr,
+    const BDtype* bias,
+    RDtype* result_ptr,
     int64_t result_ld,
-    GEMMAndBiasActivationEpilogue activation = GEMMAndBiasActivationEpilogue::None);
+    GEMMAndBiasActivationEpilogue activation = GEMMAndBiasActivationEpilogue::BIAS,
+    bool use_heuristic = true);
 #endif
 
 #define CUDABLAS_BGEMM_ARGTYPES(Dtype)                                                        \
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 98fa9a5f6dd2..d274fda0f71e 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -47,7 +47,7 @@ cudaDeviceProp* getCurrentDeviceProperties() {
 cudaDeviceProp* getDeviceProperties(int64_t device) {
   c10::call_once(init_flag, initCUDAContextVectors);
   if (device == -1) device = c10::cuda::current_device();
-  AT_ASSERT(device >= 0 && device < num_gpus);
+  AT_ASSERT(device >= 0 && device < num_gpus, "device=", device, ", num_gpus=", num_gpus);
   c10::call_once(device_flags[device], initDeviceProperty, device);
   return &device_properties[device];
 }
@@ -55,8 +55,8 @@ cudaDeviceProp* getDeviceProperties(int64_t device) {
 bool canDeviceAccessPeer(int64_t device, int64_t peer_device) {
   c10::call_once(init_flag, initCUDAContextVectors);
   if (device == -1) device = c10::cuda::current_device();
-  AT_ASSERT(device >= 0 && device < num_gpus);
-  AT_ASSERT(peer_device >= 0 && peer_device < num_gpus);
+  AT_ASSERT(device >= 0 && device < num_gpus, "device=", device, ", num_gpus=", num_gpus);
+  AT_ASSERT(peer_device >= 0 && peer_device < num_gpus, "peer_device=", peer_device, ", num_gpus=", num_gpus);
   int can_access = 0;
   AT_CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, device, peer_device));
   return can_access != 0;
diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h
index 1c3c67949e58..467970b33b49 100644
--- a/aten/src/ATen/cuda/CUDAEvent.h
+++ b/aten/src/ATen/cuda/CUDAEvent.h
@@ -28,8 +28,8 @@ namespace at { namespace cuda {
 struct TORCH_CUDA_CPP_API CUDAEvent {
   // Constructors
   // Default value for `flags` is specified below - it's cudaEventDisableTiming
-  CUDAEvent() {}
-  CUDAEvent(unsigned int flags) : flags_{flags} {}
+  CUDAEvent() noexcept = default;
+  CUDAEvent(unsigned int flags) noexcept : flags_{flags} {}
 
   CUDAEvent(
       DeviceIndex device_index, const cudaIpcEventHandle_t* handle) {
@@ -58,9 +58,11 @@ struct TORCH_CUDA_CPP_API CUDAEvent {
   CUDAEvent(const CUDAEvent&) = delete;
   CUDAEvent& operator=(const CUDAEvent&) = delete;
 
-  CUDAEvent(CUDAEvent&& other) { moveHelper(std::move(other)); }
-  CUDAEvent& operator=(CUDAEvent&& other) {
-    moveHelper(std::move(other));
+  CUDAEvent(CUDAEvent&& other) noexcept { moveHelper(std::move(other)); }
+  CUDAEvent& operator=(CUDAEvent&& other) noexcept {
+    if (this != &other) {
+      moveHelper(std::move(other));
+    }
     return *this;
   }
 
diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp
index fefeebe036bb..353f1b4caab1 100644
--- a/aten/src/ATen/cuda/CUDAGraph.cpp
+++ b/aten/src/ATen/cuda/CUDAGraph.cpp
@@ -11,14 +11,14 @@ namespace cuda {
 static bool _cuda_graphs_debug = false;
 
 MempoolId_t graph_pool_handle() {
-#if !defined(USE_ROCM)
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   // uuid count starts at 1. 0 is reserved to mean "wasn't set by graph_pool_handle".
   static std::atomic<CaptureId_t> uuid{1};
   // Sets just the second value, to distinguish it from MempoolId_ts created from
   // cudaStreamGetCaptureInfo id_s in capture_begin.
   return {0, uuid++};
 #else
-  TORCH_CHECK(false, "CUDA graphs may is not yet supported on ROCM");
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 or ROCM >= 5.3")
   return {0, 0};
 #endif
 }
@@ -47,13 +47,13 @@ MempoolId_t graph_pool_handle() {
 CUDAGraph::CUDAGraph()
   // CUDAStreams may not be default-constructed.
   : capture_stream_(at::cuda::getCurrentCUDAStream()) {
-#if defined(USE_ROCM)
-  TORCH_CHECK(false, "CUDA graphs is not yet supported on ROCM");
+#if (defined(USE_ROCM) && ROCM_VERSION < 50300)
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 or ROCM >= 5.3");
 #endif
 }
 
 void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/) {
-#if !defined(USE_ROCM)
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   TORCH_CHECK(!has_graph_exec_,
               "This CUDAGraph instance already owns a captured graph. "
               "To capture a new graph, create a new instance.");
@@ -124,12 +124,12 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/) {
   // kernel will end up as part of the capture or not.
   c10::cuda::CUDACachingAllocator::notifyCaptureBegin(capture_dev_, id_, mempool_id_);
 #else
-  TORCH_CHECK(false, "CUDA graphs is not yet supported on ROCM");
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 or ROCM >= 5.3")
 #endif
 }
 
 void CUDAGraph::capture_end() {
-#if !defined(USE_ROCM)
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   auto stream = at::cuda::getCurrentCUDAStream();
 
   TORCH_CHECK(stream == capture_stream_,
@@ -154,7 +154,7 @@ void CUDAGraph::capture_end() {
   // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597
   // cudaGraphInstantiateWithFlags
   // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233
-#if CUDA_VERSION >= 11040
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040)
   int version;
   AT_CUDA_CHECK(cudaDriverGetVersion(&version));
   if (version < 11040) {
@@ -162,12 +162,12 @@ void CUDAGraph::capture_end() {
     // Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people,
     // who prefer not to report error message through these arguments moving forward
     // (they prefer return value, or errors on api calls internal to the capture)
-#if CUDA_VERSION >= 12000
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000)
     AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, 0));
 #else
     AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
 #endif
-#if CUDA_VERSION >= 11040
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040)
   } else {
     AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
                                                 graph_,
@@ -202,12 +202,12 @@ void CUDAGraph::capture_end() {
     TORCH_WARN("DEBUG: TORCH_CUDAGRAPHS_DEBUG_PATH detected. graph_ will not be freed until debug_dump is called.");
   }
 #else
-  TORCH_CHECK(false, "CUDA graphs is not yet supported on ROCM");
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 or ROCM >= 5.3")
 #endif
 }
 
 void CUDAGraph::replay() {
-#if !defined(USE_ROCM)
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   TORCH_CHECK(has_graph_exec_,
               "Called CUDAGraph::replay without a preceding successful capture.");
 
@@ -242,7 +242,7 @@ void CUDAGraph::replay() {
 }
 
 void CUDAGraph::enable_debug_mode() {
-#if !defined(USE_ROCM)
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   _cuda_graphs_debug = true;
 #else
   TORCH_CHECK(false, "CUDA graphs is not yet supported on ROCM");
@@ -251,7 +251,7 @@ void CUDAGraph::enable_debug_mode() {
 }
 
 void CUDAGraph::debug_dump(const std::string& debug_path) {
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11030
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11030)
   if (_cuda_graphs_debug) {
     TORCH_WARN("DEBUG: calling debug_dump()");
     if (has_graph_) {
@@ -263,12 +263,12 @@ void CUDAGraph::debug_dump(const std::string& debug_path) {
     TORCH_WARN("CUDA Graphs debug not enabled, set with torch._C._cuda_enable_graphs_debug_mode");
   }
 #else
-  TORCH_CHECK(false, "CUDA graphs debug dump may only be used in Pytorch built with CUDA >= 11.3 and is not yet supported on ROCM");
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.3 and is not yet supported on ROCM");
 #endif
 }
 
 void CUDAGraph::reset() {
-#if !defined(USE_ROCM)
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   // I'd prefer these checks throw exceptions, not print warnings,
   // but the destructor calls reset(), and at least one CI build
   // refuses to compile with a throwing destructor.
@@ -299,17 +299,17 @@ void CUDAGraph::reset() {
     C10_CUDA_CHECK_WARN(cudaGraphExecDestroy(graph_exec_));
   }
 #else
-  TORCH_CHECK(false, "CUDA graphs is not yet supported on ROCM");
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 or ROCM >= 5.3")
 #endif
 }
 
 // Returns an id another graph's capture_begin can use to share the same memory pool as this graph.
 MempoolId_t CUDAGraph::pool() {
-#if !defined(USE_ROCM)
-  TORCH_CHECK(has_graph_exec_,
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
+TORCH_CHECK(has_graph_exec_,
               "Called CUDAGraph::pool() without a preceding successful capture.");
 #else
-  TORCH_CHECK(false, "CUDA graphs is not yet supported on ROCM");
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 or ROCM >= 5.3")
 #endif
   return mempool_id_;
 }
diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h
index 16e9445e111a..c4b6fe44d958 100644
--- a/aten/src/ATen/cuda/CUDAGraph.h
+++ b/aten/src/ATen/cuda/CUDAGraph.h
@@ -28,7 +28,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
   void debug_dump(const std::string& debug_path);
 
   protected:
-#if !defined(USE_ROCM)
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   cudaGraph_t graph_ = NULL;
   cudaGraphExec_t graph_exec_ = NULL;
 #endif
diff --git a/aten/src/ATen/cuda/CUDAGraphsUtils.cuh b/aten/src/ATen/cuda/CUDAGraphsUtils.cuh
index fe1348e6bcfa..0a6ec7590885 100644
--- a/aten/src/ATen/cuda/CUDAGraphsUtils.cuh
+++ b/aten/src/ATen/cuda/CUDAGraphsUtils.cuh
@@ -20,7 +20,7 @@ using CaptureStatus = c10::cuda::CaptureStatus;
 
 // Use this version where you don't want to create a CUDA context if none exists.
 inline CaptureStatus currentStreamCaptureStatus() {
-#if !defined(USE_ROCM)
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   // don't create a context if we don't have to
   if (at::cuda::detail::hasPrimaryContext(c10::cuda::current_device())) {
     return c10::cuda::currentStreamCaptureStatusMayInitCtx();
diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp
index d53c3dc7b6b4..a4635c51bbe4 100644
--- a/aten/src/ATen/cuda/CachingHostAllocator.cpp
+++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp
@@ -225,7 +225,7 @@ class CUDAHostAllocator {
     } else {
       std::lock_guard<std::mutex> g(cuda_events_mutex_);
       for (auto&& event : *events) {
-        cuda_events_.push_front({std::move(event), block});
+        cuda_events_.emplace_front(std::move(event), block);
       }
     }
   }
diff --git a/aten/src/ATen/cuda/detail/IntegerDivider.cuh b/aten/src/ATen/cuda/detail/IntegerDivider.cuh
index 761e16aea3c2..b79143c5be62 100644
--- a/aten/src/ATen/cuda/detail/IntegerDivider.cuh
+++ b/aten/src/ATen/cuda/detail/IntegerDivider.cuh
@@ -65,7 +65,7 @@ struct DivMod {
 // everything else, we use plain division.
 template <typename Value>
 struct IntDivider {
-  IntDivider() { }  // Dummy constructor for arrays.
+  IntDivider() = default;
   IntDivider(Value d) : divisor(d) { }
 
   C10_HOST_DEVICE inline Value div(Value n) const { return n / divisor; }
@@ -82,7 +82,7 @@ template <>
 struct IntDivider<unsigned int> {
   static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int.");
 
-  IntDivider() { }  // Dummy constructor for arrays.
+  IntDivider() = default;
 
   IntDivider(unsigned int d) : divisor(d) {
     assert(divisor >= 1 && divisor <= INT32_MAX);
diff --git a/aten/src/ATen/cuda/llvm_complex.cpp b/aten/src/ATen/cuda/llvm_complex.cpp
index 0bb2c2ba9a09..f210275beab3 100644
--- a/aten/src/ATen/cuda/llvm_complex.cpp
+++ b/aten/src/ATen/cuda/llvm_complex.cpp
@@ -497,6 +497,14 @@ operator&&(const complex<_Tp>& __x, const complex<_Tp>& __y)
     return bool(__x) && bool(__y);
 }
 
+template<class _Tp>
+inline constexpr
+bool
+isnan(const complex<_Tp>& __x)
+{
+    return isnan(__x.real()) || isnan(__x.imag());
+}
+
 template<class _Tp>
 inline constexpr
 bool
diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h
index e111987785cc..9960845809c2 100644
--- a/aten/src/ATen/cudnn/Descriptors.h
+++ b/aten/src/ATen/cudnn/Descriptors.h
@@ -123,7 +123,7 @@ class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor<
                                                &cudnnCreateTensorDescriptor,
                                                &cudnnDestroyTensorDescriptor> {
  public:
-  TensorDescriptor() {}
+  TensorDescriptor() = default;
   explicit TensorDescriptor(const at::Tensor &t, size_t pad = 0) {
     set(t, pad);
   }
diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h
index d4d888a93e57..db6f22a51d06 100644
--- a/aten/src/ATen/detail/CUDAHooksInterface.h
+++ b/aten/src/ATen/detail/CUDAHooksInterface.h
@@ -197,7 +197,7 @@ struct TORCH_API CUDAHooksInterface {
 // for the "..." in a variadic macro"
 struct TORCH_API CUDAHooksArgs {};
 
-C10_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs);
+TORCH_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs);
 #define REGISTER_CUDA_HOOKS(clsname) \
   C10_REGISTER_CLASS(CUDAHooksRegistry, clsname, clsname)
 
diff --git a/aten/src/ATen/detail/HIPHooksInterface.h b/aten/src/ATen/detail/HIPHooksInterface.h
index 64a1fd77cd02..26126c560808 100644
--- a/aten/src/ATen/detail/HIPHooksInterface.h
+++ b/aten/src/ATen/detail/HIPHooksInterface.h
@@ -60,7 +60,7 @@ struct TORCH_API HIPHooksInterface {
 // for the "..." in a variadic macro"
 struct TORCH_API HIPHooksArgs {};
 
-C10_DECLARE_REGISTRY(HIPHooksRegistry, HIPHooksInterface, HIPHooksArgs);
+TORCH_DECLARE_REGISTRY(HIPHooksRegistry, HIPHooksInterface, HIPHooksArgs);
 #define REGISTER_HIP_HOOKS(clsname) \
   C10_REGISTER_CLASS(HIPHooksRegistry, clsname, clsname)
 
diff --git a/aten/src/ATen/detail/MPSHooksInterface.h b/aten/src/ATen/detail/MPSHooksInterface.h
index 4fff139f2774..7d67d63c808a 100644
--- a/aten/src/ATen/detail/MPSHooksInterface.h
+++ b/aten/src/ATen/detail/MPSHooksInterface.h
@@ -28,6 +28,10 @@ struct TORCH_API MPSHooksInterface {
     return false;
   }
 
+  virtual bool isOnMacOS13orNewer(unsigned minor = 0) const {
+    AT_ERROR("MPS backend is not available.");
+  }
+
   virtual const Generator& getDefaultMPSGenerator() const {
     AT_ERROR("Cannot get default MPS generator without MPS backend.");
   }
@@ -35,11 +39,31 @@ struct TORCH_API MPSHooksInterface {
   virtual Allocator* getMPSDeviceAllocator() const {
     AT_ERROR("MPSDeviceAllocator requires MPS.");
   }
+
+  virtual void deviceSynchronize() const {
+    AT_ERROR("Cannot synchronize MPS device without MPS backend.");
+  }
+
+  virtual void emptyCache() const {
+    AT_ERROR("Cannot execute emptyCache() without MPS backend.");
+  }
+
+  virtual size_t getCurrentAllocatedMemory() const {
+    AT_ERROR("Cannot execute getCurrentAllocatedMemory() without MPS backend.");
+  }
+
+  virtual size_t getDriverAllocatedMemory() const {
+    AT_ERROR("Cannot execute getDriverAllocatedMemory() without MPS backend.");
+  }
+
+  virtual void setMemoryFraction(double /*ratio*/) const {
+    AT_ERROR("Cannot execute setMemoryFraction() without MPS backend.");
+  }
 };
 
 struct TORCH_API MPSHooksArgs {};
 
-C10_DECLARE_REGISTRY(MPSHooksRegistry, MPSHooksInterface, MPSHooksArgs);
+TORCH_DECLARE_REGISTRY(MPSHooksRegistry, MPSHooksInterface, MPSHooksArgs);
 #define REGISTER_MPS_HOOKS(clsname) \
   C10_REGISTER_CLASS(MPSHooksRegistry, clsname, clsname)
 
diff --git a/aten/src/ATen/detail/ORTHooksInterface.h b/aten/src/ATen/detail/ORTHooksInterface.h
index 4dd51d06caba..f49969ec66a5 100644
--- a/aten/src/ATen/detail/ORTHooksInterface.h
+++ b/aten/src/ATen/detail/ORTHooksInterface.h
@@ -25,7 +25,7 @@ struct TORCH_API ORTHooksInterface {
 // for the "..." in a variadic macro"
 struct TORCH_API ORTHooksArgs {};
 
-C10_DECLARE_REGISTRY(ORTHooksRegistry, ORTHooksInterface, ORTHooksArgs);
+TORCH_DECLARE_REGISTRY(ORTHooksRegistry, ORTHooksInterface, ORTHooksArgs);
 #define REGISTER_ORT_HOOKS(clsname) \
   C10_REGISTER_CLASS(ORTHooksRegistry, clsname, clsname)
 
diff --git a/aten/src/ATen/functorch/ADInterpreters.cpp b/aten/src/ATen/functorch/ADInterpreters.cpp
index 1e2abbb25fc3..aa52cb73b8e7 100644
--- a/aten/src/ATen/functorch/ADInterpreters.cpp
+++ b/aten/src/ATen/functorch/ADInterpreters.cpp
@@ -169,7 +169,7 @@ static void autogradBasedTransformSendToNext(
   }
 
   // Re-dispatch
-  if (getDynamicLayerStack().size() == 0) {
+  if (getDynamicLayerStack().empty()) {
     sanityCheckStack(op, stack);
   }
 
diff --git a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp
index cc478faef7c5..5a00f7d466c6 100644
--- a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp
@@ -9,6 +9,8 @@
 #include <ATen/Operators.h>
 #include <ATen/core/dispatch/Dispatcher.h>
 
+#include <utility>
+
 namespace at { namespace functorch {
 
 template <typename F, F Func, typename... ExtraArgs>
@@ -306,7 +308,7 @@ std::tuple<Tensor, optional<int64_t>> log_sigmoid_backward_batch_rule(
 }
 
 Tensor binomial_wrapper(const Tensor& count, const Tensor& prob, c10::optional<Generator> gen) {
-  return at::binomial(count, prob.contiguous(), gen); // Bug in PyTorch, prob shouldn't need to be contiguous
+  return at::binomial(count, prob.contiguous(), std::move(gen)); // Bug in PyTorch, prob shouldn't need to be contiguous
 }
 
 TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) {
@@ -359,10 +361,20 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   POINTWISE_BOXED(addcmul);
   BINARY_POINTWISE(atan2);
   BINARY_SCALAR_2(bitwise_and, Tensor, Scalar);
+  BINARY_POINTWISE2(bitwise_and_, Tensor);
+  POINTWISE_BOXED(bitwise_and.Scalar_Tensor);
   BINARY_POINTWISE2(bitwise_or, Tensor);
+  BINARY_POINTWISE2(bitwise_or_, Tensor);
+  POINTWISE_BOXED(bitwise_or.Scalar_Tensor);
   BINARY_POINTWISE2(bitwise_xor, Tensor);
+  BINARY_POINTWISE2(bitwise_xor_, Tensor);
+  POINTWISE_BOXED(bitwise_xor.Scalar_Tensor);
   BINARY_SCALAR_3(bitwise_left_shift, Tensor, Tensor_Scalar, Scalar_Tensor);
+  POINTWISE_BOXED(bitwise_left_shift_.Tensor_Scalar);
+  POINTWISE_BOXED(bitwise_left_shift_.Tensor);
   BINARY_SCALAR_3(bitwise_right_shift, Tensor, Tensor_Scalar, Scalar_Tensor);
+  POINTWISE_BOXED(bitwise_right_shift_.Tensor_Scalar);
+  POINTWISE_BOXED(bitwise_right_shift_.Tensor);
 
   UNARY_POINTWISE(clamp);
   POINTWISE_BOXED(clamp.Tensor);
diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index 7e0d90cd6d8b..daa3b6bd5739 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -61,8 +61,11 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE(atleast_3d);
   OP_DECOMPOSE2(atleast_3d, Sequence);
   OP_DECOMPOSE(batch_norm);
+  OP_DECOMPOSE2(bitwise_and_, Scalar);
   OP_DECOMPOSE2(bitwise_or, Scalar);
+  OP_DECOMPOSE2(bitwise_or_, Scalar);
   OP_DECOMPOSE2(bitwise_xor, Scalar);
+  OP_DECOMPOSE2(bitwise_xor_, Scalar);
   OP_DECOMPOSE(broadcast_tensors);
   m.impl("broadcast_to", native::broadcast_to_symint);
   OP_DECOMPOSE(cartesian_prod);
@@ -115,6 +118,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE2(float_power, Tensor_Tensor);
   OP_DECOMPOSE2(float_power, Tensor_Scalar);
   OP_DECOMPOSE2(floor_divide, Scalar);
+  OP_DECOMPOSE(gather_backward);
   OP_DECOMPOSE(ger);
   OP_DECOMPOSE2(gradient, scalarint);
   OP_DECOMPOSE2(gradient, scalararray);
@@ -162,6 +166,8 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE(linalg_tensorinv);
   OP_DECOMPOSE(linalg_vander);
   OP_DECOMPOSE(cumprod_backward);
+  OP_DECOMPOSE(linalg_matrix_power);
+  OP_DECOMPOSE(linalg_vecdot);
   OP_DECOMPOSE(_lu_with_info);
   OP_DECOMPOSE(matmul);
   OP_DECOMPOSE(matrix_H);
@@ -201,8 +207,12 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE(resolve_neg);
   OP_DECOMPOSE(row_stack);
   OP_DECOMPOSE(rrelu);
+  OP_DECOMPOSE(rrelu_);
+  OP_DECOMPOSE(relu6);
+  OP_DECOMPOSE(relu6_);
   OP_DECOMPOSE(prelu);
   OP_DECOMPOSE2(softmax, int);
+  OP_DECOMPOSE(scaled_dot_product_attention);
   OP_DECOMPOSE(special_gammainc);
   OP_DECOMPOSE(special_gammaincc);
   OP_DECOMPOSE(special_logit);
@@ -242,6 +252,8 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE2(size, int);
   OP_DECOMPOSE(is_complex);
   OP_DECOMPOSE(std);
+  OP_DECOMPOSE(selu);
+  OP_DECOMPOSE(selu_);
   OP_DECOMPOSE2(std, dim);
   OP_DECOMPOSE(std_mean);
   OP_DECOMPOSE2(std_mean, dim);
@@ -311,6 +323,10 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE2(multiply_, Tensor)
   OP_DECOMPOSE2(multiply, Scalar)
   OP_DECOMPOSE2(multiply_, Scalar)
+
+  OP_DECOMPOSE2(linalg_matrix_rank, atol_rtol_tensor);
+  OP_DECOMPOSE2(linalg_matrix_rank, atol_rtol_float);
+
 }
 
 }}
diff --git a/aten/src/ATen/functorch/BatchRulesHelper.h b/aten/src/ATen/functorch/BatchRulesHelper.h
index 8e78ba71029b..774d9a723369 100644
--- a/aten/src/ATen/functorch/BatchRulesHelper.h
+++ b/aten/src/ATen/functorch/BatchRulesHelper.h
@@ -19,6 +19,8 @@
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <ATen/VmapGeneratedPlumbing.h>
 
+#include <utility>
+
 // This file contains helper functions for batching rules.
 
 namespace at { namespace functorch {
@@ -152,7 +154,7 @@ void boxed_tensor_inputs_batch_rule(const c10::OperatorHandle& op, torch::jit::S
   Func(tensor_inputs);
 
   size_t tensor_idx = 0;
-  TORCH_INTERNAL_ASSERT(tensor_pos.size() > 0);
+  TORCH_INTERNAL_ASSERT(!tensor_pos.empty());
   for (const auto arg_idx : c10::irange(0, num_arguments)) {
     if (tensor_idx >= tensor_pos.size() || (int64_t)arg_idx != tensor_pos[tensor_idx]) {
       torch::jit::push(stack, arguments[arg_idx]);
@@ -339,7 +341,7 @@ inline void boxed_all_tensors_have_optional_bdim(
       if (tensor_idx == contig_tensor_index) {
         value_ = value_.contiguous();
       }
-      (*stack)[args_begin + tensor_pos[tensor_idx]] = value_;
+      (*stack)[args_begin + tensor_pos[tensor_idx]] = std::move(value_);
       continue;
     }
     TORCH_INTERNAL_ASSERT(logical_rank == feature_rank + 1);
@@ -347,7 +349,7 @@ inline void boxed_all_tensors_have_optional_bdim(
     if (tensor_idx == contig_tensor_index) {
       value_ = value_.contiguous();
     }
-    (*stack)[args_begin + tensor_pos[tensor_idx]] = value_;
+    (*stack)[args_begin + tensor_pos[tensor_idx]] = std::move(value_);
   }
 
   op.callBoxed(stack);
diff --git a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp
index f26a4f79b146..f963916d453b 100644
--- a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp
+++ b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp
@@ -30,6 +30,9 @@ std::tuple<Tensor, optional<int64_t>> dot_batch_rule(const Tensor& A, optional<i
     return std::make_tuple(at::matmul(A_, B_.t()), 0);
   }
 }
+Tensor vdot_decomp(const Tensor& A, const Tensor& B) {
+  return at::dot(A.is_complex() ? A.conj() : A, B);
+}
 
 // NB: I wrote this like this because we *might* want its for a future matmul
 // batch rule that isn't decomposed...
@@ -467,14 +470,6 @@ atol_rtol_tensor_batch_rule(
   return std::make_tuple(Func(input_, atol_, rtol_, hermitian), 0);
 }
 
-std::tuple<Tensor, c10::optional<int64_t>>
-matrix_rank_atol_rtol_tensor_batch_rule(
-    const Tensor& input, c10::optional<int64_t> input_bdim, const optional<Tensor>& atol,
-    const c10::optional<int64_t> atol_bdim, const optional<Tensor>& rtol,
-    const c10::optional<int64_t> rtol_bdim, bool hermitian) {
-  return atol_rtol_tensor_batch_rule(ATEN_FN2(linalg_matrix_rank, atol_rtol_tensor), input, input_bdim, atol, atol_bdim, rtol, rtol_bdim, hermitian, "torch.linalg.matrix_rank");
-}
-
 std::tuple<Tensor, c10::optional<int64_t>>
 pinv_batch_rule(
     const Tensor& input, c10::optional<int64_t> input_bdim, const optional<Tensor>& atol,
@@ -483,14 +478,6 @@ pinv_batch_rule(
   return atol_rtol_tensor_batch_rule(ATEN_FN2(linalg_pinv, atol_rtol_tensor), input, input_bdim, atol, atol_bdim, rtol, rtol_bdim, hermitian, "linalg.pinv");
 }
 
-std::tuple<Tensor,optional<int64_t>>
-matrix_rank_atol_rtol_float_batch_rule(
-    const Tensor& input, optional<int64_t> input_bdim, optional<double> atol, optional<double> rtol, bool hermitian) {
-  TORCH_CHECK(rankWithoutBatchDim(input, input_bdim) >= 2,
-            "torch.linalg.matrix_rank: The input tensor input must have at least 2 dimensions.");
-  return std::make_tuple(linalg_matrix_rank(moveBatchDimToFront(input, input_bdim), atol, rtol, hermitian), 0);
-}
-
 #define LINALG_CHECK_MATRIX_UNARY_BATCH_RULE(fn, num_out) SINGLE_ARG(\
   LinalgCheckMatrixUnaryRuleHelper<\
     func_string_##fn,\
@@ -584,7 +571,6 @@ LINALG_CHECK_MATRIX_UNARY_TWO_OUT(linalg_cholesky_ex, linalg.cholesky);
 LINALG_CHECK_MATRIX_UNARY_TWO_OUT(linalg_eig, linalg.eig);
 LINALG_CHECK_MATRIX_UNARY_TWO_OUT(linalg_inv_ex, linalg.inv_ex);
 LINALG_CHECK_MATRIX_UNARY_THREE_OUT(linalg_ldl_factor_ex, torch.linalg.ldl_factor_ex);
-LINALG_CHECK_MATRIX_UNARY_ONE_OUT(linalg_matrix_power, linalg.matrix_power);
 LINALG_CHECK_MATRIX_UNARY_ONE_OUT(linalg_pinv, linalg.pinv);
 LINALG_CHECK_MATRIX_UNARY_ONE_OUT2(linalg_pinv, atol_rtol_float, linalg.pinv);
 LINALG_CHECK_MATRIX_UNARY_TWO_OUT(linalg_qr, linalg.qr);
@@ -593,7 +579,6 @@ LINALG_CHECK_MATRIX_BINARY_ONE_OUT(linalg_solve_triangular, linalg.solve_triangu
 
 LINALG_CHECK_MATRIX_UNARY_TWO_OUT(geqrf, geqrf);
 LINALG_CHECK_MATRIX_UNARY_ONE_OUT(logdet, logdet);
-LINALG_CHECK_MATRIX_UNARY_TWO_OUT(symeig, symeig);
 LINALG_CHECK_MATRIX_BINARY_TWO_OUT(triangular_solve, triangular_solve);
 LINALG_CHECK_MATRIX_UNARY_THREE_OUT(_linalg_det, linalg.det);
 LINALG_CHECK_MATRIX_UNARY_TWO_OUT(_linalg_eigh, linalg.eigh);
@@ -618,10 +603,10 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   VMAP_SUPPORT(linalg_matrix_exp, matrix_exp_batch_rule);
   VMAP_SUPPORT(_linalg_solve_ex, solve_ex_batch_rule);
   VMAP_SUPPORT(linalg_cross, cross_batch_rule);
-  VMAP_SUPPORT2(linalg_matrix_rank, atol_rtol_tensor, matrix_rank_atol_rtol_tensor_batch_rule);
-  VMAP_SUPPORT2(linalg_matrix_rank, atol_rtol_float, matrix_rank_atol_rtol_float_batch_rule);
   VMAP_SUPPORT2(linalg_pinv, atol_rtol_tensor, pinv_batch_rule);
 
   VMAP_SUPPORT(_linalg_check_errors, _linalg_check_errors_batch_rule);
+
+  m.impl("vdot", vdot_decomp);
 }
 }}
diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp
index 506ed3ae4405..6a596f706afc 100644
--- a/aten/src/ATen/functorch/BatchRulesModules.cpp
+++ b/aten/src/ATen/functorch/BatchRulesModules.cpp
@@ -8,9 +8,11 @@
 #include <ATen/functorch/PlumbingHelper.h>
 #include <ATen/core/dispatch/Dispatcher.h>
 
+#include <utility>
+
 namespace at { namespace functorch {
 
-static Tensor getStepTensor(const Tensor& indices, c10::SymInt bdim_size, c10::SymInt num_embeddings) {
+static Tensor getStepTensor(const Tensor& indices, const c10::SymInt& bdim_size, const c10::SymInt& num_embeddings) {
   // [batch_size, 1, 1, 1, ..., 1]
   c10::SymDimVector view_shape(indices.dim(), 1);
   view_shape[0] = bdim_size;
@@ -24,13 +26,13 @@ std::tuple<Tensor,optional<int64_t>> embedding_batch_rule(
     c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse) {
   if (!weight_bdim && indices_bdim) {
     // B*, ED -> B*D
-    const auto result = at::embedding_symint(weight, indices, padding_idx, scale_grad_by_freq, sparse);
-    return std::make_tuple(result, indices_bdim);
+    auto result = at::embedding_symint(weight, indices, std::move(padding_idx), scale_grad_by_freq, sparse);
+    return std::make_tuple(std::move(result), indices_bdim);
   } else if (weight_bdim && !indices_bdim) {
     // *, BED -> *, E(BD) -> *(BD) -> *BD
     const auto batch_size = weight.size(*weight_bdim);
     const auto weight_ = reshape_dim_into(*weight_bdim, /*embedding_dim*/1, weight);
-    auto result = at::embedding_symint(weight_, indices, padding_idx, scale_grad_by_freq, sparse);
+    auto result = at::embedding_symint(weight_, indices, std::move(padding_idx), scale_grad_by_freq, sparse);
     result = reshape_dim_outof(-1, batch_size, result);
     return std::make_tuple(result, result.dim() - 2);
   }
@@ -44,8 +46,8 @@ std::tuple<Tensor,optional<int64_t>> embedding_batch_rule(
 
   const auto range = getStepTensor(indices, batch_size, num_embeddings);
   indices_ = indices_ + range;
-  const auto result = at::embedding_symint(weight_, indices_, padding_idx, scale_grad_by_freq, sparse);
-  return std::make_tuple(result, 0);
+  auto result = at::embedding_symint(weight_, indices_, std::move(padding_idx), scale_grad_by_freq, sparse);
+  return std::make_tuple(std::move(result), 0);
 }
 
 std::tuple<Tensor,optional<int64_t>>
@@ -59,9 +61,9 @@ embedding_dense_backward_batch_rule(
     const auto bdim_size = grad.sym_size(*grad_bdim);
     grad = reshape_dim_into(*grad_bdim, -1, grad);
     auto result = at::embedding_dense_backward_symint(
-        grad, indices, num_weights, padding_idx, scale_grad_by_freq);
+        grad, indices, std::move(num_weights), std::move(padding_idx), scale_grad_by_freq);
     result = reshape_dim_outof_symint(1, bdim_size, result);
-    return std::make_tuple(result, 1);
+    return std::make_tuple(std::move(result), 1);
   }
   const auto bdim_size = indices.size(*indices_bdim);
   indices = moveBatchDimToFront(indices, indices_bdim);
@@ -74,9 +76,9 @@ embedding_dense_backward_batch_rule(
   // Fill in the padding. We can't do it in the embedding_dense_backward call
   // because we need to fill in multiple rows!
   if (padding_idx >= 0) {
-    result.select_symint(1, padding_idx).fill_(0);
+    result.select_symint(1, std::move(padding_idx)).fill_(0);
   }
-  return std::make_tuple(result, 0);
+  return std::make_tuple(std::move(result), 0);
 }
 
 /**
@@ -114,19 +116,19 @@ grid_sample_batch_rule(const Tensor& input, optional<int64_t> input_bdim, const
     auto new_input = reshape_dim_into(*input_bdim, 1, input);
     auto out = Func(new_input, grid, std::forward<ExtraArgs>(extra_args)...);
     out = reshape_dim_outof(1, input.sizes()[*input_bdim], out);
-    result = std::make_tuple(out, 1);
+    result = std::make_tuple(std::move(out), 1);
   } else if (!input_bdim && grid_bdim) {
     // grid of N(BH)W2 -> NC(BH)W or grid of N(BD)HBW3 -> NC(BD)HW
     auto new_grid = reshape_dim_into(*grid_bdim, 1, grid);
     auto out = Func(input, new_grid, std::forward<ExtraArgs>(extra_args)...);
     out = reshape_dim_outof(2, grid.sizes()[*grid_bdim], out);
-    result = std::make_tuple(out, 2);
+    result = std::make_tuple(std::move(out), 2);
   } else if (input_bdim && grid_bdim) {
     auto new_input = reshape_dim_into(*input_bdim, 0, input);
     auto new_grid = reshape_dim_into(*grid_bdim, 0, grid);
     auto out = Func(new_input, new_grid, std::forward<ExtraArgs>(extra_args)...);
     out = reshape_dim_outof(0, input.sizes()[*grid_bdim], out);
-    result = std::make_tuple(out, 0);
+    result = std::make_tuple(std::move(out), 0);
   } else {
     result = std::make_tuple(Func(input, grid, std::forward<ExtraArgs>(extra_args)...), nullopt);
   }
@@ -154,7 +156,7 @@ grid_sample_backward_helper_in(
   grid_ = ensure_has_bdim(grid_, grid_bdim.has_value(), batch_size);
   grid_ = reshape_dim_into(0, 0, grid_);
 
-  return std::make_tuple(grad_output_, input_, grid_, batch_size);
+  return std::make_tuple(std::move(grad_output_), std::move(input_), std::move(grid_), batch_size);
 }
 
 std::tuple<Tensor, optional<int64_t>, Tensor, optional<int64_t>>
@@ -298,7 +300,7 @@ struct UpsampleBackwardBatchRuleHelper<F, Func, typelist<A, B, C, T...>> {
       c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size,
       T... extra_args) {
     auto grad_output_ = reshape_dim_into(*grad_output_bdim, 0, grad_output);
-    TORCH_INTERNAL_ASSERT(input_size.size() > 0);
+    TORCH_INTERNAL_ASSERT(!input_size.empty());
 
     // input_size is wrong so we correct it
     c10::SymDimVector physical_input_size(input_size.begin(), input_size.end());
diff --git a/aten/src/ATen/functorch/BatchRulesRandomness.cpp b/aten/src/ATen/functorch/BatchRulesRandomness.cpp
index 159abc4108e8..c9482305bbd2 100644
--- a/aten/src/ATen/functorch/BatchRulesRandomness.cpp
+++ b/aten/src/ATen/functorch/BatchRulesRandomness.cpp
@@ -8,6 +8,8 @@
 #include <ATen/functorch/DynamicLayer.h>
 #include <ATen/functorch/BatchRulesHelper.h>
 
+#include <utility>
+
 // This file contains batching rules for random operations. These are different
 // from our regular batching rules: regular batching rules get registered to the
 // FuncTorchBatched key, but batching rules for random operations get
@@ -99,11 +101,11 @@ Tensor& bernoulli_inplace_Tensor_batching_rule(Tensor& self, const Tensor& p_, c
     "If this is necessary for your usage, please file an issue with functorch.");
   if (randomness == RandomnessType::Same && self_bdim) {
     auto intermediate = empty(self.sizes(), self.options());
-    intermediate.bernoulli_(other_, gen);
+    intermediate.bernoulli_(other_, std::move(gen));
     self.copy_(intermediate); // batching should make this just work out...
     return self;
   } else {
-    self_.bernoulli_(other_, gen);
+    self_.bernoulli_(other_, std::move(gen));
     return self;
   }
 }
@@ -198,9 +200,21 @@ std::tuple<Tensor,Tensor> native_dropout_batching_rule(const Tensor& tensor, dou
     check_randomness(randomness); // if we are in eval mode, we don't use about randomness
   }
 
-  if ((train.has_value() && !train) || randomness == RandomnessType::Different) {
-    auto res = at::native_dropout(tensor_value, p, train);
-    return std::make_tuple(makeBatched(std::get<0>(res), 0, cur_level), makeBatched(std::get<1>(res), 0, cur_level));
+  if ((train.has_value() && !train) ||
+      randomness == RandomnessType::Different) {
+    if (!tensor_bdim) {
+      // if tensor is unbatched, add batch dim before
+      // calling dropout.
+      auto shape = tensor_value.sizes();
+      VmapDimVector shapeVec(1, maybe_layer->batchSize());
+      shapeVec.reserve(shape.size() + 1);
+      shapeVec.insert(shapeVec.end(), shape.begin(), shape.end());
+      tensor_value = tensor_value.expand(shapeVec);
+    }
+    auto [output, mask] = at::native_dropout(tensor_value, p, train);
+    return std::make_tuple(
+        makeBatched(std::move(output), 0, cur_level),
+        makeBatched(std::move(mask), 0, cur_level));
   }
 
   // repeated code from the CPU kernel since the CUDA one doesn't call bernoulli_ explicitly
diff --git a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
index f721e1171046..6f7ab7cdce06 100644
--- a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
@@ -9,6 +9,8 @@
 #include <ATen/Operators.h>
 #include <ATen/core/dispatch/Dispatcher.h>
 
+#include <utility>
+
 namespace at { namespace functorch {
 
 bool is_allowed_dim_on_scalar_tensor(int64_t dim) {
@@ -133,7 +135,7 @@ void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack
   if (arguments[dim_arg_pos].isIntList()) {
     reduction_case = ReductionCase::DimArray;
     dims = arguments[dim_arg_pos].toIntList().vec();
-    if (dims.size() == 0) {
+    if (dims.empty()) {
       auto all_dims = range(0, std::max((int64_t)1, logical_dim));
       dims = std::vector<int64_t>(all_dims.begin(), all_dims.end());
     }
@@ -205,7 +207,7 @@ void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack
     self = self.unsqueeze(-1);
     new_dims = {1};
   }
-  arguments[0] = self;
+  arguments[0] = std::move(self);
   if (reduction_case == ReductionCase::DimArray) {
     arguments[dim_arg_pos] = std::vector<int64_t>(new_dims.begin(), new_dims.end());
   } else if (reduction_case == ReductionCase::Dim) {
@@ -388,21 +390,21 @@ std::tuple<Tensor,optional<int64_t>> searchsorted_batch_rule(
     // B<...>D, B<...>V -> no change
     if (buckets_bdim.has_value() && self_bdim.has_value()) {
       auto self_ = moveBatchDimToFront(self, self_bdim);
-      auto result = at::searchsorted(buckets, self_, out_int32, right, side, sorter_);
-      return std::make_tuple(result, 0);
+      auto result = at::searchsorted(buckets, self_, out_int32, right, std::move(side), sorter_);
+      return std::make_tuple(std::move(result), 0);
     }
     // B<...>D, <...>V -> B<...>D, B<...>V
     if (buckets_bdim.has_value() && !self_bdim.has_value()) {
       auto self_ = moveBatchDimToFront(self, self_bdim);
       self_ = ensure_has_bdim(self_, self_bdim.has_value(), buckets.size(0));
-      auto result = at::searchsorted(buckets, self_, out_int32, right, side, sorter_);
-      return std::make_tuple(result, 0);
+      auto result = at::searchsorted(buckets, self_, out_int32, right, std::move(side), sorter_);
+      return std::make_tuple(std::move(result), 0);
     }
     // <...>D, B<...>V -> <...>D, <...>(BV)
     if (!buckets_bdim.has_value() && self_bdim.has_value()) {
       auto bdim_size = self.size(*self_bdim);
       auto self_ = reshape_dim_into(*self_bdim, -1, self);
-      auto result = at::searchsorted(buckets, self_, out_int32, right, side, sorter_);
+      auto result = at::searchsorted(buckets, self_, out_int32, right, std::move(side), sorter_);
       result = reshape_dim_outof(-1, bdim_size, result);
       return std::make_tuple(result, result.dim() - 2);
     }
@@ -413,23 +415,23 @@ std::tuple<Tensor,optional<int64_t>> searchsorted_batch_rule(
   if (buckets_bdim.has_value() && self_bdim.has_value()) {
     auto self_ = moveBatchDimToFront(self, self_bdim);
     self_ = self_.flatten(1);
-    auto result = at::searchsorted(buckets, self_, out_int32, right, side, sorter_);
+    auto result = at::searchsorted(buckets, self_, out_int32, right, std::move(side), sorter_);
     result = result.view(self_.sizes());
-    return std::make_tuple(result, 0);
+    return std::make_tuple(std::move(result), 0);
   }
   // BD, * -> BD, flat(*) -> BD, B flat(*)
   if (buckets_bdim.has_value() && !self_bdim.has_value()) {
     auto bdim_size = buckets.size(*buckets_bdim);
     auto self_ = ensure_has_bdim(self, false, bdim_size);
     self_ = self_.flatten(1);
-    auto result = at::searchsorted(buckets, self_, out_int32, right, side, sorter_);
+    auto result = at::searchsorted(buckets, self_, out_int32, right, std::move(side), sorter_);
     result = result.view(self_.sizes());
-    return std::make_tuple(result, 0);
+    return std::make_tuple(std::move(result), 0);
   }
   // D, B* -> no change
   if (!buckets_bdim.has_value() && self_bdim.has_value()) {
-    auto result = at::searchsorted(buckets, self, out_int32, right, side, sorter_);
-    return std::make_tuple(result, self_bdim);
+    auto result = at::searchsorted(buckets, self, out_int32, right, std::move(side), sorter_);
+    return std::make_tuple(std::move(result), self_bdim);
   }
   TORCH_INTERNAL_ASSERT(false);
 }
diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
index 51ee898bb745..0593dc824294 100644
--- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
@@ -43,7 +43,7 @@ static int64_t get_max_index_logical_dim(
     ArrayRef<optional<int64_t>> indices_bdims) {
   int64_t max_logical_dim = -1;
   TORCH_INTERNAL_ASSERT(indices.size() == indices_bdims.size());
-  TORCH_INTERNAL_ASSERT(indices.size() > 0);
+  TORCH_INTERNAL_ASSERT(!indices.empty());
   for (const auto i : c10::irange(0, indices.size())) {
     const auto& maybe_tensor = indices[i];
     if (!maybe_tensor.has_value() || !maybe_tensor->defined()) {
@@ -350,14 +350,12 @@ namespace {
   // /aten/src/ATen/native/TensorAdvancedIndexing.cpp#L294-L312
   VmapDimVector compute_indexed_shape(const Tensor &src, TensorList indices_list)
   {
-    int64_t dims_before = 0, dims_after = 0, dims_indexed = 0;
+    int64_t dims_before = 0, dims_indexed = 0;
     IntArrayRef replacement_shape;
     for (const auto dim : c10::irange(indices_list.size())) {
       if (!indices_list[dim].defined()) {
         if (dims_indexed == 0) {
           dims_before++;
-        } else {
-          dims_after++;
         }
       } else {
         dims_indexed++;
@@ -833,43 +831,6 @@ std::tuple<Tensor,optional<int64_t>> gather_batch_rule(
   return std::make_tuple(result, 0);
 }
 
-std::tuple<Tensor,optional<int64_t>> gather_backward_batch_rule(
-    const Tensor& grad, optional<int64_t> grad_bdim,
-    const Tensor& self, optional<int64_t> self_bdim,
-    int64_t dim,
-    const Tensor& index, optional<int64_t> index_bdim,
-    bool sparse_grad) {
-  auto batch_size = get_bdim_size3(grad, grad_bdim, self, self_bdim, index, index_bdim);
-  auto grad_ = moveBatchDimToFront(grad, grad_bdim);
-  auto self_ = moveBatchDimToFront(self, self_bdim);
-  auto index_ = moveBatchDimToFront(index, index_bdim);
-
-  auto self_logical_rank = rankWithoutBatchDim(self, self_bdim);
-  auto index_logical_rank = rankWithoutBatchDim(index, index_bdim);
-  auto grad_logical_rank = rankWithoutBatchDim(grad, grad_bdim);
-
-  if (grad_logical_rank == 0) {
-    grad_ = grad_.unsqueeze(-1);
-  }
-  if (self_logical_rank == 0) {
-    self_ = self_.unsqueeze(-1);
-  }
-  if (index_logical_rank == 0) {
-    index_ = index_.unsqueeze(-1);
-  }
-  grad_ = ensure_has_bdim(grad_, grad_bdim.has_value(), batch_size);
-  self_ = ensure_has_bdim(self_, self_bdim.has_value(), batch_size);
-  index_ = ensure_has_bdim(index_, index_bdim.has_value(), batch_size);
-
-  auto physical_dim = getPhysicalDim(self_, /*has_batch_dim*/true, dim);
-  auto result = at::gather_backward(grad_, self_, physical_dim, index_, sparse_grad);
-  // result should has same rank as self
-  if (self_logical_rank == 0) {
-    result = result.squeeze(-1);
-  }
-  return std::make_tuple(result, 0);
-}
-
 namespace {
 Tensor get_expanded_index(const Tensor& index, IntArrayRef self_size, int64_t dim) {
   if (index.dim() == 0) {
@@ -1056,6 +1017,164 @@ std::tuple<Tensor,optional<int64_t>> masked_fill_scalar_batch_rule(
   return std::make_tuple(result, 0);
 }
 
+std::tuple<Tensor,optional<int64_t>> index_fill_int_scalar_batch_rule_impl(
+    Tensor & self, optional<int64_t> self_bdim,
+    int64_t dim,
+    const Tensor & index, optional<int64_t> index_bdim,
+    const Scalar & value,
+    const bool inplace) {
+  const auto self_logical_rank = rankWithoutBatchDim(self, self_bdim);
+  const auto index_logical_rank = rankWithoutBatchDim(index, index_bdim);
+  Tensor self_ = moveBatchDimToFront(self, self_bdim);
+  Tensor index_ = moveBatchDimToFront(index, index_bdim);
+  dim = maybe_wrap_dim(dim, self_logical_rank);
+
+  if (inplace && !self_bdim.has_value()) {
+    vmapIncompatibleInplaceError("index_fill_");
+  }
+
+  if (!index_bdim) {
+    if (self_logical_rank == 0){
+      self_.unsqueeze_(-1);
+    }
+    self_.index_fill_(dim + 1, index_, value);
+    if (self_logical_rank == 0) {
+      self_.squeeze_(-1);
+    }
+    return std::make_tuple(self_, 0);
+  }
+
+  auto batch_size = get_bdim_size2(self, self_bdim, index, index_bdim);
+  self_ = ensure_has_bdim(self_, self_bdim.has_value(), batch_size);
+  index_ = ensure_has_bdim(index_, index_bdim.has_value(), batch_size);
+
+  if (inplace) {
+    // Do for-loop for in-place because we cannot reshape
+    // `self_` having an incompatible stride without copying
+    for (const auto i : c10::irange(0, batch_size)) {
+      const auto& self_slice = self_.select(0, i);
+      const auto& index_slice = index_.select(0, i);
+      self_slice.index_fill_(
+        dim,
+        index_slice,
+        value
+      );
+    }
+    return std::make_tuple(self_, 0);
+  }
+
+  self_ = self_bdim.has_value() ? self_ : self_.clone();
+
+  if (self_logical_rank != 0){
+    auto index_offset = at::arange(
+      batch_size,
+      at::TensorOptions().dtype(index_.scalar_type()).device(index_.device())
+    );
+    if (index_logical_rank == 0){
+      index_ = index_.unsqueeze(-1);
+    }
+    index_ = index_.add(index_offset.unsqueeze(-1), self_.size(dim + 1));
+    index_ = reshape_dim_into(0, 0, index_);
+    self_ = reshape_dim_into(0, dim, self_);
+    self_.index_fill_(dim, index_, value);
+    self_ = reshape_dim_outof(dim, batch_size, self_);
+    return std::make_tuple(self_, dim);
+  }
+
+  // If self_logical_rank == 0, the batch dim is certainly 0, and we must apply batched indices to each row.
+  if (index_logical_rank != 0){
+    index_ = reshape_dim_into(0, 0, index_);
+  }
+  self_.unsqueeze_(-1);
+  self_.index_fill_(dim + 1, index_, value);
+  self_.squeeze_(-1);
+
+  return std::make_tuple(self_, 0);
+}
+
+std::tuple<Tensor,optional<int64_t>> index_fill_int_tensor_batch_rule_impl(
+    Tensor & self, optional<int64_t> self_bdim,
+    int64_t dim,
+    const Tensor & index, optional<int64_t> index_bdim,
+    const Tensor & value, optional<int64_t> value_bdim,
+    const bool inplace) {
+  const auto self_logical_rank = rankWithoutBatchDim(self, self_bdim);
+  Tensor self_ = moveBatchDimToFront(self, self_bdim);
+  Tensor index_ = moveBatchDimToFront(index, index_bdim);
+  Tensor value_ = moveBatchDimToFront(value, value_bdim);
+  dim = maybe_wrap_dim(dim, self_logical_rank);
+
+  if (inplace && !self_bdim.has_value()) {
+    vmapIncompatibleInplaceError("index_fill_");
+  }
+
+  if (!index_bdim && !value_bdim) {
+    if (self_logical_rank == 0){
+      self_.unsqueeze_(-1);
+    }
+    self_.index_fill_(dim + 1, index_, value);
+    if (self_logical_rank == 0) {
+      self_.squeeze_(-1);
+    }
+    return std::make_tuple(self_, 0);
+  }
+
+  auto batch_size = get_bdim_size3(self, self_bdim, index, index_bdim, value, value_bdim);
+  self_ = ensure_has_bdim(self_, self_bdim.has_value(), batch_size);
+  index_ = ensure_has_bdim(index_, index_bdim.has_value(), batch_size);
+  value_ = ensure_has_bdim(value_, value_bdim.has_value(), batch_size);
+
+  self_ = self_bdim.has_value() ? self_ : self_.clone();
+
+  for (const auto i : c10::irange(0, batch_size)) {
+    const auto& self_slice = self_.select(0, i);
+    const auto& index_slice = index_.select(0, i);
+    const auto& value_slice = value_.select(0, i);
+    self_slice.index_fill_(
+      dim,
+      index_slice,
+      value_slice
+    );
+  }
+
+  return std::make_tuple(self_, 0);
+}
+
+void index_fill__int_scalar_batch_rule(
+    Tensor & self, optional<int64_t> self_bdim,
+    int64_t dim,
+    const Tensor & index, optional<int64_t> index_bdim,
+    const Scalar & value) {
+  index_fill_int_scalar_batch_rule_impl(self, self_bdim, dim, index, index_bdim, value, true);
+}
+
+void index_fill__int_tensor_batch_rule(
+    Tensor & self, optional<int64_t> self_bdim,
+    int64_t dim,
+    const Tensor & index, optional<int64_t> index_bdim,
+    const Tensor & value, optional<int64_t> value_bdim) {
+  index_fill_int_tensor_batch_rule_impl(self, self_bdim, dim, index, index_bdim, value, value_bdim, true);
+}
+
+std::tuple<Tensor,optional<int64_t>> index_fill_int_scalar_batch_rule(
+    const Tensor & self, optional<int64_t> self_bdim,
+    int64_t dim,
+    const Tensor & index, optional<int64_t> index_bdim,
+    const Scalar & value) {
+  auto self_ = self.clone(at::MemoryFormat::Preserve);
+  return index_fill_int_scalar_batch_rule_impl(self_, self_bdim, dim, index, index_bdim, value, false);
+}
+
+std::tuple<Tensor,optional<int64_t>> index_fill_int_tensor_batch_rule(
+    const Tensor & self, optional<int64_t> self_bdim,
+    int64_t dim,
+    const Tensor & index, optional<int64_t> index_bdim,
+    const Tensor & value, optional<int64_t> value_bdim) {
+  auto self_ = self.clone(at::MemoryFormat::Preserve);
+  return index_fill_int_tensor_batch_rule_impl(self_, self_bdim, dim, index, index_bdim, value, value_bdim, false);
+}
+
+
 TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   m.impl("index.Tensor", index_plumbing);
   m.impl("index_put_", index_put__plumbing);
@@ -1066,10 +1185,13 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   m.impl("index_copy", index_copy_decomp);
   m.impl("index_select", index_select_decomp);
   VMAP_SUPPORT2(masked_fill, Scalar, masked_fill_scalar_batch_rule);
+  VMAP_SUPPORT2(index_fill_, int_Tensor, index_fill__int_tensor_batch_rule);
+  VMAP_SUPPORT2(index_fill_, int_Scalar, index_fill__int_scalar_batch_rule);
+  VMAP_SUPPORT2(index_fill, int_Tensor, index_fill_int_tensor_batch_rule);
+  VMAP_SUPPORT2(index_fill, int_Scalar, index_fill_int_scalar_batch_rule);
   VMAP_SUPPORT(index_add, index_add_batch_rule);
   VMAP_SUPPORT(diagonal_scatter, diagonal_scatter_batch_rule);
   VMAP_SUPPORT(gather, gather_batch_rule);
-  VMAP_SUPPORT(gather_backward, gather_backward_batch_rule);
   VMAP_SUPPORT2(scatter, value, scatter_value_batch_rule);
   VMAP_SUPPORT2(scatter, src, scatter_src_batch_rule);
   VMAP_SUPPORT(scatter_add, scatter_add_batch_rule);
diff --git a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp
index 8cd4385fea86..8727144dd1fb 100644
--- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp
@@ -177,8 +177,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   UNARY_POINTWISE_ALL(leaky_relu);
   UNARY_POINTWISE(log_sigmoid);
   UNARY_POINTWISE_ALL(relu);
-  UNARY_POINTWISE_ALL(relu6);
-  UNARY_POINTWISE_ALL(selu);
   UNARY_POINTWISE_ALL(celu);
   UNARY_POINTWISE(gelu);
   UNARY_POINTWISE_ALL(sigmoid);
diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp
index 9bc67cbe8812..b0ea5e5dc454 100644
--- a/aten/src/ATen/functorch/BatchRulesViews.cpp
+++ b/aten/src/ATen/functorch/BatchRulesViews.cpp
@@ -6,6 +6,7 @@
 
 #include <ATen/functorch/BatchRulesHelper.h>
 #include <iostream>
+#include <utility>
 
 #include <ATen/Operators.h>
 #include <ATen/functorch/PlumbingHelper.h>
@@ -236,7 +237,7 @@ std::tuple<Tensor, optional<int64_t>> squeeze_batch_rule(const Tensor& self, opt
   }
 
   auto result = self.view(squeezed_sizes);
-  return std::make_tuple(result, c10::optional<int64_t>(new_batch_idx));
+  return std::make_tuple(std::move(result), c10::optional<int64_t>(new_batch_idx));
 }
 
 std::tuple<Tensor, optional<int64_t>> squeeze_dims_batch_rule(
@@ -246,7 +247,7 @@ std::tuple<Tensor, optional<int64_t>> squeeze_dims_batch_rule(
   auto ndim = self.dim();
   if (ndim == 1) {
     TORCH_CHECK(
-        dims.size() == 0 || (dims.size() == 1 && dims[0] == 0),
+        dims.empty() || (dims.size() == 1 && dims[0] == 0),
         "Dimension is out of range (expected to be in range of [-1, 0], but got ", dims);
     return std::make_tuple(self.alias(), bdim);
   }
@@ -284,13 +285,13 @@ std::tuple<std::vector<Tensor>, optional<int64_t>> chunk_batching_rule(const Ten
 
 std::tuple<Tensor, optional<int64_t>> select_batching_rule(const Tensor& self, optional<int64_t> bdim, int64_t dim, c10::SymInt index) {
   if (!bdim) {
-    return std::make_tuple(self.select_symint(dim, index), nullopt);
+    return std::make_tuple(self.select_symint(dim, std::move(index)), nullopt);
   }
 
   auto _self = moveBatchDimToFront(self, bdim);
   auto dim_physical = getPhysicalDim(_self, true, dim);
-  auto result = _self.select_symint(dim_physical, index);
-  return std::make_tuple(result, 0);
+  auto result = _self.select_symint(dim_physical, std::move(index));
+  return std::make_tuple(std::move(result), 0);
 }
 
 std::tuple<Tensor, optional<int64_t>> _reshape_alias_batch_rule(const Tensor& self, optional<int64_t> bdim, const c10::SymIntArrayRef shape, const c10::SymIntArrayRef strides) {
@@ -318,7 +319,14 @@ std::tuple<Tensor, optional<int64_t>> roll_batch_rule(const Tensor& self, option
   // We will do something like: t.reshape(a, -1).roll(1, dims=[1, ]).reshape(old_shape)
   auto old_shape = self_.sizes();
   new_dims.push_back(1);
+  auto logical_rank = rankWithoutBatchDim(self, bdim);
+  if (logical_rank == 0) {
+    self_ = self_.unsqueeze(0);
+  }
+
   auto output = at::roll(self_.flatten(1), shifts, new_dims);
+  // NOTE: For scalar tensor, we don't need to unsqueeze as reshape
+  // with `old_shape` takes care of it.
   output = output.reshape(old_shape);
   return std::make_tuple(output, 0);
 }
@@ -359,8 +367,8 @@ std::tuple<Tensor,optional<int64_t>> slice_batch_rule(
   auto self_ = moveBatchDimToFront(self, self_bdim);
   dim = getPhysicalDim(self, self_bdim.has_value(), dim);
 
-  auto result = self_.slice_symint(dim, start, end, step);
-  return std::make_tuple(result, 0);
+  auto result = self_.slice_symint(dim, std::move(start), std::move(end), std::move(step));
+  return std::make_tuple(std::move(result), 0);
 }
 
 static bool is_allowed_dim_on_scalar_tensor(int64_t dim) {
@@ -386,7 +394,7 @@ transpose_int_batch_rule(
   dim0 = getPhysicalDim(self, self_bdim.has_value(), dim0);
   dim1 = getPhysicalDim(self, self_bdim.has_value(), dim1);
   auto result = self_.transpose(dim0, dim1);
-  return std::make_tuple(result, 0);
+  return std::make_tuple(std::move(result), 0);
 }
 
 std::tuple<Tensor, optional<int64_t>> permute_batching_rule(
@@ -416,7 +424,7 @@ std::tuple<Tensor,optional<int64_t>> select_backward_batch_rule(
   c10::SymDimVector input_sizes_(input_sizes.size() + 1);
   input_sizes_[0] = grad_input_.sym_size(0);
   std::copy(input_sizes.begin(), input_sizes.end(), input_sizes_.begin() + 1);
-  auto result = at::select_backward_symint(grad_input_, input_sizes_, dim, index);
+  auto result = at::select_backward_symint(grad_input_, input_sizes_, dim, std::move(index));
   return std::make_tuple(std::move(result), 0);
 }
 
@@ -429,7 +437,7 @@ std::tuple<Tensor,optional<int64_t>> slice_backward_batch_rule(
   c10::SymDimVector input_sizes_(input_sizes.size() + 1);
   input_sizes_[0] = grad_input_.size(0);
   std::copy(input_sizes.begin(), input_sizes.end(), input_sizes_.begin() + 1);
-  auto result = at::slice_backward_symint(grad_input_, input_sizes_, dim, start, end, step);
+  auto result = at::slice_backward_symint(grad_input_, input_sizes_, dim, std::move(start), std::move(end), std::move(step));
   return std::make_tuple(std::move(result), 0);
 }
 
@@ -507,7 +515,7 @@ std::tuple<Tensor, optional<int64_t>> unfold_batch_rule(
   if (logical_rank==0) {
     result = result.squeeze(-1);
   }
-  return std::make_tuple(result, 0);
+  return std::make_tuple(std::move(result), 0);
 }
 
 std::tuple<Tensor, optional<int64_t>> narrow_copy_batch_rule(
@@ -517,9 +525,9 @@ std::tuple<Tensor, optional<int64_t>> narrow_copy_batch_rule(
   auto self_ = moveBatchDimToFront(self, self_bdim);
   auto logical_rank = rankWithoutBatchDim(self, self_bdim);
   dim = maybe_wrap_dim(dim, logical_rank) + 1;
-  auto result = self_.narrow_copy_symint(dim, start, length);
+  auto result = self_.narrow_copy_symint(dim, std::move(start), std::move(length));
 
-  return std::make_tuple(result, 0);
+  return std::make_tuple(std::move(result), 0);
 }
 
 std::tuple<std::vector<Tensor>, optional<int64_t>> unsafe_split_batch_rule(
@@ -531,8 +539,8 @@ std::tuple<std::vector<Tensor>, optional<int64_t>> unsafe_split_batch_rule(
   auto self_ = moveBatchDimToFront(self, self_bdim);
   auto logical_rank = rankWithoutBatchDim(self, self_bdim);
   dim = maybe_wrap_dim(dim, logical_rank) + 1;
-  auto result = self_.unsafe_split_symint(split_size, dim);
-  return std::make_tuple(result, 0);
+  auto result = self_.unsafe_split_symint(std::move(split_size), dim);
+  return std::make_tuple(std::move(result), 0);
 }
 
 std::tuple<Tensor, optional<int64_t>> movedim_batch_rule(const Tensor& self, optional<int64_t> self_bdim, IntArrayRef source, IntArrayRef destination) {
diff --git a/aten/src/ATen/functorch/BatchedFallback.cpp b/aten/src/ATen/functorch/BatchedFallback.cpp
index ccb7609cc84e..b12778228a8e 100644
--- a/aten/src/ATen/functorch/BatchedFallback.cpp
+++ b/aten/src/ATen/functorch/BatchedFallback.cpp
@@ -161,7 +161,7 @@ void batchedTensorInplaceForLoopFallback(const c10::OperatorHandle& op, torch::j
     batched_tensor_inputs.push_back(tensor);
     batched_tensor_inputs_position.push_back(idx);
   }
-  TORCH_INTERNAL_ASSERT(batched_tensor_inputs.size() > 0);
+  TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty());
 
   // MultiBatchVmapTransform the BatchedTensor arguments. This returns
   // VmapPhysicalViews that contain all of the batch dimensions.
@@ -306,7 +306,7 @@ void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Sta
     batched_tensor_inputs.push_back(tensor);
     batched_tensor_inputs_position.push_back(idx);
   }
-  TORCH_INTERNAL_ASSERT(batched_tensor_inputs.size() > 0);
+  TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty());
 
   // MultiBatchVmapTransform the BatchedTensor arguments. This returns
   // VmapPhysicalViews that contain all of the batch dimensions.
diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h
index 320989604570..b61edd986580 100644
--- a/aten/src/ATen/functorch/BatchedTensorImpl.h
+++ b/aten/src/ATen/functorch/BatchedTensorImpl.h
@@ -7,6 +7,7 @@
 #pragma once
 
 #include <bitset>
+#include <utility>
 
 #include <ATen/ArrayRef.h>
 #include <ATen/SmallVector.h>
@@ -116,7 +117,7 @@ inline BatchedTensorImpl* maybeGetBatchedImpl(Tensor tensor) {
   if (!isBatchedTensor(tensor)) {
     return nullptr;
   }
-  return unsafeGetBatchedImpl(tensor);
+  return unsafeGetBatchedImpl(std::move(tensor));
 }
 
 // Returns a bitset. If bit i is set, then that means dim i is a batchdim.
diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp
index 5acec2a3b019..c34c849bdc52 100644
--- a/aten/src/ATen/functorch/DynamicLayer.cpp
+++ b/aten/src/ATen/functorch/DynamicLayer.cpp
@@ -92,7 +92,7 @@ class FuncTorchTLS : public FuncTorchTLSBase {
   }
 
   int64_t checkSupportsSingleLevelAutogradFunction() const override {
-    TORCH_INTERNAL_ASSERT(dynamicLayerStack.size() == 0 || getSingleLevelAutogradFunctionAllowed(),
+    TORCH_INTERNAL_ASSERT(dynamicLayerStack.empty() || getSingleLevelAutogradFunctionAllowed(),
         "functorch functions (vmap, grad, vjp, etc.) incorrectly used with ",
         "torch.autograd.function._SingleLevelFunction. ",
         "This is not expected, please file a bug.");
@@ -100,7 +100,7 @@ class FuncTorchTLS : public FuncTorchTLSBase {
   }
 
   void checkSupportsInplaceRequiresGrad() const override {
-    TORCH_CHECK(dynamicLayerStack.size() == 0 || allow_inplace_requires_grad_,
+    TORCH_CHECK(dynamicLayerStack.empty() || allow_inplace_requires_grad_,
         "You are attempting to call Tensor.requires_grad_() (or perhaps using ",
         "torch.autograd.functional.* APIs) inside of a function being transformed ",
         "by a functorch transform. ",
@@ -109,7 +109,7 @@ class FuncTorchTLS : public FuncTorchTLSBase {
         "outside of a function being transformed instead.");
   }
   void checkSupportsRetainGrad() const override {
-    TORCH_CHECK(dynamicLayerStack.size() == 0,
+    TORCH_CHECK(dynamicLayerStack.empty(),
         "You are attempting to call Tensor.retain_grad() ",
         "inside of a function being transformed ",
         "by a functorch transform. ",
@@ -172,7 +172,7 @@ const std::shared_ptr<bool>& getLifeHandleForLevel(int64_t level) {
 
 optional<DynamicLayer> maybeCurrentDynamicLayer() {
   auto& dynamicLayerStack = dynamicLayerStackAccessor();
-  if (dynamicLayerStack.size() == 0) {
+  if (dynamicLayerStack.empty()) {
     return {};
   }
   return dynamicLayerStack.back();
@@ -182,14 +182,14 @@ struct SaveLocalDispatchKeySet {
  public:
   SaveLocalDispatchKeySet() {
     auto& dynamicLayerStack = dynamicLayerStackAccessor();
-    TORCH_INTERNAL_ASSERT(dynamicLayerStack.size() > 0);
+    TORCH_INTERNAL_ASSERT(!dynamicLayerStack.empty());
     auto& layer = dynamicLayerStack.back();
     auto tmp = c10::impl::tls_local_dispatch_key_set();
     layer.interpreter().saveLocalDispatchKeySet(tmp);
   }
   ~SaveLocalDispatchKeySet() {
     auto& dynamicLayerStack = dynamicLayerStackAccessor();
-    TORCH_INTERNAL_ASSERT(dynamicLayerStack.size() > 0);
+    TORCH_INTERNAL_ASSERT(!dynamicLayerStack.empty());
     auto& layer = dynamicLayerStack.back();
     auto tmp = layer.interpreter().getSavedLocalDispatchKeySet();
     layer.interpreter().clearSavedLocalDispatchKeySet();
@@ -209,11 +209,11 @@ void setDynamicLayerStack(const std::vector<DynamicLayer>& stack) {
 
 DynamicLayer popDynamicLayer() {
   auto& dynamicLayerStack = dynamicLayerStackAccessor();
-  TORCH_INTERNAL_ASSERT(dynamicLayerStack.size() > 0);
+  TORCH_INTERNAL_ASSERT(!dynamicLayerStack.empty());
   auto result = dynamicLayerStack.back();
   dynamicLayerStack.pop_back();
 
-  if (dynamicLayerStack.size() == 0) {
+  if (dynamicLayerStack.empty()) {
 #ifdef HAS_TORCH_SHOW_DISPATCH_TRACE
     if (c10::show_dispatch_trace_enabled()) {
       std::cout << "DynamicLayer off" << std::endl;
@@ -253,10 +253,10 @@ int64_t initAndPushDynamicLayer(
   const auto& dynamicLayerStack = dynamicLayerStackAccessor();
   const auto layerId = 1 + dynamicLayerStack.size();
   DynamicLayer new_layer(transform_type, layerId, batch_size, randomness, prev_grad_mode, prev_fwd_grad_mode, functionalize_add_back_views);
-  pushDynamicLayer(std::move(new_layer));
-
   // NB: this function should be called while holding the GIL to avoid races
   new_layer.interpreter().set_is_alive(true);
+  pushDynamicLayer(std::move(new_layer));
+
 
   if (transform_type == TransformType::Grad) {
     TORCH_INTERNAL_ASSERT(prev_grad_mode.has_value());
@@ -439,7 +439,7 @@ static void dynamicLayerFrontFallback(
     const c10::OperatorHandle& op,
     torch::jit::Stack* stack) {
   auto& dynamicLayerStack = dynamicLayerStackAccessor();
-  TORCH_INTERNAL_ASSERT(dynamicLayerStack.size() > 0);
+  TORCH_INTERNAL_ASSERT(!dynamicLayerStack.empty());
 #ifdef HAS_TORCH_SHOW_DISPATCH_TRACE
   if (c10::show_dispatch_trace_enabled()) {
     std::cout << dynamicLayerStack << std::endl;
diff --git a/aten/src/ATen/functorch/FunctionalizeInterpreter.cpp b/aten/src/ATen/functorch/FunctionalizeInterpreter.cpp
index 40e22c455509..0916a450ed29 100644
--- a/aten/src/ATen/functorch/FunctionalizeInterpreter.cpp
+++ b/aten/src/ATen/functorch/FunctionalizeInterpreter.cpp
@@ -57,7 +57,7 @@ void FunctionalizeInterpreterPtr::sendToNextInterpreterImpl(
   sanityCheckNotFunctional(op, stack, args_size);
 
   // Re-dispatch
-  if (getDynamicLayerStack().size() == 0) {
+  if (getDynamicLayerStack().empty()) {
     sanityCheckStack(op, stack);
   }
   op.callBoxed(stack);
diff --git a/aten/src/ATen/functorch/Interpreter.cpp b/aten/src/ATen/functorch/Interpreter.cpp
index 6db36eb33030..b2c4dda12570 100644
--- a/aten/src/ATen/functorch/Interpreter.cpp
+++ b/aten/src/ATen/functorch/Interpreter.cpp
@@ -6,6 +6,8 @@
 #include <ATen/functorch/ADInterpreters.h>
 #include <ATen/functorch/DynamicLayer.h>
 
+#include <utility>
+
 namespace at { namespace functorch {
 
 static DispatchKeySet get_all_dynlayer_keyset() {
@@ -92,7 +94,7 @@ void sanityCheckStack(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
         auto result = unwrapIfDead(tensor);
         auto* wrapper = maybeGetTensorWrapper(result);
         TORCH_INTERNAL_ASSERT(wrapper == nullptr);
-        auto* batched = maybeGetBatchedImpl(result);
+        auto* batched = maybeGetBatchedImpl(std::move(result));
         TORCH_INTERNAL_ASSERT(batched == nullptr);
         return tensor;
       });
diff --git a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp
index d9f6ed21f13d..0273fcd17fcc 100644
--- a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp
+++ b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp
@@ -16,6 +16,8 @@
 #include <ATen/functorch/BatchedFallback.h>
 #include <ATen/functorch/BatchRulesHelper.h>
 
+#include <utility>
+
 namespace at {
 namespace functorch {
 
@@ -155,7 +157,7 @@ Tensor& squeeze_dims__batching_rule(Tensor& self, IntArrayRef dims) {
 
   if (logical_dim == 0) {
     TORCH_CHECK(
-        dims.size() == 0 || (dims.size() == 1 && dims[0] == 0),
+        dims.empty() || (dims.size() == 1 && dims[0] == 0),
         "Dimension is out of range (expected to be in range of [-1, 0], but got ", dims);
     return self;
   }
@@ -476,7 +478,7 @@ Tensor as_strided_batching_rule(
     optional<c10::SymInt> storage_offset) {
   if (!participatesInCurrentLevel(tensor)) {
     c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
-    return at::as_strided_symint(tensor, sizes, strides, storage_offset);
+    return at::as_strided_symint(tensor, sizes, strides, std::move(storage_offset));
   }
   auto physical_view = MultiBatchVmapTransform::logicalToPhysical(tensor);
   auto num_batch_dims = physical_view.numBatchDims();
@@ -511,7 +513,7 @@ Tensor as_strided_batching_rule(
   // and creates a tensor y such that each y[i] references the same memory
   // locations as zi. See NOTE: [When will the as_strided batching rule fail?]
   auto result = physical_view.tensor().as_strided_symint(
-      physical_sizes, physical_strides, storage_offset);
+      physical_sizes, physical_strides, std::move(storage_offset));
   return physical_view.getPhysicalToLogicalMap().apply(result);
 }
 
@@ -699,7 +701,7 @@ Tensor block_diag_batching_rule(TensorList tensors) {
   auto physical_tensors = fmap(
       physical_views, [](const VmapPhysicalView& view) -> Tensor { return view.tensor(); });
   TORCH_INTERNAL_ASSERT(
-      tensors.size() > 0, "The dispatcher should not have dispatched here otherwise.");
+      !tensors.empty(), "The dispatcher should not have dispatched here otherwise.");
   // Implementing this as a dummy for loop for now, since I'm not sure how to do it any better.
   // I'm probably not accounting for potentially multiple batched dimensions?
   auto bdim = physical_tensors[0].size(0);
@@ -727,7 +729,7 @@ Tensor stack_batching_rule(TensorList tensors, int64_t dim) {
   auto physical_tensors = fmap(
       physical_views, [](const VmapPhysicalView& view) -> Tensor { return view.tensor(); });
   TORCH_INTERNAL_ASSERT(
-      tensors.size() > 0, "The dispatcher should not have dispatched here otherwise.");
+      !tensors.empty(), "The dispatcher should not have dispatched here otherwise.");
   // NB: stack wraps the dimensionality to (logical dim + 1), so we have to
   // manually handle that here.
   auto dim_physical =
diff --git a/aten/src/ATen/functorch/VmapInterpreter.cpp b/aten/src/ATen/functorch/VmapInterpreter.cpp
index a7db8f13a031..ccef0b40b57c 100644
--- a/aten/src/ATen/functorch/VmapInterpreter.cpp
+++ b/aten/src/ATen/functorch/VmapInterpreter.cpp
@@ -16,7 +16,7 @@ void VmapInterpreterPtr::sendToNextInterpreterImpl(
     torch::jit::Stack* stack,
     bool grad_special_case) {
   // Re-dispatch
-  if (getDynamicLayerStack().size() == 0) {
+  if (getDynamicLayerStack().empty()) {
     sanityCheckStack(op, stack);
   }
   op.callBoxed(stack);
diff --git a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h
index 5eb8b4b7601f..069ec825766c 100644
--- a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h
+++ b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h
@@ -74,11 +74,12 @@ class HIPStreamMasqueradingAsCUDA {
     return unwrap().pack3();
   }
 
-  static HIPStreamMasqueradingAsCUDA unpack3(int64_t stream_id,
-                                             int64_t device_index,
-                                             int64_t device_type) {
+  static HIPStreamMasqueradingAsCUDA unpack3(StreamId stream_id,
+                                             DeviceIndex device_index,
+                                             DeviceType device_type) {
     // NB: constructor manages CUDA->HIP translation for us
-    return HIPStreamMasqueradingAsCUDA(Stream::unpack3(stream_id, device_index, device_type));
+    return HIPStreamMasqueradingAsCUDA(Stream::unpack3(
+        stream_id, device_index, device_type));
   }
 
   static std::tuple<int, int> priority_range() { return HIPStream::priority_range(); }
diff --git a/aten/src/ATen/mps/IndexKernels.h b/aten/src/ATen/mps/IndexKernels.h
index df22c616baac..650da6ae9514 100644
--- a/aten/src/ATen/mps/IndexKernels.h
+++ b/aten/src/ATen/mps/IndexKernels.h
@@ -177,5 +177,226 @@ kernel void index_put_accumulate_native_dtypes<atomic_int, int>(constant IndexAB
                                                                 device   void    * outputData   [[buffer(5)]],
                                                                 uint thread_index [[thread_position_in_grid]]);
 )INDEX_METAL";
+
+static const char *SCATTER_OPS_TEMPLATE = R"METAL_SCATTER(
+struct __attribute__ ((packed)) packed_uint5{{
+  uint32_t x; uint32_t y; uint32_t z; uint32_t w; uint32_t u;
+}};
+
+kernel void scatter_kernel_5(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]],
+                             device void * dst_             [[buffer(1)]],
+                             constant packed_uint5 & size   [[buffer(2)]],
+                             constant packed_uint5 & stride [[buffer(3)]],
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint5 local_index;
+    local_index.x = linear_index / (size.u * size.w * size.z * size.y) % size.x;
+    local_index.y = linear_index / (size.u * size.w * size.z) % size.y;
+    local_index.z = linear_index / (size.u * size.w) % size.z;
+    local_index.w = linear_index / size.u % size.w;
+    local_index.u = linear_index % size.u;
+
+    packed_uint5 strided_index;
+    strided_index.x = local_index.x * stride.x;
+    strided_index.y = local_index.y * stride.y;
+    strided_index.z = local_index.z * stride.z;
+    strided_index.w = local_index.w * stride.w;
+    strided_index.u = local_index.u * stride.u;
+
+    dst[strided_index.x + strided_index.y + strided_index.z + strided_index.w + strided_index.u] = src[linear_index];
+}}
+
+kernel void scatter_kernel_4(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]],
+                             device void * dst_             [[buffer(1)]],
+                             constant packed_uint4 & size   [[buffer(2)]],
+                             constant packed_uint4 & stride [[buffer(3)]],
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint4 local_index;
+    local_index.x = linear_index / (size[3] * size[2] * size[1]) % size[0];
+    local_index.y = linear_index / (size[3] * size[2]) % size[1];
+    local_index.z = linear_index / size[3] % size[2];
+    local_index.w = linear_index % size[3];
+
+    const packed_uint4 strided_index = local_index * stride;
+    dst[strided_index.x + strided_index.y + strided_index.z + strided_index.w] = src[linear_index];
+}}
+
+kernel void scatter_kernel_3(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]],
+                             device void * dst_             [[buffer(1)]],
+                             constant packed_uint3 & size   [[buffer(2)]],
+                             constant packed_uint3 & stride [[buffer(3)]],
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint3 local_index;
+    local_index.x = linear_index / (size[2] * size[1]) % size[0];
+    local_index.y = linear_index / size[2] % size[1];
+    local_index.z = linear_index % size[2];
+
+    const packed_uint3 strided_index = local_index * stride;
+    dst[strided_index.x + strided_index.y + strided_index.z] = src[linear_index];
+}}
+
+kernel void scatter_kernel_2(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]],
+                             device void * dst_             [[buffer(1)]],
+                             constant packed_uint2 & size   [[buffer(2)]],
+                             constant packed_uint2 & stride [[buffer(3)]],
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint2 local_index;
+    local_index.x = linear_index / size[1] % size[0];
+    local_index.y = linear_index % size[1];
+
+    const packed_uint2 strided_index = local_index * stride;
+    dst[strided_index.x + strided_index.y] = src[linear_index];
+}}
+
+kernel void scatter_kernel_1(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]],
+                             device void * dst_             [[buffer(1)]],
+                             constant int & size            [[buffer(2)]],
+                             constant int & stride          [[buffer(3)]],
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    const int local_index = linear_index % size;
+    const int strided_index = local_index * stride;
+    dst[strided_index] = src[linear_index];
+}}
+)METAL_SCATTER";
+
+static const char *GATHER_OPS_TEMPLATE = R"METAL_GATHER(
+struct __attribute__ ((packed)) packed_uint5{{
+  uint32_t x; uint32_t y; uint32_t z; uint32_t w; uint32_t u;
+}};
+
+kernel void gather_kernel_5(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]],
+                            device void * dst_              [[buffer(1)]],
+                            constant packed_uint5 & size    [[buffer(2)]],
+                            constant packed_uint5 & stride  [[buffer(3)]],
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+
+    packed_uint5 local_index;
+    local_index.x = linear_index / (size.u * size.w * size.z * size.y) % size.x;
+    local_index.y = linear_index / (size.u * size.w * size.z) % size.y;
+    local_index.z = linear_index / (size.u * size.w) % size.z;
+    local_index.w = linear_index / size.u % size.w;
+    local_index.u = linear_index % size.u;
+
+    packed_uint5 strided_index;
+    strided_index.x = local_index.x * stride.x;
+    strided_index.y = local_index.y * stride.y;
+    strided_index.z = local_index.z * stride.z;
+    strided_index.w = local_index.w * stride.w;
+    strided_index.u = local_index.u * stride.u;
+
+    dst[linear_index] = src[strided_index.x + strided_index.y + strided_index.z + strided_index.w + strided_index.u];
+}}
+
+kernel void gather_kernel_4(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]],
+                            device void * dst_              [[buffer(1)]],
+                            constant packed_uint4 & size    [[buffer(2)]],
+                            constant packed_uint4 & stride  [[buffer(3)]],
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint4 local_index;
+    local_index.x = linear_index / (size[3] * size[2] * size[1]) % size[0];
+    local_index.y = linear_index / (size[3] * size[2]) % size[1];
+    local_index.z = linear_index / size[3] % size[2];
+    local_index.w = linear_index % size[3];
+
+    const packed_uint4 strided_index = local_index * stride;
+    dst[linear_index] = src[strided_index.x + strided_index.y + strided_index.z + strided_index.w];
+}}
+
+kernel void gather_kernel_3(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]],
+                            device void * dst_              [[buffer(1)]],
+                            constant packed_uint3 & size    [[buffer(2)]],
+                            constant packed_uint3 & stride  [[buffer(3)]],
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint3 local_index;
+    local_index.x = linear_index / (size[2] * size[1]) % size[0];
+    local_index.y = linear_index / size[2] % size[1];
+    local_index.z = linear_index % size[2];
+
+    const packed_uint3 strided_index = local_index * stride;
+    dst[linear_index] = src[strided_index.x + strided_index.y + strided_index.z];
+}}
+
+kernel void gather_kernel_2(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]],
+                            device void * dst_              [[buffer(1)]],
+                            constant packed_uint2 & size    [[buffer(2)]],
+                            constant packed_uint2 & stride  [[buffer(3)]],
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint2 local_index;
+    local_index.x = linear_index / size[1] % size[0];
+    local_index.y = linear_index % size[1];
+
+    const packed_uint2 strided_index = local_index * stride;
+    dst[linear_index] = src[strided_index.x + strided_index.y];
+}}
+
+kernel void gather_kernel_1(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]],
+                            device void * dst_              [[buffer(1)]],
+                            constant int & size             [[buffer(2)]],
+                            constant int & stride           [[buffer(3)]],
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    const int local_index = linear_index % size;
+    const int strided_index = local_index * stride;
+    dst[linear_index] = src[strided_index];
+}}
+)METAL_GATHER";
 }
 }
diff --git a/aten/src/ATen/mps/MPSAllocator.h b/aten/src/ATen/mps/MPSAllocator.h
index a6df567b5658..746d42712da9 100644
--- a/aten/src/ATen/mps/MPSAllocator.h
+++ b/aten/src/ATen/mps/MPSAllocator.h
@@ -1,5 +1,8 @@
 //  Copyright © 2022 Apple Inc.
 
+#pragma once
+
+#include <ATen/mps/MPSAllocatorInterface.h>
 #include <ATen/mps/MPSStream.h>
 #include <cstdio>
 #include <mutex>
@@ -9,27 +12,10 @@
 
 // this implementation is based on CUDACachingAllocator.
 // It utilizes Metal Heaps to improve the performance with buffer allocation.
+// Do not include this header. Use MPSAllocatorInterface.h instead.
 // TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
 namespace at {
 namespace mps {
-
-class IMpsAllocatorCallback {
- public:
-  enum class EventType {
-    ALLOCATED, // buffer got allocated to be used immediately
-    RECYCLED,  // buffer pulled from free list to be reused
-    FREED,     // buffer put to free list for future recycling
-    RELEASED,  // buffer memory released
-  };
-  virtual ~IMpsAllocatorCallback() = default;
-  virtual void executeMPSAllocatorCallback(void* ptr, EventType event) = 0;
-};
-
-// MPS allocator will execute every registered callback when a block of memory is freed.
-C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback);
-#define REGISTER_MPS_ALLOCATOR_CALLBACK(name, ...) \
-  C10_REGISTER_CLASS(MPSAllocatorCallbacksRegistry, name, __VA_ARGS__);
-
 namespace HeapAllocator {
 
 #define MB(x) round_page(x * 1048576UL)
@@ -255,42 +241,67 @@ class MPSHeapAllocatorImpl
     m_small_pool_private(m_device, UsageFlags::SMALL   | UsageFlags::PRIVATE | UsageFlags::HAZARD),
     // no Hazard Tracking required for the Scalar pool (synchronized manually)
     m_scalar_pool(m_device, UsageFlags::SMALL | UsageFlags::SHARED | UsageFlags::SCALAR),
-    m_total_allocated_memory(0), m_max_buffer_size([m_device maxBufferLength]),
-    m_stream(getDefaultMPSStream())
+    m_total_allocated_memory(0), m_current_allocated_memory(0),
+    m_max_buffer_size([m_device maxBufferLength]), m_stream(getDefaultMPSStream())
   {
     init_allocator();
   }
 
   // interface exposed to at::Allocator
   id<MTLBuffer> malloc(size_t size, uint32_t usage);
+  // frees a buffer and returns it into buffer pool
   void free(void* ptr);
+  // releases all the cached buffers and their associated heaps
   void emptyCache();
-  // interface exposed to internal MPS operations
+  // returns true if buffer was allocated from the shared pool
   bool isSharedBuffer(void* ptr);
-  ssize_t getRequestedBufferSize(void* ptr);
+  // get the requested unaligned size of an MTLBuffer
+  ssize_t getUnalignedBufferSize(void* ptr);
+  // set the shape of a base tensor from a view tensor
   void setBufferShape(void* ptr, const IntArrayRef& shape);
+  // retrieve the shape of a base tensor from a view tensor
   IntArrayRef getBufferShape(void* ptr);
+  // allocate a buffer from a specialized pool to import CPU scalars into GPU
   id<MTLBuffer> allocScalarBufferWithValue(void* value, size_t size);
   // this indicates how far (in Megabytes) the current total allocations are from the
   // low watermark limit which is used to detect if we're under memory pressure
   // This returns zero if we've reached the low watermark limit
   ssize_t getLowWatermarkValue();
-
-  bool getDebugVerbosity() const { return m_debug_verbosity; }
-  size_t getMaxTotalAllowedSize() const { return m_max_total_allowed_size; }
+  // (see m_low_watermark_ratio for description)
+  void setLowWatermarkRatio(double ratio);
+  // (see m_high_watermark_ratio for description)
+  void setHighWatermarkRatio(double ratio);
+  // (see m_low_watermark_limit for description)
   size_t getLowWatermarkLimit() const { return m_low_watermark_limit; }
+  // (see m_max_total_allowed_size for description)
+  size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; }
+  // (see m_total_allocated_memory for description)
+  size_t getTotalAllocatedMemory() const { return m_total_allocated_memory; }
+  // (see m_current_allocated_memory for description)
+  size_t getCurrentAllocatedMemory() const { return m_current_allocated_memory; }
+  // total GPU memory allocated in the process by Metal driver; including
+  // implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl.
+  size_t getDriverAllocatedMemory() const { return current_allocated_size(); }
+  // (see enum DebugVerbosity for description)
+  uint32_t getDebugVerbosity() const { return m_debug_verbosity; }
+  // returns the device that we allocate from
   inline id<MTLDevice> Device() const { return m_device; }
 
+  // TODO: make a common function to do size unit conversions in PyTorch.
+  inline std::string format_size(uint64_t size) const;
+
 private:
   // (see m_high_watermark_ratio for description)
   constexpr static double default_high_watermark_ratio = 1.7;
+  // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
+  constexpr static double default_high_watermark_upper_bound = 2.0;
   // (see m_low_watermark_ratio for description)
   // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
   constexpr static double default_low_watermark_ratio_unified  = 1.4;
   constexpr static double default_low_watermark_ratio_discrete = 1.0;
 
   const id<MTLDevice> m_device;
-  std::mutex m_mutex;
+  std::recursive_mutex m_mutex;
   // allocated buffers by device pointer
   ska::flat_hash_map<void*, BufferBlock*> m_allocated_buffers;
   // unallocated cached buffers larger than 1 MB
@@ -299,21 +310,26 @@ class MPSHeapAllocatorImpl
   BufferPool m_small_pool_shared, m_small_pool_private;
   // small cached buffers to import scalar values into MPS stream
   BufferPool m_scalar_pool;
-  // total memory allocated by HeapAllocator
+  // total memory allocated by HeapAllocator (including blocks in pools)
   size_t m_total_allocated_memory;
+  // currently active memory allocations in use (i.e., blocks not in pools)
+  size_t m_current_allocated_memory;
   // max buffer size allowed by Metal
   size_t m_max_buffer_size;
   // maximum total size allowed to be allocated
   size_t m_max_total_allowed_size;
-  // high watermark ratio is a hard limit for the total allowed allocations (between 0 and 1)
-  // 0 means unlimited (would spill to disk or system failure if OOM)
-  // 1 is maximum allowed by device.recommendedMaxWorkingSetSize
-  // (e.g., value 0.95 means we allocate up to 95% of total memory; beyond that allocations fail)
+  // high watermark ratio is a hard limit for the total allowed allocations
+  // 0. : disables high watermark limit (may cause system failure if system-wide OOM occurs)
+  // 1. : recommended maximum allocation size (i.e., device.recommendedMaxWorkingSetSize)
+  // >1.: allows limits beyond the device.recommendedMaxWorkingSetSize
+  // e.g., value 0.95 means we allocate up to 95% of recommended maximum
+  // allocation size; beyond that, the allocations would fail with OOM error.
   double m_high_watermark_ratio;
   // low watermark ratio is a soft limit to attempt limiting memory allocations up to the lower watermark
   // level by garbage collection or committing command buffers more frequently (a.k.a, adaptive commit).
   // Value between 0 to m_high_watermark_ratio (setting 0.0 disables adaptive commit and garbage collection)
-  // (e.g., value 0.9 means we 'attempt' to limit allocations up to 90% of total memory)
+  // e.g., value 0.9 means we 'attempt' to limit allocations up to 90% of recommended maximum
+  // allocation size.
   double m_low_watermark_ratio;
   // low watermark size limit (in Bytes) at the time we initialize the allocator
   size_t m_low_watermark_limit;
@@ -355,37 +371,14 @@ class MPSHeapAllocatorImpl
   // total allocated size instead of manually tracking in MPSAllocator
   size_t current_allocated_size() const { return [m_device currentAllocatedSize]; }
 
-  void trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const {
+  bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const {
     for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) {
-      MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(buffer_block->buffer, event);
+      MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(buffer_block ? buffer_block->buffer : nullptr, event);
     }
-  }
-
-  // TODO: make a common function to do size unit conversions in PyTorch.
-  static std::string format_size(uint64_t size) {
-    std::ostringstream os;
-    os.precision(2);
-    os << std::fixed;
-    if (size <= 1024UL) { os << size << " bytes"; }
-    else if (size <= 1048576UL) { os << ((float) size / 1024.0) << " KB"; }
-    else if (size <= 1073741824UL) { os << ((float) size / 1048576.0) << " MB"; }
-    else { os << ((float) size / 1073741824.0) << " GB"; }
-    return os.str();
+    return true;
   }
 };
 
 } // namespace HeapAllocator
-
-// interface exposed to internal MPS operations
-
-// get the requested non-aligned size of an MTL buffer
-ssize_t get_requested_buffer_size(void* ptr);
-// retrieve the shape of a base tensor from a view tensor
-IntArrayRef get_buffer_shape(void* ptr);
-// set the shape of a base tensor from a view tensor
-void set_buffer_shape(void* ptr, const IntArrayRef& shape);
-// allocate a buffer from a specialized pool to import CPU scalars into GPU
-DataPtr allocate_scalar_buffer(void* value, size_t size);
-
 } // namespace mps
 } // namespace at
diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm
index 72ed5a47e9d8..236816905c54 100644
--- a/aten/src/ATen/mps/MPSAllocator.mm
+++ b/aten/src/ATen/mps/MPSAllocator.mm
@@ -16,37 +16,49 @@
 uint64_t BufferBlock::buffer_counter = 0;
 uint64_t HeapBlock::heap_counter = 0;
 
-void MPSHeapAllocatorImpl::init_allocator()
-{
+void MPSHeapAllocatorImpl::init_allocator() {
   // debug verbosity flags (see DebugVerbosity enum)
   static const char *verbosity_str = getenv("PYTORCH_DEBUG_MPS_ALLOCATOR");
   m_debug_verbosity = verbosity_str ? strtol(verbosity_str, nullptr, 0) : DebugVerbosity::SILENT;
 
-  // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
-  const double high_watermark_upper_bound = 2.0;
-
   static const char *high_watermark_ratio_str = getenv("PYTORCH_MPS_HIGH_WATERMARK_RATIO");
-  m_high_watermark_ratio = high_watermark_ratio_str ? strtod(high_watermark_ratio_str, nullptr) : default_high_watermark_ratio;
-  TORCH_CHECK(m_high_watermark_ratio >= 0.0 && m_high_watermark_ratio <= high_watermark_upper_bound,
-              "invalid high watermark ratio ", m_high_watermark_ratio);
+  const double high_watermark_ratio = high_watermark_ratio_str ? strtod(high_watermark_ratio_str, nullptr) :
+                                                                 default_high_watermark_ratio;
+  setHighWatermarkRatio(high_watermark_ratio);
 
-  m_max_total_allowed_size = (m_high_watermark_ratio == 0.0) ? std::numeric_limits<size_t>::max() :
-                              static_cast<size_t>(m_high_watermark_ratio * (double)max_device_size());
-  // used for comparison with lower_watermark_ratio
-  const double high_watermark_limit = m_high_watermark_ratio == 0.0 ? high_watermark_upper_bound : m_high_watermark_ratio;
   const double default_low_watermark_ratio =  m_device.hasUnifiedMemory ? default_low_watermark_ratio_unified :
                                                                           default_low_watermark_ratio_discrete;
   static const char *low_watermark_ratio_str = getenv("PYTORCH_MPS_LOW_WATERMARK_RATIO");
-  m_low_watermark_ratio = low_watermark_ratio_str ? strtod(low_watermark_ratio_str, nullptr) : default_low_watermark_ratio;
-  TORCH_CHECK(m_low_watermark_ratio >= 0.0 && m_low_watermark_ratio <= high_watermark_limit,
-              "invalid low watermark ratio ", m_low_watermark_ratio);
+  const double low_watermark_ratio = low_watermark_ratio_str ? strtod(low_watermark_ratio_str, nullptr) : default_low_watermark_ratio;
+  setLowWatermarkRatio(low_watermark_ratio);
+}
+
+void MPSHeapAllocatorImpl::setHighWatermarkRatio(double ratio) {
+  TORCH_CHECK(ratio >= 0.0 && ratio <= default_high_watermark_upper_bound, "invalid high watermark ratio ", ratio);
+  m_max_total_allowed_size = (ratio == 0.0) ? std::numeric_limits<size_t>::max() :
+                             static_cast<size_t>(ratio * (double)max_device_size());
+  if (m_debug_verbosity & DebugVerbosity::PROFILING) {
+    std::cerr << "\nHigh watermark memory allocation limit: "
+              << (ratio == 0.0 ? "unlimited" : format_size(m_max_total_allowed_size)) << "\n";
+  }
+  m_high_watermark_ratio = ratio;
+}
+
+void MPSHeapAllocatorImpl::setLowWatermarkRatio(double ratio) {
+  // used for comparison with lower_watermark_ratio
+  const double high_watermark_limit = m_high_watermark_ratio == 0.0 ? default_high_watermark_upper_bound : m_high_watermark_ratio;
+  TORCH_CHECK(ratio >= 0.0 && ratio <= high_watermark_limit, "invalid low watermark ratio ", ratio);
   // we use this to detect if there's memory pressure
-  m_low_watermark_limit = (m_low_watermark_ratio == 0.0) ? std::numeric_limits<size_t>::max() :
-                          static_cast<size_t>(m_low_watermark_ratio * (double)max_device_size());
+  m_low_watermark_limit = (ratio == 0.0) ? std::numeric_limits<size_t>::max() :
+                          static_cast<size_t>(ratio * (double)max_device_size());
+  if (m_debug_verbosity & DebugVerbosity::PROFILING) {
+    std::cerr << "Low watermark memory allocation limit: "
+              << (ratio == 0.0 ? "unlimited" : format_size(m_low_watermark_limit)) << "\n";
+  }
+  m_low_watermark_ratio = ratio;
 }
 
-HeapBlock* MPSHeapAllocatorImpl::get_free_heap(AllocParams& params)
-{
+HeapBlock* MPSHeapAllocatorImpl::get_free_heap(AllocParams& params) {
   BufferPool& pool = *params.pool;
   HeapBlock *heap_block = nullptr;
   HeapBlock search_key(params.size());
@@ -73,16 +85,15 @@
   return heap_block;
 }
 
-bool MPSHeapAllocatorImpl::alloc_buffer(AllocParams& params)
-{
+bool MPSHeapAllocatorImpl::alloc_buffer(AllocParams& params) {
   if (m_max_total_allowed_size != std::numeric_limits<size_t>::max() &&
-      current_allocated_size() + params.size() > m_max_total_allowed_size)
+      current_allocated_size() + params.size() > m_max_total_allowed_size) {
     return false;
-
+  }
   HeapBlock *heap = get_free_heap(params);
-  if (!heap)
+  if (!heap) {
     return false; // this will cause releasing pool buffers to free up memory
-
+  }
   BufferPool& pool = *params.pool;
 
   id<MTLBuffer> buffer = heap->newMTLBuffer(params.size(), pool.usage);
@@ -112,12 +123,11 @@
   return true;
 }
 
-bool MPSHeapAllocatorImpl::get_free_buffer(AllocParams& params)
-{
+bool MPSHeapAllocatorImpl::get_free_buffer(AllocParams& params) {
   // this helps to monitor "implicit" allocations from MPS backend and to prevent OOM and system failure.
-  if (m_high_watermark_ratio > 0.0 && current_allocated_size() + params.size() > m_max_total_allowed_size)
+  if (m_high_watermark_ratio > 0.0 && current_allocated_size() + params.size() > m_max_total_allowed_size) {
     return false;
-
+  }
   BufferPool& pool = *params.pool;
   // track buffer reuse intervals only on large pool when low watermark limit is enabled.
   if (m_low_watermark_ratio > 0.0 && !(pool.usage & UsageFlags::SMALL)) {
@@ -157,9 +167,9 @@
     }
   }
 
-  if (!params.buffer_block)
+  if (!params.buffer_block) {
     return false; // this will make allocator to allocate a new buffer
-
+  }
   pool.buffers.erase(params.buffer_block);
   params.buffer_block->gc_count = 0;
   pool.available_size -= params.buffer_block->size;
@@ -179,8 +189,7 @@
   return true;
 }
 
-BufferBlock* MPSHeapAllocatorImpl::alloc_buffer_block(size_t size, uint32_t usage)
-{
+BufferBlock* MPSHeapAllocatorImpl::alloc_buffer_block(size_t size, uint32_t usage) {
   TORCH_CHECK(size < m_max_buffer_size, "Invalid buffer size: ", format_size(size));
 
   size_t alloc_size = get_allocation_size(size, usage);
@@ -202,6 +211,9 @@
     block_found =
         // Attempt allocate
         alloc_buffer(params) ||
+        // Callbacks might release more memory (eg. by forcing a GC in the host language) thus
+        // we can retry getting a free buffer in the pool, before trying to alloc again.
+        (trigger_memory_callbacks(nullptr, IMpsAllocatorCallback::EventType::ALLOCATION_FAILED) && get_free_buffer(params)) ||
         // Free enough available cached blocks to satisfy alloc and retry alloc.
         (release_available_cached_buffers(params) && alloc_buffer(params)) ||
         // Free all cached buffers and retry alloc.
@@ -230,12 +242,12 @@
   }
   buffer_block->in_use = true;
   buffer_block->use_count++;
+  m_current_allocated_memory += buffer_block->size;
 
   return buffer_block;
 }
 
-void MPSHeapAllocatorImpl::free_buffer(BufferBlock* buffer_block)
-{
+void MPSHeapAllocatorImpl::free_buffer(BufferBlock* buffer_block) {
   TORCH_INTERNAL_ASSERT(buffer_block->in_use);
 
   BufferPool& pool = *buffer_block->heap->pool;
@@ -244,19 +256,19 @@
   pool.available_size += buffer_block->size;
   buffer_block->shape.clear(); // reset shape
   buffer_block->in_use = false;
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(m_current_allocated_memory >= buffer_block->size);
+  m_current_allocated_memory -= buffer_block->size;
 }
 
-BufferBlock* MPSHeapAllocatorImpl::get_allocated_buffer_block(void* ptr)
-{
+BufferBlock* MPSHeapAllocatorImpl::get_allocated_buffer_block(void* ptr) {
   auto it = m_allocated_buffers.find(ptr);
-  if (it == m_allocated_buffers.end())
+  if (it == m_allocated_buffers.end()) {
     return nullptr;
-
+  }
   return it->second;
 }
 
-bool MPSHeapAllocatorImpl::release_buffer(BufferBlock* buffer_block, bool remove_empty_heap)
-{
+bool MPSHeapAllocatorImpl::release_buffer(BufferBlock* buffer_block, bool remove_empty_heap) {
   HeapBlock *heap_block = buffer_block->heap;
   BufferPool& pool = *heap_block->pool;
   m_total_allocated_memory -= buffer_block->size;
@@ -300,7 +312,7 @@
       pool.heaps_pending_update.insert(heap_block);
       m_mutex.unlock();
       m_stream->addCompletedHandler(^(id <MTLCommandBuffer>) {
-        std::lock_guard<std::mutex> lock(m_mutex);
+        std::lock_guard<std::recursive_mutex> lock(m_mutex);
         // check if the heap block still exists
         if (pool.heaps_pending_update.find(heap_block) != pool.heaps_pending_update.end()) {
           pool.heaps_pending_update.erase(heap_block);
@@ -315,16 +327,18 @@
   return false;
 }
 
-void MPSHeapAllocatorImpl::release_buffers(BufferPool& pool)
-{
-  if ((m_debug_verbosity & DebugVerbosity::PROFILING) && pool.n_buffers > 0) {
-    std::cerr << "Releasing " << pool.n_buffers
+void MPSHeapAllocatorImpl::release_buffers(BufferPool& pool) {
+  if (pool.buffers.empty()) {
+    return;
+  }
+  if ((m_debug_verbosity & DebugVerbosity::RELEASES)) {
+    std::cerr << "Releasing " << pool.buffers.size()
               << " buffers from "
               << ((pool.usage & UsageFlags::SMALL ) ? "small " : "large ")
               << ((pool.usage & UsageFlags::SHARED) ? "shared" : "private")
               << ((pool.usage & UsageFlags::SCALAR) ? " scalar" : "")
               << " pool (total size: " << format_size(pool.allocated_size)
-              << ", free buffers: " << pool.buffers.size() << ")\n";
+              << ", #buffers: " << pool.n_buffers << ")\n";
   }
   auto it = pool.buffers.begin();
   while (it != pool.buffers.end()) {
@@ -334,13 +348,12 @@
   }
 }
 
-bool MPSHeapAllocatorImpl::release_available_cached_buffers(AllocParams& params)
-{
+bool MPSHeapAllocatorImpl::release_available_cached_buffers(AllocParams& params) {
   BufferPool& pool = *params.pool;
 
-  if (pool.buffers.empty())
+  if (pool.buffers.empty()) {
     return false;
-
+  }
   auto it = pool.buffers.lower_bound(&params.search_key);
   if (it == pool.buffers.end()) {
     size_t totalReleased = 0;
@@ -356,19 +369,21 @@
         break;
       }
     }
-    if (totalReleased < params.search_key.size)
+    if (totalReleased < params.search_key.size) {
       return false;
+    }
   } else {
     release_buffer(*it);
   }
   return true;
 }
 
-bool MPSHeapAllocatorImpl::release_cached_buffers()
-{
+bool MPSHeapAllocatorImpl::release_cached_buffers() {
   if (m_debug_verbosity >= DebugVerbosity::PROFILING) {
-    std::cerr << "Releasing buffer pools (MPS allocated: " << format_size(m_total_allocated_memory)
-              << ", other allocations: " << format_size(current_allocated_size() - m_total_allocated_memory) << ")\n";
+    std::cerr << "Attempting to release cached buffers (MPS allocated: "
+              << format_size(m_total_allocated_memory)
+              << ", other allocations: "
+              << format_size(current_allocated_size() - m_total_allocated_memory) << ")\n";
   }
   // before releasing the buffers make sure the command buffer has finished.
   // we need to release the lock temporarily as synchronizing may cause deadlock with completion handlers.
@@ -384,11 +399,11 @@
   return true;
 }
 
-void MPSHeapAllocatorImpl::garbage_collect_cached_buffers(AllocParams& params)
-{
+void MPSHeapAllocatorImpl::garbage_collect_cached_buffers(AllocParams& params) {
   // skip garbage collection if memory pressure has already relieved
-  if (current_allocated_size() < m_low_watermark_limit)
+  if (current_allocated_size() < m_low_watermark_limit) {
     return;
+  }
   // attempt to collect garbage until we reach below low watermark limit
   const auto target_size = current_allocated_size() - m_low_watermark_limit;
   const BufferPool& pool = *params.pool;
@@ -438,52 +453,49 @@
 }
 
 // public interface to MPSAllocator
-id<MTLBuffer> MPSHeapAllocatorImpl::malloc(size_t size, uint32_t usage)
-{
-  std::lock_guard<std::mutex> lock(m_mutex);
+id<MTLBuffer> MPSHeapAllocatorImpl::malloc(size_t size, uint32_t usage) {
+  std::lock_guard<std::recursive_mutex> lock(m_mutex);
 
   BufferBlock* buffer_block = alloc_buffer_block(size, usage);
   return buffer_block ? buffer_block->buffer : nullptr;
 }
 
-bool MPSHeapAllocatorImpl::isSharedBuffer(void* ptr)
-{
-  std::lock_guard<std::mutex> lock(m_mutex);
+bool MPSHeapAllocatorImpl::isSharedBuffer(void* ptr) {
+  std::lock_guard<std::recursive_mutex> lock(m_mutex);
 
   BufferBlock *buffer_block = get_allocated_buffer_block(ptr);
   // it's OK for the buffer_block to not exist yet
   return buffer_block && (buffer_block->heap->pool->usage & UsageFlags::SHARED);
 }
 
-id<MTLBuffer> MPSHeapAllocatorImpl::allocScalarBufferWithValue(void* value, size_t size)
-{
+id<MTLBuffer> MPSHeapAllocatorImpl::allocScalarBufferWithValue(void* value, size_t size) {
   BufferBlock* buffer_block = nullptr;
   {
-    std::lock_guard<std::mutex> lock(m_mutex);
+    std::lock_guard<std::recursive_mutex> lock(m_mutex);
 
     buffer_block = alloc_buffer_block(size, UsageFlags::SCALAR);
-    if (!buffer_block)
+    if (!buffer_block) {
       return nullptr;
+    }
   }
   // buffer is out of the pool, so no mutex lock is needed
   memcpy([buffer_block->buffer contents], value, size);
   return buffer_block->buffer;
 }
 
-ssize_t MPSHeapAllocatorImpl::getRequestedBufferSize(void* ptr)
-{
-  std::lock_guard<std::mutex> lock(m_mutex);
+ssize_t MPSHeapAllocatorImpl::getUnalignedBufferSize(void* ptr) {
+  std::lock_guard<std::recursive_mutex> lock(m_mutex);
 
   BufferBlock *buffer_block = get_allocated_buffer_block(ptr);
-  if (buffer_block)
+  if (buffer_block) {
     return (ssize_t) buffer_block->requested_size;
+  }
   // -1 indicates the passed buffer pointer wasn't found
   return -1;
 }
 
-void MPSHeapAllocatorImpl::setBufferShape(void* ptr, const IntArrayRef& shape)
-{
-  std::lock_guard<std::mutex> lock(m_mutex);
+void MPSHeapAllocatorImpl::setBufferShape(void* ptr, const IntArrayRef& shape) {
+  std::lock_guard<std::recursive_mutex> lock(m_mutex);
 
   BufferBlock *buffer_block = get_allocated_buffer_block(ptr);
   TORCH_INTERNAL_ASSERT(buffer_block, "failed to find the buffer ", ptr);
@@ -493,22 +505,20 @@
   buffer_block->shape = shape.vec();
 }
 
-IntArrayRef MPSHeapAllocatorImpl::getBufferShape(void* ptr)
-{
-  std::lock_guard<std::mutex> lock(m_mutex);
+IntArrayRef MPSHeapAllocatorImpl::getBufferShape(void* ptr) {
+  std::lock_guard<std::recursive_mutex> lock(m_mutex);
 
   BufferBlock *buffer_block = get_allocated_buffer_block(ptr);
-  if (buffer_block && buffer_block->shape.size() > 0)
+  if (buffer_block && buffer_block->shape.size() > 0) {
     return IntArrayRef{buffer_block->shape};
-
+  }
   return IntArrayRef();
 }
 
-void MPSHeapAllocatorImpl::free(void* ptr)
-{
+void MPSHeapAllocatorImpl::free(void* ptr) {
   BufferBlock *buffer_block = nullptr;
   {
-    std::lock_guard<std::mutex> lock(m_mutex);
+    std::lock_guard<std::recursive_mutex> lock(m_mutex);
 
     buffer_block = get_allocated_buffer_block(ptr);
     TORCH_INTERNAL_ASSERT(buffer_block);
@@ -521,26 +531,36 @@
   // we sync the scalar pool manually with completion handler at the time buffer is
   // freed when the MPSScalar instance goes our of scope
   m_stream->addCompletedHandler(^(id <MTLCommandBuffer>) {
-    std::lock_guard<std::mutex> lock(m_mutex);
+    std::lock_guard<std::recursive_mutex> lock(m_mutex);
     free_buffer(buffer_block);
   });
 }
 
-void MPSHeapAllocatorImpl::emptyCache()
-{
-  std::lock_guard<std::mutex> lock(m_mutex);
+void MPSHeapAllocatorImpl::emptyCache() {
+  std::lock_guard<std::recursive_mutex> lock(m_mutex);
   release_cached_buffers();
 }
 
-ssize_t MPSHeapAllocatorImpl::getLowWatermarkValue()
-{
+ssize_t MPSHeapAllocatorImpl::getLowWatermarkValue() {
   // check if low watermark limit is disabled
-  if (m_low_watermark_ratio == 0.0)
+  if (m_low_watermark_ratio == 0.0) {
     return std::numeric_limits<ssize_t>::max();
+  }
   // current_allocated_size could exceed m_low_watermark_limit (e.g., when swapping to disk)
   return std::max<ssize_t>(0, (ssize_t)(m_low_watermark_limit - current_allocated_size()) / 1048576L);
 }
 
+inline std::string MPSHeapAllocatorImpl::format_size(uint64_t size) const {
+  std::ostringstream os;
+  os.precision(2);
+  os << std::fixed;
+  if (size <= 1024UL) { os << size << " bytes"; }
+  else if (size <= 1048576UL) { os << ((float) size / 1024.0) << " KB"; }
+  else if (size <= 1073741824UL) { os << ((float) size / 1048576.0) << " MB"; }
+  else { os << ((float) size / 1073741824.0) << " GB"; }
+  return os.str();
+}
+
 } // namespace HeapAllocator
 
 // Use "at::mps::GetMPSAllocator()" to acquire a handle to MPS Allocator
@@ -552,27 +572,19 @@
 }
 
 // MPS allocator struct to be registered with Pytorch
-struct TORCH_API MPSAllocator final : public at::Allocator {
+struct TORCH_API MPSAllocator final : public IMPSAllocator {
 public:
   explicit MPSAllocator(uint32_t Usage) :
       m_has_unified_memory(_getAllocImpl().Device().hasUnifiedMemory), m_usage(Usage)
   {
     if (_getAllocImpl().getDebugVerbosity()) {
       if (!(m_usage & HeapAllocator::UsageFlags::SHARED) || m_has_unified_memory) {
-        const size_t max_total_allowed_size = _getAllocImpl().getMaxTotalAllowedSize();
-        const size_t low_watermark_limit = _getAllocImpl().getLowWatermarkLimit();
         std::cerr << "Initializing "
                   << ((m_usage & HeapAllocator::UsageFlags::SHARED) ? "shared" : "private")
                   << " heap allocator on "
                   << (m_has_unified_memory ? "unified" : "discrete")
                   << " device memory of size "
-                  << _getAllocImpl().Device().recommendedMaxWorkingSetSize / 1048576UL << " MB"
-                  << " (max allowed: "
-                  << (max_total_allowed_size == std::numeric_limits<size_t>::max() ? "unlimited" :
-                     (to_string(max_total_allowed_size / 1048576UL) + " MB"))
-                  << ", low watermark: "
-                  << (low_watermark_limit == std::numeric_limits<size_t>::max() ? "unlimited" :
-                     (to_string(low_watermark_limit / 1048576UL) + " MB"))  << ")\n";
+                  << _getAllocImpl().format_size(_getAllocImpl().Device().recommendedMaxWorkingSetSize) << "\n";
       }
     }
   }
@@ -580,20 +592,32 @@ explicit MPSAllocator(uint32_t Usage) :
   ~MPSAllocator() override {
     _getAllocImpl().emptyCache();
   }
+  DeleterFnPtr raw_deleter() const override { return &Delete; }
 
   DataPtr allocate(const size_t nbytes) const override {
     __block id<MTLBuffer> buf = nbytes > 0 ? _getAllocImpl().malloc(nbytes, m_usage) : nullptr;
     return { buf, buf, &Delete, at::Device(at::DeviceType::MPS, 0)};
   }
 
-  DataPtr allocate_scalar_buffer(void *value, size_t size) const {
+  // implementation of IMPSAllocator interface
+  DataPtr allocScalarBufferWithValue(void *value, size_t size) const override {
     id<MTLBuffer> buf = _getAllocImpl().allocScalarBufferWithValue(value, size);
     return { buf, buf, &Delete, at::Device(at::DeviceType::MPS, 0)};
   }
-
-  DeleterFnPtr raw_deleter() const override { return &Delete; }
-  bool is_shared(void* ptr) const { return _getAllocImpl().isSharedBuffer(ptr); }
-  bool is_shared_storage_supported() const { return m_has_unified_memory; }
+  bool isSharedBuffer(void* ptr) const override { return _getAllocImpl().isSharedBuffer(ptr); }
+  bool isSharedStorageSupported() const override { return m_has_unified_memory; }
+  void emptyCache() const override { _getAllocImpl().emptyCache(); }
+  ssize_t getUnalignedBufferSize(void* ptr) const override { return _getAllocImpl().getUnalignedBufferSize(ptr); }
+  IntArrayRef getBufferShape(void* ptr) const override { return _getAllocImpl().getBufferShape(ptr); }
+  void setBufferShape(void* ptr, const IntArrayRef& shape) const override { _getAllocImpl().setBufferShape(ptr, shape); }
+  size_t getTotalAllocatedMemory() const override { return _getAllocImpl().getTotalAllocatedMemory(); }
+  size_t getCurrentAllocatedMemory() const override { return _getAllocImpl().getCurrentAllocatedMemory(); }
+  size_t getDriverAllocatedMemory() const override { return _getAllocImpl().getDriverAllocatedMemory(); }
+  ssize_t getLowWatermarkValue() const override { return _getAllocImpl().getLowWatermarkValue(); }
+  size_t getLowWatermarkLimit() const override { return _getAllocImpl().getLowWatermarkLimit(); }
+  size_t getHighWatermarkLimit() const override { return _getAllocImpl().getHighWatermarkLimit(); }
+  void setLowWatermarkRatio(double ratio) const override { _getAllocImpl().setLowWatermarkRatio(ratio); }
+  void setHighWatermarkRatio(double ratio) const override { _getAllocImpl().setHighWatermarkRatio(ratio); }
 
 private:
   bool m_has_unified_memory;
@@ -618,41 +642,17 @@ static void Delete(void* ptr) {
 }
 } // anonymous namespace
 
-at::Allocator* getMPSSharedAllocator()
-{
+IMPSAllocator* getIMPSAllocator(bool sharedAllocator) {
+  if (!sharedAllocator) {
+    return &_getPrivateAllocator();
+  }
   auto& sa = _getSharedAllocator();
-  if (sa.is_shared_storage_supported()) {
+  if (sa.isSharedStorageSupported()) {
     return &sa;
   }
-
   return nullptr;
 }
 
-at::Allocator* getMPSPrivateAllocator() {
-  return &_getPrivateAllocator();
-}
-
-// TODO: create MPSHooks interface and move these there.
-ssize_t get_requested_buffer_size(void* ptr) {
-  return _getAllocImpl().getRequestedBufferSize(ptr);
-}
-
-void set_buffer_shape(void* ptr, const IntArrayRef& shape) {
-  _getAllocImpl().setBufferShape(ptr, shape);
-}
-
-IntArrayRef get_buffer_shape(void* ptr) {
-  return _getAllocImpl().getBufferShape(ptr);
-}
-
-DataPtr allocate_scalar_buffer(void *value, size_t size) {
-  return _getPrivateAllocator().allocate_scalar_buffer(value, size);
-}
-
-uint32_t get_adaptive_commit_threshold() {
-  return _getAllocImpl().getLowWatermarkValue();
-}
-
 } // namespace mps
 
 namespace native {
@@ -664,14 +664,14 @@ uint32_t get_adaptive_commit_threshold() {
 bool is_pinned_mps(const Tensor& self, c10::optional<Device> device)
 {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps());
-  return at::mps::_getSharedAllocator().is_shared(self.storage().data());
+  return at::mps::_getSharedAllocator().isSharedBuffer(self.storage().data());
 }
 
 // torch.pin_memory() implementation
 Tensor _pin_memory_mps(const Tensor& self, c10::optional<Device> device)
 {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps());
-  auto* shared_allocator = at::mps::getMPSSharedAllocator();
+  auto* shared_allocator = at::mps::getIMPSAllocator(true);
   TORCH_CHECK(shared_allocator, "unable to pin memory on a non-unified memory device");
 
   const size_t storage_size = detail::computeStorageNbytes(self.sizes(), self.strides(), self.dtype().itemsize());
diff --git a/aten/src/ATen/mps/MPSAllocatorInterface.h b/aten/src/ATen/mps/MPSAllocatorInterface.h
new file mode 100644
index 000000000000..a7a187963e18
--- /dev/null
+++ b/aten/src/ATen/mps/MPSAllocatorInterface.h
@@ -0,0 +1,53 @@
+//  Copyright © 2023 Apple Inc.
+
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/util/Registry.h>
+#include <ATen/core/ATen_fwd.h>
+
+namespace at { namespace mps {
+
+// this is a public interface to access MPSAllocator.
+// Do not declare methods that would depend on MPS or Metal frameworks.
+class IMPSAllocator : public c10::Allocator {
+public:
+  // see the comments in MPSAllocator.h for the description of these methods.
+  virtual void emptyCache() const = 0;
+  virtual ssize_t getUnalignedBufferSize(void* ptr) const = 0;
+  virtual IntArrayRef getBufferShape(void* ptr) const = 0;
+  virtual void setBufferShape(void* ptr, const IntArrayRef& shape) const = 0;
+  virtual bool isSharedBuffer(void* ptr) const = 0;
+  virtual bool isSharedStorageSupported() const = 0;
+  virtual c10::DataPtr allocScalarBufferWithValue(void* value, size_t size) const = 0;
+  virtual void setLowWatermarkRatio(double ratio) const = 0;
+  virtual void setHighWatermarkRatio(double ratio) const = 0;
+  virtual ssize_t getLowWatermarkValue() const = 0;
+  virtual size_t getLowWatermarkLimit() const = 0;
+  virtual size_t getHighWatermarkLimit() const = 0;
+  virtual size_t getTotalAllocatedMemory() const = 0;
+  virtual size_t getCurrentAllocatedMemory() const = 0;
+  virtual size_t getDriverAllocatedMemory() const = 0;
+};
+
+class IMpsAllocatorCallback {
+ public:
+  enum class EventType {
+    ALLOCATED, // buffer got allocated to be used immediately
+    RECYCLED,  // buffer pulled from free list to be reused
+    FREED,     // buffer put to free list for future recycling
+    RELEASED,  // buffer memory released
+    ALLOCATION_FAILED // buffer allocation failed
+  };
+  virtual ~IMpsAllocatorCallback() = default;
+  virtual void executeMPSAllocatorCallback(void* ptr, EventType event) = 0;
+};
+
+// MPS allocator will execute every registered callback when a block of memory is freed.
+C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback);
+#define REGISTER_MPS_ALLOCATOR_CALLBACK(name, ...) \
+  C10_REGISTER_CLASS(MPSAllocatorCallbacksRegistry, name, __VA_ARGS__);
+
+IMPSAllocator* getIMPSAllocator(bool sharedAllocator = false);
+
+}} // namespace at::mps
diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h
index 48e1904346c1..1890d6050d94 100644
--- a/aten/src/ATen/mps/MPSDevice.h
+++ b/aten/src/ATen/mps/MPSDevice.h
@@ -27,6 +27,14 @@ using namespace std;
 namespace at {
 namespace mps {
 
+// Helper enum to check if a MPSGraph op is supported in a given macOS version
+enum class MacOSVersion : uint32_t {
+  MACOS_VER_13_0_PLUS = 0,
+  MACOS_VER_13_1_PLUS,
+  MACOS_VER_13_2_PLUS,
+  MACOS_VER_13_3_PLUS,
+};
+
 //-----------------------------------------------------------------
 //  MPSDevice
 //
@@ -56,7 +64,7 @@ class TORCH_API MPSDevice {
   /**
    * Returns whether running on Ventura or newer
    */
-  bool isMacOS13Plus() const;
+  bool isMacOS13Plus(MacOSVersion version) const;
 
   MTLFunction_t metalIndexingFunction(const std::string &kernel, MTLFunctionConstantValues_t constantValues);
 
@@ -65,14 +73,13 @@ class TORCH_API MPSDevice {
  private:
   static MPSDevice* _device;
   MTLDevice_t _mtl_device;
-  bool _macos13plus;
   MTLLibrary_t _mtl_indexing_library;
   MPSDevice();
 };
 
 TORCH_API bool is_available();
-TORCH_API bool is_macos_13_or_newer();
-
+TORCH_API bool is_macos_13_or_newer(MacOSVersion version = MacOSVersion::MACOS_VER_13_0_PLUS);
+TORCH_API void device_synchronize();
 TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);
 
 } // namespace mps
diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm
index c11621b3f354..0576f9bb7899 100644
--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@@ -3,6 +3,8 @@
 #include <c10/util/CallOnce.h>
 
 #include <ATen/mps/MPSDevice.h>
+#include <ATen/mps/MPSStream.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
 #include <ATen/mps/IndexKernels.h>
 
 namespace at {
@@ -66,7 +68,7 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
   // Create the MPSGraph and check method introduced in 12.3+
   // which is used by MPS backend.
   id mpsCD = NSClassFromString(@"MPSGraph");
-  _macos13plus = [mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == YES;
+
   if ([mpsCD instancesRespondToSelector:@selector(LSTMWithSourceTensor:
                                                        recurrentWeight:
                                                            inputWeight:
@@ -90,22 +92,39 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
 
 }
 
-bool MPSDevice::isMacOS13Plus() const {
-  return _macos13plus;
+bool MPSDevice::isMacOS13Plus(MacOSVersion version) const {
+  id mpsCD = NSClassFromString(@"MPSGraph");
+  static bool _macos_13_0_plus = [mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == YES;
+  static bool _macos_13_1_plus = [mpsCD instancesRespondToSelector:@selector(
+    sampleGridWithSourceTensor:coordinateTensor:layout:normalizeCoordinates:relativeCoordinates:alignCorners:paddingMode:samplingMode:constantValue:name:)] == YES;
+  static bool _macos_13_2_plus = [mpsCD instancesRespondToSelector:@selector(convolution3DWithSourceTensor:weightsTensor:descriptor:name:)] == YES;
+  static bool _macos_13_3_plus = NO;
+  if (@available(macOS 13.3, *))
+    _macos_13_3_plus = YES;
+
+  switch (version) {
+    case MacOSVersion::MACOS_VER_13_0_PLUS:  return _macos_13_0_plus;
+    case MacOSVersion::MACOS_VER_13_1_PLUS:  return _macos_13_1_plus;
+    case MacOSVersion::MACOS_VER_13_2_PLUS:  return _macos_13_2_plus;
+    case MacOSVersion::MACOS_VER_13_3_PLUS:  return _macos_13_3_plus;
+    default: return false;
+  }
 }
 
-at::Allocator* getMPSSharedAllocator();
-at::Allocator* getMPSPrivateAllocator();
 at::Allocator* GetMPSAllocator(bool useSharedAllocator) {
-  return useSharedAllocator ? getMPSSharedAllocator() : getMPSPrivateAllocator();
+  return getIMPSAllocator(useSharedAllocator);
 }
 
 bool is_available() {
   return MPSDevice::getInstance()->device() != nil;
 }
 
-bool is_macos_13_or_newer() {
-  return MPSDevice::getInstance()->isMacOS13Plus();
+bool is_macos_13_or_newer(MacOSVersion version) {
+  return MPSDevice::getInstance()->isMacOS13Plus(version);
+}
+
+void device_synchronize() {
+  getDefaultMPSStream()->synchronize(SyncType::COMMIT_AND_WAIT);
 }
 
 } // namespace mps
diff --git a/aten/src/ATen/mps/MPSFallback.mm b/aten/src/ATen/mps/MPSFallback.mm
index 69dd47f9c145..1d51a26b18f2 100644
--- a/aten/src/ATen/mps/MPSFallback.mm
+++ b/aten/src/ATen/mps/MPSFallback.mm
@@ -54,11 +54,8 @@ Tensor slow_conv2d_forward_mps(
   m.impl("embedding_renorm_", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("linalg_svd", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("linalg_svd.U", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
-  m.impl("repeat_interleave.Tensor", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
-  m.impl("repeat_interleave.self_Tensor", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
-  m.impl("repeat_interleave.self_int", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
-  m.impl("_fft_c2c", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
-  m.impl("_fft_r2c", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
+  m.impl("im2col", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); // Used in  preprocessing by nn.Unfold
+  m.impl("col2im", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("linalg_vector_norm", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("sgn.out", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("_slow_conv2d_forward", slow_conv2d_forward_mps);
diff --git a/aten/src/ATen/mps/MPSHooks.cpp b/aten/src/ATen/mps/MPSHooks.cpp
index 5fde8f3843fe..89adac6c34b1 100644
--- a/aten/src/ATen/mps/MPSHooks.cpp
+++ b/aten/src/ATen/mps/MPSHooks.cpp
@@ -3,6 +3,7 @@
 #include <ATen/mps/MPSHooks.h>
 #include <ATen/mps/MPSDevice.h>
 #include <ATen/mps/MPSGeneratorImpl.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
 
 namespace at {
 namespace mps {
@@ -16,6 +17,20 @@ bool MPSHooks::hasMPS() const {
   return at::mps::is_available();
 }
 
+bool MPSHooks::isOnMacOS13orNewer(unsigned minor) const {
+  switch (minor) {
+    case 0:
+      return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_0_PLUS);
+    case 1:
+      return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_1_PLUS);
+    case 2:
+      return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS);
+    default:
+      TORCH_WARN("Can't check whether running on 13.",minor,"+ returning one for 13.2+");
+      return is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS);
+  }
+}
+
 Allocator* MPSHooks::getMPSDeviceAllocator() const {
   return at::mps::GetMPSAllocator();
 }
@@ -24,6 +39,26 @@ const Generator& MPSHooks::getDefaultMPSGenerator() const {
   return at::mps::detail::getDefaultMPSGenerator();
 }
 
+void MPSHooks::deviceSynchronize() const {
+  at::mps::device_synchronize();
+}
+
+void MPSHooks::emptyCache() const {
+  at::mps::getIMPSAllocator()->emptyCache();
+}
+
+size_t MPSHooks::getCurrentAllocatedMemory() const {
+  return at::mps::getIMPSAllocator()->getCurrentAllocatedMemory();
+}
+
+size_t MPSHooks::getDriverAllocatedMemory() const {
+  return at::mps::getIMPSAllocator()->getDriverAllocatedMemory();
+}
+
+void MPSHooks::setMemoryFraction(double ratio) const {
+  at::mps::getIMPSAllocator()->setHighWatermarkRatio(ratio);
+}
+
 using at::MPSHooksRegistry;
 using at::RegistererMPSHooksRegistry;
 
diff --git a/aten/src/ATen/mps/MPSHooks.h b/aten/src/ATen/mps/MPSHooks.h
index 2bef3eac4264..9e913b38a2e1 100644
--- a/aten/src/ATen/mps/MPSHooks.h
+++ b/aten/src/ATen/mps/MPSHooks.h
@@ -13,8 +13,14 @@ struct MPSHooks : public at::MPSHooksInterface {
   MPSHooks(at::MPSHooksArgs) {}
   void initMPS() const override;
   bool hasMPS() const override;
+  bool isOnMacOS13orNewer(unsigned minor) const override;
   Allocator* getMPSDeviceAllocator() const override;
   const Generator& getDefaultMPSGenerator() const override;
+  void deviceSynchronize() const override;
+  void emptyCache() const override;
+  size_t getCurrentAllocatedMemory() const override;
+  size_t getDriverAllocatedMemory() const override;
+  void setMemoryFraction(double ratio) const override;
 };
 
 }} // at::mps
diff --git a/aten/src/ATen/mps/MPSStream.mm b/aten/src/ATen/mps/MPSStream.mm
index 04115fc268c7..f1f2d47cf1e6 100644
--- a/aten/src/ATen/mps/MPSStream.mm
+++ b/aten/src/ATen/mps/MPSStream.mm
@@ -1,15 +1,13 @@
 //  Copyright © 2022 Apple Inc.
 
 #include <ATen/mps/MPSStream.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
 
 namespace at {
 namespace mps {
 
 #define USE_COMMIT_AND_CONTINUE 1
 
-// the frequency that we commit the command buffer calculated based on low watermark ratio in MPSAllocator
-uint32_t get_adaptive_commit_threshold();
-
 //-----------------------------------------------------------------
 //  MPSStream
 //-----------------------------------------------------------------
@@ -52,7 +50,7 @@
       break;
     case SyncType::COMMIT_ADAPTIVE:
       // the adaptive commit only commits if we hit the low watermark memory threshold
-      if (get_adaptive_commit_threshold() <= 1) {
+      if (getIMPSAllocator()->getLowWatermarkValue() <= 1) {
 #if USE_COMMIT_AND_CONTINUE
         commitAndContinue();
 #else
diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp
index afe1cf91a57b..83613da65502 100644
--- a/aten/src/ATen/native/BatchLinearAlgebra.cpp
+++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp
@@ -34,8 +34,6 @@
 #include <ATen/ops/_linalg_svd_meta.h>
 #include <ATen/ops/_linalg_svd_native.h>
 #include <ATen/ops/_lu_with_info_native.h>
-#include <ATen/ops/_symeig_helper.h>
-#include <ATen/ops/_symeig_helper_native.h>
 #include <ATen/ops/all.h>
 #include <ATen/ops/arange.h>
 #include <ATen/ops/cat.h>
@@ -110,8 +108,6 @@
 #include <ATen/ops/resize_as_native.h>
 #include <ATen/ops/sum.h>
 #include <ATen/ops/svd_native.h>
-#include <ATen/ops/symeig.h>
-#include <ATen/ops/symeig_native.h>
 #include <ATen/ops/triangular_solve_meta.h>
 #include <ATen/ops/triangular_solve_native.h>
 #include <ATen/ops/tril.h>
@@ -289,12 +285,6 @@ extern "C" void cunmqr_(char *side, char *trans, int *m, int *n, int *k, std::co
 extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, double *a, int *lda, double *tau, double *c, int *ldc, double *work, int *lwork, int *info);
 extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, float *a, int *lda, float *tau, float *c, int *ldc, float *work, int *lwork, int *info);
 
-// syev
-extern "C" void zheev_(char *jobz, char *uplo, int *n, std::complex<double> *a, int *lda, double *w, std::complex<double> *work, int *lwork, double *rwork, int *info);
-extern "C" void cheev_(char *jobz, char *uplo, int *n, std::complex<float> *a, int *lda, float *w, std::complex<float> *work, int *lwork, float *rwork, int *info);
-extern "C" void dsyev_(char *jobz, char *uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *info);
-extern "C" void ssyev_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *info);
-
 // syevd
 extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex<double> *a, int *lda, double *w, std::complex<double> *work, int *lwork, double *rwork, int *lrwork, int *iwork, int *liwork, int *info);
 extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex<float> *a, int *lda, float *w, std::complex<float> *work, int *lwork, float *rwork, int *lrwork, int *iwork, int *liwork, int *info);
@@ -910,24 +900,6 @@ template<> void lapackOrmqr<float>(char side, char trans, int m, int n, int k, f
   sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, info);
 }
 
-template<> void lapackSymeig<c10::complex<double>, double>(char jobz, char uplo, int n, c10::complex<double> *a, int lda, double *w, c10::complex<double> *work, int lwork, double *rwork, int *info) {
-  zheev_(&jobz, &uplo, &n, reinterpret_cast<std::complex<double>*>(a), &lda, w, reinterpret_cast<std::complex<double>*>(work), &lwork, rwork, info);
-}
-
-template<> void lapackSymeig<c10::complex<float>, float>(char jobz, char uplo, int n, c10::complex<float> *a, int lda, float *w, c10::complex<float> *work, int lwork, float *rwork, int *info) {
-  cheev_(&jobz, &uplo, &n, reinterpret_cast<std::complex<float>*>(a), &lda, w, reinterpret_cast<std::complex<float>*>(work), &lwork, rwork, info);
-}
-
-template<> void lapackSymeig<double>(char jobz, char uplo, int n, double *a, int lda, double *w, double *work, int lwork, double* rwork, int *info) {
-  (void)rwork;  // unused
-  dsyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info);
-}
-
-template<> void lapackSymeig<float>(char jobz, char uplo, int n, float *a, int lda, float *w, float *work, int lwork, float* rwork, int *info) {
-  (void)rwork;  // unused
-  ssyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info);
-}
-
 template<> void lapackSyevd<c10::complex<double>, double>(char jobz, char uplo, int n, c10::complex<double> *a, int lda, double *w, c10::complex<double> *work, int lwork, double *rwork, int lrwork, int *iwork, int liwork, int *info) {
   zheevd_(&jobz, &uplo, &n, reinterpret_cast<std::complex<double>*>(a), &lda, w, reinterpret_cast<std::complex<double>*>(work), &lwork, rwork, &lrwork, iwork, &liwork, info);
 }
@@ -2815,134 +2787,6 @@ Tensor& linalg_eigvalsh_out(const Tensor& A, c10::string_view uplo, Tensor& L) {
   return L;
 }
 
-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ symeig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-template <typename scalar_t>
-static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool upper, int* infos) {
-#if !AT_BUILD_WITH_LAPACK()
-  AT_ERROR("symeig: LAPACK library not found in compilation");
-#else
-  using value_t = typename c10::scalar_value_type<scalar_t>::type;
-  auto self_data = self.data_ptr<scalar_t>();
-  auto eigvals_data = eigvals.data_ptr<value_t>();
-  auto self_matrix_stride = matrixStride(self);
-  auto eigvals_stride = eigvals.size(-1);
-  auto batch_size = batchCount(self);
-  auto n = self.size(-1);
-
-  char uplo = upper ? 'U' : 'L';
-  char jobz = eigenvectors ? 'V' : 'N';
-
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  int info;
-  // Run once, first to get the optimum work size.
-  // Since we deal with batches of matrices with the same dimensions, doing this outside
-  // the loop saves (batch_size - 1) workspace queries which would provide the same result
-  // and (batch_size - 1) calls to allocate and deallocate workspace using at::empty()
-  int lwork = -1;
-  scalar_t wkopt;
-
-  Tensor rwork;
-  value_t* rwork_data = nullptr;
-  if (isComplexType(at::typeMetaToScalarType(self.dtype()))) {
-    int64_t lrwork = std::max(int64_t(1), 3 * n - 2);
-    ScalarType dtype = toRealValueType(typeMetaToScalarType(self.dtype()));
-    rwork = at::empty({lrwork}, self.options().dtype(dtype));
-    rwork_data = rwork.data_ptr<value_t>();
-  }
-
-  lapackSymeig<scalar_t, value_t>(jobz, uplo, n, self_data, n, eigvals_data, &wkopt, lwork, rwork_data, &info);
-  lwork = std::max<int>(1, real_impl<scalar_t, value_t>(wkopt));
-  Tensor work = at::empty({lwork}, self.options());
-
-  for (const auto i : c10::irange(batch_size)) {
-    scalar_t* self_working_ptr = &self_data[i * self_matrix_stride];
-    value_t* eigvals_working_ptr = &eigvals_data[i * eigvals_stride];
-
-    // now compute the eigenvalues and the eigenvectors (optionally)
-    lapackSymeig<scalar_t, value_t>(jobz, uplo, n, self_working_ptr, n, eigvals_working_ptr, work.data_ptr<scalar_t>(), lwork, rwork_data, &info);
-    infos[i] = info;
-    if (info != 0) {
-      return;
-    }
-  }
-#endif
-}
-
-std::tuple<Tensor, Tensor> _symeig_helper_cpu(const Tensor& self, bool eigenvectors, bool upper) {
-  auto infos = at::zeros({batchCount(self)}, self.options().dtype(kInt));
-
-  auto self_sizes = self.sizes().vec();
-  self_sizes.pop_back();
-  ScalarType dtype = toRealValueType(typeMetaToScalarType(self.dtype()));
-  auto eigvals = at::empty(self_sizes, self.options().dtype(dtype));
-
-  if (self.numel() == 0) {
-    return std::tuple<Tensor, Tensor>(eigvals, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT));
-  }
-
-  auto self_working_copy = cloneBatchedColumnMajor(self);
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "symeig_cpu", [&]{
-    apply_symeig<scalar_t>(self_working_copy, eigvals, eigenvectors, upper, infos.data_ptr<int>());
-  });
-
-  at::_linalg_check_errors(infos, "symeig", self.dim() == 2);
-  if (eigenvectors) {
-    return std::tuple<Tensor, Tensor>(eigvals, self_working_copy);
-  } else {
-    return std::tuple<Tensor, Tensor>(eigvals, at::empty({0}, self.options()));
-  }
-}
-
-std::tuple<Tensor, Tensor> symeig(const Tensor& self, bool eigenvectors, bool upper) {
-  TORCH_WARN_ONCE(
-    "torch.symeig is deprecated in favor of torch.linalg.eigh and will be removed in a future ",
-    "PyTorch release.\n",
-    "The default behavior has changed from using the upper triangular portion of the matrix by default ",
-    "to using the lower triangular portion.\n",
-    "L, _ = torch.symeig(A, upper=upper)\n",
-    "should be replaced with\n",
-    "L = torch.linalg.eigvalsh(A, UPLO='U' if upper else 'L')\n",
-    "and\n",
-    "L, V = torch.symeig(A, eigenvectors=True)\n"
-    "should be replaced with\n",
-    "L, V = torch.linalg.eigh(A, UPLO='U' if upper else 'L')"
-  );
-  squareCheckInputs(self, "linalg.symeig");
-  return at::_symeig_helper(self, eigenvectors, upper);
-}
-
-std::tuple<Tensor&, Tensor&> symeig_out(const Tensor& self, bool eigenvectors, bool upper, Tensor& vals, Tensor& vecs) {
-  TORCH_WARN_ONCE(
-    "torch.symeig is deprecated in favor of torch.linalg.eigh and will be removed in a future ",
-    "PyTorch release.\n",
-    "The default behavior has changed from using the upper triangular portion of the matrix by default ",
-    "to using the lower triangular portion.\n",
-    "L, _ = torch.symeig(A, upper=upper)\n",
-    "should be replaced with\n",
-    "L = torch.linalg.eigvalsh(A, UPLO='U' if upper else 'L')\n",
-    "and\n",
-    "L, V = torch.symeig(A, eigenvectors=True)\n"
-    "should be replaced with\n",
-    "L, V = torch.linalg.eigh(A, UPLO='U' if upper else 'L')"
-  );
-  checkSameDevice("symeig", vals, self, "eigenvalues");
-  checkSameDevice("symeig", vecs, self, "eigenvectors");
-  checkLinalgCompatibleDtype("symeig", vecs, self, "eigenvectors");
-  // eigenvalues are always real-valued here
-  ScalarType real_dtype = toRealValueType(self.scalar_type());
-  checkLinalgCompatibleDtype("symeig", vals.scalar_type(), real_dtype, "eigenvalues");
-
-  Tensor vals_tmp, vecs_tmp;
-  std::tie(vals_tmp, vecs_tmp) = at::symeig(self, eigenvectors, upper);
-
-  at::native::resize_output(vals, vals_tmp.sizes());
-  at::native::resize_output(vecs, vecs_tmp.sizes());
-  vals.copy_(vals_tmp);
-  vecs.copy_(vecs_tmp);
-  return std::tuple<Tensor&, Tensor&>(vals, vecs);
-}
-
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linalg_eig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 // This function returns complex-valued eigenvectors that is obtained from LAPACK GEEV's real-valued output
diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp
index e53d8cd2d38f..8f36ae8c3fa9 100644
--- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp
+++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp
@@ -16,7 +16,6 @@
 #include <ATen/ops/empty.h>
 #include <ATen/ops/empty_strided.h>
 #endif
-
 namespace at { namespace native {
 
 namespace {
@@ -915,12 +914,28 @@ void apply_lu_factor(const Tensor& input, const Tensor& pivots, const Tensor& in
   auto n = input.size(-1);
   auto leading_dimension = std::max<int64_t>(1, m);
 
-  for (const auto i : c10::irange(batch_size)) {
-    scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
-    int* pivots_working_ptr = &pivots_data[i * pivots_stride];
-    int* infos_working_ptr = &infos_data[i];
-    lapackLu<scalar_t>(m, n, input_working_ptr, leading_dimension, pivots_working_ptr, infos_working_ptr);
-  }
+  const auto loop = [&](int64_t start, int64_t end) {
+    for (const auto i : c10::irange(start, end)) {
+      scalar_t* input_working_ptr = &input_data[i * input_matrix_stride];
+      int* pivots_working_ptr = &pivots_data[i * pivots_stride];
+      int* infos_working_ptr = &infos_data[i];
+      lapackLu<scalar_t>(
+          m,
+          n,
+          input_working_ptr,
+          leading_dimension,
+          pivots_working_ptr,
+          infos_working_ptr);
+    }
+  };
+  // avoid overflow
+  float matrix_rank = float(std::min(m, n));
+  // A heuristic tested on a 32 core/socket ICX system
+  // https://github.com/pytorch/pytorch/pull/93037#discussion_r1090112948
+  int64_t chunk_size_per_thread = int64_t(
+      std::min(1.0, 3200.0 / (matrix_rank * matrix_rank * matrix_rank)));
+  int64_t grain_size = chunk_size_per_thread * at::get_num_threads();
+  at::parallel_for(0, batch_size, grain_size, loop);
 #endif
 }
 
diff --git a/aten/src/ATen/native/BucketizationUtils.h b/aten/src/ATen/native/BucketizationUtils.h
index e23fa1267807..06dbcce033fd 100644
--- a/aten/src/ATen/native/BucketizationUtils.h
+++ b/aten/src/ATen/native/BucketizationUtils.h
@@ -134,6 +134,13 @@ inline void searchsorted_pre_check(
 
     TORCH_CHECK(sorter.scalar_type() == ScalarType::Long, "torch.searchsorted(): sorter must be a tensor of long ",
       "dtype but got dtype ", sorter.scalar_type());
+
+    if (sorter.numel() > 0) {
+      auto minmax = sorter.aminmax();
+      int64_t vmin = std::get<0>(minmax).item().toLong();
+      int64_t vmax = std::get<1>(minmax).item().toLong();
+      TORCH_CHECK(vmin >= 0 && vmax < sorter.sizes().back(), "torch.searchsorted(): sorter index out of range");
+    }
   }
 
   TORCH_CHECK(input.dim() > 0 || (input.dim() == 0 && input.numel() == 1 && boundaries.dim() == 1),
diff --git a/aten/src/ATen/native/CPUFallback.cpp b/aten/src/ATen/native/CPUFallback.cpp
index 985ee15a5a99..e1c6b6fcda86 100644
--- a/aten/src/ATen/native/CPUFallback.cpp
+++ b/aten/src/ATen/native/CPUFallback.cpp
@@ -50,7 +50,7 @@ c10::optional<c10::Device> compute_target_device(std::vector<at::Tensor>& t_args
   // Decide what device to move the output tensor(s) to.
   // The current convention is that we use the first tensor arg to pick the device
   // Barring that, we take the first tensor from a TensorList arg.
-  if (t_args.size() > 0) {
+  if (!t_args.empty()) {
     return t_args[0].device();
   } else {
     // We need to loop through all of the (potentially multiple) TensorList arguments
diff --git a/aten/src/ATen/native/Collectives.cpp b/aten/src/ATen/native/Collectives.cpp
new file mode 100644
index 000000000000..302a7331e72a
--- /dev/null
+++ b/aten/src/ATen/native/Collectives.cpp
@@ -0,0 +1,32 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+
+#include <ATen/core/Tensor.h>
+#include <ATen/Parallel.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#endif
+
+namespace at {
+namespace native {
+
+// Dummy impls required by codegen infra, not used
+// These should never get called
+// Defer to python impls in torch/distributed/_functional_collectives.py and _meta_registrations.py
+
+at::Tensor all_reduce(at::Tensor const& self, const c10::string_view reduceOp, const c10::string_view tag, c10::ArrayRef<int64_t> ranks, int64_t group_size) {
+    TORCH_INTERNAL_ASSERT(false);
+}
+
+at::Tensor all_gather_into_tensor(at::Tensor const& shard, const c10::string_view tag, c10::ArrayRef<int64_t> ranks, int64_t group_size) {
+    TORCH_INTERNAL_ASSERT(false);
+}
+
+at::Tensor wait_tensor(at::Tensor const& self) {
+    TORCH_INTERNAL_ASSERT(false);
+}
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/ComplexHelper.h b/aten/src/ATen/native/ComplexHelper.h
index 9533115a7066..688b592c7d2b 100644
--- a/aten/src/ATen/native/ComplexHelper.h
+++ b/aten/src/ATen/native/ComplexHelper.h
@@ -8,6 +8,8 @@
 #else
 #include <ATen/ops/view_as_real_native.h>
 #include <ATen/ops/view_as_complex_native.h>
+
+#include <utility>
 #endif
 
 // WARNING: this header contains non-inline functions and should be only
@@ -47,7 +49,7 @@ Tensor _view_as_real_physical(const Tensor& self) {
   auto new_strides = computeStrideForViewAsReal(self.sym_strides());
   auto new_storage_offset = self.sym_storage_offset() * 2;
   const auto float_type = c10::toRealValueType(self.scalar_type());
-  auto real_tensor = view_tensor(self, float_type, new_storage_offset, new_sizes, new_strides);
+  auto real_tensor = view_tensor(self, float_type, std::move(new_storage_offset), new_sizes, new_strides);
   return real_tensor;
 }
 
@@ -79,7 +81,7 @@ Tensor view_as_complex(const Tensor& self) {
     "view_as_complex is only supported for half, float and double tensors, but got a tensor of scalar type: ", self.scalar_type());
 
   auto old_sizes = self.sym_sizes();
-  TORCH_CHECK(old_sizes.size() != 0, "Input tensor must have one or more dimensions");
+  TORCH_CHECK(!old_sizes.empty(), "Input tensor must have one or more dimensions");
   TORCH_CHECK(old_sizes[old_sizes.size()-1] == 2, "Tensor must have a last dimension of size 2");
   SymDimVector new_sizes(old_sizes.begin(), old_sizes.end() - 1);
 
diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h
index da702e1bc8c0..0d9a0a049624 100644
--- a/aten/src/ATen/native/ConvUtils.h
+++ b/aten/src/ATen/native/ConvUtils.h
@@ -44,6 +44,13 @@ using mkldnn_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tens
     const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
     at::IntArrayRef, int64_t, std::array<bool,3>);
 DECLARE_DISPATCH(mkldnn_convolution_backward_fn, mkldnn_convolution_backward_stub);
+using mkldnn_convolution_transpose_fn = Tensor(*)(const Tensor&, const Tensor&, const c10::optional<Tensor>&,
+    IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t);
+DECLARE_DISPATCH(mkldnn_convolution_transpose_fn, mkldnn_convolution_transpose_stub);
+using mkldnn_convolution_transpose_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, int64_t, std::array<bool,3>);
+DECLARE_DISPATCH(mkldnn_convolution_transpose_backward_fn, mkldnn_convolution_transpose_backward_stub);
 using slow_conv_dilated2d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
     const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
     at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
@@ -91,6 +98,7 @@ enum class ConvBackend {
   MiopenDepthwise,
   MiopenTranspose,
   Mkldnn,
+  MkldnnTranspose,
   MkldnnEmpty,
   NnpackSpatial,
   Overrideable,
@@ -207,7 +215,7 @@ static inline std::vector<T> _conv_output_size(
 ) {
   // ASSERT(input_size.size() > 2)
   // ASSERT(input_size.size() == weight_size.size())
-  bool has_dilation = dilation.size() > 0;
+  bool has_dilation = !dilation.empty();
   auto dim = input_size.size();
   std::vector<T> output_size(dim);
   output_size[0] = input_size[input_batch_size_dim];
@@ -336,7 +344,6 @@ static inline at::MemoryFormat cudnn_conv_suggest_memory_format(const at::Tensor
 }
 
 static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
-
   // disable NHWC for float64 input.
   if (!at::detail::getCUDAHooks().compiledWithMIOpen() ||
       input.scalar_type() == at::kDouble ||
@@ -344,20 +351,13 @@ static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const
     return false;
   }
 
-  bool can_use_miopen_channels_last_2d = false;
-#if defined(USE_ROCM) && (ROCM_VERSION >= 40300)
-  // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
-  // See #64427
-  static c10::optional<bool> PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC");
-
   auto input_memory_format = input.suggest_memory_format();
   auto weight_memory_format = weight.suggest_memory_format();
 
-  can_use_miopen_channels_last_2d = PYTORCH_MIOPEN_SUGGEST_NHWC &&  *PYTORCH_MIOPEN_SUGGEST_NHWC && (
-            ( (input_memory_format  == at::MemoryFormat::ChannelsLast) ||
-            (weight_memory_format == at::MemoryFormat::ChannelsLast) )
-        );
-#endif
+  bool can_use_miopen_channels_last_2d = (
+    (input_memory_format  == at::MemoryFormat::ChannelsLast) ||
+    (weight_memory_format == at::MemoryFormat::ChannelsLast)
+  );
 
   bool can_use_miopen_channels_last_3d = false;
 
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index 687a89c298b9..a5959ef36cae 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -500,13 +500,18 @@ struct ConvParams {
     if (!at::globalContext().userEnabledMkldnn()) {
       return false;
     }
+    if (transposed && is_output_padding_big()) {
+      return false;
+    }
+    if (transposed && groups > 1 && at::symint::size<T>(input, 1) == groups) {
+      return false;
+    }
     if (input.device().is_cpu() && input.scalar_type() == kBFloat16 && mkldnn_bf16_device_check()) {
       return true;
     }
     return (input.is_mkldnn()) || // input is mkldnn Tensor
       (input.device().is_cpu() &&
        input.scalar_type() == kFloat && // only on CPU Float Tensors
-       !transposed && // or transposed tensors
        // For 1x1 filters, MKLDNN is faster than THNN when multi-threaded,
        // but THNN is faster when single-threaded.
        (is_strided() || is_dilated() || at::symint::size<T>(input, 0) >= 16 ||
@@ -598,6 +603,8 @@ DEFINE_DISPATCH(miopen_convolution_backward_stub);
 DEFINE_DISPATCH(miopen_convolution_transpose_backward_stub);
 DEFINE_DISPATCH(miopen_depthwise_convolution_backward_stub);
 DEFINE_DISPATCH(mkldnn_convolution_backward_stub);
+DEFINE_DISPATCH(mkldnn_convolution_transpose_stub);
+DEFINE_DISPATCH(mkldnn_convolution_transpose_backward_stub);
 DEFINE_DISPATCH(slow_conv_dilated2d_backward_stub);
 DEFINE_DISPATCH(slow_conv_dilated3d_backward_stub);
 DEFINE_DISPATCH(slow_conv_transpose2d_backward_stub);
@@ -762,7 +769,7 @@ static void check_input_same_type_as_parameters(
     const Tensor& weight,
     const Tensor& bias,
     const ConvBackend backend) {
-  if (backend == ConvBackend::Mkldnn) {
+  if (backend == ConvBackend::Mkldnn || backend == ConvBackend::MkldnnTranspose) {
     TORCH_CHECK(input.options().type_equal(weight.options())
         || (input.is_mkldnn() && weight.device().is_cpu() && weight.scalar_type() == kFloat),
         "Input type (", input.toString(), ") and weight type (", weight.toString(),
@@ -1164,9 +1171,6 @@ at::Tensor convolution_overrideable(
     const Tensor& input, const Tensor& weight, const c10::optional<Tensor>& bias_opt,
     IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation,
     bool transposed, IntArrayRef output_padding, int64_t groups) {
-  // See [Note: hacky wrapper removal for optional tensor]
-  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
-
   TORCH_CHECK_NOT_IMPLEMENTED(false, "convolution_overrideable not implemented. You are likely triggering this with tensor backend other than CPU/CUDA/MKLDNN, if this is intended, please use TORCH_LIBRARY_IMPL to override this function ");
 }
 
@@ -1218,7 +1222,11 @@ ConvBackend _select_conv_backend(
       return ConvBackend::Miopen;
     }
   } else if (params.use_mkldnn(input, weight)) {
-    return ConvBackend::Mkldnn;
+    if (params.transposed) {
+      return ConvBackend::MkldnnTranspose;
+    } else {
+      return ConvBackend::Mkldnn;
+    }
   } else if (!need_backward && params.use_xnnpack(input, weight, bias_sizes_opt)) {
     // Using prepacked conv is preferred, but XNNPACK is still the fastest
     // option for NHWC.
@@ -1407,12 +1415,14 @@ static inline at::MemoryFormat determine_backend_memory_format(
       }
       break;
     case ConvBackend::Mkldnn:
+    case ConvBackend::MkldnnTranspose:
       if (mkldnn_conv_use_channels_last(input, weight)) {
-        backend_memory_format = (k == 5) ? at::MemoryFormat::Contiguous /*at::MemoryFormat::ChannelsLast3d*/ : at::MemoryFormat::ChannelsLast;
+        backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
       }
       break;
     case ConvBackend::Slow2d:
     case ConvBackend::SlowDilated2d:
+    case ConvBackend::SlowTranspose2d:
       if (thnn_conv_use_channels_last(input, weight)) {
         backend_memory_format = at::MemoryFormat::ChannelsLast;
       }
@@ -1563,6 +1573,21 @@ at::Tensor _convolution(
           input, weight, bias, params.padding, params.stride, params.dilation, params.groups);
 #else
       TORCH_INTERNAL_ASSERT(false, "Mkldnn backend was selected in PyTorch compiled without mkldnn support");
+#endif
+      break;
+    case ConvBackend::MkldnnTranspose:
+#if AT_MKLDNN_ENABLED()
+      check_input_same_type_as_parameters(input, weight, bias, backend);
+      if (!input.is_mkldnn()) {
+        // need to ensure contiguous for non-mkldnn tensors
+        input = input.contiguous(backend_memory_format);
+        weight = weight.contiguous(backend_memory_format);
+        bias = bias.defined() ? bias.contiguous() : bias;
+      }
+      output = mkldnn_convolution_transpose_stub(input.device().type(),
+          input, weight, bias, params.padding, params.output_padding, params.stride, params.dilation, params.groups);
+#else
+      TORCH_INTERNAL_ASSERT(false, "Mkldnn backend was selected in PyTorch compiled without mkldnn support");
 #endif
       break;
     case ConvBackend::MkldnnEmpty:
@@ -2137,6 +2162,17 @@ std::tuple<Tensor, Tensor, Tensor> convolution_backward(
         mkldnn_convolution_backward_stub(input.device().type(), input, grad_output, weight, params.padding,
           params.stride, params.dilation, params.groups, output_mask);
       break;
+    case ConvBackend::MkldnnTranspose:
+      TORCH_CHECK(!weight.is_mkldnn(),
+          "The MKLDNN backend does not support weight as an MKLDNN tensor during training");
+      if (!input.is_mkldnn()) {
+        input = input.contiguous(backend_memory_format);
+        weight = weight.contiguous(backend_memory_format);
+      }
+      std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) =
+        mkldnn_convolution_transpose_backward_stub(input.device().type(), input, grad_output, weight, params.padding,
+        params.output_padding, params.stride, params.dilation, params.groups, output_mask);
+      break;
     case ConvBackend::Overrideable:
       // Only reach here when input is backend with out-of-source implementation.
       std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) =
diff --git a/aten/src/ATen/native/ConvolutionMM3d.cpp b/aten/src/ATen/native/ConvolutionMM3d.cpp
index 3569a9a55d8e..b8d6afdde604 100644
--- a/aten/src/ATen/native/ConvolutionMM3d.cpp
+++ b/aten/src/ATen/native/ConvolutionMM3d.cpp
@@ -574,7 +574,7 @@ Tensor& slow_conv3d_forward_out_cpu(const Tensor& self,
 
   // TODO: hacky way of deciding the groups
   // Assuming the group size is checked in upstream functions
-  const int64_t groups = self.size(1) / weight.size(1);
+  const int64_t groups = weight.size(1) > 0 ? self.size(1) / weight.size(1) : 0;
 
   slow_conv3d_shape_check(
       self,
diff --git a/aten/src/ATen/native/DilatedMaxPool2d.cpp b/aten/src/ATen/native/DilatedMaxPool2d.cpp
index 576e28866cbc..86d247244037 100644
--- a/aten/src/ATen/native/DilatedMaxPool2d.cpp
+++ b/aten/src/ATen/native/DilatedMaxPool2d.cpp
@@ -32,7 +32,7 @@ bool ceil_mode) {
 
   // NB: stride default is not expressible as an integer constant, so we accept
   // empty stride for this case
-  TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2,
+  TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
     "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints")
   const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
   const int dW = stride.empty() ? kW :
@@ -105,7 +105,7 @@ const Tensor& indices) {
 
   // NB: stride default is not expressible as an integer constant, so we accept
   // empty stride for this case
-  TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2,
+  TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
     "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints")
   const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
   const int dW = stride.empty() ? kW :
diff --git a/aten/src/ATen/native/DilatedMaxPool3d.cpp b/aten/src/ATen/native/DilatedMaxPool3d.cpp
index 643943160556..dcb1a09d379e 100644
--- a/aten/src/ATen/native/DilatedMaxPool3d.cpp
+++ b/aten/src/ATen/native/DilatedMaxPool3d.cpp
@@ -164,7 +164,7 @@ void max_pool3d_with_indices_out_cpu_template(
   const int kH = kernel_size.size() == 1 ? kT : safe_downcast<int, int64_t>(kernel_size[1]);
   const int kW = kernel_size.size() == 1 ? kT : safe_downcast<int, int64_t>(kernel_size[2]);
 
-  TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 3,
+  TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 3,
     "max_pool3d: stride must either be omitted, a single int, or a tuple of three ints")
   const int dT = stride.empty() ? kT : safe_downcast<int, int64_t>(stride[0]);
   const int dH = stride.empty() ? kH :
@@ -372,7 +372,7 @@ Tensor& max_pool3d_with_indices_backward_out_cpu_template(
   const int kH = kernel_size.size() == 1 ? kT : safe_downcast<int, int64_t>(kernel_size[1]);
   const int kW = kernel_size.size() == 1 ? kT : safe_downcast<int, int64_t>(kernel_size[2]);
 
-  TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 3,
+  TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 3,
     "max_pool3d: stride must either be omitted, a single int, or a tuple of three ints")
   const int dT = stride.empty() ? kT : safe_downcast<int, int64_t>(stride[0]);
   const int dH = stride.empty() ? kH :
diff --git a/aten/src/ATen/native/DistributionTemplates.h b/aten/src/ATen/native/DistributionTemplates.h
index 2132407df80f..0a9b5c4fea8c 100644
--- a/aten/src/ATen/native/DistributionTemplates.h
+++ b/aten/src/ATen/native/DistributionTemplates.h
@@ -312,7 +312,7 @@ Tensor& geometric_impl_(Tensor& self, double p, c10::optional<Generator> gen) {
 
 template<template<typename> class exponential_kernel, typename RNG>
 Tensor& exponential_impl_(Tensor& self, double lambda, c10::optional<Generator> gen) {
-  TORCH_CHECK(lambda >= 0.0, "exponential_ expects lambda >= 0.0, but found lambda=", lambda);
+  TORCH_CHECK(lambda > 0.0, "exponential_ expects lambda > 0.0, but found lambda=", lambda);
   auto iter = TensorIterator::borrowing_nullary_op(self);
   exponential_kernel<RNG>()(iter, lambda, gen);
   return self;
@@ -322,6 +322,10 @@ Tensor& exponential_impl_(Tensor& self, double lambda, c10::optional<Generator>
 
 template<template<typename> class cauchy_kernel, typename RNG>
 Tensor& cauchy_impl_(Tensor& self, double median, double sigma, c10::optional<Generator> gen) {
+  // TODO: instead of variable name 'sigma', use 'gamma' or 'scale'
+  // the variance, squared sigma, is undefined for cauchy distribution
+  TORCH_CHECK(sigma > 0.0, "cauchy_ expects sigma > 0.0, but found sigma=", sigma);
+  TORCH_CHECK(at::isFloatingType(self.scalar_type()), "Cauchy distribution is a continuous probability distribution. dtype must be a floating point but you specified ", self.dtype());
   auto iter = TensorIterator::borrowing_nullary_op(self);
   cauchy_kernel<RNG>()(iter, median, sigma, gen);
   return self;
diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp
index 48537aacbdc2..b592b248f0e3 100644
--- a/aten/src/ATen/native/EmbeddingBag.cpp
+++ b/aten/src/ATen/native/EmbeddingBag.cpp
@@ -1,10 +1,11 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
-#include <ATen/native/EmbeddingBag.h>
 #include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
 #include <ATen/TensorOperators.h>
-#include <ATen/TensorUtils.h>
 #include <ATen/TensorSubclassLikeUtils.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/cpu/vec/vec.h>
+#include <ATen/native/EmbeddingBag.h>
 
 #include <ATen/native/CPUBlas.h>
 #include <ATen/native/NonSymbolicBC.h>
@@ -86,14 +87,20 @@ std::pair<Tensor, Tensor> promoteIndicesAndOffsets(
 // is only applicable if special conditions are met
 template<typename index_t>
 bool is_fast_path_index_select(const Tensor& src, Tensor& output, index_t padding_idx) {
-  return (src.scalar_type() == kFloat || src.scalar_type() == kHalf) && src.strides()[1] == 1 && output.strides()[1] == 1 && padding_idx < static_cast<index_t>(0);
+  return (src.scalar_type() == kFloat || src.scalar_type() == kHalf ||
+          src.scalar_type() == kBFloat16) &&
+      src.strides()[1] == 1 && output.strides()[1] == 1 &&
+      padding_idx < static_cast<index_t>(0);
 }
 
 // Determines if we can use a fast implementation for index_select_scale_add,
 // which is only applicable if special conditions are met
 template<typename index_t>
 bool is_fast_path_index_select_scale(const Tensor& src, const Tensor& scale, Tensor& output, index_t padding_idx) {
-  return (src.scalar_type() == kFloat || src.scalar_type() == kHalf) && src.strides()[1] == 1 && output.strides()[1] == 1 && scale.strides()[0] == 1 && padding_idx < static_cast<index_t>(0);
+  return (src.scalar_type() == kFloat || src.scalar_type() == kHalf ||
+          src.scalar_type() == kBFloat16) &&
+      src.strides()[1] == 1 && output.strides()[1] == 1 &&
+      scale.strides()[0] == 1 && padding_idx < static_cast<index_t>(0);
 }
 
 template<typename index_t>
@@ -106,17 +113,18 @@ bool is_fast_path(const Tensor& src, const c10::optional<Tensor>& scale, Tensor&
 // This function combines index_select (using select_indices as the index) and
 // index_add (using add_indices as the index), without creating an intermediary
 // tensor to hold the selected embeddings
-template<typename data_t, typename index_t>
-typename std::enable_if<!std::is_same<data_t, float>::value && !std::is_same<data_t, at::Half>::value, void>::type
-index_select_add(const Tensor &select_indices,
-                             const Tensor &add_indices,
-                             const Tensor &src,
-                             Tensor &output,
-                             const Tensor& /*offsets*/,
-                             bool /*include_last_offset*/,
-                             Tensor &bag_size,
-                             index_t padding_idx,
-                             _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) {
+template <typename data_t, typename index_t>
+static typename std::enable_if<std::is_same<data_t, double>::value, void>::type
+index_select_add(
+    const Tensor& select_indices,
+    const Tensor& add_indices,
+    const Tensor& src,
+    Tensor& output,
+    const Tensor& /*offsets*/,
+    bool /*include_last_offset*/,
+    Tensor& bag_size,
+    index_t padding_idx,
+    _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) {
   TORCH_CHECK(select_indices.numel() == add_indices.numel());
   auto* add_indices_data = add_indices.data_ptr<index_t>();
   auto* select_indices_data = select_indices.data_ptr<index_t>();
@@ -184,24 +192,28 @@ void fbgemm_spmdm_report_error_(
 }
 } // namespace
 
-template<typename data_t, typename index_t>
-typename std::enable_if<std::is_same<data_t, at::Half>::value, void>::type
-index_select_add(const Tensor &select_indices,
-                             const Tensor &add_indices,
-                             const Tensor &src,
-                             Tensor &output,
-                             const Tensor& offsets,
-                             bool include_last_offset,
-                             Tensor &bag_size,
-                             index_t padding_idx,
-                             _EmbeddingBagKernelCache* fbgemm_kernel_cache) {
+template <typename data_t, typename index_t>
+typename std::enable_if<
+    std::is_same<data_t, at::Half>::value ||
+        std::is_same<data_t, at::BFloat16>::value,
+    void>::type
+index_select_add(
+    const Tensor& select_indices,
+    const Tensor& add_indices,
+    const Tensor& src,
+    Tensor& output,
+    const Tensor& offsets,
+    bool include_last_offset,
+    Tensor& bag_size,
+    index_t padding_idx,
+    _EmbeddingBagKernelCache* fbgemm_kernel_cache) {
   int64_t ddim = src.size(1);
   auto* select_indices_data = select_indices.data_ptr<index_t>();
-  auto* output_data = output.data_ptr<at::Half>();
+  auto* output_data = output.data_ptr<data_t>();
 
   if (is_fast_path_index_select(src, output, padding_idx)) {
     auto src_contig = src.contiguous();
-    auto* src_data = src_contig.data_ptr<at::Half>();
+    auto* src_data = src_contig.data_ptr<data_t>();
     int64_t output_size = offsets.numel() - 1;
     auto* offsets_data = offsets.data_ptr<index_t>();
     std::vector<index_t> offsets_include_last;
@@ -220,36 +232,31 @@ index_select_add(const Tensor &select_indices,
       offsets_include_last[offsets.numel()] = select_indices.numel();
       offsets_data = offsets_include_last.data();
     }
-
-#ifdef USE_FBGEMM
-    using float16 = uint16_t;
-    auto kernel_fp16_index_t = fbgemm_kernel_cache ?
-      fbgemm_kernel_cache->getCallback</* has_weight */ false, index_t, float16>(ddim) :
-      fbgemm::GenerateEmbeddingSpMDM<float16, index_t, index_t, float16>(
-        /* block_size */ddim,
-        /* has_weight */false,
-        /* normalize_by_lengths */false,
-        /* prefetch */16,
-        /* is_weight_positional */false,
-        /* use_offsets */true
-      );
-#else
-    // Initialize the intermediate output buffer to be 0.
-    Tensor output_fp32 = at::zeros({output_size, ddim}, output.options().dtype(at::kFloat));
-    auto* output_data_fp32 = output_fp32.data_ptr<float>();
-#endif
+#if defined(USE_FBGEMM)
+    bool isbf16 = std::is_same<data_t, at::Half>::value ? false : true;
+    auto kernel_16bit_index_t = fbgemm_kernel_cache
+        ? fbgemm_kernel_cache
+              ->getCallback</* has_weight */ false, index_t, uint16_t>(ddim)
+        : fbgemm::GenerateEmbeddingSpMDM<uint16_t, index_t, index_t, uint16_t>(
+              /* block_size */ ddim,
+              /* has_weight */ false,
+              /* normalize_by_lengths */ false,
+              /* prefetch */ 16,
+              /* is_weight_positional */ false,
+              /* use_offsets */ true,
+              /* isbf16*/ isbf16);
     at::parallel_for(
         0, output_size, 1, [&](index_t start_idx, index_t end_idx) {
-#ifdef USE_FBGEMM
-          bool success = kernel_fp16_index_t(
-            /* output_size */end_idx - start_idx,
-            /* index_size */offsets_data[end_idx] - offsets_data[start_idx],
-            /* data_size */src.size(0),
-            /* input */reinterpret_cast<const float16*>(src_data),
-            /* indices */select_indices_data + offsets_data[start_idx],
-            /* offsets_or_lengths */offsets_data + start_idx,
-            /* weights */nullptr,
-            /* output */reinterpret_cast<float16*>(output_data + start_idx * ddim));
+          bool success = kernel_16bit_index_t(
+              /* output_size */ end_idx - start_idx,
+              /* index_size */ offsets_data[end_idx] - offsets_data[start_idx],
+              /* data_size */ src.size(0),
+              /* input */ reinterpret_cast<const uint16_t*>(src_data),
+              /* indices */ select_indices_data + offsets_data[start_idx],
+              /* offsets_or_lengths */ offsets_data + start_idx,
+              /* weights */ nullptr,
+              /* output */
+              reinterpret_cast<uint16_t*>(output_data + start_idx * ddim));
           if (!success) {
             fbgemm_spmdm_report_error_(
                 end_idx - start_idx,
@@ -258,7 +265,15 @@ index_select_add(const Tensor &select_indices,
                 offsets_data + start_idx,
                 select_indices_data + offsets_data[start_idx]);
           }
+        });
 #else
+    // Initialize the intermediate output buffer to be 0.
+    Tensor output_fp32 = at::zeros({output_size, ddim}, output.options().dtype(at::kFloat));
+    auto* output_data_fp32 = output_fp32.data_ptr<float>();
+    using bVec = vec::Vectorized<BFloat16>;
+    using fVec = vec::Vectorized<float>;
+    at::parallel_for(
+        0, output_size, 1, [&](index_t start_idx, index_t end_idx) {
           caffe2::EmbeddingLookupIdx(
               /*block_size=*/ddim,
               /*output_size=*/end_idx - start_idx,
@@ -271,18 +286,36 @@ index_select_add(const Tensor &select_indices,
               /*scale_bias=*/nullptr,
               /*normalize_by_lengths=*/false,
               /*out=*/output_data_fp32 + start_idx * ddim);
-          for (const auto i : c10::irange(output_size)) {
-            // Convert FP32 intermediate buffer result back to FP16 for output dtype
-            for (const auto d : c10::irange(ddim)) {
-              (output_data + i * ddim)[d] = static_cast<at::Half>((output_data_fp32 + ddim * i)[d]);
+          for (int64_t i = start_idx; i < end_idx; i++) {
+            // Convert FP32 intermediate buffer result back to 16 bit for
+            // output dtype
+            if (std::is_same<data_t, at::Half>::value) {
+              // FP16
+              for (const auto d : c10::irange(ddim)) {
+                (output_data + i * ddim)[d] =
+                    static_cast<data_t>((output_data_fp32 + ddim * i)[d]);
+              }
+            } else {
+              // BF16
+              int64_t d = 0;
+              for (; d < ddim - (ddim % bVec::size()); d += bVec::size()) {
+                fVec temp_fp32_0 = fVec::loadu(output_data_fp32 + ddim * i + d);
+                fVec temp_fp32_1 =
+                    fVec::loadu(output_data_fp32 + ddim * i + d + fVec::size());
+                convert_float_bfloat16(temp_fp32_0, temp_fp32_1)
+                    .store(output_data + i * ddim + d);
+              }
+              for (; d < ddim; d++) {
+                (output_data + i * ddim)[d] =
+                    static_cast<data_t>((output_data_fp32 + ddim * i)[d]);
+              }
             }
           }
-#endif
         });
-
+#endif
   } else {
     TORCH_CHECK(select_indices.numel() == add_indices.numel());
-    auto* src_data = src.data_ptr<at::Half>();
+    auto* src_data = src.data_ptr<data_t>();
     auto* add_indices_data = add_indices.data_ptr<index_t>();
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     index_t* bag_size_data = nullptr;
@@ -300,7 +333,8 @@ index_select_add(const Tensor &select_indices,
     auto* src_data_fp32 = src_fp32.data_ptr<float>();
 
     // Initialize the intermediate output buffer to be 0.
-    Tensor output_fp32 = at::zeros({output.size(0), ddim}, output.options().dtype(at::kFloat));
+    Tensor output_fp32 =
+        at::zeros({output.size(0), ddim}, output.options().dtype(at::kFloat));
     auto* output_data_fp32 = output_fp32.data_ptr<float>();
 
     for (const auto i : c10::irange(numel)) {
@@ -314,11 +348,16 @@ index_select_add(const Tensor &select_indices,
       if (idx != padding_idx) {
         // Copy src_data + src_stride0 * idx to src_data_fp32
         for (const auto d : c10::irange(ddim)) {
-          src_data_fp32[d] = static_cast<float>((src_data + src_stride0 * idx)[d * src_stride1]);
+          src_data_fp32[d] = static_cast<float>(
+              (src_data + src_stride0 * idx)[d * src_stride1]);
         }
-        at::native::cpublas::axpy<float>(ddim, 1,
-                src_data_fp32, 1,
-                output_data_fp32 + ddim * add_indices_data[i], 1);
+        at::native::cpublas::axpy<float>(
+            ddim,
+            1,
+            src_data_fp32,
+            1,
+            output_data_fp32 + ddim * add_indices_data[i],
+            1);
 
       } else if (bag_size.defined()) {
         // Decrement bag_size to reflect that the index is padded
@@ -327,14 +366,15 @@ index_select_add(const Tensor &select_indices,
       }
     }
     for (const auto i : c10::irange(output.size(0))) {
-      // Convert FP32 intermediate buffer result back to FP16 for output dtype
+      // Convert FP32 intermediate buffer result back to 16 bit for output
+      // dtype
       for (const auto d : c10::irange(ddim)) {
-        (output_data + output_stride0 * i)[d * output_stride1] = static_cast<at::Half>((output_data_fp32 + ddim * i)[d]);
+        (output_data + output_stride0 * i)[d * output_stride1] =
+            static_cast<data_t>((output_data_fp32 + ddim * i)[d]);
       }
     }
   }
 }
-
 template<typename data_t, typename index_t>
 typename std::enable_if<std::is_same<data_t, float>::value, void>::type
 index_select_add(const Tensor &select_indices,
@@ -464,18 +504,19 @@ index_select_add(const Tensor &select_indices,
 // index_select (using select_indices as the index)
 // mul (scaling by per_sample_weights)
 // index_add (using add_indices as the index)
-template<typename data_t, typename index_t>
-static typename std::enable_if<!std::is_same<data_t, float>::value && !std::is_same<data_t, at::Half>::value, void>::type
-index_select_scale_add(const Tensor &select_indices,
-                                   const Tensor &add_indices,
-                                   const Tensor &scale,
-                                   const Tensor &src,
-                                   Tensor &output,
-                                   const Tensor& /*offsets*/,
-                                   bool /*include_last_offset*/,
-                                   Tensor &bag_size,
-                                   index_t padding_idx,
-                                  _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) {
+template <typename data_t, typename index_t>
+static typename std::enable_if<std::is_same<data_t, double>::value, void>::type
+index_select_scale_add(
+    const Tensor& select_indices,
+    const Tensor& add_indices,
+    const Tensor& scale,
+    const Tensor& src,
+    Tensor& output,
+    const Tensor& /*offsets*/,
+    bool /*include_last_offset*/,
+    Tensor& bag_size,
+    index_t padding_idx,
+    _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) {
   AT_ASSERT(select_indices.numel() == add_indices.numel());
   auto* add_indices_data = add_indices.data_ptr<index_t>();
   auto* select_indices_data = select_indices.data_ptr<index_t>();
@@ -520,26 +561,30 @@ index_select_scale_add(const Tensor &select_indices,
   }
 }
 
-template<typename data_t, typename index_t>
-typename std::enable_if<std::is_same<data_t, at::Half>::value, void>::type
-index_select_scale_add(const Tensor &select_indices,
-                       const Tensor &add_indices,
-                       const Tensor &scale,
-                       const Tensor &src,
-                       Tensor &output,
-                       const Tensor& offsets,
-                       bool include_last_offset,
-                       Tensor &bag_size,
-                       index_t padding_idx,
-                       _EmbeddingBagKernelCache* fbgemm_kernel_cache) {
+template <typename data_t, typename index_t>
+typename std::enable_if<
+    std::is_same<data_t, at::Half>::value ||
+        std::is_same<data_t, at::BFloat16>::value,
+    void>::type
+index_select_scale_add(
+    const Tensor& select_indices,
+    const Tensor& add_indices,
+    const Tensor& scale,
+    const Tensor& src,
+    Tensor& output,
+    const Tensor& offsets,
+    bool include_last_offset,
+    Tensor& bag_size,
+    index_t padding_idx,
+    _EmbeddingBagKernelCache* fbgemm_kernel_cache) {
   int64_t ddim = src.size(1);
-  auto* scale_data = scale.data_ptr<at::Half>();
+  auto* scale_data = scale.data_ptr<data_t>();
   auto* select_indices_data = select_indices.data_ptr<index_t>();
-  auto* output_data = output.data_ptr<at::Half>();
+  auto* output_data = output.data_ptr<data_t>();
 
   if (is_fast_path_index_select_scale(src, scale, output, padding_idx)) {
     auto src_contig = src.contiguous();
-    auto* src_data = src_contig.data_ptr<at::Half>();
+    auto* src_data = src_contig.data_ptr<data_t>();
     int64_t output_size = offsets.numel() - 1;
     auto* offsets_data = offsets.data_ptr<index_t>();
     std::vector<index_t> offsets_include_last;
@@ -560,40 +605,42 @@ index_select_scale_add(const Tensor &select_indices,
     Tensor scale_fp32 = at::empty(scale.sizes(), scale.options().dtype(at::kFloat));
     auto* scale_data_fp32 = scale_fp32.data_ptr<float>();
 
-#ifdef USE_FBGEMM
-    using float16 = uint16_t;
-    fbgemm::Float16ToFloat_simd(reinterpret_cast<const float16*>(scale_data), scale_data_fp32, scale_fp32.numel());
-    auto kernel_fp16_index_t =
-      fbgemm_kernel_cache ?
-      fbgemm_kernel_cache->getCallback</* has_weight */ true, index_t, float16>(ddim) :
-      fbgemm::GenerateEmbeddingSpMDM<float16, index_t, index_t, float16>(
-        /* block_size */ddim,
-        /* has_weight */true,
-        /* normalize_by_lengths */false,
-        /* prefetch */16,
-        /* is_weight_positional */false,
-        /* use_offsets */true
-      );
-#else
-    // Initialize the intermediate output buffer to be 0.
-    Tensor output_fp32 = at::zeros({output_size, ddim}, output.options().dtype(at::kFloat));
-    auto* output_data_fp32 = output_fp32.data_ptr<float>();
-    for (const auto i : c10::irange(scale.numel())) {
-      scale_data_fp32[i] = static_cast<float>(scale_data[i]);
+#if defined(USE_FBGEMM)
+    bool isbf16 = std::is_same<data_t, at::Half>::value ? false : true;
+    if (isbf16) {
+      fbgemm::Bfloat16ToFloat_simd(
+          reinterpret_cast<const fbgemm::bfloat16*>(scale_data),
+          scale_data_fp32,
+          scale_fp32.numel());
+    } else {
+      fbgemm::Float16ToFloat_simd(
+          reinterpret_cast<const fbgemm::float16*>(scale_data),
+          scale_data_fp32,
+          scale_fp32.numel());
     }
-#endif
+    auto kernel_16bit_index_t = fbgemm_kernel_cache
+        ? fbgemm_kernel_cache
+              ->getCallback</* has_weight */ true, index_t, uint16_t>(ddim)
+        : fbgemm::GenerateEmbeddingSpMDM<uint16_t, index_t, index_t, uint16_t>(
+              /* block_size */ ddim,
+              /* has_weight */ true,
+              /* normalize_by_lengths */ false,
+              /* prefetch */ 16,
+              /* is_weight_positional */ false,
+              /* use_offsets */ true,
+              /* isbf16*/ isbf16);
     at::parallel_for(
         0, output_size, 1, [&](index_t start_idx, index_t end_idx) {
-#ifdef USE_FBGEMM
-          bool success = kernel_fp16_index_t(
-            /* output_size */end_idx - start_idx,
-            /* index_size */offsets_data[end_idx] - offsets_data[start_idx],
-            /* data_size */src.size(0),
-            /* input */reinterpret_cast<const float16*>(src_data),
-            /* indices */select_indices_data + offsets_data[start_idx],
-            /* offsets_or_lengths */offsets_data + start_idx,
-            /* weights */scale_data_fp32 + offsets_data[start_idx],
-            /* output */reinterpret_cast<float16*>(output_data + start_idx * ddim));
+          bool success = kernel_16bit_index_t(
+              /* output_size */ end_idx - start_idx,
+              /* index_size */ offsets_data[end_idx] - offsets_data[start_idx],
+              /* data_size */ src.size(0),
+              /* input */ reinterpret_cast<const uint16_t*>(src_data),
+              /* indices */ select_indices_data + offsets_data[start_idx],
+              /* offsets_or_lengths */ offsets_data + start_idx,
+              /* weights */ scale_data_fp32 + offsets_data[start_idx],
+              /* output */
+              reinterpret_cast<uint16_t*>(output_data + start_idx * ddim));
           if (!success) {
             fbgemm_spmdm_report_error_(
                 end_idx - start_idx,
@@ -602,7 +649,19 @@ index_select_scale_add(const Tensor &select_indices,
                 offsets_data + start_idx,
                 select_indices_data + offsets_data[start_idx]);
           }
+        });
 #else
+    // Initialize the intermediate output buffer to be 0.
+    Tensor output_fp32 =
+        at::zeros({output_size, ddim}, output.options().dtype(at::kFloat));
+    auto* output_data_fp32 = output_fp32.data_ptr<float>();
+    for (const auto i : c10::irange(scale.numel())) {
+      scale_data_fp32[i] = static_cast<float>(scale_data[i]);
+    }
+    using bVec = vec::Vectorized<BFloat16>;
+    using fVec = vec::Vectorized<float>;
+    at::parallel_for(
+        0, output_size, 1, [&](index_t start_idx, index_t end_idx) {
           caffe2::EmbeddingLookupIdx(
               /*block_size=*/ddim,
               /*output_size=*/end_idx - start_idx,
@@ -615,17 +674,36 @@ index_select_scale_add(const Tensor &select_indices,
               /*scale_bias=*/nullptr,
               /*normalize_by_lengths=*/false,
               /*out=*/output_data_fp32 + start_idx * ddim);
-          for (const auto i : c10::irange(output_size)) {
-            // Convert FP32 intermediate buffer result back to FP16 for output dtype
-            for (const auto d : c10::irange(ddim)) {
-              (output_data + i * ddim)[d] = static_cast<at::Half>((output_data_fp32 + ddim * i)[d]);
+          for (int64_t i = start_idx; i < end_idx; i++) {
+            // Convert FP32 intermediate buffer result back to 16 bit for
+            // output dtype
+            if (std::is_same<data_t, at::Half>::value) {
+              // FP16
+              for (const auto d : c10::irange(ddim)) {
+                (output_data + i * ddim)[d] =
+                    static_cast<data_t>((output_data_fp32 + ddim * i)[d]);
+              }
+            } else {
+              // BF16
+              int64_t d = 0;
+              for (; d < ddim - (ddim % bVec::size()); d += bVec::size()) {
+                fVec temp_fp32_0 = fVec::loadu(output_data_fp32 + ddim * i + d);
+                fVec temp_fp32_1 =
+                    fVec::loadu(output_data_fp32 + ddim * i + d + fVec::size());
+                convert_float_bfloat16(temp_fp32_0, temp_fp32_1)
+                    .store(output_data + i * ddim + d);
+              }
+              for (; d < ddim; d++) {
+                (output_data + i * ddim)[d] =
+                    static_cast<data_t>((output_data_fp32 + ddim * i)[d]);
+              }
             }
           }
-#endif
         });
+#endif
   } else {
     AT_ASSERT(select_indices.numel() == add_indices.numel());
-    auto* src_data = src.data_ptr<at::Half>();
+    auto* src_data = src.data_ptr<data_t>();
     auto* add_indices_data = add_indices.data_ptr<index_t>();
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     index_t* bag_size_data = nullptr;
@@ -641,7 +719,8 @@ index_select_scale_add(const Tensor &select_indices,
     auto numel = add_indices.numel();
 
     // Initialize the intermediate output buffer to be 0.
-    Tensor output_fp32 = at::zeros({output.size(0), ddim}, output.options().dtype(at::kFloat));
+    Tensor output_fp32 =
+        at::zeros({output.size(0), ddim}, output.options().dtype(at::kFloat));
     auto* output_data_fp32 = output_fp32.data_ptr<float>();
 
     for (const auto i : c10::irange(numel)) {
@@ -653,12 +732,12 @@ index_select_scale_add(const Tensor &select_indices,
           "embedding_bag: Expected idx >= 0 && idx < num_embeddings but found idx to be ",
           idx);
       if (idx != padding_idx) {
-
         auto* src_base = src_data + src_stride0 * idx;
         auto* output_base_fp32 = output_data_fp32 + ddim * add_indices_data[i];
         auto scale = scale_data[i * scale_stride];
         for (const auto j : c10::irange(ddim)) {
-          output_base_fp32[j] += static_cast<float>(src_base[j * src_stride1]) * static_cast<float>(scale);
+          output_base_fp32[j] += static_cast<float>(src_base[j * src_stride1]) *
+              static_cast<float>(scale);
         }
       } else if (bag_size.defined()) {
         // Decrement bag_size to reflect that the index is padded
@@ -667,14 +746,15 @@ index_select_scale_add(const Tensor &select_indices,
       }
     }
     for (const auto i : c10::irange(output.size(0))) {
-      // Convert FP32 intermediate buffer result back to FP16 for output dtype
+      // Convert FP32 intermediate buffer result back to 16 bit for output
+      // dtype
       for (const auto d : c10::irange(ddim)) {
-        (output_data + output_stride0 * i)[d * output_stride1] = static_cast<at::Half>((output_data_fp32 + ddim * i)[d]);
+        (output_data + output_stride0 * i)[d * output_stride1] =
+            static_cast<data_t>((output_data_fp32 + ddim * i)[d]);
       }
     }
   }
 }
-
 template<typename data_t, typename index_t>
 typename std::enable_if<std::is_same<data_t, float>::value, void>::type
 index_select_scale_add(const Tensor &select_indices,
@@ -817,7 +897,8 @@ void check_arguments(
   checkScalarTypes("embedding_bag", offsets_arg, {kLong, kInt});
   checkSameType("embedding_bag", indices_arg, offsets_arg);
   auto weight_arg = TensorArg(weight, "weight", 1);
-  checkScalarTypes("embedding_bag", weight_arg, {kHalf, kFloat, kDouble});
+  checkScalarTypes(
+      "embedding_bag", weight_arg, {kHalf, kBFloat16, kFloat, kDouble});
 
   AT_DISPATCH_INDEX_TYPES(offsets.scalar_type(), "_embedding_bag_cpu_impl", [&]() {
     if (offsets.size(0) > 0) {
@@ -1086,12 +1167,22 @@ void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag,
       max_indices->copy_(bag_size);
     }
   } else { // MODE_MAX
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      weight.scalar_type(), "embedding_bag_cpu_max_out", [&]() {
-        embedding_bag_cpu_max_out<scalar_t>(
-          max_indices, weight, indices, offset2bag, output, include_last_offset, bag_size, padding_idx);
-      }
-    );
+    AT_DISPATCH_FLOATING_TYPES_AND2(
+        at::ScalarType::Half,
+        at::ScalarType::BFloat16,
+        weight.scalar_type(),
+        "embedding_bag_cpu_max_out",
+        [&]() {
+          embedding_bag_cpu_max_out<scalar_t>(
+              max_indices,
+              weight,
+              indices,
+              offset2bag,
+              output,
+              include_last_offset,
+              bag_size,
+              padding_idx);
+        });
   }
 }
 
@@ -1114,6 +1205,9 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_cpu_impl(
         "offsets has to be a 1D Tensor, but got Tensor of dimension ",
         offsets_.dim());
   }
+  TORCH_CHECK(weight.dim() == 2,
+      "weight has to be a 2D Tensor, but got Tensor of dimension ",
+      weight.dim());
   Tensor indices, offsets;
   std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_);
   check_arguments(weight, indices, offsets, mode, per_sample_weights, include_last_offset);
@@ -1521,7 +1615,8 @@ Tensor _embedding_bag_dense_backward_cpu(const Tensor &grad_, const Tensor &indi
   // for more details.
   auto grad = grad_.contiguous();
   auto grad_arg = TensorArg(grad, "grad_", 1);
-  checkScalarTypes("embedding_bag", grad_arg, {kHalf, kFloat, kDouble});
+  checkScalarTypes(
+      "embedding_bag", grad_arg, {kHalf, kBFloat16, kFloat, kDouble});
 
   if (mode == MODE_MAX) {
     return _embedding_bag_dense_backward_cpu_max(
diff --git a/aten/src/ATen/native/EmbeddingBag.h b/aten/src/ATen/native/EmbeddingBag.h
index 9d44fa688b2b..8ba7abe706c3 100644
--- a/aten/src/ATen/native/EmbeddingBag.h
+++ b/aten/src/ATen/native/EmbeddingBag.h
@@ -98,14 +98,14 @@ struct _EmbeddingBagKernelCacheImpl : private StorageMixins... {
 // instantiate the cache with the list of storage mixins
 // for each of the 8 _EmbeddingBagKernelCache* usages in the EmbeddingBag.cpp impl file
 using _EmbeddingBagKernelCache = _EmbeddingBagKernelCacheImpl<
-      _CallbackAndBlockSize<true, int32_t, float>,
-      _CallbackAndBlockSize<false, int32_t, float>,
-      _CallbackAndBlockSize<true, int64_t, float>,
-      _CallbackAndBlockSize<false, int64_t, float>,
-      _CallbackAndBlockSize<true, int32_t, unsigned short>,
-      _CallbackAndBlockSize<false, int32_t, unsigned short>,
-      _CallbackAndBlockSize<true, int64_t, unsigned short>,
-      _CallbackAndBlockSize<false, int64_t, unsigned short>>;
+    _CallbackAndBlockSize<true, int32_t, float>,
+    _CallbackAndBlockSize<false, int32_t, float>,
+    _CallbackAndBlockSize<true, int64_t, float>,
+    _CallbackAndBlockSize<false, int64_t, float>,
+    _CallbackAndBlockSize<true, int32_t, unsigned short>,
+    _CallbackAndBlockSize<false, int32_t, unsigned short>,
+    _CallbackAndBlockSize<true, int64_t, unsigned short>,
+    _CallbackAndBlockSize<false, int64_t, unsigned short>>;
 #else
 struct _EmbeddingBagKernelCache {
     explicit _EmbeddingBagKernelCache(c10::optional<int64_t> /* maybe_block_size */) {}
diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp
index bca5f3e6b389..30c25875971b 100644
--- a/aten/src/ATen/native/ForeachOpsKernels.cpp
+++ b/aten/src/ATen/native/ForeachOpsKernels.cpp
@@ -1,3 +1,4 @@
+#include <vector>
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/core/Tensor.h>
 #include <ATen/native/ForeachUtils.h>
@@ -49,10 +50,12 @@
 #include <ATen/ops/_foreach_zero_native.h>
 #include <ATen/ops/_foreach_clamp_min_native.h>
 #include <ATen/ops/_foreach_clamp_max_native.h>
+#include <ATen/ops/_foreach_pow_native.h>
 #include <ATen/ops/linalg_vector_norm.h>
 #include <ATen/ops/maximum.h>
 #include <ATen/ops/minimum.h>
 #include <ATen/ops/zeros_like_ops.h>
+#include <ATen/ops/pow.h>
 #endif
 
 namespace at { namespace native {
@@ -234,6 +237,7 @@ FOREACH_BINARY_OP_SCALAR(mul);
 FOREACH_BINARY_OP_SCALAR(div);
 FOREACH_BINARY_OP_SCALAR(clamp_min);
 FOREACH_BINARY_OP_SCALAR(clamp_max);
+FOREACH_BINARY_OP_SCALAR(pow);
 
 FOREACH_BINARY_OP_SCALARLIST(add);
 FOREACH_BINARY_OP_SCALARLIST(sub);
@@ -241,11 +245,13 @@ FOREACH_BINARY_OP_SCALARLIST(mul);
 FOREACH_BINARY_OP_SCALARLIST(div);
 FOREACH_BINARY_OP_SCALARLIST(clamp_min);
 FOREACH_BINARY_OP_SCALARLIST(clamp_max);
+FOREACH_BINARY_OP_SCALARLIST(pow);
 
 FOREACH_BINARY_OP_LIST(mul);
 FOREACH_BINARY_OP_LIST(div);
 FOREACH_BINARY_OP_LIST(clamp_min);
 FOREACH_BINARY_OP_LIST(clamp_max);
+FOREACH_BINARY_OP_LIST(pow);
 
 FOREACH_UNARY_OP(sqrt);
 FOREACH_UNARY_OP(exp);
@@ -321,4 +327,14 @@ std::vector<Tensor> foreach_tensor_norm_slow(TensorList tensors, const Scalar& o
   return result;
 }
 
+std::vector<Tensor> foreach_scalar_pow_list_kernel_slow(const Scalar& self, TensorList exponent) {
+  check_foreach_api_restrictions(exponent);
+  std::vector<Tensor> result;
+  result.reserve(exponent.size());
+  for (const auto & t : exponent) {
+    result.emplace_back(at::pow(self, t));
+  }
+  return result;
+}
+
 }} // namespace at::native
diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h
index 0166d040863c..6daf046623fe 100644
--- a/aten/src/ATen/native/ForeachUtils.h
+++ b/aten/src/ATen/native/ForeachUtils.h
@@ -29,7 +29,7 @@ bool has_bool_tensor(TensorList tensors) {
 // - All TensorLists and ScalarLists must have the same number of elements.
 // - Corresponding tensors must have the same size.
 void check_foreach_api_restrictions(TensorList tensors) {
-  TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor.");
+  TORCH_CHECK(!tensors.empty(), "Tensor list must have at least one tensor.");
 }
 
 void check_foreach_api_restrictions(TensorList tensors, ArrayRef<Scalar> scalars) {
@@ -38,15 +38,15 @@ void check_foreach_api_restrictions(TensorList tensors, ArrayRef<Scalar> scalars
 }
 
 void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2) {
-  TORCH_CHECK(tensors1.size() > 0, "Tensor list must have at least one tensor.");
-  TORCH_CHECK(tensors2.size() > 0, "Tensor list must have at least one tensor.");
+  TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor.");
+  TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor.");
   TORCH_CHECK(tensors1.size() == tensors2.size(), "Tensor lists must have the same number of tensors, got ", tensors1.size(), " and ", tensors2.size());
 }
 
 void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2, TensorList tensors3) {
-  TORCH_CHECK(tensors1.size() > 0, "Tensor list must have at least one tensor.");
-  TORCH_CHECK(tensors2.size() > 0, "Tensor list must have at least one tensor.");
-  TORCH_CHECK(tensors3.size() > 0, "Tensor list must have at least one tensor.");
+  TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor.");
+  TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor.");
+  TORCH_CHECK(!tensors3.empty(), "Tensor list must have at least one tensor.");
   TORCH_CHECK(tensors1.size() == tensors2.size(), "Tensor lists must have the same number of tensors, got ", tensors1.size(), " and ", tensors2.size());
   TORCH_CHECK(tensors1.size() == tensors3.size(), "Tensor lists must have the same number of tensors, got ", tensors1.size(), " and ", tensors3.size());
 }
@@ -110,7 +110,7 @@ bool check_fast_path_restrictions(
           return false;
         }
       }
-      if (scalarList.size() > 0) {
+      if (!scalarList.empty()) {
         const auto& scalar = scalarList.size() == 1 ? scalarList[0] : scalarList[i];
         const auto& tensor = tensorLists[0][i];
         // note(mkozuki): This check might be responsible for `_foreach_add(bool_tensors, bool_tensors)`
diff --git a/aten/src/ATen/native/FunctionOfAMatrixUtils.cpp b/aten/src/ATen/native/FunctionOfAMatrixUtils.cpp
index d31789051104..28abc812f4be 100644
--- a/aten/src/ATen/native/FunctionOfAMatrixUtils.cpp
+++ b/aten/src/ATen/native/FunctionOfAMatrixUtils.cpp
@@ -26,6 +26,7 @@ DEFINE_DISPATCH(_compute_linear_combination_stub);
 // Note: if input.dtype == scalar_t<T>, then coefficients.dtype == T.
 // This is relevant when scalar_t<T> == complex<T>.
 Tensor _compute_linear_combination(const Tensor& input, const Tensor& coefficients) {
+  TORCH_CHECK(input.ndimension() > 0 && input.numel() > 0, "Empty tensor not supported");
   auto output_first_dim_size = coefficients.size(0);
 
   auto output_sizes = input.sizes().vec();
@@ -55,7 +56,7 @@ Tensor& _compute_linear_combination_out(const Tensor& input, const Tensor& coeff
   // output.sizes() = [m, 1 (instead of n), ...].
   // The second dimension in newly restrided Tensors is traversed inside the kernels.
   // This is done to avoid synchronizations/atomic operations in the kernels
-  // and also quarantees determinism, required by the autograd.
+  // and also guarantees determinism, required by the autograd.
 
   // restride output
   auto output_to_broadcasted_dim = output.unsqueeze(1);
diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp
index e4be04dbcf47..6a1cabfa8b9d 100644
--- a/aten/src/ATen/native/Linear.cpp
+++ b/aten/src/ATen/native/Linear.cpp
@@ -102,7 +102,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
   // assumes that tensors have been pre-unsqueezed (so that all dimensions match - after broadcasting)
   // but makes no other assumptions on the order of dimensions
   TORCH_CHECK(left_.dim()==right_.dim(), "number of dimensions must match");
-  if (sum_dims_.size() == 0)
+  if (sum_dims_.empty())
     return at::mul(left_, right_);
   int64_t dim = left_.dim();
   auto sum_dims = at::dim_list_to_bitset(sum_dims_, dim);
diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp
index c99c0dae63ca..804b91705306 100644
--- a/aten/src/ATen/native/LinearAlgebra.cpp
+++ b/aten/src/ATen/native/LinearAlgebra.cpp
@@ -17,6 +17,7 @@
 #include <ATen/TensorIterator.h>
 #include <ATen/TensorOperators.h>
 #include <ATen/TensorUtils.h>
+#include <ATen/TensorSubclassLikeUtils.h>
 #include <c10/util/accumulate.h>
 #include <c10/util/irange.h>
 #include <c10/util/variant.h>
@@ -198,7 +199,7 @@ TORCH_META_FUNC(linalg_vector_norm)(const Tensor& self, const Scalar& scalar_ord
   //   - We cannot reduce over an empty dimension
   if (self.numel() == 0 && (ord < 0. || ord == INFINITY)) {
     // dim=None or dim=() reduces the whole tensor
-    TORCH_CHECK(opt_dim.has_value() && opt_dim->size() != 0,
+    TORCH_CHECK(opt_dim.has_value() && !opt_dim->empty(),
       "linalg.vector_norm cannot compute the ", scalar_ord, " norm on an empty ",
       "tensor because the operation does not have an identity");
     for (auto dim_num : dim) {
@@ -735,6 +736,11 @@ Tensor& matrix_rank_impl(
 
   Tensor tol = at::max(atol.unsqueeze(-1), rtol.unsqueeze(-1) * max_S);
 
+  if (isTensorSubclassLike(input)) {
+     result = at::sum(S > tol, /*dim=*/-1);
+     return result;
+  }
+
   result = at::sum_out(result, S > tol, /*dim=*/-1);
   return result;
 }
@@ -1078,7 +1084,7 @@ Tensor chain_matmul(TensorList matrices) {
   checkAllSameDim(matrices, 2);
 
   TORCH_CHECK(
-      matrices.size() > 0, "chain_matmul(): Expected one or more matrices");
+      !matrices.empty(), "chain_matmul(): Expected one or more matrices");
 
   if (matrices.size() == 1) {
     return matrices[0].clone();
@@ -1096,7 +1102,7 @@ Tensor& chain_matmul_out(TensorList matrices, Tensor& result) {
   checkAllSameDim(matrices, 2);
 
   TORCH_CHECK(
-      matrices.size() > 0, "chain_matmul(): Expected one or more matrices");
+      !matrices.empty(), "chain_matmul(): Expected one or more matrices");
 
   if (matrices.size() == 1) {
     at::native::resize_output(result, matrices[0].sizes());
@@ -1791,30 +1797,62 @@ Tensor& vdot_out(const Tensor& self, const Tensor& other, Tensor& result) {
   return result.fill_(self.vdot(other));
 }
 
-bool should_fold(const Tensor& tensor1, const int64_t dim_tensor2) {
-  const auto dim_tensor1 = tensor1.dim();
-  if (dim_tensor1 >= 3 && (dim_tensor2 == 1 || dim_tensor2 == 2)) {
-    const auto t1_sizes_ptr = tensor1.sizes().cbegin();
-    const auto t1_strides = tensor1.strides();
-    if (dim_tensor1 == 3 && dim_tensor2 == 2 &&
-        t1_strides.back() != 1 &&
-        t1_strides.front() == t1_sizes_ptr[1] * t1_sizes_ptr[2]) {
-      // First dim is slowest moving, and then the following two dims are
-      // transposed. This can happen for example by permute(0, 2, 1).
-      // First 2 dims could be folded to use mm but would require permutation
-      // with actual data movement, which can be instead handled by BMM with each
-      // GEMM transposed.
-      // This can be generalized to a tensor with dim X + Y + Z where X, Y, and Z
-      // dims are contiguous, Y dims and Z dims are transposed, and X, Y, Z > 0.
-      // For example, this can happen by permute(0, 1, 5, 2, 3, 4), where X = 2,
-      // Y = 3, and Z = 1.
+bool should_fold(const Tensor& tensor1, const Tensor& tensor2) {
+  // We check that we can fold the larger tensor into a matrix and dispatch to mm or mv rather than
+  // to bmm. We want to make sure we can do so without incurring in any extra copy
+  const auto tensor1_larger = tensor1.dim() >= tensor2.dim();
+
+  // We order the tensors. t1 will be the larger tensor
+  // We can always transpose tensor2 as the dimensions are always >= 1 (precondition from matmul)
+  // and tensor1_larger iff tensor2.dim() > tensor1.dim(9
+  const auto t1 = tensor1_larger ? MaybeOwned<Tensor>::borrowed(tensor1)
+                                 : MaybeOwned<Tensor>::owned(tensor2.mT());
+  const int64_t dim_t1 = t1->dim();
+  const auto dim_t2 = tensor1_larger ? tensor2.dim()
+                                     : tensor1.dim();
+
+  // Just fold for dim_t1 >= 3 and (dim_t2 == 1 || dim_t2 == 2)
+  if (!(dim_t1 >= 3 && dim_t2 <= 2)) {
+    return false;
+  }
+
+  // In this case we *do* incur in an extra copy to avoid creating an unnecessary large tensor in the backward
+  // Suppose we don't fold here. Let t1.shape = [b, m, n] t2.shape = [n, k] like in a transformer
+  // t2 will be expanded to a tensor of shape [b, n, k] and then we do t1.bmm(t2_expanded)
+  // The issue appears in the backward.
+  // The output gradient g of this operation would have shape [b, m, k]
+  // The backward wrt. t2 of bmm would be given by t1.mH @ g, which has shape [b, n, k]
+  // Then, the backward of expand is simply `sum(0)`. As such, we are instantiating a tensor
+  // of shape [b, n, k] unnacessarily, which may cause a large memory footprint, and in the
+  // worst case, an OOM
+  bool t2_requires_grad = tensor1_larger ? tensor2.requires_grad() : tensor1.requires_grad();
+  if (t2_requires_grad) {
+    return true;
+  }
+
+  // Don't fold in this case, as we would have to call mm on the transposed tensor, the result
+  // would be contiguous, and then we would need to transpose it and call contiguous on it, thus
+  // having to copy the tensor
+  if (tensor1.dim() == 2) {
+    return false;
+  }
+
+  // Can always fold if the tensor is empty
+  // This serves as a precondition for the code below
+  if (t1->numel() == 0) {
+    return true;
+  }
+
+  // t1->view(-1, t1->size(-1)) does not copy only when the first n-1 dimensions are contiguous
+  // in the sense that t1_stride[i] = t1_stride[i+1]*t1_shape[i+1]
+  const auto t1_shape = t1->sizes();
+  const auto t1_strides = t1->strides();
+  for (auto i = int64_t{0}; i < dim_t1 - int64_t{2}; ++i) {
+    if (t1_strides[i] != t1_strides[i+1] * t1_shape[i+1]) {
       return false;
-    } else {
-      return true;
     }
-  } else {
-    return false;
   }
+  return true;
 }
 
 /*
@@ -1856,10 +1894,12 @@ Tensor _matmul_impl(
                    : tensor1.unsqueeze(0).mm(tensor2).squeeze_(0);
   } else if (dim_tensor1 == 2 && dim_tensor2 == 2) {
     return has_out ? at::mm_out(out, tensor1, tensor2) : tensor1.mm(tensor2);
-  } else if (should_fold(tensor1, dim_tensor2) || should_fold(tensor2, dim_tensor1)) {
+  } else if (should_fold(tensor1, tensor2)) {
     // dim_tensor1 >=3 && (dim_tensor2 == 1 || dim_tensor2 == 2) ||
     // dim_tensor2 >=3 && (dim_tensor1 == 1 || dim_tensor1 == 2)
-    // and some condition on the strides is fulfilled
+    // and at least one of the following two conditions hold
+    // - the small tensor requires grad (see should_fold for the why)
+    // - we can fold the larger tensor t1 into a matrix as t1.view(-1, t1.size(-1)) without copying
 
     // optimization: use mm instead of bmm by folding the batch of the larger tensor
     // into its leading matrix dimension
@@ -1885,41 +1925,38 @@ Tensor _matmul_impl(
     if (t2_is_matrix) {
       output_shape.push_back(t2->sizes()[1]);
     }
+    // This will almost always be a view.
+    // It may not be a view if t2->requires_grad(). See should_fold for an explanation
     const auto t1_folded = t1->reshape({folded_dim1, sizes_1.back()});
     if (!has_out) {
       if (t2_is_matrix) {
-        // FIXME This path always does an unnecessary copy when transpose == true as the returned
-        // result from BLAS is already C-transposed
         const auto output = at::_unsafe_view(t1_folded.mm(*t2), output_shape);
+        // This copies if we perform a 2D @ 3D and the first tensor requires_grad
+        // See should_fold for why.
+        // If mm_out were differentiable, we could use it here, and pass a result with the
+        // correct strides to avoid this unnecessary copy.
         return transpose ? output.mT().contiguous() : output;
       } else {
         return at::_unsafe_view(t1_folded.mv(*t2), output_shape);
       }
     } else {
+      // See the !has_out branch for an explanation
+      TORCH_INTERNAL_ASSERT(!(transpose && t2_is_matrix));
+
       // Resize output into the correct shape
-      const auto transpose_out = transpose && t2_is_matrix;
-      if (transpose_out) {
-        // Swap last two elements of output_shape
-        std::iter_swap(output_shape.end() - 2, output_shape.end() - 1);
-        at::native::resize_output(out, output_shape);
-        std::iter_swap(output_shape.end() - 2, output_shape.end() - 1);
-      } else {
-        at::native::resize_output(out, output_shape);
-      }
-      const auto out_ = transpose_out ? c10::MaybeOwned<Tensor>::owned(out.mT())
-                                      : c10::MaybeOwned<Tensor>::borrowed(out);
+      at::native::resize_output(out, output_shape);
 
       // We then reshape the output to the expected shape and call mm/mv
       // and transpose back if necessary
-      auto reshaped_out = t2_is_matrix ? out_->reshape({folded_dim1, t2->sizes().back()})
-                                       : out_->reshape({folded_dim1});
+      auto reshaped_out = t2_is_matrix ? out.reshape({folded_dim1, t2->sizes().back()})
+                                       : out.reshape({folded_dim1});
       if (t2_is_matrix) {
         at::mm_out(reshaped_out, t1_folded, *t2);
       } else {
         at::mv_out(reshaped_out, t1_folded, *t2);
       }
       if (!reshaped_out.is_alias_of(out)) {
-        out_->copy_(reshaped_out.view_as(*out_));
+        out.copy_(reshaped_out);
       }
       return out;
     }
@@ -1928,28 +1965,51 @@ Tensor _matmul_impl(
     // We track m1 vs m2 separately even though they must match for nicer error messages
     const int64_t n = dim_tensor1 > 1 ? tensor1.sizes().cend()[-2] : 1LL;
     const int64_t m1 = tensor1.sizes().back();
-    const IntArrayRef batch_tensor1(tensor1.sizes().data(),
-                                    std::max<int64_t>(dim_tensor1 - 2, 0LL));
-    const int64_t m2 = dim_tensor2 > 1 ? tensor2.sizes().cend()[-2] : tensor2.sizes().back();
+    auto batch_tensor1 = tensor1.sizes().slice(0, std::max<int64_t>(dim_tensor1 - 2, 0LL));
+    const int64_t m2 = dim_tensor2 > 1 ? tensor2.sizes().cend()[-2] : tensor2.sizes().front();
     const int64_t p = dim_tensor2 > 1 ? tensor2.sizes().back() : 1LL;
     const IntArrayRef batch_tensor2(tensor2.sizes().data(),
                                     std::max<int64_t>(dim_tensor2 - 2, 0LL));
-    auto output_shape = infer_size_dimvector(batch_tensor1, batch_tensor2);
 
-    const auto tensor1_expand_size = [&output_shape, n, m1]{ DimVector ret(output_shape);
-                                                             ret.append({n, m1});
-                                                             return ret; }();
-    const auto tensor2_expand_size = [&output_shape, m2, p]{ DimVector ret(output_shape);
-                                                             ret.append({m2, p});
-                                                             return ret; }();
+    // Same optimization for the gradients as that in should_fold
+    // If we're going to broadcast we force it to go through the should_fold branch
+    if (dim_tensor1 == 3 && dim_tensor2 == 3 && batch_tensor1[0] != batch_tensor2[0]) {
+      if (batch_tensor1[0] == 1 && (tensor1.requires_grad() || isTensorSubclassLike(tensor1))) {
+        return _matmul_impl(out, tensor1.squeeze(0), tensor2);
+      }
+      if (batch_tensor2[0] == 1 && (tensor2.requires_grad() || isTensorSubclassLike(tensor2))) {
+        return _matmul_impl(out, tensor1, tensor2.squeeze(0));
+      }
+    }
 
+    auto output_shape = infer_size_dimvector(batch_tensor1, batch_tensor2);
     const int64_t expand_batch_product = c10::multiply_integers(output_shape);
 
     // flatten expanded batches
+    const auto tensor1_expand_size = [&output_shape, n, m1]{ DimVector ret(output_shape);
+                                                             ret.append({n, m1});
+                                                             return ret; }();
     const auto tensor1_expanded = tensor1.expand(tensor1_expand_size)
                                          .reshape({expand_batch_product, n, m1});
-    const auto tensor2_expanded = tensor2.expand(tensor2_expand_size)
-                                         .reshape({expand_batch_product, m2, p});
+    // We need to treat the dim_tensor2 == 1 case separately as broadcasting would not convert
+    // a vector of shape (n,) into a batch of matrices of shape (*, n, 1)
+    auto vector_rhs = dim_tensor2 == 1;
+    const auto tensor2_expand_size = [&output_shape, m2, p, vector_rhs]{
+      DimVector ret(output_shape);
+      if (vector_rhs) {
+        ret.push_back(m2);
+      } else {
+        ret.append({m2, p});
+      }
+      return ret;
+    }();
+    auto tensor2_expanded = tensor2.expand(tensor2_expand_size);
+    if (vector_rhs) {
+      tensor2_expanded = tensor2_expanded.reshape({expand_batch_product, m2}).unsqueeze(2);
+    } else {
+      tensor2_expanded = tensor2_expanded.reshape({expand_batch_product, m2, p});
+    }
+
     if (dim_tensor1 > 1) {
       output_shape.push_back(n);
     }
@@ -1958,11 +2018,18 @@ Tensor _matmul_impl(
     }
 
     if (!has_out) {
-      return at::_unsafe_view(tensor1_expanded.bmm(tensor2_expanded), output_shape);
+      if (vector_rhs) {
+        return at::_unsafe_view(tensor1_expanded.bmm(tensor2_expanded).squeeze(-1), output_shape);
+      } else {
+        return at::_unsafe_view(tensor1_expanded.bmm(tensor2_expanded), output_shape);
+      }
     } else {
       at::native::resize_output(out, output_shape);
       auto reshaped_out = out.reshape({expand_batch_product, n, p});
       at::bmm_out(reshaped_out, tensor1_expanded, tensor2_expanded);
+      if (vector_rhs) {
+        reshaped_out = reshaped_out.squeeze(-1);
+      }
       if (!reshaped_out.is_alias_of(out)) {
         out.copy_(reshaped_out.view_as(out));
       }
diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp
index 7f3d80212bc6..484d58255fdb 100644
--- a/aten/src/ATen/native/Loss.cpp
+++ b/aten/src/ATen/native/Loss.cpp
@@ -246,13 +246,7 @@ Tensor kl_div(const Tensor& input, const Tensor& target, int64_t reduction, bool
   if (log_target) {
     output = at::exp(target) * (target - input);
   } else {
-    if (input.is_mps() || target.is_mps()) {
-      // MPS fallback, as MPS does not currently implement xlogy.
-      // MPS will give the wrong results at `target[i] = 0`
-      output = target * (at::log(target) - input);
-    } else {
-      output = at::xlogy(target, target) - target * input;
-    }
+    output = at::xlogy(target, target) - target * input;
   }
   return apply_loss_reduction(output, reduction);
 }
diff --git a/aten/src/ATen/native/MaxPooling.cpp b/aten/src/ATen/native/MaxPooling.cpp
index 515ef588b441..efc640413046 100644
--- a/aten/src/ATen/native/MaxPooling.cpp
+++ b/aten/src/ATen/native/MaxPooling.cpp
@@ -40,7 +40,7 @@ static void check_max_pool1d(
       "max_pool1d() kernel_size must be an int, list of ints or tuple of ints of size 1 but got size ",
       kernel_size.size());
   TORCH_CHECK(
-      stride.size() == 0 || stride.size() == 1,
+      stride.empty() || stride.size() == 1,
       "max_pool1d() stride must be None, an int, list of ints, or tuple of ints of size 1 but got size ",
       stride.size());
   TORCH_CHECK(
diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp
index adab802d65cd..3ba0c3ce2e7e 100644
--- a/aten/src/ATen/native/MaxUnpooling.cpp
+++ b/aten/src/ATen/native/MaxUnpooling.cpp
@@ -84,9 +84,7 @@ static void max_unpooling3d_shape_check(
     IntArrayRef stride,
     IntArrayRef padding,
     const char *fn_name) {
-  int64_t oT = output_size[0];
-  int64_t oH = output_size[1];
-  int64_t oW = output_size[2];
+
   TORCH_CHECK(
       indices.scalar_type() == at::ScalarType::Long,
       "elements in indices should be type int64");
@@ -118,6 +116,10 @@ static void max_unpooling3d_shape_check(
       "strides should be greater than zero, but got stride: ",
       stride);
 
+  int64_t oT = output_size[0];
+  int64_t oH = output_size[1];
+  int64_t oW = output_size[2];
+
   int dimw = 3;
   int dimh = 2;
   int dimt = 1;
@@ -167,9 +169,6 @@ Tensor& max_unpooling3d_forward_out_cpu(const Tensor& self_,
   at::globalContext().alertNotDeterministic("max_unpooling3d_forward_out");
 
   TORCH_CHECK(output.is_contiguous(), "output must be contiguous");
-  int64_t oT = output_size[0];
-  int64_t oH = output_size[1];
-  int64_t oW = output_size[2];
 
   auto self = self_.contiguous();
   auto indices = indices_.contiguous();
@@ -177,6 +176,10 @@ Tensor& max_unpooling3d_forward_out_cpu(const Tensor& self_,
   max_unpooling3d_shape_check(
     self_, Tensor(), indices_, output_size, stride, padding, "max_unpooling3d_forward_out_cpu()");
 
+  int64_t oT = output_size[0];
+  int64_t oH = output_size[1];
+  int64_t oW = output_size[2];
+
   if (self_.ndimension() == 5) {
     output.resize_({self.size(0), self.size(1), oT, oH, oW});
   } else {
diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp
index a9cf36a004f4..404b26e72c46 100644
--- a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp
+++ b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp
@@ -209,7 +209,10 @@ TORCH_META_FUNC(slow_conv_transpose2d)
 
   int n_output_plane = weight.size(1);
 
-  Tensor input_ = input.contiguous();
+  bool use_channels_last = native::thnn_conv_use_channels_last(input, weight);
+  auto memory_format = use_channels_last ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous;
+
+  Tensor input_ = input.contiguous(memory_format);
 
   if (input_.dim() == 3) {
     input_.resize_({1, input_.size(0), input_.size(1), input_.size(2)});
@@ -231,15 +234,12 @@ TORCH_META_FUNC(slow_conv_transpose2d)
       0,
       {batch_size, n_output_plane, output_height, output_width},
       {},
-      options.memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT));
+      options.memory_format(memory_format));
 }
 } // namespace meta
 
 namespace native {
 
-template<typename scalar_t>
-void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t lda, scalar_t *x, int64_t incx, scalar_t beta, scalar_t *y, int64_t incy);
-
 namespace {
 void slow_conv_transpose2d_out_cpu_template(
     const Tensor& output,
@@ -265,19 +265,18 @@ void slow_conv_transpose2d_out_cpu_template(
   int n_input_plane = weight.size(0);
   int n_output_plane = weight.size(1);
 
-  Tensor input_ = input.contiguous();
-  Tensor weight_ = weight.contiguous();
+  bool use_channels_last = thnn_conv_use_channels_last(input, weight);
+  auto memory_format = use_channels_last ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous;
 
-  Tensor bias_ = Tensor();
-
-  if (bias.defined()) {
-    bias_ = bias.contiguous();
-  }
+  Tensor input_ = input.contiguous(memory_format);
+  Tensor weight_ = weight.contiguous(memory_format);
+  Tensor bias_ = bias.defined() ? bias.contiguous() : Tensor();
 
   bool is_batch = false;
   if (input_.dim() == 3) {
     // Force batch
     is_batch = true;
+    input_.resize_({1, input.size(0), input.size(1), input.size(2)});
   }
 
   int64_t input_height = input_.size(2);
@@ -291,98 +290,97 @@ void slow_conv_transpose2d_out_cpu_template(
   int64_t batch_size = input_.size(0);
 
   // Create temporary columns
-  Tensor columns = at::zeros({n_output_plane * kernel_width * kernel_height,
-      input_height * input_width}, input_.options());
-
-  // Define a buffer of ones, for bias accumulation
-  Tensor ones = bias.defined() ? at::ones({output_height, output_width}, input_.options()) : Tensor();
+  Tensor columns = at::empty({0}, input.options());
+  if (use_channels_last) {
+    columns.resize_({batch_size, input_height * input_width, kernel_height * kernel_width * n_output_plane});
+  } else {
+    columns.resize_({batch_size, n_output_plane * kernel_height * kernel_width, input_height * input_width});
+  }
+  columns.zero_();
 
-  AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long,
+  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Long, at::ScalarType::BFloat16,
       input.scalar_type(), "slow_conv_transpose2d_out_cpu", [&] {
-        // For each elt in batch, do:
-        for (const auto elt : c10::irange(batch_size)) {
-          // Helpers
-          Tensor input_n;
-          Tensor output_n;
 
-          // Matrix mulitply per output:
-          input_n = input_.select(0, elt);
-          output_n = output.select(0, elt);
+    at::parallel_for(0, batch_size, 0, [&](int64_t begin, int64_t end) {
+      // For each elt in batch, do:
+      for (const auto elt : c10::irange(begin, end)) {
+        // Matrix mulitply per output:
+        Tensor input_n = input_.select(0, elt);
+        Tensor output_n = output.select(0, elt);
+        Tensor columns_n = columns.select(0, elt);
 
-          // M,N,K are dims of matrix A and B
-          // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-          int64_t m = weight_.size(1) * weight_.size(2) * weight_.size(3);
+        if (use_channels_last) {
+          int64_t m = kernel_height * kernel_width * n_output_plane;
           int64_t n = input_height * input_width;
-          int64_t k = weight_.size(0);
+          int64_t k = n_input_plane;
 
-          // Do GEMM (note: this is a bit confusing because gemm assumes
-          // column-major matrices)
+          // column-major matrices
           cpublas::gemm(
               TransposeType::NoTranspose,
-              TransposeType::Transpose,
-              n,
+              TransposeType::NoTranspose,
               m,
+              n,
               k,
-              1,
+              static_cast<scalar_t>(1),
+              weight_.data_ptr<scalar_t>(),
+              m,
               input_n.data_ptr<scalar_t>(),
+              k,
+              static_cast<scalar_t>(0),
+              columns_n.data_ptr<scalar_t>(),
+              m);
+        } else {
+          int64_t m = input_height * input_width;
+          int64_t n = n_output_plane * kernel_height * kernel_width;
+          int64_t k = n_input_plane;
+
+          // column-major matrices
+          cpublas::gemm(
+              TransposeType::NoTranspose,
+              TransposeType::Transpose,
+              m,
               n,
-              weight_.data_ptr<scalar_t>(),
+              k,
+              static_cast<scalar_t>(1),
+              input_n.data_ptr<scalar_t>(),
               m,
-              0,
-              columns.data_ptr<scalar_t>(),
-              n);
-
-          // Unpack columns back into input:
-          col2im<scalar_t>(
-              columns.data_ptr<scalar_t>(),
-              n_output_plane,
-              output_height,
-              output_width,
-              input_height,
-              input_width,
-              kernel_height,
-              kernel_width,
-              pad_height,
-              pad_width,
-              stride_height,
-              stride_width,
-              dilation_height,
-              dilation_width,
-              output_n.data_ptr<scalar_t>());
-
-          // Do Bias after:
-          // M,N,K are dims of matrix A and B
-          // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-          int64_t m_ = n_output_plane;
-          int64_t n_ = output_height * output_width;
-          int64_t k_ = 1;
-
-          // Do GEMM (note: this is a bit confusing because gemm assumes
-          // column-major matrices)
-          if (bias.defined()) {
-            cpublas::gemm(
-                TransposeType::Transpose,
-                TransposeType::NoTranspose,
-                n_,
-                m_,
-                k_,
-                1,
-                ones.data_ptr<scalar_t>(),
-                k_,
-                bias_.data_ptr<scalar_t>(),
-                k_,
-                1,
-                output_n.data_ptr<scalar_t>(),
-                n_);
-          }
+              weight_.data_ptr<scalar_t>(),
+              n,
+              static_cast<scalar_t>(0),
+              columns_n.data_ptr<scalar_t>(),
+              m);
         }
 
-        // Resize output
-        if (is_batch) {
-          output.resize_({n_output_plane, output_height, output_width});
-          input_.resize_({n_input_plane, input_height, input_width});
-        }
-      });
+        // Unpack columns back into input:
+        col2im<scalar_t>(
+            columns_n.data_ptr<scalar_t>(),
+            n_output_plane,
+            output_height,
+            output_width,
+            input_height,
+            input_width,
+            kernel_height,
+            kernel_width,
+            pad_height,
+            pad_width,
+            stride_height,
+            stride_width,
+            dilation_height,
+            dilation_width,
+            output_n.data_ptr<scalar_t>(),
+            use_channels_last);
+      }
+    });
+  });
+
+  if (bias.defined()) {
+    output.add_(bias_.reshape({-1, 1, 1}));
+  }
+
+  // Resize output
+  if (is_batch) {
+    output.resize_({n_output_plane, output_height, output_width});
+  }
 }
 
 static void slow_conv_transpose2d_backward_out_cpu_template(
@@ -434,6 +432,9 @@ static void slow_conv_transpose2d_backward_out_cpu_template(
   int64_t n_input_plane = weight_.size(0);
   int64_t n_output_plane = weight_.size(1);
 
+  bool use_channels_last = thnn_conv_use_channels_last(input_, weight_);
+  auto memory_format = use_channels_last ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous;
+
   slow_conv_transpose2d_shape_check(
       input_,
       grad_output_,
@@ -451,9 +452,9 @@ static void slow_conv_transpose2d_backward_out_cpu_template(
       dilation_width,
       false);
 
-  Tensor input = input_.contiguous();
-  Tensor grad_output = grad_output_.contiguous();
-  Tensor weight = weight_.contiguous();
+  Tensor input = input_.contiguous(memory_format);
+  Tensor grad_output = grad_output_.contiguous(memory_format);
+  Tensor weight = weight_.contiguous(memory_format);
 
   bool is_batch = false;
   if (input.dim() == 3) {
@@ -475,17 +476,24 @@ static void slow_conv_transpose2d_backward_out_cpu_template(
   int64_t batch_size = input.size(0);
 
   // Resize output
-  grad_input.resize_({batch_size, n_input_plane, input_height, input_width});
+  grad_input.resize_({batch_size, n_input_plane, input_height, input_width}, memory_format);
   grad_input.zero_();
 
   // Create temporary columns
   bool need_columns = (kernel_height != 1 || kernel_width != 1 || stride_height != 1 ||
       stride_width != 1 || pad_height != 0 || pad_width != 0 ||
       dilation_height != 1 || dilation_width != 1);
-  Tensor grad_columns = need_columns ? at::empty({n_output_plane * kernel_width * kernel_height,
-      input_height * input_width}, input.options()) : Tensor();
 
-  AT_DISPATCH_FLOATING_TYPES(
+  Tensor grad_columns = at::empty({0}, input.options());
+  if (need_columns) {
+    if (use_channels_last) {
+      grad_columns.resize_({input_height * input_width, kernel_height * kernel_width * n_output_plane});
+    } else {
+      grad_columns.resize_({n_output_plane * kernel_height * kernel_width, input_height * input_width});
+    }
+  }
+
+  AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16,
       grad_output.scalar_type(), "slow_conv_transpose2d_backward_out_cpu", [&] {
         // Helpers
         Tensor grad_input_n = Tensor();
@@ -514,39 +522,59 @@ static void slow_conv_transpose2d_backward_out_cpu_template(
                   stride_width,
                   dilation_height,
                   dilation_width,
-                  grad_columns.data_ptr<scalar_t>());
+                  grad_columns.data_ptr<scalar_t>(),
+                  use_channels_last);
           }
 
-          // M,N,K are dims of matrix A and B
-          // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-          int64_t m = weight.size(0);
-          int64_t n = input_height * input_width;
-          int64_t k = weight.size(1) * weight.size(2) * weight.size(3);
-
-          // Do GEMM (note: this is a bit confusing because gemm assumes
-          // column-major matrices)
           auto gemm_in_ptr = need_columns ? grad_columns.data_ptr<scalar_t>()
               : grad_output_n.data_ptr<scalar_t>();
-          cpublas::gemm(
-              TransposeType::NoTranspose,
-              TransposeType::NoTranspose,
-              n,
-              m,
-              k,
-              1,
-              gemm_in_ptr,
-              n,
-              weight.data_ptr<scalar_t>(),
-              k,
-              0,
-              grad_input_n.data_ptr<scalar_t>(),
-              n);
+
+          if (use_channels_last) {
+            int64_t m = n_input_plane;
+            int64_t n = input_height * input_width;
+            int64_t k = n_output_plane * kernel_height * kernel_width;
+
+            // column-major matrices
+            cpublas::gemm(
+                TransposeType::Transpose,
+                TransposeType::NoTranspose,
+                m,
+                n,
+                k,
+                static_cast<scalar_t>(1),
+                weight.data_ptr<scalar_t>(),
+                k,
+                gemm_in_ptr,
+                k,
+                static_cast<scalar_t>(0),
+                grad_input_n.data_ptr<scalar_t>(),
+                m);
+
+          } else {
+            int64_t m = input_height * input_width;
+            int64_t n = n_input_plane;
+            int64_t k = n_output_plane * kernel_height * kernel_width;
+
+            // column-major matrices
+            cpublas::gemm(
+                TransposeType::NoTranspose,
+                TransposeType::NoTranspose,
+                m,
+                n,
+                k,
+                static_cast<scalar_t>(1),
+                gemm_in_ptr,
+                m,
+                weight.data_ptr<scalar_t>(),
+                k,
+                static_cast<scalar_t>(0),
+                grad_input_n.data_ptr<scalar_t>(),
+                m);
+          }
         }
 
         // Resize output
         if (is_batch) {
-          grad_output.resize_({n_output_plane, output_height, output_width});
-          input.resize_({n_input_plane, input_height, input_width});
           grad_input.resize_({n_input_plane, input_height, input_width});
         }
       });
@@ -554,6 +582,7 @@ static void slow_conv_transpose2d_backward_out_cpu_template(
 
 void slow_conv_transpose2d_acc_grad_parameters_cpu(
     const Tensor& input_,
+    const Tensor& weight_,
     const Tensor& grad_output_,
     Tensor& grad_weight,
     Tensor& grad_bias,
@@ -599,6 +628,9 @@ void slow_conv_transpose2d_acc_grad_parameters_cpu(
   int64_t output_padding_height = output_padding[0];
   int64_t output_padding_width = output_padding[1];
 
+  bool use_channels_last = thnn_conv_use_channels_last(input_, weight_);
+  auto memory_format = use_channels_last ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous;
+
   slow_conv_transpose2d_shape_check(
       input_,
       grad_output_,
@@ -616,31 +648,14 @@ void slow_conv_transpose2d_acc_grad_parameters_cpu(
       dilation_width,
       true);
 
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  int64_t n_output_plane;
-  if (grad_weight.defined()) {
-    n_output_plane = grad_weight.size(1);
-  } else if (grad_bias.defined()) {
-    n_output_plane = grad_bias.size(0);
-  } else {
-    return;
-  }
-
-  Tensor input = input_.contiguous();
-  Tensor grad_output = grad_output_.contiguous();
+  int n_input_plane = weight_.size(0);
+  int n_output_plane = weight_.size(1);
 
-  if (grad_weight.defined()) {
-    TORCH_CHECK(
-        grad_weight.is_contiguous(), "grad_weight needs to be contiguous");
-  }
-  if (grad_bias.defined()) {
-    TORCH_CHECK(grad_bias.is_contiguous(), "grad_bias needs to be contiguous");
-  }
+  Tensor input = input_.contiguous(memory_format);
+  Tensor grad_output = grad_output_.contiguous(memory_format);
+  TORCH_CHECK(grad_weight.is_contiguous(memory_format), "grad_weight needs to be contiguous");
 
-  bool is_batch = false;
   if (input.dim() == 3) {
-    // Force batch
-    is_batch = true;
     input.resize_({1, input.size(0), input.size(1), input.size(2)});
     grad_output.resize_(
         {1, grad_output.size(0), grad_output.size(1), grad_output.size(2)});
@@ -660,10 +675,17 @@ void slow_conv_transpose2d_acc_grad_parameters_cpu(
   bool need_columns = (kernel_height != 1 || kernel_width != 1 || stride_height != 1 ||
       stride_width != 1 || pad_height != 0 || pad_width != 0 ||
       dilation_height != 1 || dilation_width != 1);
-  Tensor columns = need_columns ? at::empty({n_output_plane * kernel_width * kernel_height,
-      input_height * input_width}, input.options()) : Tensor();
 
-  AT_DISPATCH_FLOATING_TYPES(
+  Tensor columns = at::empty({0}, input.options());
+  if (need_columns) {
+    if (use_channels_last) {
+      columns.resize_({input_height * input_width, kernel_height * kernel_width * n_output_plane});
+    } else {
+      columns.resize_({n_output_plane * kernel_height * kernel_width, input_height * input_width});
+    }
+  }
+
+  AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16,
       input.scalar_type(), "slow_conv_transpose2d_acc_grad_parameters_cpu", [&] {
         // Helpers
         Tensor input_n = Tensor();
@@ -698,44 +720,55 @@ void slow_conv_transpose2d_acc_grad_parameters_cpu(
                   stride_width,
                   dilation_height,
                   dilation_width,
-                  columns.data_ptr<scalar_t>());
+                  columns.data_ptr<scalar_t>(),
+                  use_channels_last);
             }
 
-            // M,N,K are dims of matrix A and B
-            // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-            int64_t n = n_output_plane * kernel_height * kernel_width;
-            int64_t m = input_n.size(0); // n_input_plane
-            int64_t k = input_height * input_width;
-
-            // Do GEMM (note: this is a bit confusing because gemm assumes
-            // column-major matrices)
             auto gemm_in_ptr = need_columns ? columns.data_ptr<scalar_t>()
                 : grad_output_n.data_ptr<scalar_t>();
-            cpublas::gemm(
-                TransposeType::Transpose,
-                TransposeType::NoTranspose,
-                n,
-                m,
-                k,
-                scale,
-                gemm_in_ptr,
-                k,
-                input_n.data_ptr<scalar_t>(),
-                k,
-                1,
-                grad_weight.data_ptr<scalar_t>(),
-                n);
-          }
-        }
 
-        if (grad_bias.defined()) {
-          at::sum_out(grad_bias, grad_output, IntArrayRef{0, 2, 3});
-        }
-
-        // Resize
-        if (is_batch) {
-          grad_output.resize_({n_output_plane, output_height, output_width});
-          input.resize_({input.size(1), input_height, input_width});
+            if (use_channels_last) {
+              int64_t m = kernel_height * kernel_width * n_output_plane;
+              int64_t n = n_input_plane;
+              int64_t k = input_height * input_width;
+
+              // column-major matrices
+              cpublas::gemm(
+                  TransposeType::NoTranspose,
+                  TransposeType::Transpose,
+                  m,
+                  n,
+                  k,
+                  static_cast<scalar_t>(scale),
+                  gemm_in_ptr,
+                  m,
+                  input_n.data_ptr<scalar_t>(),
+                  n,
+                  static_cast<scalar_t>(1),
+                  grad_weight.data_ptr<scalar_t>(),
+                  m);
+            } else {
+              int64_t m = n_output_plane * kernel_height * kernel_width;
+              int64_t n = n_input_plane;
+              int64_t k = input_height * input_width;
+
+              // column-major matrices
+              cpublas::gemm(
+                  TransposeType::Transpose,
+                  TransposeType::NoTranspose,
+                  m,
+                  n,
+                  k,
+                  static_cast<scalar_t>(scale),
+                  gemm_in_ptr,
+                  k,
+                  input_n.data_ptr<scalar_t>(),
+                  k,
+                  static_cast<scalar_t>(1),
+                  grad_weight.data_ptr<scalar_t>(),
+                  m);
+            }
+          }
         }
       });
 }
@@ -790,19 +823,16 @@ std::tuple<Tensor&, Tensor&, Tensor&> slow_conv_transpose2d_backward_out_cpu(con
         dilation);
   }
 
-  if (grad_weight.defined()) {
-    grad_weight.resize_(weight.sizes());
-    grad_weight.zero_();
-  }
-
   if (grad_bias.defined()) {
-    grad_bias.resize_({weight.size(1)});
-    grad_bias.zero_();
+    at::sum_out(grad_bias, grad_output, IntArrayRef{0, 2, 3});
   }
 
-  if (grad_weight.defined() || grad_bias.defined()) {
+  if (grad_weight.defined()) {
+    grad_weight.resize_(weight.sizes(), weight.suggest_memory_format());
+    grad_weight.zero_();
     slow_conv_transpose2d_acc_grad_parameters_cpu(
         input,
+        weight,
         grad_output,
         grad_weight,
         grad_bias,
@@ -863,19 +893,16 @@ std::tuple<Tensor, Tensor, Tensor> slow_conv_transpose2d_backward_cpu(
         dilation);
   }
 
-  if (grad_weight.defined()) {
-    grad_weight.resize_(weight.sizes());
-    grad_weight.zero_();
-  }
-
   if (grad_bias.defined()) {
-    grad_bias.resize_({weight.size(1)});
-    grad_bias.zero_();
+    at::sum_out(grad_bias, grad_output, IntArrayRef{0, 2, 3});
   }
 
-  if (grad_weight.defined() || grad_bias.defined()) {
+  if (grad_weight.defined()) {
+    grad_weight.resize_(weight.sizes(), weight.suggest_memory_format());
+    grad_weight.zero_();
     slow_conv_transpose2d_acc_grad_parameters_cpu(
         input,
+        weight,
         grad_output,
         grad_weight,
         grad_bias,
diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp
index cf60f56f9df4..6ff61684aa8a 100644
--- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp
+++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp
@@ -291,7 +291,7 @@ void slow_conv_transpose3d_out_cpu_template(
   // Define a buffer of ones, for bias accumulation
   Tensor ones = bias.defined() ? at::ones({output_depth, output_height, output_width}, input_.options()) : Tensor();
 
-  AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long,
+  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Long, at::ScalarType::BFloat16,
       input.scalar_type(), "slow_conv_transpose3d_out_cpu", [&] {
         // Helpers
         Tensor input_n;
@@ -319,12 +319,12 @@ void slow_conv_transpose3d_out_cpu_template(
               n,
               m,
               k,
-              1,
+              static_cast<scalar_t>(1),
               input_n.data_ptr<scalar_t>(),
               n,
               weight.data_ptr<scalar_t>(),
               m,
-              0,
+              static_cast<scalar_t>(0),
               columns.data_ptr<scalar_t>(),
               n);
 
@@ -368,12 +368,12 @@ void slow_conv_transpose3d_out_cpu_template(
                 n_,
                 m_,
                 k_,
-                1,
+                static_cast<scalar_t>(1),
                 ones.data_ptr<scalar_t>(),
                 k_,
                 bias.data_ptr<scalar_t>(),
                 k_,
-                1,
+                static_cast<scalar_t>(1),
                 output_n.data_ptr<scalar_t>(),
                 n_);
           }
@@ -515,7 +515,7 @@ void slow_conv_transpose3d_backward_out_cpu_template(
   Tensor grad_columns = need_columns ? at::empty({n_output_plane * kernel_width * kernel_height * kernel_depth,
       input_depth * input_height * input_width}, input.options()) : Tensor();
 
-  AT_DISPATCH_FLOATING_TYPES(
+  AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16,
       input.scalar_type(), "slow_conv_transpose3d_backward_out_cpu", [&] {
         // Helpers
         Tensor grad_input_n;
@@ -571,12 +571,12 @@ void slow_conv_transpose3d_backward_out_cpu_template(
               n,
               m,
               k,
-              1,
+              static_cast<scalar_t>(1),
               gemm_in_ptr,
               n,
               weight.data_ptr<scalar_t>(),
               k,
-              0,
+              static_cast<scalar_t>(0),
               grad_input_n.data_ptr<scalar_t>(),
               n);
         }
@@ -728,7 +728,7 @@ void slow_conv_transpose3d_acc_grad_parameters_cpu(
   Tensor columns = need_columns ? at::empty({n_output_plane * kernel_width * kernel_height * kernel_depth,
       input_depth * input_height * input_width}, input.options()) : Tensor();
 
-  AT_DISPATCH_FLOATING_TYPES(
+  AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16,
       input.scalar_type(),
       "slow_conv_transpose3d_acc_grad_parameters_cpu",
       [&] {
@@ -791,12 +791,12 @@ void slow_conv_transpose3d_acc_grad_parameters_cpu(
                 n,
                 m,
                 k,
-                scale,
+                static_cast<scalar_t>(scale),
                 gemm_in_ptr,
                 k,
                 input_n.data_ptr<scalar_t>(),
                 k,
-                1,
+                static_cast<scalar_t>(1),
                 grad_weight.data_ptr<scalar_t>(),
                 n);
           }
diff --git a/aten/src/ATen/native/NonEmptyUtils.h b/aten/src/ATen/native/NonEmptyUtils.h
index bd830cb67816..fdfded039aa8 100644
--- a/aten/src/ATen/native/NonEmptyUtils.h
+++ b/aten/src/ATen/native/NonEmptyUtils.h
@@ -18,7 +18,7 @@ inline int64_t ensure_nonempty_stride(const TensorBase &t, int64_t dim) {
 
 using IdxVec = std::vector<int64_t>;
 inline IdxVec ensure_nonempty_vec(IdxVec vec) {
-  if (vec.size() == 0) {
+  if (vec.empty()) {
     vec.push_back(1);
   }
   return vec;
diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp
index ab9094d9b598..dc03d5209777 100644
--- a/aten/src/ATen/native/Normalization.cpp
+++ b/aten/src/ATen/native/Normalization.cpp
@@ -43,13 +43,15 @@
 #include <ATen/ops/native_batch_norm_backward.h>
 #include <ATen/ops/native_batch_norm_backward_native.h>
 #include <ATen/ops/native_batch_norm_native.h>
+#include <ATen/ops/_native_batch_norm_legit.h>
 #include <ATen/ops/renorm_native.h>
 #include <ATen/ops/sum.h>
 #include <ATen/ops/sqrt.h>
 #endif
 
-#include <vector>
 #include <c10/core/SymIntArrayRef.h>
+#include <utility>
+#include <vector>
 
 static const int MIOPEN_DIM_MAX = 5;
 
@@ -135,8 +137,10 @@ std::tuple<Tensor,Tensor,Tensor> batch_norm_cpu_transform_input_template(
 
   // inference contiguous path
   if (all_contiguous) {
-    batch_norm_cpu_stub(kCPU, output, input, weight, bias,
-        save_mean, save_invstd, running_mean, running_var, train, eps);
+    if (input.numel() != 0) {
+      batch_norm_cpu_stub(kCPU, output, input, weight, bias,
+          save_mean, save_invstd, running_mean, running_var, train, eps);
+    }
     return std::make_tuple(output, save_mean, save_invstd);
   }
 
@@ -490,7 +494,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
     auto options = input.options().dtype(
         at::toAccumulateType(input.scalar_type(), /*is_cuda=*/input.is_cuda()));
     auto save_mean = at::empty_symint(c10::SymIntArrayRef({num_features}), options);
-    auto save_invstd = at::empty_symint(c10::SymIntArrayRef({num_features}), options);
+    auto save_invstd = at::empty_symint(c10::SymIntArrayRef({std::move(num_features)}), options);
 
     // don't return view of input, don't return empty tensor because it will break gradient chain
     auto out = input.clone();
@@ -514,7 +518,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
     check_dims_match_num_input_features("weight", num_features, weight.sym_numel());
   }
   if (bias.defined()) {
-    check_dims_match_num_input_features("bias", num_features, bias.sym_numel());
+    check_dims_match_num_input_features("bias", std::move(num_features), bias.sym_numel());
   }
 
   const bool use_cudnn = (
@@ -672,7 +676,7 @@ Tensor instance_norm(
     at::alias(running_mean).copy_(running_mean_.view_symint({ b, c }).mean(0, false));
   }
   if (running_var.defined()) {
-    at::alias(running_var).copy_(running_var_.view_symint({ b, c }).mean(0, false));
+    at::alias(running_var).copy_(running_var_.view_symint({ std::move(b), std::move(c) }).mean(0, false));
   }
 
   return out.view_symint(input.sym_sizes());
@@ -799,6 +803,11 @@ std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_no_stats_cpu(
     bool train, double momentum, double eps) {
   return batch_norm_cpu(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, eps);
 }
+std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_no_training(
+    const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt,
+    const Tensor& running_mean, const Tensor& running_var, double momentum, double eps) {
+  return at::_native_batch_norm_legit(self, weight_opt, bias_opt, const_cast<Tensor&>(running_mean), const_cast<Tensor&>(running_var), /*train=*/false, momentum, eps);
+}
 
 
 std::tuple<Tensor&, Tensor&, Tensor&> _batch_norm_legit_cpu_out(const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) {
diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h
index 0ff4490086b7..15c16d1d7ba5 100644
--- a/aten/src/ATen/native/Pool.h
+++ b/aten/src/ATen/native/Pool.h
@@ -4,6 +4,8 @@
 #include <ATen/native/DispatchStub.h>
 #include <c10/util/irange.h>
 
+#include <utility>
+
 #pragma once
 
 namespace at {
@@ -93,7 +95,7 @@ inline std::pair<int64_t, int64_t> pooling_same_mode_padding_lr(
 
 inline std::pair<c10::SymInt, c10::SymInt> pooling_same_mode_padding_lr(
     c10::SymInt inputSize, c10::SymInt kernelSize, int64_t stride, int64_t dilation) {
-  return _pooling_same_mode_padding_lr(inputSize, kernelSize, stride, dilation);
+  return _pooling_same_mode_padding_lr(std::move(inputSize), std::move(kernelSize), stride, dilation);
 }
 
 // AveragePool2d/DilatedMaxPool2d (forward)
diff --git a/aten/src/ATen/native/Pooling.cpp b/aten/src/ATen/native/Pooling.cpp
index fcbe741ab0ea..24e813a485a6 100644
--- a/aten/src/ATen/native/Pooling.cpp
+++ b/aten/src/ATen/native/Pooling.cpp
@@ -9,7 +9,6 @@
 #include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
 #else
-#include <ATen/ops/_mps_max_pool2d.h>
 #include <ATen/ops/adaptive_avg_pool1d_native.h>
 #include <ATen/ops/adaptive_avg_pool2d.h>
 #include <ATen/ops/adaptive_max_pool1d_native.h>
@@ -141,12 +140,6 @@ Tensor max_pool2d(
     return at::mkldnn_max_pool2d(
         self, kernel_size, stride, padding, dilation, ceil_mode);
   }
-#ifdef USE_MPS
-  if (self.is_mps()) {
-    return at::_mps_max_pool2d(
-        self, kernel_size, stride, padding, dilation, ceil_mode);
-  }
-#endif
 #if defined(C10_MOBILE)
   if(xnnpack::use_max_pool2d(self, kernel_size, padding, stride,
                              dilation, ceil_mode)) {
diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp
index 3fcd6f366dc0..6b2b985bdd92 100644
--- a/aten/src/ATen/native/RNN.cpp
+++ b/aten/src/ATen/native/RNN.cpp
@@ -403,11 +403,6 @@ struct QuantizedCellParamsDynamic : public CellParamsBase {
     return b_hh_;
   }
   CellParamsSerializationType __getstate__() const override {
-    // Boxed dispatch nonsense
-    // This will be cleaned up in the subsequent PR
-    auto unpacked_ih = packed_w_ih->unpack();
-    auto unpacked_hh = packed_w_hh->unpack();
-
     std::vector<at::Tensor> tensors_to_serialize{
         /*b_ih=*/b_ih_,
         /*b_hh=*/b_hh_,
@@ -1428,7 +1423,7 @@ std::tuple<Tensor, Tensor, Tensor> lstm(
   }
 #ifdef USE_MPS
   if (_input.is_mps() && !bidirectional) {
-    std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> output = at::_lstm_mps(_input, hx, _params, has_biases,
+    std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> output = at::_lstm_mps(_input, hx, _params, has_biases,
             num_layers, dropout_p, train, bidirectional, batch_first);
     std::tuple<Tensor, Tensor, Tensor> return_values = std::make_tuple(std::get<0>(output), std::get<1>(output), std::get<2>(output));
     return return_values;
diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp
index db5bdd088bc1..027ccbeb72df 100644
--- a/aten/src/ATen/native/ReduceOps.cpp
+++ b/aten/src/ATen/native/ReduceOps.cpp
@@ -26,6 +26,8 @@
 #include <ATen/ops/_cummin_helper_native.h>
 #include <ATen/ops/_logcumsumexp.h>
 #include <ATen/ops/_logcumsumexp_native.h>
+#include <ATen/ops/_sparse_sum.h>
+#include <ATen/ops/_sparse_sum_native.h>
 #include <ATen/ops/add.h>
 #include <ATen/ops/all_meta.h>
 #include <ATen/ops/all_native.h>
@@ -126,20 +128,6 @@
 namespace at {
 namespace native {
 
-inline ScalarType get_dtype_from_self(
-    const Tensor& self,
-    const optional<ScalarType>& dtype,
-    bool promote_integers) {
-  if (dtype.has_value()) {
-    return dtype.value();
-  }
-  ScalarType src_type = self.scalar_type();
-  if (promote_integers && at::isIntegralType(src_type, /*includeBool=*/true)) {
-    return kLong;
-  }
-  return src_type;
-}
-
 } // namespace native
 
 namespace meta {
@@ -917,6 +905,10 @@ static inline void diff_check(const Tensor& self, int64_t n, int64_t dim, const
       self.dim() >= 1,
       "diff expects input to be at least one-dimensional");
 
+  TORCH_CHECK(
+      n >= 0,
+      "order must be non-negative but got ", n);
+
   diff_check_compatible_shape(self, prepend, dim);
   diff_check_compatible_shape(self, append, dim);
 }
@@ -1161,14 +1153,6 @@ std::vector<Tensor> gradient(const Tensor& self, IntArrayRef dim, int64_t edge_o
 
 // ALL REDUCE #################################################################
 
-inline ScalarType get_dtype_from_result(Tensor& result, optional<ScalarType> dtype) {
-  TORCH_CHECK(result.defined(), "Cannot create a new tensor inside a reduction op. You likely tried to call an operator with an out argument but the out argument was an undefined tensor.");
-  if (dtype.has_value()) {
-    return dtype.value();
-  } else {
-    return result.scalar_type();
-  }
-}
 
 TORCH_IMPL_FUNC(sum_out)
 (const Tensor& self,
@@ -1199,7 +1183,10 @@ Tensor& sum_out(const Tensor& self, DimnameList dim,
 
 Tensor& nansum_out(const Tensor& self, at::OptionalIntArrayRef dim,
                        bool keepdim, optional<ScalarType> opt_dtype, Tensor& result) {
-  TORCH_CHECK(!c10::isComplexType(self.scalar_type()), "nansum does not support complex inputs");
+  if (self.device().is_cpu()) {
+    TORCH_CHECK(!c10::isComplexType(self.scalar_type()), "nansum does not support complex inputs");
+  }
+
   // For integral types, use existing sum as
   // integral types don't have `Nan`.
   if (c10::isIntegralType(self.scalar_type(), true)){
@@ -1319,7 +1306,7 @@ TORCH_IMPL_FUNC(mean_out)
   // in lieu of the sum + divide implementation below.
   if (self.device().is_cpu()) {
     int64_t dim_prod = 1;
-    if (!opt_dim.has_value() || opt_dim.value().size() == 0 || self.ndimension() == 0) {
+    if (!opt_dim.has_value() || opt_dim.value().empty() || self.ndimension() == 0) {
       dim_prod = self.numel();
     } else {
       auto dim = opt_dim.value();
@@ -1361,8 +1348,8 @@ Tensor& nanmean_out(
     c10::optional<ScalarType> opt_dtype,
     Tensor& result) {
   TORCH_CHECK(
-      self.is_floating_point(),
-      "nanmean(): expected input to have floating point dtype but got ",
+      self.is_floating_point() || self.is_complex(),
+      "nanmean(): expected input to have floating point or complex dtype but got ",
       self.scalar_type());
   const auto factor = at::native::isnan(self).logical_not_().sum(dim, keepdim);
   at::native::nansum_out(self, dim, keepdim, opt_dtype, result).div_(factor);
@@ -1375,8 +1362,8 @@ Tensor nanmean(
     bool keepdim,
     optional<ScalarType> opt_dtype) {
   TORCH_CHECK(
-      self.is_floating_point(),
-      "nanmean(): expected input to have floating point dtype but got ",
+      self.is_floating_point() || self.is_complex(),
+      "nanmean(): expected input to have floating point or complex dtype but got ",
       self.scalar_type());
   const auto factor =
       at::native::isnan(self.detach()).logical_not_().sum(dim, keepdim);
@@ -1620,7 +1607,7 @@ TORCH_IMPL_FUNC(argmin_out)
   argmax_argmin_impl(self, dim, keepdim, result, argmin_stub);
 }
 
-static double std_var_all_cpu(const Tensor& self, int64_t correction, bool take_sqrt) {
+static double std_var_all_cpu(const Tensor& self, double correction, bool take_sqrt) {
   const auto dtype = self.scalar_type();
   TORCH_CHECK(dtype == kDouble || dtype == kFloat,
               "std_var_all: Unsupported dtype ", dtype);
@@ -1658,7 +1645,7 @@ static double std_var_all_cpu(const Tensor& self, int64_t correction, bool take_
       0, iter.numel(), at::internal::GRAIN_SIZE, 0.0, reduction, std::plus<>{});
 
   const auto var = [&] () __ubsan_ignore_float_divide_by_zero__ {
-    return sum_dx2 / std::max(int64_t{0}, self.numel() - correction);
+    return sum_dx2 / std::max(0.0, self.numel() - correction);
   }();
   const auto result = take_sqrt ? std::sqrt(var) : var;
 
@@ -1672,7 +1659,7 @@ static double std_var_all_cpu(const Tensor& self, int64_t correction, bool take_
 
 static Tensor& std_var_out(
     const char* fname, Tensor& result, const Tensor& self,
-    at::OptionalIntArrayRef dim, c10::optional<int64_t> correction_opt,
+    at::OptionalIntArrayRef dim, const c10::optional<Scalar>& correction_opt,
     bool keepdim, bool take_sqrt) {
   TORCH_CHECK(self.device().is_cpu() || self.device().is_cuda(),
               "std and var only supports tensors on a CPU or CUDA device, but got: ",
@@ -1716,7 +1703,7 @@ static Tensor& std_var_out(
   }
 
   // Computation for floating point
-  const auto correction = correction_opt.value_or(1);
+  const auto correction = correction_opt.value_or(1).toDouble();
   ScalarType dtype = get_dtype_from_result(result, {});
   auto iter = make_reduction(fname, result, self, dim, keepdim, dtype);
   TORCH_CHECK(at::canCast(self.scalar_type(), result.scalar_type()),
@@ -1743,7 +1730,7 @@ static Tensor& std_var_out(
 
 static std::tuple<Tensor&, Tensor&> std_var_mean_out(
     const char* fname, Tensor& result1, Tensor& result2, const Tensor& self,
-    at::OptionalIntArrayRef dim, c10::optional<int64_t> correction_opt,
+    at::OptionalIntArrayRef dim, const c10::optional<Scalar>& correction_opt,
     bool keepdim, bool take_sqrt) {
   AT_ASSERT(result1.defined() && result2.defined());
   TORCH_CHECK(self.device().is_cpu() || self.is_cuda(),
@@ -1797,7 +1784,7 @@ static std::tuple<Tensor&, Tensor&> std_var_mean_out(
   }
 
   // Computation for floating point
-  const auto correction = correction_opt.value_or(1);
+  const auto correction = correction_opt.value_or(1).toDouble();
   ScalarType dtype = get_dtype_from_result(result1, {});
   auto iter =
       make_reduction(fname, result1, result2, self, dim, keepdim, dtype);
@@ -1816,7 +1803,7 @@ std::tuple<Tensor, Tensor> var_mean(
     const Tensor& self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim) {
   return at::var_mean(
       self, /*dim=*/at::OptionalIntArrayRef(dim),
-      /*correction=*/c10::make_optional<int64_t>({unbiased ? 1 : 0}),
+      /*correction=*/c10::make_optional<Scalar>(unbiased ? 1 : 0),
       keepdim);
 }
 
@@ -1824,22 +1811,21 @@ std::tuple<Tensor, Tensor> std_mean(
     const Tensor& self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim) {
   return at::std_mean(
       self, /*dim=*/at::OptionalIntArrayRef(dim),
-      /*correction=*/c10::make_optional<int64_t>({unbiased ? 1 : 0}),
+      /*correction=*/c10::make_optional<Scalar>(unbiased ? 1 : 0),
       keepdim);
 }
 
 std::tuple<Tensor, Tensor> std_mean(const Tensor& self, bool unbiased) {
   return at::std_mean(
       self, /*dim=*/c10::nullopt,
-      /*correction=*/c10::make_optional<int64_t>({unbiased ? 1 : 0}));
+      /*correction=*/c10::make_optional<Scalar>(unbiased ? 1 : 0));
 }
 
 std::tuple<Tensor, Tensor> var_mean(const Tensor& self, bool unbiased) {
   return at::var_mean(
       self, /*dim=*/c10::nullopt,
-      /*correction=*/c10::make_optional<int64_t>({unbiased ? 1 : 0}));
+      /*correction=*/c10::make_optional<Scalar>(unbiased ? 1 : 0));
 }
-
 std::tuple<Tensor&, Tensor&> var_mean_out(
     Tensor& result1, Tensor& result2, const Tensor& self, IntArrayRef dim,
     int64_t correction, bool keepdim) {
@@ -1854,7 +1840,7 @@ static TensorOptions options_to_value_type(TensorOptions opts) {
 
 std::tuple<Tensor, Tensor> var_mean(
     const Tensor& self, at::OptionalIntArrayRef dim,
-    c10::optional<int64_t> correction, bool keepdim) {
+    const c10::optional<Scalar>& correction, bool keepdim) {
   Tensor result1 = at::empty({0}, options_to_value_type(self.options()));
   Tensor result2 = at::empty({0}, self.options());
   return std_var_mean_out(
@@ -1863,7 +1849,7 @@ std::tuple<Tensor, Tensor> var_mean(
 
 std::tuple<Tensor, Tensor> std_mean(
     const Tensor& self, at::OptionalIntArrayRef dim,
-    c10::optional<int64_t> correction, bool keepdim) {
+    const c10::optional<Scalar>& correction, bool keepdim) {
   Tensor result1 = at::empty({0}, options_to_value_type(self.options()));
   Tensor result2 = at::empty({0}, self.options());
   return std_var_mean_out(
@@ -1873,59 +1859,59 @@ std::tuple<Tensor, Tensor> std_mean(
 Tensor var(const Tensor& self, bool unbiased) {
   return at::var(
       self, /*dim=*/c10::nullopt,
-      /*correction=*/c10::make_optional<int64_t>({unbiased ? 1 : 0}));
+      /*correction=*/c10::make_optional<Scalar>(unbiased ? 1 : 0));
 }
 
 Tensor var(const Tensor& self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim) {
   return at::var(
       self, /*dim=*/at::OptionalIntArrayRef(dim),
-      /*correction=*/c10::make_optional<int64_t>({unbiased ? 1 : 0}),
+      /*correction=*/c10::make_optional<Scalar>(unbiased ? 1 : 0),
       keepdim);
 }
 
 Tensor& var_out(const Tensor& self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim, Tensor& result) {
   return at::var_out(
       result, self, /*dim=*/at::OptionalIntArrayRef(dim),
-      /*correction=*/c10::make_optional<int64_t>({unbiased ? 1 : 0}),
+      /*correction=*/c10::make_optional<Scalar>(unbiased ? 1 : 0),
       keepdim);
 }
 
 Tensor std(const Tensor& self, bool unbiased) {
   return at::std(
-      self, /*dim=*/c10::nullopt, /*correction=*/c10::make_optional<int64_t>({unbiased ? 1 : 0}));
+      self, /*dim=*/c10::nullopt, /*correction=*/c10::make_optional<Scalar>(unbiased ? 1 : 0));
 }
 
 Tensor std(const Tensor& self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim) {
   return at::std(self, dim,
-                 /*correction=*/c10::make_optional<int64_t>({unbiased ? 1 : 0}), keepdim);
+                 /*correction=*/c10::make_optional<Scalar>(unbiased ? 1 : 0), keepdim);
 }
 
 Tensor& std_out(const Tensor& self, at::OptionalIntArrayRef opt_dim, bool unbiased, bool keepdim, Tensor& result) {
   return at::std_out(result, self, opt_dim,
-                     /*correction=*/c10::make_optional<int64_t>({unbiased ? 1 : 0}), keepdim);
+                     /*correction=*/c10::make_optional<Scalar>(unbiased ? 1 : 0), keepdim);
 }
 
 Tensor std(const Tensor& self, at::OptionalIntArrayRef dim,
-           c10::optional<int64_t> correction, bool keepdim) {
+           const c10::optional<Scalar>& correction, bool keepdim) {
   Tensor result = at::empty({0}, options_to_value_type(self.options()));
   return std_var_out("std", result, self, dim, correction, keepdim, true);
 }
 
 Tensor& std_out(
     const Tensor& self, at::OptionalIntArrayRef dim,
-    c10::optional<int64_t> correction, bool keepdim, Tensor& result) {
+    const c10::optional<Scalar>& correction, bool keepdim, Tensor& result) {
   return std_var_out("std", result, self, dim, correction, keepdim, true);
 }
 
 Tensor& var_out(
     const Tensor& self, at::OptionalIntArrayRef dim,
-    c10::optional<int64_t> correction, bool keepdim, Tensor& result) {
+    const c10::optional<Scalar>& correction, bool keepdim, Tensor& result) {
   return std_var_out("var", result, self, dim, correction, keepdim, false);
 }
 
 Tensor var(
     const Tensor& self, at::OptionalIntArrayRef dim,
-    c10::optional<int64_t> correction, bool keepdim) {
+    const c10::optional<Scalar>& correction, bool keepdim) {
   Tensor result = at::empty({0}, options_to_value_type(self.options()));
   return std_var_out("var", result, self, dim, correction, keepdim, false);
 }
@@ -1955,32 +1941,32 @@ std::tuple<Tensor,Tensor> std_mean(const Tensor& self, DimnameList dim, bool unb
   return at::std_mean(self, dimnames_to_positions(self, dim), unbiased, keepdim);
 }
 
-Tensor std(const Tensor& self, DimnameList dim, c10::optional<int64_t> correction, bool keepdim) {
+Tensor std(const Tensor& self, DimnameList dim, const c10::optional<Scalar>& correction, bool keepdim) {
   return at::std(self, dimnames_to_positions(self, dim), correction, keepdim);
 }
 
-Tensor& std_out(const Tensor& self, DimnameList dim, c10::optional<int64_t> correction,
+Tensor& std_out(const Tensor& self, DimnameList dim, const c10::optional<Scalar>& correction,
                 bool keepdim, Tensor& result) {
   return at::std_out(result, self, dimnames_to_positions(self, dim), correction, keepdim);
 }
 
-Tensor var(const Tensor& self, DimnameList dim, c10::optional<int64_t> correction, bool keepdim) {
+Tensor var(const Tensor& self, DimnameList dim, const c10::optional<Scalar>& correction, bool keepdim) {
   return at::var(self, dimnames_to_positions(self, dim), correction, keepdim);
 }
 
-Tensor& var_out(const Tensor& self, DimnameList dim, c10::optional<int64_t> correction,
+Tensor& var_out(const Tensor& self, DimnameList dim, const c10::optional<Scalar>& correction,
                 bool keepdim, Tensor& result) {
   return at::var_out(
       result, self, dimnames_to_positions(self, dim), correction, keepdim);
 }
 
 std::tuple<Tensor,Tensor> var_mean(const Tensor& self, DimnameList dim,
-                                   c10::optional<int64_t> correction, bool keepdim) {
+                                   const c10::optional<Scalar>& correction, bool keepdim) {
   return at::var_mean(self, dimnames_to_positions(self, dim), correction, keepdim);
 }
 
 std::tuple<Tensor,Tensor> std_mean(const Tensor& self, DimnameList dim,
-                                   c10::optional<int64_t> correction, bool keepdim) {
+                                   const c10::optional<Scalar>& correction, bool keepdim) {
   return at::std_mean(self, dimnames_to_positions(self, dim), correction, keepdim);
 }
 
@@ -2120,7 +2106,7 @@ Tensor value_selecting_reduction_backward_symint(const Tensor& grad, int64_t dim
         return grad_in.scatter_(dim, indices_, grad_out);
       };
 
-  if (!keepdim && sizes.size() > 0) {
+  if (!keepdim && !sizes.empty()) {
     auto grad_ = grad.unsqueeze(dim);
     auto indices_ = indices.unsqueeze(dim);
     return inplace_scatter_if_not_tensor_subclass(grad_, indices_);
@@ -2136,5 +2122,31 @@ Tensor sum_coo(const Tensor &self, c10::optional<ScalarType> dtype) {
   return self._values().sum(dtype);
 }
 
+Tensor sum_sparse_coo(const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype) {
+  Tensor result;
+  if (dim.has_value()) {
+    if (dtype.has_value()) {
+      result = at::_sparse_sum(self, *dim, *dtype);
+    } else {
+      if (c10::isIntegralType(self.scalar_type(), true)) {
+        result = at::_sparse_sum(self, *dim, at::kLong);
+      } else {
+        result = at::_sparse_sum(self, *dim);
+      }
+    }
+  } else {
+    result = sum_coo(self, dtype);
+  }
+  if (keepdim) {
+    auto dim_mask = make_dim_mask(dim, self.dim());
+    for (int dim = 0; dim < self.dim(); dim++) {
+      if (dim_mask[dim]) {
+        result = result.unsqueeze(dim);
+      }
+    }
+  }
+  return result;
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/ReduceOps.h b/aten/src/ATen/native/ReduceOps.h
index c14033de634d..d3c922901157 100644
--- a/aten/src/ATen/native/ReduceOps.h
+++ b/aten/src/ATen/native/ReduceOps.h
@@ -28,7 +28,7 @@ DECLARE_DISPATCH(reduce_fn, argmax_stub);
 DECLARE_DISPATCH(reduce_fn, argmin_stub);
 
 using reduce_std_var_function =
-    void (*)(TensorIterator&, int64_t correction, bool take_sqrt);
+    void (*)(TensorIterator&, double correction, bool take_sqrt);
 DECLARE_DISPATCH(reduce_std_var_function, std_var_stub);
 
 using reduce_norm_fn =
diff --git a/aten/src/ATen/native/ReduceOpsUtils.h b/aten/src/ATen/native/ReduceOpsUtils.h
index 2b46eb683f1c..8aa94c4b45ee 100644
--- a/aten/src/ATen/native/ReduceOpsUtils.h
+++ b/aten/src/ATen/native/ReduceOpsUtils.h
@@ -320,6 +320,30 @@ static C10_UNUSED void zero_numel_tensor_resize(Tensor& result, Tensor& result_i
   at::native::resize_output(result_indices, sizes);
 }
 
+inline ScalarType get_dtype_from_self(
+    const Tensor& self,
+    const c10::optional<ScalarType>& dtype,
+    bool promote_integers) {
+  if (dtype.has_value()) {
+    return dtype.value();
+  }
+  ScalarType src_type = self.scalar_type();
+  if (promote_integers && at::isIntegralType(src_type, /*includeBool=*/true)) {
+    return kLong;
+  }
+  return src_type;
+}
+
+inline ScalarType get_dtype_from_result(Tensor& result, c10::optional<ScalarType> dtype) {
+  TORCH_CHECK(result.defined(), "Cannot create a new tensor inside a reduction op. You likely tried to call an operator with an out argument but the out argument was an undefined tensor.");
+  if (dtype.has_value()) {
+    return dtype.value();
+  } else {
+    return result.scalar_type();
+  }
+}
+
+
 } // native
 
 namespace meta {
diff --git a/aten/src/ATen/native/ReductionType.h b/aten/src/ATen/native/ReductionType.h
index 2251dc4f50c2..63cd4e094ce6 100644
--- a/aten/src/ATen/native/ReductionType.h
+++ b/aten/src/ATen/native/ReductionType.h
@@ -7,11 +7,11 @@ namespace at { namespace native {
 enum ReductionType {MAX, MEAN, MIN, SUM, PROD};
 
 static inline ReductionType get_reduction_enum(const c10::string_view& reduce) {
-  if (reduce == "amax") {
+  if (reduce == "max" || reduce == "amax") {
     return ReductionType::MAX;
   } else if (reduce == "mean") {
     return ReductionType::MEAN;
-  } else if (reduce == "amin") {
+  } else if (reduce == "min" || reduce == "amin") {
     return ReductionType::MIN;
   } else if (reduce == "sum") {
     return ReductionType::SUM;
diff --git a/aten/src/ATen/native/ReflectionPad.cpp b/aten/src/ATen/native/ReflectionPad.cpp
index 3a6ad683d045..b712c8ea9e9e 100644
--- a/aten/src/ATen/native/ReflectionPad.cpp
+++ b/aten/src/ATen/native/ReflectionPad.cpp
@@ -81,7 +81,6 @@ TORCH_META_FUNC(reflection_pad1d)(const Tensor& input, IntArrayRef padding) {
 TORCH_META_FUNC(reflection_pad1d_backward)(const Tensor& grad_output,
     const Tensor& input,
     IntArrayRef padding) {
-  int64_t dim_plane = 0;
   int64_t dim_w = 1;
   int64_t nbatch = 1;
 
@@ -89,7 +88,6 @@ TORCH_META_FUNC(reflection_pad1d_backward)(const Tensor& grad_output,
     nbatch = input.size(0);
     (void)nbatch;
     dim_w++;
-    dim_plane++;
   }
 
   /* sizes */
diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp
index d0a4ea919acb..af97d1979c5c 100644
--- a/aten/src/ATen/native/ReplicationPadding.cpp
+++ b/aten/src/ATen/native/ReplicationPadding.cpp
@@ -70,19 +70,13 @@ TORCH_META_FUNC(replication_pad1d_backward) (
   IntArrayRef paddingSize
 ) {
   int64_t dimw = 1;
-  int64_t dimslices = 0;
-  int64_t nbatch = 1;
   TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2");
   int64_t pad_l = paddingSize[0];
   int64_t pad_r = paddingSize[1];
 
   if (input.ndimension() == 3)
   {
-    // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
-    nbatch = input.size(0);
-    (void)nbatch;
     dimw++;
-    dimslices++;
   }
 
   /* sizes */
@@ -154,7 +148,7 @@ static inline void shapeCheck3d(
   int dimw = 3;
   int dimh = 2;
   int dimd = 1;
-  int dimslices = 0;
+  /* int dimslices = 0; */
 
   // allow batch size of 0-dim.
   bool valid_dims = input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0;
@@ -169,7 +163,7 @@ static inline void shapeCheck3d(
     dimw++;
     dimh++;
     dimd++;
-    dimslices++;
+    /* dimslices++; */
   }
 
   /* sizes */
diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h
index c93e4cbe84ba..c328afcfad9b 100644
--- a/aten/src/ATen/native/Resize.h
+++ b/aten/src/ATen/native/Resize.h
@@ -7,6 +7,8 @@
 
 #include <c10/core/CPUAllocator.h>
 
+#include <utility>
+
 
 namespace at { namespace native {
 
@@ -130,7 +132,7 @@ static inline void checkSetStorage(Tensor& result, Storage storage, T storage_of
                 "Attempted to set the storage of a tensor on device \"", result.storage().device(),
                 "\" to a storage on different device \"", storage.device(),
                 "\".  This is no longer allowed; the devices must match.");
-    result.unsafeGetTensorImpl()->set_storage_keep_dtype(storage);
+    result.unsafeGetTensorImpl()->set_storage_keep_dtype(std::move(storage));
   }
 
   // storageOffset
diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h
index 0519bfa57e61..8e93ee12d5b9 100644
--- a/aten/src/ATen/native/SharedReduceOps.h
+++ b/aten/src/ATen/native/SharedReduceOps.h
@@ -74,12 +74,12 @@ template <typename T1, typename T2> using pair = std::pair<T1, T2>;
 
 } // namespace detail
 
-template <typename scalar_t, typename index_t, typename combine_t>
+template <typename scalar_t, typename index_t>
 struct WelfordData {
   scalar_t mean;
   scalar_t m2;
   index_t n;
-  combine_t nf;
+  scalar_t nf;
 
   C10_HOST_DEVICE WelfordData() : mean(0), m2(0), n(0), nf(0) {}
 
@@ -87,28 +87,30 @@ struct WelfordData {
       scalar_t mean,
       scalar_t m2,
       index_t n,
-      combine_t nf)
+      scalar_t nf)
       : mean(mean), m2(m2), n(n), nf(nf) {}
 };
 
 
-template <typename scalar_t, typename acc_scalar_t, typename index_t, typename combine_t, typename res_t>
+template <typename scalar_t, typename acc_scalar_t, typename index_t, typename res_t>
 struct WelfordOps {
-  index_t correction;
+  acc_scalar_t correction;
   bool take_sqrt;
  public:
-  using acc_t = WelfordData<acc_scalar_t, index_t, combine_t>;
+  using acc_t = WelfordData<acc_scalar_t, index_t>;
   inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const {
+    // We accumulate n in index_t to avoid cumulative rounding error, but still
+    // need nf for use in combine where int32 may overflow.
+    index_t new_n = acc.n + 1;
+    acc_scalar_t new_nf = static_cast<acc_scalar_t>(new_n);
     acc_scalar_t delta = data - acc.mean;
-    // using acc.nf(combine_t) here, as acc.n(index_t) would still be converted
-    // accumulation in reduce is done through index_T
-    acc_scalar_t new_mean = acc.mean + delta / (acc.nf + 1);
+    acc_scalar_t new_mean = acc.mean + delta / new_nf;
     acc_scalar_t new_delta = data - new_mean;
     return {
       new_mean,
       acc.m2 + delta * new_delta,
-      acc.n + 1,
-      combine_t(acc.n + 1), // accumulate for combine_t uses index_t
+      new_n,
+      new_nf,
     };
   }
   inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
@@ -119,7 +121,7 @@ struct WelfordOps {
       return a;
     }
     acc_scalar_t delta = b.mean - a.mean;
-    combine_t new_count = a.nf + b.nf;
+    acc_scalar_t new_count = a.nf + b.nf;
     acc_scalar_t nb_over_n = b.nf / new_count;
     return {
       a.mean + delta * nb_over_n,
@@ -132,7 +134,7 @@ struct WelfordOps {
   }
   inline C10_DEVICE res_t project(acc_t acc) const __ubsan_ignore_float_divide_by_zero__ {
     const auto mean = static_cast<scalar_t>(acc.mean);
-    const combine_t divisor = acc.nf > correction ? acc.nf - correction : 0;
+    const auto divisor = acc.nf > correction ? acc.nf - correction : 0;
     const auto var = acc.m2 / divisor;
     res_t results(take_sqrt ? device_sqrt(var) : var, mean);
     return results;
@@ -152,7 +154,7 @@ struct WelfordOps {
     };
   }
 #endif
-  C10_HOST_DEVICE WelfordOps(index_t correction, bool take_sqrt)
+  C10_HOST_DEVICE WelfordOps(acc_scalar_t correction, bool take_sqrt)
       : correction(correction), take_sqrt(take_sqrt) {}
 };
 
@@ -190,7 +192,7 @@ struct MeanOps {
 // a set of numbers.
 // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
 // value. These types differ for complex number input support.
-template <typename scalar_t, typename acc_t=scalar_t>
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
 struct AbsMinOps {
 
   inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
@@ -201,7 +203,7 @@ struct AbsMinOps {
     return MIN(a, b);
   }
 
-  inline C10_DEVICE acc_t project(acc_t a) const {
+  inline C10_DEVICE out_t project(acc_t a) const {
     return a;
   }
 
@@ -220,9 +222,8 @@ struct AbsMinOps {
 // a set of numbers.
 // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
 // value. These types differ for complex number input support.
-template <typename scalar_t, typename acc_t=scalar_t>
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
 struct AbsMaxOps {
-
   inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
     return MAX(acc, static_cast<acc_t>(std::abs(data)));
   }
@@ -231,7 +232,7 @@ struct AbsMaxOps {
     return MAX(a, b);
   }
 
-  inline C10_DEVICE acc_t project(acc_t a) const {
+  inline C10_DEVICE out_t project(acc_t a) const {
     return a;
   }
 
@@ -250,7 +251,7 @@ struct AbsMaxOps {
 // of a set of numbers.
 // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
 // value. These types differ for complex number input support.
-template <typename scalar_t, typename acc_t=scalar_t>
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
 struct NormOps {
   acc_t norm_;
 
@@ -262,7 +263,7 @@ struct NormOps {
     return a + b;
   }
 
-  inline C10_DEVICE acc_t project(acc_t a) const {
+  inline C10_DEVICE out_t project(acc_t a) const {
     return compat_pow(a, static_cast<acc_t>(1.0) / norm_);
   }
 
@@ -284,7 +285,7 @@ struct NormOps {
 // absolute value of a set of numbers.
 // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
 // value. These types differ for complex number input support.
-template <typename scalar_t, typename acc_t=scalar_t>
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
 struct NormZeroOps {
   inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
     return acc + (data == static_cast<scalar_t>(0) ? static_cast<acc_t>(0) : static_cast<acc_t>(1));
@@ -294,7 +295,7 @@ struct NormZeroOps {
     return a + b;
   }
 
-  inline C10_DEVICE acc_t project(acc_t a) const {
+  inline C10_DEVICE out_t project(acc_t a) const {
     return a;
   }
 
@@ -314,7 +315,7 @@ struct NormZeroOps {
 // absolute value of a set of numbers.
 // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
 // value. These types differ for complex number input support.
-template <typename scalar_t, typename acc_t=scalar_t>
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
 struct NormOneOps {
   inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
     return acc + static_cast<acc_t>(std::abs(data));
@@ -324,7 +325,7 @@ struct NormOneOps {
     return a + b;
   }
 
-  inline C10_DEVICE acc_t project(acc_t a) const {
+  inline C10_DEVICE out_t project(acc_t a) const {
     return a;
   }
 
@@ -362,7 +363,7 @@ inline C10_DEVICE acc_t abs_if_complex(c10::complex<scalar_t> data, AbsSwitch<ac
 // absolute value of a set of numbers.
 // `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
 // value. These types differ for complex number input support.
-template <typename scalar_t, typename acc_t=scalar_t>
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
 struct NormTwoOps {
   inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
     acc_t data_ = abs_if_complex(data, AbsSwitch<acc_t>());
@@ -373,7 +374,7 @@ struct NormTwoOps {
     return a + b;
   }
 
-  inline C10_DEVICE acc_t project(acc_t a) const {
+  inline C10_DEVICE out_t project(acc_t a) const {
     return device_sqrt(a);
   }
 
diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp
index 0332f57e9e23..853e9bcc4da9 100644
--- a/aten/src/ATen/native/SoftMax.cpp
+++ b/aten/src/ATen/native/SoftMax.cpp
@@ -155,7 +155,7 @@ void host_softmax(
     const Tensor& input,
     const int64_t dim,
     bool* mask = nullptr,
-    const c10::optional<int64_t> mask_type_ = NULL) {
+    const c10::optional<int64_t> mask_type_ = {}) {
 
   if (MaskedSoftMax) {
     TORCH_CHECK(mask_type_.has_value(), "Mask Type should be defined");
diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp
index 656737c62e2c..a78f16e6cc14 100644
--- a/aten/src/ATen/native/Sorting.cpp
+++ b/aten/src/ATen/native/Sorting.cpp
@@ -64,7 +64,7 @@ TORCH_META_FUNC(topk)
   // Build the output size, which is the dim being selected set to
   // size k
   DimVector topKSize(self.sizes().vec());
-  if (topKSize.size() > 0) {
+  if (!topKSize.empty()) {
     topKSize[dim] = k;
   }
   set_output_raw_strided(0, topKSize, {}, self.options());
diff --git a/aten/src/ATen/native/SortingUtils.h b/aten/src/ATen/native/SortingUtils.h
index f6065927eba4..7229e01741e6 100644
--- a/aten/src/ATen/native/SortingUtils.h
+++ b/aten/src/ATen/native/SortingUtils.h
@@ -23,7 +23,7 @@ inline void _reduction_with_indices_allocate_or_resize_output(
     bool keepdim) {
   int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
   auto result_sizes = self.sizes().vec();
-  if (result_sizes.size() > 0) {
+  if (!result_sizes.empty()) {
     result_sizes[dim] = 1;
   }
   if (values.defined()) {
@@ -63,7 +63,7 @@ inline void _allocate_or_resize_output_with_indices(
     int64_t k) {
   int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
   auto result_sizes = self.sizes().vec();
-  if (result_sizes.size() > 0) {
+  if (!result_sizes.empty()) {
     result_sizes[dim] = k;
   }
   if (values.defined()) {
diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp
index 1d2be112ad4a..9ec95dd3477c 100644
--- a/aten/src/ATen/native/SpectralOps.cpp
+++ b/aten/src/ATen/native/SpectralOps.cpp
@@ -479,7 +479,7 @@ static Tensor fft_rfftn_impl(Tensor out, const Tensor& self,
                              const c10::optional<c10::string_view>& norm_str) {
   TORCH_CHECK(!self.is_complex(), "rfftn expects a real-valued input tensor, but got ", self.scalar_type());
   auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim);
-  TORCH_CHECK(desc.shape.size() > 0, "rfftn must transform at least one axis");
+  TORCH_CHECK(!desc.shape.empty(), "rfftn must transform at least one axis");
   Tensor input = promote_tensor_fft(self, /*require_complex=*/false);
   Tensor x = resize_fft_input(input, desc.dim, desc.shape);
   const auto norm = static_cast<int64_t>(norm_from_string(norm_str, /*forward=*/true));
@@ -507,7 +507,7 @@ ShapeAndDims canonicalize_fft_c2r_shape_and_dim_args(
     const at::OptionalIntArrayRef& dims,
     int64_t& last_dim_size) {
   auto desc = canonicalize_fft_shape_and_dim_args(self, s, dims);
-  TORCH_CHECK(desc.shape.size() > 0, fname, " must transform at least one axis");
+  TORCH_CHECK(!desc.shape.empty(), fname, " must transform at least one axis");
 
   // Expected output size of the hermitian-symmetric dimension
   last_dim_size = [&] {
@@ -607,7 +607,7 @@ static Tensor fft_ihfftn_impl(
     const Tensor& out) {
   constexpr c10::string_view fname = "ihfftn";
   auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim);
-  TORCH_CHECK(desc.shape.size() > 0, "ihfftn must transform at least one axis");
+  TORCH_CHECK(!desc.shape.empty(), "ihfftn must transform at least one axis");
   auto input = promote_tensor_fft(self, /*require_complex=*/false);
   auto x = resize_fft_input(input, desc.dim, desc.shape);
   const auto norm = static_cast<int64_t>(
@@ -1186,7 +1186,7 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional<int64_t> ho
 void _fft_fill_with_conjugate_symmetry_(const Tensor& input, IntArrayRef dim_) {
   const auto input_sizes = input.sizes();
   const auto input_strides = input.strides();
-  TORCH_CHECK(dim_.size() > 0);
+  TORCH_CHECK(!dim_.empty());
   DimVector dim(dim_.begin(), dim_.end());
   at::maybe_wrap_dims(dim, input_strides.size(), /*wrap_scalars=*/false);
 
diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
index e94ee7078117..24ea40652e82 100644
--- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp
+++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
@@ -221,6 +221,10 @@ TORCH_META_FUNC2(scatter, reduce)
  const Tensor& index,
  const Tensor& src,
  const c10::string_view reduce) {
+  TORCH_WARN_ONCE(
+      "The reduce argument of torch.scatter with Tensor src is deprecated and will be removed ",
+      "in a future PyTorch release. Use torch.scatter_reduce instead for more reduction options."
+  );
   scatter_meta_impl(*this, self, dim, index, src, reduce);
 }
 
@@ -286,11 +290,11 @@ TORCH_PRECOMPUTE_META_FUNC(index_copy)
 
   // Check that source and destination slices have the same size
   auto selfSlicedSizes = self.sizes().vec();
-  if (selfSlicedSizes.size() > 0) {
+  if (!selfSlicedSizes.empty()) {
     selfSlicedSizes.erase(selfSlicedSizes.begin() + dim);
   }
   auto sourceSlicedSizes = source.sizes().vec();
-  if (sourceSlicedSizes.size() > 0) {
+  if (!sourceSlicedSizes.empty()) {
     sourceSlicedSizes.erase(sourceSlicedSizes.begin() + dim);
   }
   if (selfSlicedSizes.size() != sourceSlicedSizes.size() ||
@@ -471,7 +475,7 @@ DEFINE_DISPATCH(scatter_reduce_expanded_index_stub);
 DEFINE_DISPATCH(gather_expanded_index_stub);
 
 static bool all_strides_match(TensorList tensors) {
-  TORCH_CHECK(tensors.size() >= 1);
+  TORCH_CHECK(!tensors.empty());
   auto strides = tensors[0].strides();
   for (auto& tensor : tensors.slice(1)) {
     if (!strides.equals(tensor.strides())) {
@@ -1191,6 +1195,7 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor &
   dim = maybe_wrap_dim(dim, self.dim());
   auto numel = index.numel();
   TORCH_CHECK_INDEX(index.dim() <= 1, "index_select(): Index is supposed to be a vector");
+  TORCH_CHECK(!(self.dim() == 0 && index.numel() > 1), "index_select(): Index to scalar cannot have multiple values.");
   TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, "index_select(): Expected dtype int32 or int64 for index");
   TORCH_CHECK(self.scalar_type() == result.scalar_type(),
               "index_select(): self and result must have the same scalar type");
@@ -1472,9 +1477,9 @@ Tensor gather_backward(const Tensor& grad, const Tensor& self, int64_t dim, cons
     return at::_gather_sparse_backward(self, dim, index, grad);
   }
   auto result = grad.new_zeros_symint(self.sym_sizes());
-  // for composite compliance, use out-of-place variant of
-  // `scatter_add` if index tensor is a Tensor Subclass.
-  if (isTensorSubclassLike(index)) {
+  // for composite, vmap and inductor compliance, use out-of-place variant of
+  // `scatter_add` if index or grad tensors is a Tensor Subclass.
+  if (areAnyTensorSubclassLike({index, grad})) {
     return result.scatter_add(dim, index, grad);
   }
   result.scatter_add_(dim, index, grad);
@@ -1721,8 +1726,6 @@ TORCH_IMPL_FUNC(scatter_reduce_two)
  const c10::string_view reduce,
  bool include_self,
  const Tensor& out) {
-  // See issue https://github.com/pytorch/pytorch/issues/74770
-  TORCH_WARN_ONCE("scatter_reduce() is in beta and the API may change at any time.");
 
   dim = at::maybe_wrap_dim(dim, self.dim());
 
@@ -2084,7 +2087,7 @@ Tensor count_nonzero_cuda(const Tensor& self, IntArrayRef dims){
 }
 
 Tensor count_nonzero_cpu(const Tensor& self, IntArrayRef dims){
-  if (dims.size() > 0) {
+  if (!dims.empty()) {
     return (self != 0).sum(dims);
   }
 
diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp
index 209f0ceffd4b..fb748c23b1d8 100644
--- a/aten/src/ATen/native/TensorConversions.cpp
+++ b/aten/src/ATen/native/TensorConversions.cpp
@@ -52,6 +52,7 @@
 #include <ATen/native/IndexingUtils.h>
 #include <ATen/native/NonSymbolicBC.h>
 #include <c10/core/impl/DeviceGuardImplInterface.h>
+#include <algorithm>
 #include <numeric>
 
 namespace at {
@@ -339,7 +340,7 @@ Tensor _to_copy(
   // at::empty also does not work here because there is no proper at::empty support for quantized tensors
   // as it would return a quantized tensor with an UnknownQuantizer
   auto r = self.is_quantized() ? at::empty_like(self, memory_format)
-                               : at::empty(self.sizes(),
+                               : at::empty_symint(self.sym_sizes(),
                                  options.memory_format(memory_format).pinned_memory(pin_out), c10::nullopt);
   r.copy_(self, non_blocking);
   return r;
@@ -904,9 +905,9 @@ static Tensor dense_to_sparse_compressed(const Tensor& self, IntArrayRef blocksi
   auto values = blocked_layout ? _batch_tile_tensor(self, blocksize, dense_dim) :  self;
   auto not_zero_mask = blocked_layout ? _batch_tile_tensor(self != 0, blocksize, dense_dim) : self != 0;
   if (blocked_layout || dense_dim > 0) {
-    std::vector<int64_t> reduce_dims((blocked_layout ? 2 : 0) + dense_dim);
-    std::iota(reduce_dims.begin(), reduce_dims.end(), n_batch_dim + 2);
-    not_zero_mask = not_zero_mask.sum(reduce_dims) != 0;
+    std::vector<int64_t> reduce_dim((blocked_layout ? 2 : 0) + dense_dim);
+    std::iota(reduce_dim.begin(), reduce_dim.end(), n_batch_dim + 2);
+    not_zero_mask = not_zero_mask.sum(reduce_dim) != 0;
   }
 
   if (is_batched) {
@@ -1066,17 +1067,18 @@ Tensor sparse_compressed_to_flipped(
     values.unsqueeze_(0);
   }
 
-  // NOTE: these sparse_dims are true sparse dims only for CSR/CSC inputs.
-  // And for BSR/BSC these are <true sparse dims> / <blocksize>.
-  // In other words, sparse_dims stores ranges of valid indices in the row/col dims.
-  const auto sparse_dims = [&]() -> at::DimVector {
-    auto sparse_dims = at::DimVector(self.sizes().slice(n_batches, 2));
+  // NOTE: these sparse_dim are true sparse dims only for CSR/CSC
+  // inputs.  And for BSR/BSC these are <true sparse dims> /
+  // <blocksize>.  In other words, sparse_dim stores ranges of valid
+  // indices in the row/col dims.
+  const auto sparse_dim = [&]() -> at::DimVector {
+    auto sparse_dim = at::DimVector(self.sizes().slice(n_batches, 2));
     if (layout == at::kSparseBsr || layout == at::kSparseBsc) {
       auto blocksize = at::sparse_csr::getBlockSize(self);
-      sparse_dims[0] /= blocksize[0];
-      sparse_dims[1] /= blocksize[1];
+      sparse_dim[0] /= blocksize[0];
+      sparse_dim[1] /= blocksize[1];
     }
-    return sparse_dims;
+    return sparse_dim;
   }();
 
   // batch_sizes_nonempty stores at least one, potentially fake, batch dimension.
@@ -1097,7 +1099,7 @@ Tensor sparse_compressed_to_flipped(
   // performance.
   const auto batch_nnz_offset = [&]() -> Tensor {
     const auto wrapped_nnz = at::tensor({nnz}, compressed_indices.options());
-    const auto offset = wrapped_nnz
+    auto offset = wrapped_nnz
       .expand({batch_numel_nonzero})
       .cumsum(-1).sub_(wrapped_nnz)
       .reshape(batch_sizes_nonempty);
@@ -1152,7 +1154,7 @@ Tensor sparse_compressed_to_flipped(
   // To CSC/BSC inputs these indices will appear "transposed".
   const auto is_transposed_indices = layout == at::kSparseCsc || layout == at::kSparseBsc;
   const auto coo_indices_2d_transposed = [&]() -> Tensor {
-    const auto coo_indices_2d = _convert_indices_from_csr_to_coo(
+    auto coo_indices_2d = _convert_indices_from_csr_to_coo(
         compressed_indices_2d,
         plain_indices_2d,
         is_out_int32,
@@ -1167,10 +1169,10 @@ Tensor sparse_compressed_to_flipped(
     // NOTE: we used transposed=true above!
     auto i = coo_indices_2d.select(0, 1);
     auto j = coo_indices_2d.select(0, 0);
-    auto b = i.div(is_transposed_indices ? sparse_dims[1] : sparse_dims[0], "trunc");
+    auto b = i.div(is_transposed_indices ? sparse_dim[1] : sparse_dim[0], "trunc");
     // Modify i, j in-place.
-    i.fmod_(is_transposed_indices ? sparse_dims[1] : sparse_dims[0]);
-    j.add_(b * (is_transposed_indices ? sparse_dims[0] : sparse_dims[1]));
+    i.fmod_(is_transposed_indices ? sparse_dim[1] : sparse_dim[0]);
+    j.add_(b * (is_transposed_indices ? sparse_dim[0] : sparse_dim[1]));
     return coo_indices_2d;
   }();
 
@@ -1182,8 +1184,8 @@ Tensor sparse_compressed_to_flipped(
   // more "weight" (aka stride) placed on the "transposed" dimension.
   const auto coo_indices_2d_transposed_hashed = at::sparse::flatten_indices(
       coo_indices_2d_transposed,
-      is_transposed_indices ? at::DimVector({sparse_dims[0], sparse_dims[1] * batch_numel_nonzero})
-                            : at::DimVector({sparse_dims[1], sparse_dims[0] * batch_numel_nonzero}));
+      is_transposed_indices ? at::DimVector({sparse_dim[0], sparse_dim[1] * batch_numel_nonzero})
+                            : at::DimVector({sparse_dim[1], sparse_dim[0] * batch_numel_nonzero}));
   const auto hash_argsort = std::get<1>(coo_indices_2d_transposed_hashed.sort());
   const auto coo_indices_2d_transposed_sorted = coo_indices_2d_transposed.index_select(1, hash_argsort);
 
@@ -1195,8 +1197,8 @@ Tensor sparse_compressed_to_flipped(
       _convert_indices_from_coo_to_csr(
         new_compressed_indices_coo_2d,
         is_transposed_indices
-          ? batch_numel_nonzero * sparse_dims[0]
-          : batch_numel_nonzero * sparse_dims[1],
+          ? batch_numel_nonzero * sparse_dim[0]
+          : batch_numel_nonzero * sparse_dim[1],
         is_out_int32),
       batch_numel_nonzero,
       is_out_int32)
@@ -1235,6 +1237,22 @@ Tensor sparse_compressed_to_sparse_csr(const Tensor& self, c10::optional<int64_t
       self.layout());
 }
 
+Tensor sparse_compressed_to_sparse_csc(const Tensor& self, c10::optional<int64_t> dense_dim_opt) {
+  if (dense_dim_opt.has_value()) {
+    AT_ERROR("sparse_compressed_to_sparse_csc conversion does not support specifying number of dense dimensions");
+  }
+  if (self.layout() == kSparseCsr) {
+    return sparse_compressed_to_flipped(self, c10::nullopt, "to_sparse_csc");
+  }
+  if (self.layout() == kSparseCsc) {
+    return sparse_compressed_clone(self, c10::nullopt, "to_sparse_csc");
+  }
+  AT_ERROR(
+      "sparse_compressed_to_sparse_csc expected SparseCsr or SparseCsc layout but got ",
+      self.layout());
+  return self;
+}
+
 Tensor coo_to_sparse_csr(const Tensor& self, c10::optional<int64_t> dense_dim_opt) {
   TORCH_CHECK(
       self.sparse_dim() == 2,
@@ -1266,29 +1284,29 @@ Tensor coo_to_sparse_csc(const Tensor& self, c10::optional<int64_t> dense_dim_op
   if (dense_dim_opt.has_value()) {
     AT_ERROR("coo_to_sparse_csc conversion does not support specifying number of dense dimensions");
   }
-  auto coalesced_self = self.transpose(0, 1).coalesce().to_sparse_csr();
+  auto transposed_csr = self.transpose(0, 1).to_sparse_csr(dense_dim_opt);
   return at::native::_sparse_csc_tensor_unsafe(
-      coalesced_self.crow_indices(),
-      coalesced_self.col_indices(),
-      coalesced_self.values(),
+      transposed_csr.crow_indices(),
+      transposed_csr.col_indices(),
+      transposed_csr.values(),
       self.sizes(),
-      coalesced_self.scalar_type(),
+      transposed_csr.scalar_type(),
       c10::kSparseCsc,
-      coalesced_self.device());
+      transposed_csr.device());
 }
 
 Tensor coo_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize, c10::optional<int64_t> dense_dim_opt) {
   if (dense_dim_opt.has_value()) {
     AT_ERROR("coo_to_sparse_bsr conversion does not support specifying number of dense dimensions");
   }
-  return self.to_sparse_csr().to_sparse_bsr(blocksize, dense_dim_opt);
+  return self.to_sparse_csr(dense_dim_opt).to_sparse_bsr(blocksize);
 }
 
 Tensor coo_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, c10::optional<int64_t> dense_dim_opt) {
   if (dense_dim_opt.has_value()) {
     AT_ERROR("coo_to_sparse_bsc conversion does not support specifying number of dense dimensions");
   }
-  return self.to_sparse_bsr(blocksize, dense_dim_opt).to_sparse_bsc(blocksize, dense_dim_opt);
+  return self.to_sparse_csc(dense_dim_opt).to_sparse_bsc(blocksize);
 }
 
 namespace {
@@ -1399,90 +1417,89 @@ TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_cpu)
  * https://github.com/scipy/scipy/blob/8a64c938ddf1ae4c02a08d2c5e38daeb8d061d38/scipy/sparse/sparsetools/csr.h
  * Modified to ensure sorted BSR column indices.
  */
-template <class I, class T>
-void _csr_to_block_csr_cpu_kernel(
-    const I n_row,
-    const I n_col,
-    const I R,
-    const I C,
-    const I* input_crow_indices,
-    const I* input_col_indices,
-    const T* input_values,
-    I* result_crow_indices,
-    I* result_col_indices,
-    T* result_values) {
-  // All blocks are possible, that is, may be allocated if a single non-zero
-  // value lives within them. Otherwise they're not.
-
-  // Allocate pointers for all possible column blocks plus 1
-  std::vector<T*> blocks(n_col / C + 1, (T*)0);
-
-  assert(n_row % R == 0);
-  assert(n_col % C == 0);
-
-  // Major assumptions
-  // 1. Blocks must be square
-
-  // Number of blocks along rows
-  I n_brow = n_row / R;
-  // Number of blocks along columns
-  I n_bcol = n_col / C;
+template <class index_t, class scalar_t, bool compressed_rows>
+void _compressed_to_block_compressed_cpu_kernel(
+    const index_t n_compressed, // Tensor size along compressed dimension
+    const index_t n_plain, // Tensor size along plain dimension
+    const index_t C, // Block size along compressed dimensions
+    const index_t P, // Block size along plain dimension
+    const index_t D, // Number of elements in dense dimensions
+    const index_t* input_compressed_indices,
+    const index_t* input_plain_indices,
+    const scalar_t* input_values,
+    index_t* result_compressed_indices,
+    index_t* result_plain_indices,
+    scalar_t* result_values) {
+  // All blocks are possible, that is, may be allocated if a single
+  // non-zero value lives within them. Otherwise they're not.
+
+  // Allocate pointers for all possible plain blocks plus 1
+  std::vector<scalar_t*> blocks(n_plain / P + 1, nullptr);
+
+  assert(n_compressed % C == 0);
+  assert(n_plain % P == 0);
+
+  // Number of blocks along compressed dim
+  index_t n_bcompressed = n_compressed / C;
+  // Number of blocks along plain_dim
+  index_t n_bplain = n_plain / P;
 
   // Number of elements per block
-  I RC = R * C;
+  index_t CPD = C * P * D;
   // Number of blocks overall
-  I n_blks = 0;
-
-  result_crow_indices[0] = 0;
-
-  // Iterate over blocks along rows
-  for (I block_i = 0; block_i < n_brow; block_i++) {
-    // Iterate over blocks along columns to locate non-zero blocks,
-    // this guarantees sorted block-column indices
-    for (I block_j = 0; block_j < n_bcol; block_j ++) {
-      for (I jj = input_crow_indices[R * block_i]; jj < input_crow_indices[R * (block_i + 1)]; jj++) {
-        I j = input_col_indices[jj]; // column index
-        if (j / C == block_j) {
-          blocks[block_j] = result_values + RC * n_blks;
-          result_col_indices[n_blks] = block_j;
+  index_t n_blks = 0;
+
+  result_compressed_indices[0] = 0;
+
+  // Iterate over blocks along compressed dim
+  for (index_t block_c = 0; block_c < n_bcompressed; block_c++) {
+    // Iterate over blocks along plain dim to locate non-zero blocks,
+    // this guarantees sorted plain dim indices
+    for (index_t block_p = 0; block_p < n_bplain; block_p ++) {
+      for (index_t i = input_compressed_indices[C * block_c]; i < input_compressed_indices[C * (block_c + 1)]; i++) {
+        index_t p = input_plain_indices[i]; // plain dim element index
+        if (p / P == block_p) {
+          blocks[block_p] = result_values + CPD * n_blks;
+          result_plain_indices[n_blks] = block_p;
           n_blks++;
           break;
         }
       }
     }
 
-    // Iterate over rows within block
-    for (I r = 0; r < R; r++) {
-      I i = R * block_i + r; // row index
-      for (I jj = input_crow_indices[i]; jj < input_crow_indices[i + 1]; jj++) {
-        I j = input_col_indices[jj]; // column index
+    // Iterate over compressed dim within block
+    for (index_t cb = 0; cb < C; cb++) {
+      index_t c = C * block_c + cb; // compressed dim index
+      for (index_t i = input_compressed_indices[c]; i < input_compressed_indices[c + 1]; i++) {
+        index_t p = input_plain_indices[i]; // plain dim index
 
-        // Block corresponding to column index
-        I block_j = j / C;
-        // Column within block
-        I c = j % C;
+        // Block corresponding to plain dim index
+        index_t block_p = p / P;
+        // Plain dim index within block
+        index_t pb = p % P;
 
-        // Specific blocks entries should not be visited more than once.
-        // Scipy code does an addition here. Why?
+        // Specific blocks entries should not be visited more than
+        // once.  Scipy code does an addition here. Why?
         // A possible answer: Scipy code supports "uncoalesced CSR"
-        // format that allows repeated columns per row and column
-        // indices may be unsorted.
-        *(blocks[block_j] + C * r + c) = input_values[jj];
+        // format that allows repeated plain dim indices, and
+        // compressed and plain indices may be unsorted.
+        std::copy(input_values + i * D, input_values + (i + 1) * D,
+                  blocks[block_p] + (compressed_rows ? P * cb + pb : C * pb + cb) * D);
       }
     }
 
     // Scipy code has
     /*
-      for (I jj = input_crow_indices[R * block_i];
-           jj < input_crow_indices[R * (block_i + 1)];
-           jj++) {
-             blocks[input_col_indices[jj] / C] = 0;
+      for (I i = input_compressed_indices[C * block_c];
+           i < input_compressed_indices[C * (block_c + 1)];
+           i++) {
+             blocks[input_plain_indices[i] / P] = 0;
            }
     */
-    // but we don't need it because the modified code (see the block_j
-    // loop above) does not need to evaluate `blocks[block_j] == 0`
+    // but we don't need it because the modified code (see the block_p
+    // loop above) does not need to evaluate `blocks[block_p] == 0`
     // that the original code did.
-    result_crow_indices[block_i + 1] = n_blks;
+    result_compressed_indices[block_c + 1] = n_blks;
   }
 }
 
@@ -1490,22 +1507,23 @@ void _csr_to_block_csr_cpu_kernel(
  * Based on
  * https://github.com/scipy/scipy/blob/8a64c938ddf1ae4c02a08d2c5e38daeb8d061d38/scipy/sparse/sparsetools/csr.h
  */
-template <class I>
-I csr_count_blocks(
-    const I n_row,
-    const I n_col,
-    const I R,
-    const I C,
-    const I Ap[],
-    const I Aj[]) {
-  std::vector<I> mask(n_col / C + 1, -1);
-  I n_blks = 0;
-  for (I i = 0; i < n_row; i++) {
-    I bi = i / R;
-    for (I jj = Ap[i]; jj < Ap[i + 1]; jj++) {
-      I bj = Aj[jj] / C;
-      if (mask[bj] != bi) {
-        mask[bj] = bi;
+template <class index_t>
+index_t compressed_count_blocks(
+    const index_t n_compressed, // Tensor size along compressed dimension
+    const index_t n_plain, // Tensor size along plain dimension
+    const index_t C, // Block size along compressed dimensions
+    const index_t P, // Block size along plain dimension
+    const index_t Ac[], // Compressed indices
+    const index_t Ap[] // Plain indices
+  ) {
+  std::vector<index_t> mask(n_plain / P + 1, -1);
+  index_t n_blks = 0;
+  for (index_t c = 0; c < n_compressed; c++) {
+    index_t bc = c / C;
+    for (index_t i = Ac[c]; i < Ac[c + 1]; i++) {
+      index_t bp = Ap[i] / P;
+      if (mask[bp] != bc) {
+        mask[bp] = bc;
         n_blks++;
       }
     }
@@ -1513,15 +1531,11 @@ I csr_count_blocks(
   return n_blks;
 }
 
-Tensor _csr_to_block_csr_cpu(const Tensor& self, IntArrayRef blocksize) {
-  TORCH_CHECK(
-      blocksize[0] == blocksize[1],
-      "blocks must be square. ",
-      "Got (",
-      blocksize[0],
-      ", ",
-      blocksize[1],
-      ") instead.");
+template<Layout target_layout>
+Tensor _compressed_to_block_compressed_cpu(const Tensor& self, IntArrayRef blocksize) {
+  static_assert(target_layout == Layout::SparseBsr || target_layout == Layout::SparseBsc,
+                "invalid layout template parameter for _compressed_to_block_compressed_cpu");
+
   TORCH_CHECK(
       self.size(0) % blocksize[0] == 0 && self.size(1) % blocksize[1] == 0,
       "Block sparse CSR Tensors must have a size that is an ",
@@ -1535,68 +1549,72 @@ Tensor _csr_to_block_csr_cpu(const Tensor& self, IntArrayRef blocksize) {
       ", ",
       blocksize[1],
       ") instead.");
-  Tensor input_values = self.values().contiguous();
-  Tensor input_crow_indices = self.crow_indices().contiguous();
-  Tensor input_col_indices = self.col_indices().contiguous();
 
-  // First we determine the number of blocks needed. For each given block, if it
-  // contains a non-zero element we will allocate values and indices for it.
+  auto input_values = self.values().contiguous();
+  Tensor input_compressed_indices;
+  Tensor input_plain_indices;
+  std::tie(input_compressed_indices, input_plain_indices) = sparse_csr::getCompressedPlainIndices(self);
+  input_compressed_indices = input_compressed_indices.contiguous();
+  input_plain_indices = input_plain_indices.contiguous();
+
+  // First we determine the number of blocks needed. For each given
+  // block, if it contains a non-zero element we will allocate values
+  // and indices for it.
   int64_t num_blocks;
-  int64_t n_row = self.size(0);
-  int64_t n_col = self.size(1);
+  auto compressed_dim = (target_layout == Layout::SparseBsr) ? self.size(0) : self.size(1);
+  auto plain_dim = (target_layout == Layout::SparseBsr) ? self.size(1) : self.size(0);
+  auto compressed_blocksize = (target_layout == Layout::SparseBsr) ? blocksize[0] : blocksize[1];
+  auto plain_blocksize = (target_layout == Layout::SparseBsr) ? blocksize[1] : blocksize[0];
+
   AT_DISPATCH_INDEX_TYPES(
-      input_crow_indices.scalar_type(), "_csr_to_block_csr_cpu", [&] {
-        num_blocks = csr_count_blocks<index_t>(
-            n_row,
-            n_col,
-            blocksize[0],
-            blocksize[1],
-            input_crow_indices.data_ptr<index_t>(),
-            input_col_indices.data_ptr<index_t>());
+      input_compressed_indices.scalar_type(), "_compressed_to_block_compressed_cpu", [&] {
+        num_blocks =
+          compressed_count_blocks<index_t>(
+              compressed_dim,
+              plain_dim,
+              compressed_blocksize,
+              plain_blocksize,
+              input_compressed_indices.data_ptr<index_t>(),
+              input_plain_indices.data_ptr<index_t>());
       });
-  DimVector values_size{num_blocks, blocksize[0], blocksize[1]};
-
-  // While we don't support conversion of hybrid csr-to-bsr yet, we'll
-  // compute hybrid compatible values sizes to meet the invariants of
-  // the BSR tensor when the support will be implemented.
-  int64_t numel_dense = 1;
-  for (int i=0; i<self.dense_dim(); i++) {
-    values_size.push_back(self.size(2 + i));
-    numel_dense *= self.size(2 + i);
-  }
-  TORCH_CHECK(numel_dense == 1, "conversion from hybrid csr to block csr is not supported yet.");
+  DimVector dense_shape{input_values.sizes().slice(1, input_values.dim() - 1)};
+  DimVector values_shape{num_blocks, blocksize[0], blocksize[1]};
+  values_shape.append(dense_shape);
 
-  Tensor result_values =
-      input_values.new_zeros(values_size);
-  Tensor result_crow_indices =
-      input_crow_indices.new_empty({(n_row / blocksize[0]) + 1});
-  Tensor result_col_indices = input_col_indices.new_empty({num_blocks});
+  Tensor result_values = input_values.new_zeros(values_shape);
+  Tensor result_compressed_indices =
+      input_compressed_indices.new_empty({compressed_dim /compressed_blocksize + 1});
+  Tensor result_plain_indices = input_plain_indices.new_empty({num_blocks});
 
   // Next we copy over non-zero elements into the allocated blocks.
+  auto n_dense = std::accumulate(
+      dense_shape.begin(), dense_shape.end(), 1, std::multiplies<int64_t>());
   AT_DISPATCH_INDEX_TYPES(
-      input_crow_indices.scalar_type(), "_csr_to_block_csr_cpu", [&] {
+      input_compressed_indices.scalar_type(), "_compressed_to_block_compressed_cpu", [&] {
         AT_DISPATCH_SPARSE_VALUE_TYPES(
-            input_values.scalar_type(), "_csr_to_block_csr_cpu", [&] {
-              _csr_to_block_csr_cpu_kernel<index_t, scalar_t>(
-                  n_row,
-                  n_col,
-                  blocksize[0],
-                  blocksize[1],
-                  input_crow_indices.data_ptr<index_t>(),
-                  input_col_indices.data_ptr<index_t>(),
+            input_values.scalar_type(), "_compressed_to_block_compressed_cpu", [&] {
+              _compressed_to_block_compressed_cpu_kernel<index_t, scalar_t, target_layout == Layout::SparseBsr>(
+                  compressed_dim,
+                  plain_dim,
+                  compressed_blocksize,
+                  plain_blocksize,
+                  n_dense,
+                  input_compressed_indices.data_ptr<index_t>(),
+                  input_plain_indices.data_ptr<index_t>(),
                   input_values.data_ptr<scalar_t>(),
-                  result_crow_indices.data_ptr<index_t>(),
-                  result_col_indices.data_ptr<index_t>(),
+                  result_compressed_indices.data_ptr<index_t>(),
+                  result_plain_indices.data_ptr<index_t>(),
                   result_values.data_ptr<scalar_t>());
             });
       });
-  return at::native::_sparse_bsr_tensor_unsafe(
-      result_crow_indices,
-      result_col_indices,
+
+  return at::native::_sparse_compressed_tensor_unsafe(
+      result_compressed_indices,
+      result_plain_indices,
       result_values,
       self.sizes(),
       result_values.scalar_type(),
-      c10::kSparseBsr,
+      target_layout,
       result_values.device());
 }
 
@@ -1619,39 +1637,19 @@ Tensor sparse_compressed_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize
     return sparse_compressed_clone(self, blocksize, "to_sparse_bsr");
   }
   if (self.layout() == kSparseCsr) {
-    TORCH_CHECK(self.dim() == 2,
-        "to_sparse_bsr(): conversion from Csr to Bsr is only possible for 2d inputs, ",
-        "but got input of dimension ", self.dim(), " instead.");
-    Tensor self_values = self.values();
-    Tensor self_crow_indices = self.crow_indices();
-    Tensor self_col_indices = self.col_indices();
-    Tensor cpu_result = _csr_to_block_csr_cpu(
-        _sparse_csr_tensor_unsafe(
-            self_crow_indices.cpu(),
-            self_col_indices.cpu(),
-            self_values.cpu(),
-            self.sizes(),
-            self_values.scalar_type(),
-            self.layout(),
-            at::kCPU),
-        blocksize);
-    Tensor result_values = cpu_result.values().to(self_values.options());
-    Tensor result_crow_indices =
-        cpu_result.crow_indices().to(self_crow_indices.options());
-    Tensor result_col_indices =
-        cpu_result.col_indices().to(self_col_indices.options());
-    return at::native::_sparse_bsr_tensor_unsafe(
-        result_crow_indices,
-        result_col_indices,
-        result_values,
-        self.sizes(),
-        result_values.scalar_type(),
-        c10::kSparseBsr,
-        result_values.device());
+    TORCH_CHECK(self.dim() == 2 + self.dense_dim(),
+                "to_sparse_bsr: conversion from Csr to Bsr for batched inputs is not implemented.");
+
+    if (self.device() != kCPU) {
+      TORCH_WARN("sparse_compressed_to_sparse_bsr executing on the CPU device, the performance may be sub-optimal");
+    }
+    return _compressed_to_block_compressed_cpu<kSparseBsr>(self.cpu(), blocksize).to(self.device());
   }
-  AT_ERROR(
-      "sparse_compressed_to_sparse_bsr expected SparseCsr, SparseBsr or SparseBsc layout but got ",
-      self.layout());
+  if (self.layout() == kSparseCsc) {
+    return self.to_sparse_csr(dense_dim_opt).to_sparse_bsr(blocksize);
+  }
+
+  AT_ERROR("sparse_compressed_to_sparse_bsr: expected SparseCsr, SparseCsc, SparseBsr or SparseBsc layout but got ", self.layout());
   return self;
 }
 
@@ -1673,25 +1671,20 @@ Tensor sparse_compressed_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize
                 "[blocksize=", blocksize,"] is not implemented.");
     return sparse_compressed_clone(self, blocksize, "to_sparse_bsc");
   }
-  AT_ERROR(
-      "sparse_compressed_to_sparse_bsc expected SparseBsr or SparseBsc layout but got ",
-      self.layout());
-  return self;
-}
+  if (self.layout() == kSparseCsc) {
+    TORCH_CHECK(self.dim() == 2 + self.dense_dim(),
+                "to_sparse_bsc: conversion from Csc to Bsc for batched inputs is not implemented.");
 
-Tensor sparse_compressed_to_sparse_csc(const Tensor& self, c10::optional<int64_t> dense_dim_opt) {
-  if (dense_dim_opt.has_value()) {
-    AT_ERROR("sparse_compressed_to_sparse_csc conversion does not support specifying number of dense dimensions");
+    if (self.device() != kCPU) {
+      TORCH_WARN("sparse_compressed_to_sparse_bsc executing on the CPU device, the performance may be sub-optimal");
+    }
+    return _compressed_to_block_compressed_cpu<kSparseBsc>(self.cpu(), blocksize).to(self.device());
   }
   if (self.layout() == kSparseCsr) {
-    return sparse_compressed_to_flipped(self, c10::nullopt, "to_sparse_csc");
+    return self.to_sparse_csc(dense_dim_opt).to_sparse_bsc(blocksize);
   }
-  if (self.layout() == kSparseCsc) {
-    return sparse_compressed_clone(self, c10::nullopt, "to_sparse_csc");
-  }
-  AT_ERROR(
-      "sparse_compressed_to_sparse_csc expected SparseCsr or SparseCsc layout but got ",
-      self.layout());
+
+  AT_ERROR("sparse_compressed_to_sparse_bsc: expected SparseCsr, SparseCsc, SparseBsr or SparseBsc layout but got ", self.layout());
   return self;
 }
 
diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp
index 4c0ba048eca8..cf98348abe6b 100644
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@@ -46,6 +46,7 @@
 #include <ATen/ops/empty_like.h>
 #include <ATen/ops/empty_like_native.h>
 #include <ATen/ops/empty_native.h>
+#include <ATen/ops/empty_permuted_native.h>
 #include <ATen/ops/empty_strided.h>
 #include <ATen/ops/empty_strided_native.h>
 #include <ATen/ops/eye.h>
@@ -278,6 +279,45 @@ Tensor empty_names(
   return result;
 }
 
+Tensor empty_permuted_symint(SymIntArrayRef size, IntArrayRef physical_layout, c10::optional<ScalarType> dtype_opt,
+  c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt
+) {
+  // size is logical; aka, the output size you'll get from the operation overall
+  //
+  // physical_layout follows NCHW/NHWC convention:
+  // contiguous is [0,1,2,3], channels last is [0,2,3,1]
+  //
+  // this means if i is physical index, physical_layout[i] is logical index;
+  // e.g., to find what is innermost physical dim (3), query NHWC[3] == 1
+  // (aka it is channels)
+  int64_t dim = static_cast<int64_t>(size.size());
+  SymDimVector phys_size(dim);
+  TORCH_CHECK(static_cast<int64_t>(physical_layout.size()) == dim,
+    "Number of dimensions in size does not match the "
+    "length of the physical_layout; i.e. len(size) = ", dim,
+    " is not equal to len(physical_layout) = ", physical_layout.size());
+  std::vector<bool> seen_dims(dim);
+  for (const auto i : c10::irange(dim)) {
+    TORCH_CHECK(physical_layout[i] >= 0 && physical_layout[i] < dim,
+      "Dimension out of range (expected to be between 0 and ", dim - 1, ", but got ",
+      physical_layout[i], " at index ", i, ").  NB: negative dims "
+      "not currently supported; file an issue if you want it.");
+    TORCH_CHECK(!seen_dims[physical_layout[i]], "Duplicate dim not allowed");
+    phys_size[i] = size[physical_layout[i]];
+    seen_dims[physical_layout[i]] = true;
+  }
+  // do a contiguous allocation
+  Tensor phys_tensor = at::empty_symint(phys_size, dtype_opt, layout_opt, device_opt, pin_memory_opt, c10::nullopt);
+  SymIntArrayRef phys_strides = phys_tensor.sym_strides();
+  // permute the strides (inverse permutation!  This is why this is
+  // empty_permute*d*, not empty_permute; it's not an empty + permute)
+  SymDimVector strides(dim);
+  for (const auto i : c10::irange(dim)) {
+    strides[physical_layout[i]] = phys_strides[i];
+  }
+  return phys_tensor.as_strided_symint(size, strides);
+}
+
 Tensor empty_strided_cpu(IntArrayRef size, IntArrayRef stride, c10::optional<ScalarType> dtype_opt,
                          c10::optional<Layout> layout_opt, c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt) {
   return at::detail::empty_strided_cpu(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt);
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 1cd231b6719f..26b03289494c 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -253,7 +253,7 @@ TORCH_PRECOMPUTE_META_FUNC(cat)(const ITensorListRef& tensors, int64_t dim) {
   auto maybe_outnames = namedinference::compute_cat_outnames(materialized);
 
   TORCH_CHECK(
-      materialized.size() > 0, "torch.cat(): expected a non-empty list of Tensors");
+      !materialized.empty(), "torch.cat(): expected a non-empty list of Tensors");
 
   // Look for the first valid tensor.
   size_t valid = materialized.size();
@@ -523,7 +523,7 @@ Tensor sparse_broadcast_to(const Tensor& self, IntArrayRef size) {
 
   Tensor new_values = values.expand(broadcast_dense_sizes).repeat_interleave(nnz_factor, 0);
   Tensor new_indices = indices.new_empty(new_indices_size);
-  if (broadcast_sizes.size()>0) {
+  if (!broadcast_sizes.empty()) {
     // ones(broadcast_sizes).nonzero() is equivalent to
     // product(map(arange, broadcast_sizes)) but avoids creating
     // auxilary arange tensors
@@ -825,7 +825,7 @@ Tensor cat_sparse(const ITensorListRef& tensors, int64_t dim) {
 
 Tensor block_diag(TensorList tensors) {
   Tensor result;
-  if (tensors.size() == 0) {
+  if (tensors.empty()) {
     result = at::empty({1, 0});
     return result;
   }
@@ -1567,6 +1567,11 @@ Tensor reshape_symint(const Tensor& self, c10::SymIntArrayRef proposed_shape) {
   if (self.is_sparse()) {
     AT_ERROR("reshape is not implemented for sparse tensors");
   }
+
+  if (self.is_contiguous() && !self.is_mkldnn()) {
+    return self.view_symint(proposed_shape);
+  }
+
   c10::SymDimVector shape = infer_size_dv(proposed_shape, self.sym_numel());
 
   if (self.is_mkldnn()) {
@@ -1669,7 +1674,7 @@ Tensor _reshape_alias(const Tensor& self, IntArrayRef sizes, IntArrayRef strides
 }
 
 Tensor reshape_as(const Tensor& self, const Tensor& other) {
-  return self.reshape(other.sizes());
+  return self.reshape_symint(other.sym_sizes());
 }
 
 static Tensor select_sparse(const Tensor& self, int64_t dim, int64_t index) {
@@ -1804,7 +1809,7 @@ Tensor select_symint(const Tensor& self, int64_t dim, c10::SymInt index) {
 
 Tensor select_backward_symint(const Tensor& grad, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index) {
   auto grad_input = at::zeros_symint(input_sizes, grad.options());
-  grad_input.select_symint(dim, index).copy_(grad);
+  grad_input.select_symint(dim, std::move(index)).copy_(grad);
   return grad_input;
 }
 
@@ -2655,7 +2660,7 @@ void check_stack_inputs(TensorList tensors, int64_t dim) {
 
 // TODO(msubkhankulov): refactor to use _stack
 Tensor stack(TensorList tensors, int64_t dim) {
-  TORCH_CHECK(tensors.size() > 0,
+  TORCH_CHECK(!tensors.empty(),
            "stack expects a non-empty TensorList");
   auto wrapped_dim = maybe_wrap_dim(dim, tensors[0].ndimension()+1);
   if (wrapped_dim < tensors[0].ndimension() && !tensors[0].is_sparse()) {
@@ -2685,7 +2690,7 @@ Tensor& _stack_out(TensorList tensors, int64_t dim, Tensor& result) {
 
 // TODO(msubkhankulov): refactor to use _stack_out
 Tensor& stack_out(TensorList tensors, int64_t dim, Tensor& result) {
-  TORCH_CHECK(tensors.size() > 0,
+  TORCH_CHECK(!tensors.empty(),
            "stack expects a non-empty TensorList");
   auto wrapped_dim = maybe_wrap_dim(dim, tensors[0].ndimension()+1);
   if (wrapped_dim < tensors[0].ndimension() && !tensors[0].is_sparse()) {
@@ -2708,7 +2713,7 @@ Tensor& stack_out(TensorList tensors, int64_t dim, Tensor& result) {
 }
 
 Tensor hstack(TensorList tensors) {
-  TORCH_CHECK(tensors.size() > 0,
+  TORCH_CHECK(!tensors.empty(),
            "hstack expects a non-empty TensorList");
   auto rep = at::atleast_1d(tensors);
   if (rep[0].dim() == 1) {
@@ -2718,7 +2723,7 @@ Tensor hstack(TensorList tensors) {
 }
 
 Tensor& hstack_out(TensorList tensors, Tensor& result) {
-  TORCH_CHECK(tensors.size() > 0,
+  TORCH_CHECK(!tensors.empty(),
            "hstack expects a non-empty TensorList");
   auto rep = at::atleast_1d(tensors);
   if (rep[0].dim() == 1) {
@@ -2728,27 +2733,27 @@ Tensor& hstack_out(TensorList tensors, Tensor& result) {
 }
 
 Tensor vstack(TensorList tensors) {
-  TORCH_CHECK(tensors.size() > 0,
+  TORCH_CHECK(!tensors.empty(),
            "vstack expects a non-empty TensorList");
   auto rep = at::atleast_2d(tensors);
   return at::cat(rep, 0);
 }
 
 Tensor& vstack_out(TensorList tensors, Tensor& result) {
-  TORCH_CHECK(tensors.size() > 0,
+  TORCH_CHECK(!tensors.empty(),
            "vstack expects a non-empty TensorList");
   auto rep = at::atleast_2d(tensors);
   return at::cat_out(result, rep, 0);
 }
 
 Tensor dstack(TensorList tensors) {
-  TORCH_CHECK(tensors.size() > 0,
+  TORCH_CHECK(!tensors.empty(),
            "dstack expects a non-empty TensorList");
   auto rep = at::atleast_3d(tensors);
   return at::cat(rep, 2);
 }
 Tensor& dstack_out(TensorList tensors, Tensor& result) {
-  TORCH_CHECK(tensors.size() > 0,
+  TORCH_CHECK(!tensors.empty(),
            "dstack expects a non-empty TensorList");
   auto rep = at::atleast_3d(tensors);
   return at::cat_out(result, rep, 2);
@@ -2812,7 +2817,7 @@ static std::vector<Tensor> reshape_input_for_column_stack(TensorList tensors) {
 }
 
 Tensor& column_stack_out(TensorList tensors, Tensor& result) {
-  TORCH_CHECK(tensors.size() > 0,
+  TORCH_CHECK(!tensors.empty(),
               "column_stack expects a non-empty TensorList");
 
   auto reshaped_tensors = reshape_input_for_column_stack(tensors);
@@ -2820,7 +2825,7 @@ Tensor& column_stack_out(TensorList tensors, Tensor& result) {
 }
 
 Tensor column_stack(TensorList tensors) {
-  TORCH_CHECK(tensors.size() > 0,
+  TORCH_CHECK(!tensors.empty(),
               "column_stack expects a non-empty TensorList");
 
   auto reshaped_tensors = reshape_input_for_column_stack(tensors);
@@ -3374,7 +3379,7 @@ Tensor flatten(const Tensor& self, Dimname start_dim, Dimname end_dim, Dimname o
 
 Tensor flatten(const Tensor& self, DimnameList dims, Dimname out_dim) {
   auto positions = dimnames_to_positions(self, dims);
-  TORCH_CHECK(positions.size() > 0,
+  TORCH_CHECK(!positions.empty(),
       "flatten(tensor, dims, out_dim): dims cannot be empty");
   for (const auto i : c10::irange(positions.size() - 1)) {
     if (positions[i] + 1 == positions[i + 1]) continue;
@@ -3413,7 +3418,7 @@ static inline void handle_unflatten_exception(const std::runtime_error &e,
 Tensor unflatten_impl(const Tensor& self, int64_t dim, IntArrayRef sizes, c10::optional<DimnameList> names) {
   dim = maybe_wrap_dim(dim, self.dim());
 
-  TORCH_CHECK(sizes.size() > 0, "unflatten: sizes must be non-empty");
+  TORCH_CHECK(!sizes.empty(), "unflatten: sizes must be non-empty");
   TORCH_INTERNAL_ASSERT(!names || names->size() == sizes.size());
   if (self.has_names()) {
     TORCH_CHECK(names, "unflatten: input is a named tensor but no names were given for unflattened sizes");
@@ -3458,7 +3463,7 @@ Tensor unflatten(const Tensor& self, Dimname dim, IntArrayRef sizes, DimnameList
 }
 
 Tensor view_as(const Tensor& self, const Tensor& other) {
-  return self.view(other.sizes());
+  return self.view_symint(other.sym_sizes());
 }
 
 int64_t numel(const Tensor& self) {
@@ -3541,17 +3546,18 @@ std::vector<Tensor> meshgrid(TensorList tensors,
                 "but received: ", indexing);
   }
 
-  std::vector<int64_t> shape(size);
+  std::vector<c10::SymInt> shape(size);
   for(const auto i: c10::irange(size)){
     TORCH_CHECK(tensor_refs[i].get().dim() <= 1,
                 "torch.meshgrid: Expected 0D or 1D tensor in the tensor list but got: ", tensor_refs[i]);
-    shape[i] = tensor_refs[i].get().numel();  // treat 0D tensors as if they were a 1D tensor
+    shape[i] = tensor_refs[i].get().sym_numel();  // treat 0D tensors as if they were a 1D tensor
   }
   std::vector<Tensor> grids;
-  std::vector<int64_t> view_shape(size, 1);
+  grids.reserve(size);
+  std::vector<c10::SymInt> view_shape(size, 1);
   for(const auto i: c10::irange(size)){
     view_shape[i] = -1;  // select this dimension to infer
-    grids.push_back(tensor_refs[i].get().view(view_shape).expand(shape));
+    grids.push_back(tensor_refs[i].get().view_symint(view_shape).expand_symint(shape));
     view_shape[i] = 1;  // restore to previous value
   }
 
@@ -3879,7 +3885,7 @@ at::Tensor clone_preserve_strides(const at::Tensor& self) {
   auto nbytes = self.storage().sym_nbytes();
   TORCH_INTERNAL_ASSERT(nbytes % dtype_size == 0);
   auto numel = nbytes / dtype_size;
-  auto self_full_size = self.as_strided_symint({numel}, {1}, 0);
+  auto self_full_size = self.as_strided_symint({std::move(numel)}, {1}, 0);
   auto clone = self_full_size.clone();
   auto out = clone.as_strided_symint(self.sym_sizes(), self.sym_strides(), self.sym_storage_offset());
   return out;
@@ -3896,7 +3902,7 @@ at::Tensor slice_scatter(const at::Tensor& self, const at::Tensor& src, int64_t
 }
 at::Tensor select_scatter_symint(const at::Tensor& self, const at::Tensor& src, int64_t dim, c10::SymInt index) {
     auto output = clone_preserve_strides(self);
-    auto slice = output.select_symint(dim, index);
+    auto slice = output.select_symint(dim, std::move(index));
     TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
     slice.copy_(src);
     return output;
@@ -3932,133 +3938,7 @@ at::Tensor lift_fresh(const at::Tensor& self) {
     return self;
 }
 
-at::Tensor& _fw_primal_copy_out(const at::Tensor & self, int64_t level, at::Tensor & out) {
-  auto tmp = self._fw_primal(level);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& _make_dual_copy_out(const at::Tensor & primal, const at::Tensor & tangent, int64_t level, at::Tensor & out) {
-  auto tmp = at::_make_dual(primal, tangent, level);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& view_as_real_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = at::view_as_real(self);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& view_as_complex_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = at::view_as_complex(self);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& _conj_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = self._conj();
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& _neg_view_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = self._neg_view();
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& as_strided_copy_out_symint(const at::Tensor & self, at::SymIntArrayRef size, at::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset, at::Tensor & out) {
-  auto tmp = self.as_strided_symint(size, stride, std::move(storage_offset));
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& _sparse_broadcast_to_copy_out(const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) {
-  auto tmp = at::_sparse_broadcast_to(self, size);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& diagonal_copy_out(const at::Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out) {
-  TORCH_CHECK(
-    out.device() == self.device(),
-    "diagonal_copy: Expected out and self tensors to be on the same device, but got ",
-    "out on ", out.device(), " and self on ", self.device());
-  auto result = self.diagonal(offset, dim1, dim2);
-  at::native::resize_output(out, result.sizes());
-  TORCH_CHECK(
-      canCast(result.scalar_type(), out.scalar_type()),
-      "diagonal_copy: result type ", result.scalar_type(), " can't be cast to the desired out= type ", out.scalar_type());
-  out.copy_(result);
-  return out;
-}
-
-
-at::Tensor& expand_copy_SymInt_out(const at::Tensor & self, c10::SymIntArrayRef size, bool implicit, at::Tensor & out) {
-  auto tmp = self.expand_symint(size, implicit);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& expand_copy_out_symint(const at::Tensor & self, at::SymIntArrayRef size, bool implicit, at::Tensor & out) {
-  auto tmp = self.expand_symint(size, implicit);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& narrow_copy_out(const at::Tensor & self, int64_t dim, int64_t start, int64_t length, at::Tensor & out) {
-  auto tmp = self.narrow(dim, start, length);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& permute_copy_out(const at::Tensor & self, at::IntArrayRef dims, at::Tensor & out) {
-  auto tmp = self.permute(dims);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& _reshape_alias_copy_out(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, at::Tensor & out) {
-  auto tmp = self._reshape_alias(size, stride);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& select_copy_symint_out(const at::Tensor & self, int64_t dim, c10::SymInt index, at::Tensor & out) {
-  auto tmp = self.select_symint(dim, index);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& detach_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = self.detach();
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& slice_copy_Tensor_out(const at::Tensor & self, int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step, at::Tensor & out) {
-  auto tmp = self.slice(dim, start, end, step);
-  out.copy_(tmp);
-  return out;
-}
-
-
+// Autogen kernels for tensor list ops dont work on XLA. TODO(jakeszwe)
 void split_copy_Tensor_out(const at::Tensor & self, int64_t split_size, int64_t dim, at::TensorList  out) {
   auto tmp = self.split(split_size, dim);
 
@@ -4068,7 +3948,6 @@ void split_copy_Tensor_out(const at::Tensor & self, int64_t split_size, int64_t
   }
 }
 
-
 void split_with_sizes_copy_out(const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim, at::TensorList  out) {
   auto tmp = self.split_with_sizes(split_sizes, dim);
 
@@ -4078,91 +3957,6 @@ void split_with_sizes_copy_out(const at::Tensor & self, at::IntArrayRef split_si
   }
 }
 
-
-at::Tensor& squeeze_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = self.squeeze();
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& squeeze_copy_dim_out(const at::Tensor & self, int64_t dim, at::Tensor & out) {
-  auto tmp = self.squeeze(dim);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& squeeze_copy_dims_out(const at::Tensor & self, IntArrayRef dims, at::Tensor & out) {
-  auto tmp = self.squeeze(dims);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& t_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = self.t();
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& transpose_copy_int_out(const at::Tensor & self, int64_t dim0, int64_t dim1, at::Tensor & out) {
-  auto tmp = self.transpose(dim0, dim1);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& unsqueeze_copy_out(const at::Tensor & self, int64_t dim, at::Tensor & out) {
-  auto tmp = self.unsqueeze(dim);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& _indices_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = self._indices();
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& _values_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = self._values();
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& indices_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = self.indices();
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& values_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = self.values();
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& crow_indices_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = self.crow_indices();
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& col_indices_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = self.col_indices();
-  out.copy_(tmp);
-  return out;
-}
-
-
 void unbind_copy_int_out(const at::Tensor & self, int64_t dim, at::TensorList  out) {
   auto tmp = self.unbind(dim);
 
@@ -4172,34 +3966,6 @@ void unbind_copy_int_out(const at::Tensor & self, int64_t dim, at::TensorList  o
   }
 }
 
-
-at::Tensor& view_copy_out_symint(const at::Tensor & self, at::SymIntArrayRef size, at::Tensor & out) {
-  auto tmp = self.view_symint(size);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& view_copy_dtype_out(const at::Tensor & self, at::ScalarType dtype, at::Tensor & out) {
-  auto tmp = self.view(dtype);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& unfold_copy_out(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step, at::Tensor & out) {
-  auto tmp = self.unfold(dimension, size, step);
-  out.copy_(tmp);
-  return out;
-}
-
-
-at::Tensor& alias_copy_out(const at::Tensor & self, at::Tensor & out) {
-  auto tmp = self.alias();
-  out.copy_(tmp);
-  return out;
-}
-
 int64_t sparse_dim_strided(const at::Tensor& self) {
   return 0;
 }
diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp
index 768fb56b6de7..7802a177121b 100644
--- a/aten/src/ATen/native/TensorTransformations.cpp
+++ b/aten/src/ATen/native/TensorTransformations.cpp
@@ -99,7 +99,7 @@ Tensor flip(const Tensor& self, IntArrayRef dims) {
   return out_tensor;
 }
 
-Tensor roll_cpu(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) {
+Tensor roll(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { // Used by CPU and MPS dispatch.
   if (dims.size() != 1 || shifts.size() != 1) {
     return roll_common(self, shifts, dims);
   }
diff --git a/aten/src/ATen/native/TensorTransformations.h b/aten/src/ATen/native/TensorTransformations.h
index 4909ebe84bb0..f17c96c7bdb7 100644
--- a/aten/src/ATen/native/TensorTransformations.h
+++ b/aten/src/ATen/native/TensorTransformations.h
@@ -12,8 +12,8 @@ namespace at {
 namespace native {
 
 static inline Tensor roll_common(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) {
-  TORCH_CHECK(shifts.size() > 0, "`shifts` required");
-  if (dims.size() == 0 && shifts.size() == 1) {
+  TORCH_CHECK(!shifts.empty(), "`shifts` required");
+  if (dims.empty() && shifts.size() == 1) {
     auto flattened = self.contiguous().view(self.numel());
     return roll(flattened, shifts[0], 0).view(self.sizes());
   }
diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp
index 0444b15968a0..4dcb4ce71a17 100644
--- a/aten/src/ATen/native/Unique.cpp
+++ b/aten/src/ATen/native/Unique.cpp
@@ -28,14 +28,20 @@
 #include <unordered_set>
 
 namespace std {
-  template<> struct hash<at::BFloat16>
-    {
-        size_t operator()(const at::BFloat16& v) const noexcept
-        {
-            return std::hash<uint16_t>()(v.x);
-        }
-    };
-}
+template <>
+struct hash<at::BFloat16> {
+  size_t operator()(const at::BFloat16& v) const noexcept {
+    return std::hash<uint16_t>()(v.x);
+  }
+};
+
+template <>
+struct hash<at::Half> {
+  size_t operator()(const at::Half& v) const noexcept {
+    return std::hash<uint16_t>()(v.x);
+  }
+};
+} // namespace std
 
 namespace at {
 namespace native{
@@ -315,7 +321,7 @@ std::tuple<Tensor, Tensor, Tensor> _unique_dim_cpu_template(
 
 std::tuple<Tensor, Tensor>
 _unique_cpu(const Tensor& self, const bool sorted, const bool return_inverse) {
-  return AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "unique", [&] {
+  return AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kBool, kHalf, self.scalar_type(), "unique", [&] {
     Tensor output, inverse;
     std::tie(output, inverse, std::ignore) = unique_cpu_template<scalar_t>(self, sorted, return_inverse, false);
     return std::make_tuple(output, inverse);
@@ -324,14 +330,14 @@ _unique_cpu(const Tensor& self, const bool sorted, const bool return_inverse) {
 
 std::tuple<Tensor, Tensor, Tensor>
 _unique2_cpu(const Tensor& self, const bool sorted, const bool return_inverse, const bool return_counts) {
-  return AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "unique", [&] {
+  return AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kBool, kHalf, self.scalar_type(), "unique", [&] {
     return unique_cpu_template<scalar_t>(self, sorted, return_inverse, return_counts);
   });
 }
 
 std::tuple<Tensor, Tensor, Tensor>
 unique_dim_cpu(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse, const bool return_counts) {
-  return AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "unique_dim", [&] {
+  return AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kBool, kHalf, self.scalar_type(), "unique_dim", [&] {
     // The current implementation using `dim` always sorts due to unhashable tensors
     return _unique_dim_cpu_template<scalar_t>(self, dim, false, return_inverse, return_counts);
   });
@@ -339,7 +345,7 @@ unique_dim_cpu(const Tensor& self, const int64_t dim, const bool sorted, const b
 
 std::tuple<Tensor, Tensor, Tensor>
 unique_dim_consecutive_cpu(const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts) {
-  return AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "unique_dim", [&] {
+  return AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kBool, kHalf, self.scalar_type(), "unique_dim", [&] {
     return _unique_dim_cpu_template<scalar_t>(self, dim, true, return_inverse, return_counts);
   });
 }
@@ -347,7 +353,7 @@ unique_dim_consecutive_cpu(const Tensor& self, const int64_t dim, const bool ret
 std::tuple<Tensor, Tensor, Tensor>
 unique_consecutive_cpu(const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional<int64_t> dim) {
   if (!dim.has_value() || (dim.value() == 0 && self.dim() == 1)) {
-    return AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "unique", [&] {
+    return AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kBool, kHalf, self.scalar_type(), "unique", [&] {
       return unique_consecutive_cpu_template<scalar_t>(self, return_inverse, return_counts);
     });
   }
diff --git a/aten/src/ATen/native/UpSample.cpp b/aten/src/ATen/native/UpSample.cpp
index 1a6af7526030..02cf9a6864c6 100644
--- a/aten/src/ATen/native/UpSample.cpp
+++ b/aten/src/ATen/native/UpSample.cpp
@@ -3,6 +3,7 @@
 
 #include <ATen/native/UpSample.h>
 #include <c10/util/irange.h>
+#include <c10/util/TypeCast.h>
 
 namespace at {
 namespace native {
@@ -23,7 +24,8 @@ TORCH_API c10::SmallVector<int64_t, 3> compute_output_size(
     TORCH_CHECK(static_cast<int64_t>(scale_factors->size()) == spatial_dimensions);
     c10::SmallVector<int64_t, 3> ret;
     for (const auto i : c10::irange(spatial_dimensions)) {
-      ret.push_back(static_cast<double>(input_size[i+2]) * scale_factors.value()[i]);
+      const double odim = static_cast<double>(input_size[i+2]) * scale_factors.value()[i];
+      ret.push_back(c10::checked_convert<int64_t>(odim, "int64_t"));
     }
     return ret;
   }
diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
index a9e8cf2243f0..d0393aaf18bf 100644
--- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@@ -11,6 +11,7 @@
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/cpu/Loops.h>
 #include <ATen/native/Math.h>
+#include <ATen/native/cpu/LogAddExp.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/TypeSafeSignMath.h>
 #include <c10/util/copysign.h>
@@ -893,6 +894,14 @@ void logaddexp_kernel(TensorIteratorBase& iter) {
               (a1 == b1) & (a1.abs() == inf));
           return convert_float_bfloat16(a0, a1);
         });
+  } else if (isComplexType(iter.dtype())) {
+    AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "logaddexp_cpu", [&]() {
+      cpu_kernel(
+        iter,
+        [=](scalar_t a, scalar_t b) -> scalar_t {
+          return _log_add_exp_helper(a, b);
+        });
+    });
   } else {
     AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "logaddexp_cpu", [&]() {
       cpu_kernel_vec(
diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp
index de463b516e6d..5b9d844b7a37 100644
--- a/aten/src/ATen/native/cpu/DistributionKernels.cpp
+++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp
@@ -109,6 +109,8 @@ void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional<G
 }
 #else
 void exponential_kernel(TensorIteratorBase &iter, double lambda, c10::optional<Generator> gen) {
+  TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype());
+
   Tensor self = iter.tensor(0);
   if (lambda > 0 && !std::isinf(lambda) && !std::isnan(lambda)) {
     CPUGeneratorImpl* generator = get_generator_or_default<CPUGeneratorImpl>(gen, detail::getDefaultCPUGenerator());
@@ -124,7 +126,7 @@ void exponential_kernel(TensorIteratorBase &iter, double lambda, c10::optional<G
     int64_t n = self.numel();
     bool contig = self.is_contiguous();
 
-    AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "exponential_cpu", [&] {
+    AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "exponential_cpu", [&] {
       at::Tensor tmp_tensor;
       constexpr bool is_df = std::is_same<scalar_t, float>::value || std::is_same<scalar_t, double>::value;
       if (is_df && contig) {
diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h
index 37c799803eaf..ebe05c944a0d 100644
--- a/aten/src/ATen/native/cpu/DistributionTemplates.h
+++ b/aten/src/ATen/native/cpu/DistributionTemplates.h
@@ -290,6 +290,7 @@ struct GeometricKernel {
 
 template<typename RNG>
 void exponential_kernel(TensorIteratorBase& iter, double lambda, RNG generator) {
+  TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype());
   AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cpu", [&]() {
     std::lock_guard<std::mutex> lock(generator->mutex_);
     at::exponential_distribution<double> exponential(lambda);
diff --git a/aten/src/ATen/native/cpu/IndexKernel.cpp b/aten/src/ATen/native/cpu/IndexKernel.cpp
index b69f9a8a7909..7ac9c3ff6070 100644
--- a/aten/src/ATen/native/cpu/IndexKernel.cpp
+++ b/aten/src/ATen/native/cpu/IndexKernel.cpp
@@ -569,6 +569,145 @@ void cpu_vflip_memcpy(at::TensorIterator& iter) {
   iter.cast_outputs();
 }
 
+constexpr int64_t hflip_mask_size = 32;
+
+std::array<char, hflip_mask_size> generate_vec_hflip_reg_mask(int64_t data_stride) {
+    std::array<char, hflip_mask_size> mask;
+    for (const auto k : c10::irange(hflip_mask_size / 2)) {
+      int j = k / data_stride + 1;
+      int v = (j * data_stride - 1) - (k % data_stride);
+      v = std::min(v, (int) (hflip_mask_size / 2 - 1));
+      mask[hflip_mask_size - 1 - k] = v;
+      mask[hflip_mask_size / 2 - 1 - k] = v;
+    }
+    return mask;
+}
+
+int64_t vectorized_cpu_hflip_channels_last(
+    char * C10_RESTRICT *data, const int64_t data_size, const int64_t data_stride, const std::array<char, 32> & mdata) {
+
+  int64_t i = 0;
+#ifdef CPU_CAPABILITY_AVX2
+
+  constexpr auto vec_size = 256 / 8;
+
+  if (data_size > vec_size) {
+
+      // Example for num channels=3 and dtype=uint8
+      // -> data_stride = 3
+      // -> usable_vec_stride = 30
+      // -> usable_vec_half_stride = 15
+      // Data: (1 2 3) (4 5 6) (7 8 9) (10 11 12) (13 14 15) (16 17 18) (19 20 21) (22 23 24) (25 26 27) (28 29 30) (31 32 33)
+      // load by 2 parts
+      // R = [ (1 2 3) (4 5 6) (7 8 9) (10 11 12) (13 14 15) (16 | (16 17 18) (19 20 21) (22 23 24) (25 26 27) (28 29 30) (31 ]
+      // flip(R) ->
+      // R = [ 31 (28 29 30) (25 26 27) (22 23 24) (19 20 21) (16 17 18) | 16 (13 14 15) (10 11 12) (7 8 9) (4 5 6) (1 2 3) ]
+      //
+      // Write in 2 parts
+      // Output pointer: output_ptr = data[0]                                                                                  v
+      // - Init:
+      //                (X X X)  (X X X)    (X X X)    (X X X)    (X X X)    (X X X)    (X X X)    (X X X)    (X X X) (X X X) (X X X)
+      // 0) Move to initial position: output_ptr = data[0] + data_stride - vec_size / 2;
+      //                                                                          v
+      //                (X X X)  (X X X)    (X X X)    (X X X)    (X X X)    (X X X)    (X X X)    (X X X)    (X X X) (X X X) (X X X)
+      // - In the loop:
+      // 1) Write 1st block from output_ptr
+      //                                                                            v
+      //                                                                            |----> vec_size / 2 ---------------------------|
+      // Output part 1: (X X X)  (X X X)    (X X X)    (X X X)    (X X X)     (X X 16)  (13 14 15) (10 11 12) (7 8 9) (4 5 6) (1 2 3)
+      // 2) Write 2nd block from output_ptr - usable_vec_half_stride:
+      //                                                                            v
+      //                     |-----> vec_size / 2 ----------------------------------|
+      // Output part 2: (X X 31) (28 29 30) (25 26 27) (22 23 24) (19 20 21) (16 17 18) (13 14 15) (10 11 12) (7 8 9) (4 5 6) (1 2 3)
+      //
+      // 3) Move to the next position: output_ptr -= usable_vec_stride
+      //
+      // - After the loop:
+      // 4) Move to write position
+      //                 v
+      //                (X X 31) (28 29 30) (25 26 27) (22 23 24) (19 20 21) (16 17 18) (13 14 15) (10 11 12) (7 8 9) (4 5 6) (1 2 3)
+
+    const __m256i mask = _mm256_loadu_si256((__m256i *) mdata.data());
+
+    const auto usable_vec_stride = 2 * (vec_size / 2 / data_stride) * data_stride;
+    const auto usable_vec_half_stride = usable_vec_stride / 2;
+
+    auto output_ptr = data[0] + data_stride - vec_size / 2;
+    auto input_ptr = data[1];
+
+    for (; i < data_size - vec_size; i += usable_vec_stride) {
+
+      // load 256-bits by two 128-bits parts
+      auto a0 = _mm_loadu_si128((__m128i *) (input_ptr + i));
+      auto b0 = _mm256_castsi128_si256(a0);
+      auto a1 = _mm_loadu_si128((__m128i *) (input_ptr + i + usable_vec_half_stride));
+      auto data_vec = _mm256_inserti128_si256(b0, a1, 1);
+
+      auto reversed_vec = _mm256_shuffle_epi8(data_vec, mask);
+
+      // write output in two parts
+      auto rev_vec_h = _mm256_extracti128_si256(reversed_vec, 0);
+      _mm_storeu_si128((__m128i *) (output_ptr - i), rev_vec_h);
+      auto rev_vec_l = _mm256_extracti128_si256(reversed_vec, 1);
+      _mm_storeu_si128((__m128i *) (output_ptr - i - usable_vec_half_stride), rev_vec_l);
+    }
+
+    data[0] -= i;
+    data[1] += i;
+  }
+#endif
+  return i;
+}
+
+void cpu_hflip_channels_last_vec(at::TensorIterator& iter) {
+
+  auto input_strides = iter.strides(1);
+  const auto data_stride = input_strides[1];
+
+  // Generate avx mask once
+  alignas(hflip_mask_size) auto mdata = generate_vec_hflip_reg_mask(data_stride);
+
+  auto loop2d = [&](char** base, const int64_t *strides, int64_t size0, int64_t size1) {
+
+    // Here ntensors is defined for output and 1 input. But tensor iterator has defined output, input
+    // and restrided_input (see aten/src/ATen/native/TensorTransformations.cpp#L64-L66) but we use only
+    // output and input.
+    static constexpr int ntensors = 2;
+    const int64_t *outer_strides = &strides[3];
+    const int64_t stride = strides[0];
+
+    TORCH_INTERNAL_ASSERT(stride == strides[1]);
+
+    auto c = -outer_strides[0];
+    TORCH_INTERNAL_ASSERT(c == outer_strides[1]);
+
+    char* C10_RESTRICT data[ntensors] = {base[0], base[1]};
+    const int64_t size = size0 * size1;
+
+    int64_t i = 0;
+
+    if (c >= 2 && c <= 16) {
+      i = vectorized_cpu_hflip_channels_last(data, size * stride, c, mdata) / stride;
+    }
+
+    auto data_stride = size0 * stride;
+    for (; i < size; i += size0) {
+
+      memcpy(data[0], data[1], data_stride);
+
+      // advance:
+      for (const auto arg : c10::irange(ntensors)) {
+        data[arg] += outer_strides[arg];
+      }
+    }
+
+  };
+
+  int64_t grain_size = at::internal::GRAIN_SIZE;
+  iter.for_each(loop2d, grain_size);
+  iter.cast_outputs();
+}
+
 void flip_kernel(TensorIterator& iter, const bool quantized) {
   if (quantized) {
     AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(iter.dtype(), "flip_quantized_cpu",
@@ -613,10 +752,21 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
         } else if (iter_dtype == kDouble) {
           return cpu_hflip_vec<double>(iter);
         }
-
       }
       // other dtypes (float16, bfloat16, complex) are handled by cpu_kernel_vec (see below)
     } else if (iter.has_contiguous_first_dim()) {
+      // Special cases:
+      // a) channels last hflip on (N, C, H, W) and outer_stride(=dtype_size * C) in [2, 16]
+      // b) flip dim=-2 on (N, ..., M, C) and outer_stride(=dtype_size * C) in [2, 16]
+      auto output_strides = iter.strides(0);
+      auto input_strides = iter.strides(1);
+      auto c = -output_strides[1];
+      if (c >= 2 && c <= 16 &&
+          c == input_strides[1] &&
+          c == iter.element_size(0) * iter.shape()[0]  // checks if dim=1 is contiguous as well
+      ) {
+        return cpu_hflip_channels_last_vec(iter);
+      }
       // Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec)
       return cpu_vflip_memcpy(iter);
     }
diff --git a/aten/src/ATen/native/cpu/LogAddExp.h b/aten/src/ATen/native/cpu/LogAddExp.h
new file mode 100644
index 000000000000..c03cbebafaff
--- /dev/null
+++ b/aten/src/ATen/native/cpu/LogAddExp.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <c10/util/complex.h>
+#include <ATen/NumericUtils.h>
+
+namespace at { namespace native {
+inline namespace CPU_CAPABILITY {
+
+// custom min and max to be used in logcumsumexp for complex arguments
+template <typename scalar_t>
+std::pair<c10::complex<scalar_t>, c10::complex<scalar_t>> _logcumsumexp_minmax(c10::complex<scalar_t> x, c10::complex<scalar_t> y) {
+  if (at::_isnan(y)) {  // either real is nan or imag is nan
+    return std::make_pair(y, y);
+  } else if (at::_isnan(x)) {  // either real is nan or imag is nan
+    return std::make_pair(x, x);
+  } else {
+    return (x.real() < y.real()) ? std::make_pair(x, y) : std::make_pair(y, x);
+  }
+}
+
+template <typename scalar_t>
+scalar_t _log_add_exp_helper(scalar_t x, scalar_t y) {
+  // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
+  scalar_t min = at::_isnan(y) ? y : std::min(x, y); // std::min returns first arg if one of the args is nan
+  scalar_t max = at::_isnan(y) ? y : std::max(x, y); // std::max returns first arg if one of the args is nan
+  if (min != max || std::isfinite(min)) {
+    // nan will be propagated here
+    return std::log1p(std::exp(min - max)) + max;
+  } else {
+    // special case to correctly handle infinite cases
+    return x;
+  }
+}
+
+template <typename scalar_t>
+c10::complex<scalar_t> _log_add_exp_helper(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
+  auto [min, max] = _logcumsumexp_minmax<scalar_t>(x, y);
+  auto min_real = std::real(min);
+  auto max_real = std::real(max);
+
+  if (at::_isnan(min)) {  // either real is nan or imag is nan
+    // handling the "infectious" NaNs
+    return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
+  } else if (!std::isfinite(min_real) && (min_real == max_real)) {
+    if (min_real < 0) {
+      // handle the -inf case, the imaginary part here does not really matter as the exp(value)
+      // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
+      // It does not matter if we're taking the exp of this value
+      return min;
+    } else {
+      // handle the +inf case, we don't need the special precision for log1p for small values
+      // and to avoid producing nan in case of real(max) == real(min) == +inf
+      return std::log(std::exp(min) + std::exp(max));
+    }
+  } else {
+    return std::log1p(std::exp(min - max)) + max;
+  }
+}
+
+} // end namespace
+}} //end at::native
diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
index 89a4b1dcd3df..1014980006a3 100644
--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@@ -12,6 +12,7 @@
 #include <ATen/native/SharedReduceOps.h>
 #include <ATen/native/ReduceOpsUtils.h>
 #include <ATen/native/cpu/Reduce.h>
+#include <ATen/native/cpu/LogAddExp.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -72,7 +73,8 @@ static inline void cpu_cum_base_kernel(const Tensor& result,
     }
   };
 
-  iter.for_each(loop);
+  int64_t grain_size = internal::GRAIN_SIZE / std::max(int64_t{1}, self.size(dim));
+  iter.for_each(loop, grain_size);
 }
 
 static void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) {
@@ -113,66 +115,11 @@ static void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t
   });
 }
 
-// custom min and max to be used in logcumsumexp for complex arguments
-template <typename scalar_t>
-c10::complex<scalar_t> _logcumsumexp_minmax(c10::complex<scalar_t> x, c10::complex<scalar_t> y, bool min) {
-  scalar_t xr = std::real(x);
-  scalar_t yr = std::real(y);
-  if (std::isnan(yr) || (std::isnan(std::imag(y)))) {
-    return y;
-  } else if (std::isnan(xr) || (std::isnan(std::imag(x)))) {
-    return x;
-  } else {
-    return ((xr < yr) == min) ? x : y;  // logical xnor
-  }
-}
-
-template <typename scalar_t>
-scalar_t _log_add_exp_helper(scalar_t x, scalar_t y) {
-  // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
-  scalar_t min = std::isnan(y) ? y : std::min(x, y); // std::min returns first arg if one of the args is nan
-  scalar_t max = std::isnan(y) ? y : std::max(x, y); // std::max returns first arg if one of the args is nan
-  if (min != max || std::isfinite(min)) {
-    // nan will be propagated here
-    return std::log1p(std::exp(min - max)) + max;
-  } else {
-    // special case to correctly handle infinite cases
-    return x;
-  }
-}
-
-template <typename scalar_t>
-c10::complex<scalar_t> _log_add_exp_helper(c10::complex<scalar_t> x, c10::complex<scalar_t> y) {
-  c10::complex<scalar_t> min = _logcumsumexp_minmax(x, y, /*min=*/true);
-  c10::complex<scalar_t> max = _logcumsumexp_minmax(x, y, /*min=*/false);
-  scalar_t min_real = std::real(min);
-  scalar_t max_real = std::real(max);
-
-  if (std::isnan(min_real) || std::isnan(std::imag(min))) {
-    // handling the "infectious" NaNs
-    return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
-  } else if ((!std::isfinite(min_real)) && (min_real == max_real)) {
-    if (min_real < 0) {
-      // handle the -inf case, the imaginary part here does not really matter as the exp(value)
-      // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
-      // It does not matter if we're taking the exp of this value
-      return min;
-    } else {
-      // handle the +inf case, we don't need the special precision for log1p for small values
-      // and to avoid producing nan in case of real(max) == real(min) == +inf
-      return std::log(std::exp(min) + std::exp(max));
-    }
-  } else {
-    return std::log1p(std::exp(min - max)) + max;
-  }
-}
-
 static void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim) {
   auto wrap_dim = maybe_wrap_dim(dim, self.dim());
   int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim);
 
   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, self.scalar_type(), "logcumsumexp_out_cpu", [&] {
-  // AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, self.scalar_type(), "logcumsumexp_out_cpu", [&] {
     cpu_cum_base_kernel<scalar_t>(result, self, wrap_dim, [&] (
       scalar_t* result_data, auto result_dim_stride,
       const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) {
@@ -200,7 +147,7 @@ static void mean_kernel_impl(TensorIterator& iter) {
   });
 }
 
-static void std_var_kernel_impl(TensorIterator& iter, int64_t correction, bool take_sqrt) {
+static void std_var_kernel_impl(TensorIterator& iter, double correction, bool take_sqrt) {
   AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "std_cpu", [&] {
     binary_kernel_reduce(
         iter,
@@ -208,9 +155,8 @@ static void std_var_kernel_impl(TensorIterator& iter, int64_t correction, bool t
             scalar_t,
             double,
             int64_t,
-            double,
             std::tuple<scalar_t, scalar_t>>{correction, take_sqrt},
-        WelfordData<double, int64_t, double>());
+        WelfordData<double, int64_t>());
   });
 }
 
@@ -254,55 +200,50 @@ inline void norm_two_reduce_step(Vectorized<float>& acc_fvec, Vectorized<BFloat1
   acc_fvec += data_fvec1 * data_fvec1;
 }
 
+// This reduction accumulates results as the type `acc_t`. By default, when
+// `scalar_t` is complex, `acc_t` is the downgraded real number type.
+// Otherwise, `acc_t` and `scalar_t` are the same type.
+template <typename scalar_t, typename acc_t=typename scalar_value_type<scalar_t>::type, typename out_t=typename scalar_value_type<scalar_t>::type>
+void norm_kernel_cpu_impl(TensorIterator& iter, const double& val) {
+  if (val == 0.0) {
+    binary_kernel_reduce(iter, NormZeroOps<scalar_t, acc_t, out_t>(), acc_t(0));
+  } else if (val == 0.0) {
+    binary_kernel_reduce(iter, NormOneOps<scalar_t, acc_t, out_t>(), acc_t(0));
+  } else if (val == 2.0) {
+    binary_kernel_reduce(iter, NormTwoOps<scalar_t, acc_t, out_t>(), acc_t(0));
+  } else if (val == INFINITY) {
+    binary_kernel_reduce(iter, AbsMaxOps<scalar_t, acc_t, out_t>(), acc_t(0));
+  } else if (val == -INFINITY) {
+    binary_kernel_reduce(iter, AbsMinOps<scalar_t, acc_t, out_t>(), std::numeric_limits<acc_t>::infinity());
+  } else {
+    binary_kernel_reduce(iter, NormOps<scalar_t, acc_t, out_t>{acc_t(val)}, acc_t(0));
+  }
+}
+
 static void norm_kernel_tensor_iterator_impl(
     TensorIterator& iter,
     const Scalar& p) {
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  float val;
+  double val;
   if (p.isIntegral(false)) {
     val = p.to<int64_t>();
   } else if (p.isFloatingPoint()) {
-    // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
     val = p.to<double>();
   } else {
-    AT_ERROR("norm_kernel_tensor_iterator_impl expects norm to be integer or float");
+    TORCH_CHECK(false, "norm_kernel_cpu expects norm to be integer or float");
   }
   if (iter.numel() == 0) {
     iter.output().fill_((val < 0) ? INFINITY : 0);
     return;
   }
 
-  // In the dispatch code blocks below, reduction kernels accumulate results as
-  // the type `acc_t`. When `scalar_t` is complex, `acc_t` is the downgraded
-  // real number type. Otherwise, `acc_t` and `scalar_t` are the same type.
-  if (val == 0) {
-    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
-      using acc_t = typename scalar_value_type<scalar_t>::type;
-      binary_kernel_reduce(
-        iter,
-        NormZeroOps<scalar_t, acc_t>(),
-        acc_t(0)
-      );
-    });
-  } else if (val == 1) {
-    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
-      using acc_t = typename scalar_value_type<scalar_t>::type;
-      binary_kernel_reduce(
-        iter,
-        NormOneOps<scalar_t, acc_t>(),
-        acc_t(0)
-      );
-    });
-  } else if (val == 2) {
+  if (val == 2.0 && is_reduce_lastdim(iter) &&
+      iter.dtype(0) == iter.input_dtype() &&
+      (iter.input_dtype() == kFloat || iter.input_dtype() == kDouble ||
+       iter.input_dtype() == kBFloat16)) {
     // If we can vectorize over the last dimension and the dtype
     // of the output is the same as that of the input,
     // then we go through the vectorised path.
-    if (is_reduce_lastdim(iter) &&
-        iter.dtype(0) == iter.input_dtype() &&
-        (iter.input_dtype() == kFloat ||
-         iter.input_dtype() == kDouble ||
-         iter.input_dtype() == kBFloat16)) {
-      AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
+    AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
         // use float as accumulate type for BFloat16
         using acc_t = at::opmath_type<scalar_t>;
         binary_kernel_reduce_lastdim(iter, [](char* result_data_bytes, char* self_data_bytes, int64_t size) {
@@ -329,49 +270,28 @@ static void norm_kernel_tensor_iterator_impl(
           result_data[0] = scalar_t(std::sqrt(buffer[0]));
         });
       });
-      return;
-    }
-    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
-      using acc_t = typename scalar_value_type<scalar_t>::type;
-      binary_kernel_reduce(
-        iter,
-        NormTwoOps<scalar_t, acc_t>(),
-        acc_t(0)
-      );
-    });
-  } else if (val == INFINITY) {
-    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
-      using acc_t = typename scalar_value_type<scalar_t>::type;
-      binary_kernel_reduce(
-        iter,
-        AbsMaxOps<scalar_t, acc_t>(),
-        acc_t(0)
-      );
-    });
-  } else if (val == -INFINITY) {
-    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
-      using acc_t = typename scalar_value_type<scalar_t>::type;
-      binary_kernel_reduce(
-        iter,
-        AbsMinOps<scalar_t, acc_t>(),
-        std::numeric_limits<acc_t>::infinity()
-      );
-    });
   } else {
-    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] {
-      using acc_t = typename scalar_value_type<scalar_t>::type;
-      binary_kernel_reduce(
-        iter,
-        NormOps<scalar_t, acc_t> { acc_t(val) },
-        acc_t(0)
-      );
+    if (iter.dtype(0) == kHalf) {
+      return norm_kernel_cpu_impl<at::Half, float>(iter, val);
+    } else if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) {
+      // type promotion that does cast and reduction in a single kernel
+      return norm_kernel_cpu_impl<at::Half, float, float>(iter, val);
+    } else if(iter.dtype(0) == kBFloat16) {
+      return norm_kernel_cpu_impl<at::BFloat16, float>(iter, val);
+    } else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) {
+      // type promotion that does cast and reduction in a single kernel
+      return norm_kernel_cpu_impl<at::BFloat16, float, float>(iter, val);
+    }
+
+    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.input_dtype(), "norm_cpu", [&] {
+      norm_kernel_cpu_impl<scalar_t>(iter, val);
     });
-  }
 
-  // For complex outputs, the above kernels do not touch the imaginary values,
-  // so we must zero them out
-  if (isComplexType(iter.output().scalar_type())) {
-    at::imag(iter.output()).zero_();
+    // For complex outputs, the above kernels do not touch the imaginary values,
+    // so we must zero them out
+    if (isComplexType(iter.output().scalar_type())) {
+      at::imag(iter.output()).zero_();
+    }
   }
 }
 
diff --git a/aten/src/ATen/native/cpu/ReduceUtils.h b/aten/src/ATen/native/cpu/ReduceUtils.h
new file mode 100644
index 000000000000..68b19d5b5b90
--- /dev/null
+++ b/aten/src/ATen/native/cpu/ReduceUtils.h
@@ -0,0 +1,160 @@
+#pragma once
+
+#include <ATen/Parallel.h>
+#include <ATen/NumericUtils.h>
+#include <ATen/cpu/vec/vec.h>
+#include <ATen/cpu/vec/functional.h>
+#include <ATen/native/ReductionType.h>
+#include <c10/util/irange.h>
+
+namespace at::native {
+inline namespace CPU_CAPABILITY {
+
+using namespace vec;
+
+#define AT_DISPATCH_REDUCTION_TYPES(op, ...)                                   \
+  [&] {                                                                        \
+    switch (op) {                                                              \
+      case SUM: {                                                              \
+        static constexpr ReductionType reduce = SUM;                           \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case MEAN: {                                                             \
+        static constexpr ReductionType reduce = MEAN;                          \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case MIN: {                                                              \
+        static constexpr ReductionType reduce = MIN;                           \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case MAX: {                                                              \
+        static constexpr ReductionType reduce = MAX;                           \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case PROD: {                                                             \
+        static constexpr ReductionType reduce = PROD;                          \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+    }                                                                          \
+  }()
+
+template <typename scalar_t, ReductionType reduce>
+inline vec_scalar_t<scalar_t> init_value() {
+  using acc_t = vec_scalar_t<scalar_t>;
+  acc_t val;
+  if (reduce == ReductionType::SUM ||
+      reduce == ReductionType::MEAN) {
+    val = static_cast<acc_t>(0);
+  } else if (reduce == ReductionType::PROD) {
+    val = static_cast<acc_t>(1);
+  } else if (reduce == ReductionType::MAX) {
+    val = -std::numeric_limits<acc_t>::infinity();
+  } else {
+    TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN);
+    val = std::numeric_limits<acc_t>::infinity();
+  }
+  return val;
+}
+
+template <typename scalar_t, ReductionType reduce>
+inline vec_scalar_t<scalar_t> init_value(const c10::optional<Scalar>& initial) {
+  using acc_t = vec_scalar_t<scalar_t>;
+  if (initial.has_value()) {
+    return initial.value().to<acc_t>();
+  } else {
+    return init_value<scalar_t, reduce>();
+  }
+}
+
+template <typename scalar_t>
+inline void init(scalar_t* out, int64_t size, const vec_scalar_t<scalar_t>& val) {
+  using Vec = Vectorized<vec_scalar_t<scalar_t>>;
+  map<scalar_t>(
+      [val](Vec x) { return Vec(val); },
+      out,
+      out,
+      size);
+}
+
+template <typename scalar_t, ReductionType reduce>
+inline void init(scalar_t* out, int64_t size, const c10::optional<Scalar>& initial) {
+  using acc_t = vec_scalar_t<scalar_t>;
+  acc_t val = init_value<scalar_t, reduce>(initial);
+  init(out, size, val);
+}
+
+// overload with `include_self`, used by scatter_reduce
+template <typename scalar_t, ReductionType reduce>
+inline void init(scalar_t* out, int64_t size, bool include_self = false) {
+  using acc_t = vec_scalar_t<scalar_t>;
+  if (!include_self) {
+    acc_t val = init_value<scalar_t, reduce>();
+    init(out, size, val);
+  }
+}
+
+template <typename scalar_t>
+inline scalar_t _max(const scalar_t& x, const scalar_t& y) {
+  return at::_isnan(y) ? y : std::max(x, y);
+}
+
+template <typename scalar_t>
+inline Vectorized<scalar_t> _max(const Vectorized<scalar_t>& x, const Vectorized<scalar_t>& y) {
+  // vec::maximum propagates NaN
+  return vec::maximum(x, y);
+}
+
+template <typename scalar_t>
+inline scalar_t _min(const scalar_t& x, const scalar_t& y) {
+  return at::_isnan(y) ? y : std::min(x, y);
+}
+
+template <typename scalar_t>
+inline Vectorized<scalar_t> _min(const Vectorized<scalar_t>& x, const Vectorized<scalar_t>& y) {
+  // vec::minimum propagates NaN
+  return vec::minimum(x, y);
+}
+
+// for Max and Min, propagate NaN:
+template <typename T, ReductionType reduce>
+inline T update(const T& x, const T& y) {
+  if (reduce == ReductionType::SUM ||
+      reduce == ReductionType::MEAN) {
+    return x + y;
+  } else if (reduce == ReductionType::PROD) {
+    return x * y;
+  } else if (reduce == ReductionType::MAX) {
+    return _max(x, y);
+  } else {
+    TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN);
+    return _min(x, y);
+  }
+}
+
+template <typename scalar_t, ReductionType reduce>
+inline void update(scalar_t* out, scalar_t* data, int64_t K) {
+  using Vec = vec::Vectorized<vec_scalar_t<scalar_t>>;
+  map2<scalar_t>(
+      [](Vec x, Vec y) { return update<Vec, reduce>(x, y); },
+      out,
+      out,
+      data,
+      K);
+}
+
+template <typename scalar_t, ReductionType reduce>
+inline void write(scalar_t* out, int64_t count, int64_t K) {
+  using Vec = vec::Vectorized<vec_scalar_t<scalar_t>>;
+  if (reduce == ReductionType::MEAN) {
+    if (count > 0) {
+      vec::map<scalar_t>(
+          [count](Vec x) { return x / Vec(count); },
+          out,
+          out,
+          K);
+    }
+  }
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace at::native
diff --git a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp
index c3d74655c0ba..849ed43bfb5c 100644
--- a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp
+++ b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp
@@ -8,6 +8,7 @@
 #include <ATen/Dispatch.h>
 #include <ATen/NumericUtils.h>
 #include <ATen/Parallel.h>
+#include <ATen/native/cpu/ReduceUtils.h>
 #include <ATen/cpu/vec/functional.h>
 #include <ATen/cpu/vec/vec.h>
 #include <c10/util/irange.h>
@@ -573,45 +574,6 @@ struct cpu_scatter_gather_base_kernel {
   }
 };
 
-template <typename scalar_t, ReductionType reduce>
-inline void init(scalar_t* ptr, int64_t size, bool include_self) {
-  if (!include_self) {
-    using acc_t = vec::vec_scalar_t<scalar_t>;
-    using Vec = vec::Vectorized<acc_t>;
-
-    acc_t val;
-    if (reduce == ReductionType::SUM ||
-        reduce == ReductionType::MEAN) {
-      val = static_cast<acc_t>(0);
-    } else if (reduce == ReductionType::PROD) {
-      val = static_cast<acc_t>(1);
-    } else if (reduce == ReductionType::MAX) {
-      val = std::numeric_limits<acc_t>::lowest();
-    } else {
-      val = std::numeric_limits<acc_t>::max();
-    }
-    vec::map<scalar_t>(
-        [val](Vec x) { return Vec(val); },
-        ptr,
-        ptr,
-        size);
-  }
-}
-
-template <typename vec_t, ReductionType reduce>
-inline vec_t update(const vec_t& x, const vec_t& y) {
-  if (reduce == ReductionType::SUM ||
-      reduce == ReductionType::MEAN) {
-    return x + y;
-  } else if (reduce == ReductionType::PROD) {
-    return x * y;
-  } else if (reduce == ReductionType::MAX) {
-    return vec::maximum(x, y);
-  } else {
-    return vec::minimum(x, y);
-  }
-}
-
 // Note [scatter reduce optimization]
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 //
@@ -713,7 +675,6 @@ void cpu_scatter_reduce_expanded_index(const Tensor& self, const Tensor& index,
   });
 
   // TODO: do blocking on col dimension to reduce WR bandwidth
-  using Vec = vec::Vectorized<vec::vec_scalar_t<scalar_t>>;
   at::parallel_for(0, num_nonzero_rows, 1, [&](int64_t begin, int64_t end) {
     for (const auto m : c10::irange(begin, end)) {
       int64_t row = row_index[m];
@@ -721,31 +682,19 @@ void cpu_scatter_reduce_expanded_index(const Tensor& self, const Tensor& index,
       int64_t off_end = row_index_offset[m + 1];
       scalar_t* self_ptr = self_data + row * K;
 
-      // reinit rows in `self` if needed
+      // step 1: reinit rows in `self` if needed
       init<scalar_t, reduce>(self_ptr, K, include_self);
 
+      // step 2: reduce
       for (const auto n : c10::irange(off_start, off_end)) {
         int64_t col = sorted_col_index_values[n];
-        scalar_t* src_ptr = src_data + col * K;
-        vec::map2<scalar_t>(
-            [](Vec x, Vec y) { return update<Vec, reduce>(x, y); },
-            self_ptr,
-            self_ptr,
-            src_ptr,
-            K);
+        update<scalar_t, reduce>(self_ptr, src_data + col * K, K);
       }
 
-      if (reduce == ReductionType::MEAN) {
-        int64_t count = include_self ? 1 : 0;
-        count += off_end - off_start;
-        if (count != 0) {
-          vec::map<scalar_t>(
-              [count](Vec x) { return x / Vec(count); },
-              self_ptr,
-              self_ptr,
-              K);
-        }
-      }
+      // step 3: finalize
+      int64_t count = include_self ? 1 : 0;
+      count += off_end - off_start;
+      write<scalar_t, reduce>(self_ptr, count, K);
     }
   });
 }
@@ -797,26 +746,12 @@ void scatter_add_expanded_index_kernel(const Tensor& self, const Tensor& index,
 
 void scatter_reduce_expanded_index_kernel(
     const Tensor& self, const Tensor& index, const Tensor& src,
-    const ReductionType& reduce, bool include_self) {
+    const ReductionType& reduction, bool include_self) {
   AT_DISPATCH_FLOATING_TYPES_AND(
     ScalarType::BFloat16, self.scalar_type(), "scatter_reduce_expanded_index", [&] {
-      switch (reduce) {
-      case ReductionType::SUM :
-        cpu_scatter_reduce_expanded_index<scalar_t, ReductionType::SUM>(self, index, src, include_self);
-        break;
-      case ReductionType::PROD :
-        cpu_scatter_reduce_expanded_index<scalar_t, ReductionType::PROD>(self, index, src, include_self);
-        break;
-      case ReductionType::MAX :
-        cpu_scatter_reduce_expanded_index<scalar_t, ReductionType::MAX>(self, index, src, include_self);
-        break;
-      case ReductionType::MIN :
-        cpu_scatter_reduce_expanded_index<scalar_t, ReductionType::MIN>(self, index, src, include_self);
-        break;
-      case ReductionType::MEAN :
-        cpu_scatter_reduce_expanded_index<scalar_t, ReductionType::MEAN>(self, index, src, include_self);
-        break;
-      }
+    AT_DISPATCH_REDUCTION_TYPES(reduction, [&]() {
+      cpu_scatter_reduce_expanded_index<scalar_t, reduce>(self, index, src, include_self);
+    });
   });
 }
 
diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp
index af23b11f310d..337ddb546ffd 100644
--- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp
+++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp
@@ -23,80 +23,85 @@
 // computations per task. Each task works across dim_size elements. 16 should be
 // a very rough approximation of the number of computations per dim_size element
 // by counting simple computations (*, +, -) as 1 and exp or log as 4.
+//
+// We use a chunk size such that it'd fit in L1D.
 
 namespace at::native {
-namespace {
 
+namespace {
 template <typename scalar_t>
 inline void _vec_log_softmax_lastdim(
     scalar_t* input_data_base,
     scalar_t* output_data_base,
     int64_t outer_size,
     int64_t dim_size) {
-  using Vec = vec::Vectorized<at::opmath_type<scalar_t>>;
-  static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size();
-  int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
-  if (grain_size < CHUNK_SIZE)
-    grain_size = CHUNK_SIZE;
+  using Vec = vec::Vectorized<vec::vec_scalar_t<scalar_t>>;
+  // Coincidentally, at::internal::GRAIN_SIZE is 32768, which is equal to the
+  // size of L1D cache on many processors. Some processors have 48 KB L1D cache
+  // nowadays, so maybe in the future, we can leverage the knowledge of a
+  // machine's L1D cache size.
+  int64_t CHUNK_SIZE = std::max<int64_t>(
+      1,
+      at::internal::GRAIN_SIZE / (sizeof(scalar_t) * dim_size));
 
-  parallel_for(
-      0,
-      outer_size,
-      grain_size,
-      [&](int64_t begin, int64_t end) {
-        // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
-        scalar_t tmp_sum_scalar[CHUNK_SIZE];
-        // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
-        scalar_t max_input_arr[CHUNK_SIZE];
-        for (int64_t ii = begin; ii < end; ii += CHUNK_SIZE) {
-          int64_t loop_end = CHUNK_SIZE;
-          if (ii + CHUNK_SIZE > end)
-            loop_end = end - ii;
-          for (const auto j : c10::irange(loop_end)) {
-            int64_t i = ii + j;
-            scalar_t* input_data = input_data_base + i * dim_size;
-            max_input_arr[j] = vec::reduce_all<scalar_t>(
-                [](Vec& x, Vec& y) { return vec::maximum(x, y); },
-                input_data,
-                dim_size);
-          }
-          for (const auto j : c10::irange(loop_end)) {
-            int64_t i = ii + j;
-            scalar_t* input_data = input_data_base + i * dim_size;
-            scalar_t max_input = max_input_arr[j];
-            tmp_sum_scalar[j] = vec::map_reduce_all<scalar_t>(
-                [max_input](Vec x) { return (x - Vec(max_input)).exp(); },
-                [](Vec x, Vec y) { return x + y; },
-                input_data,
-                dim_size);
-          }
-          // See [Note AVX-SSE transitions] for why this should call the
-          // vectorized version (aside from perf improvements).
-          vec::map(
-              [](Vec x) { return x.log(); },
-              tmp_sum_scalar,
-              tmp_sum_scalar,
-              loop_end);
-          for (const auto j : c10::irange(loop_end)) {
-            int64_t i = ii + j;
-            scalar_t* input_data = input_data_base + i * dim_size;
-            scalar_t* output_data = output_data_base + i * dim_size;
-            scalar_t tmp_sum = tmp_sum_scalar[j];
-            scalar_t max_input = max_input_arr[j];
-
-            // It's necessary to keep the order of the operations below.
-            // In some cases that input is large digits and the difference
-            // is small, if we compute `max_input` plus `tmp_sum` before,
-            // there would be a numerical problem. See an example in
-            // https://github.com/pytorch/pytorch/issues/11752#issuecomment-422883379
-            vec::map(
-                [tmp_sum, max_input](Vec x) { return x - Vec(max_input) - Vec(tmp_sum); },
-                output_data,
-                input_data,
-                dim_size);
-          }
-        }
-      });
+  int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size);
+
+  parallel_for(0, outer_size, grain_size, [&](int64_t begin, int64_t end) {
+    // MSVC requires such a declaration of dynamic arrays
+    // Source: https://stackoverflow.com/a/33423538
+    std::unique_ptr<scalar_t[]> tmp_sum_scalar(new scalar_t[CHUNK_SIZE]);
+    std::unique_ptr<scalar_t[]> max_input_arr(new scalar_t[CHUNK_SIZE]);
+    for (int64_t ii = begin; ii < end; ii += CHUNK_SIZE) {
+      int64_t loop_end = CHUNK_SIZE;
+      if (ii + CHUNK_SIZE > end)
+        loop_end = end - ii;
+      for (const auto j : c10::irange(loop_end)) {
+        int64_t i = ii + j;
+        scalar_t* input_data = input_data_base + i * dim_size;
+        max_input_arr[j] = vec::reduce_all<scalar_t>(
+            [](Vec& x, Vec& y) { return vec::maximum(x, y); },
+            input_data,
+            dim_size);
+      }
+      for (const auto j : c10::irange(loop_end)) {
+        int64_t i = ii + j;
+        scalar_t* input_data = input_data_base + i * dim_size;
+        scalar_t max_input = max_input_arr[j];
+        tmp_sum_scalar[j] = vec::map_reduce_all<scalar_t>(
+            [max_input](Vec x) { return (x - Vec(max_input)).exp(); },
+            [](Vec x, Vec y) { return x + y; },
+            input_data,
+            dim_size);
+      }
+      // See [Note AVX-SSE transitions] for why this should call the
+      // vectorized version (aside from perf improvements).
+      vec::map(
+          [](Vec x) { return x.log(); },
+          tmp_sum_scalar.get(),
+          tmp_sum_scalar.get(),
+          loop_end);
+      for (const auto j : c10::irange(loop_end)) {
+        int64_t i = ii + j;
+        scalar_t* input_data = input_data_base + i * dim_size;
+        scalar_t* output_data = output_data_base + i * dim_size;
+        scalar_t tmp_sum = tmp_sum_scalar[j];
+        scalar_t max_input = max_input_arr[j];
+
+        // It's necessary to keep the order of the operations below.
+        // In some cases that input is large digits and the difference
+        // is small, if we compute `max_input` plus `tmp_sum` before,
+        // there would be a numerical problem. See an example in
+        // https://github.com/pytorch/pytorch/issues/11752#issuecomment-422883379
+        vec::map(
+            [tmp_sum, max_input](Vec x) {
+              return x - Vec(max_input) - Vec(tmp_sum);
+            },
+            output_data,
+            input_data,
+            dim_size);
+      }
+    }
+  });
 }
 
 template <typename scalar_t>
@@ -106,7 +111,7 @@ inline void _vec_softmax_lastdim(
     int64_t outer_size,
     int64_t dim_size) {
   using Vec = vec::Vectorized<scalar_t>;
-  int64_t grain_size = std::max(internal::GRAIN_SIZE / (16 * dim_size), (int64_t)1);
+  int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size);
   parallel_for(0, outer_size, grain_size, [&](int64_t begin, int64_t end) {
     for (const auto i : c10::irange(begin, end)) {
       scalar_t* input_data = input_data_base + i * dim_size;
@@ -140,7 +145,7 @@ inline void _vec_softmax_lastdim<BFloat16>(
     int64_t dim_size) {
   using bVec = vec::Vectorized<BFloat16>;
   using fVec = vec::Vectorized<float>;
-  int64_t grain_size = std::max(internal::GRAIN_SIZE / (16 * dim_size), (int64_t)1);
+  int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size);
   parallel_for(0, outer_size, grain_size, [&](int64_t begin, int64_t end) {
     // thread local temp buffer.
     std::unique_ptr<float []> buffer(new float[dim_size]);
@@ -262,8 +267,8 @@ inline void _vec_softmax_backward(
   using Vec = vec::Vectorized<scalar_t>;
   int64_t outer_stride = dim_size * inner_size;
   int64_t BLOCK_SIZE = 128 * 1024;
-  int64_t CHUNK_SIZE = std::max(
-      int64_t(BLOCK_SIZE / dim_size / sizeof(scalar_t)), (int64_t)Vec::size());
+  int64_t CHUNK_SIZE = std::max<int64_t>(
+      BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size());
   CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size();
   int64_t num_chunks = divup(inner_size, CHUNK_SIZE);
   int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
@@ -345,8 +350,8 @@ inline void _vec_softmax_backward<BFloat16>(
   using fVec = vec::Vectorized<float>;
   int64_t outer_stride = dim_size * inner_size;
   int64_t BLOCK_SIZE = 128 * 1024;
-  int64_t CHUNK_SIZE = std::max(
-      int64_t(BLOCK_SIZE / dim_size / sizeof(BFloat16)), (int64_t)bVec::size());
+  int64_t CHUNK_SIZE = std::max<int64_t>(
+      BLOCK_SIZE / dim_size / sizeof(BFloat16), bVec::size());
   CHUNK_SIZE = CHUNK_SIZE / bVec::size() * bVec::size();
   int64_t num_chunks = divup(inner_size, CHUNK_SIZE);
   int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
@@ -473,8 +478,8 @@ inline void _vec_log_softmax_backward(
   using Vec = vec::Vectorized<scalar_t>;
   int64_t outer_stride = dim_size * inner_size;
   int64_t BLOCK_SIZE = 128 * 1024;
-  int64_t CHUNK_SIZE = std::max(
-      int64_t(BLOCK_SIZE / dim_size / sizeof(scalar_t)), (int64_t)Vec::size());
+  int64_t CHUNK_SIZE = std::max<int64_t>(
+      BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size());
   CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size();
   int64_t num_chunks = divup(inner_size, CHUNK_SIZE);
   int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
@@ -555,8 +560,8 @@ inline void _vec_log_softmax_backward<BFloat16>(
   using fVec = vec::Vectorized<float>;
   int64_t outer_stride = dim_size * inner_size;
   int64_t BLOCK_SIZE = 128 * 1024;
-  int64_t CHUNK_SIZE = std::max(
-      int64_t(BLOCK_SIZE / dim_size / sizeof(BFloat16)), (int64_t)bVec::size());
+  int64_t CHUNK_SIZE = std::max<int64_t>(
+      BLOCK_SIZE / dim_size / sizeof(BFloat16), bVec::size());
   CHUNK_SIZE = CHUNK_SIZE / bVec::size() * bVec::size();
   int64_t num_chunks = divup(inner_size, CHUNK_SIZE);
   int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
@@ -687,7 +692,7 @@ inline void _vec_softmax(
   using Vec_bf16 = vec::Vectorized<BFloat16>;
   int64_t dim_stride = inner_size;
   int64_t outer_stride = dim_size * dim_stride;
-  int64_t grain_size = std::max(internal::GRAIN_SIZE / dim_size, (int64_t)1);
+  int64_t grain_size = internal::GRAIN_SIZE / dim_size;
   int vectorized_step = Vec_bf16().size(); // Currently, we only support BFloat16 in this special implementation
   parallel_for(
       0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) {
@@ -793,7 +798,7 @@ inline void _vec_softmax(
   using Vec = vec::Vectorized<scalar_t>;
   int64_t dim_stride = inner_size;
   int64_t outer_stride = dim_size * dim_stride;
-  int64_t grain_size = std::max(internal::GRAIN_SIZE / dim_size, (int64_t)1);
+  int64_t grain_size = internal::GRAIN_SIZE / dim_size;
   int vectorized_step = Vec().size();
   parallel_for(
       0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) {
@@ -885,7 +890,7 @@ inline void _vec_logsoftmax(
     int64_t dim_size) {
   using Vec = vec::Vectorized<scalar_t>;
   int64_t BLOCK_SIZE = 128 * 1024;
-  int64_t CHUNK_SIZE = std::max(int64_t(BLOCK_SIZE / dim_size / sizeof(scalar_t)), (int64_t) Vec::size());
+  int64_t CHUNK_SIZE = std::max<int64_t>(BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size());
   CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size();
   int64_t num_chunks = divup(inner_size, CHUNK_SIZE);
 
@@ -989,7 +994,7 @@ inline void _vec_logsoftmax<BFloat16>(
   using bVec = vec::Vectorized<BFloat16>;
   using fVec = vec::Vectorized<float>;
   int64_t BLOCK_SIZE = 128 * 1024;
-  int64_t CHUNK_SIZE = std::max(int64_t(BLOCK_SIZE / dim_size / sizeof(BFloat16)), (int64_t) bVec::size());
+  int64_t CHUNK_SIZE = std::max<int64_t>(BLOCK_SIZE / dim_size / sizeof(BFloat16), bVec::size());
   CHUNK_SIZE = CHUNK_SIZE / bVec::size() * bVec::size();
   int64_t num_chunks = divup(inner_size, CHUNK_SIZE);
 
diff --git a/aten/src/ATen/native/cpu/SpmmReduceKernel.cpp b/aten/src/ATen/native/cpu/SpmmReduceKernel.cpp
new file mode 100644
index 000000000000..b1a7788e829d
--- /dev/null
+++ b/aten/src/ATen/native/cpu/SpmmReduceKernel.cpp
@@ -0,0 +1,512 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/ExpandUtils.h>
+#include <ATen/Dispatch.h>
+#include <ATen/Parallel.h>
+#include <ATen/cpu/vec/functional.h>
+#include <ATen/cpu/vec/vec.h>
+#include <ATen/native/cpu/SpmmReduceKernel.h>
+#include <ATen/native/cpu/ReduceUtils.h>
+#include <ATen/native/cpu/utils.h>
+#include <c10/util/irange.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_native.h>
+#endif
+
+namespace at { namespace native {
+
+namespace {
+
+template <typename scalar_t, typename index_t, ReductionType reduce>
+void spmm_reduce_kernel_impl(
+    const Tensor& out,
+    const Tensor& crow_indices,
+    const Tensor& col_indices,
+    const Tensor& values,
+    const Tensor& other_) {
+
+  int64_t nnz = values.numel();
+  if (nnz == 0) {
+    return;
+  }
+
+  auto other = other_.contiguous();
+
+  // access `crow_indices`, `col_indices` and `values` via TessorAccessor
+  scalar_t* out_data = out.data_ptr<scalar_t>();
+  auto csr_data = crow_indices.accessor<index_t, 1>();
+  auto col_data = col_indices.accessor<index_t, 1>();
+  auto val_data = values.accessor<scalar_t, 1>();
+  scalar_t* other_data = other.data_ptr<scalar_t>();
+
+  int64_t M = crow_indices.numel() - 1;
+  int64_t K = other.size(-1);
+
+  using Vec = vec::Vectorized<scalar_t>;
+  utils::parallel_sparse_csr(csr_data, M, nnz, [&](int64_t begin, int64_t end) {
+    int64_t row_start, row_end, c;
+    for (const auto m : c10::irange(begin, end)) {
+      row_start = csr_data[m];
+      row_end = csr_data[m + 1];
+
+      scalar_t* out_ptr = out_data + m * K;
+
+      constexpr int64_t kVecSize = Vec::size();
+      constexpr int64_t kVLEN = kVecSize * 4;
+      constexpr int64_t CHUNK_SIZE = 16;
+
+      // step 1: reinit the output row for reduce type 'amax' and 'amin'
+      int64_t count = row_end - row_start;
+      if (count != 0) {
+        init<scalar_t, reduce>(out_ptr, K, /*include_self*/false);
+      }
+
+      // step 2: reduce, do blocking on rowwise to reduce write memory bandwidth
+      for (int64_t e0 = row_start; e0 < row_end; e0 += CHUNK_SIZE) {
+        int64_t e1 = std::min(e0 + CHUNK_SIZE, row_end);
+
+        int64_t k = 0;
+        for (; k < K - (K % kVLEN); k += kVLEN) {
+          Vec out_vec0 = Vec::loadu(out_ptr + k);
+          Vec out_vec1 = Vec::loadu(out_ptr + k + kVecSize);
+          Vec out_vec2 = Vec::loadu(out_ptr + k + kVecSize * 2);
+          Vec out_vec3 = Vec::loadu(out_ptr + k + kVecSize * 3);
+          for (const auto e : c10::irange(e0, e1)) {
+            c = col_data[e];
+            scalar_t val = val_data[e];
+            scalar_t* other_ptr = other_data + c * K + k;
+
+            out_vec0 = update<Vec, reduce>(out_vec0, Vec::loadu(other_ptr) * Vec(val));
+            out_vec1 = update<Vec, reduce>(out_vec1, Vec::loadu(other_ptr + kVecSize) * Vec(val));
+            out_vec2 = update<Vec, reduce>(out_vec2, Vec::loadu(other_ptr + kVecSize * 2) * Vec(val));
+            out_vec3 = update<Vec, reduce>(out_vec3, Vec::loadu(other_ptr + kVecSize * 3) * Vec(val));
+          }
+          out_vec0.store(out_ptr + k);
+          out_vec1.store(out_ptr + k + kVecSize);
+          out_vec2.store(out_ptr + k + kVecSize * 2);
+          out_vec3.store(out_ptr + k + kVecSize * 3);
+        }
+        for (; k < K - (K % kVecSize); k += kVecSize) {
+          Vec out_vec = Vec::loadu(out_ptr + k);
+          for (const auto e : c10::irange(e0, e1)) {
+            c = col_data[e];
+            scalar_t val = val_data[e];
+            scalar_t* other_ptr = other_data + c * K;
+            out_vec = update<Vec, reduce>(out_vec, Vec::loadu(other_ptr + k) * Vec(val));
+          }
+          out_vec.store(out_ptr + k);
+        }
+        for (; k < K; k++) {
+          scalar_t out_val = out_ptr[k];
+          for (const auto e : c10::irange(e0, e1)) {
+            c = col_data[e];
+            scalar_t val = val_data[e];
+            scalar_t* other_ptr = other_data + c * K;
+            out_val = update<scalar_t, reduce>(out_val, other_ptr[k] * val);
+          }
+          out_ptr[k] = out_val;
+        }
+      }
+
+      // step 3: finalize
+      write<scalar_t, reduce>(out_ptr, count, K);
+    }
+  });
+}
+
+// update both val and arg, used for `amin` and `amax`
+// it is a little troublesome to vectorize it since `scalar_t` and `index_t`
+// might have different vector length, for example, each vector holds 8 floats
+// and 4 int64_t.
+template <typename scalar_t, typename index_t, ReductionType reduce>
+inline void update_with_index(scalar_t *val, scalar_t new_val, index_t *arg, index_t new_arg) {
+  if ((reduce == ReductionType::MIN && new_val < *val) ||
+      (reduce == ReductionType::MAX && new_val > *val) ||
+      at::_isnan<scalar_t>(new_val)) {
+    *val = new_val;
+    *arg = new_arg;
+  }
+}
+
+template <typename scalar_t, typename index_t, ReductionType reduce>
+void spmm_reduce_arg_kernel_impl(
+    const Tensor& out,
+    const Tensor& arg_out,
+    const Tensor& crow_indices,
+    const Tensor& col_indices,
+    const Tensor& values,
+    const Tensor& other_) {
+
+  TORCH_CHECK(reduce == ReductionType::MAX || reduce == ReductionType::MIN);
+  int64_t nnz = values.numel();
+  if (nnz == 0) {
+    return;
+  }
+
+  auto other = other_.contiguous();
+
+  scalar_t* out_data = out.data_ptr<scalar_t>();
+  index_t* arg_out_data = arg_out.data_ptr<index_t>();
+  auto csr_data = crow_indices.accessor<index_t, 1>();
+  auto col_data = col_indices.accessor<index_t, 1>();
+  auto val_data = values.accessor<scalar_t, 1>();
+  scalar_t* other_data = other.data_ptr<scalar_t>();
+
+  int64_t M = crow_indices.numel() - 1;
+  int64_t K = other.size(-1);
+
+  at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) {
+    int64_t row_start, row_end, c;
+    for (const auto m : c10::irange(begin, end)) {
+      row_start = csr_data[m];
+      row_end = csr_data[m + 1];
+
+      scalar_t* out_ptr = out_data + m * K;
+      index_t* arg_out_ptr = arg_out_data + m * K;
+
+      if (row_end != row_start) {
+        init<scalar_t, reduce>(out_ptr, K, /*include_self*/false);
+        for (const auto e : c10::irange(row_start, row_end)) {
+          c = col_data[e];
+          scalar_t val = val_data[e];
+
+          scalar_t* other_ptr = other_data + c * K;
+          for (const auto k : c10::irange(K)) {
+            update_with_index<scalar_t, index_t, reduce>(
+                &out_ptr[k], val *  other_ptr[k], &arg_out_ptr[k], index_t(e));
+          };
+        }
+      }
+    }
+  });
+}
+
+template <typename scalar_t, typename index_t, ReductionType reduce>
+void spmm_reduce_backward_input_kernel_impl(
+    const Tensor& grad_self,
+    const Tensor& grad_out_,
+    const Tensor& crow_indices,
+    const Tensor& col_indices,
+    const Tensor& other_,
+    const Tensor& row_indices) {
+
+  int64_t nnz = grad_self._nnz();
+  if (nnz == 0) {
+    return;
+  }
+
+  auto grad_out = grad_out_.contiguous();
+  auto other = other_.contiguous();
+
+  auto values = grad_self.values();
+  auto grad_values_data = values.accessor<scalar_t, 1>();
+  scalar_t* grad_out_data = grad_out.data_ptr<scalar_t>();
+  auto crow_data = crow_indices.accessor<index_t, 1>();
+  auto col_data = col_indices.accessor<index_t, 1>();
+  scalar_t* other_data = other.data_ptr<scalar_t>();
+  auto row_data = row_indices.accessor<index_t, 1>();
+
+  int64_t K = grad_out.size(1);
+
+  using Vec = vec::Vectorized<vec::vec_scalar_t<scalar_t>>;
+  at::parallel_for(0, nnz, 1, [&](int64_t begin, int64_t end) {
+    for (const auto i : c10::irange(begin, end)) {
+      index_t row = row_data[i], col = col_data[i];
+
+      scalar_t val = vec::map2_reduce_all<scalar_t>(
+          [](Vec x, Vec y) { return x * y; },
+          [](Vec x, Vec y) { return x + y; },
+          other_data + col * K,
+          grad_out_data + row * K,
+          K);
+
+      if (reduce == ReductionType::MEAN) {
+        index_t row_start = crow_data[row], row_end = crow_data[row + 1];
+        val /= (row_end - row_start);
+      }
+
+      grad_values_data[i] = val;
+    }
+  });
+}
+
+// backward for reduce type 'amax' or 'amin'
+template <typename scalar_t, typename index_t>
+void spmm_reduce_backward_input_arg_kernel_impl(
+    const Tensor& grad_self,
+    const Tensor& grad_out_,
+    const Tensor& col_indices,
+    const Tensor& other_,
+    const Tensor& arg_out_) {
+
+  int64_t nnz = grad_self._nnz();
+  if (nnz == 0) {
+    return;
+  }
+
+  auto grad_out = grad_out_.contiguous();
+  auto other = other_.contiguous();
+  auto arg_out = arg_out_.contiguous();
+
+  auto grad_values = grad_self.values();
+  auto grad_values_data = grad_values.accessor<scalar_t, 1>();
+  scalar_t* grad_out_data = grad_out.data_ptr<scalar_t>();
+  auto col_data = col_indices.accessor<index_t, 1>();
+  scalar_t* other_data = other.data_ptr<scalar_t>();
+  index_t* arg_out_data = arg_out.data_ptr<index_t>();
+
+  int64_t M = grad_out.size(0);
+  int64_t K = grad_out.size(1);
+  auto grad = at::empty({M, K}, grad_out.options());
+  scalar_t* grad_data = grad.data_ptr<scalar_t>();
+
+  at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) {
+    for (const auto m : c10::irange(begin, end)) {
+      scalar_t* grad_out_ptr = grad_out_data + m * K;
+      scalar_t* grad_ptr = grad_data + m * K;
+      index_t* arg_out_ptr = arg_out_data + m * K;
+
+      for (const auto k : c10::irange(K)) {
+        if (arg_out_ptr[k] == index_t(nnz)) {
+          grad_ptr[k] = scalar_t(0);
+        } else {
+          // collect weight at max/min indices
+          index_t col = col_data[arg_out_data[m * K + k]];
+          grad_ptr[k] = other_data[col * K + k] * grad_out_ptr[k];
+        }
+      }
+    }
+  });
+
+  // scatter_add, consider to parallel this with atomic
+  for (const auto i : c10::irange(M * K)) {
+    index_t ind = arg_out_data[i];
+    if (ind != index_t(nnz)) {
+      grad_values_data[ind] += grad_data[i];
+    }
+  }
+}
+
+template <typename scalar_t, typename index_t>
+void spmm_reduce_normalize_values_kernel_impl(
+    const Tensor& normalized_values,
+    const Tensor& values,
+    const Tensor& crow_indices,
+    const Tensor& row_indices) {
+
+  int64_t nnz = values.numel();
+  if (nnz == 0) {
+    return;
+  }
+
+  auto normalized_values_data = normalized_values.accessor<scalar_t, 1>();
+  auto values_data = values.accessor<scalar_t, 1>();
+  auto crow_data = crow_indices.accessor<index_t, 1>();
+  auto row_data = row_indices.accessor<index_t, 1>();
+
+  at::parallel_for(0, nnz, 1, [&](int64_t begin, int64_t end) {
+    for (const auto i : c10::irange(begin, end)) {
+      index_t row = row_data[i];
+      index_t row_start = crow_data[row], row_end = crow_data[row + 1];
+      // Note that when the row index row is listed in row_indices,
+      // then crow_indices[row+1] > crow_indices[row] holds
+      normalized_values_data[i] = values_data[i] / (row_end - row_start);
+    }
+  });
+}
+
+template <typename scalar_t, typename index_t>
+void spmm_reduce_backward_other_arg_kernel_impl(
+    const Tensor& grad_other,
+    const Tensor& grad_out_,
+    const Tensor& col_indices,
+    const Tensor& values,
+    const Tensor& arg_out_) {
+
+  int64_t nnz = values.numel();
+  if (nnz == 0) {
+    return;
+  }
+
+  auto grad_out = grad_out_.contiguous();
+  auto arg_out = arg_out_.contiguous();
+
+  scalar_t* grad_other_data = grad_other.data_ptr<scalar_t>();
+  scalar_t* grad_out_data = grad_out.data_ptr<scalar_t>();
+  auto col_data = col_indices.accessor<index_t, 1>();
+  auto values_data = values.accessor<scalar_t, 1>();
+  index_t* arg_out_data = arg_out.data_ptr<index_t>();
+
+  int64_t M = grad_out.size(0);
+  int64_t K = grad_out.size(1);
+  auto grad = at::empty({M, K}, grad_out.options());
+  scalar_t* grad_data = grad.data_ptr<scalar_t>();
+
+  at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) {
+    for (const auto m : c10::irange(begin, end)) {
+      scalar_t* grad_out_ptr = grad_out_data + m * K;
+      scalar_t* grad_ptr = grad_data + m * K;
+      index_t* arg_out_ptr = arg_out_data + m * K;
+
+      for (const auto k : c10::irange(K)) {
+        if (arg_out_ptr[k] == index_t(nnz)) {
+          grad_ptr[k] = scalar_t(0);
+        } else {
+          grad_ptr[k] = values_data[arg_out_ptr[k]] * grad_out_ptr[k];
+        }
+      }
+    }
+  });
+
+  // scatter_add, consider to parallel this with atomic
+  for (const auto m : c10::irange(M)) {
+    for (const auto k : c10::irange(K)) {
+      index_t ind = arg_out_data[m * K + k];
+      if (ind != index_t(nnz)) {
+        index_t col = col_data[ind];
+        grad_other_data[col * K + k] += grad_data[m * K + k];
+      }
+    }
+  }
+}
+
+void spmm_reduce_kernel(
+    const Tensor& out,
+    const Tensor& crow_indices,
+    const Tensor& col_indices,
+    const Tensor& values,
+    const Tensor& other,
+    ReductionType reduce_op) {
+  AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, values.scalar_type(), "spmm_reduce_kernel", [&]() {
+    AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "spmm_reduce_indices", [&]() {
+      AT_DISPATCH_REDUCTION_TYPES(reduce_op, [&]() {
+        spmm_reduce_kernel_impl<scalar_t, index_t, reduce>(
+            out, crow_indices, col_indices, values, other);
+      });
+    });
+  });
+}
+
+void spmm_reduce_arg_kernel(
+    const Tensor& out,
+    const Tensor& arg_out,
+    const Tensor& crow_indices,
+    const Tensor& col_indices,
+    const Tensor& values,
+    const Tensor& other,
+    ReductionType reduce_op) {
+  AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, values.scalar_type(), "spmm_reduce_kernel", [&]() {
+    AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "spmm_reduce_indices", [&]() {
+      AT_DISPATCH_REDUCTION_TYPES(reduce_op, [&]() {
+        spmm_reduce_arg_kernel_impl<scalar_t, index_t, reduce>(
+            out, arg_out, crow_indices, col_indices, values, other);
+      });
+    });
+  });
+}
+
+void spmm_reduce_backward_input_kernel(
+    const Tensor& grad_self,
+    const Tensor& grad_out,
+    const Tensor& crow_indices,
+    const Tensor& col_indices,
+    const Tensor& other,
+    const Tensor& row_indices,
+    ReductionType reduce_op) {
+  TORCH_CHECK(reduce_op == ReductionType::SUM || reduce_op == ReductionType::MEAN);
+  AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, other.scalar_type(), "spmm_reduce_backward_input_kernel", [&]() {
+    AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "spmm_reduce_backward_input_indices", [&]() {
+      AT_DISPATCH_REDUCTION_TYPES(reduce_op, [&]() {
+        spmm_reduce_backward_input_kernel_impl<scalar_t, index_t, reduce>(
+            grad_self, grad_out, crow_indices, col_indices, other, row_indices);
+      });
+    });
+  });
+}
+
+void spmm_reduce_backward_input_arg_kernel(
+    const Tensor& grad_self,
+    const Tensor& grad_out,
+    const Tensor& col_indices,
+    const Tensor& other,
+    const Tensor& arg_out,
+    ReductionType reduce_op) {
+  TORCH_CHECK(reduce_op == ReductionType::MAX || reduce_op == ReductionType::MIN);
+  AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, other.scalar_type(), "spmm_reduce_backward_input_arg_kernel", [&]() {
+    AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "spmm_reduce_backward_input_arg_indices", [&]() {
+      spmm_reduce_backward_input_arg_kernel_impl<scalar_t, index_t>(
+          grad_self, grad_out, col_indices, other, arg_out);
+    });
+  });
+}
+
+void spmm_reduce_normalize_values_kernel(
+    const Tensor& normalized_values,
+    const Tensor& values,
+    const Tensor& crow_indices,
+    const Tensor& row_indices) {
+  AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, values.scalar_type(), "spmm_reduce_normalize_values_kernel", [&]() {
+    AT_DISPATCH_INDEX_TYPES(crow_indices.scalar_type(), "spmm_reduce_normalize_values_indices", [&]() {
+      spmm_reduce_normalize_values_kernel_impl<scalar_t, index_t>(
+          normalized_values, values, crow_indices, row_indices);
+    });
+  });
+}
+
+void spmm_reduce_backward_other_kernel(
+    const Tensor& grad_other,
+    const Tensor& grad_out,
+    const Tensor& crow_indices,
+    const Tensor& values,
+    const Tensor& row_indices,
+    const Tensor& ccol_indices,
+    const Tensor& csr2csc,
+    ReductionType reduce_op) {
+  TORCH_CHECK(reduce_op == ReductionType::SUM || reduce_op == ReductionType::MEAN);
+  // need to permute row_indices to CSC order
+  auto row = row_indices.index_select(0, csr2csc);
+
+  Tensor val;
+  if (reduce_op == ReductionType::MEAN) {
+    // for reduce type "mean", need to normalize the values
+    // with rowcount for each of the nonzero element.
+    Tensor normalized_values = at::empty(values.sizes(), values.options());
+    spmm_reduce_normalize_values_kernel(normalized_values, values, crow_indices, row_indices);
+    val = normalized_values.index_select(0, csr2csc);
+  } else {
+    val = values.index_select(0, csr2csc);
+  }
+
+  spmm_reduce_kernel(grad_other, ccol_indices, row, val, grad_out, ReductionType::SUM);
+}
+
+void spmm_reduce_backward_other_arg_kernel(
+    const Tensor& grad_other,
+    const Tensor& grad_out,
+    const Tensor& col_indices,
+    const Tensor& values,
+    const Tensor& arg_out,
+    ReductionType reduce_op) {
+  TORCH_CHECK(reduce_op == ReductionType::MAX || reduce_op == ReductionType::MIN);
+  AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, values.scalar_type(), "spmm_reduce_backward_other_arg_kernel", [&]() {
+    AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "spmm_reduce_backward_other_arg_indices", [&]() {
+      spmm_reduce_backward_other_arg_kernel_impl<scalar_t, index_t>(
+          grad_other, grad_out, col_indices, values, arg_out);
+    });
+  });
+}
+
+} // anonymous namespace
+
+REGISTER_DISPATCH(spmm_reduce_stub, &spmm_reduce_kernel);
+REGISTER_DISPATCH(spmm_reduce_arg_stub, &spmm_reduce_arg_kernel);
+REGISTER_DISPATCH(spmm_reduce_backward_input_stub, &spmm_reduce_backward_input_kernel);
+REGISTER_DISPATCH(spmm_reduce_backward_input_arg_stub, &spmm_reduce_backward_input_arg_kernel);
+REGISTER_DISPATCH(spmm_reduce_backward_other_stub, &spmm_reduce_backward_other_kernel);
+REGISTER_DISPATCH(spmm_reduce_backward_other_arg_stub, &spmm_reduce_backward_other_arg_kernel);
+
+}} // at::native
diff --git a/aten/src/ATen/native/cpu/SpmmReduceKernel.h b/aten/src/ATen/native/cpu/SpmmReduceKernel.h
new file mode 100644
index 000000000000..cbcbf3c63d99
--- /dev/null
+++ b/aten/src/ATen/native/cpu/SpmmReduceKernel.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/ReductionType.h>
+
+namespace at::native {
+
+using spmm_reduce_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
+using spmm_reduce_arg_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
+using spmm_reduce_backward_input_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
+using spmm_reduce_backward_input_arg_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
+using spmm_reduce_backward_other_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
+
+DECLARE_DISPATCH(spmm_reduce_fn, spmm_reduce_stub);
+DECLARE_DISPATCH(spmm_reduce_arg_fn, spmm_reduce_arg_stub);
+DECLARE_DISPATCH(spmm_reduce_backward_input_fn, spmm_reduce_backward_input_stub);
+DECLARE_DISPATCH(spmm_reduce_backward_input_arg_fn, spmm_reduce_backward_input_arg_stub);
+DECLARE_DISPATCH(spmm_reduce_backward_other_fn, spmm_reduce_backward_other_stub);
+DECLARE_DISPATCH(spmm_reduce_backward_input_arg_fn, spmm_reduce_backward_other_arg_stub);
+
+} // at::native
diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
index 50d946f98a0e..292c2e6b7ed5 100644
--- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
@@ -655,27 +655,32 @@ static void modified_bessel_k1_kernel(TensorIteratorBase& iterator) {
 
 // TODO: Disable cont. branch to test more risky code
 
-#define IMPLEMENT_ITERATOR_LAMBDA(op)                                         \
-          [&](char** data_, const int64_t* strides, int64_t n) {              \
-            scalar_t* out_data = reinterpret_cast<scalar_t*>(data_[0]);       \
-            scalar_t* in_data = reinterpret_cast<scalar_t*>(data_[1]);        \
-            int64_t out_stride = strides[0] / sizeof(scalar_t);               \
-            int64_t in_stride = strides[1] / sizeof(scalar_t);                \
-            if (out_stride == 1 && in_stride == 1) {                          \
-              vml::v##op(out_data, in_data, n);                               \
-            } else {                                                          \
-              static constexpr int64_t WIDTH = 131072 / sizeof(scalar_t);     \
-              for (int64_t i = 0; i < n; i += WIDTH) {                        \
-                scalar_t buffer[WIDTH];                                       \
-                int64_t width = WIDTH;                                        \
-                width = std::min(width, n - i);                               \
-                for (const auto j : c10::irange(width))\
-                  buffer[j] = in_data[in_stride * (i + j)];                   \
-                vml::v##op(buffer, buffer, width);                            \
-                for (const auto j : c10::irange(width))\
-                  out_data[out_stride * (i + j)] = buffer[j];                 \
-              }                                                               \
-            }                                                                 \
+#define IMPLEMENT_ITERATOR_LAMBDA(op)                                              \
+          [&](char** data_, const int64_t* strides, int64_t n) {                   \
+            scalar_t* out_data = reinterpret_cast<scalar_t*>(data_[0]);            \
+            scalar_t* in_data = reinterpret_cast<scalar_t*>(data_[1]);             \
+            int64_t out_stride = strides[0] / sizeof(scalar_t);                    \
+            int64_t in_stride = strides[1] / sizeof(scalar_t);                     \
+            if (out_stride == 1 && in_stride == 1) {                               \
+              vml::v##op(out_data, in_data, n);                                    \
+              return;                                                              \
+            }                                                                      \
+            static constexpr int64_t WIDTH = (8*1024) / sizeof(scalar_t);          \
+            for (int64_t i = 0; i < n; i += WIDTH) {                               \
+              scalar_t buffer[WIDTH];                                              \
+              const int64_t width = std::min(WIDTH, n - i);                        \
+              /* If either tensor is contiguous use it, otherwise copy into */     \
+              /* a contiguous buffer so compute can still be vectorized */         \
+              scalar_t * in_buffer = in_stride == 1 ? &in_data[i] : &buffer[0];    \
+              scalar_t * out_buffer = out_stride == 1 ? &out_data[i] : &buffer[0]; \
+              if (in_stride != 1)                                                  \
+                for (const auto j : c10::irange(width))                            \
+                  in_buffer[j] = in_data[in_stride * (i + j)];                     \
+              vml::v##op(out_buffer, in_buffer, width);                            \
+              if (out_stride != 1)                                                 \
+                for (const auto j : c10::irange(width))                            \
+                    out_data[out_stride * (i + j)] = out_buffer[j];                \
+            }                                                                      \
           }
 
 #define IMPLEMENT_FLOAT_KERNEL(op)                                                  \
@@ -683,9 +688,8 @@ static void modified_bessel_k1_kernel(TensorIteratorBase& iterator) {
   void op##_kernel(TensorIteratorBase& iter) {                                      \
     TORCH_INTERNAL_ASSERT(iter.ntensors() == 2);                                    \
     AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), #op "_vml_cpu", [&]() { \
-      iter.serial_for_each(                                                         \
-          IMPLEMENT_ITERATOR_LAMBDA(op),                                            \
-          {0, iter.numel()});                                                       \
+      constexpr int64_t grain_size = 2048;                                          \
+      iter.for_each(IMPLEMENT_ITERATOR_LAMBDA(op), grain_size);                     \
     });                                                                             \
     iter.cast_outputs();                                                            \
   }                                                                                 \
@@ -697,9 +701,8 @@ static void modified_bessel_k1_kernel(TensorIteratorBase& iterator) {
   void op##_kernel(TensorIteratorBase& iter) {                                                   \
     TORCH_INTERNAL_ASSERT(iter.ntensors() == 2);                                                 \
     AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kBFloat16, iter.dtype(), #op "_vml_cpu", [&]() { \
-      iter.serial_for_each(                                                                      \
-          IMPLEMENT_ITERATOR_LAMBDA(op),                                                         \
-          {0, iter.numel()});                                                                    \
+        constexpr int64_t grain_size = 2048;                                                     \
+        iter.for_each(IMPLEMENT_ITERATOR_LAMBDA(op), grain_size);                                \
     });                                                                                          \
     iter.cast_outputs();                                                                         \
   }                                                                                              \
diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
index 7b8bd9ad65d3..1f471d495df7 100644
--- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp
+++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
@@ -8,6 +8,7 @@
 #include <ATen/native/UpSample.h>
 #include <ATen/native/cpu/utils.h>
 #include <c10/util/irange.h>
+#include <ATen/native/cpu/UpSampleKernelAVXAntialias.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -22,12 +23,53 @@ namespace {
 
 using scale_t = std::vector<c10::optional<double>>;
 
+// TODO: this file could benefit from a global renaming of its functions /
+// classes and terms, as well as from adding more comments. In particular:
+// - It's not obvious that despite their names (and the file name), all these
+//   kernels don't just do upsampling: they do general interpolation, i.e. they
+//   also all support downscaling.
+// - the term "horizontal" or "within dims" or "contiguous dim" refers to the
+//   last dimension.
+//   It's not specific to 2D images and applies to 3D (and 1D??) inputs as well.
+//   Similarly "vertical" or "across dims" refers to all dims that aren't the
+//   last one. In other kernels these are also referred to as "zero-stride" and
+//   "non-zero-stride" - we should unify all this.
+// - the terms "zero-stride" and "non-zero strides" refer to the weights and
+//   indices, not to the contiguity of input or output
+// - It's not always clear which kernel is vectorized and which one isn't.
+// - The functions like _use_vectorized_kernel_cond() should be renamed and
+//   their description updated, because they're not the only "fork" in the
+//   code-path where a choice is made between a vectorized kernel vs a
+//   non-vectorized one. See e.g. upsample_bilinear2d_kernel_impl() where we
+//   already make a similar check, before the one in
+//   _use_vectorized_kernel_cond().
+// - It's not always clear which code is part of a "separable interpolation"
+//   code-path.
+// - Some names need to be more specific. For example
+//   "cpu_upsample_generic_aa()" looks like a super generic name, but the function
+//   is instead fairly specific - we need to make that clearer.
+// - Some functions have a "aa" suffix but it doesn't mean that they only
+//   support antialias. Some of them also support antialias=False now.
+// - Various comments are outdated. Case in point: the one just below about the
+//   `Interpolate` struct being used for cpu_upsample_linear:
+//   cpu_upsample_linear doesn't exist anymore, and these structs are used for
+//   various modes, *not* just linear.
+// - It'd be useful to document how interpolation works in general, and in particular state explicitly:
+//   - that the weights and indices across a given dimension are the same for
+//     all pixels (hence the benefit of pre-computing them)
+//   - that it can be "separated", i.e. we can do the horizontal pass and the
+//     vertical pass independently (and that some kernels are written this way,
+//     while some aren't.)
+// - we can probably remove the template over index_t, because it's always
+//   hard-coded as int64_t
+
+
 // Helper structs and methods for cpu_upsample_linear
 //
 // Interpolation methods that used below are separable, and as such we can compute the interpolation
 // independently per dimension in a recursive way. Please, refer to #10482 for more context.
 //
-// Linear Interpolation structure to compute output value in n-dimensional case.
+// Interpolation structure to compute output value in n-dimensional case.
 // - recursively compute interpolated output for each dimension
 // - we rely a lot on compiler's code optimization such that implemented operations
 //   can be automatically factorized and vectorized using SSE and AVX2
@@ -255,48 +297,129 @@ static inline void basic_loop(char** data, const int64_t* strides, int64_t n) {
   }
 }
 
-template <typename scalar_t, typename index_t>
-static inline void basic_loop_aa_single_dim_zero_strides(
+template <typename scalar_t>
+static inline void basic_loop_aa_vertical(
     char** data,
     const int64_t* strides,
-    int64_t n) {
+    int64_t n,
+    unsigned int weights_precision) {
   char* dst = data[0];
   char* src = data[1];
   // index stride is constant for the given dimension
-  const index_t ids_stride = *(index_t*)&data[2 + 2][0];
+  const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
 
   for (const auto i : c10::irange(n)) {
     *(scalar_t*)&dst[i * strides[0]] =
-        interpolate_aa_single_dim_zero_strides<scalar_t, index_t>(
+        interpolate_aa_single_dim_zero_strides<scalar_t, int64_t>(
             src + i * strides[1], &data[2], ids_stride);
   }
 }
 
-template <typename scalar_t, typename index_t>
-static inline void basic_loop_aa_single_dim_nonzero_strides(
+template <>
+inline void basic_loop_aa_vertical<uint8_t>(
     char** data,
     const int64_t* strides,
-    int64_t n) {
+    int64_t n,
+    unsigned int weights_precision) {
+  // See Note [ Weights computation for uint8_t and multiplication trick ]
   char* dst = data[0];
   char* src = data[1];
+
   // index stride is constant for the given dimension
-  const index_t ids_stride = *(index_t*)&data[2 + 2][0];
+  const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
+  const int64_t ids_size = *(int64_t*)&data[2 + 1][0];
+  const int64_t ids_min = *(int64_t*)&data[2 + 0][0];
+
+  int64_t i = 0;
+
+  for (; i<n; i++) {
+
+    char* src_min = src + i * strides[1] + ids_min;
+
+    uint8_t t = *(uint8_t*)&src_min[0];
+    int64_t wts_idx = *(int64_t*)&data[2 + 4][0];
+    int16_t* wts_ptr = (int16_t*)&data[2 + 3][wts_idx];
+    int16_t wts = wts_ptr[0];
+
+    // Intermediate computations are using integer type
+    int output = 1 << (weights_precision - 1);  // accounts for the +0.5 part
+    output += t * wts;
+    for (const auto j : c10::irange(1, ids_size)) {
+      wts = wts_ptr[j];
+      t = *(uint8_t*)&src_min[j * ids_stride];
+      output += t * wts;
+    }
+    *(uint8_t*)&dst[i * strides[0]] = (uint8_t)std::clamp(output >> weights_precision, 0, 255);
+  }
+}
+
+template <typename scalar_t>
+static inline void basic_loop_aa_horizontal(
+    char** data,
+    const int64_t* strides,
+    int64_t n,
+    unsigned int weights_precision) {
+  char* dst = data[0];
+  char* src = data[1];
+  // index stride is constant for the given dimension
+  const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
 
   if (strides[1] == 0) {
     for (const auto i : c10::irange(n)) {
       *(scalar_t*)&dst[i * strides[0]] =
-          interpolate_aa_single_dim<scalar_t, index_t>(
+          interpolate_aa_single_dim<scalar_t, int64_t>(
               src, &data[2], &strides[2], i, ids_stride);
     }
   } else {
     for (const auto i : c10::irange(n)) {
       *(scalar_t*)&dst[i * strides[0]] =
-          interpolate_aa_single_dim<scalar_t, index_t>(
+          interpolate_aa_single_dim<scalar_t, int64_t>(
               src + i * strides[1], &data[2], &strides[2], i, ids_stride);
     }
   }
 }
 
+template <>
+inline void basic_loop_aa_horizontal<uint8_t>(
+    char** data,
+    const int64_t* strides,
+    int64_t n,
+    unsigned int weights_precision) {
+  // See Note [ Weights computation for uint8_t and multiplication trick ]
+  char* dst = data[0];
+  char* src = data[1];
+  // index stride is constant for the given dimension
+  const int64_t ids_stride = *(int64_t*)&data[2 + 2][0];
+
+  int64_t i = 0;
+
+  // Here we are implementing data interpolation within the same line (vs between the lines)
+  // output[x, y] = input[xmin[x], y] * W[x] + input[xmin[x] + 1, y] * W[x + 1] + ... + input[xmin[x] + xsize, y] * W[x + xsize]
+
+  for (; i<n; i++) {
+
+    int64_t ids_min = *(int64_t*)&data[2 + 0][i * strides[2 + 0]];
+    int64_t ids_size = *(int64_t*)&data[2 + 1][i * strides[2 + 1]];
+
+    char* src_min = src + i * strides[1] + ids_min;
+
+    uint8_t t = *(uint8_t*)&src_min[0];
+    int64_t wts_idx = *(int64_t*)&data[2 + 4][i * strides[2 + 4]];
+    int16_t* wts_ptr = (int16_t*)&data[2 + 3][wts_idx];
+    int16_t wts = wts_ptr[0];
+
+    // Intermediate computations are using integer type
+    int output = 1 << (weights_precision - 1);  // accounts for the +0.5 part
+    output += t * wts;
+    for (const auto j : c10::irange(1, ids_size)) {
+      wts = wts_ptr[j];
+      t = *(uint8_t*)&src_min[j * ids_stride];
+      output += t * wts;
+    }
+    *(uint8_t*)&dst[i * strides[0]] = (uint8_t)std::clamp(output >> weights_precision, 0, 255);
+  }
+}
+
 // Generic upsampling computation method using TensorIterator for Nd case.
 // Supports: nearest, linear, cubic modes with interp_size template argument: 1, 2, 4
 //
@@ -621,21 +744,23 @@ struct HelperInterpBase {
   template <typename scalar_t, typename aa_filter_fn_t>
   static inline void _compute_weights_aa(
     const int64_t i, const int64_t input_size, const scalar_t scale, const scalar_t support,
-    scalar_t* wt_ptr, const int64_t interp_size, aa_filter_fn_t filter_fn,
-    int64_t& xmin, int64_t& xsize
+    scalar_t* wt_ptr, const int64_t max_interp_size, aa_filter_fn_t filter_fn,
+    int64_t& xmin, int64_t& xsize, bool antialias, double align_corners_delta
   ) {
 
-    scalar_t center = scale * (i + 0.5);
+    // align_corners_delta is 0.5 for uint8 and align_corners=true and antialias=false
+    //                     is 0.0 otherwise
+    scalar_t center = scale * (i + 0.5 - align_corners_delta);
     scalar_t total_w = 0.0;
-    scalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0;
+    scalar_t invscale = (scale >= 1.0 && antialias) ? 1.0 / scale : 1.0;
     xmin = std::max(
-        static_cast<int64_t>(center - support + 0.5), static_cast<int64_t>(0));
-    xsize = std::min(static_cast<int64_t>(center + support + 0.5), input_size) -
-        xmin;
+        static_cast<int64_t>(center - support + 0.5 + align_corners_delta), static_cast<int64_t>(0));
+    xsize = std::min(
+        static_cast<int64_t>(center + support + 0.5 + align_corners_delta), input_size) - xmin;
 
     int64_t j = 0;
     for (; j < xsize; j++) {
-      scalar_t w = filter_fn((j + xmin - center + 0.5) * invscale);
+      scalar_t w = filter_fn((j + xmin - center + 0.5 - align_corners_delta) * invscale);
       wt_ptr[j] = w;
       total_w += w;
     }
@@ -644,23 +769,39 @@ struct HelperInterpBase {
         wt_ptr[j] /= total_w;
       }
     }
-    for (; j < interp_size; j++) {
+    for (; j < max_interp_size; j++) {
       wt_ptr[j] = static_cast<scalar_t>(0.0);
     }
   }
 
-  template <typename scalar_t, typename aa_filter_fn_t>
-  static inline std::vector<Tensor> _compute_indices_weights_aa(
+  // Note [ Support for antialias=False as a subcase of antilias=True ]
+  // This function was originally written with the hard assumption that
+  // antialias=True (hence the aa in the name). It was later extended to support
+  // antialias=False. The only difference between aa and no-aa is in how the
+  // weights and indices are computed (and their number). In aa their number is
+  // variable but with no-aa, they're fixed to interp_size. The same "filters"
+  // can be used otherwise. HOWEVER, support for antialias=False here may not be
+  // optimally optimized: the code assumes an arbitrary number of weights and
+  // indices, but this can be optimized further when aa=False since we know
+  // their actual dimensions.
+  template <typename scalar_t, typename aa_filter_fn_t, int weight_index_stride=sizeof(scalar_t)>
+  static inline std::tuple<std::vector<Tensor>, int> _compute_indices_weights_aa(
     int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims,
     int64_t reshape_dim, scalar_t scale,
-    int interp_size, aa_filter_fn_t aa_filter_fn
+    int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, double align_corners_delta
   ) {
 
     std::vector<Tensor> output;
 
-    scalar_t support =
-        (scale >= 1.0) ? (interp_size * 0.5) * scale : interp_size * 0.5;
-    interp_size = (int)ceilf(support) * 2 + 1;
+    scalar_t support;
+    int max_interp_size;
+    if (antialias) {
+        support = (scale >= 1.0) ? (interp_size * 0.5) * scale : interp_size * 0.5;
+        max_interp_size = (int) std::ceil(support) * 2 + 1;
+    } else {
+        support = interp_size * 0.5;
+        max_interp_size = interp_size;
+    }
 
     auto new_shape = std::vector<int64_t>(ndims, 1);
     new_shape[reshape_dim] = output_size;
@@ -675,7 +816,7 @@ struct HelperInterpBase {
 
     {
       // Weights
-      new_shape[reshape_dim] = output_size * interp_size;
+      new_shape[reshape_dim] = output_size * max_interp_size;
       auto wts = empty(new_shape, CPU(c10::CppTypeToScalarType<scalar_t>()));
       auto strides = wts.strides().vec();
       strides[reshape_dim] = 0;
@@ -701,20 +842,130 @@ struct HelperInterpBase {
           input_size,
           scale,
           support,
-          wt_ptr + i * interp_size,
-          interp_size,
+          wt_ptr + i * max_interp_size,
+          max_interp_size,
           aa_filter_fn,
           xmin,
-          xmax);
+          xmax,
+          antialias,
+          align_corners_delta);
 
       idx_ptr_xmin[i] = xmin * stride;
       idx_ptr_size[i] = xmax;
       idx_ptr_stride[i] = stride;
-      wt_idx_ptr[i] = i * interp_size * sizeof(scalar_t);
+      wt_idx_ptr[i] = i * max_interp_size * weight_index_stride;
     }
-    return output;
+    return {output, max_interp_size};
   }
 
+  /*
+  NOTE [ Weights computation for uint8_t and multiplication trick ]
+  When the input/output dtype is uint8_t, we still compute the interpolation
+  weights as double, but then convert them to int16 via some conversion logic
+  detailed below. This allows us to compute all interpolation operation (sum of
+  multiplications) as ints instead of floats. The result is converted back into
+  uint8 in basic_loop_aa_horizontal<uint8_t> (and vertical)
+
+  In essence the idea is to avoid a multiplication between a float (the
+  weight) and an int (the pixel value) and instead run a multpilication between
+  2 ints:
+
+  ```py
+  COEF_PREC = 16
+
+  def mul(a:float, b:int) -> Tuple[float, int]:
+    # return a * b, round(a * b)
+    actual = a * b
+
+    assert a > 0  # I'm lazy
+    int_a = floor(0.5 + a * (1 << COEF_PREC))
+    with_trick = ((int_a * b) + (1 << (COEF_PREC - 1))) >> COEF_PREC
+
+    return actual, with_trick  # round(actual) == with_trick!!
+  ```
+
+  Here's how it works:
+  N == COEFF_PREC
+  1 << N == 2**N
+  floor(0.5 + x) == round(x)
+
+  So the operation is something like
+
+  int_a = round(a * 2**N)  -- let's just say it's `a * 2**N` for simplicity
+
+  res = ((int_a * b) + (1 << (N - 1))) >> N
+      = ((a * 2**N * b + 2**(N - 1)) / 2**N
+      = a * b + 0.5
+      = round(a * b)
+      = what we wanted
+  */
+  template <typename aa_filter_fn_t>
+  static inline std::tuple<std::vector<Tensor>, int, unsigned int> _compute_indices_int16_weights_aa(
+    int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims,
+    int64_t reshape_dim, bool align_corners, const c10::optional<double> opt_scale,
+    int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, bool align_i32=false
+  ) {
+
+    double scale = area_pixel_compute_scale<double>(
+        input_size, output_size, align_corners, opt_scale);
+
+    std::vector<Tensor> indices_weights;
+    auto align_corners_delta = (align_corners && !antialias) ? 0.5 : 0.0;
+    std::tie(indices_weights, interp_size) = HelperInterpBase::_compute_indices_weights_aa<double, aa_filter_fn_t, sizeof(int16_t)>(
+        input_size, output_size, stride, ndims, reshape_dim, scale, interp_size, aa_filter_fn, antialias, align_corners_delta);
+
+    // Rescale float weights to int16 and compute weights precision
+    auto weights_f64 = indices_weights[3];
+    double * data_f64 = weights_f64.data_ptr<double>();
+    int64_t weights_f64_size = output_size * interp_size;
+    // can't use weights_f64.max() here as tensor is restrided
+    double w_max = data_f64[0];
+    for (const auto i : c10::irange(weights_f64_size)) {
+        double v = data_f64[i];
+        if (w_max < v) {
+            w_max = v;
+        }
+    }
+
+    unsigned int weights_precision = 0;
+    for (weights_precision = 0; weights_precision < 22; weights_precision += 1) {
+        int next_value = (int) (0.5 + w_max * (1 << (weights_precision + 1)));
+        if (next_value >= (1 << 15))
+            break;
+    }
+
+    // Rescale float values to int16
+    int16_t * data_i16 = (int16_t *) data_f64;
+    auto aligned_interp_size = interp_size;
+
+    if (align_i32) {
+      // We should respect int32 alignment as
+      // we will load data as int32 with AVX2
+      // See ImagingResampleHorizontalConvolution8u4x, mmk0 = _mm256_set1_epi32(*(int32_t*)&k[x]);
+      // compute aligned_interp_size = nearest pair value to interp_size
+      while (aligned_interp_size % sizeof(int32_t) != 0) {
+        aligned_interp_size += 1;
+      }
+      // assert that we wont go out of bounds
+      TORCH_INTERNAL_ASSERT(aligned_interp_size * sizeof(int16_t) < interp_size * sizeof(double));
+    }
+
+    for (const auto j : c10::irange(output_size)) {
+      for (const auto k : c10::irange(interp_size)) {
+        double v = data_f64[j * interp_size + k];
+        if (v < 0) {
+            data_i16[j * aligned_interp_size + k] = (int) (-0.5 + v * (1 << weights_precision));
+        } else {
+            data_i16[j * aligned_interp_size + k] = (int) (0.5 + v * (1 << weights_precision));
+        }
+      }
+    }
+
+    return {indices_weights, aligned_interp_size, weights_precision};
+  }
+
+
+
 };
 
 struct HelperInterpNearest : public HelperInterpBase {
@@ -923,8 +1174,9 @@ struct HelperInterpLinear : public HelperInterpBase {
             input_size, output_size, align_corners, opt_scale);
 
         auto interp_size = HelperInterpLinear::interp_size;
+        int unused;
 
-        indices_weights = HelperInterpLinear::_compute_indices_weights_aa<scalar_t>(
+        std::tie(indices_weights, unused) = HelperInterpLinear::_compute_indices_weights_aa<scalar_t>(
             input_size,
             output_size,
             stride,
@@ -932,11 +1184,32 @@ struct HelperInterpLinear : public HelperInterpBase {
             reshape_dim,
             scale,
             interp_size,
-            &HelperInterpLinear::aa_filter<scalar_t>);
+            &HelperInterpLinear::aa_filter<scalar_t>,
+            /*antialias=*/true,
+            /*align_corners_delta=*/0.0);
       }
     );
     return indices_weights;
   }
+
+  static inline std::tuple<std::vector<Tensor>, int, unsigned int> compute_indices_int16_weights_aa(
+    int64_t input_size,
+    int64_t output_size,
+    int64_t stride,
+    int64_t ndims,
+    int64_t reshape_dim,
+    bool align_corners,
+    const c10::optional<double> opt_scale,
+    bool antialias,
+    bool align_i32=false
+  ) {
+
+    auto interp_size = HelperInterpLinear::interp_size;
+    auto fn = HelperInterpLinear::aa_filter<double>;
+    return HelperInterpLinear::_compute_indices_int16_weights_aa(
+        input_size, output_size, stride, ndims, reshape_dim,
+        align_corners, opt_scale, interp_size, fn, antialias, align_i32);
+  }
 };
 
 struct HelperInterpCubic : public HelperInterpBase {
@@ -1033,8 +1306,9 @@ struct HelperInterpCubic : public HelperInterpBase {
             input_size, output_size, align_corners, opt_scale);
 
         auto interp_size = HelperInterpCubic::interp_size;
+        int unused;
 
-        indices_weights = HelperInterpCubic::_compute_indices_weights_aa<scalar_t>(
+        std::tie(indices_weights, unused) = HelperInterpCubic::_compute_indices_weights_aa<scalar_t>(
             input_size,
             output_size,
             stride,
@@ -1042,11 +1316,14 @@ struct HelperInterpCubic : public HelperInterpBase {
             reshape_dim,
             scale,
             interp_size,
-            &HelperInterpCubic::aa_filter<scalar_t>);
+            &HelperInterpCubic::aa_filter<scalar_t>,
+            /*antialias=*/true,
+            /*align_corners_delta*/0.0);
       }
     );
     return indices_weights;
   }
+
 };
 
 // Generic upsampling interpolation kernel for N-d case.
@@ -1133,31 +1410,50 @@ void upsample_generic_Nd_kernel_impl(
   }
 }
 
-template <typename scalar_t>
-void cpu_upsample_generic_aa(at::TensorIterator& iter) {
+template <typename scalar_t, bool is_horizontal>
+void cpu_upsample_generic_aa(at::TensorIterator& iter, unsigned int weights_precision) {
 
   auto loop = [&](char** data, const int64_t* strides, int64_t n) {
-    if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) &&
-        is_zero_stride<3 + 2>(&strides[2])) {
-      basic_loop_aa_single_dim_zero_strides<scalar_t, int64_t>(
-          data, strides, n);
+    if (is_horizontal) {
+
+      // Strides are : X 0 | 8 8 8 0 8  (Channels first)
+      // Strides are : X X | 0 0 0 0 0  (Channels last)
+      // upsampling data within a contiguous dimension (aka horizontal resampling)
+      if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) &&
+          is_zero_stride<3 + 2>(&strides[2])) {
+        // channels last case
+        basic_loop_aa_horizontal<scalar_t>(
+            data, strides, n, weights_precision);
+      } else {
+        basic_loop_aa_horizontal<scalar_t>(
+            data, strides, n, weights_precision);
+      }
     } else {
-      basic_loop_aa_single_dim_nonzero_strides<scalar_t, int64_t>(
-          data, strides, n);
+      // Strides are : X Y | 0 0 0 0 0 (Channels first)
+      // Strides are : X X | 0 0 0 0 0 (Channels last)
+      // upsampling data between contiguous dimensions (aka vertical resampling)
+      if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) &&
+          is_zero_stride<3 + 2>(&strides[2])) {
+        basic_loop_aa_vertical<scalar_t>(
+            data, strides, n, weights_precision);
+      } else {
+        basic_loop_aa_vertical<scalar_t>(
+            data, strides, n, weights_precision);
+      }
     }
   };
 
   iter.for_each(loop);
 }
 
-// Generic separable upsampling interpolation kernels for N-d case with anti-aliasing
-template <int out_ndims, typename scale_type, class F>
+template <int out_ndims, typename scale_type, class F, bool is_horizontal>
 void _separable_upsample_generic_Nd_kernel_impl_single_dim(
     const Tensor& output,
     const Tensor& input,
     int interp_dim,
     bool align_corners,
-    const scale_type& scales) {
+    const scale_type& scales,
+    bool antialias) {
 
   // input can be NCHW, NCL or NCKHW
   auto shape = input.sizes().vec();
@@ -1174,21 +1470,29 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim(
   strides[interp_dim] = 0;
   auto restrided_input = input.as_strided(shape, strides);
 
-  std::vector<std::vector<Tensor>> indices_weights;
-
-  int interp_size = F::interp_size;
   auto input_scalar_type = input.scalar_type();
-  if (interp_size == 1 && input_scalar_type == at::ScalarType::Byte) {
-    // nearest also supports uint8 tensor, but we have to use float
-    // with compute_indices_weights
-    input_scalar_type = at::ScalarType::Float;
-  }
 
-  indices_weights.emplace_back(
+  std::vector<Tensor> indices_weights;
+  unsigned int weights_precision = 0;
+  int unused;
+
+  if (input_scalar_type == at::kByte) {
+    std::tie(indices_weights, unused, weights_precision) =
+      // TODO: change that to F:: once / if bicubic mode supports uint8 after all
+      HelperInterpLinear::compute_indices_int16_weights_aa(
+        input.size(interp_dim), oshape[interp_dim],
+        input.stride(interp_dim) * input.element_size(),
+        input.dim(), interp_dim, align_corners, scales[interp_dim - 2],
+        antialias);
+    TORCH_INTERNAL_ASSERT(weights_precision > 0);
+  } else {
+    TORCH_INTERNAL_ASSERT(antialias);
+    indices_weights =
       F::compute_indices_weights_aa(
         input_scalar_type, input.size(interp_dim), oshape[interp_dim],
         input.stride(interp_dim) * input.element_size(),
-        input.dim(), interp_dim, align_corners, scales[interp_dim - 2]));
+        input.dim(), interp_dim, align_corners, scales[interp_dim - 2]);
+  }
 
   TensorIteratorConfig config;
   config.check_all_same_dtype(false)
@@ -1196,51 +1500,95 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim(
       .add_output(output)
       .add_input(restrided_input);
 
-  for (auto& idx_weight : indices_weights) {
-    for (auto& tensor : idx_weight) {
-      config.add_input(tensor);
-    }
+  for (auto& tensor : indices_weights) {
+    config.add_input(tensor);
   }
 
   auto iter = config.build();
 
-  if (interp_size > 1) {
-    // Nearest also supports uint8 tensor, so need to handle it separately
-    AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "upsample_generic_Nd_aa", [&] {
-      cpu_upsample_generic_aa<scalar_t>(iter);
-    });
-  } else {
-    AT_DISPATCH_FLOATING_TYPES_AND(
-        at::ScalarType::Byte, iter.dtype(), "upsample_generic_Nd_aa", [&] {
-          cpu_upsample_generic_aa<scalar_t>(iter);
-        });
-  }
+  AT_DISPATCH_FLOATING_TYPES_AND(
+      at::ScalarType::Byte, iter.dtype(), "upsample_generic_Nd_aa", [&] {
+        cpu_upsample_generic_aa<scalar_t, is_horizontal>(iter, weights_precision);
+      });
 }
 
+// Generic separable upsampling interpolation kernel for N-d case with anti-aliasing.
+// It also supports antialias=False iff
+// (dtype == uint8 and mode in ("bilinear", "bicubic")): this is used as
+// fallback in these settings when AVX isn't supported.
 template <int out_ndims, typename scale_type, class F>
 void separable_upsample_generic_Nd_kernel_impl(
     const Tensor& output,
     const Tensor& input,
     bool align_corners,
-    const scale_type& scales) {
+    const scale_type& scales,
+    bool antialias) {
+
+  auto output_shape = output.sizes();
+  auto input_shape = input.sizes();
+  auto temp_oshape = input_shape.vec();
+
+  if (output_shape == input_shape) {
+    output.copy_(input);
+    return;
+  }
 
-  auto temp_oshape = input.sizes().vec();
   at::Tensor temp_output, temp_input = input;
-  for (const auto i : c10::irange(out_ndims - 1)) {
-    int interp_dim = 2 + out_ndims - 1 - i;
-    temp_oshape[interp_dim] = output.sizes()[interp_dim];
-    temp_output = at::empty(temp_oshape, input.options().memory_format(input.suggest_memory_format()));
+
+  int interp_dim = 0;
+  // Precompute the number of single dim resize method invocations
+  // to avoid copying temporary buffer to output
+  int num_single_dim_ops = 0;
+  for (const auto i : c10::irange(out_ndims)) {
+    interp_dim = 2 + out_ndims - 1 - i;
+    if (output_shape[interp_dim] != input_shape[interp_dim]) {
+      num_single_dim_ops += 1;
+    }
+  }
+
+  // upsampling data within the contiguous dimension (aka horizontal resampling)
+  interp_dim = 2 + out_ndims - 1;
+  if (output_shape[interp_dim] != input_shape[interp_dim]) {
+
+    num_single_dim_ops -= 1;
+    if (num_single_dim_ops > 0) {
+      temp_oshape[interp_dim] = output_shape[interp_dim];
+      temp_output = at::empty(temp_oshape, input.options());
+    } else {
+      temp_output = output;
+    }
+
     _separable_upsample_generic_Nd_kernel_impl_single_dim<
         out_ndims,
         scale_t,
-        F>(
-        temp_output, temp_input, interp_dim, align_corners, scales);
+        F,
+        true>(
+        temp_output, temp_input, interp_dim, align_corners, scales, antialias);
     temp_input = temp_output;
   }
-  _separable_upsample_generic_Nd_kernel_impl_single_dim<
-      out_ndims,
-      scale_t,
-      F>(output, temp_input, 2, align_corners, scales);
+
+  // upsampling data between contiguous dimensions (aka vertical resampling)
+  for (const auto i : c10::irange(1, out_ndims)) {
+    interp_dim = 2 + out_ndims - 1 - i;
+    if (output_shape[interp_dim] != input_shape[interp_dim]) {
+
+      num_single_dim_ops -= 1;
+      if (num_single_dim_ops > 0) {
+        temp_oshape[interp_dim] = output_shape[interp_dim];
+        temp_output = at::empty(temp_oshape, input.options());
+      } else {
+        temp_output = output;
+      }
+
+      _separable_upsample_generic_Nd_kernel_impl_single_dim<
+          out_ndims,
+          scale_t,
+          F,
+          false>(
+          temp_output, temp_input, interp_dim, align_corners, scales, antialias);
+      temp_input = temp_output;
+    }
+  }
 }
 
 void upsample_nearest1d_kernel_impl(
@@ -1356,7 +1704,8 @@ void upsample_linear1d_kernel_impl(
     output, input, align_corners, {scales_w});
 }
 
-void upsample_bilinear2d_kernel_impl(
+
+void upsample_bilinear2d_kernel_impl_float(
     const Tensor& output,
     const Tensor& input,
     bool align_corners,
@@ -1378,15 +1727,56 @@ void upsample_bilinear2d_kernel_impl(
   }
 }
 
-void upsample_bilinear2d_aa_kernel_impl(
+void upsample_bilinear2d_kernel_impl(
     const Tensor& output,
     const Tensor& input,
     bool align_corners,
     c10::optional<double> scales_h,
     c10::optional<double> scales_w) {
 
+  if (input.dtype() == at::kByte){
+    #ifdef CPU_CAPABILITY_AVX2
+      if (input.size(1) <= 4) {
+        upsample_avx_bilinear_uint8<scale_t, HelperInterpLinear>(input,
+          output, align_corners, {scales_h, scales_w},
+          /*antialias=*/false);
+      } else {
+        separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
+          output, input, align_corners, {scales_h, scales_w},
+          /*antialias=*/false);
+      }
+    #else  // CPU_CAPABILITY_AVX2
+      separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
+        output, input, align_corners, {scales_h, scales_w},
+        /*antialias=*/false);
+    #endif  // CPU_CAPABILITY_AVX2
+  } else {
+    upsample_bilinear2d_kernel_impl_float(output, input, align_corners, scales_h, scales_w);
+  }
+}
+
+
+void upsample_bilinear2d_aa_kernel_impl(
+    const Tensor& output,
+    const Tensor& input,
+    bool align_corners,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w) {
+#ifdef CPU_CAPABILITY_AVX2
+  if (input.dtype() == at::kByte && input.size(1) <= 4) {
+    upsample_avx_bilinear_uint8<scale_t, HelperInterpLinear>(
+      input, output, align_corners, {scales_h, scales_w},
+      /*antialias=*/true);
+  } else {
+    separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
+        output, input, align_corners, {scales_h, scales_w},
+        /*antialias=*/true);
+  }
+#else // CPU_CAPABILITY_AVX2
   separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
-      output, input, align_corners, {scales_h, scales_w});
+      output, input, align_corners, {scales_h, scales_w},
+      /*antialias=*/true);
+#endif // CPU_CAPABILITY_AVX2
 }
 
 void upsample_trilinear3d_kernel_impl(
@@ -1424,7 +1814,8 @@ void upsample_bicubic2d_aa_kernel_impl(
     c10::optional<double> scales_w) {
 
   separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpCubic>(
-      output, input, align_corners, {scales_h, scales_w});
+    output, input, align_corners, {scales_h, scales_w},
+    /*antialias=*/true);
 }
 
 template <
@@ -1500,7 +1891,9 @@ void cpu_upsample_genNd_backward_aa(
           interp_height,
           filter_fn,
           ymin,
-          ysize);
+          ysize,
+          /*antialias=*/true,
+          /*align_corners_delta=*/0.0);
 
       for (const auto ow : c10::irange(output_width)) {
         F::_compute_weights_aa(
@@ -1512,7 +1905,9 @@ void cpu_upsample_genNd_backward_aa(
             interp_width,
             filter_fn,
             xmin,
-            xsize);
+            xsize,
+            /*antialias=*/true,
+            /*align_corners_delta=*/0.0);
 
         for (const auto c : c10::irange(begin, end)) {
           scalar_t grad_output_value =
diff --git a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h
new file mode 100644
index 000000000000..e8239cf6b86c
--- /dev/null
+++ b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h
@@ -0,0 +1,719 @@
+/*
+The Python Imaging Library (PIL) is
+
+    Copyright © 1997-2011 by Secret Labs AB
+    Copyright © 1995-2011 by Fredrik Lundh
+
+Pillow is the friendly PIL fork. It is
+
+    Copyright © 2010-2022 by Alex Clark and contributors
+
+Like PIL, Pillow is licensed under the open source HPND License
+*/
+
+// This code is heavily inspired from PILLOW-SIMD's implementation:
+// https://github.com/uploadcare/pillow-simd/blob/simd/master/src/libImaging/Resample.c
+
+#pragma once
+#ifdef CPU_CAPABILITY_AVX2
+// TODO: This file only supports AVX2. We could split the AVX kernels into
+// smaller logical blocks in order to port them into the Vec.h logic. This would
+// allow to support other vectorization architectures and perhaps also support
+// the non-vectorized fallback (we'd need to make sure it's not slower than the
+// current fallback).
+
+#include <ATen/core/Tensor.h>
+#include <ATen/cpu/vec/intrinsics.h>
+#include <c10/util/irange.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/empty.h>
+#endif
+
+
+namespace {
+
+static __m128i inline mm_cvtepu8_epi32(const uint32_t* C10_RESTRICT ptr) {
+  return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)ptr));
+}
+
+// TODO: We may want to hard-code an unrolled version for the case where
+// num_channels=3 to hint the compiler to vectorize this (looks at original
+// PIL-SIMD's code).
+at::Tensor unpack_rgb(const at::Tensor& packed_tensor) {
+  // Convert a "packed" tensor (typically RGBRGBRGB if channels_last) into
+  // RGBARGBARGBA format where A is hard-coded to 255. Each pixel is encoded
+  // into as 32bits. This generalizes to num_channels <= 4 and also works for
+  // non-channels_last tensors.
+
+  const uint8_t* packed = (const uint8_t*)packed_tensor.data_ptr<uint8_t>();
+  auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2);
+  auto num_channels = packed_tensor.size(0);
+
+  constexpr int rgba_size = 4;
+  auto unpacked_tensor = at::empty({rgba_size, packed_tensor.size(1), packed_tensor.size(2)}, at::CPU(at::kByte));
+  uint8_t* unpacked = (uint8_t*) unpacked_tensor.data_ptr<uint8_t>();
+
+  auto stride_i = packed_tensor.stride(2);
+  auto stride_j = packed_tensor.stride(0);
+
+  for (const auto i : c10::irange(num_pixels)) {
+    for (const auto j : c10::irange(rgba_size)) {
+      unpacked[rgba_size * i + j] = (j < num_channels) ? packed[stride_i * i + stride_j * j] : 0;
+    }
+  }
+  return unpacked_tensor;
+}
+
+void pack_rgb(
+    const at::Tensor& unpacked_tensor, // IN
+    const at::Tensor& packed_tensor // OUT
+) {
+  constexpr int rgba_size = 4;
+  uint8_t* unpacked = (uint8_t*)unpacked_tensor.data_ptr<uint8_t>();
+  uint8_t* packed = (uint8_t*)packed_tensor.data_ptr<uint8_t>();
+  auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2);
+  auto num_channels = packed_tensor.size(0);
+
+  auto packed_increment = packed_tensor.stride(2);
+  auto packed_stride = packed_tensor.stride(0);
+
+  for (const auto i C10_UNUSED : c10::irange(num_pixels)) {
+    for (const auto j : c10::irange(num_channels)) {
+      packed[j * packed_stride] = unpacked[j];
+    }
+    unpacked += rgba_size;
+    packed += packed_increment;
+  }
+}
+
+void ImagingResampleHorizontalConvolution8u4x(
+    uint32_t* C10_RESTRICT lineOut0,
+    uint32_t* C10_RESTRICT lineOut1,
+    uint32_t* C10_RESTRICT lineOut2,
+    uint32_t* C10_RESTRICT lineOut3,
+    const uint32_t* C10_RESTRICT lineIn0,
+    const uint32_t* C10_RESTRICT lineIn1,
+    const uint32_t* C10_RESTRICT lineIn2,
+    const uint32_t* C10_RESTRICT lineIn3,
+    int xsize,
+    int* xbounds,
+    int16_t* kk,
+    int kmax,
+    int coefs_precision);
+
+void ImagingResampleHorizontalConvolution8u(
+    uint32_t* C10_RESTRICT lineOut,
+    const uint32_t* C10_RESTRICT lineIn,
+    int xsize,
+    int* xbounds,
+    int16_t* kk,
+    int kmax,
+    int coefs_precision);
+
+void ImagingResampleVerticalConvolution8u(
+    uint32_t* C10_RESTRICT lineOut,
+    const uint32_t* C10_RESTRICT imIn,
+    int xmin,
+    int xmax,
+    int16_t* k,
+    int coefs_precision,
+    int xin);
+
+void ImagingResampleHorizontal(
+    const at::Tensor & unpacked_output,
+    const at::Tensor & unpacked_input,
+    int ksize,
+    const std::vector<at::Tensor>& horiz_indices_weights,
+    unsigned int horiz_weights_precision) {
+  // TODO: we may want to merge that into the fallback code (currently called
+  // basic_loop_aa_horizontal<uint8_t>)
+  // Although this may not be needed if / when we port all this code to use
+  // Vec.h since this would potentially give us another fall-back implem
+  int yy;
+
+  int16_t* kk = (int16_t*)(horiz_indices_weights[3].data_ptr<double>());
+
+  auto xout = unpacked_output.size(2);
+  auto yout = unpacked_output.size(1);
+  auto xin = unpacked_input.size(2);
+
+  std::vector<int> bounds_vec(2 * xout, 0);
+  int* bounds = bounds_vec.data();
+
+  int64_t* idx_ptr_xmin = horiz_indices_weights[0].data_ptr<int64_t>();
+  int64_t* idx_ptr_size = horiz_indices_weights[1].data_ptr<int64_t>();
+  for (int i = 0; i < xout; i++) {
+    bounds[2 * i + 0] = idx_ptr_xmin[i];
+    bounds[2 * i + 1] = idx_ptr_size[i];
+  }
+
+  uint32_t* unpacked_input_p = (uint32_t*) unpacked_input.data_ptr<uint8_t>();
+  uint32_t* unpacked_output_p = (uint32_t*) unpacked_output.data_ptr<uint8_t>();
+
+  yy = 0;
+  for (; yy < yout - 3; yy += 4) {
+    ImagingResampleHorizontalConvolution8u4x(
+        unpacked_output_p + yy * xout,
+        unpacked_output_p + (yy + 1) * xout,
+        unpacked_output_p + (yy + 2) * xout,
+        unpacked_output_p + (yy + 3) * xout,
+        unpacked_input_p + yy * xin,
+        unpacked_input_p + (yy + 1) * xin,
+        unpacked_input_p + (yy + 2) * xin,
+        unpacked_input_p + (yy + 3) * xin,
+        xout,
+        bounds,
+        kk,
+        ksize,
+        (int)horiz_weights_precision);
+  }
+  for (; yy < yout; yy++) {
+    ImagingResampleHorizontalConvolution8u(
+        unpacked_output_p + yy * xout,
+        unpacked_input_p + yy * xin,
+        xout,
+        bounds,
+        kk,
+        ksize,
+        (int)horiz_weights_precision);
+  }
+}
+
+void ImagingResampleVertical(
+    const at::Tensor & unpacked_output,
+    const at::Tensor & unpacked_input,
+    int ksize,
+    const std::vector<at::Tensor>& vert_indices_weights,
+    unsigned int vert_weights_precision) {
+  // TODO: we may want to merge that into the fallback code (currently called
+  // basic_loop_aa_vertical<uint8_t>)
+  // Although this may not be needed if / when we port all this code to use
+  // Vec.h since this would potentially give us another fall-back implem
+  int ymin, ymax;
+  int16_t* k = nullptr;
+  int16_t* kk = (int16_t*)(vert_indices_weights[3].data_ptr<double>());
+
+  int64_t* idx_ptr_xmin = vert_indices_weights[0].data_ptr<int64_t>();
+  int64_t* idx_ptr_size = vert_indices_weights[1].data_ptr<int64_t>();
+
+  uint32_t* unpacked_output_p = (uint32_t*) unpacked_output.data_ptr<uint8_t>();
+  uint32_t* unpacked_input_p = (uint32_t*) unpacked_input.data_ptr<uint8_t>();
+
+  auto xout = unpacked_output.size(2);
+  auto yout = unpacked_output.size(1);
+
+  for (const auto yy : c10::irange(yout)) {
+    k = &kk[yy * ksize];
+
+    ymin = idx_ptr_xmin[yy];
+    ymax = idx_ptr_size[yy];
+    ImagingResampleVerticalConvolution8u(
+        unpacked_output_p + yy * xout,
+        unpacked_input_p,
+        ymin,
+        ymax,
+        k,
+        (int)vert_weights_precision,
+        xout);
+  }
+}
+
+// This is the only public entry point in this file.  It supports bilinear
+// mode for uint8 dtype when C <= 4, with or without antialias. The
+// implem is based on PIL-SIMD.
+// Its equivalent implementation (fallback) for when AVX isn't supported or when
+// C > 4 is separable_upsample_generic_Nd_kernel_impl()  There are a bunch of
+// future improvement that can be done: look for the TODOs in this file.
+// For details on how the weights are computed and how the multiplications are
+// run on int (instead of float weights), see
+// [ Weights computation for uint8_t and multiplication trick ]
+// For details on how the AVX kernels are implemented, see
+// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
+// See also [ Support for antialias=False as a subcase of antilias=True ] to
+// learn more about how the antialias=False case is computed. The same holds
+// here: all these kernels are general enough to handle an arbitrary number of
+// weights, but when aa=False they could be optimized further.
+template <typename scale_type, class F>
+void upsample_avx_bilinear_uint8(
+    const at::Tensor& input,
+    const at::Tensor& output,
+    bool align_corners,
+    const scale_type& scales,
+    bool antialias) {
+  auto batch_size = input.size(0);
+  auto num_channels = input.size(1);
+  auto xin = input.size(3);
+  auto yin = input.size(2);
+  auto xout = output.size(3);
+  auto yout = output.size(2);
+
+  if (xin == xout && yin == yout) {
+    output.copy_(input);
+    return;
+  }
+
+  auto need_horizontal = xout != xin;
+  auto need_vertical = yout != yin;
+
+  int ksize_horiz, ksize_vert;
+  std::vector<at::Tensor> horiz_indices_weights, vert_indices_weights;
+  unsigned int horiz_weights_precision, vert_weights_precision;
+
+  if (need_horizontal) {
+    int interp_dim = 3;
+    std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) =
+        F::compute_indices_int16_weights_aa(
+            /*input_size=*/xin,
+            /*output_size=*/xout,
+            /*stride=*/1,
+            /*ndims=*/4,
+            /*reshape_dim=*/interp_dim,
+            /*align_corners=*/align_corners,
+            /*opt_scale=*/scales[interp_dim - 2],
+            /*antialias=*/antialias,
+            /*align_i32=*/true);
+  }
+
+  if (need_vertical) {
+    int interp_dim = 2;
+    std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) =
+        F::compute_indices_int16_weights_aa(
+            /*input_size=*/yin,
+            /*output_size=*/yout,
+            /*stride=*/1,
+            /*ndims=*/4,
+            /*reshape_dim=*/interp_dim,
+            /*align_corners=*/align_corners,
+            /*opt_scale=*/scales[interp_dim - 2],
+            /*antialias=*/antialias,
+            /*align_i32=*/true);
+  }
+
+  bool is_rgba = num_channels == 4 && input.is_contiguous(at::MemoryFormat::ChannelsLast);
+
+  at::Tensor buffer_horiz, buffer_vert;
+  if (need_horizontal && !(is_rgba && !need_vertical)) {
+    buffer_horiz = at::empty({4, yin, xout}, input.options());
+  }
+  if (need_vertical && !is_rgba) {
+    buffer_vert = at::empty({4, yout, xout}, input.options());
+  }
+
+  // TODO: The unpack / pack operations create a copy of the original input and
+  // output tensor. There should be a way to avoid these copies by instead
+  // modifying the low-level kernels. Or maybe at least avoid copying the entire
+  // tensors and just copy part of them (line by line).
+  for (const auto i : c10::irange(batch_size)) {
+
+    at::Tensor unpacked_input = (is_rgba) ? input[i] : unpack_rgb(input[i]);
+    at::Tensor unpacked_output;
+
+    if (need_horizontal) {
+
+      at::Tensor unpacked_output_temp = (is_rgba && !need_vertical) ? output[i] : buffer_horiz;
+
+      ImagingResampleHorizontal(
+          unpacked_output_temp,
+          unpacked_input,
+          ksize_horiz,
+          horiz_indices_weights,
+          horiz_weights_precision);
+      unpacked_output = unpacked_input = unpacked_output_temp;
+    }
+    if (need_vertical) {
+      unpacked_output = (is_rgba) ? output[i] : buffer_vert;
+
+      ImagingResampleVertical(
+          unpacked_output,
+          unpacked_input,
+          ksize_vert,
+          vert_indices_weights,
+          vert_weights_precision);
+    }
+
+    TORCH_INTERNAL_ASSERT(unpacked_output.defined());
+
+    if (!is_rgba) {
+      pack_rgb(unpacked_output, output[i]);
+    }
+  }
+}
+
+// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
+void ImagingResampleHorizontalConvolution8u4x(
+    uint32_t* C10_RESTRICT lineOut0,
+    uint32_t* C10_RESTRICT lineOut1,
+    uint32_t* C10_RESTRICT lineOut2,
+    uint32_t* C10_RESTRICT lineOut3,
+    const uint32_t* C10_RESTRICT lineIn0,
+    const uint32_t* C10_RESTRICT lineIn1,
+    const uint32_t* C10_RESTRICT lineIn2,
+    const uint32_t* C10_RESTRICT lineIn3,
+    int xsize,
+    int* xbounds,
+    int16_t* kk,
+    int kmax,
+    int coefs_precision) {
+  int xmin, xmax, x;
+  int16_t* k;
+
+  for (const auto xx : c10::irange(xsize)) {
+    xmin = xbounds[xx * 2 + 0];
+    xmax = xbounds[xx * 2 + 1];
+    k = &kk[xx * kmax];
+    x = 0;
+
+    __m256i sss0, sss1;
+    __m256i zero = _mm256_setzero_si256();
+    __m256i initial = _mm256_set1_epi32(1 << (coefs_precision - 1));
+    sss0 = initial;
+    sss1 = initial;
+
+    for (; x < xmax - 3; x += 4) {
+      __m256i pix, mmk0, mmk1, source;
+
+      mmk0 = _mm256_set1_epi32(*(int32_t*)&k[x]);
+      mmk1 = _mm256_set1_epi32(*(int32_t*)&k[x + 2]);
+
+      source = _mm256_inserti128_si256(
+          _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lineIn0[x + xmin])),
+          _mm_loadu_si128((__m128i*)&lineIn1[x + xmin]),
+          1);
+      // clang-format off
+      pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+        -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
+        -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
+      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk0));
+      pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+        -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
+        -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
+      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk1));
+
+      source = _mm256_inserti128_si256(
+          _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lineIn2[x + xmin])),
+          _mm_loadu_si128((__m128i*)&lineIn3[x + xmin]),
+          1);
+      pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+        -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
+        -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
+      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk0));
+      pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+        -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
+        -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
+      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk1));
+    }
+
+    for (; x < xmax - 1; x += 2) {
+      __m256i pix, mmk;
+
+      mmk = _mm256_set1_epi32(*(int32_t*)&k[x]);
+
+      pix = _mm256_inserti128_si256(
+          _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lineIn0[x + xmin])),
+          _mm_loadl_epi64((__m128i*)&lineIn1[x + xmin]),
+          1);
+      pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
+        -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
+        -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
+      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
+
+      pix = _mm256_inserti128_si256(
+          _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lineIn2[x + xmin])),
+          _mm_loadl_epi64((__m128i*)&lineIn3[x + xmin]),
+          1);
+      pix = _mm256_shuffle_epi8(pix, _mm256_set_epi8(
+        -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
+        -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
+      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
+      // clang-format on
+    }
+
+    for (; x < xmax; x++) {
+      __m256i pix, mmk;
+
+      // [16] xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0 xx k0
+      mmk = _mm256_set1_epi32(k[x]);
+
+      // [16] xx a0 xx b0 xx g0 xx r0 xx a0 xx b0 xx g0 xx r0
+      pix = _mm256_inserti128_si256(
+          _mm256_castsi128_si256(mm_cvtepu8_epi32(&lineIn0[x + xmin])),
+          mm_cvtepu8_epi32(&lineIn1[x + xmin]),
+          1);
+      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
+
+      pix = _mm256_inserti128_si256(
+          _mm256_castsi128_si256(mm_cvtepu8_epi32(&lineIn2[x + xmin])),
+          mm_cvtepu8_epi32(&lineIn3[x + xmin]),
+          1);
+      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
+    }
+
+    sss0 = _mm256_srai_epi32(sss0, coefs_precision);
+    sss1 = _mm256_srai_epi32(sss1, coefs_precision);
+    sss0 = _mm256_packs_epi32(sss0, zero);
+    sss1 = _mm256_packs_epi32(sss1, zero);
+    sss0 = _mm256_packus_epi16(sss0, zero);
+    sss1 = _mm256_packus_epi16(sss1, zero);
+    lineOut0[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 0));
+    lineOut1[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1));
+    lineOut2[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 0));
+    lineOut3[xx] = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1));
+  }
+}
+
+// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
+void ImagingResampleHorizontalConvolution8u(
+    uint32_t* C10_RESTRICT lineOut,
+    const uint32_t* C10_RESTRICT lineIn,
+    int xsize,
+    int* xbounds,
+    int16_t* kk,
+    int kmax,
+    int coefs_precision) {
+  int xmin, xmax, x;
+  int16_t* k;
+
+  for (const auto xx : c10::irange(xsize)) {
+    __m128i sss;
+    xmin = xbounds[xx * 2 + 0];
+    xmax = xbounds[xx * 2 + 1];
+    k = &kk[xx * kmax];
+    x = 0;
+
+    if (xmax < 8) {
+      sss = _mm_set1_epi32(1 << (coefs_precision - 1));
+    } else {
+      // Lower part will be added to higher, use only half of the error
+      __m256i sss256 = _mm256_set1_epi32(1 << (coefs_precision - 2));
+
+      for (; x < xmax - 7; x += 8) {
+        __m256i pix, mmk, source;
+        __m128i tmp = _mm_loadu_si128((__m128i*)&k[x]);
+        __m256i ksource =
+            _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);
+
+        // clang-format off
+        source = _mm256_loadu_si256((__m256i*)&lineIn[x + xmin]);
+        pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+          -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0,
+          -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
+        mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
+          11,10, 9,8, 11,10, 9,8, 11,10, 9,8, 11,10, 9,8,
+          3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
+        sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
+
+        pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+          -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
+          -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8));
+        mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
+          15,14, 13,12, 15,14, 13,12, 15,14, 13,12, 15,14, 13,12,
+          7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4));
+        sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
+        // clang-format on
+      }
+
+      for (; x < xmax - 3; x += 4) {
+        __m256i pix, mmk, source;
+        __m128i tmp = _mm_loadl_epi64((__m128i*)&k[x]);
+        __m256i ksource =
+            _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);
+
+        tmp = _mm_loadu_si128((__m128i*)&lineIn[x + xmin]);
+        source = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);
+
+        // clang-format off
+        pix = _mm256_shuffle_epi8(source, _mm256_set_epi8(
+          -1,15, -1,11, -1,14, -1,10, -1,13, -1,9, -1,12, -1,8,
+          -1,7, -1,3, -1,6, -1,2, -1,5, -1,1, -1,4, -1,0));
+        mmk = _mm256_shuffle_epi8(ksource, _mm256_set_epi8(
+          7,6, 5,4, 7,6, 5,4, 7,6, 5,4, 7,6, 5,4,
+          3,2, 1,0, 3,2, 1,0, 3,2, 1,0, 3,2, 1,0));
+        sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
+        // clang-format on
+      }
+
+      sss = _mm_add_epi32(
+          _mm256_extracti128_si256(sss256, 0),
+          _mm256_extracti128_si256(sss256, 1));
+    }
+
+    for (; x < xmax - 1; x += 2) {
+      __m128i mmk = _mm_set1_epi32(*(int32_t*)&k[x]);
+      __m128i source = _mm_loadl_epi64((__m128i*)&lineIn[x + xmin]);
+      __m128i pix = _mm_shuffle_epi8(
+          source,
+          _mm_set_epi8(-1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0));
+      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+    }
+
+    for (; x < xmax; x++) {
+      __m128i pix = mm_cvtepu8_epi32(&lineIn[x + xmin]);
+      __m128i mmk = _mm_set1_epi32(k[x]);
+      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+    }
+    sss = _mm_srai_epi32(sss, coefs_precision);
+    sss = _mm_packs_epi32(sss, sss);
+    lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
+  }
+}
+
+// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
+void ImagingResampleVerticalConvolution8u(
+    uint32_t* C10_RESTRICT lineOut,
+    const uint32_t* C10_RESTRICT imIn,
+    int xmin,
+    int xmax,
+    int16_t* k,
+    int coefs_precision,
+    int xin) {
+  int x;
+  int xx = 0;
+  int xsize = xin;
+
+  __m128i initial = _mm_set1_epi32(1 << (coefs_precision - 1));
+  __m256i initial_256 = _mm256_set1_epi32(1 << (coefs_precision - 1));
+
+  for (; xx < xsize - 7; xx += 8) {
+    __m256i sss0 = initial_256;
+    __m256i sss1 = initial_256;
+    __m256i sss2 = initial_256;
+    __m256i sss3 = initial_256;
+    x = 0;
+    for (; x < xmax - 1; x += 2) {
+      __m256i source, source1, source2;
+      __m256i pix, mmk;
+
+      // Load two coefficients at once
+      mmk = _mm256_set1_epi32(*(int32_t*)&k[x]);
+
+      // Load 2 lines
+      //                           (__m256i *) &imIn->image32[x + xmin][xx]
+      source1 = _mm256_loadu_si256((__m256i*)(imIn + (x + xmin) * xin + xx));
+      //                           (__m256i *) &imIn->image32[x + 1 + xmin][xx]
+      source2 =
+          _mm256_loadu_si256((__m256i*)(imIn + (x + 1 + xmin) * xin + xx));
+
+      source = _mm256_unpacklo_epi8(source1, source2);
+      pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
+      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
+      pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
+      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
+
+      source = _mm256_unpackhi_epi8(source1, source2);
+      pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
+      sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
+      pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
+      sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
+    }
+    for (; x < xmax; x += 1) {
+      __m256i source, source1, pix, mmk;
+      mmk = _mm256_set1_epi32(k[x]);
+
+      //                           (__m256i *) &imIn->image32[x + xmin][xx])
+      source1 = _mm256_loadu_si256((__m256i*)(imIn + (x + xmin) * xin + xx));
+
+      source = _mm256_unpacklo_epi8(source1, _mm256_setzero_si256());
+      pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
+      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
+      pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
+      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix, mmk));
+
+      source = _mm256_unpackhi_epi8(source1, _mm256_setzero_si256());
+      pix = _mm256_unpacklo_epi8(source, _mm256_setzero_si256());
+      sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix, mmk));
+      pix = _mm256_unpackhi_epi8(source, _mm256_setzero_si256());
+      sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix, mmk));
+    }
+    sss0 = _mm256_srai_epi32(sss0, coefs_precision);
+    sss1 = _mm256_srai_epi32(sss1, coefs_precision);
+    sss2 = _mm256_srai_epi32(sss2, coefs_precision);
+    sss3 = _mm256_srai_epi32(sss3, coefs_precision);
+
+    sss0 = _mm256_packs_epi32(sss0, sss1);
+    sss2 = _mm256_packs_epi32(sss2, sss3);
+    sss0 = _mm256_packus_epi16(sss0, sss2);
+    _mm256_storeu_si256((__m256i*)&lineOut[xx], sss0);
+  }
+
+  for (; xx < xsize - 1; xx += 2) {
+    __m128i sss0 = initial; // left row
+    __m128i sss1 = initial; // right row
+    x = 0;
+    for (; x < xmax - 1; x += 2) {
+      __m128i source, source1, source2;
+      __m128i pix, mmk;
+
+      // Load two coefficients at once
+      mmk = _mm_set1_epi32(*(int32_t*)&k[x]);
+
+      // Load 2 lines
+      //                        (__m128i *) &imIn->image32[x + xmin][xx])
+      source1 = _mm_loadl_epi64((__m128i*)(imIn + (x + xmin) * xin + xx));
+      //                        (__m128i *) &imIn->image32[x + 1 + xmin][xx]
+      source2 = _mm_loadl_epi64((__m128i*)(imIn + (x + 1 + xmin) * xin + xx));
+
+      source = _mm_unpacklo_epi8(source1, source2);
+      pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+      sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
+      pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
+      sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
+    }
+    for (; x < xmax; x += 1) {
+      __m128i source, source1, pix, mmk;
+      mmk = _mm_set1_epi32(k[x]);
+
+      //                        (__m128i *) &imIn->image32[x + xmin][xx]);
+      source1 = _mm_loadl_epi64((__m128i*)(imIn + (x + xmin) * xin + xx));
+
+      source = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
+      pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+      sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
+      pix = _mm_unpackhi_epi8(source, _mm_setzero_si128());
+      sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
+    }
+    sss0 = _mm_srai_epi32(sss0, coefs_precision);
+    sss1 = _mm_srai_epi32(sss1, coefs_precision);
+
+    sss0 = _mm_packs_epi32(sss0, sss1);
+    sss0 = _mm_packus_epi16(sss0, sss0);
+    _mm_storel_epi64((__m128i*)&lineOut[xx], sss0);
+  }
+
+  for (; xx < xsize; xx++) {
+    __m128i sss = initial;
+    x = 0;
+    for (; x < xmax - 1; x += 2) {
+      __m128i source, source1, source2;
+      __m128i pix, mmk;
+
+      // Load two coefficients at once
+      mmk = _mm_set1_epi32(*(int32_t*)&k[x]);
+
+      // Load 2 lines
+      //                           *(int *) &imIn->image32[x + xmin][xx]
+      source1 = _mm_cvtsi32_si128(*(int*)(imIn + (x + xmin) * xin + xx));
+      //                          *(int *) &imIn->image32[x + 1 + xmin][xx]
+      source2 = _mm_cvtsi32_si128(*(int*)(imIn + (x + 1 + xmin) * xin + xx));
+
+      source = _mm_unpacklo_epi8(source1, source2);
+      pix = _mm_unpacklo_epi8(source, _mm_setzero_si128());
+      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+    }
+
+    for (; x < xmax; x++) {
+      //                             &imIn->image32[x + xmin][xx]
+      __m128i pix = mm_cvtepu8_epi32(imIn + (x + xmin) * xin + xx);
+      __m128i mmk = _mm_set1_epi32(k[x]);
+      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+    }
+    sss = _mm_srai_epi32(sss, coefs_precision);
+    sss = _mm_packs_epi32(sss, sss);
+    lineOut[xx] = _mm_cvtsi128_si32(_mm_packus_epi16(sss, sss));
+  }
+}
+
+} // anonymous namespace
+#endif // CPU_CAPABILITY_AVX2
diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp
index 22d612461b84..3171f3ff04fe 100644
--- a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp
+++ b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp
@@ -192,7 +192,316 @@ void LayerNormKernelImpl(
   });
 }
 
-template <typename T>
+template <typename T, typename T2, typename T_ACC>
+void layer_norm_backward_frame(
+    const T* dY_data,
+    const T* X_data,
+    const T2* mean_data,
+    const T2* rstd_data,
+    const T2* gamma_data,
+    T* dX_data,
+    T* dgamma_buffer_ptr,
+    T* dbeta_buffer_ptr,
+    const T_ACC scale,
+    const bool gamma_null,
+    const bool dX_null,
+    const bool dgamma_null,
+    const bool dbeta_null,
+    int64_t N,
+    int64_t i) {
+  using Vec = vec::Vectorized<T_ACC>;
+  const T* dY_ptr = dY_data + i * N;
+  const T* X_ptr = X_data + i * N;
+  if (!dgamma_null) {
+    const T_ACC a = rstd_data[i];
+    const T_ACC b = -a * mean_data[i];
+    // Scalar math:
+    // for (const auto j : c10::irange(N)) {
+    //   dgamma_data[j] += dY_ptr[j] * (a * X_ptr[j] + b);
+    // }
+    vec::map3<T>(
+        [a, b](Vec dgamma, Vec dy, Vec x) {
+          return dgamma + dy * (Vec(a) * x + Vec(b));
+        },
+        dgamma_buffer_ptr,
+        dgamma_buffer_ptr,
+        dY_ptr,
+        X_ptr,
+        N);
+  }
+  if (!dbeta_null) {
+    // Scalar math:
+    // for (const auto j : c10::irange(N)) {
+    //   dbeta_data[j] += dY_ptr[j];
+    // }
+    vec::map2<T>(
+        [](Vec dbeta, Vec dy) { return dbeta + dy; },
+        dbeta_buffer_ptr,
+        dbeta_buffer_ptr,
+        dY_ptr,
+        N);
+  }
+  if (!dX_null) {
+    T* dX_ptr = dX_data + i * N;
+    T_ACC ds = T_ACC(0);
+    T_ACC db = T_ACC(0);
+    // Scalar math:
+    // for (const auto j : c10::irange(N)) {
+    //   const T gamma_v = gamma_null ? T(1) : gamma_data[j];
+    //   ds += dY_ptr[j] * X_ptr[j] * gamma_v;
+    //   db += dY_ptr[j] * gamma_v;
+    // }
+    if (gamma_null) {
+      ds = vec::map2_reduce_all<T>(
+          [](Vec x, Vec y) { return x * y; },
+          [](Vec x, Vec y) { return x + y; },
+          dY_ptr,
+          X_ptr,
+          N);
+      db = vec::reduce_all<T>(
+          [](Vec& x, Vec& y) { return x + y; }, dY_ptr, N);
+    } else {
+      ds = vec::map3_reduce_all<T>(
+          [](Vec x, Vec y, Vec z) { return x * y * z; },
+          [](Vec x, Vec y) { return x + y; },
+          dY_ptr,
+          X_ptr,
+          gamma_data,
+          N);
+      db = vec::map2_reduce_all<T>(
+          [](Vec x, Vec y) { return x * y; },
+          [](Vec x, Vec y) { return x + y; },
+          dY_ptr,
+          gamma_data,
+          N);
+    }
+    const T_ACC a = rstd_data[i];
+    const T_ACC b = (db * mean_data[i] - ds) * a * a * a * scale;
+    const T_ACC c = -b * mean_data[i] - db * a * scale;
+    // Scalar math:
+    // for (const auto j : c10::irange(N)) {
+    //   const T gamma_v = gamma_null ? T(1) : gamma_data[j];
+    //   dX_ptr[j] = a * dY_ptr[j] * gamma_v + b * X_ptr[j] + c;
+    // }
+    if (gamma_null) {
+      vec::map2<T>(
+          [a, b, c](Vec dy, Vec x) {
+            return Vec(a) * dy + Vec(b) * x + Vec(c);
+          },
+          dX_ptr,
+          dY_ptr,
+          X_ptr,
+          N);
+    } else {
+      vec::map3<T>(
+          [a, b, c](Vec dy, Vec gamma, Vec x) {
+            return Vec(a) * dy * gamma + Vec(b) * x + Vec(c);
+          },
+          dX_ptr,
+          dY_ptr,
+          gamma_data,
+          X_ptr,
+          N);
+    }
+  }
+}
+
+template <>
+void layer_norm_backward_frame<BFloat16, float, float>(
+    const BFloat16* dY_data,
+    const BFloat16* X_data,
+    const float* mean_data,
+    const float* rstd_data,
+    const float* gamma_data,
+    BFloat16* dX_data,
+    BFloat16* dgamma_buffer_ptr,
+    BFloat16* dbeta_buffer_ptr,
+    const float scale,
+    const bool gamma_null,
+    const bool dX_null,
+    const bool dgamma_null,
+    const bool dbeta_null,
+    int64_t N,
+    int64_t i) {
+  using bVec = Vectorized<BFloat16>;
+  using fVec = Vectorized<float>;
+  const BFloat16* dY_ptr = dY_data + i * N;
+  const BFloat16* X_ptr = X_data + i * N;
+  if (!dgamma_null) {
+    const float a = rstd_data[i];
+    const float b = -a * mean_data[i];
+    // Scalar math:
+    // for (const auto j : c10::irange(N)) {
+    //   dgamma_data[j] += dY_ptr[j] * (a * X_ptr[j] + b);
+    // }
+    vec::map3<BFloat16>(
+        [a, b](fVec dgamma, fVec dy, fVec x) {
+          return dgamma + dy * (fVec(a) * x + fVec(b));
+        },
+        dgamma_buffer_ptr,
+        dgamma_buffer_ptr,
+        dY_ptr,
+        X_ptr,
+        N);
+  }
+  if (!dbeta_null) {
+    // Scalar math:
+    // for (const auto j : c10::irange(N)) {
+    //   dbeta_data[j] += dY_ptr[j];
+    // }
+    vec::map2<BFloat16>(
+        [](fVec dbeta, fVec dy) { return dbeta + dy; },
+        dbeta_buffer_ptr,
+        dbeta_buffer_ptr,
+        dY_ptr,
+        N);
+  }
+  if (!dX_null) {
+    BFloat16* dX_ptr = dX_data + i * N;
+    float ds = float(0);
+    float db = float(0);
+    // Scalar math:
+    // for (const auto j : c10::irange(N)) {
+    //   const T gamma_v = gamma_null ? T(1) : gamma_data[j];
+    //   ds += dY_ptr[j] * X_ptr[j] * gamma_v;
+    //   db += dY_ptr[j] * gamma_v;
+    // }
+    if (gamma_null) {
+      ds = vec::map2_reduce_all<BFloat16>(
+          [](fVec x, fVec y) { return x * y; },
+          [](fVec x, fVec y) { return x + y; },
+          dY_ptr,
+          X_ptr,
+          N);
+      db = vec::reduce_all<BFloat16>(
+          [](fVec& x, fVec& y) { return x + y; }, dY_ptr, N);
+    } else {
+      if (N < bVec::size()) {
+        bVec x_bvec = bVec::loadu(X_ptr, N);
+        bVec dy_bvec = bVec::loadu(dY_ptr, N);
+        fVec x_fvec0, x_fvec1, dy_fvec0, dy_fvec1, gamma_fvec0, gamma_fvec1;
+        std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec);
+        std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec);
+        std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data, N);
+        if (N > fVec::size()) {
+          fVec db_fvec0 = dy_fvec0 * gamma_fvec0;
+          fVec db_fvec1 = dy_fvec1 * gamma_fvec1;
+          fVec ds_fvec0 = x_fvec0 * db_fvec0;
+          fVec ds_fvec1 = x_fvec1 * db_fvec1;
+          ds_fvec0 = fVec::set(ds_fvec0, ds_fvec0 + ds_fvec1, N - fVec::size());
+          ds = vec_reduce_all<float>([](fVec x, fVec y) { return x + y; }, ds_fvec0);
+          db_fvec0 = fVec::set(db_fvec0, db_fvec0 + db_fvec1, N - fVec::size());
+          db = vec_reduce_all<float>([](fVec x, fVec y) { return x + y; }, db_fvec0);
+        } else {
+          fVec db_fvec0 = dy_fvec0 * gamma_fvec0;
+          fVec ds_fvec0 = x_fvec0 * db_fvec0;
+          ds = vec_reduce_all<float>([](fVec x, fVec y) { return x + y; }, ds_fvec0, N);
+          db = vec_reduce_all<float>([](fVec x, fVec y) { return x + y; }, db_fvec0, N);
+        }
+      } else {
+        int64_t d = bVec::size();
+        bVec x_bvec = bVec::loadu(X_ptr);
+        bVec dy_bvec = bVec::loadu(dY_ptr);
+        fVec x_fvec0, x_fvec1, dy_fvec0, dy_fvec1, gamma_fvec0, gamma_fvec1;
+        fVec ds_fvec0, ds_fvec1, db_fvec0, db_fvec1, acc_ds_fvec0, acc_ds_fvec1, acc_db_fvec0, acc_db_fvec1;
+        std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec);
+        std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec);
+        std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data);
+        acc_db_fvec0 = dy_fvec0 * gamma_fvec0;
+        acc_db_fvec1 = dy_fvec1 * gamma_fvec1;
+        acc_ds_fvec0 = x_fvec0 * acc_db_fvec0;
+        acc_ds_fvec1 = x_fvec1 * acc_db_fvec1;
+        for (; d < N - (N % bVec::size()); d += bVec::size()) {
+          x_bvec = bVec::loadu(X_ptr + d);
+          dy_bvec = bVec::loadu(dY_ptr + d);
+          std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec);
+          std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec);
+          std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data + d);
+          db_fvec0 = dy_fvec0 * gamma_fvec0;
+          db_fvec1 = dy_fvec1 * gamma_fvec1;
+          ds_fvec0 = x_fvec0 * db_fvec0;
+          ds_fvec1 = x_fvec1 * db_fvec1;
+          acc_ds_fvec0 = acc_ds_fvec0 + ds_fvec0;
+          acc_ds_fvec1 = acc_ds_fvec1 + ds_fvec1;
+          acc_db_fvec0 = acc_db_fvec0 + db_fvec0;
+          acc_db_fvec1 = acc_db_fvec1 + db_fvec1;
+        }
+        if (N - d > 0) {
+          x_bvec = bVec::loadu(X_ptr + d, N - d);
+          dy_bvec = bVec::loadu(dY_ptr + d, N - d);
+          std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec);
+          std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec);
+          std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data + d, N - d);
+          if (N - d > fVec::size()) {
+            db_fvec0 = dy_fvec0 * gamma_fvec0;
+            db_fvec1 = dy_fvec1 * gamma_fvec1;
+            ds_fvec0 = x_fvec0 * db_fvec0;
+            ds_fvec1 = x_fvec1 * db_fvec1;
+            acc_ds_fvec0 = acc_ds_fvec0 + ds_fvec0;
+            acc_ds_fvec1 = fVec::set(acc_ds_fvec1, acc_ds_fvec1 + ds_fvec1, N - d - fVec::size());
+            acc_db_fvec0 = acc_db_fvec0 + db_fvec0;
+            acc_db_fvec1 = fVec::set(acc_db_fvec1, acc_db_fvec1 + db_fvec1, N - d - fVec::size());
+          } else {
+            db_fvec0 = dy_fvec0 * gamma_fvec0;
+            ds_fvec0 = x_fvec0 * db_fvec0;
+            acc_ds_fvec0 = fVec::set(acc_ds_fvec0, acc_ds_fvec0 + ds_fvec0, N - d);
+            acc_db_fvec0 = fVec::set(acc_db_fvec0, acc_db_fvec0 + db_fvec0, N - d);
+          }
+        }
+        acc_ds_fvec0 = acc_ds_fvec0 + acc_ds_fvec1;
+        acc_db_fvec0 = acc_db_fvec0 + acc_db_fvec1;
+        ds = vec_reduce_all<float>([](fVec x, fVec y) { return x + y; }, acc_ds_fvec0);
+        db = vec_reduce_all<float>([](fVec x, fVec y) { return x + y; }, acc_db_fvec0);
+      }
+    }
+    const float a = rstd_data[i];
+    const float b = (db * mean_data[i] - ds) * a * a * a * scale;
+    const float c = -b * mean_data[i] - db * a * scale;
+    // Scalar math:
+    // for (const auto j : c10::irange(N)) {
+    //   const T gamma_v = gamma_null ? T(1) : gamma_data[j];
+    //   dX_ptr[j] = a * dY_ptr[j] * gamma_v + b * X_ptr[j] + c;
+    // }
+    if (gamma_null) {
+      vec::map2<BFloat16>(
+          [a, b, c](fVec dy, fVec x) {
+            return fVec(a) * dy + fVec(b) * x + fVec(c);
+          },
+          dX_ptr,
+          dY_ptr,
+          X_ptr,
+          N);
+    } else {
+      int64_t d = 0;
+      for (; d < N - (N % bVec::size()); d += bVec::size()) {
+        bVec x_bvec = bVec::loadu(X_ptr + d);
+        bVec dy_bvec = bVec::loadu(dY_ptr + d);
+        fVec x_fvec0, x_fvec1, dy_fvec0, dy_fvec1, gamma_fvec0, gamma_fvec1;
+        std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec);
+        std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec);
+        std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data + d);
+        fVec r_fvec0 = fVec(a) * dy_fvec0 * gamma_fvec0 + fVec(b) * x_fvec0 + fVec(c);
+        fVec r_fvec1 = fVec(a) * dy_fvec1 * gamma_fvec1 + fVec(b) * x_fvec1 + fVec(c);
+        bVec r_bvec = convert_float_bfloat16(r_fvec0, r_fvec1);
+        r_bvec.store(dX_ptr + d);
+      }
+      if (N - d > 0) {
+        bVec x_bvec = bVec::loadu(X_ptr + d, N - d);
+        bVec dy_bvec = bVec::loadu(dY_ptr + d, N - d);
+        fVec x_fvec0, x_fvec1, dy_fvec0, dy_fvec1, gamma_fvec0, gamma_fvec1;
+        std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec);
+        std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec);
+        std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data + d, N - d);
+        fVec r_fvec0 = fVec(a) * dy_fvec0 * gamma_fvec0 + fVec(b) * x_fvec0 + fVec(c);
+        fVec r_fvec1 = fVec(a) * dy_fvec1 * gamma_fvec1 + fVec(b) * x_fvec1 + fVec(c);
+        bVec r_bvec = convert_float_bfloat16(r_fvec0, r_fvec1);
+        r_bvec.store(dX_ptr + d, N - d);
+      }
+    }
+  }
+}
+
+template <typename T, typename T2>
 void LayerNormBackwardKernelImplInternal(
     const Tensor& dY,
     const Tensor& X,
@@ -205,7 +514,6 @@ void LayerNormBackwardKernelImplInternal(
     Tensor* dgamma,
     Tensor* dbeta) {
   using T_ACC = at::opmath_type<T>;
-  using Vec = vec::Vectorized<T_ACC>;
   TORCH_DCHECK_EQ(dY.numel(), M * N);
   TORCH_DCHECK_EQ(X.numel(), M * N);
   TORCH_DCHECK_EQ(mean.numel(), M);
@@ -213,13 +521,13 @@ void LayerNormBackwardKernelImplInternal(
   DCHECK(!gamma.defined() || gamma.numel() == N);
   const T* dY_data = dY.template data_ptr<T>();
   const T* X_data = X.template data_ptr<T>();
-  const T* mean_data = mean.template data_ptr<T>();
-  const T* rstd_data = rstd.template data_ptr<T>();
-  const T* gamma_data =
-      gamma.defined() ? gamma.template data_ptr<T>() : nullptr;
+  const T2* mean_data = mean.template data_ptr<T2>();
+  const T2* rstd_data = rstd.template data_ptr<T2>();
+  const T2* gamma_data =
+      gamma.defined() ? gamma.template data_ptr<T2>() : nullptr;
   T* dX_data = dX->defined() ? dX->template data_ptr<T>() : nullptr;
-  T* dgamma_data = dgamma->defined() ? dgamma->template data_ptr<T>() : nullptr;
-  T* dbeta_data = dbeta->defined() ? dbeta->template data_ptr<T>() : nullptr;
+  T2* dgamma_data = dgamma->defined() ? dgamma->template data_ptr<T2>() : nullptr;
+  T2* dbeta_data = dbeta->defined() ? dbeta->template data_ptr<T2>() : nullptr;
   const T_ACC scale = T_ACC(1) / static_cast<T_ACC>(N);
   const bool gamma_null = gamma_data == nullptr;
   const bool dX_null = dX_data == nullptr;
@@ -257,100 +565,7 @@ void LayerNormBackwardKernelImplInternal(
     T* dbeta_buffer_ptr =
         dbeta_null ? nullptr : buffer_data + num_threads * N + tid * N;
     for (const auto i : c10::irange(start, end)) {
-      const T* dY_ptr = dY_data + i * N;
-      const T* X_ptr = X_data + i * N;
-      if (!dgamma_null) {
-        const T_ACC a = rstd_data[i];
-        const T_ACC b = -a * mean_data[i];
-        // Scalar math:
-        // for (const auto j : c10::irange(N)) {
-        //   dgamma_data[j] += dY_ptr[j] * (a * X_ptr[j] + b);
-        // }
-        vec::map3<T>(
-            [a, b](Vec dgamma, Vec dy, Vec x) {
-              return dgamma + dy * (Vec(a) * x + Vec(b));
-            },
-            dgamma_buffer_ptr,
-            dgamma_buffer_ptr,
-            dY_ptr,
-            X_ptr,
-            N);
-      }
-      if (!dbeta_null) {
-        // Scalar math:
-        // for (const auto j : c10::irange(N)) {
-        //   dbeta_data[j] += dY_ptr[j];
-        // }
-        vec::map2<T>(
-            [](Vec dbeta, Vec dy) { return dbeta + dy; },
-            dbeta_buffer_ptr,
-            dbeta_buffer_ptr,
-            dY_ptr,
-            N);
-      }
-      if (!dX_null) {
-        T* dX_ptr = dX_data + i * N;
-        T_ACC ds = T_ACC(0);
-        T_ACC db = T_ACC(0);
-        // Scalar math:
-        // for (const auto j : c10::irange(N)) {
-        //   const T gamma_v = gamma_null ? T(1) : gamma_data[j];
-        //   ds += dY_ptr[j] * X_ptr[j] * gamma_v;
-        //   db += dY_ptr[j] * gamma_v;
-        // }
-        if (gamma_null) {
-          ds = vec::map2_reduce_all<T>(
-              [](Vec x, Vec y) { return x * y; },
-              [](Vec x, Vec y) { return x + y; },
-              dY_ptr,
-              X_ptr,
-              N);
-          db = vec::reduce_all<T>(
-              [](Vec& x, Vec& y) { return x + y; }, dY_ptr, N);
-        } else {
-          ds = vec::map3_reduce_all<T>(
-              [](Vec x, Vec y, Vec z) { return x * y * z; },
-              [](Vec x, Vec y) { return x + y; },
-              dY_ptr,
-              X_ptr,
-              gamma_data,
-              N);
-          db = vec::map2_reduce_all<T>(
-              [](Vec x, Vec y) { return x * y; },
-              [](Vec x, Vec y) { return x + y; },
-              dY_ptr,
-              gamma_data,
-              N);
-        }
-        const T_ACC a = rstd_data[i];
-        const T_ACC b = (db * mean_data[i] - ds) * a * a * a * scale;
-        const T_ACC c = -b * mean_data[i] - db * a * scale;
-        // Scalar math:
-        // for (const auto j : c10::irange(N)) {
-        //   const T gamma_v = gamma_null ? T(1) : gamma_data[j];
-        //   dX_ptr[j] = a * dY_ptr[j] * gamma_v + b * X_ptr[j] + c;
-        // }
-        if (gamma_null) {
-          vec::map2<T>(
-              [a, b, c](Vec dy, Vec x) {
-                return Vec(a) * dy + Vec(b) * x + Vec(c);
-              },
-              dX_ptr,
-              dY_ptr,
-              X_ptr,
-              N);
-        } else {
-          vec::map3<T>(
-              [a, b, c](Vec dy, Vec gamma, Vec x) {
-                return Vec(a) * dy * gamma + Vec(b) * x + Vec(c);
-              },
-              dX_ptr,
-              dY_ptr,
-              gamma_data,
-              X_ptr,
-              N);
-        }
-      }
+      layer_norm_backward_frame<T, T2, T_ACC>(dY_data, X_data, mean_data, rstd_data, gamma_data, dX_data, dgamma_buffer_ptr, dbeta_buffer_ptr, scale, gamma_null, dX_null, dgamma_null, dbeta_null, N, i);
     }
   });
 
@@ -390,8 +605,13 @@ void LayerNormBackwardKernelImpl(
     Tensor* dbeta) {
   AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, X.scalar_type(),
       "LayerNormBackwardKernelImpl", [&]() {
-    LayerNormBackwardKernelImplInternal<scalar_t>(
-        dY.contiguous(), X, mean, rstd, gamma, M, N, dX, dgamma, dbeta);
+    if (X.scalar_type() == at::kBFloat16 && gamma.scalar_type() == at::kFloat) {
+      LayerNormBackwardKernelImplInternal<BFloat16, float>(
+          dY.contiguous(), X, mean, rstd, gamma, M, N, dX, dgamma, dbeta);
+    } else {
+      LayerNormBackwardKernelImplInternal<scalar_t, scalar_t>(
+          dY.contiguous(), X, mean, rstd, gamma, M, N, dX, dgamma, dbeta);
+    }
   });
 }
 
diff --git a/aten/src/ATen/native/cpu/utils.h b/aten/src/ATen/native/cpu/utils.h
index 404957cceb66..e029b275291c 100644
--- a/aten/src/ATen/native/cpu/utils.h
+++ b/aten/src/ATen/native/cpu/utils.h
@@ -72,6 +72,19 @@ inline std::tuple<Vectorized<float>, Vectorized<float>> load2f(const float* ptr)
   return std::make_tuple(Vec::loadu(ptr), Vec::loadu(ptr + Vec::size()));
 }
 
+inline std::tuple<Vectorized<float>, Vectorized<float>> load2f(const BFloat16* ptr, int64_t count) {
+  return convert_bfloat16_float(Vectorized<BFloat16>::loadu(ptr, count));
+}
+
+inline std::tuple<Vectorized<float>, Vectorized<float>> load2f(const float* ptr, int64_t count) {
+  using Vec = Vectorized<float>;
+  if (count > Vec::size()) {
+  return std::make_tuple(Vec::loadu(ptr), Vec::loadu(ptr + Vec::size(), count - Vec::size()));
+  } else {
+    return std::make_tuple(Vec::loadu(ptr, count), Vec(0));
+  }
+}
+
 } // namespace
 
 namespace utils {
diff --git a/aten/src/ATen/native/cuda/AbsKernel.cu b/aten/src/ATen/native/cuda/AbsKernel.cu
index 65092ead1169..980bd6637341 100644
--- a/aten/src/ATen/native/cuda/AbsKernel.cu
+++ b/aten/src/ATen/native/cuda/AbsKernel.cu
@@ -15,7 +15,7 @@ struct AbsFunctor {
   }
 };
 
-const char abs_name[] = "abs_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char abs_name[] = "abs_kernel";
 void abs_kernel_cuda(TensorIteratorBase& iter) {
   auto dtype = iter.dtype();
   if (at::isComplexType(dtype)) {
diff --git a/aten/src/ATen/native/cuda/BinaryDivTrueKernel.cu b/aten/src/ATen/native/cuda/BinaryDivTrueKernel.cu
index 38a2addaaecd..aa955a9c7e54 100644
--- a/aten/src/ATen/native/cuda/BinaryDivTrueKernel.cu
+++ b/aten/src/ATen/native/cuda/BinaryDivTrueKernel.cu
@@ -16,7 +16,7 @@
 namespace at::native {
 namespace binary_internal {
 
-const char div_name[] = "div_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char div_name[] = "div_kernel";
 void div_true_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (iter.common_dtype() == kComplexHalf) {
diff --git a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu
index 7be798f3b258..eaa01ac1accc 100644
--- a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu
+++ b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char logical_and_name[] = "logical_and_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char logical_and_name[] = "logical_and_kernel";
 void logical_and_kernel_cuda(TensorIterator& iter) {
   auto dtype = iter.common_dtype();
   if (at::isComplexType(dtype)) {
@@ -48,7 +48,7 @@ void logical_and_kernel_cuda(TensorIterator& iter) {
   }
 }
 
-const char logical_or_name[] = "logical_or_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char logical_or_name[] = "logical_or_kernel";
 void logical_or_kernel_cuda(TensorIterator& iter) {
   auto dtype = iter.common_dtype();
   if (at::isComplexType(dtype)) {
@@ -84,7 +84,7 @@ void logical_or_kernel_cuda(TensorIterator& iter) {
   }
 }
 
-const char logical_xor_name[] = "logical_xor_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char logical_xor_name[] = "logical_xor_kernel";
 void logical_xor_kernel_cuda(TensorIterator& iter) {
   auto dtype = iter.common_dtype();
   if (at::isComplexType(dtype)) {
diff --git a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu
index dc8c6327e962..75d5991f93db 100644
--- a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu
+++ b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu
@@ -15,7 +15,7 @@
 
 namespace at::native {
 
-const char sigmoid_backward_name[] = "sigmoid_backward";
+CONSTEXPR_EXCEPT_WIN_CUDA char sigmoid_backward_name[] = "sigmoid_backward";
 void sigmoid_backward_kernel_cuda(TensorIteratorBase& iter) {
   auto dtype = iter.dtype();
   if(isComplexType(dtype)) {
@@ -86,7 +86,7 @@ void logit_backward_kernel_cuda(TensorIteratorBase& iter, const Scalar& eps_scal
       });
 }
 
-const char tanh_backward_name[] = "tanh_backward";
+CONSTEXPR_EXCEPT_WIN_CUDA char tanh_backward_name[] = "tanh_backward";
 void tanh_backward_kernel_cuda(TensorIteratorBase& iter) {
   auto dtype = iter.dtype();
   if(isComplexType(dtype)) {
diff --git a/aten/src/ATen/native/cuda/BinaryMulKernel.cu b/aten/src/ATen/native/cuda/BinaryMulKernel.cu
index 8c7d6d14ba3a..251221f7adcd 100644
--- a/aten/src/ATen/native/cuda/BinaryMulKernel.cu
+++ b/aten/src/ATen/native/cuda/BinaryMulKernel.cu
@@ -18,7 +18,7 @@
 
 namespace at::native {
 
-const char mul_name[] = "mul_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char mul_name[] = "mul_kernel";
 void mul_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (common_dtype == kComplexHalf) {
diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
index ce78f517a0bc..bc702f374b64 100644
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@@ -126,14 +126,14 @@ enum class Activation {
 cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activation a) {
   switch (a) {
     case Activation::None:
-      return cuda::blas::GEMMAndBiasActivationEpilogue::None;
+      return cuda::blas::GEMMAndBiasActivationEpilogue::BIAS;
     case Activation::RELU:
-      return cuda::blas::GEMMAndBiasActivationEpilogue::RELU;
+      return cuda::blas::GEMMAndBiasActivationEpilogue::BIAS_RELU;
     case Activation::GELU:
-      return cuda::blas::GEMMAndBiasActivationEpilogue::GELU;
+      return cuda::blas::GEMMAndBiasActivationEpilogue::BIAS_GELU;
     default:
       TORCH_CHECK(false);
-      return cuda::blas::GEMMAndBiasActivationEpilogue::None;
+      return cuda::blas::GEMMAndBiasActivationEpilogue::BIAS;
   }
 }
 #endif
@@ -158,7 +158,15 @@ uint8_t getAlignment(const Tensor &t) {
   return alignment;
 }
 
-Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, Activation activation=Activation::None) {
+Tensor& addmm_out_cuda_impl(
+    Tensor& result,
+    const Tensor& self,
+    const Tensor& mat1,
+    const Tensor& mat2,
+    const Scalar& beta,
+    const Scalar& alpha,
+    Activation activation = Activation::None,
+    bool allow_extended = false) {
   // Make sure to keep addmm_cuda below in sync with this code; it
   // preflights a check to try to avoid actually needing to call
   // expand().
@@ -317,7 +325,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
               // path until we confirm which version it's working in.
               activation != Activation::GELU
               ? activation_to_gemm_and_blas_arg(activation)
-              : cuda::blas::GEMMAndBiasActivationEpilogue::None
+              : cuda::blas::GEMMAndBiasActivationEpilogue::BIAS
 #endif
           );
         });
@@ -672,4 +680,83 @@ TORCH_IMPL_FUNC(addmv_out_cuda)(const Tensor &self, const Tensor &mat, const Ten
   }
 }
 
+
+Tensor& _int_mm_out_cuda(const Tensor& self, const Tensor& mat2, Tensor& result) {
+  // NOTE: cuBLAS is currently broken for some combination of transposed inputs.
+  TORCH_CHECK(self.dim() == 2, "Expected self to be of dimension 2 but got ", self.dim());
+  TORCH_CHECK(mat2.dim() == 2, "Expected mat2 to be of dimension 2 but got ", mat2.dim());
+  TORCH_CHECK(self.size(0) > 16, "self.size(0) needs to be greater than 16, but got ", self.size(0));
+  TORCH_CHECK(self.size(1) > 0 && self.size(1) % 8 == 0, "self.size(1) needs to be greater than 0 and a multiple of 8, but got ", self.size(1));
+  TORCH_CHECK(self.size(1) == mat2.size(0), "self.size(1) needs to match mat2.size(0) but got ", self.size(1), " and ", mat2.size(0));
+  TORCH_CHECK(mat2.size(1) > 0 && mat2.size(1) % 8 == 0, "mat2.size(1) needs to be greater than 0 and a multiple of 8, but got ", mat2.size(1));
+
+  TORCH_CHECK(result.dtype() == at::kInt, "Expected result dtype to be of type kInt but got ", result.dtype());
+  TORCH_CHECK(result.size(0) == self.size(0), "Expected result.size(0) to be ", self.size(0), " but got ", result.size(0));
+  TORCH_CHECK(result.size(1) == mat2.size(1), "Expected result.size(1) to be ", mat2.size(1), " but got ", result.size(1));
+
+  TORCH_CHECK(result.dim() == 2, "Expected result to be of dimension 2 but got ", result.dim());
+
+  TORCH_CHECK(result.is_contiguous(), "Expected result to be contiguous.");
+
+#if !defined(USE_ROCM) && !defined(_MSC_VER) && defined(CUDA_VERSION) && CUDA_VERSION == 11070
+  auto mat1 = self;
+  IntArrayRef mat1_sizes = mat1.sizes();
+  IntArrayRef mat2_sizes = mat2.sizes();
+  bool transpose_result;
+  c10::MaybeOwned<Tensor> result_ = prepare_matrix_for_cublas(result, transpose_result);
+  bool transpose_mat1;
+  bool transpose_mat2;
+  c10::MaybeOwned<Tensor> mat1_ = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_mat1, transpose_result);
+  c10::MaybeOwned<Tensor> mat2_ = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_mat2, transpose_result);
+
+  if (transpose_result) {
+    transpose_mat1 = !transpose_mat1;
+    transpose_mat2 = !transpose_mat2;
+    mat1_sizes = mat1_->sizes();
+    mat2_sizes = mat2_->sizes();
+  }
+
+  int64_t m = mat1_sizes[transpose_result ? 1 : 0];
+  int64_t k = mat1_sizes[transpose_result ? 0 : 1];
+  int64_t n = mat2_sizes[transpose_result ? 0 : 1];
+  int64_t mat1_ld = mat1_->stride((transpose_mat1 == transpose_result) ? 1 : 0);
+  int64_t mat2_ld = mat2_->stride((transpose_mat2 == transpose_result) ? 1 : 0);
+  int64_t result_ld = result_->stride(transpose_result ? 0 : 1);
+
+  at::cuda::blas::gemm_and_bias<int8_t, int32_t, std::nullptr_t>(
+      transpose_mat1,
+      transpose_mat2,
+      m,
+      n,
+      k,
+      1.0,
+      mat1_->data_ptr<int8_t>(),
+      mat1_ld,
+      mat2_->data_ptr<int8_t>(),
+      mat2_ld,
+      nullptr,
+      result_->data_ptr<int32_t>(),
+      result_ld,
+      cuda::blas::GEMMAndBiasActivationEpilogue::NONE,
+      false /* use_heuristic */);
+
+  if (!result.is_same(*result_)) {
+    result.copy_(*result_);
+  }
+#else
+#if !defined(USE_ROCM) && !defined(_MSC_VER) && defined(CUDA_VERSION)
+  TORCH_CHECK(false, "_int_mm_out_cuda not compiled for CUDA ", CUDA_VERSION);
+#else
+  TORCH_CHECK(false, "_int_mm_out_cuda not compiled for this platform.");
+#endif
+#endif
+
+  return result;
+}
+
+Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
+  Tensor result = at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt));
+  return _int_mm_out_cuda(self, mat2, result);
+}
+
 } // namespace at::native
diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h
index 6a096b42f719..83ea275ab605 100644
--- a/aten/src/ATen/native/cuda/DistributionTemplates.h
+++ b/aten/src/ATen/native/cuda/DistributionTemplates.h
@@ -529,6 +529,7 @@ struct GeometricKernel {
 
 template<typename RNG>
 void exponential_kernel(TensorIteratorBase& iter, double lambda_, RNG gen) {
+  TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype());
   AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cuda", [&] {
     using accscalar_t = at::acc_type<scalar_t, true>;
     auto lambda = static_cast<accscalar_t>(lambda_);
diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu
index 7a1e2663b49a..6f7d468616c4 100644
--- a/aten/src/ATen/native/cuda/EmbeddingBag.cu
+++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu
@@ -343,6 +343,9 @@ _embedding_bag_cuda(const Tensor &weight, const Tensor &indices_,
         "offsets has to be a 1D Tensor, but got Tensor of dimension ",
         offsets_.dim());
   }
+  TORCH_CHECK(weight.dim() == 2,
+      "weight has to be a 2D Tensor, but got Tensor of dimension ",
+      weight.dim());
   // See [Note: hacky wrapper removal for optional tensor]
   c10::MaybeOwned<Tensor> per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt);
   const Tensor& per_sample_weights = *per_sample_weights_maybe_owned;
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
index f05d0f257839..ef9c63305baa 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
@@ -13,6 +13,7 @@
 #include <ATen/ops/_foreach_sub_native.h>
 #include <ATen/ops/_foreach_clamp_min_native.h>
 #include <ATen/ops/_foreach_clamp_max_native.h>
+#include <ATen/ops/_foreach_pow_native.h>
 
 #include <ATen/ops/empty_like_native.h>
 #endif
@@ -58,6 +59,7 @@ void foreach_tensor_list_op_(TensorList tensors1, TensorList tensors2, const Sca
                                                    /* res_arg_index */ 0>(),
                           Op<opmath_t>(),
                           alpha.to<opmath_t>());
+    increment_version(tensors1);
 }
 
 template<template<class> class Op>
@@ -81,6 +83,13 @@ std::vector<Tensor> all_types_half_bfloat16(TensorList tensors1, TensorList tens
     });
 }
 
+template<template<class> class Op>
+void all_types_complex_half_bfloat16_(TensorList tensors1, TensorList tensors2, const Scalar& alpha = 1) {
+    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, tensors1[0].scalar_type(), "foreach_binary_op_list_cuda_", [&]() {
+        foreach_tensor_list_op_<scalar_t, Op>(tensors1, tensors2, alpha);
+    });
+}
+
 template<template<class> class Op>
 void all_types_half_bfloat16_(TensorList tensors1, TensorList tensors2, const Scalar& alpha = 1) {
     AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors1[0].scalar_type(), "foreach_binary_op_list_cuda_", [&]() {
@@ -88,6 +97,13 @@ void all_types_half_bfloat16_(TensorList tensors1, TensorList tensors2, const Sc
     });
 }
 
+template<template<class> class Op>
+std::vector<Tensor> all_types_complex_half_bfloat16(TensorList tensors1, TensorList tensors2, const Scalar& alpha = 1) {
+    return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, tensors1[0].scalar_type(), "foreach_binary_op_list_cuda", [&]() {
+        return foreach_tensor_list_op<scalar_t, Op>(tensors1, tensors2, alpha);
+    });
+}
+
 #define FOREACH_BINARY_OP_LIST(FUNCTION, NAME, OP, DIVISION_OP)                                             \
 void foreach_tensor_##NAME##_list_kernel_cuda_(TensorList tensors1, TensorList tensors2) {                  \
     check_foreach_api_restrictions(tensors1, tensors2);                                                     \
@@ -132,5 +148,9 @@ FOREACH_BINARY_OP_LIST(all_types_complex_bool_half_bfloat16, mul, std::multiplie
 FOREACH_BINARY_OP_LIST(all_types_complex_bool_half_bfloat16, div, std::divides, /*division_op*/ true);
 FOREACH_BINARY_OP_LIST(all_types_half_bfloat16, clamp_max, minimum, /*division_op*/ false);
 FOREACH_BINARY_OP_LIST(all_types_half_bfloat16, clamp_min, maximum, /*division_op*/ false);
+// NOTE(crcrpar): [Why is foreach_pow's division_op=true?]
+// To push integer inputs to slow path. This is because with integer type inputs the fast path behaves differently
+// from the slow one. Need to investigate later.
+FOREACH_BINARY_OP_LIST(all_types_complex_half_bfloat16, pow, power_functor, /*division_op*/ true);
 
 } // namespace at::native
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
index b1e7d84008c6..e2819a0a6707 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
@@ -14,6 +14,7 @@
 #include <ATen/ops/_foreach_sub_native.h>
 #include <ATen/ops/_foreach_clamp_min_native.h>
 #include <ATen/ops/_foreach_clamp_max_native.h>
+#include <ATen/ops/_foreach_pow_native.h>
 
 #include <ATen/ops/empty_like_native.h>
 #endif
@@ -56,6 +57,7 @@ void foreach_binary_op_(TensorList tensors, const Scalar& scalar) {
                                                 /* res_arg_index */ 0>(),
                                                 Op<opmath_t>(),
                           scalar.to<opmath_t>());
+    increment_version(tensors);
 }
 
 template<template<class> class Op>
@@ -86,6 +88,20 @@ void all_types_half_bfloat16_(TensorList tensors, const Scalar& scalar) {
     });
 }
 
+template<template<class> class Op>
+std::vector<Tensor> all_types_complex_half_bfloat16(TensorList tensors, const Scalar& scalar) {
+    return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, tensors[0].scalar_type(), "foreach_binary_op_scalar_cuda", [&]() {
+        return foreach_binary_op<scalar_t, Op>(tensors, scalar);
+    });
+}
+
+template<template<class> class Op>
+void all_types_complex_half_bfloat16_(TensorList tensors, const Scalar& scalar) {
+    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, tensors[0].scalar_type(), "foreach_binary_op_scalar_cuda_", [&]() {
+        foreach_binary_op_<scalar_t, Op>(tensors, scalar);
+    });
+}
+
 #define FOREACH_BINARY_OP_SCALAR(FUNCTION, NAME, OP, DIVISION_OP)                                   \
 void foreach_tensor_##NAME##_scalar_kernel_cuda_(TensorList tensors, const Scalar& scalar) {        \
     check_foreach_api_restrictions(tensors);                                                        \
@@ -107,6 +123,15 @@ std::vector<Tensor> foreach_tensor_##NAME##_scalar_kernel_cuda(TensorList tensor
 
 FOREACH_BINARY_OP_SCALAR(all_types_complex_bool_half_bfloat16, add, std::plus, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALAR(all_types_complex_bool_half_bfloat16, mul, std::multiplies, /*div_op*/ false);
+// See [Why is foreach_pow's division_op=true?]
+FOREACH_BINARY_OP_SCALAR(all_types_complex_half_bfloat16, pow, power_functor, /*div_op*/ true);
+std::vector<Tensor> foreach_scalar_pow_list_kernel_cuda(const Scalar& scalar, TensorList exponent) {
+  check_foreach_api_restrictions(exponent);
+  if (!can_use_fast_route(exponent)) {
+    return at::native::foreach_scalar_pow_list_kernel_slow(scalar, exponent);
+  }
+  return all_types_complex_half_bfloat16<reverse_power_functor>(exponent, scalar);
+}
 
 // In the case of division, integer inputs will result in float.
 // Currently multi tensor apply can only return result of the same type as input.
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
index f0c7cacd044c..47d124772944 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
@@ -14,6 +14,7 @@
 #include <ATen/ops/_foreach_sub_native.h>
 #include <ATen/ops/_foreach_clamp_min_native.h>
 #include <ATen/ops/_foreach_clamp_max_native.h>
+#include <ATen/ops/_foreach_pow_native.h>
 
 #include <ATen/ops/empty_like_native.h>
 #endif
@@ -57,6 +58,7 @@ void foreach_binary_op_(TensorList tensors, at::ArrayRef<Scalar> scalars) {
                                                               /* r_args_depth */ 1,
                                                               /* res_arg_index */ 0>(),
                                     Op<opmath_t>());
+    increment_version(tensors);
 }
 
 template<template<class> class Op>
@@ -87,6 +89,20 @@ void all_types_half_bfloat16_(TensorList tensors, at::ArrayRef<Scalar> scalars)
     });
 }
 
+template<template<class> class Op>
+std::vector<Tensor> all_types_complex_half_bfloat16(TensorList tensors, at::ArrayRef<Scalar> scalars) {
+    return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda", [&]() {
+        return foreach_binary_op<scalar_t, Op>(tensors, scalars);
+    });
+}
+
+template<template<class> class Op>
+void all_types_complex_half_bfloat16_(TensorList tensors, at::ArrayRef<Scalar> scalars) {
+    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda_", [&]() {
+        foreach_binary_op_<scalar_t, Op>(tensors, scalars);
+    });
+}
+
 #define FOREACH_BINARY_OP_SCALARLIST(FUNCTION, NAME, OP, DIV_OP)                                                         \
 void foreach_tensor_##NAME##_scalarlist_kernel_cuda_(TensorList tensors, at::ArrayRef<Scalar> scalars) {                 \
     check_foreach_api_restrictions(tensors, scalars);                                                                    \
@@ -109,6 +125,8 @@ std::vector<Tensor> foreach_tensor_##NAME##_scalarlist_kernel_cuda(TensorList te
 FOREACH_BINARY_OP_SCALARLIST(all_types_complex_bool_half_bfloat16, add, std::plus, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALARLIST(all_types_complex_bool_half_bfloat16, mul, std::multiplies, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALARLIST(all_types_complex_bool_half_bfloat16, div, std::divides, /*div_op*/ true);
+// See [Why is foreach_pow's division_op=true?]
+FOREACH_BINARY_OP_SCALARLIST(all_types_complex_half_bfloat16, pow, power_functor, /*div_op*/ true);
 
 // This does not use FOREACH_BINARY_OP_SCALARLIST because
 // In the case of subtraction, we dont allow scalar to be boolean following the torch.sub logic
diff --git a/aten/src/ATen/native/cuda/ForeachFunctors.cuh b/aten/src/ATen/native/cuda/ForeachFunctors.cuh
index ec625e1762ed..2269588a14f4 100644
--- a/aten/src/ATen/native/cuda/ForeachFunctors.cuh
+++ b/aten/src/ATen/native/cuda/ForeachFunctors.cuh
@@ -1,6 +1,7 @@
 #pragma once
 #include <ATen/native/ForeachUtils.h>
 #include <ATen/native/cuda/MultiTensorApply.cuh>
+#include <ATen/native/cuda/Pow.cuh>
 #include <ATen/OpMathType.h>
 
 namespace at { namespace native {
@@ -547,5 +548,19 @@ struct TernaryOpScalarFunctor {
   }
 };
 
+template <typename T>
+struct power_functor {
+  C10_DEVICE T operator()(const T& a, const T& b) const {
+    return at::native::pow_(a, b);
+  }
+};
+
+template <typename T>
+struct reverse_power_functor {
+  C10_DEVICE T operator()(const T& a, const T& b) const {
+    return at::native::pow_(b, a);
+  }
+};
+
 } // namespace
 }} // namespace at::native
diff --git a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
index 8a95da396971..e0ba175f1d8d 100644
--- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
@@ -66,6 +66,7 @@ void foreach_pointwise_op_(TensorList input, TensorList tensors1, TensorList ten
                               Op<opmath_t>(),
                               scalar.to<opmath_t>());
     });
+    increment_version(input);
 }
 
 template<template<class> class Op>
@@ -86,6 +87,7 @@ void foreach_pointwise_op_(TensorList input, TensorList tensors1, TensorList ten
                                                                      /* res_arg_index */ 0>(),
                                         Op<opmath_t>());
     });
+    increment_version(input);
 }
 
 template<template<class> class Op>
diff --git a/aten/src/ATen/native/cuda/ForeachTernaryOp.cu b/aten/src/ATen/native/cuda/ForeachTernaryOp.cu
index 26d3ff2160d3..3ad6367908a6 100644
--- a/aten/src/ATen/native/cuda/ForeachTernaryOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachTernaryOp.cu
@@ -66,6 +66,7 @@ void foreach_tensor_lerp_ternary_cuda_(TensorList tensors1, TensorList tensors2,
                 LerpFunctor<opmath_t>());
         }
   );
+  increment_version(tensors1);
 }
 
 std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(TensorList tensors1, TensorList tensors2, const Scalar& weight) {
diff --git a/aten/src/ATen/native/cuda/FusedAdamKernel.cu b/aten/src/ATen/native/cuda/FusedAdamKernel.cu
index 361f7d4ba284..b8f514e0f1c2 100644
--- a/aten/src/ATen/native/cuda/FusedAdamKernel.cu
+++ b/aten/src/ATen/native/cuda/FusedAdamKernel.cu
@@ -33,12 +33,12 @@ void _fused_adam_kernel_cuda_(
     TORCH_CHECK(
         at::native::check_fast_path_restrictions({params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs}),
         "params, grads, exp_avgs, exp_avg_sqs, and max_exp_avg_sqs must have same dtype, device, and layout");
-    _fused_adam_cuda_impl_(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf);
+    _fused_adam_amsgrad_cuda_impl_(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, maximize, grad_scale, found_inf);
   } else {
     TORCH_CHECK(
         at::native::check_fast_path_restrictions({params, grads, exp_avgs, exp_avg_sqs}),
         "params, grads, exp_avgs, and exp_avg_sqs must have same dtype, device, and layout");
-    _fused_adam_cuda_impl_(params, grads, exp_avgs, exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf);
+    _fused_adam_cuda_impl_(params, grads, exp_avgs, exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, maximize, grad_scale, found_inf);
   }
 }
 
diff --git a/aten/src/ATen/native/cuda/FusedAdamWKernel.cu b/aten/src/ATen/native/cuda/FusedAdamWKernel.cu
new file mode 100644
index 000000000000..e11b82fafec7
--- /dev/null
+++ b/aten/src/ATen/native/cuda/FusedAdamWKernel.cu
@@ -0,0 +1,45 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/TypeDefault.h>
+#include <ATen/native/ForeachUtils.h>
+#include <ATen/native/cuda/fused_adamw_amsgrad_impl.cuh>
+#include <ATen/native/cuda/fused_adamw_impl.cuh>
+#include <c10/util/Exception.h>
+
+
+namespace at { namespace native {
+
+// note(crcrpar): To observe the CI rules, i.e. 20 minutes per file to compile, defensively split instantiations into _impl files.
+// this is only for CUDA 11.3 for which it took about 20 minutes and 28 minutes in my workstation and CI, respectively.
+// As a data point, it took about 20 seconds for CUDA 11.7 installed in my environment.
+// See https://github.com/pytorch/pytorch/pull/81705 for details.
+void _fused_adamw_kernel_cuda_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool amsgrad,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,
+    const c10::optional<at::Tensor>& found_inf
+) {
+  if (amsgrad) {
+    TORCH_CHECK(
+        at::native::check_fast_path_restrictions({params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs}),
+        "params, grads, exp_avgs, exp_avg_sqs, and max_exp_avg_sqs must have same dtype, device, and layout");
+    _fused_adamw_amsgrad_cuda_impl_(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, maximize, grad_scale, found_inf);
+  } else {
+    TORCH_CHECK(
+        at::native::check_fast_path_restrictions({params, grads, exp_avgs, exp_avg_sqs}),
+        "params, grads, exp_avgs, and exp_avg_sqs must have same dtype, device, and layout");
+    _fused_adamw_cuda_impl_(params, grads, exp_avgs, exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, maximize, grad_scale, found_inf);
+  }
+}
+
+}} // namespace at::native
diff --git a/aten/src/ATen/native/cuda/GcdLcmKernel.cu b/aten/src/ATen/native/cuda/GcdLcmKernel.cu
index ee576f93c1f9..c4a8cdfaf1f8 100644
--- a/aten/src/ATen/native/cuda/GcdLcmKernel.cu
+++ b/aten/src/ATen/native/cuda/GcdLcmKernel.cu
@@ -14,7 +14,7 @@
 namespace at::native {
 
 // See note [Jiterator]
-const char gcd_name[] = "gcd";
+CONSTEXPR_EXCEPT_WIN_CUDA char gcd_name[] = "gcd";
 void gcd_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_INTEGRAL_TYPES(iter.common_dtype(), "gcd_cuda", [&]() {
@@ -33,7 +33,7 @@ void gcd_kernel_cuda(TensorIteratorBase& iter) {
 }
 
 // See note [Jiterator]
-const char lcm_name[] = "lcm";
+CONSTEXPR_EXCEPT_WIN_CUDA char lcm_name[] = "lcm";
 void lcm_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_INTEGRAL_TYPES(iter.common_dtype(), "lcm_cuda", [&]() {
diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu
index 85ff7c380577..5abfd15971c1 100644
--- a/aten/src/ATen/native/cuda/Indexing.cu
+++ b/aten/src/ATen/native/cuda/Indexing.cu
@@ -248,6 +248,9 @@ static std::vector<int64_t> computeLinearStride(const Tensor & tensor) {
   // computes the stride as if tensor were contiguous
   auto sizes = tensor.sizes();
   std::vector<int64_t> stride(tensor.dim());
+  if (stride.empty()) {
+    return stride;
+  }
   stride[tensor.dim() - 1] = 1;
   std::partial_sum(sizes.rbegin(), sizes.rend() - 1, stride.rbegin() + 1, std::multiplies<int64_t>());
   return stride;
@@ -331,6 +334,8 @@ int64_t largestIndex(const Tensor &self) {
 }
 
 void index_put_with_sort_kernel(Tensor & self, const c10::List<c10::optional<Tensor>>& indices, const Tensor & value, bool accumulate, bool unsafe) {
+  TORCH_CHECK(!indices.empty() || is_expandable_to(value.sizes(), self.sizes()), "shape mismatch: value tensor of shape ", value.sizes(),
+             " cannot be broadcast to indexing result of shape ", self.sizes());
   if (indices.size() > (size_t)self.dim()) {
     TORCH_CHECK_INDEX(false, "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")");
   }
diff --git a/aten/src/ATen/native/cuda/Lerp.cu b/aten/src/ATen/native/cuda/Lerp.cu
index 38f2cca897d5..01053a3beeab 100644
--- a/aten/src/ATen/native/cuda/Lerp.cu
+++ b/aten/src/ATen/native/cuda/Lerp.cu
@@ -9,7 +9,7 @@
 namespace at::native {
 namespace {
 
-const char lerp_tensor_name[] = "lerp_tensor";
+CONSTEXPR_EXCEPT_WIN_CUDA char lerp_tensor_name[] = "lerp_tensor";
 void lerp_tensor_kernel(at::TensorIteratorBase& iter) {
   auto dtype = iter.common_dtype();
   if(at::isComplexType(dtype)) {
@@ -63,7 +63,7 @@ void lerp_tensor_kernel(at::TensorIteratorBase& iter) {
   }
 }
 
-const char lerp_scalar_name[] = "lerp_scalar";
+CONSTEXPR_EXCEPT_WIN_CUDA char lerp_scalar_name[] = "lerp_scalar";
 void lerp_scalar_kernel(at::TensorIteratorBase& iter, const c10::Scalar& weight) {
   auto dtype = iter.common_dtype();
   if (at::isComplexType(dtype)) {
diff --git a/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp
index b445e3ae13de..045bfa8d1f90 100644
--- a/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp
+++ b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp
@@ -32,8 +32,7 @@ struct MagmaInitializer {
 namespace at::native {
 #if defined(BUILD_LAZY_CUDA_LINALG)
 namespace {
-cuda::detail::LinalgDispatch disp = {_symeig_helper_cuda,
-                                     _cholesky_solve_helper_cuda};
+cuda::detail::LinalgDispatch disp = {_cholesky_solve_helper_cuda};
 
 at::DynamicLibrary& getTorchLinalgLibrary() {
   static at::DynamicLibrary lib("libtorch_cuda_linalg.so", nullptr, true);
@@ -174,12 +173,6 @@ Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upp
     return disp.cholesky_solve_helper(self, A, upper);
 }
 
-std::tuple<Tensor, Tensor> _symeig_helper_cuda(const Tensor& self, bool eigenvectors, bool upper) {
-    getTorchLinalgLibrary();
-    TORCH_CHECK(disp.symeig_helper != _symeig_helper_cuda, "Can't find _symeig_helper_cuda");
-    return disp.symeig_helper(self, eigenvectors, upper);
-}
-
 #endif /*defined(BUILD_LAZY_CUDA_LINALG)*/
 
 } // namespace at::native
diff --git a/aten/src/ATen/native/cuda/LogcumsumexpKernel.cu b/aten/src/ATen/native/cuda/LogcumsumexpKernel.cu
index f1eb3bd68082..ea4188c970c4 100644
--- a/aten/src/ATen/native/cuda/LogcumsumexpKernel.cu
+++ b/aten/src/ATen/native/cuda/LogcumsumexpKernel.cu
@@ -12,8 +12,8 @@
 namespace at::native {
 
 // custom min and max to be used in logcumsumexp for complex arguments
-template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _logcumsumexp_minmax(c10::complex<scalar_t> x, c10::complex<scalar_t> y, bool min) {
+template <typename scalar_t, bool min>
+__host__ __device__ c10::complex<scalar_t> _logcumsumexp_minmax(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
   scalar_t xr = std::real(x);
   scalar_t yr = std::real(y);
   if (::isnan(yr) || (::isnan(std::imag(y)))) {
@@ -28,7 +28,7 @@ __host__ __device__ c10::complex<scalar_t> _logcumsumexp_minmax(c10::complex<sca
 }
 
 template <typename scalar_t>
-__host__ __device__ scalar_t _log_add_exp_helper(scalar_t x, scalar_t y) {
+__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) {
   // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
   // Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM
   auto isnan_x = at::_isnan(x);
@@ -45,16 +45,43 @@ __host__ __device__ scalar_t _log_add_exp_helper(scalar_t x, scalar_t y) {
 }
 
 template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(c10::complex<scalar_t> x, c10::complex<scalar_t> y) {
-  c10::complex<scalar_t> min = _logcumsumexp_minmax(x, y, /*min=*/true);
-  c10::complex<scalar_t> max = _logcumsumexp_minmax(x, y, /*min=*/false);
+__host__ __device__ c10::complex<scalar_t> _fast_build_exp(const c10::complex<scalar_t>& x) {
+  // complex exponential function, but implemented manually to get fast compilation time
+  // this function only handles the case where the x is finite (not inf nor nan)
+  auto xreal = std::real(x);
+  auto ximag = std::imag(x);
+  auto exp_x_abs = std::exp(xreal);
+  auto exp_x_real = exp_x_abs * std::cos(ximag);
+  auto exp_x_imag = exp_x_abs * std::sin(ximag);
+  return {exp_x_real, exp_x_imag};
+}
+
+template <typename scalar_t>
+__host__ __device__ c10::complex<scalar_t> _fast_build_exp_inf(const c10::complex<scalar_t>& x) {
+  // complex exponential function, but implemented manually to get fast compilation time
+  // this function only handles the case where the real part of x is infinite
+  auto ximag = std::imag(x);
+  auto exp_x_abs = std::numeric_limits<scalar_t>::infinity();
+  auto sin = std::sin(ximag);
+  auto cos = std::cos(ximag);
+  // special case if the angle is exactly the multiple of pi/2
+  auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos;
+  auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin;
+  return {exp_x_real, exp_x_imag};
+}
+
+template <typename scalar_t>
+__host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
+  c10::complex<scalar_t> min = _logcumsumexp_minmax<scalar_t, /*min=*/true>(x, y);
+  c10::complex<scalar_t> max = _logcumsumexp_minmax<scalar_t, /*min=*/false>(x, y);
   scalar_t min_real = std::real(min);
   scalar_t max_real = std::real(max);
 
   if (::isnan(min_real) || ::isnan(std::imag(min))) {
     // handling the "infectious" NaNs
     return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
-  } else if ((!::isfinite(min_real)) && (min_real == max_real)) {
+  }
+  else if ((!::isfinite(min_real)) && (min_real == max_real)) {
     if (min_real < 0) {
       // handle the -inf case, the imaginary part here does not really matter as the exp(value)
       // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
@@ -63,10 +90,14 @@ __host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(c10::complex<scal
     } else {
       // handle the +inf case, we don't need the special precision for log1p for small values
       // and to avoid producing nan in case of real(max) == real(min) == +inf
-      return ::log(::exp(min) + ::exp(max));
+      auto exp_min = _fast_build_exp_inf(min);
+      auto exp_max = _fast_build_exp_inf(max);
+      return ::log1p(exp_min + exp_max - 1);  // log1p(x - 1) builds faster than log
     }
   } else {
-    return ::log1p(::exp(min - max)) + max;
+    auto minmax = min - max;
+    auto exp_minmax = _fast_build_exp(minmax);
+    return ::log1p(exp_minmax) + max;
   }
 }
 
diff --git a/aten/src/ATen/native/cuda/MultiTensorApply.cuh b/aten/src/ATen/native/cuda/MultiTensorApply.cuh
index a74144974a48..9254e7c579dd 100644
--- a/aten/src/ATen/native/cuda/MultiTensorApply.cuh
+++ b/aten/src/ATen/native/cuda/MultiTensorApply.cuh
@@ -97,6 +97,9 @@ void multi_tensor_apply(
         int loc_block_info = 0;
         int loc_tensor_info = 0;
         for(size_t t = 0; t < n_tensors; t++) {
+            if (tensor_lists[0][t].numel() == 0) {
+                continue;
+            }
 
             tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t].to<scalar_T>();
 
@@ -156,6 +159,9 @@ void multi_tensor_apply(
         int loc_block_info = 0;
         int loc_tensor_info = 0;
         for(size_t t = 0; t < n_tensors; t++) {
+            if (tensor_lists[0][t].numel() == 0) {
+                continue;
+            }
             tensorListMeta.numel_for_tensor[loc_tensor_info] = tensor_lists[0][t].numel();
             for (int d = 0; d < depth; d++) {
                 tensorListMeta.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr();
@@ -212,6 +218,9 @@ void multi_tensor_apply_for_fused_optimizer(
   int loc_block_info = 0;
   int loc_tensor_info = 0;
   for (const auto & tensor_index : c10::irange(num_tensors)) {
+    if (tensor_lists[0][tensor_index].numel() == 0) {
+      continue;
+    }
     tensorListMeta.state_steps_addresses[loc_tensor_info] = state_steps[tensor_index].data_ptr();
     tensorListMeta.numel_for_tensor[loc_tensor_info] = tensor_lists[0][tensor_index].numel();
     for (const auto & d : c10::irange(depth)) {
diff --git a/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu b/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu
index 4108e0f9c6fe..53b67125222e 100644
--- a/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu
+++ b/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char addcmul_name[] = "addcmul";
+CONSTEXPR_EXCEPT_WIN_CUDA char addcmul_name[] = "addcmul";
 void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {
   auto dtype = iter.common_dtype();
   if (at::isComplexType(dtype)) {
@@ -56,7 +56,7 @@ void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {
 }
 
 // return a + alpha * (b / static_cast<accscalar_t>(c));
-const char addcdiv_name[] = "addcdiv";
+CONSTEXPR_EXCEPT_WIN_CUDA char addcdiv_name[] = "addcdiv";
 void addcdiv_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) {
   auto dtype = iter.common_dtype();
   if (at::isComplexType(dtype)) {
diff --git a/aten/src/ATen/native/cuda/PowKernel.cu b/aten/src/ATen/native/cuda/PowKernel.cu
index 8697c2e43f3e..eb56da722fbb 100644
--- a/aten/src/ATen/native/cuda/PowKernel.cu
+++ b/aten/src/ATen/native/cuda/PowKernel.cu
@@ -38,7 +38,7 @@ void pow_scalar_tensor_impl(TensorIteratorBase& iter, c10::complex<value_t> base
 }
 
 /* complex<Half> support impl */
-const char pow_scalar_base_name[] = "pow_scalar_base_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char pow_scalar_base_name[] = "pow_scalar_base_kernel";
 template <>
 void pow_scalar_tensor_impl(TensorIteratorBase& iter, c10::complex<at::Half> base) {
   using scalar_t = c10::complex<at::Half>;
@@ -68,7 +68,7 @@ namespace {
 
 #if AT_USE_JITERATOR()
 /* complex<Half> support impl */
-const char pow_name[] = "pow_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char pow_name[] = "pow_kernel";
 static const auto pow_kernel_string =
     jiterator_stringify(template <typename T> T pow_kernel(T base, T exp) {
       return std::pow(base, exp);
diff --git a/aten/src/ATen/native/cuda/ReduceMomentKernel.cu b/aten/src/ATen/native/cuda/ReduceMomentKernel.cu
index 45474b0822fe..5b9a4530791d 100644
--- a/aten/src/ATen/native/cuda/ReduceMomentKernel.cu
+++ b/aten/src/ATen/native/cuda/ReduceMomentKernel.cu
@@ -11,21 +11,16 @@
 namespace at::native {
 
 template <typename scalar_t, typename out_t=scalar_t>
-void std_var_kernel_impl(TensorIterator& iter, int32_t correction, bool take_sqrt) {
+void std_var_kernel_impl(TensorIterator& iter, double correction, bool take_sqrt) {
   // reducing unrolling factor to 2 for welford kernel
   // This is necessary to lower register usage that leads to register spills.
   using accscalar_t = at::acc_type<scalar_t, true>;
-  using ops_t = WelfordOps<scalar_t, accscalar_t, int32_t, float, thrust::pair<out_t, out_t>>;
-  gpu_reduce_kernel<scalar_t, out_t, 2>(
-      iter, ops_t{correction, take_sqrt}, typename ops_t::acc_t{});
+  using ops_t = WelfordOps<scalar_t, accscalar_t, int32_t, thrust::pair<out_t, out_t>>;
+  ops_t ops(static_cast<accscalar_t>(correction), take_sqrt);
+  gpu_reduce_kernel<scalar_t, out_t, 2>(iter, ops, typename ops_t::acc_t{});
 }
 
-static void std_var_kernel_cuda(TensorIterator& iter, int64_t correction, bool take_sqrt) {
-  using limits = std::numeric_limits<int32_t>;
-  TORCH_CHECK(
-      correction < limits::max() && correction > limits::min(),
-      "The correction argument for std and var computation on CUDA must "
-      "fit within a 32-bit integer, but got ", correction);
+static void std_var_kernel_cuda(TensorIterator& iter, double correction, bool take_sqrt) {
   const auto input_dtype = iter.input_dtype();
   if (input_dtype == kHalf && iter.dtype() == kFloat) {
     // type promotion that does cast and reduction in a single kernel
diff --git a/aten/src/ATen/native/cuda/ReduceNormKernel.cu b/aten/src/ATen/native/cuda/ReduceNormKernel.cu
index 5ad037f66181..dd03f79be949 100644
--- a/aten/src/ATen/native/cuda/ReduceNormKernel.cu
+++ b/aten/src/ATen/native/cuda/ReduceNormKernel.cu
@@ -16,17 +16,17 @@ namespace at::native {
 template <typename scalar_t, typename acc_t=typename scalar_value_type<scalar_t>::type, typename out_t=typename scalar_value_type<scalar_t>::type>
 void norm_kernel_cuda_impl(TensorIterator& iter, double p) {
   if (p == static_cast<double>(0)) {
-    gpu_reduce_kernel<scalar_t, out_t>(iter, NormZeroOps<scalar_t, acc_t>(), 0);
+    gpu_reduce_kernel<scalar_t, out_t>(iter, NormZeroOps<scalar_t, acc_t, out_t>(), 0);
   } else if (p == static_cast<double>(1)) {
-    gpu_reduce_kernel<scalar_t, out_t>(iter, NormOneOps<scalar_t, acc_t>(), 0);
+    gpu_reduce_kernel<scalar_t, out_t>(iter, NormOneOps<scalar_t, acc_t, out_t>(), 0);
   } else if (p == static_cast<double>(2)) {
-    gpu_reduce_kernel<scalar_t, out_t>(iter, NormTwoOps<scalar_t, acc_t>(), 0);
+    gpu_reduce_kernel<scalar_t, out_t>(iter, NormTwoOps<scalar_t, acc_t, out_t>(), 0);
   } else if (p == static_cast<double>(INFINITY)) {
-    gpu_reduce_kernel<scalar_t, out_t>(iter, AbsMaxOps<scalar_t, acc_t>(), 0);
+    gpu_reduce_kernel<scalar_t, out_t>(iter, AbsMaxOps<scalar_t, acc_t, out_t>(), 0);
   } else if (p == static_cast<double>(-INFINITY)) {
-    gpu_reduce_kernel<scalar_t, out_t>(iter, AbsMinOps<scalar_t, acc_t>(), std::numeric_limits<acc_t>::infinity());
+    gpu_reduce_kernel<scalar_t, out_t>(iter, AbsMinOps<scalar_t, acc_t, out_t>(), std::numeric_limits<acc_t>::infinity());
   } else {
-    gpu_reduce_kernel<scalar_t, out_t>(iter, NormOps<scalar_t, acc_t>{ acc_t(p) }, 0);
+    gpu_reduce_kernel<scalar_t, out_t>(iter, NormOps<scalar_t, acc_t, out_t>{acc_t(p)}, 0);
   }
 }
 
diff --git a/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu b/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu
index 94e4ca6fe838..e628e1916f9e 100644
--- a/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu
+++ b/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu
@@ -21,7 +21,7 @@ struct sum_functor {
 };
 
 // jiterated specialization for `complex<Half>`
-const char sum_name[] = "sum";
+CONSTEXPR_EXCEPT_WIN_CUDA char sum_name[] = "sum";
 template <>
 struct sum_functor<c10::complex<at::Half>> {
 // jiterator reduction fails on windows
@@ -57,8 +57,29 @@ struct nansum_functor {
   }
 };
 
-const char prod_name[] = "prod";
+CONSTEXPR_EXCEPT_WIN_CUDA char nansum_name[] = "nansum";
+template <typename scalar_t>
+struct nansum_functor_complex {
+#if AT_USE_JITERATOR()
+  void operator()(TensorIterator& iter) {
+    std::string func = jiterator_stringify(
+        arg_t combine(arg_t a, scalar_t b) {
+          return a + (std::isnan(b) ? arg_t{0.} : arg_t{b});
+        }
+    );
+    jitted_gpu_reduce_kernel<nansum_name, scalar_t, scalar_t>(
+        iter, func, 0.);
+  }
+#else
+  void operator()(TensorIterator& iter) {
+    using acc_t = at::opmath_type<scalar_t>;
+    gpu_reduce_kernel<scalar_t, acc_t>(
+        iter, NanSumOps<acc_t, acc_t>{});
+  }
+#endif
+};
 
+CONSTEXPR_EXCEPT_WIN_CUDA char prod_name[] = "prod";
 template <typename scalar_t, typename acc_t = scalar_t, typename out_t = scalar_t>
 struct prod_functor {
   // jiterator reduction fails on windows
@@ -162,9 +183,16 @@ static void sum_kernel_cuda(TensorIterator& iter){
 
 static void nansum_kernel_cuda(TensorIterator& iter) {
   auto general_dispatcher = [](TensorIterator& iter) {
-    AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "nansum_cuda", [&]() {
-      nansum_functor<scalar_t>{}(iter);
-    });
+    auto dtype = iter.dtype();
+    if (at::isComplexType(dtype)) {
+        AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "nansum_cuda", [&]() {
+          nansum_functor_complex<scalar_t>{}(iter);
+        });
+    } else {
+        AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "nansum_cuda", [&]() {
+          nansum_functor<scalar_t>{}(iter);
+        });
+    }
   };
 
   reduce_dispatch<nansum_functor>(iter, general_dispatcher);
diff --git a/aten/src/ATen/native/cuda/SparseBinaryOpIntersectionKernel.cu b/aten/src/ATen/native/cuda/SparseBinaryOpIntersectionKernel.cu
index d43ad694dafa..ead1ff6326ea 100644
--- a/aten/src/ATen/native/cuda/SparseBinaryOpIntersectionKernel.cu
+++ b/aten/src/ATen/native/cuda/SparseBinaryOpIntersectionKernel.cu
@@ -18,16 +18,23 @@ struct CUDAKernelLauncher {
 
 struct MulOp {
   template <typename scalar_t>
-  static FUNCAPI scalar_t apply(scalar_t a, scalar_t b) {
+  static FUNCAPI INLINE scalar_t apply(scalar_t a, scalar_t b) {
     return a * b;
   }
 };
 
 template <>
-FUNCAPI bool MulOp::apply(bool a, bool b) {
+FUNCAPI INLINE bool MulOp::apply(bool a, bool b) {
   return a && b;
 }
 
+struct LhsProjOp {
+  template <typename scalar_t>
+  static FUNCAPI scalar_t apply(scalar_t a, scalar_t b) {
+    return a;
+  }
+};
+
 template <int nt, int vt, typename loop_t>
 C10_LAUNCH_BOUNDS_2(nt, vt)
 __global__ void apply_kernel(int n, loop_t loop) {
@@ -75,8 +82,9 @@ void binary_op_intersection_kernel(
   const auto* RESTRICT ptr_lhs_select_idx_bytes = reinterpret_cast<char*>(iter.data_ptr(2));
   const auto* RESTRICT ptr_rhs_values_bytes = reinterpret_cast<char*>(iter.data_ptr(3));
   const auto* RESTRICT ptr_rhs_select_idx_bytes = reinterpret_cast<char*>(iter.data_ptr(4));
+  const auto* RESTRICT ptr_match_bytes = reinterpret_cast<char*>(iter.data_ptr(5));
 
-  auto offset_calc = make_offset_calculator<5>(iter);
+  auto offset_calc = make_offset_calculator<6>(iter);
   auto loop = [=] FUNCAPI (int i) {
     auto offsets = offset_calc.get(i);
 
@@ -85,10 +93,15 @@ void binary_op_intersection_kernel(
     const auto lhs_nnz_idx = *reinterpret_cast<const index_t*>(ptr_lhs_select_idx_bytes + offsets[2]);
     const auto* RESTRICT ptr_rhs_values = reinterpret_cast<const scalar_t*>(ptr_rhs_values_bytes + offsets[3]);
     const auto rhs_nnz_idx = *reinterpret_cast<const index_t*>(ptr_rhs_select_idx_bytes + offsets[4]);
-
-    *ptr_res_values = binary_op_t::apply(
-        *(ptr_lhs_values + lhs_nnz_idx * lhs_nnz_stride),
-        *(ptr_rhs_values + rhs_nnz_idx * rhs_nnz_stride));
+    const auto match = *reinterpret_cast<const bool*>(ptr_match_bytes + offsets[5]);
+
+    if (match) {
+      *ptr_res_values = binary_op_t::apply(
+          *(ptr_lhs_values + lhs_nnz_idx * lhs_nnz_stride),
+          *(ptr_rhs_values + rhs_nnz_idx * rhs_nnz_stride));
+    } else {
+      *ptr_res_values = 0;
+    }
   };
 
   launch_kernel<num_threads(), thread_work_size()>(iter.numel(), loop);
@@ -101,12 +114,14 @@ struct CUDAValueSelectionIntersectionKernel {
       const Tensor& lhs_values,
       const Tensor& lhs_select_idx,
       const Tensor& rhs_values,
-      const Tensor& rhs_select_idx) {
+      const Tensor& rhs_select_idx,
+      const c10::optional<Tensor>& match_mask = c10::nullopt) {
     auto iter = make_value_selection_intersection_iter(
         lhs_values,
         lhs_select_idx,
         rhs_values,
-        rhs_select_idx);
+        rhs_select_idx,
+        match_mask);
     auto res_values = iter.tensor(0);
 
     // If res_values is empty, we can return it right away.
@@ -142,8 +157,19 @@ void mul_sparse_sparse_out_cuda_kernel(
   );
 }
 
+void sparse_mask_intersection_out_cuda_kernel(
+    Tensor& result,
+    const Tensor& x,
+    const Tensor& y) {
+  using CUDAValueLhsProjKernel = CUDAValueSelectionIntersectionKernel<LhsProjOp>;
+  _sparse_binary_op_intersection_kernel_out<CUDAKernelLauncher, CUDAValueLhsProjKernel>(
+      result, x, y, true
+  );
+}
+
 }
 
 REGISTER_CUDA_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cuda_kernel);
+REGISTER_CUDA_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cuda_kernel);
 
 } // namespace at::native
diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu
index 030cb90fc4b1..bd48c9b05808 100644
--- a/aten/src/ATen/native/cuda/TensorTopK.cu
+++ b/aten/src/ATen/native/cuda/TensorTopK.cu
@@ -512,11 +512,22 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo<T, IndexType> input,
 
                            T *kthValues,
                            uint32_t* withinKCounts,
-                           uint32_t* kthCounts) {
+                           uint32_t* kthCounts,
+                           uint32_t num_blocks) {
 
   uint32_t items_per_block = items_per_thread * BLOCK_THREADS;
   uint32_t tidx = threadIdx.x;
   uint32_t block_idx = getLinearBlockId<uint32_t>();
+
+  // The grid is computed from `getGridFromTiles`, when there are lots of
+  // elements, we will use both blockIdx.x and blockIdx.y, and maybe blockIdx.z
+  // when this is the case, the number of blocks that we are launching can be
+  // more than the number of blocks we need. So we need to check the range of
+  // `block_idx`.
+  if (block_idx >= num_blocks) {
+    return;
+  }
+
   uint32_t slice_idx = block_idx / blocks_per_slice;
   uint32_t blk_idx_in_slice = block_idx % blocks_per_slice;
 
@@ -731,7 +742,7 @@ void launch(
   gatherTopK<T, IndexType, Dim><<<grid, block, 0, stream>>>(
     input, inputSliceSize, outputSliceSize, largest, numInputSlices, inputWithinSliceStride,
     topK, topKWithinSliceStride, indices, indicesWithinSliceStride, items_per_thread,
-    blocks_per_slice, kthValues, withinKCounts, kthCounts);
+    blocks_per_slice, kthValues, withinKCounts, kthCounts, num_blocks);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 #else
   // Find topk values based on kth values
diff --git a/aten/src/ATen/native/cuda/UnaryComplexKernels.cu b/aten/src/ATen/native/cuda/UnaryComplexKernels.cu
index 230e155a9a5f..688974db517c 100644
--- a/aten/src/ATen/native/cuda/UnaryComplexKernels.cu
+++ b/aten/src/ATen/native/cuda/UnaryComplexKernels.cu
@@ -25,7 +25,7 @@ __host__ __device__ static inline c10::complex<T> angle_wrapper(c10::complex<T>
   return c10::complex<T>{std::arg(v), 0};
 }
 
-const char angle_name[] = "angle_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char angle_name[] = "angle_kernel";
 void angle_kernel_cuda(TensorIteratorBase& iter) {
   auto dtype = iter.common_dtype();
   if (at::isComplexType(dtype)) {
@@ -60,7 +60,7 @@ void angle_kernel_cuda(TensorIteratorBase& iter) {
 }
 
 // NB: Ignores the negative bit on tensors
-const char conj_name[] = "conj_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char conj_name[] = "conj_kernel";
 void conj_kernel_cuda(TensorIteratorBase& iter) {
   auto conj_chalf = [&] {
     using scalar_t = c10::complex<at::Half>;
diff --git a/aten/src/ATen/native/cuda/UnaryGammaKernels.cu b/aten/src/ATen/native/cuda/UnaryGammaKernels.cu
index 34e2e80604b7..f4a540fcf939 100644
--- a/aten/src/ATen/native/cuda/UnaryGammaKernels.cu
+++ b/aten/src/ATen/native/cuda/UnaryGammaKernels.cu
@@ -13,7 +13,7 @@
 namespace at::native {
 
 // See note [Jiterator]
-const char digamma_name[] = "digamma";
+CONSTEXPR_EXCEPT_WIN_CUDA char digamma_name[] = "digamma";
 void digamma_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "digamma_cuda", [&]() {
@@ -32,7 +32,7 @@ void digamma_kernel_cuda(TensorIteratorBase& iter) {
 }
 
 // See note [Jiterator]
-const char trigamma_name[] = "trigamma";
+CONSTEXPR_EXCEPT_WIN_CUDA char trigamma_name[] = "trigamma";
 void trigamma_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "trigamma_cuda", [&]() {
@@ -50,7 +50,7 @@ void trigamma_kernel_cuda(TensorIteratorBase& iter) {
   #endif // AT_USE_JITERATOR()
 }
 
-const char polygamma_name[] = "polygamma";
+CONSTEXPR_EXCEPT_WIN_CUDA char polygamma_name[] = "polygamma";
 void polygamma_kernel_cuda(TensorIteratorBase& iter, int64_t n) {
   if (n == 0) {
     digamma_kernel_cuda(iter);
@@ -83,7 +83,7 @@ void polygamma_kernel_cuda(TensorIteratorBase& iter, int64_t n) {
   }
 }
 
-const char lgamma_name[] = "lgamma_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char lgamma_name[] = "lgamma_kernel";
 void lgamma_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "lgamma_cuda", [&]() {
diff --git a/aten/src/ATen/native/cuda/UnaryGeometricAcosKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricAcosKernel.cu
index e15fe358a2df..329fd465d2fc 100644
--- a/aten/src/ATen/native/cuda/UnaryGeometricAcosKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricAcosKernel.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char acos_name[] = "acos";
+CONSTEXPR_EXCEPT_WIN_CUDA char acos_name[] = "acos";
 void acos_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnaryGeometricAcoshKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricAcoshKernel.cu
index 06928c291cc6..ad48e51af3cf 100644
--- a/aten/src/ATen/native/cuda/UnaryGeometricAcoshKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricAcoshKernel.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char acosh_name[] = "acosh";
+CONSTEXPR_EXCEPT_WIN_CUDA char acosh_name[] = "acosh";
 void acosh_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if(at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnaryGeometricAsinKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricAsinKernel.cu
index 0e618dc01896..6b3cec3b96c0 100644
--- a/aten/src/ATen/native/cuda/UnaryGeometricAsinKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricAsinKernel.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char asin_name[] = "asin";
+CONSTEXPR_EXCEPT_WIN_CUDA char asin_name[] = "asin";
 void asin_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnaryGeometricAsinhKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricAsinhKernel.cu
index a9f8fa120cad..7ffe938181d9 100644
--- a/aten/src/ATen/native/cuda/UnaryGeometricAsinhKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricAsinhKernel.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char asinh_name[] = "asinh";
+CONSTEXPR_EXCEPT_WIN_CUDA char asinh_name[] = "asinh";
 void asinh_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnaryGeometricAtanKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricAtanKernel.cu
index c8830e56aa35..d56f75efd4e2 100644
--- a/aten/src/ATen/native/cuda/UnaryGeometricAtanKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricAtanKernel.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char atan_name[] = "atan";
+CONSTEXPR_EXCEPT_WIN_CUDA char atan_name[] = "atan";
 void atan_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnaryGeometricAtanhKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricAtanhKernel.cu
index 34a24439f2a2..55c9919c2ca6 100644
--- a/aten/src/ATen/native/cuda/UnaryGeometricAtanhKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricAtanhKernel.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char atanh_name[] = "atanh";
+CONSTEXPR_EXCEPT_WIN_CUDA char atanh_name[] = "atanh";
 void atanh_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnaryGeometricCosKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricCosKernel.cu
index 4bc9fa9a2d08..1359d0a16ae7 100644
--- a/aten/src/ATen/native/cuda/UnaryGeometricCosKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricCosKernel.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char cos_name[] = "cos";
+CONSTEXPR_EXCEPT_WIN_CUDA char cos_name[] = "cos";
 void cos_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnaryGeometricCoshKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricCoshKernel.cu
index 081690e80d5c..c9608a1ba2aa 100644
--- a/aten/src/ATen/native/cuda/UnaryGeometricCoshKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricCoshKernel.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char cosh_name[] = "cosh";
+CONSTEXPR_EXCEPT_WIN_CUDA char cosh_name[] = "cosh";
 void cosh_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnaryGeometricSinKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricSinKernel.cu
index 0a7a3a1f7aff..f7d6d5e3b42a 100644
--- a/aten/src/ATen/native/cuda/UnaryGeometricSinKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricSinKernel.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char sin_name[] = "sin";
+CONSTEXPR_EXCEPT_WIN_CUDA char sin_name[] = "sin";
 void sin_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnaryGeometricSinhKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricSinhKernel.cu
index c1567cf67739..22dd2bf2ab2f 100644
--- a/aten/src/ATen/native/cuda/UnaryGeometricSinhKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricSinhKernel.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char sinh_name[] = "sinh";
+CONSTEXPR_EXCEPT_WIN_CUDA char sinh_name[] = "sinh";
 void sinh_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnaryGeometricTanKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricTanKernel.cu
index ffae442f3892..91208b69e48d 100644
--- a/aten/src/ATen/native/cuda/UnaryGeometricTanKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricTanKernel.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char tan_name[] = "tan";
+CONSTEXPR_EXCEPT_WIN_CUDA char tan_name[] = "tan";
 void tan_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnaryGeometricTanhKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricTanhKernel.cu
index 3242e96a4e5c..9e6184f7a3f0 100644
--- a/aten/src/ATen/native/cuda/UnaryGeometricTanhKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricTanhKernel.cu
@@ -11,7 +11,7 @@
 
 namespace at::native {
 
-const char tanh_name[] = "tanh";
+CONSTEXPR_EXCEPT_WIN_CUDA char tanh_name[] = "tanh";
 void tanh_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnaryLogKernels.cu b/aten/src/ATen/native/cuda/UnaryLogKernels.cu
index cdcfe41ae281..fb3d19baca35 100644
--- a/aten/src/ATen/native/cuda/UnaryLogKernels.cu
+++ b/aten/src/ATen/native/cuda/UnaryLogKernels.cu
@@ -12,7 +12,7 @@
 
 namespace at::native {
 
-const char log_name[] = "log_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char log_name[] = "log_kernel";
 void log_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
@@ -44,7 +44,7 @@ void log_kernel_cuda(TensorIteratorBase& iter) {
   }
 }
 
-const char log10_name[] = "log10_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char log10_name[] = "log10_kernel";
 void log10_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
@@ -81,7 +81,7 @@ void log1p_kernel_cuda(TensorIteratorBase& iter) {
   });
 }
 
-const char log2_name[] = "log2_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char log2_name[] = "log2_kernel";
 void log2_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu
index 00132ac3fe04..07d5527e87d3 100644
--- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu
@@ -34,7 +34,7 @@ void bitwise_not_kernel_cuda(TensorIteratorBase& iter) {
   }
 }
 
-const char exp_name[] = "exp_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char exp_name[] = "exp_kernel";
 void exp_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
@@ -92,7 +92,7 @@ C10_HOST_DEVICE static inline c10::complex<T> rsqrt_wrapper(c10::complex<T> v) {
   return one / ::sqrt(v);
 }
 
-const char rsqrt_name[] = "rsqrt_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char rsqrt_name[] = "rsqrt_kernel";
 void rsqrt_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
@@ -131,7 +131,7 @@ void rsqrt_kernel_cuda(TensorIteratorBase& iter) {
   }
 }
 
-const char sqrt_name[] = "sqrt_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char sqrt_name[] = "sqrt_kernel";
 void sqrt_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
diff --git a/aten/src/ATen/native/cuda/UnarySignKernels.cu b/aten/src/ATen/native/cuda/UnarySignKernels.cu
index b8c13318c7b4..83233f3143cb 100644
--- a/aten/src/ATen/native/cuda/UnarySignKernels.cu
+++ b/aten/src/ATen/native/cuda/UnarySignKernels.cu
@@ -25,7 +25,7 @@ void logical_not_kernel_cuda(TensorIteratorBase& iter) {
 }
 
 // NB: Ignores the negative bit on tensors
-const char neg_name[] = "neg_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char neg_name[] = "neg_kernel";
 void neg_kernel_cuda(TensorIteratorBase& iter) {
   auto dtype = iter.dtype();
   if (at::isComplexType(dtype)) {
@@ -96,7 +96,7 @@ C10_HOST_DEVICE static inline c10::complex<T> sgn_wrapper(c10::complex<T> z) {
   }
 }
 
-const char sgn_name[] = "sgn_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char sgn_name[] = "sgn_kernel";
 void sgn_kernel_cuda(TensorIteratorBase& iter){
   auto dtype = iter.dtype();
   #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu b/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu
index 8d75eb719e19..cd62641a80d7 100644
--- a/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu
+++ b/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu
@@ -19,7 +19,7 @@
 
 namespace at::native {
 
-const char exp2_name[] = "exp2_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char exp2_name[] = "exp2_kernel";
 void exp2_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
@@ -41,7 +41,7 @@ void exp2_kernel_cuda(TensorIteratorBase& iter) {
   #endif
 }
 
-const char i0_name[] = "i0";
+CONSTEXPR_EXCEPT_WIN_CUDA char i0_name[] = "i0";
 void i0_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "i0_cuda", [&]() {
@@ -63,7 +63,7 @@ void i0_kernel_cuda(TensorIteratorBase& iter) {
 }
 
 // See note [Jiterator]
-const char i0e_name[] = "calc_i0e";
+CONSTEXPR_EXCEPT_WIN_CUDA char i0e_name[] = "calc_i0e";
 void i0e_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "i0e_cuda", [&]() {
@@ -84,7 +84,7 @@ void i0e_kernel_cuda(TensorIteratorBase& iter) {
 
 // See note [Jiterator]
 
-const char i1_name[] = "i1";
+CONSTEXPR_EXCEPT_WIN_CUDA char i1_name[] = "i1";
 void i1_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "i1_cuda", [&]() {
@@ -102,7 +102,7 @@ void i1_kernel_cuda(TensorIteratorBase& iter) {
   #endif // AT_USE_JITERATOR()
 }
 
-const char i1e_name[] = "i1e";
+CONSTEXPR_EXCEPT_WIN_CUDA char i1e_name[] = "i1e";
 void i1e_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "i1e_cuda", [&]() {
@@ -120,7 +120,7 @@ void i1e_kernel_cuda(TensorIteratorBase& iter) {
   #endif
 }
 
-const char sigmoid_name[] = "sigmoid";
+CONSTEXPR_EXCEPT_WIN_CUDA char sigmoid_name[] = "sigmoid";
 void sigmoid_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
@@ -159,7 +159,7 @@ void sigmoid_kernel_cuda(TensorIteratorBase& iter) {
   }
 }
 
-const char sinc_name[] = "sinc";
+CONSTEXPR_EXCEPT_WIN_CUDA char sinc_name[] = "sinc";
 void sinc_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
@@ -217,7 +217,7 @@ void logit_kernel_cuda(TensorIteratorBase& iter, const Scalar& eps_scalar) {
       });
 }
 
-const char ndtri_name[] = "ndtri";
+CONSTEXPR_EXCEPT_WIN_CUDA char ndtri_name[] = "ndtri";
 void ndtri_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "ndtri_cuda", [&]() {
@@ -234,7 +234,7 @@ void ndtri_kernel_cuda(TensorIteratorBase& iter) {
   #endif
 }
 
-const char log_ndtr_name[] = "log_ndtr";
+CONSTEXPR_EXCEPT_WIN_CUDA char log_ndtr_name[] = "log_ndtr";
 void log_ndtr_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "log_ndtr_cuda", [&]() {
@@ -259,7 +259,7 @@ void erf_kernel_cuda(TensorIteratorBase& iter) {
   });
 }
 
-const char erfc_name[] = "erfc_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char erfc_name[] = "erfc_kernel";
 void erfc_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "erfc_cuda", [&]() {
@@ -278,7 +278,7 @@ void erfc_kernel_cuda(TensorIteratorBase& iter) {
   #endif
 }
 
-const char erfinv_name[] = "erfinv_kernel";
+CONSTEXPR_EXCEPT_WIN_CUDA char erfinv_name[] = "erfinv_kernel";
 void erfinv_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "erfinv_cuda", [&]() {
@@ -296,7 +296,7 @@ void erfinv_kernel_cuda(TensorIteratorBase& iter) {
   #endif
 }
 
-const char erfcx_name[] = "erfcx";
+CONSTEXPR_EXCEPT_WIN_CUDA char erfcx_name[] = "erfcx";
 void erfcx_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "erfcx_cuda", [&]() {
@@ -313,7 +313,7 @@ void erfcx_kernel_cuda(TensorIteratorBase& iter) {
   #endif
 }
 
-const char kaiser_window_name[] = "kaiser_window";
+CONSTEXPR_EXCEPT_WIN_CUDA char kaiser_window_name[] = "kaiser_window";
 void kaiser_window_kernel_cuda(TensorIteratorBase& iter, int64_t window_length, double beta_){
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "kaiser_window_cuda", [&](){
@@ -347,7 +347,7 @@ void kaiser_window_kernel_cuda(TensorIteratorBase& iter, int64_t window_length,
   #endif
 }
 
-const char entr_name[] = "entr";
+CONSTEXPR_EXCEPT_WIN_CUDA char entr_name[] = "entr";
 void entr_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "entr_cuda", [&]() {
diff --git a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu
index 3589e06b52f6..c96d7dbae763 100644
--- a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu
+++ b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu
@@ -190,7 +190,8 @@ static void upsample_bicubic2d_out_cuda_template(
   // Launch kernel
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half, at::ScalarType::BFloat16,
       input.scalar_type(), "upsample_bicubic2d_out_frame", [&] {
         using accscalar_t = at::acc_type<scalar_t, true>;
 
@@ -245,7 +246,8 @@ static void upsample_bicubic2d_backward_out_cuda_template(
       at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half, at::ScalarType::BFloat16,
       grad_output.scalar_type(), "upsample_bicubic2d_backward_out_frame", [&] {
         using accscalar_t = at::acc_type<scalar_t, true>;
 
diff --git a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu
index e7d1bb02eeb4..938793890a5a 100644
--- a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu
+++ b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu
@@ -283,7 +283,9 @@ static void upsample_bilinear2d_out_cuda_template(
     return;
   }
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "upsample_bilinear2d_out_frame", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half, at::ScalarType::BFloat16,
+      input.scalar_type(), "upsample_bilinear2d_out_frame", [&] {
     // heuristic: only use channels_last path when it's faster than the contiguous path
     if (memory_format == at::MemoryFormat::ChannelsLast && channels >= 16 && \
           output.is_contiguous(memory_format)) {
@@ -395,7 +397,9 @@ static void upsample_bilinear2d_backward_out_cuda_template(
     return;
   }
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad_output_.scalar_type(), "upsample_bilinear2d_backward_out_frame", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half, at::ScalarType::BFloat16,
+      grad_output_.scalar_type(), "upsample_bilinear2d_backward_out_frame", [&] {
     if (memory_format == at::MemoryFormat::ChannelsLast && channels >= 4 && \
           grad_input.is_contiguous(memory_format)) {
       using accscalar_t = at::acc_type<scalar_t, true>;
@@ -695,7 +699,8 @@ static void upsample_gen2d_aa_out_cuda_template(
   int block_x = std::min<int>(maxThreadsDim[0], at::cuda::warp_size());
   int grid_x = std::min<int>(maxGridSize[0], ceil_div(output_width, block_x));
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half, at::ScalarType::BFloat16,
       input.scalar_type(), "upsample_bilinear2d_out_frame", [&] {
         using accscalar_t = at::acc_type<scalar_t, true>;
 
@@ -796,7 +801,8 @@ static void upsample_gen2d_aa_backward_out_cuda_template(
   int grid_y = std::min<int>(maxGridSize[1], ceil_div(output_height, block_y));
   const dim3 grid(grid_x, grid_y);
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half, at::ScalarType::BFloat16,
       grad_output.scalar_type(), "upsample_gen2d_backward_out_frame", [&] {
         using accscalar_t = at::acc_type<scalar_t, true>;
 
diff --git a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu
index fd29c2ec8551..54a03ae61b8f 100644
--- a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu
+++ b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu
@@ -138,7 +138,8 @@ static void upsample_linear1d_out_cuda_template(
       //at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock;
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half, at::ScalarType::BFloat16,
       input.scalar_type(), "upsample_linear1d_out_frame", [&] {
         using accscalar_t = at::acc_type<scalar_t, true>;
 
@@ -181,7 +182,8 @@ static void upsample_linear1d_backward_out_cuda_template(
       //at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock;
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half, at::ScalarType::BFloat16,
       grad_output.scalar_type(), "upsample_linear1d_out_frame_backward", [&] {
         using accscalar_t = at::acc_type<scalar_t, true>;
 
diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu
index 26048202a456..aa35103627ed 100644
--- a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu
+++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu
@@ -129,7 +129,7 @@ static void upsample_nearest1d_out_cuda_template(
   TORCH_CHECK(output.numel() <= std::numeric_limits<int32_t>::max());
 
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, input.scalar_type(), "upsample_nearest1d_out_frame", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte, input.scalar_type(), "upsample_nearest1d_out_frame", [&] {
         using accscalar_t = at::acc_type<scalar_t, true>;
 
         auto idata = input.data_ptr<scalar_t>();
@@ -177,7 +177,7 @@ static void upsample_nearest1d_backward_out_cuda_template(
   TORCH_CHECK(grad_input.numel() <= std::numeric_limits<int32_t>::max());
 
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest1d_backward_out_frame", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest1d_backward_out_frame", [&] {
         using accscalar_t = at::acc_type<scalar_t, true>;
 
         auto idata = grad_input.data_ptr<scalar_t>();
diff --git a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu
index 5f4f4100da5c..25aea554fcb5 100644
--- a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu
+++ b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu
@@ -247,7 +247,7 @@ static void upsample_nearest2d_out_cuda_template(
     const int64_t num_kernels = output.numel();
     const int64_t num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
 
-    AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_nhwc_out_frame", [&] {
+    AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_nhwc_out_frame", [&] {
       const scalar_t* idata = input.data_ptr<scalar_t>();
       scalar_t* odata = output.data_ptr<scalar_t>();
 
@@ -305,7 +305,7 @@ static void upsample_nearest2d_out_cuda_template(
         "input tensor has spatial dimension larger than the kernel capacity");
 
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-    AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_out_frame", [&] {
+    AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_out_frame", [&] {
           using accscalar_t = at::acc_type<scalar_t, true>;
 
           auto idata = input.data_ptr<scalar_t>();
@@ -377,7 +377,7 @@ static void upsample_nearest2d_backward_out_cuda_template(
     const int num_kernels = grad_input.numel();
     const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
 
-    AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest2d_backward_nhwc_out_frame", [&] {
+    AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest2d_backward_nhwc_out_frame", [&] {
       using accscalar_t = at::acc_type<scalar_t, true>;
 
       const scalar_t* go = grad_output.data_ptr<scalar_t>();
@@ -412,7 +412,7 @@ static void upsample_nearest2d_backward_out_cuda_template(
     TORCH_CHECK(grad_input.numel() <= std::numeric_limits<int32_t>::max());
 
     cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-    AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest2d_backward_out_frame", [&] {
+    AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest2d_backward_out_frame", [&] {
       using accscalar_t = at::acc_type<scalar_t, true>;
 
       auto idata = grad_input_c.data_ptr<scalar_t>();
diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu
index d06fc571a2da..8dde1c187c86 100644
--- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu
+++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu
@@ -184,7 +184,7 @@ static void upsample_nearest3d_out_cuda_template(
   TORCH_CHECK(output.numel() <= std::numeric_limits<int32_t>::max());
 
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte,input.scalar_type(), "upsample_nearest3d_out_frame", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte,input.scalar_type(), "upsample_nearest3d_out_frame", [&] {
         using accscalar_t = at::acc_type<scalar_t, true>;
 
         auto idata = input.data_ptr<scalar_t>();
@@ -257,7 +257,7 @@ static void upsample_nearest3d_backward_out_cuda_template(
   TORCH_CHECK(grad_input.numel() <= std::numeric_limits<int32_t>::max());
 
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest3d_backward_out_frame", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest3d_backward_out_frame", [&] {
         using accscalar_t = at::acc_type<scalar_t, true>;
 
         auto idata = grad_input.data_ptr<scalar_t>();
diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu
index d443082c4a04..9470d4675408 100644
--- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu
+++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu
@@ -264,7 +264,8 @@ static void upsample_trilinear3d_out_cuda_template(
       at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 512);
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half, at::ScalarType::BFloat16,
       input.scalar_type(), "upsample_trilinear3d_out_frame", [&] {
         using accscalar_t = at::acc_type<scalar_t, true>;
 
@@ -330,7 +331,8 @@ static void upsample_trilinear3d_backward_out_cuda_template(
       at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 256);
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half, at::ScalarType::BFloat16,
       grad_output.scalar_type(),
       "upsample_trilinear3d_backward_out_frame",
       [&] {
diff --git a/aten/src/ATen/native/cuda/ZetaKernel.cu b/aten/src/ATen/native/cuda/ZetaKernel.cu
index c184329b796c..7459504f508c 100644
--- a/aten/src/ATen/native/cuda/ZetaKernel.cu
+++ b/aten/src/ATen/native/cuda/ZetaKernel.cu
@@ -15,7 +15,7 @@ namespace {
  * See note [3-Clause BSD License for the Cephes Math Library].
  */
 // See note [Jiterator]
-const char zeta_name[] = "zeta";
+CONSTEXPR_EXCEPT_WIN_CUDA char zeta_name[] = "zeta";
 void zeta_kernel_cuda(TensorIteratorBase& iter) {
   #if AT_USE_JITERATOR()
     AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "zeta_cuda", [&]() {
diff --git a/aten/src/ATen/native/cuda/airy_ai.cu b/aten/src/ATen/native/cuda/airy_ai.cu
index 195fb35503b4..35e6b002260c 100644
--- a/aten/src/ATen/native/cuda/airy_ai.cu
+++ b/aten/src/ATen/native/cuda/airy_ai.cu
@@ -20,7 +20,7 @@
 
 namespace at::native {
 namespace {
-const char airy_ai_name[] = "airy_ai_forward";
+CONSTEXPR_EXCEPT_WIN_CUDA char airy_ai_name[] = "airy_ai_forward";
 
 void airy_ai_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/bessel_j0.cu b/aten/src/ATen/native/cuda/bessel_j0.cu
index 005a275ec6e7..2ebfe676e50b 100644
--- a/aten/src/ATen/native/cuda/bessel_j0.cu
+++ b/aten/src/ATen/native/cuda/bessel_j0.cu
@@ -20,7 +20,7 @@
 
 namespace at::native {
 namespace {
-const char bessel_j0_name[] = "bessel_j0_forward";
+CONSTEXPR_EXCEPT_WIN_CUDA char bessel_j0_name[] = "bessel_j0_forward";
 
 void bessel_j0_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/bessel_j1.cu b/aten/src/ATen/native/cuda/bessel_j1.cu
index 1d78b1f1e833..42bd43321f40 100644
--- a/aten/src/ATen/native/cuda/bessel_j1.cu
+++ b/aten/src/ATen/native/cuda/bessel_j1.cu
@@ -20,7 +20,7 @@
 
 namespace at::native {
 namespace {
-const char bessel_j1_name[] = "bessel_j1_forward";
+CONSTEXPR_EXCEPT_WIN_CUDA char bessel_j1_name[] = "bessel_j1_forward";
 
 void bessel_j1_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/bessel_y0.cu b/aten/src/ATen/native/cuda/bessel_y0.cu
index db9917945a3d..631031d4e26c 100644
--- a/aten/src/ATen/native/cuda/bessel_y0.cu
+++ b/aten/src/ATen/native/cuda/bessel_y0.cu
@@ -20,7 +20,7 @@
 
 namespace at::native {
         namespace {
-            const char bessel_y0_name[] = "bessel_y0_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char bessel_y0_name[] = "bessel_y0_forward";
 
             void bessel_y0_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/bessel_y1.cu b/aten/src/ATen/native/cuda/bessel_y1.cu
index 38ca3967890a..1375061e43e0 100644
--- a/aten/src/ATen/native/cuda/bessel_y1.cu
+++ b/aten/src/ATen/native/cuda/bessel_y1.cu
@@ -20,7 +20,7 @@
 
 namespace at::native {
         namespace {
-            const char bessel_y1_name[] = "bessel_y1_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char bessel_y1_name[] = "bessel_y1_forward";
 
             void bessel_y1_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/chebyshev_polynomial_t.cu b/aten/src/ATen/native/cuda/chebyshev_polynomial_t.cu
index a4756b68f381..7736d20e0188 100644
--- a/aten/src/ATen/native/cuda/chebyshev_polynomial_t.cu
+++ b/aten/src/ATen/native/cuda/chebyshev_polynomial_t.cu
@@ -10,7 +10,7 @@
 
 namespace at::native {
         namespace {
-            const char chebyshev_polynomial_t_name[] = "chebyshev_polynomial_t_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char chebyshev_polynomial_t_name[] = "chebyshev_polynomial_t_forward";
 
             void chebyshev_polynomial_t_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/chebyshev_polynomial_u.cu b/aten/src/ATen/native/cuda/chebyshev_polynomial_u.cu
index 02084c31f010..412479e11f49 100644
--- a/aten/src/ATen/native/cuda/chebyshev_polynomial_u.cu
+++ b/aten/src/ATen/native/cuda/chebyshev_polynomial_u.cu
@@ -10,7 +10,7 @@
 
 namespace at::native {
         namespace {
-            const char chebyshev_polynomial_u_name[] = "chebyshev_polynomial_u_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char chebyshev_polynomial_u_name[] = "chebyshev_polynomial_u_forward";
 
             void chebyshev_polynomial_u_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/chebyshev_polynomial_v.cu b/aten/src/ATen/native/cuda/chebyshev_polynomial_v.cu
index 21c069c9f2aa..ca2e534e641b 100644
--- a/aten/src/ATen/native/cuda/chebyshev_polynomial_v.cu
+++ b/aten/src/ATen/native/cuda/chebyshev_polynomial_v.cu
@@ -10,7 +10,7 @@
 
 namespace at::native {
         namespace {
-            const char chebyshev_polynomial_v_name[] = "chebyshev_polynomial_v_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char chebyshev_polynomial_v_name[] = "chebyshev_polynomial_v_forward";
 
             void chebyshev_polynomial_v_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/chebyshev_polynomial_w.cu b/aten/src/ATen/native/cuda/chebyshev_polynomial_w.cu
index 5de275f2420b..9d5a0e3a7bd3 100644
--- a/aten/src/ATen/native/cuda/chebyshev_polynomial_w.cu
+++ b/aten/src/ATen/native/cuda/chebyshev_polynomial_w.cu
@@ -10,7 +10,7 @@
 
 namespace at::native {
         namespace {
-            const char chebyshev_polynomial_w_name[] = "chebyshev_polynomial_w_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char chebyshev_polynomial_w_name[] = "chebyshev_polynomial_w_forward";
 
             void chebyshev_polynomial_w_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu b/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu
index 0651206a5641..ec8ac6b4f267 100644
--- a/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu
+++ b/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu
@@ -8,7 +8,7 @@
 
 namespace at::native {
 
-void _fused_adam_cuda_impl_(
+void _fused_adam_amsgrad_cuda_impl_(
     at::TensorList params,
     at::TensorList grads,
     at::TensorList exp_avgs,
@@ -20,7 +20,6 @@ void _fused_adam_cuda_impl_(
     const double beta2,
     const double weight_decay,
     const double eps,
-    const bool amsgrad,
     const bool maximize,
     const c10::optional<at::Tensor>& grad_scale,
     const c10::optional<at::Tensor>& found_inf
@@ -45,7 +44,8 @@ void _fused_adam_cuda_impl_(
             maximize,
             /* amsgrad */true,
             grad_scale_ptr,
-            found_inf_ptr);
+            found_inf_ptr,
+            ADAM_MODE::ORIGINAL);
         });
 }
 
diff --git a/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cuh b/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cuh
index 46e893e564d9..f71b2df4d218 100644
--- a/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cuh
+++ b/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cuh
@@ -3,7 +3,7 @@
 
 namespace at { namespace native {
 
-void _fused_adam_cuda_impl_(
+void _fused_adam_amsgrad_cuda_impl_(
     at::TensorList params,
     at::TensorList grads,
     at::TensorList exp_avgs,
@@ -15,7 +15,6 @@ void _fused_adam_cuda_impl_(
     const double beta2,
     const double weight_decay,
     const double eps,
-    const bool amsgrad,
     const bool maximize,
     const c10::optional<at::Tensor>& grad_scale,
     const c10::optional<at::Tensor>& found_inf
diff --git a/aten/src/ATen/native/cuda/fused_adam_impl.cu b/aten/src/ATen/native/cuda/fused_adam_impl.cu
index 64f79771f94f..d91be6bfc990 100644
--- a/aten/src/ATen/native/cuda/fused_adam_impl.cu
+++ b/aten/src/ATen/native/cuda/fused_adam_impl.cu
@@ -19,7 +19,6 @@ void _fused_adam_cuda_impl_(
     const double beta2,
     const double weight_decay,
     const double eps,
-    const bool amsgrad,
     const bool maximize,
     const c10::optional<at::Tensor>& grad_scale,
     const c10::optional<at::Tensor>& found_inf
@@ -44,7 +43,8 @@ void _fused_adam_cuda_impl_(
             maximize,
             /* amsgrad */false,
             grad_scale_ptr,
-            found_inf_ptr);
+            found_inf_ptr,
+            ADAM_MODE::ORIGINAL);
         });
 }
 
diff --git a/aten/src/ATen/native/cuda/fused_adam_impl.cuh b/aten/src/ATen/native/cuda/fused_adam_impl.cuh
index a76ba566970f..ff76fbf36226 100644
--- a/aten/src/ATen/native/cuda/fused_adam_impl.cuh
+++ b/aten/src/ATen/native/cuda/fused_adam_impl.cuh
@@ -14,7 +14,6 @@ void _fused_adam_cuda_impl_(
     const double beta2,
     const double weight_decay,
     const double eps,
-    const bool amsgrad,
     const bool maximize,
     const c10::optional<at::Tensor>& grad_scale,
     const c10::optional<at::Tensor>& found_inf
diff --git a/aten/src/ATen/native/cuda/fused_adam_utils.cuh b/aten/src/ATen/native/cuda/fused_adam_utils.cuh
index 8d7c410915c1..97e60b9de955 100644
--- a/aten/src/ATen/native/cuda/fused_adam_utils.cuh
+++ b/aten/src/ATen/native/cuda/fused_adam_utils.cuh
@@ -7,6 +7,11 @@
 
 namespace at { namespace native {
 
+enum class ADAM_MODE: uint8_t {
+  ORIGINAL = 0,
+  ADAMW = 1
+};
+
 namespace {
 
 constexpr uint8_t kParamIdx = 0;
@@ -27,7 +32,8 @@ C10_DEVICE __forceinline__ void adam_math(
     const bool maximize,
     const bool amsgrad,
     const float* grad_scale_ptr,
-    const float* found_inf_ptr
+    const float* found_inf_ptr,
+    const ADAM_MODE adam_mode
 ) {
 #pragma unroll
     for (int ii = 0; ii < kILP; ii++) {
@@ -47,34 +53,32 @@ C10_DEVICE __forceinline__ void adam_math(
         if (amsgrad) {
             max_exp_avg_sq = static_cast<opmath_t>(r_args[kMaxExpAvgSqIdx][ii]);
         }
-
         // Update param, grad, 1st and 2nd order momentum.
         if (weight_decay != 0) {
-            grad += param * weight_decay;
+          switch (adam_mode) {
+            case ADAM_MODE::ORIGINAL:
+              grad += param * weight_decay;
+              break;
+            case ADAM_MODE::ADAMW:
+              param -= lr * weight_decay * param;
+              break;
+          }
         }
         // todo(crcrpar): use lerp
         // ref: https://developer.nvidia.com/blog/lerp-faster-cuda/
         exp_avg = beta1 * exp_avg + (1 - beta1) * grad;
         exp_avg_sq = beta2 * exp_avg_sq + (1 - beta2) * grad * grad;
-
-        if (amsgrad) {
-            max_exp_avg_sq = std::max(max_exp_avg_sq, exp_avg_sq);
-        }
-
         const opmath_t bias_correction1 = 1 - at::native::pow_(beta1, *step_count);
-        const opmath_t bias_correction2 = 1 - at::native::pow_(beta2, *step_count);
-
         const opmath_t step_size = lr / bias_correction1;
-
+        const opmath_t bias_correction2 = 1 - at::native::pow_(beta2, *step_count);
         const opmath_t bias_correction2_sqrt = std::sqrt(bias_correction2);
-
         opmath_t denom;
         if (amsgrad) {
+            max_exp_avg_sq = std::max(max_exp_avg_sq, exp_avg_sq);
             denom = (std::sqrt(max_exp_avg_sq) / bias_correction2_sqrt) + eps;
         } else {
             denom = (std::sqrt(exp_avg_sq) / bias_correction2_sqrt) + eps;
         }
-
         param -= step_size * exp_avg / denom;
 
         // Store results.
@@ -115,7 +119,8 @@ struct FusedAdamMathFunctor {
             const bool maximize,
             const bool amsgrad,
             const float* grad_scale_ptr,
-            const float* found_inf_ptr
+            const float* found_inf_ptr,
+            const ADAM_MODE adam_mode
   ) {
         int tensor_loc = tl.block_to_tensor[blockIdx.x];
         int chunk_idx = tl.block_to_chunk[blockIdx.x];
@@ -138,7 +143,7 @@ struct FusedAdamMathFunctor {
                     load_store(r_args[i], args[i], 0, i_start);
                 }
                 adam_math<scalar_type, opmath_t, depth>(
-                    r_args, step_count, lr, beta1, beta2, weight_decay, eps, maximize, amsgrad, grad_scale_ptr, found_inf_ptr);
+                    r_args, step_count, lr, beta1, beta2, weight_decay, eps, maximize, amsgrad, grad_scale_ptr, found_inf_ptr, adam_mode);
 #pragma unroll
                 for (int i = 0; i < depth; i++) {
                   if (i != kGradIdx || grad_scale_ptr) {
@@ -150,7 +155,7 @@ struct FusedAdamMathFunctor {
             for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) {
               load_args<depth>(r_args, args, i_start, chunk_size, n);
               adam_math<scalar_type, opmath_t, depth>(
-                  r_args, step_count, lr, beta1, beta2, weight_decay, eps, maximize, amsgrad, grad_scale_ptr, found_inf_ptr);
+                  r_args, step_count, lr, beta1, beta2, weight_decay, eps, maximize, amsgrad, grad_scale_ptr, found_inf_ptr, adam_mode);
 #pragma unroll
               for (int i = 0; i < depth; i++) {
                   if (i != kGradIdx || grad_scale_ptr) {
diff --git a/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu b/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu
new file mode 100644
index 000000000000..b82db1d7763a
--- /dev/null
+++ b/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu
@@ -0,0 +1,52 @@
+#include <ATen/native/cuda/fused_adamw_amsgrad_impl.cuh>
+
+#include <ATen/Dispatch.h>
+#include <ATen/native/ForeachUtils.h>
+#include <ATen/native/cuda/fused_adam_utils.cuh>
+#include <ATen/native/cuda/MultiTensorApply.cuh>
+#include <vector>
+
+namespace at { namespace native {
+
+void _fused_adamw_amsgrad_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,
+    const c10::optional<at::Tensor>& found_inf
+) {
+  std::vector<std::vector<at::Tensor>> tensor_lists{
+    params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec(), max_exp_avg_sqs.vec() };
+
+  float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr<float>() : nullptr;
+  float* found_inf_ptr = found_inf.has_value() ? found_inf->data_ptr<float>() : nullptr;
+
+  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, params[0].scalar_type(),
+      "fused_adamw_kernel_cuda", [&]() {
+        multi_tensor_apply_for_fused_optimizer<5>(
+            tensor_lists,
+            state_steps,
+            FusedAdamMathFunctor<scalar_t, 5>(),
+            lr,
+            beta1,
+            beta2,
+            weight_decay,
+            eps,
+            maximize,
+            /* amsgrad */true,
+            grad_scale_ptr,
+            found_inf_ptr,
+            ADAM_MODE::ADAMW);
+        });
+}
+
+} } // namespace at::native
diff --git a/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh b/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh
new file mode 100644
index 000000000000..f084bda2080f
--- /dev/null
+++ b/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh
@@ -0,0 +1,23 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at { namespace native {
+
+void _fused_adamw_amsgrad_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,
+    const c10::optional<at::Tensor>& found_inf
+);
+
+} } // namespace at::native
diff --git a/aten/src/ATen/native/cuda/fused_adamw_impl.cu b/aten/src/ATen/native/cuda/fused_adamw_impl.cu
new file mode 100644
index 000000000000..fff29afd7b47
--- /dev/null
+++ b/aten/src/ATen/native/cuda/fused_adamw_impl.cu
@@ -0,0 +1,51 @@
+#include <ATen/native/cuda/fused_adamw_impl.cuh>
+
+#include <ATen/Dispatch.h>
+#include <ATen/native/ForeachUtils.h>
+#include <ATen/native/cuda/fused_adam_utils.cuh>
+#include <ATen/native/cuda/MultiTensorApply.cuh>
+#include <vector>
+
+namespace at { namespace native {
+
+void _fused_adamw_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,
+    const c10::optional<at::Tensor>& found_inf
+) {
+  std::vector<std::vector<at::Tensor>> tensor_lists{
+    params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec() };
+
+  float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr<float>() : nullptr;
+  float* found_inf_ptr = found_inf.has_value() ? found_inf->data_ptr<float>() : nullptr;
+
+  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, params[0].scalar_type(),
+      "fused_adamw_kernel_cuda", [&]() {
+        multi_tensor_apply_for_fused_optimizer<4>(
+            tensor_lists,
+            state_steps,
+            FusedAdamMathFunctor<scalar_t, 4>(),
+            lr,
+            beta1,
+            beta2,
+            weight_decay,
+            eps,
+            maximize,
+            /* amsgrad */false,
+            grad_scale_ptr,
+            found_inf_ptr,
+            ADAM_MODE::ADAMW);
+        });
+}
+
+} } // namespace at::native
diff --git a/aten/src/ATen/native/cuda/fused_adamw_impl.cuh b/aten/src/ATen/native/cuda/fused_adamw_impl.cuh
new file mode 100644
index 000000000000..3afb89281457
--- /dev/null
+++ b/aten/src/ATen/native/cuda/fused_adamw_impl.cuh
@@ -0,0 +1,22 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at { namespace native {
+
+void _fused_adamw_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,
+    const c10::optional<at::Tensor>& found_inf
+);
+
+} } // namespace at::native
diff --git a/aten/src/ATen/native/cuda/group_norm_kernel.cu b/aten/src/ATen/native/cuda/group_norm_kernel.cu
index 876f7f429e2f..04bdca8ad112 100644
--- a/aten/src/ATen/native/cuda/group_norm_kernel.cu
+++ b/aten/src/ATen/native/cuda/group_norm_kernel.cu
@@ -36,9 +36,9 @@ __global__ void RowwiseMomentsCUDAKernel(
     T* mean,
     T* rstd) {
   using T_ACC = acc_type<T, true>;
-  using WelfordType = WelfordData<T_ACC, int64_t, T_ACC>;
+  using WelfordType = WelfordData<T_ACC, int64_t>;
   using WelfordOp =
-      WelfordOps<T_ACC, T_ACC, int64_t, T_ACC, thrust::pair<T_ACC, T_ACC>>;
+      WelfordOps<T_ACC, T_ACC, int64_t, thrust::pair<T_ACC, T_ACC>>;
 
   const int64_t i = blockIdx.x;
   WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false};
diff --git a/aten/src/ATen/native/cuda/hermite_polynomial_h.cu b/aten/src/ATen/native/cuda/hermite_polynomial_h.cu
index 3b4d410ddaa4..f53253bcd099 100644
--- a/aten/src/ATen/native/cuda/hermite_polynomial_h.cu
+++ b/aten/src/ATen/native/cuda/hermite_polynomial_h.cu
@@ -10,7 +10,7 @@
 
 namespace at::native {
         namespace {
-            const char hermite_polynomial_h_name[] = "hermite_polynomial_h_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char hermite_polynomial_h_name[] = "hermite_polynomial_h_forward";
 
             void hermite_polynomial_h_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/hermite_polynomial_he.cu b/aten/src/ATen/native/cuda/hermite_polynomial_he.cu
index 06abfaeb4c0c..bab376565858 100644
--- a/aten/src/ATen/native/cuda/hermite_polynomial_he.cu
+++ b/aten/src/ATen/native/cuda/hermite_polynomial_he.cu
@@ -10,7 +10,7 @@
 
 namespace at::native {
         namespace {
-            const char hermite_polynomial_he_name[] = "hermite_polynomial_he_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char hermite_polynomial_he_name[] = "hermite_polynomial_he_forward";
 
             void hermite_polynomial_he_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp
index 42c4311875f6..61781e03b4a9 100644
--- a/aten/src/ATen/native/cuda/jit_utils.cpp
+++ b/aten/src/ATen/native/cuda/jit_utils.cpp
@@ -921,7 +921,7 @@ void codegenOutputQuery(
 
 // TODO: another copy paste from jit, refactor so it's usable from both
 // TODO: try making the CUcontext thread local to see if that improves performance - why is this slow?
-void __inline__ initializeCudaContext() {
+void initializeCudaContext() {
   // lazily construct context if non-existing yet;
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   CUcontext pctx = nullptr;
@@ -1656,5 +1656,4 @@ void launch_jitted_pwise_function(
     nullptr));
 }
 
-
 } // at::cuda::jit
diff --git a/aten/src/ATen/native/cuda/jit_utils.h b/aten/src/ATen/native/cuda/jit_utils.h
index 8206f67316e1..40841c2060a8 100644
--- a/aten/src/ATen/native/cuda/jit_utils.h
+++ b/aten/src/ATen/native/cuda/jit_utils.h
@@ -198,4 +198,6 @@ inline std::string typeName(ScalarType t) {
 }
 #undef TYPE_NAME_CASE
 
+TORCH_CUDA_CPP_API void initializeCudaContext();
+
 }}}  // namespace at::cuda::jit
diff --git a/aten/src/ATen/native/cuda/laguerre_polynomial_l.cu b/aten/src/ATen/native/cuda/laguerre_polynomial_l.cu
index 9a50245142f7..a98336dfcb6e 100644
--- a/aten/src/ATen/native/cuda/laguerre_polynomial_l.cu
+++ b/aten/src/ATen/native/cuda/laguerre_polynomial_l.cu
@@ -10,7 +10,7 @@
 
 namespace at::native {
         namespace {
-            const char laguerre_polynomial_l_name[] = "laguerre_polynomial_l_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char laguerre_polynomial_l_name[] = "laguerre_polynomial_l_forward";
 
             void laguerre_polynomial_l_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
index 457365135b7a..6f4d37822e2a 100644
--- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu
+++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
@@ -51,9 +51,9 @@ __global__ void RowwiseMomentsCUDAKernel(
     const T* X,
     T_ACC* mean,
     T_ACC* rstd) {
-  using WelfordType = WelfordData<T_ACC, int64_t, T_ACC>;
+  using WelfordType = WelfordData<T_ACC, int64_t>;
   using WelfordOp =
-      WelfordOps<T_ACC, T_ACC, int64_t, T_ACC, thrust::pair<T_ACC, T_ACC>>;
+      WelfordOps<T_ACC, T_ACC, int64_t, thrust::pair<T_ACC, T_ACC>>;
 
   __shared__
       typename std::aligned_storage<sizeof(WelfordType), alignof(WelfordType)>::
@@ -1445,5 +1445,6 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_backward_cuda(
 }
 
 REGISTER_DISPATCH(LayerNormKernel, &LayerNormKernelImpl);
+REGISTER_DISPATCH(LayerNormBackwardKernel, &LayerNormBackwardKernelImpl);
 
 } // namespace at::native
diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp
index 71262998464d..87260196a402 100644
--- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp
+++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp
@@ -24,7 +24,6 @@
 #include <ATen/NativeFunctions.h>
 #else
 #include <ATen/ops/_cholesky_solve_helper_native.h>
-#include <ATen/ops/_symeig_helper_native.h>
 #include <ATen/ops/arange.h>
 #include <ATen/ops/empty.h>
 #include <ATen/ops/empty_like.h>
@@ -1873,8 +1872,6 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
 
 REGISTER_CUDA_DISPATCH(geqrf_stub, &geqrf_kernel);
 
-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ symeig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
 template <typename scalar_t>
 static void apply_magma_eigh(const Tensor& values, const Tensor& vectors, const Tensor& infos, bool upper, bool compute_eigenvectors) {
 #if !AT_MAGMA_ENABLED()
@@ -1949,39 +1946,6 @@ static void apply_magma_eigh(const Tensor& values, const Tensor& vectors, const
 #endif
 }
 
-std::tuple<Tensor, Tensor> _symeig_helper_cuda(const Tensor& self, bool eigenvectors, bool upper) {
-  Tensor infos = at::zeros({std::max<int64_t>(1, batchCount(self))}, self.options().dtype(kInt).device(at::kCPU));
-
-  auto eigvals_shape = IntArrayRef(self.sizes().data(), self.dim()-1);  // self.shape[:-1]
-  ScalarType real_dtype = toRealValueType(self.scalar_type());
-
-  // magmaSyevd uses a hybrid CPU-GPU algorithm to compute the eigenvalues and eigenvectors.
-  // The driver routine magma_(d/s)syev_gpu accepts a tensor on the CPU for eigvalenvalues.
-  // The data is later moved to the appropriate device.
-  // In the case where self.numel() == 0, we just return an empty tensor of
-  // dimensions on the CUDA (to avoid the unnecessary "to(at::kCUDA)")
-  auto eigvals_working_copy = self.numel() == 0
-                              ? at::empty(eigvals_shape, self.options().dtype(real_dtype))
-                              : at::empty(eigvals_shape, self.options().dtype(real_dtype).device(at::kCPU));
-
-  if (self.numel() == 0) {
-    return std::tuple<Tensor, Tensor>(eigvals_working_copy, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT));
-  }
-
-  auto self_working_copy = cloneBatchedColumnMajor(self);
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "symeig_cuda", [&]{
-    apply_magma_eigh<scalar_t>(eigvals_working_copy, self_working_copy, infos, upper, eigenvectors);
-  });
-
-  at::_linalg_check_errors(infos, "symeig", self.dim() == 2);
-
-  if (eigenvectors) {
-    return std::tuple<Tensor, Tensor>(eigvals_working_copy.to(self.device()), self_working_copy);
-  } else {
-    return std::tuple<Tensor, Tensor>(eigvals_working_copy.to(self.device()), at::empty({0}, self.options()));
-  }
-}
-
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linalg_eigh ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 // This is a type dispatch function for 'apply_magma_eigh'
@@ -2796,8 +2760,7 @@ REGISTER_CUDA_DISPATCH(lstsq_stub, &lstsq_kernel);
 #if defined(BUILD_LAZY_CUDA_LINALG)
 struct DispatchInitializer {
   DispatchInitializer() {
-    cuda::detail::LinalgDispatch disp{ _symeig_helper_cuda,
-                                       _cholesky_solve_helper_cuda};
+    cuda::detail::LinalgDispatch disp{_cholesky_solve_helper_cuda};
     cuda::detail::registerLinalgDispatch(disp);
   };
 } initializer;
diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h
index 532919e83ebd..3fdf3ebf7afd 100644
--- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h
+++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h
@@ -84,7 +84,6 @@ namespace cuda { namespace detail {
 // This is only used for an old-style dispatches
 // Please do not add any new entires to it
 struct LinalgDispatch {
-   std::tuple<Tensor, Tensor> (*symeig_helper)(const Tensor& self, bool eigenvectors, bool upper);
    Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper);
 };
 C10_EXPORT void registerLinalgDispatch(const LinalgDispatch&);
diff --git a/aten/src/ATen/native/cuda/modified_bessel_i0.cu b/aten/src/ATen/native/cuda/modified_bessel_i0.cu
index 2bbe5dfc4b66..9f1f3ba98c67 100644
--- a/aten/src/ATen/native/cuda/modified_bessel_i0.cu
+++ b/aten/src/ATen/native/cuda/modified_bessel_i0.cu
@@ -20,7 +20,7 @@
 
 namespace at::native {
         namespace {
-            const char modified_bessel_i0_name[] = "modified_bessel_i0_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char modified_bessel_i0_name[] = "modified_bessel_i0_forward";
 
             void modified_bessel_i0_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/modified_bessel_i1.cu b/aten/src/ATen/native/cuda/modified_bessel_i1.cu
index d76ef10a8578..d51e7fefb0eb 100644
--- a/aten/src/ATen/native/cuda/modified_bessel_i1.cu
+++ b/aten/src/ATen/native/cuda/modified_bessel_i1.cu
@@ -20,7 +20,7 @@
 
 namespace at::native {
         namespace {
-            const char modified_bessel_i1_name[] = "modified_bessel_i1_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char modified_bessel_i1_name[] = "modified_bessel_i1_forward";
 
             void modified_bessel_i1_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/modified_bessel_k0.cu b/aten/src/ATen/native/cuda/modified_bessel_k0.cu
index 3e950fa5565f..574268456c84 100644
--- a/aten/src/ATen/native/cuda/modified_bessel_k0.cu
+++ b/aten/src/ATen/native/cuda/modified_bessel_k0.cu
@@ -20,7 +20,7 @@
 
 namespace at::native {
         namespace {
-            const char modified_bessel_k0_name[] = "modified_bessel_k0_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char modified_bessel_k0_name[] = "modified_bessel_k0_forward";
 
             void modified_bessel_k0_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/modified_bessel_k1.cu b/aten/src/ATen/native/cuda/modified_bessel_k1.cu
index 6ccfd1d96690..b3720d8e1ba9 100644
--- a/aten/src/ATen/native/cuda/modified_bessel_k1.cu
+++ b/aten/src/ATen/native/cuda/modified_bessel_k1.cu
@@ -20,7 +20,7 @@
 
 namespace at::native {
         namespace {
-            const char modified_bessel_k1_name[] = "modified_bessel_k1_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char modified_bessel_k1_name[] = "modified_bessel_k1_forward";
 
             void modified_bessel_k1_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/scaled_modified_bessel_k0.cu b/aten/src/ATen/native/cuda/scaled_modified_bessel_k0.cu
index 2daf955655bc..ac2355e409ac 100644
--- a/aten/src/ATen/native/cuda/scaled_modified_bessel_k0.cu
+++ b/aten/src/ATen/native/cuda/scaled_modified_bessel_k0.cu
@@ -20,7 +20,7 @@
 
 namespace at::native {
         namespace {
-            const char scaled_modified_bessel_k0_name[] = "scaled_modified_bessel_k0_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char scaled_modified_bessel_k0_name[] = "scaled_modified_bessel_k0_forward";
 
             void scaled_modified_bessel_k0_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/scaled_modified_bessel_k1.cu b/aten/src/ATen/native/cuda/scaled_modified_bessel_k1.cu
index a5d846abfac5..b1d8d2a41b62 100644
--- a/aten/src/ATen/native/cuda/scaled_modified_bessel_k1.cu
+++ b/aten/src/ATen/native/cuda/scaled_modified_bessel_k1.cu
@@ -20,7 +20,7 @@
 
 namespace at::native {
         namespace {
-            const char scaled_modified_bessel_k1_name[] = "scaled_modified_bessel_k1_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char scaled_modified_bessel_k1_name[] = "scaled_modified_bessel_k1_forward";
 
             void scaled_modified_bessel_k1_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_t.cu b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_t.cu
index 7dd76e688747..d86042030cd6 100644
--- a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_t.cu
+++ b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_t.cu
@@ -10,7 +10,7 @@
 
 namespace at::native {
         namespace {
-            const char shifted_chebyshev_polynomial_t_name[] = "shifted_chebyshev_polynomial_t_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char shifted_chebyshev_polynomial_t_name[] = "shifted_chebyshev_polynomial_t_forward";
 
             void shifted_chebyshev_polynomial_t_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_u.cu b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_u.cu
index 4f885398a28b..a2e2cd485fda 100644
--- a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_u.cu
+++ b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_u.cu
@@ -10,7 +10,7 @@
 
 namespace at::native {
         namespace {
-            const char shifted_chebyshev_polynomial_u_name[] = "shifted_chebyshev_polynomial_u_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char shifted_chebyshev_polynomial_u_name[] = "shifted_chebyshev_polynomial_u_forward";
 
             void shifted_chebyshev_polynomial_u_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_v.cu b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_v.cu
index 6d3b24469298..6e5404179ab9 100644
--- a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_v.cu
+++ b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_v.cu
@@ -10,7 +10,7 @@
 
 namespace at::native {
 namespace {
-const char shifted_chebyshev_polynomial_v_name[] = "shifted_chebyshev_polynomial_v_forward";
+CONSTEXPR_EXCEPT_WIN_CUDA char shifted_chebyshev_polynomial_v_name[] = "shifted_chebyshev_polynomial_v_forward";
 
 void shifted_chebyshev_polynomial_v_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_w.cu b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_w.cu
index e0ca9d462efa..3bfee57d14ee 100644
--- a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_w.cu
+++ b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_w.cu
@@ -10,7 +10,7 @@
 
 namespace at::native {
         namespace {
-            const char shifted_chebyshev_polynomial_w_name[] = "shifted_chebyshev_polynomial_w_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char shifted_chebyshev_polynomial_w_name[] = "shifted_chebyshev_polynomial_w_forward";
 
             void shifted_chebyshev_polynomial_w_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cuda/spherical_bessel_j0.cu b/aten/src/ATen/native/cuda/spherical_bessel_j0.cu
index 76995e6e4d83..d0bf46e65394 100644
--- a/aten/src/ATen/native/cuda/spherical_bessel_j0.cu
+++ b/aten/src/ATen/native/cuda/spherical_bessel_j0.cu
@@ -20,7 +20,7 @@
 
 namespace at::native {
         namespace {
-            const char spherical_bessel_j0_name[] = "spherical_bessel_j0_forward";
+            CONSTEXPR_EXCEPT_WIN_CUDA char spherical_bessel_j0_name[] = "spherical_bessel_j0_forward";
 
             void spherical_bessel_j0_kernel_cuda(TensorIteratorBase& iterator) {
 #if AT_USE_JITERATOR()
diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp
index a189a5d84f39..74d030e9a86c 100644
--- a/aten/src/ATen/native/cudnn/RNN.cpp
+++ b/aten/src/ATen/native/cudnn/RNN.cpp
@@ -90,7 +90,7 @@ namespace {
     bool train;
     double dropout;
     Tensor dropout_state;
-    DropoutDescriptorParams() {}
+    DropoutDescriptorParams() = default;
     void set(bool train_, double dropout_, Tensor dropout_state_) {
       train = train_;
       dropout = dropout_;
diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp
index 9e13c1146999..f1a15cf46c71 100644
--- a/aten/src/ATen/native/group_norm.cpp
+++ b/aten/src/ATen/native/group_norm.cpp
@@ -124,6 +124,8 @@ std::tuple<Tensor, Tensor, Tensor> native_group_norm_backward(
   if (mixed_type) {
     check_mixed_data_type(X, mean, rstd);
   }
+  auto memory_format = X.device().is_cpu() ?
+      X.suggest_memory_format() : at::MemoryFormat::Contiguous;
 
   Tensor dX;
   Tensor dgamma;
@@ -135,7 +137,7 @@ std::tuple<Tensor, Tensor, Tensor> native_group_norm_backward(
         c10::nullopt /* layout */,
         c10::nullopt /* device */,
         c10::nullopt /* pin_memory */,
-        X.suggest_memory_format());
+        memory_format);
   }
   if (grad_input_mask[1]) {
     dgamma = at::native::empty_like(
diff --git a/aten/src/ATen/native/mkl/SparseBlasImpl.cpp b/aten/src/ATen/native/mkl/SparseBlasImpl.cpp
index a2ed1af23795..5cf71eb34475 100644
--- a/aten/src/ATen/native/mkl/SparseBlasImpl.cpp
+++ b/aten/src/ATen/native/mkl/SparseBlasImpl.cpp
@@ -4,6 +4,7 @@
 #include <ATen/Tensor.h>
 #include <ATen/mkl/Sparse.h>
 #include <ATen/native/LinearAlgebraUtils.h>
+#include <ATen/SparseCsrTensorUtils.h>
 #include <ATen/native/mkl/SparseBlasImpl.h>
 
 #include <c10/core/ScalarType.h>
@@ -15,6 +16,14 @@
 #include <ATen/mkl/Utils.h>
 #endif
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/cat.h>
+#include <ATen/ops/sparse_coo_tensor.h>
+#endif
+
 namespace at {
 namespace native {
 namespace sparse {
@@ -588,7 +597,7 @@ void add_out_sparse_csr(
 }
 
 void triangular_solve_out_sparse_csr(
-    const Tensor& A,
+    const Tensor& A_,
     const Tensor& B,
     const Tensor& X,
     bool upper,
@@ -600,12 +609,31 @@ void triangular_solve_out_sparse_csr(
       "Calling triangular_solve on a sparse CPU tensor requires Linux platform. ",
       "Please use PyTorch built with MKL on Linux.");
 #else
-  if (B.numel() == 0 || X.numel() == 0 || A._nnz() == 0) {
+  if (B.numel() == 0 || X.numel() == 0 || A_._nnz() == 0) {
     // If A has no nnz, then A is singular and we can't solve.
     X.fill_(NAN);
     return;
   }
 
+  const auto materialize_diagonal_indices = [](const Tensor& t) -> Tensor {
+    const auto n = t.size(-1);
+    const auto compressed_indices = std::get<0>(at::sparse_csr::getCompressedPlainIndices(t));
+    const auto diag_indices = at::arange(n, compressed_indices.options()).unsqueeze(0).expand({2, n});
+    const auto diag_values = at::zeros({1}, t.values().options()).expand({n});
+
+    const auto t_coo = t.to_sparse();
+    const auto expanded_indices = at::cat({t_coo._indices(), diag_indices}, /*dim=*/-1);
+    const auto expanded_values = at::cat({t_coo._values(), diag_values}, /*dim=*/0);
+
+    const auto t_expanded_coo = at::sparse_coo_tensor(expanded_indices, expanded_values, t_coo.sizes(), t_coo.options());
+    return t_expanded_coo.to_sparse(t.layout());
+  };
+
+  // MKL has a bug for inputs with unmaterialized diagonal indices.
+  // See https://github.com/pytorch/pytorch/issues/88890 and
+  // the comments within.
+  const auto A = unitriangular ? materialize_diagonal_indices(A_) : A_;
+
   c10::MaybeOwned<Tensor> X_ = prepare_dense_matrix_for_mkl(X);
   IntArrayRef X_strides = X_->strides();
   auto ndim = X_->dim();
diff --git a/aten/src/ATen/native/mkl/SparseCsrLinearAlgebra.cpp b/aten/src/ATen/native/mkl/SparseCsrLinearAlgebra.cpp
index 8081de65facf..33ef13c08e9f 100644
--- a/aten/src/ATen/native/mkl/SparseCsrLinearAlgebra.cpp
+++ b/aten/src/ATen/native/mkl/SparseCsrLinearAlgebra.cpp
@@ -53,7 +53,7 @@ static constexpr ScalarType TORCH_INT_TYPE = at::kInt;
 
 class SparseCsrMKLInterface {
  private:
-  sparse_matrix_t A = 0;
+  sparse_matrix_t A{nullptr};
   matrix_descr desc;
 
  public:
diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp
index 3d8188c003e1..7ba6b320ad70 100644
--- a/aten/src/ATen/native/mkldnn/Conv.cpp
+++ b/aten/src/ATen/native/mkldnn/Conv.cpp
@@ -46,12 +46,44 @@ std::tuple<Tensor, Tensor, Tensor> mkldnn_convolution_backward(
 
 REGISTER_NO_CPU_DISPATCH(mkldnn_convolution_backward_stub);
 
+Tensor mkldnn_convolution_transpose(
+    const Tensor& input, const Tensor& weight, const c10::optional<Tensor>& bias_opt,
+    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) {
+  TORCH_CHECK(false, "mkldnn_convolution_transpose: ATen not compiled with MKLDNN support");
+}
+
+Tensor mkldnn_convolution_transpose_backward_input(
+    IntArrayRef input_size, const Tensor& grad_output, const Tensor& weight,
+    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation,
+    int64_t groups, bool bias_defined) {
+  TORCH_CHECK(false, "mkldnn_convolution_transpose_backward_input: ATen not compiled with MKLDNN support");
+}
+
+std::tuple<Tensor, Tensor> mkldnn_convolution_transpose_backward_weights(
+    IntArrayRef weight_size, const Tensor& grad_output, const Tensor& input,
+    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation,
+    int64_t groups, bool bias_defined) {
+  TORCH_CHECK(false, "mkldnn_convolution_transpose_backward_weights: ATen not compiled with MKLDNN support");
+}
+
+std::tuple<Tensor, Tensor, Tensor> mkldnn_convolution_transpose_backward(
+    const Tensor& input, const Tensor& grad_output_t, const Tensor& weight,
+    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation,
+    int64_t groups, std::array<bool,3> output_mask) {
+  TORCH_CHECK(false, "mkldnn_convolution_transpose_backward: ATen not compiled with MKLDNN support");
+}
+
+REGISTER_NO_CPU_DISPATCH(mkldnn_convolution_transpose_stub);
+REGISTER_NO_CPU_DISPATCH(mkldnn_convolution_transpose_backward_stub);
+
 }}
 
 #else // AT_MKLDNN_ENABLED
 
 #include <ATen/native/mkldnn/MKLDNNCommon.h>
 #include <ATen/native/mkldnn/Utils.h>
+#include <ATen/native/ConvUtils.h>
+#include <c10/util/irange.h>
 
 namespace at { namespace native {
 
@@ -282,7 +314,6 @@ Tensor _mkldnn_convolution(
   } else if (!use_channels_last) {
     return mkldnn_to_dense(MKLDNNTensor(y, input_t.options()));
   } else {
-    TORCH_INTERNAL_ASSERT(y.get_desc().is_nhwc());
     return output;
   }
 }
@@ -593,6 +624,176 @@ Tensor& mkldnn_convolution_pointwise_binary_(
   return other_t;
 }
 
+std::vector<int64_t> _original_deconv_weight_size(
+    const Tensor& weight_t,
+    int64_t groups) {
+  TORCH_CHECK(weight_t.is_mkldnn() || weight_t.is_meta(), "expects weight_t to be mkldnn or meta tensor");
+  // The size of weight_t is the prepacked size.
+  //  Groups > 1: [g*o, i/g, ...]
+  //  Groups == 1: [o, i, ...]
+  // Returns original weight size in [i, o, ...]
+  auto dim = weight_t.sizes().size();
+  TORCH_CHECK(dim > 2);
+
+  std::vector<int64_t> weight_IOHW_sizes(dim);
+  if (groups > 1) {
+    weight_IOHW_sizes[0] = weight_t.sizes()[1] * groups;
+    weight_IOHW_sizes[1] = weight_t.sizes()[0] / groups;
+  } else {
+    weight_IOHW_sizes[0] = weight_t.sizes()[1];
+    weight_IOHW_sizes[1] = weight_t.sizes()[0];
+  }
+  for (const auto d : c10::irange(2, dim)) {
+    weight_IOHW_sizes[d] = weight_t.sizes()[d];
+  }
+  return weight_IOHW_sizes;
+}
+
+
+Tensor _mkldnn_convolution_transpose(
+    const Tensor& input_t,
+    const Tensor& weight_t,
+    const c10::optional<Tensor>& bias_opt,
+    IntArrayRef padding,
+    IntArrayRef output_padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups,
+    bool use_channels_last,
+    c10::string_view attr = "none",
+    torch::List<c10::optional<at::Scalar>> scalars =
+        torch::List<c10::optional<at::Scalar>>(),
+    c10::optional<c10::string_view> algorithm = c10::nullopt) {
+  ideep::attr_t op_attr = ideep::attr_t();
+  if (attr != "none") {
+    auto it = fusion_unary_attr_map().find(attr);
+    TORCH_CHECK(it != fusion_unary_attr_map().end(), "Fusion behavior undefined.");
+    op_attr = it->second(scalars, algorithm);
+  }
+
+  // See [Note: hacky wrapper removal for optional tensor]
+  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
+  const Tensor& bias = *bias_maybe_owned;
+
+  if (input_t.scalar_type() == ScalarType::BFloat16) {
+    TORCH_CHECK(mkldnn_bf16_device_check(),
+        "mkldnn_convolution_transpose: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq");
+  }
+
+  std::vector<int64_t> weight_IOHW_sizes = weight_t.is_mkldnn() ? _original_deconv_weight_size(weight_t, groups) : weight_t.sizes().vec();
+
+  auto memory_format =
+      mkldnn_convolution_memory_format(input_t.ndimension(), use_channels_last);
+
+  auto input = input_t.is_mkldnn() ? input_t : input_t.contiguous(memory_format);
+  auto weight = weight_t.is_mkldnn() ? weight_t : weight_t.contiguous(memory_format);
+
+  auto output_sizes = conv_input_size(input.sizes(), weight_IOHW_sizes, padding, output_padding, stride, dilation, groups);
+  auto output = at::empty({0}, input.options());
+
+  const ideep::tensor x = itensor_from_tensor(input);
+
+  ideep::tensor w = itensor_from_tensor(weight);
+  if (!weight.is_mkldnn()) {
+    // mkldnn transposed convolution has weight in logical order of OIHW or OIDHW,
+    // while PyTorch has IOHW or IODHW, `._tranpose()` switches strides (no memory copy).
+    w.transpose_(0, 1);
+  }
+
+  ideep::tensor y;
+  if (use_channels_last) {
+    output.resize_(output_sizes, memory_format);
+    y = itensor_from_tensor(output);
+  }
+
+  if (bias.defined()) {
+    const ideep::tensor b = itensor_from_tensor(bias);
+    ideep::convolution_transpose_forward::compute(
+        x,
+        w,
+        b,
+        output_sizes,
+        y,
+        stride.vec(),
+        padding.vec(),
+        padding_r(padding, output_padding),
+        dilation.vec(),
+        groups,
+        op_attr);
+  } else {
+    ideep::convolution_transpose_forward::compute(
+        x,
+        w,
+        output_sizes,
+        y,
+        stride.vec(),
+        padding.vec(),
+        padding_r(padding, output_padding),
+        dilation.vec(),
+        groups,
+        op_attr);
+  }
+  if (input.is_mkldnn()) {
+    return MKLDNNTensor(y, input.options());
+  } else if (!use_channels_last) {
+    return mkldnn_to_dense(MKLDNNTensor(y, input.options()));
+  } else {
+    TORCH_INTERNAL_ASSERT(y.get_desc().is_nhwc());
+    return output;
+  }
+}
+
+Tensor mkldnn_convolution_transpose_pointwise(
+    const Tensor& input_t,
+    const Tensor& weight_t,
+    const c10::optional<Tensor>& bias_opt,
+    IntArrayRef padding,
+    IntArrayRef output_padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups,
+    c10::string_view attr,
+    torch::List<c10::optional<at::Scalar>> scalars,
+    c10::optional<c10::string_view> algorithm) {
+  c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
+  bool use_channels_last =
+      weight_t.is_mkldnn() || mkldnn_conv_use_channels_last(input_t, weight_t);
+  return _mkldnn_convolution_transpose(
+      input_t,
+      weight_t,
+      bias_opt,
+      padding,
+      output_padding,
+      stride,
+      dilation,
+      groups,
+      use_channels_last,
+      attr,
+      scalars,
+      algorithm
+  );
+}
+
+Tensor mkldnn_convolution_transpose_pointwise_meta(
+    const Tensor& input_t,
+    const Tensor& weight_t,
+    const c10::optional<Tensor>& bias_opt,
+    IntArrayRef padding,
+    IntArrayRef output_padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups,
+    c10::string_view attr,
+    torch::List<c10::optional<at::Scalar>> scalars,
+    c10::optional<c10::string_view> algorithm) {
+
+  std::vector<int64_t> weight_IOHW_sizes = _original_deconv_weight_size(weight_t, groups);
+  auto output_sizes = conv_input_size(input_t.sizes(), weight_IOHW_sizes, padding, output_padding, stride, dilation, groups);
+
+  auto output = at::empty(output_sizes, input_t.options());
+  return output;
+}
+
 Tensor mkldnn_convolution_backward_input(
     IntArrayRef input_size,
     const Tensor& grad_output,
@@ -631,7 +832,6 @@ Tensor mkldnn_convolution_backward_input(
   } else if (!is_channels_last){
     return mkldnn_to_dense(MKLDNNTensor(grad_x, grad_output.options()));
   } else {
-    TORCH_INTERNAL_ASSERT(grad_x.get_desc().is_nhwc());
     return grad_input;
   }
 }
@@ -682,8 +882,9 @@ std::tuple<Tensor, Tensor> mkldnn_convolution_backward_weights(
         mkldnn_to_dense(MKLDNNTensor(grad_w, grad_output.options())),
         bias_defined ? mkldnn_to_dense(MKLDNNTensor(grad_b, grad_output.options())) : Tensor());
   } else {
+    auto memory_format = mkldnn_convolution_memory_format(grad_output.ndimension(), is_channels_last);
     return std::make_tuple(
-        mkldnn_to_dense(MKLDNNTensor(grad_w, grad_output.options())).to(at::MemoryFormat::ChannelsLast),
+        mkldnn_to_dense(MKLDNNTensor(grad_w, grad_output.options())).to(memory_format),
         bias_defined ? mkldnn_to_dense(MKLDNNTensor(grad_b, grad_output.options())) : Tensor());
   }
 }
@@ -707,12 +908,200 @@ std::tuple<Tensor, Tensor, Tensor> mkldnn_convolution_backward(
     std::tie(grad_weight, grad_bias) = mkldnn_convolution_backward_weights(
       weight.sizes(), grad_output, input, padding, stride, dilation, groups, output_mask[2], is_channels_last);
   }
-
   return std::make_tuple(grad_input, grad_weight, grad_bias);
 }
 
 REGISTER_ALL_CPU_DISPATCH(mkldnn_convolution_backward_stub, &mkldnn_convolution_backward);
 
+Tensor mkldnn_convolution_transpose(
+    const Tensor& input,
+    const Tensor& weight,
+    const c10::optional<Tensor>& bias_opt,
+    IntArrayRef padding,
+    IntArrayRef output_padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups)
+{
+  // See [Note: hacky wrapper removal for optional tensor]
+  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
+  const Tensor& bias = *bias_maybe_owned;
+
+  if (input.scalar_type() == ScalarType::BFloat16) {
+    TORCH_CHECK(mkldnn_bf16_device_check(),
+        "mkldnn_convolution_transpose: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq");
+  }
+
+  bool use_channels_last = mkldnn_conv_use_channels_last(input, weight);
+  auto memory_format = mkldnn_convolution_memory_format(input.ndimension(), use_channels_last);
+
+  auto output_sizes = conv_input_size(input.sizes(), weight.sizes(), padding, output_padding, stride, dilation, groups);
+  auto output = at::empty({0}, input.options());
+
+  const ideep::tensor x = itensor_from_tensor(input);
+  ideep::tensor w = itensor_from_tensor(weight);
+  // mkldnn transposed convolution has weight in logical order of OIHW or OIDHW,
+  // while PyTorch has IOHW or IODHW, `._tranpose()` switches strides (no memory copy).
+  w.transpose_(0, 1);
+
+  ideep::tensor y;
+  if (use_channels_last) {
+    output.resize_(output_sizes, memory_format);
+    y = itensor_from_tensor(output);
+  }
+  if (bias.defined()) {
+    const ideep::tensor b = itensor_from_tensor(bias);
+    ideep::convolution_transpose_forward::compute(
+        x,
+        w,
+        b,
+        output_sizes,
+        y,
+        stride.vec(),
+        padding.vec(),
+        padding_r(padding, output_padding),
+        dilation.vec(),
+        groups);
+  } else {
+    ideep::convolution_transpose_forward::compute(
+        x,
+        w,
+        output_sizes,
+        y,
+        stride.vec(),
+        padding.vec(),
+        padding_r(padding, output_padding),
+        dilation.vec(),
+        groups);
+  }
+
+  if (input.is_mkldnn()) {
+    return MKLDNNTensor(y, input.options());
+  } else if (!use_channels_last) {
+    return mkldnn_to_dense(MKLDNNTensor(y, input.options()));
+  } else {
+    return output;
+  }
+}
+
+Tensor mkldnn_convolution_transpose_backward_input(
+    IntArrayRef input_size,
+    const Tensor& grad_output,
+    const Tensor& weight,
+    IntArrayRef padding,
+    IntArrayRef output_padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups,
+    bool bias_defined,
+    bool is_channels_last) {
+  auto grad_input = at::empty({0}, grad_output.options());
+
+  auto grad_y = itensor_from_tensor(grad_output);
+  auto w = itensor_view_from_dense(weight).transpose_(0, 1);
+
+  ideep::tensor grad_x;
+  if (is_channels_last) {
+    auto memory_format = mkldnn_convolution_memory_format(grad_output.ndimension(), is_channels_last);
+    grad_input.resize_(input_size, memory_format);
+    grad_x = itensor_from_tensor(grad_input);
+  }
+  ideep::convolution_transpose_backward_data::compute(
+      grad_y,
+      w,
+      input_size.vec(),
+      grad_x,
+      stride.vec(),
+      padding.vec(),
+      padding_r(padding, output_padding),
+      dilation.vec(),
+      groups);
+
+  if (grad_output.is_mkldnn()) {
+    return MKLDNNTensor(grad_x, grad_output.options());
+  } else if (!is_channels_last){
+    return mkldnn_to_dense(MKLDNNTensor(grad_x, grad_output.options()));
+  } else {
+    return grad_input;
+  }
+}
+
+std::tuple<Tensor,Tensor> mkldnn_convolution_transpose_backward_weights(
+    IntArrayRef weight_size,
+    const Tensor& grad_output,
+    const Tensor& input,
+    IntArrayRef padding,
+    IntArrayRef output_padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups,
+    bool bias_defined,
+    bool is_channels_last) {
+  auto grad_y = itensor_from_tensor(grad_output);
+  auto x = itensor_from_tensor(input);
+
+  ideep::tensor grad_w, grad_b;
+  if (bias_defined) {
+    ideep::convolution_transpose_backward_weights::compute(
+        x,
+        grad_y,
+        weight_size.vec(),
+        grad_w,
+        grad_b,
+        stride.vec(),
+        padding.vec(),
+        padding_r(padding, output_padding),
+        dilation.vec(),
+        groups);
+  } else {
+    ideep::convolution_transpose_backward_weights::compute(
+        x,
+        grad_y,
+        weight_size.vec(),
+        grad_w,
+        stride.vec(),
+        padding.vec(),
+        padding_r(padding, output_padding),
+        dilation.vec(),
+        groups);
+  }
+
+  if (!is_channels_last) {
+    return std::make_tuple(
+        mkldnn_to_dense(MKLDNNTensor(grad_w, grad_output.options())),
+        bias_defined ? mkldnn_to_dense(MKLDNNTensor(grad_b, grad_output.options())) : Tensor());
+  } else {
+    auto memory_format = mkldnn_convolution_memory_format(grad_output.ndimension(), is_channels_last);
+    return std::make_tuple(
+        mkldnn_to_dense(MKLDNNTensor(grad_w, grad_output.options())).to(memory_format),
+        bias_defined ? mkldnn_to_dense(MKLDNNTensor(grad_b, grad_output.options())) : Tensor());
+  }
+}
+
+std::tuple<Tensor, Tensor, Tensor> mkldnn_convolution_transpose_backward(
+    const Tensor& input, const Tensor& grad_output_t, const Tensor& weight,
+    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
+    std::array<bool,3> output_mask)
+{
+  bool is_channels_last = mkldnn_conv_use_channels_last(input, weight);
+  auto memory_format = mkldnn_convolution_memory_format(input.ndimension(), is_channels_last);
+  Tensor grad_output = grad_output_t.is_mkldnn() ? grad_output_t : grad_output_t.contiguous(memory_format);
+
+  Tensor grad_input, grad_weight, grad_bias;
+  if (output_mask[0]) {
+    grad_input = mkldnn_convolution_transpose_backward_input(
+        input.sizes(), grad_output, weight, padding, output_padding, stride, dilation, groups, output_mask[2], is_channels_last);
+  }
+  if (output_mask[1] || output_mask[2]) {
+    std::tie(grad_weight, grad_bias) = mkldnn_convolution_transpose_backward_weights(
+        weight.sizes(), grad_output, input, padding, output_padding, stride, dilation, groups, output_mask[2], is_channels_last);
+  }
+  return std::make_tuple(grad_input, grad_weight, grad_bias);
+}
+
+REGISTER_ALL_CPU_DISPATCH(mkldnn_convolution_transpose_stub, &mkldnn_convolution_transpose);
+REGISTER_ALL_CPU_DISPATCH(mkldnn_convolution_transpose_backward_stub, &mkldnn_convolution_transpose_backward);
+
 TORCH_LIBRARY_IMPL(mkldnn, CPU, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("mkldnn::_convolution_pointwise"),
@@ -723,6 +1112,9 @@ TORCH_LIBRARY_IMPL(mkldnn, CPU, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("mkldnn::_convolution_pointwise_.binary"),
       TORCH_FN(mkldnn_convolution_pointwise_binary_));
+  m.impl(
+      TORCH_SELECTIVE_NAME("mkldnn::_convolution_transpose_pointwise"),
+      TORCH_FN(mkldnn_convolution_transpose_pointwise));
 }
 
 TORCH_LIBRARY_IMPL(mkldnn, MkldnnCPU, m) {
@@ -735,6 +1127,15 @@ TORCH_LIBRARY_IMPL(mkldnn, MkldnnCPU, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("mkldnn::_convolution_pointwise_.binary"),
       TORCH_FN(mkldnn_convolution_pointwise_binary_));
+  m.impl(
+      TORCH_SELECTIVE_NAME("mkldnn::_convolution_transpose_pointwise"),
+      TORCH_FN(mkldnn_convolution_transpose_pointwise));
+}
+
+TORCH_LIBRARY_IMPL(mkldnn, Meta, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("mkldnn::_convolution_transpose_pointwise"),
+      TORCH_FN(mkldnn_convolution_transpose_pointwise_meta));
 }
 }}  // namespace at::native
 
diff --git a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp
index d643fae22ca2..e77b7856f2b2 100644
--- a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp
+++ b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp
@@ -168,6 +168,105 @@ Tensor mkldnn_reorder_conv3d_weight(
   return new_with_itensor_mkldnn(std::move(result), optTypeMetaToScalarType(self.options().dtype_opt()), self.options().device_opt());
 }
 
+
+ideep::tensor::desc get_conv_transpose_expected_weights_desc(
+    const ideep::tensor::dims& weights_dims,
+    ideep::tensor::data_type w_dtype,
+    const ideep::tensor::dims& strides,
+    const ideep::tensor::dims& padding_l,
+    const ideep::tensor::dims& padding_r,
+    const ideep::tensor::dims& dilates,
+    int groups,
+    bool channels_last,
+    ideep::algorithm aalgorithm,
+    ideep::data_type x_dtype,
+    const ideep::dims& src_dims) {
+  if (channels_last) {
+    return ideep::convolution_transpose_forward::expected_weights_desc<true>(
+        weights_dims,
+        w_dtype,
+        strides,
+        padding_l,
+        padding_r,
+        dilates,
+        groups,
+        aalgorithm,
+        ideep::prop_kind::forward,
+        src_dims);
+  } else {
+    return ideep::convolution_transpose_forward::expected_weights_desc<false>(
+        weights_dims,
+        w_dtype,
+        strides,
+        padding_l,
+        padding_r,
+        dilates,
+        groups,
+        aalgorithm,
+        ideep::prop_kind::forward,
+        src_dims);
+  }
+}
+
+
+Tensor mkldnn_reorder_conv_transpose2d_weight(
+    const Tensor& self,
+    IntArrayRef padding,
+    IntArrayRef output_padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups,
+    c10::OptionalArrayRef<int64_t> input_size) {
+  c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
+  if (self.scalar_type() == ScalarType::BFloat16) {
+    TORCH_CHECK(mkldnn_bf16_device_check(),
+        "mkldnn_reorder_conv2d_weight: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq");
+  }
+
+  ideep::tensor w = itensor_from_tensor(self);
+
+  ideep::dims src_dims = ideep::dims();
+  bool is_channels_last = false;
+  if (input_size.has_value()) {
+    src_dims = input_size.value().vec();
+    // if has input size, we always use channels last.
+    is_channels_last = true;
+  }
+
+  auto expected_desc = get_conv_transpose_expected_weights_desc(
+      w.get_dims(),
+      w.get_data_type(),
+      stride.vec(),
+      padding.vec(),
+      padding_r(padding, output_padding),
+      dilation.vec(),
+      groups,
+      is_channels_last,
+      ideep::algorithm::deconvolution_direct,
+      w.get_data_type(),
+      src_dims);
+
+  if (groups > 1) {
+    expected_desc = expected_desc.transpose(1, 2);
+  } else {
+    expected_desc = expected_desc.transpose(0, 1);
+  }
+
+  ideep::tensor result;
+  result.init(expected_desc);
+  w.transpose_(0, 1);
+  result.feed_from(w, /*is_deconv_weights*/true);
+
+  return new_with_itensor_mkldnn(std::move(result), optTypeMetaToScalarType(self.options().dtype_opt()),
+                                 self.options().device_opt());
+}
+
+TORCH_LIBRARY_IMPL(mkldnn, MkldnnCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("mkldnn::_reorder_convolution_transpose_weight"),
+      TORCH_FN(mkldnn_reorder_conv_transpose2d_weight));
+}
+
 #else
 
 Tensor mkldnn_to_dense(const Tensor& mkldnn_tensor, c10::optional<ScalarType> dtype) {
diff --git a/aten/src/ATen/native/mkldnn/RegisterMkldnnOpContextClass.cpp b/aten/src/ATen/native/mkldnn/RegisterMkldnnOpContextClass.cpp
index fac4507183ad..aa09916210bb 100644
--- a/aten/src/ATen/native/mkldnn/RegisterMkldnnOpContextClass.cpp
+++ b/aten/src/ATen/native/mkldnn/RegisterMkldnnOpContextClass.cpp
@@ -42,6 +42,10 @@ TORCH_LIBRARY(mkldnn, m) {
       "mkldnn::_convolution_pointwise.binary(Tensor X, Tensor other, Tensor W, Tensor? B, int[] padding, int[] stride, int[] dilation, int groups, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor Y"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "mkldnn::_convolution_pointwise_.binary(Tensor X, Tensor(a!) other, Tensor W, Tensor? B, int[] padding, int[] stride, int[] dilation, int groups, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor(a!) Y"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "mkldnn::_convolution_transpose_pointwise(Tensor X, Tensor W, Tensor? B, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, str attr, Scalar?[] scalars, str? algorithm) -> Tensor Y"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "mkldnn::_reorder_convolution_transpose_weight(Tensor self, int[2] padding=0, int[2] output_padding=0, int[2] stride=1, int[2] dilation=1, int groups=1, int[]? input_size=None) -> Tensor Y"));
 }
 
 TORCH_LIBRARY(mkldnn_prepacked, m) {
diff --git a/aten/src/ATen/native/mkldnn/Utils.cpp b/aten/src/ATen/native/mkldnn/Utils.cpp
index 2c9bcc016e47..fec311e5c578 100644
--- a/aten/src/ATen/native/mkldnn/Utils.cpp
+++ b/aten/src/ATen/native/mkldnn/Utils.cpp
@@ -133,6 +133,17 @@ AttrFunction attr_func_gelu = [](torch::List<c10::optional<at::Scalar>> scalars,
   return ideep::attr_t::fuse_gelu(1.0, 0.f, 0.f, gelu_type);
 };
 
+AttrFunction attr_func_hardsigmoid =
+    [](torch::List<c10::optional<at::Scalar>> scalars,
+       c10::optional<c10::string_view> algorithm) {
+      ideep::attr_t attr;
+      ideep::post_ops po;
+      po.append_eltwise(
+          1.0f, ideep::algorithm::eltwise_hardsigmoid, 1.0f / 6.0f, 0.5f);
+      attr.set_post_ops(po);
+      return attr;
+    };
+
 const std::map<c10::string_view, AttrFunction>& fusion_unary_attr_map() {
   static const std::map<c10::string_view, AttrFunction> fusion_attr_map{
       {"relu", ATTR_FUNC(relu)},
@@ -140,6 +151,7 @@ const std::map<c10::string_view, AttrFunction>& fusion_unary_attr_map() {
       {"tanh", ATTR_FUNC(tanh)},
       {"swish", ATTR_FUNC(swish)},
       {"hardswish", ATTR_FUNC(hardswish)},
+      {"hardsigmoid", attr_func_hardsigmoid},
       {"leaky_relu", attr_func_leaky_relu},
       {"hardtanh", attr_func_hardtanh},
       {"gelu", attr_func_gelu},
diff --git a/aten/src/ATen/native/mkldnn/Utils.h b/aten/src/ATen/native/mkldnn/Utils.h
index a25be13c46da..b492d2f8aacc 100644
--- a/aten/src/ATen/native/mkldnn/Utils.h
+++ b/aten/src/ATen/native/mkldnn/Utils.h
@@ -33,6 +33,27 @@ void check_mkldnn_binary_fusion_inputs(
     const Tensor& weight,
     const Tensor& bias);
 
+static inline std::vector<int64_t> padding_r(
+    IntArrayRef padding, IntArrayRef output_padding)
+{
+  // ConvTranpose padding adjustment
+  //
+  // PyTorch uses padding/output_padding:
+  //   osize = (isize - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1
+  //
+  // MKLDNN uses padding_l/padding_r:
+  //   osize = (isize - 1) * stride - padding_l - padding_r + dilation * (kernel_size - 1) + 1
+  //
+  // So: padding_l = padding, padding_r = padding - output_padding
+  //
+  auto dim = padding.size();
+  std::vector<int64_t> pad_r(dim);
+  for (const auto d : c10::irange(dim)) {
+    pad_r[d] = padding[d] - output_padding[d];
+  }
+  return pad_r;
+}
+
 #if AT_MKLDNN_ENABLED()
 
 using AttrFunction = std::function<ideep::attr_t(
diff --git a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
index 19434c00280f..cba9fb9fee64 100644
--- a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
+++ b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
@@ -26,10 +26,38 @@ typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode)
                                        axis:(NSInteger)axis
                                        name:(NSString * _Nullable)name;
 
+- (MPSGraphTensor * _Nonnull) sortWithTensor:(MPSGraphTensor * _Nonnull) tensor
+                               axis:(NSInteger) axis
+                         descending:(BOOL) descending
+                               name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) sortWithTensor:(MPSGraphTensor * _Nonnull) tensor
+                         axisTensor:(MPSGraphTensor * _Nonnull) axisTensor
+                         descending:(BOOL) descending
+                               name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) sortWithTensor:(MPSGraphTensor * _Nonnull) tensor
+                         axisTensor:(MPSGraphTensor * _Nonnull) axisTensor
+                               name:(NSString * _Nullable) name;
+
 - (MPSGraphTensor * _Nonnull)argSortWithTensor:(MPSGraphTensor * _Nonnull)tensor
                                           axis:(NSInteger)axis
                                           name:(NSString * _Nullable)name;
 
+- (MPSGraphTensor * _Nonnull) argSortWithTensor:(MPSGraphTensor * _Nonnull) tensor
+                                  axis:(NSInteger) axis
+                            descending:(BOOL) descending
+                                  name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) argSortWithTensor:(MPSGraphTensor * _Nonnull) tensor
+                           axisTensor:(MPSGraphTensor * _Nonnull) axisTensor
+                           descending:(BOOL) descending
+                                 name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) argSortWithTensor:(MPSGraphTensor * _Nonnull) tensor
+                           axisTensor:(MPSGraphTensor * _Nonnull) axisTensor
+                                 name:(NSString * _Nullable) name;
+
 - (MPSGraphTensor * _Nonnull)inverseOfTensor:(MPSGraphTensor * _Nonnull) inputTensor
                                         name:(NSString * _Nullable)name;
 
@@ -88,4 +116,26 @@ typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode)
                                              scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
                                                         layout:(MPSGraphTensorNamedDataLayout) layout
                                                           name:(NSString * _Nullable) name;
-@end
\ No newline at end of file
+
+- (MPSGraphTensor * _Nonnull) sampleGridWithSourceTensor:(MPSGraphTensor * _Nonnull) source
+                                        coordinateTensor:(MPSGraphTensor * _Nonnull) coordinates
+                                                  layout:(MPSGraphTensorNamedDataLayout) layout
+                                    normalizeCoordinates:(BOOL) normalizeCoordinates
+                                     relativeCoordinates:(BOOL) relativeCoordinates
+                                            alignCorners:(BOOL) alignCorners
+                                             paddingMode:(MPSGraphPaddingMode) paddingMode
+                                            samplingMode:(MPSGraphResizeMode) samplingMode
+                                           constantValue:(double) constantValue
+                                                    name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) sampleGridWithSourceTensor:(MPSGraphTensor * _Nonnull) source
+                                        coordinateTensor:(MPSGraphTensor * _Nonnull) coordinates
+                                                  layout:(MPSGraphTensorNamedDataLayout) layout
+                                    normalizeCoordinates:(BOOL) normalizeCoordinates
+                                     relativeCoordinates:(BOOL) relativeCoordinates
+                                            alignCorners:(BOOL) alignCorners
+                                             paddingMode:(MPSGraphPaddingMode) paddingMode
+                                     nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
+                                           constantValue:(double) constantValue
+                                                    name:(NSString * _Nullable) name;
+@end
diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 0f5c23a9ebb8..689d58f3c0cb 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -1,5 +1,7 @@
 //  Copyright © 2022 Apple Inc.
 
+#pragma once
+
 #include <ATen/ATen.h>
 #include <ATen/Tensor.h>
 #include <ATen/Utils.h>
@@ -52,7 +54,10 @@ std::string getArrayRefString(const IntArrayRef s);
 // use has_storage() on the returned tensor to determine if src actually is a view
 Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst);
 Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output);
+bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape);
 MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType);
+MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input);
+MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input);
 
 // The MPSShape could vary based on memory format
 MPSShape* getMPSShape(const Tensor& t, c10::MemoryFormat memory_format = MemoryFormat::Contiguous);
@@ -175,7 +180,7 @@ struct MPSGraphCache
   MPSGraphCache(const MPSGraphCache&) = delete;
   void operator=(const MPSGraphCache&) = delete;
 
-  MPSCachedGraph* CreateCachedGraph(const std::string& key, CreateCachedGraphBlock createCacheBlock, void* view_ptr = nullptr) {
+  MPSCachedGraph* CreateCachedGraph(const std::string& key, CreateCachedGraphBlock createCacheBlock) {
 
     __block MPSCachedGraph * result = nil;
 
@@ -193,17 +198,14 @@ struct MPSGraphCache
         result = createCacheBlock();
         CacheEntry entry(key, result);
         cache_.emplace(hash, entry);
-        if (view_ptr) {
-          views_list.insert(std::make_pair(view_ptr, hash));
-        }
       }
     });
     return result;
   }
 
   template<typename T>
-  inline T* CreateCachedGraphAs(const std::string& key, CreateCachedGraphBlock createCacheBlock, void* view_ptr = nullptr) {
-    return static_cast<T *>(CreateCachedGraph(key, createCacheBlock, view_ptr));
+  inline T* CreateCachedGraphAs(const std::string& key, CreateCachedGraphBlock createCacheBlock) {
+    return static_cast<T *>(CreateCachedGraph(key, createCacheBlock));
   }
 
   MPSCachedGraph* LookUp(const std::string& key) const {
@@ -228,24 +230,6 @@ struct MPSGraphCache
     return static_cast<T *>(LookUp(key));
   }
 
-  void FindAndRemoveViewEntry(void* ptr) {
-    // this may find multiple view entries with the same buffer pointers
-    auto views_range = views_list.equal_range(ptr);
-    if (views_range.first == views_range.second)
-      return;
-    for (auto view_it = views_range.first; view_it != views_range.second; ++view_it) {
-      MPSCacheKey hash = view_it->second;
-      // find the cache entry associated with the hash
-      auto cache_it = cache_.find(hash);
-      if (cache_it != cache_.end()) {
-        cache_.erase(cache_it);
-        delete cache_it->second.cachedGraph_;
-      }
-    }
-    // this erase-by-key will remove all pairs in the list with the same key
-    views_list.erase(ptr);
-  }
-
  private:
   MPSGraphCache() {
     serialQueue_ = dispatch_queue_create("cache queue", DISPATCH_QUEUE_SERIAL);
@@ -253,13 +237,13 @@ struct MPSGraphCache
 
   static MPSGraphCache* _instance_cache;
   std::unordered_map<MPSCacheKey, CacheEntry> cache_;
-  // list of buffers associated with view entries in the cache
-  // note that multiple view cache entries could use the same buffer pointer
-  std::unordered_multimap<void*, MPSCacheKey> views_list;
   dispatch_queue_t serialQueue_ = nullptr;
 
 };
 
+// Common math operations
+MPSGraphTensor* log1p(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor);
+
 
 } // namespace mps
 } // namespace native
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index 530c14e74485..c5e8b5d1fc17 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -1,7 +1,7 @@
 //  Copyright © 2022 Apple Inc.
 
 #include <ATen/native/mps/OperationUtils.h>
-#include <ATen/mps/MPSAllocator.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
 
 namespace at::native::mps {
 
@@ -35,6 +35,37 @@ MPSDataType getMPSDataType(ScalarType scalar_type) {
   }
 }
 
+// #issue 104398441 sortWithTensor and argsortWithTensor has support of
+// Int32, Half and Float32 types. These utilities are to help cast to these
+// types.
+MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input) {
+  MPSDataType dataType = getMPSDataType(input.scalar_type());
+  if (dataType != MPSDataTypeInt32 &&
+      dataType != MPSDataTypeFloat32 &&
+      dataType != MPSDataTypeFloat16) {
+      dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
+      return [mpsGraph castTensor:inputTensor
+                          toType:dataType
+                          name:@"castInputTensor"];
+  }
+  return inputTensor;
+}
+
+// #issue 104398441 sortWithTensor and argsortWithTensor has support of
+// Int32, Half and Float32 types. These utilities are to help cast from these
+// types.
+MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input) {
+  MPSDataType dataType = getMPSDataType(input.scalar_type());
+  if (dataType != MPSDataTypeInt32 &&
+      dataType != MPSDataTypeFloat32 &&
+      dataType != MPSDataTypeFloat16) {
+      inputTensor = [mpsGraph castTensor:inputTensor
+                              toType:dataType
+                                name:@"castInputTensor"];
+  }
+  return inputTensor;
+}
+
 MPSDataType getMPSScalarType(ScalarType scalar_type) {
   switch (scalar_type) {
     // This is an intentional fallthrough supporting Double for Scalar
@@ -232,8 +263,9 @@ void printTensorNDArray(const Tensor& t) {
   TORCH_CHECK(src.is_mps(), "Placeholder storage has not been allocated on MPS device!");
   // extract the pointer to MTLBuffer from the Tensor's storage
   id<MTLBuffer> srcBuf = getMTLBufferStorage(src);
+  bool sliceViewTensor = canSliceViewTensor(src, mpsShape);
   // a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose())
-  if (!src.is_contiguous() && gatherTensorData) {
+  if ((!src.is_contiguous() || (src.storage_offset() && !sliceViewTensor)) && gatherTensorData) {
      Tensor emptyShell = Tensor();
     // use "_tensor" from Placeholder to retain view's output during its usage in other ops
     _tensor = gatherViewTensor(src, emptyShell);
@@ -252,7 +284,7 @@ void printTensorNDArray(const Tensor& t) {
   const MPSDataType mpsDataType = dataType != MPSDataTypeInvalid ? dataType :
                       _tensor.dim() == 0 ? getMPSScalarType(_tensor.scalar_type()) : getMPSDataType(_tensor.scalar_type());
 
-  if (src.is_view() && src.is_contiguous() && src.storage_offset()) {
+  if (src.is_contiguous() && src.storage_offset() && sliceViewTensor) {
     _value = getMPSGraphTensorDataForView(src, mpsShape, mpsDataType);
   } else {
     if (!mpsShape) {
@@ -313,7 +345,7 @@ MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type) {
   MPSGraphTensorData *result = nullptr;
   // Scalar pools are only supported on devices with unified memory
   if (mpsStream->device().hasUnifiedMemory) {
-    scalar.buffer = at::mps::allocate_scalar_buffer(&scalar.value, scalar.size);
+    scalar.buffer = getIMPSAllocator()->allocScalarBufferWithValue(&scalar.value, scalar.size);
     result = [[[MPSGraphTensorData alloc] initWithMTLBuffer: scalar.getMTLBuffer()
                                                       shape: @[@1]
                                                    dataType: getMPSScalarType(scalar.type)] autorelease];
@@ -332,7 +364,6 @@ void resize_tensor(Tensor* output) {
 
 MPSGraph* make_mps_graph() {
   MPSGraph* mpsGraph = [[MPSGraph new] autorelease];
-  mpsGraph.options = MPSGraphOptionsNone;
   return mpsGraph;
 }
 
diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm
index 2ebee9c40f8a..568c42909e79 100644
--- a/aten/src/ATen/native/mps/operations/Activation.mm
+++ b/aten/src/ATen/native/mps/operations/Activation.mm
@@ -18,14 +18,15 @@
 Tensor relu_mps(const Tensor& self) {
   using namespace mps;
   using CachedGraph = MPSUnaryCachedGraph;
-  Tensor output = at::empty_like(self);
-  resize_tensor(&output);
-  TORCH_CHECK(output.is_mps());
-
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
   MPSStream* stream = getCurrentMPSStream();
 
+  bool executeGatherOp = !(self.is_contiguous(MemoryFormat::Contiguous) ||
+                           self.is_contiguous(MemoryFormat::ChannelsLast) ||
+                           self.is_contiguous(MemoryFormat::ChannelsLast3d));
+  Tensor output = at::empty_like(self, executeGatherOp ? MemoryFormat::Contiguous : MemoryFormat::Preserve);
+
   @autoreleasepool {
     string key = "relu" + getTensorsStringKey({self});
     CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
@@ -51,8 +52,8 @@ Tensor relu_mps(const Tensor& self) {
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensor_, self);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+    Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, nil, false);
 
     // Create dictionary of inputs and outputs
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
@@ -75,7 +76,13 @@ Tensor relu_mps(const Tensor& self) {
   using CachedGraph = MPSUnaryCachedGraph;
   // Inplace relu
   Tensor &output = self;
-  TORCH_CHECK(output.is_mps());
+  bool executeGatherOp = !(self.is_contiguous(MemoryFormat::Contiguous) ||
+                           self.is_contiguous(MemoryFormat::ChannelsLast) ||
+                           self.is_contiguous(MemoryFormat::ChannelsLast3d));
+  Tensor out;
+  if (executeGatherOp) {
+    out = at::empty_like(self, MemoryFormat::Contiguous);
+  }
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
@@ -106,8 +113,8 @@ Tensor relu_mps(const Tensor& self) {
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensor_, self);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+    Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, executeGatherOp ? out : output, nil, false);
 
     // Create dictionary of inputs and outputs
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
@@ -119,7 +126,9 @@ Tensor relu_mps(const Tensor& self) {
     };
 
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-
+    if (executeGatherOp) {
+      output.copy_(out);
+    }
   }
 
   return output;
@@ -270,7 +279,6 @@ Tensor relu_mps(const Tensor& self) {
   }
 }
 
-
 TORCH_IMPL_FUNC(log_softmax_mps_out) (
   const Tensor &self,
   const int64_t dim,
@@ -302,11 +310,25 @@ Tensor relu_mps(const Tensor& self) {
 
           MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
 
-          MPSGraphTensor* softmaxTensor = [mpsGraph softMaxWithTensor:inputTensor
-                                                                 axis:dim
-                                                                 name:nil];
-          MPSGraphTensor* outputTensor = [mpsGraph logarithmWithTensor:softmaxTensor
-                                                                  name:nil];
+          MPSGraphTensor* maximumsTensor = [mpsGraph reductionMaximumWithTensor:inputTensor
+                                                                            axis:dim
+                                                                            name:nil];
+          MPSGraphTensor* inputTensorSubMax = [mpsGraph subtractionWithPrimaryTensor:inputTensor
+                                                                     secondaryTensor:maximumsTensor
+                                                                                name:nil];
+          MPSGraphTensor* exponentTensor = [mpsGraph exponentWithTensor:inputTensorSubMax
+                                                                   name:nil];
+
+          MPSGraphTensor* exponentTensorReduced = [mpsGraph reductionSumWithTensor:exponentTensor
+                                                                              axis:dim
+                                                                              name:nil];
+
+          MPSGraphTensor* logSumExpTensor = [mpsGraph logarithmWithTensor:exponentTensorReduced
+                                                                    name:nil];
+
+          MPSGraphTensor* outputTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensorSubMax
+                                                                       secondaryTensor:logSumExpTensor
+                                                                                  name:nil];
 
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->outputTensor_ = outputTensor;
@@ -413,6 +435,220 @@ Tensor relu_mps(const Tensor& self) {
 
 }
 
+std::tuple<Tensor&, Tensor&> log_sigmoid_forward_out_mps(const Tensor& self, Tensor& output, Tensor& buffer) {
+  // NOTE: buffer is only used by CPU dispatch, we just ignore it here
+  using namespace mps;
+  using CachedGraph = MPSUnaryCachedGraph;
+
+  if (self.numel() == 0) {
+    return std::forward_as_tuple(output, buffer);
+  }
+
+  output.resize_as_(self);
+
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+
+  MPSStream* stream = getCurrentMPSStream();
+
+  bool executeGatherOp = !(self.is_contiguous(MemoryFormat::Contiguous) ||
+                           self.is_contiguous(MemoryFormat::ChannelsLast) ||
+                           self.is_contiguous(MemoryFormat::ChannelsLast3d));
+  Tensor output_ = at::empty_like(self, executeGatherOp ? MemoryFormat::Contiguous : MemoryFormat::Preserve);
+
+  @autoreleasepool {
+
+    string key = "log_sigmoid_forward_out:" + getTensorsStringKey({self});
+    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+
+    if(!cachedGraph) {
+      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+
+        CachedGraph *newCachedGraph = nil;
+
+        @autoreleasepool {
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
+          MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
+                                                              shape:@[@1]
+                                                           dataType:inputTensor.dataType];
+          MPSGraphTensor* minTensor = [mpsGraph minimumWithPrimaryTensor:inputTensor
+                                                         secondaryTensor:zeroTensor
+                                                                    name:nil];
+          MPSGraphTensor* absInputTensor = [mpsGraph absoluteWithTensor:inputTensor
+                                                                   name:nil];
+          MPSGraphTensor* negAbsInputTensor = [mpsGraph negativeWithTensor:absInputTensor
+                                                                      name:nil];
+          MPSGraphTensor* expNegAbsInputTensor = [mpsGraph exponentWithTensor:negAbsInputTensor
+                                                                         name:nil];
+          MPSGraphTensor* outputTensor = at::native::mps::log1p(mpsGraph, expNegAbsInputTensor);
+          outputTensor = [mpsGraph subtractionWithPrimaryTensor:minTensor
+                                                secondaryTensor:outputTensor
+                                                           name:nil];
+
+          newCachedGraph->inputTensor_ = inputTensor;
+          newCachedGraph->outputTensor_ = outputTensor;
+        }
+        return newCachedGraph;
+      });
+      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+    }
+
+    Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, executeGatherOp ? output_ : output, nil, false);
+
+    // Create dictionary of inputs and outputs
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
+    };
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+    };
+
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+
+  if (executeGatherOp) {
+    output.copy_(output_);
+  }
+  return std::forward_as_tuple(output, buffer);
+}
+
+std::tuple<Tensor, Tensor> log_sigmoid_forward_mps(const Tensor& self) {
+  auto output = at::empty_like(self);
+  auto buffer = at::empty({0}, self.options());
+  log_sigmoid_forward_out_mps(self, output, buffer);
+  return std::make_tuple(output, buffer);
+}
+
+Tensor& log_sigmoid_backward_mps_out(const Tensor& grad_output,
+    const Tensor& self,
+    const Tensor& buffer,
+    Tensor& grad_input) {
+  // NOTE: buffer is only used by CPU dispatch, we just ignore it here
+  using namespace mps;
+
+  if (self.numel() == 0) {
+    return grad_input;
+  }
+
+  grad_input.resize_as_(self);
+
+  struct CachedGraph : public MPSCachedGraph
+  {
+    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+    MPSGraphTensor* inputTensor_ = nil;
+    MPSGraphTensor* gradOutputTensor_ = nil;
+    MPSGraphTensor* gradInputTensor_ = nil;
+  };
+
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+
+  MPSStream* stream = getCurrentMPSStream();
+
+  bool executeGatherOp = !(self.is_contiguous(MemoryFormat::Contiguous) ||
+                           self.is_contiguous(MemoryFormat::ChannelsLast) ||
+                           self.is_contiguous(MemoryFormat::ChannelsLast3d));
+  Tensor grad_input_ = at::empty_like(self, executeGatherOp ? MemoryFormat::Contiguous : MemoryFormat::Preserve);
+
+  @autoreleasepool {
+
+    string key = "log_sigmoid_backward_out:" + getTensorsStringKey({self, grad_output});
+    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+
+    if(!cachedGraph) {
+      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+
+        CachedGraph *newCachedGraph = nil;
+
+        @autoreleasepool {
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
+          MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
+          MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
+                                                              shape:@[@1]
+                                                           dataType:inputTensor.dataType];
+          MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1.0
+                                                             shape:@[@1]
+                                                          dataType:inputTensor.dataType];
+          MPSGraphTensor* negOneTensor = [mpsGraph constantWithScalar:-1.0
+                                                                shape:@[@1]
+                                                             dataType:inputTensor.dataType];
+          MPSGraphTensor* inputNegPredicateTensor = [mpsGraph lessThanWithPrimaryTensor:inputTensor
+                                                                        secondaryTensor:zeroTensor
+                                                                                   name:nil];
+          MPSGraphTensor* maxDerivativeTensor = [mpsGraph selectWithPredicateTensor:inputNegPredicateTensor
+                                                                truePredicateTensor:oneTensor
+                                                               falsePredicateTensor:zeroTensor
+                                                                               name:nil];
+          MPSGraphTensor* signTensor = [mpsGraph selectWithPredicateTensor:inputNegPredicateTensor
+                                                       truePredicateTensor:oneTensor
+                                                      falsePredicateTensor:negOneTensor
+                                                                      name:nil];
+          MPSGraphTensor* absInputTensor = [mpsGraph absoluteWithTensor:inputTensor
+                                                                   name:nil];
+          MPSGraphTensor* negAbsInputTensor = [mpsGraph negativeWithTensor:absInputTensor
+                                                                      name:nil];
+          MPSGraphTensor* expNegAbsInputTensor = [mpsGraph exponentWithTensor:negAbsInputTensor
+                                                                         name:nil];
+          MPSGraphTensor* outputTensor = [mpsGraph additionWithPrimaryTensor:expNegAbsInputTensor
+                                                             secondaryTensor:oneTensor
+                                                                        name:nil];
+          outputTensor = [mpsGraph divisionWithPrimaryTensor:expNegAbsInputTensor
+                                             secondaryTensor:outputTensor
+                                                        name:nil];
+          outputTensor = [mpsGraph multiplicationWithPrimaryTensor:signTensor
+                                                   secondaryTensor:outputTensor
+                                                              name:nil];
+          outputTensor = [mpsGraph subtractionWithPrimaryTensor:maxDerivativeTensor
+                                                secondaryTensor:outputTensor
+                                                           name:nil];
+          outputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradOutputTensor
+                                                   secondaryTensor:outputTensor
+                                                              name:nil];
+
+          newCachedGraph->inputTensor_ = inputTensor;
+          newCachedGraph->gradOutputTensor_ = gradOutputTensor;
+          newCachedGraph->gradInputTensor_ = outputTensor;
+        }
+        return newCachedGraph;
+      });
+      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+    }
+
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp);
+    Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, executeGatherOp ? grad_input_ : grad_input, nil, false);
+
+    // Create dictionary of inputs and outputs
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
+      selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
+    };
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+    };
+
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+
+  if (executeGatherOp) {
+    grad_input.copy_(grad_input_);
+  }
+  return grad_input;
+}
+
+Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, const Tensor& buffer) {
+  auto grad_input = at::empty_like(grad_output);
+  log_sigmoid_backward_mps_out(grad_output, self, buffer, grad_input);
+  return grad_input;
+}
+
 TORCH_IMPL_FUNC(sigmoid_backward_out_mps)(
   const Tensor& grad_output,
   const Tensor& output,
@@ -420,6 +656,9 @@ Tensor relu_mps(const Tensor& self) {
   using namespace mps;
   TORCH_CHECK(grad_input.is_mps());
 
+  if (grad_output.numel() == 0) {
+    return;
+  }
   struct CachedGraph : public MPSCachedGraph
   {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
@@ -496,6 +735,9 @@ Tensor relu_mps(const Tensor& self) {
   using namespace mps;
   TORCH_CHECK(grad_input.is_mps());
 
+  if (grad_output.numel() == 0) {
+    return;
+  }
   struct CachedGraph : public MPSCachedGraph
   {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
@@ -747,6 +989,50 @@ Tensor relu_mps(const Tensor& self) {
     return  erfTensor;
 }
 
+MPSGraphTensor* tanh (MPSGraph* mpsGraph, MPSGraphTensor *inputTensor) {
+    // 0.5 * x * (1 + text{Tanh}(sqrt(2 / pi) * (x + 0.044715 * x^3)))
+    auto dataType = [inputTensor dataType];
+    constexpr float kBeta =  M_SQRT2 * M_2_SQRTPI * 0.5;
+    constexpr float kKappa = 0.044715f;
+    MPSGraphTensor *betaf = [mpsGraph constantWithScalar: kBeta
+                                                   shape: @[@1]
+                                                dataType: dataType];
+    MPSGraphTensor *kappaf = [mpsGraph constantWithScalar: kKappa
+                                                    shape: @[@1]
+                                                 dataType: dataType];
+    MPSGraphTensor *onef = [mpsGraph constantWithScalar: 1.0f
+                                                  shape: @[@1]
+                                              dataType: dataType];
+    MPSGraphTensor *halff = [mpsGraph constantWithScalar: 0.5f
+                                                    shape: @[@1]
+                                                dataType: dataType];
+    MPSGraphTensor *erfTensor = [mpsGraph multiplicationWithPrimaryTensor: inputTensor
+                                                          secondaryTensor: inputTensor
+                                                                    name : nil];
+    erfTensor = [mpsGraph multiplicationWithPrimaryTensor: erfTensor
+                                          secondaryTensor: inputTensor
+                                                    name : nil];
+    erfTensor = [mpsGraph multiplicationWithPrimaryTensor: erfTensor
+                                          secondaryTensor: kappaf
+                                                    name : nil];
+    erfTensor = [mpsGraph additionWithPrimaryTensor: erfTensor
+                                    secondaryTensor: inputTensor
+                                              name : nil];
+    erfTensor = [mpsGraph multiplicationWithPrimaryTensor: erfTensor
+                                          secondaryTensor: betaf
+                                                    name : nil];
+    erfTensor = [mpsGraph tanhWithTensor: erfTensor
+                                   name : nil];
+    erfTensor = [mpsGraph additionWithPrimaryTensor: erfTensor
+                                    secondaryTensor: onef
+                                              name : nil];
+    erfTensor = [mpsGraph multiplicationWithPrimaryTensor: erfTensor
+                                          secondaryTensor: halff
+                                                    name : nil];
+
+    return  erfTensor;
+}
+
 TORCH_IMPL_FUNC(gelu_out_mps) (
     const Tensor& self, c10::string_view approximate, const Tensor& output
   ) {
@@ -770,7 +1056,7 @@ Tensor relu_mps(const Tensor& self) {
   MPSStream* stream = getCurrentMPSStream();
 
   @autoreleasepool {
-    string key = "gelu_out_mps" + getTensorsStringKey({self});
+    string key = "gelu_out_mps" + getTensorsStringKey({self}) + ":" + c10::str(approximate);
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
     if(!cachedGraph) {
       MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
@@ -785,7 +1071,12 @@ Tensor relu_mps(const Tensor& self) {
                                                                   getMPSDataType(self.scalar_type()),
                                                                   getMPSShape(self));
 
-          MPSGraphTensor* outputTensor = normcdf(mpsGraph, inputTensor);
+          MPSGraphTensor* outputTensor = nil;
+          if(approximate == "tanh") {
+            outputTensor = tanh(mpsGraph, inputTensor);
+          } else {
+            outputTensor = normcdf(mpsGraph, inputTensor);
+          }
           outputTensor = [mpsGraph multiplicationWithPrimaryTensor:outputTensor
                                                    secondaryTensor:inputTensor
                                                               name:nil];
@@ -818,7 +1109,6 @@ Tensor relu_mps(const Tensor& self) {
     const Tensor& grad, const Tensor& self, c10::string_view approximate, const Tensor& grad_input
   ) {
   using namespace mps;
-  constexpr float kBeta = M_2_SQRTPI * M_SQRT1_2 * (0.5);
 
   // Empty output
   if(grad_input.numel() == 0)
@@ -837,7 +1127,7 @@ Tensor relu_mps(const Tensor& self) {
   MPSStream* stream = getCurrentMPSStream();
 
   @autoreleasepool {
-    string key = "gelu_backward_out_mps" + getTensorsStringKey({self, grad});
+    string key = "gelu_backward_out_mps" + getTensorsStringKey({self, grad}) + ":" + c10::str(approximate);
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
     if(!cachedGraph) {
       MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
@@ -855,32 +1145,110 @@ Tensor relu_mps(const Tensor& self) {
           MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph,
                                                                   dataType,
                                                                   getMPSShape(self));
-          MPSGraphTensor* cdf = normcdf(mpsGraph, inputTensor);
-          MPSGraphTensor *halff = [mpsGraph constantWithScalar: -0.5f
-                                                    shape: @[@1]
-                                                dataType: dataType];
-          MPSGraphTensor *betaf = [mpsGraph constantWithScalar :kBeta
-                                                    shape :@[@1]
-                                                dataType:dataType];
-          MPSGraphTensor *pdfMul = [mpsGraph squareWithTensor : inputTensor
-                                                    name : nil];
-          pdfMul = [mpsGraph multiplicationWithPrimaryTensor : pdfMul
-                                          secondaryTensor : halff
-                                                    name : nil];
-          pdfMul = [mpsGraph exponentWithTensor : pdfMul
-                                        name  : nil];
-          MPSGraphTensor* pdf = [mpsGraph multiplicationWithPrimaryTensor : pdfMul
-                                                        secondaryTensor  : betaf
-                                                                  name : nil];
-          pdf = [mpsGraph multiplicationWithPrimaryTensor : inputTensor
-                                          secondaryTensor : pdf
-                                            name : nil];
-          pdf = [mpsGraph additionWithPrimaryTensor : pdf
-                                  secondaryTensor : cdf
-                                      name : nil];
-          MPSGraphTensor* outputTensor = [mpsGraph multiplicationWithPrimaryTensor : gradTensor
-                                                                   secondaryTensor : pdf
-                                                                              name : nil];
+          MPSGraphTensor* outputTensor = nil;
+          if(approximate == "tanh") {
+            constexpr float kBeta = M_SQRT2 * M_2_SQRTPI * (0.5f);
+            constexpr float kKappa = 0.044715f;
+            MPSGraphTensor *betaf = [mpsGraph constantWithScalar: kBeta
+                                                           shape: @[@1]
+                                                        dataType: dataType];
+            MPSGraphTensor *kappaf = [mpsGraph constantWithScalar: kKappa
+                                                            shape: @[@1]
+                                                         dataType: dataType];
+            MPSGraphTensor *halff = [mpsGraph constantWithScalar: 0.5f
+                                                           shape: @[@1]
+                                                        dataType: dataType];
+            MPSGraphTensor *onef = [mpsGraph constantWithScalar: 1.0f
+                                                          shape: @[@1]
+                                                       dataType: dataType];
+            MPSGraphTensor *threef = [mpsGraph constantWithScalar: 3.0f
+                                                            shape: @[@1]
+                                                         dataType: dataType];
+            MPSGraphTensor* x_sq = [mpsGraph multiplicationWithPrimaryTensor: inputTensor
+                                                             secondaryTensor: inputTensor
+                                                                        name: nil];
+            MPSGraphTensor *x_cube =  [mpsGraph multiplicationWithPrimaryTensor: x_sq
+                                                                secondaryTensor: inputTensor
+                                                                           name: nil];
+            MPSGraphTensor *inner = [mpsGraph multiplicationWithPrimaryTensor: kappaf
+                                                              secondaryTensor: x_cube
+                                                                         name: nil];
+            inner = [mpsGraph additionWithPrimaryTensor: inner
+                                        secondaryTensor: inputTensor
+                                                   name: nil];
+            inner = [mpsGraph multiplicationWithPrimaryTensor: betaf
+                                              secondaryTensor: inner
+                                                         name: nil];
+            MPSGraphTensor *tanhInner = [mpsGraph tanhWithTensor: inner
+                                                            name: nil];
+            MPSGraphTensor *left = [mpsGraph multiplicationWithPrimaryTensor: halff
+                                                             secondaryTensor: inputTensor
+                                                                        name: nil];
+            MPSGraphTensor *right = [mpsGraph additionWithPrimaryTensor: onef
+                                                        secondaryTensor: tanhInner
+                                                                   name: nil];
+            MPSGraphTensor *left_derivative = [mpsGraph multiplicationWithPrimaryTensor: halff
+                                                                        secondaryTensor: right
+                                                                                   name: nil];
+            MPSGraphTensor *tanh_derivative = [mpsGraph multiplicationWithPrimaryTensor: tanhInner
+                                                                        secondaryTensor: tanhInner
+                                                                                   name: nil];
+            tanh_derivative = [mpsGraph subtractionWithPrimaryTensor: onef
+                                                     secondaryTensor: tanh_derivative
+                                                                name: nil];
+            MPSGraphTensor *inner_derivative = [mpsGraph multiplicationWithPrimaryTensor: threef
+                                                                         secondaryTensor: kappaf
+                                                                                    name: nil];
+            inner_derivative = [mpsGraph multiplicationWithPrimaryTensor: inner_derivative
+                                                         secondaryTensor: x_sq
+                                                                    name: nil];
+            inner_derivative = [mpsGraph additionWithPrimaryTensor: inner_derivative
+                                                   secondaryTensor: onef
+                                                              name: nil];
+            inner_derivative = [mpsGraph multiplicationWithPrimaryTensor: betaf
+                                                         secondaryTensor: inner_derivative
+                                                                    name: nil];
+            MPSGraphTensor *right_derivative = [mpsGraph multiplicationWithPrimaryTensor: left
+                                                                         secondaryTensor: tanh_derivative
+                                                                                    name: nil];
+            right_derivative = [mpsGraph multiplicationWithPrimaryTensor: right_derivative
+                                                         secondaryTensor: inner_derivative
+                                                                    name: nil];
+            outputTensor = [mpsGraph additionWithPrimaryTensor: left_derivative
+                                               secondaryTensor: right_derivative
+                                                          name: nil];
+            outputTensor = [mpsGraph multiplicationWithPrimaryTensor: gradTensor
+                                                     secondaryTensor: outputTensor
+                                                                name: nil];
+          } else {
+            constexpr float kBeta = M_2_SQRTPI * M_SQRT1_2 * (0.5);
+            MPSGraphTensor *halff = [mpsGraph constantWithScalar: -0.5f
+                                                           shape: @[@1]
+                                                        dataType: dataType];
+            MPSGraphTensor *betaf = [mpsGraph constantWithScalar: kBeta
+                                                           shape: @[@1]
+                                                        dataType: dataType];
+            MPSGraphTensor* cdf = normcdf(mpsGraph, inputTensor);
+            MPSGraphTensor *pdfMul = [mpsGraph squareWithTensor: inputTensor
+                                                           name: nil];
+            pdfMul = [mpsGraph multiplicationWithPrimaryTensor: pdfMul
+                                               secondaryTensor: halff
+                                                          name: nil];
+            pdfMul = [mpsGraph exponentWithTensor: pdfMul
+                                             name: nil];
+            MPSGraphTensor* pdf = [mpsGraph multiplicationWithPrimaryTensor: pdfMul
+                                                            secondaryTensor: betaf
+                                                                       name: nil];
+            pdf = [mpsGraph multiplicationWithPrimaryTensor: inputTensor
+                                            secondaryTensor: pdf
+                                                       name: nil];
+            pdf = [mpsGraph additionWithPrimaryTensor: pdf
+                                      secondaryTensor: cdf
+                                                 name: nil];
+            outputTensor = [mpsGraph multiplicationWithPrimaryTensor: gradTensor
+                                                     secondaryTensor: pdf
+                                                                name: nil];
+          }
 
           newCachedGraph->gradTensor_ = gradTensor;
           newCachedGraph->inputTensor_ = inputTensor;
@@ -920,11 +1288,17 @@ void elu_variants_out_mps (
   string func_name) {
 
   using namespace mps;
-  TORCH_CHECK(self.is_mps());
+  auto resultMemFormat = result.suggest_memory_format();
+  bool executeGatherOp = !(self.is_contiguous(resultMemFormat) && result.is_contiguous(resultMemFormat));
+  Tensor out;
+  if (executeGatherOp && resultMemFormat == MemoryFormat::ChannelsLast) {
+    out = at::empty_like(result, MemoryFormat::Contiguous);
+  }
 
   // Empty output
-  if(result.numel() == 0)
+  if(result.numel() == 0) {
     return;
+  }
 
   struct CachedGraph : public MPSCachedGraph
   {
@@ -1005,8 +1379,8 @@ void elu_variants_out_mps (
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result);
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out.has_storage() ? out : result, nil, false);
 
     // Create dictionary of inputs and outputs
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
@@ -1018,8 +1392,10 @@ void elu_variants_out_mps (
     };
 
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    if (out.has_storage()) {
+      result.copy_(out);
+    }
   }
-
 }
 
 // scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) ))
@@ -1042,20 +1418,24 @@ void elu_variants_out_mps (
   const Tensor& self_or_result,
   const Tensor& grad_input
 ) {
-
   using namespace mps;
-  TORCH_CHECK(grad_output.is_mps());
+  auto gradMemFormat = grad_input.suggest_memory_format();
+  bool executeGatherOp = !(grad_output.is_contiguous(gradMemFormat) && self_or_result.is_contiguous(gradMemFormat) && grad_input.is_contiguous(gradMemFormat));
+  Tensor out;
+  if (executeGatherOp && gradMemFormat == MemoryFormat::ChannelsLast) {
+    out = at::empty_like(grad_input, MemoryFormat::Contiguous);
+  }
 
   // Empty output
-  if(grad_input.numel() == 0)
+  if(grad_input.numel() == 0) {
     return;
+  }
 
   struct CachedGraph : public MPSCachedGraph
   {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
     MPSGraphTensor *gradOutputTensor_ = nil;
-    MPSGraphTensor *inputTensor_ = nil;
-    MPSGraphTensor *resultTensor_ = nil;
+    MPSGraphTensor *selfOrResultTensor_ = nil;
     MPSGraphTensor *gradInputTensor_ = nil;
   };
 
@@ -1064,7 +1444,7 @@ void elu_variants_out_mps (
   MPSStream* stream = getCurrentMPSStream();
 
   @autoreleasepool {
-    string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output}) + ":" +
+    string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output, self_or_result}) + ":" +
                                                  to_string(alpha.to<double>()) + ":" +
                                                  to_string(scale.to<double>()) + ":" +
                                                  to_string(input_scale.to<double>()) + ":" +
@@ -1081,18 +1461,14 @@ void elu_variants_out_mps (
           newCachedGraph = new CachedGraph(mpsGraph);
 
           MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
-
-          MPSGraphTensor* inputTensor = nil;
-          MPSGraphTensor* resultTensor = nil;
-
+          MPSGraphTensor* selfOrResultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
           MPSGraphTensor* lessThanZeroGradTensor = nil;
 
           if(is_result) {
-            resultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
             MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
                                                                shape:@[@1]
                                                             dataType:getMPSDataType(grad_output.scalar_type())];
-            MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:resultTensor
+            MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:selfOrResultTensor
                                                                         secondaryTensor:alphaTensor
                                                                                    name:nil];
             auto constMul = scale.to<double>() * input_scale.to<double>();
@@ -1104,11 +1480,10 @@ void elu_variants_out_mps (
                                                                           name:nil];
           }
           else {
-            inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
             MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
                                                                     shape:@[@1]
                                                                  dataType:getMPSDataType(grad_output.scalar_type())];
-            MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor
+            MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:selfOrResultTensor
                                                                           secondaryTensor:inputScaleTensor
                                                                                      name:nil];
             MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor
@@ -1128,7 +1503,7 @@ void elu_variants_out_mps (
           MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f
                                                               shape:@[@1]
                                                            dataType:getMPSDataType(grad_output.scalar_type())];
-          MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor
+          MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:selfOrResultTensor
                                                                    secondaryTensor:zeroTensor
                                                                               name:nil];
           MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor
@@ -1140,8 +1515,7 @@ void elu_variants_out_mps (
                                                                                  name:nil];
 
           newCachedGraph->gradOutputTensor_ = gradOutputTensor;
-          newCachedGraph->inputTensor_ = inputTensor;
-          newCachedGraph->resultTensor_ = resultTensor;
+          newCachedGraph->selfOrResultTensor_ = selfOrResultTensor;
           newCachedGraph->gradInputTensor_ = gradInputTensor;
         }
         return newCachedGraph;
@@ -1149,36 +1523,24 @@ void elu_variants_out_mps (
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output);
-    Placeholder selfPlaceholder = Placeholder();
-    Placeholder resultPlaceholder = Placeholder();
-    if(is_result)
-      resultPlaceholder = Placeholder(cachedGraph->resultTensor_, self_or_result);
-    else
-      selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result);
-    Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input);
+    Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output, nil, executeGatherOp);
+    Placeholder selfOrResultPlaceholder = Placeholder(cachedGraph->selfOrResultTensor_, self_or_result, nil, executeGatherOp);
+    Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? out : grad_input, nil, false);
 
     // Create dictionary of inputs and outputs
-    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = nil;
-
-    if(is_result)
-      feeds = @{
-        gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
-        resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData()
-      };
-    else
-      feeds = @{
-        gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
-        selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
-      };
-
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
+      selfOrResultPlaceholder.getMPSGraphTensor() : selfOrResultPlaceholder.getMPSGraphTensorData()
+    };
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
       gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
     };
 
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    if (out.has_storage()) {
+      grad_input.copy_(out);
+    }
   }
-
 }
 
 TORCH_IMPL_FUNC(glu_out_mps) (
@@ -1258,7 +1620,6 @@ void elu_variants_out_mps (
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
 
   }
-
 }
 
 Tensor& glu_backward_mps_out (
@@ -1419,7 +1780,8 @@ Tensor glu_backward_mps (const Tensor& grad_output,
       MPSScalar threshold_scalar = getMPSScalar(threshold, ScalarType::Float);
 
       @autoreleasepool {
-        string key = "softplus_out_mps:" + getTensorsStringKey({self});
+        string key = "softplus_out_mps:" + getTensorsStringKey({self}) + ":" +
+                      std::to_string(beta.to<double>()) + ":" + std::to_string(threshold.to<double>());
 
         CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
         if(!cachedGraph) {
@@ -1438,9 +1800,6 @@ Tensor glu_backward_mps (const Tensor& grad_output,
 
               MPSGraphTensor* reluTensor = [mpsGraph reLUWithTensor:inputTensor
                                                                name:nil];
-              MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0
-                                                                  shape:@[@1]
-                                                               dataType:getMPSDataType(self.scalar_type())];
 
               MPSGraphTensor* reciprocalBetaTensor = [mpsGraph reciprocalWithTensor:betaTensor
                                                                              name:nil];
@@ -1452,14 +1811,8 @@ Tensor glu_backward_mps (const Tensor& grad_output,
                                                                                   name:nil];
               MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:bxTensor
                                                                   name:nil];
-              MPSGraphTensor* expPlusOneTensor = [mpsGraph additionWithPrimaryTensor:expTensor
-                                                                     secondaryTensor:unitTensor
-                                                                                name:nil];
-
-              MPSGraphTensor* logTensor = [mpsGraph logarithmWithTensor:expPlusOneTensor
-                                                                   name:nil];
-
-              MPSGraphTensor* softplusTensor = [mpsGraph multiplicationWithPrimaryTensor:logTensor
+              MPSGraphTensor* log1pTensor = at::native::mps::log1p(mpsGraph, expTensor);
+              MPSGraphTensor* softplusTensor = [mpsGraph multiplicationWithPrimaryTensor:log1pTensor
                                                                        secondaryTensor:reciprocalBetaTensor
                                                                             name:nil];
               MPSGraphTensor* outputTensor = [mpsGraph selectWithPredicateTensor:predicateTensor
@@ -1524,7 +1877,8 @@ Tensor glu_backward_mps (const Tensor& grad_output,
       MPSStream* stream = getCurrentMPSStream();
 
       @autoreleasepool {
-        string key = "softplus_backward_out_mps:" + getTensorsStringKey({grad_output, self});
+        string key = "softplus_backward_out_mps:" + getTensorsStringKey({grad_output, self}) + ":" +
+                      std::to_string(beta.to<double>()) + ":" + std::to_string(threshold.to<double>());
 
         CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
         if(!cachedGraph) {
@@ -1683,7 +2037,10 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) {
     using namespace mps;
 
     Tensor grad_input = at::empty_like(self, self.suggest_memory_format());
-    Tensor weight_grad = at::empty_like(weight_, at::MemoryFormat::Contiguous);
+    Tensor weight_grad = at::empty_like(self, at::MemoryFormat::Contiguous);
+    if (grad_output.numel() == 0) {
+      return std::tuple<Tensor, Tensor>{grad_input, weight_grad};
+    }
 
     struct CachedGraph : public MPSCachedGraph
     {
@@ -1941,6 +2298,177 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) {
 
 }
 
+
+TORCH_IMPL_FUNC(hardsigmoid_out_mps) (const Tensor& self, const Tensor& result) {
+  using namespace mps;
+  TORCH_CHECK(self.is_mps());
+
+  // Empty output
+  if(result.numel() == 0)
+    return;
+
+  struct CachedGraph : public MPSCachedGraph
+  {
+    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+    MPSGraphTensor *inputTensor_ = nil;
+    MPSGraphTensor *outputTensor_ = nil;
+  };
+
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+
+  MPSStream* stream = getCurrentMPSStream();
+
+  @autoreleasepool {
+    string key = "hardsigmoid_out_mps:" + getTensorsStringKey({self});
+
+    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    if(!cachedGraph) {
+      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+
+        CachedGraph *newCachedGraph = nil;
+
+        @autoreleasepool {
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
+          MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
+                                                              shape:@[@1]
+                                                           dataType:getMPSDataType(self.scalar_type())];
+          MPSGraphTensor* threeTensor = [mpsGraph constantWithScalar:3.0
+                                                               shape:@[@1]
+                                                            dataType:getMPSDataType(self.scalar_type())];
+          MPSGraphTensor* sixTensor = [mpsGraph constantWithScalar:6.0
+                                                             shape:@[@1]
+                                                          dataType:getMPSDataType(self.scalar_type())];
+          MPSGraphTensor* inputPlusThreeTensor = [mpsGraph additionWithPrimaryTensor:inputTensor
+                                                                     secondaryTensor:threeTensor
+                                                                                name:nil];
+
+          MPSGraphTensor* outputTensor = [mpsGraph clampWithTensor:inputPlusThreeTensor
+                                                    minValueTensor:zeroTensor
+                                                    maxValueTensor:sixTensor
+                                                              name:nil];
+          outputTensor = [mpsGraph divisionWithPrimaryTensor:outputTensor
+                                             secondaryTensor:sixTensor
+                                                        name:nil];
+          newCachedGraph->inputTensor_ = inputTensor;
+          newCachedGraph->outputTensor_ = outputTensor;
+        }
+        return newCachedGraph;
+      });
+      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+    }
+
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result);
+
+    // Create dictionary of inputs and outputs
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
+    };
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+    };
+
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+}
+
+TORCH_IMPL_FUNC(hardsigmoid_backward_out_mps) (
+  const Tensor& grad_output, const Tensor& self, const Tensor& grad_input
+) {
+  using namespace mps;
+  TORCH_CHECK(self.is_mps());
+
+  // Empty output
+  if(grad_input.numel() == 0)
+    return;
+
+  struct CachedGraph : public MPSCachedGraph
+  {
+    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+    MPSGraphTensor *gradOutputTensor_ = nil;
+    MPSGraphTensor *inputTensor_ = nil;
+    MPSGraphTensor *gradInputTensor_ = nil;
+  };
+
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+
+  MPSStream* stream = getCurrentMPSStream();
+
+  @autoreleasepool {
+    string key = "hardsigmoid_backward_out_mps:" + getTensorsStringKey({self});
+
+    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    if(!cachedGraph) {
+      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+
+        CachedGraph *newCachedGraph = nil;
+
+        @autoreleasepool {
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+
+          MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
+          MPSGraphTensor *gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
+          MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
+                                                              shape:@[@1]
+                                                           dataType:getMPSDataType(self.scalar_type())];
+          MPSGraphTensor* highTensor = [mpsGraph constantWithScalar:3.0
+                                                              shape:@[@1]
+                                                           dataType:getMPSDataType(self.scalar_type())];
+          MPSGraphTensor* lowTensor = [mpsGraph constantWithScalar:-3.0
+                                                              shape:@[@1]
+                                                           dataType:getMPSDataType(self.scalar_type())];
+          MPSGraphTensor* oneSixTensor = [mpsGraph constantWithScalar:1.0/6.0
+                                                              shape:@[@1]
+                                                           dataType:getMPSDataType(self.scalar_type())];
+          MPSGraphTensor *inputLessThanHighPredicateTensor = [mpsGraph lessThanWithPrimaryTensor:inputTensor
+                                                                                 secondaryTensor:highTensor
+                                                                                            name:nil];
+          MPSGraphTensor *inputGreaterThanLowPredicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor
+                                                                                      secondaryTensor:lowTensor
+                                                                                                 name:nil];
+          MPSGraphTensor* inIntervalTensor = [mpsGraph logicalANDWithPrimaryTensor:inputLessThanHighPredicateTensor
+                                                                   secondaryTensor:inputGreaterThanLowPredicateTensor
+                                                                              name:nil];
+          MPSGraphTensor* outputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradOutputTensor
+                                                                   secondaryTensor:oneSixTensor
+                                                                              name:nil];
+
+          outputTensor = [mpsGraph selectWithPredicateTensor:inIntervalTensor
+                                         truePredicateTensor:outputTensor
+                                        falsePredicateTensor:zeroTensor
+                                                        name:nil];
+          newCachedGraph->inputTensor_ = inputTensor;
+          newCachedGraph->gradOutputTensor_ = gradOutputTensor;
+          newCachedGraph->gradInputTensor_ = outputTensor;
+        }
+        return newCachedGraph;
+      });
+      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+    }
+
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
+    Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output);
+    Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input);
+
+    // Create dictionary of inputs and outputs
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
+      gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData()
+    };
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
+    };
+
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+}
+
 // -------------------------------------------------
 // Hardtanh backward
 
@@ -2073,12 +2601,17 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) {
   using namespace mps;
   using CachedGraph = MPSUnaryCachedGraph;
 
-  TORCH_CHECK(self.is_mps());
-
   if (output.numel() == 0) {
     return output;
   }
 
+  auto resultMemFormat = output.suggest_memory_format();
+  bool executeGatherOp = !(self.is_contiguous(resultMemFormat) && output.is_contiguous(resultMemFormat));
+  Tensor out;
+  if (executeGatherOp && !output.is_contiguous(MemoryFormat::Contiguous)) {
+    out = at::empty_like(output, MemoryFormat::Contiguous);
+  }
+
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
   MPSStream* stream = at::mps::getCurrentMPSStream();
@@ -2159,9 +2692,9 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) {
           });
       cachedGraph = static_cast<CachedGraph*>(tmpCachedGraph);
     }
-    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp);
     Placeholder outputPlaceholder =
-        Placeholder(cachedGraph->outputTensor_, output);
+        Placeholder(cachedGraph->outputTensor_, out.has_storage() ? out : output, nil, false);
 
     // Create dictionary of inputs and outputs
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
@@ -2175,6 +2708,9 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) {
     };
 
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    if (out.has_storage()) {
+      output.copy_(out);
+    }
   }
   return output;
 }
@@ -2196,11 +2732,10 @@ Tensor hardswish_mps(const Tensor& self) {
 Tensor hardswish_backward_mps(const Tensor& grad_output, const Tensor& self) {
   using namespace mps;
 
-  if (grad_output.numel() == 0) {
-    return grad_output;
-  }
-
   Tensor grad_input = at::empty_like(self, self.suggest_memory_format());
+  if (grad_input.numel() == 0) {
+    return grad_input;
+  }
 
   struct CachedGraph : public MPSCachedGraph {
     CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
@@ -2211,113 +2746,102 @@ Tensor hardswish_backward_mps(const Tensor& grad_output, const Tensor& self) {
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
-  MPSStream* stream = at::mps::getCurrentMPSStream();
-
   @autoreleasepool {
     string key = "hardswish_backward_mps" + getTensorsStringKey({self});
-    CachedGraph* cachedGraph = static_cast<CachedGraph*>(cache_->LookUp(key));
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
     if (!cachedGraph) {
-      MPSCachedGraph* tmpCachedGraph =
-          cache_->CreateCachedGraph(key, ^MPSCachedGraph*() {
-            CachedGraph* newCachedGraph = nil;
-            @autoreleasepool {
-              MPSGraph* mpsGraph = make_mps_graph();
-              newCachedGraph = new CachedGraph(mpsGraph);
-              MPSGraphTensor* gradOutputTensor =
-                  mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
-              MPSGraphTensor* inputTensor =
-                  mpsGraphRankedPlaceHolder(mpsGraph, self);
-
-              MPSGraphTensor* zeroTensor = [mpsGraph
-                  constantWithScalar:0.0f
-                               shape:@[ @1 ]
-                            dataType:getMPSDataType(grad_output.scalar_type())];
-
-              MPSGraphTensor* unitTensor = [mpsGraph
-                  constantWithScalar:1.0f
-                               shape:@[ @1 ]
-                            dataType:getMPSDataType(grad_output.scalar_type())];
-
-              MPSGraphTensor* threeTensor = [mpsGraph
-                  constantWithScalar:3.0f
-                               shape:@[ @1 ]
-                            dataType:getMPSDataType(grad_output.scalar_type())];
-
-              MPSGraphTensor* negativeThreeTensor = [mpsGraph
-                  constantWithScalar:-3.0f
-                               shape:@[ @1 ]
-                            dataType:getMPSDataType(grad_output.scalar_type())];
-
-              MPSGraphTensor* halfTensor = [mpsGraph
-                  constantWithScalar:0.5f
-                               shape:@[ @1 ]
-                            dataType:getMPSDataType(grad_output.scalar_type())];
-
-              MPSGraphTensor* tempTensor =
-                  [mpsGraph divisionWithPrimaryTensor:inputTensor
-                                      secondaryTensor:threeTensor
-                                                 name:nil];
-
-              MPSGraphTensor* weightedTensor =
-                  [mpsGraph additionWithPrimaryTensor:tempTensor
-                                      secondaryTensor:halfTensor
-                                                 name:nil];
-
-              MPSGraphTensor* lessThanMinPredicateTensor = [mpsGraph
-                  lessThanOrEqualToWithPrimaryTensor:inputTensor
-                                     secondaryTensor:negativeThreeTensor
-                                                name:nil];
-
-              MPSGraphTensor* lessThanMaxPredicateTensor =
-                  [mpsGraph lessThanWithPrimaryTensor:inputTensor
-                                      secondaryTensor:threeTensor
-                                                 name:nil];
-
-              MPSGraphTensor* lessThanMaxGradTensor =
-                  [mpsGraph selectWithPredicateTensor:lessThanMaxPredicateTensor
-                                  truePredicateTensor:weightedTensor
-                                 falsePredicateTensor:unitTensor
-                                                 name:nil];
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^MPSCachedGraph*() {
+        CachedGraph* newCachedGraph = nil;
+        @autoreleasepool {
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+          MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
 
-              MPSGraphTensor* gradTensor =
-                  [mpsGraph selectWithPredicateTensor:lessThanMinPredicateTensor
-                                  truePredicateTensor:zeroTensor
-                                 falsePredicateTensor:lessThanMaxGradTensor
-                                                 name:nil];
-              MPSGraphTensor* gradInputTensor =
-                  [mpsGraph multiplicationWithPrimaryTensor:gradTensor
-                                            secondaryTensor:gradOutputTensor
-                                                       name:nil];
+          MPSGraphTensor* zeroTensor = [mpsGraph
+              constantWithScalar:0.0f
+                           shape:@[ @1 ]
+                        dataType:getMPSDataType(grad_output.scalar_type())];
+
+          MPSGraphTensor* unitTensor = [mpsGraph
+              constantWithScalar:1.0f
+                           shape:@[ @1 ]
+                        dataType:getMPSDataType(grad_output.scalar_type())];
+
+          MPSGraphTensor* threeTensor = [mpsGraph
+              constantWithScalar:3.0f
+                           shape:@[ @1 ]
+                        dataType:getMPSDataType(grad_output.scalar_type())];
+
+          MPSGraphTensor* negativeThreeTensor = [mpsGraph
+              constantWithScalar:-3.0f
+                           shape:@[ @1 ]
+                        dataType:getMPSDataType(grad_output.scalar_type())];
+
+          MPSGraphTensor* halfTensor = [mpsGraph
+              constantWithScalar:0.5f
+                           shape:@[ @1 ]
+                        dataType:getMPSDataType(grad_output.scalar_type())];
+
+          MPSGraphTensor* tempTensor =
+              [mpsGraph divisionWithPrimaryTensor:inputTensor
+                                  secondaryTensor:threeTensor
+                                             name:nil];
+
+          MPSGraphTensor* weightedTensor =
+              [mpsGraph additionWithPrimaryTensor:tempTensor
+                                  secondaryTensor:halfTensor
+                                             name:nil];
+
+          MPSGraphTensor* lessThanMinPredicateTensor = [mpsGraph
+              lessThanOrEqualToWithPrimaryTensor:inputTensor
+                                 secondaryTensor:negativeThreeTensor
+                                            name:nil];
+
+          MPSGraphTensor* lessThanMaxPredicateTensor =
+              [mpsGraph lessThanWithPrimaryTensor:inputTensor
+                                  secondaryTensor:threeTensor
+                                             name:nil];
+
+          MPSGraphTensor* lessThanMaxGradTensor =
+              [mpsGraph selectWithPredicateTensor:lessThanMaxPredicateTensor
+                              truePredicateTensor:weightedTensor
+                             falsePredicateTensor:unitTensor
+                                             name:nil];
+
+          MPSGraphTensor* gradTensor =
+              [mpsGraph selectWithPredicateTensor:lessThanMinPredicateTensor
+                              truePredicateTensor:zeroTensor
+                             falsePredicateTensor:lessThanMaxGradTensor
+                                             name:nil];
+          MPSGraphTensor* gradInputTensor =
+              [mpsGraph multiplicationWithPrimaryTensor:gradTensor
+                                        secondaryTensor:gradOutputTensor
+                                                   name:nil];
 
-              newCachedGraph->gradOutputTensor_ = gradOutputTensor;
-              newCachedGraph->inputTensor_ = inputTensor;
-              newCachedGraph->gradInputTensor_ = gradInputTensor;
-            }
-            return newCachedGraph;
-          });
-      cachedGraph = static_cast<CachedGraph*>(tmpCachedGraph);
+          newCachedGraph->gradOutputTensor_ = gradOutputTensor;
+          newCachedGraph->inputTensor_ = inputTensor;
+          newCachedGraph->gradInputTensor_ = gradInputTensor;
+        }
+        return newCachedGraph;
+      });
     }
 
-    Placeholder gradOutputPlaceholder =
-        Placeholder(cachedGraph->gradOutputTensor_, grad_output);
+    Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output);
     Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-    Placeholder gradInputPlaceholder =
-        Placeholder(cachedGraph->gradInputTensor_, grad_input);
+    Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input);
 
     // Create dictionary of inputs and outputs
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
-      gradOutputPlaceholder.getMPSGraphTensor() :
-          gradOutputPlaceholder.getMPSGraphTensorData(),
-      selfPlaceholder.getMPSGraphTensor() :
-          selfPlaceholder.getMPSGraphTensorData()
+      gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
+      selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
     };
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-      gradInputPlaceholder.getMPSGraphTensor() :
-          gradInputPlaceholder.getMPSGraphTensorData()
+      gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
     };
 
-    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
   }
   return grad_input;
 }
diff --git a/aten/src/ATen/native/mps/operations/AdaptivePooling.mm b/aten/src/ATen/native/mps/operations/AdaptivePooling.mm
index 412bf0c98021..d90545147e39 100644
--- a/aten/src/ATen/native/mps/operations/AdaptivePooling.mm
+++ b/aten/src/ATen/native/mps/operations/AdaptivePooling.mm
@@ -1,48 +1,43 @@
 //  Copyright © 2022 Apple Inc.
 
-#include <ATen/ATen.h>
-#include <ATen/Tensor.h>
-#include <ATen/Utils.h>
-#include <ATen/TensorUtils.h>
-#include <ATen/mps/MPSStream.h>
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/native/Pool.h>
-#include <torch/library.h>
 
 namespace at::native {
 
-
 void set_kernel_params
   (int64_t isizeH, int64_t isizeW,
    int64_t osizeH, int64_t osizeW,
    int64_t &strideH, int64_t &strideW,
-   int64_t &kernel_sizeH, int64_t &kernel_sizeW) {
+   int64_t &kernel_sizeH, int64_t &kernel_sizeW,
+   bool check_avg_pooling = false) {
 
   TORCH_CHECK((isizeH >= osizeH && isizeW >= osizeW) || (isizeH <= osizeH && isizeW <= osizeW),
-              "Adaptive pool MPS: Input height and width must both be greather than or equal to, or lesser than, output height and width")
-
-  TORCH_CHECK((!(isizeH <= osizeH && isizeW <= osizeW) || (osizeH % isizeH == 0 && osizeW % isizeW == 0)),
-              "Adaptive pool MPS: If output is larger than input, output sizes must be multiples of input sizes")
+              "Adaptive pool MPS: Input height and width must both be greater than, "
+              "or equal to, or lesser than output height and width")
 
   if(isizeH >= osizeH) {
+    if (check_avg_pooling) {
+      TORCH_CHECK((isizeH % osizeH == 0 && isizeW % osizeW == 0),
+                   "Adaptive pool MPS: input sizes must be divisible by output sizes.");
+    }
     strideH = (int64_t) (isizeH / osizeH);
     strideW = (int64_t) (isizeW / osizeW);
-
     kernel_sizeH = isizeH - (osizeH-1) * strideH;
     kernel_sizeW = isizeW - (osizeW-1) * strideW;
-  }
-  else {
+  } else {
+    if (check_avg_pooling) {
+      TORCH_CHECK((osizeH % isizeH == 0 && osizeW % isizeW == 0),
+                  "Adaptive pool MPS: output sizes must be divisible by input sizes.");
+    }
     strideH = (int64_t) (osizeH / isizeH);
     strideW = (int64_t) (osizeW / isizeW);
-
     kernel_sizeH = osizeH - (isizeH-1) * strideH;
     kernel_sizeW = osizeW - (isizeW-1) * strideW;
   }
-
 }
 
 // Adaptive average pooling
-
 Tensor& adaptive_avg_pool2d_out_mps
   (const Tensor& input,
    IntArrayRef output_size,
@@ -51,40 +46,21 @@
   for (int64_t i = 1; i < input.ndimension(); i++) {
     TORCH_CHECK(input.size(i) > 0,
       "adaptive_avg_pool2d(): Expected input to have non-zero size for non-batch dimensions, "
-      "but input has sizes ", input.sizes(), " with dimension ", i, " being "
-      "empty");
+      "but input has sizes ", input.sizes(), " with dimension ", i, " being empty");
   }
 
   int64_t isizeH = input.size(-2);
   int64_t isizeW = input.size(-1);
-
   int64_t osizeH = output_size[0];
   int64_t osizeW = output_size[1];
 
-  if(input.suggest_memory_format() == at::MemoryFormat::ChannelsLast)
-    TORCH_CHECK(input.ndimension() == 4,
-                    "adaptive_avg_pool2d(): Expected 4D tensor, but got ",
-                    input.sizes())
-
-  switch (input.suggest_memory_format()) {
-    case at::MemoryFormat::Contiguous:
-    case at::MemoryFormat::ChannelsLast:
-      break;
-    default:
-        TORCH_CHECK(
-          false,
-          "Unsupported memory format. Supports only ChannelsLast, Contiguous")
-  }
-
-  int64_t strideH;
-  int64_t strideW;
-  int64_t kernel_sizeH;
-  int64_t kernel_sizeW;
+  int64_t strideH = 0, strideW = 0;
+  int64_t kernel_sizeH = 0, kernel_sizeW = 0;
 
   set_kernel_params(isizeH, isizeW,
                     osizeH, osizeW,
                     strideH, strideW,
-                    kernel_sizeH, kernel_sizeW);
+                    kernel_sizeH, kernel_sizeW, true);
 
   if(isizeH >= osizeH) {
     output =  at::avg_pool2d(input,
@@ -161,46 +137,46 @@
   (const Tensor& gradOutput,
    const Tensor& input) {
 
-    int64_t isizeH = input.size(-2);
-    int64_t isizeW = input.size(-1);
-    int64_t osizeH = gradOutput.size(-2);
-    int64_t osizeW = gradOutput.size(-1);
-
-    int64_t strideH, strideW, kernel_sizeH, kernel_sizeW;
-
-    set_kernel_params(isizeH, isizeW,
-                      osizeH, osizeW,
-                      strideH, strideW,
-                      kernel_sizeH, kernel_sizeW);
-    auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-    if (gradInput.numel() != 0) {
-      if(isizeH >= osizeH) {
-        gradInput = at::avg_pool2d_backward(gradOutput,
-                                            input,
-                                            IntArrayRef({kernel_sizeH, kernel_sizeW}),
-                                            IntArrayRef({strideH, strideW}),
-                                            IntArrayRef({0, 0}),
-                                            false,
-                                            true,
-                                            c10::nullopt);
-      } else {
-        gradInput = at::avg_pool2d(gradOutput,
-                                   IntArrayRef({kernel_sizeH, kernel_sizeW}),
-                                   IntArrayRef({strideH, strideW}),
-                                   IntArrayRef({0, 0}),
-                                   false,
-                                   true,
-                                   c10::nullopt);
-        gradInput = at::mul(gradInput, kernel_sizeH*kernel_sizeW);
-      }
-    }
+  int64_t isizeH = input.size(-2);
+  int64_t isizeW = input.size(-1);
+  int64_t osizeH = gradOutput.size(-2);
+  int64_t osizeW = gradOutput.size(-1);
 
-    return gradInput;
+  int64_t strideH = 0, strideW = 0;
+  int64_t kernel_sizeH = 0, kernel_sizeW = 0;
 
+  set_kernel_params(isizeH, isizeW,
+                    osizeH, osizeW,
+                    strideH, strideW,
+                    kernel_sizeH, kernel_sizeW, true);
+
+  auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  if (gradInput.numel() != 0) {
+    if(isizeH >= osizeH) {
+      gradInput = at::avg_pool2d_backward(gradOutput,
+                                          input,
+                                          IntArrayRef({kernel_sizeH, kernel_sizeW}),
+                                          IntArrayRef({strideH, strideW}),
+                                          IntArrayRef({0, 0}),
+                                          false,
+                                          true,
+                                          c10::nullopt);
+    } else {
+      gradInput = at::avg_pool2d(gradOutput,
+                                  IntArrayRef({kernel_sizeH, kernel_sizeW}),
+                                  IntArrayRef({strideH, strideW}),
+                                  IntArrayRef({0, 0}),
+                                  false,
+                                  true,
+                                  c10::nullopt);
+      gradInput = at::mul(gradInput, kernel_sizeH*kernel_sizeW);
+    }
+  }
+
+  return gradInput;
 }
 
 // Adaptive max pooling
-
 TORCH_IMPL_FUNC(adaptive_max_pool2d_out_mps)
   (const Tensor& input,
    IntArrayRef output_size,
@@ -216,44 +192,24 @@
 
   int64_t isizeH = input.size(-2);
   int64_t isizeW = input.size(-1);
-
   int64_t osizeH = output_size[0];
   int64_t osizeW = output_size[1];
 
-  if(input.suggest_memory_format() == at::MemoryFormat::ChannelsLast)
-    TORCH_CHECK(input.ndimension() == 4,
-                    "adaptive_avg_pool2d(): Expected 4D tensor, but got ",
-                    input.sizes())
-
-  switch (input.suggest_memory_format()) {
-    case at::MemoryFormat::Contiguous:
-    case at::MemoryFormat::ChannelsLast:
-      break;
-    default:
-        TORCH_CHECK(
-          false,
-          "Unsupported memory format. Supports only ChannelsLast, Contiguous")
-  }
-
-  int64_t strideH;
-  int64_t strideW;
-  int64_t kernel_sizeH;
-  int64_t kernel_sizeW;
+  int64_t strideH = 0, strideW = 0;
+  int64_t kernel_sizeH = 0, kernel_sizeW = 0;
 
   set_kernel_params(isizeH, isizeW,
                     osizeH, osizeW,
                     strideH, strideW,
                     kernel_sizeH, kernel_sizeW);
 
-  auto outputs = at::max_pool2d_with_indices(input,
-                              IntArrayRef({kernel_sizeH, kernel_sizeW}),
-                              IntArrayRef({strideH, strideW}),
-                              IntArrayRef({0, 0}),
-                              IntArrayRef({1, 1}),
-                              false);
-
-  output.copy_(std::get<0>(outputs));
-  indices.copy_(std::get<1>(outputs));
+  at::max_pool2d_with_indices_out(const_cast<Tensor&>(output),
+                                  const_cast<Tensor&>(indices), input,
+                                  IntArrayRef({kernel_sizeH, kernel_sizeW}),
+                                  IntArrayRef({strideH, strideW}),
+                                  IntArrayRef({0, 0}),
+                                  IntArrayRef({1, 1}),
+                                  false);
 }
 
 TORCH_IMPL_FUNC(adaptive_max_pool2d_backward_out_mps)
@@ -267,24 +223,22 @@
   int64_t osizeH = gradOutput.size(-2);
   int64_t osizeW = gradOutput.size(-1);
 
-  int64_t strideH, strideW, kernel_sizeH, kernel_sizeW;
+  int64_t strideH = 0, strideW = 0;
+  int64_t kernel_sizeH = 0, kernel_sizeW = 0;
 
   set_kernel_params(isizeH, isizeW,
                     osizeH, osizeW,
                     strideH, strideW,
                     kernel_sizeH, kernel_sizeW);
 
-  auto returnGradInput = at::max_pool2d_with_indices_backward(gradOutput,
-                                                              input,
-                                                              IntArrayRef({kernel_sizeH, kernel_sizeW}),
-                                                              IntArrayRef({strideH, strideW}),
-                                                              IntArrayRef({0, 0}),
-                                                              IntArrayRef({1, 1}),
-                                                              false,
-                                                              indices);
-
-  gradInput.copy_(returnGradInput);
-
+  at::max_pool2d_with_indices_backward_out(const_cast<Tensor&>(gradInput),
+                                           gradOutput, input,
+                                           IntArrayRef({kernel_sizeH, kernel_sizeW}),
+                                           IntArrayRef({strideH, strideW}),
+                                           IntArrayRef({0, 0}),
+                                           IntArrayRef({1, 1}),
+                                           false,
+                                           indices);
 }
 
 } // namespace at::native
diff --git a/aten/src/ATen/native/mps/operations/BinaryKernel.mm b/aten/src/ATen/native/mps/operations/BinaryKernel.mm
new file mode 100644
index 000000000000..395388773563
--- /dev/null
+++ b/aten/src/ATen/native/mps/operations/BinaryKernel.mm
@@ -0,0 +1,259 @@
+#include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/BinaryOps.h>
+
+namespace at::native {
+namespace mps {
+
+static const char* METAL_BINARY = R"BINARY_METAL(
+
+#include <metal_stdlib>
+using namespace metal;
+
+template<typename T>
+kernel void fmax(constant void     * input_        [[buffer(0)]],
+                  constant void     * other_        [[buffer(1)]],
+                  device   void     * out_          [[buffer(2)]],
+                  constant uint3    * offsets       [[buffer(3)]],
+                  uint tid [[thread_position_in_grid]]) {
+  device   T* out   = (device   T*)((device uint8_t*)out_ + offsets[tid].x);
+  constant T* input = (constant T*)((constant uint8_t*)input_ + offsets[tid].y);
+  constant T* other = (constant T*)((constant uint8_t*)other_ + offsets[tid].z);
+
+  *out = fmax(*input, *other);
+}
+
+template<typename T>
+kernel void fmin(constant void     * input_        [[buffer(0)]],
+                  constant void     * other_        [[buffer(1)]],
+                  device   void     * out_          [[buffer(2)]],
+                  constant uint3    * offsets       [[buffer(3)]],
+                  uint tid [[thread_position_in_grid]]) {
+  device   T* out   = (device   T*)((device uint8_t*)out_ + offsets[tid].x);
+  constant T* input = (constant T*)((constant uint8_t*)input_ + offsets[tid].y);
+  constant T* other = (constant T*)((constant uint8_t*)other_ + offsets[tid].z);
+
+  *out = fmin(*input, *other);
+}
+
+template<typename T>
+kernel void copysign(constant void     * input_        [[buffer(0)]],
+                     constant void     * other_        [[buffer(1)]],
+                     device   void     * out_          [[buffer(2)]],
+                     constant uint3    * offsets       [[buffer(3)]],
+                     uint tid [[thread_position_in_grid]]) {
+  device   T* out   = (device   T*)((device uint8_t*)out_ + offsets[tid].x);
+  constant T* input = (constant T*)((constant uint8_t*)input_ + offsets[tid].y);
+  constant T* other = (constant T*)((constant uint8_t*)other_ + offsets[tid].z);
+
+  *out = copysign(*input, *other);
+}
+
+template<typename T>
+kernel void copysign_integral(constant void     * input_        [[buffer(0)]],
+                     constant void     * other_        [[buffer(1)]],
+                     device   void     * out_          [[buffer(2)]],
+                     constant uint3    * offsets       [[buffer(3)]],
+                     uint tid [[thread_position_in_grid]]) {
+  device   float* out = (device float*)((device uint8_t*)out_ + offsets[tid].x);
+  constant T* input = (constant T*)((constant uint8_t*)input_ + offsets[tid].y);
+  constant T* other = (constant T*)((constant uint8_t*)other_ + offsets[tid].z);
+
+  *out = copysign(static_cast<float>(*input), static_cast<float>(*other));
+}
+
+#define REGISTER_FMAX_OP(DTYPE)                        \
+template                                               \
+[[host_name("fmax_" #DTYPE)]]                          \
+kernel void fmax<DTYPE>(                               \
+  constant void     * input_        [[buffer(0)]],     \
+  constant void     * other_        [[buffer(1)]],     \
+  device   void     * out_          [[buffer(2)]],     \
+  constant uint3    * offsets       [[buffer(3)]],     \
+  uint tid [[thread_position_in_grid]]);
+
+#define REGISTER_FMIN_OP(DTYPE)                        \
+template                                               \
+[[host_name("fmin_" #DTYPE)]]                          \
+kernel void fmin<DTYPE>(                               \
+  constant void     * input_        [[buffer(0)]],     \
+  constant void     * other_        [[buffer(1)]],     \
+  device   void     * out_          [[buffer(2)]],     \
+  constant uint3    * offsets       [[buffer(3)]],     \
+  uint tid [[thread_position_in_grid]]);
+
+#define REGISTER_COPYSIGN_OP(DTYPE)                    \
+template                                               \
+[[host_name("copysign_" #DTYPE)]]                      \
+kernel void copysign<DTYPE>(                           \
+  constant void     * input_        [[buffer(0)]],     \
+  constant void     * other_        [[buffer(1)]],     \
+  device   void     * out_          [[buffer(2)]],     \
+  constant uint3    * offsets       [[buffer(3)]],     \
+  uint tid [[thread_position_in_grid]]);
+
+#define REGISTER_COPYSIGN_INTEGRAL_OP(DTYPE)           \
+template                                               \
+[[host_name("copysign_" #DTYPE)]]                      \
+kernel void copysign_integral<DTYPE>(                  \
+  constant void     * input_        [[buffer(0)]],     \
+  constant void     * other_        [[buffer(1)]],     \
+  device   void     * out_          [[buffer(2)]],     \
+  constant uint3    * offsets       [[buffer(3)]],     \
+  uint tid [[thread_position_in_grid]]);
+
+REGISTER_FMAX_OP(float);
+REGISTER_FMAX_OP(half);
+REGISTER_FMIN_OP(float);
+REGISTER_FMIN_OP(half);
+REGISTER_COPYSIGN_OP(float);
+REGISTER_COPYSIGN_OP(half);
+REGISTER_COPYSIGN_INTEGRAL_OP(int);
+REGISTER_COPYSIGN_INTEGRAL_OP(long);
+REGISTER_COPYSIGN_INTEGRAL_OP(short);
+REGISTER_COPYSIGN_INTEGRAL_OP(char);
+REGISTER_COPYSIGN_INTEGRAL_OP(uchar);
+REGISTER_COPYSIGN_INTEGRAL_OP(bool);
+
+)BINARY_METAL";
+
+using namespace mps;
+
+static id<MTLLibrary> compileBinaryOpsLibrary(id<MTLDevice> device) {
+  static id<MTLLibrary> binaryLibrary = nil;
+  if (binaryLibrary) {
+    return binaryLibrary;
+  }
+
+  NSError *error = nil;
+  MTLCompileOptions *options = [[MTLCompileOptions new] autorelease];
+  [options setLanguageVersion: MTLLanguageVersion2_3];
+  binaryLibrary  = [device newLibraryWithSource:[NSString stringWithCString: METAL_BINARY encoding:NSASCIIStringEncoding]
+                                       options:options
+                                         error:&error];
+  TORCH_CHECK(binaryLibrary, "Failed to create metal binary library, error: ", [[error description] UTF8String]);
+  return binaryLibrary;
+}
+
+static id<MTLComputePipelineState> binaryPipelineState(id<MTLDevice> device, const std::string& kernel) {
+  static std::unordered_map<std::string, id<MTLComputePipelineState>> psoCache;
+  id<MTLComputePipelineState> pso = psoCache[kernel];
+  if (pso) {
+    return pso;
+  }
+
+  NSError* error = nil;
+  id<MTLLibrary> binaryLib = compileBinaryOpsLibrary(device);
+  id<MTLFunction> binaryFunc = [binaryLib newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]];
+  TORCH_CHECK(binaryFunc, "Failed to create function state object for: ", kernel);
+  pso = [device newComputePipelineStateWithFunction:binaryFunc error:&error];
+  TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]);
+
+  psoCache[kernel] = pso;
+  return pso;
+}
+
+void binary_mps_impl(TensorIteratorBase& iter, const std::string func_name) {
+  TORCH_CHECK(iter.common_dtype() != at::kDouble, "float64 is not supported on MPS");
+
+  Tensor input = iter.input(0);
+  Tensor other = iter.input(1);
+  Tensor out = iter.output();
+
+  id<MTLBuffer> inputBuffer  = getMTLBufferStorage(input);
+  id<MTLBuffer> otherBuffer  = getMTLBufferStorage(other);
+  id<MTLBuffer> outputBuffer = getMTLBufferStorage(out);
+  id<MTLDevice> device = MPSDevice::getInstance()->device();
+  MPSStream* mpsStream = getCurrentMPSStream();
+  const uint32_t nDim = iter.ndim();
+  constexpr uint32_t nOffsets = 3;
+  const uint32_t numThreads = iter.numel();
+  dispatch_sync(mpsStream->queue(), ^(){
+    @autoreleasepool {
+      NSError* error = nil;
+      id<MTLCommandBuffer> commandBuffer = mpsStream->commandBuffer();
+      id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
+      MTLSize gridSize = MTLSizeMake(numThreads, 1, 1);
+      const IntArrayRef& iterShape = iter.shape();
+      std::vector<uint32_t> iterShapeData(iterShape.size());
+      std::vector<std::array<uint32_t, nOffsets>> strides(nDim);
+
+      for (const auto i: c10::irange(iterShape.size())) {
+        TORCH_CHECK(i <= UINT32_MAX);
+        iterShapeData[i] = (uint32_t)(iterShape[i]);
+      }
+
+      for (const auto i: c10::irange(nDim)) {
+        for (const auto offset: c10::irange(nOffsets)) {
+            strides[i][offset] = iter.strides(offset)[i];
+        }
+      }
+
+      id<MTLFunction> kernelDataOffsetsFunction = MPSDevice::getInstance()->metalIndexingFunction("kernel_index_offsets", nil);
+      id<MTLComputePipelineState> kernelDataOffsetsPSO = [[device newComputePipelineStateWithFunction: kernelDataOffsetsFunction
+                                                                                                error: &error] autorelease];
+      id<MTLBuffer> kernelDataOffsets = [[device newBufferWithLength: numThreads * sizeof(simd_uint3)
+                                                             options: 0] autorelease];
+      TORCH_CHECK(kernelDataOffsetsPSO, "Failed to created pipeline state object, error: ", [[error description] UTF8String]);
+      [computeEncoder setComputePipelineState:kernelDataOffsetsPSO];
+      [computeEncoder setBytes:strides.data() length:sizeof(uint32_t) * nDim * nOffsets atIndex:0];
+      [computeEncoder setBuffer:kernelDataOffsets offset:0 atIndex:1];
+      [computeEncoder setBytes:iterShapeData.data() length:sizeof(uint32_t) * iterShape.size() atIndex:2];
+      [computeEncoder setBytes:&nDim length:sizeof(uint32_t) atIndex:3];
+      [computeEncoder setBytes:&nOffsets length:sizeof(uint32_t) atIndex:4];
+
+      NSUInteger kernelOffsetsTGSize = kernelDataOffsetsPSO.maxTotalThreadsPerThreadgroup;
+      if (kernelOffsetsTGSize > numThreads)
+          kernelOffsetsTGSize = numThreads;
+
+      MTLSize kernelOffsetsThreadGroupSize = MTLSizeMake(kernelOffsetsTGSize, 1, 1);
+      [computeEncoder dispatchThreads: gridSize
+                threadsPerThreadgroup: kernelOffsetsThreadGroupSize];
+
+      const std::string kernel = func_name + "_" + scalarToMetalTypeString(input.scalar_type());
+      id<MTLComputePipelineState> binaryPSO = binaryPipelineState(device, kernel);
+      [computeEncoder setComputePipelineState:binaryPSO];
+      [computeEncoder setBuffer:inputBuffer  offset:input.storage_offset() * input.element_size() atIndex:0];
+      [computeEncoder setBuffer:otherBuffer  offset:other.storage_offset() * other.element_size() atIndex:1];
+      [computeEncoder setBuffer:outputBuffer offset:out.storage_offset() * out.element_size() atIndex:2];
+      [computeEncoder setBuffer:kernelDataOffsets offset:0 atIndex:3];
+
+      NSUInteger tgSize = binaryPSO.maxTotalThreadsPerThreadgroup;
+      if (tgSize > numThreads) {
+          tgSize = numThreads;
+      }
+
+      MTLSize threadGroupSize = MTLSizeMake(tgSize, 1, 1);
+      [computeEncoder dispatchThreads: gridSize
+                threadsPerThreadgroup: threadGroupSize];
+
+      [computeEncoder endEncoding];
+      mpsStream->commit(true);
+    }
+  });
+}
+} // namespace mps
+
+void fmax_mps_kernel(TensorIteratorBase& iter) {
+    if (isFloatingType(iter.common_dtype())) {
+        mps::binary_mps_impl(iter, "fmax");
+    } else {
+        at::maximum_out(const_cast<Tensor&>(iter.output()), iter.input(0), iter.input(1));
+    }
+}
+void fmin_mps_kernel(TensorIteratorBase& iter) {
+    if (isFloatingType(iter.common_dtype())) {
+        mps::binary_mps_impl(iter, "fmin");
+    } else {
+        at::minimum_out(const_cast<Tensor&>(iter.output()), iter.input(0), iter.input(1));
+    }
+}
+
+void copysign_mps_kernel(TensorIteratorBase& iter) {
+    mps::binary_mps_impl(iter, "copysign");
+}
+
+REGISTER_DISPATCH(fmax_stub, &fmax_mps_kernel);
+REGISTER_DISPATCH(fmin_stub, &fmin_mps_kernel);
+REGISTER_DISPATCH(copysign_stub, &copysign_mps_kernel);
+
+} // namespace at::native
diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index 34700ffe2758..4569add637a4 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -26,6 +26,12 @@
 void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha,
                     const Tensor& output_, std::string op_name, BinaryOpBlock binaryBlock)
 {
+  TORCH_CHECK(!(!is_macos_13_or_newer() && self.scalar_type() == ScalarType::Byte ),
+              "MPS support binary op with uint8 natively starting from macOS 13.0");
+  TORCH_CHECK(!(op_name == "power" && !is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS) &&
+              (self.scalar_type() == ScalarType::Long ||
+              (other.scalar_type() == ScalarType::Long && (self.scalar_type() != ScalarType::Half && self.scalar_type() != ScalarType::Float)))),
+              "MPS: ", op_name, " op with int64 input is supported natively starting from macOS 13.2");
   MPSStream* mpsStream = getCurrentMPSStream();
 
   const bool is_self_scalar = self.dim() == 0;
@@ -109,7 +115,8 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
           newCachedGraph->outputTensor = binaryBlock(newCachedGraph, primaryCastTensor, secondaryCastTensor);
           // Cast output tensor to an expected type if needed, which addresses discrepancy when int64 scalar is added to int32 tensor
           // Output tensor should have been promoted but it remains an int32 tensor
-          if (outputDataType != common_dtype) {
+          if (outputDataType != common_dtype ||
+             [newCachedGraph->outputTensor dataType] != getMPSDataType(outputDataType)) {
             newCachedGraph->outputTensor = castMPSTensor(mpsGraph, newCachedGraph->outputTensor, outputDataType);
           }
         }
@@ -170,8 +177,21 @@ void div_mode_template(const Tensor& self, const Tensor& other,
                        c10::optional<c10::string_view> rounding_mode,
                        const Tensor& output, const string op_name)
 {
+  if(rounding_mode.has_value() && *rounding_mode == "floor"){
+    TORCH_CHECK(self.scalar_type() != ScalarType::Long,
+                "MPS: does not support floor_divide op with int64 input");
+  }
   BinaryOpBlock div_mode_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
     MPSGraph* mpsGraph = cachedGraph->graph();
+    bool isFloatInput = ([primaryCastTensor dataType] & MPSDataTypeFloatBit) != 0;
+    if(!isFloatInput && rounding_mode.has_value() && (*rounding_mode == "floor" || *rounding_mode == "trunc")) {
+      primaryCastTensor = [mpsGraph castTensor:primaryCastTensor
+                                        toType:MPSDataTypeFloat32
+                                          name:@"primaryCastTensor"];
+      secondaryCastTensor = [mpsGraph castTensor:secondaryCastTensor
+                                          toType:MPSDataTypeFloat32
+                                            name:@"secondaryCastTensor"];
+    }
     MPSGraphTensor* divTensor =  [mpsGraph divisionWithPrimaryTensor:primaryCastTensor
                                                      secondaryTensor:secondaryCastTensor
                                                                 name:nil];
@@ -182,9 +202,27 @@ void div_mode_template(const Tensor& self, const Tensor& other,
     if (!rounding_mode.has_value() || !isFloatOutput) {
       return divTensor;
     } else if (*rounding_mode == "trunc") {
-      return trunc_tensor(mpsGraph, divTensor);
+      auto truncTensor =  trunc_tensor(mpsGraph, divTensor);
+      if (op_name == "fmod_mps_out") {
+        auto mulTensor = [mpsGraph multiplicationWithPrimaryTensor:truncTensor
+                                                   secondaryTensor:secondaryCastTensor
+                                                              name:nil];
+        return [mpsGraph subtractionWithPrimaryTensor:primaryCastTensor
+                                      secondaryTensor:mulTensor
+                                                 name:nil];
+      }
+      return truncTensor;
     } else if (*rounding_mode == "floor") {
-      return [mpsGraph floorWithTensor:divTensor name:nil];
+      MPSGraphTensor* floorTensor = [mpsGraph floorWithTensor:divTensor name:nil];
+      if (op_name == "remainder_out_mps") {
+        auto mulTensor = [mpsGraph multiplicationWithPrimaryTensor:floorTensor
+                                                   secondaryTensor:secondaryCastTensor
+                                                              name:nil];
+        return [mpsGraph subtractionWithPrimaryTensor:primaryCastTensor
+                                      secondaryTensor:mulTensor
+                                                 name:nil];
+      }
+      return floorTensor;
     }
     assert(0 && "Invalid rounding mode\n");
     return nullptr;
@@ -245,7 +283,7 @@ void add_sub_template(const Tensor& self, const Tensor& other, const Scalar& alp
 #define CREATE_MPS_STRUCTURED_BINARY_OP_FUNC(func_out, func_stub, other_type)                   \
 TORCH_IMPL_FUNC(func_out) (const Tensor& self, const other_type& other, const Tensor& output) { \
   TORCH_CHECK(!(self.scalar_type() == ScalarType::Long &&                                       \
-               (std::string(#func_stub) == "power" || std::string(#func_stub) == "atan2")),     \
+               std::string(#func_stub) == "atan2"),                                             \
                "MPS does not support ", #func_stub, " op with int64 input")                     \
   mps::binaryOp##other_type(self, other, Scalar(1.0), output, #func_stub,                       \
     ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {                          \
@@ -309,6 +347,28 @@ void add_sub_template(const Tensor& self, const Tensor& other, const Scalar& alp
   mps::add_sub_template(self, other, alpha, output, "sub");
 }
 
+TORCH_IMPL_FUNC(pow_Scalar_out_mps) (const Scalar& base, const Tensor& exp, const Tensor& out) {
+  if (base.equal(1.0)) {
+    out.fill_(1);
+  } else {
+    // Copied and modified from aten/stc/ATen/ScalarOps.h
+    // as MPS doesn't support float64 tensor.
+    Tensor base_tensor;
+    if (base.isFloatingPoint()) {
+      base_tensor = at::scalar_tensor(base, at::device(exp.device()).dtype(at::kFloat));
+    } else if (base.isBoolean()) {
+      base_tensor = at::scalar_tensor(base, at::device(exp.device()).dtype(at::kBool));
+    } else if (base.isComplex()) {
+      base_tensor = at::scalar_tensor(base, at::device(exp.device()).dtype(at::kComplexDouble));
+    } else {
+      AT_ASSERT(base.isIntegral(false));
+      base_tensor = at::scalar_tensor(base, at::device(exp.device()).dtype(at::kLong));
+    }
+    base_tensor.unsafeGetTensorImpl()->set_wrapped_number(true);
+    at::pow_out(const_cast<Tensor&>(out), base_tensor, exp); // redispatch!
+  }
+}
+
 Tensor& floor_divide_out_mps(const Tensor& self, const Tensor& other, Tensor& result) {
   mps::div_mode_template(self, other, "floor", result, "floor_divide_out");
   return result;
@@ -324,6 +384,33 @@ Tensor floor_divide_mps(const Tensor& self, const Tensor& other) {
   return floor_divide_out_mps(self, other, self);
 }
 
+TORCH_IMPL_FUNC(remainder_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output) {
+  mps::div_mode_template(self, other, "floor", output, "remainder_out_mps");
+}
+
+TORCH_IMPL_FUNC(fmod_mps_out) (const Tensor& self, const Tensor& other, const Tensor& output) {
+  mps::div_mode_template(self, other, "trunc", output, "fmod_mps_out");
+}
+
+TORCH_IMPL_FUNC(hypot_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output)
+{
+  mps::BinaryOpBlock hypot_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
+    MPSGraph* mpsGraph = cachedGraph->graph();
+    MPSGraphTensor* twoTensor = [mpsGraph constantWithScalar:2.0
+                                                       shape:@[@1]
+                                                    dataType:primaryCastTensor.dataType];
+    MPSGraphTensor* sumTensor = [mpsGraph additionWithPrimaryTensor:[mpsGraph powerWithPrimaryTensor:primaryCastTensor
+                                                                                     secondaryTensor:twoTensor
+                                                                                                name:nil]
+                                                    secondaryTensor:[mpsGraph powerWithPrimaryTensor:secondaryCastTensor
+                                                                                     secondaryTensor:twoTensor
+                                                                                                name:nil]
+                                                               name:nil];
+    return [mpsGraph squareRootWithTensor:sumTensor name:nil];
+  };
+  mps::binaryOpTensor(self, other, Scalar(1.0), output, "hypot_out_mps", hypot_op_block);
+}
+
 TORCH_IMPL_FUNC(logaddexp_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output)
 {
   mps::BinaryOpBlock logaddexp_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
@@ -348,4 +435,33 @@ Tensor floor_divide_mps(const Tensor& self, const Tensor& other) {
   mps::binaryOpTensor(self, other, Scalar(1.0), output, "logaddexp2_out_mps", logaddexp2_op_block);
 }
 
+TORCH_IMPL_FUNC(xlogy_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output) {
+  mps::BinaryOpBlock xlogy_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
+    MPSGraph* mpsGraph = cachedGraph->graph();
+    MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
+                                                        shape:@[@1]
+                                                     dataType:primaryCastTensor.dataType];
+    MPSGraphTensor* yIsNaNPredicateTensor = [mpsGraph isNaNWithTensor:secondaryCastTensor
+                                                        name:nil];
+    MPSGraphTensor* logyTensor = [mpsGraph logarithmWithTensor:secondaryCastTensor
+                                                          name:nil];
+    MPSGraphTensor* xlogyTensor = [mpsGraph multiplicationWithPrimaryTensor:primaryCastTensor
+                                                            secondaryTensor:logyTensor
+                                                                       name:nil];
+    MPSGraphTensor* xEqualZeroPredicateTensor = [mpsGraph equalWithPrimaryTensor:primaryCastTensor
+                                                        secondaryTensor:zeroTensor
+                                                                   name:nil];
+    MPSGraphTensor* outputTensor = [mpsGraph selectWithPredicateTensor:xEqualZeroPredicateTensor
+                                                   truePredicateTensor:zeroTensor
+                                                  falsePredicateTensor:xlogyTensor
+                                                                  name:nil];
+    outputTensor = [mpsGraph selectWithPredicateTensor:yIsNaNPredicateTensor
+                                   truePredicateTensor:secondaryCastTensor
+                                  falsePredicateTensor:outputTensor
+                                                  name:nil];
+    return outputTensor;
+  };
+  mps::binaryOpTensor(self, other, Scalar(1.0), output, "xlogy_out_mps", xlogy_op_block);
+}
+
 } // namespace at::native
diff --git a/aten/src/ATen/native/mps/operations/Blas.mm b/aten/src/ATen/native/mps/operations/Blas.mm
index 1a9682ece15a..a5768d0d13af 100644
--- a/aten/src/ATen/native/mps/operations/Blas.mm
+++ b/aten/src/ATen/native/mps/operations/Blas.mm
@@ -20,6 +20,9 @@ Tensor dot_mps(
   const Tensor &self,
   const Tensor &other)
 {
+
+  TORCH_CHECK(self.scalar_type() != ScalarType::Long, "MPS: dot op doesn't support int64 input")
+
   using namespace mps;
   auto output = at::native::empty_mps({}, self.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt);
 
diff --git a/aten/src/ATen/native/mps/operations/ConstantOps.mm b/aten/src/ATen/native/mps/operations/ConstantOps.mm
index e9d633601b13..12e86e14c635 100644
--- a/aten/src/ATen/native/mps/operations/ConstantOps.mm
+++ b/aten/src/ATen/native/mps/operations/ConstantOps.mm
@@ -10,11 +10,14 @@
   if (self.numel() == 0) {
     return self;
   }
+  Tensor output = self;
+  bool needsCopyToOutput = false;
+  if (!self.is_contiguous() || self.storage_offset()) {
+    output = empty_mps(self.sizes(), self.scalar_type(), c10::nullopt, kMPS);
+    needsCopyToOutput = true;
+  }
 
-  MPSStream* stream = getCurrentMPSStream();
-
-  struct CachedGraph : public MPSCachedGraph
-  {
+  struct CachedGraph : public MPSCachedGraph {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
     MPSGraphTensor* outputTensor_ = nil;
   };
@@ -24,10 +27,9 @@
   @autoreleasepool {
     string key = "fill_scalar_mps_impl" + getTensorsStringKey(self) + ":" + to_string(value.toDouble());
 
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-    if(!cachedGraph) {
-
-      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
+    if (!cachedGraph) {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
         CachedGraph *newCachedGraph = nil;
 
         @autoreleasepool{
@@ -42,7 +44,7 @@
           // constantWithScalar does not work for UInt8 Types on MacOS-12.[34]/Ventura preview
           // workaround by filing it as uint32 tensor and than casting to uint8
           // See https://github.com/pytorch/pytorch/issues/83692
-          MPSGraphTensor* inputTensor = [mpsGraph constantWithScalar: value.toDouble()
+          MPSGraphTensor* inputTensor = [mpsGraph constantWithScalar:value.toDouble()
                                                                shape:getMPSShape(self)
                                                             dataType:dataType];
           MPSGraphTensor* outputTensor = [mpsGraph identityWithTensor:inputTensor
@@ -62,18 +64,21 @@
         }
         return newCachedGraph;
       });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, self);
-
-    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = nil;
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_,
+                                                needsCopyToOutput ? output : self,
+                                                nullptr, !needsCopyToOutput);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
       outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
 
-    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), /*feeds*/ nil, results);
+
+    if (needsCopyToOutput) {
+      self.copy_(output);
+    }
   }
 
   return self;
@@ -84,7 +89,7 @@ bool fill_mps_tensor_(Tensor& self, uint8_t value) {
   if (self.is_contiguous()) {
     MPSStream* stream = getCurrentMPSStream();
     auto storage_byte_offset = self.storage_offset() * self.itemsize();
-    stream->fill(mps::getMTLBufferStorage(self), 0, self.nbytes(), storage_byte_offset);
+    stream->fill(mps::getMTLBufferStorage(self), 0, self.storage().nbytes(), storage_byte_offset);
     return true;
   }
   return false;
diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm
index eb1ee36eca02..601cbaec965e 100644
--- a/aten/src/ATen/native/mps/operations/Convolution.mm
+++ b/aten/src/ATen/native/mps/operations/Convolution.mm
@@ -11,6 +11,24 @@
 
 namespace at::native {
 
+void fill_depthwise_conv_desc(MPSGraphDepthwiseConvolution3DOpDescriptor* descriptor_,
+                    NSUInteger strideInX, NSUInteger strideInY,
+                    NSUInteger dilationRateInX, NSUInteger dilationRateInY,
+                    NSUInteger paddingHorizontal, NSUInteger paddingVertical,
+                    c10::MemoryFormat memory_format, NSUInteger groups) {
+  descriptor_.strides = @[@1, [[NSNumber alloc] initWithInteger: strideInY],
+                              [[NSNumber alloc] initWithInteger: strideInX]];
+  descriptor_.dilationRates = @[@1, [[NSNumber alloc] initWithInteger: dilationRateInY],
+                                      [[NSNumber alloc] initWithInteger: dilationRateInX]];
+
+  descriptor_.paddingStyle = MPSGraphPaddingStyleExplicit;
+  descriptor_.paddingValues = @[@0, @0, [[NSNumber alloc] initWithInteger: paddingVertical], [[NSNumber alloc]
+                                                            initWithInteger: paddingVertical], [[NSNumber alloc]
+                                                            initWithInteger: paddingHorizontal], [[NSNumber alloc]
+                                                            initWithInteger: paddingHorizontal]];
+  descriptor_.channelDimensionIndex = -3LL;
+}
+
 // Create convolution descriptor
 void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_,
                     NSUInteger strideInX, NSUInteger strideInY,
@@ -38,15 +56,17 @@ void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_,
   descriptor_.groups = groups;
 }
 
-Tensor _mps_convolution(
+Tensor _mps_convolution_impl(
     const Tensor& input_t,
     const Tensor& weight_t,
     const c10::optional<Tensor>& bias_opt,
     IntArrayRef padding,
     IntArrayRef stride,
     IntArrayRef dilation,
-    int64_t groups) {
+    int64_t groups,
+    c10::optional<IntArrayRef> input_shape) {
   TORCH_CHECK(input_t.dim() < 5, "Conv3D is not supported on MPS");
+  TORCH_CHECK(isFloatingType(input_t.scalar_type()), "Convolution is supported only for Floating types");
 
   namespace native_mps = at::native::mps;
   CheckedFrom c = "mps_convolution";
@@ -65,6 +85,8 @@ Tensor _mps_convolution(
   auto memory_format = input_t.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
   auto output_t = at::empty(
+                    input_shape.has_value() ?
+                    input_shape.value() :
                     conv_output_size(input->sizes(), weight->sizes(),
                                      padding, stride, dilation),
                     input->scalar_type(),
@@ -113,10 +135,11 @@ Tensor _mps_convolution(
     }
 
     string bias_shape_key;
-    if(bias_defined)
+    if(bias_defined) {
       bias_shape_key = to_string(bias_shape[0]);
-    else
+    } else {
       bias_shape_key = "nobias";
+    }
 
     string key = "mps_convolution:" + to_string(stride[0]) + ":" + to_string(stride[1]) + ":"
                                     + to_string(dilation[0]) + ":" + to_string(dilation[1]) + ":"
@@ -135,23 +158,45 @@ Tensor _mps_convolution(
           MPSGraph* mpsGraph = native_mps::make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphConvolution2DOpDescriptor *descriptor_ = [[MPSGraphConvolution2DOpDescriptor new] autorelease];
-          fill_conv_desc(descriptor_, stride[1], stride[0],
+          MPSGraphConvolution2DOpDescriptor *conv2dDescriptor_ =[[MPSGraphConvolution2DOpDescriptor new] autorelease];
+          MPSGraphDepthwiseConvolution3DOpDescriptor *depthWiseConv3dDescriptor_ = [[MPSGraphDepthwiseConvolution3DOpDescriptor new] autorelease];
+          MPSShape* weightShape = mps::getMPSShape(weight_t);
+          bool isDepthwiseConv = ((groups > 1 && (weightShape[1].intValue == 1)) &&
+                                  inputShape.count >= 4 && weightShape.count >= 4  && !is_channels_last);
+          if(isDepthwiseConv) {
+            fill_depthwise_conv_desc(depthWiseConv3dDescriptor_, stride[1], stride[0],
+                                            dilation[1], dilation[0],
+                                            padding[1], padding[0],
+                                            memory_format, groups);
+          } else {
+            fill_conv_desc(conv2dDescriptor_, stride[1], stride[0],
                                       dilation[1], dilation[0],
                                       padding[1], padding[0],
                                       memory_format, groups);
+          }
 
           MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(input_t.scalar_type()), inputShape);
           MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
 
           MPSGraphTensor* biasTensor = nil;
-          if(bias_defined)
+          if(bias_defined) {
             biasTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType((bias_opt.value()).scalar_type()));
+          }
+
+          MPSGraphTensor* outputTensor;
+          if(isDepthwiseConv) {
+              MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor dimension:-3 withDimension:-4 name:nil];
+              outputTensor = [mpsGraph depthwiseConvolution3DWithSourceTensor: inputTensor
+                                                                weightsTensor: weightTransposeTensor
+                                                                   descriptor: depthWiseConv3dDescriptor_
+                                                                         name: nil];
+          } else {
+              outputTensor = [mpsGraph convolution2DWithSourceTensor: inputTensor
+                                                                 weightsTensor: weightTensor
+                                                                    descriptor: conv2dDescriptor_
+                                                                          name: nil];
+          }
 
-          MPSGraphTensor* outputTensor = [mpsGraph convolution2DWithSourceTensor: inputTensor
-                                                                   weightsTensor: weightTensor
-                                                                      descriptor: descriptor_
-                                                                            name: nil];
           if (is_channels_last) {
             outputTensor = mps::convertNHWCtoNCHW(mpsGraph, outputTensor);
           }
@@ -161,7 +206,6 @@ Tensor _mps_convolution(
                                                secondaryTensor: biasTensor
                                                           name: nil];
           }
-
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->weightTensor_ = weightTensor;
           newCachedGraph->biasTensor_ = biasTensor;
@@ -197,11 +241,23 @@ Tensor _mps_convolution(
   return *output;
 }
 
+Tensor _mps_convolution(
+    const Tensor& input_t,
+    const Tensor& weight_t,
+    const c10::optional<Tensor>& bias_opt,
+    IntArrayRef padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups) {
+    return _mps_convolution_impl(input_t, weight_t, bias_opt, padding, stride, dilation, groups, c10::nullopt);
+}
+
 Tensor mps_convolution_backward_input(
     IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
   namespace native_mps = at::native::mps;
   using namespace mps;
+  TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types");
   CheckedFrom c = "mps_convolution_backward_input";
   TensorArg grad_output{ grad_output_t, "grad_output", 1 },
             weight{ weight_t, "weight", 2 };
@@ -209,14 +265,7 @@ Tensor mps_convolution_backward_input(
   checkAllSameGPU(c, {grad_output, weight});
   auto memory_format = grad_output_t.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
-
-  auto grad_input_t = at::empty(
-                    input_size,
-                    grad_output->scalar_type(),
-                    c10::nullopt,
-                    kMPS,
-                    c10::nullopt,
-                    c10::nullopt);
+  auto grad_input_t = at::empty( input_size, grad_output_t.options(), c10::nullopt);
 
   // Avoid "grad_input" when this is being used as transposed convolution
   TensorArg grad_input{ grad_input_t, "result", 0 };
@@ -270,25 +319,48 @@ Tensor mps_convolution_backward_input(
           MPSGraph* mpsGraph = native_mps::make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphConvolution2DOpDescriptor *descriptor_ = [[MPSGraphConvolution2DOpDescriptor new] autorelease];
-          fill_conv_desc(descriptor_, stride[1], stride[0],
-                                      dilation[1], dilation[0],
-                                      padding[1], padding[0],
-                                      at::MemoryFormat::Contiguous, groups);
+          MPSGraphConvolution2DOpDescriptor *conv2dDescriptor_ = [[MPSGraphConvolution2DOpDescriptor new] autorelease];
+          MPSGraphDepthwiseConvolution3DOpDescriptor *depthWiseConv3dDescriptor_ = [[MPSGraphDepthwiseConvolution3DOpDescriptor new]  autorelease];
+
+          MPSShape* weightOutputShape = mps::getMPSShape(weight_t);
+          // Depthwise conv is input feature channels = groups. So I in OIHW has to be 1.
+          bool isDepthwiseConv = ((groups > 1 && (weightOutputShape[1].intValue == 1)) &&
+                                  gradOutputShape.count >= 4 && weightOutputShape.count >= 4 && !is_channels_last);
+
+          if(isDepthwiseConv) {
+            fill_depthwise_conv_desc(depthWiseConv3dDescriptor_, stride[1], stride[0],
+                                              dilation[1], dilation[0],
+                                              padding[1], padding[0],
+                                              at::MemoryFormat::Contiguous, groups);
+          } else {
+            fill_conv_desc(conv2dDescriptor_, stride[1], stride[0],
+                                        dilation[1], dilation[0],
+                                        padding[1], padding[0],
+                                        at::MemoryFormat::Contiguous, groups);
+          }
 
           MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(grad_output_t.scalar_type()), gradOutputShape);
           MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
 
           MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor;
-          if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) {
+          if (is_channels_last) {
             gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose);
           }
-
-          MPSGraphTensor* gradInputTensor = [mpsGraph convolution2DDataGradientWithIncomingGradientTensor:gradOutputTensorTranspose
-                                                                                            weightsTensor:weightTensor
-                                                                                              outputShape:mps_input_shape
-                                                                             forwardConvolutionDescriptor:descriptor_
-                                                                                                     name:nil];
+          MPSGraphTensor* gradInputTensor;
+          if(isDepthwiseConv) {
+              MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor dimension:-3 withDimension:-4 name:nil];
+              gradInputTensor = [mpsGraph depthwiseConvolution3DDataGradientWithIncomingGradientTensor:gradOutputTensorTranspose
+                                                                                weightsTensor:weightTransposeTensor
+                                                                                  outputShape:mps_input_shape
+                                                                                   descriptor:depthWiseConv3dDescriptor_
+                                                                                         name:nil];
+          } else {
+              gradInputTensor = [mpsGraph convolution2DDataGradientWithIncomingGradientTensor:gradOutputTensorTranspose
+                                                                                weightsTensor:weightTensor
+                                                                                  outputShape:mps_input_shape
+                                                                 forwardConvolutionDescriptor:conv2dDescriptor_
+                                                                                         name:nil];
+          }
 
           newCachedGraph->gradOutputTensor_ = gradOutputTensor;
           newCachedGraph->weightTensor_ = weightTensor;
@@ -318,17 +390,15 @@ Tensor mps_convolution_backward_input(
 }
 
 Tensor mps_convolution_backward_weights(
-    IntArrayRef weight_size, const Tensor& grad_output_, const Tensor& input_,
+    IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
   namespace native_mps = at::native::mps;
   using namespace mps;
+  TORCH_CHECK(isFloatingType(grad_output_t.scalar_type()), "Convolution is supported only for Floating types");
   CheckedFrom c = "mps_convolution_backward_weights";
-  auto memory_format = input_.suggest_memory_format();
+  auto memory_format = grad_output_t.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
 
-  auto grad_output_t = grad_output_.to(memory_format);
-  auto input_t = input_.to(memory_format);
-
   MPSShape* gradOutputShape = mps::getMPSShape(grad_output_t, memory_format);
 
   // For uniformity with everything else, although it seems grad_weight
@@ -345,7 +415,7 @@ Tensor mps_convolution_backward_weights(
                           c10::nullopt,
                           kMPS,
                           c10::nullopt,
-                          memory_format);
+                          c10::nullopt);
   TensorArg grad_weight{ grad_weight_t, "result", 0 };
 
   convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
@@ -395,26 +465,48 @@ Tensor mps_convolution_backward_weights(
           MPSGraph* mpsGraph = native_mps::make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphConvolution2DOpDescriptor *descriptor_ = [[MPSGraphConvolution2DOpDescriptor new] autorelease];
-          fill_conv_desc(descriptor_, stride[1], stride[0],
-                                      dilation[1], dilation[0],
-                                      padding[1], padding[0],
-                                      at::MemoryFormat::Contiguous, groups);
+          MPSGraphConvolution2DOpDescriptor *conv2dDescriptor_ = [[MPSGraphConvolution2DOpDescriptor new] autorelease];
+          MPSGraphDepthwiseConvolution3DOpDescriptor *depthWiseConv3dDescriptor_ = [[MPSGraphDepthwiseConvolution3DOpDescriptor new]  autorelease];
+          MPSShape* inputShape = mps::getMPSShape(input_t);
+          bool isDepthwiseConv = ((groups > 1 && (mps_weight_shape[1].intValue == 1)) && inputShape.count >= 4 && mps_weight_shape.count >= 4 && !is_channels_last);
+
+          if(isDepthwiseConv) {
+            fill_depthwise_conv_desc(depthWiseConv3dDescriptor_, stride[1], stride[0],
+                                                dilation[1], dilation[0],
+                                                padding[1], padding[0],
+                                                at::MemoryFormat::Contiguous, groups);
+          } else {
+              fill_conv_desc(conv2dDescriptor_, stride[1], stride[0],
+                                                  dilation[1], dilation[0],
+                                                  padding[1], padding[0],
+                                                  at::MemoryFormat::Contiguous, groups);
+          }
 
           MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(grad_output_t.scalar_type()), gradOutputShape);
           MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
 
           MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor;
-          if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) {
+          if (is_channels_last) {
             gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose);
           }
 
-          MPSGraphTensor* gradWeightTensor = [mpsGraph convolution2DWeightsGradientWithIncomingGradientTensor:gradOutputTensorTranspose
-                                                                                                 sourceTensor:inputTensor
-                                                                                                  outputShape:mps_weight_shape
-                                                                                 forwardConvolutionDescriptor:descriptor_
-                                                                                                         name:nil];
-
+          MPSGraphTensor* gradWeightTensor;
+          if(isDepthwiseConv) {
+              NSNumber* outputFeatChannelDim = mps_weight_shape[0];
+              MPSShape* weightShapeTranspose = @[@1, outputFeatChannelDim, mps_weight_shape[2], mps_weight_shape[3]];
+              MPSGraphTensor* gradWeightTensorTranspose = [mpsGraph depthwiseConvolution3DWeightsGradientWithIncomingGradientTensor:gradOutputTensorTranspose
+                                                                                              sourceTensor:inputTensor
+                                                                                               outputShape:weightShapeTranspose
+                                                                                                descriptor:depthWiseConv3dDescriptor_
+                                                                                                      name:nil];
+              gradWeightTensor = [mpsGraph transposeTensor:gradWeightTensorTranspose dimension:-3 withDimension:-4 name:nil];
+          } else {
+              gradWeightTensor = [mpsGraph convolution2DWeightsGradientWithIncomingGradientTensor:gradOutputTensorTranspose
+                                                                                               sourceTensor:inputTensor
+                                                                                                outputShape:mps_weight_shape
+                                                                               forwardConvolutionDescriptor:conv2dDescriptor_
+                                                                                                       name:nil];
+          }
           newCachedGraph->gradOutputTensor_ = gradOutputTensor;
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->gradWeightTensor_ = gradWeightTensor;
@@ -444,12 +536,9 @@ Tensor mps_convolution_backward_weights(
 }
 
 std::tuple<at::Tensor,at::Tensor,at::Tensor> mps_convolution_backward(
-    const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
+    const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
     std::array<bool,3> output_mask) {
-
-  Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format());
-
   Tensor grad_input, grad_weight, grad_bias;
   if (input.numel() == 0) {
     if (output_mask[0]) {
@@ -484,6 +573,7 @@ Tensor _mps_convolution_transpose(
     const Tensor& input_t, const Tensor& weight_t,
     IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation,
     int64_t groups) {
+  TORCH_CHECK(input_t.dim() < 5, "ConvTranspose 3D is not supported on MPS");
 
   auto output_t = mps_convolution_transpose_forward(
     input_t, weight_t, padding, output_padding, stride, dilation, groups);
@@ -494,10 +584,10 @@ Tensor _mps_convolution_transpose(
 Tensor mps_convolution_transpose_backward_input(
     const Tensor& grad_output_t, const Tensor& weight_t,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
-    int64_t groups)
+    int64_t groups, IntArrayRef input_shape)
 {
-  return at::_mps_convolution(
-    grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups);
+  return _mps_convolution_impl(
+    grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups, input_shape);
 }
 
 Tensor mps_convolution_transpose_backward_weight(
@@ -513,15 +603,12 @@ Tensor mps_convolution_transpose_backward_weight(
 
 
 std::tuple<Tensor,Tensor> mps_convolution_transpose_backward(
-    const Tensor& input, const Tensor& grad_output_t, const Tensor& weight,
+    const Tensor& input, const Tensor& grad_output, const Tensor& weight,
     IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
     std::array<bool,2> output_mask) {
-
-  Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format());
-
   Tensor grad_input, grad_weight;
   if (output_mask[0]) {
-    grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups);
+    grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, input.sizes());
   }
   if (output_mask[1]) {
     grad_weight = mps_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups);
diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm
index 17aa58d3d69e..16dbdbc51d89 100644
--- a/aten/src/ATen/native/mps/operations/Copy.mm
+++ b/aten/src/ATen/native/mps/operations/Copy.mm
@@ -1,17 +1,7 @@
 //  Copyright © 2022 Apple Inc.
 
-#include <ATen/mps/MPSStream.h>
 #include <ATen/native/mps/Copy.h>
 #include <ATen/native/mps/OperationUtils.h>
-#include <iostream>
-#include <cstring>
-#include <ATen/ATen.h>
-#include <ATen/Tensor.h>
-#include <ATen/Utils.h>
-#include <torch/library.h>
-#include <ATen/native/Resize.h>
-#include <c10/util/Optional.h>
-
 
 namespace at::native {
 namespace mps {
@@ -84,7 +74,11 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
           newCachedGraph = new CachedGraph(mpsGraph);
 
           MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, src);
-          MPSGraphTensor* outputTensor = [mpsGraph castTensor:inputTensor toType:dstDType name:@"cast"];
+          MPSGraphTensor* inputCastTensor = inputTensor;
+          if (isFloatingType(src.scalar_type()) && dstDType == MPSDataTypeUInt8) {
+            inputCastTensor = [mpsGraph castTensor:inputTensor toType:MPSDataTypeInt32 name:@"cast"];
+          }
+          MPSGraphTensor* outputTensor = [mpsGraph castTensor:inputCastTensor toType:dstDType name:@"cast"];
 
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->outputTensor_ = outputTensor;
@@ -101,26 +95,26 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
                                    autorelease];
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{cachedGraph->inputTensor_: srcData};
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{cachedGraph->outputTensor_: dstData};
-    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-    if (!non_blocking)
-      stream->synchronize(SyncType::COMMIT_AND_WAIT);
+    stream->executeMPSGraph(cachedGraph->graph(), feeds, results, !non_blocking ? SyncType::COMMIT_AND_WAIT : SyncType::COMMIT_ADAPTIVE);
   }
 }
 
 static at::Tensor& copy_from_mps_(at::Tensor& dst_, const at::Tensor& src_, bool non_blocking)
 {
+  auto sameMemFormat = src_.is_contiguous(dst_.suggest_memory_format()) && dst_.is_contiguous(dst_.suggest_memory_format());
+
   id<MTLDevice> device = MPSDevice::getInstance()->device();
   MPSStream* stream = getCurrentMPSStream();
   Tensor dst;
   Tensor src;
-  if (!dst_.is_contiguous()) {
+  if (!dst_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) {
     dst = at::empty_like(dst_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
   } else {
     dst = dst_;
   }
 
   auto storage_byte_offset = src_.storage_offset() * src_.itemsize();
-  if (!src_.is_contiguous()) {
+  if (!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) {
     Tensor emptyShell = Tensor();
     src = gatherViewTensor(src_, emptyShell);
     if (src.has_storage()) {
@@ -254,16 +248,21 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
 
   // If dst is contiguous and there is no byte offset, we can save directly the result of
   // gather into dst. This reduces the overhead of doing an additional blit for most cases
-  bool returnGatherOutput = (dst_.is_contiguous() && !dst_byte_offset && src_.dtype() == dst_.dtype());
+  bool returnGatherOutput = dst_.is_contiguous();
   Tensor src;
+  auto sameMemFormat = src_.is_contiguous(dst_.suggest_memory_format()) && dst_.is_contiguous(dst_.suggest_memory_format());
+  const bool sameDataType = src_.dtype() == dst_.dtype();
 
-  if (src_.is_view() || !src_.is_contiguous()) {
+  if ((!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) ||
+      // the copy_cast path requires storage_offset to be applied before casting
+      (src_.storage_offset() && !sameDataType)) {
     Tensor emptyShell = Tensor();
     src = gatherViewTensor(src_, returnGatherOutput ? dst_ : emptyShell);
 
     if (src.has_storage()) {
-      if (returnGatherOutput)
+      if (returnGatherOutput) {
         return dst_;
+      }
 
       src_byte_offset = 0;
     } else {
@@ -279,19 +278,25 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
   // Scatter to `dst` if the memory is not contiguous
   // If the memory is not contiguous, it means that the tensor has strides and we would not be
   // able to do the copy using a single blit
-  if (!dst_.is_contiguous()) {
+  if (!dst_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) {
     return scatterViewTensor(src, dst_);
   }
   src._set_conj(src_.is_conj());
   src._set_neg(src_.is_neg());
 
-  const size_t src_size = src.nbytes();
-  if (src.dtype() == dst_.dtype()) {
-    MPSStream* stream = getCurrentMPSStream();
+  MPSStream* stream = getCurrentMPSStream();
+  if (sameDataType) {
     // for GPU to GPU copies we only encode to stream's command buffer (no flushing)
-    stream->copy(sourceBuffer, destBuffer, src_size, src_byte_offset, dst_byte_offset);
+    stream->copy(sourceBuffer, destBuffer, src.nbytes(), src_byte_offset, dst_byte_offset);
   } else {
-    copy_cast_mps(dst_, src, destBuffer, sourceBuffer);
+    if (dst_byte_offset) {
+       auto tmp = at::native::empty_mps(dst_.sizes(), dst_.scalar_type(), c10::nullopt, kMPS);
+       auto tmpBuffer = getMTLBufferStorage(tmp);
+       copy_cast_mps(tmp, src, tmpBuffer, sourceBuffer);
+       stream->copy(tmpBuffer, destBuffer, dst_.nbytes(), 0, dst_byte_offset);
+    } else {
+       copy_cast_mps(dst_, src, destBuffer, sourceBuffer);
+    }
   }
   return dst_;
 }
@@ -301,22 +306,27 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
   TORCH_CHECK(dst.defined(), "dst is undefined");
   TORCH_CHECK(src.defined(), "src is undefined");
 
+  bool needs_broadcasting = false;
+
   if (src.numel() == 0 || dst.is_same(src)) {
     return dst;
   }
   if (dst.numel() == 0) {
     dst.resize_as_(src);
   }
+  if (dst.dim() > src.dim()) {
+    needs_broadcasting = true;
+  }
 
   if (src.device().type() == at::kMPS && dst.device().type() == at::kCPU) {
-    return copy_from_mps_(dst, src, non_blocking);
+    return copy_from_mps_(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking);
   }
   if (src.device().type() == at::kCPU && dst.device().type() == at::kMPS) {
-    return copy_to_mps_(dst, src, non_blocking);
+    return copy_to_mps_(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking);
   }
 
   if (src.device().type() == at::kMPS && dst.device().type() == at::kMPS) {
-    return copy_kernel_mps(dst, src, non_blocking);
+    return copy_kernel_mps(dst, needs_broadcasting ? src.expand_as(dst) : src, non_blocking);
   }
   TORCH_INTERNAL_ASSERT(
       src.device().type() == DeviceType::MPS,
diff --git a/aten/src/ATen/native/mps/operations/GridSampler.mm b/aten/src/ATen/native/mps/operations/GridSampler.mm
new file mode 100644
index 000000000000..7bf2d5f471ed
--- /dev/null
+++ b/aten/src/ATen/native/mps/operations/GridSampler.mm
@@ -0,0 +1,156 @@
+#include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/GridSamplerUtils.h>
+#include <ATen/native/mps/MPSGraphVenturaOps.h>
+
+namespace at {
+namespace native {
+
+void grid_sampler_2d_mps_impl(Tensor &output, const Tensor& input, const Tensor& grid,
+                              int64_t interpolation_mode, int64_t padding_mode,
+                              bool align_corners) {
+// Grid Sampler support has been added in macOS 13.1
+#if defined(__MAC_13_2)
+  using namespace mps;
+  check_grid_sampler_common(input, grid);
+  check_grid_sampler_2d(input, grid);
+
+  MPSGraphResizeMode samplingMode;
+  MPSGraphPaddingMode paddingMode;
+
+  auto memory_format = input.suggest_memory_format();
+  MPSGraphTensorNamedDataLayout inputTensorLayout =
+      (memory_format == at::MemoryFormat::Contiguous) ? MPSGraphTensorNamedDataLayoutNCHW : MPSGraphTensorNamedDataLayoutNHWC;
+
+  switch (static_cast<GridSamplerPadding>(padding_mode)) {
+    case GridSamplerPadding::Zeros:
+      paddingMode = MPSGraphPaddingModeZero; break;
+    case GridSamplerPadding::Border:
+      TORCH_CHECK(false, "MPS: Unsupported Border padding mode"); break;
+    case GridSamplerPadding::Reflection:
+      paddingMode = align_corners == true ? MPSGraphPaddingModeReflect : MPSGraphPaddingModeSymmetric; break;
+    default:
+      TORCH_CHECK(false, "MPS: Unrecognised Padding Mode: ", padding_mode);
+  }
+
+  switch (static_cast<GridSamplerInterpolation>(interpolation_mode)) {
+    case GridSamplerInterpolation::Bilinear:
+      samplingMode = MPSGraphResizeBilinear; break;
+    case GridSamplerInterpolation::Nearest:
+      samplingMode = MPSGraphResizeNearest; break;
+    case GridSamplerInterpolation::Bicubic:
+      TORCH_CHECK(false, "MPS: Unsupported Bicubic interpolation"); break;
+    default:
+      TORCH_CHECK(false, "MPS: Unrecognised interpolation mode: ", interpolation_mode); break;
+   }
+
+  MPSStream *stream = getCurrentMPSStream();
+
+  struct CachedGraph : public MPSCachedGraph {
+    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+    MPSGraphTensor* inputTensor_ = nil;
+    MPSGraphTensor* gridTensor_ = nil;
+    MPSGraphTensor* outputTensor_ = nil;
+  };
+
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+
+  @autoreleasepool {
+    string key = "grid_sampler_2d_mps"                     +
+                  getTensorsStringKey({input, grid})       +
+                  ":" + std::to_string(interpolation_mode) +
+                  ":" + std::to_string(padding_mode)       +
+                  ":" + std::to_string(align_corners);
+
+    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    if(!cachedGraph) {
+      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+
+        CachedGraph *newCachedGraph = nil;
+        @autoreleasepool {
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
+          MPSGraphTensor* gridTensor = mpsGraphRankedPlaceHolder(mpsGraph, grid);
+
+          MPSGraphTensor* outputTensor = nil;
+          if (static_cast<GridSamplerInterpolation>(interpolation_mode) == GridSamplerInterpolation::Nearest) {
+            outputTensor = [mpsGraph sampleGridWithSourceTensor: inputTensor
+                                               coordinateTensor: gridTensor
+                                                         layout: inputTensorLayout
+                                           normalizeCoordinates: TRUE
+                                            relativeCoordinates: FALSE
+                                                   alignCorners: align_corners
+                                                    paddingMode: paddingMode
+                                            nearestRoundingMode: MPSGraphResizeNearestRoundingModeRoundToEven
+                                                  constantValue: 0.0f
+                                                           name: nil];
+          } else {
+            outputTensor = [mpsGraph sampleGridWithSourceTensor: inputTensor
+                                               coordinateTensor: gridTensor
+                                                         layout: inputTensorLayout
+                                           normalizeCoordinates: TRUE
+                                            relativeCoordinates: FALSE
+                                                   alignCorners: align_corners
+                                                    paddingMode: paddingMode
+                                                   samplingMode: samplingMode
+                                                  constantValue: 0.0f
+                                                           name: nil];
+          }
+
+          newCachedGraph->inputTensor_ = inputTensor;
+          newCachedGraph->gridTensor_ = gridTensor;
+          newCachedGraph->outputTensor_ = outputTensor;
+        }
+        return newCachedGraph;
+      });
+      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+    }
+
+    Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input);
+    Placeholder gridPlaceholder = Placeholder(cachedGraph->gridTensor_, grid);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
+      gridPlaceholder.getMPSGraphTensor() : gridPlaceholder.getMPSGraphTensorData()
+    };
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+    };
+
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+#endif // defined(__MAC_13_2)
+}
+
+Tensor grid_sampler_2d_mps(const Tensor& input, const Tensor& grid,
+                           int64_t interpolation_mode, int64_t padding_mode,
+                           bool align_corners) {
+#if defined(__MAC_13_2)
+  bool xcode_sdk_13_2_or_higher = true;
+#else
+  bool xcode_sdk_13_2_or_higher = false;
+#endif
+
+  if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS) || !xcode_sdk_13_2_or_higher) {
+    TORCH_WARN_ONCE("MPS: grid_sampler_2d op is supported natively starting from macOS 13.1. ",
+                    "Falling back on CPU. This may have performance implications.");
+
+    return at::grid_sampler_2d(
+      input.to("cpu"), grid.to("cpu"), interpolation_mode, padding_mode, align_corners).clone().to("mps");
+  }
+
+  auto in_size = input.sizes();
+  auto grid_size = grid.sizes();
+  auto output = at::empty(
+      {in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options());
+
+  grid_sampler_2d_mps_impl(
+      output, input, grid, interpolation_mode, padding_mode, align_corners);
+  return output;
+}
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index bea033815132..0af63e1a4a06 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -37,8 +37,9 @@ bool dispatchIndexKernel(TensorIteratorBase& iter,
                          bool accumulate) {
   using namespace mps;
 
- if (iter.numel() == 0)
+ if (iter.numel() == 0) {
     return true;
+  }
 
   const Tensor& inputTensor = iter.tensor(1);
   Tensor outputTensor = iter.tensor(0);
@@ -139,7 +140,7 @@ bool dispatchIndexKernel(TensorIteratorBase& iter,
                 threadsPerThreadgroup: threadGroupSize];
 
       [computeEncoder endEncoding];
-      mpsStream->commit(true);
+      mpsStream->synchronize(SyncType::COMMIT_AND_CONTINUE);
     }
   });
 
@@ -227,6 +228,12 @@ Tensor nonzero_fallback(const Tensor& self) {
       return out_;
   }
 
+  int64_t nDim = self.dim();
+  if (self.numel() == 0) {
+    at::native::resize_output(out_, {0, nDim});
+    return out_;
+  }
+
   using namespace mps;
   const uint32_t maxDimensions = 16;
 
@@ -245,32 +252,24 @@ Tensor nonzero_fallback(const Tensor& self) {
     MPSGraphTensor* inputTensor_ = nil;
     MPSGraphTensor* outputTensor_ = nil;
     MPSGraphTensor* scatterDataTensor_ = nil;
+    MPSGraphTensor* countNonzeroTensor_ = nil;
   };
 
-  int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
-  int64_t nDim = self.dim();
-  at::native::resize_output(out_, {total_nonzero, nDim});
-  if (out_.numel() ==  0) {
-    return out_;
-  }
-
-  bool contiguous_output = (out_.is_contiguous() && !out_.is_view());
-  Tensor out = out_;
-  if (!contiguous_output) {
-    out = at::native::empty_mps(
-           out_.sizes(),
+  stream->synchronize(SyncType::COMMIT_AND_WAIT);
+  Tensor count_nonzero = at::empty({1}, self.options().dtype(kInt));
+  Tensor out =  at::native::empty_mps(
+           {self.numel(), nDim == 0 ? 1 : nDim},
            out_.scalar_type(),
            c10::nullopt,
            kMPS,
            c10::nullopt,
            c10::nullopt);
-  }
 
   int64_t _apparentInputShape = 1;
   for (auto dim : self.sizes()) {
     _apparentInputShape *= dim;
   }
-  MPSShape *apparentOutputShape = @[@(total_nonzero * nDim)];
+  MPSShape *apparentOutputShape = @[@(self.numel() * nDim)];
   MPSShape *apparentInputShape = @[@(_apparentInputShape)];
 
   // Pseudocode:
@@ -304,6 +303,9 @@ Tensor nonzero_fallback(const Tensor& self) {
           MPSGraphTensor *inputNotEqualToZeroTensor = [mpsGraph notEqualWithPrimaryTensor:inputTensor
                                                                           secondaryTensor:zeroTensor
                                                                                      name:nil];
+          MPSGraphTensor *countNonzero = [mpsGraph reductionSumWithTensor:inputNotEqualToZeroTensor
+                                                         axis:0
+                                                         name:nil];
           MPSGraphTensor *maskTensor = [mpsGraph castTensor:inputNotEqualToZeroTensor
                                                      toType:MPSDataTypeInt32
                                                        name:@"castToInt32"];
@@ -352,6 +354,7 @@ Tensor nonzero_fallback(const Tensor& self) {
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->scatterDataTensor_ = scatterDataTensor;
           newCachedGraph->outputTensor_ = outputTensor;
+          newCachedGraph->countNonzeroTensor_ = countNonzero;
         }
         return newCachedGraph;
       });
@@ -359,8 +362,9 @@ Tensor nonzero_fallback(const Tensor& self) {
     }
 
     Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, apparentInputShape);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, contiguous_output ? out_ : out, apparentOutputShape);
-    Placeholder scatterPlaceholder = Placeholder(cachedGraph->scatterDataTensor_, contiguous_output ? out_ : out, apparentOutputShape);
+    Placeholder countNonzeroPlaceholder = Placeholder(cachedGraph->countNonzeroTensor_, count_nonzero);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out, apparentOutputShape);
+    Placeholder scatterPlaceholder = Placeholder(cachedGraph->scatterDataTensor_, out, apparentOutputShape);
 
     // Create dictionary of inputs and outputs
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
@@ -369,15 +373,16 @@ Tensor nonzero_fallback(const Tensor& self) {
     };
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(),
+      countNonzeroPlaceholder.getMPSGraphTensor() : countNonzeroPlaceholder.getMPSGraphTensorData()
     };
 
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-    if (!contiguous_output) {
-      out_.copy_(out);
-    }
   }
 
+  int32_t total_nonzero = count_nonzero.item<int32_t>();
+  at::native::resize_output(out_, {total_nonzero, nDim});
+  out_.copy_(out.resize_({total_nonzero, nDim}));
   return out_;
 }
 
@@ -434,7 +439,16 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
   using CachedGraph = mps::MPSUnaryCachedGraph;
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-
+  MPSDataType inputDataType = getMPSScalarType(self.scalar_type());
+  MPSDataType outputDataType = getMPSScalarType(self.scalar_type());
+  if (!is_macos_13_or_newer()) {
+     if (self.scalar_type() == kBool) {
+      inputDataType = MPSDataTypeInt8;
+     }
+     if (result.scalar_type() == kBool) {
+      outputDataType = MPSDataTypeInt8;
+     }
+  }
   @autoreleasepool {
     NSString* ns_dims_key = [[ns_dims valueForKey:@"description"] componentsJoinedByString:@","];
     // A key is used to identify the MPSGraph which was created once, and can be reused if the parameters, data types etc match the earlier created MPSGraph
@@ -449,7 +463,7 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, inputDataType, getMPSShape(self));
           MPSGraphTensor* outputTensor = [mpsGraph reverseTensor:inputTensor
                                                             axes:ns_dims
                                                             name:nil];
@@ -461,8 +475,10 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
     }
 
     // Create placeholders which use the keys of the CachedGraph to create inputs and outputs of the operation
-    Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result);
+    Placeholder inputPlaceholder = Placeholder(
+      cachedGraph->inputTensor_, self, /*mpsShape*/nil, /*gatherTensorData=*/true, inputDataType);
+    Placeholder outputPlaceholder = Placeholder(
+      cachedGraph->outputTensor_, result, /*mpsShape*/nil, /*gatherTensorData=*/false, outputDataType);
 
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
@@ -628,6 +644,11 @@ Tensor index_select_mps(const Tensor & self,
   TORCH_CHECK(dim == 0 || dim < self.dim(),
               "index_select(): Indexing dim ", dim, " is out of bounds of tensor");
 
+  // Empty index
+  if (index.numel() == 0) {
+    return output;
+  }
+
   // Scalar input
   if (self.dim() == 0 && self.numel() == 1){
     output.copy_(self);
@@ -646,12 +667,15 @@ Tensor index_select_mps(const Tensor & self,
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
   auto inputType = getMPSDataType(self.scalar_type());
   auto outputType = getMPSDataType(output.scalar_type());
-  if (inputType == MPSDataTypeUInt8 || inputType == MPSDataTypeBool) {
-      inputType = MPSDataTypeInt8;
+  if (inputType == MPSDataTypeUInt8 ||
+     (!is_macos_13_or_newer() && inputType == MPSDataTypeBool)) {
+    inputType = MPSDataTypeInt8;
   }
-  if (outputType == MPSDataTypeUInt8 || outputType == MPSDataTypeBool) {
-      outputType = MPSDataTypeInt8;
+  if (outputType == MPSDataTypeUInt8 ||
+     (!is_macos_13_or_newer() && outputType == MPSDataTypeBool)) {
+    outputType = MPSDataTypeInt8;
   }
+
   @autoreleasepool {
 
     string key = "index_select_out_mps" + getTensorsStringKey({self, index}) + ":" + std::to_string(dim);
@@ -705,6 +729,10 @@ Tensor index_select_mps(const Tensor & self,
 
 Tensor & masked_fill__mps(Tensor& self, const Tensor & mask, const Scalar& value) {
   using namespace mps;
+
+  if (self.numel() == 0) {
+    return self;
+  }
   TORCH_CHECK(self.device() == mask.device(), "expected self and mask to be on the same device, but got mask on ",
     mask.device(), " and self on ", self.device());
   TORCH_CHECK(mask.scalar_type() == kByte || mask.scalar_type() == kBool,
@@ -718,14 +746,29 @@ Tensor index_select_mps(const Tensor & self,
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
     MPSGraphTensor *inputTensor_ = nil;
     MPSGraphTensor *maskTensor_ = nil;
+    MPSGraphTensor *valueTensor_ = nil;
     MPSGraphTensor *outputTensor_ = nil;
   };
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
+  MPSDataType inputDataType = getMPSScalarType(self.scalar_type());
+  MPSDataType maskDataType = getMPSScalarType(b_mask->scalar_type());
+  // Workaround for `selectWithPredicateTensor` on macOS Monterey where bool data type may cause a hang
+  // The issue is fixed in macOS Ventura (13.0)
+  if (!is_macos_13_or_newer()) {
+     if (self.scalar_type() == kBool) {
+      inputDataType = MPSDataTypeInt8;
+     }
+     if (mask.scalar_type() == kBool) {
+      maskDataType = MPSDataTypeInt8;
+     }
+  }
+
   MPSStream* stream = getCurrentMPSStream();
+  MPSScalar valueScalar = getMPSScalar(value, value.type());
   @autoreleasepool {
-    string key = "masked_fill" + getTensorsStringKey({self, mask}) + ":" + std::to_string(value.toDouble());
+    string key = "masked_fill" + getTensorsStringKey({self, *b_mask}) + ":" + getMPSTypeString(value.type());
     CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
     if(!cachedGraph) {
       cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
@@ -736,42 +779,44 @@ Tensor index_select_mps(const Tensor & self,
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
-          MPSGraphTensor* maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, mask);
-          MPSDataType valueType = getMPSScalarType(value.type());
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, inputDataType, getMPSShape(self));
+          MPSGraphTensor* maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, maskDataType, getMPSShape(*b_mask));
+          MPSGraphTensor* valueTensor = mpsGraphScalarPlaceHolder(mpsGraph, value);
 
-          // constantWithScalar doesn't like Bool constants getting created so
-          // mapping them to int8
-          if (valueType == MPSDataTypeBool) {
-            valueType = MPSDataTypeInt8;
+          MPSDataType valueType = getMPSScalarType(value.type());
+          MPSGraphTensor* castValueTensor = valueTensor;
+          if (valueType != inputDataType) {
+            castValueTensor = [mpsGraph castTensor:valueTensor
+                                            toType:inputDataType
+                                              name:@"castValueTensor"];
           }
-          MPSGraphTensor* valueTensor =  [mpsGraph constantWithScalar:value.to<double>()
-                                                            dataType:valueType];
-          valueTensor = [mpsGraph castTensor:valueTensor
-                                          toType:getMPSDataType(self.scalar_type())
-                                           name : @"castTensorEq"];
 
           MPSGraphTensor* outputTensor = [mpsGraph selectWithPredicateTensor:maskTensor
-                                                        truePredicateTensor:valueTensor
+                                                        truePredicateTensor:castValueTensor
                                                         falsePredicateTensor:inputTensor
                                                              name:nil];
 
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->maskTensor_ = maskTensor;
+          newCachedGraph->valueTensor_ = valueTensor;
           newCachedGraph->outputTensor_ = outputTensor;
         }
         return newCachedGraph;
       });
     }
 
-    Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensor_, self);
-    Placeholder maskPlaceholder   = Placeholder(cachedGraph->maskTensor_, mask);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, self);
+    Placeholder selfPlaceholder   = Placeholder(
+      cachedGraph->inputTensor_, self, /*mpsShape*/nil, /*gatherTensorData=*/true, inputDataType);
+    Placeholder maskPlaceholder   = Placeholder(
+      cachedGraph->maskTensor_, *b_mask, /*mpsShape*/nil, /*gatherTensorData=*/true, maskDataType);
+    Placeholder outputPlaceholder = Placeholder(
+      cachedGraph->outputTensor_, self, /*mpsShape*/nil, /*gatherTensorData=*/false, inputDataType);
 
     // Create dictionary of inputs and outputs
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
-      maskPlaceholder.getMPSGraphTensor() : maskPlaceholder.getMPSGraphTensorData()
+      maskPlaceholder.getMPSGraphTensor() : maskPlaceholder.getMPSGraphTensorData(),
+      cachedGraph->valueTensor_ : getMPSGraphTensorFromScalar(stream, valueScalar)
     };
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
@@ -779,7 +824,6 @@ Tensor index_select_mps(const Tensor & self,
     };
 
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-
   }
   namedinference::propagate_names_if_nonempty(self, maybe_outnames);
   return self;
@@ -842,19 +886,31 @@ Tensor embedding_dense_backward_mps(
 
             MPSGraphTensor* reshapedIndicesTensor = indicesTensor;
 
+            MPSGraphTensor* castGradTensor = incomingGradTensor;
+            MPSDataType dataType = mps::getMPSDataType(grad_.scalar_type());
+            // issue 105486100, scatterNDWithUpdatesTensor produces wrong result for float16
+            if (dataType == MPSDataTypeFloat16) {
+              castGradTensor = [mpsGraph castTensor: incomingGradTensor
+                                             toType: MPSDataTypeFloat32
+                                               name: @"castGradTensor"];
+            }
             if (num_indices_dims != 0) {
               reshapedIndicesTensor = [mpsGraph  expandDimsOfTensor: indicesTensor
                                                                axes: @[@-1]
                                                                name: nil];
             }
 
-            auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: incomingGradTensor
+            auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: castGradTensor
                                                              indicesTensor: reshapedIndicesTensor
                                                                      shape: native_mps::getMPSShape(IntArrayRef(outgoing_gradient_shape))
                                                            batchDimensions: 0
                                                                       mode: MPSGraphScatterModeAdd
                                                                       name: @"edb"];
-
+            if (dataType == MPSDataTypeFloat16) {
+              outgoingGradTensor = [mpsGraph castTensor: outgoingGradTensor
+                                                 toType: MPSDataTypeFloat16
+                                                   name: @"castGradTensor"];
+            }
             newCachedGraph->incomingGradTensor_ = incomingGradTensor;
             newCachedGraph->indicesTensor_ = indicesTensor;
             newCachedGraph->outgoingGradTensor_ = outgoingGradTensor;
@@ -886,6 +942,62 @@ Tensor embedding_dense_backward_mps(
   return masked_fill__mps(self, mask, value.item());
 }
 
+Tensor & masked_scatter__mps(Tensor& self, const Tensor& mask, const Tensor& source) {
+  at::assert_no_internal_overlap(self);
+  TORCH_CHECK(
+      self.scalar_type() == source.scalar_type(),
+      "masked_scatter: expected self and source to have same dtypes but got",
+      self.scalar_type(),
+      " and ",
+      source.scalar_type());
+
+  if (self.numel() == 0) {
+    return self;
+  }
+
+  TORCH_CHECK(mask.scalar_type() == ScalarType::Byte || mask.scalar_type() == ScalarType::Bool,
+              "masked_scatter: expected BoolTensor or ByteTensor for mask");
+
+  auto mask_temp = (mask.dim() == 0)
+    ? c10::MaybeOwned<Tensor>::owned(mask.unsqueeze(0))
+    : c10::MaybeOwned<Tensor>::borrowed(mask);
+  auto self_temp = (self.dim() == 0)
+    ? c10::MaybeOwned<Tensor>::owned(self.unsqueeze(0))
+    : c10::MaybeOwned<Tensor>::borrowed(self);
+
+  // Cannot reassign to mask_temp and self_temp here! if they are
+  // owning and expand_outplace returns a borrow, the returned borrow
+  // would dangle.
+  auto mask_self_expanded = expand_outplace(*mask_temp, *self_temp);
+  auto indices = at::native::expandTensors(
+    *std::get<1>(mask_self_expanded),
+    c10::List<c10::optional<at::Tensor>>({*std::move(std::get<0>(mask_self_expanded))})
+    );
+  // next broadcast all index tensors together
+  try {
+    indices = at::expand_outplace(indices);
+  } catch (std::exception &e) {
+    TORCH_CHECK_INDEX(false, "shape mismatch: indexing tensors could not be broadcast together");
+  }
+
+  if (!indices[0].has_storage() || indices[0].numel() == 0) {
+    return self;
+  }
+
+  c10::List<c10::optional<Tensor>> final_indices;
+  final_indices.reserve(indices.size());
+
+  for (const auto index: indices) {
+    final_indices.push_back(index);
+  }
+  return at::index_put_out(
+    self,
+    *std::get<1>(mask_self_expanded),
+    final_indices,
+    source.resize_(indices[0].numel())
+  );
+}
+
 REGISTER_DISPATCH(index_stub, &index_kernel_mps);
 REGISTER_DISPATCH(index_put_stub, &index_put_kernel_mps);
 } // namespace at::native
diff --git a/aten/src/ATen/native/mps/operations/Inverse.mm b/aten/src/ATen/native/mps/operations/Inverse.mm
index 354cdb435959..519de6afa3b8 100644
--- a/aten/src/ATen/native/mps/operations/Inverse.mm
+++ b/aten/src/ATen/native/mps/operations/Inverse.mm
@@ -10,7 +10,7 @@
 TORCH_IMPL_FUNC(linalg_inv_ex_out_mps)(const Tensor& A, bool check_errors, const Tensor& result, const Tensor& info)
 {
     TORCH_CHECK(result.is_mps(), "Output tensor is not MPS");
-    if (!is_macos_13_or_newer()) {
+    if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS)) {
       TORCH_WARN_ONCE("torch.linalg_inv_ex.inverse is supported by MPS on MacOS 13+, please upgrade. Falling back to CPU.");
       auto cpu_info = at::empty({0}, kInt, c10::nullopt, kCPU, c10::nullopt, c10::nullopt);
       auto cpu_result = result.clone().to("cpu");
@@ -24,6 +24,10 @@
     MPSStream* stream = getCurrentMPSStream();
     info.zero_();
 
+    if (A.numel() == 0) {
+        return;
+    }
+
     struct CachedGraph : public MPSCachedGraph
     {
         CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
diff --git a/aten/src/ATen/native/mps/operations/Linear.mm b/aten/src/ATen/native/mps/operations/Linear.mm
index 91ba2767b169..529c26ded002 100644
--- a/aten/src/ATen/native/mps/operations/Linear.mm
+++ b/aten/src/ATen/native/mps/operations/Linear.mm
@@ -166,6 +166,9 @@ Tensor _mps_linear_backward_input(
                                         c10::nullopt,
                                         grad_output.suggest_memory_format());
   TORCH_CHECK(output.is_mps());
+  if (grad_output.numel() == 0) {
+    return output;
+  }
 
   MPSGraphCache *cache_ = MPSGraphCache::getInstance();
 
@@ -259,6 +262,11 @@ Tensor _mps_linear_backward_input(
   TORCH_CHECK(output.is_mps());
   TORCH_CHECK(bias.is_mps());
 
+  if (grad_output.numel() == 0) {
+    output.zero_();
+    bias.zero_();
+    return std::tuple<Tensor, Tensor>{ output, bias };
+  }
   MPSGraphCache *cache_ = MPSGraphCache::getInstance();
 
   MPSStream *stream= getCurrentMPSStream();
diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
index 45dbb0a01bca..6e3f1bc594a9 100644
--- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
+++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
@@ -1,17 +1,8 @@
 //  Copyright © 2022 Apple Inc.
 
-#include <ATen/ATen.h>
-#include <ATen/Tensor.h>
-#include <ATen/Utils.h>
-#include <ATen/mps/MPSStream.h>
-#include <ATen/native/LinearAlgebraUtils.h>
 #include <ATen/native/mps/OperationUtils.h>
-#include <torch/library.h>
-
-#ifdef __OBJC__
-#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
-#endif
-
+#include <ATen/native/LinearAlgebraUtils.h>
+#include <ATen/native/Resize.h>
 
 namespace at::native {
 
@@ -194,6 +185,152 @@ void prepare_matrices_for_broadcasting(
   return output;
 }
 
+
+Tensor addr_mps(const Tensor& self,
+            const Tensor& vec1, const Tensor& vec2,
+            const Scalar& beta, const Scalar& alpha) {
+  Tensor result = at::empty({0}, self.options());
+  addr_out_mps(self, vec1,vec2,beta,alpha,result);
+  return result;
+}
+
+
+Tensor& addr_out_mps(const Tensor& self,
+                 const Tensor& vec1, const Tensor& vec2,
+                 const Scalar& beta, const Scalar& alpha, Tensor &result) {
+  using namespace mps;
+
+  TORCH_CHECK(result.is_mps());
+  TORCH_CHECK(vec1.dim() == 1 && vec2.dim() == 1, "tensors must be 1-D");
+  TORCH_CHECK(vec1.scalar_type() == ScalarType::Double
+              || vec1.scalar_type() == ScalarType::Float
+              || vec1.scalar_type() == ScalarType::Half, "MPS device does not support addr for non-float input");
+
+  TensorArg args[]{{result, "out", 0}, {self, "self", 1}, {vec1, "vec1", 2}, {vec2, "vec2", 3}};
+  checkAllSameGPU(__func__, args);
+
+  IntArrayRef vec1_sizes = vec1.sizes();
+  IntArrayRef vec2_sizes = vec2.sizes();
+  IntArrayRef self_sizes;
+
+  c10::MaybeOwned<Tensor> self_;
+  if (&result != &self) {
+    self_ = expand_size(self, {vec1_sizes[0], vec2_sizes[0]}, "addr");
+    self_sizes = self_->sizes();
+  } else {
+    self_ = c10::MaybeOwned<Tensor>::borrowed(self);
+    self_sizes = self_->sizes();
+    TORCH_CHECK(result.dim() == 2, "tensors must be 2-D");
+    TORCH_CHECK(self_sizes[0] == vec1_sizes[0], "vec1_ dim 0 must match vec1 dim 0");
+    TORCH_CHECK(self_sizes[1] == vec2_sizes[0], "vec1_ dim 1 must match vec2 dim 0");
+  }
+
+  if (&result != &vec1) {
+    result.resize_(self_sizes);
+    if (beta.toComplexDouble() != 0.0) {
+      at::native::copy_(result, *self_);
+    }
+  }
+
+  IntArrayRef result_sizes = result.sizes();
+  if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) {
+    return result;
+  }
+
+  MPSStream* stream = getCurrentMPSStream();
+  bool is_beta_non_zero = beta.toDouble() != 0.0;
+  MPSShape* inputShape = @[@(vec1.numel()), @(1)];
+  MPSShape* otherShape = @[@(1), @(vec2.numel())];
+
+  struct CachedGraph : public mps::MPSCachedGraph
+  {
+    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+    MPSGraphTensor *vec1Tensor_ = nil;
+    MPSGraphTensor *vec2Tensor_ = nil;
+    MPSGraphTensor *selfTensor_ = nil;
+    MPSGraphTensor *resultTensor_ = nil;
+  };
+
+  mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance();
+
+  @autoreleasepool {
+    string key = "addr_out_mps_impl" + getTensorsStringKey({vec1, vec2, *self_})
+                                       + ":" + to_string(beta.toDouble())
+                                       + ":" + to_string(alpha.toDouble());
+    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    if(!cachedGraph) {
+
+      mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () {
+        CachedGraph *newCachedGraph = nil;
+
+        @autoreleasepool{
+          MPSGraph *mpsGraph = mps::make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+
+          MPSGraphTensor *t1 = mps::mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(vec1.scalar_type()), inputShape);
+          MPSGraphTensor *t2 =  mps::mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(vec2.scalar_type()), otherShape);
+          MPSGraphTensor *selfTensor =  mps::mpsGraphRankedPlaceHolder(mpsGraph, *self_);
+
+          // Intermediate as placeholder
+          MPSGraphTensor* productTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:t1
+                                                                          secondaryTensor:t2
+                                                                                     name:@"MM/(vec1Xvec2)"];
+
+          // Intermediates for beta and alpha
+          MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar:beta.toDouble()
+                                                           dataType:getMPSScalarType((*self_).scalar_type())];
+          MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.toDouble()
+                                                           dataType:getMPSScalarType(vec1.scalar_type())];
+
+          // Intermediates for multiplying by beta and alpha
+          MPSGraphTensor* productTimesAlphaTensor = [mpsGraph multiplicationWithPrimaryTensor:productTensor
+                                                                              secondaryTensor:alphaTensor
+                                                                                         name:@"MM/alpha*(vec1Xvec2)"];
+          MPSGraphTensor* selfTimesBetaTensor = selfTensor;
+          if (is_beta_non_zero) {
+            selfTimesBetaTensor = [mpsGraph multiplicationWithPrimaryTensor:selfTensor
+                                                            secondaryTensor:betaTensor
+                                                                       name:@"MM/beta*input"];
+          }
+
+          MPSGraphTensor* resultTensor = productTimesAlphaTensor;
+          if (is_beta_non_zero) {
+            resultTensor = [mpsGraph additionWithPrimaryTensor:productTimesAlphaTensor
+                                               secondaryTensor:selfTimesBetaTensor
+                                                          name:@"MM/beta*input+alpha*(vec1@vec2)"];
+           }
+
+          newCachedGraph->vec1Tensor_ = t1;
+          newCachedGraph->vec2Tensor_ = t2;
+          newCachedGraph->selfTensor_ = selfTensor;
+          newCachedGraph->resultTensor_ = resultTensor;
+        }
+        return newCachedGraph;
+      });
+      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+    }
+
+    Placeholder vec1Placeholder = Placeholder(cachedGraph->vec1Tensor_, vec1, inputShape);
+    Placeholder vec2Placeholder = Placeholder(cachedGraph->vec2Tensor_, vec2, otherShape);
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, *self_);
+    Placeholder resultPlaceholder = Placeholder(cachedGraph->resultTensor_, result);
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      vec1Placeholder.getMPSGraphTensor() : vec1Placeholder.getMPSGraphTensorData(),
+      vec2Placeholder.getMPSGraphTensor() : vec2Placeholder.getMPSGraphTensorData(),
+      selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
+    };
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData()
+    };
+
+    mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+
+  return result;
+}
+
 Tensor& addmm_out_mps_impl(
     const Tensor& bias,
     const Tensor& self,  // input
@@ -243,6 +380,7 @@ void prepare_matrices_for_broadcasting(
   bool transpose_mat1_times_mat2 = false;
   bool transpose_mat1            = false;
   bool transpose_mat2            = false;
+  bool is_beta_non_zero          = beta.toDouble() != 0.0;
 
   prepare_matrices_for_broadcasting(&(*bias_), self, other, &beta, &transpose_mat1_times_mat2, transpose_mat1, transpose_mat2);
 
@@ -312,9 +450,12 @@ void prepare_matrices_for_broadcasting(
           MPSGraphTensor* productTimesAlphaTensor = [mpsGraph multiplicationWithPrimaryTensor:productTensor
                                                                               secondaryTensor:alphaTensor
                                                                                          name:@"MM/alpha*(mat1@mat2)"];
-          MPSGraphTensor* biasTimesBetaTensor = [mpsGraph multiplicationWithPrimaryTensor:biasTensor
-                                                                          secondaryTensor:betaTensor
-                                                                                     name:@"MM/beta*input"];
+          MPSGraphTensor* biasTimesBetaTensor = biasTensor;
+          if (is_beta_non_zero) {
+            biasTimesBetaTensor = [mpsGraph multiplicationWithPrimaryTensor:biasTensor
+                                                            secondaryTensor:betaTensor
+                                                                       name:@"MM/beta*input"];
+          }
 
           if (transpose_mat1_times_mat2)
             biasTimesBetaTensor = [mpsGraph transposeTensor: biasTimesBetaTensor
@@ -322,9 +463,12 @@ void prepare_matrices_for_broadcasting(
                                               withDimension: -2
                                                        name: nil];
 
-          MPSGraphTensor* outputTensor = [mpsGraph additionWithPrimaryTensor:productTimesAlphaTensor
-                                                             secondaryTensor:biasTimesBetaTensor
-                                                                        name:@"MM/beta*input + alpha*(mat1@mat2)"];
+          MPSGraphTensor* outputTensor = productTimesAlphaTensor;
+          if (is_beta_non_zero) {
+            outputTensor = [mpsGraph additionWithPrimaryTensor:productTimesAlphaTensor
+                                               secondaryTensor:biasTimesBetaTensor
+                                                          name:@"MM/beta*input + alpha*(mat1@mat2)"];
+           }
 
           newCachedGraph->selfTensor_ = selfTensor;
           newCachedGraph->otherTensor_ = otherTensor;
@@ -369,6 +513,7 @@ void prepare_matrices_for_broadcasting(
               || batch1.scalar_type() == ScalarType::Half, "MPS device does not support bmm for non-float inputs");
 
   if (batch1.numel() == 0 || batch2.numel() == 0) {
+    result.zero_();
     return result;
   }
 
@@ -596,4 +741,105 @@ Tensor addbmm_mps(const Tensor& self, const Tensor& batch1, const Tensor& batch2
   return addbmm_out_mps(self, batch1, batch2, beta, alpha, self);
 }
 
+Tensor& linalg_solve_triangular_mps_impl( const Tensor& A, const Tensor& B, bool upper, bool transpose, bool left, bool unitriangular, Tensor& out) {
+  using namespace mps;
+
+  checkInputsSolver(A, B, left, "linalg.solve_triangular");
+  Tensor A_t, B_t;
+  std::tie(B_t, A_t) = _linalg_broadcast_batch_dims(B, A, /*don't check errors*/nullptr);
+  at::native::resize_output(out, B_t.sizes());
+
+  if (A.numel() == 0 || B.numel() == 0 || out.numel() == 0) {
+    out.zero_();
+    return out;
+  }
+
+  Tensor A_ = A_t;
+  Tensor B_ = B_t;
+  if (!A_t.is_contiguous()) {
+    A_ = A_t.clone(at::MemoryFormat::Contiguous);
+  }
+  if (!B_t.is_contiguous()) {
+    B_ = B_t.clone(at::MemoryFormat::Contiguous);
+  }
+  id<MTLBuffer> aBuffer = getMTLBufferStorage(A_);
+  id<MTLBuffer> bBuffer = getMTLBufferStorage(B_);
+  id<MTLBuffer> outBuffer = getMTLBufferStorage(out);
+  MPSStream* mpsStream = getCurrentMPSStream();
+  id<MTLDevice> device = MPSDevice::getInstance()->device();
+
+  dispatch_sync(mpsStream->queue(), ^(){
+    @autoreleasepool {
+      id<MTLCommandBuffer> commandBuffer = mpsStream->commandBuffer();
+      uint64_t batchSize = A_.sizes().size() > 2 ? A_.size(0) : 1;
+      uint64_t aRows = A_.size(-2);
+      uint64_t bRows = B_.size(-2);
+      uint64_t aCols = A_.size(-1);
+      uint64_t bCols = B_.size(-1);
+      uint64_t aElemSize = A_.element_size();
+      uint64_t bElemSize = B_.element_size();
+
+      MPSMatrixSolveTriangular *filter = [[[MPSMatrixSolveTriangular alloc] initWithDevice:device
+                                                                                     right:!left
+                                                                                     upper:upper
+                                                                                 transpose:transpose
+                                                                                      unit:unitriangular
+                                                                                     order:left ? bRows : bCols
+                                                                    numberOfRightHandSides:left ? bCols : bRows
+                                                                                     alpha:1.0f] autorelease];
+
+      MPSMatrixDescriptor* sourceMatrixDesc = [MPSMatrixDescriptor matrixDescriptorWithRows:aRows
+                                                                                    columns:aCols
+                                                                                   matrices:batchSize
+                                                                                   rowBytes:aCols * aElemSize
+                                                                                matrixBytes:aRows * aCols * aElemSize
+                                                                                   dataType:getMPSDataType(A_.scalar_type())];
+      MPSMatrixDescriptor* rightHandSideMatrixDesc = [MPSMatrixDescriptor matrixDescriptorWithRows:bRows
+                                                                                           columns:bCols
+                                                                                          matrices:batchSize
+                                                                                          rowBytes:bCols * bElemSize
+                                                                                       matrixBytes:bRows * bCols * bElemSize
+                                                                                          dataType:getMPSDataType(B_.scalar_type())];
+      for (const auto i: c10::irange(batchSize)) {
+        const uint64_t aBatchOffset = i * aRows * aCols;
+        const uint64_t bBatchOffset = i * bRows * bCols;
+        MPSMatrix* sourceMatrix = [[[MPSMatrix alloc] initWithBuffer:aBuffer
+                                                              offset:(A_t.storage_offset() + aBatchOffset) * aElemSize
+                                                          descriptor:sourceMatrixDesc] autorelease];
+        MPSMatrix* rightHandSideMatrix = [[[MPSMatrix alloc] initWithBuffer:bBuffer
+                                                                     offset:(B_t.storage_offset() + bBatchOffset) * bElemSize
+                                                                 descriptor:rightHandSideMatrixDesc] autorelease];
+        MPSMatrix *solutionMatrix = [[[MPSMatrix alloc] initWithBuffer:outBuffer
+                                                                offset:(out.storage_offset() + bBatchOffset) * bElemSize
+                                                            descriptor:rightHandSideMatrixDesc] autorelease];
+
+        [filter encodeToCommandBuffer:commandBuffer
+                         sourceMatrix:sourceMatrix
+                  rightHandSideMatrix:rightHandSideMatrix
+                       solutionMatrix:solutionMatrix];
+      }
+      mpsStream->commit(true);
+    }
+  });
+  return out;
+}
+
+Tensor& linalg_solve_triangular_mps_out( const Tensor& A, const Tensor& B, bool upper, bool left, bool unitriangular, Tensor& out) {
+  return linalg_solve_triangular_mps_impl(A, B, upper, /*transpose=*/false, left, unitriangular, out);
+}
+
+Tensor linalg_solve_triangular_mps(const Tensor& A, const Tensor& B, bool upper, bool left, bool unitriangular) {
+  Tensor out = empty_mps({0}, A.scalar_type(), c10::nullopt, kMPS, c10::nullopt, MemoryFormat::Contiguous);
+  linalg_solve_triangular_mps_impl(A, B, upper, /*transpose=*/false, left, unitriangular, out);
+  return out;
+}
+
+TORCH_IMPL_FUNC(triangular_solve_mps_out)(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular, const Tensor& result, const Tensor& clone_A) {
+  clone_A.copy_(A);
+  Tensor out = empty_mps({0}, A.scalar_type(), c10::nullopt, kMPS, c10::nullopt, MemoryFormat::Contiguous);
+  linalg_solve_triangular_mps_impl(A, self, upper, transpose, /*left=*/true, unitriangular, out);
+  result.resize_(out.sizes());
+  result.copy_(out);
+}
+
 } // namespace at::native
diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm
index 5e4658296890..1a8c689003ba 100644
--- a/aten/src/ATen/native/mps/operations/LossOps.mm
+++ b/aten/src/ATen/native/mps/operations/LossOps.mm
@@ -1,13 +1,6 @@
 //  Copyright © 2022 Apple Inc.
 
 #include <ATen/native/mps/OperationUtils.h>
-#include <ATen/mps/MPSStream.h>
-#include <objc/NSObjCRuntime.h>
-#include <torch/library.h>
-
-#ifdef __OBJC__
-#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
-#endif
 
 namespace at::native {
 namespace mps {
@@ -37,12 +30,6 @@ string reductionToString(int64_t reduction)
     }
 }
 
-// MSELoss
-void mse_loss_out_impl(const Tensor& input, const Tensor& target,
-                          int64_t reduction, const Tensor& output, const string op_name)
-{
-}
-
 Tensor& mse_loss_backward_out_impl(const Tensor& grad_output, const Tensor& input, const Tensor& target,
                                    int64_t reduction, Tensor& grad_input, const string op_name)
 {
@@ -313,163 +300,141 @@ void mse_loss_out_impl(const Tensor& input, const Tensor& target,
 
 // NLLLoss
 void nllnd_loss_backward_impl(
-Tensor& grad_input_arg,
-const Tensor& grad_output,
-const Tensor& input_arg,
-const Tensor& target_arg,
-const Tensor& weight,
-int64_t reduction,
-int64_t ignore_index,
-const Tensor& total_weight,
-bool is2D)
-{
-    // Empty output
-    if(grad_input_arg.numel() == 0)
-        return;
-
-    MPSStream* stream = getCurrentMPSStream();
+    Tensor& grad_input_arg,
+    const Tensor& grad_output_arg,
+    const Tensor& input_arg,
+    const Tensor& target_arg,
+    const Tensor& weight_arg,
+    int64_t reduction,
+    int64_t ignore_index,
+    const Tensor& total_weight,
+    bool is2D) {
 
-    struct CachedGraph : public MPSCachedGraph
-    {
+    if (grad_input_arg.numel() == 0) {
+        return;
+    }
+    struct CachedGraph : public MPSCachedGraph {
         CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
         MPSGraphTensor* inputTensor_ = nil;
         MPSGraphTensor* targetTensor_ = nil;
         MPSGraphTensor* weightTensor_ = nil;
         MPSGraphTensor* totalWeightTensor_ = nil;
         MPSGraphTensor* gradInputTensor_ = nil;
+        MPSGraphTensor* gradOutputTensor_ = nil;
     };
-
-    MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-
+    bool isWeightsArrayValid = weight_arg.defined() && weight_arg.numel() > 0;
+    int64_t channel_dim = grad_input_arg.dim() < 2 ? 0 : 1;
     auto input = input_arg.dim() == 1 ? input_arg.view({1, input_arg.size(0)}) : input_arg;
     auto target = target_arg.dim() == 0 ? target_arg.view({1}) : target_arg;
     auto grad_input = grad_input_arg.dim() == 1 ? grad_input_arg.view({1, grad_input_arg.size(0)}) : grad_input_arg;
+    auto numClasses = grad_input.sizes()[1];
+    auto weight = weight_arg;
+    auto grad_output = grad_output_arg;
 
+    if (isWeightsArrayValid) {
+        std::vector<int64_t> weightShape(input.dim(), 1);
+        weightShape[1] = input.size(1);
+        weight = weight_arg.view(weightShape);
+    }
+    if (grad_output_arg.dim() < grad_input.dim() && grad_output_arg.dim() > 0) {
+      grad_output = grad_output_arg.unsqueeze(channel_dim);
+    }
     @autoreleasepool {
+        string key = "nllnd_loss_backward" + getTensorsStringKey({input, grad_output, target, weight, total_weight})
+                                           + to_string(numClasses) + ":" + to_string(ignore_index) + ":"
+                                           + to_string(isWeightsArrayValid) + ":" + reductionToString(reduction);
 
-        auto numClasses = grad_input.sizes()[1];
-        bool isWeightsArrayValid = (weight.numel() > 0);
-
-        MPSShape* input_shape = getMPSShape(input);
-        MPSShape* target_shape = getMPSShape(target);
-        MPSShape* weight_shape = getMPSShape(weight);
-        MPSShape* total_weight_shape = getMPSShape(total_weight);
-
-        NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
-
-        string key = "nllnd_loss_backward_impl:" + to_string(numClasses) + ":" +
-                                                   to_string(ignore_index) + ":" +
-                                                   to_string(isWeightsArrayValid) + ":" +
-                                                   reductionToString(reduction) + ":" +
-                                                   [ns_shape_key UTF8String] + ":" +
-                                                   getMPSTypeString(input.scalar_type()) + ":" +
-                                                   getMPSTypeString(target.scalar_type()) + ":" +
-                                                   getMPSTypeString(weight.scalar_type()) + ":" +
-                                                   getMPSTypeString(total_weight.scalar_type());
-        CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-
+        MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+        CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
         if(!cachedGraph) {
-            MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+            cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
 
                 CachedGraph *newCachedGraph = nil;
-
                 @autoreleasepool {
-
                     MPSGraph* mpsGraph = make_mps_graph();
                     newCachedGraph = new CachedGraph(mpsGraph);
 
-                    MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), input_shape);
-                    MPSGraphTensor* targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(target.scalar_type()), target_shape);
+                    MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
+                    MPSGraphTensor* targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target);
                     MPSGraphTensor* weightTensor = nil;
-                    if(isWeightsArrayValid)
-                        weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(weight.scalar_type()), weight_shape);
-                    MPSGraphTensor* totalWeightTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(total_weight.scalar_type()), total_weight_shape);
+                    if (isWeightsArrayValid) {
+                        weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight);
+                    }
+                    MPSGraphTensor* totalWeightTensor = mpsGraphRankedPlaceHolder(mpsGraph, total_weight);
+                    MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
 
                     MPSGraphTensor *udpatedTargetTensor = targetTensor;
 
                     // Replace ignored_index with length depth + 1 so that oneHotAPI ignores it
-                    if(ignore_index != -100)
-                    {
-                        MPSGraphTensor *mpsGraphIndexTensor = [mpsGraph constantWithScalar: ignore_index
-                                                                                  dataType: MPSDataTypeInt64];
-                        MPSGraphTensor *mpsGraphDepthPlusOneTensor = [mpsGraph constantWithScalar: (numClasses + 1)
-                                                                                  dataType: MPSDataTypeInt64];
-
-                        // Equal tensor
-                        MPSGraphTensor* mpsGraphIsEqualTensor = [mpsGraph equalWithPrimaryTensor: targetTensor
-                                                                                 secondaryTensor: mpsGraphIndexTensor
-                                                                                            name: @"isEqualTensor"];
-
-                        udpatedTargetTensor = [mpsGraph selectWithPredicateTensor: mpsGraphIsEqualTensor
-                                                          truePredicateTensor: mpsGraphDepthPlusOneTensor
-                                                         falsePredicateTensor: targetTensor
-                                                                         name: @"predicateTensor"];
+                    if (ignore_index != -100) {
+                        MPSGraphTensor *ignoreIndexTensor = [mpsGraph constantWithScalar: ignore_index
+                                                                                dataType: MPSDataTypeInt64];
+                        MPSGraphTensor *numClassesTensor  = [mpsGraph constantWithScalar: (numClasses + 1)
+                                                                                dataType: MPSDataTypeInt64];
+                        MPSGraphTensor* isEqualTensor = [mpsGraph equalWithPrimaryTensor: targetTensor
+                                                                         secondaryTensor: ignoreIndexTensor
+                                                                                    name: @"isEqualTensor"];
+                        udpatedTargetTensor = [mpsGraph selectWithPredicateTensor: isEqualTensor
+                                                              truePredicateTensor: numClassesTensor
+                                                             falsePredicateTensor: targetTensor
+                                                                             name: @"predicateTensor"];
                     }
-
-                    float onValue = -1.0f;
-
-                    MPSGraphTensor *oneHotTensor;
-
-                    oneHotTensor = [mpsGraph oneHotWithIndicesTensor:udpatedTargetTensor
-                                                               depth:numClasses
-                                                                axis:1
-                                                            dataType:inputTensor.dataType
-                                                             onValue:onValue
-                                                            offValue:0.0f
-                                                                name:nil];
-
-                    if(isWeightsArrayValid)
-                    {
-                        oneHotTensor = [mpsGraph multiplicationWithPrimaryTensor:oneHotTensor
-                                                                 secondaryTensor:weightTensor
-                                                                            name:@"scaleByWeightTensor"];
+                    MPSGraphTensor *oneHotTensor = [mpsGraph oneHotWithIndicesTensor: udpatedTargetTensor
+                                                                               depth: numClasses
+                                                                                axis: 1
+                                                                            dataType: inputTensor.dataType
+                                                                             onValue: -1.0f
+                                                                            offValue: 0.0f
+                                                                                name: nil];
+                    if (isWeightsArrayValid) {
+                        oneHotTensor = [mpsGraph multiplicationWithPrimaryTensor: oneHotTensor
+                                                                 secondaryTensor: weightTensor
+                                                                            name: @"scaleByWeightTensor"];
                     }
-
-                    if(reduction == Reduction::Mean)
-                    {
-                        oneHotTensor = [mpsGraph divisionNoNaNWithPrimaryTensor:oneHotTensor
-                                                                secondaryTensor:totalWeightTensor
-                                                                           name:@"divisionTensor"];
+                    if (reduction == Reduction::Mean) {
+                        oneHotTensor = [mpsGraph divisionNoNaNWithPrimaryTensor: oneHotTensor
+                                                                secondaryTensor: totalWeightTensor
+                                                                           name: @"divisionTensor"];
                     }
-
-                    MPSGraphTensor* gradInputTensor = oneHotTensor;
-
+                    MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor: oneHotTensor
+                                                                                secondaryTensor: gradOutputTensor
+                                                                                           name: nil];
                     newCachedGraph->inputTensor_ = inputTensor;
                     newCachedGraph->targetTensor_ = targetTensor;
                     newCachedGraph->weightTensor_ = weightTensor;
                     newCachedGraph->totalWeightTensor_ = totalWeightTensor;
                     newCachedGraph->gradInputTensor_ = gradInputTensor;
-
+                    newCachedGraph->gradOutputTensor_ = gradOutputTensor;
                 }
                 return newCachedGraph;
             });
-            cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
         }
 
-        auto inputPlaceholder   = Placeholder(cachedGraph->inputTensor_, input);
-        auto targetPlaceholder   = Placeholder(cachedGraph->targetTensor_, target);
+        auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input);
+        auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output);
+        auto targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target);
         Placeholder weightPlaceholder = Placeholder();
-        if(isWeightsArrayValid)
-            weightPlaceholder  = Placeholder(cachedGraph->weightTensor_, weight);
-        auto totalWeightPlaceholder   = Placeholder(cachedGraph->totalWeightTensor_, total_weight);
+        if(isWeightsArrayValid) {
+            weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight);
+        }
+        auto totalWeightPlaceholder = Placeholder(cachedGraph->totalWeightTensor_, total_weight);
         auto gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input);
 
-        NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = [[[NSMutableDictionary alloc] initWithCapacity: 4] autorelease];
+        NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease];
         feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData();
         feeds[targetPlaceholder.getMPSGraphTensor()] = targetPlaceholder.getMPSGraphTensorData();
         feeds[totalWeightPlaceholder.getMPSGraphTensor()] = totalWeightPlaceholder.getMPSGraphTensorData();
+        feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData();
 
-        if(isWeightsArrayValid)
+        if (isWeightsArrayValid) {
             feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData();
-
+        }
         NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
             gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
         };
 
-        runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+        runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
     }
-
-    return;
 }
 
 void nllnd_loss_forward_impl
@@ -907,132 +872,101 @@ void smooth_l1_loss_backward_impl(
     double beta,
     Tensor& grad_input)
 {
- struct CachedGraph : public MPSCachedGraph
-  {
+  if (grad_input.numel() == 0) {
+    return;
+  }
+  TORCH_CHECK(beta >= 0, "smooth_l1_loss_backward does not support negative values for beta.");
+
+  struct CachedGraph : public MPSCachedGraph {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
     MPSGraphTensor *inputTensor_ = nil;
     MPSGraphTensor *targetTensor_ = nil;
     MPSGraphTensor *gradInputTensor_ = nil;
+    MPSGraphTensor* gradOutputTensor_ = nil;
   };
 
- MPSGraphCache *cache_ = MPSGraphCache::getInstance();
-
-  MPSStream *stream= getCurrentMPSStream();
-
   @autoreleasepool {
+    string key = "smooth_l1_loss_backward" + getTensorsStringKey({input, grad_output, grad_input, target}) + ":"
+                                           + reductionToString(reduction) + ":" + to_string(beta);
 
-    auto numClasses = grad_input.sizes()[1];
-    MPSShape* input_shape = getMPSShape(input);
-    NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
-
-    string key = "smooth_l1_loss_backward_impl:" + to_string(numClasses) + ":" +
-                                                   reductionToString(reduction) + ":" +
-                                                   [ns_shape_key UTF8String] + ":" +
-                                                   to_string(beta) + ":" +
-                                                   getMPSTypeString(input.scalar_type()) + ":" +
-                                                   getMPSTypeString(target.scalar_type());
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-    if(!cachedGraph) {
-      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+    MPSGraphCache *cache_ = MPSGraphCache::getInstance();
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
+    if (!cachedGraph) {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
 
         CachedGraph *newCachedGraph = nil;
 
         @autoreleasepool {
-          auto numElements = input.numel();
-
           MPSGraph *mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor *inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()));
-          MPSGraphTensor *targetTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(target.scalar_type()));
+          MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
+          MPSGraphTensor *targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target);
+          MPSGraphTensor *gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
 
           MPSGraphTensor *betaTensor = [mpsGraph constantWithScalar: beta
                                                            dataType: MPSDataTypeFloat32];
-
-          MPSGraphTensor *numelTensor = [mpsGraph constantWithScalar: numElements
-                                                            dataType: MPSDataTypeFloat32];
-
           // xn - yn
           MPSGraphTensor *diffTensor = [mpsGraph subtractionWithPrimaryTensor: inputTensor
                                                               secondaryTensor: targetTensor
                                                                          name: nil];
-
           // | xn - yn |
           MPSGraphTensor *diffAbsTensor = [mpsGraph absoluteWithTensor: diffTensor
                                                                   name: nil];
-
           // | xn - yn | < beta
           MPSGraphTensor *diffAbsLessThanBetaTensor = [mpsGraph lessThanWithPrimaryTensor: diffAbsTensor
                                                                           secondaryTensor: betaTensor
                                                                                      name: nil];
-
           // ( xn - yn ) / beta
           MPSGraphTensor *truePredicateTensor = [mpsGraph divisionWithPrimaryTensor: diffTensor
                                                                     secondaryTensor: betaTensor
                                                                                name: nil];
-
           // ( x - y ) / | x - y |
-           MPSGraphTensor *falsePredicateTensor = [mpsGraph divisionWithPrimaryTensor: diffTensor
-                                                                      secondaryTensor: diffAbsTensor
-                                                                                 name: nil];
+          MPSGraphTensor *falsePredicateTensor = [mpsGraph divisionWithPrimaryTensor: diffTensor
+                                                                     secondaryTensor: diffAbsTensor
+                                                                                name: nil];
 
           MPSGraphTensor *lossTensor = [mpsGraph selectWithPredicateTensor: diffAbsLessThanBetaTensor
-                                                            truePredicateTensor: truePredicateTensor
-                                                           falsePredicateTensor: falsePredicateTensor
-                                                                           name: @"lossTensor"];
-
+                                                       truePredicateTensor: truePredicateTensor
+                                                      falsePredicateTensor: falsePredicateTensor
+                                                                      name: @"lossTensor"];
           MPSGraphTensor *outputTensor = lossTensor;
-          if (reduction == Reduction::Mean)
-          {
-              outputTensor = [mpsGraph divisionWithPrimaryTensor: lossTensor
-                                                 secondaryTensor: numelTensor
-                                                            name: nil];
+          if (reduction == Reduction::Mean) {
+            MPSGraphTensor *numelTensor = [mpsGraph constantWithScalar: (double) input.numel()
+                                                              dataType: MPSDataTypeFloat32];
+            outputTensor = [mpsGraph divisionWithPrimaryTensor: lossTensor
+                                               secondaryTensor: numelTensor
+                                                          name: nil];
           }
-
-          MPSGraphTensor *gradInputTensor = outputTensor;
-
+          MPSGraphTensor *gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor: outputTensor
+                                                                      secondaryTensor: gradOutputTensor
+                                                                                 name: nil];
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->targetTensor_ = targetTensor;
           newCachedGraph->gradInputTensor_ = gradInputTensor;
-
+          newCachedGraph->gradOutputTensor_ = gradOutputTensor;
         }
         return newCachedGraph;
       });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
     Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input);
     Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target);
     Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input);
+    Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
-      targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData()
+      targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData(),
+      gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData()
     };
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
       gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
     };
 
-    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
   }
 }
 
-void smooth_l1_loss_backward_template(
-    const Tensor& grad_output,
-    const Tensor& input,
-    const Tensor& target,
-    int64_t reduction,
-    double beta,
-    Tensor& grad_input)
-{
-  TORCH_CHECK(beta >= 0, "smooth_l1_loss_backward does not support negative values for beta.");
-  TORCH_CHECK(input.is_mps());
-  TORCH_CHECK(target.is_mps());
-
-  smooth_l1_loss_backward_impl(
-      grad_output, input, target, reduction, beta, grad_input
-  );
-}
-
 } // namespace mps
 
 // APIs exposed to at::native scope
@@ -1076,12 +1010,14 @@ void smooth_l1_loss_backward_template(
 
                     MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
                     MPSGraphTensor* targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target);
+
+                    MPSDataType     input_type  = getMPSScalarType(input.scalar_type());
                     MPSGraphTensor* deltaTensor = [mpsGraph constantWithScalar:delta
                                                                              shape:@[@1]
-                                                                          dataType:MPSDataTypeFloat32];
+                                                                          dataType:input_type];
                     MPSGraphTensor* halfTensor = [mpsGraph constantWithScalar:.5f
                                                                              shape:@[@1]
-                                                                          dataType:MPSDataTypeFloat32];
+                                                                          dataType:input_type];
 
                     MPSGraphTensor* diffTensor = [mpsGraph subtractionWithPrimaryTensor: inputTensor
                                                                         secondaryTensor: targetTensor
@@ -1210,7 +1146,7 @@ Tensor huber_loss_mps(const Tensor& input, const Tensor& target, int64_t reducti
                                                                               name:nil];
                     MPSGraphTensor* deltaTensor = [mpsGraph constantWithScalar:delta
                                                                          shape:getMPSShape(target)
-                                                                      dataType:MPSDataTypeFloat32];
+                                                                      dataType:getMPSDataType(target.scalar_type())];
                     MPSGraphTensor* diffTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensor
                                                                         secondaryTensor:targetTensor
                                                                                    name:nil];
@@ -1390,8 +1326,10 @@ Tensor binary_cross_entropy_backward_mps(const Tensor& grad_output, const Tensor
     int64_t reduction,
     double beta,
     Tensor& grad_input) {
-  mps::smooth_l1_loss_backward_template(
+
+  mps::smooth_l1_loss_backward_impl(
       grad_output, input, target, reduction, beta, grad_input);
+
   return grad_input;
 }
 
diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm
index 1849a968baf5..34dd5f75211d 100644
--- a/aten/src/ATen/native/mps/operations/Normalization.mm
+++ b/aten/src/ATen/native/mps/operations/Normalization.mm
@@ -95,7 +95,7 @@ void get_shapes(MPSShape* input_shape_readonly,
   const bool has_weight = (weight_opt.has_value() && weight_opt->defined());
   const bool has_bias = (bias_opt.has_value() && bias_opt->defined());
 
-  const auto memory_format = self.suggest_memory_format();
+  auto memory_format = self.suggest_memory_format();
 
   if (output.numel() == 0) {
     return std::tuple<Tensor&, Tensor&, Tensor&>(output, save_mean, save_var);;
@@ -134,7 +134,9 @@ void get_shapes(MPSShape* input_shape_readonly,
                       + std::to_string(momentum) + ":" + std::to_string(train) + ":"
                       + std::to_string(has_running_mean) + ":"
                       + std::to_string(has_weight) + ":" + std::to_string(has_bias) + ":"
-                      + [ns_shape_key UTF8String] + ":" + native_mps::getMPSTypeString(self.scalar_type());
+                      + [ns_shape_key UTF8String] + ":"
+                      + native_mps::getTensorsStringKey({
+                        self, weight_opt.value_or(Tensor()), bias_opt.value_or(Tensor()), running_mean_opt.value_or(Tensor()), running_var_opt.value_or(Tensor())});
     auto input_mps_dtype = native_mps::getMPSDataType(self.scalar_type());
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
 
@@ -145,6 +147,12 @@ void get_shapes(MPSShape* input_shape_readonly,
     else
       channelsDim = num_input_dims - 1;
 
+    bool executeGatherOp = true;
+    if (self.is_contiguous(memory_format)) {
+      memory_format = MemoryFormat::Contiguous;
+      executeGatherOp = false;
+    }
+
     if(!cachedGraph) {
       native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
 
@@ -179,6 +187,7 @@ void get_shapes(MPSShape* input_shape_readonly,
 
             MPSGraphTensor* updatedRunningMeanTensor = nil;
             MPSGraphTensor* updatedRunningVarTensor = nil;
+            MPSGraphTensor *scaledInverseSqrtVariance = nil;
 
             /*
             If train:
@@ -194,6 +203,7 @@ Check if running mean exists (maybe do this check before making graph)
 
             Compute the batch norm output and stats to be saved
             */
+            MPSGraphTensor *varTensor = nil;
 
             if(train) {
               // Compute mean and variance of the current batch
@@ -203,6 +213,7 @@ Check if running mean exists (maybe do this check before making graph)
               MPSGraphTensor* batchVarianceTensor = [mpsGraph varianceOfTensor:inputTensor
                                                                           axes:axes
                                                                           name:nil];
+              varTensor = batchVarianceTensor;
               if(has_running_mean) {
                 // TODO: This is not the formula used in PyTorch, is this OK? Seems more robust
                 // float besselCorrectionTerm = float(N) / std::max(N - 1.0f, 1.0f);
@@ -239,14 +250,27 @@ Check if running mean exists (maybe do this check before making graph)
                 updatedRunningVarTensor = [mpsGraph additionWithPrimaryTensor:scaledCorrectedBatchVar
                                                               secondaryTensor:scaledRunningVar
                                                                          name:nil];
-                // Update saved mean and inverse std tensor
-                saveMeanTensor = batchMeanTensor;
-                saveVarTensor = batchVarianceTensor;
-            }
-            else {
-              saveMeanTensor = batchMeanTensor;
-              saveVarTensor = batchVarianceTensor;
             }
+            // Update saved mean and inverse std tensor
+            MPSGraphTensor *epsilonTensor = [mpsGraph constantWithScalar:(double)epsilon
+                                                                   shape:@[@1]
+                                                                dataType:MPSDataTypeFloat32];
+
+            MPSGraphTensor *varianceEps = [mpsGraph additionWithPrimaryTensor:batchVarianceTensor
+                                                              secondaryTensor:epsilonTensor
+                                                                         name:@"varianceEps"];
+
+            MPSGraphTensor *sqrtVariance = [mpsGraph squareRootWithTensor:varianceEps
+                                                                     name:@"sqrtVariance"];
+            float primary = 1.0f;
+            MPSGraphTensor *primaryTensor = [mpsGraph constantWithScalar:primary dataType:MPSDataTypeFloat32];
+
+            scaledInverseSqrtVariance = [mpsGraph divisionWithPrimaryTensor:primaryTensor
+                                                            secondaryTensor:sqrtVariance
+                                                                       name:nil];
+            // Update saved mean and inverse std tensor
+            saveMeanTensor = batchMeanTensor;
+            saveVarTensor = scaledInverseSqrtVariance;
           }
           else { // Test
             TORCH_CHECK(has_running_mean);
@@ -254,12 +278,13 @@ Check if running mean exists (maybe do this check before making graph)
                                                      name:nil];
             saveVarTensor = [mpsGraph identityWithTensor:runningVarTensor
                                                     name:nil];
+            varTensor = saveVarTensor;
           }
 
           // Compute output of batch norm
           MPSGraphTensor* outputTensor = [mpsGraph normalizationWithTensor:inputTensor
                                                                 meanTensor:saveMeanTensor
-                                                            varianceTensor:saveVarTensor
+                                                            varianceTensor:varTensor
                                                                gammaTensor:weightTensor
                                                                 betaTensor:biasTensor
                                                                    epsilon:(float)epsilon
@@ -299,7 +324,7 @@ Check if running mean exists (maybe do this check before making graph)
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, self, input_shape);
+    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, self, input_shape, executeGatherOp);
     auto weightPlaceholder = native_mps::Placeholder();
     if(has_weight)
       weightPlaceholder = native_mps::Placeholder(cachedGraph->weightTensor_, weight_opt.value(), new_mean_shape);
@@ -321,7 +346,7 @@ Check if running mean exists (maybe do this check before making graph)
       runningVarInplaceUpdatePlaceholder = native_mps::Placeholder(cachedGraph->runningVarInplaceUpdate_, running_var_opt.value());
     }
 
-    auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output, input_shape);
+    auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output, input_shape, false);
     auto saveMeanPlaceholder = native_mps::Placeholder(cachedGraph->saveMeanTensor_, save_mean);
     auto saveVarPlaceholder = native_mps::Placeholder(cachedGraph->saveVarTensor_, save_var);
 
@@ -351,6 +376,10 @@ Check if running mean exists (maybe do this check before making graph)
 
   }
 
+  if(!train) {
+    save_mean.resize_({0});
+    save_var.resize_({0});
+  }
   return std::tuple<Tensor&, Tensor&, Tensor&>(output, save_mean, save_var);
 }
 
@@ -649,11 +678,24 @@ string get_mem_string(c10::MemoryFormat memory_format) {
 
           if(train) {
             // Use save_mean and save_var
+            float primary = 1.0f;
+            MPSGraphTensor *primaryTensor = [mpsGraph constantWithScalar:primary dataType:MPSDataTypeFloat32];
+            MPSGraphTensor *epsilonTensor = [mpsGraph constantWithScalar:(float)epsilon dataType:MPSDataTypeFloat32];
+            MPSGraphTensor *revertSaveVarTensor = saveVarTensor;
+            revertSaveVarTensor = [mpsGraph divisionWithPrimaryTensor: primaryTensor
+                                                      secondaryTensor: revertSaveVarTensor
+                                                                 name: nil];
+            revertSaveVarTensor = [mpsGraph multiplicationWithPrimaryTensor: revertSaveVarTensor
+                                                            secondaryTensor: revertSaveVarTensor
+                                                                       name: nil];
+            revertSaveVarTensor = [mpsGraph subtractionWithPrimaryTensor: revertSaveVarTensor
+                                                         secondaryTensor: epsilonTensor
+                                                                    name: nil];
             if(grad_input_mask[1]) {
               gradWeightTensor = [mpsGraph normalizationGammaGradientWithIncomingGradientTensor:gradOutputTensor
                                                                                    sourceTensor:inputTensor
                                                                                      meanTensor:saveMeanTensor
-                                                                                 varianceTensor:saveVarTensor
+                                                                                 varianceTensor:revertSaveVarTensor
                                                                                   reductionAxes:axes
                                                                                         epsilon:(float)epsilon
                                                                                            name:nil];
@@ -668,7 +710,7 @@ string get_mem_string(c10::MemoryFormat memory_format) {
               gradInputTensor = [mpsGraph normalizationGradientWithIncomingGradientTensor:gradOutputTensor
                                                                              sourceTensor:inputTensor
                                                                                meanTensor:saveMeanTensor
-                                                                           varianceTensor:saveVarTensor
+                                                                           varianceTensor:revertSaveVarTensor
                                                                               gammaTensor:weightTensor
                                                                       gammaGradientTensor:gradWeightTensor
                                                                        betaGradientTensor:gradBiasTensor
@@ -870,7 +912,7 @@ string get_mem_string(c10::MemoryFormat memory_format) {
   const int normalized_ndim = normalized_shape.size();
   // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
   const int axis = input_ndim - normalized_ndim;
-  at::Tensor input_reshaped = input.reshape({1, M, -1});
+  at::Tensor input_reshaped = input.numel() == 0 ? input.reshape({1, M, 0}) : input.reshape({1, M, -1});
   // Unlike Batch Normalization, which applies scalar scale and bias for each
   // entire channel/plane with the affine option, Layer Normalization applies
   // per-element scale and bias. E.g. For input {N, C, H, W}, weight for
@@ -890,8 +932,6 @@ string get_mem_string(c10::MemoryFormat memory_format) {
   at::Tensor mean = std::get<1>(outputs);
   at::Tensor variance = std::get<2>(outputs);
 
-  at::Tensor rstd = at::rsqrt(at::add(variance, eps));
-
   std::vector<int64_t> stat_shape;
   for (const auto idx : c10::irange(axis)) {
     stat_shape.push_back(input_shape[idx]);
@@ -901,8 +941,8 @@ string get_mem_string(c10::MemoryFormat memory_format) {
     stat_shape.push_back(1);
   }
   mean = mean.view(stat_shape);
-  rstd = rstd.view(stat_shape);
-  return std::make_tuple(out, mean, rstd);
+  variance = variance.view(stat_shape);
+  return std::make_tuple(out, mean, variance);
 }
 
 std::tuple<Tensor, Tensor, Tensor> layer_norm_backward_mps(
diff --git a/aten/src/ATen/native/mps/operations/PointwiseOps.mm b/aten/src/ATen/native/mps/operations/PointwiseOps.mm
index 2eb9da9449bb..92109c64caf1 100644
--- a/aten/src/ATen/native/mps/operations/PointwiseOps.mm
+++ b/aten/src/ATen/native/mps/operations/PointwiseOps.mm
@@ -14,8 +14,9 @@
                              const bool is_div,
                              const string op_name)
 {
-  if (&output != &self) {
-    output.resize_(output.sizes());
+  if (value_opt.toDouble() == 0.0) {
+    output.copy_(self);
+    return output;
   }
 
   if(output.numel() == 0) {
@@ -48,7 +49,7 @@
             newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
             newCachedGraph->firstTensor = mpsGraphRankedPlaceHolder(mpsGraph, tensor1);
             newCachedGraph->secondTensor = mpsGraphRankedPlaceHolder(mpsGraph, tensor2);
-            newCachedGraph->valueTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSScalarType(self.scalar_type()));
+            newCachedGraph->valueTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(self.scalar_type()), @[@1]);
 
             // the tensor to be optionally multiplied by value_scalar
             MPSGraphTensor *multiplicandTensor = nil;
diff --git a/aten/src/ATen/native/mps/operations/Pooling.mm b/aten/src/ATen/native/mps/operations/Pooling.mm
index e404572d51de..ff26ff83518c 100644
--- a/aten/src/ATen/native/mps/operations/Pooling.mm
+++ b/aten/src/ATen/native/mps/operations/Pooling.mm
@@ -25,8 +25,10 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
                             const c10::optional<Tensor>& grad_output_opt,
                             IntArrayRef kernel_size, IntArrayRef stride,
                             IntArrayRef padding, IntArrayRef dilation,
-                            bool ceil_mode, const c10::optional<float> divisor,
-                            PoolingOpBlock poolingBlock, const c10::string& op_name) {
+                            bool ceil_mode, bool count_include_pad,
+                            const c10::optional<int64_t> divisor_override,
+                            PoolingOpBlock poolingBlock, const c10::string& op_name)
+{
   if (input.numel() == 0) {
     return;
   }
@@ -39,7 +41,7 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
   const Tensor& indices = *(at::borrow_from_optional_tensor(indices_opt));
   const bool is_backward_pass = grad_output.defined();
   const bool has_indices = indices.defined();
-  const bool has_divisor = divisor.has_value();
+  const bool has_divisor = divisor_override.has_value() && divisor_override.value() != 0;
   const auto suggested_memory_format = input.suggest_memory_format();
   // for max_pool2d_with_indices() we cannot pass ChannelsLast (i.e., NHWC) to 'desc.dataLayout' in MPSGraph.
   // Because the returned indices will be selected based on NHWC memory layout which will
@@ -63,12 +65,12 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
     AT_ERROR("Unsupported memory format. Supports only ChannelsLast, Contiguous");
   }
 
+  int padH = safe_downcast<int, int64_t>(padding[0]);
+  int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
   const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
   const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
   const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
   const int dW = stride.empty() ? kW : stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
-  const int padH = safe_downcast<int, int64_t>(padding[0]);
-  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
   const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
   const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);
   const int64_t nbatch = ndims == 4 ? input.size(-4) : 1;
@@ -81,6 +83,7 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
   pool2d_shape_check(input, kH, kW, dH, dW, padH, padW, dilationH, dilationW,
                      nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, memory_format);
 
+  auto output_memory_format = output.suggest_memory_format();
   // the output and indices are 'empty', so we could avoid unnecessary gatherView on empty tensors
   // by simply restriding them (instead of calling the costly Contiguous()).
   if (indices.suggest_memory_format() == MemoryFormat::ChannelsLast) {
@@ -92,20 +95,28 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
       outputSizes.insert(outputSizes.begin(), nbatch);
     }
     output.resize_(outputSizes);
-  } else if (output.suggest_memory_format() == MemoryFormat::ChannelsLast) {
+  } else if (output_memory_format == MemoryFormat::ChannelsLast) {
     output.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
+    output_memory_format = MemoryFormat::Contiguous;
   }
 
   if (output.numel() == 0 || (is_backward_pass && grad_output.numel() == 0)) {
     return;
   }
+  // workaround for issue #103039644: mismatching MPS vs. CPU results
+  // when both ceil_mode and count_include_pad are True
+  if (count_include_pad && ceil_mode) {
+    padH = padW = 0;
+  }
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
   @autoreleasepool {
     string key = op_name + getTensorsStringKey({input, indices, grad_output}) + ":K[" +
                  getArrayRefString(kernel_size) + "]:S[" + getArrayRefString(stride) + "]:P[" +
                  getArrayRefString(padding) + "]:D[" + getArrayRefString(dilation) + "]" +
-                 (ceil_mode ? ":ceil" : "") + ":" + (suggested_memory_format == MemoryFormat::ChannelsLast ? "NHWC" : "NCHW");
+                 (ceil_mode ? ":ceil" : "") + (count_include_pad ? ":include_pad" : "") +
+                 (has_divisor ? ":divisor" : "") + ":" +
+                 (suggested_memory_format == MemoryFormat::ChannelsLast ? "NHWC" : "NCHW");
 
     MPSShape* inputShape = getMPSShape(input, memory_format);
     MPSShape* gradOutputShape = is_backward_pass ? getMPSShape(grad_output, memory_format) : nullptr;
@@ -144,7 +155,7 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
             newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(grad_output.scalar_type()), gradOutputShape);
           }
           if (has_divisor) {
-            newCachedGraph->divisorTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(output.scalar_type()), @[@1]);
+            newCachedGraph->divisorTensor = mpsGraphRankedPlaceHolder(mpsGraph, MPSDataTypeFloat32, @[@1]);
           }
           MPSGraphTensor* outputTensor = poolingBlock(*newCachedGraph, desc);
           // with desc.dataLayout = NHWC (i.e., ChannelsLast), the results need to be converted back to NCHW
@@ -181,17 +192,123 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
     }
     MPSScalar divisor_scalar;
     if (cachedGraph->divisorTensor) {
-      divisor_scalar = getMPSScalar(divisor.value(), output.scalar_type());
+      const float divisor = float(kH * kW) / (float) divisor_override.value();
+      divisor_scalar = getMPSScalar(divisor, ScalarType::Float);
       feeds[cachedGraph->divisorTensor] = getMPSGraphTensorFromScalar(mpsStream, divisor_scalar);
     }
 
     runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results);
+
+    if (output_memory_format != suggested_memory_format) {
+      const_cast<Tensor&>(output) = output.to(suggested_memory_format);
+    }
   }
 }
 
+static void avg_pool2d_template(const Tensor& input, const Tensor& output,
+                                const c10::optional<Tensor>& grad_output_opt,
+                                IntArrayRef kernel_size, IntArrayRef stride,
+                                IntArrayRef padding, IntArrayRef dilation,
+                                bool ceil_mode, bool count_include_pad,
+                                const c10::optional<int64_t> divisor_override,
+                                const c10::string& op_name)
+{
+  const Tensor& grad_output = *(at::borrow_from_optional_tensor(grad_output_opt));
+  const bool is_backward_pass = grad_output.defined();
+  const bool use_divisor = divisor_override.has_value() && divisor_override.value() != 0;
+
+  // custom divisor isn't supported natively in avgPooling2DWithSourceTensor().
+  // For Float input type, we work around it by multiplying divisor after avgPooling2D.
+  // However, for Long type, the accumulated error when multiplying the divisor
+  // would produce results that mismatch CPU results.
+  if (use_divisor && input.scalar_type() == ScalarType::Long) {
+    TORCH_WARN_ONCE("MPS: passing divisor to Average Pooling op with int64 input is ",
+                    "not supported on MPS backend. ",
+                    "Falling back on CPU. This may have performance implications.");
+    if (!is_backward_pass) {
+      const_cast<Tensor&>(output) = at::avg_pool2d(input.to("cpu"), kernel_size, stride, padding, ceil_mode,
+                                       count_include_pad, divisor_override).clone().to("mps");
+    } else {
+      const_cast<Tensor&>(output) = at::avg_pool2d_backward(grad_output.to("cpu"), input.to("cpu"),
+                                       kernel_size, stride, padding, ceil_mode, count_include_pad,
+                                       divisor_override).clone().to("mps");
+    }
+    return;
+  }
+
+  mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
+    MPSGraph* mpsGraph = cachedGraph.graph();
+    const int64_t ndims = input.ndimension();
+    MPSShape *paddingShape = nil;
+    MPSGraphTensor* paddedTensor = cachedGraph.inputTensor;
+
+    // workaround for issue #103039644: mismatching MPS vs. CPU results
+    // when both ceilMode and includeZeroPadToAverage are True
+    const bool explicit_padding = count_include_pad && ceil_mode;
+    if (explicit_padding) {
+      std::vector<NSNumber*> padVec(ndims, @(0));
+      padVec[ndims - 1] = @(padding.size() == 1 ? padding[0] : padding[1]);
+      padVec[ndims - 2] = @(ndims > 3 ? padding[0] : 0);
+      paddingShape = [NSArray arrayWithObjects: padVec.data() count:ndims];
+      paddedTensor = [mpsGraph padTensor: cachedGraph.inputTensor
+                         withPaddingMode: MPSGraphPaddingModeZero
+                             leftPadding: paddingShape
+                            rightPadding: paddingShape
+                           constantValue: 0.0
+                                    name: nil];
+      paddedTensor = [mpsGraph identityWithTensor: paddedTensor name: nil];
+    } else {
+      desc.includeZeroPadToAverage = count_include_pad;
+    }
+    if (use_divisor) {
+      desc.includeZeroPadToAverage = YES;
+    }
+
+    if (!is_backward_pass) {
+      MPSGraphTensor* avgPoolTensor = [mpsGraph avgPooling2DWithSourceTensor: paddedTensor
+                                                                  descriptor: desc
+                                                                        name: nil];
+      if (cachedGraph.divisorTensor) {
+        // workaround: custom divisor isn't supported by MPS backend, so we scale manually
+        return [mpsGraph multiplicationWithPrimaryTensor: avgPoolTensor
+                                         secondaryTensor: cachedGraph.divisorTensor
+                                                    name: nil];
+      } else {
+        return avgPoolTensor;
+      }
+    } else { // backward pass
+      MPSGraphTensor* scaledGradTensor = cachedGraph.gradOutputTensor;
+      if (cachedGraph.divisorTensor) {
+        scaledGradTensor = [mpsGraph multiplicationWithPrimaryTensor: cachedGraph.gradOutputTensor
+                                                     secondaryTensor: cachedGraph.divisorTensor
+                                                                name: nil];
+      }
+      MPSGraphTensor* avgPoolTensor = [mpsGraph avgPooling2DGradientWithGradientTensor: scaledGradTensor
+                                                                          sourceTensor: paddedTensor
+                                                                            descriptor: desc
+                                                                                  name: nil];
+      if (explicit_padding) {
+        return [mpsGraph padGradientWithIncomingGradientTensor: avgPoolTensor
+                                                  sourceTensor: cachedGraph.inputTensor
+                                                   paddingMode: MPSGraphPaddingModeZero
+                                                   leftPadding: paddingShape
+                                                  rightPadding: paddingShape
+                                                          name: nil];
+
+      } else {
+        return avgPoolTensor;
+      }
+    }
+  };
+
+  pool2d_template(input, output, c10::nullopt, grad_output_opt, kernel_size, stride,
+                  padding, {1, 1}, ceil_mode, count_include_pad, divisor_override,
+                  pooling_op_block, op_name);
+}
+
 } // namespace mps
 
-Tensor _mps_max_pool2d(
+Tensor mps_max_pool2d(
     const Tensor& input,
     IntArrayRef kernel_size,
     IntArrayRef stride,
@@ -207,7 +324,7 @@ Tensor _mps_max_pool2d(
                                              name: nil];
   };
   mps::pool2d_template(input, output, c10::nullopt, c10::nullopt, kernel_size, stride,
-                       padding, dilation, ceil_mode, c10::nullopt, pooling_op_block, "max_pool2d");
+                       padding, dilation, ceil_mode, false, c10::nullopt, pooling_op_block, "max_pool2d");
 
   return output;
 }
@@ -230,7 +347,7 @@ Tensor mps_max_pool2d_backward(
                                                        name: nil];
   };
   mps::pool2d_template(input, grad_input, c10::nullopt, grad_output, kernel_size, stride,
-                       padding, dilation, ceil_mode, c10::nullopt, pooling_op_block, "max_pool2d_backward");
+                       padding, dilation, ceil_mode, false, c10::nullopt, pooling_op_block, "max_pool2d_backward");
 
   return grad_input;
 }
@@ -245,6 +362,8 @@ Tensor mps_max_pool2d_backward(
     const Tensor& output,
     const Tensor& indices) {
 
+  auto indices_memory_format = indices.suggest_memory_format();
+
   mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
     MPSGraph* mpsGraph = cachedGraph.graph();
     NSArray<MPSGraphTensor*>* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor: cachedGraph.inputTensor
@@ -254,7 +373,11 @@ Tensor mps_max_pool2d_backward(
     return poolOutputs[0];
   };
   mps::pool2d_template(input, output, indices, c10::nullopt, kernel_size, stride,
-                       padding, dilation, ceil_mode, c10::nullopt, pooling_op_block, "max_pool2d_indices");
+                       padding, dilation, ceil_mode, false, c10::nullopt, pooling_op_block, "max_pool2d_indices");
+
+  if (indices_memory_format == MemoryFormat::ChannelsLast) {
+    const_cast<Tensor&>(indices) = indices.to(MemoryFormat::ChannelsLast);
+  }
 }
 
 TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_mps)(
@@ -276,40 +399,24 @@ Tensor mps_max_pool2d_backward(
                                                        name: nil];
   };
   mps::pool2d_template(input, grad_input, indices, grad_output, kernel_size, stride,
-                       padding, dilation, ceil_mode, c10::nullopt, pooling_op_block, "max_pool2d_indices_backward");
+                       padding, dilation, ceil_mode, false, c10::nullopt, pooling_op_block, "max_pool2d_indices_backward");
 }
 
 TORCH_IMPL_FUNC(avg_pool2d_out_mps) (
-   const Tensor& input,
-   int64_t kH,
-   int64_t kW,
-   int64_t dH,
-   int64_t dW,
-   int64_t padH,
-   int64_t padW,
-   bool ceil_mode,
-   bool count_include_pad,
-   c10::optional<int64_t> divisor_override,
+    const Tensor& input,
+    int64_t kH,
+    int64_t kW,
+    int64_t dH,
+    int64_t dW,
+    int64_t padH,
+    int64_t padW,
+    bool ceil_mode,
+    bool count_include_pad,
+    c10::optional<int64_t> divisor_override,
    const Tensor& output) {
 
-  const bool use_divisor = divisor_override.has_value() && divisor_override.value() != 0;
-  float divisor = use_divisor ? float(kH * kW) / (float) divisor_override.value() : 1.0f;
-  count_include_pad = use_divisor ? use_divisor : count_include_pad;
-
-  mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
-    MPSGraph* mpsGraph = cachedGraph.graph();
-    desc.includeZeroPadToAverage = count_include_pad;
-    MPSGraphTensor* avgPoolTensor = [mpsGraph avgPooling2DWithSourceTensor: cachedGraph.inputTensor
-                                                                descriptor: desc
-                                                                      name: nil];
-    // workaround: custom divisor isn't supported by MPS backend, so we scale manually
-    return [mpsGraph multiplicationWithPrimaryTensor: avgPoolTensor
-                                     secondaryTensor: cachedGraph.divisorTensor
-                                                name: nil];
-  };
-  mps::pool2d_template(input, output, c10::nullopt, c10::nullopt, {kH, kW}, {dH, dW},
-                       {padH, padW}, {1, 1}, ceil_mode, divisor, pooling_op_block,
-                       std::string("avg_pool2d") + (count_include_pad ? "_include_pad" : ""));
+  mps::avg_pool2d_template(input, output, c10::nullopt, {kH, kW}, {dH, dW}, {padH, padW},
+                           {1, 1}, ceil_mode, count_include_pad, divisor_override, "avg_pool2d");
 }
 
 TORCH_IMPL_FUNC(avg_pool2d_backward_out_mps) (
@@ -323,25 +430,8 @@ Tensor mps_max_pool2d_backward(
     c10::optional<int64_t> divisor_override,
     const Tensor& gradInput) {
 
-  const bool use_divisor = divisor_override.has_value() && divisor_override.value() != 0;
-  float divisor = use_divisor ? float(kernel_size[0] * kernel_size[1]) / (float) divisor_override.value() : 1.0f;
-  count_include_pad = use_divisor ? use_divisor : count_include_pad;
-
-  mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
-    MPSGraph* mpsGraph = cachedGraph.graph();
-    desc.includeZeroPadToAverage = count_include_pad;
-    // workaround: custom divisor isn't supported by MPS backend, so we scale manually
-    MPSGraphTensor* scaledGradTensor = [mpsGraph multiplicationWithPrimaryTensor: cachedGraph.gradOutputTensor
-                                                                 secondaryTensor: cachedGraph.divisorTensor
-                                                                            name: nil];
-    return [mpsGraph avgPooling2DGradientWithGradientTensor: scaledGradTensor
-                                               sourceTensor: cachedGraph.inputTensor
-                                                 descriptor: desc
-                                                       name: nil];
-  };
-  mps::pool2d_template(input, gradInput, c10::nullopt, gradOutput, kernel_size, stride,
-                       padding, {1, 1}, ceil_mode, divisor, pooling_op_block,
-                       std::string("avg_pool2d_backward") + (count_include_pad ? "_include_pad" : ""));
+  mps::avg_pool2d_template(input, gradInput, gradOutput, kernel_size, stride, padding,
+                           {1, 1}, ceil_mode, count_include_pad, divisor_override, "avg_pool2d_backward");
 }
 
 } // namespace at::native
diff --git a/aten/src/ATen/native/mps/operations/RangeFactories.mm b/aten/src/ATen/native/mps/operations/RangeFactories.mm
index cace3ad4e132..9cfd14236219 100644
--- a/aten/src/ATen/native/mps/operations/RangeFactories.mm
+++ b/aten/src/ATen/native/mps/operations/RangeFactories.mm
@@ -87,6 +87,82 @@
       }
       result.resize_({size});
     }
+
+    if (result.numel() == 0) {
+      return;
+    }
+
+    bool is_contiguous = result.is_contiguous();
+    Tensor r = !is_contiguous ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT) : result;
+    using namespace mps;
+    auto cache_ = MPSGraphCache::getInstance();
+    auto stream = getCurrentMPSStream();
+    auto mpsDataType = getMPSDataType(result.scalar_type());
+    @autoreleasepool {
+      string key = "arange_mps_out" + getTensorsStringKey({result}) + ":" + to_string(size);
+      auto cachedGraph = static_cast<RangeCachedGraph *>(cache_->LookUp(key));
+      if (!cachedGraph) {
+        auto *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph *() {
+          auto mpsGraph = make_mps_graph();
+          return new RangeCachedGraph(mpsGraph, mpsDataType, size);
+        });
+        cachedGraph = static_cast<RangeCachedGraph *>(tmpCachedGraph);
+      }
+      Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, r);
+      NSMutableDictionary *feeds   = [[NSMutableDictionary new] autorelease];
+      MPSScalar startScalar = getMPSScalar(start, result.scalar_type());
+      feeds[cachedGraph->startTensor] = getMPSGraphTensorFromScalar(stream, startScalar);
+      MPSScalar stepScalar = getMPSScalar(step, result.scalar_type());
+      feeds[cachedGraph->multiplyTensor] = getMPSGraphTensorFromScalar(stream, stepScalar);
+
+      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+        outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+      };
+      runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    }
+
+    if(!is_contiguous) {
+      result.copy_(r);
+    }
+  });
+
+  return result;
+}
+
+Tensor& range_mps_out(const Scalar& start, const Scalar& end, const Scalar& step, Tensor& result) {
+  AT_DISPATCH_MPS_TYPES(result.scalar_type(), "arange_mps", [&]() {
+    using accscalar_t = at::acc_type<scalar_t, true>;
+    auto xstart = start.to<accscalar_t>();
+    auto xend = end.to<accscalar_t>();
+    auto xstep = step.to<accscalar_t>();
+
+    // double size_d = ((xend - xstart) / xstep) + 1;
+    double size_d;
+    if (std::is_same<scalar_t, int64_t>::value) {
+      size_d = static_cast<double>(end.to<accscalar_t>() - start.to<accscalar_t>())
+                / step.to<accscalar_t>() + 1;
+    } else {
+      size_d = static_cast<double>(end.to<double>() - start.to<double>())
+                / step.to<double>() + 1;
+    }
+
+    TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+    TORCH_CHECK(std::isfinite(static_cast<double>(xstart)) &&
+              std::isfinite(static_cast<double>(xend)),
+              "unsupported range: ", xstart, " -> ", xend);
+    TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
+              "upper bound and larger bound inconsistent with step sign");
+
+    TORCH_CHECK(size_d >= 0 && size_d <= static_cast<double>(std::numeric_limits<int64_t>::max()),
+              "invalid size, possible overflow?");
+
+    int64_t size = static_cast<int64_t>(size_d);
+
+    int64_t numel = result.numel();
+
+    if (numel != size) {
+      result.resize_({size});
+    }
     bool is_contiguous = result.is_contiguous();
     Tensor r = !is_contiguous ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT) : result;
     using namespace mps;
diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index 816bf5bcacbb..583d12dde877 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -31,7 +31,8 @@
   PROD,
   MEAN,
   COUNT_NONZERO,
-  TRACE
+  TRACE,
+  NANSUM,
 };
 
 using namespace mps;
@@ -138,6 +139,10 @@ void reduction_out_mps(
   MPSReductionType reduction_type,
   const std::string& func_name) {
 
+  // issue 103641234, reduction ops does not have int64 support
+  if (input_t.scalar_type() == ScalarType::Long) {
+    TORCH_WARN_ONCE("MPS: no support for int64 reduction ops, casting it to int32");
+  }
   IntArrayRef input_shape = input_t.sizes();
 
   if (opt_dim.has_value()) {
@@ -162,6 +167,9 @@ void reduction_out_mps(
     if (reduction_type == MPSReductionType::PROD) {
       output_t.fill_(1);
     }
+    else if (reduction_type == MPSReductionType::SUM) {
+      output_t.zero_();
+    }
     return;
   }
 
@@ -196,7 +204,10 @@ void reduction_out_mps(
              (dtype.value() == kFloat || dtype.value() == kHalf || dtype.value() == kInt)) {
             inputCastDtype = getMPSDataType(dtype.value());
           } else if (input_type != MPSDataTypeInt32   &&
-                     input_type != MPSDataTypeFloat32) {
+                     input_type != MPSDataTypeFloat32 &&
+                     input_type != MPSDataTypeFloat16) {
+            inputCastDtype = MPSDataTypeFloat32;
+          } else if (!is_macos_13_or_newer() && input_type == MPSDataTypeFloat16) {
             inputCastDtype = MPSDataTypeFloat32;
           }
 
@@ -240,13 +251,29 @@ void reduction_out_mps(
                                                                axes:wrappedAxes
                                                                name:nil];
           } else if (reduction_type == MPSReductionType::TRACE) {
-            MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:inputTensor
+            MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:castInputTensor
                                                                      numLower:0
                                                                      numUpper:0
                                                                          name:nil];
             castOutputTensor = [mpsGraph reductionSumWithTensor:bandPartWithTensor
                                                            axes:@[@0, @1]
                                                            name:nil];
+          } else if (reduction_type == MPSReductionType::NANSUM) {
+            // Create a 0 tensor of the same shape as inputTensor
+            MPSGraphTensor* zeros = [mpsGraph constantWithScalar:0.0
+                                                        dataType:castInputTensor.dataType];
+            // Find NaNs
+            MPSGraphTensor* nanMask = [mpsGraph isNaNWithTensor:castInputTensor
+                                                           name:nil];
+            // Replace NaNs with 0
+            MPSGraphTensor* nanReplaced = [mpsGraph selectWithPredicateTensor:nanMask
+                                                          truePredicateTensor:zeros
+                                                         falsePredicateTensor:castInputTensor
+                                                                         name:nil];
+            // Sum
+            castOutputTensor = [mpsGraph reductionSumWithTensor:nanReplaced
+                                                           axes:wrappedAxes
+                                                           name:nil];
           }
 
           MPSGraphTensor* outputTensor = nil;
@@ -289,10 +316,37 @@ void reduction_out_mps(
   reduction_out_mps(input_t, opt_dim, keepdim, dtype, output_t, MPSReductionType::SUM, "sum_out_mps");
 }
 
+Tensor& nansum_out_mps(
+    const Tensor& self,
+    OptionalIntArrayRef dim,
+    bool keepdim,
+    c10::optional<ScalarType> opt_dtype,
+    Tensor& result) {
+  TORCH_CHECK(!c10::isComplexType(self.scalar_type()), "nansum does not support complex inputs");
+  if (c10::isIntegralType(self.scalar_type(), true)){
+    return at::sum_out(result, self, dim, keepdim, opt_dtype);
+  }
+  ScalarType dtype = get_dtype_from_result(result, opt_dtype);
+  const auto mask = make_dim_mask(dim, self.dim());
+  resize_reduction_result(result, self, mask, keepdim, dtype);
+  reduction_out_mps(self, dim, keepdim, dtype, result, MPSReductionType::NANSUM, "nansum_out_mps");
+  return result;
+}
+
+Tensor nansum_mps(
+    const Tensor& self,
+    OptionalIntArrayRef dim,
+    bool keepdim,
+    c10::optional<ScalarType> opt_dtype) {
+  ScalarType dtype = get_dtype_from_self(self, opt_dtype, true);
+  Tensor result = create_reduction_result(self, dim, keepdim, dtype);
+  return nansum_out_mps(self, dim, keepdim, dtype, result);
+}
+
 Tensor trace_mps_out(const Tensor& self) {
   Tensor output_t = at::native::empty_mps(
                     {},
-                    self.scalar_type(),
+                    get_dtype_from_self(self, c10::nullopt, true),
                     c10::nullopt,
                     kMPS,
                     c10::nullopt,
@@ -316,22 +370,6 @@ Tensor trace_mps_out(const Tensor& self) {
   reduction_out_mps(input_t, IntArrayRef(dims, 1), keepdim, dtype, output_t, MPSReductionType::PROD, "prod_out_mps");
 }
 
-// Taken from ReduceOps.cpp
-inline ScalarType get_dtype_from_self(
-    const Tensor& self,
-    const c10::optional<ScalarType>& dtype,
-    bool promote_integers) {
-  if (dtype.has_value()) {
-    return dtype.value();
-  }
-
-  ScalarType src_type = self.scalar_type();
-  if (promote_integers && at::isIntegralType(src_type, /*includeBool=*/true)) {
-    return kLong;
-  }
-  return src_type;
-}
-
 TORCH_IMPL_FUNC(amax_out_mps)(
   const Tensor& input_t,
   IntArrayRef dim,
@@ -681,7 +719,7 @@ Tensor _cdist_forward_mps(const Tensor& x1, const Tensor& x2, const double p, c1
 Tensor std_var_common_impl_mps(
   const Tensor & input_t,
   at::OptionalIntArrayRef dim,
-  c10::optional<int64_t> correction,
+  const c10::optional<Scalar>& correction,
   bool keepdim,
   StdVarType stdVarType) {
   using CachedGraph = MPSUnaryCachedGraph;
@@ -701,8 +739,8 @@ Tensor std_var_common_impl_mps(
     }
   }
 
-  bool use_correction = !(correction.has_value() && correction.value() == 0);
-  const auto correction_value = correction.value_or(1);
+  bool use_correction = !(correction.has_value() && correction.value().toDouble() == 0);
+  const auto correction_value = correction.value_or(1.0).toDouble();
   int64_t correction_n = 1;
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
@@ -822,7 +860,8 @@ Tensor std_var_common_impl_mps(
     return output_t;
   }
 
-  double bessel_correction = static_cast<double>(correction_n) / static_cast<double>(correction_n - correction_value);
+  double dof = std::max(0.0, correction_n - correction_value);
+  double bessel_correction = correction_n / dof;
   auto stream = at::mps::getCurrentMPSStream();
 
   @autoreleasepool {
@@ -893,7 +932,7 @@ Tensor std_var_common_impl_mps(
 Tensor var_mps(
   const Tensor & input_t,
   at::OptionalIntArrayRef dim,
-  c10::optional<int64_t> correction,
+  const c10::optional<Scalar>& correction,
   bool keepdim)
 {
   return std_var_common_impl_mps(input_t, dim, correction, keepdim, STANDARD_VARIANCE);
@@ -902,7 +941,7 @@ Tensor var_mps(
 Tensor std_mps(
    const Tensor & input_t,
    at::OptionalIntArrayRef dim,
-   c10::optional<int64_t> correction,
+   const c10::optional<Scalar>& correction,
    bool keepdim)
 {
   return std_var_common_impl_mps(input_t, dim, correction, keepdim, STANDARD_DEVIATION);
@@ -995,7 +1034,13 @@ Tensor std_mps(
 
 TORCH_IMPL_FUNC(any_all_out_mps)(const Tensor& input_t, const Tensor& output_t) {
   using CachedGraph = MPSUnaryCachedGraph;
-  if (output_t.numel() == 0 || input_t.numel() == 0) {
+  if (input_t.numel() == 0) {
+    output_t.zero_();
+    return;
+  } else if (input_t.numel() == 1) {
+    output_t.copy_(input_t.view_as(output_t).to(at::kBool));
+    return;
+  } else if (output_t.numel() == 0) {
     return;
   }
 
@@ -1223,7 +1268,9 @@ Tensor std_mps(
   (const Tensor& input_t,
    MPSReductionType reduction_type,
    const std::string& func_name) {
-  TORCH_CHECK(input_t.scalar_type() != ScalarType::Long, "MPS does not support min/max ops with int64 input");
+  if (input_t.scalar_type() == ScalarType::Long) {
+    TORCH_WARN_ONCE("MPS: no support for int64 min/max ops, casting it to int32");
+  }
 
   using CachedGraph = MPSUnaryCachedGraph;
 
@@ -1252,6 +1299,7 @@ Tensor std_mps(
 
           MPSGraphTensor* outputTensor = nil;
           MPSGraphTensor* castInputTensor = nil;
+          MPSGraphTensor* castOutputTensor = nil;
 
           if (input_t.scalar_type() != ScalarType::Float &&
               input_t.scalar_type() != ScalarType::Int   &&
@@ -1274,8 +1322,15 @@ Tensor std_mps(
                                                            name:nil];
           }
 
+          if(input_t.scalar_type() == ScalarType::Long) {
+            castOutputTensor =  [mpsGraph castTensor:outputTensor
+                                             toType:MPSDataTypeInt64
+                                               name:@"castInputTensor"];
+          } else {
+            castOutputTensor = outputTensor;
+          }
           newCachedGraph->inputTensor_ = inputTensor;
-          newCachedGraph->outputTensor_ = outputTensor;
+          newCachedGraph->outputTensor_ = castOutputTensor;
         }
         return newCachedGraph;
       });
@@ -1368,42 +1423,46 @@ Tensor min_mps(const Tensor& input_t) {
 
           MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
           MPSGraphTensor* outputTensor = nil;
-          if (reduction_type == MPSReductionType::MAX) {
-            outputTensor = [mpsGraph reductionMaximumWithTensor:inputTensor
-                                                           axis:(NSInteger)dim_
-                                                           name:nil];
-          } else if (reduction_type == MPSReductionType::MIN) {
-            outputTensor = [mpsGraph reductionMinimumWithTensor:inputTensor
-                                                           axis:(NSInteger)dim_
-                                                           name:nil];
-          }
 
-          MPSGraphTensor* castInputTensor = nil;
-
-          if (input_t.scalar_type() != ScalarType::Float &&
-              input_t.scalar_type() != ScalarType::Int   &&
-              input_t.scalar_type() != ScalarType::Half) {
+          MPSGraphTensor* castInputTensor = inputTensor;
+          bool castOutput = false;
+          if(input_t.scalar_type() != ScalarType::Float &&
+             input_t.scalar_type() != ScalarType::Int   &&
+             input_t.scalar_type() != ScalarType::Half) {
             castInputTensor =  [mpsGraph castTensor:inputTensor
                                              toType:MPSDataTypeInt32
                                                name:@"castInputTensor"];
-          } else {
-            castInputTensor = inputTensor;
+            castOutput = true;
           }
 
+          if(reduction_type == MPSReductionType::MAX)
+            outputTensor = [mpsGraph reductionMaximumWithTensor:castInputTensor
+                                                           axis:(NSInteger)dim_
+                                                           name:nil];
+          else if(reduction_type == MPSReductionType::MIN)
+            outputTensor = [mpsGraph reductionMinimumWithTensor:castInputTensor
+                                                           axis:(NSInteger)dim_
+                                                           name:nil];
+
           MPSGraphTensor* argreduceOutTensor = nil;
-          if (reduction_type == MPSReductionType::MAX) {
-            argreduceOutTensor = [mpsGraph reductionArgMaximumWithTensor: castInputTensor
-                                                                    axis: (NSInteger)dim_
-                                                                    name: @"argmax_out"];
-          } else if (reduction_type == MPSReductionType::MIN) {
-            argreduceOutTensor = [mpsGraph reductionArgMinimumWithTensor: castInputTensor
-                                                                    axis: (NSInteger)dim_
-                                                                    name: @"argmax_out"];
+          if(reduction_type == MPSReductionType::MAX)
+            argreduceOutTensor = [mpsGraph reductionArgMaximumWithTensor:castInputTensor
+                                                                    axis:(NSInteger)dim_
+                                                                    name:@"argmax_out"];
+          else if(reduction_type == MPSReductionType::MIN)
+            argreduceOutTensor = [mpsGraph reductionArgMinimumWithTensor:castInputTensor
+                                                                    axis:(NSInteger)dim_
+                                                                    name:@"argmax_out"];
+
+          MPSGraphTensor *indicesTensor = [mpsGraph castTensor:argreduceOutTensor
+                                                        toType:MPSDataTypeInt64
+                                                          name:@"cast_out"];
+
+          if (castOutput) {
+            outputTensor = [mpsGraph castTensor:outputTensor
+                                         toType:getMPSDataType(output_t.scalar_type())
+                                           name:@"cast_out"];
           }
-          MPSGraphTensor *indicesTensor = [mpsGraph castTensor: argreduceOutTensor
-                                                        toType: MPSDataTypeInt64
-                                                          name: @"cast_out"];
-
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->outputTensor_ = outputTensor;
           newCachedGraph->indicesTensor_ = indicesTensor;
@@ -1723,11 +1782,21 @@ Tensor median_mps(const Tensor& input_t) {
         @autoreleasepool {
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
-
           auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
           auto reshapedTensor = [mpsGraph reshapeTensor: inputTensor
                                               withShape: @[@-1]
                                                    name: nil];
+          MPSDataType dataType = [inputTensor dataType];
+          // #issue 104398441 sortWithTensor only supports following types, cast if necessary
+          if (dataType != MPSDataTypeInt32 &&
+              dataType != MPSDataTypeFloat32 &&
+              dataType != MPSDataTypeFloat16) {
+              dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
+              reshapedTensor = [mpsGraph castTensor:reshapedTensor
+                                      toType:dataType
+                                        name:@"castReshapedTensor"];
+          }
+
           auto sortedTensor = [mpsGraph sortWithTensor: reshapedTensor
                                                   axis: ((NSUInteger) (int)0)
                                                   name: nil];
@@ -1807,7 +1876,7 @@ void median_out_mps(
   auto stream = at::mps::getCurrentMPSStream();
 
   @autoreleasepool {
-    string key = func_name + ":" + to_string(dim_) + ":" + getTensorsStringKey(input_t);
+    string key = func_name + ":" + to_string(dim_) + ":" + getTensorsStringKey(input_t) + ":" + getTensorsStringKey(indices_t);
     CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
     if (!cachedGraph) {
@@ -1819,24 +1888,39 @@ void median_out_mps(
           auto mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
-          auto sortedTensor = [mpsGraph sortWithTensor: inputTensor
-                                                  axis: (NSUInteger)dim_
-                                                  name: nil];
-          const NSUInteger midpoint = (dim_total_elements + 1) / 2 - 1;
-          auto outputTensor = [mpsGraph sliceTensor:sortedTensor
-                                          dimension:dim_
-                                              start:midpoint
-                                             length:1
-                                               name:nil];
-          auto argreduceOutTensor = [mpsGraph argSortWithTensor:inputTensor
-                                                           axis:(NSInteger)dim_
-                                                           name:@"argmax_out"];
-          auto argOutputTensor = [mpsGraph sliceTensor:argreduceOutTensor
-                                             dimension:dim_
-                                                 start:midpoint
-                                                length:1
-                                                  name:nil];
+          MPSGraphTensor* inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(input_t.scalar_type()));
+          MPSGraphTensor* outputTensor = nil;
+          MPSGraphTensor* castInputTensor = inputTensor;
+          MPSDataType dataType = getMPSDataType(input_t.scalar_type());
+          // #issue 104398441 sortWithTensor only supports following types, cast if necessary
+          if (dataType != MPSDataTypeInt32 &&
+              dataType != MPSDataTypeFloat32 &&
+              dataType != MPSDataTypeFloat16) {
+              dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
+              castInputTensor = [mpsGraph castTensor:inputTensor
+                                      toType:dataType
+                                        name:@"castInputTensor"];
+          }
+
+          MPSGraphTensor * sortedTensor = [mpsGraph
+                                              sortWithTensor:castInputTensor
+                                              axis:((NSUInteger) (int)dim_)
+                                              name:nil];
+
+          outputTensor = [mpsGraph sliceTensor:sortedTensor
+                                                    dimension:dim_
+                                                    start:((NSUInteger) (int)((dim_total_elements+1)/2 ) - 1)
+                                                    length:1
+                                                    name:nil];
+          MPSGraphTensor* argreduceOutTensor = nil;
+            argreduceOutTensor = [mpsGraph argSortWithTensor:castInputTensor
+                                                                    axis:(NSInteger)dim_
+                                                                    name:@"argmax_out"];
+          MPSGraphTensor* argOutputTensor = [mpsGraph sliceTensor:argreduceOutTensor
+                                                    dimension:dim_
+                                                    start:((NSUInteger) (int)((dim_total_elements+1)/2 ) - 1)
+                                                    length:1
+                                                    name:nil];
 
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->outputTensor_ = outputTensor;
@@ -1906,7 +1990,7 @@ void median_out_mps(
   int64_t num_input_dims = input_shape.size();
   NSMutableArray<NSNumber*> *apparent_out_shape = nil;
   // Use this if keepdim is false
-  int64_t num_output_dims = num_input_dims - 1;
+  int64_t num_output_dims = num_input_dims - 1 < 0 ? 0 : num_input_dims - 1;
 
   std::vector<int64_t> vec_apparent_out_shape(num_input_dims);
   std::vector<int64_t> vec_out_shape(num_output_dims);
diff --git a/aten/src/ATen/native/mps/operations/Repeat.mm b/aten/src/ATen/native/mps/operations/Repeat.mm
index 4311769d9b64..d2155d2e7fe0 100644
--- a/aten/src/ATen/native/mps/operations/Repeat.mm
+++ b/aten/src/ATen/native/mps/operations/Repeat.mm
@@ -6,8 +6,10 @@
 
 #include <ATen/mps/MPSStream.h>
 #include <ATen/native/LinearAlgebraUtils.h>
+#include <ATen/native/Repeat.h>
 #include <ATen/native/mps/OperationUtils.h>
 #include <torch/library.h>
+#include <fmt/format.h>
 
 #ifdef __OBJC__
 #include <MetalPerformanceShaders/MetalPerformanceShaders.h>
@@ -71,6 +73,16 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) {
   }
 
   auto stream = at::mps::getCurrentMPSStream();
+  auto inputDataType = getMPSDataType(expanded_tensor.scalar_type());
+  auto outputDataType = getMPSDataType(result.scalar_type());
+  if (!is_macos_13_or_newer()) {
+     if (expanded_tensor.scalar_type() == kBool) {
+      inputDataType = MPSDataTypeInt8;
+     }
+     if (result.scalar_type() == kBool) {
+      outputDataType = MPSDataTypeInt8;
+     }
+  }
 
   @autoreleasepool {
     string key = "repeat_mps:" + getTensorsStringKey(self) + ":" + getArrayRefString(repeats);
@@ -84,7 +96,7 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) {
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, expanded_tensor);
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, inputDataType, getMPSShape(expanded_tensor));
           MPSGraphTensor* outputTensor = [mpsGraph tileTensor:inputTensor
                                                withMultiplier:getMPSShape(repeats)
                                                          name:nil];
@@ -97,8 +109,10 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) {
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, expanded_tensor);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result);
+    Placeholder selfPlaceholder = Placeholder(
+      cachedGraph->inputTensor_, expanded_tensor, /*mpsShape=*/nil, /*gatherTensorData=*/true, inputDataType);
+    Placeholder outputPlaceholder = Placeholder(
+      cachedGraph->outputTensor_, result, /*mpsShape=*/nil, /*gatherTensorData*/false, outputDataType);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
@@ -113,4 +127,120 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) {
   return result;
 }
 
-} // namespace at:;native
+static const char* METAL_REPEAT_INTERLEAVE = R"METAL_REPEAT(
+kernel void repeat_interleave(constant {0}     * repeat_ptr                [[buffer(0)]],
+                              constant int64_t * cumsum_ptr                [[buffer(1)]],
+                              device {0}       * result_ptr                [[buffer(2)]],
+                              uint               threads_per_threadgroup   [[threads_per_threadgroup]],
+                              uint               tid                       [[thread_position_in_grid]]) {{
+  int64_t end = cumsum_ptr[tid];
+  {0} repeat = repeat_ptr[tid];
+  int64_t start = end - repeat;
+  for (uint j = start; j < end; j++) {{
+    result_ptr[j] = tid;
+  }}
+}}
+)METAL_REPEAT";
+
+static
+id<MTLLibrary> compileRepeatInterleaveLib(id<MTLDevice> device, const std::string& t1) {
+  auto key = t1;
+  static std::unordered_map<std::string, id<MTLLibrary>> libMap;
+  auto it = libMap.find(key);
+  if (it != libMap.end()) {
+    return it->second;
+  }
+  NSError *error = nil;
+  MTLCompileOptions *options = [[MTLCompileOptions new] autorelease];
+  [options setLanguageVersion: MTLLanguageVersion2_3];
+  auto rc = [device newLibraryWithSource:[NSString stringWithUTF8String:fmt::format(METAL_REPEAT_INTERLEAVE, t1).c_str()]
+                                 options:options
+                                   error:&error];
+ TORCH_CHECK(rc != nil && error == nil, "Failed to compile library: ", [[error localizedDescription] UTF8String]);
+ libMap[key] = rc;
+ return rc;
+}
+
+static
+id<MTLComputePipelineState> getPipelineState(id<MTLDevice> device, const std::string& t1) {
+  static std::string kernel = "repeat_interleave";
+  auto key = kernel + t1;
+  static std::unordered_map<std::string, id<MTLComputePipelineState>> cplMap;
+  auto it = cplMap.find(key);
+  if (it != cplMap.end()) {
+     return it->second;
+  }
+  NSError *error = nil;
+  auto library = compileRepeatInterleaveLib(device, t1);
+  id<MTLFunction> func = [library newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]];
+  TORCH_CHECK(func != nil, "Can't get kernel ", kernel);
+  auto rc = [device newComputePipelineStateWithFunction:func error:&error];
+  TORCH_CHECK(rc != nil && error == nil, "Failed to construct pipeline state: ", [[error localizedDescription] UTF8String]);
+  cplMap[key] = rc;
+  return rc;
+}
+
+template <typename index_t>
+void computeRepeatIndices(
+  index_t* repeat_ptr,
+  int64_t* cumsum_ptr,
+  index_t* result_ptr,
+  int64_t size,
+  int64_t result_size) {
+  id<MTLBuffer> repeatBuffer = reinterpret_cast<id<MTLBuffer>>(repeat_ptr);
+  id<MTLBuffer> cumsumBuffer = reinterpret_cast<id<MTLBuffer>>(cumsum_ptr);
+  id<MTLBuffer> resultBuffer = reinterpret_cast<id<MTLBuffer>>(result_ptr);
+  TORCH_CHECK(repeatBuffer && cumsumBuffer && resultBuffer);
+
+  std::string scalar_type;
+  if (typeid(index_t) == typeid(int32_t)) {
+    scalar_type = "int32_t";
+  } else if (typeid(index_t) == typeid(int64_t)) {
+    scalar_type = "int64_t";
+  } else {
+    TORCH_CHECK(false, "repeat_interleave: unsupported indexing data type");
+  }
+
+  MPSStream* mpsStream = getCurrentMPSStream();
+  dispatch_sync(mpsStream->queue(), ^() {
+    @autoreleasepool {
+      id<MTLCommandBuffer> commandBuffer = mpsStream->commandBuffer();
+      id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
+      id<MTLComputePipelineState> pipelineState = getPipelineState(MPSDevice::getInstance()->device(), scalar_type);
+
+      [computeEncoder setComputePipelineState: pipelineState];
+      [computeEncoder setBuffer:repeatBuffer offset:0 atIndex:0];
+      [computeEncoder setBuffer:cumsumBuffer offset:0 atIndex:1];
+      [computeEncoder setBuffer:resultBuffer offset:0 atIndex:2];
+      [computeEncoder setBytes:&size length:sizeof(size) atIndex:3];
+      MTLSize gridSize = MTLSizeMake(size, 1, 1);
+      NSUInteger threadsPerThreadgroup_ = pipelineState.maxTotalThreadsPerThreadgroup;
+      if (threadsPerThreadgroup_ > size) {
+          threadsPerThreadgroup_ = size;
+      }
+      MTLSize threadsPerThreadgroup = MTLSizeMake(threadsPerThreadgroup_, 1, 1);
+
+      [computeEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadsPerThreadgroup];
+      [computeEncoder endEncoding];
+      mpsStream->synchronize(SyncType::COMMIT_AND_CONTINUE);
+    }
+  });
+}
+
+Tensor repeat_interleave_mps(const Tensor& repeat_, c10::optional<int64_t> output_size) {
+  Tensor output;
+  Tensor repeat = repeat_;
+  if (repeat.scalar_type() == kLong) {
+    // #103810551: `repeat_interleave_common` uses cumsum to calculate the final shape of output,
+    // which currently doesn't support int64_t as input. Casting internally the indices to int32_t.
+    TORCH_WARN_ONCE("MPS: no support for int64 repeats mask, casting it to int32");
+    repeat = repeat.to(kInt);
+  }
+  AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() {
+    output = repeat_interleave_common<index_t, computeRepeatIndices<index_t>>(
+        repeat, output_size);
+  });
+  return output;
+}
+
+}  // namespace at::native
diff --git a/aten/src/ATen/native/mps/operations/RnnOps.mm b/aten/src/ATen/native/mps/operations/RnnOps.mm
index d46ce356318e..9e59a6cf7021 100644
--- a/aten/src/ATen/native/mps/operations/RnnOps.mm
+++ b/aten/src/ATen/native/mps/operations/RnnOps.mm
@@ -23,17 +23,31 @@
     return output_dimensions;
 }
 
-std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
+std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> _lstm_mps(const Tensor& input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
     using namespace mps;
+
+    //Projections are not currently supported, raise an error if needed
+    bool has_projections = (hx[0].size(2) != hx[1].size(2));
+    if(has_projections) {
+        AT_ERROR("LSTM with projections is not currently supported with MPS.");
+    }
+
+    TORCH_CHECK(!(!is_macos_13_or_newer() && num_layers > 1), "Multi-layer LSTM support in MPS available only on MacOS 13 onwards");
+
     std::vector<Tensor> kernel_weights;
     std::vector<Tensor> recurrent_kernel_weights;
     std::vector<Tensor> biases;
     std::vector<Tensor> recurrent_biases;
     for (size_t i = 0; i < num_layers; i+=1) {
-        kernel_weights.push_back(params[i*4]);
-        recurrent_kernel_weights.push_back(params[i*4+1]);
-        biases.push_back(params[i*4+2]);
-        recurrent_biases.push_back(params[i*4+3]);
+        if (has_biases) {
+            kernel_weights.push_back(params[i*4]);
+            recurrent_kernel_weights.push_back(params[i*4+1]);
+            biases.push_back(params[i*4+2]);
+            recurrent_biases.push_back(params[i*4+3]);
+        } else {
+            kernel_weights.push_back(params[i*2]);
+            recurrent_kernel_weights.push_back(params[i*2+1]);
+        }
     }
 
     struct CachedGraph : public MPSCachedGraph {
@@ -44,8 +58,6 @@
       NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList_ = nil;
       NSMutableArray<MPSGraphTensor*> *biasList_ = nil;
       NSMutableArray<MPSGraphTensor*> *recurrentBiasList_ = nil;
-      std::vector<MPSGraphTensor*> outputCellStateFwdVector_;
-      std::vector<MPSGraphTensor*> outputZStateVector_;
     };
 
     MPSGraphCache* cache_ = MPSGraphCache::getInstance();
@@ -67,12 +79,15 @@
             NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList = [[NSMutableArray alloc] initWithCapacity:params.size()];
             NSMutableArray<MPSGraphTensor*> *kernelBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()];
             NSMutableArray<MPSGraphTensor*> *recurrentBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()];
+            NSMutableArray<MPSGraphTensor*> *layersOutputsList = [[NSMutableArray alloc] initWithCapacity:num_layers];
 
             for (size_t i = 0; i < num_layers; i += 1) {
                 [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))];
                 [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))];
-                [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
-                [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                if(has_biases) {
+                    [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
+                    [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                }
             }
 
             MPSGraphLSTMDescriptor * opDesc = [MPSGraphLSTMDescriptor descriptor];
@@ -93,25 +108,28 @@
             }
 
             MPSGraphTensor* inputTensor_ = inputTensor;
-            MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor
-                                                        dimension:0
-                                                        start:0
-                                                        length:1
-                                                        name:nil];
-            MPSGraphTensor* cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor
-                                                                dimension:0
-                                                                start:0
-                                                                length:1
-                                                                name:nil];
             NSArray<MPSGraphTensor*>* outputs = nil;
             NSMutableArray<MPSGraphTensor*>* outputStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
             NSMutableArray<MPSGraphTensor*>* outputCellStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
             NSMutableArray<MPSGraphTensor*>* outputZStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
             NSMutableArray<MPSGraphTensor*>* outputCellStateFwdArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
             for(int i = 0; i < num_layers; i++) {
-                MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
-                                                                    secondaryTensor:recurrentBiasList[i]
-                                                                            name:nil];
+                MPSGraphTensor* biasTensor = nil;
+                if(has_biases) {
+                    biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
+                                                     secondaryTensor:recurrentBiasList[i]
+                                                                name:nil];
+                }
+                MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor
+                                                           dimension:0
+                                                               start:i
+                                                              length:1
+                                                                name:nil];
+                MPSGraphTensor* cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor
+                                                               dimension:0
+                                                                   start:i
+                                                                  length:1
+                                                                    name:nil];
                 outputs = [mpsGraph LSTMWithSourceTensor:inputTensor_
                                         recurrentWeight:recurrentKernelWeightsList[i]
                                             inputWeight:kernelWeightsList[i]
@@ -121,18 +139,14 @@
                                              descriptor:opDesc
                                                    name:nil];
 
-
-                stateTensor_ = [mpsGraph sliceTensor:stateTensor
-                                                            dimension:0
-                                                            start:i
-                                                            length:1
-                                                            name:nil];
-                cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor
-                                                                    dimension:0
-                                                                    start:i
-                                                                    length:1
-                                                                    name:nil];
                 inputTensor_ = [outputs objectAtIndex:0];
+                // no need to keep a final layer output copy as it is
+                // returned anyway and not used in backprop
+                if(i != num_layers - 1) {
+                    [layersOutputsList addObject:[mpsGraph expandDimsOfTensor:inputTensor_
+                                                                         axis:0
+                                                                         name:nil]];
+                }
                 if(dropout_p>0.0 && train && (i!=num_layers-1)) {
                     inputTensor_ = [mpsGraph dropoutTensor:inputTensor_
                                                       rate:dropout_p
@@ -150,7 +164,7 @@
                                                             name:nil]];
             }
 
-            MPSGraphTensor* outputTensor = [outputs objectAtIndex:0];
+            MPSGraphTensor* outputTensor = inputTensor_;
             if (batch_first) {
                 outputTensor = [mpsGraph transposeTensor:outputTensor
                                                dimension:0
@@ -169,8 +183,11 @@
             MPSGraphTensor* outputCellStatesFwd = [mpsGraph concatTensors:outputCellStateFwdArray
                                                             dimension:0
                                                             name:nil];
+            MPSGraphTensor* layersOutputs = (num_layers > 1)
+                ? [mpsGraph concatTensors:layersOutputsList dimension:0 name:nil]
+                : nil;
 
-            std::vector<MPSGraphTensor*> outputTensors = {outputTensor, outputStates, outputCellStates, outputZStates, outputCellStatesFwd};
+            std::vector<MPSGraphTensor*> outputTensors = {outputTensor, outputStates, outputCellStates, outputZStates, outputCellStatesFwd, layersOutputs};
             newCachedGraph->inputTensors_ = inputTensors;
             newCachedGraph->outputTensors_ = outputTensors;
             newCachedGraph->kernelWeightsList_ = kernelWeightsList;
@@ -188,20 +205,20 @@
       NSMutableArray<MPSGraphTensor*> *biasList = cachedGraph->biasList_;
       NSMutableArray<MPSGraphTensor*> *recurrentBiasList = cachedGraph->recurrentBiasList_;
 
-      Placeholder kernelWeight;
-      Placeholder recurrentKernelWeight;
-      Placeholder bias;
-      Placeholder recurrentBias;
+      Placeholder kernelWeight, recurrentKernelWeight, bias, recurrentBias;
+
       NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*> *feeds = [[[NSMutableDictionary alloc] init] autorelease];
       for (size_t i = 0; i < num_layers; i+=1) {
           kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]);
           recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]);
-          bias = Placeholder([biasList objectAtIndex:i], biases[i]);
-          recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
           [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()];
           [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()];
-          [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
-          [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+          if(has_biases) {
+            bias = Placeholder([biasList objectAtIndex:i], biases[i]);
+            recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
+            [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
+            [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+          }
 
       }
       Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensors_[0], input);
@@ -218,6 +235,9 @@
       Tensor cy = at::empty_like(hx[1], input.options());
       Tensor zState = at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[3])), input.options());
       Tensor cellStateFwd = at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[4])), input.options());
+      Tensor layerOutputs = (num_layers > 1)
+          ? at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[5])), input.options())
+          : at::empty({ 1 }, input.options()); // not used if num_layers == 1
 
       Placeholder outputPlaceholder0 = Placeholder(cachedGraph->outputTensors_[0], output);
       Placeholder outputPlaceholder1 = Placeholder(cachedGraph->outputTensors_[1], hy);
@@ -225,20 +245,25 @@
       Placeholder outputPlaceholder3 = Placeholder(cachedGraph->outputTensors_[3], zState);
       Placeholder outputPlaceholder4 = Placeholder(cachedGraph->outputTensors_[4], cellStateFwd);
 
-      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = [@{
         outputPlaceholder0.getMPSGraphTensor() : outputPlaceholder0.getMPSGraphTensorData(),
         outputPlaceholder1.getMPSGraphTensor() : outputPlaceholder1.getMPSGraphTensorData(),
         outputPlaceholder2.getMPSGraphTensor() : outputPlaceholder2.getMPSGraphTensorData(),
         outputPlaceholder3.getMPSGraphTensor() : outputPlaceholder3.getMPSGraphTensorData(),
-        outputPlaceholder4.getMPSGraphTensor() : outputPlaceholder4.getMPSGraphTensorData()
-      };
+        outputPlaceholder4.getMPSGraphTensor() : outputPlaceholder4.getMPSGraphTensorData(),
+      } mutableCopy];
+
+      if (num_layers > 1) {
+          Placeholder outputPlaceholder5 = Placeholder(cachedGraph->outputTensors_[5], layerOutputs);
+          [results setObject:outputPlaceholder5.getMPSGraphTensorData() forKey: outputPlaceholder5.getMPSGraphTensor()];
+      }
 
       runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-      return std::make_tuple(output, hy, cy, zState, cellStateFwd);
+      return std::make_tuple(output, hy, cy, zState, cellStateFwd, layerOutputs);
     }
 }
 
-std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(const Tensor& grad_y, const c10::optional<Tensor>& grad_hy_opt, const c10::optional<Tensor>& grad_cy_opt, const Tensor& z_state, const Tensor& cell_state_fwd, const Tensor& input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
+std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> lstm_mps_backward(const Tensor& grad_y, const c10::optional<Tensor>& grad_hy_opt, const c10::optional<Tensor>& grad_cy_opt, const Tensor& z_state, const Tensor& cell_state_fwd, const Tensor& input, const Tensor& layersOutputs, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
     using namespace mps;
     const Tensor& grad_hy_r = c10::value_or_else(grad_hy_opt, [] {return Tensor();});
     const Tensor& grad_cy_r = c10::value_or_else(grad_cy_opt, [] {return Tensor();});
@@ -250,10 +275,15 @@
     std::vector<Tensor> biases;
     std::vector<Tensor> recurrent_biases;
     for (size_t i = 0; i < num_layers; i+=1) {
-        kernel_weights.push_back(params[i*4]);
-        recurrent_kernel_weights.push_back(params[i*4+1]);
-        biases.push_back(params[i*4+2]);
-        recurrent_biases.push_back(params[i*4+3]);
+        if(has_biases) {
+            kernel_weights.push_back(params[i*4]);
+            recurrent_kernel_weights.push_back(params[i*4+1]);
+            biases.push_back(params[i*4+2]);
+            recurrent_biases.push_back(params[i*4+3]);
+        } else {
+            kernel_weights.push_back(params[i*2]);
+            recurrent_kernel_weights.push_back(params[i*2+1]);
+        }
     }
 
     struct CachedGraph : public MPSCachedGraph {
@@ -264,12 +294,12 @@
       NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList_ = nil;
       NSMutableArray<MPSGraphTensor*> *biasList_ = nil;
       NSMutableArray<MPSGraphTensor*> *recurrentBiasList_ = nil;
-      NSMutableArray<MPSGraphTensor*> *gradOutput_ = nil;
       NSMutableArray<MPSGraphTensor*> *gradRecWeights_ = nil;
       NSMutableArray<MPSGraphTensor*> *gradWeights_ = nil;
       NSMutableArray<MPSGraphTensor*> *gradBias_ = nil;
-      NSMutableArray<MPSGraphTensor*> *gradState_ = nil;
-      NSMutableArray<MPSGraphTensor*> *gradCellState_ = nil;
+      MPSGraphTensor* gradOutput_ = nil;
+      MPSGraphTensor* gradState_ = nil;
+      MPSGraphTensor* gradCellState_ = nil;
     };
 
     MPSGraphCache* cache_ = MPSGraphCache::getInstance();
@@ -296,8 +326,10 @@
                     for (size_t i = 0; i < num_layers; i += 1) {
                         [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))];
                         [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))];
-                        [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
-                        [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                        if(has_biases) {
+                            [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
+                            [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                        }
                     }
 
                     MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(input));
@@ -308,8 +340,22 @@
                     MPSGraphTensor* gradientCyTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_cy.scalar_type()), getMPSShape(grad_cy));
                     MPSGraphTensor* gradientHyTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_hy.scalar_type()), getMPSShape(grad_hy));
                     MPSGraphTensor* cellStateFwdTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(cell_state_fwd.scalar_type()), getMPSShape(cell_state_fwd));
+                    MPSGraphTensor* layersOutputsTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(layersOutputs.scalar_type()), getMPSShape(layersOutputs));
+
+                    std::vector<MPSGraphTensor*> inputs = {inputTensor, stateTensor, cellStateTensor, gradientTensor, zStateTensor, cellStateFwdTensor, gradientHyTensor, gradientCyTensor, layersOutputsTensor};
+
+                    if (batch_first) {
+                        inputTensor = [mpsGraph transposeTensor: inputTensor
+                                                      dimension: 0
+                                                  withDimension: 1
+                                                           name: nil];
+
+                        gradientTensor = [mpsGraph transposeTensor: gradientTensor
+                                                         dimension: 0
+                                                     withDimension: 1
+                                                              name: nil];
+                    }
 
-                    std::vector<MPSGraphTensor*> inputs = {inputTensor, stateTensor, cellStateTensor, gradientTensor, zStateTensor, cellStateFwdTensor, gradientHyTensor, gradientCyTensor};
                     newCachedGraph->recurrentKernelWeightsList_ = recurrentKernelWeightsList;
                     newCachedGraph->kernelWeightsList_ = kernelWeightsList;
                     newCachedGraph->biasList_ = kernelBiasList;
@@ -325,7 +371,6 @@
 
                     NSArray<MPSGraphTensor*>* outputs = nil;
 
-                    NSMutableArray<MPSGraphTensor*>* gradOutputArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
                     NSMutableArray<MPSGraphTensor*>* gradRecWeightsArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
                     NSMutableArray<MPSGraphTensor*>* gradWeightsArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
                     NSMutableArray<MPSGraphTensor*>* gradBiasArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
@@ -349,9 +394,15 @@
                         cellStateFwd = [mpsGraph squeezeTensor:cellStateFwd
                                                     axis:0
                                                     name:nil];
-                        MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
-                                                                            secondaryTensor:recurrentBiasList[i]
-                                                                            name:nil];
+                        MPSGraphTensor* biasTensor = nil;
+                        if(has_biases) {
+                            biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
+                                                            secondaryTensor:recurrentBiasList[i]
+                                                            name:nil];
+                        } else {
+                            biasTensor = [mpsGraph constantWithScalar:0.0
+                                                            dataType:inputTensor.dataType];
+                        }
 
                         MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor
                                                                     dimension:0
@@ -375,7 +426,23 @@
                                                                             length:1
                                                                             name:nil];
 
-                        outputs = [mpsGraph LSTMGradientsWithSourceTensor: inputTensor
+                        MPSGraphTensor* iterationInputTensor_ = nil;
+                        if (i == 0) {
+                            iterationInputTensor_ = inputTensor;
+                        } else {
+                            iterationInputTensor_ = [mpsGraph sliceTensor:layersOutputsTensor
+                                                                dimension: 0
+                                                                    // last element in layersOutputsTensor contains
+                                                                    // **inputs** for the last layer
+                                                                    start: i - num_layers
+                                                                   length: 1
+                                                                     name: nil];
+                            iterationInputTensor_ = [mpsGraph squeezeTensor:iterationInputTensor_
+                                                                       axis:0
+                                                                       name: nil];
+                        }
+
+                        outputs = [mpsGraph LSTMGradientsWithSourceTensor: iterationInputTensor_
                                              recurrentWeight: recurrentKernelWeightsList[i]
                                               sourceGradient: gradientTensor_
                                                       zState: zState
@@ -391,24 +458,31 @@
                                                   descriptor: opDesc
                                                         name: nil];
 
-
                         gradientTensor_ = [outputs objectAtIndex:0];
-                        [gradOutputArray addObject:[outputs objectAtIndex:0]];
-                        [gradRecWeightsArray addObject:[outputs objectAtIndex:1]];
-                        [gradWeightsArray addObject:[outputs objectAtIndex:2]];
-                        [gradBiasArray addObject:[outputs objectAtIndex:3]];
-                        [gradStateArray addObject:[outputs objectAtIndex:4]];
-                        [gradCellStateArray addObject:[outputs objectAtIndex:5]];
+                        [gradRecWeightsArray insertObject:[outputs objectAtIndex:1] atIndex:0];
+                        [gradWeightsArray insertObject:[outputs objectAtIndex:2] atIndex:0];
+                        [gradBiasArray insertObject: [outputs objectAtIndex:3] atIndex:0];
+                        [gradStateArray insertObject: [mpsGraph expandDimsOfTensor:[outputs objectAtIndex:4] axis:0 name:nil]  atIndex:0];
+                        [gradCellStateArray insertObject: [mpsGraph expandDimsOfTensor:[outputs objectAtIndex:5] axis:0 name:nil] atIndex:0];
                     }
                     std::vector<MPSGraphTensor*> outputTensors = {[outputs objectAtIndex:0],[outputs objectAtIndex:1],[outputs objectAtIndex:2],[outputs objectAtIndex:3], [outputs objectAtIndex:4], [outputs objectAtIndex:5]};
+
+                    if (batch_first) {
+                        MPSGraphTensor* gradientTensorTransposed = [mpsGraph transposeTensor:gradientTensor_
+                                                                                   dimension: 0
+                                                                               withDimension: 1
+                                                                                        name:nil];
+                        newCachedGraph->gradOutput_ = gradientTensorTransposed;
+                    } else {
+                        newCachedGraph->gradOutput_ = gradientTensor_;
+                    }
+
                     newCachedGraph->outputTensors_ = outputTensors;
-                    newCachedGraph->gradOutput_ = gradOutputArray;
                     newCachedGraph->gradRecWeights_ = gradRecWeightsArray;
                     newCachedGraph->gradWeights_ = gradWeightsArray;
                     newCachedGraph->gradBias_ = gradBiasArray;
-                    newCachedGraph->gradState_ = gradStateArray;
-                    newCachedGraph->gradCellState_ = gradCellStateArray;
-
+                    newCachedGraph->gradState_ = [mpsGraph concatTensors:gradStateArray dimension: 0 name: nil];
+                    newCachedGraph->gradCellState_ = [mpsGraph concatTensors:gradCellStateArray dimension: 0 name: nil];
                 }
                 return newCachedGraph;
             });
@@ -423,6 +497,7 @@
         Placeholder cellStateFwdPlaceholder   = Placeholder(cachedGraph->inputTensors_[5], cell_state_fwd);
         Placeholder gradientHyPlaceholder   = Placeholder(cachedGraph->inputTensors_[6], grad_hy);
         Placeholder gradientCyPlaceholder   = Placeholder(cachedGraph->inputTensors_[7], grad_cy);
+        Placeholder layersOutputsPlaceholder   = Placeholder(cachedGraph->inputTensors_[8], layersOutputs);
 
         NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*> *feeds = [[[NSMutableDictionary alloc] init] autorelease];
         [feeds setObject:gradientPlaceholder.getMPSGraphTensorData() forKey:gradientPlaceholder.getMPSGraphTensor()];
@@ -433,6 +508,7 @@
         [feeds setObject:cellStatePlaceholder.getMPSGraphTensorData() forKey:cellStatePlaceholder.getMPSGraphTensor()];
         [feeds setObject:zStatePlaceholder.getMPSGraphTensorData() forKey:zStatePlaceholder.getMPSGraphTensor()];
         [feeds setObject:cellStateFwdPlaceholder.getMPSGraphTensorData() forKey:cellStateFwdPlaceholder.getMPSGraphTensor()];
+        [feeds setObject:layersOutputsPlaceholder.getMPSGraphTensorData() forKey:layersOutputsPlaceholder.getMPSGraphTensor()];
 
         NSMutableArray<MPSGraphTensor*> *kernelWeightsList = cachedGraph->kernelWeightsList_;
         NSMutableArray<MPSGraphTensor*> *recurrentKernelWeightsList = cachedGraph->recurrentKernelWeightsList_;
@@ -445,68 +521,65 @@
         for (size_t i = 0; i < num_layers; i+=1) {
             kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]);
             recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]);
-            bias = Placeholder([biasList objectAtIndex:i], biases[i]);
-            recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
             [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()];
             [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()];
-            [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
-            [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+            if(has_biases) {
+                bias = Placeholder([biasList objectAtIndex:i], biases[i]);
+                recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
+                [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
+                [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+            }
         }
 
-        Tensor output = at::empty_like(input);
-        Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[0]);
-        Tensor grad_weights = at::empty_like(kernel_weights[0]);
-        Tensor grad_bias = at::empty_like(biases[0]);
-        Tensor grad_state = at::empty_like(hx[0]);
-        Tensor grad_cell_state = at::empty_like(hx[1]);
-        Placeholder outputPlaceholder   = Placeholder(cachedGraph->outputTensors_[0], output);
-        Placeholder gradRecWeightsPlaceholder   = Placeholder(cachedGraph->outputTensors_[1], grad_rec_weights);
-        Placeholder gradWeightsPlaceholder   = Placeholder(cachedGraph->outputTensors_[2], grad_weights);
-        Placeholder gradBiasPlaceholder   = Placeholder(cachedGraph->outputTensors_[3], grad_bias);
-        Placeholder gradStatePlaceholder   = Placeholder(cachedGraph->outputTensors_[4], grad_state);
-        Placeholder gradCellStatePlaceholder   = Placeholder(cachedGraph->outputTensors_[5], grad_cell_state);
-
-        std::vector<Tensor> grad_hx = {grad_state, grad_cell_state};
+        Tensor output_out = at::empty_like(input);
+        Tensor grad_state_out = at::empty_like(hx[0]);
+        Tensor grad_cell_state_out = at::empty_like(hx[1]);
+
+
+        std::vector<Tensor> grad_hx = {grad_state_out, grad_cell_state_out};
 
         NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*> *results = [[[NSMutableDictionary alloc] init] autorelease];
-        NSMutableArray<MPSGraphTensor*> *gradOutputArray = cachedGraph->gradOutput_;
         NSMutableArray<MPSGraphTensor*> *gradRecWeightsArray = cachedGraph->gradRecWeights_;
         NSMutableArray<MPSGraphTensor*> *gradWeightsArray = cachedGraph->gradWeights_;
         NSMutableArray<MPSGraphTensor*> *gradBiasArray = cachedGraph->gradBias_;
-        NSMutableArray<MPSGraphTensor*> *gradStateArray = cachedGraph->gradState_;
-        NSMutableArray<MPSGraphTensor*> *gradCellStateArray = cachedGraph->gradCellState_;
-        Placeholder gradOutPlaceholder;
+        MPSGraphTensor* gradOutput = cachedGraph->gradOutput_;
+        MPSGraphTensor* gradState = cachedGraph->gradState_;
+        MPSGraphTensor* gradCellState = cachedGraph->gradCellState_;
+
+        Placeholder gradStatePlaceholder = Placeholder(gradState, grad_state_out);
+        Placeholder gradCellStatePlaceholder = Placeholder(gradCellState, grad_cell_state_out);
+        Placeholder outputPlaceholder = Placeholder(gradOutput, output_out);
+        [results setObject:gradStatePlaceholder.getMPSGraphTensorData() forKey:gradStatePlaceholder.getMPSGraphTensor()];
+        [results setObject:gradCellStatePlaceholder.getMPSGraphTensorData() forKey:gradCellStatePlaceholder.getMPSGraphTensor()];
+        [results setObject:outputPlaceholder.getMPSGraphTensorData() forKey:outputPlaceholder.getMPSGraphTensor()];
+
+        Placeholder gradRecWeightsPlaceholder, gradWeightsPlaceholder, gradBiasPlaceholder;
 
         std::vector<Tensor> weights;
         for (int i = 0; i < num_layers; i++) {
-            Tensor output = at::empty_like(input);
             Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[i]);
             Tensor grad_weights = at::empty_like(kernel_weights[i]);
-            Tensor grad_bias = at::empty_like(biases[i]);
-            Tensor grad_state = at::empty_like(hx[0]);
-            Tensor grad_cell_state = at::empty_like(hx[1]);
+            Tensor grad_bias = at::empty((kernel_weights[i].size(0)), kernel_weights[i].options());
             weights.push_back(grad_weights);
             weights.push_back(grad_rec_weights);
-            weights.push_back(grad_bias);
-            weights.push_back(grad_bias);
-            gradOutPlaceholder = Placeholder([gradOutputArray objectAtIndex:i], output);
-            gradRecWeightsPlaceholder = Placeholder([gradRecWeightsArray objectAtIndex:i], grad_rec_weights);
-            gradWeightsPlaceholder = Placeholder([gradWeightsArray objectAtIndex:i], grad_weights);
-            gradBiasPlaceholder = Placeholder([gradBiasArray objectAtIndex:i], grad_bias);
-            gradStatePlaceholder = Placeholder([gradStateArray objectAtIndex:i], grad_state);
-            gradCellStatePlaceholder = Placeholder([gradCellStateArray objectAtIndex:i], grad_cell_state);
-
-            [results setObject:gradOutPlaceholder.getMPSGraphTensorData() forKey:gradOutPlaceholder.getMPSGraphTensor()];
-            [results setObject:gradRecWeightsPlaceholder.getMPSGraphTensorData() forKey:gradRecWeightsPlaceholder.getMPSGraphTensor()];
+
+            if(has_biases) {
+                weights.push_back(grad_bias);
+                weights.push_back(grad_bias);
+            }
+
+            gradRecWeightsPlaceholder = Placeholder([gradRecWeightsArray objectAtIndex: i], grad_rec_weights);
+            gradWeightsPlaceholder = Placeholder([gradWeightsArray objectAtIndex: i], grad_weights);
+            gradBiasPlaceholder = Placeholder([gradBiasArray objectAtIndex: i], grad_bias);
+
             [results setObject:gradBiasPlaceholder.getMPSGraphTensorData() forKey:gradBiasPlaceholder.getMPSGraphTensor()];
-            [results setObject:gradStatePlaceholder.getMPSGraphTensorData() forKey:gradStatePlaceholder.getMPSGraphTensor()];
-            [results setObject:gradCellStatePlaceholder.getMPSGraphTensorData() forKey:gradCellStatePlaceholder.getMPSGraphTensor()];
+            [results setObject:gradRecWeightsPlaceholder.getMPSGraphTensorData() forKey:gradRecWeightsPlaceholder.getMPSGraphTensor()];
             [results setObject:gradWeightsPlaceholder.getMPSGraphTensorData() forKey:gradWeightsPlaceholder.getMPSGraphTensor()];
         }
 
         runMPSGraph(stream, cachedGraph->graph(), feeds, results);
 
-        return std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> (output, grad_hx, weights);
+        return std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>> (output_out, grad_hx, weights);
 
     }
 }
diff --git a/aten/src/ATen/native/mps/operations/ScatterGather.mm b/aten/src/ATen/native/mps/operations/ScatterGather.mm
index 97d31b2ef857..62ae308cc251 100644
--- a/aten/src/ATen/native/mps/operations/ScatterGather.mm
+++ b/aten/src/ATen/native/mps/operations/ScatterGather.mm
@@ -51,11 +51,13 @@
       if(i != dim && [index_shape[i] intValue] < [input_shape[i] intValue])
         needSlice = true;
     }
-    // input and output types are always the same
-    auto dtype = getMPSDataType(self.scalar_type());
-    // workaround for UInt8 and Bool issues in MPS backend
-    if (dtype ==  MPSDataTypeUInt8 || dtype ==  MPSDataTypeBool) {
-      dtype = MPSDataTypeInt8;
+    auto input_type = getMPSDataType(self.scalar_type());
+    auto output_type = getMPSDataType(output.scalar_type());
+    if (input_type == MPSDataTypeUInt8 || ((input_type ==  MPSDataTypeBool && !is_macos_13_or_newer()))) {
+      input_type = MPSDataTypeInt8;
+    }
+    if (output_type == MPSDataTypeUInt8 || ((output_type ==  MPSDataTypeBool && !is_macos_13_or_newer()))) {
+      output_type = MPSDataTypeInt8;
     }
     string key = "gather_out_mps" + getTensorsStringKey({self, index, output}) + ":" + std::to_string(dim);
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
@@ -68,7 +70,7 @@
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, dtype, input_shape);
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_type, getMPSShape(self));
           MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, index);
 
           MPSGraphTensor* getInput = inputTensor;
@@ -98,10 +100,13 @@
                                                           toType:MPSDataTypeInt32
                                                             name:(NSString * _Nonnull)nil];
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wobjc-method-access"
           MPSGraphTensor* outputTensor = [mpsGraph gatherAlongAxis: (NSInteger) dim
                                                  withUpdatesTensor: getInput
                                                      indicesTensor: castIndexTensor
                                                               name: nil];
+#pragma clang diagnostic pop
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->indexTensor_ = indexTensor;
           newCachedGraph->outputTensor_ = outputTensor;
@@ -111,9 +116,9 @@
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, input_shape, true, dtype);
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, input_shape, true, input_type);
     Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index, index_shape);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, nullptr, false, dtype);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, nullptr, false, output_type);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
@@ -261,12 +266,15 @@
             scatter_mode = MPSGraphScatterModeMin;
 
           // Scatter this into the input with set mode
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wobjc-method-access"
           MPSGraphTensor* scatterTensor = [mpsGraph scatterAlongAxis: (NSInteger) dim
                                                       withDataTensor: slicedInput
                                                        updatesTensor: slicedSrc
                                                        indicesTensor: castIndexTensor
                                                                 mode: scatter_mode
                                                                 name: nil];
+#pragma clang diagnostic pop
           if(inputNeedSlice) {
             // Make an array of scatter indices tensors
             NSMutableArray<MPSGraphTensor*>* indicesTensors = [NSMutableArray<MPSGraphTensor*> arrayWithCapacity:num_input_dims];
diff --git a/aten/src/ATen/native/mps/operations/Shape.mm b/aten/src/ATen/native/mps/operations/Shape.mm
index 64e0fa3e4231..a4f70fe68ff3 100644
--- a/aten/src/ATen/native/mps/operations/Shape.mm
+++ b/aten/src/ATen/native/mps/operations/Shape.mm
@@ -5,9 +5,26 @@
 #include <ATen/native/TypeProperties.h>
 #include <ATen/native/TensorShape.h>
 #include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/mps/MPSGraphVenturaOps.h>
 
 namespace at::native {
 
+// Produces a shape with the `dim` dimension set to 0.
+std::vector<int64_t> getTopK0Shape(IntArrayRef sizes, const int64_t dim_) {
+  const int sz = sizes.size();
+  if (sz == 0) {
+    return {0};
+  }
+  const int64_t dim = maybe_wrap_dim(dim_, sz);
+  std::vector<int64_t> numbers(sz);
+
+  for (int i = 0; i < sz; i++) {
+    const int64_t sz_i = i != dim ? sizes[i] : 0;
+    numbers[i] = sz_i;
+  }
+  return numbers;
+}
+
 // topk
 TORCH_IMPL_FUNC(topk_out_mps)
   (const Tensor& self,
@@ -24,43 +41,98 @@
     k >= 0 && k <= (self.dim() > 0 ? self.size(dim) : 1),
     "selected index k out of range");
 
-  TORCH_CHECK( k <= 16 , "Currently topk on mps works only for k<=16 ");
+  if (!is_macos_13_or_newer() && (k>16)) {
+    TORCH_WARN_ONCE("torch.topk support for k>16 by MPS on MacOS 13+, please upgrade");
+    Tensor cpu_indices = indices.clone().to("cpu");
+    Tensor cpu_values = values.clone().to("cpu");
+    at::topk_out(cpu_values, cpu_indices, self.to(at::Device(kCPU)), k, dim_, largest, sorted);
+    values.copy_(cpu_values);
+    indices.copy_(cpu_indices);
+    return;
+  }
 
-  if (self.dim() == 0 && self.numel() == 1)
-  {
+  if (self.dim() == 0 && self.numel() == 1) {
       values.copy_(self);
       indices.zero_();
       return;
   }
-  MPSStream* stream = getCurrentMPSStream();
-  struct CachedGraph : public MPSCachedGraph
+
+  // Handle empty tensors
+  if (self.numel() == 0)
+  {
+      values.copy_(self);
+      indices.copy_(values.toType(at::ScalarType::Long));
+      return;
+  }
+  // Handle k == 0 case. Needed because MPSGraph does not support k == 0.
+  if (k == 0)
   {
+      const auto out_shape = getTopK0Shape(self.sizes(), dim);
+      values.resize_(out_shape);
+      indices.copy_(values.toType(at::ScalarType::Long));
+      return;
+  }
+
+  MPSStream* stream = getCurrentMPSStream();
+  struct CachedGraph : public MPSCachedGraph {
       CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
       MPSGraphTensor *selfTensor = nil, *valuesTensor = nil, *indicesTensor = nil;
   };
+
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
   // MPSGraph topK is always sorted.
-  @autoreleasepool
-  {
-      // Input as placeholders
-      MPSShape* input_shape = getMPSShape(self);
-      NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
-      string key = string("topk:") + [ns_shape_key UTF8String] + ":" +
-                             getMPSTypeString(self.scalar_type()) +
-                             ":k" + to_string(k) + ":dim" + to_string(dim_) +
-                             ":largest" + to_string(largest);
-      CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-      if(!cachedGraph)
-      {
-          cachedGraph = static_cast<CachedGraph*>(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
-          CachedGraph *newCachedGraph = nil;
-          @autoreleasepool
-          {
-              MPSGraph* mpsGraph = make_mps_graph();
-              newCachedGraph = new CachedGraph(mpsGraph);
-              newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), input_shape);
-              if ((dim_ != -1 && dim_ != self.dim() - 1) && (!largest))
-              {
+  @autoreleasepool {
+    // Input as placeholders
+    MPSShape* input_shape = getMPSShape(self);
+    NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
+    string key = string("topk:") + [ns_shape_key UTF8String] + ":" +
+                           getMPSTypeString(self.scalar_type()) +
+                           ":k" + to_string(k) + ":dim" + to_string(dim_) +
+                           ":largest" + to_string(largest);
+    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    if(!cachedGraph) {
+        cachedGraph = static_cast<CachedGraph*>(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+        CachedGraph *newCachedGraph = nil;
+        @autoreleasepool {
+            MPSGraph* mpsGraph = make_mps_graph();
+            newCachedGraph = new CachedGraph(mpsGraph);
+            newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), input_shape);
+
+            if (is_macos_13_or_newer()) {
+              MPSGraphTensor* castInputTensor = newCachedGraph->selfTensor;
+              MPSDataType dataType = getMPSDataType(self.scalar_type());
+              // #issue 104398441 sortWithTensor and argsortWithTensor
+              if (dataType != MPSDataTypeInt32 &&
+                  dataType != MPSDataTypeFloat32 &&
+                  dataType != MPSDataTypeFloat16) {
+                  dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
+                  castInputTensor = [mpsGraph castTensor:newCachedGraph->selfTensor
+                                          toType:dataType
+                                            name:@"castInputTensor"];
+              }
+              MPSGraphTensor * sortedTensor = [mpsGraph sortWithTensor:castInputTensor
+                                                                  axis:(NSUInteger)dim
+                                                                  descending:largest
+                                                                  name:nil];
+              sortedTensor = [mpsGraph sliceTensor:sortedTensor
+                                                dimension:(NSUInteger)dim
+                                                start:((NSUInteger) 0)
+                                                length:k
+                                                name:nil];
+              MPSGraphTensor* argSortedTensor = [mpsGraph argSortWithTensor:castInputTensor
+                                                                       axis:(NSInteger)dim
+                                                                       descending:largest
+                                                                       name:@"argmax_out"];
+              argSortedTensor = [mpsGraph sliceTensor:argSortedTensor
+                                                        dimension:dim
+                                                        start:((NSUInteger) 0)
+                                                        length:k
+                                                        name:nil];
+              newCachedGraph->valuesTensor = sortedTensor;
+              newCachedGraph->indicesTensor = argSortedTensor;
+
+            } else {
+              if ((dim_ != -1 && dim_ != self.dim() - 1) && (!largest)) {
                 // transpose and negate
                   MPSGraphTensor *transposedInput = [mpsGraph transposeTensor: newCachedGraph->selfTensor
                                                                                dimension: (NSUInteger)self.dim()-1
@@ -86,9 +158,7 @@
                                                                             dimension: (NSUInteger)self.dim()-1
                                                                             withDimension: (NSUInteger)dim_
                                                                             name: nil];
-              }
-              else if (dim_ != -1 && dim_ != self.dim() - 1)
-              {
+              } else if (dim_ != -1 && dim_ != self.dim() - 1) {
                   MPSGraphTensor *transposedInput = [mpsGraph transposeTensor: newCachedGraph->selfTensor
                                                                                dimension: (NSUInteger)self.dim()-1
                                                                                withDimension: (NSUInteger)dim_
@@ -109,9 +179,7 @@
                                                                             dimension: (NSUInteger)self.dim()-1
                                                                             withDimension: (NSUInteger)dim_
                                                                             name: nil];
-              }
-              else if (!largest)
-              {
+              } else if (!largest) {
                   // only negate
                   MPSGraphTensor *negatedInput = [mpsGraph negativeWithTensor:newCachedGraph->selfTensor
                                                                         name: nil];
@@ -123,9 +191,7 @@
                   newCachedGraph->valuesTensor = [mpsGraph negativeWithTensor:valuesNegated
                                                                             name: nil];
                   newCachedGraph->indicesTensor = outputMPSGraphTensors[1];
-              }
-              else
-              {
+              } else {
                   NSArray<MPSGraphTensor *> * outputMPSGraphTensors = [mpsGraph
                                                                          topKWithSourceTensor:newCachedGraph->selfTensor
                                                                          k:((NSUInteger) k)
@@ -133,29 +199,29 @@
                   newCachedGraph->valuesTensor = outputMPSGraphTensors[0];
                   newCachedGraph->indicesTensor = outputMPSGraphTensors[1];
               }
+            }
+        }
+        return newCachedGraph;
+      }));
+    }
+    Placeholder inputPlaceholder  = Placeholder(cachedGraph->selfTensor, self);
+    // Outputs as placeholders
+    Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values);
+    Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices);
+    // Create dictionary of inputs and outputs
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =  nil;
+    feeds = @{
+    inputPlaceholder.getMPSGraphTensor() :
+        inputPlaceholder.getMPSGraphTensorData()
+    };
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+    valuesPlaceholder.getMPSGraphTensor() :
+            valuesPlaceholder.getMPSGraphTensorData(),
+    indicesPlaceholder.getMPSGraphTensor() :
+          indicesPlaceholder.getMPSGraphTensorData()
+    };
 
-          }
-          return newCachedGraph;
-        }));
-      }
-  Placeholder inputPlaceholder  = Placeholder(cachedGraph->selfTensor, self);
-  // Outputs as placeholders
-  Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values);
-  Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices);
-  // Create dictionary of inputs and outputs
-  NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =  nil;
-  feeds = @{
-  inputPlaceholder.getMPSGraphTensor() :
-      inputPlaceholder.getMPSGraphTensorData()
-  };
-  NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-  valuesPlaceholder.getMPSGraphTensor() :
-          valuesPlaceholder.getMPSGraphTensorData(),
-  indicesPlaceholder.getMPSGraphTensor() :
-        indicesPlaceholder.getMPSGraphTensorData()
-  };
-
-  runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
   }
 }
 
@@ -192,6 +258,7 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
        const Tensor& out) {
 
   using namespace mps;
+
   if (out.numel() == 0) {
     return;
   }
@@ -256,6 +323,10 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
               "torch.cat(): all input tensors and out must be on the same device, but inputs are on ",
               notSkippedTensor.device(), " and out is on ", out.device());
 
+  // TODO: For better performance by eliminating input tensor gathering and post transpose,
+  // TODO: it is better to keep the out tensor's memory format.
+  // TODO: dimension needs to be recomputed as:
+  // TODO: dim = 0 --> dim = 0; dim = 1 or 2 --> dim = out.dim()- dim; otherwise dim = dim-1
   if (out.suggest_memory_format() == MemoryFormat::ChannelsLast) {
     out.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
   }
@@ -276,7 +347,7 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
   size[dimension] = cat_dim_size;
   // skip resizing if size of result is same as expected
   if (out.sizes() != size) {
-    out.resize_(size, memory_format);
+    out.resize_(size, MemoryFormat::Contiguous);
   }
   if (out.numel() == 0) {
     return;
@@ -312,7 +383,7 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
             if (tensor.scalar_type() == kBool) {
               scalar_type = MPSDataTypeInt8;
             }
-            newCachedGraph->inputTensors_[idx] = mpsGraphRankedPlaceHolder(mpsGraph, scalar_type, getMPSShape(tensor, memory_format));
+            newCachedGraph->inputTensors_[idx] = mpsGraphRankedPlaceHolder(mpsGraph, scalar_type, getMPSShape(tensor, MemoryFormat::Contiguous));
             if (tensor.scalar_type() != out_dtype) {
               castInputTensors[idx] = [mpsGraph castTensor:newCachedGraph->inputTensors_[idx]
                                                     toType:getMPSDataType(out_dtype)
@@ -332,8 +403,7 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
                                          toType:MPSDataTypeBool
                                            name:@"outputTensor"];
           }
-          newCachedGraph->outputTensor_ = memory_format == MemoryFormat::ChannelsLast ?
-                                         convertNHWCtoNCHW(mpsGraph, outputTensor) : outputTensor;
+          newCachedGraph->outputTensor_ = outputTensor;
         }
         return newCachedGraph;
       });
@@ -349,8 +419,8 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
           scalar_type = MPSDataTypeInt8;
         }
         inputPlaceholders.emplace_back(cachedGraph->inputTensors_[t_idx], tensor,
-                                       getMPSShape(tensor, memory_format),
-                                       memory_format != MemoryFormat::ChannelsLast, scalar_type);
+                                       getMPSShape(tensor, MemoryFormat::Contiguous),
+                                       /*gatherTensorData*/true, scalar_type);
         t_idx++;
       }
       i++;
@@ -360,7 +430,8 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
     if (!is_macos_13_or_newer() && out.scalar_type() == kBool) {
       outputDataType = MPSDataTypeInt8;
     }
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out, nil, false, outputDataType);
+    Placeholder outputPlaceholder = Placeholder(
+      cachedGraph->outputTensor_, out, /*mpsShape=*/nil, /*gatherTensorData=*/false, outputDataType);
 
     NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease];
     for (auto& inputPlaceholder : inputPlaceholders) {
diff --git a/aten/src/ATen/native/mps/operations/Sort.mm b/aten/src/ATen/native/mps/operations/Sort.mm
new file mode 100644
index 000000000000..4b3bb692ac0f
--- /dev/null
+++ b/aten/src/ATen/native/mps/operations/Sort.mm
@@ -0,0 +1,99 @@
+//  Copyright © 2023 Apple Inc.
+
+#include <ATen/MemoryOverlap.h>
+#include <ATen/WrapDimUtils.h>
+#include <ATen/native/TypeProperties.h>
+#include <ATen/native/TensorShape.h>
+#include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/mps/MPSGraphVenturaOps.h>
+
+namespace at::native {
+
+// sort
+TORCH_IMPL_FUNC(sort_stable_out_mps)
+(const Tensor& self,
+ c10::optional<bool> stable,
+ int64_t dim,
+ bool descending,
+ const Tensor& values,
+ const Tensor& indices) {
+  using namespace mps;
+  values.copy_(self);
+  // check if self is scalar
+  dim = maybe_wrap_dim(dim, self.dim(), true);
+  if (self.dim() == 0 && self.numel() == 1) {
+    indices.zero_();
+    return;
+  }
+
+  if (!is_macos_13_or_newer()) {
+    TORCH_WARN_ONCE("torch.sort is supported by MPS on MacOS 13+, please upgrade. Falling back to CPU");
+    Tensor cpu_indices = indices.clone().to("cpu");
+    Tensor cpu_values = values.clone().to("cpu");
+    at::sort_out(cpu_values, cpu_indices, self.to(at::Device(kCPU)), false, dim, descending);
+    values.copy_(cpu_values);
+    indices.copy_(cpu_indices);
+    return;
+  }
+  if (self.scalar_type() == ScalarType::Long) {
+    TORCH_WARN_ONCE("MPS: no support for int64 min/max ops, casting it to int32");
+  }
+
+  MPSStream* stream = getCurrentMPSStream();
+  struct CachedGraph : public MPSCachedGraph {
+      CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+      MPSGraphTensor *selfTensor = nil, *valuesTensor = nil, *indicesTensor = nil;
+  };
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+  @autoreleasepool {
+    // Input as placeholders
+    MPSShape* input_shape = getMPSShape(self);
+    NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
+    string key = string("sort:") + [ns_shape_key UTF8String] + ":" + getMPSTypeString(self.scalar_type()) +
+                           ":dim" + to_string(dim) + ":descending" + to_string(descending);
+    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    if(!cachedGraph) {
+        cachedGraph = static_cast<CachedGraph*>(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+        CachedGraph *newCachedGraph = nil;
+        @autoreleasepool {
+            MPSGraph* mpsGraph = make_mps_graph();
+            newCachedGraph = new CachedGraph(mpsGraph);
+            newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), input_shape);
+
+            MPSGraphTensor* castInputTensor = castToIHFTypes(mpsGraph, newCachedGraph->selfTensor, self);
+            MPSGraphTensor * sortedTensor = [mpsGraph sortWithTensor:castInputTensor
+                                                                axis:(NSInteger)dim
+                                                                descending:(BOOL)descending
+                                                                name:@"sort_out"];
+            sortedTensor = castFromIHFTypes(mpsGraph, sortedTensor, values);
+            MPSGraphTensor* argSortedTensor = [mpsGraph argSortWithTensor:castInputTensor
+                                                                     axis:(NSInteger)dim
+                                                                     descending:(BOOL)descending
+                                                                     name:@"argsort_out"];
+            argSortedTensor = castFromIHFTypes(mpsGraph, argSortedTensor, indices);
+            newCachedGraph->valuesTensor = sortedTensor;
+            newCachedGraph->indicesTensor = argSortedTensor;
+        }
+        return newCachedGraph;
+      }));
+    }
+    Placeholder inputPlaceholder  = Placeholder(cachedGraph->selfTensor, self);
+    // Outputs as placeholders
+    Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values);
+    Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices);
+    // Create dictionary of inputs and outputs
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =  nil;
+    feeds = @{ inputPlaceholder.getMPSGraphTensor() :
+        inputPlaceholder.getMPSGraphTensorData()
+    };
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+    valuesPlaceholder.getMPSGraphTensor() :
+            valuesPlaceholder.getMPSGraphTensorData(),
+    indicesPlaceholder.getMPSGraphTensor() :
+          indicesPlaceholder.getMPSGraphTensorData()
+    };
+
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+}
+}
diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm
index 1c75e53e18ce..4f8def1cbb77 100644
--- a/aten/src/ATen/native/mps/operations/TensorCompare.mm
+++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm
@@ -320,6 +320,23 @@ void clamp_scalar_out_mps(const Tensor& input_t,
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
+  MPSDataType conditionDataType = getMPSScalarType(condition.scalar_type());
+  MPSDataType selfDataType = getMPSScalarType(self.scalar_type());
+  MPSDataType otherDataType = getMPSScalarType(other.scalar_type());
+  // Workaround for `selectWithPredicateTensor` on macOS Monterey where bool data type may cause a hang
+  // The issue is fixed in macOS Ventura (13.0)
+  if (!is_macos_13_or_newer()) {
+     if (condition.scalar_type() == kBool) {
+      conditionDataType = MPSDataTypeInt8;
+     }
+     if (self.scalar_type() == kBool) {
+      selfDataType = MPSDataTypeInt8;
+     }
+     if (other.scalar_type() == kBool) {
+      otherDataType = MPSDataTypeInt8;
+     }
+  }
+
   @autoreleasepool {
 
     string key = "where_self_out_mps:" + getTensorsStringKey({cond_bool, self, other});
@@ -335,9 +352,9 @@ void clamp_scalar_out_mps(const Tensor& input_t,
                 MPSGraph* mpsGraph = make_mps_graph();
                 newCachedGraph = new CachedGraph(mpsGraph);
 
-                MPSGraphTensor* conditionTensor = mpsGraphRankedPlaceHolder(mpsGraph, cond_bool);
-                MPSGraphTensor* selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
-                MPSGraphTensor* otherTensor = mpsGraphRankedPlaceHolder(mpsGraph, other);
+                MPSGraphTensor* conditionTensor = mpsGraphRankedPlaceHolder(mpsGraph, conditionDataType, getMPSShape(cond_bool));
+                MPSGraphTensor* selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, selfDataType, getMPSShape(self));
+                MPSGraphTensor* otherTensor = mpsGraphRankedPlaceHolder(mpsGraph, otherDataType, getMPSShape(other));
 
                 MPSGraphTensor* outputTensor = [mpsGraph selectWithPredicateTensor:conditionTensor
                                                                truePredicateTensor:selfTensor
@@ -354,9 +371,12 @@ void clamp_scalar_out_mps(const Tensor& input_t,
         cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    Placeholder conditionPlaceholder = Placeholder(cachedGraph->conditionTensor_, cond_bool);
-    Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self);
-    Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other);
+    Placeholder conditionPlaceholder = Placeholder(
+        cachedGraph->conditionTensor_, cond_bool, /*mpsShape=*/nullptr, /*gatherTensorData=*/true, conditionDataType);
+    Placeholder selfPlaceholder = Placeholder(
+        cachedGraph->selfTensor_, self, /*mpsShape=*/nullptr, /*gatherTensorData=*/true, selfDataType);
+    Placeholder otherPlaceholder = Placeholder(
+        cachedGraph->otherTensor_, other, /*mpsShape=*/nullptr, /*gatherTensorData=*/true, otherDataType);
     Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
@@ -465,9 +485,16 @@ Tensor where_mps(const Tensor& condition,
           MPSGraphTensor* subZeroTensor = [mpsGraph lessThanWithPrimaryTensor: nanFreeTensor
                                                               secondaryTensor: [mpsGraph constantWithScalar: 0.0 dataType: self_dtype]
                                                                          name: nil];
-          // the cast is a workaround for the issue #103149520 (crash when bool and fp16 passed to binary ops)
-          MPSGraphTensor* isNegInfTensor = [mpsGraph logicalANDWithPrimaryTensor: [mpsGraph castTensor: subZeroTensor toType: self_dtype name: @"castTensor"]
-                                                                 secondaryTensor: [mpsGraph isInfiniteWithTensor: nanFreeTensor name:nil]
+          MPSGraphTensor* isInfTensor = [mpsGraph isInfiniteWithTensor: nanFreeTensor name:nil];
+          // workaround for Monterey; On Ventura the output of lessThan() is always Boolean
+          if (subZeroTensor.dataType != MPSDataTypeBool) {
+            subZeroTensor = castMPSTensor(mpsGraph, subZeroTensor, kBool);
+          }
+          if (isInfTensor.dataType != MPSDataTypeBool) {
+            isInfTensor = castMPSTensor(mpsGraph, isInfTensor, kBool);
+          }
+          MPSGraphTensor* isNegInfTensor = [mpsGraph logicalANDWithPrimaryTensor: subZeroTensor
+                                                                 secondaryTensor: isInfTensor
                                                                             name: nil];
           MPSGraphTensor* negInfFreeTensor = [mpsGraph selectWithPredicateTensor: isNegInfTensor
                                                              truePredicateTensor: newCachedGraph->negInfReplacementTensor
diff --git a/aten/src/ATen/native/mps/operations/TriangularOps.mm b/aten/src/ATen/native/mps/operations/TriangularOps.mm
index e9469c4537ed..a4b0db98b0fc 100644
--- a/aten/src/ATen/native/mps/operations/TriangularOps.mm
+++ b/aten/src/ATen/native/mps/operations/TriangularOps.mm
@@ -19,6 +19,10 @@
  const Tensor &output) {
 
   using namespace mps;
+
+  if (self.numel() == 0) {
+    return;
+  }
   MPSStream* stream = getCurrentMPSStream();
 
   // Derive from MPSCachedGraph
@@ -98,6 +102,10 @@
  const Tensor &output) {
 
   using namespace mps;
+
+  if (self.numel() == 0) {
+    return;
+  }
   MPSStream* stream = getCurrentMPSStream();
 
   // Derive from MPSCachedGraph
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index 9e56d542c0fe..3f2f4a4400a9 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -16,6 +16,8 @@ bool is_empty_tensor(const Tensor& self) {
 
 void unary_op(const Tensor& self, const Tensor& output, std::string op_name, UnaryOpBlock unaryBlock, is_noop_p is_noop = is_empty_tensor)
 {
+  TORCH_CHECK(!(!is_macos_13_or_newer() && self.scalar_type() == ScalarType::Byte ),
+              "MPS support unary op with uint8 natively starting from macOS 13.0");
   if (!output.is_same_size(self)) {
     output.resize_(self.sizes());
   }
@@ -46,8 +48,13 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
       });
     }
 
-    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+    bool gatherTensorData = true;
+    if (!output.is_contiguous() || output.is_view()) {
+      gatherTensorData = false;
+    }
+
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, /*mpsShape=*/nullptr, gatherTensorData);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, /*mpsShape=*/nullptr, false);
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
     };
@@ -79,6 +86,16 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
                                         name:nil];
 };
 
+MPSGraphTensor* log1p(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
+  MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1.0
+                                                  dataType:inputTensor.dataType];
+  MPSGraphTensor* addedTensor = [mpsGraph additionWithPrimaryTensor:inputTensor
+                                                    secondaryTensor:oneTensor
+                                                                name:nil];
+  return [mpsGraph logarithmWithTensor:addedTensor
+                                  name:nil];
+}
+
 } // namespace mps
 
 TORCH_IMPL_FUNC(trunc_out_mps) (const Tensor& self, const Tensor& output) {
@@ -194,13 +211,7 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
   TORCH_CHECK(self.scalar_type() != ScalarType::Long, "MPS does not support log1p op with int64 input");
   mps::unary_op(self, output, "log1p_out_mps",
                 ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
-                  MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1.0
-                                                                  dataType:inputTensor.dataType];
-                  MPSGraphTensor* addedTensor = [mpsGraph additionWithPrimaryTensor:inputTensor
-                                                                    secondaryTensor:oneTensor
-                                                                               name:nil];
-                  return [mpsGraph logarithmWithTensor:addedTensor
-                                                  name:nil];
+                  return mps::log1p(mpsGraph, inputTensor);
                 });
 }
 
@@ -237,6 +248,163 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
                 });
 }
 
+void logit_mps_impl(const Tensor& self, c10::optional<double> eps, Tensor& output, const std::string op_name) {
+  std::string key = op_name + ":[" + (eps.has_value() ? std::to_string(eps.value()) : "NULL") + "]";
+
+  mps::unary_op(self, output, key,
+                ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
+                  MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1.0
+                                                                     shape:@[@1]
+                                                                  dataType:inputTensor.dataType];
+                  MPSGraphTensor* logitInputTensor;
+
+                  if (eps.has_value()) {
+                    MPSGraphTensor *lowTensor = [mpsGraph constantWithScalar:eps.value()
+                                                                       shape:@[@1]
+                                                                    dataType:inputTensor.dataType];
+                    MPSGraphTensor *highTensor = [mpsGraph subtractionWithPrimaryTensor: oneTensor
+                                                                        secondaryTensor: lowTensor
+                                                                                   name: nil];
+                    logitInputTensor = [mpsGraph clampWithTensor:inputTensor
+                                                  minValueTensor:lowTensor
+                                                  maxValueTensor:highTensor
+                                                            name:nil];
+                  } else {
+                    logitInputTensor = inputTensor;
+                  }
+
+                  MPSGraphTensor *oneMinusInputTensor = [mpsGraph subtractionWithPrimaryTensor: oneTensor
+                                                                               secondaryTensor: logitInputTensor
+                                                                                          name: nil];
+                  MPSGraphTensor *outputTensor = [mpsGraph divisionWithPrimaryTensor:logitInputTensor
+                                                                     secondaryTensor:oneMinusInputTensor
+                                                                                name:nil];
+                  return [mpsGraph logarithmWithTensor:outputTensor
+                                                  name:nil];
+                });
+}
+
+Tensor& logit_out_mps(const Tensor& self,
+    c10::optional<double> eps,
+    Tensor& result) {
+  logit_mps_impl(self, eps, result, "logit_out_mps");
+  return result;
+}
+
+Tensor logit_mps(const Tensor& self, c10::optional<double> eps) {
+  Tensor result = at::native::empty_mps(
+                      self.sizes(),
+                      ScalarType::Float,
+                      c10::nullopt,
+                      kMPS,
+                      c10::nullopt,
+                      c10::nullopt);
+  logit_mps_impl(self, eps, result, "logit_mps");
+  return result;
+}
+
+TORCH_IMPL_FUNC(logit_backward_out_mps) (
+    const Tensor& grad_output,
+    const Tensor& input,
+    c10::optional<double> eps,
+    const Tensor& grad_input)
+  {
+  using namespace mps;
+
+  // Empty output
+  if(grad_input.numel() == 0)
+    return;
+
+  double eps_ = eps ? eps.value() : -1.0;
+
+  struct CachedGraph : public MPSCachedGraph
+  {
+    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+    MPSGraphTensor *gradOutputTensor_ = nil;
+    MPSGraphTensor *inputTensor_ = nil;
+    MPSGraphTensor *outputTensor_ = nil;
+  };
+
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+
+  MPSStream* stream = getCurrentMPSStream();
+
+  @autoreleasepool {
+    std::string key =  "logit_backward_out_mps:" + getTensorsStringKey({grad_output, input}) + ":" +
+                  "[" + (eps.has_value() ? std::to_string(eps.value()) : "-1" ) + "]";
+
+    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    if(!cachedGraph) {
+      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+
+        CachedGraph *newCachedGraph = nil;
+
+        @autoreleasepool {
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
+          MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
+          MPSGraphTensor* outputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_input);
+          MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
+                                                              shape:@[@1]
+                                                           dataType:inputTensor.dataType];
+          MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1.0
+                                                             shape:@[@1]
+                                                          dataType:inputTensor.dataType];
+          MPSGraphTensor* lowTensor = [mpsGraph constantWithScalar:eps_
+                                                             shape:@[@1]
+                                                          dataType:inputTensor.dataType];
+          MPSGraphTensor *inputLessThanLowPredicateTensor = [mpsGraph lessThanWithPrimaryTensor: inputTensor
+                                                                                secondaryTensor: lowTensor
+                                                                                           name: nil];
+          MPSGraphTensor *highTensor = [mpsGraph subtractionWithPrimaryTensor: oneTensor
+                                                              secondaryTensor: lowTensor
+                                                                         name: nil];
+          MPSGraphTensor *inputGreaterThanHighPredicateTensor = [mpsGraph greaterThanWithPrimaryTensor: inputTensor
+                                                                                       secondaryTensor: highTensor
+                                                                                                  name: nil];
+          MPSGraphTensor* outOfIntervalTensor = [mpsGraph logicalORWithPrimaryTensor: inputLessThanLowPredicateTensor
+                                                                     secondaryTensor: inputGreaterThanHighPredicateTensor
+                                                                                name: nil];
+          MPSGraphTensor *oneMinusInputTensor = [mpsGraph subtractionWithPrimaryTensor: oneTensor
+                                                                       secondaryTensor: inputTensor
+                                                                                  name: nil];
+          outputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor
+                                                   secondaryTensor:oneMinusInputTensor
+                                                              name:nil];
+          outputTensor = [mpsGraph divisionWithPrimaryTensor:gradOutputTensor
+                                             secondaryTensor:outputTensor
+                                                        name:nil];
+          outputTensor = [mpsGraph selectWithPredicateTensor: outOfIntervalTensor
+                                         truePredicateTensor: zeroTensor
+                                        falsePredicateTensor: outputTensor
+                                                        name: nil];
+
+          newCachedGraph->gradOutputTensor_ = gradOutputTensor;
+          newCachedGraph->inputTensor_ = inputTensor;
+          newCachedGraph->outputTensor_ = outputTensor;
+        }
+        return newCachedGraph;
+      });
+      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+    }
+    Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output);
+    Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input);
+    Placeholder gradInputPlaceholder = Placeholder(cachedGraph->outputTensor_, grad_input);
+
+    // Create dictionary of inputs and outputs
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
+      inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
+    };
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
+    };
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+}
+
 
 
 TORCH_IMPL_FUNC(cumsum_out_mps)
@@ -244,7 +412,10 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
  int64_t dim,
  c10::optional<ScalarType> dtype,
  const Tensor& result) {
-  TORCH_CHECK(dim >=0 && dim < std::max(1LL, self.ndimension()), "Expected dim to be between 0 and ", self.ndimension(), " but got ", dim);
+
+  auto nDims = self.dim();
+  auto wrapped_dim = maybe_wrap_dim(dim, nDims);
+  TORCH_CHECK(wrapped_dim >=0 && wrapped_dim < std::max(1LL, self.ndimension()), "Expected wrapped dim to be between 0 and ", self.ndimension(), " but got ", wrapped_dim , "(original dim is ", dim, ")");
   if (!is_macos_13_or_newer()) {
     TORCH_WARN_ONCE("torch.cumsum supported by MPS on MacOS 13+, please upgrade");
     auto cpu_result = self.to(at::Device(kCPU)).cumsum(dim, dtype);
@@ -252,11 +423,12 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
     return;
   }
   auto input = dtype.has_value() ? self.to(dtype.value()) : self;
+  TORCH_CHECK(input.scalar_type() != ScalarType::Long, "MPS does not support cumsum op with int64 input");
   mps::unary_op(input, result, "cumsum_out_mp" + std::to_string(dim),
                 ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
        // cumsum is horribly broken for int8, int16 and as chances for overflow is pretty high, cast to int32
        if (isIntegralType(input.scalar_type()) && input.scalar_type() !=ScalarType::Int) {
-           inputTensor = mps::castMPSTensor(mpsGraph, inputTensor, result.scalar_type());
+           inputTensor = mps::castMPSTensor(mpsGraph, inputTensor, ScalarType::Int);
        }
        auto rc = [mpsGraph cumulativeSumWithTensor: inputTensor
                                               axis: dim
diff --git a/aten/src/ATen/native/mps/operations/Unique.mm b/aten/src/ATen/native/mps/operations/Unique.mm
index c0c0f4155d2c..eac16a74564e 100644
--- a/aten/src/ATen/native/mps/operations/Unique.mm
+++ b/aten/src/ATen/native/mps/operations/Unique.mm
@@ -3,7 +3,6 @@
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/native/mps/MPSGraphVenturaOps.h>
 #include <ATen/native/Resize.h>
-#include <ATen/mps/MPSAllocator.h>
 
 namespace at::native {
 namespace mps {
@@ -58,7 +57,7 @@
     return {resultTensor, inverseIndicesTensor, countTensor, lengthTensor};
   }
 
-  // Sort only supports following types, cast if necessary
+  // #issue 104398441 sortWithTensor only supports following types, cast if necessary
   if (dataType != MPSDataTypeInt32 &&
       dataType != MPSDataTypeFloat32 &&
       dataType != MPSDataTypeFloat16) {
diff --git a/aten/src/ATen/native/mps/operations/UpSample.mm b/aten/src/ATen/native/mps/operations/UpSample.mm
index 17895e19c7d7..3b781dea08f4 100644
--- a/aten/src/ATen/native/mps/operations/UpSample.mm
+++ b/aten/src/ATen/native/mps/operations/UpSample.mm
@@ -26,6 +26,11 @@ void upsample_out_template(const Tensor& input,
   } else {
     native::upsample_2d_common_check(input.sizes(), output_size);
   }
+  Tensor out;
+  if (!output.is_contiguous()) {
+    out = at::empty_like(output, MemoryFormat::Contiguous);
+  }
+
   bool centerResults = false;
   MPSGraphResizeMode resizeMode = MPSGraphResizeNearest;
   MPSGraphResizeNearestRoundingMode nearestRoundingMode = MPSGraphResizeNearestRoundingModeFloor;
@@ -199,7 +204,7 @@ void upsample_out_template(const Tensor& input,
     MPSGraphTensorData* sizeTensorData = [[[MPSGraphTensorData alloc] initWithMPSNDArray: sizeNDArray] autorelease];
 
     Placeholder inputPlaceholder  = Placeholder(cachedGraph->inputTensor, input);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, out.has_storage() ? out : output, nil, false);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
         inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
@@ -209,6 +214,10 @@ void upsample_out_template(const Tensor& input,
         outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+
+    if (out.has_storage()) {
+      output.copy_(out);
+    }
   }
 }
 
diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index f79796923fe1..a247584fb0ad 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -2,7 +2,10 @@
 
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/native/Resize.h>
-#include <ATen/mps/MPSAllocator.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
+#include <fmt/format.h>
+#include <torch/library.h>
+#include <ATen/mps/IndexKernels.h>
 
 namespace at::native {
 namespace mps {
@@ -32,8 +35,7 @@
 }
 
 // initializes the MTLBuffers for tensor data and runs the MPSGraph for the view op
-static Tensor& runViewGraph(ViewCachedGraph* cachedGraph, const at::Tensor& src, Tensor& output,
-                            bool needsScatter, bool requires_sync = false) {
+static Tensor& runViewGraph(ViewCachedGraph* cachedGraph, const at::Tensor& src, Tensor& output, bool needsScatter) {
   const id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
   const id<MTLBuffer> outputBuffer = getMTLBufferStorage(output);
 
@@ -54,7 +56,7 @@
                                                                             dataType: inputType] autorelease];
     if (needsScatter) {
       auto updatesType = getMPSScalarType(src.scalar_type());
-      if (updatesType == MPSDataTypeUInt8 || updatesType == MPSDataTypeBool) {
+      if (updatesType == MPSDataTypeUInt8 || (updatesType == MPSDataTypeBool && !is_macos_13_or_newer())) {
         updatesType = MPSDataTypeInt8;
       }
 
@@ -70,10 +72,10 @@
       strideScalars[i] = getMPSScalar(strides[i], ScalarType::Int);
       feeds[cachedGraph->strideTensors[i]] = getMPSGraphTensorFromScalar(stream, strideScalars[i]);
     }
-    // Workaround for MPSShaderLibrary bug
-    // TODO: Remove once https://github.com/pytorch/pytorch/issues/82305 is resolved
-    auto outputType = getMPSDataType(output.scalar_type());
-    if (outputType ==  MPSDataTypeUInt8) {
+    // Workaround for MPSShaderLibrary bug in macOS Monterey
+    // This is fixed in macOS Ventura
+    auto outputType = getMPSScalarType(output.scalar_type());
+    if (outputType == MPSDataTypeUInt8 || (outputType ==  MPSDataTypeBool && !is_macos_13_or_newer())) {
         outputType =  MPSDataTypeInt8;
     }
     MPSGraphTensorData* outputTensorData = [[[MPSGraphTensorData alloc] initWithMTLBuffer: outputBuffer
@@ -82,100 +84,10 @@
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
       cachedGraph->outputTensor : outputTensorData
     };
-    stream->executeMPSGraph(cachedGraph->graph(), feeds, results,
-                            requires_sync ? SyncType::COMMIT : SyncType::COMMIT_ADAPTIVE);
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
   }
   return output;
 }
-MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType) {
-  IntArrayRef src_base_shape = get_buffer_shape(src.storage().data());
-  std::vector<int64_t> src_view_shape;
-  bool hasMPSShape = (mpsShape != nil);
-  int src_ndim_base = src_base_shape.size();
-  int src_ndim_view = 0;
-  if (hasMPSShape) {
-    src_ndim_view = [mpsShape count];
-    src_view_shape.reserve(src_ndim_view);
-    for (const auto i : c10::irange(src_ndim_view)) {
-      src_view_shape[i] = [mpsShape[i] intValue];
-    }
-  } else {
-    src_ndim_view = src.dim();
-    src_view_shape = src.sizes().vec();
-  }
-
-  MPSNDArray *srcTensorNDArrayView = nil;
-  MPSNDArrayDescriptor *srcTensorNDArrayDesc = nil;
-  MPSNDArray *srcTensorNDArray = nil;
-  id<MTLCommandBuffer> commandBuffer = getCurrentMPSStream()->commandBuffer();
-
-  if (src_ndim_base == src_ndim_view) {
-    srcTensorNDArray = ndArrayFromTensor(src, getMPSShape(src_base_shape), mpsDataType);
-    srcTensorNDArrayDesc = srcTensorNDArray.descriptor;
-
-    int firstDimToSlice = 0;
-    while (src_base_shape[firstDimToSlice] == src_view_shape[firstDimToSlice]) {
-      firstDimToSlice++;
-    }
-
-    int view_numel = 1;
-    for (const auto i : c10::irange(firstDimToSlice + 1, src_base_shape.size())) {
-      view_numel *= src_base_shape[i];
-    }
-
-    int sliceOffset = src.storage_offset() / view_numel;
-    // There are cases where both dimensions of a view can shrink
-    // E.g: x = torch.randn((3,6))[1, 1:3]
-    int nextSliceOffset = src.storage_offset() % view_numel;
-
-    [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(sliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice])}];
-    if (nextSliceOffset) {
-      [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 2 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(nextSliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice+1])}];
-    }
-  }
-  else {
-    int src_view_numel = 1;
-    for (const auto i : c10::irange(src_ndim_view)) {
-      src_view_numel *= src_view_shape[i];
-    }
-
-    int idx = 0;
-    int finalShapeSize = (src_ndim_view == 0) ? 1 : src_ndim_view;
-    std::vector<NSNumber*> mpsFinalShape(finalShapeSize);
-
-    // When the shapes are different, we need to flatten the first slice in order to alias the memory without any copies
-    // E.g: base tensor [5, 7, 3], view tensor [7, 3] (storage_offset=21). We need to flatten [5, 7, 3] to [35, 3], then
-    // we can slice directly into the first dimension based on the storage_offset
-    uint32_t flattenedSlice = 1;
-    for (const auto i : c10::irange(src_ndim_base - finalShapeSize + 1)) {
-      flattenedSlice *= src_base_shape[i];
-    }
-    mpsFinalShape[idx++] = [NSNumber numberWithInteger:flattenedSlice];
-
-    for (const auto i : c10::irange(src_ndim_base - finalShapeSize + 1, src_ndim_base)) {
-      mpsFinalShape[idx++] = [NSNumber numberWithInteger:src_base_shape[i]];
-    }
-
-    mpsShape = [NSArray arrayWithObjects:mpsFinalShape.data() count:mpsFinalShape.size()];
-    srcTensorNDArray = ndArrayFromTensor(src, mpsShape, mpsDataType);
-    srcTensorNDArrayDesc = srcTensorNDArray.descriptor;
-
-    int dim0 = (src_ndim_view == 0) ? 1 : src_view_shape[0];
-    int totalSlices = dim0;
-
-    // For 1D arrays, the storage_offset gives directly the
-    // starting point from where the slice should start
-    int sliceOffset = src_ndim_view == 1 ? 1 : dim0;
-    int view_numel = src_ndim_view == 1 ? 1 : src_view_numel;
-    [srcTensorNDArrayDesc sliceDimension:finalShapeSize - 1 withSubrange:{static_cast<NSUInteger>((src.storage_offset() / view_numel) * sliceOffset), static_cast<NSUInteger>(totalSlices)}];
-  }
-
-  srcTensorNDArrayView = [srcTensorNDArray arrayViewWithCommandBuffer:commandBuffer
-                                                           descriptor:srcTensorNDArrayDesc
-                                                             aliasing:MPSAliasingStrategyShallAlias];
-
-  return [[[MPSGraphTensorData alloc] initWithMPSNDArray:srcTensorNDArrayView] autorelease];
-}
 
 MPSGraphTensor *permuteTensor(MPSGraph *graph, MPSGraphTensor *inputTensor, NSArray *permuteOrder) {
   NSUInteger srcRank = [[inputTensor shape] count];
@@ -334,7 +246,7 @@
       if (dstSizes[dstDim] == 0) { return nil; }
   }
 
-  // 1. Flatten the inputTensor if neccessary
+  // 1. Flatten the inputTensor if necessary
   MPSGraphTensor *flatInputTensor = inputTensor;
   {
     // Flatten inputs to remove duplicate strides.
@@ -370,17 +282,19 @@
         // Find what dimension and native length was for the specified stride
         NSDictionary *srcDimLengthOffset = srcStrideToDimLengthOffset[[NSString stringWithFormat:@"%lld",dstStrides[dstDim]]];
 
+        dstDimToSliceLength[dstDim] = dstSizes[dstDim];
+        dstDimToSliceOffset[dstDim] = [srcDimLengthOffset[@"offset"] intValue];
+
         // Stride does not exist in source tensor, or the specified size is too long. Not possible
         // TODO: Longer length with same stride + removal of dim(s) above this is a flatten/reshape. Consider adding support
-        if (!srcDimLengthOffset || dstSizes[dstDim] > [srcDimLengthOffset[@"length"] intValue])
+        if (!srcDimLengthOffset ||
+            // the offset + length of destination should not be larger than source's length when slicing
+            dstDimToSliceOffset[dstDim] + dstDimToSliceLength[dstDim] > [srcDimLengthOffset[@"length"] intValue]) {
           return nil;
-
+        }
         // Get the src dimension corresponding to the requested stride
         NSNumber *srcDim = srcDimLengthOffset[@"dim"];
         [dstDimOrder insertObject:srcDim atIndex:0];
-
-        dstDimToSliceLength[dstDim] = dstSizes[dstDim];
-        dstDimToSliceOffset[dstDim] = [srcDimLengthOffset[@"offset"] intValue];
       }
     }
   }
@@ -509,11 +423,151 @@
   return outputTensor;
 }
 
+static
+std::vector<int64_t> getViewShape(const Tensor& src, MPSShape *mpsShape, const bool squeeze) {
+  bool hasMPSShape = (mpsShape != nil);
+  std::vector<int64_t> src_view_shape;
+  if (hasMPSShape) {
+    int src_ndim_view = [mpsShape count];
+    if (squeeze) {
+      for (const auto i : c10::irange(src_ndim_view)) {
+        if ([mpsShape[i] intValue] == 1)
+          continue;
+        src_view_shape.emplace_back([mpsShape[i] intValue]);
+      }
+    } else {
+      src_view_shape.resize(src_ndim_view);
+      for (const auto i : c10::irange(src_ndim_view)) {
+        src_view_shape[i] = [mpsShape[i] intValue];
+      }
+    }
+
+  } else {
+    if (squeeze) {
+      IntArrayRef src_shape = src.sizes();
+      size_t src_ndim_view = src_shape.size();
+      for (const auto i : c10::irange(src_ndim_view)) {
+        if (src_shape[i] == 1)
+          continue;
+        src_view_shape.emplace_back(src_shape[i]);
+      }
+    } else {
+      src_view_shape = src.sizes().vec();
+    }
+  }
+
+  return src_view_shape;
+}
+
+
+std::vector<int64_t> getSqueezedBaseShape(const Tensor& src, IntArrayRef shape) {
+  std::vector<int64_t> src_base_shape;
+  for (const auto i : c10::irange(shape.size())) {
+    if (shape[i] == 1)
+      continue;
+    src_base_shape.emplace_back(shape[i]);
+  }
+
+  return src_base_shape;
+}
+
+
+bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
+  if (!src.is_contiguous()) {
+    return false;
+  }
+
+  IntArrayRef src_base_shape = getIMPSAllocator()->getBufferShape(src.storage().data());
+  std::vector<int64_t> src_base_squeezed_shape = getSqueezedBaseShape(src, src_base_shape);
+  size_t src_ndim_base = src_base_shape.size();
+  size_t src_squeezed_ndim_base = src_base_squeezed_shape.size();
+  std::vector<int64_t> src_view_squeezed_shape = getViewShape(src, mpsShape, true);
+  size_t src_ndim_view = getViewShape(src, mpsShape, false).size();
+  size_t src_squeezed_ndim_view = src_view_squeezed_shape.size();
+
+  if (src_ndim_base != src_ndim_view) {
+    return false;
+  }
+
+  if (src_squeezed_ndim_base == src_squeezed_ndim_view) {
+    for (const auto i: c10::irange(src_squeezed_ndim_base)) {
+      if (src_view_squeezed_shape[i] > src_base_squeezed_shape[i]) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType) {
+  IntArrayRef src_base_shape = getIMPSAllocator()->getBufferShape(src.storage().data());
+  size_t src_ndim_base = src_base_shape.size();
+  std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape, false);
+  size_t src_ndim_view = src_view_shape.size();
+
+  MPSNDArray *srcTensorNDArrayView = nil;
+  MPSNDArrayDescriptor *srcTensorNDArrayDesc = nil;
+  MPSNDArray *srcTensorNDArray = nil;
+  id<MTLCommandBuffer> commandBuffer = getCurrentMPSStream()->commandBuffer();
+  int64_t base_idx = 0;
+
+  std::vector<int64_t> src_base_shape_vec;
+
+  if (src_ndim_view != src_ndim_base) {
+    src_base_shape_vec.reserve(src_ndim_view);
+    for (const auto i : c10::irange(src_ndim_view)) {
+      if (src_view_shape[i] == 1 && src_base_shape[base_idx] != 1) {
+        src_base_shape_vec.emplace_back(1);
+      } else {
+        src_base_shape_vec.emplace_back(src_base_shape[base_idx]);
+        if (base_idx < src_ndim_base - 1)
+          base_idx += 1;
+      }
+    }
+    src_base_shape = IntArrayRef(src_base_shape_vec);
+    src_ndim_base = src_base_shape.size();
+  }
+
+  srcTensorNDArray = ndArrayFromTensor(src, getMPSShape(src_base_shape), mpsDataType);
+  srcTensorNDArrayDesc = srcTensorNDArray.descriptor;
+
+  size_t firstDimToSlice = 0;
+  while (src_base_shape[firstDimToSlice] == src_view_shape[firstDimToSlice]) {
+    firstDimToSlice++;
+  }
+
+  int64_t view_numel = 1;
+  for (const auto i : c10::irange(firstDimToSlice + 1, src_base_shape.size())) {
+    view_numel *= src_base_shape[i];
+  }
+
+  int64_t sliceOffset = src.storage_offset() / view_numel;
+  [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - firstDimToSlice
+                          withSubrange:{static_cast<NSUInteger>(sliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice])}];
+
+  // Slice any remaining dimensions
+  for (const auto crtSliceOffset: c10::irange(firstDimToSlice + 1, src_base_shape.size())) {
+    if (src_view_shape[crtSliceOffset] != src_base_shape[crtSliceOffset]) {
+      if (crtSliceOffset == src_base_shape.size() - 1) {
+        sliceOffset = src.storage_offset() % src_base_shape[src_base_shape.size() - 1];
+      } else {
+        sliceOffset = (src.storage_offset() % view_numel) / (view_numel / src_base_shape[crtSliceOffset]);
+      }
+      [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - crtSliceOffset
+                              withSubrange:{static_cast<NSUInteger>(sliceOffset), static_cast<NSUInteger>(src.sizes()[crtSliceOffset])}];
+    }
+  }
+  srcTensorNDArrayView = [srcTensorNDArray arrayViewWithCommandBuffer:commandBuffer
+                                                           descriptor:srcTensorNDArrayDesc
+                                                             aliasing:MPSAliasingStrategyShallAlias];
+
+  return [[[MPSGraphTensorData alloc] initWithMPSNDArray:srcTensorNDArrayView] autorelease];
+}
 
 static MPSGraphTensor* chainViewOperation(ViewCachedGraph* cachedGraph, const IntArrayRef& size,
                                           const IntArrayRef& stride, int64_t offset,
                                           const IntArrayRef& base_shape, bool needsScatter,
-                                          const bool needsBoolCast,
                                           MPSGraphTensor* updatesTensor)
 {
   MPSGraph* mpsGraph = cachedGraph->graph();
@@ -556,23 +610,9 @@
                                                    name: nil];
     MPSGraphTensor *inputTensor = cachedGraph->inputTensor;
 
-    // Workaround for bool scatter/gather deficiency
-    // See https://github.com/pytorch/pytorch/issues/82663
-    if (needsBoolCast) {
-      inputTensor = [mpsGraph castTensor:inputTensor
-                                  toType:MPSDataTypeInt8
-                                    name:@"Cast away from bool"];
-    }
-
     if (!needsScatter) {
       MPSGraphTensor *outputTensor = asStridedLayer_pattern(mpsGraph, inputTensor, shape_size, size, stride, offset);
-
       if (outputTensor) {
-        if (needsBoolCast) {
-          outputTensor = [mpsGraph castTensor:outputTensor
-                                       toType:MPSDataTypeBool
-                                         name:@"Cast back to bool"];
-        }
         return outputTensor;
       }
     }
@@ -584,12 +624,15 @@
                                                           withShape: @[@-1]
                                                                name: nil];
     if (needsScatter) {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wobjc-method-access"
       MPSGraphTensor* scatteredTensor = [mpsGraph scatterAlongAxis: (NSInteger) 0
                                                     withDataTensor: reshapedInputTensor
                                                      updatesTensor: updatesTensor
                                                      indicesTensor: reshapedIndicesTensor
                                                               mode: MPSGraphScatterModeSet
                                                               name: nil];
+#pragma clang diagnostic pop
       outputTensor = [mpsGraph reshapeTensor: scatteredTensor
                                    withShape: getMPSShape(base_shape)
                                         name: nil];
@@ -605,21 +648,13 @@
                               withShapeTensor: shapeTensor
                                          name: nil];
     }
-
-    // Workaround for bool scatter/gather deficiency
-    // See https://github.com/pytorch/pytorch/issues/82663
-    if (needsBoolCast) {
-      outputTensor = [mpsGraph castTensor:outputTensor
-                                   toType:MPSDataTypeBool
-                                     name:@"Cast back to bool"];
-    }
   }
   return outputTensor;
 }
 
 static IntArrayRef updateTensorBaseShape(const Tensor& self)
 {
-  IntArrayRef base_shape = get_buffer_shape(self.storage().data());
+  IntArrayRef base_shape = getIMPSAllocator()->getBufferShape(self.storage().data());
   // if there's no base_shape stored in MPSAllocator, then infer it from tensor's size and store it
   if (base_shape.size() == 0) {
     // IntArrayRef wouldn't own the data, so we use a static storage
@@ -630,7 +665,7 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self)
 
     // base_shape will be retained in MPSAllocator until buffer gets recycled
     if (self.storage().data())
-      set_buffer_shape(self.storage().data(), base_shape);
+      getIMPSAllocator()->setBufferShape(self.storage().data(), base_shape);
   }
   return base_shape;
 }
@@ -668,13 +703,13 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self)
             MPSGraph* mpsGraph = make_mps_graph();
             MPSGraphTensor* updatesTensor = nil;
             newCachedGraph = new ViewCachedGraph(mpsGraph);
-            // Workaround for MPSShaderLibrary bug
-            // TODO: Remove once https://github.com/pytorch/pytorch/issues/82305 is resolved
+            // Workaround for MPSShaderLibrary bug in macOS Monterey
+            // This is fixed in macOS Ventura
             auto inputType = getMPSScalarType(self.scalar_type());
-            if (inputType == MPSDataTypeUInt8) {
+            if (inputType == MPSDataTypeUInt8 || (inputType == MPSDataTypeBool && !is_macos_13_or_newer())) {
                 inputType = MPSDataTypeInt8;
             }
-            auto needsBoolCast = inputType == MPSDataTypeBool;
+
             // Self is the input tensor we are creating view of
             newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, inputType, getMPSShape(base_shape));
             newCachedGraph->storageOffsetTensor = mpsGraphRankedPlaceHolder(mpsGraph, MPSDataTypeInt32, @[@1]);
@@ -683,10 +718,10 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self)
             }
             if (needsScatter) {
               auto updatesType = getMPSScalarType(updates.scalar_type());
-              if (updatesType == MPSDataTypeUInt8) {
-                updatesType = MPSDataTypeInt8;
+              if (updatesType == MPSDataTypeUInt8 || (updatesType == MPSDataTypeBool && !is_macos_13_or_newer())) {
+                  updatesType = MPSDataTypeInt8;
               }
-              newCachedGraph->updatesTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, updatesType);
+              newCachedGraph->updatesTensor = mpsGraphRankedPlaceHolder(mpsGraph, updatesType, getMPSShape(self.numel()));
               updatesTensor = newCachedGraph->updatesTensor;
               if (inputType != updatesType) {
                 updatesTensor = [mpsGraph castTensor:updatesTensor
@@ -694,7 +729,7 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self)
                                                 name:@"castUpdatesTensor"];
               }
             }
-            newCachedGraph->outputTensor = chainViewOperation(newCachedGraph, size, stride, storage_offset, base_shape, needsScatter, needsBoolCast, updatesTensor);
+            newCachedGraph->outputTensor = chainViewOperation(newCachedGraph, size, stride, storage_offset, base_shape, needsScatter, updatesTensor);
         }
         return newCachedGraph;
       }));
@@ -703,26 +738,203 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self)
   }
 }
 
-Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst)
-{
-  if (src.sizes().size() == 0) {
-    return Tensor();
+static
+std::string getGatherScatterFunctionName(
+  ScalarType scalarType,
+  int64_t dim,
+  bool needsScatter) {
+  std::string kernelName = needsScatter ? "scatter" : "gather";
+  return kernelName + "_kernel_" + std::to_string(dim == 0 ? 1 : dim);
+}
+
+const std::string& getGatherScatterScalarType(const Tensor& t) {
+  auto scalar_type = t.scalar_type();
+  static std::unordered_map<c10::ScalarType, std::string> scalarToMetalType = {
+    {c10::ScalarType::Float, "float"},
+    {c10::ScalarType::Half,  "half"},
+    {c10::ScalarType::Long,  "long"},
+    {c10::ScalarType::Int,   "int"},
+    {c10::ScalarType::Short, "short"},
+    {c10::ScalarType::Char,  "char"},
+    {c10::ScalarType::Byte,  "uchar"},
+    {c10::ScalarType::Bool,  "bool"},
+  };
+
+  auto it = scalarToMetalType.find(scalar_type);
+  TORCH_CHECK(it != scalarToMetalType.end(), "Unsupported type byte size: ", scalar_type);
+  return it->second;
+}
+
+static
+id<MTLLibrary> compileGatherScatterOpsLibrary(id<MTLDevice> device,
+                                              const std::string& dtypeSrc,
+                                              const std::string& dtypeDst,
+                                              bool needsScatter) {
+  auto key = std::to_string(needsScatter) + dtypeSrc + dtypeDst;
+  static std::unordered_map<std::string, id<MTLLibrary>> _libCache;
+  auto it = _libCache.find(key);
+  if (it != _libCache.end()) {
+    return it->second;
   }
-  bool requires_sync = false;
-  Tensor output;
+  NSError *error = nil;
+  MTLCompileOptions *options = [[MTLCompileOptions new] autorelease];
+  [options setLanguageVersion: MTLLanguageVersion2_3];
+  auto gatherScatterLib = [device newLibraryWithSource:[NSString stringWithUTF8String:fmt::format(needsScatter ? SCATTER_OPS_TEMPLATE : GATHER_OPS_TEMPLATE, dtypeSrc, dtypeDst).c_str()]
+                                               options:options
+                                                 error:&error];
+  TORCH_CHECK(gatherScatterLib != nil && error == nil, "Failed to compile gather-scatter library, error: ", [[error description] UTF8String]);
+  _libCache[key] = gatherScatterLib;
+  return gatherScatterLib;
+}
+
+static id<MTLComputePipelineState> getPipelineState(id<MTLDevice> device,
+                                                    const std::string& kernel,
+                                                    const std::string& dtypeSrc,
+                                                    const std::string& dtypeDst,
+                                                    bool needsScatter) {
+  auto key = kernel + dtypeSrc + dtypeDst;
+  static std::unordered_map<std::string, id<MTLComputePipelineState>> _mtlPipelineCache;
+  auto it = _mtlPipelineCache.find(key);
+  if (it != _mtlPipelineCache.end()) {
+     return it->second;
+  }
+
+  NSError *error = nil;
+  id<MTLLibrary> library = compileGatherScatterOpsLibrary(device, dtypeSrc, dtypeDst, needsScatter);
+  id<MTLFunction> func = [library newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]];
+  TORCH_CHECK(func, "Failed to load the Metal Shader function: ", kernel);
+  id<MTLComputePipelineState> pso = [device newComputePipelineStateWithFunction:func error:&error];
+  TORCH_CHECK(pso != nil && error == nil, "Failed to construct pipeline state: ", [[error localizedDescription] UTF8String]);
+  _mtlPipelineCache[key] = pso;
+  return pso;
+}
+
+Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst) {
+  Tensor output = dst;
   if (!dst.has_storage()) {
     output = at::native::empty_mps(src.sizes(), src.scalar_type(), c10::nullopt, kMPS);
-    requires_sync = true;
   }
+
+  if (src.numel() == 0 || output.numel() == 0) {
+    return dst;
+  }
+
+  if (src.dim() > 5) {
   ViewCachedGraph* cachedGraph = createViewGraph(src, dst, src.sizes(), src.strides(),
                                                  src.storage_offset(), /*needsScatter*/ false);
-  return runViewGraph(cachedGraph, src, dst.has_storage() ? dst : output, /*needsScatter*/ false, requires_sync);
+    return runViewGraph(cachedGraph, src, dst.has_storage() ? dst : output, /*needsScatter*/ false);
+  }
+
+  id<MTLBuffer> outputBuffer = dst.has_storage() ? getMTLBufferStorage(dst) : getMTLBufferStorage(output);
+  int64_t outputStorageOffset = output.storage_offset() * output.element_size();
+  uint32_t numThreads = output.numel();
+
+  MPSStream* mpsStream = getCurrentMPSStream();
+  dispatch_sync(mpsStream->queue(), ^(){
+    id<MTLComputeCommandEncoder> computeEncoder = [mpsStream->commandBuffer() computeCommandEncoder];
+    std::string functionName = getGatherScatterFunctionName(output.scalar_type(), output.dim(), /*needsScatter=*/false);
+    id<MTLComputePipelineState> gatherPSO = getPipelineState(MPSDevice::getInstance()->device(),
+                                                             functionName,
+                                                             getGatherScatterScalarType(src),
+                                                             getGatherScatterScalarType(output),
+                                                             /*needsScatter=*/false);
+
+    uint32_t kernel_size = src.sizes().size();
+    std::vector<uint32_t> src_sizes(kernel_size == 0 ? 1 : kernel_size);
+    std::vector<uint32_t> src_strides(kernel_size == 0 ? 1 : kernel_size);
+
+    if (kernel_size == 0) {
+      src_sizes[0] = src_strides[0] = 1;
+    } else {
+      for (int i = 0; i < kernel_size; i++) {
+        src_sizes[i] = (uint32_t)(src.sizes()[i]);
+        src_strides[i] = (uint32_t)(src.strides()[i]);
+      }
+    }
+
+    [computeEncoder setComputePipelineState: gatherPSO];
+    [computeEncoder setBuffer:getMTLBufferStorage(src) offset:src.storage_offset() * src.element_size() atIndex:0];
+    [computeEncoder setBuffer:outputBuffer offset:outputStorageOffset atIndex:1];
+    [computeEncoder setBytes:&src_sizes[0] length:sizeof(uint32_t) * kernel_size atIndex:2];
+    [computeEncoder setBytes:&src_strides[0] length:sizeof(uint32_t) * kernel_size atIndex:3];
+    [computeEncoder setBytes:&numThreads length:sizeof(uint32_t) atIndex:4];
+
+    MTLSize gridSize = MTLSizeMake(numThreads, 1, 1);
+    NSUInteger threadsPerThreadgroup_ = gatherPSO.maxTotalThreadsPerThreadgroup;
+    if (threadsPerThreadgroup_ > numThreads) {
+        threadsPerThreadgroup_ = numThreads;
+    }
+
+    MTLSize threadsPerThreadgroup = MTLSizeMake(threadsPerThreadgroup_, 1, 1);
+    [computeEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadsPerThreadgroup];
+    [computeEncoder endEncoding];
+    mpsStream->synchronize(SyncType::COMMIT_AND_CONTINUE);
+  });
+
+  return (dst.has_storage()) ? dst : output;
 }
 
-Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output) {
-  ViewCachedGraph* cachedGraph = createViewGraph(output, src, output.sizes(), output.strides(),
+Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output){
+  if (output.dim() > 5) {
+      ViewCachedGraph* cachedGraph = createViewGraph(output.is_complex() ?  at::view_as_real(output) : output,
+                                                 src, output.sizes(), output.strides(),
                                                  output.storage_offset(), /*needsScatter*/ true);
-  return runViewGraph(cachedGraph, src, output, /*needsScatter*/ true, /*requires_sync*/  true);
+    return runViewGraph(cachedGraph, src, output, /*needsScatter*/ true);
+  }
+  if (src.numel() == 0 || output.numel() == 0) {
+    return output;
+  }
+
+  id<MTLBuffer> outputBuffer = getMTLBufferStorage(output);
+  id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
+  uint32_t numThreads = src.numel();
+  int64_t outputStorageOffset = output.storage_offset() * output.element_size();
+  MPSStream* mpsStream = getCurrentMPSStream();
+  dispatch_sync(mpsStream->queue(), ^(){
+    @autoreleasepool {
+      id<MTLCommandBuffer> commandBuffer = mpsStream->commandBuffer();
+      id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
+      std::string functionName = getGatherScatterFunctionName(output.scalar_type(), output.dim(), /*needsScatter=*/true);
+      id<MTLComputePipelineState> scatterPSO = getPipelineState(MPSDevice::getInstance()->device(),
+                                                                functionName,
+                                                                getGatherScatterScalarType(src),
+                                                                getGatherScatterScalarType(output),
+                                                                /*needsScatter=*/true);
+
+      uint32_t kernel_size = output.sizes().size();
+      std::vector<uint32_t> output_sizes(kernel_size == 0 ? 1 : kernel_size);
+      std::vector<uint32_t> output_strides(kernel_size == 0 ? 1 : kernel_size);
+
+      if (kernel_size == 0) {
+        output_sizes[0] = output_strides[0] = 1;
+      } else {
+        for (const auto i : c10::irange(kernel_size)) {
+          output_sizes[i] = (uint32_t)(output.sizes()[i]);
+          output_strides[i] = (uint32_t)(output.strides()[i]);
+        }
+      }
+
+      [computeEncoder setComputePipelineState: scatterPSO];
+      [computeEncoder setBuffer:sourceBuffer offset:src.storage_offset() * src.element_size() atIndex:0];
+      [computeEncoder setBuffer:outputBuffer offset:outputStorageOffset atIndex:1];
+      [computeEncoder setBytes:&output_sizes[0] length:sizeof(uint32_t) * kernel_size atIndex:2];
+      [computeEncoder setBytes:&output_strides[0] length:sizeof(uint32_t) * kernel_size atIndex:3];
+      [computeEncoder setBytes:&numThreads length:sizeof(uint32_t) atIndex:4];
+
+      MTLSize gridSize = MTLSizeMake(numThreads, 1, 1);
+      NSUInteger threadsPerThreadgroup_ = scatterPSO.maxTotalThreadsPerThreadgroup;
+      if (threadsPerThreadgroup_ > numThreads) {
+        threadsPerThreadgroup_ = numThreads;
+      }
+
+      MTLSize threadsPerThreadgroup = MTLSizeMake(threadsPerThreadgroup_, 1, 1);
+      [computeEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadsPerThreadgroup];
+      [computeEncoder endEncoding];
+      mpsStream->synchronize(SyncType::COMMIT_AND_CONTINUE);
+    }
+  });
+
+  return output;
 }
 
 } // namespace mps
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index fe97fc7ddea6..277585425424 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -244,7 +244,7 @@
     CPU: native_dropout_cpu
     CUDA: native_dropout_cuda
     NestedTensorCPU, NestedTensorCUDA: native_dropout_nested
-  tags: [nondeterministic_seeded, canonical]
+  tags: [nondeterministic_seeded, core]
   autogen: native_dropout.out
 
 - func: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor
@@ -297,7 +297,7 @@
     CompositeExplicitAutograd: abs
     SparseCPU, SparseCUDA: abs_sparse
     SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: abs_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -461,7 +461,7 @@
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: acos.out
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: acos_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -504,7 +504,7 @@
     MkldnnCPU: mkldnn_add
     ZeroTensor: add_zerotensor
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -565,7 +565,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: add
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -596,6 +596,7 @@
   variants: function, method
   dispatch:
     CPU, CUDA: addr
+    MPS: addr_mps
     CompositeExplicitAutograd: math_addr
 
 - func: addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
@@ -606,6 +607,7 @@
 - func: addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU, CUDA: addr_out
+    MPS: addr_out_mps
     CompositeExplicitAutograd: math_addr_out
 
 - func: affine_grid_generator(Tensor theta, int[] size, bool align_corners) -> Tensor
@@ -698,7 +700,7 @@
   dispatch:
     CompositeExplicitAutograd: arange
   cpp_no_default_args: ['step']
-  tags: canonical
+  tags: core
 
 - func: arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -722,7 +724,7 @@
   structured_delegate: argmax.out
   device_check: NoCheck   # TensorIterator
   variants: function, method
-  tags: canonical
+  tags: core
 
 - func: argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -734,7 +736,7 @@
   structured_delegate: argmin.out
   device_check: NoCheck   # TensorIterator
   variants: function, method
-  tags: canonical
+  tags: core
 
 - func: argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -745,7 +747,7 @@
 - func: acosh(Tensor self) -> Tensor
   variants: function, method
   structured_delegate: acosh.out
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: acosh_(Tensor(a!) self) -> Tensor(a!)
   variants: function, method
@@ -775,7 +777,7 @@
   dispatch:
     SparseCPU, SparseCUDA: asinh_sparse
     SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: asinh_(Tensor(a!) self) -> Tensor(a!)
   variants: function, method
@@ -810,7 +812,7 @@
   dispatch:
     SparseCPU, SparseCUDA: atanh_sparse
     SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: atanh_(Tensor(a!) self) -> Tensor(a!)
   structured_delegate: atanh.out
@@ -848,7 +850,7 @@
     QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl
   device_check: NoCheck
   device_guard: False
-  tags: canonical
+  tags: core
 
 - func: as_strided_(Tensor(a!) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a!)
   use_const_ref_for_mutable_tensors: True
@@ -866,7 +868,7 @@
   dispatch:
     SparseCPU, SparseCUDA: asin_sparse
     SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: asin_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -904,7 +906,7 @@
   dispatch:
     SparseCPU, SparseCUDA: atan_sparse
     SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: atan_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1092,7 +1094,7 @@
   device_check: NoCheck   # TensorIterator
   structured_delegate: bitwise_not.out
   variants: function, method
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: bitwise_not_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1113,7 +1115,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: copysign_out
+    CPU, CUDA, MPS: copysign_out
   tags: pointwise
 
 - func: copysign.Tensor(Tensor self, Tensor other) -> Tensor
@@ -1148,7 +1150,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: logical_not
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: logical_not_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1190,7 +1192,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: logical_and
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1211,7 +1213,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: logical_or
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1245,7 +1247,7 @@
     SparseCUDA: bmm_sparse_cuda
     NestedTensorCPU: bmm_nested
     NestedTensorCUDA: bmm_nested_cuda
-  tags: canonical
+  tags: core
 
 - func: bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -1277,7 +1279,7 @@
   dispatch:
     SparseCPU, SparseCUDA: cat_sparse
     QuantizedCPU: cat_quantized_cpu
-  tags: canonical
+  tags: core
 
 - func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -1386,7 +1388,7 @@
   structured_delegate: clamp.out
   dispatch:
     QuantizedCPU: clamp_quantized_cpu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
   variants: function, method
@@ -1557,7 +1559,7 @@
     CompositeExplicitAutograd: constant_pad_nd
     MPS: constant_pad_nd_mps
   autogen: constant_pad_nd.out
-  tags: canonical
+  tags: core
 
 - func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a)
   variants: method
@@ -1567,13 +1569,13 @@
   dispatch:
     CompositeExplicitAutograd: convolution
   autogen: convolution.out
-  tags: canonical
+  tags: core
 
 - func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   dispatch:
     CompositeExplicitAutograd, CUDA: convolution_backward
   autogen: convolution_backward.out
-  tags: canonical
+  tags: core
 
 - func: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor
   dispatch:
@@ -1658,7 +1660,7 @@
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: cos.out
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: cos_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1679,7 +1681,7 @@
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: cosh.out
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: cosh_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1989,7 +1991,7 @@
     SparseCPU, SparseCUDA: div_sparse
     ZeroTensor: div_zerotensor
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Tensor
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2042,7 +2044,7 @@
   dispatch:
     CompositeExplicitAutograd: div
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Scalar
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2154,7 +2156,7 @@
     CUDA: embedding_dense_backward_cuda
     MPS: embedding_dense_backward_mps
   autogen: embedding_dense_backward.out
-  tags: canonical
+  tags: core
 
 - func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!)
   dispatch:
@@ -2239,6 +2241,11 @@
     SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed
     QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized
 
+- func: empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+  dispatch:
+    CompositeExplicitAutograd: empty_permuted_symint
+  autogen: empty_permuted.out
+
 # We do not make new_empty a composite that calls into new_empty_strided, as the strided version
 # is significantly more difficult to implement by different backends
 - func: new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -2348,7 +2355,7 @@
     Meta: empty_strided_meta_symint
     QuantizedCPU, QuantizedCUDA: empty_strided_unknown_quantized
   autogen: empty_strided.out
-  tags: canonical
+  tags: core
 
 - func: erf(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -2357,7 +2364,7 @@
   dispatch:
     SparseCPU, SparseCUDA: erf_sparse
     SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: erf_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2403,7 +2410,7 @@
   device_check: NoCheck   # TensorIterator
   structured_delegate: exp.out
   variants: function, method
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: exp_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2473,7 +2480,7 @@
   device_guard: False
   dispatch:
     CompositeExplicitAutograd: expand
-  tags: canonical
+  tags: core
 
 - func: expand_as(Tensor(a) self, Tensor other) -> Tensor(a)
   variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
@@ -2523,7 +2530,7 @@
   variants: function
   dispatch:
     CompositeExplicitAutograd: fill
-  tags: canonical
+  tags: core
 
 - func: fill.Tensor(Tensor self, Tensor value) -> Tensor
   variants: function
@@ -2560,7 +2567,7 @@
   dispatch:
     SparseCPU, SparseCUDA: floor_sparse
     SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: floor_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2652,7 +2659,7 @@
 - func: full(SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   dispatch:
     CompositeExplicitAutograd: full
-  tags: canonical
+  tags: core
 
 - func: full.out(SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -2725,8 +2732,9 @@
   dispatch:
     CPU, QuantizedCPU: grid_sampler_2d_cpu
     CUDA: grid_sampler_2d_cuda
+    MPS: grid_sampler_2d_mps
   autogen: grid_sampler_2d.out
-  tags: canonical
+  tags: core
 
 # `grid_sampler_2d_backward` takes in `output_mask` to optimize performance for
 # the case where `input` doesn't require gradient. Gradient for `grid` is always
@@ -2814,13 +2822,13 @@
     CPU, CUDA: native_group_norm
     CompositeExplicitAutograd: math_group_norm
   autogen: native_group_norm.out
-  tags: canonical
+  tags: core
 
 - func: native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   dispatch:
     CPU, CUDA: native_group_norm_backward
   autogen: native_group_norm_backward.out
-  tags: canonical
+  tags: core
 
 # Real to complex forward FFT
 - func: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
@@ -2991,7 +2999,7 @@
     SparseCPU, SparseCUDA: isnan_sparse
     SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr
   autogen: isnan.out
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: is_distributed(Tensor self) -> bool
   variants: function, method
@@ -3087,15 +3095,16 @@
     CompositeExplicitAutograd: math_native_layer_norm
     NestedTensorCPU, NestedTensorCUDA: nested_layer_norm
   autogen: native_layer_norm.out
-  tags: canonical
+  tags: core
 
 - func: native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   dispatch:
     CPU: layer_norm_backward_cpu
     CUDA: layer_norm_backward_cuda
     MPS: layer_norm_backward_mps
+    NestedTensorCPU, NestedTensorCUDA: layer_norm_backward_nested
   autogen: native_layer_norm_backward.out
-  tags: canonical
+  tags: core
 
 - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
   variants: function, method
@@ -3197,7 +3206,7 @@
   device_check: NoCheck   # TensorIterator
   structured_delegate: log.out
   variants: function, method
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: log_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -3352,6 +3361,7 @@
   variants: function
   dispatch:
     CPU, CUDA: xlogy_out
+    MPS: xlogy_out_mps
   tags: pointwise
 
 - func: xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -3391,7 +3401,7 @@
 
 - func: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
   structured_delegate: _log_softmax.out
-  tags: canonical
+  tags: core
 
 - func: _log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -3522,7 +3532,7 @@
   variants: function, method
   dispatch:
     QuantizedCPU, QuantizedCUDA: qmax
-  tags: canonical
+  tags: core
 
 - func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
   device_check: NoCheck   # TensorIterator
@@ -3550,7 +3560,7 @@
 - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
   variants: function, method
   structured_delegate: amax.out
-  tags: canonical
+  tags: core
 
 - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -3564,19 +3574,14 @@
 - func: max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor
 
 - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
-
-# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
-# native_functions.yaml
-# https://github.com/pytorch/pytorch/issues/77394
-- func: _mps_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
   dispatch:
-    MPS: _mps_max_pool2d
-  autogen: _mps_max_pool2d.out
+    CompositeImplicitAutograd: max_pool2d
+    MPS: mps_max_pool2d
 
-- func: mps_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+- func: max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
   dispatch:
     MPS: mps_max_pool2d_backward
-  autogen: mps_max_pool2d_backward.out
+  autogen: max_pool2d_backward.out
 
 - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
   dispatch:
@@ -3632,7 +3637,7 @@
   variants: function, method
   dispatch:
     QuantizedCPU: mean_quantized_cpu
-  tags: canonical
+  tags: core
 
 - func: mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -3708,7 +3713,7 @@
   variants: function, method
   dispatch:
     QuantizedCPU, QuantizedCUDA: qmin
-  tags: canonical
+  tags: core
 
 - func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
   device_check: NoCheck   # TensorIterator
@@ -3729,7 +3734,7 @@
 - func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
   variants: function, method
   structured_delegate: amin.out
-  tags: canonical
+  tags: core
 
 - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -3814,7 +3819,7 @@
   dispatch:
     SparseCPU, SparseCUDA: _sparse_mm
     SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm
-  tags: canonical
+  tags: core
 
 - func: mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -3825,21 +3830,26 @@
     SparseCPU, SparseCUDA: _sparse_mm_out
     SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out
 
+- func: _int_mm(Tensor self, Tensor mat2) -> Tensor
+  dispatch:
+    CUDA: _int_mm_cuda
+
+- func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+  dispatch:
+    CUDA: _int_mm_out_cuda
+
 - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
   python_module: sparse
 
+- func: _sparse_mm.reduce(Tensor sparse, Tensor dense, str reduce) -> Tensor
+  python_module: sparse
+
 - func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor
   dispatch:
     SparseCPU: sparse_sparse_matmul_cpu
     SparseCUDA: sparse_sparse_matmul_cuda
   autogen: _sparse_sparse_matmul.out
 
-- func: _sparse_mask_helper(Tensor t, Tensor mask_indices) -> Tensor
-  dispatch:
-    SparseCPU: sparse_mask_helper_cpu
-    SparseCUDA: sparse_mask_helper_cuda
-  autogen: _sparse_mask_helper.out
-
 - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
   variants: function, method
   dispatch:
@@ -3864,7 +3874,7 @@
     MkldnnCPU: mkldnn_mul
     ZeroTensor: mul_zerotensor
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -3898,7 +3908,7 @@
     CompositeExplicitAutograd: mul
     SparseCsrCPU, SparseCsrCUDA: mul_scalar_sparse_csr
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Scalar
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -3986,7 +3996,7 @@
     CUDA: batch_norm_cuda
     MPS: batch_norm_mps
     MkldnnCPU: mkldnn_batch_norm
-  tags: canonical
+  tags: core
 
 - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
   dispatch:
@@ -4003,6 +4013,14 @@
     MkldnnCPU: _mkldnn_batch_norm_legit
   autogen: _native_batch_norm_legit_functional
 
+# HACK: identical to _native_batch_norm_legit, but training is known to be False,
+# So we known that running stats will not be mutated.
+# The real fix here is batch norm consolidation.
+- func: _native_batch_norm_legit_no_training(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CompositeExplicitAutograd: _batch_norm_legit_no_training
+  autogen: _native_batch_norm_legit_no_training.out
+
 - func: _native_batch_norm_legit.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd) -> (Tensor(d!), Tensor(e!), Tensor(f!))
   dispatch:
     CPU: _batch_norm_legit_cpu_out
@@ -4015,7 +4033,7 @@
     CUDA: _batch_norm_legit_no_stats_cuda
     MPS: _batch_norm_legit_no_stats_mps
     MkldnnCPU: _mkldnn_batch_norm_legit_no_stats
-  tags: canonical
+  tags: core
 
 - func: _native_batch_norm_legit.no_stats_out(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
   dispatch:
@@ -4145,7 +4163,7 @@
     CompositeExplicitAutograd: permute
     MPS: permute_mps
     SparseCPU, SparseCUDA: permute_sparse_coo
-  tags: canonical
+  tags: core
 
 - func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a)
   variants: function, method
@@ -4280,7 +4298,7 @@
   dispatch:
     CompositeExplicitAutograd: scalar_tensor
   autogen: scalar_tensor.out
-  tags: canonical
+  tags: core
 
 - func: rand.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   device_check: NoCheck
@@ -4458,6 +4476,7 @@
   dispatch:
     CPU, Meta: range_out
     CUDA: range_cuda_out
+    MPS: range_mps_out
   cpp_no_default_args: ['step']
 
 - func: ravel(Tensor(a) self) -> Tensor(a)
@@ -4467,7 +4486,7 @@
   device_check: NoCheck   # TensorIterator
   structured_delegate: reciprocal.out
   variants: function, method
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: reciprocal_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4492,7 +4511,7 @@
     SparseCPU, SparseCUDA: neg_sparse
     SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: neg_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4530,13 +4549,14 @@
     CompositeExplicitAutograd: repeat
     MPS: repeat_mps
   autogen: repeat.out
-  tags: canonical
+  tags: core
 
 - func: repeat_interleave.Tensor(Tensor repeats, *, int? output_size=None) -> Tensor
   variants: function
   dispatch:
     CPU: repeat_interleave_cpu
     CUDA: repeat_interleave_cuda
+    MPS: repeat_interleave_mps
   tags: dynamic_output_shape
   autogen: repeat_interleave.Tensor_out
 
@@ -4658,7 +4678,7 @@
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu
     SparseCPU, SparseCUDA: relu_sparse
     SparseCsrCPU, SparseCsrCUDA: relu_sparse_csr
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: relu_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4724,7 +4744,7 @@
     QuantizedCPU: gelu_quantized_cpu
     QuantizedCUDA: gelu_quantized_cuda
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
   structured: True
@@ -4740,6 +4760,7 @@
   python_module: nn
   dispatch:
     MkldnnCPU: mkldnn_gelu_backward
+    NestedTensorCPU, NestedTensorCUDA: gelu_backwards_nested
   tags: pointwise
 
 - func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor
@@ -4774,7 +4795,7 @@
   device_check: NoCheck   # TensorIterator
   structured_delegate: rsqrt.out
   variants: function, method
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: rsqrt_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4804,7 +4825,7 @@
     CompositeExplicitAutograd: select_symint
     SparseCsrCPU, SparseCsrCUDA: select_sparse_csr
     NestedTensorCPU, NestedTensorCUDA: select_nested
-  tags: canonical
+  tags: core
 
 - func: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor
   variants: function
@@ -4896,7 +4917,7 @@
   dispatch:
     QuantizedCPU: sigmoid_quantized_cpu
     MkldnnCPU: mkldnn_sigmoid
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: sigmoid_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4919,6 +4940,7 @@
   variants: function, method
   dispatch:
     CPU, CUDA: logit
+    MPS: logit_mps
   tags: pointwise
 
 - func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!)
@@ -4930,6 +4952,7 @@
 - func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU, CUDA: logit_out
+    MPS: logit_out_mps
   tags: pointwise
 
 - func: sin(Tensor self) -> Tensor
@@ -4939,7 +4962,7 @@
   dispatch:
     SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr
     SparseCPU, SparseCUDA: sin_sparse
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: sin_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4985,7 +5008,7 @@
   dispatch:
     SparseCPU, SparseCUDA: sinh_sparse
     SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: sinh_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -5050,7 +5073,7 @@
   device_guard: False
   dispatch:
     CompositeExplicitAutograd: slice
-  tags: canonical
+  tags: core
 
 # NOTE: The implementation of split_with_sizes bypasses the dispatcher to call this; undo
 # that if adding specific implementations here!
@@ -5070,7 +5093,7 @@
   dispatch:
     CompositeExplicitAutograd: slice_scatter
   autogen: slice_scatter.out
-  tags: canonical
+  tags: core
 
 - func: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor
   variants: function, method
@@ -5116,7 +5139,7 @@
   dispatch:
     MkldnnCPU: mkldnn_softmax
     NestedTensorCPU, NestedTensorCUDA: softmax_nested
-  tags: canonical
+  tags: core
 
 - func: _softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -5208,7 +5231,7 @@
     CompositeExplicitAutograd: squeeze
     QuantizedCPU, QuantizedCUDA: squeeze_quantized
     NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested
-  tags: canonical
+  tags: core
 
 - func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)
   variants: function, method
@@ -5224,7 +5247,7 @@
     CompositeExplicitAutograd: squeeze
     QuantizedCPU, QuantizedCUDA: squeeze_quantized
     NestedTensorCPU, NestedTensorCUDA: squeeze_dim_nested
-  tags: canonical
+  tags: core
 
 - func: squeeze_(Tensor(a!) self) -> Tensor(a!)
   variants: method
@@ -5333,7 +5356,8 @@
   variants: function, method
   dispatch:
     NestedTensorCPU: NestedTensor_sum_dim_CPU
-  tags: canonical
+    SparseCPU, SparseCUDA: sum_sparse_coo
+  tags: core
 
 - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -5358,10 +5382,12 @@
   variants: function, method
   dispatch:
     CPU, CUDA: nansum
+    MPS: nansum_mps
 
 - func: nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU, CUDA: nansum_out
+    MPS: nansum_out_mps
 
 - func: sum_to_size(Tensor self, int[] size) -> Tensor
   variants: method
@@ -5375,7 +5401,7 @@
   dispatch:
     SparseCPU, SparseCUDA: sqrt_sparse
     SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: sqrt_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -5420,7 +5446,7 @@
   variants: function, method
   cpp_no_default_args: ["unbiased"]
 
-- func: std.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
+- func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   dispatch:
@@ -5438,7 +5464,7 @@
   variants: function
   cpp_no_default_args: ["unbiased"]
 
-- func: std_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+- func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
   dispatch:
@@ -5450,7 +5476,7 @@
   variants: function
   cpp_no_default_args: ["unbiased"]
 
-- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
 
@@ -5458,7 +5484,7 @@
   device_check: NoCheck   # TensorIterator
   cpp_no_default_args: ["unbiased"]
 
-- func: std.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+- func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: std_out
@@ -5473,11 +5499,11 @@
   device_check: NoCheck   # TensorIterator
   cpp_no_default_args: ["unbiased"]
 
-- func: std.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> Tensor
+- func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
 
-- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function
 
@@ -5562,7 +5588,7 @@
     SparseCPU, SparseCUDA: tanh_sparse
     SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: tanh_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -5631,6 +5657,7 @@
     MkldnnCPU: mkldnn_relu_backward
     SparseCPU, SparseCUDA: threshold_backward_sparse
     SparseCsrCPU, SparseCsrCUDA: threshold_backward_sparse_compressed
+    NestedTensorCPU, NestedTensorCUDA: threshold_backwards_nested
   tags: pointwise
 
 - func: tile(Tensor self, int[] dims) -> Tensor
@@ -5681,6 +5708,7 @@
     CPU, QuantizedCPU, CUDA, QuantizedCUDA: flip
     MPS: flip_mps
   autogen: flip.out
+  tags: core
 
 - func: fliplr(Tensor self) -> Tensor
   variants: function, method
@@ -5691,7 +5719,7 @@
 - func: roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor
   variants: function, method
   dispatch:
-    CPU: roll_cpu
+    CPU, MPS: roll
     CUDA: roll_cuda
   autogen: roll.out
 
@@ -5887,7 +5915,7 @@
     SparseCPU, SparseCUDA: unsqueeze_sparse
     QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
     NestedTensorCPU, NestedTensorCUDA: unsqueeze_nested
-  tags: canonical
+  tags: core
 
 - func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!)
   variants: method
@@ -5907,10 +5935,10 @@
 - func: var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
-  tags: canonical
+  tags: core
   cpp_no_default_args: ["unbiased"]
 
-- func: var.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
+- func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   dispatch:
@@ -5921,7 +5949,7 @@
   device_check: NoCheck   # TensorIterator
   cpp_no_default_args: ["unbiased"]
 
-- func: var.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+- func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: var_out
@@ -5935,11 +5963,11 @@
   device_check: NoCheck   # TensorIterator
   cpp_no_default_args: ["unbiased"]
 
-- func: var.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> Tensor
+- func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
 
-- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function
 
@@ -5953,7 +5981,7 @@
   variants: function
   cpp_no_default_args: ["unbiased"]
 
-- func: var_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+- func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
   dispatch:
@@ -5965,7 +5993,7 @@
   variants: function
   cpp_no_default_args: ["unbiased"]
 
-- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
 
@@ -5980,7 +6008,7 @@
   dispatch:
     CPU, CUDA: where
     MPS: where_mps
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -5992,7 +6020,7 @@
   variants: function
 
 - func: where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor
-  variants: function
+  variants: function, method
 
 - func: where.Scalar(Tensor condition, Scalar self, Scalar other) -> Tensor
   variants: function
@@ -6294,7 +6322,7 @@
     QuantizedCPU, QuantizedCUDA: quantized_clone
     NestedTensorCPU, NestedTensorCUDA: clone_nested
   autogen: clone.out
-  tags: canonical
+  tags: core
 
 - func: positive(Tensor(a) self) -> Tensor(a)
   variants: function, method
@@ -6344,7 +6372,7 @@
   dispatch:
     SparseCPU, SparseCUDA: sub_sparse
     ZeroTensor: sub_zerotensor
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -6360,7 +6388,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: sub
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -6441,6 +6469,16 @@
     SparseCsrCUDA: sparse_sampled_addmm_sparse_csr_cuda
     SparseCsrCPU: sparse_sampled_addmm_sparse_csr_cpu
 
+- func: _sparse_mm_reduce_impl(Tensor self, Tensor other, str reduce) -> (Tensor, Tensor)
+  python_module: sparse
+  dispatch:
+    SparseCsrCPU: _sparse_mm_reduce_impl_sparse_csr_cpu
+
+- func: _sparse_mm_reduce_impl_backward(Tensor self, Tensor grad_out, Tensor weight, str reduce, Tensor arg_out, bool[2] output_mask) -> (Tensor, Tensor)
+  python_module: sparse
+  dispatch:
+    SparseCsrCPU: _sparse_mm_reduce_impl_backward_sparse_csr_cpu
+
 - func: addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
   structured: True
   dispatch:
@@ -6459,7 +6497,7 @@
     SparseCPU: addmm_sparse_dense_cpu
     SparseCUDA: addmm_sparse_dense_cuda
     SparseCsrCPU, SparseCsrCUDA: addmm_sparse_compressed_dense
-  tags: canonical
+  tags: core
 
 - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
   structured_delegate: addmm.out
@@ -6738,6 +6776,7 @@
   variants: method
   dispatch:
     SparseCPU, SparseCUDA, SparseMeta: is_coalesced_sparse
+    CompositeExplicitAutograd: is_coalesced_default
   device_check: NoCheck
   device_guard: False
 
@@ -6770,6 +6809,7 @@
   variants: method
   dispatch:
     SparseCPU, SparseCUDA, SparseMeta: indices_sparse
+    CompositeExplicitAutograd: indices_default
   device_check: NoCheck
   device_guard: False
 
@@ -6779,6 +6819,7 @@
     SparseCPU, SparseCUDA, SparseMeta: values_sparse
     SparseCsrCPU, SparseCsrCUDA: values_sparse_csr
     NestedTensorCPU, NestedTensorCUDA: values_nested
+    CompositeExplicitAutograd: values_default
   device_check: NoCheck
   device_guard: False
 
@@ -6786,6 +6827,7 @@
   variants: method
   dispatch:
     SparseCsrCPU, SparseCsrCUDA: crow_indices_sparse_csr
+    CompositeExplicitAutograd: crow_indices_default
   device_check: NoCheck
   device_guard: False
 
@@ -6793,6 +6835,7 @@
   variants: method
   dispatch:
     SparseCsrCPU, SparseCsrCUDA: col_indices_sparse_csr
+    CompositeExplicitAutograd: col_indices_default
   device_check: NoCheck
   device_guard: False
 
@@ -6800,6 +6843,7 @@
   variants: method
   dispatch:
     SparseCsrCPU, SparseCsrCUDA: ccol_indices_sparse_csr
+    CompositeExplicitAutograd: ccol_indices_default
   device_check: NoCheck
   device_guard: False
 
@@ -6807,6 +6851,7 @@
   variants: method
   dispatch:
     SparseCsrCPU, SparseCsrCUDA: row_indices_sparse_csr
+    CompositeExplicitAutograd: row_indices_default
   device_check: NoCheck
   device_guard: False
 
@@ -7093,7 +7138,7 @@
     CompositeExplicitAutograd: _to_copy
     NestedTensorCPU, NestedTensorCUDA: _to_copy_nested
   autogen: _to_copy.out
-  tags: canonical
+  tags: core
 
 # to(Device) must not exist because all constructors of Device also works for
 # TensorOptions. Otherwise, an ambiguity error is thrown.
@@ -7163,12 +7208,12 @@
 
 # MPS LSTM implementation
 
-- func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+- func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
   dispatch:
     MPS: _lstm_mps
   autogen: _lstm_mps.out
 
-- func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
+- func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
   dispatch:
     MPS: lstm_mps_backward
   autogen: lstm_mps_backward.out
@@ -7386,6 +7431,7 @@
   dispatch:
     CPU: masked_scatter__cpu
     CUDA: masked_scatter__cuda
+    MPS: masked_scatter__mps
   autogen: masked_scatter.out
 
 - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
@@ -7413,7 +7459,7 @@
     ZeroTensor, Meta, CPU, CUDA, QuantizedCPU, QuantizedCUDA, MPS: view
     MkldnnCPU: mkldnn_view
     NestedTensorCPU, NestedTensorCUDA: view_nested
-  tags: canonical
+  tags: core
 
 # Warning: If you want to change the name or overload name of this
 # operator, you might also want to change the `isBlockListedSchema`
@@ -7431,7 +7477,7 @@
 - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!)
   variants: method
   dispatch:
-    CPU, CUDA, MPS: put_
+    CPU, CUDA: put_
   autogen: put.out
 
 - func: put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor
@@ -7589,7 +7635,7 @@
 - func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
   structured_delegate: scatter_add.out
   variants: function, method
-  tags: canonical
+  tags: core
 
 - func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
   structured_delegate: scatter_add.out
@@ -7608,7 +7654,7 @@
 - func: scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor
   structured_delegate: scatter_reduce.two_out
   variants: function, method
-  tags: canonical
+  tags: core
 
 - func: scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!)
   structured_delegate: scatter_reduce.two_out
@@ -7665,7 +7711,7 @@
   device_check: NoCheck   # TensorIterator
   variants: method, function
   structured_delegate: bitwise_and.Tensor_out
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -7727,7 +7773,7 @@
   device_check: NoCheck   # TensorIterator
   variants: method, function
   structured_delegate: bitwise_or.Tensor_out
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -7789,7 +7835,7 @@
   device_check: NoCheck   # TensorIterator
   variants: method, function
   structured_delegate: bitwise_xor.Tensor_out
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -8170,7 +8216,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: ne_quantized_cpu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8188,7 +8234,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: ne_quantized_cpu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   structured_delegate: ne.Scalar_out
@@ -8233,7 +8279,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: eq_quantized_cpu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8251,7 +8297,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: eq_quantized_cpu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8269,7 +8315,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: ge_quantized_cpu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8287,7 +8333,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: ge_quantized_cpu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   structured_delegate: ge.Scalar_out
@@ -8332,7 +8378,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: le_quantized_cpu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8350,7 +8396,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: le_quantized_cpu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   structured_delegate: le.Scalar_out
@@ -8395,7 +8441,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: gt_quantized_cpu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8413,7 +8459,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: gt_quantized_cpu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   structured_delegate: gt.Scalar_out
@@ -8458,7 +8504,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: lt_quantized_cpu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8476,7 +8522,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: lt_quantized_cpu
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   structured_delegate: lt.Scalar_out
@@ -8535,7 +8581,7 @@
     SparseCPU: index_select_sparse_cpu
     SparseCUDA: index_select_sparse_cuda
     MPS: index_select_mps
-  tags: canonical
+  tags: core
 
 - func: index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
 
@@ -8582,7 +8628,7 @@
     CPU: nonzero_cpu
     CUDA: nonzero_cuda
     MPS: nonzero_mps
-  tags: [dynamic_output_shape, canonical]
+  tags: [dynamic_output_shape, core]
 
 - func: nonzero_numpy(Tensor self) -> Tensor[]
   variants: method, function
@@ -8600,7 +8646,7 @@
 - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor
   variants: method, function
   structured_delegate: gather.out
-  tags: canonical
+  tags: core
 
 - func: gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor
   variants: function
@@ -8665,6 +8711,7 @@
   structured: True
   dispatch:
     CPU, CUDA: triangular_solve_out
+    MPS: triangular_solve_mps_out
     SparseCsrCPU: triangular_solve_out_sparse_csr_cpu
     SparseCsrCUDA: triangular_solve_out_sparse_csr_cuda
 
@@ -8680,32 +8727,18 @@
   python_module: linalg
   dispatch:
     CPU, CUDA: linalg_solve_triangular_out
+    MPS: linalg_solve_triangular_mps_out
 
 - func: linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor
   python_module: linalg
   variants: function
   dispatch:
     CPU, CUDA: linalg_solve_triangular
+    MPS: linalg_solve_triangular_mps
 
 - func: linalg_vander(Tensor x, *, int? N=None) -> Tensor
   python_module: linalg
 
-- func: symeig.e(Tensor self, bool eigenvectors=False, bool upper=True, *, Tensor(a!) e, Tensor(b!) V) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
-  dispatch:
-    CompositeExplicitAutograd: symeig_out
-
-- func: symeig(Tensor self, bool eigenvectors=False, bool upper=True) -> (Tensor eigenvalues, Tensor eigenvectors)
-  variants: method, function
-  dispatch:
-    CompositeExplicitAutograd: symeig
-
-- func: _symeig_helper(Tensor self, bool eigenvectors, bool upper) -> (Tensor, Tensor)
-  variants: function
-  dispatch:
-    CPU: _symeig_helper_cpu
-    CUDA: _symeig_helper_cuda
-  autogen: _symeig_helper.out
-
 - func: svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) V)
 
 - func: svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V)
@@ -8938,7 +8971,7 @@
   dispatch:
     SparseCPU, SparseCUDA: sign_sparse
     SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: sign_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -9121,13 +9154,14 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: fmod_out
+    MPS: fmod_mps_out
   tags: pointwise
 
 - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: fmod.Tensor_out
   variants: method, function
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -9140,6 +9174,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: hypot_out
+    MPS: hypot_out_mps
   tags: pointwise
 
 - func: hypot(Tensor self, Tensor other) -> Tensor
@@ -9226,13 +9261,14 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: remainder_out
+    MPS: remainder_out_mps
   tags: pointwise
 
 - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: remainder.Tensor_out
   variants: method, function
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -9244,7 +9280,7 @@
   device_check: NoCheck   # TensorIterator
   variants: function
   dispatch:
-    CPU, CUDA: remainder
+    CPU, CUDA, MPS: remainder
   autogen: remainder.Scalar_Tensor_out
   tags: pointwise
 
@@ -9274,7 +9310,7 @@
   structured_inherits: TensorIteratorBase
   device_check: NoCheck   # TensorIterator
   dispatch:
-    CPU, CUDA: fmin_out
+    CPU, CUDA, MPS: fmin_out
   tags: pointwise
 
 - func: max(Tensor self) -> Tensor
@@ -9296,14 +9332,14 @@
   structured_inherits: TensorIteratorBase
   device_check: NoCheck   # TensorIterator
   dispatch:
-    CPU, CUDA: fmax_out
+    CPU, CUDA, MPS: fmax_out
   tags: pointwise
 
 - func: maximum(Tensor self, Tensor other) -> Tensor
   structured_delegate: maximum.out
   device_check: NoCheck   # TensorIterator
   variants: method, function
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -9335,7 +9371,7 @@
   structured_delegate: minimum.out
   device_check: NoCheck   # TensorIterator
   variants: method, function
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -9386,6 +9422,7 @@
   structured: True
   dispatch:
     CPU, CUDA: sort_stable_out
+    MPS: sort_stable_out_mps
 
 - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
   device_check: NoCheck   # TensorIterator
@@ -9422,25 +9459,25 @@
   device_check: NoCheck   # TensorIterator
   variants: method, function
   dispatch:
-    CPU, CUDA: argsort_stable
+    CPU, CUDA, MPS: argsort_stable
   autogen: argsort.stable_out
 
 - func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor
   variants: method, function
 
-- func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+- func: topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
   structured: True
   dispatch:
     CPU: topk_out_cpu
     CUDA: topk_out_cuda
     MPS: topk_out_mps
 
-- func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
+- func: topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
   variants: method, function
   structured_delegate: topk.values
   dispatch:
     QuantizedCPU: topk_quantized_cpu
-  tags: canonical
+  tags: core
 
 - func: all(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -9520,13 +9557,14 @@
   device_check: NoCheck   # TensorIterator
   structured_delegate: pow.Tensor_Tensor_out
   variants: method, function
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured: True
   dispatch:
     CPU, CUDA: pow_Scalar_out
+    MPS: pow_Scalar_out_mps
   tags: pointwise
 
 - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor
@@ -9550,7 +9588,7 @@
   variants: function, method
   dispatch:
     SparseCPU, SparseCUDA: pow_sparse_scalar
-  tags: [canonical, pointwise]
+  tags: [core, pointwise]
 
 - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -9668,6 +9706,7 @@
   variants: method, function
   dispatch:
     CompositeExplicitAutograd: alias
+  tags: core
 
 - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
   variants: function
@@ -10616,6 +10655,58 @@
     CUDA: foreach_tensor_lerp_list_cuda_
   autogen: _foreach_lerp.Scalar_out
 
+- func: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_pow_list_kernel_slow
+    CUDA: foreach_tensor_pow_list_kernel_cuda
+
+- func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_pow_scalar_kernel_slow
+    CUDA: foreach_tensor_pow_scalar_kernel_cuda
+
+- func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_pow_scalarlist_kernel_slow
+    CUDA: foreach_tensor_pow_scalarlist_kernel_cuda
+
+- func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_scalar_pow_list_kernel_slow
+    CUDA: foreach_scalar_pow_list_kernel_cuda
+
+- func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> ()
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_pow_list_kernel_slow_
+    CUDA: foreach_tensor_pow_list_kernel_cuda_
+  autogen: _foreach_pow.List_out
+
+- func: _foreach_pow_.Scalar(Tensor(a!)[] self, Scalar exponent) -> ()
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_pow_scalar_kernel_slow_
+    CUDA: foreach_tensor_pow_scalar_kernel_cuda_
+  autogen: _foreach_pow.Scalar_out
+
+- func: _foreach_pow_.ScalarList(Tensor(a!)[] self, Scalar[] exponent) -> ()
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_pow_scalarlist_kernel_slow_
+    CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_
+  autogen: _foreach_pow.ScalarList_out
+
 - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
   dispatch:
     CPU: bucketize_cpu
@@ -10970,6 +11061,7 @@
   python_module: nn
   dispatch:
     CPU, CUDA: hardsigmoid_out
+    MPS: hardsigmoid_out_mps
     QuantizedCPU: hardsigmoid_out_quantized_cpu
 
 - func: hardsigmoid(Tensor self) -> Tensor
@@ -10990,6 +11082,7 @@
   python_module: nn
   dispatch:
     CPU, CUDA: hardsigmoid_backward_out
+    MPS: hardsigmoid_backward_out_mps
 
 - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor
   structured_delegate: hardsigmoid_backward.grad_input
@@ -11008,7 +11101,7 @@
   dispatch:
     CPU, CUDA, MPS: hardtanh
     QuantizedCPU: hardtanh_quantized_cpu
-  tags: canonical
+  tags: core
 
 - func: hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
@@ -11073,7 +11166,7 @@
   python_module: nn
   dispatch:
     QuantizedCPU: leaky_relu_quantized_cpu
-  tags: canonical
+  tags: core
 
 - func: leaky_relu_backward.grad_input(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result, *, Tensor(a!) grad_input) -> Tensor(a!)
   structured: True
@@ -11108,6 +11201,7 @@
   dispatch:
     CPU: log_sigmoid_forward_out_cpu
     CUDA: log_sigmoid_forward_out_cuda
+    MPS: log_sigmoid_forward_out_mps
 
 - func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer)
   device_check: NoCheck   # TensorIterator
@@ -11115,18 +11209,21 @@
   dispatch:
     CPU: log_sigmoid_forward_cpu
     CUDA: log_sigmoid_forward_cuda
+    MPS: log_sigmoid_forward_mps
 
 - func: log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
   dispatch:
     CPU: log_sigmoid_backward_cpu_out
     CUDA: log_sigmoid_backward_cuda_out
+    MPS: log_sigmoid_backward_mps_out
 
 - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor
   python_module: nn
   dispatch:
     CPU: log_sigmoid_backward_cpu
     CUDA: log_sigmoid_backward_cuda
+    MPS: log_sigmoid_backward_mps
 
 - func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
@@ -11239,7 +11336,7 @@
     QuantizedCPU: adaptive_avg_pool2d_quantized_cpu
     QuantizedCUDA: adaptive_avg_pool2d_quantized_cuda
   autogen: _adaptive_avg_pool2d.out
-  tags: canonical
+  tags: core
 
 - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor
   python_module: nn
@@ -11248,7 +11345,7 @@
     CUDA: adaptive_avg_pool2d_backward_cuda
     MPS: adaptive_avg_pool2d_backward_mps
   autogen: _adaptive_avg_pool2d_backward.out
-  tags: canonical
+  tags: core
 
 - func: adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
@@ -11351,7 +11448,7 @@
   dispatch:
     MkldnnCPU: mkldnn_avg_pool2d
     QuantizedCPU: avg_pool2d_quantized_cpu
-  tags: canonical
+  tags: core
 
 - func: avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
@@ -11367,7 +11464,7 @@
   structured_delegate: avg_pool2d_backward.grad_input
   dispatch:
     MkldnnCPU: mkldnn_avg_pool2d_backward
-  tags: canonical
+  tags: core
 
 - func: avg_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
@@ -11464,7 +11561,7 @@
 - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
   python_module: nn
   structured_delegate: max_pool2d_with_indices.out
-  tags: canonical
+  tags: core
 
 - func: max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
@@ -11477,7 +11574,7 @@
 - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor
   python_module: nn
   structured_delegate: max_pool2d_with_indices_backward.grad_input
-  tags: canonical
+  tags: core
 
 # Return: (Tensor output, Tensor indices)
 - func: max_pool3d_with_indices.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
@@ -11492,7 +11589,7 @@
   dispatch:
     CPU: max_pool3d_with_indices_cpu
     CUDA: max_pool3d_with_indices_cuda
-  tags: canonical
+  tags: core
 
 - func: max_pool3d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
@@ -11569,7 +11666,7 @@
     QuantizedCPU: reflection_pad2d_quantized_cpu
     CUDA: reflection_pad2d_cuda
     MPS: reflection_pad2d_mps
-  tags: canonical
+  tags: core
 
 - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
@@ -11644,7 +11741,7 @@
 - func: replication_pad2d(Tensor self, SymInt[4] padding) -> Tensor
   python_module: nn
   structured_delegate: replication_pad2d.out
-  tags: canonical
+  tags: core
 
 - func: replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
@@ -11671,7 +11768,7 @@
 - func: replication_pad3d(Tensor self, SymInt[6] padding) -> Tensor
   python_module: nn
   structured_delegate: replication_pad3d.out
-  tags: canonical
+  tags: core
 
 
 - func: replication_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
@@ -11710,7 +11807,7 @@
 - func: upsample_bilinear2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
   python_module: nn
   autogen: upsample_bilinear2d.vec_out
-  tags: canonical
+  tags: core
 
 - func: _upsample_bilinear2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
   python_module: nn
@@ -11739,7 +11836,7 @@
 - func: upsample_nearest2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
   python_module: nn
   autogen: upsample_nearest2d.vec_out
-  tags: canonical
+  tags: core
 
 - func: _upsample_nearest_exact2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
   python_module: nn
@@ -12058,6 +12155,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: logit_backward_out
+    MPS: logit_backward_out_mps
   tags: pointwise
 
 - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor
@@ -12209,7 +12307,7 @@
   dispatch:
     CPU: col2im_cpu
     CUDA: col2im_cuda
-  tags: canonical
+  tags: core
 
 - func: column_stack(Tensor[] tensors) -> Tensor
 
@@ -12242,7 +12340,7 @@
     SparseMeta: isinf_sparse_meta
     SparseCsrCPU, SparseCsrCUDA: isinf_sparse_csr
   autogen: isinf.out
-  tags: canonical
+  tags: core
 
 - func: record_stream(Tensor(a!) self, Stream s) -> ()
   variants: method
@@ -13505,72 +13603,84 @@
   dispatch:
     CompositeExplicitAutogradNonFunctional: _fw_primal_copy
   tags: view_copy
+  autogen: _fw_primal_copy.out
 
 - func: _make_dual_copy(Tensor primal, Tensor tangent, int level) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: _make_dual_copy
   tags: view_copy
+  autogen: _make_dual_copy.out
 
 - func: view_as_real_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: view_as_real_copy
   tags: view_copy
+  autogen: view_as_real_copy.out
 
 - func: view_as_complex_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: view_as_complex_copy
   tags: view_copy
+  autogen: view_as_complex_copy.out
 
 - func: _conj_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: _conj_copy
   tags: view_copy
+  autogen: _conj_copy.out
 
 - func: _neg_view_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: _neg_view_copy
   tags: view_copy
+  autogen: _neg_view_copy.out
 
 - func: as_strided_copy(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: as_strided_copy_symint
   tags: view_copy
+  autogen: as_strided_copy.out
 
 - func: _sparse_broadcast_to_copy(Tensor self, int[] size) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: _sparse_broadcast_to_copy
   tags: view_copy
+  autogen: _sparse_broadcast_to_copy.out
 
 - func: diagonal_copy(Tensor self, int offset=0, int dim1=0, int dim2=1) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: diagonal_copy
   tags: view_copy
+  autogen: diagonal_copy.out
 
 - func: expand_copy(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: expand_copy_symint
   tags: view_copy
+  autogen: expand_copy.out
 
 - func: permute_copy(Tensor self, int[] dims) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: permute_copy
   tags: view_copy
+  autogen: permute_copy.out
 
 - func: _reshape_alias_copy(Tensor self, SymInt[] size, SymInt[] stride) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: _reshape_alias_copy_symint
   tags: view_copy
+  autogen: _reshape_alias_copy.out
 
 - func: select_copy.int(Tensor self, int dim, SymInt index) -> Tensor
   variants: function
@@ -13578,18 +13688,21 @@
     CompositeExplicitAutogradNonFunctional: select_copy_symint
     SparseCsrCPU, SparseCsrCUDA: select_copy_sparse_csr
   tags: view_copy
+  autogen: select_copy.int_out
 
 - func: detach_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: detach_copy
   tags: view_copy
+  autogen: detach_copy.out
 
 - func: slice_copy.Tensor(Tensor self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: slice_copy_Tensor_symint
   tags: view_copy
+  autogen: slice_copy.Tensor_out
 
 - func: split_copy.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]
   variants: function
@@ -13608,72 +13721,84 @@
   dispatch:
     CompositeExplicitAutogradNonFunctional: squeeze_copy
   tags: view_copy
+  autogen: squeeze_copy.out
 
 - func: squeeze_copy.dim(Tensor self, int dim) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: squeeze_copy_dim
   tags: view_copy
+  autogen: squeeze_copy.dim_out
 
 - func: squeeze_copy.dims(Tensor self, int[] dim) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: squeeze_copy_dims
   tags: view_copy
+  autogen: squeeze_copy.dims_out
 
 - func: t_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: t_copy
   tags: view_copy
+  autogen: t_copy.out
 
 - func: transpose_copy.int(Tensor self, int dim0, int dim1) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: transpose_copy_int
   tags: view_copy
+  autogen: transpose_copy.int_out
 
 - func: unsqueeze_copy(Tensor self, int dim) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: unsqueeze_copy
   tags: view_copy
+  autogen: unsqueeze_copy.out
 
 - func: _indices_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: _indices_copy
   tags: view_copy
+  autogen: _indices_copy.out
 
 - func: _values_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: _values_copy
   tags: view_copy
+  autogen: _values_copy.out
 
 - func: indices_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: indices_copy
   tags: view_copy
+  autogen: indices_copy.out
 
 - func: values_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: values_copy
   tags: view_copy
+  autogen: values_copy.out
 
 - func: crow_indices_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: crow_indices_copy
   tags: view_copy
+  autogen: crow_indices_copy.out
 
 - func: col_indices_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: col_indices_copy
   tags: view_copy
+  autogen: col_indices_copy.out
 
 - func: ccol_indices_copy(Tensor self) -> Tensor
   variants: function
@@ -13695,119 +13820,10 @@
     CompositeExplicitAutogradNonFunctional: unbind_copy_int
   tags: view_copy
 
-- func: view_copy(Tensor self, SymInt[] size) -> Tensor
-  variants: function
-  dispatch:
-    CompositeExplicitAutogradNonFunctional: view_copy_symint
-  tags: view_copy
-
-- func: view_copy.dtype(Tensor self, ScalarType dtype) -> Tensor
-  variants: function
-  dispatch:
-    CompositeExplicitAutogradNonFunctional: view_copy_dtype
-  tags: view_copy
-
-- func: unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor
-  variants: function
-  dispatch:
-    CompositeExplicitAutogradNonFunctional: unfold_copy
-  tags: view_copy
-
-- func: alias_copy(Tensor self) -> Tensor
-  variants: function
-  dispatch:
-    CompositeExplicitAutogradNonFunctional: alias_copy
-  tags: view_copy
-
-- func: _fw_primal_copy.out(Tensor self, int level, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: _fw_primal_copy_out
-
-
-- func: _make_dual_copy.out(Tensor primal, Tensor tangent, int level, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: _make_dual_copy_out
-
-
-- func: view_as_real_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: view_as_real_copy_out
-
-
-- func: view_as_complex_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: view_as_complex_copy_out
-
-
-- func: _conj_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: _conj_copy_out
-
-
-- func: _neg_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: _neg_view_copy_out
-
-
-- func: as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: as_strided_copy_out_symint
-
-
-- func: _sparse_broadcast_to_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: _sparse_broadcast_to_copy_out
-
-
-- func: diagonal_copy.out(Tensor self, int offset=0, int dim1=0, int dim2=1, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: diagonal_copy_out
-
-
-- func: expand_copy.out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: expand_copy_out_symint
-
-
-- func: permute_copy.out(Tensor self, int[] dims, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: permute_copy_out
-
-
-- func: _reshape_alias_copy.out(Tensor self, SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: _reshape_alias_copy_out
-
-
-- func: select_copy.int_out(Tensor self, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: select_copy_symint_out
-
-
-- func: detach_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: detach_copy_out
-
-
-- func: slice_copy.Tensor_out(Tensor self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1, *, Tensor(a!) out) -> Tensor(a!)
+- func: unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> ()
   variants: function
   dispatch:
-    CompositeExplicitAutograd: slice_copy_Tensor_out
-
+    CompositeExplicitAutograd: unbind_copy_int_out
 
 - func: split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
   variants: function
@@ -13820,107 +13836,33 @@
   dispatch:
     CompositeExplicitAutograd: split_with_sizes_copy_out
 
-
-- func: squeeze_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: squeeze_copy_out
-
-
-- func: squeeze_copy.dim_out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: squeeze_copy_dim_out
-
-
-- func: squeeze_copy.dims_out(Tensor self, int[] dim, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: squeeze_copy_dims_out
-
-
-- func: t_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: t_copy_out
-
-
-- func: transpose_copy.int_out(Tensor self, int dim0, int dim1, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: transpose_copy_int_out
-
-
-- func: unsqueeze_copy.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: unsqueeze_copy_out
-
-
-- func: _indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: _indices_copy_out
-
-
-- func: _values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: _values_copy_out
-
-
-- func: indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: indices_copy_out
-
-
-- func: values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: values_copy_out
-
-
-- func: crow_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: crow_indices_copy_out
-
-
-- func: col_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: col_indices_copy_out
-
-
-- func: unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> ()
-  variants: function
-  dispatch:
-    CompositeExplicitAutograd: unbind_copy_int_out
-
-
-- func: view_copy.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+- func: view_copy(Tensor self, SymInt[] size) -> Tensor
   variants: function
   dispatch:
-    CompositeExplicitAutograd: view_copy_out_symint
-
+    CompositeExplicitAutogradNonFunctional: view_copy_symint
+  tags: view_copy
+  autogen: view_copy.out
 
-- func: view_copy.dtype_out(Tensor self, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
+- func: view_copy.dtype(Tensor self, ScalarType dtype) -> Tensor
   variants: function
   dispatch:
-    CompositeExplicitAutograd: view_copy_dtype_out
-
+    CompositeExplicitAutogradNonFunctional: view_copy_dtype
+  tags: view_copy
+  autogen: view_copy.dtype_out
 
-- func: unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!)
+- func: unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor
   variants: function
   dispatch:
-    CompositeExplicitAutograd: unfold_copy_out
-
+    CompositeExplicitAutogradNonFunctional: unfold_copy
+  tags: view_copy
+  autogen: unfold_copy.out
 
-- func: alias_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+- func: alias_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
-    CompositeExplicitAutograd: alias_copy_out
+    CompositeExplicitAutogradNonFunctional: alias_copy
+  tags: view_copy
+  autogen: alias_copy.out
 
 - func: to_padded_tensor(Tensor self, float padding, SymInt[]? output_size=None) -> Tensor
   variants: method
@@ -13948,25 +13890,37 @@
     CUDA, NestedTensorCUDA: native_multi_head_attention_cuda
   autogen: _native_multi_head_attention.out
 
+- func: scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False) -> Tensor
+  python_module: nn
+  variants: function
+  autogen: scaled_dot_product_attention.out
+
+# TODO: THIS NEEDS TO BE REMOVED BUT PEOPLE HAVE TRAINED THEIR MODELS WITH THIS OP BUILTIN
 - func: _scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
   python_module: nn
   variants: function
   autogen: _scaled_dot_product_attention.out
 
 # This aten function is kept so that we can test the choice function from Python
-- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> int
+- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False) -> int
   dispatch:
-    CPU, NestedTensorCPU, Meta: _fused_sdp_choice_cpp
+    Meta: _fused_sdp_choice_meta
+    CPU, NestedTensorCPU: _fused_sdp_choice_cpp
     CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda
 
-- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
+- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None) -> (Tensor, Tensor)
   variants: function
 
-- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool return_softmax=False, bool is_causal=False) -> (Tensor, Tensor, Tensor)
+- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False) -> (Tensor ouput, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, int philox_seed, int philox_offset, Tensor debug_attn_mask)
   dispatch:
     CUDA: _scaled_dot_product_flash_attention_cuda
     NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
 
+- func: _scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, int philox_seed, int philox_offset) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
+  variants: function
+  dispatch:
+    CUDA: _scaled_dot_product_flash_attention_backward_cuda
+
 - func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, bool is_causal=False) -> (Tensor, Tensor)
   dispatch:
     CUDA: _scaled_dot_product_efficient_attention_cuda
@@ -13979,12 +13933,17 @@
 - func: _chunk_grad_outputs_efficient_attention(Tensor query, Tensor key, Tensor value, bool is_causal=False) -> bool
   dispatch:
     CUDA: _chunk_grad_outputs_efficient_attention
-# Returns ouput, softmax_logsumexp, softmax
-- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, bool return_softmax, float dropout_p, bool is_causal) -> (Tensor, Tensor, Tensor)
+
+- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, bool return_debug_mask) -> (Tensor output, Tensor softmax_logsumexp, int philox_seed, int philox_offset, Tensor debug_attn_mask)
   variants: function
   dispatch:
     CUDA: _flash_attention_forward
 
+- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, int philox_seed, int philox_offset) -> (Tensor, Tensor, Tensor)
+  variants: function
+  dispatch:
+    CUDA: _flash_attention_backward
+
 # Returns ouput, logsumexp if compute_logsumexp
 - func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
   variants: function
@@ -14730,3 +14689,33 @@
   dispatch:
     CUDA: _fused_adam_kernel_cuda_
   autogen: _fused_adam, _fused_adam.out
+
+- func: _fused_adamw_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+  # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now).
+  variants: function
+  dispatch:
+    CUDA: _fused_adamw_kernel_cuda_
+  autogen: _fused_adamw, _fused_adamw.out
+
+# Collectives
+- func: all_reduce(Tensor self, str reduceOp, str tag, int[] ranks, int group_size) -> Tensor
+  # This should be changed to distributed but it requires changes all over the place to work
+  python_module: nn
+  dispatch:
+    CompositeExplicitAutograd: all_reduce
+  variants: function
+
+- func: all_gather_into_tensor(Tensor shard, str tag, int[] ranks, int group_size) -> Tensor
+  # This should be changed to distributed but it requires changes all over the place to work
+  python_module: nn
+  dispatch:
+    CompositeExplicitAutograd: all_gather_into_tensor
+  variants: function
+
+
+- func: wait_tensor(Tensor self) -> Tensor
+  # This should be changed to distributed but it requires changes all over the place to work
+  python_module: nn
+  dispatch:
+    CompositeExplicitAutograd: wait_tensor
+  variants: function
diff --git a/aten/src/ATen/native/nested/NestedTensorBackward.cpp b/aten/src/ATen/native/nested/NestedTensorBackward.cpp
index 51a4210a56ae..962cf5a904b1 100644
--- a/aten/src/ATen/native/nested/NestedTensorBackward.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorBackward.cpp
@@ -9,6 +9,9 @@
 #include <ATen/NestedTensorImpl.h>
 #include <c10/core/DispatchKey.h>
 #include <ATen/native/nested/NestedTensorUtils.h>
+#include <ATen/native/nested/NestedTensorMath.h>
+#include <ATen/native/layer_norm.h>
+#include <c10/core/DeviceType.h>
 
 namespace at {
 namespace native {
@@ -41,11 +44,12 @@ std::tuple<Tensor, Tensor, Tensor> nested_linear_backward(
     return std::tuple<Tensor, Tensor, Tensor>{Tensor(), Tensor(), Tensor()};
   }
   Tensor grad_input, grad_weight, grad_bias;
-  auto* nt_grad_output = get_nested_tensor_impl(grad_output);
+  auto grad_ouput_contiguous = grad_output.contiguous();
+  auto* nt_grad_output = get_nested_tensor_impl(grad_ouput_contiguous);
   auto* nt_input = get_nested_tensor_impl(input);
   TORCH_INTERNAL_ASSERT(nt_grad_output != nullptr);
   TORCH_INTERNAL_ASSERT(nt_input != nullptr);
-  TORCH_CHECK(nested_tensor_impl_is_contiguous(nt_grad_output));
+  TORCH_INTERNAL_ASSERT(nested_tensor_impl_is_contiguous(nt_grad_output));
   auto grad_ouput_buffer = nt_grad_output->get_buffer();
   auto input_buffer = nt_input->get_buffer();
 
@@ -170,5 +174,103 @@ Tensor _nested_select_backward_symint(
   return nt_grad;
 }
 
+Tensor gelu_backwards_nested(const Tensor& grad, const Tensor& self, c10::string_view approximate){
+    auto partial_gelu_backward = [approximate](auto && PH1, auto && PH2) { return at::gelu_backward(std::forward<decltype(PH1)>(PH1), std::forward<decltype(PH2)>(PH2), approximate); };
+    return map_nt_binary(grad, self, partial_gelu_backward);
+}
+
+// Naming convention for relu
+Tensor threshold_backwards_nested(const Tensor& grad_output, const Tensor& input, const Scalar& threshold){
+    auto partial_relu_backward = [threshold](auto && PH1, auto && PH2) { return at::threshold_backward(std::forward<decltype(PH1)>(PH1), std::forward<decltype(PH2)>(PH2), threshold); };
+    return map_nt_binary(grad_output, input, partial_relu_backward);
+}
+
+std::tuple<Tensor, Tensor, Tensor> layer_norm_backward_nested(
+    const Tensor& grad,
+    const Tensor& input,
+    IntArrayRef normalized_shape,
+    const Tensor& mean,
+    const Tensor& rstd,
+    const c10::optional<Tensor>& weight_opt /* optional */,
+    const c10::optional<Tensor>& bias_opt /*{ optional */,
+    std::array<bool, 3> grad_input_mask) {
+  // For NestedTensors weight and bias are non nested.
+  auto* nt_impl_grad = get_nested_tensor_impl(grad);
+  auto* nt_impl_input = get_nested_tensor_impl(input);
+  const auto& weight = *weight_opt;
+  const auto& bias = *bias_opt;
+  const auto& sizes = nt_impl_input->get_nested_size_tensor();
+  auto M_N = _check_nested_layer_norm_inputs(
+      *nt_impl_input, normalized_shape, weight, bias);
+  auto M = M_N.first;
+  auto N = M_N.second;
+
+  auto gamma = weight.expect_contiguous();
+  auto beta = bias.expect_contiguous();
+
+  Tensor dInput;
+  Tensor dgamma;
+  Tensor dbeta;
+  auto input_buffer = nt_impl_input->get_buffer();
+  auto grad_buffer = nt_impl_grad->get_buffer();
+  if (grad_input_mask[0]) {
+    dInput = at::native::empty_like(
+        input_buffer,
+        c10::nullopt /* dtype */,
+        c10::nullopt /* layout */,
+        c10::nullopt /* device */,
+        c10::nullopt /* pin_memory */,
+        at::MemoryFormat::Contiguous);
+  }
+  if (grad_input_mask[1]) {
+    dgamma = M > 0 ? at::native::empty_like(
+                         *gamma,
+                         c10::nullopt /* dtype */,
+                         c10::nullopt /* layout */,
+                         c10::nullopt /* device */,
+                         c10::nullopt /* pin_memory */,
+                         at::MemoryFormat::Contiguous)
+                   : at::native::zeros_like(
+                         *gamma,
+                         c10::nullopt /* dtype */,
+                         c10::nullopt /* layout */,
+                         c10::nullopt /* device */,
+                         c10::nullopt /* pin_memory */,
+                         at::MemoryFormat::Contiguous);
+  }
+  if (grad_input_mask[2]) {
+    dbeta = M > 0 ? at::native::empty_like(
+                        *beta,
+                        c10::nullopt /* dtype */,
+                        c10::nullopt /* layout */,
+                        c10::nullopt /* device */,
+                        c10::nullopt /* pin_memory */,
+                        at::MemoryFormat::Contiguous)
+                  : at::native::zeros_like(
+                        *beta,
+                        c10::nullopt /* dtype */,
+                        c10::nullopt /* layout */,
+                        c10::nullopt /* device */,
+                        c10::nullopt /* pin_memory */,
+                        at::MemoryFormat::Contiguous);
+  }
+  if (M > 0) {
+    LayerNormBackwardKernel(
+        input_buffer.is_cuda() ? kCUDA : kCPU,
+        grad_buffer,
+        input_buffer,
+        mean,
+        rstd,
+        *gamma,
+        M,
+        N,
+        &dInput,
+        &dgamma,
+        &dbeta);
+  }
+  return std::make_tuple(
+      wrap_buffer(dInput, sizes), std::move(dgamma), std::move(dbeta));
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp b/aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp
index 215252f91d6d..2bd3c0b64ddd 100644
--- a/aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp
@@ -99,19 +99,31 @@ Tensor NestedTensor_elementwise_Tensor(
       self_impl->get_storage_offsets()
     );
   }
-  // special case when other is dense
-  if (self.is_nested() && !other.is_nested()) {
-    // check for the [B, *, D], [B, 1, D] esuhm case
-    // TODO: this if statement is ugly and hopefully we will remove this in the near future
+  // special case when other is dense (CUDA only for now)
+  if (self.is_nested() && !other.is_nested() && self.is_cuda() && other.is_cuda()) {
     auto self_ptr = get_nested_tensor_impl(self);
-    if (self_ptr->dim() == 3 &&
+    auto other_ = other;
+    // check for the [B, *, D], [B, 1, D] case -> use custom kernel
+    // TODO: this if statement is ugly and hopefully we will remove this in the near future
+    bool is_broadcastable_3d = (
+        self_ptr->dim() == 3 &&
         other.dim() == 3 &&
         self_ptr->size(0) == other.size(0) &&
         other.size(1) == 1 &&
         self_ptr->opt_size(2).has_value() &&
-        self_ptr->opt_size(2).value() == other.size(2) &&
-        self.is_cuda() &&
-        other.is_cuda()) {
+        self_ptr->opt_size(2).value() == other.size(2));
+    // check for the [B, *], [B, 1] case -> treat as 3D with [B, *, 1], [B, 1, 1]
+    bool is_broadcastable_2d = (
+        self_ptr->dim() == 2 &&
+        other.dim() == 2 &&
+        self_ptr->size(0) == other.size(0) &&
+        other.size(1) == 1);
+    if(is_broadcastable_2d) {
+        other_ = other.unsqueeze(-1);
+        is_broadcastable_3d = true;
+    }
+
+    if (is_broadcastable_3d) {
       if (!nested_tensor_impl_is_contiguous(self_ptr)) {
         self_ptr = get_nested_tensor_impl(self.contiguous());
       }
@@ -120,9 +132,9 @@ Tensor NestedTensor_elementwise_Tensor(
       auto result_buffer = at::empty_like(self_buffer);
       auto result = wrap_buffer(result_buffer, self_sizes);
       if (op_name == "add") {
-        nested_dense_elementwise_stub(self.device().type(), result, self, other, NESTED_DENSE_OP::ADD);
+        nested_dense_elementwise_stub(self.device().type(), result, self, other_, NESTED_DENSE_OP::ADD);
       } else if (op_name == "mul") {
-        nested_dense_elementwise_stub(self.device().type(), result, self, other, NESTED_DENSE_OP::MUL);
+        nested_dense_elementwise_stub(self.device().type(), result, self, other_, NESTED_DENSE_OP::MUL);
       } else {
         TORCH_CHECK(false, "Unsupported nested dense elementwise op");
       }
diff --git a/aten/src/ATen/native/nested/NestedTensorMath.cpp b/aten/src/ATen/native/nested/NestedTensorMath.cpp
index 287e8611701e..b91f80732b9c 100644
--- a/aten/src/ATen/native/nested/NestedTensorMath.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorMath.cpp
@@ -168,55 +168,6 @@ Tensor _nested_tensor_from_tensor_list(
       pin_memory);
 }
 
-C10_ALWAYS_INLINE std::pair<int64_t, int64_t> _check_nested_layer_norm_inputs(
-    const NestedTensorImpl& input,
-    IntArrayRef normalized_shape,
-    const Tensor& weight /* optional */,
-    const Tensor& bias /* optional */) {
-
-  const size_t normalized_ndim = normalized_shape.size();
-  TORCH_CHECK(
-      normalized_ndim >= 1,
-      "Expected normalized_shape to be at least 1-dimensional, i.e., ",
-      "containing at least one element, but got normalized_shape = ",
-      normalized_shape);
-  TORCH_CHECK(
-      !weight.defined() || weight.sizes().equals(normalized_shape),
-      "Expected weight to be of same shape as normalized_shape, but got ",
-      "weight of shape ",
-      weight.sizes(),
-      " and normalized_shape = ",
-      normalized_shape);
-  TORCH_CHECK(
-      !bias.defined() || bias.sizes().equals(normalized_shape),
-      "Expected bias to be of same shape as normalized_shape, but got ",
-      "bias of shape ",
-      bias.sizes(),
-      " and normalized_shape = ",
-      normalized_shape);
-
-  // Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input
-  // Also, compute M and N considering the idiosyncracies of NestedTensors
-  int64_t N = 1;
-  for (const auto i: c10::irange(normalized_ndim)) {
-    TORCH_CHECK(
-      input.opt_size(-normalized_ndim + i) != c10::nullopt,
-      "normalized_shape extends into irregular dimensions for the nested tensor"
-    );
-    TORCH_CHECK(
-      normalized_shape[i] == *input.opt_size(-normalized_ndim + i),
-      "The shape at dimension ",
-      i,
-      "of normalized_shape doesn't match the input"
-    );
-    N *= normalized_shape[i];
-  }
-
-  const int64_t M = input.numel() / N;
-
-  return std::make_pair(M, N);
-}
-
 std::tuple<Tensor, Tensor, Tensor> nested_layer_norm(
     const Tensor& input,
     IntArrayRef normalized_shape,
@@ -519,7 +470,6 @@ Tensor select_nested(const Tensor& self, int64_t dim, int64_t index) {
     auto new_sizes = at::empty({ntensors, ndims-1}, TensorOptions().dtype(kLong));
     auto new_strides = at::empty({ntensors, ndims-1}, TensorOptions().dtype(kLong));
     auto new_offsets = std::vector<int64_t>(offsets);
-    std::vector<Tensor> tensor_slices(ntensors);
     for (int64_t i : c10::irange(ntensors)) {
       int64_t *size_ptr = new_sizes[i].data_ptr<int64_t>();
       int64_t *stride_ptr = new_strides[i].data_ptr<int64_t>();
@@ -748,7 +698,7 @@ Tensor unsqueeze_nested(const Tensor& self, int64_t dim) {
   if (wrapped_dim == ndim) {
     new_stride = stridemat.new_ones({stridemat.size(0), 1});
   } else {
-    new_stride = (stridemat.select(1, mat_dim - 1) * sizemat.select(1, mat_dim - 1)).unsqueeze(-1);
+    new_stride = (stridemat.select(1, mat_dim) * sizemat.select(1, mat_dim)).unsqueeze(-1);
   }
   Tensor stridemat_unsqueezed = at::cat({stridemat.slice(1, 0, mat_dim),
                                          new_stride,
@@ -885,7 +835,7 @@ inline std::tuple<bool, Tensor, Tensor> NestedTensor_compute_size_stride(
 // we are designing a better semantics to include both inheritance and inference
 Tensor view_nested(const Tensor& self, IntArrayRef proposed_shape) {
   TORCH_CHECK(
-      proposed_shape.size() > 0,
+      !proposed_shape.empty(),
       "shape '[]' is invalid for a nested tensor");
   auto self_ptr = get_nested_tensor_impl(self);
   // basic information before reshaping
@@ -972,7 +922,7 @@ Tensor _nested_view_from_buffer(
 // See Note [Special size rule for nested tensor]
 Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape) {
   TORCH_CHECK(
-      proposed_shape.size() > 0,
+      !proposed_shape.empty(),
       "shape '[]' is invalid for a nested tensor");
   auto self_ptr = get_nested_tensor_impl(self);
   // basic information before reshaping
diff --git a/aten/src/ATen/native/nested/NestedTensorMath.h b/aten/src/ATen/native/nested/NestedTensorMath.h
index 954fa807f183..c521bb68562c 100644
--- a/aten/src/ATen/native/nested/NestedTensorMath.h
+++ b/aten/src/ATen/native/nested/NestedTensorMath.h
@@ -18,6 +18,62 @@ Tensor map_nt(const Tensor& nt, Func f) {
   const auto& sizes = nt_impl->get_nested_size_tensor();
   return at::detail::make_tensor<NestedTensorImpl>(f(nt_impl->get_buffer()), sizes);
 }
+template <typename Func>
+Tensor map_nt_binary(const Tensor& nt_1, const Tensor& nt_2, Func f){
+  auto* nt_impl_1 = get_nested_tensor_impl(nt_1);
+  auto* nt_impl_2 = get_nested_tensor_impl(nt_2);
+  const auto& sizes = nt_impl_1->get_nested_size_tensor();
+  return at::detail::make_tensor<NestedTensorImpl>(f(nt_impl_1->get_buffer(), nt_impl_2->get_buffer()), sizes);
+}
+
+C10_ALWAYS_INLINE std::pair<int64_t, int64_t> _check_nested_layer_norm_inputs(
+    const NestedTensorImpl& input,
+    IntArrayRef normalized_shape,
+    const Tensor& weight /* optional */,
+    const Tensor& bias /* optional */) {
+
+  const size_t normalized_ndim = normalized_shape.size();
+  TORCH_CHECK(
+      normalized_ndim >= 1,
+      "Expected normalized_shape to be at least 1-dimensional, i.e., ",
+      "containing at least one element, but got normalized_shape = ",
+      normalized_shape);
+  TORCH_CHECK(
+      !weight.defined() || weight.sizes().equals(normalized_shape),
+      "Expected weight to be of same shape as normalized_shape, but got ",
+      "weight of shape ",
+      weight.sizes(),
+      " and normalized_shape = ",
+      normalized_shape);
+  TORCH_CHECK(
+      !bias.defined() || bias.sizes().equals(normalized_shape),
+      "Expected bias to be of same shape as normalized_shape, but got ",
+      "bias of shape ",
+      bias.sizes(),
+      " and normalized_shape = ",
+      normalized_shape);
+
+  // Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input
+  // Also, compute M and N considering the idiosyncracies of NestedTensors
+  int64_t N = 1;
+  for (const auto i: c10::irange(normalized_ndim)) {
+    TORCH_CHECK(
+      input.opt_size(-normalized_ndim + i) != c10::nullopt,
+      "normalized_shape extends into irregular dimensions for the nested tensor"
+    );
+    TORCH_CHECK(
+      normalized_shape[i] == *input.opt_size(-normalized_ndim + i),
+      "The shape at dimension ",
+      i,
+      "of normalized_shape doesn't match the input"
+    );
+    N *= normalized_shape[i];
+  }
+
+  const int64_t M = input.numel() / N;
+
+  return std::make_pair(M, N);
+}
 
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/nested/NestedTensorMatmul.cpp b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp
index c8cfa124330d..6842fadbed5a 100644
--- a/aten/src/ATen/native/nested/NestedTensorMatmul.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp
@@ -306,10 +306,11 @@ Tensor matmul_nested(const Tensor& self, const Tensor& mat2) {
       self_dim == 4 && self.is_contiguous() &&
       mat2_dim == 4 && mat2.is_contiguous() &&
       !(GradMode::is_enabled() && (self.requires_grad() || mat2.requires_grad()))) {
-    auto n_heads = self_sizes.select(0, 1).select(0, 0).item<int64_t>();
-    auto self_first_dim_n_heads = at::all(self_sizes.select(1, 0) == n_heads).item<bool>();
-    auto mat2_first_dim_n_heads = at::all(mat2_sizes.select(1, 0) == n_heads).item<bool>();
-    if (self_first_dim_n_heads && mat2_first_dim_n_heads) {
+    const auto& self_opt_head_dim = self_ptr->opt_size(1);
+    const auto& mat2_opt_head_dim = mat2_ptr->opt_size(1);
+    if (self_opt_head_dim.has_value() &&
+        mat2_opt_head_dim.has_value() &&
+        self_opt_head_dim.value() == mat2_opt_head_dim.value()) {
       return matmul_with_bmm_nested(self, mat2);
     }
   }
diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
index 9c72454560d3..98865e12e21e 100644
--- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
@@ -319,98 +319,45 @@ bool is_safe_to_get_storage_as_tensor(const NestedTensorImpl* tensor) {
   return true;
 }
 
-} // namespace
-
-std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_flash_attention_nestedtensor_cuda(
+/**
+ * This function will take nested query, key, and value
+ * and will preprocess it in order to run with either
+ * the flash-attention or efficient-attention kernels.
+ * @return A tuple containing all the necessary data for running the fused kernels
+ */
+inline auto sdpa_nested_preprocessing(
     const Tensor& query,
     const Tensor& key,
-    const Tensor& value,
-    double dropout_p,
-    bool return_softmax,
-    bool is_causal) {
-  TORCH_CHECK(false, "There are currently cuda memory errors being returned from this path.")
+    const Tensor& value) {
   // Query (Batch x Num_heads x {Q_seq_len}  x Dim_per_head)
   // Key   (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
   // Value (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
   const int64_t num_heads = query.size(1);
-  const int64_t head_dim = query.size(3);
-
-  // Query -> Query (Batch x {Q_seq_len}  x Num_heads x Dim_per_head)
-  // Key   -> Key   (Batch x {KV_seq_len} x Num_heads x Dim_per_head)
-  // Value -> Value (Batch x {KV_seq_len} x Num_heads x Dim_per_head)
-  Tensor q_t = query.transpose(1, 2).contiguous();
-  Tensor k_t = key.transpose(1, 2).contiguous();
-  Tensor v_t = value.transpose(1, 2).contiguous();
-
-  // K and V have to have the same Nnz, should probably torch_check
-  // assume in order to not iterate over v
-
-  auto cumulative_and_max_q = cumulative_and_max_seq_len(q_t);
-  auto cumulative_and_max_k = cumulative_and_max_seq_len(k_t);
-
-  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q);
-  Tensor cumulative_sequence_length_k = std::get<0>(cumulative_and_max_k);
-
-  const int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q);
-  const int64_t max_seqlen_batch_k = std::get<1>(cumulative_and_max_k);
-
-  const int64_t Nnz_q  = cumulative_sequence_length_q[-1].item<int64_t>();
-  const int64_t Nnz_kv = cumulative_sequence_length_k[-1].item<int64_t>();
-
-  auto query_buffer_reshaped =
-      get_buffer(q_t).view({Nnz_q, num_heads, head_dim});
-  auto key_buffer_reshaped =
-      get_buffer(k_t).view({Nnz_kv, num_heads, head_dim});
-  auto value_buffer_reshaped =
-      get_buffer(v_t).view({Nnz_kv, num_heads, head_dim});
-
-  auto attention_and_lse_and_softmax =
-  at::_flash_attention_forward(
-      query_buffer_reshaped,
-      key_buffer_reshaped,
-      value_buffer_reshaped,
-      cumulative_sequence_length_q,
-      cumulative_sequence_length_k,
-      max_seqlen_batch_q,
-      max_seqlen_batch_k,
-      return_softmax,
-      dropout_p,
-      is_causal);
-  // Reshape output to convert nnz to batch_size and seq_len
-  Tensor attention = std::get<0>(attention_and_lse_and_softmax);
-  attention = wrap_buffer(attention.view(-1), get_nested_size_tensor(q_t).clone()).transpose(1,2);
-  return std::tie(attention, std::get<1>(attention_and_lse_and_softmax), std::get<2>(attention_and_lse_and_softmax));
-}
-
-std::tuple<Tensor, Tensor> _scaled_dot_product_efficient_attention_nestedtensor_cuda(
-    const Tensor& query,
-    const Tensor& key,
-    const Tensor& value,
-    bool compute_log_sumexp,
-    bool is_causal) {
-   // Query (Batch x Num_heads x {Q_seq_len}  x Dim_per_head)
-  // Key   (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
-  // Value (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
-  const int64_t num_heads = query.size(1);
-  const int64_t head_dim = query.size(3);
+  const int64_t head_dim_qk = query.size(3);
+  const int64_t head_dim_v = value.size(3);
 
   Tensor q_t = query.transpose(1, 2);
   Tensor k_t = key.transpose(1, 2);
   Tensor v_t = value.transpose(1, 2);
 
   auto cumulative_and_max_q_and_nnz_q = cumulative_and_max_seq_len(q_t);
-  auto cumulative_and_max_k_and_nnz_k = cumulative_and_max_seq_len(k_t);
+  auto cumulative_and_max_kv_and_nnz_kv = cumulative_and_max_seq_len(k_t);
 
-  // K and V have to have the same Nnz, should probably torch_check
+  // [TODO] K and V have to have the same Nnz, should probably torch_check
   // assume in order to not iterate over v
 
-  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q_and_nnz_q);
-  Tensor cumulative_sequence_length_k = std::get<0>(cumulative_and_max_k_and_nnz_k);
+  Tensor cumulative_sequence_length_q =
+      std::get<0>(cumulative_and_max_q_and_nnz_q);
+  Tensor cumulative_sequence_length_kv =
+      std::get<0>(cumulative_and_max_kv_and_nnz_kv);
 
-  const int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q_and_nnz_q);
+  const int64_t max_seqlen_batch_q =
+      std::get<1>(cumulative_and_max_q_and_nnz_q);
+  const int64_t max_seqlen_batch_kv =
+      std::get<1>(cumulative_and_max_kv_and_nnz_kv);
 
   const int64_t Nnz_q = std::get<2>(cumulative_and_max_q_and_nnz_q);
-  const int64_t Nnz_kv = std::get<2>(cumulative_and_max_k_and_nnz_k);
+  const int64_t Nnz_kv = std::get<2>(cumulative_and_max_kv_and_nnz_kv);
 
   Tensor query_buffer_reshaped;
   Tensor key_buffer_reshaped;
@@ -462,92 +409,128 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_efficient_attention_nestedtensor_
   const int64_t head_v_stride = v_strides[1];
 
   query_buffer_reshaped = q_storage_as_tensor.as_strided(
-      {Nnz_q, num_heads, head_dim},
+      {Nnz_q, num_heads, head_dim_qk},
       {nnz_q_stride, head_q_stride, head_dim_stride},
       query_impl->get_storage_offsets()[0]);
   key_buffer_reshaped = k_storage_as_tensor.as_strided(
-      {Nnz_kv, num_heads, head_dim},
+      {Nnz_kv, num_heads, head_dim_qk},
       {nnz_k_stride, head_k_stride, head_dim_stride},
       key_impl->get_storage_offsets()[0]);
   value_buffer_reshaped = v_storage_as_tensor.as_strided(
-      {Nnz_kv, num_heads, head_dim},
+      {Nnz_kv, num_heads, head_dim_v},
       {nnz_v_stride, head_v_stride, head_dim_stride},
       value_impl->get_storage_offsets()[0]);
-  std::tuple<Tensor, Tensor> attention_and_logsumexp=
-      at::_efficient_attention_forward(
-          query_buffer_reshaped.unsqueeze(0),
-          key_buffer_reshaped.unsqueeze(0),
-          value_buffer_reshaped.unsqueeze(0),
+
+  auto output_shape = get_nested_size_tensor(q_t).clone();
+  if (head_dim_v != head_dim_qk) {
+    output_shape.select(1, -1).fill_(head_dim_v);
+  }
+
+  return std::make_tuple(
+      query_buffer_reshaped,
+      key_buffer_reshaped,
+      value_buffer_reshaped,
+      cumulative_sequence_length_q,
+      cumulative_sequence_length_kv,
+      max_seqlen_batch_q,
+      max_seqlen_batch_kv,
+      output_shape);
+}
+
+} // namespace
+
+std::tuple<
+    Tensor,
+    Tensor,
+    Tensor,
+    Tensor,
+    int64_t,
+    int64_t,
+    int64_t,
+    int64_t,
+    Tensor>
+_scaled_dot_product_flash_attention_nestedtensor_cuda(
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    double dropout_p,
+    bool is_causal,
+    bool return_debug_mask) {
+  Tensor query_buffer_reshaped, key_buffer_reshaped, value_buffer_reshaped,
+      cumulative_sequence_length_q, cumulative_sequence_length_kv, output_shape;
+  int64_t max_seqlen_batch_q{0}, max_seqlen_batch_kv{0};
+  std::tie(
+      query_buffer_reshaped,
+      key_buffer_reshaped,
+      value_buffer_reshaped,
+      cumulative_sequence_length_q,
+      cumulative_sequence_length_kv,
+      max_seqlen_batch_q,
+      max_seqlen_batch_kv,
+      output_shape) = sdpa_nested_preprocessing(query, key, value);
+
+  Tensor attention, log_sumexp, debug_attn_mask;
+  int64_t philox_seed{0}, philox_offset{0};
+  std::tie(attention, log_sumexp, philox_seed, philox_offset, debug_attn_mask) =
+      at::_flash_attention_forward(
+          query_buffer_reshaped,
+          key_buffer_reshaped,
+          value_buffer_reshaped,
           cumulative_sequence_length_q,
-          cumulative_sequence_length_k,
+          cumulative_sequence_length_kv,
           max_seqlen_batch_q,
-          compute_log_sumexp,
-          is_causal);
+          max_seqlen_batch_kv,
+          dropout_p,
+          is_causal,
+          return_debug_mask);
   // Reshape output to convert nnz to batch_size and seq_len
-  Tensor attention = std::get<0>(attention_and_logsumexp);
-  attention =
-      wrap_buffer(attention.view(-1), get_nested_size_tensor(q_t).clone())
-          .transpose(1, 2);
-  return std::tie(attention, std::get<1>(attention_and_logsumexp));
+  attention = wrap_buffer(attention.view(-1), output_shape).transpose(1, 2);
+  return std::make_tuple(
+      attention,
+      log_sumexp,
+      cumulative_sequence_length_q,
+      cumulative_sequence_length_kv,
+      max_seqlen_batch_q,
+      max_seqlen_batch_kv,
+      philox_seed,
+      philox_offset,
+      debug_attn_mask);
 }
 
-Tensor flash_attention_helper(
+std::tuple<Tensor, Tensor>
+_scaled_dot_product_efficient_attention_nestedtensor_cuda(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
-    double dropout_p,
-    bool need_atten_weights,
+    bool compute_log_sumexp,
     bool is_causal) {
-  //  Query is of size (batch_size x ragged_seq_len x (3 or 1) x n_heads x
-  //  head_did
-  int64_t head_dim{query.size(-1)};
-  int64_t num_heads{query.size(-2)};
-
-  auto cumulative_and_max_q_and_nnz_q = cumulative_and_max_seq_len(query);
-  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q_and_nnz_q);
-  int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q_and_nnz_q);
+  Tensor query_buffer_reshaped, key_buffer_reshaped, value_buffer_reshaped,
+      cumulative_sequence_length_q, cumulative_sequence_length_kv, output_shape;
+  int64_t max_seqlen_batch_q{0};
+  std::tie(
+      query_buffer_reshaped,
+      key_buffer_reshaped,
+      value_buffer_reshaped,
+      cumulative_sequence_length_q,
+      cumulative_sequence_length_kv,
+      max_seqlen_batch_q,
+      std::ignore,
+      output_shape) = sdpa_nested_preprocessing(query, key, value);
 
-  TORCH_CHECK(
-      key.is_same(key) && query.is_same(value),
-      "Key and Value must be the same tensor");
-
-  int64_t Nnz_q = std::get<2>(cumulative_and_max_q_and_nnz_q);
-
-  // For the packed case we need to set the output size for dim 2 to 1
-  auto atten_size = get_nested_size_tensor(query).clone();
-  atten_size.index({at::indexing::Slice(), 1}) = 1;
-
-  auto qkv_buffer_reshaped = get_buffer(query)
-                                 .view({Nnz_q, 3, num_heads, head_dim})
-                                 .transpose(0, 1)
-                                 .contiguous();
-
-  auto q = qkv_buffer_reshaped[0];
-  auto k = qkv_buffer_reshaped[1];
-  auto v = qkv_buffer_reshaped[2];
-
-  TORCH_CHECK(q.is_contiguous());
-  TORCH_CHECK(k.is_contiguous());
-  TORCH_CHECK(v.is_contiguous());
-
-  // If we are passing in query, key, value all the same tensors then we have
-  // packed them into one tensor and need to slice for flash attention
-  Tensor attention =
-      std::get<0>(at::_flash_attention_forward(
-          q,
-          k,
-          v,
-          cumulative_sequence_length_q,
+  std::tuple<Tensor, Tensor> attention_and_logsumexp =
+      at::_efficient_attention_forward(
+          query_buffer_reshaped.unsqueeze(0),
+          key_buffer_reshaped.unsqueeze(0),
+          value_buffer_reshaped.unsqueeze(0),
           cumulative_sequence_length_q,
+          cumulative_sequence_length_kv,
           max_seqlen_batch_q,
-          max_seqlen_batch_q,
-          false /*return_softmax*/,
-          dropout_p,
-          is_causal));
-  // Output of flash_attention is a regular tensor lets wrap it back up to
-  // form a nested tensor
-
-  return wrap_buffer(attention.view(-1), atten_size);
+          compute_log_sumexp,
+          is_causal);
+  // Reshape output to convert nnz to batch_size and seq_len
+  Tensor attention = std::get<0>(attention_and_logsumexp);
+  attention = wrap_buffer(attention.view(-1), output_shape).transpose(1, 2);
+  return std::tie(attention, std::get<1>(attention_and_logsumexp));
 }
 
 } // namespace native
diff --git a/aten/src/ATen/native/quantized/cpu/LinearUnpackImpl.cpp b/aten/src/ATen/native/quantized/cpu/LinearUnpackImpl.cpp
index c9387eb0ebb1..8e3739a78d6f 100644
--- a/aten/src/ATen/native/quantized/cpu/LinearUnpackImpl.cpp
+++ b/aten/src/ATen/native/quantized/cpu/LinearUnpackImpl.cpp
@@ -61,11 +61,18 @@ std::tuple<at::Tensor, c10::optional<at::Tensor>> PackedLinearWeight::unpack() {
 #ifdef USE_PYTORCH_QNNPACK
 std::tuple<at::Tensor, c10::optional<at::Tensor>> PackedLinearWeightsQnnp::
     unpack() {
-  TORCH_CHECK(
-      orig_weight.defined(),
-      "Cannot unpack weights. "
-      "Call at::globalContext()::setReleaseOriginalWeights(false) before packing or loading to enable unpacking.");
-  return std::tuple<at::Tensor, c10::optional<at::Tensor>>(orig_weight, bias_);
+    if (orig_weight.defined()){
+        return std::tuple<at::Tensor, c10::optional<at::Tensor>>(orig_weight, bias_);
+    }
+    else{
+        TORCH_WARN(
+        "Original weight is freed, we are converting pre-packed weight to original weight.");
+        uint8_t* kernel = w->unpackWeights(w_zero_points.data(), n_elements);
+        at::Tensor original_tensor = at::from_blob(kernel, weight_sizes, c10::kByte).clone().toType(c10::kQInt8);
+        original_tensor.sub_(128);
+        free(kernel);
+        return std::tuple<at::Tensor, c10::optional<at::Tensor>>(original_tensor, bias_);
+    }
 }
 #endif // USE_PYTORCH_QNNPACK
 
diff --git a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h
index 077dc1fc6064..cc2487ac4606 100644
--- a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h
+++ b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h
@@ -15,7 +15,9 @@ using PrimitiveCacheKey = std::tuple<
     std::vector<int64_t>, // input_shape
     double, // output_scale
     int64_t, // output_zero_point
-    int64_t>; // OMP_number_of_threads
+    int64_t, // OMP_number_of_threads
+    double, // accum_scale
+    int64_t>; // accum_zero_point
 
 enum CacheKeyIndex {
   InputScale,
@@ -269,6 +271,18 @@ struct PackedConvWeightsOnednn : public ConvPackedParamsBase<kSpatialDim> {
       const at::Tensor& input,
       bool reduce_range) override;
 
+  at::Tensor apply_add(
+      const at::Tensor& input,
+      const at::Tensor& accum,
+      double output_scale,
+      int64_t output_zero_point);
+
+  at::Tensor apply_add_relu(
+      const at::Tensor& input,
+      const at::Tensor& accum,
+      double output_scale,
+      int64_t output_zero_point);
+
   std::tuple<at::Tensor, c10::optional<at::Tensor>> unpack() override;
 
   static c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> prepack(
@@ -313,6 +327,7 @@ struct PackedConvWeightsOnednn : public ConvPackedParamsBase<kSpatialDim> {
   template <bool ReluFused>
   at::Tensor apply_impl(
       const at::Tensor& input,
+      const c10::optional<at::Tensor>& accum,
       double output_scale,
       int64_t output_zero_point);
 
@@ -376,18 +391,27 @@ static bool is_weight_symmetric_quant(
   return is_symmetric;
 }
 
-// Check if onednn should be used w.r.t fbgemm
+// When qengine is x86, use this util func to check if onednn kernel
+// is preferred than fbgemm's to get better performance.
 static bool should_use_onednn_quant(
     const at::Tensor& weight,
     bool is_transposed_conv,
     int groups,
     torch::List<int64_t> output_padding) {
+  // Performance of onednn is only validated on Linux right now.
+  // Also, the heuristics for dispatching are based on perf data on Linux.
+  // So, for x86 qengine, we always use fbgemm kernels if OS is not Linux.
+  // TODO Support more OSs.
+#if !defined(__linux__)
+  return false;
+#else
   bool vnni_available = cpuinfo_has_x86_avx512vnni();
   bool w_sym_quant =
       is_weight_symmetric_quant(weight, is_transposed_conv);
   bool opad_all_zero =
       std::all_of(output_padding.begin(), output_padding.end(), [](int i) { return i==0; });
   return vnni_available && (groups <= 100) && w_sym_quant && opad_all_zero;
+#endif
 }
 
 } // onednn_utils
diff --git a/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h b/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h
index ce61afff6b57..cfa9dcdb7028 100644
--- a/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h
+++ b/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h
@@ -48,7 +48,10 @@ struct PackedLinearWeightsQnnp : public LinearPackedParamsBase {
         per_channel_(this->orig_weight.qscheme() == at::kPerChannelAffine),
         input_scale(std::move(input_scale)),
         w_scales(std::move(w_scales)),
-        w_zero_points(std::move(w_zps)) {}
+        w_zero_points(std::move(w_zps)) {
+          weight_sizes = this->orig_weight.sizes().vec();
+          n_elements = std::accumulate(std::begin(weight_sizes), std::end(weight_sizes), 1, std::multiplies<double>());
+        }
 
   std::unique_ptr<qnnpack::PackBMatrix> w;
   at::Tensor orig_weight;
@@ -58,6 +61,8 @@ struct PackedLinearWeightsQnnp : public LinearPackedParamsBase {
   at::Tensor w_scales;
   std::vector<uint8_t> w_zero_points;
   std::vector<float> requantization_scales;
+  std::vector<int64_t> weight_sizes;
+  int n_elements;
 
   at::Tensor apply(
       at::Tensor input,
diff --git a/aten/src/ATen/native/quantized/cpu/QuantUtils.h b/aten/src/ATen/native/quantized/cpu/QuantUtils.h
index 85bcaa1a69fd..0b026c739786 100644
--- a/aten/src/ATen/native/quantized/cpu/QuantUtils.h
+++ b/aten/src/ATen/native/quantized/cpu/QuantUtils.h
@@ -188,7 +188,7 @@ inline TensorQuantizationParams ChooseQuantizationParams(
 constexpr int64_t kConv1dSqueezeDim = 0;
 static C10_UNUSED torch::List<int64_t> MakeArgForConv1d(const torch::List<int64_t>& arg,
                                              int64_t base_value) {
-  TORCH_CHECK(arg.size() > 0, "Argument must have elements.");
+  TORCH_CHECK(!arg.empty(), "Argument must have elements.");
   torch::List<int64_t> result({arg.get(0), base_value});
   if (arg.size() == 1) {
     result[1] = arg.get(0);
diff --git a/aten/src/ATen/native/quantized/cpu/QuantizedOps.h b/aten/src/ATen/native/quantized/cpu/QuantizedOps.h
index 8cba2f8cdd94..4cabf903a85c 100644
--- a/aten/src/ATen/native/quantized/cpu/QuantizedOps.h
+++ b/aten/src/ATen/native/quantized/cpu/QuantizedOps.h
@@ -176,7 +176,7 @@ using qmean_inner_dim_fn = void (*)(
 using qstd_inner_dim_fn = void (*)(
     const Tensor& /* X */,
     OptionalIntArrayRef /* dim */,
-    optional<int64_t> /* unbiased */,
+    const c10::optional<Scalar>& /* correction */,
     bool /* keepdim */,
     Tensor& /* Y */);
 
diff --git a/aten/src/ATen/native/quantized/cpu/ReduceOps.cpp b/aten/src/ATen/native/quantized/cpu/ReduceOps.cpp
index c2d18693b9ea..7d3a14358ff9 100644
--- a/aten/src/ATen/native/quantized/cpu/ReduceOps.cpp
+++ b/aten/src/ATen/native/quantized/cpu/ReduceOps.cpp
@@ -194,18 +194,18 @@ Tensor& mean_out_quantized_cpu(
 inline bool is_std_inner_dim_fast_path(
     const Tensor& self,
     OptionalIntArrayRef dim,
-    optional<int64_t> unbiased) {
+    const c10::optional<Scalar>& correction) {
   // Do not enter fast path if there are too few elements
   IntArrayRef dims = dim.has_value() ? dim.value() : IntArrayRef();
   auto all_dims = std::vector<int64_t>(self.dim());
   std::iota(all_dims.begin(), all_dims.end(), 0);
   dims = dims.empty() ? all_dims : dims;
-  bool is_unbiased = unbiased.has_value() ? unbiased.value() : 0;
+  bool has_correction = !correction.value_or(1).equal(0);
   int64_t num_ele = 1;
   for (auto d : dims) {
     num_ele *= self.size(d);
   }
-  if (num_ele == 1 && is_unbiased) {
+  if (num_ele == 1 && has_correction) {
     return false;
   }
   return is_innnermost_dim(self, dims);
@@ -214,19 +214,19 @@ inline bool is_std_inner_dim_fast_path(
 Tensor& std_out_quantized_cpu(
     const Tensor& self,
     OptionalIntArrayRef dim,
-    optional<int64_t> unbiased,
+    const c10::optional<Scalar>& correction,
     bool keepdim,
     Tensor& result) {
   // Fast path
   if (self.is_contiguous(c10::MemoryFormat::Contiguous) &&
-      is_std_inner_dim_fast_path(self, dim, unbiased)) {
-    qstd_inner_dim_stub(self.device().type(), self, dim, unbiased, keepdim, result);
+      is_std_inner_dim_fast_path(self, dim, correction)) {
+    qstd_inner_dim_stub(self.device().type(), self, dim, correction, keepdim, result);
     return result;
   }
 
   // Reference path
   auto self_dequantized = self.dequantize();
-  auto result_dequantized = at::std(self_dequantized, dim, unbiased, keepdim);
+  auto result_dequantized = at::std(self_dequantized, dim, correction, keepdim);
   result = at::quantize_per_tensor(
       result_dequantized,
       self.q_scale(),
@@ -238,30 +238,30 @@ Tensor& std_out_quantized_cpu(
 Tensor std_quantized_cpu(
     const Tensor& self,
     OptionalIntArrayRef dim,
-    optional<int64_t> unbiased,
+    const c10::optional<Scalar>& correction,
     bool keepdim) {
   Tensor result;
-  std_out_quantized_cpu(self, dim, unbiased, keepdim, result);
+  std_out_quantized_cpu(self, dim, correction, keepdim, result);
   return result;
 }
 
 Tensor std_quantized_cpu(
     const Tensor& self,
     DimnameList dim,
-    optional<int64_t> unbiased,
+    const c10::optional<Scalar>& correction,
     bool keepdim) {
   return std_quantized_cpu(
-      self, dimnames_to_positions(self, dim), unbiased, keepdim);
+      self, dimnames_to_positions(self, dim), correction, keepdim);
 }
 
 Tensor& std_out_quantized_cpu(
     Tensor& result,
     const Tensor& self,
     DimnameList dim,
-    optional<int64_t> unbiased,
+    const c10::optional<Scalar>& correction,
     bool keepdim) {
   return std_out_quantized_cpu(
-      self, dimnames_to_positions(self, dim), unbiased, keepdim, result);
+      self, dimnames_to_positions(self, dim), correction, keepdim, result);
 }
 
 } // namespace native
diff --git a/aten/src/ATen/native/quantized/cpu/TensorShape.cpp b/aten/src/ATen/native/quantized/cpu/TensorShape.cpp
index b4b519020246..58af539cb142 100644
--- a/aten/src/ATen/native/quantized/cpu/TensorShape.cpp
+++ b/aten/src/ATen/native/quantized/cpu/TensorShape.cpp
@@ -34,7 +34,7 @@ DEFINE_DISPATCH(qcat_relu_nhwc_stub);
 namespace {
 
 bool is_cat_nhwc_fast_path(const MaterializedITensorListRef& qxs, int64_t dim) {
-  TORCH_CHECK(qxs.size() > 0);
+  TORCH_CHECK(!qxs.empty());
   bool is_fast_path = dim == 1;
   // NOLINTNEXTLINE(performance-implicit-conversion-in-loop)
   for (const at::Tensor& qx : qxs) {
diff --git a/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp b/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp
index 4b4c63eb7c3d..8c62af85b8df 100644
--- a/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp
+++ b/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp
@@ -215,9 +215,6 @@ Tensor _upsample_nearest3d_quantized_cpu(
   }
 }
 
-using at::native::upsample::compute_output_size;
-using at::native::upsample::get_scale_value;
-
 Tensor upsample_nearest3d_quantized_cpu(
     const Tensor& input,
     IntArrayRef osize,
diff --git a/aten/src/ATen/native/quantized/cpu/XnnpackUtils.h b/aten/src/ATen/native/quantized/cpu/XnnpackUtils.h
index 12e4fbbf1e76..fdc21902c2c5 100644
--- a/aten/src/ATen/native/quantized/cpu/XnnpackUtils.h
+++ b/aten/src/ATen/native/quantized/cpu/XnnpackUtils.h
@@ -99,6 +99,7 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
         op_min,         /* int8_t output_min                    */
         op_max,         /* int8_t output_max                    */
         flags,          /* uint32_t flags                       */
+        nullptr,        /* xnn_caches_t caches                  */
         op);            /* xnn_operator_t* deconvolution_op_out */
 
   }
@@ -130,6 +131,7 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
         op_min,         /* int8_t output_min                  */
         op_max,         /* int8_t output_max                  */
         flags,          /* uint32_t flags                     */
+        nullptr,        /* xnn_caches_t caches                */
         op);            /* xnn_operator_t* convolution_op_out */
   } else { /* per_channel */
     return xnn_create_convolution2d_nhwc_qc8(
@@ -158,6 +160,7 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
         op_min,         /* int8_t output_min                  */
         op_max,         /* int8_t output_max                  */
         flags,          /* uint32_t flags                     */
+        nullptr,        /* xnn_caches_t caches                */
         op);            /* xnn_operator_t* convolution_op_out */
   }
 }
@@ -254,6 +257,7 @@ enum xnn_status xnnp_create_fully_connected_nc(
       output_min,              /* int8_t output_min                      */
       output_max,              /* int8_t output_max                      */
       flags,                   /* uint32_t flags                         */
+      nullptr,                 /* xnn_caches_t caches                    */
       fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */
 }
 
diff --git a/aten/src/ATen/native/quantized/cpu/conv_serialization.h b/aten/src/ATen/native/quantized/cpu/conv_serialization.h
index d6221531b808..7ef97bdcadbc 100644
--- a/aten/src/ATen/native/quantized/cpu/conv_serialization.h
+++ b/aten/src/ATen/native/quantized/cpu/conv_serialization.h
@@ -90,7 +90,7 @@ ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) {
   int version = -1;
   if (v.isTuple()) {
     const auto& elements = v.toTupleRef().elements();
-    if (elements.size() > 0) {
+    if (!elements.empty()) {
       auto firstElement = elements[0];
       if (firstElement.isTensor()) {
         version = 1;
@@ -123,9 +123,10 @@ ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) {
     torch::List<at::Tensor> dilation_x_kSpatialDim = elements[4].toTensorList();
     at::Tensor groups = elements[5].toTensor();
 
-    std::vector<c10::optional<at::Tensor>> optional;
-
     std::vector<int64_t> config_vals;
+    config_vals.reserve(
+        stride_x_kSpatialDim.size() + padding_x_kSpatialDim.size() +
+        dilation_x_kSpatialDim.size() + kSpatialDim + 3);
     config_vals.push_back(kSpatialDim);
     for (const auto i : c10::irange(stride_x_kSpatialDim.size())) {
       auto stride = stride_x_kSpatialDim.get(i);
diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
index aabd980c9f00..953789540308 100644
--- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
+++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
@@ -2874,7 +2874,7 @@ void qmean_inner_dim_kernel(
 void qstd_inner_dim_kernel(
     const Tensor& self,
     OptionalIntArrayRef dim,
-    optional<int64_t> unbiased,
+    const c10::optional<Scalar>& correction_opt,
     bool keepdim,
     Tensor& result) {
   ScalarType dtype = self.scalar_type();
@@ -2896,10 +2896,8 @@ void qstd_inner_dim_kernel(
   if (!keepdim) {
     out_dims.erase(out_dims.end() - num_dims_to_squeeze, out_dims.end());
   }
-  int64_t den = N; // Denominator when computing mean and deviation
-  if (unbiased.has_value() && unbiased.value() == 1) {
-    den -= 1;
-  }
+  const auto correction = correction_opt.value_or(1).toDouble();
+  double den = std::max(N - correction, 0.0); // Denominator when computing mean and deviation
   auto x_scale = self.q_scale();
   auto x_zp = self.q_zero_point();
   result = at::_empty_affine_quantized(
diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp
index 5cfd55a09c94..3c6dcd93e617 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp
@@ -18,6 +18,7 @@
 #include <ATen/native/quantized/cpu/QuantUtils.h>
 #include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 #include <torch/library.h>
+#include <ATen/quantized/Quantizer.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -1150,7 +1151,7 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply(
     const at::Tensor& input,
     double output_scale,
     int64_t output_zero_point) {
-  return apply_impl<false>(input, output_scale, output_zero_point);
+  return apply_impl<false>(input, c10::nullopt, output_scale, output_zero_point);
 }
 
 template <int kSpatialDim>
@@ -1158,13 +1159,34 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_relu(
     const at::Tensor& input,
     double output_scale,
     int64_t output_zero_point) {
-  return apply_impl<true>(input, output_scale, output_zero_point);
+  return apply_impl<true>(input, c10::nullopt, output_scale, output_zero_point);
+}
+
+template <int kSpatialDim>
+at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_add(
+    const at::Tensor& input,
+    const at::Tensor& accum,
+    double output_scale,
+    int64_t output_zero_point) {
+  TORCH_CHECK(kSpatialDim == 2, " Currently, only conv2d with add is supported.");
+  return apply_impl<false>(input, accum, output_scale, output_zero_point);
+}
+
+template <int kSpatialDim>
+at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_add_relu(
+    const at::Tensor& input,
+    const at::Tensor& accum,
+    double output_scale,
+    int64_t output_zero_point) {
+  TORCH_CHECK(kSpatialDim == 2, " Currently, only conv2d add relu is supported.");
+  return apply_impl<true>(input, accum, output_scale, output_zero_point);
 }
 
 template <int kSpatialDim>
 template <bool kReluFused>
 at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
     const at::Tensor& act,
+    const c10::optional<at::Tensor>& accum,
     double output_scale,
     int64_t output_zero_point) {
   std::string func_name = "quantized::conv";
@@ -1172,6 +1194,18 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
     func_name += "_transpose";
   }
   func_name += std::to_string(kSpatialDim) + "d";
+
+  // has_accum: extra input besides the conv to do conv add fusion.
+  bool has_accum = accum.has_value() ? true : false;
+  auto& ctx = at::globalContext();
+  if (has_accum) {
+    func_name += "_add";
+    TORCH_CHECK(
+      !transpose(),
+      "Didn't support transposed conv for conv with add ",
+      c10::toString(ctx.qEngine()));
+  }
+
   if (kReluFused) {
     func_name += "_relu";
   }
@@ -1237,8 +1271,20 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
   if (output.numel() == 0) {
     return output;
   }
-  ideep::tensor dst({dst_dims, ideep::tensor::data_type::u8, {output.strides().cbegin(), output.strides().cend()}},
-                    output.data_ptr());
+  ideep::tensor dst;
+  at::Tensor accum_contig;
+  if (has_accum) {
+    auto dst_desc = ideep::tensor::desc(dst_dims, src_data_type,
+        kSpatialDim == 2 ? ideep::format_tag::nhwc : ideep::format_tag::ndhwc);
+    accum_contig = accum.value().contiguous(kSpatialDim == 2 ? c10::MemoryFormat::ChannelsLast : c10::MemoryFormat::ChannelsLast3d);
+    TORCH_CHECK(accum_contig.dtype() == output.dtype(), "The output tensor should have same dtype as the accum tensor.");
+    // When fused with sum, the dst tensor will share the data ptr as the accum tensor.
+    dst.init(dst_desc, accum_contig.data_ptr());
+  } else {
+    dst = ideep::tensor({dst_dims, ideep::tensor::data_type::u8, {output.strides().cbegin(), output.strides().cend()}},
+                      output.data_ptr());
+  }
+
   // Parameters
   const ideep::dims& strides = stride().vec();
   const ideep::dims& dilates = dilation().vec();
@@ -1252,9 +1298,24 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
   double inv_output_scale = 1.0/output_scale;
   const ideep::zero_point_t src_zero_points = ideep::zero_point_t(1, input_zp);
   const ideep::zero_point_t dst_zero_points = ideep::zero_point_t(1, output_zero_point);
-  ideep::attr_t op_attr = kReluFused ? ideep::attr_t::fuse_relu() : ideep::attr_t();
+
+  ideep::attr_t op_attr;
+  float sum_scale = has_accum ? accum.value().q_scale() : 1.0;
+  int32_t sum_zero_point = has_accum ? accum.value().q_zero_point() : 0;
+  if (has_accum) {
+    // Just tells we have these post op, the actual value such as scale and zero point will be setted later.
+    op_attr = kReluFused ? ideep::attr_t::residual_with_sum_zero_point() : ideep::attr_t::fuse_sum();
+    const ideep::scale_t accum_scale = ideep::scale_t(1, 1.0/sum_scale);
+    const ideep::zero_point_t accum_zero_points = ideep::zero_point_t(1, sum_zero_point);
+    // Set the dst scale and zero point with the value of accum.
+    // The true scale and zero point is stored in ideep::scale_t(scale_size, inv_output_scale) and dst_zero_points.
+    dst.set_scale(accum_scale);
+    dst.set_zero_point(accum_zero_points);
+  } else {
+    op_attr = kReluFused ? ideep::attr_t::fuse_relu() : ideep::attr_t();
+  }
   // Since src zero point is unknown, set runtime value here
-  op_attr.set_zero_points(DNNL_ARG_SRC, ideep::utils::tensor_zp_mask(1), {DNNL_RUNTIME_S32_VAL});
+  op_attr.set_zero_points(DNNL_ARG_SRC, ideep::utils::tensor_zp_mask(1), src_zero_points);
 
   // Bias might be modified outside (e.g. by quantization bias correction).
   // If so, update the prepacked bias as well.
@@ -1267,7 +1328,7 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
     // Primitive cache is initialized when called for the first time
     // and won't be updated afterwards.
     PrimitiveCacheKey cache_key = std::make_tuple(
-        input_scale, input_zp, src_dims, output_scale, output_zero_point, num_threads);
+        input_scale, input_zp, src_dims, output_scale, output_zero_point, num_threads, sum_scale, sum_zero_point);
     c10::call_once(*cache_initialized_flag, [&](){
         DeconvParams params;
         ideep::convolution_transpose_forward::prepare(
@@ -1279,7 +1340,8 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
             dnnl::prop_kind::forward_inference,
             ideep::u8s8, ideep::engine::cpu_engine());
         get_deconv_cache() = DeconvPrimitiveCache(cache_key, params, b);
-        weights = weights.reorder_if_differ_in(params.pd.weights_desc());
+        auto expected_weight_desc = ideep::tensor::desc(params.pd.weights_desc(), groups());
+        weights = weights.reorder_if_differ_in(expected_weight_desc);
     });
     if (get_deconv_cache().hit(cache_key)) {
       DeconvParams& params = get_deconv_cache().get_params();
@@ -1299,7 +1361,7 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
     }
   } else {  // not transposed
     PrimitiveCacheKey cache_key = std::make_tuple(
-        input_scale, input_zp, src_dims, output_scale, output_zero_point, num_threads);
+        input_scale, input_zp, src_dims, output_scale, output_zero_point, num_threads, sum_scale, sum_zero_point);
     c10::call_once(*cache_initialized_flag, [&](){
         ConvParams params;
         ideep::convolution_forward::prepare(
@@ -1311,7 +1373,8 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
             dnnl::prop_kind::forward_inference,
             ideep::u8s8, ideep::engine::cpu_engine());
         get_conv_cache() = ConvPrimitiveCache(cache_key, params, b);
-        weights = weights.reorder_if_differ_in(params.pd.weights_desc());
+        auto expected_weight_desc = ideep::tensor::desc(params.pd.weights_desc(), groups());
+        weights = weights.reorder_if_differ_in(expected_weight_desc);
     });
     // If hit, use cached data. If miss, fall back to normal path.
     if (get_conv_cache().hit(cache_key)) {
@@ -1329,7 +1392,15 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
           ideep::u8s8, ideep::engine::cpu_engine());
     }
   }
-  return output;
+  if (has_accum) {
+    // When fused with sum, the accum tensor share the data ptr as dst tensor as the output.
+    // Reset output's scale and zero point into accum_contig.
+    set_quantizer_(accum_contig, at::make_per_tensor_affine_quantizer(
+        output_scale, output_zero_point, accum_contig.scalar_type()));
+    return accum_contig;
+  } else {
+    return output;
+  }
 }
 
 template at::Tensor PackedConvWeightsOnednn<2>::apply(
@@ -1403,6 +1474,34 @@ class QConvInt8 final {
   }
 };
 
+template <int kSpatialDim, bool kReluFused>
+class QConvAddInt8 final {
+ public:
+  static Tensor run(
+      Tensor act,
+      Tensor accum,
+      const c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>>& packed_weight,
+      double output_scale,
+      int64_t output_zero_point) {
+    auto& ctx = at::globalContext();
+#if AT_MKLDNN_ENABLED()
+    if (ctx.qEngine() == at::QEngine::ONEDNN) {
+      if (kReluFused) {
+        return dynamic_cast<PackedConvWeightsOnednn<kSpatialDim>*>(packed_weight.get())->apply_add_relu(
+          act, accum, output_scale, output_zero_point);
+      } else {
+        return dynamic_cast<PackedConvWeightsOnednn<kSpatialDim>*>(packed_weight.get())->apply_add(
+          act, accum, output_scale, output_zero_point);
+      }
+    }
+#endif
+    TORCH_CHECK(
+    false,
+    "Didn't find engine for operation quantized::conv2d_add.",
+    toString(ctx.qEngine()));
+  }
+};
+
 template <bool kReluFused>
 class QConv1dInt8 final {
  public:
@@ -1458,6 +1557,8 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
   m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_relu"),     QConv1dInt8<true>::run);
   m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d.new"),      QConvInt8<2, false>::run);
   m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_relu.new"), QConvInt8<2, true>::run);
+  m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_add"),      QConvAddInt8<2, false>::run);
+  m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_add_relu"), QConvAddInt8<2, true>::run);
   m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d.new"),      QConvInt8<3, false>::run);
   m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_relu.new"), QConvInt8<3, true>::run);
   // for backward compatibility
diff --git a/aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp
index 26a2855a0fbb..732e0ccd18bd 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp
@@ -163,7 +163,7 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_dynamic(
       input, q_params.scale, q_params.zero_point, c10::kQUInt8);
 
   at::Tensor out =
-      apply_impl<false>(q_input, q_params.scale, q_params.zero_point);
+      apply_impl<false>(q_input, /*accum*/c10::nullopt, q_params.scale, q_params.zero_point);
 
   // TODO: Modify ideep to allow fp32 input & output
   // to avoid explicit `quantize - dequantize`
diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
index 8c912ca17456..e2703bb93fb4 100644
--- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
@@ -253,7 +253,7 @@ at::Tensor& embedding_bag_nbit_impl(
     } else {
       shape_arr[0] = output_size;
       shape_arr[1] = D;
-      shape = c10::IntArrayRef(shape_arr.data(), 2);
+      shape = c10::IntArrayRef(&shape_arr[0], 2);
     }
     at::native::resize_(output, shape, c10::nullopt);
   }
@@ -423,7 +423,7 @@ at::Tensor& embedding_bag_byte_impl(
     } else {
       shape_arr[0] = output_size;
       shape_arr[1] = D;
-      shape = c10::IntArrayRef(shape_arr.data(), 2);
+      shape = c10::IntArrayRef(&shape_arr[0], 2);
     }
     at::native::resize_(output, shape, c10::nullopt);
   }
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
index 271f27f81ff6..ed33665623e3 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
@@ -840,7 +840,7 @@ at::Tensor PackedLinearWeightsOnednn::apply_impl(
   // and won't be updated afterwards.
   int num_threads = at::get_num_threads();
   PrimitiveCacheKey cache_key = std::make_tuple(
-      input_scale, input_zero_point, input_dims, output_scale, output_zero_point, num_threads);
+      input_scale, input_zero_point, input_dims, output_scale, output_zero_point, num_threads, /*accum scale*/1.0, /*accum zero point*/0);
   c10::call_once(*cache_initialized_flag, [&](){
       LinearParams params;
       ideep::matmul_forward::prepare</*is_dynamic=*/false>(
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
index 3325b1b8314b..f871877073a7 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
@@ -563,7 +563,7 @@ at::Tensor PackedLinearWeightsOnednn::apply_dynamic_impl(
   // and won't be updated afterwards.
   int num_threads = at::get_num_threads();
   PrimitiveCacheKey cache_key = std::make_tuple(
-      q_params.scale, q_params.zero_point, input_dims, 1.0, 0, num_threads);
+      q_params.scale, q_params.zero_point, input_dims, 1.0, 0, num_threads, /*accum scale*/1.0, /*accum zero point*/0);
   c10::call_once(*cache_initialized_flag, [&](){
       LinearParams params;
       ideep::matmul_forward::prepare</*is_dynamic=*/true>(
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt b/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt
index 8b5b82453a95..fd6b7ff551db 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt
@@ -174,6 +174,7 @@ set(PYTORCH_QNNPACK_EXEC_SRCS
   src/conv-run.cc
   src/deconv-run.cc
   src/fc-run.cc
+  src/fc-unpack.cc
   src/fc-dynamic-run.cc
   src/indirection.c
   src/operator-run.c)
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl b/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl
index f981cce9726d..7b5baff68a58 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl
@@ -261,6 +261,7 @@ def define_qnnpack(third_party, labels = []):
             "src/fc-dynamic-run.cc",
             "src/fc-prepack.cc",
             "src/fc-run.cc",
+            "src/fc-unpack.cc",
             "src/fully-connected.c",
             "src/fully-connected-sparse.c",
             "src/global-average-pooling.c",
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/include/qnnpack_func.h b/aten/src/ATen/native/quantized/cpu/qnnpack/include/qnnpack_func.h
index 23ebbae25e22..eeadbaf91181 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/include/qnnpack_func.h
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/include/qnnpack_func.h
@@ -66,6 +66,11 @@ class PackBMatrix final {
     return packed_weights_;
   }
 
+  uint8_t* unpackWeights(
+      const uint8_t* kernel_zero_points,
+      int n_elements
+    ) const;
+
   size_t getInputChannels() const
   {
     return input_channels_;
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/fc-prepack.cc b/aten/src/ATen/native/quantized/cpu/qnnpack/src/fc-prepack.cc
index c77263049701..2b2922d2bf37 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/fc-prepack.cc
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/fc-prepack.cc
@@ -37,7 +37,7 @@ PackBMatrix::PackBMatrix(
   output_channels_ = output_channels;
   packed_weights_ =
       malloc(n_stride * (k_stride * sizeof(uint8_t) + sizeof(int32_t)));
-  if (packed_weights_ == NULL) {
+  if (packed_weights_ == nullptr) {
     pytorch_qnnp_log_error(
         "failed to allocate %zu bytes for packed weights",
         n_stride * (k_stride * sizeof(uint8_t) + sizeof(int32_t)));
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/fc-unpack.cc b/aten/src/ATen/native/quantized/cpu/qnnpack/src/fc-unpack.cc
new file mode 100644
index 000000000000..d142567b90ef
--- /dev/null
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/fc-unpack.cc
@@ -0,0 +1,73 @@
+#include <pytorch_qnnpack.h>
+#include <qnnpack/log.h>
+#include <qnnpack/pack.h>
+#include <qnnpack_func.h>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+
+namespace qnnpack {
+// For runtime quantization unpacking.
+uint8_t* PackBMatrix::unpackWeights(
+  const uint8_t* kernel_zero_points,
+  int n_elements
+) const {
+  union {
+    void* const as_void_ptr;
+    uint8_t* as_uint8_ptr;
+    int32_t* as_int32_ptr;
+  } packed = {packed_weights_};
+
+  uint8_t* kernel = (uint8_t*)malloc(n_elements * sizeof(uint8_t));;
+
+  // C = A * B
+  // A = M*K
+  // B = K*N
+  const uint32_t nr = pytorch_qnnp_params.q8conv.nr;
+  const uint32_t kr = pytorch_qnnp_params.q8conv.kr;
+
+  // Convert prepacked weight to original weight / bias.
+  for (size_t nr_block_start = 0; nr_block_start < output_channels_; nr_block_start += nr) {
+    const size_t nr_block_size = min(output_channels_ - nr_block_start, nr);
+    for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size;
+         nr_block_offset++) {
+      packed.as_int32_ptr++;
+    }
+    packed.as_int32_ptr += (nr - nr_block_size);
+    for (size_t kr_block_start = 0; kr_block_start < input_channels_; kr_block_start += kr) {
+      const size_t kr_block_size = min(input_channels_ - kr_block_start, kr);
+      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size;
+           nr_block_offset++) {
+        for (size_t kr_block_offset = 0; kr_block_offset < kr_block_size;
+             kr_block_offset++) {
+          kernel[(nr_block_start + nr_block_offset) * input_channels_ +
+          (kr_block_start + kr_block_offset)] = *(packed.as_uint8_ptr++);
+        }
+        if (kernel_zero_points != 0) {
+          for (size_t kr_block_offset = 0; kr_block_offset < (kr - kr_block_size);
+               kr_block_offset++) {
+            packed.as_uint8_ptr++;
+          }
+        } else {
+          packed.as_uint8_ptr += (kr - kr_block_size);
+        }
+      }
+      if (kernel_zero_points != 0) {
+        size_t remaining_nr_blocks = ((nr - nr_block_size) & (nr - 1));
+        for (size_t nr_block_offset = 0; nr_block_offset < remaining_nr_blocks;
+             nr_block_offset++) {
+          for (size_t kr_block_offset = 0; kr_block_offset < kr;
+               kr_block_offset++) {
+            packed.as_uint8_ptr++;
+          }
+        }
+      } else {
+        packed.as_uint8_ptr += ((nr - nr_block_size) & (nr - 1)) * kr;
+      }
+    }
+  }
+
+  return kernel;
+}
+
+} // namespace qnnpack
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-sse2-per-channel.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-sse2-per-channel.c
index bdc7e2ce2082..25244a2e4df7 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-sse2-per-channel.c
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-sse2-per-channel.c
@@ -418,7 +418,6 @@ void pytorch_q8dwconv_ukernel_mp8x25_per_channel__sse2(
         _mm_storeu_si128((__m128i*)outacc, vacc_lo);
         outacc += 4;
         _mm_storeu_si128((__m128i*)outacc, vacc_hi);
-        outacc += 4;
       }
     }
     {
@@ -806,7 +805,6 @@ void pytorch_q8dwconv_ukernel_mp8x25_per_channel__sse2(
         _mm_storeu_si128((__m128i*)outacc, vacc_lo);
         outacc += 4;
         _mm_storeu_si128((__m128i*)outacc, vacc_hi);
-        outacc += 4;
       }
     }
     {
@@ -1043,7 +1041,6 @@ void pytorch_q8dwconv_ukernel_mp8x25_per_channel__sse2(
         vacc_lo = _mm_add_epi32(vacc_lo, _mm_loadu_si128((__m128i*)outacc));
         vacc_hi =
             _mm_add_epi32(vacc_hi, _mm_loadu_si128((__m128i*)(outacc + 4)));
-        outacc += 8;
 
         const __m128 vmultiplier_lo =
             _mm_loadu_ps(&quantization_params->sse2.requantization_scales[channels - c]);
diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp
index 45453839e10c..793b179fafe3 100644
--- a/aten/src/ATen/native/quantized/library.cpp
+++ b/aten/src/ATen/native/quantized/library.cpp
@@ -65,6 +65,8 @@ TORCH_LIBRARY(quantized, m) {
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv1d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d.new(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_relu.new(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_add(Tensor qx, Tensor qaccum, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_add_relu(Tensor qx, Tensor qaccum, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d.new(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_relu.new(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor"));
diff --git a/aten/src/ATen/native/sparse/SoftMax.cpp b/aten/src/ATen/native/sparse/SoftMax.cpp
index 3637fa7e5b5c..b33ba2818890 100644
--- a/aten/src/ATen/native/sparse/SoftMax.cpp
+++ b/aten/src/ATen/native/sparse/SoftMax.cpp
@@ -337,7 +337,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
         auto pool_indices = pools[p];
 
         // Skip empty pools
-        if (pool_indices.size() == 0)
+        if (pool_indices.empty())
           continue;
 
         /* Prepare scratch space */
@@ -478,7 +478,7 @@ void cpu_sparse_coo_softmax_backward(const Tensor& grad_input, const Tensor& gra
         auto pool_indices = pools[p];
 
         // Skip empty pools
-        if (pool_indices.size() == 0)
+        if (pool_indices.empty())
           continue;
 
         std::vector<scalar_t> tmp_row(nvalues, 0);
diff --git a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h
index 4fbdabce7157..9b2a8be7ef9a 100644
--- a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h
+++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h
@@ -13,6 +13,7 @@
 #else
 #include <ATen/ops/arange.h>
 #include <ATen/ops/empty.h>
+#include <ATen/ops/ones_like.h>
 #include <ATen/ops/_sparse_coo_tensor_with_dims_and_tensors.h>
 #include <ATen/ops/from_blob.h>
 #include <ATen/ops/result_type.h>
@@ -97,7 +98,8 @@ TensorIterator make_value_selection_intersection_iter(
     const Tensor& lhs_values,
     const Tensor& lhs_select_idx,
     const Tensor& rhs_values,
-    const Tensor& rhs_select_idx) {
+    const Tensor& rhs_select_idx,
+    const c10::optional<Tensor>& match_mask_opt = c10::nullopt) {
   const auto res_values_sizes = [&]() -> std::vector<int64_t> {
     auto sizes = infer_size(
         // keep nnz dim
@@ -126,6 +128,14 @@ TensorIterator make_value_selection_intersection_iter(
     return values.as_strided(values_sizes, values_strides);
   };
 
+  const auto match_mask = [&match_mask_opt, &lhs_select_idx]() -> Tensor {
+    if (match_mask_opt.has_value()) {
+      return *match_mask_opt;
+    } else {
+      return at::ones_like(lhs_select_idx);
+    }
+  }();
+
   auto iter = TensorIteratorConfig()
     .set_check_mem_overlap(false)
     .check_all_same_dtype(false)
@@ -135,6 +145,7 @@ TensorIterator make_value_selection_intersection_iter(
     .add_owned_input(restride_idx(lhs_select_idx))
     .add_owned_input(restride_values(rhs_values))
     .add_owned_input(restride_idx(rhs_select_idx))
+    .add_owned_input(restride_idx(match_mask))
     .build();
 
   return iter;
@@ -151,6 +162,7 @@ void _sparse_binary_op_intersection_kernel_impl(
     const Tensor& x_,
     const Tensor& y_,
     const std::vector<int64_t> broadcasted_shape,
+    const bool restrict_indices_to_rhs = false,
     const bool commutes_with_sum = true
 ) {
   // The common dtype check is relevant when op is done in-place.
@@ -164,8 +176,32 @@ void _sparse_binary_op_intersection_kernel_impl(
 
   using KernelLauncher = KernelLauncher<kernel_t>;
 
-  const Tensor x = commutes_with_sum ? x_ : x_.coalesce();
-  const Tensor y = commutes_with_sum ? y_ : y_.coalesce();
+  // If the op and sum are not commutative, coalesce is required.
+  // If restrict_indices_to_rhs is true, x needs to be coalesced so that
+  // (x.coalesce() intersection y union y).indices().counts() == y.indices().counts().
+  const Tensor x = (!commutes_with_sum || restrict_indices_to_rhs) ? x_.coalesce() : x_;
+  const Tensor y = [&]() -> Tensor {
+    auto rhs = commutes_with_sum ? y_ : y_.coalesce();
+    if (restrict_indices_to_rhs) {
+      // x is coalesced and y is marked as uncoalesced so that the intersection result
+      // respects the order of indices in y.
+      if (!rhs.is_same(y_)) {
+        // Safe to modify in-place, no side effects for y.
+        return rhs._coalesced_(false);
+      } else {
+        // No copy-constructor for sparse, hence a temporary sparse tensor is created
+        // with the fields taken from y. Ensures no side effects for y.
+        auto rhs_copy = at::empty({0}, rhs.options());
+        auto* rhs_copy_sparse_impl = get_sparse_impl(rhs_copy);
+        rhs_copy_sparse_impl->raw_resize_(rhs.sparse_dim(), rhs.dense_dim(), rhs.sizes());
+        rhs_copy_sparse_impl->set_indices_and_values_unsafe(rhs._indices(), rhs._values());
+        rhs_copy_sparse_impl->set_nnz_and_narrow(rhs._nnz());
+        rhs_copy._coalesced_(false);
+        return rhs_copy;
+      }
+    }
+    return rhs;
+  }();
 
   // Given sparse tensors x and y we decide which one is source, and which one
   // is probably_coalesced. The indices of both source and probably_coalesced are
@@ -391,6 +427,28 @@ void _sparse_binary_op_intersection_kernel_impl(
     return std::make_tuple(intersection_count, intersection_first_idx);
   }();
 
+  // Intersection is all we need in such a case.
+  if (restrict_indices_to_rhs) {
+    const auto res_indices = source._indices().clone();
+    const auto res_values = value_selection_intersection_kernel_t::apply(
+        probably_coalesced._values(),
+        intersection_first_idx.to(nnz_arange.scalar_type()),
+        source._values(),
+        nnz_arange.narrow(-1, 0, source._nnz()),
+        intersection_count.ge(1));
+    const auto res_sparse_dim = source.sparse_dim();
+    const auto res_dense_dim = source.dense_dim();
+    const auto& res_shape = broadcasted_shape;
+    const auto res_nnz = source._nnz();
+
+    auto* res_sparse_impl = get_sparse_impl(res);
+    res_sparse_impl->raw_resize_(res_sparse_dim, res_dense_dim, res_shape);
+    res_sparse_impl->set_indices_and_values_unsafe(res_indices, res_values);
+    res_sparse_impl->set_nnz_and_narrow(res_nnz);
+    res._coalesced_(y_.is_coalesced() || !commutes_with_sum);
+    return;
+  }
+
   // Using intersection_count and intersection_first_idx,
   // form indices selected_source and selected_probably_coalesced such that
   // res.values = op(
@@ -537,6 +595,14 @@ void _sparse_binary_op_intersection_kernel_out(
     Tensor& res,
     const Tensor& x,
     const Tensor& y,
+    // If true, the result's indices are the same as that of the rhs'.
+    // This behavior is useful when implementing operations
+    // with the symantics similar to that of sparse_mask,
+    // and it also requires less kernel calls compared to
+    // a generic intersection.
+    const bool restrict_indices_to_rhs = false,
+    // If op commutes with the sum, the arguments are processed as is,
+    // without the calls to coalesce().
     const bool commutes_with_sum = true
 ) {
   TORCH_CHECK(
@@ -576,7 +642,7 @@ void _sparse_binary_op_intersection_kernel_out(
       using hash_t = index_t1;
       using offset_t = index_t0;
       _sparse_binary_op_intersection_kernel_impl<kernel_t, value_selection_intersection_kernel_t, index_t, hash_t, offset_t>(
-          res, x, y, broadcasted_shape, commutes_with_sum);
+          res, x, y, broadcasted_shape, restrict_indices_to_rhs, commutes_with_sum);
   });
 }
 
diff --git a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp
index 4457f20415a6..32bb075da504 100644
--- a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp
+++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionKernel.cpp
@@ -28,18 +28,27 @@ bool MulOp::apply(bool a, bool b) {
   return a && b;
 }
 
+struct LhsProjOp {
+  template <typename scalar_t>
+  static scalar_t apply(scalar_t a, scalar_t b) {
+    return a;
+  }
+};
+
 template <typename binary_op_t>
 struct CPUValueSelectionIntersectionKernel {
   static Tensor apply(
       const Tensor& lhs_values,
       const Tensor& lhs_select_idx,
       const Tensor& rhs_values,
-      const Tensor& rhs_select_idx) {
+      const Tensor& rhs_select_idx,
+      const c10::optional<Tensor>& match_mask = c10::nullopt) {
     auto iter = make_value_selection_intersection_iter(
         lhs_values,
         lhs_select_idx,
         rhs_values,
-        rhs_select_idx);
+        rhs_select_idx,
+        match_mask);
     auto res_values = iter.tensor(0);
 
     auto lhs_nnz_stride = lhs_values.stride(0);
@@ -56,6 +65,7 @@ struct CPUValueSelectionIntersectionKernel {
                   const auto* ptr_lhs_select_idx_bytes = data[2];
                   const auto* ptr_rhs_values_bytes = data[3];
                   const auto* ptr_rhs_select_idx_bytes = data[4];
+                  const auto* ptr_match_bytes = data[5];
 
                   for (int64_t i = 0; i < n; ++i) {
                     // Exctract data
@@ -64,11 +74,16 @@ struct CPUValueSelectionIntersectionKernel {
                     const auto lhs_nnz_idx = *reinterpret_cast<const index_t*>(ptr_lhs_select_idx_bytes);
                     const auto* ptr_rhs_values = reinterpret_cast<const scalar_t*>(ptr_rhs_values_bytes);
                     const auto rhs_nnz_idx = *reinterpret_cast<const index_t*>(ptr_rhs_select_idx_bytes);
+                    const auto match = *reinterpret_cast<const bool*>(ptr_match_bytes);
 
                     // Apply op
-                    *ptr_res_values = binary_op_t::apply(
-                        *(ptr_lhs_values + lhs_nnz_idx * lhs_nnz_stride),
-                        *(ptr_rhs_values + rhs_nnz_idx * rhs_nnz_stride));
+                    if (match) {
+                      *ptr_res_values = binary_op_t::apply(
+                          *(ptr_lhs_values + lhs_nnz_idx * lhs_nnz_stride),
+                          *(ptr_rhs_values + rhs_nnz_idx * rhs_nnz_stride));
+                    } else {
+                      *ptr_res_values = 0;
+                    }
 
                     // Advance
                     ptr_res_values_bytes += strides[0];
@@ -76,6 +91,7 @@ struct CPUValueSelectionIntersectionKernel {
                     ptr_lhs_select_idx_bytes += strides[2];
                     ptr_rhs_values_bytes += strides[3];
                     ptr_rhs_select_idx_bytes += strides[4];
+                    ptr_match_bytes += strides[5];
                   }
                 };
                 iter.for_each(loop, at::internal::GRAIN_SIZE);
@@ -96,6 +112,16 @@ void mul_sparse_sparse_out_cpu_kernel(
   );
 }
 
+void sparse_mask_intersection_out_cpu_kernel(
+    Tensor& result,
+    const Tensor& x,
+    const Tensor& y) {
+  using CPUValueLhsProjKernel = CPUValueSelectionIntersectionKernel<LhsProjOp>;
+  _sparse_binary_op_intersection_kernel_out<CPUKernelLauncher, CPUValueLhsProjKernel>(
+      result, x, y, true
+  );
+}
+
 }
 
 REGISTER_ARCH_DISPATCH(mul_sparse_sparse_out_stub, DEFAULT, &mul_sparse_sparse_out_cpu_kernel);
@@ -104,4 +130,9 @@ REGISTER_AVX2_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_ke
 REGISTER_VSX_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel);
 REGISTER_ZVECTOR_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel);
 
+REGISTER_ARCH_DISPATCH(sparse_mask_intersection_out_stub, DEFAULT, &sparse_mask_intersection_out_cpu_kernel);
+REGISTER_AVX512_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel);
+REGISTER_AVX2_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel);
+REGISTER_VSX_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel);
+REGISTER_ZVECTOR_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel);
 }}
diff --git a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp
index 3d2526c41204..afbe006dd744 100644
--- a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp
+++ b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp
@@ -693,6 +693,22 @@ Tensor row_indices_sparse_csr(const Tensor& self) {
                                                    [&]{ return get_sparse_csr_impl(self)->plain_indices().alias(); });
 }
 
+Tensor crow_indices_default(const Tensor& self) {
+  TORCH_CHECK(false, "crow_indices expected sparse row compressed tensor layout but got ", self.layout());
+}
+
+Tensor col_indices_default(const Tensor& self) {
+  TORCH_CHECK(false, "col_indices expected sparse row compressed tensor layout but got ", self.layout());
+}
+
+Tensor ccol_indices_default(const Tensor& self) {
+  TORCH_CHECK(false, "ccol_indices expected sparse column compressed tensor layout but got ", self.layout());
+}
+
+Tensor row_indices_default(const Tensor& self) {
+  TORCH_CHECK(false, "row_indices expected sparse column compressed tensor layout but got ", self.layout());
+}
+
 int64_t sparse_dim_sparse_csr(const SparseCsrTensor& self) {
   return get_sparse_csr_impl(self)->sparse_dim();
 }
diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
index efa692665d4c..c59bcf6cdc03 100644
--- a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
+++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
@@ -20,6 +20,7 @@
 #include <ATen/Operators.h>
 #else
 #include <ATen/ops/_conj_physical_native.h>
+#include <ATen/ops/_convert_indices_from_coo_to_csr.h>
 #include <ATen/ops/_convert_indices_from_coo_to_csr_native.h>
 #include <ATen/ops/_convert_indices_from_csr_to_coo.h>
 #include <ATen/ops/_convert_indices_from_csr_to_coo_native.h>
@@ -51,6 +52,7 @@
 #include <ATen/ops/deg2rad.h>
 #include <ATen/ops/deg2rad_native.h>
 #include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
 #include <ATen/ops/erf.h>
 #include <ATen/ops/erf_native.h>
 #include <ATen/ops/erfinv.h>
@@ -932,6 +934,11 @@ Tensor& add_out_sparse_csr_cpu(
         self.sizes(),
         " and tensor `other` with shape ",
         other.sizes());
+
+    if (only_sparse_compressed_add_trivial_cases(self, other, alpha, out)) {
+      return out;
+    }
+
     at::native::resize_as_sparse_compressed_(out, self);
     sparse::impl::cpu::add_out_sparse_csr(self, other, alpha, out);
   }
@@ -1227,7 +1234,7 @@ Tensor reduce_sparse_csr_cpu_template(const Tensor& sparse, std::vector<int64_t>
     TORCH_INTERNAL_ASSERT(((dims[0] == 0 && dims[1] == 1) || (dims[0] == 1 && dims[1] == 0)));
     return reduce_sparse_csr_dim01_cpu_template<scalar_t>(sparse, rop);
   }
-  TORCH_INTERNAL_ASSERT(dims.size() == 0);
+  TORCH_INTERNAL_ASSERT(dims.empty());
   // effective after gh-29137 has been resolved
   return sparse.clone();
 }
@@ -1242,7 +1249,7 @@ Tensor reduce_sparse_csr_cpu_template(const Tensor& sparse, IntArrayRef dims_to_
   TORCH_INTERNAL_ASSERT(input_dim == 2);
   auto dims = dims_to_sum.vec();
   maybe_wrap_dims(dims, input_dim);
-  if (dims.size() == 0) {
+  if (dims.empty()) {
     // after gh-29137 is resolved, delete this if-block
     dims.emplace_back(0);
     dims.emplace_back(1);
@@ -1292,5 +1299,134 @@ Tensor _sparse_csr_prod_cpu(const Tensor& input, IntArrayRef dims_to_reduce, boo
   return result;
 }
 
+std::tuple<Tensor, Tensor> _sparse_mm_reduce_impl_sparse_csr_cpu(
+    const Tensor& self,
+    const Tensor& other,
+    const c10::string_view reduce) {
+
+  auto layout = self.layout();
+  TORCH_CHECK(layout == kSparseCsr,
+      "sparse_mm_reduce: expect self to be SparseCsr, got ", layout);
+  TORCH_CHECK(self.dense_dim() == 0,
+      "sparse_mm_reduce: expected non-hybrid self tensor.");
+  TORCH_CHECK(self.dim() == 2,
+      "sparse_mm_reduce: expected self to be a 2-D tensor, got ", self.dim(), "-D tensor.");
+
+  sparse::impl::check_sparse_mm_reduce_impl_inputs</*train*/false>(
+      self, Tensor(), other);
+
+  auto op = get_reduction_enum(reduce);
+  TORCH_CHECK(op != ReductionType::PROD, "sparse_mm_reduce: reduce type of prod has not been enabled.")
+
+  auto crow = self.crow_indices();
+  auto col = self.col_indices();
+  auto val = self.values();
+
+  // init output to be all zeros, for `rows` that has no nonzero elements,
+  // the corresponding rows in the output will be zero.
+  auto out = at::zeros({self.size(0), other.size(1)}, other.options());
+  auto arg_out = at::empty({0}, col.options());
+
+  int64_t nnz = self._nnz();
+  if (nnz == 0) {
+    return std::make_tuple(out, arg_out);
+  }
+
+  // only need to calculate the out args
+  // for reduce type "amax" and "amin" for training
+  bool need_arg_out = at::GradMode::is_enabled()
+      && (self.requires_grad() || other.requires_grad())
+      && (op == ReductionType::MAX || op == ReductionType::MIN);
+
+  if (!need_arg_out) {
+    spmm_reduce_stub(kCPU, out, crow, col, val, other, op);
+  } else {
+    // allocate memory and init with invalid index
+    arg_out.resize_(out.sizes());
+    arg_out.fill_(nnz);
+    spmm_reduce_arg_stub(kCPU, out, arg_out, crow, col, val, other, op);
+  }
+
+  return std::make_tuple(std::move(out), std::move(arg_out));
+}
+
+std::tuple<Tensor, Tensor> _sparse_mm_reduce_impl_backward_sparse_csr_cpu(
+    const Tensor& self,
+    const Tensor& grad_out,
+    const Tensor& other,
+    const c10::string_view reduce,
+    const Tensor& arg_out,
+    std::array<bool, 2> output_mask) {
+
+  auto layout = self.layout();
+  TORCH_CHECK(layout == kSparseCsr,
+      "sparse_mm_reduce: expect self to be SparseCsr, got ", layout);
+
+  sparse::impl::check_sparse_mm_reduce_impl_inputs</*train*/true>(
+      self, grad_out, other);
+
+  auto op = get_reduction_enum(reduce);
+
+  auto crow = self.crow_indices();
+  auto col = self.col_indices();
+  auto val = self.values();
+
+  // `row`: row indices of COO format
+  // `ccol`: ccol indices of CSC format (with permute)
+  // `permute`: permute pattern from CSR to CSC
+  //
+  // TODO: optimize the following section,
+  // currently `argsort` is sequential.
+  Tensor row, ccol, permute;
+  {
+    bool out_int32 = crow.scalar_type() == ScalarType::Int;
+    Tensor coo_indices = at::_convert_indices_from_csr_to_coo(
+        crow,
+        col,
+        out_int32,
+        /*transpose*/false);
+    row = coo_indices.select(0, 0);
+
+    // calculte the global index for CSC
+    // and get the conversion permute pattern
+    Tensor index = col.mul(self.size(0)).add_(row);
+    permute = index.argsort();
+
+    ccol = at::_convert_indices_from_coo_to_csr(
+        /*column indices*/col.index_select(0, permute),
+        /*column count*/self.size(1),
+        out_int32);
+  }
+
+  Tensor grad_self, grad_other;
+  if (output_mask[0]) {
+    // grad_input has the same indices and nnz with input
+    grad_self = at::empty_like(self);
+    grad_self.values().zero_();
+    if (op == ReductionType::MAX || op == ReductionType::MIN) {
+      spmm_reduce_backward_input_arg_stub(kCPU, grad_self, grad_out, col, other, arg_out, op);
+    } else {
+      spmm_reduce_backward_input_stub(kCPU, grad_self, grad_out, crow, col, other, row, op);
+    }
+  }
+  if (output_mask[1]) {
+    grad_other = at::zeros(other.sizes(), other.options());
+    if (op == ReductionType::MAX || op == ReductionType::MIN) {
+      spmm_reduce_backward_other_arg_stub(kCPU, grad_other, grad_out, col, val, arg_out, op);
+    } else {
+      spmm_reduce_backward_other_stub(kCPU, grad_other, grad_out, crow, val, row, ccol, permute, op);
+    }
+  }
+
+  return std::make_tuple(std::move(grad_self), std::move(grad_other));
+}
+
+DEFINE_DISPATCH(spmm_reduce_stub);
+DEFINE_DISPATCH(spmm_reduce_arg_stub);
+DEFINE_DISPATCH(spmm_reduce_backward_input_stub);
+DEFINE_DISPATCH(spmm_reduce_backward_input_arg_stub);
+DEFINE_DISPATCH(spmm_reduce_backward_other_stub);
+DEFINE_DISPATCH(spmm_reduce_backward_other_arg_stub);
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.h b/aten/src/ATen/native/sparse/SparseCsrTensorMath.h
index a92added5f01..d954c8960a23 100644
--- a/aten/src/ATen/native/sparse/SparseCsrTensorMath.h
+++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.h
@@ -2,6 +2,9 @@
 
 #include <ATen/Tensor.h>
 #include <ATen/core/Scalar.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/native/ReductionType.h>
+#include <ATen/native/cpu/SpmmReduceKernel.h>
 
 namespace at {
 namespace native {
@@ -59,6 +62,28 @@ inline void _check_dim(const Tensor& self, int64_t target_dim, c10::string_view
       " instead.");
 }
 
+template <bool train>
+inline void check_sparse_mm_reduce_impl_inputs(
+    const Tensor& self,
+    const Tensor& grad_out,
+    const Tensor& other) {
+  TORCH_INTERNAL_ASSERT(self.is_sparse_csr());
+
+  const auto input_scalar_type = self.values().scalar_type();
+  CheckedFrom c = train ? "sparse_mm_reduce_backward" : "sparse_mm_reduce";
+  if (train) {
+    checkLayout(c, grad_out, kStrided);
+    checkScalarType(c, {grad_out, "grad_out", 1}, input_scalar_type);
+    check_dim_size(grad_out, 2, 0, self.size(0));
+    check_dim_size(grad_out, 2, 1, other.size(1));
+  }
+
+  int pos = train ? 2 : 1;
+  checkLayout(c, other, kStrided);
+  checkScalarType(c, {other, "other", pos}, input_scalar_type);
+  check_dim_size(other, 2, 0, self.size(1));
+}
+
 }
 }
 }
diff --git a/aten/src/ATen/native/sparse/SparseStubs.h b/aten/src/ATen/native/sparse/SparseStubs.h
index 89eda9d05b39..0442f3855206 100644
--- a/aten/src/ATen/native/sparse/SparseStubs.h
+++ b/aten/src/ATen/native/sparse/SparseStubs.h
@@ -11,6 +11,9 @@ namespace native {
 using mul_sparse_sparse_out_fn = void (*)(Tensor& res, const Tensor& x, const Tensor& y);
 DECLARE_DISPATCH(mul_sparse_sparse_out_fn, mul_sparse_sparse_out_stub);
 
+using sparse_mask_intersection_out_fn = void (*)(Tensor& res, const Tensor& x, const Tensor& y);
+DECLARE_DISPATCH(sparse_mask_intersection_out_fn, sparse_mask_intersection_out_stub);
+
 }
 
 }
diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp
index d24068c0a05c..6bb912408838 100644
--- a/aten/src/ATen/native/sparse/SparseTensor.cpp
+++ b/aten/src/ATen/native/sparse/SparseTensor.cpp
@@ -8,6 +8,7 @@
 #include <ATen/Parallel.h>
 #include <ATen/SparseTensorImpl.h>
 #include <ATen/SparseTensorUtils.h>
+#include <ATen/native/sparse/SparseStubs.h>
 #include <ATen/native/IndexingUtils.h>
 #include <ATen/native/NonSymbolicBC.h>
 #include <ATen/NamedTensorUtils.h>
@@ -34,7 +35,6 @@
 #include <ATen/ops/_sparse_coo_tensor_with_dims_and_tensors.h>
 #include <ATen/ops/_sparse_coo_tensor_with_dims_and_tensors_native.h>
 #include <ATen/ops/_sparse_coo_tensor_with_dims_native.h>
-#include <ATen/ops/_sparse_mask_helper_native.h>
 #include <ATen/ops/_validate_sparse_coo_tensor_args_native.h>
 #include <ATen/ops/_values_native.h>
 #include <ATen/ops/clone_native.h>
@@ -86,6 +86,11 @@ bool is_coalesced_sparse(const SparseTensor& self) {
   return get_sparse_impl(self)->coalesced();
 }
 
+bool is_coalesced_default(const Tensor& self) {
+  TORCH_CHECK(false, "is_coalesced expected sparse coordinate tensor layout but got ", self.layout());
+  return false;
+}
+
 int64_t _nnz_sparse(const SparseTensor& self) {
   return get_sparse_impl(self)->nnz();
 }
@@ -114,6 +119,10 @@ Tensor indices_sparse(const Tensor& self) {
   return get_sparse_impl(self)->indices().alias();
 }
 
+Tensor indices_default(const Tensor& self) {
+  TORCH_CHECK(false, "indices expected sparse coordinate tensor layout but got ", self.layout());
+}
+
 Tensor values_sparse(const Tensor& self) {
   TORCH_CHECK(
       self.is_coalesced(),
@@ -121,6 +130,10 @@ Tensor values_sparse(const Tensor& self) {
   return get_sparse_impl(self)->values().alias();
 }
 
+Tensor values_default(const Tensor& self) {
+  TORCH_CHECK(false, "values expected sparse tensor layout but got ", self.layout());
+}
+
 /******************************************************************************
  * creation methods
  * See NOTE [ Sparse: autograd and API ] for details
@@ -632,6 +645,7 @@ SparseTensor& copy_sparse_(
 }
 
 SparseTensor coalesce(const SparseTensor& self) {
+  TORCH_CHECK(self.layout() == kSparse, "coalesce expected sparse coordinate tensor layout but got ", self.layout());
   // See NOTE: [ coalesce autograd ]
   if (self.is_coalesced()) {
     return self;
@@ -729,6 +743,8 @@ SparseTensor _coalesce_sparse_cpu(const SparseTensor& self) {
   return dst;
 }
 
+DEFINE_DISPATCH(sparse_mask_intersection_out_stub);
+
 SparseTensor sparse_mask(const Tensor& t, const SparseTensor& mask) {
   TORCH_CHECK(
       mask.sizes().equals(t.sizes()),
@@ -741,6 +757,17 @@ SparseTensor sparse_mask(const Tensor& t, const SparseTensor& mask) {
     return mask.clone().to(t.device(), t.scalar_type());
   }
 
+  if (t.layout() == at::kSparse) {
+    TORCH_CHECK(t.sparse_dim() == mask.sparse_dim(),
+                "sparse_mask(): the number of sparse dimensions in `self` ",
+                "should match that of the `mask`. ",
+                "Got `self.sparse_dim() == ", t.sparse_dim(), "` != ",
+                "`mask.sparse_dim() == ", mask.sparse_dim(), "`.");
+    auto res = at::empty({0}, t.options());
+    sparse_mask_intersection_out_stub(res.device().type(), res, t, mask);
+    return res;
+  }
+
   const auto mask_values = mask._values();
   auto mask_template = at::sparse_coo_tensor(
       mask._indices(),
@@ -749,89 +776,6 @@ SparseTensor sparse_mask(const Tensor& t, const SparseTensor& mask) {
   return t.mul(mask_template).to(t.scalar_type());
 }
 
-Tensor sparse_mask_helper_cpu(
-    const SparseTensor& t,
-    const Tensor& mask_indices) {
-  /*
-    This is a helper function which filter values from `t._values()` using the
-    `mask_indices`. This CPU implementation uses a simple hash_map to filter
-    values by matching the `mask_indices` with the indices at tensor input `t`.
-
-    Inputs:
-      `t`             - coalesced sparse tensor input
-      `mask_indices`  - mask indices tensor
-
-    Note: The nnz in the output tensor will be same as the `mask_indices`. So it
-    will works independently if the mask is coalesced or not.
-  */
-  TORCH_CHECK(t.is_sparse(), "t: input is not a sparse tensor");
-  TORCH_CHECK(t.is_coalesced(), "t:  input is uncoalesced");
-  TORCH_CHECK(
-      mask_indices.dim() == t._indices().dim(),
-      "mask_indices: operands have incompatible indices dim; self has dim ",
-      t._indices().dim(),
-      " but mask has dim ",
-      mask_indices.dim());
-  TORCH_CHECK(
-      mask_indices.is_contiguous(), "mask_indices: mask is not contiguous");
-
-  int64_t r_nnz = mask_indices.size(1);
-  auto t_v = t._values();
-  auto vsize = t_v.sizes().vec();
-  vsize[0] = r_nnz;
-
-  Tensor r_values = at::zeros(vsize, t_v.options());
-  auto t_i = t._indices();
-  auto t_nnz = t._nnz();
-
-  std::unordered_map<int64_t, int64_t> t_flatten_indices =
-      std::unordered_map<int64_t, int64_t>{};
-  auto full_size = t.sizes();
-  auto ti_flattened_indices = at::sparse::flatten_indices(t_i, full_size);
-
-  // Step 1: flatten the sparse indices `t._indices()` tensor and then  map this
-  // flatten value `index` to the original position `i`
-  for (const auto i : c10::irange(t_nnz)) {
-    int64_t index = ti_flattened_indices.data_ptr<int64_t>()[i];
-    t_flatten_indices[index] = i;
-  }
-
-  // Step 2: Filter `t._values()` values by matching the flatten `mask_indices`
-  // with the flatten `t._indices()` using the hash_map `t_flatten_indices`
-
-  auto flattened_mask_indices =
-      at::sparse::flatten_indices(mask_indices, full_size);
-
-  const auto copy_iter = TensorIteratorConfig()
-    .add_output(r_values)
-    .add_input(t_v)
-    .resize_outputs(false)
-    .declare_static_shape(r_values.sizes(), /*squash_dims=*/0)
-    .build();
-
-  at::parallel_for(0, r_nnz, 0, [&](int64_t start, int64_t end) {
-    TensorIterator copy_iter_local(copy_iter);
-    const auto r_values_data = reinterpret_cast<char*>(r_values.data_ptr());
-    const auto t_values_data = reinterpret_cast<char*>(t_v.data_ptr());
-    const auto r_values_stride = r_values.strides()[0] * r_values.element_size();
-    const auto t_values_stride = t_v.strides()[0] * t_v.element_size();
-
-    for (const auto i : c10::irange(start, end)) {
-      int64_t index = flattened_mask_indices.data_ptr<int64_t>()[i];
-      auto iter = t_flatten_indices.find(index);
-      if (iter != t_flatten_indices.end()) {
-        // r_values[i].copy_(t_v[iter->second])
-        copy_iter_local.unsafe_replace_operand(
-            0, r_values_data + i * r_values_stride);
-        copy_iter_local.unsafe_replace_operand(
-            1, t_values_data + iter->second * t_values_stride);
-        copy_stub(kCPU, copy_iter_local, /*non_blocking=*/false);
-      }
-    }
-  });
-  return r_values;
-}
-
 Tensor empty_like_sparse_coo(
     const Tensor& self,
     c10::optional<ScalarType> dtype,
diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp
index 52f4d2bad3f5..4df035518ecb 100644
--- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp
+++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp
@@ -31,6 +31,8 @@
 #include <ATen/ops/_sparse_sum_backward_native.h>
 #include <ATen/ops/_sparse_sum_native.h>
 #include <ATen/ops/_sparse_sparse_matmul.h>
+#include <ATen/ops/_sparse_mm_reduce_impl.h>
+#include <ATen/ops/_sparse_mm_reduce_impl_native.h>
 #include <ATen/ops/add.h>
 #include <ATen/ops/add_native.h>
 #include <ATen/ops/addmm.h>
@@ -375,7 +377,7 @@ Tensor norm_sparse(const SparseTensor& self, const Scalar& p) {
 
 Tensor norm_sparse(const SparseTensor& self, const optional<Scalar>& p, IntArrayRef dim, bool keepdim, optional<ScalarType> dtype) {
   AT_ASSERT(self.is_sparse());
-  if (dim.size() > 0) {
+  if (!dim.empty()) {
     // Only full reductions are supported, so check if that is the case
     int64_t ndim = self.dim();
     bool passed_full_reduction_check = static_cast<size_t>(ndim) == dim.size();
@@ -1248,11 +1250,7 @@ Tensor& s_addmm_out_sparse_dense_cpu(
     const SparseTensor& sparse_,
     const Tensor& dense,
     const Scalar& beta,
-    const Scalar& alpha
-) {
-  AT_ASSERT(r.layout() == kStrided, "addmm_sparse_dense: expected strided result tensor, got tensor with layout ", r.layout());
-  AT_ASSERT(sparse_.layout() == kSparse, "addmm_sparse_dense: expected sparse tensor, got tensor with layout ", sparse_.layout());
-
+    const Scalar& alpha) {
   // TODO: This error message seems awfully opaque
   TORCH_CHECK(
       t.is_cpu(),
@@ -1261,15 +1259,30 @@ Tensor& s_addmm_out_sparse_dense_cpu(
   TORCH_CHECK(
       r.is_cpu(),
       "Expected all tensors to be on the same device. addmm: expected 'out' to be CPU tensor, but got tensor on ",
-      t.device());
+      r.device());
   TORCH_CHECK(
       sparse_.is_cpu(),
       "Expected all tensors to be on the same device. addmm: expected 'mat1' to be a CPU tensor, but got tensor on ",
-      t.device());
+      sparse_.device());
   TORCH_CHECK(
       dense.is_cpu(),
       "Expected all tensors to be on the same device. addmm: expected 'mat2' to be a CPU tensor, but got tensor on ",
-      t.device());
+      dense.device());
+
+  TORCH_CHECK(
+      r.layout() == kStrided,
+      "addmm_sparse_dense: expected strided result tensor, got tensor with layout ",
+      r.layout());
+  TORCH_CHECK(
+      t.layout() == kStrided,
+      "addmm_sparse_dense: expected 't' to have strided layout, got tensor with layout ",
+      t.layout());
+  TORCH_CHECK(
+      sparse_.layout() == kSparse && dense.layout() == kStrided,
+      "addmm_sparse_dense: expected either 'mat1' to have sparse layout and 'mat2' to have strided layout, got 'mat1' with layout ",
+      sparse_.layout(),
+      " and 'mat2' with layout ",
+      dense.layout());
 
   TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: matrices expected, got ", sparse_.sparse_dim(), "D tensor");
   TORCH_CHECK(sparse_.dense_dim() == 0, "addmm: scalar values expected, got ", sparse_.dense_dim(), "D values");
@@ -1306,7 +1319,6 @@ Tensor& s_addmm_out_sparse_dense_cpu(
   );
 
   return r;
-
 }
 
 Tensor& addmm_out_sparse_dense_cpu(
@@ -1393,6 +1405,12 @@ SparseTensor& _sparse_mm_out(const SparseTensor& sparse,
   return at::addmm_out(result, t, sparse, dense, 0, 1);  // redispatch!
 }
 
+Tensor _sparse_mm(const Tensor& mat1, const Tensor& mat2, const c10::string_view reduce) {
+  // result: out, arg_out
+  auto result = at::_sparse_mm_reduce_impl(mat1, mat2, reduce);
+  return std::get<0>(result);
+}
+
 // --------------------------------------------------------------------
 // hspmm(SparseTensor mat1, Tensor mat2)
 // --------------------------------------------------------------------
@@ -1658,7 +1676,7 @@ Tensor _sparse_sum(const SparseTensor& input, IntArrayRef dims_to_sum) {
   }
   const int64_t sparse_dims_to_sum_size = dims_to_sum_v.size() - dense_dims_to_sum_v.size();
   const bool sum_all_sparse_dim = (sparse_dim == sparse_dims_to_sum_size);
-  const bool sum_dense_dim = (dense_dims_to_sum_v.size() > 0);
+  const bool sum_dense_dim = (!dense_dims_to_sum_v.empty());
 
   // new values
   Tensor new_values;
@@ -1780,7 +1798,7 @@ Tensor _sparse_sum_backward_cpu(const Tensor& grad_, const SparseTensor& input_,
   }
 
   const bool sum_all_sparse_dim = (input_sparse_dim == sparse_dims_to_sum_size);
-  const bool sum_dense_dim = (dense_dims_to_sum_v.size() > 0);
+  const bool sum_dense_dim = (!dense_dims_to_sum_v.empty());
   const bool sum_sparse_dim = (sparse_dims_to_sum_size > 0);
 
   if (sum_all_sparse_dim) {
diff --git a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h
index 9b2ef61df5fe..18a20cdff6a2 100644
--- a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h
+++ b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h
@@ -49,6 +49,55 @@ _assert(const bool cond, const char* const message) {
 
 enum class CDimName : bool { CRow, CCol };
 
+template <size_t static_shape_max_len>
+class TensorGeometryHolder {
+  using geometry_holder_t = std::array<int64_t, static_shape_max_len>;
+
+public:
+  explicit TensorGeometryHolder(const Tensor& t) {
+    std::copy(t.sizes().begin(), t.sizes().end(), t_sizes.begin());
+    std::copy(t.strides().begin(), t.strides().end(), t_strides.begin());
+  }
+
+  auto operator*() const {
+    return std::make_tuple(t_sizes, t_strides);
+  }
+
+private:
+  geometry_holder_t t_sizes;
+  geometry_holder_t t_strides;
+};
+
+template <>
+class TensorGeometryHolder<0> {
+  using geometry_holder_t = Tensor;
+
+public:
+  explicit TensorGeometryHolder(const Tensor& t) {
+    const auto t_ndims = t.dim();
+    const auto cpu_options = t.options().dtype(kLong).device(kCPU);
+    Tensor t_sizes_and_strides_cpu = at::empty({2, t_ndims}, cpu_options);
+    t_sizes_and_strides_cpu.select(0, 0).copy_(
+        at::tensor(t.sizes(), cpu_options));
+    t_sizes_and_strides_cpu.select(0, 1).copy_(
+        at::tensor(t.strides(), cpu_options));
+    const Tensor t_sizes_and_strides =
+        t_sizes_and_strides_cpu.to(t.device());
+    t_sizes = t_sizes_and_strides.select(0, 0);
+    t_strides = t_sizes_and_strides.select(0, 1);
+  }
+
+  auto operator*() const {
+    return std::make_tuple(
+        t_sizes.template data_ptr<int64_t>(),
+        t_strides.template data_ptr<int64_t>());
+  }
+
+private:
+  geometry_holder_t t_sizes;
+  geometry_holder_t t_strides;
+};
+
 // Invariant 5.1
 // compressed_index[..., 0] == 0.
 template <CDimName cdim_name, typename index_t>
@@ -190,7 +239,8 @@ template <
     class kernel_t,
     template <typename func_t, typename vec_func_t>
     class vec_kernel_t = EmptyVecKernel,
-    template <typename scalar_t> class Vec = DummyVec>
+    template <typename scalar_t> class Vec = DummyVec,
+    size_t static_shape_max_len = 0>
 void _validate_compressed_sparse_indices_kernel(
     const Tensor& cidx,
     const Tensor& idx,
@@ -269,14 +319,10 @@ void _validate_compressed_sparse_indices_kernel(
         at::arange(batch_count, cidx.options()).view(batch_dims).unsqueeze_(-1);
 
     const auto idx_ndims = idx.dim();
-    const auto cpu_options = idx.options().dtype(kLong).device(kCPU);
-    Tensor idx_sizes_and_strides_cpu = at::empty({2, idx_ndims}, cpu_options);
-    idx_sizes_and_strides_cpu.select(0, 0).copy_(
-        at::tensor(idx.sizes(), cpu_options));
-    idx_sizes_and_strides_cpu.select(0, 1).copy_(
-        at::tensor(idx.strides(), cpu_options));
-    const Tensor idx_sizes_and_strides =
-        idx_sizes_and_strides_cpu.to(idx.device());
+
+    const auto idx_geometry_holder = TensorGeometryHolder<static_shape_max_len>(idx);
+    const auto idx_sizes = std::get<0>(*idx_geometry_holder);
+    const auto idx_strides = std::get<1>(*idx_geometry_holder);
 
     auto iter = TensorIteratorConfig()
                     .set_check_mem_overlap(false)
@@ -291,11 +337,8 @@ void _validate_compressed_sparse_indices_kernel(
     AT_DISPATCH_INDEX_TYPES(
         idx.scalar_type(),
         NAME,
-        [&iter, &idx, dim, nnz, idx_ndims, &idx_sizes_and_strides]() {
+        [&iter, &idx, dim, nnz, idx_ndims, &idx_sizes, &idx_strides]() {
           const auto* RESTRICT ptr_idx = idx.data_ptr<index_t>();
-          const int64_t* RESTRICT idx_sizes =
-              idx_sizes_and_strides.data_ptr<int64_t>();
-          const int64_t* RESTRICT idx_strides = idx_sizes + idx_ndims;
           const auto zero = index_t{0};
           KernelLauncher::launch(
               iter,
@@ -348,18 +391,41 @@ void validate_compressed_sparse_indices_kernel(
     const int64_t cdim,
     const int64_t dim,
     const int64_t nnz) {
+  constexpr size_t idx_max_ndims = 8; // up to 7-dim batch.
+  const int64_t idx_ndims = idx.dim();
+
   if (is_crow) {
-    _validate_compressed_sparse_indices_kernel<
-        CDimName::CRow,
-        kernel_t,
-        vec_kernel_t,
-        Vec>(cidx, idx, cdim, dim, nnz);
+    if (idx_ndims <= idx_max_ndims) {
+      _validate_compressed_sparse_indices_kernel<
+          CDimName::CRow,
+          kernel_t,
+          vec_kernel_t,
+          Vec,
+          idx_max_ndims>(cidx, idx, cdim, dim, nnz);
+    }
+    else {
+      _validate_compressed_sparse_indices_kernel<
+          CDimName::CRow,
+          kernel_t,
+          vec_kernel_t,
+          Vec>(cidx, idx, cdim, dim, nnz);
+    }
   } else {
-    _validate_compressed_sparse_indices_kernel<
-        CDimName::CCol,
-        kernel_t,
-        vec_kernel_t,
-        Vec>(cidx, idx, cdim, dim, nnz);
+    if (idx_ndims <= idx_max_ndims) {
+      _validate_compressed_sparse_indices_kernel<
+          CDimName::CCol,
+          kernel_t,
+          vec_kernel_t,
+          Vec,
+          idx_max_ndims>(cidx, idx, cdim, dim, nnz);
+    }
+    else {
+      _validate_compressed_sparse_indices_kernel<
+          CDimName::CCol,
+          kernel_t,
+          vec_kernel_t,
+          Vec>(cidx, idx, cdim, dim, nnz);
+    }
   }
 }
 
diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu
index 34a864a8fae0..1dc0edd4bd04 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu
@@ -18,7 +18,6 @@
 #else
 #include <ATen/ops/_coalesce_native.h>
 #include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
-#include <ATen/ops/_sparse_mask_helper_native.h>
 #include <ATen/ops/empty.h>
 #include <ATen/ops/zeros.h>
 #endif
@@ -40,43 +39,6 @@
 namespace at { namespace native {
 
 using namespace at::sparse;
-using at::cuda::detail::TensorInfo;
-using at::cuda::detail::getTensorInfo;
-
-namespace {
-
-template <typename scalar_t>
-C10_LAUNCH_BOUNDS_1(1024)
-__global__ void _sparse_mask_copy_kernel(
-    int64_t total_threads,
-    int64_t t_nnz,
-    const TensorInfo<int64_t, int64_t> t_indices_ti,
-    const TensorInfo<int64_t, int64_t> mask_indices_ti,
-    const TensorInfo<int64_t, int64_t> t_indices_pos_ti,
-    const TensorInfo<scalar_t, int64_t> t_values_ti,
-    TensorInfo<scalar_t, int64_t> r_values_ti) {
-  const int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i >= total_threads) return;
-  const int64_t j = t_indices_pos_ti.data[i];
-
-  bool has_match = false;
-  if (j >= 0 &&  j < t_nnz && t_indices_ti.data[j] == mask_indices_ti.data[i]) {
-    has_match = true;
-  }
-
-  int64_t values_stride0 = r_values_ti.strides[0];
-  int64_t out_start = i * values_stride0;
-  int64_t out_end = (i + 1) * values_stride0;
-  int64_t in_start = j * t_values_ti.strides[0];
-
-  if (has_match) {
-    for (int64_t out_i = out_start, in_i = in_start; out_i < out_end; out_i++, in_i++) {
-      r_values_ti.data[out_i] = t_values_ti.data[in_i];
-    }
-  }
-}
-
-} // end namespace
 
 SparseTensor _coalesce_sparse_cuda(const SparseTensor& self) {
   int64_t nnz = self._nnz();
@@ -204,98 +166,4 @@ SparseTensor _coalesce_sparse_cuda(const SparseTensor& self) {
   return dst;
 }
 
-Tensor sparse_mask_helper_cuda(
-    const SparseTensor& t,
-    const Tensor& mask_indices) {
-  /*
-    This is a helper function which filter values from `t._values()` using the
-    `mask_indices`. This CUDA implementation uses `thrust::lower_bound`
-    operation to find the intersection of the `mask_indices` and the
-    `t._indices()` to then filter the values.
-
-    Inputs:
-      `t`             - coalesced sparse tensor input
-      `mask_indices`  - mask indices tensor
-
-    Note: The nnz in the output tensor will be same as the `mask_indices`. So it will
-    works independently if the mask is coalesced or not.
-  */
-  TORCH_CHECK(t.is_sparse(), "t: input is not a sparse tensor");
-  TORCH_CHECK(t.is_coalesced(), "t:  input is uncoalesced");
-  TORCH_CHECK(mask_indices.dim() == t._indices().dim(), "mask_indices: operands have incompatible indices dim; self has dim ",
-      t._indices().dim(), " but mask has dim ", mask_indices.dim());
-  TORCH_CHECK(mask_indices.is_contiguous(), "mask_indices: mask is not contiguous");
-
-  int64_t r_nnz = mask_indices.size(1);
-  auto t_values = t._values().contiguous();
-  auto full_size = t.sizes();
-  auto vsize = t_values.sizes().vec();
-  vsize[0] = r_nnz;
-
-
-  if (t.sparse_dim() == 0) {
-    Tensor t_values_expand = t_values;
-    t_values_expand = t_values_expand.expand(vsize).contiguous();
-    return t_values_expand;
-  }
-  Tensor r_values = at::zeros({vsize}, t_values.options());
-  auto t_indices = t._indices().contiguous();
-  auto t_nnz = t._nnz();
-
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  at::cuda::ThrustAllocator allocator;
-  auto policy = thrust::cuda::par(allocator).on(stream);
-
-  // Step 1: flatten the sparse indices `t._indices()` tensor into a 1D indices
-  // tensor `t_flatten_indices`.
-  auto t_flatten_indices = at::sparse::flatten_indices(t_indices, full_size).contiguous();
-
-  // Step 2: flatten the sparse indices `mask_indices` tensor into a 1D indices
-  // tensor `mask_flatten_indices`. Note: This could be not sorted if the input
-  // indices in the constructor are not in a coalesced form
-  auto flattened_mask_indices =
-      at::sparse::flatten_indices(mask_indices, full_size);
-
-  Tensor t_indices_pos = at::empty({r_nnz}, mask_indices.options());
-
-  // Step 3: Match the flattened `mask_indices` with the flattened
-  // `t._indices()` using the `thrust::lower_bound`
-  thrust::lower_bound(
-      policy,
-      t_flatten_indices.data_ptr<int64_t>(),
-      t_flatten_indices.data_ptr<int64_t>() + t_nnz,
-      flattened_mask_indices.data_ptr<int64_t>(),
-      flattened_mask_indices.data_ptr<int64_t>() + r_nnz,
-      t_indices_pos.data_ptr<int64_t>());
-
-  // Step 4: Copy the Filtered `t._values()` using the matches at `t_indices_pos`
-  if (r_nnz > 0 && t_values.numel() > 0) {
-    int64_t block_size = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
-    auto grid_size = ceil_div(r_nnz, block_size);
-
-    auto t_indices_ti = getTensorInfo<int64_t, int64_t>(t_flatten_indices);
-    auto mask_indices_ti =
-        getTensorInfo<int64_t, int64_t>(flattened_mask_indices);
-    auto t_indices_pos_ti =
-        getTensorInfo<int64_t, int64_t>(t_indices_pos);
-
-    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(kHalf,
-        r_values.scalar_type(), "sparse_mask_helper_cuda", [&] {
-          auto t_values_ti = getTensorInfo<scalar_t, int64_t>(t_values);
-          auto r_values_ti =
-              getTensorInfo<scalar_t, int64_t>(r_values);
-
-          _sparse_mask_copy_kernel<scalar_t><<<grid_size, block_size, 0, stream>>>(
-              r_nnz,
-              t_nnz,
-              t_indices_ti,
-              mask_indices_ti,
-              t_indices_pos_ti,
-              t_values_ti,
-              r_values_ti);
-          C10_CUDA_KERNEL_LAUNCH_CHECK();
-        });
-  }
-  return r_values;
-}
 }} // namespace at::native
diff --git a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu
index 596f8c3b94c7..743ddfaea35c 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu
@@ -263,6 +263,11 @@ Tensor& add_out_sparse_csr_cuda(
         self.sizes(),
         " and tensor `other` with shape ",
         other.sizes());
+
+    if (only_sparse_compressed_add_trivial_cases(self, other, alpha, out)) {
+      return out;
+    }
+
     at::native::resize_as_sparse_compressed_(out, self);
     sparse::impl::cuda::add_out_sparse_csr(self, other, Scalar(1), alpha, out);
   }
diff --git a/aten/src/ATen/native/tags.yaml b/aten/src/ATen/native/tags.yaml
index 4310a471ecba..4542be5df75e 100644
--- a/aten/src/ATen/native/tags.yaml
+++ b/aten/src/ATen/native/tags.yaml
@@ -30,15 +30,15 @@
   desc: |
           This tag indicates if an operator doesn't guarentee bitwise equivalence
           across different runs of an operator with identical inputs.
-- tag: canonical
+- tag: core
   desc: |
-          Canonical aten ops is a subset of aten ops that remains after aten-to-aten decomposition and
-          functionalization pass. Canonical aten ops are fully functional and adhere to single static
+          Core aten ops is a subset of aten ops that remains after aten-to-aten decomposition and
+          functionalization pass. Core aten ops are fully functional and adhere to single static
           assignment (SSA): this implies there will be no `inplace` or `_out` variants in this opset.
           This opset is designed to serve as the functional IR to interface with compiler backends.
-          In contrast to primTorch, canonical aten opset doesn't decompose ops into explicit
+          In contrast to primTorch, core aten opset doesn't decompose ops into explicit
           type promotion and broadcasting ops.
-          Canonical aten ops is also effectively the opset produced by torchdynamo.export(aten_graph=True),
+          Core aten ops is also effectively the opset produced by torchdynamo.export(aten_graph=True),
           and thus can be used as an opset for export purpose.
 - tag: pointwise
   desc: |
diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp
index 58a3b3ee5722..a5f01419368e 100644
--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@@ -14,6 +14,9 @@
 #include <utility>
 #include <c10/core/SymIntArrayRef.h>
 #include <c10/util/Logging.h>
+#include <c10/util/Exception.h>
+#include <c10/core/DispatchKey.h>
+#include <c10/core/DispatchKeySet.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/NativeFunctions.h>
@@ -498,6 +501,8 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> native_decoder_only_multi_head_attent
   // query shape: [B, T, D]
   // qkv_weight shape: [3 * D, D]
 
+  TORCH_WARN("_native_decoder_only_multi_head_attention is deprecated");
+
   TORCH_CHECK(
       !mask || !query.is_nested(),
       "NestedTensor with mask is not supported yet");
@@ -658,10 +663,80 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> native_decoder_only_multi_head_attent
 }
 
 int64_t _fused_sdp_choice_cpp(const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal){
+        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool is_causal){
+  return static_cast<int64_t>(sdp::SDPBackend::math);
+}
+
+int64_t _fused_sdp_choice_meta(
+    const Tensor& query_,
+    const Tensor& key,
+    const Tensor& value,
+    const c10::optional<Tensor>& attn_mask_,
+    double dropout_p,
+    bool is_causal) {
+  auto query_key_set = query_.key_set();
+  bool has_cuda = query_key_set.has(c10::DispatchKey::CUDA);
+  if (has_cuda) {
+    auto choice_int = _fused_sdp_choice_stub(
+        at::kCUDA,
+        query_,
+        key,
+        value,
+        attn_mask_,
+        dropout_p,
+        is_causal);
+    return choice_int;
+  }
   return static_cast<int64_t>(sdp::SDPBackend::math);
 }
 
+//  !!!!!! TODO: THIS NEEDS TO BE REMOVED BUT PEOPLE HAVE TRAINED THEIR MODELS
+//  WITH THIS OP BUILTIN !!!!!!
+std::tuple<Tensor, Tensor> _scaled_dot_product_attention(
+    const Tensor& query_,
+    const Tensor& key,
+    const Tensor& value,
+    const c10::optional<Tensor>& attn_mask_,
+    double dropout_p,
+    bool need_attn_weights,
+    bool is_causal) {
+  if (!need_attn_weights) {
+    return std::make_tuple(
+        at::scaled_dot_product_attention(
+            query_, key, value, attn_mask_, dropout_p, is_causal),
+        Tensor());
+  }
+  return at::_scaled_dot_product_attention_math(
+      query_, key, value, attn_mask_, dropout_p, is_causal);
+}
+
+inline void validate_sdpa_input(
+    const Tensor& query_,
+    const Tensor& key,
+    const Tensor& value,
+    const c10::optional<Tensor>& attn_mask_,
+    double dropout_p,
+    bool is_causal) {
+  TORCH_CHECK(
+      query_.dtype() == key.dtype() && query_.dtype() == value.dtype(),
+      "Expected query, key, and value to have the same dtype, but got query.dtype: ",
+      query_.dtype(), " key.dtype: ", key.dtype(), " and value.dtype: ", value.dtype(), " instead.");
+  TORCH_CHECK(
+      query_.device() == key.device() && query_.device() == value.device(),
+      "Expected query, key, and value to have the same device type, but got query.device: ",
+      query_.device(), " key.device: ", key.device(), " and value.device: ", value.device(), " instead.");
+  TORCH_CHECK(
+      query_.dim() >= 2 && key.dim() >= 2 && value.dim() >= 2,
+      "Expected query, key, and value to all be  at least 2 dimensional, but got query.dim: ",
+      query_.dim(), " key.dim: ", key.dim(), " and value.dim: ", value.dim(), " instead.");
+  if (attn_mask_.has_value()){
+    auto mask_dtype = attn_mask_->dtype();
+    TORCH_CHECK(mask_dtype == at::kBool || mask_dtype == query_.dtype(),
+      "Expected attn_mask dtype to be bool or to match query dtype, but got attn_mask.dtype: ",
+      mask_dtype, " and  query.dtype: ", query_.dtype(), " instead.");
+  }
+  return;
+}
 // Computes scaled dot product attention on query, key and value tensors, using
 // an optional attention mask if passed, and applying dropout if a probability
 // greater than 0.0 is specified.
@@ -690,32 +765,25 @@ int64_t _fused_sdp_choice_cpp(const Tensor& query_, const Tensor& key, const Ten
 //     S: Source sequence length
 //     L: Target sequence length
 //     E: Embedding dimension
-std::tuple<Tensor, Tensor> _scaled_dot_product_attention(
+Tensor scaled_dot_product_attention(
     const Tensor& query_,
     const Tensor& key,
     const Tensor& value,
     const c10::optional<Tensor>& attn_mask_,
     double dropout_p,
-    bool need_attn_weights,
     bool is_causal) {
-  // TODO: The second return is the attention weights if the math kernel is
-  // used. The fused kernels do not return this Tensor so for the fused kernels
-  // The second return SHOULD always be an empty Tensor, unless need_attn_weights
-  // is true (in which case the fused kernels would not be called). This blows up
-  // op_info tests.
+  validate_sdpa_input(query_, key, value, attn_mask_, dropout_p, is_causal);
   int64_t choice_int = static_cast<int64_t>(sdp::SDPBackend::math);
   if (query_.device().type() == DeviceType::CUDA){
     choice_int = _fused_sdp_choice_stub(query_.device().type(),
-      query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
+      query_, key, value, attn_mask_, dropout_p, is_causal);
   }
   sdp::SDPBackend backend = static_cast<sdp::SDPBackend>(choice_int);
   switch (backend) {
     case sdp::SDPBackend::flash_attention: {
       auto out_lse_softmax = at::_scaled_dot_product_flash_attention(
-          query_, key, value, dropout_p, need_attn_weights, is_causal);
-      return std::make_tuple(
-          std::move(std::get<0>(out_lse_softmax)),
-          std::move(std::get<2>(out_lse_softmax)));
+          query_, key, value, dropout_p, is_causal);
+      return std::get<0>(out_lse_softmax);
     }
     case sdp::SDPBackend::efficient_attention: {
       bool compute_logsumexp =
@@ -723,36 +791,28 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention(
            value.requires_grad());
       auto out_and_lse = at::_scaled_dot_product_efficient_attention(
           query_, key, value, compute_logsumexp, is_causal);
-      // We need to make an empty tensor in the shape of attention weights
-      // for the sake of meta tensors.
-      if (query_.is_nested()) {
-        // TODO: Need to fix when we have empty for nested tensors.
-        return out_and_lse;
-      }
-      return std::make_tuple(
-          std::move(std::get<0>(out_and_lse)),
-          at::empty_symint({0}, query_.options()));
+      return std::get<0>(out_and_lse);
     }
     case sdp::SDPBackend::math:
-      return at::_scaled_dot_product_attention_math(
+      return std::get<0>(at::_scaled_dot_product_attention_math(
           query_,
           key,
           value,
           attn_mask_,
           dropout_p,
-          need_attn_weights,
-          is_causal);
+          is_causal));
     default:
       TORCH_CHECK(
           false,
           "No viable backend for scaled_dot_product_attention was found.");
-      return std::make_tuple(Tensor(), Tensor());
+      return Tensor();
   }
 }
 
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
         const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal) {
+        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool is_causal,
+        const c10::optional<Tensor>& dropout_mask) {
   C10_LOG_API_USAGE_ONCE("torch.sdpa.math_fallback");
   if (query_.is_nested() || key.is_nested() || value.is_nested()) {
     TORCH_CHECK(
@@ -795,15 +855,16 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
     }
     attn = at::softmax(attn, -1);
     if (dropout_p > 0.0) {
-      attn = at::dropout(attn, dropout_p, true);
+      if (dropout_mask.has_value()) {
+        auto attn_dropout_masked = attn.masked_fill(dropout_mask->logical_not(), 0.0);
+        auto dropout_scaling = 1.0 / (1 - dropout_p);
+        return std::make_tuple(at::matmul(attn_dropout_masked, value * dropout_scaling), attn);
+      } else {
+        attn = at::dropout(attn, dropout_p, true);
+      }
     }
-    const auto output = at::matmul(attn, value);
-    // If you don't need it then you don't get it.
-    // TODO: Need to fix when we have empty for nested tensors.
-    attn = need_attn_weights || query_.is_nested()
-        ? attn
-        : at::empty_symint({0}, query_.options());
-    return std::make_tuple(output, attn);
+
+    return std::make_tuple(at::matmul(attn, value), attn);
 }
 
 Tensor triton_multi_head_attention(
diff --git a/aten/src/ATen/native/transformers/attention.h b/aten/src/ATen/native/transformers/attention.h
index febe72b8d38e..c34bdf7af88f 100644
--- a/aten/src/ATen/native/transformers/attention.h
+++ b/aten/src/ATen/native/transformers/attention.h
@@ -8,7 +8,7 @@ namespace at {
 namespace native {
 
 using fused_sdp_choice_fn = int64_t (*)(const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal);
+        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool is_causal);
 
 DECLARE_DISPATCH(fused_sdp_choice_fn, _fused_sdp_choice_stub);
 
@@ -17,7 +17,7 @@ TORCH_API Tensor masked_softmax(
     Tensor& attn_scores,
     c10::optional<Tensor> attn_mask,
     const Tensor& query,
-    c10::optional<int64_t> mask_type = NULL);
+    c10::optional<int64_t> mask_type = {});
 
 TORCH_API Tensor transform0213_gemm_nt_bias(
     const Tensor& a,
diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu
index 56a4e49d4412..36eba2a472c3 100644
--- a/aten/src/ATen/native/transformers/cuda/attention.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention.cu
@@ -557,7 +557,7 @@ std::tuple<Tensor, Tensor> native_multi_head_attention_cuda(
 
 #endif
   const auto dim_per_head = D / num_head;
-  if ((query.is_same(key) && key.is_same(value)) && dim_per_head % 8 == 0 ) {
+  if ((query.is_same(key) && key.is_same(value)) && dim_per_head % 8 == 0 && !need_weights) {
 
     // We have not done linear projection yet but the input for SDP
     // Is expected to be 4 dimensional. We "cheaply" create view tensors
@@ -566,7 +566,7 @@ std::tuple<Tensor, Tensor> native_multi_head_attention_cuda(
     auto k = key.view({key.size(0), -1, num_head, dim_per_head}).transpose(1, 2);
     auto v = value.view({value.size(0), -1, num_head, dim_per_head}).transpose(1, 2);
 
-    sdp::sdp_params kernel_params{q, k, v, mask.has_value(), 0.0, need_weights, false};
+    sdp::sdp_params kernel_params{q, k, v, mask.has_value(), 0.0, false};
     auto backend = select_sdp_backend(kernel_params);
     if (backend == sdp::SDPBackend::flash_attention || backend == sdp::SDPBackend::efficient_attention) {
       auto x = at::linear(query, qkv_weight, qkv_bias);
@@ -580,10 +580,10 @@ std::tuple<Tensor, Tensor> native_multi_head_attention_cuda(
       chunks[2] = (chunks[2].view({x_size_0, -1, num_head, dim_per_head}))
                       .transpose(1, 2);
 
-      auto y = at::_scaled_dot_product_attention(
-          chunks[0], chunks[1], chunks[2], mask, 0.0, need_weights, false);
-      auto past_sdp =
-          std::get<0>(y).transpose(1, 2).reshape({x_size_0, -1, embed_dim});
+      auto y = at::scaled_dot_product_attention(
+          chunks[0], chunks[1], chunks[2], mask, 0.0, false);
+
+      auto past_sdp = y.transpose(1, 2).reshape({x_size_0, -1, embed_dim});
       return std::make_tuple(
           at::linear(past_sdp, proj_weight, proj_bias), Tensor());
     }
@@ -679,14 +679,13 @@ std::tuple<Tensor, Tensor> native_multi_head_attention_cuda(
   }
   return std::make_tuple(std::move(proj), std::move(qkt));
 }
-
-std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_flash_attention_cuda(
+std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t, int64_t, int64_t, int64_t, Tensor> _scaled_dot_product_flash_attention_cuda(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
     double dropout_p,
-    bool return_softmax,
-    bool is_causal) {
+    bool is_causal,
+    bool return_debug_mask) {
   // Used for tracking usage statistics
   C10_LOG_API_USAGE_ONCE("torch.sdpa.flash_attention");
   // Query (Batch x Num_heads x Q_seq_len  x Dim_per_head)
@@ -730,8 +729,9 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_flash_attention_cuda(
   Tensor key_reshaped = k_t.reshape({Nnz_kv, num_heads, head_dim});
   Tensor value_reshaped = v_t.reshape({Nnz_kv, num_heads, head_dim});
 
-  Tensor attention, log_sumexp, softmax;
-  std::tie(attention, log_sumexp, softmax) =
+  Tensor attention, log_sumexp, debug_attn_mask;
+  int64_t philox_seed{0}, philox_offset{0};
+  std::tie(attention, log_sumexp, philox_seed, philox_offset, debug_attn_mask) =
       at::_flash_attention_forward(
           query_reshaped,
           key_reshaped,
@@ -740,14 +740,14 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_flash_attention_cuda(
           cumulative_sequence_length_k,
           max_seqlen_batch_q,
           max_seqlen_batch_k,
-          return_softmax,
           dropout_p,
-          is_causal);
+          is_causal,
+          return_debug_mask);
   // Reshape output to convert nnz to batch_size and seq_len
   attention =
       attention.view({batch_size, max_seqlen_batch_q, num_heads, head_dim}).transpose(1,2);
 
-  return std::make_tuple(attention, log_sumexp, softmax);
+  return std::make_tuple(attention, log_sumexp, cumulative_sequence_length_q, cumulative_sequence_length_k, max_seqlen_batch_q, max_seqlen_batch_k, philox_seed, philox_offset, debug_attn_mask);
 }
 
 std::tuple<Tensor, Tensor> _scaled_dot_product_efficient_attention_cuda(
@@ -780,8 +780,8 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_efficient_attention_cuda(
 }
 
 int64_t _fused_sdp_choice_cuda(const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal){
-  sdp::sdp_params kernel_params{query_, key, value, attn_mask_.has_value(), dropout_p, need_attn_weights, is_causal};
+        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool is_causal){
+  sdp::sdp_params kernel_params{query_, key, value, attn_mask_.has_value(), dropout_p, is_causal};
   auto backend = select_sdp_backend(kernel_params);
   if (backend == sdp::SDPBackend::error) {
     TORCH_CHECK(
@@ -809,7 +809,7 @@ bool _chunk_grad_outputs_efficient_attention(
 }
 
 
-std::tuple<Tensor, Tensor, Tensor> _flash_attention_forward(
+std::tuple<Tensor, Tensor, int64_t, int64_t, Tensor> _flash_attention_forward(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
@@ -817,28 +817,47 @@ std::tuple<Tensor, Tensor, Tensor> _flash_attention_forward(
     const Tensor& cumulative_sequence_length_k,
     const int64_t max_seqlen_batch_q,
     const int64_t max_seqlen_batch_k,
-    bool return_softmax,
     double dropout_p,
-    bool is_causal) {
+    bool is_causal,
+    bool return_debug_mask) {
 #if defined(USE_FLASH_ATTENTION)
+  /*
+  num_splits determines how much to parallelize over the seqlen_q dimension
+  num_splits=0 means
+  it will be set by an internal heuristic. We're exposing num_splits mostly for
+  benchmarking. We will hard code it to 0 for now
+  */
+  constexpr int num_splits{0};
   auto softmax_scale = std::pow(query.size(-1), -0.5);
-  return fmha::mha_fwd(
+  at::Tensor output = at::empty_like(query);
+
+  Tensor logsumexp, debug_attn_mask;
+  uint64_t philox_seed{0}, philox_offset{0};
+  std::tie(logsumexp, philox_seed, philox_offset, debug_attn_mask) = fmha::mha_fwd(
       query,
       key,
       value,
+      output,
       cumulative_sequence_length_q,
       cumulative_sequence_length_k,
       max_seqlen_batch_q,
       max_seqlen_batch_k,
       dropout_p,
       softmax_scale,
-      false,
+      false, /*zero_tensors = false for all calls here*/
       is_causal,
-      return_softmax,
-      c10::nullopt);
+      return_debug_mask, /*return_softmax (this is used for testing)*/
+      num_splits);
+
+  debug_attn_mask = return_debug_mask ? debug_attn_mask : at::empty({0}, query.options());
+
+  int64_t signed_philox_seed = sdp::bit_cast<int64_t>(philox_seed);
+  int64_t signed_philox_offset= sdp::bit_cast<int64_t>(philox_offset);
+
+  return std::make_tuple(output, logsumexp, signed_philox_seed, signed_philox_offset, debug_attn_mask);
 #endif
   TORCH_CHECK(false, "USE_FLASH_ATTENTION was not enabled for build.")
-  return std::make_tuple(Tensor(), Tensor(), Tensor());
+  return std::make_tuple(Tensor(), Tensor(), 0, 0, Tensor());
 }
 
 std::tuple<at::Tensor, at::Tensor> _efficient_attention_forward(
diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
index 62d4de230626..dac4fee66df5 100644
--- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
@@ -5,14 +5,16 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAMathCompat.h>
 
+#include <c10/core/TensorImpl.h>
 #include <ATen/native/nested/NestedTensorTransformerFunctions.h>
 #include <ATen/native/nested/NestedTensorUtils.h>
 #include <ATen/native/transformers/attention.h>
 #include <ATen/native/transformers/cuda/sdp_utils.h>
+#include <ATen/cuda/CUDAGeneratorImpl.h>
 
-#include <iostream>
 #ifdef USE_FLASH_ATTENTION
 #include <ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h>
+#include <ATen/native/transformers/cuda/flash_attn/fmha_api.h>
 #endif
 
 #define ASSIGN_CHECK_OVERFLOW(A, B)                                            \
@@ -69,6 +71,71 @@ namespace at {
 
 namespace native {
 
+std::tuple<Tensor, Tensor, Tensor> _flash_attention_backward(
+    const Tensor& grad_out,
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    const Tensor& out,
+    const Tensor& logsumexp,
+    const Tensor& cumulative_sequence_length_q,
+    const Tensor& cumulative_sequence_length_k,
+    const int64_t max_seqlen_batch_q,
+    const int64_t max_seqlen_batch_k,
+    double dropout_p,
+    bool is_causal,
+    const int64_t philox_seed,
+    const int64_t philox_offset) {
+#if defined(USE_FLASH_ATTENTION)
+  /*
+  num_splits determines how much to parallelize over the seqlen_q dimension
+  num_splits=0 means
+  it will be set by an internal heuristic. We're exposing num_splits mostly for
+  benchmarking. We will hard code it to 0 for now
+  */
+  constexpr int num_splits{0};
+  auto softmax_scale = std::pow(query.size(-1), -0.5);
+  //  CUDA code assumes that dout is contiguous
+  auto contiguous_grad_out = grad_out.contiguous();
+  auto contiguous_out = out.contiguous();
+  Tensor dq = at::empty_like(query);
+  Tensor dk = at::empty_like(key);
+  Tensor dv = at::empty_like(value);
+  //  The kernel computes irregadless we will drop for this functions return
+  Tensor grad_softmax;
+
+  uint64_t unsigned_philox_seed = sdp::bit_cast<uint64_t>(philox_seed);
+  uint64_t unsigned_philox_offset = sdp::bit_cast<uint64_t>(philox_offset);
+
+  std::tie(dq, dk, dv, grad_softmax) = fmha::mha_bwd(
+          contiguous_grad_out,
+          query,
+          key,
+          value,
+          contiguous_out,
+          logsumexp,
+          dq,
+          dk,
+          dv,
+          cumulative_sequence_length_q,
+          cumulative_sequence_length_k,
+          max_seqlen_batch_q,
+          max_seqlen_batch_k,
+          dropout_p,
+          softmax_scale,
+          false, /*zero_tensors = false for all calls here*/
+          is_causal,
+          num_splits,
+          unsigned_philox_seed,
+          unsigned_philox_offset
+  );
+  return std::make_tuple(dq, dk, dv);
+#endif
+  TORCH_CHECK(false, "USE_FLASH_ATTENTION was not enabled for build.")
+  return std::make_tuple(Tensor(), Tensor(), Tensor());
+}
+
+
 std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
     const at::Tensor& grad_out_,
     const at::Tensor& query,
@@ -260,6 +327,69 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
   return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
 }
 
+std::tuple<at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_flash_attention_backward_cuda(
+    const at::Tensor& grad_out_,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const at::Tensor& out,
+    const at::Tensor& logsumexp,
+    const Tensor& cumulative_sequence_length_q,
+    const Tensor& cumulative_sequence_length_k,
+    const int64_t max_seqlen_batch_q,
+    const int64_t max_seqlen_batch_k,
+    double dropout_p,
+    bool is_causal,
+    const int64_t philox_seed,
+    const int64_t philox_offset){
+  if (!grad_out_.defined()) {
+    return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
+  }
+
+  const int64_t batch_size = query.size(0);
+  const int64_t num_heads = query.size(1);
+  const int64_t head_dim = query.size(3);
+
+  Tensor q_t = query.transpose(1, 2);
+  Tensor k_t = key.transpose(1, 2);
+  Tensor v_t = value.transpose(1, 2);
+
+
+  int64_t Nnz_q{batch_size * max_seqlen_batch_q};
+  int64_t Nnz_kv{batch_size * max_seqlen_batch_k};
+
+  // For the standard MHA these will actually be views
+  Tensor query_reshaped = q_t.reshape({Nnz_q, num_heads, head_dim});
+  Tensor key_reshaped = k_t.reshape({Nnz_kv, num_heads, head_dim});
+  Tensor value_reshaped = v_t.reshape({Nnz_kv, num_heads, head_dim});
+
+  auto grad_out_reshaped = grad_out_.transpose(1,2).reshape({{Nnz_q, num_heads, head_dim}});
+  auto out_reshaped = out.transpose(1,2).reshape({Nnz_q, num_heads, head_dim});
+
+  Tensor grad_q, grad_k, grad_v;
+  std::tie(grad_q, grad_k, grad_v) = at::_flash_attention_backward(
+    grad_out_reshaped,
+    query_reshaped,
+    key_reshaped,
+    value_reshaped,
+    out_reshaped,
+    logsumexp,
+    cumulative_sequence_length_q,
+    cumulative_sequence_length_k,
+    max_seqlen_batch_q,
+    max_seqlen_batch_k,
+    dropout_p,
+    is_causal,
+    philox_seed,
+    philox_offset);
+
+  grad_q = grad_q.view({batch_size, max_seqlen_batch_q, num_heads, head_dim}).transpose(1,2);
+  grad_k = grad_k.view({batch_size, max_seqlen_batch_k, num_heads, head_dim}).transpose(1,2);
+  grad_v = grad_v.view({batch_size, max_seqlen_batch_k, num_heads, head_dim}).transpose(1,2);
+
+  return std::make_tuple(grad_q, grad_k, grad_v);
+}
+
 
 std::tuple<at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_efficient_attention_backward_cuda(
     const at::Tensor& grad_out_,
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/epilogue.h b/aten/src/ATen/native/transformers/cuda/flash_attn/epilogue.h
deleted file mode 100644
index 2bf4e1eb5482..000000000000
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/epilogue.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2022, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include <cutlass/cutlass.h>
-#include <cutlass/epilogue/threadblock/default_epilogue_tensor_op.h>
-#include <cutlass/epilogue/threadblock/default_thread_map_tensor_op.h>
-#include <cutlass/epilogue/warp/fragment_iterator_tensor_op.h>
-#include <cutlass/gemm/warp/default_mma_tensor_op.h>
-#include <cutlass/layout/layout.h>
-#include <cutlass/arch/mma.h>
-#include <cutlass/array.h>
-#include <cutlass/numeric_types.h>
-
-#include <ATen/native/transformers/cuda/flash_attn/gemm.h>
-#include <ATen/native/transformers/cuda/flash_attn/epilogue_predicated_tile_iterator.h>
-
-namespace fmha {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename MmaCore>
-struct FMHAEpilogue {
-
-    using ThreadblockShape = typename MmaCore::Shape;
-    using WarpMma = typename MmaCore::MmaTensorOp;
-    using LayoutC = typename MmaCore::LayoutC;
-    using Element = typename MmaCore::ElementA;
-    using ElementC = typename MmaCore::ElementC;
-
-    static constexpr int kPartitionsK = ThreadblockShape::kK / MmaCore::WarpShape::kK;
-
-    using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
-                                    typename WarpMma::Shape,
-                                    typename WarpMma::Policy::Operator::Shape,
-                                    typename WarpMma::Policy::Operator::ElementC,
-                                    typename WarpMma::Policy::Operator::FragmentC,
-                                    LayoutC>;
-    using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
-    static constexpr int kIterationsStore = AccumulatorFragmentIterator::kIterations;
-
-    // Maybe elementsPerAccess should vary: 4 for d=64, 2 for d=32?
-    using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
-        ThreadblockShape, typename WarpMma::Shape, kPartitionsK, Element, /*ElementsPerAccess=*/4>::Type;
-    using OutputTileThreadMapAccum = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
-        ThreadblockShape, typename WarpMma::Shape, kPartitionsK, ElementC, /*ElementsPerAccess=*/4>::Type;
-
-    using GmemIterator = fmha::EpiloguePredicatedTileIterator<
-        OutputTileThreadMap,
-        Element
-    >;
-    // which ThreadMap should we use?
-    using GmemIteratorAccum = fmha::EpiloguePredicatedTileIterator<
-        // OutputTileThreadMapAccum,
-        OutputTileThreadMap,
-        ElementC
-    >;
-
-
-    using DefaultIterators = cutlass::epilogue::threadblock::detail::DefaultIteratorsTensorOp<
-        Element, ElementC, /*ElementsPerAccess=*/4, ThreadblockShape, typename WarpMma::Shape,
-        typename WarpMma::Policy::Operator::Shape, typename OutputTileThreadMap::CompactedThreadMap>;
-    using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
-    static_assert(WarpTileIterator::kIterations == kIterationsStore, "");
-    using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
-    using OutputFragment = typename SharedLoadIterator::Fragment;
-
-    // using Padding = cutlass::MatrixShape<0, 0>;
-    using Padding = cutlass::MatrixShape<0, 64 / cutlass::sizeof_bits<ElementC>::value * 4>;
-    static constexpr int kFragmentsPerIteration = kIterationsStore;  // TODO: could be 1 for Volta?
-    /*Using kIterationsStore here so that we get the right storage size*/
-    using EpilogueBase = typename cutlass::epilogue::threadblock::EpilogueBase<
-        ThreadblockShape, typename WarpMma::Shape, kPartitionsK, AccumulatorFragmentIterator, WarpTileIterator,
-        Padding, kIterationsStore>;
-
-    using SharedStorage = typename EpilogueBase::SharedStorage;
-    static constexpr int kSmemTiles = EpilogueBase::kFragmentsPerIteration;
-    static constexpr int kSmemPointerOffset = SharedStorage::StorageShape::kCount / kSmemTiles;
-    static constexpr int kSmemPointerOffsetPerWarp = SharedStorage::StorageShape::kCount / (kSmemTiles * kPartitionsK);
-
-    SharedStorage *shared_storage;
-    WarpTileIterator warp_tile_iterator;
-
-    inline __device__ FMHAEpilogue(void *smem, const int tidx)
-        : shared_storage(reinterpret_cast<SharedStorage *>(smem))
-        , warp_tile_iterator(shared_storage->reference(), threadIdx.x % 32) {
-
-        // const int warp_idx = tidx / 32;
-        // Broadcast the warp_id computed by lane 0 to ensure dependent code
-        // is compiled as warp-uniform.
-        // https://github.com/NVIDIA/cutlass/blob/e66bfcb1f880792caa46b1e983c4114e23afa5f3/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h#L520
-        const int warp_idx = __shfl_sync(0xffffffff, tidx / 32, 0);
-
-        cutlass::MatrixCoord warp_offset{kIterationsStore * warp_idx, 0};
-
-        warp_tile_iterator.add_tile_offset(warp_offset);
-    }
-
-    // Store the accumulators.
-    inline __device__ void store(const AccumulatorTile &acc) {
-        AccumulatorFragmentIterator accum_fragment_iterator(acc);
-        CUTLASS_PRAGMA_UNROLL
-        for (int p = 0; p < kIterationsStore; ++p) {
-            typename AccumulatorFragmentIterator::Fragment accum_fragment;
-            accum_fragment_iterator.load(accum_fragment);
-            ++accum_fragment_iterator;
-
-            warp_tile_iterator.store(accum_fragment);
-            if (p < kIterationsStore - 1) {
-                warp_tile_iterator.add_pointer_offset(kSmemPointerOffsetPerWarp);
-            }
-        }
-        if (kIterationsStore > 1) {
-            warp_tile_iterator.add_pointer_offset((1 - kIterationsStore) * kSmemPointerOffsetPerWarp);
-        }
-    }
-
-    // Load the accumulators
-    template<bool zero_init=true>
-    inline __device__ void load(OutputFragment (&out)[kFragmentsPerIteration],
-                                const int tidx) {
-        SharedLoadIterator shared_load_iterator(shared_storage->reference(), tidx);
-        CUTLASS_PRAGMA_UNROLL
-        for (int p = 0; p < EpilogueBase::kFragmentsPerIteration; ++p) {
-            OutputFragment aligned_accum_fragment[kPartitionsK];
-            shared_load_iterator.load(aligned_accum_fragment[0]);
-            cutlass::plus<OutputFragment> add_fragments;
-            if (kPartitionsK > 1) {
-                CUTLASS_PRAGMA_UNROLL
-                for ( int i = 1; i < kPartitionsK; ++i) {
-                    shared_load_iterator.add_pointer_offset(kSmemPointerOffsetPerWarp * kIterationsStore);
-                    shared_load_iterator.load(aligned_accum_fragment[i]);
-                    aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
-                }
-                shared_load_iterator.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffsetPerWarp * kIterationsStore);
-            }
-            if (p < EpilogueBase::kFragmentsPerIteration - 1) {
-                shared_load_iterator.add_pointer_offset(kSmemPointerOffsetPerWarp);
-            }
-
-            out[p] = zero_init ? aligned_accum_fragment[0] : add_fragments(out[p], aligned_accum_fragment[0]);
-        }
-    }
-
-};
-
-}  // namespace fmha
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/epilogue_predicated_tile_iterator.h b/aten/src/ATen/native/transformers/cuda/flash_attn/epilogue_predicated_tile_iterator.h
deleted file mode 100644
index 170df703e7da..000000000000
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/epilogue_predicated_tile_iterator.h
+++ /dev/null
@@ -1,493 +0,0 @@
-// Adapted from cutlass/epilogue/threadblock/predicated_tile_iterator.h
-// We just want to add the move() function, but idk how to do it without
-// copying the code here.
-
-/******************************************************************************
- * Copyright (c) 2022, Tri Dao.
- * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-#pragma once
-
-#include <cutlass/cutlass.h>
-#include <cutlass/arch/arch.h>
-#include <cutlass/arch/memory.h>
-#include <cutlass/array.h>
-#include <cutlass/epilogue/threadblock/output_tile_thread_map.h>
-#include <cutlass/epilogue/threadblock/predicated_tile_iterator_params.h>
-#include <cutlass/layout/matrix.h>
-#include <cutlass/layout/tensor.h>
-#include <cutlass/matrix_shape.h>
-#include <cutlass/numeric_types.h>
-#include <cutlass/tensor_ref.h>
-#include <cutlass/transform/pitch_linear_thread_map.h>
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace fmha {
-
-////////////////////////////////////////////////////////////////////////////////
-
-using namespace cutlass;
-using namespace cutlass::epilogue::threadblock;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Tile iterator used to load and store output tile from global memory in epilogue.
-///
-/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
-///
-template <
-  typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
-  typename Element_,         ///< Element data type
-  bool ScatterD = false,     ///< Scatter D operand or not
-  bool UseCUDAStore = false
->
-class EpiloguePredicatedTileIterator {
-public:
-  using ThreadMap = ThreadMap_;
-  using Shape = typename ThreadMap::Shape;
-
-  using Element = Element_;
-
-  using Layout = layout::RowMajor;
-  using TensorRef = TensorRef<Element, Layout>;
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorCoord = MatrixCoord;
-
-  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
-  static int const kThreads = ThreadMap::kThreads;
-  static int const kIterations = ThreadMap::Count::kTile;
-
-  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
-  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
-  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
-  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
-
-  /// Fragment object
-  using Fragment = Array<
-    Element,
-    ThreadMap::Iterations::kColumn *
-    ThreadMap::Iterations::kRow *
-    ThreadMap::Iterations::kGroup *
-    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
-
-  /// Memory access size
-  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
-
-  //
-  // Parameters struct
-  //
-
-  /// Uses a non-template class
-  struct Params : PredicatedTileIteratorParams {
-    using Base = PredicatedTileIteratorParams;
-
-    CUTLASS_HOST_DEVICE
-    Params() { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Layout const &layout):
-      PredicatedTileIteratorParams(
-        layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
-        make_OutputTileThreadMapDesc<ThreadMap>()
-      )
-    { }
-
-    CUTLASS_HOST_DEVICE
-    Params(Base const &base) :
-      Base(base) { }
-  };
-
-  /// Mask object
-  struct Mask {
-
-    static int const kCount = ThreadMap::Iterations::kColumn;
-
-    /// Predicate state
-    bool predicates[kCount];
-
-    //
-    // Mask
-    //
-    CUTLASS_HOST_DEVICE
-    Mask() {
-      enable();
-    }
-
-    ///< Efficiently disables all accesses guarded by mask
-    CUTLASS_HOST_DEVICE void clear() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = false;
-      }
-    }
-
-    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
-    CUTLASS_DEVICE void enable() {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kCount; ++i) {
-        predicates[i] = true;
-      }
-    }
-  };
-
-private:
-
-  //
-  // Data members
-  //
-
-  /// Parameters structure containing reference and precomputed state.
-  PredicatedTileIteratorParams params_;
-
-  /// Byte-level pointer
-  uint8_t *byte_pointer_;
-
-  /// Array of boolean values to contain steady-state predicates
-  Mask mask_;
-
-  /// Extent of the matrix tile in rows
-  Index extent_row_;
-
-  /// Extent of the matrix tile in rows
-  Index extent_column_;
-
-  /// A thread's starting row position (assuming steady-state predicates have been computed)
-  Index thread_start_row_;
-
-  /// A thread's starting column
-  Index thread_start_column_;
-
-  /// Internal state counter
-  int state_[3];
-
-  /// Scatter indices
-  int const *indices_;
-
-  //
-  // Static asserts about internal strides
-  //
-
-  static_assert(sizeof(extent_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents");
-  static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides");
-
-private:
-
-  //
-  // Methods
-  //
-
-public:
-
-  //
-  // Methods
-  //
-
-  /// Constructor
-  CUTLASS_DEVICE
-  EpiloguePredicatedTileIterator(
-    PredicatedTileIteratorParams const & params,
-    Element *pointer,
-    TensorCoord extent,
-    int thread_idx,
-    TensorCoord threadblock_offset = TensorCoord(),
-    int const *indices = nullptr
-  ):
-    params_(params), indices_(indices)
-  {
-
-    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
-
-    extent_row_ = extent.row();
-    extent_column_ = extent.column();
-
-    thread_start_row_ = thread_offset.row();
-    thread_start_column_ = thread_offset.column();
-
-    // Initialize predicates
-    CUTLASS_PRAGMA_UNROLL
-    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
-
-      mask_.predicates[c] = ((thread_offset.column()
-        + ThreadMap::Delta::kColumn * c) < extent.column());
-    }
-
-    // Null pointer performs no accesses
-    if (!pointer) {
-      mask_.clear();
-    }
-
-    if (ScatterD && !indices) {
-      mask_.clear();
-    }
-
-    // Initialize pointer
-    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) +
-      LongIndex(thread_offset.row()) * LongIndex(params_.stride) +
-      LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess;
-
-    if (ScatterD) {
-      byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) +
-        LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess;
-    }
-
-    // Initialize internal state counter
-    state_[0] = state_[1] = state_[2] = 0;
-  }
-
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) const {
-
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx =
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow
-            + group * ThreadMap::Delta::kGroup
-            + cluster * ThreadMap::Delta::kCluster;
-
-          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
-
-          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-          if (ScatterD && row_guard) {
-            assert(indices_);
-
-            memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset +
-              LongIndex(indices_[row_offset + thread_start_row_]) * LongIndex(params_.stride));
-          }
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            bool guard = row_guard && mask_.predicates[column];
-
-            cutlass::arch::global_load<
-              AccessType,
-              sizeof(AccessType)
-            >(
-                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
-                         column],
-                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
-                                        kElementsPerAccess],
-                guard);
-          }
-
-          if (row + 1 < ThreadMap::Iterations::kRow) {
-            if (!ScatterD) {
-              byte_pointer += params_.increment_row;
-            }
-          }
-        }
-
-        if (group + 1 < ThreadMap::Iterations::kGroup) {
-          byte_pointer += params_.increment_group;
-        }
-      }
-
-      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
-        byte_pointer += params_.increment_cluster;
-      }
-    }
-  }
-
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment &frag) const {
-
-    load_with_byte_offset(frag, 0);
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) const {
-    uint8_t *byte_pointer = byte_pointer_;
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-
-    CUTLASS_PRAGMA_UNROLL
-    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
-
-        CUTLASS_PRAGMA_UNROLL
-        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
-
-          int frag_row_idx =
-            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
-
-          int row_offset = row * ThreadMap::Delta::kRow
-            + group * ThreadMap::Delta::kGroup
-            + cluster * ThreadMap::Delta::kCluster;
-
-          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
-
-          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
-
-          if (ScatterD && row_guard) {
-            assert(indices_);
-
-            memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset +
-              LongIndex(indices_[row_offset + thread_start_row_]) * LongIndex(params_.stride));
-          }
-
-          CUTLASS_PRAGMA_UNROLL
-          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
-
-            bool guard = row_guard && mask_.predicates[column];
-
-            if (UseCUDAStore) {
-              if (guard) {
-                memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess] =
-                    frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column];
-              }
-            } else {
-              cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
-                  frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column],
-                  (void *)&memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess],
-                  guard);
-            }
-          }
-
-          if (row + 1 < ThreadMap::Iterations::kRow) {
-            if (!ScatterD) {
-              byte_pointer += params_.increment_row;
-            }
-          }
-
-        }
-
-        if (group + 1 < ThreadMap::Iterations::kGroup) {
-          byte_pointer += params_.increment_group;
-        }
-      }
-
-      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
-        byte_pointer += params_.increment_cluster;
-      }
-    }
-  }
-
-  /// Stores a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const &frag) const {
-
-    store_with_byte_offset(frag, 0);
-  }
-
-  CUTLASS_DEVICE
-  MatrixCoord thread_start() const {
-    return MatrixCoord(thread_start_row_, thread_start_column_);
-  }
-
-  /// Need to get the thread start row from the tile iterator
-  CUTLASS_DEVICE
-  int32_t thread_start_row() const {
-    return thread_start_row_;
-  }
-
-  /// Need to get the thread start row from the tile iterator
-  CUTLASS_DEVICE
-  int32_t thread_start_column() const {
-    return thread_start_column_;
-  }
-
-  /// Extent of the matrix in rows
-  CUTLASS_DEVICE
-  Index extent_row() const {
-    return extent_row_;
-  }
-
-  /// Extent of the matrix in columns
-  CUTLASS_DEVICE
-  Index extent_column() const {
-    return extent_column_;
-  }
-
-  /// Advances to the next position to load or store
-  CUTLASS_HOST_DEVICE
-  void move(const int step=1) {
-
-    if (!ScatterD) {
-      byte_pointer_ += step * params_.advance_row;
-    }
-
-    thread_start_row_ += step * ThreadMap::Shape::kRow;
-  }
-
-  ///< Efficiently disables all accesses guarded by mask
-  CUTLASS_DEVICE void clear_mask() {
-    mask_.clear();
-  }
-
-  ///< Efficiently enables all accesses guarded by mask
-  CUTLASS_DEVICE void enable_mask() {
-    mask_.enable();
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void get_mask(Mask &mask) const {
-    mask = mask_;
-  }
-
-  ///< Sets the mask
-  CUTLASS_DEVICE void set_mask(Mask const &mask) {
-    mask_ = mask;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-
-} // namespace fmha
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha.h b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha.h
index 2bd17da72f7d..950d78ec27be 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha.h
@@ -30,8 +30,15 @@
 #include <cuda.h>
 #include <vector>
 
+#ifdef OLD_GENERATOR_PATH
+#include <ATen/CUDAGeneratorImpl.h>
+#else
 #include <ATen/cuda/CUDAGeneratorImpl.h>
+#endif
+
+#include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAGraphsUtils.cuh>
+
 #include <ATen/native/transformers/cuda/flash_attn/fmha_utils.h>
 
 
@@ -75,6 +82,8 @@ struct FMHA_fprop_params : public Qkv_params {
     // size_t o_stride_in_bytes;
     uint32_t o_row_stride_in_elts;
     uint32_t o_head_stride_in_elts;
+    uint32_t o_tmp_row_stride_in_elts;
+    uint32_t o_tmp_head_stride_in_elts;
 
     // The pointer to the O_tmp matrix, which holds O intermediate value during
     // the loop;
@@ -93,7 +102,8 @@ struct FMHA_fprop_params : public Qkv_params {
     int b, seqlen_q, seqlen_k, d;
 
     // The scaling factors for the kernel.
-    float scale_bmm1;
+    float scale_bmm1f;
+    uint32_t scale_bmm1;
 
     // array of length b+1 holding starting offset of each sequence.
     int * __restrict__ cu_seqlens_q;
@@ -110,11 +120,46 @@ struct FMHA_fprop_params : public Qkv_params {
     float rp_dropout;
     float scale_bmm1_rp_dropout;
 
+    // Scale factor of 1 / (1 - p_dropout), in half2.
+    uint32_t scale_dropout;
+
     // Random state.
     at::PhiloxCudaState philox_args;
 
     bool is_bf16;
     bool is_causal;
+
+    int num_splits; // How many SMs per attention matrix.
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct FMHA_dgrad_params : public FMHA_fprop_params {
+
+    // The dQKV matrices.
+    void *__restrict__ dq_ptr;
+    void *__restrict__ dk_ptr;
+    void *__restrict__ dv_ptr;
+
+    // // To accumulate dK and dV in case we're splitting the bwd along seqlen_q dimension
+    // void *__restrict__ dk_accum_ptr;
+    // void *__restrict__ dv_accum_ptr;
+
+    // The stride between rows of the dQ, dK and dV matrices.
+    // TD [2022-04-16]: We're using 32-bit indexing to save registers.
+    // The code probably won't work for arrays larger than 2GB.
+    uint32_t dq_row_stride_in_elts;
+    uint32_t dk_row_stride_in_elts;
+    uint32_t dv_row_stride_in_elts;
+    uint32_t dq_head_stride_in_elts;
+    uint32_t dk_head_stride_in_elts;
+    uint32_t dv_head_stride_in_elts;
+
+    // The dO matrix. We assume it is contiguous.
+    void * __restrict__ do_ptr;
+
+    // The pointer to the softmax d sum.
+    void * __restrict__ dsoftmax_sum;
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -151,4 +196,14 @@ struct Launch_params{
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
-TORCH_API void run_fmha_fprop(Launch_params<FMHA_fprop_params> &launch_params, const bool configure);
+void run_fmha_fwd_hdim32(Launch_params<FMHA_fprop_params> &launch_params);
+void run_fmha_fwd_hdim64(Launch_params<FMHA_fprop_params> &launch_params);
+void run_fmha_fwd_hdim128(Launch_params<FMHA_fprop_params> &launch_params);
+
+void run_fmha_bwd_hdim32(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure);
+void run_fmha_bwd_hdim64(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure);
+void run_fmha_bwd_hdim128(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure);
+
+void run_fmha_block_fp16_sm80(Launch_params<FMHA_fprop_params> &launch_params, const bool configure);
+
+void run_fmha_block_dgrad_fp16_sm80(const FMHA_dgrad_params &params, cudaStream_t stream);
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
index 7c317f4ed129..921c60f1d6e5 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
@@ -26,12 +26,14 @@
  *
  ******************************************************************************/
 
+#include <cstdint>
 #include <tuple>
 #ifdef USE_FLASH_ATTENTION
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <ATen/NativeFunctions.h>
+#include <ATen/cuda/CUDAGraphsUtils.cuh>
 
 #include <ATen/native/transformers/cuda/flash_attn/fmha.h>
 #include <ATen/native/transformers/cuda/flash_attn/fmha_api.h>
@@ -53,15 +55,18 @@ void set_params_fprop(FMHA_fprop_params &params,
                       const at::Tensor q,
                       const at::Tensor k,
                       const at::Tensor v,
+                       at::Tensor out,
                       void *cu_seqlens_q_d,
                       void *cu_seqlens_k_d,
-                      void *o_packed_d,
                       void *o_tmp_d,
                       void *s_d,
                       void *softmax_lse_d,
                       float p_dropout,
                       float softmax_scale,
-                      bool is_causal) {
+                      bool is_causal,
+                      int num_splits) {
+
+    Data_type data_type = !(q.dtype() == at::kBFloat16) ? DATA_TYPE_FP16 : DATA_TYPE_BF16;
 
     // Reset the parameters
     params = {};
@@ -78,17 +83,19 @@ void set_params_fprop(FMHA_fprop_params &params,
     params.q_head_stride_in_elts = q.stride(1);
     params.k_head_stride_in_elts = k.stride(1);
     params.v_head_stride_in_elts = v.stride(1);
-    params.o_ptr = o_packed_d;
-    params.o_row_stride_in_elts = h * d;
-    params.o_head_stride_in_elts = d;
+    params.o_ptr = out.data_ptr();
+    params.o_row_stride_in_elts = out.stride(0);
+    params.o_head_stride_in_elts = out.stride(1);
     params.o_tmp_ptr = o_tmp_d;
+    params.o_tmp_row_stride_in_elts = h * d;
+    params.o_tmp_head_stride_in_elts = d;
 
     params.cu_seqlens_q = static_cast<int *>(cu_seqlens_q_d);
     params.cu_seqlens_k = static_cast<int *>(cu_seqlens_k_d);
 
     // S = softmax(P)
     params.s_ptr = s_d;
-    params.s_stride_in_bytes = b * h * seqlen_k * 2;  // 2 = sizeof(Element)
+    params.s_stride_in_bytes = get_size_in_bytes(b * h * seqlen_k, data_type);
 
     // Softmax sum
     params.softmax_lse_ptr = softmax_lse_d;
@@ -101,7 +108,11 @@ void set_params_fprop(FMHA_fprop_params &params,
     params.d = d;
 
     // Set the different scale values.
-    params.scale_bmm1 = softmax_scale;
+    // const float scale_bmm1 = 1.f / sqrtf(d);
+    const float scale_bmm1 = softmax_scale;
+
+    params.scale_bmm1f = scale_bmm1;
+    set_alpha(params.scale_bmm1, scale_bmm1, data_type);
 
     // Set this to probability of keeping an element to simplify things.
     params.p_dropout = 1.f - p_dropout;
@@ -110,16 +121,86 @@ void set_params_fprop(FMHA_fprop_params &params,
     params.p_dropout_in_uint = uint32_t(std::floor(params.p_dropout * 4294967295.0));
     params.p_dropout_in_uint16_t = uint16_t(std::floor(params.p_dropout * 65535.0));
     params.rp_dropout = 1.f / params.p_dropout;
-    params.scale_bmm1_rp_dropout = params.rp_dropout * params.scale_bmm1;
+    params.scale_bmm1_rp_dropout = params.rp_dropout * params.scale_bmm1f;
     TORCH_CHECK(p_dropout < 1.f);
+    set_alpha(params.scale_dropout, params.rp_dropout, data_type);
 
     params.is_causal = is_causal;
+    params.num_splits = num_splits;
+}
+
+void set_params_dgrad(FMHA_dgrad_params &params,
+                      // sizes
+                      const size_t b,
+                      const size_t seqlen_q,
+                      const size_t seqlen_k,
+                      const size_t h,
+                      const size_t d,
+                      // device pointers
+                      const at::Tensor q,
+                      const at::Tensor k,
+                      const at::Tensor v,
+                      const at::Tensor out,
+                      at::Tensor dq,
+                      at::Tensor dk,
+                      at::Tensor dv,
+                      void *cu_seqlens_q_d,
+                      void *cu_seqlens_k_d,
+                      void *dq_tmp_d,
+                      void *do_packed_d,
+                      void *softmax_lse_d,
+                      void *dsoftmax_sum_d,
+                      float p_dropout,
+                      float softmax_scale,
+                      bool is_causal,
+                      int num_splits) {
+
+    set_params_fprop(params,
+                     b, seqlen_q, seqlen_k, h, d,
+                     q, k, v, out,
+                     cu_seqlens_q_d,
+                     cu_seqlens_k_d,
+                     dq_tmp_d,  // Reusing the o_tmp_ptr variable to store dq_tmp
+                     nullptr,
+                     softmax_lse_d,
+                     p_dropout,
+                     softmax_scale,
+                     is_causal,
+                     num_splits);
+
+    // Set the pointers and strides.
+    params.dq_ptr = dq.data_ptr();
+    params.dk_ptr = dk.data_ptr();
+    params.dv_ptr = dv.data_ptr();
+    params.dq_row_stride_in_elts = dq.stride(0);
+    params.dk_row_stride_in_elts = dk.stride(0);
+    params.dv_row_stride_in_elts = dv.stride(0);
+    params.dq_head_stride_in_elts = dq.stride(1);
+    params.dk_head_stride_in_elts = dk.stride(1);
+    params.dv_head_stride_in_elts = dv.stride(1);
+    params.do_ptr = do_packed_d;
+
+    // Softmax sum
+    params.dsoftmax_sum = dsoftmax_sum_d;
 }
 
-std::tuple<at::Tensor, at::Tensor, at::Tensor>
+void run_fmha_fwd(Launch_params<FMHA_fprop_params> &launch_params) {
+    if (launch_params.params.d <= 32) {
+        run_fmha_fwd_hdim32(launch_params);
+    } else if (launch_params.params.d <= 64) {
+        run_fmha_fwd_hdim64(launch_params);
+    } else if (launch_params.params.d <= 128) {
+        run_fmha_fwd_hdim128(launch_params);
+    }
+}
+// The tensor `out` will get populated the output attention
+// First return value is softmax_logsumexp
+// Second return value is the random generator state
+std::tuple<at::Tensor, uint64_t, uint64_t, at::Tensor>
 mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
         const at::Tensor &k,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
         const at::Tensor &v,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
+        at::Tensor &out,
         const at::Tensor &cu_seqlens_q,  // b+1
         const at::Tensor &cu_seqlens_k,  // b+1
         const int max_seqlen_q_,
@@ -129,11 +210,9 @@ mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q
         const bool zero_tensors,
         const bool is_causal,
         const bool return_softmax,
-        c10::optional<at::Generator> gen_) {
-
+        const int num_splits) {
     auto dprops = at::cuda::getCurrentDeviceProperties();
     bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
-    bool is_sm80 = dprops->major == 8 && dprops->minor == 0;
     bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
     TORCH_CHECK(is_sm8x || is_sm75);
     auto stream = at::cuda::getCurrentCUDAStream().stream();
@@ -144,12 +223,14 @@ mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q
     TORCH_CHECK(q_dtype == at::kHalf || (is_sm8x && q_dtype == at::kBFloat16));
     TORCH_CHECK(k.dtype() == q_dtype);
     TORCH_CHECK(v.dtype() == q_dtype);
+    TORCH_CHECK(out.dtype() == q_dtype);
     TORCH_CHECK(cu_seqlens_q.dtype() == at::kInt);
     TORCH_CHECK(cu_seqlens_k.dtype() == at::kInt);
 
     TORCH_CHECK(q.is_cuda());
     TORCH_CHECK(k.is_cuda());
     TORCH_CHECK(v.is_cuda());
+    TORCH_CHECK(out.is_cuda());
     TORCH_CHECK(cu_seqlens_q.is_cuda());
     TORCH_CHECK(cu_seqlens_k.is_cuda());
 
@@ -168,15 +249,15 @@ mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q
     const int total_k = k.size(TOTAL_DIM);
     TORCH_CHECK(batch_size > 0);
     TORCH_CHECK((head_size % 8 == 0) && (head_size <= 128));
-    const int head_size_rounded = head_size <= 64 ? 64 : 128;
 
     CHECK_SHAPE(q, total_q, num_heads, head_size);
     CHECK_SHAPE(k, total_k, num_heads, head_size);
     CHECK_SHAPE(v, total_k, num_heads, head_size);
+    CHECK_SHAPE(out, total_q, num_heads, head_size);
     CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
     CHECK_SHAPE(cu_seqlens_k, batch_size + 1);
 
-    int blocksize_c = ((head_size_rounded == 128 && (is_dropout || !is_sm80)) || (is_sm75 && head_size_rounded == 64 && is_dropout)) ? 128 : 256;
+    int blocksize_c = head_size > 64 ? 128 : 256;
     // Need to round max_seqlen_k to multiples of blocksize_c
     int max_seqlen_k = ((max_seqlen_k_ + blocksize_c - 1) / blocksize_c) * blocksize_c;
     if( max_seqlen_k_ <= 128 ) {
@@ -192,58 +273,239 @@ mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q
 
     auto opts = q.options();
 
-    auto o = at::empty({ total_q, num_heads, head_size }, opts);
+    // auto o = torch::empty({ total_q, num_heads, head_size }, opts);
 
     at::Tensor o_tmp;
     if (loop) { o_tmp = at::empty({total_q, num_heads, head_size}, opts.dtype(at::kFloat)); }
 
     auto softmax_lse = at::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
+    // auto softmax_lse = torch::full({batch_size, num_heads, max_seqlen_k}, -std::numeric_limits<float>::infinity(), opts.dtype(at::kFloat));
 
-    //  It appears that FlashAttention can return attention weights, but we don't use them. Since we are currently
-    //  filtering this out in the dispatch mechanism. Investigate this ouput against the math impl.
-    at::Tensor s = at::empty({0}, opts);
-    if (return_softmax) { s = at::empty({ batch_size, num_heads, max_seqlen_q, max_seqlen_k }, opts); }
+    at::Tensor flash_softmax;
+    if (return_softmax) {flash_softmax = at::empty({ batch_size, num_heads, max_seqlen_q, max_seqlen_k }, opts); }
 
     if( zero_tensors ) {
-        o.zero_();
+        out.zero_();
         softmax_lse.fill_(-std::numeric_limits<float>::infinity());
-        if (return_softmax) {s.zero_();}
+        if (return_softmax) {flash_softmax.zero_();}
     }
 
-    auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
-        gen_, at::cuda::detail::getDefaultCUDAGenerator());
-
     set_params_fprop(launch_params.params,
                      batch_size,
                      max_seqlen_q,
                      max_seqlen_k,
                      num_heads,
                      head_size,
-                     q, k, v,
+                     q, k, v, out,
                      cu_seqlens_q.data_ptr(),
                      cu_seqlens_k.data_ptr(),
-                     o.data_ptr(),
                      loop ? o_tmp.data_ptr() : nullptr,
-                     return_softmax ? s.data_ptr() : nullptr,
+                     return_softmax ? flash_softmax.data_ptr() : nullptr,
                      softmax_lse.data_ptr(),
                      p_dropout,
                      softmax_scale,
-                     is_causal);
+                     is_causal,
+                     num_splits);
 
-    run_fmha_fprop(launch_params, /*configure=*/ true);
     // number of times random will be generated per thread, to offset philox counter in thc random
     // state
-    int64_t counter_offset = launch_params.elts_per_thread;
-
+    // We use a custom RNG that increases the offset by batch_size * nheads * 32.
+    int64_t counter_offset = launch_params.params.b * launch_params.params.h * 32;
+
+    // We want to checkpoint and save the RNG state for backward if dropout
+    // We get the default generator and return the seed and offset which will
+    // be used in the backward function
+    auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(c10::nullopt, at::cuda::detail::getDefaultCUDAGenerator());
+    uint64_t seed{0}, offset{0};
     if( is_dropout ) {
+        TORCH_CHECK(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None,
+        "scaled_dot_product_flash_attention does not support dropout with cuda graph capture mode enabled");
         // See Note [Acquire lock when using random generators]
         std::lock_guard<std::mutex> lock(gen->mutex_);
-        launch_params.params.philox_args = gen->philox_cuda_state(counter_offset);
+        // generator_state = at::Tensor::wrap_tensor_impl(gen -> get_state());
+        at::PhiloxCudaState philox_state = gen->philox_cuda_state(counter_offset);
+        std::tie(seed, offset) = at::cuda::philox::unpack(philox_state);
+        launch_params.params.philox_args = philox_state;
+    }
+
+    run_fmha_fwd(launch_params);
+
+    return {softmax_lse, seed, offset, flash_softmax};
+}
+
+void run_fmha_bwd(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
+  if (params.d <= 32) {
+      run_fmha_bwd_hdim32(params, stream, configure);
+  } else if (params.d <= 64) {
+      run_fmha_bwd_hdim64(params, stream, configure);
+  } else if (params.d <= 128) {
+      run_fmha_bwd_hdim128(params, stream, configure);
+  }
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+mha_bwd(const at::Tensor &dout,  // total_q x num_heads, x head_size
+        const at::Tensor &q,   // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+        const at::Tensor &k,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
+        const at::Tensor &v,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
+        const at::Tensor &out,   // total_q x num_heads x head_size
+        const at::Tensor &softmax_lse_,     // b x h x s softmax logsumexp
+        at::Tensor &dq,   // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+        at::Tensor &dk,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
+        at::Tensor &dv,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
+        const at::Tensor &cu_seqlens_q,  // b+1
+        const at::Tensor &cu_seqlens_k,  // b+1
+        const int max_seqlen_q_,
+        const int max_seqlen_k_,          // max sequence length to choose the kernel
+        const float p_dropout,         // probability to drop
+        const float softmax_scale,
+        const bool zero_tensors,
+        const bool is_causal,
+        const int num_splits,
+        const uint64_t philox_seed,
+        const uint64_t philox_offset
+) {
+    auto dprops = at::cuda::getCurrentDeviceProperties();
+    bool is_sm75 = dprops->major == 7 && dprops->minor == 5;
+    bool is_sm80 = dprops->major == 8 && dprops->minor == 0;
+    bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
+    TORCH_CHECK(is_sm8x || is_sm75);
+    auto launch = &run_fmha_bwd;
+
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+
+    auto q_dtype = q.dtype();
+
+    TORCH_CHECK(q_dtype == at::kHalf || (is_sm8x && q_dtype == at::kBFloat16));
+    TORCH_CHECK(k.dtype() == q_dtype);
+    TORCH_CHECK(v.dtype() == q_dtype);
+    TORCH_CHECK(out.dtype() == q_dtype);
+    TORCH_CHECK(dout.dtype() == q_dtype);
+    TORCH_CHECK(dq.dtype() == q_dtype);
+    TORCH_CHECK(dk.dtype() == q_dtype);
+    TORCH_CHECK(dv.dtype() == q_dtype);
+    TORCH_CHECK(cu_seqlens_q.dtype() == at::kInt);
+    TORCH_CHECK(cu_seqlens_k.dtype() == at::kInt);
+
+    TORCH_CHECK(q.is_cuda());
+    TORCH_CHECK(k.is_cuda());
+    TORCH_CHECK(v.is_cuda());
+    TORCH_CHECK(out.is_cuda());
+    TORCH_CHECK(dout.is_cuda());
+    TORCH_CHECK(softmax_lse_.is_cuda());
+    TORCH_CHECK(cu_seqlens_q.is_cuda());
+    TORCH_CHECK(cu_seqlens_k.is_cuda());
+
+    TORCH_CHECK(q.stride(-1) == 1);
+    TORCH_CHECK(k.stride(-1) == 1);
+    TORCH_CHECK(v.stride(-1) == 1);
+    TORCH_CHECK(out.is_contiguous());
+    TORCH_CHECK(dout.is_contiguous());
+    TORCH_CHECK(dq.stride(-1) == 1);
+    TORCH_CHECK(dk.stride(-1) == 1);
+    TORCH_CHECK(dv.stride(-1) == 1);
+    TORCH_CHECK(cu_seqlens_q.is_contiguous());
+    TORCH_CHECK(cu_seqlens_k.is_contiguous());
+
+    const auto sizes = q.sizes();
+
+    const int batch_size = cu_seqlens_q.numel() - 1;
+    const int total_q = sizes[TOTAL_DIM];
+    const int num_heads = sizes[H_DIM];
+    const int head_size = sizes[D_DIM];
+    const int total_k = k.size(TOTAL_DIM);
+    TORCH_CHECK(batch_size > 0);
+    TORCH_CHECK((head_size % 8 == 0) && (head_size <= 128));
+    if (head_size > 64) {  // TODO: eventually we should support SM86 and SM70 with d=128 as well
+        TORCH_CHECK(is_sm80);
+    }
+
+    CHECK_SHAPE(q, total_q, num_heads, head_size);
+    CHECK_SHAPE(k, total_k, num_heads, head_size);
+    CHECK_SHAPE(v, total_k, num_heads, head_size);
+    CHECK_SHAPE(out, total_q, num_heads, head_size);
+    CHECK_SHAPE(dout, total_q, num_heads, head_size);
+    CHECK_SHAPE(dq, total_q, num_heads, head_size);
+    CHECK_SHAPE(dk, total_k, num_heads, head_size);
+    CHECK_SHAPE(dv, total_k, num_heads, head_size);
+    CHECK_SHAPE(cu_seqlens_q, batch_size + 1);
+    CHECK_SHAPE(cu_seqlens_k, batch_size + 1);
+
+    int blocksize_c = (head_size > 64 || (is_sm75 && head_size > 32)) ? 128 : 256;
+    int max_seqlen_k = ((max_seqlen_k_ + blocksize_c - 1) / blocksize_c) * blocksize_c;
+    if( max_seqlen_k_ <= 128 ) {
+        max_seqlen_k = 128;
+    } else if( max_seqlen_k_ <= 256 ) {
+        max_seqlen_k = 256;
+    }
+    int max_seqlen_q = ((max_seqlen_q_ + 16 - 1) / 16) * 16;
+    bool loop = max_seqlen_k > blocksize_c;
+
+    // Otherwise the kernel will be launched from cuda:0 device
+    // Cast to char to avoid compiler warning about narrowing
+    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+
+    // It's possible the softmax_lse_ from the fwd has a different length since blocksize_c could be different.
+    auto softmax_lse = softmax_lse_.index({at::indexing::Slice(), at::indexing::Slice(), at::indexing::Slice(at::indexing::None, max_seqlen_q)}).contiguous();
+
+    auto opts = q.options();
+    auto softmax_d = at::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
+    at::Tensor dq_tmp;
+    if (loop) { dq_tmp = at::empty({total_q, num_heads, head_size}, opts.dtype(at::kFloat)); }
+
+    if( zero_tensors ) {
+        dq.zero_();
+        dk.zero_();
+        dv.zero_();
+        softmax_d.zero_();
+    }
+
+    FMHA_dgrad_params params;
+
+    set_params_dgrad(params,
+                     batch_size,
+                     max_seqlen_q,
+                     max_seqlen_k,
+                     num_heads,
+                     head_size,
+                     q, k, v, out,
+                     dq, dk, dv,
+                     cu_seqlens_q.data_ptr(),
+                     cu_seqlens_k.data_ptr(),
+                     loop ? dq_tmp.data_ptr() : nullptr,
+                     dout.data_ptr(),
+                     softmax_lse.data_ptr(),
+                     softmax_d.data_ptr(),
+                     p_dropout,
+                     softmax_scale,
+                     is_causal,
+                     num_splits);
+
+    launch(params, stream, /*configure=*/true);
+
+    if (params.num_splits > 1) {
+        if (!dq_tmp.defined()) {
+            dq_tmp = at::zeros({total_q, num_heads, head_size}, opts.dtype(at::kFloat));
+            params.o_tmp_ptr = dq_tmp.data_ptr();  // o_tmp stores dq_tmp in the backward pass
+        } else {
+            dq_tmp.zero_();
+        }
     }
+    bool is_dropout = p_dropout > 0.0;
+    TORCH_CHECK(
+        !is_dropout || at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None,
+        "scaled_dot_product_flash_attention does not support dropout with cuda graph capture mode enabled");
+    at::PhiloxCudaState philox_args{philox_seed, philox_offset};
+    params.philox_args = philox_args;
 
-    run_fmha_fprop(launch_params, /*configure=*/false);
+    launch(params, stream, /*configure=*/false);
 
-    return std::make_tuple(o, softmax_lse, s);
+    if (params.num_splits > 1) {
+        dq.copy_(dq_tmp);
+    }
+
+    return std::make_tuple(dq, dk, dv, softmax_d);
 }
 } // namespace fmha
+
 #endif
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h
index b0555463be04..682bde362c66 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h
@@ -7,10 +7,11 @@
 namespace fmha {
 
 TORCH_API
-std::tuple<at::Tensor, at::Tensor, at::Tensor>
+std::tuple<at::Tensor, uint64_t, uint64_t, at::Tensor>
 mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
         const at::Tensor &k,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
         const at::Tensor &v,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
+        at::Tensor &out,
         const at::Tensor &cu_seqlens_q,  // b+1
         const at::Tensor &cu_seqlens_k,  // b+1
         const int max_seqlen_q_,
@@ -20,6 +21,30 @@ mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q
         const bool zero_tensors,
         const bool is_causal,
         const bool return_softmax,
-        c10::optional<at::Generator> gen_);
+        const int num_splits);
+
+TORCH_API
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+mha_bwd(const at::Tensor &dout,  // total_q x num_heads, x head_size
+        const at::Tensor &q,   // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+        const at::Tensor &k,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
+        const at::Tensor &v,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
+        const at::Tensor &out,   // total_q x num_heads x head_size
+        const at::Tensor &softmax_lse_,     // b x h x s softmax logsumexp
+        at::Tensor &dq,   // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+        at::Tensor &dk,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
+        at::Tensor &dv,   // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
+        const at::Tensor &cu_seqlens_q,  // b+1
+        const at::Tensor &cu_seqlens_k,  // b+1
+        const int max_seqlen_q_,
+        const int max_seqlen_k_,          // max sequence length to choose the kernel
+        const float p_dropout,         // probability to drop
+        const float softmax_scale,
+        const bool zero_tensors,
+        const bool is_causal,
+        const int num_splits,
+        const uint64_t philox_seed,
+        const uint64_t philox_offset
+);
 
 } // namespace fmha
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_bwd_hdim128.cu b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_bwd_hdim128.cu
new file mode 100644
index 000000000000..e9c01abe4a86
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_bwd_hdim128.cu
@@ -0,0 +1,12 @@
+// Copyright (c) 2022, Tri Dao.
+
+// Splitting the different head dimensions to different files to speed up compilation.
+
+#include <ATen/native/transformers/cuda/flash_attn/fmha_bwd_launch_template.h>
+
+void run_fmha_bwd_hdim128(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
+    FP16_SWITCH(params.is_bf16, ([&] {
+        using Kernel_traits = FMHA_kernel_traits<128, 128, 16, 1, 8, 0x100u, elem_type>;
+        run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
+    }));
+}
\ No newline at end of file
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_bwd_hdim32.cu b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_bwd_hdim32.cu
new file mode 100644
index 000000000000..6c76426e17f0
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_bwd_hdim32.cu
@@ -0,0 +1,17 @@
+// Copyright (c) 2022, Tri Dao.
+
+// Splitting the different head dimensions to different files to speed up compilation.
+
+#include <ATen/native/transformers/cuda/flash_attn/fmha_bwd_launch_template.h>
+
+void run_fmha_bwd_hdim32(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
+    FP16_SWITCH(params.is_bf16, ([&] {
+        if (params.seqlen_k == 128) {
+            using Kernel_traits = FMHA_kernel_traits<128, 32, 16, 1, 8, 0x08u, elem_type>;
+            run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
+        } else if (params.seqlen_k >= 256) {
+            using Kernel_traits = FMHA_kernel_traits<256, 32, 16, 1, 8, 0x08u, elem_type>;
+            run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
+        }
+    }));
+}
\ No newline at end of file
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_bwd_hdim64.cu b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_bwd_hdim64.cu
new file mode 100644
index 000000000000..01513d42f80e
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_bwd_hdim64.cu
@@ -0,0 +1,30 @@
+// Copyright (c) 2022, Tri Dao.
+
+// Splitting the different head dimensions to different files to speed up compilation.
+
+#include <ATen/native/transformers/cuda/flash_attn/fmha_bwd_launch_template.h>
+
+void run_fmha_bwd_hdim64(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
+    FP16_SWITCH(params.is_bf16, ([&] {
+        auto dprops = at::cuda::getCurrentDeviceProperties();
+        if (params.seqlen_k == 128) {
+            using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 8, 0x08u, elem_type>;
+            run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
+        } else if (params.seqlen_k >= 256) {
+            if (dprops->major == 8 && dprops->minor == 0) {
+                // Don't share smem for K & V, and don't keep V in registers
+                // This speeds things up by 2-3% by avoiding register spills, but it
+                // uses more shared memory, which is fine on A100 but not other GPUs.
+                // For other GPUs, we keep V in registers.
+                using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 8, 0x100u, elem_type>;
+                run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
+            } else if (dprops->major == 8 && dprops->minor > 0) {
+                using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 8, 0x08u, elem_type>;
+                run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
+            } else if (dprops->major == 7 && dprops->minor == 5) {
+                using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 8, 0x08u, elem_type>;
+                run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
+            }
+        }
+    }));
+}
\ No newline at end of file
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_bwd_launch_template.h b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_bwd_launch_template.h
new file mode 100644
index 000000000000..f2730b67c8f7
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_bwd_launch_template.h
@@ -0,0 +1,116 @@
+// Copyright (c) 2022, Tri Dao.
+
+#pragma once
+
+#include <ATen/cuda/CUDAContext.h>
+
+#include <ATen/native/transformers/cuda/flash_attn/static_switch.h>
+#include <ATen/native/transformers/cuda/flash_attn/fmha.h>
+#include <ATen/native/transformers/cuda/flash_attn/fmha_dgrad_kernel_1xN_loop.h>
+
+// Pick whether we should parallelize across seqlen_k (num_splits > 1) or not (num_splits=1).
+// Parallelizing will have better occupancy, but has some overhead due to having to zero out
+// dq_tmp and having to copy dq_tmp to dq.
+inline int num_splits_heuristic_bwd(int batch_nheads, int num_SMs, int ctas_per_sm, int seqlen,
+                             int blocksize, bool is_causal) {
+    float n_waves_1 = float(batch_nheads) / (num_SMs * ctas_per_sm);
+    float eff_1 = n_waves_1 / ceil(n_waves_1);
+    int num_splits_parallel = seqlen / blocksize;
+    float n_waves_parallel = float(batch_nheads * num_splits_parallel) / (num_SMs * ctas_per_sm);
+    float eff_parallel_raw = n_waves_parallel / ceil(n_waves_parallel);
+    float discount_factor;
+    if (!is_causal) {
+        discount_factor = 1.f + float(blocksize) / seqlen;
+    } else {  // For causal, parallelizing seems to help with load-balancing as well
+        // For example, if headdim=128, seqlen >= 1280 always prefers parallel
+        if (seqlen / blocksize >= 10) return num_splits_parallel;
+        discount_factor = 1.f + 0.5 * float(blocksize) / seqlen;
+    }
+    float eff_parallel = eff_parallel_raw / discount_factor;
+    return eff_1 >= eff_parallel ? 1 : num_splits_parallel;
+}
+
+template<typename Kernel_traits>
+__global__ void fmha_bwd_dot_do_o_kernel(FMHA_dgrad_params params) {
+    fmha::compute_dot_do_o<Kernel_traits>(params);
+}
+
+template<typename Kernel_traits, bool Is_dropout, bool Is_causal, int loop_steps=-1>
+__global__ void fmha_bwd_dq_dk_dv_loop_kernel(FMHA_dgrad_params params) {
+    fmha::compute_dq_dk_dv_1xN<Kernel_traits, Is_dropout, Is_causal, loop_steps>(params);
+}
+
+template<typename Kernel_traits, bool Is_dropout, bool Is_causal>
+__global__ void fmha_bwd_q_dk_dv_loop_seqparallel_kernel(FMHA_dgrad_params params) {
+    fmha::compute_dq_dk_dv_seqparallel<Kernel_traits, Is_dropout, Is_causal>(params);
+}
+
+template<typename Kernel_traits>
+void run_fmha_bwd_loop(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
+    constexpr int smem_size_softmax = Kernel_traits::Cta_tile_p::M * Kernel_traits::Cta_tile_p::WARPS_N * sizeof(float);
+    constexpr int smem_size_q = Kernel_traits::Smem_tile_q::BYTES_PER_TILE;
+    constexpr int smem_size_v = Kernel_traits::Smem_tile_v::BYTES_PER_TILE;
+    constexpr int smem_size_dq = Kernel_traits::Smem_tile_o::BYTES_PER_TILE;
+
+    using Smem_tile_s = fmha::Smem_tile_mma_transposed<typename Kernel_traits::Cta_tile_p>;
+    constexpr int smem_size_s = Smem_tile_s::BYTES_PER_TILE;
+    static_assert(smem_size_s == 16 * Kernel_traits::Cta_tile_p::N * 2);
+    static_assert(smem_size_dq == 16 * Kernel_traits::Cta_tile_p::K * 4 * Kernel_traits::Cta_tile_p::WARPS_N);
+
+    constexpr int smem_size_dq_dk_dv = smem_size_q * 2 + smem_size_v * (Kernel_traits::V_IN_REGS ? 1 : 2) + smem_size_dq + smem_size_s * 2;
+    constexpr int blocksize_c = Kernel_traits::Cta_tile_p::N;
+    // printf("blocksize_c = %d, WARPS_N = %d, Smem size = %d\n", blocksize_c, Kernel_traits::Cta_tile_p::WARPS_N, smem_size_dq_dk_dv);
+
+    bool is_dropout = params.p_dropout < 1.f;  // params.p_dropout is the probability of "keeping"
+    // Work-around for gcc 7. It doesn't like nested BOOL_SWITCH.
+    BOOL_SWITCH(is_dropout, IsDropoutConst, ([&] {
+        auto kernel = params.is_causal
+            ? &fmha_bwd_dq_dk_dv_loop_kernel<Kernel_traits, IsDropoutConst, true>
+            : &fmha_bwd_dq_dk_dv_loop_kernel<Kernel_traits, IsDropoutConst, false>;
+        if (params.seqlen_k == blocksize_c) {
+            kernel = params.is_causal
+                ? &fmha_bwd_dq_dk_dv_loop_kernel<Kernel_traits, IsDropoutConst, true, /*loop_steps=*/1>
+                : &fmha_bwd_dq_dk_dv_loop_kernel<Kernel_traits, IsDropoutConst, false, /*loop_steps=*/1>;
+        } else if (params.seqlen_k == blocksize_c * 2) {
+            kernel = params.is_causal
+                ? &fmha_bwd_dq_dk_dv_loop_kernel<Kernel_traits, IsDropoutConst, true, /*loop_steps=*/2>
+                : &fmha_bwd_dq_dk_dv_loop_kernel<Kernel_traits, IsDropoutConst, false, /*loop_steps=*/2>;
+        }
+        auto kernel_seqparallel = params.is_causal
+            ? &fmha_bwd_q_dk_dv_loop_seqparallel_kernel<Kernel_traits, IsDropoutConst, true>
+            : &fmha_bwd_q_dk_dv_loop_seqparallel_kernel<Kernel_traits, IsDropoutConst, false>;
+        if( smem_size_dq_dk_dv >= 48 * 1024 ) {
+            FMHA_CHECK_CUDA(cudaFuncSetAttribute(
+                kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size_dq_dk_dv));
+            FMHA_CHECK_CUDA(cudaFuncSetAttribute(
+                kernel_seqparallel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size_dq_dk_dv));
+        }
+        // Automatically set num_splits to maximize occupancy
+        if (params.num_splits <= 0) {
+            int ctas_per_sm;
+            cudaError status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+                &ctas_per_sm, kernel, Kernel_traits::THREADS, smem_size_dq_dk_dv);
+            auto dprops = at::cuda::getCurrentDeviceProperties();
+            // printf("CTAS_PER_SM = %d, nSMs = %d\n", ctas_per_sm, dprops->multiProcessorCount);
+            constexpr int M = Kernel_traits::Cta_tile_p::M;
+            // We don't want more than 10 splits due to numerical error.
+            // Numerical error on dk/dv scales as sqrt(num_splits).
+            params.num_splits = num_splits_heuristic_bwd(
+                params.b * params.h, dprops->multiProcessorCount,
+                ctas_per_sm, params.seqlen_k, blocksize_c, params.is_causal
+            );
+        }
+        if (configure) return;
+        if (params.num_splits == 1) {
+            dim3 grid(params.b, params.h, params.num_splits);
+            kernel<<<grid, Kernel_traits::THREADS, smem_size_dq_dk_dv, stream>>>(params);
+        } else {
+            dim3 grid_dot(params.b, params.h, (params.seqlen_q + 128 - 1) / 128);
+            fmha_bwd_dot_do_o_kernel<Kernel_traits><<<grid_dot, Kernel_traits::THREADS, 0, stream>>>(params);
+            int num_splits = params.seqlen_k / blocksize_c;  // seqlen_k is divisible by blocksize_c
+            dim3 grid(params.b, params.h, num_splits);
+            kernel_seqparallel<<<grid, Kernel_traits::THREADS, smem_size_dq_dk_dv, stream>>>(params);
+        }
+        FMHA_CHECK_CUDA(cudaPeekAtLastError());
+    }));
+}
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_dgrad_kernel_1xN_loop.h b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_dgrad_kernel_1xN_loop.h
new file mode 100644
index 000000000000..ecc443bb830a
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_dgrad_kernel_1xN_loop.h
@@ -0,0 +1,839 @@
+/* Copyright (c) 2022, Tri Dao.
+ */
+
+#pragma once
+
+#include <ATen/cuda/CUDAContext.h>
+
+#include <ATen/native/transformers/cuda/flash_attn/fmha_fprop_kernel_1xN.h>
+#include <ATen/native/transformers/cuda/flash_attn/fmha_kernel.h>
+#include <ATen/native/transformers/cuda/flash_attn/kernel_traits.h>
+#include <ATen/native/transformers/cuda/flash_attn/gemm.h>
+
+namespace fmha {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <int ROWS, int THREADS_PER_ROW, typename elem_type=__half, int M, typename Gmem_softmax_sum>
+inline __device__ void dot_do_o(const uint4 (&do_)[M], const uint4 (&o)[M], const float scale,
+                                Gmem_softmax_sum gmem_softmax_d, int tidx) {
+    float sum[M];
+    fmha::SumOp<float> sum_op;
+    #pragma unroll
+    for (int mi = 0; mi < M; ++mi) {
+        sum[mi] = fmha::Allreduce<THREADS_PER_ROW>::run(
+            fmha::hmulsum8<elem_type>(do_[mi], o[mi]), sum_op
+        ) * scale;
+    }
+    const int dp_sum_row = tidx / THREADS_PER_ROW;
+    if ((dp_sum_row < ROWS) && (tidx % THREADS_PER_ROW == 0)) {
+        gmem_softmax_d.store_row(reinterpret_cast<const uint32_t (&)[M]>(sum), dp_sum_row);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Just compute dot(do, o) and write the result (softmax_d) to global memory as a separate kernel.
+// This is used in the case where we want to parallelize the backward across seqlen_k.
+template<typename Kernel_traits, typename Params>
+inline __device__ void compute_dot_do_o(const Params &params) {
+
+#if defined(__CUDA_ARCH__) &&  __CUDA_ARCH__ >= 800
+    using elem_type = typename Kernel_traits::elem_type;
+#else
+    constexpr bool is_fp16_type = std::is_same<typename Kernel_traits::elem_type, __half>::value;
+    assert(is_fp16_type);
+    using elem_type = __half;
+#endif
+
+    // The description of the CTA tile for the 1st batched GEMM.
+    using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
+    // The description of the CTA tile for the 3rd batched GEMM.
+    using Cta_tile_dkv =
+        fmha::Cta_tile_extd<Cta_tile_p::N, Cta_tile_p::K, Cta_tile_p::M, Cta_tile_p::WARPS_N, 1, Cta_tile_p::WARPS_M>;
+
+    static_assert(Cta_tile_dkv::N == 16 || Cta_tile_dkv::N == 32 || Cta_tile_dkv::N == 64 || Cta_tile_dkv::N == 128);
+    static_assert(Cta_tile_dkv::K == 16);
+
+    // The global memory tile to load dO.
+    using Gmem_tile_do = typename Kernel_traits::Gmem_tile_do;
+
+    // The global memory tile to load O.Loading O here is similar to loading dO.
+    using Gmem_tile_o = Gmem_tile_do;
+
+    using Gmem_softmax_sum = typename Kernel_traits::Gmem_softmax_sum;
+
+    // The block index for the batch.
+    const int bidb = blockIdx.x;
+    // The block index for the head.
+    const int bidh = blockIdx.y;
+    // The thread index.
+    const int tidx = threadIdx.x;
+
+    // How many steps to jump per iteration.
+    const int step_stride = gridDim.z;
+
+    const BlockInfoPadded<Kernel_traits::THREADS> binfo(params, bidb, bidh, tidx);
+    if( binfo.stop_early() ) return;
+
+    // Allocate the global memory tile loader for dO.
+    Gmem_tile_do gmem_do(params.do_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
+                         params.d, binfo, tidx, true);
+
+    // Allocate the global memory tile loader for O.
+    Gmem_tile_o gmem_o(params.o_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
+                       params.d, binfo, tidx, true);
+
+    Gmem_softmax_sum gmem_softmax_d(params.dsoftmax_sum, params, tidx);
+
+    static_assert(Cta_tile_p::N % Cta_tile_p::M == 0);
+    const int steps = (params.seqlen_q + Cta_tile_p::M - 1) / Cta_tile_p::M;
+    // Wind gmem tiles to the correct position.
+    gmem_do.move(blockIdx.z);
+    gmem_o.move(blockIdx.z);
+    gmem_softmax_d.move(blockIdx.z);
+
+    // Load over the entire sequence length.
+    for (int l = blockIdx.z; l < steps; l += step_stride) {
+        if (l * Cta_tile_p::M  >= binfo.actual_seqlen_q)
+            break;
+
+        gmem_do.load();
+        gmem_do.move(step_stride);
+        gmem_o.load();
+        gmem_o.move(step_stride);
+
+        dot_do_o<Gmem_tile_do::ROWS, Gmem_tile_do::THREADS_PER_ROW, elem_type>(
+            gmem_do.fetch_, gmem_o.fetch_, params.p_dropout, gmem_softmax_d, tidx
+        );
+        gmem_softmax_d.move(step_stride);
+    }  // Outer loop over the sequence length.
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Is_first, bool Is_last, bool Seq_parallel=false, typename Params, typename Prng>
+inline __device__ void compute_dq_dk_dv_1xN_one_iter(const Params &params, Prng &ph,
+                                                     const int loop_step_idx) {
+
+#if defined(__CUDA_ARCH__) &&  __CUDA_ARCH__ >= 800
+    using elem_type = typename Kernel_traits::elem_type;
+#else
+    constexpr bool is_fp16_type = std::is_same<typename Kernel_traits::elem_type, __half>::value;
+    assert(is_fp16_type);
+    using elem_type = __half;
+#endif
+
+    // The description of the CTA tile for the 1st batched GEMM.
+    using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
+    // The description of the CTA tile for the 2nd batched GEMM.
+    using Cta_tile_dq = typename Kernel_traits::Cta_tile_o;
+    // The description of the CTA tile for the 3rd batched GEMM.
+    using Cta_tile_dkv =
+        fmha::Cta_tile_extd<Cta_tile_p::N, Cta_tile_p::K, Cta_tile_p::M, Cta_tile_p::WARPS_N, 1, Cta_tile_p::WARPS_M>;
+
+    static_assert(Cta_tile_dkv::M == 512 ||  Cta_tile_dkv::M == 256 || Cta_tile_dkv::M == 128);
+    static_assert(Cta_tile_dkv::N == 16 || Cta_tile_dkv::N == 32 || Cta_tile_dkv::N == 64 || Cta_tile_dkv::N == 128);
+    static_assert(Cta_tile_dkv::K == 16);
+
+    // The MMA tile for the 1st GEMM.
+    using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;
+    // The MMA tile for the 2nd GEMM.
+    using Mma_tile_dq = fmha::Hmma_tile<Cta_tile_dq>;
+    // The MMA tile for the 3rd GEMM.
+    using Mma_tile_dkv = fmha::Hmma_tile<Cta_tile_dkv>;
+
+    // The global memory tile to load Q.
+    using Gmem_tile_q = typename Kernel_traits::Gmem_tile_q;
+    // The shared memory tile to reload Q transposed.
+    using Smem_tile_qt = fmha::Smem_tile_b<Cta_tile_dkv, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 2>;
+
+    // The global memory tile to load K.
+    using Gmem_tile_k = typename Kernel_traits::Gmem_tile_k;
+    // The shared memory tile to swizzle K^T. Treat K^T as V
+    using Smem_tile_kt = typename Kernel_traits::Smem_tile_v;
+
+    // Treating V as K. We need to use Kernel_traits::Smem_tile_k otherwise loading will be wrong
+    // The global memory tile to load V.
+    using Gmem_tile_v = typename Kernel_traits::Gmem_tile_k;
+    // The shared memory tile to swizzle V.
+    using Smem_tile_v = typename Kernel_traits::Smem_tile_k;
+
+    // The global memory tile to load dO.
+    using Gmem_tile_do = typename Kernel_traits::Gmem_tile_do;
+    // The shared memory tile to load dO.
+    // Treating dO as Q.
+    using Smem_tile_do = typename Kernel_traits::Smem_tile_q;
+    // The shared memory tile to reload dO transposed.
+    using Smem_tile_dot = fmha::Smem_tile_b<Cta_tile_dkv, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 2>;
+
+    // The global memory tile to load O.Loading O here is similar to loading dO.
+    using Gmem_tile_o = Gmem_tile_do;
+
+    // The global memory tile to store dQ.
+    using Gmem_tile_dq = typename Kernel_traits::Gmem_tile_o;
+    using Gmem_tile_dq_tmp = fmha::Gmem_tile_o<Cta_tile_dq, 4>;
+    // The shared memory tile to swizzle dQ.
+    using Smem_tile_dq = typename Kernel_traits::Smem_tile_o;
+
+    // The global memory tile to store dV.
+    using Gmem_tile_dv = typename Kernel_traits::Gmem_tile_v;
+    // The shared memory tile to swizzle dV.
+    using Smem_tile_dv = fmha::Smem_tile_mma_epilogue<Cta_tile_dkv>;
+
+    // The global memory tile to store dK.
+    using Gmem_tile_dk = typename Kernel_traits::Gmem_tile_v;
+    // The shared memory tile to swizzle dK.
+    using Smem_tile_dk = fmha::Smem_tile_mma_epilogue<Cta_tile_dkv>;
+    static_assert(Smem_tile_dk::NUM_LDS == Gmem_tile_dk::LDGS);
+    static_assert(Smem_tile_dk::THREADS_PER_ROW == Gmem_tile_dk::THREADS_PER_ROW);
+
+    using Gmem_tile_s = typename Kernel_traits::Gmem_tile_s;
+
+    using Smem_tile_st = typename Kernel_traits::Smem_tile_st;
+
+    using Gmem_softmax_sum = typename Kernel_traits::Gmem_softmax_sum;
+
+    // using Gemm1 = Gemm_Q_K<Kernel_traits, Kernel_traits::K_IN_REGS>;
+    using Gemm1 = Gemm_Q_K<Kernel_traits, /*K-in_regs=*/false, elem_type>;
+
+    using Softmax = fmha::Softmax<Cta_tile_p, Kernel_traits>;
+
+    // Shared memory.
+    extern __shared__ char smem_[];
+    // Shared memory layout if we keep V in registers:
+    //  dO | Q | K / V | dQ | S | dP | dP_sum
+    //  dV | dK
+    // Shared memory layout if we keep V shared memory:
+    //  dO | Q | K | V | dQ | S | dP | dP_sum
+    //  dV | dK
+
+
+    // The block index for the batch.
+    const int bidb = blockIdx.x;
+    // The block index for the head.
+    const int bidh = blockIdx.y;
+    // The thread index.
+    const int tidx = threadIdx.x;
+
+    const BlockInfoPadded<Kernel_traits::THREADS> binfo(params, bidb, bidh, tidx);
+    // if( binfo.stop_early() ) return;
+    if( binfo.stop_early(loop_step_idx * Cta_tile_p::N) ) return;
+
+    Gemm1 gemm_q_k(&smem_[Smem_tile_do::BYTES_PER_TILE], tidx);
+    // Allocate the global memory tile loader for Q.
+    Gmem_tile_q gmem_q(params.q_ptr, params.q_row_stride_in_elts, params.q_head_stride_in_elts,
+                       params.d, binfo, tidx, true);
+    // Allocate the global memory tile loader for dQ.
+    Gmem_tile_dq gmem_dq(params.dq_ptr, params.dq_row_stride_in_elts, params.dq_head_stride_in_elts,
+                         params.d, binfo, tidx);
+    Gmem_tile_dq_tmp gmem_dq_tmp(params.o_tmp_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
+                                 params.d, binfo, tidx);
+    // Allocate the global memory tile loader for S.
+    Gmem_tile_s gmem_s(params, binfo, tidx);
+
+    fmha::Mask<Cta_tile_p, Is_causal> mask(binfo, tidx, loop_step_idx);
+
+    // Allocate the global memory tile loader for K.
+    Gmem_tile_k gmem_k(params.k_ptr, params.k_row_stride_in_elts, params.k_head_stride_in_elts,
+                       params.d, binfo, tidx, false);
+    // Allocate the global memory tile loader for V.
+    Gmem_tile_v gmem_v(params.v_ptr, params.v_row_stride_in_elts, params.v_head_stride_in_elts,
+                       params.d, binfo, tidx, false);
+    // The base pointer of smem_v;
+    char *smem_v_ = &smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::SMEM_OFFSET_V];
+
+    // Allocate the shared memory tile loader for V. We use the same as K so be careful!!!
+    Smem_tile_v smem_v(smem_v_, tidx);
+    // Allocate the shared memory tile loader for K^T. We use the same as K so be careful!!!
+    Smem_tile_kt smem_kt(&smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::Smem_tile_q::BYTES_PER_TILE], tidx);
+
+    // Allocate the global memory tile loader for dO.
+    Gmem_tile_do gmem_do(params.do_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
+                         params.d, binfo, tidx, true);
+    // Allocate the shared memory tile loader for dO.
+    Smem_tile_do smem_do(&smem_[0], tidx);
+    Smem_tile_dot smem_dot(&smem_[0], tidx);
+    // Allocate the shared memory tile loader for Q^T.
+    // TODO: assert that this points to the same memory as gemm_q_k.smem_q
+    Smem_tile_qt smem_qt(&smem_[Smem_tile_do::BYTES_PER_TILE], tidx);
+
+    Smem_tile_st smem_s(&smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::SMEM_OFFSET_O + Smem_tile_dq::BYTES_PER_TILE], tidx);
+    Smem_tile_st smem_dp(&smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::SMEM_OFFSET_O + Smem_tile_dq::BYTES_PER_TILE + Smem_tile_st::BYTES_PER_TILE], tidx);
+
+    // Allocate the global memory tile loader for O.
+    Gmem_tile_o gmem_o(params.o_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
+                       params.d, binfo, tidx, true);
+
+    // Allocate the shared memory tile loader for O. We use the same as K so be careful!!!
+    Smem_tile_dq smem_dq(&smem_[Smem_tile_do::BYTES_PER_TILE + Gemm1::SMEM_OFFSET_O], tidx);
+
+    Gmem_softmax_sum gmem_softmax_lse(params.softmax_lse_ptr, params, tidx);
+    Gmem_softmax_sum gmem_softmax_d(params.dsoftmax_sum, params, tidx);
+
+    static_assert(Cta_tile_p::N % Cta_tile_p::M == 0);
+    int begin = Is_causal ? loop_step_idx * Cta_tile_p::N / Cta_tile_p::M : 0;
+    // Otherwise we'd be reading out-of-bound memory before the loop
+    if (begin * Cta_tile_p::M >= binfo.actual_seqlen_q) {
+        // Still need to zero out dk and dv before returning
+        static_assert(Smem_tile_dk::NUM_LDS == Smem_tile_dv::NUM_LDS);
+        uint4 dkv_out[Smem_tile_dk::NUM_LDS];
+        #pragma unroll
+        for (int i = 0; i < Smem_tile_dk::NUM_LDS; ++i) { dkv_out[i] = make_uint4(0u, 0u, 0u, 0u); }
+        Gmem_tile_dk gmem_dk(params.dk_ptr, params.dk_row_stride_in_elts, params.dk_head_stride_in_elts,
+                            params.d, binfo, tidx, false);
+        if (!Is_first) { gmem_dk.move(loop_step_idx); }
+        gmem_dk.store(dkv_out);
+        Gmem_tile_dv gmem_dv(params.dv_ptr, params.dv_row_stride_in_elts, params.dv_head_stride_in_elts,
+                            params.d, binfo, tidx, false);
+        if (!Is_first) { gmem_dv.move(loop_step_idx); }
+        gmem_dv.store(dkv_out);
+        return;
+    }
+
+    const int steps = (params.seqlen_q + Cta_tile_p::M - 1) / Cta_tile_p::M - begin;
+    // Wind gmem tiles to the correct position.
+    gmem_q.move(begin);
+    gmem_do.move(begin);
+    gmem_o.move(begin);
+    if (!Seq_parallel) { gmem_dq.move(begin); }  // If Seq_parallel, we're not using gmem_dq at all
+    gmem_dq_tmp.move(begin);
+    // TODO: need to move gmem_s if we want the intermediate result for debugging
+    gmem_softmax_lse.move(begin);
+    gmem_softmax_d.move(begin);
+
+    if (!Is_first) {
+        gmem_k.move(loop_step_idx);
+        gmem_v.move(loop_step_idx);
+    }
+
+    // Trigger the loads for K.
+    gmem_k.load();
+    // Trigger the loads for Q.
+    gmem_q.load();
+    // Trigger the loads for V.
+    gmem_v.load();
+    // Trigger the loads for dO.
+    gmem_do.load();
+    // Trigger the loads for O.
+    if (Is_first) { gmem_o.load(); }
+
+    float p_lse[Mma_tile_p::MMAS_M * 2];
+    gmem_softmax_lse.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_lse));
+
+    if (!Is_first) { __syncthreads(); }
+    // Commit the data for Q, dO, and V to shared memory.
+    gmem_q.commit(gemm_q_k.smem_q);
+    gmem_do.commit(smem_do);
+    if (Is_first) {
+        dot_do_o<Gmem_tile_do::ROWS, Gmem_tile_do::THREADS_PER_ROW, elem_type>(
+            gmem_do.fetch_, gmem_o.fetch_, params.p_dropout, gmem_softmax_d, tidx
+        );
+    }
+
+    // // Instead of scaling dP by rp_dropout, we scale V instead
+    // if (Is_dropout) {
+    //     const uint32_t scale_dropout = params.scale_dropout;
+    //     #pragma unroll
+    //     for(int it=0; it < Gmem_tile_v::LDGS; it++){
+    //         gmem_v.fetch_[it] = fmha::hmul8(scale_dropout, gmem_v.fetch_[it]);
+    //     }
+    // }
+
+    gmem_v.commit(smem_v);
+
+    // const uint32_t scale_bmm1 = reinterpret_cast<const uint32_t&>(params.scale_bmm1);
+    // #pragma unroll
+    // for(int it=0; it < Gmem_tile_k::LDGS; it++){
+    //     gmem_k.fetch_[it] = fmha::hmul8(scale_bmm1, gmem_k.fetch_[it]);
+    // }
+
+    // Commit the data for K to shared memory.
+    if( !Kernel_traits::SHARE_SMEM_FOR_K_AND_V ) {
+        gmem_k.commit(gemm_q_k.smem_k);
+    }
+
+    __syncthreads();
+
+    // Load the fragments for Q.
+    gemm_q_k.load_q();
+
+    // Load the fragments for V. We keep the data in registers during the entire kernel.
+    typename Smem_tile_v::Fragment frag_v[Kernel_traits::V_IN_REGS ? Mma_tile_p::MMAS_K : 2][Mma_tile_p::MMAS_N];
+    if (Kernel_traits::V_IN_REGS) {
+        #pragma unroll
+        for( int ki = 0; ki < Mma_tile_p::MMAS_K; ++ki ) {
+            smem_v.load(frag_v[ki], ki);
+        }
+    }
+
+    float dp_sum[Mma_tile_p::MMAS_M * 2];
+    gmem_softmax_d.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(dp_sum));
+
+    // Commit the data for V to shared memory if it has not been done already.
+    if( Kernel_traits::SHARE_SMEM_FOR_K_AND_V ) {
+        // Make sure we are done loading the fragments for K.
+        __syncthreads();
+
+        // Commit the data to shared memory for V.
+        gmem_k.commit(gemm_q_k.smem_k);
+
+        // Make sure the data is in shared memory.
+        __syncthreads();
+    }
+
+    // Load the fragments for K.
+    gemm_q_k.load_k();
+    // Load the fragments for K^T.
+    // typename Smem_tile_kt::Fragment frag_kt[2][Mma_tile_dq::MMAS_N];
+    // smem_kt.load(frag_kt[0], 0);
+    // typename Smem_tile_kt::Fragment frag_kt[Mma_tile_dq::MMAS_K][Mma_tile_dq::MMAS_N];
+    // #pragma unroll
+    // for( int ki = 0; ki < Mma_tile_dq::MMAS_K; ++ki ) {
+    //     smem_kt.load(frag_kt[ki], ki);
+    // }
+
+    // Create the object to do the softmax.
+    // We won't be using the shared memory for this softmax at all
+    Softmax softmax(params, smem_, tidx);
+
+    // Declare the accumulators for the 3rd gemm.
+    fmha::Fragment_accumulator acc_dv[Mma_tile_dkv::MMAS_M][Mma_tile_dkv::MMAS_N];
+    fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_dkv::WARPS_K>::apply(acc_dv);
+    fmha::Fragment_accumulator acc_dk[Mma_tile_dkv::MMAS_M][Mma_tile_dkv::MMAS_N];
+    fmha::Clear_accumulator<fmha::Accumulator_type, Cta_tile_dkv::WARPS_K>::apply(acc_dk);
+
+    // Load over the entire sequence length.
+    for (int l = 0; l < steps; l++) {
+        if ((begin + l) * Cta_tile_p::M  >= binfo.actual_seqlen_q)
+            break;
+
+        // Load the fragments for V.
+        // typename Smem_tile_v::Fragment frag_v[2][Mma_tile_p::MMAS_N];
+        if (!Kernel_traits::V_IN_REGS) { smem_v.load(frag_v[0], 0); }
+
+        // Load the fragments for dO.
+        typename Smem_tile_do::Fragment frag_do[2][Mma_tile_p::MMAS_M];
+        smem_do.load(frag_do[0], 0);
+
+        // Declare the accumulators for the 1st gemm.
+        fmha::Fragment_accumulator acc_p[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N];
+        fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_p::WARPS_K>::apply(acc_p);
+
+        // Do this part of P^T = (Q * K^T)^T.
+        gemm_q_k(acc_p);
+
+        // Load the mask for that iteration.
+        mask.load(begin + l);
+
+        // Convert from the accumulator type to FP32 for Softmax.
+        softmax.unpack_noscale(acc_p);
+        // Apply the mask.
+        softmax.apply_mask(mask);
+        // Scale by log-sum-exp of the softmax
+        // softmax.apply_exp(p_lse);
+        softmax.template scale_apply_exp</*scale_max=*/false>(p_lse, params.scale_bmm1f);
+        if (Is_dropout) {
+            // softmax.apply_dropout(ph, params.p_dropout_in_uint);
+            // softmax.template apply_dropout</*encode_dropout_in_sign_bit=*/true>(ph, params.p_dropout_in_uint);
+            // softmax.template apply_dropout_16bits</*encode_dropout_in_sign_bit=*/true>(ph, params.p_dropout_in_uint16_t);
+            unsigned int warp_idx = threadIdx.x / 32;
+            // TODO: this should change after we rearrange the warps (e.g. cutlass branch)
+            unsigned int block_col_idx = loop_step_idx * Cta_tile_p::N / 16 + warp_idx;
+            unsigned long long philox_subsequence = (begin + l) * (binfo.actual_seqlen_k / 16) + block_col_idx;
+            softmax.template apply_dropout_16bits</*encode_dropout_in_sign_bit=*/true>(ph, params.p_dropout_in_uint16_t, philox_subsequence);
+        }
+
+        using Frag_p = fmha::Fragment_a<fmha::Row>;
+        Frag_p frag_p[Mma_tile_dq::MMAS_K][Mma_tile_dq::MMAS_M];
+        static_assert(Mma_tile_dq::MMAS_M == Mma_tile_p::MMAS_M);
+        static_assert(Mma_tile_dq::MMAS_K == Mma_tile_p::MMAS_N);
+        softmax.template pack<elem_type>(frag_p);
+
+        // Store s * dmask to smem for transpose
+        smem_s.store(frag_p);
+
+        // Trigger the load for the next Q values.
+        if (l + 1 < steps) {
+            gemm_q_k.smem_q.move_to_next_write_buffer();
+            gmem_q.move();
+            gmem_q.load();
+        }
+
+        // if( Kernel_traits::SHARE_SMEM_FOR_K_AND_V && l == 0 ) {
+        //     // if we share K and V, it could be that V was not fully read yet but we write into smem for reduction
+        //     __syncthreads();
+        // }
+
+        fmha::Fragment_accumulator acc_dp[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N];
+        #pragma unroll
+        for (int mi = 0; mi < Mma_tile_p::MMAS_M; ++mi) {
+            #pragma unroll
+            for (int ni = 0; ni < Mma_tile_p::MMAS_N; ++ni) {
+                #pragma unroll
+                for (int ii = 0; ii < 8; ++ii) {
+                    acc_dp[mi][ni].elt(ii) = -dp_sum[mi * 2 + ((ii / 2) % 2)];
+                }
+            }
+        }
+
+        // Do this part of dP^T = (dO * V^T)^T.
+        #pragma unroll
+        for( int ki = 1; ki < Mma_tile_p::MMAS_K; ++ki ) {
+            // Trigger the load from shared memory for the next series of dO values.
+            smem_do.load(frag_do[ki & 1], ki);
+            if (!Kernel_traits::V_IN_REGS) {
+                smem_v.load(frag_v[ki & 1], ki);
+                fmha::gemm_cl<elem_type>(acc_dp, frag_do[(ki - 1) & 1], frag_v[(ki - 1) & 1]);
+            } else {
+                fmha::gemm_cl<elem_type>(acc_dp, frag_do[(ki - 1) & 1], frag_v[ki - 1]);
+            }
+            // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l < 4))  {
+            //     float2 tmp = __half22float2(reinterpret_cast<__half2 &>(frag_do[(ki - 1) & 1]));
+            //     printf("frag_do=%.6f, %.6f\n", tmp.x, tmp.y);
+            //     tmp = __half22float2(reinterpret_cast<__half2 &>(frag_v[(ki - 1) & 1]));
+            //     printf("frag_v=%.6f, %.6f\n", tmp.x, tmp.y);
+            // }
+        }
+
+        // Do the final stage of math.
+        {
+            int ki = Mma_tile_p::MMAS_K;
+            if (!Kernel_traits::V_IN_REGS) {
+                fmha::gemm_cl<elem_type>(acc_dp, frag_do[(ki - 1) & 1], frag_v[(ki - 1) & 1]);
+            } else {
+                fmha::gemm_cl<elem_type>(acc_dp, frag_do[(ki - 1) & 1], frag_v[(ki - 1)]);
+            }
+        }
+
+        auto pointwise_mult = [](float p, float dp, float d) {
+            return p * ((!Is_dropout) || p >= 0.f ? dp : d);
+        };
+        #pragma unroll
+        for (int mi = 0; mi < Mma_tile_p::MMAS_M; mi++) {
+            #pragma unroll
+            for (int ni = 0; ni < Mma_tile_p::MMAS_N; ni++) {
+                softmax.elt_[2 * mi + 0][4 * ni + 0] = pointwise_mult(softmax.elt_[2 * mi + 0][4 * ni + 0], acc_dp[mi][ni].elt(0), dp_sum[2 * mi + 0]);
+                softmax.elt_[2 * mi + 0][4 * ni + 1] = pointwise_mult(softmax.elt_[2 * mi + 0][4 * ni + 1], acc_dp[mi][ni].elt(1), dp_sum[2 * mi + 0]);
+                softmax.elt_[2 * mi + 0][4 * ni + 2] = pointwise_mult(softmax.elt_[2 * mi + 0][4 * ni + 2], acc_dp[mi][ni].elt(4), dp_sum[2 * mi + 0]);
+                softmax.elt_[2 * mi + 0][4 * ni + 3] = pointwise_mult(softmax.elt_[2 * mi + 0][4 * ni + 3], acc_dp[mi][ni].elt(5), dp_sum[2 * mi + 0]);
+                softmax.elt_[2 * mi + 1][4 * ni + 0] = pointwise_mult(softmax.elt_[2 * mi + 1][4 * ni + 0], acc_dp[mi][ni].elt(2), dp_sum[2 * mi + 1]);
+                softmax.elt_[2 * mi + 1][4 * ni + 1] = pointwise_mult(softmax.elt_[2 * mi + 1][4 * ni + 1], acc_dp[mi][ni].elt(3), dp_sum[2 * mi + 1]);
+                softmax.elt_[2 * mi + 1][4 * ni + 2] = pointwise_mult(softmax.elt_[2 * mi + 1][4 * ni + 2], acc_dp[mi][ni].elt(6), dp_sum[2 * mi + 1]);
+                softmax.elt_[2 * mi + 1][4 * ni + 3] = pointwise_mult(softmax.elt_[2 * mi + 1][4 * ni + 3], acc_dp[mi][ni].elt(7), dp_sum[2 * mi + 1]);
+            }
+        }
+
+        // Load the fragments for K^T.
+        typename Smem_tile_kt::Fragment frag_kt[2][Mma_tile_dq::MMAS_N];
+        smem_kt.load(frag_kt[0], 0);
+
+        // Trigger the load for the next dO values.
+        if (l + 1 < steps) {
+            smem_do.move_to_next_write_buffer();
+            gmem_do.move();
+            gmem_do.load();
+            if (Is_first) {
+                gmem_o.move();
+                gmem_o.load();
+            }
+        }
+
+        softmax.template pack<elem_type>(frag_p);
+
+        // Store dp to smem for transpose
+        smem_dp.store(frag_p);
+
+        // gmem_s.store(frag_p, mask);
+        // gmem_s.move();
+
+        // Declare the accumulators for the 2nd gemm.
+        fmha::Fragment_accumulator acc_dq[Mma_tile_dq::MMAS_M][Mma_tile_dq::MMAS_N];
+        fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_dq::WARPS_K>::apply(acc_dq);
+
+        // Do this part of O = P^T * V^T.
+        #pragma unroll
+        for( int ki = 1; ki < Mma_tile_dq::MMAS_K; ++ki ) {
+            // Trigger the load from shared memory for the next series of Q values.
+            smem_kt.load(frag_kt[ki & 1], ki);
+            // Do the math for the values already in registers.
+            fmha::gemm_cl<elem_type>(acc_dq, frag_p[ki - 1], frag_kt[(ki - 1) & 1]);
+            // fmha::gemm_cl<elem_type>(acc_dq, frag_p[ki - 1], frag_kt[(ki - 1)]);
+        }
+        // Do the final stage of math.
+        {
+            int ki = Mma_tile_dq::MMAS_K;
+            fmha::gemm_cl<elem_type>(acc_dq, frag_p[ki - 1], frag_kt[(ki - 1) & 1]);
+            // fmha::gemm_cl<elem_type>(acc_dq, frag_p[ki - 1], frag_kt[(ki - 1)]);
+        }
+
+        static_assert(Gmem_tile_dq::LOOPS == 1);
+
+        // Swizzle the elements and do the final reduction.
+        // Need to syncthreads here, otherwise the smem_dq reads from the previous iteration
+        // might happen after the smem_dq writes in this iteration.
+        __syncthreads();
+        smem_dq.store(acc_dq, 0);
+
+        typename Smem_tile_dot::Fragment frag_dot[2][Mma_tile_dkv::MMAS_N];
+        static_assert(Smem_tile_dot::Fragment::NUM_REGS == 4);
+        static_assert(Mma_tile_dkv::MMAS_K == 1);
+        smem_dot.load(frag_dot[0], 0);
+
+        // Threads in a warp is communicating via shared memory (smem_s and smem_dp)
+        __syncwarp();
+        typename Smem_tile_st::Fragment frag_s[Mma_tile_dkv::MMAS_K][Mma_tile_dkv::MMAS_M];
+        smem_s.load(frag_s);
+
+        if (Is_dropout) {
+            #pragma unroll
+            for( int ki = 0; ki < Mma_tile_dkv::MMAS_K; ki++ ) {
+                #pragma unroll
+                for( int mi = 0; mi < Mma_tile_dkv::MMAS_M; mi++ ) {
+                    frag_s[ki][mi].template hrelu_<elem_type>();
+                }
+            }
+        }
+
+        #pragma unroll
+        for( int ki = 1; ki < Mma_tile_dkv::MMAS_K; ++ki ) {
+            // Trigger the load from shared memory for the next series of Q values.
+            smem_dot.load(frag_dot[ki & 1], ki);
+            // Do the math for the values already in registers.
+            fmha::gemm_cl<elem_type>(acc_dv, frag_s[(ki - 1)], frag_dot[(ki - 1) & 1]);
+        }
+
+        // Do the final stage of math.
+        {
+            int ki = Mma_tile_dkv::MMAS_K;
+            fmha::gemm_cl<elem_type>(acc_dv, frag_s[(ki - 1)], frag_dot[(ki - 1) & 1]);
+        }
+
+        // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0))  {
+        //     float2 tmp0 = __half22float2(reinterpret_cast<__half2 &>(frag_dot[0][0]));
+        //     printf("frag_dot[0][0]=%.6f, %.6f\n", tmp0.x, tmp0.y);
+        //     float2 tmp1 = __half22float2(reinterpret_cast<__half2 &>(frag_dot[0][1]));
+        //     printf("frag_dot[0][1]=%.6f, %.6f\n", tmp1.x, tmp1.y);
+        // }
+
+        // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0))  {
+        //     printf("l = %d, acc_dv[0][0]=%.6f, %.6f\n", l, acc_dv[0][0].elt(2), acc_dv[0][0].elt(3));
+        //     printf("l = %d, acc_dv[0][1]=%.6f, %.6f\n", l, acc_dv[0][1].elt(2), acc_dv[0][1].elt(3));
+        // }
+        // __syncthreads();
+        // Commit the values for Q and dO into shared memory.
+        if (l + 1 < steps) {
+            gmem_q.commit(gemm_q_k.smem_q);
+        }
+
+        uint4 dq_out[Gmem_tile_dq::STGS_PER_LOOP];
+        if (!Is_first && !Seq_parallel) { gmem_dq_tmp.load(dq_out, 0); }
+
+        // __syncthreads();
+        // Commit the values for Q and dO into shared memory.
+        if (l + 1 < steps) {
+            gmem_do.commit(smem_do);
+            gmem_softmax_d.move();
+            if (Is_first) {
+                dot_do_o<Gmem_tile_do::ROWS, Gmem_tile_do::THREADS_PER_ROW, elem_type>(
+                    gmem_do.fetch_, gmem_o.fetch_, params.p_dropout, gmem_softmax_d, tidx
+                );
+            }
+            gmem_softmax_lse.move();
+            gmem_softmax_lse.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_lse));
+        }
+
+        typename Smem_tile_st::Fragment frag_dpt[Mma_tile_dkv::MMAS_K][Mma_tile_dkv::MMAS_M];
+        smem_dp.load(frag_dpt);
+
+        gemm_q_k.reload_k();
+
+        typename Smem_tile_qt::Fragment frag_qt[2][Mma_tile_dkv::MMAS_N];
+        static_assert(Smem_tile_qt::Fragment::NUM_REGS == 4);
+        static_assert(Mma_tile_dkv::MMAS_K == 1);
+        smem_qt.load(frag_qt[0], 0);
+
+        #pragma unroll
+        for( int ki = 1; ki < Mma_tile_dkv::MMAS_K; ++ki ) {
+            // Trigger the load from shared memory for the next series of Q values.
+            smem_qt.load(frag_qt[ki & 1], ki);
+            // Do the math for the values already in registers.
+            fmha::gemm_cl<elem_type>(acc_dk, frag_dpt[(ki - 1)], frag_qt[(ki - 1) & 1]);
+        }
+
+        // Do the final stage of math.
+        {
+            int ki = Mma_tile_dkv::MMAS_K;
+            fmha::gemm_cl<elem_type>(acc_dk, frag_dpt[(ki - 1)], frag_qt[(ki - 1) & 1]);
+        }
+
+        // Make sure dQ is in shared memory.
+        __syncthreads();
+
+        if (l + 1 < steps) {
+            gmem_softmax_d.load(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(dp_sum));
+        }
+
+        // Load from shared memory.
+        smem_dq.template load</*zero_init=*/Is_first || Seq_parallel>(dq_out);
+
+        if (!Seq_parallel) {
+            const bool is_final_write =
+                Is_last
+                || ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k)
+                || ((Is_causal) && ((begin + l) * Cta_tile_p::M < (loop_step_idx + 1) * Cta_tile_p::N));
+            if (is_final_write) {
+                // if (Is_dropout) {
+                //     dq_out[0] = fmha::fmul4(dq_out[0], params.rp_dropout);
+                // }
+                for (int jj = 0; jj < Gmem_tile_dq::STGS_PER_LOOP; ++jj) {
+                    // dq_out[jj] = fmha::fmul4(dq_out[jj], params.scale_bmm1f);
+                    dq_out[jj] = fmha::fmul4(dq_out[jj], params.scale_bmm1_rp_dropout);
+                }
+                // Output the values.
+                gmem_dq.template store<elem_type>(dq_out, 0);
+                // Move to the next part of the output.
+                gmem_dq.move();
+                // TODO: for parallel, need to deal with the dropout scaling
+            } else  {
+                // Output the values.
+                gmem_dq_tmp.store(dq_out, 0);
+            }
+        } else {
+            // We always scale dq_out before writing in this case, since we don't want to
+            // have to scale at the end when copying from dq_tmp to dq.
+            for (int jj = 0; jj < Gmem_tile_dq::STGS_PER_LOOP; ++jj) {
+                // dq_out[jj] = fmha::fmul4(dq_out[jj], params.scale_bmm1f);
+                dq_out[jj] = fmha::fmul4(dq_out[jj], params.scale_bmm1_rp_dropout);
+            }
+            gmem_dq_tmp.atomic_add(dq_out, 0);
+        }
+
+        // Move to the next part of the output.
+        if (!(Is_first && Is_last)) { gmem_dq_tmp.move(); }
+
+        // // Make sure the data is in shared memory.
+        // __syncthreads();
+
+        // Commit the values for Q and dO into shared memory.
+        if (l + 1 < steps) {
+            gemm_q_k.smem_q.move_to_next_read_buffer();
+            gemm_q_k.reload_q();
+            smem_qt.move_to_next_read_buffer();
+            // smem_qt.load(frag_qt[0], 0);
+            smem_do.move_to_next_read_buffer();
+            smem_dot.move_to_next_read_buffer();
+            // smem_dot.load(frag_dot[0], 0);
+        }
+
+    }  // Outer loop over the sequence length.
+
+    if (Is_dropout) {
+        for( int mi = 0; mi < Mma_tile_dkv::MMAS_M; mi++ ) {
+            for( int ni = 0; ni < Mma_tile_dkv::MMAS_N; ni++ ) {
+                acc_dv[mi][ni].mul_(params.rp_dropout);
+            }
+        }
+    }
+    // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0))  {
+    //     printf("l final, acc_dv[0][0]=%.6f, %.6f\n", acc_dv[0][0].elt(2), acc_dv[0][0].elt(3));
+    //     printf("l final, acc_dv[0][1]=%.6f, %.6f\n", acc_dv[0][1].elt(2), acc_dv[0][1].elt(3));
+    // }
+    for( int mi = 0; mi < Mma_tile_dkv::MMAS_M; mi++ ) {
+        for( int ni = 0; ni < Mma_tile_dkv::MMAS_N; ni++ ) {
+            // acc_dk[mi][ni].mul_(Is_dropout ? params.rp_dropout * params.scale_bmm1f : params.scale_bmm1f);
+            // acc_dk[mi][ni].mul_(params.scale_bmm1f);
+            acc_dk[mi][ni].mul_(params.scale_bmm1_rp_dropout);
+        }
+    }
+    // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0))  {
+    //     printf("l final, acc_dk=%.6f, %.6f\n", acc_dk[0][0].elt(0), acc_dk[0][0].elt(1));
+    // }
+
+    __syncthreads();
+    // TODO [TD - 2022-05-04]: Are there cases where the shared mem for dV and dK are larger than
+    // the total amount of shared mem?
+    // Epilogue swizzle for dV
+    Smem_tile_dv smem_dv(&smem_[0], tidx);
+    smem_dv.template store<elem_type>(acc_dv);
+
+    // Epilogue swizzle for dK
+    Smem_tile_dk smem_dk(&smem_[Smem_tile_dv::BYTES_PER_TILE], tidx);
+    smem_dk.template store<elem_type>(acc_dk);
+
+    __syncthreads();
+    uint4 dv_out[Smem_tile_dv::NUM_LDS];
+    smem_dv.load(dv_out);
+    Gmem_tile_dv gmem_dv(params.dv_ptr, params.dv_row_stride_in_elts, params.dv_head_stride_in_elts,
+                         params.d, binfo, tidx, false);
+    if (!Is_first) {
+        gmem_dv.move(loop_step_idx);
+    }
+    gmem_dv.store(dv_out);
+
+    uint4 dk_out[Smem_tile_dk::NUM_LDS];
+    smem_dk.load(dk_out);
+    Gmem_tile_dk gmem_dk(params.dk_ptr, params.dk_row_stride_in_elts, params.dk_head_stride_in_elts,
+                         params.d, binfo, tidx, false);
+    if (!Is_first) {
+        gmem_dk.move(loop_step_idx);
+    }
+    gmem_dk.store(dk_out);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// loop_steps = -1 means the number of steps will be params.seqlen_k / Kernel_traits::Cta_tile_p::N.
+// This template parameter is there so we can specialize with loop_steps == 1 and loop_steps == 2.
+template<typename Kernel_traits, bool Is_dropout, bool Is_causal, int loop_steps=-1, typename Params>
+inline __device__ void compute_dq_dk_dv_1xN(const Params &params) {
+    constexpr int blocksize_c = Kernel_traits::Cta_tile_p::N;
+
+    // The block index for the batch.
+    const int bidb = blockIdx.x;
+    // The block index for the head.
+    const int bidh = blockIdx.y;
+    // The thread index.
+    const int tidx = threadIdx.x;
+    auto seeds = at::cuda::philox::unpack(params.philox_args);
+    Philox ph(std::get<0>(seeds), 0,  std::get<1>(seeds) + (bidb * params.h + bidh) * 32 + tidx % 32);
+
+    if (loop_steps == 1) {
+        compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, true, true>(params, ph, 0);
+    } else if (loop_steps == 2) {
+        compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, true, false>(params, ph, 0);
+        compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, false, true>(params, ph, 1);
+    } else {
+        if (params.seqlen_k == blocksize_c) {
+            compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, true, true>(params, ph, 0);
+        } else {
+            const int max_loop_steps = (params.seqlen_k + blocksize_c - 1) / blocksize_c;
+            compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, true, false>(params, ph, 0);
+            for (int loop_step_idx = 1; loop_step_idx < max_loop_steps - 1; loop_step_idx++) {
+                compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, false, false>(params, ph, loop_step_idx);
+            }
+            compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, false, true>(params, ph, max_loop_steps - 1);
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename Kernel_traits, bool Is_dropout, bool Is_causal, typename Params>
+inline __device__ void compute_dq_dk_dv_seqparallel(const Params &params) {
+    // The block index for the batch.
+    const int bidb = blockIdx.x;
+    // The block index for the head.
+    const int bidh = blockIdx.y;
+    // The thread index.
+    const int tidx = threadIdx.x;
+    auto seeds = at::cuda::philox::unpack(params.philox_args);
+    Philox ph(std::get<0>(seeds), 0, std::get<1>(seeds) + (bidb * params.h + bidh) * 32 + tidx % 32);
+
+    int loop_step_idx = blockIdx.z;
+    compute_dq_dk_dv_1xN_one_iter<Kernel_traits, Is_dropout, Is_causal, false, false, /*Seq_parallel=*/true>(params, ph, loop_step_idx);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace fmha
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fprop_kernel_1xN.h b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fprop_kernel_1xN.h
index 1a41438c6627..c3f487321983 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fprop_kernel_1xN.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fprop_kernel_1xN.h
@@ -29,28 +29,11 @@
 #pragma once
 
 #include <ATen/cuda/CUDAContext.h>
-#include <ATen/native/transformers/cuda/flash_attn/philox.cuh>
-#include <ATen/native/transformers/cuda/flash_attn/mask.h>
+
 #include <ATen/native/transformers/cuda/flash_attn/fmha_kernel.h>
-#include <ATen/native/transformers/cuda/flash_attn/softmax.h>
-#include <ATen/native/transformers/cuda/flash_attn/epilogue.h>
-
-#include <cutlass/cutlass.h>
-#include <cutlass/layout/layout.h>
-#include <cutlass/array.h>
-#include <cutlass/numeric_types.h>
-#include <cutlass/numeric_conversion.h>
-#include <cutlass/arch/mma.h>
-#include <cutlass/gemm/warp/default_mma_tensor_op.h>
-#include <cutlass/gemm/warp/mma_tensor_op_tile_iterator.h>
-#include <cutlass/gemm/threadblock/default_mma_core.h>
-#include <cutlass/gemm/threadblock/default_mma_core_sm75.h>
-#include <cutlass/gemm/threadblock/default_mma_core_sm80.h>
-#include <cutlass/epilogue/warp/fragment_iterator_tensor_op.h>
-#include <cutlass/epilogue/warp/tile_iterator_tensor_op.h>
-#include <cutlass/epilogue/threadblock/default_thread_map_tensor_op.h>
-#include <cutlass/epilogue/threadblock/default_epilogue_tensor_op.h>
-#include <cutlass/epilogue/threadblock/predicated_tile_iterator.h>
+#include <ATen/native/transformers/cuda/flash_attn/kernel_traits.h>
+#include <ATen/native/transformers/cuda/flash_attn/gemm.h>
+#include <ATen/native/transformers/cuda/flash_attn/utils.h>
 
 namespace fmha {
 
@@ -58,89 +41,89 @@ namespace fmha {
 
 template<typename Kernel_traits>
 struct Gemm_Q_K_base {
-    using Smem_O = fmha::FMHAEpilogue<typename Kernel_traits::MmaCorePV>;
-    using WarpMma = typename Kernel_traits::MmaCoreQK::MmaTensorOp;
+    using Smem_tile_o = typename Kernel_traits::Smem_tile_o;
+    using Smem_tile_q = typename Kernel_traits::Smem_tile_q;
+    using Smem_tile_k = typename Kernel_traits::Smem_tile_k;
+    using Fragment_q = typename Smem_tile_q::Fragment;
+    using Fragment_k = typename Smem_tile_k::Fragment;
 
     // The description of the CTA tile for the 1st batched GEMM.
     using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
 
-    static constexpr size_t SMEM_BYTES_SOFTMAX = Cta_tile_p::M * Cta_tile_p::WARPS_N * sizeof(float) * 2;
+    // The MMA tile for the 1st GEMM.
+    using Mma_tile_p = fmha::Hmma_tile<Cta_tile_p>;
 
-    __device__ inline Gemm_Q_K_base(char * smem_ptr_q, char * smem_ptr_k)
-        : smem_q_ptr(smem_ptr_q)
-        , smem_k_ptr(smem_ptr_k) {
+    static constexpr int SMEM_BYTES_SOFTMAX = Cta_tile_p::M * Cta_tile_p::WARPS_N * sizeof(float) * 2;
 
-    }
+    __device__ inline Gemm_Q_K_base(char * smem_ptr_q, char * smem_ptr_k, const int tidx)
+        : smem_q(smem_ptr_q, tidx)
+        , smem_k(smem_ptr_k, tidx) {
 
-    __device__ inline void load_q(int byte_offset=0) {
-        typename WarpMma::LayoutA layout_A = WarpMma::LayoutA::packed({Cta_tile_p::M, Cta_tile_p::K});
-        typename WarpMma::IteratorA iter_A({reinterpret_cast<typename WarpMma::ElementA *>(smem_q_ptr + byte_offset), layout_A}, threadIdx.x % 32);
-        iter_A.load(frag_q[0]);
     }
 
+    __device__ inline void load_q() {
+        smem_q.load(frag_q[0], 0);
+    }
 
-    __device__ inline void reload_q(int byte_offset=0) {
-        typename WarpMma::LayoutA layout_A = WarpMma::LayoutA::packed({Cta_tile_p::M, Cta_tile_p::K});
-        typename WarpMma::IteratorA iter_A({reinterpret_cast<typename WarpMma::ElementA *>(smem_q_ptr + byte_offset), layout_A}, threadIdx.x % 32);
-        iter_A.load(frag_q[0]);
+    __device__ inline void reload_q() {
+        smem_q.load(frag_q[0], 0);
     }
 
-    typename WarpMma::FragmentA frag_q[2];
-    char *smem_q_ptr;
-    char *smem_k_ptr;
+    Fragment_q frag_q[2][Mma_tile_p::MMAS_M];
+    Smem_tile_q smem_q;
+    Smem_tile_k smem_k;
 };
 
-template<typename Kernel_traits, bool K_in_regs>
+template<typename Kernel_traits, bool K_in_regs, typename elem_type_=__half>
 struct Gemm_Q_K : public Gemm_Q_K_base<Kernel_traits> {
 
     using Base = Gemm_Q_K_base<Kernel_traits>;
-    using Cta_tile_p = typename Base::Cta_tile_p;
-    using Smem_O = typename Base::Smem_O;
-    using WarpMma = typename Base::WarpMma;
-
-    static constexpr int kIterations = WarpMma::Shape::kK / WarpMma::InstructionShape::kK;
+    using Smem_tile_o = typename Base::Smem_tile_o;
+    using Smem_tile_q = typename Base::Smem_tile_q;
+    using Smem_tile_k = typename Base::Smem_tile_k;
+    using Fragment_k = typename Base::Fragment_k;
+    using Mma_tile_p = typename Base::Mma_tile_p;
+    using elem_type = elem_type_;
 
     static constexpr bool SHARE_SMEM_FOR_K_AND_V = Kernel_traits::SHARE_SMEM_FOR_K_AND_V;
     // If V is stored in shared memory, we can't load K using the same shared memory.
-    static_assert(Kernel_traits::V_IN_REGS, "");
+    static_assert(Kernel_traits::V_IN_REGS);
 
-    static constexpr size_t SMEM_OFFSET_O = Kernel_traits::BYTES_PER_SMEM_Q;
-    static constexpr size_t SMEM_OFFSET_SOFTMAX = SMEM_OFFSET_O + sizeof(typename Smem_O::SharedStorage);
-    static constexpr size_t SMEM_OFFSET_V = Kernel_traits::BYTES_PER_SMEM_Q + (SHARE_SMEM_FOR_K_AND_V ? 0 : Kernel_traits::BYTES_PER_SMEM_K);
+    static constexpr int SMEM_OFFSET_O = Smem_tile_q::BYTES_PER_TILE;
+    static constexpr int SMEM_OFFSET_SOFTMAX = SMEM_OFFSET_O + Smem_tile_o::BYTES_PER_TILE;
+    static constexpr int SMEM_OFFSET_V = Smem_tile_q::BYTES_PER_TILE + (SHARE_SMEM_FOR_K_AND_V ? 0 : Smem_tile_k::BYTES_PER_TILE);
 
     // Q | K / V
     //   | O | SOFTMAX
-    static constexpr size_t SMEM_BYTES = Kernel_traits::BYTES_PER_SMEM_Q
-        + std::max((size_t)(SHARE_SMEM_FOR_K_AND_V ? 1 : 2) * Kernel_traits::BYTES_PER_SMEM_K,
-                   sizeof(typename Smem_O::SharedStorage) + Base::SMEM_BYTES_SOFTMAX);
+    static constexpr int SMEM_BYTES = Smem_tile_q::BYTES_PER_TILE
+                                    + std::max((SHARE_SMEM_FOR_K_AND_V ? 1 : 2) * Smem_tile_k::BYTES_PER_TILE,
+                                               Smem_tile_o::BYTES_PER_TILE + Base::SMEM_BYTES_SOFTMAX);
 
-    __device__ inline Gemm_Q_K(char * smem_)
-        : Base(smem_, smem_ + Kernel_traits::BYTES_PER_SMEM_Q) {
+    __device__ inline Gemm_Q_K(char * smem_, const int tidx)
+        : Base(smem_, smem_ + Smem_tile_q::BYTES_PER_TILE, tidx) {
     }
 
     __device__ inline void load_k(){
-        typename WarpMma::LayoutB layout_B = WarpMma::LayoutB::packed({Cta_tile_p::K, Cta_tile_p::N});
-        typename WarpMma::IteratorB iter_B({reinterpret_cast<typename WarpMma::ElementB *>(Base::smem_k_ptr), layout_B}, threadIdx.x % 32);
-        const int warp_idx = threadIdx.x / 32;
-        iter_B.add_tile_offset({0, warp_idx});
         #pragma unroll
-        for( int ki = 0; ki < kIterations; ++ki ) {
-            iter_B.load(frag_k[ki]);
-            ++iter_B;
+        for( int ki = 0; ki < Mma_tile_p::MMAS_K; ++ki ) {
+            Base::smem_k.load(frag_k[ki], ki);
         }
     }
 
-    __device__ inline void operator()(WarpMma warp_mma, typename WarpMma::FragmentC &acc_p, int byte_offset_q=0){
-        typename WarpMma::LayoutA layout_A = WarpMma::LayoutA::packed({Base::Cta_tile_p::M, Base::Cta_tile_p::K});
-        typename WarpMma::IteratorA iter_A({reinterpret_cast<typename WarpMma::ElementB *>(Base::smem_q_ptr + byte_offset_q), layout_A}, threadIdx.x % 32);
-        ++iter_A;
+    template<typename Acc, int M, int N>
+    __device__ inline void operator()(Acc (&acc_p)[M][N]){
         // Do this part of P^T = (Q * K^T)^T.
         #pragma unroll
-        for( int ki = 0; ki < kIterations; ++ki ) {
+        for( int ki = 1; ki < Mma_tile_p::MMAS_K; ++ki ) {
             // Trigger the load from shared memory for the next series of Q values.
-            if (ki + 1 < kIterations) { iter_A.load(Base::frag_q[(ki + 1) % 2]); ++iter_A; }
+            Base::smem_q.load(Base::frag_q[ki & 1], ki);
             // Do the math for the values already in registers.
-            warp_mma(acc_p, Base::frag_q[ki % 2], frag_k[ki], acc_p);
+            fmha::gemm_cl<elem_type>(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1)]);
+        }
+        // Do the final stage of math.
+        {
+            int ki = Mma_tile_p::MMAS_K;
+            fmha::gemm_cl<elem_type>(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1)]);
         }
     }
 
@@ -148,75 +131,66 @@ struct Gemm_Q_K : public Gemm_Q_K_base<Kernel_traits> {
         // Noop.
     }
 
-    typename WarpMma::FragmentB frag_k[kIterations];
+    Fragment_k frag_k[Mma_tile_p::MMAS_K][Mma_tile_p::MMAS_N];
 };
 
 
-template<typename Kernel_traits>
-struct Gemm_Q_K<Kernel_traits, false> : public Gemm_Q_K_base<Kernel_traits> {
+template<typename Kernel_traits, typename elem_type_>
+struct Gemm_Q_K<Kernel_traits, false, elem_type_> : public Gemm_Q_K_base<Kernel_traits> {
     using Base = Gemm_Q_K_base<Kernel_traits>;
-    using Cta_tile_p = typename Base::Cta_tile_p;
-    using Smem_O = typename Base::Smem_O;
-    using WarpMma = typename Base::WarpMma;
+    using Smem_tile_o = typename Base::Smem_tile_o;
+    using Smem_tile_q = typename Base::Smem_tile_q;
+    using Smem_tile_k = typename Base::Smem_tile_k;
+    using Smem_tile_v = typename Kernel_traits::Smem_tile_v;
+    using Fragment_k = typename Base::Fragment_k;
+    using Mma_tile_p = typename Base::Mma_tile_p;
+    using elem_type = elem_type_;
+    Fragment_k frag_k[2][Mma_tile_p::MMAS_N];
 
     static constexpr bool SHARE_SMEM_FOR_K_AND_V = Kernel_traits::SHARE_SMEM_FOR_K_AND_V;
     static constexpr bool V_IN_REGS = Kernel_traits::V_IN_REGS;
-    static_assert(V_IN_REGS || !SHARE_SMEM_FOR_K_AND_V, "");
+    static_assert(V_IN_REGS || !SHARE_SMEM_FOR_K_AND_V);
 
-    static constexpr size_t SMEM_OFFSET_V = Kernel_traits::BYTES_PER_SMEM_Q + (SHARE_SMEM_FOR_K_AND_V ? 0 : Kernel_traits::BYTES_PER_SMEM_K);
-    static constexpr size_t SMEM_OFFSET_O = SMEM_OFFSET_V + Kernel_traits::BYTES_PER_SMEM_V;
-    static constexpr size_t SMEM_OFFSET_SOFTMAX = SMEM_OFFSET_O + sizeof(typename Smem_O::SharedStorage);
+    static constexpr int SMEM_OFFSET_V = Smem_tile_q::BYTES_PER_TILE + (SHARE_SMEM_FOR_K_AND_V ? 0 : Smem_tile_k::BYTES_PER_TILE);
+    static_assert(Smem_tile_v::BYTES_PER_TILE == (int) Smem_tile_k::BYTES_PER_TILE);
+    static constexpr int SMEM_OFFSET_O = SMEM_OFFSET_V + Smem_tile_v::BYTES_PER_TILE;
+    static constexpr int SMEM_OFFSET_SOFTMAX = SMEM_OFFSET_O + Smem_tile_o::BYTES_PER_TILE;
 
     // If V_IN_REGS and SHARE_SMEM_FOR_K_AND_V:      Q | K/V | O | SOFTMAX
     // If !V_IN_REGS (then !SHARE_SMEM_FOR_K_AND_V): Q | K   | V | O | SOFTMAX
-    static constexpr size_t SMEM_BYTES = Kernel_traits::BYTES_PER_SMEM_Q
-        + (SHARE_SMEM_FOR_K_AND_V ? 1 : 2) * Kernel_traits::BYTES_PER_SMEM_K
-        + sizeof(typename Smem_O::SharedStorage) + Base::SMEM_BYTES_SOFTMAX;
+    static constexpr int SMEM_BYTES = Smem_tile_q::BYTES_PER_TILE
+                                    + (SHARE_SMEM_FOR_K_AND_V ? 1 : 2) * Smem_tile_k::BYTES_PER_TILE
+                                    + Smem_tile_o::BYTES_PER_TILE + Base::SMEM_BYTES_SOFTMAX;
 
-    __device__ inline Gemm_Q_K(char * smem_)
-        : Base(smem_, smem_ + Kernel_traits::BYTES_PER_SMEM_Q) {
+    __device__ inline Gemm_Q_K(char * smem_, const int tidx)
+      : Base(smem_, smem_ + Smem_tile_q::BYTES_PER_TILE, tidx) {
     }
 
     __device__ inline void load_k(){
-        typename WarpMma::LayoutB layout_B = WarpMma::LayoutB::packed({Cta_tile_p::K, Cta_tile_p::N});
-        typename WarpMma::IteratorB iter_B({reinterpret_cast<typename WarpMma::ElementB *>(Base::smem_k_ptr), layout_B}, threadIdx.x % 32);
-        const int warp_idx = threadIdx.x / 32;
-        iter_B.add_tile_offset({0, warp_idx});
-        iter_B.load(frag_k[0]);
+        Base::smem_k.load(frag_k[0], 0);
     }
 
-    __device__ inline void operator()(WarpMma warp_mma, typename WarpMma::FragmentC &acc_p, int byte_offset_q=0){
-        typename WarpMma::LayoutA layout_A = WarpMma::LayoutA::packed({Base::Cta_tile_p::M, Base::Cta_tile_p::K});
-        typename WarpMma::IteratorA iter_A({reinterpret_cast<typename WarpMma::ElementA *>(Base::smem_q_ptr + byte_offset_q), layout_A}, threadIdx.x % 32);
-        ++iter_A;
-        typename WarpMma::LayoutB layout_B = WarpMma::LayoutB::packed({Cta_tile_p::K, Cta_tile_p::N});
-        typename WarpMma::IteratorB iter_B({reinterpret_cast<typename WarpMma::ElementB *>(Base::smem_k_ptr), layout_B}, threadIdx.x % 32);
-        const int warp_idx = threadIdx.x / 32;
-        iter_B.add_tile_offset({0, warp_idx});
-        ++iter_B;
-
+    template<typename Acc, int M, int N>
+    __device__ inline void operator()(Acc (&acc_p)[M][N]){
         // Do this part of P^T = (Q * K^T)^T.
-        constexpr int kIterations = WarpMma::Shape::kK / WarpMma::InstructionShape::kK;
         #pragma unroll
-        for( int ki = 0; ki < kIterations; ++ki ) {
+        for( int ki = 1; ki < Mma_tile_p::MMAS_K; ++ki ) {
             // Trigger the load from shared memory for the next series of Q values.
-            if (ki + 1 < kIterations) {
-                iter_A.load(Base::frag_q[(ki + 1) % 2]); ++iter_A;
-                iter_B.load(frag_k[(ki + 1) % 2]); ++iter_B;
-            }
+            Base::smem_q.load(Base::frag_q[ki & 1], ki);
+            Base::smem_k.load(frag_k[ki & 1], ki);
             // Do the math for the values already in registers.
-            warp_mma(acc_p, Base::frag_q[ki % 2], frag_k[ki % 2], acc_p);
+            fmha::gemm_cl<elem_type>(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1) & 1]);
+        }
+        // Do the final stage of math.
+        {
+            int ki = Mma_tile_p::MMAS_K;
+            fmha::gemm_cl<elem_type>(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1) & 1]);
         }
     }
+
     __device__ inline void reload_k(){
-        typename WarpMma::LayoutB layout_B = WarpMma::LayoutB::packed({Cta_tile_p::K, Cta_tile_p::N});
-        typename WarpMma::IteratorB iter_B({reinterpret_cast<typename WarpMma::ElementB *>(Base::smem_k_ptr), layout_B}, threadIdx.x % 32);
-        const int warp_idx = threadIdx.x / 32;
-        iter_B.add_tile_offset({0, warp_idx});
-        iter_B.load(frag_k[0]);
+        Base::smem_k.load(frag_k[0], 0);
     }
-
-    typename WarpMma::FragmentB frag_k[2];
 };
 
 template<typename Kernel_traits>
@@ -225,7 +199,15 @@ constexpr size_t get_dynamic_smem_size(){
 }
 
 template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Return_softmax, bool Is_first, bool Is_last, typename Params, typename Prng>
-inline __device__ void device_1xN_(const Params &params, const int bidb, const int bidh, int begin, int steps, Prng &ph0, Prng &ph1, const int loop_step_idx) {
+inline __device__ void device_1xN_(const Params &params, const int bidb, const int bidh, int steps, Prng &ph, const int loop_step_idx) {
+
+#if defined(__CUDA_ARCH__) &&  __CUDA_ARCH__ >= 800
+    using elem_type = typename Kernel_traits::elem_type;
+#else
+    constexpr bool is_fp16_type = std::is_same<typename Kernel_traits::elem_type, __half>::value;
+    assert(is_fp16_type);
+    using elem_type = __half;
+#endif
 
     // The description of the CTA tile for the 1st batched GEMM.
     using Cta_tile_p = typename Kernel_traits::Cta_tile_p;
@@ -237,49 +219,30 @@ inline __device__ void device_1xN_(const Params &params, const int bidb, const i
     // The MMA tile for the 2nd GEMM.
     using Mma_tile_o = fmha::Hmma_tile<Cta_tile_o>;
 
-    using InstructionShape = typename Kernel_traits::MmaInstructionShape;
-    using Element = typename Kernel_traits::Element;
-    using ElementAccum = typename Kernel_traits::ElementAccum;
-
-    using ThreadblockShapeQK = typename Kernel_traits::ThreadblockShapeQK;
-    using LayoutQ = typename Kernel_traits::LayoutQ;
-    using LayoutK = typename Kernel_traits::LayoutK;
-    using LayoutP = typename Kernel_traits::LayoutP;
-    using MmaCoreQK = typename Kernel_traits::MmaCoreQK;
-    using WarpMmaQK = typename MmaCoreQK::MmaTensorOp;
-    using SmemLayoutQ = typename MmaCoreQK::SmemLayoutA;
-    using SmemLayoutK = typename MmaCoreQK::SmemLayoutB;
-    using SmemIteratorQ = typename MmaCoreQK::SmemIteratorA;
-    using SmemIteratorK = typename MmaCoreQK::SmemIteratorB;
-
-    using ThreadblockShapePV = typename Kernel_traits::ThreadblockShapePV;
-    using LayoutV = typename Kernel_traits::LayoutV;
-    using LayoutO = typename Kernel_traits::LayoutO;
-    using MmaCorePV = typename Kernel_traits::MmaCorePV;
-    using WarpMmaPV = typename MmaCorePV::MmaTensorOp;
-    using WarpIteratorV = typename WarpMmaPV::IteratorB;
-    using SmemLayoutV = typename MmaCorePV::SmemLayoutB;
-    using SmemIteratorV = typename MmaCorePV::SmemIteratorB;
-    constexpr int kIterationsPV = WarpMmaPV::Shape::kK / WarpMmaPV::InstructionShape::kK;
-
     // The global memory tile to load Q.
-    // Copy from mma_piplined_testbed.h
-    using GmemIteratorQ = typename Kernel_traits::GmemIteratorQ;
+    using Gmem_tile_q = typename Kernel_traits::Gmem_tile_q;
+
     // The global memory tile to load K.
-    using GmemIteratorK = typename Kernel_traits::GmemIteratorK;
+    using Gmem_tile_k = typename Kernel_traits::Gmem_tile_k;
+
     // The global memory tile to load V.
-    using GmemIteratorV = typename Kernel_traits::GmemIteratorV;
+    using Gmem_tile_v = typename Kernel_traits::Gmem_tile_v;
+    // The shared memory tile to swizzle V.
+    using Smem_tile_v = typename Kernel_traits::Smem_tile_v;
+
     // The global memory tile to store O.
-    using GmemIteratorO = typename fmha::FMHAEpilogue<MmaCorePV>::GmemIterator;
-    using GmemIteratorOAccum = typename fmha::FMHAEpilogue<MmaCorePV>::GmemIteratorAccum;
+    using Gmem_tile_o = typename Kernel_traits::Gmem_tile_o;
+    using Gmem_tile_o_tmp = fmha::Gmem_tile_o<Cta_tile_o, 4>;
+    // The shared memory tile to swizzle O.
+    using Smem_tile_o = typename Kernel_traits::Smem_tile_o;
 
     using Gmem_tile_s = typename Kernel_traits::Gmem_tile_s;
 
     using Gmem_softmax_sum = typename Kernel_traits::Gmem_softmax_sum;
 
-    using Smem_softmax_lse = typename Kernel_traits::Smem_softmax_lse;
+    using Smem_softmax_sum = typename Kernel_traits::Smem_dp_sum;
 
-    using Gemm1 = Gemm_Q_K<Kernel_traits, Kernel_traits::K_IN_REGS>;
+    using Gemm1 = Gemm_Q_K<Kernel_traits, Kernel_traits::K_IN_REGS, elem_type>;
 
     using Softmax = fmha::Softmax<Cta_tile_p, Kernel_traits>;
 
@@ -289,120 +252,82 @@ inline __device__ void device_1xN_(const Params &params, const int bidb, const i
     // The thread index.
     const int tidx = threadIdx.x;
 
+    // How many steps to jump per iteration, which is the same as params.num_splits.
+    const int step_stride = gridDim.z;
+
     const BlockInfoPadded<Kernel_traits::THREADS> binfo(params, bidb, bidh, tidx);
+    // if( binfo.stop_early() ) return;
     if( binfo.stop_early(loop_step_idx * Cta_tile_p::N) ) return;
 
-    Gemm1 gemm_q_k(smem_);
+    Gemm1 gemm_q_k(smem_, tidx);
+    // Allocate the global memory tile loader for Q.
+    Gmem_tile_q gmem_q(params.q_ptr, params.q_row_stride_in_elts, params.q_head_stride_in_elts,
+                       params.d, binfo, tidx, true);
+    // Allocate the global memory tile loader for O.
+    Gmem_tile_o gmem_o(params.o_ptr, params.o_row_stride_in_elts, params.o_head_stride_in_elts,
+                       params.d, binfo, tidx);
+    Gmem_tile_o_tmp gmem_o_tmp(params.o_tmp_ptr, params.o_tmp_row_stride_in_elts,
+                               params.o_tmp_head_stride_in_elts, params.d, binfo, tidx);
     // Allocate the global memory tile loader for S.
     Gmem_tile_s gmem_s(params, binfo, tidx);
     Gmem_softmax_sum gmem_softmax_lse(params.softmax_lse_ptr, params, tidx);
 
     // Wind gmem tiles to the correct position.
-    static_assert(Cta_tile_p::N % Cta_tile_p::M == 0, "");
-    const int begin_og = begin;
-    begin = Is_causal ? std::max(begin, loop_step_idx * Cta_tile_p::N / Cta_tile_p::M) : begin;
+    static_assert(Cta_tile_p::N % Cta_tile_p::M == 0);
+    int begin = Is_causal ? loop_step_idx * Cta_tile_p::N / Cta_tile_p::M : 0;
+    // We want begin to be a multiple of gridDim.z
+    // This is because the row indices processed by each threadblock must align between the
+    // loop steps, otherwise we have a dependency between the blocks.
+    // For example, threadblock with blockIdx.z == 1 must process row indices that are
+    // k * gridDim.z + 1 for integer k.
+    const int begin_mod_z = begin % gridDim.z;
+    begin = begin_mod_z <= blockIdx.z ? begin - begin_mod_z : begin + gridDim.z - begin_mod_z;
+    // Otherwise we'd be reading out-of-bound memory before the loop
+    if ((begin + blockIdx.z) * Cta_tile_p::M >= binfo.actual_seqlen_q) return;
     const int steps_og = steps;
-    steps -= begin - begin_og;
-    if (Return_softmax) { gmem_s.move(begin); }
-    gmem_softmax_lse.move(begin);
+    steps -= begin;
+    gmem_q.move(begin + blockIdx.z);
+    gmem_o.move(begin + blockIdx.z);
+    gmem_o_tmp.move(begin + blockIdx.z);
+    if (Return_softmax) {
+        gmem_s.move(begin + blockIdx.z);
+    }
+    gmem_softmax_lse.move(begin + blockIdx.z);
+    // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
+    //     printf("begin = %d, steps = %d\n", begin, steps);
+    // }
 
     fmha::Mask<Cta_tile_p, Is_causal> mask(binfo, tidx, loop_step_idx);
 
+    // Allocate the global memory tile loader for K.
+    Gmem_tile_k gmem_k(params.k_ptr, params.k_row_stride_in_elts, params.k_head_stride_in_elts,
+                       params.d, binfo, tidx, false);
+    // Allocate the global memory tile loader for V.
+    Gmem_tile_v gmem_v(params.v_ptr, params.v_row_stride_in_elts, params.v_head_stride_in_elts,
+                       params.d, binfo, tidx, false);
     // The base pointer of smem_v;
-    char *smem_v_addr = &smem_[Gemm1::SMEM_OFFSET_V];
+    char *smem_v_ = &smem_[Gemm1::SMEM_OFFSET_V];
 
     // Allocate the shared memory tile loader for V. We use the same as K so be careful!!!
-
-    SmemLayoutQ layout_Q = SmemLayoutQ::packed({ThreadblockShapeQK::kM, ThreadblockShapeQK::kK});
-    SmemIteratorQ smem_q({reinterpret_cast<Element *>(smem_), layout_Q}, tidx);
-    SmemLayoutK layout_K = SmemLayoutK::packed({ThreadblockShapeQK::kK, ThreadblockShapeQK::kN});
-    SmemIteratorK smem_k({reinterpret_cast<Element *>(smem_ + Kernel_traits::BYTES_PER_SMEM_Q), layout_K}, tidx);
-    SmemLayoutV layout_V = SmemLayoutV::packed({ThreadblockShapePV::kK, ThreadblockShapePV::kN});
-    // SmemIterator stores to smem and WarpIterator loads from smem
-    SmemIteratorV smem_v({reinterpret_cast<Element *>(smem_v_addr), layout_V}, tidx);
-    WarpIteratorV iter_V({reinterpret_cast<Element *>(smem_v_addr), layout_V}, threadIdx.x % 32);
+    Smem_tile_v smem_v(smem_v_, tidx);
 
     // Allocate the shared memory tile loader for O. We use the same as K so be careful!!!
-    using Smem_O = fmha::FMHAEpilogue<MmaCorePV>;
-    Smem_O smem_o(&smem_[Gemm1::SMEM_OFFSET_O], tidx);
-
-    // Allocate the global memory tile loader for Q.
-    // cutlass::transform::threadblock::PredicatedTileIterator deals with seqlen not divisible
-    // by 16 in a different way than we want. If the seqlen_q is 36, the first iteration would
-    // load 4 rows and the next two iterations would load 16 rows each. Instead we round the
-    // actual_seqlen_q to be multiple of 16, then change the mask in the last iteration, so
-    // that in this case we would load 16, 16, 4.
-    LayoutQ gmem_layout_Q(params.q_row_stride_in_elts);
-    typename GmemIteratorQ::Params gmem_Q_params(gmem_layout_Q);
-    const uint32_t row_offset_q = (binfo.sum_s_q + begin * ThreadblockShapeQK::kM) * params.q_row_stride_in_elts + binfo.bidh * params.q_head_stride_in_elts;
-    const int actual_seqlen_q = binfo.actual_seqlen_q - begin * ThreadblockShapeQK::kM;
-    const int seqlen_q_remainder = actual_seqlen_q % ThreadblockShapeQK::kM;
-    const int extent_q = ((actual_seqlen_q <= ThreadblockShapeQK::kM) || (seqlen_q_remainder == 0)) ? actual_seqlen_q : actual_seqlen_q + ThreadblockShapeQK::kM - seqlen_q_remainder;
-    GmemIteratorQ gmem_q(gmem_Q_params,
-                         reinterpret_cast<Element *>(params.q_ptr) + row_offset_q,
-                         {extent_q, params.d},
-                         tidx);
-
-    // Allocate the global memory tile loader for K.
-    LayoutK gmem_layout_K(params.k_row_stride_in_elts);
-    typename GmemIteratorK::Params gmem_K_params(gmem_layout_K);
-    const uint32_t row_offset_k = (binfo.sum_s_k + loop_step_idx * ThreadblockShapeQK::kN) * params.k_row_stride_in_elts + binfo.bidh * params.k_head_stride_in_elts;
-    const int extent_k = min(binfo.actual_seqlen_k - loop_step_idx * ThreadblockShapeQK::kN, ThreadblockShapeQK::kN);
-    GmemIteratorK gmem_k(gmem_K_params,
-                         reinterpret_cast<Element *>(params.k_ptr) + row_offset_k,
-                         {params.d, extent_k},
-                         tidx);
-
-    // Allocate the global memory tile loader for V.
-    LayoutV gmem_layout_V(params.v_row_stride_in_elts);
-    typename GmemIteratorV::Params gmem_V_params(gmem_layout_V);
-    const uint32_t row_offset_v = (binfo.sum_s_k + loop_step_idx * ThreadblockShapePV::kK) * params.v_row_stride_in_elts + binfo.bidh * params.v_head_stride_in_elts;
-    // extent_v is the same as extent_k
-    GmemIteratorV gmem_v(gmem_V_params,
-                         reinterpret_cast<Element *>(params.v_ptr) + row_offset_v,
-                         {extent_k, params.d},
-                         tidx);
-
-    // Allocate the global memory tile loader for O.
-    LayoutO gmem_layout_O(params.o_row_stride_in_elts);
-    typename GmemIteratorO::Params gmem_O_params(gmem_layout_O);
-    const uint32_t row_offset_o = (binfo.sum_s_q + begin * ThreadblockShapeQK::kM) * params.o_row_stride_in_elts + binfo.bidh * params.o_head_stride_in_elts;
-    GmemIteratorO gmem_o(gmem_O_params,
-                         reinterpret_cast<Element *>(params.o_ptr) + row_offset_o,
-                         {actual_seqlen_q, params.d},
-                         tidx);
-
-    typename GmemIteratorOAccum::Params gmem_Oaccum_params(gmem_layout_O);
-    GmemIteratorOAccum gmem_o_accum(gmem_Oaccum_params,
-                                    reinterpret_cast<ElementAccum *>(params.o_tmp_ptr) + row_offset_o,
-                                    {actual_seqlen_q, params.d},
-                                    tidx);
-
-    // Create the object to do the softmax.
-    Softmax softmax(params, &smem_[Gemm1::SMEM_OFFSET_SOFTMAX], tidx);
-
-    Smem_softmax_lse smem_softmax_lse(reinterpret_cast<float *>(&smem_[Gemm1::SMEM_BYTES]));
+    Smem_tile_o smem_o(&smem_[Gemm1::SMEM_OFFSET_O], tidx);
 
     if (!Is_first) {
+        gmem_k.move(loop_step_idx);
+        gmem_v.move(loop_step_idx);
         if (Return_softmax) { gmem_s.move(loop_step_idx * steps_og); }
     }
 
-    if (!Is_first) { __syncthreads(); }
-
-    // Trigger the loads for V.
-    typename GmemIteratorV::Fragment gmem_frag_v;
-    gmem_frag_v.clear();
-    gmem_v.load(gmem_frag_v);
-
+    // Trigger the loads for K.
+    gmem_k.load();
     // Trigger the loads for Q.
-    typename GmemIteratorQ::Fragment gmem_frag_q;
-    gmem_frag_q.clear();
-    gmem_q.load(gmem_frag_q);
+    gmem_q.load();
+    // Trigger the loads for V.
+    gmem_v.load();
 
-    // Trigger the loads for K.
-    typename GmemIteratorK::Fragment gmem_frag_k;
-    gmem_frag_k.clear();
-    gmem_k.load(gmem_frag_k);
+    if (!Is_first) { __syncthreads(); }
 
     float p_prev_lse[Mma_tile_p::MMAS_M * 2];
     if (!Is_first) {
@@ -410,12 +335,18 @@ inline __device__ void device_1xN_(const Params &params, const int bidb, const i
     }
 
     // Commit the data for Q and V to shared memory.
-    smem_v.store(gmem_frag_v);
-    smem_q.store(gmem_frag_q);
+    gmem_q.commit(gemm_q_k.smem_q);
+    gmem_v.commit(smem_v);
+
+    // const uint32_t scale_bmm1 = reinterpret_cast<const uint32_t&>(params.scale_bmm1);
+    // #pragma unroll
+    // for(int it=0;it < Gmem_tile_k::LDGS;it++){
+    //     gmem_k.fetch_[it] = fmha::hmul8(scale_bmm1, gmem_k.fetch_[it]);
+    // }
 
     // Commit the data for K to shared memory.
     if( !Kernel_traits::SHARE_SMEM_FOR_K_AND_V ) {
-        smem_k.store(gmem_frag_k);
+        gmem_k.commit(gemm_q_k.smem_k);
     }
 
     __syncthreads();
@@ -423,25 +354,20 @@ inline __device__ void device_1xN_(const Params &params, const int bidb, const i
     // Load the fragments for Q.
     gemm_q_k.load_q();
 
-    // Load the fragments for V. We keep the data in registers during the entire
-    // kernel. copied from mma_pipelined.h
-    const int warp_idx = threadIdx.x / 32;
-    iter_V.add_tile_offset({kIterationsPV * warp_idx, 0});
-    typename WarpIteratorV::Fragment frag_v[kIterationsPV];
-    static_assert(WarpIteratorV::Fragment::kStorageElements == 4 * Mma_tile_o::MMAS_N || WarpIteratorV::Fragment::kStorageElements == 2 * Mma_tile_o::MMAS_N, "");
+    // Load the fragments for V. We keep the data in registers during the entire kernel.
+    typename Smem_tile_v::Fragment frag_v[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_N];
     #pragma unroll
-    for( int ki = 0; ki < kIterationsPV; ++ki ) {
-        iter_V.load(frag_v[ki]);
-        ++iter_V;
+    for( int ki = 0; ki < Mma_tile_o::MMAS_K; ++ki ) {
+        smem_v.load(frag_v[ki], ki);
     }
 
-    // Commit the data for K to shared memory if it has not been done already.
+    // Commit the data for V to shared memory if it has not been done already.
     if( Kernel_traits::SHARE_SMEM_FOR_K_AND_V ) {
         // Make sure we are done loading the fragments for K.
         __syncthreads();
 
-        // Commit the data to shared memory for K.
-        smem_k.store(gmem_frag_k);
+        // Commit the data to shared memory for V.
+        gmem_k.commit(gemm_q_k.smem_k);
 
         // Make sure the data is in shared memory.
         __syncthreads();
@@ -450,43 +376,37 @@ inline __device__ void device_1xN_(const Params &params, const int bidb, const i
     // Load the fragments for K.
     gemm_q_k.load_k();
 
+    // Create the object to do the softmax.
+    Softmax softmax(params, &smem_[Gemm1::SMEM_OFFSET_SOFTMAX], tidx);
+
+    Smem_softmax_sum smem_softmax_lse(reinterpret_cast<float *>(&smem_[Gemm1::SMEM_BYTES]), tidx);
+
     // Load over the entire sequence length.
-    for( int l = 0; l < steps; l++ ) {
-        if((begin + l) * Cta_tile_p::M >= binfo.actual_seqlen_q) break;
+    for (int l = blockIdx.z; l < steps; l += step_stride) {
+        // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (blockIdx.z <= 1)) {
+        //     printf("l = %d\n", l);
+        // }
+        if ((begin + l) * Cta_tile_p::M >= binfo.actual_seqlen_q) break;
 
         // Declare the accumulators for the 1st gemm.
-        WarpMmaQK mma_qk;
-        typename WarpMmaQK::FragmentC acc_p;
-        acc_p.clear();
+        fmha::Fragment_accumulator acc_p[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N];
+        fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_p::WARPS_K>::apply(acc_p);
 
         // Do this part of P = Q * K^T.
-        gemm_q_k(mma_qk, acc_p);
+        gemm_q_k(acc_p);
 
-        typename Smem_O::OutputFragment out[Smem_O::kIterationsStore];
-        static_assert(GmemIteratorOAccum::kIterations == Smem_O::kIterationsStore, "");
-        static_assert(GmemIteratorO::kIterations == Smem_O::kIterationsStore, "");
-        if (!Is_first) {
-            #pragma unroll
-            for (int iter = 0; iter < GmemIteratorOAccum::kIterations; ++iter) {
-                gmem_o_accum.load(out[iter]);
-                gmem_o_accum.move();
-            }
-        }
+        // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0))  {
+        //     printf("acc_p=%.6f, %.6f\n", acc_p[0][0].elt(0), acc_p[0][0].elt(1));
+        // }
+
+        uint4 out[Gmem_tile_o::STGS_PER_LOOP];
+        if (!Is_first) { gmem_o_tmp.load(out, 0); }
 
         // Trigger the load for the next Q values.
-        if( l < steps - 1) {
-            ++gmem_q;
-            // If actual_seqlen_q is not a multiple of 16, we change the mask in the last iteration
-            // to load the "residue" tile.
-            if ((l + 1 == steps - 1) && (actual_seqlen_q % ThreadblockShapeQK::kM != 0)) {
-                // TODO: this probably only works for head_dim = 64 and head_dim = 128, which is
-                // what we have right now. Maybe for head_dim = 32 or 96, this could be different.
-                const int row_idx = tidx / (GmemIteratorQ::Shape::kColumn / GmemIteratorQ::Fragment::kElements);
-                if (row_idx >= actual_seqlen_q - (l + 1) * ThreadblockShapeQK::kM) {
-                    gmem_q.clear_mask();
-                }
-            }
-            gmem_q.load(gmem_frag_q);
+        if (l + step_stride < steps) {
+            gemm_q_k.smem_q.move_to_next_write_buffer();
+            gmem_q.move(step_stride);
+            gmem_q.load();
         }
 
         // Load the mask for that iteration.
@@ -498,187 +418,245 @@ inline __device__ void device_1xN_(const Params &params, const int bidb, const i
         // Apply the mask.
         softmax.apply_mask(mask);
 
-        if( Kernel_traits::SHARE_SMEM_FOR_K_AND_V && l == 0 ) {
+        if( Kernel_traits::SHARE_SMEM_FOR_K_AND_V && l < step_stride ) {
             // if we share K and V, it could be that V was not fully read yet but we write into smem for reduction
             __syncthreads();
         }
-
+        // if (!Is_first) {
+        //     if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l >= 0))  {
+        //         printf("p_prev_lse=%.6f, %.6f\n", p_prev_lse[0], p_prev_lse[1]);
+        //     }
+        // }
         // Compute the max.
         float p_max[Mma_tile_p::MMAS_M * 2];
         if (!Is_first) {
             smem_softmax_lse.store_pair(p_prev_lse);
-            for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { p_max[mi] = p_prev_lse[mi] / params.scale_bmm1; }
+            // for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { p_max[mi] = p_prev_lse[mi]; }
+            for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { p_max[mi] = p_prev_lse[mi] / params.scale_bmm1f; }
         }
 
         // Trigger the load for the next LSE values.
-        if( l < steps - 1) {
+        if (l + step_stride < steps) {
             if (!Is_first) {
-                gmem_softmax_lse.load_next(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
+                gmem_softmax_lse.load_next(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse),
+                                           step_stride);
             }
         }
 
         softmax.template reduce_max</*zero_init=*/Is_first>(p_max);
 
+        // if ((threadIdx.x == 0) && (l == 38)) {
+        //     printf("loop_step_idx %d, p_max = %.6f, %.6f., p_prev_lse = %.6f, %.6f\n", loop_step_idx, p_max[0], p_max[1], Is_first ? -10000.f : p_prev_lse[0], Is_first ? -10000.f : p_prev_lse[1]);
+        // }
+
+        // if (!Is_first) {
+        //     if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0))  {
+        //         printf("after reduce_max=%.6f, %.6f\n", softmax.elt_[0][0], softmax.elt_[0][1]);
+        //     }
+        // }
+
         // Compute the exponential value.
-        softmax.scale_apply_exp(p_max, params.scale_bmm1);
+        // softmax.apply_exp(p_max);
+        softmax.scale_apply_exp(p_max, params.scale_bmm1f);
+
+        // if (!Is_first) {
+        //     if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0))  {
+        //         printf("after apply_exp=%.6f, %.6f\n", softmax.elt_[0][0], softmax.elt_[0][1]);
+        //     }
+        // }
 
-        // We don't finalize the sum reduction here, as that would incur an extra sync_threads().
-        // Instead, we reduce the sum from each warp, write to smem, then wait until the sync_threads()
-        // from storing acc_o. Then we read the sum of each warp from smem and finalize the reduction.
-        // As a consequence, we don't scale acc_p by the inverse sum, we scale the output by the inverse sum.
         // Compute the sum.
         float p_sum[Mma_tile_p::MMAS_M * 2];
+        // if (!Is_first) {
+        //     int warp = tidx / Cta_tile_p::THREADS_PER_WARP;
+        //     int lane = tidx % Cta_tile_p::THREADS_PER_WARP;
+        //     for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) {
+        //         p_sum[mi] = ((warp == 0) && (lane % 4 == 0)) ? expf(p_prev_lse[mi] - p_max[mi]) : 0;
+        //     }
+        // }
         // softmax.reduce_sum(p_sum);
         softmax.reduce_sum_before_sync_(p_sum);
+        // softmax.template reduce_sum_before_sync_</*zero_init=*/Is_first>(p_sum);
+
+        // float p_sum_log[Mma_tile_p::MMAS_M * 2];
+        // for (int mi = 0; mi  < Mma_tile_p::MMAS_M * 2; ++mi) {
+        //     float sum = p_sum[mi];
+        //     // p_sum_log[mi] = (sum == 0.f || sum != sum) ? INFINITY : p_max[mi] + __logf(sum);
+        //     constexpr float kLog2e = M_LOG2E;
+        //     p_sum_log[mi] = (sum == 0.f || sum != sum) ? INFINITY : p_max[mi] * kLog2e + __log2f(sum);
+        // }
+        // // gmem_softmax_lse.store(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_sum));
+        // gmem_softmax_lse.store(reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_sum_log));
+        // gmem_softmax_lse.move();
+
+        // // Finalize softmax on the accumulators of P^T.
+        // softmax.scale(p_sum);
 
         constexpr bool encode_dropout_in_sign_bit = Return_softmax;
         if (Is_dropout) {
-            softmax.template apply_dropout_16bits<encode_dropout_in_sign_bit>(ph0, ph1, params.p_dropout_in_uint16_t);
+            // softmax.template apply_dropout<encode_dropout_in_sign_bit>(ph, params.p_dropout_in_uint);
+            // softmax.template apply_dropout<encode_dropout_in_sign_bit>(ph, ph1, params.p_dropout_in_uint);
+            // softmax.template apply_dropout_16bits<encode_dropout_in_sign_bit>(ph, ph1, params.p_dropout_in_uint16_t);
+            unsigned int warp_idx = threadIdx.x / 32;
+            // TODO: this should change after we rearrange the warps (e.g. cutlass branch)
+            unsigned int block_col_idx = loop_step_idx * Cta_tile_p::N / 16 + warp_idx;
+            // We want to use actual_seqlen_k, not seqlen_k, since seqlen_k could be rounded
+            // differently in the fwd and bwd pass. E.g., for d=128 on A100, fwd rounds seqlen_k
+            // to multiples of 256 while bwd rounds seqlen_k to multiples of 128.
+            unsigned long long philox_subsequence = (begin + l) * (binfo.actual_seqlen_k / 16) + block_col_idx;
+            softmax.template apply_dropout_16bits<encode_dropout_in_sign_bit>(ph, params.p_dropout_in_uint16_t, philox_subsequence);
         }
 
-        static_assert(Mma_tile_o::MMAS_M == Mma_tile_p::MMAS_M, "");
-        static_assert(Mma_tile_o::MMAS_K == Mma_tile_p::MMAS_N, "");
-        softmax.pack_noconvert(acc_p);
-        cutlass::NumericArrayConverter<Element, ElementAccum, decltype(acc_p)::kElements, cutlass::FloatRoundStyle::round_to_nearest> convert_p;
-        auto frag_p = convert_p(acc_p);
-
+        using Frag_p = fmha::Fragment_a<fmha::Row>;
+        Frag_p frag_p[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_M];
+        static_assert(Mma_tile_o::MMAS_M == Mma_tile_p::MMAS_M);
+        static_assert(Mma_tile_o::MMAS_K == Mma_tile_p::MMAS_N);
+        softmax.template pack<elem_type>(frag_p);
         if (Return_softmax) {
-            gmem_s.store(reinterpret_cast<const cutlass::Array<Element, 8>(&)[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_M]>(frag_p), mask);
-            gmem_s.move();
+            gmem_s.store(frag_p, mask);
+            gmem_s.move(step_stride);
         }
 
         // Commit the values for Q into shared memory.
-        if (l < steps - 1) { smem_q.store(gmem_frag_q); }
+        if (l + step_stride < steps) {
+            gmem_q.commit(gemm_q_k.smem_q);
+        }
 
         if (Is_dropout && encode_dropout_in_sign_bit) {
-            cutlass::epilogue::thread::ReLu<decltype(frag_p)> relu;
-            frag_p = relu(frag_p);
+            #pragma unroll
+            for( int ki = 0; ki < Mma_tile_o::MMAS_K; ki++ ) {
+                #pragma unroll
+                for( int mi = 0; mi < Mma_tile_o::MMAS_M; mi++ ) {
+                    frag_p[ki][mi].template hrelu_<elem_type>();
+                }
+            }
         }
 
         // Declare the accumulators for the 2nd gemm.
-        WarpMmaPV mma_pv;
-        typename WarpMmaPV::FragmentC acc_o;
-        static_assert(WarpMmaPV::FragmentC::kElements == Mma_tile_o::MMAS_M * Mma_tile_o::MMAS_N * 8, "");
-        acc_o.clear();
-
-        // For some reason, WarpMmaPV::FragmentA has length K * N * (8|4) instead of just N * (8|4).
-        // We have to first cast frag_p to be array of k x (N * (8|4)), then cast each row to be
-        // an array of WarpMmaPV::FragmentA (which is what mma_pv expects).
-        static_assert(decltype(frag_p)::kElements == kIterationsPV * Mma_tile_o::MMAS_M * WarpMmaPV::FragmentA::kElements, "");
-        const auto frag_p_reshaped = reinterpret_cast<const cutlass::Array<Element, WarpMmaPV::FragmentA::kElements> (&)[kIterationsPV]>(frag_p);
+        fmha::Fragment_accumulator acc_o[Mma_tile_o::MMAS_M][Mma_tile_o::MMAS_N];
+        fmha::Clear_accumulator<typename fmha::Accumulator_type, Cta_tile_o::WARPS_K>::apply(acc_o);
+
+        // Do this part of O = P^T * V^T.
         #pragma unroll
-        for( int ki = 0; ki < kIterationsPV; ++ki ) {
-            mma_pv(acc_o, reinterpret_cast<const typename WarpMmaPV::FragmentA(&)>(frag_p_reshaped[ki]), frag_v[ki], acc_o);
+        for( int ki = 0; ki < Mma_tile_o::MMAS_K; ++ki ) {
+            fmha::gemm_cl<elem_type>(acc_o, frag_p[ki], frag_v[ki]);
+            // if ((threadIdx.x == 4) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0))  {
+            //     float2 tmp_p = __half22float2(reinterpret_cast<__half2 &>(frag_p[ki]));
+            //     float2 tmp_v = __half22float2(reinterpret_cast<__half2 &>(frag_v[ki]));
+            //     printf("Per warp, threadIdx.x = %d, frag_p = %.6f, %.6f, frag_v = %.6f, %.6f, acc_o=%.6f\n", threadIdx.x, tmp_p.x, tmp_p.y, tmp_v.x, tmp_v.y, acc_o[0][0].elt(0));
+            // }
         }
-        // Swizzle the elements and do the final reduction.
-        smem_o.store(acc_o);
+
+        // if ((threadIdx.x % 32 == 16) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0))  {
+        //     printf("Per warp, threadIdx.x = %d, acc_o=%.6f\n", threadIdx.x, acc_o[0][2].elt(0));
+        // }
 
         // The mapping from tidx to rows changes between the softmax and the
         // O-reduction. So we recalculate the max.
-        using OutputTileThreadMap = typename Smem_O::OutputTileThreadMap;
-        constexpr int kOutputRowsPerThread = OutputTileThreadMap::Iterations::kRow * Smem_O::kIterationsStore;
-        float p_max_o[kOutputRowsPerThread][Mma_tile_o::MMAS_M];
-        int rows[kOutputRowsPerThread];
-        cutlass::MatrixCoord output_thread_offset = OutputTileThreadMap::initial_offset(tidx);
-        const int output_thread_start_row = output_thread_offset.row();
-        const int output_thread_start_column = output_thread_offset.column();
-        for (int iter = 0; iter < Smem_O::kIterationsStore; ++iter) {
-            for (int row = 0; row < OutputTileThreadMap::Iterations::kRow; ++row) {
-                rows[iter * OutputTileThreadMap::Iterations::kRow + row] = output_thread_start_row + iter * OutputTileThreadMap::Shape::kRow + row;
-            }
+        float p_max_o[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M];
+        int rows[Gmem_tile_o::STGS_PER_LOOP];
+        for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
+            rows[jj] = tidx / Gmem_tile_o::THREADS_PER_ROW + jj * Gmem_tile_o::ROWS_PER_STG;
         }
-
         softmax.reduce_max_after_sync_(p_max_o, rows);
-        static_assert(Mma_tile_o::MMAS_M == 1, "");
-        for (int jj = 0; jj < kOutputRowsPerThread; jj++) {
-            p_max_o[jj][0] *= params.scale_bmm1;
+        static_assert(Mma_tile_o::MMAS_M == 1);
+        for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
+            p_max_o[jj][0] *= params.scale_bmm1f;
         }
-        float p_prev_scale_o[kOutputRowsPerThread];
+        float p_prev_scale_o[Gmem_tile_o::STGS_PER_LOOP];
         if (!Is_first) {
             smem_softmax_lse.load(p_prev_scale_o, rows);
         }
+        // if (!Is_first) {
+        //     if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0))  {
+        //         printf("p_prev_scale_o=%.6f\n", p_prev_scale_o[0]);
+        //     }
+        // }
+
+        static_assert(Gmem_tile_o::LOOPS == 1);
+
+        // Swizzle the elements and do the final reduction.
+        smem_o.store(acc_o, 0);
 
         // Make sure the data is in shared memory.
         __syncthreads();
 
-        static_assert(Mma_tile_o::MMAS_M == 1, "");
-        float p_sum_o[kOutputRowsPerThread][Mma_tile_o::MMAS_M];
+        static_assert(Mma_tile_o::MMAS_M == 1);
+        float p_sum_o[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M];
         softmax.reduce_sum_after_sync_(p_sum_o, rows);
         if (!Is_first) {
-            for (int jj = 0; jj < kOutputRowsPerThread; jj++) {
+            for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
                 p_prev_scale_o[jj] = expf(p_prev_scale_o[jj] - p_max_o[jj][0]);
                 p_sum_o[jj][0] += p_prev_scale_o[jj];
             }
         }
 
-        float p_sum_log[kOutputRowsPerThread][Mma_tile_o::MMAS_M];
+        float p_sum_log[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M];
         #pragma unroll
-        for (int jj = 0; jj < kOutputRowsPerThread; jj++) {
+        for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
             float sum = p_sum_o[jj][0];
             p_sum_log[jj][0] = (sum == 0.f || sum != sum) ? -INFINITY : p_max_o[jj][0] + __logf(sum);
-            if (output_thread_start_column == 0) {
+            // if (sum == 0.f || sum != sum) {
+            //     printf("loop_step_idx = %d, l = %d, tidx = %d, sum = %.6f, p_max_o = %.6f\n", loop_step_idx, l, tidx, sum, p_max_o[jj][0]);
+            // }
+            // if (Is_first) {
+            //     if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l == 0))  {
+            //         printf("p_sum_log=%.6f\n", p_sum_log[jj][0]);
+            //     }
+            // }
+            if (tidx % Gmem_tile_o::THREADS_PER_ROW == 0) {
                 gmem_softmax_lse.store_row(
                     reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M]>(p_sum_log[jj]), rows[jj]);
             }
         }
-        gmem_softmax_lse.move();
+        gmem_softmax_lse.move(step_stride);
 
         // Load from shared memory.
-        using ArrayTypeO = cutlass::Array<ElementAccum, OutputTileThreadMap::kElementsPerAccess>;
-        static_assert(OutputTileThreadMap::kElementsPerAccess * kOutputRowsPerThread == Smem_O::kIterationsStore * Smem_O::OutputFragment::kElements, "");
-        cutlass::multiplies<ArrayTypeO> multiply_fragments;
         if (!Is_first) {
-            auto out_reshaped = reinterpret_cast<ArrayTypeO (&)[kOutputRowsPerThread]>(out);
-            for (int jj = 0; jj < kOutputRowsPerThread; jj++) {
-                out_reshaped[jj] = multiply_fragments(out_reshaped[jj], p_prev_scale_o[jj]);
+            for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
+                out[jj] = fmha::fmul4(out[jj], p_prev_scale_o[jj]);
             }
         }
-        smem_o.template load</*zero_init=*/Is_first>(out, tidx);
+        smem_o.template load</*zero_init=*/Is_first>(out);
 
         const bool is_final_write =
             Is_last
             || ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen_k)
             || ((Is_causal) && ((begin + l) * Cta_tile_p::M < (loop_step_idx + 1) * Cta_tile_p::N));
-        auto out_reshaped = reinterpret_cast<ArrayTypeO (&)[kOutputRowsPerThread]>(out);
         #pragma unroll
-        for (int jj = 0; jj < kOutputRowsPerThread; jj++) {
+        for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
             float sum = p_sum_o[jj][0];
             float inv_sum = (sum == 0.f || sum != sum) ? 1.f : 1.f / sum;
             if (Is_dropout && is_final_write) {
                 inv_sum *= params.rp_dropout;
             }
-            out_reshaped[jj] = multiply_fragments(out_reshaped[jj], inv_sum);
+            out[jj] = fmha::fmul4(out[jj], inv_sum);
         }
 
+        // if (Is_dropout && Is_last) {
+        //     for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) {
+        //         out[jj] = fmha::fmul4(out[jj], params.rp_dropout);
+        //     }
+        // }
+
         // Output the values.
         if (is_final_write) {
-            typename GmemIteratorO::Fragment out_converted;
-            cutlass::NumericArrayConverter<Element, ElementAccum, decltype(out_converted)::kElements, cutlass::FloatRoundStyle::round_to_nearest> convert_o;
-            #pragma unroll
-            for (int iter = 0; iter < GmemIteratorO::kIterations; ++iter) {
-                out_converted = convert_o(out[iter]);
-                gmem_o.store(out_converted);
-                gmem_o.move();
-            }
-            // We also need to move gmem_o_accum. For example, if Is_causal=true and seqlen=512,
-            // in the first loop, we write the first 256 rows to gmem_o and the last 256 rows to gmem_o_accum.
-            if (Is_first && !Is_last) { gmem_o_accum.move(GmemIteratorOAccum::kIterations); }
+            gmem_o.template store<elem_type>(out, 0);
+            gmem_o.move(step_stride);
         } else {
-            if (!Is_first) { gmem_o_accum.move(-GmemIteratorOAccum::kIterations); }
-            #pragma unroll
-            for (int iter = 0; iter < GmemIteratorOAccum::kIterations; ++iter) {
-                gmem_o_accum.store(out[iter]);
-                gmem_o_accum.move();
-            }
+            gmem_o_tmp.store(out, 0);
         }
 
+        // Move to the next part of the output.
+        if (!(Is_first && Is_last)) { gmem_o_tmp.move(step_stride); }
         gemm_q_k.reload_k();
 
+        // Make sure we are reading from the correct buffer.
+        gemm_q_k.smem_q.move_to_next_read_buffer();
         // Trigger the load from shared memory for the next series of Q values.
-        if(l < steps - 1) {
+        if (l + step_stride < steps) {
             gemm_q_k.reload_q();
         }
-
     }  // Outer loop over the sequence length.
 }
 
@@ -694,26 +672,28 @@ inline __device__ void device_1xN_loop(const Params &params) {
     // The thread index.
     const int tidx = threadIdx.x;
 
-    const int tidx_global = (bidb * params.h + bidh) * blockDim.x * 2 + tidx;
+    // We want the fwd and bwd to generate the same dropout pattern (RNG), without restricting
+    // them to have the same number of threads or have to traverse the attention matrix
+    // in the same order.
+    // In the Philox RNG, we use the offset to store the batch, head, and the lane id
+    // (within a warp). We use the subsequence to store the location of the 16 x 16 blocks within
+    // the attention matrix. This way, as long as we have the batch, head, and the location of
+    // the 16 x 16 block within the attention matrix, we can generate the exact same dropout pattern.
     auto seeds = at::cuda::philox::unpack(params.philox_args);
-    // We use 2 Philox generators to match the dropout pattern in the backward pass.
-    // Forward pass uses 128 threads while backward pass uses 256 threads, so each thread
-    // in the forward pass is simulating the droout pattern of 2 threads in the backward pass.
-    Philox ph0(std::get<0>(seeds), tidx_global, std::get<1>(seeds));
-    Philox ph1(std::get<0>(seeds), tidx_global + blockDim.x, std::get<1>(seeds));
+    Philox ph(std::get<0>(seeds), 0, std::get<1>(seeds) + (bidb * params.h + bidh) * 32 + tidx % 32);
     constexpr int M = Kernel_traits::Cta_tile_p::M;
     const int STEPS = (params.seqlen_q + M - 1) / M;
 
     constexpr int blocksize_c = Kernel_traits::Cta_tile_p::N;
     if (params.seqlen_k == blocksize_c) {
-        fmha::device_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, true, true>(params, bidb, bidh, 0, STEPS, ph0, ph1, 0);
+        fmha::device_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, true, true>(params, bidb, bidh, STEPS, ph, 0);
     } else {
         const int max_loop_steps = (params.seqlen_k + blocksize_c - 1) / blocksize_c;
-        fmha::device_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, true, false>(params, bidb, bidh, 0, STEPS, ph0, ph1, 0);
+        fmha::device_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, true, false>(params, bidb, bidh, STEPS, ph, 0);
         for (int loop_step_idx = 1; loop_step_idx < max_loop_steps - 1; loop_step_idx++) {
-            fmha::device_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, false, false>(params, bidb, bidh, 0, STEPS, ph0, ph1, loop_step_idx);
+            fmha::device_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, false, false>(params, bidb, bidh, STEPS, ph, loop_step_idx);
         }
-        fmha::device_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, false, true>(params, bidb, bidh, 0, STEPS, ph0, ph1, max_loop_steps - 1);
+        fmha::device_1xN_<Kernel_traits, Is_dropout, Is_causal, Return_softmax, false, true>(params, bidb, bidh, STEPS, ph, max_loop_steps - 1);
     }
 }
 
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fprop_kernel_dispatch.cu b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fprop_kernel_dispatch.cu
deleted file mode 100644
index 7748a779a82a..000000000000
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fprop_kernel_dispatch.cu
+++ /dev/null
@@ -1,134 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-#include <ATen/native/transformers/cuda/flash_attn/philox.cuh>
-#include <ATen/native/transformers/cuda/flash_attn/static_switch.h>
-#include <ATen/native/transformers/cuda/flash_attn/fmha.h>
-#include <ATen/native/transformers/cuda/flash_attn/kernel_traits.h>
-#include <ATen/native/transformers/cuda/flash_attn/fmha_fprop_kernel_1xN.h>
-
-template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Return_softmax>
-__global__ void fmha_fprop_loop_kernel(FMHA_fprop_params params) {
-    fmha::device_1xN_loop<Kernel_traits, Is_dropout, Is_causal, Return_softmax>(params);
-}
-
-template<typename Kernel_traits>
-void run_fmha_loop_(Launch_params<FMHA_fprop_params> &launch_params,
-                    const bool configure) {
-    constexpr int blocksize_c = Kernel_traits::Cta_tile_p::N;
-    const int loop_steps = (launch_params.params.seqlen_k + blocksize_c - 1) / blocksize_c;
-
-    if (configure) {
-        using Mma_tile_p = fmha::Hmma_tile<typename Kernel_traits::Cta_tile_p>;
-        constexpr int M = Kernel_traits::Cta_tile_p::M;
-        size_t STEPS = (launch_params.params.seqlen_q + M - 1) / M;
-        constexpr size_t MMAS_M = Mma_tile_p::MMAS_M;
-        constexpr size_t MMAS_N = Mma_tile_p::MMAS_N;
-        size_t elts_per_head = STEPS * MMAS_M * MMAS_N * 8 * loop_steps;
-        launch_params.elts_per_thread = elts_per_head;
-        return;
-    }
-
-    constexpr size_t smem_size_softmax_lse = Kernel_traits::Smem_softmax_lse::BYTES_PER_TILE;
-    // Don't need smem_size_softmax_lse if we're not looping
-    const size_t smem_size = fmha::get_dynamic_smem_size<Kernel_traits>()
-        + (loop_steps > 1 ? smem_size_softmax_lse : 0);
-    // printf("smem_size = %d\n", smem_size);
-
-    // Work-around for gcc 7. It doesn't like nested BOOL_SWITCH.
-    // https://github.com/kokkos/kokkos-kernels/issues/349
-    // https://github.com/HazyResearch/flash-attention/issues/21
-    BOOL_SWITCH(launch_params.is_dropout, IsDropoutConst, [&] {
-        auto kernel = launch_params.params.is_causal
-            ? (launch_params.return_softmax
-               ? &fmha_fprop_loop_kernel<Kernel_traits, IsDropoutConst, true, true>
-               : &fmha_fprop_loop_kernel<Kernel_traits, IsDropoutConst, true, false>)
-            : (launch_params.return_softmax
-               ? &fmha_fprop_loop_kernel<Kernel_traits, IsDropoutConst, false, true>
-               : &fmha_fprop_loop_kernel<Kernel_traits, IsDropoutConst, false, false>);
-        // constexpr bool IsDropoutConstTmp = false;
-        // auto kernel = launch_params.params.is_causal
-        //     ? (launch_params.return_softmax
-        //        ? &fmha_fprop_loop_kernel<Kernel_traits, IsDropoutConstTmp, true, true>
-        //        : &fmha_fprop_loop_kernel<Kernel_traits, IsDropoutConstTmp, true, false>)
-        //     : (launch_params.return_softmax
-        //        ? &fmha_fprop_loop_kernel<Kernel_traits, IsDropoutConstTmp, false, true>
-        //        : &fmha_fprop_loop_kernel<Kernel_traits, IsDropoutConstTmp, false, false>);
-        if( smem_size >= 48L  * 1024 ) {
-            FMHA_CHECK_CUDA(cudaFuncSetAttribute(
-                kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
-        }
-        dim3 grid(launch_params.params.b, launch_params.params.h);
-        kernel<<<grid, Kernel_traits::THREADS, smem_size, launch_params.stream>>>(
-            launch_params.params);
-        FMHA_CHECK_CUDA(cudaPeekAtLastError());
-    });
-}
-
-TORCH_API void run_fmha_fprop(Launch_params<FMHA_fprop_params> &launch_params,
-                    const bool configure) {
-    BOOL_SWITCH(launch_params.params.is_bf16, IsBf16Const, [&] {
-        using elem_type = std::conditional<IsBf16Const, cutlass::bfloat16_t, cutlass::half_t>::type;
-        auto dprops = at::cuda::getCurrentDeviceProperties();
-        if (launch_params.params.d <= 64) {
-            if( launch_params.params.seqlen_k == 128 ) {
-                // TD [2022-08-20]: One might expect that not sharing the smem between K & V
-                // could be faster, but seems like it's the same speed.
-                using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 4, 0x08u, elem_type>;
-                run_fmha_loop_<Kernel_traits>(launch_params, configure);
-            } else if( launch_params.params.seqlen_k >= 256 ) {
-                if (dprops->major == 8 && dprops->minor >= 0) {
-                    using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 4, 0x08u, elem_type>;
-                    run_fmha_loop_<Kernel_traits>(launch_params, configure);
-                } else if (dprops->major == 7 && dprops->minor == 5) {
-                    if (launch_params.is_dropout) { // Need to use the same block size as backward
-                        using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 4, 0x08u, elem_type>;
-                        run_fmha_loop_<Kernel_traits>(launch_params, configure);
-                    } else {
-                        using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 4, 0x08u, elem_type>;
-                        run_fmha_loop_<Kernel_traits>(launch_params, configure);
-                    }
-                }
-            }
-        } else if (launch_params.params.d <= 128) {
-            if( launch_params.params.seqlen_k == 128 ) {
-                using Kernel_traits = FMHA_kernel_traits<128, 128, 16, 1, 4, 0x08u, elem_type>;
-                run_fmha_loop_<Kernel_traits>(launch_params, configure);
-            } else {
-                if (dprops->major == 8 && dprops->minor == 0 && !launch_params.is_dropout) {
-                    // TD [2022-06-05] Keep K in smem to reduce register spilling
-                    // Gives about 6% speedup compared to using block size 128.
-                    using Kernel_traits = FMHA_kernel_traits<256, 128, 16, 1, 4, 0x18u, elem_type>;
-                    // using Kernel_traits = FMHA_kernel_traits<128, 128, 16, 1, 4, 0x08u, elem_type>;
-                    run_fmha_loop_<Kernel_traits>(launch_params, configure);
-                } else {  // Need to use the same block size as backward
-                    using Kernel_traits = FMHA_kernel_traits<128, 128, 16, 1, 4, 0x08u, elem_type>;
-                    run_fmha_loop_<Kernel_traits>(launch_params, configure);
-                }
-            }
-        }
-    });
-}
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fwd_hdim128.cu b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fwd_hdim128.cu
new file mode 100644
index 000000000000..281f8630d4a4
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fwd_hdim128.cu
@@ -0,0 +1,12 @@
+// Copyright (c) 2022, Tri Dao.
+
+// Splitting the different head dimensions to different files to speed up compilation.
+
+#include <ATen/native/transformers/cuda/flash_attn/fmha_fwd_launch_template.h>
+
+void run_fmha_fwd_hdim128(Launch_params<FMHA_fprop_params> &launch_params) {
+    FP16_SWITCH(launch_params.params.is_bf16, ([&] {
+        using Kernel_traits = FMHA_kernel_traits<128, 128, 16, 1, 4, 0x08u, elem_type>;
+        run_fmha_fwd_loop<Kernel_traits>(launch_params);
+    }));
+}
\ No newline at end of file
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fwd_hdim32.cu b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fwd_hdim32.cu
new file mode 100644
index 000000000000..44181ee2de08
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fwd_hdim32.cu
@@ -0,0 +1,17 @@
+// Copyright (c) 2022, Tri Dao.
+
+// Splitting the different head dimensions to different files to speed up compilation.
+
+#include <ATen/native/transformers/cuda/flash_attn/fmha_fwd_launch_template.h>
+
+void run_fmha_fwd_hdim32(Launch_params<FMHA_fprop_params> &launch_params) {
+    FP16_SWITCH(launch_params.params.is_bf16, ([&] {
+        if (launch_params.params.seqlen_k == 128) {
+            using Kernel_traits = FMHA_kernel_traits<128, 32, 16, 1, 4, 0x08u, elem_type>;
+            run_fmha_fwd_loop<Kernel_traits>(launch_params);
+        } else if (launch_params.params.seqlen_k >= 256) {
+            using Kernel_traits = FMHA_kernel_traits<256, 32, 16, 1, 4, 0x08u, elem_type>;
+            run_fmha_fwd_loop<Kernel_traits>(launch_params);
+        }
+    }));
+}
\ No newline at end of file
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fwd_hdim64.cu b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fwd_hdim64.cu
new file mode 100644
index 000000000000..683085ed530a
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fwd_hdim64.cu
@@ -0,0 +1,17 @@
+// Copyright (c) 2022, Tri Dao.
+
+// Splitting the different head dimensions to different files to speed up compilation.
+
+#include <ATen/native/transformers/cuda/flash_attn/fmha_fwd_launch_template.h>
+
+void run_fmha_fwd_hdim64(Launch_params<FMHA_fprop_params> &launch_params) {
+    FP16_SWITCH(launch_params.params.is_bf16, ([&] {
+        if (launch_params.params.seqlen_k == 128) {
+            using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 4, 0x08u, elem_type>;
+            run_fmha_fwd_loop<Kernel_traits>(launch_params);
+        } else if (launch_params.params.seqlen_k >= 256) {
+            using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 4, 0x08u, elem_type>;
+            run_fmha_fwd_loop<Kernel_traits>(launch_params);
+        }
+    }));
+}
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fwd_launch_template.h b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fwd_launch_template.h
new file mode 100644
index 000000000000..dc98732131e2
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_fwd_launch_template.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2022, Tri Dao.
+
+#pragma once
+
+#include <vector>
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <ATen/native/transformers/cuda/flash_attn/fmha.h>
+#include <ATen/native/transformers/cuda/flash_attn/static_switch.h>
+#include <ATen/native/transformers/cuda/flash_attn/fmha_fprop_kernel_1xN.h>
+
+// Find the number of splits that maximizes the occupancy. For example, if we have
+// batch * n_heads = 48 and we have 108 SMs, having 2 splits (efficiency = 0.89) is
+// better than having 3 splits (efficiency = 0.67). However, we also don't want too many
+// splits as that would incur more HBM reads/writes.
+// So we find the best efficiency, then find the smallest number of splits that gets 95%
+// of the best efficiency.
+// [2022-11-25] TD: Mark this as "inline" otherwise we get "multiple definition" error.
+inline int num_splits_heuristic_fwd(int batch_nheads, int num_SMs, int ctas_per_sm, int max_splits) {
+    float max_efficiency = 0.f;
+    std::vector<float> efficiency;
+    efficiency.reserve(max_splits);
+    for (int num_splits = 1; num_splits <= max_splits; num_splits++) {
+        float n_waves = float(batch_nheads * num_splits) / (num_SMs * ctas_per_sm);
+        float eff = n_waves / ceil(n_waves);
+        // printf("num_splits = %d, eff = %f\n", num_splits, eff);
+        if (eff > max_efficiency) { max_efficiency = eff; }
+        efficiency.push_back(eff);
+    }
+    for (int num_splits = 1; num_splits <= max_splits; num_splits++) {
+        if (efficiency[num_splits - 1] > 0.95 * max_efficiency) {
+            // printf("num_splits chosen = %d\n", num_splits);
+            return num_splits;
+        }
+    }
+    return 1;
+}
+
+template<typename Kernel_traits, bool Is_dropout, bool Is_causal, bool Return_softmax>
+__global__ void fmha_fwd_loop_kernel(FMHA_fprop_params params) {
+    fmha::device_1xN_loop<Kernel_traits, Is_dropout, Is_causal, Return_softmax>(params);
+}
+
+template<typename Kernel_traits>
+void run_fmha_fwd_loop(Launch_params<FMHA_fprop_params> &launch_params) {
+    constexpr int blocksize_c = Kernel_traits::Cta_tile_p::N;
+    const int loop_steps = (launch_params.params.seqlen_k + blocksize_c - 1) / blocksize_c;
+
+    constexpr int smem_size_softmax_lse = Kernel_traits::Smem_dp_sum::BYTES_PER_TILE;
+    // Don't need smem_size_softmax_lse if we're not looping
+    const int smem_size = fmha::get_dynamic_smem_size<Kernel_traits>()
+        + (loop_steps > 1 ? smem_size_softmax_lse : 0);
+
+    // Work-around for gcc 7. It doesn't like nested BOOL_SWITCH.
+    // https://github.com/kokkos/kokkos-kernels/issues/349
+    // https://github.com/HazyResearch/flash-attention/issues/21
+    BOOL_SWITCH(launch_params.is_dropout, IsDropoutConst, ([&] {
+        auto kernel = launch_params.params.is_causal
+            ? (launch_params.return_softmax
+               ? &fmha_fwd_loop_kernel<Kernel_traits, IsDropoutConst, true, true>
+               : &fmha_fwd_loop_kernel<Kernel_traits, IsDropoutConst, true, false>)
+            : (launch_params.return_softmax
+               ? &fmha_fwd_loop_kernel<Kernel_traits, IsDropoutConst, false, true>
+               : &fmha_fwd_loop_kernel<Kernel_traits, IsDropoutConst, false, false>);
+        if( smem_size >= 48 * 1024 ) {
+            FMHA_CHECK_CUDA(cudaFuncSetAttribute(
+                kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+        }
+        // Automatically set num_splits to maximize occupancy
+        if (launch_params.params.num_splits <= 0) {
+            int ctas_per_sm;
+            cudaError status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+                &ctas_per_sm, kernel, Kernel_traits::THREADS, smem_size);
+            auto dprops = at::cuda::getCurrentDeviceProperties();
+            // printf("CTAS_PER_SM = %d, nSMs = %d\n", ctas_per_sm, dprops->multiProcessorCount);
+            constexpr int M = Kernel_traits::Cta_tile_p::M;
+            launch_params.params.num_splits = num_splits_heuristic_fwd(
+                launch_params.params.b * launch_params.params.h, dprops->multiProcessorCount,
+                ctas_per_sm,
+                /*max_splits=*/std::min(30, (launch_params.params.seqlen_q + M - 1 / M))
+            );
+        }
+        // printf("smem_size = %d\n", smem_size);
+        dim3 grid(launch_params.params.b, launch_params.params.h, launch_params.params.num_splits);
+        kernel<<<grid, Kernel_traits::THREADS, smem_size, launch_params.stream>>>(
+            launch_params.params);
+        FMHA_CHECK_CUDA(cudaPeekAtLastError());
+    }));
+}
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_kernel.h b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_kernel.h
index a321e839b3bb..a46d01615e0b 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_kernel.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_kernel.h
@@ -1,5 +1,4 @@
 /******************************************************************************
- * Copyright (c) 2022, Tri Dao.
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,6 +28,15 @@
 #pragma once
 
 #include <ATen/cuda/CUDAContext.h>
+
+#include <ATen/native/transformers/cuda/flash_attn/fmha.h>
+#include <ATen/native/transformers/cuda/flash_attn/utils.h>
+#include <ATen/native/transformers/cuda/flash_attn/smem_tile.h>
+#include <ATen/native/transformers/cuda/flash_attn/gmem_tile.h>
+#include <ATen/native/transformers/cuda/flash_attn/mask.h>
+#include <ATen/native/transformers/cuda/flash_attn/softmax.h>
+#include <ATen/native/transformers/cuda/flash_attn/philox.cuh>
+
 namespace fmha {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -56,14 +64,14 @@ struct BlockInfoPadded {
         return actual_seqlen_k <= start_col;
     }
 
-    uint32_t actual_seqlen_q;
-    uint32_t actual_seqlen_k;
-    uint32_t sum_s_q;
-    uint32_t sum_s_k;
-    uint32_t bidh;
-    uint32_t bidb;
-    uint32_t tidx_global;
-    uint32_t h;
+    int actual_seqlen_q;
+    int actual_seqlen_k;
+    int sum_s_q;
+    int sum_s_k;
+    int bidh;
+    int bidb;
+    int tidx_global;
+    int h;
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_utils.h b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_utils.h
index 9a40ecb59f24..3bdcad3c058f 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_utils.h
@@ -1,5 +1,3 @@
-
-
 /******************************************************************************
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
@@ -33,6 +31,9 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cuda_runtime_api.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -50,3 +51,50 @@
     } while( 0 )
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
+
+enum Data_type { DATA_TYPE_FP16, DATA_TYPE_BF16, DATA_TYPE_FP32, DATA_TYPE_INT32, DATA_TYPE_INT8 };
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline void set_alpha( uint32_t &alpha, float norm, Data_type dtype ) {
+    if( dtype == DATA_TYPE_FP16 ) {
+        half x = __float2half_rn( norm );
+        uint16_t h = reinterpret_cast<const uint16_t &>( x );
+        ushort2 h2 = { h, h };
+        alpha = reinterpret_cast<const uint32_t &>( h2 );
+    } else if( dtype == DATA_TYPE_BF16 ) {
+        __nv_bfloat16 x = __float2bfloat16( norm );
+        uint16_t h = reinterpret_cast<const uint16_t &>( x );
+        ushort2 h2 = { h, h };
+        alpha = reinterpret_cast<const uint32_t &>( h2 );
+    } else if( dtype == DATA_TYPE_FP32 ) {
+        alpha = reinterpret_cast<const uint32_t &>( norm );
+    } else if( dtype == DATA_TYPE_INT32 ) {
+        int32_t inorm = static_cast<int32_t>( norm );
+        alpha = reinterpret_cast<const uint32_t &>( inorm );
+    } else {
+        assert( false );
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline size_t get_size_in_bytes( size_t n, Data_type dtype ) {
+    switch( dtype ) {
+    case DATA_TYPE_FP32:
+        return n * 4;
+    case DATA_TYPE_FP16:
+        return n * 2;
+    case DATA_TYPE_BF16:
+        return n * 2;
+    case DATA_TYPE_INT32:
+        return n * 4;
+    case DATA_TYPE_INT8:
+        return n;
+    default:
+        assert( false );
+        return 0;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
\ No newline at end of file
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/gemm.h b/aten/src/ATen/native/transformers/cuda/flash_attn/gemm.h
index 2753e5e52572..9feca1e6fdc3 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/gemm.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/gemm.h
@@ -40,6 +40,336 @@ namespace fmha {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+template< typename Data_type_, int NUM_ELTS_, int BITS_PER_ELT_, int ALIGNMENT_ >
+struct Fragment_base_ {
+
+    // The data type.
+    using Data_type = Data_type_;
+    // default input type
+    using Input_type_ = Data_type_;
+    // Does it store the array of elements.
+    static constexpr bool HAS_ELTS = BITS_PER_ELT_ >= 8;
+    // The number of elements.
+    static constexpr int NUM_ELTS = NUM_ELTS_;
+    // The size of element in bits.
+    static constexpr int BITS_PER_ELT = BITS_PER_ELT_;
+    // The size of byte of a single register.
+    static constexpr int BYTES_PER_REG = 4;
+    // The size in bits.
+    static constexpr int BITS_PER_REG = BYTES_PER_REG * 8;
+    // The number of registers needed to store the fragment.
+    static constexpr int NUM_REGS = DivUpConstexpr(NUM_ELTS * BITS_PER_ELT, BITS_PER_REG);
+    // The size in bytes (as returned by sizeof(Fragment_base<>).
+    static constexpr int SIZE_IN_BYTES = NUM_REGS * BYTES_PER_REG;
+    // The alignment.
+    static constexpr int ALIGNMENT = ALIGNMENT_ > 0 ? ALIGNMENT_ : MinConstexpr(NUM_REGS * BYTES_PER_REG, 16);
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+    // The type of the elements.
+    typename Data_type_,
+    // The number of elements.
+    int NUM_ELTS_,
+    // The alignment if you want to force a value -- use 0 otherwise.
+    int ALIGNMENT_ = 0,
+    // The base class.
+    typename Base_ = Fragment_base_<Data_type_, NUM_ELTS_, 8 * sizeof(Data_type_), ALIGNMENT_>
+>
+struct alignas(static_cast<int>(Base_::ALIGNMENT)) Fragment : public Base_ {
+
+    // The size of a load/store.
+    static constexpr int BYTES_PER_LOAD_STORE = Base_::NUM_REGS * sizeof(uint32_t);
+
+    // Clear the fragment. Using PTX in that code seems to produce better SASS...
+    inline __device__ void clear() {
+        #pragma unroll
+        for( int ii = 0; ii < Base_::NUM_REGS; ++ii ) {
+            asm volatile("mov.u32 %0, 0; \n" : "=r"(this->reg(ii)) : );
+        }
+    }
+
+    // Immutable access to a register.
+    inline __device__ const uint32_t& reg(int ii) const {
+        return this->regs_[ii];
+    }
+
+    // Mutable access to a register.
+    inline __device__ uint32_t& reg(int ii) {
+        return this->regs_[ii];
+    }
+
+    uint32_t regs_[Base_::NUM_REGS];
+
+    // Immutable access to the elements.
+    inline __device__ const Data_type_& elt(int ii) const {
+        return reinterpret_cast<const Data_type_*>(&this->regs_[0])[ii];
+    }
+
+    // Mutable access to the elements.
+    inline __device__ Data_type_& elt(int ii) {
+        return reinterpret_cast<Data_type_*>(&this->regs_[0])[ii];
+    }
+
+    // Immutable access to the elements with a cast.
+    template< typename Cast_type >
+    inline __device__ const Cast_type& elt_as(int ii) const {
+        return reinterpret_cast<const Cast_type*>(&this->regs_[0])[ii];
+    }
+
+    // Mutable access to the elements.
+    template< typename Cast_type >
+    inline __device__ Cast_type& elt_as(int ii) {
+        return reinterpret_cast<Cast_type*>(&this->regs_[0])[ii];
+    }
+
+    // Add another fragment.
+    inline __device__ void add(const Fragment &other) {
+        // TODO (TD 2022-04-09): Shouldn't this be NUM_REGS instead of NUM_ELTS?
+        // Also are we doing int addition or __half2 addition?
+        #pragma unroll
+        for( int ii = 0; ii < NUM_ELTS_; ++ii ) {
+            this->elt(ii) += other.elt(ii);
+        }
+    }
+
+    // Multiply by another fragment.
+    inline __device__ void hmul(const Fragment &other) {
+        #pragma unroll
+        for( int ii = 0; ii < Base_::NUM_REGS; ++ii ) {
+            this->reg(ii) = fmha::hmul2(this->reg(ii), other.reg(ii));
+        }
+    }
+
+    template <typename elem_type>
+    inline __device__ void hrelu_() {
+        #pragma unroll
+        for( int ii = 0; ii < Base_::NUM_REGS; ++ii ) {
+            this->reg(ii) = fmha::hrelu2<elem_type>(this->reg(ii));
+        }
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< typename Layout >
+struct Fragment_a : public Fragment<uint16_t, 8> {
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< typename Layout >
+struct Fragment_b : public Fragment<uint16_t, 8> {
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct Fragment_accumulator : public Fragment<float, 8> {
+
+    // The base class.
+    using Base = Fragment<float, 8>;
+
+    // Add two fragments.
+    template< typename Other_fragment_ >
+    inline __device__ void add(const Other_fragment_ &other) {
+        for( int ii = 0; ii < Base::NUM_ELTS; ++ii ) {
+            this->elt(ii) = this->elt(ii) + other.elt(ii);
+        }
+    }
+
+    inline __device__ void mul_(const float other) {
+        for( int ii = 0; ii < Base::NUM_ELTS; ++ii ) {
+            this->elt(ii) *= other;
+        }
+    }
+
+    // Do the HMMA.
+    template< typename Layout_a, typename Layout_b >
+    inline __device__ void mma(const Fragment_a<Layout_a> &a,
+                               const Fragment_b<Layout_b> &b) {
+        asm volatile( \
+            "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" \
+            "    {%0, %1, %2, %3}, \n" \
+            "    {%4, %5, %6, %7}, \n" \
+            "    {%8, %9}, \n" \
+            "    {%0, %1, %2, %3}; \n" \
+                    : "+f"(  elt(0)), "+f"(  elt(1)), "+f"(  elt(2)), "+f"(  elt(3))
+                    :  "r"(a.reg(0)),  "r"(a.reg(1)),  "r"(a.reg(2)),  "r"(a.reg(3))
+                    ,  "r"(b.reg(0)),  "r"(b.reg(1)));
+        asm volatile( \
+            "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" \
+            "    {%0, %1, %2, %3}, \n" \
+            "    {%4, %5, %6, %7}, \n" \
+            "    {%8, %9}, \n" \
+            "    {%0, %1, %2, %3}; \n" \
+                    : "+f"(  elt(4)), "+f"(  elt(5)), "+f"(  elt(6)), "+f"(  elt(7))
+                    :  "r"(a.reg(0)),  "r"(a.reg(1)),  "r"(a.reg(2)),  "r"(a.reg(3))
+                    ,  "r"(b.reg(2)),  "r"(b.reg(3)));
+    }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< typename Fragment, int M, int N >
+inline __device__ void clear(Fragment (&frag)[M][N]) {
+    #pragma unroll
+    for( int mi = 0; mi < M; ++mi ) {
+        #pragma unroll
+        for( int ni = 0; ni < N; ++ni ) {
+            frag[mi][ni].clear();
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< typename Accumulator_type, int WARPS_K >
+struct Clear_accumulator {
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int WARPS_K >
+struct Clear_accumulator<float, WARPS_K> {
+  template< typename Acc, int M, int N >
+  static inline __device__ void apply(Acc (&acc)[M][N], bool = false) {
+    fmha::clear(acc);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename Acc, typename A, typename B, int M, int N>
+inline __device__ void gemm(Acc (&acc)[M][N], const A (&a)[M], const B (&b)[N]) {
+
+    #pragma unroll
+    for( int mi = 0; mi < M; ++mi ) {
+        #pragma unroll
+        for( int ni = 0; ni < N; ++ni ) {
+            acc[mi][ni].mma(a[mi], b[ni]);
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+/// Statically maps half types => cutlass data types
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename Type_>
+struct HalfTypeToCutlassType { using Type = Type_; };
+
+/// Statically maps __half => cutlass::half_t
+template <> struct HalfTypeToCutlassType<__half> {
+    using Type = cutlass::half_t;
+};
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+template <> struct HalfTypeToCutlassType<__nv_bfloat16> {
+    using Type = cutlass::bfloat16_t;
+};
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename elem_type, typename Acc, typename A, typename B, int M, int N>
+inline __device__ void gemm_cl(Acc (&acc)[M][N], const A (&a)[M], const B (&b)[N]) {
+    using Shape = cutlass::gemm::GemmShape<16 * M, 16 * N, 16>;
+#if defined(__CUDA_ARCH__) &&  __CUDA_ARCH__ >= 800
+    using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+#elif defined(__CUDA_ARCH__)  && __CUDA_ARCH__ >= 750
+    using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+#else
+    assert(0);
+    // THIS IS NOT CORRECT BUT THE ASSERT WILL STOP THIS
+    using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+    // TD [2022-06-02] We don't support Volta (SM70) yet.
+#endif
+    using Element = typename HalfTypeToCutlassType<elem_type>::Type;
+    using ElementC = float;
+    using LayoutA = cutlass::layout::RowMajor;
+    using LayoutB = cutlass::layout::ColumnMajor;
+
+    using WarpMma = typename cutlass::gemm::warp::DefaultMmaTensorOp<
+        Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC,
+        cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd, 1, true>::Type;
+
+    constexpr int kIters = Shape::kK / InstructionShape::kK;
+    // using FragmentA = typename WarpMma::FragmentA;
+    // using FragmentB = typename WarpMma::FragmentB;
+    using FragmentA = typename WarpMma::ArchMmaOperator::FragmentA;
+    using FragmentB = typename WarpMma::ArchMmaOperator::FragmentB;
+    using FragmentC = typename WarpMma::FragmentC;
+
+    // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y) == 0) {
+    //     printf("FragmentA::kStorageElements = %d\n", FragmentA::kStorageElements);
+    //     printf("Archmma::FragmentA::kStorageElements = %d\n", WarpMma::ArchMmaOperator::FragmentA::kStorageElements);
+    //     printf("FragmentB::kStorageElements = %d\n", FragmentB::kStorageElements);
+    //     printf("Archmma::FragmentB::kStorageElements = %d\n", WarpMma::ArchMmaOperator::FragmentB::kStorageElements);
+    //     printf("FragmentC::kStorageElements = %d\n", FragmentC::kStorageElements);
+    //     printf("Archmma::FragmentC::kStorageElements = %d\n", WarpMma::ArchMmaOperator::FragmentC::kStorageElements);
+    // }
+
+    // static_assert(FragmentA::kStorageElements == M * a[0].NUM_REGS);
+    // static_assert(FragmentB::kStorageElements == N * b[0].NUM_REGS);
+    static_assert(FragmentA::kStorageElements * kIters == a[0].NUM_REGS);
+    static_assert(FragmentB::kStorageElements * kIters * 16 / InstructionShape::kN == b[0].NUM_REGS);
+    static_assert(FragmentC::kStorageElements == M * N * acc[0][0].NUM_REGS);
+    // const FragmentA a_cl = reinterpret_cast<const FragmentA (&)>(a);
+    // const FragmentB b_cl = reinterpret_cast<const FragmentB (&)>(b);
+    FragmentC c_cl = reinterpret_cast<FragmentC (&)>(acc);
+    FragmentA a_cl[kIters][M];
+    FragmentA b_cl[kIters][N];
+    constexpr int kRegs = InstructionShape::kK == 16 ? 4 : 2;
+    #pragma unroll
+    for (int iter = 0; iter < kIters; iter++) {
+        #pragma unroll
+        for (int mi = 0; mi < M; mi++) {
+            uint32_t *a_ptr = a_cl[iter][mi].raw_data();
+            #pragma unroll
+            for (int ki = 0; ki < kRegs; ki++) {
+                a_ptr[ki] = a[mi].regs_[iter * kRegs + ki];
+            }
+        }
+    }
+    #pragma unroll
+    for (int iter = 0; iter < kIters; iter++) {
+        #pragma unroll
+        for (int ni = 0; ni < N; ni++) {
+            uint32_t *b_ptr = b_cl[iter][ni].raw_data();
+            #pragma unroll
+            for (int ki = 0; ki < kRegs; ki++) {
+                // b_ptr[ki] = b[ni].regs_[iter * kRegs + ki];
+                // TD [2022-06-02] For some reason the order for frag_b is different.
+                b_ptr[ki] = b[ni].regs_[InstructionShape::kK == 16 ? iter * kRegs + ki : ki * kRegs + iter];
+            }
+        }
+    }
+
+    WarpMma mma_op;
+    // mma_op(c_cl, a_cl, b_cl, c_cl);
+    #pragma unroll
+    for (int iter = 0; iter < kIters; iter++) {
+        mma_op(c_cl, reinterpret_cast<const typename WarpMma::FragmentA (&)>(a_cl[iter]),
+               reinterpret_cast<const typename WarpMma::FragmentB (&)>(b_cl[iter]), c_cl);
+    }
+
+    // The modified c_cl is not copied back into acc, idk why
+    #pragma unroll
+    for (int mi = 0; mi < M; mi++) {
+        #pragma unroll
+        for (int ni = 0; ni < N; ni++) {
+            #pragma unroll
+            for (int i =0; i < 8; i++) {
+                acc[mi][ni].elt(i) = c_cl[mi * N * 8 + ni * 8 + i];
+            }
+        }
+    }
+
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 template<
     // The number of rows in the CTA tile.
     int M_,
@@ -83,13 +413,40 @@ struct Hmma_tile {
         MMAS_N = DivUpConstexpr(Cta_tile::N, N_PER_MMA_PER_CTA),
         MMAS_K = DivUpConstexpr(Cta_tile::K, K_PER_MMA_PER_CTA);
 
+    // // The number of elements computed per warp.
+    // static constexpr int M_PER_WARP = MMAS_M * M_PER_MMA,
+    //     N_PER_WARP = MMAS_N * N_PER_MMA,
+    //     K_PER_WARP = MMAS_K * K_PER_MMA;
+
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+using A_type = uint16_t;
+using B_type = uint16_t;
+using C_type = uint16_t;
+using Accumulator_type = float;
+using Epilogue_type = float;
+
+constexpr int BITS_PER_ELEMENT_A = sizeof(A_type) * 8;
+constexpr int BITS_PER_ELEMENT_B = sizeof(B_type) * 8;
+constexpr int BITS_PER_ELEMENT_C = sizeof(C_type) * 8;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 template<int M, int N, int K, int WARPS_M, int WARPS_N, int WARPS_K>
 using Cta_tile_extd = Cta_tile_<M, N, K, WARPS_M, WARPS_N, WARPS_K>;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+template<typename Cta_tile_>
+using Cta_tile_with_k_with_padding = Cta_tile_extd<Cta_tile_::M,
+                                                   Cta_tile_::N,
+                                                   Next_power_of_two<Cta_tile_::K>::VALUE,
+                                                   Cta_tile_::WARPS_M,
+                                                   Cta_tile_::WARPS_N,
+                                                   Cta_tile_::WARPS_K>;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 }  // namespace fmha
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/gmem_tile.h b/aten/src/ATen/native/transformers/cuda/flash_attn/gmem_tile.h
index ea54086ac36a..22d57b4ab25c 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/gmem_tile.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/gmem_tile.h
@@ -27,9 +27,293 @@
 
 #pragma once
 
-#include <ATen/native/transformers/cuda/flash_attn/gemm.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+#include <ATen/native/transformers/cuda/flash_attn/utils.h>
+
 namespace fmha {
 
+template<
+    // The dimensions of the tile computed by the CTA.
+    typename Cta_tile_,
+    // The number of bits per element.
+    int BITS_PER_ELEMENT,
+    // The number of rows of Q, K or V loaded by this tile.
+    int ROWS_,
+    // The number of columns.
+    int COLS,
+    int BYTES_PER_LDGS_ = 16
+>
+struct Gmem_tile_qkv {
+
+    using Cta_tile = Cta_tile_;
+
+    static constexpr int BYTES_PER_ELEMENT = BITS_PER_ELEMENT / 8;
+    // The size of each LDG.
+    static constexpr int BYTES_PER_LDG = BYTES_PER_LDGS_;
+    // The size of a row in bytes.
+    static constexpr int BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8;
+
+    // The number of threads to load a "row" of the matrix.
+    static constexpr int THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG;
+
+    static constexpr int ROWS = ROWS_;
+    // The number of "rows" loaded per LDG.
+    static constexpr int ROWS_PER_LDG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW;
+    // The number of LDGs needed to load a chunk of the Q matrix.
+    static constexpr int LDGS = DivUpConstexpr(ROWS, ROWS_PER_LDG);
+
+    // Ctor.
+    template< typename BInfo >
+    inline __device__ Gmem_tile_qkv(void *ptr_, const uint32_t row_stride_in_elts,
+                                    const uint32_t head_stride_in_elts, const int headdim,
+                                    const BInfo &binfo, const int tidx, bool use_seqlen_q)
+        : row_stride_in_bytes(row_stride_in_elts * BYTES_PER_ELEMENT)
+        , actual_seqlen(use_seqlen_q ? binfo.actual_seqlen_q : binfo.actual_seqlen_k)
+        , ptr(reinterpret_cast<char *>(ptr_))
+        , tidx_(tidx)
+        , col_predicate((tidx % THREADS_PER_ROW) * (BYTES_PER_LDG / BYTES_PER_ELEMENT) < headdim) {
+
+        // Compute the position in the sequence (within the CTA for the moment).
+        int row = tidx / THREADS_PER_ROW;
+        // Compute the position of the thread in the row.
+        int col = tidx % THREADS_PER_ROW;
+
+        // Store the row as we need it to disable the loads.
+        // TD [2022-04-16]: To minimize registers, we'll recompute row_ instead of storing it
+        // row_ = row;
+
+        // The row offset in the batched GEMM. For each seq element, we store QKV in that order.
+        // int64_t row_offset = (int64_t)row * params.qkv_stride_in_bytes;
+        uint32_t row_offset = (uint32_t)(((use_seqlen_q ? binfo.sum_s_q : binfo.sum_s_k) + row) * row_stride_in_bytes);
+        // Add the block index.
+        // row_offset += (int64_t)((binfo.sum_s * NUM_MATS + qkv_offset) * binfo.h + binfo.bidh) * BYTES_PER_ROW;
+        row_offset += (uint32_t)(binfo.bidh * head_stride_in_elts * BYTES_PER_ELEMENT);
+
+        // Assemble the final pointer.
+        ptr += row_offset + col * BYTES_PER_LDG;
+    }
+
+    // Store data to shared memory.
+    template< typename Smem_tile >
+    inline __device__ void commit(Smem_tile &smem_tile) {
+        smem_tile.store(fetch_);
+    }
+
+    inline __device__ void load() {
+        int row_ = tidx_ / THREADS_PER_ROW;
+        const void *ptrs[LDGS];
+        uint32_t preds[LDGS];
+        #pragma unroll
+        for( int ii = 0; ii < LDGS; ++ii ) {
+            // ptrs[ii] = ptr + (int64_t)ii * ROWS_PER_LDG * row_stride_in_bytes;
+            ptrs[ii] = ptr + (uint32_t)ii * ROWS_PER_LDG * row_stride_in_bytes;
+            preds[ii] = col_predicate && ((row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen));
+            fetch_[ii] = make_uint4(0, 0, 0, 0);
+        }
+
+        // not packing predicates removes restrictions (e.g. FP16 384, 4 warps)
+        Ldg_functor<uint4, LDGS> fct(fetch_, ptrs);
+        #pragma unroll
+        for( int ii = 0; ii < LDGS; ++ii ) {
+            fct.load(ii, preds[ii]);
+        }
+    }
+
+    // Store data to memory.
+    inline __device__ void store(const uint4 (&data)[LDGS]) {
+        int row_ = tidx_ / THREADS_PER_ROW;
+        #pragma unroll
+        for( int ii = 0; ii < LDGS; ++ii ) {
+            // char *ptr_ = ptr + (int64_t)ii * ROWS_PER_LDG * row_stride_in_bytes;
+            char *ptr_ = ptr + (uint32_t)ii * ROWS_PER_LDG * row_stride_in_bytes;
+            if (col_predicate && (row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen)) {
+                fmha::stg(ptr_, data[ii]);
+            }
+        }
+    }
+
+    inline __device__ void move(const int steps = 1) {
+        // ptr += (int64_t)ROWS * row_stride_in_bytes * steps;
+        ptr += (uint32_t)ROWS * row_stride_in_bytes * steps;
+        actual_seqlen -= ROWS * steps;
+    }
+
+    // The stride between rows for the QKV matrice.
+    // int64_t row_stride_in_bytes;
+    const uint32_t row_stride_in_bytes;
+    // The pointer.
+    char *ptr;
+    // The fetch registers.
+    uint4 fetch_[LDGS];
+    // Keep track of the row the thread is processing as we move the tile.
+    // int row_;
+    const int tidx_;
+    // The length of the sequence loaded by that memory tile.
+    int actual_seqlen;
+    const bool col_predicate;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+    typename Cta_tile,
+    int BYTES_PER_ELEMENT = 2
+>
+struct Gmem_tile_o {
+
+    static_assert(BYTES_PER_ELEMENT == 2 || BYTES_PER_ELEMENT == 4);
+
+    // The mma tile.
+    using Mma_tile = fmha::Hmma_tile<Cta_tile>;
+
+    // The size of each element.
+    // static constexpr int BYTES_PER_ELEMENT = 2;
+    // The size of each STG.
+    static constexpr int BYTES_PER_STG = BYTES_PER_ELEMENT * 4;
+    static constexpr int COLS = Cta_tile::N;
+    // The size of a row in bytes.
+    static constexpr int BYTES_PER_ROW = COLS * BYTES_PER_ELEMENT;
+
+    // The number of threads to store a "row" of the matrix.
+    static constexpr int THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_STG;
+    // The number of "rows" stored per iteration of the loop. The output of 1 MMA.
+    static constexpr int ROWS = Cta_tile::M;
+    // The number of "rows" stored per iteration of the loop. The output of 1 MMA.
+    static constexpr int ROWS_PER_LOOP = ROWS <= 64 ? ROWS : (int)Mma_tile::M_PER_MMA_PER_CTA;
+    // The number of outter loop for the stores.
+    static constexpr int LOOPS = ROWS / ROWS_PER_LOOP;
+
+    // The number of "rows" stored per STG.
+    static constexpr int ROWS_PER_STG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW;
+    // Do we have to guard against partial writes/reads.
+    static constexpr bool HAS_INCOMPLETE_STG = Cta_tile::M % ROWS_PER_STG != 0;
+    // The number of STGs needed to store a chunk of the Q matrix.
+    static constexpr int STGS_PER_LOOP = DivUpConstexpr(ROWS_PER_LOOP, ROWS_PER_STG);
+    // The number of STGs needed to store a chunk of the Q matrix in total.
+    static constexpr int STGS = STGS_PER_LOOP * LOOPS;
+
+    // Ctor.
+    template<typename BInfo>
+    // inline __device__ Gmem_tile_o(void *ptr, const size_t row_stride_in_elts, const BInfo &binfo, const int tidx)
+    inline __device__ Gmem_tile_o(void *ptr, const uint32_t row_stride_in_elts,
+                                  const uint32_t head_stride_in_elts, const int headdim,
+                                  const BInfo &binfo, const int tidx)
+        : row_stride_in_bytes(row_stride_in_elts * BYTES_PER_ELEMENT)
+        , actual_seqlen_q(binfo.actual_seqlen_q)
+        , ptr_(reinterpret_cast<char *>(ptr))
+        , tidx_(tidx)
+        , col_predicate((tidx % THREADS_PER_ROW) * (BYTES_PER_STG / BYTES_PER_ELEMENT) < headdim) {
+
+        // Compute the position in the sequence (within the CTA for the moment).
+        int row = tidx / THREADS_PER_ROW;
+        // Compute the position of the thread in the row.
+        int col = tidx % THREADS_PER_ROW;
+
+        // Store the row as we need it to disable loads.
+        // row_ = row;
+
+        // The row offset in the batched GEMM.
+        // int64_t row_offset = (int64_t)row * row_stride_in_bytes + binfo.bidx * BYTES_PER_ROW;
+        uint32_t row_offset = (uint32_t)((binfo.sum_s_q + row) * row_stride_in_bytes);
+        row_offset += (uint32_t)(binfo.bidh * head_stride_in_elts * BYTES_PER_ELEMENT);
+        // Assemble the final pointer.
+        ptr_ += row_offset + col * BYTES_PER_STG;
+
+        // Is that thread active on the last STG?
+        if( HAS_INCOMPLETE_STG ) {
+            is_active_for_last_stg_ = row + (STGS - 1) * ROWS_PER_STG < Cta_tile::M;
+        }
+    }
+
+    // Store data to global memory.
+    template<typename elem_type=__half>
+    inline __device__ void store(const uint4 (&src)[STGS_PER_LOOP], int mi) {
+        int row_ = tidx_ / THREADS_PER_ROW;
+        #pragma unroll
+        for( int ii = 0; ii < STGS_PER_LOOP; ++ii ) {
+            int jj = mi * STGS_PER_LOOP + ii;
+            if ((!col_predicate) || (row_ + jj * ROWS_PER_STG >= this->actual_seqlen_q)) {
+                break;
+            }
+
+            if (BYTES_PER_ELEMENT == 4) {
+                if( !HAS_INCOMPLETE_STG || (jj < STGS - 1 || this->is_active_for_last_stg_) ) {
+                    fmha::stg(this->ptr_ + jj * ROWS_PER_STG * this->row_stride_in_bytes, src[ii]);
+                }
+            } else if (BYTES_PER_ELEMENT == 2) {
+                float x = reinterpret_cast<const float &>(src[ii].x);
+                float y = reinterpret_cast<const float &>(src[ii].y);
+                float z = reinterpret_cast<const float &>(src[ii].z);
+                float w = reinterpret_cast<const float &>(src[ii].w);
+                uint2 out = fmha::float4_pack<elem_type>(x, y, z, w);
+                if( !HAS_INCOMPLETE_STG || (jj < STGS - 1 || this->is_active_for_last_stg_) ) {
+                    fmha::stg(this->ptr_ + jj * ROWS_PER_STG * this->row_stride_in_bytes, out);
+                }
+            }
+        }
+    }
+
+    // Store data to global memory with atomicAdd.
+    inline __device__ void atomic_add(const uint4 (&src)[STGS_PER_LOOP], int mi) {
+        static_assert(BYTES_PER_ELEMENT == 4);  // Only do atomic add on floats
+        int row_ = tidx_ / THREADS_PER_ROW;
+        #pragma unroll
+        for( int ii = 0; ii < STGS_PER_LOOP; ++ii ) {
+            int jj = mi * STGS_PER_LOOP + ii;
+            if ((!col_predicate) || (row_ + jj * ROWS_PER_STG >= this->actual_seqlen_q)) {
+                break;
+            }
+
+            if( !HAS_INCOMPLETE_STG || (jj < STGS - 1 || this->is_active_for_last_stg_) ) {
+                float *ptr_ = reinterpret_cast<float *>(this->ptr_ + jj * ROWS_PER_STG * this->row_stride_in_bytes);
+                #pragma unroll
+                for (int jj = 0; jj < 4; ++jj) {
+                    atomicAdd(ptr_ + jj, reinterpret_cast<const float(&)[4]>(src[ii])[jj]);
+                }
+            }
+        }
+    }
+
+    // Load data from global memory.
+    inline __device__ void load(uint4 (&dst)[STGS_PER_LOOP], int mi) {
+        static_assert(BYTES_PER_ELEMENT == 4);
+        int row_ = tidx_ / THREADS_PER_ROW;
+        #pragma unroll
+        for( int ii = 0; ii < STGS_PER_LOOP; ++ii ) {
+            int jj = mi * STGS_PER_LOOP + ii;
+            if ((!col_predicate) || (row_ + jj * ROWS_PER_STG >= this->actual_seqlen_q)) {
+                break;
+            }
+
+            if( !HAS_INCOMPLETE_STG || (jj < STGS - 1 || this->is_active_for_last_stg_) ) {
+                fmha::ldg(dst[ii], this->ptr_ + jj * ROWS_PER_STG * this->row_stride_in_bytes);
+            }
+        }
+    }
+
+    inline __device__ void move(const int steps = 1) {
+        // row_ += ROWS * steps;
+        // ptr_ += (int64_t)ROWS * row_stride_in_bytes * steps;
+        ptr_ += (uint32_t)ROWS * row_stride_in_bytes * steps;
+        actual_seqlen_q -= ROWS * steps;
+    }
+
+    // The stride between rows for the QKV matrice.
+    // int64_t row_stride_in_bytes;
+    const uint32_t row_stride_in_bytes;
+    // The pointer.
+    char *ptr_;
+    // Is the thread active for the last STG?
+    int is_active_for_last_stg_;
+    // The length of the sequence loaded by that memory tile.
+    int actual_seqlen_q;
+    const int tidx_;
+    const bool col_predicate;
+};
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 template< typename Cta_tile, int BYTES_PER_ELEMENT >
@@ -118,16 +402,15 @@ struct Gmem_tile_mma_s : public Base {
     // Store to global memory.
     template<typename Mask, typename Fragment>
     inline __device__ void store(const Fragment (&frag)[N][M], const Mask& mask){
-        static_assert(Fragment::kStorageElements == 4, "");
         #pragma unroll
         for( int mi = 0; mi < M; mi++ ) {
             #pragma unroll
             for( int ni = 0; ni < N; ni++ ) {
                 uint4 dst;
-                dst.x = frag[ni][mi].raw_data()[0];
-                dst.y = frag[ni][mi].raw_data()[2];
-                dst.z = frag[ni][mi].raw_data()[1];
-                dst.w = frag[ni][mi].raw_data()[3];
+                dst.x = frag[ni][mi].reg(0);
+                dst.y = frag[ni][mi].reg(2);
+                dst.z = frag[ni][mi].reg(1);
+                dst.w = frag[ni][mi].reg(3);
                 if( mask.any_valid(mi, ni) ) {
                     Base::store(dst, mi, ni);
                 }
@@ -269,4 +552,4 @@ struct Gmem_summary_stats {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
-}  // namespace fmha
+}  // namespace fmha
\ No newline at end of file
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/kernel_traits.h b/aten/src/ATen/native/transformers/cuda/flash_attn/kernel_traits.h
index 9c630fbd4fe1..bd1d1549b24a 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/kernel_traits.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/kernel_traits.h
@@ -25,24 +25,18 @@
  *
  ******************************************************************************/
 
-#pragma once
-
-#include <cutlass/cutlass.h>
+#include <ATen/cuda/CUDAContext.h>
 
-#include <cutlass/gemm/gemm.h>
-
-#include <cutlass/layout/layout.h>
-#include <cutlass/numeric_types.h>
-#include <cutlass/transform/threadblock/predicated_tile_iterator.h>
+#include <cuda_fp16.h>
 
 #include <ATen/native/transformers/cuda/flash_attn/gemm.h>
 #include <ATen/native/transformers/cuda/flash_attn/gmem_tile.h>
-#include <ATen/native/transformers/cuda/flash_attn/summary_stats.h>
-#include <ATen/native/transformers/cuda/flash_attn/mma_core_sm75.h>
+
+#pragma once
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
-template<int S, int D, int STEP, int WARPS_M, int WARPS_N, uint32_t FLAGS = 0x08u, typename elem_type=cutlass::half_t>
+template<int S, int D, int STEP, int WARPS_M, int WARPS_N, uint32_t FLAGS = 0x08u, typename elem_type_=__half>
 struct FMHA_kernel_traits {
 
     // The CTA description for the 1st GEMM.
@@ -57,98 +51,71 @@ struct FMHA_kernel_traits {
     // Do we keep V in registers.
     static constexpr bool V_IN_REGS = (FLAGS & 0x100u) == 0u;
 
+    // The global memory tile to load Q.
+    using Gmem_tile_q = fmha::Gmem_tile_qkv<Cta_tile_p, fmha::BITS_PER_ELEMENT_A, STEP, D>;
+
+    // The shared memory tile to swizzle Q.
+    // using Smem_tile_q = fmha::Smem_tile_a<Cta_tile_p, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 1>;
+    using Smem_tile_q = fmha::Smem_tile_a<Cta_tile_p, fmha::Row, Gmem_tile_q::BYTES_PER_LDG, 2>;
+
+    // The global memory tile to load K.
+    using Gmem_tile_k = fmha::Gmem_tile_qkv<Cta_tile_p, fmha::BITS_PER_ELEMENT_B, S, D>;
+    // The shared memory tile to swizzle K.
+    using Smem_tile_k = fmha::Smem_tile_b<Cta_tile_p, fmha::Col>;
+
+    // The global memory tile to load V.
+    using Gmem_tile_v = fmha::Gmem_tile_qkv<Cta_tile_o, fmha::BITS_PER_ELEMENT_B, S, D>;
+    // The shared memory tile to swizzle V.
+    using Smem_tile_v = fmha::Smem_tile_v<Cta_tile_o>;
+
+    // The global memory tile to store O.
+    using Gmem_tile_o = fmha::Gmem_tile_o<Cta_tile_o>;
+    // The shared memory tile for O.
+    using Smem_tile_o = fmha::Smem_tile_o<Cta_tile_o>;;
+
     // The global memory tile to load/store S.
     using Gmem_tile_s = fmha::Gmem_tile_mma_s<Cta_tile_p>;
 
+    // The shared memory tile to transpose S.
+    using Smem_tile_st = fmha::Smem_tile_mma_transposed<Cta_tile_p>;
+
+    using Gmem_tile_do = fmha::Gmem_tile_qkv<Cta_tile_p, fmha::BITS_PER_ELEMENT_A, STEP, D>;
+
+    // // The global memory tile to store the accumulated dK and dV
+    // // Hack: we set BYTES_PER_LDGS=32 to emulate the access pattern of dK and dV
+    // // where there are 16 bits per lements and 16 bytes per load. In reality we won't
+    // // be issue any load or store of size 32 bytes.
+    // using Gmem_tile_dkv_accum = fmha::Gmem_tile_qkv<Cta_tile_o, 32, S, D, 32>;
+
     // The global memory tile to store the softmax sum.
     using Gmem_softmax_sum = fmha::Gmem_summary_stats<Cta_tile_p>;
 
+    // The shared memory tile to store dp sum.
+    using Smem_dp_sum = fmha::Smem_tile_dp_sum<Gmem_tile_q, 2>;
+
+    using elem_type = elem_type_;
+
+    // Make sure the number of threads match.
+    static_assert((int)Gmem_tile_o::THREADS_PER_ROW == (int)Smem_tile_o::THREADS_PER_ROW, "");
+
     // The number of threads.
     static constexpr int THREADS = Cta_tile_p::THREADS_PER_CTA;
     // Make sure the number of threads matches both CTAs.
     static_assert(THREADS == Cta_tile_o::THREADS_PER_CTA, "");
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-    using MmaInstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
-#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
-    using MmaInstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
-#else
-    // using MmaInstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
-    using MmaInstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
-    // TD [2022-06-02] We don't support Volta (SM70) yet.
-#endif
-
-#if defined(__CUDA_ARCH__) &&  __CUDA_ARCH__ >= 800
-    using Element = elem_type;
-#else
-    using Element = cutlass::half_t;
-#endif
-    using ElementAccum = float;
-
-    static_assert(WARPS_M == 1, "");
-    using ThreadblockShapeQK = cutlass::gemm::GemmShape<STEP, S, D>;
-    using WarpCountQK = cutlass::gemm::GemmShape<WARPS_M, WARPS_N, 1>;
-    using WarpShapeQK = cutlass::gemm::GemmShape<
-       ThreadblockShapeQK::kM,
-       ThreadblockShapeQK::kN / WarpCountQK::kN, ThreadblockShapeQK::kK>;
-    using LayoutQ = cutlass::layout::RowMajor;
-    using LayoutK = cutlass::layout::ColumnMajor;
-    using LayoutP = cutlass::layout::RowMajor;
-    using MmaCoreQK = typename fmha::FMHAMmaCore<
-        ThreadblockShapeQK, WarpShapeQK, MmaInstructionShape, Element, LayoutQ,
-        Element, LayoutK, ElementAccum, LayoutP,
-        cutlass::arch::OpClassTensorOp>;
-
-    using ThreadblockShapePV = cutlass::gemm::GemmShape<STEP, D, S>;
-    using WarpCountPV = cutlass::gemm::GemmShape<WARPS_M, 1, WARPS_N>;
-    using WarpShapePV = cutlass::gemm::GemmShape<ThreadblockShapePV::kM, ThreadblockShapePV::kN, ThreadblockShapePV::kK / WarpCountPV::kK>;
-    using LayoutV = cutlass::layout::RowMajor;
-    using LayoutO = cutlass::layout::RowMajor;
-    using MmaCorePV = typename fmha::FMHAMmaCore<
-        ThreadblockShapePV, WarpShapePV, MmaInstructionShape, Element, LayoutP,
-        Element, LayoutV, ElementAccum, LayoutO,
-        cutlass::arch::OpClassTensorOp>;
-
-    // The global memory tile to load Q.
-    // Copy from mma_piplined_testbed.h
-    using GmemIteratorQ = cutlass::transform::threadblock::PredicatedTileIterator<
-      cutlass::MatrixShape<ThreadblockShapeQK::kM, ThreadblockShapeQK::kK>,
-      Element,
-      LayoutQ,
-      0,
-      typename MmaCoreQK::IteratorThreadMapA
-    >;
-
-    // The global memory tile to load K.
-    using GmemIteratorK = cutlass::transform::threadblock::PredicatedTileIterator<
-      cutlass::MatrixShape<ThreadblockShapeQK::kK, ThreadblockShapeQK::kN>,
-      Element,
-      LayoutK,
-      1,
-      typename MmaCoreQK::IteratorThreadMapB
-    >;
-
-    // The global memory tile to load V.
-    using GmemIteratorV = cutlass::transform::threadblock::PredicatedTileIterator<
-      cutlass::MatrixShape<ThreadblockShapePV::kK, ThreadblockShapePV::kN>,
-      Element,
-      LayoutV,
-      0,
-      typename MmaCorePV::IteratorThreadMapB
-    >;
-
-    // The shared memory tile to store softmax lse.
-    using Smem_softmax_lse = fmha::Smem_tile_softmax_lse<ThreadblockShapeQK::kM, MmaInstructionShape::kM, WarpCountQK::kM>;
-
     // The amount of shared memory needed to load Q and K.
-    static constexpr size_t BYTES_PER_SMEM_Q = ThreadblockShapeQK::kM * ThreadblockShapeQK::kK * sizeof(Element);
-    static constexpr size_t BYTES_PER_SMEM_K = ThreadblockShapeQK::kN * ThreadblockShapeQK::kK * sizeof(Element);
-    static constexpr size_t BYTES_PER_SMEM_V = ThreadblockShapePV::kN * ThreadblockShapePV::kK * sizeof(Element);
-    static_assert(BYTES_PER_SMEM_K == BYTES_PER_SMEM_V, "");
-    static constexpr size_t BYTES_PER_SMEM_QK = BYTES_PER_SMEM_Q + BYTES_PER_SMEM_K;
+    static constexpr int BYTES_PER_SMEM_QK = Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE;
     // The extra amount of shared memory needed to load V.
-    static constexpr size_t BYTES_PER_SMEM_V_EXTRA = SHARE_SMEM_FOR_K_AND_V ? 0u : BYTES_PER_SMEM_V;
+    static constexpr int BYTES_PER_SMEM_V = SHARE_SMEM_FOR_K_AND_V ? 0u : Smem_tile_v::BYTES_PER_TILE;
     // The amount of shared memory needed for Q, K and V..
-    static constexpr size_t BYTES_PER_SMEM_QKV = BYTES_PER_SMEM_QK + BYTES_PER_SMEM_V_EXTRA;
-
+    static constexpr int BYTES_PER_SMEM_QKV = BYTES_PER_SMEM_QK + BYTES_PER_SMEM_V;
+    // The amount of shared memory needed to load Q and store O.
+    static constexpr int BYTES_PER_SMEM_QO = Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE;
+
+    // The amount of shared memory needed for Q, K, V and O.
+    static constexpr int BYTES_PER_SMEM = fmha::MaxConstexpr(BYTES_PER_SMEM_QKV, BYTES_PER_SMEM_QO);
+    // Make sure we have enough shared memory.
+    static_assert(Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE <= BYTES_PER_SMEM, "");
 };
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/mask.h b/aten/src/ATen/native/transformers/cuda/flash_attn/mask.h
index 6169c89550b6..4153b098f406 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/mask.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/mask.h
@@ -28,6 +28,7 @@
 #pragma once
 
 #include <ATen/cuda/CUDAContext.h>
+
 namespace fmha {
 
 
@@ -52,21 +53,20 @@ struct Mask {
         const int quad = lane / 4;
         const int tid = (lane % 4) * 2;
         row = warp_m * 16 + quad;
-        // col = warp_n * 16 + tid;
-        col = warp_n * Mma_tile::N_PER_MMA * Mma_tile::MMAS_N + tid;
+        col = warp_n * 16 + tid;
     }
 
     inline __device__ bool is_valid(const int mi, const int ni, const int ii, const int jj) const {
 
         // ii and jj iterate over the 2x4 fragment
         // const int current_col = (Is_causal ? loop_step_idx * Cta_tile::N : 0) + ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1);
-        // const int current_col = ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1);
-        const int current_col = ni * Mma_tile::N_PER_MMA + col + (jj & 2) * 4 + (jj & 1);
+        const int current_col = ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1);
         const int current_row = row_offset + ii * 8;
         const bool col_valid = current_col < actual_seqlen_k;
         // const bool col_valid = (ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1)) < actual_seqlen_k;
         //&& (row + mi * Mma_tile::M_PER_MMA_PER_CTA + ii * 8) < actual_seqlen_k;
-        // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
+        // bool all_valid = Is_causal ? col_valid && (current_col + loop_step_idx * Cta_tile::N <= current_row) : col_valid;
+        // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (blockIdx.z == 1)) {
         //     printf("current_col=%d, current_row=%d, actual_seqlen_k=%d, col_valid=%d, all_valid=%d\n", current_col, current_row, actual_seqlen_k, col_valid, all_valid);
         // }
         return Is_causal ? col_valid && (current_col + loop_step_idx * Cta_tile::N <= current_row) : col_valid;
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/mma_core_sm75.h b/aten/src/ATen/native/transformers/cuda/flash_attn/mma_core_sm75.h
deleted file mode 100644
index 863d30b14adf..000000000000
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/mma_core_sm75.h
+++ /dev/null
@@ -1,382 +0,0 @@
-// Adapted from cutlass/gemm/threadblock/default_mma_core_sm75.h
-// This is very similar, except we make it work for head_dim=128.
-// The original cutlass version only allows kK of the thread block to be
-// at most 64. Here we set kCrosswise = max(64, ThreadblockShape::kK) instead.
-
-/******************************************************************************
- * Copyright (c) 2022, Tri Dao.
- * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-#pragma once
-
-#include <cutlass/cutlass.h>
-#include <cutlass/array.h>
-#include <cutlass/platform/platform.h>
-
-#include <cutlass/numeric_types.h>
-#include <cutlass/matrix_shape.h>
-
-#include <cutlass/layout/tensor_op_multiplicand_sm75.h>
-#include <cutlass/transform/pitch_linear_thread_map.h>
-#include <cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h>
-
-#include <cutlass/gemm/warp/default_mma_tensor_op.h>
-#include <cutlass/gemm/threadblock/default_mma_core.h>
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace fmha {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Template defininng default matrix multiply operators inferred from threadblock tile size,
-/// global memory data layout, and target math instruction.
-template <
-    /// Shape of threadblock-scoped matrix multiply operator
-    typename Shape,
-    /// Shape of warp-level matrix multiply operator
-    typename WarpShape,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape,
-    /// Element data type of A operand
-    typename ElementA,
-    /// Layout of operand A
-    typename LayoutA,
-    /// Element data type of B operand
-    typename ElementB,
-    /// Layout of operand B
-    typename LayoutB,
-    /// Data type of accumulator
-    typename ElementC,
-    /// Layout of accumulator
-    typename LayoutC,
-    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
-    typename OperatorClass,
-    /// Operation performed by MMA
-    typename Operator = cutlass::arch::OpMultiplyAdd
->
-struct FMHAMmaCore;
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: column-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by MMA
-    typename Operator_>
-struct FMHAMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                   cutlass::layout::RowMajor, ElementB_, cutlass::layout::ColumnMajor,
-                   ElementC_, LayoutC_, cutlass::arch::OpClassTensorOp, Operator_
-                  > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = cutlass::layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = cutlass::layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = cutlass::arch::OpClassTensorOp;
-
-  /// Number of warps present
-  using WarpCount = cutlass::gemm::GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisibility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = cutlass::gemm::warp::WarpSize<cutlass::arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Cutlass only supports Crosswise at most 64
-  static int const kCrosswise = std::min(Shape::kK, 64);
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kWarpThreadArrangementContiguousA =
-      kCrosswise / (kAccessSizeInBits / cutlass::sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  static int const kWarpThreadArrangementContiguousB =
-      kCrosswise / (kAccessSizeInBits / cutlass::sizeof_bits<ElementB>::value);
-
-  static int const kWarpThreadArrangementStridedB =
-      kWarpSize / kWarpThreadArrangementContiguousB;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
-      cutlass::sizeof_bits<ElementA>::value, kCrosswise>;
-
-  // Shared memory layout
-  using SmemLayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise<
-      cutlass::sizeof_bits<ElementB>::value, kCrosswise>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = cutlass::transform::PitchLinearWarpRakedThreadMap<
-      cutlass::layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      cutlass::layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                                        kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / cutlass::sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = cutlass::transform::threadblock::RegularTileIterator<
-    cutlass::MatrixShape<Shape::kM, Shape::kK>,
-    ElementA,
-    SmemLayoutA,
-    0,
-    IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = cutlass::transform::PitchLinearWarpRakedThreadMap<
-      cutlass::layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreads,
-      cutlass::layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
-                                        kWarpThreadArrangementStridedB>,
-      kAccessSizeInBits / cutlass::sizeof_bits<ElementB>::value>;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = cutlass::transform::threadblock::RegularTileIterator<
-    cutlass::MatrixShape<Shape::kK, Shape::kN>,
-    ElementB,
-    SmemLayoutB,
-    1,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = cutlass::gemm::threadblock::MmaPolicy<
-    MmaTensorOp,
-    cutlass::MatrixShape<0, 0>,
-    cutlass::MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-/// Partial specialization:
-///
-///   A: row-major
-///   B: row-major
-///   Operator: tensor op class
-///
-/// This uses the default warp-level operator given tile sizes
-template <
-    /// Shape of threadblock-scoped matrix multiply operator (concept:
-    /// GemmShape)
-    typename Shape_,
-    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
-    typename WarpShape_,
-    /// Shape of one matrix production operation (concept: GemmShape)
-    typename InstructionShape_,
-    /// Data type of A operand
-    typename ElementA_,
-    /// Data type of B operand
-    typename ElementB_,
-    /// Data type of accumulator
-    typename ElementC_,
-    /// Layout of accumulator
-    typename LayoutC_,
-    /// Operation performed by MMA
-    typename Operator_>
-struct FMHAMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
-                   cutlass::layout::RowMajor, ElementB_, cutlass::layout::RowMajor, ElementC_,
-                   LayoutC_, cutlass::arch::OpClassTensorOp, Operator_
-                  > {
-  using Shape = Shape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using ElementA = ElementA_;
-  using LayoutA = cutlass::layout::RowMajor;
-  using ElementB = ElementB_;
-  using LayoutB = cutlass::layout::ColumnMajor;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using OperatorClass = cutlass::arch::OpClassTensorOp;
-
-  /// Number of warps present
-  using WarpCount = cutlass::gemm::GemmShape<
-    Shape::kM / WarpShape::kM,
-    Shape::kN / WarpShape::kN,
-    Shape::kK / WarpShape::kK
-  >;
-
-  // Divisility requirements
-  static_assert(
-    !(Shape::kM % WarpShape::kM) &&
-    !(Shape::kN % WarpShape::kN),
-    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
-  );
-
-  /// Number of threads per warp
-  static int const kWarpSize = cutlass::gemm::warp::WarpSize<cutlass::arch::OpClassTensorOp>::value;
-
-  /// Number of threads total
-  static int const kThreads = WarpCount::kCount * kWarpSize;
-
-  /// Size of a threadblock-scoped access
-  static int const kAccessSizeInBits = 128;
-
-  /// Cutlass only supports Crosswise at most 64
-  static int const kCrosswise = std::min(Shape::kK, 64);
-
-  /// Default Operator
-  using Operator = Operator_;
-
-  // Warp thread arrangement
-  static int const kWarpThreadArrangementContiguousA =
-      kCrosswise / (kAccessSizeInBits / cutlass::sizeof_bits<ElementA>::value);
-
-  static int const kWarpThreadArrangementStridedA =
-      kWarpSize / kWarpThreadArrangementContiguousA;
-
-  //
-  // Shared memory layouts
-  //
-
-  using SmemLayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise<
-      cutlass::sizeof_bits<ElementA>::value, kCrosswise>;
-
-  // Shared memory layout
-  using SmemLayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous<
-      cutlass::sizeof_bits<ElementB>::value, int(128 / sizeof(ElementB))>;
-
-  //
-  // Iterators to write to shared memory
-  //
-
-  /// ThreadMap of iterator A
-  using IteratorThreadMapA = cutlass::transform::PitchLinearWarpRakedThreadMap<
-      cutlass::layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
-      cutlass::layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
-                                        kWarpThreadArrangementStridedA>,
-      kAccessSizeInBits / cutlass::sizeof_bits<ElementA>::value>;
-
-  /// Shared memory iterator to A operand
-  using SmemIteratorA = cutlass::transform::threadblock::RegularTileIterator<
-    cutlass::MatrixShape<Shape::kM, Shape::kK>,
-    ElementA,
-    SmemLayoutA,
-    0,
-    IteratorThreadMapA
-  >;
-
-  /// ThreadMap of iterator B
-  using IteratorThreadMapB = cutlass::transform::PitchLinearWarpRakedThreadMap<
-    cutlass::layout::PitchLinearShape<Shape::kN, Shape::kK>,
-    kThreads,
-    cutlass::layout::PitchLinearShape<8, 4>,
-    kAccessSizeInBits / cutlass::sizeof_bits<ElementB>::value
-  >;
-
-  /// Shared memory iterator to B operand
-  using SmemIteratorB = cutlass::transform::threadblock::RegularTileIterator<
-    cutlass::MatrixShape<Shape::kK, Shape::kN>,
-    ElementB,
-    SmemLayoutB,
-    0,
-    IteratorThreadMapB
-  >;
-
-  //
-  // Warp-level matrix multiply operator
-  //
-
-  // Define the warp-level tensor op
-  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
-      WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
-      ElementC, LayoutC, Operator, WarpCount::kK>::Type;
-
-  /// Policy used to define MmaPipelined
-  using MmaPolicy = cutlass::gemm::threadblock::MmaPolicy<
-    MmaTensorOp,
-    cutlass::MatrixShape<0, 0>,
-    cutlass::MatrixShape<0, 0>,
-    WarpCount::kK
-  >;
-};
-
-
-} // namespace fmha
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/philox.cuh b/aten/src/ATen/native/transformers/cuda/flash_attn/philox.cuh
index 456b320b64ef..22046cafb55c 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/philox.cuh
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/philox.cuh
@@ -1,3 +1,4 @@
+// Adapted from https://github.com/NVIDIA/apex/blob/master/apex/contrib/csrc/multihead_attn/philox.cuh
 // Pytorch also has an implementation of Philox RNG: https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu
 #pragma once
 // Philox CUDA.
@@ -11,8 +12,7 @@ public:
   __device__ inline Philox(unsigned long long seed,
                            unsigned long long subsequence,
                            unsigned long long offset)
-      : STATE(0)
-      , key(reinterpret_cast<const uint2&>(seed)) {
+    : key(reinterpret_cast<const uint2&>(seed)) {
     //key.x = (unsigned int)seed;
     //key.y = (unsigned int)(seed >> 32);
     //counter = make_uint4(0, 0, 0, 0);
@@ -21,7 +21,6 @@ public:
     //STATE = 0;
     //incr_n(offset / 4);
 
-    // key = reinterpret_cast<const uint2&>(seed);
     ull2 * tmp = reinterpret_cast<ull2*>(&counter);
     tmp->x = offset / 4;
     tmp->y = subsequence;
@@ -29,34 +28,46 @@ public:
     //     printf("Philox counter: %d, %d, %d, %d\n", counter.x, counter.y, counter.z, counter.w);
     // }
   }
+
   __device__ inline uint4 operator()() {
-    // if (STATE == 0) {
-      uint4 counter_ = counter;
-      uint2 key_ = key;
-      // 7-round philox
-      #pragma unroll
-      for (int i = 0; i < 6; i++) {
-        counter_ = single_round(counter_, key_);
-        key_.x += (kPhilox10A);
-        key_.y += (kPhilox10B);
-      }
-      // output = single_round(counter_, key_);
-      uint4 output = single_round(counter_, key_);
-      // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
-      //     printf("Philox counter: %u, %u, %u, %u\n", counter.x, counter.y, counter.z, counter.w);
-      //     printf("Philox output: %u, %u, %u, %u\n", output.x, output.y, output.z, output.w);
-      // }
-      incr();
+    uint4 counter_ = counter;
+    uint2 key_ = key;
+    // 7-round philox
+    #pragma unroll
+    for (int i = 0; i < 6; i++) {
+      counter_ = single_round(counter_, key_);
+      key_.x += (kPhilox10A);
+      key_.y += (kPhilox10B);
+    }
+    uint4 output = single_round(counter_, key_);
+    // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
+    //     printf("Philox counter: %u, %u, %u, %u\n", counter.x, counter.y, counter.z, counter.w);
+    //     printf("Philox output: %u, %u, %u, %u\n", output.x, output.y, output.z, output.w);
+    // }
+    incr();
+    return output;
+  }
+
+  __device__ inline uint4 operator()(const unsigned long long subsequence) {
+    uint4 counter_ = counter;
+    ull2 * tmp = reinterpret_cast<ull2*>(&counter_);
+    tmp->y = subsequence;
+    // if ((threadIdx.x % 32 == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
+    //     printf("tidx = %d, counter_: %u, %u, %u, %u\n", threadIdx.x, counter_.x, counter_.y, counter_.z, counter_.w);
+    // }
+    uint2 key_ = key;
+    // 7-round philox
+    #pragma unroll
+    for (int i = 0; i < 6; i++) {
+      counter_ = single_round(counter_, key_);
+      key_.x += (kPhilox10A);
+      key_.y += (kPhilox10B);
+    }
+    uint4 output = single_round(counter_, key_);
+    // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
+    //     printf("Philox counter: %u, %u, %u, %u\n", counter.x, counter.y, counter.z, counter.w);
+    //     printf("Philox output: %u, %u, %u, %u\n", output.x, output.y, output.z, output.w);
     // }
-    // return a float4 directly
-    // unsigned long ret;
-    // switch(STATE) {
-    //  case 0: ret = output.x; break;
-    //  case 1: ret = output.y; break;
-    //  case 2: ret = output.z; break;
-    //  case 3: ret = output.w; break;
-    //}
-    // STATE = (STATE + 1) % 4;
     return output;
   }
 
@@ -66,25 +77,23 @@ private:
       uint64_t y;
   };
   uint4 counter;
-  // uint4 output;
   const uint2 key;
-  unsigned int STATE;
-  __device__ inline void incr_n(unsigned long long n) {
-    unsigned int nlo = (unsigned int)(n);
-    unsigned int nhi = (unsigned int)(n >> 32);
-    counter.x += nlo;
-    if (counter.x < nlo)
-      nhi++;
-    counter.y += nhi;
-    if (nhi <= counter.y)
-      return;
-    if (++counter.z)
-      return;
-    ++counter.w;
-  }
 
-  __device__ uint4 incr128 (uint4 ctr)
-  {
+  // __device__ inline void incr_n(unsigned long long n) {
+  //   unsigned int nlo = (unsigned int)(n);
+  //   unsigned int nhi = (unsigned int)(n >> 32);
+  //   counter.x += nlo;
+  //   if (counter.x < nlo)
+  //     nhi++;
+  //   counter.y += nhi;
+  //   if (nhi <= counter.y)
+  //     return;
+  //   if (++counter.z)
+  //     return;
+  //   ++counter.w;
+  // }
+
+  __device__ uint4 incr(uint4 ctr) {
     uint4 res;
     asm ("add.cc.u32      %0, %4, %8;\n\t"
          "addc.cc.u32     %1, %5, %9;\n\t"
@@ -100,42 +109,46 @@ private:
     // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
     //     printf("Counter before: %u, %u, %u, %u\n", counter.x, counter.y, counter.z, counter.w);
     // }
-    counter = incr128(counter);
+    counter = incr(counter);
     // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
     //     printf("Counter after: %u, %u, %u, %u\n", counter.x, counter.y, counter.z, counter.w);
     // }
   }
-  __device__ unsigned int mulhilo32(unsigned int a, unsigned int b,
-                                    unsigned int *result_high) {
-    *result_high = __umulhi(a, b);
-    return a * b;
-  }
-  __device__ uint2 mulhilo32_v2 (const unsigned int a, const unsigned int b)
-  {
+
+  // __device__ unsigned int mulhilo32(unsigned int a, unsigned int b,
+  //                                   unsigned int *result_high) {
+  //   *result_high = __umulhi(a, b);
+  //   return a * b;
+  // }
+
+  __device__ uint2 mulhilo32(const unsigned int a, const unsigned int b) {
     uint2 *res;
     unsigned long long tmp;
     asm ("mul.wide.u32      %0, %1, %2;\n\t"
-         : "=l"(tmp)
-         : "r"(a), "r"(b));
+          : "=l"(tmp)
+          : "r"(a), "r"(b));
     res = (uint2*)(&tmp);
     return *res;
   }
+
   __device__ inline uint4 single_round(const uint4 ctr, const uint2 key) {
     //unsigned int hi0;
     //unsigned int hi1;
     //unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
     //unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
     //uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
-    uint2 res0 = mulhilo32_v2(kPhiloxSA, ctr.x);
-    uint2 res1 = mulhilo32_v2(kPhiloxSB, ctr.z);
+    uint2 res0 = mulhilo32(kPhiloxSA, ctr.x);
+    uint2 res1 = mulhilo32(kPhiloxSB, ctr.z);
     uint4 ret = {res1.y ^ ctr.y ^ key.x, res1.x, res0.y ^ ctr.w ^ key.y, res0.x};
     return ret;
   }
+
   static const unsigned long kPhilox10A = 0x9E3779B9;
   static const unsigned long kPhilox10B = 0xBB67AE85;
   static const unsigned long kPhiloxSA = 0xD2511F53;
   static const unsigned long kPhiloxSB = 0xCD9E8D57;
 };
+
 // Inverse of 2^32.
 constexpr float M_RAN_INVM32 = 2.3283064e-10f;
 __device__ __inline__ float4 uniform4(const uint4 x) {
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/smem_tile.h b/aten/src/ATen/native/transformers/cuda/flash_attn/smem_tile.h
new file mode 100644
index 000000000000..7c5aa222d8fe
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/smem_tile.h
@@ -0,0 +1,1704 @@
+/******************************************************************************
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/native/transformers/cuda/flash_attn/utils.h>
+#include <ATen/native/transformers/cuda/flash_attn/gemm.h>
+
+
+namespace fmha {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+    // The description of the tile computed by this CTA.
+    typename Cta_tile,
+    // The number of rows in the 2D shared memory buffer.
+    int M_,
+    // The number of cols.
+    int N_,
+    // The size in bits of each element.
+    int BITS_PER_ELEMENT_,
+    // The number of bytes per STS.
+    int BYTES_PER_STS_ = 16,
+    // The number of buffers. (Used in multistage and double buffer cases.)
+    int BUFFERS_PER_TILE_ = 1,
+    // Do we enable the fast path for LDS.128 and friends.
+    int ENABLE_LDS_FAST_PATH_ = 0,
+    // The number of rows that are used for the XOR swizzling to allow fast STS/LDS.
+    int ROWS_PER_XOR_PATTERN_ = 8,
+    // The number of cols that are used for the XOR swizzling to allow fast STS/LDS.
+    int COLS_PER_XOR_PATTERN_ = 1,
+    // Use or not predicates
+    bool USE_PREDICATES_ = true
+>
+struct Smem_tile_without_skews {
+
+    // The size in bits of each element.
+    enum { BITS_PER_ELEMENT = BITS_PER_ELEMENT_ };
+    // The size in bytes of a single STS.
+    enum { BYTES_PER_STS = BYTES_PER_STS_ };
+    // The number of elements per STS.
+    enum { ELEMENTS_PER_STS = BYTES_PER_STS * 8 / BITS_PER_ELEMENT };
+    // To support arbitrary N, we pad some values to a power-of-2.
+    enum { N_WITH_PADDING = Next_power_of_two<N_>::VALUE };
+    // The number of bytes per row without packing of rows.
+    enum { BYTES_PER_ROW_BEFORE_PACKING = N_WITH_PADDING * BITS_PER_ELEMENT / 8 };
+    // The number of bytes per row -- we want at least 128B per row.
+    enum { BYTES_PER_ROW = Max<BYTES_PER_ROW_BEFORE_PACKING, 128>::VALUE };
+    // The number of rows in shared memory (two rows may be packed into a single one).
+    enum { ROWS = M_ * BYTES_PER_ROW_BEFORE_PACKING / BYTES_PER_ROW };
+
+    // The number of threads per row.
+    enum { THREADS_PER_ROW_UNBOUNDED = BYTES_PER_ROW / BYTES_PER_STS };
+    // The number of threads per row.
+    enum { THREADS_PER_ROW = Min<Cta_tile::THREADS_PER_CTA, THREADS_PER_ROW_UNBOUNDED>::VALUE };
+
+    // The number of STS per row.
+    enum { STS_PER_ROW = BYTES_PER_ROW / THREADS_PER_ROW / BYTES_PER_STS };
+    // It must be at least one.
+    static_assert(STS_PER_ROW >= 1, "");
+    // The number of rows written with a single STS.
+    enum { ROWS_PER_STS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
+    // Make sure we write to at least one row per STS. Thanks Dr. Obvious ;)
+    static_assert(ROWS_PER_STS >= 1, "");
+    // The number of STS needed to store all rows.
+    enum { STS_PER_COL = Div_up<ROWS, ROWS_PER_STS>::VALUE };
+    // The number of STS in total.
+    enum { STS = STS_PER_COL * STS_PER_ROW };
+
+    // TD [2022-06-02] In the case of Q (16 x 64) in the backward pass with 256 threads,
+    // we only need to store 16 * 64 * 2 = 2KB instead of 4KB.
+    static constexpr bool PARTIAL_STORE = ROWS_PER_STS > ROWS;
+    static constexpr int STORING_THREADS = PARTIAL_STORE ? ROWS * THREADS_PER_ROW : Cta_tile::THREADS_PER_CTA;
+
+    // The size of one buffer in bytes in shared memory.
+    // enum { BYTES_PER_BUFFER = STS * BYTES_PER_STS * Cta_tile::THREADS_PER_CTA };
+    enum { BYTES_PER_BUFFER = STS * BYTES_PER_STS * STORING_THREADS };
+    // The number of buffers.
+    enum { BUFFERS_PER_TILE = BUFFERS_PER_TILE_ };
+    // The size in bytes of total buffers.
+    enum { BYTES_PER_TILE = BYTES_PER_BUFFER * BUFFERS_PER_TILE };
+    // The boundary for smem_read_offset and smem_write_offset increment.
+    enum { BYTES_PER_TILE_INC_BOUNDARY = BYTES_PER_TILE - BYTES_PER_BUFFER };
+
+    // Do we enable the LDS.128 fast path?
+    enum { ENABLE_LDS_FAST_PATH = ENABLE_LDS_FAST_PATH_ };
+    static_assert(ENABLE_LDS_FAST_PATH == 0);
+    // The number of rows that are used for the XOR swizzling to allow fast STS/LDS.
+    enum { ROWS_PER_XOR_PATTERN = ROWS_PER_XOR_PATTERN_ };
+    // The number of cols that are used for the XOR swizzling to allow fast STS/LDS.
+    enum { COLS_PER_XOR_PATTERN = COLS_PER_XOR_PATTERN_ * 16 / BYTES_PER_STS };
+    // Use or not predicates
+    enum { USE_PREDICATES = USE_PREDICATES_ };
+
+    // The type of elements that are stored in shared memory by each thread.
+    using Store_type = typename Uint_from_size_in_bytes<BYTES_PER_STS>::Type;
+
+    // Ctor.
+    inline __device__ Smem_tile_without_skews(void *smem, int tidx)
+        : smem_(__nvvm_get_smem_pointer(smem)), tidx_(tidx) {
+
+        // The row written by a thread. See doc/mma_smem_layout.xlsx.
+        int smem_write_row = tidx / THREADS_PER_ROW;
+
+        // The XOR pattern.
+        int smem_write_xor = smem_write_row % ROWS_PER_XOR_PATTERN * COLS_PER_XOR_PATTERN;
+        // Compute the column and apply the XOR pattern.
+        int smem_write_col = (tidx % THREADS_PER_ROW) ^ smem_write_xor;
+
+        // The offset.
+        this->smem_write_offset_ = smem_write_row*BYTES_PER_ROW + smem_write_col*BYTES_PER_STS;
+
+        // TODO: Why not merge it with the read offset?
+        // this->smem_read_buffer_ = __shfl_sync(0xffffffff, 0, 0);
+        // this->smem_write_buffer_ = __shfl_sync(0xffffffff, 0, 0);
+    }
+
+    // Compute the store pointers.
+    template< int N >
+    inline __device__ void compute_store_pointers(uint32_t (&ptrs)[N]) {
+        #pragma unroll
+        for( int ii = 0; ii < N; ++ii ) {
+            // Decompose the STS into row/col.
+            int row = ii / STS_PER_ROW;
+            int col = ii % STS_PER_ROW;
+
+            // Assemble the offset.
+            int offset = smem_write_offset_ + row*ROWS_PER_STS*BYTES_PER_ROW;
+
+            // Take the column into account.
+            if( STS_PER_ROW > 1 ) {
+                offset += col*THREADS_PER_ROW*BYTES_PER_STS;
+            }
+
+            // Apply the XOR pattern if needed.
+            if( ROWS_PER_STS < ROWS_PER_XOR_PATTERN ) {
+                const int m = row * ROWS_PER_STS % ROWS_PER_XOR_PATTERN;
+                offset ^= m * COLS_PER_XOR_PATTERN * BYTES_PER_STS;
+            }
+
+            // Assemble the final pointer :)
+            // ptrs[ii] = smem_ + offset + smem_write_buffer_;
+            // smem_write_buffer_ is already merged with smem_write_offset_
+            ptrs[ii] = smem_ + offset;
+        }
+    }
+
+    inline __device__ void debug_reset() {
+        for( int buffer = 0; buffer < BYTES_PER_TILE; buffer += BYTES_PER_BUFFER) {
+        for( int row = 0; row < ROWS; ++row ) {
+            for( int col = 0; col < BYTES_PER_ROW; col += 4 ) {
+                if( threadIdx.x == 0 ) {
+                    uint32_t val = 0x0;
+                    sts(val, smem_ + row*BYTES_PER_ROW + col + buffer);
+                }
+            }
+        }
+        }
+    }
+
+    // Print the content of the tile (only for debug ;)).
+    inline __device__ void debug_print() const {
+        for( int buffer = 0; buffer < BYTES_PER_TILE; buffer += BYTES_PER_BUFFER) {
+        for( int row = 0; row < ROWS; ++row ) {
+            for( int col = 0; col < BYTES_PER_ROW; col += 4 ) {
+                if( threadIdx.x == 0 ) {
+                    uint32_t val;
+                    lds(val, smem_ + row*BYTES_PER_ROW + col + buffer);
+                    printf("block=(x=%2d, y=%2d, z=%2d) (smem_=%2d, buffer=%2d, row=%2d, byte=%4d)=0x%08x\n",
+                        blockIdx.x,
+                        blockIdx.y,
+                        blockIdx.z,
+                        smem_,
+                        buffer,
+                        row,
+                        col,
+                        val);
+                }
+            }
+        }
+        }
+    }
+
+    // Move the read offset to next buffer.
+    inline __device__ void move_to_next_read_buffer() {
+        // if( BUFFERS_PER_TILE > 1 && smem_read_buffer_ >= BYTES_PER_TILE_INC_BOUNDARY ) {
+        //     this->smem_read_buffer_ -= BYTES_PER_TILE_INC_BOUNDARY;
+        // } else if( BUFFERS_PER_TILE > 1 ) {
+        //     this->smem_read_buffer_ += BYTES_PER_BUFFER;
+        // }
+        if( BUFFERS_PER_TILE > 1 && smem_read_offset_ >= BYTES_PER_TILE_INC_BOUNDARY ) {
+            this->smem_read_offset_ -= BYTES_PER_TILE_INC_BOUNDARY;
+        } else if( BUFFERS_PER_TILE > 1 ) {
+            this->smem_read_offset_ += BYTES_PER_BUFFER;
+        }
+    }
+
+    // Move the read offset to next buffer. TODO: Remove this member function!!!
+    inline __device__ void move_next_read_buffer() {
+        this->move_to_next_read_buffer();
+    }
+
+    // Move the read offset to next N buffer (circular-buffer).
+    inline __device__ void move_to_next_read_buffer(int N) {
+        if( BUFFERS_PER_TILE > 1 ) {
+            // this->smem_read_buffer_ += N * BYTES_PER_BUFFER;
+            // this->smem_read_buffer_ -= smem_read_buffer_ >= BYTES_PER_TILE ? BYTES_PER_TILE : 0;
+            this->smem_read_offset_ += N * BYTES_PER_BUFFER;
+            this->smem_read_offset_ -= smem_read_offset_ >= BYTES_PER_TILE ? BYTES_PER_TILE : 0;
+        }
+    }
+
+    // Move the read offset to next N buffer (circular-buffer). TODO: Remove this member function!!!
+    inline __device__ void move_next_read_buffer(int N) {
+        this->move_to_next_read_buffer(N);
+    }
+
+    // Move the write offset to next buffer.
+    inline __device__ void move_to_next_write_buffer() {
+        // if( BUFFERS_PER_TILE > 1 && smem_write_buffer_ >= BYTES_PER_TILE_INC_BOUNDARY ) {
+        //     this->smem_write_buffer_ -= BYTES_PER_TILE_INC_BOUNDARY;
+        // } else if( BUFFERS_PER_TILE > 1 ) {
+        //     this->smem_write_buffer_ += BYTES_PER_BUFFER;
+        // }
+        if( BUFFERS_PER_TILE > 1 && smem_write_offset_ >= BYTES_PER_TILE_INC_BOUNDARY ) {
+            this->smem_write_offset_ -= BYTES_PER_TILE_INC_BOUNDARY;
+        } else if( BUFFERS_PER_TILE > 1 ) {
+            this->smem_write_offset_ += BYTES_PER_BUFFER;
+        }
+    }
+
+    // Move the write offset to next buffer. TODO: Remove that member function!
+    inline __device__ void move_next_write_buffer() {
+        this->move_to_next_write_buffer();
+    }
+
+    // Move the read offset.
+    inline __device__ void move_read_offset(int delta) {
+        this->smem_read_offset_ += delta;
+    }
+
+    // Move the write offset.
+    inline __device__ void move_write_offset(int delta) {
+        this->smem_write_offset_ += delta;
+    }
+
+    // Store to the tile in shared memory.
+    template< int N >
+    inline __device__ void store(const Store_type (&data)[N], uint64_t = 0) {
+        uint32_t smem_ptrs[N];
+        this->compute_store_pointers(smem_ptrs);
+        // Trying to reduce the shared mem for Q from 4KB per buffer to 2KB per buffer.
+        if (!PARTIAL_STORE || (tidx_ / THREADS_PER_ROW < ROWS)) {
+            sts(smem_ptrs, data);
+        }
+    }
+
+    // Store to the tile in shared memory.
+    template< int N, int M >
+    inline __device__ void store(const Store_type (&data)[N], uint32_t (&preds)[M], uint64_t = 0) {
+        uint32_t smem_ptrs[N];
+        this->compute_store_pointers(smem_ptrs);
+        sts(smem_ptrs, data, preds);
+    }
+
+    // Store to the tile in shared memory.
+    template< int N >
+    inline __device__ void store(const Store_type (&data)[N], uint32_t preds, uint64_t = 0) {
+        this->store(data, preds);
+    }
+
+    // Store to the tile in shared memory.
+    template< int N >
+    inline __device__ void store(const void* (&gmem_ptrs)[N], uint32_t preds, uint64_t = 0) {
+        uint32_t tmp[1] = { preds };
+        this->store(gmem_ptrs, tmp);
+    }
+
+    // The shared memory pointer.
+    const uint32_t smem_;
+    // The read offset. Reserve 4 offsets if needed.
+    int smem_read_offset_;
+    // The write offset.
+    int smem_write_offset_;
+    // The buffer base offset for read.
+    // int smem_read_buffer_;
+    // The buffer base offset for write.
+    // int smem_write_buffer_;
+    const int tidx_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+    // The dimensions of the tile computed by the CTA.
+    typename Cta_tile,
+    // The layout of the tile.
+    typename Layout,
+    // The size of the STS.
+    int BYTES_PER_STS = 16,
+    // The number of buffers per tile.
+    int BUFFERS_PER_TILE = 1,
+    // Use or not predicates
+    bool USE_PREDICATES = true
+>
+struct Smem_tile_a {
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int MMAS_K, int MMAS_K_WITH_PADDING >
+struct Compute_reset_mask {
+    // The potential mask.
+    enum { HALF = MMAS_K_WITH_PADDING / 2 };
+    // The remainder.
+    enum { MOD = MMAS_K % HALF };
+    // The final value.
+    enum { VALUE = (MMAS_K == MOD ? 0 : HALF) | Compute_reset_mask<MOD, HALF>::VALUE };
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int MMAS_K_WITH_PADDING >
+struct Compute_reset_mask<0, MMAS_K_WITH_PADDING> {
+    enum { VALUE = 0 };
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int MMAS_K >
+struct Compute_reset_mask<MMAS_K, MMAS_K> {
+    enum { VALUE = MMAS_K - 1 };
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N >
+struct Rows_per_xor_pattern_a {
+    // The size in bits.
+    enum { N_IN_BITS = N * fmha::BITS_PER_ELEMENT_A };
+    // The number of rows.
+    enum { VALUE = N_IN_BITS <= 256 ? 2 : (N_IN_BITS <= 512 ? 4 : 8) };
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N >
+struct Rows_per_xor_pattern_row_a : public Rows_per_xor_pattern_a<N> {
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+    // The dimensions of the tile computed by the CTA.
+    typename Cta_tile,
+    // The size of the STS.
+    int BYTES_PER_STS,
+    // The number of buffers per tile.
+    int BUFFERS_PER_TILE,
+    // How many rows to use for the XOR pattern to avoid bank conflicts?
+    int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_row_a<Cta_tile::K>::VALUE
+>
+struct Smem_tile_row_a : public Smem_tile_without_skews<Cta_tile,
+                                                               Cta_tile::M,
+                                                               Cta_tile::K,
+                                                               fmha::BITS_PER_ELEMENT_A,
+                                                               BYTES_PER_STS,
+                                                               BUFFERS_PER_TILE,
+                                                               0,
+                                                               ROWS_PER_XOR_PATTERN_,
+                                                               1> {
+    // The MMA tile.
+    using Mma_tile = fmha::Hmma_tile<Cta_tile>;
+    // The base class.
+    using Base = Smem_tile_without_skews<Cta_tile,
+                                         Cta_tile::M,
+                                         Cta_tile::K,
+                                         fmha::BITS_PER_ELEMENT_A,
+                                         BYTES_PER_STS,
+                                         BUFFERS_PER_TILE,
+                                         0,
+                                         ROWS_PER_XOR_PATTERN_,
+                                         1>;
+    // The fragment.
+    using Fragment = Fragment_a<Row>;
+
+    // When we use padding to reach a power of two, special care has to be taken.
+    using Cta_tile_with_padding = Cta_tile_with_k_with_padding<Cta_tile>;
+    // The number of MMAs.
+    using Mma_tile_with_padding = fmha::Hmma_tile<Cta_tile_with_padding>;
+
+    // The size of a single LDS in bytes.
+    enum { BYTES_PER_LDS = 16 };
+
+    // Ctor.
+    inline __device__ Smem_tile_row_a(void *smem, int tidx) : Base(smem, tidx) {
+
+        // For documentation on the layout, see doc/mma_smem_layout.xlsx.
+
+        // The number of warps.
+        const int WARPS_M = Cta_tile::WARPS_M;
+        const int WARPS_N = Cta_tile::WARPS_N;
+        const int WARPS_K = Cta_tile::WARPS_K;
+
+        static_assert(WARPS_M == 1);
+        static_assert(WARPS_N == 4 || WARPS_N == 8);
+        static_assert(WARPS_K == 1);
+        static_assert(Base::ROWS_PER_XOR_PATTERN == 2 || Base::ROWS_PER_XOR_PATTERN == 4 || Base::ROWS_PER_XOR_PATTERN == 8);
+
+        // The row and column read by the thread.
+        int smem_read_row  = (tidx & 0x0f);
+        constexpr int ROWS_PER_PACKING = Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING;
+        int smem_read_col = ((smem_read_row / ROWS_PER_PACKING) % Base::ROWS_PER_XOR_PATTERN) * Base::COLS_PER_XOR_PATTERN;
+        smem_read_col ^= (tidx & 0x10) / 16;
+
+        // The shared memory offset.
+        this->smem_read_offset_ = smem_read_row*Base::BYTES_PER_ROW_BEFORE_PACKING + smem_read_col*BYTES_PER_LDS;
+    }
+
+    // Rewind smem_read_offset for last LDS phase in main loop.
+    inline __device__ void reverse_smem_read_offset(int ki = 0) {
+        // Undo the pointer increment for the next ni.
+        // Should match the load function below for ki = 0.
+        if( Mma_tile_with_padding::MMAS_K >=  2 ) {
+            this->smem_read_offset_ ^= BYTES_PER_LDS * 2;
+        }
+    }
+
+    // Load from shared memory.
+    inline __device__ void load(Fragment (&a)[Mma_tile::MMAS_M], int ki) {
+        #pragma unroll
+        for( int mi = 0; mi < Mma_tile::MMAS_M; ++mi ) {
+            // Jump by as many matrix rows as needed (a row in smem may pack multiple matrix rows).
+            int offset = mi * Mma_tile::M_PER_MMA_PER_CTA * Base::BYTES_PER_ROW_BEFORE_PACKING;
+
+            // Load using LDSM.M88.4.
+            uint4 tmp;
+            // ldsm(tmp, this->smem_ + this->smem_read_offset_ + this->smem_read_buffer_ + offset);
+            ldsm(tmp, this->smem_ + this->smem_read_offset_ + offset);
+
+            // Store the value into the fragment.
+            a[mi].reg(0) = tmp.x;
+            a[mi].reg(1) = tmp.y;
+            a[mi].reg(2) = tmp.z;
+            a[mi].reg(3) = tmp.w;
+        }
+
+        // Move the offset to the next possition. See doc/mma_smem_layout.xlsx.
+        static_assert(Mma_tile_with_padding::MMAS_K < 64, "Not implemented");
+        if(        Mma_tile_with_padding::MMAS_K >= 32 && ki % 16 == 15 ) {
+            this->smem_read_offset_ ^= 31 * BYTES_PER_LDS * 2;
+        } else if( Mma_tile_with_padding::MMAS_K >= 16 && ki %  8 ==  7 ) {
+            this->smem_read_offset_ ^= 15 * BYTES_PER_LDS * 2;
+        } else if( Mma_tile_with_padding::MMAS_K >=  8 && ki %  4 ==  3 ) {
+            this->smem_read_offset_ ^=  7 * BYTES_PER_LDS * 2;
+        } else if( Mma_tile_with_padding::MMAS_K >=  4 && ki %  2 ==  1 ) {
+            this->smem_read_offset_ ^=  3 * BYTES_PER_LDS * 2;
+        } else if( Mma_tile_with_padding::MMAS_K >=  2 ) {
+            this->smem_read_offset_ ^=  1 * BYTES_PER_LDS * 2;
+        }
+    }
+
+    // Reset the read offset.
+    inline __device__ void reset_read_offset() {
+        // The number of MMAs in the K dimension.
+        enum { MMAS_K = Mma_tile::MMAS_K };
+        // The number of MMAs in the K dimension when we include padding.
+        enum { MMAS_K_WITH_PADDING = Mma_tile_with_padding::MMAS_K };
+        // Assemble the mask.
+        enum { MASK = Compute_reset_mask<MMAS_K, MMAS_K_WITH_PADDING>::VALUE };
+
+        // Reset the read offset.
+        this->smem_read_offset_ ^= MASK * BYTES_PER_LDS * 2;
+    }
+
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+    // The dimensions of the tile computed by the CTA.
+    typename Cta_tile,
+    // The size of the STS.
+    int BYTES_PER_STS,
+    // The number of buffers per tile.
+    int BUFFERS_PER_TILE
+>
+struct Smem_tile_a<Cta_tile, Row, BYTES_PER_STS, BUFFERS_PER_TILE>
+    : public Smem_tile_row_a<Cta_tile,
+                                    BYTES_PER_STS,
+                                    BUFFERS_PER_TILE> {
+    // The base class.
+    using Base = Smem_tile_row_a<Cta_tile, BYTES_PER_STS, BUFFERS_PER_TILE>;
+
+    // Ctor.
+    inline __device__ Smem_tile_a(void *smem, int tidx) : Base(smem, tidx) {
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+    // The dimensions of the tile computed by the CTA.
+    typename Cta_tile,
+    // The layout of the tile.
+    typename Layout,
+    // The size of the STS.
+    int BYTES_PER_STS = 16,
+    // The number of buffers per tile.
+    int BUFFERS_PER_TILE = 1,
+    // Use or not predicates
+    bool USE_PREDICATES = true
+>
+struct Smem_tile_b {
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N >
+struct Rows_per_xor_pattern_b {
+    // The size in bits.
+    enum { N_IN_BITS = N * fmha::BITS_PER_ELEMENT_B };
+    // The number of rows.
+    enum { VALUE = N_IN_BITS <= 256 ? 2 : (N_IN_BITS <= 512 ? 4 : 8) };
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N >
+struct Rows_per_xor_pattern_col_b : public Rows_per_xor_pattern_b<N> {
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+    // The dimensions of the tile computed by the CTA.
+    typename Cta_tile,
+    // The size of the STS.
+    int BYTES_PER_STS,
+    // The number of buffers per tile.
+    int BUFFERS_PER_TILE,
+    // How many rows to use for the XOR pattern to avoid bank conflicts?
+    int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_col_b<Cta_tile::K>::VALUE
+>
+struct Smem_tile_col_b : public Smem_tile_without_skews<Cta_tile,
+                                                           Cta_tile::N,
+                                                           Cta_tile::K,
+                                                           fmha::BITS_PER_ELEMENT_B,
+                                                           BYTES_PER_STS,
+                                                           BUFFERS_PER_TILE,
+                                                           0,
+                                                           ROWS_PER_XOR_PATTERN_,
+                                                           1> {
+    // The MMA tile.
+    using Mma_tile = fmha::Hmma_tile<Cta_tile>;
+    // The base class.
+    using Base = Smem_tile_without_skews<Cta_tile,
+                                         Cta_tile::N,
+                                         Cta_tile::K,
+                                         fmha::BITS_PER_ELEMENT_B,
+                                         BYTES_PER_STS,
+                                         BUFFERS_PER_TILE,
+                                         0,
+                                         ROWS_PER_XOR_PATTERN_,
+                                         1>;
+    // The fragment.
+    using Fragment = Fragment_b< Col>;
+
+    // When we use padding to reach a power of two, special care has to be taken.
+    using Cta_tile_with_padding = Cta_tile_with_k_with_padding< Cta_tile>;
+    // The number of MMAs.
+    using Mma_tile_with_padding = fmha::Hmma_tile<Cta_tile_with_padding>;
+
+    // The size of a single LDS in bytes.
+    enum { BYTES_PER_LDS = 16 };
+
+    // The number of STS per thread
+    enum { STS_PER_THREAD_ = Base::ROWS * Base::THREADS_PER_ROW / Cta_tile::THREADS_PER_CTA };
+    // The number of STS per thread must be at least 1.
+    enum { STS_PER_THREAD = Max<1, STS_PER_THREAD_>::VALUE };
+
+    // Ctor.
+    inline __device__ Smem_tile_col_b(void *smem, int tidx) : Base(smem, tidx) {
+
+        // For documentation on the layout, see doc/mma_smem_layout.xlsx.
+
+        // The number of warps.
+        const int WARPS_M = Cta_tile::WARPS_M;
+        const int WARPS_N = Cta_tile::WARPS_N;
+        const int WARPS_K = Cta_tile::WARPS_K;
+        static_assert(Base::ROWS_PER_XOR_PATTERN == 2 || Base::ROWS_PER_XOR_PATTERN == 4 || Base::ROWS_PER_XOR_PATTERN == 8);
+        static_assert(WARPS_M == 1);
+        static_assert(WARPS_N == 4 || WARPS_N == 8);
+        static_assert(WARPS_K == 1);
+
+        // The masks to select the warps.
+        const int WARP_MASK_N = Warp_masks<WARPS_M, WARPS_N, WARPS_K>::N;
+
+        // The divisor for the warps.
+        const int WARP_DIV_N = WARPS_M *       1 * Cta_tile::THREADS_PER_WARP;
+
+        // The row and column read by the thread.
+        int smem_read_row  = (tidx & WARP_MASK_N) / WARP_DIV_N * Mma_tile::N_PER_MMA +
+                             (tidx & 0x07) +
+                             (tidx & 0x10) / 2;
+        constexpr int ROWS_PER_PACKING = Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING;
+        int smem_read_col = ((smem_read_row / ROWS_PER_PACKING) % Base::ROWS_PER_XOR_PATTERN) * Base::COLS_PER_XOR_PATTERN;
+        smem_read_col ^= (tidx & 0x08) / 8;
+        // The shared memory offset.
+        this->smem_read_offset_ = smem_read_row*Base::BYTES_PER_ROW_BEFORE_PACKING + smem_read_col*BYTES_PER_LDS;
+    }
+
+    // Rewind smem_read_offset for last LDS phase in main loop.
+    inline __device__ void reverse_smem_read_offset(int ki = 0) {
+        // Undo the pointer increment for the next ni.
+        // Should match the load function below for ki = 0.
+        if( Mma_tile_with_padding::MMAS_K >=  2 ) {
+            this->smem_read_offset_ ^= BYTES_PER_LDS * 2;
+        }
+    }
+
+    // Load from shared memory.
+    inline __device__ void load(Fragment (&b)[Mma_tile::MMAS_N], int ki) {
+        #pragma unroll
+        for( int ni = 0; ni < Mma_tile::MMAS_N; ++ni ) {
+            // Jump by as many matrix rows as needed (a row in smem may pack multiple matrix rows).
+            int offset = ni * Mma_tile::N_PER_MMA_PER_CTA * Base::BYTES_PER_ROW_BEFORE_PACKING;
+
+            // Load using LDSM.M88.4.
+            uint4 tmp;
+            // ldsm(tmp, this->smem_ + this->smem_read_offset_ + this->smem_read_buffer_ + offset);
+            ldsm(tmp, this->smem_ + this->smem_read_offset_ + offset);
+
+            // Store the value into the fragment.
+            b[ni].reg(0) = tmp.x;
+            b[ni].reg(1) = tmp.y;
+            b[ni].reg(2) = tmp.z;
+            b[ni].reg(3) = tmp.w;
+        }
+
+        // Move the offset to the next possition. See doc/mma_smem_layout.xlsx.
+        static_assert(Mma_tile_with_padding::MMAS_K < 64, "Not implemented");
+        if(        Mma_tile_with_padding::MMAS_K >= 32 && ki % 16 == 15 ) {
+            this->smem_read_offset_ ^= 31 * BYTES_PER_LDS * 2;
+        } else if( Mma_tile_with_padding::MMAS_K >= 16 && ki %  8 ==  7 ) {
+            this->smem_read_offset_ ^= 15 * BYTES_PER_LDS * 2;
+        } else if( Mma_tile_with_padding::MMAS_K >=  8 && ki %  4 ==  3 ) {
+            this->smem_read_offset_ ^=  7 * BYTES_PER_LDS * 2;
+        } else if( Mma_tile_with_padding::MMAS_K >=  4 && ki %  2 ==  1 ) {
+            this->smem_read_offset_ ^=  3 * BYTES_PER_LDS * 2;
+        } else if( Mma_tile_with_padding::MMAS_K >=  2 ) {
+            this->smem_read_offset_ ^=  1 * BYTES_PER_LDS * 2;
+        }
+    }
+
+    // Reset the read offset.
+    inline __device__ void reset_read_offset() {
+        // The number of MMAs in the K dimension.
+        enum { MMAS_K = Mma_tile::MMAS_K };
+        // The number of MMAs in the K dimension when we include padding.
+        enum { MMAS_K_WITH_PADDING = Mma_tile_with_padding::MMAS_K };
+        // Assemble the mask.
+        enum { MASK = Compute_reset_mask<MMAS_K, MMAS_K_WITH_PADDING>::VALUE };
+
+        // Reset the read offset.
+        this->smem_read_offset_ ^= MASK * BYTES_PER_LDS * 2;
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+    // The dimensions of the tile computed by the CTA.
+    typename Cta_tile,
+    // The size of the STS.
+    int BYTES_PER_STS,
+    // The number of buffers per tile.
+    int BUFFERS_PER_TILE
+>
+struct Smem_tile_b< Cta_tile, Col, BYTES_PER_STS, BUFFERS_PER_TILE >
+    : public Smem_tile_col_b<Cta_tile,
+                             BYTES_PER_STS,
+                             BUFFERS_PER_TILE> {
+
+    // The base class.
+    using Base = Smem_tile_col_b< Cta_tile, BYTES_PER_STS, BUFFERS_PER_TILE>;
+
+    // Ctor.
+    inline __device__ Smem_tile_b(void *smem, int tidx) : Base(smem, tidx) {
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<  int N >
+struct Rows_per_xor_pattern_row_b : public Rows_per_xor_pattern_b< N> {
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+template<
+    // The dimensions of the tile computed by the CTA.
+    typename Cta_tile,
+    // The size of the STS.
+    int BYTES_PER_STS,
+    // The number of buffers per tile.
+    int BUFFERS_PER_TILE,
+    // How many rows to use for the XOR pattern to avoid bank conflicts?
+    int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_row_b<Cta_tile::N>::VALUE,
+    // How many cols to use for the XOR pattern to avoid bank conflicts?
+    int COLS_PER_XOR_PATTERN_ = 1
+>
+struct Smem_tile_row_b : public Smem_tile_without_skews<Cta_tile,
+                                                               Cta_tile::K,
+                                                               Cta_tile::N,
+                                                               fmha::BITS_PER_ELEMENT_B,
+                                                               BYTES_PER_STS,
+                                                               BUFFERS_PER_TILE,
+                                                               0,
+                                                               ROWS_PER_XOR_PATTERN_,
+                                                               COLS_PER_XOR_PATTERN_> {
+
+    // The MMA tile.
+    using Mma_tile = fmha::Hmma_tile<Cta_tile>;
+    // The base class.
+    using Base = Smem_tile_without_skews<Cta_tile,
+                                         Cta_tile::K,
+                                         Cta_tile::N,
+                                         fmha::BITS_PER_ELEMENT_B,
+                                         BYTES_PER_STS,
+                                         BUFFERS_PER_TILE,
+                                         0,
+                                         ROWS_PER_XOR_PATTERN_,
+                                         COLS_PER_XOR_PATTERN_>;
+    // The fragment.
+    using Fragment = Fragment_b<Row>;
+
+    // Can we use LDSM? No if the data type is 32-bit large.
+    enum { USE_LDSMT = fmha::BITS_PER_ELEMENT_B == 16 };
+    // The size of a single LDS in bytes.
+    enum { BYTES_PER_LDS = USE_LDSMT ? 16 : 4 };
+    // The number of elements per LDS.
+    enum { ELEMENTS_PER_LDS = BYTES_PER_LDS * 8 / fmha::BITS_PER_ELEMENT_B };
+
+    // The number of STS per thread
+    enum { STS_PER_THREAD_ = Base::ROWS * Base::THREADS_PER_ROW / Cta_tile::THREADS_PER_CTA };
+    // The number of STS per thread must be at least 1.
+    enum { STS_PER_THREAD = Max<1, STS_PER_THREAD_>::VALUE };
+
+    // Ctor.
+    inline __device__ Smem_tile_row_b(void *smem, int tidx) : Base(smem, tidx) {
+
+        // The number of warps.
+        const int WARPS_M = Cta_tile::WARPS_M;
+        const int WARPS_N = Cta_tile::WARPS_N;
+        const int WARPS_K = Cta_tile::WARPS_K;
+        static_assert(WARPS_K == 1);
+        static_assert(WARPS_M == 4 || WARPS_M == 8);
+        static_assert(WARPS_N == 1);
+
+        // The masks to select the warps.
+        const int WARP_MASK_N = Warp_masks<WARPS_M, WARPS_N, WARPS_K>::N;
+        const int WARP_MASK_K = Warp_masks<WARPS_M, WARPS_N, WARPS_K>::K;
+
+        // The divisor for the warps.
+        const int WARP_DIV_N = WARPS_M *       1 * Cta_tile::THREADS_PER_WARP;
+        const int WARP_DIV_K = WARPS_M * WARPS_N * Cta_tile::THREADS_PER_WARP;
+
+
+        static_assert(USE_LDSMT);
+        static_assert(Base::ROWS_PER_XOR_PATTERN == 2 || Base::ROWS_PER_XOR_PATTERN == 4 || Base::ROWS_PER_XOR_PATTERN == 8);
+
+        // The row/col read by the thread.
+        int smem_read_row = (tidx & WARP_MASK_K) / WARP_DIV_K * Mma_tile::MMAS_K * 16 +
+                            (tidx & 0x07) + (tidx & 0x08);
+        constexpr int ROWS_PER_PACKING = Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING;
+        int smem_read_col = ((smem_read_row / ROWS_PER_PACKING) % Base::ROWS_PER_XOR_PATTERN) * Base::COLS_PER_XOR_PATTERN;
+        smem_read_col ^= (tidx & WARP_MASK_N) / WARP_DIV_N * 2 + (tidx & 0x10) / 16;
+
+        // The shared memory offset.
+        this->smem_read_offset_ = smem_read_row*Base::BYTES_PER_ROW_BEFORE_PACKING + smem_read_col*BYTES_PER_LDS;
+
+        // Fill zeroes for group conv
+    }
+
+    // Rewind smem_read_offset for last LDS phase in main loop.
+    inline __device__ void reverse_smem_read_offset(int ki = 0) {
+        // The size of each element in bits.
+        const int BITS_PER_ELT = fmha::BITS_PER_ELEMENT_B;
+        // The size in bytes of the data needed to compute an MMA per CTA.
+        const int BYTES_PER_MMA_PER_CTA = Mma_tile::N_PER_MMA_PER_CTA * BITS_PER_ELT / 8;
+
+        #pragma unroll
+        for( int ni = 0; ni < Mma_tile::MMAS_N; ++ni ) {
+            // Undo the pointer increment for the next ni.
+            // Should match the load function below for ki = 0.
+            if( BYTES_PER_MMA_PER_CTA >= 128 ) {
+                // Nothing to do!
+            } else if( BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1 ) {
+                this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA;
+            } else if( BYTES_PER_MMA_PER_CTA == 64 ) {
+                // Nothing to do!
+            } else if( BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 4 ) {
+                this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 2 == 0 ? 2 : 6);
+            } else if( BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 2 ) {
+                this->smem_read_offset_ ^= BYTES_PER_LDS * 2;
+            }
+        }
+
+        // Reset smem_read_offset for odd MMAS_N > 1 (npo2 kernels)
+        if( BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1 &&
+                Mma_tile::MMAS_N % 2 == 1 ) {
+            this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA;
+        }
+    }
+
+    // Load from shared memory.
+    inline __device__ void load(Fragment (&b)[Mma_tile::MMAS_N], int ki) {
+        // The size of each element in bits.
+        const int BITS_PER_ELT = fmha::BITS_PER_ELEMENT_B;
+        // The size in bytes of the data needed to compute an MMA per CTA.
+        const int BYTES_PER_MMA_PER_CTA = Mma_tile::N_PER_MMA_PER_CTA * BITS_PER_ELT / 8;
+
+        // uint32_t smem_read_og = this->smem_ + this->smem_read_offset_;
+        #pragma unroll
+        for( int ni = 0; ni < Mma_tile::MMAS_N; ++ni ) {
+            // Prepare the offset.
+            int offset = ki * Base::ROWS_PER_XOR_PATTERN * 2 * Base::BYTES_PER_ROW_BEFORE_PACKING;
+                if ( BYTES_PER_MMA_PER_CTA == 32 ) {
+                    offset += this->smem_read_offset_;
+                } else if ( BYTES_PER_MMA_PER_CTA == 64 ) {
+                    offset += this->smem_read_offset_ + (ni/2) * BYTES_PER_MMA_PER_CTA * 2;
+                } else {
+                    offset += this->smem_read_offset_ + (ni  ) * BYTES_PER_MMA_PER_CTA;
+                }
+
+            // Load the data using LDSM.MT88.2.
+            // uint32_t ptr = this->smem_ + this->smem_read_buffer_ + offset;
+            uint32_t ptr = this->smem_ + offset;
+            uint4 tmp;
+            if( USE_LDSMT ) {
+                ldsmt(tmp, ptr);
+            } else {
+                lds(tmp.x, (ptr     ) + 0*Base::BYTES_PER_ROW_BEFORE_PACKING);
+                lds(tmp.y, (ptr     ) + 4*Base::BYTES_PER_ROW_BEFORE_PACKING);
+                lds(tmp.z, (ptr ^ 32) + 0*Base::BYTES_PER_ROW_BEFORE_PACKING);
+                lds(tmp.w, (ptr ^ 32) + 4*Base::BYTES_PER_ROW_BEFORE_PACKING);
+            }
+
+            // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
+            //     printf("BYTES_PER_MMA_PER_CTA=%d, ni = %d, smem_read diff = %d\n", BYTES_PER_MMA_PER_CTA, ni, ptr - smem_read_og);
+            // }
+            // Store those values in the fragment.
+            b[ni].reg(0) = tmp.x;
+            b[ni].reg(1) = tmp.y;
+            b[ni].reg(2) = tmp.z;
+            b[ni].reg(3) = tmp.w;
+
+            // Move the pointer for the next ni. I expect the compiler to not recompute those.
+            if( BYTES_PER_MMA_PER_CTA >= 128 ) {
+                // Nothing to do!
+            } else if( BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1 ) {
+                this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA;
+            } else if( BYTES_PER_MMA_PER_CTA == 64 ) {
+                // Nothing to do!
+            } else if( BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 8 ) {
+                this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 4 == 3 ? 14 : (ni % 2 == 1 ? 6 : 2));
+            } else if( BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 4 ) {
+                this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 2 == 0 ? 2 : 6);
+            } else if( BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 2 ) {
+                this->smem_read_offset_ ^= BYTES_PER_LDS * 2;
+            }
+        }
+
+        // Reset smem_read_offset for odd MMAS_N > 1 (npo2 kernels)
+        if( BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1 &&
+                Mma_tile::MMAS_N % 2 == 1 ) {
+            this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA;
+        }
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+    // The dimensions of the tile computed by the CTA.
+    typename Cta_tile,
+    // The size of the STS.
+    int BYTES_PER_STS,
+    // The number of buffers per tile.
+    int BUFFERS_PER_TILE
+>
+struct Smem_tile_b<Cta_tile, Row, BYTES_PER_STS, BUFFERS_PER_TILE>
+    : public Smem_tile_row_b<Cta_tile,
+                             BYTES_PER_STS,
+                             BUFFERS_PER_TILE> {
+
+    // The base class.
+    using Base = Smem_tile_row_b<Cta_tile, BYTES_PER_STS, BUFFERS_PER_TILE>;
+
+    // Ctor.
+    inline __device__ Smem_tile_b(void *smem, int tidx) : Base(smem, tidx) {
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename Cta_tile>
+struct Smem_tile_v : public fmha::Smem_tile_without_skews<Cta_tile, Cta_tile::K, Cta_tile::N, 16, 16, 1, 0, Rows_per_xor_pattern_col_b<Cta_tile::N>::VALUE, 1> {
+
+    // The base class.
+    using Base = Smem_tile_without_skews<Cta_tile, Cta_tile::K, Cta_tile::N, 16, 16, 1, 0, Rows_per_xor_pattern_col_b<Cta_tile::N>::VALUE, 1>;
+    // The MMA tile.
+    using Mma_tile = fmha::Hmma_tile<Cta_tile>;
+    // The fragment.
+    using Fragment = Fragment_b< fmha::Col>;
+
+    // The size of a single LDS in bytes.
+    enum { BYTES_PER_LDS = 16 };
+
+    // Ctor.
+    inline __device__ Smem_tile_v(void *smem, int tidx) : Base(smem, tidx) {
+
+        // The row/col read by the thread.
+        int read_row, read_col;
+
+        static_assert(Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 1 && (Cta_tile::WARPS_K == 4 || Cta_tile::WARPS_K == 8));
+
+        read_row = (tidx & 0xe0) / 2 + (tidx & 0x0f);
+        constexpr int ROWS_PER_PACKING = Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING;
+        read_col = ((read_row / ROWS_PER_PACKING) % Base::ROWS_PER_XOR_PATTERN) * Base::COLS_PER_XOR_PATTERN;
+        read_col ^= (tidx & 0x10) / 16;
+
+        // The shared memory offset.
+        this->smem_read_offset_ = read_row * Base::BYTES_PER_ROW_BEFORE_PACKING + read_col * BYTES_PER_LDS;
+    }
+
+    // Load from shared memory.
+    inline __device__ void load(Fragment (&b)[Mma_tile::MMAS_N], int ki) {
+#pragma unroll
+        for( int ni = 0; ni < Mma_tile::MMAS_N; ++ni ) {
+            // Jump by 16 * #warps row.
+            int row = ki * 16 * Cta_tile::WARPS_K;
+
+            // Load the data using LDSM.MT88.2.
+            uint4 tmp;
+            fmha::ldsmt(tmp, this->smem_ + this->smem_read_offset_ + row * Base::BYTES_PER_ROW_BEFORE_PACKING);
+            b[ni].reg(0) = tmp.x;
+            b[ni].reg(1) = tmp.y;
+            b[ni].reg(2) = tmp.z;
+            b[ni].reg(3) = tmp.w;
+
+            // Move the pointer for the next ni. I expect the compiler to not recompute those.
+            if( Mma_tile::MMAS_N == 1 ) {
+                // noop
+            } else if( Mma_tile::MMAS_N == 2 ) {
+                this->smem_read_offset_ ^= BYTES_PER_LDS * 2;
+            } else if( Mma_tile::MMAS_N == 4 ) {
+                this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 2 == 0 ? 2 : 6);
+            } else if (Mma_tile::MMAS_N == 8) {
+                this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 4 == 3 ? 14 : (ni % 2 == 1 ? 6 : 2));
+            } else {
+                assert(false);  // Not implemented!
+            }
+        }
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Cta_tile>
+struct Smem_tile_o {
+
+    // The MMA tile.
+    using Mma_tile = fmha::Hmma_tile<Cta_tile>;
+    // The accumulators.
+    using Accumulator = fmha::Fragment_accumulator;
+    // The accumulators.
+    using Data_type = typename Accumulator::Data_type;
+
+    // The size of each element.
+    static constexpr int BYTES_PER_ELEMENT = sizeof(Data_type);
+    // The size of each STS.
+    static constexpr int BYTES_PER_STS = 8;
+    // The size of each row in shared memory.
+    static constexpr int BYTES_PER_ROW = Cta_tile::N * Cta_tile::WARPS_K * BYTES_PER_ELEMENT;
+
+    // The size of each LDS.
+    static constexpr int BYTES_PER_LDS = 16;
+    static constexpr int THREADS_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT / BYTES_PER_LDS;
+
+    // The number of rows.
+    static constexpr int ROWS = Cta_tile::M;
+    // The number of "rows" to process per loop iteration (in the "epilogue").
+    static constexpr int ROWS_PER_LOOP = ROWS <= 64 ? ROWS : (int)Mma_tile::M_PER_MMA_PER_CTA;
+    // The number of outer loops.
+    static constexpr int LOOPS = ROWS / ROWS_PER_LOOP;
+    // Make sure it matches our expectations.
+    static_assert(LOOPS == 1 || LOOPS == (int)Mma_tile::MMAS_M, "");
+
+    // The number of rows loaded per LDS.
+    static constexpr int ROWS_PER_LDS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW;
+    // Do we have to guard against partial writes/reads.
+    static constexpr bool HAS_INCOMPLETE_LDS = ROWS_PER_LOOP % ROWS_PER_LDS != 0;
+    // The total number of LDS per loop.
+    static constexpr int LDS_PER_LOOP = fmha::DivUpConstexpr(ROWS_PER_LOOP, ROWS_PER_LDS);
+
+    // The amount of shared memory.
+    static constexpr int BYTES_PER_TILE = ROWS_PER_LOOP * BYTES_PER_ROW;
+
+    // The write pointer.
+    uint32_t smem_write_, smem_read_;
+    // Is the thread active for the last LDS of the series?
+    int is_active_for_last_lds_;
+
+    // static_assert(BYTES_PER_ROW == 64 * 4 * Cta_tile::WARPS_K);
+    static_assert(LOOPS == 1 || LOOPS == (int)Mma_tile::MMAS_M, "");
+
+    // Ctor.
+    inline __device__ Smem_tile_o(void *smem, int tidx) {
+
+        // Get a 32-bit value for the shared memory address.
+        uint32_t smem_ = __nvvm_get_smem_pointer(smem);
+
+        static_assert(Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 1 && (Cta_tile::WARPS_K == 4 || Cta_tile::WARPS_K == 8));
+        static_assert(Cta_tile::N == 16 || Cta_tile::N == 32 || Cta_tile::N == 64 || Cta_tile::N == 128);
+
+        int write_row = (tidx & 0x1c) / 4;
+
+        const int lane = tidx % 32;
+        const int warp = tidx / 32;
+
+        constexpr int ELEMENTS_PER_STS = BYTES_PER_STS / BYTES_PER_ELEMENT;
+        constexpr int STS_PER_WARP = 16 * Mma_tile::MMAS_N / ELEMENTS_PER_STS;
+        int write_col = warp * STS_PER_WARP + lane % STS_PER_WARP;
+
+        // if ((threadIdx.x == 16) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
+        //     printf("write_row = %d, write_col = %d\n", write_row, write_col);
+        // }
+
+        // if ((blockIdx.x == 0) && (blockIdx.y == 0) && (write_row == 0) && (write_col == 0)) {
+        //     printf("threadIdx.x = %d\n", threadIdx.x);
+        // }
+
+        // Assemble the write pointer.
+        smem_write_ = smem_ + write_row * BYTES_PER_ROW + write_col * BYTES_PER_STS;
+
+        // The element read by each thread.
+        int read_row = tidx / THREADS_PER_ROW;
+        int read_col = tidx % THREADS_PER_ROW;
+
+        // Take the XOR pattern into account for the column.
+        read_col ^= 2 * (read_row % (Cta_tile::N == 16 ? 2 : (Cta_tile::N == 32 ? 4 : 8)));
+        // read_col ^= 2 * (read_row % (Cta_tile::N == 16 ? 2 : (Cta_tile::N == 32 ? 4 : (Cta_tile::N == 128 ? 16 : 8))));
+
+        // if ((threadIdx.x == 8) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
+        //     printf("read_row = %d, read_col = %d\n", read_row, read_col);
+        // }
+        // if ((blockIdx.x == 0) && (blockIdx.y == 0) && (read_row == 0) && (read_col == 0)) {
+        //     printf("threadIdx.x = %d\n", threadIdx.x);
+        // }
+        // Assemble the read pointer.
+        this->smem_read_ = smem_ + read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS;
+
+        // Is that thread active on the last LDS?
+        if( HAS_INCOMPLETE_LDS ) {
+            this->is_active_for_last_lds_ = read_row + (LDS_PER_LOOP - 1) * ROWS_PER_LDS < Cta_tile::M;
+        }
+    }
+
+    // Load the output fragments.
+    template <bool zero_init=true>
+    inline __device__ void load(uint4 (&out)[LDS_PER_LOOP]) const {
+        #pragma unroll
+        for( int ii = 0; ii < LDS_PER_LOOP; ++ii ) {
+
+            // Load the elements before the reduction (split-K).
+            uint4 tmp[Cta_tile::WARPS_K];
+            #pragma unroll
+            for( int jj = 0; jj < Cta_tile::WARPS_K; ++jj ) {
+                int imm = ii * ROWS_PER_LDS * BYTES_PER_ROW + jj * Cta_tile::N * BYTES_PER_ELEMENT;
+                uint32_t smem_read = this->smem_read_ + imm;
+                // TD [2022-06-05] Ugly fix for d=128 in the forward pass, maybe there's a better way.
+                if ((Cta_tile::N == 128) && (ROWS_PER_LDS == 4) && (ii % 2 == 1)) {
+                    smem_read ^= 8 * BYTES_PER_LDS;
+                }
+                // if ((threadIdx.x == 8) && (blockIdx.x == 0) && (blockIdx.y == 0))  {
+                //     printf("imm diff = %d\n", smem_read - this->smem_read_);
+                // }
+                if( !HAS_INCOMPLETE_LDS || (ii < LDS_PER_LOOP - 1 || this->is_active_for_last_lds_) ) {
+                    // fmha::lds(tmp[jj], this->smem_read_ + imm);
+                    fmha::lds(tmp[jj], smem_read);
+                }
+            }
+
+            // Perform the reduction.
+            out[ii] = zero_init ? tmp[0] : fmha::fadd4(out[ii], tmp[0]);
+            // if ((threadIdx.x == 8) && (blockIdx.x == 0) && (blockIdx.y == 0))  {
+            //     printf("out reduction: out = %.6f\n", reinterpret_cast<float (&)[4]>(out[ii])[0]);
+            // }
+            #pragma unroll
+            for( int jj = 1; jj < Cta_tile::WARPS_K; ++jj ) {
+                out[ii] = fmha::fadd4(out[ii], tmp[jj]);
+                // if ((threadIdx.x == 8) && (blockIdx.x == 0) && (blockIdx.y == 0))  {
+                //     printf("out reduction tmp = %.6f, out = %.6f\n", reinterpret_cast<float (&)[4]>(tmp[jj])[0], reinterpret_cast<float (&)[4]>(out[ii])[0]);
+                // }
+            }
+        }
+    }
+
+    // Store the accumulators.
+    template <int M, int N>
+    inline __device__ void store(const Accumulator (&acc)[M][N], int mi) {
+        // uint32_t smem_write_og = this->smem_write_;
+        static constexpr int M_PER_MMA = Mma_tile::M_PER_MMA_PER_CTA;
+        #pragma unroll
+        for( int ni = 0; ni < Mma_tile::MMAS_N; ++ni ) {
+
+            // The number of MMAs that are stored per loop iteration.
+            static constexpr int MMAS_M_PER_LOOP = Mma_tile::MMAS_M / LOOPS;
+
+            // Store 1st column of the different MMAs.
+            #pragma unroll
+            for( int mj = 0; mj < MMAS_M_PER_LOOP; ++mj ) {
+                // Precompute the immediates to jump between rows.
+                int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW;
+                int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW;
+                uint2 tmp0, tmp1;
+                tmp0.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(0);
+                tmp0.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(1);
+
+                tmp1.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(2);
+                tmp1.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(3);
+
+                // Store.
+                fmha::sts(this->smem_write_ + row_0, tmp0);
+                fmha::sts(this->smem_write_ + row_1, tmp1);
+            }
+            // if ((threadIdx.x == 16) && (blockIdx.x == 0) && (blockIdx.y == 0))  {
+            //     printf("smem_write diff = %d\n", this->smem_write_ - smem_write_og);
+            // }
+
+            // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0))  {
+            //     uint4 read_tmp;
+            //     fmha::lds(read_tmp, this->smem_read_);
+            //     printf("smem_o = %.6f\n", reinterpret_cast<float (&)[4]>(read_tmp)[0]);
+            // }
+            // Swizzle the write pointer using a XOR of 16B.
+            this->smem_write_ ^= 32;
+
+            // Store 2nd column of the different MMAs.
+            #pragma unroll
+            for( int mj = 0; mj < MMAS_M_PER_LOOP; ++mj ) {
+                // Precompute the immediates to jump between rows.
+                int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW;
+                int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW;
+
+                uint2 tmp0, tmp1;
+                tmp0.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(4);
+                tmp0.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(5);
+
+                tmp1.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(6);
+                tmp1.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(7);
+                // Store.
+                fmha::sts(this->smem_write_ + row_0, tmp0);
+                fmha::sts(this->smem_write_ + row_1, tmp1);
+            }
+
+            // if ((threadIdx.x == 16) && (blockIdx.x == 0) && (blockIdx.y == 0))  {
+            //     printf("smem_write diff = %d\n", this->smem_write_ - smem_write_og);
+            // }
+
+            // Cancel the previous XOR of 1 + swizzle the write pointer using a XOR of 32B or 64B.
+            static_assert(Mma_tile::MMAS_N <= 8, "Not implemented");
+            if(        Mma_tile::MMAS_N >= 8 && ni % 4 == 3 ) {
+                this->smem_write_ ^= 15 * 32;
+            } else if( Mma_tile::MMAS_N >= 4 && ni % 2 == 1 ) {
+                this->smem_write_ ^= 7 * 32;
+            } else if( Mma_tile::MMAS_N >= 2 ) {
+                this->smem_write_ ^= 3 * 32;
+            } else {
+                this->smem_write_ ^= 3 * 32;
+            }
+            // this->smem_write_ ^= (ni & 1) ? 7 * 32 : 3 * 32;
+            // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0))  {
+            //     uint4 read_tmp;
+            //     fmha::lds(read_tmp, this->smem_read_);
+            //     printf("smem_o = %.6f\n", reinterpret_cast<float (&)[4]>(read_tmp)[0]);
+            // }
+        }
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename Cta_tile>
+struct Smem_tile_mma {
+
+    using Mma_tile = fmha::Hmma_tile<Cta_tile>;
+    using Fragment = fmha::Fragment_a<fmha::Col>;
+
+    enum { COLS = Cta_tile::N };
+    enum { BYTES_PER_ELT = 2 };
+    enum { BYTES_PER_STS = 4 };
+    enum { BYTES_PER_ROW = COLS * BYTES_PER_ELT };  // TODO
+    enum { BYTES_PER_TILE = Cta_tile::M * BYTES_PER_ROW };
+
+    enum { WARPS_M = Cta_tile::WARPS_M };
+    enum { WARPS_N = Cta_tile::WARPS_N };
+    enum { WARPS_K = Cta_tile::WARPS_K };
+
+    static_assert(WARPS_K == 1);
+    inline __device__ Smem_tile_mma(char *smem, int tidx) {
+        uint32_t smem_ = __nvvm_get_smem_pointer(smem);
+
+        int write_col, write_row;
+        static_assert(WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8) || (WARPS_M == 4 || WARPS_M == 8) || WARPS_N == 1);
+        if( WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8) ) {
+            write_row = (tidx & 0x1c) / 4;
+            write_col = (tidx & 0xe0) / 4 + (tidx & 0x03);
+            write_col ^= (write_row & 0x07) * 4;
+        } else {
+            write_row = (tidx & 0xe0) / 2 + (tidx & 0x1c) / 4;
+            write_col = (tidx & 0x03);
+            // write_col ^= (write_row & (BYTES_PER_ROW == 32 ? 0x01 : (BYTES_PER_ROW == 64 ? 0x03 : (BYTES_PER_ROW == 128 ? 0x07 : 0x0f)))) * 4;
+            write_col ^= (write_row & (BYTES_PER_ROW == 32 ? 0x01 : (BYTES_PER_ROW == 64 ? 0x03 : (BYTES_PER_ROW == 128 ? 0x07 : 0x07)))) * 4;
+        }
+
+        // write_offset_ = write_row * BYTES_PER_ROW + write_col * BYTES_PER_STS;
+        smem_write_ = smem_ + write_row * BYTES_PER_ROW + write_col * BYTES_PER_STS;
+    }
+
+    template<int M, int N>
+    inline __device__ void store(const uint4 (&regs)[M][N]) {
+        static_assert(COLS == Cta_tile::N);
+        #pragma unroll
+        for( int mi = 0; mi < M; mi++ ) {
+            #pragma unroll
+            for( int ni = 0; ni < N; ni++ ) {
+                // size_t offset = write_offset_ + mi * WARPS_M * 16 * BYTES_PER_ROW + ni * WARPS_N * 16 * BYTES_PER_ELT;
+                // fmha::sts(smem_ + offset + 0 * BYTES_PER_ROW, regs[mi][ni].x);
+                // fmha::sts(smem_ + offset + 8 * BYTES_PER_ROW, regs[mi][ni].z);
+                // offset ^= 4 * BYTES_PER_STS;
+                // fmha::sts(smem_ + offset + 0 * BYTES_PER_ROW, regs[mi][ni].y);
+                // fmha::sts(smem_ + offset + 8 * BYTES_PER_ROW, regs[mi][ni].w);
+                // size_t offset = smem_write_ + mi * WARPS_M * 16 * BYTES_PER_ROW + ni * WARPS_N * 16 * BYTES_PER_ELT;
+                uint32_t offset = smem_write_ + mi * WARPS_M * 16 * BYTES_PER_ROW + ni * WARPS_N * 16 * BYTES_PER_ELT;
+                fmha::sts(offset + 0 * BYTES_PER_ROW, regs[mi][ni].x);
+                fmha::sts(offset + 8 * BYTES_PER_ROW, regs[mi][ni].z);
+                offset ^= 4 * BYTES_PER_STS;
+                fmha::sts(offset + 0 * BYTES_PER_ROW, regs[mi][ni].y);
+                fmha::sts(offset + 8 * BYTES_PER_ROW, regs[mi][ni].w);
+            }
+        }
+    }
+
+    template<typename Fragment, int M, int N>
+    inline __device__ void store(const Fragment (&frag)[N][M]) {
+        static_assert(COLS == Cta_tile::N);
+        uint4 regs[M][N];
+        #pragma unroll
+        for( int mi = 0; mi < M; mi++ ) {
+            #pragma unroll
+            for( int ni = 0; ni < N; ni++ ) {
+                // Need to transpose ref(1) and reg(2) here since when we load it we transpose again.
+                regs[mi][ni] = make_uint4(frag[ni][mi].reg(0), frag[ni][mi].reg(2),
+                                          frag[ni][mi].reg(1), frag[ni][mi].reg(3));
+            }
+        }
+        this->store(regs);
+    }
+
+    // uint32_t smem_;
+    // uint32_t write_offset_;
+    uint32_t smem_write_;
+};
+
+template< typename Cta_tile, typename Base = Smem_tile_mma< Cta_tile>>
+struct Smem_tile_mma_transposed : public Base {
+    enum { BYTES_PER_LDS = 16 };
+    enum { BYTES_PER_ROW = Base::BYTES_PER_ROW };
+    enum { BYTES_PER_ELT = Base::BYTES_PER_ELT };
+    enum { WARPS_M = Base::WARPS_M };
+    enum { WARPS_N = Base::WARPS_N };
+    static_assert(WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8));
+    using Fragment = typename Base::Fragment;
+    inline __device__ Smem_tile_mma_transposed(char *smem, int tidx) : Base(smem, tidx) {
+
+        uint32_t smem_ = __nvvm_get_smem_pointer(smem);
+        static_assert(WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8));
+        int read_row, read_col;
+        read_row = (tidx & 0x0f);
+        read_col = (tidx & 0xe0) / 16 + (tidx & 0x1c) / 16;
+
+        // read_col ^= (read_row & (Base::BYTES_PER_ROW == 32 ? 0x01 : (Base::BYTES_PER_ROW == 64 ? 0x03 : (Base::BYTES_PER_ROW == 128 ? 0x07 : 0x0f))));
+        read_col ^= (read_row & 0x07);
+        // read_offset_ = read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS;
+        smem_read_ = smem_ + read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS;
+    }
+
+    template<int M, int N>
+    inline __device__ void load(Fragment (&frag)[M][N]) {
+        static_assert(Base::COLS == Cta_tile::N);
+        for( int mi = 0; mi < M; mi++ ) {
+            for( int ni = 0; ni < N; ni++ ) {
+                // size_t offset = read_offset_ + mi * WARPS_M * 16 * BYTES_PER_ROW + ni * WARPS_N * 16 * BYTES_PER_ELT;
+                uint4 dst;
+                // fmha::ldsmt(dst, this->smem_ + offset);
+                // size_t offset = smem_read_ + mi * WARPS_M * 16 * BYTES_PER_ROW + ni * WARPS_N * 16 * BYTES_PER_ELT;
+                uint32_t offset = smem_read_ + mi * WARPS_M * 16 * BYTES_PER_ROW + ni * WARPS_N * 16 * BYTES_PER_ELT;
+                fmha::ldsmt(dst, offset);
+                frag[mi][ni].reg(0) = dst.x;
+                frag[mi][ni].reg(1) = dst.z;  // Fragment A regs col major!
+                frag[mi][ni].reg(2) = dst.y;
+                frag[mi][ni].reg(3) = dst.w;
+            }
+        }
+    }
+
+    // uint32_t read_offset_;
+    uint32_t smem_read_;
+};
+
+template< typename Cta_tile, typename Base = Smem_tile_mma< Cta_tile>>
+struct Smem_tile_mma_epilogue : public Base {
+    enum { BYTES_PER_LDS = 16 };
+    enum { BYTES_PER_ROW = Base::BYTES_PER_ROW };
+    enum { BYTES_PER_ELT = Base::BYTES_PER_ELT };
+    enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDS };
+    static_assert(THREADS_PER_ROW * BYTES_PER_LDS == BYTES_PER_ROW);
+    enum { ROWS_PER_LDS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
+    enum { NUM_LDS = Cta_tile::M / ROWS_PER_LDS };
+    static_assert(NUM_LDS * ROWS_PER_LDS == Cta_tile::M);
+    enum { WARPS_M = Base::WARPS_M };
+    enum { WARPS_N = Base::WARPS_N };
+    static_assert((WARPS_M == 4 || WARPS_N == 8) || WARPS_N == 1);
+
+    using Acc = fmha::Fragment_accumulator;
+
+    inline __device__ Smem_tile_mma_epilogue(char *smem, int tidx) : Base(smem, tidx) {
+        uint32_t smem_ = __nvvm_get_smem_pointer(smem);
+        const int read_row = tidx / THREADS_PER_ROW;
+        int read_col = tidx % THREADS_PER_ROW;
+        // read_col ^= (read_row & (Base::BYTES_PER_ROW == 32 ? 0x01 : (Base::BYTES_PER_ROW == 64 ? 0x03 : 0x07)));
+        static_assert(Base::BYTES_PER_ROW == 32 || Base::BYTES_PER_ROW == 64 || Base::BYTES_PER_ROW == 128 || Base::BYTES_PER_ROW == 256);
+        read_col ^= (read_row & (Base::BYTES_PER_ROW == 32 ? 0x01 : (Base::BYTES_PER_ROW == 64 ? 0x03 : (Base::BYTES_PER_ROW == 128 ? 0x07 : 0x07))));
+        // read_offset_ = read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS;
+        smem_read_ = smem_ + read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS;
+    }
+
+    inline __device__ void load(uint4 (&data)[NUM_LDS]) {
+        for( int ii = 0; ii < NUM_LDS; ii++ ) {
+            // size_t offset = read_offset_ + ii * ROWS_PER_LDS * BYTES_PER_ROW;
+            // fmha::lds(data[ii], this->smem_ + offset);
+            // size_t offset = smem_read_ + ii * ROWS_PER_LDS * BYTES_PER_ROW;
+            uint32_t offset = smem_read_ + ii * ROWS_PER_LDS * BYTES_PER_ROW;
+            fmha::lds(data[ii], offset);
+        }
+    }
+
+    template<typename elem_type=__half, int M, int N>
+    inline __device__ void store(const Acc (&acc)[M][N]){
+        #pragma unroll
+        for( int mi = 0; mi < M; mi++ ) {
+            #pragma unroll
+            for( int ni = 0; ni < N; ni++ ) {
+                // 1st row - 4 elements per row.
+                float tmp00 = acc[mi][ni].elt(0);
+                float tmp01 = acc[mi][ni].elt(1);
+                float tmp02 = acc[mi][ni].elt(4);
+                float tmp03 = acc[mi][ni].elt(5);
+                // 2nd row - 4 elements per row.
+                float tmp10 = acc[mi][ni].elt(2);
+                float tmp11 = acc[mi][ni].elt(3);
+                float tmp12 = acc[mi][ni].elt(6);
+                float tmp13 = acc[mi][ni].elt(7);
+
+                uint32_t x = fmha::float2_pack<elem_type>(tmp00, tmp01);
+                uint32_t y = fmha::float2_pack<elem_type>(tmp02, tmp03);
+                uint32_t z = fmha::float2_pack<elem_type>(tmp10, tmp11);
+                uint32_t w = fmha::float2_pack<elem_type>(tmp12, tmp13);
+
+                // size_t offset = (this->write_offset_ ^ (ni * 32)) + mi * WARPS_M * 16 * BYTES_PER_ROW;
+                // fmha::sts(this->smem_ + offset + 0 * BYTES_PER_ROW, x);
+                // fmha::sts(this->smem_ + offset + 8 * BYTES_PER_ROW, z);
+                // offset ^= 4 * Base::BYTES_PER_STS;
+                // fmha::sts(this->smem_ + offset + 0 * BYTES_PER_ROW, y);
+                // fmha::sts(this->smem_ + offset + 8 * BYTES_PER_ROW, w);
+                // size_t offset = (this->smem_write_ ^ (ni * 32)) + mi * WARPS_M * 16 * BYTES_PER_ROW;
+                uint32_t offset = (this->smem_write_ ^ (ni * 32)) + mi * WARPS_M * 16 * BYTES_PER_ROW;
+                // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
+                //     printf("mi = %d, ni = %d, offset - smem_write_ = %d\n", mi, ni, offset - this->smem_write_);
+                // }
+                fmha::sts(offset + 0 * BYTES_PER_ROW, x);
+                fmha::sts(offset + 8 * BYTES_PER_ROW, z);
+                offset ^= 4 * Base::BYTES_PER_STS;
+                fmha::sts(offset + 0 * BYTES_PER_ROW, y);
+                fmha::sts(offset + 8 * BYTES_PER_ROW, w);
+            }
+        }
+    }
+
+    template<int M, int N>
+    inline __device__ void store(const uint4 (&regs)[M][N]) {
+        for( int mi = 0; mi < M; mi++ ) {
+            for( int ni = 0; ni < N; ni++ ) {
+                // size_t offset = (this->write_offset_ ^ (ni * 32)) + mi * WARPS_M * 16 * BYTES_PER_ROW;
+                uint32_t offset = (this->write_offset_ ^ (ni * 32)) + mi * WARPS_M * 16 * BYTES_PER_ROW;
+                fmha::sts(this->smem_ + offset + 0 * BYTES_PER_ROW, regs[mi][ni].x);
+                fmha::sts(this->smem_ + offset + 8 * BYTES_PER_ROW, regs[mi][ni].z);
+                offset ^= 4 * Base::BYTES_PER_STS;
+                fmha::sts(this->smem_ + offset + 0 * BYTES_PER_ROW, regs[mi][ni].y);
+                fmha::sts(this->smem_ + offset + 8 * BYTES_PER_ROW, regs[mi][ni].w);
+            }
+        }
+    }
+
+    // uint32_t read_offset_;
+    uint32_t smem_read_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename Cta_tile>
+struct Smem_tile_transpose {
+
+    using Mma_tile = fmha::Hmma_tile<Cta_tile>;
+    using Fragment_write = fmha::Fragment_b<fmha::Col>;
+    using Fragment_read = fmha::Fragment_b<fmha::Col>;
+
+    enum { COLS = Cta_tile::N };
+    enum { BYTES_PER_ELT = 2 };
+    enum { BYTES_PER_STS = 4 };
+    enum { BYTES_PER_ROW = COLS * BYTES_PER_ELT };  // TODO
+    enum { BYTES_PER_TILE = Cta_tile::M * BYTES_PER_ROW };
+
+    enum { BYTES_PER_LDS = 16 };
+
+    enum { WARPS_M = Cta_tile::WARPS_M };
+    enum { WARPS_N = Cta_tile::WARPS_N };
+    enum { WARPS_K = Cta_tile::WARPS_K };
+
+    static_assert(WARPS_K == 1);
+    static_assert(WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8));
+
+    inline __device__ Smem_tile_transpose(char *smem, int tidx) {
+        smem_ = __nvvm_get_smem_pointer(smem);
+        // uint32_t smem_ = __nvvm_get_smem_pointer(smem);
+
+        int write_col, write_row;
+        static_assert(WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8) || (WARPS_M == 4 || WARPS_N == 8) || WARPS_N == 1);
+        if( WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8) ) {
+            write_row = (tidx & 0x1c) / 4;
+            write_col = (tidx & 0xe0) / 4 + (tidx & 0x03);
+        } else {
+            write_row = (tidx & 0xe0) / 2 + (tidx & 0x1c) / 4;
+            write_col = (tidx & 0x03);
+        }
+        write_col ^= (write_row & 0x07) * 4;
+
+        write_offset_ = write_row * BYTES_PER_ROW + write_col * BYTES_PER_STS;
+        // smem_write_ = smem_ + write_row * BYTES_PER_ROW + write_col * BYTES_PER_STS;
+
+        int read_row, read_col;
+        read_row = (tidx & 0x0f);
+        read_col = (tidx & 0xe0) / 16 + (tidx & 0x1c) / 16;
+
+        read_col ^= (read_row & 0x07);
+        read_offset_ = read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS;
+        // smem_read_ = smem_ + read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS;
+    }
+
+    template<int M, int N>
+    inline __device__ void store(const Fragment_write (&frag_w)[M][N], int mi) {
+        #pragma unroll
+        for( int ni = 0; ni < N; ni++ ) {
+            // size_t offset = write_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT;
+            uint32_t offset = write_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT;
+            fmha::sts(smem_ + offset + 0 * BYTES_PER_ROW, frag_w[ni][mi].reg(0));
+            fmha::sts(smem_ + offset + 8 * BYTES_PER_ROW, frag_w[ni][mi].reg(2));
+            offset ^= 4 * BYTES_PER_STS;
+            fmha::sts(smem_ + offset + 0 * BYTES_PER_ROW, frag_w[ni][mi].reg(1));
+            fmha::sts(smem_ + offset + 8 * BYTES_PER_ROW, frag_w[ni][mi].reg(3));
+        }
+    }
+
+    template<int N>
+    inline __device__ void load(Fragment_read (&frag_r)[N]) {
+        #pragma unroll
+        for( int ni = 0; ni < N; ni++ ) {
+            // size_t offset = read_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT;
+            uint32_t offset = read_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT;
+            uint4 dst;
+            fmha::ldsmt(dst, this->smem_ + offset);
+            frag_r[ni].reg(0) = dst.x;
+            frag_r[ni].reg(1) = dst.y;  // Fragment B regs col major!
+            frag_r[ni].reg(2) = dst.z;
+            frag_r[ni].reg(3) = dst.w;
+        }
+    }
+
+    template<int M, int N>
+    inline __device__ void transpose(const Fragment_write (&frag_w)[M][N], Fragment_read (&frag_r)[M], int mi) {
+        static_assert(COLS == Cta_tile::N);
+        #pragma unroll
+        for( int ni = 0; ni < N; ni++ ) {
+            // size_t offset = write_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT;
+            uint32_t offset = write_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT;
+            fmha::sts(smem_ + offset + 0 * BYTES_PER_ROW, frag_w[ni][mi].reg(0));
+            fmha::sts(smem_ + offset + 8 * BYTES_PER_ROW, frag_w[ni][mi].reg(2));
+            offset ^= 4 * BYTES_PER_STS;
+            fmha::sts(smem_ + offset + 0 * BYTES_PER_ROW, frag_w[ni][mi].reg(1));
+            fmha::sts(smem_ + offset + 8 * BYTES_PER_ROW, frag_w[ni][mi].reg(3));
+        }
+        #pragma unroll
+        for( int ni = 0; ni < N; ni++ ) {
+            // size_t offset = read_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT;
+            // size_t offset = read_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT;
+            uint32_t offset = read_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT;
+            uint4 dst;
+            fmha::ldsmt(dst, this->smem_ + offset);
+            frag_r[ni].reg(0) = dst.x;
+            frag_r[ni].reg(1) = dst.y;  // Fragment B regs col major!
+            frag_r[ni].reg(2) = dst.z;
+            frag_r[ni].reg(3) = dst.w;
+        }
+    }
+
+    uint32_t smem_;
+    uint32_t write_offset_;
+    uint32_t read_offset_;
+    // uint32_t smem_write_;
+    // uint32_t smem_read_;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<
+    typename Gmem_tile,
+    // The number of buffers. (Used in multistage and double buffer cases.)
+    int BUFFERS_PER_TILE_ = 1
+>
+struct Smem_tile_dp_sum {
+
+    using Cta_tile = typename Gmem_tile::Cta_tile;
+    using Mma_tile = fmha::Hmma_tile<Cta_tile>;
+
+    // The size of each element.
+    static constexpr int BYTES_PER_ELEMENT = 4;
+    static constexpr int ROWS = Gmem_tile::ROWS;
+    static constexpr int THREADS_PER_ROW = Gmem_tile::THREADS_PER_ROW;
+    static constexpr int MMAS_M = Mma_tile::MMAS_M;
+
+    static constexpr int ROWS_PER_LDG = Gmem_tile::ROWS_PER_LDG;
+    static constexpr int LDGS = Gmem_tile::LDGS;
+
+    static constexpr int ROWS_PER_MMA = Mma_tile::M_PER_MMA;
+
+    // The size of one buffer in bytes in shared memory.
+    static constexpr int BYTES_PER_BUFFER = ROWS * BYTES_PER_ELEMENT;
+    // The number of buffers.
+    static constexpr int BUFFERS_PER_TILE = BUFFERS_PER_TILE_;
+    // The size in bytes of total buffers.
+    static constexpr int BYTES_PER_TILE = BYTES_PER_BUFFER * BUFFERS_PER_TILE;
+    // The boundary for smem_read_offset and smem_write_offset increment.
+    static constexpr int ROWS_PER_TILE_INC_BOUNDARY = ROWS * BUFFERS_PER_TILE - ROWS;
+
+    inline __device__ Smem_tile_dp_sum(float *smem, const int tidx)
+        : smem_(smem), smem_read_buffer_(smem), smem_write_buffer_(smem), tidx_(tidx) {
+    }
+
+    // Move the read offset to next buffer.
+    inline __device__ void move_to_next_read_buffer() {
+        if( BUFFERS_PER_TILE > 1 && (smem_read_buffer_ - smem_) >= ROWS_PER_TILE_INC_BOUNDARY ) {
+            this->smem_read_buffer_ -= ROWS_PER_TILE_INC_BOUNDARY;
+        } else if( BUFFERS_PER_TILE > 1 ) {
+            this->smem_read_buffer_ += ROWS;
+        }
+    }
+
+    // Move the write offset to next buffer.
+    inline __device__ void move_to_next_write_buffer() {
+        if( BUFFERS_PER_TILE > 1 && (smem_write_buffer_ - smem_) >= ROWS_PER_TILE_INC_BOUNDARY ) {
+            this->smem_write_buffer_ -= ROWS_PER_TILE_INC_BOUNDARY;
+        } else if( BUFFERS_PER_TILE > 1 ) {
+            this->smem_write_buffer_ += ROWS;
+        }
+    }
+
+    inline __device__ void store(const float (&sum)[LDGS]) {
+        if (tidx_ % THREADS_PER_ROW == 0) {
+            int row = tidx_ / THREADS_PER_ROW;
+            #pragma unroll
+            for (int i = 0; i < LDGS; ++i) {
+                if (row + i * ROWS_PER_LDG < ROWS) {
+                    smem_write_buffer_[row + i * ROWS_PER_LDG] = sum[i];
+                }
+            }
+        }
+    }
+
+    inline __device__ void store(const float sum, const int buffer_idx) {
+        float *smem_write = smem_ + buffer_idx * ROWS;
+        int row = tidx_ / THREADS_PER_ROW;
+        if ((row < ROWS) && (tidx_ % THREADS_PER_ROW == 0)) {
+            smem_write[row] = sum;
+        }
+    }
+
+    inline __device__ void store(const float (&sum)[LDGS], const int buffer_idx) {
+        float *smem_write = smem_ + buffer_idx * ROWS;
+        if (tidx_ % THREADS_PER_ROW == 0) {
+            int row = tidx_ / THREADS_PER_ROW;
+            #pragma unroll
+            for (int i = 0; i < LDGS; ++i) {
+                if (row + i * ROWS_PER_LDG < ROWS) {
+                    smem_write[row + i * ROWS_PER_LDG] = sum[i];
+                }
+            }
+        }
+    }
+
+    inline __device__ void store_pair(const float (&sum)[MMAS_M * 2]) {
+        float *smem_write = smem_;
+        // Extract the position in the warp.
+        int warp = tidx_ / Cta_tile::THREADS_PER_WARP;
+        int lane = tidx_ % Cta_tile::THREADS_PER_WARP;
+        int row = lane / 4;
+        #pragma unroll
+        for (int mi = 0; mi < MMAS_M; ++mi) {
+            smem_write[mi * ROWS_PER_MMA + row + 0] = sum[mi * 2 + 0];
+            smem_write[mi * ROWS_PER_MMA + row + 8] = sum[mi * 2 + 1];
+        }
+    }
+
+    inline __device__ void store_pair(const float (&sum)[MMAS_M * 2], const int buffer_idx) {
+        float *smem_write = smem_ + buffer_idx * ROWS;
+        // Extract the position in the warp.
+        int warp = tidx_ / Cta_tile::THREADS_PER_WARP;
+        int lane = tidx_ % Cta_tile::THREADS_PER_WARP;
+        int row = lane / 4;
+        #pragma unroll
+        for (int mi = 0; mi < MMAS_M; ++mi) {
+            smem_write[mi * ROWS_PER_MMA + row + 0] = sum[mi * 2 + 0];
+            smem_write[mi * ROWS_PER_MMA + row + 8] = sum[mi * 2 + 1];
+        }
+    }
+
+    template<int N>
+    inline __device__ void load(float (&sum)[N], const int (&row)[N]) {
+        #pragma unroll
+        for( int ni = 0; ni < N; ni++ ) {
+            sum[ni] = smem_read_buffer_[row[ni]];
+        }
+    }
+
+    template<int N>
+    inline __device__ void load(float (&sum)[N], const int (&row)[N], const int buffer_idx) {
+        float *smem_read = smem_ + buffer_idx * ROWS;
+        #pragma unroll
+        for( int ni = 0; ni < N; ni++ ) {
+            sum[ni] = smem_read[row[ni]];
+        }
+    }
+
+    static inline __device__ float reduce_warp(float sum) {
+        fmha::SumOp<float> sum_op;
+        return fmha::Allreduce<THREADS_PER_ROW>::run(sum, sum_op);
+    }
+
+    const int tidx_;
+    float * const smem_;
+    float *smem_read_buffer_;
+    float *smem_write_buffer_;
+};
+
+}  // namespace fmha
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/softmax.h b/aten/src/ATen/native/transformers/cuda/flash_attn/softmax.h
index 2e121d0e9311..77dfc350fe70 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/softmax.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/softmax.h
@@ -1,5 +1,4 @@
 /******************************************************************************
- * Copyright (c) 2022, Tri Dao.
  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,11 +28,10 @@
 #pragma once
 
 #include <cmath>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_fp16.h>
 #include <ATen/native/transformers/cuda/flash_attn/philox.cuh>
 
-#include <cutlass/cutlass.h>
-#include <cutlass/array.h>
-
 namespace fmha {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -78,11 +76,18 @@ struct Smem_tile_reduce {
 
     static constexpr int ROWS = WARPS_M * MMAS_M * 16;
     static constexpr int COLS = WARPS_N;
-    static_assert(COLS == 4 || COLS == 8, "");
+    static_assert(COLS == 4 || COLS == 8);
     static constexpr int ROWS_PER_XOR_PATTERN = (COLS == 8) ? 4 : 8;
     static constexpr int BYTES_PER_TILE = ROWS * COLS * sizeof(float);
     static constexpr int ELTS_PER_TILE = ROWS * COLS;
 
+    static constexpr int THREADS_PER_GROUP = Kernel_traits::Gmem_tile_o::THREADS_PER_ROW;
+    // TD [2022-05-02]: No longer true if head_dim != 64
+    // static_assert(THREADS_PER_GROUP == 16); // DEBUG
+    static constexpr int ROWS_PER_WARP = 32 / THREADS_PER_GROUP;
+    static constexpr int LOOPS = Kernel_traits::Gmem_tile_o::LOOPS;
+    static_assert(LOOPS == 1);
+
     using read_t = typename ReadType<COLS>::T;
 
     __device__ inline Smem_tile_reduce(float *smem_, const int tidx) {
@@ -166,6 +171,9 @@ struct Softmax_base {
         :  // packed_mask_ptr_(reinterpret_cast<const char*>(params.packed_mask_ptr)),
           smem_(reinterpret_cast<float *>(smem)), tidx_(tidx) {
 
+        // Move to the 1st mask loaded by the thread+ tidx;
+        // packed_mask_ptr_ += bidb * params.packed_mask_stride_in_bytes + tidx * sizeof(uint32_t);
+
         // Extract the position in the warp.
         int warp = tidx / Cta_tile::THREADS_PER_WARP;
         int lane = tidx % Cta_tile::THREADS_PER_WARP;
@@ -208,6 +216,25 @@ struct Softmax_base {
         }
     }
 
+    // Apply the exp to all the elements.
+    template <bool max_in_base2=false, bool elt_in_base2=false>
+    inline __device__ void apply_exp(const float (&max)[MMAS_M * 2]) {
+        #pragma unroll
+        for( int mi = 0; mi < MMAS_M * 2; ++mi ) {
+            // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
+            // max * log_2(e)) This allows the compiler to use the ffma
+            // instruction instead of fadd and fmul separately.
+            constexpr float kLog2e = M_LOG2E;
+            const float max_base2 = max_in_base2 ? max[mi] : max[mi] * kLog2e;
+            #pragma unroll
+            for( int ni = 0; ni < MMAS_N * 4; ++ni ) {
+                // elt_[mi][ni] = apply_exp_(elt_[mi][ni], max[mi]);
+                elt_[mi][ni] = apply_exp2_(elt_in_base2 ? elt_[mi][ni] : elt_[mi][ni] * kLog2e,
+                                           max_base2);
+            }
+        }
+    }
+
     // Apply the exp to all the elements.
     template <bool scale_max=true>
     inline __device__ void scale_apply_exp(const float (&max)[MMAS_M * 2], const float scale_) {
@@ -226,6 +253,32 @@ struct Softmax_base {
         }
     }
 
+    // Apply the exp to all the elements.
+    template <bool max_in_base2=false>
+    inline __device__ void apply_exp_col(const float (&max)[MMAS_N * 4]) {
+        #pragma unroll
+        for( int ni = 0; ni < MMAS_N * 4; ++ni ) {
+            constexpr float kLog2e = M_LOG2E;
+            const float max_base2 = max_in_base2 ? max[ni] : max[ni] * kLog2e;
+            #pragma unroll
+            for( int mi = 0; mi < MMAS_M * 2; ++mi ) {
+                elt_[mi][ni] = apply_exp2_(elt_[mi][ni] * kLog2e, max_base2);
+            }
+        }
+    }
+    // inline __device__ void apply_exp_col(const float (&max)[MMAS_N]) {
+    //     constexpr float kLog2e = M_LOG2E;
+    //     #pragma unroll
+    //     for( int ni = 0; ni < MMAS_N * 4; ++ni ) {
+    //         float max_base2 = max_in_base2 ? max[ni / 4] : max[ni / 4] * kLog2e;
+    //         max_base2 = __shfl_sync(0xffffffff, max_base2, (ni % 4) * 8 + threadIdx.x % 8);
+    //         #pragma unroll
+    //         for( int mi = 0; mi < MMAS_M * 2; ++mi ) {
+    //             elt_[mi][ni] = apply_exp2_(elt_[mi][ni] * kLog2e, max_base2);
+    //         }
+    //     }
+    // }
+
     template <bool encode_dropout_in_sign_bit=false>
     inline __device__ void apply_dropout_16bits(Philox &ph, uint16_t p_dropout_in_uint16_t) {
         // We encode the dropout pattern in the sign bit of the non-negative
@@ -237,17 +290,52 @@ struct Softmax_base {
         for( int mi = 0; mi < MMAS_M; mi++ ) {
             #pragma unroll
             for( int ni = 0; ni < MMAS_N; ni++ ) {
-                uint4 random_uint4 = ph();
-                uint16_t (&rnd)[8] = reinterpret_cast<uint16_t (&)[8]>(random_uint4);
+                uint16_t tmp[8];
+                // fmha::uint4_to_ushort8(ph(), tmp);
+                uint4 tmp_32 = ph();
+                fmha::uint4_to_ushort8(tmp_32, tmp);
+                // if ((threadIdx.x % 32 == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
+                //     printf("tidx = %d, ni = %d, ph  Philox: %u, %u, %u, %u\n", threadIdx.x, ni, tmp_32.x, tmp_32.y, tmp_32.z, tmp_32.w);
+                // }
+                #pragma unroll
+                for (int ii = 0; ii < 2; ++ii) {
+                    #pragma unroll
+                    for (int jj = 0; jj < 4; ++jj) {
+                        elt_[mi * 2 + ii][4 * ni + jj] =
+                            encode_dropout(tmp[ii * 4 + jj] <= p_dropout_in_uint16_t, elt_[mi * 2 + ii][4 * ni + jj]);
+                    }
+                }
+            }
+        }
+    }
+
+    template <bool encode_dropout_in_sign_bit=false>
+    inline __device__ void apply_dropout_16bits(Philox &ph, uint16_t p_dropout_in_uint16_t,
+                                                unsigned long long philox_subsequence) {
+        // We encode the dropout pattern in the sign bit of the non-negative
+        // softmax to distinguish from pre-existing zeros
+        auto encode_dropout = [](bool keep, float val) {
+            return keep ? val : (encode_dropout_in_sign_bit ? -val : float(0));
+        };
+        static_assert(MMAS_M == 1);  // We're assuming 16x16 blocks.
+        #pragma unroll
+        for( int mi = 0; mi < MMAS_M; mi++ ) {
+            #pragma unroll
+            for( int ni = 0; ni < MMAS_N; ni++ ) {
+                uint16_t tmp[8];
+                // fmha::uint4_to_ushort8(ph(), tmp);
+                fmha::uint4_to_ushort8(ph(philox_subsequence + ni * Cta_tile::WARPS_N), tmp);
+                // uint4 tmp_32 = ph(philox_subsequence + ni * Cta_tile::WARPS_N);
+                // fmha::uint4_to_ushort8(tmp_32, tmp);
                 // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
-                //     printf("ni = %d, ph  Philox: %u, %u, %u, %u\n", ni, rnd.x, rnd.y, rnd.z, rnd.w);
+                //     printf("ni = %d, ph  Philox: %u, %u, %u, %u\n", ni, tmp_32.x, tmp_32.y, tmp_32.z, tmp_32.w);
                 // }
                 #pragma unroll
                 for (int ii = 0; ii < 2; ++ii) {
                     #pragma unroll
                     for (int jj = 0; jj < 4; ++jj) {
                         elt_[mi * 2 + ii][4 * ni + jj] =
-                            encode_dropout(rnd[ii * 4 + jj] <= p_dropout_in_uint16_t, elt_[mi * 2 + ii][4 * ni + jj]);
+                            encode_dropout(tmp[ii * 4 + jj] <= p_dropout_in_uint16_t, elt_[mi * 2 + ii][4 * ni + jj]);
                     }
                 }
             }
@@ -263,39 +351,70 @@ struct Softmax_base {
         };
         #pragma unroll
         for( int mi = 0; mi < MMAS_M; mi++ ) {
-            static_assert(MMAS_N % 2 == 0, "");
+            static_assert(MMAS_N % 2 == 0);
             #pragma unroll
             for( int ni = 0; ni < MMAS_N; ni += 2 ) {
-                uint4 random_uint4 = ph0();
-                uint16_t (&rnd0)[8] = reinterpret_cast<uint16_t (&)[8]>(random_uint4);
+                uint16_t tmp[8];
+                fmha::uint4_to_ushort8(ph0(), tmp);
                 // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
-                //     printf("ni = %d, ph  Philox: %u, %u, %u, %u\n", ni, rnd0.x, rnd0.y, rnd0.z, rnd0.w);
+                //     printf("ni = %d, ph  Philox: %u, %u, %u, %u\n", ni, tmp.x, tmp.y, tmp.z, tmp.w);
                 // }
                 #pragma unroll
                 for (int ii = 0; ii < 2; ++ii) {
                     #pragma unroll
                     for (int jj = 0; jj < 4; ++jj) {
                         elt_[mi * 2 + ii][4 * ni + jj] =
-                            encode_dropout(rnd0[ii * 4 + jj] <= p_dropout_in_uint16_t, elt_[mi * 2 + ii][4 * ni + jj]);
+                            encode_dropout(tmp[ii * 4 + jj] <= p_dropout_in_uint16_t, elt_[mi * 2 + ii][4 * ni + jj]);
                     }
                 }
-                random_uint4 = ph1();
-                uint16_t (&rnd1)[8] = reinterpret_cast<uint16_t (&)[8]>(random_uint4);
+                fmha::uint4_to_ushort8(ph1(), tmp);
                 // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) {
-                //     printf("ni = %d, ph  Philox: %u, %u, %u, %u\n", ni, rnd1.x, rnd1.y, rnd1.z, rnd1.w);
+                //     printf("ni = %d, ph  Philox: %u, %u, %u, %u\n", ni, tmp.x, tmp.y, tmp.z, tmp.w);
                 // }
                 #pragma unroll
                 for (int ii = 0; ii < 2; ++ii) {
                     #pragma unroll
                     for (int jj = 0; jj < 4; ++jj) {
                         elt_[mi * 2 + ii][4 * (ni + 1) + jj] =
-                            encode_dropout(rnd1[ii * 4 + jj] <= p_dropout_in_uint16_t, elt_[mi * 2 + ii][4 * (ni + 1) + jj]);
+                            encode_dropout(tmp[ii * 4 + jj] <= p_dropout_in_uint16_t, elt_[mi * 2 + ii][4 * (ni + 1) + jj]);
                     }
                 }
             }
         }
     }
 
+    // Scale all the elements.
+    inline __device__ void scale(const float (&sum)[MMAS_M * 2]) {
+        // Precompute the inverse sum to normalize. Without -use_fast_math, it makes a huge deal.
+        float inv_sum[MMAS_M * 2];
+        #pragma unroll
+        for( int mi = 0; mi < MMAS_M * 2; ++mi ) {
+            inv_sum[mi] = (sum[mi] == 0.f || sum[mi] != sum[mi]) ? 1.f : 1.f / sum[mi];
+        }
+
+        // Update the values.
+        #pragma unroll
+        for( int mi = 0; mi < MMAS_M * 2; ++mi ) {
+            #pragma unroll
+            for( int ni = 0; ni < MMAS_N * 4; ++ni ) {
+                elt_[mi][ni] *= inv_sum[mi];
+            }
+        }
+    }
+
+    // Subtract all elements by dp_sum
+    inline __device__ void subtract_dp_sum(const float (&dp_sum)[MMAS_M * 2]) {
+        #pragma unroll
+        for( int mi = 0; mi < MMAS_M * 2; ++mi ) {
+            #pragma unroll
+            for( int ni = 0; ni < MMAS_N * 4; ++ni ) {
+                elt_[mi][ni] -= dp_sum[mi];
+            }
+        }
+    }
+
+    // The pointer to the mask.
+    const char *packed_mask_ptr_;
     // Shared memory for the CTA-wide reduction.
     float *smem_, *smem_write_, *smem_read_;
     // The current thread index.
@@ -311,6 +430,10 @@ struct Softmax : public Softmax_base<Cta_tile, Kernel_traits> {
 
     // The base class.
     using Base = Softmax_base<Cta_tile, Kernel_traits>;
+    // The fragment.
+    using Fragment_a = fmha::Fragment_a<fmha::Row>;
+
+    static_assert(Fragment_a::NUM_REGS == 4);
 
     static constexpr int WARPS_M = Cta_tile::WARPS_M;
     static constexpr int WARPS_N = Cta_tile::WARPS_N;
@@ -318,53 +441,92 @@ struct Softmax : public Softmax_base<Cta_tile, Kernel_traits> {
     static constexpr int MMAS_M = Base::MMAS_M;
     static constexpr int MMAS_N = Base::MMAS_N;
 
+    // The accumulators.
+    using Accumulator = fmha::Fragment_accumulator;
+    using Accumulator_out = Fragment<uint16_t, 8>;
+    static_assert(Accumulator_out::NUM_REGS == 4);
+
+    static_assert(std::is_same<Accumulator::Data_type, float>::value);
+
     using Smem_tile_red = Smem_tile_reduce<Cta_tile, Kernel_traits>;
-    static_assert(Smem_tile_red::ELTS_PER_TILE == Cta_tile::M * WARPS_N, "");
+    static_assert(Smem_tile_red::ELTS_PER_TILE == Cta_tile::M * WARPS_N);
     // Ctor.
     template<typename Params>
     inline __device__ Softmax(const Params &params, void *smem, int tidx)
         : Base(params, smem, tidx)
+        , params_scale_bmm1_(params.scale_bmm1)
         , smem_sum_(static_cast<float*>(smem), tidx)
         , smem_max_(static_cast<float*>(smem) + Smem_tile_red::ELTS_PER_TILE, tidx) {
     }
 
     // Pack the data to a fragment for the next GEMM.
-    inline __device__ void pack_noconvert(cutlass::Array<float, MMAS_M * MMAS_N * 8> &frag) const {
+    template<typename elem_type=__half, int K, int M>
+    inline __device__ void pack(Fragment_a (&dst)[K][M]) const {
+        #pragma unroll
+        for( int mi = 0; mi < M; ++mi ) {
+            #pragma unroll
+            for( int ki = 0; ki < K; ++ki ) {
+
+                // 1st row - 4 elements per row.
+                float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0];
+                float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1];
+                float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2];
+                float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3];
+
+                // 2nd row - 4 elements per row.
+                float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0];
+                float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1];
+                float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2];
+                float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3];
+
+                // Pack to 4 registers.
+                dst[ki][mi].reg(0) = fmha::float2_pack<elem_type>(tmp_00, tmp_01);
+                dst[ki][mi].reg(1) = fmha::float2_pack<elem_type>(tmp_10, tmp_11);
+                dst[ki][mi].reg(2) = fmha::float2_pack<elem_type>(tmp_02, tmp_03);
+                dst[ki][mi].reg(3) = fmha::float2_pack<elem_type>(tmp_12, tmp_13);
+            }
+        }
+    }
+
+    // Scale FP32 fragments
+    inline __device__ void unpack(const Accumulator (&acc)[MMAS_M][MMAS_N]) {
+        const float scalef = reinterpret_cast<const float &>(this->params_scale_bmm1_);
+
         #pragma unroll
         for( int mi = 0; mi < MMAS_M; ++mi ) {
             #pragma unroll
-            for( int ki = 0; ki < MMAS_N; ++ki ) {
+            for( int ni = 0; ni < MMAS_N; ++ni ) {
                 // 1st row - 4 elements per row.
-                frag[ki * MMAS_M * 8 + mi * 8 + 0] = this->elt_[2 * mi + 0][4 * ki + 0];
-                frag[ki * MMAS_M * 8 + mi * 8 + 1] = this->elt_[2 * mi + 0][4 * ki + 1];
-                frag[ki * MMAS_M * 8 + mi * 8 + 4] = this->elt_[2 * mi + 0][4 * ki + 2];
-                frag[ki * MMAS_M * 8 + mi * 8 + 5] = this->elt_[2 * mi + 0][4 * ki + 3];
+                this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0) * scalef;
+                this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1) * scalef;
+                this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4) * scalef;
+                this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5) * scalef;
                 // 2nd row - 4 elements per row.
-                frag[ki * MMAS_M * 8 + mi * 8 + 2] = this->elt_[2 * mi + 1][4 * ki + 0];
-                frag[ki * MMAS_M * 8 + mi * 8 + 3] = this->elt_[2 * mi + 1][4 * ki + 1];
-                frag[ki * MMAS_M * 8 + mi * 8 + 6] = this->elt_[2 * mi + 1][4 * ki + 2];
-                frag[ki * MMAS_M * 8 + mi * 8 + 7] = this->elt_[2 * mi + 1][4 * ki + 3];
+                this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2) * scalef;
+                this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3) * scalef;
+                this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6) * scalef;
+                this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7) * scalef;
             }
         }
     }
 
-    template <typename FragmentC>
-    inline __device__ void unpack_noscale(const FragmentC (&acc)) {
-        static_assert(FragmentC::kElements == MMAS_M * MMAS_N * 8, "");
+    // Scale FP32 fragments
+    inline __device__ void unpack_noscale(const Accumulator (&acc)[MMAS_M][MMAS_N]) {
+
         #pragma unroll
         for( int mi = 0; mi < MMAS_M; ++mi ) {
             #pragma unroll
             for( int ni = 0; ni < MMAS_N; ++ni ) {
                 // 1st row - 4 elements per row.
-                this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi * MMAS_N * 8 + ni * 8 + 0];
-                this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi * MMAS_N * 8 + ni * 8 + 1];
-                this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi * MMAS_N * 8 + ni * 8 + 4];
-                this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi * MMAS_N * 8 + ni * 8 + 5];
+                this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0);
+                this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1);
+                this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4);
+                this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5);
                 // 2nd row - 4 elements per row.
-                this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi * MMAS_N * 8 + ni * 8 + 2];
-                this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi * MMAS_N * 8 + ni * 8 + 3];
-                this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi * MMAS_N * 8 + ni * 8 + 6];
-                this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi * MMAS_N * 8 + ni * 8 + 7];
+                this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2);
+                this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3);
+                this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6);
+                this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7);
             }
         }
     }
@@ -437,6 +599,7 @@ struct Softmax : public Softmax_base<Cta_tile, Kernel_traits> {
         reduce_after_sync_(frag, rows, max, smem_max_);
     }
 
+    const uint32_t params_scale_bmm1_;
     Smem_tile_red smem_max_;
     Smem_tile_red smem_sum_;
 };
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/static_switch.h b/aten/src/ATen/native/transformers/cuda/flash_attn/static_switch.h
index 7920ac045d0a..53bcf35d6936 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/static_switch.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/static_switch.h
@@ -1,5 +1,6 @@
 // Inspired by https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h
 // and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h
+// and https://github.com/facebookresearch/xformers/blob/main/xformers/csrc/attention/cuda/fmha/gemm_kernel_utils.h#L8
 
 #pragma once
 
@@ -9,17 +10,31 @@
 ///
 /// Usage:
 /// ```
-/// BOOL_SWITCH(flag, BoolConst, [&] {
+/// BOOL_SWITCH(flag, BoolConst, ([&] {
 ///     some_function<BoolConst>(...);
-/// });
+/// }));
 /// ```
-#define BOOL_SWITCH(COND, CONST_NAME, ...)                                           \
-    [&] {                                                                            \
-        if (COND) {                                                                  \
-            constexpr bool CONST_NAME = true;                                        \
-            return __VA_ARGS__();                                                    \
-        } else {                                                                     \
-            constexpr bool CONST_NAME = false;                                       \
-            return __VA_ARGS__();                                                    \
-        }                                                                            \
-    }()
+/// We need "({" and "})" to make sure that the code is a single argument being passed to the macro.
+#define BOOL_SWITCH(COND, CONST_NAME, F)       \
+    {                                          \
+        if (COND) {                            \
+            constexpr bool CONST_NAME = true;  \
+            F();                               \
+        } else {                               \
+            constexpr bool CONST_NAME = false; \
+            F();                               \
+        }                                      \
+    }
+
+// modified from BOOL_SWITCH
+// because MSVC cannot handle std::conditional with constexpr variable
+#define FP16_SWITCH(COND, F)                 \
+    {                                        \
+        if (COND) {                          \
+            using elem_type = __nv_bfloat16; \
+            F();                             \
+        } else {                             \
+            using elem_type = __half;        \
+            F();                             \
+        }                                    \
+    }
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/summary_stats.h b/aten/src/ATen/native/transformers/cuda/flash_attn/summary_stats.h
deleted file mode 100644
index a3abda34b4e4..000000000000
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/summary_stats.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2022, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-namespace fmha {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<int kRows, int kRowsPerMma, int kWarpCountM>
-struct Smem_tile_softmax_lse {
-
-    static constexpr int kMmaM = (kRows / kWarpCountM) / kRowsPerMma;
-    static_assert(kMmaM * kRowsPerMma * kWarpCountM == kRows, "");
-    // static_assert(kWarpCountM == 1);
-    // Otherwise we might need to check warp_idx / kWarpCountM == 0 instead of just warp_idx == 0
-
-    // The size of one buffer in bytes in shared memory.
-    static constexpr size_t BYTES_PER_TILE = kRows * sizeof(float);
-
-    inline __device__ Smem_tile_softmax_lse(float *smem) : smem_(smem) {
-    }
-
-    inline __device__ void store_pair(const float (&sum)[kMmaM * 2]) {
-        // Broadcast the warp_id computed by lane 0 to ensure dependent code
-        // is compiled as warp-uniform.
-        // This makes a difference of 50us for BERT.
-        // const int warp_idx = threadIdx.x / 32;
-        const int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
-        const int lane_idx =  threadIdx.x % 32;
-        const int warp_n = warp_idx / kWarpCountM;
-        // Extract the position in the warp.
-        const int row = lane_idx / 4;
-        if ((lane_idx % 4 == 0) && (warp_n == 0)) {
-            #pragma unroll
-            for (int mi = 0; mi < kMmaM; ++mi) {
-                smem_[mi * kRowsPerMma + row + 0] = sum[mi * 2 + 0];
-                smem_[mi * kRowsPerMma + row + 8] = sum[mi * 2 + 1];
-            }
-        }
-    }
-
-    template<int N>
-    inline __device__ void load(float (&sum)[N], const int (&row)[N]) {
-        #pragma unroll
-        for( int ni = 0; ni < N; ni++ ) {
-            sum[ni] = smem_[row[ni]];
-        }
-    }
-
-    float * const smem_;
-};
-
-}  // namespace fmha
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/utils.h b/aten/src/ATen/native/transformers/cuda/flash_attn/utils.h
index 7caa29f20869..dca0ac150d46 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/utils.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/utils.h
@@ -32,7 +32,8 @@
 #include <cstdlib>
 
 #include <ATen/cuda/CUDAContext.h>
-// #include <cuda_fp16.h>
+
+#include <cuda_fp16.h>
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
 #include <cuda_bf16.h>
@@ -51,6 +52,66 @@ struct Col {};
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+template< int M, bool = (M & (M-1)) == 0 >
+struct Next_power_of_two {
+};
+
+template< int M >
+struct Next_power_of_two<  M, true > { enum { VALUE =   M }; };
+template<>
+struct Next_power_of_two<  3, false> { enum { VALUE =   4 }; };
+template<>
+struct Next_power_of_two<  5, false> { enum { VALUE =   8 }; };
+template<>
+struct Next_power_of_two<  6, false> { enum { VALUE =   8 }; };
+template<>
+struct Next_power_of_two<  7, false> { enum { VALUE =   8 }; };
+template<>
+struct Next_power_of_two<  9, false> { enum { VALUE =  16 }; };
+template<>
+struct Next_power_of_two< 10, false> { enum { VALUE =  16 }; };
+template<>
+struct Next_power_of_two< 11, false> { enum { VALUE =  16 }; };
+template<>
+struct Next_power_of_two< 12, false> { enum { VALUE =  16 }; };
+template<>
+struct Next_power_of_two< 13, false> { enum { VALUE =  16 }; };
+template<>
+struct Next_power_of_two< 14, false> { enum { VALUE =  16 }; };
+template<>
+struct Next_power_of_two< 15, false> { enum { VALUE =  16 }; };
+template<>
+struct Next_power_of_two< 24, false> { enum { VALUE =  32 }; };
+template<>
+struct Next_power_of_two< 48, false> { enum { VALUE =  64 }; };
+template<>
+struct Next_power_of_two< 80, false> { enum { VALUE = 128 }; };
+template<>
+struct Next_power_of_two< 96, false> { enum { VALUE = 128 }; };
+template<>
+struct Next_power_of_two<112, false> { enum { VALUE = 128 }; };
+template<>
+struct Next_power_of_two<144, false> { enum { VALUE = 256 }; };
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N, bool = (N & (N-1)) == 0 >
+struct Prev_power_of_two {
+};
+
+template< int N >
+struct Prev_power_of_two< N, true > { enum { VALUE = N }; };
+template<>
+struct Prev_power_of_two< 3, false> { enum { VALUE = 2 }; };
+template<>
+struct Prev_power_of_two< 5, false> { enum { VALUE = 4 }; };
+template<>
+struct Prev_power_of_two< 6, false> { enum { VALUE = 4 }; };
+template<>
+struct Prev_power_of_two< 7, false> { enum { VALUE = 4 }; };
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 template< int M, int N >
 struct Div_up {
     enum { VALUE = (M + N-1) / N };
@@ -126,6 +187,49 @@ struct Uint_from_size_in_bytes<16> {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+template< int WARPS_M, int WARPS_N, int WARPS_K >
+struct Warp_masks {
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<>
+struct Warp_masks<8, 1, 1> { enum { M = 0xe0, N = 0x00, K = 0x00 }; };
+template<>
+struct Warp_masks<4, 2, 1> { enum { M = 0x60, N = 0x80, K = 0x00 }; };
+template<>
+struct Warp_masks<4, 1, 2> { enum { M = 0x60, N = 0x00, K = 0x80 }; };
+template<>
+struct Warp_masks<4, 1, 1> { enum { M = 0x60, N = 0x00, K = 0x00 }; };
+template<>
+struct Warp_masks<2, 4, 1> { enum { M = 0x20, N = 0xc0, K = 0x00 }; };
+template<>
+struct Warp_masks<2, 2, 2> { enum { M = 0x20, N = 0x40, K = 0x80 }; };
+template<>
+struct Warp_masks<2, 2, 1> { enum { M = 0x20, N = 0x40, K = 0x00 }; };
+template<>
+struct Warp_masks<2, 1, 2> { enum { M = 0x20, N = 0x00, K = 0x40 }; };
+template<>
+struct Warp_masks<2, 1, 1> { enum { M = 0x20, N = 0x00, K = 0x00 }; };
+template<>
+struct Warp_masks<1, 8, 1> { enum { M = 0x00, N = 0xe0, K = 0x00 }; };
+template<>
+struct Warp_masks<1, 4, 2> { enum { M = 0x00, N = 0x60, K = 0x80 }; };
+template<>
+struct Warp_masks<1, 4, 1> { enum { M = 0x00, N = 0x60, K = 0x00 }; };
+template<>
+struct Warp_masks<1, 2, 2> { enum { M = 0x00, N = 0x20, K = 0x40 }; };
+template<>
+struct Warp_masks<1, 2, 1> { enum { M = 0x00, N = 0x20, K = 0x00 }; };
+template<>
+struct Warp_masks<1, 1, 4> { enum { M = 0x00, N = 0x00, K = 0x60 }; };
+template<>
+struct Warp_masks<1, 1, 2> { enum { M = 0x00, N = 0x00, K = 0x20 }; };
+template<>
+struct Warp_masks<1, 1, 1> { enum { M = 0x00, N = 0x00, K = 0x00 }; };
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 template< typename T >
 inline __device__ __host__ T div_up(T m, T n) {
     return (m + n-1) / n;
@@ -133,24 +237,124 @@ inline __device__ __host__ T div_up(T m, T n) {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+inline int clz(int x) {
+    for( int i = 31; i >= 0; --i ) {
+        if( (1 << i) & x ) {
+            return 31 - i;
+        }
+    }
+    return 32;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline int find_log_2(int x, bool round_up = false) {
+    int a = 31 - clz(x);
+    if( round_up ) {
+        a += (x & (x-1)) ? 1 : 0;
+    }
+    return a;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint32_t hadd2(uint32_t a, uint32_t b) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >=530
+        uint32_t c;
+        asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+        return c;
+    #else
+        assert(false);
+        return 0;
+    #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint32_t hmin2(uint32_t a, uint32_t b) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >=530
+        uint32_t c;
+        asm volatile("min.f16x2 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b));
+        return c;
+    #else
+        assert(false);
+        return 0;
+    #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint32_t hmul2(const uint32_t a, const uint32_t b) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >=530
+        // uint32_t c;
+        // asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+        // return c;
+        __half2 result = __hmul2(reinterpret_cast<const __half2 (&)>(a),
+                                reinterpret_cast<const __half2 (&)>(b));
+        return reinterpret_cast<uint32_t(&)>(result);
+    #else
+        assert(false);
+        return 0;
+    #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// static inline __device__ uint2 hmul4(uint2 a, uint2 b) {
+//     uint2 c;
+//     c.x = hmul2(a.x, b.x);
+//     c.y = hmul2(a.y, b.y);
+//     return c;
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// static inline __device__ uint4 hmul8(uint4 a, uint4 b) {
+//     uint4 c;
+//     c.x = hmul2(a.x, b.x);
+//     c.y = hmul2(a.y, b.y);
+//     c.z = hmul2(a.z, b.z);
+//     c.w = hmul2(a.w, b.w);
+//     return c;
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// static inline __device__ uint4 hmul8(uint32_t a, uint4 b) {
+//     uint4 c;
+//     c.x = hmul2(a, b.x);
+//     c.y = hmul2(a, b.y);
+//     c.z = hmul2(a, b.z);
+//     c.w = hmul2(a, b.w);
+//     return c;
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 template<typename T>
 inline __device__ uint32_t hrelu2(uint32_t x);
 
 template<>
 inline __device__ uint32_t hrelu2<__half>(uint32_t x) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
     uint32_t res;
     const uint32_t zero = 0u;
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
     asm volatile( "max.f16x2 %0, %1, %2;\n" : "=r"(res) : "r"(x), "r"(zero));
-#else
+    return res;
+#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+    uint32_t res;
+    const uint32_t zero = 0u;
     asm volatile( \
         "{\n" \
         "\t .reg .f16x2 sela;\n" \
         "\t set.gtu.u32.f16x2 sela, %1, %2;\n" \
         "\t and.b32 %0, sela, %1;\n"
         "}\n" : "=r"(res) : "r"(x), "r"(zero));
-#endif
     return res;
+#else
+    assert(false);
+    return 0;
+#endif
 }
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
@@ -165,14 +369,56 @@ inline __device__ uint32_t hrelu2<__nv_bfloat16>(uint32_t x) {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
-static inline __device__ uint16_t float_to_half(float f) {
-    uint16_t h;
-    asm volatile("cvt.rn.f16.f32 %0, %1;" : "=h"(h) : "f"(f));
-    return h;
+static inline __device__ uint32_t habs2(uint32_t x) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+        uint32_t res;
+        asm volatile( "abs.f16x2 %0, %1;\n" : "=r"(res) : "r"(x));
+        return res;
+    #else
+        assert(false);
+        return 0;
+    #endif
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+template< typename T >
+static inline __device__ T clamp(T x, T lb, T ub) {
+    return x < lb ? lb : (x > ub ? ub : x);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint16_t clamp_to_zero(uint16_t x) {
+    uint16_t mask;
+    asm volatile("set.gtu %0, %1, 0;" : "=h"(mask) : "h"(x));
+    return mask & x;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// static inline __device__ uint16_t float_to_half(float f) {
+//     uint16_t h;
+//     asm volatile("cvt.rn.f16.f32 %0, %1;" : "=h"(h) : "f"(f));
+//     return h;
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// static inline __device__ uint32_t float2_to_half2(float a, float b) {
+//     uint32_t c;
+// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+//     asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(c) : "f"(b), "f"(a));
+// #else
+//     uint16_t lo = float_to_half(a);
+//     uint16_t hi = float_to_half(b);
+//     asm volatile("mov.b32 %0, {%1, %2};\n" : "=r"(c) : "h"(lo), "h"(hi));
+// #endif
+//     return c;
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 template<typename T>
 inline __device__ uint32_t float2_pack(float a, float b);
 
@@ -192,6 +438,27 @@ inline __device__ uint32_t float2_pack<__nv_bfloat16>(float a, float b) {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+// static inline __device__ uint32_t float_to_half2(float a) {
+//     return float2_to_half2(a,a);
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// static inline __device__ uint32_t float2_to_half2(const float2 &f) {
+//     return float2_to_half2(f.x, f.y);
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// static inline __device__ uint2 float4_to_half4(float x, float y, float z, float w) {
+//     uint2 d;
+//     d.x = float2_to_half2(x, y);
+//     d.y = float2_to_half2(z, w);
+//     return d;
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 template<typename T>
 inline __device__ uint2 float4_pack(float x, float y, float z, float w) {
     uint2 d;
@@ -202,6 +469,121 @@ inline __device__ uint2 float4_pack(float x, float y, float z, float w) {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+static inline __device__ uint32_t hfma2(uint32_t a, uint32_t b, uint32_t c) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >=530
+        uint32_t d;
+        asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
+        return d;
+    #else
+        assert(false);
+        return 0;
+    #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint32_t hfma2_relu(uint32_t a, uint32_t b, uint32_t c) {
+    uint32_t d;
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+    asm volatile("fma.rn.f16x2.relu %0, %1, %2, %3;" : "=r"(d) : "r"(a), "r"(b), "r"(c));
+#else
+    d = hrelu2<__half>(hfma2(a, b, c));
+#endif
+    return d;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint32_t h0_h0(uint32_t x) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >=530
+        uint32_t y;
+        asm volatile("{.reg .f16 lo, hi; mov.b32 {lo, hi}, %1; mov.b32 %0, {lo, lo};}\n"
+            : "=r"(y) : "r"(x));
+        return y;
+    #else
+        assert(false);
+        return 0;
+    #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ float h0_to_float(uint32_t h2) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >=530
+        float f;
+        asm volatile("{\n" \
+            ".reg .f16 lo, hi;\n" \
+            "mov.b32 {lo, hi}, %1;\n" \
+            "cvt.f32.f16 %0, lo;\n" \
+            "}\n" : "=f"(f) : "r"(h2));
+        return f;
+    #else
+        assert(false);
+        return 0;
+    #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint32_t h1_h1(uint32_t x) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >=530
+        uint32_t y;
+        asm volatile("{.reg .f16 lo, hi; mov.b32 {lo, hi}, %1; mov.b32 %0, {hi, hi};}\n"
+            : "=r"(y) : "r"(x));
+        return y;
+    #else
+        assert(false);
+        return 0;
+    #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint16_t hadd(uint16_t a, uint16_t b) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >=530
+        uint16_t d;
+        asm volatile("add.f16 %0, %1, %2;" : "=h"(d) : "h"(a), "h"(b));
+        return d;
+    #else
+        assert(false);
+        return 0;
+    #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint32_t hadd(uint32_t a, uint32_t b) {
+    return hadd2(a, b);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint2 hadd4(uint2 a, uint2 b) {
+    uint2 c;
+    c.x = hadd2(a.x, b.x);
+    c.y = hadd2(a.y, b.y);
+    return c;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint2 hadd(uint2 a, uint2 b) {
+    return hadd4(a, b);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint4 hadd8(uint4 a, uint4 b) {
+    uint4 c;
+    c.x = hadd2(a.x, b.x);
+    c.y = hadd2(a.y, b.y);
+    c.z = hadd2(a.z, b.z);
+    c.w = hadd2(a.w, b.w);
+    return c;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 template<typename T>
 inline __device__ float2 half2_unpack(uint32_t a);
 
@@ -219,7 +601,7 @@ inline __device__ float2 half2_unpack<__nv_bfloat16>(uint32_t a) {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
-// Convert two half2's or bf162's into float, then take their dot product.
+// Converted two half2's or bf162's into float, then take their dot product.
 template <typename T>
 inline __device__ float hfma2_to_float(const uint32_t a, const uint32_t b) {
     float2 af = fmha::half2_unpack<T>(a);
@@ -240,6 +622,217 @@ inline __device__ float hmulsum8(const uint4 a, const uint4 b) {
     return sum;
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint4 fadd4(uint4 a, uint4 b) {
+    float4 c;
+    c.x = reinterpret_cast<const float&>(a.x) + reinterpret_cast<const float&>(b.x);
+    c.y = reinterpret_cast<const float&>(a.y) + reinterpret_cast<const float&>(b.y);
+    c.z = reinterpret_cast<const float&>(a.z) + reinterpret_cast<const float&>(b.z);
+    c.w = reinterpret_cast<const float&>(a.w) + reinterpret_cast<const float&>(b.w);
+    return reinterpret_cast<const uint4&>(c);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint4 fmul4(uint4 a, float b) {
+    float4 c;
+    c.x = reinterpret_cast<const float &>(a.x) * b;
+    c.y = reinterpret_cast<const float &>(a.y) * b;
+    c.z = reinterpret_cast<const float &>(a.z) * b;
+    c.w = reinterpret_cast<const float &>(a.w) * b;
+    return reinterpret_cast<const uint4 &>(c);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint4 hadd(uint4 a, uint4 b) {
+    return hadd8(a, b);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ float half_to_float(uint16_t h) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >=530
+        float f;
+        asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
+        return f;
+    #else
+        assert(false);
+        return 0;
+    #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// static inline __device__ float2 half2_to_float2(uint32_t x) {
+//     uint16_t lo, hi;
+//     asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(x));
+//     return make_float2(half_to_float(lo), half_to_float(hi));
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// static inline __device__ void half2_to_float2(float &x, float &y, uint32_t h) {
+//     float2 tmp = half2_to_float2(h);
+//     x = tmp.x;
+//     y = tmp.y;
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint16_t hfma(uint16_t a, uint16_t b, uint16_t c) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >=530
+        uint16_t d;
+        asm volatile("fma.rn.f16 %0, %1, %2, %3;" : "=h"(d) : "h"(a), "h"(b), "h"(c));
+        return d;
+    #else
+        assert(false);
+        return 0;
+    #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ uint16_t hmul(uint16_t a, uint16_t b) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >=530
+        uint16_t d;
+        asm volatile("mul.f16 %0, %1, %2;" : "=h"(d) : "h"(a), "h"(b));
+        return d;
+    #else
+        assert(false);
+        return 0;
+    #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ void uint4_to_ushort8(const uint4 a, uint16_t (&b)[8]) {
+    uint32_t *b_tmp = reinterpret_cast<uint32_t *>(&b[0]);
+    b_tmp[0] = a.x;
+    b_tmp[1] = a.y;
+    b_tmp[2] = a.z;
+    b_tmp[3] = a.w;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static inline __device__ float sigmoid(float x) {
+    return 1.f / (1.f + expf(-x));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void clear(uint16_t &dst) {
+    dst = uint16_t(0);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void clear(uint32_t &dst) {
+    dst = 0u;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void clear(uint2 &dst) {
+    dst = make_uint2(0u, 0u);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void clear(uint4 &dst) {
+    dst = make_uint4(0u, 0u, 0u, 0u);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// P R E D I C A T E   P A C K I N G
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+enum { BYTES_PER_REG = 4, PREDS_PER_BYTE = 4, PREDS_PER_REG = BYTES_PER_REG * PREDS_PER_BYTE };
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// G E N E R I C   P R E D I C A T E D   L D G S T S
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N, int M, typename Functor >
+inline __device__ void load_(Functor &fct, const uint32_t (&preds)[M]) {
+
+    // The number of complete bytes (where we use all the predicates in a byte).
+    enum { COMPLETE = N / PREDS_PER_BYTE };
+    // Make sure we did allocate enough predicates.
+    static_assert(Div_up<COMPLETE, BYTES_PER_REG>::VALUE <= M, "");
+    // The remainder.
+    enum { REMAINDER = N - COMPLETE * PREDS_PER_BYTE };
+    // Make sure we got the math right and the remainder is between 0 and 3.
+    static_assert(REMAINDER >= 0 && REMAINDER <= 3, "");
+    // The mask to extract the predicates.
+    enum { COMPLETE_MASK = (1 << PREDS_PER_BYTE) - 1 };
+
+    // Clear the fetch registers.
+    #pragma unroll
+    for( int ii = 0; ii < N; ++ii ) {
+        fct.clear(ii);
+    }
+
+    // Run complete steps.
+    bool p[PREDS_PER_BYTE];
+    #pragma unroll
+    for( int ii = 0; ii < COMPLETE; ++ii ) {
+
+        // The predicate.
+        uint32_t reg = preds[ii / BYTES_PER_REG];
+
+        // Extract the predicates.
+        #pragma unroll
+        for( int jj = 0; jj < PREDS_PER_BYTE; ++jj ) {
+            uint32_t mask = 1u << (ii % BYTES_PER_REG * 8 + jj);
+            p[jj] = (reg & mask) != 0u;
+        }
+
+        // Issue the loads.
+        #pragma unroll
+        for( int jj = 0; jj < PREDS_PER_BYTE; ++jj ) {
+            fct.load(ii * PREDS_PER_BYTE + jj, p[jj]);
+        }
+    }
+
+    // Skip the rest of the code if we do not have a remainder.
+    if( REMAINDER > 0 ) {
+
+        // The mask to extract the predicates.
+        enum { REMAINDER_MASK = (1 << REMAINDER) - 1 };
+
+        // The predicate register.
+        uint32_t reg = preds[COMPLETE / BYTES_PER_REG];
+
+        // Extract the predicates.
+        #pragma unroll
+        for( int jj = 0; jj < PREDS_PER_BYTE; ++jj ) {
+            uint32_t mask = 1u << (COMPLETE % BYTES_PER_REG * 8 + jj);
+            p[jj] = (reg & mask) != 0u;
+        }
+
+        // Issue the loads.
+        #pragma unroll
+        for( int ii = 0; ii < REMAINDER; ++ii ) {
+            fct.load(COMPLETE * PREDS_PER_BYTE + ii, p[ii]);
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int M, typename Functor >
+inline __device__ void load_(Functor &fct, uint32_t preds) {
+    uint32_t tmp[1] = { preds };
+    load_<M>(fct, tmp);
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 //
 // L D G
@@ -274,6 +867,167 @@ inline __device__ void ldg(uint4 &dst, const void *ptr) {
     dst = *reinterpret_cast<const uint4*>(ptr);
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< typename Data_type, int N >
+struct Ldg_functor {
+    // Ctor.
+    inline __device__ Ldg_functor(Data_type (&fetch)[N], const void* (&ptrs)[N])
+        : fetch_(fetch), ptrs_(ptrs) {
+    }
+
+    // Clear the element.
+    inline __device__ void clear(int ii) {
+        fmha::clear(fetch_[ii]);
+    }
+
+    // Trigger the loads.
+    inline __device__ void load(int ii, bool p) {
+        if( p ) {
+            ldg(fetch_[ii], ptrs_[ii]);
+        }
+    }
+
+    // The fetch registers.
+    Data_type (&fetch_)[N];
+    // The pointers.
+    const void* (&ptrs_)[N];
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< typename Data_type, int N, int M >
+inline __device__ void ldg_(Data_type (&fetch)[N], const void* (&ptrs)[N], uint32_t (&preds)[M]) {
+    Ldg_functor<Data_type, N> fct(fetch, ptrs);
+    load_<N>(fct, preds);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N, int M >
+inline __device__ void ldg(uint8_t (&fetch)[N], const void* (&ptrs)[N], uint32_t (&preds)[M]) {
+    ldg_<uint8_t, N>(fetch, ptrs, preds);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N, int M >
+inline __device__ void ldg(uint16_t (&fetch)[N], const void* (&ptrs)[N], uint32_t (&preds)[M]) {
+    ldg_<uint16_t, N>(fetch, ptrs, preds);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N, int M >
+inline __device__ void ldg(uint32_t (&fetch)[N], const void* (&ptrs)[N], uint32_t (&preds)[M]) {
+    ldg_<uint32_t, N>(fetch, ptrs, preds);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N, int M >
+inline __device__ void ldg(uint2 (&fetch)[N], const void* (&ptrs)[N], uint32_t (&preds)[M]) {
+    ldg_<uint2, N>(fetch, ptrs, preds);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N, int M >
+inline __device__ void ldg(uint4 (&fetch)[N], const void* (&ptrs)[N], uint32_t (&preds)[M]) {
+    ldg_<uint4, N>(fetch, ptrs, preds);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// L D S
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void lds(uint16_t &dst, uint32_t ptr) {
+    asm volatile("ld.shared.b16 %0, [%1];\n" : "=h"(dst) : "r"(ptr));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void lds(uint32_t &dst, uint32_t ptr) {
+    asm volatile("ld.shared.b32 %0, [%1];\n" : "=r"(dst) : "r"(ptr));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void lds(uint2 &dst, uint32_t ptr) {
+    asm volatile("ld.shared.v2.b32 {%0, %1}, [%2];\n" : "=r"(dst.x), "=r"(dst.y) : "r"(ptr));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void lds(uint4 &dst, uint32_t ptr) {
+    asm volatile("ld.shared.v4.b32 {%0, %1, %2, %3}, [%4];\n"
+        : "=r"(dst.x)
+        , "=r"(dst.y)
+        , "=r"(dst.z)
+        , "=r"(dst.w)
+        :  "r"(ptr));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// L D S M
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void ldsm(uint32_t &dst, uint32_t ptr) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730
+    asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
+        : "=r"(dst) : "r"(ptr));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void ldsmt(uint32_t &dst, uint32_t ptr) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730
+    asm volatile("ldmatrix.sync.aligned.m8n8.x1.trans.shared.b16 {%0}, [%1];\n"
+        : "=r"(dst) : "r"(ptr));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void ldsm(uint2 &dst, uint32_t ptr) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730
+    asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0, %1}, [%2];\n"
+        : "=r"(dst.x), "=r"(dst.y) : "r"(ptr));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void ldsmt(uint2 &dst, uint32_t ptr) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730
+    asm volatile("ldmatrix.sync.aligned.m8n8.x2.trans.shared.b16 {%0, %1}, [%2];\n"
+        : "=r"(dst.x), "=r"(dst.y) : "r"(ptr));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void ldsm(uint4 &dst, uint32_t ptr) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730
+    asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2, %3}, [%4];\n"
+        : "=r"(dst.x), "=r"(dst.y), "=r"(dst.z), "=r"(dst.w) : "r"(ptr));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void ldsmt(uint4 &dst, uint32_t ptr) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730
+    asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0, %1, %2, %3}, [%4];\n"
+        : "=r"(dst.x), "=r"(dst.y), "=r"(dst.z), "=r"(dst.w) : "r"(ptr));
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 //
 // S T G
@@ -308,6 +1062,82 @@ inline __device__ void stg(void *ptr, uint4 val) {
     *reinterpret_cast<uint4*>(ptr) = val;
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// S T S
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void sts(uint32_t ptr, uint16_t val) {
+    asm volatile("st.shared.b16 [%0], %1;\n" : : "r"(ptr), "h"(val));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void sts(uint32_t ptr, uint32_t val) {
+    asm volatile("st.shared.b32 [%0], %1;\n" : : "r"(ptr), "r"(val));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void sts(uint32_t ptr, uint2 val) {
+    asm volatile("st.shared.v2.b32 [%0], {%1, %2};\n"
+        :
+        : "r"(ptr)
+        , "r"(val.x)
+        , "r"(val.y));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void sts(uint32_t ptr, uint4 val) {
+    asm volatile("st.shared.v4.b32 [%0], {%1, %2, %3, %4};\n"
+        :
+        : "r"(ptr)
+        , "r"(val.x)
+        , "r"(val.y)
+        , "r"(val.z)
+        , "r"(val.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< typename Data_type, int N >
+inline __device__ void sts_(uint32_t (&ptrs)[N], const Data_type (&data)[N]) {
+    #pragma unroll
+    for( int ii = 0; ii < N; ++ii ) {
+        sts(ptrs[ii], data[ii]);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N >
+inline __device__ void sts(uint32_t (&ptrs)[N], const uint16_t (&data)[N]) {
+    sts_<uint16_t, N>(ptrs, data);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N >
+inline __device__ void sts(uint32_t (&ptrs)[N], const uint32_t (&data)[N]) {
+    sts_<uint32_t, N>(ptrs, data);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N >
+inline __device__ void sts(uint32_t (&ptrs)[N], const uint2 (&data)[N]) {
+    sts_<uint2, N>(ptrs, data);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template< int N >
+inline __device__ void sts(uint32_t (&ptrs)[N], const uint4 (&data)[N]) {
+    sts_<uint4, N>(ptrs, data);
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 template<typename T>
@@ -332,7 +1162,7 @@ __device__ inline T operator()(T const & x, T const & y) { return x + y; }
 
 template<int THREADS>
 struct Allreduce {
-    static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4, "");
+    static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4);
     template<typename T, typename Operator>
     static __device__ inline T run(T x, Operator &op) {
         constexpr int OFFSET = THREADS / 2;
@@ -366,6 +1196,18 @@ __device__ inline void  quad_reduce(float (&dst)[M], float (&src)[M], Operator &
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+// template<typename Operator, int M>
+// __device__ inline void  quad_reduce(__half2 (&dst)[M], __half2 (&src)[M], Operator &op) {
+//     #pragma unroll
+//     for(int mi=0; mi < M; mi++){
+//         dst[mi] = src[mi];
+//         dst[mi] = op(dst[mi], __shfl_down_sync(uint32_t(-1), dst[mi], 2));
+//         dst[mi] = op(dst[mi], __shfl_down_sync(uint32_t(-1), dst[mi], 1));
+//     }
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 template<typename Operator, int M>
 __device__ inline void quad_reduce(float (&dst)[M], float2 (&src)[M], Operator &op) {
     float tmp[M];
@@ -378,6 +1220,19 @@ __device__ inline void quad_reduce(float (&dst)[M], float2 (&src)[M], Operator &
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+// template<typename Operator, int M>
+// __device__ inline void quad_reduce(__half2 (&dst)[M], float2 (&src)[M], Operator &op) {
+//     __half2 tmp[M];
+//     #pragma unroll
+//     for(int mi=0; mi < M; mi++){
+//         tmp[mi] = op(reinterpret_cast<const __half2 &>(src[mi].x),
+//                      reinterpret_cast<const __half2 &>(src[mi].y));
+//     }
+//     quad_reduce(dst, tmp, op);
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 template<typename Operator, int M>
 __device__ inline void quad_allreduce(float (&dst)[M], float (&src)[M], Operator &op) {
     #pragma unroll
@@ -389,6 +1244,17 @@ __device__ inline void quad_allreduce(float (&dst)[M], float (&src)[M], Operator
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+// template<typename Operator, int M>
+// __device__ inline void quad_allreduce(__half2 (&dst)[M], __half2 (&src)[M], Operator &op) {
+//     #pragma unroll
+//     for(int mi=0; mi < M; mi++){
+//         dst[mi] = src[mi];
+//         dst[mi] = Allreduce<4>::run(dst[mi], op);
+//     }
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 template<typename Operator, int M>
 __device__ inline void quad_allreduce(float (&dst)[M], float2 (&src)[M], Operator &op) {
     float tmp[M];
@@ -401,4 +1267,17 @@ __device__ inline void quad_allreduce(float (&dst)[M], float2 (&src)[M], Operato
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+// template<typename Operator, int M>
+// __device__ inline void quad_allreduce(__half2 (&dst)[M], float2 (&src)[M], Operator &op) {
+//     __half2 tmp[M];
+//     #pragma unroll
+//     for(int mi=0; mi < M; mi++){
+//         tmp[mi] = op(reinterpret_cast<const __half2 &>(src[mi].x),
+//                      reinterpret_cast<const __half2 &>(src[mi].y));
+//     }
+//     quad_allreduce(dst, tmp, op);
+// }
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 }  // namespace fmha
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/iterators/epilogue_predicated_tile_iterator.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/iterators/epilogue_predicated_tile_iterator.h
index a952090840fc..143f3dfc79a9 100644
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/iterators/epilogue_predicated_tile_iterator.h
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/iterators/epilogue_predicated_tile_iterator.h
@@ -308,10 +308,12 @@ class PredicatedTileIteratorPrefetch {
           CUTLASS_PRAGMA_UNROLL
           for (int column = 0; column < ThreadMap::Iterations::kColumn;
                ++column) {
-            unsigned long addr =
-                (unsigned long)((void*)&memory_pointer
-                                    [column * ThreadMap::Delta::kColumn /
-                                     kElementsPerAccess]);
+            // on windows using unsigned long here gives the error
+            // error: asm operand type size(4) does not match
+            // type/size implied by constraint 'l'
+            uint64_t addr = (uint64_t)(
+                (void*)&memory_pointer
+                    [column * ThreadMap::Delta::kColumn / kElementsPerAccess]);
             asm volatile("prefetch.global.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
           }
 
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h
index 5207daa22d6f..5df0d12c2e6e 100644
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h
@@ -79,7 +79,7 @@ struct AttentionKernel {
       cutlass::sizeof_bits<scalar_t>::value == 16;
   static constexpr bool kKeepOutputInRF = kSingleValueIteration;
   static constexpr bool kNeedsOutputAccumulatorBuffer =
-      !kKeepOutputInRF && !std::is_same<output_accum_t, output_t>::value;
+      !kKeepOutputInRF && !cutlass::platform::is_same<output_accum_t, output_t>::value;
 
   static_assert(kQueriesPerBlock % 32 == 0, "");
   static_assert(kKeysPerBlock % 32 == 0, "");
@@ -863,15 +863,19 @@ __global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
       int(__CUDA_ARCH_OR_ZERO__));                                  \
   _ATTENTION_KERNEL_FORWARD_END();
 
+// On windows we don't build with /Zc:preprocessor
+// See: https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly
+#define EXPAND( x ) x
+
 // All kernels are disabled by default
 #define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(...) \
-  INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(50, __VA_ARGS__)
+  EXPAND(INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(50, __VA_ARGS__))
 #define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(...) \
-  INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(70, __VA_ARGS__)
+  EXPAND(INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(70, __VA_ARGS__))
 #define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(...) \
-  INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(75, __VA_ARGS__)
+  EXPAND(INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(75, __VA_ARGS__))
 #define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(...) \
-  INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(80, __VA_ARGS__)
+  EXPAND(INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(80, __VA_ARGS__))
 
 // Enable the right one based on __CUDA_ARCH__
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 500
@@ -879,17 +883,17 @@ __global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
 #elif __CUDA_ARCH__ < 700
 #undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50
 #define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(...) \
-  INSTANTIATE_ATTENTION_KERNEL_FORWARD(50, __VA_ARGS__)
+  EXPAND(INSTANTIATE_ATTENTION_KERNEL_FORWARD(50, __VA_ARGS__))
 #elif __CUDA_ARCH__ < 750
 #undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70
 #define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(...) \
-  INSTANTIATE_ATTENTION_KERNEL_FORWARD(70, __VA_ARGS__)
+  EXPAND(INSTANTIATE_ATTENTION_KERNEL_FORWARD(70, __VA_ARGS__))
 #elif __CUDA_ARCH__ < 800
 #undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75
 #define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(...) \
-  INSTANTIATE_ATTENTION_KERNEL_FORWARD(75, __VA_ARGS__)
+  EXPAND(INSTANTIATE_ATTENTION_KERNEL_FORWARD(75, __VA_ARGS__))
 #elif __CUDA_ARCH__ >= 800
 #undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80
 #define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(...) \
-  INSTANTIATE_ATTENTION_KERNEL_FORWARD(80, __VA_ARGS__)
+  EXPAND(INSTANTIATE_ATTENTION_KERNEL_FORWARD(80, __VA_ARGS__))
 #endif
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index 63252b0f238c..433d5b4cd158 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -21,13 +21,29 @@
 
 namespace sdp {
 
+template <typename To, typename From>
+To bit_cast(From f) {
+  static_assert(sizeof(To) == sizeof(From));
+  To t;
+  std::memcpy(&t, &f, sizeof(f));
+  return t;
+}
+
+// This helper function creates a constexpr std::array
+// From a compile time list of values
+template <typename V, typename... T>
+constexpr auto array_of(T&&... t)
+    -> std::array < V, sizeof...(T) >
+{
+    return {{ std::forward<T>(t)... }};
+}
+
 struct sdp_params {
   const at::Tensor& query;
   const at::Tensor& key;
   const at::Tensor& value;
   bool has_attn_mask;
   double dropout;
-  bool need_attn_weights;
   bool is_causal;
 };
 
@@ -98,18 +114,6 @@ inline bool check_tensor_dtype(
   return true;
 }
 
-inline bool check_for_attn_weights(sdp_params params, bool debug) {
-  // This can be returned form flash attention but care is needed
-  // to convert from flash_attn format to attn_weights
-  if (params.need_attn_weights) {
-    if (debug) {
-      TORCH_WARN("Both fused kernels do not support need_attn_weights=True.");
-    }
-    return false;
-  }
-  return true;
-}
-
 inline bool check_for_non_zero_dropout(sdp_params params, bool debug) {
   if (params.dropout != 0.0) {
     if (debug) {
@@ -121,10 +125,22 @@ inline bool check_for_non_zero_dropout(sdp_params params, bool debug) {
 }
 
 inline bool check_for_seq_len_1_nested_tensor(sdp_params params, bool debug) {
+  // When this function is called we are assured that the nt is dim==4
   if (!params.query.is_nested()) {
     return true;
   }
-  const at::Tensor& sizes = at::native::get_nested_tensor_impl(params.query)->get_nested_size_tensor();
+  // we are only checking query but should probably check all of them
+  const auto nt_q_tensor_impl = at::native::get_nested_tensor_impl(params.query);
+  const at::Tensor& sizes = nt_q_tensor_impl->get_nested_size_tensor();
+  auto num_head_dims = nt_q_tensor_impl->opt_size(1);
+  if (!num_head_dims.has_value() ) {
+    // num_head_dims is ragged
+    if (debug) {
+      TORCH_WARN("Memory efficient attention does not support ragged num_head_dims");
+    }
+    return false;
+  }
+
   auto* sizes_ptr = sizes.data_ptr<int64_t>();
   const int64_t n_tensors = params.query.size(0);
   const int64_t size_tensor_stride = sizes.stride(0);
@@ -133,7 +149,7 @@ inline bool check_for_seq_len_1_nested_tensor(sdp_params params, bool debug) {
   for (const auto i : c10::irange(n_tensors)) {
     if (sizes_ptr[(i * size_tensor_stride) + 1] <= 1) {
       if (debug) {
-        TORCH_WARN("Flash Attention does not support sequence_length <= 1");
+        TORCH_WARN("Memory efficient attention does not support sequence_length <= 1");
       }
       return false;
     }
@@ -153,12 +169,10 @@ inline bool check_for_nested_inputs(sdp_params params, bool debug){
 }
 
 inline bool check_requires_grad(sdp_params params, bool debug) {
-  bool any_tensors_are_subclass =
-      at::areAnyTensorSubclassLike({params.query, params.key, params.value});
   const bool any_inputs_require_grad = params.query.requires_grad() ||
       params.key.requires_grad() || params.value.requires_grad();
   const bool gradmode_enabled = at::GradMode::is_enabled();
-  if ((any_inputs_require_grad && gradmode_enabled) || any_tensors_are_subclass) {
+  if ((any_inputs_require_grad && gradmode_enabled)) {
     if (debug) {
       TORCH_WARN("Flash Attention does not currently support training.");
     }
@@ -207,22 +221,51 @@ inline bool check_tensor_shapes(sdp_params params, bool debug) {
   return true;
 }
 
+inline bool check_equal_batch_size_and_num_heads(sdp_params params, bool debug) {
+  // This is expected to be called after check_tensor_shapes ensuring that the size()
+  // calls won't error since the inputs are all 4 dimensional
+  bool same_batch_size = params.query.size(0) == params.key.size(0) &&
+      params.query.size(0) == params.value.size(0);
+  // We pass through for NestedTensors since this is checked in a later filter
+  bool same_num_heads = params.query.is_nested()
+      ? true
+      : params.query.size(1) == params.key.size(1) &&
+          params.query.size(1) == params.value.size(1);
+
+  if (!(same_batch_size && same_num_heads)) {
+    if (debug) {
+      TORCH_WARN(
+        "Both fused kernels requires query, key and value to have the same batch_size and num_heads. Query.sizes(): ",
+        params.query.sizes(),
+        ", Key sizes(): ",
+        params.key.sizes(),
+        ", Value sizes(): ",
+        params.value.sizes(),
+        " instead.");
+    }
+    return false;
+  }
+  return true;
+}
+
 inline bool check_head_dim_size(sdp_params params, bool debug) {
   const int64_t query_size_last = params.query.size(-1);
+  const int64_t key_size_last = params.key.size(-1);
   const int64_t value_size_last = params.value.size(-1);
-  if (!(query_size_last == params.key.size(-1) && query_size_last % 8 == 0 &&
+  if (!(query_size_last == key_size_last &&
+        query_size_last == value_size_last && query_size_last % 8 == 0 &&
         query_size_last <= 128 && value_size_last % 8 == 0 &&
         value_size_last <= 128)) {
     if (debug) {
       TORCH_WARN(
-        "Flash attention requires last dimension of inputs to be a multiple of 8 and less than or equal to 128.",
-        "Got Query.size(-1): ",
-        query_size_last,
-        ", Key.size(-1): ",
-        params.key.size(-1),
-        ", Value.size(-1): ",
-        params.value.size(-1),
-        " instead.");
+          "Flash attention requires q,k,v to have the same last dimension and to be a multiple of 8 and less than or equal to 128.",
+          " Got Query.size(-1): ",
+          query_size_last,
+          ", Key.size(-1): ",
+          params.key.size(-1),
+          ", Value.size(-1): ",
+          params.value.size(-1),
+          " instead.");
     }
     return false;
   }
@@ -343,6 +386,34 @@ inline bool check_gpu_sm50_or_greater(sdp_params params, bool debug) {
   return true;
 }
 
+inline bool check_gpu_sm86_head_dim_128(sdp_params params, bool debug) {
+  // Memory Efficient Attention is throwing a cuda illegal memory error
+  // on sm86 when head_dim is 128.
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  bool is_sm86 = (dprops->major == 8) && (dprops->minor == 6);
+  if (is_sm86 && (params.query.size(-1) == 128)) {
+    if (debug) {
+      TORCH_WARN(
+        "Memory Efficient Attention does not currently support head_dim == 128 on sm86",
+        "because it is throwing a cuda illegal memory error on sm86 when head_dim is 128.");
+    }
+    return false;
+  }
+  return true;
+}
+
+inline bool check_requires_grad_and_head_dim_128_and_sm86(sdp_params params, bool debug){
+  // Flash Attention will raise an error in the backward pass if the head_dim size is 128
+  // And the device is not sm80, the other head_dim check catches everything but sm86
+  if (!check_requires_grad(params, false) && !check_gpu_sm86_head_dim_128(params, false)){
+    if (debug){
+      TORCH_WARN("Flash attention currently doesn't support training with head_dim == 128 on sm86.");
+    }
+    return false;
+  }
+  return true;
+}
+
 inline bool check_use_deterministic_algorithms(sdp_params params, bool debug) {
   auto& ctx = at::globalContext();
   if (ctx.deterministicAlgorithms()) {
@@ -369,17 +440,18 @@ inline bool use_flash_attention(sdp_params params, bool debug) {
   TORCH_CHECK(!debug, "Torch was not compiled with flash attention.");
   return false;
 #endif
-  //  Define gate functions that determine if a flash kernel can be ran
-  constexpr std::array<bool(*)(sdp_params, bool), 9> constraints {{
+
+  // Define gate functions that determine if a flash kernel can be ran
+  // Replace with std::to_array when we migrate to c++20
+  constexpr auto constraints = array_of<bool (*)(sdp_params, bool)>(
       check_runtime_disabled_flash,
-      check_requires_grad,
       check_tensor_shapes,
-      check_for_attn_weights,
+      check_equal_batch_size_and_num_heads,
       check_for_attn_mask,
       check_head_dim_size,
       check_gpu_sm75_or_greater,
-      check_for_nested_inputs,
-      check_for_seq_len_1_nested_tensor}};
+      check_requires_grad_and_head_dim_128_and_sm86,
+      check_for_seq_len_1_nested_tensor);
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
       return false;
@@ -388,10 +460,10 @@ inline bool use_flash_attention(sdp_params params, bool debug) {
 
   auto dprop = at::cuda::getCurrentDeviceProperties();
   if (dprop->major >= 8) {
-    static const std::array<at::ScalarType, 2> sm80_flash_dtypes{at::kHalf, at::kBFloat16};
+    constexpr auto sm80_flash_dtypes = array_of<at::ScalarType> (at::kHalf, at::kBFloat16);
     return check_tensor_dtype(params, sm80_flash_dtypes, debug);
   } else {
-    static const std::array<at::ScalarType, 1> default_flash_dtypes{at::kHalf};
+    constexpr auto default_flash_dtypes = array_of<at::ScalarType> (at::kHalf);
     return check_tensor_dtype(params, default_flash_dtypes, debug);
   }
 }
@@ -401,28 +473,29 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
   TORCH_CHECK(!debug, "Torch was not compiled with flash attention.");
   return false;
 #endif
-  // Constraints specific to flash attention
-  static const std::vector<caffe2::ScalarType> flash_dtypes{
-      at::kHalf, at::kFloat, at::kBFloat16};
+  // Constraints specific to mem efficient attention
+  constexpr auto mem_efficient_dtypes =
+      array_of<at::ScalarType>(at::kHalf, at::kFloat, at::kBFloat16);
 
   //  Define gate functions that determine if a flash kernel can be ran
-  constexpr std::array<bool(*)(sdp_params, bool), 10> constraints{{
+  constexpr auto constraints = array_of<bool (*)(sdp_params, bool)>(
       check_gpu_sm50_or_greater,
       check_runtime_disabled_mem_efficient,
       check_requires_grad_and_nested,
-      check_for_attn_weights,
       check_tensor_shapes,
+      check_equal_batch_size_and_num_heads,
       check_for_attn_mask,
       check_head_dim_size_mem_efficient,
+      check_gpu_sm86_head_dim_128,
       check_for_seq_len_1_nested_tensor,
       check_for_non_zero_dropout,
-      check_use_deterministic_algorithms}};
+      check_use_deterministic_algorithms);
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
       return false;
     }
   }
-  if (!check_tensor_dtype(params, flash_dtypes, debug)) {
+  if (!check_tensor_dtype(params, mem_efficient_dtypes, debug)) {
     return false;
   }
   return true;
diff --git a/aten/src/ATen/native/transformers/transformer.cpp b/aten/src/ATen/native/transformers/transformer.cpp
index 4a4c9946b35a..fc8a02cd38d4 100644
--- a/aten/src/ATen/native/transformers/transformer.cpp
+++ b/aten/src/ATen/native/transformers/transformer.cpp
@@ -2,6 +2,7 @@
 #include <ATen/Dispatch.h>
 #include <ATen/NativeFunctions.h>
 #include <ATen/NestedTensorImpl.h>
+#include <c10/util/Exception.h>
 
 #include <torch/library.h>
 
@@ -167,6 +168,8 @@ std::tuple<Tensor, Tensor, Tensor>  transformer_decoder_only_layer_forward(
       return std::make_tuple(src_out, incr_key.value(), incr_value.value());
     }
   }
+  TORCH_WARN("_transformer_decoder_only_layer_fwd is deprecated")
+
   TORCH_CHECK(!norm_first, "norm_first is not supported yet");
   auto mha_out = native_decoder_only_multi_head_attention(
       src,
diff --git a/aten/src/ATen/native/vulkan/api/Command.cpp b/aten/src/ATen/native/vulkan/api/Command.cpp
index 323bbd9512ba..cbd6e59e402d 100644
--- a/aten/src/ATen/native/vulkan/api/Command.cpp
+++ b/aten/src/ATen/native/vulkan/api/Command.cpp
@@ -63,11 +63,14 @@ void CommandBuffer::begin() {
 
 void CommandBuffer::end() {
   TORCH_CHECK(
-      state_ == CommandBuffer::State::RECORDING,
+      state_ == CommandBuffer::State::RECORDING ||
+          state_ == CommandBuffer::State::SUBMITTED,
       "Vulkan CommandBuffer: called end() on a command buffer whose state "
-      "is not RECORDING.");
+      "is not RECORDING or SUBMITTED.");
 
-  VK_CHECK(vkEndCommandBuffer(handle_));
+  if (state_ == CommandBuffer::State::RECORDING) {
+    VK_CHECK(vkEndCommandBuffer(handle_));
+  }
   state_ = CommandBuffer::State::READY;
 }
 
@@ -338,7 +341,7 @@ void CommandBuffer::reset_querypool(
   vkCmdResetQueryPool(handle_, querypool, first_idx, count);
 }
 
-VkCommandBuffer CommandBuffer::get_submit_handle() {
+VkCommandBuffer CommandBuffer::get_submit_handle(const bool final_use) {
   TORCH_CHECK(
       state_ == CommandBuffer::State::READY,
       "Vulkan CommandBuffer: called begin() on a command buffer whose state "
@@ -346,8 +349,9 @@ VkCommandBuffer CommandBuffer::get_submit_handle() {
 
   const VkCommandBuffer handle = handle_;
 
-  handle_ = VK_NULL_HANDLE;
-  bound_.reset();
+  if (!is_reusable() || final_use) {
+    invalidate();
+  }
   state_ = CommandBuffer::State::SUBMITTED;
 
   return handle;
@@ -388,7 +392,7 @@ CommandPool::~CommandPool() {
   vkDestroyCommandPool(device_, pool_, nullptr);
 }
 
-CommandBuffer CommandPool::get_new_cmd() {
+CommandBuffer CommandPool::get_new_cmd(bool reusable) {
   std::lock_guard<std::mutex> lock(mutex_);
 
   // No-ops if there are command buffers available
@@ -396,8 +400,13 @@ CommandBuffer CommandPool::get_new_cmd() {
 
   const VkCommandBuffer handle = buffers_[in_use_];
 
+  VkCommandBufferUsageFlags cmd_flags = 0u;
+  if (!reusable) {
+    cmd_flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+  }
+
   in_use_++;
-  return CommandBuffer(handle);
+  return CommandBuffer(handle, cmd_flags);
 }
 
 void CommandPool::flush() {
diff --git a/aten/src/ATen/native/vulkan/api/Command.h b/aten/src/ATen/native/vulkan/api/Command.h
index 9c19095acdeb..7a46e2ebe3cc 100644
--- a/aten/src/ATen/native/vulkan/api/Command.h
+++ b/aten/src/ATen/native/vulkan/api/Command.h
@@ -18,8 +18,7 @@ class CommandBuffer final {
  public:
   explicit CommandBuffer(
       const VkCommandBuffer,
-      const VkCommandBufferUsageFlags =
-          VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
+      const VkCommandBufferUsageFlags);
 
   CommandBuffer(const CommandBuffer&) = delete;
   CommandBuffer& operator=(const CommandBuffer&) = delete;
@@ -69,6 +68,15 @@ class CommandBuffer final {
   Bound bound_;
 
  public:
+  inline bool is_reusable() {
+    return !(flags_ & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
+  }
+
+  inline void invalidate() {
+    handle_ = VK_NULL_HANDLE;
+    bound_.reset();
+  }
+
   void begin();
   void end();
 
@@ -112,7 +120,7 @@ class CommandBuffer final {
   void write_timestamp(const VkQueryPool, const uint32_t) const;
   void reset_querypool(const VkQueryPool, const uint32_t, const uint32_t) const;
 
-  VkCommandBuffer get_submit_handle();
+  VkCommandBuffer get_submit_handle(const bool final_use = false);
 
   inline operator bool() const {
     return VK_NULL_HANDLE != handle_;
@@ -150,7 +158,7 @@ class CommandPool final {
   size_t in_use_;
 
  public:
-  CommandBuffer get_new_cmd();
+  CommandBuffer get_new_cmd(bool reusable = false);
 
   void flush();
 
diff --git a/aten/src/ATen/native/vulkan/api/Context.cpp b/aten/src/ATen/native/vulkan/api/Context.cpp
index d8dbc0d605e9..a5c4349d9aeb 100644
--- a/aten/src/ATen/native/vulkan/api/Context.cpp
+++ b/aten/src/ATen/native/vulkan/api/Context.cpp
@@ -23,7 +23,7 @@ Context::Context(size_t adapter_i, const ContextConfig& config)
 #endif /* USE_VULKAN_GPU_DIAGNOSTICS */
       // Command buffer submission
       cmd_mutex_{},
-      cmd_(VK_NULL_HANDLE),
+      cmd_(VK_NULL_HANDLE, 0u),
       submit_count_{0u},
       // Memory Management
       buffer_clearlist_mutex_{},
@@ -70,10 +70,13 @@ void Context::submit_compute_epilogue(
   command_buffer.dispatch(global_workgroup_size);
 }
 
-void Context::submit_cmd_to_gpu(const VkFence fence_handle) {
+void Context::submit_cmd_to_gpu(
+    const VkFence fence_handle,
+    const bool final_use) {
   if (cmd_) {
     cmd_.end();
-    adapter_p_->submit_cmd(queue_, cmd_.get_submit_handle(), fence_handle);
+    adapter_p_->submit_cmd(
+        queue_, cmd_.get_submit_handle(final_use), fence_handle);
 
     submit_count_ = 0u;
   }
diff --git a/aten/src/ATen/native/vulkan/api/Context.h b/aten/src/ATen/native/vulkan/api/Context.h
index 4991773a7618..d151d9fbf5c0 100644
--- a/aten/src/ATen/native/vulkan/api/Context.h
+++ b/aten/src/ATen/native/vulkan/api/Context.h
@@ -157,14 +157,14 @@ class Context final {
     return std::unique_lock<std::mutex>(cmd_mutex_);
   }
 
- private:
-  inline void set_cmd() {
+  inline void set_cmd(bool reusable = false) {
     if (!cmd_) {
-      cmd_ = command_pool_.get_new_cmd();
+      cmd_ = command_pool_.get_new_cmd(reusable);
       cmd_.begin();
     }
   }
 
+ private:
   DescriptorSet submit_compute_prologue(
       CommandBuffer&,
       const ShaderInfo&,
@@ -196,10 +196,10 @@ class Context final {
       const VkFence fence_handle,
       Arguments&&...);
 
- private:
-  void submit_cmd_to_gpu(const VkFence fence_handle = VK_NULL_HANDLE);
+  void submit_cmd_to_gpu(
+      const VkFence fence_handle = VK_NULL_HANDLE,
+      const bool final_use = false);
 
- public:
   void flush();
 };
 
@@ -257,14 +257,18 @@ class StorageBuffer final {
   StorageBuffer(const StorageBuffer&) = delete;
   StorageBuffer& operator=(const StorageBuffer&) = delete;
 
-  StorageBuffer(StorageBuffer&&) = delete;
-  StorageBuffer& operator=(StorageBuffer&&) = delete;
+  StorageBuffer(StorageBuffer&&) = default;
+  StorageBuffer& operator=(StorageBuffer&&) = default;
 
   ~StorageBuffer() {
     context_p_->register_buffer_cleanup(vulkan_buffer_);
   }
 
-  VulkanBuffer& buffer() {
+  inline c10::ScalarType dtype() {
+    return dtype_;
+  }
+
+  inline VulkanBuffer& buffer() {
     return vulkan_buffer_;
   }
 };
diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.cpp b/aten/src/ATen/native/vulkan/api/Descriptor.cpp
index 37ba55ca1c36..9bb4fb1740cc 100644
--- a/aten/src/ATen/native/vulkan/api/Descriptor.cpp
+++ b/aten/src/ATen/native/vulkan/api/Descriptor.cpp
@@ -41,20 +41,14 @@ DescriptorSet& DescriptorSet::operator=(DescriptorSet&& other) noexcept {
 DescriptorSet& DescriptorSet::bind(
     const uint32_t idx,
     const VulkanBuffer& buffer) {
-  add_binding(DescriptorSet::ResourceBinding{
-      idx, // binding_idx
-      shader_layout_signature_[idx], // descriptor_type
-      false, // is_image
-      {
-          // resource_info
-          .buffer_info =
-              {
-                  buffer.handle(), // buffer
-                  buffer.mem_offset(), // offset
-                  buffer.mem_range(), // range
-              },
-      },
-  });
+  DescriptorSet::ResourceBinding binder;
+  binder.binding_idx = idx; // binding_idx
+  binder.descriptor_type = shader_layout_signature_[idx]; // descriptor_type
+  binder.is_image = false; // is_image
+  binder.resource_info.buffer_info.buffer = buffer.handle(); // buffer
+  binder.resource_info.buffer_info.offset = buffer.mem_offset(); // offset
+  binder.resource_info.buffer_info.range = buffer.mem_range(); // range
+  add_binding(std::move(binder));
 
   return *this;
 }
@@ -67,20 +61,14 @@ DescriptorSet& DescriptorSet::bind(
     binding_layout = VK_IMAGE_LAYOUT_GENERAL;
   }
 
-  add_binding(DescriptorSet::ResourceBinding{
-      idx, // binding_idx
-      shader_layout_signature_[idx], // descriptor_type
-      true, // is_image
-      {
-          // resource_info
-          .image_info =
-              {
-                  image.sampler(), // buffer
-                  image.image_view(), // imageView
-                  binding_layout, // imageLayout
-              },
-      },
-  });
+  DescriptorSet::ResourceBinding binder;
+  binder.binding_idx = idx; // binding_idx
+  binder.descriptor_type = shader_layout_signature_[idx]; // descriptor_type
+  binder.is_image = true; // is_image
+  binder.resource_info.image_info.sampler = image.sampler(); // buffer
+  binder.resource_info.image_info.imageView = image.image_view(); // imageView
+  binder.resource_info.image_info.imageLayout = binding_layout; // imageLayout
+  add_binding(std::move(binder));
 
   return *this;
 }
diff --git a/aten/src/ATen/native/vulkan/api/Tensor.cpp b/aten/src/ATen/native/vulkan/api/Tensor.cpp
index c2959c14e1cf..4568cfff20b3 100644
--- a/aten/src/ATen/native/vulkan/api/Tensor.cpp
+++ b/aten/src/ATen/native/vulkan/api/Tensor.cpp
@@ -124,8 +124,7 @@ c10::SmallVector<int64_t, 6u> calc_gpu_sizes(
 
     c10::SmallVector<int64_t, 6u> gpu_sizes(3);
 
-    // Channel dim will be always be aligned. For 4 dimensional tensors, batch
-    // and channel are combined, then aligned.
+    // Channel dim will be be aligned to the next multiple of 4
     switch (ndim) {
       case 1:
         gpu_sizes[0] = 4;
@@ -146,8 +145,8 @@ c10::SmallVector<int64_t, 6u> calc_gpu_sizes(
         break;
 
       case 4:
-        int64_t combined_depth = sizes[0] * sizes[1];
-        gpu_sizes[0] = api::utils::align_up(combined_depth, INT64_C(4));
+        int64_t padded_c = api::utils::align_up(sizes[1], INT64_C(4));
+        gpu_sizes[0] = sizes[0] * padded_c;
         gpu_sizes[1] = sizes[2];
         gpu_sizes[2] = sizes[3];
         break;
diff --git a/aten/src/ATen/native/vulkan/api/Tensor.h b/aten/src/ATen/native/vulkan/api/Tensor.h
index 34fed0aad62b..80aee396639a 100644
--- a/aten/src/ATen/native/vulkan/api/Tensor.h
+++ b/aten/src/ATen/native/vulkan/api/Tensor.h
@@ -101,6 +101,16 @@ class vTensor final {
       const api::StorageType storage_type = api::StorageType::TEXTURE_3D,
       const c10::MemoryFormat memory_format = c10::MemoryFormat::Contiguous);
 
+  // Copy Constructor and Assignment; Ideally copying  would be disabled
+  // (see the reasoning for move assignment below) but it is required for
+  // compatibility with OpaqueTensorImpl
+  vTensor(const vTensor& other) = default;
+  vTensor& operator=(const vTensor& other) = default;
+
+  // Move Constructor and assignment
+  vTensor(vTensor&& other) = default;
+  vTensor& operator=(vTensor&& other) = default;
+
   // Used for passing buffer sizes and strides data to shaders
   struct BufferMetadata {
     api::utils::uvec4 sizes;
@@ -269,6 +279,10 @@ class vTensor final {
     return c10::multiply_integers(sizes());
   }
 
+  inline size_t nbytes() const {
+    return c10::elementSize(dtype()) * numel();
+  }
+
   /*
    * Returns numel but based on gpu_sizes_ instead of sizes_
    */
@@ -276,10 +290,6 @@ class vTensor final {
     return view_->buffer_length_;
   }
 
-  inline size_t nbytes() const {
-    return c10::elementSize(dtype()) * numel();
-  }
-
   /*
    * Return nbytes but bnased on gpu_sizes_ instead of sizes_
    */
diff --git a/aten/src/ATen/native/vulkan/glsl/abs.glsl b/aten/src/ATen/native/vulkan/glsl/abs.glsl
new file mode 100644
index 000000000000..0113e03cafa6
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/abs.glsl
@@ -0,0 +1,27 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT    $format
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D   uOutput;
+layout(set = 0, binding = 1)         uniform PRECISION                    sampler3D uInput;
+layout(set = 0, binding = 2)         uniform PRECISION restrict           Block {
+  ivec4 size;
+} uBlock;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (all(lessThan(pos, uBlock.size.xyz))) {
+    const vec4 intex = texelFetch(uInput, pos, 0);
+    imageStore(
+        uOutput,
+        pos,
+        abs(intex));
+  }
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/abs_.glsl b/aten/src/ATen/native/vulkan/glsl/abs_.glsl
new file mode 100644
index 000000000000..dcf4125b0de4
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/abs_.glsl
@@ -0,0 +1,26 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT    $format
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict image3D uOutput;
+layout(set = 0, binding = 1)         uniform PRECISION restrict Block {
+  ivec4 size;
+} uBlock;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (all(lessThan(pos, uBlock.size.xyz))) {
+    const vec4 intex = imageLoad(uOutput, pos);
+    imageStore(
+        uOutput,
+        pos,
+        abs(intex));
+  }
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/cat_feature.glsl b/aten/src/ATen/native/vulkan/glsl/cat_feature.glsl
index fbd30345c293..26544771d6fd 100644
--- a/aten/src/ATen/native/vulkan/glsl/cat_feature.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/cat_feature.glsl
@@ -1,47 +1,74 @@
 #version 450 core
 #define PRECISION $precision
-#define FORMAT    $format
+#define FORMAT $format
 
 layout(std430) buffer;
 
-/* Qualifiers: layout - storage - precision - memory */
+/*
+ * Output Image
+ */
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION image3D uOutput;
 
-layout(set = 0, binding = 0, FORMAT) uniform PRECISION           image3D uOutput;
-layout(set = 0, binding = 1)         uniform PRECISION           sampler3D uInput;
-layout(set = 0, binding = 2)         uniform PRECISION restrict  Block {
-  ivec4 size;            // output texture size (x=width,y=height,z=depth,w=unused)
-  ivec4 isize;           // input texture size (x=width,y=height,z=depth,w=unused)
-  uint batch_size;       // input tensor's batch size
-  uint ch_size;          // input tensor's channel size
-  uint ch_interval;      // channel interval (total # of channels for all tensors)
-  uint ch_size_allprior; // # of channels for tensor 0 to i-1 at ith tensor
-} uBlock;
+/*
+ * Input Textures
+ */
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
 
+/*
+ * Params Buffer
+ */
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  // output texture size (x=width,y=height,z=depth,w=unused)
+  ivec4 out_extents;
+  // input texture size (x=width,y=height,z=depth,w=unused)
+  ivec4 in_extents;
+  // input tensor's batch size
+  uint batch_size;
+  // input tensor's channel size
+  uint ch_size;
+  // channel interval (total # of channels for all tensors)
+  uint ch_interval;
+  // # of channels for tensor 0 to i-1 at ith tensor
+  uint ch_size_allprior;
+}
+uBlock;
+
+/*
+ * Local Work Group
+ */
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 void main() {
-  const ivec3 posIn = ivec3(gl_GlobalInvocationID);
+  const ivec3 in_pos = ivec3(gl_GlobalInvocationID);
   const uint max_src_index = uBlock.ch_size * uBlock.batch_size;
 
-  if (all(lessThan(posIn, uBlock.isize.xyz))) {
-    ivec3 posOut = posIn; // x and y don't change. only z and index matter
-    const vec4 inval = texelFetch(uInput, posIn, 0);
-
-    for (uint i = 0; i < 4; ++i)
-    {
-      uint src_index = posIn.z * 4 + i;
-      if (src_index >= max_src_index) {
-        // out of range
-        break;
-      }
-
-      uint dst_index = uint(src_index / uBlock.ch_size) * uBlock.ch_interval + (src_index % uBlock.ch_size) + uBlock.ch_size_allprior;
-      posOut.z = int(dst_index / 4);
-      uint j = (dst_index % 4);
-
-      vec4 outval = imageLoad(uOutput, posOut);
-      outval[j] = inval[i];
-      imageStore(uOutput, posOut, outval);
+  if (any(greaterThanEqual(in_pos, uBlock.in_extents.xyz))) {
+    return;
+  }
+
+  // x and y don't change. only z and index matter
+  ivec3 out_pos = in_pos;
+  const vec4 in_tex = texelFetch(uInput, in_pos, 0);
+
+  for (uint i = 0; i < 4; ++i) {
+    uint src_index = in_pos.z * 4 + i;
+
+    if (src_index >= max_src_index) {
+      // out of range
+      break;
     }
+
+    uint src_n_idx = src_index / uBlock.ch_size;
+    uint src_c_idx = src_index % uBlock.ch_size;
+
+    uint dst_nc_idx =
+        src_n_idx * uBlock.ch_interval + src_c_idx + uBlock.ch_size_allprior;
+
+    out_pos.z = int(dst_nc_idx / 4);
+    uint j = (dst_nc_idx % 4);
+
+    vec4 out_tex = imageLoad(uOutput, out_pos);
+    out_tex[j] = in_tex[i];
+    imageStore(uOutput, out_pos, out_tex);
   }
 }
diff --git a/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl b/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl
index 50600fdcdcfb..05ee499b50f6 100644
--- a/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl
@@ -20,9 +20,10 @@ uBuffer;
  * Params Buffer
  */
 layout(set = 0, binding = 2) uniform PRECISION restrict Block {
-  // xyz contain the extents of the input texture, w contains HxW to help
-  // calculate buffer offsets
+  // Extents of the output texture
   ivec4 in_extents;
+  // Number of texels spanned by one channel
+  ivec2 c_info;
 }
 uBlock;
 
@@ -40,13 +41,25 @@ void main() {
 
   const vec4 intex = texelFetch(uImage, pos, 0);
 
+  const int n_index = int(pos.z / uBlock.c_info.x);
+  const int c_index = (pos.z % uBlock.c_info.x) * 4;
+  int d_offset = (n_index * uBlock.c_info.y) + c_index;
+
   const int base_index =
-      pos.x + uBlock.in_extents.x * pos.y + (4 * uBlock.in_extents.w) * pos.z;
+      pos.x + uBlock.in_extents.x * pos.y + uBlock.in_extents.w * d_offset;
   const ivec4 buf_indices =
       base_index + ivec4(0, 1, 2, 3) * uBlock.in_extents.w;
 
-  uBuffer.data[buf_indices.x] = intex.x;
-  uBuffer.data[buf_indices.y] = intex.y;
-  uBuffer.data[buf_indices.z] = intex.z;
-  uBuffer.data[buf_indices.w] = intex.w;
+  if (c_index < uBlock.c_info.y) {
+    uBuffer.data[buf_indices.x] = intex.x;
+  }
+  if (c_index + 1 < uBlock.c_info.y) {
+    uBuffer.data[buf_indices.y] = intex.y;
+  }
+  if (c_index + 2 < uBlock.c_info.y) {
+    uBuffer.data[buf_indices.z] = intex.z;
+  }
+  if (c_index + 3 < uBlock.c_info.y) {
+    uBuffer.data[buf_indices.w] = intex.w;
+  }
 }
diff --git a/aten/src/ATen/native/vulkan/glsl/mean.glsl b/aten/src/ATen/native/vulkan/glsl/mean.glsl
index dc59ca5cba5e..46bbc2484954 100644
--- a/aten/src/ATen/native/vulkan/glsl/mean.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/mean.glsl
@@ -1,54 +1,77 @@
 #version 450 core
 #define PRECISION $precision
-#define FORMAT    $format
+#define FORMAT $format
 
 layout(std430) buffer;
 
-/* Qualifiers: layout - storage - precision - memory */
+/*
+ * Output Image
+ */
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
 
-layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D   uOutput;
-layout(set = 0, binding = 1)         uniform PRECISION                    sampler3D uInput;
-layout(set = 0, binding = 2)         uniform PRECISION restrict           Block {
-  ivec4 size;
-  ivec3 isize;
-} uBlock;
+/*
+ * Input Textures
+ */
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
 
+/*
+ * Params Buffer
+ */
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  // extents of the output texture
+  // w contains pre-computed H*W of the input texture for convenience
+  ivec4 out_extents;
+  // extents of the input texture
+  // w contains size of input channels aligned to 4
+  ivec4 in_extents;
+}
+uBlock;
+
+/*
+ * Shared memory buffer
+ */
 shared vec4 sh_mem[64];
 
+/*
+ * Local Work Group
+ */
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
+/*
+ * Computes the mean of an input tensor along the width, height, and channel
+ * axes.
+ */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
   const ivec3 tid = ivec3(gl_LocalInvocationID);
   const ivec3 group_size = ivec3(gl_WorkGroupSize);
 
-  if (pos.z < uBlock.isize.z) {
+  if (pos.z < uBlock.in_extents.z) {
     vec4 sum = vec4(0);
 
-    for (int y = tid.y; y < uBlock.isize.y; y+=group_size.y) {
-      for (int x = tid.x; x < uBlock.isize.x; x+=group_size.x) {
+    for (int y = tid.y; y < uBlock.in_extents.y; y += group_size.y) {
+      for (int x = tid.x; x < uBlock.in_extents.x; x += group_size.x) {
         sum += texelFetch(uInput, ivec3(x, y, pos.z), 0);
       }
     }
 
-    sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] = sum;
+    sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] =
+        sum;
   }
   memoryBarrierShared();
   barrier();
 
-  if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.size.z) {
+  if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.out_extents.z) {
     return;
   }
 
   vec4 total = vec4(0);
   for (int y = 0; y < group_size.y; ++y) {
     for (int x = 0; x < group_size.x; ++x) {
-      total += sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x];
+      total +=
+          sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x];
     }
   }
 
-  imageStore(
-      uOutput,
-      pos,
-      total / uBlock.size.w);
+  imageStore(uOutput, pos, total / uBlock.out_extents.w);
 }
diff --git a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl
index 5f949ea83d29..b79dd7c4e8c6 100644
--- a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl
@@ -1,73 +1,90 @@
 #version 450 core
 #define PRECISION $precision
-#define FORMAT    $format
+#define FORMAT $format
 
 layout(std430) buffer;
 
-/* Qualifiers: layout - storage - precision - memory */
-
-layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D   uOutput;
-layout(set = 0, binding = 1)         uniform PRECISION                    sampler3D uInput;
-layout(set = 0, binding = 2)         uniform PRECISION restrict           Block {
-  ivec4 size;
-  ivec3 isize;
-} uBlock;
+/*
+ * Output Image
+ */
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
+
+/*
+ * Input Textures
+ */
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+
+/*
+ * Params Buffer
+ */
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  // extents of the output texture
+  // w contains pre-computed H*W of the input texture for convenience
+  ivec4 out_extents;
+  // extents of the input texture
+  // w contains size of input channels aligned to 4
+  ivec4 in_extents;
+}
+uBlock;
 
+/*
+ * Shared memory buffer
+ */
 shared vec4 sh_mem[64];
 
+/*
+ * Local Work Group
+ */
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
+/*
+ * Computes the mean of an input tensor along the width and height axes.
+ */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
   const ivec3 tid = ivec3(gl_LocalInvocationID);
   const ivec3 group_size = ivec3(gl_WorkGroupSize);
 
-  if (pos.z < uBlock.isize.z) {
+  if (pos.z < uBlock.in_extents.z) {
     vec4 sum = vec4(0);
 
-    for (int y = tid.y; y < uBlock.isize.y; y+=group_size.y) {
-      for (int x = tid.x; x < uBlock.isize.x; x+=group_size.x) {
+    for (int y = tid.y; y < uBlock.in_extents.y; y += group_size.y) {
+      for (int x = tid.x; x < uBlock.in_extents.x; x += group_size.x) {
         sum += texelFetch(uInput, ivec3(x, y, pos.z), 0);
       }
     }
 
-    sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] = sum;
+    sh_mem[tid.z * group_size.y * group_size.x + tid.y * group_size.x + tid.x] =
+        sum;
   }
   memoryBarrierShared();
   barrier();
 
-  if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.isize.z) {
+  if (tid.y > 0 || tid.x > 0 || pos.z >= uBlock.in_extents.z) {
     return;
   }
 
   vec4 total = vec4(0);
   for (int y = 0; y < group_size.y; ++y) {
     for (int x = 0; x < group_size.x; ++x) {
-      total += sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x];
+      total +=
+          sh_mem[tid.z * group_size.y * group_size.x + y * group_size.x + x];
     }
   }
 
-  const vec4 outtex = total / uBlock.size.w;
-  const int zoutx = 4*pos.z;
-  const int width = uBlock.size.x;
-  const int maxlen = uBlock.size.x * uBlock.size.y;
-
-  const int zouty = min(zoutx + 1, maxlen);
-  ivec3 posy = ivec3((zouty)%width, (zouty)/width, 0);
-  vec4 outy = vec4(outtex.y, 0, 0, 0);
-  imageStore(uOutput, posy, outy);
-
-  const int zoutz = min(zoutx + 2, maxlen);
-  ivec3 posz = ivec3((zoutz)%width, (zoutz)/width, 0);
-  vec4 outz = vec4(outtex.z, 0, 0, 0);
-  imageStore(uOutput, posz, outz);
-
-  const int zoutw = min(zoutx + 3, maxlen);
-  ivec3 posw = ivec3((zoutw)%width, (zoutw)/width, 0);
-  vec4 outw = vec4(outtex.w, 0, 0, 0);
-  imageStore(uOutput, posw, outw);
-
-  ivec3 posx = ivec3(zoutx%width, zoutx/width, 0);
-  vec4 outx = vec4(outtex.x, 0, 0, 0);
-  imageStore(uOutput, posx, outx);
+  const vec4 outtex = total / uBlock.out_extents.w;
+
+  const int nc_idx = pos.z * 4;
+  const int out_width = uBlock.out_extents.x;
+  const int out_height = uBlock.out_extents.y;
+
+  for (int i = 0; i < 4; ++i) {
+    const int n_idx = (nc_idx + i) / uBlock.in_extents.w;
+    const int c_idx = (nc_idx + i) % uBlock.in_extents.w;
+
+    ivec3 pos = ivec3(c_idx, n_idx, 0);
+    if (c_idx < out_width && n_idx < out_height) {
+      imageStore(uOutput, pos, vec4(outtex[i], 0, 0, 0));
+    }
+  }
 }
diff --git a/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl b/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl
index 70f57c0742ad..be1f2520b7c8 100644
--- a/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl
@@ -21,9 +21,10 @@ uBuffer;
  * Params Buffer
  */
 layout(set = 0, binding = 2) uniform PRECISION restrict Block {
-  // xyz contain the extents of the output texture, w contains HxW to help
-  // calculate buffer offsets
+  // Extents of the output texture
   ivec4 out_extents;
+  // Number of texels spanned by one channel
+  ivec2 c_info;
 }
 uBlock;
 
@@ -39,15 +40,31 @@ void main() {
     return;
   }
 
+  const int n_index = int(pos.z / uBlock.c_info.x);
+  const int c_index = (pos.z % uBlock.c_info.x) * 4;
+  int d_offset = (n_index * uBlock.c_info.y) + c_index;
+
   const int base_index =
-      pos.x + uBlock.out_extents.x * pos.y + (4 * uBlock.out_extents.w) * pos.z;
+      pos.x + uBlock.out_extents.x * pos.y + uBlock.out_extents.w * d_offset;
   const ivec4 buf_indices =
       base_index + ivec4(0, 1, 2, 3) * uBlock.out_extents.w;
 
-  float val_x = uBuffer.data[buf_indices.x];
-  float val_y = uBuffer.data[buf_indices.y];
-  float val_z = uBuffer.data[buf_indices.z];
-  float val_w = uBuffer.data[buf_indices.w];
+  float val_x = 0;
+  if (c_index < uBlock.c_info.y) {
+    val_x = uBuffer.data[buf_indices.x];
+  }
+  float val_y = 0;
+  if (c_index + 1 < uBlock.c_info.y) {
+    val_y = uBuffer.data[buf_indices.y];
+  }
+  float val_z = 0;
+  if (c_index + 2 < uBlock.c_info.y) {
+    val_z = uBuffer.data[buf_indices.z];
+  }
+  float val_w = 0;
+  if (c_index + 3 < uBlock.c_info.y) {
+    val_w = uBuffer.data[buf_indices.w];
+  }
 
   imageStore(uImage, pos, vec4(val_x, val_y, val_z, val_w));
 }
diff --git a/aten/src/ATen/native/vulkan/glsl/permute_4d.glsl b/aten/src/ATen/native/vulkan/glsl/permute_4d.glsl
index 95b8858d2f46..fc57ba4d3db4 100644
--- a/aten/src/ATen/native/vulkan/glsl/permute_4d.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/permute_4d.glsl
@@ -1,28 +1,48 @@
 #version 450 core
 #define PRECISION $precision
-#define FORMAT    $format
+#define FORMAT $format
 
 layout(std430) buffer;
 
-/* Qualifiers: layout - storage - precision - memory */
-
-layout(set = 0, binding = 0, FORMAT) uniform PRECISION           image3D uOutput;
-layout(set = 0, binding = 1)         uniform PRECISION           sampler3D uInput;
-layout(set = 0, binding = 2)         uniform PRECISION restrict  Block {
-  ivec4 size;            // output texture size (x=width,y=height,z=depth,w=unused)
-  ivec4 isize;           // input texture size (x=width,y=height,z=depth,w=unused)
-  uvec4 tensor_size;     // output tensor size
-  uvec4 itensor_size;    // input tensor size
-  uvec4 dims;            // output dims
-} uBlock;
+/*
+ * Output Image
+ */
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION image3D uOutput;
+
+/*
+ * Input Textures
+ */
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+
+/*
+ * Params Buffer
+ */
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  // output texture size (x=width,y=height,z=depth,w=unused)
+  ivec4 size;
+  // input texture size (x=width,y=height,z=depth,w=unused)
+  ivec4 isize;
+  // output tensor size
+  uvec4 out_tensor_size;
+  // input tensor size
+  uvec4 in_tensor_size;
+  // output dims
+  uvec4 out_ndims;
+  // x = output channels aligned to 4, y = input channels aligned to 4
+  uvec2 ch_info;
+}
+uBlock;
 
+/*
+ * Local Work Group
+ */
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 void main() {
   const ivec3 posOut = ivec3(gl_GlobalInvocationID);
 
   if (all(lessThan(posOut, uBlock.size.xyz))) {
-    const uint max_dst_index = uBlock.tensor_size[0] * uBlock.tensor_size[1];
+    const uint max_dst_index = uBlock.out_tensor_size[0] * uBlock.ch_info.x;
     vec4 outval = vec4(0.0);
 
     for (uint j = 0; j < 4; ++j) {
@@ -33,73 +53,73 @@ void main() {
         break;
       }
 
-      uint b1 = int(dst_index / uBlock.tensor_size[1]);
-      uint c1 = dst_index % uBlock.tensor_size[1];
+      uint b1 = int(dst_index / uBlock.ch_info.x);
+      uint c1 = dst_index % uBlock.ch_info.x;
       uint h1 = posOut.y;
       uint w1 = posOut.x;
 
       uint b, c, h, w;
-      switch (uBlock.dims[0]) {
-      case 0:
-        b = b1;
-        break;
-      case 1:
-        c = b1;
-        break;
-      case 2:
-        h = b1;
-        break;
-      case 3:
-        w = b1;
-        break;
+      switch (uBlock.out_ndims[0]) {
+        case 0:
+          b = b1;
+          break;
+        case 1:
+          c = b1;
+          break;
+        case 2:
+          h = b1;
+          break;
+        case 3:
+          w = b1;
+          break;
       }
 
-      switch (uBlock.dims[1]) {
-      case 0:
-        b = c1;
-        break;
-      case 1:
-        c = c1;
-        break;
-      case 2:
-        h = c1;
-        break;
-      case 3:
-        w = c1;
-        break;
+      switch (uBlock.out_ndims[1]) {
+        case 0:
+          b = c1;
+          break;
+        case 1:
+          c = c1;
+          break;
+        case 2:
+          h = c1;
+          break;
+        case 3:
+          w = c1;
+          break;
       }
 
-      switch (uBlock.dims[2]) {
-      case 0:
-        b = h1;
-        break;
-      case 1:
-        c = h1;
-        break;
-      case 2:
-        h = h1;
-        break;
-      case 3:
-        w = h1;
-        break;
+      switch (uBlock.out_ndims[2]) {
+        case 0:
+          b = h1;
+          break;
+        case 1:
+          c = h1;
+          break;
+        case 2:
+          h = h1;
+          break;
+        case 3:
+          w = h1;
+          break;
       }
 
-      switch (uBlock.dims[3]) {
-      case 0:
-        b = w1;
-        break;
-      case 1:
-        c = w1;
-        break;
-      case 2:
-        h = w1;
-        break;
-      case 3:
-        w = w1;
-        break;
+      switch (uBlock.out_ndims[3]) {
+        case 0:
+          b = w1;
+          break;
+        case 1:
+          c = w1;
+          break;
+        case 2:
+          h = w1;
+          break;
+        case 3:
+          w = w1;
+          break;
       }
 
-      uint src_index = b * uBlock.itensor_size[1] + c;
+      uint src_index = b * uBlock.ch_info.y + c;
       ivec3 posIn;
       posIn.x = int(w);
       posIn.y = int(h);
@@ -114,5 +134,4 @@ void main() {
       }
     }
   }
-
 }
diff --git a/aten/src/ATen/native/vulkan/glsl/select_height.glsl b/aten/src/ATen/native/vulkan/glsl/select_height.glsl
new file mode 100644
index 000000000000..db6a2bf22695
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/select_height.glsl
@@ -0,0 +1,40 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT $format
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec3 size;
+  int index;
+} uBlock;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  // w
+  const int src_x = pos.x;
+  // h
+  const int src_y = uBlock.index;
+  // c
+  const int src_z = pos.y;
+
+  const vec4 v = texelFetch(uInput, ivec3(src_x, src_y, src_z), 0);
+
+  for (int i = 0; i < 4; i++) {
+    ivec3 new_pos = ivec3(pos.x, pos.y * 4 + i, 0);
+
+    // When the C-channel exceeds original block size, exit early
+    if (new_pos.y >= uBlock.size.y) {
+      return;
+    }
+
+    imageStore(uOutput, new_pos, vec4(v[i], 0, 0, 0));
+  }
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/select_width.glsl b/aten/src/ATen/native/vulkan/glsl/select_width.glsl
new file mode 100644
index 000000000000..6b3f1c615785
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/select_width.glsl
@@ -0,0 +1,40 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT $format
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  ivec3 size;
+  int index;
+} uBlock;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  // w
+  const int src_x = uBlock.index;
+  // h
+  const int src_y = pos.x;
+  // c
+  const int src_z = pos.y;
+
+  const vec4 v = texelFetch(uInput, ivec3(src_x, src_y, src_z), 0);
+
+  for (int i = 0; i < 4; i++) {
+    ivec3 new_pos = ivec3(pos.x, pos.y * 4 + i, 0);
+
+    // When the C-channel exceeds original block size, exit early
+    if (new_pos.y >= uBlock.size.y) {
+      return;
+    }
+
+    imageStore(uOutput, new_pos, vec4(v[i], 0, 0, 0));
+  }
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/slice_4d.glsl b/aten/src/ATen/native/vulkan/glsl/slice_4d.glsl
index d878fc41885e..3f0b441b1ac5 100644
--- a/aten/src/ATen/native/vulkan/glsl/slice_4d.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/slice_4d.glsl
@@ -1,26 +1,46 @@
 #version 450 core
 #define PRECISION $precision
-#define FORMAT    $format
+#define FORMAT $format
 
 layout(std430) buffer;
 
-/* Qualifiers: layout - storage - precision - memory */
+/*
+ * Output Image
+ */
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION image3D uOutput;
 
-layout(set = 0, binding = 0, FORMAT) uniform PRECISION           image3D uOutput;
-layout(set = 0, binding = 1)         uniform PRECISION           sampler3D uInput;
-layout(set = 0, binding = 2)         uniform PRECISION restrict  Block {
-  ivec4 size;            // output texture size (x=width,y=height,z=depth,w=unused)
-  ivec4 isize;           // input texture size (x=width,y=height,z=depth,w=unused)
-  uvec4 tensor_size;     // output tensor size
-  uvec4 itensor_size;    // input tensor size
-  uvec4 args;            // input arguments (dim, start, end, step)
-} uBlock;
+/*
+ * Input Textures
+ */
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
 
+/*
+ * Params Buffer
+ */
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  // output texture size (x=width,y=height,z=depth,w=unused)
+  ivec4 size;
+  // input texture size (x=width,y=height,z=depth,w=unused)
+  ivec4 isize;
+  // output tensor size
+  uvec4 tensor_size;
+  // input tensor size
+  uvec4 itensor_size;
+  // input arguments (dim, start, end, step)
+  uvec4 args;
+  // x = output channels aligned to 4, y = input channels aligned to 4
+  uvec2 c_info;
+}
+uBlock;
+
+/*
+ * Local Work Group
+ */
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 void main() {
   const ivec3 posOut = ivec3(gl_GlobalInvocationID);
-  const uint max_dst_index = uBlock.tensor_size[0] * uBlock.tensor_size[1];
+  const uint max_dst_index = uBlock.tensor_size[0] * uBlock.c_info.x;
   const uint dim = uBlock.args[0];
   const uint start = uBlock.args[1];
   const uint step = uBlock.args[3];
@@ -36,8 +56,8 @@ void main() {
       }
 
       // dst dims
-      uint b1 = int(dst_index / uBlock.tensor_size[1]);
-      uint c1 = dst_index % uBlock.tensor_size[1];
+      uint b1 = int(dst_index / uBlock.c_info.x);
+      uint c1 = dst_index % uBlock.c_info.x;
       uint h1 = posOut.y;
       uint w1 = posOut.x;
 
@@ -49,12 +69,11 @@ void main() {
 
       if (dim == 0) { // batch
         b = start + step * b1;
-      }
-      else if (dim == 1) {  // feature(channel)
+      } else if (dim == 1) { // feature(channel)
         c = start + step * c1;
       }
 
-      uint src_index = b * uBlock.itensor_size[1] + c;
+      uint src_index = b * uBlock.c_info.y + c;
       ivec3 posIn;
       posIn.x = int(w);
       posIn.y = int(h);
diff --git a/aten/src/ATen/native/vulkan/graph/Arithmetic.cpp b/aten/src/ATen/native/vulkan/graph/Arithmetic.cpp
new file mode 100644
index 000000000000..716cb6c7e14f
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Arithmetic.cpp
@@ -0,0 +1,101 @@
+#include <ATen/native/vulkan/impl/Common.h>
+
+#include <ATen/native/vulkan/graph/Arithmetic.h>
+#include <ATen/native/vulkan/graph/Staging.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+void add_arithmetic_node(
+    ComputeGraph& graph,
+    const ValueRef t1,
+    const ValueRef t2,
+    const ValueRef out,
+    const float alpha,
+    const arithmetic::OpType optype) {
+  // Prepacking first arg (if needed)
+  ValueRef arg1 = t1;
+  if (graph.get_val(t1).isTensorRef()) {
+    TensorRef& t1_asref = graph.get_val(t1).toTensorRef();
+    ValueRef t1_vten = graph.add_tensor(t1_asref.sizes, t1_asref.dtype);
+    graph.prepack_nodes().emplace_back(new ArithmeticPrepack(t1, t1_vten));
+    arg1 = t1_vten;
+  }
+  VKGRAPH_CHECK(graph.get_val(arg1).isTensor());
+  // Prepacking second arg (if needed)
+  ValueRef arg2 = t2;
+  if (graph.get_val(t2).isTensorRef()) {
+    TensorRef& t2_asref = graph.get_val(t2).toTensorRef();
+    ValueRef t2_vten = graph.add_tensor(t2_asref.sizes, t2_asref.dtype);
+    graph.prepack_nodes().emplace_back(new ArithmeticPrepack(t2, t2_vten));
+    arg2 = t2_vten;
+  }
+  VKGRAPH_CHECK(graph.get_val(arg2).isTensor());
+
+  graph.execute_nodes().emplace_back(
+      new ArithmeticNode(arg1, arg2, out, alpha, optype));
+}
+
+ValueRef add_arithmetic_node(
+    ComputeGraph& graph,
+    const ValueRef t1,
+    const ValueRef t2,
+    const float alpha,
+    const arithmetic::OpType optype) {
+  IntArrayRef t1_sizes = graph.get_val_sizes(t1);
+  c10::ScalarType t1_dtype = graph.get_val_dtype(t1);
+
+  IntArrayRef t2_sizes = graph.get_val_sizes(t2);
+  c10::ScalarType t2_dtype = graph.get_val_dtype(t2);
+
+  ValueRef out = graph.add_tensor(t1_sizes, t1_dtype);
+  add_arithmetic_node(graph, t1, t2, out, alpha, optype);
+  return out;
+}
+
+ArithmeticPrepack::ArithmeticPrepack(
+    const ValueRef tref,
+    const ValueRef packed) {
+  inputs_.emplace_back(tref);
+  outputs_.emplace_back(packed);
+}
+
+void ArithmeticPrepack::encode_prepack(ComputeGraph* graph) const {
+  TensorRef tref = graph->get_val(inputs_[0]).toTensorRef();
+  vTensor packed = graph->get_val(outputs_[0]).toTensor();
+
+  api::StorageBuffer staging(
+      graph->context(), packed.dtype(), packed.gpu_nbytes());
+
+  size_t numel = c10::multiply_integers(tref.sizes);
+  size_t nbytes = numel * c10::elementSize(tref.dtype);
+  copy_ptr_to_staging(tref.data, staging, nbytes);
+
+  encode_copy_to_vtensor(graph->context(), staging, packed);
+}
+
+ArithmeticNode::ArithmeticNode(
+    const ValueRef t1,
+    const ValueRef t2,
+    const ValueRef out,
+    const float alpha,
+    const arithmetic::OpType optype)
+    : alpha_(alpha), optype_(optype) {
+  inputs_.emplace_back(t1);
+  inputs_.emplace_back(t2);
+  outputs_.emplace_back(out);
+}
+
+void ArithmeticNode::encode_execute(ComputeGraph* graph) const {
+  vTensor& in1 = graph->get_val(inputs_[0]).toTensor();
+  vTensor& in2 = graph->get_val(inputs_[1]).toTensor();
+  vTensor& out = graph->get_val(outputs_[0]).toTensor();
+
+  api::ShaderInfo kernel = arithmetic::get_shader(optype_);
+  arithmetic::record_op(graph->context(), kernel, in1, in2, out, alpha_);
+}
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/vulkan/graph/Arithmetic.h b/aten/src/ATen/native/vulkan/graph/Arithmetic.h
new file mode 100644
index 000000000000..1b8d621ab2e2
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Arithmetic.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#ifdef USE_VULKAN_API
+
+#include <ATen/native/vulkan/impl/Arithmetic.h>
+
+#include <ATen/native/vulkan/graph/Graph.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+void add_arithmetic_node(
+    ComputeGraph& graph,
+    const ValueRef t1,
+    const ValueRef t2,
+    const ValueRef out,
+    const float alpha,
+    const arithmetic::OpType optype);
+
+ValueRef add_arithmetic_node(
+    ComputeGraph& graph,
+    const ValueRef t1,
+    const ValueRef t2,
+    const float alpha,
+    const arithmetic::OpType optype);
+
+class ArithmeticPrepack : public virtual OpNode {
+ public:
+  explicit ArithmeticPrepack(const ValueRef tref, const ValueRef packed);
+
+  void encode_prepack(ComputeGraph* graph) const override;
+};
+
+class ArithmeticNode : public virtual OpNode {
+ public:
+  explicit ArithmeticNode(
+      const ValueRef t1,
+      const ValueRef t2,
+      const ValueRef out,
+      const float alpha,
+      const arithmetic::OpType optype);
+
+  void encode_execute(ComputeGraph* graph) const override;
+
+ private:
+  float alpha_;
+  arithmetic::OpType optype_;
+};
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
+
+#endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/native/vulkan/graph/Config.h b/aten/src/ATen/native/vulkan/graph/Config.h
new file mode 100644
index 000000000000..e42df98fec5e
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Config.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#ifdef USE_VULKAN_API
+
+#include <ATen/native/vulkan/api/Context.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+struct GraphConfig final {
+  api::ContextConfig context_config;
+};
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
+
+#endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/native/vulkan/graph/Constant.cpp b/aten/src/ATen/native/vulkan/graph/Constant.cpp
new file mode 100644
index 000000000000..f9f6d871ffc0
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Constant.cpp
@@ -0,0 +1,21 @@
+#include <ATen/native/vulkan/graph/Constant.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+TensorRef::TensorRef(
+    const IntArrayRef t_sizes,
+    c10::ScalarType t_dtype,
+    const void* const t_data)
+    : sizes{}, dtype{t_dtype}, data{t_data} {
+  size_t ndim = t_sizes.size();
+  sizes.resize(ndim);
+  for (int i = 0; i < ndim; ++i) {
+    sizes[i] = t_sizes[i];
+  }
+}
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/vulkan/graph/Constant.h b/aten/src/ATen/native/vulkan/graph/Constant.h
new file mode 100644
index 000000000000..11e54aa0cd45
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Constant.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#ifdef USE_VULKAN_API
+
+#include <ATen/native/vulkan/api/Context.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+/*
+ * Represents a reference to a tensor that has been serialized with the model,
+ * such as a serialized weight tensor. It contains some metadata as well as a
+ * raw pointer to the data of the tensor, which is assumed to be contiguous.
+ */
+struct TensorRef final {
+  std::vector<int64_t> sizes;
+  c10::ScalarType dtype;
+  const void* data;
+
+  explicit TensorRef(
+      const IntArrayRef t_sizes,
+      c10::ScalarType t_dtype,
+      const void* const t_data);
+
+  TensorRef(const TensorRef&) = default;
+  TensorRef& operator=(const TensorRef&) = default;
+
+  TensorRef(TensorRef&&) = default;
+  TensorRef& operator=(TensorRef&&) = default;
+};
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
+
+#endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/native/vulkan/graph/Copy.cpp b/aten/src/ATen/native/vulkan/graph/Copy.cpp
new file mode 100644
index 000000000000..d123665cddb5
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Copy.cpp
@@ -0,0 +1,55 @@
+#include <ATen/native/vulkan/graph/Copy.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+void add_copy_node(
+    ComputeGraph& graph,
+    const ValueRef from,
+    const ValueRef to) {
+  graph.execute_nodes().emplace_back(new CopyNode(from, to));
+}
+
+ValueRef add_copy_node(ComputeGraph& graph, const ValueRef from) {
+  IntArrayRef out_sizes = graph.get_val_sizes(from);
+  c10::ScalarType out_dtype = graph.get_val_dtype(from);
+  ValueRef to = graph.add_tensor(out_sizes, out_dtype);
+  add_copy_node(graph, from, to);
+  return to;
+}
+
+CopyNode::CopyNode(const ValueRef from, const ValueRef to) {
+  inputs_.emplace_back(from);
+  outputs_.emplace_back(to);
+}
+
+void CopyNode::encode_execute(ComputeGraph* graph) const {
+  api::PipelineBarrier pipeline_barrier{};
+
+  vTensor& from_tensor = graph->get_val(inputs_[0]).toTensor();
+  vTensor& to_tensor = graph->get_val(outputs_[0]).toTensor();
+
+  graph->context()->submit_copy<api::VulkanImage, api::VulkanImage>(
+      // pipeline barrier
+      pipeline_barrier,
+      // resources
+      from_tensor.image(
+          pipeline_barrier,
+          api::PipelineStage::TRANSFER,
+          api::MemoryAccessType::READ),
+      to_tensor.image(
+          pipeline_barrier,
+          api::PipelineStage::TRANSFER,
+          api::MemoryAccessType::WRITE),
+      // copy details
+      from_tensor.extents(),
+      {0u, 0u, 0u},
+      {0u, 0u, 0u},
+      // fence handle
+      VK_NULL_HANDLE);
+}
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/vulkan/graph/Copy.h b/aten/src/ATen/native/vulkan/graph/Copy.h
new file mode 100644
index 000000000000..af9893d69347
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Copy.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#ifdef USE_VULKAN_API
+
+#include <ATen/native/vulkan/graph/Graph.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+void add_copy_node(ComputeGraph& graph, const ValueRef from, const ValueRef to);
+ValueRef add_copy_node(ComputeGraph& graph, const ValueRef from);
+
+class CopyNode : public virtual OpNode {
+ public:
+  explicit CopyNode(const ValueRef from, const ValueRef to);
+
+  void encode_execute(ComputeGraph* graph) const override;
+};
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
+
+#endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/native/vulkan/graph/Exception.cpp b/aten/src/ATen/native/vulkan/graph/Exception.cpp
new file mode 100644
index 000000000000..ec155b0c8985
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Exception.cpp
@@ -0,0 +1,37 @@
+#include <ATen/native/vulkan/graph/Exception.h>
+
+#include <sstream>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) {
+  out << loc.func << " at " << loc.file << ": " << loc.line;
+  return out;
+}
+
+Error::Error(SourceLocation location, std::string msg)
+    : location_{location}, msg_(std::move(msg)) {
+  refresh_what();
+}
+
+void Error::refresh_what() {
+  what_ = compute_what(/*include_backtrace =*/true);
+}
+
+std::string Error::compute_what(bool include_source) const {
+  std::ostringstream oss;
+  oss << msg_;
+
+  if (include_source) {
+    oss << "\n"
+        << "Raised from: " << location_;
+  }
+
+  return oss.str();
+}
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/vulkan/graph/Exception.h b/aten/src/ATen/native/vulkan/graph/Exception.h
new file mode 100644
index 000000000000..a317d8de498f
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Exception.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#ifdef USE_VULKAN_API
+
+#include <exception>
+#include <ostream>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+/*
+ * Same as c10::SourceLocation, represents a location in source code
+ */
+struct SourceLocation {
+  const char* func;
+  const char* file;
+  uint32_t line;
+};
+
+std::ostream& operator<<(std::ostream& out, const SourceLocation& loc);
+
+/*
+ * Simple error class modeled after c10::Error
+ */
+class Error : public std::exception {
+ public:
+  // Constructors
+  Error(SourceLocation location, std::string msg);
+
+ private:
+  // The source location of the exception
+  SourceLocation location_;
+  // The actual error message
+  std::string msg_;
+
+  std::string what_;
+
+ public:
+  const char* what() const noexcept override {
+    return what_.c_str();
+  }
+
+  const std::string& msg() const {
+    return msg_;
+  }
+
+ private:
+  void refresh_what();
+  std::string compute_what(bool include_source) const;
+};
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
+
+#define VKGRAPH_THROW(...)                                   \
+  throw ::at::native::vulkan::Error(                         \
+      {__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, \
+      c10::str(__VA_ARGS__));
+
+#define VKGRAPH_CHECK(cond, ...)                               \
+  if (C10_UNLIKELY_OR_CONST(!(cond))) {                        \
+    throw ::at::native::vulkan::Error(                         \
+        {__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, \
+        c10::str(__VA_ARGS__));                                \
+  }
+
+#endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/native/vulkan/graph/Graph.cpp b/aten/src/ATen/native/vulkan/graph/Graph.cpp
new file mode 100644
index 000000000000..e6016db80bea
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Graph.cpp
@@ -0,0 +1,134 @@
+#include <ATen/native/vulkan/graph/Graph.h>
+#include <ATen/native/vulkan/graph/Staging.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+ComputeGraph::ComputeGraph(GraphConfig config)
+    : config_{config},
+      context_{new api::Context(
+          api::runtime()->default_adapter_i(),
+          config_.context_config)},
+      values_{},
+      prepack_nodes_{},
+      execute_nodes_{},
+      inputs_{},
+      outputs_{} {
+  context_->set_cmd(/*reusable = */ true);
+}
+
+ComputeGraph::~ComputeGraph() {
+  values_.clear();
+
+  prepack_nodes_.clear();
+  execute_nodes_.clear();
+
+  context_->flush();
+}
+
+ValueRef ComputeGraph::add_tensor(
+    const IntArrayRef sizes,
+    const c10::ScalarType dtype) {
+  ValueRef idx(values_.size());
+  values_.emplace_back(vTensor(context(), sizes, dtype));
+  return idx;
+}
+
+ValueRef ComputeGraph::add_tensorref(
+    const IntArrayRef sizes,
+    const c10::ScalarType dtype,
+    const void* const data) {
+  ValueRef idx(values_.size());
+  values_.emplace_back(TensorRef(sizes, dtype, data));
+  return idx;
+}
+
+ValueRef ComputeGraph::add_staging(
+    const c10::ScalarType dtype,
+    const size_t numel) {
+  ValueRef idx(values_.size());
+  values_.emplace_back(api::StorageBuffer(context(), dtype, numel));
+  return idx;
+}
+
+ValueRef ComputeGraph::set_input_tensor(
+    const ValueRef idx,
+    const bool use_staging) {
+  if (use_staging) {
+    vTensor& tensor = get_val(idx).toTensor();
+    ValueRef staging_idx = add_staging(tensor.dtype(), tensor.gpu_numel());
+    execute_nodes_.emplace_back(new StagingNode(staging_idx, idx));
+    inputs_.push_back(staging_idx);
+    return staging_idx;
+  }
+  inputs_.push_back(idx);
+  return idx;
+}
+
+ValueRef ComputeGraph::set_output_tensor(
+    const ValueRef idx,
+    const bool use_staging) {
+  if (use_staging) {
+    vTensor& tensor = get_val(idx).toTensor();
+    ValueRef staging_idx = add_staging(tensor.dtype(), tensor.gpu_numel());
+    execute_nodes_.emplace_back(new StagingNode(idx, staging_idx));
+    outputs_.push_back(staging_idx);
+    return staging_idx;
+  }
+  outputs_.push_back(idx);
+  return idx;
+}
+
+void ComputeGraph::copy_into_staging(
+    const ValueRef idx,
+    const void* data,
+    const size_t numel) {
+  Value& in_val = get_val(idx);
+  api::StorageBuffer& staging = in_val.toStaging();
+  size_t nbytes = numel * c10::elementSize(staging.dtype());
+  copy_ptr_to_staging(data, staging, nbytes);
+}
+
+void ComputeGraph::copy_from_staging(
+    const ValueRef idx,
+    void* data,
+    const size_t numel) {
+  Value& out_val = get_val(idx);
+  api::StorageBuffer& staging = out_val.toStaging();
+  size_t nbytes = numel * c10::elementSize(staging.dtype());
+  copy_staging_to_ptr(staging, data, nbytes);
+}
+
+void ComputeGraph::encode_prepack() {
+  for (std::unique_ptr<OpNode>& node : prepack_nodes_) {
+    node->encode_prepack(this);
+  }
+}
+
+void ComputeGraph::prepack() const {
+  // Submit and execute the command buffer
+  api::VulkanFence fence = context_->fences().get_fence();
+  context_->submit_cmd_to_gpu(fence.get_submit_handle(), /*final_use = */ true);
+  fence.wait();
+
+  // Flush the context and obtain a new command buffer
+  context_->flush();
+  context_->set_cmd(/*reusable = */ true);
+}
+
+void ComputeGraph::encode_execute() {
+  for (std::unique_ptr<OpNode>& node : execute_nodes_) {
+    node->encode_execute(this);
+  }
+}
+
+void ComputeGraph::execute() const {
+  api::VulkanFence fence = context_->fences().get_fence();
+  context_->submit_cmd_to_gpu(fence.get_submit_handle());
+  fence.wait();
+}
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/vulkan/graph/Graph.h b/aten/src/ATen/native/vulkan/graph/Graph.h
new file mode 100644
index 000000000000..ed9372767eba
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Graph.h
@@ -0,0 +1,165 @@
+#pragma once
+
+#ifdef USE_VULKAN_API
+
+#include <ATen/native/vulkan/api/Context.h>
+#include <ATen/native/vulkan/api/Tensor.h>
+
+#include <ATen/native/vulkan/graph/Config.h>
+#include <ATen/native/vulkan/graph/Exception.h>
+#include <ATen/native/vulkan/graph/Value.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+typedef int32_t ValueRef;
+class ComputeGraph;
+
+/*
+ * Represents a single op in a ML model. In graph mode, ops will be implemented
+ * introducing a derived class that implements encode_execute, which will
+ * implement encoding of the shader corresponding to the op into the command
+ * buffer of a ComputeGraph, as well as encode_prepack, which will implement
+ * encoding of shaders transferring necessary data (such as weights and biases)
+ * to the GPU, wherever prepacking is necessary.
+ */
+class OpNode {
+  friend class ComputeGraph;
+
+ public:
+  virtual ~OpNode() {}
+
+ protected:
+  std::vector<ValueRef> inputs_;
+  std::vector<ValueRef> outputs_;
+
+ public:
+  virtual void encode_prepack(ComputeGraph* graph) const {}
+  virtual void encode_execute(ComputeGraph* graph) const {}
+};
+
+/*
+ * This is the core data structure used to execute Vulkan models in graph mode.
+ * As opposed to ATen/eager mode where a command buffer is encoded every
+ * inference (since ops are executed with the model), in graph mode the ops that
+ * compose the model are intended to be parsed only once, upon which a command
+ * buffer will be encoded. Model inference will then execute the cached command
+ * buffer without needing to encode a new one.
+ */
+class ComputeGraph final {
+ public:
+  explicit ComputeGraph(GraphConfig config);
+
+  ComputeGraph(ComputeGraph&&) = default;
+  ComputeGraph& operator=(ComputeGraph&&) = default;
+
+  ~ComputeGraph();
+
+ private:
+  GraphConfig config_;
+  std::unique_ptr<api::Context> context_;
+  std::vector<Value> values_;
+
+  std::vector<std::unique_ptr<OpNode>> prepack_nodes_;
+  std::vector<std::unique_ptr<OpNode>> execute_nodes_;
+
+  std::vector<ValueRef> inputs_;
+  std::vector<ValueRef> outputs_;
+
+ public:
+  //
+  // Accessors
+  //
+
+  inline api::Context* context() {
+    return context_.get();
+  }
+
+  inline std::vector<ValueRef>& inputs() {
+    return inputs_;
+  }
+
+  inline std::vector<ValueRef>& outputs() {
+    return outputs_;
+  }
+
+  /*
+   * Returns the value at a particular reference
+   */
+  inline Value& get_val(ValueRef idx) {
+    return values_[idx];
+  }
+
+  inline IntArrayRef get_val_sizes(ValueRef idx) {
+    Value& val = get_val(idx);
+    if (val.isTensor()) {
+      return val.toTensor().sizes();
+    } else if (val.isTensorRef()) {
+      return val.toTensorRef().sizes;
+    }
+    VKGRAPH_THROW("Could not get sizes of value with type ", val.type());
+  }
+
+  inline c10::ScalarType get_val_dtype(ValueRef idx) {
+    Value& val = get_val(idx);
+    if (val.isTensor()) {
+      return val.toTensor().dtype();
+    } else if (val.isTensorRef()) {
+      return val.toTensorRef().dtype;
+    }
+    VKGRAPH_THROW("Could not get dtype of value with type ", val.type());
+  }
+
+  inline std::vector<std::unique_ptr<OpNode>>& prepack_nodes() {
+    return prepack_nodes_;
+  }
+
+  inline std::vector<std::unique_ptr<OpNode>>& execute_nodes() {
+    return execute_nodes_;
+  }
+
+  //
+  // Graph Building
+  //
+
+  ValueRef add_tensor(const IntArrayRef sizes, const c10::ScalarType dtype);
+  ValueRef add_tensorref(
+      const IntArrayRef sizes,
+      const c10::ScalarType dtype,
+      const void* const data);
+  ValueRef add_staging(const c10::ScalarType dtype, const size_t numel);
+
+  ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true);
+  ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true);
+
+  //
+  // Input/Output
+  //
+
+  void copy_into_staging(
+      const ValueRef idx,
+      const void* data,
+      const size_t numel);
+  void copy_from_staging(const ValueRef idx, void* data, const size_t numel);
+
+  //
+  // Graph Prepacking
+  //
+
+  void encode_prepack();
+  void prepack() const;
+
+  //
+  // Graph Execution
+  //
+
+  void encode_execute();
+  void execute() const;
+};
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
+
+#endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/native/vulkan/graph/Staging.cpp b/aten/src/ATen/native/vulkan/graph/Staging.cpp
new file mode 100644
index 000000000000..2d46071af55c
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Staging.cpp
@@ -0,0 +1,126 @@
+#include <ATen/native/vulkan/impl/Packing.h>
+
+#include <ATen/native/vulkan/graph/Exception.h>
+#include <ATen/native/vulkan/graph/Staging.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+void memcpy_to_mapping(
+    const void* src,
+    api::MemoryMap& dst_mapping,
+    const size_t nbytes,
+    const c10::ScalarType dtype) {
+  if (dtype == at::kFloat) {
+    memcpy_to_mapping_impl<float>(src, dst_mapping, nbytes);
+  } else if (dtype == at::kHalf) {
+    memcpy_to_mapping_impl<c10::Half>(src, dst_mapping, nbytes);
+  } else if (dtype == c10::kQUInt8) {
+    memcpy_to_mapping_impl<c10::quint8>(src, dst_mapping, nbytes);
+  } else if (dtype == c10::kQInt8) {
+    memcpy_to_mapping_impl<c10::qint8>(src, dst_mapping, nbytes);
+  } else if (dtype == c10::kQInt32) {
+    memcpy_to_mapping_impl<c10::qint32>(src, dst_mapping, nbytes);
+  } else {
+    VKGRAPH_THROW("Unrecognized dtype!");
+  }
+}
+
+void memcpy_from_mapping(
+    api::MemoryMap& src_mapping,
+    void* dst,
+    const size_t nbytes,
+    const c10::ScalarType dtype) {
+  if (dtype == at::kFloat) {
+    memcpy_from_mapping_impl<float>(src_mapping, dst, nbytes);
+  } else if (dtype == at::kHalf) {
+    memcpy_from_mapping_impl<c10::Half>(src_mapping, dst, nbytes);
+  } else if (dtype == c10::kQUInt8) {
+    memcpy_from_mapping_impl<c10::quint8>(src_mapping, dst, nbytes);
+  } else if (dtype == c10::kQInt8) {
+    memcpy_from_mapping_impl<c10::qint8>(src_mapping, dst, nbytes);
+  } else if (dtype == c10::kQInt32) {
+    memcpy_from_mapping_impl<c10::qint32>(src_mapping, dst, nbytes);
+  } else {
+    VKGRAPH_THROW("Unrecognized dtype!");
+  }
+}
+
+void copy_ptr_to_staging(
+    const void* src,
+    api::StorageBuffer& staging,
+    const size_t nbytes) {
+  api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::WRITE);
+  mapping.invalidate();
+  memcpy_to_mapping(src, mapping, nbytes, staging.dtype());
+}
+
+void copy_staging_to_ptr(
+    api::StorageBuffer& staging,
+    void* dst,
+    const size_t nbytes) {
+  api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::READ);
+  mapping.invalidate();
+  memcpy_from_mapping(mapping, dst, nbytes, staging.dtype());
+}
+
+void encode_copy_to_vtensor(
+    api::Context* context,
+    api::StorageBuffer& staging,
+    vTensor& tensor) {
+  api::ShaderInfo shader = packing::get_nchw_to_image_shader(tensor);
+  api::PipelineBarrier pipeline_barrier{};
+  packing::record_nchw_to_image_op(
+      context,
+      shader,
+      staging.buffer(),
+      tensor,
+      pipeline_barrier,
+      VK_NULL_HANDLE);
+}
+
+void encode_copy_from_vtensor(
+    api::Context* context,
+    vTensor& tensor,
+    api::StorageBuffer& staging) {
+  api::ShaderInfo shader = packing::get_image_to_nchw_shader(tensor);
+  api::PipelineBarrier pipeline_barrier{};
+  packing::record_image_to_nchw_op(
+      context,
+      shader,
+      tensor,
+      staging.buffer(),
+      pipeline_barrier,
+      VK_NULL_HANDLE);
+}
+
+StagingNode::StagingNode(ValueRef from, ValueRef to) {
+  inputs_.emplace_back(from);
+  outputs_.emplace_back(to);
+}
+
+void StagingNode::encode_execute(ComputeGraph* graph) const {
+  Value& in_val = graph->get_val(inputs_[0]);
+  Value& out_val = graph->get_val(outputs_[0]);
+
+  if (in_val.isStaging() && out_val.isTensor()) {
+    api::StorageBuffer& from_staging = graph->get_val(inputs_[0]).toStaging();
+    vTensor& to_tensor = graph->get_val(outputs_[0]).toTensor();
+    encode_copy_to_vtensor(graph->context(), from_staging, to_tensor);
+  } else if (in_val.isTensor() && out_val.isStaging()) {
+    vTensor& from_tensor = graph->get_val(inputs_[0]).toTensor();
+    api::StorageBuffer& to_staging = graph->get_val(outputs_[0]).toStaging();
+    encode_copy_from_vtensor(graph->context(), from_tensor, to_staging);
+  } else {
+    VKGRAPH_THROW(
+        "Unexpected input value type ",
+        in_val.type(),
+        " and output value type ",
+        out_val.type());
+  }
+}
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/vulkan/graph/Staging.h b/aten/src/ATen/native/vulkan/graph/Staging.h
new file mode 100644
index 000000000000..96c287f01512
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Staging.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#ifdef USE_VULKAN_API
+
+#include <ATen/native/vulkan/graph/Graph.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+//
+// Functions to memcpy data into staging buffer
+//
+
+void memcpy_to_mapping(
+    const void* src,
+    api::MemoryMap& dst_mapping,
+    const size_t nbytes,
+    const c10::ScalarType dtype);
+void memcpy_from_mapping(
+    const api::MemoryMap& src_mapping,
+    void* dst,
+    const size_t nbytes,
+    const c10::ScalarType dtype);
+
+//
+// Utility functions for memcpy
+//
+
+template <typename T>
+void memcpy_to_mapping_impl(
+    const void* src,
+    api::MemoryMap& dst_mapping,
+    const size_t nbytes) {
+  T* data_ptr = dst_mapping.template data<T>();
+  memcpy(data_ptr, reinterpret_cast<const T*>(src), nbytes);
+}
+
+template <typename T>
+void memcpy_from_mapping_impl(
+    api::MemoryMap& src_mapping,
+    void* dst,
+    const size_t nbytes) {
+  T* data_ptr = src_mapping.template data<T>();
+  memcpy(reinterpret_cast<T*>(dst), data_ptr, nbytes);
+}
+
+//
+// Functions to copy data into and out of a staging buffer
+//
+
+void copy_ptr_to_staging(
+    const void* src,
+    api::StorageBuffer& staging,
+    const size_t nbytes);
+void copy_staging_to_ptr(
+    api::StorageBuffer& staging,
+    void* dst,
+    const size_t nbytes);
+
+//
+// Functions to record copying data between a staging buffer and a vTensor
+//
+
+void encode_copy_to_vtensor(
+    api::Context* context,
+    api::StorageBuffer& staging,
+    vTensor& tensor);
+void encode_copy_from_vtensor(
+    api::Context* context,
+    vTensor& tensor,
+    api::StorageBuffer& staging);
+
+/*
+ * OpNode that allows copying data into and out of a staging buffer.
+ */
+class StagingNode : public virtual OpNode {
+ public:
+  explicit StagingNode(ValueRef from, ValueRef to);
+
+  void encode_execute(ComputeGraph* graph) const override;
+};
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
+
+#endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/native/vulkan/graph/Types.cpp b/aten/src/ATen/native/vulkan/graph/Types.cpp
new file mode 100644
index 000000000000..b8ba6df7da0d
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Types.cpp
@@ -0,0 +1,27 @@
+#include <ATen/native/vulkan/graph/Types.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+std::ostream& operator<<(std::ostream& out, const TypeTag& tag) {
+  switch (tag) {
+    case TypeTag::NONE:
+      out << "NONE";
+      break;
+    case TypeTag::TENSOR:
+      out << "TENSOR";
+      break;
+    case TypeTag::STAGING:
+      out << "STAGING";
+      break;
+    default:
+      out << "UNKNOWN";
+      break;
+  }
+  return out;
+}
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/vulkan/graph/Types.h b/aten/src/ATen/native/vulkan/graph/Types.h
new file mode 100644
index 000000000000..6736f6e50385
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Types.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#ifdef USE_VULKAN_API
+
+#include <ostream>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+/*
+ * This class is modelled after c10::IValue; however, it is simplified and does
+ * not support as many types. However, the core design is the same; it is a
+ * tagged union over the types supported by the Vulkan Graph type.
+ */
+enum class TypeTag : uint32_t {
+  NONE,
+  TENSOR,
+  STAGING,
+  TENSORREF,
+  INT,
+  DOUBLE,
+  BOOL,
+};
+
+std::ostream& operator<<(std::ostream& out, const TypeTag& tag);
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
+
+#endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/native/vulkan/graph/Value.h b/aten/src/ATen/native/vulkan/graph/Value.h
new file mode 100644
index 000000000000..33a37f45a48e
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/graph/Value.h
@@ -0,0 +1,178 @@
+#pragma once
+
+#ifdef USE_VULKAN_API
+
+#include <ATen/native/vulkan/api/Context.h>
+#include <ATen/native/vulkan/api/Tensor.h>
+
+#include <ATen/native/vulkan/graph/Constant.h>
+#include <ATen/native/vulkan/graph/Types.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+/*
+ * This class is modelled after c10::IValue; however, it is simplified and does
+ * not support as many types. However, the core design is the same; it is a
+ * tagged union over the types supported by the Vulkan Graph type.
+ */
+struct Value final {
+ private:
+  /*
+   * The union type which is used to store the value of the Value.
+   */
+  union Payload {
+    /*
+     * Similar to IValue::Payload, trivially copyable types are nested in their
+     * own union.
+     */
+    union TriviallyCopyablePayload {
+      TriviallyCopyablePayload() : as_int(0) {}
+      int64_t as_int;
+      double as_double;
+      bool as_bool;
+    } u;
+
+    vTensor as_tensor;
+    api::StorageBuffer as_staging;
+    TensorRef as_tensorref;
+
+    Payload() : u() {}
+    ~Payload() {}
+  };
+
+ public:
+  //
+  // Copy constructor and assignment (disabled)
+  //
+
+  Value(const Value& rhs) = delete;
+  Value& operator=(const Value&) = delete;
+
+  //
+  // Move constructor and assignment; Move assignment is disabled but
+  // construction is implemented to allow for use in container types.
+  //
+
+  Value& operator=(Value&&) = delete;
+
+  Value(Value&& rhs) noexcept : tag(rhs.tag) {
+    if (rhs.isTensor()) {
+      new (&payload.as_tensor) vTensor(std::move(rhs.payload.as_tensor));
+    } else if (rhs.isStaging()) {
+      new (&payload.as_staging)
+          api::StorageBuffer(std::move(rhs.payload.as_staging));
+    } else if (rhs.isTensorRef()) {
+      payload.as_tensorref = std::move(rhs.payload.as_tensorref);
+    } else {
+      payload.u = rhs.payload.u;
+    }
+    tag = rhs.tag;
+    rhs.clearToNone();
+  }
+
+  //
+  // Accessors
+  //
+
+  inline TypeTag type() const {
+    return tag;
+  }
+
+  //
+  // Destructor
+  //
+
+  ~Value() {
+    if (this->isTensor()) {
+      payload.as_tensor.~vTensor();
+    } else if (this->isStaging()) {
+      payload.as_staging.~StorageBuffer();
+    } else if (this->isTensorRef()) {
+      payload.as_tensorref.~TensorRef();
+    }
+  }
+
+  //
+  // Tensor
+  //
+
+  Value(vTensor&& t) : tag(TypeTag::TENSOR) {
+    new (&payload.as_tensor) vTensor(std::move(t));
+  }
+
+  inline bool isTensor() const {
+    return TypeTag::TENSOR == tag;
+  }
+
+  inline vTensor& toTensor() {
+    VKGRAPH_CHECK(
+        isTensor(),
+        "Expected value to have type TENSOR, got ",
+        tag,
+        " instead.");
+    return payload.as_tensor;
+  }
+
+  //
+  // Staging
+  //
+
+  Value(api::StorageBuffer&& t) : tag(TypeTag::STAGING) {
+    new (&payload.as_staging) api::StorageBuffer(std::move(t));
+  }
+
+  inline bool isStaging() const {
+    return TypeTag::STAGING == tag;
+  }
+
+  inline api::StorageBuffer& toStaging() {
+    VKGRAPH_CHECK(
+        isStaging(),
+        "Expected value to have type STAGING, got ",
+        tag,
+        " instead.");
+    return payload.as_staging;
+  }
+
+  //
+  // TensorRef
+  //
+
+  Value(TensorRef&& t) : tag(TypeTag::TENSORREF) {
+    payload.as_tensorref = std::move(t);
+  }
+
+  inline bool isTensorRef() const {
+    return TypeTag::TENSORREF == tag;
+  }
+
+  inline TensorRef& toTensorRef() {
+    VKGRAPH_CHECK(
+        isTensorRef(),
+        "Expected value to have type TENSORREF, got ",
+        tag,
+        " instead.");
+    return payload.as_tensorref;
+  }
+
+ private:
+  Payload payload;
+  TypeTag tag;
+
+  //
+  // Utility Functions
+  //
+
+  inline void clearToNone() noexcept {
+    payload.u.as_int = 0;
+    tag = TypeTag::NONE;
+  }
+};
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
+
+#endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/native/vulkan/impl/Arithmetic.cpp b/aten/src/ATen/native/vulkan/impl/Arithmetic.cpp
new file mode 100644
index 000000000000..ddbb12ca588c
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/impl/Arithmetic.cpp
@@ -0,0 +1,83 @@
+#include <ATen/native/vulkan/impl/Arithmetic.h>
+#include <ATen/native/vulkan/impl/Common.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+namespace arithmetic {
+
+api::ShaderInfo get_shader(const OpType type) {
+  switch (type) {
+    case OpType::ADD:
+      return VK_KERNEL(add);
+    case OpType::SUB:
+      return VK_KERNEL(sub);
+    case OpType::MUL:
+      return VK_KERNEL(mul);
+    case OpType::DIV:
+      return VK_KERNEL(div);
+  }
+}
+
+struct Params final {
+  api::utils::ivec3 out_extents;
+  int32_t fill_0;
+  api::utils::ivec3 input1_extents;
+  int32_t nc_size_1;
+  api::utils::ivec3 input2_extents;
+  int32_t nc_size_2;
+  float alpha;
+};
+
+void record_op(
+    api::Context* const context,
+    const api::ShaderInfo& compute_shader,
+    vTensor& v_in1,
+    vTensor& v_in2,
+    vTensor& v_dst,
+    const float alpha) {
+  api::utils::uvec3 global_size = v_dst.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  uint32_t nc_1 = dim_at<Dim4D::Channel>(v_in1) * dim_at<Dim4D::Batch>(v_in1);
+  uint32_t nc_2 = dim_at<Dim4D::Channel>(v_in2) * dim_at<Dim4D::Batch>(v_in2);
+
+  Params block{
+      api::utils::make_ivec3(v_dst.extents()),
+      0u,
+      api::utils::make_ivec3(v_in1.extents()),
+      api::utils::safe_downcast<int32_t>(nc_1),
+      api::utils::make_ivec3(v_in2.extents()),
+      api::utils::safe_downcast<int32_t>(nc_2),
+      alpha,
+  };
+
+  api::UniformParamsBuffer params(context, block);
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      // shader descriptor
+      compute_shader,
+      // pipeline barrier
+      pipeline_barrier,
+      // global work group size
+      global_size,
+      // local work group size
+      local_size,
+      // fence handle
+      VK_NULL_HANDLE,
+      // shader arguments
+      v_dst.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_in1.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      v_in2.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      // params buffer
+      params.buffer());
+}
+
+} // namespace arithmetic
+} // namespace vulkan
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/vulkan/impl/Arithmetic.h b/aten/src/ATen/native/vulkan/impl/Arithmetic.h
new file mode 100644
index 000000000000..5e01a7cebfca
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/impl/Arithmetic.h
@@ -0,0 +1,28 @@
+#include <ATen/native/vulkan/api/api.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+namespace arithmetic {
+
+enum class OpType : uint32_t {
+  ADD,
+  SUB,
+  MUL,
+  DIV,
+};
+
+api::ShaderInfo get_shader(const OpType type);
+
+void record_op(
+    api::Context* const context,
+    const api::ShaderInfo& compute_shader,
+    vTensor& v_in1,
+    vTensor& v_in2,
+    vTensor& v_dst,
+    const float alpha);
+
+} // namespace arithmetic
+} // namespace vulkan
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/vulkan/ops/Common.cpp b/aten/src/ATen/native/vulkan/impl/Common.cpp
similarity index 88%
rename from aten/src/ATen/native/vulkan/ops/Common.cpp
rename to aten/src/ATen/native/vulkan/impl/Common.cpp
index 5a3daeb07428..47dd62a2286a 100644
--- a/aten/src/ATen/native/vulkan/ops/Common.cpp
+++ b/aten/src/ATen/native/vulkan/impl/Common.cpp
@@ -1,9 +1,8 @@
-#include <ATen/native/vulkan/ops/Common.h>
+#include <ATen/native/vulkan/impl/Common.h>
 
 namespace at {
 namespace native {
 namespace vulkan {
-namespace ops {
 
 api::utils::uvec3 adaptive_work_group_size(
     const api::utils::uvec3& global_work_group) {
@@ -22,7 +21,6 @@ api::utils::uvec3 adaptive_work_group_size(
   return local_group_size;
 }
 
-} // namespace ops
 } // namespace vulkan
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/vulkan/impl/Common.h b/aten/src/ATen/native/vulkan/impl/Common.h
new file mode 100644
index 000000000000..bee8896dad5d
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/impl/Common.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#ifdef USE_VULKAN_API
+
+#include <ATen/native/vulkan/api/api.h>
+#include <ATen/native/vulkan/impl/Registry.h>
+
+#define VK_KERNEL(shader_name) \
+  ::at::native::vulkan::get_shader_info(#shader_name)
+#define VK_LOOKUP_KERNEL(op_name) \
+  ::at::native::vulkan::look_up_shader_info(#op_name)
+
+namespace at {
+namespace native {
+namespace vulkan {
+
+/*
+ * Maps a semantic dimension name to an integer that corresponds to its
+ * innermost ordering in a 4D tensor in NCHW format. Width is the innermost
+ * dimension, so it corresponds to 1, height is the next innermost, so it
+ * corresponds to 2, and so on.
+ */
+struct Dim4D {
+  static constexpr uint32_t Width = 1u;
+  static constexpr uint32_t Height = 2u;
+  static constexpr uint32_t Channel = 3u;
+  static constexpr uint32_t Batch = 4u;
+};
+
+/*
+ * Semantic dimension names for a 1D tensor
+ */
+struct Dim1D {
+  static constexpr uint32_t Length = 1u;
+};
+
+/*
+ * Semantic dimension names for a 2D Convolution kernel.
+ */
+struct DimConv2DKernel {
+  static constexpr uint32_t Width = 1u;
+  static constexpr uint32_t Height = 2u;
+  static constexpr uint32_t InChannels = 3u;
+  static constexpr uint32_t OutChannels = 4u;
+};
+
+/*
+ * The same as the above, except for a 2D Transposed Convolution kernel.
+ */
+struct DimTConv2DKernel {
+  static constexpr uint32_t Width = 1u;
+  static constexpr uint32_t Height = 2u;
+  static constexpr uint32_t OutChannels = 3u;
+  static constexpr uint32_t InChannels = 4u;
+};
+
+/*
+ * The functions below safely return the size of the dimension at the N-th
+ * innermost index. If the dimensionality of the size array is not sufficient
+ * then 1 will be returned. The structs above are intended to be used with
+ * these functions.
+ */
+template <uint32_t N>
+uint32_t dim_at(const IntArrayRef sizes) {
+  const uint32_t dims = sizes.size();
+  return dims < N ? 1 : api::utils::safe_downcast<uint32_t>(sizes[dims - N]);
+}
+
+template <uint32_t N>
+uint32_t dim_at(const vTensor& v_in) {
+  return dim_at<N>(v_in.sizes());
+}
+
+/*
+ * For most global work group sizes, returns {4, 4, 4}, but adjusts the size for
+ * 2D global work group sizes. Always maintains a total of 64 invocations
+ */
+api::utils::uvec3 adaptive_work_group_size(
+    const api::utils::uvec3& global_work_group);
+
+} // namespace vulkan
+} // namespace native
+} // namespace at
+
+#endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/native/vulkan/impl/Packing.cpp b/aten/src/ATen/native/vulkan/impl/Packing.cpp
new file mode 100644
index 000000000000..a3d26df6bb07
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/impl/Packing.cpp
@@ -0,0 +1,272 @@
+#include <ATen/native/vulkan/impl/Common.h>
+#include <ATen/native/vulkan/impl/Packing.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+namespace packing {
+
+api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst) {
+  if (v_dst.is_quantized()) {
+    switch (v_dst.storage_type()) {
+      case api::StorageType::TEXTURE_3D:
+        switch (v_dst.dtype()) {
+          case c10::ScalarType::QUInt8:
+            return VK_KERNEL(nchw_to_image_uint8);
+          case c10::ScalarType::QInt8:
+            return VK_KERNEL(nchw_to_image_int8);
+          case c10::ScalarType::QInt32:
+            return VK_KERNEL(nchw_to_image_int32);
+          default:
+            TORCH_CHECK(
+                false,
+                "Vulkan quantization currently not supported for dtype ",
+                v_dst.dtype());
+        }
+      default:
+        TORCH_CHECK(false, "No kernel available!");
+      case api::StorageType::BUFFER:
+      case api::StorageType::UNKNOWN:
+        TORCH_CHECK(false, "Requested storage type must be a texture type.");
+    }
+  }
+
+  switch (v_dst.storage_type()) {
+    case api::StorageType::TEXTURE_3D:
+      return VK_KERNEL(nchw_to_image);
+    case api::StorageType::TEXTURE_2D:
+      return VK_KERNEL(nchw_to_image2d);
+    default:
+      TORCH_CHECK(false, "No kernel available!");
+  }
+}
+
+api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src) {
+  if (v_src.is_quantized()) {
+    auto plane_size =
+        dim_at<Dim4D::Height>(v_src) * dim_at<Dim4D::Width>(v_src);
+    switch (v_src.storage_type()) {
+      case api::StorageType::TEXTURE_3D:
+        switch (v_src.dtype()) {
+          case c10::ScalarType::QUInt8:
+            return plane_size % 4 == 0 ? VK_KERNEL(image_to_nchw_quantized_mul4)
+                                       : VK_KERNEL(image_to_nchw_quantized);
+          case c10::ScalarType::QInt8:
+            return plane_size % 4 == 0 ? VK_KERNEL(image_to_nchw_quantized_mul4)
+                                       : VK_KERNEL(image_to_nchw_quantized);
+          case c10::ScalarType::QInt32:
+            return VK_KERNEL(image_to_nchw_int32);
+          default:
+            TORCH_CHECK(
+                false,
+                "Vulkan quantization currently not supported for dtype ",
+                v_src.dtype());
+        }
+      default:
+        TORCH_CHECK(false, "No kernel available!");
+      case api::StorageType::BUFFER:
+      case api::StorageType::UNKNOWN:
+        TORCH_CHECK(false, "Requested storage type must be a texture type.");
+    }
+  }
+
+  switch (v_src.storage_type()) {
+    case api::StorageType::TEXTURE_3D:
+      return VK_KERNEL(image_to_nchw);
+    case api::StorageType::TEXTURE_2D:
+      return VK_KERNEL(image2d_to_nchw);
+    default:
+      TORCH_CHECK(false, "No kernel available!");
+  }
+}
+
+struct ToFromTextureParams final {
+  api::utils::ivec3 extents;
+  int32_t plane_size;
+  api::utils::ivec2 c_info;
+};
+
+void record_nchw_to_image_op(
+    api::Context* const context,
+    api::ShaderInfo& compute_shader,
+    api::VulkanBuffer& src_buffer,
+    vTensor& v_dst,
+    api::PipelineBarrier pipeline_barrier,
+    const VkFence fence_handle) {
+  api::utils::uvec3 global_size = v_dst.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  int32_t height =
+      api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Height>(v_dst));
+  int32_t width =
+      api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Width>(v_dst));
+  int32_t channels =
+      api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Channel>(v_dst));
+
+  int32_t plane_size = height * width;
+  int32_t c_depth = api::utils::div_up(channels, 4);
+
+  ToFromTextureParams block{
+      api::utils::make_ivec3(v_dst.extents()),
+      plane_size,
+      {c_depth, channels},
+  };
+
+  api::UniformParamsBuffer params(context, block);
+  context->submit_compute_job(
+      // shader descriptor
+      compute_shader,
+      // pipeline barrier
+      pipeline_barrier,
+      // global work group size
+      global_size,
+      // local work group size
+      local_size,
+      // fence handle
+      fence_handle,
+      // shader arguments
+      v_dst.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      src_buffer,
+      // params buffer
+      params.buffer());
+}
+
+void record_image_to_nchw_op(
+    api::Context* const context,
+    api::ShaderInfo& compute_shader,
+    vTensor& v_src,
+    api::VulkanBuffer& dst_buffer,
+    api::PipelineBarrier pipeline_barrier,
+    const VkFence fence_handle) {
+  api::utils::uvec3 global_size = v_src.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  int32_t height =
+      api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Height>(v_src));
+  int32_t width =
+      api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Width>(v_src));
+  int32_t channels =
+      api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Channel>(v_src));
+
+  int32_t plane_size = height * width;
+  int32_t c_depth = api::utils::div_up(channels, 4);
+
+  ToFromTextureParams block{
+      api::utils::make_ivec3(v_src.extents()),
+      plane_size,
+      {c_depth, channels},
+  };
+
+  if (v_src.dtype() == c10::ScalarType::QUInt8 ||
+      v_src.dtype() == c10::ScalarType::QInt8) {
+    if (plane_size % 4 == 0) {
+      global_size.data[0u] = plane_size / 4;
+      global_size.data[1u] = 1;
+      local_size.data[0u] *= local_size.data[1u];
+      local_size.data[1u] = 1;
+    } else {
+      uint32_t numel = v_src.numel();
+      global_size = {api::utils::div_up(numel, uint32_t(4)), 1u, 1u};
+      local_size = {64u, 1u, 1u};
+    }
+  }
+
+  api::UniformParamsBuffer params(context, block);
+  context->submit_compute_job(
+      // shader descriptor
+      compute_shader,
+      // pipeline barrier
+      pipeline_barrier,
+      // global work group size
+      global_size,
+      // local work group size
+      local_size,
+      // fence handle
+      fence_handle,
+      // shader arguments
+      v_src.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      dst_buffer,
+      // params buffer
+      params.buffer());
+}
+
+void record_nchw_to_buffer_op(
+    api::Context* const context,
+    api::VulkanBuffer& src_buffer,
+    vTensor& v_dst,
+    api::PipelineBarrier pipeline_barrier,
+    const VkFence fence_handle) {
+  uint32_t gpu_buf_len = api::utils::safe_downcast<uint32_t>(v_dst.gpu_numel());
+
+  api::utils::uvec3 global_size = {gpu_buf_len, 1u, 1u};
+  api::utils::uvec3 local_size = {32u, 1u, 1u};
+
+  api::UniformParamsBuffer cpu_buffer_metadata(
+      context, v_dst.get_cpu_buffer_metadata());
+
+  context->submit_compute_job(
+      // shader descriptor
+      VK_KERNEL(buffer_to_buffer),
+      // pipeline barrier
+      pipeline_barrier,
+      // global work group size
+      global_size,
+      // local work group size
+      local_size,
+      // fence handle
+      fence_handle,
+      // shader arguments
+      v_dst.buffer(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_dst.buffer_metadata(),
+      src_buffer,
+      cpu_buffer_metadata.buffer());
+}
+
+void record_buffer_to_nchw_op(
+    api::Context* const context,
+    vTensor& v_src,
+    api::VulkanBuffer& dst_buffer,
+    api::PipelineBarrier pipeline_barrier,
+    const VkFence fence_handle) {
+  uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_src.numel());
+
+  api::utils::uvec3 global_size = {buf_len, 1u, 1u};
+  api::utils::uvec3 local_size = {4u, 1u, 1u};
+
+  api::UniformParamsBuffer cpu_buffer_metadata(
+      context, v_src.get_cpu_buffer_metadata());
+
+  context->submit_compute_job(
+      // shader descriptor
+      VK_KERNEL(buffer_to_buffer),
+      // pipeline barrier
+      pipeline_barrier,
+      // global work group size
+      global_size,
+      // local work group size
+      local_size,
+      // fence handle
+      fence_handle,
+      // shader arguments
+      dst_buffer,
+      cpu_buffer_metadata.buffer(),
+      v_src.buffer(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_src.buffer_metadata());
+}
+
+} // namespace packing
+} // namespace vulkan
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/vulkan/impl/Packing.h b/aten/src/ATen/native/vulkan/impl/Packing.h
new file mode 100644
index 000000000000..480a5e959b01
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/impl/Packing.h
@@ -0,0 +1,44 @@
+#include <ATen/native/vulkan/api/api.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+namespace packing {
+
+api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst);
+api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src);
+
+void record_nchw_to_image_op(
+    api::Context* const context,
+    api::ShaderInfo& compute_shader,
+    api::VulkanBuffer& src_buffer,
+    vTensor& v_dst,
+    api::PipelineBarrier pipeline_barrier,
+    const VkFence fence_handle);
+
+void record_image_to_nchw_op(
+    api::Context* const context,
+    api::ShaderInfo& compute_shader,
+    vTensor& v_src,
+    api::VulkanBuffer& dst_buffer,
+    api::PipelineBarrier pipeline_barrier,
+    const VkFence fence_handle);
+
+void record_nchw_to_buffer_op(
+    api::Context* const context,
+    api::VulkanBuffer& src_buffer,
+    vTensor& v_dst,
+    api::PipelineBarrier pipeline_barrier,
+    const VkFence fence_handle);
+
+void record_buffer_to_nchw_op(
+    api::Context* const context,
+    vTensor& v_src,
+    api::VulkanBuffer& dst_buffer,
+    api::PipelineBarrier pipeline_barrier,
+    const VkFence fence_handle);
+
+} // namespace packing
+} // namespace vulkan
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/vulkan/ops/Registry.cpp b/aten/src/ATen/native/vulkan/impl/Registry.cpp
similarity index 98%
rename from aten/src/ATen/native/vulkan/ops/Registry.cpp
rename to aten/src/ATen/native/vulkan/impl/Registry.cpp
index 43c581b137ff..3cf3148c8749 100644
--- a/aten/src/ATen/native/vulkan/ops/Registry.cpp
+++ b/aten/src/ATen/native/vulkan/impl/Registry.cpp
@@ -1,7 +1,7 @@
 #ifdef USE_VULKAN_API
 
 #include <ATen/native/vulkan/api/Shader.h>
-#include <ATen/native/vulkan/ops/Registry.h>
+#include <ATen/native/vulkan/impl/Registry.h>
 #include <ATen/native/vulkan/spv.h>
 
 namespace at {
diff --git a/aten/src/ATen/native/vulkan/ops/Registry.h b/aten/src/ATen/native/vulkan/impl/Registry.h
similarity index 100%
rename from aten/src/ATen/native/vulkan/ops/Registry.h
rename to aten/src/ATen/native/vulkan/impl/Registry.h
diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp
index 42fbef56dcaf..6ca3bedddf53 100644
--- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp
@@ -398,6 +398,14 @@ Tensor& tanh_(Tensor& self) {
   return ops::activation_(self, VK_KERNEL(tanh_));
 }
 
+Tensor abs(const Tensor& self) {
+  return ops::activation(self, VK_KERNEL(abs));
+}
+
+Tensor& abs_(Tensor& self) {
+  return ops::activation_(self, VK_KERNEL(abs_));
+}
+
 #ifdef USE_VULKAN_API
 
 TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
@@ -417,6 +425,8 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
   m.impl(TORCH_SELECTIVE_NAME("aten::sigmoid_"), sigmoid_);
   m.impl(TORCH_SELECTIVE_NAME("aten::tanh"), tanh);
   m.impl(TORCH_SELECTIVE_NAME("aten::tanh_"), tanh_);
+  m.impl(TORCH_SELECTIVE_NAME("aten::abs"), abs);
+  m.impl(TORCH_SELECTIVE_NAME("aten::abs_"), abs_);
   m.impl(TORCH_SELECTIVE_NAME("aten::relu"), relu);
   m.impl(TORCH_SELECTIVE_NAME("aten::relu_"), relu_);
   m.impl(TORCH_SELECTIVE_NAME("aten::threshold"), threshold);
diff --git a/aten/src/ATen/native/vulkan/ops/Common.h b/aten/src/ATen/native/vulkan/ops/Common.h
index 380bb7ae0e3e..ff87d1d755d9 100644
--- a/aten/src/ATen/native/vulkan/ops/Common.h
+++ b/aten/src/ATen/native/vulkan/ops/Common.h
@@ -5,8 +5,8 @@
 #include <ATen/core/List.h>
 #include <ATen/core/Tensor.h>
 #include <ATen/native/vulkan/api/api.h>
+#include <ATen/native/vulkan/impl/Common.h>
 #include <ATen/native/vulkan/ops/Convert.h>
-#include <ATen/native/vulkan/ops/Registry.h>
 
 #define VK_KERNEL(shader_name) \
   ::at::native::vulkan::get_shader_info(#shader_name)
@@ -50,46 +50,6 @@ struct Layout final {
   };
 };
 
-/*
- * Maps a semantic dimension name to an integer that corresponds to its
- * innermost ordering in a 4D tensor in NCHW format. Width is the innermost
- * dimension, so it corresponds to 1, height is the next innermost, so it
- * corresponds to 2, and so on.
- */
-struct Dim4D {
-  static constexpr uint32_t Width = 1u;
-  static constexpr uint32_t Height = 2u;
-  static constexpr uint32_t Channel = 3u;
-  static constexpr uint32_t Batch = 4u;
-};
-
-/*
- * Semantic dimension names for a 1D tensor
- */
-struct Dim1D {
-  static constexpr uint32_t Length = 1u;
-};
-
-/*
- * Semantic dimension names for a 2D Convolution kernel.
- */
-struct DimConv2DKernel {
-  static constexpr uint32_t Width = 1u;
-  static constexpr uint32_t Height = 2u;
-  static constexpr uint32_t InChannels = 3u;
-  static constexpr uint32_t OutChannels = 4u;
-};
-
-/*
- * The same as the above, except for a 2D Transposed Convolution kernel.
- */
-struct DimTConv2DKernel {
-  static constexpr uint32_t Width = 1u;
-  static constexpr uint32_t Height = 2u;
-  static constexpr uint32_t OutChannels = 3u;
-  static constexpr uint32_t InChannels = 4u;
-};
-
 /*
  * The functions below safely return the size of the dimension at the N-th
  * innermost index. If the dimensionality of the size array is not sufficient
@@ -126,9 +86,6 @@ inline c10::optional<Scalar> get_optional_scalar(
                                       : c10::optional<Scalar>();
 }
 
-api::utils::uvec3 adaptive_work_group_size(
-    const api::utils::uvec3& global_work_group);
-
 } // namespace ops
 } // namespace vulkan
 } // namespace native
diff --git a/aten/src/ATen/native/vulkan/ops/Concat.cpp b/aten/src/ATen/native/vulkan/ops/Concat.cpp
index 827605b794ec..b078f7d3b892 100644
--- a/aten/src/ATen/native/vulkan/ops/Concat.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Concat.cpp
@@ -30,29 +30,31 @@ Tensor cat_feature(
   for (const at::Tensor& tensor : tensors) {
     ch_interval += tensor.sizes()[1];
   }
+  ch_interval = api::utils::align_up(ch_interval, INT64_C(4));
 
   for (const at::Tensor& tensor : tensors) {
     const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan();
     const vTensor& v_self = convert(self);
 
+    uint32_t in_channels = safe_downcast<uint32_t>(v_self.sizes()[1]);
+    uint32_t in_ch_aligned = api::utils::align_up(in_channels, 4u);
+
     const struct Block final {
-      uvec3 size; // output texture size
-      uint32_t fill0; // dummy
-      uvec3 isize; // input texture size
-      uint32_t fill1; // dummy
-      uint32_t batchSize; // input tensor's batch size
-      uint32_t chSize; // input tensor's channel size
-      uint32_t
-          chInterval; // channel interval (total # of channels for all tensors)
-      uint32_t
-          chSizeAllprior; // # of channels for tensor 0 to i-1 at ith tensor
+      ivec3 out_extents;
+      int32_t fill0;
+      ivec3 in_extents;
+      int32_t fill1;
+      uint32_t batchSize;
+      uint32_t chSize;
+      uint32_t chInterval;
+      uint32_t chSizeAllprior;
     } block{
-        v_output.extents(),
-        0u,
-        v_self.extents(),
-        0u,
+        api::utils::make_ivec3(v_output.extents()),
+        0,
+        api::utils::make_ivec3(v_self.extents()),
+        0,
         safe_downcast<uint32_t>(v_self.sizes()[0]),
-        safe_downcast<uint32_t>(v_self.sizes()[1]),
+        in_ch_aligned,
         safe_downcast<uint32_t>(ch_interval),
         safe_downcast<uint32_t>(ch_size_allprior),
     };
diff --git a/aten/src/ATen/native/vulkan/ops/Glu.cpp b/aten/src/ATen/native/vulkan/ops/Glu.cpp
index c9c8520cd4cf..5b103b3b95df 100644
--- a/aten/src/ATen/native/vulkan/ops/Glu.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Glu.cpp
@@ -15,9 +15,10 @@ Tensor glu(const at::Tensor& input_arg, const int64_t dim = -1) {
       dim == 1,
       "Vulkan glu only supports GLU for dim = 1, but got dim = ",
       dim);
+  // For now, only allow if channels dim is a multiple of 4
   TORCH_CHECK(
-      get_dim<Dim4D::Channel>(input_arg) % 2 == 0,
-      "Vulkan glu expects channel dim to be multiple of 2!");
+      get_dim<Dim4D::Channel>(input_arg) % 4 == 0,
+      "Vulkan glu expects channel dim to be multiple of 4!");
 
   const Tensor input = input_arg.is_vulkan() ? input_arg : input_arg.vulkan();
   const vTensor& v_input = convert(input);
@@ -43,8 +44,7 @@ Tensor glu(const at::Tensor& input_arg, const int64_t dim = -1) {
 
   context->submit_compute_job(
       // shader descriptor
-      output_ch_size % 4 == 0 ? VK_KERNEL(glu_channel_mul4)
-                              : VK_KERNEL(glu_channel),
+      VK_KERNEL(glu_channel_mul4),
       // pipeline barrier
       pipeline_barrier,
       // global work group size
diff --git a/aten/src/ATen/native/vulkan/ops/Mean.cpp b/aten/src/ATen/native/vulkan/ops/Mean.cpp
index 0fa533863f3c..04e0e52dfd35 100644
--- a/aten/src/ATen/native/vulkan/ops/Mean.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Mean.cpp
@@ -54,16 +54,22 @@ Tensor mean(
       input_arg.scalar_type(),
   };
 
+  int32_t channels = safe_downcast<int32_t>(get_dim<Dim4D::Channel>(v_input));
+  int32_t ch_aligned = api::utils::align_up(channels, 4);
+
   const struct Block final {
-    uvec3 extents;
-    int32_t range;
-    uvec3 iextents;
+    ivec3 out_extents;
+    int32_t plane_size;
+    ivec3 in_extents;
+    int32_t ch_aligned;
   } block{
-      v_output.extents(),
+      api::utils::make_ivec3(v_output.extents()),
       safe_downcast<int32_t>(
           v_input_sizes[Layout::Activation4D::width] *
           v_input_sizes[Layout::Activation4D::height]),
-      v_input.extents()};
+      api::utils::make_ivec3(v_input.extents()),
+      ch_aligned,
+  };
 
   api::UniformParamsBuffer params(context, block);
   api::PipelineBarrier pipeline_barrier{};
diff --git a/aten/src/ATen/native/vulkan/ops/Permute.cpp b/aten/src/ATen/native/vulkan/ops/Permute.cpp
index 4d03d28d5441..11da8592c536 100644
--- a/aten/src/ATen/native/vulkan/ops/Permute.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Permute.cpp
@@ -20,22 +20,30 @@ Tensor permute_4d(
   const Tensor input = input_arg.is_vulkan() ? input_arg : input_arg.vulkan();
   const vTensor& v_self = convert(input);
 
+  uint32_t out_channels = out_size.data[1u];
+  uint32_t in_channels = in_size.data[1u];
+
+  uint32_t out_c_aligned = api::utils::align_up(out_channels, 4u);
+  uint32_t in_c_aligned = api::utils::align_up(in_channels, 4u);
+
   const struct Block final {
-    uvec3 size; // output texture size
-    uint32_t fill_0; // dummy
-    uvec3 isize; // input texture size
-    uint32_t fill_1; // dummy
-    uvec4 tensor_size; // output tensor size
-    uvec4 itensor_size; // input tensor size
-    uvec4 dims; // output dims
+    ivec3 out_extents;
+    int32_t fill0;
+    ivec3 in_extents;
+    int32_t fill1;
+    uvec4 out_tensor_size;
+    uvec4 in_tensor_size;
+    uvec4 out_ndims;
+    uvec2 ch_info;
   } block{
-      v_output.extents(),
-      0u,
-      v_self.extents(),
-      0u,
+      api::utils::make_ivec3(v_output.extents()),
+      0,
+      api::utils::make_ivec3(v_self.extents()),
+      0,
       out_size,
       in_size,
       out_dims,
+      {out_c_aligned, in_c_aligned},
   };
 
   api::UniformParamsBuffer params(context, block);
diff --git a/aten/src/ATen/native/vulkan/ops/Select.cpp b/aten/src/ATen/native/vulkan/ops/Select.cpp
index 316c6c1215e5..97009cefaac7 100644
--- a/aten/src/ATen/native/vulkan/ops/Select.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Select.cpp
@@ -53,9 +53,125 @@ Tensor select_depth(const Tensor& input_arg, uint32_t index) {
   return convert(v_output);
 }
 
+Tensor select_height(const Tensor& input_arg, uint32_t index) {
+  api::Context* const context = api::context();
+
+  const Tensor input = input_arg.is_vulkan() ? input_arg : input_arg.vulkan();
+  const vTensor& v_input = convert(input);
+  const IntArrayRef v_input_sizes = v_input.sizes();
+
+  vTensor v_output{
+      context,
+      {v_input_sizes[0], v_input_sizes[2]},
+      input_arg.scalar_type(),
+  };
+
+  const struct Block final {
+    uvec3 size; // output texture size
+    uint32_t index;
+  } block{v_output.extents(), index};
+
+  // Input tensor is a (c, h, w)
+  // Output tensor is a (c, w)
+  // In shader, the input texture's coordinate is (w, h, c)
+  // In shader, the output texture's coordinate is (w, c, 1)
+  uint32_t w = v_output.extents().data[0u];
+  uint32_t c = v_output.extents().data[1u];
+
+  // Encoding of c-channel is packed into texel, hence we only call ceil(c/4)
+  // times to minimize invocation and read.
+  // For the last dimension, it is the selected height. Shader will do a direct
+  // lookup based on block.index.
+  uvec3 global_workgroup_size{w, api::utils::div_up(c, 4u), 1};
+
+  api::UniformParamsBuffer params(context, block);
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      // shader descriptor
+      VK_KERNEL(select_height),
+      // pipeline barrier
+      pipeline_barrier,
+      // global work group size
+      global_workgroup_size,
+      // local work group size
+      adaptive_work_group_size(global_workgroup_size),
+      // fence handle
+      VK_NULL_HANDLE,
+      // shader arguments
+      v_output.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_input.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      // params buffer
+      params.buffer());
+
+  return convert(v_output);
+}
+
+Tensor select_width(const Tensor& input_arg, uint32_t index) {
+  api::Context* const context = api::context();
+
+  const Tensor input = input_arg.is_vulkan() ? input_arg : input_arg.vulkan();
+  const vTensor& v_input = convert(input);
+  const IntArrayRef v_input_sizes = v_input.sizes();
+
+  vTensor v_output{
+      context,
+      {v_input_sizes[0], v_input_sizes[1]},
+      input_arg.scalar_type(),
+  };
+
+  const struct Block final {
+    uvec3 size; // output texture size
+    uint32_t index;
+  } block{v_output.extents(), index};
+
+  // Input tensor is a (c, h, w)
+  // Output tensor is a (c, h)
+  // In shader, the input texture's coordinate is (w, h, c)
+  // In shader, the output texture's coordinate is (h, c, 1)
+  uint32_t h = v_output.extents().data[0u];
+  uint32_t c = v_output.extents().data[1u];
+
+  // Encoding of c-channel is packed into texel, hence we only call ceil(c/4)
+  // times to minimize invocation and read.
+  // For the last dimension, it is the selected width. Shader will do a direct
+  // lookup based on block.index.
+  uvec3 global_workgroup_size{h, api::utils::div_up(c, 4u), 1};
+
+  api::UniformParamsBuffer params(context, block);
+  api::PipelineBarrier pipeline_barrier{};
+
+  context->submit_compute_job(
+      // shader descriptor
+      VK_KERNEL(select_width),
+      // pipeline barrier
+      pipeline_barrier,
+      // global work group size
+      global_workgroup_size,
+      // local work group size
+      adaptive_work_group_size(global_workgroup_size),
+      // fence handle
+      VK_NULL_HANDLE,
+      // shader arguments
+      v_output.image(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_input.image(pipeline_barrier, api::PipelineStage::COMPUTE),
+      // params buffer
+      params.buffer());
+
+  return convert(v_output);
+}
+
 Tensor select(const Tensor& self, int64_t dim, int64_t index) {
   TORCH_CHECK(self.dim() == 3, "Vulkan select only supports 3d tensors!");
-  TORCH_CHECK(dim == 0, "Vulkan select only supports dim = 0!");
+  TORCH_CHECK(
+      0 <= dim && dim <= 2,
+      "Vulkan select only supports one of the dim (0, 1, 2)");
 
   const int64_t size = self.size(dim);
 
@@ -73,7 +189,13 @@ Tensor select(const Tensor& self, int64_t dim, int64_t index) {
     index += size;
   }
 
-  return select_depth(self, index);
+  if (dim == 0) {
+    return select_depth(self, index);
+  } else if (dim == 1) {
+    return select_height(self, index);
+  } else {
+    return select_width(self, index);
+  }
 }
 
 #ifdef USE_VULKAN_API
diff --git a/aten/src/ATen/native/vulkan/ops/Slice.cpp b/aten/src/ATen/native/vulkan/ops/Slice.cpp
index fe03d28750b2..400ad47cdba1 100644
--- a/aten/src/ATen/native/vulkan/ops/Slice.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Slice.cpp
@@ -24,25 +24,33 @@ Tensor slice_4d(
   const Tensor input = input_arg.is_vulkan() ? input_arg : input_arg.vulkan();
   const vTensor& v_self = convert(input);
 
+  uint32_t out_channels = out_tsize.data[1u];
+  uint32_t in_channels = in_tsize.data[1u];
+
+  uint32_t out_c_aligned = api::utils::align_up(out_channels, 4u);
+  uint32_t in_c_aligned = api::utils::align_up(in_channels, 4u);
+
   const struct Block final {
-    uvec3 size; // output texture size
-    uint32_t fill_0; // dummy
-    uvec3 isize; // input texture size
-    uint32_t fill_1; // dummy
+    ivec3 size; // output texture size
+    int32_t fill_0; // dummy
+    ivec3 isize; // input texture size
+    int32_t fill_1; // dummy
     uvec4 tensor_size; // output tensor size
     uvec4 itensor_size; // input tensor size
     uvec4 args; // input arguments (dim, start, end, step)
+    uvec2 c_info; // tensor channels aligned to 4
   } block{
-      v_output.extents(),
-      0u,
-      v_self.extents(),
-      0u,
+      api::utils::make_ivec3(v_output.extents()),
+      0,
+      api::utils::make_ivec3(v_self.extents()),
+      0,
       out_tsize,
       in_tsize,
       {safe_downcast<uint32_t>(dim),
        safe_downcast<uint32_t>(start),
        safe_downcast<uint32_t>(end),
        safe_downcast<uint32_t>(step)},
+      {out_c_aligned, in_c_aligned},
   };
 
   api::UniformParamsBuffer params(context, block);
diff --git a/aten/src/ATen/native/vulkan/ops/Utils.cpp b/aten/src/ATen/native/vulkan/ops/Utils.cpp
index 2d391eabc6e1..18a61adadc27 100644
--- a/aten/src/ATen/native/vulkan/ops/Utils.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Utils.cpp
@@ -1,3 +1,4 @@
+#include <ATen/native/vulkan/impl/Packing.h>
 #include <ATen/native/vulkan/ops/Common.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
@@ -14,259 +15,6 @@ namespace native {
 namespace vulkan {
 namespace ops {
 
-namespace packing {
-
-static api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst) {
-  if (v_dst.is_quantized()) {
-    switch (v_dst.storage_type()) {
-      case api::StorageType::TEXTURE_3D:
-        switch (v_dst.dtype()) {
-          case c10::ScalarType::QUInt8:
-            return VK_KERNEL(nchw_to_image_uint8);
-          case c10::ScalarType::QInt8:
-            return VK_KERNEL(nchw_to_image_int8);
-          case c10::ScalarType::QInt32:
-            return VK_KERNEL(nchw_to_image_int32);
-          default:
-            TORCH_CHECK(
-                false,
-                "Vulkan quantization currently not supported for dtype ",
-                v_dst.dtype());
-        }
-      default:
-        TORCH_CHECK(false, "No kernel available!");
-      case api::StorageType::BUFFER:
-      case api::StorageType::UNKNOWN:
-        TORCH_CHECK(false, "Requested storage type must be a texture type.");
-    }
-  }
-
-  switch (v_dst.storage_type()) {
-    case api::StorageType::TEXTURE_3D:
-      return VK_KERNEL(nchw_to_image);
-    case api::StorageType::TEXTURE_2D:
-      return VK_KERNEL(nchw_to_image2d);
-    default:
-      TORCH_CHECK(false, "No kernel available!");
-  }
-}
-
-static api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src) {
-  if (v_src.is_quantized()) {
-    auto plane_size =
-        get_dim<Dim4D::Height>(v_src) * get_dim<Dim4D::Width>(v_src);
-    switch (v_src.storage_type()) {
-      case api::StorageType::TEXTURE_3D:
-        switch (v_src.dtype()) {
-          case c10::ScalarType::QUInt8:
-            return plane_size % 4 == 0 ? VK_KERNEL(image_to_nchw_quantized_mul4)
-                                       : VK_KERNEL(image_to_nchw_quantized);
-          case c10::ScalarType::QInt8:
-            return plane_size % 4 == 0 ? VK_KERNEL(image_to_nchw_quantized_mul4)
-                                       : VK_KERNEL(image_to_nchw_quantized);
-          case c10::ScalarType::QInt32:
-            return VK_KERNEL(image_to_nchw_int32);
-          default:
-            TORCH_CHECK(
-                false,
-                "Vulkan quantization currently not supported for dtype ",
-                v_src.dtype());
-        }
-      default:
-        TORCH_CHECK(false, "No kernel available!");
-      case api::StorageType::BUFFER:
-      case api::StorageType::UNKNOWN:
-        TORCH_CHECK(false, "Requested storage type must be a texture type.");
-    }
-  }
-
-  switch (v_src.storage_type()) {
-    case api::StorageType::TEXTURE_3D:
-      return VK_KERNEL(image_to_nchw);
-    case api::StorageType::TEXTURE_2D:
-      return VK_KERNEL(image2d_to_nchw);
-    default:
-      TORCH_CHECK(false, "No kernel available!");
-  }
-}
-
-struct ToFromTextureParams final {
-  api::utils::ivec3 extents;
-  int32_t plane_size;
-};
-
-void record_nchw_to_image_op(
-    api::Context* const context,
-    api::ShaderInfo& compute_shader,
-    api::VulkanBuffer& src_buffer,
-    vTensor& v_dst,
-    api::PipelineBarrier pipeline_barrier,
-    const VkFence fence_handle) {
-  api::utils::uvec3 global_size = v_dst.extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
-  int32_t height =
-      api::utils::safe_downcast<int32_t>(get_dim<Dim4D::Height>(v_dst));
-  int32_t width =
-      api::utils::safe_downcast<int32_t>(get_dim<Dim4D::Width>(v_dst));
-  int32_t plane_size = height * width;
-
-  ToFromTextureParams block{
-      api::utils::make_ivec3(v_dst.extents()),
-      plane_size,
-  };
-
-  api::UniformParamsBuffer params(context, block);
-  context->submit_compute_job(
-      // shader descriptor
-      compute_shader,
-      // pipeline barrier
-      pipeline_barrier,
-      // global work group size
-      global_size,
-      // local work group size
-      local_size,
-      // fence handle
-      fence_handle,
-      // shader arguments
-      v_dst.image(
-          pipeline_barrier,
-          api::PipelineStage::COMPUTE,
-          api::MemoryAccessType::WRITE),
-      src_buffer,
-      // params buffer
-      params.buffer());
-}
-
-void record_image_to_nchw_op(
-    api::Context* const context,
-    api::ShaderInfo& compute_shader,
-    vTensor& v_src,
-    api::VulkanBuffer& dst_buffer,
-    api::PipelineBarrier pipeline_barrier,
-    const VkFence fence_handle) {
-  api::utils::uvec3 global_size = v_src.extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
-  int32_t height =
-      api::utils::safe_downcast<int32_t>(get_dim<Dim4D::Height>(v_src));
-  int32_t width =
-      api::utils::safe_downcast<int32_t>(get_dim<Dim4D::Width>(v_src));
-  int32_t plane_size = height * width;
-
-  ToFromTextureParams block{
-      api::utils::make_ivec3(v_src.extents()),
-      plane_size,
-  };
-
-  if (v_src.dtype() == c10::ScalarType::QUInt8 ||
-      v_src.dtype() == c10::ScalarType::QInt8) {
-    if (plane_size % 4 == 0) {
-      global_size.data[0u] = plane_size / 4;
-      global_size.data[1u] = 1;
-      local_size.data[0u] *= local_size.data[1u];
-      local_size.data[1u] = 1;
-    } else {
-      uint32_t numel = v_src.numel();
-      global_size = {api::utils::div_up(numel, uint32_t(4)), 1u, 1u};
-      local_size = {64u, 1u, 1u};
-    }
-  }
-
-  api::UniformParamsBuffer params(context, block);
-  context->submit_compute_job(
-      // shader descriptor
-      compute_shader,
-      // pipeline barrier
-      pipeline_barrier,
-      // global work group size
-      global_size,
-      // local work group size
-      local_size,
-      // fence handle
-      fence_handle,
-      // shader arguments
-      v_src.image(
-          pipeline_barrier,
-          api::PipelineStage::COMPUTE,
-          api::MemoryAccessType::WRITE),
-      dst_buffer,
-      // params buffer
-      params.buffer());
-}
-
-void record_nchw_to_buffer_op(
-    api::Context* const context,
-    api::VulkanBuffer& src_buffer,
-    vTensor& v_dst,
-    api::PipelineBarrier pipeline_barrier,
-    const VkFence fence_handle) {
-  uint32_t gpu_buf_len = api::utils::safe_downcast<uint32_t>(v_dst.gpu_numel());
-
-  api::utils::uvec3 global_size = {gpu_buf_len, 1u, 1u};
-  api::utils::uvec3 local_size = {32u, 1u, 1u};
-
-  api::UniformParamsBuffer cpu_buffer_metadata(
-      context, v_dst.get_cpu_buffer_metadata());
-
-  context->submit_compute_job(
-      // shader descriptor
-      VK_KERNEL(buffer_to_buffer),
-      // pipeline barrier
-      pipeline_barrier,
-      // global work group size
-      global_size,
-      // local work group size
-      local_size,
-      // fence handle
-      fence_handle,
-      // shader arguments
-      v_dst.buffer(
-          pipeline_barrier,
-          api::PipelineStage::COMPUTE,
-          api::MemoryAccessType::WRITE),
-      v_dst.buffer_metadata(),
-      src_buffer,
-      cpu_buffer_metadata.buffer());
-}
-
-void record_buffer_to_nchw_op(
-    api::Context* const context,
-    vTensor& v_src,
-    api::VulkanBuffer& dst_buffer,
-    api::PipelineBarrier pipeline_barrier,
-    const VkFence fence_handle) {
-  uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_src.numel());
-
-  api::utils::uvec3 global_size = {buf_len, 1u, 1u};
-  api::utils::uvec3 local_size = {4u, 1u, 1u};
-
-  api::UniformParamsBuffer cpu_buffer_metadata(
-      context, v_src.get_cpu_buffer_metadata());
-
-  context->submit_compute_job(
-      // shader descriptor
-      VK_KERNEL(buffer_to_buffer),
-      // pipeline barrier
-      pipeline_barrier,
-      // global work group size
-      global_size,
-      // local work group size
-      local_size,
-      // fence handle
-      fence_handle,
-      // shader arguments
-      dst_buffer,
-      cpu_buffer_metadata.buffer(),
-      v_src.buffer(
-          pipeline_barrier,
-          api::PipelineStage::COMPUTE,
-          api::MemoryAccessType::WRITE),
-      v_src.buffer_metadata());
-}
-
-} // namespace packing
-
 namespace utils {
 
 /*
@@ -300,12 +48,12 @@ Tensor nchw_to_nc4hw(const Tensor& src) {
   uint32_t H = get_dim<Dim4D::Height>(src.sizes());
   uint32_t W = get_dim<Dim4D::Width>(src.sizes());
 
-  uint32_t NC4 = api::utils::div_up(N * C, 4u);
-  uint32_t NC_aligned = api::utils::align_up(N * C, 4u);
+  uint32_t C_aligned = api::utils::align_up(C, 4u);
+  uint32_t NC4 = (N * C_aligned) / 4;
 
-  // Add padding to the tensor so that the batch-channel dim is a multiple of 4
-  Tensor padding = at::zeros({NC_aligned - N * C, H, W}, src.options());
-  Tensor src_padded = at::cat({src.reshape({N * C, H, W}), padding});
+  // Add padding to the tensor so that the channel dim is a multiple of 4
+  Tensor padding = at::zeros({N, C_aligned - C, H, W}, src.options());
+  Tensor src_padded = at::cat({src.reshape({N, C, H, W}), padding}, 1);
   // Reshape to group channels into groups of 4 and permute so that the groups
   // are in the first dimension so that they are contiguous
   Tensor src_NC4HW = src_padded.reshape({NC4, 4, H, W}).permute({0, 2, 3, 1});
@@ -325,7 +73,7 @@ Tensor create_staging_tensor(const vTensor& v_in) {
   uint32_t H = get_dim<Dim4D::Height>(v_in.sizes());
   uint32_t W = get_dim<Dim4D::Width>(v_in.sizes());
 
-  uint32_t NC4 = api::utils::div_up(N * C, 4u);
+  uint32_t NC4 = N * api::utils::div_up(C, 4u);
 
   // Note that the dtype corresponding with the texture format of the vTensor is
   // used instead of options().dtype(). This is to ensure the number of bytes in
@@ -350,13 +98,13 @@ Tensor nc4hw_to_nchw(const Tensor& t_in, IntArrayRef sizes) {
   uint32_t H = get_dim<Dim4D::Height>(sizes);
   uint32_t W = get_dim<Dim4D::Width>(sizes);
 
-  uint32_t NC_aligned = api::utils::align_up(N * C, 4u);
+  uint32_t C_aligned = api::utils::align_up(C, 4u);
 
   // Undo the permute step and channel grouping step
-  Tensor t_in_padded = t_in.permute({0, 3, 1, 2}).reshape({NC_aligned, H, W});
+  Tensor t_in_padded = t_in.permute({0, 3, 1, 2}).reshape({N, C_aligned, H, W});
   // Remove the padding channels
   Tensor t_in_shaved =
-      at::narrow(t_in_padded, /*dim=*/0, /*start*/ 0, /*end*/ N * C);
+      at::narrow(t_in_padded, /*dim=*/1, /*start*/ 0, /*end*/ C);
 
   // Reshape to original sizing and dtype and return a contiguous Tensor
   return t_in_shaved.reshape(sizes).contiguous();
diff --git a/aten/src/ATen/native/xnnpack/Convolution.cpp b/aten/src/ATen/native/xnnpack/Convolution.cpp
index ccc4fa406bf8..cf9d180b2153 100644
--- a/aten/src/ATen/native/xnnpack/Convolution.cpp
+++ b/aten/src/ATen/native/xnnpack/Convolution.cpp
@@ -236,6 +236,7 @@ ContextConv2D create(
       output_min,                                                     // output_min
       output_max,                                                     // output_max
       0u,                                                             // flags
+      nullptr,                                                        // xnn_caches_t
       &convolution_op);                                               // operator
   } else {
     for (const auto i : c10::irange(4)) {
@@ -264,6 +265,7 @@ ContextConv2D create(
       output_min,                                                     // output_min
       output_max,                                                     // output_max
       0u,                                                             // flags
+      nullptr,                                                        // xnn_caches_t
       &convolution_op);                                               // operator
   }
 
diff --git a/aten/src/ATen/native/xnnpack/Linear.cpp b/aten/src/ATen/native/xnnpack/Linear.cpp
index f821e449caf4..37e3c6eb1c31 100644
--- a/aten/src/ATen/native/xnnpack/Linear.cpp
+++ b/aten/src/ATen/native/xnnpack/Linear.cpp
@@ -97,6 +97,7 @@ ContextLinear create(
       output_min,                                                     // output_min
       output_max,                                                     // output_max
       0u,                                                             // flags
+      nullptr,                                                        // xnn_caches_t
       &linear_op);                                                    // operator
 
   TORCH_CHECK(
diff --git a/aten/src/ATen/nnapi/nnapi_bind.cpp b/aten/src/ATen/nnapi/nnapi_bind.cpp
index 58874a2babcc..fd7979cb2ab6 100644
--- a/aten/src/ATen/nnapi/nnapi_bind.cpp
+++ b/aten/src/ATen/nnapi/nnapi_bind.cpp
@@ -46,7 +46,7 @@ void NnapiCompilation::init(
 
 void NnapiCompilation::init2(
     at::Tensor serialized_model_tensor,
-    std::vector<at::Tensor> parameter_buffers,
+    const std::vector<at::Tensor>& parameter_buffers,
     int64_t compilation_preference,
     bool relax_f32_to_f16
   ) {
@@ -55,7 +55,9 @@ void NnapiCompilation::init2(
   load_platform_library();
 
   std::vector<const void*> buffers;
+  buffers.reserve(parameter_buffers.size());
   std::vector<int32_t> buffer_sizes;
+  buffer_sizes.reserve(parameter_buffers.size());
   for (auto& t : parameter_buffers) {
     TORCH_CHECK(t.is_contiguous());
     buffers.push_back(t.data_ptr());
@@ -73,10 +75,9 @@ void NnapiCompilation::init2(
     ser_model_ptr,
     serialized_model_tensor.nbytes()
   };
-  TORCH_CHECK(ser_model.size() > 0);
+  TORCH_CHECK(!ser_model.empty());
 
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  ANeuralNetworksModel* model;
+  ANeuralNetworksModel* model{};
   check_nnapi->Model_create(&model);
   CAFFE_ENFORCE(model);
   model_.reset(model);
@@ -102,8 +103,7 @@ void NnapiCompilation::init2(
   }
   check_nnapi->Model_finish(model_.get());
 
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  ANeuralNetworksCompilation* compilation;
+  ANeuralNetworksCompilation* compilation{};
   check_nnapi->Compilation_create(model_.get(), &compilation);
   // TODO: Make this configurable.
   check_nnapi->Compilation_setPreference(compilation, static_cast<int32_t>(compilation_preference));
diff --git a/aten/src/ATen/nnapi/nnapi_bind.h b/aten/src/ATen/nnapi/nnapi_bind.h
index 8f36b2930bfa..82c5bf31a4ce 100644
--- a/aten/src/ATen/nnapi/nnapi_bind.h
+++ b/aten/src/ATen/nnapi/nnapi_bind.h
@@ -44,7 +44,7 @@ struct NnapiCompilation : torch::jit::CustomClassHolder {
 
     TORCH_API void init2(
       at::Tensor serialized_model_tensor,
-      std::vector<at::Tensor> parameter_buffers,
+      const std::vector<at::Tensor>& parameter_buffers,
       int64_t compilation_preference,
       bool relax_f32_to_f16
     );
diff --git a/aten/src/ATen/quantized/Quantizer.cpp b/aten/src/ATen/quantized/Quantizer.cpp
index 54fe7082c3e7..2a70ea6094bd 100644
--- a/aten/src/ATen/quantized/Quantizer.cpp
+++ b/aten/src/ATen/quantized/Quantizer.cpp
@@ -97,7 +97,7 @@ int64_t get_sub_byte_tensor_size(IntArrayRef sizes, size_t dtype_itemsize, at::S
       element_per_byte = 1;
   }
   // zero dim tensor
-  if (sizes.size() == 0) {
+  if (sizes.empty()) {
     return c10::multiply_integers(sizes) * dtype_itemsize;
   }
   // Consider most inner dim as cols
diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp
index 27901acaef0e..587e3b11ea7f 100644
--- a/aten/src/ATen/record_function.cpp
+++ b/aten/src/ATen/record_function.cpp
@@ -153,7 +153,7 @@ class CacheEntry {
 
   // Includes sampling callbacks which are waiting to run.
   c10::SmallVector<CallbackAndCounter, kSoftLimitCallbacks> callbacks_;
-  RecordScope scope_;
+  RecordScope scope_{RecordScope::FUNCTION};
 
   StepCallbacks active_callbacks_;
 
diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h
index 8f05c8b6f829..8a4bbe5ae247 100644
--- a/aten/src/ATen/record_function.h
+++ b/aten/src/ATen/record_function.h
@@ -709,7 +709,7 @@ class TORCH_API RecordFunctionGuard {
 class TORCH_API DisableRecordFunctionGuard : public RecordFunctionGuard {
  public:
   DisableRecordFunctionGuard() : RecordFunctionGuard(false) {}
-  virtual ~DisableRecordFunctionGuard() = default;
+  ~DisableRecordFunctionGuard() override = default;
 };
 
 struct TORCH_API RecordFunctionTLS {
diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt
index 27b9e3759652..00256cb9c1af 100644
--- a/aten/src/ATen/test/CMakeLists.txt
+++ b/aten/src/ATen/test/CMakeLists.txt
@@ -36,6 +36,7 @@ list(APPEND ATen_CPU_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/reportMemoryUsage_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/scalar_tensor_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/scalar_test.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/StorageUtils_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/stride_properties_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/tensor_iterator_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/test_parallel.cpp
@@ -107,6 +108,7 @@ list(APPEND ATen_VEC_TEST_SRCS
 
 list(APPEND ATen_MPS_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/mps_test_print.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/mps_test_allocator.cpp
   )
 
 # Caffe2 specific tests
diff --git a/aten/src/ATen/test/StorageUtils_test.cpp b/aten/src/ATen/test/StorageUtils_test.cpp
new file mode 100644
index 000000000000..bc4855778e6c
--- /dev/null
+++ b/aten/src/ATen/test/StorageUtils_test.cpp
@@ -0,0 +1,33 @@
+#include <gtest/gtest.h>
+
+#include <ATen/Functions.h>
+#include <ATen/Tensor.h>
+#include <ATen/StorageUtils.h>
+
+using namespace ::testing;
+
+TEST(StorageUtilsTest, shm_storage_refcount) {
+  auto t1 = std::make_unique<at::Tensor>(
+      at::full({5, 5}, 7, at::dtype(at::kLong).device(at::kCPU)));
+  auto t2 = std::make_unique<at::Tensor>(t1->slice(0, 0, 3));
+
+  auto verificationTensor = t1->clone();
+  ASSERT_EQ(t1->storage().use_count(), 2);
+  ASSERT_EQ(t2->storage().use_count(), 2);
+  ASSERT_EQ(verificationTensor.storage().use_count(), 1);
+
+  at::share_memory_(*t1);
+  ASSERT_EQ(t1->storage().allocator(), nullptr)
+      << "Expect original storage allocator to be detached";
+  ASSERT_NE(verificationTensor.storage().allocator(), nullptr);
+  ASSERT_EQ(t1->storage().use_count(), 2) << "Expect refcount to be the same";
+  ASSERT_EQ(t2->storage().use_count(), 2);
+
+  ASSERT_TRUE(t1->equal(verificationTensor));
+  auto weakStoragePtr = t1->storage().getWeakStorageImpl();
+  // weak + 1 (if any strong ref exists due to how intrusive_ptr refcount works)
+  ASSERT_EQ(weakStoragePtr.weak_use_count(), 2);
+  t1.reset();
+  t2.reset();
+  ASSERT_TRUE(weakStoragePtr.expired());
+}
diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp
index 75cd45d0ee78..3b4bb076ab87 100644
--- a/aten/src/ATen/test/basic.cpp
+++ b/aten/src/ATen/test/basic.cpp
@@ -109,7 +109,7 @@ void TestLoadsOfAdds(DeprecatedTypeProperties& type) {
   auto begin = std::chrono::high_resolution_clock::now();
   Tensor d = ones({3, 4}, type);
   Tensor r = zeros({3, 4}, type);
-  for (const auto i : c10::irange(100000)) {
+  for (const auto i : c10::irange(1000)) {
     (void)i; // Suppress unused variable warning
     add_out(r, r, d);
   }
@@ -120,14 +120,14 @@ void TestLoadsOfAdds(DeprecatedTypeProperties& type) {
                    end - begin)
                    .count()
             << " ms" << std::endl;
-  ASSERT_EQ_RESOLVED(norm(100000 * d).item<double>(), norm(r).item<double>());
+  ASSERT_EQ_RESOLVED(norm(1000 * d).item<double>(), norm(r).item<double>());
 }
 
 void TestLoadOfAddsWithCopy(DeprecatedTypeProperties& type) {
   auto begin = std::chrono::high_resolution_clock::now();
   Tensor d = ones({3, 4}, type);
   Tensor r = zeros({3, 4}, type);
-  for (const auto i : c10::irange(100000)) {
+  for (const auto i : c10::irange(1000)) {
     (void)i; // Suppress unused variable warning
     r = add(r, d);
   }
@@ -138,7 +138,7 @@ void TestLoadOfAddsWithCopy(DeprecatedTypeProperties& type) {
                    end - begin)
                    .count()
             << " ms" << std::endl;
-  ASSERT_EQ_RESOLVED(norm(100000 * d).item<double>(), norm(r).item<double>());
+  ASSERT_EQ_RESOLVED(norm(1000 * d).item<double>(), norm(r).item<double>());
 }
 
 void TestIsContiguous(DeprecatedTypeProperties& type) {
diff --git a/aten/src/ATen/test/cuda_reportMemoryUsage_test.cpp b/aten/src/ATen/test/cuda_reportMemoryUsage_test.cpp
index e00127f858df..88ea1e099ce9 100644
--- a/aten/src/ATen/test/cuda_reportMemoryUsage_test.cpp
+++ b/aten/src/ATen/test/cuda_reportMemoryUsage_test.cpp
@@ -49,7 +49,7 @@ TEST(DeviceCachingAllocator, check_reporter) {
   // alloc2 remain, it is a memory free operation, so it shouldn't reserve more
   // memory.
   EXPECT_TRUE(
-      alloc2_true_alloc_size <= r.total_reserved &&
+      alloc2_true_alloc_size <= static_cast<int64_t>(r.total_reserved) &&
       r.total_reserved <= max_reserved);
   EXPECT_TRUE(r.device.is_cuda());
 
@@ -58,7 +58,7 @@ TEST(DeviceCachingAllocator, check_reporter) {
   EXPECT_EQ(alloc2_true_ptr, r.ptr);
   EXPECT_EQ(-alloc2_true_alloc_size, r.alloc_size);
   EXPECT_EQ(0, r.total_allocated);
-  EXPECT_TRUE(0 <= r.total_reserved && r.total_reserved <= max_reserved);
+  EXPECT_TRUE(r.total_reserved <= max_reserved);
   EXPECT_TRUE(r.device.is_cuda());
 }
 
diff --git a/aten/src/ATen/test/half_test.cpp b/aten/src/ATen/test/half_test.cpp
index 02ccb8b6ce5d..4a61cfe64002 100644
--- a/aten/src/ATen/test/half_test.cpp
+++ b/aten/src/ATen/test/half_test.cpp
@@ -169,6 +169,6 @@ TEST(TestHalf, ComplexHalf) {
   Half real = 3.0f;
   Half imag = -10.0f;
   auto complex = c10::complex<Half>(real, imag);
-  assert(complex.real() == real);
-  assert(complex.imag() == imag);
+  ASSERT_EQ(complex.real(), real);
+  ASSERT_EQ(complex.imag(), imag);
 }
diff --git a/aten/src/ATen/test/math_kernel_test.cpp b/aten/src/ATen/test/math_kernel_test.cpp
index 8875e72a6af9..1ac5873b147d 100644
--- a/aten/src/ATen/test/math_kernel_test.cpp
+++ b/aten/src/ATen/test/math_kernel_test.cpp
@@ -54,7 +54,6 @@ TEST(MathKernelTest, NativeGroupNorm) {
 
 TEST(MathKernelTest, NativeLayerNorm) {
   const auto input = rand({20, 10, 10, 10});
-  const auto input_shape = input.sizes();
 
   double eps = 1e-05;
   for (bool undef_weight: {true, false}) {
diff --git a/aten/src/ATen/test/mps_test_allocator.cpp b/aten/src/ATen/test/mps_test_allocator.cpp
new file mode 100644
index 000000000000..399aef9f5543
--- /dev/null
+++ b/aten/src/ATen/test/mps_test_allocator.cpp
@@ -0,0 +1,39 @@
+#include <gtest/gtest.h>
+#include <torch/torch.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
+
+namespace replay {
+std::function<void()> callback_action;
+
+class ReplayBufferCleaner : virtual public at::mps::IMpsAllocatorCallback {
+    public:
+    void executeMPSAllocatorCallback(void* ptr, EventType event) override {
+     if (event == EventType::ALLOCATION_FAILED) {
+        callback_action();
+     }
+    }
+};
+}
+
+namespace at::mps {
+REGISTER_MPS_ALLOCATOR_CALLBACK("ReplayBufferCleaner", replay::ReplayBufferCleaner);
+}
+
+TEST(MPSAllocator, MPSAllocatorCallbacks) {
+    std::vector<torch::Tensor> replay_buffer;
+    replay::callback_action = [&]() {
+        if (!replay_buffer.empty()) {
+            replay_buffer.erase(replay_buffer.begin(), replay_buffer.begin() + (replay_buffer.size()/10));
+        }
+    };
+    size_t max_iter = 100000;
+    for (size_t i = 0; i < max_iter; i++) {
+        torch::Tensor new_value = torch::randn({10000, 10000}, at::device(at::kMPS));
+        // early stop the first time the callback is called
+        if (replay_buffer.size() != i) {
+            break;
+        }
+        replay_buffer.push_back(new_value);
+    }
+    ASSERT_TRUE(replay_buffer.size() < max_iter);
+}
diff --git a/aten/src/ATen/test/quantized_test.cpp b/aten/src/ATen/test/quantized_test.cpp
index 305fd3755603..40fa0bb94294 100644
--- a/aten/src/ATen/test/quantized_test.cpp
+++ b/aten/src/ATen/test/quantized_test.cpp
@@ -220,7 +220,7 @@ TEST(TestQTensor, FromBlobQuantizedPerTensor) {
   TensorOptions options(at::kQUInt8);
 
   auto custom_vec = std::make_unique<std::vector<uint8_t>>();
-  custom_vec->reserve(numel);
+  custom_vec->resize(numel);
 
   uint8_t* custom_data = custom_vec->data();
   for (const auto i : c10::irange(numel)) {
@@ -263,7 +263,7 @@ TEST(TestQTensor, FromBlobQuantizedPerChannel) {
   TensorOptions options(at::kQUInt8);
 
   auto custom_vec = std::make_unique<std::vector<uint8_t>>();
-  custom_vec->reserve(numel);
+  custom_vec->resize(numel);
 
   uint8_t* custom_data = custom_vec->data();
   for (const auto i : c10::irange(numel)) {
diff --git a/aten/src/ATen/test/reportMemoryUsage.h b/aten/src/ATen/test/reportMemoryUsage.h
index e3a73cb24b8f..f7d660d65ee5 100644
--- a/aten/src/ATen/test/reportMemoryUsage.h
+++ b/aten/src/ATen/test/reportMemoryUsage.h
@@ -10,8 +10,8 @@ class TestMemoryReportingInfo : public c10::MemoryReportingInfoBase {
   struct Record {
     void* ptr;
     int64_t alloc_size;
-    int64_t total_allocated;
-    int64_t total_reserved;
+    size_t total_allocated;
+    size_t total_reserved;
     c10::Device device;
   };
 
@@ -23,8 +23,8 @@ class TestMemoryReportingInfo : public c10::MemoryReportingInfoBase {
   void reportMemoryUsage(
       void* ptr,
       int64_t alloc_size,
-      int64_t total_allocated,
-      int64_t total_reserved,
+      size_t total_allocated,
+      size_t total_reserved,
       c10::Device device) override {
     records.emplace_back(
         Record{ptr, alloc_size, total_allocated, total_reserved, device});
diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
index e07c8f4ec028..ba010074c2aa 100644
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -1795,11 +1795,13 @@ TEST_F(VulkanAPITest, glu_ch_32) {
   test_glu({1, 32, 100, 19});
 }
 
-TEST_F(VulkanAPITest, glu_ch_10) {
+// Re-enable once glu_channel shader is fixed
+TEST_F(VulkanAPITest, DISABLED_glu_ch_10) {
   test_glu({17, 10, 57, 41});
 }
 
-TEST_F(VulkanAPITest, glu_ch_2) {
+// Re-enable once glu_channel shader is fixed
+TEST_F(VulkanAPITest, DISABLED_glu_ch_2) {
   test_glu({1, 2, 100, 40});
 }
 
@@ -2803,6 +2805,42 @@ TEST_F(VulkanAPITest, select_3d_depth_large) {
   test_select({100, 1, 144}, 0, 50);
 }
 
+TEST_F(VulkanAPITest, select_3d_height_small) {
+  test_select({1, 1, 1}, 1, 0);
+}
+
+TEST_F(VulkanAPITest, select_3d_height_medium) {
+  test_select({3, 5, 2}, 1, 2);
+}
+
+TEST_F(VulkanAPITest, select_3d_height_medium1) {
+  test_select({16, 16, 5}, 1, 6);
+}
+
+TEST_F(VulkanAPITest, select_3d_height_medium2) {
+  test_select({17, 17, 5}, 1, 6);
+}
+
+TEST_F(VulkanAPITest, select_3d_height_large) {
+  test_select({100, 144, 5}, 1, 50);
+}
+
+TEST_F(VulkanAPITest, select_3d_width_small) {
+  test_select({1, 1, 1}, 2, 0);
+}
+
+TEST_F(VulkanAPITest, select_3d_width_medium) {
+  test_select({3, 5, 3}, 2, 2);
+}
+
+TEST_F(VulkanAPITest, select_3d_width_medium2) {
+  test_select({17, 17, 8}, 2, 6);
+}
+
+TEST_F(VulkanAPITest, select_3d_width_large) {
+  test_select({100, 3, 144}, 2, 50);
+}
+
 TEST_F(VulkanAPITest, sigmoid) {
   const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
   const auto in_vulkan = in_cpu.vulkan();
@@ -2882,6 +2920,36 @@ TEST_F(VulkanAPITest, DISABLED_log_softmax) {
   }
 }
 
+TEST_F(VulkanAPITest, abs) {
+  const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30;
+  const auto in_vulkan = in_cpu.vulkan();
+
+  const auto out_cpu = at::abs(in_cpu);
+  const auto out_vulkan = at::abs(in_vulkan);
+
+  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
+  if (!check) {
+    showRtol(out_cpu, out_vulkan.cpu());
+  }
+
+  ASSERT_TRUE(check);
+}
+
+TEST_F(VulkanAPITest, abs_) {
+  auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30;
+  auto vulkan = cpu.vulkan();
+
+  at::abs_(cpu);
+  at::abs_(vulkan);
+
+  const auto check = almostEqual(cpu, vulkan.cpu());
+  if (!check) {
+    showRtol(cpu, vulkan.cpu());
+  }
+
+  ASSERT_TRUE(check);
+}
+
 TEST_F(VulkanAPITest, tanh) {
   const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30;
   const auto in_vulkan = in_cpu.vulkan();
@@ -3756,7 +3824,7 @@ TEST_F(VulkanAPITest, permute_4dmclaren_success) {
 
 TEST_F(VulkanAPITest, permute_4dbig_success) {
   // Arrange
-  const auto in_cpu = at::rand({3, 9, 89, 91}, at::device(at::kCPU).dtype(at::kFloat));
+  const auto in_cpu = at::rand({3, 9, 51, 41}, at::device(at::kCPU).dtype(at::kFloat));
   std::vector<std::vector<int64_t>> all_dims;
   std::vector<int64_t> in{0, 1, 2, 3};
   gen_allpermutations(all_dims, in, 0);
diff --git a/benchmarks/distributed/ddp/benchmark.py b/benchmarks/distributed/ddp/benchmark.py
index a905ad60f530..c72e3e6a27d9 100644
--- a/benchmarks/distributed/ddp/benchmark.py
+++ b/benchmarks/distributed/ddp/benchmark.py
@@ -151,7 +151,7 @@ def print_measurements(prefix, nelem, measurements):
     return results
 
 
-class Benchmark(object):
+class Benchmark:
     def __init__(self, device, distributed_backend, bucket_size):
         self.device = device
         self.batch_size = 32
@@ -173,7 +173,7 @@ def generate_target(self):
 
 class TorchvisionBenchmark(Benchmark):
     def __init__(self, device, distributed_backend, bucket_size, model):
-        super(TorchvisionBenchmark, self).__init__(
+        super().__init__(
             device,
             distributed_backend,
             bucket_size,
diff --git a/benchmarks/distributed/ddp/diff.py b/benchmarks/distributed/ddp/diff.py
index dc984626888a..d427a5b29d91 100644
--- a/benchmarks/distributed/ddp/diff.py
+++ b/benchmarks/distributed/ddp/diff.py
@@ -25,7 +25,7 @@ def main():
     ja = load(args.file[0])
     jb = load(args.file[1])
 
-    keys = (set(ja.keys()) | set(jb.keys())) - set(["benchmark_results"])
+    keys = (set(ja.keys()) | set(jb.keys())) - {"benchmark_results"}
     print("{:20s} {:>20s}      {:>20s}".format("", "baseline", "test"))
     print("{:20s} {:>20s}      {:>20s}".format("", "-" * 20, "-" * 20))
     for key in sorted(keys):
diff --git a/benchmarks/distributed/pipeline/pipe.py b/benchmarks/distributed/pipeline/pipe.py
index 418e20168c28..8a08d25ca4c9 100644
--- a/benchmarks/distributed/pipeline/pipe.py
+++ b/benchmarks/distributed/pipeline/pipe.py
@@ -43,7 +43,7 @@ def forward(self, src):
 
 class PositionalEncodingLayer(nn.Module):
     def __init__(self, d_model, dropout=0.1, max_len=5000):
-        super(PositionalEncodingLayer, self).__init__()
+        super().__init__()
         self.dropout = nn.Dropout(p=dropout)
 
         pe = torch.zeros(max_len, d_model)
@@ -99,7 +99,7 @@ def __init__(self, ntokens, ninp, nhead, nhid, dropout, initrange, ndecoder):
             layers.append(TransformerDecoderLayer(ninp, nhead, nhid, dropout))
 
         layers.append(LinearLayer(ninp, ntokens, initrange))
-        super(TransformerLMSequential, self).__init__(*layers)
+        super().__init__(*layers)
 
 
 def make_model(args, device, ntokens):
diff --git a/benchmarks/distributed/rpc/parameter_server/launcher.py b/benchmarks/distributed/rpc/parameter_server/launcher.py
index 96f1053d0346..a4c13cdb29b6 100644
--- a/benchmarks/distributed/rpc/parameter_server/launcher.py
+++ b/benchmarks/distributed/rpc/parameter_server/launcher.py
@@ -448,11 +448,13 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="RPC server Benchmark")
     parser.add_argument(
+        "--master-addr",
         "--master_addr",
         type=str,
         help="IP address of the machine that will host the process with rank 0"
     )
     parser.add_argument(
+        "--master-port",
         "--master_port",
         type=str,
         help="A free port on the machine that will host the process with rank 0"
@@ -493,6 +495,7 @@ def main(args):
         help="cudaserver count for benchmark run"
     )
     parser.add_argument(
+        "--rpc-timeout",
         "--rpc_timeout",
         type=int,
         help="timeout in seconds to use for RPC"
@@ -508,6 +511,7 @@ def main(args):
         help="epoch count for training"
     )
     parser.add_argument(
+        "--batch-size",
         "--batch_size",
         type=int,
         help="number of training examples used in one iteration"
@@ -523,62 +527,74 @@ def main(args):
         help="id for model configuration"
     )
     parser.add_argument(
+        "--data-config-path",
         "--data_config_path",
         type=str,
         help="path to data configuration file"
     )
     parser.add_argument(
+        "--model-config-path",
         "--model_config_path",
         type=str,
         help="path to model configuration file"
     )
     parser.add_argument(
+        "--server-config-path",
         "--server_config_path",
         type=str,
         help="path to server configuration file"
     )
     parser.add_argument(
+        "--trainer-config-path",
         "--trainer_config_path",
         type=str,
         help="path to trainer configuration file"
     )
     parser.add_argument(
+        "--torch-seed",
         "--torch_seed",
         type=int,
         help="seed for generating random numbers to a non-deterministic random number"
     )
     parser.add_argument(
+        "--cuda-seed",
         "--cuda_seed",
         type=int,
         help="seed for generating random numbers to a random number for the current GPU"
     )
     parser.add_argument(
+        "--preprocess-data",
         "--preprocess_data",
         type=str,
         help="this function will be used to preprocess data before training"
     )
     parser.add_argument(
+        "--create-criterion",
         "--create_criterion",
         type=str,
         help="this function will be used to create the criterion used for model loss calculation"
     )
     parser.add_argument(
+        "--create-ddp-model",
         "--create_ddp_model",
         type=str,
         help="this function will be used to create the ddp model used during training"
     )
     parser.add_argument(
+        "--hook-state",
         "--hook_state",
         type=str,
         help="this will be the state class used when registering the ddp communication hook"
     )
     parser.add_argument(
+        "--ddp-hook",
         "--ddp_hook",
         type=str,
         default="allreduce_hook",
         help="ddp communication hook"
     )
     parser.add_argument(
+        "--iteration-step",
         "--iteration_step",
         type=str,
         help="this will be the function called for each iteration of training"
diff --git a/benchmarks/distributed/rpc/rl/README.md b/benchmarks/distributed/rpc/rl/README.md
index 1cd29a7a4b61..86bc1d76ebb6 100644
--- a/benchmarks/distributed/rpc/rl/README.md
+++ b/benchmarks/distributed/rpc/rl/README.md
@@ -20,7 +20,7 @@ This benchmark depends on PyTorch.
 
 For any environments you are interested in, pass the corresponding arguments to `python launcher.py`.
 
-```python launcher.py --world_size="10,20" --master_addr="127.0.0.1" --master_port="29501 --batch="True" --state_size="10-20-10" --nlayers="5" --out_features="10" --output_file_path="benchmark_report.json"```
+```python launcher.py --world-size="10,20" --master-addr="127.0.0.1" --master-port="29501 --batch="True" --state-size="10-20-10" --nlayers="5" --out-features="10" --output-file-path="benchmark_report.json"```
 
 Example Output:
 
diff --git a/benchmarks/distributed/rpc/rl/agent.py b/benchmarks/distributed/rpc/rl/agent.py
index 9fdacbf348a5..db8460a62e51 100644
--- a/benchmarks/distributed/rpc/rl/agent.py
+++ b/benchmarks/distributed/rpc/rl/agent.py
@@ -22,7 +22,7 @@ def __init__(self, in_features, nlayers, out_features):
             nlayers (int): Number of layers in the model
             out_features (int): Number of features the model outputs
         """
-        super(Policy, self).__init__()
+        super().__init__()
 
         self.model = nn.Sequential(
             nn.Flatten(1, -1),
diff --git a/benchmarks/distributed/rpc/rl/launcher.py b/benchmarks/distributed/rpc/rl/launcher.py
index 8905378eb9be..afabc558161f 100644
--- a/benchmarks/distributed/rpc/rl/launcher.py
+++ b/benchmarks/distributed/rpc/rl/launcher.py
@@ -29,15 +29,15 @@ def str2bool(v):
 
 
 parser = argparse.ArgumentParser(description='PyTorch RPC RL Benchmark')
-parser.add_argument('--world_size', type=str, default='10')
-parser.add_argument('--master_addr', type=str, default='127.0.0.1')
-parser.add_argument('--master_port', type=str, default='29501')
+parser.add_argument('--world-size', '--world_size', type=str, default='10')
+parser.add_argument('--master-addr', '--master_addr', type=str, default='127.0.0.1')
+parser.add_argument('--master-port', '--master_port', type=str, default='29501')
 parser.add_argument('--batch', type=str, default='True')
 
-parser.add_argument('--state_size', type=str, default='10-20-10')
+parser.add_argument('--state-size', '--state_size', type=str, default='10-20-10')
 parser.add_argument('--nlayers', type=str, default='5')
-parser.add_argument('--out_features', type=str, default='10')
-parser.add_argument('--output_file_path', type=str, default='benchmark_report.json')
+parser.add_argument('--out-features', '--out_features', type=str, default='10')
+parser.add_argument('--output-file-path', '--output_file_path', type=str, default='benchmark_report.json')
 
 args = parser.parse_args()
 args = vars(args)
diff --git a/benchmarks/dynamo/Makefile b/benchmarks/dynamo/Makefile
index c5c9907a7a94..6dc0bf1f91d1 100644
--- a/benchmarks/dynamo/Makefile
+++ b/benchmarks/dynamo/Makefile
@@ -18,7 +18,7 @@ pull-deps: clone-deps
 	(cd ../../../torchvision    && git pull && git submodule update --init --recursive)
 	(cd ../../../torchdata      && git pull && git submodule update --init --recursive)
 	(cd ../../../torchtext      && git pull && git submodule update --init --recursive)
-	(cd ../../../torchaudio      && git pull && git submodule update --init --recursive)
+	(cd ../../../torchaudio     && git pull && git submodule update --init --recursive)
 	(cd ../../../detectron2     && git pull && git submodule update --init --recursive)
 	(cd ../../../torchbenchmark && git pull && git submodule update --init --recursive)
 	(cd ../../../triton         && git fetch && git checkout $(TRITON_VERSION) && git submodule update --init --recursive)
@@ -28,7 +28,7 @@ build-deps: clone-deps
 	# conda create --name torchdynamo -y python=3.8
 	# conda activate torchdynamo
 	conda install -y astunparse numpy scipy ninja pyyaml mkl mkl-include setuptools cmake \
-		typing_extensions six requests protobuf numba cython scikit-learn
+		typing-extensions requests protobuf numba cython scikit-learn
 	conda install -y -c pytorch magma-cuda116
 	conda install -y -c conda-forge librosa
 	(cd ../../../torchvision && python setup.py clean && python setup.py develop)
diff --git a/benchmarks/dynamo/all_torchbench_models_list.txt b/benchmarks/dynamo/all_torchbench_models_list.txt
new file mode 100644
index 000000000000..1e896c333288
--- /dev/null
+++ b/benchmarks/dynamo/all_torchbench_models_list.txt
@@ -0,0 +1,73 @@
+BERT_pytorch
+Background_Matting
+DALLE2_pytorch
+LearningToPaint
+Super_SloMo
+alexnet
+attention_is_all_you_need_pytorch
+dcgan
+demucs
+densenet121
+detectron2_fasterrcnn_r_101_c4
+detectron2_fasterrcnn_r_101_dc5
+detectron2_fasterrcnn_r_101_fpn
+detectron2_fasterrcnn_r_50_c4
+detectron2_fasterrcnn_r_50_dc5
+detectron2_fasterrcnn_r_50_fpn
+detectron2_fcos_r_50_fpn
+detectron2_maskrcnn
+detectron2_maskrcnn_r_101_c4
+detectron2_maskrcnn_r_101_fpn
+detectron2_maskrcnn_r_50_c4
+detectron2_maskrcnn_r_50_fpn
+dlrm
+drq
+fambench_dlrm
+fambench_xlmr
+fastNLP_Bert
+hf_Albert
+hf_Bart
+hf_Bert
+hf_BigBird
+hf_DistilBert
+hf_GPT2
+hf_Longformer
+hf_Reformer
+hf_T5
+maml
+maml_omniglot
+mnasnet1_0
+mobilenet_v2
+mobilenet_v2_quantized_qat
+mobilenet_v3_large
+moco
+nvidia_deeprecommender
+opacus_cifar10
+pplbench_beanmachine
+pyhpc_equation_of_state
+pyhpc_isoneutral_mixing
+pyhpc_turbulent_kinetic_energy
+pytorch_CycleGAN_and_pix2pix
+pytorch_stargan
+pytorch_struct
+pytorch_unet
+resnet18
+resnet50
+resnet50_quantized_qat
+resnext50_32x4d
+shufflenet_v2_x1_0
+soft_actor_critic
+speech_transformer
+squeezenet1_1
+tacotron2
+timm_efficientdet
+timm_efficientnet
+timm_nfnet
+timm_regnet
+timm_resnest
+timm_vision_transformer
+timm_vovnet
+tts_angular
+vgg16
+vision_maskrcnn
+yolov3
\ No newline at end of file
diff --git a/benchmarks/dynamo/benchmarks.py b/benchmarks/dynamo/benchmarks.py
new file mode 100755
index 000000000000..15e7f5254f49
--- /dev/null
+++ b/benchmarks/dynamo/benchmarks.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+import argparse
+import os
+
+from typing import Set
+
+# Note - hf and timm have their own version of this, torchbench does not
+# TOOD(voz): Someday, consolidate all the files into one runner instead of a shim like this...
+def model_names(filename: str) -> Set[str]:
+    names = set()
+    with open(filename, "r") as fh:
+        lines = fh.readlines()
+        lines = [line.rstrip() for line in lines]
+        for line in lines:
+            line_parts = line.split(" ")
+            if len(line_parts) == 1:
+                line_parts = line.split(",")
+            model_name = line_parts[0]
+            names.add(model_name)
+    return names
+
+
+TIMM_MODEL_NAMES = model_names(
+    os.path.join(os.path.dirname(__file__), "timm_models_list.txt")
+)
+HF_MODELS_FILE_NAME = model_names(
+    os.path.join(os.path.dirname(__file__), "huggingface_models_list.txt")
+)
+TORCHBENCH_MODELS_FILE_NAME = model_names(
+    os.path.join(os.path.dirname(__file__), "all_torchbench_models_list.txt")
+)
+
+# timm <> HF disjoint
+assert TIMM_MODEL_NAMES.isdisjoint(HF_MODELS_FILE_NAME)
+# timm <> torch disjoint
+assert TIMM_MODEL_NAMES.isdisjoint(TORCHBENCH_MODELS_FILE_NAME)
+# torch <> hf disjoint
+assert TORCHBENCH_MODELS_FILE_NAME.isdisjoint(HF_MODELS_FILE_NAME)
+
+
+def parse_args(args=None):
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--only",
+        help="""Run just one model from whichever model suite it belongs to. Or
+        specify the path and class name of the model in format like:
+        --only=path:<MODEL_FILE_PATH>,class:<CLASS_NAME>
+
+        Due to the fact that dynamo changes current working directory,
+        the path should be an absolute path.
+
+        The class should have a method get_example_inputs to return the inputs
+        for the model. An example looks like
+        ```
+        class LinearModel(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = nn.Linear(10, 10)
+
+            def forward(self, x):
+                return self.linear(x)
+
+            def get_example_inputs(self):
+                return (torch.randn(2, 10),)
+        ```
+    """,
+    )
+    return parser.parse_known_args(args)
+
+
+if __name__ == "__main__":
+    args, unknown = parse_args()
+    if args.only:
+        name = args.only
+        if name in TIMM_MODEL_NAMES:
+            import timm_models
+
+            timm_models.timm_main()
+        elif name in HF_MODELS_FILE_NAME:
+            import huggingface
+
+            huggingface.huggingface_main()
+        elif name in TORCHBENCH_MODELS_FILE_NAME:
+            import torchbench
+
+            torchbench.torchbench_main()
+        else:
+            print(f"Illegal model name? {name}")
+            exit(-1)
+    else:
+        import torchbench
+
+        torchbench.torchbench_main()
+
+        import huggingface
+
+        huggingface.huggingface_main()
+
+        import timm_models
+
+        timm_models.timm_main()
diff --git a/benchmarks/dynamo/check_hf_bert_perf_csv.py b/benchmarks/dynamo/check_hf_bert_perf_csv.py
index 6887c4fcb64b..dc269890d238 100644
--- a/benchmarks/dynamo/check_hf_bert_perf_csv.py
+++ b/benchmarks/dynamo/check_hf_bert_perf_csv.py
@@ -16,7 +16,9 @@ def check_hf_bert_perf_csv(filename):
     for _, row in df.iterrows():
         model_name = row["name"]
         speedup = row["speedup"]
-        if speedup < 1.185:
+        # Reduced from 1.19 to 1.17, see https://github.com/pytorch/pytorch/issues/94687
+        # Reduce further to 1.165 due to runner and run to run variances
+        if speedup < 1.165:
             failed.append(model_name)
 
         print(f"{model_name:34} {speedup}")
diff --git a/benchmarks/dynamo/check_memory_compression_ratio.py b/benchmarks/dynamo/check_memory_compression_ratio.py
new file mode 100644
index 000000000000..3308758943e3
--- /dev/null
+++ b/benchmarks/dynamo/check_memory_compression_ratio.py
@@ -0,0 +1,57 @@
+import argparse
+import sys
+import textwrap
+
+import pandas as pd
+
+
+def main(args):
+    actual = pd.read_csv(args.actual)
+    expected = pd.read_csv(args.expected)
+    failed = []
+
+    for name in actual["name"]:
+        actual_memory_compression = float(
+            actual.loc[actual["name"] == name]["compression_ratio"]
+        )
+        try:
+            expected_memory_compression = float(
+                expected.loc[expected["name"] == name]["compression_ratio"]
+            )
+        except TypeError:
+            print(f"{name:34} is missing from {args.expected}")
+            continue
+        if actual_memory_compression >= expected_memory_compression * 0.95:
+            status = "PASS"
+        else:
+            status = "FAIL"
+            failed.append(name)
+        print(
+            f"""
+            {name:34}:
+                actual_memory_compression={actual_memory_compression:.2f},
+                expected_memory_compression={expected_memory_compression:.2f},
+                {status}
+            """
+        )
+
+    if failed:
+        print(
+            textwrap.dedent(
+                f"""
+                Error: {len(failed)} models below expected memory compression ratio:
+                    {' '.join(failed)}
+                If this drop is expected, you can update `{args.expected}`.
+                """
+            )
+        )
+        sys.exit(1)
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--actual", type=str, required=True)
+parser.add_argument("--expected", type=str, required=True)
+args = parser.parse_args()
+
+if __name__ == "__main__":
+    main(args)
diff --git a/benchmarks/dynamo/combine_csv.py b/benchmarks/dynamo/combine_csv.py
new file mode 100644
index 000000000000..b579e0a1bbbd
--- /dev/null
+++ b/benchmarks/dynamo/combine_csv.py
@@ -0,0 +1,49 @@
+# This script takes csvs produced by parse_logs.py and combines them
+# into a single CSV file
+
+import ast
+import csv
+import sys
+from collections import defaultdict
+
+assert len(sys.argv) == 3
+
+RESULTS = defaultdict(dict)
+
+for side, f in zip(["static", "dynamic"], sys.argv[1:]):
+    with open(f, "r") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            RESULTS[(row["bench"], row["name"])][side] = row
+
+fields = ["frame_time", "graph_breaks"]
+
+out = csv.DictWriter(
+    sys.stdout,
+    ["bench", "name"] + [f"delta_{n}" for n in fields] + ["static_url", "dynamic_url"],
+    dialect="excel",
+)
+out.writeheader()
+
+for (bench, name), sides in RESULTS.items():
+    if "static" not in sides:
+        continue
+    if "dynamic" not in sides:
+        continue
+    if not name:
+        out.writerow(
+            {
+                "static_url": sides["static"]["explain"],
+                "dynamic_url": sides["dynamic"]["explain"],
+            }
+        )
+        continue
+    row = {"bench": bench, "name": name}
+    for f in fields:
+        try:
+            static = ast.literal_eval(sides["static"][f])
+            dynamic = ast.literal_eval(sides["dynamic"][f])
+        except SyntaxError:
+            continue
+        row[f"delta_{f}"] = dynamic - static
+    out.writerow(row)
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index cccf77a8059c..88ca3ff734ad 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -6,6 +6,7 @@
 import functools
 import importlib
 import io
+import itertools
 import logging
 import os
 import random
@@ -13,7 +14,6 @@
 import subprocess
 import sys
 import time
-import warnings
 from contextlib import contextmanager
 from typing import NamedTuple
 
@@ -27,8 +27,6 @@
 import torch.distributed
 from scipy.stats import gmean, ttest_ind
 from torch._dynamo.exc import BackendCompilerFailed
-from torch._dynamo.optimizations import backends
-from torch._dynamo.optimizations.log_args import conv_args_analysis
 from torch._dynamo.profiler import fx_insert_profiling, Profiler
 from torch._dynamo.testing import dummy_fx_compile, format_speedup, same
 from torch._dynamo.utils import clone_inputs
@@ -66,10 +64,14 @@ class CI(NamedTuple):
     backend: str  # aot_eager or inductor
     training: bool
     dynamic: bool = False
+    device: str = "cuda"
 
 
 CI_SKIP = collections.defaultdict(list)
 
+
+# Skips for dynamic=False
+
 CI_SKIP[CI("aot_eager", training=False)] = [
     # TorchBench
     "DALLE2_pytorch",  # AttributeError: text_encodings
@@ -114,13 +116,10 @@ class CI(NamedTuple):
     "fbnetv3_b",  # Accuracy (blocks.2.2.bn1.weight.grad)
     "levit_128",  # Accuracy (patch_embed.0.c.weight.grad)
     "sebotnet33ts_256",  # Accuracy (stem.conv1.conv.weight.grad)
-    "xcit_large_24_p8_224",  # fp64_OOM
-]
-
-CI_SKIP[CI("aot_eager", training=True, dynamic=True)] = [
-    *CI_SKIP[CI("aot_eager", training=True)],
-    "crossvit_9_240",  # torch._C._nn.upsample_bicubic2d
-    "twins_pcpvt_base",  # timeout
+    "xcit_large_24_p8_224",  # fp64_OOM,
+    "gernet_l",  # accuracy https://github.com/pytorch/pytorch/issues/93847
+    "gluon_xception65",  # accuracy https://github.com/pytorch/pytorch/issues/93847
+    "tinynet_a",  # accuracy https://github.com/pytorch/pytorch/issues/93847
 ]
 
 CI_SKIP[CI("inductor", training=False)] = [
@@ -141,9 +140,41 @@ class CI(NamedTuple):
     # Huggingface
     "AllenaiLongformerBase",
     "DebertaV2ForQuestionAnswering",  # OOM
+    "OPTForCausalLM",  # OOM
+    # TIMM
+    "cait_m36_384",  # Accuracy
+    "botnet26t_256",  # accuracy https://github.com/pytorch/pytorch/issues/93847
+    "gluon_xception65",  # accuracy https://github.com/pytorch/pytorch/issues/93847
+]
+
+CI_SKIP[CI("inductor", training=False, device="cpu")] = [
+    # TorchBench
+    "drq",  # Need to update torchbench
+    "detectron2_fasterrcnn_r_101_c4",
+    "detectron2_fasterrcnn_r_101_dc5",
+    "detectron2_fasterrcnn_r_101_fpn",
+    "detectron2_fasterrcnn_r_50_c4",
+    "detectron2_fasterrcnn_r_50_dc5",
+    "detectron2_fasterrcnn_r_50_fpn",
+    "detectron2_fcos_r_50_fpn",
+    "detectron2_maskrcnn_r_101_c4",
+    "detectron2_maskrcnn_r_101_fpn",
+    "detectron2_maskrcnn_r_50_c4",
+    "detectron2_maskrcnn_r_50_fpn",
+    "hf_GPT2_large",  # Intermittent failure on CI
+    "mobilenet_v2_quantized_qat",
+    "pyhpc_turbulent_kinetic_energy",
+    "vision_maskrcnn",
+    "resnet50_quantized_qat",  # Eager model failed to run(Quantize only works on Float Tensor, got Double)
+    # Huggingface
+    "AllenaiLongformerBase",
+    "BartForConditionalGeneration",  # OOM
+    "DebertaV2ForQuestionAnswering",  # OOM
+    "MBartForConditionalGeneration",  # Accuracy https://github.com/pytorch/pytorch/issues/94793
+    "PLBartForConditionalGeneration",  # Accuracy https://github.com/pytorch/pytorch/issues/94794
     # TIMM
     "cait_m36_384",  # Accuracy
-    "ghostnet_100",  # Accuracy
+    "pnasnet5large",  # OOM
 ]
 
 CI_SKIP[CI("inductor", training=True)] = [
@@ -166,14 +197,47 @@ class CI(NamedTuple):
     "eca_halonext26ts",  # accuracy
     "fbnetv3_b",  # accuracy
     "levit_128",  # fp64_OOM
+    # https://github.com/pytorch/pytorch/issues/94066
+    "sebotnet33ts_256",  # Accuracy failed for key name stem.conv1.conv.weight.grad
     "xcit_large_24_p8_224",  # fp64_OOM
 ]
 
+# Skips for dynamic=True
+
+CI_SKIP[CI("aot_eager", training=False, dynamic=True)] = [
+    *CI_SKIP[CI("aot_eager", training=False)],
+    # torchbench
+    "vision_maskrcnn",  # 'literal' is an illegal expression for augmented assignment
+]
+
+CI_SKIP[CI("aot_eager", training=True, dynamic=True)] = [
+    *CI_SKIP[CI("aot_eager", training=True)],
+    *CI_SKIP[CI("aot_eager", training=False, dynamic=True)],
+]
+
+CI_SKIP[CI("inductor", training=False, dynamic=True)] = [
+    *CI_SKIP[CI("aot_eager", training=False, dynamic=True)],
+    *CI_SKIP[CI("inductor", training=False)],
+    # torchbench
+    "functorch_dp_cifar10",  # timeout
+    "opacus_cifar10",  # timeout
+    # timm_models
+    "pnasnet5large",  # ceiling is not defined
+    "volo_d1_224",  # ceiling is not defined
+]
+
+CI_SKIP[CI("inductor", training=True, dynamic=True)] = [
+    # NB: Intentionally omitting for symmetry with dynamic=False
+    # *CI_SKIP[CI("aot_eager", training=True, dynamic=True)],
+    *CI_SKIP[CI("inductor", training=False, dynamic=True)],
+    *CI_SKIP[CI("inductor", training=True)],
+    # TODO: Fill this in
+]
+
 
 CI_SKIP_OPTIMIZER = {
     # TIMM
     "convmixer_768_32",  # accuracy
-    "sebotnet33ts_256",  # accuracy
     "hrnet_w18",  # Stack issue in fx
     # TorchBench
     "dlrm",  # symbolic shapes error
@@ -276,6 +340,8 @@ def print_summary(filename):
     data = pd.read_csv(filename)
     if "tag" in data.columns:
         for tag in data.tag.unique():
+            if tag == "0.0000":
+                continue  # This happens for failed runs
             print(f"\nSummary for tag={tag}:")
             print_summary_table(data[data.tag == tag])
     else:
@@ -289,21 +355,21 @@ def print_summary_table(data):
             if col in ("dev", "name", "batch_size", "tag"):
                 continue
             elif col in ("pct_ops", "pct_time"):
-                print(col.ljust(width), f"{data[col].mean():.1%}")
+                print(col.ljust(width), f"{data[col].mean():.3%}")
             elif col in ("graphs", "graph_calls", "captured_ops", "total_ops"):
-                print(col.ljust(width), f"{data[col].mean():.1f}")
+                print(col.ljust(width), f"{data[col].mean():.3f}")
             elif col in ("compilation_latency"):
-                print(col.ljust(width), f"mean={data[col].mean():.1f} seconds")
+                print(col.ljust(width), f"mean={data[col].mean():.3f} seconds")
             elif col in ("compression_ratio"):
-                print(col.ljust(width), f"mean={data[col].mean():.1f}x")
+                print(col.ljust(width), f"mean={data[col].mean():.3f}x")
             elif col in ("accuracy"):
                 pass_rate = (data[col] == "pass").mean()
-                print(col.ljust(width), f"pass_rate={100*pass_rate:.1f}%")
+                print(col.ljust(width), f"pass_rate={100*pass_rate:.2f}%")
             else:
                 cdata = data[col].clip(1)
                 print(
                     col.ljust(width),
-                    f"gmean={gmean(cdata):.2f}x mean={cdata.mean():.2f}x",
+                    f"gmean={gmean(cdata):.2f}x mean={cdata.mean():.3f}x",
                 )
         except Exception as e:
             pass
@@ -513,6 +579,7 @@ def maybe_mark_profile(*args, **kwargs):
     # Use higher tolerance for XLA since XLA cause numerical unstability when
     # graph size changes
     tolerance = args.xla_tolerance if args.trace_on_xla else 1e-4
+    torch._dynamo.config.repro_tolerance = tolerance
 
     with maybe_profile(enabled=args.export_profiler_trace) as p:
         frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
@@ -758,70 +825,6 @@ def try_script(model, example_inputs):
         return None
 
 
-def speedup_experiment_onnx(args, model_iter_fn, model, example_inputs):
-    """
-    Measure baseline performance (without using TorchDynamo) of ONNXRT and TensorFlow.
-
-    Writes to ./baseline_onnx.csv
-    """
-    if current_device == "cpu":
-        m_onnxrt = backends.onnxrt_cpu(
-            try_script(model, example_inputs), example_inputs
-        )
-    else:
-        m_onnxrt = backends.onnxrt_cuda(
-            try_script(model, example_inputs), example_inputs
-        )
-
-    if current_name != "timm_resnest":
-        m_onnx2tf = backends.onnx2tf(try_script(model, example_inputs), example_inputs)
-    else:
-        # this one takes 8+ hours to finish
-        m_onnx2tf = None
-
-    return baselines(
-        [
-            ("eager", model),
-            ("onnxrt", m_onnxrt),
-            ("onnx2tf", m_onnx2tf),
-        ],
-        model_iter_fn,
-        example_inputs,
-        args,
-    )
-
-
-def speedup_experiment_trt(args, model_iter_fn, model, example_inputs):
-    """
-    Measure baseline performance (without using TorchDynamo) of TensorRT.
-
-    Writes to ./baseline_trt.csv
-    """
-    m_onnx2trt = backends.onnx2tensorrt(
-        try_script(model, example_inputs), example_inputs
-    )
-
-    m_torch2trt = backends.torch2trt(model, example_inputs)
-
-    if current_name != "opacus_cifar10":
-        m_fx2trt = backends.fx2trt(model, example_inputs)
-    else:
-        # fx2trt infinite loops on one model
-        m_fx2trt = None
-
-    return baselines(
-        [
-            ("eager", model),
-            ("onnx2trt", m_onnx2trt),
-            ("torch2trt", m_torch2trt),
-            ("fx2trt", m_fx2trt),
-        ],
-        model_iter_fn,
-        example_inputs,
-        args,
-    )
-
-
 def read_batch_size_from_file(args, filename, model_name):
     batch_size = None
     if os.path.exists("benchmarks"):
@@ -930,6 +933,20 @@ def scale(self, loss):
         return loss
 
 
+def get_dynamo_stats():
+    # TODO: consider deepcopy'ing the entire counters struct and
+    # adding a helper to do subtraction on it
+    return collections.Counter(
+        {
+            "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"],
+            "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"],
+            "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()),
+            # NB: The plus removes zero counts
+            "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]),
+        }
+    )
+
+
 def maybe_fresh_cache(fn, is_cold_start):
     def inner(*args, **kwargs):
         cache_minder = NullContext()
@@ -1139,7 +1156,8 @@ def batch_size_finder(self, device, model_name, initial_batch_size=1024):
             batch_size = self.decay_batch_exp(batch_size)
         return 1
 
-    def run_n_iterations(self, mod, inputs, n=2):
+    def run_n_iterations(self, mod, inputs):
+        n = self.args.iterations
         for _ in range(n - 1):
             self.model_iter_fn(mod, inputs, collect_outputs=False)
         return self.model_iter_fn(mod, inputs, collect_outputs=True)
@@ -1171,8 +1189,9 @@ def check_accuracy(
         1) Collect the outputs with fp64 datatype. This is useful for error checking.
         2) Checks if eager itself has variations.
         """
+        start_stats = get_dynamo_stats()
 
-        def record_status(accuracy_status):
+        def record_status(accuracy_status, dynamo_start_stats):
             """
             Records the status in the csv file
             """
@@ -1187,11 +1206,17 @@ def record_status(accuracy_status):
                 headers.insert(3, "tag")
                 fields.insert(3, tag)
 
+            dynamo_stats = get_dynamo_stats()
+            dynamo_stats.subtract(dynamo_start_stats)
+            for k, v in dynamo_stats.items():
+                headers.append(k)
+                fields.append(v)
+
             output_csv(output_filename, headers, fields)
             return "PASS" if accuracy_status in ("pass", "pass_due_to_skip") else "FAIL"
 
         if name in self.skip_accuracy_checks_large_models_dashboard:
-            return record_status("pass_due_to_skip")
+            return record_status("pass_due_to_skip", dynamo_start_stats=start_stats)
 
         def deepcopy_and_maybe_ddp(model):
             model = copy.deepcopy(model)
@@ -1199,8 +1224,9 @@ def deepcopy_and_maybe_ddp(model):
                 model = DDP(model, find_unused_parameters=True)
             elif self.args.fsdp:
                 model = FSDP(model, use_orig_params=True)
-                torch._inductor.config.triton.cudagraphs = False
-                log.warn("Disabling cudagraphs for FSDP compatibility")
+                if torch._inductor.config.triton.cudagraphs:
+                    log.warning("Disabling cudagraphs for FSDP compatibility")
+                    torch._inductor.config.triton.cudagraphs = False
             return model
 
         # Collect the fp64 reference outputs to be used later for accuracy checking.
@@ -1248,11 +1274,11 @@ def deepcopy_and_maybe_ddp(model):
             if not same(
                 correct_result,
                 correct_rerun_result,
-                fp64_outputs,
+                fp64_ref=None,  # Two eager runs should be the same without comparing against fp64_output
                 equal_nan=self.equal_nan,
             ):
                 accuracy_status = "eager_variation"
-                return record_status(accuracy_status)
+                return record_status(accuracy_status, dynamo_start_stats=start_stats)
             correct_rerun_result = None
 
             # Run with Dynamo
@@ -1276,13 +1302,17 @@ def deepcopy_and_maybe_ddp(model):
                     )
                 ):
                     accuracy_status = "pass_due_to_skip"
-                    return record_status(accuracy_status)
+                    return record_status(
+                        accuracy_status, dynamo_start_stats=start_stats
+                    )
                 else:
                     print(
                         "TorchDynamo optimized model failed to run because of following error"
                     )
                     accuracy_status = "fail_to_run"
-                    return record_status(accuracy_status)
+                    return record_status(
+                        accuracy_status, dynamo_start_stats=start_stats
+                    )
             if not same(
                 correct_result,
                 new_result,
@@ -1295,15 +1325,16 @@ def deepcopy_and_maybe_ddp(model):
                     accuracy_status = "pass_due_to_skip"
                 else:
                     accuracy_status = "fail_accuracy"
-                return record_status(accuracy_status)
+                return record_status(accuracy_status, dynamo_start_stats=start_stats)
 
-        return record_status(accuracy_status)
+        return record_status(accuracy_status, dynamo_start_stats=start_stats)
 
     def run_performance_test(
         self, name, model, example_inputs, optimize_ctx, experiment, tag=None
     ):
         def warmup(fn, model, example_inputs, mode, niters=5):
             peak_mem = 0
+            start_stats = get_dynamo_stats()
             try:
                 if current_device == "cuda":
                     torch.cuda.reset_peak_memory_stats()
@@ -1322,7 +1353,9 @@ def warmup(fn, model, example_inputs, mode, niters=5):
             except Exception as e:
                 log.exception(f"Failed for {mode} {e}")
                 return sys.exit(-1)
-            return latency, peak_mem
+            dynamo_stats = get_dynamo_stats()
+            dynamo_stats.subtract(start_stats)
+            return latency, peak_mem, dynamo_stats
 
         # Cast the model to float16/float32 as necessary
         model, example_inputs = self.maybe_cast(model, example_inputs)
@@ -1334,11 +1367,11 @@ def warmup(fn, model, example_inputs, mode, niters=5):
                 experiment_kwargs["tag"] = tag
             results = []
 
-            eager_latency, eager_peak_mem = warmup(
+            eager_latency, eager_peak_mem, _ = warmup(
                 self.model_iter_fn, model, example_inputs, "eager"
             )
             optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
-            dynamo_latency, dynamo_peak_mem = warmup(
+            dynamo_latency, dynamo_peak_mem, dynamo_stats = warmup(
                 optimized_model_iter_fn, model, example_inputs, "dynamo"
             )
 
@@ -1355,6 +1388,7 @@ def warmup(fn, model, example_inputs, mode, niters=5):
             if experiment.func is speedup_experiment:
                 experiment_kwargs["compilation_latency"] = compilation_time
                 experiment_kwargs["compression_ratio"] = compression_ratio
+                experiment_kwargs["dynamo_stats"] = dynamo_stats
 
             if experiment.func is coverage_experiment:
                 ok, total = Stats.reset_counters()
@@ -1378,56 +1412,6 @@ def warmup(fn, model, example_inputs, mode, niters=5):
             results.append(experiment(model, example_inputs, **experiment_kwargs))
             return " ".join(map(str, results))
 
-    def compare_branches(
-        self,
-        name,
-        model,
-        example_inputs,
-        optimize_ctx,
-        experiment,
-        explain,
-        comparison_branch=None,
-        branch=None,
-    ):
-        assert branch is None, "Branch set during top level flow."
-        import git
-
-        repo = git.Repo()
-        curr_branch = repo.active_branch.name
-        if curr_branch != comparison_branch:
-            # Run current
-            try:
-                self.run_one_model(
-                    name,
-                    model,
-                    example_inputs,
-                    optimize_ctx,
-                    experiment,
-                    explain=explain,
-                    branch=curr_branch,
-                    tag=curr_branch,
-                )
-                # Run comparison branch
-                repo.git.checkout(comparison_branch)
-                self.run_one_model(
-                    name,
-                    model,
-                    example_inputs,
-                    optimize_ctx,
-                    experiment,
-                    explain=explain,
-                    branch=comparison_branch,
-                    tag=comparison_branch,
-                )
-            finally:
-                # Swap back
-                repo.git.checkout(curr_branch)
-            return
-        else:
-            raise RuntimeError(
-                f"--diff-branch: current branch is same as {comparison_branch} branch, what are you diffing?"
-            )
-
     def run_one_model(
         self,
         name,
@@ -1435,29 +1419,17 @@ def run_one_model(
         example_inputs,
         optimize_ctx,
         experiment,
-        comparison_branch=None,
-        branch=None,
         explain=False,
         tag=None,
     ):
-        if comparison_branch is not None:
-            self.compare_branches(
-                name,
-                model,
-                example_inputs,
-                optimize_ctx,
-                experiment,
-                comparison_branch=comparison_branch,
-                explain=explain,
-            )
-            return
         mode = "train" if self.args.training else "eval"
         msg = f"{current_device:4} {mode:5} {current_name:34} "
-        if branch:
-            msg += f" {branch:26}"
+        if tag:
+            msg += f" {tag:26}"
         print(msg, end=" ", flush=True)
-        start_calls_captured = torch._dynamo.utils.counters["stats"]["calls_captured"]
-        start_unique_graphs = torch._dynamo.utils.counters["stats"]["unique_graphs"]
+
+        start_stats = get_dynamo_stats()
+
         if self.args.accuracy:
             status = self.check_accuracy(
                 name, model, example_inputs, optimize_ctx, experiment, tag
@@ -1469,16 +1441,26 @@ def run_one_model(
             )
             print(status)
         if self.args.timing:
-            from torch._dynamo.utils import print_time_report
+            from torch._dynamo.utils import op_count, print_time_report
+            from torch.utils._stats import simple_call_counter
 
             print_time_report()
+            stats = "STATS: "
+            stats = stats + " | ".join(
+                itertools.chain(
+                    [f"call_* op count: {op_count}"],
+                    (f"{key}:{value}" for key, value in simple_call_counter.items()),
+                )
+            )
+            print(stats)
+        stats = get_dynamo_stats()
+        stats.subtract(start_stats)
 
-        end_calls_captured = torch._dynamo.utils.counters["stats"]["calls_captured"]
-        end_unique_graphs = torch._dynamo.utils.counters["stats"]["unique_graphs"]
         if explain:
             print(
-                f"Dynamo produced {end_unique_graphs-start_unique_graphs} graph(s) "
-                f"covering {end_calls_captured-start_calls_captured} ops"
+                f"Dynamo produced {stats['unique_graphs']} graphs "
+                f"covering {stats['calls_captured']} ops with "
+                f"{stats['graph_breaks']} graph breaks ({stats['unique_graph_breaks']} unique)"
             )
 
 
@@ -1486,6 +1468,13 @@ def help(fn):
     return fn.__doc__
 
 
+diff_branch_default = "DIFF-BRANCH-DEFAULT"
+
+
+def should_diff_branch(args):
+    return args.diff_branch != diff_branch_default
+
+
 def parse_args(args=None):
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -1564,7 +1553,12 @@ def parse_args(args=None):
         default=False,
         help="use channels last format",
     )
-    parser.add_argument("--batch_size", type=int, help="batch size for benchmarking")
+    parser.add_argument(
+        "--batch-size", "--batch_size", type=int, help="batch size for benchmarking"
+    )
+    parser.add_argument(
+        "--iterations", type=int, default=2, help="how many iterations to run"
+    )
     parser.add_argument(
         "--batch-size-file", type=str, help="String to load batch size from"
     )
@@ -1692,10 +1686,20 @@ def get_example_inputs(self):
         action="store_true",
         help="exports trace of kineto profiler",
     )
-    parser.add_argument("--profiler_trace_name", help="Overwrites exported trace name")
+    parser.add_argument(
+        "--profiler-trace-name",
+        "--profiler_trace_name",
+        help="Overwrites exported trace name",
+    )
 
     parser.add_argument(
-        "--diff-branch", default=None, help="Delta current branch against given branch."
+        "--diff-branch",
+        default=diff_branch_default,
+        help="delta current branch against given branch.",
+    )
+
+    parser.add_argument(
+        "--tag", default=None, help="Specify a tag to be included in csv files."
     )
 
     parser.add_argument(
@@ -1705,6 +1709,7 @@ def get_example_inputs(self):
     )
 
     parser.add_argument(
+        "--cold-start-latency",
         "--cold_start_latency",
         action="store_true",
         help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
@@ -1714,6 +1719,11 @@ def get_example_inputs(self):
         action="store_true",
         help="Disables cudagraphs for Inductor",
     )
+    parser.add_argument(
+        "--print-graph-breaks",
+        action="store_true",
+        help="Show a warning whenever graph break",
+    )
     parser.add_argument(
         "--trace-on-xla",
         action="store_true",
@@ -1740,6 +1750,19 @@ def get_example_inputs(self):
         help="Print n/k models message between each model run.",
     )
 
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=1800,
+        help="timeout (second) for benchmarking.",
+    )
+
+    parser.add_argument(
+        "--per_process_memory_fraction",
+        type=float,
+        default=1,
+        help="Set per-process GPU memory fraction (limit) for reducing usable size and reproducing OOMs",
+    )
     group_fuser = parser.add_mutually_exclusive_group()
     # --nvfuser is now the default, keep the option to not break scripts
     group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS)
@@ -1770,12 +1793,6 @@ def get_example_inputs(self):
     group.add_argument(
         "--overhead", action="store_true", help=help(overhead_experiment)
     )
-    group.add_argument(
-        "--speedup-onnx", action="store_true", help=help(speedup_experiment_onnx)
-    )
-    group.add_argument(
-        "--speedup-trt", action="store_true", help=help(speedup_experiment_trt)
-    )
     group.add_argument(
         "--speedup-dynamo-ts",
         action="store_true",
@@ -1804,14 +1821,9 @@ def get_example_inputs(self):
         action="store_true",
         help="Measure speedup with TorchInductor",
     )
-    group.add_argument(
-        "--inductor-dynamic",
-        action="store_true",
-        help="Measure speedup with TorchInductor",
-    )
     group.add_argument(
         "--backend",
-        choices=torch._dynamo.list_backends(),
+        choices=torch._dynamo.list_backends(exclude_tags=None),
         help="measure speedup with a given backend",
     )
     group.add_argument("--nothing", action="store_true", help=help(null_experiment))
@@ -1821,6 +1833,7 @@ def get_example_inputs(self):
         help="Dump convolution input/weight/bias's shape/stride/dtype and other options to json",
     )
     group.add_argument(
+        "--recompile-profiler",
         "--recompile_profiler",
         action="store_true",
         help="Run the dynamo recompilation profiler on each model.",
@@ -1848,7 +1861,7 @@ def main(runner, original_dir=None):
         os.chdir(original_dir)
     args = parse_args()
 
-    if args.diff_branch:
+    if should_diff_branch(args):
         import git
 
         # We do this here so we error out earlier if there's an issue
@@ -1857,13 +1870,18 @@ def main(runner, original_dir=None):
             raise RuntimeError(
                 "--diff-branch called on dirty branch. Commit, stash, or reset."
             )
+        main_branch = repo.active_branch.name
+        if main_branch == args.diff_branch:
+            raise RuntimeError(
+                f"--diff-branch: current branch is same as {args.diff_branch} branch, what are you diffing?"
+            )
 
     with maybe_init_distributed(
         (args.ddp or args.fsdp) and args.only, port=args.distributed_master_port
     ):
-        return maybe_fresh_cache(run, args.cold_start_latency and args.only)(
-            runner, args, original_dir
-        )
+        return maybe_fresh_cache(
+            run, (args.cold_start_latency and args.only) or args.ci
+        )(runner, args, original_dir)
 
 
 def run(runner, args, original_dir=None):
@@ -1880,17 +1898,9 @@ def run(runner, args, original_dir=None):
     if args.dynamic_ci_skips_only:
         args.dynamic_shapes = True
         args.ci = True
-        # We only have a CI skip list for aot_eager right now.  When inductor
-        # comes online, add that skip list too.
-        assert (
-            args.backend == "aot_eager"
-        ), "--dynamic-ci-skips only works with aot_eager backend at the moment"
     if args.dynamic_shapes:
         torch._dynamo.config.dynamic_shapes = True
-        torch._functorch.config.use_dynamic_shapes = True
     if args.ci:
-        # Only dump error on CI
-        args.quiet = True
         args.repeat = 2
         if args.dynamic_ci_skips_only:
             # Test only the incremental set of jobs whose skipped was
@@ -1901,9 +1911,11 @@ def run(runner, args, original_dir=None):
                 set(CI_SKIP[ci(dynamic=True)]) - set(CI_SKIP[ci(dynamic=False)])
             )
         else:
-            args.exclude_exact = CI_SKIP[
-                CI(args.backend, training=args.training, dynamic=args.dynamic_shapes)
-            ]
+            ci = functools.partial(
+                CI, args.backend, training=args.training, dynamic=args.dynamic_shapes
+            )
+            for device in args.devices:
+                args.exclude_exact.extend(CI_SKIP[ci(device=device)])
     if args.ddp:
         # TODO: we could also hook DDP bench up to --speedup bench, _not_ for mgpu e2e perf,
         # but just to measure impact on singlenode of performing graph-breaks.
@@ -1941,6 +1953,8 @@ def run(runner, args, original_dir=None):
             # TODO - Using train mode for timm_models. Move to train mode for HF and Torchbench as well.
             args.use_eval_mode = True
         inductor_config.fallback_random = True
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.benchmark = False
         torch.backends.cudnn.deterministic = True
 
         # Remove randomeness when torch manual seed is called
@@ -2015,6 +2029,9 @@ def run(runner, args, original_dir=None):
     if args.verbose:
         torch._dynamo.config.log_level = logging.DEBUG
 
+    if args.print_graph_breaks:
+        torch._dynamo.config.print_graph_breaks = True
+
     if args.quiet:
         torch._dynamo.config.log_level = logging.ERROR
 
@@ -2032,7 +2049,7 @@ def run(runner, args, original_dir=None):
     if args.devices == ["cpu"]:
         runner.skip_models.update(runner.very_slow_models)
 
-    if args.inductor or args.inductor_dynamic or args.inductor_settings:
+    if args.inductor or args.inductor_settings:
         runner.skip_models.update(runner.failing_torchinductor_models)
         if args.float16:
             # TODO(jansel): check if correctness issue is real
@@ -2062,52 +2079,18 @@ def run(runner, args, original_dir=None):
         optimize_ctx = torch._dynamo.optimize(dummy_fx_compile, nopython=args.nopython)
         experiment = speedup_experiment
         output_filename = "overheads.csv"
-    elif args.inductor or args.inductor_dynamic:
+    elif args.inductor:
         inductor_config.debug = args.verbose
         if args.threads:
             inductor_config.cpp.threads = args.threads
 
-        if args.inductor_dynamic:
-            inductor_config.triton.cudagraphs = False
-            inductor_config.dynamic_shapes = True
-        else:
-            inductor_config.dynamic_shapes = False
-            if args.export_profiler_trace:
-                print("Profiling requested, setting cudagraphs to False")
-                inductor_config.triton.cudagraphs = False
-
         optimize_ctx = torch._dynamo.optimize("inductor", nopython=args.nopython)
         experiment = speedup_experiment
         output_filename = "inductor.csv"
-    elif args.speedup_onnx:
-        experiment = speedup_experiment_onnx
-        output_filename = "baseline_onnx.csv"
-    elif args.speedup_trt:
-        experiment = speedup_experiment_trt
-        output_filename = "baseline_trt.csv"
     elif args.speedup_dynamo_ts:
-        optimize_ctx = torch._dynamo.optimize(backends.ts, nopython=args.nopython)
+        optimize_ctx = torch._dynamo.optimize("ts", nopython=args.nopython)
         experiment = speedup_experiment
         output_filename = "speedup_dynamo_ts.csv"
-    elif args.speedup_fx2trt:
-        optimize_ctx = torch._dynamo.optimize(
-            backends.fx2trt_compiler, nopython=args.nopython
-        )
-        experiment = speedup_experiment_fx2trt
-        output_filename = "speedups_fx2trt.csv"
-        runner.skip_models.update(runner.failing_fx2trt_models)
-        args.float32 = True
-        args.float16 = False
-        args.cosine = True
-    elif args.speedup_fx2trt_fp16:
-        optimize_ctx = torch._dynamo.optimize(
-            backends.fx2trt_compiler_fp16, nopython=args.nopython
-        )
-        experiment = speedup_experiment_fx2trt
-        output_filename = "speedups_fx2trt_fp16.csv"
-        args.float32 = False
-        args.float16 = True
-        args.cosine = True
     elif args.prims_nvfuser:
         optimize_ctx = torch._dynamo.optimize("prims_nvfuser", nopython=args.nopython)
         experiment = speedup_experiment
@@ -2127,28 +2110,12 @@ def run(runner, args, original_dir=None):
         optimize_ctx = nothing
         output_filename = "nothing.csv"
     elif args.backend:
-        if args.backend == "ipex":
-            if args.bfloat16:
-                optimize_ctx = torch._dynamo.optimize(
-                    backends.ipex_bf16, nopython=args.nopython
-                )
-            else:
-                assert args.float32, "IPEX only supports fp32 and bf16 for now."
-                optimize_ctx = torch._dynamo.optimize(
-                    backends.ipex_fp32, nopython=args.nopython
-                )
-        else:
-            optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython)
+        optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython)
         experiment = speedup_experiment
         if args.accuracy:
             output_filename = f"accuracy_{args.backend}.csv"
         else:
             output_filename = f"speedup_{args.backend}.csv"
-    elif args.log_conv_args:
-        optimize_ctx = torch._dynamo.optimize(
-            conv_args_analysis, nopython=args.nopython
-        )
-        output_filename = "log_conv_args.csv"
     elif args.recompile_profiler:
         output_filename = "recompile_profiler_log.csv"
         experiment = recompile_profiler_experiment
@@ -2160,8 +2127,7 @@ def run(runner, args, original_dir=None):
         output_filename = "coverage.csv"
 
     if args.inductor or args.backend == "inductor":
-        if args.disable_cudagraphs:
-            inductor_config.triton.cudagraphs = False
+        inductor_config.triton.cudagraphs = not args.disable_cudagraphs
 
     runner.setup_amp()
 
@@ -2187,7 +2153,7 @@ def run(runner, args, original_dir=None):
         if args.profiler_trace_name is None:
             if args.backend:
                 args.profiler_trace_name = args.backend
-            elif args.inductor or args.inductor_dynamic:
+            elif args.inductor:
                 args.profiler_trace_name = "inductor"
             else:
                 args.profiler_trace_name = "profile"
@@ -2196,7 +2162,25 @@ def run(runner, args, original_dir=None):
 
     experiment = functools.partial(experiment, args, runner.model_iter_fn)
 
-    if args.only:
+    if args.only and should_diff_branch(args):
+        import git
+
+        repo = git.Repo()
+        main_branch = repo.active_branch.name
+        try:
+            # Adding diff-branch again to the args will override previous value
+            call_args = (
+                [sys.executable] + sys.argv + [f"--diff-branch={diff_branch_default}"]
+            )
+            # Run for main branch
+            subprocess.check_call(call_args + [f"--tag={main_branch}"])
+            # Run for comparison branch
+            repo.git.checkout(args.diff_branch)
+            subprocess.check_call(call_args + [f"--tag={args.diff_branch}"])
+        finally:
+            # Go back to main branch
+            repo.git.checkout(main_branch)
+    elif args.only:
         model_name = args.only
         for device in args.devices:
             batch_size = args.batch_size
@@ -2234,7 +2218,7 @@ def run(runner, args, original_dir=None):
                     import traceback
 
                     print(traceback.format_exc())
-                    logging.warn(f"{args.only} failed to load")
+                    logging.warning(f"{args.only} failed to load")
                     continue  # bad benchmark implementation
 
             if args.trace_on_xla:
@@ -2262,14 +2246,19 @@ def run(runner, args, original_dir=None):
                 )
                 continue
 
+            if args.per_process_memory_fraction != 1:
+                torch.cuda.set_per_process_memory_fraction(
+                    args.per_process_memory_fraction
+                )
+
             runner.run_one_model(
                 name,
                 model,
                 example_inputs,
                 optimize_ctx,
                 experiment,
-                comparison_branch=args.diff_branch,
                 explain=args.explain,
+                tag=args.tag,
             )
         if args.generate_aot_autograd_stats:
             stats_file = output_filename.split(".csv")[0] + "_stats.csv"
@@ -2303,8 +2292,11 @@ def write_csv():
                     )
 
             try:
+                timeout = args.timeout
+                if should_diff_branch(args):
+                    timeout *= 2
                 subprocess.check_call(
-                    [sys.executable] + sys.argv + [f"--only={name}"], timeout=60 * 20
+                    [sys.executable] + sys.argv + [f"--only={name}"], timeout=timeout
                 )
             except subprocess.TimeoutExpired:
                 print("TIMEOUT", file=sys.stderr)
@@ -2350,6 +2342,6 @@ def log_operator_inputs(model, example_inputs, model_iter_fn, name, args):
 
 
 if __name__ == "__main__":
-    logging.basicConfig(level=logging.WARNING)
-    warnings.filterwarnings("ignore")
-    main()
+    raise RuntimeError(
+        f"You shouldn't run {sys.argv[0]} directly, instead try timm_model.py, torchbench.py or hugginface.py"
+    )
diff --git a/benchmarks/dynamo/dist_util.py b/benchmarks/dynamo/dist_util.py
index 24625c84e1a1..81bed379e282 100644
--- a/benchmarks/dynamo/dist_util.py
+++ b/benchmarks/dynamo/dist_util.py
@@ -38,7 +38,7 @@ def cleanup():
 
 class CustomLinear(torch.nn.Module):
     def __init__(self, a, b):
-        super(CustomLinear, self).__init__()
+        super().__init__()
         self.weight = nn.Parameter(torch.randn(a, b))
 
     def forward(self, x):
@@ -47,7 +47,7 @@ def forward(self, x):
 
 class MyModule(torch.nn.Module):
     def __init__(self, a, b):
-        super(MyModule, self).__init__()
+        super().__init__()
         self.net = nn.Sequential(
             nn.Linear(a, b),
             nn.ReLU(),
@@ -59,7 +59,7 @@ def forward(self, x):
 
 class ToyModel(nn.Module):
     def __init__(self):
-        super(ToyModel, self).__init__()
+        super().__init__()
         self.net = nn.Sequential(
             *[nn.Linear(10, 10000), nn.ReLU()]
             + [nn.Linear(10000, 10000), nn.ReLU()]
diff --git a/benchmarks/dynamo/distributed.py b/benchmarks/dynamo/distributed.py
index 410fab580c77..9d99c4fcb6e1 100644
--- a/benchmarks/dynamo/distributed.py
+++ b/benchmarks/dynamo/distributed.py
@@ -85,7 +85,7 @@ def move_tensor(maybe_tensor):
             dynamo.config.optimize_ddp = False
         if args.dynamo == "inductor" and args.fsdp:
             torch._inductor.config.triton.cudagraphs = False
-            log.warn("disabling inductor cudagraphs for compatibility with FSDP")
+            log.warning("disabling inductor cudagraphs for compatibility with FSDP")
 
         def print_compile(gm, ex):
             print(
@@ -121,24 +121,29 @@ def print_compile(gm, ex):
         help="if set to a str, uses dynamo[str] backend. else, eager",
     )
     parser.add_argument("--verbose", action="store_true")
-    parser.add_argument("--batch_size", default=None)
+    parser.add_argument("--batch-size", "--batch_size", default=None)
     parser.add_argument(
         "--torchviz", action="store_true", help="Dump autograd graph with torchviz"
     )
     parser.add_argument("--profile", action="store_true", help="Run the profiler")
-    parser.add_argument("--trace_file", default="profile.json", help="Run the profiler")
+    parser.add_argument(
+        "--trace-file", "--trace_file", default="profile.json", help="Run the profiler"
+    )
     parser.add_argument("--repeat", default=10, help="Repeats for timing run")
     parser.add_argument(
+        "--dynamo-no-optimize-ddp",
         "--dynamo_no_optimize_ddp",
         action="store_true",
         help="Disable dynamo's ddp optimizer (enabled by default)",
     )
     parser.add_argument(
+        "--fsdp-checkpoint",
         "--fsdp_checkpoint",
         action="store_true",
         help="Use gradient checkpointing via model-specific policy",
     )
     parser.add_argument(
+        "--fsdp-wrap",
         "--fsdp_wrap",
         action="store_true",
         help="Apply fsdp to submodules via model-specific policy",
@@ -150,10 +155,12 @@ def print_compile(gm, ex):
 
     model_arg = parser.add_mutually_exclusive_group(required=True)
     model_arg.add_argument(
-        "--torchbench_model", help="name of torchbench model, e.g. hf_Bert"
+        "--torchbench-model",
+        "--torchbench_model",
+        help="name of torchbench model, e.g. hf_Bert",
     )
     model_arg.add_argument(
-        "--toy_model", action="store_true", help="use toy model instead"
+        "--toy-model", "--toy_model", action="store_true", help="use toy model instead"
     )
     args = parser.parse_args()
 
diff --git a/benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv b/benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
new file mode 100644
index 000000000000..5e05180d3bad
--- /dev/null
+++ b/benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
@@ -0,0 +1,55 @@
+dev,name,batch_size,speedup,abs_latency,compilation_latency,compression_ratio
+cuda,BERT_pytorch,16,2.6028,22.2879,41.0046,1.1965
+cuda,Background_Matting,4,1.1296,112.7632,27.8916,1.0396
+cuda,LearningToPaint,96,1.0951,11.3205,13.0241,0.9960
+cuda,Super_SloMo,6,1.2160,65.3294,27.1633,1.2396
+cuda,alexnet,128,1.1919,8.2399,6.5561,1.0008
+cuda,attention_is_all_you_need_pytorch,256,1.4975,36.6682,43.0610,1.1824
+cuda,dcgan,32,0.9276,2.2476,5.7151,1.0064
+cuda,demucs,4,1.0313,51.7716,12.8195,0.9971
+cuda,densenet121,4,1.1976,46.0111,64.0118,0.9945
+cuda,dlrm,1024,1.3421,3.2177,4.9493,1.0009
+cuda,drq,1,1.0820,3.8157,8.0732,0.9687
+cuda,fastNLP_Bert,6,1.4839,37.9050,32.7583,1.1563
+cuda,functorch_dp_cifar10,64,1.5014,6.9596,14.1516,0.4432
+cuda,hf_Albert,8,2.2452,30.6134,25.9036,1.2649
+cuda,hf_Bart,4,1.7012,34.3999,37.9975,1.0128
+cuda,hf_Bert,4,1.9003,23.3435,34.8196,1.0273
+cuda,hf_Bert_large,4,1.6346,52.8525,62.3112,1.0726
+cuda,hf_BigBird,2,1.9208,105.2672,101.4787,1.1415
+cuda,hf_DistilBert,8,1.3988,22.5793,20.2386,1.0232
+cuda,hf_GPT2,4,1.8075,27.5184,25.3428,1.1562
+cuda,hf_GPT2_large,4,1.7716,118.7404,68.1618,1.1725
+cuda,hf_Reformer,4,1.1744,70.4228,15.1152,0.9266
+cuda,hf_T5,8,1.8778,93.3134,37.0046,1.2279
+cuda,hf_T5_large,2,2.3623,101.5518,143.7982,1.1674
+cuda,lennard_jones,1000,1.0649,1.5233,4.1119,0.9998
+cuda,mnasnet1_0,32,1.1957,19.1993,27.2302,0.7758
+cuda,mobilenet_v2,96,1.4876,32.3311,27.4719,1.1729
+cuda,mobilenet_v3_large,32,1.3051,21.0818,55.7101,0.7771
+cuda,nvidia_deeprecommender,256,1.0182,10.0515,5.1433,0.9711
+cuda,phlippe_densenet,128,1.1230,21.9244,26.8021,1.0062
+cuda,phlippe_resnet,128,1.0857,8.8702,11.5935,1.0037
+cuda,pytorch_CycleGAN_and_pix2pix,1,1.8336,7.4113,13.1523,1.0224
+cuda,pytorch_stargan,16,1.2906,11.6881,45.2834,0.8874
+cuda,pytorch_struct,200,1.2499,3.9393,18.4688,0.7357
+cuda,pytorch_unet,1,1.3525,29.6253,14.6794,1.0087
+cuda,resnet152,32,1.0883,60.3646,65.7002,0.9385
+cuda,resnet18,16,0.9888,10.3945,15.6529,0.6190
+cuda,resnet50,32,1.1437,23.2979,27.0392,0.8824
+cuda,resnext50_32x4d,8,1.0935,19.0480,27.1950,0.7721
+cuda,shufflenet_v2_x1_0,128,1.3027,25.7017,27.9875,1.1015
+cuda,soft_actor_critic,256,0.9965,2.2580,4.6661,0.9995
+cuda,speech_transformer,32,1.8405,35.1645,33.3422,1.0888
+cuda,squeezenet1_1,32,1.4191,7.3454,9.4751,1.1148
+cuda,timm_efficientdet,1,1.6630,78.2697,150.9620,0.9904
+cuda,timm_efficientnet,32,1.2689,28.5348,66.3911,0.9428
+cuda,timm_nfnet,128,1.5319,79.5429,32.9961,1.1070
+cuda,timm_regnet,32,1.0564,56.9897,53.0027,0.9500
+cuda,timm_resnest,32,1.6485,14.3908,56.7240,0.9515
+cuda,timm_vision_transformer,8,1.6100,18.7736,36.9495,0.7301
+cuda,timm_vision_transformer_large,8,1.0842,170.9849,72.0604,0.9762
+cuda,timm_vovnet,32,1.0472,25.4676,24.8428,0.8843
+cuda,tts_angular,64,1.0366,6.9889,4.2683,0.9973
+cuda,vgg16,64,1.2560,52.7072,7.3733,0.9884
+cuda,yolov3,16,1.2600,54.2350,42.4711,1.0108
diff --git a/benchmarks/dynamo/huggingface.py b/benchmarks/dynamo/huggingface.py
index f43104323225..893a50ccb94d 100755
--- a/benchmarks/dynamo/huggingface.py
+++ b/benchmarks/dynamo/huggingface.py
@@ -364,7 +364,7 @@ def rand_int_tensor(device, low, high, shape):
 
 class HuggingfaceRunner(BenchmarkRunner):
     def __init__(self):
-        super(HuggingfaceRunner, self).__init__()
+        super().__init__()
         self.suite_name = "huggingface"
 
     def load_model(
@@ -582,10 +582,14 @@ def refresh_model_names_and_batch_sizes():
             log.warning(f"Failed to find suitable batch size for {model_name}")
 
 
-if __name__ == "__main__":
+def huggingface_main():
     # Code to refresh model names and batch sizes
     # if "--find-batch-sizes" not in sys.argv:
     #     refresh_model_names_and_batch_sizes()
     logging.basicConfig(level=logging.WARNING)
     warnings.filterwarnings("ignore")
     main(HuggingfaceRunner())
+
+
+if __name__ == "__main__":
+    huggingface_main()
diff --git a/benchmarks/dynamo/huggingface_models_list_cpu.txt b/benchmarks/dynamo/huggingface_models_list_cpu.txt
new file mode 100644
index 000000000000..cabd79ac830f
--- /dev/null
+++ b/benchmarks/dynamo/huggingface_models_list_cpu.txt
@@ -0,0 +1,47 @@
+AlbertForMaskedLM,4
+AlbertForQuestionAnswering,4
+AllenaiLongformerBase,4
+BartForCausalLM,4
+BartForConditionalGeneration,2
+BertForMaskedLM,16
+BertForQuestionAnswering,16
+BigBird,32
+BlenderbotForCausalLM,32
+BlenderbotSmallForCausalLM,64
+BlenderbotSmallForConditionalGeneration,64
+CamemBert,16
+DebertaForMaskedLM,32
+DebertaForQuestionAnswering,8
+DebertaV2ForMaskedLM,16
+DebertaV2ForQuestionAnswering,2
+DistilBertForMaskedLM,128
+DistilBertForQuestionAnswering,256
+DistillGPT2,16
+ElectraForCausalLM,8
+ElectraForQuestionAnswering,8
+GoogleFnet,16
+GPT2ForSequenceClassification,4
+LayoutLMForMaskedLM,16
+LayoutLMForSequenceClassification,16
+M2M100ForConditionalGeneration,16
+MBartForCausalLM,4
+MBartForConditionalGeneration,2
+MegatronBertForCausalLM,4
+MegatronBertForQuestionAnswering,8
+MobileBertForMaskedLM,64
+MobileBertForQuestionAnswering,64
+MT5ForConditionalGeneration,16
+OPTForCausalLM,2
+PegasusForCausalLM,32
+PegasusForConditionalGeneration,32
+PLBartForCausalLM,8
+PLBartForConditionalGeneration,4
+RobertaForCausalLM,16
+RobertaForQuestionAnswering,16
+Speech2Text2ForCausalLM,32
+T5ForConditionalGeneration,4
+T5Small,1
+TrOCRForCausalLM,32
+XGLMForCausalLM,8
+XLNetLMHeadModel,8
+YituTechConvBert,16
diff --git a/benchmarks/dynamo/microbenchmarks/microbench.py b/benchmarks/dynamo/microbenchmarks/microbench.py
index 8d783bed5f89..c4fbafe4667e 100755
--- a/benchmarks/dynamo/microbenchmarks/microbench.py
+++ b/benchmarks/dynamo/microbenchmarks/microbench.py
@@ -8,7 +8,7 @@
 import torch
 
 import torch._inductor
-from torch._dynamo.optimizations.backends import cudagraphs_inner
+from torch._dynamo.backends.cudagraphs import cudagraphs_inner
 from torch._dynamo.testing import same
 from torch._inductor.compile_fx import compile_fx
 from torch._inductor.utils import timed
diff --git a/benchmarks/dynamo/microbenchmarks/operator_inp_utils.py b/benchmarks/dynamo/microbenchmarks/operator_inp_utils.py
index 7b7b9a09e5e6..046a1dd9c9b1 100644
--- a/benchmarks/dynamo/microbenchmarks/operator_inp_utils.py
+++ b/benchmarks/dynamo/microbenchmarks/operator_inp_utils.py
@@ -181,7 +181,7 @@ def __torch_dispatch__(self, func_overload, types, args=(), kwargs=None):
         return out
 
     def log_to_file(self, output_filename, *, skip_non_compute_operators=True):
-        sorted_operators = sorted(list(self.func_db.keys()))
+        sorted_operators = sorted(self.func_db.keys())
         with open(output_filename, "w") as f:
             for operator in sorted_operators:
                 if skip_non_compute_operators and non_compute_operator(eval(operator)):
diff --git a/benchmarks/dynamo/microbenchmarks/operatorbench.py b/benchmarks/dynamo/microbenchmarks/operatorbench.py
index 147bf75e9a92..dfbe6248e2cc 100644
--- a/benchmarks/dynamo/microbenchmarks/operatorbench.py
+++ b/benchmarks/dynamo/microbenchmarks/operatorbench.py
@@ -4,7 +4,7 @@
 import torch
 from operator_inp_utils import OperatorInputsLoader
 
-from torch._dynamo.optimizations.backends import cudagraphs_inner
+from torch._dynamo.backends.cudagraphs import cudagraphs_inner
 from torch._dynamo.testing import same
 from torch._inductor import config as inductor_config
 from torch._inductor.compile_fx import compile_fx
diff --git a/benchmarks/dynamo/parse_logs.py b/benchmarks/dynamo/parse_logs.py
index ab9b7589d525..a82648d4dd77 100644
--- a/benchmarks/dynamo/parse_logs.py
+++ b/benchmarks/dynamo/parse_logs.py
@@ -1,7 +1,6 @@
 import csv
 import os
 import re
-import subprocess
 import sys
 
 # This script takes the logs produced by the benchmark scripts (e.g.,
@@ -24,11 +23,6 @@
 if m is not None:
     gist_url = m.group(0)
 
-# Record the current commit hash for ease of reproducibility
-hash = subprocess.check_output(
-    "git rev-parse HEAD".split(" "), encoding="utf-8"
-).rstrip()
-
 # Split the log into an entry per benchmark
 entries = re.split(
     r"(?:cuda (?:train|eval) +([^ ]+)|WARNING:root:([^ ]+) failed to load)", full_log
@@ -45,8 +39,26 @@ def chunker(seq, size):
 c = 0
 i = 0
 
-out = csv.writer(sys.stdout, dialect="excel")
-out.writerow(["", hash, "", "", "", "", gist_url])
+out = csv.DictWriter(
+    sys.stdout,
+    [
+        "bench",
+        "name",
+        "result",
+        "component",
+        "context",
+        "explain",
+        "frame_time",
+        "backend_time",
+        "graph_count",
+        "op_count",
+        "graph_breaks",
+        "unique_graph_breaks",
+    ],
+    dialect="excel",
+)
+out.writeheader()
+out.writerow({"explain": gist_url})
 
 # Sometimes backtraces will be in third party code, which results
 # in very long file names.  Delete the absolute path in this case.
@@ -130,6 +142,26 @@ def normalize_file(f):
         if len(split_str) == 2:
             backend_time = float(split_str[1])
             frame_time = float(split_str[0].split("entire_frame_compile:")[1])
+
+    if "STATS:" in log:
+        result = re.search("STATS:(.*)\n", log).group(1)
+        # call_* op count: 970 | FakeTensor.__torch_dispatch__:35285 | ProxyTorchDispatchMode.__torch_dispatch__:13339
+        split_all = result.split("|")
+        # TODO: rewrite this to work with arbitrarily many stats
+
+    graph_count = None
+    op_count = None
+    graph_breaks = None
+    unique_graph_breaks = None
+    if m := re.search(
+        r"Dynamo produced (\d+) graphs covering (\d+) ops with (\d+) graph breaks \((\d+) unique\)",
+        log,
+    ):
+        graph_count = m.group(1)
+        op_count = m.group(2)
+        graph_breaks = m.group(3)
+        unique_graph_breaks = m.group(4)
+
     # If the context string is too long, don't put it in the CSV.
     # This is a hack to try to make it more likely that Google Sheets will
     # offer to split columns
@@ -143,7 +175,20 @@ def normalize_file(f):
         context = ""
 
     out.writerow(
-        [bench, name, "", r, component, context, explain, frame_time, backend_time]
+        {
+            "bench": bench,
+            "name": name,
+            "result": r,
+            "component": component,
+            "context": context,
+            "explain": explain,
+            "frame_time": frame_time,
+            "backend_time": backend_time,
+            "graph_count": graph_count,
+            "op_count": op_count,
+            "graph_breaks": graph_breaks,
+            "unique_graph_breaks": unique_graph_breaks,
+        }
     )
     i += 1
 
diff --git a/benchmarks/dynamo/run_all.sh b/benchmarks/dynamo/run_all.sh
index 732abc2d1c72..18612c8b855e 100755
--- a/benchmarks/dynamo/run_all.sh
+++ b/benchmarks/dynamo/run_all.sh
@@ -26,16 +26,13 @@ if getent hosts fwdproxy; then
 fi
 
 # Feel free to edit these, but we expect most users not to need to modify this
-BASE_FLAGS=( --accuracy --explain --timing )
+BASE_FLAGS=( --accuracy --explain --timing --print-graph-breaks )
 DATE="$(date)"
 WORK="$PWD"
 
 cd "$(dirname "$BASH_SOURCE")"/../..
 
-python benchmarks/dynamo/torchbench.py --output "$WORK"/torchbench.csv "${BASE_FLAGS[@]}" "$@" 2>&1 | tee "$WORK"/torchbench.log
-python benchmarks/dynamo/huggingface.py --output "$WORK"/huggingface.csv "${BASE_FLAGS[@]}" "$@" 2>&1 | tee "$WORK"/huggingface.log
-python benchmarks/dynamo/timm_models.py --output "$WORK"/timm_models.csv "${BASE_FLAGS[@]}" "$@" 2>&1 | tee "$WORK"/timm_models.log
-cat "$WORK"/torchbench.log "$WORK"/huggingface.log "$WORK"/timm_models.log | tee "$WORK"/sweep.log
+python benchmarks/dynamo/benchmarks.py --output "$WORK"/benchmarks.csv "${BASE_FLAGS[@]}" "$@" 2>&1 | tee "$WORK"/sweep.log
 gh gist create -d "Sweep logs for $(git rev-parse --abbrev-ref HEAD) $* - $(git rev-parse HEAD) $DATE" "$WORK"/sweep.log | tee -a "$WORK"/sweep.log
 python benchmarks/dynamo/parse_logs.py "$WORK"/sweep.log > "$WORK"/final.csv
 gh gist create "$WORK"/final.csv
diff --git a/benchmarks/dynamo/run_delta.sh b/benchmarks/dynamo/run_delta.sh
new file mode 100755
index 000000000000..7ca5a881a284
--- /dev/null
+++ b/benchmarks/dynamo/run_delta.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+set -x
+
+# Some QoL for people running this script on Meta servers
+if getent hosts fwdproxy; then
+    export https_proxy=http://fwdproxy:8080 http_proxy=http://fwdproxy:8080 no_proxy=.fbcdn.net,.facebook.com,.thefacebook.com,.tfbnw.net,.fb.com,.fburl.com,.facebook.net,.sb.fbsbx.com,localhost
+fi
+
+WORK="$PWD"
+
+cd "$(dirname "$BASH_SOURCE")"/../..
+
+ROOT="$PWD"
+
+mkdir -p "$WORK/sweep/static"
+mkdir -p "$WORK/sweep/dynamic"
+
+(cd "$WORK/sweep/static" && "$ROOT/benchmarks/dynamo/run_all.sh" "$@")
+(cd "$WORK/sweep/dynamic" && "$ROOT/benchmarks/dynamo/run_all.sh" "$@" --dynamic-shapes)
+python benchmarks/dynamo/combine_csv.py "$WORK/sweep/static/final.csv" "$WORK/sweep/dynamic/final.csv" > "$WORK/delta.csv"
+gh gist create "$WORK/delta.csv"
diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index d370bbf200c7..8db152daadc3 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -13,10 +13,10 @@
 below) for inference, run them and visualize the logs.
 
 If you want to just print the commands, you could use the following command
--> python benchmarks/runner.py --print_run_commands --suites=torchbench --inference
+-> python benchmarks/runner.py --print-run-commands --suites=torchbench --inference
 
 Similarly, if you want to just visualize the already finished logs
--> python benchmarks/runner.py --visualize_logs --suites=torchbench --inference
+-> python benchmarks/runner.py --visualize-logs --suites=torchbench --inference
 
 If you want to test float16
 -> python benchmarks/runner.py --suites=torchbench --inference --dtypes=float16
@@ -68,7 +68,7 @@
         "ts_nvfuser": "--training --nvfuser --speedup-dynamo-ts ",
         "eager": "--training --backend=eager ",
         "aot_eager": "--training --backend=aot_eager ",
-        "aot_cudagraphs": "--training --backend=aot_cudagraphs ",
+        "cudagraphs": "--training --backend=cudagraphs ",
         "aot_nvfuser": "--training --nvfuser --backend=aot_ts_nvfuser ",
         "nvprims_nvfuser": "--training --backend=nvprims_nvfuser ",
         "inductor": "--training --inductor ",
@@ -178,11 +178,13 @@ def parse_args():
     # Choose either generation of commands, pretty parsing or e2e runs
     group = parser.add_mutually_exclusive_group(required=False)
     group.add_argument(
+        "--print-run-commands",
         "--print_run_commands",
         action="store_true",
         help="Generate commands and saves them to run.sh",
     )
     group.add_argument(
+        "--visualize-logs",
         "--visualize_logs",
         action="store_true",
         help="Pretty print the log files and draw graphs",
@@ -265,7 +267,11 @@ def parse_args():
         help="Github CLI path",
     )
     parser.add_argument(
-        "--batch_size", type=int, default=None, help="batch size for benchmarking"
+        "--batch-size",
+        "--batch_size",
+        type=int,
+        default=None,
+        help="batch size for benchmarking",
     )
     parser.add_argument(
         "--threads",
@@ -276,12 +282,14 @@ def parse_args():
     )
     launcher_group = parser.add_argument_group("CPU Launcher Parameters")
     launcher_group.add_argument(
+        "--enable-cpu-launcher",
         "--enable_cpu_launcher",
         action="store_true",
         default=False,
         help="Use torch.backends.xeon.run_cpu to get the peak performance on Intel(R) Xeon(R) Scalable Processors.",
     )
     launcher_group.add_argument(
+        "--cpu-launcher-args",
         "--cpu_launcher_args",
         type=str,
         default="",
@@ -334,6 +342,8 @@ def generate_commands(args, dtypes, suites, devices, compilers, output_dir):
     with open(generated_file, "w") as runfile:
         lines = []
 
+        lines.append("#!/bin/bash")
+        lines.append("set -x")
         lines.append("# Setup the output directory")
         lines.append(f"rm -rf {output_dir}")
         lines.append(f"mkdir {output_dir}")
@@ -364,14 +374,14 @@ def generate_commands(args, dtypes, suites, devices, compilers, output_dir):
                         filters = DEFAULTS["quick"][suite]
                         cmd = f"{cmd} {filters}"
 
-                    if testing == "performance" and compiler in (
+                    if compiler in (
                         "inductor",
                         "inductor_no_cudagraphs",
                     ):
-                        cmd = f"{cmd} --cold_start_latency"
+                        cmd = f"{cmd} --cold-start-latency"
 
                     if args.batch_size is not None:
-                        cmd = f"{cmd} --batch_size {args.batch_size}"
+                        cmd = f"{cmd} --batch-size {args.batch_size}"
 
                     if args.threads is not None:
                         cmd = f"{cmd} --threads {args.threads}"
@@ -1294,13 +1304,17 @@ def comment_on_gh(self, comment):
             f.write(comment)
             filename = f.name
 
+        issue_number = "93794"
+        if self.args.dtypes[0] == "float32":
+            issue_number = "93518"
+
         subprocess.check_call(
             [
                 self.args.dashboard_gh_cli_path,
                 "issue",
                 "comment",
-                "--repo=https://github.com/pytorch/torchdynamo.git",
-                "681",
+                "--repo=https://github.com/pytorch/pytorch.git",
+                issue_number,
                 "-F",
                 filename,
             ]
diff --git a/benchmarks/dynamo/test.py b/benchmarks/dynamo/test.py
index 438218462030..d506c4df2328 100644
--- a/benchmarks/dynamo/test.py
+++ b/benchmarks/dynamo/test.py
@@ -36,7 +36,7 @@ def test_benchmark_infra_runs(self) -> None:
                     "--performance",
                     "--only=BERT_pytorch",
                     "-n1",
-                    "--batch_size=1",
+                    "--batch-size=1",
                 ]
             )
             run(TorchBenchmarkRunner(), args, original_dir)
diff --git a/benchmarks/dynamo/timm_models.py b/benchmarks/dynamo/timm_models.py
index 6e1c2437e062..905ea324c255 100755
--- a/benchmarks/dynamo/timm_models.py
+++ b/benchmarks/dynamo/timm_models.py
@@ -169,7 +169,7 @@ def populate_family(models):
 
 class TimmRunnner(BenchmarkRunner):
     def __init__(self):
-        super(TimmRunnner, self).__init__()
+        super().__init__()
         self.suite_name = "timm_models"
 
     def load_model(
@@ -185,10 +185,11 @@ def load_model(
         # _, model_dtype, data_dtype = self.resolve_precision()
         channels_last = self._args.channels_last
 
-        retries = 1
+        tries = 1
         success = False
         model = None
-        while not success and retries < 6:
+        total_allowed_tries = 5
+        while not success and tries <= total_allowed_tries:
             try:
                 model = create_model(
                     model_name,
@@ -206,10 +207,14 @@ def load_model(
                     # drop_block_rate=kwargs.pop('drop_block', None),
                 )
                 success = True
-            except Exception:
-                wait = retries * 30
-                time.sleep(wait)
-                retries += 1
+            except Exception as e:
+                tries += 1
+                if tries <= total_allowed_tries:
+                    wait = tries * 30
+                    print(
+                        f"Failed to load model: {e}. Trying again ({tries}/{total_allowed_tries}) after {wait}s"
+                    )
+                    time.sleep(wait)
 
         if model is None:
             raise RuntimeError(f"Failed to load model '{model_name}'")
@@ -332,7 +337,11 @@ def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
         return None
 
 
-if __name__ == "__main__":
+def timm_main():
     logging.basicConfig(level=logging.WARNING)
     warnings.filterwarnings("ignore")
     main(TimmRunnner())
+
+
+if __name__ == "__main__":
+    timm_main()
diff --git a/benchmarks/dynamo/timm_models_list_cpu.txt b/benchmarks/dynamo/timm_models_list_cpu.txt
new file mode 100644
index 000000000000..50edec92d268
--- /dev/null
+++ b/benchmarks/dynamo/timm_models_list_cpu.txt
@@ -0,0 +1,60 @@
+adv_inception_v3,128
+beit_base_patch16_224,64
+botnet26t_256,128
+cait_m36_384,4
+coat_lite_mini,32
+convit_base,64
+convmixer_768_32,2
+convnext_base,64
+crossvit_9_240,32
+cspdarknet53,64
+deit_base_distilled_patch16_224,64
+dm_nfnet_f0,128
+dpn107,32
+eca_botnext26ts_256,128
+eca_halonext26ts,128
+ese_vovnet19b_dw,128
+fbnetc_100,32
+fbnetv3_b,32
+gernet_l,128
+ghostnet_100,128
+gluon_inception_v3,128
+gluon_xception65,32
+gmixer_24_224,16
+gmlp_s16_224,128
+hrnet_w18,128
+inception_v3,128
+jx_nest_base,32
+lcnet_050,64
+mixer_b16_224,128
+mixnet_l,128
+mnasnet_100,32
+mobilenetv2_100,32
+mobilenetv3_large_100,32
+mobilevit_s,256
+nfnet_l0,128
+pit_b_224,64
+pnasnet5large,16
+poolformer_m36,64
+regnety_002,128
+repvgg_a2,128
+res2net101_26w_4s,64
+res2net50_14w_8s,128
+res2next50,128
+resmlp_12_224,128
+resnest101e,64
+rexnet_100,128
+sebotnet33ts_256,64
+selecsls42b,128
+spnasnet_100,32
+swin_base_patch4_window7_224,64
+swsl_resnext101_32x16d,32
+tf_efficientnet_b0,128
+tf_mixnet_l,32
+tinynet_a,128
+tnt_s_patch16_224,32
+twins_pcpvt_base,64
+visformer_small,128
+vit_base_patch16_224,64
+volo_d1_224,64
+xcit_large_24_p8_224,5
diff --git a/benchmarks/dynamo/torchbench.py b/benchmarks/dynamo/torchbench.py
index e8d959887290..48a7da1d2d55 100755
--- a/benchmarks/dynamo/torchbench.py
+++ b/benchmarks/dynamo/torchbench.py
@@ -183,6 +183,7 @@ def setup_torchbench_cwd():
     "hf_GPT2_large",
     "hf_T5_large",
     "timm_vision_transformer_large",
+    "maml",  # accuracy https://github.com/pytorch/pytorch/issues/93847
 }
 
 
@@ -194,7 +195,7 @@ def setup_torchbench_cwd():
 
 class TorchBenchmarkRunner(BenchmarkRunner):
     def __init__(self):
-        super(TorchBenchmarkRunner, self).__init__()
+        super().__init__()
         self.suite_name = "torchbench"
         self.optimizer = None
 
@@ -373,9 +374,12 @@ def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
         return None
 
 
-if __name__ == "__main__":
-
+def torchbench_main():
     original_dir = setup_torchbench_cwd()
     logging.basicConfig(level=logging.WARNING)
     warnings.filterwarnings("ignore")
     main(TorchBenchmarkRunner(), original_dir)
+
+
+if __name__ == "__main__":
+    torchbench_main()
diff --git a/benchmarks/dynamo/torchbench_models_list_cpu.txt b/benchmarks/dynamo/torchbench_models_list_cpu.txt
new file mode 100644
index 000000000000..ab485702b838
--- /dev/null
+++ b/benchmarks/dynamo/torchbench_models_list_cpu.txt
@@ -0,0 +1,48 @@
+alexnet,128
+attention_is_all_you_need_pytorch,64
+BERT_pytorch,32
+dcgan,256
+densenet121,512
+dlrm,2048
+fastNLP_Bert,8
+functorch_dp_cifar10,1024
+hf_Albert,8
+hf_Bart,8
+hf_Bert,8
+hf_Bert_large,8
+hf_DistilBert,8
+hf_GPT2,8
+hf_GPT2_large,1
+hf_Longformer,4
+hf_Reformer,8
+hf_T5,4
+hf_T5_base,1
+hf_T5_large,1
+LearningToPaint,96
+lennard_jones,1024
+mnasnet1_0,32
+mobilenet_v2,16
+mobilenet_v3_large,32
+nvidia_deeprecommender,256
+phlippe_densenet,128
+phlippe_resnet,512
+pytorch_unet,4
+resnet152,32
+resnet18,256
+resnet50,256
+resnext50_32x4d,256
+shufflenet_v2_x1_0,64
+speech_transformer,1024
+squeezenet1_1,16
+Super_SloMo,1024
+timm_efficientnet,64
+timm_nfnet,128
+timm_regnet,32
+timm_resnest,32
+timm_vision_transformer,16
+timm_vision_transformer_large,8
+timm_vovnet,32
+tts_angular,1024
+vgg16,64
+vision_maskrcnn,1
+yolov3,32
diff --git a/benchmarks/dynamo/training_loss.py b/benchmarks/dynamo/training_loss.py
index 2ec794540334..8886553c9736 100644
--- a/benchmarks/dynamo/training_loss.py
+++ b/benchmarks/dynamo/training_loss.py
@@ -128,7 +128,7 @@ def parse_args():
     )
     parser.add_argument(
         "--backend",
-        choices=torch._dynamo.list_backends(),
+        choices=torch._dynamo.list_backends(exclude_tags=None),
         default="inductor",
         help="train/evaluate model with a given backend (default: inductor)",
     )
diff --git a/benchmarks/fastrnns/bench.py b/benchmarks/fastrnns/bench.py
index 8b4569a9d56b..d4b70ff78b7a 100644
--- a/benchmarks/fastrnns/bench.py
+++ b/benchmarks/fastrnns/bench.py
@@ -44,7 +44,7 @@ def pretty_print(benchresult, colwidth=16, sep=' '):
     return sep.join(items)
 
 # shim for torch.cuda.Event when running on cpu
-class Event(object):
+class Event:
     def __init__(self, enable_timing):
         pass
 
@@ -209,7 +209,7 @@ def bench_group(model_list, bench_name, bench_group, bench_args):
     parser.add_argument('--warmup', default='10', type=int)
     parser.add_argument('--nloops', default='100', type=int)
     parser.add_argument('--device', default='cuda', type=str)
-    parser.add_argument('--variable_lstms', action='store_true',
+    parser.add_argument('--variable-lstms', '--variable_lstms', action='store_true',
                         help='Also benchmark variable sequence length lstms '
                         'Note that some of these run really slowly '
                         'and that the `seqLength` flag will be ignored.')
@@ -224,9 +224,9 @@ def bench_group(model_list, bench_name, bench_group, bench_args):
                         help='The fuser backend to use. One of: te, old, or none')
     parser.add_argument('--executor', default=None, type=str,
                         help='The executor to use. One of: legacy, simple, profiling')
-    parser.add_argument('--cuda_pointwise_loop_level', default=None, type=int)
-    parser.add_argument('--cuda_pointwise_block_count', default=None, type=int)
-    parser.add_argument('--cuda_pointwise_block_size', default=None, type=int)
+    parser.add_argument('--cuda-pointwise-loop-level', '--cuda_pointwise_loop_level', default=None, type=int)
+    parser.add_argument('--cuda-pointwise-block-count', '--cuda_pointwise_block_count', default=None, type=int)
+    parser.add_argument('--cuda-pointwise-block-size', '--cuda_pointwise_block_size', default=None, type=int)
 
     args = parser.parse_args()
     set_fuser(args.fuser, args.executor)
diff --git a/benchmarks/fastrnns/custom_lstms.py b/benchmarks/fastrnns/custom_lstms.py
index 60abb1ac574c..c21defda239f 100644
--- a/benchmarks/fastrnns/custom_lstms.py
+++ b/benchmarks/fastrnns/custom_lstms.py
@@ -92,7 +92,7 @@ def reverse(lst: List[Tensor]) -> List[Tensor]:
 
 class LSTMCell(jit.ScriptModule):
     def __init__(self, input_size, hidden_size):
-        super(LSTMCell, self).__init__()
+        super().__init__()
         self.input_size = input_size
         self.hidden_size = hidden_size
         self.weight_ih = Parameter(torch.randn(4 * hidden_size, input_size))
@@ -120,7 +120,7 @@ def forward(self, input: Tensor, state: Tuple[Tensor, Tensor]) -> Tuple[Tensor,
 
 class LayerNorm(jit.ScriptModule):
     def __init__(self, normalized_shape):
-        super(LayerNorm, self).__init__()
+        super().__init__()
         if isinstance(normalized_shape, numbers.Integral):
             normalized_shape = (normalized_shape,)
         normalized_shape = torch.Size(normalized_shape)
@@ -146,7 +146,7 @@ def forward(self, input):
 
 class LayerNormLSTMCell(jit.ScriptModule):
     def __init__(self, input_size, hidden_size, decompose_layernorm=False):
-        super(LayerNormLSTMCell, self).__init__()
+        super().__init__()
         self.input_size = input_size
         self.hidden_size = hidden_size
         self.weight_ih = Parameter(torch.randn(4 * hidden_size, input_size))
@@ -183,7 +183,7 @@ def forward(self, input: Tensor, state: Tuple[Tensor, Tensor]) -> Tuple[Tensor,
 
 class LSTMLayer(jit.ScriptModule):
     def __init__(self, cell, *cell_args):
-        super(LSTMLayer, self).__init__()
+        super().__init__()
         self.cell = cell(*cell_args)
 
     @jit.script_method
@@ -198,7 +198,7 @@ def forward(self, input: Tensor, state: Tuple[Tensor, Tensor]) -> Tuple[Tensor,
 
 class ReverseLSTMLayer(jit.ScriptModule):
     def __init__(self, cell, *cell_args):
-        super(ReverseLSTMLayer, self).__init__()
+        super().__init__()
         self.cell = cell(*cell_args)
 
     @jit.script_method
@@ -215,7 +215,7 @@ class BidirLSTMLayer(jit.ScriptModule):
     __constants__ = ['directions']
 
     def __init__(self, cell, *cell_args):
-        super(BidirLSTMLayer, self).__init__()
+        super().__init__()
         self.directions = nn.ModuleList([
             LSTMLayer(cell, *cell_args),
             ReverseLSTMLayer(cell, *cell_args),
@@ -247,7 +247,7 @@ class StackedLSTM(jit.ScriptModule):
     __constants__ = ['layers']  # Necessary for iterating through self.layers
 
     def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
-        super(StackedLSTM, self).__init__()
+        super().__init__()
         self.layers = init_stacked_lstm(num_layers, layer, first_layer_args,
                                         other_layer_args)
 
@@ -274,7 +274,7 @@ class StackedLSTM2(jit.ScriptModule):
     __constants__ = ['layers']  # Necessary for iterating through self.layers
 
     def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
-        super(StackedLSTM2, self).__init__()
+        super().__init__()
         self.layers = init_stacked_lstm(num_layers, layer, first_layer_args,
                                         other_layer_args)
 
@@ -299,7 +299,7 @@ class StackedLSTMWithDropout(jit.ScriptModule):
     __constants__ = ['layers', 'num_layers']
 
     def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
-        super(StackedLSTMWithDropout, self).__init__()
+        super().__init__()
         self.layers = init_stacked_lstm(num_layers, layer, first_layer_args,
                                         other_layer_args)
         # Introduces a Dropout layer on the outputs of each LSTM layer except
diff --git a/benchmarks/fastrnns/profile.py b/benchmarks/fastrnns/profile.py
index ad55108724f1..7f3de61ef9c3 100644
--- a/benchmarks/fastrnns/profile.py
+++ b/benchmarks/fastrnns/profile.py
@@ -95,7 +95,7 @@ def full_profile(rnns, **args):
     for k, v in args.items():
         profile_args.append('--{}={}'.format(k, v))
     profile_args.append('--rnns {}'.format(' '.join(rnns)))
-    profile_args.append('--internal_run')
+    profile_args.append('--internal-run')
 
     outpath = nvprof_output_filename(rnns, **args)
 
@@ -114,7 +114,7 @@ def full_profile(rnns, **args):
     parser.add_argument('--inputSize', default='512', type=int)
     parser.add_argument('--hiddenSize', default='512', type=int)
     parser.add_argument('--miniBatch', default='64', type=int)
-    parser.add_argument('--sleep_between_seconds', default='1', type=int)
+    parser.add_argument('--sleep-between-seconds', '--sleep_between_seconds', default='1', type=int)
     parser.add_argument('--nloops', default='5', type=int)
 
     parser.add_argument('--rnns', nargs='*',
@@ -122,7 +122,7 @@ def full_profile(rnns, **args):
 
     # if internal_run, we actually run the rnns.
     # if not internal_run, we shell out to nvprof with internal_run=T
-    parser.add_argument('--internal_run', default=False, action='store_true',
+    parser.add_argument('--internal-run', '--internal_run', default=False, action='store_true',
                         help='Don\'t use this')
     args = parser.parse_args()
     if args.rnns is None:
diff --git a/benchmarks/fastrnns/test.py b/benchmarks/fastrnns/test.py
index 6cc68cce6c11..a56cf928fd7a 100644
--- a/benchmarks/fastrnns/test.py
+++ b/benchmarks/fastrnns/test.py
@@ -12,7 +12,7 @@ def barf():
 
 
 def assertEqual(tensor, expected, threshold=0.001):
-    if isinstance(tensor, list) or isinstance(tensor, tuple):
+    if isinstance(tensor, (list, tuple)):
         for t, e in zip(tensor, expected):
             assertEqual(t, e)
     else:
@@ -128,8 +128,8 @@ def test_vl_py(**test_args):
     parser.add_argument('--hiddenSize', default='512', type=int)
     parser.add_argument('--miniBatch', default='64', type=int)
     parser.add_argument('--device', default='cuda', type=str)
-    parser.add_argument('--check_grad', default='True', type=bool)
-    parser.add_argument('--variable_lstms', action='store_true')
+    parser.add_argument('--check-grad', '--check_grad', default='True', type=bool)
+    parser.add_argument('--variable-lstms', '--variable_lstms', action='store_true')
     parser.add_argument('--seed', default='17', type=int)
     parser.add_argument('--verbose', action='store_true')
     parser.add_argument('--rnns', nargs='*',
diff --git a/benchmarks/framework_overhead_benchmark/C2Module.py b/benchmarks/framework_overhead_benchmark/C2Module.py
index 8deade61ac81..dfc5e6e79098 100644
--- a/benchmarks/framework_overhead_benchmark/C2Module.py
+++ b/benchmarks/framework_overhead_benchmark/C2Module.py
@@ -9,7 +9,7 @@ def add_blob(ws, blob_name, tensor_size):
     blob_tensor = np.random.randn(*tensor_size).astype(np.float32)
     ws.FeedBlob(blob_name, blob_tensor)
 
-class C2SimpleNet(object):
+class C2SimpleNet:
     """
     This module constructs a net with 'op_name' operator. The net consist
     a series of such operator.
diff --git a/benchmarks/framework_overhead_benchmark/SimpleAddModule.py b/benchmarks/framework_overhead_benchmark/SimpleAddModule.py
index ead8deaf14d2..a4c2a1c83a26 100644
--- a/benchmarks/framework_overhead_benchmark/SimpleAddModule.py
+++ b/benchmarks/framework_overhead_benchmark/SimpleAddModule.py
@@ -9,7 +9,7 @@ def add_tensors_loop(x, y):
 
 class SimpleAddModule(torch.nn.Module):
     def __init__(self, add_op):
-        super(SimpleAddModule, self).__init__()
+        super().__init__()
         self.add_op = add_op
 
     def forward(self, x, y):
diff --git a/benchmarks/framework_overhead_benchmark/framework_overhead_benchmark.py b/benchmarks/framework_overhead_benchmark/framework_overhead_benchmark.py
index 905b590885da..727b78197b39 100644
--- a/benchmarks/framework_overhead_benchmark/framework_overhead_benchmark.py
+++ b/benchmarks/framework_overhead_benchmark/framework_overhead_benchmark.py
@@ -15,12 +15,12 @@
 Example build/run:
 To run PT benchmark:
 buck run @mode/opt <path-to-framework_overhead_benchmark>:framework_overhead_benchmark --
- --add_op --graph_mode --eager_mode (Runs both graph mode and eager mode)
+ --add-op --graph-mode --eager-mode (Runs both graph mode and eager mode)
 buck run @mode/opt <path-to-framework_overhead_benchmark>:framework_overhead_benchmark --
- --add_op --graph_mode (Runs only graph mode)
+ --add-op --graph-mode (Runs only graph mode)
 To run C2 benchmark:
 buck run @mode/opt <path-to-framework_overhead_benchmark>:framework_overhead_benchmark --
- --add_op --benchmark_c2_net
+ --add-op --benchmark-c2-net
 """
 
 SUPPORTED_OPS = {"add_op"}
@@ -64,13 +64,25 @@ def benchmark_simple_fn(args, config, module_config, module_type, result):
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--op", default="add_op", dest="op", type=str)
-    parser.add_argument("--benchmark_c2_net", default=False, dest="benchmark_c2_net", action="store_true")
-    parser.add_argument("--use_throughput_benchmark", default=False, dest="use_throughput_benchmark", action="store_true")
+    parser.add_argument(
+        "--benchmark-c2-net",
+        "--benchmark_c2_net",
+        default=False,
+        dest="benchmark_c2_net",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--use-throughput-benchmark",
+        "--use_throughput_benchmark",
+        default=False,
+        dest="use_throughput_benchmark",
+        action="store_true",
+    )
     parser.add_argument("--debug", default=False, dest="debug", action="store_true")
     parser.add_argument("--save", default=False, dest="save", action="store_true")
-    parser.add_argument("--eager_mode", default=False, dest="eager_mode", action="store_true")
-    parser.add_argument("--num_warmup_iters", type=int, default=100)
-    parser.add_argument("--num_iters", type=int, default=1000)
+    parser.add_argument("--eager-mode", "--eager_mode", default=False, dest="eager_mode", action="store_true")
+    parser.add_argument("--num-warmup-iters", "--num_warmup_iters", type=int, default=100)
+    parser.add_argument("--num-iters", "--num_iters", type=int, default=1000)
     args = parser.parse_args()
 
     if args.op not in SUPPORTED_OPS:
diff --git a/benchmarks/framework_overhead_benchmark/pt_wrapper_module.py b/benchmarks/framework_overhead_benchmark/pt_wrapper_module.py
index 84b2724bf63d..154564f1c6d7 100644
--- a/benchmarks/framework_overhead_benchmark/pt_wrapper_module.py
+++ b/benchmarks/framework_overhead_benchmark/pt_wrapper_module.py
@@ -1,6 +1,6 @@
 import torch
 
-class WrapperModule(object):
+class WrapperModule:
     """ Wraps the instance of wrapped_type.
     For graph_mode traces the instance of wrapped_type.
     Randomaly initializes num_params tensors with single float element.
diff --git a/benchmarks/functional_autograd_benchmark/torchaudio_models.py b/benchmarks/functional_autograd_benchmark/torchaudio_models.py
index 1e568d1d01f0..0563613a35a0 100644
--- a/benchmarks/functional_autograd_benchmark/torchaudio_models.py
+++ b/benchmarks/functional_autograd_benchmark/torchaudio_models.py
@@ -27,7 +27,7 @@ class Wav2Letter(nn.Module):
     def __init__(self, num_classes: int = 40,
                  input_type: str = "waveform",
                  num_features: int = 1) -> None:
-        super(Wav2Letter, self).__init__()
+        super().__init__()
 
         acoustic_num_features = 250 if input_type == "waveform" else num_features
         acoustic_model = nn.Sequential(
@@ -85,7 +85,7 @@ def __init__(self, module):
         Allows handling of variable sequence lengths and minibatch sizes.
         :param module: Module to apply input to.
         """
-        super(SequenceWise, self).__init__()
+        super().__init__()
         self.module = module
 
     def forward(self, x):
@@ -110,7 +110,7 @@ def __init__(self, seq_module):
         Input needs to be in the shape of (BxCxDxT)
         :param seq_module: The sequential module containing the conv stack.
         """
-        super(MaskConv, self).__init__()
+        super().__init__()
         self.seq_module = seq_module
 
     def forward(self, x, lengths):
@@ -142,7 +142,7 @@ def forward(self, input_):
 
 class BatchRNN(nn.Module):
     def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=False, batch_norm=True):
-        super(BatchRNN, self).__init__()
+        super().__init__()
         self.input_size = input_size
         self.hidden_size = hidden_size
         self.bidirectional = bidirectional
@@ -170,7 +170,7 @@ class Lookahead(nn.Module):
     # input shape - sequence, batch, feature - TxNxH
     # output shape - same as input
     def __init__(self, n_features, context):
-        super(Lookahead, self).__init__()
+        super().__init__()
         assert context > 0
         self.context = context
         self.n_features = n_features
@@ -193,7 +193,7 @@ def __repr__(self):
 class DeepSpeech(nn.Module):
     def __init__(self, rnn_type, labels, rnn_hidden_size, nb_layers, audio_conf,
                  bidirectional, context=20):
-        super(DeepSpeech, self).__init__()
+        super().__init__()
 
         self.hidden_size = rnn_hidden_size
         self.hidden_layers = nb_layers
@@ -298,7 +298,7 @@ class PositionalEncoding(nn.Module):
     """
 
     def __init__(self, d_model, dropout=0.1, max_len=5000):
-        super(PositionalEncoding, self).__init__()
+        super().__init__()
         self.dropout = nn.Dropout(p=dropout)
 
         pe = torch.zeros(max_len, d_model)
@@ -327,7 +327,7 @@ class TransformerModel(nn.Module):
     """Container module with an encoder, a recurrent or transformer module, and a decoder."""
 
     def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
-        super(TransformerModel, self).__init__()
+        super().__init__()
         try:
             from torch.nn import TransformerEncoder, TransformerEncoderLayer
         except Exception as e:
@@ -392,7 +392,7 @@ def __init__(self, nhead, in_proj_container, attention_layer, out_proj):
             >>> print(attn_output.shape)
             >>> torch.Size([21, 64, 10])
         """
-        super(MultiheadAttentionContainer, self).__init__()
+        super().__init__()
         self.nhead = nhead
         self.in_proj_container = in_proj_container
         self.attention_layer = attention_layer
@@ -456,7 +456,7 @@ def __init__(self, dropout=0.0):
             >>> print(attn_output.shape, attn_weights.shape)
             torch.Size([256, 21, 3]) torch.Size([256, 21, 21])
         """
-        super(ScaledDotProduct, self).__init__()
+        super().__init__()
         self.dropout = dropout
 
     def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
@@ -532,7 +532,7 @@ def __init__(self, query_proj, key_proj, value_proj):
             value_proj: a proj layer for value.
         """
 
-        super(InProjContainer, self).__init__()
+        super().__init__()
         self.query_proj = query_proj
         self.key_proj = key_proj
         self.value_proj = value_proj
diff --git a/benchmarks/functional_autograd_benchmark/torchvision_models.py b/benchmarks/functional_autograd_benchmark/torchvision_models.py
index 5026366036c5..40b9cf660a49 100644
--- a/benchmarks/functional_autograd_benchmark/torchvision_models.py
+++ b/benchmarks/functional_autograd_benchmark/torchvision_models.py
@@ -29,7 +29,7 @@ class BasicBlock(nn.Module):
 
     def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                  base_width=64, dilation=1, norm_layer=None):
-        super(BasicBlock, self).__init__()
+        super().__init__()
         if norm_layer is None:
             norm_layer = nn.BatchNorm2d
         if groups != 1 or base_width != 64:
@@ -74,7 +74,7 @@ class Bottleneck(nn.Module):
 
     def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                  base_width=64, dilation=1, norm_layer=None):
-        super(Bottleneck, self).__init__()
+        super().__init__()
         if norm_layer is None:
             norm_layer = nn.BatchNorm2d
         width = int(planes * (base_width / 64.)) * groups
@@ -116,7 +116,7 @@ class ResNet(nn.Module):
     def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                  groups=1, width_per_group=64, replace_stride_with_dilation=None,
                  norm_layer=None):
-        super(ResNet, self).__init__()
+        super().__init__()
         if norm_layer is None:
             norm_layer = nn.BatchNorm2d
         self._norm_layer = norm_layer
@@ -281,7 +281,7 @@ def __init__(self, model, return_layers):
             if not return_layers:
                 break
 
-        super(IntermediateLayerGetter, self).__init__(layers)
+        super().__init__(layers)
         self.return_layers = orig_return_layers
 
     def forward(self, x):
@@ -297,7 +297,7 @@ class _SimpleSegmentationModel(nn.Module):
     __constants__ = ['aux_classifier']
 
     def __init__(self, backbone, classifier, aux_classifier=None):
-        super(_SimpleSegmentationModel, self).__init__()
+        super().__init__()
         self.backbone = backbone
         self.classifier = classifier
         self.aux_classifier = aux_classifier
@@ -346,7 +346,7 @@ def __init__(self, in_channels, channels):
             nn.Conv2d(inter_channels, channels, 1)
         ]
 
-        super(FCNHead, self).__init__(*layers)
+        super().__init__(*layers)
 
 def _segm_resnet(name, backbone_name, num_classes, aux, pretrained_backbone=True):
     # backbone = resnet.__dict__[backbone_name](
diff --git a/benchmarks/instruction_counts/execution/work.py b/benchmarks/instruction_counts/execution/work.py
index ed0c6a475b0c..a1fa961ea7e5 100644
--- a/benchmarks/instruction_counts/execution/work.py
+++ b/benchmarks/instruction_counts/execution/work.py
@@ -100,7 +100,7 @@ def cmd(self) -> str:
 
         cmd.extend([
             _PYTHON, WORKER_PATH,
-            "--communication_file", self._communication_file,
+            "--communication-file", self._communication_file,
         ])
         return " ".join(cmd)
 
diff --git a/benchmarks/instruction_counts/worker/main.py b/benchmarks/instruction_counts/worker/main.py
index f59509de7478..dbe1810e9917 100644
--- a/benchmarks/instruction_counts/worker/main.py
+++ b/benchmarks/instruction_counts/worker/main.py
@@ -183,6 +183,6 @@ def main(communication_file: str) -> None:
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--communication_file', type=str)
+    parser.add_argument('--communication-file', '--communication_file', type=str)
     communication_file = parser.parse_args().communication_file
     main(communication_file)
diff --git a/benchmarks/operator_benchmark/README.md b/benchmarks/operator_benchmark/README.md
index cff275d9a1f9..bef7e0067de4 100644
--- a/benchmarks/operator_benchmark/README.md
+++ b/benchmarks/operator_benchmark/README.md
@@ -28,19 +28,19 @@ $ python setup.py install
 Run `torch.add` benchmark:
 ```
 $ cd pytorch/benchmarks/operator_benchmark
-$ python -m pt.add_test --omp_num_threads 1 --mkl_num_threads 1
+$ python -m pt.add_test --omp-num-threads 1 --mkl-num-threads 1
 ```
-Note: we set the number of OpenMP and MKL threads both to 1. If you want to benchmark operators with multithreading (intra-op parallelism), use the `--omp_num_threads` and `--mkl_num_threads` flags.
+Note: we set the number of OpenMP and MKL threads both to 1. If you want to benchmark operators with multithreading (intra-op parallelism), use the `--omp-num-threads` and `--mkl-num-threads` flags.
 
 List all the supported tests:
 ```
-$ python -m pt.add_test --list_tests
+$ python -m pt.add_test --list-tests
 ```
 
 Filter and run a test (use `add_M8_N16_K32` as an example):
 ```
-$ python -m pt.add_test --test_name add_K32_M8_N1
---omp_num_threads 1 --mkl_num_threads 1
+$ python -m pt.add_test --test-name add_K32_M8_N1
+--omp-num-threads 1 --mkl-num-threads 1
 ```
 
 Run all the supported benchmarks:
@@ -121,28 +121,28 @@ $ python benchmark_runner.py --help
 
 Run all the supported benchmarks:
 ```
-$ python -m benchmark_all_test --omp_num_threads 1 --mkl_num_threads 1
+$ python -m benchmark_all_test --omp-num-threads 1 --mkl-num-threads 1
 ```
 
 List all the supported operators:
 ```
-$ python -m benchmark_all_test --list_ops
+$ python -m benchmark_all_test --list-ops
 ```
 
 List all the supported tests:
 ```
-$ python -m benchmark_all_test --list_tests
+$ python -m benchmark_all_test --list-tests
 ```
 
 Filter and run an operator (use add as an example):
 ```
-$ python -m benchmark_all_test --operators add --omp_num_threads 1 --mkl_num_threads 1
+$ python -m benchmark_all_test --operators add --omp-num-threads 1 --mkl-num-threads 1
 ```
 Note: this filter is based on the operator name rather than the file name.
 
 Run torch.add benchmark with tag 'long':
 ```
-$ python -m pt.add_test --tag_filter long
+$ python -m pt.add_test --tag-filter long
 ```
 
 ## Adding New Operators to the Benchmark Suite
diff --git a/benchmarks/operator_benchmark/benchmark_caffe2.py b/benchmarks/operator_benchmark/benchmark_caffe2.py
index b0534bd9722d..d5939030d03c 100644
--- a/benchmarks/operator_benchmark/benchmark_caffe2.py
+++ b/benchmarks/operator_benchmark/benchmark_caffe2.py
@@ -12,7 +12,7 @@
 """
 
 
-class Caffe2BenchmarkBase(object):
+class Caffe2BenchmarkBase:
     """ This is a base class used to create Caffe2 operator benchmark
     """
     tensor_index = 0
@@ -103,7 +103,7 @@ def extract_inputs_tuple(self):
         pass
 
 
-class Caffe2OperatorTestCase(object):
+class Caffe2OperatorTestCase:
     """ This class includes all the information needed to benchmark an operator.
         op_bench: it's a user-defined class (child of Caffe2BenchmarkBase)
         which includes input and operator, .etc
diff --git a/benchmarks/operator_benchmark/benchmark_core.py b/benchmarks/operator_benchmark/benchmark_core.py
index 075a676b359e..46ae589b8762 100644
--- a/benchmarks/operator_benchmark/benchmark_core.py
+++ b/benchmarks/operator_benchmark/benchmark_core.py
@@ -150,7 +150,7 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct
             yield _create_test(new_op, test_attrs, tags, OperatorTestCase, run_backward, input_name)
 
 
-class BenchmarkRunner(object):
+class BenchmarkRunner:
     """BenchmarkRunner is responsible for benchmarking all the registered
     benchmark test groups.
 
diff --git a/benchmarks/operator_benchmark/benchmark_pytorch.py b/benchmarks/operator_benchmark/benchmark_pytorch.py
index 6a53c9c97b3c..e9a9b3c5de42 100644
--- a/benchmarks/operator_benchmark/benchmark_pytorch.py
+++ b/benchmarks/operator_benchmark/benchmark_pytorch.py
@@ -18,7 +18,7 @@ class TorchBenchmarkBase(torch.nn.Module):
     """
 
     def __init__(self):
-        super(TorchBenchmarkBase, self).__init__()
+        super().__init__()
         self.user_given_name = None
         self._pass_count = 0
         self._num_inputs_require_grads = 0
@@ -100,7 +100,7 @@ def test_name(self, **kargs):
         return name
 
 
-class PyTorchOperatorTestCase(object):
+class PyTorchOperatorTestCase:
     """ This class includes all the information needed to benchmark an operator.
         op_bench: it's a user-defined class (child of TorchBenchmarkBase)
         which includes input and operator, .etc
diff --git a/benchmarks/operator_benchmark/benchmark_runner.py b/benchmarks/operator_benchmark/benchmark_runner.py
index 3e998e6ceb4e..7212147399a0 100644
--- a/benchmarks/operator_benchmark/benchmark_runner.py
+++ b/benchmarks/operator_benchmark/benchmark_runner.py
@@ -17,6 +17,7 @@
 
 def parse_args():
     parser.add_argument(
+        '--tag-filter',
         '--tag_filter',
         help='tag_filter can be used to run the shapes which matches the tag. (all is used to run all the shapes)',
         default='short')
@@ -28,21 +29,25 @@ def parse_args():
         default=None)
 
     parser.add_argument(
+        '--operator-range',
         '--operator_range',
         help='Filter tests based on operator_range(e.g. a-c or b,c-d)',
         default=None)
 
     parser.add_argument(
+        '--test-name',
         '--test_name',
         help='Run tests that have the provided test_name',
         default=None)
 
     parser.add_argument(
+        '--list-ops',
         '--list_ops',
         help='List operators without running them',
         action='store_true')
 
     parser.add_argument(
+        '--list-tests',
         '--list_tests',
         help='List all test cases without running them',
         action='store_true')
@@ -54,6 +59,7 @@ def parse_args():
     )
 
     parser.add_argument(
+        "--num-runs",
         "--num_runs",
         help="Run each test for num_runs. Each run executes an operator for number of <--iterations>",
         type=int,
@@ -61,6 +67,7 @@ def parse_args():
     )
 
     parser.add_argument(
+        "--min-time-per-test",
         "--min_time_per_test",
         help="Set the minimum time (unit: seconds) to run each test",
         type=int,
@@ -68,6 +75,7 @@ def parse_args():
     )
 
     parser.add_argument(
+        "--warmup-iterations",
         "--warmup_iterations",
         help="Number of iterations to ignore before measuring performance",
         default=100,
@@ -75,6 +83,7 @@ def parse_args():
     )
 
     parser.add_argument(
+        "--omp-num-threads",
         "--omp_num_threads",
         help="Number of OpenMP threads used in PyTorch/Caffe2 runtime",
         default=None,
@@ -82,6 +91,7 @@ def parse_args():
     )
 
     parser.add_argument(
+        "--mkl-num-threads",
         "--mkl_num_threads",
         help="Number of MKL threads used in PyTorch/Caffe2 runtime",
         default=None,
@@ -89,6 +99,7 @@ def parse_args():
     )
 
     parser.add_argument(
+        "--report-aibench",
         "--report_aibench",
         type=benchmark_utils.str2bool,
         nargs='?',
@@ -98,6 +109,7 @@ def parse_args():
     )
 
     parser.add_argument(
+        "--use-jit",
         "--use_jit",
         type=benchmark_utils.str2bool,
         nargs='?',
@@ -107,6 +119,7 @@ def parse_args():
     )
 
     parser.add_argument(
+        "--forward-only",
         "--forward_only",
         type=benchmark_utils.str2bool,
         nargs='?',
diff --git a/benchmarks/operator_benchmark/benchmark_utils.py b/benchmarks/operator_benchmark/benchmark_utils.py
index 095d454300c8..41b02c96c6dd 100644
--- a/benchmarks/operator_benchmark/benchmark_utils.py
+++ b/benchmarks/operator_benchmark/benchmark_utils.py
@@ -185,7 +185,7 @@ def attr_probs(**probs):
     return probs
 
 
-class RandomSample(object):
+class RandomSample:
 
     def __init__(self, configs):
         self.saved_cum_distribution = {}
diff --git a/benchmarks/operator_benchmark/pt/qarithmetic_test.py b/benchmarks/operator_benchmark/pt/qarithmetic_test.py
index 97766bdb4c19..0eefb49570ec 100644
--- a/benchmarks/operator_benchmark/pt/qarithmetic_test.py
+++ b/benchmarks/operator_benchmark/pt/qarithmetic_test.py
@@ -46,7 +46,7 @@ def setup(self, N, dtype, contig):
 
 class QFunctionalBenchmark(_QFunctionalBinaryArithmeticBenchmarkBase):
     def init(self, N, dtype, contig, op_func):
-        super(QFunctionalBenchmark, self).setup(N, dtype, contig)
+        super().setup(N, dtype, contig)
         self.inputs = {
             "q_input_a": self.q_input_a,
             "q_input_b": self.q_input_a,
@@ -66,7 +66,7 @@ def forward(self, q_input_a, q_input_b, scale: float, zero_point: int):
 
 class QFunctionalScalarBenchmark(_QFunctionalBinaryArithmeticBenchmarkBase):
     def init(self, N, dtype, contig, op_func):
-        super(QFunctionalScalarBenchmark, self).setup(N, dtype, contig)
+        super().setup(N, dtype, contig)
         self.inputs = {
             "q_input": self.q_input_a,
             "scalar_input": 42
diff --git a/benchmarks/operator_benchmark/pt/qconv_test.py b/benchmarks/operator_benchmark/pt/qconv_test.py
index c48759d330e7..c928c59d324a 100644
--- a/benchmarks/operator_benchmark/pt/qconv_test.py
+++ b/benchmarks/operator_benchmark/pt/qconv_test.py
@@ -41,7 +41,7 @@ def forward(self, input):
 class QConv2dBenchmark(op_bench.TorchBenchmarkBase):
     # def init(self, N, IC, OC, H, W, G, kernel, stride, pad):
     def init(self, IC, OC, kernel, stride, N, H, W, G, pad, device):
-        # super(QConv2dBenchmark, self).init(N, IC, OC, (H, W), G, (kernel, kernel), stride, pad)
+        # super().init(N, IC, OC, (H, W), G, (kernel, kernel), stride, pad)
 
         self.scale = 1.0 / 255
         self.zero_point = 0
diff --git a/benchmarks/operator_benchmark/pt/qlinear_test.py b/benchmarks/operator_benchmark/pt/qlinear_test.py
index c4f8f36c11d3..cc0db6952816 100644
--- a/benchmarks/operator_benchmark/pt/qlinear_test.py
+++ b/benchmarks/operator_benchmark/pt/qlinear_test.py
@@ -32,7 +32,7 @@ def forward(self, input):
 
 class QLinearBenchmark(_QLinearBenchmarkBase):
     def init(self, N, IN, OUT, device):
-        super(QLinearBenchmark, self).init(N, IN, OUT, nnq.Linear(IN, OUT))
+        super().init(N, IN, OUT, nnq.Linear(IN, OUT))
         self.inputs = {
             "input": self.qX
         }
@@ -41,7 +41,7 @@ def init(self, N, IN, OUT, device):
 
 class QDynamicLinearBenchmark(_QLinearBenchmarkBase):
     def init(self, N, IN, OUT, device):
-        super(QDynamicLinearBenchmark, self).init(N, IN, OUT, nnqd.Linear(IN, OUT))
+        super().init(N, IN, OUT, nnqd.Linear(IN, OUT))
         self.inputs = {
             "input": self.X
         }
diff --git a/benchmarks/operator_benchmark/pt/qpool_test.py b/benchmarks/operator_benchmark/pt/qpool_test.py
index bc93f2e1f887..f407f1d42c0e 100644
--- a/benchmarks/operator_benchmark/pt/qpool_test.py
+++ b/benchmarks/operator_benchmark/pt/qpool_test.py
@@ -101,22 +101,20 @@ def init(self, N, C, H, W, k, s, p, contig, dtype):
         self.pool_op = torch.nn.MaxPool2d(kernel_size=k, stride=s, padding=p,
                                           dilation=(1, 1), ceil_mode=False,
                                           return_indices=False)
-        super(QMaxPool2dBenchmark, self).setup(N, C, H, W, dtype, contig)
+        super().setup(N, C, H, W, dtype, contig)
 
 
 class QAvgPool2dBenchmark(_QPool2dBenchmarkBase):
     def init(self, N, C, H, W, k, s, p, contig, dtype):
         self.pool_op = torch.nn.AvgPool2d(kernel_size=k, stride=s, padding=p,
                                           ceil_mode=False)
-        super(QAvgPool2dBenchmark, self).setup(N, C, H, W, dtype, contig)
+        super().setup(N, C, H, W, dtype, contig)
 
 
 class QAdaptiveAvgPool2dBenchmark(_QPool2dBenchmarkBase):
     def init(self, N, C, input_size, output_size, contig, dtype):
         self.pool_op = torch.nn.AdaptiveAvgPool2d(output_size=output_size)
-        super(QAdaptiveAvgPool2dBenchmark, self).setup(N, C, *input_size,
-                                                       dtype=dtype,
-                                                       contig=contig)
+        super().setup(N, C, *input_size, dtype=dtype, contig=contig)
 
 
 op_bench.generate_pt_test(qadaptive_avgpool2d_short_configs + qadaptive_avgpool2d_long_configs,
diff --git a/benchmarks/operator_benchmark/pt/softmax_test.py b/benchmarks/operator_benchmark/pt/softmax_test.py
index 237d9001e017..24954ad00774 100644
--- a/benchmarks/operator_benchmark/pt/softmax_test.py
+++ b/benchmarks/operator_benchmark/pt/softmax_test.py
@@ -44,6 +44,29 @@
     ],
 )
 
+softmax_two_dims_ops_list = op_bench.op_list(
+    attr_names=['op_name', 'op_func'],
+    attrs=[
+        ['LogSoftmax', nn.LogSoftmax],
+    ],
+)
+
+
+softmax_two_dims_configs = op_bench.config_list(
+    attr_names=[
+        'N', 'seq_len', 'dim'
+    ],
+    attrs=[
+        [700, 23258, 0],
+        [700, 23258, 1],
+        [1024, 23258, 1]
+    ],
+    cross_product_configs={
+        'device': ['cpu', 'cuda'],
+    },
+    tags=['long']
+)
+
 
 class SoftmaxBenchmark(op_bench.TorchBenchmarkBase):
     def init(self, N, C, H, W, device, op_func):
@@ -56,10 +79,25 @@ def forward(self, input):
         return self.op_func(input)
 
 
+class Softmax2DimsBenchmark(op_bench.TorchBenchmarkBase):
+    def init(self, N, seq_len, dim, device, op_func):
+        self.inputs = {
+            "input": torch.rand(N, seq_len, device=device)
+        }
+        self.op_func = op_func(dim=dim)
+
+    def forward(self, input):
+        return self.op_func(input)
+
 op_bench.generate_pt_tests_from_op_list(softmax_ops_list,
                                         softmax_configs_short + softmax_configs_long,
                                         SoftmaxBenchmark)
 
 
+op_bench.generate_pt_tests_from_op_list(softmax_two_dims_ops_list,
+                                        softmax_two_dims_configs,
+                                        Softmax2DimsBenchmark)
+
+
 if __name__ == "__main__":
     op_bench.benchmark_runner.main()
diff --git a/benchmarks/profiler_benchmark/profiler_bench.py b/benchmarks/profiler_benchmark/profiler_bench.py
index 75cd490fed2e..5c1f2597415c 100644
--- a/benchmarks/profiler_benchmark/profiler_bench.py
+++ b/benchmarks/profiler_benchmark/profiler_bench.py
@@ -30,15 +30,15 @@ def parallel_task(x):
     parser = argparse.ArgumentParser(
         description='Profiler benchmark')
 
-    parser.add_argument('--with_cuda', action='store_true')
-    parser.add_argument('--with_stack', action='store_true')
-    parser.add_argument('--use_script', action='store_true')
-    parser.add_argument('--use_kineto', action='store_true')
-    parser.add_argument('--profiling_tensor_size', default=1, type=int)
-    parser.add_argument('--workload', default='loop', type=str)
-    parser.add_argument('--internal_iter', default=256, type=int)
-    parser.add_argument('--timer_min_run_time', default=10, type=int)
-    parser.add_argument('--cuda_only', action='store_true')
+    parser.add_argument('--with-cuda', '--with_cuda', action='store_true')
+    parser.add_argument('--with-stack', '--with_stack', action='store_true')
+    parser.add_argument('--use-script', '--use_script', action='store_true')
+    parser.add_argument('--use-kineto', '--use_kineto', action='store_true')
+    parser.add_argument('--profiling-tensor-size', '--profiling_tensor_size', default=1, type=int)
+    parser.add_argument('--workload', '--workload', default='loop', type=str)
+    parser.add_argument('--internal-iter', '--internal_iter', default=256, type=int)
+    parser.add_argument('--timer-min-run-time', '--timer_min_run_time', default=10, type=int)
+    parser.add_argument('--cuda-only', '--cuda_only', action='store_true')
 
     args = parser.parse_args()
 
diff --git a/benchmarks/record_function_benchmark/record_function_bench.py b/benchmarks/record_function_benchmark/record_function_bench.py
index 830328247bb5..d8c9e90b7743 100644
--- a/benchmarks/record_function_benchmark/record_function_bench.py
+++ b/benchmarks/record_function_benchmark/record_function_bench.py
@@ -92,7 +92,7 @@ def run_bench(model_names, bench_args):
     parser.add_argument('--lstmMiniBatch', default='64', type=int)
     parser.add_argument('--warmup', default='2', type=int)
     parser.add_argument('--nloops', default='50', type=int)
-    parser.add_argument('--timer_min_run_time', default=120, type=int)
+    parser.add_argument('--timer-min-run-time', '--timer_min_run_time', default=120, type=int)
 
     args = parser.parse_args()
 
diff --git a/benchmarks/sparse/dlmc/README.md b/benchmarks/sparse/dlmc/README.md
index 26305f3f8428..b1448b190593 100644
--- a/benchmarks/sparse/dlmc/README.md
+++ b/benchmarks/sparse/dlmc/README.md
@@ -4,7 +4,7 @@ These sets of benchmarks are for the sparse matrix functionality using a popular
 
 Performance benchmarks scripts for matrix-matrix and matrix-vector ops (dense-sparse, sparse-sparse, and compare to dense-dense) are implemented here.
 
-- `matmul_bench.py` with `--operation sparse@sparse|sparse@dense` is for Sparse matrix-matrix multiplication (SPMM) performance test. It can run in forward and backward mode with `--backward_test`, on CPU or CUDA with `--with_cuda`, using different datasets from the dataset collection DLMC. For more details see `test.sh` file.
+- `matmul_bench.py` with `--operation sparse@sparse|sparse@dense` is for Sparse matrix-matrix multiplication (SPMM) performance test. It can run in forward and backward mode with `--backward-test`, on CPU or CUDA with `--with-cuda`, using different datasets from the dataset collection DLMC. For more details see `test.sh` file.
 
 - `matmul_bench.py` with `--operation sparse@vector` is for Sparse matrix-vector multiplication (SPMV) performance test.
 
diff --git a/benchmarks/sparse/dlmc/matmul_bench.py b/benchmarks/sparse/dlmc/matmul_bench.py
index 504686654607..6b896ddf34a6 100644
--- a/benchmarks/sparse/dlmc/matmul_bench.py
+++ b/benchmarks/sparse/dlmc/matmul_bench.py
@@ -41,11 +41,11 @@ def parse_args():
     parser = argparse.ArgumentParser(description='matmul benchmark')
     parser.add_argument('--path', type=str, help='DLMC dataset path')
     parser.add_argument('--dataset', type=str, default='magnitude_pruning')
-    parser.add_argument('--hidden_size', default=2048, type=int)
-    parser.add_argument('--backward_test', action="store_true")
+    parser.add_argument('--hidden-size', '--hidden_size', default=2048, type=int)
+    parser.add_argument('--backward-test', '--backward_test', action="store_true")
     parser.add_argument('--operation', type=str, help="|".join(OPS_MAP.keys()), default=next(iter(OPS_MAP)))
-    parser.add_argument('--with_cuda', action='store_true')
-    parser.add_argument('--timer_min_run_time', default=1, type=float)
+    parser.add_argument('--with-cuda', '--with_cuda', action='store_true')
+    parser.add_argument('--timer-min-run-time', '--timer_min_run_time', default=1, type=float)
     return parser
 
 
diff --git a/benchmarks/sparse/dlmc/test.sh b/benchmarks/sparse/dlmc/test.sh
index ac5f32e0bdfc..96a277ca8fea 100644
--- a/benchmarks/sparse/dlmc/test.sh
+++ b/benchmarks/sparse/dlmc/test.sh
@@ -8,20 +8,20 @@ DATASET_ROOT_DIR=$HOME/datasets/
 echo "!! SPARSE SPMS TIME BENCHMARK!! "
 
 # cpu
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --backward_test
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --backward-test
 
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --backward_test
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --backward-test
 
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@vector
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@vector
 
 
 # cuda
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --with_cuda
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --with_cuda--backward_test
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --with-cuda
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@sparse --with-cuda --backward-test
 
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --with_cuda
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --with_cuda --backward_test
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --with-cuda
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@dense --with-cuda --backward-test
 
-python -m dlmc.matmul_bench  --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@vector --with_cuda
+python -m dlmc.matmul_bench --path $DATASET_ROOT_DIR/dlmc/rn50 --dataset magnitude_pruning --operation sparse@vector --with-cuda
diff --git a/benchmarks/sparse/spmm.py b/benchmarks/sparse/spmm.py
index 5877c3e4ec50..722f67b55d28 100644
--- a/benchmarks/sparse/spmm.py
+++ b/benchmarks/sparse/spmm.py
@@ -70,9 +70,9 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count):
     parser.add_argument("--m", default='1000', type=int)
     parser.add_argument("--n", default='1000', type=int)
     parser.add_argument("--k", default='1000', type=int)
-    parser.add_argument("--nnz_ratio", default='0.1', type=float)
+    parser.add_argument("--nnz-ratio", "--nnz_ratio", default='0.1', type=float)
     parser.add_argument("--outfile", default='stdout', type=str)
-    parser.add_argument("--test_count", default='10', type=int)
+    parser.add_argument("--test-count", "--test_count", default='10', type=int)
 
     args = parser.parse_args()
 
diff --git a/benchmarks/sparse/spmv.py b/benchmarks/sparse/spmv.py
index 46d84ee637db..252383b83fdd 100644
--- a/benchmarks/sparse/spmv.py
+++ b/benchmarks/sparse/spmv.py
@@ -68,9 +68,9 @@ def test_sparse_coo_and_csr(m, nnz, test_count):
 
     parser.add_argument("--format", default='csr', type=str)
     parser.add_argument("--m", default='1000', type=int)
-    parser.add_argument("--nnz_ratio", default='0.1', type=float)
+    parser.add_argument("--nnz-ratio", "--nnz_ratio", default='0.1', type=float)
     parser.add_argument("--outfile", default='stdout', type=str)
-    parser.add_argument("--test_count", default='10', type=int)
+    parser.add_argument("--test-count", "--test_count", default='10', type=int)
 
     args = parser.parse_args()
 
diff --git a/benchmarks/sparse/test_csr.sh b/benchmarks/sparse/test_csr.sh
index a1e0427a20ae..c793658e31ea 100644
--- a/benchmarks/sparse/test_csr.sh
+++ b/benchmarks/sparse/test_csr.sh
@@ -18,8 +18,8 @@ cd benchmarks
 echo "!! SPARSE SPMM TIME BENCHMARK!! " >> $OUTFILE
 for dim0 in 1000 5000 10000; do
     for nnzr in 0.01 0.05 0.1 0.3; do
-        python -m sparse.spmm --format csr --m $dim0 --n $dim0 --k $dim0 --nnz_ratio $nnzr --outfile $OUTFILE
-        # python -m sparse.spmm --format coo --m $dim0 --n $dim0 --k $dim0 --nnz_ratio $nnzr --outfile $OUTFILE
+        python -m sparse.spmm --format csr --m $dim0 --n $dim0 --k $dim0 --nnz-ratio $nnzr --outfile $OUTFILE
+        # python -m sparse.spmm --format coo --m $dim0 --n $dim0 --k $dim0 --nnz-ratio $nnzr --outfile $OUTFILE
     done
 done
 echo "----------------------" >> $OUTFILE
@@ -34,8 +34,8 @@ python setup.py install
 cd benchmarks
 for dim0 in 1000 5000 10000; do
     for nnzr in 0.01 0.05 0.1 0.3; do
-        python -m sparse.spmv --format csr --m $dim0 --nnz_ratio $nnzr --outfile $OUTFILE
-        python -m sparse.spmv --format coo --m $dim0 --nnz_ratio $nnzr --outfile $OUTFILE
+        python -m sparse.spmv --format csr --m $dim0 --nnz-ratio $nnzr --outfile $OUTFILE
+        python -m sparse.spmv --format coo --m $dim0 --nnz-ratio $nnzr --outfile $OUTFILE
     done
 done
 echo "----------------------" >> $OUTFILE
diff --git a/benchmarks/sparse/utils.py b/benchmarks/sparse/utils.py
index 3d58a3b9aa30..b2cabcfdc693 100644
--- a/benchmarks/sparse/utils.py
+++ b/benchmarks/sparse/utils.py
@@ -6,7 +6,7 @@
 import time
 
 # shim for torch.cuda.Event when running on cpu
-class Event(object):
+class Event:
     def __init__(self, enable_timing):
         pass
 
diff --git a/benchmarks/static_runtime/test_utils.cc b/benchmarks/static_runtime/test_utils.cc
index 59699c42fea1..d7f49c7171cb 100644
--- a/benchmarks/static_runtime/test_utils.cc
+++ b/benchmarks/static_runtime/test_utils.cc
@@ -353,8 +353,8 @@ void testStaticRuntime(
 
           size_t new_managed_bytes =
               memory_planner ? memory_planner->total_managed() : 0;
-          if (check_resize && new_managed_bytes > 0) {
-            EXPECT_GT(new_managed_bytes, managed_bytes);
+          if (check_resize && new_managed_bytes >= 0) {
+            EXPECT_GE(new_managed_bytes, managed_bytes);
           }
 
           // Run static runtime again with an input of the shape observed during
diff --git a/benchmarks/tensorexpr/HowToRun.md b/benchmarks/tensorexpr/HowToRun.md
index a1b241d7ac48..17061ad21934 100644
--- a/benchmarks/tensorexpr/HowToRun.md
+++ b/benchmarks/tensorexpr/HowToRun.md
@@ -6,5 +6,5 @@ to show documentation.
 
 An example of an actual command line that one might use as a starting point:
 ```
-python -m benchmarks.tensorexpr --device gpu --mode fwd --jit_mode trace --cuda_fuser=te
+python -m benchmarks.tensorexpr --device gpu --mode fwd --jit-mode trace --cuda-fuser=te
 ```
diff --git a/benchmarks/tensorexpr/__main__.py b/benchmarks/tensorexpr/__main__.py
index f984dbccd02d..ed632e966b2c 100644
--- a/benchmarks/tensorexpr/__main__.py
+++ b/benchmarks/tensorexpr/__main__.py
@@ -67,30 +67,35 @@ def main():
         help="the underlying tensor engine. only pt for now",
     )
     parser.add_argument(
+        "--jit-mode",
         "--jit_mode",
         type=str,
         default="trace",
         help="the jit mode to use: one of {trace, none}",
     )
     parser.add_argument(
+        "--cuda-pointwise-loop-levels",
         "--cuda_pointwise_loop_levels",
         type=int,
         default=None,
         help="num of loop levesl for Cuda pointwise operations: 2 or 3",
     )
     parser.add_argument(
+        "--cuda-pointwise-block-count",
         "--cuda_pointwise_block_count",
         type=int,
         default=None,
         help="num of block for Cuda pointwise operations",
     )
     parser.add_argument(
+        "--cuda-pointwise-block-size",
         "--cuda_pointwise_block_size",
         type=int,
         default=None,
         help="num of blocks for Cuda pointwise operations",
     )
     parser.add_argument(
+        "--cuda-fuser",
         "--cuda_fuser",
         type=str,
         default="te",
@@ -118,12 +123,14 @@ def main():
         help="Disable shape randomization in dynamic benchmarks.",
     )
     parser.add_argument(
+        "--cpu-fusion",
         "--cpu_fusion",
         default=False,
         action='store_true',
         help="Enable CPU fusion.",
     )
     parser.add_argument(
+        "--cat-wo-conditionals",
         "--cat_wo_conditionals",
         default=False,
         action='store_true',
diff --git a/benchmarks/tensorexpr/benchmark.py b/benchmarks/tensorexpr/benchmark.py
index f37d0a7e5c1b..c560ff57a348 100644
--- a/benchmarks/tensorexpr/benchmark.py
+++ b/benchmarks/tensorexpr/benchmark.py
@@ -7,7 +7,7 @@
 import json
 
 
-class Benchmark(object):
+class Benchmark:
     def __init__(self, mode, device, dtype):
         self.mode = mode
         self.deterministic = False
@@ -238,7 +238,7 @@ def cuda_pointwise_context(loop_levels, block_count, block_size):
         torch._C._jit_set_te_cuda_pointwise_block_size(old_block_size)
 
 # Auxiliary class to facilitate dynamic input shape
-class DynamicShape(object):
+class DynamicShape:
     r'''
     An Auxiliary class for dynamic shape benchmarks
 
diff --git a/benchmarks/tensorexpr/broadcast.py b/benchmarks/tensorexpr/broadcast.py
index 364bc61c1f8c..a4547b9ea3b9 100644
--- a/benchmarks/tensorexpr/broadcast.py
+++ b/benchmarks/tensorexpr/broadcast.py
@@ -69,7 +69,7 @@ def memory_workload(self):
 
 class BroadcastRowBench(BroadcastMulBench):
     def __init__(self, mode, device, dtype, M, N, K):
-        super(BroadcastRowBench, self).__init__(mode, device, dtype, "row", M, N, K)
+        super().__init__(mode, device, dtype, "row", M, N, K)
 
     @staticmethod
     def module():
@@ -78,7 +78,7 @@ def module():
 
 class BroadcastMidBench(BroadcastMulBench):
     def __init__(self, mode, device, dtype, M, N, K):
-        super(BroadcastMidBench, self).__init__(mode, device, dtype, "mid", M, N, K)
+        super().__init__(mode, device, dtype, "mid", M, N, K)
 
     @staticmethod
     def module():
@@ -87,7 +87,7 @@ def module():
 
 class BroadcastColBench(BroadcastMulBench):
     def __init__(self, mode, device, dtype, M, N, K):
-        super(BroadcastColBench, self).__init__(mode, device, dtype, "col", M, N, K)
+        super().__init__(mode, device, dtype, "col", M, N, K)
 
     @staticmethod
     def module():
diff --git a/benchmarks/tensorexpr/microbenchmarks.py b/benchmarks/tensorexpr/microbenchmarks.py
index 1ba84ce355df..9a929064664d 100644
--- a/benchmarks/tensorexpr/microbenchmarks.py
+++ b/benchmarks/tensorexpr/microbenchmarks.py
@@ -7,7 +7,7 @@
 import seaborn as sns
 import argparse
 
-class kernel_arena_scope(object):
+class kernel_arena_scope:
     def __enter__(self):
         self.scope = te.KernelScope()
 
@@ -247,7 +247,7 @@ def dump_plot(df, sizes):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Runs NNC microbenchmarks')
-    parser.add_argument('--multi_threaded', action='store_true', help='Run with more than one thread')
+    parser.add_argument('--multi-threaded', '--multi_threaded', action='store_true', help='Run with more than one thread')
     args = parser.parse_args()
     if not args.multi_threaded:
         torch.set_num_threads(1)
diff --git a/benchmarks/tensorexpr/pt_engine.py b/benchmarks/tensorexpr/pt_engine.py
index c25b568a2271..e09ee4cb38ce 100644
--- a/benchmarks/tensorexpr/pt_engine.py
+++ b/benchmarks/tensorexpr/pt_engine.py
@@ -1,7 +1,7 @@
 import torch
 
 
-class TorchTensorEngine(object):
+class TorchTensorEngine:
     def rand(self, shape, device=None, dtype=None, requires_grad=False):
         return torch.rand(shape, device=device, dtype=dtype, requires_grad=requires_grad)
 
diff --git a/benchmarks/tensorexpr/reduction.py b/benchmarks/tensorexpr/reduction.py
index c50d639a6576..77d64074eb81 100644
--- a/benchmarks/tensorexpr/reduction.py
+++ b/benchmarks/tensorexpr/reduction.py
@@ -80,7 +80,7 @@ def _skip_input_transform_str(self):
 
 class ReduceRowBench(ReduceBench):
     def __init__(self, mode, device, dtype, M, N, K, skip_input_transform):
-        super(ReduceRowBench, self).__init__(mode, device, dtype, "row", M, N, K, skip_input_transform)
+        super().__init__(mode, device, dtype, "row", M, N, K, skip_input_transform)
 
     @staticmethod
     def module():
@@ -89,7 +89,7 @@ def module():
 
 class ReduceMidBench(ReduceBench):
     def __init__(self, mode, device, dtype, M, N, K, skip_input_transform):
-        super(ReduceMidBench, self).__init__(mode, device, dtype, "mid", M, N, K, skip_input_transform)
+        super().__init__(mode, device, dtype, "mid", M, N, K, skip_input_transform)
 
     @staticmethod
     def module():
@@ -98,7 +98,7 @@ def module():
 
 class ReduceColBench(ReduceBench):
     def __init__(self, mode, device, dtype, M, N, K, skip_input_transform):
-        super(ReduceColBench, self).__init__(mode, device, dtype, "col", M, N, K, skip_input_transform)
+        super().__init__(mode, device, dtype, "col", M, N, K, skip_input_transform)
 
     @staticmethod
     def module():
@@ -107,7 +107,7 @@ def module():
 
 class ReduceFullBench(ReduceBench):
     def __init__(self, mode, device, dtype, M, skip_input_transform):
-        super(ReduceFullBench, self).__init__(mode, device, dtype, "full", M, 1, 1, skip_input_transform)
+        super().__init__(mode, device, dtype, "full", M, 1, 1, skip_input_transform)
 
     def config(self):
         return [self.M * self.N * self.K, self._skip_input_transform_str()]
@@ -178,7 +178,7 @@ def memory_workload(self):
 
 class Reduce2DInnerBench(Reduce2DBench):
     def __init__(self, mode, device, dtype, dim0, dim1):
-        super(Reduce2DInnerBench, self).__init__(mode, device, dtype, 1, dim0, dim1)
+        super().__init__(mode, device, dtype, 1, dim0, dim1)
 
     @staticmethod
     def default_configs():
@@ -186,7 +186,7 @@ def default_configs():
         return [parent_config[1:]]
 
     def config(self):
-        parent_config = super(Reduce2DInnerBench, self).config()
+        parent_config = super().config()
         return parent_config[1:]
 
     @staticmethod
@@ -195,7 +195,7 @@ def module():
 
 class Reduce2DOuterBench(Reduce2DBench):
     def __init__(self, mode, device, dtype, dim0, dim1):
-        super(Reduce2DOuterBench, self).__init__(mode, device, dtype, 0, dim0, dim1)
+        super().__init__(mode, device, dtype, 0, dim0, dim1)
 
     @staticmethod
     def default_configs():
@@ -203,7 +203,7 @@ def default_configs():
         return [parent_config[1:]]
 
     def config(self):
-        parent_config = super(Reduce2DOuterBench, self).config()
+        parent_config = super().config()
         return parent_config[1:]
 
     @staticmethod
@@ -249,7 +249,7 @@ def default_configs():
         return [parent_config[1:]]
 
     def config(self):
-        parent_config = super(DynamicReduce2DInnerBench, self).config()
+        parent_config = super().config()
         return parent_config[1:]
 
     @staticmethod
@@ -267,7 +267,7 @@ def default_configs():
         return [parent_config[1:]]
 
     def config(self):
-        parent_config = super(DynamicReduce2DInnerBench, self).config()
+        parent_config = super().config()
         return parent_config[1:]
 
     @staticmethod
diff --git a/benchmarks/transformer/better_transformer_vs_mha_functional.py b/benchmarks/transformer/better_transformer_vs_mha_functional.py
index b76077ba4c22..25cc7a15d6c2 100644
--- a/benchmarks/transformer/better_transformer_vs_mha_functional.py
+++ b/benchmarks/transformer/better_transformer_vs_mha_functional.py
@@ -185,8 +185,8 @@ def main(save_path: Optional[Path], error_path: Optional[Path]):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--save_path", type=str, help="Path to save the results")
-    parser.add_argument("--error_save_path", type=str, help="Path to save the errors")
+    parser.add_argument("--save-path", "--save_path", type=str, help="Path to save the results")
+    parser.add_argument("--error-save-path", "--error_save_path", type=str, help="Path to save the errors")
 
     args = parser.parse_args()
     save_path = Path(args.save_path) if args.save_path else None
diff --git a/benchmarks/transformer/sdp.py b/benchmarks/transformer/sdp.py
index 6cf01c15cf0b..3a5af1490bbe 100644
--- a/benchmarks/transformer/sdp.py
+++ b/benchmarks/transformer/sdp.py
@@ -99,7 +99,7 @@ def __init__(self, num_heads, in_proj_weight, in_proj_bias, out_proj):
         self.out_proj = out_proj
         self.num_heads = num_heads
 
-    def forward(self, query, key, value, mask, need_weights=False):
+    def forward(self, query, key, value, mask):
         if not (query is key and key is value):
             raise NotImplementedError(
                 "query, key and value must be the same Tensor for now."
@@ -122,13 +122,12 @@ def forward(self, query, key, value, mask, need_weights=False):
         value = value.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
 
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        attn, _ = torch.nn.functional._scaled_dot_product_attention(
+        attn = torch.nn.functional.scaled_dot_product_attention(
             query,
             key,
             value,
             attn_mask=None,
             dropout_p=0.0,
-            need_attn_weights=need_weights,
             is_causal=False,
         )
 
@@ -223,17 +222,17 @@ def run_single_experiment(config: ExperimentConfig) -> ExperimentResults:
             config.pad_percentage,
             config.dtype,
         )
-        nn_mha_output, _ = nn_mha(qkv, qkv, qkv, mask, need_weights=False)
-        composite_mha_output, _ = composite_mha(qkv, qkv, qkv, mask, need_weights=False)
+        nn_mha_output, _ = nn_mha(qkv, qkv, qkv, mask)
+        composite_mha_output, _ = composite_mha(qkv, qkv, qkv, mask)
 
         # First order sanity check
         assert_close_tensors(nn_mha_output, composite_mha_output)
 
         nn_mha_time = benchmark_torch_function_in_microseconds(
-            nn_mha, qkv, qkv, qkv, mask, need_weights=False
+            nn_mha, qkv, qkv, qkv, mask
         )
         composite_mha_time = benchmark_torch_function_in_microseconds(
-            composite_mha, qkv, qkv, qkv, mask, need_weights=False
+            composite_mha, qkv, qkv, qkv, mask
         )
 
         # TorchDynamo will error on NestedTensors
@@ -242,11 +241,11 @@ def run_single_experiment(config: ExperimentConfig) -> ExperimentResults:
             compiled_composite_mha = torch.compile(composite_mha)
 
             compiled_nn_mha_time = benchmark_torch_function_in_microseconds(
-                compiled_nn_mha, qkv, qkv, qkv, mask, need_weights=False
+                compiled_nn_mha, qkv, qkv, qkv, mask
             )
 
             compiled_composite_mha_time = benchmark_torch_function_in_microseconds(
-                compiled_composite_mha, qkv, qkv, qkv, mask, need_weights=False
+                compiled_composite_mha, qkv, qkv, qkv, mask,
             )
         else:
             compiled_nn_mha_time = None
@@ -340,7 +339,7 @@ def main(save_path: Optional[Path]):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--save_path", type=str, help="Path to save the results")
+    parser.add_argument("--save-path", "--save_path", type=str, help="Path to save the results")
 
     args = parser.parse_args()
     save_path = Path(args.save_path) if args.save_path else None
diff --git a/benchmarks/transformer/sdp_backwards.py b/benchmarks/transformer/sdp_backwards.py
index 2f745e157b28..c1169cfb8b6d 100644
--- a/benchmarks/transformer/sdp_backwards.py
+++ b/benchmarks/transformer/sdp_backwards.py
@@ -36,13 +36,12 @@ def forward(self, query, key, value, mask):
         value = value.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
 
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        attn, _ = torch.nn.functional._scaled_dot_product_attention(
+        attn, _ = torch.nn.functional.scaled_dot_product_attention(
             query,
             key,
             value,
             attn_mask=None,
             dropout_p=0.0,
-            need_attn_weights=False,
             is_causal=False,
         )
 
diff --git a/benchmarks/upload_scribe.py b/benchmarks/upload_scribe.py
index 5068dd287e9d..d476ade1b8df 100644
--- a/benchmarks/upload_scribe.py
+++ b/benchmarks/upload_scribe.py
@@ -129,7 +129,7 @@ def post_pytest_benchmarks(self, pytest_json):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--pytest_bench_json", type=argparse.FileType('r'),
+    parser.add_argument("--pytest-bench-json", "--pytest_bench_json", type=argparse.FileType('r'),
                         help='Upload json data formatted by pytest-benchmark module')
     args = parser.parse_args()
     if args.pytest_bench_json:
diff --git a/binaries/bench_gen/bench_gen.py b/binaries/bench_gen/bench_gen.py
index 8684e07ee4fd..aab941cf1cde 100755
--- a/binaries/bench_gen/bench_gen.py
+++ b/binaries/bench_gen/bench_gen.py
@@ -67,16 +67,16 @@ def main(args):
     parser.add_argument("--context", help="Context to run on.", default="CPU")
     parser.add_argument("--kwargs", help="kwargs to pass to operator.",
                         nargs="*", type=parse_kwarg, default=[])
-    parser.add_argument("--init_net", help="Output initialization net.",
+    parser.add_argument("--init-net", "--init_net", help="Output initialization net.",
                         default="init_net.pb")
-    parser.add_argument("--predict_net", help="Output prediction net.",
+    parser.add_argument("--predict-net", "--predict_net", help="Output prediction net.",
                         default="predict_net.pb")
-    parser.add_argument("--benchmark_name",
+    parser.add_argument("--benchmark-name", "--benchmark_name",
                         help="Name of the benchmark network",
                         default="benchmark")
-    parser.add_argument("--input_name", help="Name of the input blob.",
+    parser.add_argument("--input-name", "--input_name", help="Name of the input blob.",
                         default="data")
-    parser.add_argument("--output_name", help="Name of the output blob.",
+    parser.add_argument("--output-name", "--output_name", help="Name of the output blob.",
                         default="output")
     parser.add_argument("--instances",
                         help="Number of instances to run the operator.",
diff --git a/buckbuild.bzl b/buckbuild.bzl
index eabfe45962a0..215c4a11c3b4 100644
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@@ -821,28 +821,6 @@ def define_buck_targets(
         ],
     )
 
-    fb_xplat_cxx_library(
-        name = "th_header",
-        header_namespace = "",
-        exported_headers = subdir_glob([
-            # TH
-            ("aten/src", "TH/*.h"),
-            ("aten/src", "TH/*.hpp"),
-            ("aten/src", "TH/generic/*.h"),
-            ("aten/src", "TH/generic/*.hpp"),
-            ("aten/src", "TH/generic/simd/*.h"),
-            ("aten/src", "TH/vector/*.h"),
-            ("aten/src", "TH/generic/*.c"),
-            ("aten/src", "TH/generic/*.cpp"),
-            ("aten/src/TH", "*.h"),  # for #include <THGenerateFloatTypes.h>
-            # THNN
-            ("aten/src", "THNN/*.h"),
-            ("aten/src", "THNN/generic/*.h"),
-            ("aten/src", "THNN/generic/*.c"),
-        ]),
-        labels = labels,
-    )
-
     fb_xplat_cxx_library(
         name = "aten_header",
         header_namespace = "",
@@ -909,7 +887,6 @@ def define_buck_targets(
             [
                 ("torch/csrc/api/include", "torch/**/*.h"),
                 ("", "torch/csrc/**/*.h"),
-                ("", "torch/csrc/generic/*.cpp"),
                 ("", "torch/script.h"),
                 ("", "torch/library.h"),
                 ("", "torch/custom_class.h"),
@@ -2128,6 +2105,7 @@ def define_buck_targets(
             "torch/csrc/jit/mobile/prim_ops_registery.cpp",
             "torch/csrc/jit/runtime/operator.cpp",
             "torch/csrc/jit/runtime/slice_indices_adjust.cpp",
+            "torch/csrc/utils/cpp_stacktraces.cpp",
         ],
         header_namespace = "",
         exported_headers = [
diff --git a/build_variables.bzl b/build_variables.bzl
index 2b4df5f833ab..2558a112ac50 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -18,34 +18,34 @@ GENERATED_LAZY_TS_CPP = [
 
 # NVFuser runtime library
 libtorch_nvfuser_runtime_sources = [
-    "torch/csrc/jit/codegen/cuda/runtime/array.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/array_rocm.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/bf16_support.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/bf16_support_rocm.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/block_reduction.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/block_sync_default.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/block_sync_default_rocm.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/broadcast.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/fused_welford_helper.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/helpers.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/index_utils.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/memory.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/swizzle.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/tensor.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/tensorcore.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/tuple.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/type_traits.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/warp.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/warp_rocm.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/welford.cu",
+    "third_party/nvfuser/runtime/array.cu",
+    "third_party/nvfuser/runtime/array_rocm.cu",
+    "third_party/nvfuser/runtime/bf16_support.cu",
+    "third_party/nvfuser/runtime/bf16_support_rocm.cu",
+    "third_party/nvfuser/runtime/block_reduction.cu",
+    "third_party/nvfuser/runtime/block_sync_atomic.cu",
+    "third_party/nvfuser/runtime/block_sync_default.cu",
+    "third_party/nvfuser/runtime/block_sync_default_rocm.cu",
+    "third_party/nvfuser/runtime/broadcast.cu",
+    "third_party/nvfuser/runtime/fp16_support.cu",
+    "third_party/nvfuser/runtime/fused_reduction.cu",
+    "third_party/nvfuser/runtime/fused_welford_helper.cu",
+    "third_party/nvfuser/runtime/fused_welford_impl.cu",
+    "third_party/nvfuser/runtime/grid_broadcast.cu",
+    "third_party/nvfuser/runtime/grid_reduction.cu",
+    "third_party/nvfuser/runtime/grid_sync.cu",
+    "third_party/nvfuser/runtime/helpers.cu",
+    "third_party/nvfuser/runtime/index_utils.cu",
+    "third_party/nvfuser/runtime/memory.cu",
+    "third_party/nvfuser/runtime/random_numbers.cu",
+    "third_party/nvfuser/runtime/swizzle.cu",
+    "third_party/nvfuser/runtime/tensor.cu",
+    "third_party/nvfuser/runtime/tensorcore.cu",
+    "third_party/nvfuser/runtime/tuple.cu",
+    "third_party/nvfuser/runtime/type_traits.cu",
+    "third_party/nvfuser/runtime/warp.cu",
+    "third_party/nvfuser/runtime/warp_rocm.cu",
+    "third_party/nvfuser/runtime/welford.cu",
     "aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh",
     "aten/src/ATen/cuda/detail/UnpackRaw.cuh",
 ]
@@ -71,7 +71,6 @@ def libtorch_generated_sources(gencode_pattern):
 
 # copied from https://github.com/pytorch/pytorch/blob/f99a693cd9ff7a9b5fdc71357dac66b8192786d3/aten/src/ATen/core/CMakeLists.txt
 jit_core_headers = [
-    "torch/csrc/utils/memory.h",
     "torch/csrc/Export.h",
     "torch/csrc/jit/frontend/source_range.h",
     "torch/csrc/jit/serialization/callstack_debug_info_serialization.h",
@@ -84,6 +83,8 @@ jit_core_headers = [
     "torch/csrc/jit/frontend/schema_type_parser.h",
     "torch/csrc/jit/frontend/error_report.h",
     "torch/csrc/jit/frontend/tree.h",
+    "torch/csrc/utils/cpp_stacktraces.h",
+    "torch/csrc/utils/memory.h",
     "torch/custom_class.h",
     "torch/custom_class_detail.h",
     "torch/library.h",
@@ -96,6 +97,7 @@ jit_core_sources = [
     "torch/csrc/jit/frontend/schema_type_parser.cpp",
     "torch/csrc/jit/frontend/strtod.cpp",
     "torch/csrc/jit/frontend/source_range.cpp",
+    "torch/csrc/utils/cpp_stacktraces.cpp",
 ]
 
 # copied from https://github.com/pytorch/pytorch/blob/0bde610c14b92d351b968a0228df29e92442b1cc/torch/CMakeLists.txt
@@ -231,6 +233,7 @@ core_sources_full_mobile_no_backend_interface_xplat = [
     "torch/csrc/jit/ir/node_hashing.cpp",
     "torch/csrc/jit/ir/scope.cpp",
     "torch/csrc/jit/ir/subgraph_matcher.cpp",
+    "torch/csrc/jit/ir/graph_utils.cpp",
     "torch/csrc/jit/jit_log.cpp",
     "torch/csrc/jit/jit_opt_limit.cpp",
     "torch/csrc/jit/mobile/nnc/aot_compiler.cpp",
@@ -402,7 +405,6 @@ core_sources_full_mobile_no_backend_interface_xplat = [
     "torch/csrc/jit/tensorexpr/unique_name_manager.cpp",
     "torch/csrc/jit/testing/file_check.cpp",
     "torch/csrc/jit/testing/hooks_for_testing.cpp",
-    "torch/csrc/utils/cpp_stacktraces.cpp",
     "torch/csrc/utils/schema_info.cpp",
     "torch/csrc/utils/tensor_flatten.cpp",
     "torch/csrc/utils/variadic.cpp",
@@ -677,107 +679,6 @@ libtorch_cuda_core_sources = [
     "torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp",
     "torch/csrc/profiler/stubs/cuda.cpp",
     "torch/csrc/autograd/functions/comm.cpp",
-    "torch/csrc/jit/codegen/cuda/arith.cpp",
-    "torch/csrc/jit/codegen/cuda/compute_at.cpp",
-    "torch/csrc/jit/codegen/cuda/inlining.cpp",
-    "torch/csrc/jit/codegen/cuda/compute_at_map.cpp",
-    "torch/csrc/jit/codegen/cuda/codegen.cpp",
-    "torch/csrc/jit/codegen/cuda/contiguity.cpp",
-    "torch/csrc/jit/codegen/cuda/dispatch.cpp",
-    "torch/csrc/jit/codegen/cuda/expr_evaluator.cpp",
-    "torch/csrc/jit/codegen/cuda/executor.cpp",
-    "torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp",
-    "torch/csrc/jit/codegen/cuda/executor_launch_params.cpp",
-    "torch/csrc/jit/codegen/cuda/evaluator_common.cpp",
-    "torch/csrc/jit/codegen/cuda/executor_utils.cpp",
-    "torch/csrc/jit/codegen/cuda/fusion.cpp",
-    "torch/csrc/jit/codegen/cuda/graph_fuser.cpp",
-    "torch/csrc/jit/codegen/cuda/grouped_reduction.cpp",
-    "torch/csrc/jit/codegen/cuda/index_compute.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_index_compute.cpp",
-    "torch/csrc/jit/codegen/cuda/instrumentation.cpp",
-    "torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp",
-    "torch/csrc/jit/codegen/cuda/ir_builder.cpp",
-    "torch/csrc/jit/codegen/cuda/ir_cloner.cpp",
-    "torch/csrc/jit/codegen/cuda/ir_container.cpp",
-    "torch/csrc/jit/codegen/cuda/ir_graphviz.cpp",
-    "torch/csrc/jit/codegen/cuda/ir_nodes.cpp",
-    "torch/csrc/jit/codegen/cuda/ir_iostream.cpp",
-    "torch/csrc/jit/codegen/cuda/ir_utils.cpp",
-    "torch/csrc/jit/codegen/cuda/iter_visitor.cpp",
-    "torch/csrc/jit/codegen/cuda/kernel.cpp",
-    "torch/csrc/jit/codegen/cuda/kernel_cache.cpp",
-    "torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp",
-    "torch/csrc/jit/codegen/cuda/kernel_ir.cpp",
-    "torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_allocation.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_divisible_split.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_fused_reduction.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_index.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_instrument.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_loops.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_predicate.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_replace_size.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_shift.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_sync_information.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_unroll.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_utils.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_validation.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp",
-    "torch/csrc/jit/codegen/cuda/lower2device.cpp",
-    "torch/csrc/jit/codegen/cuda/lower_bank_conflict.cpp",
-    "torch/csrc/jit/codegen/cuda/manager.cpp",
-    "torch/csrc/jit/codegen/cuda/maxinfo_propagator.cpp",
-    "torch/csrc/jit/codegen/cuda/mutator.cpp",
-    "torch/csrc/jit/codegen/cuda/non_divisible_split.cpp",
-    "torch/csrc/jit/codegen/cuda/ops/alias.cpp",
-    "torch/csrc/jit/codegen/cuda/ops/composite.cpp",
-    "torch/csrc/jit/codegen/cuda/ops/normalization.cpp",
-    "torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp",
-    "torch/csrc/jit/codegen/cuda/parallel_type_bitmap.cpp",
-    "torch/csrc/jit/codegen/cuda/parser.cpp",
-    "torch/csrc/jit/codegen/cuda/partial_split_map.cpp",
-    "torch/csrc/jit/codegen/cuda/partition.cpp",
-    "torch/csrc/jit/codegen/cuda/predicate_compute.cpp",
-    "torch/csrc/jit/codegen/cuda/python_frontend/fusion_cache.cpp",
-    "torch/csrc/jit/codegen/cuda/python_frontend/fusion_definition.cpp",
-    "torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.cpp",
-    "torch/csrc/jit/codegen/cuda/register_interface.cpp",
-    "torch/csrc/jit/codegen/cuda/root_domain_map.cpp",
-    "torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp",
-    "torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.cpp",
-    "torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp",
-    "torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp",
-    "torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp",
-    "torch/csrc/jit/codegen/cuda/scheduler/matmul.cpp",
-    "torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.cpp",
-    "torch/csrc/jit/codegen/cuda/scheduler/registry.cpp",
-    "torch/csrc/jit/codegen/cuda/scheduler/utils.cpp",
-    "torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.cpp",
-    "torch/csrc/jit/codegen/cuda/type_inference.cpp",
-    "torch/csrc/jit/codegen/cuda/type_promotion.cpp",
-    "torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp",
-    "torch/csrc/jit/codegen/cuda/tensor_view.cpp",
-    "torch/csrc/jit/codegen/cuda/transform_iter.cpp",
-    "torch/csrc/jit/codegen/cuda/transform_replay.cpp",
-    "torch/csrc/jit/codegen/cuda/transform_rfactor.cpp",
-    "torch/csrc/jit/codegen/cuda/transform_view.cpp",
-    "torch/csrc/jit/codegen/cuda/type.cpp",
-    "torch/csrc/jit/codegen/cuda/utils.cpp",
-    "torch/csrc/jit/codegen/cuda/mma_type.cpp",
-    "torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp",
     "torch/csrc/jit/passes/frozen_conv_add_relu_fusion_cuda.cpp",
     "torch/csrc/jit/tensorexpr/cuda_codegen.cpp",
     "torch/csrc/jit/runtime/register_cuda_ops.cpp",
@@ -895,6 +796,7 @@ libtorch_python_core_sources = [
     "torch/csrc/MemoryFormat.cpp",
     "torch/csrc/QScheme.cpp",
     "torch/csrc/Module.cpp",
+    "torch/csrc/PyInterpreter.cpp",
     "torch/csrc/python_dimname.cpp",
     "torch/csrc/Size.cpp",
     "torch/csrc/Storage.cpp",
@@ -921,8 +823,8 @@ libtorch_python_core_sources = [
     "torch/csrc/dynamo/guards.cpp",
     "torch/csrc/dynamo/init.cpp",
     "torch/csrc/functorch/init.cpp",
+    "torch/csrc/mps/Module.cpp",
     "torch/csrc/jit/backends/backend_init.cpp",
-    "torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp",
     "torch/csrc/jit/python/init.cpp",
     "torch/csrc/jit/passes/onnx.cpp",
     "torch/csrc/jit/passes/onnx/cast_all_constant_to_floating.cpp",
@@ -1081,6 +983,7 @@ aten_cpu_source_non_codegen_list = [
     "aten/src/ATen/ParallelOpenMP.cpp",
     "aten/src/ATen/ParallelThreadPoolNative.cpp",
     "aten/src/ATen/PythonTorchFunctionTLS.cpp",
+    "aten/src/ATen/ThreadLocalPythonObjects.cpp",
     "aten/src/ATen/ScalarOps.cpp",
     "aten/src/ATen/SparseTensorImpl.cpp",
     "aten/src/ATen/SparseCsrTensorImpl.cpp",
@@ -1240,6 +1143,7 @@ aten_native_source_codegen_list = [
     "aten/src/ATen/native/cpu/scaled_modified_bessel_k1.cpp",
     "aten/src/ATen/native/cpu/spherical_bessel_j0.cpp",
     "aten/src/ATen/native/cpu/SampledAddmmKernel.cpp",
+    "aten/src/ATen/native/cpu/SpmmReduceKernel.cpp",
     "aten/src/ATen/native/cpu/SparseFactories.cpp",
     "aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp",
 ]
@@ -1328,6 +1232,7 @@ aten_native_source_non_codegen_list = [
     "aten/src/ATen/native/Bucketization.cpp",
     "aten/src/ATen/native/CPUBlas.cpp",
     "aten/src/ATen/native/ChanelShuffle.cpp",
+    "aten/src/ATen/native/Collectives.cpp",
     "aten/src/ATen/native/Col2Im.cpp",
     "aten/src/ATen/native/PadNd.cpp",
     "aten/src/ATen/native/Convolution.cpp",
diff --git a/c10/BUILD.bazel b/c10/BUILD.bazel
index 8627bd736a8d..3504451fc8df 100644
--- a/c10/BUILD.bazel
+++ b/c10/BUILD.bazel
@@ -40,7 +40,7 @@ cc_library(
     deps = [
         "//c10/core:alignment",
         "//c10/cuda:Macros",
-        "//c10/macros",
+        "//c10/macros:macros",
     ] + select({
         ":using_gflags": ["@com_github_gflags_gflags//:gflags"],
         "//conditions:default": [],
diff --git a/c10/build.bzl b/c10/build.bzl
index 21107eb8b992..6a0920687113 100644
--- a/c10/build.bzl
+++ b/c10/build.bzl
@@ -7,7 +7,7 @@ def define_targets(rules):
             "//c10/core:alignment",
             "//c10/core:alloc_cpu",
             "//c10/core:base",
-            "//c10/macros",
+            "//c10/macros:macros",
             "//c10/mobile:CPUCachingAllocator",
             "//c10/mobile:CPUProfilingAllocator",
             "//c10/util:TypeCast",
@@ -15,7 +15,7 @@ def define_targets(rules):
             "//c10/util:typeid",
         ] + rules.if_cuda(
             [
-                "//c10/cuda",
+                "//c10/cuda:cuda",
                 "//c10/cuda:Macros",
             ],
             [],
diff --git a/c10/core/Allocator.cpp b/c10/core/Allocator.cpp
index 9879f05e64e4..dada5bb0eac4 100644
--- a/c10/core/Allocator.cpp
+++ b/c10/core/Allocator.cpp
@@ -46,8 +46,8 @@ bool memoryProfilingEnabled() {
 void reportMemoryUsageToProfiler(
     void* ptr,
     int64_t alloc_size,
-    int64_t total_allocated,
-    int64_t total_reserved,
+    size_t total_allocated,
+    size_t total_reserved,
     Device device) {
   auto* reporter_ptr = static_cast<MemoryReportingInfoBase*>(
       ThreadLocalDebugInfo::get(DebugInfoKind::PROFILER_STATE));
@@ -59,8 +59,8 @@ void reportMemoryUsageToProfiler(
 
 void reportOutOfMemoryToProfiler(
     int64_t alloc_size,
-    int64_t total_allocated,
-    int64_t total_reserved,
+    size_t total_allocated,
+    size_t total_reserved,
     Device device) {
   auto* reporter_ptr = static_cast<MemoryReportingInfoBase*>(
       ThreadLocalDebugInfo::get(DebugInfoKind::PROFILER_STATE));
@@ -74,8 +74,8 @@ MemoryReportingInfoBase::MemoryReportingInfoBase() = default;
 
 void MemoryReportingInfoBase::reportOutOfMemory(
     int64_t /*alloc_size*/,
-    int64_t /*total_allocated*/,
-    int64_t /*total_reserved*/,
+    size_t /*total_allocated*/,
+    size_t /*total_reserved*/,
     Device /*device*/) {}
 
 } // namespace c10
diff --git a/c10/core/Allocator.h b/c10/core/Allocator.h
index 663471de5d0e..1fe60817f8e2 100644
--- a/c10/core/Allocator.h
+++ b/c10/core/Allocator.h
@@ -239,14 +239,14 @@ struct C10_API MemoryReportingInfoBase : public c10::DebugInfoBase {
   virtual void reportMemoryUsage(
       void* ptr,
       int64_t alloc_size,
-      int64_t total_allocated,
-      int64_t total_reserved,
+      size_t total_allocated,
+      size_t total_reserved,
       Device device) = 0;
 
   virtual void reportOutOfMemory(
       int64_t alloc_size,
-      int64_t total_allocated,
-      int64_t total_reserved,
+      size_t total_allocated,
+      size_t total_reserved,
       Device device);
 
   virtual bool memoryProfilingEnabled() const = 0;
@@ -256,14 +256,14 @@ C10_API bool memoryProfilingEnabled();
 C10_API void reportMemoryUsageToProfiler(
     void* ptr,
     int64_t alloc_size,
-    int64_t total_allocated,
-    int64_t total_reserved,
+    size_t total_allocated,
+    size_t total_reserved,
     Device device);
 
 C10_API void reportOutOfMemoryToProfiler(
     int64_t alloc_size,
-    int64_t total_allocated,
-    int64_t total_reserved,
+    size_t total_allocated,
+    size_t total_reserved,
     Device device);
 
 } // namespace c10
diff --git a/c10/core/AutogradState.h b/c10/core/AutogradState.h
index cf821ec030e1..6b51c09cbf3d 100644
--- a/c10/core/AutogradState.h
+++ b/c10/core/AutogradState.h
@@ -1,8 +1,6 @@
 #pragma once
 
-#include <c10/macros/Macros.h>
-
-#include <cstdint>
+#include <c10/macros/Export.h>
 
 namespace c10 {
 
@@ -38,6 +36,10 @@ struct C10_API AutogradState {
     mulithreading_enabled_ = mulithreading_enabled;
   }
 
+  void set_view_replay_enabled(bool view_replay_enabled) {
+    view_replay_enabled_ = view_replay_enabled;
+  }
+
   bool get_grad_mode() const {
     return grad_mode_;
   }
@@ -54,11 +56,16 @@ struct C10_API AutogradState {
     return mulithreading_enabled_;
   }
 
+  bool get_view_replay_enabled() const {
+    return view_replay_enabled_;
+  }
+
  private:
   bool grad_mode_ : 1;
   bool inference_mode_ : 1;
   bool fw_grad_mode_ : 1;
   bool mulithreading_enabled_ : 1;
+  bool view_replay_enabled_ : 1;
 };
 
 } // namespace c10
diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp
index 60b76edb9c7f..c103c42a2829 100644
--- a/c10/core/CPUAllocator.cpp
+++ b/c10/core/CPUAllocator.cpp
@@ -5,6 +5,7 @@
 #include <c10/core/impl/alloc_cpu.h>
 #include <c10/mobile/CPUCachingAllocator.h>
 #include <c10/mobile/CPUProfilingAllocator.h>
+#include <c10/util/Logging.h>
 
 // TODO: rename flag to C10
 C10_DEFINE_bool(
@@ -71,7 +72,6 @@ template <uint32_t PreGuardBytes, uint32_t PostGuardBytes>
 class DefaultMobileCPUAllocator final : public at::Allocator {
  public:
   DefaultMobileCPUAllocator() = default;
-  // NOLINTNEXTLINE(modernize-use-override)
   ~DefaultMobileCPUAllocator() override = default;
 
   static void deleter(void* const pointer) {
@@ -208,7 +208,11 @@ void ProfiledCPUMemoryReporter::New(void* ptr, size_t nbytes) {
   }
   if (profile_memory) {
     reportMemoryUsageToProfiler(
-        ptr, nbytes, allocated, 0, c10::Device(c10::DeviceType::CPU));
+        ptr,
+        static_cast<int64_t>(nbytes),
+        allocated,
+        0,
+        c10::Device(c10::DeviceType::CPU));
   }
 }
 
@@ -243,7 +247,11 @@ void ProfiledCPUMemoryReporter::Delete(void* ptr) {
   }
   if (profile_memory) {
     reportMemoryUsageToProfiler(
-        ptr, -nbytes, allocated, 0, c10::Device(c10::DeviceType::CPU));
+        ptr,
+        -static_cast<int64_t>(nbytes),
+        allocated,
+        0,
+        c10::Device(c10::DeviceType::CPU));
   }
 }
 
@@ -265,7 +273,7 @@ void ProfiledCPUMemoryReporter::OutOfMemory(size_t nbytes) {
   if (profile_memory) {
     reportOutOfMemoryToProfiler(
         static_cast<int64_t>(nbytes),
-        static_cast<int64_t>(allocated),
+        allocated,
         0,
         c10::Device(c10::DeviceType::CPU));
   }
diff --git a/c10/core/CPUAllocator.h b/c10/core/CPUAllocator.h
index da56a5b222a8..14fe876008d0 100644
--- a/c10/core/CPUAllocator.h
+++ b/c10/core/CPUAllocator.h
@@ -1,11 +1,11 @@
 #pragma once
 
 #include <cstring>
+#include <mutex>
 #include <unordered_map>
 
 #include <c10/core/Allocator.h>
-#include <c10/core/alignment.h> // legacy, update dependents to include this directly
-#include <c10/util/Logging.h>
+#include <c10/util/Flags.h>
 
 // TODO: rename to c10
 C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp
index 86ebdf24ec94..d02eb5e94b89 100644
--- a/c10/core/Device.cpp
+++ b/c10/core/Device.cpp
@@ -8,7 +8,6 @@
 #include <exception>
 #include <ostream>
 #include <string>
-#include <tuple>
 #include <vector>
 
 namespace c10 {
@@ -130,7 +129,7 @@ Device::Device(const std::string& device_string) : Device(Type::CPU) {
 
   try {
     if (!device_index_str.empty()) {
-      index_ = c10::stoi(device_index_str);
+      index_ = static_cast<c10::DeviceIndex>(c10::stoi(device_index_str));
     }
   } catch (const std::exception&) {
     TORCH_CHECK(
diff --git a/c10/core/DeviceType.cpp b/c10/core/DeviceType.cpp
index d4e80ed14df1..efc33be399af 100644
--- a/c10/core/DeviceType.cpp
+++ b/c10/core/DeviceType.cpp
@@ -1,8 +1,6 @@
 #include <c10/core/DeviceType.h>
 #include <c10/util/Exception.h>
-#include <c10/util/Optional.h>
 #include <atomic>
-#include <memory>
 #include <mutex>
 
 namespace c10 {
diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp
index 69966084c6c4..91a606b07a21 100644
--- a/c10/core/DispatchKey.cpp
+++ b/c10/core/DispatchKey.cpp
@@ -311,6 +311,7 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
       {"NestedTensor", c10::DispatchKey::NestedTensor},
       {"NestedTensorCPU", c10::DispatchKey::NestedTensorCPU},
       {"NestedTensorCUDA", c10::DispatchKey::NestedTensorCUDA},
+      {"NestedTensorMeta", c10::DispatchKey::NestedTensorMeta},
       {"PrivateUse1", c10::DispatchKey::PrivateUse1},
       {"PrivateUse2", c10::DispatchKey::PrivateUse2},
       {"PrivateUse3", c10::DispatchKey::PrivateUse3},
diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h
index d9361de1e52f..abc4ab7e9852 100644
--- a/c10/core/DispatchKey.h
+++ b/c10/core/DispatchKey.h
@@ -2,11 +2,8 @@
 
 #include <c10/core/DeviceType.h>
 #include <c10/macros/Macros.h>
-#include <c10/util/ArrayRef.h>
-#include <c10/util/Exception.h>
 #include <ostream>
 #include <string>
-#include <vector>
 
 namespace c10 {
 
@@ -27,6 +24,10 @@ namespace c10 {
 // make sure you update PrivateUse3Bit.  (But you shouldn't: private use
 // keys should have higher precedence than all built-in keys)
 
+// If you add a new (non-privateuse) backend here,
+// make sure to add an Autograd<Backend> fallthrough kernel
+// in aten/src/ATen/core/VariableFallbackKernel.cpp
+
 #define C10_FORALL_BACKEND_COMPONENTS(_, extra) \
   _(CPU, extra)                                 \
   _(CUDA, extra)                                \
diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h
index 90da13a59a26..df9ac27919e1 100644
--- a/c10/core/DispatchKeySet.h
+++ b/c10/core/DispatchKeySet.h
@@ -157,7 +157,7 @@ class DispatchKeySet final {
 
   // NB: default constructor representation as zero is MANDATORY as
   // use of DispatchKeySet in TLS requires this.
-  constexpr DispatchKeySet() : repr_(0) {}
+  constexpr DispatchKeySet() = default;
 
   constexpr DispatchKeySet(Full)
       : repr_((1ULL << (num_backends + num_functionality_keys - 1)) - 1) {}
diff --git a/c10/core/GeneratorImpl.cpp b/c10/core/GeneratorImpl.cpp
index e2876bf9a1cf..dfac912ac4ee 100644
--- a/c10/core/GeneratorImpl.cpp
+++ b/c10/core/GeneratorImpl.cpp
@@ -1,5 +1,4 @@
 #include <c10/core/GeneratorImpl.h>
-#include <chrono>
 #include <random>
 
 #if defined(__SGX_ENABLED__)
@@ -9,6 +8,7 @@
 #ifndef _WIN32
 #include <fcntl.h>
 #include <unistd.h>
+#include <chrono>
 #endif
 
 namespace c10 {
@@ -26,6 +26,7 @@ GeneratorImpl::GeneratorImpl(Device device_in, DispatchKeySet key_set)
 c10::intrusive_ptr<GeneratorImpl> GeneratorImpl::clone() const {
   auto res = this->clone_impl();
   c10::raw::intrusive_ptr::incref(res);
+  c10::raw::weak_intrusive_ptr::incref(res);
   return c10::intrusive_ptr<GeneratorImpl>::reclaim(res);
 }
 
@@ -46,14 +47,13 @@ namespace detail {
 #if !defined(_WIN32)
 static uint64_t readURandomLong() {
   int randDev = open("/dev/urandom", O_RDONLY);
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  uint64_t randValue;
   TORCH_CHECK(randDev >= 0, "Unable to open /dev/urandom");
+  uint64_t randValue{};
   ssize_t readBytes = read(randDev, &randValue, sizeof(randValue));
+  close(randDev);
   TORCH_CHECK(
       readBytes >= (ssize_t)sizeof(randValue),
       "Unable to read from /dev/urandom");
-  close(randDev);
   return randValue;
 }
 #endif // _WIN32
diff --git a/c10/core/GeneratorImpl.h b/c10/core/GeneratorImpl.h
index 389bd6271403..0b2b2a87eae0 100644
--- a/c10/core/GeneratorImpl.h
+++ b/c10/core/GeneratorImpl.h
@@ -1,17 +1,12 @@
 #pragma once
 
 #include <stdint.h>
-#include <atomic>
-#include <deque>
 #include <mutex>
-#include <typeinfo>
-#include <utility>
 
 #include <c10/core/Device.h>
 #include <c10/core/DispatchKeySet.h>
 #include <c10/core/TensorImpl.h>
-#include <c10/util/C++17.h>
-#include <c10/util/Exception.h>
+#include <c10/macros/Export.h>
 #include <c10/util/intrusive_ptr.h>
 #include <c10/util/python_stub.h>
 
@@ -67,7 +62,7 @@ struct C10_API GeneratorImpl : public c10::intrusive_ptr_target {
   GeneratorImpl(GeneratorImpl&& other) = delete;
   GeneratorImpl& operator=(const GeneratorImpl& other) = delete;
 
-  virtual ~GeneratorImpl() = default;
+  ~GeneratorImpl() override = default;
   c10::intrusive_ptr<GeneratorImpl> clone() const;
 
   // Common methods for all generators
@@ -103,7 +98,7 @@ struct C10_API GeneratorImpl : public c10::intrusive_ptr_target {
 
 namespace detail {
 
-TORCH_API uint64_t getNonDeterministicRandom(bool is_cuda = false);
+C10_API uint64_t getNonDeterministicRandom(bool is_cuda = false);
 
 } // namespace detail
 
diff --git a/c10/core/GradMode.cpp b/c10/core/GradMode.cpp
index c2ea8698732d..d4eb08829e92 100644
--- a/c10/core/GradMode.cpp
+++ b/c10/core/GradMode.cpp
@@ -1,7 +1,5 @@
 #include <c10/core/GradMode.h>
 
-#include <stdexcept>
-
 namespace c10 {
 
 bool GradMode::is_enabled() {
diff --git a/c10/core/GradMode.h b/c10/core/GradMode.h
index d83ff6d0d0d3..e98e2ec9354e 100644
--- a/c10/core/GradMode.h
+++ b/c10/core/GradMode.h
@@ -5,14 +5,14 @@
 
 namespace c10 {
 
-struct TORCH_API GradMode {
+struct C10_API GradMode {
   static bool is_enabled();
   static void set_enabled(bool enabled);
 };
 
 // A RAII, thread local (!) guard that enables or disables grad mode upon
 // construction, and sets it back to the original value upon destruction.
-struct TORCH_API AutoGradMode {
+struct C10_API AutoGradMode {
   AutoGradMode(bool enabled) : prev_mode(GradMode::is_enabled()) {
     GradMode::set_enabled(enabled);
   }
@@ -24,13 +24,13 @@ struct TORCH_API AutoGradMode {
 
 // A RAII, thread local (!) guard that stops future operations from building
 // gradients.
-struct TORCH_API NoGradGuard : public AutoGradMode {
+struct C10_API NoGradGuard : public AutoGradMode {
   NoGradGuard() : AutoGradMode(/*enabled=*/false) {}
 };
 
 // A RAII, thread local (!) guard that enables or disables forward grad mode
 // upon construction, and sets it back to the original value upon destruction.
-struct TORCH_API AutoFwGradMode {
+struct C10_API AutoFwGradMode {
   AutoFwGradMode(bool enabled)
       : prev_mode(AutogradState::get_tls_state().get_fw_grad_mode()) {
     AutogradState::get_tls_state().set_fw_grad_mode(enabled);
diff --git a/c10/core/InferenceMode.cpp b/c10/core/InferenceMode.cpp
index 59eca760cf50..fafb14c426be 100644
--- a/c10/core/InferenceMode.cpp
+++ b/c10/core/InferenceMode.cpp
@@ -1,5 +1,4 @@
 #include <c10/core/InferenceMode.h>
-#include <stdexcept>
 
 namespace c10 {
 // Invariant:
diff --git a/c10/core/InferenceMode.h b/c10/core/InferenceMode.h
index fd93e5ba8c56..b0979b58e5b8 100644
--- a/c10/core/InferenceMode.h
+++ b/c10/core/InferenceMode.h
@@ -9,7 +9,7 @@ namespace c10 {
 
 // A RAII, thread local (!) guard that enables or disables inference mode upon
 // construction, and sets it back to the original value upon destruction.
-struct TORCH_API InferenceMode {
+struct C10_API InferenceMode {
   // Note [Expected TLS state in InferenceMode]:
   //   InferenceMode: ADInplaceOrView not in
   //   raw_local_dispatch_key_set.included(),
diff --git a/c10/core/SafePyObject.cpp b/c10/core/SafePyObject.cpp
index 09c20e24df11..b9c4c4bd2b21 100644
--- a/c10/core/SafePyObject.cpp
+++ b/c10/core/SafePyObject.cpp
@@ -1,5 +1,4 @@
 #include <c10/core/SafePyObject.h>
-#include <c10/core/TensorImpl.h>
 
 namespace c10 {
 
diff --git a/c10/core/Stream.h b/c10/core/Stream.h
index c5bd253f353e..732ac651b762 100644
--- a/c10/core/Stream.h
+++ b/c10/core/Stream.h
@@ -15,9 +15,9 @@ namespace c10 {
 using StreamId = int64_t;
 
 struct C10_API StreamData3 {
-  int64_t stream_id;
-  int64_t device_index;
-  int64_t device_type;
+  StreamId stream_id;
+  DeviceIndex device_index;
+  DeviceType device_type;
 };
 
 // NB: I decided not to call the above StreamIndex to avoid confusion with
@@ -130,9 +130,8 @@ class C10_API Stream final {
   // implementation detail and should not be relied upon.
   uint64_t hash() const noexcept {
     // Concat these together into a 64-bit integer
-    uint64_t bits = static_cast<uint64_t>(static_cast<uint8_t>(device_type()))
-            << 56 |
-        static_cast<uint64_t>(static_cast<uint8_t>(device_index())) << 48 |
+    uint64_t bits = static_cast<uint64_t>(device_type()) << 56 |
+        static_cast<uint64_t>(device_index()) << 48 |
         // Remove the sign extension part of the 64-bit address because
         // the id might be used to hold a pointer.
         (static_cast<uint64_t>(id()) & ((1ull << 48) - 1));
@@ -140,22 +139,15 @@ class C10_API Stream final {
   }
 
   struct StreamData3 pack3() const {
-    StreamData3 data;
-    data.stream_id = static_cast<int64_t>(id());
-    data.device_index = static_cast<int64_t>(device_index());
-    data.device_type = static_cast<int64_t>(device_type());
-    return data;
+    return {id(), device_index(), device_type()};
   }
 
   static Stream unpack3(
-      int64_t stream_id,
-      int64_t device_index,
-      int64_t device_type) {
-    const auto _stream_id = static_cast<StreamId>(stream_id);
-    const auto _device_index = static_cast<DeviceIndex>(device_index);
-    const auto _device_type = static_cast<DeviceType>(device_type);
-    TORCH_CHECK(isValidDeviceType(_device_type));
-    return Stream(UNSAFE, Device(_device_type, _device_index), _stream_id);
+      StreamId stream_id,
+      DeviceIndex device_index,
+      DeviceType device_type) {
+    TORCH_CHECK(isValidDeviceType(device_type));
+    return Stream(UNSAFE, Device(device_type, device_index), stream_id);
   }
 
   // I decided NOT to provide setters on this class, because really,
diff --git a/c10/core/SymBool.cpp b/c10/core/SymBool.cpp
index c41cffb06135..1f88f840b6ab 100644
--- a/c10/core/SymBool.cpp
+++ b/c10/core/SymBool.cpp
@@ -10,6 +10,14 @@ SymNode SymBool::toSymNodeImpl() const {
   return SymNode::reclaim_copy(toSymNodeImplUnowned());
 }
 
+SymNode SymBool::wrap_node(const SymNode& base) const {
+  if (is_symbolic()) {
+    return toSymNodeImpl();
+  } else {
+    return base->wrap_bool(as_bool_unchecked());
+  }
+}
+
 static std::array<SymNode, 2> normalize_symbools(
     const SymBool& a_,
     const SymBool& b_) {
@@ -69,4 +77,11 @@ bool SymBool::guard_bool(const char* file, int64_t line) const {
   return a->guard_bool(file, line);
 }
 
+bool SymBool::has_hint() const {
+  if (!is_symbolic()) {
+    return true;
+  }
+  return toSymNodeImpl()->has_hint();
+}
+
 } // namespace c10
diff --git a/c10/core/SymBool.h b/c10/core/SymBool.h
index de2d7c2f2825..e0f458dfb2a4 100644
--- a/c10/core/SymBool.h
+++ b/c10/core/SymBool.h
@@ -5,9 +5,6 @@
 #include <c10/util/Exception.h>
 #include <c10/util/intrusive_ptr.h>
 
-#include <limits>
-#include <memory>
-
 namespace c10 {
 
 class C10_API SymBool {
@@ -26,8 +23,12 @@ class C10_API SymBool {
     return std::move(ptr_).release();
   }
 
+  // Only valid if is_symbolic()
   SymNode toSymNodeImpl() const;
 
+  // Guaranteed to return a SymNode, wrapping using base if necessary
+  SymNode wrap_node(const SymNode& base) const;
+
   bool expect_bool() const {
     TORCH_CHECK(!is_symbolic());
     return data_;
@@ -52,6 +53,8 @@ class C10_API SymBool {
   // bool, so it's not so common to have to call this
   bool guard_bool(const char* file, int64_t line) const;
 
+  bool has_hint() const;
+
   C10_ALWAYS_INLINE bool is_symbolic() const {
     return ptr_;
   }
diff --git a/c10/core/SymFloat.cpp b/c10/core/SymFloat.cpp
index 161313c777dd..267f894c23ad 100644
--- a/c10/core/SymFloat.cpp
+++ b/c10/core/SymFloat.cpp
@@ -11,6 +11,14 @@ SymNode SymFloat::toSymNodeImpl() const {
   return SymNode::reclaim_copy(toSymNodeImplUnowned());
 }
 
+SymNode SymFloat::wrap_node(const SymNode& base) const {
+  if (is_symbolic()) {
+    return toSymNodeImpl();
+  } else {
+    return base->wrap_float(as_float_unchecked());
+  }
+}
+
 static std::array<SymNode, 2> normalize_symfloats(
     const SymFloat& a_,
     const SymFloat& b_) {
@@ -62,6 +70,69 @@ SymFloat SymFloat::operator/(const SymFloat& sci) const {
   return SymFloat(res[0]->truediv(res[1]));
 }
 
+SymBool SymFloat::sym_eq(const SymFloat& sci) const {
+  if (!is_symbolic() && !sci.is_symbolic()) {
+    return data_ == sci.data_;
+  }
+  auto res = normalize_symfloats(*this, sci);
+  return res[0]->eq(res[1]);
+}
+
+SymBool SymFloat::sym_ne(const SymFloat& sci) const {
+  if (!is_symbolic() && !sci.is_symbolic()) {
+    return data_ != sci.data_;
+  }
+  auto res = normalize_symfloats(*this, sci);
+  return res[0]->ne(res[1]);
+}
+
+SymBool SymFloat::sym_lt(const SymFloat& sci) const {
+  if (!is_symbolic() && !sci.is_symbolic()) {
+    return data_ < sci.data_;
+  }
+  auto res = normalize_symfloats(*this, sci);
+  return res[0]->lt(res[1]);
+}
+
+SymBool SymFloat::sym_le(const SymFloat& sci) const {
+  if (!is_symbolic() && !sci.is_symbolic()) {
+    return data_ <= sci.data_;
+  }
+  auto res = normalize_symfloats(*this, sci);
+  return res[0]->le(res[1]);
+}
+
+SymBool SymFloat::sym_gt(const SymFloat& sci) const {
+  if (!is_symbolic() && !sci.is_symbolic()) {
+    return data_ > sci.data_;
+  }
+  auto res = normalize_symfloats(*this, sci);
+  return res[0]->gt(res[1]);
+}
+
+SymBool SymFloat::sym_ge(const SymFloat& sci) const {
+  if (!is_symbolic() && !sci.is_symbolic()) {
+    return data_ >= sci.data_;
+  }
+  auto res = normalize_symfloats(*this, sci);
+  return res[0]->ge(res[1]);
+}
+
+SymFloat SymFloat::min(const SymFloat& sci) const {
+  if (!is_symbolic() && !sci.is_symbolic()) {
+    return std::min(data_, sci.data_);
+  }
+  auto res = normalize_symfloats(*this, sci);
+  return SymFloat(res[0]->sym_min(res[1]));
+}
+SymFloat SymFloat::max(const SymFloat& sci) const {
+  if (!is_symbolic() && !sci.is_symbolic()) {
+    return std::max(data_, sci.data_);
+  }
+  auto res = normalize_symfloats(*this, sci);
+  return SymFloat(res[0]->sym_max(res[1]));
+}
+
 std::ostream& operator<<(std::ostream& os, const SymFloat& s) {
   if (s.is_symbolic()) {
     os << s.toSymNodeImpl()->str();
@@ -88,4 +159,11 @@ double SymFloat::guard_float(const char* file, int64_t line) const {
   return a->guard_float(file, line);
 }
 
+bool SymFloat::has_hint() const {
+  if (!is_symbolic()) {
+    return true;
+  }
+  return toSymNodeImpl()->has_hint();
+}
+
 } // namespace c10
diff --git a/c10/core/SymFloat.h b/c10/core/SymFloat.h
index 50512dc6fb20..3275d1e2ab1b 100644
--- a/c10/core/SymFloat.h
+++ b/c10/core/SymFloat.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <c10/core/SymBool.h>
 #include <c10/core/SymNodeImpl.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/Exception.h>
@@ -28,8 +29,12 @@ class C10_API SymFloat {
     return std::move(ptr_).release();
   }
 
+  // Only valid if is_symbolic()
   SymNode toSymNodeImpl() const;
 
+  // Guaranteed to return a SymNode, wrapping using base if necessary
+  SymNode wrap_node(const SymNode& base) const;
+
   double expect_float() const {
     TORCH_CHECK(!is_symbolic());
     return data_;
@@ -40,6 +45,35 @@ class C10_API SymFloat {
   SymFloat operator*(const SymFloat&) const;
   SymFloat operator/(const SymFloat&) const;
 
+  SymBool sym_eq(const SymFloat&) const;
+  SymBool sym_ne(const SymFloat&) const;
+  SymBool sym_lt(const SymFloat&) const;
+  SymBool sym_le(const SymFloat&) const;
+  SymBool sym_gt(const SymFloat&) const;
+  SymBool sym_ge(const SymFloat&) const;
+
+  bool operator==(const SymFloat& o) const {
+    return sym_eq(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator!=(const SymFloat& o) const {
+    return sym_ne(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator<(const SymFloat& o) const {
+    return sym_lt(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator<=(const SymFloat& o) const {
+    return sym_le(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator>(const SymFloat& o) const {
+    return sym_gt(o).guard_bool(__FILE__, __LINE__);
+  }
+  bool operator>=(const SymFloat& o) const {
+    return sym_ge(o).guard_bool(__FILE__, __LINE__);
+  }
+
+  SymFloat min(const SymFloat& sci) const;
+  SymFloat max(const SymFloat& sci) const;
+
   // Need guidance on where to put this code
   SymFloat sqrt() const;
 
@@ -53,6 +87,8 @@ class C10_API SymFloat {
   // number can be used to diagnose overspecialization.
   double guard_float(const char* file, int64_t line) const;
 
+  bool has_hint() const;
+
   // N.B. It's important to keep this definition in the header
   // as we expect if checks to be folded for mobile builds
   // where `is_symbolic` is always false
diff --git a/c10/core/SymInt.cpp b/c10/core/SymInt.cpp
index faa0d650b038..24066bb111a5 100644
--- a/c10/core/SymInt.cpp
+++ b/c10/core/SymInt.cpp
@@ -31,6 +31,14 @@ SymNode SymInt::toSymNodeImpl() const {
   return SymNode::reclaim_copy(toSymNodeImplUnowned());
 }
 
+SymNode SymInt::wrap_node(const SymNode& base) const {
+  if (is_symbolic()) {
+    return toSymNodeImpl();
+  } else {
+    return base->wrap_int(as_int_unchecked());
+  }
+}
+
 SymInt::SymInt(SymNode sin_sp) {
   TORCH_CHECK(sin_sp->is_int());
   auto ptr = static_cast<uint64_t>(
@@ -47,6 +55,13 @@ int64_t SymInt::guard_int(const char* file, int64_t line) const {
   return a->guard_int(file, line);
 }
 
+bool SymInt::has_hint() const {
+  if (!is_symbolic()) {
+    return true;
+  }
+  return toSymNodeImpl()->has_hint();
+}
+
 SymInt::operator SymFloat() const {
   if (!is_symbolic()) {
     return SymFloat(double(data_));
diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h
index ca3e718f8c02..40d504be5788 100644
--- a/c10/core/SymInt.h
+++ b/c10/core/SymInt.h
@@ -4,11 +4,8 @@
 #include <c10/core/SymNodeImpl.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/Exception.h>
-#include <c10/util/intrusive_ptr.h>
 
-#include <memory>
 #include <numeric>
-#include <utility>
 
 namespace c10 {
 
@@ -116,8 +113,12 @@ class C10_API SymInt {
 #endif
   }
 
+  // Only valid if is_symbolic()
   SymNode toSymNodeImpl() const;
 
+  // Guaranteed to return a SymNode, wrapping using base if necessary
+  SymNode wrap_node(const SymNode& base) const;
+
   ~SymInt() {
     release_();
   }
@@ -131,6 +132,11 @@ class C10_API SymInt {
     return data_;
   }
 
+  // Test if we have a hint for this int (e.g., guard_int would work).
+  // Most of the time this is true; it is only false when you have
+  // an unbacked SymInt.
+  bool has_hint() const;
+
   // Insert a guard for the int to be its concrete value, and then return
   // that value.  This operation always works, even if the int is symbolic,
   // so long as we know what the underlying value is (e.g., this won't work
diff --git a/c10/core/SymIntArrayRef.h b/c10/core/SymIntArrayRef.h
index 8b89e93641c0..c86d5ebb74c7 100644
--- a/c10/core/SymIntArrayRef.h
+++ b/c10/core/SymIntArrayRef.h
@@ -5,11 +5,6 @@
 #include <c10/util/Exception.h>
 #include <c10/util/Optional.h>
 
-#include <array>
-#include <initializer_list>
-#include <iterator>
-#include <vector>
-
 namespace c10 {
 using SymIntArrayRef = ArrayRef<SymInt>;
 
diff --git a/c10/core/SymNodeImpl.h b/c10/core/SymNodeImpl.h
index c87ed6c75a7f..f4e14994031e 100644
--- a/c10/core/SymNodeImpl.h
+++ b/c10/core/SymNodeImpl.h
@@ -5,17 +5,18 @@
 #include <c10/util/Exception.h>
 #include <c10/util/intrusive_ptr.h>
 #include <memory>
-#include <mutex>
-#include <vector>
 
 namespace c10 {
 
 class SymNodeImpl;
 using SymNode = c10::intrusive_ptr<SymNodeImpl>;
 
+// When you add a method, you also need to edit
+// torch/csrc/jit/python/init.cpp
+// torch/csrc/utils/python_symnode.h
 class C10_API SymNodeImpl : public c10::intrusive_ptr_target {
  public:
-  virtual ~SymNodeImpl() = default;
+  ~SymNodeImpl() override = default;
 
   template <typename T>
   c10::intrusive_ptr<T> dyn_cast() const {
@@ -96,6 +97,31 @@ class C10_API SymNodeImpl : public c10::intrusive_ptr_target {
     TORCH_CHECK(false, "NYI");
   };
   // NB: self is ignored here, only the arguments are used
+  virtual SymNode is_contiguous(
+      ArrayRef<SymNode> sizes,
+      ArrayRef<SymNode> strides) {
+    TORCH_CHECK(false, "NYI");
+  };
+  virtual SymNode is_channels_last_contiguous_2d(
+      ArrayRef<SymNode> sizes,
+      ArrayRef<SymNode> strides) {
+    TORCH_CHECK(false, "NYI");
+  };
+  virtual SymNode is_channels_last_contiguous_3d(
+      ArrayRef<SymNode> sizes,
+      ArrayRef<SymNode> strides) {
+    TORCH_CHECK(false, "NYI");
+  };
+  virtual SymNode is_channels_last_strides_2d(
+      ArrayRef<SymNode> sizes,
+      ArrayRef<SymNode> strides) {
+    TORCH_CHECK(false, "NYI");
+  };
+  virtual SymNode is_channels_last_strides_3d(
+      ArrayRef<SymNode> sizes,
+      ArrayRef<SymNode> strides) {
+    TORCH_CHECK(false, "NYI");
+  };
   virtual SymNode is_non_overlapping_and_dense(
       ArrayRef<SymNode> sizes,
       ArrayRef<SymNode> strides) {
@@ -131,6 +157,9 @@ class C10_API SymNodeImpl : public c10::intrusive_ptr_target {
   virtual bool bool_() {
     TORCH_CHECK(false, "NYI");
   };
+  virtual bool has_hint() {
+    TORCH_CHECK(false, "NYI");
+  };
   virtual std::string str() {
     TORCH_CHECK(false, "NYI");
   };
diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp
index 743e80f8eeb7..2c1324036e59 100644
--- a/c10/core/TensorImpl.cpp
+++ b/c10/core/TensorImpl.cpp
@@ -111,7 +111,7 @@ TensorImpl::TensorImpl(
     DispatchKeySet key_set,
     const caffe2::TypeMeta data_type)
     : storage_(std::move(storage)),
-      storage_offset_(0),
+
       numel_(0),
       data_type_(data_type),
       device_opt_(storage_.device()),
@@ -123,6 +123,7 @@ TensorImpl::TensorImpl(
   }
 }
 
+// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
 TensorImpl::TensorImpl(
     DispatchKeySet key_set,
     const caffe2::TypeMeta data_type,
@@ -136,7 +137,7 @@ TensorImpl::TensorImpl(
     const caffe2::TypeMeta data_type,
     c10::optional<c10::Device> device_opt)
     : storage_(std::move(storage)),
-      storage_offset_(0),
+
       numel_(0),
       data_type_(data_type),
       device_opt_(device_opt) {
@@ -224,6 +225,51 @@ void TensorImpl::HandleResize() {
   }
 }
 
+// base, sizes, strides
+static c10::optional<
+    std::tuple<SymNode, std::vector<SymNode>, std::vector<SymNode>>>
+normalize_sym_sizes_strides(SymIntArrayRef sizes, SymIntArrayRef strides) {
+  // Look for a SymNode to dispatch on
+  SymNode base;
+  bool all_hinted = true;
+  for (const auto& s : sizes) {
+    if (all_hinted && !s.has_hint()) {
+      all_hinted = false;
+    }
+    if (!base && s.is_symbolic()) {
+      base = s.toSymNodeImpl();
+    }
+  }
+  for (const auto& s : strides) {
+    if (all_hinted && !s.has_hint()) {
+      all_hinted = false;
+    }
+    if (!base && s.is_symbolic()) {
+      base = s.toSymNodeImpl();
+    }
+  }
+  if (!base || all_hinted) {
+    // Couldn't find.  Tell the caller to do the normal computation
+    // Alternately, if everything is hinted, we want the normal computation
+    // too
+    return c10::nullopt;
+  }
+  // Populate the SymNode array
+  std::vector<SymNode> size_nodes;
+  std::vector<SymNode> stride_nodes;
+  size_nodes.reserve(sizes.size());
+  stride_nodes.reserve(strides.size());
+  for (const auto& s : sizes) {
+    size_nodes.emplace_back(s.wrap_node(base));
+  }
+  for (const auto& s : strides) {
+    stride_nodes.emplace_back(s.wrap_node(base));
+  }
+  return c10::make_optional(
+      std::tuple<SymNode, std::vector<SymNode>, std::vector<SymNode>>(
+          std::move(base), std::move(size_nodes), std::move(stride_nodes)));
+}
+
 template <typename T>
 bool _compute_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides, T numel) {
   bool is_contiguous = true;
@@ -255,14 +301,6 @@ bool TensorImpl::compute_contiguous(identity<bool>) const {
       numel_);
 }
 
-SymBool TensorImpl::compute_contiguous(identity<SymBool>) const {
-  if (is_sparse()) {
-    return false;
-  }
-  return _compute_contiguous<c10::SymInt>(
-      extra_meta_->sizes_, extra_meta_->strides_, extra_meta_->numel_);
-}
-
 template <typename T>
 bool _compute_channels_last_contiguous_2d(
     ArrayRef<T> sizes,
@@ -301,15 +339,6 @@ bool TensorImpl::compute_channels_last_contiguous_2d(identity<bool>) const {
       sizes_and_strides_.strides_arrayref());
 }
 
-SymBool TensorImpl::compute_channels_last_contiguous_2d(
-    identity<SymBool>) const {
-  if (is_sparse()) {
-    return false;
-  }
-  return _compute_channels_last_contiguous_2d<c10::SymInt>(
-      extra_meta_->sizes_, extra_meta_->strides_);
-}
-
 template <typename T>
 bool _compute_channels_last_contiguous_3d(
     ArrayRef<T> sizes,
@@ -348,15 +377,6 @@ bool TensorImpl::compute_channels_last_contiguous_3d(identity<bool>) const {
       sizes_and_strides_.strides_arrayref());
 }
 
-SymBool TensorImpl::compute_channels_last_contiguous_3d(
-    identity<SymBool>) const {
-  if (is_sparse()) {
-    return false;
-  }
-  return _compute_channels_last_contiguous_3d<c10::SymInt>(
-      extra_meta_->sizes_, extra_meta_->strides_);
-}
-
 bool TensorImpl::compute_strides_like_channels_last_2d(identity<bool>) const {
   if (is_sparse()) {
     return false;
@@ -366,15 +386,6 @@ bool TensorImpl::compute_strides_like_channels_last_2d(identity<bool>) const {
       sizes_and_strides_.strides_arrayref());
 }
 
-SymBool TensorImpl::compute_strides_like_channels_last_2d(
-    identity<SymBool>) const {
-  if (is_sparse()) {
-    return false;
-  }
-  return is_channels_last_strides_2d<c10::SymInt>(
-      extra_meta_->sizes_, extra_meta_->strides_);
-}
-
 bool TensorImpl::compute_strides_like_channels_last_3d(identity<bool>) const {
   if (is_sparse()) {
     return false;
@@ -384,15 +395,6 @@ bool TensorImpl::compute_strides_like_channels_last_3d(identity<bool>) const {
       sizes_and_strides_.strides_arrayref());
 }
 
-SymBool TensorImpl::compute_strides_like_channels_last_3d(
-    identity<SymBool>) const {
-  if (is_sparse()) {
-    return false;
-  }
-  return is_channels_last_strides_3d<c10::SymInt>(
-      extra_meta_->sizes_, extra_meta_->strides_);
-}
-
 template <typename T>
 bool _compute_non_overlapping_and_dense(
     ArrayRef<T> sizes,
@@ -438,65 +440,128 @@ bool TensorImpl::compute_non_overlapping_and_dense(identity<bool>) const {
       sizes_and_strides_.strides_arrayref());
 }
 
-SymBool TensorImpl::compute_non_overlapping_and_dense(identity<SymBool>) const {
+// Special treatment because of numel
+SymBool TensorImpl::compute_contiguous(identity<SymBool>) const {
   if (is_sparse()) {
     return false;
   }
-  return _compute_non_overlapping_and_dense<c10::SymInt>(
-      extra_meta_->sizes_, extra_meta_->strides_);
+  SymIntArrayRef sizes = extra_meta_->sizes_;
+  SymIntArrayRef strides = extra_meta_->strides_;
+  auto n = normalize_sym_sizes_strides(sizes, strides);
+  if (n.has_value()) {
+    SymNode base;
+    std::vector<SymNode> size_nodes;
+    std::vector<SymNode> stride_nodes;
+    std::tie(base, size_nodes, stride_nodes) = *n;
+    return SymBool(base->is_contiguous(size_nodes, stride_nodes));
+  } else {
+    return _compute_contiguous(sizes, strides, extra_meta_->numel_);
+  }
 }
 
+// The rest of them
+#define DEFINE_SYMBOOL_COMPUTE(name, nodeimpl, fallback)        \
+  SymBool TensorImpl::name(identity<SymBool>) const {           \
+    if (is_sparse()) {                                          \
+      return false;                                             \
+    }                                                           \
+    SymIntArrayRef sizes = extra_meta_->sizes_;                 \
+    SymIntArrayRef strides = extra_meta_->strides_;             \
+    auto n = normalize_sym_sizes_strides(sizes, strides);       \
+    if (n.has_value()) {                                        \
+      SymNode base;                                             \
+      std::vector<SymNode> size_nodes;                          \
+      std::vector<SymNode> stride_nodes;                        \
+      std::tie(base, size_nodes, stride_nodes) = *n;            \
+      return SymBool(base->nodeimpl(size_nodes, stride_nodes)); \
+    } else {                                                    \
+      return fallback(sizes, strides);                          \
+    }                                                           \
+  }
+
+// clang-format off
+DEFINE_SYMBOOL_COMPUTE(compute_channels_last_contiguous_2d, is_channels_last_contiguous_2d, _compute_channels_last_contiguous_2d)
+DEFINE_SYMBOOL_COMPUTE(compute_channels_last_contiguous_3d, is_channels_last_contiguous_3d, _compute_channels_last_contiguous_3d)
+DEFINE_SYMBOOL_COMPUTE(compute_strides_like_channels_last_2d, is_channels_last_strides_2d, is_channels_last_strides_2d)
+DEFINE_SYMBOOL_COMPUTE(compute_strides_like_channels_last_3d, is_channels_last_strides_3d, is_channels_last_strides_3d)
+DEFINE_SYMBOOL_COMPUTE(compute_non_overlapping_and_dense, is_non_overlapping_and_dense, _compute_non_overlapping_and_dense)
+// clang-format on
+
+#undef DEFINE_SYMBOOL_COMPUTE
+
 // Glue compute
-// NB: intentionally not using bitwise operators.  Using bitwise operators
-// currently impedes ShapeEnv from getting crucial equalities which cause
+// NB: this logic very intentionally short circuits if possible.  Without
+// short circuiting, it causes
 // python test/functorch/test_aotdispatch.py -k
 // test_aot_autograd_symbolic_exhaustive_nn_functional_unfold_cpu_float32 to run
-// very slowly.  I think probably we just need to be able to reason through
-// And/Or, and then we can switch these to be symbolic.
+// very slowly.
+
+static bool definitely_true(SymBool b) {
+  return b.has_hint() && b.guard_bool(__FILE__, __LINE__);
+}
 
 SymBool TensorImpl::compute_is_non_overlapping_and_dense_dim4(
     identity<SymBool> type_id) {
-  return extra_meta_->is_contiguous_.guard_bool(__FILE__, __LINE__) ||
-      extra_meta_->is_channels_last_contiguous_.guard_bool(
-          __FILE__, __LINE__) ||
-      compute_non_overlapping_and_dense(type_id).guard_bool(__FILE__, __LINE__);
+  if (definitely_true(extra_meta_->is_contiguous_)) {
+    return true;
+  }
+  if (definitely_true(extra_meta_->is_channels_last_contiguous_)) {
+    return true;
+  }
+  return extra_meta_->is_contiguous_ |
+      extra_meta_->is_channels_last_contiguous_ |
+      compute_non_overlapping_and_dense(type_id);
 }
 
 SymBool TensorImpl::compute_channels_last_contiguous_3d_dim5(
     identity<SymBool> type_id) {
-  return !extra_meta_->is_channels_last_contiguous_.guard_bool(
-             __FILE__, __LINE__) &&
-      compute_channels_last_contiguous_3d(type_id).guard_bool(
-          __FILE__, __LINE__);
+  if (definitely_true(extra_meta_->is_channels_last_contiguous_)) {
+    return false;
+  }
+  return ~extra_meta_->is_channels_last_contiguous_ &
+      compute_channels_last_contiguous_3d(type_id);
 }
 
 SymBool TensorImpl::compute_channels_last_2d_dim5(identity<SymBool> type_id) {
-  return !extra_meta_->is_channels_last_3d_contiguous_.guard_bool(
-             __FILE__, __LINE__) &&
-      compute_strides_like_channels_last_2d(type_id).guard_bool(
-          __FILE__, __LINE__);
+  if (definitely_true(extra_meta_->is_channels_last_3d_contiguous_)) {
+    return false;
+  }
+  return ~extra_meta_->is_channels_last_3d_contiguous_ &
+      compute_strides_like_channels_last_2d(type_id);
 }
 
 SymBool TensorImpl::compute_channels_last_3d_dim5(identity<SymBool> type_id) {
-  return !extra_meta_->is_channels_last_.guard_bool(__FILE__, __LINE__) &&
-      compute_strides_like_channels_last_3d(type_id).guard_bool(
-          __FILE__, __LINE__);
+  if (definitely_true(extra_meta_->is_channels_last_)) {
+    return false;
+  }
+  return ~extra_meta_->is_channels_last_ &
+      compute_strides_like_channels_last_3d(type_id);
 }
 
 SymBool TensorImpl::compute_is_non_overlapping_and_dense_dim5(
     identity<SymBool> type_id) {
-  return extra_meta_->is_contiguous_.guard_bool(__FILE__, __LINE__) ||
-      extra_meta_->is_channels_last_contiguous_.guard_bool(
-          __FILE__, __LINE__) ||
-      extra_meta_->is_channels_last_3d_contiguous_.guard_bool(
-          __FILE__, __LINE__) ||
-      compute_non_overlapping_and_dense(type_id).guard_bool(__FILE__, __LINE__);
+  if (definitely_true(extra_meta_->is_contiguous_)) {
+    return true;
+  }
+  if (definitely_true(extra_meta_->is_channels_last_contiguous_)) {
+    return true;
+  }
+  if (definitely_true(extra_meta_->is_channels_last_3d_contiguous_)) {
+    return true;
+  }
+  return extra_meta_->is_contiguous_ |
+      extra_meta_->is_channels_last_contiguous_ |
+      extra_meta_->is_channels_last_3d_contiguous_ |
+      compute_non_overlapping_and_dense(type_id);
 }
 
 SymBool TensorImpl::compute_is_non_overlapping_and_dense_anydim(
     identity<SymBool> type_id) {
-  return extra_meta_->is_contiguous_.guard_bool(__FILE__, __LINE__) ||
-      compute_non_overlapping_and_dense(type_id).guard_bool(__FILE__, __LINE__);
+  if (definitely_true(extra_meta_->is_contiguous_)) {
+    return true;
+  }
+  return extra_meta_->is_contiguous_ |
+      compute_non_overlapping_and_dense(type_id);
 }
 
 void TensorImpl::release_resources() {
@@ -866,7 +931,8 @@ void TensorImpl::Extend(int64_t num, float growthPct) {
   newCapacity[0] = std::max(
       newDims[0],
       static_cast<int64_t>(std::ceil(
-          sizes_and_strides_.size_at_unchecked(0) * (1 + growthPct / 100))));
+          static_cast<float>(sizes_and_strides_.size_at_unchecked(0)) *
+          (1 + growthPct / 100))));
   auto oldData = std::move(storage_.data_ptr());
   auto oldSize = numel_;
   Resize(std::move(newCapacity));
@@ -1105,22 +1171,17 @@ void TensorImpl::generic_set_sizes_contiguous(SymIntArrayRef sizes) {
 
 void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) {
   TORCH_INTERNAL_ASSERT(has_symbolic_sizes_strides_);
-#ifdef DEBUG
-  TORCH_INTERNAL_ASSERT(
-      compute_numel() == numel_,
-      "If you are seeing this error, that means empty_tensor_restride was "
-      "called before setting correct numel");
-#endif
   switch (memory_format) {
     case MemoryFormat::Contiguous: {
-      // dim_ is a virtual call, don't repeat it
-      const auto dim_ = dim();
+      // TODO: figure out if the non-symint version can also devirtualize;
+      // the last time we tried it was probably a narrowing problem
+      const auto dim_ = static_cast<int64_t>(extra_meta_->sizes_.size());
       extra_meta_->strides_.resize(dim_);
       if (dim_ > 0) {
         const auto last_idx = dim_ - 1;
         extra_meta_->strides_[last_idx] = c10::SymInt(1);
         for (auto i = last_idx - 1; i >= 0; --i) {
-          extra_meta_->strides_[last_idx] =
+          extra_meta_->strides_[i] =
               extra_meta_->strides_[i + 1] * extra_meta_->sizes_[i + 1].max(1);
         }
       }
@@ -1129,15 +1190,15 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) {
     case MemoryFormat::ChannelsLast: {
       TORCH_CHECK(
           dim() == 4, "required rank 4 tensor to use channels_last format");
-      set_sizes_and_strides(
-          sym_sizes(), get_channels_last_strides_2d(sym_sizes()));
+      clone_symvec(
+          get_channels_last_strides_2d(sym_sizes()), extra_meta_->strides_);
       break;
     }
     case MemoryFormat::ChannelsLast3d: {
       TORCH_CHECK(
           dim() == 5, "required rank 5 tensor to use channels_last_3d format");
-      set_sizes_and_strides(
-          sym_sizes(), get_channels_last_strides_3d(sym_sizes()));
+      clone_symvec(
+          get_channels_last_strides_3d(sym_sizes()), extra_meta_->strides_);
       break;
     }
     case MemoryFormat::Preserve:
@@ -1151,6 +1212,29 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) {
   // recompute contiguous flag, as currently NHWC/NCHW flags are not mutually
   // exclusive see #24090
   refresh_contiguous();
+  // hard code some known true settings, for unbacked case
+  // TODO: avoid chundering into the guards for computing these
+  switch (memory_format) {
+    case MemoryFormat::Contiguous: {
+      extra_meta_->is_contiguous_ = true;
+      extra_meta_->is_non_overlapping_and_dense_ = true;
+      break;
+    }
+    case MemoryFormat::ChannelsLast: {
+      extra_meta_->is_channels_last_contiguous_ = true;
+      extra_meta_->is_channels_last_ = true;
+      extra_meta_->is_non_overlapping_and_dense_ = true;
+      break;
+    }
+    case MemoryFormat::ChannelsLast3d: {
+      extra_meta_->is_channels_last_3d_contiguous_ = true;
+      extra_meta_->is_channels_last_3d_ = true;
+      extra_meta_->is_non_overlapping_and_dense_ = true;
+      break;
+    }
+    default:
+      break;
+  }
 }
 
 namespace impl {
diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h
index 3b2fe47eabd1..ae8fa515b06b 100644
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -331,9 +331,9 @@ struct C10_API VariableVersion {
   // doesn't allocate the intrusive_ptr.
   // Example use cases are:
   //  - Inference tensors don't track version counter, so they'll just always
-  //    have disbaled VariableVersion.
+  //    have disabled VariableVersion.
   //  - In SavedVariable class we override version_counter_ inside its
-  //  construtor
+  //  constructor
   //    so that we can use the cheap constructor there.
   enum Disabled { DISABLED };
   // It's okay to return true even for inference tensor which
@@ -388,6 +388,15 @@ struct C10_API VariableVersion {
     }
   }
 
+  void set_version(int64_t i) {
+    TORCH_CHECK(
+        version_counter_,
+        "Tried to call torch.autograd._unsafe_set_version() on a tensor "
+        "that does not have a version counter. Was it created in inference mode?");
+    TORCH_CHECK(i >= 0, "Cannot set a version_counter to a value below 0: ", i);
+    version_counter_->version_ = i;
+  }
+
   // Inference tensor doesn't have version counter so it shouldn't be
   // accessed.
   uint32_t current_version() const {
@@ -503,7 +512,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class;
  */
 struct C10_API TensorImpl : public c10::intrusive_ptr_target {
   TensorImpl() = delete;
-  virtual ~TensorImpl() override;
+  ~TensorImpl() override;
   // Note [Enum ImplType]
   // This enum is temporary. In the followup refactor we should
   // think about how to specialize TensorImpl creation for view
@@ -1887,7 +1896,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
            BackendComponent::CUDABit,
            BackendComponent::MPSBit,
            BackendComponent::HIPBit,
-           BackendComponent::XPUBit});
+           BackendComponent::XPUBit,
+           BackendComponent::HPUBit});
       constexpr auto dense_k = DispatchKeySet(DispatchKey::Dense);
       return ts.has_any(dense_k) && ts.has_any(dense_backends);
     };
diff --git a/c10/core/alignment.h b/c10/core/alignment.h
index 4a8c732ef42d..2877decc04d7 100644
--- a/c10/core/alignment.h
+++ b/c10/core/alignment.h
@@ -14,4 +14,8 @@ constexpr size_t gAlignment = 16;
 constexpr size_t gAlignment = 64;
 #endif
 
+constexpr size_t gPagesize = 4096;
+// since the default thp pagesize is 2MB, enable thp only
+// for buffers of size 2MB or larger to avoid memory bloating
+constexpr size_t gAlloc_threshold_thp = 2 * 1024 * 1024;
 } // namespace c10
diff --git a/c10/core/build.bzl b/c10/core/build.bzl
index 24c5947185a5..eb2c01d56d6f 100644
--- a/c10/core/build.bzl
+++ b/c10/core/build.bzl
@@ -47,7 +47,7 @@ def define_targets(rules):
         visibility = ["//visibility:public"],
         deps = [
             ":alignment",
-            "//c10/macros",
+            "//c10/macros:macros",
             "//c10/util:base",
         ],
     )
@@ -82,7 +82,7 @@ def define_targets(rules):
         visibility = ["//visibility:public"],
         deps = [
             ":ScalarType",
-            "//c10/macros",
+            "//c10/macros:macros",
             "//c10/util:TypeCast",
             "//c10/util:base",
             "//c10/util:typeid",
diff --git a/c10/core/impl/InlineStreamGuard.h b/c10/core/impl/InlineStreamGuard.h
index 7f4691e84a79..71be63d8ad88 100644
--- a/c10/core/impl/InlineStreamGuard.h
+++ b/c10/core/impl/InlineStreamGuard.h
@@ -208,7 +208,7 @@ class InlineMultiStreamGuard {
       impl_.emplace(getDeviceTypeOfStreams(streams));
       original_streams_.reserve(streams.size());
       for (const Stream& s : streams) {
-        original_streams_.push_back(this->impl_->exchangeStream(s));
+        original_streams_.emplace_back(this->impl_->exchangeStream(s));
       }
     }
   }
diff --git a/c10/core/impl/PyInterpreter.cpp b/c10/core/impl/PyInterpreter.cpp
index 0e251538e142..d574de071d7a 100644
--- a/c10/core/impl/PyInterpreter.cpp
+++ b/c10/core/impl/PyInterpreter.cpp
@@ -97,10 +97,17 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
   };
 };
 
+// Construct this in Global scope instead of within `disarm`
+// where it will be only initialized first time `disarm` is called.
+// This increases the likelihood `noop_vtable` lives longer than
+// any object that refers to it.
+
+// If `noop_vtable` goes out of scope first, other objects will have dangling
+// reference to it.
+static NoopPyInterpreterVTable noop_vtable;
+
 void PyInterpreter::disarm() noexcept {
-  // Intentionally leaked
-  static PyInterpreterVTable* noop_vtable = new NoopPyInterpreterVTable();
-  vtable_ = noop_vtable;
+  vtable_ = &noop_vtable;
 }
 
 } // namespace impl
diff --git a/c10/core/impl/PyObjectSlot.cpp b/c10/core/impl/PyObjectSlot.cpp
index 519b19865e65..3fc5670147ce 100644
--- a/c10/core/impl/PyObjectSlot.cpp
+++ b/c10/core/impl/PyObjectSlot.cpp
@@ -26,6 +26,7 @@ PyInterpreter* PyObjectSlot::pyobj_interpreter() {
 }
 
 PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
+  // NOLINTNEXTLINE(performance-no-int-to-ptr)
   return reinterpret_cast<PyObject*>(
       reinterpret_cast<uintptr_t>(pyobj_) & ~0x1ULL);
 }
@@ -47,10 +48,12 @@ PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
 }
 
 bool PyObjectSlot::owns_pyobj() {
+  // NOLINTNEXTLINE(performance-no-int-to-ptr)
   return reinterpret_cast<uintptr_t>(pyobj_) & 1;
 }
 
 void PyObjectSlot::set_owns_pyobj(bool b) {
+  // NOLINTNEXTLINE(performance-no-int-to-ptr)
   pyobj_ = reinterpret_cast<PyObject*>(
       reinterpret_cast<uintptr_t>(_unchecked_untagged_pyobj()) | b);
 }
diff --git a/c10/core/impl/TorchDispatchModeTLS.cpp b/c10/core/impl/TorchDispatchModeTLS.cpp
index 193faa7709f5..65eb48fc003a 100644
--- a/c10/core/impl/TorchDispatchModeTLS.cpp
+++ b/c10/core/impl/TorchDispatchModeTLS.cpp
@@ -3,13 +3,15 @@
 #include <c10/core/impl/LocalDispatchKeySet.h>
 #include <c10/core/impl/TorchDispatchModeTLS.h>
 
+#include <utility>
+
 namespace c10 {
 namespace impl {
 
 thread_local TorchDispatchModeTLS torchDispatchModeState;
 
 void TorchDispatchModeTLS::push_onto_stack(std::shared_ptr<SafePyObject> mode) {
-  if (torchDispatchModeState.stack_.size() == 0) {
+  if (torchDispatchModeState.stack_.empty()) {
     c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true);
     c10::impl::tls_set_dispatch_key_included(
         DispatchKey::PythonTLSSnapshot, true);
@@ -19,12 +21,12 @@ void TorchDispatchModeTLS::push_onto_stack(std::shared_ptr<SafePyObject> mode) {
 
 const std::shared_ptr<SafePyObject> TorchDispatchModeTLS::pop_stack() {
   TORCH_CHECK(
-      torchDispatchModeState.stack_.size() > 0,
+      !torchDispatchModeState.stack_.empty(),
       "trying to pop from empty mode stack");
   std::shared_ptr<SafePyObject> out = torchDispatchModeState.stack_.back();
   torchDispatchModeState.stack_.pop_back();
 
-  if (torchDispatchModeState.stack_.size() == 0) {
+  if (torchDispatchModeState.stack_.empty()) {
     c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false);
     c10::impl::tls_set_dispatch_key_included(
         DispatchKey::PythonTLSSnapshot, false);
@@ -41,16 +43,16 @@ const std::shared_ptr<SafePyObject>& TorchDispatchModeTLS::get_stack_at(
 }
 
 int64_t TorchDispatchModeTLS::stack_len() {
-  return torchDispatchModeState.stack_.size();
+  return static_cast<int64_t>(torchDispatchModeState.stack_.size());
 }
 
 const TorchDispatchModeTLS& TorchDispatchModeTLS::get_state() {
   return torchDispatchModeState;
 }
 
-void TorchDispatchModeTLS::set_state(const TorchDispatchModeTLS& state) {
-  torchDispatchModeState = state;
-  if (torchDispatchModeState.stack_.size() == 0) {
+void TorchDispatchModeTLS::set_state(TorchDispatchModeTLS state) {
+  torchDispatchModeState = std::move(state);
+  if (torchDispatchModeState.stack_.empty()) {
     c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false);
     c10::impl::tls_set_dispatch_key_included(
         DispatchKey::PythonTLSSnapshot, false);
diff --git a/c10/core/impl/TorchDispatchModeTLS.h b/c10/core/impl/TorchDispatchModeTLS.h
index da30d0460427..7a288a459694 100644
--- a/c10/core/impl/TorchDispatchModeTLS.h
+++ b/c10/core/impl/TorchDispatchModeTLS.h
@@ -2,8 +2,6 @@
 
 #include <c10/core/SafePyObject.h>
 #include <c10/macros/Macros.h>
-#include <c10/util/ArrayRef.h>
-#include <c10/util/Optional.h>
 
 namespace c10 {
 namespace impl {
@@ -15,7 +13,7 @@ struct C10_API TorchDispatchModeTLS {
   static int64_t stack_len();
 
   static const TorchDispatchModeTLS& get_state();
-  static void set_state(const TorchDispatchModeTLS& state);
+  static void set_state(TorchDispatchModeTLS state);
 
  private:
   std::vector<std::shared_ptr<c10::SafePyObject>> stack_;
diff --git a/c10/core/impl/alloc_cpu.cpp b/c10/core/impl/alloc_cpu.cpp
index f2cd27e1add7..644f35f8de02 100644
--- a/c10/core/impl/alloc_cpu.cpp
+++ b/c10/core/impl/alloc_cpu.cpp
@@ -30,8 +30,8 @@ void memset_junk(void* data, size_t num) {
   static constexpr int32_t kJunkPattern = 0x7fedbeef;
   static constexpr int64_t kJunkPattern64 =
       static_cast<int64_t>(kJunkPattern) << 32 | kJunkPattern;
-  int32_t int64_count = num / sizeof(kJunkPattern64);
-  int32_t remaining_bytes = num % sizeof(kJunkPattern64);
+  auto int64_count = num / sizeof(kJunkPattern64);
+  auto remaining_bytes = num % sizeof(kJunkPattern64);
   int64_t* data_i64 = reinterpret_cast<int64_t*>(data);
   for (const auto i : c10::irange(int64_count)) {
     data_i64[i] = kJunkPattern64;
@@ -41,6 +41,38 @@ void memset_junk(void* data, size_t num) {
   }
 }
 
+static inline bool is_thp_alloc_enabled() {
+  static bool value = [&](const char* pt) {
+    if (pt != nullptr) {
+      return std::atoi(pt);
+    } else {
+      return 0;
+    }
+  }(std::getenv("THP_MEM_ALLOC_ENABLE"));
+  return value;
+}
+
+#ifdef __linux__
+inline size_t c10_compute_alignment(size_t nbytes) {
+  static const auto pagesize = sysconf(_SC_PAGESIZE);
+  // for kernels that don't provide page size, default it to 4K
+  const size_t thp_alignment = (gPagesize < 0 ? gPagesize : pagesize);
+  return (is_thp_alloc_enabled() ? thp_alignment : gAlignment);
+}
+
+inline bool is_thp_alloc(size_t nbytes) {
+  // enable thp (transparent huge pages) for larger buffers
+  return (is_thp_alloc_enabled() && (nbytes >= gAlloc_threshold_thp));
+}
+#else
+constexpr size_t c10_compute_alignment(C10_UNUSED size_t nbytes) {
+  return gAlignment;
+}
+
+constexpr bool is_thp_alloc(C10_UNUSED size_t nbytes) {
+  return false;
+}
+#endif
 } // namespace
 
 void* alloc_cpu(size_t nbytes) {
@@ -71,7 +103,7 @@ void* alloc_cpu(size_t nbytes) {
       nbytes,
       " bytes.");
 #else
-  int err = posix_memalign(&data, gAlignment, nbytes);
+  int err = posix_memalign(&data, c10_compute_alignment(nbytes), nbytes);
   CAFFE_ENFORCE(
       err == 0,
       "DefaultCPUAllocator: can't allocate memory: you tried to allocate ",
@@ -81,6 +113,16 @@ void* alloc_cpu(size_t nbytes) {
       " (",
       strerror(err),
       ")");
+#ifdef __linux__
+  // MADV_HUGEPAGE advise is available only for linux.
+  // general posix compliant systems can check POSIX_MADV_SEQUENTIAL advise.
+  if (is_thp_alloc(nbytes)) {
+    int ret = madvise(data, nbytes, MADV_HUGEPAGE);
+    if (ret != 0) {
+      TORCH_WARN_ONCE("thp madvise for HUGEPAGE failed with ", strerror(errno));
+    }
+  }
+#endif
 #endif
 
   // move data to a thread's NUMA node
diff --git a/c10/core/impl/alloc_cpu.h b/c10/core/impl/alloc_cpu.h
index dc0f97f0f3c1..3f28be980e6b 100644
--- a/c10/core/impl/alloc_cpu.h
+++ b/c10/core/impl/alloc_cpu.h
@@ -4,6 +4,11 @@
 
 #include <cstddef>
 
+#ifdef __linux__
+#include <sys/mman.h>
+#include <unistd.h>
+#endif
+
 namespace c10 {
 
 C10_API void* alloc_cpu(size_t nbytes);
diff --git a/c10/core/thread_pool.cpp b/c10/core/thread_pool.cpp
index 7ccc3948e8c1..7aaf085df9d2 100644
--- a/c10/core/thread_pool.cpp
+++ b/c10/core/thread_pool.cpp
@@ -57,7 +57,7 @@ bool ThreadPool::inThreadPool() const {
 }
 
 void ThreadPool::run(std::function<void()> func) {
-  if (threads_.size() == 0) {
+  if (threads_.empty()) {
     throw std::runtime_error("No threads to run a task");
   }
   std::unique_lock<std::mutex> lock(mutex_);
@@ -71,9 +71,7 @@ void ThreadPool::run(std::function<void()> func) {
 
 void ThreadPool::waitWorkComplete() {
   std::unique_lock<std::mutex> lock(mutex_);
-  while (!complete_) {
-    completed_.wait(lock);
-  }
+  completed_.wait(lock, [&]() { return complete_; });
 }
 
 void ThreadPool::main_loop(std::size_t index) {
@@ -81,9 +79,7 @@ void ThreadPool::main_loop(std::size_t index) {
   while (running_) {
     // Wait on condition variable while the task is empty and
     // the pool is still running.
-    while (tasks_.empty() && running_) {
-      condition_.wait(lock);
-    }
+    condition_.wait(lock, [&]() { return !tasks_.empty() || !running_; });
     // If pool is no longer running, break out of loop.
     if (!running_) {
       break;
diff --git a/c10/core/thread_pool.h b/c10/core/thread_pool.h
index 9d2d6b5e3dac..bc35707ef5f9 100644
--- a/c10/core/thread_pool.h
+++ b/c10/core/thread_pool.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <atomic>
 #include <condition_variable>
 #include <functional>
 #include <mutex>
@@ -7,8 +8,6 @@
 #include <thread>
 #include <utility>
 
-#include <c10/util/Optional.h>
-#include <c10/util/intrusive_ptr.h>
 #include <c10/util/numa.h>
 #include <c10/util/thread_name.h>
 
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 2b3efee1ce67..e61e30dc6132 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -12,6 +12,7 @@
 #include <cuda_runtime_api.h>
 #include <algorithm>
 #include <bitset>
+#include <cstdint>
 #include <deque>
 #include <iterator>
 #include <map>
@@ -183,16 +184,17 @@ struct Block {
   cudaStream_t stream; // allocation stream
   stream_set stream_uses; // streams on which the block was used
   size_t size; // block size in bytes
-  BlockPool* pool; // owning memory pool
-  void* ptr; // memory address
-  bool allocated; // in-use flag
-  Block* prev; // prev block if split from a larger allocation
-  Block* next; // next block if split from a larger allocation
-  int event_count; // number of outstanding CUDA events
-  int gc_count; // counter for prioritizing older / less useful blocks for
-                // garbage collection
+  size_t requested_size; // memory originally requested
+  BlockPool* pool{nullptr}; // owning memory pool
+  void* ptr{nullptr}; // memory address
+  bool allocated{false}; // in-use flag
+  Block* prev{nullptr}; // prev block if split from a larger allocation
+  Block* next{nullptr}; // next block if split from a larger allocation
+  int event_count{0}; // number of outstanding CUDA events
+  int gc_count{0}; // counter for prioritizing older / less useful blocks for
+                   // garbage collection
   std::unique_ptr<HistoryChain> history;
-  HistoryChain* history_last;
+  HistoryChain* history_last{nullptr};
 
   Block(
       int device,
@@ -204,13 +206,9 @@ struct Block {
         stream(stream),
         stream_uses(),
         size(size),
+        requested_size(0),
         pool(pool),
-        ptr(ptr),
-        allocated(0),
-        prev(nullptr),
-        next(nullptr),
-        event_count(0),
-        gc_count(0) {}
+        ptr(ptr) {}
 
   // constructor for search key
   Block(int device, cudaStream_t stream, size_t size)
@@ -218,13 +216,7 @@ struct Block {
         stream(stream),
         stream_uses(),
         size(size),
-        pool(nullptr),
-        ptr(nullptr),
-        allocated(0),
-        prev(nullptr),
-        next(nullptr),
-        event_count(0),
-        gc_count(0) {}
+        requested_size(0) {}
 
   bool is_split() const {
     return (prev != nullptr) || (next != nullptr);
@@ -371,16 +363,12 @@ struct MempoolIdHash {
 };
 
 cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) {
-// TODO: ideally we'd replace this with something like
-// !defined(TORCH_HIP_VERSION) as CUDA <= 10 support was dropped and really
-// this is only a workaround for TORCH_HIP_VERSION not being a sufficient guard
-// to prevent ROCM build breakage.
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   if (at::cuda::currentStreamCaptureStatusMayInitCtx() ==
       at::cuda::CaptureStatus::None) {
 #endif
     return C10_CUDA_ERROR_HANDLED(cudaMalloc(p, size));
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   } else {
     // It's ok to capture cudaMallocs, as long as we never cudaFree those
     // addresses before replay.
@@ -482,16 +470,16 @@ void CachingAllocatorConfig::lexArgs(
   for (size_t i = 0; i < env_length; i++) {
     if (env[i] == ',' || env[i] == ':' || env[i] == '[' || env[i] == ']') {
       if (buf.size() != 0) {
-        config.emplace_back(std::string(buf.begin(), buf.end()));
+        config.emplace_back(buf.begin(), buf.end());
         buf.clear();
       }
-      config.emplace_back(std::string(1, env[i]));
+      config.emplace_back(1, env[i]);
     } else if (env[i] != ' ') {
       buf.emplace_back(static_cast<char>(env[i]));
     }
   }
   if (!buf.empty()) {
-    config.emplace_back(std::string(buf.begin(), buf.end()));
+    config.emplace_back(buf.begin(), buf.end());
   }
 }
 
@@ -898,12 +886,29 @@ class DeviceCachingAllocator {
           stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
               .current,
           c10::Device(c10::DeviceType::CUDA, static_cast<DeviceIndex>(device)));
-      for (const auto& obs : oom_observers_) {
+
+      auto allocated_bytes =
+          stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
+              .current;
+      auto reserved_bytes =
+          stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
+              .current;
+      auto observers_local = oom_observers_;
+
+      // Make sure we do not have the device lock before calling our
+      // observers which might need hold the GIL
+      // It is safe to release at this point because will no longer
+      // be reading any allocator state.
+
+      lock.unlock();
+
+      for (const auto& obs : observers_local) {
         obs(device,
             alloc_size,
             set_fraction ? allowed_memory_maximum : device_total,
             device_free);
       }
+
       // "total capacity": total global memory on GPU
       // "allowed": memory is allowed to use, which set by fraction.
       // "already allocated": memory allocated by the program using the
@@ -932,16 +937,12 @@ class DeviceCachingAllocator {
           "; ",
           format_size(device_total),
           " total capacity; ",
-          format_size(
-              stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
-                  .current),
+          format_size(allocated_bytes),
           " already allocated; ",
           format_size(device_free),
           " free; ",
           allowed_info,
-          format_size(
-              stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
-                  .current),
+          format_size(reserved_bytes),
           " reserved in total by PyTorch)",
           " If reserved memory is >> allocated memory try setting max_split_size_mb to avoid"
           " fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF",
@@ -978,12 +979,16 @@ class DeviceCachingAllocator {
       if (already_split) {
         // An already-split inactive block is being shrunk by size bytes.
         update_stat_array(
-            stats.inactive_split_bytes, -block->size, params.stat_types);
+            stats.inactive_split_bytes,
+            -static_cast<std::int64_t>(block->size),
+            params.stat_types);
       } else {
         // A new split inactive block is being created from a previously unsplit
         // block, size remaining->size bytes.
         for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
-          update_stat(stats.inactive_split_bytes[stat_type], remaining->size);
+          update_stat(
+              stats.inactive_split_bytes[stat_type],
+              static_cast<std::int64_t>(remaining->size));
           update_stat(stats.inactive_split[stat_type], 1);
         });
       }
@@ -991,12 +996,15 @@ class DeviceCachingAllocator {
     } else if (already_split) {
       // An already-split block is becoming active
       for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
-        update_stat(stats.inactive_split_bytes[stat_type], -block->size);
+        update_stat(
+            stats.inactive_split_bytes[stat_type],
+            -static_cast<std::int64_t>(block->size));
         update_stat(stats.inactive_split[stat_type], -1);
       });
     }
 
     block->allocated = true;
+    block->requested_size = orig_size;
     if (record_history) {
       trimHistoryBefore(block, (char*)block->ptr + size);
       block->history = std::make_unique<HistoryChain>(HistoryChain{
@@ -1018,9 +1026,16 @@ class DeviceCachingAllocator {
 
     for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) {
       update_stat(stats.allocation[stat_type], 1);
-      update_stat(stats.allocated_bytes[stat_type], block->size);
+      update_stat(
+          stats.allocated_bytes[stat_type],
+          static_cast<std::int64_t>(block->size));
       update_stat(stats.active[stat_type], 1);
-      update_stat(stats.active_bytes[stat_type], block->size);
+      update_stat(
+          stats.active_bytes[stat_type],
+          static_cast<std::int64_t>(block->size));
+      update_stat(
+          stats.requested_bytes[stat_type],
+          static_cast<std::int64_t>(block->requested_size));
     });
     if (block->size >= CachingAllocatorConfig::max_split_size())
       update_stat(stats.oversize_allocations, 1);
@@ -1051,7 +1066,9 @@ class DeviceCachingAllocator {
         true;
     for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
       update_stat(stats.allocation[stat_type], -1);
-      update_stat(stats.allocated_bytes[stat_type], -block->size);
+      update_stat(
+          stats.allocated_bytes[stat_type],
+          -static_cast<std::int64_t>(block->size));
     });
     if (block->history) {
       record_trace(
@@ -1166,6 +1183,7 @@ class DeviceCachingAllocator {
       reset_accumulated_stat(stats.reserved_bytes[statType]);
       reset_accumulated_stat(stats.active_bytes[statType]);
       reset_accumulated_stat(stats.inactive_split_bytes[statType]);
+      reset_accumulated_stat(stats.requested_bytes[statType]);
     }
 
     stats.num_alloc_retries = 0;
@@ -1188,6 +1206,7 @@ class DeviceCachingAllocator {
       reset_peak_stat(stats.reserved_bytes[statType]);
       reset_peak_stat(stats.active_bytes[statType]);
       reset_peak_stat(stats.inactive_split_bytes[statType]);
+      reset_peak_stat(stats.requested_bytes[statType]);
     }
     reset_peak_stat(stats.oversize_allocations);
     reset_peak_stat(stats.oversize_segments);
@@ -1218,6 +1237,7 @@ class DeviceCachingAllocator {
         BlockInfo& block_info = segment_info.blocks.back();
 
         block_info.size = block->size;
+        block_info.requested_size = block->requested_size;
         block_info.allocated = block->allocated;
         block_info.active = block->allocated || (block->event_count > 0) ||
             !block->stream_uses.empty();
@@ -1228,6 +1248,7 @@ class DeviceCachingAllocator {
         }
         if (block_info.active) {
           segment_info.active_size += block_info.size;
+          segment_info.requested_size += block_info.requested_size;
         }
         HistoryChain* h = block->history.get();
         while (h) {
@@ -1403,6 +1424,7 @@ class DeviceCachingAllocator {
           block->history->h.context);
     }
     size_t original_block_size = block->size;
+    size_t requested_size = block->requested_size;
 
     auto& pool = *block->pool;
     int64_t net_change_inactive_split_blocks = 0;
@@ -1439,7 +1461,12 @@ class DeviceCachingAllocator {
           stats.inactive_split_bytes[stat_type],
           net_change_inactive_split_size);
       update_stat(stats.active[stat_type], -1);
-      update_stat(stats.active_bytes[stat_type], -original_block_size);
+      update_stat(
+          stats.active_bytes[stat_type],
+          -static_cast<std::int64_t>(original_block_size));
+      update_stat(
+          stats.requested_bytes[stat_type],
+          -static_cast<std::int64_t>(requested_size));
     });
   }
 
@@ -1492,7 +1519,7 @@ class DeviceCachingAllocator {
   }
 
   BlockPool& get_pool(size_t size, cudaStream_t stream) {
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
     // captures_underway is a conservative guess that the current stream may be
     // capturing. It's only > 0 if some thread has begun and not yet ended a
     // capture, so it's usually 0, and we can short-circuit
@@ -1790,7 +1817,9 @@ class DeviceCachingAllocator {
     stat_types[static_cast<size_t>(get_stat_type_for_pool(*pool))] = true;
     for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
       update_stat(stats.segment[stat_type], -1);
-      update_stat(stats.reserved_bytes[stat_type], -block->size);
+      update_stat(
+          stats.reserved_bytes[stat_type],
+          -static_cast<std::int64_t>(block->size));
     });
     if (block->size >= CachingAllocatorConfig::max_split_size())
       update_stat(stats.oversize_segments, -1);
diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h
index 41e082933d55..303890ef9449 100644
--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@@ -68,7 +68,7 @@ struct DeviceStats {
   // released via cudaFree)
   StatArray inactive_split;
 
-  // SUM: bytes requested by client code
+  // SUM: bytes allocated by this memory alocator
   StatArray allocated_bytes;
   // SUM: bytes reserved by this memory allocator (both free and used)
   StatArray reserved_bytes;
@@ -76,6 +76,8 @@ struct DeviceStats {
   StatArray active_bytes;
   // SUM: bytes within inactive, split memory blocks
   StatArray inactive_split_bytes;
+  // SUM: bytes requested by client code
+  StatArray requested_bytes;
 
   // COUNT: total number of failed calls to CUDA malloc necessitating cache
   // flushes.
@@ -95,7 +97,7 @@ struct DeviceStats {
 };
 
 struct Context {
-  virtual ~Context() {}
+  virtual ~Context() = default;
 };
 
 typedef std::shared_ptr<Context> (*CreateContextFn)(void);
@@ -110,6 +112,7 @@ struct History {
 // cudaMalloc)..
 struct BlockInfo {
   int64_t size = 0;
+  int64_t requested_size = 0;
   int32_t gc_counter = 0;
   bool allocated = false;
   bool active = false;
@@ -121,6 +124,7 @@ struct SegmentInfo {
   int64_t device = 0;
   int64_t address = 0;
   int64_t total_size = 0;
+  int64_t requested_size = 0;
   int64_t allocated_size = 0;
   int64_t active_size = 0;
   cudaStream_t stream = 0;
diff --git a/c10/cuda/CUDADeviceAssertion.h b/c10/cuda/CUDADeviceAssertion.h
index 65aca3c6399b..285668f13427 100644
--- a/c10/cuda/CUDADeviceAssertion.h
+++ b/c10/cuda/CUDADeviceAssertion.h
@@ -18,7 +18,7 @@ static __device__ void dstrcpy(char* dst, const char* src) {
   *dst = '\0';
 }
 
-__device__ void dsa_add_new_assertion_failure(
+static __device__ void dsa_add_new_assertion_failure(
     DeviceAssertionsData* assertions_data,
     const char* assertion_msg,
     const char* filename,
diff --git a/c10/cuda/CUDAException.cpp b/c10/cuda/CUDAException.cpp
index 3be77dd7d138..24f3d928af69 100644
--- a/c10/cuda/CUDAException.cpp
+++ b/c10/cuda/CUDAException.cpp
@@ -24,6 +24,9 @@ void c10_cuda_check_implementation(
     return;
   }
 
+  auto error_unused C10_UNUSED = cudaGetLastError();
+  (void)error_unused;
+
   std::string check_message;
 #ifndef STRIP_ERROR_MESSAGES
   check_message.append("CUDA error: ");
diff --git a/c10/cuda/CUDAGraphsC10Utils.h b/c10/cuda/CUDAGraphsC10Utils.h
index d78e7a182708..2fbe9c186e81 100644
--- a/c10/cuda/CUDAGraphsC10Utils.h
+++ b/c10/cuda/CUDAGraphsC10Utils.h
@@ -17,12 +17,7 @@ using MempoolId_t = std::pair<CaptureId_t, CaptureId_t>;
 
 // RAII guard for "cudaStreamCaptureMode", a thread-local value
 // that controls the error-checking strictness of a capture.
-
-// TODO: ideally we'd replace this with something like
-// !defined(TORCH_HIP_VERSION) as CUDA <= 10 support was dropped and really
-// this is only a workaround for TORCH_HIP_VERSION not being a sufficient guard
-// to prevent ROCM build breakage.
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
 struct C10_CUDA_API CUDAStreamCaptureModeGuard {
   CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired) {
     strictness_ = desired;
@@ -37,7 +32,7 @@ struct C10_CUDA_API CUDAStreamCaptureModeGuard {
 };
 #endif
 
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
 // Protects against enum cudaStreamCaptureStatus implementation changes.
 // Some compilers seem not to like static_assert without the messages.
 static_assert(
@@ -52,7 +47,7 @@ static_assert(
 #endif
 
 enum class CaptureStatus : int {
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   None = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone),
   Active = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive),
   Invalidated = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated)
@@ -66,7 +61,7 @@ inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) {
     case CaptureStatus::None:
       os << "cudaStreamCaptureStatusNone";
       break;
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
     case CaptureStatus::Active:
       os << "cudaStreamCaptureStatusActive";
       break;
@@ -83,7 +78,7 @@ inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) {
 
 // Use this version where you're sure a CUDA context exists already.
 inline CaptureStatus currentStreamCaptureStatusMayInitCtx() {
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
   cudaStreamCaptureStatus is_capturing;
   C10_CUDA_CHECK(
       cudaStreamIsCapturing(c10::cuda::getCurrentCUDAStream(), &is_capturing));
diff --git a/c10/cuda/CUDAMallocAsyncAllocator.cpp b/c10/cuda/CUDAMallocAsyncAllocator.cpp
index f567a2655c94..d4bb53853720 100644
--- a/c10/cuda/CUDAMallocAsyncAllocator.cpp
+++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp
@@ -33,9 +33,9 @@ namespace {
 struct UsageStream {
   cudaStream_t stream;
   int device;
-  UsageStream() {}
+  UsageStream() = default;
   UsageStream(cudaStream_t s, int d) : stream(s), device(d) {}
-  UsageStream(const UsageStream& us) : stream(us.stream), device(us.device) {}
+  UsageStream(const UsageStream& us) = default;
   UsageStream(const UsageStream&& us) : stream(us.stream), device(us.device) {}
   UsageStream& operator=(UsageStream other) {
     stream = other.stream;
@@ -262,9 +262,8 @@ inline void free_impl(PtrInfo::iterator& it) {
 
     if (C10_UNLIKELY(capture_underway)) {
       // See Note [Avoid dangling free streams during CUDA graph capture]
-      capture_free_streams.insert(UsageStream(
-          dummy_unifying_free_stream.stream,
-          dummy_unifying_free_stream.device));
+      capture_free_streams.emplace(
+          dummy_unifying_free_stream.stream, dummy_unifying_free_stream.device);
     }
   }
 
diff --git a/c10/cuda/CUDAStream.h b/c10/cuda/CUDAStream.h
index 61f5881b44ef..094372a74f46 100644
--- a/c10/cuda/CUDAStream.h
+++ b/c10/cuda/CUDAStream.h
@@ -165,9 +165,9 @@ class C10_CUDA_API CUDAStream {
 
   // Unpack a CUDAStream from the 3 fields generated by pack().
   static CUDAStream unpack3(
-      int64_t stream_id,
-      int64_t device_index,
-      int64_t device_type) {
+      StreamId stream_id,
+      DeviceIndex device_index,
+      DeviceType device_type) {
     return CUDAStream(Stream::unpack3(stream_id, device_index, device_type));
   }
 
@@ -203,7 +203,7 @@ class C10_CUDA_API CUDAStream {
  * isHighPriority to true, or a stream for a specific device by setting device
  * (defaulting to the current CUDA stream.)
  */
-TORCH_API CUDAStream
+C10_API CUDAStream
 getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
 
 /**
@@ -213,7 +213,7 @@ getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
  * want to operate on a non-torch allocated stream for data exchange or similar
  * purposes
  */
-TORCH_API CUDAStream
+C10_API CUDAStream
 getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
 
 /**
@@ -222,7 +222,7 @@ getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
  * where most computation occurs when you aren't explicitly using
  * streams.
  */
-TORCH_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
+C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
 
 /**
  * Get the current CUDA stream, for the passed CUDA device, or for the
@@ -231,7 +231,7 @@ TORCH_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
  * be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard'
  * or 'CUDAStreamGuard'.
  */
-TORCH_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
+C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
 
 /**
  * Set the current stream on the device of the passed in stream to be
@@ -243,7 +243,7 @@ TORCH_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
  * (which will switch both your current device and current stream in the way you
  * expect, and reset it back to its original state afterwards).
  */
-TORCH_API void setCurrentCUDAStream(CUDAStream stream);
+C10_API void setCurrentCUDAStream(CUDAStream stream);
 
 C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);
 
diff --git a/c10/cuda/build.bzl b/c10/cuda/build.bzl
index 9ee16a418e30..b9e16a321032 100644
--- a/c10/cuda/build.bzl
+++ b/c10/cuda/build.bzl
@@ -25,11 +25,12 @@ def define_targets(rules):
         linkstatic = True,
         local_defines = ["C10_BUILD_MAIN_LIB"],
         visibility = ["//visibility:public"],
+        defines = ["USE_CUDA"],
         deps = [
             ":Macros",
             "@cuda",
             "//c10/core:base",
-            "//c10/macros",
+            "//c10/macros:macros",
             "//c10/util:base",
         ],
         target_compatible_with = rules.requires_cuda_enabled(),
diff --git a/c10/cuda/impl/CUDAGuardImpl.h b/c10/cuda/impl/CUDAGuardImpl.h
index c2365e449a40..0a48ba060aa4 100644
--- a/c10/cuda/impl/CUDAGuardImpl.h
+++ b/c10/cuda/impl/CUDAGuardImpl.h
@@ -20,7 +20,7 @@ namespace impl {
 struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface {
   static constexpr DeviceType static_type = DeviceType::CUDA;
 
-  CUDAGuardImpl() {}
+  CUDAGuardImpl() = default;
   explicit CUDAGuardImpl(DeviceType t) {
     TORCH_INTERNAL_ASSERT(t == DeviceType::CUDA);
   }
diff --git a/c10/cuda/test/build.bzl b/c10/cuda/test/build.bzl
index 334b3a75b6aa..4f6afe0adbb2 100644
--- a/c10/cuda/test/build.bzl
+++ b/c10/cuda/test/build.bzl
@@ -16,7 +16,7 @@ def define_targets(rules):
         ],
         deps = [
             "@com_google_googletest//:gtest_main",
-            "//c10/cuda",
+            "//c10/cuda:cuda",
         ],
         target_compatible_with = rules.requires_cuda_enabled(),
     )
@@ -30,7 +30,7 @@ def define_targets(rules):
             ],
             deps = [
                 "@com_google_googletest//:gtest_main",
-                "//c10/cuda",
+                "//c10/cuda:cuda",
             ],
             target_compatible_with = rules.requires_cuda_enabled(),
         )
diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_catches_stream.cu b/c10/cuda/test/impl/CUDAAssertionsTest_catches_stream.cu
index 85b419ed48a3..90b9faff0a48 100644
--- a/c10/cuda/test/impl/CUDAAssertionsTest_catches_stream.cu
+++ b/c10/cuda/test/impl/CUDAAssertionsTest_catches_stream.cu
@@ -93,7 +93,7 @@ void cuda_device_assertions_catches_stream() {
 
 TEST(CUDATest, cuda_device_assertions_catches_stream) {
 #ifdef TORCH_USE_CUDA_DSA
-  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true;
+  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled_at_runtime = true;
   cuda_device_assertions_catches_stream();
 #else
   GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled at compile time.";
diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu b/c10/cuda/test/impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu
index 6cd448170579..01c83e37919a 100644
--- a/c10/cuda/test/impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu
+++ b/c10/cuda/test/impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu
@@ -78,7 +78,7 @@ void cuda_device_assertions_catches_thread_and_block_and_device() {
 
 TEST(CUDATest, cuda_device_assertions_catches_thread_and_block_and_device) {
 #ifdef TORCH_USE_CUDA_DSA
-  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true;
+  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled_at_runtime = true;
   cuda_device_assertions_catches_thread_and_block_and_device();
 #else
   GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled at compile time.";
diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_from_2_processes.cu b/c10/cuda/test/impl/CUDAAssertionsTest_from_2_processes.cu
index 6be834040459..c3b7215f6a9c 100644
--- a/c10/cuda/test/impl/CUDAAssertionsTest_from_2_processes.cu
+++ b/c10/cuda/test/impl/CUDAAssertionsTest_from_2_processes.cu
@@ -96,7 +96,7 @@ void cuda_device_assertions_from_2_processes() {
 
 TEST(CUDATest, cuda_device_assertions_from_2_processes) {
 #ifdef TORCH_USE_CUDA_DSA
-  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true;
+  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled_at_runtime = true;
   cuda_device_assertions_from_2_processes();
 #else
   GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled at compile time.";
diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu
index 8072e310cd43..eb6ce03343d9 100644
--- a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu
+++ b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu
@@ -85,7 +85,7 @@ void cuda_device_assertions_multiple_writes_from_blocks_and_threads() {
 
 TEST(CUDATest, cuda_device_assertions_multiple_writes_from_blocks_and_threads) {
 #ifdef TORCH_USE_CUDA_DSA
-  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true;
+  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled_at_runtime = true;
   cuda_device_assertions_multiple_writes_from_blocks_and_threads();
 #else
   GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled at compile time.";
diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu
index 81784ab4ffd5..4e3c73542a8e 100644
--- a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu
+++ b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu
@@ -82,7 +82,7 @@ void cuda_device_assertions_multiple_writes_from_multiple_blocks() {
 
 TEST(CUDATest, cuda_device_assertions_multiple_writes_from_multiple_blocks) {
 #ifdef TORCH_USE_CUDA_DSA
-  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true;
+  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled_at_runtime = true;
   cuda_device_assertions_multiple_writes_from_multiple_blocks();
 #else
   GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled at compile time.";
diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu
index 8858e65467bb..64a543652e0e 100644
--- a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu
+++ b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu
@@ -70,7 +70,7 @@ void cuda_device_assertions_multiple_writes_from_same_block() {
 
 TEST(CUDATest, cuda_device_assertions_multiple_writes_from_same_block) {
 #ifdef TORCH_USE_CUDA_DSA
-  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true;
+  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled_at_runtime = true;
   cuda_device_assertions_multiple_writes_from_same_block();
 #else
   GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled at compile time.";
diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h
index 3f055ae054d0..966a7a27ff06 100644
--- a/c10/macros/Macros.h
+++ b/c10/macros/Macros.h
@@ -384,7 +384,7 @@ __host__ __device__
         const char* assertion,
         const char* file,
         unsigned int line,
-        const char* function) throw() __attribute__((__noreturn__));
+        const char* function) noexcept __attribute__((__noreturn__));
 
 #if (defined(__HIP_ARCH__) || defined(__HIP__)) && \
     !defined(TORCH_DISABLE_GPU_ASSERTS)
@@ -434,8 +434,7 @@ __device__ __attribute__((noinline)) __attribute__((weak)) void __assert_fail(
 // Warning: __has_trivial_copy for GCC may not always detect the non-POD
 // correctly. For example, T = std::unique_ptr may evaluate to true and be
 // treated as POD. This can cause unexpected behavior.
-#if defined(__GNUG__) && __GNUC__ < 5 && \
-    !(defined(__clang__) && defined(_LIBCPP_VERSION))
+#if defined(__GNUG__) && __GNUC__ < 5 && !defined(__clang__)
 #define C10_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T)
 #else
 #define C10_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable<T>::value
diff --git a/c10/test/build.bzl b/c10/test/build.bzl
index 0b3a5a5f3d84..ed123399a8db 100644
--- a/c10/test/build.bzl
+++ b/c10/test/build.bzl
@@ -47,7 +47,7 @@ def define_targets(rules):
             ":complex_math_test_common",
             ":complex_test_common",
             "@com_google_googletest//:gtest_main",
-            "//c10/macros",
+            "//c10/macros:macros",
             "//c10/util:base",
         ],
     )
@@ -74,7 +74,7 @@ def define_targets(rules):
         hdrs = ["util/complex_test_common.h"],
         deps = [
             "@com_google_googletest//:gtest",
-            "//c10/macros",
+            "//c10/macros:macros",
             "//c10/util:base",
         ],
         testonly = True,
diff --git a/c10/util/Bitset.h b/c10/util/Bitset.h
index 4143ae595e31..fedca4f02aea 100644
--- a/c10/util/Bitset.h
+++ b/c10/util/Bitset.h
@@ -33,7 +33,7 @@ struct bitset final {
     return 8 * sizeof(bitset_type);
   }
 
-  constexpr bitset() noexcept : bitset_(0) {}
+  constexpr bitset() noexcept = default;
   constexpr bitset(const bitset&) noexcept = default;
   constexpr bitset(bitset&&) noexcept = default;
   // there is an issure for gcc 5.3.0 when define default function as constexpr
@@ -109,7 +109,7 @@ struct bitset final {
     return lhs.bitset_ == rhs.bitset_;
   }
 
-  bitset_type bitset_;
+  bitset_type bitset_{0};
 };
 
 inline bool operator!=(bitset lhs, bitset rhs) noexcept {
diff --git a/c10/util/Flags.h b/c10/util/Flags.h
index 1f9698dc990d..516b474b3653 100644
--- a/c10/util/Flags.h
+++ b/c10/util/Flags.h
@@ -208,7 +208,7 @@ C10_DECLARE_REGISTRY(C10FlagsRegistry, C10FlagParser, const std::string&);
   C10_DEFINE_typed_var(std::string, name, default_value, help_str)
 
 // DECLARE_typed_var should be used in header files and in the global namespace.
-#define C10_DECLARE_typed_var(type, name) C10_IMPORT extern type FLAGS_##name
+#define C10_DECLARE_typed_var(type, name) C10_API extern type FLAGS_##name
 
 #define C10_DECLARE_int(name) C10_DECLARE_typed_var(int, name)
 #define C10_DECLARE_int32(name) C10_DECLARE_int(name)
diff --git a/c10/util/Logging.cpp b/c10/util/Logging.cpp
index fe74e4954864..40b85f8470f0 100644
--- a/c10/util/Logging.cpp
+++ b/c10/util/Logging.cpp
@@ -32,7 +32,7 @@ std::function<string(void)>* GetFetchStackTrace() {
 } // namespace
 
 void SetStackTraceFetcher(std::function<string(void)> fetcher) {
-  *GetFetchStackTrace() = fetcher;
+  *GetFetchStackTrace() = std::move(fetcher);
 }
 
 void ThrowEnforceNotMet(
@@ -113,13 +113,13 @@ DDPUsageLoggerType* GetDDPUsageLogger() {
 
 void SetAPIUsageLogger(std::function<void(const std::string&)> logger) {
   TORCH_CHECK(logger);
-  *GetAPIUsageLogger() = logger;
+  *GetAPIUsageLogger() = std::move(logger);
 }
 
 void SetPyTorchDDPUsageLogger(
     std::function<void(const DDPLoggingData&)> logger) {
   TORCH_CHECK(logger);
-  *GetDDPUsageLogger() = logger;
+  *GetDDPUsageLogger() = std::move(logger);
 }
 
 void LogAPIUsage(const std::string& event) try {
diff --git a/c10/util/Logging.h b/c10/util/Logging.h
index b25d7841e3f4..0f5c70f268d7 100644
--- a/c10/util/Logging.h
+++ b/c10/util/Logging.h
@@ -2,6 +2,7 @@
 #define C10_UTIL_LOGGING_H_
 
 #include <climits>
+#include <cstring>
 #include <exception>
 #include <functional>
 #include <limits>
diff --git a/c10/util/Optional.h b/c10/util/Optional.h
index fc50af2b0fa8..44f28b206921 100644
--- a/c10/util/Optional.h
+++ b/c10/util/Optional.h
@@ -501,13 +501,8 @@ class arrayref_optional_base {
       : storage_(v) {}
 
   constexpr bool initialized() const noexcept {
-    typename storage::raw repr;
-    // Cast to void* to suppress GCC's -Wclass-memaccess.
-    memcpy(
-        static_cast<void*>(&repr),
-        static_cast<const void*>(&storage_),
-        sizeof(storage_));
-    return repr.p != nullptr || repr.sz == 0;
+    return storage_.uninitialized_.p != nullptr ||
+        storage_.uninitialized_.sz == 0;
   }
 
   void setInitialized(bool init) noexcept {
diff --git a/c10/util/Registry.h b/c10/util/Registry.h
index d75e2b9590c9..29daa6a02353 100644
--- a/c10/util/Registry.h
+++ b/c10/util/Registry.h
@@ -207,11 +207,18 @@ class Registerer {
 // dllexport are mixed, but the warning is fine and linker will be properly
 // exporting the symbol. Same thing happens in the gflags flag declaration and
 // definition caes.
-#define C10_DECLARE_TYPED_REGISTRY(                                        \
-    RegistryName, SrcType, ObjectType, PtrType, ...)                       \
-  C10_IMPORT ::c10::Registry<SrcType, PtrType<ObjectType>, ##__VA_ARGS__>* \
-  RegistryName();                                                          \
-  typedef ::c10::Registerer<SrcType, PtrType<ObjectType>, ##__VA_ARGS__>   \
+#define C10_DECLARE_TYPED_REGISTRY(                                      \
+    RegistryName, SrcType, ObjectType, PtrType, ...)                     \
+  C10_API ::c10::Registry<SrcType, PtrType<ObjectType>, ##__VA_ARGS__>*  \
+  RegistryName();                                                        \
+  typedef ::c10::Registerer<SrcType, PtrType<ObjectType>, ##__VA_ARGS__> \
+      Registerer##RegistryName
+
+#define TORCH_DECLARE_TYPED_REGISTRY(                                     \
+    RegistryName, SrcType, ObjectType, PtrType, ...)                      \
+  TORCH_API ::c10::Registry<SrcType, PtrType<ObjectType>, ##__VA_ARGS__>* \
+  RegistryName();                                                         \
+  typedef ::c10::Registerer<SrcType, PtrType<ObjectType>, ##__VA_ARGS__>  \
       Registerer##RegistryName
 
 #define C10_DEFINE_TYPED_REGISTRY(                                         \
@@ -268,6 +275,10 @@ class Registerer {
   C10_DECLARE_TYPED_REGISTRY(                               \
       RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__)
 
+#define TORCH_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \
+  TORCH_DECLARE_TYPED_REGISTRY(                               \
+      RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__)
+
 #define C10_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \
   C10_DEFINE_TYPED_REGISTRY(                               \
       RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__)
@@ -280,6 +291,10 @@ class Registerer {
   C10_DECLARE_TYPED_REGISTRY(                                      \
       RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__)
 
+#define TORCH_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \
+  TORCH_DECLARE_TYPED_REGISTRY(                                      \
+      RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__)
+
 #define C10_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \
   C10_DEFINE_TYPED_REGISTRY(                                      \
       RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__)
diff --git a/c10/util/StringUtil.cpp b/c10/util/StringUtil.cpp
index 4cf5755227b5..8a65b6b7951f 100644
--- a/c10/util/StringUtil.cpp
+++ b/c10/util/StringUtil.cpp
@@ -1,4 +1,3 @@
-#include <c10/util/Exception.h>
 #include <c10/util/StringUtil.h>
 
 #include <cstring>
@@ -47,7 +46,7 @@ size_t ReplaceAll(std::string& s, c10::string_view from, c10::string_view to) {
   if (from.size() >= to.size()) {
     // If the replacement string is not larger than the original, we
     // can do the replacement in-place without allocating new storage.
-    char* s_data = s.data();
+    char* s_data = &s[0];
 
     while ((cur_pos = s.find(from.data(), last_pos, from.size())) !=
            std::string::npos) {
diff --git a/c10/util/ThreadLocalDebugInfo.cpp b/c10/util/ThreadLocalDebugInfo.cpp
index e79ee00d1a61..5aea3f946bbd 100644
--- a/c10/util/ThreadLocalDebugInfo.cpp
+++ b/c10/util/ThreadLocalDebugInfo.cpp
@@ -1,3 +1,4 @@
+#include <c10/util/Exception.h>
 #include <c10/util/ThreadLocal.h>
 #include <c10/util/ThreadLocalDebugInfo.h>
 
@@ -27,8 +28,8 @@ std::shared_ptr<ThreadLocalDebugInfo> ThreadLocalDebugInfo::current() {
 
 /* static */
 void ThreadLocalDebugInfo::_forceCurrentDebugInfo(
-    const std::shared_ptr<ThreadLocalDebugInfo>& info) {
-  debug_info = info;
+    std::shared_ptr<ThreadLocalDebugInfo> info) {
+  debug_info = std::move(info);
 }
 
 /* static */
@@ -39,7 +40,7 @@ void ThreadLocalDebugInfo::_push(
   debug_info = std::make_shared<ThreadLocalDebugInfo>();
   debug_info->parent_info_ = prev_info;
   debug_info->kind_ = kind;
-  debug_info->info_ = info;
+  debug_info->info_ = std::move(info);
 }
 
 /* static */
@@ -86,8 +87,8 @@ DebugInfoGuard::DebugInfoGuard(std::shared_ptr<ThreadLocalDebugInfo> info) {
   if (!info) {
     return;
   }
-  prev_info_ = debug_info;
-  debug_info = info;
+  prev_info_ = std::move(debug_info);
+  debug_info = std::move(info);
   active_ = true;
 }
 
diff --git a/c10/util/ThreadLocalDebugInfo.h b/c10/util/ThreadLocalDebugInfo.h
index 9d58695209d4..8820d35ac47b 100644
--- a/c10/util/ThreadLocalDebugInfo.h
+++ b/c10/util/ThreadLocalDebugInfo.h
@@ -1,11 +1,9 @@
 #pragma once
 
 #include <c10/macros/Export.h>
-#include <c10/util/Exception.h>
 
 #include <memory>
 #include <string>
-#include <unordered_map>
 
 namespace c10 {
 
@@ -41,7 +39,7 @@ class C10_API ThreadLocalDebugInfo {
 
   // Internal, use DebugInfoGuard/ThreadLocalStateGuard
   static void _forceCurrentDebugInfo(
-      const std::shared_ptr<ThreadLocalDebugInfo>& info);
+      std::shared_ptr<ThreadLocalDebugInfo> info);
 
   // Push debug info struct of a given kind
   static void _push(DebugInfoKind kind, std::shared_ptr<DebugInfoBase> info);
diff --git a/c10/util/Type_demangle.cpp b/c10/util/Type_demangle.cpp
index 8b2e626aba32..435e7cf11d55 100644
--- a/c10/util/Type_demangle.cpp
+++ b/c10/util/Type_demangle.cpp
@@ -24,8 +24,7 @@ std::string demangle(const char* name) {
       abi::__cxa_demangle(
           name,
           /*__output_buffer=*/nullptr,
-          // NOLINTNEXTLINE(modernize-use-nullptr)
-          /*__length=*/0,
+          /*__length=*/nullptr,
           &status),
       /*deleter=*/free);
 
diff --git a/c10/util/UniqueVoidPtr.h b/c10/util/UniqueVoidPtr.h
index 7d9e422f3c67..bd449969fc5c 100644
--- a/c10/util/UniqueVoidPtr.h
+++ b/c10/util/UniqueVoidPtr.h
@@ -10,7 +10,7 @@ using DeleterFnPtr = void (*)(void*);
 namespace detail {
 
 // Does not delete anything
-TORCH_API void deleteNothing(void*);
+C10_API void deleteNothing(void*);
 
 // A detail::UniqueVoidPtr is an owning smart pointer like unique_ptr, but
 // with three major differences:
diff --git a/c10/util/build.bzl b/c10/util/build.bzl
index 8d79a557477f..f7cbcc4be508 100644
--- a/c10/util/build.bzl
+++ b/c10/util/build.bzl
@@ -9,7 +9,7 @@ def define_targets(rules):
         deps = [
             ":base",
             "//c10/core:ScalarType",
-            "//c10/macros",
+            "//c10/macros:macros",
         ],
     )
 
@@ -37,7 +37,7 @@ def define_targets(rules):
         visibility = ["//visibility:public"],
         deps = [
             "@fmt",
-            "//c10/macros",
+            "//c10/macros:macros",
         ] + rules.select({
             "//c10:using_gflags": ["@com_github_gflags_gflags//:gflags"],
             "//conditions:default": [],
@@ -57,7 +57,7 @@ def define_targets(rules):
         deps = [
             ":base",
             "//c10/core:ScalarType",
-            "//c10/macros",
+            "//c10/macros:macros",
         ],
     )
 
diff --git a/c10/util/complex.h b/c10/util/complex.h
index 5045df5a4208..3658b6ba6fa3 100644
--- a/c10/util/complex.h
+++ b/c10/util/complex.h
@@ -247,13 +247,38 @@ struct alignas(sizeof(T) * 2) complex {
   constexpr FORCE_INLINE_APPLE complex<T>& operator/=(const complex<U>& rhs)
       __ubsan_ignore_float_divide_by_zero__ {
     // (a + bi) / (c + di) = (ac + bd)/(c^2 + d^2) + (bc - ad)/(c^2 + d^2) i
-    T a = real_;
-    T b = imag_;
-    U c = rhs.real();
-    U d = rhs.imag();
-    auto denominator = c * c + d * d;
-    real_ = (a * c + b * d) / denominator;
-    imag_ = (b * c - a * d) / denominator;
+    // the calculation below follows numpy's complex division
+    T ar = real_;
+    T ai = imag_;
+    U br = rhs.real();
+    U bi = rhs.imag();
+
+#if defined(__GNUC__) && !defined(__clang__)
+    // std::abs is already constexpr by gcc
+    auto abs_br = std::abs(br);
+    auto abs_bi = std::abs(bi);
+#else
+    auto abs_br = br < 0 ? -br : br;
+    auto abs_bi = bi < 0 ? -bi : bi;
+#endif
+
+    if (abs_br >= abs_bi) {
+      if (abs_br == 0 && abs_bi == 0) {
+        /* divide by zeros should yield a complex inf or nan */
+        real_ = ar / abs_br;
+        imag_ = ai / abs_bi;
+      } else {
+        auto rat = bi / br;
+        auto scl = 1.0 / (br + bi * rat);
+        real_ = (ar + ai * rat) * scl;
+        imag_ = (ai - ar * rat) * scl;
+      }
+    } else {
+      auto rat = br / bi;
+      auto scl = 1.0 / (bi + br * rat);
+      real_ = (ar * rat + ai) * scl;
+      imag_ = (ai * rat - ar) * scl;
+    }
     return *this;
   }
 #undef FORCE_INLINE_APPLE
diff --git a/c10/util/complex_math.h b/c10/util/complex_math.h
index f627eb6cfa45..84073099eddf 100644
--- a/c10/util/complex_math.h
+++ b/c10/util/complex_math.h
@@ -51,10 +51,10 @@ C10_HOST_DEVICE inline c10::complex<T> log2(const c10::complex<T>& x) {
 #if defined(_LIBCPP_VERSION) || \
     (defined(__GLIBCXX__) && !defined(_GLIBCXX11_USE_C99_COMPLEX))
 namespace _detail {
-TORCH_API c10::complex<float> sqrt(const c10::complex<float>& in);
-TORCH_API c10::complex<double> sqrt(const c10::complex<double>& in);
-TORCH_API c10::complex<float> acos(const c10::complex<float>& in);
-TORCH_API c10::complex<double> acos(const c10::complex<double>& in);
+C10_API c10::complex<float> sqrt(const c10::complex<float>& in);
+C10_API c10::complex<double> sqrt(const c10::complex<double>& in);
+C10_API c10::complex<float> acos(const c10::complex<float>& in);
+C10_API c10::complex<double> acos(const c10::complex<double>& in);
 }; // namespace _detail
 #endif
 
diff --git a/c10/util/complex_utils.h b/c10/util/complex_utils.h
index a28f0bd487fe..1ca105f1d0af 100644
--- a/c10/util/complex_utils.h
+++ b/c10/util/complex_utils.h
@@ -38,4 +38,9 @@ namespace std {
 template <typename T>
 class numeric_limits<c10::complex<T>> : public numeric_limits<T> {};
 
+template <typename T>
+bool isnan(const c10::complex<T>& v) {
+  return std::isnan(v.real()) || std::isnan(v.imag());
+}
+
 } // namespace std
diff --git a/c10/util/flags_use_no_gflags.cpp b/c10/util/flags_use_no_gflags.cpp
index ecd1fd2c95fd..078d21f468f3 100644
--- a/c10/util/flags_use_no_gflags.cpp
+++ b/c10/util/flags_use_no_gflags.cpp
@@ -148,7 +148,7 @@ C10_EXPORT bool C10FlagParser::Parse<int64_t>(
     const string& content,
     int64_t* value) {
   try {
-    static_assert(sizeof(long long) == sizeof(int64_t), "");
+    static_assert(sizeof(long long) == sizeof(int64_t));
 #ifdef __ANDROID__
     // Android does not have std::atoll.
     *value = atoll(content.c_str());
diff --git a/c10/util/flat_hash_map.h b/c10/util/flat_hash_map.h
index ccaf6e1bf34f..b89d6ed4f547 100644
--- a/c10/util/flat_hash_map.h
+++ b/c10/util/flat_hash_map.h
@@ -138,10 +138,10 @@ struct KeyOrValueEquality : functor_storage<bool, key_equal> {
 static constexpr int8_t min_lookups = 4;
 template <typename T>
 struct sherwood_v3_entry {
-  sherwood_v3_entry() {}
+  sherwood_v3_entry() = default;
   sherwood_v3_entry(int8_t distance_from_desired)
       : distance_from_desired(distance_from_desired) {}
-  ~sherwood_v3_entry() {}
+  ~sherwood_v3_entry() = default;
 
   bool has_value() const {
     return distance_from_desired >= 0;
@@ -234,12 +234,14 @@ template <
     typename T,
     typename FindKey,
     typename ArgumentHash,
-    typename Hasher,
+    typename DetailHasher,
     typename ArgumentEqual,
     typename Equal,
     typename ArgumentAlloc,
     typename EntryAlloc>
-class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
+class sherwood_v3_table : private EntryAlloc,
+                          private DetailHasher,
+                          private Equal {
   using Entry = detailv3::sherwood_v3_entry<T>;
   using AllocatorTraits = std::allocator_traits<EntryAlloc>;
   using EntryPointer = typename AllocatorTraits::pointer;
@@ -264,7 +266,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
       const ArgumentHash& hash = ArgumentHash(),
       const ArgumentEqual& equal = ArgumentEqual(),
       const ArgumentAlloc& alloc = ArgumentAlloc())
-      : EntryAlloc(alloc), Hasher(hash), Equal(equal) {
+      : EntryAlloc(alloc), DetailHasher(hash), Equal(equal) {
     rehash(bucket_count);
   }
   sherwood_v3_table(size_type bucket_count, const ArgumentAlloc& alloc)
@@ -351,7 +353,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
                 other.get_allocator())) {}
   sherwood_v3_table(const sherwood_v3_table& other, const ArgumentAlloc& alloc)
       : EntryAlloc(alloc),
-        Hasher(other),
+        DetailHasher(other),
         Equal(other),
         _max_load_factor(other._max_load_factor) {
     rehash_for_other_container(other);
@@ -365,14 +367,16 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
   }
   sherwood_v3_table(sherwood_v3_table&& other) noexcept
       : EntryAlloc(std::move(other)),
-        Hasher(std::move(other)),
+        DetailHasher(std::move(other)),
         Equal(std::move(other)) {
     swap_pointers(other);
   }
   sherwood_v3_table(
       sherwood_v3_table&& other,
       const ArgumentAlloc& alloc) noexcept
-      : EntryAlloc(alloc), Hasher(std::move(other)), Equal(std::move(other)) {
+      : EntryAlloc(alloc),
+        DetailHasher(std::move(other)),
+        Equal(std::move(other)) {
     swap_pointers(other);
   }
   sherwood_v3_table& operator=(const sherwood_v3_table& other) {
@@ -391,7 +395,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
           *this, other);
     }
     _max_load_factor = other._max_load_factor;
-    static_cast<Hasher&>(*this) = other;
+    static_cast<DetailHasher&>(*this) = other;
     static_cast<Equal&>(*this) = other;
     rehash_for_other_container(other);
     insert(other.begin(), other.end());
@@ -419,7 +423,7 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
         emplace(std::move(elem));
       other.clear();
     }
-    static_cast<Hasher&>(*this) = std::move(other);
+    static_cast<DetailHasher&>(*this) = std::move(other);
     static_cast<Equal&>(*this) = std::move(other);
     return *this;
   }
@@ -870,11 +874,11 @@ class sherwood_v3_table : private EntryAlloc, private Hasher, private Equal {
 
   template <typename U>
   uint64_t hash_object(const U& key) {
-    return static_cast<Hasher&>(*this)(key);
+    return static_cast<DetailHasher&>(*this)(key);
   }
   template <typename U>
   uint64_t hash_object(const U& key) const {
-    return static_cast<const Hasher&>(*this)(key);
+    return static_cast<const DetailHasher&>(*this)(key);
   }
   template <typename L, typename R>
   bool compares_equal(const L& lhs, const R& rhs) {
diff --git a/c10/util/int128.cpp b/c10/util/int128.cpp
index 329452d9c2e7..0486f1c7bd9b 100644
--- a/c10/util/int128.cpp
+++ b/c10/util/int128.cpp
@@ -35,7 +35,6 @@
 #include <c10/util/int128.h>
 #include <iomanip>
 #include <ostream> // NOLINT(readability/streams)
-#include <sstream>
 
 namespace c10 {
 
diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h
index e75c1980fdfa..6eb149e2b7c1 100644
--- a/c10/util/intrusive_ptr.h
+++ b/c10/util/intrusive_ptr.h
@@ -470,6 +470,10 @@ class intrusive_ptr final {
    * passed in *must* have been created using intrusive_ptr::release().
    */
   static intrusive_ptr reclaim(TTarget* owning_ptr) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        owning_ptr == NullType::singleton() ||
+            owning_ptr->refcount_.load() == 0 || owning_ptr->weakcount_.load(),
+        "TTarget violates the invariant that refcount > 0  =>  weakcount > 0");
     return intrusive_ptr(owning_ptr, raw::DontIncreaseRefcount{});
   }
 
diff --git a/c10/util/numa.h b/c10/util/numa.h
index aa5ae5233242..30c3ad5356ea 100644
--- a/c10/util/numa.h
+++ b/c10/util/numa.h
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <c10/util/Logging.h>
-#include <c10/util/Optional.h>
 
 C10_DECLARE_bool(caffe2_cpu_numa_enabled);
 
diff --git a/c10/util/reverse_iterator.h b/c10/util/reverse_iterator.h
index 70dbe5a8bee2..16d6db3fc477 100644
--- a/c10/util/reverse_iterator.h
+++ b/c10/util/reverse_iterator.h
@@ -61,13 +61,7 @@
 namespace c10 {
 
 template <typename _Iterator>
-class reverse_iterator
-    : public std::iterator<
-          typename std::iterator_traits<_Iterator>::iterator_category,
-          typename std::iterator_traits<_Iterator>::value_type,
-          typename std::iterator_traits<_Iterator>::difference_type,
-          typename std::iterator_traits<_Iterator>::pointer,
-          typename std::iterator_traits<_Iterator>::reference> {
+class reverse_iterator {
  protected:
   _Iterator current;
 
@@ -75,9 +69,11 @@ class reverse_iterator
 
  public:
   using iterator_type = _Iterator;
+  using value_type = typename __traits_type::value_type;
   using difference_type = typename __traits_type::difference_type;
   using pointer = typename __traits_type::pointer;
   using reference = typename __traits_type::reference;
+  using iterator_category = typename __traits_type::iterator_category;
 
   constexpr reverse_iterator() : current() {}
 
diff --git a/c10/util/signal_handler.cpp b/c10/util/signal_handler.cpp
index ab40b594a0b0..b60314d26c66 100644
--- a/c10/util/signal_handler.cpp
+++ b/c10/util/signal_handler.cpp
@@ -16,7 +16,6 @@
 #include <cstdio>
 #include <cstdlib>
 #include <iostream>
-#include <unordered_set>
 
 #ifdef C10_ANDROID
 #ifndef SYS_gettid
diff --git a/c10/util/signal_handler.h b/c10/util/signal_handler.h
index 2dafaf468354..70295874844b 100644
--- a/c10/util/signal_handler.h
+++ b/c10/util/signal_handler.h
@@ -20,7 +20,7 @@
 
 namespace c10 {
 
-class TORCH_API SignalHandler {
+class C10_API SignalHandler {
  public:
   enum class Action { NONE, STOP };
 
@@ -40,13 +40,13 @@ class TORCH_API SignalHandler {
 };
 
 #if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
-class TORCH_API FatalSignalHandler {
+class C10_API FatalSignalHandler {
   // This works by setting up certain fatal signal handlers. Previous fatal
   // signal handlers will still be called when the signal is raised. Defaults
   // to being off.
  public:
-  TORCH_API void setPrintStackTracesOnFatalSignal(bool print);
-  TORCH_API bool printStackTracesOnFatalSignal();
+  C10_API void setPrintStackTracesOnFatalSignal(bool print);
+  C10_API bool printStackTracesOnFatalSignal();
   static FatalSignalHandler& getInstance();
   virtual ~FatalSignalHandler();
 
diff --git a/c10/util/typeid.cpp b/c10/util/typeid.cpp
index cf161d2ed956..53c107a930c3 100644
--- a/c10/util/typeid.cpp
+++ b/c10/util/typeid.cpp
@@ -4,15 +4,9 @@
 #include <algorithm>
 #include <atomic>
 
-#if !defined(_MSC_VER)
-#include <cxxabi.h>
-#endif
-
-using std::string;
-
 namespace caffe2 {
 namespace detail {
-C10_EXPORT void _ThrowRuntimeTypeLogicError(const string& msg) {
+C10_EXPORT void _ThrowRuntimeTypeLogicError(const std::string& msg) {
   // In earlier versions it used to be std::abort() but it's a bit hard-core
   // for a library
   TORCH_CHECK(false, msg);
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index c0585b9f05ae..a3dff5696707 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -6,11 +6,6 @@ if(USE_VULKAN)
   include(../cmake/VulkanCodegen.cmake)
 endif()
 
-# ---[ MSVC OpenMP modification
-if(MSVC)
-  include(../cmake/public/utils.cmake)
-endif()
-
 # Debug messages - if you want to get a list of source files and examine
 # target information, enable the following by -DPRINT_CMAKE_DEBUG_INFO=ON.
 set(PRINT_CMAKE_DEBUG_INFO FALSE CACHE BOOL "print cmake debug information")
@@ -105,6 +100,7 @@ if(INTERN_BUILD_ATEN_OPS)
   list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS})
   list(APPEND Caffe2_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS})
   list(APPEND Caffe2_DEPENDENCY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE})
+  set(Caffe2_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
 endif()
 
 # ---[ Caffe2 build
@@ -657,6 +653,7 @@ if(USE_CUDA)
     PROPERTIES COMPILE_DEFINITIONS "NVRTC_SHORTHASH=${CUDA_NVRTC_SHORTHASH}"
   )
   set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/passes/frozen_conv_add_relu_fusion.cpp PROPERTIES COMPILE_FLAGS "-DUSE_CUDA=1")
+  set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/interface.cpp PROPERTIES COMPILE_FLAGS "-DUSE_CUDA=1")
 endif()
 
 if(BUILD_ONEDNN_GRAPH)
@@ -955,18 +952,18 @@ elseif(USE_CUDA)
     )
     if($ENV{ATEN_STATIC_CUDA})
       target_link_libraries(torch_cuda_linalg PRIVATE
-          ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusolver_static.a
-          ${CUDA_TOOLKIT_ROOT_DIR}/lib64/liblapack_static.a     # needed for libcusolver_static
+          CUDA::cusolver_static
+          ${CUDAToolkit_LIBRARY_DIR}/liblapack_static.a     # needed for libcusolver_static
       )
     else()
       target_link_libraries(torch_cuda_linalg PRIVATE
-          ${CUDA_cusolver_LIBRARY}
+          CUDA::cusolver
       )
     endif()
     # NS: TODO, is this really necessary?
     if(USE_MAGMA AND CAFFE2_STATIC_LINK_CUDA)
       target_link_libraries(torch_cuda_linalg PRIVATE
-          "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" dl)
+          CUDA::culibos ${CMAKE_DL_LIBS})
     endif()
     set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp PROPERTIES COMPILE_FLAGS "-DBUILD_LAZY_CUDA_LINALG")
     install(TARGETS torch_cuda_linalg DESTINATION "${TORCH_INSTALL_LIB_DIR}")
@@ -978,10 +975,6 @@ elseif(USE_CUDA)
   endif()
 endif()
 
-if(USE_CUDA OR USE_ROCM)
-  include(${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/nvfuser.cmake)
-endif()
-
 if(NOT MSVC AND USE_XNNPACK)
   TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)
 endif()
@@ -1165,31 +1158,8 @@ if(BUILD_TEST)
   endif()
 endif()
 
-# XXX This ABI check cannot be run with arm-linux-androideabi-g++
 if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
-  if(DEFINED GLIBCXX_USE_CXX11_ABI)
-    message(STATUS "_GLIBCXX_USE_CXX11_ABI is already defined as a cmake variable")
-  else()
-    message(STATUS "${CMAKE_CXX_COMPILER} ${TORCH_SRC_DIR}/abi-check.cpp -o ${CMAKE_BINARY_DIR}/abi-check")
-    execute_process(
-      COMMAND
-      "${CMAKE_CXX_COMPILER}"
-      "${TORCH_SRC_DIR}/abi-check.cpp"
-      "-o"
-      "${CMAKE_BINARY_DIR}/abi-check"
-      RESULT_VARIABLE ABI_CHECK_COMPILE_RESULT)
-    if(ABI_CHECK_COMPILE_RESULT)
-      message(FATAL_ERROR "Could not compile ABI Check: ${ABI_CHECK_COMPILE_RESULT}")
-    endif()
-    execute_process(
-      COMMAND "${CMAKE_BINARY_DIR}/abi-check"
-      RESULT_VARIABLE ABI_CHECK_RESULT
-      OUTPUT_VARIABLE GLIBCXX_USE_CXX11_ABI)
-    if(ABI_CHECK_RESULT)
-      message(WARNING "Could not run ABI Check: ${ABI_CHECK_RESULT}")
-    endif()
-  endif()
-  message(STATUS "Determined _GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}")
+  include(../cmake/CheckAbi.cmake)
 endif()
 
 # CMake config for external projects.
@@ -1222,29 +1192,6 @@ if(NOT NO_API)
     $<BUILD_INTERFACE:${TORCH_SRC_DIR}/csrc/api/include>)
 endif()
 
-
-if(USE_OPENMP)
-  find_package(OpenMP QUIET)
-endif()
-if(USE_OPENMP AND OPENMP_FOUND)
-  if(MSVC AND OpenMP_CXX_LIBRARIES MATCHES "libiomp5md\\.lib")
-    set(AT_MKL_MT 1)
-  else()
-    set(AT_MKL_MT 0)
-  endif()
-  message(STATUS "pytorch is compiling with OpenMP. \n"
-    "OpenMP CXX_FLAGS: ${OpenMP_CXX_FLAGS}. \n"
-    "OpenMP libraries: ${OpenMP_CXX_LIBRARIES}.")
-  if(UNIX)
-    separate_arguments(OpenMP_CXX_OPTIONS UNIX_COMMAND "${OpenMP_CXX_FLAGS}")
-  else()
-    separate_arguments(OpenMP_CXX_OPTIONS WINDOWS_COMMAND "${OpenMP_CXX_FLAGS}")
-  endif()
-  target_compile_options(torch_cpu PRIVATE ${OpenMP_CXX_OPTIONS})
-  target_link_libraries(torch_cpu PRIVATE ${OpenMP_CXX_LIBRARIES})
-endif()
-
-
 if(USE_ROCM)
   target_compile_definitions(torch_hip PRIVATE
     USE_ROCM
@@ -1335,13 +1282,6 @@ if(NOT INTERN_BUILD_MOBILE)
   endif()
 endif()
 
-if(USE_OPENMP AND OPENMP_FOUND)
-  message(STATUS "Caffe2 is compiling with OpenMP. \n"
-    "OpenMP CXX_FLAGS: ${OpenMP_CXX_FLAGS}. \n"
-    "OpenMP libraries: ${OpenMP_CXX_LIBRARIES}.")
-  target_link_libraries(torch_cpu PRIVATE ${OpenMP_CXX_LIBRARIES})
-endif()
-
 if($ENV{TH_BINARY_BUILD})
   if(NOT MSVC AND USE_CUDA AND NOT APPLE)
     # Note [Extra MKL symbols for MAGMA in torch_cpu]
@@ -1378,19 +1318,12 @@ target_link_libraries(torch_cpu PRIVATE ${Caffe2_DEPENDENCY_WHOLE_LINK_LIBS})
 target_include_directories(torch_cpu INTERFACE $<INSTALL_INTERFACE:include>)
 target_include_directories(torch_cpu PRIVATE ${Caffe2_CPU_INCLUDE})
 target_include_directories(torch_cpu SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}")
-# Set standard properties on the target
-torch_set_target_props(torch_cpu)
 
-
-target_compile_options(torch_cpu PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")
+target_compile_definitions(torch_cpu PRIVATE CAFFE2_BUILD_MAIN_LIB)
 if(USE_CUDA)
-  target_compile_options(torch_cuda PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
-  # NB: This must be target_compile_definitions, not target_compile_options,
-  # as the latter is not respected by nvcc
-  target_compile_definitions(torch_cuda PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
+  target_compile_definitions(torch_cuda PRIVATE TORCH_CUDA_BUILD_MAIN_LIB)
 elseif(USE_ROCM)
-  target_compile_options(torch_hip PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
-  target_compile_definitions(torch_hip PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
+  target_compile_definitions(torch_hip PRIVATE TORCH_HIP_BUILD_MAIN_LIB)
 endif()
 
 if(USE_EXPERIMENTAL_CUDNN_V8_API)
@@ -1539,10 +1472,6 @@ if(USE_CUDA)
       torch_cuda PRIVATE ${Caffe2_GPU_INCLUDE})
   target_link_libraries(
       torch_cuda PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS})
-  if(USE_CUDNN)
-    target_link_libraries(
-        torch_cuda PRIVATE  caffe2::cudnn-private)
-  endif()
 
   # These public dependencies must go after the previous dependencies, as the
   # order of the libraries in the linker call matters here when statically
@@ -1714,11 +1643,6 @@ if(BUILD_TEST)
     get_filename_component(test_name ${test_src} NAME_WE)
     add_executable(${test_name} "${test_src}")
     target_link_libraries(${test_name} torch_library gtest_main)
-    if(USE_OPENMP)
-      # -fopenmp is a compile time flag and as result not guaranteed
-      # to link executable against OpenMP runtime library
-      target_link_libraries(${test_name} ${OpenMP_CXX_LIBRARIES})
-    endif()
     target_include_directories(${test_name} PRIVATE $<INSTALL_INTERFACE:include>)
     target_include_directories(${test_name} PRIVATE $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include>)
     target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE})
@@ -1814,6 +1738,20 @@ if(BUILD_TEST)
   endif()
 endif()
 
+if(MSVC)
+  # This is used to enable the conforming lambda processor in MSVC
+  # Which allows us to capture constexpr in lambdas
+  # Note that this will be turned on by default for std=c++20 and above
+  # This should be applied globally when https://github.com/pytorch/pytorch/issues/92600 is fixed
+  foreach(tmp ${MEM_EFF_ATTENTION_CUDA_SOURCES})
+    # MEM_EFF_ATTENTION_CUDA is populated in pytorch/aten/src/ATen/CMakeLists.txt
+    # We iterate over these files, updating paths and adding the compile flag
+    FILE(RELATIVE_PATH tmp_path "${PROJECT_SOURCE_DIR}" "${tmp}")
+    SET(tmp_path "../${tmp_path}")
+    set_source_files_properties(${tmp_path} PROPERTIES COMPILE_FLAGS "-Xcompiler /Zc:lambda")
+  endforeach()
+endif()
+
 # Note: we only install the caffe2 python files if BUILD_CAFFE2_OPS is ON
 # This is because the build rules here written in such a way that they always
 # appear to need to be re-run generating >600 pieces of work during the pytorch
@@ -1844,13 +1782,8 @@ if(BUILD_PYTHON)
   pycmd(PY_EXT_SUFFIX "
       def get_ext_suffix():
           import sys
-          if sys.version_info < (3, 8) and sys.platform == 'win32':
-              # Workaround for https://bugs.python.org/issue39825
-              import _imp
-              return _imp.extension_suffixes()[0]
-          else:
-              import sysconfig
-              return sysconfig.get_config_var('EXT_SUFFIX')
+          import sysconfig
+          return sysconfig.get_config_var('EXT_SUFFIX')
 
       suffix = get_ext_suffix()
       if suffix is not None:
@@ -1905,7 +1838,6 @@ if(BUILD_PYTHON)
   if(NOT MSVC)
     set_target_properties(caffe2_pybind11_state PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
   endif()
-  torch_set_target_props(caffe2_pybind11_state)
   set_target_properties(caffe2_pybind11_state PROPERTIES PREFIX "" DEBUG_POSTFIX "")
   set_target_properties(caffe2_pybind11_state PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
   set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "${_caffe2_pybind11_state_linker_flags}")
@@ -1941,7 +1873,6 @@ if(BUILD_PYTHON)
     if(NOT MSVC)
       set_target_properties(caffe2_pybind11_state_gpu PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
     endif()
-    torch_set_target_props(caffe2_pybind11_state_gpu)
     set_target_properties(caffe2_pybind11_state_gpu PROPERTIES PREFIX "" DEBUG_POSTFIX "")
     set_target_properties(caffe2_pybind11_state_gpu PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
     set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "${_caffe2_pybind11_state_linker_flags}")
@@ -1973,7 +1904,6 @@ if(BUILD_PYTHON)
     if(NOT MSVC)
       target_compile_options(caffe2_pybind11_state_hip PRIVATE ${HIP_CXX_FLAGS} -fvisibility=hidden)
     endif()
-    torch_set_target_props(caffe2_pybind11_state_hip)
     set_target_properties(caffe2_pybind11_state_hip PROPERTIES PREFIX "")
     set_target_properties(caffe2_pybind11_state_hip PROPERTIES SUFFIX ${PY_EXT_SUFFIX})
     set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINK_FLAGS "${_caffe2_pybind11_state_linker_flags}")
diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h
index b22b840c25ad..f3996186314e 100644
--- a/caffe2/contrib/aten/aten_op_template.h
+++ b/caffe2/contrib/aten/aten_op_template.h
@@ -1,7 +1,7 @@
 #pragma once
 #include <unordered_map>
 #include <string>
-#include <ATen/ATen.h>
+#include <ATen/Functions.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/irange.h>
 #include <caffe2/core/context.h>
diff --git a/caffe2/contrib/playground/AnyExp.py b/caffe2/contrib/playground/AnyExp.py
index b8e2f8b37b2a..386993d6f36e 100644
--- a/caffe2/contrib/playground/AnyExp.py
+++ b/caffe2/contrib/playground/AnyExp.py
@@ -76,7 +76,7 @@ def initialize_params_from_file(*args, **kwargs):
     return checkpoint.initialize_params_from_file(*args, **kwargs)
 
 
-class AnyExpTrainer(object):
+class AnyExpTrainer:
 
     def __init__(self, opts):
         import logging
diff --git a/caffe2/contrib/playground/meter.py b/caffe2/contrib/playground/meter.py
index ed0158bbf087..68897792d284 100644
--- a/caffe2/contrib/playground/meter.py
+++ b/caffe2/contrib/playground/meter.py
@@ -6,7 +6,7 @@
 from abc import abstractmethod
 
 
-class Meter(object):
+class Meter:
 
     @abstractmethod
     def __init__(self, **kwargs):
diff --git a/caffe2/contrib/prof/cuda_profile_ops.cc b/caffe2/contrib/prof/cuda_profile_ops.cc
index 8a281ecfede8..893d8e8415a0 100644
--- a/caffe2/contrib/prof/cuda_profile_ops.cc
+++ b/caffe2/contrib/prof/cuda_profile_ops.cc
@@ -57,8 +57,12 @@ class CudaProfileInitializeOp : public OperatorBase {
 
   bool Run(int /* unused */ /*stream_id*/ = 0) override {
     // If this fails, check the contents of "output" for hints.
+#if defined(CUDA_VERSION) && CUDA_VERSION < 12000
+    // cudaProfilerInitialize is no longer needed after CUDA 12:
+    // https://forums.developer.nvidia.com/t/cudaprofilerinitialize-is-deprecated-alternative/200776/3
     CUDA_CHECK(
         cudaProfilerInitialize(config_.c_str(), output_.c_str(), cudaCSV));
+#endif
     return true;
   }
 
diff --git a/caffe2/contrib/tensorboard/tensorboard.py b/caffe2/contrib/tensorboard/tensorboard.py
index 6f5ad1896e35..e086a74f879c 100644
--- a/caffe2/contrib/tensorboard/tensorboard.py
+++ b/caffe2/contrib/tensorboard/tensorboard.py
@@ -28,7 +28,7 @@
         # tensorflow<=0.12.1
         from tensorflow.train import SummaryWriter as FileWriter
 
-class Config(object):
+class Config:
     HEIGHT = 600
     ASPECT_RATIO = 1.6
 
diff --git a/caffe2/core/context_test.cc b/caffe2/core/context_test.cc
index 69517d85a993..304f973576c1 100644
--- a/caffe2/core/context_test.cc
+++ b/caffe2/core/context_test.cc
@@ -1,5 +1,6 @@
 #include <random>
 
+#include <c10/core/alignment.h>
 #include <gtest/gtest.h>
 #include "caffe2/core/context.h"
 #include "caffe2/proto/caffe2_pb.h"
diff --git a/caffe2/core/export_caffe2_op_to_c10.h b/caffe2/core/export_caffe2_op_to_c10.h
index 82da29a44f4b..216d3833648b 100644
--- a/caffe2/core/export_caffe2_op_to_c10.h
+++ b/caffe2/core/export_caffe2_op_to_c10.h
@@ -12,6 +12,7 @@
 #include <c10/util/irange.h>
 #include <torch/csrc/jit/frontend/function_schema_parser.h>
 #include <torch/library.h>
+#include <caffe2/core/tensor.h>
 #include <vector>
 
 namespace caffe2 {
@@ -20,19 +21,19 @@ namespace detail {
 constexpr const char* PREALLOCATED_OUTPUT_ARGNAME =
     "_caffe2_preallocated_outputs";
 
-using _CallCaffe2OpFunc = c10::List<at::Tensor>(
+using _CallCaffe2OpFunc = std::vector<caffe2::Tensor>(
     const c10::FunctionSchema& schema,
-    std::vector<c10::IValue>&& inputs,
-    c10::List<at::Tensor>&& outputs);
+    std::vector<c10::IValue> &&inputs,
+    std::vector<caffe2::Tensor> &&outputs);
 
 template <class Caffe2Operator>
-inline c10::List<at::Tensor> _call_caffe2_op(
+inline std::vector<caffe2::Tensor> _call_caffe2_op(
     const c10::FunctionSchema& schema,
-    std::vector<c10::IValue>&& inputs,
-    c10::List<at::Tensor>&& outputs) {
+    std::vector<c10::IValue> &&inputs,
+    std::vector<caffe2::Tensor> &&outputs) {
   Caffe2Operator op(schema, std::move(inputs), std::move(outputs), -1);
   op.Run(-1);
-  return std::move(op).move_newstyle_outputs();
+  return std::move(op).move_output_tensors();
 }
 
 // This function is inline in the hope that compilers optimizing for speed will
@@ -62,7 +63,6 @@ inline void _call_caffe2_op_from_c10(
           *OptionalType::create(ListType::ofTensors())));
   IValue preallocated_outputs = torch::jit::pop(*stack);
 
-  const size_t num_outputs = schema.returns().size();
   const size_t num_inputs = schema.arguments().size() -
       1; // -1 because the last argument is the list of preallocated tensors
 
@@ -71,7 +71,7 @@ inline void _call_caffe2_op_from_c10(
     // either the schema doesn't support preallocated outputs or it does but
     // they haven't been passed in. Pass a list of uninitialized tensors to
     // the caffe2 operator as preallocated outputs.
-    outputs.resize(num_outputs);
+    outputs.resize(schema.returns().size());
   } else {
     AT_ASSERT(preallocated_outputs.isTensorList());
     outputs = std::move(preallocated_outputs).toTensorList();
@@ -81,7 +81,15 @@ inline void _call_caffe2_op_from_c10(
   // instances in the cache.
   std::vector<IValue> inputs = torch::jit::pop(*stack, num_inputs);
 
-  outputs = (*call_op)(schema, std::move(inputs), std::move(outputs));
+  // Convert outputs to caffe2::Tensor
+  const size_t num_outputs = outputs.size();
+  std::vector<caffe2::Tensor> outputs_c2(num_outputs);
+  for (auto i : c10::irange(num_outputs)) {
+    outputs_c2[i] = caffe2::Tensor(outputs.extract(i));
+  }
+
+  outputs_c2 = (*call_op)(schema, std::move(inputs), std::move(outputs_c2));
+  TORCH_INTERNAL_ASSERT(num_outputs == outputs_c2.size());
 
   bool return_tensor_list = false;
   if (schema.returns().size() == 1) {
@@ -93,11 +101,13 @@ inline void _call_caffe2_op_from_c10(
     }
   }
   if (return_tensor_list) {
-    // We should not unwrap the list if we expect tensor list in the schema.
+    for (const auto i : c10::irange(num_outputs)) {
+      outputs.set(i, at::Tensor(std::move(outputs_c2[i])));
+    }
     torch::jit::push(*stack, outputs);
   } else {
-    for (const auto i : c10::irange(outputs.size())) {
-      torch::jit::push(*stack, outputs.extract(i));
+    for (const auto i : c10::irange(num_outputs)) {
+      torch::jit::push(*stack, at::Tensor(std::move(outputs_c2[i])));
     }
   }
 
diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc
index a16f2cb26846..a978cfd164ce 100644
--- a/caffe2/core/operator.cc
+++ b/caffe2/core/operator.cc
@@ -59,10 +59,6 @@ OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws)
       device_option_(
           operator_def.has_device_option() ? operator_def.device_option()
                                            : DeviceOption()),
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-      newstyle_outputs_(),
-#endif
       input_size_(operator_def.input_size()),
       event_(std::make_unique<Event>(device_option_)) {
   static GlobalInitIsCalledGuard guard;
@@ -124,14 +120,13 @@ compute_input_size_(const std::vector<c10::IValue>& inputs) {
 OperatorBase::OperatorBase(
     const c10::FunctionSchema& fn_schema,
     std::vector<c10::IValue> inputs,
-    c10::List<at::Tensor> outputs)
+    std::vector<caffe2::Tensor> outputs)
     // NOLINTNEXTLINE(performance-move-const-arg)
     : fn_schema_(make_unique<c10::FunctionSchema>(std::move(fn_schema))),
       newstyle_inputs_(std::move(inputs)),
-      newstyle_outputs_(std::move(outputs)),
+      output_tensors_(std::move(outputs)),
       input_size_(compute_input_size_(newstyle_inputs_)) {
   input_tensors_.resize(input_size_);
-  output_tensors_.resize(newstyle_outputs_.size());
 }
 #endif
 
diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h
index 4fd8619631a3..ff845e0343a9 100644
--- a/caffe2/core/operator.h
+++ b/caffe2/core/operator.h
@@ -74,7 +74,7 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
   explicit OperatorBase(
       const c10::FunctionSchema& schema,
       std::vector<c10::IValue> inputs,
-      c10::List<at::Tensor> outputs);
+      std::vector<caffe2::Tensor> outputs);
 #endif
 
   virtual ~OperatorBase() noexcept;
@@ -250,15 +250,12 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     }
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    at::Tensor output = newstyle_outputs_[idx];
-    if (!output.defined() || caffe2::Tensor(output).GetDeviceType() != type) {
+    auto &output = output_tensors_[idx];
+    if (!output.defined() || output.GetDeviceType() != type) {
       // Fix tensor type
-      Tensor tensor = Tensor(type);
-      output = at::Tensor(std::move(tensor.getIntrusivePtr()));
+      output = Tensor(type);
     }
-    output_tensors_[idx] = caffe2::Tensor(output);
-    newstyle_outputs_[idx] = std::move(output);
-    return &output_tensors_[idx];
+    return &output;
 #else
     CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
 #endif
@@ -280,9 +277,6 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     if (!isLegacyOperator()) {
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-      newstyle_outputs_[idx] = at::Tensor(tensor);
-
-      // also update the tensor in the hack
       output_tensors_[idx] = std::move(tensor);
 #else
       CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
@@ -310,16 +304,12 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     }
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    at::Tensor output = newstyle_outputs_[idx];
-    Tensor tensor = output.defined()
-        ? GetSizedTensorWithOptions(caffe2::Tensor(output), dims, options)
+    auto &output = output_tensors_[idx];
+    output = output.defined()
+        ? GetSizedTensorWithOptions(std::move(output), dims, options)
         : caffe2::empty(dims, options);
-    // assign it back in case it changed
-    output = at::Tensor(std::move(tensor.getIntrusivePtr()));
 
-    output_tensors_[idx] = caffe2::Tensor(output);
-    newstyle_outputs_[idx] = std::move(output);
-    return &output_tensors_[idx];
+    return &output;
 #else
     CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
 #endif
@@ -434,7 +424,7 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     }
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    return newstyle_outputs_.size();
+    return output_tensors_.size();
 #else
     CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
 #endif
@@ -599,8 +589,8 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
 
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  c10::List<at::Tensor> move_newstyle_outputs() && {
-    return std::move(newstyle_outputs_);
+  std::vector<caffe2::Tensor> move_output_tensors() && {
+    return std::move(output_tensors_);
   }
 #endif
 
@@ -620,7 +610,6 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   std::unique_ptr<const c10::FunctionSchema> fn_schema_;
   vector<c10::IValue> newstyle_inputs_;
-  c10::List<at::Tensor> newstyle_outputs_;
 #endif
   // HACK
   // We preserve the fact that Output() returns Tensor*
@@ -819,7 +808,7 @@ class Operator : public OperatorBase {
   explicit Operator(
       const c10::FunctionSchema& fn_schema,
       std::vector<c10::IValue> inputs,
-      c10::List<at::Tensor> outputs,
+      std::vector<caffe2::Tensor> outputs,
       StreamId stream = 0)
       : OperatorBase(fn_schema, std::move(inputs), std::move(outputs)) {
     // In the constructor, we switch to the device so that the child class
diff --git a/caffe2/distributed/file_store_handler_op_test.py b/caffe2/distributed/file_store_handler_op_test.py
index 72f8e456292d..3f60b5ada340 100644
--- a/caffe2/distributed/file_store_handler_op_test.py
+++ b/caffe2/distributed/file_store_handler_op_test.py
@@ -21,7 +21,7 @@ class TestFileStoreHandlerOp(TestCase):
     testCounter = 0
 
     def setUp(self):
-        super(TestFileStoreHandlerOp, self).setUp()
+        super().setUp()
         self.tmpdir = tempfile.mkdtemp()
 
         # Use counter to tell test cases apart
@@ -29,7 +29,7 @@ def setUp(self):
 
     def tearDown(self):
         shutil.rmtree(self.tmpdir)
-        super(TestFileStoreHandlerOp, self).tearDown()
+        super().tearDown()
 
     def create_store_handler(self):
         # Use new path for every test so they are isolated
diff --git a/caffe2/distributed/redis_store_handler_op_test.py b/caffe2/distributed/redis_store_handler_op_test.py
index 2eb6c9adb705..0c8361c1e958 100644
--- a/caffe2/distributed/redis_store_handler_op_test.py
+++ b/caffe2/distributed/redis_store_handler_op_test.py
@@ -17,12 +17,9 @@
 
 class TestRedisStoreHandlerOp(TestCase):
     def setUp(self):
-        super(TestRedisStoreHandlerOp, self).setUp()
+        super().setUp()
         self.uuid = str(uuid.uuid4()) + "/"
 
-    def tearDown(self):
-        super(TestRedisStoreHandlerOp, self).tearDown()
-
     def create_store_handler(self):
         store_handler = "store_handler"
         workspace.RunOperatorOnce(
diff --git a/caffe2/distributed/store_ops_test_util.py b/caffe2/distributed/store_ops_test_util.py
index 05245be9b210..b089d650511f 100644
--- a/caffe2/distributed/store_ops_test_util.py
+++ b/caffe2/distributed/store_ops_test_util.py
@@ -12,7 +12,7 @@
 from caffe2.python import core, workspace
 
 
-class StoreOpsTests(object):
+class StoreOpsTests:
     @classmethod
     def _test_set_get(cls, queue, create_store_handler_fn, index, num_procs):
         store_handler = create_store_handler_fn()
diff --git a/caffe2/experiments/python/device_reduce_sum_bench.py b/caffe2/experiments/python/device_reduce_sum_bench.py
index 1a795e2fcf0e..c57bff57fe3e 100644
--- a/caffe2/experiments/python/device_reduce_sum_bench.py
+++ b/caffe2/experiments/python/device_reduce_sum_bench.py
@@ -25,7 +25,6 @@
 import logging
 import os
 
-from six import add_metaclass
 import numpy as np
 
 from caffe2.python import workspace, core
@@ -46,8 +45,7 @@ def __new__(metacls, name, bases, class_dict):
         return cls
 
 
-@add_metaclass(BenchmarkMeta)
-class Benchmark(object):
+class Benchmark(metaclass=BenchmarkMeta):
 
     def __init__(self):
         self.results = []
diff --git a/caffe2/operators/pow_op.cc b/caffe2/operators/pow_op.cc
index 159757b6e531..97ede3fdf781 100644
--- a/caffe2/operators/pow_op.cc
+++ b/caffe2/operators/pow_op.cc
@@ -13,8 +13,7 @@ struct EigenPowFunctor {
   template <int b_is_scalar, typename T1, typename T2, typename R>
   inline void
   Run(size_t n, const T1* a, const T2* b, T2 e, R* out, CPUContext*) {
-    // NOLINTNEXTLINE(modernize-use-nullptr)
-    if (b == NULL) {
+    if (b == nullptr) {
       EigenVectorArrayMap<R>(out, n) =
           EIGEN_POW((ConstEigenVectorArrayMap<T1>(a, n)), (e));
     } else {
diff --git a/caffe2/operators/roi_align_rotated_op.h b/caffe2/operators/roi_align_rotated_op.h
index f63cf03ab92b..fe4441f890c5 100644
--- a/caffe2/operators/roi_align_rotated_op.h
+++ b/caffe2/operators/roi_align_rotated_op.h
@@ -35,9 +35,7 @@ class RoIAlignRotatedOp final : public Operator<Context> {
   }
   USE_OPERATOR_CONTEXT_FUNCTIONS;
 
-  bool RunOnDevice() override {
-    CAFFE_NOT_IMPLEMENTED;
-  }
+  bool RunOnDevice() override;
 
  protected:
   StorageOrder order_;
diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py
index 83e393e67731..888d286458a3 100644
--- a/caffe2/python/__init__.py
+++ b/caffe2/python/__init__.py
@@ -56,18 +56,10 @@
 
     kernel32.LoadLibraryW.restype = ctypes.c_void_p
     if with_load_library_flags:
-        kernel32.AddDllDirectory.restype = ctypes.c_void_p
         kernel32.LoadLibraryExW.restype = ctypes.c_void_p
 
     for dll_path in dll_paths:
-        if sys.version_info >= (3, 8):
-            os.add_dll_directory(dll_path)
-        elif with_load_library_flags:
-            res = kernel32.AddDllDirectory(dll_path)
-            if res is None:
-                err = ctypes.WinError(ctypes.get_last_error())
-                err.strerror += ' Error adding "{}" to the DLL directories.'.format(dll_path)
-                raise err
+        os.add_dll_directory(dll_path)
 
     dlls = glob.glob(os.path.join(th_dll_path, '*.dll'))
     path_patched = False
diff --git a/caffe2/python/binarysize.py b/caffe2/python/binarysize.py
index 39dba40df8a0..172abfed56c2 100644
--- a/caffe2/python/binarysize.py
+++ b/caffe2/python/binarysize.py
@@ -24,7 +24,7 @@
 import sys
 
 
-class Trie(object):
+class Trie:
     """A simple class that represents a Trie."""
 
     def __init__(self, name):
diff --git a/caffe2/python/cached_reader.py b/caffe2/python/cached_reader.py
index 980c4fe40e08..22bf49ed4154 100644
--- a/caffe2/python/cached_reader.py
+++ b/caffe2/python/cached_reader.py
@@ -71,7 +71,7 @@ def __init__(
         assert original_reader is not None, "original_reader can't be None"
         self.original_reader = original_reader
 
-        super(CachedReader, self).__init__(
+        super().__init__(
             db_path,
             db_type,
             name,
diff --git a/caffe2/python/caffe_translator.py b/caffe2/python/caffe_translator.py
index 63b5706120ac..23987adf3532 100644
--- a/caffe2/python/caffe_translator.py
+++ b/caffe2/python/caffe_translator.py
@@ -192,7 +192,7 @@ def _GetInputDims(caffe_net):
     return input_dims
 
 
-class TranslatorRegistry(object):
+class TranslatorRegistry:
     registry_ = {}
 
     @classmethod
diff --git a/caffe2/python/checkpoint.py b/caffe2/python/checkpoint.py
index c379211a509d..0b6baea95265 100644
--- a/caffe2/python/checkpoint.py
+++ b/caffe2/python/checkpoint.py
@@ -96,13 +96,13 @@ def compile(self, session_class):
         self.exit_group = session_class.compile(self.exit_group)
 
     def __enter__(self):
-        super(Job, self).__enter__()
+        super().__enter__()
         self.epoch_group.__enter__()
         return self
 
     def __exit__(self, *args):
         self.epoch_group.__exit__()
-        super(Job, self).__exit__(*args)
+        super().__exit__(*args)
 
     def add_stop_condition(self, output):
         if isinstance(output, core.BlobReference):
@@ -146,7 +146,7 @@ def db_name(epoch, node_name, db_prefix, path_prefix=None):
     return db_name
 
 
-class CheckpointManager(object):
+class CheckpointManager:
     """
     Controls saving and loading of workspaces on every epoch boundary of a job.
     If a CheckpointManager instance is passed to JobRunner, then JobRunner will
@@ -429,7 +429,7 @@ def cp_accessible(self, epoch=None):
             return True
 
 
-class MultiNodeCheckpointManager(object):
+class MultiNodeCheckpointManager:
     """
     Coordinates checkpointing and checkpointing across multiple nodes.
     Each of `init`, `load` and `save` will build TaskGroups which will
@@ -634,7 +634,7 @@ def cp_accessible(self, epoch=None):
             return True
 
 
-class UploadTaskGroupBuilder(object):
+class UploadTaskGroupBuilder:
     """A simple class to upload checkpoints."""
     def build(self, epoch, checkpoint_manager):
         """Builds the task group to upload checkpoints.
@@ -652,7 +652,7 @@ def build(self, epoch, checkpoint_manager):
         raise NotImplementedError()
 
 
-class JobRunner(object):
+class JobRunner:
     """
     Implement the runtime logic for jobs with checkpointing at the level of
     epoch. Can be used to run either single-host or distributed jobs. Job
diff --git a/caffe2/python/checkpoint_test.py b/caffe2/python/checkpoint_test.py
index 90746747dd98..b97e0f6c5bcd 100644
--- a/caffe2/python/checkpoint_test.py
+++ b/caffe2/python/checkpoint_test.py
@@ -78,8 +78,8 @@ def fetch_total(session):
             session, checkpoint = builder()
             job.compile(LocalSession)
             num_epochs = JobRunner(job, checkpoint).train(session)
-            self.assertEquals(num_epochs, len(EXPECTED_TOTALS))
-            self.assertEquals(fetch_total(session), EXPECTED_TOTALS[-1])
+            self.assertEqual(num_epochs, len(EXPECTED_TOTALS))
+            self.assertEqual(fetch_total(session), EXPECTED_TOTALS[-1])
 
             for initial_epoch in range(1, num_epochs + 1):
                 session, checkpoint = builder()
@@ -87,11 +87,11 @@ def fetch_total(session):
                     job,
                     checkpoint, resume_from_epoch=initial_epoch
                 ).train(session)
-                self.assertEquals(fetch_total(session), EXPECTED_TOTALS[-1])
+                self.assertEqual(fetch_total(session), EXPECTED_TOTALS[-1])
 
             for epoch in range(1, num_epochs + 1):
                 session.run(checkpoint.load(epoch))
-                self.assertEquals(fetch_total(session),
+                self.assertEqual(fetch_total(session),
                                   EXPECTED_TOTALS[epoch - 1])
 
     def test_single_checkpoint(self):
@@ -141,7 +141,7 @@ def test_ckpt_name_and_load_model_from_ckpts(self):
                     epoch = 5
                     node_name = 'trainer_%d' % node_id
                     expected_db_name = tmpdir + '/' + node_name + '.5'
-                    self.assertEquals(
+                    self.assertEqual(
                         checkpoint.get_ckpt_db_name(node_name, epoch),
                         expected_db_name)
             shutil.rmtree(tmpdir)
@@ -159,15 +159,15 @@ def test_ckpt_name_and_load_model_from_ckpts(self):
                     job.compile(LocalSession)
                     job_runner = JobRunner(job, checkpoint)
                     num_epochs = job_runner.train(session)
-                self.assertEquals(num_epochs, len(EXPECTED_TOTALS))
+                self.assertEqual(num_epochs, len(EXPECTED_TOTALS))
 
                 # There are 17 global blobs after finishing up the job runner.
                 # (only blobs on init_group are checkpointed)
-                self.assertEquals(len(ws.blobs), 17)
+                self.assertEqual(len(ws.blobs), 17)
 
             ws = workspace.C.Workspace()
             session = LocalSession(ws)
-            self.assertEquals(len(ws.blobs), 0)
+            self.assertEqual(len(ws.blobs), 0)
             model_blob_names = ['trainer_1/task_2/GivenTensorInt64Fill:0',
                                 'trainer_2/task_2/GivenTensorInt64Fill:0']
             checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb')
@@ -190,7 +190,7 @@ def test_ckpt_name_and_load_model_from_ckpts(self):
                     # Check that all the model blobs are loaded.
                     for blob_name in model_blob_names:
                         self.assertTrue(ws.has_blob(blob_name))
-                        self.assertEquals(
+                        self.assertEqual(
                             ws.fetch_blob(blob_name),
                             np.array([EXPECTED_TOTALS[epoch - 1]]))
                 self.assertFalse(
@@ -227,7 +227,7 @@ def test_upload_checkpoint(self):
                         job, checkpoint,
                         upload_task_group_builder=local_upload_builder)
                     num_epochs = job_runner.train(session)
-                    self.assertEquals(num_epochs, len(EXPECTED_TOTALS))
+                    self.assertEqual(num_epochs, len(EXPECTED_TOTALS))
 
             # The uploaded files should exist now.
             for node_id in range(num_nodes):
@@ -260,7 +260,7 @@ def test_ckpt_save_failure(self):
                 num_epochs = job_runner.train(session)
             # make sure all epochs are executed even though saving the checkpoint failed
             # Saving checkpoint failure should not cause job failure
-            self.assertEquals(num_epochs, len(EXPECTED_TOTALS))
+            self.assertEqual(num_epochs, len(EXPECTED_TOTALS))
 
     def test_download_group_simple(self):
         """
@@ -332,7 +332,7 @@ def fetch_total(session):
                     checkpoint,
                     resume_from_epoch=initial_epoch
                 ).train(session)
-                self.assertEquals(fetch_total(session), EXPECTED_TOTALS[-1])
+                self.assertEqual(fetch_total(session), EXPECTED_TOTALS[-1])
 
         finally:
             shutil.rmtree(tmpdir)
diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py
index a0fd52e1fdbc..45a676b09c7b 100644
--- a/caffe2/python/cnn.py
+++ b/caffe2/python/cnn.py
@@ -36,7 +36,7 @@ def __init__(self, order="NCHW", name=None,
         }
         if ws_nbytes_limit:
             cnn_arg_scope['ws_nbytes_limit'] = ws_nbytes_limit
-        super(CNNModelHelper, self).__init__(
+        super().__init__(
             skip_sparse_optim=skip_sparse_optim,
             name="CNN" if name is None else name,
             init_params=init_params,
diff --git a/caffe2/python/context.py b/caffe2/python/context.py
index ce9b312855e6..f04b3b692d87 100644
--- a/caffe2/python/context.py
+++ b/caffe2/python/context.py
@@ -6,7 +6,7 @@
 import functools
 
 
-class _ContextInfo(object):
+class _ContextInfo:
     def __init__(self, cls, allow_default):
         self.cls = cls
         self.allow_default = allow_default
@@ -35,7 +35,7 @@ def get_active(self, required=True):
         return self._stack[-1]
 
 
-class _ContextRegistry(object):
+class _ContextRegistry:
     def __init__(self):
         self._ctxs = {}
 
@@ -62,7 +62,7 @@ def _get_managed_classes(obj):
 
 
 
-class Managed(object):
+class Managed:
     """
     Managed makes the inheritted class a context managed class.
 
diff --git a/caffe2/python/control_test.py b/caffe2/python/control_test.py
index 3f9df172d2b7..ee47ccb4bd08 100644
--- a/caffe2/python/control_test.py
+++ b/caffe2/python/control_test.py
@@ -11,7 +11,7 @@
 
 class TestControl(test_util.TestCase):
     def setUp(self):
-        super(TestControl, self).setUp()
+        super().setUp()
         self.N_ = 10
 
         self.init_net_ = core.Net("init-net")
diff --git a/caffe2/python/core.py b/caffe2/python/core.py
index 0c81a9f2157f..d9f97b6121fd 100644
--- a/caffe2/python/core.py
+++ b/caffe2/python/core.py
@@ -8,7 +8,6 @@
 from collections import namedtuple, OrderedDict, defaultdict
 from past.builtins import basestring
 from itertools import chain
-from six import binary_type, string_types, text_type
 
 from caffe2.proto import caffe2_pb2
 from caffe2.python import scope, utils, workspace
@@ -201,7 +200,7 @@ def InferOpDeviceAsBlobDevices(op):
 GradientSlice = namedtuple('GradientSlice', ['indices', 'values'])
 
 
-class BlobReference(object):
+class BlobReference:
     """A wrapper around a blob in a net.
 
     BlobReference gives us a way to refer to the network that the blob is
@@ -215,9 +214,9 @@ def __init__(self, name, net=None):
         Note that this does not prepends the namescope. If needed, use
         ScopedBlobReference() to prepend the existing namespace.
         """
-        if isinstance(name, string_types):
+        if isinstance(name, str):
             self._name = name
-        elif isinstance(name, binary_type):
+        elif isinstance(name, bytes):
             self._name = name.decode('utf-8')
         else:
             self._name = str(name)
@@ -230,9 +229,9 @@ def __hash__(self):
         return hash(self._name)
 
     def __eq__(self, other):
-        if isinstance(other, string_types):
+        if isinstance(other, str):
             return self._name == other
-        elif isinstance(other, binary_type):
+        elif isinstance(other, bytes):
             return self._name == other.decode('utf-8')
         elif isinstance(other, BlobReference):
             return self._name == other._name
@@ -249,12 +248,12 @@ def __repr__(self):
         return 'BlobReference("{}")'.format(self._name)
 
     def __add__(self, other):
-        if not isinstance(other, string_types):
+        if not isinstance(other, str):
             raise RuntimeError('Cannot add BlobReference to a non-string.')
         return BlobReference(self._name + other, self._from_net)
 
     def __radd__(self, other):
-        if not isinstance(other, string_types):
+        if not isinstance(other, str):
             raise RuntimeError('Cannot add a non-string to BlobReference.')
         return BlobReference(other + self._name, self._from_net)
 
@@ -272,7 +271,7 @@ def _CreateAndAddToNet(self, op_type, inputs=None, *args, **kwargs):
         network's __getattr__ function.
         """
         inputs = [] if inputs is None else inputs
-        if isinstance(inputs, BlobReference) or isinstance(inputs, string_types):
+        if isinstance(inputs, BlobReference) or isinstance(inputs, str):
             inputs = [inputs]
         # add self to the input list.
         inputs.insert(0, self)
@@ -317,7 +316,7 @@ def __dir__(self):
 
 def ScopedName(name):
     """prefix the name with the current scope."""
-    if isinstance(name, binary_type):
+    if isinstance(name, bytes):
         name = name.decode('ascii')
     return scope.CurrentNameScope() + name
 
@@ -331,7 +330,7 @@ def _RectifyInputOutput(blobs, net=None):
     """A helper function to rectify the input or output of the CreateOperator
     interface.
     """
-    if isinstance(blobs, string_types) or isinstance(blobs, binary_type):
+    if isinstance(blobs, (bytes, str)):
         # If blobs is a single string, prepend scope.CurrentNameScope()
         # and put it as a list.
         # TODO(jiayq): enforce using BlobReference instead of raw strings.
@@ -343,7 +342,7 @@ def _RectifyInputOutput(blobs, net=None):
         # If blob is a list, we go through it and type check.
         rectified = []
         for blob in blobs:
-            if isinstance(blob, string_types) or isinstance(blob, binary_type):
+            if isinstance(blob, (bytes, str)):
                 rectified.append(ScopedBlobReference(blob, net=net))
             elif type(blob) is BlobReference:
                 rectified.append(blob)
@@ -385,11 +384,11 @@ def CreateOperator(
     # Add rectified inputs and outputs
     inputs = _RectifyInputOutput(inputs)
     outputs = _RectifyInputOutput(outputs)
-    operator.input.extend([text_type(i) for i in inputs])
-    operator.output.extend([text_type(o) for o in outputs])
+    operator.input.extend(map(str, inputs))
+    operator.output.extend(map(str, outputs))
     if control_input:
         control_input = _RectifyInputOutput(control_input)
-        operator.control_input.extend([text_type(i) for i in control_input])
+        operator.control_input.extend(map(str, control_input))
     # Set device option:
     # (1) If device_option is explicitly set, use device_option.
     # (2) If not, but scope.CurrentDeviceScope() is set,
@@ -486,7 +485,7 @@ def GetIndexFromGradientList(g_list, name):
 ])
 
 
-class IR(object):
+class IR:
     """A simple IR class to keep track of all intermediate representations used
     in the gradient computation.
     """
@@ -667,7 +666,7 @@ def BuildGradientGenerators(  # NOQA
             # (2) add outputs to the locally generated blobs
             # If an output corresponds to the gradient of an input, we also
             # record it to gradient_generators
-            locally_generated_blobs.extend([str(s) for s in grad_op.output])
+            locally_generated_blobs.extend(map(str, grad_op.output))
             for i, output in enumerate(grad_op.output):
                 input_index = GetIndexFromGradientList(g_input, output)
                 if input_index is not None:
@@ -1095,8 +1094,7 @@ def GetBackwardPass(self, ys):
         all_input_to_grad_out = {}
         for key, val in all_input_to_grad.items():
             if val is not None:
-                if (isinstance(val, string_types) or
-                        isinstance(val, binary_type)):
+                if isinstance(val, (bytes, str)):
                     grad_out = BlobReference(val)
                 else:
                     grad_out = GradientSlice(BlobReference(val[0]),
@@ -1105,7 +1103,7 @@ def GetBackwardPass(self, ys):
         return all_gradient_ops, all_input_to_grad_out
 
 
-class GradientRegistry(object):
+class GradientRegistry:
     """GradientRegistry holds the mapping from operators to their gradients."""
     gradient_registry_ = {}
 
@@ -1310,7 +1308,7 @@ def recurrent_network_op_remap(op, prefix, blob_remap):
     """
 
     def get_remapped_str(blob_str):
-        if isinstance(blob_str, binary_type):
+        if isinstance(blob_str, bytes):
             blob_str = blob_str.decode('utf-8')
         return blob_remap.get(blob_str, blob_str).encode('utf-8')
 
@@ -1446,7 +1444,7 @@ def _recover_record_by_prefix(names, prefix=''):
         col_blobs=[_get_blob_ref(prefix + name) for name in column_names])
 
 
-class Net(object):
+class Net:
     _net_names_used = set()
     operator_registry_ = {}
 
@@ -1983,7 +1981,7 @@ def NextName(self, prefix=None, output_id=None):
     def _ExtendOps(self, new_ops):
         self._net.op.extend(new_ops)
         for op in new_ops:
-            self._op_outputs.update([text_type(o) for o in op.output])
+            self._op_outputs.update([str(o) for o in op.output])
 
     def _CheckLookupTables(self):
         '''
@@ -2668,7 +2666,7 @@ def _add_net_to_dict(net_dict, net):
         return True
 
 
-class ExecutionStep(object):
+class ExecutionStep:
     _step_names_used = set()
 
     @staticmethod
@@ -2874,7 +2872,7 @@ def add_nets_in_order(step, net_list):
         net_list.append(proto.report_net)
 
 
-class Plan(object):
+class Plan:
 
     def __init__(self, name_or_step):
         self._plan = caffe2_pb2.PlanDef()
diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py
index 2f143fbae07a..6a8c0d7d3ca1 100644
--- a/caffe2/python/core_test.py
+++ b/caffe2/python/core_test.py
@@ -459,13 +459,13 @@ def test_extract_simple(self):
             self.assertFalse("xx/data" in op.input)
 
         # Note: image input should not be included
-        self.assertEquals(ops[0].type, "Conv")
-        self.assertEquals(ops[1].type, "FC")
-        self.assertEquals(ops[2].type, "FC")
-        self.assertEquals(len(ops), 3)
+        self.assertEqual(ops[0].type, "Conv")
+        self.assertEqual(ops[1].type, "FC")
+        self.assertEqual(ops[2].type, "FC")
+        self.assertEqual(len(ops), 3)
 
         # test rename happened
-        self.assertEquals(ops[0].input[0], "image")
+        self.assertEqual(ops[0].input[0], "image")
 
         # Check export blobs
         self.assertTrue("image" not in export_blobs)
@@ -474,7 +474,7 @@ def test_extract_simple(self):
 
         # Check external inputs/outputs
         self.assertTrue("image" in predict_net.Proto().external_input)
-        self.assertEquals(set(["pred"]), set(predict_net.Proto().external_output))
+        self.assertEqual(set(["pred"]), set(predict_net.Proto().external_output))
         self.assertEqual(
             set(predict_net.Proto().external_input) -
             set([str(p) for p in model.params]), set(["image"])
diff --git a/caffe2/python/crf.py b/caffe2/python/crf.py
index 703ae604c654..e6c36a3c571e 100644
--- a/caffe2/python/crf.py
+++ b/caffe2/python/crf.py
@@ -13,7 +13,7 @@
 """
 
 
-class CRFWithLoss(object):
+class CRFWithLoss:
     def __init__(self, model, num_classes, transitions_blob=None):
         self.model = model
         self.num_classes = num_classes
diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py
index 6633931a0f6b..0dfe4de0ea91 100644
--- a/caffe2/python/data_parallel_model.py
+++ b/caffe2/python/data_parallel_model.py
@@ -1304,7 +1304,7 @@ def modify_ops(net):
     modify_ops(model.net)
 
 
-class CollectivesConcurrencyControl(object):
+class CollectivesConcurrencyControl:
     """
     Creates common worlds (up to max_concurrent_context) and manage the
     sequential execution of collectives that shares the same context with
diff --git a/caffe2/python/dataio.py b/caffe2/python/dataio.py
index 3d2e656cb738..1284d9287894 100644
--- a/caffe2/python/dataio.py
+++ b/caffe2/python/dataio.py
@@ -26,7 +26,7 @@
 import time
 
 
-class Reader(object):
+class Reader:
     """
     Reader is an abstract class to be implemented in order to provide
     operations capable of iterating through a dataset or stream of data.
@@ -143,7 +143,7 @@ def execution_step(self, reader_net_name=None, external_should_stop=None):
         return (read_step, fields)
 
 
-class Writer(object):
+class Writer:
     """
     Writer is an abstract class to be implemented in order to provide
     operations capable of feeding a data stream or a dataset.
@@ -207,7 +207,7 @@ def commit(self, finish_net):
         pass
 
 
-class ReaderBuilder(object):
+class ReaderBuilder:
     """ Allow usage of a reader in distributed fashion. """
     def schema(self):
         raise NotImplementedError()
@@ -256,7 +256,7 @@ def new_reader(self, **kwargs):
         return output if isinstance(output, Reader) else output.reader()
 
 
-class Pipe(object):
+class Pipe:
     def __init__(self, schema=None, obj_key=None):
         self._num_writers = 0
         self._num_readers = 0
@@ -424,7 +424,7 @@ def __init__(self, reader, num_iter=1):
                 produces a data_finished blob as a side effect to indicate
                 whether the input stream is exhausted.
         """
-        super(ReaderWithLimit, self).__init__(reader)
+        super().__init__(reader)
         self.counter = None
         self.num_iter = num_iter
         if self.num_iter is not None:
@@ -466,7 +466,7 @@ def __init__(self, reader, duration=0):
                 produces a data_finished blob as a side effect to indicate
                 whether the input stream is exhausted.
         """
-        super(ReaderWithTimeLimit, self).__init__(reader)
+        super().__init__(reader)
 
         self.timer = None
         self.duration = duration
@@ -528,7 +528,7 @@ def __init__(self, names, readers):
             readers: list[Reader] Reader instances, must have schema
         """
         assert len(names) == len(readers)
-        super(CompositeReader, self).__init__(schema=Struct(*[
+        super().__init__(schema=Struct(*[
             (name, reader.schema()) for name, reader in zip(names, readers)
         ]))
         self._names = names
@@ -584,7 +584,7 @@ def __init__(self, names, reader_builders):
             reader_builders: list[ReaderBuilder] ReaderBuilder instances;
                 must have schema
         """
-        super(CompositeReaderBuilder, self).__init__()
+        super().__init__()
         self._names = names
         self._reader_builders = reader_builders
         self._schema = Struct(*[
diff --git a/caffe2/python/dataset.py b/caffe2/python/dataset.py
index 4c2d4c806476..abb1f27d87ca 100644
--- a/caffe2/python/dataset.py
+++ b/caffe2/python/dataset.py
@@ -182,7 +182,7 @@ def execution_step_with_progress(name, init_net, substeps, rows_read):
         report_interval=5)
 
 
-class Dataset(object):
+class Dataset:
     """Represents an in-memory dataset with fixed schema.
 
     Use this to store and iterate through datasets with complex schema that
diff --git a/caffe2/python/db_file_reader.py b/caffe2/python/db_file_reader.py
index 49b16096125c..7b1f2cccae0e 100644
--- a/caffe2/python/db_file_reader.py
+++ b/caffe2/python/db_file_reader.py
@@ -66,7 +66,7 @@ def __init__(
 
         # Before self._init_reader_schema(...),
         # self.db_path and self.db_type are required to be set.
-        super(DBFileReader, self).__init__(self._init_reader_schema(field_names))
+        super().__init__(self._init_reader_schema(field_names))
         self.ds = Dataset(self._schema, self.name + '_dataset')
         self.ds_reader = None
 
diff --git a/caffe2/python/device_checker.py b/caffe2/python/device_checker.py
index 21dc3ec69205..3385f1e2c046 100644
--- a/caffe2/python/device_checker.py
+++ b/caffe2/python/device_checker.py
@@ -6,7 +6,7 @@
 from caffe2.python.core import InferOpBlobDevicesAsDict
 
 
-class DeviceChecker(object):
+class DeviceChecker:
     """A device checker in Python to check consistency across multiple devices.
 
     This is not the most efficient way to check devices, as the Python interface
diff --git a/caffe2/python/docs/formatter.py b/caffe2/python/docs/formatter.py
index 904f1731e960..982a05255e2d 100644
--- a/caffe2/python/docs/formatter.py
+++ b/caffe2/python/docs/formatter.py
@@ -7,7 +7,7 @@
 from caffe2.python.docs.parser import Parser
 
 
-class Formatter(object):
+class Formatter:
     def __init__(self):
         self.content = ""
 
diff --git a/caffe2/python/docs/generator.py b/caffe2/python/docs/generator.py
index 29611bf4603c..0a2cca904c05 100644
--- a/caffe2/python/docs/generator.py
+++ b/caffe2/python/docs/generator.py
@@ -12,7 +12,7 @@
 OpSchema = workspace.C.OpSchema
 
 
-class DocUploader(object):
+class DocUploader:
     def __init__(self):
         pass
 
@@ -20,7 +20,7 @@ def upload(self, text):
         pass
 
 
-class DocGenerator(object):
+class DocGenerator:
     def __init__(self, formatter, uploader):
         self.formatter = formatter
         self.uploader = uploader
@@ -94,7 +94,7 @@ def createBody(self):
         self.content_body += self.formatter.dump()
 
 
-class OperatorEngine(object):
+class OperatorEngine:
     def __init__(self, name):
         self.op_name = name
         self.base_op_name, self.engine = name.split("_ENGINE_", 1)
@@ -116,7 +116,7 @@ def generateDoc(self, formatter):
                                                       impl=impl))
 
 
-class OperatorDoc(object):
+class OperatorDoc:
     def __init__(self, name, schema, priority):
         self.name = name
         self.schema = schema
diff --git a/caffe2/python/docs/parser.py b/caffe2/python/docs/parser.py
index a4edb6e07246..1d8e194a3e86 100644
--- a/caffe2/python/docs/parser.py
+++ b/caffe2/python/docs/parser.py
@@ -7,7 +7,7 @@
 import re
 
 
-class Parser(object):
+class Parser:
     # List of tuples (regex_str, lambda(regex_match, formatter))
     # If a lambda returns True it will be called repeatedly with replacement
     # otherwise it will only be called on text that hasn't been parsed yet.
diff --git a/caffe2/python/examples/char_rnn.py b/caffe2/python/examples/char_rnn.py
index 59e85431e8bf..910e7818a6e8 100644
--- a/caffe2/python/examples/char_rnn.py
+++ b/caffe2/python/examples/char_rnn.py
@@ -35,7 +35,7 @@ def CreateNetOnce(net, created_names=set()): # noqa
         workspace.CreateNet(net)
 
 
-class CharRNN(object):
+class CharRNN:
     def __init__(self, args):
         self.seq_length = args.seq_length
         self.batch_size = args.batch_size
diff --git a/caffe2/python/experiment_util.py b/caffe2/python/experiment_util.py
index e213f33ba153..2821ec1ff42b 100644
--- a/caffe2/python/experiment_util.py
+++ b/caffe2/python/experiment_util.py
@@ -23,7 +23,7 @@
 '''
 
 
-class ExternalLogger(object):
+class ExternalLogger:
     __metaclass__ = abc.ABCMeta
 
     @abc.abstractmethod
diff --git a/caffe2/python/functional.py b/caffe2/python/functional.py
index d32acb3d8a90..26a4dbab2b3b 100644
--- a/caffe2/python/functional.py
+++ b/caffe2/python/functional.py
@@ -7,7 +7,6 @@
 from caffe2.proto import caffe2_pb2
 from caffe2.python.onnx.workspace import Workspace
 from collections import namedtuple
-from six import string_types
 
 OpSchema = workspace.C.OpSchema
 
@@ -19,7 +18,7 @@ def namedtupledict(typename, field_names, *args, **kwargs):
     data = namedtuple(typename, field_names, *args, **kwargs)
 
     def getitem(self, key):
-        if isinstance(key, string_types):
+        if isinstance(key, str):
             key = field_names_map[key]
         return super(type(self), self).__getitem__(key)
 
@@ -27,7 +26,7 @@ def getitem(self, key):
     return data
 
 
-class _Functional(object):
+class _Functional:
     def __getattribute__(self, op_type):
         def op_func(*inputs, **args):
             ws = Workspace()
diff --git a/caffe2/python/gradient_checker.py b/caffe2/python/gradient_checker.py
index 5f116bd6107c..f4eabaa274f8 100644
--- a/caffe2/python/gradient_checker.py
+++ b/caffe2/python/gradient_checker.py
@@ -69,7 +69,7 @@ def _assert_close(value1, value2, threshold, err_msg=''):
     return np.mean(delta), max(delta)
 
 
-class NetGradientChecker(object):
+class NetGradientChecker:
     @staticmethod
     def CompareNets(nets, outputs, outputs_with_grad_ids,
                     inputs_with_grads, input_values=None,
diff --git a/caffe2/python/gru_cell.py b/caffe2/python/gru_cell.py
index d0474ed70022..f5bb71abc657 100644
--- a/caffe2/python/gru_cell.py
+++ b/caffe2/python/gru_cell.py
@@ -19,7 +19,7 @@ def __init__(
         linear_before_reset=False,
         **kwargs
     ):
-        super(GRUCell, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.input_size = input_size
         self.hidden_size = hidden_size
         self.forget_bias = float(forget_bias)
diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py
index d27aaf5dfb0d..9a8e237e3021 100644
--- a/caffe2/python/layer_model_helper.py
+++ b/caffe2/python/layer_model_helper.py
@@ -47,7 +47,7 @@ def __init__(self, name, input_feature_schema, trainer_extra_schema,
             This attribute access will be consistent with MTML model.
         '''
 
-        super(LayerModelHelper, self).__init__(name=name)
+        super().__init__(name=name)
         self._layer_names = set()
         self._layers = []
         self._param_to_shape = {}
diff --git a/caffe2/python/layer_parameter_sharing_test.py b/caffe2/python/layer_parameter_sharing_test.py
index 8e1831a2ff35..84b2ed1deddf 100644
--- a/caffe2/python/layer_parameter_sharing_test.py
+++ b/caffe2/python/layer_parameter_sharing_test.py
@@ -20,26 +20,26 @@ def test_layer_parameter_name(self):
                 self.model.input_feature_schema.float_features,
                 output_dims
             )
-            self.assertEquals(self.model.layers[-1].w, 'global_scope/fc/w')
-            self.assertEquals(fc1_output(), 'global_scope/fc/output')
+            self.assertEqual(self.model.layers[-1].w, 'global_scope/fc/w')
+            self.assertEqual(fc1_output(), 'global_scope/fc/output')
 
             with scope.NameScope('nested_scope'):
                 fc2_output = self.model.FC(
                     fc1_output,
                     output_dims
                 )
-                self.assertEquals(self.model.layers[-1].w,
+                self.assertEqual(self.model.layers[-1].w,
                                   'global_scope/nested_scope/fc/w')
-                self.assertEquals(fc2_output(),
+                self.assertEqual(fc2_output(),
                                   'global_scope/nested_scope/fc/output')
 
                 fc3_output = self.model.FC(
                     fc1_output,
                     output_dims
                 )
-                self.assertEquals(self.model.layers[-1].w,
+                self.assertEqual(self.model.layers[-1].w,
                                   'global_scope/nested_scope/fc_auto_0/w')
-                self.assertEquals(fc3_output(),
+                self.assertEqual(fc3_output(),
                                   'global_scope/nested_scope/fc_auto_0/output')
 
     def test_layer_shared_parameter_name_different_namescopes(self):
@@ -51,9 +51,9 @@ def test_layer_shared_parameter_name_different_namescopes(self):
                         self.model.input_feature_schema.float_features,
                         output_dims
                     )
-                    self.assertEquals(self.model.layers[-1].w,
+                    self.assertEqual(self.model.layers[-1].w,
                                       'global_scope/scope_0/fc/w')
-                    self.assertEquals(fc1_output(),
+                    self.assertEqual(fc1_output(),
                                       'global_scope/scope_0/fc/output')
 
                 with scope.NameScope('scope_1'):
@@ -61,9 +61,9 @@ def test_layer_shared_parameter_name_different_namescopes(self):
                         self.model.input_feature_schema.float_features,
                         output_dims
                     )
-                    self.assertEquals(self.model.layers[-1].w,
+                    self.assertEqual(self.model.layers[-1].w,
                                       'global_scope/scope_0/fc/w')
-                    self.assertEquals(fc2_output(),
+                    self.assertEqual(fc2_output(),
                                       'global_scope/scope_1/fc/output')
 
     def test_layer_shared_parameter_name_within_same_namescope(self):
@@ -74,14 +74,14 @@ def test_layer_shared_parameter_name_within_same_namescope(self):
                     self.model.input_feature_schema.float_features,
                     output_dims
                 )
-                self.assertEquals(self.model.layers[-1].w,
+                self.assertEqual(self.model.layers[-1].w,
                                   'global_scope/fc/w')
 
                 self.model.FC(
                     self.model.input_feature_schema.float_features,
                     output_dims
                 )
-                self.assertEquals(self.model.layers[-1].w,
+                self.assertEqual(self.model.layers[-1].w,
                                   'global_scope/fc/w')
 
     def test_layer_shared_parameter_name_within_same_namescope_customized_name(self):
@@ -93,7 +93,7 @@ def test_layer_shared_parameter_name_within_same_namescope_customized_name(self)
                     output_dims,
                     name='shared_fc'
                 )
-                self.assertEquals(self.model.layers[-1].w,
+                self.assertEqual(self.model.layers[-1].w,
                                   'global_scope/shared_fc/w')
 
                 self.model.FC(
@@ -101,7 +101,7 @@ def test_layer_shared_parameter_name_within_same_namescope_customized_name(self)
                     output_dims,
                     name='new_fc'
                 )
-                self.assertEquals(self.model.layers[-1].w,
+                self.assertEqual(self.model.layers[-1].w,
                                   'global_scope/shared_fc/w')
 
     def test_layer_shared_parameter_name_different_shapes(self):
@@ -112,7 +112,7 @@ def test_layer_shared_parameter_name_different_shapes(self):
                     self.model.input_feature_schema.float_features,
                     output_dims
                 )
-                self.assertEquals(self.model.layers[-1].w,
+                self.assertEqual(self.model.layers[-1].w,
                                   'global_scope/fc/w')
 
                 with self.assertRaisesRegex(ValueError, 'Got inconsistent shapes .*'):
@@ -145,7 +145,7 @@ def test_layer_duplicated_parameter_init(self):
             op_outputs.extend(op.output)
 
         # only fill these parameter blobs once
-        self.assertEquals(
+        self.assertEqual(
             sorted(op_outputs),
             ['global_scope/shared_fc/b', 'global_scope/shared_fc/w']
         )
diff --git a/caffe2/python/layer_test_util.py b/caffe2/python/layer_test_util.py
index bf45ed072224..32bf58edeb0d 100644
--- a/caffe2/python/layer_test_util.py
+++ b/caffe2/python/layer_test_util.py
@@ -32,7 +32,7 @@ def __new__(cls, op_type, op_input, op_output, op_arg=None):
 class LayersTestCase(test_util.TestCase):
 
     def setUp(self):
-        super(LayersTestCase, self).setUp()
+        super().setUp()
         self.setup_example()
 
     def setup_example(self):
diff --git a/caffe2/python/layers/adaptive_weight.py b/caffe2/python/layers/adaptive_weight.py
index 146a0bdb1974..143c2df80d89 100644
--- a/caffe2/python/layers/adaptive_weight.py
+++ b/caffe2/python/layers/adaptive_weight.py
@@ -27,7 +27,7 @@ def __init__(
         reg_lambda=0.1,
         **kwargs
     ):
-        super(AdaptiveWeight, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         self.output_schema = schema.Scalar(
             np.float32, self.get_next_blob_reference("adaptive_weight")
         )
diff --git a/caffe2/python/layers/add_bias.py b/caffe2/python/layers/add_bias.py
index 1a0fd8b295f3..811845944cd8 100644
--- a/caffe2/python/layers/add_bias.py
+++ b/caffe2/python/layers/add_bias.py
@@ -14,7 +14,7 @@ class AddBias(ModelLayer):
 
     def __init__(self, model, input_record, bias_init=None,
                  bias_optim=None, name='add_bias'):
-        super(AddBias, self).__init__(model, name, input_record)
+        super().__init__(model, name, input_record)
         assert isinstance(input_record, schema.Scalar), "Incorrect input type"
         assert len(input_record.field_type().shape) > 0, (
             "AddBias expects limited dimensions of the input tensor")
diff --git a/caffe2/python/layers/arc_cosine_feature_map.py b/caffe2/python/layers/arc_cosine_feature_map.py
index 89c5014f5c5c..3b52652cdbf7 100644
--- a/caffe2/python/layers/arc_cosine_feature_map.py
+++ b/caffe2/python/layers/arc_cosine_feature_map.py
@@ -49,8 +49,7 @@ def __init__(
             name='arc_cosine_feature_map',
             **kwargs):
 
-        super(ArcCosineFeatureMap, self).__init__(model, name, input_record,
-                                                  **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert isinstance(input_record, schema.Scalar), "Incorrect input type"
         self.params = []
         self.model = model
diff --git a/caffe2/python/layers/batch_huber_loss.py b/caffe2/python/layers/batch_huber_loss.py
index 0a5323625419..72202314fe1a 100644
--- a/caffe2/python/layers/batch_huber_loss.py
+++ b/caffe2/python/layers/batch_huber_loss.py
@@ -18,7 +18,7 @@
 class BatchHuberLoss(ModelLayer):
 
     def __init__(self, model, input_record, name='batch_huber_loss', delta=1.0, **kwargs):
-        super(BatchHuberLoss, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
 
         assert delta > 0
 
diff --git a/caffe2/python/layers/batch_lr_loss.py b/caffe2/python/layers/batch_lr_loss.py
index 46b0e4d42cdf..05d900325119 100644
--- a/caffe2/python/layers/batch_lr_loss.py
+++ b/caffe2/python/layers/batch_lr_loss.py
@@ -35,7 +35,7 @@ def __init__(
         task_gamma_lb=0.1,
         **kwargs
     ):
-        super(BatchLRLoss, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
 
         self.average_loss = average_loss
 
diff --git a/caffe2/python/layers/batch_mse_loss.py b/caffe2/python/layers/batch_mse_loss.py
index b0dd63ab09c8..70c73aed497a 100644
--- a/caffe2/python/layers/batch_mse_loss.py
+++ b/caffe2/python/layers/batch_mse_loss.py
@@ -18,7 +18,7 @@
 class BatchMSELoss(ModelLayer):
 
     def __init__(self, model, input_record, name='batch_mse_loss', **kwargs):
-        super(BatchMSELoss, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
 
         assert schema.is_schema_subset(
             schema.Struct(
diff --git a/caffe2/python/layers/batch_normalization.py b/caffe2/python/layers/batch_normalization.py
index 6395b09ff67f..0de3e6a62455 100644
--- a/caffe2/python/layers/batch_normalization.py
+++ b/caffe2/python/layers/batch_normalization.py
@@ -22,8 +22,7 @@ def __init__(
         scale_init_value=1.0,
         **kwargs
     ):
-        super(BatchNormalization, self).__init__(
-            model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
 
         assert isinstance(input_record, schema.Scalar), "Incorrect input type"
 
diff --git a/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py b/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py
index 84e7d4873f50..8500dcddb84c 100644
--- a/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py
+++ b/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py
@@ -19,8 +19,7 @@ def __init__(
         name='batch_sigmoid_cross_entropy_loss',
         **kwargs
     ):
-        super(BatchSigmoidCrossEntropyLoss, self).__init__(
-            model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
 
         assert schema.is_schema_subset(
             schema.Struct(
diff --git a/caffe2/python/layers/batch_softmax_loss.py b/caffe2/python/layers/batch_softmax_loss.py
index 30667a04c159..a2b718d81564 100644
--- a/caffe2/python/layers/batch_softmax_loss.py
+++ b/caffe2/python/layers/batch_softmax_loss.py
@@ -22,8 +22,7 @@ def __init__(
         average_by_batch_size=False,
         **kwargs
     ):
-        super(BatchSoftmaxLoss, self).__init__(
-            model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
 
         assert schema.is_schema_subset(
             schema.Struct(
diff --git a/caffe2/python/layers/blob_weighted_sum.py b/caffe2/python/layers/blob_weighted_sum.py
index a37fab463581..669d4a54f0c1 100644
--- a/caffe2/python/layers/blob_weighted_sum.py
+++ b/caffe2/python/layers/blob_weighted_sum.py
@@ -23,7 +23,7 @@ def __init__(
         name='blob_weighted_sum',
         **kwargs
     ):
-        super(BlobWeightedSum, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
 
         self.blobs = self.input_record.field_blobs()
 
diff --git a/caffe2/python/layers/bpr_loss.py b/caffe2/python/layers/bpr_loss.py
index 389de8c241e8..5f2446404683 100644
--- a/caffe2/python/layers/bpr_loss.py
+++ b/caffe2/python/layers/bpr_loss.py
@@ -19,7 +19,7 @@
 class BPRLoss(ModelLayer):
 
     def __init__(self, model, input_record, name='bpr_loss', **kwargs):
-        super(BPRLoss, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert schema.is_schema_subset(
             schema.Struct(
                 ('pos_prediction', schema.Scalar()),
diff --git a/caffe2/python/layers/bucket_weighted.py b/caffe2/python/layers/bucket_weighted.py
index 2c200a922fdd..c72aceaaf17d 100644
--- a/caffe2/python/layers/bucket_weighted.py
+++ b/caffe2/python/layers/bucket_weighted.py
@@ -22,7 +22,7 @@
 class BucketWeighted(ModelLayer):
     def __init__(self, model, input_record, max_score=0, bucket_boundaries=None,
                  hash_buckets=True, weight_optim=None, name="bucket_weighted"):
-        super(BucketWeighted, self).__init__(model, name, input_record)
+        super().__init__(model, name, input_record)
 
         assert isinstance(input_record, schema.List), "Incorrect input type"
         self.bucket_boundaries = bucket_boundaries
diff --git a/caffe2/python/layers/build_index.py b/caffe2/python/layers/build_index.py
index 29c63f3d8948..2505a15f74b3 100644
--- a/caffe2/python/layers/build_index.py
+++ b/caffe2/python/layers/build_index.py
@@ -23,7 +23,7 @@ def __init__(
         name='map_to_range',
         **kwargs
     ):
-        super(MapToRange, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
 
         assert max_index > 0
         assert isinstance(input_record, schema.Scalar)
diff --git a/caffe2/python/layers/concat.py b/caffe2/python/layers/concat.py
index 6351aad24700..f7dabe7fd608 100644
--- a/caffe2/python/layers/concat.py
+++ b/caffe2/python/layers/concat.py
@@ -65,7 +65,7 @@ class Concat(ModelLayer):
 
     def __init__(self, model, input_record, axis=1, add_axis=0,
                  name='concat', **kwargs):
-        super(Concat, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         self.axis = axis
         self.add_axis = add_axis
         assert not (axis == 0 and add_axis == 1), \
diff --git a/caffe2/python/layers/conv.py b/caffe2/python/layers/conv.py
index e98bac7e2d80..6b7e15fe9041 100644
--- a/caffe2/python/layers/conv.py
+++ b/caffe2/python/layers/conv.py
@@ -31,7 +31,7 @@ def __init__(self, model, input_record, output_dim, kernel_h, kernel_w,
                  kernel_optim=None, bias_optim=None,
                  name='conv', **kwargs):
 
-        super(Conv, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert isinstance(input_record, schema.Scalar), "Incorrect input type"
         # input num_channels (C) is needed
         input_dims = input_record.field_type().shape
diff --git a/caffe2/python/layers/dropout.py b/caffe2/python/layers/dropout.py
index 4bc0cf2785b2..27d3c91039cc 100644
--- a/caffe2/python/layers/dropout.py
+++ b/caffe2/python/layers/dropout.py
@@ -19,7 +19,7 @@ def __init__(
             dropout_for_eval=False,
             **kwargs):
 
-        super(Dropout, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert isinstance(input_record, schema.Scalar), "Incorrect input type"
         assert (ratio >= 0 and ratio < 1.0), \
             "Expected 0 <= ratio < 1, but got ratio of %s" % ratio
diff --git a/caffe2/python/layers/fc.py b/caffe2/python/layers/fc.py
index 9220f22165a3..a67240a9cd77 100644
--- a/caffe2/python/layers/fc.py
+++ b/caffe2/python/layers/fc.py
@@ -29,7 +29,7 @@ def __init__(self, model, input_record, output_dims, weight_init=None,
                  max_fc_size=None, axis=1, transposed=False,
                  uniform_weight_init_scale_numerator=1.0,
                  **kwargs):
-        super(FC, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert isinstance(input_record, schema.Scalar), (
             "Incorrect input type {}".format(input_record))
         assert len(input_record.field_types()[0].shape) > 0, (
diff --git a/caffe2/python/layers/fc_with_bootstrap.py b/caffe2/python/layers/fc_with_bootstrap.py
index b3c2eb346f96..75f5a41f51fe 100644
--- a/caffe2/python/layers/fc_with_bootstrap.py
+++ b/caffe2/python/layers/fc_with_bootstrap.py
@@ -37,7 +37,7 @@ def __init__(
         axis=1,
         **kwargs
     ):
-        super(FCWithBootstrap, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert isinstance(
             input_record, schema.Scalar
         ), "Incorrect input type {}".format(input_record)
diff --git a/caffe2/python/layers/fc_without_bias.py b/caffe2/python/layers/fc_without_bias.py
index 2899af618b79..15f11c83dbb0 100644
--- a/caffe2/python/layers/fc_without_bias.py
+++ b/caffe2/python/layers/fc_without_bias.py
@@ -25,7 +25,7 @@ def __init__(
         uniform_weight_init_scale_numerator=1.0,
         **kwargs
     ):
-        super(FCWithoutBias, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert isinstance(input_record, schema.Scalar), "Incorrect input type"
         assert len(input_record.field_types()[0].shape) > 0, (
             "FCWithoutBias expects limited dimensions of the input tensor"
diff --git a/caffe2/python/layers/feature_sparse_to_dense.py b/caffe2/python/layers/feature_sparse_to_dense.py
index ca004d136ded..50ccdaafa7cd 100644
--- a/caffe2/python/layers/feature_sparse_to_dense.py
+++ b/caffe2/python/layers/feature_sparse_to_dense.py
@@ -26,7 +26,7 @@ def __init__(
         Default_dense_value can only be 0.0 or float("NaN"). Any input that isn't
         None will be NaN.
         """
-        super(FeatureSparseToDense, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         if default_dense_value is None:
             default_dense_value = 0.0
         default_dense_value = float(default_dense_value)
diff --git a/caffe2/python/layers/functional.py b/caffe2/python/layers/functional.py
index bc47c474ac8f..4543f695337d 100644
--- a/caffe2/python/layers/functional.py
+++ b/caffe2/python/layers/functional.py
@@ -25,7 +25,7 @@ def __init__(self, model, input_record, output_names_or_num, function,
         # allow coercion
         input_record = schema.as_record(input_record)
 
-        super(Functional, self).__init__(model, name, input_record, tags=tags, **kwargs)
+        super().__init__(model, name, input_record, tags=tags, **kwargs)
         self._function = function
         self._kwargs = kwargs
         return_struct = (
diff --git a/caffe2/python/layers/gather_record.py b/caffe2/python/layers/gather_record.py
index da468d5db90c..2ed36015981a 100644
--- a/caffe2/python/layers/gather_record.py
+++ b/caffe2/python/layers/gather_record.py
@@ -30,7 +30,7 @@ class GatherRecord(ModelLayer):
     """
 
     def __init__(self, model, input_record, name='gather_record', **kwargs):
-        super(GatherRecord, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
 
         assert 'indices' in input_record
         assert 'record' in input_record
diff --git a/caffe2/python/layers/label_smooth.py b/caffe2/python/layers/label_smooth.py
index 7e4987270660..5f6f6b9961a9 100644
--- a/caffe2/python/layers/label_smooth.py
+++ b/caffe2/python/layers/label_smooth.py
@@ -29,7 +29,7 @@ class LabelSmooth(ModelLayer):
     def __init__(
         self, model, label, smooth_matrix, name='label_smooth', **kwargs
     ):
-        super(LabelSmooth, self).__init__(model, name, label, **kwargs)
+        super().__init__(model, name, label, **kwargs)
         self.label = label
         # shape as a list
         smooth_matrix = np.array(smooth_matrix).astype(np.float32).flatten()
diff --git a/caffe2/python/layers/last_n_window_collector.py b/caffe2/python/layers/last_n_window_collector.py
index 5e6874b4cca0..3b44ea708031 100644
--- a/caffe2/python/layers/last_n_window_collector.py
+++ b/caffe2/python/layers/last_n_window_collector.py
@@ -15,8 +15,7 @@ class LastNWindowCollector(ModelLayer):
 
     def __init__(self, model, input_record, num_to_collect,
                  name='last_n_window_collector', **kwargs):
-        super(LastNWindowCollector, self).__init__(
-            model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert num_to_collect > 0
         self.num_to_collect = num_to_collect
         assert isinstance(input_record, schema.Scalar), \
diff --git a/caffe2/python/layers/layer_normalization.py b/caffe2/python/layers/layer_normalization.py
index 580a03bfc5da..0e722c960e39 100644
--- a/caffe2/python/layers/layer_normalization.py
+++ b/caffe2/python/layers/layer_normalization.py
@@ -23,8 +23,7 @@ def __init__(
         scale_init_value=1.0,
         **kwargs
     ):
-        super(LayerNormalization, self).__init__(
-            model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
 
         assert isinstance(input_record, schema.Scalar), (
             "Incorrect input type: {}".format(input_record))
diff --git a/caffe2/python/layers/layers.py b/caffe2/python/layers/layers.py
index abcdd1596220..30b632eef2ba 100644
--- a/caffe2/python/layers/layers.py
+++ b/caffe2/python/layers/layers.py
@@ -119,7 +119,7 @@ def set_request_only(field):
         )
 
 
-class InstantiationContext(object):
+class InstantiationContext:
     """
     List of contexts where layer could be instantitated
     """
@@ -157,7 +157,7 @@ def create_layer(layer_name, *args, **kwargs):
 LayerPsParam = namedtuple("LayerPsParam", ["sparse_key", "average_length"])
 
 
-class LayerParameter(object):
+class LayerParameter:
     def __init__(
         self,
         parameter=None,
@@ -248,7 +248,7 @@ def is_request_only_scalar(scalar):
 # `ids`: A set of feature IDs that are accessed in the model layer
 AccessedFeatures = namedtuple("AccessedFeatures", ["type", "ids"])
 
-class ModelLayer(object):
+class ModelLayer:
     def __init__(
         self,
         model,
diff --git a/caffe2/python/layers/margin_rank_loss.py b/caffe2/python/layers/margin_rank_loss.py
index 6f97ade23ef4..be8762938824 100644
--- a/caffe2/python/layers/margin_rank_loss.py
+++ b/caffe2/python/layers/margin_rank_loss.py
@@ -19,7 +19,7 @@ class MarginRankLoss(ModelLayer):
 
     def __init__(self, model, input_record, name='margin_rank_loss',
                  margin=0.1, average_loss=False, **kwargs):
-        super(MarginRankLoss, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert margin >= 0, ('For hinge loss, margin should be no less than 0')
         self._margin = margin
         self._average_loss = average_loss
diff --git a/caffe2/python/layers/merge_id_lists.py b/caffe2/python/layers/merge_id_lists.py
index b076cd8c5e75..d130c48b6c4f 100644
--- a/caffe2/python/layers/merge_id_lists.py
+++ b/caffe2/python/layers/merge_id_lists.py
@@ -25,7 +25,7 @@ class MergeIdLists(ModelLayer):
         the merged ID_LIST feature
     """
     def __init__(self, model, input_record, name='merged'):
-        super(MergeIdLists, self).__init__(model, name, input_record)
+        super().__init__(model, name, input_record)
         assert all(schema.equal_schemas(x, IdList) for x in input_record), \
             "Inputs to MergeIdLists should all be IdLists."
 
diff --git a/caffe2/python/layers/pairwise_similarity.py b/caffe2/python/layers/pairwise_similarity.py
index 5020e5432c2a..0cdd0259cd08 100644
--- a/caffe2/python/layers/pairwise_similarity.py
+++ b/caffe2/python/layers/pairwise_similarity.py
@@ -15,7 +15,7 @@ class PairwiseSimilarity(ModelLayer):
 
     def __init__(self, model, input_record, output_dim, pairwise_similarity_func='dot',
                  name='pairwise_similarity', **kwargs):
-        super(PairwiseSimilarity, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert isinstance(input_record, schema.Struct), (
             "Incorrect input type. Expected Struct, but received: {0}".
             format(input_record))
diff --git a/caffe2/python/layers/position_weighted.py b/caffe2/python/layers/position_weighted.py
index 12e26bcd774e..d2c917ed0243 100644
--- a/caffe2/python/layers/position_weighted.py
+++ b/caffe2/python/layers/position_weighted.py
@@ -22,7 +22,7 @@
 class PositionWeighted(ModelLayer):
     def __init__(self, model, input_record, weight_optim=None,
                  name="position_weights"):
-        super(PositionWeighted, self).__init__(model, name, input_record)
+        super().__init__(model, name, input_record)
 
         assert isinstance(input_record, schema.List), "Incorrect input type"
         length_metadata = input_record.lengths.metadata
diff --git a/caffe2/python/layers/random_fourier_features.py b/caffe2/python/layers/random_fourier_features.py
index bde05ab97147..350454b24977 100644
--- a/caffe2/python/layers/random_fourier_features.py
+++ b/caffe2/python/layers/random_fourier_features.py
@@ -38,8 +38,7 @@ def __init__(
             name='random_fourier_features',
             **kwargs):
 
-        super(RandomFourierFeatures, self).__init__(model, name, input_record,
-                                                    **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert isinstance(input_record, schema.Scalar), "Incorrect input type"
 
         input_dims = input_record.field_type().shape[0]
diff --git a/caffe2/python/layers/reservoir_sampling.py b/caffe2/python/layers/reservoir_sampling.py
index 21b9c44f2a79..fe7302c5045e 100644
--- a/caffe2/python/layers/reservoir_sampling.py
+++ b/caffe2/python/layers/reservoir_sampling.py
@@ -19,8 +19,7 @@ class ReservoirSampling(ModelLayer):
 
     def __init__(self, model, input_record, num_to_collect,
                  name='reservoir_sampling', **kwargs):
-        super(ReservoirSampling, self).__init__(
-            model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert num_to_collect > 0
         self.num_to_collect = num_to_collect
 
diff --git a/caffe2/python/layers/sampling_train.py b/caffe2/python/layers/sampling_train.py
index 034c897e2c2f..ac63dc054442 100644
--- a/caffe2/python/layers/sampling_train.py
+++ b/caffe2/python/layers/sampling_train.py
@@ -21,9 +21,7 @@ def __init__(
         name='sampling_train',
         **kwargs
     ):
-        super(SamplingTrain, self).__init__(
-            model, name, input_record, **kwargs
-        )
+        super().__init__(model, name, input_record, **kwargs)
 
         layer_class = get_layer_class(prediction_layer)
         assert issubclass(layer_class, SamplingTrainableMixin)
diff --git a/caffe2/python/layers/sampling_trainable_mixin.py b/caffe2/python/layers/sampling_trainable_mixin.py
index 79c928d21252..fdfbcb9e8ff4 100644
--- a/caffe2/python/layers/sampling_trainable_mixin.py
+++ b/caffe2/python/layers/sampling_trainable_mixin.py
@@ -11,7 +11,7 @@
 class SamplingTrainableMixin(metaclass=abc.ABCMeta):
 
     def __init__(self, *args, **kwargs):
-        super(SamplingTrainableMixin, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
         self._train_param_blobs = None
         self._train_param_blobs_frozen = False
 
diff --git a/caffe2/python/layers/select_record_by_context.py b/caffe2/python/layers/select_record_by_context.py
index 49e42ca308d7..e691cbce57a0 100644
--- a/caffe2/python/layers/select_record_by_context.py
+++ b/caffe2/python/layers/select_record_by_context.py
@@ -32,8 +32,7 @@ def __init__(
         default_output_record_field=None,
         **kwargs
     ):
-        super(SelectRecordByContext, self).__init__(model, name, input_record,
-                                                    **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
 
         assert isinstance(input_record, schema.Struct)
         assert len(input_record) > 1
diff --git a/caffe2/python/layers/semi_random_features.py b/caffe2/python/layers/semi_random_features.py
index 58f30ac71f19..0df5ce4190fe 100644
--- a/caffe2/python/layers/semi_random_features.py
+++ b/caffe2/python/layers/semi_random_features.py
@@ -84,7 +84,7 @@ def __init__(
             self.input_record_full = input_record
             self.input_record_random = input_record
 
-        super(SemiRandomFeatures, self).__init__(
+        super().__init__(
             model,
             self.input_record_full,
             output_dims,
diff --git a/caffe2/python/layers/sparse_dropout_with_replacement.py b/caffe2/python/layers/sparse_dropout_with_replacement.py
index 3e03888e57dc..e7df3b495032 100644
--- a/caffe2/python/layers/sparse_dropout_with_replacement.py
+++ b/caffe2/python/layers/sparse_dropout_with_replacement.py
@@ -42,7 +42,7 @@ def __init__(
             name='sparse_dropout',
             **kwargs):
 
-        super(SparseDropoutWithReplacement, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert schema.equal_schemas(input_record, IdList), "Incorrect input type"
 
         self.dropout_prob_train = float(dropout_prob_train)
diff --git a/caffe2/python/layers/sparse_feature_hash.py b/caffe2/python/layers/sparse_feature_hash.py
index c3ada99dc4a7..4b7f29a6a661 100644
--- a/caffe2/python/layers/sparse_feature_hash.py
+++ b/caffe2/python/layers/sparse_feature_hash.py
@@ -22,7 +22,7 @@ class SparseFeatureHash(ModelLayer):
 
     def __init__(self, model, input_record, seed=0, modulo=None,
                  use_hashing=True, use_divide_mod=False, divisor=None, name='sparse_feature_hash', **kwargs):
-        super(SparseFeatureHash, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
 
         assert use_hashing + use_divide_mod < 2, "use_hashing and use_divide_mod cannot be set true at the same time."
 
diff --git a/caffe2/python/layers/sparse_itemwise_dropout_with_replacement.py b/caffe2/python/layers/sparse_itemwise_dropout_with_replacement.py
index 05d13d68be14..8fa5ce0128b3 100644
--- a/caffe2/python/layers/sparse_itemwise_dropout_with_replacement.py
+++ b/caffe2/python/layers/sparse_itemwise_dropout_with_replacement.py
@@ -41,7 +41,7 @@ def __init__(
             name='sparse_itemwise_dropout',
             **kwargs):
 
-        super(SparseItemwiseDropoutWithReplacement, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         assert schema.equal_schemas(input_record, IdList), "Incorrect input type"
 
         self.dropout_prob_train = float(dropout_prob_train)
diff --git a/caffe2/python/layers/sparse_lookup.py b/caffe2/python/layers/sparse_lookup.py
index dd1c42606063..cff997152e5d 100644
--- a/caffe2/python/layers/sparse_lookup.py
+++ b/caffe2/python/layers/sparse_lookup.py
@@ -133,7 +133,7 @@ def __init__(self, model, input_record, inner_shape, reducer,
                  name='sparse_lookup', regularizer=None, use_external_weights=False,
                  uniform_weight_init_scale_numerator=1.0, **kwargs):
 
-        super(SparseLookup, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
 
         self.sparse_key = get_key(self.input_record)()
         logger.info("Setup the sparse lookup layer for " + self.sparse_key)
diff --git a/caffe2/python/layers/split.py b/caffe2/python/layers/split.py
index 58e569a272c7..c70bdc21b474 100644
--- a/caffe2/python/layers/split.py
+++ b/caffe2/python/layers/split.py
@@ -15,7 +15,7 @@ class Split(ModelLayer):
 
     def __init__(self, model, input_record, num_splits=1, axis=1,
                  name='split', split=None, **kwargs):
-        super(Split, self).__init__(model, name, input_record, **kwargs)
+        super().__init__(model, name, input_record, **kwargs)
         self.axis = axis
         # Assume that first dimension is batch, so actual axis in shape is
         # axis - 1
diff --git a/caffe2/python/layers/tags.py b/caffe2/python/layers/tags.py
index 1913ef5425bd..7fbea3be9f7e 100644
--- a/caffe2/python/layers/tags.py
+++ b/caffe2/python/layers/tags.py
@@ -27,7 +27,7 @@ def remove_tags(self, tags):
         self.tags = self.tags[:-len(tags)]
 
 
-class Tags(object):
+class Tags:
     # TODO(amalevich): Tags might need to live in their own contexts, add this
     # split later
     EXCLUDE_FROM_TRAIN = 'exclude_from_train'
diff --git a/caffe2/python/layers/uniform_sampling.py b/caffe2/python/layers/uniform_sampling.py
index 5581371d008d..76631b09bdd6 100644
--- a/caffe2/python/layers/uniform_sampling.py
+++ b/caffe2/python/layers/uniform_sampling.py
@@ -27,9 +27,7 @@ def __init__(
         name='uniform_sampling',
         **kwargs
     ):
-        super(UniformSampling, self).__init__(
-            model, name, input_record, **kwargs
-        )
+        super().__init__(model, name, input_record, **kwargs)
 
         assert num_elements > num_samples > 0
         assert isinstance(input_record, schema.Scalar)
diff --git a/caffe2/python/layers_test.py b/caffe2/python/layers_test.py
index ff2923d3cd61..8449a66db770 100644
--- a/caffe2/python/layers_test.py
+++ b/caffe2/python/layers_test.py
@@ -424,7 +424,7 @@ def testSparseLookupSumPoolingWithEviction(self):
         workspace.RunNetOnce(train_net.Proto())
         embedding_after_training = workspace.FetchBlob("sparse_lookup/w")
         # Verify row 0's value does not change after reset
-        self.assertEquals(embedding_after_training.all(), embedding_after_init.all())
+        self.assertEqual(embedding_after_training.all(), embedding_after_init.all())
 
 
     def testSparseLookupSumPooling(self):
diff --git a/caffe2/python/memonger_test.py b/caffe2/python/memonger_test.py
index defe44c6a8b4..b4f7a62a6893 100644
--- a/caffe2/python/memonger_test.py
+++ b/caffe2/python/memonger_test.py
@@ -263,7 +263,7 @@ def test_memonger_mix_cpu_gpu(self):
         device_crossers = device_blobs[caffe2_pb2.CPU].intersection(
             device_blobs[workspace.GpuDeviceType]
         )
-        self.assertEquals(device_crossers, set())
+        self.assertEqual(device_crossers, set())
 
     @given(input_dim=st.integers(min_value=4, max_value=4),
            output_dim=st.integers(min_value=4, max_value=4),
diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py
index 2bf49750cc20..34466620cb27 100644
--- a/caffe2/python/model_helper.py
+++ b/caffe2/python/model_helper.py
@@ -72,7 +72,7 @@
 ]
 
 
-class ModelHelper(object):
+class ModelHelper:
     """A helper model so we can manange models more easily. It contains net def
     and parameter storages. You can add an Operator yourself, e.g.
 
diff --git a/caffe2/python/modeling/initializers.py b/caffe2/python/modeling/initializers.py
index ba4236d04654..8e2943a8955b 100644
--- a/caffe2/python/modeling/initializers.py
+++ b/caffe2/python/modeling/initializers.py
@@ -7,7 +7,7 @@
 from caffe2.python.modeling.parameter_info import ParameterInfo
 
 
-class Initializer(object):
+class Initializer:
     '''
     This class abstracts out parameter creation. One can come up with a new
     Initializer in order to implement more complex parameter initialization logic
@@ -33,7 +33,7 @@ def create_param(self, param_name, init_net, shape):
         )
 
 
-class ExternalInitializer(object):
+class ExternalInitializer:
     '''
     This class is used in cases when the parameter should not be initialized by
     the initializer, but rather provided in the workspace when param_init_net is
diff --git a/caffe2/python/modeling/parameter_info.py b/caffe2/python/modeling/parameter_info.py
index 195048cf91e8..dfbaffbd801c 100644
--- a/caffe2/python/modeling/parameter_info.py
+++ b/caffe2/python/modeling/parameter_info.py
@@ -8,13 +8,13 @@
 import numpy as np
 
 
-class ParameterTags(object):
+class ParameterTags:
     BIAS = 'BIAS'
     WEIGHT = 'WEIGHT'
     COMPUTED_PARAM = 'COMPUTED_PARAM'
 
 
-class ParameterInfo(object):
+class ParameterInfo:
 
     def __init__(
             self, param_id, param, key=None, shape=None, length=None,
diff --git a/caffe2/python/modeling/parameter_sharing.py b/caffe2/python/modeling/parameter_sharing.py
index a0174500a413..afb1b53fdcb6 100644
--- a/caffe2/python/modeling/parameter_sharing.py
+++ b/caffe2/python/modeling/parameter_sharing.py
@@ -11,7 +11,7 @@
 logger = logging.getLogger(__name__)
 
 
-class ParameterSharingContext(object):
+class ParameterSharingContext:
     """
     This class manages scope driven way of parameter sharing across different
     NameScopes.
diff --git a/caffe2/python/modeling/parameter_sharing_test.py b/caffe2/python/modeling/parameter_sharing_test.py
index d37e40880c02..d845d6decb46 100644
--- a/caffe2/python/modeling/parameter_sharing_test.py
+++ b/caffe2/python/modeling/parameter_sharing_test.py
@@ -19,56 +19,56 @@ class ParameterSharingTest(unittest.TestCase):
     def test_parameter_sharing_default_scopes(self):
         # Test no sharing default scopes
         param_1 = parameter_sharing_context.get_parameter_name('w')
-        self.assertEquals(param_1, 'w')
+        self.assertEqual(param_1, 'w')
         with scope.NameScope('scope'):
             param_2 = parameter_sharing_context.get_parameter_name('w')
-            self.assertEquals(param_2, 'scope/w')
+            self.assertEqual(param_2, 'scope/w')
             with scope.NameScope('scope_2'):
                 param_3 = parameter_sharing_context.get_parameter_name('w')
-                self.assertEquals(param_3, 'scope/scope_2/w')
+                self.assertEqual(param_3, 'scope/scope_2/w')
 
     def test_parameter_sharing_nested_scopes(self):
         # Test parameter sharing
         with scope.NameScope('global_scope'):
             with ParameterSharing({'model_b': 'model_a'}):
                 param_global = parameter_sharing_context.get_parameter_name('w')
-                self.assertEquals(param_global, 'global_scope/w')
+                self.assertEqual(param_global, 'global_scope/w')
                 # This scope is overridden to match 'model_a'
                 with scope.NameScope('model_b'):
                     with ParameterSharing({'shared_scope': ''}):
                         param_4 = parameter_sharing_context.get_parameter_name(
                             'w')
-                        self.assertEquals(param_4, 'global_scope/model_a/w')
+                        self.assertEqual(param_4, 'global_scope/model_a/w')
                         with scope.NameScope('shared_scope'):
                             param_5 = parameter_sharing_context.\
                                 get_parameter_name('w')
-                            self.assertEquals(param_5, 'global_scope/model_a/w')
+                            self.assertEqual(param_5, 'global_scope/model_a/w')
                 # This scope is supposed to have not sharing
                 with scope.NameScope('model_c'):
                     with ParameterSharing({'shared_scope': ''}):
                         param_4 = parameter_sharing_context.get_parameter_name(
                             'w')
-                        self.assertEquals(param_4, 'global_scope/model_c/w')
+                        self.assertEqual(param_4, 'global_scope/model_c/w')
                         with scope.NameScope('shared_scope'):
                             param_5 = parameter_sharing_context.\
                                 get_parameter_name('w')
-                            self.assertEquals(param_5, 'global_scope/model_c/w')
+                            self.assertEqual(param_5, 'global_scope/model_c/w')
 
     def test_parameter_sharing_subscopes(self):
         # Sharing only one of the subscopes
         with ParameterSharing({'global_scope/b': 'global_scope/a'}):
             with scope.NameScope('global_scope'):
                 param_6 = parameter_sharing_context.get_parameter_name('w')
-                self.assertEquals(param_6, 'global_scope/w')
+                self.assertEqual(param_6, 'global_scope/w')
                 with scope.NameScope('a'):
                     param_7 = parameter_sharing_context.get_parameter_name('w')
-                    self.assertEquals(param_7, 'global_scope/a/w')
+                    self.assertEqual(param_7, 'global_scope/a/w')
                 with scope.NameScope('b'):
                     param_8 = parameter_sharing_context.get_parameter_name('w')
-                    self.assertEquals(param_8, 'global_scope/a/w')
+                    self.assertEqual(param_8, 'global_scope/a/w')
                 with scope.NameScope('c'):
                     param_9 = parameter_sharing_context.get_parameter_name('w')
-                    self.assertEquals(param_9, 'global_scope/c/w')
+                    self.assertEqual(param_9, 'global_scope/c/w')
 
     def test_create_param(self):
         model = model_helper.ModelHelper(name="test")
diff --git a/caffe2/python/models/seq2seq/beam_search.py b/caffe2/python/models/seq2seq/beam_search.py
index 6fc9f8ece480..a94deb965e1b 100644
--- a/caffe2/python/models/seq2seq/beam_search.py
+++ b/caffe2/python/models/seq2seq/beam_search.py
@@ -11,7 +11,7 @@
 from caffe2.python.models.seq2seq.seq2seq_model_helper import Seq2SeqModelHelper
 
 
-class BeamSearchForwardOnly(object):
+class BeamSearchForwardOnly:
     """
     Class generalizing forward beam search for seq2seq models.
 
diff --git a/caffe2/python/models/seq2seq/seq2seq_model_helper.py b/caffe2/python/models/seq2seq/seq2seq_model_helper.py
index 5adabb86fadf..4eedbde4ab0e 100644
--- a/caffe2/python/models/seq2seq/seq2seq_model_helper.py
+++ b/caffe2/python/models/seq2seq/seq2seq_model_helper.py
@@ -20,11 +20,7 @@ def __init__(self, init_params=True, **kwargs):
         if kwargs.get('ws_nbytes_limit', None):
             arg_scope['ws_nbytes_limit'] = kwargs.pop('ws_nbytes_limit')
 
-        super(Seq2SeqModelHelper, self).__init__(
-            init_params=init_params,
-            arg_scope=arg_scope,
-            **kwargs
-        )
+        super().__init__(init_params=init_params, arg_scope=arg_scope, **kwargs)
         self.non_trainable_params = []
 
     def AddParam(self, name, init=None, init_value=None, trainable=True):
diff --git a/caffe2/python/models/seq2seq/seq2seq_util.py b/caffe2/python/models/seq2seq/seq2seq_util.py
index 01e003f73d2a..17187a7894c4 100644
--- a/caffe2/python/models/seq2seq/seq2seq_util.py
+++ b/caffe2/python/models/seq2seq/seq2seq_util.py
@@ -316,7 +316,7 @@ def build_embedding_encoder(
     )
 
 
-class LSTMWithAttentionDecoder(object):
+class LSTMWithAttentionDecoder:
 
     def scope(self, name):
         return self.name + '/' + name if self.name is not None else name
diff --git a/caffe2/python/models/seq2seq/train.py b/caffe2/python/models/seq2seq/train.py
index 8080318da4d0..95a3d3485ab7 100644
--- a/caffe2/python/models/seq2seq/train.py
+++ b/caffe2/python/models/seq2seq/train.py
@@ -96,7 +96,7 @@ def prepare_batch(batch):
     )
 
 
-class Seq2SeqModelCaffe2(object):
+class Seq2SeqModelCaffe2:
 
     def _build_model(
         self,
diff --git a/caffe2/python/modifier_context.py b/caffe2/python/modifier_context.py
index b65d97587549..574e7b644550 100644
--- a/caffe2/python/modifier_context.py
+++ b/caffe2/python/modifier_context.py
@@ -9,7 +9,7 @@
 DEFAULT_MODIFIER = 'DEFAULT'
 
 
-class ModifierContext(object):
+class ModifierContext:
     """
     provide context to allow param_info to have different modifiers
     """
@@ -40,7 +40,7 @@ def pop_modifiers(self):
         self._rebuild_modifiers()
 
 
-class UseModifierBase(object):
+class UseModifierBase:
     '''
     context class to allow setting the current context.
     Example usage with layer:
diff --git a/caffe2/python/net_builder.py b/caffe2/python/net_builder.py
index fd525ed4766a..a6e57f4dd972 100644
--- a/caffe2/python/net_builder.py
+++ b/caffe2/python/net_builder.py
@@ -137,7 +137,7 @@ def get(self):
         return self._children
 
     def __exit__(self, etype, *args):
-        super(NetBuilder, self).__exit__(etype, *args)
+        super().__exit__(etype, *args)
 
         if self._use_control_ops and len(self._children) > 0:
             _children = self._children
@@ -203,7 +203,7 @@ def __str__(self):
         return self.name or 'Un-named NetBuilder'
 
 
-class Operations(object):
+class Operations:
     """
     Operations to be used in the context of a NetBuilder.
     """
diff --git a/caffe2/python/net_builder_test.py b/caffe2/python/net_builder_test.py
index bef6caefac3d..5320c2b04588 100644
--- a/caffe2/python/net_builder_test.py
+++ b/caffe2/python/net_builder_test.py
@@ -12,7 +12,7 @@
 import threading
 
 
-class PythonOpStats(object):
+class PythonOpStats:
     lock = threading.Lock()
     num_instances = 0
     num_calls = 0
@@ -101,7 +101,7 @@ def test_ops(self):
         ]
         for b, expected in expected:
             actual = ws.blobs[str(b)].fetch()
-            self.assertEquals(actual, expected)
+            self.assertEqual(actual, expected)
 
     def _expected_loop(self):
         total = 0
@@ -152,7 +152,7 @@ def test_net_multi_use(self):
             result = final_output(total)
         with LocalSession() as session:
             session.run(task)
-            self.assertEquals(2, result.fetch())
+            self.assertEqual(2, result.fetch())
 
     def test_loops(self):
         with Task() as task:
@@ -162,7 +162,7 @@ def test_loops(self):
             expected = self._expected_loop()
             actual = [o.fetch() for o in out_actual]
             for e, a in zip(expected, actual):
-                self.assertEquals(e, a)
+                self.assertEqual(e, a)
 
     def test_setup(self):
         with Task() as task:
@@ -184,9 +184,9 @@ def test_setup(self):
             o7_2 = final_output(seven_2)
         with LocalSession() as session:
             session.run(task)
-            self.assertEquals(o6.fetch(), 6)
-            self.assertEquals(o7_1.fetch(), 7)
-            self.assertEquals(o7_2.fetch(), 7)
+            self.assertEqual(o6.fetch(), 6)
+            self.assertEqual(o7_1.fetch(), 7)
+            self.assertEqual(o7_2.fetch(), 7)
 
     def test_multi_instance_python_op(self):
         """
@@ -203,8 +203,8 @@ def test_multi_instance_python_op(self):
             PythonOpStats.num_instances = 0
             PythonOpStats.num_calls = 0
             session.run(task)
-            self.assertEquals(PythonOpStats.num_instances, 64)
-            self.assertEquals(PythonOpStats.num_calls, 256)
+            self.assertEqual(PythonOpStats.num_instances, 64)
+            self.assertEqual(PythonOpStats.num_calls, 256)
 
     def test_multi_instance(self):
         NUM_INSTANCES = 10
@@ -242,9 +242,9 @@ def test_multi_instance(self):
 
         with LocalSession() as session:
             session.run(tg)
-            self.assertEquals(total1.fetch(), NUM_INSTANCES * NUM_ITERS)
-            self.assertEquals(total2.fetch(), NUM_INSTANCES * (NUM_ITERS ** 2))
-            self.assertEquals(total3.fetch(), NUM_INSTANCES * (NUM_ITERS ** 2))
+            self.assertEqual(total1.fetch(), NUM_INSTANCES * NUM_ITERS)
+            self.assertEqual(total2.fetch(), NUM_INSTANCES * (NUM_ITERS ** 2))
+            self.assertEqual(total3.fetch(), NUM_INSTANCES * (NUM_ITERS ** 2))
 
     def test_if_net(self):
         with NetBuilder() as nb:
@@ -303,11 +303,11 @@ def test_if_net(self):
         y1_value = ws.blobs[str(y1)].fetch()
         y2_value = ws.blobs[str(y2)].fetch()
 
-        self.assertEquals(first_res_value, 1)
-        self.assertEquals(second_res_value, 2)
-        self.assertEquals(y0_value, 1000)
-        self.assertEquals(y1_value, 101)
-        self.assertEquals(y2_value, 108)
+        self.assertEqual(first_res_value, 1)
+        self.assertEqual(second_res_value, 2)
+        self.assertEqual(y0_value, 1000)
+        self.assertEqual(y1_value, 101)
+        self.assertEqual(y2_value, 108)
         self.assertTrue(str(local_blob) not in ws.blobs)
 
     def test_while_net(self):
diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py
index 2adf605c5a84..6b0af67853a4 100644
--- a/caffe2/python/net_printer.py
+++ b/caffe2/python/net_printer.py
@@ -13,10 +13,9 @@
 from contextlib import contextmanager
 from copy import copy
 from itertools import chain
-from six import binary_type, text_type
 
 
-class Visitor(object):
+class Visitor:
     @classmethod
     def register(cls, Type):
         if not(hasattr(cls, 'visitors')):
@@ -155,7 +154,7 @@ def analyze(obj):
     Analyzer()(obj)
 
 
-class Text(object):
+class Text:
     def __init__(self):
         self._indent = 0
         self._lines_in_context = [0]
@@ -192,9 +191,9 @@ def __init__(self, factor_prefixes=False, c2_syntax=True):
 
 
 def _sanitize_str(s):
-    if isinstance(s, text_type):
+    if isinstance(s, str):
         sanitized = s
-    elif isinstance(s, binary_type):
+    elif isinstance(s, bytes):
         sanitized = s.decode('ascii', errors='ignore')
     else:
         sanitized = str(s)
diff --git a/caffe2/python/nomnigraph.py b/caffe2/python/nomnigraph.py
index 2b83e0ec9358..0390d8ef20c2 100644
--- a/caffe2/python/nomnigraph.py
+++ b/caffe2/python/nomnigraph.py
@@ -9,7 +9,7 @@
 from caffe2.python import core
 
 
-class NNModule(object):
+class NNModule:
     def __init__(self, net=None, device_map=None):
         if net is not None:
             serialized_proto = None
diff --git a/caffe2/python/normalizer.py b/caffe2/python/normalizer.py
index 2ca147328c78..bc6b36b00cf8 100644
--- a/caffe2/python/normalizer.py
+++ b/caffe2/python/normalizer.py
@@ -3,7 +3,7 @@
 
 
 
-class Normalizer(object):
+class Normalizer:
     def __init__(self):
         pass
     """
@@ -21,7 +21,7 @@ def _run(self, net, param):
 
 class BatchNormalizer(Normalizer):
     def __init__(self, momentum, scale_init_value=1.0):
-        super(BatchNormalizer, self).__init__()
+        super().__init__()
         self._momentum = float(momentum)
         self._scale_init_value = float(scale_init_value)
 
@@ -33,7 +33,7 @@ def _run(self, layer_model, param):
 
 class LayerNormalizer(Normalizer):
     def __init__(self, epsilon, use_layer_norm_op=True, scale_init_value=1.0):
-        super(LayerNormalizer, self).__init__()
+        super().__init__()
         self._epsilon = float(epsilon)
         self._use_layer_norm_op = use_layer_norm_op
         self._scale_init_value = float(scale_init_value)
diff --git a/caffe2/python/normalizer_test.py b/caffe2/python/normalizer_test.py
index f0ce5099ea75..6a1c2b2642ec 100644
--- a/caffe2/python/normalizer_test.py
+++ b/caffe2/python/normalizer_test.py
@@ -12,4 +12,4 @@ def test_normalizer_context(self):
         bn = BatchNormalizer(momentum=0.1)
         with UseNormalizer({'BATCH': bn}):
             normalizer = NormalizerContext.current().get_normalizer('BATCH')
-            self.assertEquals(bn, normalizer)
+            self.assertEqual(bn, normalizer)
diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py
index d523eb8204ab..477ded3284e8 100644
--- a/caffe2/python/onnx/backend.py
+++ b/caffe2/python/onnx/backend.py
@@ -100,7 +100,7 @@ def convertAttributeProto(onnx_arg):
 
 
 # TODO: Move this into ONNX main library
-class OnnxNode(object):
+class OnnxNode:
     """
     Reimplementation of NodeProto from ONNX, but in a form
     more convenient to work with from Python.
diff --git a/caffe2/python/onnx/backend_cpp_rep.py b/caffe2/python/onnx/backend_cpp_rep.py
index 322e6c2e2894..6092d93da2a7 100644
--- a/caffe2/python/onnx/backend_cpp_rep.py
+++ b/caffe2/python/onnx/backend_cpp_rep.py
@@ -12,7 +12,7 @@
 # mainly to handle the different input and output types for convenience of Python
 class Caffe2CppRep(BackendRep):
     def __init__(self, cpp_rep):
-        super(Caffe2CppRep, self).__init__()
+        super().__init__()
         self.__core = cpp_rep
         self.__external_outputs = cpp_rep.external_outputs()
         self.__external_inputs = cpp_rep.external_inputs()
diff --git a/caffe2/python/onnx/backend_rep.py b/caffe2/python/onnx/backend_rep.py
index ab97fd562dc1..e9bc9438df9b 100644
--- a/caffe2/python/onnx/backend_rep.py
+++ b/caffe2/python/onnx/backend_rep.py
@@ -11,7 +11,7 @@
 
 class Caffe2Rep(BackendRep):
     def __init__(self, init_net, predict_net, workspace, uninitialized):
-        super(Caffe2Rep, self).__init__()
+        super().__init__()
         self.init_net = init_net
         self.predict_net = predict_net
         self.workspace = workspace
@@ -28,7 +28,7 @@ def _name_scope(self):
         return ''
 
     def run(self, inputs, **kwargs):
-        super(Caffe2Rep, self).run(inputs, **kwargs)
+        super().run(inputs, **kwargs)
         with core.DeviceScope(self.predict_net.device_option):
             if isinstance(inputs, dict):
                 with core.NameScope(self._name_scope):
diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py
index b5121602aff5..25a843e949ff 100644
--- a/caffe2/python/onnx/frontend.py
+++ b/caffe2/python/onnx/frontend.py
@@ -29,7 +29,7 @@
 logger = logging.getLogger(__name__)
 
 
-class Caffe2Frontend(object):
+class Caffe2Frontend:
     # This number controls the semantics of the operators we target.  Whenever
     # ONNX makes a BC breaking change to semantics of operators, having this set
     # to an accurate number will prevent our models form exporting.  However,
diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py
index 461b454b6a91..918a701db958 100644
--- a/caffe2/python/onnx/tests/onnx_backend_test.py
+++ b/caffe2/python/onnx/tests/onnx_backend_test.py
@@ -186,6 +186,11 @@
                      '|test_sequencemap_.*'
                      ')')
 
+# Unsupported ops in opset 18
+backend_test.exclude('(test_center_crop_pad_.*'
+                     '|test_col2im*'
+                     '|test_bitwise*)')
+
 # Skip vgg to speed up CI
 if 'JENKINS_URL' in os.environ:
     backend_test.exclude(r'(test_vgg19|test_vgg)')
diff --git a/caffe2/python/onnx/workspace.py b/caffe2/python/onnx/workspace.py
index f03e3609fe8b..b15ef1dd9186 100644
--- a/caffe2/python/onnx/workspace.py
+++ b/caffe2/python/onnx/workspace.py
@@ -12,7 +12,7 @@
 
 # Separating out the context manager part so that users won't
 # (mis-)use Workspace instances as context managers
-class _WorkspaceCtx(object):
+class _WorkspaceCtx:
     def __init__(self, workspace_id):
         self.workspace_id = workspace_id
         # A stack, so that the context manager is reentrant.
@@ -34,7 +34,7 @@ def __exit__(self, exc_type, exc_value, traceback):
         workspace.SwitchWorkspace(w, create_if_missing=True)
 
 
-class Workspace(object):
+class Workspace:
     """
     An object representing a Caffe2 workspace.  It is a context manager,
     so you can say 'with workspace:' to use the represented workspace
diff --git a/caffe2/python/operator_fp_exceptions_test.py b/caffe2/python/operator_fp_exceptions_test.py
index f039ef09f637..52cf75de79fa 100644
--- a/caffe2/python/operator_fp_exceptions_test.py
+++ b/caffe2/python/operator_fp_exceptions_test.py
@@ -33,7 +33,7 @@ def test_fp_exception_divbyzero(self):
                 workspace.RunNetOnce(net)
             except Exception as e:
                 exception_raised = True
-            self.assertEquals(exception_raised, throw_if_fp_exceptions)
+            self.assertEqual(exception_raised, throw_if_fp_exceptions)
 
 
 if __name__ == '__main__':
diff --git a/caffe2/python/operator_test/async_net_barrier_test.py b/caffe2/python/operator_test/async_net_barrier_test.py
index e2c0ea0ccc1a..c12cd9a2fe53 100644
--- a/caffe2/python/operator_test/async_net_barrier_test.py
+++ b/caffe2/python/operator_test/async_net_barrier_test.py
@@ -25,7 +25,7 @@ def test_async_net_barrier_op(self, n, shape, dc, gc):
         )
 
         def reference_func(*args):
-            self.assertEquals(len(args), n)
+            self.assertEqual(len(args), n)
             return args
 
         self.assertReferenceChecks(gc, barrier_op, test_inputs, reference_func)
diff --git a/caffe2/python/operator_test/atomic_ops_test.py b/caffe2/python/operator_test/atomic_ops_test.py
index 88e38df52da5..7f568f523bbf 100644
--- a/caffe2/python/operator_test/atomic_ops_test.py
+++ b/caffe2/python/operator_test/atomic_ops_test.py
@@ -46,7 +46,7 @@ def test_atomic_ops(self):
         plan.AddStep(super_step)
         workspace.RunPlan(plan)
         # checksum = sum[i=1..20000](i) = 20000 * 20001 / 2 = 200010000
-        self.assertEquals(workspace.FetchBlob(checksum), 200010000)
+        self.assertEqual(workspace.FetchBlob(checksum), 200010000)
 
     @unittest.skip("Test is flaky: https://github.com/pytorch/pytorch/issues/28179")
     def test_atomic64_ops(self):
@@ -85,7 +85,7 @@ def test_atomic64_ops(self):
         plan.AddStep(super_step)
         workspace.RunPlan(plan)
         # checksum = sum[i=1..20000](i) = 20000 * 20001 / 2 = 200010000
-        self.assertEquals(workspace.FetchBlob(checksum), 200010000)
+        self.assertEqual(workspace.FetchBlob(checksum), 200010000)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/caffe2/python/operator_test/dataset_ops_test.py b/caffe2/python/operator_test/dataset_ops_test.py
index a7e01570a22a..7121258de127 100644
--- a/caffe2/python/operator_test/dataset_ops_test.py
+++ b/caffe2/python/operator_test/dataset_ops_test.py
@@ -264,8 +264,8 @@ def test_dataset_ops(self):
         ]
         zipped = zip(expected_fields, schema.field_names(), schema.field_types())
         for (ref_name, ref_type), name, dtype in zipped:
-            self.assertEquals(ref_name, name)
-            self.assertEquals(np.dtype(ref_type), dtype)
+            self.assertEqual(ref_name, name)
+            self.assertEqual(np.dtype(ref_type), dtype)
         """
         2. The contents of our dataset.
 
@@ -447,7 +447,7 @@ def test_dataset_ops(self):
         """
         subschema = Struct(("top_level", schema.int_lists.values))
         int_list_contents = contents.int_lists.values.field_names()
-        self.assertEquals(len(subschema.field_names()), len(int_list_contents))
+        self.assertEqual(len(subschema.field_names()), len(int_list_contents))
         """
         7. Random Access a dataset
 
@@ -474,7 +474,7 @@ def test_dataset_ops(self):
             actual = FetchRecord(batch)
             _assert_records_equal(actual, entry)
         workspace.RunNet(str(read_next_net))
-        self.assertEquals(True, workspace.FetchBlob(should_stop))
+        self.assertEqual(True, workspace.FetchBlob(should_stop))
         """
         8. Random Access a dataset with loop_over = true
 
@@ -496,7 +496,7 @@ def test_dataset_ops(self):
 
         for _ in range(len(entries) * 3):
             workspace.RunNet(str(read_next_net))
-            self.assertEquals(False, workspace.FetchBlob(should_stop))
+            self.assertEqual(False, workspace.FetchBlob(should_stop))
         """
         9. Sort and shuffle a dataset
 
@@ -536,7 +536,7 @@ def test_dataset_ops(self):
         trimmed = FetchRecord(ds.content())
         EXPECTED_SIZES = [2, 2, 3, 3, 2, 2, 2, 6, 2, 3, 3, 4, 4, 2, 2, 2]
         actual_sizes = [d.shape[0] for d in trimmed.field_blobs()]
-        self.assertEquals(EXPECTED_SIZES, actual_sizes)
+        self.assertEqual(EXPECTED_SIZES, actual_sizes)
 
     def test_last_n_window_ops(self):
         collect_net = core.Net("collect_net")
diff --git a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py b/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py
index e471e13fc520..56fc8e81e199 100644
--- a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py
+++ b/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py
@@ -39,7 +39,7 @@ def c10_op_ref(maps, rois):
 
 class TestHeatmapMaxKeypointOp(hu.HypothesisTestCase):
     def setUp(self):
-        super(TestHeatmapMaxKeypointOp, self).setUp()
+        super().setUp()
         np.random.seed(0)
 
         # initial coordinates and interpolate HEATMAP_SIZE from it
diff --git a/caffe2/python/operator_test/hsm_test.py b/caffe2/python/operator_test/hsm_test.py
index 245bca210ad9..8a0754b32d25 100644
--- a/caffe2/python/operator_test/hsm_test.py
+++ b/caffe2/python/operator_test/hsm_test.py
@@ -119,7 +119,7 @@ def simulation_hsm_search():
         for i in range(names.shape[0]):
             for j in range(names.shape[1]):
                 if names[i][j]:
-                    self.assertEquals(
+                    self.assertEqual(
                         names[i][j], p_names[i][j].item().encode('utf-8'))
                     self.assertAlmostEqual(
                         scores[i][j], p_scores[i][j], delta=0.001)
diff --git a/caffe2/python/operator_test/index_ops_test.py b/caffe2/python/operator_test/index_ops_test.py
index cf021f59362b..cf99128b3151 100644
--- a/caffe2/python/operator_test/index_ops_test.py
+++ b/caffe2/python/operator_test/index_ops_test.py
@@ -56,7 +56,7 @@ def _test_index_ops(self, entries, dtype, index_create_op):
             ['index'],
             ['index_size']))
         size = workspace.FetchBlob('index_size')
-        self.assertEquals(size, 6)
+        self.assertEqual(size, 6)
 
         workspace.RunOperatorOnce(core.CreateOperator(
             'IndexStore',
@@ -89,7 +89,7 @@ def _test_index_ops(self, entries, dtype, index_create_op):
             ['index2'],
             ['index2_size']))
         index2_size = workspace.FetchBlob('index2_size')
-        self.assertEquals(index2_size, 5)
+        self.assertEqual(index2_size, 5)
 
         # test serde
         with tempfile.NamedTemporaryFile() as tmp:
diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py
index c4520f8ee1b6..315905f61c7e 100644
--- a/caffe2/python/operator_test/load_save_test.py
+++ b/caffe2/python/operator_test/load_save_test.py
@@ -31,7 +31,7 @@ class MiniDBEntry(NamedTuple):
 class TestLoadSaveBase(test_util.TestCase):
 
     def __init__(self, methodName, db_type='minidb'):
-        super(TestLoadSaveBase, self).__init__(methodName)
+        super().__init__(methodName)
         self._db_type = db_type
 
     @settings(deadline=None)
diff --git a/caffe2/python/operator_test/pack_ops_test.py b/caffe2/python/operator_test/pack_ops_test.py
index 698fbb76df88..73b2e448f24d 100644
--- a/caffe2/python/operator_test/pack_ops_test.py
+++ b/caffe2/python/operator_test/pack_ops_test.py
@@ -300,7 +300,7 @@ def test_presence_mask(self, gc, dc):
 
         output = workspace.FetchBlob('t')
         expected_output_shape = (3, 3, 2)
-        self.assertEquals(output.shape, expected_output_shape)
+        self.assertEqual(output.shape, expected_output_shape)
 
         presence_mask = workspace.FetchBlob('p')
         expected_presence_mask = np.array(
@@ -323,7 +323,7 @@ def test_presence_mask_empty(self):
 
         output = workspace.FetchBlob('p')
         expected_output_shape = (0, 0)
-        self.assertEquals(output.shape, expected_output_shape)
+        self.assertEqual(output.shape, expected_output_shape)
 
     @given(**hu.gcs_cpu_only)
     @settings(deadline=10000)
diff --git a/caffe2/python/operator_test/rebatching_queue_test.py b/caffe2/python/operator_test/rebatching_queue_test.py
index 53d3fd4f4ecc..20f6f610e11c 100644
--- a/caffe2/python/operator_test/rebatching_queue_test.py
+++ b/caffe2/python/operator_test/rebatching_queue_test.py
@@ -51,7 +51,7 @@ def test_rebatching_queue_single_enqueue_dequeue(self):
         workspace.RunNetOnce(net)
 
         for idx in range(3):
-            self.assertEquals(workspace.FetchBlob(results[idx]), [1.0])
+            self.assertEqual(workspace.FetchBlob(results[idx]), [1.0])
 
     def test_rebatching_queue_multi_enqueue_dequeue(self):
         net = core.Net('net')
@@ -280,7 +280,7 @@ def append(ins, outs):
         # We check that the outputs are a permutation of inputs
         inputs.sort()
         outputs.sort()
-        self.assertEquals(inputs, outputs)
+        self.assertEqual(inputs, outputs)
 
 
 if __name__ == "__main__":
diff --git a/caffe2/python/operator_test/recurrent_net_executor_test.py b/caffe2/python/operator_test/recurrent_net_executor_test.py
index 5d9b83604423..7c21ee633168 100644
--- a/caffe2/python/operator_test/recurrent_net_executor_test.py
+++ b/caffe2/python/operator_test/recurrent_net_executor_test.py
@@ -18,7 +18,7 @@
 class TestRNNExecutor(test_util.TestCase):
 
     def setUp(self):
-        super(TestRNNExecutor, self).setUp()
+        super().setUp()
         self.batch_size = 8
         self.input_dim = 20
         self.hidden_dim = 30
diff --git a/caffe2/python/operator_test/self_binning_histogram_test.py b/caffe2/python/operator_test/self_binning_histogram_test.py
index afcf5ea57e3e..f22a730e7e4a 100644
--- a/caffe2/python/operator_test/self_binning_histogram_test.py
+++ b/caffe2/python/operator_test/self_binning_histogram_test.py
@@ -7,7 +7,7 @@
 from hypothesis import given, settings
 
 
-class TestSelfBinningHistogramBase(object):
+class TestSelfBinningHistogramBase:
     def __init__(self, bin_spacing, dtype, abs=False):
         self.bin_spacing = bin_spacing
         self.dtype = dtype
diff --git a/caffe2/python/operator_test/shape_inference_test.py b/caffe2/python/operator_test/shape_inference_test.py
index 702effc226d6..f39b929bce1f 100644
--- a/caffe2/python/operator_test/shape_inference_test.py
+++ b/caffe2/python/operator_test/shape_inference_test.py
@@ -26,13 +26,13 @@ def testShapeInferenceSimpleFC(self):
                 {'data': [b, 96]}
             )
 
-            self.assertEquals(shapes['data'], [b, 96])
-            self.assertEquals(shapes['fc1_w'], [32, 96])
-            self.assertEquals(shapes['fc1_b'], [32])
-            self.assertEquals(shapes['fc1'], [b, 32])
-            self.assertEquals(shapes['fc2_w'], [55, 32])
-            self.assertEquals(shapes['fc2_b'], [55])
-            self.assertEquals(shapes['fc2'], [b, 55])
+            self.assertEqual(shapes['data'], [b, 96])
+            self.assertEqual(shapes['fc1_w'], [32, 96])
+            self.assertEqual(shapes['fc1_b'], [32])
+            self.assertEqual(shapes['fc1'], [b, 32])
+            self.assertEqual(shapes['fc2_w'], [55, 32])
+            self.assertEqual(shapes['fc2_b'], [55])
+            self.assertEqual(shapes['fc2'], [b, 55])
 
     def testFCAxis2(self):
         model = model_helper.ModelHelper(name="test_model")
diff --git a/caffe2/python/operator_test/stats_put_ops_test.py b/caffe2/python/operator_test/stats_put_ops_test.py
index 12a9e6826fd1..a15aec9f2271 100644
--- a/caffe2/python/operator_test/stats_put_ops_test.py
+++ b/caffe2/python/operator_test/stats_put_ops_test.py
@@ -37,9 +37,9 @@ def test_default_value(self):
 
         self.assertIn(stat_name + sum_postfix, stat_dict)
         self.assertIn(stat_name + count_postfix, stat_dict)
-        self.assertEquals(stat_dict[stat_name + sum_postfix],
+        self.assertEqual(stat_dict[stat_name + sum_postfix],
          default_value * magnitude_expand)
-        self.assertEquals(stat_dict[stat_name + count_postfix], 1)
+        self.assertEqual(stat_dict[stat_name + count_postfix], 1)
 
     def test_clamp(self):
         put_value = 10
@@ -68,9 +68,9 @@ def test_clamp(self):
 
         self.assertIn(stat_name + sum_postfix, stat_dict)
         self.assertIn(stat_name + count_postfix, stat_dict)
-        self.assertEquals(stat_dict[stat_name + sum_postfix],
+        self.assertEqual(stat_dict[stat_name + sum_postfix],
             9223372036854775807)
-        self.assertEquals(stat_dict[stat_name + count_postfix], 1)
+        self.assertEqual(stat_dict[stat_name + count_postfix], 1)
 
     def test_clamp_with_out_of_bounds(self):
         put_value = float(1e20)
@@ -99,9 +99,9 @@ def test_clamp_with_out_of_bounds(self):
 
         self.assertIn(stat_name + sum_postfix, stat_dict)
         self.assertIn(stat_name + count_postfix, stat_dict)
-        self.assertEquals(stat_dict[stat_name + sum_postfix],
+        self.assertEqual(stat_dict[stat_name + sum_postfix],
             9223372036854775807)
-        self.assertEquals(stat_dict[stat_name + count_postfix], 1)
+        self.assertEqual(stat_dict[stat_name + count_postfix], 1)
 
     def test_avg_put_ops(self):
         put_value = 15.1111
@@ -129,9 +129,9 @@ def test_avg_put_ops(self):
 
         self.assertIn(stat_name + sum_postfix, stat_dict)
         self.assertIn(stat_name + count_postfix, stat_dict)
-        self.assertEquals(stat_dict[stat_name + sum_postfix],
+        self.assertEqual(stat_dict[stat_name + sum_postfix],
          put_value * magnitude_expand)
-        self.assertEquals(stat_dict[stat_name + count_postfix], 1)
+        self.assertEqual(stat_dict[stat_name + count_postfix], 1)
 
     def test_increment_put_ops(self):
         put_value = 15.1111
@@ -157,7 +157,7 @@ def test_increment_put_ops(self):
         stat_dict = dict(zip(k, v))
 
         self.assertIn(stat_name + member_postfix, stat_dict)
-        self.assertEquals(stat_dict[stat_name + member_postfix],
+        self.assertEqual(stat_dict[stat_name + member_postfix],
          put_value * magnitude_expand)
 
     def test_stddev_put_ops(self):
@@ -190,6 +190,6 @@ def test_stddev_put_ops(self):
         self.assertIn(stat_name + count_postfix, stat_dict)
         self.assertIn(stat_name + sumoffset_postfix, stat_dict)
         self.assertIn(stat_name + sumsqoffset_postfix, stat_dict)
-        self.assertEquals(stat_dict[stat_name + sum_postfix],
+        self.assertEqual(stat_dict[stat_name + sum_postfix],
             put_value * magnitude_expand)
-        self.assertEquals(stat_dict[stat_name + count_postfix], 1)
+        self.assertEqual(stat_dict[stat_name + count_postfix], 1)
diff --git a/caffe2/python/operator_test/unsafe_coalesce_test.py b/caffe2/python/operator_test/unsafe_coalesce_test.py
index 36f10cf1b426..c99ef31236cc 100644
--- a/caffe2/python/operator_test/unsafe_coalesce_test.py
+++ b/caffe2/python/operator_test/unsafe_coalesce_test.py
@@ -27,7 +27,7 @@ def test_unsafe_coalesce_op(self, n, shape, dc, gc):
         )
 
         def reference_func(*args):
-            self.assertEquals(len(args), n)
+            self.assertEqual(len(args), n)
             return list(args) + [np.concatenate([x.flatten() for x in args])]
 
         self.assertReferenceChecks(gc, coalesce_op, test_inputs, reference_func)
diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py
index c038fc2c8e37..fcc825ca667a 100644
--- a/caffe2/python/optimizer.py
+++ b/caffe2/python/optimizer.py
@@ -32,7 +32,7 @@ def reset_optimizer_instance_count():
     _optimizer_instance_count.clear()
 
 
-class Optimizer(object):
+class Optimizer:
     def __init__(self):
         self._aux_params = AuxOptimizerParams(local=[], shared=[])
         self._instance_num = _optimizer_instance_count[self.__class__.__name__]
@@ -302,7 +302,7 @@ def __init__(
         lars=None,
         **kwargs
     ):
-        super(SgdOptimizer, self).__init__()
+        super().__init__()
         self.base_learning_rate = base_learning_rate
         self.policy = policy
         self.momentum = momentum
@@ -418,7 +418,7 @@ def __init__(
         sparse_dedup_aggregator=None,
         **kwargs
     ):
-        super(MultiPrecisionSgdOptimizer, self).__init__(
+        super().__init__(
             base_learning_rate=base_learning_rate,
             policy=policy,
             momentum=momentum,
@@ -489,7 +489,7 @@ def __init__(
         sparse_dedup_aggregator=None,
         **kwargs
     ):
-        super(FP16SgdOptimizer, self).__init__(
+        super().__init__(
             base_learning_rate=base_learning_rate,
             policy=policy,
             momentum=momentum,
@@ -635,7 +635,7 @@ def __init__(
         use_dedicated_lr_iteration_counter=False,
         **kwargs
     ):
-        super(AdagradOptimizer, self).__init__()
+        super().__init__()
         self.alpha = alpha
         self.epsilon = epsilon
         self.decay = decay
@@ -1207,7 +1207,7 @@ def __init__(
         output_effective_lr_and_update=False,
         **kwargs
     ):
-        super(WngradOptimizer, self).__init__()
+        super().__init__()
         self.alpha = alpha
         self.epsilon = epsilon
         self.policy = policy
@@ -1319,7 +1319,7 @@ def __init__(
               include 'mean' and 'sum'.
             lars: lars offset.
         """
-        super(StormOptimizer, self).__init__()
+        super().__init__()
         self.lr = lr
         self.momentum = momentum
         self.beta = beta
@@ -1420,7 +1420,7 @@ def __init__(
               include "mean" and "sum".
             engine: the engine used, options include "", "CUDNN", etc.
         """
-        super(AdadeltaOptimizer, self).__init__()
+        super().__init__()
         self.alpha = alpha
         self.epsilon = epsilon
         self.decay = decay
@@ -1488,7 +1488,7 @@ def __init__(
         sparse_dedup_aggregator=None,
         engine="",
     ):
-        super(FtrlOptimizer, self).__init__()
+        super().__init__()
         self.alpha = alpha
         self.beta = beta
         self.lambda1 = lambda1
@@ -1546,7 +1546,7 @@ def __init__(
         sparse_dedup_aggregator=None,
         engine="",
     ):
-        super(GFtrlOptimizer, self).__init__()
+        super().__init__()
         self.alpha = alpha
         self.beta = beta
         self.lambda1 = lambda1
@@ -1598,7 +1598,7 @@ def __init__(
         use_smart_decay=False,  # See https://fburl.com/2jdiwrhy for context.
         **kwargs
     ):
-        super(AdamOptimizer, self).__init__()
+        super().__init__()
         self.alpha = alpha
         self.beta1 = beta1
         self.beta2 = beta2
@@ -1761,7 +1761,7 @@ def __init__(
         engine="",
         **kwargs
     ):
-        super(DecayAdagradOptimizer, self).__init__()
+        super().__init__()
         self.alpha = alpha
         self.beta1 = beta1
         self.beta2 = beta2
@@ -1885,7 +1885,7 @@ def __init__(
         sparse_dedup_aggregator=None,
         **kwargs
     ):
-        super(YellowFinOptimizer, self).__init__()
+        super().__init__()
         self.alpha = alpha
         self.mu = mu
         self.beta = beta
@@ -1973,7 +1973,7 @@ def __init__(
         engine="",
         **kwargs
     ):
-        super(RmsPropOptimizer, self).__init__()
+        super().__init__()
         self.alpha = alpha
         self.decay = decay
         self.momentum = momentum
diff --git a/caffe2/python/optimizer_test.py b/caffe2/python/optimizer_test.py
index 7511a2c8a3ec..e84177502be5 100644
--- a/caffe2/python/optimizer_test.py
+++ b/caffe2/python/optimizer_test.py
@@ -79,7 +79,7 @@ def check_optimizer(self, optimizer):
 
     @unittest.skipIf(not workspace.has_gpu_support, "No GPU support")
     def testGPUDense(self):
-        super(TestMultiPrecisionSgd, self).testGPUDense(core.DataType.FLOAT16)
+        super().testGPUDense(core.DataType.FLOAT16)
 
 
 class TestFtrl(OptimizerTestBase, TestCase):
diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py
index beb8a3781832..2c0eefa71012 100644
--- a/caffe2/python/optimizer_test_util.py
+++ b/caffe2/python/optimizer_test_util.py
@@ -14,7 +14,7 @@
 from caffe2.python.model_helper import ModelHelper
 
 
-class OptimizerTestBase(object):
+class OptimizerTestBase:
     """
     This is an abstract base class.
     Don't inherit from unittest.TestCase, and don't name it 'Test*'.
@@ -148,7 +148,7 @@ def testSparse(self):
         self.check_optimizer(optimizer)
 
 
-class LRModificationTestBase(object):
+class LRModificationTestBase:
     """
     This is an abstract base class.
     Don't inherit from unittest.TestCase, and don't name it 'Test*'.
diff --git a/caffe2/python/parallel_workers.py b/caffe2/python/parallel_workers.py
index 067f4794a89f..a561ae43acb9 100644
--- a/caffe2/python/parallel_workers.py
+++ b/caffe2/python/parallel_workers.py
@@ -84,7 +84,7 @@ def init_workers(
     return global_coordinator
 
 
-class Metrics(object):
+class Metrics:
     def __init__(self, external_loggers):
         self._metrics = collections.defaultdict(lambda: 0)
         self._external_loggers = external_loggers
@@ -124,7 +124,7 @@ def cleanup(self):
         pass
 
 
-class WorkerCoordinator(object):
+class WorkerCoordinator:
     def __init__(
         self, worker_name, worker_ids, init_fun,
         state=None, shutdown_fun=None
@@ -191,7 +191,7 @@ def get_worker_ids(self):
         return self._worker_ids
 
 
-class GlobalWorkerCoordinator(object):
+class GlobalWorkerCoordinator:
     def __init__(self):
         self._coordinators = []
         self._fetcher_id_seq = 0
@@ -248,7 +248,7 @@ def cleanup():
         atexit.register(cleanup)
 
 
-class Worker(object):
+class Worker:
     def __init__(
         self,
         coordinator,
diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py
index 4625d0b0458c..195ac8285c83 100644
--- a/caffe2/python/pipeline.py
+++ b/caffe2/python/pipeline.py
@@ -12,7 +12,7 @@
 from caffe2.python.task import Node, Task, TaskGroup
 
 
-class Output(object):
+class Output:
     """
     Represents the result of a processor function. A processor can either
     return an Output, or it can return a record, in which case an Output will be
@@ -394,7 +394,7 @@ def read_ex(self, init_net, exit_net):
         return read_nets, status, fields
 
 
-class NetProcessor(object):
+class NetProcessor:
     """
     Processor that clones a core.Net each time it's called, executing
     the cloned net as the processor. It requires the Net to have input
diff --git a/caffe2/python/pipeline_test.py b/caffe2/python/pipeline_test.py
index fe00933ac4e1..0764aec4ef96 100644
--- a/caffe2/python/pipeline_test.py
+++ b/caffe2/python/pipeline_test.py
@@ -70,7 +70,7 @@ def proc2(rec):
         output = FetchRecord(dst_blobs, ws=ws)
         num_dequeues = ws.blobs[str(counter)].fetch()
 
-        self.assertEquals(
+        self.assertEqual(
             num_dequeues, int(math.ceil(float(N) / NUM_DEQUEUE_RECORDS)))
 
         for a, b in zip(output.field_blobs(), expected_dst.field_blobs()):
diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc
index 5b2c2f71a827..2f601b605482 100644
--- a/caffe2/python/pybind_state.cc
+++ b/caffe2/python/pybind_state.cc
@@ -58,16 +58,9 @@ constexpr bool kPyBindFalse = false;
 
 namespace py = pybind11;
 
-// NOLINTNEXTLINE(modernize-use-equals-default)
-BlobFetcherBase::~BlobFetcherBase() {}
 // NOLINTNEXTLINE(modernize-use-equals-default)
 BlobFeederBase::~BlobFeederBase() {}
 
-C10_DEFINE_TYPED_REGISTRY(
-    BlobFetcherRegistry,
-    TypeIdentifier,
-    BlobFetcherBase,
-    std::unique_ptr);
 C10_DEFINE_TYPED_REGISTRY(
     BlobFeederRegistry,
     caffe2::DeviceType,
diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h
index 6d89b55bcc73..f4c20b6e6280 100644
--- a/caffe2/python/pybind_state.h
+++ b/caffe2/python/pybind_state.h
@@ -14,6 +14,7 @@
 #include "caffe2/core/workspace.h"
 #include "caffe2/proto/caffe2_pb.h"
 #include "caffe2/python/pybind_state_dlpack.h"
+#include "caffe2/python/pybind_workspace.h"
 
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
@@ -55,16 +56,6 @@ Workspace* GetCurrentWorkspace();
 // Get workspace by name. Returns nullptr if none exists by name.
 Workspace* GetWorkspaceByName(const std::string& name);
 
-class C10_EXPORT BlobFetcherBase {
- public:
-  struct FetchedBlob {
-    pybind11::object obj;
-    bool copied;
-  };
-  virtual ~BlobFetcherBase();
-  virtual pybind11::object Fetch(const Blob& blob) = 0;
-};
-
 class BlobFeederBase {
  public:
   virtual ~BlobFeederBase();
@@ -75,17 +66,6 @@ class BlobFeederBase {
       bool in_place = false) = 0;
 };
 
-C10_DECLARE_TYPED_REGISTRY(
-    BlobFetcherRegistry,
-    TypeIdentifier,
-    BlobFetcherBase,
-    std::unique_ptr);
-#define REGISTER_BLOB_FETCHER(id, ...) \
-  C10_REGISTER_TYPED_CLASS(BlobFetcherRegistry, id, __VA_ARGS__)
-inline unique_ptr<BlobFetcherBase> CreateFetcher(TypeIdentifier id) {
-  return BlobFetcherRegistry()->Create(id);
-}
-
 C10_DECLARE_TYPED_REGISTRY(
     BlobFeederRegistry,
     DeviceType,
diff --git a/caffe2/python/pybind_workspace.cc b/caffe2/python/pybind_workspace.cc
index aa837b7b4dfe..2962e3b297be 100644
--- a/caffe2/python/pybind_workspace.cc
+++ b/caffe2/python/pybind_workspace.cc
@@ -1,8 +1,18 @@
 #include "caffe2/core/workspace.h"
+#include "caffe2/python/pybind_workspace.h"
 
 namespace caffe2 {
 namespace python {
 
+// NOLINTNEXTLINE(modernize-use-equals-default)
+BlobFetcherBase::~BlobFetcherBase() {}
+
+C10_DEFINE_TYPED_REGISTRY(
+    BlobFetcherRegistry,
+    TypeIdentifier,
+    BlobFetcherBase,
+    std::unique_ptr);
+
 // gWorkspace is the pointer to the current workspace. The ownership is kept
 // by the gWorkspaces map.
 static Workspace* gWorkspace = nullptr;
diff --git a/caffe2/python/pybind_workspace.h b/caffe2/python/pybind_workspace.h
index 0467d9ff6ccd..ac43992b6416 100644
--- a/caffe2/python/pybind_workspace.h
+++ b/caffe2/python/pybind_workspace.h
@@ -1,5 +1,32 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+//#include <Python.h>
+
 namespace caffe2 {
 namespace python {
+class C10_EXPORT BlobFetcherBase {
+ public:
+  struct FetchedBlob {
+    pybind11::object obj;
+    bool copied;
+  };
+  virtual ~BlobFetcherBase();
+  virtual pybind11::object Fetch(const Blob& blob) = 0;
+};
+
+C10_DECLARE_TYPED_REGISTRY(
+    BlobFetcherRegistry,
+    TypeIdentifier,
+    BlobFetcherBase,
+    std::unique_ptr);
+#define REGISTER_BLOB_FETCHER(id, ...) \
+  C10_REGISTER_TYPED_CLASS(BlobFetcherRegistry, id, __VA_ARGS__)
+inline unique_ptr<BlobFetcherBase> CreateFetcher(TypeIdentifier id) {
+  return BlobFetcherRegistry()->Create(id);
+}
 
 Workspace* GetCurrentWorkspace();
 void SetCurrentWorkspace(Workspace* workspace);
diff --git a/caffe2/python/record_queue.py b/caffe2/python/record_queue.py
index 1170c2bf3a82..d6eb554272d1 100644
--- a/caffe2/python/record_queue.py
+++ b/caffe2/python/record_queue.py
@@ -17,7 +17,7 @@
 class _QueueReader(Reader):
     def __init__(self, blobs_queue, schema, name=None):
         """Don't call this directly. Instead, use dataset.reader()"""
-        super(_QueueReader, self).__init__(schema)
+        super().__init__(schema)
         self.blobs_queue = blobs_queue
         self.name = name
 
@@ -45,7 +45,7 @@ def write(self, writer_net, fields):
         return status
 
 
-class RecordQueue(object):
+class RecordQueue:
     """ The class is used to feed data with some process from a reader into a
         queue and provider a reader interface for data fetching from the queue.
     """
diff --git a/caffe2/python/regularizer.py b/caffe2/python/regularizer.py
index 48726d67f4f8..4236647ed198 100644
--- a/caffe2/python/regularizer.py
+++ b/caffe2/python/regularizer.py
@@ -6,12 +6,12 @@
 import numpy as np
 
 
-class RegularizationBy(object):
+class RegularizationBy:
     AFTER_OPTIMIZER = "after_optimizer"
     ON_LOSS = "on_loss"
 
 
-class Regularizer(object):
+class Regularizer:
     def __init__(self):
         self.kEpsilon = 1e-9
 
@@ -89,7 +89,7 @@ def _ensure_clipped(
 
 class L1Norm(Regularizer):
     def __init__(self, reg_lambda):
-        super(L1Norm, self).__init__()
+        super().__init__()
         assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
 
         self.reg_lambda = reg_lambda
@@ -109,7 +109,7 @@ def __init__(self, reg_lambda, p_value=0.5):
                     we will calculate Lp norm with the formula:
                     pow( sum_i { pow(theda_i, p) } ,  1/p)
         """
-        super(LpNorm, self).__init__()
+        super().__init__()
         assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
         assert p_value > 0, "p_value factor should be greater than 0"
         self.p_value = p_value
@@ -158,7 +158,7 @@ def __init__(self, reg_lambda, alpha=0.01, budget=0):
                     budget, no penalization will be applied. Optional parameter, if
                     0, then no budget is used
         """
-        super(L0ApproxNorm, self).__init__()
+        super().__init__()
         assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
         assert alpha > 0, "alpha factor must be a positive value greater than 0"
         assert budget >= 0, "budget factor must be greater than or equal to 0"
@@ -204,7 +204,7 @@ class L1NormTrimmed(Regularizer):
     The Trimmed Lasso: Sparsity and Robustness. https://arxiv.org/abs/1708.04527
     """
     def __init__(self, reg_lambda, k):
-        super(L1NormTrimmed, self).__init__()
+        super().__init__()
         assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
         assert isinstance(k, int), "k should be an interger as expected #. after selection"
         assert k >= 1, "k should be larger than 1"
@@ -225,7 +225,7 @@ def _run_on_loss(self, net, param_init_net, param, grad=None):
 
 class L2Norm(Regularizer):
     def __init__(self, reg_lambda):
-        super(L2Norm, self).__init__()
+        super().__init__()
         assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
 
         self.reg_lambda = reg_lambda
@@ -239,7 +239,7 @@ def _run_on_loss(self, net, param_init_net, param, grad=None):
 
 class ElasticNet(Regularizer):
     def __init__(self, l1, l2):
-        super(ElasticNet, self).__init__()
+        super().__init__()
         self.l1 = l1
         self.l2 = l2
 
@@ -257,7 +257,7 @@ def _run_on_loss(self, net, param_init_net, param, grad=None):
 
 class ElasticNetL1NormTrimmed(Regularizer):
     def __init__(self, l1, l2, k):
-        super(ElasticNetL1NormTrimmed, self).__init__()
+        super().__init__()
         self.l1 = l1
         self.l2 = l2
         self.k = k
@@ -282,7 +282,7 @@ def _run_on_loss(self, net, param_init_net, param, grad=None):
 
 class MaxNorm(Regularizer):
     def __init__(self, norm=1.0, dtype=None):
-        super(MaxNorm, self).__init__()
+        super().__init__()
         self.norm = norm
         self.dtype = dtype
 
@@ -309,7 +309,7 @@ def _run_after_optimizer(self, net, param_init_net, param, grad):
 
 class ConstantNorm(Regularizer):
     def __init__(self, norm=1.0):
-        super(ConstantNorm, self).__init__()
+        super().__init__()
         self.norm = norm
 
     def _run_after_optimizer(self, net, param_init_net, param, grad):
@@ -329,7 +329,7 @@ def _run_after_optimizer(self, net, param_init_net, param, grad):
 
 class SparseLpNorm(Regularizer):
     def __init__(self, p, reg_lambda):
-        super(SparseLpNorm, self).__init__()
+        super().__init__()
         assert p in (1.0, 2.0), "Sparse Lp regularization only implemented for p = 1.0 and p = 2.0."
         assert reg_lambda > 0, "factor ahead of regularization should be greater than 0."
         self.p = p
@@ -349,12 +349,12 @@ def _run_after_optimizer(self, net, param_init_net, param, grad):
 
 class SparseL1Norm(SparseLpNorm):
     def __init__(self, reg_lambda):
-        super(SparseL1Norm, self).__init__(p=1.0, reg_lambda=reg_lambda)
+        super().__init__(p=1.0, reg_lambda=reg_lambda)
 
 
 class SparseL2Norm(SparseLpNorm):
     def __init__(self, reg_lambda):
-        super(SparseL2Norm, self).__init__(p=2.0, reg_lambda=reg_lambda)
+        super().__init__(p=2.0, reg_lambda=reg_lambda)
 
 
 class LogBarrier(Regularizer):
@@ -369,7 +369,7 @@ def __init__(self, reg_lambda, discount_policy="inv", discount_options=None):
         similar to the learning rate. It is specified by a learning rate policy and
         corresponding options
         """
-        super(LogBarrier, self).__init__()
+        super().__init__()
         assert reg_lambda > 0, "factor ahead of regularization should be 0 or positive"
         self.reg_lambda = reg_lambda
         self.discount_policy = discount_policy
@@ -412,7 +412,7 @@ class BoundedGradientProjection(Regularizer):
     def __init__(
         self, lb=None, ub=None, left_open=False, right_open=False, epsilon=None
     ):
-        super(BoundedGradientProjection, self).__init__()
+        super().__init__()
         lb = float(lb) if lb is not None else None
         ub = float(ub) if ub is not None else None
         epsilon = float(epsilon) if epsilon is not None else self.kEpsilon
@@ -481,7 +481,7 @@ def __init__(self, reg_lambda, groups, stabilizing_val=0):
                 of the gradient operator of Sqrt has taken into stability into
                 consideration, this term won't be necessary.
         """
-        super(GroupL1Norm, self).__init__()
+        super().__init__()
         assert (
             (reg_lambda) >= 0
         ), "regularization weight should be 0 or positive"
diff --git a/caffe2/python/rnn_cell.py b/caffe2/python/rnn_cell.py
index 38407eaab83a..3ae0964c1081 100644
--- a/caffe2/python/rnn_cell.py
+++ b/caffe2/python/rnn_cell.py
@@ -42,7 +42,7 @@ def _RectifyNames(blob_references_or_names):
     return [_RectifyName(i) for i in blob_references_or_names]
 
 
-class RNNCell(object):
+class RNNCell:
     '''
     Base class for writing recurrent / stateful operations.
 
@@ -268,7 +268,7 @@ def _prepare_output_sequence(self, model, state_outputs):
         return state_outputs[output_sequence_index]
 
 
-class LSTMInitializer(object):
+class LSTMInitializer:
     def __init__(self, hidden_size):
         self.hidden_size = hidden_size
 
@@ -302,7 +302,7 @@ def __init__(
         activation=None,
         **kwargs
     ):
-        super(BasicRNNCell, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.drop_states = drop_states
         self.input_size = input_size
         self.hidden_size = hidden_size
@@ -403,7 +403,7 @@ def __init__(
         initializer=None,
         **kwargs
     ):
-        super(LSTMCell, self).__init__(initializer=initializer, **kwargs)
+        super().__init__(initializer=initializer, **kwargs)
         self.initializer = initializer or LSTMInitializer(
             hidden_size=hidden_size)
 
@@ -507,9 +507,7 @@ def __init__(
         initializer=None,
         **kwargs
     ):
-        super(LayerNormLSTMCell, self).__init__(
-            initializer=initializer, **kwargs
-        )
+        super().__init__(initializer=initializer, **kwargs)
         self.initializer = initializer or LSTMInitializer(
             hidden_size=hidden_size
         )
@@ -828,7 +826,7 @@ def __init__(
         assert 'is_test' in kwargs, "Argument 'is_test' is required"
         self.is_test = kwargs.pop('is_test')
         self.use_cudnn = use_cudnn
-        super(DropoutCell, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
         self.prepare_input = internal_cell.prepare_input
         self.get_output_state_index = internal_cell.get_output_state_index
@@ -888,7 +886,7 @@ def _apply_dropout(self, model, output):
         return output
 
 
-class MultiRNNCellInitializer(object):
+class MultiRNNCellInitializer:
     def __init__(self, cells):
         self.cells = cells
 
@@ -932,7 +930,7 @@ def __init__(self, cells, residual_output_layers=None, **kwargs):
 
         forward_only: used to construct inference-only network.
         '''
-        super(MultiRNNCell, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.cells = cells
 
         if residual_output_layers is None:
@@ -1117,7 +1115,7 @@ def __init__(
         attention_memory_optimization,
         **kwargs
     ):
-        super(AttentionCell, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.encoder_output_dim = encoder_output_dim
         self.encoder_outputs = encoder_outputs
         self.encoder_lengths = encoder_lengths
@@ -1414,7 +1412,7 @@ def __init__(
             forward_only=False,
             drop_states=False,
         )
-        super(LSTMWithAttentionCell, self).__init__(
+        super().__init__(
             encoder_output_dim=encoder_output_dim,
             encoder_outputs=encoder_outputs,
             encoder_lengths=encoder_lengths,
@@ -1453,7 +1451,7 @@ def __init__(
             forward_only=False,
             drop_states=False,
         )
-        super(MILSTMWithAttentionCell, self).__init__(
+        super().__init__(
             encoder_output_dim=encoder_output_dim,
             encoder_outputs=encoder_outputs,
             decoder_cell=decoder_cell,
diff --git a/caffe2/python/schema.py b/caffe2/python/schema.py
index eac09b67ab33..ab6ec29372e2 100644
--- a/caffe2/python/schema.py
+++ b/caffe2/python/schema.py
@@ -26,7 +26,7 @@
 from collections import OrderedDict, namedtuple
 from past.builtins import basestring
 from itertools import islice
-from six import StringIO
+from io import StringIO
 from typing import Sequence
 
 logger = logging.getLogger(__name__)
@@ -95,7 +95,7 @@ class Metadata(
 Metadata.__new__.__defaults__ = (None, None, None)
 
 
-class Field(object):
+class Field:
     """Represents an abstract field type in a dataset.
     """
 
@@ -218,7 +218,7 @@ def __init__(self, values, lengths_blob=None):
         self._items = _normalize_field(values)
         self.lengths._set_parent(self, 0)
         self._items._set_parent(self, 1)
-        super(List, self).__init__([self.lengths, self._items])
+        super().__init__([self.lengths, self._items])
 
     def field_names(self):
         value_fields = self._items.field_names()
@@ -295,7 +295,7 @@ def __init__(self, values, lengths_blob=None, evicted_values=None):
             self._evicted_values = _normalize_field(evicted_values)
         else:
             self._evicted_values = Scalar(np.int64, evicted_values)
-        super(ListWithEvicted, self).__init__(values, lengths_blob=lengths_blob)
+        super().__init__(values, lengths_blob=lengths_blob)
 
     def field_names(self):
         value_fields = self._items.field_names()
@@ -418,7 +418,7 @@ def __init__(self, *fields):
             self.fields[name] = self.fields[name] + field
         for id, (_, field) in enumerate(self.fields.items()):
             field._set_parent(self, id)
-        super(Struct, self).__init__(self.fields.values())
+        super().__init__(self.fields.values())
         self._frozen = True
 
     def _struct_from_nested_name(self, nested_name, field):
@@ -544,7 +544,7 @@ def __getattr__(self, item):
         if item.startswith('__'):
             raise AttributeError(item)
         try:
-            return super(Struct, self).__getattribute__("fields")[item]
+            return super().__getattribute__("fields")[item]
         except KeyError as e:
             raise AttributeError(item) from e
 
@@ -555,7 +555,7 @@ def __setattr__(self, key, value):
         # post initialization.
         if getattr(self, '_frozen', None) and not key.startswith('_'):
             raise TypeError('Struct.__setattr__() is disabled after __init__()')
-        super(Struct, self).__setattr__(key, value)
+        super().__setattr__(key, value)
 
     def __add__(self, other):
         """
@@ -725,7 +725,7 @@ class Scalar(Field):
     def __init__(self, dtype=None, blob=None, metadata=None):
         self._metadata = None
         self.set(dtype, blob, metadata, unsafe=True)
-        super(Scalar, self).__init__([])
+        super().__init__([])
 
     def field_names(self):
         return ['']
@@ -979,7 +979,7 @@ def from_dtype(dtype, _outer_shape=()):
     return Struct(*struct_fields)
 
 
-class _SchemaNode(object):
+class _SchemaNode:
     """This is a private class used to represent a Schema Node"""
 
     __slots__: Sequence[str] = ("name", "children", "type_str", "field")
diff --git a/caffe2/python/schema_test.py b/caffe2/python/schema_test.py
index bb9536e4430b..8f3ed4415fd4 100644
--- a/caffe2/python/schema_test.py
+++ b/caffe2/python/schema_test.py
@@ -82,7 +82,7 @@ class Subclass(schema.Struct):
 
     def testNormalizeField(self):
         s = schema.Struct(('field1', np.int32), ('field2', str))
-        self.assertEquals(
+        self.assertEqual(
             s,
             schema.Struct(
                 ('field1', schema.Scalar(dtype=np.int32)),
@@ -97,11 +97,11 @@ def testTuple(self):
             ('field_1', schema.Scalar(dtype=np.str)),
             ('field_2', schema.Scalar(dtype=np.float32))
         )
-        self.assertEquals(s, s2)
-        self.assertEquals(s[0], schema.Scalar(dtype=np.int32))
-        self.assertEquals(s[1], schema.Scalar(dtype=np.str))
-        self.assertEquals(s[2], schema.Scalar(dtype=np.float32))
-        self.assertEquals(
+        self.assertEqual(s, s2)
+        self.assertEqual(s[0], schema.Scalar(dtype=np.int32))
+        self.assertEqual(s[1], schema.Scalar(dtype=np.str))
+        self.assertEqual(s[2], schema.Scalar(dtype=np.float32))
+        self.assertEqual(
             s[2, 0],
             schema.Struct(
                 ('field_2', schema.Scalar(dtype=np.float32)),
@@ -110,19 +110,19 @@ def testTuple(self):
         )
         # test iterator behavior
         for i, (v1, v2) in enumerate(zip(s, s2)):
-            self.assertEquals(v1, v2)
-            self.assertEquals(s[i], v1)
-            self.assertEquals(s2[i], v1)
+            self.assertEqual(v1, v2)
+            self.assertEqual(s[i], v1)
+            self.assertEqual(s2[i], v1)
 
     def testRawTuple(self):
         s = schema.RawTuple(2)
-        self.assertEquals(
+        self.assertEqual(
             s, schema.Struct(
                 ('field_0', schema.Scalar()), ('field_1', schema.Scalar())
             )
         )
-        self.assertEquals(s[0], schema.Scalar())
-        self.assertEquals(s[1], schema.Scalar())
+        self.assertEqual(s[0], schema.Scalar())
+        self.assertEqual(s[1], schema.Scalar())
 
     def testStructIndexing(self):
         s = schema.Struct(
@@ -130,10 +130,10 @@ def testStructIndexing(self):
             ('field2', schema.List(schema.Scalar(dtype=str))),
             ('field3', schema.Struct()),
         )
-        self.assertEquals(s['field2'], s.field2)
-        self.assertEquals(s['field2'], schema.List(schema.Scalar(dtype=str)))
-        self.assertEquals(s['field3'], schema.Struct())
-        self.assertEquals(
+        self.assertEqual(s['field2'], s.field2)
+        self.assertEqual(s['field2'], schema.List(schema.Scalar(dtype=str)))
+        self.assertEqual(s['field3'], schema.Struct())
+        self.assertEqual(
             s['field2', 'field1'],
             schema.Struct(
                 ('field2', schema.List(schema.Scalar(dtype=str))),
@@ -147,8 +147,8 @@ def testListInStructIndexing(self):
             ('field1', schema.Scalar(dtype=np.int32)),
             ('field2', a)
         )
-        self.assertEquals(s['field2:lengths'], a.lengths)
-        self.assertEquals(s['field2:values'], a.items)
+        self.assertEqual(s['field2:lengths'], a.lengths)
+        self.assertEqual(s['field2:values'], a.items)
         with self.assertRaises(KeyError):
             s['fields2:items:non_existent']
         with self.assertRaises(KeyError):
@@ -160,9 +160,9 @@ def testListWithEvictedInStructIndexing(self):
             ('field1', schema.Scalar(dtype=np.int32)),
             ('field2', a)
         )
-        self.assertEquals(s['field2:lengths'], a.lengths)
-        self.assertEquals(s['field2:values'], a.items)
-        self.assertEquals(s['field2:_evicted_values'], a._evicted_values)
+        self.assertEqual(s['field2:lengths'], a.lengths)
+        self.assertEqual(s['field2:values'], a.items)
+        self.assertEqual(s['field2:_evicted_values'], a._evicted_values)
         with self.assertRaises(KeyError):
             s['fields2:items:non_existent']
         with self.assertRaises(KeyError):
@@ -177,8 +177,8 @@ def testMapInStructIndexing(self):
             ('field1', schema.Scalar(dtype=np.int32)),
             ('field2', a)
         )
-        self.assertEquals(s['field2:values:keys'], a.keys)
-        self.assertEquals(s['field2:values:values'], a.values)
+        self.assertEqual(s['field2:values:keys'], a.keys)
+        self.assertEqual(s['field2:values:values'], a.values)
         with self.assertRaises(KeyError):
             s['fields2:keys:non_existent']
 
diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py
index bf3c8e9a0d06..c2498cd800d8 100644
--- a/caffe2/python/scope_test.py
+++ b/caffe2/python/scope_test.py
@@ -35,69 +35,69 @@ def thread_runner(idx, testobj):
 class TestScope(unittest.TestCase):
 
     def testNamescopeBasic(self):
-        self.assertEquals(scope.CurrentNameScope(), "")
+        self.assertEqual(scope.CurrentNameScope(), "")
 
         with scope.NameScope("test_scope"):
-            self.assertEquals(scope.CurrentNameScope(), "test_scope/")
+            self.assertEqual(scope.CurrentNameScope(), "test_scope/")
 
-        self.assertEquals(scope.CurrentNameScope(), "")
+        self.assertEqual(scope.CurrentNameScope(), "")
 
     def testNamescopeAssertion(self):
-        self.assertEquals(scope.CurrentNameScope(), "")
+        self.assertEqual(scope.CurrentNameScope(), "")
 
         try:
             with scope.NameScope("test_scope"):
-                self.assertEquals(scope.CurrentNameScope(), "test_scope/")
+                self.assertEqual(scope.CurrentNameScope(), "test_scope/")
                 raise Exception()
         except Exception:
             pass
 
-        self.assertEquals(scope.CurrentNameScope(), "")
+        self.assertEqual(scope.CurrentNameScope(), "")
 
     def testEmptyNamescopeBasic(self):
-        self.assertEquals(scope.CurrentNameScope(), "")
+        self.assertEqual(scope.CurrentNameScope(), "")
 
         with scope.NameScope("test_scope"):
             with scope.EmptyNameScope():
-                self.assertEquals(scope.CurrentNameScope(), "")
-            self.assertEquals(scope.CurrentNameScope(), "test_scope/")
+                self.assertEqual(scope.CurrentNameScope(), "")
+            self.assertEqual(scope.CurrentNameScope(), "test_scope/")
 
     def testDevicescopeBasic(self):
-        self.assertEquals(scope.CurrentDeviceScope(), None)
+        self.assertEqual(scope.CurrentDeviceScope(), None)
 
         dsc = core.DeviceOption(workspace.GpuDeviceType, 9)
         with scope.DeviceScope(dsc):
-            self.assertEquals(scope.CurrentDeviceScope(), dsc)
+            self.assertEqual(scope.CurrentDeviceScope(), dsc)
 
-        self.assertEquals(scope.CurrentDeviceScope(), None)
+        self.assertEqual(scope.CurrentDeviceScope(), None)
 
     def testEmptyDevicescopeBasic(self):
-        self.assertEquals(scope.CurrentDeviceScope(), None)
+        self.assertEqual(scope.CurrentDeviceScope(), None)
 
         dsc = core.DeviceOption(workspace.GpuDeviceType, 9)
         with scope.DeviceScope(dsc):
-            self.assertEquals(scope.CurrentDeviceScope(), dsc)
+            self.assertEqual(scope.CurrentDeviceScope(), dsc)
             with scope.EmptyDeviceScope():
-                self.assertEquals(scope.CurrentDeviceScope(), None)
-            self.assertEquals(scope.CurrentDeviceScope(), dsc)
-        self.assertEquals(scope.CurrentDeviceScope(), None)
+                self.assertEqual(scope.CurrentDeviceScope(), None)
+            self.assertEqual(scope.CurrentDeviceScope(), dsc)
+        self.assertEqual(scope.CurrentDeviceScope(), None)
 
     def testDevicescopeAssertion(self):
-        self.assertEquals(scope.CurrentDeviceScope(), None)
+        self.assertEqual(scope.CurrentDeviceScope(), None)
 
         dsc = core.DeviceOption(workspace.GpuDeviceType, 9)
 
         try:
             with scope.DeviceScope(dsc):
-                self.assertEquals(scope.CurrentDeviceScope(), dsc)
+                self.assertEqual(scope.CurrentDeviceScope(), dsc)
                 raise Exception()
         except Exception:
             pass
 
-        self.assertEquals(scope.CurrentDeviceScope(), None)
+        self.assertEqual(scope.CurrentDeviceScope(), None)
 
     def testTags(self):
-        self.assertEquals(scope.CurrentDeviceScope(), None)
+        self.assertEqual(scope.CurrentDeviceScope(), None)
 
         extra_info1 = ["key1:value1"]
         extra_info2 = ["key2:value2"]
@@ -107,19 +107,19 @@ def testTags(self):
         extra_info_1_2_3 = ["key1:value1", "key2:value2", "key3:value3"]
 
         with scope.DeviceScope(core.DeviceOption(0, extra_info=extra_info1)):
-            self.assertEquals(scope.CurrentDeviceScope().extra_info, extra_info1)
+            self.assertEqual(scope.CurrentDeviceScope().extra_info, extra_info1)
 
             with scope.DeviceScope(core.DeviceOption(0, extra_info=extra_info2)):
-                self.assertEquals(scope.CurrentDeviceScope().extra_info, extra_info_1_2)
+                self.assertEqual(scope.CurrentDeviceScope().extra_info, extra_info_1_2)
 
                 with scope.DeviceScope(core.DeviceOption(0, extra_info=extra_info3)):
-                    self.assertEquals(
+                    self.assertEqual(
                         scope.CurrentDeviceScope().extra_info, extra_info_1_2_3
                     )
 
-                self.assertEquals(scope.CurrentDeviceScope().extra_info, extra_info_1_2)
-            self.assertEquals(scope.CurrentDeviceScope().extra_info, extra_info1)
-        self.assertEquals(scope.CurrentDeviceScope(), None)
+                self.assertEqual(scope.CurrentDeviceScope().extra_info, extra_info_1_2)
+            self.assertEqual(scope.CurrentDeviceScope().extra_info, extra_info1)
+        self.assertEqual(scope.CurrentDeviceScope(), None)
 
     def testMultiThreaded(self):
         """
@@ -127,8 +127,8 @@ def testMultiThreaded(self):
         and don't interfere
         """
         global SUCCESS_COUNT
-        self.assertEquals(scope.CurrentNameScope(), "")
-        self.assertEquals(scope.CurrentDeviceScope(), None)
+        self.assertEqual(scope.CurrentNameScope(), "")
+        self.assertEqual(scope.CurrentDeviceScope(), None)
 
         threads = []
         for i in range(4):
@@ -140,13 +140,13 @@ def testMultiThreaded(self):
             t.start()
 
         with scope.NameScope("master"):
-            self.assertEquals(scope.CurrentDeviceScope(), None)
-            self.assertEquals(scope.CurrentNameScope(), "master/")
+            self.assertEqual(scope.CurrentDeviceScope(), None)
+            self.assertEqual(scope.CurrentNameScope(), "master/")
             for t in threads:
                 t.join()
 
-            self.assertEquals(scope.CurrentNameScope(), "master/")
-            self.assertEquals(scope.CurrentDeviceScope(), None)
+            self.assertEqual(scope.CurrentNameScope(), "master/")
+            self.assertEqual(scope.CurrentDeviceScope(), None)
 
         # Ensure all threads succeeded
-        self.assertEquals(SUCCESS_COUNT, 4)
+        self.assertEqual(SUCCESS_COUNT, 4)
diff --git a/caffe2/python/serialized_test/serialized_test_util.py b/caffe2/python/serialized_test/serialized_test_util.py
index 7fa92a99b3c9..e84fd640a2ac 100644
--- a/caffe2/python/serialized_test/serialized_test_util.py
+++ b/caffe2/python/serialized_test/serialized_test_util.py
@@ -232,7 +232,7 @@ def assertReferenceChecks(
         outputs_to_check=None,
         ensure_outputs_are_inferred=False,
     ):
-        outs = super(SerializedTestCase, self).assertReferenceChecks(
+        outs = super().assertReferenceChecks(
             device_option,
             op,
             inputs,
diff --git a/caffe2/python/session.py b/caffe2/python/session.py
index fb2b57c4f5ee..edc32ccf808f 100644
--- a/caffe2/python/session.py
+++ b/caffe2/python/session.py
@@ -10,14 +10,14 @@
 from caffe2.python.task import Cluster, Task, TaskGroup, WorkspaceType
 
 
-class CompiledRunnable(object):
+class CompiledRunnable:
     """ Wrapper for compiled runnable returned from session.compile() """
     def __init__(self, obj, session_class):
         self.obj = obj
         self.session_class = session_class
 
 
-class Session(object):
+class Session:
     """
     Allows to run Nets, ExecutionSteps, Plans, Tasks and TaskGroups.
     A session can potentially run in multiple nodes concurrently.
diff --git a/caffe2/python/task.py b/caffe2/python/task.py
index 3eda48f9fca5..8a332de0767a 100644
--- a/caffe2/python/task.py
+++ b/caffe2/python/task.py
@@ -89,7 +89,7 @@ def kwargs(self):
         return self._kwargs
 
 
-class WorkspaceType(object):
+class WorkspaceType:
     """
     Determines whether tasks of a TaskGroup will run directly at the global
     workspace, which is kept alive across runs, or whether a new child
@@ -351,7 +351,7 @@ def __repr__(self):
             self.remote_nets())
 
 
-class TaskOutput(object):
+class TaskOutput:
     """
     Represents the output of a task. An output can be a blob,
     a list of blob, or a record.
@@ -409,7 +409,7 @@ def final_output(blob_or_record):
     return cur_task.add_output(blob_or_record)
 
 
-class TaskOutputList(object):
+class TaskOutputList:
     """ Keeps a list of outputs for a task """
     def __init__(self, outputs=None):
         self.outputs = outputs or []
@@ -535,7 +535,7 @@ def __init__(
         self._num_instances = num_instances
 
     def __enter__(self):
-        super(Task, self).__enter__()
+        super().__enter__()
 
         # temporarily remove from _tasks_to_add to ensure correct order
         if self.group is not None:
@@ -548,7 +548,7 @@ def __enter__(self):
         return self
 
     def __exit__(self, type, value, traceback):
-        super(Task, self).__exit__(type, value, traceback)
+        super().__exit__(type, value, traceback)
 
         self._net_builder.__exit__(type, value, traceback)
         if type is None:
@@ -644,7 +644,7 @@ def __repr__(self):
             self.name, self.node, self.outputs())
 
 
-class SetupNets(object):
+class SetupNets:
     """
     Allow to register a list of nets to be run at initialization
     and finalization of Tasks or TaskGroups.
diff --git a/caffe2/python/transformations.py b/caffe2/python/transformations.py
index fc1bad34b201..78d3bc8b85ff 100644
--- a/caffe2/python/transformations.py
+++ b/caffe2/python/transformations.py
@@ -21,7 +21,7 @@
 import caffe2.python._import_c_extension as C
 
 
-class Transformer(object):
+class Transformer:
     def __init__(self):
         pass
 
diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py
index fa11109cfc9b..dbc906f7d405 100644
--- a/caffe2/python/transformations_test.py
+++ b/caffe2/python/transformations_test.py
@@ -44,7 +44,7 @@ def _fuse_nnpack_convrelu(self, net, expected_result_num_ops,
     expected_activation_arg=True):
         self._add_nnpack(net)
         transformer.FuseNNPACKConvRelu(net)
-        self.assertEquals(tu.numOps(net), expected_result_num_ops)
+        self.assertEqual(tu.numOps(net), expected_result_num_ops)
         has_activation_arg = False
         for arg in net.Proto().op[0].arg:
             if tu.str_compare(arg.name, "activation"):
diff --git a/caffe2/python/trt/test_trt.py b/caffe2/python/trt/test_trt.py
index 6f9426d6a93a..495dc27fcd5b 100644
--- a/caffe2/python/trt/test_trt.py
+++ b/caffe2/python/trt/test_trt.py
@@ -21,7 +21,7 @@
 import tarfile
 import tempfile
 import shutil
-from six.moves.urllib.request import urlretrieve
+from urllib.request import urlretrieve
 
 def _print_net(net):
     for i in net.external_input:
diff --git a/caffe2/python/tt_core_test.py b/caffe2/python/tt_core_test.py
index 0cee3b254720..7c12fc7aaeb8 100644
--- a/caffe2/python/tt_core_test.py
+++ b/caffe2/python/tt_core_test.py
@@ -52,7 +52,7 @@ def test_full_tt_svd(self):
         Y_full_tt = workspace.FetchBlob("Y").flatten()
 
         assert(len(Y_fc) == len(Y_full_tt))
-        self.assertAlmostEquals(np.linalg.norm(Y_fc - Y_full_tt), 0, delta=1e-3)
+        self.assertAlmostEqual(np.linalg.norm(Y_fc - Y_full_tt), 0, delta=1e-3)
 
         # Testing TT-decomposition with minimal ranks
         sparse_tt_ranks = [1, 1, 1, 1, 1]
@@ -74,7 +74,7 @@ def test_full_tt_svd(self):
         Y_sparse_tt = workspace.FetchBlob("Y").flatten()
 
         assert(len(Y_fc) == len(Y_sparse_tt))
-        self.assertAlmostEquals(np.linalg.norm(Y_fc - Y_sparse_tt),
+        self.assertAlmostEqual(np.linalg.norm(Y_fc - Y_sparse_tt),
                                 39.974, delta=1e-3)
 
 
diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py
index 7c8a99c8a657..02a77e74681a 100644
--- a/caffe2/python/utils.py
+++ b/caffe2/python/utils.py
@@ -14,7 +14,6 @@
 import copy
 import functools
 import numpy as np
-from six import integer_types, binary_type, text_type, string_types
 
 OPTIMIZER_ITERATION_NAME = "optimizer_iteration"
 OPTIMIZER_ITERATION_LR_NAME = "optimizer_iteration_lr"
@@ -30,7 +29,7 @@ def OpAlmostEqual(op_a, op_b, ignore_fields=None):
     if not isinstance(ignore_fields, list):
         ignore_fields = [ignore_fields]
 
-    assert all(isinstance(f, text_type) for f in ignore_fields), (
+    assert all(isinstance(f, str) for f in ignore_fields), (
         'Expect each field is text type, but got {}'.format(ignore_fields))
 
     def clean_op(op):
@@ -145,13 +144,13 @@ def MakeArgument(key, value):
 
     if type(value) is float:
         argument.f = value
-    elif type(value) in integer_types or type(value) is bool:
+    elif type(value) in [bool, int]:
         # We make a relaxation that a boolean variable will also be stored as
         # int.
         argument.i = value
-    elif isinstance(value, binary_type):
+    elif isinstance(value, bytes):
         argument.s = value
-    elif isinstance(value, text_type):
+    elif isinstance(value, str):
         argument.s = value.encode('utf-8')
     elif isinstance(value, caffe2_pb2.NetDef):
         argument.n.CopyFrom(value)
@@ -162,16 +161,16 @@ def MakeArgument(key, value):
             v.item() if type(v) is np.float_ else v for v in value
         )
     elif iterable and all(
-        type(v) in integer_types or type(v) in [bool, np.int_] for v in value
+        type(v) in [bool, int, np.int_] for v in value
     ):
         argument.ints.extend(
             v.item() if type(v) is np.int_ else v for v in value
         )
     elif iterable and all(
-        isinstance(v, binary_type) or isinstance(v, text_type) for v in value
+        isinstance(v, bytes) or isinstance(v, str) for v in value
     ):
         argument.strings.extend(
-            v.encode('utf-8') if isinstance(v, text_type) else v
+            v.encode('utf-8') if isinstance(v, str) else v
             for v in value
         )
     elif iterable and all(isinstance(v, caffe2_pb2.NetDef) for v in value):
@@ -277,7 +276,7 @@ def ResetBlobs(blobs):
     )
 
 
-class DebugMode(object):
+class DebugMode:
     '''
     This class allows to drop you into an interactive debugger
     if there is an unhandled exception in your python script
@@ -384,7 +383,7 @@ def EnumClassKeyVals(cls):
     for k in dir(cls):
         if k == k.upper():
             v = getattr(cls, k)
-            if isinstance(v, string_types):
+            if isinstance(v, str):
                 assert v not in enum.values(), (
                     "Failed to resolve {} as Enum: "
                     "duplicate entries {}={}, {}={}".format(
diff --git a/caffe2/python/visualize.py b/caffe2/python/visualize.py
index 626668841a6b..92190d1e62a0 100644
--- a/caffe2/python/visualize.py
+++ b/caffe2/python/visualize.py
@@ -25,7 +25,7 @@ def ChannelLast(arr):
     return arr.swapaxes(ndim - 3, ndim - 2).swapaxes(ndim - 2, ndim - 1)
 
 
-class PatchVisualizer(object):
+class PatchVisualizer:
     """PatchVisualizer visualizes patches.
   """
 
@@ -139,7 +139,7 @@ def get_patch_shape(self, patch):
 """
 
 
-class NHWC(object):
+class NHWC:
     @staticmethod
     def ShowSingle(*args, **kwargs):
         _default_visualizer.ShowSingle(*args, **kwargs)
@@ -157,7 +157,7 @@ def ShowChannels(*args, **kwargs):
         _default_visualizer.ShowChannels(*args, **kwargs)
 
 
-class NCHW(object):
+class NCHW:
     @staticmethod
     def ShowSingle(patch, *args, **kwargs):
         _default_visualizer.ShowSingle(ChannelLast(patch), *args, **kwargs)
diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py
index ea546cb30a1b..e7fc0c3ec825 100644
--- a/caffe2/python/workspace.py
+++ b/caffe2/python/workspace.py
@@ -438,6 +438,10 @@ def FetchInt8BlobRealVal(name):
         np.float32) * int8_blob.scale
 
 
+def RemoveBlob(name) -> None:
+    ws = C.Workspace.current
+    _Workspace_remove_blob(ws, name)
+
 def _Workspace_fetch_int8_blob(ws, name):
     """Fetches an Int8 blob from the workspace. It shared backend implementation
     with FetchBlob but it is recommended when fetching Int8 Blobs
@@ -526,7 +530,7 @@ def GetNameScope():
     return scope.CurrentNameScope()
 
 
-class _BlobDict(object):
+class _BlobDict:
     """Provides python dict compatible way to do fetching and feeding"""
 
     def __getitem__(self, key):
diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py
index b434b5e748cc..f359efc05050 100644
--- a/caffe2/python/workspace_test.py
+++ b/caffe2/python/workspace_test.py
@@ -134,13 +134,13 @@ def testTensorAccess(self):
         """ feed (copy) data into tensor """
         val = np.array([[b"abc", b"def"], [b"ghi", b"jkl"]], dtype=np.object)
         tensor.feed(val)
-        self.assertEquals(tensor.data[0, 0], b"abc")
+        self.assertEqual(tensor.data[0, 0], b"abc")
         np.testing.assert_array_equal(ws.blobs["tensor"].fetch(), val)
 
         val = np.array([1.1, 10.2])
         tensor.feed(val)
         val[0] = 5.2
-        self.assertEquals(tensor.data[0], 1.1)
+        self.assertEqual(tensor.data[0], 1.1)
 
         """ fetch (copy) data from tensor """
         val = np.array([1.1, 1.2])
@@ -149,7 +149,7 @@ def testTensorAccess(self):
         tensor.data[0] = 5.2
         val3 = tensor.fetch()
         np.testing.assert_array_equal(val, val2)
-        self.assertEquals(val3[0], 5.2)
+        self.assertEqual(val3[0], 5.2)
 
     def testFetchFeedBlob(self):
         self.assertEqual(
@@ -294,8 +294,8 @@ def testFetchBlobs(self):
         workspace.FeedBlob("s1", s1)
         workspace.FeedBlob("s2", s2)
         fetch1, fetch2 = workspace.FetchBlobs(["s1", "s2"])
-        self.assertEquals(s1, fetch1)
-        self.assertEquals(s2, fetch2)
+        self.assertEqual(s1, fetch1)
+        self.assertEqual(s2, fetch2)
 
     def testFetchFeedViaBlobDict(self):
         self.assertEqual(
@@ -768,7 +768,7 @@ def test_apply_transform_if_faster(self, value):
 
 class MyModule(torch.jit.ScriptModule):
     def __init__(self):
-        super(MyModule, self).__init__()
+        super().__init__()
         self.mult = torch.nn.Parameter(torch.tensor([[1, 2, 3, 4, 5.0]]))
 
     @torch.jit.script_method
diff --git a/caffe2/serialize/crc_alt.h b/caffe2/serialize/crc_alt.h
index 3299327be430..2d5ebd0350a5 100644
--- a/caffe2/serialize/crc_alt.h
+++ b/caffe2/serialize/crc_alt.h
@@ -145,9 +145,11 @@ uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previo
 #ifdef __GNUC__
   #define PREFETCH(location) __builtin_prefetch(location)
 #else
+#ifndef PREFETCH
   // no prefetching
   #define PREFETCH(location) ;
 #endif
+#endif
 
 // abort if byte order is undefined
 #ifndef __BYTE_ORDER
diff --git a/caffe2/share/contrib/nnpack/conv_op.cc b/caffe2/share/contrib/nnpack/conv_op.cc
index 6eafe4ec1f52..db1f124aeb3a 100644
--- a/caffe2/share/contrib/nnpack/conv_op.cc
+++ b/caffe2/share/contrib/nnpack/conv_op.cc
@@ -161,8 +161,7 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
   ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
   const int oH = Y->dim32(2), oW = Y->dim32(3);
 
-  // NOLINTNEXTLINE(modernize-use-nullptr)
-  const float* biasData = NULL;
+  const float* biasData = nullptr;
   if (InputSize() == 3) {
     /* Convolution with bias */
     auto& bias = Input(2);
diff --git a/caffe2/utils/threadpool/WorkersPool.h b/caffe2/utils/threadpool/WorkersPool.h
index d847ffca6817..e210db6ca0fd 100644
--- a/caffe2/utils/threadpool/WorkersPool.h
+++ b/caffe2/utils/threadpool/WorkersPool.h
@@ -213,8 +213,8 @@ class BlockingCounter {
 
 // A workload for a worker.
 struct Task {
-  Task() {}
-  virtual ~Task() {}
+  Task() = default;
+  virtual ~Task() = default;
   virtual void Run() = 0;
 };
 
@@ -331,7 +331,7 @@ class alignas(kGEMMLOWPCacheLineSize) Worker {
 
 class WorkersPool {
  public:
-  WorkersPool() {}
+  WorkersPool() = default;
 
   void Execute(const std::vector<std::shared_ptr<Task>>& tasks) {
     CAFFE_ENFORCE_GE(tasks.size(), 1);
diff --git a/cmake/Caffe2Config.cmake.in b/cmake/Caffe2Config.cmake.in
index 8045c87598df..53e9af1a68bb 100644
--- a/cmake/Caffe2Config.cmake.in
+++ b/cmake/Caffe2Config.cmake.in
@@ -13,9 +13,6 @@ set(CAFFE2_VERSION "@CAFFE2_VERSION@")
 # Utils functions.
 include("${CMAKE_CURRENT_LIST_DIR}/public/utils.cmake")
 
-# Include threads lib.
-include("${CMAKE_CURRENT_LIST_DIR}/public/threads.cmake")
-
 # Depending on whether Caffe2 uses gflags during compile time or
 # not, invoke gflags.
 if(@USE_GFLAGS@)
@@ -87,21 +84,20 @@ if(@USE_CUDA@)
   # If Caffe2 was compiled with the libraries below, they must
   # be found again when including the Caffe2 target.
   set(CAFFE2_USE_CUDA @USE_CUDA@)
-  set(CAFFE2_USE_CUDNN @USE_CUDNN@)
   set(CAFFE2_USE_TENSORRT @USE_TENSORRT@)
+
+  # Add current directory to module path so we pick up FindCUDAToolkit.cmake
+  set(old_CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}")
+  list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
   include("${CMAKE_CURRENT_LIST_DIR}/public/cuda.cmake")
+  set(CMAKE_MODULE_PATH "${old_CMAKE_MODULE_PATH}")
+
   if(@CAFFE2_USE_CUDA@ AND NOT CAFFE2_USE_CUDA)
     message(FATAL_ERROR
       "Your installed Caffe2 version uses CUDA but I cannot find the CUDA "
       "libraries. Please set the proper CUDA prefixes and / or install "
       "CUDA.")
   endif()
-  if(@CAFFE2_USE_CUDNN@ AND NOT CAFFE2_USE_CUDNN)
-    message(FATAL_ERROR
-      "Your installed Caffe2 version uses cuDNN but I cannot find the cuDNN "
-      "libraries. Please set the proper cuDNN prefixes and / or install "
-      "cuDNN.")
-  endif()
   if(@CAFFE2_USE_TENSORRT@ AND NOT CAFFE2_USE_TENSORRT)
     message(FATAL_ERROR
       "Your installed Caffe2 version uses TensorRT but I cannot find the TensorRT "
diff --git a/cmake/CheckAbi.cmake b/cmake/CheckAbi.cmake
new file mode 100644
index 000000000000..e483510e583a
--- /dev/null
+++ b/cmake/CheckAbi.cmake
@@ -0,0 +1,27 @@
+if(DEFINED GLIBCXX_USE_CXX11_ABI)
+  message(STATUS "_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI} is already defined as a cmake variable")
+  return()
+endif()
+
+# XXX This ABI check cannot be run with arm-linux-androideabi-g++
+message(STATUS "${CMAKE_CXX_COMPILER} ${PROJECT_SOURCE_DIR}/torch/abi-check.cpp -o ${CMAKE_BINARY_DIR}/abi-check")
+execute_process(
+  COMMAND
+  "${CMAKE_CXX_COMPILER}"
+  "${PROJECT_SOURCE_DIR}/torch/abi-check.cpp"
+  "-o"
+  "${CMAKE_BINARY_DIR}/abi-check"
+  RESULT_VARIABLE ABI_CHECK_COMPILE_RESULT)
+if(ABI_CHECK_COMPILE_RESULT)
+  message(FATAL_ERROR "Could not compile ABI Check: ${ABI_CHECK_COMPILE_RESULT}")
+  set(GLIBCXX_USE_CXX11_ABI 0)
+endif()
+execute_process(
+  COMMAND "${CMAKE_BINARY_DIR}/abi-check"
+  RESULT_VARIABLE ABI_CHECK_RESULT
+  OUTPUT_VARIABLE GLIBCXX_USE_CXX11_ABI)
+if(ABI_CHECK_RESULT)
+  message(WARNING "Could not run ABI Check: ${ABI_CHECK_RESULT}")
+  set(GLIBCXX_USE_CXX11_ABI 0)
+endif()
+message(STATUS "Determined _GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}")
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index cb1a201912e0..e7a06a0ab7fb 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -53,7 +53,7 @@ if(USE_CUDA)
       caffe2_update_option(USE_NVRTC OFF)
     endif()
     if(CAFFE2_USE_CUDNN)
-      list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn-public)
+      list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS torch::cudnn)
     else()
       caffe2_update_option(USE_CUDNN OFF)
     endif()
@@ -84,49 +84,11 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_BUILD_MOBILE)
   enable_ubsan()
 endif()
 
-# For MSVC,
-# 1. Remove /Zi, /ZI and /Z7 for Release, MinSizeRel and Default builds
-# 2. Switch off incremental linking in debug builds
-# 3. If MSVC_Z7_OVERRIDE is ON, then /Zi and /ZI will be replaced with /Z7
-#    for Debug and RelWithDebInfo builds
-if(MSVC)
-  # skip unwanted includes from windows.h
-  add_definitions(-DWIN32_LEAN_AND_MEAN)
-
-  # Windows SDK broke compatibility since version 25131, but introduced this define for backward compatibility.
-  add_definitions(-D_UCRT_LEGACY_INFINITY)
-
-  foreach(flag_var
-      CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL
-      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL)
-    if(${flag_var} MATCHES "/Z[iI7]")
-      string(REGEX REPLACE "/Z[iI7]" "" ${flag_var} "${${flag_var}}")
-    endif()
-  endforeach(flag_var)
-  if(MSVC_Z7_OVERRIDE)
-    foreach(flag_var
-        CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELWITHDEBINFO
-        CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/Z[iI]")
-        string(REGEX REPLACE "/Z[iI]" "/Z7" ${flag_var} "${${flag_var}}")
-      endif()
-    endforeach(flag_var)
-  endif(MSVC_Z7_OVERRIDE)
-  foreach(flag_var
-      CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
-      CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
-      CMAKE_SHARED_LINKER_FLAGS_DEBUG CMAKE_STATIC_LINKER_FLAGS_DEBUG
-      CMAKE_EXE_LINKER_FLAGS_DEBUG CMAKE_MODULE_LINKER_FLAGS_DEBUG)
-    if(${flag_var} MATCHES "/INCREMENTAL" AND NOT ${flag_var} MATCHES "/INCREMENTAL:NO")
-      string(REGEX REPLACE "/INCREMENTAL" "/INCREMENTAL:NO" ${flag_var} "${${flag_var}}")
-    endif()
-  endforeach(flag_var)
-endif(MSVC)
-
 # ---[ Threads
-include(${CMAKE_CURRENT_LIST_DIR}/public/threads.cmake)
-if(TARGET caffe2::Threads)
-  list(APPEND Caffe2_PUBLIC_DEPENDENCY_LIBS caffe2::Threads)
+find_package(Threads REQUIRED)
+if(TARGET Threads::Threads)
+  list(APPEND Caffe2_DEPENDENCY_LIBS Threads::Threads)
+  add_library(caffe2::Threads ALIAS Threads::Threads)
 else()
   message(FATAL_ERROR
       "Cannot find threading library. Caffe2 requires Threads to compile.")
@@ -257,7 +219,6 @@ endif()
 if(NOT INTERN_BUILD_MOBILE)
   set(AT_MKL_ENABLED 0)
   set(AT_MKL_SEQUENTIAL 0)
-  set(AT_MKL_MT 0)
   set(USE_BLAS 1)
   if(NOT (ATLAS_FOUND OR BLIS_FOUND OR GENERIC_BLAS_FOUND OR MKL_FOUND OR OpenBLAS_FOUND OR VECLIB_FOUND OR FlexiBLAS_FOUND))
     message(WARNING "Preferred BLAS (" ${BLAS} ") cannot be found, now searching for a general BLAS library")
@@ -271,10 +232,6 @@ if(NOT INTERN_BUILD_MOBILE)
     if("${MKL_THREADING}" STREQUAL "SEQ")
       set(AT_MKL_SEQUENTIAL 1)
     endif()
-    if(MSVC AND MKL_LIBRARIES MATCHES ".*libiomp5md\\.lib.*")
-      add_definitions(-D_OPENMP_NOFORCE_MANIFEST)
-      set(AT_MKL_MT 1)
-    endif()
     set(AT_MKL_ENABLED 1)
   endif()
 elseif(INTERN_USE_EIGEN_BLAS)
@@ -632,11 +589,24 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
     set(XNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "")
     set(XNNPACK_BUILD_TESTS OFF CACHE BOOL "")
 
+    # Disable ARM BF16 and FP16 vector for now; unused and causes build failures because
+    # these new ISA features may not be supported on older compilers
+    set(XNNPACK_ENABLE_ARM_BF16 OFF CACHE BOOL "")
+    set(XNNPACK_ENABLE_ARM_FP16_VECTOR OFF CACHE BOOL "")
+
+    # Setting this global PIC flag for all XNNPACK targets.
+    # This is needed for Object libraries within XNNPACK which must
+    # be PIC to successfully link this static libXNNPACK with pytorch
+    set(__caffe2_CMAKE_POSITION_INDEPENDENT_CODE_FLAG ${CMAKE_POSITION_INDEPENDENT_CODE})
+    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
     add_subdirectory(
       "${XNNPACK_SOURCE_DIR}"
       "${CONFU_DEPENDENCIES_BINARY_DIR}/XNNPACK")
 
-    set_property(TARGET XNNPACK PROPERTY POSITION_INDEPENDENT_CODE ON)
+    # Revert to whatever it was before
+    set(CMAKE_POSITION_INDEPENDENT_CODE ${__caffe2_CMAKE_POSITION_INDEPENDENT_CODE_FLAG})
+
     # Workaround for https://github.com/pytorch/pytorch/issues/47292
     if(CMAKE_BUILD_TYPE STREQUAL "Debug" AND CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.5.0))
       # Compiling qu8-requantization/precise-psimd.c without any optimization flags on gcc-7.4 or older i
@@ -1072,7 +1042,7 @@ if(BUILD_PYTHON)
 
   # These should fill in the rest of the variables, like versions, but resepct
   # the variables we set above
-  set(Python_ADDITIONAL_VERSIONS ${PYTHON_VERSION} 3.8 3.7)
+  set(Python_ADDITIONAL_VERSIONS ${PYTHON_VERSION} 3.8)
   find_package(PythonInterp 3.0)
   find_package(PythonLibs 3.0)
 
@@ -1080,9 +1050,9 @@ if(BUILD_PYTHON)
     message(FATAL_ERROR
       "Found Python libraries version ${PYTHONLIBS_VERSION_STRING}. Python 2 has reached end-of-life and is no longer supported by PyTorch.")
   endif()
-  if(${PYTHONLIBS_VERSION_STRING} VERSION_LESS 3.7)
+  if(${PYTHONLIBS_VERSION_STRING} VERSION_LESS 3.8)
     message(FATAL_ERROR
-      "Found Python libraries version ${PYTHONLIBS_VERSION_STRING}. Python 3.6 is no longer supported by PyTorch.")
+      "Found Python libraries version ${PYTHONLIBS_VERSION_STRING}. Python < 3.8 is no longer supported by PyTorch.")
   endif()
 
   # When building pytorch, we pass this in directly from setup.py, and
@@ -1137,6 +1107,9 @@ message(STATUS "pybind11 include dirs: " "${pybind11_INCLUDE_DIRS}")
 add_library(pybind::pybind11 INTERFACE IMPORTED)
 target_include_directories(pybind::pybind11 SYSTEM INTERFACE ${pybind11_INCLUDE_DIRS})
 target_link_libraries(pybind::pybind11 INTERFACE python::python)
+if(APPLE)
+  target_link_options(pybind::pybind11 INTERFACE -undefined dynamic_lookup)
+endif()
 
 # ---[ MPI
 if(USE_MPI)
@@ -1170,72 +1143,20 @@ if(USE_MPI)
 endif()
 
 # ---[ OpenMP
-if(USE_OPENMP)
-  # OpenMP support?
-  set(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
-
-  # macOS + GCC
-  if(APPLE AND CMAKE_COMPILER_IS_GNUCC)
-    exec_program(uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
-    string(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
-    message(STATUS "macOS Darwin version: ${DARWIN_VERSION}")
-    if(DARWIN_VERSION GREATER 9)
-      set(APPLE_OPENMP_SUCKS 1)
-    endif(DARWIN_VERSION GREATER 9)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
-      OUTPUT_VARIABLE GCC_VERSION)
-    if(APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
-      message(WARNING "Disabling OpenMP (unstable with this version of GCC). "
-        "Install GCC >= 4.6.2 or change your OS to enable OpenMP.")
-      add_compile_options(-Wno-unknown-pragmas)
-      set(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
-    endif()
-  endif()
-
-  if("${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC"
-    AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-    message(STATUS "Setting OpenMP flags for clang-cl")
-    set(OpenMP_CXX_FLAGS "-Xclang -fopenmp")
-    set(OpenMP_C_FLAGS "-Xclang -fopenmp")
-    set(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP")
-    set(OPENMP_FOUND ON CACHE BOOL "OpenMP Support found")
-    if(NOT MKL_FOUND)
-      execute_process(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE clang_version_output)
-      string(REGEX REPLACE ".*InstalledDir: ([^\n]+).*" "\\1" CLANG_BINDIR ${clang_version_output})
-
-      get_filename_component(CLANG_ROOT ${CLANG_BINDIR} DIRECTORY)
-      set(CLANG_OPENMP_LIBRARY "${CLANG_ROOT}/lib/libiomp5md.lib")
-
-      if(NOT TARGET caffe2::openmp)
-        add_library(caffe2::openmp INTERFACE IMPORTED)
-      endif()
-
-      set_property(
-        TARGET caffe2::openmp PROPERTY INTERFACE_LINK_LIBRARIES
-        ${CLANG_OPENMP_LIBRARY})
-
-      list(APPEND Caffe2_PUBLIC_DEPENDENCY_LIBS caffe2::openmp)
-    endif()
-  endif()
-
-  if(WITH_OPENMP AND NOT CHECKED_OPENMP)
-    find_package(OpenMP QUIET)
-    set(CHECKED_OPENMP ON CACHE BOOL "already checked for OpenMP")
-
-    # OPENMP_FOUND is not cached in FindOpenMP.cmake (all other variables are cached)
-    # see https://github.com/Kitware/CMake/blob/master/Modules/FindOpenMP.cmake
-    set(OPENMP_FOUND ${OPENMP_FOUND} CACHE BOOL "OpenMP Support found")
-  endif()
-
+if(USE_OPENMP AND NOT TARGET caffe2::openmp)
+  include(${CMAKE_CURRENT_LIST_DIR}/Modules/FindOpenMP.cmake)
   if(OPENMP_FOUND)
     message(STATUS "Adding OpenMP CXX_FLAGS: " ${OpenMP_CXX_FLAGS})
-    if("${OpenMP_CXX_LIBRARIES}" STREQUAL "")
-        message(STATUS "No OpenMP library needs to be linked against")
-    else()
-        message(STATUS "Will link against OpenMP libraries: ${OpenMP_CXX_LIBRARIES}")
+    if(OpenMP_CXX_LIBRARIES)
+      message(STATUS "Will link against OpenMP libraries: ${OpenMP_CXX_LIBRARIES}")
+    endif()
+    add_library(caffe2::openmp INTERFACE IMPORTED)
+    target_link_libraries(caffe2::openmp INTERFACE OpenMP::OpenMP_CXX)
+    list(APPEND Caffe2_DEPENDENCY_LIBS caffe2::openmp)
+    if(MSVC AND OpenMP_CXX_LIBRARIES MATCHES ".*libiomp5md\\.lib.*")
+      target_compile_definitions(caffe2::openmp INTERFACE _OPENMP_NOFORCE_MANIFEST)
+      target_link_options(caffe2::openmp INTERFACE "/NODEFAULTLIB:vcomp")
     endif()
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
   else()
     message(WARNING "Not compiling with OpenMP. Suppress this warning with -DUSE_OPENMP=OFF")
     caffe2_update_option(USE_OPENMP OFF)
@@ -1243,6 +1164,7 @@ if(USE_OPENMP)
 endif()
 
 
+
 # ---[ Android specific ones
 if(ANDROID)
   list(APPEND Caffe2_DEPENDENCY_LIBS log)
@@ -1275,7 +1197,7 @@ endif(USE_LLVM)
 # ---[ cuDNN
 if(USE_CUDNN)
   set(CUDNN_FRONTEND_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/../third_party/cudnn_frontend/include)
-  include_directories(${CUDNN_FRONTEND_INCLUDE_DIR})
+  target_include_directories(torch::cudnn INTERFACE ${CUDNN_FRONTEND_INCLUDE_DIR})
 endif()
 
 # ---[ HIP
@@ -1323,6 +1245,7 @@ if(USE_ROCM)
     list(APPEND HIP_CXX_FLAGS -fPIC)
     list(APPEND HIP_CXX_FLAGS -D__HIP_PLATFORM_HCC__=1)
     list(APPEND HIP_CXX_FLAGS -DCUDA_HAS_FP16=1)
+    list(APPEND HIP_CXX_FLAGS -DUSE_ROCM)
     list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_OPERATORS__=1)
     list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1)
     list(APPEND HIP_CXX_FLAGS -DTORCH_HIP_VERSION=${TORCH_HIP_VERSION})
@@ -1488,8 +1411,7 @@ if(USE_GLOO)
         # https://github.com/facebookincubator/gloo/blob/950c0e23819779a9e0c70b861db4c52b31d1d1b2/cmake/Dependencies.cmake#L123
         set(NCCL_EXTERNAL ON)
       endif()
-      # gloo uses cuda_add_library
-      torch_update_find_cuda_flags()
+      set(GLOO_USE_CUDA_TOOLKIT ON CACHE BOOL "" FORCE)
       add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/gloo)
     else()
       add_library(gloo SHARED IMPORTED)
@@ -1784,17 +1706,6 @@ if(NOT INTERN_BUILD_MOBILE)
     set(AT_CUDA_ENABLED 1)
   endif()
 
-  if(NOT USE_CUDNN)
-    message(STATUS "USE_CUDNN is set to 0. Compiling without cuDNN support")
-    set(AT_CUDNN_ENABLED 0)
-  elseif(NOT CUDNN_FOUND)
-    message(WARNING "CuDNN not found. Compiling without CuDNN support")
-    set(AT_CUDNN_ENABLED 0)
-  else()
-    include_directories(SYSTEM ${CUDNN_INCLUDE_PATH})
-    set(AT_CUDNN_ENABLED 1)
-  endif()
-
   if(NOT USE_ROCM)
     message("disabling ROCM because NOT USE_ROCM is set")
     message(STATUS "MIOpen not found. Compiling without MIOpen support")
@@ -1981,7 +1892,7 @@ if(USE_KINETO)
         include(CheckCXXSourceRuns)
         # rt is handled by the CMAKE_REQUIRED_LIBRARIES set above
         if(NOT APPLE)
-          set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} "dl")
+          set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} "dl" "pthread")
         endif()
         set(CMAKE_REQUIRED_LINK_OPTIONS "-Wl,--whole-archive,${CUPTI_LIBRARY_PATH},--no-whole-archive")
         check_cxx_source_runs("#include <stdexcept>
diff --git a/cmake/GoogleTestPatch.cmake b/cmake/GoogleTestPatch.cmake
index c7fbb6ce9f02..36018ace1d89 100644
--- a/cmake/GoogleTestPatch.cmake
+++ b/cmake/GoogleTestPatch.cmake
@@ -20,6 +20,5 @@ else(REVERT)
   file(READ ${FILENAME} content)
   file(WRITE ${BACKUP} "${content}")
   string(REGEX REPLACE "[-/]Z[iI]" "/Z7" content "${content}")
-  string(REGEX REPLACE "Threads::Threads" "caffe2::Threads" content "${content}")
   file(WRITE ${FILENAME} "${content}")
 endif(REVERT)
diff --git a/cmake/Metal.cmake b/cmake/Metal.cmake
index e3124609c179..f5d3be02be2a 100644
--- a/cmake/Metal.cmake
+++ b/cmake/Metal.cmake
@@ -19,7 +19,9 @@ if(NOT DEFINED CMAKE_OSX_DEVELOPER_ROOT)
         set(CMAKE_OSX_DEVELOPER_ROOT ${XCODE_POST_43_ROOT})
     elseif(EXISTS ${XCODE_PRE_43_ROOT})
         set(CMAKE_OSX_DEVELOPER_ROOT ${XCODE_PRE_43_ROOT})
-    endif(EXISTS ${XCODE_POST_43_ROOT})
+    elseif(EXISTS ${CMAKE_XCODE_DEVELOPER_DIR} AND ${CMAKE_XCODE_DEVELOPER_DIR} STREQUAL "/Library/Developer/CommandLineTools")
+            set(CMAKE_OSX_DEVELOPER_ROOT ${CMAKE_XCODE_DEVELOPER_DIR})
+    endif()
 endif(NOT DEFINED CMAKE_OSX_DEVELOPER_ROOT)
 set(CMAKE_OSX_DEVELOPER_ROOT ${CMAKE_OSX_DEVELOPER_ROOT} CACHE PATH "Location of OSX SDKs root directory")
 
diff --git a/cmake/MiscCheck.cmake b/cmake/MiscCheck.cmake
index f31dbd02c6bf..0f0fd3ff5bc7 100644
--- a/cmake/MiscCheck.cmake
+++ b/cmake/MiscCheck.cmake
@@ -136,85 +136,6 @@ if(NOT MSVC)
   endif()
 endif()
 
-# ---[ If we are using msvc, set no warning flags
-# Note(jiayq): if you are going to add a warning flag, check if this is
-# totally necessary, and only add when you see fit. If it is needed due to
-# a third party library (like Protobuf), mention it in the comment as
-# "THIRD_PARTY_NAME related"
-# From https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/
-if(${CMAKE_CXX_COMPILER_ID} STREQUAL "MSVC")
-  add_compile_options(
-      ##########################################
-      # Protobuf related. Cannot remove.
-      # This is directly copied from
-      #     https://github.com/google/protobuf/blob/master/cmake/README.md
-      ##########################################
-      /wd4018 # 'expression' : signed/unsigned mismatch
-      /wd4065 # (3): switch with default but no case.
-      /wd4146 # unary minus operator applied to unsigned type, result still unsigned
-      /wd4244 # Conversion from 'type1' to 'type2', possible loss of data.
-      /wd4251 # 'identifier' : class 'type' needs to have dll-interface to be used by clients of class 'type2'
-      /wd4267 # Conversion from 'size_t' to 'type', possible loss of data.
-      /wd4305 # 'identifier' : truncation from 'type1' to 'type2'
-      /wd4355 # 'this' : used in base member initializer list
-      /wd4506 # (1): no definition for inline function. Protobuf related.
-      /wd4661 # No suitable definition provided for explicit template instantiation request
-      /wd4800 # 'type' : forcing value to bool 'true' or 'false' (performance warning)
-      /wd4996 # 'function': was declared deprecated
-      ##########################################
-      # Third party related. Cannot remove.
-      ##########################################
-      /wd4141 # (1): inline used twice. google benchmark related.
-      /wd4503 # (1): decorated name length exceeded, name was truncated.
-              #      Eigen related.
-      /wd4554 # (3): check operator precedence for possible error.
-              # Eigen related.
-      /wd4805 # (1): Unsafe mix of types in gtest/gtest.h. Gtest related.
-      ##########################################
-      # These are directly ATen related. However, several are covered by
-      # the above now. We leave them here for documentation purposes only.
-      #/wd4267 # Conversion from 'size_t' to 'type', possible loss of data.
-      /wd4522 # (3): 'class' : multiple assignment operators specified
-      /wd4838 # (1): conversion from 'type_1' to 'type_2' requires a
-              #      narrowing conversion
-      #/wd4305 # 'identifier' : truncation from 'type1' to 'type2'
-      #/wd4244 # Conversion from 'type1' to 'type2', possible loss of data.
-      /wd4190 # (1): 'identifier1' has C-linkage specified, but returns UDT
-              #      'identifier2' which is incompatible with C
-      /wd4101 # (3): 'identifier' : unreferenced local variable
-      #/wd4996 # (3): Use of deprecated POSIX functions. Since we develop
-      #        #      mainly on Linux, this is ignored.
-      /wd4275 # (2): non - DLL-interface classkey 'identifier' used as
-              #      base for DLL-interface classkey 'identifier'
-      ##########################################
-      # These are directly Caffe2 related. However, several are covered by
-      # protobuf now. We leave them here for documentation purposes only.
-      ##########################################
-      #/wd4018 # (3): Signed/unsigned mismatch. We've used it in many places
-      #        #      of the code and it would be hard to correct all.
-      #/wd4244 # (2/3/4): Possible loss of precision. Various cases where we
-      #        #      implicitly cast TIndex to int etc. Need cleaning.
-      #/wd4267 # (3): Conversion of size_t to smaller type. Same as 4244.
-      #/wd4996 # (3): Use of deprecated POSIX functions. Since we develop
-      #        #      mainly on Linux, this is ignored.
-      /wd4273 # (1): inconsistent dll linkage. This is related to the
-              #      caffe2 FLAGS_* definition using dllimport in header and
-              #      dllexport in cc file. The strategy is copied from gflags.
-  )
-
-  # Make sure windows.h does not include additional headers.
-  add_definitions("/DWIN32_LEAN_AND_MEAN")
-
-  # Make sure windef.h does not define max/min macros.
-  # Required by ATen among others.
-  add_definitions("/DNOMINMAX")
-
-  set(CMAKE_SHARED_LINKER_FLAGS
-      "${CMAKE_SHARED_LINKER_FLAGS} /ignore:4049 /ignore:4217 /ignore:4099")
-  set(CMAKE_EXE_LINKER_FLAGS
-      "${CMAKE_EXE_LINKER_FLAGS} /ignore:4049 /ignore:4217 /ignore:4099")
-endif()
-
 # ---[ If we are building on ios, or building with opengl support, we will
 # enable -mfpu=neon-fp16 for iOS Metal build. For Android, this fpu setting
 # is going to be done with android-cmake by setting
diff --git a/cmake/Modules/FindCUDAToolkit.cmake b/cmake/Modules/FindCUDAToolkit.cmake
new file mode 100644
index 000000000000..7c8a79c5493a
--- /dev/null
+++ b/cmake/Modules/FindCUDAToolkit.cmake
@@ -0,0 +1,1073 @@
+
+# This module is back-ported from CMake 3.17 and above to work with CMake 3.10
+
+# Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
+# file Copyright.txt or https://cmake.org/licensing for details.
+
+#[=======================================================================[.rst:
+FindCUDAToolkit
+---------------
+
+.. versionadded:: 3.17
+
+This script locates the NVIDIA CUDA toolkit and the associated libraries, but
+does not require the ``CUDA`` language be enabled for a given project. This
+module does not search for the NVIDIA CUDA Samples.
+
+.. versionadded:: 3.19
+  QNX support.
+
+Search Behavior
+^^^^^^^^^^^^^^^
+
+The CUDA Toolkit search behavior uses the following order:
+
+1. If the ``CUDA`` language has been enabled we will use the directory
+   containing the compiler as the first search location for ``nvcc``.
+
+2. If the ``CUDAToolkit_ROOT`` cmake configuration variable (e.g.,
+   ``-DCUDAToolkit_ROOT=/some/path``) *or* environment variable is defined, it
+   will be searched.  If both an environment variable **and** a
+   configuration variable are specified, the *configuration* variable takes
+   precedence.
+
+   The directory specified here must be such that the executable ``nvcc`` or
+   the appropriate ``version.txt`` file can be found underneath the specified
+   directory.
+
+3. If the CUDA_PATH environment variable is defined, it will be searched
+   for ``nvcc``.
+
+4. The user's path is searched for ``nvcc`` using :command:`find_program`.  If
+   this is found, no subsequent search attempts are performed.  Users are
+   responsible for ensuring that the first ``nvcc`` to show up in the path is
+   the desired path in the event that multiple CUDA Toolkits are installed.
+
+5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is
+   used.  No subsequent search attempts are performed.  No default symbolic link
+   location exists for the Windows platform.
+
+6. The platform specific default install locations are searched.  If exactly one
+   candidate is found, this is used.  The default CUDA Toolkit install locations
+   searched are:
+
+   +-------------+-------------------------------------------------------------+
+   | Platform    | Search Pattern                                              |
+   +=============+=============================================================+
+   | macOS       | ``/Developer/NVIDIA/CUDA-X.Y``                              |
+   +-------------+-------------------------------------------------------------+
+   | Other Unix  | ``/usr/local/cuda-X.Y``                                     |
+   +-------------+-------------------------------------------------------------+
+   | Windows     | ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y`` |
+   +-------------+-------------------------------------------------------------+
+
+   Where ``X.Y`` would be a specific version of the CUDA Toolkit, such as
+   ``/usr/local/cuda-9.0`` or
+   ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0``
+
+   .. note::
+
+       When multiple CUDA Toolkits are installed in the default location of a
+       system(e.g., both ``/usr/local/cuda-9.0`` and ``/usr/local/cuda-10.0``
+       exist but the ``/usr/local/cuda`` symbolic link does **not** exist), this
+       package is marked as **not** found.
+
+       There are too many factors involved in making an automatic decision in
+       the presence of multiple CUDA Toolkits being installed.  In this
+       situation, users are encouraged to either (1) set ``CUDAToolkit_ROOT`` or
+       (2) ensure that the correct ``nvcc`` executable shows up in ``$PATH`` for
+       :command:`find_program` to find.
+
+Arguments
+^^^^^^^^^
+
+``[<version>]``
+    The ``[<version>]`` argument requests a version with which the package found
+    should be compatible. See :ref:`find_package version format <FIND_PACKAGE_VERSION_FORMAT>`
+    for more details.
+
+Options
+^^^^^^^
+
+``REQUIRED``
+    If specified, configuration will error if a suitable CUDA Toolkit is not
+    found.
+
+``QUIET``
+    If specified, the search for a suitable CUDA Toolkit will not produce any
+    messages.
+
+``EXACT``
+    If specified, the CUDA Toolkit is considered found only if the exact
+    ``VERSION`` specified is recovered.
+
+Imported targets
+^^^^^^^^^^^^^^^^
+
+An :ref:`imported target <Imported targets>` named ``CUDA::toolkit`` is provided.
+
+This module defines :prop_tgt:`IMPORTED` targets for each
+of the following libraries that are part of the CUDAToolkit:
+
+- :ref:`CUDA Runtime Library<cuda_toolkit_rt_lib>`
+- :ref:`CUDA Driver Library<cuda_toolkit_driver_lib>`
+- :ref:`cuBLAS<cuda_toolkit_cuBLAS>`
+- :ref:`cuFFT<cuda_toolkit_cuFFT>`
+- :ref:`cuRAND<cuda_toolkit_cuRAND>`
+- :ref:`cuSOLVER<cuda_toolkit_cuSOLVER>`
+- :ref:`cuSPARSE<cuda_toolkit_cuSPARSE>`
+- :ref:`cuPTI<cuda_toolkit_cupti>`
+- :ref:`NPP<cuda_toolkit_NPP>`
+- :ref:`nvBLAS<cuda_toolkit_nvBLAS>`
+- :ref:`nvGRAPH<cuda_toolkit_nvGRAPH>`
+- :ref:`nvJPEG<cuda_toolkit_nvJPEG>`
+- :ref:`nvidia-ML<cuda_toolkit_nvML>`
+- :ref:`nvRTC<cuda_toolkit_nvRTC>`
+- :ref:`nvToolsExt<cuda_toolkit_nvToolsExt>`
+- :ref:`OpenCL<cuda_toolkit_opencl>`
+- :ref:`cuLIBOS<cuda_toolkit_cuLIBOS>`
+
+.. _`cuda_toolkit_rt_lib`:
+
+CUDA Runtime Library
+""""""""""""""""""""
+
+The CUDA Runtime library (cudart) are what most applications will typically
+need to link against to make any calls such as `cudaMalloc`, and `cudaFree`.
+
+Targets Created:
+
+- ``CUDA::cudart``
+- ``CUDA::cudart_static``
+
+.. _`cuda_toolkit_driver_lib`:
+
+CUDA Driver Library
+""""""""""""""""""""
+
+The CUDA Driver library (cuda) are used by applications that use calls
+such as `cuMemAlloc`, and `cuMemFree`.
+
+Targets Created:
+
+- ``CUDA::cuda_driver``
+
+.. _`cuda_toolkit_cuBLAS`:
+
+cuBLAS
+""""""
+
+The `cuBLAS <https://docs.nvidia.com/cuda/cublas/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cublas``
+- ``CUDA::cublas_static``
+- ``CUDA::cublasLt`` starting in CUDA 10.1
+- ``CUDA::cublasLt_static`` starting in CUDA 10.1
+
+.. _`cuda_toolkit_cuFFT`:
+
+cuFFT
+"""""
+
+The `cuFFT <https://docs.nvidia.com/cuda/cufft/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cufft``
+- ``CUDA::cufftw``
+- ``CUDA::cufft_static``
+- ``CUDA::cufft_static_nocallback`` starting in CUDA 9.2, requires CMake 3.23+
+- ``CUDA::cufftw_static``
+
+cuRAND
+""""""
+
+The `cuRAND <https://docs.nvidia.com/cuda/curand/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::curand``
+- ``CUDA::curand_static``
+
+.. _`cuda_toolkit_cuSOLVER`:
+
+cuSOLVER
+""""""""
+
+The `cuSOLVER <https://docs.nvidia.com/cuda/cusolver/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cusolver``
+- ``CUDA::cusolver_static``
+
+.. _`cuda_toolkit_cuSPARSE`:
+
+cuSPARSE
+""""""""
+
+The `cuSPARSE <https://docs.nvidia.com/cuda/cusparse/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cusparse``
+- ``CUDA::cusparse_static``
+
+.. _`cuda_toolkit_cupti`:
+
+cupti
+"""""
+
+The `NVIDIA CUDA Profiling Tools Interface <https://developer.nvidia.com/CUPTI>`_.
+
+Targets Created:
+
+- ``CUDA::cupti``
+- ``CUDA::cupti_static``
+
+.. _`cuda_toolkit_NPP`:
+
+NPP
+"""
+
+The `NPP <https://docs.nvidia.com/cuda/npp/index.html>`_ libraries.
+
+Targets Created:
+
+- `nppc`:
+
+  - ``CUDA::nppc``
+  - ``CUDA::nppc_static``
+
+- `nppial`: Arithmetic and logical operation functions in `nppi_arithmetic_and_logical_operations.h`
+
+  - ``CUDA::nppial``
+  - ``CUDA::nppial_static``
+
+- `nppicc`: Color conversion and sampling functions in `nppi_color_conversion.h`
+
+  - ``CUDA::nppicc``
+  - ``CUDA::nppicc_static``
+
+- `nppicom`: JPEG compression and decompression functions in `nppi_compression_functions.h`
+  Removed starting in CUDA 11.0, use :ref:`nvJPEG<cuda_toolkit_nvJPEG>` instead.
+
+  - ``CUDA::nppicom``
+  - ``CUDA::nppicom_static``
+
+- `nppidei`: Data exchange and initialization functions in `nppi_data_exchange_and_initialization.h`
+
+  - ``CUDA::nppidei``
+  - ``CUDA::nppidei_static``
+
+- `nppif`: Filtering and computer vision functions in `nppi_filter_functions.h`
+
+  - ``CUDA::nppif``
+  - ``CUDA::nppif_static``
+
+- `nppig`: Geometry transformation functions found in `nppi_geometry_transforms.h`
+
+  - ``CUDA::nppig``
+  - ``CUDA::nppig_static``
+
+- `nppim`: Morphological operation functions found in `nppi_morphological_operations.h`
+
+  - ``CUDA::nppim``
+  - ``CUDA::nppim_static``
+
+- `nppist`: Statistics and linear transform in `nppi_statistics_functions.h` and `nppi_linear_transforms.h`
+
+  - ``CUDA::nppist``
+  - ``CUDA::nppist_static``
+
+- `nppisu`: Memory support functions in `nppi_support_functions.h`
+
+  - ``CUDA::nppisu``
+  - ``CUDA::nppisu_static``
+
+- `nppitc`: Threshold and compare operation functions in `nppi_threshold_and_compare_operations.h`
+
+  - ``CUDA::nppitc``
+  - ``CUDA::nppitc_static``
+
+- `npps`:
+
+  - ``CUDA::npps``
+  - ``CUDA::npps_static``
+
+.. _`cuda_toolkit_nvBLAS`:
+
+nvBLAS
+""""""
+
+The `nvBLAS <https://docs.nvidia.com/cuda/nvblas/index.html>`_ libraries.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::nvblas``
+
+.. _`cuda_toolkit_nvGRAPH`:
+
+nvGRAPH
+"""""""
+
+The `nvGRAPH <https://docs.nvidia.com/cuda/nvgraph/index.html>`_ library.
+Removed starting in CUDA 11.0
+
+Targets Created:
+
+- ``CUDA::nvgraph``
+- ``CUDA::nvgraph_static``
+
+
+.. _`cuda_toolkit_nvJPEG`:
+
+nvJPEG
+""""""
+
+The `nvJPEG <https://docs.nvidia.com/cuda/nvjpeg/index.html>`_ library.
+Introduced in CUDA 10.
+
+Targets Created:
+
+- ``CUDA::nvjpeg``
+- ``CUDA::nvjpeg_static``
+
+.. _`cuda_toolkit_nvRTC`:
+
+nvRTC
+"""""
+
+The `nvRTC <https://docs.nvidia.com/cuda/nvrtc/index.html>`_ (Runtime Compilation) library.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::nvrtc``
+
+.. _`cuda_toolkit_nvml`:
+
+nvidia-ML
+"""""""""
+
+The `NVIDIA Management Library <https://developer.nvidia.com/nvidia-management-library-nvml>`_.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::nvml``
+
+.. _`cuda_toolkit_nvToolsExt`:
+
+nvToolsExt
+""""""""""
+
+The `NVIDIA Tools Extension <https://docs.nvidia.com/gameworks/content/gameworkslibrary/nvtx/nvidia_tools_extension_library_nvtx.htm>`_.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::nvToolsExt``
+
+.. _`cuda_toolkit_opencl`:
+
+OpenCL
+""""""
+
+The `NVIDIA OpenCL Library <https://developer.nvidia.com/opencl>`_.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::OpenCL``
+
+.. _`cuda_toolkit_cuLIBOS`:
+
+cuLIBOS
+"""""""
+
+The cuLIBOS library is a backend thread abstraction layer library which is
+static only.  The ``CUDA::cublas_static``, ``CUDA::cusparse_static``,
+``CUDA::cufft_static``, ``CUDA::curand_static``, and (when implemented) NPP
+libraries all automatically have this dependency linked.
+
+Target Created:
+
+- ``CUDA::culibos``
+
+**Note**: direct usage of this target by consumers should not be necessary.
+
+.. _`cuda_toolkit_cuRAND`:
+
+
+
+Result variables
+^^^^^^^^^^^^^^^^
+
+``CUDAToolkit_FOUND``
+    A boolean specifying whether or not the CUDA Toolkit was found.
+
+``CUDAToolkit_VERSION``
+    The exact version of the CUDA Toolkit found (as reported by
+    ``nvcc --version`` or ``version.txt``).
+
+``CUDAToolkit_VERSION_MAJOR``
+    The major version of the CUDA Toolkit.
+
+``CUDAToolkit_VERSION_MINOR``
+    The minor version of the CUDA Toolkit.
+
+``CUDAToolkit_VERSION_PATCH``
+    The patch version of the CUDA Toolkit.
+
+``CUDAToolkit_BIN_DIR``
+    The path to the CUDA Toolkit library directory that contains the CUDA
+    executable ``nvcc``.
+
+``CUDAToolkit_INCLUDE_DIRS``
+    The path to the CUDA Toolkit ``include`` folder containing the header files
+    required to compile a project linking against CUDA.
+
+``CUDAToolkit_LIBRARY_DIR``
+    The path to the CUDA Toolkit library directory that contains the CUDA
+    Runtime library ``cudart``.
+
+``CUDAToolkit_LIBRARY_ROOT``
+    .. versionadded:: 3.18
+
+    The path to the CUDA Toolkit directory containing the nvvm directory and
+    version.txt.
+
+``CUDAToolkit_TARGET_DIR``
+    The path to the CUDA Toolkit directory including the target architecture
+    when cross-compiling. When not cross-compiling this will be equivalent to
+    the parent directory of ``CUDAToolkit_BIN_DIR``.
+
+``CUDAToolkit_NVCC_EXECUTABLE``
+    The path to the NVIDIA CUDA compiler ``nvcc``.  Note that this path may
+    **not** be the same as
+    :variable:`CMAKE_CUDA_COMPILER <CMAKE_<LANG>_COMPILER>`.  ``nvcc`` must be
+    found to determine the CUDA Toolkit version as well as determining other
+    features of the Toolkit.  This variable is set for the convenience of
+    modules that depend on this one.
+
+
+#]=======================================================================]
+
+# NOTE: much of this was simply extracted from FindCUDA.cmake.
+
+#   James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#   Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
+#
+#   Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#   Copyright (c) 2007-2009
+#   Scientific Computing and Imaging Institute, University of Utah
+#
+#   This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#   for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+
+# The toolkit is located during compiler detection for CUDA and stored in CMakeCUDACompiler.cmake as
+# CMAKE_CUDA_COMPILER_TOOLKIT_ROOT and CMAKE_CUDA_COMPILER_LIBRARY_ROOT.
+# We compute the rest based on those here to avoid re-searching and to avoid finding a possibly
+# different installation.
+if(CMAKE_CUDA_COMPILER_TOOLKIT_ROOT)
+  set(CUDAToolkit_ROOT_DIR "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}")
+  set(CUDAToolkit_LIBRARY_ROOT "${CMAKE_CUDA_COMPILER_LIBRARY_ROOT}")
+  set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_TOOLKIT_VERSION}")
+
+  if(CUDAToolkit_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=])
+    set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
+    set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
+    set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
+  endif()
+else()
+  function(_CUDAToolkit_find_root_dir )
+    cmake_parse_arguments(arg "" "" "SEARCH_PATHS;FIND_FLAGS" ${ARGN})
+
+    if(NOT CUDAToolkit_BIN_DIR)
+      if(NOT CUDAToolkit_SENTINEL_FILE)
+        find_program(CUDAToolkit_NVCC_EXECUTABLE
+          NAMES nvcc nvcc.exe
+          PATHS ${arg_SEARCH_PATHS}
+          ${arg_FIND_FLAGS}
+        )
+      endif()
+
+      if(NOT CUDAToolkit_NVCC_EXECUTABLE)
+        find_file(CUDAToolkit_SENTINEL_FILE
+          NAMES version.txt
+          PATHS ${arg_SEARCH_PATHS}
+          NO_DEFAULT_PATH
+        )
+      endif()
+
+      if(EXISTS "${CUDAToolkit_NVCC_EXECUTABLE}")
+        # If NVCC exists  then invoke it to find the toolkit location.
+        # This allows us to support wrapper scripts (e.g. ccache or colornvcc), CUDA Toolkit,
+        # NVIDIA HPC SDK, and distro's splayed layouts
+        execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "-v" "__cmake_determine_cuda"
+          OUTPUT_VARIABLE _CUDA_NVCC_OUT ERROR_VARIABLE _CUDA_NVCC_OUT)
+        if(_CUDA_NVCC_OUT MATCHES "\\#\\$ TOP=([^\r\n]*)")
+          get_filename_component(CUDAToolkit_BIN_DIR "${CMAKE_MATCH_1}/bin" ABSOLUTE)
+        else()
+          get_filename_component(CUDAToolkit_BIN_DIR "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY)
+        endif()
+        unset(_CUDA_NVCC_OUT)
+
+        mark_as_advanced(CUDAToolkit_BIN_DIR)
+        set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "" FORCE)
+      endif()
+
+      if(CUDAToolkit_SENTINEL_FILE)
+        get_filename_component(CUDAToolkit_BIN_DIR ${CUDAToolkit_SENTINEL_FILE} DIRECTORY ABSOLUTE)
+        set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}/bin")
+
+        set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "" FORCE)
+        mark_as_advanced(CUDAToolkit_BIN_DIR)
+      endif()
+    endif()
+
+    if(CUDAToolkit_BIN_DIR)
+      get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE)
+      set(CUDAToolkit_ROOT_DIR "${CUDAToolkit_ROOT_DIR}" PARENT_SCOPE)
+    endif()
+
+  endfunction()
+
+  # For NVCC we can easily deduce the SDK binary directory from the compiler path.
+  if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR AND CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
+    get_filename_component(CUDAToolkit_BIN_DIR "${CMAKE_CUDA_COMPILER}" DIRECTORY)
+    set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "")
+    # Try language provided path first.
+    _CUDAToolkit_find_root_dir(SEARCH_PATHS "${CUDAToolkit_BIN_DIR}" FIND_FLAGS NO_DEFAULT_PATH)
+    mark_as_advanced(CUDAToolkit_BIN_DIR)
+  endif()
+
+  # Try user provided path
+  if(NOT CUDAToolkit_ROOT_DIR AND CUDAToolkit_ROOT)
+    _CUDAToolkit_find_root_dir(SEARCH_PATHS "${CUDAToolkit_ROOT}" FIND_FLAGS PATH_SUFFIXES bin NO_DEFAULT_PATH)
+  endif()
+  if(NOT CUDAToolkit_ROOT_DIR)
+    _CUDAToolkit_find_root_dir(FIND_FLAGS PATHS ENV CUDA_PATH PATH_SUFFIXES bin)
+  endif()
+
+  # If the user specified CUDAToolkit_ROOT but the toolkit could not be found, this is an error.
+  if(NOT CUDAToolkit_ROOT_DIR AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT}))
+    # Declare error messages now, print later depending on find_package args.
+    set(fail_base "Could not find nvcc executable in path specified by")
+    set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}")
+    set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}")
+
+    if(CUDAToolkit_FIND_REQUIRED)
+      if(DEFINED CUDAToolkit_ROOT)
+        message(FATAL_ERROR ${cuda_root_fail})
+      elseif(DEFINED ENV{CUDAToolkit_ROOT})
+        message(FATAL_ERROR ${env_cuda_root_fail})
+      endif()
+    else()
+      if(NOT CUDAToolkit_FIND_QUIETLY)
+        if(DEFINED CUDAToolkit_ROOT)
+          message(STATUS ${cuda_root_fail})
+        elseif(DEFINED ENV{CUDAToolkit_ROOT})
+          message(STATUS ${env_cuda_root_fail})
+        endif()
+      endif()
+      set(CUDAToolkit_FOUND FALSE)
+      unset(fail_base)
+      unset(cuda_root_fail)
+      unset(env_cuda_root_fail)
+      return()
+    endif()
+  endif()
+
+  # CUDAToolkit_ROOT cmake / env variable not specified, try platform defaults.
+  #
+  # - Linux: /usr/local/cuda-X.Y
+  # - macOS: /Developer/NVIDIA/CUDA-X.Y
+  # - Windows: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y
+  #
+  # We will also search the default symlink location /usr/local/cuda first since
+  # if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked
+  # directory is the desired location.
+  if(NOT CUDAToolkit_ROOT_DIR)
+    if(UNIX)
+      if(NOT APPLE)
+        set(platform_base "/usr/local/cuda-")
+      else()
+        set(platform_base "/Developer/NVIDIA/CUDA-")
+      endif()
+    else()
+      set(platform_base "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v")
+    endif()
+
+    # Build out a descending list of possible cuda installations, e.g.
+    file(GLOB possible_paths "${platform_base}*")
+    # Iterate the glob results and create a descending list.
+    set(versions)
+    foreach(p ${possible_paths})
+      # Extract version number from end of string
+      string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p})
+      if(IS_DIRECTORY ${p} AND p_version)
+        list(APPEND versions ${p_version})
+      endif()
+    endforeach()
+
+    # Sort numerically in descending order, so we try the newest versions first.
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+      list(SORT versions COMPARE NATURAL ORDER DESCENDING)
+    elseif(versions)
+      # Alphabetical sort here is not ideal but better than nothing
+      list(SORT versions)
+      list(REVERSE versions)
+    endif()
+
+    # With a descending list of versions, populate possible paths to search.
+    set(search_paths)
+    foreach(v ${versions})
+      list(APPEND search_paths "${platform_base}${v}")
+    endforeach()
+
+    # Force the global default /usr/local/cuda to the front on Unix.
+    if(UNIX)
+      list(INSERT search_paths 0 "/usr/local/cuda")
+    endif()
+
+    # Now search for the toolkit again using the platform default search paths.
+    _CUDAToolkit_find_root_dir(SEARCH_PATHS "${search_paths}" FIND_FLAGS PATH_SUFFIXES bin)
+
+    # We are done with these variables now, cleanup for caller.
+    unset(platform_base)
+    unset(possible_paths)
+    unset(versions)
+    unset(search_paths)
+
+    if(NOT CUDAToolkit_ROOT_DIR)
+      if(CUDAToolkit_FIND_REQUIRED)
+        message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.")
+      elseif(NOT CUDAToolkit_FIND_QUIETLY)
+        message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.")
+      endif()
+
+      set(CUDAToolkit_FOUND FALSE)
+      return()
+    endif()
+  endif()
+endif()
+
+if(NOT CUDAToolkit_BIN_DIR)
+  set(CUDAToolkit_BIN_DIR "${CUDAToolkit_ROOT_DIR}/bin")
+endif()
+
+if(NOT CUDAToolkit_NVCC_EXECUTABLE)
+  set(CUDAToolkit_NVCC_EXECUTABLE "${CUDAToolkit_BIN_DIR}/nvcc${CMAKE_EXECUTABLE_SUFFIX}")
+endif()
+
+if(CMAKE_CUDA_COMPILER_TOOLKIT_VERSION)
+  set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_TOOLKIT_VERSION}")
+else()
+  function(_CUDAToolkit_find_version_file result_variable)
+    # We first check for a non-scattered installation to prefer it over a scattered installation.
+    if(CUDAToolkit_ROOT AND EXISTS "${CUDAToolkit_ROOT}/version.txt")
+      set(${result_variable} "${CUDAToolkit_ROOT}/version.txt" PARENT_SCOPE)
+    elseif(CUDAToolkit_ROOT_DIR AND EXISTS "${CUDAToolkit_ROOT_DIR}/version.txt")
+      set(${result_variable} "${CUDAToolkit_ROOT_DIR}/version.txt" PARENT_SCOPE)
+    elseif(CMAKE_SYSROOT_LINK AND EXISTS "${CMAKE_SYSROOT_LINK}/usr/lib/cuda/version.txt")
+      set(${result_variable} "${CMAKE_SYSROOT_LINK}/usr/lib/cuda/version.txt" PARENT_SCOPE)
+    elseif(EXISTS "${CMAKE_SYSROOT}/usr/lib/cuda/version.txt")
+      set(${result_variable} "${CMAKE_SYSROOT}/usr/lib/cuda/version.txt" PARENT_SCOPE)
+    endif()
+  endfunction()
+
+  _CUDAToolkit_find_version_file( _CUDAToolkit_version_file )
+  if(_CUDAToolkit_version_file)
+    # CUDAToolkit_LIBRARY_ROOT contains the device library and version file.
+    get_filename_component(CUDAToolkit_LIBRARY_ROOT "${_CUDAToolkit_version_file}" DIRECTORY ABSOLUTE)
+  endif()
+  unset(_CUDAToolkit_version_file)
+
+  if(CUDAToolkit_NVCC_EXECUTABLE AND
+     CMAKE_CUDA_COMPILER_VERSION AND
+     CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER)
+    # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value
+    # This if statement will always match, but is used to provide variables for MATCH 1,2,3...
+    if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=])
+      set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
+      set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
+      set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
+      set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_VERSION}")
+    endif()
+  elseif(CUDAToolkit_NVCC_EXECUTABLE)
+    # Compute the version by invoking nvcc
+    execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
+    if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=])
+      set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
+      set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
+      set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
+      set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}")
+    endif()
+    unset(NVCC_OUT)
+  else()
+    _CUDAToolkit_find_version_file(version_file)
+    if(version_file)
+      file(READ "${version_file}" VERSION_INFO)
+      if(VERSION_INFO MATCHES [=[CUDA Version ([0-9]+)\.([0-9]+)\.([0-9]+)]=])
+        set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
+        set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
+        set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
+        set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}")
+      endif()
+    endif()
+  endif()
+endif()
+
+# Find target directory when crosscompiling.
+if(CMAKE_CROSSCOMPILING)
+  if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a")
+    # Support for NVPACK
+    set(CUDAToolkit_TARGET_NAME "armv7-linux-androideabi")
+  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
+    set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf")
+  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+    if(ANDROID_ARCH_NAME STREQUAL "arm64")
+      set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi")
+    elseif(CMAKE_SYSTEM_NAME STREQUAL "QNX")
+      set(CUDAToolkit_TARGET_NAME "aarch64-qnx")
+    else()
+      set(CUDAToolkit_TARGET_NAME "aarch64-linux")
+    endif(ANDROID_ARCH_NAME STREQUAL "arm64")
+  elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+    set(CUDAToolkit_TARGET_NAME "x86_64-linux")
+  endif()
+
+  if(EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}")
+    set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}")
+    # add known CUDA target root path to the set of directories we search for programs, libraries and headers
+    list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}")
+
+    # Mark that we need to pop the root search path changes after we have
+    # found all cuda libraries so that searches for our cross-compilation
+    # libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or
+    # PATh
+    set(_CUDAToolkit_Pop_ROOT_PATH True)
+  endif()
+endif()
+
+# If not already set we can simply use the toolkit root or it's a scattered installation.
+if(NOT CUDAToolkit_TARGET_DIR)
+  # Not cross compiling
+  set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}")
+  # Now that we have the real ROOT_DIR, find components inside it.
+  list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR})
+
+  # Mark that we need to pop the prefix path changes after we have
+  # found the cudart library.
+  set(_CUDAToolkit_Pop_Prefix True)
+endif()
+
+# CUDAToolkit_TARGET_DIR always points to the directory containing the include directory.
+# On a scattered installation /usr, on a non-scattered something like /usr/local/cuda or /usr/local/cuda-10.2/targets/aarch64-linux.
+if(EXISTS "${CUDAToolkit_TARGET_DIR}/include/cuda_runtime.h")
+  set(CUDAToolkit_INCLUDE_DIR "${CUDAToolkit_TARGET_DIR}/include")
+elseif(NOT CUDAToolkit_FIND_QUIETLY)
+  message(STATUS "Unable to find cuda_runtime.h in \"${CUDAToolkit_TARGET_DIR}/include\" for CUDAToolkit_INCLUDE_DIR.")
+endif()
+
+# The NVHPC layout moves math library headers and libraries to a sibling directory.
+# Create a separate variable so this directory can be selectively added to math targets.
+if(NOT EXISTS "${CUDAToolkit_INCLUDE_DIR}/cublas_v2.h")
+  set(CUDAToolkit_MATH_INCLUDE_DIR "${CUDAToolkit_TARGET_DIR}/../../math_libs/include")
+  get_filename_component(CUDAToolkit_MATH_INCLUDE_DIR "${CUDAToolkit_MATH_INCLUDE_DIR}" ABSOLUTE)
+  if(NOT EXISTS "${CUDAToolkit_MATH_INCLUDE_DIR}/cublas_v2.h")
+    if(NOT CUDAToolkit_FIND_QUIETLY)
+      message(STATUS "Unable to find cublas_v2.h in either \"${CUDAToolkit_INCLUDE_DIR}\" or \"${CUDAToolkit_MATH_INCLUDE_DIR}\"")
+    endif()
+    unset(CUDAToolkit_MATH_INCLUDE_DIR)
+  endif()
+endif()
+
+# Find the CUDA Runtime Library libcudart
+find_library(CUDA_CUDART
+  NAMES cudart
+  PATH_SUFFIXES lib64 lib/x64
+)
+find_library(CUDA_CUDART
+  NAMES cudart
+  PATH_SUFFIXES lib64/stubs lib/x64/stubs
+)
+
+if(NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY)
+  message(STATUS "Unable to find cudart library.")
+endif()
+
+if(_CUDAToolkit_Pop_Prefix)
+  list(REMOVE_AT CMAKE_PREFIX_PATH -1)
+  unset(_CUDAToolkit_Pop_Prefix)
+endif()
+
+#-----------------------------------------------------------------------------
+# Perform version comparison and validate all required variables are set.
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CUDAToolkit
+  REQUIRED_VARS
+    CUDAToolkit_INCLUDE_DIR
+    CUDAToolkit_VERSION
+    CUDA_CUDART
+    CUDAToolkit_BIN_DIR
+  VERSION_VAR
+    CUDAToolkit_VERSION
+)
+
+mark_as_advanced(CUDA_CUDART
+                 CUDAToolkit_INCLUDE_DIR
+                 CUDAToolkit_NVCC_EXECUTABLE
+                 CUDAToolkit_SENTINEL_FILE
+                 )
+
+#-----------------------------------------------------------------------------
+# Construct result variables
+if(CUDAToolkit_FOUND)
+  set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR})
+  get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE)
+endif()
+
+#-----------------------------------------------------------------------------
+# Construct import targets
+if(CUDAToolkit_FOUND)
+
+  function(_CUDAToolkit_find_and_add_import_lib lib_name)
+    cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_HINTS;EXTRA_PATH_SUFFIXES;EXTRA_INCLUDE_DIRS" ${ARGN})
+
+    set(search_names ${lib_name} ${arg_ALT})
+
+    find_library(CUDA_${lib_name}_LIBRARY
+      NAMES ${search_names}
+      HINTS ${CUDAToolkit_LIBRARY_DIR}
+            ENV CUDA_PATH
+            ${arg_EXTRA_HINTS}
+      PATH_SUFFIXES nvidia/current lib64 lib/x64 lib
+                    ${arg_EXTRA_PATH_SUFFIXES}
+    )
+    # Don't try any stub directories until we have exhausted all other
+    # search locations.
+    find_library(CUDA_${lib_name}_LIBRARY
+      NAMES ${search_names}
+      HINTS ${CUDAToolkit_LIBRARY_DIR}
+            ENV CUDA_PATH
+            ${arg_EXTRA_HINTS}
+      PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs
+                    # Support NVHPC splayed math library layout
+                    ../../math_libs/${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/lib64
+                    ../../math_libs/lib64
+    )
+
+    mark_as_advanced(CUDA_${lib_name}_LIBRARY)
+
+    if(NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY)
+      add_library(CUDA::${lib_name} UNKNOWN IMPORTED)
+      set_property(TARGET CUDA::${lib_name} APPEND PROPERTY
+          INTERFACE_INCLUDE_DIRECTORIES "${CUDAToolkit_INCLUDE_DIRS}")
+      set_property(TARGET CUDA::${lib_name} APPEND PROPERTY
+          INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${CUDAToolkit_INCLUDE_DIRS}")
+      if(DEFINED CUDAToolkit_MATH_INCLUDE_DIR)
+        string(FIND ${CUDA_${lib_name}_LIBRARY} "math_libs" math_libs)
+        if(NOT ${math_libs} EQUAL -1)
+          set_property(TARGET CUDA::${lib_name} APPEND PROPERTY
+              INTERFACE_INCLUDE_DIRECTORIES "${CUDAToolkit_MATH_INCLUDE_DIRS}")
+          set_property(TARGET CUDA::${lib_name} APPEND PROPERTY
+              INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${CUDAToolkit_MATH_INCLUDE_DIRS}")
+        endif()
+      endif()
+      set_property(TARGET CUDA::${lib_name} PROPERTY IMPORTED_LOCATION "${CUDA_${lib_name}_LIBRARY}")
+      foreach(dep ${arg_DEPS})
+        if(TARGET CUDA::${dep})
+          set_property(TARGET CUDA::${lib_name} APPEND PROPERTY
+              INTERFACE_LINK_LIBRARIES CUDA::${dep})
+        endif()
+      endforeach()
+      if(arg_EXTRA_INCLUDE_DIRS)
+        set_property(TARGET CUDA::${lib_name} APPEND PROPERTY
+            INTERFACE_INCLUDE_DIRECTORIES "${arg_EXTRA_INCLUDE_DIRS}")
+        set_property(TARGET CUDA::${lib_name} APPEND PROPERTY
+            INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${arg_EXTRA_INCLUDE_DIRS}")
+      endif()
+    endif()
+  endfunction()
+
+  if(NOT TARGET CUDA::toolkit)
+    add_library(CUDA::toolkit IMPORTED INTERFACE)
+    set_property(TARGET CUDA::toolkit APPEND PROPERTY
+        INTERFACE_INCLUDE_DIRECTORIES "${CUDAToolkit_INCLUDE_DIRS}")
+    set_property(TARGET CUDA::toolkit APPEND PROPERTY
+        INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${CUDAToolkit_INCLUDE_DIRS}")
+  endif()
+
+  _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda)
+
+  _CUDAToolkit_find_and_add_import_lib(cudart)
+  _CUDAToolkit_find_and_add_import_lib(cudart_static)
+
+  # setup dependencies that are required for cudart_static when building
+  # on linux. These are generally only required when using the CUDA toolkit
+  # when CUDA language is disabled
+  if(NOT TARGET CUDA::cudart_static_deps
+     AND TARGET CUDA::cudart_static)
+
+    add_library(CUDA::cudart_static_deps IMPORTED INTERFACE)
+    set_property(TARGET CUDA::cudart_static APPEND PROPERTY
+        INTERFACE_LINK_LIBRARIES CUDA::cudart_static_deps)
+
+    if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER))
+      find_package(Threads REQUIRED)
+      set_property(TARGET CUDA::cudart_static_deps APPEND PROPERTY
+          INTERFACE_LINK_LIBRARIES Threads::Threads ${CMAKE_DL_LIBS})
+    endif()
+
+    if(UNIX AND NOT APPLE AND NOT (CMAKE_SYSTEM_NAME STREQUAL "QNX"))
+      # On Linux, you must link against librt when using the static cuda runtime.
+      find_library(CUDAToolkit_rt_LIBRARY rt)
+      mark_as_advanced(CUDAToolkit_rt_LIBRARY)
+      if(NOT CUDAToolkit_rt_LIBRARY)
+        message(WARNING "Could not find librt library, needed by CUDA::cudart_static")
+      else()
+        set_property(TARGET CUDA::cudart_static_deps APPEND PROPERTY
+            INTERFACE_LINK_LIBRARIES ${CUDAToolkit_rt_LIBRARY})
+      endif()
+    endif()
+  endif()
+
+  _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library
+  foreach(cuda_lib cublasLt cufft curand cusparse nppc nvjpeg)
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib})
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos)
+  endforeach()
+
+  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.0.0)
+    # cublas depends on cublasLt
+    # https://docs.nvidia.com/cuda/archive/11.0/cublas/index.html#static-library
+    _CUDAToolkit_find_and_add_import_lib(cublas DEPS cublasLt)
+    _CUDAToolkit_find_and_add_import_lib(cublas_static DEPS cublasLt_static)
+  else()
+    _CUDAToolkit_find_and_add_import_lib(cublas)
+    _CUDAToolkit_find_and_add_import_lib(cublas_static DEPS culibos)
+  endif()
+
+  # cuFFTW depends on cuFFT
+  _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft)
+  _CUDAToolkit_find_and_add_import_lib(cufftw_static DEPS cufft_static)
+  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 9.2)
+    _CUDAToolkit_find_and_add_import_lib(cufft_static_nocallback DEPS culibos)
+  endif()
+
+  # cuSOLVER depends on cuBLAS, and cuSPARSE
+  _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse)
+  _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos)
+
+
+  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.1.2)
+    # cusolver depends on liblapack_static.a starting with CUDA 10.1 update 2,
+    # https://docs.nvidia.com/cuda/archive/11.5.0/cusolver/index.html#static-link-lapack
+    _CUDAToolkit_find_and_add_import_lib(cusolver_lapack_static ALT lapack_static) # implementation detail static lib
+    _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cusolver_lapack_static)
+  endif()
+
+  if(CUDAToolkit_VERSION VERSION_GREATER 11.2.1)
+    # cusolver depends on libcusolver_metis and cublasLt
+    # https://docs.nvidia.com/cuda/archive/11.2.2/cusolver/index.html#link-dependency
+    _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublasLt)
+
+    _CUDAToolkit_find_and_add_import_lib(cusolver_metis_static ALT metis_static) # implementation detail static lib
+    _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cusolver_metis_static cublasLt_static)
+  endif()
+
+  # nvGRAPH depends on cuRAND, and cuSOLVER.
+  _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver)
+  _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static)
+
+  # Process the majority of the NPP libraries.
+  foreach(cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu)
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc)
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static)
+  endforeach()
+
+  find_path(CUDAToolkit_CUPTI_INCLUDE_DIR cupti.h PATHS
+      "${CUDAToolkit_ROOT_DIR}/extras/CUPTI/include"
+      "${CUDAToolkit_INCLUDE_DIR}/../extras/CUPTI/include"
+      "${CUDAToolkit_INCLUDE_DIR}"
+      NO_DEFAULT_PATH)
+  mark_as_advanced(CUDAToolkit_CUPTI_INCLUDE_DIR)
+
+  if(CUDAToolkit_CUPTI_INCLUDE_DIR)
+    _CUDAToolkit_find_and_add_import_lib(cupti
+                                        EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/
+                                                            ../extras/CUPTI/lib/
+                                        EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}")
+    _CUDAToolkit_find_and_add_import_lib(cupti_static
+                                        EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/
+                                                            ../extras/CUPTI/lib/
+                                        EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}")
+  endif()
+
+  _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver)
+
+  _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml)
+
+  # nvtools can be installed outside the CUDA toolkit directory,
+  # so search the NVTOOLSEXT_PATH windows only environment variable
+  set(nvToolsExt_EXTRA_PATH)
+  if(WIN32)
+     set(nvToolsExt_EXTRA_PATH "C:\\Program Files\\NVIDIA Corporation\\NvToolsExt")
+  endif()
+
+  find_path(CUDAToolkit_nvToolsExt_INCLUDE_DIR nvToolsExt.h
+      PATHS "${CUDAToolkit_INCLUDE_DIR}"
+            "${CUDAToolkit_ROOT_DIR}"
+            ENV NVTOOLSEXT_PATH
+            "${nvToolsExt_EXTRA_PATH}"
+      PATH_SUFFIXES include
+      NO_DEFAULT_PATH)
+  mark_as_advanced(CUDAToolkit_nvToolsExt_INCLUDE_DIR)
+
+  if(CUDAToolkit_nvToolsExt_INCLUDE_DIR)
+    _CUDAToolkit_find_and_add_import_lib(nvToolsExt
+        ALT nvToolsExt64 nvToolsExt64_1
+        EXTRA_HINTS ENV NVTOOLSEXT_PATH
+                    "${nvToolsExt_EXTRA_PATH}"
+        EXTRA_INCLUDE_DIRS "${CUDAToolkit_nvToolsExt_INCLUDE_DIR}")
+  endif()
+
+  _CUDAToolkit_find_and_add_import_lib(OpenCL)
+endif()
+
+unset(CUDAToolkit_ROOT_DIR)
+
+if(_CUDAToolkit_Pop_ROOT_PATH)
+  list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0)
+  unset(_CUDAToolkit_Pop_ROOT_PATH)
+endif()
diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake
index 01594a5b66e0..83df105870b0 100644
--- a/cmake/Modules/FindMKL.cmake
+++ b/cmake/Modules/FindMKL.cmake
@@ -41,6 +41,12 @@ IF (WIN32)
 ELSE (WIN32)
   SET(DEFAULT_INTEL_COMPILER_DIR "/opt/intel")
   SET(DEFAULT_INTEL_MKL_DIR "/opt/intel/mkl")
+  if (EXISTS "/opt/intel/oneapi")
+    SET(DEFAULT_INTEL_COMPILER_DIR "/opt/intel/oneapi")
+    if (EXISTS "/opt/intel/oneapi/mkl/latest")
+      SET(DEFAULT_INTEL_MKL_DIR "/opt/intel/oneapi/mkl/latest")
+    endif()
+  endif()
 ENDIF (WIN32)
 
 # Intel Compiler Suite
diff --git a/cmake/Modules/FindOpenMP.cmake b/cmake/Modules/FindOpenMP.cmake
index 5c1595a29211..04e4ef8fa41f 100644
--- a/cmake/Modules/FindOpenMP.cmake
+++ b/cmake/Modules/FindOpenMP.cmake
@@ -249,11 +249,14 @@ function(_OPENMP_GET_FLAGS LANG FLAG_MODE OPENMP_FLAG_VAR OPENMP_LIB_NAMES_VAR)
 
     if(NOT "${CMAKE_${LANG}_COMPILER_ID}" STREQUAL "GNU")
       find_package(MKL QUIET)
-      if(MKL_FOUND AND (NOT "${MKL_OPENMP_LIBRARY}" STREQUAL ""))
+      if(MKL_FOUND AND MKL_OPENMP_LIBRARY)
         # If we already link OpenMP via MKL, use that. Otherwise at run-time
         # OpenMP will complain about being initialized twice (OMP: Error #15),
         # can may cause incorrect behavior.
         set(OpenMP_libomp_LIBRARY "${MKL_OPENMP_LIBRARY}" CACHE STRING "libomp location for OpenMP")
+        if("-fopenmp=libiomp5" IN_LIST OpenMP_${LANG}_FLAG_CANDIDATES)
+          set(OPENMP_FLAG "-fopenmp=libiomp5")
+        endif()
       else()
         find_library(OpenMP_libomp_LIBRARY
           NAMES omp gomp iomp5
@@ -263,7 +266,7 @@ function(_OPENMP_GET_FLAGS LANG FLAG_MODE OPENMP_FLAG_VAR OPENMP_LIB_NAMES_VAR)
       endif()
       mark_as_advanced(OpenMP_libomp_LIBRARY)
 
-      if (OpenMP_libomp_LIBRARY)
+      if(OpenMP_libomp_LIBRARY)
         try_compile( OpenMP_COMPILE_RESULT_${FLAG_MODE}_${OPENMP_PLAIN_FLAG} ${CMAKE_BINARY_DIR} ${_OPENMP_TEST_SRC}
           CMAKE_FLAGS "-DCOMPILE_DEFINITIONS:STRING=${OPENMP_FLAGS_TEST}"
           LINK_LIBRARIES ${CMAKE_${LANG}_VERBOSE_FLAG} ${OpenMP_libomp_LIBRARY}
@@ -271,7 +274,12 @@ function(_OPENMP_GET_FLAGS LANG FLAG_MODE OPENMP_FLAG_VAR OPENMP_LIB_NAMES_VAR)
         )
         if(OpenMP_COMPILE_RESULT_${FLAG_MODE}_${OPENMP_PLAIN_FLAG})
           set("${OPENMP_FLAG_VAR}" "${OPENMP_FLAG}" PARENT_SCOPE)
-          set("${OPENMP_LIB_NAMES_VAR}" "libomp" PARENT_SCOPE)
+          if(MKL_OPENMP_LIBRARY)
+            set(OpenMP_libiomp5_LIBRARY "${MKL_OPENMP_LIBRARY}" CACHE STRING "libomp location for OpenMP")
+            set("${OPENMP_LIB_NAMES_VAR}" "libiomp5" PARENT_SCOPE)
+          else()
+            set("${OPENMP_LIB_NAMES_VAR}" "libomp" PARENT_SCOPE)
+          endif()
           break()
         endif()
       endif()
diff --git a/cmake/Modules_CUDA_fix/FindCUDNN.cmake b/cmake/Modules_CUDA_fix/FindCUDNN.cmake
index e30d20ba1906..82134328c803 100644
--- a/cmake/Modules_CUDA_fix/FindCUDNN.cmake
+++ b/cmake/Modules_CUDA_fix/FindCUDNN.cmake
@@ -47,4 +47,32 @@ find_library(CUDNN_LIBRARY_PATH ${CUDNN_LIBNAME}
 
 find_package_handle_standard_args(CUDNN DEFAULT_MSG CUDNN_LIBRARY_PATH CUDNN_INCLUDE_PATH)
 
-mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY)
+if(CUDNN_FOUND)
+  # Get cuDNN version
+  if(EXISTS ${CUDNN_INCLUDE_PATH}/cudnn_version.h)
+    file(READ ${CUDNN_INCLUDE_PATH}/cudnn_version.h CUDNN_HEADER_CONTENTS)
+  else()
+    file(READ ${CUDNN_INCLUDE_PATH}/cudnn.h CUDNN_HEADER_CONTENTS)
+  endif()
+  string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
+               CUDNN_VERSION_MAJOR "${CUDNN_HEADER_CONTENTS}")
+  string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
+               CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}")
+  string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)"
+               CUDNN_VERSION_MINOR "${CUDNN_HEADER_CONTENTS}")
+  string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1"
+               CUDNN_VERSION_MINOR "${CUDNN_VERSION_MINOR}")
+  string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)"
+               CUDNN_VERSION_PATCH "${CUDNN_HEADER_CONTENTS}")
+  string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1"
+               CUDNN_VERSION_PATCH "${CUDNN_VERSION_PATCH}")
+  # Assemble cuDNN version
+  if(NOT CUDNN_VERSION_MAJOR)
+    set(CUDNN_VERSION "?")
+  else()
+    set(CUDNN_VERSION
+        "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
+  endif()
+endif()
+
+mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY CUDNN_VERSION)
diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake
index 146724051290..7f45cd098447 100644
--- a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake
+++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake
@@ -630,6 +630,7 @@ macro(cuda_unset_include_and_libraries)
   unset(CUDA_cublas_LIBRARY CACHE)
   unset(CUDA_cublas_device_LIBRARY CACHE)
   unset(CUDA_cublasemu_LIBRARY CACHE)
+  unset(CUDA_cublasLt_LIBRARY CACHE)
   unset(CUDA_cufft_LIBRARY CACHE)
   unset(CUDA_cufftemu_LIBRARY CACHE)
   unset(CUDA_cupti_LIBRARY CACHE)
@@ -963,6 +964,7 @@ endif()
 
 find_cuda_helper_libs(cufft)
 find_cuda_helper_libs(cublas)
+find_cuda_helper_libs(cublasLt)
 # cusparse showed up in version 3.2
 find_cuda_helper_libs(cusparse)
 find_cuda_helper_libs(curand)
@@ -993,7 +995,7 @@ if (CUDA_BUILD_EMULATION)
   set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublasemu_LIBRARY})
 else()
   set(CUDA_CUFFT_LIBRARIES ${CUDA_cufft_LIBRARY})
-  set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY})
+  set(CUDA_CUBLAS_LIBRARIES ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY} ${CUDA_cublasLt_LIBRARY})
 endif()
 
 ########################
@@ -1962,7 +1964,7 @@ macro(CUDA_ADD_CUBLAS_TO_TARGET target)
   if (CUDA_BUILD_EMULATION)
     target_link_libraries(${target} ${CUDA_LINK_LIBRARIES_KEYWORD} ${CUDA_cublasemu_LIBRARY})
   else()
-    target_link_libraries(${target} ${CUDA_LINK_LIBRARIES_KEYWORD} ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY})
+    target_link_libraries(${target} ${CUDA_LINK_LIBRARIES_KEYWORD} ${CUDA_cublas_LIBRARY} ${CUDA_cublas_device_LIBRARY} ${CUDA_cublasLt_LIBRARY})
   endif()
 endmacro()
 
diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake
index 33c484e10296..10dad435b9ba 100644
--- a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake
+++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake
@@ -60,13 +60,16 @@ endif()
 if(NOT CUDA_VERSION VERSION_LESS "11.8")
   list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ada")
   list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Hopper")
+  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.7")
   list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9")
   list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "9.0")
+  list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.7")
   list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.9")
   list(APPEND CUDA_ALL_GPU_ARCHITECTURES "9.0")
 
   if(CUDA_VERSION VERSION_LESS "12.0")
     set(CUDA_LIMIT_GPU_ARCHITECTURE "9.0")
+    list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.7+PTX")
     list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9+PTX")
     list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "9.0+PTX")
   endif()
@@ -204,8 +207,8 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
         set(arch_bin 7.5)
         set(arch_ptx 7.5)
       elseif(${arch_name} STREQUAL "Ampere")
-        set(arch_bin 8.0)
-        set(arch_ptx 8.0)
+        set(arch_bin 8.0 8.6 8.7)
+        set(arch_ptx 8.0 8.6 8.7)
       elseif(${arch_name} STREQUAL "Ada")
         set(arch_bin 8.9)
         set(arch_ptx 8.9)
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index 23c9cd8eeb77..053af1a0b2ab 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -84,26 +84,17 @@ function(caffe2_print_configuration_summary)
       message(STATUS "    cuDNN version       : ${CUDNN_VERSION}")
     endif()
     message(STATUS "    CUDA root directory : ${CUDA_TOOLKIT_ROOT_DIR}")
-    get_target_property(__tmp caffe2::cuda IMPORTED_LOCATION)
-    message(STATUS "    CUDA library        : ${__tmp}")
-    get_target_property(__tmp torch::cudart INTERFACE_LINK_LIBRARIES)
-    message(STATUS "    cudart library      : ${__tmp}")
-    get_target_property(__tmp caffe2::cublas INTERFACE_LINK_LIBRARIES)
-    message(STATUS "    cublas library      : ${__tmp}")
-    get_target_property(__tmp caffe2::cufft INTERFACE_LINK_LIBRARIES)
-    message(STATUS "    cufft library       : ${__tmp}")
-    get_target_property(__tmp caffe2::curand IMPORTED_LOCATION)
-    message(STATUS "    curand library      : ${__tmp}")
+    message(STATUS "    CUDA library        : ${CUDA_cuda_driver_LIBRARY}")
+    message(STATUS "    cudart library      : ${CUDA_cudart_LIBRARY}")
+    message(STATUS "    cublas library      : ${CUDA_cublas_LIBRARY}")
+    message(STATUS "    cufft library       : ${CUDA_cufft_LIBRARY}")
+    message(STATUS "    curand library      : ${CUDA_curand_LIBRARY}")
+    message(STATUS "    cusparse library    : ${CUDA_cusparse_LIBRARY}")
     if(${USE_CUDNN})
-      get_target_property(__tmp caffe2::cudnn-public INTERFACE_LINK_LIBRARIES)
+      get_target_property(__tmp torch::cudnn INTERFACE_LINK_LIBRARIES)
       message(STATUS "    cuDNN library       : ${__tmp}")
-      if(${CUDNN_STATIC})
-        get_target_property(__tmp caffe2::cudnn-private INTERFACE_LINK_LIBRARIES)
-        message(STATUS "    cuDNN static library: ${__tmp}")
-      endif()
     endif()
-    get_target_property(__tmp caffe2::nvrtc IMPORTED_LOCATION)
-    message(STATUS "    nvrtc               : ${__tmp}")
+    message(STATUS "    nvrtc               : ${CUDA_nvrtc_LIBRARY}")
     message(STATUS "    CUDA include path   : ${CUDA_INCLUDE_DIRS}")
     message(STATUS "    NVCC executable     : ${CUDA_NVCC_EXECUTABLE}")
     message(STATUS "    CUDA compiler       : ${CMAKE_CUDA_COMPILER}")
@@ -120,6 +111,7 @@ function(caffe2_print_configuration_summary)
   if(${USE_ROCM})
     message(STATUS "    ROCM_VERSION        : ${ROCM_VERSION}")
   endif()
+  message(STATUS "  BUILD_NVFUSER         : ${BUILD_NVFUSER}")
   message(STATUS "  USE_EIGEN_FOR_BLAS    : ${CAFFE2_USE_EIGEN_FOR_BLAS}")
   message(STATUS "  USE_FBGEMM            : ${USE_FBGEMM}")
   message(STATUS "    USE_FAKELOWP          : ${USE_FAKELOWP}")
@@ -195,6 +187,8 @@ function(caffe2_print_configuration_summary)
   endif()
   message(STATUS "  Public Dependencies  : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}")
   message(STATUS "  Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}")
+  message(STATUS "  Public CUDA Deps.    : ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}")
+  message(STATUS "  Private CUDA Deps.   : ${Caffe2_CUDA_DEPENDENCY_LIBS}")
   # coreml
   message(STATUS "  USE_COREML_DELEGATE     : ${USE_COREML_DELEGATE}")
   message(STATUS "  BUILD_LAZY_TS_BACKEND   : ${BUILD_LAZY_TS_BACKEND}")
diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
index 33f697104e3c..68de16b5a0de 100644
--- a/cmake/public/cuda.cmake
+++ b/cmake/public/cuda.cmake
@@ -39,8 +39,8 @@ endif()
 # Enable CUDA language support
 set(CUDAToolkit_ROOT "${CUDA_TOOLKIT_ROOT_DIR}")
 # Pass clang as host compiler, which according to the docs
-# Must be done before CUDA language is enabled, see  mast be done before
-# see https://cmake.org/cmake/help/v3.15/variable/CMAKE_CUDA_HOST_COMPILER.html
+# Must be done before CUDA language is enabled, see
+# https://cmake.org/cmake/help/v3.15/variable/CMAKE_CUDA_HOST_COMPILER.html
 if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
   set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}")
 endif()
@@ -48,6 +48,27 @@ enable_language(CUDA)
 set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
 set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 
+# CMP0074 - find_package will respect <PackageName>_ROOT variables
+cmake_policy(PUSH)
+if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.12.0)
+  cmake_policy(SET CMP0074 NEW)
+endif()
+
+find_package(CUDAToolkit REQUIRED)
+
+cmake_policy(POP)
+
+if(NOT CMAKE_CUDA_COMPILER_VERSION STREQUAL CUDAToolkit_VERSION OR
+    NOT CUDA_INCLUDE_DIRS STREQUAL CUDAToolkit_INCLUDE_DIR)
+  message(FATAL_ERROR "Found two conflicting CUDA installs:\n"
+                      "V${CMAKE_CUDA_COMPILER_VERSION} in '${CUDA_INCLUDE_DIRS}' and\n"
+                      "V${CUDAToolkit_VERSION} in '${CUDAToolkit_INCLUDE_DIR}'")
+endif()
+
+if(NOT TARGET CUDA::nvToolsExt)
+  message(FATAL_ERROR "Failed to find nvToolsExt")
+endif()
+
 message(STATUS "Caffe2: CUDA detected: " ${CUDA_VERSION})
 message(STATUS "Caffe2: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE})
 message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR})
@@ -107,21 +128,6 @@ if(CUDA_FOUND)
   endif()
 endif()
 
-# Find cuDNN.
-if(USE_STATIC_CUDNN)
-  set(CUDNN_STATIC ON CACHE BOOL "")
-else()
-  set(CUDNN_STATIC OFF CACHE BOOL "")
-endif()
-
-find_package(CUDNN)
-
-if(CAFFE2_USE_CUDNN AND NOT CUDNN_FOUND)
-  message(WARNING
-    "Caffe2: Cannot find cuDNN library. Turning the option off")
-  set(CAFFE2_USE_CUDNN OFF)
-endif()
-
 # Optionally, find TensorRT
 if(CAFFE2_USE_TENSORRT)
   find_path(TENSORRT_INCLUDE_DIR NvInfer.h
@@ -153,39 +159,6 @@ if(CAFFE2_USE_TENSORRT)
   endif()
 endif()
 
-# ---[ Extract versions
-if(CAFFE2_USE_CUDNN)
-  # Get cuDNN version
-  if(EXISTS ${CUDNN_INCLUDE_PATH}/cudnn_version.h)
-    file(READ ${CUDNN_INCLUDE_PATH}/cudnn_version.h CUDNN_HEADER_CONTENTS)
-  else()
-    file(READ ${CUDNN_INCLUDE_PATH}/cudnn.h CUDNN_HEADER_CONTENTS)
-  endif()
-  string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
-               CUDNN_VERSION_MAJOR "${CUDNN_HEADER_CONTENTS}")
-  string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
-               CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}")
-  string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)"
-               CUDNN_VERSION_MINOR "${CUDNN_HEADER_CONTENTS}")
-  string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1"
-               CUDNN_VERSION_MINOR "${CUDNN_VERSION_MINOR}")
-  string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)"
-               CUDNN_VERSION_PATCH "${CUDNN_HEADER_CONTENTS}")
-  string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1"
-               CUDNN_VERSION_PATCH "${CUDNN_VERSION_PATCH}")
-  # Assemble cuDNN version
-  if(NOT CUDNN_VERSION_MAJOR)
-    set(CUDNN_VERSION "?")
-  else()
-    set(CUDNN_VERSION
-        "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
-  endif()
-  message(STATUS "Found cuDNN: v${CUDNN_VERSION}  (include: ${CUDNN_INCLUDE_PATH}, library: ${CUDNN_LIBRARY_PATH})")
-  if(CUDNN_VERSION VERSION_LESS "7.0.0")
-    message(FATAL_ERROR "PyTorch requires cuDNN 7 and above.")
-  endif()
-endif()
-
 # ---[ CUDA libraries wrapper
 
 # find libcuda.so and lbnvrtc.so
@@ -193,12 +166,8 @@ endif()
 # stubs folder, in case we are building on a system that does not
 # have cuda driver installed. On windows, we also search under the
 # folder lib/x64.
-find_library(CUDA_CUDA_LIB cuda
-    PATHS ${CUDA_TOOLKIT_ROOT_DIR}
-    PATH_SUFFIXES lib lib64 lib/stubs lib64/stubs lib/x64)
-find_library(CUDA_NVRTC_LIB nvrtc
-    PATHS ${CUDA_TOOLKIT_ROOT_DIR}
-    PATH_SUFFIXES lib lib64 lib/x64)
+set(CUDA_CUDA_LIB "${CUDA_cuda_driver_LIBRARY}" CACHE FILEPATH "")
+set(CUDA_NVRTC_LIB "${CUDA_nvrtc_LIBRARY}" CACHE FILEPATH "")
 if(CUDA_NVRTC_LIB AND NOT CUDA_NVRTC_SHORTHASH)
   if("${PYTHON_EXECUTABLE}" STREQUAL "")
     set(_python_exe "python")
@@ -226,164 +195,101 @@ endif()
 # end-users should never have this flag set.
 
 # cuda
-add_library(caffe2::cuda UNKNOWN IMPORTED)
-set_property(
-    TARGET caffe2::cuda PROPERTY IMPORTED_LOCATION
-    ${CUDA_CUDA_LIB})
+add_library(caffe2::cuda INTERFACE IMPORTED)
 set_property(
-    TARGET caffe2::cuda PROPERTY INTERFACE_INCLUDE_DIRECTORIES
-    ${CUDA_INCLUDE_DIRS})
+    TARGET caffe2::cuda PROPERTY INTERFACE_LINK_LIBRARIES
+    CUDA::cuda_driver)
 
-# cudart. CUDA_LIBRARIES is actually a list, so we will make an interface
-# library.
+# cudart
 add_library(torch::cudart INTERFACE IMPORTED)
 if(CAFFE2_STATIC_LINK_CUDA)
     set_property(
         TARGET torch::cudart PROPERTY INTERFACE_LINK_LIBRARIES
-        "${CUDA_cudart_static_LIBRARY}")
-    if(NOT WIN32)
-      set_property(
-          TARGET torch::cudart APPEND PROPERTY INTERFACE_LINK_LIBRARIES
-          rt dl)
-    endif()
+        CUDA::cudart_static)
 else()
     set_property(
         TARGET torch::cudart PROPERTY INTERFACE_LINK_LIBRARIES
-        ${CUDA_LIBRARIES})
+        CUDA::cudart)
 endif()
-set_property(
-    TARGET torch::cudart PROPERTY INTERFACE_INCLUDE_DIRECTORIES
-    ${CUDA_INCLUDE_DIRS})
 
 # nvToolsExt
 add_library(torch::nvtoolsext INTERFACE IMPORTED)
-if(MSVC)
-  if(NOT NVTOOLEXT_HOME)
-    set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt")
-  endif()
-  if(DEFINED ENV{NVTOOLSEXT_PATH})
-    set(NVTOOLEXT_HOME $ENV{NVTOOLSEXT_PATH})
-    file(TO_CMAKE_PATH ${NVTOOLEXT_HOME} NVTOOLEXT_HOME)
-  endif()
-  set_target_properties(
-      torch::nvtoolsext PROPERTIES
-      INTERFACE_LINK_LIBRARIES ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib
-      INTERFACE_INCLUDE_DIRECTORIES ${NVTOOLEXT_HOME}/include)
-
-elseif(APPLE)
-  set_property(
-      TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES
-      ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib
-      ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib)
-
-else()
-  find_library(LIBNVTOOLSEXT libnvToolsExt.so PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/)
-  set_property(
-      TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES
-      ${LIBNVTOOLSEXT})
-endif()
+set_property(
+    TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES
+    CUDA::nvToolsExt)
 
-# cublas. CUDA_CUBLAS_LIBRARIES is actually a list, so we will make an
-# interface library similar to cudart.
+# cublas
 add_library(caffe2::cublas INTERFACE IMPORTED)
 if(CAFFE2_STATIC_LINK_CUDA AND NOT WIN32)
     set_property(
         TARGET caffe2::cublas PROPERTY INTERFACE_LINK_LIBRARIES
-        ${CUDA_CUBLAS_LIBRARIES})
-    # Add explicit dependency to cudart_static to fix
-    # libcublasLt_static.a.o): undefined reference to symbol 'cudaStreamWaitEvent'
-    # error adding symbols: DSO missing from command line
+        # NOTE: cublas is always linked dynamically
+        CUDA::cublas CUDA::cublasLt)
     set_property(
-      TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES
-      "${CUDA_cudart_static_LIBRARY}" rt dl)
+        TARGET caffe2::cublas APPEND PROPERTY INTERFACE_LINK_LIBRARIES
+        CUDA::cudart_static rt)
 else()
     set_property(
         TARGET caffe2::cublas PROPERTY INTERFACE_LINK_LIBRARIES
-        ${CUDA_CUBLAS_LIBRARIES})
+        CUDA::cublas CUDA::cublasLt)
 endif()
-set_property(
-    TARGET caffe2::cublas PROPERTY INTERFACE_INCLUDE_DIRECTORIES
-    ${CUDA_INCLUDE_DIRS})
 
-# cudnn public and private interfaces
+# cudnn interface
 # static linking is handled by USE_STATIC_CUDNN environment variable
-# If library is linked dynamically, than private interface is no-op
-# If library is linked statically:
-#  - public interface would only reference headers
-#  - private interface will contain the actual link instructions
 if(CAFFE2_USE_CUDNN)
-  add_library(caffe2::cudnn-public INTERFACE IMPORTED)
-  set_property(
-    TARGET caffe2::cudnn-public PROPERTY INTERFACE_INCLUDE_DIRECTORIES
-    ${CUDNN_INCLUDE_PATH})
-  add_library(caffe2::cudnn-private INTERFACE IMPORTED)
-  set_property(
-    TARGET caffe2::cudnn-private PROPERTY INTERFACE_INCLUDE_DIRECTORIES
-    ${CUDNN_INCLUDE_PATH})
+  if(USE_STATIC_CUDNN)
+    set(CUDNN_STATIC ON CACHE BOOL "")
+  else()
+    set(CUDNN_STATIC OFF CACHE BOOL "")
+  endif()
+
+  find_package(CUDNN)
+
+  if(NOT CUDNN_FOUND)
+    message(WARNING
+      "Cannot find cuDNN library. Turning the option off")
+    set(CAFFE2_USE_CUDNN OFF)
+  else()
+    if(CUDNN_VERSION VERSION_LESS "8.0.0")
+      message(FATAL_ERROR "PyTorch requires cuDNN 8 and above.")
+    endif()
+  endif()
+
+  add_library(torch::cudnn INTERFACE IMPORTED)
+  target_include_directories(torch::cudnn INTERFACE ${CUDNN_INCLUDE_PATH})
   if(CUDNN_STATIC AND NOT WIN32)
-    set_property(
-      TARGET caffe2::cudnn-private PROPERTY INTERFACE_LINK_LIBRARIES
-      ${CUDNN_LIBRARY_PATH})
-    set_property(
-      TARGET caffe2::cudnn-private APPEND PROPERTY INTERFACE_LINK_LIBRARIES
-      "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" dl)
-    # Add explicit dependency on cublas to cudnn
-    get_target_property(__tmp caffe2::cublas INTERFACE_LINK_LIBRARIES)
-    set_property(
-      TARGET caffe2::cudnn-private APPEND PROPERTY INTERFACE_LINK_LIBRARIES
-      "${__tmp}")
-    # Lines below use target_link_libraries because we support cmake 3.5+.
-    # For cmake 3.13+, target_link_options to set INTERFACE_LINK_OPTIONS would be better.
-    # https://cmake.org/cmake/help/v3.5/command/target_link_libraries.html warns
-    # "Item names starting with -, but not -l or -framework, are treated as linker flags.
-    #  Note that such flags will be treated like any other library link item for purposes
-    #  of transitive dependencies, so they are generally safe to specify only as private
-    #  link items that will not propagate to dependents."
-    # Propagating to a dependent (torch_cuda) is exactly what we want here, so we are
-    # flouting the warning, but I can't think of a better (3.5+ compatible) way.
-    target_link_libraries(caffe2::cudnn-private INTERFACE
+    target_link_options(torch::cudnn INTERFACE
         "-Wl,--exclude-libs,libcudnn_static.a")
   else()
-  set_property(
-    TARGET caffe2::cudnn-public PROPERTY INTERFACE_LINK_LIBRARIES
-    ${CUDNN_LIBRARY_PATH})
+    target_link_libraries(torch::cudnn INTERFACE ${CUDNN_LIBRARY_PATH})
   endif()
+else()
+  message(STATUS "USE_CUDNN is set to 0. Compiling without cuDNN support")
 endif()
 
 # curand
-add_library(caffe2::curand UNKNOWN IMPORTED)
+add_library(caffe2::curand INTERFACE IMPORTED)
 if(CAFFE2_STATIC_LINK_CUDA AND NOT WIN32)
-    set_property(
-        TARGET caffe2::curand PROPERTY IMPORTED_LOCATION
-        "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand_static.a")
     set_property(
         TARGET caffe2::curand PROPERTY INTERFACE_LINK_LIBRARIES
-        "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" dl)
+        CUDA::curand_static)
 else()
     set_property(
-        TARGET caffe2::curand PROPERTY IMPORTED_LOCATION
-        ${CUDA_curand_LIBRARY})
+        TARGET caffe2::curand PROPERTY INTERFACE_LINK_LIBRARIES
+        CUDA::curand)
 endif()
-set_property(
-    TARGET caffe2::curand PROPERTY INTERFACE_INCLUDE_DIRECTORIES
-    ${CUDA_INCLUDE_DIRS})
 
-# cufft. CUDA_CUFFT_LIBRARIES is actually a list, so we will make an
-# interface library similar to cudart.
+# cufft
 add_library(caffe2::cufft INTERFACE IMPORTED)
 if(CAFFE2_STATIC_LINK_CUDA AND NOT WIN32)
     set_property(
         TARGET caffe2::cufft PROPERTY INTERFACE_LINK_LIBRARIES
-        "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcufft_static_nocallback.a"
-        "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" dl)
+        CUDA::cufft_static_nocallback)
 else()
     set_property(
         TARGET caffe2::cufft PROPERTY INTERFACE_LINK_LIBRARIES
-        ${CUDA_CUFFT_LIBRARIES})
+        CUDA::cufft)
 endif()
-set_property(
-    TARGET caffe2::cufft PROPERTY INTERFACE_INCLUDE_DIRECTORIES
-    ${CUDA_INCLUDE_DIRS})
 
 # TensorRT
 if(CAFFE2_USE_TENSORRT)
@@ -397,28 +303,10 @@ if(CAFFE2_USE_TENSORRT)
 endif()
 
 # nvrtc
-add_library(caffe2::nvrtc UNKNOWN IMPORTED)
-set_property(
-    TARGET caffe2::nvrtc PROPERTY IMPORTED_LOCATION
-    ${CUDA_NVRTC_LIB})
+add_library(caffe2::nvrtc INTERFACE IMPORTED)
 set_property(
-    TARGET caffe2::nvrtc PROPERTY INTERFACE_INCLUDE_DIRECTORIES
-    ${CUDA_INCLUDE_DIRS})
-
-# Note: in theory, we can add similar dependent library wrappers. For
-# now, Caffe2 only uses the above libraries, so we will only wrap
-# these.
-
-# Special care for windows platform: we know that 32-bit windows does not
-# support cuda.
-if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
-  if(NOT (CMAKE_SIZEOF_VOID_P EQUAL 8))
-    message(FATAL_ERROR
-            "CUDA support not available with 32-bit windows. Did you "
-            "forget to set Win64 in the generator target?")
-    return()
-  endif()
-endif()
+    TARGET caffe2::nvrtc PROPERTY INTERFACE_LINK_LIBRARIES
+    CUDA::nvrtc)
 
 # Add onnx namepsace definition to nvcc
 if(ONNX_NAMESPACE)
@@ -461,17 +349,6 @@ if(MSVC)
   list(APPEND CUDA_NVCC_FLAGS "--no-host-device-move-forward")
 endif()
 
-# OpenMP flags for NVCC with Clang-cl
-if("${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC"
-  AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-  list(APPEND CUDA_PROPAGATE_HOST_FLAGS_BLOCKLIST "-Xclang" "-fopenmp")
-  if(MSVC_TOOLSET_VERSION LESS 142)
-    list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-openmp")
-  else()
-    list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-openmp:experimental")
-  endif()
-endif()
-
 # Debug and Release symbol support
 if(MSVC)
   if(${CAFFE2_USE_MSVC_STATIC_RUNTIME})
diff --git a/cmake/public/threads.cmake b/cmake/public/threads.cmake
deleted file mode 100644
index 749619d64d99..000000000000
--- a/cmake/public/threads.cmake
+++ /dev/null
@@ -1,29 +0,0 @@
-if(TARGET caffe2::Threads)
-  return()
-endif()
-
-find_package(Threads REQUIRED)
-
-# Threads::Threads doesn't work if the target has CUDA code
-if(THREADS_FOUND)
-  add_library(caffe2::Threads INTERFACE IMPORTED)
-
-  if(THREADS_HAVE_PTHREAD_ARG)
-    set(compile_options
-        $<$<COMPILE_LANGUAGE:C>:-pthread>
-        $<$<COMPILE_LANGUAGE:CXX>:-pthread>)
-    if(USE_CUDA)
-      list(APPEND compile_options
-        $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler -pthread>)
-    endif()
-
-    set_property(TARGET caffe2::Threads
-                 PROPERTY INTERFACE_COMPILE_OPTIONS
-                 ${compile_options})
-  endif()
-
-  if(CMAKE_THREAD_LIBS_INIT)
-    set_property(TARGET caffe2::Threads
-                 PROPERTY INTERFACE_LINK_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
-  endif()
-endif()
diff --git a/cmake/public/utils.cmake b/cmake/public/utils.cmake
index 9ad0a2f96f88..0ce0f3b080c9 100644
--- a/cmake/public/utils.cmake
+++ b/cmake/public/utils.cmake
@@ -429,18 +429,6 @@ function(torch_compile_options libname)
         ${MSVC_RUNTIME_LIBRARY_OPTION}
         $<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:${MSVC_DEBINFO_OPTION}>
         /EHsc
-        /DNOMINMAX
-        /wd4267
-        /wd4251
-        /wd4522
-        /wd4522
-        /wd4838
-        /wd4305
-        /wd4244
-        /wd4190
-        /wd4101
-        /wd4996
-        /wd4275
         /bigobj>
       )
   else()
@@ -512,26 +500,6 @@ function(torch_compile_options libname)
 
 endfunction()
 
-
-##############################################################################
-# Set standard target properties.
-# Usage:
-#   torch_set_target_props(lib_name)
-function(torch_set_target_props libname)
-  if(MSVC AND AT_MKL_MT)
-    set(VCOMP_LIB "vcomp")
-    set_target_properties(${libname} PROPERTIES LINK_FLAGS_MINSIZEREL "/NODEFAULTLIB:${VCOMP_LIB}")
-    set_target_properties(${libname} PROPERTIES LINK_FLAGS_RELWITHDEBINFO "/NODEFAULTLIB:${VCOMP_LIB}")
-    set_target_properties(${libname} PROPERTIES LINK_FLAGS_RELEASE "/NODEFAULTLIB:${VCOMP_LIB}")
-    set_target_properties(${libname} PROPERTIES LINK_FLAGS_DEBUG "/NODEFAULTLIB:${VCOMP_LIB}d")
-    set_target_properties(${libname} PROPERTIES STATIC_LIBRARY_FLAGS_MINSIZEREL "/NODEFAULTLIB:${VCOMP_LIB}")
-    set_target_properties(${libname} PROPERTIES STATIC_LIBRARY_FLAGS_RELWITHDEBINFO "/NODEFAULTLIB:${VCOMP_LIB}")
-    set_target_properties(${libname} PROPERTIES STATIC_LIBRARY_FLAGS_RELEASE "/NODEFAULTLIB:${VCOMP_LIB}")
-    set_target_properties(${libname} PROPERTIES STATIC_LIBRARY_FLAGS_DEBUG "/NODEFAULTLIB:${VCOMP_LIB}d")
-  endif()
-endfunction()
-
-
 ##############################################################################
 # Set old-style FindCuda.cmake compile flags from modern CMake cuda flags.
 # Usage:
diff --git a/docker.Makefile b/docker.Makefile
index f85a3c3a3fc1..fd49964c4587 100644
--- a/docker.Makefile
+++ b/docker.Makefile
@@ -8,7 +8,7 @@ $(warning WARNING: No docker user found using results from whoami)
 DOCKER_ORG                = $(shell whoami)
 endif
 
-CUDA_VERSION              = 11.6.2
+CUDA_VERSION              = 11.7.0
 CUDNN_VERSION             = 8
 BASE_RUNTIME              = ubuntu:18.04
 BASE_DEVEL                = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu18.04
diff --git a/docs/caffe2/installation.md b/docs/caffe2/installation.md
index 6abc67f58a70..6c8ac2f2b954 100644
--- a/docs/caffe2/installation.md
+++ b/docs/caffe2/installation.md
@@ -58,10 +58,6 @@ Note that you might need to uninstall existing Eigen and pybind11 packages due t
 
 ## Python support
 
-To use Caffe2 in Python, you need two libraries, future and six.
-
-    pip install future six
-
 To run the tutorials, download additional source from GitHub.
 
     git clone --recursive https://github.com/caffe2/tutorials.git caffe2_tutorials
diff --git a/docs/cpp/requirements.txt b/docs/cpp/requirements.txt
index ca3eb7da6846..da401f2883a6 100644
--- a/docs/cpp/requirements.txt
+++ b/docs/cpp/requirements.txt
@@ -6,4 +6,3 @@ docutils==0.16
 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
 bs4
 lxml
-six
diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html
index 18776049fe70..366e7221f34f 100644
--- a/docs/source/_templates/layout.html
+++ b/docs/source/_templates/layout.html
@@ -31,6 +31,20 @@
     {% include "searchbox.html" %}
 {% endblock %}
 
+{%- block content %}
+{{ super() }}
+<script>
+
+var match = window.location.href.match(/\/_[a-zA-Z0-9_]*.html|_dynamo/gi);
+var url = window.location.href.lastIndexOf(match[match.length-1]);
+
+if (url)
+  {
+    var div = '<div class="admonition note"><p class="admonition-title">Note</p><p><i class="fa fa-exclamation-circle" aria-hidden="true">&nbsp</i> This page describes an internal API which is not intended to be used outside of the PyTorch codebase and can be modified or removed without notice.</p></div>'
+    document.getElementById("pytorch-article").insertAdjacentHTML('afterBegin', div)
+  }
+</script>
+{%- endblock %}
 
 {% block footer %}
 {{ super() }}
diff --git a/docs/source/community/contribution_guide.rst b/docs/source/community/contribution_guide.rst
index 30bd9c6cf975..0c7acc901261 100644
--- a/docs/source/community/contribution_guide.rst
+++ b/docs/source/community/contribution_guide.rst
@@ -2,7 +2,7 @@ PyTorch Contribution Guide
 ==========================
 
 PyTorch is a GPU-accelerated Python tensor computation package for
-building deep neural networks using a on tape-based autograd systems.
+building deep neural networks using a tape-based autograd systems.
 
 Contribution Process
 --------------------
@@ -129,7 +129,7 @@ proposed solution. The PyTorch team can provide guidance that saves you
 time.
 
 Issues that are labeled first-new-issue, low, or medium priority provide
-the best entrance point are great places to start.
+the best entrance points and are great places to start.
 
 Adding Tutorials
 ~~~~~~~~~~~~~~~~
diff --git a/docs/source/community/governance.rst b/docs/source/community/governance.rst
index 3aa124846328..36c9ee281614 100644
--- a/docs/source/community/governance.rst
+++ b/docs/source/community/governance.rst
@@ -128,9 +128,9 @@ The Process for Nomination
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 * Each module has its own process. Please contact module maintainers for more information.
-  However, if there is no process identified, you can file a request to the core maintainers
-  by submitting `this form <https://forms.gle/xNeu1byGMZVHcA2q7>`__. Core maintainers are
-  meeting every three months.
+  However, if there is no process identified, you can file a request to the core
+  maintainers by submitting `this form <https://share.hsforms.com/1fh3SpHFMR2ihEBQ2orgN8A4tvhy>`__.
+  Core maintainers are meeting every three months.
 * If you are submitting a request to the core maintainers, the information in your request
   must include the following items:
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 90f1659d30e5..1911860ea955 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -175,6 +175,7 @@
     "AnyType",
     "Argument",
     "ArgumentSpec",
+    "AwaitType",
     "BenchmarkConfig",
     "BenchmarkExecutionStats",
     "Block",
@@ -274,6 +275,7 @@
     # torch.cuda._sanitizer
     "Access",
     "AccessType",
+    "Await",
     "CUDASanitizer",
     "CUDASanitizerDispatchMode",
     "CUDASanitizerErrors",
@@ -349,7 +351,7 @@
 
 # General information about the project.
 project = 'PyTorch'
-copyright = '2022, PyTorch Contributors'
+copyright = '2023, PyTorch Contributors'
 author = 'PyTorch Contributors'
 torch_version = str(torch.__version__)
 
diff --git a/docs/source/cuda.rst b/docs/source/cuda.rst
index b14e5cec360d..e208da759dec 100644
--- a/docs/source/cuda.rst
+++ b/docs/source/cuda.rst
@@ -33,6 +33,9 @@ torch.cuda
     stream
     synchronize
     utilization
+    temperature
+    power_draw
+    clock_rate
     OutOfMemoryError
 
 Random Number Generator
diff --git a/docs/source/ddp_comm_hooks.rst b/docs/source/ddp_comm_hooks.rst
index 881e7f97edb1..8c6022b83d7f 100644
--- a/docs/source/ddp_comm_hooks.rst
+++ b/docs/source/ddp_comm_hooks.rst
@@ -134,7 +134,7 @@ Here is a simple, end-to-end example of saving and reloading PowerSGD state and
 
     class SimpleModel(nn.Module):
         def __init__(self):
-            super(SimpleModel, self).__init__()
+            super().__init__()
             self.fc1 = nn.Linear(24,24)
             self.relu = nn.ReLU()
             self.fc2 = nn.Linear(24,12)
diff --git a/docs/source/distributed.checkpoint.rst b/docs/source/distributed.checkpoint.rst
index 380ec0e6022a..4ace8e48caf6 100644
--- a/docs/source/distributed.checkpoint.rst
+++ b/docs/source/distributed.checkpoint.rst
@@ -1,4 +1,69 @@
-Distributed Checkpoint
-========================
+.. role:: hidden
+    :class: hidden-section
+
+Distributed Checkpoint - torch.distributed.checkpoint
+=====================================================
+
+
+Distributed Checkpoint (DCP) support loading and saving models from multiple ranks in parallel.
+It handles load-time resharding which enables saving in one cluster topology and loading into another.
+
+DCP is different than `torch.save` and `torch.load` in a few significant ways:
+
+* It produces multiple files per checkpoint, with at least one per rank.
+* It operates in place, meaning that the model should allocate its data first and DCP uses that storage instead.
+
+The entrypoints to load and save a checkpoint are the following:
+
 
 .. automodule:: torch.distributed.checkpoint
+
+.. currentmodule:: torch.distributed.checkpoint
+
+.. autofunction::  load_state_dict
+.. autofunction::  save_state_dict
+
+The following types define the IO interface used during checkpoint:
+
+.. autoclass:: torch.distributed.checkpoint.StorageReader
+  :members:
+
+.. autoclass:: torch.distributed.checkpoint.StorageWriter
+  :members:
+
+The following types define the planner interface used during checkpoint:
+
+.. autoclass:: torch.distributed.checkpoint.LoadPlanner
+  :members:
+
+.. autoclass:: torch.distributed.checkpoint.LoadPlan
+  :members:
+
+.. autoclass:: torch.distributed.checkpoint.ReadItem
+  :members:
+
+.. autoclass:: torch.distributed.checkpoint.SavePlanner
+  :members:
+
+.. autoclass:: torch.distributed.checkpoint.SavePlan
+  :members:
+
+.. autoclass:: torch.distributed.checkpoint.WriteItem
+  :members:
+
+We provide a filesystem based storage layer:
+
+.. autoclass:: torch.distributed.checkpoint.FileSystemReader
+  :members:
+
+.. autoclass:: torch.distributed.checkpoint.FileSystemWriter
+  :members:
+
+We provide default implementations of `LoadPlanner` and `SavePlanner` that
+can handle all of torch.distributed constructs such as FSDP, DDP, ShardedTensor and DistributedTensor.
+
+.. autoclass:: torch.distributed.checkpoint.DefaultSavePlanner
+  :members:
+
+.. autoclass:: torch.distributed.checkpoint.DefaultLoadPlanner
+  :members:
diff --git a/docs/source/distributed.tensor.parallel.rst b/docs/source/distributed.tensor.parallel.rst
index 64544539edd4..a46647473915 100644
--- a/docs/source/distributed.tensor.parallel.rst
+++ b/docs/source/distributed.tensor.parallel.rst
@@ -1,7 +1,69 @@
 .. role:: hidden
     :class: hidden-section
 
-Tensor Parallelism
-========================
-.. py:module:: torch.distributed.tensor.parallel
+Tensor Parallelism - torch.distributed.tensor.parallel
+======================================================
+
+Tensor Parallelism(TP) is built on top of the PyTorch DistributedTensor
+(`DTensor <https://github.com/pytorch/pytorch/blob/master/torch/distributed/_tensor/README.md>`__)
+and provides several parallelism styles: Rowwise, Colwise and Pairwise Parallelism.
+
+.. warning ::
+    Tensor Parallelism APIs are experimental and subject to change.
+
+The entrypoint to parallelize your ``nn.Module`` using Tensor Parallelism is:
+
+.. automodule:: torch.distributed.tensor.parallel
+
 .. currentmodule:: torch.distributed.tensor.parallel
+
+.. autofunction::  parallelize_module
+
+Tensor Parallelism supports the following parallel styles:
+
+.. autoclass:: torch.distributed.tensor.parallel.style.RowwiseParallel
+  :members:
+
+.. autoclass:: torch.distributed.tensor.parallel.style.ColwiseParallel
+  :members:
+
+.. autoclass:: torch.distributed.tensor.parallel.style.PairwiseParallel
+  :members:
+
+.. warning ::
+    Sequence Parallelism are still in experimental and no evaluation has been done.
+
+.. autoclass:: torch.distributed.tensor.parallel.style.PairwiseSequenceParallel
+  :members:
+
+Since Tensor Parallelism is built on top of DTensor, we need to specify the
+input and output placement of the module with DTensors so it can expectedly
+interacts with the module before and after. The followings are functions
+used for input/output preparation:
+
+
+.. currentmodule:: torch.distributed.tensor.parallel.style
+
+.. autofunction::  make_input_replicate_1d
+.. autofunction::  make_input_reshard_replicate
+.. autofunction::  make_input_shard_1d
+.. autofunction::  make_input_shard_1d_last_dim
+.. autofunction::  make_output_replicate_1d
+.. autofunction::  make_output_reshard_tensor
+.. autofunction::  make_output_shard_1d
+.. autofunction::  make_output_tensor
+
+Currently, there are some constraints which makes it hard for the `nn.MultiheadAttention`
+module to work out of box for Tensor Parallelism, so we built this multihead_attention
+module for Tensor Parallelism users. Also, in ``parallelize_module``, we automatically
+swap ``nn.MultiheadAttention`` to this custom module when specifying ``PairwiseParallel``.
+
+.. autoclass:: torch.distributed.tensor.parallel.multihead_attention_tp.TensorParallelMultiheadAttention
+  :members:
+
+We also enabled 2D parallelism to integrate with ``FullyShardedDataParallel``.
+Users just need to call the following API explicitly:
+
+
+.. currentmodule:: torch.distributed.tensor.parallel.fsdp
+.. autofunction::  enable_2d_with_fsdp
diff --git a/docs/source/dynamo/custom-backends.rst b/docs/source/dynamo/custom-backends.rst
index 7322fceb5181..0649e9134101 100644
--- a/docs/source/dynamo/custom-backends.rst
+++ b/docs/source/dynamo/custom-backends.rst
@@ -1,8 +1,132 @@
 Custom Backends
 ===============
 
+Overview
+--------
+
+``torch.compile`` provides a straightforward method to enable users
+to define custom backends.
+
+A backend function has the contract
+``(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]) -> Callable``.
+
+Backend functions can be called by TorchDynamo, the graph tracing component of ``torch.compile``,
+after tracing an FX graph and are
+expected to return a compiled function that is equivalent to the traced FX graph.
+The returned callable should have the same contract as the ``forward`` function of the original ``torch.fx.GraphModule``
+passed into the backend:
+``(*args: torch.Tensor) -> List[torch.Tensor]``.
+
+In order for TorchDynamo to call your backend, pass your backend function as the ``backend`` kwarg in
+``torch.compile``. For example,
+
+.. code-block:: python
+
+    import torch
+
+    def my_custom_backend(gm, example_inputs):
+        return gm.forward
+
+    def f(...):
+        ...
+
+    f_opt = torch.compile(f, backend=my_custom_backend)
+
+    @torch.compile(backend=my_custom_backend)
+    def g(...):
+        ...
+
+See below for more examples.
+
+Registering Custom Backends
+---------------------------
+
+You can register your backend using the ``register_backend`` decorator, for example,
+
+.. code-block:: python
+
+    from torch._dynamo.optimizations import register_backend
+
+    @register_backend
+    def my_compiler(gm, example_inputs):
+        ...
+
+Besides the ``register_backend`` decorator, if your backend is in another python package, you could also register your
+backend through entry points of python package, which provides a way for a package to register a plugin for another one.
+
+.. hint::
+
+    You can learn more about ``entry_points`` in the
+    `python packaging documentation <https://setuptools.pypa.io/en/latest/userguide/entry_point.html>`__.
+
+To register your backend through ``entry_points``, you could add your backend function to the ``torch_dynamo_backends`` entry point group in the
+``setup.py`` file of your package like:
+
+.. code-block:: python
+
+    ...
+    setup(
+        ...
+        'torch_dynamo_backends': [
+            'my_compiler = your_module.submodule:my_compiler',
+        ]
+        ...
+    )
+
+Please replace the ``my_compiler`` before ``=`` to the name of your backend's name and replace the part after ``=`` to
+the module and function name of your backend function.
+The entry point will be added to your python environment after the installation of the package.
+When you call ``torch.compile(model, backend="my_compiler")``, PyTorch would first search the backend named ``my_compiler``
+that has been registered with ``register_backend``. If not found, it will continue to search in all backends registered
+via ``entry_points``.
+
+Registration serves two purposes:
+
+* You can pass a string containing your backend function's name to ``torch.compile`` instead of the function itself,
+  for example, ``torch.compile(model, backend="my_compiler")``.
+* It is required for use with the `minifier <https://pytorch.org/docs/master/dynamo/troubleshooting.html>`__. Any generated
+  code from the minifier must call your code that registers your backend function, typically through an ``import`` statement.
+
+Custom Backends after AOTAutograd
+---------------------------------
+
+It is possible to define custom backends that are called by AOTAutograd rather than TorchDynamo.
+This is useful for 2 main reasons:
+
+* Users can define backends that support model training, as AOTAutograd can generate the backward graph for compilation.
+* AOTAutograd produces FX graphs consisting of `canonical Aten ops <https://pytorch.org/docs/master/ir.html#canonical-aten-ir>`__. As a result,
+  custom backends only need to support the canonical Aten opset, which is a significantly smaller opset than the entire torch/Aten opset.
+
+Wrap your backend with
+``torch._dynamo.optimizations.training.aot_autograd`` and use ``torch.compile`` with the ``backend`` kwarg as before.
+Backend functions wrapped by ``aot_autograd`` should have the same contract as before.
+
+Backend functions are passed to ``aot_autograd`` through the ``fw_compiler`` (forward compiler)
+or ``bw_compiler`` (backward compiler) kwargs. If ``bw_compiler`` is not specified, the backward compile function
+defaults to the forward compile function.
+
+One caveat is that AOTAutograd requires compiled functions returned by backends to be "boxed". This can be done by wrapping
+the compiled function with ``functorch.compile.make_boxed_func``.
+
+For example,
+
+.. code-block:: python
+
+    from torch._dynamo.optimizations.training import aot_autograd
+    from functorch.compile import make_boxed_func
+
+    def my_compiler(gm, example_inputs):
+        return make_boxed_func(gm.forward)
+
+    my_backend = aot_autograd(fw_compiler=my_compiler)  # bw_compiler=my_compiler
+
+    model_opt = torch.compile(model, backend=my_backend)
+
+Examples
+--------
+
 Debugging Backend
------------------
+^^^^^^^^^^^^^^^^^
 
 If you want to better understand what is going on during a
 compilation, you can create a custom compiler, which is referred to as
@@ -16,12 +140,11 @@ For example:
 
    from typing import List
    import torch
-   import torch._dynamo as dynamo
    def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
        print("my_compiler() called with FX graph:")
        gm.graph.print_tabular()
        return gm.forward  # return a python callable
-   @dynamo.optimize(my_compiler)
+   @torch.compile(backend=my_compiler)
    def fn(x, y):
        a = torch.cos(x)
        b = torch.sin(y)
@@ -46,8 +169,12 @@ This works for ``torch.nn.Module`` as well as shown below:
 
 .. code-block:: python
 
+   from typing import List
    import torch
-   import torch._dynamo as dynamo
+   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+       print("my_compiler() called with FX graph:")
+       gm.graph.print_tabular()
+       return gm.forward  # return a python callable
    class MockModule(torch.nn.Module):
        def __init__(self):
            super().__init__()
@@ -55,7 +182,7 @@ This works for ``torch.nn.Module`` as well as shown below:
        def forward(self, x):
            return self.relu(torch.cos(x))
    mod = MockModule()
-   optimized_mod = dynamo.optimize(my_compiler)(mod)
+   optimized_mod = torch.compile(mod, backend=my_compiler)
    optimized_mod(torch.randn(10))
 
 Let’s take a look at one more example with control flow:
@@ -64,12 +191,11 @@ Let’s take a look at one more example with control flow:
 
    from typing import List
    import torch
-   import torch._dynamo as dynamo
    def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
        print("my_compiler() called with FX graph:")
        gm.graph.print_tabular()
        return gm.forward  # return a python callable
-   @dynamo.optimize(my_compiler)
+   @torch.compile(backend=my_compiler)
    def toy_example(a, b):
        x = a / (torch.abs(a) + 1)
        if b.sum() < 0:
@@ -115,7 +241,7 @@ The order of the last two graphs is nondeterministic depending
 on which one is encountered first by the just-in-time compiler.
 
 Speedy Backend
---------------
+^^^^^^^^^^^^^^
 
 Integrating a custom backend that offers superior performance is also
 easy and we’ll integrate a real one
@@ -124,34 +250,40 @@ with `optimize_for_inference <https://pytorch.org/docs/stable/generated/torch.ji
 .. code-block:: python
 
    def optimize_for_inference_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
-       scripted = torch.jit.trace(gm, example_inputs)
+       scripted = torch.jit.script(gm)
        return torch.jit.optimize_for_inference(scripted)
 
 And then you should be able to optimize any existing code with:
 
 .. code-block:: python
 
-   @dynamo.optimize(optimize_for_inference_compiler)
+   @torch.compile(backend=optimize_for_inference_compiler)
    def code_to_accelerate():
        ...
 
 Composable Backends
--------------------
+^^^^^^^^^^^^^^^^^^^
 
 TorchDynamo includes many backends, which can be found in
 `backends.py <https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/backends.py>`__
-or ``torchdynamo.list_backends()``. You can combine these backends
+or ``torch._dynamo.list_backends()``. You can combine these backends
 together with the following code:
 
 .. code-block:: python
 
    from torch._dynamo.optimizations import BACKENDS
-   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
-       trt_compiled = BACKENDS["tensorrt"](gm, example_inputs)
-       if trt_compiled is not None:
-           return trt_compiled
-       # first backend failed, try something else...
-       cudagraphs_compiled = BACKENDS["cudagraphs"](gm, example_inputs)
-       if cudagraphs_compiled is not None:
-           return cudagraphs_compiled
-       return gm.forward
+    def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+        try:
+            trt_compiled = BACKENDS["tensorrt"](gm, example_inputs)
+            if trt_compiled is not None:
+                return trt_compiled
+        except Exception:
+            pass
+        # first backend failed, try something else...
+        try:
+            inductor_compiled = BACKENDS["inductor"](gm, example_inputs)
+            if inductor_compiled is not None:
+                return inductor_compiled
+        except Exception:
+            pass
+        return gm.forward
diff --git a/docs/source/dynamo/get-started.rst b/docs/source/dynamo/get-started.rst
index e8d25db664f7..a5927044ef09 100644
--- a/docs/source/dynamo/get-started.rst
+++ b/docs/source/dynamo/get-started.rst
@@ -76,7 +76,7 @@ hub.
 
 And that is not the only available backend, you can run in a REPL
 ``dynamo.list_backends()`` to see all the available backends. Try out the
-``aot_cudagraphs`` or ``nvfuser`` next as inspiration.
+``cudagraphs`` or ``nvfuser`` next as inspiration.
 
 Let’s do something a bit more interesting now, our community frequently
 uses pretrained models from
@@ -125,36 +125,23 @@ which should work with any model you throw at it.
 Existing Backends
 ~~~~~~~~~~~~~~~~~
 
-TorchDynamo has a growing list of backends, which can be found in
-`backends.py <https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/backends.py>`__
-or ``torchdynamo.list_backends()`` each of which with its optional dependencies.
+TorchDynamo has a growing list of backends, which can be found in the
+`backends <https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/backends/>`__ folder
+or ``torch._dynamo.list_backends()`` each of which with its optional dependencies.
 
 Some of the most commonly used backends include:
 
-* **Debugging backends**:
-  * ``dynamo.optimize("eager")`` - Uses PyTorch
-  to run the extracted GraphModule. This is quite useful in debugging
-  TorchDynamo issues.
-  * ``dynamo.optimize("aot_eager")`` - Uses
-  AotAutograd with no compiler, for example, just using PyTorch eager for the
-  AotAutograd’s extracted forward and backward graphs. This is useful for
-  debugging, and unlikely to give speedups.
-
-* **Training & inference backends**:
-  * ``dynamo.optimize("inductor")`` - Uses ``TorchInductor`` backend
-  with AotAutograd and cudagraphs by leveraging
-  codegened Triton kernels `Read
-  more <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__
-  * ``dynamo.optimize("nvfuser")`` - nvFuser with TorchScript. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
-  * ``dynamo.optimize("aot_nvfuser")`` - nvFuser with AotAutograd. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
-  * ``dynamo.optimize("aot_cudagraphs")`` - cudagraphs with AotAutograd. `Read more <https://github.com/pytorch/torchdynamo/pull/757>`__
-
-* **Inference-only backends**:
-  * ``dynamo.optimize("ofi")`` - Uses
-  Torchscript ``optimize_for_inference``. `Read
-  more <https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html>`__
-  * ``dynamo.optimize("fx2trt")`` - Uses Nvidia TensorRT for inference optimizations. `Read more <https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst>`__
-  * ``dynamo.optimize("onnxrt")`` - Uses ONNXRT for inference on CPU/GPU. `Read more <https://onnxruntime.ai/>`__ \* ``dynamo.optimize("ipex")`` - Uses IPEX for inference on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__
+**Training & inference backends**:
+  * ``dynamo.optimize("inductor")`` - Uses ``TorchInductor`` backend. `Read more <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__
+  * ``dynamo.optimize("aot_ts_nvfuser")`` - nvFuser with AotAutograd/TorchScript. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
+  * ``dynamo.optimize("nvprims_nvfuser")`` - nvFuser with PrimTorch. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
+  * ``dynamo.optimize("cudagraphs")`` - cudagraphs with AotAutograd. `Read more <https://github.com/pytorch/torchdynamo/pull/757>`__
+
+**Inference-only backends**:
+  * ``dynamo.optimize("onnxrt")`` - Uses ONNXRT for inference on CPU/GPU. `Read more <https://onnxruntime.ai/>`__
+  * ``dynamo.optimize("tensorrt")`` - Uses ONNXRT to run TensorRT for inference optimizations. `Read more <https://github.com/onnx/onnx-tensorrt>`__
+  * ``dynamo.optimize("ipex")`` - Uses IPEX for inference on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__
+  * ``dynamo.optimize("tvm")`` - Uses Apach TVM for inference optimizations. `Read more <https://tvm.apache.org/>`__
 
 Why do you need another way of optimizing PyTorch code?
 -------------------------------------------------------
diff --git a/docs/source/dynamo/troubleshooting.rst b/docs/source/dynamo/troubleshooting.rst
index 2867b1c3cf51..36cfb39a0a57 100644
--- a/docs/source/dynamo/troubleshooting.rst
+++ b/docs/source/dynamo/troubleshooting.rst
@@ -38,12 +38,20 @@ tools and their typical usage. For additional help see
      - set environment variable ``TORCHDYNAMO_REPRO_AFTER="dynamo"``
    * - Minifier for ``TorchInductor``
      - If the error is known to occur after `AOTAutograd`` find
-       smallest subgraph wich reproduces errors during TorchInductor lowering
+       smallest subgraph which reproduces errors during TorchInductor lowering
      - set environment variable ``TORCHDYNAMO_REPRO_AFTER="aot"``
-   * - Accuracy minifier
+   * - Dynamo accuracy minifier
      - Finds the smallest subgraph which reproduces an accuracy issue
-       between an eager model model and optimized model
-     - ``TORCHDYNAMO_REPRO_AFTER=<"aot"/"dynamo"> TORCHDYNAMO_REPRO_LEVEL=4``
+       between an eager model model and optimized model, when you
+       suspect the problem is in AOTAutograd
+     - ``TORCHDYNAMO_REPRO_AFTER="dynamo" TORCHDYNAMO_REPRO_LEVEL=4``
+   * - Inductor accuracy minifier
+     - Finds the smallest subgraph which reproduces an accuracy issue
+       between an eager model model and optimized model, when you
+       suspect the problem is in the backend (e.g., inductor).
+       If this doesn't work, try the Dynamo accuracy minifier
+       instead.
+     - ``TORCHDYNAMO_REPRO_AFTER="aot" TORCHDYNAMO_REPRO_LEVEL=4``
    * - ``torch._dynamo.explain``
      - Find graph breaks and display reasoning for them
      - ``torch._dynamo.explain(fn, *inputs)``
@@ -195,7 +203,7 @@ execute only the frame in which the error occurs to enable easier
 debugging. There are two tools available to enable this:
 
 - Setting the environment variable ``TORCHDYNAMO_DEBUG_FUNCTION`` to the desired function name will only run torchdynamo on functions with that name.
-- Enabling the record/replay tool (set ``torch._dynamo.config.replay_record_enabled = True``) which dumps anexecution record when an error is encountered. This record can then be replayed to run only the frame where an error occurred.
+- Enabling the record/replay tool (set ``torch._dynamo.config.replay_record_enabled = True``) which dumps an execution record when an error is encountered. This record can then be replayed to run only the frame where an error occurred.
 
 TorchInductor Errors
 --------------------
@@ -318,14 +326,12 @@ code:
    # GPU Hardware Info:
    # NVIDIA A100-SXM4-40GB : 8
 
-
    from torch.nn import *
+
    class Repro(torch.nn.Module):
        def __init__(self):
            super().__init__()
 
-
-
        def forward(self, add):
            _foobar = torch.ops.aten._foobar.default(add);  add = None
            return (_foobar,)
@@ -399,14 +405,12 @@ the following code in ``{torch._dynamo.config.base_dir}/repro.py``.
    from math import inf
    from torch._dynamo.debug_utils import run_fwd_maybe_bwd
 
-
    from torch.nn import *
+
    class Repro(torch.nn.Module):
        def __init__(self):
            super().__init__()
 
-
-
        def forward(self, add):
            relu = torch.relu(add);  add = None
            return (relu,)
@@ -558,7 +562,7 @@ that are encountered. Here is an example usage:
    explanation, out_guards, graphs, ops_per_graph = dynamo.explain(toy_example, torch.randn(10), torch.randn(10))
    print(explanation)
    """
-   Dynamo produced 3 graphs, with 2 graph break and 6 ops.
+   Dynamo produced 3 graphs, with 2 graph breaks and 6 ops.
     Break reasons:
    1. call_function BuiltinVariable(print) [ConstantVariable(str)] {}
       File "t2.py", line 16, in toy_example
diff --git a/docs/source/elastic/quickstart.rst b/docs/source/elastic/quickstart.rst
index 8ede30e18bed..dea0055432f0 100644
--- a/docs/source/elastic/quickstart.rst
+++ b/docs/source/elastic/quickstart.rst
@@ -7,11 +7,11 @@ To launch a **fault-tolerant** job, run the following on all nodes.
 
     torchrun
        --nnodes=NUM_NODES
-       --nproc_per_node=TRAINERS_PER_NODE
-       --max_restarts=NUM_ALLOWED_FAILURES
-       --rdzv_id=JOB_ID
-       --rdzv_backend=c10d
-       --rdzv_endpoint=HOST_NODE_ADDR
+       --nproc-per-node=TRAINERS_PER_NODE
+       --max-restarts=NUM_ALLOWED_FAILURES
+       --rdzv-id=JOB_ID
+       --rdzv-backend=c10d
+       --rdzv-endpoint=HOST_NODE_ADDR
        YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 
 
@@ -22,18 +22,18 @@ and at most ``MAX_SIZE`` nodes.
 
     torchrun
         --nnodes=MIN_SIZE:MAX_SIZE
-        --nproc_per_node=TRAINERS_PER_NODE
-        --max_restarts=NUM_ALLOWED_FAILURES_OR_MEMBERSHIP_CHANGES
-        --rdzv_id=JOB_ID
-        --rdzv_backend=c10d
-        --rdzv_endpoint=HOST_NODE_ADDR
+        --nproc-per-node=TRAINERS_PER_NODE
+        --max-restarts=NUM_ALLOWED_FAILURES_OR_MEMBERSHIP_CHANGES
+        --rdzv-id=JOB_ID
+        --rdzv-backend=c10d
+        --rdzv-endpoint=HOST_NODE_ADDR
         YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 
 .. note::
    TorchElastic models failures as membership changes. When a node fails,
    this is treated as a "scale down" event. When the failed node is replaced by
    the scheduler, it is a "scale up" event. Hence for both fault tolerant
-   and elastic jobs, ``--max_restarts`` is used to control the total number of
+   and elastic jobs, ``--max-restarts`` is used to control the total number of
    restarts before giving up, regardless of whether the restart was caused
    due to a failure or a scaling event.
 
@@ -47,8 +47,8 @@ ideally you should pick a node that has a high bandwidth.
 
 .. note::
    The ``--standalone`` option can be passed to launch a single node job with a
-   sidecar rendezvous backend. You don’t have to pass ``--rdzv_id``,
-   ``--rdzv_endpoint``, and ``--rdzv_backend`` when the ``--standalone`` option
+   sidecar rendezvous backend. You don’t have to pass ``--rdzv-id``,
+   ``--rdzv-endpoint``, and ``--rdzv-backend`` when the ``--standalone`` option
    is used.
 
 
diff --git a/docs/source/elastic/train_script.rst b/docs/source/elastic/train_script.rst
index 04225d79067a..cc99dc2da9f2 100644
--- a/docs/source/elastic/train_script.rst
+++ b/docs/source/elastic/train_script.rst
@@ -21,7 +21,7 @@ working with ``torchrun`` with these differences:
    (see `elastic launch <run.html>`_).
 
 4. ``use_env`` flag has been removed. If you were parsing local rank by parsing
-   the ``--local_rank`` option, you need to get the local rank from the
+   the ``--local-rank`` option, you need to get the local rank from the
    environment variable ``LOCAL_RANK`` (e.g. ``int(os.environ["LOCAL_RANK"])``).
 
 Below is an expository example of a training script that checkpoints on each
diff --git a/docs/source/func.api.rst b/docs/source/func.api.rst
index aabc955a519a..3e03382ffe48 100644
--- a/docs/source/func.api.rst
+++ b/docs/source/func.api.rst
@@ -16,6 +16,7 @@ Function Transforms
      grad_and_value
      vjp
      jvp
+     linearize
      jacrev
      jacfwd
      hessian
diff --git a/docs/source/index.rst b/docs/source/index.rst
index a8ce02630d56..59c363d23a01 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -81,6 +81,7 @@ Features described in this documentation are classified by release status:
    torch.autograd <autograd>
    torch.library <library>
    cuda
+   mps
    torch.backends <backends>
    torch.distributed <distributed>
    torch.distributed.algorithms.join <distributed.algorithms.join>
diff --git a/docs/source/ir.rst b/docs/source/ir.rst
index b935a18df2c8..d782dea88b96 100644
--- a/docs/source/ir.rst
+++ b/docs/source/ir.rst
@@ -1,14 +1,14 @@
 IRs
 ===============
 
-PyTorch 2.0 offers two set of IRs for backends to interface with: Canonical Aten IR and Prims IR.
+PyTorch 2.0 offers two set of IRs for backends to interface with: Core Aten IR and Prims IR.
 
-Canonical Aten IR
+Core Aten IR
 --------------------
 
-Canonical aten ops is the core subset of aten operators that can be used to compose other operators.
-Canonical aten IR is fully functional, and there is no `inplace` or `_out` variants in this opset.
-In contrast to Prims IR, canonical aten ops reuses the existing aten ops in "native_functions.yaml",
+Core aten ops is the core subset of aten operators that can be used to compose other operators.
+Core aten IR is fully functional, and there is no `inplace` or `_out` variants in this opset.
+In contrast to Prims IR, core aten ops reuses the existing aten ops in "native_functions.yaml",
 and it doesn't further decompose ops into explicit type promotion and broadcasting ops.
 This opset is designed to serve as the functional IR to interface with backends.
 
@@ -24,7 +24,7 @@ Prims IR
 -----------
 
 Prims IR is a set of primitive operators that can be used to compose other operators.
-Prims IR is a lower level opset than canonical aten IR, and it further decomposes ops into explicit
+Prims IR is a lower level opset than core aten IR, and it further decomposes ops into explicit
 type promotion and broadcasting ops: prims.convert_element_type and prims.broadcast_in_dim.
 This opset is designed to interface with compiler backends.
 
diff --git a/docs/source/jit.rst b/docs/source/jit.rst
index 4c92f7a0ac4d..fd084427b33b 100644
--- a/docs/source/jit.rst
+++ b/docs/source/jit.rst
@@ -161,7 +161,7 @@ Example (using a traced module):
 
     class MyScriptModule(torch.nn.Module):
         def __init__(self):
-            super(MyScriptModule, self).__init__()
+            super().__init__()
             self.means = torch.nn.Parameter(torch.tensor([103.939, 116.779, 123.68])
                                             .resize_(1, 3, 1, 1))
             self.resnet = torch.jit.trace(torchvision.models.resnet18(),
@@ -593,7 +593,7 @@ Q: How do I store attributes on a :class:`ScriptModule`?
 
         class Model(torch.nn.Module):
             def __init__(self):
-                super(Model, self).__init__()
+                super().__init__()
                 self.x = 2
 
             def forward(self):
@@ -672,7 +672,7 @@ The new usage looks like this:
 
     class Model(nn.Module):
         def __init__(self):
-            super(Model, self).__init__()
+            super().__init__()
             self.conv1 = nn.Conv2d(1, 20, 5)
             self.conv2 = nn.Conv2d(20, 20, 5)
 
@@ -779,7 +779,7 @@ Old API:
 
     class MyModule(torch.jit.ScriptModule):
         def __init__(self):
-            super(MyModule, self).__init__()
+            super().__init__()
             self.my_dict = torch.jit.Attribute({}, Dict[str, int])
             self.my_int = torch.jit.Attribute(20, int)
 
@@ -795,7 +795,7 @@ New API:
         my_dict: Dict[str, int]
 
         def __init__(self):
-            super(MyModule, self).__init__()
+            super().__init__()
             # This type cannot be inferred and must be specified
             self.my_dict = {}
 
@@ -820,7 +820,7 @@ Old API:
         __constants__ = ['my_constant']
 
         def __init__(self):
-            super(MyModule, self).__init__()
+            super().__init__()
             self.my_constant = 2
 
         def forward(self):
@@ -831,19 +831,14 @@ New API:
 
 ::
 
-    try:
-        from typing_extensions import Final
-    except:
-        # If you don't have `typing_extensions` installed, you can use a
-        # polyfill from `torch.jit`.
-        from torch.jit import Final
+    from typing import Final
 
     class MyModule(torch.nn.Module):
 
         my_constant: Final[int]
 
         def __init__(self):
-            super(MyModule, self).__init__()
+            super().__init__()
             self.my_constant = 2
 
         def forward(self):
diff --git a/docs/source/jit_language_reference.rst b/docs/source/jit_language_reference.rst
index 63c52314aa3a..b342a26ef9c9 100644
--- a/docs/source/jit_language_reference.rst
+++ b/docs/source/jit_language_reference.rst
@@ -205,7 +205,7 @@ Example (type annotations for Python 3):
 
     class EmptyDataStructures(torch.nn.Module):
         def __init__(self):
-            super(EmptyDataStructures, self).__init__()
+            super().__init__()
 
         def forward(self, x: torch.Tensor) -> Tuple[List[Tuple[int, float]], Dict[str, int]]:
             # This annotates the list to be a `List[Tuple[int, float]]`
@@ -249,7 +249,7 @@ Example (refining types on parameters and locals):
         z: Optional[int]
 
         def __init__(self, z):
-            super(M, self).__init__()
+            super().__init__()
             # If `z` is None, its type cannot be inferred, so it must
             # be specified (above)
             self.z = z
@@ -567,7 +567,7 @@ calling its ``forward`` method (e.g. ``self.resnet.forward(input)``).
 
     class MyModule(nn.Module):
         def __init__(self):
-            super(MyModule, self).__init__()
+            super().__init__()
             means = torch.tensor([103.939, 116.779, 123.68])
             self.means = torch.nn.Parameter(means.resize_(1, 3, 1, 1))
             resnet = torchvision.models.resnet18()
@@ -703,7 +703,7 @@ loop at compile time, with each member of the constant module list.
 
     class SubModule(torch.nn.Module):
         def __init__(self):
-            super(SubModule, self).__init__()
+            super().__init__()
             self.weight = nn.Parameter(torch.randn(2))
 
         def forward(self, input):
@@ -713,7 +713,7 @@ loop at compile time, with each member of the constant module list.
         __constants__ = ['mods']
 
         def __init__(self):
-            super(MyModule, self).__init__()
+            super().__init__()
             self.mods = torch.nn.ModuleList([SubModule() for i in range(10)])
 
         def forward(self, v):
@@ -853,7 +853,7 @@ value should be treated as a constant.
         a : torch.jit.Final[int]
 
         def __init__(self):
-            super(Foo, self).__init__()
+            super().__init__()
             self.a = 1 + 4
 
         def forward(self, input):
@@ -906,7 +906,7 @@ Example:
         some_dict: Dict[str, int]
 
         def __init__(self, a_dict):
-            super(Foo, self).__init__()
+            super().__init__()
             self.words = []
             self.some_dict = a_dict
 
diff --git a/docs/source/jit_language_reference_v2.rst b/docs/source/jit_language_reference_v2.rst
index 91114c6b0d30..ffa72f596fc5 100644
--- a/docs/source/jit_language_reference_v2.rst
+++ b/docs/source/jit_language_reference_v2.rst
@@ -209,7 +209,7 @@ such as ``Future[int]``. Structural types are composable with any ``TSType``.
 ::
 
     TSStructuralType ::=  TSTuple | TSNamedTuple | TSList | TSDict |
-                        TSOptional | TSUnion | TSFuture | TSRRef
+                        TSOptional | TSUnion | TSFuture | TSRRef | TSAwait
 
     TSTuple          ::= "Tuple" "[" (TSType ",")* TSType "]"
     TSNamedTuple     ::= "namedtuple" "(" (TSType ",")* TSType ")"
@@ -218,6 +218,7 @@ such as ``Future[int]``. Structural types are composable with any ``TSType``.
     TSUnion          ::= "Union" "[" (TSType ",")* TSType "]"
     TSFuture         ::= "Future" "[" TSType "]"
     TSRRef           ::= "RRef" "[" TSType "]"
+    TSAwait          ::= "Await" "[" TSType "]"
     TSDict           ::= "Dict" "[" KeyType "," TSType "]"
     KeyType          ::= "str" | "int" | "float" | "bool" | TensorType | "Any"
 
@@ -226,6 +227,7 @@ Where:
 * ``Tuple``, ``List``, ``Optional``, ``Union``, ``Future``, ``Dict`` represent Python type class names that are defined in the module ``typing``. To use these type names, you must import them from ``typing`` (e.g., ``from typing import Tuple``).
 * ``namedtuple`` represents the Python class ``collections.namedtuple`` or ``typing.NamedTuple``.
 * ``Future`` and ``RRef`` represent the Python classes ``torch.futures`` and ``torch.distributed.rpc``.
+* ``Await`` represent the Python class ``torch._awaits._Await``
 
 **Compared to Python**
 
@@ -828,8 +830,8 @@ TorchScript Type System Definition
     TSMetaType      ::= "Any"
     TSPrimitiveType ::= "int" | "float" | "double" | "complex" | "bool" | "str" | "None"
 
-    TSStructualType ::=  TSTuple | TSNamedTuple | TSList | TSDict |
-                         TSOptional | TSUnion | TSFuture | TSRRef
+    TSStructualType ::=  TSTuple | TSNamedTuple | TSList | TSDict | TSOptional |
+                         TSUnion | TSFuture | TSRRef | TSAwait
     TSTuple         ::= "Tuple" "[" (TSType ",")* TSType "]"
     TSNamedTuple    ::= "namedtuple" "(" (TSType ",")* TSType ")"
     TSList          ::= "List" "[" TSType "]"
@@ -837,6 +839,7 @@ TorchScript Type System Definition
     TSUnion         ::= "Union" "[" (TSType ",")* TSType "]"
     TSFuture        ::= "Future" "[" TSType "]"
     TSRRef          ::= "RRef" "[" TSType "]"
+    TSAwait         ::= "Await" "[" TSType "]"
     TSDict          ::= "Dict" "[" KeyType "," TSType "]"
     KeyType         ::= "str" | "int" | "float" | "bool" | TensorType | "Any"
 
@@ -1434,16 +1437,15 @@ For loops on lists: for loops over a ``nn.ModuleList`` will unroll the body of t
 
     class SubModule(torch.nn.Module):
         def __init__(self):
-            super(SubModule, self).__init__()
+            super().__init__()
             self.weight = nn.Parameter(torch.randn(2))
 
         def forward(self, input):
             return self.weight + input
 
     class MyModule(torch.nn.Module):
-
         def __init__(self):
-            super(MyModule, self).init()
+            super().__init__()
             self.mods = torch.nn.ModuleList([SubModule() for i in range(10)])
 
         def forward(self, v):
diff --git a/docs/source/masked.rst b/docs/source/masked.rst
index 60b9af7ebccc..139c267ac6ff 100644
--- a/docs/source/masked.rst
+++ b/docs/source/masked.rst
@@ -220,7 +220,7 @@ Reductions
 ----------
 
 The following reductions are available (with autograd support). For more information, the
-`Overview <https://pytorch.org/tutorials/prototype/maskedtensor_overview.html/>`_ tutorial
+`Overview <https://pytorch.org/tutorials/prototype/maskedtensor_overview.html>`_ tutorial
 details some examples of reductions, while the
 `Advanced semantics <https://pytorch.org/tutorials/prototype/maskedtensor_advanced_semantics.html>`_ tutorial
 has some further in-depth discussions about how we decided on certain reduction semantics.
diff --git a/docs/source/mps.rst b/docs/source/mps.rst
new file mode 100644
index 000000000000..91662aa9d3dc
--- /dev/null
+++ b/docs/source/mps.rst
@@ -0,0 +1,18 @@
+torch.mps
+===================================
+.. automodule:: torch.mps
+.. currentmodule:: torch.mps
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+
+    synchronize
+    get_rng_state
+    set_rng_state
+    manual_seed
+    seed
+    empty_cache
+    set_per_process_memory_fraction
+    current_allocated_memory
+    driver_allocated_memory
\ No newline at end of file
diff --git a/docs/source/nn.functional.rst b/docs/source/nn.functional.rst
index d74b1044d8ad..9eb3ddce4236 100644
--- a/docs/source/nn.functional.rst
+++ b/docs/source/nn.functional.rst
@@ -49,6 +49,15 @@ Pooling functions
     fractional_max_pool2d
     fractional_max_pool3d
 
+Attention Mechanisms
+-------------------------------
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+
+    scaled_dot_product_attention
+
 Non-linear activation functions
 -------------------------------
 
diff --git a/docs/source/nn.rst b/docs/source/nn.rst
index ced3edd28e66..cce6e1ab98e8 100644
--- a/docs/source/nn.rst
+++ b/docs/source/nn.rst
@@ -442,6 +442,8 @@ Utility functions in other modules
     nn.utils.rnn.pad_packed_sequence
     nn.utils.rnn.pad_sequence
     nn.utils.rnn.pack_sequence
+    nn.utils.rnn.unpack_sequence
+    nn.utils.rnn.unpad_sequence
 
 .. autosummary::
     :toctree: generated
diff --git a/docs/source/notes/autograd.rst b/docs/source/notes/autograd.rst
index d561f63f1cbc..ca6ddf6970ab 100644
--- a/docs/source/notes/autograd.rst
+++ b/docs/source/notes/autograd.rst
@@ -414,9 +414,10 @@ limit definition of a derivative and generalizes it to operate on
 complex numbers. Consider a function :math:`f: ℂ → ℂ`,
 
     .. math::
-        `f(z=x+yj) = u(x, y) + v(x, y)j`
+        f(z=x+yj) = u(x, y) + v(x, y)j
 
-where :math:`u` and :math:`v` are two variable real valued functions.
+where :math:`u` and :math:`v` are two variable real valued functions
+and :math:`j` is the imaginary unit.
 
 Using the derivative definition, we can write:
 
@@ -888,4 +889,33 @@ registered to Node. As the forward is computed, hooks are registered to grad_fn
 to the inputs and outputs of the module. Because a module may take multiple inputs and return
 multiple outputs, a dummy custom autograd Function is first applied to the inputs of the module
 before forward and the outputs of the module before the output of forward is returned to ensure
-that those tensors share a single grad_fn, which we can then attach our hooks to.
+that those Tensors share a single grad_fn, which we can then attach our hooks to.
+
+Behavior of Tensor hooks when Tensor is modified in-place
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Usually hooks registered to a Tensor receive the gradient of the outputs with respect to that
+Tensor, where the value of the Tensor is taken to be its value at the time backward is computed.
+
+However, if you register hooks to a Tensor, and then modify that Tensor in-place, hooks
+registered before in-place modification similarly receive gradients of the outputs with
+respect to the Tensor, but the value of the Tensor is taken to be its value before
+in-place modification.
+
+If you prefer the behavior in the former case,
+you should register them to the Tensor after all in-place modifications to it have been made.
+For example:
+
+.. code::
+
+    t = torch.tensor(1., requires_grad=True).sin()
+    t.cos_()
+    t.register_hook(fn)
+    t.backward()
+
+Furthemore, it can be helpful to know that under the hood,
+when hooks are registered to a Tensor, they actually become permanently bound to the grad_fn
+of that Tensor, so if that Tensor is then modified in-place,
+even though the Tensor now has a new grad_fn, hooks registered before it was
+modified in-place will continue to be associated with the old grad_fn, e.g. they will
+fire when that Tensor's old grad_fn is reached in the graph by the autograd engine.
diff --git a/docs/source/notes/extending.rst b/docs/source/notes/extending.rst
index 7a78b07472f5..4eca7972efe9 100644
--- a/docs/source/notes/extending.rst
+++ b/docs/source/notes/extending.rst
@@ -415,7 +415,7 @@ This is how a ``Linear`` module can be implemented::
 
     class Linear(nn.Module):
         def __init__(self, input_features, output_features, bias=True):
-            super(Linear, self).__init__()
+            super().__init__()
             self.input_features = input_features
             self.output_features = output_features
 
@@ -566,8 +566,8 @@ of doing this is to define a decorator::
   import functools
   def implements(torch_function):
       """Register a torch function override for ScalarTensor"""
-      @functools.wraps(torch_function)
       def decorator(func):
+          functools.update_wrapper(func, torch_function)
           HANDLED_FUNCTIONS[torch_function] = func
           return func
       return decorator
diff --git a/docs/source/notes/serialization.rst b/docs/source/notes/serialization.rst
index 3693ef409138..c3f75dbbe8b8 100644
--- a/docs/source/notes/serialization.rst
+++ b/docs/source/notes/serialization.rst
@@ -153,7 +153,7 @@ can use this pattern:
     # A module with two linear layers
     >>> class MyModule(torch.nn.Module):
           def __init__(self):
-            super(MyModule, self).__init__()
+            super().__init__()
             self.l0 = torch.nn.Linear(4, 2)
             self.l1 = torch.nn.Linear(2, 1)
 
@@ -218,7 +218,7 @@ this:
     # A module with control flow
     >>> class ControlFlowModule(torch.nn.Module):
           def __init__(self):
-            super(ControlFlowModule, self).__init__()
+            super().__init__()
             self.l0 = torch.nn.Linear(4, 2)
             self.l1 = torch.nn.Linear(2, 1)
 
diff --git a/docs/source/optim.rst b/docs/source/optim.rst
index f270aa8fa8ab..1dc2948e52b7 100644
--- a/docs/source/optim.rst
+++ b/docs/source/optim.rst
@@ -129,6 +129,49 @@ Algorithms
     Rprop
     SGD
 
+Many of our algorithms have various implementations optimized for performance,
+readability and/or generality, so we attempt to default to the generally fastest
+implementation for the current device if no particular implementation has been
+specified by the user.
+
+We have 3 major categories of implementations: for-loop, foreach (multi-tensor), and
+fused. The most straightforward implementations are for-loops over the parameters with
+big chunks of computation. For-looping is usually slower than our foreach
+implementations, which combine parameters into a multi-tensor and run the big chunks
+of computation all at once, thereby saving many sequential kernel calls. A few of our
+optimizers have even faster fused implementations, which fuse the big chunks of
+computation into one kernel. We can think of foreach implementations as fusing
+horizontally and fused implementations as fusing vertically on top of that.
+
+In general, the performance ordering of the 3 implementations is fused > foreach > for-loop.
+So when applicable, we default to foreach over for-loop. Applicable means the foreach
+implementation is available, the user has not specified any implementation-specific kwargs
+(e.g., fused, foreach, differentiable), and all tensors are native and on CUDA. Note that
+while fused should be even faster than foreach, the implementations are newer and we would
+like to give them more bake-in time before flipping the switch everywhere. You are welcome
+to try them out though!
+
+Below is a table showing the available and default implementations of each algorithm:
+
+.. csv-table::
+    :header: "Algorithm", "Default", "Has foreach?", "Has fused?"
+    :widths: 25, 25, 25, 25
+    :delim: ;
+
+    :class:`Adadelta`;foreach;yes;no
+    :class:`Adagrad`;foreach;yes;no
+    :class:`Adam`;foreach;yes;yes
+    :class:`AdamW`;foreach;yes;yes
+    :class:`SparseAdam`;for-loop;no;no
+    :class:`Adamax`;foreach;yes;no
+    :class:`ASGD`;foreach;yes;no
+    :class:`LBFGS`;for-loop;no;no
+    :class:`NAdam`;foreach;yes;no
+    :class:`RAdam`;foreach;yes;no
+    :class:`RMSprop`;foreach;yes;no
+    :class:`Rprop`;foreach;yes;no
+    :class:`SGD`;foreach;yes;no
+
 How to adjust learning rate
 ---------------------------
 
diff --git a/docs/source/quantization-support.rst b/docs/source/quantization-support.rst
index e974df655af7..0e99517f3abf 100644
--- a/docs/source/quantization-support.rst
+++ b/docs/source/quantization-support.rst
@@ -1,12 +1,12 @@
 Quantization API Reference
 -------------------------------
 
-torch.quantization
+torch.ao.quantization
 ~~~~~~~~~~~~~~~~~~~~~
 
 This module contains Eager mode quantization APIs.
 
-.. currentmodule:: torch.quantization
+.. currentmodule:: torch.ao.quantization
 
 Top level APIs
 ^^^^^^^^^^^^^^
@@ -49,12 +49,12 @@ Utility functions
     propagate_qconfig_
     default_eval_fn
 
-torch.quantization.quantize_fx
+torch.ao.quantization.quantize_fx
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 This module contains FX graph mode quantization APIs (prototype).
 
-.. currentmodule:: torch.quantization.quantize_fx
+.. currentmodule:: torch.ao.quantization.quantize_fx
 
 .. autosummary::
     :toctree: generated
@@ -178,13 +178,13 @@ regular full-precision tensor.
     topk
 
 
-torch.quantization.observer
+torch.ao.quantization.observer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 This module contains observers which are used to collect statistics about
 the values observed during calibration (PTQ) or training (QAT).
 
-.. currentmodule:: torch.quantization.observer
+.. currentmodule:: torch.ao.quantization.observer
 
 .. autosummary::
     :toctree: generated
@@ -211,13 +211,13 @@ the values observed during calibration (PTQ) or training (QAT).
     default_dynamic_quant_observer
     default_float_qparams_observer
 
-torch.quantization.fake_quantize
+torch.ao.quantization.fake_quantize
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 This module implements modules which are used to perform fake quantization
 during QAT.
 
-.. currentmodule:: torch.quantization.fake_quantize
+.. currentmodule:: torch.ao.quantization.fake_quantize
 
 .. autosummary::
     :toctree: generated
@@ -240,13 +240,13 @@ during QAT.
     disable_observer
     enable_observer
 
-torch.quantization.qconfig
+torch.ao.quantization.qconfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 This module defines `QConfig` objects which are used
 to configure quantization settings for individual ops.
 
-.. currentmodule:: torch.quantization.qconfig
+.. currentmodule:: torch.ao.quantization.qconfig
 
 .. autosummary::
     :toctree: generated
@@ -481,14 +481,14 @@ This module implements the quantized versions of the functional layers such as
     upsample_bilinear
     upsample_nearest
 
-torch.nn.quantizable
-~~~~~~~~~~~~~~~~~~~~
+torch.ao.nn.quantizable
+~~~~~~~~~~~~~~~~~~~~~~~
 
 This module implements the quantizable versions of some of the nn layers.
 These modules can be used in conjunction with the custom module mechanism,
 by providing the ``custom_module_config`` argument to both prepare and convert.
 
-.. currentmodule:: torch.nn.quantizable
+.. currentmodule:: torch.ao.nn.quantizable
 
 .. autosummary::
     :toctree: generated
@@ -585,21 +585,30 @@ the `custom operator mechanism <https://pytorch.org/tutorials/advanced/torch_scr
 
 
 .. These modules are missing docs. Adding them here only for tracking
-.. automodule:: torch.nn.intrinsic
-.. automodule:: torch.nn.intrinsic.modules
-.. automodule:: torch.nn.quantizable
-.. automodule:: torch.nn.quantizable.modules
-.. automodule:: torch.nn.quantized
+.. automodule:: torch.ao.nn.quantizable.modules
    :noindex:
-
 .. automodule:: torch.ao.nn.quantized.reference
    :noindex:
 .. automodule:: torch.ao.nn.quantized.reference.modules
    :noindex:
 
-.. py:module:: torch.nn.intrinsic.qat
-.. py:module:: torch.nn.intrinsic.qat.modules
-.. py:module:: torch.nn.intrinsic.quantized
-.. py:module:: torch.nn.intrinsic.quantized.modules
-.. py:module:: torch.nn.intrinsic.quantized.dynamic
-.. py:module:: torch.nn.intrinsic.quantized.dynamic.modules
+.. automodule:: torch.nn.quantizable
+.. automodule:: torch.nn.qat.dynamic.modules
+.. automodule:: torch.nn.qat.modules
+.. automodule:: torch.nn.qat
+.. automodule:: torch.nn.intrinsic.qat.modules
+.. automodule:: torch.nn.quantized.dynamic
+.. automodule:: torch.nn.intrinsic
+.. automodule:: torch.nn.intrinsic.quantized.modules
+.. automodule:: torch.quantization.fx
+.. automodule:: torch.nn.intrinsic.quantized.dynamic
+.. automodule:: torch.nn.qat.dynamic
+.. automodule:: torch.nn.intrinsic.qat
+.. automodule:: torch.nn.quantized.modules
+.. automodule:: torch.nn.intrinsic.quantized
+.. automodule:: torch.nn.quantizable.modules
+.. automodule:: torch.nn.quantized
+.. automodule:: torch.nn.intrinsic.quantized.dynamic.modules
+.. automodule:: torch.nn.quantized.dynamic.modules
+.. automodule:: torch.quantization
+.. automodule:: torch.nn.intrinsic.modules
diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst
index f1b88a433fa4..4697985f75c5 100644
--- a/docs/source/quantization.rst
+++ b/docs/source/quantization.rst
@@ -3,8 +3,8 @@
 Quantization
 ============
 
-.. automodule:: torch.quantization
-.. automodule:: torch.quantization.fx
+.. automodule:: torch.ao.quantization
+.. automodule:: torch.ao.quantization.fx
 
 .. warning ::
      Quantization is in beta and subject to change.
@@ -185,7 +185,7 @@ PTDQ API Example::
   # create a model instance
   model_fp32 = M()
   # create a quantized model instance
-  model_int8 = torch.quantization.quantize_dynamic(
+  model_int8 = torch.ao.quantization.quantize_dynamic(
       model_fp32,  # the original model
       {torch.nn.Linear},  # a set of layers to dynamically quantize
       dtype=torch.qint8)  # the target dtype for quantized weights
@@ -232,11 +232,11 @@ PTSQ API Example::
       def __init__(self):
           super().__init__()
           # QuantStub converts tensors from floating point to quantized
-          self.quant = torch.quantization.QuantStub()
+          self.quant = torch.ao.quantization.QuantStub()
           self.conv = torch.nn.Conv2d(1, 1, 1)
           self.relu = torch.nn.ReLU()
           # DeQuantStub converts tensors from quantized to floating point
-          self.dequant = torch.quantization.DeQuantStub()
+          self.dequant = torch.ao.quantization.DeQuantStub()
 
       def forward(self, x):
           # manually specify where tensors will be converted from floating
@@ -262,17 +262,17 @@ PTSQ API Example::
   # can be specified here.
   # Note: the old 'fbgemm' is still available but 'x86' is the recommended default
   # for server inference.
-  # model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm')
-  model_fp32.qconfig = torch.quantization.get_default_qconfig('x86')
+  # model_fp32.qconfig = torch.ao.quantization.get_default_qconfig('fbgemm')
+  model_fp32.qconfig = torch.ao.quantization.get_default_qconfig('x86')
 
   # Fuse the activations to preceding layers, where applicable.
   # This needs to be done manually depending on the model architecture.
   # Common fusions include `conv + relu` and `conv + batchnorm + relu`
-  model_fp32_fused = torch.quantization.fuse_modules(model_fp32, [['conv', 'relu']])
+  model_fp32_fused = torch.ao.quantization.fuse_modules(model_fp32, [['conv', 'relu']])
 
   # Prepare the model for static quantization. This inserts observers in
   # the model that will observe activation tensors during calibration.
-  model_fp32_prepared = torch.quantization.prepare(model_fp32_fused)
+  model_fp32_prepared = torch.ao.quantization.prepare(model_fp32_fused)
 
   # calibrate the prepared model to determine quantization parameters for activations
   # in a real world setting, the calibration would be done with a representative dataset
@@ -283,7 +283,7 @@ PTSQ API Example::
   # quantizes the weights, computes and stores the scale and bias value to be
   # used with each activation tensor, and replaces key operators with quantized
   # implementations.
-  model_int8 = torch.quantization.convert(model_fp32_prepared)
+  model_int8 = torch.ao.quantization.convert(model_fp32_prepared)
 
   # run the model, relevant calculations will happen in int8
   res = model_int8(input_fp32)
@@ -333,12 +333,12 @@ QAT API Example::
       def __init__(self):
           super().__init__()
           # QuantStub converts tensors from floating point to quantized
-          self.quant = torch.quantization.QuantStub()
+          self.quant = torch.ao.quantization.QuantStub()
           self.conv = torch.nn.Conv2d(1, 1, 1)
           self.bn = torch.nn.BatchNorm2d(1)
           self.relu = torch.nn.ReLU()
           # DeQuantStub converts tensors from quantized to floating point
-          self.dequant = torch.quantization.DeQuantStub()
+          self.dequant = torch.ao.quantization.DeQuantStub()
 
       def forward(self, x):
           x = self.quant(x)
@@ -361,18 +361,18 @@ QAT API Example::
   # can be specified here.
   # Note: the old 'fbgemm' is still available but 'x86' is the recommended default
   # for server inference.
-  # model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm')
-  model_fp32.qconfig = torch.quantization.get_default_qat_qconfig('x86')
+  # model_fp32.qconfig = torch.ao.quantization.get_default_qconfig('fbgemm')
+  model_fp32.qconfig = torch.ao.quantization.get_default_qat_qconfig('x86')
 
   # fuse the activations to preceding layers, where applicable
   # this needs to be done manually depending on the model architecture
-  model_fp32_fused = torch.quantization.fuse_modules(model_fp32,
+  model_fp32_fused = torch.ao.quantization.fuse_modules(model_fp32,
       [['conv', 'bn', 'relu']])
 
   # Prepare the model for QAT. This inserts observers and fake_quants in
   # the model needs to be set to train for QAT logic to work
   # the model that will observe weight and activation tensors during calibration.
-  model_fp32_prepared = torch.quantization.prepare_qat(model_fp32_fused.train())
+  model_fp32_prepared = torch.ao.quantization.prepare_qat(model_fp32_fused.train())
 
   # run the training loop (not shown)
   training_loop(model_fp32_prepared)
@@ -382,7 +382,7 @@ QAT API Example::
   # used with each activation tensor, fuses modules where appropriate,
   # and replaces key operators with quantized implementations.
   model_fp32_prepared.eval()
-  model_int8 = torch.quantization.convert(model_fp32_prepared)
+  model_int8 = torch.ao.quantization.convert(model_fp32_prepared)
 
   # run the model, relevant calculations will happen in int8
   res = model_int8(input_fp32)
@@ -438,7 +438,7 @@ FXPTQ API Example::
     get_default_qat_qconfig_mapping,
     QConfigMapping,
   )
-  import torch.quantization.quantize_fx as quantize_fx
+  import torch.ao.quantization.quantize_fx as quantize_fx
   import copy
 
   model_fp = UserModel()
@@ -450,7 +450,7 @@ FXPTQ API Example::
   # we need to deepcopy if we still want to keep model_fp unchanged after quantization since quantization apis change the input model
   model_to_quantize = copy.deepcopy(model_fp)
   model_to_quantize.eval()
-  qconfig_mapping = QConfigMapping().set_global(torch.quantization.default_dynamic_qconfig)
+  qconfig_mapping = QConfigMapping().set_global(torch.ao.quantization.default_dynamic_qconfig)
   # a tuple of one or more example inputs are needed to trace the model
   example_inputs = (input_fp32)
   # prepare
@@ -772,18 +772,18 @@ Default settings for x86::
 
     # set the qconfig for PTQ
     # Note: the old 'fbgemm' is still available but 'x86' is the recommended default on x86 CPUs
-    qconfig = torch.quantization.get_default_qconfig('x86')
+    qconfig = torch.ao.quantization.get_default_qconfig('x86')
     # or, set the qconfig for QAT
-    qconfig = torch.quantization.get_default_qat_qconfig('x86')
+    qconfig = torch.ao.quantization.get_default_qat_qconfig('x86')
     # set the qengine to control weight packing
     torch.backends.quantized.engine = 'x86'
 
 Default settings for qnnpack::
 
     # set the qconfig for PTQ
-    qconfig = torch.quantization.get_default_qconfig('qnnpack')
+    qconfig = torch.ao.quantization.get_default_qconfig('qnnpack')
     # or, set the qconfig for QAT
-    qconfig = torch.quantization.get_default_qat_qconfig('qnnpack')
+    qconfig = torch.ao.quantization.get_default_qat_qconfig('qnnpack')
     # set the qengine to control weight packing
     torch.backends.quantized.engine = 'qnnpack'
 
@@ -907,7 +907,7 @@ be done at a future time.
 Custom API Example::
 
   import torch
-  import torch.nn.quantized as nnq
+  import torch.ao.nn.quantized as nnq
   from torch.ao.quantization import QConfigMapping
   import torch.ao.quantization.quantize_fx
 
@@ -1039,14 +1039,14 @@ If you see an error similar to::
   RuntimeError: Could not run 'quantized::some_operator' with arguments from the 'CPU' backend...
 
 This means that you are trying to pass a non-quantized Tensor to a quantized
-kernel. A common workaround is to use ``torch.quantization.QuantStub`` to
+kernel. A common workaround is to use ``torch.ao.quantization.QuantStub`` to
 quantize the tensor.  This needs to be done manually in Eager mode quantization.
 An e2e example::
 
   class M(torch.nn.Module):
       def __init__(self):
           super().__init__()
-          self.quant = torch.quantization.QuantStub()
+          self.quant = torch.ao.quantization.QuantStub()
           self.conv = torch.nn.Conv2d(1, 1, 1)
 
       def forward(self, x):
@@ -1064,18 +1064,18 @@ If you see an error similar to::
   RuntimeError: Could not run 'aten::thnn_conv2d_forward' with arguments from the 'QuantizedCPU' backend.
 
 This means that you are trying to pass a quantized Tensor to a non-quantized
-kernel. A common workaround is to use ``torch.quantization.DeQuantStub`` to
+kernel. A common workaround is to use ``torch.ao.quantization.DeQuantStub`` to
 dequantize the tensor.  This needs to be done manually in Eager mode quantization.
 An e2e example::
 
   class M(torch.nn.Module):
       def __init__(self):
           super().__init__()
-          self.quant = torch.quantization.QuantStub()
+          self.quant = torch.ao.quantization.QuantStub()
           self.conv1 = torch.nn.Conv2d(1, 1, 1)
           # this module will not be quantized (see `qconfig = None` logic below)
           self.conv2 = torch.nn.Conv2d(1, 1, 1)
-          self.dequant = torch.quantization.DeQuantStub()
+          self.dequant = torch.ao.quantization.DeQuantStub()
 
       def forward(self, x):
           # during the convert step, this will be replaced with a
@@ -1166,26 +1166,14 @@ Please take a look at `Limitations of Symbolic Tracing <https://docs-preview.pyt
 .. py:module:: torch.ao.nn.quantizable
 .. py:module:: torch.ao.nn.quantizable.modules
 .. py:module:: torch.ao.nn.quantized
+.. py:module:: torch.ao.nn.quantized.reference
+.. py:module:: torch.ao.nn.quantized.reference.modules
 .. py:module:: torch.ao.nn.sparse
 .. py:module:: torch.ao.nn.sparse.quantized
 .. py:module:: torch.ao.nn.sparse.quantized.dynamic
 .. py:module:: torch.ao.ns
 .. py:module:: torch.ao.ns.fx
-.. py:module:: torch.ao.quantization
-.. py:module:: torch.ao.quantization.fx
 .. py:module:: torch.ao.quantization.backend_config
 .. py:module:: torch.ao.pruning
 .. py:module:: torch.ao.pruning.scheduler
 .. py:module:: torch.ao.pruning.sparsifier
-
-.. py:module:: torch.nn.qat
-.. py:module:: torch.nn.qat.modules
-.. py:module:: torch.nn.qat.dynamic
-.. py:module:: torch.nn.qat.dynamic.modules
-.. py:module:: torch.nn.quantized
-.. py:module:: torch.nn.quantized.modules
-.. py:module:: torch.nn.quantized.dynamic
-.. py:module:: torch.nn.quantized.dynamic.modules
-
-.. py:module:: torch.ao.nn.quantized.reference
-.. py:module:: torch.ao.nn.quantized.reference.modules
diff --git a/docs/source/scripts/build_opsets.py b/docs/source/scripts/build_opsets.py
index 68a9f2f98216..2ab913fe85a0 100644
--- a/docs/source/scripts/build_opsets.py
+++ b/docs/source/scripts/build_opsets.py
@@ -21,7 +21,7 @@ def get_aten():
 
     aten_ops = OrderedDict()
     for function in native_functions:
-        if "canonical" in function.tags:
+        if "core" in function.tags:
             op_name = str(function.func.name)
             aten_ops[op_name] = function
 
diff --git a/docs/source/sparse.rst b/docs/source/sparse.rst
index 377368e09c78..c273f74b8c0b 100644
--- a/docs/source/sparse.rst
+++ b/docs/source/sparse.rst
@@ -38,7 +38,7 @@ performance optimization.
 
 Like many other performance optimization sparse storage formats are not
 always advantageous. When trying sparse formats for your use case
-you might find your execution time to decrease rather than increase.
+you might find your execution time to increase rather than decrease.
 
 Please feel encouraged to open a GitHub issue if you analytically
 expected to see a stark increase in performance but measured a
diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst
index 2700e613ad4c..4f6de6f62d53 100644
--- a/docs/source/tensors.rst
+++ b/docs/source/tensors.rst
@@ -650,7 +650,6 @@ Tensor class reference
     Tensor.svd
     Tensor.swapaxes
     Tensor.swapdims
-    Tensor.symeig
     Tensor.t
     Tensor.t_
     Tensor.tensor_split
diff --git a/docs/source/torch.rst b/docs/source/torch.rst
index bbec47f69404..a4f0a2c721e1 100644
--- a/docs/source/torch.rst
+++ b/docs/source/torch.rst
@@ -589,7 +589,6 @@ BLAS and LAPACK Operations
     svd
     svd_lowrank
     pca_lowrank
-    symeig
     lobpcg
     trapz
     trapezoid
diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp
index 0af23dc400b0..332e6a935c5b 100644
--- a/functorch/csrc/dim/dim.cpp
+++ b/functorch/csrc/dim/dim.cpp
@@ -166,7 +166,7 @@ struct Dim : public py::base<Dim> {
         return batchtensor_;
     }
 private:
-    int64_t size_;
+    int64_t size_{-1};
     at::Tensor range_;
     at::Tensor batchtensor_;
 };
@@ -1472,15 +1472,17 @@ py::object create_dimlist(py::object name, py::handle size) {
 struct PyInstDecoder {
     PyInstDecoder(PyCodeObject* code_object, int lasti)
     : code_object_(code_object), code_(_PyCode_CODE(code_object)), offset_(lasti / sizeof(_Py_CODEUNIT))  {}
+    // On Windows, _PyOpcode_Caches and _PyOpcode_Deopt are private symbols
+    // See https://github.com/pytorch/pytorch/issues/93854
     void next() {
-    #if IS_PYTHON_3_11_PLUS
+    #if IS_PYTHON_3_11_PLUS && !defined(_WIN32)
         offset_ += _PyOpcode_Caches[opcode()];
     #endif
         offset_ += 1;
     }
     int opcode() {
         auto r = _Py_OPCODE(code_[offset_]);
-    #if IS_PYTHON_3_11_PLUS
+    #if IS_PYTHON_3_11_PLUS && !defined(_WIN32)
         r = _PyOpcode_Deopt[r];
     #endif
         return r;
@@ -2874,7 +2876,7 @@ struct WrappedOperator : public py::base<WrappedOperator> {
         name = orig.attr("__name__");
         doc = orig.attr("__doc__");
         dim_name = std::move(dim_name_);
-        if (!py::is_none(doc) && dim_name.size() > 0) {
+        if (!py::is_none(doc) && !dim_name.empty()) {
             doc = py::unicode_from_format("%S\nArgument '%s' can be either an integer or a torchdim.Dim object.\n", doc.ptr(), dim_name.c_str());
         }
         method_def.ml_name = py::is_none(name) ? "" : PyUnicode_AsUTF8(name.ptr());
diff --git a/functorch/csrc/dim/minpybind.h b/functorch/csrc/dim/minpybind.h
index dd0edfe5d5a3..45e836987d42 100644
--- a/functorch/csrc/dim/minpybind.h
+++ b/functorch/csrc/dim/minpybind.h
@@ -59,8 +59,7 @@ struct vector_args;
 struct handle {
     handle(PyObject* ptr)
     : ptr_(ptr) {}
-    handle()
-    : ptr_(nullptr) {}
+    handle() = default;
 
 
     PyObject* ptr() const {
@@ -90,7 +89,7 @@ struct handle {
     }
 
 protected:
-    PyObject * ptr_;
+    PyObject* ptr_ = nullptr;
 };
 
 
@@ -107,26 +106,26 @@ struct hdl : public handle {
     }
     hdl(T* ptr)
     : hdl((PyObject*) ptr) {}
-    hdl(obj<T> o)
+    hdl(const obj<T>& o)
     : hdl(o.ptr()) {}
 private:
     hdl(handle h) : handle(h) {}
 };
 
 struct object : public handle {
-    object() {}
+    object() = default;
     object(const object& other)
     : handle(other.ptr_) {
         Py_XINCREF(ptr_);
     }
-    object(object&& other)
+    object(object&& other) noexcept
     : handle(other.ptr_) {
         other.ptr_ = nullptr;
     }
     object& operator=(const object& other) {
         return *this = object(other);
     }
-    object& operator=(object&& other) {
+    object& operator=(object&& other) noexcept {
         PyObject* tmp = ptr_;
         ptr_ = other.ptr_;
         other.ptr_ = tmp;
@@ -160,19 +159,19 @@ struct object : public handle {
 
 template<typename T>
 struct obj : public object {
-    obj() {}
+    obj() = default;
     obj(const obj& other)
     : object(other.ptr_) {
         Py_XINCREF(ptr_);
     }
-    obj(obj&& other)
+    obj(obj&& other) noexcept
     : object(other.ptr_) {
         other.ptr_ = nullptr;
     }
     obj& operator=(const obj& other) {
         return *this = obj(other);
     }
-    obj& operator=(obj&& other) {
+    obj& operator=(obj&& other) noexcept {
         PyObject* tmp = ptr_;
         ptr_ = other.ptr_;
         other.ptr_ = tmp;
@@ -503,7 +502,7 @@ struct dict_view : public handle {
         return PyDict_Check(h.ptr());
     }
     bool next(Py_ssize_t* pos, py::handle* key, py::handle* value) {
-        PyObject *k, *v;
+        PyObject *k = nullptr, *v = nullptr;
         auto r = PyDict_Next(ptr(), pos, &k, &v);
         *key = k;
         *value = v;
diff --git a/functorch/dim/batch_tensor.py b/functorch/dim/batch_tensor.py
index e909afe1e21e..f8b036488814 100644
--- a/functorch/dim/batch_tensor.py
+++ b/functorch/dim/batch_tensor.py
@@ -15,7 +15,7 @@
 def _enable_layers(dims):
     global _enabled
     assert not _enabled
-    input = list(sorted((d._level, d.size) for d in dims if not isinstance(d, int)))
+    input = sorted((d._level, d.size) for d in dims if not isinstance(d, int))
     n = len(input)
     try:
         _vmap_add_layers(input)
diff --git a/functorch/examples/compilation/fuse_module.py b/functorch/examples/compilation/fuse_module.py
index ec091eb24435..3d2f830485b9 100644
--- a/functorch/examples/compilation/fuse_module.py
+++ b/functorch/examples/compilation/fuse_module.py
@@ -23,7 +23,7 @@ def run(mod, input):
 
 class Foo(nn.Module):
     def __init__(self):
-        super(Foo, self).__init__()
+        super().__init__()
         self.param = nn.Parameter(torch.randn(1))
         self.register_buffer("buf", torch.randn(1))
 
diff --git a/functorch/examples/dp_cifar10/cifar10_opacus.py b/functorch/examples/dp_cifar10/cifar10_opacus.py
index bcd0aae8b9db..22cd3ed92022 100644
--- a/functorch/examples/dp_cifar10/cifar10_opacus.py
+++ b/functorch/examples/dp_cifar10/cifar10_opacus.py
@@ -449,6 +449,7 @@ def parse_args():
     )
 
     parser.add_argument(
+        "--clip-per-layer",
         "--clip_per_layer",
         action="store_true",
         default=False,
diff --git a/functorch/examples/dp_cifar10/cifar10_transforms.py b/functorch/examples/dp_cifar10/cifar10_transforms.py
index 825f0a75a19f..600931d50ec9 100644
--- a/functorch/examples/dp_cifar10/cifar10_transforms.py
+++ b/functorch/examples/dp_cifar10/cifar10_transforms.py
@@ -472,6 +472,7 @@ def parse_args():
     )
 
     parser.add_argument(
+        "--clip-per-layer",
         "--clip_per_layer",
         action="store_true",
         default=False,
diff --git a/functorch/examples/maml_omniglot/maml-omniglot-higher.py b/functorch/examples/maml_omniglot/maml-omniglot-higher.py
index 8f6e017f212a..17a882dd3370 100755
--- a/functorch/examples/maml_omniglot/maml-omniglot-higher.py
+++ b/functorch/examples/maml_omniglot/maml-omniglot-higher.py
@@ -46,15 +46,15 @@
 
 def main():
     argparser = argparse.ArgumentParser()
-    argparser.add_argument('--n_way', type=int, help='n way', default=5)
+    argparser.add_argument('--n-way', '--n_way', type=int, help='n way', default=5)
     argparser.add_argument(
-        '--k_spt', type=int, help='k shot for support set', default=5)
+        '--k-spt', '--k_spt', type=int, help='k shot for support set', default=5)
     argparser.add_argument(
-        '--k_qry', type=int, help='k shot for query set', default=15)
+        '--k-qry', '--k_qry', type=int, help='k shot for query set', default=15)
     argparser.add_argument(
         '--device', type=str, help='device', default='cuda')
     argparser.add_argument(
-        '--task_num',
+        '--task-num', '--task_num',
         type=int,
         help='meta batch size, namely task num',
         default=32)
diff --git a/functorch/examples/maml_omniglot/maml-omniglot-ptonly.py b/functorch/examples/maml_omniglot/maml-omniglot-ptonly.py
index 594237ee7d6e..3040df681ab1 100755
--- a/functorch/examples/maml_omniglot/maml-omniglot-ptonly.py
+++ b/functorch/examples/maml_omniglot/maml-omniglot-ptonly.py
@@ -46,15 +46,15 @@
 
 def main():
     argparser = argparse.ArgumentParser()
-    argparser.add_argument('--n_way', type=int, help='n way', default=5)
+    argparser.add_argument('--n-way', '--n_way', type=int, help='n way', default=5)
     argparser.add_argument(
-        '--k_spt', type=int, help='k shot for support set', default=5)
+        '--k-spt', '--k_spt', type=int, help='k shot for support set', default=5)
     argparser.add_argument(
-        '--k_qry', type=int, help='k shot for query set', default=15)
+        '--k-qry', '--k_qry', type=int, help='k shot for query set', default=15)
     argparser.add_argument(
         '--device', type=str, help='device', default='cuda')
     argparser.add_argument(
-        '--task_num',
+        '--task-num', '--task_num',
         type=int,
         help='meta batch size, namely task num',
         default=32)
diff --git a/functorch/examples/maml_omniglot/maml-omniglot-transforms.py b/functorch/examples/maml_omniglot/maml-omniglot-transforms.py
index efbb9da45d2d..890fcf38f9db 100755
--- a/functorch/examples/maml_omniglot/maml-omniglot-transforms.py
+++ b/functorch/examples/maml_omniglot/maml-omniglot-transforms.py
@@ -47,15 +47,15 @@
 
 def main():
     argparser = argparse.ArgumentParser()
-    argparser.add_argument('--n_way', type=int, help='n way', default=5)
+    argparser.add_argument('--n-way', '--n_way', type=int, help='n way', default=5)
     argparser.add_argument(
-        '--k_spt', type=int, help='k shot for support set', default=5)
+        '--k-spt', '--k_spt', type=int, help='k shot for support set', default=5)
     argparser.add_argument(
-        '--k_qry', type=int, help='k shot for query set', default=15)
+        '--k-qry', '--k_qry', type=int, help='k shot for query set', default=15)
     argparser.add_argument(
         '--device', type=str, help='device', default='cuda')
     argparser.add_argument(
-        '--task_num',
+        '--task-num', '--task_num',
         type=int,
         help='meta batch size, namely task num',
         default=32)
diff --git a/functorch/examples/maml_omniglot/support/omniglot_loaders.py b/functorch/examples/maml_omniglot/support/omniglot_loaders.py
index 24d47dcf9980..cac99b2dfbb2 100644
--- a/functorch/examples/maml_omniglot/support/omniglot_loaders.py
+++ b/functorch/examples/maml_omniglot/support/omniglot_loaders.py
@@ -82,7 +82,7 @@ def _check_exists(self):
             os.path.exists(os.path.join(self.root, self.processed_folder, "images_background"))
 
     def download(self):
-        from six.moves import urllib
+        import urllib
         import zipfile
 
         if self._check_exists():
diff --git a/functorch/examples/maml_regression/evjang_transforms_module.py b/functorch/examples/maml_regression/evjang_transforms_module.py
index d1483550a29e..cc333ba46077 100644
--- a/functorch/examples/maml_regression/evjang_transforms_module.py
+++ b/functorch/examples/maml_regression/evjang_transforms_module.py
@@ -15,7 +15,7 @@
 
 class ThreeLayerNet(nn.Module):
     def __init__(self):
-        super(ThreeLayerNet, self).__init__()
+        super().__init__()
         self.fc1 = nn.Linear(1, 40)
         self.relu1 = nn.ReLU()
         self.fc2 = nn.Linear(40, 40)
diff --git a/functorch/experimental/_cond.py b/functorch/experimental/_cond.py
index 8a75300e435a..f0cfe5b0e2f8 100644
--- a/functorch/experimental/_cond.py
+++ b/functorch/experimental/_cond.py
@@ -101,16 +101,18 @@ def trace_cond(proxy_mode, func_overload, pred, true_fn, false_fn, operands):
     return track_tensor_tree(out, out_proxy, constant=None, tracer=proxy_mode.tracer)
 
 
+@cond.py_impl(DispatchKey.CUDA)
 @cond.py_impl(DispatchKey.CPU)
 def cond_dense(pred, true_fn, false_fn, operands):
     mode = _get_current_dispatch_mode()
-    assert (mode is None), "Mode should never be enabled for CPU key"
+    assert (mode is None), "Mode should never be enabled for CPU/CUDA key"
     if pred:
         return true_fn(*operands)
     else:
         return false_fn(*operands)
 
 
+@cond.py_impl(DispatchKey.AutogradCUDA)
 @cond.py_impl(DispatchKey.AutogradCPU)
 def cond_autograd(pred, true_fn, false_fn, *operands):
     # TODO: support autograd
diff --git a/functorch/experimental/_map.py b/functorch/experimental/_map.py
index 568b2de3884c..8016f5589c99 100644
--- a/functorch/experimental/_map.py
+++ b/functorch/experimental/_map.py
@@ -3,6 +3,7 @@
 import torch
 import torch.utils._pytree as pytree
 from torch._C import DispatchKey, DispatchKeySet, ExcludeDispatchKeyGuard
+from torch._functorch.eager_transforms import _unwrap_all_tensors_from_functional, _wrap_all_tensors_to_functional, functionalize
 from torch._ops import PyOperator
 from torch._subclasses.fake_tensor import FakeTensorMode
 from torch.fx.experimental.proxy_tensor import (
@@ -17,6 +18,7 @@
     _pop_mode_temporarily,
 )
 from torch.utils._pytree import tree_flatten
+from ._cond import _has_potential_branch_input_alias, _has_potential_branch_input_mutation, UnsupportedAliasMutationException
 
 
 map = PyOperator("map")
@@ -57,13 +59,15 @@ def trace_map(proxy_mode, func_overload, f, xs, *args):
     return track_tensor_tree(out, out_proxy, constant=None, tracer=proxy_mode.tracer)
 
 
+@map.py_impl(DispatchKey.CUDA)
 @map.py_impl(DispatchKey.CPU)
 def map_cpu(f, xs, *args):
     mode = _get_current_dispatch_mode()
-    assert (mode is None), "Mode should never be enabled for CPU key"
+    assert (mode is None), "Mode should never be enabled for CPU/CUDA key"
     return torch.stack([f(x, *args) for x in xs])
 
 
+@map.py_impl(DispatchKey.AutogradCUDA)
 @map.py_impl(DispatchKey.AutogradCPU)
 def map_autograd(f, xs, *args):
     # TODO: support autograd
@@ -95,6 +99,48 @@ def map_python_dispatcher(*args):
     _ = ExcludeDispatchKeyGuard(DispatchKeySet(DispatchKey.PythonDispatcher))
     return map(*args)
 
+@map.py_impl(torch._C._functorch.TransformType.Functionalize)
+def map_functionalize(interpreter, f, xs, *args):
+    """
+    Functionalization implementation for torch.map. Currently:
+      1. We don't allow any input mutation inside the map function
+      2. Our check for above condition is not exhaustive
+    """
+    reapply_views = interpreter.functionalize_add_back_views()
+    mode = 'mutations_and_views' if reapply_views else 'mutations'
+    # At this point, we will see functionalized tensors, so need to unwrap them first
+    unwrapped_xs = _unwrap_all_tensors_from_functional(xs, reapply_views=reapply_views)
+    unwrapped_args = _unwrap_all_tensors_from_functional(args, reapply_views=reapply_views)
+
+    functional_map_fn = functionalize(f, remove=mode)
+
+    with interpreter.lower():
+        fake_tensor_mode = FakeTensorMode()
+        with fake_tensor_mode as ft_mode:
+
+            # Returns fake inputs for a single map function call
+            def get_fake_inputs(unwrapped_xs, unwrapped_args):
+                fake_xs = ft_mode.fake_tensor_converter(ft_mode, unwrapped_xs)
+                fake_args = pytree.tree_map_only(
+                    torch.Tensor,
+                    lambda x: ft_mode.fake_tensor_converter(ft_mode, x),
+                    unwrapped_args,
+                )
+                return (fake_xs[0],) + fake_args
+
+            fake_inputs = get_fake_inputs(unwrapped_xs, unwrapped_args)
+            if _has_potential_branch_input_mutation(functional_map_fn, fake_inputs):
+                raise UnsupportedAliasMutationException(
+                    "torch.map is mutating the input!"
+                )
+
+            if _has_potential_branch_input_alias(functional_map_fn, fake_inputs):
+                raise UnsupportedAliasMutationException(
+                    "torch.map is aliasing the input!"
+                )
+
+        map_return = map(functional_map_fn, unwrapped_xs, *unwrapped_args)
+        return _wrap_all_tensors_to_functional(map_return, level=interpreter.level())
 
 # TODO(voz) Make this automatic for keys, this is very ugly atm
 map.fallthrough(DispatchKey.PythonTLSSnapshot)
diff --git a/functorch/notebooks/_src/plot_ensembling.py b/functorch/notebooks/_src/plot_ensembling.py
index 94cd1151ad7b..7bce421ddfd6 100644
--- a/functorch/notebooks/_src/plot_ensembling.py
+++ b/functorch/notebooks/_src/plot_ensembling.py
@@ -24,7 +24,7 @@
 # Here's a simple CNN
 class SimpleCNN(nn.Module):
     def __init__(self):
-        super(SimpleCNN, self).__init__()
+        super().__init__()
         self.conv1 = nn.Conv2d(1, 32, 3, 1)
         self.conv2 = nn.Conv2d(32, 64, 3, 1)
         self.fc1 = nn.Linear(9216, 128)
diff --git a/functorch/notebooks/_src/plot_per_sample_gradients.py b/functorch/notebooks/_src/plot_per_sample_gradients.py
index 0feb2b80d947..668e089f821c 100644
--- a/functorch/notebooks/_src/plot_per_sample_gradients.py
+++ b/functorch/notebooks/_src/plot_per_sample_gradients.py
@@ -17,7 +17,7 @@
 # Here's a simple CNN
 class SimpleCNN(nn.Module):
     def __init__(self):
-        super(SimpleCNN, self).__init__()
+        super().__init__()
         self.conv1 = nn.Conv2d(1, 32, 3, 1)
         self.conv2 = nn.Conv2d(32, 64, 3, 1)
         self.fc1 = nn.Linear(9216, 128)
diff --git a/functorch/notebooks/ensembling.ipynb b/functorch/notebooks/ensembling.ipynb
index 41565aa07b62..1ecc8738b0b5 100644
--- a/functorch/notebooks/ensembling.ipynb
+++ b/functorch/notebooks/ensembling.ipynb
@@ -49,7 +49,7 @@
         "# Here's a simple MLP\n",
         "class SimpleMLP(nn.Module):\n",
         "    def __init__(self):\n",
-        "        super(SimpleMLP, self).__init__()\n",
+        "        super().__init__()\n",
         "        self.fc1 = nn.Linear(784, 128)\n",
         "        self.fc2 = nn.Linear(128, 128)\n",
         "        self.fc3 = nn.Linear(128, 10)\n",
diff --git a/functorch/notebooks/neural_tangent_kernels.ipynb b/functorch/notebooks/neural_tangent_kernels.ipynb
index 11bd8413380a..9d041be90926 100644
--- a/functorch/notebooks/neural_tangent_kernels.ipynb
+++ b/functorch/notebooks/neural_tangent_kernels.ipynb
@@ -38,7 +38,7 @@
     "\n",
     "class CNN(nn.Module):\n",
     "    def __init__(self):\n",
-    "        super(CNN, self).__init__()\n",
+    "        super().__init__()\n",
     "        self.conv1 = nn.Conv2d(3, 32, (3, 3))\n",
     "        self.conv2 = nn.Conv2d(32, 32, (3, 3))\n",
     "        self.conv3 = nn.Conv2d(32, 32, (3, 3))\n",
diff --git a/functorch/notebooks/per_sample_grads.ipynb b/functorch/notebooks/per_sample_grads.ipynb
index b0bcf2670c04..5f7ad23880b5 100644
--- a/functorch/notebooks/per_sample_grads.ipynb
+++ b/functorch/notebooks/per_sample_grads.ipynb
@@ -44,7 +44,7 @@
         "\n",
         "class SimpleCNN(nn.Module):\n",
         "    def __init__(self):\n",
-        "        super(SimpleCNN, self).__init__()\n",
+        "        super().__init__()\n",
         "        self.conv1 = nn.Conv2d(1, 32, 3, 1)\n",
         "        self.conv2 = nn.Conv2d(32, 64, 3, 1)\n",
         "        self.fc1 = nn.Linear(9216, 128)\n",
diff --git a/modules/detectron/CMakeLists.txt b/modules/detectron/CMakeLists.txt
index 46276114c5e0..7c9a2d7ff4f4 100644
--- a/modules/detectron/CMakeLists.txt
+++ b/modules/detectron/CMakeLists.txt
@@ -3,10 +3,6 @@ file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
 file(GLOB_RECURSE Detectron_HIP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.hip)
 
 if(BUILD_CAFFE2_OPS)
-  if(USE_OPENMP AND OPENMP_FOUND)
-    Set(OpenMP_link ${OpenMP_CXX_LIBRARIES})
-  endif()
-
   # Note(ilijar): Since Detectron ops currently have no
   # CPU implementation, we only build GPU ops for now.
   if(USE_CUDA)
@@ -15,8 +11,11 @@ if(BUILD_CAFFE2_OPS)
         ${Detectron_CPU_SRCS}
         ${Detectron_GPU_SRCS})
 
-    torch_set_target_props(caffe2_detectron_ops_gpu)
-    target_link_libraries(caffe2_detectron_ops_gpu PRIVATE torch ${OpenMP_link})
+    target_link_libraries(caffe2_detectron_ops_gpu PRIVATE torch)
+    if(USE_OPENMP)
+      target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::openmp)
+    endif()
+
     if(USE_MKLDNN)
       target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::mkldnn)
     endif()
@@ -31,7 +30,6 @@ if(BUILD_CAFFE2_OPS)
         caffe2_detectron_ops_hip SHARED
         ${Detectron_CPU_SRCS}
         ${Detectron_HIP_SRCS})
-    torch_set_target_props(caffe2_detectron_ops_hip)
     target_compile_options(caffe2_detectron_ops_hip PRIVATE ${HIP_CXX_FLAGS})
     if(USE_MKLDNN)
       target_link_libraries(caffe2_detectron_ops_hip PRIVATE caffe2::mkldnn)
@@ -44,8 +42,10 @@ if(BUILD_CAFFE2_OPS)
       set_target_properties(caffe2_detectron_ops PROPERTIES
         VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
     endif()
-    torch_set_target_props(caffe2_detectron_ops)
-    target_link_libraries(caffe2_detectron_ops PRIVATE torch ${OpenMP_link})
+    target_link_libraries(caffe2_detectron_ops PRIVATE torch)
+    if(USE_OPENMP)
+      target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::openmp)
+    endif()
     if(USE_MKLDNN)
       target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::mkldnn)
     endif()
diff --git a/mypy-nofollow.ini b/mypy-nofollow.ini
index e2cc39bd9754..7051df24a02b 100644
--- a/mypy-nofollow.ini
+++ b/mypy-nofollow.ini
@@ -19,8 +19,8 @@ files =
     test/test_utils.py
 
 # Minimum version supported - variable annotations were introduced
-# in Python 3.7
-python_version = 3.7
+# in Python 3.8
+python_version = 3.8
 
 [mypy-sympy]
 ignore_missing_imports = True
diff --git a/mypy-strict.ini b/mypy-strict.ini
index 81c66d5239eb..e4d9d7a143e6 100644
--- a/mypy-strict.ini
+++ b/mypy-strict.ini
@@ -6,7 +6,7 @@
 # files.
 
 [mypy]
-python_version = 3.7
+python_version = 3.8
 plugins = mypy_plugins/check_mypy_version.py
 
 cache_dir = .mypy_cache/strict
@@ -63,6 +63,12 @@ follow_imports = skip
 [mypy-numpy]
 ignore_missing_imports = True
 
+[mypy-sympy]
+ignore_missing_imports = True
+
+[mypy-sympy.*]
+ignore_missing_imports = True
+
 [mypy-mypy.*]
 ignore_missing_imports = True
 
diff --git a/mypy.ini b/mypy.ini
index 4afe7dcf1255..380f432c4805 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -43,8 +43,8 @@ files =
 exclude = torch/include/|torch/csrc/|torch/distributed/elastic/agent/server/api.py|torch/testing/_internal|torch/distributed/fsdp/fully_sharded_data_parallel.py
 
 # Minimum version supported - variable annotations were introduced
-# in Python 3.7
-python_version = 3.7
+# in Python 3.8
+python_version = 3.8
 
 
 #
@@ -124,9 +124,6 @@ warn_unused_ignores = False
 [mypy-tools.generate_torch_version]
 warn_unused_ignores = False
 
-[mypy-tools.stats.s3_stat_parser]
-warn_unused_ignores = False
-
 #
 # Adding type annotations to caffe2 is probably not worth the effort
 # only work on this if you have a specific reason for it, otherwise
@@ -203,6 +200,12 @@ ignore_missing_imports = True
 [mypy-numpy.*]
 ignore_missing_imports = True
 
+[mypy-sympy]
+ignore_missing_imports = True
+
+[mypy-sympy.*]
+ignore_missing_imports = True
+
 [mypy-hypothesis.*]
 ignore_missing_imports = True
 
diff --git a/mypy_plugins/check_mypy_version.py b/mypy_plugins/check_mypy_version.py
index 0110232e566d..7ef19ef22b0b 100644
--- a/mypy_plugins/check_mypy_version.py
+++ b/mypy_plugins/check_mypy_version.py
@@ -9,7 +9,7 @@ def get_correct_mypy_version():
     # there's probably a more elegant way to do this
     match, = re.finditer(
         r'mypy==(\d+(?:\.\d+)*)',
-        (Path(__file__).parent.parent / '.circleci' / 'docker' / 'requirements-ci.txt').read_text(),
+        (Path(__file__).parent.parent / '.ci' / 'docker' / 'requirements-ci.txt').read_text(),
     )
     version, = match.groups()
     return version
diff --git a/pyproject.toml b/pyproject.toml
index 522adbf5d389..338bdc9bcf63 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,8 +8,7 @@ requires = [
     "pyyaml",
     "setuptools",
     "cmake",
-    "typing_extensions",
-    "six",
+    "typing-extensions",
     "requests",
 ]
 # Use legacy backend to import local packages in setup.py
diff --git a/pytest.ini b/pytest.ini
index 2732aa9a1ff4..67a691290076 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -11,3 +11,5 @@ addopts =
 testpaths =
     test
 junit_logging_reruns = all
+filterwarnings =
+    ignore:Module already imported so cannot be rewritten.*hypothesis:pytest.PytestAssertRewriteWarning
diff --git a/related_commits b/related_commits
index 38965a4c25ba..ff003950a104 100644
--- a/related_commits
+++ b/related_commits
@@ -1,10 +1,10 @@
-ubuntu|pytorch|apex|master|14db5c27acbe7c122794e11e94c205d0e4c8462e|https://github.com/ROCmSoftwarePlatform/apex
-centos|pytorch|apex|master|14db5c27acbe7c122794e11e94c205d0e4c8462e|https://github.com/ROCmSoftwarePlatform/apex
-ubuntu|pytorch|torchvision|main|c206a471617e41ba04a0f3cc5d926a4b7c391afe|https://github.com/pytorch/vision
-centos|pytorch|torchvision|main|c206a471617e41ba04a0f3cc5d926a4b7c391afe|https://github.com/pytorch/vision
-ubuntu|pytorch|torchtext|main|3e5f77e5c2c35b35f46cdc4bf7b7e82b7c30a0b0|https://github.com/pytorch/text
-centos|pytorch|torchtext|main|3e5f77e5c2c35b35f46cdc4bf7b7e82b7c30a0b0|https://github.com/pytorch/text
-ubuntu|pytorch|torchdata|main|2ca1fa6483e58c6428319393e1aab4c26f576bec|https://github.com/pytorch/data
-centos|pytorch|torchdata|main|2ca1fa6483e58c6428319393e1aab4c26f576bec|https://github.com/pytorch/data
-ubuntu|pytorch|torchaudio|main|41b883145a81b98254794c1504600dd610fc81f6|https://github.com/pytorch/audio
-centos|pytorch|torchaudio|main|41b883145a81b98254794c1504600dd610fc81f6|https://github.com/pytorch/audio
+ubuntu|pytorch|apex|master|03d70c41ac392bde3824841e5137cde3825adec1|https://github.com/ROCmSoftwarePlatform/apex
+centos|pytorch|apex|master|03d70c41ac392bde3824841e5137cde3825adec1|https://github.com/ROCmSoftwarePlatform/apex
+ubuntu|pytorch|torchvision|main|caf12f840037193fb3d1e6c60168c37dfa218f43|https://github.com/pytorch/vision
+centos|pytorch|torchvision|main|caf12f840037193fb3d1e6c60168c37dfa218f43|https://github.com/pytorch/vision
+ubuntu|pytorch|torchtext|main|38399ea985a0ba535c8228884e11ab66e76a6d46|https://github.com/pytorch/text
+centos|pytorch|torchtext|main|38399ea985a0ba535c8228884e11ab66e76a6d46|https://github.com/pytorch/text
+ubuntu|pytorch|torchdata|main|e74ed435b8eae3293bfe6b51cdf09859eedcd2cc|https://github.com/pytorch/data
+centos|pytorch|torchdata|main|e74ed435b8eae3293bfe6b51cdf09859eedcd2cc|https://github.com/pytorch/data
+ubuntu|pytorch|torchaudio|main|1ed380953f733fc7973616a06e9576ad79fe6fb8|https://github.com/pytorch/audio
+centos|pytorch|torchaudio|main|1ed380953f733fc7973616a06e9576ad79fe6fb8|https://github.com/pytorch/audio
diff --git a/requirements-flake8.txt b/requirements-flake8.txt
index 08c432ad4eb3..6824da33c759 100644
--- a/requirements-flake8.txt
+++ b/requirements-flake8.txt
@@ -2,6 +2,7 @@ flake8==3.8.2
 flake8-bugbear==20.1.4
 flake8-comprehensions==3.3.0
 flake8-executable==2.0.4
+flake8-logging-format==0.9.0
 git+https://github.com/malfet/flake8-coding.git
 flake8-pyi==20.5.0
 mccabe==0.6.1
diff --git a/requirements.txt b/requirements.txt
index 8b05458d8cf7..3f4997a3efe9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,9 +7,8 @@ psutil
 pyyaml
 requests
 setuptools
-six
 types-dataclasses
-typing_extensions
+typing-extensions
 sympy
 filelock
 networkx
diff --git a/scripts/build_tegra_x1.sh b/scripts/build_tegra_x1.sh
index 49c559ae3894..b1121ff1d716 100755
--- a/scripts/build_tegra_x1.sh
+++ b/scripts/build_tegra_x1.sh
@@ -41,10 +41,6 @@ sudo apt-get install \
 # the one provided by apt-get is quite old so we install it via pip
 sudo pip install hypothesis
 
-# Install the six module, which includes Python 2 and 3 compatibility utilities,
-# and is required for Caffe2
-sudo pip install six
-
 # Now, actually build the android target.
 echo "Building caffe2"
 cd $BUILD_ROOT
diff --git a/scripts/build_tizen.sh b/scripts/build_tizen.sh
index c9d26ced319a..33fc65c50c9e 100755
--- a/scripts/build_tizen.sh
+++ b/scripts/build_tizen.sh
@@ -95,10 +95,6 @@ sudo zypper install \
 # Obtain python hypothesis, which Caffe2 uses for unit testing. Note that
 # the one provided by zypper is quite old so we install it via pip
 sudo pip install hypothesis
-
-# Install the six module, which includes Python 2 and 3 compatibility utilities,
-# and is required for Caffe2
-sudo pip install six
 }
 
 caffe2_full_build(){
diff --git a/scripts/model_zoo/update-caffe2-models.py b/scripts/model_zoo/update-caffe2-models.py
index e9a5f28cb880..7f9c8e9815db 100755
--- a/scripts/model_zoo/update-caffe2-models.py
+++ b/scripts/model_zoo/update-caffe2-models.py
@@ -6,7 +6,7 @@
 import tarfile
 import tempfile
 
-from six.moves.urllib.request import urlretrieve
+from urllib.request import urlretrieve
 
 from caffe2.python.models.download import downloadFromURLToFile, getURLFromName, deleteDirectory
 
diff --git a/scripts/model_zoo/update-models-from-caffe2.py b/scripts/model_zoo/update-models-from-caffe2.py
index f3b485f495d3..9e408d6808f1 100644
--- a/scripts/model_zoo/update-models-from-caffe2.py
+++ b/scripts/model_zoo/update-models-from-caffe2.py
@@ -17,7 +17,7 @@
 
 import boto3
 
-from six.moves.urllib.request import urlretrieve
+from urllib.request import urlretrieve
 
 from caffe2.python.models.download import downloadFromURLToFile, getURLFromName, deleteDirectory
 from caffe2.proto import caffe2_pb2
@@ -163,7 +163,7 @@ def tensortype_to_ndarray(tensor_type):
 
 
 def generate_test_input_data(onnx_model, scale):
-    real_inputs_names = list(set([input.name for input in onnx_model.graph.input]) - set([init.name for init in onnx_model.graph.initializer]))
+    real_inputs_names = list({input.name for input in onnx_model.graph.input} - {init.name for init in onnx_model.graph.initializer})
     real_inputs = []
     for name in real_inputs_names:
         for input in onnx_model.graph.input:
diff --git a/scripts/release_notes/categorize.py b/scripts/release_notes/categorize.py
index a79c737d18e5..666597994386 100644
--- a/scripts/release_notes/categorize.py
+++ b/scripts/release_notes/categorize.py
@@ -128,7 +128,7 @@ def update_commit(self, commit, category, topic):
         assert topic in topics
         commit.category = category
         commit.topic = topic
-        self.commits.write_to_disk()
+        self.commits.write_result()
 
 def main():
     parser = argparse.ArgumentParser(description='Tool to help categorize commits')
diff --git a/scripts/release_notes/commitlist.py b/scripts/release_notes/commitlist.py
index 5529a2f2a9d5..d71486cdc8a8 100644
--- a/scripts/release_notes/commitlist.py
+++ b/scripts/release_notes/commitlist.py
@@ -17,14 +17,14 @@
 Create a new commitlist for consumption by categorize.py.
 Said commitlist contains commits between v1.5.0 and f5bc91f851.
 
-    python commitlist.py --create_new tags/v1.5.0 f5bc91f851
+    python commitlist.py --create-new tags/v1.5.0 f5bc91f851
 
 Update the existing commitlist to commit bfcb687b9c.
 
-    python commitlist.py --update_to bfcb687b9c
+    python commitlist.py --update-to bfcb687b9c
 
 """
-@dataclasses.dataclass(frozen=True)
+@dataclasses.dataclass(frozen=False)
 class Commit:
     commit_hash: str
     category: str
@@ -143,7 +143,7 @@ def categorize(features):
         files_changed = features['files_changed']
         for file in files_changed:
             file_lowercase = file.lower()
-            if CommitList.keywordInFile(file, ['docker/', '.circleci', '.github', '.jenkins', '.azure_pipelines']):
+            if CommitList.keywordInFile(file, ['docker/', '.circleci', '.github', '.jenkins', '.ci', '.azure_pipelines']):
                 category = 'releng'
                 break
             # datapipe(s), torch/utils/data, test_{dataloader, datapipe}
@@ -342,16 +342,16 @@ def main():
     parser = argparse.ArgumentParser(description='Tool to create a commit list')
 
     group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument('--create_new', nargs=2)
-    group.add_argument('--update_to')
+    group.add_argument('--create-new', '--create_new', nargs=2)
+    group.add_argument('--update-to', '--update_to')
     # I found this flag useful when experimenting with adding new auto-categorizing filters.
     # After running commitlist.py the first time, if you add any new filters in this file,
     # re-running with "rerun_with_new_filters" will update the existing commitlist.csv file,
     # but only affect the rows that were previously marked as "Uncategorized"
-    group.add_argument('--rerun_with_new_filters', action='store_true')
+    group.add_argument('--rerun-with-new-filters', '--rerun_with_new_filters', action='store_true')
     group.add_argument('--stat', action='store_true')
-    group.add_argument('--export_markdown', action='store_true')
-    group.add_argument('--export_csv_categories', action='store_true')
+    group.add_argument('--export-markdown', '--export_markdown', action='store_true')
+    group.add_argument('--export-csv-categories', '--export_csv_categories', action='store_true')
     parser.add_argument('--path', default='results/commitlist.csv')
     args = parser.parse_args()
 
diff --git a/scripts/release_notes/namespace_check.py b/scripts/release_notes/namespace_check.py
index 54196bdfbe6f..1b9a91c12f8a 100644
--- a/scripts/release_notes/namespace_check.py
+++ b/scripts/release_notes/namespace_check.py
@@ -39,7 +39,7 @@ def get_content(submod):
     return content
 
 def namespace_filter(data):
-    out = set(d for d in data if d[0] != "_")
+    out = {d for d in data if d[0] != "_"}
     return out
 
 def run(args, submod):
diff --git a/setup.py b/setup.py
index e428dc874f0f..d4b64795cbd9 100644
--- a/setup.py
+++ b/setup.py
@@ -95,6 +95,9 @@
 #   USE_FFMPEG
 #     enables use of ffmpeg for additional operators
 #
+#   USE_FLASH_ATTENTION=0
+#     disables building flash attention for scaled dot product attention
+#
 #   USE_LEVELDB
 #     enables use of LevelDB for storage
 #
@@ -172,6 +175,9 @@
 #   NCCL_INCLUDE_DIR
 #     specify where nccl is installed
 #
+#   NVFUSER_SOURCE_DIR
+#     specify nvfuser root directory
+#
 #   NVTOOLSEXT_PATH (Windows only)
 #     specify where nvtoolsext is installed
 #
@@ -211,7 +217,7 @@
     sys.exit(-1)
 
 import platform
-python_min_version = (3, 7, 0)
+python_min_version = (3, 8, 0)
 python_min_version_str = '.'.join(map(str, python_min_version))
 if sys.version_info < python_min_version:
     print("You are using Python {}. Python >={} is required.".format(platform.python_version(),
@@ -544,6 +550,11 @@ def run(self):
         else:
             report('-- Not using ITT')
 
+        if cmake_cache_vars['BUILD_NVFUSER']:
+            report('-- Building nvfuser')
+        else:
+            report('-- Not Building nvfuser')
+
         # Do not use clang to compile extensions if `-fstack-clash-protection` is defined
         # in system CFLAGS
         c_flags = str(os.getenv('CFLAGS', ''))
@@ -633,6 +644,22 @@ def build_extensions(self):
                     os.makedirs(dst_dir)
                 self.copy_file(src, dst)
 
+        # Copy nvfuser extension
+        for i, ext in enumerate(self.extensions):
+            if ext.name != "nvfuser._C":
+                continue
+            fullname = self.get_ext_fullname(ext.name)
+            filename = self.get_ext_filename(fullname)
+            fileext = os.path.splitext(filename)[1]
+            src = os.path.join(os.path.dirname(filename), "nvfuser" + fileext)
+            dst = os.path.join(os.path.realpath(self.build_lib), filename)
+            if os.path.exists(src):
+                report("Copying {} from {} to {}".format(ext.name, src, dst))
+                dst_dir = os.path.dirname(dst)
+                if not os.path.exists(dst_dir):
+                    os.makedirs(dst_dir)
+                self.copy_file(src, dst)
+
         setuptools.command.build_ext.build_ext.build_extensions(self)
 
 
@@ -798,12 +825,7 @@ def configure_extension_build():
         # /MD links against DLL runtime
         # and matches the flags set for protobuf and ONNX
         # /EHsc is about standard C++ exception handling
-        # /DNOMINMAX removes builtin min/max functions
-        # /wdXXXX disables warning no. XXXX
-        extra_compile_args = ['/MD', '/FS', '/EHsc', '/DNOMINMAX',
-                              '/wd4267', '/wd4251', '/wd4522', '/wd4522', '/wd4838',
-                              '/wd4305', '/wd4244', '/wd4190', '/wd4101', '/wd4996',
-                              '/wd4275']
+        extra_compile_args = ['/MD', '/FS', '/EHsc']
     else:
         extra_link_args = []
         extra_compile_args = [
@@ -891,6 +913,8 @@ def make_relative_rpath_args(path):
         excludes.extend(['caffe2', 'caffe2.*'])
     if not cmake_cache_vars['BUILD_FUNCTORCH']:
         excludes.extend(['functorch', 'functorch.*'])
+    if not cmake_cache_vars['BUILD_NVFUSER']:
+        excludes.extend(['nvfuser', 'nvfuser.*'])
     packages = find_packages(exclude=excludes)
     C = Extension("torch._C",
                   libraries=main_libraries,
@@ -913,6 +937,13 @@ def make_relative_rpath_args(path):
 
     # These extensions are built by cmake and copied manually in build_extensions()
     # inside the build_ext implementation
+    if cmake_cache_vars['USE_ROCM']:
+        triton_req_file = os.path.join(cwd, ".github", "requirements", "triton-requirements-rocm.txt")
+        if os.path.exists(triton_req_file):
+            with open(triton_req_file) as f:
+                triton_req = f.read().strip()
+                extra_install_requires.append(triton_req)
+
     if cmake_cache_vars['BUILD_CAFFE2']:
         extensions.append(
             Extension(
@@ -937,6 +968,12 @@ def make_relative_rpath_args(path):
                 name=str('functorch._C'),
                 sources=[]),
         )
+    if cmake_cache_vars['BUILD_NVFUSER']:
+        extensions.append(
+            Extension(
+                name=str('nvfuser._C'),
+                sources=[]),
+        )
 
     cmdclass = {
         'bdist_wheel': wheel_concatenate,
@@ -981,9 +1018,11 @@ def print_box(msg):
 def main():
     # the list of runtime dependencies required by this built package
     install_requires = [
-        'typing_extensions',
+        'filelock',
+        'typing-extensions',
         'sympy',
         'networkx',
+        'jinja2',
     ]
 
     extras_require = {
@@ -1061,6 +1100,7 @@ def main():
         'include/ATen/hip/detail/*.cuh',
         'include/ATen/hip/detail/*.h',
         'include/ATen/hip/impl/*.h',
+        'include/ATen/miopen/*.h',
         'include/ATen/detail/*.h',
         'include/ATen/native/*.h',
         'include/ATen/native/cpu/*.h',
@@ -1114,6 +1154,9 @@ def main():
         'include/torch/csrc/distributed/c10d/*.h',
         'include/torch/csrc/distributed/c10d/*.hpp',
         'include/torch/csrc/distributed/rpc/*.h',
+        'include/torch/csrc/distributed/autograd/context/*.h',
+        'include/torch/csrc/distributed/autograd/functions/*.h',
+        'include/torch/csrc/distributed/autograd/rpc_messages/*.h',
         'include/torch/csrc/jit/*.h',
         'include/torch/csrc/jit/backends/*.h',
         'include/torch/csrc/jit/generated/*.h',
diff --git a/test/_nvfuser/__init__.py b/test/_nvfuser/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/_nvfuser/test_dynamo.py b/test/_nvfuser/test_dynamo.py
new file mode 120000
index 000000000000..140d1d845e7f
--- /dev/null
+++ b/test/_nvfuser/test_dynamo.py
@@ -0,0 +1 @@
+../../third_party/nvfuser/python_tests/test_dynamo.py
\ No newline at end of file
diff --git a/test/_nvfuser/test_python_frontend.py b/test/_nvfuser/test_python_frontend.py
new file mode 120000
index 000000000000..a022a886483e
--- /dev/null
+++ b/test/_nvfuser/test_python_frontend.py
@@ -0,0 +1 @@
+../../third_party/nvfuser/python_tests/test_python_frontend.py
\ No newline at end of file
diff --git a/test/_nvfuser/test_torchscript.py b/test/_nvfuser/test_torchscript.py
new file mode 120000
index 000000000000..24384a274229
--- /dev/null
+++ b/test/_nvfuser/test_torchscript.py
@@ -0,0 +1 @@
+../../third_party/nvfuser/python_tests/test_torchscript.py
\ No newline at end of file
diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json
index caa8f8ca9ef3..6b424d34d70a 100644
--- a/test/allowlist_for_publicAPI.json
+++ b/test/allowlist_for_publicAPI.json
@@ -579,6 +579,7 @@
     "prelu",
     "relu_",
     "rrelu_",
+    "scaled_dot_product_attention",
     "selu_",
     "softplus",
     "softshrink",
@@ -1266,7 +1267,6 @@
     "_sparse_csr_sum",
     "_sparse_csr_tensor_unsafe",
     "_sparse_log_softmax_backward_data",
-    "_sparse_mask_helper",
     "_sparse_softmax_backward_data",
     "_sparse_sparse_matmul",
     "_sparse_sum",
@@ -1298,6 +1298,7 @@
     "lobpcg",
     "lu",
     "obj",
+    "segment_reduce",
     "set_default_dtype",
     "set_grad_enabled",
     "set_printoptions",
diff --git a/test/ao/sparsity/test_composability.py b/test/ao/sparsity/test_composability.py
index f531dd2927bb..85d78c49ea54 100644
--- a/test/ao/sparsity/test_composability.py
+++ b/test/ao/sparsity/test_composability.py
@@ -188,7 +188,7 @@ def test_s_prep_before_fusion(self):
         )
 
         # check that final module is the expected quantized module and that the model runs
-        self.assertTrue(isinstance(mod[5], torch.nn.intrinsic.quantized.LinearReLU))
+        self.assertTrue(isinstance(mod[5], torch.ao.nn.intrinsic.quantized.LinearReLU))
         self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4]))
 
     # This tests whether performing fusion before sparse prepare causes and issues. The
@@ -230,7 +230,7 @@ def test_fusion_before_s_prep(self):
         tq.convert(mod, inplace=True)
 
         # check that final module is the expected quantized module and that the model runs
-        self.assertTrue(isinstance(mod[5], torch.nn.intrinsic.quantized.LinearReLU))
+        self.assertTrue(isinstance(mod[5], torch.ao.nn.intrinsic.quantized.LinearReLU))
         self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4]))
 
         # check that module was actually sparsified
@@ -375,7 +375,7 @@ def test_q_prep_fx_before_s_prep(self):
         mod = convert_fx(mod)
 
         # check that final module is the expected quantized module and that the model runs
-        self.assertTrue(isinstance(fqn_to_module(mod, "5"), torch.nn.intrinsic.quantized.LinearReLU))
+        self.assertTrue(isinstance(fqn_to_module(mod, "5"), torch.ao.nn.intrinsic.quantized.LinearReLU))
         self.assertEqual(mod(example).shape, torch.Size([1, 4, 4, 4]))
 
         # check that module was actually sparsified
@@ -433,9 +433,9 @@ def test_q_prep_fx_s_prep_ref_conv(self):
         mod = convert_to_reference_fx(mod)
 
         # check that final module is the expected quantized module and that the model runs
-        self.assertTrue(isinstance(fqn_to_module(mod, "5"), torch.nn.intrinsic.LinearReLU))
+        self.assertTrue(isinstance(fqn_to_module(mod, "5"), torch.ao.nn.intrinsic.LinearReLU))
         self.assertEqual(mod(example).shape, torch.Size([1, 4, 4, 4]))
-        self.assertTrue(isinstance(fqn_to_module(mod, "5.0"), torch.nn.quantized._reference.Linear))
+        self.assertTrue(isinstance(fqn_to_module(mod, "5.0"), torch.ao.nn.quantized.reference.Linear))
 
         # check that module was actually sparsified
         cur_sparsity = _calculate_sparsity(fqn_to_module(mod, "5.0.weight"))
@@ -479,7 +479,7 @@ def test_s_prep_before_q_prep_fx(self):
         mod = convert_fx(mod)
 
         # check that final module is the expected quantized module and that the model runs
-        self.assertTrue(isinstance(fqn_to_module(mod, "5"), torch.nn.intrinsic.quantized.LinearReLU))
+        self.assertTrue(isinstance(fqn_to_module(mod, "5"), torch.ao.nn.intrinsic.quantized.LinearReLU))
         self.assertEqual(mod(example).shape, torch.Size([1, 4, 4, 4]))
 
         # check that module was actually sparsified
@@ -525,7 +525,7 @@ def test_s_prep_before_qat_prep_fx(self):
         mod = convert_fx(mod)
 
         # check that final module is the expected quantized module and that the model runs
-        self.assertTrue(isinstance(fqn_to_module(mod, "5"), torch.nn.intrinsic.quantized.LinearReLU))
+        self.assertTrue(isinstance(fqn_to_module(mod, "5"), torch.ao.nn.intrinsic.quantized.LinearReLU))
         self.assertEqual(mod(example).shape, torch.Size([1, 4, 4, 4]))
 
         # check that module was actually sparsified
@@ -570,9 +570,9 @@ def test_s_prep_q_prep_fx_ref(self):
         mod = convert_to_reference_fx(mod)
 
         # check that final module is the expected quantized module and that the model runs
-        self.assertTrue(isinstance(fqn_to_module(mod, "5"), torch.nn.intrinsic.LinearReLU))
+        self.assertTrue(isinstance(fqn_to_module(mod, "5"), torch.ao.nn.intrinsic.LinearReLU))
         self.assertEqual(mod(example).shape, torch.Size([1, 4, 4, 4]))
-        self.assertTrue(isinstance(fqn_to_module(mod, "5.0"), torch.nn.quantized._reference.Linear))
+        self.assertTrue(isinstance(fqn_to_module(mod, "5.0"), torch.ao.nn.quantized.reference.Linear))
 
         # check that module was actually sparsified
         cur_sparsity = _calculate_sparsity(fqn_to_module(mod, "5.0.weight"))
diff --git a/test/ao/sparsity/test_data_sparsifier.py b/test/ao/sparsity/test_data_sparsifier.py
index a431ac4535a6..666cdf7eb46c 100644
--- a/test/ao/sparsity/test_data_sparsifier.py
+++ b/test/ao/sparsity/test_data_sparsifier.py
@@ -213,7 +213,7 @@ def check_memory_reference(self, data_list, data_with_config, defaults, **kwargs
             weight = sparsifier._extract_weight(data)
             weight.data = weight + torch.randn(*weight.shape)
             contained_data = sparsifier.get_data(name=name)
-            assert id(weight.data) == id(contained_data.data)
+            assert weight.data.storage().data_ptr() == contained_data.data.storage().data_ptr()
             assert torch.all(contained_data == weight)
 
 
@@ -533,8 +533,8 @@ def test_ptq_sparsify_first(self):
                                       select_embeddings=select_embeddings,
                                       **sparse_config)
 
-        assert type(model.emb1) == torch.nn.quantized.modules.embedding_ops.Embedding
-        assert type(model.embbag1) == torch.nn.quantized.modules.embedding_ops.EmbeddingBag
+        assert type(model.emb1) == torch.ao.nn.quantized.modules.embedding_ops.Embedding
+        assert type(model.embbag1) == torch.ao.nn.quantized.modules.embedding_ops.EmbeddingBag
         assert type(model.emb_seq[0] == nn.Embedding)
         assert type(model.emb_seq[1] == nn.EmbeddingBag)
         assert type(model.linear1) == nn.Linear
@@ -568,10 +568,10 @@ def test_ptq_quantize_first(self):
         sparse_config = {'sparsity_level': 0.8, 'sparse_block_shape': (1, 1)}
         post_training_sparse_quantize(model, DataNormSparsifier, sparsify_first=False, **sparse_config)
 
-        assert type(model.emb1) == torch.nn.quantized.modules.embedding_ops.Embedding
-        assert type(model.embbag1) == torch.nn.quantized.modules.embedding_ops.EmbeddingBag
-        assert type(model.emb_seq[0] == torch.nn.quantized.modules.embedding_ops.Embedding)
-        assert type(model.emb_seq[1] == torch.nn.quantized.modules.embedding_ops.EmbeddingBag)
+        assert type(model.emb1) == torch.ao.nn.quantized.modules.embedding_ops.Embedding
+        assert type(model.embbag1) == torch.ao.nn.quantized.modules.embedding_ops.EmbeddingBag
+        assert type(model.emb_seq[0] == torch.ao.nn.quantized.modules.embedding_ops.Embedding)
+        assert type(model.emb_seq[1] == torch.ao.nn.quantized.modules.embedding_ops.EmbeddingBag)
         assert type(model.linear1) == nn.Linear  # not quantized
         assert type(model.linear2) == nn.Linear  # not quantized
 
diff --git a/test/ao/sparsity/test_sparsifier.py b/test/ao/sparsity/test_sparsifier.py
index 512c58b18836..582f12fe4861 100644
--- a/test/ao/sparsity/test_sparsifier.py
+++ b/test/ao/sparsity/test_sparsifier.py
@@ -417,7 +417,7 @@ def test_mask_squash(self):
             assert torch.all(weights == torch.eye(height, width) * weights)  # only diagonal to be present
 
     def test_sparsity_levels(self):
-        nearliness_levels = list(nearliness for nearliness in range(-1, 100))
+        nearliness_levels = list(range(-1, 100))
         model = nn.Sequential()
 
         p = re.compile(r'[-\.\s]')
diff --git a/test/ao/sparsity/test_structured_sparsifier.py b/test/ao/sparsity/test_structured_sparsifier.py
index c8cda86a6313..045a73f0b93e 100644
--- a/test/ao/sparsity/test_structured_sparsifier.py
+++ b/test/ao/sparsity/test_structured_sparsifier.py
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
 # Owner(s): ["module: unknown"]
-
-
 import copy
 import logging
 import random
@@ -10,6 +8,7 @@
 from torch import nn
 from torch.ao.pruning._experimental.pruner import (
     SaliencyPruner,
+    LSTMSaliencyPruner,
     BaseStructuredSparsifier,
     FakeStructuredSparsity,
 )
@@ -28,8 +27,12 @@
     Conv2dPool,
     Conv2dPoolFlatten,
     Conv2dPoolFlattenFunctional,
+    LSTMLinearModel,
+    LSTMLayerNormLinearModel,
+    rows_are_subset,
 )
 
+
 logging.basicConfig(
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
 )
@@ -53,8 +56,25 @@ def update_mask(self, module, tensor_name, **kwargs):
         module.parametrizations[tensor_name][0].mask[prune] = False
 
 
+class BottomHalfLSTMPruner(BaseStructuredSparsifier):
+    """
+    Pruner that will remove the bottom half of the rows.
+    This is primarily meant for testing purposes
+    """
+
+    def update_mask(self, module, tensor_name, **kwargs):
+        for p in getattr(module.parametrizations, tensor_name):
+            if isinstance(p, FakeStructuredSparsity):
+                mask = p.mask
+                masks = torch.split(mask, len(mask) // 4)
+                for small in masks:
+                    num = len(small)
+                    small[num // 2 :] = False
+                new_mask = torch.cat(masks)
+                mask.data = new_mask.data
+
 class TestSaliencyPruner(TestCase):
-    def test_update_mask(self):
+    def test_saliency_pruner_update_mask(self):
         """Test that we prune out the row with the lowest saliency (first row)"""
         model = SimpleLinear()
         with torch.no_grad():
@@ -75,6 +95,70 @@ def test_update_mask(self):
         assert expected.shape == pruned.shape
         assert torch.isclose(expected, pruned, rtol=1e-05, atol=1e-07).all()
 
+    def test_lstm_saliency_pruner_update_mask(self):
+        model = LSTMLinearModel(
+            input_dim=2,
+            hidden_dim=2,
+            output_dim=2,
+            num_layers=1,
+        )
+
+        manual_weights = torch.Tensor([[1, 1],
+                                       [2, 2],
+                                       [2, 2],
+                                       [1, 1],
+                                       [-1, -1],
+                                       [-2, -2],
+                                       [-2, -2],
+                                       [-1, -1]])
+
+        with torch.no_grad():
+            model.lstm.weight_ih_l0 = nn.Parameter(manual_weights)
+            model.lstm.weight_hh_l0 = nn.Parameter(torch.Tensor(manual_weights))
+            model.lstm.bias_ih_l0 = nn.Parameter(manual_weights[:, 0])
+            model.lstm.bias_hh_l0 = nn.Parameter(manual_weights[:, 0])
+
+        config = [
+            {"tensor_fqn": "lstm.weight_ih_l0"},
+            {"tensor_fqn": "lstm.weight_hh_l0"},
+        ]
+        lstm_input = torch.ones((1, 2))
+        fx_pruner = LSTMSaliencyPruner({"sparsity_level": 0.5})
+        fx_pruner.prepare(model, config)
+        fx_pruner.enable_mask_update = True
+        fx_pruner.step()
+
+        model.eval()
+        pruned_model = fx_pruner.prune()
+        pruned_model.eval()
+
+        # make sure both models run
+        model(lstm_input)
+        pruned_model(lstm_input)
+
+        # make sure lowest saliency rows are pruned
+        expected = torch.Tensor([[2, 2],
+                                 [2, 2],
+                                 [-2, -2],
+                                 [-2, -2]])
+        pruned = model.lstm.weight_ih_l0
+        assert expected.shape == pruned.shape
+        assert torch.isclose(expected, pruned, rtol=1e-05, atol=1e-07).all()
+
+        expected = torch.Tensor([[2],
+                                 [2],
+                                 [-2],
+                                 [-2]])
+        pruned = model.lstm.weight_hh_l0
+        assert expected.shape == pruned.shape
+        assert torch.isclose(expected, pruned, rtol=1e-05, atol=1e-07).all()
+
+        expected = torch.Tensor([2, 2, -2, -2])
+        for pruned in [model.lstm.bias_ih_l0, model.lstm.bias_hh_l0]:
+            assert expected.shape == pruned.shape
+            assert torch.isclose(expected, pruned, rtol=1e-05, atol=1e-07).all()
+
+
 
 class TestBaseStructuredSparsifier(TestCase):
     def _check_pruner_prepared(self, model, pruner, device):
@@ -667,3 +751,169 @@ def test_complex_conv2d(self):
                     torch.device(device),
                     also_prune_bias,
                 )
+
+    def test_prune_lstm_linear_multiple_layer(self):
+        """
+        Test fusion support for LSTM(multi-layer) -> Linear
+        """
+        model = LSTMLinearModel(
+            input_dim=8,
+            hidden_dim=8,
+            output_dim=8,
+            num_layers=2,
+        )
+
+        config = [
+            {"tensor_fqn": "lstm.weight_ih_l0"},
+            {"tensor_fqn": "lstm.weight_hh_l0"},
+            {"tensor_fqn": "lstm.weight_ih_l1"},
+            {"tensor_fqn": "lstm.weight_hh_l1"},
+        ]
+
+        lstm_input = torch.ones((1, 8))
+        fx_pruner = BottomHalfLSTMPruner({"sparsity_level": 0.5})
+        fx_pruner.prepare(model, config)
+
+        fx_pruner.enable_mask_update = True
+        fx_pruner.step()
+
+        model.eval()
+        _, _ = model(lstm_input)
+        pruned_model = fx_pruner.prune()
+        pruned_model.eval()
+        _, _ = pruned_model(lstm_input)
+
+        expected_params = dict(model.named_parameters())
+        for name, param in model.named_parameters():
+            assert name in expected_params
+            # We cannot compare y_expected == y_pruned, as the 0 elements mess up the numerics
+            # Instead we check that the weights of the new LSTM are a subset of the weights of
+            # the old LSTM
+            assert rows_are_subset(param, expected_params[name])
+            del expected_params[name]
+
+        # assert we haven't deleted any keys
+        assert len(expected_params) == 0
+
+    def test_prune_lstm_linear_single_layer(self):
+        """
+        Test fusion support for LSTM (single-layer) -> Linear
+        """
+        model = LSTMLinearModel(
+            input_dim=8,
+            hidden_dim=8,
+            output_dim=8,
+            num_layers=1,
+        )
+
+        config = [
+            {"tensor_fqn": "lstm.weight_ih_l0"},
+            {"tensor_fqn": "lstm.weight_hh_l0"},
+        ]
+
+        lstm_input = torch.ones((1, 8))
+        fx_pruner = BottomHalfLSTMPruner({"sparsity_level": 0.5})
+        fx_pruner.prepare(model, config)
+        fx_pruner.enable_mask_update = True
+        fx_pruner.step()
+        model.eval()
+
+        out_expected, lstm_out_expected = model(lstm_input)
+        pruned_model = fx_pruner.prune()
+        pruned_model.eval()
+        out_pruned, lstm_out_pruned = pruned_model(lstm_input)
+        r, c = lstm_out_expected.size()
+
+        # We cannot check that y_expected == y_pruned as usual because
+        # zeros vs. missing elements yield different numerical results.
+        # Instead that we check that the pruned elements are the first half of the results
+        # since we are using a BottomHalfLSTMPruner
+        assert torch.isclose(
+            lstm_out_expected[:, : c // 2], lstm_out_pruned, rtol=1e-05, atol=1e-07
+        ).all()
+        # also check that output of linear is the same shape, this means we've resized
+        # linear columns correctly.
+        assert out_expected.shape == out_pruned.shape
+
+    def test_prune_lstm_layernorm_linear_multiple_layer(self):
+        """
+        Test fusion support for LSTM(multi-layer) -> Linear
+        """
+        model = LSTMLayerNormLinearModel(
+            input_dim=8,
+            output_dim=8,
+            hidden_dim=8,
+            num_layers=2,
+        )
+
+        config = [
+            {"tensor_fqn": "lstm.weight_ih_l0"},
+            {"tensor_fqn": "lstm.weight_hh_l0"},
+            {"tensor_fqn": "lstm.weight_ih_l1"},
+            {"tensor_fqn": "lstm.weight_hh_l1"},
+        ]
+
+        lstm_input = torch.ones((1, 8))
+        fx_pruner = BottomHalfLSTMPruner({"sparsity_level": 0.5})
+        fx_pruner.prepare(model, config)
+
+        fx_pruner.enable_mask_update = True
+        fx_pruner.step()
+
+        model.eval()
+        _, _ = model(lstm_input)
+        pruned_model = fx_pruner.prune()
+        pruned_model.eval()
+        _, _ = pruned_model(lstm_input)
+
+        expected_params = dict(model.named_parameters())
+        for name, param in model.named_parameters():
+            assert name in expected_params
+            # We cannot compare y_expected == y_pruned, as the 0 elements mess up the numerics
+            # Instead we check that the weights of the new LSTM are a subset of the weights of
+            # the old LSTM
+            assert rows_are_subset(param, expected_params[name])
+            del expected_params[name]
+
+        # assert we haven't deleted any keys
+        assert len(expected_params) == 0
+
+    def test_prune_lstm_layernorm_linear_single_layer(self):
+        """
+        Test fusion support for LSTM (single-layer) -> Linear
+        """
+        model = LSTMLinearModel(
+            input_dim=8,
+            hidden_dim=8,
+            output_dim=8,
+            num_layers=1,
+        )
+
+        config = [
+            {"tensor_fqn": "lstm.weight_ih_l0"},
+            {"tensor_fqn": "lstm.weight_hh_l0"},
+        ]
+
+        lstm_input = torch.ones((1, 8))
+        fx_pruner = BottomHalfLSTMPruner({"sparsity_level": 0.5})
+        fx_pruner.prepare(model, config)
+        fx_pruner.enable_mask_update = True
+        fx_pruner.step()
+        model.eval()
+
+        out_expected, lstm_out_expected = model(lstm_input)
+        pruned_model = fx_pruner.prune()
+        pruned_model.eval()
+        out_pruned, lstm_out_pruned = pruned_model(lstm_input)
+        r, c = lstm_out_expected.size()
+
+        # We cannot check that y_expected == y_pruned as usual because
+        # zeros vs. missing elements yield different numerical results.
+        # Instead that we check that the pruned elements are the first half of the results
+        # since we are using a BottomHalfLSTMPruner
+        assert torch.isclose(
+            lstm_out_expected[:, : c // 2], lstm_out_pruned, rtol=1e-05, atol=1e-07
+        ).all()
+        # also check that output of linear is the same shape, this means we've resized
+        # linear columns correctly.
+        assert out_expected.shape == out_pruned.shape
diff --git a/test/autograd/test_complex.py b/test/autograd/test_complex.py
index c8796a4bae61..5162e0399ee8 100644
--- a/test/autograd/test_complex.py
+++ b/test/autograd/test_complex.py
@@ -15,11 +15,11 @@ def test_view_func_for_complex_views(self):
         x1 = torch.view_as_complex(x0)
         x2 = torch.view_as_real(x1)
         x2.mul_(2)
-        x2.sum().backward()
+        x2.sum().abs().backward()
 
         y0 = y.clone()
         y0.mul_(2)
-        y0.sum().backward()
+        y0.sum().abs().backward()
 
         self.assertEqual(x.grad, y.grad)
 
@@ -35,11 +35,11 @@ def fn(a):
 
         x0 = fn(x)
         x0.mul_(2)
-        x0.sum().backward()
+        x0.sum().abs().backward()
 
         y0 = fn(y)
         y1 = y0.mul(2)
-        y1.sum().backward()
+        y1.sum().abs().backward()
 
         self.assertEqual(x.grad, y.grad)
 
@@ -55,11 +55,11 @@ def fn(a, dim0_size=5):
 
         x0 = fn(x)
         x0.mul_(2)
-        x0.sum().backward()
+        x0.sum().abs().backward()
 
         y0 = fn(y)
         y1 = y0.mul(2)
-        y1.sum().backward()
+        y1.sum().abs().backward()
 
         self.assertEqual(x.grad, y.grad)
 
diff --git a/test/backends/xeon/test_launch.py b/test/backends/xeon/test_launch.py
index 056a53ee110d..c3585ba7429d 100644
--- a/test/backends/xeon/test_launch.py
+++ b/test/backends/xeon/test_launch.py
@@ -52,8 +52,8 @@ def test_cpu_info(self):
 
     def test_multi_threads(self):
         num = 0
-        with subprocess.Popen(f"python -m torch.backends.xeon.run_cpu --ninstances 4 --use_default_allocator \
-            --disable_iomp --disable_numactl --log_path {self._test_dir} --no_python pwd",
+        with subprocess.Popen(f"python -m torch.backends.xeon.run_cpu --ninstances 4 --use-default-allocator \
+            --disable-iomp --disable-numactl --log-path {self._test_dir} --no-python pwd",
                               shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as p:
             for line in p.stdout.readlines():
                 segs = str(line, "utf-8").strip().split("-")
diff --git a/test/bottleneck_test/test_cuda.py b/test/bottleneck_test/test_cuda.py
index 1cf018f0241a..65bbcac0f015 100644
--- a/test/bottleneck_test/test_cuda.py
+++ b/test/bottleneck_test/test_cuda.py
@@ -6,7 +6,7 @@
 
 class Model(nn.Module):
     def __init__(self):
-        super(Model, self).__init__()
+        super().__init__()
         self.linear = nn.Linear(20, 20)
 
     def forward(self, input):
diff --git a/test/cpp/api/module.cpp b/test/cpp/api/module.cpp
index dd16d9c18083..28f17f10ff43 100644
--- a/test/cpp/api/module.cpp
+++ b/test/cpp/api/module.cpp
@@ -45,8 +45,7 @@ TEST_F(ModuleTest, ZeroGrad) {
   for (auto& parameter : module->parameters()) {
     // NOLINTNEXTLINE(performance-unnecessary-copy-initialization)
     auto grad = parameter.grad();
-    ASSERT_TRUE(grad.defined());
-    ASSERT_EQ(grad.sum().item<float>(), 0);
+    ASSERT_FALSE(grad.defined());
   }
 }
 
@@ -66,14 +65,14 @@ TEST_F(ModuleTest, ZeroGradWithUndefined) {
   ASSERT_TRUE(module.x.grad().defined());
   ASSERT_FALSE(module.y.grad().defined());
 
-  module.zero_grad();
+  module.zero_grad(false); // set_to_none = false
 
   ASSERT_TRUE(module.x.grad().defined());
   ASSERT_FALSE(module.y.grad().defined());
 
   ASSERT_EQ(module.x.grad().sum().item<float>(), 0);
 
-  module.zero_grad(true); // set_to_none = true
+  module.zero_grad();
 
   ASSERT_FALSE(module.x.grad().defined());
   ASSERT_FALSE(module.y.grad().defined());
diff --git a/test/cpp/api/moduledict.cpp b/test/cpp/api/moduledict.cpp
index 88a46f37a8c6..51018435236b 100644
--- a/test/cpp/api/moduledict.cpp
+++ b/test/cpp/api/moduledict.cpp
@@ -299,3 +299,11 @@ TEST_F(ModuleDictTest, PrettyPrintModuleDict) {
       "  (lstm): torch::nn::LSTM(input_size=4, hidden_size=5, num_layers=1, bias=true, batch_first=false, dropout=0, bidirectional=false)\n"
       ")");
 }
+
+TEST_F(ModuleDictTest, InvalidAt) {
+  torch::OrderedDict<std::string, std::shared_ptr<Module>> ordereddict = {
+      {"linear", Linear(10, 3).ptr()}};
+  ModuleDict dict(ordereddict);
+  ASSERT_THROWS_WITH(
+      dict->at<torch::nn::Dropout2dImpl>("linear"), "Unable to cast module");
+}
diff --git a/test/cpp/api/modulelist.cpp b/test/cpp/api/modulelist.cpp
index e8a0bebeb945..afd7df433fc5 100644
--- a/test/cpp/api/modulelist.cpp
+++ b/test/cpp/api/modulelist.cpp
@@ -300,3 +300,9 @@ TEST_F(ModuleListTest, RangeBasedForLoop) {
     module->pretty_print(buffer);
   }
 }
+
+TEST_F(ModuleListTest, InvalidAt) {
+  torch::nn::ModuleList m(torch::nn::Linear(1, 2));
+  ASSERT_THROWS_WITH(
+      m->at<torch::nn::Dropout2dImpl>(0), "Unable to cast module");
+}
diff --git a/test/cpp/api/optim.cpp b/test/cpp/api/optim.cpp
index 6bdc23e0f9ba..a71eb6f1812d 100644
--- a/test/cpp/api/optim.cpp
+++ b/test/cpp/api/optim.cpp
@@ -435,8 +435,7 @@ TEST(OptimTest, ZeroGrad) {
   optimizer.zero_grad();
 
   for (const auto& parameter : model->parameters()) {
-    ASSERT_TRUE(parameter.grad().defined());
-    ASSERT_EQ(parameter.grad().sum().item<float>(), 0);
+    ASSERT_FALSE(parameter.grad().defined());
   }
 }
 
diff --git a/test/cpp/api/tensor.cpp b/test/cpp/api/tensor.cpp
index 78d629f97ef7..2c4352e96086 100644
--- a/test/cpp/api/tensor.cpp
+++ b/test/cpp/api/tensor.cpp
@@ -1099,6 +1099,13 @@ TEST(TensorTest, BackwardNonScalarOutputs) {
       y.backward(), "grad can be implicitly created only for scalar outputs");
 }
 
+TEST(TensorTest, BackwardComplexScalarOutput) {
+  auto x = torch::randn({5, 5}, torch::requires_grad());
+  auto y = (x * c10::Scalar(c10::complex<float>(0, 0.5))).sum();
+  ASSERT_THROWS_WITH(
+      y.backward(), "grad can be computed only for real scalar outputs");
+}
+
 TEST(TensorTest, IsLeaf) {
   auto x = torch::tensor({5}, torch::dtype(torch::kFloat).requires_grad(true));
   auto y = x * x;
diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt
index b8b765a68d8b..2376f1bc43b1 100644
--- a/test/cpp/jit/CMakeLists.txt
+++ b/test/cpp/jit/CMakeLists.txt
@@ -95,23 +95,6 @@ set(JIT_TEST_SRCS
   ${JIT_TEST_ROOT}/test_flatbuffer.cpp
 )
 
-if(USE_CUDA)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_definition.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_cache.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_record.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu1.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu2.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu3.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_tensor_factories.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_view.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_rng.cu)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_utils.cpp)
-endif()
-
 add_executable(test_jit
   ${TORCH_ROOT}/test/cpp/common/main.cpp
   ${JIT_TEST_SRCS}
diff --git a/test/cpp/jit/test_exception.cpp b/test/cpp/jit/test_exception.cpp
index 7f57bc5ca75a..2f495c405cfe 100644
--- a/test/cpp/jit/test_exception.cpp
+++ b/test/cpp/jit/test_exception.cpp
@@ -113,7 +113,7 @@ TEST(TestException, TestCustomException) {
   py::exec(R"PY(
   class SimpleValueError(ValueError):
     def __init__(self, message):
-      super(SimpleValueError, self).__init__(message)
+      super().__init__(message)
   )PY");
 
   std::string pythonCode = R"PY(
diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp
index c45ca96383e9..212d64251de3 100644
--- a/test/cpp/jit/test_lite_interpreter.cpp
+++ b/test/cpp/jit/test_lite_interpreter.cpp
@@ -1157,7 +1157,7 @@ TEST(RunTimeTest, ParseOperator) {
 
   // class Add(torch.nn.Module):
   //     def __init__(self):
-  //         super(Add, self).__init__()
+  //         super().__init__()
 
   //     def forward(self, a, b):
   //         return a + b
diff --git a/test/cpp/jit/tests_setup.py b/test/cpp/jit/tests_setup.py
index 35267352e86c..b4643927a978 100644
--- a/test/cpp/jit/tests_setup.py
+++ b/test/cpp/jit/tests_setup.py
@@ -3,7 +3,7 @@
 import torch
 
 
-class Setup(object):
+class Setup:
     def setup(self):
         raise NotImplementedError()
 
@@ -11,7 +11,7 @@ def shutdown(self):
         raise NotImplementedError()
 
 
-class FileSetup(object):
+class FileSetup:
     path = None
 
     def shutdown(self):
@@ -26,7 +26,7 @@ class EvalModeForLoadedModule(FileSetup):
     def setup(self):
         class Model(torch.jit.ScriptModule):
             def __init__(self):
-                super(Model, self).__init__()
+                super().__init__()
                 self.dropout = torch.nn.Dropout(0.1)
 
             @torch.jit.script_method
diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff
index 4f62dbfbeb80..8860b3f1eeb3 100644
Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff differ
diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff
index 01891bc9e4a9..ce612850374e 100644
Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff differ
diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff
index f932d478d0ab..d895b209afb3 100644
Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff differ
diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff
index d20ba9bf4820..ba723415ecf2 100644
Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff differ
diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff
index 7299062135c9..61b96da1f6fb 100644
Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff differ
diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff
index 700a0e5bae11..3085b13a71be 100644
Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff differ
diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff
index 0b1200312851..2f8cfb95856c 100644
Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff differ
diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff
index ce5daf444635..20524c64f7c2 100644
Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff differ
diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff
index 46b57c83fe78..fc5e8f3bd2a0 100644
Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff differ
diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff
index 963070db5149..bdababe90d91 100644
Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff differ
diff --git a/test/cpp/lazy/test_lazy_ops.cpp b/test/cpp/lazy/test_lazy_ops.cpp
index 4f48cd8e8686..aa31ffc59bb5 100644
--- a/test/cpp/lazy/test_lazy_ops.cpp
+++ b/test/cpp/lazy/test_lazy_ops.cpp
@@ -956,7 +956,7 @@ TEST_F(LazyOpsTest, TestIntegerAdd) {
       torch::Tensor b =
           torch::randint(0, 63, {2, 2}, torch::TensorOptions(type));
       torch::Scalar one =
-          isIntegralType(type) ? torch::Scalar(1) : torch::Scalar(1.0);
+          isIntegralType(type, false) ? torch::Scalar(1) : torch::Scalar(1.0);
       torch::Tensor c = torch::add(b, one);
 
       torch::Tensor lazy_a = CopyToDevice(a, device);
@@ -1028,39 +1028,6 @@ TEST_F(LazyOpsTest, TestQR) {
   }
 }
 
-TEST_F(LazyOpsTest, TestSymEig) {
-  static const int dims[] = {4, 7};
-  for (auto m : dims) {
-    for (bool eigenvectors : {true, false}) {
-      for (bool upper : {true, false}) {
-        torch::Tensor a = torch::rand(
-            {m, m},
-            torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
-        torch::Tensor sym_a = a.mm(a.t());
-        auto b = torch::symeig(sym_a, eigenvectors, upper);
-        ForEachDevice([&](const torch::Device& device) {
-          torch::Tensor lazy_a = CopyToDevice(sym_a, device);
-          auto lazy_b = torch::symeig(lazy_a, eigenvectors, upper);
-          AllClose(
-              std::get<0>(b),
-              std::get<0>(lazy_b),
-              /*rtol=*/3e-2,
-              /*atol=*/1e-2);
-          if (eigenvectors) {
-            AllClose(
-                std::get<1>(b).abs(),
-                std::get<1>(lazy_b).abs(),
-                /*rtol=*/3e-2,
-                /*atol=*/1e-2);
-          } else {
-            EXPECT_EQ(std::get<1>(b).sizes(), std::get<1>(lazy_b).sizes());
-          }
-        });
-      }
-    }
-  }
-}
-
 TEST_F(LazyOpsTest, TestCholesky) {
   static const int dims[] = {4, 7};
   for (auto m : dims) {
@@ -1586,7 +1553,7 @@ TEST_F(LazyOpsTest, TestStdWithCorrection) {
   torch::Tensor a = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   // int rank = a.dim();
-  c10::optional<int64_t> corrections[] = {1, 2, c10::nullopt};
+  c10::optional<c10::Scalar> corrections[] = {1, 2, c10::nullopt};
   for (const auto& correction : corrections) {
     for (auto keepdim : {true, false}) {
       for (const auto& dim :
@@ -1606,7 +1573,7 @@ TEST_F(LazyOpsTest, TestStdMeanWithCorrection) {
   torch::Tensor a = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   // int rank = a.dim();
-  c10::optional<int64_t> corrections[] = {1, 2, c10::nullopt};
+  c10::optional<c10::Scalar> corrections[] = {1, 2, c10::nullopt};
   for (const auto& correction : corrections) {
     for (auto keepdim : {true, false}) {
       for (const auto& dim :
@@ -1743,7 +1710,7 @@ TEST_F(LazyOpsTest, TestVarWithDim) {
 TEST_F(LazyOpsTest, TestVarWithCorrection) {
   torch::Tensor a = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
-  c10::optional<int64_t> corrections[] = {1, 2, c10::nullopt};
+  c10::optional<c10::Scalar> corrections[] = {1, 2, c10::nullopt};
   for (const auto& dim : std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) {
     for (bool keepDim : {true, false}) {
       for (const auto& correction : corrections) {
@@ -1763,7 +1730,7 @@ TEST_F(LazyOpsTest, TestVarWithCorrection) {
 TEST_F(LazyOpsTest, TestVarMeanWithCorrection) {
   torch::Tensor a = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
-  c10::optional<int64_t> corrections[] = {1, 2, c10::nullopt};
+  c10::optional<c10::Scalar> corrections[] = {1, 2, c10::nullopt};
   for (const auto& dim : std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) {
     for (const auto& correction : corrections) {
       for (auto keepdim : {true, false}) {
diff --git a/test/cpp/lite_interpreter_runtime/test_lite_interpreter_runtime.cpp b/test/cpp/lite_interpreter_runtime/test_lite_interpreter_runtime.cpp
index 1648b1e3d819..e176e6b2395b 100644
--- a/test/cpp/lite_interpreter_runtime/test_lite_interpreter_runtime.cpp
+++ b/test/cpp/lite_interpreter_runtime/test_lite_interpreter_runtime.cpp
@@ -21,21 +21,21 @@ TEST(RunTimeTest, LoadAndForward) {
   //  sequence.ptl source code:
   //  class A(torch.nn.Module):
   //    def __init__(self):
-  //      super(A, self).__init__()
+  //      super().__init__()
   //
   //    def forward(self, x):
   //      return x + 1
   //
   //  class B(torch.nn.Module):
   //    def __init__(self):
-  //      super(B, self).__init__()
+  //      super().__init__()
   //
   //    def forward(self, x):
   //      return x + 2
   //
   //  class C(torch.nn.Module):
   //    def __init__(self):
-  //      super(C, self).__init__()
+  //      super().__init__()
   //      self.A0 = A()
   //      self.B0 = B()
   //
diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp
index 520ae6301ceb..d469a7dfa21b 100644
--- a/test/cpp/tensorexpr/test_llvm.cpp
+++ b/test/cpp/tensorexpr/test_llvm.cpp
@@ -182,7 +182,6 @@ TEST(LLVM, BitCast) {
   constexpr int16_t ref16 = 1337;
   constexpr int32_t ref32 = 1337;
   constexpr int64_t ref64 = 1337;
-  at::Half reff16 = 1337.0f;
   constexpr float reff32 = 1337.0f;
   constexpr double reff64 = 1337.0f;
 
diff --git a/test/cpp_api_parity/module_impl_check.py b/test/cpp_api_parity/module_impl_check.py
index 6e4480901dde..bbfad91d109e 100644
--- a/test/cpp_api_parity/module_impl_check.py
+++ b/test/cpp_api_parity/module_impl_check.py
@@ -65,7 +65,11 @@
   write_ivalue_to_file(torch::IValue(cpp_output), forward_output_file_path);
 
   // Backward pass
-  cpp_output.sum().backward();
+  if (cpp_output.is_complex()) {
+    cpp_output.sum().abs().backward();
+  } else {
+    cpp_output.sum().backward();
+  }
 
   // Put all gradients into a c10::Dict, save it into a file to be compared in Python later
   c10::Dict<std::string, torch::Tensor> grad_dict;
@@ -109,7 +113,10 @@ def run_python_forward_backward(unit_test_class, test_params):
     script_module = torch.jit.trace(module, torch.tensor(0))
 
     # Backward pass
-    python_output.sum().backward()
+    if python_output.dtype.is_complex:
+        python_output.sum().abs().backward()
+    else:
+        python_output.sum().backward()
 
     # Put all gradients into a dict, to be compared later
     python_grad_dict = {}
diff --git a/test/cpp_api_parity/sample_module.py b/test/cpp_api_parity/sample_module.py
index 082df0a3bad5..e126bbd2b8bf 100644
--- a/test/cpp_api_parity/sample_module.py
+++ b/test/cpp_api_parity/sample_module.py
@@ -13,7 +13,7 @@
 
 class SampleModule(torch.nn.Module):
     def __init__(self, has_parity, has_submodule):
-        super(SampleModule, self).__init__()
+        super().__init__()
         self.has_parity = has_parity
         if has_submodule:
             self.submodule = SampleModule(self.has_parity, False)
diff --git a/test/create_dummy_torchscript_model.py b/test/create_dummy_torchscript_model.py
index ffd869e27f0b..ba9f6617177c 100644
--- a/test/create_dummy_torchscript_model.py
+++ b/test/create_dummy_torchscript_model.py
@@ -7,7 +7,7 @@
 class NeuralNetwork(nn.Module):
 
     def __init__(self):
-        super(NeuralNetwork, self).__init__()
+        super().__init__()
         self.flatten = nn.Flatten()
         self.linear_relu_stack = nn.Sequential(
             nn.Linear(28 * 28, 512),
diff --git a/test/custom_backend/backend.py b/test/custom_backend/backend.py
index 8b48ed0a4108..7c8114247655 100644
--- a/test/custom_backend/backend.py
+++ b/test/custom_backend/backend.py
@@ -43,9 +43,6 @@ class Model(torch.nn.Module):
     and executing in C++.
     """
 
-    def __init__(self):
-        super(Model, self).__init__()
-
     def forward(self, a, b):
         return (a + b, a - b)
 
diff --git a/test/custom_operator/model.py b/test/custom_operator/model.py
index 5131b4ad6db6..ff9e310b556d 100644
--- a/test/custom_operator/model.py
+++ b/test/custom_operator/model.py
@@ -19,7 +19,7 @@ def get_custom_op_library_path():
 
 class Model(torch.jit.ScriptModule):
     def __init__(self):
-        super(Model, self).__init__()
+        super().__init__()
         self.p = torch.nn.Parameter(torch.eye(5))
 
     @torch.jit.script_method
diff --git a/test/distributed/_composable/fully_shard/test_fully_shard_init.py b/test/distributed/_composable/fully_shard/test_fully_shard_init.py
index 0dd33efd21f4..2192e00e11c1 100644
--- a/test/distributed/_composable/fully_shard/test_fully_shard_init.py
+++ b/test/distributed/_composable/fully_shard/test_fully_shard_init.py
@@ -10,7 +10,7 @@
 import torch.nn as nn
 from torch.distributed._composable import fully_shard
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp._common_utils import _is_fsdp_flattened
+from torch.distributed.fsdp._common_utils import _is_fsdp_flattened, clean_tensor_name
 from torch.distributed.fsdp.wrap import _FSDPPolicy, ModuleWrapPolicy
 from torch.testing._internal.common_dist_composable import (
     CompositeParamModel,
@@ -236,7 +236,9 @@ def _param_init_fn(module: nn.Module):
             composable_module.named_parameters(),
             fsdp_wrapped_model.named_parameters(),
         ):
-            self.assertEqual(composable_param_name, fsdp_wrapped_param_name)
+            self.assertEqual(
+                composable_param_name, clean_tensor_name(fsdp_wrapped_param_name)
+            )
             self.assertEqual(
                 composable_param.device,
                 torch.device("cuda", torch.cuda.current_device()),
diff --git a/test/distributed/_composable/fully_shard/test_fully_shard_optim_checkpoint.py b/test/distributed/_composable/fully_shard/test_fully_shard_optim_checkpoint.py
index 1d7215c7e94e..7ecdcea0d088 100644
--- a/test/distributed/_composable/fully_shard/test_fully_shard_optim_checkpoint.py
+++ b/test/distributed/_composable/fully_shard/test_fully_shard_optim_checkpoint.py
@@ -46,8 +46,8 @@ def _test_optim_state_save_load(self, model1, optim1, model2, optim2) -> None:
             model(batch).sum().backward()
             optim.step()
 
-        optim_state_dict1 = FSDP._optim_state_dict(model1, optim1)
-        optim_state_dict2 = FSDP._optim_state_dict(model2, optim2)
+        optim_state_dict1 = FSDP.optim_state_dict(model1, optim1)
+        optim_state_dict2 = FSDP.optim_state_dict(model2, optim2)
 
         self.assertEqual(
             len(optim_state_dict1["state"]), len(optim_state_dict2["state"])
diff --git a/test/distributed/_composable/test_replicate.py b/test/distributed/_composable/test_replicate.py
index 10a64cf33723..e5c9f0ff593e 100644
--- a/test/distributed/_composable/test_replicate.py
+++ b/test/distributed/_composable/test_replicate.py
@@ -14,7 +14,7 @@
 
 class Net(nn.Module):
     def __init__(self):
-        super(Net, self).__init__()
+        super().__init__()
         self.fc1 = nn.Linear(2, 10, bias=False)
         self.fc2 = nn.Linear(10, 50, bias=False)
         self.fc3 = nn.Linear(50, 4, bias=False)
diff --git a/test/distributed/_shard/sharded_optim/test_sharded_optim.py b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
index a884d64d399f..24d99e29a5cc 100644
--- a/test/distributed/_shard/sharded_optim/test_sharded_optim.py
+++ b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
@@ -29,7 +29,7 @@
 
 class MyShardedModel(torch.nn.Module):
     def __init__(self, spec=None, group=None):
-        super(MyShardedModel, self).__init__()
+        super().__init__()
         # Use same seed.
         torch.manual_seed(0)
         self.param = torch.nn.Parameter(torch.rand(5, 10))
@@ -47,7 +47,7 @@ def forward(self, input):
 
 class MyShardedLinear(torch.nn.Module):
     def __init__(self, rank=None):
-        super(MyShardedLinear, self).__init__()
+        super().__init__()
         # Use same seed.
         torch.manual_seed(0)
         self.linear1 = torch.nn.Linear(17, 12)
diff --git a/test/distributed/_shard/test_replicated_tensor.py b/test/distributed/_shard/test_replicated_tensor.py
deleted file mode 100644
index 9dfdd8703588..000000000000
--- a/test/distributed/_shard/test_replicated_tensor.py
+++ /dev/null
@@ -1,336 +0,0 @@
-# Owner(s): ["oncall: distributed"]
-import io
-
-import torch
-import torch.distributed._shard.sharded_tensor as sharded_tensor
-
-import torch.distributed as dist
-from torch.nn.parallel import DistributedDataParallel as DDP
-
-from torch.distributed._shard import _shard_tensor
-from torch.distributed._shard.replicated_tensor import ReplicatedTensor
-from torch.distributed._shard.sharding_spec import ChunkShardingSpec
-from torch.testing._internal.common_distributed import (
-    requires_nccl,
-    skip_if_lt_x_gpu,
-)
-
-from torch.testing._internal.distributed._shard.sharded_tensor import (
-    ShardedTensorTestBase,
-    with_comms,
-)
-from torch.testing._internal.distributed._shard.sharded_tensor._test_ops_common import (
-    gen_binary_op_func
-)
-from torch.testing._internal.distributed._shard.sharded_tensor import TEST_GPU_NUM
-
-
-class TestReplicatedTensor(ShardedTensorTestBase):
-
-    @with_comms(init_rpc=False)
-    @skip_if_lt_x_gpu(TEST_GPU_NUM)
-    @requires_nccl()
-    def test_replicated_tensor_basics(self):
-        local_tensor = torch.ones(3, 3, device=f"cuda:{self.rank}") * 4
-        replica_tensor = ReplicatedTensor(local_tensor)
-        # validate it's a replicated tensor by checking values on all rank
-        validated = replica_tensor.validate()
-        self.assertEqual(validated, True)
-        res = replica_tensor + 2
-        self.assertIsInstance(res, torch.Tensor)
-        self.assertNotIsInstance(res, ReplicatedTensor)
-        self.assertEqual(res, torch.ones(3, 3) * 6)
-
-        # modify local tensor on certain rank, and test if validation raise
-        if self.rank == 2:
-            local_tensor += 3
-
-        with self.assertRaisesRegex(ValueError, 'have different values'):
-            replica_tensor.validate()
-
-    @with_comms(init_rpc=False)
-    @skip_if_lt_x_gpu(TEST_GPU_NUM)
-    @requires_nccl()
-    def test_replicated_tensor_inter_op_replicated_tensor(self):
-        local_tensor = torch.ones(3, 3, device=f"cuda:{self.rank}")
-        replica_tensor1 = ReplicatedTensor(local_tensor * 4)
-        replica_tensor2 = ReplicatedTensor(local_tensor * 6)
-
-        new_tensor = replica_tensor1 * replica_tensor2
-        self.assertIsInstance(new_tensor, ReplicatedTensor)
-        self.assertEqual(new_tensor, torch.ones(3, 3) * 24)
-
-        # test replicated tensor inter-op with different pgs
-        new_pg = dist.new_group(ranks=[1, 2, 3])
-        replica_tensor_new_group = ReplicatedTensor(local_tensor * 3, process_group=new_pg)
-
-        with self.assertRaisesRegex(RuntimeError, 'must be in the same'):
-            replica_tensor_new_group * replica_tensor1
-
-
-    @with_comms(init_rpc=False)
-    @skip_if_lt_x_gpu(TEST_GPU_NUM)
-    @requires_nccl()
-    def test_replicated_tensor_inter_op_tensor(self):
-        local_tensor = torch.ones(3, 3, device=f"cuda:{self.rank}") * 4
-        replica_tensor = ReplicatedTensor(local_tensor)
-
-        local_rand_tensor = torch.randn(3, 3, device=f"cuda:{self.rank}")
-
-        new_tensor = replica_tensor + local_rand_tensor
-        self.assertIsInstance(new_tensor, torch.Tensor)
-        self.assertNotIsInstance(new_tensor, ReplicatedTensor)
-
-        self.assertEqual(new_tensor, local_tensor + local_rand_tensor)
-
-    @with_comms(init_rpc=False)
-    @skip_if_lt_x_gpu(TEST_GPU_NUM)
-    @requires_nccl()
-    def test_replicated_tensor_inter_op_sharded_tensor(self):
-        torch.manual_seed(self.rank)
-
-        local_tensor1 = torch.rand(12, 3, device=f"cuda:{self.rank}") * 4
-        local_tensor2 = torch.ones(12, 3, device=f"cuda:{self.rank}") * 4
-
-        spec = ChunkShardingSpec(
-            dim=0,
-            placements=[
-                "rank:0/cuda:0",
-                "rank:1/cuda:1",
-                "rank:2/cuda:2",
-                "rank:3/cuda:3",
-            ],
-        )
-
-        st = _shard_tensor(local_tensor1, spec, src_rank=0)
-        replica_tensor = ReplicatedTensor(local_tensor2)
-
-        ops = ["torch.add", "torch.sub", "torch.mul", "torch.div", "+", "-", "*", "/"]
-
-        for op in ops:
-            binary_op = gen_binary_op_func(op)
-            res = binary_op(st, replica_tensor)
-            self.assertIsInstance(res, sharded_tensor.ShardedTensor)
-            self.assertNotIsInstance(res, ReplicatedTensor)
-            output = torch.empty((12, 3), device=self.rank) if self.rank == 0 else None
-            res.gather(dst=0, out=output)
-
-            if self.rank == 0:
-                local_output = binary_op(local_tensor1, local_tensor2)
-                self.assertEqual(output, local_output)
-
-            # reflective
-            reflect_res = binary_op(replica_tensor, st)
-            self.assertIsInstance(reflect_res, sharded_tensor.ShardedTensor)
-            self.assertNotIsInstance(reflect_res, ReplicatedTensor)
-            reflect_output = torch.empty((12, 3), device=self.rank) if self.rank == 0 else None
-            reflect_res.gather(dst=0, out=reflect_output)
-
-            if self.rank == 0:
-                reflect_local_output = binary_op(local_tensor2, local_tensor1)
-                self.assertEqual(reflect_output, reflect_local_output)
-
-
-    @with_comms(init_rpc=False)
-    @skip_if_lt_x_gpu(TEST_GPU_NUM)
-    @requires_nccl()
-    def test_replicated_tensor_implicit_broadcasting(self):
-        #  use same seed
-        torch.manual_seed(self.rank)
-
-        # test implicit broadcasting
-        local_tensor1 = torch.rand(12, 3, device=f"cuda:{self.rank}") * 4
-        # we use size (3) to trigger the implicit broadcasting logic
-        # and it will fail if implicit broadcasting not happen.
-        local_tensor2 = torch.ones(3, device=f"cuda:{self.rank}")
-
-        spec = ChunkShardingSpec(
-            dim=0,
-            placements=[
-                "rank:0/cuda:0",
-                "rank:1/cuda:1",
-                "rank:2/cuda:2",
-                "rank:3/cuda:3",
-            ],
-        )
-
-        st = _shard_tensor(local_tensor1, spec, src_rank=0)
-        replica_tensor = ReplicatedTensor(local_tensor2)
-
-        ops = ["torch.add", "torch.sub", "torch.mul", "torch.div", "+", "-", "*", "/"]
-
-        for op in ops:
-            binary_op = gen_binary_op_func(op)
-            # replicated tensor should automatically broadcasted
-            res = binary_op(st, replica_tensor)
-
-            self.assertIsInstance(res, sharded_tensor.ShardedTensor)
-            output = torch.empty((12, 3), device=self.rank) if self.rank == 0 else None
-            res.gather(dst=0, out=output)
-
-            if self.rank == 0:
-                local_output = binary_op(local_tensor1, local_tensor2)
-                self.assertEqual(output, local_output)
-
-
-    @with_comms(init_rpc=False)
-    @skip_if_lt_x_gpu(TEST_GPU_NUM)
-    @requires_nccl()
-    def test_replicated_tensor_inter_op_sharded_tensor_errors(self):
-        local_tensor = torch.ones(3, 3, device=f"cuda:{self.rank}") * 4
-        replica_tensor = ReplicatedTensor(local_tensor)
-
-        torch.manual_seed(self.rank)
-        spec = ChunkShardingSpec(
-            dim=0,
-            placements=[
-                "rank:0/cuda:0",
-                "rank:1/cuda:1",
-                "rank:2/cuda:2",
-                "rank:3/cuda:3",
-            ],
-        )
-
-        st1 = sharded_tensor.rand(spec, (20, 3, 3))
-        st2 = sharded_tensor.rand(spec, (30, 3, 3))
-
-        with self.assertRaisesRegex(RuntimeError, 'Implicit broadcasting'):
-            st1 + st2
-
-        with self.assertRaisesRegex(RuntimeError, 'not supported for ShardedTensor'):
-            st1 % replica_tensor
-
-    @with_comms(init_rpc=False)
-    @skip_if_lt_x_gpu(TEST_GPU_NUM)
-    @requires_nccl()
-    def test_with_ddp(self):
-        # Test Replicated params for DDP
-        replica_tensor = ReplicatedTensor(torch.rand(4, 8, device=self.rank))
-        model = torch.nn.Linear(8, 2).cuda(self.rank)
-        optim = torch.optim.SGD(model.parameters(), lr=0.1)
-        ddp = DDP(model)
-
-        # Test module.parameters.
-        params = list(ddp.parameters())
-        self.assertEqual(2, len(params))
-        self.assertEqual(ddp.module.weight, params[0])
-        self.assertEqual(ddp.module.bias, params[1])
-
-        params = list(model.parameters())
-        self.assertEqual(2, len(params))
-        self.assertEqual(model.weight, params[0])
-        self.assertEqual(model.bias, params[1])
-
-        # Validate output
-        out = ddp(replica_tensor)
-        self.assertIsInstance(out, ReplicatedTensor)
-
-        # Test backward and optimizer.
-
-        # Validate backward.
-        out.sum().backward()
-        self.assertIsNotNone(model.weight.grad)
-        self.assertIsNotNone(model.bias.grad)
-        self.assertIsNotNone(ddp.module.weight.grad)
-        self.assertIsNotNone(ddp.module.bias.grad)
-
-        original_params = []
-        for param_group in optim.param_groups:
-            for original_param in param_group['params']:
-                self.assertIsNotNone(original_param.grad)
-                original_params.append(original_param)
-
-        self.assertEqual(model.weight.grad, original_params[0].grad)
-        self.assertEqual(model.bias.grad, original_params[1].grad)
-        self.assertEqual(model.weight.grad, ddp.module.weight.grad)
-        self.assertEqual(model.bias.grad, ddp.module.bias.grad)
-
-        # Validate optimizer.
-        optim.step()
-        self.assertEqual(model.weight, ddp.module.weight)
-        self.assertEqual(model.weight, original_params[0])
-
-        self.assertEqual(model.bias, ddp.module.bias)
-        self.assertEqual(model.bias, original_params[1])
-
-        # Validate zero_grad
-        optim.zero_grad()
-        self.assertEqual(model.weight.grad, torch.zeros_like(model.weight.grad))
-        self.assertEqual(model.weight.grad, ddp.module.weight.grad)
-        self.assertEqual(model.weight.grad, original_params[0].grad)
-
-        self.assertEqual(model.bias.grad, torch.zeros_like(model.bias.grad))
-        self.assertEqual(model.bias.grad, ddp.module.bias.grad)
-        self.assertEqual(model.bias.grad, original_params[1].grad)
-
-        # Validate zero_grad set_to_none
-        optim.zero_grad(set_to_none=True)
-        self.assertIsNone(model.weight.grad)
-        self.assertEqual(model.weight.grad, ddp.module.weight.grad)
-        self.assertEqual(model.weight.grad, original_params[0].grad)
-
-        self.assertIsNone(model.bias.grad)
-        self.assertEqual(model.bias.grad, ddp.module.bias.grad)
-        self.assertEqual(model.bias.grad, original_params[1].grad)
-
-        # Multiple forward passes.
-        for _ in range(5):
-            out = ddp(replica_tensor)
-            self.assertIsInstance(out, ReplicatedTensor)
-
-        # Test with context manager.
-        from torch.nn.parallel._replicated_tensor_ddp_utils import _ddp_replicated_tensor
-        with _ddp_replicated_tensor(False):
-            for _ in range(5):
-                with _ddp_replicated_tensor(True):
-                    ddp = DDP(model)
-                    out = ddp(replica_tensor)
-                self.assertIsInstance(out, ReplicatedTensor)
-
-        # Test save and load.
-        with _ddp_replicated_tensor(False):
-            ddp = DDP(model)
-            expected_state_dict = ddp.state_dict()
-            buffer = io.BytesIO()
-            torch.save(ddp, buffer)
-
-            buffer.seek(0)
-            obj = torch.load(buffer)
-            self.assertEqual(expected_state_dict, obj.state_dict())
-
-        with _ddp_replicated_tensor(True):
-            ddp = DDP(model)
-            buffer = io.BytesIO()
-            torch.save(ddp, buffer)
-
-            buffer.seek(0)
-            obj = torch.load(buffer)
-            self.assertEqual(expected_state_dict, obj.state_dict())
-
-    @with_comms(init_rpc=False)
-    @skip_if_lt_x_gpu(TEST_GPU_NUM)
-    @requires_nccl()
-    def test_unsqueeze(self):
-        local_tensor = torch.rand(3, 3, device=self.rank)
-        replicated_tensor = ReplicatedTensor(local_tensor)
-
-        unsqueezed_replicated_tensor = replicated_tensor.unsqueeze(0)
-        unsqueezed_local_tensor = local_tensor.unsqueeze(0)
-
-        self.assertIsInstance(unsqueezed_replicated_tensor, ReplicatedTensor)
-        self.assertIsInstance(torch.unsqueeze(replicated_tensor, 0), ReplicatedTensor)
-        self.assertEqual(unsqueezed_local_tensor, unsqueezed_replicated_tensor)
-        self.assertEqual(torch.unsqueeze(replicated_tensor, 0), unsqueezed_replicated_tensor)
-
-    @with_comms(init_rpc=False)
-    @skip_if_lt_x_gpu(TEST_GPU_NUM)
-    @requires_nccl()
-    def test_getitem(self):
-        local_tensor = torch.rand(3, 3, device=self.rank)
-        replicated_tensor = ReplicatedTensor(local_tensor)
-
-        replicated_tensor_view = replicated_tensor[0]
-        local_tensor_view = local_tensor[0]
-
-        self.assertIsInstance(replicated_tensor_view, ReplicatedTensor)
-        self.assertEqual(local_tensor_view, replicated_tensor_view)
diff --git a/test/distributed/_spmd/test_tracing.py b/test/distributed/_spmd/test_tracing.py
new file mode 100644
index 000000000000..54222726f20f
--- /dev/null
+++ b/test/distributed/_spmd/test_tracing.py
@@ -0,0 +1,358 @@
+# Owner(s): ["oncall: distributed"]
+
+from copy import deepcopy
+from functools import wraps
+from typing import List
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.distributed._spmd.api import Schema, SPMD
+from torch.distributed._spmd.comm_tensor import CommTensor
+from torch.distributed._tensor import DeviceMesh, Replicate
+from torch.distributed.distributed_c10d import get_global_rank, get_world_size
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms as base_with_comms,
+)
+
+
+def with_comms(func):
+    @base_with_comms
+    @wraps(func)
+    def wrapper(self, *args, **kwargs):
+        # make sure we set different random seeds for each rank
+        # otherwise we dont need DDP / SPMD
+        # (we would have the same parameters and inputs everywhere)
+        torch.manual_seed(torch.distributed.get_rank())
+        return func(self, *args, **kwargs)
+
+    return wrapper
+
+
+class TraceDeviceMeshTestBase:
+    def _test_tracing_all_reduce_nd(self, mesh_tensor):
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+        local_tensor = torch.ones(3, 3, device=self.device_type) * self.rank
+
+        # check all dim groups
+        dim_to_subgroups = mesh.get_dim_groups()
+        for dim, dim_group in enumerate(dim_to_subgroups):
+            dim_group_size = get_world_size(dim_group)
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
+
+            def fn(tensor: torch.Tensor):
+                tensor_to_reduce = CommTensor(tensor.clone())
+                mesh.all_reduce(tensor_to_reduce, mesh_dim=dim)
+                # multiply with 1 to trigger wait on read during tracing.
+                return tensor_to_reduce * 1
+
+            # use a local_tensor + 1 for tracing to make sure that we are not
+            # simply replaying recorded tensor value
+            traced_fn = make_fx(fn)(local_tensor + 1)
+
+            # execute traced DeviceMesh communication
+            reduced_tensor = traced_fn(local_tensor.clone())
+            res_num = sum(global_ranks)
+            self.assertEqual(reduced_tensor, torch.ones(3, 3) * res_num)
+
+    def _test_broadcast_nd(self, mesh_tensor):
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+
+        # check all dim groups
+        dim_to_subgroups = mesh.get_dim_groups()
+        for dim, dim_group in enumerate(dim_to_subgroups):
+            dim_group_size = get_world_size(dim_group)
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
+
+            def fn(tensor: torch.Tensor):
+                received_tensor = CommTensor(tensor.clone())
+                mesh.broadcast(received_tensor, mesh_dim=dim)
+                # multiply with 1 to trigger wait on read during tracing.
+                return received_tensor * 1
+
+            local_tensor = torch.ones(3, 3, device=self.device_type) * self.rank
+            # use a local_tensor + 1 for tracing to make sure that we are not
+            # simply replaying recorded tensor value
+            traced_fn = make_fx(fn)(local_tensor + 1)
+
+            # execute traced DeviceMesh communication
+            received_tensor = traced_fn(local_tensor)
+            res_num = global_ranks[0]
+            self.assertEqual(received_tensor, torch.ones(3, 3) * res_num)
+
+    def _test_scatter_nd(self, mesh_tensor):
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+
+        # check all dim groups
+        dim_to_subgroups = mesh.get_dim_groups()
+        for dim, dim_group in enumerate(dim_to_subgroups):
+            dim_group_size = get_world_size(dim_group)
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
+            scattered_tensors = [
+                torch.ones(3, 3, device=self.device_type) * global_rank
+                for global_rank in global_ranks
+            ]
+
+            def fn(to_receive: torch.Tensor, to_scatter: List[torch.Tensor]):
+                to_scatter = [CommTensor(t) for t in to_scatter]
+                to_receive = CommTensor(to_receive)
+                mesh.scatter(to_receive, to_scatter, mesh_dim=dim)
+                # multiply with 1 to trigger wait on read during tracing.
+                return to_receive * 1
+
+            # use a local_tensor + 1 for tracing to make sure that we are not
+            # simply replaying recorded tensor value
+            to_receive = torch.empty_like(
+                scattered_tensors[mesh.get_coordinate()[dim]]
+            )
+            traced_fn = make_fx(fn)(to_receive, [t + 1 for t in scattered_tensors])
+
+            received_tensor = traced_fn(to_receive, scattered_tensors)
+            self.assertEqual(received_tensor, torch.ones(3, 3) * self.rank)
+
+    def _test_all_gather_nd(self, mesh_tensor):
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+        # each rank have its own tensor, all_gather gives a big tensor
+        local_tensor = torch.ones(3, 3, device=self.device_type) * self.rank
+
+        dim_to_subgroups = mesh.get_dim_groups()
+        for dim, dim_group in enumerate(dim_to_subgroups):
+            dim_group_size = get_world_size(dim_group)
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
+
+            gathered_list = [
+                torch.empty_like(local_tensor) for _ in range(dim_group_size)
+            ]
+
+            def fn(gathered_list: List[torch.Tensor], tensor: torch.Tensor):
+                gathered_list = [CommTensor(t) for t in gathered_list]
+                tensor = CommTensor(tensor)
+                mesh.all_gather(gathered_list, tensor, mesh_dim=dim)
+                return [t * 1 for t in gathered_list]
+
+            # use a local_tensor + 1 for tracing to make sure that we are not
+            # simply replaying recorded tensor value
+            traced_fn = make_fx(fn)(gathered_list, local_tensor + 1)
+            gathered_list = traced_fn(gathered_list, local_tensor)
+
+            self.assertEqual(len(gathered_list), dim_group_size)
+            for idx, gathered_tensor in enumerate(gathered_list):
+                self.assertEqual(gathered_tensor, torch.ones(3, 3) * global_ranks[idx])
+
+
+class TraceDeviceMesh3DTest(DTensorTestBase, TraceDeviceMeshTestBase):
+    @property
+    def world_size(self):
+        return 8
+
+    @with_comms
+    def test_tracing_all_reduce_nd(self):
+        self._test_tracing_all_reduce_nd(torch.arange(8).reshape(2, 2, 2))
+
+    @with_comms
+    def test_broadcast_nd(self):
+        self._test_broadcast_nd(torch.arange(8).reshape(2, 2, 2))
+
+    @with_comms
+    def test_scatter_nd(self):
+        self._test_scatter_nd(torch.arange(8).reshape(2, 2, 2))
+
+    @with_comms
+    def test_all_gather_nd(self):
+        self._test_all_gather_nd(torch.arange(8).reshape(2, 2, 2))
+
+
+class TraceDeviceMesh2DTest(DTensorTestBase, TraceDeviceMeshTestBase):
+    @property
+    def world_size(self):
+        return 4
+
+    @with_comms
+    def test_tracing_all_reduce_nd(self):
+        self._test_tracing_all_reduce_nd(torch.arange(4).reshape(2, 2))
+
+    @with_comms
+    def test_broadcast_nd(self):
+        self._test_broadcast_nd(torch.arange(4).reshape(2, 2))
+
+    @with_comms
+    def test_scatter_nd(self):
+        self._test_scatter_nd(torch.arange(4).reshape(2, 2))
+
+    @with_comms
+    def test_all_gather_nd(self):
+        self._test_all_gather_nd(torch.arange(4).reshape(2, 2))
+
+
+class TraceModuleTest(DTensorTestBase):
+    @property
+    def world_size(self):
+        return 2
+
+    def _test_trace_replicate(self, model: nn.Module, x, *args, **kwargs):
+        # if x.device.type == "cuda":
+        ddp = DDP(deepcopy(model))
+        spmd = SPMD(
+            deepcopy(model),
+            schema=Schema(
+                mesh=DeviceMesh(self.device_type, torch.arange(self.world_size)),
+                placements=[Replicate()],
+            ),
+            input_schemas=kwargs["inp_schemas"] if "inp_schemas" in kwargs else None,
+        )
+        if "inp_schemas" in kwargs:
+            del kwargs["inp_schemas"]
+        only_fw = False
+        if "only_fw" in kwargs:
+            only_fw = kwargs["only_fw"]
+            del kwargs["only_fw"]
+        if only_fw:
+            output_ddp = ddp(x, *args, **kwargs)
+            output_spmd = spmd(x, *args, **kwargs)
+            self.assertTrue(output_ddp.size(), output_spmd.size())
+            return
+        ddp(x, *args, **kwargs).sum().backward()
+        spmd(x, *args, **kwargs).sum().backward()
+        for p1, p2 in zip(ddp.parameters(), spmd.parameters()):
+            # DDP divides gradients by world size to compute average, but
+            # _Partial tensor shouldn't do that automatically. Hence explicitly
+            # do division here.
+            self.assertTrue(
+                p1.grad.allclose(p2.grad / self.world_size) or p1.grad.allclose(p2.grad)
+            )
+
+    @with_comms
+    def test_torch_cat(self):
+        x = torch.rand((2, 4)).to(self.device_type)
+
+        class Model(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.w = torch.nn.Parameter(torch.rand((2, 4)))
+
+            def forward(self, x):
+                # TODO(anj): Using self.w and ignoring x results in an allgather call
+                # that we have not yet supported.
+                return torch.cat((self.w, self.w), 0)
+
+        model = Model().to(self.device_type)
+        inp_kwargs = {}
+        inp_kwargs["inp_schemas"] = [
+            Schema(
+                mesh=DeviceMesh(self.device_type, torch.arange(self.world_size)),
+                placements=[Replicate()],
+            )
+        ]
+        self._test_trace_replicate(
+            Model().to(self.device_type),
+            torch.rand((2, 4)).to(self.device_type),
+            **inp_kwargs,
+        )
+
+    @with_comms
+    def test_layer_norm_fw(self):
+        # This test is for get_item support. layer_norm contains
+        # tuples in its output which means we need to support get_item.
+        input_dims = []
+
+        input = np.random.randn(4, 5).astype(np.float32)
+        model = nn.LayerNorm(input.shape[1:]).to(self.device_type)
+        pt_input = torch.tensor(input, dtype=torch.float).to(self.device_type)
+        self._test_trace_replicate(model, pt_input)
+
+    @with_comms
+    def test_baked_in_shape(self):
+        class LCE(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                torch.manual_seed(5)
+                self.w = torch.nn.Parameter(torch.rand((5, 10)))
+                self.b = torch.nn.Parameter(torch.rand((5)))
+
+            def forward(self, x, *args, **kwargs):
+                # the code below will bake in the shape of x_t as arguments to expand
+                x_t = x.permute(0, 2, 1)
+                y_t = kwargs["dict_test"]["value"].expand(x_t.shape) + args[0][
+                    0
+                ].expand(x_t.shape)
+                # code below triggers an "expand" with shape baked in.
+                return torch.nn.functional.linear(y_t, self.w, self.b)
+
+        model = LCE().to(self.device_type)
+        x = torch.randn(2, 10, 80).to(self.device_type)
+        y = torch.randn(2, 80, 10).to(self.device_type)
+        z = torch.randn(2, 80, 10).to(self.device_type)
+        self._test_trace_replicate(model, x, [y], dict_test={"value": z})
+
+    @with_comms
+    def test_sequential(self):
+        model = nn.Sequential(*[nn.Linear(10, 10) for _ in range(2)]).to(
+            self.device_type
+        )
+        x = torch.randn(2, 10).to(self.device_type)
+        self._test_trace_replicate(model, x)
+
+    @with_comms
+    def test_parallel(self):
+        class Model(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.module_list = nn.ModuleList([nn.Linear(10, 10) for _ in range(2)])
+
+            def forward(self, x):
+                return sum([m(x) for m in self.module_list])
+
+        model = Model().to(self.device_type)
+        x = torch.randn(2, 10).to(self.device_type)
+        self._test_trace_replicate(model, x)
+
+    @with_comms
+    def test_hybrid(self):
+        bottom_model = nn.Sequential(
+            nn.Linear(4, 8),
+            nn.Softmax(),
+        ).to(self.device_type)
+
+        top_model = nn.Sequential(
+            nn.Linear(8, 2),
+            nn.Softmax(),
+        ).to(self.device_type)
+
+        hybrid = nn.Sequential(
+            DDP(deepcopy(bottom_model)),
+            SPMD(
+                deepcopy(top_model),
+                schema=Schema(
+                    mesh=DeviceMesh(self.device_type, torch.arange(self.world_size)),
+                    placements=[Replicate()],
+                ),
+            ),
+        )
+        ddp = DDP(nn.Sequential(deepcopy(bottom_model), deepcopy(top_model)))
+        input = torch.randn(12, 4).to(self.device_type)
+
+        ddp(input).sum().backward()
+        hybrid(input).sum().backward()
+        for p1, p2 in zip(ddp.parameters(), hybrid.parameters()):
+            # DDP divides gradients by world size to compute average, but
+            # _Partial tensor shouldn't do that automatically. Hence explicitly
+            # do division here.
+            self.assertTrue(
+                p1.grad.allclose(p2.grad / self.world_size) or p1.grad.allclose(p2.grad)
+            )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/_tensor/test_common_rules.py b/test/distributed/_tensor/test_common_rules.py
index 774854aa01c0..af6e06446f9b 100644
--- a/test/distributed/_tensor/test_common_rules.py
+++ b/test/distributed/_tensor/test_common_rules.py
@@ -2,9 +2,10 @@
 # Owner(s): ["oncall: distributed"]
 
 import torch
+from torch.fx.passes.shape_prop import _extract_tensor_metadata
 from torch._C import parse_schema
 from torch.distributed._tensor import DeviceMesh
-from torch.distributed._tensor.dispatch import OpSchema
+from torch.distributed._tensor.op_schema import OpSchema
 
 from torch.distributed._tensor.ops.common_rules import (
     einop_rule,
@@ -26,6 +27,10 @@ def world_size(self) -> int:
         # at least with 2d mesh
         return 4
 
+    def _gen_tensor_meta(self, shape):
+        empty_tensor = torch.empty(shape)
+        return _extract_tensor_metadata(empty_tensor)
+
     @with_comms
     def test_einop_basic_propagation(self):
         # plain einsum, mm
@@ -34,39 +39,39 @@ def test_einop_basic_propagation(self):
         func_schema = parse_schema("aten::mm(Tensor self, Tensor mat2) -> Tensor")
         # propagate col-wise sharding
         mat1, mat2 = [-1, -1], [-1, 0]
-        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
-        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([4, 8]))
+
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([8, 4]))
+        mat2_tensor_meta = self._gen_tensor_meta(torch.Size([4, 8]))
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], tensor_meta=mat1_tensor_meta)
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], tensor_meta=mat2_tensor_meta)
         output_sharding = einop_rule(
             "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
         output_spec = output_sharding.output_spec
         self.assertIsNotNone(output_spec)
         self.assertEqual(output_spec.dim_map, [-1, 0])
-        self.assertEqual(output_spec.shape, torch.Size([8, 8]))
 
         # propagate row-wise sharding
         mat1, mat2 = [0, -1], [-1, -1]
-        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
-        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([4, 8]))
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], tensor_meta=mat1_tensor_meta)
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], tensor_meta=mat2_tensor_meta)
         output_sharding = einop_rule(
             "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
         output_spec = output_sharding.output_spec
         self.assertIsNotNone(output_spec)
         self.assertEqual(output_spec.dim_map, [0, -1])
-        self.assertEqual(output_spec.shape, torch.Size([8, 8]))
 
         # generate partial
         mat1, mat2 = [-1, 0], [0, -1]
-        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
-        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([4, 8]))
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], tensor_meta=mat1_tensor_meta)
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], tensor_meta=mat2_tensor_meta)
         output_sharding = einop_rule(
             "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
         output_spec = output_sharding.output_spec
         self.assertIsNotNone(output_spec)
         self.assertTrue(output_spec.placements[0].is_partial())
-        self.assertEqual(output_spec.shape, torch.Size([8, 8]))
 
     @with_comms
     def test_einop_pointwise_propagation(self):
@@ -76,36 +81,40 @@ def test_einop_pointwise_propagation(self):
             "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
         )
         # addition
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([8, 8]))
         mat1 = [0, -1]
-        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 8]))
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], tensor_meta=mat1_tensor_meta)
         output_sharding = einop_rule(
             "ij,ij->ij", OpSchema(func_schema, (mat1_spec, mat1_spec), {})
         )
         output_spec = output_sharding.output_spec
         self.assertIsNotNone(output_spec)
         self.assertEqual(output_spec.dim_map, [0, -1])
-        self.assertEqual(output_spec.shape, torch.Size([8, 8]))
 
         # broadcast addition
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([8, 8]))
         mat1 = [-1, 0, -1]
         mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [], shape=torch.Size([8, 4, 2])
+            mesh, mat1, [], tensor_meta=mat1_tensor_meta
         )
-        mat2_spec = DTensorSpec.from_dim_map(mesh, [-1], [], shape=torch.Size([2]))
+
+        mat2_tensor_meta = self._gen_tensor_meta(torch.Size([2]))
+        mat2_spec = DTensorSpec.from_dim_map(mesh, [-1], [], tensor_meta=mat2_tensor_meta)
         output_sharding = einop_rule(
             "ijk,k->ijk", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
         output_spec = output_sharding.output_spec
         self.assertIsNotNone(output_spec)
         self.assertEqual(output_spec.dim_map, [-1, 0, -1])
-        self.assertEqual(output_spec.shape, torch.Size([8, 4, 2]))
 
         # broadcast to a common shape
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([8, 8, 8]))
+        mat2_tensor_meta = self._gen_tensor_meta(torch.Size([1, 8]))
         mat1_spec = DTensorSpec.from_dim_map(
-            mesh, [0, -1, -1], [], shape=torch.Size([8, 8, 8])
+            mesh, [0, -1, -1], [], tensor_meta=mat1_tensor_meta
         )
         mat2_spec = DTensorSpec.from_dim_map(
-            mesh, [-1, -1], [], shape=torch.Size([1, 8])
+            mesh, [-1, -1], [], tensor_meta=mat2_tensor_meta
         )
         output_sharding = einop_rule(
             "ijk,1k->ijk", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
@@ -113,7 +122,6 @@ def test_einop_pointwise_propagation(self):
         output_spec = output_sharding.output_spec
         self.assertIsNotNone(output_spec)
         self.assertEqual(output_spec.dim_map, [0, -1, -1])
-        self.assertEqual(output_spec.shape, torch.Size([8, 8, 8]))
 
     @with_comms
     def test_einop_merge_sharding(self):
@@ -126,15 +134,16 @@ def test_einop_merge_sharding(self):
         func_schema = parse_schema("aten::mm(Tensor self, Tensor mat2) -> Tensor")
 
         mat1, mat2 = [0, -1], [-1, 1]
-        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
-        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([4, 8]))
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([8, 4]))
+        mat2_tensor_meta = self._gen_tensor_meta(torch.Size([4, 8]))
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], tensor_meta=mat1_tensor_meta)
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], tensor_meta=mat2_tensor_meta)
         output_sharding = einop_rule(
             "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
         output_spec = output_sharding.output_spec
         self.assertIsNotNone(output_spec)
         self.assertEqual(output_spec.dim_map, [0, 1])
-        self.assertEqual(output_spec.shape, torch.Size([8, 8]))
 
     @with_comms
     def test_einop_linearity(self):
@@ -146,8 +155,10 @@ def test_einop_linearity(self):
         mm_func_schema = parse_schema("aten::mm(Tensor self, Tensor mat2) -> Tensor")
 
         mat1, mat2 = [0, -1], [-1, -1]
-        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [1], shape=torch.Size([8, 4]))
-        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([4, 8]))
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([8, 4]))
+        mat2_tensor_meta = self._gen_tensor_meta(torch.Size([4, 8]))
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [1], tensor_meta=mat1_tensor_meta)
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], tensor_meta=mat2_tensor_meta)
         # if not turn on linearity, partial sum is not eligible to propagate, we return
         # suggestion to reshard inputs with no partial sum (i.e. all_reduce one input)
         output_sharding = einop_rule(
@@ -179,8 +190,10 @@ def test_einop_linearity(self):
             "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
         )
         mat1, mat2 = [0, -1], [0, -1]
-        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [1], shape=torch.Size([8, 6]))
-        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([8, 6]))
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([8, 6]))
+        mat2_tensor_meta = self._gen_tensor_meta(torch.Size([8, 6]))
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [1], tensor_meta=mat1_tensor_meta)
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], tensor_meta=mat2_tensor_meta)
 
         output_sharding = einop_rule(
             "ij,ij->ij",
@@ -202,8 +215,10 @@ def test_einop_multi_sharding_on_mesh_dim(self):
 
         func_schema = parse_schema("aten::mm(Tensor self, Tensor mat2) -> Tensor")
         mat1, mat2 = [0, -1], [0, -1]
-        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 12]))
-        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([12, 4]))
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([8, 12]))
+        mat2_tensor_meta = self._gen_tensor_meta(torch.Size([12, 4]))
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], tensor_meta=mat1_tensor_meta)
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], tensor_meta=mat2_tensor_meta)
         output_sharding = einop_rule(
             "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
@@ -228,8 +243,10 @@ def test_einop_errors(self):
             "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
         )
         mat1, mat2 = [0, -1], [1, -1]
-        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
-        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([8, 4]))
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([8, 4]))
+        mat2_tensor_meta = self._gen_tensor_meta(torch.Size([8, 4]))
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], tensor_meta=mat1_tensor_meta)
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], tensor_meta=mat2_tensor_meta)
 
         with self.assertRaisesRegex(RuntimeError, "sharded two different ways:"):
             einop_rule("ij,ij->ij", OpSchema(func_schema, (mat1_spec, mat2_spec), {}))
@@ -242,10 +259,13 @@ def test_pointwise_rules_broadcasting(self):
             "where.self(Tensor condition, Tensor self, Tensor other) -> Tensor"
         )
         inp1, inp2, inp3 = [0], [], [-1, -1]
-        condition = DTensorSpec.from_dim_map(mesh, inp1, [], shape=torch.Size([8]))
-        self_tensor = DTensorSpec.from_dim_map(mesh, inp2, [], shape=torch.Size([]))
+        inp1_tensor_meta = self._gen_tensor_meta(torch.Size([8]))
+        inp2_tensor_meta = self._gen_tensor_meta(torch.Size([]))
+        inp3_tensor_meta = self._gen_tensor_meta(torch.Size([1, 1]))
+        condition = DTensorSpec.from_dim_map(mesh, inp1, [], tensor_meta=inp1_tensor_meta)
+        self_tensor = DTensorSpec.from_dim_map(mesh, inp2, [], tensor_meta=inp2_tensor_meta)
         other_tensor = DTensorSpec.from_dim_map(
-            mesh, inp3, [], shape=torch.Size([1, 1])
+            mesh, inp3, [], tensor_meta=inp3_tensor_meta
         )
         # propagate point-wise sharding with broadcasting
         output_sharding = pointwise_rule(
@@ -254,7 +274,6 @@ def test_pointwise_rules_broadcasting(self):
         output_spec = output_sharding.output_spec
         self.assertIsNotNone(output_spec)
         self.assertEqual(output_spec.dim_map, [-1, 0])
-        self.assertEqual(output_spec.shape, [1, 8])
 
     @with_comms
     def test_pointwise_rules_suggestion(self):
@@ -265,8 +284,10 @@ def test_pointwise_rules_suggestion(self):
         )
         # propagate point-wise sharding
         inp1, inp2 = [-1, -1], [-1, 0]
-        mat1_spec = DTensorSpec.from_dim_map(mesh, inp1, [], shape=torch.Size([8, 4]))
-        mat2_spec = DTensorSpec.from_dim_map(mesh, inp2, [], shape=torch.Size([8, 4]))
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([8, 4]))
+        mat2_tensor_meta = self._gen_tensor_meta(torch.Size([8, 4]))
+        mat1_spec = DTensorSpec.from_dim_map(mesh, inp1, [], tensor_meta=mat1_tensor_meta)
+        mat2_spec = DTensorSpec.from_dim_map(mesh, inp2, [], tensor_meta=mat2_tensor_meta)
         # adding a positional argument -1 to arg schema
         output_sharding = pointwise_rule(
             OpSchema(func_schema, (mat1_spec, mat2_spec, -1), {})
@@ -294,8 +315,10 @@ def test_pointwise_multi_sharding_on_mesh_dim(self):
 
         # basic case to test implicit broadcasting shape alignment
         mat1, mat2 = [-1, 0], [0]
-        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([20, 6]))
-        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([6]))
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([20, 6]))
+        mat2_tensor_meta = self._gen_tensor_meta(torch.Size([6]))
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], tensor_meta=mat1_tensor_meta)
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], tensor_meta=mat2_tensor_meta)
         output_sharding = pointwise_rule(
             OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
@@ -305,11 +328,13 @@ def test_pointwise_multi_sharding_on_mesh_dim(self):
 
         # more advanced case that needs reshard one input to align sharding
         mat1, mat2 = [0, -1, -1, 1], [0, -1, 1]
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([12, 1, 1, 8]))
+        mat2_tensor_meta = self._gen_tensor_meta(torch.Size([12, 4, 8]))
         mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [], shape=torch.Size([12, 1, 1, 8])
+            mesh, mat1, [], tensor_meta=mat1_tensor_meta
         )
         mat2_spec = DTensorSpec.from_dim_map(
-            mesh, mat2, [], shape=torch.Size([12, 4, 8])
+            mesh, mat2, [], tensor_meta=mat2_tensor_meta
         )
         output_sharding = pointwise_rule(
             OpSchema(func_schema, (mat1_spec, mat2_spec), {})
@@ -338,11 +363,13 @@ def test_pointwise_enforce_sharding_multi_sharding_on_mesh_dim(self):
 
         # more advanced case that needs reshard one input to align sharding
         mat1, mat2 = [0, -1, 1], [-1, -1, 0]
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([12, 4, 8]))
+        mat2_tensor_meta = self._gen_tensor_meta(torch.Size([12, 1, 8]))
         mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [], shape=torch.Size([12, 4, 8])
+            mesh, mat1, [], tensor_meta=mat1_tensor_meta
         )
         mat2_spec = DTensorSpec.from_dim_map(
-            mesh, mat2, [], shape=torch.Size([12, 1, 8])
+            mesh, mat2, [], tensor_meta=mat2_tensor_meta
         )
         output_sharding = pointwise_rule(
             OpSchema(func_schema, (mat1_spec, mat2_spec), {})
@@ -366,7 +393,8 @@ def test_reduction_rule(self):
         )
         # reduction on a 2d mat
         mat1 = [0, -1]
-        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
+        mat1_tensor_meta = self._gen_tensor_meta(torch.Size([8, 4]))
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], tensor_meta=mat1_tensor_meta)
         # reduction on dim 0
         output_sharding_0 = reduction_rule(
             OpSchema(func_schema, (mat1_spec, 0), {}),
@@ -377,7 +405,6 @@ def test_reduction_rule(self):
         self.assertEqual(output_sharding_0.output_spec.dim_map, [-1])
         # pending sum on dim 0
         self.assertEqual(output_sharding_0.output_spec.sums, [0])
-        self.assertEqual(output_sharding_0.output_spec.shape, torch.Size([4]))
 
         # reduction on dim 1
         output_sharding_1 = reduction_rule(
@@ -388,7 +415,6 @@ def test_reduction_rule(self):
         self.assertIsNotNone(output_sharding_1.output_spec)
         self.assertEqual(output_sharding_1.output_spec.dim_map, [0])
         self.assertEqual(output_sharding_1.output_spec.sums, [])
-        self.assertEqual(output_sharding_1.output_spec.shape, torch.Size([8]))
 
         # full reduction if not specify dim
         output_sharding_all_dim = reduction_rule(
@@ -400,7 +426,6 @@ def test_reduction_rule(self):
         self.assertEqual(output_sharding_all_dim.output_spec.dim_map, [])
         # pending sum on mesh
         self.assertEqual(output_sharding_all_dim.output_spec.sums, [0])
-        self.assertEqual(output_sharding_all_dim.output_spec.shape, torch.Size([]))
 
 
 if __name__ == "__main__":
diff --git a/test/distributed/_tensor/test_device_mesh.py b/test/distributed/_tensor/test_device_mesh.py
index c7983cde5993..9b515e128305 100644
--- a/test/distributed/_tensor/test_device_mesh.py
+++ b/test/distributed/_tensor/test_device_mesh.py
@@ -460,7 +460,7 @@ def test_reduce_scatter_nd(self):
                 contiguous=True,
             )
             scattered_tensor = torch.empty_like(
-                local_rs_list[mesh.get_coordinate_on_dim(dim)],
+                local_rs_list[mesh.get_coordinate()[dim]],
                 device=self.device_type,
             )
             global_ranks = [
@@ -523,7 +523,7 @@ def test_scatter_nd(self):
                 for global_rank in global_ranks
             ]
             received_tensor = torch.empty_like(
-                scattered_tensors[mesh.get_coordinate_on_dim(dim)]
+                scattered_tensors[mesh.get_coordinate()[dim]]
             )
             mesh.scatter(received_tensor, scattered_tensors, mesh_dim=dim)
             self.assertEqual(received_tensor, torch.ones(3, 3) * self.rank)
@@ -563,7 +563,7 @@ def test_all_to_all_nd(self):
         # check all dim groups
         dim_to_subgroups = mesh.get_dim_groups()
         for dim, dim_group in enumerate(dim_to_subgroups):
-            my_coordinate = mesh.get_coordinate_on_dim(dim)
+            my_coordinate = mesh.get_coordinate()[dim]
             dim_group_size = get_world_size(dim_group)
             global_ranks = [
                 get_global_rank(dim_group, i) for i in range(dim_group_size)
diff --git a/test/distributed/_tensor/test_dtensor.py b/test/distributed/_tensor/test_dtensor.py
index c79ae66c548b..a694db767ec2 100644
--- a/test/distributed/_tensor/test_dtensor.py
+++ b/test/distributed/_tensor/test_dtensor.py
@@ -2,6 +2,11 @@
 # Owner(s): ["oncall: distributed"]
 
 import torch
+import torch.nn.functional as F
+from torch.distributed.tensor.parallel import (
+    PairwiseParallel,
+    parallelize_module,
+)
 from torch.distributed._tensor import DeviceMesh, distribute_tensor, DTensor
 from torch.distributed._tensor.placement_types import _Partial, Replicate, Shard
 
@@ -11,22 +16,24 @@
     with_comms,
 )
 
+class DummyMLP(torch.nn.Module):
+    def __init__(self, device):
+        super().__init__()
+        self.net1 = torch.nn.Linear(5, 1024, device=device)
+        self.relu = torch.nn.ReLU()
+        self.net2 = torch.nn.Linear(1024, 4, device=device)
 
-class DTensorTest(DTensorTestBase):
-    # @with_comms
-    # def test_tensor_constructor(self):
-    #     import torch.distributed._tensor as dist_tensor
-    #     shard_spec = PlacementSpec(device_mesh, strategies=[Shard(0)])
-    #     empty_tensor = dist_tensor.empty((12, 10), placement_spec=shard_spec)
-    #     zero_tensor = dist_tensor.zeros((12, 10), placement_spec=shard_spec)
-    #     one_tensor = dist_tensor.ones((12, 10), placement_spec=shard_spec)
-
-    #     zero_cuda_tensor = dist_tensor.zeros((12, 10), device="cuda", placement_spec=shard_spec)
+    def forward(self, x):
+        return self.net2(F.relu(self.net1(x)))
 
-    #     dist_tensor.empty_like(empty_tensor)
-    #     dist_tensor.zero_like(empty_tensor)
-    #     dist_tensor.one_like(empty_tensor)
+    def reset_parameters(self, *args, **kwargs):
+        with torch.no_grad():
+            self.net1.weight.fill_(0.5)
+            self.net2.weight.fill_(1)
+            self.net1.bias.fill_(1.5)
+            self.net2.bias.fill_(1.2)
 
+class DTensorTest(DTensorTestBase):
     @with_comms
     def test_dtensor_constructor(self):
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
@@ -37,13 +44,23 @@ def test_dtensor_constructor(self):
             local_tensor,
             device_mesh,
             shard_spec,
-            size=dist_tensor_shape,
+            shape=dist_tensor_shape,
+            dtype=local_tensor.dtype,
             requires_grad=True,
+            stride=local_tensor.stride()
         )
         self.assertEqual(dist_tensor.size(), torch.Size((self.world_size * 3, 3)))
 
         with self.assertWarnsRegex(UserWarning, "To construct"):
-            DTensor(local_tensor, device_mesh, shard_spec, size=dist_tensor_shape)
+            DTensor(
+                local_tensor,
+                device_mesh,
+                shard_spec,
+                shape=dist_tensor_shape,
+                dtype=local_tensor.dtype,
+                requires_grad=False,
+                stride=local_tensor.stride()
+            )
 
         local_tensor = torch.randn(3, 3, requires_grad=False)
         with self.assertWarnsRegex(UserWarning, "To construct"):
@@ -51,24 +68,78 @@ def test_dtensor_constructor(self):
                 local_tensor,
                 device_mesh,
                 shard_spec,
-                size=dist_tensor_shape,
+                shape=dist_tensor_shape,
+                dtype=local_tensor.dtype,
                 requires_grad=True,
+                stride=local_tensor.stride()
             )
 
+    @with_comms
+    def test_meta_dtensor(self):
+        device_mesh = self.build_device_mesh()
+        dist_specs = [[Shard(0)], [Replicate()]]
+        meta_tensor = torch.randn(1024, 2048, device="meta")
+        for dist_spec in dist_specs:
+            # Test distribute_tensor on meta tensor
+            meta_dtensor = distribute_tensor(meta_tensor, device_mesh, dist_spec)
+            self.assertTrue(meta_dtensor.is_meta)
+            meta_dtensor = torch.empty_like(meta_dtensor, device=self.device_type)
+            torch.nn.init.constant_(meta_dtensor, 1.2)
+            value_tensor = torch.empty_like(meta_dtensor.to_local()).fill_(1.2)
+            self.assertFalse(meta_dtensor.is_meta)
+            self.assertEqual(meta_dtensor.device.type, self.device_type)
+            self.assertEqual(meta_dtensor.to_local(), value_tensor)
+            # Test from_local on meta tensor
+            meta_dtensor = DTensor.from_local(meta_tensor, device_mesh, dist_spec)
+            meta_dtensor = torch.empty_like(meta_dtensor, device=self.device_type)
+            torch.nn.init.constant_(meta_dtensor, 1.5)
+            self.assertEqual(meta_dtensor.device.type, self.device_type)
+            value_tensor = torch.empty_like(meta_dtensor.to_local()).fill_(1.5)
+            self.assertEqual(meta_dtensor.to_local(), value_tensor)
+
+    @with_comms
+    def test_modules_w_meta_dtensor(self):
+        model = DummyMLP("meta")
+        device_mesh = self.build_device_mesh()
+        model_tp = parallelize_module(model, device_mesh, PairwiseParallel())
+        model_tp.to_empty(device=self.device_type)
+        model_tp.reset_parameters()
+        optim = torch.optim.SGD(model_tp.parameters(), lr=0.1)
+        model_regular = DummyMLP(self.device_type)
+        model_regular_tp = parallelize_module(model_regular, device_mesh, PairwiseParallel())
+        optim_regular = torch.optim.SGD(model_regular_tp.parameters(), lr=0.1)
+        model_regular_tp.reset_parameters()
+        torch.manual_seed(0)
+        inp = torch.randn(20, 5, device=self.device_type)
+
+        output = model_tp(inp)
+        output_regular = model_regular_tp(inp)
+        self.assertEqual(output, output_regular)
+
+        output.sum().backward()
+        output_regular.sum().backward()
+
+        optim.step()
+        optim_regular.step()
+
+        torch.manual_seed(1)
+        inp = torch.randn(20, 5, device=self.device_type)
+        self.assertEqual(model_tp(inp), model_regular_tp(inp))
+
     @with_comms
     def test_dtensor_stride(self):
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
         shard0_spec = [Shard(0)]
         local_tensor = torch.randn(4, 8)
         global_shape = torch.Size([self.world_size * 4, 8])
-        dist_tensor = DTensor(local_tensor, device_mesh, shard0_spec, size=global_shape)
+        dist_tensor = DTensor.from_local(local_tensor, device_mesh, shard0_spec)
         # won't affect stride
         self.assertEqual(dist_tensor.stride(), (8, 1))
 
         shard1_spec = [Shard(1)]
         local_tensor = torch.randn(8, 4)
         global_shape = torch.Size([8, self.world_size * 4])
-        dist_tensor = DTensor(local_tensor, device_mesh, shard1_spec, size=global_shape)
+        dist_tensor = DTensor.from_local(local_tensor, device_mesh, shard1_spec)
         # will affect stride after DT initialized
         self.assertEqual(dist_tensor.stride(), (4 * self.world_size, 1))
 
@@ -77,8 +148,8 @@ def test_dtensor_stride(self):
         local_tensor_t = local_tensor.permute(1, 2, 0)
         global_shape = torch.Size([4, self.world_size * 8, 8])
         self.assertEqual(local_tensor_t.stride(), (8, 1, 32))
-        dist_tensor = DTensor(
-            local_tensor_t, device_mesh, shard1_spec, size=global_shape
+        dist_tensor = DTensor.from_local(
+            local_tensor_t, device_mesh, shard1_spec
         )
         global_stride = (8 * self.world_size, 1, 32 * self.world_size)
         self.assertEqual(dist_tensor.stride(), global_stride)
@@ -133,8 +204,10 @@ def test_to_local(self):
             local_tensor_with_grad,
             device_mesh,
             shard_spec,
-            size=dist_tensor_shape,
+            shape=dist_tensor_shape,
+            dtype=local_tensor_with_grad.dtype,
             requires_grad=True,
+            stride=local_tensor_with_grad.stride()
         )
         self.assertEqual(sharded_tensor.size(), dist_tensor_shape)
         self.assertEqual(sharded_tensor.to_local(), local_tensor_with_grad)
@@ -342,6 +415,30 @@ def test_dtensor_spec_local_shard_offset(self):
             dtensor = distribute_tensor(logical_tensor, device_mesh, shard_spec)
             self.assertEqual(expected_shard_offsets, dtensor._spec.local_offsets)
 
+    @with_comms
+    def test_from_local_sub_mesh(self):
+        mesh = DeviceMesh(self.device_type, [0, 2])
+        local_tensor = torch.ones(3, 4)
+
+        dtensor = DTensor.from_local(local_tensor, mesh, [Shard(0)])
+        self.assertEqual(dtensor.size(), torch.Size([6, 4]))
+
+        if self.rank == 0 or self.rank == 2:
+            self.assertEqual(dtensor.to_local(), torch.ones(3, 4))
+        else:
+            self.assertEqual(dtensor.to_local(), torch.tensor([]))
+
+        # test dtensor created in submesh, the operation should only
+        # be applied to the local shard inside the mesh, not the whole
+        # world, so only 0/2 really run the computation
+        new_dtensor = dtensor + 2
+
+        if self.rank == 0 or self.rank == 2:
+            self.assertEqual(new_dtensor.to_local(), torch.ones(3, 4) + 2)
+        else:
+            self.assertEqual(new_dtensor.to_local(), torch.tensor([]))
+
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/test/distributed/_tensor/test_dtensor_ops.py b/test/distributed/_tensor/test_dtensor_ops.py
index b283d4d3270f..b2685820bcd4 100644
--- a/test/distributed/_tensor/test_dtensor_ops.py
+++ b/test/distributed/_tensor/test_dtensor_ops.py
@@ -98,6 +98,7 @@ def wrapped(fn):
     xfail("__rsub__"),
     xfail("_native_batch_norm_legit"),
     xfail("_softmax_backward_data"),
+    xfail("_upsample_bilinear2d_aa"),
     xfail("addbmm"),
     xfail("addmv"),
     xfail("addr"),
@@ -118,7 +119,7 @@ def wrapped(fn):
     xfail("bernoulli"),
     xfail("block_diag"),
     xfail("broadcast_shapes"),
-    xfail("cat"),
+    xfail("cauchy"),
     xfail("cartesian_prod"),
     xfail("cdist"),
     xfail("cholesky"),
@@ -128,11 +129,9 @@ def wrapped(fn):
     xfail("clamp"),
     xfail("clamp_max"),
     xfail("clamp_min"),
-    xfail("column_stack"),
     xfail("combinations"),
     xfail("complex"),
     xfail("constant_pad_nd"),
-    xfail("copysign"),
     xfail("corrcoef"),
     xfail("count_nonzero"),
     xfail("cov"),
@@ -147,13 +146,12 @@ def wrapped(fn):
     xfail("diagonal"),
     xfail("diagonal_copy"),
     xfail("diagonal_scatter"),
-    xfail("diff"),
     xfail("dist"),
     xfail("dot"),
-    xfail("dstack"),
     xfail("einsum"),
     xfail("empty"),
     xfail("empty_like"),
+    xfail("exponential"),
     xfail("eye"),
     xfail("fft.fft2"),
     xfail("fft.fft"),
@@ -181,6 +179,7 @@ def wrapped(fn):
     xfail("full"),
     xfail("full_like"),
     xfail("gather"),
+    xfail("geometric"),
     xfail("geqrf"),
     xfail("grid_sampler_2d"),
     xfail("gradient"),
@@ -188,7 +187,6 @@ def wrapped(fn):
     xfail("histc"),
     xfail("histogram"),
     xfail("histogramdd"),
-    xfail("hstack"),
     xfail("index_add"),
     xfail("index_copy"),
     xfail("index_fill"),
@@ -244,6 +242,7 @@ def wrapped(fn):
     xfail("linalg.vecdot"),
     xfail("linalg.vector_norm"),
     xfail("linspace"),
+    xfail("log_normal"),
     xfail("log_softmax"),
     xfail("log_softmax", "with_dtype"),
     xfail("logcumsumexp"),
@@ -398,6 +397,7 @@ def wrapped(fn):
     xfail("norm", "nuc"),
     xfail("normal"),
     xfail("normal", "number_mean"),
+    xfail("normal", "in_place"),
     xfail("ormqr"),
     xfail("ones"),
     xfail("pca_lowrank"),
@@ -406,7 +406,6 @@ def wrapped(fn):
     xfail("put"),
     xfail("qr"),
     xfail("quantile"),
-    xfail("rad2deg"),
     xfail("rand_like"),
     xfail("randint_like"),
     xfail("randint"),
@@ -432,6 +431,7 @@ def wrapped(fn):
     xfail("select_scatter"),
     xfail("sort"),
     xfail("sparse.sampled_addmm"),
+    xfail("sparse.mm", "reduce"),
     xfail("special.airy_ai"),
     xfail("special.bessel_j0"),
     xfail("special.bessel_j1"),
@@ -458,9 +458,6 @@ def wrapped(fn):
     xfail("special.spherical_bessel_j0"),
     xfail("special.xlog1py"),
     xfail("special.zeta"),
-    xfail("split"),
-    xfail("split", "list_args"),
-    xfail("split_with_sizes"),
     xfail("squeeze", "multiple"),
     xfail("signal.windows.bartlett"),
     xfail("signal.windows.blackman"),
@@ -481,7 +478,6 @@ def wrapped(fn):
     xfail("stft"),
     xfail("svd"),
     xfail("svd_lowrank"),
-    xfail("symeig"),
     xfail("t"),
     xfail("take_along_dim"),
     xfail("take"),
@@ -507,7 +503,6 @@ def wrapped(fn):
     xfail("vdot"),
     xfail("view_copy"),
     xfail("view_as_complex"),
-    xfail("vstack"),
     xfail("where"),
     xfail("zeros"),
     # ops inside this might even fail without dtensor
@@ -520,7 +515,7 @@ def wrapped(fn):
     skip("__rmatmul__"),
     skip("meshgrid", "list_of_tensors"),
     skip("meshgrid", "variadic_tensors"),
-    skip("nn.functional._scaled_dot_product_attention"),
+    skip("nn.functional.scaled_dot_product_attention"),
     skip("nn.functional.softmin"),
     skip("nn.functional.embedding"),
     skip("nn.functional.embedding_bag"),
@@ -543,8 +538,8 @@ def wrapped(fn):
     skip("masked.std"),
     skip("masked.normalize"),
     skip("prod"),
-    skip("segment_reduce", "lengths"),
-    skip("segment_reduce", "offsets"),
+    skip("_segment_reduce", "lengths"),
+    skip("_segment_reduce", "offsets"),
 
     # TODO: fix the following ops
     skip("squeeze"),
@@ -627,8 +622,21 @@ def assert_ref_dtensor_equal(self, dtensor_rs, rs):
     def run_dtensor_crossref(self, func, args, kwargs):
         to_dtensor = DTensorConverter(self.mesh, args, kwargs)
 
+        def concat_res_if_necessary(func, res: object) -> object:
+            # concat the result on corresponding dim for ops like
+            # split, so that we can call backward on a single tensor
+            if (
+                (resolve_name(func) is not None)
+                and ("split" in resolve_name(func))
+            ):
+                dim = args[2] if len(args) == 3 else 0
+                return torch.cat(res, dim=dim)
+            else:
+                return res
+
         # TODO: also handle cases where func raise an exception
         rs = func(*args, **kwargs)
+        rs = concat_res_if_necessary(func, rs)
 
         def to_replicate(e: object) -> object:
             return (
@@ -669,6 +677,7 @@ def to_replicate(e: object) -> object:
 
                         # redistribute/all_gather the results to compare with normal output
                         dtensor_rs = tree_map(to_replicate, dtensor_rs)
+                        dtensor_rs = concat_res_if_necessary(func, dtensor_rs)
                         try:
                             if resolve_name(func) not in skip_bw:
                                 if isinstance(dtensor_rs, DTensor):
diff --git a/test/distributed/_tensor/test_tp_sharding_ops.py b/test/distributed/_tensor/test_tp_sharding_ops.py
deleted file mode 100644
index ef4d635f6ef7..000000000000
--- a/test/distributed/_tensor/test_tp_sharding_ops.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates
-# Owner(s): ["oncall: distributed"]
-
-import torch
-from torch.distributed._tensor import (
-    DeviceMesh,
-    distribute_tensor,
-    DTensor,
-    Replicate,
-    Shard,
-)
-from torch.testing._internal.common_utils import run_tests
-from torch.testing._internal.distributed._tensor.common_dtensor import (
-    DTensorTestBase,
-    with_comms,
-)
-
-
-class TPShardingOpsTest(DTensorTestBase):
-    @property
-    def world_size(self) -> int:
-        return 4
-
-    @with_comms
-    def test_sharded_view(self):
-        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
-        torch.manual_seed(0)
-        tensor = torch.rand(16, 35, 26)
-        sharding = [Shard(0)]
-        st = distribute_tensor(tensor, device_mesh, sharding).view(8, 4, 35, 13)
-        st_new = distribute_tensor(tensor.view(8, 4, 35, 13), device_mesh, sharding)
-        self.assertEqual(st.to_local(), st_new.to_local())
-        self.assertEqual(st.placements[0], st_new.placements[0])
-
-    @with_comms
-    def test_sharded_transpose(self):
-        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
-        torch.manual_seed(self.rank)
-        tensor = torch.rand(3, 5, 6, device=self.device_type)
-        sharding = [Shard(0)]
-        dist_tensor = DTensor.from_local(tensor, device_mesh, sharding)
-        new_dt = dist_tensor.transpose(0, 2)
-        self.assertTrue(new_dt.placements[0].is_shard(dim=2))
-        self.assertEqual(new_dt.to_local(), tensor.transpose(0, 2))
-        new_dt = dist_tensor.transpose(1, 2)
-        self.assertTrue(new_dt.placements[0].is_shard(dim=0))
-        self.assertEqual(new_dt.to_local(), tensor.transpose(1, 2))
-
-    @with_comms
-    def test_sharded_permute(self):
-        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
-        torch.manual_seed(self.rank)
-        tensor = torch.rand(3, 5, 6, device=self.device_type)
-        sharding = [Shard(0)]
-        dist_tensor = DTensor.from_local(tensor, device_mesh, sharding)
-        new_dt = dist_tensor.permute(1, 0, 2)
-        self.assertTrue(new_dt.placements[0].is_shard(dim=1))
-        self.assertEqual(new_dt.to_local(), tensor.permute(1, 0, 2))
-
-    @with_comms
-    def test_replicated_permute(self):
-        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
-        torch.manual_seed(0)
-        tensor = torch.rand(3, 5, 6, device=self.device_type)
-        sharding = [Replicate()]
-        dist_tensor = DTensor.from_local(tensor, device_mesh, sharding)
-        new_dt = dist_tensor.permute(1, 0, 2)
-        self.assertTrue(new_dt.placements[0].is_replicate())
-        self.assertEqual(new_dt.to_local(), tensor.permute(1, 0, 2))
-        self.assertEqual(new_dt.stride(), tensor.permute(1, 0, 2).stride())
-
-    @with_comms
-    def test_sharded_cat(self):
-        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
-        torch.manual_seed(self.rank)
-        tensor_1 = torch.rand(3, 5, 6)
-        tensor_2 = torch.rand(3, 5, 6)
-        tensor_3 = torch.rand(3, 5, 6)
-        sharding = [Shard(0)]
-        dt_1 = DTensor.from_local(tensor_1, device_mesh, sharding)
-        dt_2 = DTensor.from_local(tensor_2, device_mesh, sharding)
-        dt_3 = DTensor.from_local(tensor_3, device_mesh, sharding)
-        new_dt = torch.cat([dt_1, dt_2, dt_3])
-        cat_dt = DTensor.from_local(
-            torch.cat([tensor_1, tensor_2, tensor_3]), device_mesh, sharding
-        )
-        self.assertEqual(new_dt.to_local(), cat_dt.to_local())
-        self.assertEqual(new_dt.size(), cat_dt.size())
-
-    @with_comms
-    def test_sharded_split(self):
-        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
-        torch.manual_seed(self.rank)
-        tensor = torch.rand(3, 5, 6, device=self.device_type)
-        sharding = [Shard(2)]
-        dist_tensor = DTensor.from_local(tensor, device_mesh, sharding)
-        dt_list = dist_tensor.split(dist_tensor.size(-1) // 2, dim=-1)
-        local_tensors = tensor.split(3, dim=-1)
-        for idx, dt in enumerate(dt_list):
-            self.assertTrue(dt.placements[0].is_shard(dim=2))
-            self.assertEqual(dt.to_local(), local_tensors[idx])
-
-
-if __name__ == "__main__":
-    run_tests()
diff --git a/test/distributed/_tensor/test_view_ops.py b/test/distributed/_tensor/test_view_ops.py
index fa502d2b5603..c223c383c68e 100644
--- a/test/distributed/_tensor/test_view_ops.py
+++ b/test/distributed/_tensor/test_view_ops.py
@@ -30,18 +30,18 @@
 
 class TestViewOps(DTensorTestBase):
     def test_view_groups(self):
-        self.assertEquals(
+        self.assertEqual(
             view_groups([2, 3], [3, 2]),
             (
                 Split(Flatten((InputDim(0), InputDim(1))), (3, 2), 0),
                 Split(Flatten((InputDim(0), InputDim(1))), (3, 2), 1),
             ),
         )
-        self.assertEquals(
+        self.assertEqual(
             view_groups([3, 4, 5], [12, 5]),
             (Flatten((InputDim(0), InputDim(1))), InputDim(2)),
         )
-        self.assertEquals(
+        self.assertEqual(
             view_groups([2, 3, 4, 5, 7], [12, 70]),
             (
                 Split(
@@ -72,7 +72,7 @@ def test_view_groups(self):
                 ),
             ),
         )
-        self.assertEquals(
+        self.assertEqual(
             view_groups([2, 3, 4, 5, 7], [3, 8, 7, 5]),
             (
                 Split(Flatten((InputDim(0), InputDim(1), InputDim(2))), (3, 8), 0),
@@ -81,7 +81,7 @@ def test_view_groups(self):
                 Split(Flatten((InputDim(3), InputDim(4))), (7, 5), 1),
             ),
         )
-        self.assertEquals(
+        self.assertEqual(
             view_groups([3, 4, 8, 3], [12, 4, 2, 3]),
             (
                 Flatten((InputDim(0), InputDim(1))),
@@ -90,7 +90,7 @@ def test_view_groups(self):
                 InputDim(3),
             ),
         )
-        self.assertEquals(
+        self.assertEqual(
             view_groups([3, 24], [1, 3, 2, 4, 1, 3, 1]),
             (
                 Singleton(),
@@ -102,7 +102,7 @@ def test_view_groups(self):
                 Singleton(),
             ),
         )
-        self.assertEquals(
+        self.assertEqual(
             view_groups([1, 1, 3, 2, 1, 1], [6, 1, 1, 1]),
             (
                 Flatten((InputDim(2), InputDim(3))),
@@ -111,7 +111,7 @@ def test_view_groups(self):
                 Singleton(),
             ),
         )
-        self.assertEquals(
+        self.assertEqual(
             view_groups([1, 1, 12, 1, 1, 1, 2, 5, 1], [3, 4, 1, 10]),
             (
                 Split(InputDim(2), (3, 4), 0),
@@ -120,7 +120,7 @@ def test_view_groups(self):
                 Flatten((InputDim(6), InputDim(7))),
             ),
         )
-        self.assertEquals(
+        self.assertEqual(
             view_groups([2, 3, 4], [2, -1, 4]),
             (InputDim(0), InputDim(1), InputDim(2)),
         )
@@ -180,7 +180,7 @@ def call_dt_test(self, op, args, kwargs, device_mesh: DeviceMesh):
 
     def dimmap_test(self, op, args, expected_rule_output):
         rules = ops[op].dim_map(*args)
-        self.assertEquals(rules, expected_rule_output)
+        self.assertEqual(rules, expected_rule_output)
         self.call_dt_test(op, args, {}, self.device_mesh)
 
     @with_comms
diff --git a/test/distributed/_tools/test_memory_tracker.py b/test/distributed/_tools/test_memory_tracker.py
index 71db81e545d0..90dded67974b 100644
--- a/test/distributed/_tools/test_memory_tracker.py
+++ b/test/distributed/_tools/test_memory_tracker.py
@@ -66,6 +66,7 @@ def test_local_model(self):
         self.assertEqual(len(tracker.memories_reserved), tracker._op_index)
         self.assertTrue(len(tracker._markers) == 2)
         self.assertTrue(tracker._cur_module_name != "")
+        self.assertTrue(hasattr(tracker, "_num_cuda_retries"))
 
 
 if __name__ == "__main__":
diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
index a685fb682ed8..2d6a17bf8d57 100644
--- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
+++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
@@ -43,7 +43,7 @@ def gpus_for_rank(world_size):
 
 class Task(nn.Module):
     def __init__(self):
-        super(Task, self).__init__()
+        super().__init__()
         torch.manual_seed(0)
         self.p = nn.Parameter(torch.randn(40, 20))
 
@@ -62,7 +62,7 @@ def forward(self, x, rank):
 
 class DistributedDataParallelCommHookTest(MultiProcessTestCase):
     def setUp(self):
-        super(DistributedDataParallelCommHookTest, self).setUp()
+        super().setUp()
         self._spawn_processes()
 
     def tearDown(self):
diff --git a/test/distributed/algorithms/quantization/test_quantization.py b/test/distributed/algorithms/quantization/test_quantization.py
index aebf3ccd6266..368671a35fd5 100644
--- a/test/distributed/algorithms/quantization/test_quantization.py
+++ b/test/distributed/algorithms/quantization/test_quantization.py
@@ -43,12 +43,12 @@ def _build_tensor(size, value=None, dtype=torch.float, device_id=None):
     class DistQuantizationTests(MultiProcessTestCase):
 
         def setUp(self):
-            super(DistQuantizationTests, self).setUp()
+            super().setUp()
             self._spawn_processes()
-            torch.backends.cudnn.flags(allow_tf32=False).__enter__()
+            torch.backends.cudnn.flags(enabled=True, allow_tf32=False).__enter__()
 
         def tearDown(self):
-            super(DistQuantizationTests, self).tearDown()
+            super().tearDown()
             try:
                 os.remove(self.file_name)
             except OSError:
diff --git a/test/distributed/algorithms/test_join.py b/test/distributed/algorithms/test_join.py
index 2b8a3764d21f..66ec0495bb02 100644
--- a/test/distributed/algorithms/test_join.py
+++ b/test/distributed/algorithms/test_join.py
@@ -83,7 +83,7 @@ class AllReducer(Joinable):
     per-iteration collective communication.
     """
     def __init__(self, device, process_group):
-        super(AllReducer, self).__init__()
+        super().__init__()
         self.device = device
         self.process_group = process_group
         self.post_hook_tensor = torch.tensor([BEFORE_CONSTANT], device=self.device)
@@ -139,7 +139,7 @@ def find_common_rank(self, rank, to_consider):
 class TestJoin(MultiProcessTestCase):
     r"""Test cases for the generic join context."""
     def setUp(self):
-        super(TestJoin, self).setUp()
+        super().setUp()
         os.environ["WORLD_SIZE"] = str(self.world_size)
         os.environ["BACKEND"] = BACKEND
         self._spawn_processes()
diff --git a/test/distributed/checkpoint/test_2d_fsdp_dt_checkpoint.py b/test/distributed/checkpoint/test_2d_fsdp_dt_checkpoint.py
index fa1c1f6197b4..a060f837ac57 100644
--- a/test/distributed/checkpoint/test_2d_fsdp_dt_checkpoint.py
+++ b/test/distributed/checkpoint/test_2d_fsdp_dt_checkpoint.py
@@ -20,7 +20,7 @@
     PairwiseParallel,
     parallelize_module,
 )
-from torch.distributed.tensor.parallel.fsdp import is_available
+from torch.distributed.tensor.parallel.fsdp import enable_2d_with_fsdp
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 
 
@@ -39,7 +39,7 @@
 
 class SimpleModel(torch.nn.Module):
     def __init__(self):
-        super(SimpleModel, self).__init__()
+        super().__init__()
         self.net1 = torch.nn.Linear(5, 8)
         self.relu = torch.nn.ReLU()
         self.net2 = torch.nn.Linear(8, 4)
@@ -120,7 +120,7 @@ def init_model(
 
 class Test2dFsdpDtCheckpoint(DTensorTestBase):
     def _test_fsdp_dt_checkpoint(self, fsdp_pg=None) -> None:
-        if not is_available():
+        if not enable_2d_with_fsdp():
             self.skipTest("FSDP 2d parallel integration not available")
 
         CHECKPOINT_DIR = self.temp_dir
@@ -139,17 +139,13 @@ def _test_fsdp_dt_checkpoint(self, fsdp_pg=None) -> None:
         with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
             state_dict = {
                 "model": model.state_dict(),
-                "optim": FSDP.sharded_optim_state_dict(model, optim),
+                "optim": FSDP.optim_state_dict(model, optim),
             }
 
             dist_cp.save_state_dict(
                 state_dict=state_dict,
                 storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
-                planner=DefaultSavePlanner(
-                    flatten_state_dict=True,
-                    flatten_sharded_tensors=True,
-                    dedup_replicated_tensors=True,
-                ),
+                planner=DefaultSavePlanner(),
             )
 
         model_2 = init_model(fsdp_pg=fsdp_pg)[0]
@@ -176,10 +172,7 @@ def _test_fsdp_dt_checkpoint(self, fsdp_pg=None) -> None:
             dist_cp.load_state_dict(
                 state_dict=state_dict,
                 storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
-                planner=DefaultLoadPlanner(
-                    flatten_state_dict=True,
-                    flatten_sharded_tensors=True,
-                ),
+                planner=DefaultLoadPlanner(),
             )
             model_2.load_state_dict(state_dict["model"])
 
@@ -188,8 +181,7 @@ def _test_fsdp_dt_checkpoint(self, fsdp_pg=None) -> None:
                 optimizer_key="optim",
                 storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
             )
-
-            flattened_osd = FSDP.flatten_sharded_optim_state_dict(
+            flattened_osd = FSDP.optim_state_dict_to_load(
                 optim_state["optim"], model_2, optim_2
             )
             optim_2.load_state_dict(flattened_osd)
diff --git a/test/distributed/checkpoint/test_checkpoint.py b/test/distributed/checkpoint/test_checkpoint.py
index 96c98116328c..a0002c32b1b4 100644
--- a/test/distributed/checkpoint/test_checkpoint.py
+++ b/test/distributed/checkpoint/test_checkpoint.py
@@ -185,10 +185,10 @@ def _fail_rank_async(self, name, result=None):
 
 class FaultyStorageWriter(TestStorageBase, StorageWriter):
     def __init__(self, fail_conf):
-        super(FaultyStorageWriter, self).__init__(fail_conf)
+        super().__init__(fail_conf)
 
-    def init(self, is_coordinator: bool) -> None:
-        self._fail_rank("fail_init")
+    def set_up_storage_writer(self, is_coordinator: bool) -> None:
+        self._fail_rank("fail_set_up_storage_writer")
 
     def prepare_local_plan(self, plan: SavePlan) -> SavePlan:
         self._fail_rank("fail_prepare_local_plan")
@@ -212,11 +212,11 @@ def finish(
 
 class FaultyStorageReader(TestStorageBase, StorageReader):
     def __init__(self, metadata, fail_conf):
-        super(FaultyStorageReader, self).__init__(fail_conf)
+        super().__init__(fail_conf)
         self.metadata = metadata
 
-    def init(self, metadata: Metadata, is_coordinator: bool) -> None:
-        self._fail_rank("fail_init")
+    def set_up_storage_reader(self, metadata: Metadata, is_coordinator: bool) -> None:
+        self._fail_rank("fail_set_up_storage_reader")
 
     def prepare_local_plan(self, plan: LoadPlan) -> LoadPlan:
         self._fail_rank("fail_prepare_local_plan")
@@ -329,7 +329,7 @@ def test_save_error_handling(self) -> None:
             "bytes": [1, 2, 3, 4],
         }
 
-        self._test_save(state_dict, fail_init=[0])
+        self._test_save(state_dict, fail_set_up_storage_writer=[0])
         self._test_save(state_dict, fail_finish=[0])
         self._test_save(state_dict, fail_prepare_global_plan=[0])
 
@@ -337,7 +337,7 @@ def test_save_error_handling(self) -> None:
         self._test_save(state_dict, fail_write_data=[2])
         self._test_save(state_dict, fail_write_data_async=[3])
 
-        self._test_save(state_dict, coordinator=1, fail_init=[1])
+        self._test_save(state_dict, coordinator=1, fail_set_up_storage_writer=[1])
         self._test_save(state_dict, coordinator=1, fail_finish=[1])
 
     def test_save_error_handling_no_dist(self) -> None:
@@ -345,7 +345,7 @@ def test_save_error_handling_no_dist(self) -> None:
 
         self.assertFalse(dist.is_initialized())
 
-        self._test_save(state_dict, fail_init=[0])
+        self._test_save(state_dict, fail_set_up_storage_writer=[0])
         self._test_save(state_dict, fail_finish=[0])
         self._test_save(state_dict, fail_prepare_global_plan=[0])
 
@@ -364,14 +364,14 @@ def test_load_error_handling(self) -> None:
         }
 
         self._test_load(state_dict)
-        self._test_load(state_dict, fail_init=[0])
+        self._test_load(state_dict, fail_set_up_storage_reader=[0])
         self._test_load(state_dict, fail_prepare_global_plan=[0])
         self._test_load(state_dict, fail_read_metadata=[0])
         self._test_load(state_dict, fail_prepare_local_plan=[1])
         self._test_load(state_dict, fail_read_data=[3])
         self._test_load(state_dict, fail_read_data_async=[1])
 
-        self._test_load(state_dict, coordinator=3, fail_init=[0])
+        self._test_load(state_dict, coordinator=3, fail_set_up_storage_reader=[0])
         self._test_load(state_dict, coordinator=1, fail_read_metadata=[3])
         self._test_load(state_dict, coordinator=2, fail_read_data=[0])
         self._test_load(state_dict, coordinator=3, fail_read_data_async=[2])
@@ -380,7 +380,7 @@ def test_load_error_handling(self) -> None:
     def test_load_error_handling_no_dist(self) -> None:
         state_dict = {"replicated": torch.rand(10, 10), "bytes": [1, 2, 3, 4]}
         self._test_load(state_dict)
-        self._test_load(state_dict, fail_init=[0])
+        self._test_load(state_dict, fail_set_up_storage_reader=[0])
         self._test_load(state_dict, fail_read_metadata=[0])
         self._test_load(state_dict, fail_prepare_local_plan=[0])
         self._test_load(state_dict, fail_prepare_global_plan=[0])
diff --git a/test/distributed/checkpoint/test_dtensor_checkpoint.py b/test/distributed/checkpoint/test_dtensor_checkpoint.py
new file mode 100644
index 000000000000..258ca17dd5d6
--- /dev/null
+++ b/test/distributed/checkpoint/test_dtensor_checkpoint.py
@@ -0,0 +1,329 @@
+# Owner(s): ["oncall: distributed"]
+from typing import Dict, Union
+
+import torch
+import torch.distributed as dist
+import torch.distributed.checkpoint as dist_cp
+from torch.distributed._tensor import (
+    DeviceMesh,
+    DTensor,
+    Replicate,
+    Shard,
+    distribute_tensor,
+)
+from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    skip_if_lt_x_gpu,
+    with_comms,
+)
+from torch.testing._internal.common_utils import run_tests
+
+
+class MyTestModule(torch.nn.Module):
+    def __init__(
+        self,
+        sdt: DTensor,
+        rdt: DTensor,
+        extra_state: int = 1,
+        extra_state_tensor: torch.Tensor = torch.zeros(1),
+    ) -> None:
+        super().__init__()
+        self.rdt = torch.nn.Parameter(rdt)
+        self.sdt = torch.nn.Parameter(sdt)
+        self._extra_state = extra_state
+        self._extra_state_tensor = extra_state_tensor
+
+    @property
+    def extra_state(self) -> int:
+        return self._extra_state
+
+    @extra_state.setter
+    def extra_state(self, new_extra_state: int) -> None:
+        self._extra_state = new_extra_state
+
+    @property
+    def extra_state_tensor(self) -> torch.Tensor:
+        return self._extra_state_tensor
+
+    @extra_state_tensor.setter
+    def extra_state_tensor(self, new_extra_state_tensor: torch.Tensor) -> None:
+        self._extra_state_tensor = new_extra_state_tensor
+
+    def get_extra_state(self) -> Dict[str, Union[int, torch._tensor.Tensor]]:
+        return {
+            "extra_state": self._extra_state,
+            "extra_state_tensor": self._extra_state_tensor,
+        }
+
+    def set_extra_state(
+        self, state: Dict[str, Union[int, torch._tensor.Tensor]]
+    ) -> None:
+        self._extra_state = state["extra_state"]  # pyre-ignore[8]
+        self._extra_state_tensor = state["extra_state_tensor"]  # pyre-ignore[8]
+
+
+class DTensorPlanner(DTensorTestBase):
+    def create_dtensor_model(
+        self,
+        tensor_to_shard: torch.tensor,
+        tensor_to_replicate: torch.tensor,
+    ) -> torch.nn.Module:
+        mesh = DeviceMesh(
+            device_type=self.device_type,
+            mesh=range(dist.get_world_size()),
+        )
+        sharded_dt = distribute_tensor(
+            tensor_to_shard, mesh, placements=[Shard(0)]
+        )
+        replicated_dt = distribute_tensor(
+            tensor_to_replicate, mesh, placements=[Replicate()]
+        )
+        model = MyTestModule(sharded_dt, replicated_dt).cuda(dist.get_rank())
+
+        return model, sharded_dt, replicated_dt
+
+    @with_comms
+    @with_temp_dir
+    @skip_if_lt_x_gpu(2)
+    def test_distributed_tensor_planner(self) -> None:
+        CHECKPOINT_DIR = self.temp_dir
+
+        local_tensor = torch.arange(0, 4, dtype=torch.float32)
+        local_tensor_2 = torch.arange(4, 8, dtype=torch.float32)
+        model, sharded_dt, replicated_dt = self.create_dtensor_model(
+            local_tensor, local_tensor_2
+        )
+        state_dict = model.state_dict()
+
+        """
+        When the model is initialized, the state_dict on each rank are as followed when there are 4 GPUs:
+        rank 0:
+            OrderedDict(
+                [
+                    (
+                        'rdt',
+                        DTensor(
+                            local_tensor=tensor([4., 5., 6., 7.], device='cuda:0'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Replicate()]
+                        )
+                    ),
+                    (
+                        'sdt',
+                        DTensor(
+                            local_tensor=tensor([0.], device='cuda:0'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Shard(dim=0)])
+                        ),
+                    (
+                        '_extra_state',
+                        {'extra_state': 1, 'extra_state_tensor': tensor([0.])}
+                    )
+                ]
+            )
+        rank 1:
+            OrderedDict(
+                [
+                    (
+                        'rdt',
+                        DTensor(
+                            local_tensor=tensor([4., 5., 6., 7.],device='cuda:3'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Replicate()])
+                        ),
+                    (
+                        'sdt',
+                        DTensor(
+                            local_tensor=tensor([1.], device='cuda:3'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Shard(dim=0)])
+                        ),
+                    (
+                        '_extra_state',
+                        {'extra_state': 1, 'extra_state_tensor': tensor([0.])}
+                    )
+                ]
+            )
+        rank 3:
+            OrderedDict(
+                [
+                    (
+                        'rdt',
+                        DTensor(
+                            local_tensor=tensor([4., 5., 6., 7.],device='cuda:2'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Replicate()]
+                        )
+                    ),
+                    (
+                        'sdt',
+                        DTensor(
+                            local_tensor=tensor([2.], device='cuda:2'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Shard(dim=0)])
+                        ),
+                    (
+                        '_extra_state',
+                        {'extra_state': 1, 'extra_state_tensor': tensor([0.])}
+                    )
+                ]
+            )
+        rank 4:
+            OrderedDict(
+                [
+                    (
+                        'rdt',
+                        DTensor(
+                            local_tensor=tensor([4., 5., 6., 7.], device='cuda:3'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Replicate()]
+                        )
+                    ),
+                    (
+                        'sdt',
+                        DTensor(
+                            local_tensor=tensor([3.], device='cuda:3'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Shard(dim=0)]
+                        )
+                    ),
+                    (
+                        '_extra_state',
+                        {'extra_state': 1, 'extra_state_tensor': tensor([0.])}
+                    )
+                ]
+            )
+        """
+
+        dist_cp.save_state_dict(
+            state_dict=state_dict,
+            storage_writer=dist_cp.FileSystemWriter(path=CHECKPOINT_DIR),
+            planner=dist_cp.DefaultSavePlanner(),
+        )
+        model, _, _ = self.create_dtensor_model(
+            local_tensor * 10, local_tensor_2 * 10
+        )
+        state_dict = model.state_dict()
+        """
+        When the model is re-initialized, we have changed the params in state_dict.
+        The updated values are as followed, when there are 4 GPUs:
+        rank 0:
+            OrderedDict(
+                [
+                    (
+                        'rdt',
+                        DTensor(
+                            local_tensor=tensor([40., 50., 60., 70.], device='cuda:0'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Replicate()],
+                        )
+                    ),
+                    (
+                        'sdt',
+                        DTensor(
+                            local_tensor=tensor([0.], device='cuda:0'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Shard(dim=0)],
+                        ),
+                    (
+                        '_extra_state', {'extra_state': 10, 'extra_state_tensor': tensor([10.])}
+                    )
+                ]
+            )
+        rank 1:
+            OrderedDict(
+                [
+                    (
+                        'rdt',
+                        DTensor(
+                            local_tensor=tensor([40., 50., 60., 70.], device='cuda:0'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Replicate()],
+                        )
+                    ),
+                    (
+                        'sdt',
+                        DTensor(local_tensor=tensor([10.], device='cuda:0'),
+                        device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                        placements=[Shard(dim=0)],
+                        )
+                    ),
+                    (
+                        '_extra_state', {'extra_state': 10, 'extra_state_tensor': tensor([10.])}
+                    )
+                ]
+            )
+        rank 3:
+            OrderedDict(
+                [
+                    (
+                        'rdt',
+                        DTensor(
+                            local_tensor=tensor([40., 50., 60., 70.], device='cuda:0'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Replicate()],
+                        )
+                    ),
+                    (
+                        'sdt',
+                        DTensor(
+                            local_tensor=tensor([20.], device='cuda:0'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Shard(dim=0)]
+                        )
+                    ),
+                    (
+                        '_extra_state', {'extra_state': 10, 'extra_state_tensor': tensor([10.])}
+                    )
+                ]
+            )
+        rank 4:
+            OrderedDict(
+                [
+                    (
+                        'rdt',
+                        DTensor(
+                            local_tensor=tensor([40., 50., 60., 70.], device='cuda:0'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Replicate()]
+                        )
+                    ),
+                    (
+                        'sdt',
+                        DTensor(
+                            local_tensor=tensor([30.], device='cuda:0'),
+                            device_mesh=DeviceMesh:([0, 1, 2, 3]),
+                            placements=[Shard(dim=0)]
+                        )
+                    ),
+                    (
+                        '_extra_state',
+                        {'extra_state': 10, 'extra_state_tensor': tensor([10.])}
+                    )
+                ]
+            )
+        """
+
+        dist_cp.load_state_dict(
+            state_dict=state_dict,
+            storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
+            planner=dist_cp.DefaultLoadPlanner(),
+        )
+
+        """
+        After loading the model from the checkpoint, we want to make sure that the values in state_dict
+        match the values that are originally saved to the checkpoint.
+        """
+        for k, v in state_dict.items():
+            if k == "rdt":
+                self.assertEqual(replicated_dt.to_local(), v.to_local())
+            if k == "sdt":
+                self.assertEqual(sharded_dt.to_local(), v.to_local())
+            if k == "_extra_state":
+                self.assertEqual(1, v["extra_state"])
+                self.assertEqual(torch.tensor([0.0]), v["extra_state_tensor"])
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/checkpoint/test_file_system_checkpoint.py b/test/distributed/checkpoint/test_file_system_checkpoint.py
index 016467144e8f..3d92e792811c 100644
--- a/test/distributed/checkpoint/test_file_system_checkpoint.py
+++ b/test/distributed/checkpoint/test_file_system_checkpoint.py
@@ -1,6 +1,7 @@
 # Owner(s): ["oncall: distributed"]
 
 import os
+import sys
 import shutil
 import tempfile
 from typing import Dict
@@ -74,7 +75,7 @@ def assert_state_dict_equal(
                 value_1.local_shards(), value_2.local_shards()
             ):
                 self.assertTrue(
-                    torch.equal(local_shard_1.tensor, local_shard_1.tensor),
+                    torch.equal(local_shard_1.tensor, local_shard_2.tensor),
                     f"Key {key}'s shard does not match",
                 )
         elif isinstance(value_1, torch.Tensor):
@@ -100,7 +101,7 @@ def __init__(
         self,
         spec: ShardingSpec,
     ) -> None:
-        super(MyShardedModel3, self).__init__()
+        super().__init__()
         self.sharded_tensor: ShardedTensor = sharded_tensor.rand(
             spec, 10, 20, init_rrefs=False
         )
diff --git a/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py b/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
index 52e414545c04..559f86bfc74b 100644
--- a/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
+++ b/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
@@ -1,8 +1,6 @@
 # Owner(s): ["oncall: distributed"]
 
 import sys
-import os
-import shutil
 import tempfile
 from typing import Dict
 
@@ -76,7 +74,7 @@ def assert_state_dict_equal(
                 value_1.local_shards(), value_2.local_shards()
             ):
                 self.assertTrue(
-                    torch.equal(local_shard_1.tensor, local_shard_1.tensor),
+                    torch.equal(local_shard_1.tensor, local_shard_2.tensor),
                     f"Key {key}'s shard does not match",
                 )
         elif isinstance(value_1, torch.Tensor):
@@ -102,7 +100,7 @@ def __init__(
         self,
         spec: ShardingSpec,
     ) -> None:
-        super(MyShardedModel3, self).__init__()
+        super().__init__()
         self.sharded_tensor: ShardedTensor = sharded_tensor.rand(
             spec, 10, 20, init_rrefs=False
         )
@@ -296,9 +294,6 @@ def test_load_with_different_shard_plan(self, thread_count) -> None:
                 if s0 == s1:
                     continue
 
-                if dist.get_rank() == 0:
-                    shutil.rmtree(path, ignore_errors=True)
-                    os.makedirs(path)
                 dist.barrier()
 
                 model_to_save = MyShardedModel3(s0)
@@ -359,10 +354,6 @@ def test_load_rowwise_to_colwise(self, thread_count) -> None:
             ],
         )
 
-        if dist.get_rank() == 0:
-            shutil.rmtree(path, ignore_errors=True)
-            os.makedirs(path)
-
         model_to_save = MyShardedModel3(src_spec).cuda(dist.get_rank())
         model_to_save._register_state_dict_hook(state_dict_hook)
         state_dict_to_save = model_to_save.state_dict()
diff --git a/test/distributed/checkpoint/test_fsdp_model_state.py b/test/distributed/checkpoint/test_fsdp_model_state.py
index b45e4d19ba32..99313f3dc8f5 100644
--- a/test/distributed/checkpoint/test_fsdp_model_state.py
+++ b/test/distributed/checkpoint/test_fsdp_model_state.py
@@ -36,10 +36,7 @@ def _test_fsdp_model_state(self, process_group) -> None:
             dist_cp.save_state_dict(
                 state_dict=state_dict,
                 storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
-                planner=DefaultSavePlanner(
-                    flatten_state_dict=True,
-                    flatten_sharded_tensors=True,
-                ),
+                planner=DefaultSavePlanner(),
             )
 
         model_2 = FSDP(
@@ -60,10 +57,7 @@ def _test_fsdp_model_state(self, process_group) -> None:
             dist_cp.load_state_dict(
                 state_dict=state_dict,
                 storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
-                planner=DefaultLoadPlanner(
-                    flatten_state_dict=True,
-                    flatten_sharded_tensors=True,
-                ),
+                planner=DefaultLoadPlanner(),
             )
             model_2.load_state_dict(state_dict["model"])
 
diff --git a/test/distributed/checkpoint/test_fsdp_optim_state.py b/test/distributed/checkpoint/test_fsdp_optim_state.py
index 5fe9e2259c02..1d2138eb3563 100644
--- a/test/distributed/checkpoint/test_fsdp_optim_state.py
+++ b/test/distributed/checkpoint/test_fsdp_optim_state.py
@@ -40,16 +40,13 @@ def test_distributed_tensor_planner(self) -> None:
         with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
             state_dict = {
                 "model": model.state_dict(),
-                "optim": FSDP.sharded_optim_state_dict(model, optim),
+                "optim": FSDP.optim_state_dict(model, optim),
             }
 
             dist_cp.save_state_dict(
                 state_dict=state_dict,
                 storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
-                planner=DefaultSavePlanner(
-                    flatten_state_dict=True,
-                    flatten_sharded_tensors=True,
-                ),
+                planner=DefaultSavePlanner(),
             )
 
         # now load the model and ensure the values are the same
@@ -73,10 +70,7 @@ def test_distributed_tensor_planner(self) -> None:
             dist_cp.load_state_dict(
                 state_dict=state_dict,
                 storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
-                planner=DefaultLoadPlanner(
-                    flatten_state_dict=True,
-                    flatten_sharded_tensors=True,
-                ),
+                planner=DefaultLoadPlanner(),
             )
             model_2.load_state_dict(state_dict["model"])
 
@@ -86,7 +80,7 @@ def test_distributed_tensor_planner(self) -> None:
                 storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
             )
 
-            flattened_osd = FSDP.flatten_sharded_optim_state_dict(
+            flattened_osd = FSDP.optim_state_dict_to_load(
                 optim_state["optim"], model_2, optim_2
             )
             optim_2.load_state_dict(flattened_osd)
diff --git a/test/distributed/elastic/agent/server/test/api_test.py b/test/distributed/elastic/agent/server/test/api_test.py
index b8ba04d6c152..1189a9ac13a9 100644
--- a/test/distributed/elastic/agent/server/test/api_test.py
+++ b/test/distributed/elastic/agent/server/test/api_test.py
@@ -189,8 +189,8 @@ def test_agent_constructor(self):
         spec = self._get_worker_spec(max_restarts=1)
         agent = TestAgent(spec)
         worker_group = agent.get_worker_group()
-        self.assertEquals(WorkerState.INIT, worker_group.state)
-        self.assertEquals(spec.max_restarts, agent._remaining_restarts)
+        self.assertEqual(WorkerState.INIT, worker_group.state)
+        self.assertEqual(spec.max_restarts, agent._remaining_restarts)
 
     @patch("torch.distributed.elastic.agent.server.api.put_metric")
     def test_record_flakiness_metric(self, put_metric_mock):
@@ -398,7 +398,7 @@ def test_run_happy_path(self, record_events_mock, mock_monitor_workers):
         agent.run()
 
         # no failure, no membership changes -> no retries
-        self.assertEquals(max_restarts, agent._remaining_restarts)
+        self.assertEqual(max_restarts, agent._remaining_restarts)
         record_events_mock.assert_called_once()
 
     @patch.object(TestAgent, "_initialize_workers", side_effect=RuntimeError())
@@ -450,7 +450,7 @@ def test_run_membership_change(
         worker_group = agent._worker_group
 
         agent.run()
-        self.assertEquals(WorkerState.SUCCEEDED, worker_group.state)
+        self.assertEqual(WorkerState.SUCCEEDED, worker_group.state)
         record_events_mock.assert_called_once()
 
     @patch.object(
@@ -482,8 +482,8 @@ def test_get_ranks(self):
         )
         agent = TestAgent(spec)
         total_sum, ranks = agent._get_ranks(role_infos, 0, 0, len(role_infos))
-        self.assertEquals(15, total_sum)
-        self.assertEquals([0, 1, 2, 3], list(ranks))
+        self.assertEqual(15, total_sum)
+        self.assertEqual([0, 1, 2, 3], list(ranks))
 
     def test_assign_worker_ranks(self):
         role_infos = [
diff --git a/test/distributed/elastic/events/lib_test.py b/test/distributed/elastic/events/lib_test.py
index 4ddb317710ee..3a5fb694bfda 100644
--- a/test/distributed/elastic/events/lib_test.py
+++ b/test/distributed/elastic/events/lib_test.py
@@ -9,7 +9,6 @@
 
 import json
 import logging
-import unittest
 from dataclasses import asdict
 from unittest.mock import patch
 
@@ -21,10 +20,10 @@
     _get_or_create_logger,
     construct_and_record_rdzv_event,
 )
-from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.common_utils import run_tests, TestCase
 
 
-class EventLibTest(unittest.TestCase):
+class EventLibTest(TestCase):
     def assert_event(self, actual_event, expected_event):
         self.assertEqual(actual_event.name, expected_event.name)
         self.assertEqual(actual_event.source, expected_event.source)
@@ -59,7 +58,7 @@ def test_event_deser(self):
         deser_event = Event.deserialize(json_event)
         self.assert_event(event, deser_event)
 
-class RdzvEventLibTest(unittest.TestCase):
+class RdzvEventLibTest(TestCase):
     @patch("torch.distributed.elastic.events.record_rdzv_event")
     @patch("torch.distributed.elastic.events.get_logging_handler")
     def test_construct_and_record_rdzv_event(self, get_mock, record_mock):
diff --git a/test/distributed/fsdp/test_checkpoint_wrapper.py b/test/distributed/fsdp/test_checkpoint_wrapper.py
index d8e005fcf82b..c6c5d54f1bf9 100644
--- a/test/distributed/fsdp/test_checkpoint_wrapper.py
+++ b/test/distributed/fsdp/test_checkpoint_wrapper.py
@@ -22,9 +22,6 @@
 
 
 class CheckpointWrapperTest(TestCase):
-    def setUp(self):
-        super().setUp()
-
     def test_load_activation_checkpointed_module(self):
         lin = nn.Linear(10, 10, bias=False)
         lin = checkpoint_wrapper(
diff --git a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
index 81b9f4c37f06..76df5be0a1af 100644
--- a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
+++ b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
@@ -300,6 +300,43 @@ def _test_low_precision_grads(
                     torch.linalg.vector_norm(param.grad, norm_type).item() <= max_norm,
                 )
 
+    @skip_if_lt_x_gpu(2)
+    def test_no_gradients(self):
+        """
+        Tests that calling ``clip_grad_norm_()`` when the FDSP module has no
+        gradients simply returns a scalar zero tensor in FP32 without erroring.
+        """
+        self.run_subtests(
+            {"use_orig_params": [False, True]},
+            self._test_no_gradients,
+        )
+
+    def _test_no_gradients(self, use_orig_params: bool):
+        lin_module = nn.Linear(24, 24)
+        mixed_precision_config = MixedPrecision(
+            param_dtype=torch.float16,
+            reduce_dtype=torch.float32,
+            buffer_dtype=torch.float32,
+        )
+        fsdp_module = FSDP(
+            lin_module,
+            sharding_strategy=ShardingStrategy.SHARD_GRAD_OP,
+            mixed_precision=mixed_precision_config,
+            device_id=self.rank,
+            use_orig_params=use_orig_params,
+        )
+        inp = torch.randn(32, 24, device="cuda")
+        fsdp_module(inp)
+        with self.assertWarnsRegex(
+            expected_warning=UserWarning,
+            expected_regex="on rank "
+            rf"{self.rank} with no gradients -- returning the total "
+            "norm in the default dtype torch.float32",
+        ):
+            total_norm = fsdp_module.clip_grad_norm_(1)
+        self.assertEqual(total_norm.dtype, torch.float32)
+        self.assertEqual(total_norm, torch.tensor(0.0, device="cuda"))
+
 
 instantiate_parametrized_tests(TestClipGradNorm)
 
diff --git a/test/distributed/fsdp/test_fsdp_comm_hooks.py b/test/distributed/fsdp/test_fsdp_comm_hooks.py
index e426ebe32328..3b023d735eee 100644
--- a/test/distributed/fsdp/test_fsdp_comm_hooks.py
+++ b/test/distributed/fsdp/test_fsdp_comm_hooks.py
@@ -69,7 +69,7 @@ def forward(self, x):
         return self.out(F.relu(self.net(x)))
 
 
-class DummyState(object):
+class DummyState:
 
     __slots__ = ["process_group", "noise"]
 
@@ -78,7 +78,7 @@ def __init__(self, process_group: dist.ProcessGroup, noise: int):
         self.noise = noise
 
 
-class DummyHook(object):
+class DummyHook:
     def dummy_hook_for_no_shard_fsdp(self, state: DummyState, grad: torch.Tensor):
         """
         This communication hook is for illustration and testing purpose only.
diff --git a/test/distributed/fsdp/test_fsdp_ignored_modules.py b/test/distributed/fsdp/test_fsdp_ignored_modules.py
index 3676acdbda54..d93a923f5f79 100644
--- a/test/distributed/fsdp/test_fsdp_ignored_modules.py
+++ b/test/distributed/fsdp/test_fsdp_ignored_modules.py
@@ -244,9 +244,9 @@ def _test_diff_ignored_modules_across_ranks(
             {"ignored_modules": layer1_ignored_modules}
             if ignore_modules
             else {
-                "ignored_parameters": set(
+                "ignored_parameters": {
                     p for m in layer1_ignored_modules for p in m.parameters()
-                )
+                }
             }
         )
         model.layer1 = FSDP(model.layer1, **ignore_kwargs)
@@ -260,9 +260,9 @@ def _test_diff_ignored_modules_across_ranks(
             {"ignored_modules": model_ignored_modules}
             if ignore_modules
             else {
-                "ignored_parameters": set(
+                "ignored_parameters": {
                     p for m in model_ignored_modules for p in m.parameters()
-                )
+                }
             }
         )
         wrapped_model = FSDP(model, **ignore_kwargs_top)
@@ -279,9 +279,9 @@ def test_ignored_modules_not_under_wrapped_root(self, ignore_modules: bool):
             {"ignored_modules": ignored_modules}
             if ignore_modules
             else {
-                "ignored_parameters": set(
+                "ignored_parameters": {
                     p for m in ignored_modules for p in m.parameters()
-                )
+                }
             }
         )
 
diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py
index fd1035d1042e..07822cd02e4b 100644
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@@ -95,6 +95,20 @@ def forward(self, x):
 
     @skip_if_lt_x_gpu(2)
     def test_fsdp_not_all_outputs_used_in_loss(self):
+        self.run_subtests(
+            {
+                "sharding_strategy": [
+                    ShardingStrategy.FULL_SHARD,
+                    ShardingStrategy.SHARD_GRAD_OP,
+                    ShardingStrategy.NO_SHARD,
+                ]
+            },
+            self._test_fsdp_not_all_outputs_used_in_loss,
+        )
+
+    def _test_fsdp_not_all_outputs_used_in_loss(
+        self, sharding_strategy: ShardingStrategy
+    ):
         class MyModule(nn.Module):
             def __init__(self):
                 super().__init__()
@@ -120,58 +134,52 @@ def _check_equal(local, fsdp):
                 for p1, p2 in zip(fsdp.parameters(), local.parameters()):
                     torch.testing.assert_close(p1, p2)
 
-        for sharding_strategy in [
-            ShardingStrategy.FULL_SHARD,
-            ShardingStrategy.SHARD_GRAD_OP,
-            ShardingStrategy.NO_SHARD,
-        ]:
-            with self.subTest(sharding_strategy=sharding_strategy):
-                fsdp_ctor = functools.partial(FSDP, sharding_strategy=sharding_strategy)
-                m = MyModule().cuda()
-                m_local = deepcopy(m)
-                local_m = m_local
-                prev_params = [p.clone() for p in m_local.parameters()]
-
-                m.lin1 = fsdp_ctor(m.lin1)
-                m = fsdp_ctor(m)
-                _check_equal(m_local, m)
-
-                opt = torch.optim.SGD(m.parameters(), lr=1e-3)
-                opt_local = torch.optim.SGD(local_m.parameters(), lr=1e-3)
-
-                for i in range(6):
-                    t = torch.ones(4, device="cuda")
-                    a, b = m(t)
-                    local_a, local_b = local_m(t)
-                    if i < 2:
-                        # use both params in loss computation. Later,
-                        # b will go unused and we check grads are the
-                        # same as local training.
-                        loss = (a @ b).sum()
-                        loss_local = (local_a @ local_b).sum()
-                    else:
-                        loss = a.sum()
-                        loss_local = local_a.sum()
-
-                    loss.backward()
-                    loss_local.backward()
-                    _check_resharded(m)
-                    opt.step()
-                    opt_local.step()
-                    _check_equal(m_local, m)
-                    # Ensure at least some change from previous params, otherwise
-                    # above check would be vacuously true.
-                    self.assertTrue(
-                        any(
-                            not torch.equal(p1, p2)
-                            for p1, p2 in zip(prev_params, m_local.parameters())
-                        )
-                    )
-                    prev_params = [p.clone() for p in local_m.parameters()]
-                    opt.zero_grad()
-                    opt_local.zero_grad()
-
-                dist.barrier()
+        fsdp_ctor = functools.partial(FSDP, sharding_strategy=sharding_strategy)
+        m = MyModule().cuda()
+        m_local = deepcopy(m)
+        local_m = m_local
+        prev_params = [p.clone() for p in m_local.parameters()]
+
+        m.lin1 = fsdp_ctor(m.lin1)
+        m = fsdp_ctor(m)
+        _check_equal(m_local, m)
+
+        opt = torch.optim.SGD(m.parameters(), lr=1e-3)
+        opt_local = torch.optim.SGD(local_m.parameters(), lr=1e-3)
+
+        for i in range(6):
+            t = torch.ones(4, device="cuda")
+            a, b = m(t)
+            local_a, local_b = local_m(t)
+            if i < 2:
+                # use both params in loss computation. Later,
+                # b will go unused and we check grads are the
+                # same as local training.
+                loss = (a @ b).sum()
+                loss_local = (local_a @ local_b).sum()
+            else:
+                loss = a.sum()
+                loss_local = local_a.sum()
+
+            loss.backward()
+            loss_local.backward()
+            _check_resharded(m)
+            opt.step()
+            opt_local.step()
+            _check_equal(m_local, m)
+            # Ensure at least some change from previous params, otherwise
+            # above check would be vacuously true.
+            self.assertTrue(
+                any(
+                    not torch.equal(p1, p2)
+                    for p1, p2 in zip(prev_params, m_local.parameters())
+                )
+            )
+            prev_params = [p.clone() for p in local_m.parameters()]
+            opt.zero_grad()
+            opt_local.zero_grad()
+
+        dist.barrier()
 
     @skip_if_lt_x_gpu(2)
     @parametrize("use_second_layer", [True, False])
diff --git a/test/distributed/fsdp/test_fsdp_mixed_precision.py b/test/distributed/fsdp/test_fsdp_mixed_precision.py
index 0c8b3225ae71..70eb0062e043 100644
--- a/test/distributed/fsdp/test_fsdp_mixed_precision.py
+++ b/test/distributed/fsdp/test_fsdp_mixed_precision.py
@@ -21,7 +21,6 @@
 from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
 from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy
 from torch.nn.modules.batchnorm import _BatchNorm
-from torch.testing._internal.common_cuda import CUDA11OrLater
 from torch.testing._internal.common_distributed import (
     SaveForwardInputsModel,
     skip_if_lt_x_gpu,
@@ -81,9 +80,7 @@
 # Nothing is cast (thus param, comm, grad, and buffer should be in the full precision)
 mp_no_mixed_precision = MixedPrecision()
 
-nccl_supports_bf16 = (
-    CUDA11OrLater and dist.is_nccl_available() and nccl.version() >= (2, 10)
-)
+nccl_supports_bf16 = dist.is_nccl_available() and nccl.version() >= (2, 10)
 
 mp_configs = [default_mp, mp_only_reduce, mp_only_param_and_buf, mp_no_mixed_precision]
 if nccl_supports_bf16:
@@ -667,7 +664,7 @@ def test_grads_reduced_precision(self):
     def test_mp_batchnorm(self, convert_sync_bn):
         class BatchNormNet(nn.Module):
             def __init__(self, affine=True):
-                super(BatchNormNet, self).__init__()
+                super().__init__()
                 self.fc1 = nn.Linear(2, 40, bias=False)
                 self.bn = nn.BatchNorm1d(4, affine=affine)
                 self.fc2 = nn.Linear(40, 4, bias=False)
diff --git a/test/distributed/fsdp/test_fsdp_optim_state.py b/test/distributed/fsdp/test_fsdp_optim_state.py
index 249f1ff35048..0cd93b1421e6 100644
--- a/test/distributed/fsdp/test_fsdp_optim_state.py
+++ b/test/distributed/fsdp/test_fsdp_optim_state.py
@@ -2,6 +2,7 @@
 
 import bisect
 import sys
+from copy import deepcopy
 from enum import auto, Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type
 
@@ -15,7 +16,12 @@
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp._shard_utils import _gather_state_dict
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
+    FullOptimStateDictConfig,
+    FullStateDictConfig,
     OptimStateKeyType,
+    ShardedOptimStateDictConfig,
+    ShardedStateDictConfig,
+    StateDictSettings,
     StateDictType,
 )
 from torch.distributed.optim import _NamedOptimizer
@@ -280,7 +286,7 @@ def param_group1(self) -> List[torch.nn.Parameter]:
 
 class TestFSDPOptimState(FSDPTest):
     def __init__(self, *args, **kwargs):
-        super(TestFSDPOptimState, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
         self._model_class = {
             _ModelClass.NESTED: self._init_nested_model,
             _ModelClass.TRANSFORMER: self._init_transformer_model,
@@ -428,7 +434,10 @@ def _check_same_state(
             # Check parameter keys are the same first for earlier erroring
             ref_osd_param_ids = set(ref_osd_state.keys())
             fsdp_osd_param_ids = set(fsdp_osd_state.keys())
-            self.assertTrue(ref_osd_param_ids == fsdp_osd_param_ids)
+            self.assertTrue(
+                ref_osd_param_ids == fsdp_osd_param_ids,
+                (ref_osd_param_ids, fsdp_osd_param_ids),
+            )
             # Check state values are the same
             for param_id, param_state in fsdp_osd_state.items():
                 for state_name, value in param_state.items():
@@ -657,6 +666,20 @@ def test_shard_full_optim_state_dict_nested(
             num_iters=3,
         )
 
+        self._test_load_optim_state_with_optim_state_dict(
+            _ModelClass.NESTED,
+            state_dict_settings=StateDictSettings(
+                StateDictType.FULL_STATE_DICT,
+                FullStateDictConfig(),
+                FullOptimStateDictConfig(),
+            ),
+            use_multiple_param_groups=False,
+            halve_world_size=False,
+            use_diff_optim_inputs=use_diff_optim_inputs,
+            wrap_alt=wrap_alt,
+            num_iters=3,
+        )
+
     @skip_if_lt_x_gpu(2)
     def test_shard_full_optim_state_dict_nested_halve_world_size(self):
         """Tests :meth:`shard_full_optim_state_dict` for a non-FSDP-root model
@@ -678,6 +701,20 @@ def test_shard_full_optim_state_dict_nested_halve_world_size(self):
             num_iters=3,
         )
 
+        self._test_load_optim_state_with_optim_state_dict(
+            _ModelClass.NESTED,
+            state_dict_settings=StateDictSettings(
+                StateDictType.FULL_STATE_DICT,
+                FullStateDictConfig(),
+                FullOptimStateDictConfig(),
+            ),
+            use_multiple_param_groups=use_multiple_param_groups,
+            halve_world_size=True,
+            use_diff_optim_inputs=use_diff_optim_inputs,
+            wrap_alt=wrap_alt,
+            num_iters=3,
+        )
+
     @skip_if_lt_x_gpu(2)
     def test_shard_full_optim_state_dict_transformer(self) -> None:
         """Tests :meth:`shard_full_optim_state_dict` for an FSDP-root
@@ -693,6 +730,19 @@ def test_shard_full_optim_state_dict_transformer(self) -> None:
             num_iters=3,
         )
 
+        self._test_load_optim_state_with_optim_state_dict(
+            _ModelClass.TRANSFORMER,
+            state_dict_settings=StateDictSettings(
+                StateDictType.FULL_STATE_DICT,
+                FullStateDictConfig(),
+                FullOptimStateDictConfig(),
+            ),
+            use_multiple_param_groups=False,
+            halve_world_size=True,
+            use_diff_optim_inputs=False,
+            num_iters=3,
+        )
+
     @skip_if_lt_x_gpu(2)
     @parametrize("use_multiple_param_groups", [False, True])
     @parametrize("wrap_alt", [False, True])
@@ -717,6 +767,20 @@ def test_scatter_full_optim_state_dict_nested(
             num_iters=3,
         )
 
+        self._test_load_optim_state_with_optim_state_dict(
+            _ModelClass.NESTED,
+            state_dict_settings=StateDictSettings(
+                StateDictType.FULL_STATE_DICT,
+                FullStateDictConfig(),
+                FullOptimStateDictConfig(rank0_only=True),
+            ),
+            use_multiple_param_groups=use_multiple_param_groups,
+            halve_world_size=False,
+            use_diff_optim_inputs=use_diff_optim_inputs,
+            wrap_alt=wrap_alt,
+            num_iters=3,
+        )
+
     @skip_if_lt_x_gpu(2)
     def test_scatter_full_optim_state_dict_nested_halve_world_size(self):
         """Tests :meth:`scatter_full_optim_state_dict` for a non-FSDP-root
@@ -738,6 +802,20 @@ def test_scatter_full_optim_state_dict_nested_halve_world_size(self):
             num_iters=3,
         )
 
+        self._test_load_optim_state_with_optim_state_dict(
+            _ModelClass.NESTED,
+            state_dict_settings=StateDictSettings(
+                StateDictType.FULL_STATE_DICT,
+                FullStateDictConfig(),
+                FullOptimStateDictConfig(rank0_only=True),
+            ),
+            use_multiple_param_groups=use_multiple_param_groups,
+            halve_world_size=True,
+            use_diff_optim_inputs=use_diff_optim_inputs,
+            wrap_alt=wrap_alt,
+            num_iters=3,
+        )
+
     @skip_if_lt_x_gpu(2)
     def test_scatter_full_optim_state_dict_transformer(self) -> None:
         """Tests :meth:`scatter_full_optim_state_dict` for an FSDP-root
@@ -753,6 +831,19 @@ def test_scatter_full_optim_state_dict_transformer(self) -> None:
             num_iters=3,
         )
 
+        self._test_load_optim_state_with_optim_state_dict(
+            _ModelClass.TRANSFORMER,
+            state_dict_settings=StateDictSettings(
+                StateDictType.FULL_STATE_DICT,
+                FullStateDictConfig(),
+                FullOptimStateDictConfig(rank0_only=True),
+            ),
+            use_multiple_param_groups=False,
+            halve_world_size=True,
+            use_diff_optim_inputs=False,
+            num_iters=3,
+        )
+
     @skip_if_lt_x_gpu(2)
     def test_flatten_sharded_optim_state_dict_nested(self) -> None:
         """Tests :meth:`flatten_sharded_optim_state_dict` for an FSDP-root
@@ -768,6 +859,20 @@ def test_flatten_sharded_optim_state_dict_nested(self) -> None:
             num_iters=3,
         )
 
+        self._test_load_optim_state_with_optim_state_dict(
+            _ModelClass.NESTED,
+            state_dict_settings=StateDictSettings(
+                StateDictType.SHARDED_STATE_DICT,
+                ShardedStateDictConfig(),
+                ShardedOptimStateDictConfig(),
+            ),
+            use_multiple_param_groups=False,
+            halve_world_size=False,
+            use_diff_optim_inputs=False,
+            wrap_alt=True,
+            num_iters=3,
+        )
+
     @skip_if_lt_x_gpu(2)
     def test_flatten_sharded_optim_state_dict_transformer(self) -> None:
         """Tests :meth:`flatten_sharded_optim_state_dict` for an FSDP-root
@@ -782,18 +887,64 @@ def test_flatten_sharded_optim_state_dict_transformer(self) -> None:
             num_iters=3,
         )
 
+        self._test_load_optim_state_with_optim_state_dict(
+            _ModelClass.TRANSFORMER,
+            state_dict_settings=StateDictSettings(
+                StateDictType.SHARDED_STATE_DICT,
+                ShardedStateDictConfig(),
+                ShardedOptimStateDictConfig(),
+            ),
+            use_multiple_param_groups=False,
+            halve_world_size=False,
+            use_diff_optim_inputs=False,
+            num_iters=3,
+        )
+
     @skip_if_lt_x_gpu(2)
     def test_use_orig_params(self) -> None:
         """Tests :meth:`optim_state_dict` for an FSDP-root nested model."""
-        self._test_load_optim_state(
+        self._test_load_optim_state_with_optim_state_dict(
             _ModelClass.NESTED,
+            state_dict_settings=StateDictSettings(
+                StateDictType.FULL_STATE_DICT,
+                FullStateDictConfig(),
+                FullOptimStateDictConfig(),
+            ),
+            use_multiple_param_groups=False,
+            halve_world_size=False,
+            use_diff_optim_inputs=False,
+            wrap_alt=True,
+            num_iters=3,
+            fsdp_kwargs={"use_orig_params": True},
+        )
+
+        self._test_load_optim_state_with_optim_state_dict(
+            _ModelClass.NESTED,
+            state_dict_settings=StateDictSettings(
+                StateDictType.FULL_STATE_DICT,
+                FullStateDictConfig(),
+                FullOptimStateDictConfig(rank0_only=True),
+            ),
+            use_multiple_param_groups=False,
+            halve_world_size=False,
+            use_diff_optim_inputs=False,
+            wrap_alt=True,
+            num_iters=3,
+            fsdp_kwargs={"use_orig_params": True},
+        )
+
+        self._test_load_optim_state_with_optim_state_dict(
+            _ModelClass.NESTED,
+            state_dict_settings=StateDictSettings(
+                StateDictType.SHARDED_STATE_DICT,
+                ShardedStateDictConfig(),
+                ShardedOptimStateDictConfig(),
+            ),
             use_multiple_param_groups=False,
             halve_world_size=False,
-            osd_comm_method=_OSDCommMethod.OPTIM_STATE_DICT,
             use_diff_optim_inputs=False,
-            use_optim_input=False,
             wrap_alt=True,
-            num_iters=1,
+            num_iters=3,
             fsdp_kwargs={"use_orig_params": True},
         )
 
@@ -822,7 +973,7 @@ def _test_load_optim_state(
         """
         initializer = self._model_class[model_class]
         if osd_comm_method == _OSDCommMethod.OPTIM_STATE_DICT:
-            osd_method = FSDP._optim_state_dict
+            osd_method = FSDP.optim_state_dict
         elif osd_comm_method == _OSDCommMethod.FLATTEN_SHARDED_OSD:
             osd_method = FSDP.sharded_optim_state_dict
         else:
@@ -928,8 +1079,8 @@ def _test_load_optim_state(
                 optim=optim2,
             )
         elif osd_comm_method == _OSDCommMethod.OPTIM_STATE_DICT:
-            sharded_osd1 = FSDP._optim_state_dict_to_load(fsdp_osd1, model2, optim2)
-            sharded_osd2 = FSDP._optim_state_dict_to_load(fsdp_osd2, model2, optim2)
+            sharded_osd1 = FSDP.optim_state_dict_to_load(fsdp_osd1, model2, optim2)
+            sharded_osd2 = FSDP.optim_state_dict_to_load(fsdp_osd2, model2, optim2)
 
         # As a sanity check, check that sharding the second model's full/sharded
         # optimizer state dict according to itself is equivalent to its local
@@ -960,9 +1111,8 @@ def _test_load_optim_state(
             check_same_param_keys=check_same_param_keys,
         )
         # As a sanity check, check that we can load and run a few iterations
-        if osd_comm_method != _OSDCommMethod.FLATTEN_SHARDED_OSD:
-            optim2.load_state_dict(sharded_osd1)
-            self._step_model(model2, optim2, num_iters=num_iters)
+        optim2.load_state_dict(sharded_osd2)
+        self._step_model(model2, optim2, num_iters=num_iters)
 
     @skip_if_lt_x_gpu(2)
     @parametrize("state_dict_type", STATE_DICT_TYPES)
@@ -1275,31 +1425,6 @@ def get_warning_context():
             should_check_method, get_warning_context, fsdp_kwargs=None
         )
 
-    @skip_if_lt_x_gpu(2)
-    def test_use_orig_params_error(self):
-        """Tests that the optimizer state checkpointing APIs raise an error
-        when ``use_orig_params=True``."""
-
-        def should_check_method(method_name: str):
-            # Skip `rekey_optim_state_dict` since that does not depend on
-            # `use_orig_params=True`
-            return method_name not in (
-                "rekey_optim_state_dict",
-                "full_optim_state_dict",
-                "shard_full_optim_state_dict",
-            )
-
-        def get_error_context():
-            error_regex = "Optimizer state checkpointing is not supported yet for `use_orig_params=True`"
-            return self.assertRaisesRegex(
-                expected_exception=NotImplementedError, expected_regex=error_regex
-            )
-
-        fsdp_kwargs = {"use_orig_params": True}
-        self._run_on_all_optim_state_apis(
-            should_check_method, get_error_context, fsdp_kwargs
-        )
-
     def _run_on_all_optim_state_apis(
         self,
         should_check_method_fn: Callable[[str], bool],
@@ -1441,11 +1566,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         optim.step()
 
     @skip_if_lt_x_gpu(2)
-    def test_compatible_with_named_optimizer(self):
-        class TestDummyModel(torch.nn.Module):
+    def test_compatible_with_trec(self):
+        class DenseModel(torch.nn.Module):
             def __init__(self):
-                super(TestDummyModel, self).__init__()
-                torch.manual_seed(0)
+                super().__init__()
                 self.net1 = nn.Sequential(nn.Linear(8, 16), nn.ReLU())
                 self.net2 = nn.Sequential(nn.Linear(16, 32), nn.ReLU())
                 self.net3 = nn.Linear(32, 64)
@@ -1454,29 +1578,50 @@ def __init__(self):
             def forward(self, x):
                 return self.net4(self.net3(self.net2(self.net1(x))))
 
-        models = []
-        optims = []
-        state_dicts = []
-        models.append(FSDP(TestDummyModel().cuda(), use_orig_params=True))
-        optims.append(torch.optim.Adam(models[-1].parameters(), lr=1e-2))
-        models.append(FSDP(TestDummyModel().cuda(), use_orig_params=True))
-        optims.append(
+        class FakeMPModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                torch.manual_seed(0)
+                self.dense = FSDP(DenseModel().cuda(), use_orig_params=True)
+                if dist.get_rank() == 0:
+                    self.sparse0 = nn.Sequential(nn.Linear(8, 8), nn.ReLU())
+                else:
+                    self.sparse1 = nn.Sequential(nn.Linear(8, 8), nn.ReLU())
+
+            def forward(self, x):
+                if dist.get_rank() == 0:
+                    sparse = self.sparse0(x)
+                else:
+                    sparse = self.sparse1(x)
+                dist.all_reduce(sparse)
+                return self.dense(sparse)
+
+        models = [FakeMPModel().cuda(), FakeMPModel().cuda()]
+        optims = [
+            torch.optim.Adam(models[0].parameters(), lr=1e-2),
             _NamedOptimizer(
-                models[-1].named_parameters(),
+                models[1].named_parameters(),
                 torch.optim.Adam,
-                [{"params": models[-1].parameters()}],
-                models[-1],
+                [{"params": models[1].parameters()}],
+                models[1],
                 lr=1e-2,
-            )
-        )
+            ),
+        ]
+        state_dicts = []
 
         # Train one batch and see if optim_state_dict are the same.
-        batch = torch.rand(5, 8)
+        batch = torch.rand(5, 8, device=torch.device("cuda"))
         for model, optim in zip(models, optims):
+            # Eagerly initialize the states
+            for param in model.parameters():
+                if param.requires_grad:
+                    t = torch.zeros_like(param)
+                    param.grad = torch.autograd.Variable(t)
+            optim.step()
             loss = model(batch).sum()
             loss.backward()
             optim.step()
-            state_dicts.append(FSDP._optim_state_dict(model, optim))
+            state_dicts.append(deepcopy(FSDP.optim_state_dict(model, optim)))
 
         self._check_same_param_groups(
             state_dicts[0], state_dicts[1], check_same_param_keys=False
@@ -1487,14 +1632,17 @@ def forward(self, x):
 
         # Make optim1 has a different state.
         for i in range(5):
-            batch = torch.rand(5, 8)
+            batch = torch.rand(5, 8).cuda()
             loss = models[1](batch).sum()
             loss.backward()
             optims[1].step()
 
         # Load the state back to see if load_optim_state_dict works.
-        optims[1].load_state_dict(state_dicts[1])
-        state_dicts[1] = FSDP._optim_state_dict(models[1], optims[1])
+        state_dict_to_load = FSDP.optim_state_dict_to_load(
+            state_dicts[1], models[1], optims[1], is_named_optimizer=True
+        )
+        optims[1].load_state_dict(state_dict_to_load)
+        state_dicts[1] = FSDP.optim_state_dict(models[1], optims[1])
 
         self._check_same_param_groups(
             state_dicts[0], state_dicts[1], check_same_param_keys=False
@@ -1503,6 +1651,130 @@ def forward(self, x):
             state_dicts[0], state_dicts[1], check_same_param_keys=True
         )
 
+    @skip_if_lt_x_gpu(2)
+    def test_with_empty_optimizer_state(self):
+        class TestDummyModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                torch.manual_seed(0)
+                self.net1 = nn.Sequential(nn.Linear(8, 16), nn.ReLU())
+                self.net2 = nn.Sequential(nn.Linear(16, 32), nn.ReLU())
+                self.net3 = nn.Linear(32, 64)
+                self.net4 = nn.Sequential(nn.ReLU(), nn.Linear(64, 8))
+
+            def forward(self, x):
+                return self.net4(self.net3(self.net2(self.net1(x))))
+
+        model = FSDP(TestDummyModel().cuda())
+        optim = torch.optim.Adam(model.parameters(), lr=1e-2)
+        state_dict = optim.state_dict()
+        gathered_state_dict = FSDP.optim_state_dict(model, optim)
+        self.assertEqual(gathered_state_dict["state"], state_dict["state"])
+
+    def _test_load_optim_state_with_optim_state_dict(
+        self,
+        model_class: _ModelClass,
+        state_dict_settings: StateDictSettings,
+        use_multiple_param_groups: bool,
+        halve_world_size: bool,
+        use_diff_optim_inputs: bool,
+        num_iters: int,
+        **new_model_kwargs,
+    ):
+        """
+        (1) Runs a model with full world size for K iterations to generate a
+        full/sharded optimizer state dict;
+        (2) initializes a model with halved world size and possibly different
+        FSDP wrapping scheme (based on ``new_model_kwargs``);
+        (3) loads the full/sharded optimizer state dict from (1) according to the
+        halved-world-size model;
+        (4) runs the halved-world-size model for K iterations; and
+        (5) checks that the sharded optimizer state dict from (3) matches the
+        halved-world-size model's local optimizer state dict, meaning that the
+        former could have equivalently been loaded into the local optimizer.
+        """
+        initializer = self._model_class[model_class]
+
+        # First, run a wrapped model with full world size for a few iterations
+        model1, optim1, optim_input1 = initializer(
+            wrap=True,
+            use_multiple_param_groups=use_multiple_param_groups,
+        )
+        FSDP.set_state_dict_type(
+            model1,
+            state_dict_settings.state_dict_type,
+            state_dict_settings.state_dict_config,
+            state_dict_settings.optim_state_dict_config,
+        )
+        self._step_model(model1, optim1, num_iters=num_iters)
+        fsdp_osd1 = FSDP.optim_state_dict(model1, optim1)
+        if halve_world_size:
+            # Create a new process group with halved world size
+            new_group_ranks = [r for r in range(self.world_size) if r % 2 == 0]
+            new_group = dist.new_group(ranks=new_group_ranks)
+            if self.rank not in new_group_ranks:
+                return
+        else:
+            # Continue using the same group and hence world size
+            new_group = dist.distributed_c10d._get_default_group()
+        # Second, run a wrapped model with (possibly) halved world size and
+        # (possibly) differing `optim_input` across ranks
+        model2, optim2, optim_input2 = initializer(
+            wrap=True,
+            group=new_group,
+            use_multiple_param_groups=use_multiple_param_groups,
+            use_diff_optim_inputs=use_diff_optim_inputs,
+            **new_model_kwargs,  # specify `wrap_alt` to change wrapping
+        )
+        FSDP.set_state_dict_type(
+            model2,
+            state_dict_settings.state_dict_type,
+            state_dict_settings.state_dict_config,
+            state_dict_settings.optim_state_dict_config,
+        )
+        self._step_model(model2, optim2, num_iters=num_iters)
+        fsdp_osd2 = FSDP.optim_state_dict(model2, optim2, group=new_group)
+        # Compute two sharded optim state dicts: (1) for the first model
+        # according to the second model and (2) for the second model according
+        # to the second model
+        sharded_osd1 = FSDP.optim_state_dict_to_load(
+            fsdp_osd1, model2, optim2, group=new_group
+        )
+        sharded_osd2 = FSDP.optim_state_dict_to_load(
+            fsdp_osd2, model2, optim2, group=new_group
+        )
+
+        # As a sanity check, check that sharding the second model's full/sharded
+        # optimizer state dict according to itself is equivalent to its local
+        # optimizer's state dict
+        local_osd2 = optim2.state_dict()
+        self._check_same_param_groups(
+            sharded_osd2,
+            local_osd2,
+            check_same_param_keys=True,
+        )
+        self._check_same_state(
+            sharded_osd2,
+            local_osd2,
+            check_same_param_keys=True,
+        )
+        # Check that sharding the first model's full/sharded optimizer state dict
+        # according to the second model is equivalent to the second model's
+        # local optimizer state dict
+        self._check_same_param_groups(
+            sharded_osd1,
+            local_osd2,
+            check_same_param_keys=True,
+        )
+        self._check_same_state(
+            sharded_osd1,
+            local_osd2,
+            check_same_param_keys=True,
+        )
+        # As a sanity check, check that we can load and run a few iterations
+        optim2.load_state_dict(sharded_osd2)
+        self._step_model(model2, optim2, num_iters=num_iters)
+
 
 instantiate_parametrized_tests(TestFSDPOptimState)
 
diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py
index d56e4f911f49..365e11afac16 100644
--- a/test/distributed/fsdp/test_fsdp_state_dict.py
+++ b/test/distributed/fsdp/test_fsdp_state_dict.py
@@ -1,5 +1,6 @@
 # Owner(s): ["oncall: distributed"]
 
+import io
 import itertools
 import sys
 from contextlib import suppress
@@ -10,6 +11,7 @@
 import torch
 import torch.nn as nn
 from torch import distributed as dist
+from torch.distributed._shard.sharded_tensor import ShardedTensor
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     apply_activation_checkpointing,
     checkpoint_wrapper,
@@ -116,16 +118,28 @@ def _broadcast_state_dict(self, model, state_dict):
         # TODO (rohan-varma): remove model
         return _broadcast_state_dict(self.rank, state_dict)
 
-    def _compare_models(self, model, model_new, assert_fn, check_fp16=False):
+    def _state_compare(self, model, model_new, assert_fn, state_generator="parameters"):
+        state_base = list(getattr(model, state_generator)())
+        state_new = list(getattr(model_new, state_generator)())
+        # Regardless of `assert_fn`, the number of parameters should be the same
+        self.assertEqual(len(state_base), len(state_new))
+        assert_fn(state_base, state_new)
+
+    def _compare_models(
+        self, model, model_new, assert_fn, check_fp16=False, check_buffers=True
+    ):
         assert assert_fn in (self.assertEqual, self.assertNotEqual)
         with FSDP.summon_full_params(model):
             with FSDP.summon_full_params(model_new):
-                params = list(model.parameters())
-                params_new = list(model_new.parameters())
-                # Regardless of `assert_fn`, the number of parameters should be
-                # the same
-                self.assertEqual(len(params), len(params_new))
-                assert_fn(params, params_new)
+                self._state_compare(model, model_new, assert_fn)
+                if check_buffers:
+                    has_buffers = any(
+                        [len(list(m.buffers())) for m in (model, model_new)]
+                    )
+                    if has_buffers:
+                        self._state_compare(
+                            model, model_new, assert_fn, state_generator="buffers"
+                        )
                 if check_fp16:
                     for tensor in model_new.parameters():
                         self.assertEqual(tensor.dtype, torch.float16)
@@ -157,6 +171,40 @@ def _get_simple_model(self, *fsdp_args, checkpoint_wrap=False, **fsdp_kwargs):
         model = FSDP(lin, *fsdp_args, **fsdp_kwargs)
         return model
 
+    def _get_multibuffer_nested_model(
+        self, *fsdp_args, wrap=True, checkpoint_wrap=False, **fsdp_kwargs
+    ):
+        full_p = torch.float32
+        lin_mp = fsdp_kwargs.pop("mixed_precision", None)
+        bn_mp = (
+            MixedPrecision(param_dtype=full_p, reduce_dtype=full_p, buffer_dtype=full_p)
+            if lin_mp
+            else None
+        )
+        if wrap:
+            lin1 = nn.Linear(10, 10, bias=False).cuda()
+            bn1 = nn.BatchNorm1d(10).cuda()
+            lin2 = nn.Linear(10, 10, bias=False).cuda()
+            if checkpoint_wrap:
+                lin1 = checkpoint_wrapper(lin1)
+                bn1 = checkpoint_wrapper(bn1)
+                lin2 = checkpoint_wrapper(lin2)
+            seq = nn.Sequential(
+                FSDP(lin1, mixed_precision=lin_mp, *fsdp_args, **fsdp_kwargs),
+                FSDP(bn1, mixed_precision=bn_mp, *fsdp_args, **fsdp_kwargs),
+                lin2,
+            )
+            if checkpoint_wrap:
+                seq = checkpoint_wrapper(seq)
+            model = FSDP(seq, *fsdp_args, **fsdp_kwargs)
+        else:
+            model = nn.Sequential(
+                nn.Linear(10, 10, bias=False).cuda(),
+                nn.BatchNorm1d(10).cuda(),
+                nn.Linear(10, 10, bias=False).cuda(),
+            )
+        return model
+
     def _get_non_fsdp_root_module(self, *fsdp_args, wrap=True, **fsdp_kwargs):
         class FSDPContainer(nn.Module):
             def __init__(self, fsdp_1, fsdp_2):
@@ -438,7 +486,7 @@ def test_state_dict_rank0_offload_save_load_flow(self, use_orig_params: bool):
     @parametrize("use_orig_params", [True, False])
     def test_basic_save_and_load_state_dict(
         self,
-        state_dict_type: StateDictType,
+        state_dict_type: str,
         cpu_offload: bool,
         fp16: bool,
         state_dict_rank0_and_offload: bool,
@@ -502,7 +550,7 @@ def test_basic_save_and_load_state_dict(
                 model_new.half()
 
             # zero the model to ensure parameters are different.
-            _zero_model(model_new)
+            _zero_model(model_new, zero_buffers=True)
             self._compare_models(model, model_new, self.assertNotEqual)
 
             # Verify parameters are the same in the new model.
@@ -513,6 +561,73 @@ def test_basic_save_and_load_state_dict(
 
             self._compare_models(model, model_new, self.assertEqual, check_fp16=fp16)
 
+    @skip_if_lt_x_gpu(2)
+    @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS)
+    @parametrize(
+        "cpu_offload",
+        [CPUOffload(offload_params=True), CPUOffload(offload_params=False)],
+    )
+    @parametrize("mixed_precision", [True, False])
+    @parametrize("state_dict_rank0_and_offload", [True, False])
+    @parametrize("use_orig_params", [True, False])
+    def test_buffers_save_and_load_state_dict(
+        self,
+        state_dict_type: str,
+        cpu_offload: bool,
+        mixed_precision: bool,
+        state_dict_rank0_and_offload: bool,
+        use_orig_params: bool,
+    ):
+        """
+        Tests that we can save a state_dict and load it for modules with persistent buffers, including
+        in the context of non-default mixed precision, different ``state_dict_type`` s and CPU offloading.
+        """
+        if (state_dict_rank0_and_offload and state_dict_type != "state_dict") or (
+            use_orig_params and state_dict_type not in _UNFLATTENED_STATE_DICT_IMPLS
+        ):
+            return  # not supported
+        mixed_precision = (
+            MixedPrecision(
+                param_dtype=torch.float16,
+                reduce_dtype=torch.float16,
+                buffer_dtype=torch.float16,
+            )
+            if mixed_precision
+            else None
+        )
+        model_call = partial(
+            self._get_multibuffer_nested_model,
+            cpu_offload=cpu_offload,
+            use_orig_params=use_orig_params,
+            mixed_precision=mixed_precision,
+        )
+        model = model_call()
+        ctx = self._get_state_dict_mgr(
+            model, state_dict_type, state_dict_rank0_and_offload
+        )
+        with ctx:
+            fsdp_state_dict = _get_state_dict(model, cpu_offload.offload_params, False)
+
+        self._validate_state_dict_contents(
+            model, fsdp_state_dict, state_dict_rank0_and_offload
+        )
+
+        model_new = model_call()
+        if not cpu_offload.offload_params:
+            model_new = model_new.cuda()
+
+        # zero the model to ensure parameters are different.
+        _zero_model(model_new, zero_buffers=True)
+        self._compare_models(model, model_new, self.assertNotEqual)
+
+        # Verify parameters are the same in the new model.
+        if state_dict_rank0_and_offload:
+            fsdp_state_dict = self._broadcast_state_dict(model, fsdp_state_dict)
+        with FSDP.state_dict_type(model_new, STATE_DICT_MAPPING[state_dict_type]):
+            model_new.load_state_dict(fsdp_state_dict, strict=True)
+
+        self._compare_models(model, model_new, self.assertEqual)
+
     @skip_if_lt_x_gpu(2)
     @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS)
     @parametrize("mixed_precision", [True, False])
@@ -655,24 +770,27 @@ def _dist_train(
     @skip_if_lt_x_gpu(2)
     @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS)
     def test_state_dict_save_load_flow(self, state_dict_type):
-        for move_to_cpu in [True, False]:
-            with self.subTest(move_to_cpu=move_to_cpu):
-                fsdp_params = self._dist_train(
-                    wrap_fsdp=True,
-                    state_dict_type=state_dict_type,
-                    move_to_cpu=move_to_cpu,
-                )
-                ddp_params = self._dist_train(wrap_fsdp=False)
-                self.assertEqual(ddp_params, fsdp_params)
+        self.run_subtests(
+            {"move_to_cpu": [True, False]},
+            self._test_state_dict_save_load_flow,
+            state_dict_type=state_dict_type,
+        )
+
+    def _test_state_dict_save_load_flow(self, state_dict_type, move_to_cpu):
+        fsdp_params = self._dist_train(
+            wrap_fsdp=True,
+            state_dict_type=state_dict_type,
+            move_to_cpu=move_to_cpu,
+        )
+        ddp_params = self._dist_train(wrap_fsdp=False)
+        self.assertEqual(ddp_params, fsdp_params)
 
     @skip_if_lt_x_gpu(2)
     @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS)
     def test_fsdp_state_dict_keys(self, state_dict_type):
         state_dict = self._state_dict(self._initialize_model(True), state_dict_type)
         if state_dict_type == "local_state_dict":
-            self.assertEqual(
-                set([FLAT_PARAM, f"inner.{FLAT_PARAM}"]), state_dict.keys()
-            )
+            self.assertEqual({FLAT_PARAM, f"inner.{FLAT_PARAM}"}, state_dict.keys())
         elif state_dict_type in ("state_dict", "sharded_state_dict"):
             # Keys should match local model.
             local_model = self._initialize_model(wrap_fsdp=False, wrap_ddp=False)
@@ -928,6 +1046,49 @@ def test_state_dict_type(self):
         for module in FSDP.fsdp_modules(fsdp):
             self.assertEqual(module._state_dict_type, StateDictType.FULL_STATE_DICT)
 
+    @skip_if_lt_x_gpu(2)
+    def test_local_state_dict_with_empty_ranks(self):
+        class Model(Module):
+            def __init__(self):
+                super().__init__()
+                self.my_tensor = torch.full((1,), 3.1415926)
+                self.my_parameter = nn.Parameter(self.my_tensor)
+
+            def forward(self, x):
+                return self.my_parameter
+
+        model = FSDP(Model().cuda())
+        with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT):
+            out = model(None)
+            out.backward()
+
+            state_dict = deepcopy(model.state_dict())
+            with torch.no_grad():
+                with FSDP.summon_full_params(model):
+                    self.assertEqual(model.my_parameter.item(), 3.1415926)
+                    model.my_parameter.copy_(torch.full((1,), 1.75).cuda())
+                    self.assertEqual(model.my_parameter.item(), 1.75)
+            model.load_state_dict(state_dict)
+            with FSDP.summon_full_params(model):
+                self.assertEqual(model.my_parameter.item(), 3.1415926)
+
+    @skip_if_lt_x_gpu(2)
+    def test_torch_save_load(self):
+        model = Model(wrap_fsdp=True).cuda()
+        with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT):
+            state_dict = model.state_dict()
+            checkpoint = io.BytesIO()
+            torch.save(state_dict, checkpoint)
+            checkpoint.seek(0)
+            state_dict_saved = torch.load(checkpoint)
+            for k, v in state_dict_saved.items():
+                if isinstance(v, ShardedTensor):
+                    self.assertEqual(
+                        v._local_shards[0].tensor, state_dict[k]._local_shards[0].tensor
+                    )
+                else:
+                    self.assertEqual(v, state_dict[k])
+
 
 instantiate_parametrized_tests(TestFSDPStateDict)
 
diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py
deleted file mode 100644
index 18055dbebffb..000000000000
--- a/test/distributed/fsdp/test_fsdp_summon_full_params.py
+++ /dev/null
@@ -1,752 +0,0 @@
-# Owner(s): ["oncall: distributed"]
-import contextlib
-import itertools
-import math
-import sys
-from copy import deepcopy
-from typing import List, Optional
-
-import torch
-import torch.nn as nn
-from torch import distributed as dist
-from torch.distributed.fsdp import (
-    CPUOffload,
-    FullyShardedDataParallel as FSDP,
-    MixedPrecision,
-    ShardingStrategy,
-)
-from torch.distributed.fsdp.flat_param import FlatParamHandle
-from torch.distributed.fsdp.wrap import enable_wrap, wrap
-from torch.nn.parallel.distributed import DistributedDataParallel as DDP
-from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.testing._internal.common_fsdp import (
-    CUDAInitMode,
-    DeterministicModel,
-    FSDPInitMode,
-    FSDPTest,
-    NestedWrappedModule,
-    TransformerWithSharedParams,
-)
-from torch.testing._internal.common_utils import (
-    instantiate_parametrized_tests,
-    parametrize,
-    run_tests,
-    TEST_WITH_DEV_DBG_ASAN,
-)
-
-if not dist.is_available():
-    print("Distributed not available, skipping tests", file=sys.stderr)
-    sys.exit(0)
-
-if TEST_WITH_DEV_DBG_ASAN:
-    print(
-        "Skip dev-asan as torch + multiprocessing spawn have known issues",
-        file=sys.stderr,
-    )
-    sys.exit(0)
-
-
-def _run_test_summon_full_param_writeback(
-    cls, writeback, modify_outer, *fsdp_args, **fsdp_kwargs
-):
-    with enable_wrap(wrapper_cls=FSDP, *fsdp_args, **fsdp_kwargs):
-        lin1 = wrap(nn.Linear(5, 5, bias=False).cuda(cls.rank))
-        lin2 = nn.Linear(5, 3, bias=False).cuda(cls.rank)
-        model = wrap(nn.Sequential(lin1, lin2))
-
-    # set the value
-    outer_param = model._handles[0].flat_param
-    inner_param = model.module[0]._handles[0].flat_param
-    p = outer_param if modify_outer else inner_param
-
-    with torch.no_grad():
-        # This sets the local shard value
-        p[0] = cls.rank + 2
-
-    with model.summon_full_params(model, writeback=writeback):
-        with torch.no_grad():
-            p.copy_(torch.zeros_like(p))
-
-    if writeback or cls.world_size == 1:
-        # When world_size = 1, FSDP does not shard and parameter is not set to
-        # a local shard, so write is always reflected.
-        cls.assertEqual(p.cpu()[0], 0)
-    else:
-        cls.assertEqual(p.cpu()[0], cls.rank + 2)
-
-
-class TestSummonFullParamsNoShard(FSDPTest):
-    @property
-    def world_size(self):
-        return 1  # does not shard
-
-    @skip_if_lt_x_gpu(2)
-    # TODO: CPUOffload summon + writeback does not
-    # work when param is not sharded
-    # (currently when world_size == 1)
-    def test_summon_full_param_writeback(self):
-        subtest_config = {
-            "writeback": [True, False],
-            "modify_outer": [True, False],
-            "mixed_precision": [MixedPrecision(param_dtype=torch.float16), None],
-            "use_orig_params": [True, False],
-        }
-        self.run_subtests(
-            subtest_config,
-            _run_test_summon_full_param_writeback,
-            cls=self,
-            cpu_offload=CPUOffload(offload_params=False),
-        )
-
-
-class TestSummonFullParams(FSDPTest):
-    @property
-    def world_size(self):
-        return 2
-
-    def get_model_param_count(self, m):
-        return sum([p.numel() for p in m.parameters()])
-
-    # padding ensures that all shards have the same size with the least amount of padding
-    def get_expected_sharded_size(self, global_size):
-        return int(math.ceil(global_size / self.world_size))
-
-    @skip_if_lt_x_gpu(2)
-    def test_summon_full_param_writeback(self):
-        subtest_config = {
-            "writeback": [True, False],
-            "modify_outer": [True, False],
-            "mixed_precision": [MixedPrecision(param_dtype=torch.float16), None],
-            "cpu_offload": [
-                CPUOffload(offload_params=False),
-                CPUOffload(offload_params=True),
-            ],
-            "use_orig_params": [True, False],
-        }
-        self.run_subtests(
-            subtest_config,
-            _run_test_summon_full_param_writeback,
-            cls=self,
-        )
-
-    @skip_if_lt_x_gpu(2)
-    @parametrize("mixed_precision", [True, False])
-    def test_summon_full_param_shard_value(self, mixed_precision):
-        mixed_precision = (
-            MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
-        )
-        raw_model = nn.Linear(10, 11)
-        raw_model_size = self.get_model_param_count(raw_model)
-        expected_shard_size = self.get_expected_sharded_size(raw_model_size)
-
-        model = FSDP(raw_model.cuda(self.rank), mixed_precision=mixed_precision)
-        self.assertEqual(expected_shard_size, self.get_model_param_count(model))
-
-        # we're assuming a single flattened param
-        self.assertEqual(1, len(list(model.parameters())))
-
-        my_shard = torch.clone(next(model.parameters()))
-
-        with model.summon_full_params(model):
-            self.assertEqual(raw_model_size, self.get_model_param_count(model))
-            parameters = list(model.parameters())
-            all_shards = FlatParamHandle.flatten_params(parameters, requires_grad=False)
-            my_slice = torch.chunk(all_shards, self.world_size)[self.rank]
-
-            # shards are padded but the full_param tensor is not
-            a, b = my_shard[0 : my_slice.numel()], my_slice
-            self.assertTrue(
-                torch.equal(my_shard[0 : my_slice.numel()].cpu(), my_slice.cpu())
-            )
-
-    @skip_if_lt_x_gpu(2)
-    @parametrize("recurse", [True, False])
-    @parametrize("summon_outer", [True, False])
-    @parametrize("mixed_precision", [True, False])
-    def test_summon_full_param_recursive(self, recurse, summon_outer, mixed_precision):
-        mixed_precision = (
-            MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
-        )
-        model = FSDP(
-            nn.Sequential(
-                FSDP(nn.Linear(5, 5, bias=False), mixed_precision=mixed_precision),
-                nn.Linear(5, 3, bias=False),
-            ),
-            mixed_precision=mixed_precision,
-        ).cuda(self.rank)
-
-        global_inner_numel = self.get_model_param_count(nn.Linear(5, 5, bias=False))
-        global_outer_numel = self.get_model_param_count(nn.Linear(5, 3, bias=False))
-
-        shard_inner_numel = int(math.ceil(global_inner_numel / self.world_size))
-        shard_outer_numel = int(math.ceil(global_outer_numel / self.world_size))
-
-        outer_param = model._handles[0].flat_param
-        inner_param = model.module[0]._handles[0].flat_param
-        self.assertEqual(shard_outer_numel, outer_param.numel())
-        self.assertEqual(shard_inner_numel, inner_param.numel())
-
-        model_to_summon = model if summon_outer else model[0]
-        # outer is summoned if _summon_full_param is called on the outer FSDP module
-        expected_outer_numel = global_outer_numel if summon_outer else shard_outer_numel
-
-        # inner is summoned if _summon_full_param is called with recursion or on the inner FSDP module
-        expected_inner_numel = (
-            global_inner_numel if recurse or not summon_outer else shard_inner_numel
-        )
-
-        with model_to_summon.summon_full_params(model_to_summon, recurse=recurse):
-            self.assertEqual(expected_outer_numel, outer_param.numel())
-            self.assertEqual(expected_inner_numel, inner_param.numel())
-
-    @skip_if_lt_x_gpu(2)
-    def test_cannot_summon_full_params_from_forward(self):
-        class MyModule(nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.a = nn.Parameter(torch.zeros(5))
-
-            def forward(self, fsdp_module):
-                with fsdp_module.summon_full_params(fsdp_module):
-                    pass
-
-        model = FSDP(MyModule()).cuda(self.rank)
-        with self.assertRaisesRegex(
-            ValueError, "Current handle state is HandleTrainingState.FORWARD"
-        ):
-            model(model)
-
-    @skip_if_lt_x_gpu(2)
-    def test_cannot_summon_full_params_from_backward(self):
-        model = FSDP(nn.Linear(2, 1)).cuda(self.rank)
-
-        output = model(torch.ones(2).cuda(self.rank))
-
-        def bad_backwards_hook(tensor):
-            with model.summon_full_params(model):
-                pass
-            return None
-
-        self.assertTrue(output.requires_grad)
-        output.register_hook(bad_backwards_hook)
-
-        with self.assertRaisesRegex(
-            ValueError, "Current handle state is HandleTrainingState.BACKWARD_PRE"
-        ):
-            output.backward()
-
-    @skip_if_lt_x_gpu(2)
-    def test_summon_full_params_respects_reshard_after_forward(self):
-        self.run_subtests(
-            {
-                "mixed_precision": [MixedPrecision(param_dtype=torch.float16), None],
-                "use_orig_params": [False, True],
-            },
-            self._test_summon_full_params_respects_reshard_after_forward,
-        )
-
-    def _test_summon_full_params_respects_reshard_after_forward(
-        self, mixed_precision: Optional[MixedPrecision], use_orig_params: bool
-    ):
-        fsdp_kwargs = {
-            "mixed_precision": mixed_precision,
-            "use_orig_params": use_orig_params,
-        }
-        model = FSDP(
-            nn.Sequential(
-                FSDP(nn.Linear(5, 5, bias=False), **fsdp_kwargs),
-                nn.Linear(5, 3, bias=False),
-            ),
-            **fsdp_kwargs,
-        ).cuda(self.rank)
-
-        outer_param = model._handles[0].flat_param
-        inner_param = model.module[0]._handles[0].flat_param
-        outer_full_param_size = outer_param.numel() * self.world_size
-
-        # trigger lazy init
-        model(torch.zeros(5).cuda(self.rank))
-        # the root FSDP module keeps all params around
-        self.assertEqual(
-            outer_full_param_size, outer_param._full_param_padded.storage().size()
-        )
-        self.assertEqual(0, inner_param._full_param_padded.storage().size())
-
-        # similarly summon_full_params should have the same behavior
-        with model.summon_full_params(model):
-            pass
-        self.assertEqual(
-            outer_full_param_size, outer_param._full_param_padded.storage().size()
-        )
-        self.assertEqual(0, inner_param._full_param_padded.storage().size())
-
-    @skip_if_lt_x_gpu(2)
-    def test_summon_single_param(self):
-        model = FSDP(nn.Linear(1, 1, bias=False)).cuda(self.rank)
-
-        p = model._handles[0].flat_param
-        self.assertEqual(1, p.numel())
-
-        with torch.no_grad():
-            # This sets the local shard value
-            p[0] = self.rank + 2
-
-        with model.summon_full_params(model, writeback=True):
-            self.assertEqual(1, p.numel())
-            with torch.no_grad():
-                p.copy_(torch.zeros_like(p))
-
-        # most ranks hold no data and wrote to padding so only rank zero will observe the above write
-        if self.rank == 0:
-            self.assertEqual(0, p[0])
-        else:
-            self.assertEqual(self.rank + 2, p[0])
-
-    @skip_if_lt_x_gpu(2)
-    @parametrize("rank0_only", [True, False])
-    @parametrize("offload_to_cpu", [True, False])
-    def test_summon_full_params_equivalence(self, rank0_only, offload_to_cpu):
-        offload = CPUOffload(offload_params=True)
-        model = FSDP(
-            DeterministicModel(wrap_fsdp=True, cpu_offload=offload), cpu_offload=offload
-        )
-        local_model = DeterministicModel(wrap_fsdp=False)
-
-        params_to_compare = (
-            [p.clone() for p in model.parameters()]
-            if rank0_only and self.rank != 0
-            else list(local_model.parameters())
-        )
-
-        writeback = not rank0_only
-
-        with model.summon_full_params(
-            model,
-            recurse=True,
-            rank0_only=rank0_only,
-            writeback=writeback,
-            offload_to_cpu=offload_to_cpu,
-        ):
-            if writeback:
-                with torch.no_grad():
-                    for p in model.parameters():
-                        p.add_(1)
-                    for p in params_to_compare:
-                        p.add_(1)
-            # Below sleep causes failures without stream synchronization in
-            # summon_full_params fix.
-            torch.cuda._sleep(1000000)
-            # FSDP param deepcopy() of params has issues
-            fsdp_params = [p.clone() for p in model.parameters()]
-
-        self.assertEqual(fsdp_params, params_to_compare)
-
-        # CPU offload is enabled for main API, so we should point back to CPU
-        for param in model.parameters():
-            self.assertEqual(param.device, torch.device("cpu"))
-
-    @skip_if_lt_x_gpu(2)
-    def test_summon_from_non_fsdp(self):
-        class FSDPContainer(nn.Module):
-            def __init__(self, fsdp_1, fsdp_2, fsdp_3):
-                super().__init__()
-                self.fsdp_1 = fsdp_1
-                self.fsdp_2 = fsdp_2
-                self.fsdp_3 = fsdp_3
-
-        model_fsdp = FSDPContainer(
-            FSDP(DeterministicModel(wrap_fsdp=True)),
-            FSDP(DeterministicModel(wrap_fsdp=True)),
-            DeterministicModel(wrap_fsdp=False),
-        )
-        model_no_fsdp = FSDPContainer(
-            DeterministicModel(wrap_fsdp=False),
-            DeterministicModel(wrap_fsdp=False),
-            DeterministicModel(wrap_fsdp=False),
-        )
-
-        params_to_compare = list(model_no_fsdp.parameters())
-        with FSDP.summon_full_params(model_fsdp):
-            fsdp_params = [p.clone() for p in model_fsdp.parameters()]
-
-        self.assertEqual(params_to_compare, fsdp_params)
-
-    @skip_if_lt_x_gpu(2)
-    @parametrize("rank0_only", [True, False])
-    @parametrize("offload_to_cpu", [True, False])
-    @parametrize("mixed_precision", [True, False])
-    def test_reshard_outside_forward_backward_iteration(
-        self, rank0_only, offload_to_cpu, mixed_precision
-    ):
-        mixed_precision = (
-            MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
-        )
-        model = FSDP(
-            nn.Sequential(
-                FSDP(nn.Linear(5, 5, bias=False), mixed_precision=mixed_precision),
-                nn.Linear(5, 1, bias=False),
-            ),
-            mixed_precision=mixed_precision,
-        ).cuda(self.rank)
-
-        outer_param = model._handles[0].flat_param
-        inner_param = model.module[0]._handles[0].flat_param
-        outer_full_param_size = outer_param.numel() * self.world_size
-
-        # First lets validate our assumption about resharding
-
-        output = model(torch.zeros(5).cuda(self.rank))
-        # the root FSDP module keeps all params around
-        self.assertEqual(
-            outer_full_param_size, outer_param._full_param_padded.storage().size()
-        )
-        self.assertEqual(0, inner_param._full_param_padded.storage().size())
-
-        output.backward()
-        # we reshard everything after backward() finishes
-        self.assertEqual(0, outer_param._full_param_padded.storage().size())
-        self.assertEqual(0, inner_param._full_param_padded.storage().size())
-
-        # now lets repeat it with summon done in between
-
-        output = model(torch.zeros(5).cuda(self.rank))
-        self.assertEqual(
-            outer_full_param_size, outer_param._full_param_padded.storage().size()
-        )
-        self.assertEqual(0, inner_param._full_param_padded.storage().size())
-        with model.summon_full_params(
-            model,
-            rank0_only=rank0_only,
-            writeback=not rank0_only,
-            offload_to_cpu=offload_to_cpu,
-        ):
-            pass
-        self.assertEqual(
-            outer_full_param_size, outer_param._full_param_padded.storage().size()
-        )
-        self.assertEqual(0, inner_param._full_param_padded.storage().size())
-
-        output.backward()
-        with model.summon_full_params(
-            model,
-            rank0_only=rank0_only,
-            writeback=not rank0_only,
-            offload_to_cpu=offload_to_cpu,
-        ):
-            pass
-        self.assertEqual(0, outer_param._full_param_padded.storage().size())
-        self.assertEqual(0, inner_param._full_param_padded.storage().size())
-
-    @skip_if_lt_x_gpu(2)
-    @parametrize("rank0_only", [True, False])
-    @parametrize("offload_to_cpu", [True, False])
-    @parametrize("mixed_precision", [True, False])
-    def test_params_are_unflattenned(self, rank0_only, offload_to_cpu, mixed_precision):
-        layer_shape = (10, 12)
-        model = nn.Linear(*layer_shape, bias=False).cuda(self.rank)
-        mixed_precision = (
-            MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
-        )
-        fsdp_model = FSDP(deepcopy(model), mixed_precision=mixed_precision).cuda(
-            self.rank
-        )
-
-        def _get_flat_param():
-            return fsdp_model._handles[0].flat_param
-
-        flattened_param = _get_flat_param()
-        self.assertEqual(layer_shape[0] * layer_shape[1] / 2, flattened_param.numel())
-
-        with fsdp_model.summon_full_params(
-            fsdp_model,
-            rank0_only=rank0_only,
-            writeback=not rank0_only,
-            offload_to_cpu=offload_to_cpu,
-        ):
-            if self.rank == 0 or not rank0_only:
-                self.assertEqual(fsdp_model.weight.shape, model.weight.shape)
-                expected_device = (
-                    torch.device("cpu")
-                    if offload_to_cpu
-                    else torch.device("cuda", torch.cuda.current_device())
-                )
-                self.assertTrue(expected_device == fsdp_model.weight.device)
-            else:
-                # Nonzero rank with rank0_only maintains original params.
-                flat_within_ctx = _get_flat_param()
-                self.assertEqual(flat_within_ctx, flattened_param)
-                self.assertEqual(
-                    flat_within_ctx.device, torch.device(torch.cuda.current_device())
-                )
-
-        # CPU offload should restore the param device
-        param = next(fsdp_model.parameters())
-        self.assertTrue(
-            param.device == torch.device("cuda", torch.cuda.current_device())
-        )
-
-    @skip_if_lt_x_gpu(2)
-    @parametrize("rank0_only", [True, False])
-    @parametrize("offload_to_cpu", [True, False])
-    @parametrize("mixed_precision", [True, False])
-    def test_params_count_and_value(
-        self,
-        rank0_only: bool,
-        offload_to_cpu: bool,
-        mixed_precision: bool,
-    ):
-        mixed_precision = (
-            MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
-        )
-        model = NestedWrappedModule.init(
-            self.process_group,
-            FSDPInitMode.NO_FSDP,
-            CUDAInitMode.CUDA_BEFORE,
-            deterministic=True,
-        )
-        fsdp_model = NestedWrappedModule.init(
-            self.process_group,
-            FSDPInitMode.RECURSIVE,
-            CUDAInitMode.CUDA_BEFORE,
-            deterministic=True,
-        )
-        dev = (
-            torch.device("cpu")
-            if offload_to_cpu
-            else torch.device("cuda", torch.cuda.current_device())
-        )
-        params_to_compare = (
-            [p.to(dev) for p in model.module.parameters()]
-            if not rank0_only or self.rank == 0
-            else list(p.clone() for p in fsdp_model.parameters())
-        )
-        with FSDP.summon_full_params(
-            fsdp_model, rank0_only=rank0_only, writeback=not rank0_only
-        ):
-            for p1, p2 in itertools.zip_longest(
-                fsdp_model.parameters(), params_to_compare
-            ):
-                self.assertEqual(p1, p2)
-
-        # CPU offload should restore the param device
-        param = next(fsdp_model.parameters())
-        self.assertTrue(
-            param.device == torch.device("cuda", torch.cuda.current_device())
-        )
-
-    @skip_if_lt_x_gpu(2)
-    def test_raises_rank0_with_writeback(self):
-        """Tests that ``summon_full_params()`` with both ``rank0_only=True``
-        and ``writeback=True`` raises an error."""
-        nested_wrapped_module = NestedWrappedModule.init(
-            self.process_group,
-            FSDPInitMode.RECURSIVE,
-            CUDAInitMode.CUDA_BEFORE,
-        )
-        with self.assertRaisesRegex(ValueError, "is not supported"):
-            with FSDP.summon_full_params(
-                nested_wrapped_module, rank0_only=True, writeback=True
-            ):
-                pass
-
-    @skip_if_lt_x_gpu(2)
-    @parametrize("prefix", ["", "test_prefix"])
-    @parametrize("recurse", [False, True])
-    def test_named_parameters_buffers(self, prefix: str, recurse: bool):
-        """Tests that ``named_parameters()`` and ``named_buffers()`` for a
-        top-level FSDP-wrapped model matches their behavior for the equivalent
-        non-wrapped model."""
-        model = NestedWrappedModule.init(
-            self.process_group,
-            FSDPInitMode.NO_FSDP,
-            CUDAInitMode.CUDA_BEFORE,
-            deterministic=True,
-        )
-        model.register_buffer("buffer", torch.ones(1))
-        # `named_parameters()` and `named_buffers` will contain FSDP prefixes
-        # if called on a non-FSDP root module
-        fsdp_model = FSDP(
-            NestedWrappedModule.init(
-                self.process_group,
-                FSDPInitMode.NO_FSDP,
-                CUDAInitMode.CUDA_BEFORE,
-                deterministic=True,
-            ),
-            self.process_group,
-        )
-        fsdp_model.register_buffer("buffer", torch.ones(1))
-        with FSDP.summon_full_params(fsdp_model):
-            for call in ["named_parameters", "named_buffers"]:
-                for (n1, p1), (n2, p2) in itertools.zip_longest(
-                    getattr(fsdp_model, call)(prefix=prefix, recurse=recurse),
-                    getattr(model, call)(prefix=prefix, recurse=recurse),
-                ):
-                    self.assertEqual(n1, n2)
-                    self.assertEqual(p1, p2)
-
-    @skip_if_lt_x_gpu(2)
-    def test_with_grads_core(self):
-        """Tests the core usage of ``summon_full_params(with_grads=True)``."""
-        self.run_subtests(
-            {
-                "writeback": [False, True],
-                "offload_to_cpu": [False, True],
-                "sharding_strategy": [
-                    ShardingStrategy.FULL_SHARD,
-                    ShardingStrategy.SHARD_GRAD_OP,
-                    ShardingStrategy.NO_SHARD,
-                ],
-                "use_orig_params": [True],
-            },
-            self._test_with_grads_core,
-        )
-
-    def _test_with_grads_core(
-        self,
-        writeback: bool,
-        offload_to_cpu: bool,
-        sharding_strategy: ShardingStrategy,
-        use_orig_params: bool,
-    ):
-        def _check_grads(
-            ddp_model: DDP,
-            fsdp_model: FSDP,
-            old_fsdp_grads: Optional[List[torch.Tensor]],
-        ):
-            WRITEBACK_FACTOR = 2
-            with FSDP.summon_full_params(
-                fsdp_model,
-                writeback=writeback,
-                offload_to_cpu=offload_to_cpu,
-                with_grads=True,
-            ):
-                for (n1, p1), (n2, p2) in zip(
-                    ddp_model.module.named_parameters(),
-                    fsdp_model.named_parameters(),
-                ):
-                    # Parameter names are only expected to match because
-                    # `fsdp_model` has top-level FSDP, so its
-                    # `named_parameters()` cleans *all* of the names
-                    self.assertEqual(n1, n2)
-                    assert p1.grad is not None
-                    torch.testing.assert_close(p1.grad, p2.grad)
-                    # Ensure that the tensor is not all zeros, which would
-                    # mean that the multiplication is vacuous
-                    assert torch.count_nonzero(p2.grad) > 0
-                    p2.grad *= WRITEBACK_FACTOR
-            new_fsdp_grads = [
-                param.grad
-                for param in fsdp_model.parameters()
-                if param.grad is not None
-            ]
-            writeback_persists = (
-                writeback or sharding_strategy == ShardingStrategy.NO_SHARD
-            )
-            for old_grad, new_grad in zip(old_fsdp_grads, new_fsdp_grads):
-                if writeback_persists:
-                    torch.testing.assert_close(old_grad * WRITEBACK_FACTOR, new_grad)
-                else:
-                    torch.testing.assert_close(old_grad, new_grad)
-            if writeback_persists:
-                # Modify the DDP gradients for parity
-                for param in ddp_model.parameters():
-                    param.grad *= WRITEBACK_FACTOR
-
-        def _get_error_context(is_supported: bool):
-            return (
-                contextlib.suppress()
-                if is_supported
-                else self.assertRaises(NotImplementedError)
-            )  # some configs not implemented yet
-
-        def _get_fsdp_grads(fsdp_model: FSDP, is_supported: bool):
-            if is_supported:
-                return [
-                    param.grad.clone()
-                    for param in fsdp_model.parameters()
-                    if param.grad is not None
-                ]
-            return None  # unused
-
-        is_supported = use_orig_params and not offload_to_cpu
-        model = TransformerWithSharedParams.init(
-            self.process_group,
-            FSDPInitMode.NO_FSDP,
-            CUDAInitMode.CUDA_BEFORE,
-            deterministic=True,
-        )
-        ddp_model = DDP(model, device_ids=[self.rank])
-        fsdp_model = TransformerWithSharedParams.init(
-            self.process_group,
-            FSDPInitMode.RECURSIVE,
-            CUDAInitMode.CUDA_BEFORE,
-            deterministic=True,
-            fsdp_kwargs={
-                "use_orig_params": use_orig_params,
-                "sharding_strategy": sharding_strategy,
-            },
-        )
-        with FSDP.summon_full_params(fsdp_model):
-            for p1, p2 in zip(ddp_model.module.parameters(), fsdp_model.parameters()):
-                assert torch.all(torch.isclose(p1, p2))
-
-        # Check `summon_full_params()` after backward
-        inp = fsdp_model.get_input(torch.device("cuda"))
-        ddp_out = ddp_model(*inp)
-        fsdp_out = fsdp_model(*inp)
-        ddp_out.sum().backward()
-        fsdp_out.sum().backward()
-        old_fsdp_grads = _get_fsdp_grads(fsdp_model, is_supported)
-        with _get_error_context(is_supported):
-            _check_grads(ddp_model, fsdp_model, old_fsdp_grads)
-
-        # Check `summon_full_params()` between forward and backward
-        inp = fsdp_model.get_input(torch.device("cuda"))
-        ddp_out = ddp_model(*inp)
-        fsdp_out = fsdp_model(*inp)
-        old_fsdp_grads = _get_fsdp_grads(fsdp_model, is_supported)
-        with _get_error_context(is_supported):
-            _check_grads(ddp_model, fsdp_model, old_fsdp_grads)
-
-    @skip_if_lt_x_gpu(2)
-    def test_with_grads_none_grads(self):
-        """
-        Tests that if all ranks' ``FlatParameter`` has ``None`` gradient, then
-        each original parameter sees ``None`` gradient as well.
-        """
-        self.run_subtests(
-            {
-                "sharding_strategy": [
-                    ShardingStrategy.FULL_SHARD,
-                    ShardingStrategy.SHARD_GRAD_OP,
-                    ShardingStrategy.NO_SHARD,
-                ]
-            },
-            self._test_with_grads_none_grads,
-        )
-
-    def _test_with_grads_none_grads(self, sharding_strategy: ShardingStrategy):
-        fsdp_model = TransformerWithSharedParams.init(
-            self.process_group,
-            FSDPInitMode.RECURSIVE,
-            CUDAInitMode.CUDA_BEFORE,
-            deterministic=True,
-            fsdp_kwargs={
-                "use_orig_params": True,
-                "sharding_strategy": sharding_strategy,
-            },
-        )
-        for fsdp_module in FSDP.fsdp_modules(fsdp_model):
-            for handle in fsdp_module._handles:
-                assert handle.flat_param.grad is None
-        with FSDP.summon_full_params(fsdp_model, with_grads=True):
-            for param in fsdp_model.parameters():
-                self.assertTrue(param.grad is None)
-
-
-instantiate_parametrized_tests(TestSummonFullParams)
-instantiate_parametrized_tests(TestSummonFullParamsNoShard)
-
-
-if __name__ == "__main__":
-    run_tests()
diff --git a/test/distributed/fsdp/test_fsdp_traversal.py b/test/distributed/fsdp/test_fsdp_traversal.py
index b9c7a0aeac9b..061ffbe9d914 100644
--- a/test/distributed/fsdp/test_fsdp_traversal.py
+++ b/test/distributed/fsdp/test_fsdp_traversal.py
@@ -38,7 +38,7 @@ def test_fsdp_modules(self):
             CUDAInitMode.CUDA_BEFORE,
         )
         modules = FSDP.fsdp_modules(nested_wrapped_module)
-        self.assertEquals(
+        self.assertEqual(
             modules,
             [
                 nested_wrapped_module.module.get_submodule("1"),
diff --git a/test/distributed/fsdp/test_fsdp_unshard_params.py b/test/distributed/fsdp/test_fsdp_unshard_params.py
new file mode 100644
index 000000000000..a5f9b553734d
--- /dev/null
+++ b/test/distributed/fsdp/test_fsdp_unshard_params.py
@@ -0,0 +1,699 @@
+# Owner(s): ["oncall: distributed"]
+import contextlib
+import itertools
+import math
+import sys
+from typing import Any, Dict, List, Optional, Union
+
+import torch
+import torch.distributed.fsdp._traversal_utils as traversal_utils
+import torch.nn as nn
+from torch import distributed as dist
+from torch.distributed.fsdp import (
+    CPUOffload,
+    FullyShardedDataParallel as FSDP,
+    MixedPrecision,
+    ShardingStrategy,
+)
+from torch.distributed.fsdp._common_utils import clean_tensor_name
+from torch.distributed.fsdp.flat_param import FlatParameter
+from torch.nn.parallel.distributed import DistributedDataParallel as DDP
+from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
+from torch.testing._internal.common_fsdp import (
+    CUDAInitMode,
+    FSDPInitMode,
+    FSDPTest,
+    NestedWrappedModule,
+    TransformerWithSharedParams,
+)
+from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
+
+if not dist.is_available():
+    print("Distributed not available, skipping tests", file=sys.stderr)
+    sys.exit(0)
+
+if TEST_WITH_DEV_DBG_ASAN:
+    print(
+        "Skip dev-asan as torch + multiprocessing spawn have known issues",
+        file=sys.stderr,
+    )
+    sys.exit(0)
+
+
+class TestUnshardParamsBase(FSDPTest):
+    """
+    This contains any methods common to both the sharded and non-sharded cases.
+    """
+
+    @property
+    def device(self) -> torch.device:
+        return torch.device("cuda", self.rank)
+
+    def _test_unshard_params_writeback(
+        self,
+        writeback: bool,
+        check_outer: bool,
+        **fsdp_kwargs: Dict[str, Any],
+    ):
+        model = nn.Sequential(
+            nn.Linear(5, 5, bias=False, device=self.device),
+            nn.Linear(5, 3, bias=False, device=self.device),
+        )
+        model[0] = FSDP(model[0], **fsdp_kwargs)
+        model = FSDP(model, **fsdp_kwargs)
+        uses_sharded_strategy = model.sharding_strategy != ShardingStrategy.NO_SHARD
+        offloading_params = model.cpu_offload.offload_params
+
+        # Assumes depth-first `.parameters()`
+        outer_param: Union[FlatParameter, nn.Parameter] = next(model.parameters())
+        inner_param: Union[FlatParameter, nn.Parameter] = next(model[0].parameters())
+        param_to_check = outer_param if check_outer else inner_param
+
+        # Write a known value to all elements of the *sharded* parameter or
+        # `FlatParameter` to check
+        with torch.no_grad():
+            param_to_check.zero_()
+            param_to_check += self.rank + 2
+        # Zero the *unsharded* parameters
+        with FSDP.summon_full_params(model, writeback=writeback), torch.no_grad():
+            for param in model.parameters():
+                param.zero_()
+
+        # Check the 0th singleton element of the sharded parameter to see if
+        # the zeroing from inside the context persists
+        param_elem_to_check = param_to_check[0]
+        if param_elem_to_check.numel() > 1:
+            # For `use_orig_params=True` and `NO_SHARD`, the parameter
+            # preserves the original 2D shape, so we must access one more time
+            param_elem_to_check = param_elem_to_check[0]
+        if writeback or (not uses_sharded_strategy and not offloading_params):
+            # When FSDP does not use a sharded strategy and is not offloading
+            # parameters to CPU, it directly exposes the tensor storage that
+            # serves as the unsharded source of truth, so the write is always
+            # reflected regardless of `writeback`.
+            self.assertEqual(param_elem_to_check, 0)
+        else:
+            self.assertEqual(param_elem_to_check, self.rank + 2)
+        if offloading_params:
+            cpu_device = torch.device("cpu")
+            for param in model.parameters():
+                self.assertEqual(param.device, cpu_device)
+
+    def _get_test_unshard_params_writeback_config(self) -> Dict[str, List[Any]]:
+        return {
+            "writeback": [True, False],
+            "check_outer": [True, False],
+            "mixed_precision": [MixedPrecision(param_dtype=torch.float16), None],
+            "cpu_offload": [
+                CPUOffload(offload_params=False),
+                CPUOffload(offload_params=True),
+            ],
+            "use_orig_params": [True, False],
+        }
+
+    def _test_unshard_params_param_data(
+        self,
+        rank0_only: bool,
+        offload_to_cpu: bool,
+        cpu_offload: CPUOffload,
+        mixed_precision: Optional[MixedPrecision],
+        use_orig_params: bool,
+    ):
+        local_model = NestedWrappedModule.init(
+            self.process_group,
+            FSDPInitMode.NO_FSDP,
+            CUDAInitMode.CUDA_BEFORE,
+            fsdp_kwargs={},
+            deterministic=True,
+        )
+        # Apply FSDP such that the root module does not have FSDP applied,
+        # while there are multiple FSDP root submodules (as proven later)
+        fsdp_model = NestedWrappedModule.init(
+            self.process_group,
+            FSDPInitMode.RECURSIVE,
+            CUDAInitMode.CUDA_BEFORE,
+            fsdp_kwargs={
+                "cpu_offload": cpu_offload,
+                "mixed_precision": mixed_precision,
+                "use_orig_params": use_orig_params,
+            },
+            deterministic=True,
+        )
+        self.assertFalse(isinstance(fsdp_model, FSDP))
+        # Hard code the following names because getting them is non-trivial
+        non_fsdp_managed_param_names = {
+            "module.0.weight",
+            "module.0.bias",
+            "module.3.weight",
+            "module.3.bias",
+        }
+
+        with FSDP.summon_full_params(
+            fsdp_model,
+            rank0_only=rank0_only,
+            writeback=not rank0_only,
+            offload_to_cpu=offload_to_cpu,
+        ):
+            if not rank0_only or self.rank == 0:
+                for p1, (n2, p2) in zip(
+                    local_model.parameters(), fsdp_model.named_parameters()
+                ):
+                    self.assertEqual(p1.shape, p2.shape)
+                    if (
+                        offload_to_cpu
+                        and clean_tensor_name(n2) not in non_fsdp_managed_param_names
+                    ):
+                        self.assertEqual(torch.device("cpu"), p2.device)
+                    else:
+                        self.assertEqual(p1.device, p2.device)
+                    self.assertEqual(
+                        p1.dtype, p2.dtype
+                    )  # even if FSDP uses mixed precision
+                    self.assertEqual(p1, p2)
+                    self.assertTrue(isinstance(p2, nn.Parameter))
+            else:
+                # Check that each `FlatParameter` has the sharded size as a
+                # proxy for it being resharded
+                for handle in traversal_utils._get_fsdp_handles(fsdp_model):
+                    if handle.uses_sharded_strategy:
+                        self.assertEqual(
+                            handle.flat_param.shape, handle.flat_param._sharded_size
+                        )
+                    else:
+                        self.assertEqual(
+                            handle.flat_param.shape,
+                            handle.flat_param._unpadded_unsharded_size,
+                        )
+
+        # Prove the number of FSDP roots after lazy initialization
+        num_fsdp_roots = 0
+        for fsdp_state in traversal_utils._get_fsdp_states(fsdp_model):
+            num_fsdp_roots += fsdp_state._is_root
+        self.assertGreater(num_fsdp_roots, 1)
+
+    def _get_test_unshard_params_param_data_config(self) -> Dict[str, List[Any]]:
+        return {
+            "rank0_only": [False, True],
+            "offload_to_cpu": [False, True],
+            "cpu_offload": [
+                CPUOffload(offload_params=False),
+                CPUOffload(offload_params=True),
+            ],
+            "mixed_precision": [MixedPrecision(param_dtype=torch.float16), None],
+            "use_orig_params": [True, False],
+        }
+
+
+class TestUnshardParams(TestUnshardParamsBase):
+    @property
+    def world_size(self) -> int:
+        return 2
+
+    @skip_if_lt_x_gpu(2)
+    def test_unshard_params_writeback(self):
+        """Tests the ``writeback`` argument (using default for all others)."""
+        self.run_subtests(
+            self._get_test_unshard_params_writeback_config(),
+            self._test_unshard_params_writeback,
+        )
+
+    @skip_if_lt_x_gpu(2)
+    def test_unshard_params_param_data(self):
+        """
+        Tests that parameters are exposed correctly for ``recurse=True`` and
+        all other argument configs for a non-FSDP root module.
+        """
+        self.run_subtests(
+            self._get_test_unshard_params_param_data_config(),
+            self._test_unshard_params_param_data,
+        )
+
+    @skip_if_lt_x_gpu(2)
+    def test_unshard_singleton_param_writeback(self):
+        """
+        Tests ``writeback=True`` for a singleton parameter, which includes
+        testing that writing to padding does not persist.
+        NOTE: This method depends on FSDP internals.
+        """
+        model = FSDP(nn.Linear(1, 1, bias=False, device=self.device))
+        flat_param = model._handles[0].flat_param
+        self.assertEqual(1, flat_param.numel())
+        # Write a known value to the *sharded* `FlatParameter`
+        with torch.no_grad():
+            # For nonzero ranks, this write is to padding
+            flat_param[0] = self.rank + 2
+        with FSDP.summon_full_params(model, writeback=True):
+            self.assertEqual(1, flat_param.numel())
+            with torch.no_grad():
+                flat_param.zero_()
+        # NOTE: This checks that writes to padding did not persist, which is
+        # *not* strictly required for correctness.
+        if self.rank == 0:  # did not write to padding
+            self.assertEqual(0, flat_param[0])
+        else:  # wrote to padding
+            self.assertEqual(self.rank + 2, flat_param[0])
+
+    @skip_if_lt_x_gpu(2)
+    def test_unshard_params_respects_reshard(self):
+        """
+        Tests that unsharding parameters respects the expected reshard behavior
+        between forward and backward as well as after backward.
+        """
+        self.run_subtests(
+            {
+                "rank0_only": [False, True],
+                "offload_to_cpu": [False, True],
+                "mixed_precision": [MixedPrecision(param_dtype=torch.float16), None],
+                "use_orig_params": [False, True],
+            },
+            self._test_unshard_params_respects_reshard,
+        )
+
+    def _test_unshard_params_respects_reshard(
+        self,
+        rank0_only: bool,
+        offload_to_cpu: bool,
+        mixed_precision: Optional[MixedPrecision],
+        use_orig_params: bool,
+    ):
+        """NOTE: This method depends on FSDP internals."""
+        fsdp_kwargs = {
+            "mixed_precision": mixed_precision,
+            "use_orig_params": use_orig_params,
+        }
+        model = FSDP(
+            nn.Sequential(
+                FSDP(nn.Linear(5, 5, bias=False, device=self.device), **fsdp_kwargs),
+                nn.Linear(5, 3, bias=False, device=self.device),
+            ),
+            **fsdp_kwargs,
+        )
+        outer_flat_param = model._handles[0].flat_param
+        inner_flat_param = model.module[0]._handles[0].flat_param
+        # NOTE: This assumes uniform sharding with padding across ranks.
+        expected_outer_flat_param_unsharded_numel = (
+            outer_flat_param.numel() * self.world_size
+        )
+
+        def _get_unsharded_storage_size(flat_param: FlatParameter):
+            return flat_param._full_param_padded.storage().size()
+
+        # Validate the expected behavior: the root does not reshard after
+        # forward; the non-root reshards after forward; and both reshard after
+        # backward
+        output = model(torch.zeros(5, device=self.device))
+        self.assertEqual(
+            expected_outer_flat_param_unsharded_numel,
+            _get_unsharded_storage_size(outer_flat_param),
+        )
+        self.assertEqual(0, _get_unsharded_storage_size(inner_flat_param))
+        output.sum().backward()
+        self.assertEqual(0, _get_unsharded_storage_size(outer_flat_param))
+        self.assertEqual(0, _get_unsharded_storage_size(inner_flat_param))
+
+        # Check that with parameter unsharding in between forward and backward
+        # as well as after backward, the reshard behavior matches
+        output = model(torch.zeros(5, device=self.device))
+        with FSDP.summon_full_params(
+            model,
+            rank0_only=rank0_only,
+            writeback=not rank0_only,
+            offload_to_cpu=offload_to_cpu,
+        ):
+            pass
+        self.assertEqual(
+            expected_outer_flat_param_unsharded_numel,
+            _get_unsharded_storage_size(outer_flat_param),
+        )
+        self.assertEqual(0, _get_unsharded_storage_size(inner_flat_param))
+        output.sum().backward()
+        with FSDP.summon_full_params(
+            model,
+            rank0_only=rank0_only,
+            writeback=not rank0_only,
+            offload_to_cpu=offload_to_cpu,
+        ):
+            pass
+        self.assertEqual(0, _get_unsharded_storage_size(outer_flat_param))
+        self.assertEqual(0, _get_unsharded_storage_size(inner_flat_param))
+
+    @skip_if_lt_x_gpu(2)
+    def test_unshard_params_recurse(self):
+        """Tests the ``recurse`` argument (using default for all others)."""
+        self.run_subtests(
+            {
+                "recurse": [False, True],
+                "unshard_outer": [False, True],
+                "mixed_precision": [MixedPrecision(param_dtype=torch.float16), None],
+                "use_orig_params": [False, True],
+            },
+            self._test_unshard_params_recurse,
+        )
+
+    def _test_unshard_params_recurse(
+        self,
+        recurse: bool,
+        unshard_outer: bool,
+        mixed_precision: Optional[MixedPrecision],
+        use_orig_params: bool,
+    ):
+        """NOTE: This method depends on FSDP internals."""
+        fsdp_kwargs = {
+            "mixed_precision": mixed_precision,
+            "use_orig_params": use_orig_params,
+        }
+        model = FSDP(
+            nn.Sequential(
+                FSDP(nn.Linear(5, 5, bias=False, device=self.device), **fsdp_kwargs),
+                nn.Linear(5, 3, bias=False, device=self.device),
+            ),
+            **fsdp_kwargs,
+        )
+        # Hard code the numel values based on the model
+        unsharded_inner_numel = 5 * 5
+        unsharded_outer_numel = 5 * 3
+        # Round up the sharded numel to account for padding
+        sharded_inner_numel = int(math.ceil(unsharded_inner_numel / self.world_size))
+        sharded_outer_numel = int(math.ceil(unsharded_outer_numel / self.world_size))
+        inner_flat_param = model.module[0]._handles[0].flat_param
+        outer_flat_param = model._handles[0].flat_param
+        self.assertEqual(sharded_inner_numel, inner_flat_param.numel())
+        self.assertEqual(sharded_outer_numel, outer_flat_param.numel())
+        expected_outer_numel = (
+            unsharded_outer_numel if unshard_outer else sharded_outer_numel
+        )
+        expected_inner_numel = (
+            unsharded_inner_numel
+            if recurse or not unshard_outer
+            else sharded_inner_numel
+        )
+        module_to_unshard = model if unshard_outer else model[0]
+        with FSDP.summon_full_params(module_to_unshard, recurse=recurse):
+            self.assertEqual(expected_outer_numel, outer_flat_param.numel())
+            self.assertEqual(expected_inner_numel, inner_flat_param.numel())
+
+    @skip_if_lt_x_gpu(2)
+    def test_named_parameters_and_buffers(self):
+        """
+        Tests that ``named_parameters()`` and ``named_buffers()`` for a
+        top-level FSDP-wrapped model matches their behavior for the equivalent
+        non-wrapped module.
+        """
+        self.run_subtests(
+            {"prefix": ["", "test_prefix"], "recurse": [False, True]},
+            self._test_named_parameters_and_buffers,
+        )
+
+    def _test_named_parameters_and_buffers(self, prefix: str, recurse: bool):
+        model = NestedWrappedModule.init(
+            self.process_group,
+            FSDPInitMode.NO_FSDP,
+            CUDAInitMode.CUDA_BEFORE,
+            deterministic=True,
+        )
+        model.register_buffer("buffer", torch.ones(1))
+        # Wrap the top-level with FSDP since `named_parameters()` and
+        # `named_buffers` will contain FSDP prefixes if called on a non-FSDP
+        # root module
+        fsdp_model = FSDP(
+            NestedWrappedModule.init(
+                self.process_group,
+                FSDPInitMode.NO_FSDP,
+                CUDAInitMode.CUDA_BEFORE,
+                deterministic=True,
+            ),
+            self.process_group,
+        )
+        fsdp_model.register_buffer("buffer", torch.ones(1))
+        with FSDP.summon_full_params(fsdp_model):
+            for call in ["named_parameters", "named_buffers"]:
+                for (n1, p1), (n2, p2) in itertools.zip_longest(
+                    getattr(fsdp_model, call)(prefix=prefix, recurse=recurse),
+                    getattr(model, call)(prefix=prefix, recurse=recurse),
+                ):
+                    self.assertEqual(n1, n2)
+                    self.assertEqual(p1, p2)
+
+    @skip_if_lt_x_gpu(2)
+    def test_with_grads_core(self):
+        """
+        Tests the core usage of``with_grads=True`` by comparing against DDP as
+        the unsharded equivalent.
+        """
+        self.run_subtests(
+            {
+                "writeback": [False, True],
+                "offload_to_cpu": [False, True],
+                "sharding_strategy": [
+                    ShardingStrategy.FULL_SHARD,
+                    ShardingStrategy.SHARD_GRAD_OP,
+                    ShardingStrategy.NO_SHARD,
+                ],
+                "use_orig_params": [True],
+            },
+            self._test_with_grads_core,
+        )
+
+    def _test_with_grads_core(
+        self,
+        writeback: bool,
+        offload_to_cpu: bool,
+        sharding_strategy: ShardingStrategy,
+        use_orig_params: bool,
+    ):
+        def _check_grads(
+            ddp_model: DDP,
+            fsdp_model: FSDP,
+            old_fsdp_grads: Optional[List[torch.Tensor]],
+        ):
+            """
+            Checks that writes to the FSDP parameters' gradients persist or do
+            not persist depending on ``writeback`` and the sharding strategy.
+            The DDP model is used for checking gradient parity to ensure that
+            FDSP all-gathers the correct gradient values.
+            """
+            WRITEBACK_FACTOR = 2
+            with FSDP.summon_full_params(
+                fsdp_model,
+                writeback=writeback,
+                offload_to_cpu=offload_to_cpu,
+                with_grads=True,
+            ):
+                for (n1, p1), (n2, p2) in zip(
+                    ddp_model.module.named_parameters(),
+                    fsdp_model.named_parameters(),
+                ):
+                    self.assertEqual(n1, clean_tensor_name(n2))
+                    assert p1.grad is not None
+                    torch.testing.assert_close(p1.grad, p2.grad)
+                    # Ensure that the tensor is not all zeros, which would
+                    # mean that the multiplication is vacuous
+                    assert torch.count_nonzero(p2.grad) > 0
+                    p2.grad *= WRITEBACK_FACTOR
+            new_fsdp_grads = [
+                param.grad
+                for param in fsdp_model.parameters()
+                if param.grad is not None
+            ]
+            writeback_persists = writeback or (
+                sharding_strategy == ShardingStrategy.NO_SHARD and not offload_to_cpu
+            )
+            for old_grad, new_grad in zip(old_fsdp_grads, new_fsdp_grads):
+                if writeback_persists:
+                    torch.testing.assert_close(old_grad * WRITEBACK_FACTOR, new_grad)
+                else:
+                    torch.testing.assert_close(old_grad, new_grad)
+            if writeback_persists:
+                # Modify the DDP gradients in the same way for parity
+                for param in ddp_model.parameters():
+                    param.grad *= WRITEBACK_FACTOR
+
+        def _get_error_context(is_supported: bool):
+            return (
+                contextlib.suppress()
+                if is_supported
+                else self.assertRaises(NotImplementedError)
+            )  # some configs are not implemented yet
+
+        def _get_fsdp_grads(fsdp_model: FSDP, is_supported: bool):
+            if is_supported:
+                return [
+                    param.grad.clone()
+                    for param in fsdp_model.parameters()
+                    if param.grad is not None
+                ]
+            return None  # unused
+
+        is_supported = use_orig_params and not offload_to_cpu
+        model = TransformerWithSharedParams.init(
+            self.process_group,
+            FSDPInitMode.NO_FSDP,
+            CUDAInitMode.CUDA_BEFORE,
+            deterministic=True,
+        )
+        ddp_model = DDP(model, device_ids=[self.rank])
+        fsdp_model = TransformerWithSharedParams.init(
+            self.process_group,
+            FSDPInitMode.RECURSIVE,
+            CUDAInitMode.CUDA_BEFORE,
+            deterministic=True,
+            fsdp_kwargs={
+                "use_orig_params": use_orig_params,
+                "sharding_strategy": sharding_strategy,
+            },
+        )
+        with FSDP.summon_full_params(fsdp_model):
+            for p1, p2 in zip(ddp_model.module.parameters(), fsdp_model.parameters()):
+                assert torch.all(torch.isclose(p1, p2))
+
+        # Check calling after backward
+        inp = fsdp_model.get_input(torch.device("cuda"))
+        ddp_out = ddp_model(*inp)
+        fsdp_out = fsdp_model(*inp)
+        ddp_out.sum().backward()
+        fsdp_out.sum().backward()
+        old_fsdp_grads = _get_fsdp_grads(fsdp_model, is_supported)
+        with _get_error_context(is_supported):
+            _check_grads(ddp_model, fsdp_model, old_fsdp_grads)
+
+        # Check calling between forward and backward
+        inp = fsdp_model.get_input(torch.device("cuda"))
+        ddp_out = ddp_model(*inp)
+        fsdp_out = fsdp_model(*inp)
+        old_fsdp_grads = _get_fsdp_grads(fsdp_model, is_supported)
+        with _get_error_context(is_supported):
+            _check_grads(ddp_model, fsdp_model, old_fsdp_grads)
+
+    @skip_if_lt_x_gpu(2)
+    def test_with_grads_none_grads(self):
+        """
+        Tests that if all ranks' ``FlatParameter`` has ``None`` gradient, then
+        each original parameter sees ``None`` gradient as well.
+        """
+        self.run_subtests(
+            {
+                "sharding_strategy": [
+                    ShardingStrategy.FULL_SHARD,
+                    ShardingStrategy.SHARD_GRAD_OP,
+                    ShardingStrategy.NO_SHARD,
+                ]
+            },
+            self._test_with_grads_none_grads,
+        )
+
+    def _test_with_grads_none_grads(self, sharding_strategy: ShardingStrategy):
+        fsdp_model = TransformerWithSharedParams.init(
+            self.process_group,
+            FSDPInitMode.RECURSIVE,
+            CUDAInitMode.CUDA_BEFORE,
+            deterministic=True,
+            fsdp_kwargs={
+                "use_orig_params": True,
+                "sharding_strategy": sharding_strategy,
+            },
+        )
+        for fsdp_module in FSDP.fsdp_modules(fsdp_model):
+            for handle in fsdp_module._handles:
+                assert handle.flat_param.grad is None
+        with FSDP.summon_full_params(fsdp_model, with_grads=True):
+            for param in fsdp_model.parameters():
+                self.assertTrue(param.grad is None)
+
+
+class TestUnshardParamsNoShard(TestUnshardParamsBase):
+    @property
+    def world_size(self) -> int:
+        return 1
+
+    @skip_if_lt_x_gpu(1)
+    def test_unshard_params_writeback_no_shard(self):
+        """Tests the ``writeback`` argument (using default for all others)."""
+        self.run_subtests(
+            self._get_test_unshard_params_writeback_config(),
+            self._test_unshard_params_writeback,
+        )
+
+    @skip_if_lt_x_gpu(1)
+    def test_unshard_params_param_data_no_shard(self):
+        """
+        Tests that parameters are exposed correctly for ``recurse=True`` and
+        all other argument configs for a non-FSDP root module.
+        """
+        config = self._get_test_unshard_params_param_data_config()
+        # TODO: `offload_to_cpu=True` with `NO_SHARD` is not supported yet. See
+        # `test_offload_to_cpu_no_shard_raises()`.
+        config["offload_to_cpu"] = [False]
+        self.run_subtests(
+            config,
+            self._test_unshard_params_param_data,
+        )
+
+
+class TestUnshardParamsErrors(TestUnshardParamsBase):
+    @property
+    def world_size(self) -> int:
+        return 2
+
+    @skip_if_lt_x_gpu(2)
+    def test_unshard_params_from_forward_raises(self):
+        class MyModule(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.a = nn.Parameter(torch.zeros(5))
+
+            def forward(self, fsdp_module):
+                with fsdp_module.summon_full_params(fsdp_module):
+                    pass
+
+        model = FSDP(MyModule()).cuda(self.rank)
+        with self.assertRaisesRegex(
+            AssertionError, "Cannot manually unshard parameters during forward/backward"
+        ):
+            model(model)
+
+    @skip_if_lt_x_gpu(2)
+    def test_unshard_params_from_backward_raises(self):
+        model = FSDP(nn.Linear(2, 1, device=self.device))
+        output = model(torch.ones(2, device=self.device))
+
+        def invalid_backward_hook(*args, **kwargs):
+            with FSDP.summon_full_params(model):
+                pass
+
+        self.assertTrue(output.requires_grad)
+        output.register_hook(invalid_backward_hook)
+        with self.assertRaisesRegex(
+            AssertionError, "Cannot manually unshard parameters during forward/backward"
+        ):
+            output.backward()
+
+    @skip_if_lt_x_gpu(2)
+    def test_rank0_only_with_writeback_raises(self):
+        nested_wrapped_module = NestedWrappedModule.init(
+            self.process_group,
+            FSDPInitMode.RECURSIVE,
+            CUDAInitMode.CUDA_BEFORE,
+        )
+        with self.assertRaisesRegex(NotImplementedError, "is not supported"):
+            with FSDP.summon_full_params(
+                nested_wrapped_module, rank0_only=True, writeback=True
+            ):
+                pass
+
+    @skip_if_lt_x_gpu(2)
+    def test_offload_to_cpu_no_shard_raises(self):
+        nested_wrapped_module = NestedWrappedModule.init(
+            self.process_group,
+            FSDPInitMode.RECURSIVE,
+            CUDAInitMode.CUDA_BEFORE,
+            {"sharding_strategy": ShardingStrategy.NO_SHARD},
+        )
+        with self.assertRaisesRegex(NotImplementedError, "is not supported"):
+            with FSDP.summon_full_params(
+                nested_wrapped_module, rank0_only=True, writeback=True
+            ):
+                pass
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/fsdp/test_fsdp_use_orig_params.py b/test/distributed/fsdp/test_fsdp_use_orig_params.py
index c7898b2b58f0..042bbb16f114 100644
--- a/test/distributed/fsdp/test_fsdp_use_orig_params.py
+++ b/test/distributed/fsdp/test_fsdp_use_orig_params.py
@@ -4,7 +4,8 @@
 import functools
 import itertools
 import sys
-from typing import Any, Callable, Dict, List, Optional, Tuple, Type
+import unittest
+from typing import Any, Dict, List, Optional, Tuple, Type
 
 import torch
 import torch.nn as nn
@@ -20,6 +21,7 @@
 from torch.distributed.fsdp.wrap import always_wrap_policy, ModuleWrapPolicy
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel.distributed import DistributedDataParallel as DDP
+from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
     CUDAInitMode,
@@ -32,6 +34,7 @@
     parametrize,
     run_tests,
     TEST_WITH_DEV_DBG_ASAN,
+    TestCase,
 )
 
 if not dist.is_available():
@@ -189,7 +192,8 @@ def _check_ddp_fsdp_param_parity(self, ddp_model: DDP, fsdp_model: FSDP):
             for (n1, p1), (n2, p2) in zip(
                 ddp_model.module.named_parameters(), fsdp_model.named_parameters()
             ):
-                self.assertEqual(n1, n2)
+                # Allow for FSDP prefixes
+                self.assertEqual(n1, clean_tensor_name(n2))
                 torch.testing.assert_close(p1, p2)
 
     def _get_sharding_strategy_from_str(
@@ -448,7 +452,7 @@ def run_iter():
             ddp_model.module.named_parameters(),
             fsdp_model.named_parameters(),
         ):
-            self.assertEqual(ddp_n, fsdp_n)
+            self.assertEqual(ddp_n, clean_tensor_name(fsdp_n))
             if fsdp_p.numel() == 0:
                 # Not in this rank's shard
                 self.assertTrue(fsdp_p.grad is None)
@@ -961,53 +965,6 @@ def test_writeback_shape_mismatch(self):
 
 
 class TestFSDPUseOrigParamsFQNs(FSDPTest):
-    @skip_if_lt_x_gpu(2)
-    def test_param_and_buffer_names(self):
-        """
-        Tests that, for ``use_orig_params=True``, the parameter and buffer
-        names match those of a local model even when sharded, meaning that they
-        do not include FSDP-specific prefixes.
-        """
-        self.run_subtests(
-            {"auto_wrap_policy": [None, always_wrap_policy]},
-            self._test_param_and_buffer_names,
-        )
-
-    def _test_param_and_buffer_names(self, auto_wrap_policy: Optional[Callable]):
-        class Container(nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.param = nn.Parameter(torch.randn((5, 5)))
-                self.register_buffer("buf", torch.randn((5, 5)))
-
-            def forward(self, x):
-                return x @ self.param + self.buf
-
-        class Model(nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.param = nn.Parameter(torch.randn((5, 5)))
-                self.lin = nn.Linear(5, 5)
-                self.container = Container()
-                self.register_buffer("buf", torch.randn((5, 5)))
-
-            def forward(self, x):
-                z = self.container(x)
-                z = z @ self.param + self.buf
-                z = self.lin(z)
-                return z
-
-        model = Model()
-        fsdp_model = FSDP(
-            Model(), auto_wrap_policy=auto_wrap_policy, use_orig_params=True
-        )
-        param_names = [n for n, _ in model.named_parameters()]
-        fsdp_param_names = [n for n, _ in fsdp_model.named_parameters()]
-        self.assertEqual(param_names, fsdp_param_names)
-        buffer_names = [n for n, _ in model.named_buffers()]
-        fsdp_buffer_names = [n for n, _ in fsdp_model.named_buffers()]
-        self.assertEqual(buffer_names, fsdp_buffer_names)
-
     @skip_if_lt_x_gpu(2)
     def test_named_parameters_in_forward(self):
         """
@@ -1024,7 +981,10 @@ def __init__(self) -> None:
 
             def forward(self, x: torch.Tensor) -> torch.Tensor:
                 nonlocal param_shapes
-                param_names = [tup[0] for tup in self.named_parameters()]
+                # Allow for FSDP prefixes
+                param_names = [
+                    clean_tensor_name(tup[0]) for tup in self.named_parameters()
+                ]
                 params = [tup[1] for tup in self.named_parameters()]
                 assert (
                     param_shapes[0] is not None and param_shapes[1] is not None
@@ -1073,7 +1033,7 @@ def test_no_sync_correctness(self):
         )
 
     def _test_no_sync_correctness(self, sharding_strategy: ShardingStrategy):
-        model = nn.Linear(3, 3, bias=False, device="cuda")
+        model = nn.Linear(3, 3, device="cuda")
         fsdp_kwargs = {
             "sharding_strategy": sharding_strategy,
         }
@@ -1135,7 +1095,9 @@ def _check_param_grad_parity(
             param.grad.detach().clone() for param in model_use_flat_params.parameters()
         ]
         ref_grads_use_orig_params = [
-            param.grad.detach().clone() for param in model_use_orig_params.parameters()
+            param.grad.detach().clone()
+            for param in model_use_orig_params.parameters()
+            if param.grad is not None
         ]
 
         # Run a forward/backward in `no_sync()`
@@ -1159,7 +1121,9 @@ def _check_param_grad_parity(
             param.grad.detach().clone() for param in model_use_flat_params.parameters()
         ]
         grads_use_orig_params = [
-            param.grad.detach().clone() for param in model_use_orig_params.parameters()
+            param.grad.detach().clone()
+            for param in model_use_orig_params.parameters()
+            if param.grad is not None
         ]
         for grad, ref_grad in zip(grads_use_flat_params, ref_grads_use_flat_params):
             torch.testing.assert_close(grad, 2 * ref_grad)
@@ -1184,7 +1148,7 @@ def test_no_sync_mixed_precision(self):
         )
 
     def _test_no_sync_mixed_precision(self, sharding_strategy: ShardingStrategy):
-        model = nn.Linear(3, 3, bias=False, device="cuda")
+        model = nn.Linear(3, 3, device="cuda")
         mixed_precision = MixedPrecision(
             param_dtype=torch.float16,
             reduce_dtype=torch.float32,
@@ -1215,6 +1179,25 @@ def _test_no_sync_mixed_precision(self, sharding_strategy: ShardingStrategy):
                 self.assertEqual(param.grad.dtype, torch.float32)
 
 
+# Define this to be large enough to trigger stack corruption
+NUM_SIZE0_TENSORS = 1000
+
+
+class TestMultiTensorApply(TestCase):
+    def test_multi_tensor_apply_size0_tensors_cpu(self):
+        size0_tensors = [torch.empty(0, device="cpu") for _ in range(NUM_SIZE0_TENSORS)]
+        # Check that this does not segfault
+        torch._foreach_mul_(size0_tensors, 0.1)
+
+    @unittest.skipIf(not TEST_CUDA, "no cuda")
+    def test_multi_tensor_apply_size0_tensors_cuda(self):
+        size0_tensors = [
+            torch.empty(0, device="cuda") for _ in range(NUM_SIZE0_TENSORS)
+        ]
+        # Check that this does not segfault
+        torch._foreach_mul_(size0_tensors, 0.1)
+
+
 instantiate_parametrized_tests(TestFSDPUseOrigParamsMultipleParamGroups)
 instantiate_parametrized_tests(TestFSDPUseOrigParamsUnshardReshard)
 instantiate_parametrized_tests(TestFSDPUseOrigParamsParamAccess)
diff --git a/test/distributed/fsdp/test_utils.py b/test/distributed/fsdp/test_utils.py
index 249fb5326f21..758561b4eded 100644
--- a/test/distributed/fsdp/test_utils.py
+++ b/test/distributed/fsdp/test_utils.py
@@ -11,10 +11,9 @@
 import torch
 import torch.nn as nn
 from torch import distributed as dist
-from torch.distributed.fsdp._utils import _apply_to_tensors
 from torch.distributed.fsdp._wrap_utils import _get_fully_sharded_module_to_states
 from torch.distributed.fsdp.wrap import ModuleWrapPolicy
-from torch.distributed.utils import _replace_by_prefix
+from torch.distributed.utils import _apply_to_tensors, _replace_by_prefix
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     parametrize,
@@ -66,8 +65,8 @@ class SomeDataClass:
         # create a mixed bag of data.
         data = [1, "str"]
         data.append({"key1": get_a_tensor(), "key2": {1: get_a_tensor()}, "key3": 3})
-        data.insert(0, set(["x", get_a_tensor(), get_a_tensor()]))
-        data.append(([1], get_a_tensor(), (1), [get_a_tensor()], set((1, 2))))
+        data.insert(0, {"x", get_a_tensor(), get_a_tensor()})
+        data.append(([1], get_a_tensor(), (1), [get_a_tensor()], {1, 2}))
         data.append({"abc": SomeDataClass("some_key", 1.0, [get_a_tensor()])})
         od = OrderedDict()
         od["k"] = "value"
@@ -200,32 +199,24 @@ def test_get_fully_sharded_module_to_states(self):
         self.assertEqual(fully_sharded_modules[0], model)
         root_states = fully_sharded_module_to_states[fully_sharded_modules[0]]
         self.assertEqual(root_states.params, [model.lin.weight])
-        self.assertEqual(root_states.param_names, ["lin.weight"])
         self.assertEqual(root_states.buffers, [])
-        self.assertEqual(root_states.buffer_names, [])
         # - `seq1`
         self.assertEqual(fully_sharded_modules[1], model.seq1)
         seq1_states = fully_sharded_module_to_states[fully_sharded_modules[1]]
         self.assertEqual(
             seq1_states.params, [model.seq1[0].weight, model.seq1[1].weight]
         )
-        self.assertEqual(seq1_states.param_names, ["0.weight", "1.weight"])
         self.assertEqual(seq1_states.buffers, [model.seq1.seq1_buffer])
-        self.assertEqual(seq1_states.buffer_names, ["seq1_buffer"])
         # - `seq2`
         self.assertEqual(fully_sharded_modules[2], model.seq2)
         seq2_states = fully_sharded_module_to_states[fully_sharded_modules[2]]
         self.assertEqual(seq2_states.params, [model.seq2[1].weight])
-        self.assertEqual(seq2_states.param_names, ["1.weight"])
         self.assertEqual(seq2_states.buffers, [model.seq2[1].seq2_1_buffer])
-        self.assertEqual(seq2_states.buffer_names, ["1.seq2_1_buffer"])
         # - `seq2[0]`
         self.assertEqual(fully_sharded_modules[3], model.seq2[0])
         seq2_0_states = fully_sharded_module_to_states[fully_sharded_modules[3]]
         self.assertEqual(seq2_0_states.params, [])  # shared parameter
-        self.assertEqual(seq2_0_states.param_names, [])
         self.assertEqual(seq2_0_states.buffers, [])
-        self.assertEqual(seq2_0_states.buffer_names, [])
 
 
 instantiate_parametrized_tests(TestUtils)
diff --git a/test/distributed/launcher/api_test.py b/test/distributed/launcher/api_test.py
index fd9310ecc6db..8fbbb713f490 100644
--- a/test/distributed/launcher/api_test.py
+++ b/test/distributed/launcher/api_test.py
@@ -92,7 +92,7 @@ def elastic_launch_wrapper(
             rdzv_endpoint, min_nodes, max_nodes, nproc_per_node, run_id
         ),
         sys.executable,
-    )("-u", path("bin/test_script.py"), f"--touch_file_dir={test_dir}")
+    )("-u", path("bin/test_script.py"), f"--touch-file-dir={test_dir}")
 
 
 def _dist_sum(wait=0):
@@ -163,7 +163,7 @@ def test_launch_script_python(self):
         elastic_launch(
             get_test_launch_config(self._etcd_endpoint, nnodes, nnodes, nproc_per_node),
             sys.executable,
-        )("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")
+        )("-u", path("bin/test_script.py"), f"--touch-file-dir={self.test_dir}")
 
         # make sure all the workers ran.
         # each worker touches a file with its global rank as the name.
@@ -178,7 +178,7 @@ def test_launch_script_python_local_rank_transfer(self):
         elastic_launch(
             get_test_launch_config(self._etcd_endpoint, nnodes, nnodes, nproc_per_node),
             sys.executable,
-        )("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")
+        )("-u", path("bin/test_script.py"), f"--touch-file-dir={self.test_dir}")
 
         # make sure all the workers ran.
         # each worker touches a file with its global rank as the name.
@@ -248,7 +248,7 @@ def test_launch_elastic(self):
         elastic_launch(
             get_test_launch_config(self._etcd_endpoint, 1, 2, nproc_per_node),
             sys.executable,
-        )("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")
+        )("-u", path("bin/test_script.py"), f"--touch-file-dir={self.test_dir}")
 
         world_size = nproc_per_node
         self.check_works_ran(world_size)
@@ -283,7 +283,7 @@ def test_launch_elastic_agent_raise_exception(self, record_mock, mock_agent_run)
             elastic_launch(
                 get_test_launch_config(self._etcd_endpoint, 1, 2, 4),
                 sys.executable,
-            )("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")
+            )("-u", path("bin/test_script.py"), f"--touch-file-dir={self.test_dir}")
         record_mock.assert_called_once()
 
     @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan")
@@ -345,7 +345,7 @@ def test_launch_shutdown(self, agent_mock_cls):
             elastic_launch(
                 get_test_launch_config(self._etcd_endpoint, 1, 1, 4),
                 sys.executable,
-            )("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")
+            )("-u", path("bin/test_script.py"), f"--touch-file-dir={self.test_dir}")
 
             rdzv_handler_mock.shutdown.assert_called_once()
 
diff --git a/test/distributed/launcher/bin/test_script.py b/test/distributed/launcher/bin/test_script.py
index e880eceaa7e0..188db03f1e91 100755
--- a/test/distributed/launcher/bin/test_script.py
+++ b/test/distributed/launcher/bin/test_script.py
@@ -24,6 +24,7 @@ def parse_args():
 
     # file is used for assertions
     parser.add_argument(
+        "--touch-file-dir",
         "--touch_file_dir",
         type=str,
         help="dir to touch a file with global rank as the filename",
diff --git a/test/distributed/launcher/bin/test_script_init_method.py b/test/distributed/launcher/bin/test_script_init_method.py
index 0f57ce08d9d1..9c06bb95dbc8 100755
--- a/test/distributed/launcher/bin/test_script_init_method.py
+++ b/test/distributed/launcher/bin/test_script_init_method.py
@@ -19,12 +19,14 @@ def parse_args():
     parser = argparse.ArgumentParser(description="test script")
 
     parser.add_argument(
+        "--init-method",
         "--init_method",
         type=str,
         required=True,
         help="init_method to pass to `dist.init_process_group()` (e.g. env://)",
     )
     parser.add_argument(
+        "--world-size",
         "--world_size",
         type=int,
         default=os.getenv("WORLD_SIZE", -1),
diff --git a/test/distributed/launcher/bin/test_script_is_torchelastic_launched.py b/test/distributed/launcher/bin/test_script_is_torchelastic_launched.py
index 534a8f247210..691c43ddb542 100755
--- a/test/distributed/launcher/bin/test_script_is_torchelastic_launched.py
+++ b/test/distributed/launcher/bin/test_script_is_torchelastic_launched.py
@@ -27,6 +27,7 @@
 def parse_args():
     parser = argparse.ArgumentParser(description="test script")
     parser.add_argument(
+        "--out-file",
         "--out_file",
         help="file to write indicating whether this script was launched with torchelastic",
     )
diff --git a/test/distributed/launcher/bin/test_script_local_rank.py b/test/distributed/launcher/bin/test_script_local_rank.py
index 3aa4f2c844a8..e0468c966772 100755
--- a/test/distributed/launcher/bin/test_script_local_rank.py
+++ b/test/distributed/launcher/bin/test_script_local_rank.py
@@ -15,6 +15,7 @@ def parse_args():
     parser = argparse.ArgumentParser(description="test script")
 
     parser.add_argument(
+        "--local-rank",
         "--local_rank",
         type=int,
         required=True,
@@ -31,7 +32,7 @@ def main():
     actual_rank = args.local_rank
     if expected_rank != actual_rank:
         raise RuntimeError(
-            "Parameters passed: --local_rank that has different value "
+            "Parameters passed: --local-rank that has different value "
             f"from env var: expected: {expected_rank}, got: {actual_rank}"
         )
     print("End execution")
diff --git a/test/distributed/launcher/launch_test.py b/test/distributed/launcher/launch_test.py
index e80041226e89..e30d422c093f 100644
--- a/test/distributed/launcher/launch_test.py
+++ b/test/distributed/launcher/launch_test.py
@@ -47,12 +47,12 @@ def test_launch_without_env(self):
             master_port = sock.getsockname()[1]
         args = [
             f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
-            "--monitor_interval=1",
-            "--start_method=spawn",
-            "--master_addr=localhost",
-            f"--master_port={master_port}",
-            "--node_rank=0",
+            f"--nproc-per-node={nproc_per_node}",
+            "--monitor-interval=1",
+            "--start-method=spawn",
+            "--master-addr=localhost",
+            f"--master-port={master_port}",
+            "--node-rank=0",
             path("bin/test_script_local_rank.py"),
         ]
         launch.main(args)
@@ -69,15 +69,15 @@ def test_launch_with_env(self):
             master_port = sock.getsockname()[1]
         args = [
             f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
-            "--monitor_interval=1",
-            "--start_method=spawn",
-            "--master_addr=localhost",
-            f"--master_port={master_port}",
-            "--node_rank=0",
-            "--use_env",
+            f"--nproc-per-node={nproc_per_node}",
+            "--monitor-interval=1",
+            "--start-method=spawn",
+            "--master-addr=localhost",
+            f"--master-port={master_port}",
+            "--node-rank=0",
+            "--use-env",
             path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
         ]
         launch.main(args)
         # make sure all the workers ran
diff --git a/test/distributed/launcher/run_test.py b/test/distributed/launcher/run_test.py
index f626093e3d1e..4315d9135b3e 100644
--- a/test/distributed/launcher/run_test.py
+++ b/test/distributed/launcher/run_test.py
@@ -101,14 +101,14 @@ def _test_launch_user_script_python(self):
         world_size = nnodes * nproc_per_node
         args = [
             f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
-            "--rdzv_backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
-            "--monitor_interval=1",
-            "--start_method=spawn",
+            f"--nproc-per-node={nproc_per_node}",
+            "--rdzv-backend=etcd",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
+            f"--rdzv-id={run_id}",
+            "--monitor-interval=1",
+            "--start-method=spawn",
             path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
         ]
         launch.main(args)
 
@@ -127,14 +127,14 @@ def test_launch_user_script_python_caffe2_bc(self):
             master_port = sock.getsockname()[1]
         args = [
             f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
-            "--monitor_interval=1",
-            "--start_method=spawn",
-            "--master_addr=localhost",
-            f"--master_port={master_port}",
-            "--node_rank=0",
+            f"--nproc-per-node={nproc_per_node}",
+            "--monitor-interval=1",
+            "--start-method=spawn",
+            "--master-addr=localhost",
+            f"--master-port={master_port}",
+            "--node-rank=0",
             path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
         ]
         launch.main(args)
 
@@ -152,19 +152,19 @@ def test_launch_user_script_bash(self):
         world_size = nnodes * nproc_per_node
         args = [
             f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
-            "--rdzv_backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
-            "--monitor_interval=1",
-            "--start_method=spawn",
-            "--no_python",
+            f"--nproc-per-node={nproc_per_node}",
+            "--rdzv-backend=etcd",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
+            f"--rdzv-id={run_id}",
+            "--monitor-interval=1",
+            "--start-method=spawn",
+            "--no-python",
         ]
 
         script_args = [path("bin/test_script.sh"), f"{self.test_dir}"]
 
         with self.assertRaises(ValueError):
-            # --no_python cannot be used with --module
+            # --no-python cannot be used with --module
             launch.main(args + ["--module"] + script_args)
 
         launch.main(args + script_args)
@@ -182,18 +182,18 @@ def test_launch_user_script_default_nproc(self):
         world_size = 1
         args = [
             f"--nnodes={nnodes}",
-            "--rdzv_backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
-            "--monitor_interval=1",
-            "--start_method=spawn",
-            "--no_python",
+            "--rdzv-backend=etcd",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
+            f"--rdzv-id={run_id}",
+            "--monitor-interval=1",
+            "--start-method=spawn",
+            "--no-python",
         ]
 
         script_args = [path("bin/test_script.sh"), f"{self.test_dir}"]
 
         with self.assertRaises(ValueError):
-            # --no_python cannot be used with --module
+            # --no-python cannot be used with --module
             launch.main(args + ["--module"] + script_args)
 
         launch.main(args + script_args)
@@ -223,7 +223,7 @@ def test_launch_with_env_vars(self):
         script_args = [path("bin/test_script.sh"), f"{self.test_dir}"]
 
         with self.assertRaises(ValueError):
-            # --no_python cannot be used with --module
+            # --no-python cannot be used with --module
             os.environ["PET_MODULE"] = "1"
             launch.main(script_args)
 
@@ -242,13 +242,13 @@ def _test_nproc_launch_configuration(self, nproc_type, expected_number):
 
         args = [
             f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_type}",
-            "--rdzv_backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
-            "--monitor_interval=1",
-            "--start_method=spawn",
-            "--no_python",
+            f"--nproc-per-node={nproc_type}",
+            "--rdzv-backend=etcd",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
+            f"--rdzv-id={run_id}",
+            "--monitor-interval=1",
+            "--start-method=spawn",
+            "--no-python",
         ]
 
         script_args = [path("bin/test_script.sh"), f"{self.test_dir}"]
@@ -292,14 +292,14 @@ def test_launch_elastic(self):
         world_size = nproc_per_node
         args = [
             f"--nnodes={min_nodes}:{max_nodes}",
-            f"--nproc_per_node={nproc_per_node}",
-            "--rdzv_backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
-            "--monitor_interval=1",
-            "--start_method=spawn",
+            f"--nproc-per-node={nproc_per_node}",
+            "--rdzv-backend=etcd",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
+            f"--rdzv-id={run_id}",
+            "--monitor-interval=1",
+            "--start-method=spawn",
             path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
         ]
         launch.main(args)
 
@@ -323,13 +323,13 @@ def test_launch_elastic_worker_raise_exception(self, record_mock):
         nproc_per_node = 4
         args = [
             f"--nnodes={min_nodes}:{max_nodes}",
-            f"--nproc_per_node={nproc_per_node}",
-            "--rdzv_backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
-            "--monitor_interval=1",
-            "--max_restarts=0",
-            "--start_method=spawn",
+            f"--nproc-per-node={nproc_per_node}",
+            "--rdzv-backend=etcd",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
+            f"--rdzv-id={run_id}",
+            "--monitor-interval=1",
+            "--max-restarts=0",
+            "--start-method=spawn",
             path("bin/test_script.py"),
             "--fail",
         ]
@@ -354,15 +354,15 @@ def test_launch_elastic_agent_raise_exception(self, record_mock, mock_agent_run)
         nproc_per_node = 4
         args = [
             f"--nnodes={min_nodes}:{max_nodes}",
-            f"--nproc_per_node={nproc_per_node}",
-            "--rdzv_backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
-            "--monitor_interval=1",
-            "--max_restarts=0",
-            "--start_method=spawn",
+            f"--nproc-per-node={nproc_per_node}",
+            "--rdzv-backend=etcd",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
+            f"--rdzv-id={run_id}",
+            "--monitor-interval=1",
+            "--max-restarts=0",
+            "--start-method=spawn",
             path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
         ]
 
         mock_agent_run.side_effect = MockException
@@ -377,12 +377,12 @@ def test_launch_standalone(self):
         world_size = nnodes * nproc_per_node
         args = [
             f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
+            f"--nproc-per-node={nproc_per_node}",
             "--standalone",
-            "--monitor_interval=1",
-            "--start_method=spawn",
+            "--monitor-interval=1",
+            "--start-method=spawn",
             path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
         ]
         launch.main(args)
 
@@ -398,13 +398,13 @@ def test_launch_run_path(self):
         nproc_per_node = 4
         world_size = nnodes * nproc_per_node
         args = [
-            "--run_path",
+            "--run-path",
             f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
-            "--monitor_interval=1",
-            "--start_method=spawn",
+            f"--nproc-per-node={nproc_per_node}",
+            "--monitor-interval=1",
+            "--start-method=spawn",
             path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
         ]
         launch.main(args)
 
@@ -424,14 +424,14 @@ def test_launch_elastic_multiple_agents(self):
         world_size = nnodes * nproc_per_node
         args = [
             f"--nnodes={min_nodes}:{max_nodes}",
-            f"--nproc_per_node={nproc_per_node}",
-            "--rdzv_backend=etcd",
-            f"--rdzv_endpoint={self._etcd_endpoint}",
-            f"--rdzv_id={run_id}",
-            "--monitor_interval=1",
-            "--start_method=spawn",
+            f"--nproc-per-node={nproc_per_node}",
+            "--rdzv-backend=etcd",
+            f"--rdzv-endpoint={self._etcd_endpoint}",
+            f"--rdzv-id={run_id}",
+            "--monitor-interval=1",
+            "--start-method=spawn",
             path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
         ]
         procs = []
         for _ in range(nnodes - 1):
@@ -466,11 +466,11 @@ def test_launch_shutdown(self, agent_mock_cls):
         nproc_per_node = 4
         args = [
             f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
-            "--monitor_interval=1",
-            "--start_method=spawn",
+            f"--nproc-per-node={nproc_per_node}",
+            "--monitor-interval=1",
+            "--start-method=spawn",
             path("bin/test_script.py"),
-            f"--touch_file_dir={self.test_dir}",
+            f"--touch-file-dir={self.test_dir}",
         ]
         agent_mock = Mock()
         agent_mock.run.return_value = RunResult(WorkerState.SUCCEEDED)
@@ -492,12 +492,12 @@ def test_is_torchelastic_launched(self):
 
         launch.main(
             [
-                "--run_path",
+                "--run-path",
                 "--nnodes=1",
-                "--nproc_per_node=1",
-                "--monitor_interval=1",
+                "--nproc-per-node=1",
+                "--monitor-interval=1",
                 path("bin/test_script_is_torchelastic_launched.py"),
-                f"--out_file={out_file}",
+                f"--out-file={out_file}",
             ]
         )
 
@@ -519,7 +519,7 @@ def test_is_not_torchelastic_launched(self):
             "argv",
             [
                 path("bin/test_script_is_torchelastic_launched.py"),
-                f"--out_file={out_file}",
+                f"--out-file={out_file}",
             ],
         ):
             runpy.run_path(sys.argv[0], run_name="__main__")
@@ -534,9 +534,9 @@ def test_init_method_tcp(self):
             "argv",
             [
                 path("bin/test_script_init_method.py"),
-                f"--init_method=tcp://localhost:{port}",
+                f"--init-method=tcp://localhost:{port}",
                 "--rank=0",
-                "--world_size=1",
+                "--world-size=1",
             ],
         ):
             runpy.run_path(sys.argv[0], run_name="__main__")
@@ -547,14 +547,14 @@ def test_init_method_tcp_with_torchelastic(self):
         port = get_free_port()
         launch.main(
             [
-                "--run_path",
+                "--run-path",
                 "--nnodes=1",
-                "--nproc_per_node=4",
-                "--master_addr=localhost",
-                f"--master_port={port}",
-                "--monitor_interval=1",
+                "--nproc-per-node=4",
+                "--master-addr=localhost",
+                f"--master-port={port}",
+                "--monitor-interval=1",
                 path("bin/test_script_init_method.py"),
-                f"--init_method=tcp://localhost:{port}",
+                f"--init-method=tcp://localhost:{port}",
             ]
         )
         # nothing to validate, just make sure it runs
@@ -574,7 +574,7 @@ def test_init_method_env(self):
             "argv",
             [
                 path("bin/test_script_init_method.py"),
-                "--init_method=env://",
+                "--init-method=env://",
             ],
         ):
             runpy.run_path(sys.argv[0], run_name="__main__")
@@ -585,14 +585,14 @@ def test_init_method_env_with_torchelastic(self):
         port = get_free_port()
         launch.main(
             [
-                "--run_path",
+                "--run-path",
                 "--nnodes=1",
-                "--nproc_per_node=4",
-                "--master_addr=localhost",
-                f"--master_port={port}",
-                "--monitor_interval=1",
+                "--nproc-per-node=4",
+                "--master-addr=localhost",
+                f"--master-port={port}",
+                "--monitor-interval=1",
                 path("bin/test_script_init_method.py"),
-                "--init_method=env://",
+                "--init-method=env://",
             ]
         )
         # nothing to validate, just make sure it runs
diff --git a/test/distributed/optim/test_named_optimizer.py b/test/distributed/optim/test_named_optimizer.py
index 1f092e5bc314..7d30f6d1f7aa 100644
--- a/test/distributed/optim/test_named_optimizer.py
+++ b/test/distributed/optim/test_named_optimizer.py
@@ -28,7 +28,7 @@ def _run_model_training(model_optim_lists):
 
 class TestDummyModel(torch.nn.Module):
     def __init__(self):
-        super(TestDummyModel, self).__init__()
+        super().__init__()
         torch.manual_seed(0)
         self.net1 = nn.Sequential(nn.Linear(8, 16), nn.ReLU())
         self.net2 = nn.Sequential(nn.Linear(16, 32), nn.ReLU())
@@ -391,3 +391,37 @@ def test_add_param_group_error(self):
         err_msg = "some parameters are not in the module"
         with self.assertRaisesRegex(ValueError, err_msg):
             named_optim.add_param_group({"params": [torch.ones(8, 1)], "lr": 1e-5})
+
+    def test_init_state(self):
+        m = TestDummyModel()
+        named_optim = _NamedOptimizer(
+            m.named_parameters(),
+            torch.optim.SGD,
+            [
+                {"params": m.net1.parameters()},
+                {"params": m.net3.parameters(), "lr": 1e-3},
+            ],
+            lr=1e-2,
+            momentum=0.9,
+        )
+        named_sd = named_optim.state_dict()
+        self.assertTrue(m.net1[0].weight.grad is None)
+        self.assertTrue(len(named_sd["state"]) == 0)
+        named_optim.init_state()
+        named_sd = named_optim.state_dict()
+        self.assertTrue(m.net1[0].weight.grad is not None)
+        self.assertTrue("momentum_buffer" in named_sd["state"]["net1.0.weight"])
+        self.assertFalse(
+            torch.all(named_sd["state"]["net1.0.weight"]["momentum_buffer"]).item()
+        )
+        self.assertFalse(
+            torch.all(named_sd["state"]["net1.0.bias"]["momentum_buffer"]).item()
+        )
+        self.assertTrue(m.net3.bias.grad is not None)
+        self.assertTrue("momentum_buffer" in named_sd["state"]["net3.bias"])
+        self.assertFalse(
+            torch.all(named_sd["state"]["net3.bias"]["momentum_buffer"]).item()
+        )
+        self.assertFalse(
+            torch.all(named_sd["state"]["net3.weight"]["momentum_buffer"]).item()
+        )
diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py
index 3e0474c3a449..46fea149a117 100644
--- a/test/distributed/optim/test_zero_redundancy_optimizer.py
+++ b/test/distributed/optim/test_zero_redundancy_optimizer.py
@@ -64,7 +64,7 @@ def _get_backend_for_tests():
 @unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_DEV_DBG_ASAN, "CUDA + ASAN does not work.")
 class TestZeroRedundancyOptimizer(common_distributed.MultiProcessTestCase):
     def setUp(self):
-        super(TestZeroRedundancyOptimizer, self).setUp()
+        super().setUp()
         os.environ["WORLD_SIZE"] = str(self.world_size)
         self._spawn_processes()
 
@@ -268,8 +268,8 @@ def test_zero_grad(self):
         self.assertNotEqual(m.weight.grad, torch.zeros_like(m.weight))
         self.assertNotEqual(m.weight.grad, torch.zeros_like(m.weight))
         o.zero_grad()
-        self.assertFalse(m.weight.grad)
-        self.assertFalse(m.bias.grad)
+        self.assertIsNone(m.weight.grad)
+        self.assertIsNone(m.bias.grad)
 
     def test_constructor(self):
         """Check the robustness of the ZeroRedundancyOptimizer constructor by
@@ -1236,19 +1236,8 @@ def test_zero_model_parallel(
         layers are assigned to different devices."""
         if self.rank >= 2:
             return
-        # Disable DDP + ReplicatedTensor when `parameter_as_bucket_view=True`
-        # since then ZeroRedundancyOptimizer modifies the model parameters in
-        # place.
-        from torch.nn.parallel._replicated_tensor_ddp_utils import (
-            _ddp_replicated_tensor,
-        )
-
-        context = (
-            _ddp_replicated_tensor(False) if parameters_as_bucket_view else suppress()
-        )
-        with context:
-            self.dist_init(self.rank, world_size=2)
-            self._test_zero_model_parallel(parameters_as_bucket_view)
+        self.dist_init(self.rank, world_size=2)
+        self._test_zero_model_parallel(parameters_as_bucket_view)
 
     def _test_ddp_zero_overlap(
         self,
@@ -1435,21 +1424,14 @@ def test_ddp_zero_overlap(
             else hook_with_zero_step_interleaved
         )
 
-        # Disable DDP + ReplicatedTensor since ZeroRedundancyOptimizer
-        # modifies the model parameters in place.
-        from torch.nn.parallel._replicated_tensor_ddp_utils import (
-            _ddp_replicated_tensor,
+        self._test_ddp_zero_overlap(
+            device,
+            hook_constructor,
+            gradient_as_bucket_view,
+            static_graph,
+            shard_buckets=shard_buckets,
         )
 
-        with _ddp_replicated_tensor(False):
-            self._test_ddp_zero_overlap(
-                device,
-                hook_constructor,
-                gradient_as_bucket_view,
-                static_graph,
-                shard_buckets=shard_buckets,
-            )
-
 
 instantiate_parametrized_tests(TestZeroRedundancyOptimizerSingleRank)
 instantiate_parametrized_tests(TestZeroRedundancyOptimizerDistributed)
diff --git a/test/distributed/pipeline/sync/skip/test_api.py b/test/distributed/pipeline/sync/skip/test_api.py
index afee90fdbead..be38d6d83dac 100644
--- a/test/distributed/pipeline/sync/skip/test_api.py
+++ b/test/distributed/pipeline/sync/skip/test_api.py
@@ -11,6 +11,7 @@
 from torch import nn
 
 from torch.distributed.pipeline.sync.skip import Namespace, skippable, stash
+from torch.testing._internal.common_utils import run_tests
 
 
 def test_namespace_difference():
@@ -45,3 +46,7 @@ def forward(self, x):
 ))
 """.strip()
     )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/skip/test_gpipe.py b/test/distributed/pipeline/sync/skip/test_gpipe.py
index 5a7f753ccdc9..21731d452da5 100644
--- a/test/distributed/pipeline/sync/skip/test_gpipe.py
+++ b/test/distributed/pipeline/sync/skip/test_gpipe.py
@@ -14,6 +14,7 @@
 from torch.distributed.pipeline.sync.skip import pop, skippable, stash
 from torch.distributed.pipeline.sync.skip.portal import PortalBlue, PortalCopy, PortalOrange
 from torch.distributed.pipeline.sync.utils import partition_model
+from torch.testing._internal.common_utils import run_tests
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
@@ -108,3 +109,7 @@ def assert_grad_fn_is_not_portal(grad_fn, visited=None):
 
     output.local_value().sum().backward()
     assert input.grad.mean().item() == 1
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/skip/test_inspect_skip_layout.py b/test/distributed/pipeline/sync/skip/test_inspect_skip_layout.py
index 07c2c4aa3694..4d542285cd5a 100644
--- a/test/distributed/pipeline/sync/skip/test_inspect_skip_layout.py
+++ b/test/distributed/pipeline/sync/skip/test_inspect_skip_layout.py
@@ -10,6 +10,7 @@
 
 from torch.distributed.pipeline.sync.skip import Namespace, pop, skippable, stash
 from torch.distributed.pipeline.sync.skip.layout import inspect_skip_layout
+from torch.testing._internal.common_utils import run_tests
 
 
 class Pass(nn.Module):
@@ -111,3 +112,7 @@ def test_namespace():
 
     # p3 pops 'bar' before 'foo', but the plan is sorted by source partition index.
     assert policy == [[], [], [(0, ns1, "foo"), (1, ns2, "foo")]]
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/skip/test_leak.py b/test/distributed/pipeline/sync/skip/test_leak.py
index 91cbfd4960b4..e729670fb2c4 100644
--- a/test/distributed/pipeline/sync/skip/test_leak.py
+++ b/test/distributed/pipeline/sync/skip/test_leak.py
@@ -13,6 +13,7 @@
 from torch.distributed.pipeline.sync import Pipe, is_checkpointing, is_recomputing
 from torch.distributed.pipeline.sync.skip import pop, skippable, stash
 from torch.distributed.pipeline.sync.skip.tracker import current_skip_tracker
+from torch.testing._internal.common_utils import run_tests
 
 
 @skippable(stash=["skip"])
@@ -126,3 +127,7 @@ def deny(*args, **kwargs):
         model.eval()
         with torch.no_grad():
             model(input)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/skip/test_portal.py b/test/distributed/pipeline/sync/skip/test_portal.py
index 8558b974e80c..e50b5e1059b8 100644
--- a/test/distributed/pipeline/sync/skip/test_portal.py
+++ b/test/distributed/pipeline/sync/skip/test_portal.py
@@ -12,6 +12,7 @@
 from torch.distributed.pipeline.sync.dependency import fork, join
 from torch.distributed.pipeline.sync.skip.portal import Portal
 from torch.distributed.pipeline.sync.stream import default_stream
+from torch.testing._internal.common_utils import run_tests
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
@@ -155,3 +156,7 @@ def test_tensor_life_3_plus_1(self, new_portal):
         another_tensor = torch.rand(1, requires_grad=True)
         portal.put_tensor(another_tensor, tensor_life=1)
         portal.blue()
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/skip/test_stash_pop.py b/test/distributed/pipeline/sync/skip/test_stash_pop.py
index dcb25a5dc3c2..e67cfd47bd92 100644
--- a/test/distributed/pipeline/sync/skip/test_stash_pop.py
+++ b/test/distributed/pipeline/sync/skip/test_stash_pop.py
@@ -12,6 +12,7 @@
 
 from torch.distributed.pipeline.sync.skip import pop, skippable, stash
 from torch.distributed.pipeline.sync.skip.tracker import SkipTracker, use_skip_tracker
+from torch.testing._internal.common_utils import run_tests
 
 
 @pytest.fixture(autouse=True)
@@ -136,3 +137,7 @@ def forward(self, input):
 
     l1 = Stash()
     l1(torch.tensor(42))
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/skip/test_tracker.py b/test/distributed/pipeline/sync/skip/test_tracker.py
index ce242c4d2a42..5810cab97681 100644
--- a/test/distributed/pipeline/sync/skip/test_tracker.py
+++ b/test/distributed/pipeline/sync/skip/test_tracker.py
@@ -18,6 +18,7 @@
 from torch.distributed.pipeline.sync.skip import pop, skippable, stash
 from torch.distributed.pipeline.sync.skip.layout import SkipLayout
 from torch.distributed.pipeline.sync.skip.tracker import SkipTracker, SkipTrackerThroughPotals, current_skip_tracker
+from torch.testing._internal.common_utils import run_tests
 
 
 def test_default_skip_tracker():
@@ -127,3 +128,7 @@ def test_tensor_life_with_checkpointing():
     with enable_recomputing():
         skip_tracker.save(batch, None, "test", tensor)
     assert skip_tracker.portals[(None, "test")].tensor_life == 0
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/skip/test_verify_skippables.py b/test/distributed/pipeline/sync/skip/test_verify_skippables.py
index c995cdbe5332..6de439ec88d8 100644
--- a/test/distributed/pipeline/sync/skip/test_verify_skippables.py
+++ b/test/distributed/pipeline/sync/skip/test_verify_skippables.py
@@ -10,6 +10,7 @@
 from torch import nn
 
 from torch.distributed.pipeline.sync.skip import Namespace, skippable, verify_skippables
+from torch.testing._internal.common_utils import run_tests
 
 
 def test_matching():
@@ -152,3 +153,7 @@ class Layer4(nn.Module):
     verify_skippables(
         nn.Sequential(Layer1().isolate(ns1), Layer2().isolate(ns1), Layer3().isolate(ns2), Layer4().isolate(ns2),)
     )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_balance.py b/test/distributed/pipeline/sync/test_balance.py
index 0072573eecd6..b8a81aabb74a 100644
--- a/test/distributed/pipeline/sync/test_balance.py
+++ b/test/distributed/pipeline/sync/test_balance.py
@@ -14,6 +14,7 @@
 
 from torch.distributed.pipeline.sync._balance import balance_by_size, balance_by_time, blockpartition
 from torch.distributed.pipeline.sync._balance.profile import layerwise_sandbox
+from torch.testing._internal.common_utils import run_tests
 
 skip_if_no_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
 
@@ -223,3 +224,7 @@ def test_already_has_grad():
 
     with pytest.raises(ValueError, match="some parameter already has gradient"):
         balance_by_time(1, model, sample, device="cpu")
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_bugs.py b/test/distributed/pipeline/sync/test_bugs.py
index ca1a6688d3a3..764d2af10ae3 100644
--- a/test/distributed/pipeline/sync/test_bugs.py
+++ b/test/distributed/pipeline/sync/test_bugs.py
@@ -12,6 +12,7 @@
 import torch.nn.functional as F
 
 from torch.distributed.pipeline.sync import Pipe
+from torch.testing._internal.common_utils import run_tests
 
 
 def test_python_autograd_function(setup_rpc):
@@ -137,3 +138,7 @@ def forward(self, x):
     y.norm().backward()
 
     assert y.to(torch.bool).tolist() == x.grad.to(torch.bool).tolist()
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_checkpoint.py b/test/distributed/pipeline/sync/test_checkpoint.py
index 60953126f156..f3d57c218cf1 100644
--- a/test/distributed/pipeline/sync/test_checkpoint.py
+++ b/test/distributed/pipeline/sync/test_checkpoint.py
@@ -16,6 +16,7 @@
 from torch.distributed.pipeline.sync.checkpoint import Checkpointing, checkpoint, is_checkpointing, is_recomputing
 from torch.distributed.pipeline.sync.dependency import fork, join
 from torch.distributed.pipeline.sync.microbatch import Batch
+from torch.testing._internal.common_utils import run_tests
 
 devices = ["cpu"]
 if torch.cuda.is_available():
@@ -158,3 +159,7 @@ def forward(self, input):
 
     output = checkpoint(model, input)
     output[0].backward()
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_copy.py b/test/distributed/pipeline/sync/test_copy.py
index 66ea35583674..171b7ffbb8ee 100644
--- a/test/distributed/pipeline/sync/test_copy.py
+++ b/test/distributed/pipeline/sync/test_copy.py
@@ -11,6 +11,7 @@
 
 from torch.distributed.pipeline.sync.copy import Copy, Wait
 from torch.distributed.pipeline.sync.stream import CPUStream, current_stream, get_device, is_cuda, new_stream, use_stream
+from torch.testing._internal.common_utils import run_tests
 
 skip_if_no_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
 
@@ -68,3 +69,7 @@ def test_wait_multiple_tensors():
 
     assert a.grad_fn is b.grad_fn
     assert a.grad_fn.__class__ is Wait._backward_cls
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_deferred_batch_norm.py b/test/distributed/pipeline/sync/test_deferred_batch_norm.py
index 079dee387cfb..4e2578da9499 100644
--- a/test/distributed/pipeline/sync/test_deferred_batch_norm.py
+++ b/test/distributed/pipeline/sync/test_deferred_batch_norm.py
@@ -14,6 +14,7 @@
 from torch import nn, optim
 
 from torch.distributed.pipeline.sync.batchnorm import DeferredBatchNorm
+from torch.testing._internal.common_utils import run_tests
 
 CHUNKS = 4
 
@@ -192,3 +193,7 @@ def test_input_requiring_grad():
 
     assert not dbn.sum.requires_grad
     assert dbn.sum.grad_fn is None
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_dependency.py b/test/distributed/pipeline/sync/test_dependency.py
index 1821b3b038ec..cff408275994 100644
--- a/test/distributed/pipeline/sync/test_dependency.py
+++ b/test/distributed/pipeline/sync/test_dependency.py
@@ -12,6 +12,7 @@
 import torch
 
 from torch.distributed.pipeline.sync.dependency import Fork, Join, fork, join
+from torch.testing._internal.common_utils import run_tests
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
@@ -144,3 +145,7 @@ def test_join_when_fork_requires_grad():
     assert not b.requires_grad
     b = join(b, p)
     assert b.requires_grad
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_inplace.py b/test/distributed/pipeline/sync/test_inplace.py
index 04dc598b2327..eade0f43e1bd 100644
--- a/test/distributed/pipeline/sync/test_inplace.py
+++ b/test/distributed/pipeline/sync/test_inplace.py
@@ -11,6 +11,7 @@
 from torch import nn
 
 from torch.distributed.pipeline.sync import Pipe
+from torch.testing._internal.common_utils import run_tests
 
 
 def test_inplace_on_requires_grad(setup_rpc):
@@ -71,3 +72,7 @@ def forward(self, foo_bar):
     # The gradient of 'foo' should be 2, but it is 3 actually because
     # bar.add_(1) was executed twice due to checkpointing.
     assert foo.grad.item() == 2.0
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_microbatch.py b/test/distributed/pipeline/sync/test_microbatch.py
index 0eb43902a07c..82f080299425 100644
--- a/test/distributed/pipeline/sync/test_microbatch.py
+++ b/test/distributed/pipeline/sync/test_microbatch.py
@@ -11,6 +11,7 @@
 import torch.cuda
 
 from torch.distributed.pipeline.sync.microbatch import Batch, check, gather, scatter
+from torch.testing._internal.common_utils import run_tests
 
 
 def test_batch_atomic():
@@ -140,3 +141,7 @@ def test_scatter_multiple_tensors():
     assert list(b)[0].size() == (1, 1)
     assert list(a)[1].size() == (2, 2)
     assert list(b)[1].size() == (2, 2)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_phony.py b/test/distributed/pipeline/sync/test_phony.py
index 615e9c6e6f46..6aeb873b30b2 100644
--- a/test/distributed/pipeline/sync/test_phony.py
+++ b/test/distributed/pipeline/sync/test_phony.py
@@ -9,6 +9,7 @@
 import torch
 
 from torch.distributed.pipeline.sync.phony import get_phony
+from torch.testing._internal.common_utils import run_tests
 
 
 def test_phony_size():
@@ -50,3 +51,7 @@ def forward(ctx, input):
     assert p1 is not p2
     assert p1.grad_fn is not None
     assert p2.grad_fn is None
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_pipe.py b/test/distributed/pipeline/sync/test_pipe.py
index abfa738603a1..cc03a66aa7be 100644
--- a/test/distributed/pipeline/sync/test_pipe.py
+++ b/test/distributed/pipeline/sync/test_pipe.py
@@ -18,6 +18,7 @@
 
 from torch.distributed.pipeline.sync import Pipe, NoChunk, WithDevice
 from torch.distributed.pipeline.sync.pipe import PipeSequential
+from torch.testing._internal.common_utils import run_tests
 
 skip_if_no_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
 
@@ -662,7 +663,7 @@ def test_named_children(setup_rpc):
     model = nn.Sequential(OrderedDict([("a", a), ("b", b)]))
     model = Pipe(model)
 
-    names = set(n for n, _ in model.named_modules())
+    names = {n for n, _ in model.named_modules()}
     assert "partitions.0.0" in names
     assert "partitions.1.0" in names
 
@@ -819,3 +820,7 @@ def test_with_device_wrapper(setup_rpc):
     assert torch.device('cuda:0') == model(torch.rand(16, 16).cuda(0)).local_value().device
     assert [torch.device('cuda:0')] == model.devices
     assert torch.device('cuda:0') == fc2.weight.device
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_pipeline.py b/test/distributed/pipeline/sync/test_pipeline.py
index d08e1268c847..9548cb959db1 100644
--- a/test/distributed/pipeline/sync/test_pipeline.py
+++ b/test/distributed/pipeline/sync/test_pipeline.py
@@ -7,6 +7,7 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.
 from torch.distributed.pipeline.sync.pipeline import _clock_cycles
+from torch.testing._internal.common_utils import run_tests
 
 
 def test_clock_cycles():
@@ -29,3 +30,7 @@ def test_clock_cycles():
         [(3, 0), (2, 1)],
         [(3, 1)],
     ]
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_stream.py b/test/distributed/pipeline/sync/test_stream.py
index 45f8116b7f57..6fa8e99b13db 100644
--- a/test/distributed/pipeline/sync/test_stream.py
+++ b/test/distributed/pipeline/sync/test_stream.py
@@ -21,6 +21,7 @@
     use_stream,
     wait_stream,
 )
+from torch.testing._internal.common_utils import run_tests
 
 skip_if_no_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required")
 
@@ -188,3 +189,7 @@ def test_record_stream_shifted_view(self, cuda_sleep):
         with torch.cuda.stream(stream_alloc):
             z = torch.rand(2, device=torch.device("cuda"))
         assert z.data_ptr() != data_ptr
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_transparency.py b/test/distributed/pipeline/sync/test_transparency.py
index c62db97c92b7..e9a312745b12 100644
--- a/test/distributed/pipeline/sync/test_transparency.py
+++ b/test/distributed/pipeline/sync/test_transparency.py
@@ -10,6 +10,7 @@
 from torch import nn
 
 from torch.distributed.pipeline.sync import Pipe
+from torch.testing._internal.common_utils import run_tests
 
 
 def test_simple_linears(setup_rpc):
@@ -43,3 +44,7 @@ def zero_grad(parameters):
 
     # Both grads should be identical.
     assert torch.allclose(grad_with_pipe, grad_without_pipe)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/pipeline/sync/test_worker.py b/test/distributed/pipeline/sync/test_worker.py
index 39758cb9182e..7d347d48a219 100644
--- a/test/distributed/pipeline/sync/test_worker.py
+++ b/test/distributed/pipeline/sync/test_worker.py
@@ -14,6 +14,7 @@
 from torch.distributed.pipeline.sync.microbatch import Batch
 from torch.distributed.pipeline.sync.stream import CPUStream
 from torch.distributed.pipeline.sync.worker import Task, spawn_workers
+from torch.testing._internal.common_utils import run_tests
 
 
 class fake_device:
@@ -109,3 +110,7 @@ def test_worker_per_device():
         # 3: fake1, 4: fake2
         assert in_queues[3] is not in_queues[4]
         assert out_queues[3] is not out_queues[4]
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/rpc/test_share_memory.py b/test/distributed/rpc/test_share_memory.py
index 067233b8c0cc..bdfddaa02382 100644
--- a/test/distributed/rpc/test_share_memory.py
+++ b/test/distributed/rpc/test_share_memory.py
@@ -53,9 +53,6 @@ def worker_fn(m):
     pass
 
 class TestRPCPickler(TestCase):
-    def setUp(self):
-        super().setUp()
-
     def test_case(self):
         os.environ['MASTER_ADDR'] = 'localhost'
         os.environ['MASTER_PORT'] = '29500'
diff --git a/test/distributed/tensor/parallel/test_2d_parallel.py b/test/distributed/tensor/parallel/test_2d_parallel.py
index e71be70ae9ab..acb33f840481 100644
--- a/test/distributed/tensor/parallel/test_2d_parallel.py
+++ b/test/distributed/tensor/parallel/test_2d_parallel.py
@@ -10,9 +10,10 @@
 from torch.distributed._shard.sharded_tensor.api import ShardedTensor
 from torch.distributed._tensor import DeviceMesh, DTensor as DT, Replicate
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp._common_utils import FSDP_WRAPPED_MODULE
 from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
 from torch.distributed.tensor.parallel import PairwiseParallel, parallelize_module
-from torch.distributed.tensor.parallel.fsdp import is_available
+from torch.distributed.tensor.parallel.fsdp import enable_2d_with_fsdp
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 
 from torch.testing._internal.common_utils import run_tests
@@ -29,7 +30,7 @@
 
 class SimpleModel(torch.nn.Module):
     def __init__(self):
-        super(SimpleModel, self).__init__()
+        super().__init__()
         self.net1 = torch.nn.Linear(5, 8)
         self.relu = torch.nn.ReLU()
         self.net2 = torch.nn.Linear(8, 4)
@@ -99,7 +100,7 @@ class Test2dParallelIntegration(DTensorTestBase):
     @with_comms
     @skip_if_lt_x_gpu(4)
     def test_2d_fsdp_integration_functionality(self) -> None:
-        if not is_available():
+        if not enable_2d_with_fsdp():
             self.skipTest("FSDP 2d parallel integration not available")
 
         model_tp = init_model()[0]
@@ -141,10 +142,15 @@ def _compare_params(self, m1, m2):
                         p2 = p2.redistribute(p2.device_mesh, [Replicate()]).to_local()
                     self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}")
 
+    def _clean_up_fsdp_param_name(self, name):
+        return ".".join(
+            filter(lambda name: name != FSDP_WRAPPED_MODULE, name.split("."))
+        )
+
     def _test_2d_e2e_flow(
         self, use_orig_params=False, fsdp_nested=False, multi_param_group=False
     ) -> None:
-        if not is_available():
+        if not enable_2d_with_fsdp():
             self.skipTest("FSDP 2d parallel integration not available")
         torch.manual_seed(0)
         model = SimpleModel().cuda(self.rank)
@@ -154,8 +160,14 @@ def _test_2d_e2e_flow(
             use_orig_params=use_orig_params, fsdp_nested=fsdp_nested
         )
         # Check named parameters are returning the same name at least.
-        param_names_2d = [name for name, _ in model_2d.named_parameters()]
+        param_names_2d = [
+            self._clean_up_fsdp_param_name(name)
+            for name, _ in model_2d.named_parameters()
+        ]
         for name, _ in model.named_parameters():
+            name = self._clean_up_fsdp_param_name(name)
+            if name not in param_names_2d:
+                print(name, param_names_2d)
             self.assertTrue(name in param_names_2d)
         self._compare_params(model, model_2d)
 
diff --git a/test/distributed/tensor/parallel/test_parallelize_api.py b/test/distributed/tensor/parallel/test_parallelize_api.py
index 7375de3ef181..a7b37172e374 100644
--- a/test/distributed/tensor/parallel/test_parallelize_api.py
+++ b/test/distributed/tensor/parallel/test_parallelize_api.py
@@ -1,9 +1,14 @@
 # Owner(s): ["oncall: distributed"]
+from collections import OrderedDict
 
 import torch
 from torch.distributed._tensor import DeviceMesh, DTensor, Replicate
 from torch.distributed.tensor.parallel._utils import _create_1d_device_mesh
-from torch.distributed.tensor.parallel.api import _parallelize_linear, _parallelize_mlp
+from torch.distributed.tensor.parallel.api import (
+    _parallelize_linear,
+    _parallelize_mlp,
+    parallelize_module,
+)
 from torch.distributed.tensor.parallel.style import (
     ColwiseParallel,
     make_input_replicate_1d,
@@ -21,7 +26,7 @@
 
 class MLPModule(torch.nn.Module):
     def __init__(self, device):
-        super(MLPModule, self).__init__()
+        super().__init__()
         torch.manual_seed(5)
         self.net1 = torch.nn.Linear(10, 16, device=device)
         self.relu = torch.nn.ReLU()
@@ -77,6 +82,7 @@ def _compare_params(
         self,
         local_module,
         dist_module,
+        rank0_only,
         skip_rowwise_bias=False,
         compare_grad=False,
     ):
@@ -85,25 +91,32 @@ def _compare_params(
             dist_param = dist_module.get_parameter(name)
             param = param.grad if compare_grad else param
             dist_param = dist_param.grad if compare_grad else dist_param
-            if self.rank == 0 or (
-                name not in ["net2.bias"]
-                and not skip_rowwise_bias
-                or name not in ["bias", "net2.bias"]
+            if (
+                (not rank0_only)
+                or (self.rank == 0)
+                or (
+                    name not in ["net2.bias"]
+                    and not skip_rowwise_bias
+                    or name not in ["bias", "net2.bias"]
+                )
             ):
                 self.assertEqual(
                     param,
                     dist_param.redistribute(
                         device_mesh=dist_param.device_mesh, placements=replicate
                     ).to_local(),
+                    f"{name} not equal between dist and non-dist",
                 )
 
-    def _compare_module(self, local_module, dist_module, inp_size, rowwise=False):
+    def _compare_module(
+        self, local_module, dist_module, inp_size, rank0_only=True, rowwise=False
+    ):
         LR = 0.25  # the learning rate we use for testing
         local_optim = torch.optim.SGD(local_module.parameters(), lr=LR)
         dist_optim = torch.optim.SGD(dist_module.parameters(), lr=LR)
         torch.manual_seed(0)
         inp = torch.rand(*inp_size, device=self.device_type)
-        self._compare_params(local_module, dist_module)
+        self._compare_params(local_module, dist_module, rank0_only)
 
         # check forward correctness
         local_output = local_module(inp)
@@ -118,11 +131,11 @@ def _compare_module(self, local_module, dist_module, inp_size, rowwise=False):
         dist_output.sum().backward()
 
         # check backward and ensure gradients are same
-        self._compare_params(local_module, dist_module, rowwise, True)
+        self._compare_params(local_module, dist_module, rank0_only, rowwise, True)
 
         local_optim.step()
         dist_optim.step()
-        self._compare_params(local_module, dist_module, rowwise)
+        self._compare_params(local_module, dist_module, rank0_only, rowwise)
 
     @with_comms
     def test_parallelize_mlp(self):
@@ -141,6 +154,63 @@ def test_parallelize_mlp(self):
         model_tp = _parallelize_mlp(model_tp, device_mesh, PairwiseParallel())
         self._compare_module(model, model_tp, inp_size)
 
+    @with_comms
+    def test_parallelize_mlp_with_module_api(self):
+        inp_size = [12, 10]
+        model = MLPModule(self.device_type)
+        model_tp = MLPModule(self.device_type)
+
+        # Ensure model are initialized the same way.
+        self.assertEqual(model.net1.weight, model_tp.net1.weight)
+        self.assertEqual(model.net1.bias, model_tp.net1.bias)
+        self.assertEqual(model.net2.weight, model_tp.net2.weight)
+        self.assertEqual(model.net2.bias, model_tp.net2.bias)
+
+        # Parallelize module.
+        device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        model_tp = parallelize_module(
+            model_tp,
+            device_mesh,
+            {"net1": ColwiseParallel(), "net2": ColwiseParallel()},
+        )
+        self._compare_module(model, model_tp, inp_size, rank0_only=False)
+
+    @with_comms
+    def test_parallelize_mlp_with_module_api_nested(self):
+        inp_size = [12, 10]
+        model = torch.nn.Sequential(
+            OrderedDict([("dummy_encoder", MLPModule(self.device_type))])
+        )
+        model_tp = torch.nn.Sequential(
+            OrderedDict([("dummy_encoder", MLPModule(self.device_type))])
+        )
+
+        # Ensure model are initialized the same way.
+        self.assertEqual(
+            model.dummy_encoder.net1.weight, model_tp.dummy_encoder.net1.weight
+        )
+        self.assertEqual(
+            model.dummy_encoder.net1.bias, model_tp.dummy_encoder.net1.bias
+        )
+        self.assertEqual(
+            model.dummy_encoder.net2.weight, model_tp.dummy_encoder.net2.weight
+        )
+        self.assertEqual(
+            model.dummy_encoder.net2.bias, model_tp.dummy_encoder.net2.bias
+        )
+
+        # Parallelize module.
+        device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        model_tp = parallelize_module(
+            model_tp,
+            device_mesh,
+            {
+                "dummy_encoder.net1": ColwiseParallel(),
+                "dummy_encoder.net2": ColwiseParallel(),
+            },
+        )
+        self._compare_module(model, model_tp, inp_size, rank0_only=False)
+
     @with_comms
     def test_parallelize_mlp_error(self):
         class DummyParallel(ParallelStyle):
@@ -177,7 +247,7 @@ def test_linear_row_wise_parallel(self):
 
         # let each rank generate unique local input
         torch.manual_seed(self.rank)
-        self._compare_module(model, model_tp, inp_size, True)
+        self._compare_module(model, model_tp, inp_size, rowwise=True)
 
     @with_comms
     def test_linear_col_wise_parallel(self):
diff --git a/test/distributed/tensor/parallel/test_tp_examples.py b/test/distributed/tensor/parallel/test_tp_examples.py
index 12ee9b0b651c..190ff70637e8 100644
--- a/test/distributed/tensor/parallel/test_tp_examples.py
+++ b/test/distributed/tensor/parallel/test_tp_examples.py
@@ -2,10 +2,12 @@
 # Owner(s): ["oncall: distributed"]
 
 import torch
+import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed._tensor import DeviceMesh, Replicate
+from torch.distributed._tensor import DTensor, DeviceMesh, Replicate
 from torch.distributed.tensor.parallel import (
     PairwiseParallel,
+    PairwiseSequenceParallel,
     parallelize_module,
     TensorParallelMultiheadAttention,
 )
@@ -20,7 +22,7 @@
 
 class MLPModule(torch.nn.Module):
     def __init__(self, device):
-        super(MLPModule, self).__init__()
+        super().__init__()
         torch.manual_seed(5)
         self.net1 = torch.nn.Linear(10, 16, device=device)
         self.relu = torch.nn.ReLU()
@@ -41,22 +43,36 @@ def forward(self, query, key, value):
         return self.attn(query, key, value)
 
 
-# TODO: replace repeated test code with _check_module
 class DistTensorParallelExampleTest(DTensorTestBase):
-    @with_comms
-    def test_mlp_megatron_e2e(self):
+    def _check_module(self, m1, m2, check_grad=False, rank0_only_params=None):
+        rank0_only_params = [] if rank0_only_params is None else rank0_only_params
+        named_parameters = dict(m1.named_parameters())
+        for name, param_m2 in m2.named_parameters():
+            if self.rank != 0 and name in rank0_only_params:
+                continue
+            self.assertTrue(name in named_parameters)
+            param_m1 = named_parameters[name]
+            if check_grad:
+                param_m2 = param_m2.grad
+                param_m1 = param_m1.grad
+            if isinstance(param_m2, DTensor):
+                replicate = [Replicate()]
+                param_m2 = param_m2.redistribute(
+                    device_mesh=param_m2.device_mesh, placements=replicate
+                ).to_local()
+            self.assertEqual(param_m2, param_m1)
+
+    def _test_mlp_magatron_e2e(self, is_seq_parallel=False):
         inp_size = [5, 10]
         # Ensure all tp ranks have same input.
-        torch.manual_seed(0)
+        rng_seed = self.rank if is_seq_parallel else 0
+        torch.manual_seed(rng_seed)
         inp = torch.rand(*inp_size, device=self.device_type)
         model = MLPModule(self.device_type)
         model_tp = MLPModule(self.device_type)
 
         # Ensure model are initialized the same way.
-        self.assertEqual(model.net1.weight, model_tp.net1.weight)
-        self.assertEqual(model.net1.bias, model_tp.net1.bias)
-        self.assertEqual(model.net2.weight, model_tp.net2.weight)
-        self.assertEqual(model.net2.bias, model_tp.net2.bias)
+        self._check_module(model, model_tp)
 
         # Shard module and initialize optimizer.
         LR = 0.25
@@ -64,7 +80,8 @@ def test_mlp_megatron_e2e(self):
             self.device_type,
             torch.arange(0, NUM_DEVICES),
         )
-        model_tp = parallelize_module(model_tp, device_mesh, PairwiseParallel())
+        parallel_style = PairwiseSequenceParallel() if is_seq_parallel else PairwiseParallel()
+        model_tp = parallelize_module(model_tp, device_mesh, parallel_style)
         optim = torch.optim.SGD(model.parameters(), lr=LR)
         optim_tp = torch.optim.SGD(model_tp.parameters(), lr=LR)
 
@@ -75,71 +92,37 @@ def test_mlp_megatron_e2e(self):
         output.sum().backward()
         output_tp.sum().backward()
 
-        device_mesh = model_tp.net1.weight.device_mesh
-        replicate = [Replicate()] * device_mesh.ndim
+        if is_seq_parallel:
+            # Sum gradients from different ranks, since input
+            # are different across ranks for sequence parallel.
+            dist.all_reduce(model.net1.weight.grad)
+            dist.all_reduce(model.net1.bias.grad)
+            dist.all_reduce(model.net2.weight.grad)
+            dist.all_reduce(model.net2.bias.grad)
 
         # Ensure gradients are same.
-        self.assertEqual(
-            model.net1.weight.grad,
-            model_tp.net1.weight.grad.redistribute(
-                device_mesh=device_mesh, placements=replicate
-            ).to_local(),
-        )
-        self.assertEqual(
-            model.net1.bias.grad,
-            model_tp.net1.bias.grad.redistribute(
-                device_mesh=device_mesh, placements=replicate
-            ).to_local(),
-        )
-        self.assertEqual(
-            model.net2.weight.grad,
-            model_tp.net2.weight.grad.redistribute(
-                device_mesh=device_mesh, placements=replicate
-            ).to_local(),
-        )
-        self.assertEqual(
-            model.net2.bias.grad,
-            model_tp.net2.bias.grad.redistribute(
-                device_mesh=device_mesh, placements=replicate
-            ).to_local(),
-        )
+        self._check_module(model, model_tp, check_grad=True)
 
         optim.step()
         optim_tp.step()
 
         # Ensure model weights are still same after update.
-        self.assertEqual(
-            model.net1.weight,
-            model_tp.net1.weight.redistribute(
-                device_mesh=device_mesh, placements=replicate
-            ).to_local(),
-        )
-        self.assertEqual(
-            model.net1.bias,
-            model_tp.net1.bias.redistribute(
-                device_mesh=device_mesh, placements=replicate
-            ).to_local(),
-        )
-        self.assertEqual(
-            model.net2.weight,
-            model_tp.net2.weight.redistribute(
-                device_mesh=device_mesh, placements=replicate
-            ).to_local(),
-        )
         # Due to the trick we use for Partial aggregation, we only check the weight when local_rank = 0.
-        if self.rank == 0:
-            self.assertEqual(
-                model.net2.bias,
-                model_tp.net2.bias.redistribute(
-                    device_mesh=device_mesh, placements=replicate
-                ).to_local(),
-            )
+        self._check_module(model, model_tp, rank0_only_params=["net2.bias"])
 
         inp = torch.rand(*inp_size, device=self.device_type)
         output = model(inp)
         output_tp = model_tp(inp)
         self.assertEqual(output, output_tp)
 
+    @with_comms
+    def test_mlp_megatron_e2e_w_tensor_parallel(self):
+        self._test_mlp_magatron_e2e()
+
+    @with_comms
+    def test_mlp_megatron_e2e_w_sequence_parallel(self):
+        self._test_mlp_magatron_e2e(is_seq_parallel=True)
+
     # TensorParallelMultiheadAttention == dist_module(TensorParallelMultiheadAttention)
     # baddbmm introduces nan occasionally on CPU: https://github.com/pytorch/pytorch/issues/80588
     @with_comms
diff --git a/test/distributed/tensor/parallel/test_tp_style.py b/test/distributed/tensor/parallel/test_tp_style.py
index 7aeb086f03a4..7856160c6638 100644
--- a/test/distributed/tensor/parallel/test_tp_style.py
+++ b/test/distributed/tensor/parallel/test_tp_style.py
@@ -2,12 +2,15 @@
 # Owner(s): ["oncall: distributed"]
 
 import torch
+import torch.distributed as dist
 from torch.distributed._tensor import DeviceMesh, distribute_tensor, Replicate, Shard
 from torch.distributed.tensor.parallel.style import (
     ColwiseParallel,
     make_input_replicate_1d,
+    make_input_reshard_replicate,
     make_input_shard_1d,
     make_output_replicate_1d,
+    make_output_reshard_tensor,
     make_output_shard_1d,
     make_output_tensor,
     RowwiseParallel,
@@ -26,7 +29,7 @@ def world_size(self):
         return gpu_num if gpu_num % 2 == 0 and gpu_num > 4 else 4
 
     def _1d_input_func_check(
-        self, input_local_tensor, expected_local_tensor, func
+        self, input_local_tensor, expected_local_tensor, func, tensor_input_only=False
     ) -> None:
         with self.assertRaisesRegex(
             RuntimeError, "device_mesh is not passed nor can be inferred"
@@ -46,12 +49,13 @@ def _1d_input_func_check(
         # test 1: replicate local tensor
         dtensor = func(input_local_tensor, device_mesh)
         self.assertEqual(expected_local_tensor, dtensor.to_local())
-        # test 2: replicate DTensor
-        dtensor = func(dtensor)
-        self.assertEqual(expected_local_tensor, dtensor.to_local())
-        # test 3: replicate DTensor with DeviceMesh passed
-        dtensor = func(dtensor, device_mesh)
-        self.assertEqual(expected_local_tensor, dtensor.to_local())
+        if not tensor_input_only:
+            # test 2: replicate DTensor
+            dtensor = func(dtensor)
+            self.assertEqual(expected_local_tensor, dtensor.to_local())
+            # test 3: replicate DTensor with DeviceMesh passed
+            dtensor = func(dtensor, device_mesh)
+            self.assertEqual(expected_local_tensor, dtensor.to_local())
 
     @with_comms
     def test_make_input_replicate_1d(self):
@@ -63,6 +67,17 @@ def test_make_input_shard_1d(self):
         tensor = torch.rand(8, 16, device=self.device_type)
         self._1d_input_func_check(tensor, tensor, make_input_shard_1d)
 
+    @with_comms
+    def test_make_input_reshard_replicate(self):
+        tensor = torch.rand(8, 16, device=self.device_type)
+        gathered_tensor = [
+            torch.empty(8, 16, device=self.device_type)
+            for _ in range(self.world_size)
+        ]
+        dist.all_gather(gathered_tensor, tensor)
+        gathered_tensor = torch.cat(gathered_tensor)
+        self._1d_input_func_check(tensor, gathered_tensor, make_input_reshard_replicate)
+
     # Common logic for testing prepare output funcs
     def _test_prepare_output(self, func, spec, dim=None, device_mesh_input_none=False):
         device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
@@ -129,6 +144,30 @@ def test_make_output_tensor(self):
             output, dtensor.redistribute(device_mesh, [Replicate()]).to_local()
         )
 
+    @with_comms
+    def test_make_output_reshard_tensor(self):
+        # test when output is sharded.
+        output, dtensor, device_mesh = self._test_prepare_output(
+            make_output_reshard_tensor, [Shard(0)]
+        )
+        self.assertEqual(
+            output, dtensor.redistribute(device_mesh, [Shard(0)]).to_local()
+        )
+        #  test when output is replicated.
+        output, dtensor, device_mesh = self._test_prepare_output(
+            make_output_reshard_tensor, [Replicate()]
+        )
+        self.assertEqual(
+            output, dtensor.redistribute(device_mesh, [Shard(0)]).to_local()
+        )
+        # test when input device_mesh is None.
+        output, dtensor, device_mesh = self._test_prepare_output(
+            make_output_reshard_tensor, [Shard(0)], None, True
+        )
+        self.assertEqual(
+            output, dtensor.redistribute(device_mesh, [Shard(0)]).to_local()
+        )
+
     # Common logic for testing prepare output funcs errors.
     def _test_prepare_output_error(self, func):
         tensor = torch.rand(8, 16, device=self.device_type)
diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index 6683fd255eef..4237473cdd02 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -76,7 +76,7 @@ def gpus_for_rank(world_size):
     return gpus_for_rank
 
 
-class AbstractTimeoutTest(object):
+class AbstractTimeoutTest:
     def _test_store_timeout(self, backend, init_method, c2p):
         try:
             dist.init_process_group(
@@ -131,7 +131,7 @@ def _test_default_store_timeout(self, backend):
 
 class Net(nn.Module):
     def __init__(self):
-        super(Net, self).__init__()
+        super().__init__()
         self.fc1 = nn.Linear(2, 10, bias=False)
         self.fc2 = nn.Linear(10, 50, bias=False)
         self.fc3 = nn.Linear(50, 4, bias=False)
@@ -146,7 +146,7 @@ def forward(self, x):
 
 class DoubleGpuNet(nn.Module):
     def __init__(self, gpus):
-        super(DoubleGpuNet, self).__init__()
+        super().__init__()
         self.fc1 = nn.Linear(2, 10, bias=False).to(gpus[0])
         self.fc2 = nn.Linear(10, 50, bias=False).to(gpus[1])
         self.fc3 = nn.Linear(50, 4, bias=False).to(gpus[1])
@@ -166,7 +166,7 @@ def forward(self, x):
 
 class QuadraGpuNet(nn.Module):
     def __init__(self, gpus):
-        super(QuadraGpuNet, self).__init__()
+        super().__init__()
         self.fc1 = nn.Linear(2, 10, bias=False).to(gpus[0])
         self.fc2 = nn.Linear(10, 50, bias=False).to(gpus[1])
         self.fc3 = nn.Linear(50, 4, bias=False).to(gpus[2])
@@ -190,7 +190,7 @@ def forward(self, x):
 
 class ConvNet(nn.Module):
     def __init__(self, gpus, layouts, dtypes):
-        super(ConvNet, self).__init__()
+        super().__init__()
         self.dtypes = dtypes
         if isinstance(gpus, list):
             self.layer_gpus = gpus
@@ -242,14 +242,14 @@ def forward(self, x, rank):
 
 class SparseGradientModule(nn.Module):
     def __init__(self):
-        super(SparseGradientModule, self).__init__()
+        super().__init__()
         self.embedding = nn.EmbeddingBag(10, 10, sparse=True)
 
     def forward(self, x):
         return F.softmax(self.embedding(x), dim=1)
 
 
-class CommonDistributedDataParallelTest(object):
+class CommonDistributedDataParallelTest:
     def tearDown(self):
         # DistributedDataParallel test doesn't seem to call FileStore destructor
         # TODO: investigate this test and the test is known to have issues
@@ -1037,7 +1037,7 @@ def test_multi_limit_multi_dtype(self):
         self.assertEqual(per_bucket_size_limits, [200, 200, 400, 400])
 
 
-class AbstractCommTest(object):
+class AbstractCommTest:
     @property
     def op_timeout_sec(self):
         return 1
@@ -1120,7 +1120,7 @@ def _test_sequence_num_incremented_default_group(self, backend_name):
         )
         self._test_sequence_num_incremented(
             c10d._get_default_group(),
-            ranks=list(i for i in range(dist.get_world_size())),
+            ranks=list(range(dist.get_world_size())),
         )
 
     def _test_sequence_num_incremented_subgroup(self, backend_name):
@@ -1300,11 +1300,11 @@ def _test_tensor_dtype_complex(self, backend):
 
 class CommTest(AbstractCommTest, MultiProcessTestCase):
     def setUp(self):
-        super(CommTest, self).setUp()
+        super().setUp()
         self._spawn_processes()
 
     def tearDown(self):
-        super(CommTest, self).tearDown()
+        super().tearDown()
         try:
             os.remove(self.file_name)
         except OSError:
@@ -1419,11 +1419,11 @@ def recv(self, tensor_list, src, tag=0):
 
 class PythonProcessGroupExtensionTest(MultiProcessTestCase):
     def setUp(self):
-        super(PythonProcessGroupExtensionTest, self).setUp()
+        super().setUp()
         self._spawn_processes()
 
     def tearDown(self):
-        super(PythonProcessGroupExtensionTest, self).tearDown()
+        super().tearDown()
         try:
             os.remove(self.file_name)
         except OSError:
@@ -1522,16 +1522,56 @@ def world_size(self):
         return 1
 
     def setUp(self):
-        super(ProcessGroupWithDispatchedCollectivesTests, self).setUp()
+        super().setUp()
         self._spawn_processes()
 
     def tearDown(self):
-        super(ProcessGroupWithDispatchedCollectivesTests, self).tearDown()
+        super().tearDown()
         try:
             os.remove(self.file_name)
         except OSError:
             pass
 
+    def test_init_process_group_optional_backend(self):
+        with tempfile.NamedTemporaryFile() as f:
+            store = dist.FileStore(f.name, self.world_size)
+            # creates both gloo and nccl backend
+            if dist.is_gloo_available() and dist.is_nccl_available():
+                dist.init_process_group(
+                    store=store,
+                    rank=self.rank,
+                    world_size=self.world_size,
+                )
+                dist.destroy_process_group()
+
+    def test_init_process_group_for_all_backends(self):
+        for backend in dist.Backend.backend_list:
+            # skip if the backend is not available on the system
+            if backend == dist.Backend.UNDEFINED:
+                continue
+            elif backend == dist.Backend.MPI:
+                if not dist.is_mpi_available():
+                    continue
+            elif backend == dist.Backend.NCCL:
+                if not dist.is_nccl_available():
+                    continue
+            elif backend == dist.Backend.GLOO:
+                if not dist.is_gloo_available():
+                    continue
+            elif backend == dist.Backend.UCC:
+                if not dist.is_ucc_available():
+                    continue
+
+            with tempfile.NamedTemporaryFile() as f:
+                store = dist.FileStore(f.name, self.world_size)
+                dist.init_process_group(
+                    backend=backend,
+                    rank=self.rank,
+                    world_size=self.world_size,
+                    store=store
+                )
+                dist.destroy_process_group()
+
     def _call_collective_with_varying_tensors(self, backend, collective, *args):
         # call collective with varying tensors to ensure that the tensors are
         # correctly dispatched
@@ -1611,11 +1651,11 @@ def _test_all_to_all_single(self, backend):
 
 class CompilerTest(MultiProcessTestCase):
     def setUp(self):
-        super(CompilerTest, self).setUp()
+        super().setUp()
         self._spawn_processes()
 
     def tearDown(self):
-        super(CompilerTest, self).tearDown()
+        super().tearDown()
         try:
             os.remove(self.file_name)
         except OSError:
diff --git a/test/distributed/test_c10d_error_logger.py b/test/distributed/test_c10d_error_logger.py
index 7c8a6241b76b..868d44976309 100644
--- a/test/distributed/test_c10d_error_logger.py
+++ b/test/distributed/test_c10d_error_logger.py
@@ -50,7 +50,7 @@ def wrapper(self, *args, **kwargs):
 
 class C10dErrorLoggerTest(MultiProcessTestCase):
     def setUp(self):
-        super(C10dErrorLoggerTest, self).setUp()
+        super().setUp()
         os.environ["WORLD_SIZE"] = str(self.world_size)
         os.environ["BACKEND"] = BACKEND
         self._spawn_processes()
diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py
index 2b5f3f4a9465..d82d90573f6b 100644
--- a/test/distributed/test_c10d_gloo.py
+++ b/test/distributed/test_c10d_gloo.py
@@ -37,7 +37,6 @@
     ShardMetadata,
 )
 from torch.nn.parallel import DistributedDataParallel
-from torch.nn.parallel._replicated_tensor_ddp_utils import _ddp_replicated_tensor
 from torch.testing._internal.common_distributed import (
     create_device,
     MultiProcessTestCase,
@@ -217,7 +216,7 @@ def _create_process_group_gloo(self, store, rank, world_size, opts):
         return pg
 
     def setUp(self):
-        super(ProcessGroupGlooTest, self).setUp()
+        super().setUp()
         self._spawn_processes()
 
     def opts(self, threads=2):
@@ -1458,7 +1457,7 @@ class DistributedDataParallelTest(
     test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase
 ):
     def setUp(self):
-        super(DistributedDataParallelTest, self).setUp()
+        super().setUp()
         self._spawn_processes()
 
     def _get_process_group(self):
@@ -1528,7 +1527,7 @@ def _test_global_local_unused_params_grad(
 
         class GlobalLocalUnusedParamModule(nn.Module):
             def __init__(self):
-                super(GlobalLocalUnusedParamModule, self).__init__()
+                super().__init__()
                 self.t0 = Task()
                 self.t1 = Task()
                 self.task_unused = Task()
@@ -1610,7 +1609,7 @@ def test_find_unused_parameters_when_unused_parameters_empty(self):
 
         class FindUnusedParamModule(nn.Module):
             def __init__(self):
-                super(FindUnusedParamModule, self).__init__()
+                super().__init__()
                 self.t0 = Task()
                 self.t1 = Task()
 
@@ -1663,7 +1662,7 @@ def test_ignored_output(self):
 
         class IgnoredOutput(nn.Module):
             def __init__(self):
-                super(IgnoredOutput, self).__init__()
+                super().__init__()
                 self.fc1 = nn.Linear(2, 10, bias=False)
                 self.fc2 = nn.Linear(10, 4, bias=False)
                 self.relu = nn.ReLU()
@@ -1705,7 +1704,7 @@ def test_ignored_output_with_unused_parameters(self):
 
         class IgnoredOutputWithUnusedParameters(nn.Module):
             def __init__(self):
-                super(IgnoredOutputWithUnusedParameters, self).__init__()
+                super().__init__()
                 self.fc1 = nn.Linear(2, 10, bias=False)
                 self.fc2 = nn.Linear(10, 4, bias=False)
                 self.fc3 = nn.Linear(4, 4, bias=False)
@@ -1766,20 +1765,19 @@ def forward(self, x):
         local_shards = [Shard(torch.randn(5, 10, device=device), local_shard_metadata)]
         st = init_from_local_shards(local_shards, [10, 10])
         m = MyModule(st)
-        with _ddp_replicated_tensor(False):
-            DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(
-                module=m,
-                params_and_buffers_to_ignore={'st'}
-            )
-            # test to make DDP constructor will not fail when module includes a ShardedTensor when ignored
-            DistributedDataParallel(
-                m,
-                device_ids=[device] if device.type == "gpu" else None,
-                process_group=pg,
-                gradient_as_bucket_view=True,
-                broadcast_buffers=False,
-                static_graph=True,
-            )
+        DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(
+            module=m,
+            params_and_buffers_to_ignore={'st'}
+        )
+        # test to make DDP constructor will not fail when module includes a ShardedTensor when ignored
+        DistributedDataParallel(
+            m,
+            device_ids=[device] if device.type == "gpu" else None,
+            process_group=pg,
+            gradient_as_bucket_view=True,
+            broadcast_buffers=False,
+            static_graph=True,
+        )
 
     def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model):
         mult = 2
@@ -1813,7 +1811,7 @@ def test_save_load_checkpoint(self):
 
         class TestModel(nn.Module):
             def __init__(self):
-                super(TestModel, self).__init__()
+                super().__init__()
                 self.fc1 = nn.Linear(2, 10, bias=False)
                 self.fc2 = nn.Linear(10, 4, bias=False)
                 self.relu = nn.ReLU()
@@ -2113,7 +2111,7 @@ def div_by_world_size(fut):
 
 class ReducerModule(nn.Module):
     def __init__(self):
-        super(ReducerModule, self).__init__()
+        super().__init__()
         self.fc1 = nn.Linear(2, 10, bias=False)
         self.fc2 = nn.Linear(10, 4, bias=False)
         self.fc3 = nn.Linear(4, 4, bias=False)
@@ -2269,11 +2267,11 @@ def device(self):
 
 
     def setUp(self):
-        super(CommTest, self).setUp()
+        super().setUp()
         self._spawn_processes()
 
     def tearDown(self):
-        super(CommTest, self).tearDown()
+        super().tearDown()
         try:
             os.remove(self.file_name)
         except OSError:
@@ -2296,9 +2294,9 @@ def _test_broadcast_coalesced(self, process_group, device, root_rank):
         # The tensors to pass to broadcast are identical to the target
         # only on the process that is the root of the broadcast.
         if self.rank == root_rank:
-            tensors = list(tensor.clone() for tensor in target)
+            tensors = [tensor.clone() for tensor in target]
         else:
-            tensors = list(torch.zeros_like(tensor) for tensor in target)
+            tensors = [torch.zeros_like(tensor) for tensor in target]
 
         if self.rank != root_rank:
             self.assertNotEqual(tensors, target)
diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 7101e9a0217e..1a1de0a525f2 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -82,7 +82,7 @@ def test_common_errors(self):
             "MASTER_PORT": str(common.find_free_port()),
         }
 
-        class Env(object):
+        class Env:
             def __init__(self, vars):
                 self.env_patcher = mock.patch.dict(os.environ, vars, clear=True)
 
@@ -221,7 +221,7 @@ def opts(self, high_priority_stream=False):
         return opts
 
     def setUp(self):
-        super(ProcessGroupNCCLTest, self).setUp()
+        super().setUp()
         # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING hence tests
         # that use NCCL_BLOCKING_WAIT will test it as expected.
         os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
@@ -229,7 +229,7 @@ def setUp(self):
         self._spawn_processes()
 
     def tearDown(self):
-        super(ProcessGroupNCCLTest, self).tearDown()
+        super().tearDown()
         try:
             os.remove(self.file_name)
         except OSError:
@@ -1033,7 +1033,7 @@ class DistributedDataParallelTest(
     test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase
 ):
     def setUp(self):
-        super(DistributedDataParallelTest, self).setUp()
+        super().setUp()
         # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING hence tests
         # that use NCCL_BLOCKING_WAIT will test it as expected.
         os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
@@ -1240,7 +1240,7 @@ def _test_arbitrary_forward_return_value(self, gradient_as_bucket_view=False):
 
         class ForwardReturnValueModule(nn.Module):
             def __init__(self):
-                super(ForwardReturnValueModule, self).__init__()
+                super().__init__()
                 self.fc1 = nn.Linear(2, 10, bias=False)
                 self.fc2 = nn.Linear(10, 4, bias=False)
                 self.fc3 = nn.Linear(4, 4, bias=False)
@@ -1358,7 +1358,7 @@ def _test_find_unused_parameters_kwarg(self, gradient_as_bucket_view=False):
 
         class FindUnusedParametersModule(nn.Module):
             def __init__(self):
-                super(FindUnusedParametersModule, self).__init__()
+                super().__init__()
                 self.fc1 = nn.Linear(2, 10, bias=False)
                 self.fc2 = nn.Linear(10, 4, bias=False)
                 self.fc3 = nn.Linear(4, 4, bias=False)
@@ -1504,7 +1504,7 @@ def _test_multiple_outputs_multiple_backward(self, gradient_as_bucket_view=False
 
         class MultipleOutputModule(nn.Module):
             def __init__(self):
-                super(MultipleOutputModule, self).__init__()
+                super().__init__()
 
                 def define_module():
                     return nn.Sequential(
@@ -1566,7 +1566,7 @@ def test_no_grad(self):
 
         class NoGradModule(nn.Module):
             def __init__(self):
-                super(NoGradModule, self).__init__()
+                super().__init__()
                 self.fc1 = nn.Linear(2, 10, bias=False)
                 self.fc2 = nn.Linear(10, 4, bias=False)
                 self.relu = nn.ReLU()
@@ -1681,7 +1681,7 @@ def test_failure_recovery(self):
 
         class TestModel(nn.Module):
             def __init__(self):
-                super(TestModel, self).__init__()
+                super().__init__()
                 self.fc1 = nn.Linear(2, 10, bias=False)
                 self.fc2 = nn.Linear(10, 4, bias=False)
                 self.relu = nn.ReLU()
@@ -2297,7 +2297,7 @@ def test_ddp_packed_sequence(self):
             store=store,
         )
         seqs = ["sequence_sequence", "seq", "sequence"]
-        vocab = ["<pad>"] + sorted(set([ch for seq in seqs for ch in seq]))
+        vocab = ["<pad>"] + sorted({ch for seq in seqs for ch in seq})
         vectorized_seqs = [[vocab.index(tok) for tok in seq] for seq in seqs]
         # Set the seed to make the embedding and LSTM deterministic (even
         # across ranks since DDP broadcasts parameters from rank 0)
@@ -2350,7 +2350,7 @@ def test_channels_last_contig(self):
 
 class NcclErrorHandlingTest(MultiProcessTestCase):
     def setUp(self):
-        super(NcclErrorHandlingTest, self).setUp()
+        super().setUp()
         # Need to skip return code checking for these tests since the child
         # processes don't exit cleanly.
         self.skip_return_code_checks = [
@@ -2365,7 +2365,7 @@ def setUp(self):
         self._spawn_processes()
 
     def tearDown(self):
-        super(NcclErrorHandlingTest, self).tearDown()
+        super().tearDown()
         try:
             os.remove(self.file_name)
         except OSError:
@@ -2593,14 +2593,14 @@ def device(self):
 
 
     def setUp(self):
-        super(CommTest, self).setUp()
+        super().setUp()
         # NCCL_BLOCKING_WAIT overrides NCCL_ASYNC_ERROR_HANDLING hence tests
         # that use NCCL_BLOCKING_WAIT will test it as expected.
         os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
         self._spawn_processes()
 
     def tearDown(self):
-        super(CommTest, self).tearDown()
+        super().tearDown()
         try:
             os.remove(self.file_name)
         except OSError:
@@ -2623,9 +2623,9 @@ def _test_broadcast_coalesced(self, process_group, device, root_rank):
         # The tensors to pass to broadcast are idential to the target
         # only on the process that is the root of the broadcast.
         if self.rank == root_rank:
-            tensors = list(tensor.clone() for tensor in target)
+            tensors = [tensor.clone() for tensor in target]
         else:
-            tensors = list(torch.zeros_like(tensor) for tensor in target)
+            tensors = [torch.zeros_like(tensor) for tensor in target]
 
         if self.rank != root_rank:
             self.assertNotEqual(tensors, target)
diff --git a/test/distributed/test_c10d_object_collectives.py b/test/distributed/test_c10d_object_collectives.py
index eed85704aa09..a132e6958ad2 100644
--- a/test/distributed/test_c10d_object_collectives.py
+++ b/test/distributed/test_c10d_object_collectives.py
@@ -41,7 +41,7 @@ def wrapper(self, *args, **kwargs):
 
 class TestObjectCollectives(MultiProcessTestCase):
     def setUp(self):
-        super(TestObjectCollectives, self).setUp()
+        super().setUp()
         os.environ["WORLD_SIZE"] = str(self.world_size)
         os.environ["BACKEND"] = BACKEND
         self._spawn_processes()
diff --git a/test/distributed/test_c10d_pypg.py b/test/distributed/test_c10d_pypg.py
index 9c9e0c4422d9..32f33591850f 100644
--- a/test/distributed/test_c10d_pypg.py
+++ b/test/distributed/test_c10d_pypg.py
@@ -43,7 +43,7 @@ class LonelyRankProcessGroup(dist.ProcessGroup):
     This PG only supports world_size of 1
     """
     def __init__(self, rank, world, use_wrapper):
-        super(LonelyRankProcessGroup, self).__init__(rank, world)
+        super().__init__(rank, world)
         assert rank == 0
         assert world == 1
 
@@ -91,7 +91,7 @@ def __repr__(self):
 # We cannot use parametrize as some tests are defined on the base class and use _get_process_group
 class AbstractDDPSingleRank(test_c10d_common.CommonDistributedDataParallelTest):
     def setUp(self):
-        super(AbstractDDPSingleRank, self).setUp()
+        super().setUp()
         self._spawn_processes()
 
     @property
@@ -99,7 +99,7 @@ def world_size(self):
         return 1
 
     def tearDown(self):
-        super(AbstractDDPSingleRank, self).tearDown()
+        super().tearDown()
         try:
             os.remove(self.file_name)
         except OSError:
diff --git a/test/distributed/test_c10d_spawn.py b/test/distributed/test_c10d_spawn.py
index 0e87bdc17297..8ac496ea6c06 100644
--- a/test/distributed/test_c10d_spawn.py
+++ b/test/distributed/test_c10d_spawn.py
@@ -33,7 +33,7 @@
     sys.exit(0)
 
 
-class AbstractProcessGroupShareTensorTest(object):
+class AbstractProcessGroupShareTensorTest:
     world_size = 2
 
     def _test_multiprocess(self, f, shared_tensors, init_pg, n_output):
@@ -103,11 +103,11 @@ def _test_allgather_process(
 
 class TestDistributedNNFunctions(MultiProcessTestCase):
     def setUp(self):
-        super(TestDistributedNNFunctions, self).setUp()
+        super().setUp()
         self._spawn_processes()
 
     def tearDown(self):
-        super(TestDistributedNNFunctions, self).tearDown()
+        super().tearDown()
         try:
             os.remove(self.file_name)
         except OSError:
diff --git a/test/distributed/test_c10d_spawn_gloo.py b/test/distributed/test_c10d_spawn_gloo.py
index 990f88fb9d31..0be3fc22971c 100644
--- a/test/distributed/test_c10d_spawn_gloo.py
+++ b/test/distributed/test_c10d_spawn_gloo.py
@@ -37,6 +37,7 @@ def _init_pg_gloo(cls, rank, filename, world_size):
             pg = c10d.distributed_c10d._get_default_group()
             pg._register_backend(torch.device("cpu"), c10d.ProcessGroup.BackendType.GLOO, backend)
             pg._register_backend(torch.device("cuda"), c10d.ProcessGroup.BackendType.GLOO, backend)
+
             return pg
 
         @sandcastle_skip_if(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
@@ -154,7 +155,7 @@ def test_rnn(self):
 
         class Net(nn.Module):
             def __init__(self, input_dim, hidden_dim, output_dim, hidden_layers):
-                super(Net, self).__init__()
+                super().__init__()
                 self.input_dim = input_dim
                 self.hidden_dim = hidden_dim
                 self.output_dim = output_dim
diff --git a/test/distributed/test_data_parallel.py b/test/distributed/test_data_parallel.py
index c1720344e49d..3a062b80cc97 100644
--- a/test/distributed/test_data_parallel.py
+++ b/test/distributed/test_data_parallel.py
@@ -33,7 +33,7 @@ class TestDataParallel(TestCase):
     def test_data_parallel_buffers_requiring_grad(self):
         class TestModule(nn.Module):
             def __init__(self, t):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.register_buffer('t_rg', t)
                 self.register_buffer('t_not_rg', t.clone().detach())
 
@@ -57,7 +57,7 @@ def test_data_parallel_rnn(self):
         class TestModule(torch.nn.Module):
 
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.rnn = torch.nn.LSTM(300, 1024, 1, batch_first=True, bidirectional=True)
 
             def forward(self, x):
@@ -305,7 +305,7 @@ def test_data_parallel_model_no_refcycles(self):
 
         class Model(nn.Module):
             def __init__(self):
-                super(Model, self).__init__()
+                super().__init__()
                 self.linear = nn.Linear(1, 1)
 
             def forward(self, x):
@@ -630,7 +630,7 @@ def test_zero_grad(self):
 
         class Net(torch.nn.Module):
             def __init__(self, testcase):
-                super(Net, self).__init__()
+                super().__init__()
                 self._testcase = testcase
 
             def forward(self, x):
@@ -648,11 +648,11 @@ def forward(self, x):
     def test_autocast(self):
         class Model(torch.nn.Linear):
             def __init__(self):
-                super(Model, self).__init__(8, 8)
+                super().__init__(8, 8)
 
             @torch.cuda.amp.autocast()
             def forward(self, input):
-                return super(Model, self).forward(input)
+                return super().forward(input)
 
         model = dp.DataParallel(Model().cuda().to(dtype=torch.float32))
         input = torch.randn((8, 8), dtype=torch.float32, device="cuda")
@@ -672,7 +672,7 @@ def test_save_replica_module(self):
     def test_strided_grad_layout(self):
         class ConvNet(nn.Module):
             def __init__(self, layouts, dtype_list):
-                super(ConvNet, self).__init__()
+                super().__init__()
                 self.dtypes = dtype_list
                 self.conv0 = torch.nn.Conv2d(8, 16, (2, 2)).to(memory_format=layouts[0], dtype=dtype_list[0])
                 self.conv1 = torch.nn.Conv2d(16, 32, (2, 2)).to(memory_format=layouts[1], dtype=dtype_list[1])
@@ -742,7 +742,7 @@ def forward(self, x):
     def test_parameter_list_dict_replica(self):
         class MyMod(torch.nn.Module):
             def __init__(self, data, check_fn):
-                super(MyMod, self).__init__()
+                super().__init__()
                 self.data = data
                 self.check_fn = check_fn
 
@@ -800,7 +800,7 @@ def test_data_parallel_module(self, device, dtype):
     def test_data_parallel_module_kwargs_only(self, device, dtype):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.l = l
 
             def forward(self, input):
@@ -820,7 +820,7 @@ def forward(self, input):
     def test_data_parallel_module_kwargs_only_empty_list(self, device, dtype):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.l = l
 
             def forward(self, input):
@@ -840,7 +840,7 @@ def forward(self, input):
     def test_data_parallel_module_kwargs_only_empty_dict(self, device, dtype):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.l = l
 
             def forward(self, input):
@@ -860,7 +860,7 @@ def forward(self, input):
     def test_data_parallel_module_kwargs_only_empty_tuple(self, device, dtype):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.l = l
 
             def forward(self, input):
diff --git a/test/distributed/test_distributed_spawn.py b/test/distributed/test_distributed_spawn.py
index b2a23ff22a9b..8499f167c6c9 100644
--- a/test/distributed/test_distributed_spawn.py
+++ b/test/distributed/test_distributed_spawn.py
@@ -33,7 +33,7 @@ class TestDistBackendWithSpawn(TestDistBackend, DistributedTest._DistTestBase):
         def setUp(self):
             super().setUp()
             self._spawn_processes()
-            torch.backends.cudnn.flags(allow_tf32=False).__enter__()
+            torch.backends.cudnn.flags(enabled=True, allow_tf32=False).__enter__()
 
 
 if __name__ == "__main__":
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index ade7d9254399..523c9360007d 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -1,16 +1,14 @@
 # Owner(s): ["module: dynamo"]
 import copy
 import functools
-import os
 import random
 import unittest
 from unittest.mock import patch
 import numpy as np
 import torch
 import torch._dynamo
-from torch._dynamo.optimizations.distributed import DDPOptimizer
+from torch._dynamo.backends.distributed import DDPOptimizer
 import torch._dynamo.test_case
-import torch.distributed as dist
 from contextlib import contextmanager
 from torch import nn
 from torch._dynamo import config
@@ -21,10 +19,12 @@
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.testing._internal.common_distributed import (
-    MultiProcessTestCase,
+    DynamoDistributedSingleProcTestCase,
+    DynamoDistributedMultiProcTestCase,
     import_transformers_or_skip,
     skip_if_lt_x_gpu,
-    requires_nccl
+    requires_nccl,
+    _dynamo_dist_per_rank_init,
 )
 import torch._dynamo.logging
 
@@ -62,15 +62,18 @@ def get_model(device, bsz=20, in_feat=10, hidden_feat=5000, out_feat=5):
 def get_custom_model(device):
     class MyCustomLinear(torch.nn.Module):
         def __init__(self):
-            super(MyCustomLinear, self).__init__()
+            super().__init__()
             self.weight = nn.Parameter(torch.randn(512, 512))
 
         def forward(self, x):
-            return torch.mm(x, self.weight.t())
+            tmp = torch.mm(x, self.weight.t())
+            # test an edge case where torch.where.scalar was decomposed to aten.where.self(tensor, tensor, tensor)
+            # and the tensors T(0.4) and T(0.5) were not wrapped in FakeTensors during DDPOptimizer compilation
+            return tmp + torch.where(tmp < 0.5, 0.3, 0.6)
 
     class MyLinear(torch.nn.Module):
         def __init__(self):
-            super(MyLinear, self).__init__()
+            super().__init__()
             self.linear = torch.nn.Linear(512, 512)
 
         def forward(self, x):
@@ -78,22 +81,27 @@ def forward(self, x):
 
     class MyModule(torch.nn.Module):
         def __init__(self):
-            super(MyModule, self).__init__()
+            super().__init__()
             mods = [
                 (MyLinear(), torch.nn.ReLU()),
-                # sandwitch the custom in the middle so it comes before and after
+                # sandwich the custom in the middle so it comes before and after
                 (MyCustomLinear(), torch.nn.ReLU()),
                 (MyLinear(), torch.nn.ReLU()),
             ]
             self.seq = torch.nn.Sequential(*[x for items in mods for x in items])
 
-        def forward(self, x):
-            return self.seq(x)
+        def forward(self, x, y):
+            # test special case where the 0th bucket (layers close to graph input) is at capacity, which would
+            # trigger a new bucket, but there are only trivial ops without parameters to put into the new bucket.
+            # optimize this case by fusing that 'empty bucket' back together with the previous full one
+            return self.seq(x + y)
 
     m = MyModule().to(device)
     m.apply(init_weights)
     inputs = torch.rand((512, 512)).to(device)
-    correct_outputs = m(inputs)
+    # test duplicated inputs
+    inputs = (inputs, inputs)
+    correct_outputs = m(*inputs)
     return m, inputs, correct_outputs
 
 def get_hf_bert(rank):
@@ -120,21 +128,6 @@ def compile_fn(self, gm, example_inputs):
         self.compiler_called += 1
         return gm
 
-@contextmanager
-def _per_rank_init(rank, world_size):
-    # To avoid multiple inheritance from _dynamo.test_case.TestCase and MultiProcessTestCase,
-    # Just manually implement the most important part of the dynamo behavior to reset/clear.
-    torch.cuda.set_device(rank)
-    os.environ['MASTER_ADDR'] = 'localhost'
-    os.environ['MASTER_PORT'] = '6789'
-    dist.init_process_group("nccl", rank=rank, world_size=world_size)
-    torch._dynamo.reset()
-    torch._dynamo.utils.counters.clear()
-    yield
-    torch._dynamo.reset()
-    torch._dynamo.utils.counters.clear()
-    dist.destroy_process_group()
-
 
 # This simulates DDP, but it doesn't actually do any process communication;
 # it just has enough properties so that the dynamo distributed optimization is
@@ -211,39 +204,16 @@ def forward(self):
 # single process version; if it's just a problem in the Dynamo distributed
 # optimizer, you should be able to repro it single process!
 @requires_nccl()
-class TestDistributedMultiProc(MultiProcessTestCase):
-    def setUp(self):
-        super(TestDistributedMultiProc, self).setUp()
-        self._spawn_processes()
-
-    def tearDown(self):
-        super(TestDistributedMultiProc, self).tearDown()
-        try:
-            os.remove(self.file_name)
-        except OSError:
-            pass
-
-    @property
-    def world_size(self) -> int:
-        return torch.cuda.device_count()
-
-    @classmethod
-    def _run(cls, rank: int, test_name: str, file_name: str, parent_pipe) -> None:
-        # Don't enable DDP + ReplicatedTensor, as that breaks Dynamo+DDP
-        # TODO(whc) why is ReplicatedTensor defaulted=True in MultiProcessTestCase, and should we support it?
-        # from torch.nn.parallel._replicated_tensor_ddp_utils import _set_ddp_with_replicated_tensor
-        # _set_ddp_with_replicated_tensor(True)
-
-        # The rest is copypasta from MultiProcessTestCase._run
-        self = cls(test_name)
-        self.rank = rank
-        self.file_name = file_name
-        self.run_test(test_name, parent_pipe)
-
+class TestMultiProc(DynamoDistributedMultiProcTestCase):
+    """
+    Note: MultiProcTestCase spawns processes per test and is slow.
+    Prefer MultiThreadedTestCase for most tests. Perhaps use this one
+    sparingly for integration tests.
+    """
     @skip_if_lt_x_gpu(2)
     @patch.object(config, "optimize_ddp", False)
     def test_ddp_baseline_aot_eager_multiprocess(self):
-        with _per_rank_init(self.rank, self.world_size):
+        with _dynamo_dist_per_rank_init(self.rank, self.world_size):
             self.assertFalse(config.optimize_ddp)
             m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
             m = DDP(m, device_ids=[self.rank])
@@ -258,7 +228,7 @@ def test_ddp_baseline_aot_eager_multiprocess(self):
     @patch.object(torch._inductor.config, "fallback_random", True)
     def test_hf_bert_ddp_inductor(self):
 
-        with _per_rank_init(self.rank, self.world_size):
+        with _dynamo_dist_per_rank_init(self.rank, self.world_size):
             model, inputs = get_hf_bert(self.rank)
             model = DDP(model)
             run_hf_bert_ddp(self, model, inputs, "inductor")
@@ -267,14 +237,14 @@ def test_hf_bert_ddp_inductor(self):
     @import_transformers_or_skip()
     @patch.object(config, "optimize_ddp", True)
     def test_hf_bert_ddp_aot_eager(self):
-        with _per_rank_init(self.rank, self.world_size):
+        with _dynamo_dist_per_rank_init(self.rank, self.world_size):
             model, inputs = get_hf_bert(self.rank)
             model = DDP(model)
             run_hf_bert_ddp(self, model, inputs, "aot_eager")
 
     @skip_if_lt_x_gpu(1)
     def test_fsdp_aot_eager(self):
-        with _per_rank_init(self.rank, self.world_size):
+        with _dynamo_dist_per_rank_init(self.rank, self.world_size):
             # Test with basic FSDP wrapping (outer wrap around whole model)
             m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
             fsdp_m = FSDP(m, use_orig_params=True)
@@ -298,7 +268,7 @@ def test_fsdp_aot_eager(self):
     @skip_if_lt_x_gpu(1)
     @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     def test_fsdp_inductor(self):
-        with _per_rank_init(self.rank, self.world_size):
+        with _dynamo_dist_per_rank_init(self.rank, self.world_size):
             # Test with basic FSDP wrapping (outer wrap around whole model)
             m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
             fsdp_m = FSDP(m, use_orig_params=True)
@@ -335,7 +305,7 @@ def apply_fsdp(model, wrap_policy):
             )
             return model
 
-        with _per_rank_init(self.rank, self.world_size):
+        with _dynamo_dist_per_rank_init(self.rank, self.world_size):
             for (wrap_policy, test_instance) in (
                 (
                     None,
@@ -371,33 +341,13 @@ def apply_fsdp(model, wrap_policy):
 
 
 @requires_nccl()
-class TestDistributed(torch._dynamo.test_case.TestCase):
-    """
-    Test harness initializes dist process group
+class TestSingleProc(DynamoDistributedSingleProcTestCase):
     """
+    Test harness initializes dist process group.
 
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        # _exit_stack is set up in TestCase
-        cls._exit_stack.enter_context(
-            patch.dict(
-                os.environ,
-                {
-                    "MASTER_ADDR": "localhost",
-                    "MASTER_PORT": "12355",
-                },
-            )
-        )
-        cls.rank = 0
-        cls.device = f"cuda:{cls.rank}"
-        cls.device_ids = None if "cuda" in cls.device else [cls.rank]
-        dist.init_process_group("nccl", rank=cls.rank, world_size=1)
-
-    @classmethod
-    def tearDownClass(cls):
-        dist.destroy_process_group()
-        super().tearDownClass()
+    Test simple things here since they are simpler to debug.
+    Use TestMultiProc for things that really need to run on multiple nodes
+    """
 
     def get_model(self, bsz=20, in_feat=10, hidden_feat=5000, out_feat=5):
         m = ToyModel(in_feat=in_feat, hidden_feat=hidden_feat, out_feat=out_feat).to(self.device)
@@ -449,6 +399,13 @@ def opt_fn(inputs):
         self.assertTrue(same(correct_outputs, opt_outputs))
         self.assertEqual(check_splits_compiler.compiler_called, 3)
 
+        # ensure compatibilty with dynamo explain
+
+        explain_out = torch._dynamo.explain(ddp_m, inputs)
+        break_reasons = explain_out[4]
+        self.assertEqual(len(break_reasons), 3)
+        self.assertTrue(all(["DDPOptimizer" in r.reason for r in break_reasons]))
+
     @patch.object(config, "optimize_ddp", True)
     @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     def test_graph_split_inductor(self):
@@ -517,7 +474,7 @@ def test_custom_layer(self):
 
         @torch._dynamo.optimize(check_splits_compiler.compile_fn)
         def opt_fn(inputs):
-            return ddp_m(inputs)
+            return ddp_m(*inputs)
 
         opt_outputs = opt_fn(inputs)
         self.assertTrue(same(correct_outputs, opt_outputs))
@@ -560,7 +517,7 @@ def test_ignored_parameters(self):
 
         @torch._dynamo.optimize(ddp_optimizer.compile_fn)
         def opt_fn(inputs):
-            return ddp_m(inputs)
+            return ddp_m(*inputs)
 
         opt_outputs = opt_fn(inputs)
         self.assertTrue(same(correct_outputs, opt_outputs))
diff --git a/test/distributed/test_functional_api.py b/test/distributed/test_functional_api.py
new file mode 100644
index 000000000000..8ccff34fe870
--- /dev/null
+++ b/test/distributed/test_functional_api.py
@@ -0,0 +1,269 @@
+# Owner(s): ["oncall: distributed"]
+
+import sys
+import torch
+import torch.distributed as dist
+import torch.distributed._functional_collectives as ft_c
+import torch.distributed.distributed_c10d as c10d
+import torch.distributed._tensor as dt
+
+from functorch import make_fx
+
+if not dist.is_available():
+    print("Distributed not available, skipping tests", file=sys.stderr)
+    sys.exit(0)
+
+from torch.testing._internal.common_distributed import (
+    MultiThreadedTestCase,
+)
+from torch.testing._internal.common_utils import (
+    run_tests,
+    TestCase
+)
+
+def new_subgroups(group_size: int, pg_tag=None):
+    world_size = dist.get_world_size()
+    subgroups = []
+    cur_subgroup = None
+
+    for subgroup_id in range(world_size // group_size):
+        start_rank = subgroup_id * group_size
+        end_rank = start_rank + group_size
+        ranks_in_subgroup = list(range(start_rank, end_rank))
+        subgroup = c10d._new_group_with_tag(
+            ranks=ranks_in_subgroup,
+            pg_tag=pg_tag,
+        )
+        subgroups.append(subgroup)
+
+        rank = dist.get_rank()
+        if rank in ranks_in_subgroup:
+            cur_subgroup = subgroup
+
+    return cur_subgroup, subgroups
+
+
+class TestExpand(MultiThreadedTestCase):
+    @property
+    def world_size(self):
+        return 4
+
+    def setUp(self):
+        super().setUp()
+        self._spawn_threads()
+
+    def test_expand_1d_rank_list(self):
+        tag, rankset, group_size = ft_c._expand_group([0, 1, 2, 3])
+        self.assertEqual("", tag)
+        self.assertEqual([0, 1, 2, 3], rankset)
+        self.assertEqual(4, group_size)
+
+        tag, rankset, group_size = ft_c._expand_group([0, 1, 2, 3], "bla")
+        self.assertEqual("bla", tag)
+
+    def test_expand_2d_rank_list(self):
+        tag, rankset, group_size = ft_c._expand_group([[0, 1], [2, 3]])
+        self.assertEqual("", tag)
+        self.assertEqual([0, 1, 2, 3], rankset)
+        self.assertEqual(2, group_size)
+
+        tag, rankset, group_size = ft_c._expand_group([[0, 1], [2, 3]], "blu")
+        self.assertEqual("blu", tag)
+
+        with self.assertRaisesRegex(ValueError, "group sizes must be identical"):
+            ft_c._expand_group([[0], [1, 2, 3]])
+
+    def test_expand_process_group(self):
+        tag, rankset, group_size = ft_c._expand_group(dist.group.WORLD)
+        self.assertEqual(c10d._get_group_tag(dist.group.WORLD), tag)
+        self.assertEqual([0, 1, 2, 3], rankset)
+        self.assertEqual(4, group_size)
+
+        tag, rankset, group_size = ft_c._expand_group(dist.group.WORLD, "bla")
+        self.assertEqual("bla", tag)
+
+        my_pg, others = new_subgroups(group_size=2)
+        tag, rankset, group_size = ft_c._expand_group(my_pg)
+        self.assertEqual(c10d._get_group_tag(my_pg), tag)
+        self.assertEqual(dist.get_process_group_ranks(my_pg), rankset)
+        self.assertEqual(2, group_size)
+
+        my_pg = None
+        for i in range(dist.get_world_size()):
+            group = c10d._new_group_with_tag([i], pg_tag="my_pg")
+            if i == dist.get_rank():
+                my_pg = group
+        tag, rankset, group_size = ft_c._expand_group(my_pg)
+        self.assertEqual("my_pg", tag)
+        self.assertEqual([dist.get_rank()], rankset)
+        self.assertEqual(1, group_size)
+
+        tag, rankset, group_size = ft_c._expand_group(my_pg, "bla")
+        self.assertEqual("bla", tag)
+
+    def test_expand_device_mesh(self):
+        mesh = dt.DeviceMesh("cpu", torch.arange(4))
+        tag, rankset, group_size = ft_c._expand_group(mesh)
+        self.assertEqual(c10d._get_group_tag(mesh.get_dim_groups()[0]), tag)
+        self.assertEqual([0, 1, 2, 3], rankset)
+        self.assertEqual(4, group_size)
+
+        mesh = dt.DeviceMesh("cpu", torch.arange(4))
+        tag, rankset, group_size = ft_c._expand_group(mesh)
+        self.assertEqual(c10d._get_group_tag(mesh.get_dim_groups()[0]), tag)
+        self.assertEqual([0, 1, 2, 3], rankset)
+        self.assertEqual(4, group_size)
+
+    def test_expand_device_mesh_tuple(self):
+        mesh = dt.DeviceMesh("cpu", torch.arange(4).view(2, 2))
+        tag, rankset, group_size = ft_c._expand_group(mesh)
+        self.assertEqual(c10d._get_group_tag(mesh.get_dim_groups()[0]), tag)
+        self.assertEqual([0, 2, 1, 3], rankset)
+        self.assertEqual(2, group_size)
+
+        tag, rankset, group_size = ft_c._expand_group((mesh, 0))
+        self.assertEqual(c10d._get_group_tag(mesh.get_dim_groups()[0]), tag)
+        self.assertEqual([0, 2, 1, 3], rankset)
+        self.assertEqual(2, group_size)
+
+        tag, rankset, group_size = ft_c._expand_group((mesh, 1))
+        self.assertEqual(c10d._get_group_tag(mesh.get_dim_groups()[1]), tag)
+        self.assertEqual([0, 1, 2, 3], rankset)
+        self.assertEqual(2, group_size)
+
+class TestPgTag(MultiThreadedTestCase):
+    @property
+    def world_size(self):
+        return 4
+
+    def setUp(self):
+        super().setUp()
+        self._spawn_threads()
+
+    """
+    The behavior we want is as follow:
+
+    - rankset+tag will always result in the same PG.
+    Do we enforce this by failing creation of new PGs or returning existing ones?
+        Return existing one.
+
+    - default tag gives existing behavior.
+        This means we should create duplicates.
+    - _expand_group on _default-tagged pg should always resolve to it
+        This mean we can't depend on empty tag + rankset.
+    """
+    def test_pg_creation_with_tag(self):
+        my_group, _ = new_subgroups(group_size=2, pg_tag="blu")
+        my_group2, _ = new_subgroups(group_size=2, pg_tag="blu")
+        self.assertEqual(my_group, my_group2)
+
+        my_group3, _ = new_subgroups(group_size=2, pg_tag="blu2")
+        self.assertNotEqual(my_group, my_group3)
+
+        my_group4, _ = new_subgroups(group_size=2)
+        self.assertNotEqual(my_group, my_group4)
+
+        my_group5, _ = new_subgroups(group_size=2)
+        self.assertNotEqual(my_group4, my_group5)
+
+    def test_pg_lookup_roundtrip(self):
+        pg_tag0, _ = new_subgroups(group_size=2, pg_tag="blu")
+        pg_tag1, _ = new_subgroups(group_size=2, pg_tag="blu2")
+        pg_notag0, _ = new_subgroups(group_size=2)
+        pg_notag1, _ = new_subgroups(group_size=2)
+
+        def roundtrip(pg):
+            tag, rankset, _ = ft_c._expand_group(pg)
+            return c10d._find_pg_by_ranks_and_tag(tag, rankset)
+
+        self.assertEqual(pg_tag0, roundtrip(pg_tag0))
+        self.assertEqual(pg_tag1, roundtrip(pg_tag1))
+        self.assertEqual(pg_notag0, roundtrip(pg_notag0))
+        self.assertEqual(pg_notag1, roundtrip(pg_notag1))
+
+    def test_pg_lookup_with_tag(self):
+        pg_tag0, _ = new_subgroups(group_size=2, pg_tag="blu")
+        pg_tag1, _ = new_subgroups(group_size=2, pg_tag="bla")
+        pg_notag0, _ = new_subgroups(group_size=2)
+
+        def roundtrip(pg, pg_tag):
+            tag, rankset, _ = ft_c._expand_group(pg, pg_tag)
+            return c10d._find_pg_by_ranks_and_tag(tag, rankset)
+
+        self.assertEqual(pg_tag0, roundtrip(pg_tag1, "blu"))
+        self.assertEqual(pg_tag0, roundtrip(pg_notag0, "blu"))
+        # Cannot erase the tag of a PG
+        self.assertEqual(pg_tag0, roundtrip(pg_tag0, ""))
+
+    def test_find_or_create_pg(self):
+        pg = c10d._find_or_create_pg_by_ranks_and_tag("blu", [0, 1, 2, 3], 2)
+        pg_tag0, _ = new_subgroups(group_size=2, pg_tag="blu")
+        self.assertEqual(pg, pg_tag0)
+
+    def test_find_root_pg(self):
+        pg = c10d._find_pg_by_ranks_and_tag("", [0, 1, 2, 3])
+        self.assertEqual(dist.group.WORLD, pg)
+
+class TestTraceableCollectives(MultiThreadedTestCase):
+    @property
+    def world_size(self):
+        return 4
+
+    def setUp(self):
+        super().setUp()
+        self._spawn_threads()
+
+    def test_all_reduce_eager(self):
+        tensor = torch.ones([4])
+        mesh = dt.DeviceMesh("cpu", torch.arange(4))
+
+        res = ft_c.all_reduce(tensor, "sum", mesh)
+        self.assertEqual(res, torch.tensor([4, 4, 4, 4], dtype=torch.float))
+
+        mesh = dt.DeviceMesh("cpu", torch.arange(4).view(2, 2))
+        res2 = ft_c.all_reduce(tensor, "sum", (mesh, 1))
+        self.assertEqual(res2, torch.tensor([2, 2, 2, 2], dtype=torch.float))
+
+class TestMetaCollectives(TestCase):
+    def test_all_reduce(self):
+        x = torch.rand((2, 3, 4), device="meta")
+        out = ft_c.all_reduce(x, "sum", [1])
+        self.assertEqual(x.size(), out.size())
+
+class TestGradCollectives(MultiThreadedTestCase):
+    @property
+    def world_size(self):
+        return 2
+
+    def setUp(self):
+        super().setUp()
+        self._spawn_threads()
+
+    def test_all_reduce(self):
+        x = torch.rand([4], requires_grad=True)
+        y = torch.rand([4], requires_grad=True)
+        out = ft_c.all_reduce(x, "sum", [0, 1])
+        (out + y).sum().backward()
+        self.assertIsNone(x.grad)
+
+class TestMakeFx(MultiThreadedTestCase):
+    @property
+    def world_size(self):
+        return 2
+
+    def setUp(self):
+        super().setUp()
+        self._spawn_threads()
+
+    def test_all_reduce_tracing(self):
+        def allred(input):
+            return ft_c.all_reduce(input, "sum", group=[0, 1]) + 1
+
+        graph = make_fx(allred)(torch.rand(4))
+        nodes = list(graph.graph.nodes)
+
+        self.assertEqual("aten::all_reduce", nodes[1].target.name())
+        self.assertEqual("aten::wait_tensor", nodes[2].target.name())
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/test_launcher.py b/test/distributed/test_launcher.py
index 154a04faa5fe..178d98ffdc9b 100644
--- a/test/distributed/test_launcher.py
+++ b/test/distributed/test_launcher.py
@@ -40,13 +40,13 @@ def test_launch_user_script(self):
             master_port = sock.getsockname()[1]
         args = [
             f"--nnodes={nnodes}",
-            f"--nproc_per_node={nproc_per_node}",
-            "--monitor_interval=1",
-            "--start_method=spawn",
-            "--master_addr=localhost",
-            f"--master_port={master_port}",
-            "--node_rank=0",
-            "--use_env",
+            f"--nproc-per-node={nproc_per_node}",
+            "--monitor-interval=1",
+            "--start-method=spawn",
+            "--master-addr=localhost",
+            f"--master-port={master_port}",
+            "--node-rank=0",
+            "--use-env",
             path("bin/test_script.py"),
         ]
         launch.main(args)
diff --git a/test/distributed/test_nccl.py b/test/distributed/test_nccl.py
index dae3be152970..aca21f0f4cd5 100644
--- a/test/distributed/test_nccl.py
+++ b/test/distributed/test_nccl.py
@@ -10,7 +10,7 @@
                                                   IS_WINDOWS, load_tests,
                                                   TEST_WITH_ROCM,
                                                   sandcastle_skip_if)
-from torch.testing._internal.common_cuda import CUDA11OrLater, TEST_CUDA, TEST_MULTIGPU
+from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU
 from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes
 import re
 HIP_VERSION = 0.0 if torch.version.hip is None else float(re.search(r"^\d+\.\d+", torch.version.hip)[0])
@@ -26,7 +26,7 @@
 
 
 datatypes = [torch.float]
-if (TEST_CUDA and CUDA11OrLater and c10d.is_nccl_available() and nccl.version() >= (2, 10)) or TEST_WITH_ROCM:
+if (TEST_CUDA and c10d.is_nccl_available() and nccl.version() >= (2, 10)) or TEST_WITH_ROCM:
     datatypes.append(torch.bfloat16)
 
 class TestNCCL(TestCase):
diff --git a/test/distributed/test_pg_wrapper.py b/test/distributed/test_pg_wrapper.py
index c9bafe0dd862..8bb176dd7e97 100644
--- a/test/distributed/test_pg_wrapper.py
+++ b/test/distributed/test_pg_wrapper.py
@@ -28,7 +28,7 @@
 
 class AbstractProcessGroupWrapperTest(MultiProcessTestCase):
     def setUp(self):
-        super(AbstractProcessGroupWrapperTest, self).setUp()
+        super().setUp()
         self._spawn_processes()
 
     def _validate_error(self, exception, op_type, rank, tensor):
@@ -335,9 +335,6 @@ def _test_nccl_only_shape_mismatch(self, wrapper_pg):
 
 @requires_gloo()
 class ProcessGroupGlooWrapperTest(AbstractProcessGroupWrapperTest):
-    def setUp(self):
-        super(ProcessGroupGlooWrapperTest, self).setUp()
-
     def opts(self, threads=2, timeout=10.0):
         opts = c10d.ProcessGroupGloo._Options()
         opts._timeout = timeout
diff --git a/test/distributed/test_store.py b/test/distributed/test_store.py
index f5475c3c1aa6..bd26fcadb92d 100644
--- a/test/distributed/test_store.py
+++ b/test/distributed/test_store.py
@@ -16,7 +16,6 @@
     sys.exit(0)
 
 import torch.testing._internal.common_utils as common
-from torch._six import string_classes
 from torch.testing._internal.common_distributed import (
     skip_if_win32,
     create_tcp_store
@@ -60,7 +59,7 @@ def gpus_for_rank(world_size):
     return gpus_for_rank
 
 
-class StoreTestBase(object):
+class StoreTestBase:
     def _create_store(self, i):
         raise RuntimeError("not implemented")
 
@@ -122,7 +121,7 @@ def num_keys_total(self):
 
 class FileStoreTest(TestCase, StoreTestBase):
     def setUp(self):
-        super(FileStoreTest, self).setUp()
+        super().setUp()
         self.file = tempfile.NamedTemporaryFile(delete=False)
 
     def _create_store(self):
@@ -162,9 +161,6 @@ def num_keys_total(self):
 
 @skip_if_win32()
 class HashStoreTest(TestCase, StoreTestBase):
-    def setUp(self):
-        super(HashStoreTest, self).setUp()
-
     def _create_store(self):
         store = dist.HashStore()
         store.set_timeout(timedelta(seconds=300))
@@ -186,7 +182,7 @@ def test_get_underlying_store(self):
 
 class PrefixFileStoreTest(TestCase, StoreTestBase):
     def setUp(self):
-        super(PrefixFileStoreTest, self).setUp()
+        super().setUp()
         self.file = tempfile.NamedTemporaryFile(delete=False)
         self.filestore = dist.FileStore(self.file.name, 1)
         self.prefix = "test_prefix"
@@ -317,7 +313,7 @@ def test_multi_worker_with_nonfixed_world_size(self):
 
 class PrefixTCPStoreTest(TestCase, StoreTestBase):
     def setUp(self):
-        super(PrefixTCPStoreTest, self).setUp()
+        super().setUp()
         self.tcpstore = create_tcp_store()
         self.prefix = "test_prefix"
         self.tcpstore.set_timeout(timedelta(seconds=300))
@@ -335,11 +331,11 @@ def num_keys_total(self):
 
 class MyPythonStore(dist.Store):
     def __init__(self):
-        super(MyPythonStore, self).__init__()
+        super().__init__()
         self.store = {}
 
     def set(self, key, value):
-        if not isinstance(key, string_classes):
+        if not isinstance(key, str):
             raise AssertionError("Expected set to be called with string key")
         if type(value) is not bytes:
             raise AssertionError("Expected set to be called with bytes value")
@@ -358,9 +354,6 @@ def add(self, key, value):
 
 
 class PythonStoreTest(TestCase):
-    def setUp(self):
-        super(PythonStoreTest, self).setUp()
-
     def test_set_get(self):
         # If we were to inherit from StoreTestBase and try to use
         # its test_set_get function, we would exercise the Python
diff --git a/test/distributed/test_traceable_collectives.py b/test/distributed/test_traceable_collectives.py
new file mode 100644
index 000000000000..0070b5c034dc
--- /dev/null
+++ b/test/distributed/test_traceable_collectives.py
@@ -0,0 +1,271 @@
+# Owner(s): ["module: dynamo"]
+import functools
+import unittest
+from unittest.mock import patch
+import torch
+from torch._C import FileCheck
+from torch._dispatch.python import enable_python_dispatcher
+import torch._dynamo
+import torch._dynamo.test_case
+from torch._dynamo.utils import same
+from torch._dynamo.testing import CompileCounter
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch.testing._internal.common_distributed import (
+    DynamoDistributedSingleProcTestCase,
+    DynamoDistributedMultiProcTestCase,
+    _dynamo_dist_per_rank_init,
+    requires_nccl,
+    skip_if_lt_x_gpu
+)
+from torch._inductor.compile_fx import compile_fx as inductor_compile_fx
+from torch._inductor.utils import has_triton, run_and_get_triton_code
+import torch._dynamo.logging
+
+# LOL if you don't remember to import this, then the op isn't registered and it hits
+# the no-op C++ kernel that i am forced to implement despite not using it
+import torch.distributed._functional_collectives
+
+
+@requires_nccl()
+class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase):
+    """
+    Run correctness checks in multi-proc runner, mark with minimum # GPUs to run under
+    """
+    def get_world_trs(self):
+        return {
+            "tag": "",
+            "ranks": list(range(self.world_size)),
+            "group_size": self.world_size,
+        }
+
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
+    @skip_if_lt_x_gpu(2)
+    # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
+    @patch.object(torch._inductor.config, "compile_threads", 1)
+    def test_allreduce_inductor(self):
+        """
+        This is matmul/cat/allreduce is a pattern we aim to optimize.
+        """
+
+        def matmul_cat_col(a, b, c, d, e, f, *, tag, ranks, group_size):
+            x = torch.matmul(a, b)
+            y = torch.matmul(c, d)
+            z = torch.cat((x, y))
+            ar = torch.ops.aten.all_reduce(z, "sum", tag, ranks, group_size)
+            g = torch.matmul(e, f)
+            ar = torch.ops.aten.wait_tensor(ar)
+            out = torch.add(ar, g.repeat(2, 1))
+            return (out, )
+
+        def compile(func, example_inputs):
+            graph = make_fx(func)(*example_inputs)
+            return inductor_compile_fx(graph, example_inputs)
+
+        with _dynamo_dist_per_rank_init(self.rank, self.world_size):
+
+            matmul_cat_col = functools.partial(
+                matmul_cat_col,
+                **self.get_world_trs(),
+            )
+            inputs = (torch.ones(4, 4, device="cuda") + self.rank,) * 6
+
+            # non-ideally, i seem to need to enable this at user level in order to construct a torchdispatch subclass
+            # inside py registered collective ops
+            with enable_python_dispatcher():
+                eager_out = matmul_cat_col(*inputs)
+                compiled_matmul_cat_col = compile(matmul_cat_col, inputs)
+                inductor_out = compiled_matmul_cat_col(*inputs)
+                assert same(eager_out, inductor_out, tol=0.001)
+
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
+    @skip_if_lt_x_gpu(2)
+    # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
+    @patch.object(torch._inductor.config, "compile_threads", 1)
+    def test_allgather_into_tensor_inductor(self):
+        """
+        This is matmul/cat/allreduce is a pattern we aim to optimize.
+        """
+
+        def example(a, b, *, tag, ranks, group_size):
+            c = torch.matmul(a, b)
+            ag = torch.ops.aten.all_gather_into_tensor(c, tag, ranks, group_size)
+            ag = torch.ops.aten.wait_tensor(ag)
+            return (ag, )
+
+        def compile(func, example_inputs):
+            graph = make_fx(func)(*example_inputs)
+            return inductor_compile_fx(graph, example_inputs)
+
+        with _dynamo_dist_per_rank_init(self.rank, self.world_size):
+
+            example = functools.partial(
+                example,
+                **self.get_world_trs(),
+            )
+            inputs = (torch.ones(4, 4, device="cuda") + self.rank,) * 2
+
+            # non-ideally, i seem to need to enable this at user level in order to construct a torchdispatch subclass
+            # inside py registered collective ops
+            with enable_python_dispatcher():
+                eager_out = example(*inputs)
+                compiled_matmul_cat_col = compile(example, inputs)
+                inductor_out = compiled_matmul_cat_col(*inputs)
+                assert same(eager_out, inductor_out, tol=0.001)
+
+
+@requires_nccl()
+class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
+    """
+    Prefer single-proc test runner for basic tests as it is easier to work with.
+    """
+    def get_world_trs(self, world_size=1):
+        return {
+            "tag": "",
+            "ranks": list(range(world_size)),
+            "group_size": world_size,
+        }
+
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
+    def test_inductor_single_op(self):
+        torch._inductor.config.debug = True
+
+        def func(inp, *, tag, ranks, group_size):
+            ar = torch.ops.aten.all_reduce(inp, "sum", tag, ranks, group_size)
+            ar = torch.ops.aten.wait_tensor(ar)
+            return ar
+
+        inputs = torch.ones(4, 4, device="cuda")
+
+        with enable_python_dispatcher():
+            compiled = torch.compile(func)
+            out = compiled(inputs, **self.get_world_trs())
+            code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
+            FileCheck() \
+                .check("buf0 = empty_strided") \
+                .check("buf0.copy_(arg0_1)") \
+                .check("buf0_work = dist.all_reduce(buf0") \
+                .check("buf0_work.wait()") \
+                .check("return (buf1, )") \
+                .run(code)
+            correct = func(inputs, **self.get_world_trs())
+            assert same(out, correct)
+
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
+    def test_inductor_steal_buffer(self):
+        """
+        it's ok and optimal if inductor allreduce mutates the buffer of an intermediate
+        that isn't going to be used again
+        """
+        torch._inductor.config.debug = True
+
+        def func(inp, *, tag, ranks, group_size):
+            x = inp + 1
+            ar = torch.ops.aten.all_reduce(x, "sum", tag, ranks, group_size)
+            ar = torch.ops.aten.wait_tensor(ar)
+            # ensure other is not incorrectly aliasing ar's buffer
+            other = torch.ones_like(inp) + 22
+            return ar, other
+
+        inputs = torch.ones(4, 4, device="cuda")
+
+        with enable_python_dispatcher():
+            compiled = torch.compile(func)
+            code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
+            FileCheck() \
+                .check("buf1 = buf0; del buf0  # reuse") \
+                .check_not("buf1.copy_(") \
+                .check("buf1_work = dist.all_reduce(buf1") \
+                .check("buf1_work.wait()") \
+                .check("buf2 = buf1") \
+                .check("buf3 = empty_strided") \
+                .check("return (buf2, buf3") \
+                .run(code)
+            out = compiled(inputs, **self.get_world_trs())
+            correct = func(inputs, **self.get_world_trs())
+            assert same(out, correct)
+
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
+    def test_inductor_doesnt_mutate_shared(self):
+        """
+        make sure that an intermediate that's going to be reuse isn't mutated unless copied
+        """
+        torch._inductor.config.debug = True
+
+        def func(inp, *, tag, ranks, group_size):
+            x = inp + 1
+            ar = torch.ops.aten.all_reduce(x, "sum", tag, ranks, group_size)
+            y = x + 2
+            ar = torch.ops.aten.wait_tensor(ar)
+            # ensure other is not incorrectly aliasing ar's buffer
+            other = torch.ones_like(inp) + 22
+            return ar, y, other
+
+        inputs = torch.ones(4, 4, device="cuda")
+
+        with enable_python_dispatcher():
+            compiled = torch.compile(func)
+            code = run_and_get_triton_code(compiled, inputs, **self.get_world_trs())
+            FileCheck() \
+                .check("buf0 = empty_strided(") \
+                .check("buf2 = empty_strided") \
+                .check("triton__0.run(arg0_1, buf0, buf2") \
+                .check_not("copy_(") \
+                .check("buf1 = buf0; del buf0  # reuse") \
+                .check("buf1_work = dist.all_reduce(buf1") \
+                .check("buf1_work.wait()") \
+                .check("buf3 = buf1") \
+                .check("return (buf3, buf2, buf4") \
+                .run(code)
+            out = compiled(inputs, **self.get_world_trs())
+            correct = func(inputs, **self.get_world_trs())
+            assert same(out, correct)
+
+    def test_dynamo_trace_allreduce(self):
+        def func(inp, *, tag, ranks, group_size):
+            ar = torch.ops.aten.all_reduce(inp, "sum", tag, ranks, group_size)
+            return ar
+
+        inputs = torch.ones(4, 4, device="cuda")
+        counter = CompileCounter()
+        with enable_python_dispatcher():
+            compiled = torch.compile(func, backend=counter)
+            out = compiled(inputs, **self.get_world_trs())
+            correct = func(inputs, **self.get_world_trs())
+            assert counter.frame_count == 1
+            assert counter.op_count == 1
+            assert same(out, correct)
+
+    def test_backwards(self):
+        """
+        It's probably not that common to need backwards support for collectives.
+
+        However, I wanted to at least see if it was possible to support it as a design goal.
+        """
+        def func(inp, *, tag, ranks, group_size):
+            ar = torch.ops.aten.all_reduce(inp, "sum", tag, ranks, group_size)
+            return ar
+
+        input = torch.ones(4, 4, device="cuda", requires_grad=True)
+        with enable_python_dispatcher():
+            # TODO implement backwards
+            with self.assertRaisesRegex(RuntimeError, "derivative for aten::all_reduce is not implemented"):
+                compiled = torch.compile(func, backend="aot_eager")  # inductor bug with single-op allreduce graph
+                out = compiled(input, **self.get_world_trs())
+                out.sum().backward()
+
+                correct_input = input.clone().detach().requires_grad_()
+                correct = func(correct_input, **self.get_world_trs())
+                correct.sum().backward()
+                assert same(out, correct)
+                assert same(input.grad, correct_input.grad)
+
+    def test_meta(self):
+        x = torch.rand((2, 3, 4), device="meta")
+        out = torch.ops.aten.all_reduce(x, "sum", **self.get_world_trs())
+        assert x.size() == out.size()
+
+
+if __name__ == "__main__":
+    from torch._dynamo.test_case import run_tests
+
+    run_tests()
diff --git a/test/distributions/test_constraints.py b/test/distributions/test_constraints.py
index 475d9f33ec9a..b733cbc021e1 100644
--- a/test/distributions/test_constraints.py
+++ b/test/distributions/test_constraints.py
@@ -5,6 +5,7 @@
 import torch
 from torch.distributions import biject_to, constraints, transform_to
 from torch.testing._internal.common_cuda import TEST_CUDA
+from torch.testing._internal.common_utils import run_tests
 
 
 EXAMPLES = [
@@ -124,5 +125,5 @@ def test_transform_to(constraint_fn, args, is_cuda):
     assert torch.allclose(y, y2), "Error in transform_to({}) pseudoinverse".format(constraint)
 
 
-if __name__ == '__main__':
-    pytest.main([__file__])
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py
index f7d8371e967b..db364296e3b7 100644
--- a/test/distributions/test_distributions.py
+++ b/test/distributions/test_distributions.py
@@ -42,7 +42,7 @@
 # Distributions tests use double as the default dtype
 torch.set_default_dtype(torch.double)
 
-from torch._six import inf, nan
+from torch import inf, nan
 from torch.testing._internal.common_utils import \
     (TestCase, run_tests, set_rng_seed, TEST_WITH_UBSAN, load_tests,
      gradcheck, skipIfTorchDynamo)
@@ -797,7 +797,7 @@ class DistributionsTestCase(TestCase):
     def setUp(self):
         """The tests assume that the validation flag is set."""
         torch.distributions.Distribution.set_default_validate_args(True)
-        super(DistributionsTestCase, self).setUp()
+        super().setUp()
 
 
 @skipIfTorchDynamo("Not a TorchDynamo suitable test")
@@ -943,7 +943,7 @@ def test_enumerate_support_type(self):
     def test_lazy_property_grad(self):
         x = torch.randn(1, requires_grad=True)
 
-        class Dummy(object):
+        class Dummy:
             @lazy_property
             def y(self):
                 return x + 1
@@ -1466,7 +1466,7 @@ def test_relaxed_bernoulli(self):
     def test_rounded_relaxed_bernoulli(self):
         set_rng_seed(0)  # see Note [Randomized statistical tests]
 
-        class Rounded(object):
+        class Rounded:
             def __init__(self, dist):
                 self.dist = dist
 
@@ -1513,7 +1513,7 @@ def test_relaxed_one_hot_categorical_2d(self):
     def test_argmax_relaxed_categorical(self):
         set_rng_seed(0)  # see Note [Randomized statistical tests]
 
-        class ArgMax(object):
+        class ArgMax:
             def __init__(self, dist):
                 self.dist = dist
 
@@ -1522,7 +1522,7 @@ def sample(self, *args, **kwargs):
                 _, idx = torch.max(s, -1)
                 return idx
 
-        class ScipyCategorical(object):
+        class ScipyCategorical:
             def __init__(self, dist):
                 self.dist = dist
 
@@ -1882,7 +1882,7 @@ def test_mixture_same_family_sample(self):
         loc = torch.randn(5)
         scale = torch.rand(5)
 
-        class ScipyMixtureNormal(object):
+        class ScipyMixtureNormal:
             def __init__(self, probs, mu, std):
                 self.probs = probs
                 self.mu = mu
@@ -3466,14 +3466,11 @@ def compute_v(x, alpha):
 
 class TestDistributionShapes(DistributionsTestCase):
     def setUp(self):
-        super(TestDistributionShapes, self).setUp()
+        super().setUp()
         self.scalar_sample = 1
         self.tensor_sample_1 = torch.ones(3, 2)
         self.tensor_sample_2 = torch.ones(3, 2, 3)
 
-    def tearDown(self):
-        super(TestDistributionShapes, self).tearDown()
-
     def test_entropy_shape(self):
         for Dist, params in EXAMPLES:
             for i, param in enumerate(params):
@@ -3930,11 +3927,11 @@ def test_continuous_bernoulli_shape_tensor_params(self):
 class TestKL(DistributionsTestCase):
 
     def setUp(self):
-        super(TestKL, self).setUp()
+        super().setUp()
 
         class Binomial30(Binomial):
             def __init__(self, probs):
-                super(Binomial30, self).__init__(30, probs)
+                super().__init__(30, probs)
 
         # These are pairs of distributions with 4 x 4 parameters as specified.
         # The first of the pair e.g. bernoulli[0] varies column-wise and the second
@@ -4593,7 +4590,7 @@ def test_continuous_bernoulli_with_logits_overflow(self):
 # TODO: make this a pytest parameterized test
 class TestLazyLogitsInitialization(DistributionsTestCase):
     def setUp(self):
-        super(TestLazyLogitsInitialization, self).setUp()
+        super().setUp()
         # ContinuousBernoulli is not tested because log_prob is not computed simply
         # from 'logits', but 'probs' is also needed
         self.examples = [e for e in EXAMPLES if e.Dist in
@@ -4640,7 +4637,7 @@ def test_lazy_probs_initialization(self):
 @unittest.skipIf(not TEST_NUMPY, "NumPy not found")
 class TestAgainstScipy(DistributionsTestCase):
     def setUp(self):
-        super(TestAgainstScipy, self).setUp()
+        super().setUp()
         positive_var = torch.randn(20).exp()
         positive_var2 = torch.randn(20).exp()
         random_var = torch.randn(20)
@@ -4931,9 +4928,6 @@ def test_stack_transform(self):
 
 
 class TestValidation(DistributionsTestCase):
-    def setUp(self):
-        super(TestValidation, self).setUp()
-
     def test_valid(self):
         for Dist, params in EXAMPLES:
             for param in params:
@@ -5021,9 +5015,6 @@ def log_prob(self, value):
         with self.assertWarns(UserWarning):
             d.log_prob(sample)
 
-    def tearDown(self):
-        super(TestValidation, self).tearDown()
-
 
 class TestJit(DistributionsTestCase):
     def _examples(self):
@@ -5039,7 +5030,7 @@ def _examples(self):
     def _perturb_tensor(self, value, constraint):
         if isinstance(constraint, constraints._IntegerGreaterThan):
             return value + 1
-        if isinstance(constraint, constraints._PositiveDefinite) or isinstance(constraint, constraints._PositiveSemidefinite):
+        if isinstance(constraint, (constraints._PositiveDefinite, constraints._PositiveSemidefinite)):
             return value + torch.eye(value.shape[-1])
         if value.dtype in [torch.float, torch.double]:
             transform = transform_to(constraint)
diff --git a/test/distributions/test_transforms.py b/test/distributions/test_transforms.py
index d922c8367228..a4a025b83fd3 100644
--- a/test/distributions/test_transforms.py
+++ b/test/distributions/test_transforms.py
@@ -17,6 +17,7 @@
                                             identity_transform, Transform, _InverseTransform,
                                             PositiveDefiniteTransform)
 from torch.distributions.utils import tril_matrix_to_vec, vec_to_tril_matrix
+from torch.testing._internal.common_utils import run_tests
 
 
 def get_transforms(cache_size):
@@ -494,5 +495,5 @@ def test_save_load_transform():
     assert torch.allclose(log_prob, other.log_prob(x))
 
 
-if __name__ == '__main__':
-    pytest.main([__file__])
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributions/test_utils.py b/test/distributions/test_utils.py
index be2973760cc8..3855b7f15d63 100644
--- a/test/distributions/test_utils.py
+++ b/test/distributions/test_utils.py
@@ -4,7 +4,7 @@
 
 import torch
 from torch.distributions.utils import tril_matrix_to_vec, vec_to_tril_matrix
-
+from torch.testing._internal.common_utils import run_tests
 
 @pytest.mark.parametrize('shape', [
     (2, 2),
@@ -22,5 +22,5 @@ def test_tril_matrix_to_vec(shape):
         assert torch.allclose(tril_mat, actual)
 
 
-if __name__ == '__main__':
-    pytest.main([__file__])
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/dynamo/test_aot_autograd.py b/test/dynamo/test_aot_autograd.py
index 3a0bbca3536c..f5476f1e128c 100644
--- a/test/dynamo/test_aot_autograd.py
+++ b/test/dynamo/test_aot_autograd.py
@@ -122,7 +122,7 @@ def fn(x, y):
     def test_call_fn_with_non_const_inputs_aot_safe(self):
         class ModuleSpecialFwd(torch.nn.Module):
             def __init__(self):
-                super(ModuleSpecialFwd, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(
                     in_channels=3, out_channels=20, kernel_size=(5, 5)
                 )
@@ -151,9 +151,6 @@ def forward(self, x):
 
     def test_call_fn_with_non_const_inputs_aot_unsafe(self):
         class ModuleSpecialFwd(torch.nn.Module):
-            def __init__(self):
-                super(ModuleSpecialFwd, self).__init__()
-
             def _some_bad_fwd(self, param, y):
                 prev_grad = torch.is_grad_enabled()
                 try:
@@ -190,9 +187,6 @@ def forward(self, x, y):
 
     def test_call_fn_with_non_const_inputs_aot_unsafe_control_flow(self):
         class ModuleSpecialFwd(torch.nn.Module):
-            def __init__(self):
-                super(ModuleSpecialFwd, self).__init__()
-
             def _some_bad_fwd(self, param, y):
                 if y[0][0] < 3:
                     return y + param
@@ -314,7 +308,6 @@ def guard_fail_fn(failure):
     def test_double_backward_errors(self):
         # Remove this test after we get double backward to actually work
         for grad_output in (torch.tensor(1.0, requires_grad=True), None):
-            # See @once_differentiable docs for why there are two different errors
             x = torch.tensor(1.0, requires_grad=True)
             err = "torch.compile with aot_autograd does not currently support double backward"
 
@@ -326,7 +319,7 @@ def f1(x):
                 (gx,) = torch.autograd.grad(
                     y, x, create_graph=True, grad_outputs=grad_output
                 )
-                gx.backward()
+                torch.autograd.grad(gx, x)
                 return gx
 
             compiled_f1 = torch.compile(backend="aot_eager")(f1)
@@ -345,7 +338,7 @@ def f2(x):
             compiled_f2 = torch.compile(backend="aot_eager")(f2)
             gx = compiled_f2(x)
             with self.assertRaisesRegex(RuntimeError, err):
-                gx.backward()
+                torch.autograd.grad(gx, x)
 
             # (3) double backward entirely outside compiled function
             def f3(x):
@@ -358,7 +351,7 @@ def f3(x):
                 y, x, create_graph=True, grad_outputs=grad_output
             )
             with self.assertRaisesRegex(RuntimeError, err):
-                gx.backward()
+                torch.autograd.grad(gx, x)
 
         # create_graph=False
         def f4(x):
@@ -444,6 +437,193 @@ def guard_fail_fn(failure):
 
         torch._dynamo.reset()
 
+    @patch("torch._functorch.config.debug_assert", True)
+    def test_arg_dupe_via_dynamo_recompiles_many_args_param_non_tensor_arg(self):
+        class F(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mean = torch.nn.Parameter(torch.randn(3, 3))
+
+            def forward(self, a, b, c, d, e, f):
+                a.t_()
+                b.t_()
+                c.t_()
+                d.t_()
+                return (a + b + c + d + self.mean) * e * f
+
+        a = torch.randn(3, 3, requires_grad=True)
+        b = torch.randn(3, 3, requires_grad=True)
+        a1, a2, a3, a4 = a.clone(), a.clone(), a.clone(), a.clone()
+        b1, b2, b3, b4 = b.clone(), b.clone(), b.clone(), b.clone()
+
+        failure_reason = None
+
+        def guard_fail_fn(failure):
+            nonlocal failure_reason
+            failure_reason = failure[0]
+
+        self.assertTrue(failure_reason is None)
+
+        cc = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
+
+        f = torch._dynamo.optimize(cc, guard_fail_fn=guard_fail_fn)(F())
+        f(a1, a1, a1, a1, 2, 2)
+        f(a2, b2, b2, b2, 2, 2)
+        self.assertEqual(cc.frame_count, 2)
+        self.assertEqual(failure_reason, "a is b")
+
+        torch._dynamo.reset()
+
+        cc = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
+
+        c = torch.randn(3, 3, requires_grad=True)
+        d = torch.randn(3, 3, requires_grad=True)
+        c3, c4 = c.clone(), c.clone()
+        d3, d4 = d.clone(), d.clone()
+
+        f = torch._dynamo.optimize(cc, guard_fail_fn=guard_fail_fn)(F())
+        f(a3, b3, c3, c3, 3, 3)
+        f(a4, b4, c4, d4, 3, 3)
+        self.assertEqual(cc.frame_count, 2)
+        self.assertEqual(failure_reason, "c is d")
+
+    @patch("torch._functorch.config.debug_assert", True)
+    def test_arg_dupe_via_dynamo_recompiles_many_with_global(self):
+        z = None
+
+        class F(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mean = torch.nn.Parameter(torch.randn(3, 3))
+
+            def forward(self, a, b, c, d, e, f):
+                a.t_()
+                b.t_()
+                c.t_()
+                d.t_()
+                return (a + b + c + d + z + self.mean) * e * f
+
+        a = torch.randn(3, 3, requires_grad=True)
+        b = torch.randn(3, 3, requires_grad=True)
+        z = a
+        a1, a2, a3, a4 = a.clone(), a.clone(), a.clone(), a.clone()
+        b1, b2, b3, b4 = b.clone(), b.clone(), b.clone(), b.clone()
+
+        failure_reason = None
+
+        def guard_fail_fn(failure):
+            nonlocal failure_reason
+            failure_reason = failure[0]
+
+        self.assertTrue(failure_reason is None)
+
+        cc = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
+
+        f = torch._dynamo.optimize(cc, guard_fail_fn=guard_fail_fn)(F())
+        f(a1, a1, a1, a1, 2, 2)
+        f(a2, b2, b2, b2, 2, 2)
+        self.assertEqual(cc.frame_count, 2)
+        self.assertEqual(failure_reason, "a is b")
+
+    @patch("torch._functorch.config.debug_assert", True)
+    def test_arg_dupe_via_dynamo_recompiles_many_args_param_non_tensor_arg_list(self):
+        class F(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mean = torch.nn.Parameter(torch.randn(3, 3))
+
+            def forward(self, e, f, a, b, c, d):
+                a.t_()
+                b.t_()
+                c.t_()
+                d.t_()
+                return (a + b + c + d + self.mean) * e[0] * f[0]
+
+        a = torch.randn(3, 3, requires_grad=True)
+        b = torch.randn(3, 3, requires_grad=True)
+        a1, a2, a3, a4 = a.clone(), a.clone(), a.clone(), a.clone()
+        b1, b2, b3, b4 = b.clone(), b.clone(), b.clone(), b.clone()
+
+        failure_reason = None
+
+        def guard_fail_fn(failure):
+            nonlocal failure_reason
+            failure_reason = failure[0]
+
+        self.assertTrue(failure_reason is None)
+
+        cc = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
+
+        f = torch._dynamo.optimize(cc, guard_fail_fn=guard_fail_fn)(F())
+        f([3, 2, 1], [4, 5, 6], a1, a1, a1, a1)
+        f([3, 2, 1], [4, 5, 6], a2, b2, b2, b2)
+        self.assertEqual(cc.frame_count, 2)
+        self.assertEqual(failure_reason, "a is b")
+
+        torch._dynamo.reset()
+
+        cc = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
+
+        c = torch.randn(3, 3, requires_grad=True)
+        d = torch.randn(3, 3, requires_grad=True)
+        c3, c4 = c.clone(), c.clone()
+        d3, d4 = d.clone(), d.clone()
+
+        f = torch._dynamo.optimize(cc, guard_fail_fn=guard_fail_fn)(F())
+        f([3, 2, 1], [4, 5, 6], a3, b3, c3, c3)
+        f([3, 2, 1], [4, 5, 6], a4, b4, c4, d4)
+        self.assertEqual(cc.frame_count, 2)
+
+    @patch("torch._functorch.config.debug_assert", True)
+    def test_arg_dupe_via_dynamo_recompiles_many_args_param(self):
+        class F(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mean = torch.nn.Parameter(torch.randn(3, 3))
+
+            def forward(self, a, b, c, d):
+                a.t_()
+                b.t_()
+                c.t_()
+                d.t_()
+                return a + b + c + d + self.mean
+
+        a = torch.randn(3, 3, requires_grad=True)
+        b = torch.randn(3, 3, requires_grad=True)
+        a1, a2, a3, a4 = a.clone(), a.clone(), a.clone(), a.clone()
+        b1, b2, b3, b4 = b.clone(), b.clone(), b.clone(), b.clone()
+
+        failure_reason = None
+
+        def guard_fail_fn(failure):
+            nonlocal failure_reason
+            failure_reason = failure[0]
+
+        self.assertTrue(failure_reason is None)
+
+        cc = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
+
+        f = torch._dynamo.optimize(cc, guard_fail_fn=guard_fail_fn)(F())
+        f(a1, a1, a1, a1)
+        f(a2, b2, b2, b2)
+        self.assertEqual(cc.frame_count, 2)
+        self.assertEqual(failure_reason, "a is b")
+
+        torch._dynamo.reset()
+
+        cc = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
+
+        c = torch.randn(3, 3, requires_grad=True)
+        d = torch.randn(3, 3, requires_grad=True)
+        c3, c4 = c.clone(), c.clone()
+        d3, d4 = d.clone(), d.clone()
+
+        f = torch._dynamo.optimize(cc, guard_fail_fn=guard_fail_fn)(F())
+        f(a3, b3, c3, c3)
+        f(a4, b4, c4, d4)
+        self.assertEqual(cc.frame_count, 2)
+        self.assertEqual(failure_reason, "c is d")
+
     @patch("torch._functorch.config.debug_assert", True)
     def test_arg_dupe_via_dynamo_recompiles_many_args(self):
         class F(torch.nn.Module):
@@ -490,6 +670,71 @@ def guard_fail_fn(failure):
         self.assertEqual(cc.frame_count, 2)
         self.assertEqual(failure_reason, "c is d")
 
+    @patch("torch._functorch.config.debug_assert", True)
+    def test_multiple_aot_autograd_calls_dupe_args(self):
+        def maybe_dupe_op(x):
+            y = x + 1
+            z = x + 2
+            if x.numel() < 5:
+                return y, y
+            else:
+                return y, z
+
+        aten = torch.ops.aten
+        lib = torch.library.Library("custom", "DEF")
+        lib.define("maybe_dupe_op(Tensor a) -> (Tensor, Tensor)")
+        lib.impl("maybe_dupe_op", maybe_dupe_op, "CPU")
+        lib.impl("maybe_dupe_op", maybe_dupe_op, "Meta")
+
+        # this is just dealing with the fact that
+        # aot_module_simplified expects submods to always return tuples/lists
+        class WrapperModule(torch.nn.Module):
+            def __init__(self, mod):
+                super().__init__()
+                self.mod = mod
+
+            def forward(self, *args):
+                out = self.mod(*args)
+                if isinstance(out, (list, tuple)):
+                    return out
+                return (out,)
+
+        def compile_submod(input_mod, args):
+            from functorch.compile import nop
+            from torch._functorch.aot_autograd import aot_module_simplified
+
+            class WrapperModule(torch.nn.Module):
+                def __init__(self):
+                    super().__init__()
+                    self.original = input_mod
+                    self.submod = aot_module_simplified(input_mod, args, nop)
+
+                def forward(self, *args):
+                    return self.submod(*args)
+
+            return WrapperModule()
+
+        def test_compile(fx_g, example_inps):
+            split_gm = torch.fx.passes.split_module.split_module(
+                fx_g, None, lambda node: 1 if "mul" in str(node) else 0
+            )
+            submod_1_inps = split_gm.submod_0(*example_inps)
+            split_gm.submod_0 = compile_submod(
+                WrapperModule(split_gm.submod_0), example_inps
+            )
+            split_gm.submod_1 = compile_submod(
+                WrapperModule(split_gm.submod_1), submod_1_inps
+            )
+            return split_gm
+
+        @torch._dynamo.optimize(test_compile)
+        def f(a):
+            b, c = torch.ops.custom.maybe_dupe_op(a)
+            return (b.mul_(c),)
+
+        f(torch.ones(4))
+        f(torch.ones(6))
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/test/dynamo/test_optimizations.py b/test/dynamo/test_backends.py
similarity index 55%
rename from test/dynamo/test_optimizations.py
rename to test/dynamo/test_backends.py
index 1049b9bc1ec4..0749bac9f8ad 100644
--- a/test/dynamo/test_optimizations.py
+++ b/test/dynamo/test_backends.py
@@ -1,41 +1,19 @@
 # Owner(s): ["module: dynamo"]
-import importlib
-import json
-import os
+import functools
 import unittest
 
 import torch
 
 import torch._dynamo
+import torch._dynamo.backends.ipex
 import torch._dynamo.test_case
-from torch._dynamo.optimizations import backends
-from torch._dynamo.optimizations.log_args import conv_args_analysis
-from torch._dynamo.optimizations.normalize import Inplacifier, normalize
+from torch._dynamo.backends.ipex import has_ipex
+from torch._dynamo.backends.onnxrt import has_onnxruntime
+from torch._dynamo.backends.tvm import has_tvm
 from torch._dynamo.testing import same
+from torch.testing._internal.inductor_utils import HAS_CUDA
 
-
-def has_onnxruntime():
-    try:
-        importlib.import_module("onnxruntime")
-        return True
-    except ImportError:
-        return False
-
-
-def has_ipex():
-    try:
-        importlib.import_module("intel_extension_for_pytorch")
-        return True
-    except ImportError:
-        return False
-
-
-def has_functorch():
-    try:
-        importlib.import_module("functorch")
-        return True
-    except ImportError:
-        return False
+requires_cuda = functools.partial(unittest.skipIf, not HAS_CUDA, "requires cuda")
 
 
 class Seq(torch.nn.Module):
@@ -54,7 +32,7 @@ def forward(self, x):
 
 class Conv_Bn_Relu(torch.nn.Module):
     def __init__(self, in_channels, out_channels, **kwargs):
-        super(Conv_Bn_Relu, self).__init__()
+        super().__init__()
         self.conv = torch.nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
         self.bn = torch.nn.BatchNorm2d(out_channels, eps=0.001)
         self.relu = torch.nn.ReLU()
@@ -64,15 +42,6 @@ def forward(self, x):
 
 
 class TestOptimizations(torch._dynamo.test_case.TestCase):
-    def test_inplacifier(self):
-        gm = torch.fx.symbolic_trace(Seq())
-        normalize(gm)
-        Inplacifier(gm).inplacify()
-        gm.recompile()
-        code = gm.code.replace(" ", "")
-        self.assertIn("inplace=True", code)
-        self.assertIn("out=linear_1", code)
-
     def test_example_inputs(self):
         def fn(a, bc, d):
             b, c = bc
@@ -128,37 +97,6 @@ def fwd(*args):
         self.assertTrue(same(r1, r2))
         self.assertTrue(same(r1, r3))
 
-    @unittest.skipIf(not has_functorch(), "requires functorch")
-    def test_log_conv_args(self):
-        model = Conv_Bn_Relu(3, 32, kernel_size=3, stride=1)
-        model = model.to(memory_format=torch.channels_last)
-        model = model.eval()
-        input = torch.randn(8, 3, 64, 64).contiguous(memory_format=torch.channels_last)
-        r1 = model(input)
-        # check tmp/conv_args.json exists and has keys as arg names
-        filename = "tmp/conv_args.json"
-        if os.path.exists(filename):
-            os.remove(filename)
-        opt_model = torch._dynamo.optimize(conv_args_analysis)(model)
-        with torch.no_grad():
-            r2 = opt_model(input)
-        self.assertTrue(same(r1, r2.float(), tol=0.1))
-        self.assertTrue(os.path.exists(filename))
-        with open(filename) as f:
-            args_dict = json.load(f)
-            self.assertIn("convolution", args_dict.keys())
-            conv_args_dict = args_dict["convolution"]
-            self.assertIn("input", conv_args_dict.keys())
-            self.assertIn("weight", conv_args_dict.keys())
-            self.assertIn("bias", conv_args_dict.keys())
-            self.assertIn("stride", conv_args_dict.keys())
-            self.assertIn("padding", conv_args_dict.keys())
-            self.assertIn("dilation", conv_args_dict.keys())
-            self.assertIn("transposed", conv_args_dict.keys())
-            self.assertIn("output_padding", conv_args_dict.keys())
-            self.assertIn("groups", conv_args_dict.keys())
-        os.remove(filename)
-
     @unittest.skipIf(not has_ipex(), "requires ipex")
     def test_ipex_fp32(self):
         model = Conv_Bn_Relu(3, 32, kernel_size=3, stride=1)
@@ -166,11 +104,14 @@ def test_ipex_fp32(self):
         model = model.eval()
         input = torch.randn(8, 3, 64, 64).contiguous(memory_format=torch.channels_last)
         r1 = model(input)
-        opt_model = torch._dynamo.optimize(backends.ipex_fp32)(model)
-        with torch.no_grad():
-            r2 = opt_model(input)
-        self.assertTrue(same(r1, r2))
-        self.assertEqual(r2.dtype, torch.float32)
+        for dynamic_shapes in [True, False]:
+            torch._dynamo.reset()
+            opt_model = torch._dynamo.optimize("ipex", dynamic=dynamic_shapes)(model)
+            with torch.no_grad():
+                for _ in range(3):
+                    r2 = opt_model(input)
+            self.assertTrue(same(r1, r2))
+            self.assertEqual(r2.dtype, torch.float32)
 
     @unittest.skipIf(not has_ipex(), "requires ipex")
     def test_ipex_bf16(self):
@@ -179,15 +120,70 @@ def test_ipex_bf16(self):
         model = model.eval()
         input = torch.randn(8, 3, 64, 64).contiguous(memory_format=torch.channels_last)
         r1 = model(input)
-        opt_model = torch._dynamo.optimize(backends.ipex_bf16)(model)
-        with torch.no_grad(), torch.cpu.amp.autocast():
-            r2 = opt_model(input)
-        self.assertTrue(same(r1, r2.float(), tol=0.1))
-        self.assertEqual(r2.dtype, torch.bfloat16)
+        for dynamic_shapes in [True, False]:
+            torch._dynamo.reset()
+            opt_model = torch._dynamo.optimize("ipex", dynamic=dynamic_shapes)(model)
+            with torch.no_grad(), torch.cpu.amp.autocast():
+                for _ in range(3):
+                    r2 = opt_model(input)
+            self.assertTrue(same(r1, r2.float(), tol=0.1))
+            self.assertEqual(r2.dtype, torch.bfloat16)
+
+    def _check_backend_works(self, backend):
+        model = Seq().eval()
+        input = torch.randn(2, 10)
+        r1 = model(input)
+        r2 = torch.compile(model, backend=backend)(input)
+        self.assertTrue(same(r1, r2.float(), tol=0.01))
+
+    def test_eager(self):
+        self._check_backend_works("eager")
+
+    def test_torchscript(self):
+        self._check_backend_works("ts")
+
+    def test_aot_eager(self):
+        self._check_backend_works("aot_eager")
+
+    def test_aot_eager_decomp_partition(self):
+        self._check_backend_works("aot_eager_decomp_partition")
+
+    def test_aot_ts(self):
+        self._check_backend_works("aot_ts")
+
+    @requires_cuda()
+    def test_aot_cudagraphs(self):
+        self._check_backend_works("cudagraphs")
+
+    @requires_cuda()
+    def test_aot_ts_nvfuser(self):
+        self._check_backend_works("aot_ts_nvfuser")
+
+    @requires_cuda()
+    def test_nvprims_nvfuser(self):
+        self._check_backend_works("nvprims_nvfuser")
+
+    @requires_cuda()
+    def test_nvprims_aten(self):
+        self._check_backend_works("nvprims_aten")
+
+    @unittest.skipIf(not has_onnxruntime(), "requires onnxruntime")
+    def test_onnxrt(self):
+        self._check_backend_works("onnxrt")
+
+    @unittest.skipIf(not has_tvm(), "requires tvm")
+    def test_tvm(self):
+        self._check_backend_works("tvm")
+
+    def test_list_backends(self):
+        self.assertIn("inductor", torch._dynamo.list_backends())
+        self.assertIn("inductor", torch._dynamo.list_backends(exclude_tags=None))
+        self.assertNotIn("eager", torch._dynamo.list_backends())
+        self.assertNotIn("eager", torch._dynamo.list_backends(exclude_tags=["debug"]))
+        self.assertIn("eager", torch._dynamo.list_backends(exclude_tags=[]))
 
 
 class NormalizeIRTests(torch._dynamo.test_case.TestCase):
-    @unittest.skipIf(not has_functorch(), "requires functorch")
     def test_inplace_normalize(self):
         def fn(a, b):
             x = torch.cos(a)
diff --git a/test/dynamo/test_comptime.py b/test/dynamo/test_comptime.py
index 91a05fa02cbf..8444694e5765 100644
--- a/test/dynamo/test_comptime.py
+++ b/test/dynamo/test_comptime.py
@@ -241,6 +241,7 @@ def f(x):
             def _(ctx):
                 y = ctx.get_local("y")
                 SELF.assertEqual(y.as_fake().size(0), 2)
+                SELF.assertEqual(y.size(0), 2)
                 # Trigger a graph write (TODO: this is not so
                 # useful right now as there's no way to make use
                 # of the output proxy; maybe it's useful for inserting
diff --git a/test/dynamo/test_aot_cudagraphs.py b/test/dynamo/test_cudagraphs.py
similarity index 92%
rename from test/dynamo/test_aot_cudagraphs.py
rename to test/dynamo/test_cudagraphs.py
index 5299e92a060f..0b0ab79d6f74 100644
--- a/test/dynamo/test_aot_cudagraphs.py
+++ b/test/dynamo/test_cudagraphs.py
@@ -7,6 +7,7 @@
 import torch
 
 import torch._dynamo
+import torch._dynamo.config
 import torch._dynamo.test_case
 import torch._dynamo.testing
 from torch._dynamo.testing import same
@@ -46,7 +47,7 @@ def wrap(self, *args, **kwargs):
 def patch_all(ok=True):
     return composed(
         unittest.skipIf(TEST_WITH_ROCM, "ROCm not supported"),
-        patch("torch._dynamo.config.verify_correctness", True),
+        torch._dynamo.config.patch(verify_correctness=True),
         assert_aot_autograd_counter(ok),
     )
 
@@ -61,7 +62,7 @@ def test_basic(self):
         def model(x, y):
             return (x + y) * y
 
-        @torch._dynamo.optimize("aot_cudagraphs")
+        @torch._dynamo.optimize("cudagraphs")
         def fn(x, y):
             for i in range(N_ITERS):
                 loss = model(x, y).sum()
@@ -78,7 +79,7 @@ def model(x, y):
             b = a.cpu() * 3
             return b
 
-        @torch._dynamo.optimize("aot_cudagraphs")
+        @torch._dynamo.optimize("cudagraphs")
         def fn(x, y):
             for i in range(N_ITERS):
                 loss = model(x, y).sum()
@@ -94,7 +95,7 @@ def model(x, y):
             a = x + y
             return a * 3
 
-        @torch._dynamo.optimize("aot_cudagraphs")
+        @torch._dynamo.optimize("cudagraphs")
         def fn(x, y):
             for i in range(N_ITERS):
                 loss = model(x, y).sum()
@@ -110,7 +111,7 @@ def model(x, y):
             y.add_(3)
             return x * y
 
-        @torch._dynamo.optimize("aot_cudagraphs")
+        @torch._dynamo.optimize("cudagraphs")
         def fn(x, y):
             for i in range(N_ITERS):
                 with self.subTest(i):
@@ -130,7 +131,7 @@ def model(x, y):
             c.add_(2)
             return x * y * 0 + c
 
-        @torch._dynamo.optimize("aot_cudagraphs")
+        @torch._dynamo.optimize("cudagraphs")
         def fn(x, y):
             for i in range(N_ITERS):
                 with self.subTest(i):
@@ -149,7 +150,7 @@ def model(y):
             x.add_(3)
             return x * y
 
-        @torch._dynamo.optimize("aot_cudagraphs")
+        @torch._dynamo.optimize("cudagraphs")
         def fn(y):
             for i in range(N_ITERS):
                 with self.subTest(i):
@@ -170,7 +171,7 @@ def model(x):
             x.fill_(2)
             return x
 
-        @torch._dynamo.optimize("aot_cudagraphs")
+        @torch._dynamo.optimize("cudagraphs")
         def fn(x):
             for i in range(N_ITERS):
                 with self.subTest(i):
@@ -190,7 +191,7 @@ def model(x):
             y.fill_(3)
             return x, y
 
-        @torch._dynamo.optimize("aot_cudagraphs")
+        @torch._dynamo.optimize("cudagraphs")
         def fn(x):
             for i in range(N_ITERS):
                 with self.subTest(i):
diff --git a/test/dynamo/test_dynamic_shapes.py b/test/dynamo/test_dynamic_shapes.py
index c4d522a4180f..b3714019aa03 100644
--- a/test/dynamo/test_dynamic_shapes.py
+++ b/test/dynamo/test_dynamic_shapes.py
@@ -1,5 +1,4 @@
 # Owner(s): ["module: dynamo"]
-
 from torch._dynamo import config
 from torch._dynamo.testing import make_test_cls_with_patches
 
@@ -25,20 +24,75 @@
 import unittest
 
 
-def make_dynamic_cls(cls):
-    return make_test_cls_with_patches(
-        cls, "DynamicShapes", "_dynamic_shapes", (config, "dynamic_shapes", True)
+test_classes = {}
+
+
+def make_dynamic_cls(cls, assume_static_by_default):
+    assume_static_by_default_suffix = (
+        "_static_default" if assume_static_by_default else ""
+    )
+    cls_prefix = "StaticDefault" if assume_static_by_default else ""
+    test_class = make_test_cls_with_patches(
+        cls,
+        f"{cls_prefix}DynamicShapes",
+        f"_dynamic_shapes{assume_static_by_default_suffix}",
+        (config, "dynamic_shapes", True),
+        (config, "assume_static_by_default", assume_static_by_default),
     )
+    test_classes[test_class.__name__] = test_class
+    # REMOVING THIS LINE WILL STOP TESTS FROM RUNNING
+    globals()[test_class.__name__] = test_class
+    return test_class
+
+
+tests = [
+    test_functions.FunctionTests,
+    test_misc.MiscTests,
+    test_repros.ReproTests,
+    test_modules.NNModuleTests,
+    test_unspec.UnspecTests,
+    test_export.ExportTests,
+    test_subgraphs.SubGraphTests,
+]
+for test in tests:
+    for assume_static_by_default in [True, False]:
+        make_dynamic_cls(test, assume_static_by_default=assume_static_by_default)
+
+DynamicShapesMiscTestsDefaultStatic = test_classes[
+    "StaticDefaultDynamicShapesMiscTests"
+]
+DynamicShapesReproTests = test_classes["DynamicShapesReproTests"]
+DynamicShapesReproTestsDefaultStatic = test_classes[
+    "StaticDefaultDynamicShapesReproTests"
+]
+DynamicShapesSubGraphTests = test_classes["DynamicShapesSubGraphTests"]
+DynamicShapesSubGraphTestsDefaultStatic = test_classes[
+    "StaticDefaultDynamicShapesSubGraphTests"
+]
 
+unittest.expectedFailure(
+    DynamicShapesMiscTestsDefaultStatic.test_autocast_sdpa_dynamic_shapes_static_default
+)
+
+unittest.expectedFailure(
+    DynamicShapesMiscTestsDefaultStatic.test_parsing_sdpa_dynamic_shapes_static_default
+)
+
+unittest.expectedFailure(
+    DynamicShapesReproTestsDefaultStatic.test_convert_boxes_to_pooler_format_dynamic_shapes_static_default
+)
+
+unittest.expectedFailure(
+    DynamicShapesReproTestsDefaultStatic.test_do_paste_mask_dynamic_shapes_static_default
+)
 
-DynamicShapesFunctionTests = make_dynamic_cls(test_functions.FunctionTests)
-DynamicShapesMiscTests = make_dynamic_cls(test_misc.MiscTests)
-DynamicShapesReproTests = make_dynamic_cls(test_repros.ReproTests)
-DynamicShapesNNModuleTests = make_dynamic_cls(test_modules.NNModuleTests)
-DynamicShapesUnspecTests = make_dynamic_cls(test_unspec.UnspecTests)
-DynamicShapesExportTests = make_dynamic_cls(test_export.ExportTests)
-DynamicShapesSubGraphTests = make_dynamic_cls(test_subgraphs.SubGraphTests)
+unittest.expectedFailure(
+    DynamicShapesReproTestsDefaultStatic.test_hf_t5_forward_dynamic_shapes_static_default
+)
 
+unittest.expectedFailure(
+    DynamicShapesReproTestsDefaultStatic.test_sort_out2_dynamic_shapes_static_default
+)
 
 unittest.expectedFailure(
     DynamicShapesReproTests.test_do_paste_mask_dynamic_shapes
@@ -55,18 +109,19 @@ def make_dynamic_cls(cls):
     # Cannot call sizes() on tensor with symbolic sizes/strides
 )
 
-# DynamicShapesExportTests
 unittest.expectedFailure(
-    DynamicShapesExportTests.test_export_with_constant_list_nonzero_dynamic_shapes
-)
-unittest.expectedFailure(
-    DynamicShapesExportTests.test_export_with_constant_list_nonzero_free_function_dynamic_shapes
+    DynamicShapesReproTests.test_sort_out2_dynamic_shapes
+    # Cannot call sizes() on tensor with symbolic sizes/strides
 )
+
 unittest.expectedFailure(
-    DynamicShapesExportTests.test_export_with_constant_tuple_nonzero_dynamic_shapes
+    DynamicShapesMiscTests.test_autocast_sdpa_dynamic_shapes
+    # Cannot call sizes() on tensor with symbolic sizes/strides
 )
+
 unittest.expectedFailure(
-    DynamicShapesExportTests.test_export_with_constant_tuple_nonzero_dynamic_shapes
+    DynamicShapesMiscTests.test_parsing_sdpa_dynamic_shapes
+    # Cannot call sizes() on tensor with symbolic sizes/strides
 )
 
 
@@ -75,19 +130,11 @@ def make_dynamic_cls(cls):
     DynamicShapesSubGraphTests.test_enumerate_not_break_graph_dynamic_shapes
 )
 
-# DynamicShapesUnspecTests
-# Missing decomp
-# RuntimeError: Failed running call_function <function batch_norm at 0x7f7d1ce38310>
-# (*(FakeTensor(FakeTensor(..., device='meta', size=(5, 1, 28, 28)), cpu),
-# FakeTensor(FakeTensor(..., device='meta', size=(1,)), cpu),
-#  FakeTensor(FakeTensor(..., device='meta', size=(1,)), cpu),
-#  FakeTensor(Parameter(FakeTensor(..., device='meta', size=(1,),
-#  requires_grad=True)), cpu),
-#  FakeTensor(Parameter(FakeTensor(..., device='meta', size=(1,),
-#  requires_grad=True)), cpu), False, 0.1,
-# FakeTensor(FakeTensor(..., device='meta', size=()), cpu)), **{}):
-# aten._local_scalar_dense.default
-unittest.expectedFailure(test_unspec.UnspecReproTests.test_batch_norm_act_unspec)
+# DynamicShapesSubGraphTests
+unittest.expectedFailure(
+    DynamicShapesSubGraphTestsDefaultStatic.test_enumerate_not_break_graph_dynamic_shapes_static_default
+)
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py
index bf905a5de4a5..30d6e3aac666 100644
--- a/test/dynamo/test_export.py
+++ b/test/dynamo/test_export.py
@@ -1,5 +1,7 @@
 # Owner(s): ["module: dynamo"]
 import operator
+import unittest
+from enum import Enum
 from typing import Dict, List
 from unittest.mock import patch
 
@@ -8,6 +10,7 @@
 import torch._dynamo.test_case
 import torch._dynamo.testing
 from functorch.experimental.control_flow import cond
+from torch._dynamo import config
 from torch.fx.experimental.proxy_tensor import make_fx
 
 
@@ -73,7 +76,7 @@ def func(x):
 
         self.assertTrue(torch._dynamo.utils.same(real_result, dynamo_result))
 
-    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    @config.patch(dynamic_shapes=True)
     def test_export_shape_control_flow_1(self):
         def func(x):
             if x.shape[0] > 10:
@@ -97,10 +100,37 @@ def func(x):
         for guard in out_guards:
             if guard.source == GuardSource.SHAPE_ENV:
                 hit = True
-                self.assertTrue("x.size()[0] <= 10" in guard.code_list[0])
+                if config.assume_static_by_default:
+                    # The guard produced here must be narrow, because
+                    # we are running with assume_static_by_default
+                    self.assertTrue("x.size()[0] == 6" in guard.code_list)
+                else:
+                    self.assertTrue("x.size()[0] <= 10" in guard.code_list)
 
         self.assertTrue(hit)
 
+    def test_export_control_flow_with_getattr(self):
+        class Animal(Enum):
+            COW = "moo"
+
+        class MyModule(torch.nn.Module):
+            def __init__(self, a):
+                super().__init__()
+                self.a = a
+
+            def forward(self, x):
+                if self.a == Animal.COW.value:
+                    return x * x
+                else:
+                    raise ValueError("bad")
+
+        module = MyModule("moo")
+        input = (torch.ones(4, 3),)
+        resA = module(*input)
+        graph, _ = torch._dynamo.export(module, *input)
+        resB = graph(*input)
+        self.assertTrue(torch._dynamo.utils.same(resA, resB))
+
     def test_export_graph_bypass(self):
         inp = [
             torch.tensor([0.1, 0.1]),
@@ -320,7 +350,7 @@ def func(x, z, k):
 
         self.assertTrue(torch._dynamo.utils.same(real_result, dynamo_result))
 
-    @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
+    @config.patch(capture_scalar_outputs=True, dynamic_shapes=True)
     def test_dupes_and_bypass_with_non_tensor_output(self):
         inp = torch.tensor([0.1, 0.1])
         inp2 = torch.tensor([0.1, 0.1])
@@ -366,7 +396,7 @@ def func(a, b, c):
 
         self.assertTrue(torch._dynamo.utils.same(real_result, dynamo_result))
 
-    @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
+    @config.patch(capture_scalar_outputs=True, dynamic_shapes=True)
     def test_zeroes_in_new_shape_scalar_out(self):
         inp = torch.zeros(10)
         inp2 = torch.zeros(10)
@@ -390,7 +420,7 @@ def func(a, b, c):
 
         self.assertTrue(torch._dynamo.utils.same(real_result, dynamo_result))
 
-    @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
+    @config.patch(capture_scalar_outputs=True, dynamic_shapes=True)
     def test_zeroes_in_new_shape_scalar_out_permute(self):
         inp = torch.zeros(10)
         inp2 = torch.zeros(10)
@@ -414,7 +444,7 @@ def func(a, b, c):
 
         self.assertTrue(torch._dynamo.utils.same(real_result, dynamo_result))
 
-    @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
+    @config.patch(capture_scalar_outputs=True, dynamic_shapes=True)
     def test_zeroes_in_new_shape_scalar_out_permute_dupe_and_bypass(self):
         inp = torch.zeros(10)
         inp2 = torch.zeros(10)
@@ -771,7 +801,7 @@ def func(x, z, k):
 
         self.assertTrue(torch._dynamo.utils.same(real_result, dynamo_result))
 
-    @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
+    @config.patch(capture_scalar_outputs=True, dynamic_shapes=True)
     def test_dupes_and_bypass_with_non_tensor_output_with_aten_graph(self):
         inp = torch.tensor([0.1, 0.1])
         inp2 = torch.tensor([0.1, 0.1])
@@ -873,9 +903,6 @@ def test_export_with_stack_trace(self):
         inp = torch.randn(4, 4)
 
         class MyBlock(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 x = torch.nn.functional.linear(x, torch.randn(4, 4))
                 return torch.cos(x).relu() + 1
@@ -907,6 +934,52 @@ def forward(self, x):
                 self.assertTrue(node.stack_trace is not None)
                 self.assertTrue(node.meta["nn_module_stack"] is not None)
                 self.assertTrue(node.meta["source_fn"] is not None)
+                self.assertTrue(node.meta["val"] is not None)
+
+    def test_export_preserves_nn_module_stack_for_get_attr(self):
+        inp = torch.randn(4, 4)
+
+        class MyBlock(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.weight = torch.nn.Parameter(torch.ones(1, 1))
+                self.register_buffer("buffer", torch.ones(1, 1))
+
+            def forward(self, x):
+                x = torch.nn.functional.linear(x, torch.randn(4, 4))
+                return torch.cos(x).relu() + self.weight + self.buffer
+
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.block = MyBlock()
+
+            def forward(self, x):
+                out = self.block(x)
+                return out
+
+        m = MyModule()
+        exported = torch._dynamo.export(m, inp, aten_graph=False)
+        out_graph = exported[0]
+
+        attr_access_count = 0
+        for node in out_graph.graph.nodes:
+            if node.op == "get_attr":
+                attr_access_count += 1
+                self.assertTrue(node.meta["nn_module_stack"] is not None)
+        self.assertEqual(attr_access_count, 2)
+
+        torch._dynamo.reset()
+
+        exported = torch._dynamo.export(m, inp, aten_graph=True)
+        out_graph = exported[0]
+
+        attr_access_count = 0
+        for node in out_graph.graph.nodes:
+            if node.op == "get_attr":
+                attr_access_count += 1
+                self.assertTrue(node.meta["nn_module_stack"] is not None)
+        self.assertEqual(attr_access_count, 2)
 
     def test_export_compare_optimize_with_make_fx(self):
         inp = torch.tensor([0.1, 0.1])
@@ -1093,9 +1166,6 @@ def helper_fn(x):
             return torch.nonzero(x)
 
         class MyModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, z):
                 y = helper_fn(x) + helper_fn(z)
                 return y
@@ -1421,7 +1491,7 @@ def nop(x):
                 f, (torch.randn(5)), aten_graph=False, tracing_mode="symbolic"
             )
 
-    @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
+    @config.patch(capture_scalar_outputs=True, dynamic_shapes=True)
     def test_export_with_module_layer(self):
         from functorch.experimental.control_flow import cond
 
@@ -1459,14 +1529,31 @@ def false_fn(val):
         dynamo_result_2 = out_graph(pred, x)
         self.assertTrue(torch._dynamo.utils.same(real_result_2, dynamo_result_2))
 
-    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    @config.patch(dynamic_shapes=True)
+    def test_export_with_cond_dynamic_shape_pred(self):
+        from functorch.experimental.control_flow import cond
+
+        class Module(torch.nn.Module):
+            def forward(self, x):
+                def true_fn(x):
+                    return x + x
+
+                def false_fn(x):
+                    return x[:2]
+
+                return cond(x.shape[0] <= 2, true_fn, false_fn, [x])
+
+        mod = Module()
+        x = torch.randn(2, 2)
+        out_graph, _ = torch._dynamo.export(mod, x)
+        test_x = torch.randn(3, 2)
+        self.assertEqual(out_graph(test_x), mod(test_x))
+
+    @config.patch(dynamic_shapes=True)
     def test_export_with_map_cond(self):
         from functorch.experimental.control_flow import cond, map
 
         class Module(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def inner(self, x, pred):
                 def true_fn(x):
                     return x + x
@@ -1493,14 +1580,11 @@ def body(x, pred):
         out_graph, _ = torch._dynamo.export(mod, pred_x, x)
         self.assertEqual(real_result, out_graph(pred_y, y))
 
-    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    @config.patch(dynamic_shapes=True)
     def test_export_with_map_zero_sized_tensor(self):
         from functorch.experimental.control_flow import map
 
         class Module(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, xs):
                 def body(x):
                     return x + 1
@@ -1555,7 +1639,7 @@ def f(x: torch.Tensor) -> torch.Tensor:
 
         self.assertTrue(has_sym_size)
 
-    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    @config.patch(dynamic_shapes=True)
     def test_dynamic_slicing(self):
         def f(x):
             return x[: x.shape[0] - 2, x.shape[1] - 1 :: 2]
@@ -1593,7 +1677,7 @@ def f(x):
         self.assertEqual(count, 3)
         self.assertEqual(gm_torch_mode(inp).shape, f(inp).shape)
 
-    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    @config.patch(dynamic_shapes=True)
     def test_dynamic_slicing_invalid(self):
         def g(x, y):
             return x[y : x.shape[0]]
@@ -1610,7 +1694,7 @@ def g(x, y):
                 tracing_mode="symbolic",
             )
 
-    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    @config.patch(capture_scalar_outputs=True, dynamic_shapes=True)
     def test_dynamic_slicing_simple(self):
         def f(x):
             return x[slice(None, None, None)]
@@ -1622,11 +1706,10 @@ def f(x):
         inp = torch.randn(6, 7)
         self.assertEqual(gm(inp), f(inp))
 
+    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
     def test_export_cond_in_aten_symbolic(self):
         class ConditionOp(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def true_fn(self, x, y):
                 return x * y
 
@@ -1650,6 +1733,282 @@ def forward(self, pred, x, y):
 
         self.assertEqual(gm(*inp), model(*inp))
 
+    def test_export_with_kwargs(self):
+        def fn_with_kwargs(pos0, tuple0, *myargs, mykw0=None, **mykwargs):
+            out = pos0
+            for arg in tuple0:
+                out *= arg
+            for arg in myargs:
+                out *= arg
+            out *= mykw0
+            out *= mykwargs["input0"] * mykwargs["input1"]
+            return out
+
+        mykwargs = {"input0": torch.randn(4), "input1": torch.randn(4)}
+        tuple0 = (torch.randn(4), torch.randn(4))
+        mykw0 = torch.randn(4)
+        pos0 = torch.randn(4)
+        myargs = [torch.randn(4), torch.randn(4)]
+
+        torch._dynamo.reset()
+        exported = torch._dynamo.export(
+            fn_with_kwargs,
+            pos0,
+            tuple0,
+            *myargs,
+            aten_graph=False,
+            mykw0=mykw0,
+            **mykwargs,
+        )
+
+        out_graph = exported[0]
+        dynamo_result = out_graph(pos0, tuple0, *myargs, mykw0=mykw0, **mykwargs)
+        real_result = fn_with_kwargs(pos0, tuple0, *myargs, mykw0=mykw0, **mykwargs)
+        self.assertTrue(torch._dynamo.utils.same(real_result, dynamo_result))
+
+    def test_export_with_kwargs_and_empty_args(self):
+        def fn_with_kwargs(mykw0=None, **mykwargs):
+            out = mykw0
+            out *= mykwargs["input0"] * mykwargs["input1"]
+            return out
+
+        mykwargs = {"input0": torch.randn(4), "input1": torch.randn(4)}
+        mykw0 = torch.randn(4)
+
+        torch._dynamo.reset()
+        exported = torch._dynamo.export(
+            fn_with_kwargs,
+            aten_graph=False,
+            mykw0=mykw0,
+            **mykwargs,
+        )
+
+        out_graph = exported[0]
+        dynamo_result = out_graph(mykw0=mykw0, **mykwargs)
+        real_result = fn_with_kwargs(mykw0=mykw0, **mykwargs)
+        self.assertTrue(torch._dynamo.utils.same(real_result, dynamo_result))
+
+    def test_export_with_args_and_empty_kwargs(self):
+        def fn_with_kwargs(pos0, tuple0, *myargs):
+            out = pos0
+            for arg in tuple0:
+                out *= arg
+            for arg in myargs:
+                out *= arg
+            return out
+
+        tuple0 = (torch.randn(4), torch.randn(4))
+        pos0 = torch.randn(4)
+        myargs = [torch.randn(4), torch.randn(4)]
+
+        torch._dynamo.reset()
+        exported = torch._dynamo.export(
+            fn_with_kwargs, pos0, tuple0, *myargs, aten_graph=False
+        )
+
+        out_graph = exported[0]
+        dynamo_result = out_graph(pos0, tuple0, *myargs)
+        real_result = fn_with_kwargs(pos0, tuple0, *myargs)
+        self.assertTrue(torch._dynamo.utils.same(real_result, dynamo_result))
+
+    def test_export_meta(self):
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.p = torch.nn.Parameter(torch.ones(2, 3))
+
+            def forward(self, x):
+                return self.p + x
+
+        with torch.device("meta"):
+            m = MyModule()
+
+        inp = torch.ones(2, 3, device="meta")
+        exported = torch._dynamo.export(m, inp)
+        out_graph = exported[0]
+        dynamo_result = out_graph(inp)
+        self.assertEqual(dynamo_result, m(inp))
+
+    @config.patch(dynamic_shapes=True)
+    def test_export_raise_guard_full_constraint(self):
+        y = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(x):
+            if x.shape[0] == 3:
+                return x.sin()
+            return x.cos()
+
+        torch._dynamo.export(my_dyn_fn, y)
+        torch._dynamo.mark_dynamic(y, 0)
+
+        with self.assertRaises(
+            torch._dynamo.exc.InternalTorchDynamoError,
+        ):
+            torch._dynamo.export(my_dyn_fn, y)
+
+    @config.patch(dynamic_shapes=True)
+    def test_export_raise_guard_partial_constraint(self):
+        y = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(x):
+            if x.shape[0] > 3:
+                return x.sin()
+            return x.cos()
+
+        torch._dynamo.export(my_dyn_fn, y)
+        torch._dynamo.mark_dynamic(y, 0)
+
+        with self.assertRaises(
+            torch._dynamo.exc.InternalTorchDynamoError,
+        ):
+            torch._dynamo.export(my_dyn_fn, y)
+
+    @config.patch(dynamic_shapes=True)
+    def test_export_no_raise_on_relationship(self):
+        y = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(a, b, c):
+            if a.shape[0] == b.shape[1] == c.shape[2]:
+                return a.sin()
+
+            return a.cos()
+
+        torch._dynamo.export(my_dyn_fn, y, y, y)
+        torch._dynamo.mark_dynamic(y, 0)
+        if config.assume_static_by_default:
+            # The assume_static flag causes this to raise, as
+            # we are now esentially comparing with a constant
+            with self.assertRaises(
+                torch._dynamo.exc.InternalTorchDynamoError,
+            ):
+                torch._dynamo.export(my_dyn_fn, y, y, y)
+        else:
+            torch._dynamo.export(my_dyn_fn, y, y, y)
+
+    @config.patch(dynamic_shapes=True)
+    def test_export_no_raise(self):
+        y = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(a, b, c):
+            if a.shape[1] == 3:
+                return a.cos()
+            return a * b * c
+
+        torch._dynamo.export(my_dyn_fn, y, y, y)
+        torch._dynamo.mark_dynamic(y, 0)
+        torch._dynamo.export(my_dyn_fn, y, y, y)
+
+    @config.patch(dynamic_shapes=True)
+    def test_export_multi_dynamic_dim_safe_relationship(self):
+        x = torch.randn([3, 3, 3])
+        y = torch.randn([2, 2, 2])
+        z = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(a, b, c):
+            if a.shape[0] == c.shape[0]:
+                return a.cos()
+            return a * c, b
+
+        torch._dynamo.export(my_dyn_fn, x, y, z)
+        torch._dynamo.mark_dynamic(y, 0)
+        torch._dynamo.mark_dynamic(x, 0)
+        torch._dynamo.mark_dynamic(z, 0)
+        torch._dynamo.export(my_dyn_fn, x, y, z)
+
+    # This should not fail, but it does, because
+    # symbolic_shapes simplification _maybe_evaluate_static removes this guard
+    # see https://docs.google.com/document/d/16VPOa3d-Liikf48teAOmxLc92rgvJdfosIy-yoT38Io/edit#
+    @unittest.expectedFailure
+    @config.patch(dynamic_shapes=True)
+    def test_export_dynamic_dim_not_1(self):
+        x = torch.randn([1, 1, 1])
+
+        def my_dyn_fn(a):
+            if a.shape[0] != 1:
+                return a.cos()
+            return a * a
+
+        torch._dynamo.export(my_dyn_fn, x)
+        torch._dynamo.mark_dynamic(x, 0)
+        with self.assertRaises(
+            torch._dynamo.exc.InternalTorchDynamoError,
+        ):
+            torch._dynamo.export(my_dyn_fn, x)
+
+    @config.patch(dynamic_shapes=True)
+    def test_export_multi_dynamic_dim_constraint(self):
+        x = torch.randn([3, 3, 3])
+        y = torch.randn([2, 2, 2])
+        z = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(a, b, c):
+            if a.shape[0] == c.shape[0]:
+                return a.cos()
+            return a * c, b
+
+        torch._dynamo.export(my_dyn_fn, x, y, z)
+        torch._dynamo.mark_dynamic(x, 0)
+        torch._dynamo.mark_dynamic(x, 1)
+        torch._dynamo.mark_dynamic(x, 2)
+        if config.assume_static_by_default:
+            # The assume_static flag causes this to raise, as
+            # we are now esentially comparing with a constant
+            with self.assertRaises(
+                torch._dynamo.exc.InternalTorchDynamoError,
+            ):
+                torch._dynamo.export(my_dyn_fn, x, y, z)
+        else:
+            torch._dynamo.export(my_dyn_fn, x, y, z)
+
+    @config.patch(dynamic_shapes=True)
+    def test_list_contains(self):
+        def func(x):
+            assert x.size(-1) in [4, 5, 6], "bad"
+            return x + x
+
+        inps = (torch.randn(1, 5),)
+        opt_func = torch._dynamo.optimize("eager", nopython=True)(func)
+        real_result = opt_func(*inps)
+
+        torch._dynamo.reset()
+
+        exported = torch._dynamo.export(func, *inps, aten_graph=True)
+        out_graph = exported[0]
+
+        dynamo_result = out_graph(*inps)
+
+        self.assertTrue(torch._dynamo.utils.same(real_result, dynamo_result))
+
+    def test_list_not_contains(self):
+        def func(x):
+            assert x.size(0) not in [4, 5, 6], "bad1"
+            assert "monkey" not in ["cow", "pig"], "bad2"
+            return x + x
+
+        inps = (torch.randn(1, 5),)
+        opt_func = torch._dynamo.optimize("eager", nopython=True)(func)
+        real_result = opt_func(*inps)
+
+        torch._dynamo.reset()
+
+        exported = torch._dynamo.export(func, *inps, aten_graph=True)
+        out_graph = exported[0]
+
+        dynamo_result = out_graph(*inps)
+
+        self.assertTrue(torch._dynamo.utils.same(real_result, dynamo_result))
+
+    def test_export_identity(self):
+        inp = torch.tensor([0.1, 0.1])
+
+        def func(x):
+            return x
+
+        torch._dynamo.reset()
+        exported, _ = torch._dynamo.export(func, inp)
+        dynamo_result = exported(inp)
+        self.assertTrue(torch._dynamo.utils.same(inp, dynamo_result))
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/test/dynamo/test_export_mutations.py b/test/dynamo/test_export_mutations.py
index 218935d3f8cb..1bc528050af0 100644
--- a/test/dynamo/test_export_mutations.py
+++ b/test/dynamo/test_export_mutations.py
@@ -57,9 +57,6 @@ def forward(self, x):
     def test_module_attribute_mutation_violation_positive_4(self):
         # Mutating attribute with an inline function
         class Foo(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def add(self, a, b):
                 return a + b
 
diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py
index 2a503c945ac2..d19697538d4f 100644
--- a/test/dynamo/test_functions.py
+++ b/test/dynamo/test_functions.py
@@ -371,6 +371,22 @@ def test_tensor_type2(a, b):
         m = a.to("cuda")
         return m + b.type(m.type())
 
+    @make_test
+    def test_tensor_type3(a, b):
+        m = a.type(torch.HalfTensor)
+        return b.type(m.type())
+
+    @make_test
+    def test_tensor_type4(a, b):
+        m = a.type("torch.HalfTensor")
+        return b.type(m.type())
+
+    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
+    @make_test
+    def test_tensor_type5(a, b):
+        m = a.type(torch.cuda.HalfTensor)
+        return b.type(m.type())
+
     @make_test
     def test_ndim(x):
         if x.ndim == 2 and x.ndimension() == 2 and x.dim() == 2:
@@ -380,6 +396,10 @@ def test_ndim(x):
     def test_T(x):
         return torch.ones_like(x.T)
 
+    @make_test
+    def test_mT(x):
+        return torch.ones_like(x.mT)
+
     @make_test
     def test_is_sparse(x):
         if not x.is_sparse:
@@ -546,6 +566,24 @@ def fn(x):
         test = make_test(fn)
         test(self)
 
+    @make_test
+    def test_call_dict1(x):
+        d1 = dict()
+        d1["x"] = x + 1
+        d2 = collections.OrderedDict()
+        d2["x"] = x + 2
+        return d1["x"] + d2["x"] + 1
+
+    @make_test
+    def test_call_dict2(x):
+        d1 = dict()
+        d1["x"] = x
+        d2 = collections.OrderedDict(d1)
+        if isinstance(d2, collections.OrderedDict):
+            return x + 1
+        else:
+            return x - 1
+
     @make_test
     def test_min_max(a, b):
         c = a + b
@@ -623,6 +661,35 @@ def test_list_reversed(a, b):
         tmp = [a + 1, a + 2, a + 3]
         return a + b + next(iter(reversed(tmp)))
 
+    @make_test
+    def test_list_sorted1(x):
+        tmp = [1, 10, 3, 0]
+        return x + 1, sorted(tmp), sorted(tmp, reverse=True)
+
+    @make_test
+    def test_list_sorted2(x):
+        y = [
+            ("john", "A", 8),
+            ("jane", "B", 5),
+            ("dave", "B", 10),
+        ]
+        return (
+            x + 1,
+            sorted(y),
+            sorted(y, key=lambda student: student[2]),
+            sorted(y, key=lambda student: student[2], reverse=True),
+        )
+
+    @make_test
+    def test_tuple_sorted(x):
+        tmp = (1, 10, 3, 0)
+        return x + 1, sorted(tmp), sorted(tmp, reverse=True)
+
+    @make_test
+    def test_dict_sorted(x):
+        tmp = {1: "D", 10: "B", 3: "E", 0: "F"}
+        return x + 1, sorted(tmp), sorted(tmp, reverse=True)
+
     @make_test
     def test_list_clear(a, b):
         tmp = [a + 1, a + 2]
@@ -719,6 +786,14 @@ def test_torch_distributions_functions(x):
         independent = torch.distributions.Independent(normal, 1)
         return independent.log_prob(x)
 
+    @make_test
+    def test_context_wrapping_nested_functions_no_closure(x):
+        @torch.no_grad()
+        def augment(x: torch.Tensor) -> torch.Tensor:
+            return (x + 1) * 2
+
+        return augment(x)
+
     # # This is to test the new syntax for pattern matching
     # # ("match ... case ...") added on python 3.10.
     # # Uncomment these test cases if you run on 3.10+
@@ -754,9 +829,6 @@ def global_func_with_default_tensor_args(
 
 
 class ModuleWithDefaultTensorArgsMethod(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
     def forward(self, x=torch.zeros((2, 2)), *, kw_x=torch.zeros((1, 2))):
         x.add_(1)
         kw_x.add_(1)
@@ -856,6 +928,29 @@ def test_meth_default_tensor_args(self):
         self.assertEqual(cnts.frame_count, 3)
         self.assertEqual(cnts.op_count, 6)
 
+    def test_func_default_torch_args(self):
+        """
+        Tests other types of torch types as function default (size, dtype, device)
+        """
+
+        def func_with_default_torch_args(
+            dt=torch.float16, ds=torch.Size((1, 2, 3)), dd=torch.device("cpu")
+        ):
+            return torch.ones(ds, dtype=dt, device=dd)
+
+        def func():
+            return func_with_default_torch_args()
+
+        cnts = torch._dynamo.testing.CompileCounter()
+        compiled_func = torch.compile(func, backend=cnts)
+        out = func()
+        compiled_out = compiled_func()
+        self.assertEqual(out.dtype, compiled_out.dtype)
+        self.assertEqual(out.device, compiled_out.device)
+        self.assertEqual(out.size(), compiled_out.size())
+        self.assertEqual(cnts.frame_count, 1)
+        self.assertEqual(cnts.op_count, 1)
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/test/dynamo/test_global.py b/test/dynamo/test_global.py
index 445a6cf103d4..237aefe08e57 100644
--- a/test/dynamo/test_global.py
+++ b/test/dynamo/test_global.py
@@ -11,7 +11,7 @@
     import test_global_declaration
 
 
-class Pair(object):  # noqa: B903
+class Pair:  # noqa: B903
     def __init__(self, x, y):
         self.x = x
         self.y = y
diff --git a/test/dynamo/test_interop.py b/test/dynamo/test_interop.py
new file mode 100644
index 000000000000..1576706171b5
--- /dev/null
+++ b/test/dynamo/test_interop.py
@@ -0,0 +1,38 @@
+# Owner(s): ["module: dynamo"]
+import torch
+
+import torch._dynamo.test_case
+import torch._dynamo.testing
+import torch.onnx.operators
+from torch._dynamo.testing import same
+
+
+def fn(a, b):
+    return a + b * 0.67
+
+
+class InteropTests(torch._dynamo.test_case.TestCase):
+    def _common(self, fn):
+        inputs = [torch.randn(10), torch.randn(10)]
+        ref = fn(*inputs)
+        opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
+        res = opt_fn(*inputs)
+        self.assertTrue(same(ref, res))
+
+    def test_fx_fn(self):
+        fx_fn = torch.fx.symbolic_trace(fn)
+        self._common(lambda a, b: fx_fn(a, b) + 1)
+
+    def test_script_fn(self):
+        script_fn = torch.jit.script(fn)
+        self._common(lambda a, b: script_fn(a, b) + 1)
+
+    def test_trace_fn(self):
+        trace_fn = torch.jit.trace(fn, [torch.zeros(10), torch.zeros(10)])
+        self._common(lambda a, b: trace_fn(a, b) + 1)
+
+
+if __name__ == "__main__":
+    from torch._dynamo.test_case import run_tests
+
+    run_tests()
diff --git a/test/dynamo/test_minifier.py b/test/dynamo/test_minifier.py
index 455777acf252..2175270ef50a 100644
--- a/test/dynamo/test_minifier.py
+++ b/test/dynamo/test_minifier.py
@@ -4,7 +4,6 @@
 import textwrap
 import unittest
 
-import torch
 import torch._dynamo
 from torch._dynamo.test_minifier_common import MinifierTestBase
 
@@ -13,7 +12,7 @@
 )
 
 RELU_COMPILE_ERROR_BACKEND = """\
-from torch._dynamo.optimizations.backends import register_backend
+from torch._dynamo import register_backend
 
 class DynamoCompileError(Exception):
     pass
@@ -27,7 +26,7 @@ def test_relu_compile_error(gm: torch.fx.GraphModule, example_inputs):
 """
 
 RELU_RUNTIME_ERROR_BACKEND = """\
-from torch._dynamo.optimizations.backends import register_backend
+from torch._dynamo import register_backend
 
 @register_backend
 def test_relu_runtime_error(gm: torch.fx.GraphModule, example_inputs):
@@ -40,7 +39,7 @@ def test_relu_runtime_error(gm: torch.fx.GraphModule, example_inputs):
 """
 
 RELU_ACCURACY_ERROR_BACKEND = """\
-from torch._dynamo.optimizations.backends import register_backend
+from torch._dynamo import register_backend
 
 @register_backend
 def test_relu_accuracy_error(gm: torch.fx.GraphModule, example_inputs):
@@ -315,12 +314,12 @@ def test_dynamo_config_serialization(self):
         run_code = textwrap.dedent(
             """\
             import torch._dynamo.config
-            torch._dynamo.config.log_level = 5
+            torch._dynamo.config.cache_size_limit = 55
             data = torch._dynamo.config.save_config()
-            torch._dynamo.config.log_level = 3
+            torch._dynamo.config.cache_size_limit = 3
             torch._dynamo.config.repro_after = "dynamo"
             torch._dynamo.config.load_config(data)
-            assert torch._dynamo.logging.get_loggers()[0].level == 5
+            assert torch._dynamo.config.cache_size_limit == 55
             assert torch._dynamo.config.repro_after == "dynamo"
         """
         )
@@ -338,11 +337,13 @@ def _test_after_dynamo_with_modified_config(
                 break
         else:
             self.assertTrue(False)
-        lines.insert(def_idx + 1, "    assert torch._dynamo.config.log_level == 5")
+        lines.insert(
+            def_idx + 1, "    assert torch._dynamo.config.cache_size_limit == 5"
+        )
         backend_code = "\n".join(lines)
         run_code = textwrap.dedent(
             f"""\
-            torch._dynamo.config.log_level = 5
+            torch._dynamo.config.cache_size_limit = 5
             @torch._dynamo.optimize("{self._get_fn_name(backend_code)}")
             def inner(x):
                 for _ in range(10):
diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 7738064029a2..12d5bdc656bf 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -29,6 +29,10 @@
     unsupported,
 )
 from torch.nn import functional as F
+from torch.testing._internal.common_cuda import (
+    PLATFORM_SUPPORTS_FUSED_SDPA,
+    SM80OrLater,
+)
 from torch.testing._internal.common_utils import freeze_rng_state
 from torch.testing._internal.jit_utils import JitTestCase
 
@@ -150,6 +154,267 @@ def matmul_op1(a, b):
         # TODO(jansel): FX doesn't support this, should add upstream support
         torch._dynamo.testing.standard_test(self, matmul_op1, 2, expected_ops=1)
 
+    def test_int_shape_binops(self):
+        def fn(x):
+            # Test reversal by putting int arg first.
+            y = 15 - x.shape[0]
+            y = 4 + y
+            y = 5 * y
+            y = 2 % y
+            y = 3**y
+            y = 10 // y
+            y = pow(2, y)
+            y = 10 / y
+            return x + y
+
+        torch._dynamo.testing.standard_test(
+            self, fn, 1, expected_ops=1, expected_ops_dynamic=11
+        )
+
+    def test_shape_int_inplace_binops(self):
+        def fn(x):
+            p = x.shape[0]
+            p += 2
+            p -= 2
+            p **= 2
+            p /= 2
+            p *= 2
+            p //= 2
+            p %= 2
+            return x + p
+
+        torch._dynamo.testing.standard_test(
+            self, fn, 1, expected_ops=1, expected_ops_dynamic=10
+        )
+
+    def test_int_shape_inplace_binops(self):
+        def fn(x):
+            p = x.shape[0]
+            # Test reversal by putting constant first
+            y = 2
+            y += p
+            y = 2
+            y -= p
+            y = 2
+            y **= p
+            y = 2
+            y /= p
+            y = 2
+            y *= p
+            y = 2
+            y //= p
+            y = 2
+            y %= p
+            return x + y
+
+        torch._dynamo.testing.standard_test(
+            self, fn, 1, expected_ops=1, expected_ops_dynamic=10
+        )
+
+    def test_int_int_comparisons(self):
+        def fn(x):
+            if 2 != 2:
+                out = 1
+            elif 2 < 1:
+                out = 1
+            elif 1 > 2:
+                out = 1
+            elif 1 >= 2:
+                out = 1
+            elif 2 <= 1:
+                out = 1
+            elif 2 == 2:
+                out = 2
+            else:
+                out = 1
+            return x + out
+
+        torch._dynamo.testing.standard_test(self, fn, 1, expected_ops=1)
+
+    def test_shape_int_comparisons(self):
+        def fn(x):
+            a = x.shape[0]
+            # Ensure support for constant on right side
+            if a != 10:
+                out = 1
+            elif a < 2:
+                out = 1
+            elif a > 12:
+                out = 1
+            elif a >= 12:
+                out = 1
+            elif a <= 2:
+                out = 1
+            elif a == 10:
+                out = 2
+            else:
+                out = 1
+            return x + out
+
+        # expect for dynamic: size, index, 6 comparison ops, add
+        torch._dynamo.testing.standard_test(
+            self, fn, 1, expected_ops=1, expected_ops_dynamic=9
+        )
+
+    def test_int_shape_comparisons(self):
+        def fn(x):
+            a = x.shape[0]
+            # Ensure support for constant on left side
+            if 10 != a:
+                out = 1
+            elif 12 < a:
+                out = 1
+            elif 2 > a:
+                out = 1
+            elif 2 >= a:
+                out = 1
+            elif 12 <= a:
+                out = 1
+            elif 10 == a:
+                out = 2
+            else:
+                out = 1
+            return x + out
+
+        # expect for dynamic: size, index, 6 comparison ops, add
+        torch._dynamo.testing.standard_test(
+            self, fn, 1, expected_ops=1, expected_ops_dynamic=9
+        )
+
+    def test_param_shape_binops(self):
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.param = torch.nn.Parameter(torch.randn(15))
+
+            def forward(self, x):
+                # Test reversal by putting param shape arg first.
+                p = self.param.shape[0]
+                y = p - x.shape[0]
+                y = p + y
+                y = p * y
+                y = p % y
+                y = p**y
+                y = p // y
+                y = pow(p, y)
+                y = p / y
+                return x + y
+
+        counts = torch._dynamo.testing.CompileCounter()
+        mod = MyModule()
+        optimized_mod = torch._dynamo.optimize(counts, nopython=True)(mod)
+
+        x = torch.randn(3)
+        ref = mod(x)
+        res = optimized_mod(x)
+
+        self.assertTrue(same(ref, res))
+        self.assertEqual(counts.frame_count, 1)
+        expected_op_count = 13 if torch._dynamo.testing.config.dynamic_shapes else 1
+        self.assertEqual(counts.op_count, expected_op_count)
+
+    def test_user_defined_binop(self):
+        class MyClass:
+            def __init__(self, value):
+                self.value = value
+
+            def __radd__(self, other):
+                return self.value + other
+
+        def fn(x, c):
+            y = x.shape[0] + c
+            return x + y
+
+        counts = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(counts)(fn)
+
+        x = torch.randn(3)
+        c = MyClass(4)
+        ref = fn(x, c)
+        res = opt_fn(x, c)
+
+        self.assertTrue(same(ref, res))
+        self.assertEqual(counts.frame_count, 1)
+        expected_op_count = 4 if torch._dynamo.testing.config.dynamic_shapes else 1
+        self.assertEqual(counts.op_count, expected_op_count)
+
+    def test_compare_shapes_eq(self):
+        def compare_shapes(a, b, to_list):
+            x = list(a.unsqueeze(-1).shape) if to_list else a.shape
+            y = list(b.unsqueeze(-1).shape) if to_list else b.shape
+            if x == y:
+                return a + 1
+            else:
+                return a + 2
+
+        # Test both ListVariable and ShapeVariable
+        torch._dynamo.testing.standard_test(
+            self, lambda a, b: compare_shapes(a, b, to_list=True), 2
+        )
+        torch._dynamo.testing.standard_test(
+            self, lambda a, b: compare_shapes(a, b, to_list=False), 2
+        )
+
+    def test_compare_shapes_tuple_eq(self):
+        def compare_shapes(a, b):
+            x = tuple(a.unsqueeze(-1).shape)
+            y = tuple(b.unsqueeze(-1).shape)
+            if x == y:
+                return a + 1
+            else:
+                return a + 2
+
+        torch._dynamo.testing.standard_test(self, lambda a, b: compare_shapes(a, b), 2)
+
+    def test_compare_shapes_tuple_neq(self):
+        def compare_shapes(a, b):
+            x = tuple(a.unsqueeze(-1).shape)
+            y = tuple(b.unsqueeze(-1).shape)
+            if x != y:
+                return a + 1
+            else:
+                return a + 2
+
+        torch._dynamo.testing.standard_test(self, lambda a, b: compare_shapes(a, b), 2)
+
+    def test_compare_shapes_neq(self):
+        def compare_shapes(a, b, to_list):
+            x = list(a.unsqueeze(-1).shape) if to_list else a.shape
+            y = list(b.unsqueeze(-1).shape) if to_list else b.shape
+            if x != y:
+                return a + 1
+            else:
+                return a + 2
+
+        # Test both ListVariable and ShapeVariable
+        torch._dynamo.testing.standard_test(
+            self, lambda a, b: compare_shapes(a, b, to_list=True), 2
+        )
+        torch._dynamo.testing.standard_test(
+            self, lambda a, b: compare_shapes(a, b, to_list=False), 2
+        )
+
+    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    def test_compare_shapes_with_constant(self):
+        def compare_shapes(a):
+            x = a.shape
+            if x[0] != 3:
+                return a * 4
+            return a * 3
+
+        guard_failure = None
+
+        def guard_failures(failure):
+            nonlocal guard_failure
+            guard_failure = failure
+
+        opt_fn = torch._dynamo.optimize(
+            "eager", nopython=True, guard_fail_fn=guard_failures
+        )(compare_shapes)
+        opt_fn(torch.randn([3, 4]))
+        opt_fn(torch.randn([4, 3]))
+        self.assertEqual(guard_failure.reason, "a.size()[0] == 3")
+
     def test_builtin_isinstance(self):
         def fn(x):
             t = torch.arange(1, 3)
@@ -183,6 +448,17 @@ def fn(x):
         r2 = opt_fn(i)
         self.assertTrue(same(r1, r2))
 
+    def test_tensor_iter(self):
+        def fn(x):
+            for y in x:
+                y.add_(1.0)
+            return y
+
+        # expect extra size node for dynamic
+        torch._dynamo.testing.standard_test(
+            self, fn, 1, expected_ops=20, expected_ops_dynamic=21
+        )
+
     def test_empty_list(self):
         def fn(x, ll):
             if len(ll) == 0 and not ll and ll is not None:
@@ -196,6 +472,31 @@ def fn(x, ll):
         self.assertTrue(same(r1, r2))
         self.assertTrue(same(r1, r3))
 
+    def test_min_max_over_iterable(self):
+        def get_test_fn(func):
+            def _fn(a, b, func=func):
+                # try all of list, iterator, tuple, vararg.
+                lst = [a.shape[0] + 1, 8, a.shape[0]]
+                x = func(lst)
+                y = func(iter(lst))
+                z = func(tuple(lst))
+                w = func(*lst)
+                return a + (x + y + z + w)
+
+            return _fn
+
+        # expect for dynamic:
+        # 2 * (size, getitem) ops +
+        # 1 add op +
+        # 4 * 2 min / max ops +
+        # 4 final add ops = 17
+        torch._dynamo.testing.standard_test(
+            self, get_test_fn(func=min), 2, expected_ops=1, expected_ops_dynamic=17
+        )
+        torch._dynamo.testing.standard_test(
+            self, get_test_fn(func=max), 2, expected_ops=1, expected_ops_dynamic=17
+        )
+
     def test_config_obj(self):
         class Cfg:
             def __init__(self):
@@ -448,6 +749,7 @@ def fn(a):
             self, fn=fn, nargs=1, expected_ops=5, expected_ops_dynamic=8
         )
 
+    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
     def test_tensor_item_capture(self):
         def fn(a, b):
@@ -462,6 +764,7 @@ def fn(a, b):
         self.assertEqual(cnts.frame_count, 1)
         self.assertEqual(cnts.op_count, 3)
 
+    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", False)
     def test_tensor_item_no_capture(self):
         def fn(a, b):
@@ -534,6 +837,17 @@ def fn1(a):
             self, fn=fn1, nargs=1, expected_ops=3
         )
 
+    def test_range_with_shape(self):
+        def fn(a):
+            for i in range(1, a.shape[0]):
+                a += 1
+            return a
+
+        # expect 1 more op (size call) for dynamic
+        return torch._dynamo.testing.standard_test(
+            self, fn=fn, nargs=1, expected_ops=9, expected_ops_dynamic=10
+        )
+
     def test_no_grad(self):
         def fn1(a, b):
             x = a + 1
@@ -621,6 +935,68 @@ def fn(count):
         self.assertEqual(cnts.frame_count, 0)
         self.assertEqual(cnts.op_count, 0)
 
+    def test_list_slice_mul(self):
+        def fn(count):
+            a = [1, 2, 3]
+            head_mask = count * a[1:] * count
+            return head_mask
+
+        cnts = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnts)(fn)
+        self.assertEqual(opt_fn(2), [2, 3] * 4)
+        self.assertEqual(cnts.frame_count, 0)
+        self.assertEqual(cnts.op_count, 0)
+
+    def test_tuple_mul(self):
+        def fn(count):
+            head_mask = count * (2, 3) * count
+            return head_mask
+
+        cnts = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnts)(fn)
+        self.assertEqual(opt_fn(2), (2, 3) * 4)
+        self.assertEqual(cnts.frame_count, 0)
+        self.assertEqual(cnts.op_count, 0)
+
+    def test_tuple_mul_with_shape(self):
+        def fn(a):
+            x = a.shape[0]
+            y = 2 * (x, 3) * 2
+            return a + y[4]
+
+        # expect 3 ops post folding for dynamic case: size, index, add
+        torch._dynamo.testing.standard_test(
+            self, fn, 1, expected_ops=1, expected_ops_dynamic=3
+        )
+
+    def test_tuple_iadd_with_shape(self):
+        def fn(a):
+            output = (a + a.shape[0], a - a.shape[0])
+            # tuple += tuple
+            output += (a - a.shape[0], a + a.shape[0])
+            # tuple += constant tuple
+            output += (2, 3)
+            return output
+
+        # expect 4 add / subs for static, 4 * 3 (size, index, math op) for dynamic
+        torch._dynamo.testing.standard_test(
+            self, fn, 1, expected_ops=4, expected_ops_dynamic=12
+        )
+
+    def test_list_iadd_with_shape(self):
+        def fn(a):
+            output = [a + a.shape[0], a - a.shape[0]]
+            # list += list
+            output += [a - a.shape[0], a + a.shape[0]]
+            # list += tuple
+            output += (a + a.shape[0], a - a.shape[0])
+            return output
+
+        # expect 6 add / subs for static, 6 * 3 (size, index, math op) for dynamic
+        torch._dynamo.testing.standard_test(
+            self, fn, 1, expected_ops=6, expected_ops_dynamic=18
+        )
+
     def test_user_getattr1(self):
         class MyConfig(dict):
             def __getattr__(self, name):
@@ -658,6 +1034,77 @@ def fn(cfg, x):
         self.assertEqual(cnts.frame_count, 1)
         self.assertEqual(cnts.op_count, 3)
 
+    def test_user_getattribute(self):
+        class MyObject:
+            def __init__(self):
+                self.custom_dict = {"a": torch.rand((2, 2))}
+                self.my_number = 42
+
+            def __getattribute__(self, name):
+                custom_dict = super().__getattribute__("custom_dict")
+                if name in custom_dict:
+                    return custom_dict[name]
+                return super().__getattribute__(name)
+
+            def run(self, x):
+                return self.my_number * x + self.a * x
+
+        def fn(obj, x):
+            return obj.run(x)
+
+        obj = MyObject()
+        x = torch.rand((2, 2))
+        cnts = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnts)(fn)
+        self.assertTrue(same(opt_fn(obj, x), fn(obj, x)))
+
+    def test_nn_module_getattr(self):
+        class MyMod(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.custom_dict = {"queue": [torch.rand((2, 2)) for _ in range(3)]}
+                self.other_attr = torch.rand((2, 2))
+
+            def __getattr__(self, name):
+                custom_dict = self.custom_dict
+                if name in custom_dict:
+                    return custom_dict[name]
+                return super().__getattr__(name)
+
+            def forward(self, x):
+                return x @ self.other_attr + self.queue[-1]
+
+        x = torch.rand((2, 2))
+        mod = MyMod()
+        cnts = torch._dynamo.testing.CompileCounter()
+        opt_mod = torch._dynamo.optimize(cnts)(mod)
+        self.assertTrue(same(opt_mod(x), mod(x)))
+        self.assertTrue(cnts.frame_count, 1)
+        self.assertTrue(cnts.op_count, 2)
+
+    def test_nn_module_getattribute(self):
+        class MyMod(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.my_number = 42
+
+            def __getattribute__(self, name):
+                if name == "special_attr":
+                    return torch.tensor([[1, 2], [3, 4]])
+                return super().__getattribute__(name)
+
+            def forward(self, x):
+                return self.my_number * x + self.special_attr * x
+
+        def fn(mod, x):
+            return mod(x)
+
+        mod = MyMod()
+        x = torch.rand((2, 2))
+        cnts = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnts)(fn)
+        self.assertTrue(same(opt_fn(mod, x), fn(mod, x)))
+
     def test_user_property(self):
         class MyConfig:
             @property
@@ -901,6 +1348,56 @@ def fn1(a, b, c):
 
         torch._dynamo.testing.standard_test(self, fn=fn1, nargs=3)
 
+    def test_user_defined_class_python_type(self):
+        class MyClass1:
+            pass
+
+        class ExampleMeta(type):
+            pass
+
+        class MyClass2(metaclass=ExampleMeta):
+            pass
+
+        def fn(x, c):
+            if isinstance(c, MyClass1):
+                return x + 1
+            elif isinstance(c, MyClass2):
+                return x + 2
+            else:
+                return x + 3
+
+        x = torch.rand(3)
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        for c in [MyClass1, MyClass2]:
+            ref = fn(x, c)
+            res = opt_fn(x, c)
+            self.assertTrue(same(ref, res))
+
+    def test_super_calling_with_metaclass(self):
+        class ExampleMeta(type):
+            pass
+
+        class MyClass1(metaclass=ExampleMeta):
+            @classmethod
+            def add(cls, x):
+                return x + 1
+
+        class MyClass2(MyClass1):
+            @classmethod
+            def add(cls, x):
+                torch._dynamo.graph_break()
+                return x + super().add(x)
+
+        def fn(x, obj):
+            return x + obj.add(x)
+
+        x = torch.rand(3)
+        obj = MyClass2()
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        ref = fn(x, obj)
+        res = opt_fn(x, obj)
+        self.assertTrue(same(ref, res))
+
     def test_manual_seed(self):
         def fn(a, b):
             x = a + b
@@ -1287,7 +1784,7 @@ def f(x):
         self.assertTrue(same(ref1, res1))
 
     def test_is_tensor_like2(self):
-        class MyTensor(object):
+        class MyTensor:
             @classmethod
             def __torch_function__(cls, func, types, args=(), kwargs=None):
                 if kwargs is None:
@@ -1478,6 +1975,48 @@ def fn(x):
         self.assertTrue(same(ref, res))
         self.assertEqual(cnts.frame_count, 2)
 
+    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
+    def test_cuda_stream_context_manager1(self):
+        def fn(x):
+            s = torch.cuda.Stream()
+            x = torch.mul(x, 5)
+            x = torch.add(x, 2)
+            with torch.cuda.stream(s):
+                x = torch.relu(x)
+            x = torch.add(x, 1)
+            x = torch.cos(x)
+            return x
+
+        x = torch.randn((2, 2))
+        ref = fn(x)
+        cnts = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnts, nopython=True)(fn)
+        res = opt_fn(x)
+        self.assertTrue(same(ref, res))
+        self.assertEqual(cnts.frame_count, 1)
+        self.assertEqual(cnts.op_count, 9)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
+    def test_cuda_stream_context_manager2(self):
+        def fn(x, s):
+            x = torch.mul(x, 5)
+            x = torch.add(x, 2)
+            with torch.cuda.stream(s):
+                x = torch.relu(x)
+            x = torch.add(x, 1)
+            x = torch.cos(x)
+            return x
+
+        x = torch.randn((2, 2))
+        s = torch.cuda.Stream()
+        ref = fn(x, s)
+        cnts = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnts, nopython=True)(fn)
+        res = opt_fn(x, s)
+        self.assertTrue(same(ref, res))
+        self.assertEqual(cnts.frame_count, 1)
+        self.assertEqual(cnts.op_count, 8)
+
     def test_autograd_profiler_enabled(self):
         def fn(x):
             if torch.autograd._profiler_enabled():
@@ -1536,6 +2075,33 @@ def f2(input):
         self.assertEqual(res1, 8)
         self.assertEqual(res2, 9)
 
+    def test_enum_as_dict_key(self):
+        class MyEnum(enum.Enum):
+            FOO = 10
+            BAR = 20
+
+        def fn(x):
+            y = x + 2
+            z = {
+                MyEnum.FOO: torch.tensor(1),
+                MyEnum.BAR: 10,
+                "MyEnum.BAR": torch.tensor(8),
+                5: torch.rand(3),
+            }
+            torch._dynamo.graph_break()
+            a = z[MyEnum.FOO] + z["MyEnum.BAR"]
+            b = y * 2
+            return a, b
+
+        cnts = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnts)(fn)
+        for _ in range(10):
+            x = torch.rand(3)
+            ref = fn(x)
+            res = opt_fn(x)
+            self.assertTrue(same(ref, res))
+        self.assertEqual(cnts.frame_count, 2)
+
     def test_const_dict_variable_python_type(self):
         from torch._dynamo.variables import ConstantVariable, ConstDictVariable
 
@@ -1844,7 +2410,6 @@ def foo(x):
         self.assertIs(x_ref(), None)
 
     def test_release_module_memory(self):
-
         mod = torch.nn.Linear(10, 10)
         x = torch.rand([10, 10])
         mod_weight_ref = weakref.ref(mod.weight)
@@ -1983,6 +2548,7 @@ def f(x, n):
         opt_f(x, n)
         self.assertEqual(cnts.frame_count, 1)
 
+    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
     def test_item(self):
         class MyMod(torch.nn.Module):
@@ -1996,6 +2562,7 @@ def forward(self, x):
 
         self.assertEqual(y, 11)
 
+    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
     def test_item_changes(self):
         class MyMod(torch.nn.Module):
@@ -2012,6 +2579,7 @@ def forward(self, x):
         self.assertEqual(y, 11)
         self.assertEqual(z, 61)
 
+    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
     def test_item_changes_new_shape(self):
         class MyMod(torch.nn.Module):
@@ -2187,7 +2755,6 @@ def __init__(self):
                 self.names = []
 
             def forward(self, idx, targets=None):
-
                 b, t = idx.size()
                 assert (
                     t <= self.block_size
@@ -2288,6 +2855,7 @@ def fn(x):
         res = opt_fn(x)
         self.assertEqual(ref, res)
 
+    @torch._dynamo.config.patch(raise_on_backend_change=True)
     def test_change_backends(self):
         @torch._dynamo.optimize("eager", nopython=True)
         def fn1():
@@ -2646,6 +3214,96 @@ def forward(self, x):
         self.assertEqual(exported.device.index, 0)
         self.assertEqual(exported.dtype, torch.bfloat16)
 
+    @unittest.skipIf(
+        not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater,
+        "Can't run fused SDPA on this platform",
+    )
+    def test_autocast_sdpa(self):
+        class MyModule(torch.nn.Module):
+            def forward(self, query, key, value):
+                with torch.autocast("cpu"):
+                    with torch.autocast("cuda", dtype=torch.float32):
+                        out = F.scaled_dot_product_attention(
+                            query, key, value, None, 0.5, True
+                        )
+                return out
+
+        dtype = torch.float32
+        seq_len_q = 1
+        seq_len_k = 1
+        head_dim = 8
+        query = torch.ones(
+            1, 8, seq_len_q, head_dim, device="cuda", dtype=dtype, requires_grad=True
+        )
+        key = torch.ones(
+            1, 8, seq_len_k, head_dim, device="cuda", dtype=dtype, requires_grad=True
+        )
+        value = torch.ones(
+            1, 8, seq_len_k, head_dim, device="cuda", dtype=dtype, requires_grad=True
+        )
+
+        module = MyModule()
+        real = module(query, key, value)
+        real_device = real.device
+        real_dtype = real.dtype
+
+        opt_mod = torch._dynamo.optimize("inductor")(module)
+        compiled = opt_mod(query, key, value)
+
+        self.assertEqual(compiled.device, real_device)
+        self.assertEqual(compiled.dtype, real_dtype)
+
+        self.assertEqual(compiled.device.type, "cuda")
+        self.assertEqual(compiled.device.index, 0)
+        self.assertEqual(compiled.dtype, torch.float32)
+
+    @unittest.skipIf(
+        not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater,
+        "Can't run fused SDPA on this platform",
+    )
+    def test_parsing_sdpa(self):
+        class MyModule(torch.nn.Module):
+            def forward(self, query, key, value):
+                out = F.scaled_dot_product_attention(query, key, value, None, 0, True)
+                out = F.scaled_dot_product_attention(
+                    query=query,
+                    key=key,
+                    value=value,
+                    attn_mask=None,
+                    dropout_p=0,
+                    is_causal=True,
+                )
+                out = F.scaled_dot_product_attention(
+                    query,
+                    key=key,
+                    value=value,
+                    attn_mask=None,
+                    dropout_p=0,
+                    is_causal=True,
+                )
+                out = F.scaled_dot_product_attention(
+                    query, key, value, None, dropout_p=0, is_causal=True
+                )
+                return out
+
+        device = "cuda"
+        dtype = torch.float16
+        seq_len_q = 1
+        seq_len_k = 1
+        head_dim = 8
+        query = torch.ones(
+            1, 8, seq_len_q, head_dim, device=device, dtype=dtype, requires_grad=True
+        )
+        key = torch.ones(
+            1, 8, seq_len_k, head_dim, device=device, dtype=dtype, requires_grad=True
+        )
+        value = torch.ones(
+            1, 8, seq_len_k, head_dim, device=device, dtype=dtype, requires_grad=True
+        )
+        module = MyModule()
+        opt_mod = torch._dynamo.optimize("inductor")(module)
+        opt_mod(query, key, value)
+
     def test_autocast_cpu(self):
         class MyModule(torch.nn.Module):
             def forward(self, x):
@@ -2671,15 +3329,60 @@ def forward(self, x):
         self.assertEqual(exported.device.type, "cpu")
         self.assertEqual(exported.dtype, torch.bfloat16)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
-    def test_autocast_float64(self):
+    def test_autocast_cpu_graph_break(self):
         class MyModule(torch.nn.Module):
             def forward(self, x):
-                a_float32 = torch.rand((8, 8), device="cuda")
-                b_float32 = torch.rand((8, 8), device="cuda")
-                d_float32 = torch.rand((8, 8), device="cuda")
-
-                with torch.autocast(device_type="cuda", dtype=torch.float64):
+                a_float32 = torch.rand((8, 8), device="cpu")
+                b_float32 = torch.rand((8, 8), device="cpu")
+                torch._dynamo.graph_break()
+                d_float32 = torch.rand((8, 8), device="cpu")
+
+                with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
+                    e_float16 = torch.mm(a_float32, b_float32)
+                    torch._dynamo.graph_break()
+                    f_float16 = torch.mm(d_float32, e_float16)
+                return f_float16
+
+        module = MyModule()
+        real = module(torch.tensor([0.5]))
+        real_device = real.device
+        real_dtype = real.dtype
+
+        opt = torch._dynamo.optimize("eager")(module)
+        res = opt(torch.tensor([0.5]))
+        self.assertEqual(res.device, real_device)
+        self.assertEqual(res.dtype, real_dtype)
+
+        self.assertEqual(res.device.type, "cpu")
+        self.assertEqual(res.dtype, torch.bfloat16)
+
+    def test_autocast_cpu_graph_break_2(self):
+        # Regression for: https://github.com/pytorch/pytorch/issues/93890
+        def fn(x):
+            with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
+                x = torch.mm(x, x)
+                torch._dynamo.graph_break()
+                x = torch.relu(x)
+            return x
+
+        x = torch.rand([4, 4])
+        self.assertEqual(x.dtype, torch.float32)
+        res = fn(x)
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        opt_res = opt_fn(x)
+        self.assertTrue(torch.allclose(res, opt_res))
+        self.assertEqual(res.dtype, torch.bfloat16)
+        self.assertEqual(opt_res.dtype, torch.bfloat16)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
+    def test_autocast_float64(self):
+        class MyModule(torch.nn.Module):
+            def forward(self, x):
+                a_float32 = torch.rand((8, 8), device="cuda")
+                b_float32 = torch.rand((8, 8), device="cuda")
+                d_float32 = torch.rand((8, 8), device="cuda")
+
+                with torch.autocast(device_type="cuda", dtype=torch.float64):
                     e_float64 = torch.mm(a_float32, b_float32)
                     f_float64 = torch.mm(d_float32, e_float64)
                 return f_float64
@@ -2990,7 +3693,7 @@ def func(x, y):
     def test_if_cond_nn_mod(self):
         class MockModule(torch.nn.Module):
             def __init__(self, output_relu=True):
-                super(MockModule, self).__init__()
+                super().__init__()
                 self.relu = torch.nn.ReLU() if output_relu else None
 
             def forward(self, x):
@@ -3015,6 +3718,94 @@ def forward(self, x):
         res = opt_model(x)
         self.assertTrue(same(ref, res))
 
+    def test_if_cond_user_defined_object(self):
+        # obj.__bool__ is not existed
+        class A:  # noqa: B903
+            def __init__(self, x):
+                self.x = x
+
+        # obj.__bool__ is function and returns bool type
+        class B:
+            def __init__(self, x):
+                self.x = x
+
+            def __bool__(self):
+                return self.x > 0
+
+        # obj.__bool__ is non-function
+        class C:
+            def __init__(self, x):
+                self.x = x
+                self.__bool__ = False
+
+        def fn(x, obj):
+            if not obj:
+                return x + 1
+            else:
+                return x - 1
+
+        x = torch.rand(4)
+        cnts = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnts, nopython=True)(fn)
+        obj1 = A(0.5)
+        obj2 = B(0.5)
+        obj3 = B(-0.5)
+        obj4 = C(0.5)
+        for obj in [obj1, obj2, obj3, obj4, obj3, obj2]:
+            ref = fn(x, obj)
+            res = opt_fn(x, obj)
+            self.assertTrue(same(ref, res))
+        self.assertEqual(cnts.frame_count, 4)
+
+    def test_if_cond_user_defined_object2(self):
+        # obj.__bool__ is function and returns non-bool type
+        class MyObj:
+            def __init__(self, x):
+                self.x = x
+
+            def __bool__(self):
+                self.x = 1
+                return self.x
+
+        def fn(a, obj):
+            if not obj:
+                return a + obj.x
+            else:
+                return a - obj.x
+
+        x = torch.rand(4)
+        obj = MyObj(0.5)
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        try:
+            opt_fn(x, obj)
+            self.assertFalse(True)
+        except TypeError as e:
+            self.assertIn("__bool__ should return bool, returned int", str(e))
+
+    def test_class_has_instancecheck_method(self):
+        class A:
+            pass
+
+        class ExampleMeta(type):
+            def __instancecheck__(cls, instance):
+                return True
+
+        class B(metaclass=ExampleMeta):
+            pass
+
+        def fn(x, obj):
+            if isinstance(obj, B):
+                return x + 1
+            else:
+                return x - 1
+
+        x = torch.rand(4)
+        obj = A()
+        ref = fn(x, obj)
+        opt_fn = torch._dynamo.optimize("eager", nopython=True)(fn)
+        res = opt_fn(x, obj)
+        self.assertTrue(same(ref, res))
+
     def test_torch_cuda_is_available(self):
         def fn(x):
             if torch.cuda.is_available():
@@ -3084,7 +3875,6 @@ def fn(x, y):
         self.assertTrue(same(ref, res))
 
     def test_disable_flag(self):
-
         cnt = torch._dynamo.testing.CompileCounter()
 
         with patch.dict(os.environ, {"TORCH_COMPILE_DISABLE": "1"}):
@@ -3137,6 +3927,42 @@ def guard_failures(failure):
         self.assertTrue(guard_failure is not None)
         self.assertEqual(guard_failure[0], "k == 3")
 
+    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    def test_guard_failure_fn_shape_control(self):
+        def fn(x, y):
+            if x.shape[0] < 3:
+                if y.shape[0] < 3:
+                    return x * y
+                else:
+                    return x + y
+            else:
+                return -1
+
+        x = torch.randn([2, 2])
+        y = torch.randn([2, 2])
+
+        guard_failure = None
+
+        def guard_failures(failure):
+            nonlocal guard_failure
+            guard_failure = failure
+
+        opt_fn = torch._dynamo.optimize(
+            "eager", nopython=True, guard_fail_fn=guard_failures
+        )(fn)
+
+        x2 = torch.randn([5, 5])
+        y2 = torch.randn([5, 5])
+
+        opt_fn(x, y)
+        opt_fn(x2, y2)
+
+        self.assertTrue(guard_failure is not None)
+        if torch._dynamo.config.assume_static_by_default:
+            self.assertEqual(guard_failure[0], "x.size()[0] == 2")
+        else:
+            self.assertEqual(guard_failure[0], "x.size()[0] < 3")
+
     def test_guard_failure_fn2(self):
         def fn(x, y):
             x = x + 1
@@ -3163,7 +3989,13 @@ def guard_failures(failure):
         opt_fn(x2, y2)
 
         if torch._dynamo.config.dynamic_shapes:
-            self.assertTrue(guard_failure is None)
+            if torch._dynamo.config.assume_static_by_default:
+                self.assertEqual(
+                    guard_failure[0],
+                    "x.size()[0] == 2",
+                )
+            else:
+                self.assertTrue(guard_failure is None)
         else:
             self.assertTrue(guard_failure is not None)
             self.assertEqual(
@@ -3171,6 +4003,36 @@ def guard_failures(failure):
                 "tensor 'x' size mismatch at index 0. expected 2, actual 3",
             )
 
+    def test_guard_failure_fn_tensor_iter(self):
+        def fn(x):
+            for y in x:
+                y.add_(1.0)
+            return y
+
+        guard_failure = None
+
+        def guard_failures(failure):
+            nonlocal guard_failure
+            guard_failure = failure
+
+        opt_fn = torch._dynamo.optimize(
+            "eager", nopython=True, guard_fail_fn=guard_failures
+        )(fn)
+
+        args1 = torch.randn(10, 10)
+        out = fn(args1)
+        opt_out = opt_fn(args1)
+        self.assertTrue(same(out, opt_out))
+
+        args2 = torch.randn(9, 10)
+        out = fn(args2)
+        opt_out = opt_fn(args2)
+        self.assertTrue(same(out, opt_out))
+
+        # guard is expected for both static and dynamic shapes
+        self.assertTrue(guard_failure is not None)
+        self.assertEqual(guard_failure[0], "len(x) == 10")
+
     def test_restore_graphstate(self):
         # This function does some guard accumulation,
         # and then rolls back due to control flow.
@@ -3223,7 +4085,11 @@ def fn(x, y):
         )
         # Dummy ctor
         graph = OutputGraph(
-            f_globals={}, code_options={}, compiler_fn=None, root_tx=None
+            f_globals={},
+            code_options={},
+            compiler_fn=None,
+            root_tx=None,
+            export=False,
         )
         # Contrived property so as not to have it be None
         graph.nn_modules = {}
@@ -3257,7 +4123,7 @@ def fn(x, y):
         self.assertEqual(graph.tracing_context.guards_context.dynamo_guards, guards)
 
     def test_call_parent_non_class_methods_from_child(self):
-        class A(object):
+        class A:
             def add(self, x):
                 return x + 10
 
@@ -3329,6 +4195,486 @@ def test_torch_package_working_with_trace(self):
 
         optimized_loaded_model = torch._dynamo.optimize("eager")(loaded_model)(*inputs)
 
+    def test_shape_and_tuple_equality(self):
+        def fn(x, y, t):
+            z = x * y
+            if x.size() == t:
+                return z.cos()
+            return z.sin()
+
+        torch._dynamo.optimize("eager", nopython=True)(fn)(
+            torch.randn([4, 4]), torch.randn([4, 4]), (4, 4)
+        )
+
+    # specifically test for tensor.attribute -> torch.something()
+    def test_real_imag_tensor_attribute(self):
+        def fn(x, y):
+            a = x.real
+            b = x.imag
+            return torch.mul(torch.add(a, y), b)
+
+        x_real = torch.rand((4, 4))
+        x_imag = torch.rand((4, 4))
+        x = torch.complex(x_real, x_imag)
+        y = torch.rand((4, 4))
+
+        ref = fn(x, y)
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        res = opt_fn(x, y)
+        self.assertTrue(same(ref, res))
+
+    def test_T_tensor_attribute(self):
+        def fn(x, y):
+            a = x.T
+            return torch.add(a, y)
+
+        x = torch.rand((4, 4))
+        y = torch.rand((4, 4))
+
+        ref = fn(x, y)
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        res = opt_fn(x, y)
+        self.assertTrue(same(ref, res))
+
+    def test_recursive_tensor_attribute(self):
+        def fn(x, y):
+            a = x.real.T
+            b = x.imag
+            return torch.mul(torch.add(a, y), b)
+
+        x_real = torch.rand((4, 4))
+        x_imag = torch.rand((4, 4))
+        x = torch.complex(x_real, x_imag)
+        y = torch.rand((4, 4))
+
+        ref = fn(x, y)
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        res = opt_fn(x, y)
+        self.assertTrue(same(ref, res))
+
+    def test_tagging_tensors_simple(self):
+        def foo(x, y):
+            return x * y, x, y
+
+        a = torch.randn([3, 3])
+        a.tag = "a"
+        a.frog = "ribbity ribbit"
+        b = torch.randn([3, 3])
+        b.tag = "b"
+        b.frog = "ribbit"
+
+        exported = torch._dynamo.export(foo, a, b)
+        out_graph = exported[0]
+
+        nodes = list(out_graph.graph.nodes)
+        placeholders = [node for node in nodes if node.op == "placeholder"]
+        all_tags = []
+        all_frogs = []
+        for placeholder in placeholders:
+            if "tensor_dict" in placeholder.meta:
+                all_tags.append(placeholder.meta["tensor_dict"]["tag"])
+                all_frogs.append(placeholder.meta["tensor_dict"]["frog"])
+
+        self.assertEqual(all_tags, ["a", "b"])
+        self.assertEqual(all_frogs, ["ribbity ribbit", "ribbit"])
+
+    def test_tagging_tensors_mix_used_unused_structure(self):
+        def pre_attention_state_ops(input, mems, state):
+            lc_key = state[0]
+            lc_val = state[1]
+            bar = []
+            for i in range(0, 4):
+                bar2 = []
+                for j in range(0, 3):
+                    bar2.append(
+                        lc_key + lc_val + torch.tensor([0.1, 0.25, 0.4, 0.5, 0.1])
+                    )
+                bar.append(bar2)
+
+            return bar
+
+        mems = torch.tensor([[[1.8364, 0.2724, -1.4917, -0.4367, 0.8640]]])
+        state = [
+            torch.tensor([[[1.0517, 0.3848, -0.6472, 0.0823, 0.9116]]]),
+            torch.tensor([[[1.0517, 0.3848, -0.6472, 0.0823, 0.9116]]]),
+        ]
+        i = torch.tensor(
+            [
+                [0.0313, -0.1487, -0.3846, -0.5321],
+                [-1.7073, 1.3331, -0.0890, -1.4935],
+                [-0.8314, -0.1862, -0.5935, 1.5232],
+            ]
+        )
+
+        mems.tag = "MEMS"
+        i.tag = "FOO"
+        state[0].tag = "STATE_0"
+        state[1].tag = "HMMM"
+
+        exported = torch._dynamo.export(pre_attention_state_ops, i, mems, state)
+        out_graph = exported[0]
+
+        nodes = list(out_graph.graph.nodes)
+        placeholders = [node for node in nodes if node.op == "placeholder"]
+        all_tags = []
+        for placeholder in placeholders:
+            if "tensor_dict" in placeholder.meta:
+                all_tags.append(placeholder.meta["tensor_dict"]["tag"])
+
+        self.assertEqual(all_tags, ["STATE_0", "HMMM"])
+
+    def test_get_custom_tensor_attribute(self):
+        def fn(x):
+            return x.custom_attr * x
+
+        x = torch.rand((2, 2))
+        x.custom_attr = 3.14
+        ref = fn(x)
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        res = opt_fn(x)
+        self.assertTrue(same(ref, res))
+
+    def test_set_custom_tensor_attribute(self):
+        def fn(x):
+            x.custom_attr = 3.14
+            return x.custom_attr * x
+
+        x = torch.rand((2, 2))
+        ref = fn(x)
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        res = opt_fn(x)
+        self.assertTrue(same(ref, res))
+
+    @unittest.skipIf(sys.version_info < (3, 11), "requires Python 3.11+")
+    def test_py311_jump_offset(self):
+        new_inst = bytecode_transformation.create_instruction
+        consts = (None, 1, 2, 3, 4)
+
+        def create_test_code(jump_opname, target_idx):
+            targets = [
+                new_inst("LOAD_CONST", 1),
+                new_inst("LOAD_CONST", 3),
+            ]
+            jump_to_target_inst = new_inst(jump_opname, target=targets[target_idx])
+            """
+            pseudocode of generated bytecode:
+            def test_py311_fn():
+                goto target1
+            target0:
+                return 1
+            target1:
+                goto [target0/target2] (via fwd or bwd jump)
+                return 2
+            target2:
+                return 3
+                return 4
+            """
+            # test with LOAD_GLOBAL since it has a different instruction size
+            insts = [
+                new_inst("RESUME", 0),
+                new_inst("JUMP_FORWARD", target=jump_to_target_inst),
+                targets[0],
+                new_inst("LOAD_GLOBAL", argval="print"),
+                new_inst("POP_TOP"),
+                new_inst("RETURN_VALUE"),
+                jump_to_target_inst,
+                new_inst("LOAD_CONST", 2),
+                new_inst("LOAD_GLOBAL", argval="print"),
+                new_inst("POP_TOP"),
+                new_inst("RETURN_VALUE"),
+                targets[1],
+                new_inst("RETURN_VALUE"),
+                new_inst("LOAD_CONST", 4),
+                new_inst("RETURN_VALUE"),
+            ]
+            code_options = collections.OrderedDict(
+                [
+                    ("co_argcount", 0),
+                    ("co_posonlyargcount", 0),
+                    ("co_kwonlyargcount", 0),
+                    ("co_nlocals", 0),
+                    ("co_stacksize", 2),
+                    ("co_flags", 3),
+                    ("co_code", b""),
+                    ("co_consts", consts),
+                    ("co_names", ("print",)),
+                    ("co_varnames", ()),
+                    ("co_filename", __file__),
+                    ("co_name", "test_py311_fn"),
+                    ("co_qualname", "test_py311_fn"),
+                    ("co_firstlineno", 1),
+                    ("co_linetable", b""),
+                    ("co_exceptiontable", b""),
+                    ("co_freevars", ()),
+                    ("co_cellvars", ()),
+                ]
+            )
+            return bytecode_transformation.clean_and_assemble_instructions(
+                insts,
+                list(code_options.keys()),
+                code_options,
+            )
+
+        # format: jump_opname, target_idx, expected forward jump, expected return value
+        test_args = (
+            ("JUMP_FORWARD", 0, False, 1),
+            ("JUMP_FORWARD", 1, True, 3),
+            ("JUMP_BACKWARD", 0, False, 1),
+            ("JUMP_BACKWARD", 1, True, 3),
+        )
+
+        for test in test_args:
+            insts, code = create_test_code(test[0], test[1])
+            # check if offset of latest jump instruction is forward/backward
+            for inst in reversed(insts):
+                if inst.opname.startswith("JUMP"):
+                    if test[2]:
+                        self.assertIn("FORWARD", inst.opname)
+                    else:
+                        self.assertIn("BACKWARD", inst.opname)
+                    break
+            # run the code and check result
+
+            def dummy_fn():
+                pass
+
+            dummy_fn.__code__ = code
+            self.assertEqual(dummy_fn(), test[3])
+
+            # TODO should also pass the code object back into dynamo again, but
+            # dynamo is not enabled for Python 3.11 yet.
+
+    @torch._dynamo.config.patch(dynamic_shapes=True)
+    def test_raise_guard_full_constraint(self):
+        y = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(x):
+            if x.shape[0] == 3:
+                return x.sin()
+            return x.cos()
+
+        torch._dynamo.optimize("eager")(my_dyn_fn)(y)
+        torch._dynamo.mark_dynamic(y, 0)
+
+        torch._dynamo.reset()
+        with self.assertRaises(
+            torch._dynamo.exc.InternalTorchDynamoError,
+        ):
+            torch._dynamo.optimize("eager")(my_dyn_fn)(y)
+
+    @torch._dynamo.config.patch(dynamic_shapes=True)
+    def test_no_raise_guard_partial_constraint(self):
+        y = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(x):
+            if x.shape[0] > 3:
+                return x.sin()
+            return x.cos()
+
+        torch._dynamo.optimize("eager")(my_dyn_fn)(y)
+        torch._dynamo.mark_dynamic(y, 0)
+        torch._dynamo.reset()
+        torch._dynamo.optimize("eager")(my_dyn_fn)(y)
+
+    @torch._dynamo.config.patch(dynamic_shapes=True)
+    def test_no_raise_guard_partial_constraint_across_break(self):
+        y = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(x, y):
+            z = x * y
+
+            torch._dynamo.graph_break()
+            if z.shape[0] > 2:
+                return z.cos()
+
+            return x.cos()
+
+        torch._dynamo.optimize("eager")(my_dyn_fn)(y, y)
+        torch._dynamo.mark_dynamic(y, 0)
+        torch._dynamo.reset()
+        torch._dynamo.optimize("eager")(my_dyn_fn)(y, y)
+
+    # Sadly, this does not throw - we do not prop correctly across the graph break
+    @unittest.expectedFailure
+    @torch._dynamo.config.patch(dynamic_shapes=True)
+    def test_raise_guard_partial_constraint_across_break(self):
+        y = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(x, y):
+            z = x * y
+
+            torch._dynamo.graph_break()
+            if z.shape[0] == 3:
+                return z.cos()
+
+            return x.cos()
+
+        torch._dynamo.optimize("eager")(my_dyn_fn)(y, y)
+        torch._dynamo.mark_dynamic(y, 0)
+        torch._dynamo.reset()
+        with self.assertRaisesRegex(
+            Exception,
+        ):
+            torch._dynamo.optimize("eager")(my_dyn_fn)(y, y)
+
+    @torch._dynamo.config.patch(dynamic_shapes=True)
+    def test_raise_guard_partial_constraint_no_graph_break(self):
+        y = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(x, y):
+            z = x * y
+
+            if z.shape[0] == 3:
+                return z.cos()
+
+            return x.cos()
+
+        torch._dynamo.optimize("eager")(my_dyn_fn)(y, y)
+        torch._dynamo.mark_dynamic(y, 0)
+        torch._dynamo.reset()
+        with self.assertRaises(
+            torch._dynamo.exc.InternalTorchDynamoError,
+        ):
+            torch._dynamo.optimize("eager")(my_dyn_fn)(y, y)
+
+    def test_cannot_trace_mark_dynamic(self):
+        y = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(x):
+            torch._dynamo.mark_dynamic(x, 0)
+            return x * x
+
+        with self.assertRaisesRegex(
+            AssertionError, "Attempt to trace forbidden callable"
+        ):
+            torch._dynamo.optimize("eager")(my_dyn_fn)(y)
+
+    def test_cannot_trace_mark_dynamic_safe_unreached(self):
+        y = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(x):
+            if x.shape[0] == 3:
+                return x
+            print("Running", torch._dynamo.mark_dynamic(x, 0))
+            return x * x
+
+        torch._dynamo.optimize("eager")(my_dyn_fn)(y)
+
+    @torch._dynamo.config.patch(dynamic_shapes=False)
+    def test_no_dynamic_shapes_mark_dynamic_illegal(self):
+        y = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(x):
+            if x.shape[0] > 3:
+                return x.sin()
+            return x.cos()
+
+        torch._dynamo.optimize("eager")(my_dyn_fn)(y)
+        torch._dynamo.mark_dynamic(y, 0)
+        torch._dynamo.reset()
+        with self.assertRaisesRegex(
+            AssertionError,
+            "mark_dynamic usage with dynamic_shapes=False is not yet supported",
+        ):
+            torch._dynamo.optimize("eager")(my_dyn_fn)(y)
+
+    @torch._dynamo.config.patch(dynamic_shapes=False)
+    def test_parameter_mark_dynamic_illegal(self):
+        y = torch.nn.Parameter(torch.tensor([0.25, 0.25]))
+        x = torch.tensor([0.5, 0.5])
+
+        class encoder(torch.nn.Module):
+            def __init__(self, y):
+                super().__init__()
+                self.register_parameter("param", y)
+
+            @torch._dynamo.disable
+            def helper(self, x, y):
+                return x * y
+
+            def forward(self, a, *args):
+                x = a + a
+                return self.helper(x, self.param)
+
+        e = encoder(y)
+        torch._dynamo.optimize("eager")(e)(x)
+        torch._dynamo.mark_dynamic(y, 0)
+        torch._dynamo.reset()
+        e = encoder(y)
+        with self.assertRaisesRegex(
+            AssertionError,
+            "mark_dynamic on parameter, parameters are always static today",
+        ):
+            torch._dynamo.optimize("eager")(e)(x)
+
+    @torch._dynamo.config.patch(dynamic_shapes=True)
+    def test_py_guards_mark_dynamic(self):
+        x = torch.randn([3, 3, 3])
+
+        def my_dyn_fn(a):
+            if a.shape[0] > 2:
+                return a.cos()
+            return a.sin()
+
+        torch._dynamo.mark_dynamic(x, 0)
+        counter = CompileCounter()
+        # Run with dynamic
+        torch._dynamo.optimize(counter)(my_dyn_fn)(x)
+        self.assertEqual(counter.frame_count, 1)
+        delattr(x, "_dynamo_dynamic_indices")
+
+        torch._dynamo.optimize(counter)(my_dyn_fn)(x)
+        # Run without dynamic, no recompile
+        self.assertEqual(counter.frame_count, 1)
+
+        # Mark a new dim, 1, as dynamic
+        torch._dynamo.mark_dynamic(x, 1)
+        torch._dynamo.optimize(counter)(my_dyn_fn)(x)
+        # Recompile triggered because we marked a new dym as dynamic
+        self.assertEqual(counter.frame_count, 2)
+
+        # Mark an existing dim, 1, as dynamic
+        torch._dynamo.mark_dynamic(x, 1)
+        torch._dynamo.optimize(counter)(my_dyn_fn)(x)
+        # No Recompile triggered because we marked an existing dym as dynamic
+        self.assertEqual(counter.frame_count, 2)
+
+        # Reset
+        torch._dynamo.reset()
+        # Reset counter
+        counter = CompileCounter()
+        # Clear dynamic
+        delattr(x, "_dynamo_dynamic_indices")
+
+        # Run with dynamic 1
+        torch._dynamo.mark_dynamic(x, 1)
+        torch._dynamo.optimize(counter)(my_dyn_fn)(x)
+        self.assertEqual(counter.frame_count, 1)
+
+        # Clear dynamic
+        delattr(x, "_dynamo_dynamic_indices")
+        # Run with dynamic 0, not subset
+        torch._dynamo.mark_dynamic(x, 0)
+        torch._dynamo.optimize(counter)(my_dyn_fn)(x)
+        self.assertEqual(counter.frame_count, 2)
+
+        # Clear dynamic
+        delattr(x, "_dynamo_dynamic_indices")
+        # Run with dynamic 0, 1, 2, not subset
+        torch._dynamo.mark_dynamic(x, 0)
+        torch._dynamo.mark_dynamic(x, 1)
+        torch._dynamo.mark_dynamic(x, 2)
+        torch._dynamo.optimize(counter)(my_dyn_fn)(x)
+        self.assertEqual(counter.frame_count, 3)
+
+        # Clear dynamic
+        delattr(x, "_dynamo_dynamic_indices")
+        # Run with dynamic 0, 2, subset!
+        torch._dynamo.mark_dynamic(x, 2)
+        torch._dynamo.mark_dynamic(x, 0)
+        torch._dynamo.optimize(counter)(my_dyn_fn)(x)
+        self.assertEqual(counter.frame_count, 3)
+
 
 class CustomFunc1(torch.autograd.Function):
     @staticmethod
@@ -3368,9 +4714,6 @@ def backward(ctx, grad_output):
 
 
 class Module1(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
     def forward(self, foo):
         return CustomFunc1().apply(foo)
 
@@ -3385,9 +4728,6 @@ def forward(self, foo):
 
 
 class Module3(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
     def forward(self, foo):
         return CustomFunc2().apply(foo)
 
@@ -3402,9 +4742,6 @@ def forward(self, foo):
 
 
 class Module5(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
     def forward(self, foo):
         return CustomFunc3().apply(foo)
 
@@ -3423,7 +4760,7 @@ def test_jit_save(self):
         def fn():
             class Foo(torch.nn.Module):
                 def __init__(self):
-                    super(Foo, self).__init__()
+                    super().__init__()
                     self.a = 3
 
                 @torch.jit.export
diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index 733a189af8ba..828f9a15cb46 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -1,6 +1,8 @@
 # Owner(s): ["module: dynamo"]
 
+import types
 from copy import deepcopy
+from typing import Tuple
 from unittest.mock import patch
 
 import torch
@@ -145,6 +147,21 @@ def forward(self, x):
         return 1 + self.mod(x * 1.5)
 
 
+class ModuleWithStaticForward(torch.nn.Module):
+    @staticmethod
+    def forward(x):
+        return x * torch.sigmoid(x)
+
+
+class ModuleCallModuleWithStaticForward(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.mod = ModuleWithStaticForward()
+
+    def forward(self, x):
+        return self.mod(x)
+
+
 class ModuleStaticMethodCall(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -588,9 +605,6 @@ def forward(self, x):
 
 
 class ModuleAttributePrecedenceBase(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
     def linear(self, x):
         return x * 2.0
 
@@ -668,6 +682,15 @@ def forward(self, x):
         return x * self.scale
 
 
+class ModulePatch1(torch.nn.Module):
+    pass
+
+
+class ModulePatch2(torch.nn.Module):
+    def forward(self, x):
+        return x - 1
+
+
 def make_test(fn, expected_ops=None):
     def test_fn(self):
         return torch._dynamo.testing.standard_test(
@@ -686,6 +709,9 @@ class NNModuleTests(torch._dynamo.test_case.TestCase):
     test_submodules2 = make_test(SubmoduleExample())
     test_modulemethod1 = make_test(ModuleMethodCall())
     test_modulemethod2 = make_test(ModuleMethodCall())
+    test_module_call_module_with_static_forward = make_test(
+        ModuleCallModuleWithStaticForward()
+    )
     test_module_static_method = make_test(ModuleStaticMethodCall())
     test_fnmember = make_test(FnMember())
     test_fnmembercmp1 = make_test(FnMemberCmp(F.relu))
@@ -973,7 +999,7 @@ def test_torch_static():
     def test_call_fn_with_non_const_inputs_safe(self):
         class ModuleSpecialFwd(torch.nn.Module):
             def __init__(self):
-                super(ModuleSpecialFwd, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(
                     in_channels=3, out_channels=20, kernel_size=(5, 5)
                 )
@@ -1125,6 +1151,132 @@ def forward(self, x):
         # There will be a graph break for the inner mod being OptimizedModule
         self.assertEqual(cnt.frame_count, 2)
 
+    def test_module_patch(self):
+        mod = ModulePatch1()
+        mod.forward = types.MethodType(ModulePatch2.forward, mod)
+
+        def fn(x):
+            return mod(x)
+
+        self.assertTrue(
+            torch.allclose(
+                torch._dynamo.optimize("eager", nopython=True)(fn)(torch.ones(10)),
+                torch.zeros(1),
+            )
+        )
+
+    def test_hooks_outer(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return 2 * x + 1
+
+        m = TestModule()
+
+        def forward_hook(
+            module: torch.nn.Module, inputs: Tuple[torch.Tensor], output: torch.Tensor
+        ) -> torch.Tensor:
+            return 2 * output + 1
+
+        handle = m.register_forward_hook(forward_hook)
+        inp = torch.tensor(1.0, requires_grad=True)
+
+        failure_reason = None
+
+        def guard_fail_fn(failure):
+            nonlocal failure_reason
+            failure_reason = failure[0]
+
+        compiled_m = torch._dynamo.optimize(
+            guard_fail_fn=guard_fail_fn, backend="eager"
+        )(m)
+
+        self.assertEqual(compiled_m(inp), m(inp))
+        self.assertEqual(compiled_m(inp).item(), 7)
+        self.assertTrue(failure_reason is None)
+
+        # what if we remove our hook? we should recompile?
+        handle.remove()
+        self.assertEqual(compiled_m(inp), m(inp))
+        self.assertEqual(compiled_m(inp).item(), 3)
+        # self.assertTrue(failure_reason == "hook")
+
+        """
+        Summary:
+          - removing a hook doesn't fail a guard, becuase we weren't compiling the hook
+            (at least into the same graph) as forward in the first place! We do correctly
+            omit calling the removed hook, but since this hook is a post forward hook,
+            the 'RETURN' from forward is breaking the graph.
+
+            Why is 'forward' the entrypoint to an InstructionTranslator, after I changed
+            the eval_frame entrypoint to Module.__call__?
+        """
+
+    def test_hooks_inner(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return 2 * x + 1
+
+        m = TestModule()
+
+        def forward_hook(
+            module: torch.nn.Module, inputs: Tuple[torch.Tensor], output: torch.Tensor
+        ) -> torch.Tensor:
+            return 2 * output + 1
+
+        handle = m.register_forward_hook(forward_hook)
+
+        def outer_func(tensor):
+            x = tensor * 2 + 1
+            y = m(x)
+            return y
+
+        inp = torch.tensor(1.0, requires_grad=True)
+
+        failure_reason = None
+
+        def guard_fail_fn(failure):
+            nonlocal failure_reason
+            failure_reason = failure[0]
+
+        cc = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
+        compiled_func = torch._dynamo.optimize(
+            guard_fail_fn=guard_fail_fn,
+            backend=cc,
+        )(outer_func)
+
+        self.assertEqual(compiled_func(inp), outer_func(inp))
+        self.assertEqual(compiled_func(inp).item(), 15)
+
+        # We are compiling 1 big graph for all 3 functions including the hook.
+        self.assertEqual(cc.frame_count, 1)
+        self.assertEqual(cc.op_count, 6)
+
+        # If we remove the hook, we should recompile
+        handle.remove()
+        self.assertEqual(compiled_func(inp), outer_func(inp))
+        self.assertEqual(compiled_func(inp).item(), 7)
+        self.assertTrue("forward_hooks.keys" in failure_reason)
+        self.assertEqual(cc.frame_count, 1 + 1)
+        self.assertEqual(cc.op_count, 6 + 4)
+
+        # what if instead of removing, we alter our hook?
+        torch._dynamo.reset()
+        m = TestModule()
+        handle = m.register_forward_hook(forward_hook)
+        failure_reason = None
+        self.assertEqual(compiled_func(inp), outer_func(inp))
+        self.assertEqual(compiled_func(inp).item(), 15)
+
+        def new_forward_hook(
+            module: torch.nn.Module, inputs: Tuple[torch.Tensor], output: torch.Tensor
+        ) -> torch.Tensor:
+            return 2 * output + 2
+
+        m._forward_hooks[handle.id] = new_forward_hook
+        self.assertEqual(compiled_func(inp), outer_func(inp))
+        self.assertEqual(compiled_func(inp).item(), 16)
+        self.assertTrue("check_obj_id(m._forward_hooks" in failure_reason)
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/test/dynamo/test_optimizers.py b/test/dynamo/test_optimizers.py
index 4ff26ddeeb42..62c33345a6aa 100644
--- a/test/dynamo/test_optimizers.py
+++ b/test/dynamo/test_optimizers.py
@@ -48,20 +48,20 @@ class OptimizerTests(torch._dynamo.test_case.TestCase):
     # furthermore, the break is inside a for loop, so we bail on the frame
     # entirely.  This is basically an xfail; if the frame count goes up
     # you done good
-    test_radam = make_test(torch.optim.RAdam, exp_graph_count=0)
+    test_radam = torch._dynamo.testing.skip_if_pytest(
+        make_test(torch.optim.RAdam, exp_graph_count=0)
+    )
 
 
 # exclude SparseAdam because other areas of the stack don't support it yet
 # the others are handled specially above
-exclude = set(
-    [
-        "SGD",  # Handled above
-        "Optimizer",
-        "SparseAdam",  # Unsupported
-        "LBFGS",  # Unsupported
-        "RAdam",  # Has data dependent control for rectification (needs symint)
-    ]
-)
+exclude = {
+    "SGD",  # Handled above
+    "Optimizer",
+    "SparseAdam",  # Unsupported
+    "LBFGS",  # Unsupported
+    "RAdam",  # Has data dependent control for rectification (needs symint)
+}
 
 optimizers = [
     opt
@@ -80,9 +80,6 @@ class End2EndTests(torch._dynamo.test_case.TestCase):
     # https://github.com/pytorch/torchdynamo/issues/1604
     def test_optimizing_over_tensor_with_requires_grad(self):
         class Net(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 z = torch.bmm(x, y)
                 z = torch.flatten(z, 1)
diff --git a/test/dynamo/test_recompile_ux.py b/test/dynamo/test_recompile_ux.py
index b39bea3ce932..46520e0b68aa 100644
--- a/test/dynamo/test_recompile_ux.py
+++ b/test/dynamo/test_recompile_ux.py
@@ -18,9 +18,7 @@ class RecompileUxTests(torch._dynamo.test_case.TestCase):
     def setUpClass(cls):
         super().setUpClass()
         cls._exit_stack.enter_context(
-            unittest.mock.patch.object(
-                torch._dynamo.config, "cache_size_limit", cls.cache_limit
-            )
+            torch._dynamo.config.patch("cache_size_limit", cls.cache_limit)
         )
 
     def test_drop_cache_on_skip(self):
@@ -83,9 +81,7 @@ def model(input):
 
         expected_recompiles = 2
         compile_counter = torch._dynamo.testing.CompileCounter()
-        with unittest.mock.patch.object(
-            torch._dynamo.config, "cache_size_limit", expected_recompiles
-        ):
+        with torch._dynamo.config.patch("cache_size_limit", expected_recompiles):
             with self.assertLogs(logger="torch._dynamo", level="WARNING") as logs:
                 for _ in range(10):
                     bsz = torch.randint(low=0, high=1000, size=())
@@ -117,7 +113,7 @@ def func(a, b, c):
         c = torch.rand(3, 4, 5, device="cuda")
         compile_counter = torch._dynamo.testing.CompileCounter()
 
-        with unittest.mock.patch.object(torch._dynamo.config, "cache_size_limit", 2):
+        with torch._dynamo.config.patch("cache_size_limit", 2):
             opt_func = torch._dynamo.optimize(compile_counter)(func)
             opt_func(a, b, c)  # warmup
             self.assertEqual(compile_counter.frame_count, 1)
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 5e294d303a69..4e34b2cd1428 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -10,7 +10,6 @@
 from collections import namedtuple
 from copy import deepcopy
 from typing import List
-from unittest.mock import patch
 
 import numpy as np
 import torch
@@ -20,6 +19,7 @@
 import torch._dynamo.utils
 
 import torch._functorch.config
+from torch._dynamo.testing import skip_if_pytest
 
 try:
     from test_minifier import requires_cuda
@@ -29,15 +29,9 @@
 from torch import nn
 from torch._dynamo.debug_utils import same_two_models
 from torch._dynamo.testing import rand_strided, requires_static_shapes, same
+from torch._dynamo.utils import ifdyn
 from torch.nn import functional as F
 
-try:
-    import torch._refs
-
-    HAS_REFS = True
-except ImportError:
-    HAS_REFS = False
-
 
 _orig_module_call = torch.nn.Module.__call__
 
@@ -49,13 +43,6 @@ def is_fx_tracing_test() -> bool:
     return torch.nn.Module.__call__ is not _orig_module_call
 
 
-def ifdyn(count1, count2):
-    if torch._dynamo.config.dynamic_shapes:
-        return count1
-    else:
-        return count2
-
-
 def has_detectron2():
     try:
         from detectron2.layers.mask_ops import _paste_masks_tensor_shape
@@ -365,7 +352,7 @@ def longformer_chunk(hidden_states, window_overlap=256):
 class PartialT5(torch.nn.Module):
     # Highly simplified T5Attention prefix
     def __init__(self):
-        super(PartialT5, self).__init__()
+        super().__init__()
         self.q = torch.nn.Linear(512, 512)
         self.k = torch.nn.Linear(512, 512)
         self.v = torch.nn.Linear(512, 512)
@@ -474,7 +461,7 @@ def apply_chunking_to_forward(forward_fn, *input_tensors):
 
 class FakeMamlInner(torch.nn.Module):
     def __init__(self):
-        super(FakeMamlInner, self).__init__()
+        super().__init__()
         self.linear = torch.nn.Linear(784, 5)
 
     def forward(self, x, ignored=None, bn_training=False):
@@ -484,7 +471,7 @@ def forward(self, x, ignored=None, bn_training=False):
 class PartialMaml(torch.nn.Module):
     # Highly simplified version of maml.meta.Meta.finetuning
     def __init__(self):
-        super(PartialMaml, self).__init__()
+        super().__init__()
         self.net = FakeMamlInner()
         self.update_step_test = 10
         self.update_lr = 0.4
@@ -584,9 +571,6 @@ def create_rand_mask_from_inputs(
 class SequentialAppendList(torch.nn.Sequential):
     """from timm/models/vovnet.py"""
 
-    def __init__(self, *args):
-        super(SequentialAppendList, self).__init__(*args)
-
     def forward(self, x: torch.Tensor, concat_list: List[torch.Tensor]) -> torch.Tensor:
         for i, module in enumerate(self):
             if i == 0:
@@ -610,7 +594,7 @@ def __init__(
         act_layer=torch.nn.ReLU,
         inplace=True,
     ):
-        super(BatchNormAct2d, self).__init__(
+        super().__init__(
             num_features,
             eps=eps,
             momentum=momentum,
@@ -662,7 +646,9 @@ def _get_min_chunk_len(config):
         return config.lsh_attn_chunk_length
     elif len(attn_types_set) == 1 and attn_types[0] == "local":
         return config.local_attn_chunk_length
-    elif len(attn_types_set) == 2 and attn_types_set == set(["lsh", "local"]):
+    elif len(attn_types_set) == 2 and attn_types_set == set(  # noqa: C405
+        ["lsh", "local"]
+    ):
         return min(config.lsh_attn_chunk_length, config.local_attn_chunk_length)
     else:
         raise NotImplementedError(
@@ -704,7 +690,7 @@ def _get_sorted_bucket_idx_and_undo_sorted_bucket_idx(buckets):
 
 class FeedForwardLayer(nn.Module):
     def __init__(self, d_model, dim_feedforward, activation, dropout) -> None:
-        super(FeedForwardLayer, self).__init__()
+        super().__init__()
         self.linear1 = nn.Linear(d_model, dim_feedforward)
         self.activation = activation
         self.dropout1 = nn.Dropout(dropout)
@@ -727,7 +713,7 @@ def __init__(
         activation=nn.ReLU(),
         layer_norm_eps=1e-5,
     ):
-        super(TransformerEncoderLayer, self).__init__()
+        super().__init__()
         self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
         self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
         self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
@@ -863,6 +849,33 @@ def _reformer(self, nopython):
         self.assertTrue(same(opt_model(input), correct))
         return cnt
 
+    @requires_cuda()
+    def test_sub_alpha_scalar_repro(self):
+        @torch.compile(backend="aot_eager")
+        def f(x):
+            return x.sub(1, alpha=2)
+
+        f(torch.ones(2, device="cuda", dtype=torch.float64))
+
+    def test_embedding_backward_broadcasting_decomp(self):
+        def f(grad_output, indices):
+            num_weights = 10
+            padding_idx = 1
+            scale_grad_by_freq = True
+            return torch.ops.aten.embedding_dense_backward(
+                grad_output, indices, num_weights, padding_idx, scale_grad_by_freq
+            )
+
+        f_compiled = torch.compile(f, backend="aot_eager")
+
+        grad_output = torch.ones(2, 4, 3, dtype=torch.float16)
+        indices = torch.ones(2, 4, dtype=torch.int64)
+
+        out_ref = f(grad_output, indices)
+        out_test = f_compiled(grad_output, indices)
+
+        self.assertEqual(out_ref, out_test)
+
     def test_reformer_eval(self):
         with torch.no_grad():
             cnt = self._reformer(nopython=True)
@@ -954,8 +967,9 @@ def test_chunk_reformer_ff(self):
     # NB: When you remove the expectedFailure, don't forget to
     # uncomment/adjust the assertEqual below
     @unittest.expectedFailure
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", True)
-    @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
+    @torch._dynamo.config.patch(
+        fake_tensor_propagation=True, capture_scalar_outputs=True, dynamic_shapes=True
+    )
     def test_maml_item_capture(self):
         a = torch.randn(5, 1, 28, 28)
         b = torch.zeros(5, dtype=torch.int64)
@@ -973,7 +987,7 @@ def test_maml_item_capture(self):
         self.assertIn(cnt.op_count, (36, 35, 34, 29, 28, 27))
 
     # see: https://github.com/pytorch/pytorch/issues/80067
-    @patch.object(torch._dynamo.config, "capture_scalar_outputs", False)
+    @torch._dynamo.config.patch(capture_scalar_outputs=False, dynamic_shapes=True)
     def test_maml_no_item_capture(self):
         a = torch.randn(5, 1, 28, 28)
         b = torch.zeros(5, dtype=torch.int64)
@@ -986,7 +1000,7 @@ def test_maml_no_item_capture(self):
         for _ in range(10):
             self.assertTrue(same(opt_model(a, b, c, d), correct))
 
-        self.assertEqual(cnt.frame_count, ifdyn(5, 4))
+        self.assertEqual(cnt.frame_count, 5)
         # TODO(jansel): figure out why op count depends on imports
         self.assertIn(cnt.op_count, (31, 36, 35, 34, 29, 28))
 
@@ -1326,7 +1340,7 @@ def blah(self, x):
         self.assertGreaterEqual(torch._dynamo.utils.counters["frames"]["ok"], 3)
         self.assertGreaterEqual(torch._dynamo.utils.counters["frames"]["total"], 3)
 
-    @patch.object(torch._dynamo.config, "suppress_errors", True)
+    @torch._dynamo.config.patch("suppress_errors", True)
     def test_guard_fail_tensor_bool(self):
         @torch._dynamo.skip
         def fn():
@@ -1338,12 +1352,10 @@ def fn():
                 (1, 5),
             )
 
-            tensors = list(
-                [
-                    torch.empty(shape, dtype=dtype).fill_(17)
-                    for shape, dtype in itertools.product(shapes, dtypes)
-                ]
-            )
+            tensors = [
+                torch.empty(shape, dtype=dtype).fill_(17)
+                for shape, dtype in itertools.product(shapes, dtypes)
+            ]
 
             x_vals = (5.0, *tensors)
             y_vals = (6.0, *tensors)
@@ -1412,7 +1424,6 @@ def fn(x):
         self.assertTrue(same(ref0, res0))
         self.assertTrue(same(ref1, res1))
 
-    @unittest.skipIf(not HAS_REFS, "requires recent PT version")
     def test_primtorch(self):
         @torch._dynamo.optimize("eager")
         def fn(x):
@@ -1420,7 +1431,6 @@ def fn(x):
 
         fn(torch.randn(3))
 
-    @unittest.skipIf(not HAS_REFS, "requires recent PT version")
     @unittest.expectedFailure
     # inline_call [('inline in skipfiles: bind ...python3.10/inspect.py', 1)]
     def test_primtorch_no_graph_break(self):
@@ -1430,6 +1440,13 @@ def fn(x):
 
         fn(torch.randn(3))
 
+    def test_torch_tensor_ops_no_graph_break(self):
+        @torch._dynamo.optimize("eager", nopython=True)
+        def fn(x):
+            torch.Tensor.abs_(x)
+
+        fn(torch.randn(3))
+
     @unittest.skipIf(
         not isinstance(torch.ops.aten.abs, torch._ops.OpOverloadPacket),
         "old pt doesn't work",
@@ -1442,6 +1459,16 @@ def fn(x):
 
         fn(torch.randn(3))
 
+    def test_torch_tensor_ops(self):
+        def fn(x):
+            return torch.Tensor.abs_(x)
+
+        x = torch.randn(3)
+        opt_fn = torch._dynamo.optimize("eager", nopython=True)(fn)
+        y = fn(x)
+        y_ = opt_fn(x)
+        self.assertTrue(same(y, y_))
+
     def test_guard_ordering_shape_fail(self):
         # If a function which takes a tensor has an inner function which
         # is compiled and generates a guard on its shape,
@@ -1476,8 +1503,6 @@ def fn(x):
 
         fn(torch.randn(3))
 
-    # Bug with storage meta - torch.BoolStorage is becoming torch.storage._LegacyStorageMeta
-    @unittest.expectedFailure
     def test_isinstance_storage(self):
         @torch._dynamo.optimize("eager")
         def fn(x):
@@ -1533,6 +1558,24 @@ def fn():
         opt_fn = torch._dynamo.optimize("eager")(fn)
         opt_fn()
 
+    def test_sort_out2(self):
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.register_buffer("sorted", torch.ones(4, 4))
+                self.register_buffer("indices", torch.ones(4, 4, dtype=torch.long))
+
+            def forward(self, x):
+                torch.sort(x, out=(self.sorted, self.indices))
+                return (x + 1, self.sorted, self.indices)
+
+        x = torch.randn(4, 4)
+        m = MyModule()
+        ref = m(x)
+        opt_m = torch._dynamo.optimize("eager")(m)
+        res = opt_m(x)
+        self.assertTrue(same(ref, res))
+
     def test_sigmoid_out(self):
 
         dtype = torch.float32
@@ -1548,6 +1591,23 @@ def fn():
         opt_fn = torch._dynamo.optimize("eager")(fn)
         opt_fn()
 
+    def test_sigmoid_out2(self):
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.register_buffer("base", torch.ones(4, 4))
+
+            def forward(self, x):
+                torch.sigmoid(x, out=self.base)
+                return x + self.base
+
+        x = torch.randn(4, 4)
+        m = MyModule()
+        ref = m(x)
+        opt_m = torch._dynamo.optimize("eager")(m)
+        res = opt_m(x)
+        self.assertTrue(same(ref, res))
+
     def test_slice_into_list_mutable(self):
         class Mod(torch.nn.Module):
             def forward(self, listy):
@@ -1601,6 +1661,14 @@ def f():
 
         self.assertEqual(f(), torch._dynamo.optimize("eager")(f)())
 
+    def test_out_none(self):
+        # https://github.com/pytorch/pytorch/issues/92814
+        def fn(input):
+            return torch.nn.functional.normalize(input, dim=0, out=None)
+
+        x = torch.rand([1])
+        self.assertEqual(fn(x), torch._dynamo.optimize("eager")(fn)(x))
+
     @unittest.skipIf(not has_detectron2(), "requires detectron2")
     def test_multi_import(self):
         @torch._dynamo.optimize("eager", nopython=True)
@@ -1681,7 +1749,7 @@ def fn(x):
         opt_fn(x)
         self.assertEqual(cnt.frame_count, 1)
 
-    @patch.object(torch._functorch.config, "use_dynamic_shapes", True)
+    @torch._dynamo.config.patch(dynamic_shapes=True)
     def test_bigbird_unsqueeze_inplace(self):
         def fn(reshape_2):
             view_2 = reshape_2.clone()
@@ -2132,7 +2200,8 @@ def fn(x):
         self.assertEqual(cnt.frame_count, 2)
         self.assertEqual(cnt.op_count, 2)
 
-    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    @skip_if_pytest
+    @torch._dynamo.config.patch("rewrite_assert_with_torch_assert", True)
     def test_rewrite_assert_with_msg(self):
         def f(x):
             b = x.sin()
@@ -2153,7 +2222,7 @@ def f(x):
         with self.assertRaisesRegex(AssertionError, ""):
             exported, _ = torch._dynamo.export(f, torch.Tensor([4, 4, 5]))
 
-    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    @torch._dynamo.config.patch("rewrite_assert_with_torch_assert", True)
     def test_not_rewrite_assert_for_other_errors(self):
         def f(x):
             b = x.sin()
@@ -2167,7 +2236,7 @@ def f(x):
             opt_fn(*args)
 
     # TODO (tmanlaibaatar) handle data-dependent fstring in assert statement.
-    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    @torch._dynamo.config.patch("rewrite_assert_with_torch_assert", True)
     def test_rewrite_assert_with_fstring_msg(self):
         def f(x):
             b = x.sin()
@@ -2178,7 +2247,8 @@ def f(x):
         with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, "generic_jump"):
             exported, _ = torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
 
-    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    @skip_if_pytest
+    @torch._dynamo.config.patch("rewrite_assert_with_torch_assert", True)
     def test_rewrite_assert_without_msg(self):
         def f(x):
             b = x.sin()
@@ -2192,7 +2262,7 @@ def f(x):
         with self.assertRaisesRegex(AssertionError, ""):
             exported, _ = torch._dynamo.export(f, torch.Tensor([4, 4, 5]))
 
-    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    @torch._dynamo.config.patch("rewrite_assert_with_torch_assert", True)
     def test_rewrite_assert_noop(self):
         def f(x):
             b = x.sin()
@@ -2214,7 +2284,7 @@ def f(x):
         exported, _ = torch._dynamo.export(f, torch.Tensor([4, 4, 5]))
         self.assertTrue(same(exported(*args), f(*args)))
 
-    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", False)
+    @torch._dynamo.config.patch("rewrite_assert_with_torch_assert", False)
     def test_not_rewrite_assert(self):
         def f(x):
             b = x.sin()
@@ -2224,7 +2294,7 @@ def f(x):
         with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, "generic_jump"):
             torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
 
-    @patch.object(torch._functorch.config, "use_dynamic_shapes", True)
+    @torch._dynamo.config.patch(dynamic_shapes=True)
     def test_batchnorm_e2e(self):
         class Repro(torch.nn.Module):
             def __init__(self):
@@ -2280,7 +2350,7 @@ def compiled_fn(x):
             for buffer_ref, buffer_test in zip(m_ref.buffers(), m_test.buffers()):
                 self.assertTrue(same(buffer_ref, buffer_test))
 
-    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    @torch._dynamo.config.patch("dynamic_shapes", True)
     def test_dynamic_shapes_right_side(self):
         def f(x):
             return torch.ones(5 * x.shape[0])
@@ -2292,8 +2362,29 @@ def f(x):
         )
         self.assertEqual(gm(inp).shape, f(inp).shape)
 
-    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
-    @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
+    @torch._dynamo.config.patch("dynamic_shapes", True)
+    def test_dynamic_shapes_implicit_guard(self):
+        def f(x):
+            y = x * x.size(x.shape[0])
+            torch.sum(y, [y.shape[0]])
+            return y
+
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnt, nopython=True)(f)
+        opt_fn(torch.randn(3, 1, 1, 1, 1))
+        self.assertEqual(cnt.frame_count, 1)
+
+    @torch._dynamo.config.patch("dynamic_shapes", True)
+    def test_dynamic_shapes_float_guard(self):
+        def f(x):
+            return torch.nn.functional.dropout(x, x.shape[0] / 6)
+
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnt, nopython=True)(f)
+        opt_fn(torch.randn(3))
+        self.assertEqual(cnt.frame_count, 1)
+
+    @torch._dynamo.config.patch(dynamic_shapes=True, capture_scalar_outputs=True)
     def test_tensor_item(self):
         def f(x, y):
             val = y.item()
@@ -2315,7 +2406,7 @@ def f(x, y):
             gm(torch.zeros(6, 4), torch.tensor(2)),
         )
 
-    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    @torch._dynamo.config.patch("dynamic_shapes", True)
     def test_tensor_split(self):
         def f(x):
             return torch.split(x, x.shape[0] // 2, dim=0)[0]
diff --git a/test/dynamo/test_subgraphs.py b/test/dynamo/test_subgraphs.py
index bc54a87c4ecb..80a37b206545 100644
--- a/test/dynamo/test_subgraphs.py
+++ b/test/dynamo/test_subgraphs.py
@@ -351,6 +351,9 @@ def fn(a, b):
 
     @disable_cache_limit()
     def test_dynamic_shapes(self):
+        if config.assume_static_by_default:
+            return unittest.skip("Already covered identically in test_dynamic_kwarg")
+
         def fn(a, b):
             return a - b * 10
 
@@ -379,10 +382,27 @@ def fn(a, b):
         torch._dynamo.reset()
         cnt_dynamic = torch._dynamo.testing.CompileCounter()
         opt_fn = torch._dynamo.optimize(cnt_dynamic, dynamic=True)(fn)
-        for i in range(2, 12):
+        start = 2
+        end = 12
+        steps = end - start
+        for i in range(start, end):
             opt_fn(torch.randn(i), torch.randn(i))
-        # just one graph
-        self.assertEqual(cnt_dynamic.frame_count, 1)
+
+        if config.assume_static_by_default:
+            # We run with `dynamic`, but assume_static_by_default will produce the same number
+            # of breaks as without dynamic, since no tensors were marked dyn.
+            self.assertEqual(cnt_dynamic.frame_count, steps)
+
+            torch._dynamo.reset()
+            # Reset the counter
+            cnt_dynamic = torch._dynamo.testing.CompileCounter()
+            opt_fn = torch._dynamo.optimize(cnt_dynamic, dynamic=False)(fn)
+            for i in range(start, end):
+                opt_fn(torch.randn(i), torch.randn(i))
+            self.assertEqual(cnt_dynamic.frame_count, steps)
+        else:
+            # just one graph
+            self.assertEqual(cnt_dynamic.frame_count, 1)
 
     def test_dynamic_duck_size(self):
         def fn(a, b):
@@ -415,7 +435,10 @@ def fn(a, b):
         # guards for when x and y didn't duck size together, so we end up
         # with a generic graph that also works when x and y happen to duck
         # size together.
-        self.assertEqual(cnt_dynamic.frame_count, 1)
+        if config.assume_static_by_default:
+            self.assertEqual(cnt_dynamic.frame_count, 2)
+        else:
+            self.assertEqual(cnt_dynamic.frame_count, 1)
 
         torch._dynamo.reset()
         cnt_dynamic.frame_count = 0
@@ -439,6 +462,7 @@ def fn(a):
         self.assertEqual(opt_fn(x), fn(x))
         self.assertEqual(cnt_dynamic.frame_count, 2)
 
+    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
     def test_no_graph_break_on_item(self):
         def fn(a, b):
@@ -450,6 +474,7 @@ def fn(a, b):
 
         self._common(fn, 1, 6)
 
+    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", False)
     def test_graph_break_on_item(self):
         def fn(a, b):
diff --git a/test/dynamo/test_torchxla_integration.py b/test/dynamo/test_torchxla_integration.py
deleted file mode 100644
index 831a5818c0bd..000000000000
--- a/test/dynamo/test_torchxla_integration.py
+++ /dev/null
@@ -1,218 +0,0 @@
-# Owner(s): ["module: dynamo"]
-import copy
-
-import torch
-
-import torch._dynamo.test_case
-import torch._dynamo.testing
-from functorch.compile import aot_module_simplified, make_boxed_compiler
-from torch._dynamo import disable
-
-try:
-    from .test_torchxla_util import maybe_skip_torchxla_test
-except ImportError:
-    from test_torchxla_util import maybe_skip_torchxla_test
-
-try:
-    import torch._dynamo.optimizations.torchxla_integration as integration
-    import torch_xla.core.xla_model as xm
-    import torch_xla.debug.metrics as metrics
-except ImportError:
-    # tests using torch_xla will be skipped. It's fine to ignore the
-    # importing error here.
-    pass
-
-from torch import fx, nn
-
-
-class BasicModule(nn.Module):
-    def __init__(self):
-        super(BasicModule, self).__init__()
-
-    def forward(self, x, y):
-        return x + y
-
-    def get_random_inputs(self):
-        return (torch.randn(10), torch.randn(10))
-
-
-class MatmulModule(nn.Module):
-    def __init__(self):
-        super(MatmulModule, self).__init__()
-
-    def forward(self, x, y):
-        return x @ y
-
-    def get_random_inputs(self):
-        return (torch.randn(5, 100), torch.randn(100, 5))
-
-
-class LinearModule(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.linear = nn.Linear(10, 5)
-
-    def forward(self, x):
-        return self.linear(x)
-
-    def get_random_inputs(self):
-        return (torch.randn(2, 10),)
-
-
-class MaxPoolModule(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.conv = nn.Conv2d(3, 6, kernel_size=3, stride=2)
-        self.pool = nn.MaxPool2d(3, stride=2)
-
-    def forward(self, x):
-        x = self.conv(x)
-        return self.pool(x)
-
-    def get_random_inputs(self):
-        return (torch.randn(2, 3, 10, 10),)
-
-
-class ModuleInplaceUpdate(nn.Module):
-    def __init__(self):
-        super(ModuleInplaceUpdate, self).__init__()
-
-    def forward(self, a, b):
-        a.sub_(b)
-        return b - 1, b + 1
-
-    def get_random_inputs(self):
-        return (torch.randn(10), torch.randn(10))
-
-
-def allclose(expected, actual):
-    def unwrap(cont):
-        if isinstance(cont, (list, tuple)) and len(cont) == 1:
-            return cont[0]
-        return cont
-
-    expected = unwrap(expected)
-    actual = unwrap(actual)
-
-    if isinstance(expected, torch.Tensor) and isinstance(actual, torch.Tensor):
-        return torch.allclose(expected, actual)
-    elif isinstance(expected, (tuple, list)) and isinstance(actual, (tuple, list)):
-        return len(expected) == len(actual) and all(
-            torch.allclose(a, b) for a, b in zip(expected, actual)
-        )
-    else:
-        raise RuntimeError("Unexpected types")
-
-
-def make_reuse_graph_test(module_class, niter=100):
-    @maybe_skip_torchxla_test
-    def test_wrapper(self):
-        xla_dev = xm.xla_device()
-        xla_module = module_class().to(device=xla_dev)
-        inputs = tuple(x.to(device=xla_dev) for x in xla_module.get_random_inputs())
-        metrics.clear_counters()
-        optimized_mod = integration.extract_compiled_graph(
-            fx.symbolic_trace(xla_module), inputs
-        )
-
-        for i in range(niter):
-            xla_inputs = tuple(
-                inp.to(device=xla_dev) for inp in xla_module.get_random_inputs()
-            )
-            xla_inputs_copy = copy.deepcopy(xla_inputs)
-
-            expected = xla_module(*xla_inputs)
-            # make sure above lazy computation is executed.
-            xm.mark_step()
-
-            actual = optimized_mod(*xla_inputs_copy)
-
-            if not allclose(expected, actual):
-                print(
-                    f"Incorrect results at iter {i}. expected\n{expected}, actual\n{actual}"
-                )
-                self.assertTrue(False)
-
-            # make sure arguments match after calling the model forward method
-            # to handle inplace updates.
-            if not allclose(xla_inputs, xla_inputs_copy):
-                print(
-                    f"Incorrect updated arguments at iter {i}. expected\n{xla_inputs}, actual\n{xla_inputs_copy}"
-                )
-                self.assertTrue(False)
-
-    return test_wrapper
-
-
-def training_compiler(gm, example_inputs):
-    @make_boxed_compiler
-    @disable
-    def fw_compiler(graph, inputs, *args, **kwargs):
-        # tracing time inputs are FakeTensors, we can not pass them
-        # to extract_compiled_graph directly since we can not extract
-        # xla tensor id from fake tensors. Call extract_compiled_graph
-        # lazily and trigger that for the first call with non-fake tensors.
-        compiled_graph = None
-
-        def optimized_mod(*args):
-            nonlocal compiled_graph
-            if compiled_graph is None:
-                compiled_graph = integration.extract_compiled_graph(graph, args)
-            return compiled_graph(*args)
-
-        return optimized_mod
-
-    return aot_module_simplified(gm, example_inputs, fw_compiler=fw_compiler)
-
-
-def model_iter_fn_train(mod, inputs):
-    outputs = mod(*inputs)
-    loss = outputs.mean()
-    loss.backward()
-
-    param_list = list(mod.parameters())
-    return [param.grad for param in param_list]
-
-
-def make_training_test(model_cls):
-    @maybe_skip_torchxla_test
-    def test_wrapper(self):
-        import torch_xla.core.xla_model as xm
-
-        xla_dev = xm.xla_device()
-        model = model_cls()
-        inputs = model.get_random_inputs()
-
-        model = model.to(device=xla_dev)
-        inputs = tuple(inp.to(device=xla_dev) for inp in inputs)
-
-        # do baseline
-        baseline_model = copy.deepcopy(model)
-        baseline_inputs = copy.deepcopy(inputs)
-        expected_output = model_iter_fn_train(baseline_model, baseline_inputs)
-
-        compiler = training_compiler
-        optimize_ctx = torch._dynamo.optimize(compiler, nopython=False)
-        optimized_model_iter_fn = optimize_ctx(model_iter_fn_train)
-
-        actual_output = optimized_model_iter_fn(model, inputs)
-        print(f"expected_output:\n{expected_output}\nactual_output:\n{actual_output}")
-        assert allclose(expected_output, actual_output)
-
-    return test_wrapper
-
-
-class TorchXLAReuseGraphTest(torch._dynamo.test_case.TestCase):
-    test_basic = make_reuse_graph_test(BasicModule)
-    test_matmul = make_reuse_graph_test(MatmulModule)
-    test_linear = make_reuse_graph_test(LinearModule)
-    test_inplace_update = make_reuse_graph_test(ModuleInplaceUpdate)
-
-    test_training_linear = make_training_test(LinearModule)
-    test_training_maxpool = make_training_test(MaxPoolModule)
-
-
-if __name__ == "__main__":
-    from torch._dynamo.test_case import run_tests
-
-    run_tests()
diff --git a/test/dynamo/test_torchxla_num_output.py b/test/dynamo/test_torchxla_num_output.py
deleted file mode 100644
index 0e91a358d469..000000000000
--- a/test/dynamo/test_torchxla_num_output.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Owner(s): ["module: dynamo"]
-import unittest
-
-import torch
-from torch import nn
-from torch._dynamo.optimizations.torchxla_integration import GraphInputMatcher
-from torch.utils._pytree import tree_map_only
-
-try:
-    from .test_torchxla_util import maybe_skip_torchxla_test
-except ImportError:
-    from test_torchxla_util import maybe_skip_torchxla_test
-
-try:
-    import torch_xla
-    import torch_xla.core.xla_model as xm
-except ImportError:
-    # tests using torch_xla will be skipped. It's fine to ignore the
-    # importing error here.
-    pass
-
-
-class DirectReturnModule(nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, a, b, c):
-        """
-        The XLA graph will only return the first 2 items
-        """
-        return a + b, a + c, b
-
-    def get_example_inputs(self):
-        return (torch.rand(2), torch.rand(2), torch.rand(2))
-
-
-class DirectReturnWithInplaceUpdateModule(nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, a, b, c):
-        """
-        Inplace update on b cause it to be returned in XLA graph
-        """
-        b.zero_()
-        return a + b, a + c, b
-
-    def get_example_inputs(self):
-        return (torch.rand(2), torch.rand(2), torch.rand(2))
-
-
-class DirectReturnWithDuplicatedInplaceUpdateModule(nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, a, b, c):
-        """
-        Even if we return b twice, the XLA graph only return b once.
-        """
-        b.zero_()
-        return a + b, a + c, b, b
-
-    def get_example_inputs(self):
-        return (torch.rand(2), torch.rand(2), torch.rand(2))
-
-
-class TestNumOutput(unittest.TestCase):
-    def do_test(self, model_class, expected_num_output):
-        xla_dev = xm.xla_device()
-        model = model_class().to(device=xla_dev)
-        inputs = tree_map_only(
-            torch.Tensor, lambda x: x.to(device=xla_dev), model.get_example_inputs()
-        )
-
-        xm.mark_step()
-        args_tensor_ids = [
-            torch_xla._XLAC._xla_get_tensor_id(xla_arg) for xla_arg in inputs
-        ]
-        tensor_id_to_arg_idx = {
-            tensor_id: i for i, tensor_id in enumerate(args_tensor_ids)
-        }
-        outputs = model(*inputs)
-        xla_graph_hash = torch_xla._XLAC._get_graph_hash(outputs)
-
-        (
-            graph_input_tensor_ids,
-            graph_input_xla_values,
-        ) = torch_xla._XLAC._get_tensors_xla_device_data_node(outputs)
-
-        graph_input_matcher = GraphInputMatcher(
-            tensor_id_to_arg_idx, graph_input_tensor_ids, graph_input_xla_values
-        )
-        torch_xla._XLAC._xla_sync_multi(outputs, [])
-
-        def run_cached_graph(*inputs):
-            torch_xla._XLAC._xla_sync_multi(inputs, [])
-            xla_graph_inputs = graph_input_matcher(inputs)
-            xla_graph_outputs = torch_xla._XLAC._run_cached_graph(
-                xla_graph_hash, xla_graph_inputs
-            )
-            return xla_graph_outputs
-
-        test_inputs = tree_map_only(
-            torch.Tensor, lambda x: x.to(device=xla_dev), model.get_example_inputs()
-        )
-        self.assertEqual(expected_num_output, len(run_cached_graph(*test_inputs)))
-
-    @maybe_skip_torchxla_test
-    def test_direct_return(self):
-        self.do_test(DirectReturnModule, expected_num_output=2)
-
-    @maybe_skip_torchxla_test
-    def test_direct_return_with_inplace_update(self):
-        self.do_test(DirectReturnWithInplaceUpdateModule, expected_num_output=3)
-
-    @maybe_skip_torchxla_test
-    def test_direct_return_with_duplicated_inplace_update(self):
-        self.do_test(
-            DirectReturnWithDuplicatedInplaceUpdateModule, expected_num_output=3
-        )
diff --git a/test/dynamo/test_torchxla_util.py b/test/dynamo/test_torchxla_util.py
deleted file mode 100644
index abf1d16bfc3d..000000000000
--- a/test/dynamo/test_torchxla_util.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Owner(s): ["module: dynamo"]
-import functools
-import unittest
-
-
-@functools.lru_cache(None)
-def should_run_torchxla_tests():
-    """
-    Run the tests if torch_xla is available and xla_device can be init.
-    """
-    try:
-        import torch_xla.core.xla_model as xm
-    except ImportError:
-        return False
-    try:
-        device = xm.xla_device()
-    except RuntimeError:
-        return False
-    return True
-
-
-def maybe_skip_torchxla_test(test_case):
-    return unittest.skipIf(
-        not should_run_torchxla_tests(),
-        "Skip the tests since torch_xla is not available or XLA devices are not specified",
-    )(test_case)
diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py
index 9bda5a47cc90..67d66058f4c5 100644
--- a/test/dynamo/test_unspec.py
+++ b/test/dynamo/test_unspec.py
@@ -2,7 +2,6 @@
 import functools
 import random
 import unittest
-from unittest.mock import patch
 
 import numpy as np
 import torch
@@ -21,7 +20,7 @@
 def make_unspec_fn(fn):
     @functools.wraps(fn)
     def _fn(*args, **kwargs):
-        with patch.object(torch._dynamo.config, "specialize_int_float", False):
+        with torch._dynamo.config.patch("specialize_int_float", False):
             return fn(*args, **kwargs)
 
     return _fn
@@ -51,7 +50,7 @@ class UnspecTest(cls):
 UnspecNNModuleTests = make_unspec_cls(test_modules.NNModuleTests)
 
 
-@patch.object(torch._dynamo.config, "specialize_int_float", False)
+@torch._dynamo.config.patch("specialize_int_float", False)
 class UnspecTests(torch._dynamo.test_case.TestCase):
     def test_numpy_correctness(self):
         def fn(x, y, z):
@@ -138,7 +137,7 @@ def fn(x):
         res2 = opt_fn(x)
         self.assertTrue(same(res1, res2))
 
-    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    @torch._dynamo.config.patch("dynamic_shapes", True)
     def test_multiple_consecutive_random_calls_before_graph(self):
         def fn(x):
             dim1 = random.randrange(start=0, stop=5)
diff --git a/test/dynamo/test_verify_correctness.py b/test/dynamo/test_verify_correctness.py
index 7a6f8e3d4263..f3b30444ab68 100644
--- a/test/dynamo/test_verify_correctness.py
+++ b/test/dynamo/test_verify_correctness.py
@@ -1,34 +1,17 @@
 # Owner(s): ["module: dynamo"]
-import importlib
 import operator
 import unittest
-from unittest.mock import patch
 
 import torch
 
 import torch._dynamo
+import torch._dynamo.backends.ipex
 import torch._dynamo.config as config
 import torch._dynamo.test_case
-from torch._dynamo.optimizations import backends
+from torch._dynamo.backends.ipex import has_ipex
 from torch._dynamo.testing import same
 
 
-def has_onnxruntime():
-    try:
-        importlib.import_module("onnxruntime")
-        return True
-    except ImportError:
-        return False
-
-
-def has_ipex():
-    try:
-        importlib.import_module("intel_extension_for_pytorch")
-        return True
-    except ImportError:
-        return False
-
-
 class Seq(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -45,7 +28,7 @@ def forward(self, x):
 
 class Conv_Bn_Relu(torch.nn.Module):
     def __init__(self, in_channels, out_channels, **kwargs):
-        super(Conv_Bn_Relu, self).__init__()
+        super().__init__()
         self.conv = torch.nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
         self.bn = torch.nn.BatchNorm2d(out_channels, eps=0.001)
         self.relu = torch.nn.ReLU()
@@ -78,8 +61,8 @@ def transform(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
     return gm
 
 
+@config.patch("verify_correctness", True)
 class TestVerifyCorrectness(torch._dynamo.test_case.TestCase):
-    @patch.object(config, "verify_correctness", True)
     def test_example_inputs(self):
         def fn(a, bc, d):
             b, c = bc
@@ -106,16 +89,14 @@ def compiler_fn(graph, example_inputs):
         self.assertEqual(r1.device, r2.device)
         self.assertEqual(r1.device, r3.device)
 
-    @patch.object(config, "verify_correctness", True)
-    def test_nnc(self):
+    def test_torchscript(self):
         s = Seq()
         i = torch.randn(10)
         r1 = s(i)
-        opt_s = torch._dynamo.optimize("nnc")(s)
+        opt_s = torch._dynamo.optimize("ts")(s)
         r2 = opt_s(i)
         self.assertTrue(same(r1, r2))
 
-    @patch.object(config, "verify_correctness", True)
     def test_incorrect_verify_true(self):
         """
         If a bad optimization return a graph that
@@ -138,7 +119,7 @@ def incorrect_compile_fn(gm, example_inputs):
         else:
             self.fail("expected failure")
 
-    @patch.object(config, "verify_correctness", False)
+    @config.patch("verify_correctness", False)
     def test_incorrect_verify_false(self):
         """
         The bad optimization return a graph that
@@ -158,14 +139,13 @@ def incorrect_compile_fn(gm, example_inputs):
         self.assertTrue(not same(r1, r2))
 
     @unittest.skipIf(not has_ipex(), "requires ipex")
-    @patch.object(config, "verify_correctness", True)
     def test_ipex_fp32(self):
         model = Conv_Bn_Relu(3, 32, kernel_size=3, stride=1)
         model = model.to(memory_format=torch.channels_last)
         model = model.eval()
         input = torch.randn(8, 3, 64, 64).contiguous(memory_format=torch.channels_last)
         r1 = model(input)
-        opt_model = torch._dynamo.optimize(backends.ipex_fp32)(model)
+        opt_model = torch._dynamo.optimize("ipex")(model)
         with torch.no_grad():
             r2 = opt_model(input)
         self.assertTrue(same(r1, r2))
diff --git a/test/edge/CMakeLists.txt b/test/edge/CMakeLists.txt
index fa1e5720215c..6195fb2a68b7 100644
--- a/test/edge/CMakeLists.txt
+++ b/test/edge/CMakeLists.txt
@@ -9,12 +9,12 @@ file(GLOB_RECURSE all_python "${TORCH_ROOT}/torchgen/*.py")
 set(GEN_COMMAND
         "${PYTHON_EXECUTABLE}" -m torchgen.gen_executorch
         --source-path=${TEST_ROOT}
-        --install_dir=${OUTPUT_DIRECTORY}
+        --install-dir=${OUTPUT_DIRECTORY}
         --tags-path=${TORCH_ROOT}/aten/src/ATen/native/tags.yaml
-        --aten_yaml_path=${TORCH_ROOT}/aten/src/ATen/native/native_functions.yaml
-        --use_aten_lib
-        --op_selection_yaml_path=${TEST_ROOT}/selected_operators.yaml
-        --custom_ops_yaml_path=${TEST_ROOT}/custom_ops.yaml
+        --aten-yaml-path=${TORCH_ROOT}/aten/src/ATen/native/native_functions.yaml
+        --use-aten-lib
+        --op-selection-yaml-path=${TEST_ROOT}/selected_operators.yaml
+        --custom-ops-yaml-path=${TEST_ROOT}/custom_ops.yaml
         )
 set(GEN_COMMAND_sources
         ${OUTPUT_DIRECTORY}/RegisterCodegenUnboxedKernelsEverything.cpp
diff --git a/test/edge/RuntimeContext.h b/test/edge/RuntimeContext.h
new file mode 100644
index 000000000000..5fa0e95707a0
--- /dev/null
+++ b/test/edge/RuntimeContext.h
@@ -0,0 +1,22 @@
+#pragma once
+
+namespace torch {
+namespace executor {
+
+/**
+ * Bucket type abstraction that contains many elements of runtime state that
+ * a kernel author may want available, but would otherwise be unable to access.
+ *
+ * Forwarded along to all operators when running in lean mode.
+ * NOTE: Will not be forwarded to operators if running in ATen mode
+ * as those operators do not expect to receive a RuntimeContext and would not
+ * use it.
+ *
+ * This includes things like setting an error state, a scratch allocator for
+ * operators that need more then constant space, and a TensorResizer for dynamic
+ * shape tensors allowing programs to be more flexible with Tensor shape.
+ */
+class RuntimeContext {};
+
+} // namespace executor
+} // namespace torch
diff --git a/test/edge/operator_registry.h b/test/edge/operator_registry.h
index dee0b50c2a56..01b8d2374bcc 100644
--- a/test/edge/operator_registry.h
+++ b/test/edge/operator_registry.h
@@ -4,13 +4,14 @@
 
 #include <c10/util/ArrayRef.h>
 #include "Evalue.h"
+#include "RuntimeContext.h"
 #include <functional>
 #include <map>
 
 namespace torch {
 namespace executor {
 
-using OpFunction = std::function<void(EValue**)>;
+using OpFunction = std::function<void(RuntimeContext&, EValue**)>;
 
 template<typename T>
 using ArrayRef = at::ArrayRef<T>;
diff --git a/test/edge/test_operator_registration.cpp b/test/edge/test_operator_registration.cpp
index 89aed23df28e..905c5de4c8fc 100644
--- a/test/edge/test_operator_registration.cpp
+++ b/test/edge/test_operator_registration.cpp
@@ -18,7 +18,8 @@ TEST(OperatorRegistrationTest, Add) {
     for (size_t i = 0; i < 4; i++) {
         kernel_values[i] = &values[i];
     }
-    op(kernel_values);
+    RuntimeContext context{};
+    op(context, kernel_values);
     at::Tensor expected = at::ones({2, 3});
     expected = at::fill(expected, 2);
     ASSERT_TRUE(expected.equal(kernel_values[3]->toTensor()));
@@ -39,7 +40,8 @@ TEST(OperatorRegistrationTest, CustomAdd3) {
     for (size_t i = 0; i < 4; i++) {
         kernel_values[i] = &values[i];
     }
-    op(kernel_values);
+    RuntimeContext context{};
+    op(context, kernel_values);
     at::Tensor expected = at::ones({2, 3});
     expected = at::fill(expected, 3);
     ASSERT_TRUE(expected.equal(kernel_values[3]->toTensor()));
diff --git a/test/expect/HasDecompTest.test_has_decomposition.expect b/test/expect/HasDecompTest.test_has_decomposition.expect
new file mode 100644
index 000000000000..b61a34ddebdb
--- /dev/null
+++ b/test/expect/HasDecompTest.test_has_decomposition.expect
@@ -0,0 +1,1350 @@
+aten::__ilshift__.Scalar
+aten::__ilshift__.Tensor
+aten::__irshift__.Scalar
+aten::__irshift__.Tensor
+aten::__lshift__.Scalar
+aten::__lshift__.Scalar_out
+aten::__lshift__.Tensor
+aten::__lshift__.Tensor_out
+aten::__rshift__.Scalar
+aten::__rshift__.Scalar_out
+aten::__rshift__.Tensor
+aten::__rshift__.Tensor_out
+aten::_adaptive_avg_pool2d_backward
+aten::_adaptive_avg_pool2d_backward.out
+aten::_adaptive_avg_pool3d
+aten::_adaptive_avg_pool3d.out
+aten::_adaptive_avg_pool3d_backward
+aten::_adaptive_avg_pool3d_backward.out
+aten::_add_relu.Scalar
+aten::_add_relu.Scalar_out
+aten::_add_relu.Tensor
+aten::_add_relu.out
+aten::_add_relu_.Scalar
+aten::_add_relu_.Tensor
+aten::_addmm_activation
+aten::_addmm_activation.out
+aten::_aminmax
+aten::_aminmax.dim
+aten::_aminmax.dim_out
+aten::_aminmax.out
+aten::_amp_foreach_non_finite_check_and_unscale
+aten::_amp_foreach_non_finite_check_and_unscale.out
+aten::_amp_foreach_non_finite_check_and_unscale_
+aten::_amp_update_scale
+aten::_amp_update_scale.out
+aten::_amp_update_scale_
+aten::_assert_async
+aten::_cdist_backward
+aten::_cdist_backward.out
+aten::_cdist_forward
+aten::_cdist_forward.out
+aten::_cholesky_solve_helper
+aten::_cholesky_solve_helper.out
+aten::_chunk_grad_outputs_efficient_attention
+aten::_coalesce
+aten::_coalesce.out
+aten::_coalesced
+aten::_coalesced.out
+aten::_coalesced_
+aten::_compute_linear_combination
+aten::_compute_linear_combination.out
+aten::_conj
+aten::_conj_copy
+aten::_conj_copy.out
+aten::_conj_physical
+aten::_conj_physical.out
+aten::_conv_depthwise2d
+aten::_conv_depthwise2d.out
+aten::_convert_indices_from_coo_to_csr
+aten::_convert_indices_from_coo_to_csr.out
+aten::_convert_indices_from_csr_to_coo
+aten::_convert_indices_from_csr_to_coo.out
+aten::_convolution
+aten::_convolution.out
+aten::_copy_from
+aten::_copy_from.out
+aten::_copy_from_and_resize
+aten::_copy_from_and_resize.out
+aten::_ctc_loss
+aten::_ctc_loss.Tensor
+aten::_ctc_loss.Tensor_out
+aten::_ctc_loss.out
+aten::_ctc_loss_backward
+aten::_ctc_loss_backward.Tensor
+aten::_ctc_loss_backward.out
+aten::_cudnn_ctc_loss
+aten::_cudnn_ctc_loss.Tensor
+aten::_cudnn_ctc_loss.out
+aten::_cudnn_init_dropout_state
+aten::_cudnn_init_dropout_state.out
+aten::_cudnn_rnn
+aten::_cudnn_rnn.out
+aten::_cudnn_rnn_backward
+aten::_cudnn_rnn_backward.out
+aten::_cudnn_rnn_flatten_weight
+aten::_cudnn_rnn_flatten_weight.out
+aten::_cummax_helper
+aten::_cummin_helper
+aten::_dimI
+aten::_dimV
+aten::_dirichlet_grad
+aten::_dirichlet_grad.out
+aten::_efficient_attention_backward
+aten::_efficient_attention_forward
+aten::_efficientzerotensor
+aten::_efficientzerotensor.out
+aten::_embedding_bag
+aten::_embedding_bag.out
+aten::_embedding_bag_dense_backward
+aten::_embedding_bag_dense_backward.out
+aten::_embedding_bag_forward_only
+aten::_embedding_bag_forward_only.out
+aten::_embedding_bag_per_sample_weights_backward
+aten::_embedding_bag_per_sample_weights_backward.out
+aten::_empty_affine_quantized
+aten::_empty_affine_quantized.out
+aten::_empty_per_channel_affine_quantized
+aten::_empty_per_channel_affine_quantized.out
+aten::_fake_quantize_learnable_per_channel_affine
+aten::_fake_quantize_learnable_per_channel_affine.out
+aten::_fake_quantize_learnable_per_channel_affine_backward
+aten::_fake_quantize_learnable_per_tensor_affine
+aten::_fake_quantize_learnable_per_tensor_affine.out
+aten::_fake_quantize_learnable_per_tensor_affine_backward
+aten::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams
+aten::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out
+aten::_fft_c2c
+aten::_fft_c2c.out
+aten::_fft_c2r
+aten::_fft_c2r.out
+aten::_fft_r2c
+aten::_fft_r2c.out
+aten::_flash_attention_backward
+aten::_flash_attention_forward
+aten::_foobar
+aten::_foobar.out
+aten::_foreach_abs
+aten::_foreach_abs.out
+aten::_foreach_abs_
+aten::_foreach_acos
+aten::_foreach_acos.out
+aten::_foreach_acos_
+aten::_foreach_add.List
+aten::_foreach_add.List_out
+aten::_foreach_add.Scalar
+aten::_foreach_add.ScalarList
+aten::_foreach_add.ScalarList_out
+aten::_foreach_add.Scalar_out
+aten::_foreach_add_.List
+aten::_foreach_add_.Scalar
+aten::_foreach_add_.ScalarList
+aten::_foreach_addcdiv.Scalar
+aten::_foreach_addcdiv.ScalarList
+aten::_foreach_addcdiv.ScalarList_out
+aten::_foreach_addcdiv.Scalar_out
+aten::_foreach_addcdiv.Tensor
+aten::_foreach_addcdiv.Tensor_out
+aten::_foreach_addcdiv_.Scalar
+aten::_foreach_addcdiv_.ScalarList
+aten::_foreach_addcdiv_.Tensor
+aten::_foreach_addcmul.Scalar
+aten::_foreach_addcmul.ScalarList
+aten::_foreach_addcmul.ScalarList_out
+aten::_foreach_addcmul.Scalar_out
+aten::_foreach_addcmul.Tensor
+aten::_foreach_addcmul.Tensor_out
+aten::_foreach_addcmul_.Scalar
+aten::_foreach_addcmul_.ScalarList
+aten::_foreach_addcmul_.Tensor
+aten::_foreach_asin
+aten::_foreach_asin.out
+aten::_foreach_asin_
+aten::_foreach_atan
+aten::_foreach_atan.out
+aten::_foreach_atan_
+aten::_foreach_ceil
+aten::_foreach_ceil.out
+aten::_foreach_ceil_
+aten::_foreach_clamp_max.List
+aten::_foreach_clamp_max.List_out
+aten::_foreach_clamp_max.Scalar
+aten::_foreach_clamp_max.ScalarList
+aten::_foreach_clamp_max.ScalarList_out
+aten::_foreach_clamp_max.Scalar_out
+aten::_foreach_clamp_max_.List
+aten::_foreach_clamp_max_.Scalar
+aten::_foreach_clamp_max_.ScalarList
+aten::_foreach_clamp_min.List
+aten::_foreach_clamp_min.List_out
+aten::_foreach_clamp_min.Scalar
+aten::_foreach_clamp_min.ScalarList
+aten::_foreach_clamp_min.ScalarList_out
+aten::_foreach_clamp_min.Scalar_out
+aten::_foreach_clamp_min_.List
+aten::_foreach_clamp_min_.Scalar
+aten::_foreach_clamp_min_.ScalarList
+aten::_foreach_cos
+aten::_foreach_cos.out
+aten::_foreach_cos_
+aten::_foreach_cosh
+aten::_foreach_cosh.out
+aten::_foreach_cosh_
+aten::_foreach_div.List
+aten::_foreach_div.List_out
+aten::_foreach_div.Scalar
+aten::_foreach_div.ScalarList
+aten::_foreach_div.ScalarList_out
+aten::_foreach_div.Scalar_out
+aten::_foreach_div_.List
+aten::_foreach_div_.Scalar
+aten::_foreach_div_.ScalarList
+aten::_foreach_erf
+aten::_foreach_erf.out
+aten::_foreach_erf_
+aten::_foreach_erfc
+aten::_foreach_erfc.out
+aten::_foreach_erfc_
+aten::_foreach_exp
+aten::_foreach_exp.out
+aten::_foreach_exp_
+aten::_foreach_expm1
+aten::_foreach_expm1.out
+aten::_foreach_expm1_
+aten::_foreach_floor
+aten::_foreach_floor.out
+aten::_foreach_floor_
+aten::_foreach_frac
+aten::_foreach_frac.out
+aten::_foreach_frac_
+aten::_foreach_lerp.List
+aten::_foreach_lerp.List_out
+aten::_foreach_lerp.Scalar
+aten::_foreach_lerp.Scalar_out
+aten::_foreach_lerp_.List
+aten::_foreach_lerp_.Scalar
+aten::_foreach_lgamma
+aten::_foreach_lgamma.out
+aten::_foreach_lgamma_
+aten::_foreach_log
+aten::_foreach_log.out
+aten::_foreach_log10
+aten::_foreach_log10.out
+aten::_foreach_log10_
+aten::_foreach_log1p
+aten::_foreach_log1p.out
+aten::_foreach_log1p_
+aten::_foreach_log2
+aten::_foreach_log2.out
+aten::_foreach_log2_
+aten::_foreach_log_
+aten::_foreach_maximum.List
+aten::_foreach_maximum.List_out
+aten::_foreach_maximum.Scalar
+aten::_foreach_maximum.ScalarList
+aten::_foreach_maximum.ScalarList_out
+aten::_foreach_maximum.Scalar_out
+aten::_foreach_maximum_.List
+aten::_foreach_maximum_.Scalar
+aten::_foreach_maximum_.ScalarList
+aten::_foreach_minimum.List
+aten::_foreach_minimum.List_out
+aten::_foreach_minimum.Scalar
+aten::_foreach_minimum.ScalarList
+aten::_foreach_minimum.ScalarList_out
+aten::_foreach_minimum.Scalar_out
+aten::_foreach_minimum_.List
+aten::_foreach_minimum_.Scalar
+aten::_foreach_minimum_.ScalarList
+aten::_foreach_mul.List
+aten::_foreach_mul.List_out
+aten::_foreach_mul.Scalar
+aten::_foreach_mul.ScalarList
+aten::_foreach_mul.ScalarList_out
+aten::_foreach_mul.Scalar_out
+aten::_foreach_mul_.List
+aten::_foreach_mul_.Scalar
+aten::_foreach_mul_.ScalarList
+aten::_foreach_neg
+aten::_foreach_neg.out
+aten::_foreach_neg_
+aten::_foreach_norm.Scalar
+aten::_foreach_norm.Scalar_out
+aten::_foreach_pow.List
+aten::_foreach_pow.List_out
+aten::_foreach_pow.Scalar
+aten::_foreach_pow.ScalarAndTensor
+aten::_foreach_pow.ScalarList
+aten::_foreach_pow.ScalarList_out
+aten::_foreach_pow.Scalar_out
+aten::_foreach_pow_.List
+aten::_foreach_pow_.Scalar
+aten::_foreach_pow_.ScalarList
+aten::_foreach_reciprocal
+aten::_foreach_reciprocal.out
+aten::_foreach_reciprocal_
+aten::_foreach_round
+aten::_foreach_round.out
+aten::_foreach_round_
+aten::_foreach_sigmoid
+aten::_foreach_sigmoid.out
+aten::_foreach_sigmoid_
+aten::_foreach_sin
+aten::_foreach_sin.out
+aten::_foreach_sin_
+aten::_foreach_sinh
+aten::_foreach_sinh.out
+aten::_foreach_sinh_
+aten::_foreach_sqrt
+aten::_foreach_sqrt.out
+aten::_foreach_sqrt_
+aten::_foreach_sub.List
+aten::_foreach_sub.List_out
+aten::_foreach_sub.Scalar
+aten::_foreach_sub.ScalarList
+aten::_foreach_sub.ScalarList_out
+aten::_foreach_sub.Scalar_out
+aten::_foreach_sub_.List
+aten::_foreach_sub_.Scalar
+aten::_foreach_sub_.ScalarList
+aten::_foreach_tan
+aten::_foreach_tan.out
+aten::_foreach_tan_
+aten::_foreach_tanh
+aten::_foreach_tanh.out
+aten::_foreach_tanh_
+aten::_foreach_trunc
+aten::_foreach_trunc.out
+aten::_foreach_trunc_
+aten::_foreach_zero
+aten::_foreach_zero.out
+aten::_foreach_zero_
+aten::_fused_adam
+aten::_fused_adam.out
+aten::_fused_adam_
+aten::_fused_adamw
+aten::_fused_adamw.out
+aten::_fused_adamw_
+aten::_fused_moving_avg_obs_fq_helper
+aten::_fused_moving_avg_obs_fq_helper.out
+aten::_fused_moving_avg_obs_fq_helper_functional
+aten::_fused_sdp_choice
+aten::_fw_primal
+aten::_fw_primal_copy
+aten::_fw_primal_copy.out
+aten::_grid_sampler_2d_cpu_fallback
+aten::_grid_sampler_2d_cpu_fallback.out
+aten::_has_same_storage_numel
+aten::_histogramdd_bin_edges
+aten::_histogramdd_bin_edges.out
+aten::_histogramdd_from_bin_cts
+aten::_histogramdd_from_bin_cts.out
+aten::_histogramdd_from_bin_tensors
+aten::_histogramdd_from_bin_tensors.out
+aten::_index_put_impl
+aten::_index_put_impl.out
+aten::_index_put_impl_
+aten::_indices
+aten::_indices_copy
+aten::_indices_copy.out
+aten::_is_all_true
+aten::_is_any_true
+aten::_linalg_check_errors
+aten::_linalg_det
+aten::_linalg_det.result
+aten::_linalg_eigh
+aten::_linalg_eigh.eigenvalues
+aten::_linalg_slogdet
+aten::_linalg_slogdet.sign
+aten::_linalg_solve_ex
+aten::_linalg_solve_ex.result
+aten::_linalg_svd
+aten::_linalg_svd.U
+aten::_local_scalar_dense
+aten::_logcumsumexp
+aten::_logcumsumexp.out
+aten::_lstm_mps
+aten::_lstm_mps.out
+aten::_make_dual
+aten::_make_dual_copy
+aten::_make_dual_copy.out
+aten::_make_per_channel_quantized_tensor
+aten::_make_per_channel_quantized_tensor.out
+aten::_make_per_tensor_quantized_tensor
+aten::_make_per_tensor_quantized_tensor.out
+aten::_masked_scale
+aten::_masked_scale.out
+aten::_masked_softmax
+aten::_masked_softmax.out
+aten::_masked_softmax_backward
+aten::_masked_softmax_backward.out
+aten::_mkldnn_reshape
+aten::_mkldnn_reshape.out
+aten::_mkldnn_transpose
+aten::_mkldnn_transpose.out
+aten::_mkldnn_transpose_
+aten::_mps_convolution
+aten::_mps_convolution.out
+aten::_mps_convolution_transpose
+aten::_mps_convolution_transpose.out
+aten::_native_batch_norm_legit.no_stats_out
+aten::_native_batch_norm_legit.out
+aten::_native_batch_norm_legit_no_training.out
+aten::_native_decoder_only_multi_head_attention
+aten::_native_decoder_only_multi_head_attention.out
+aten::_native_multi_head_attention
+aten::_native_multi_head_attention.out
+aten::_neg_view
+aten::_neg_view_copy
+aten::_neg_view_copy.out
+aten::_nested_from_padded
+aten::_nested_from_padded.out
+aten::_nested_from_padded_and_nested_example
+aten::_nested_from_padded_and_nested_example.out
+aten::_nested_select_backward
+aten::_nested_sum_backward
+aten::_nested_tensor_from_mask
+aten::_nested_tensor_from_mask.out
+aten::_nested_tensor_from_mask_left_aligned
+aten::_nested_tensor_from_tensor_list
+aten::_nested_tensor_from_tensor_list.out
+aten::_nested_tensor_offsets
+aten::_nested_tensor_size
+aten::_nested_tensor_size.out
+aten::_nested_tensor_softmax_with_shape
+aten::_nested_tensor_strides
+aten::_nested_tensor_strides.out
+aten::_nested_view_from_buffer
+aten::_nested_view_from_buffer_copy
+aten::_nested_view_from_buffer_copy.out
+aten::_new_zeros_with_same_feature_meta
+aten::_new_zeros_with_same_feature_meta.out
+aten::_nnpack_spatial_convolution
+aten::_nnpack_spatial_convolution.out
+aten::_nnz
+aten::_pack_padded_sequence
+aten::_pack_padded_sequence.out
+aten::_pdist_backward
+aten::_pdist_backward.out
+aten::_pdist_forward
+aten::_pdist_forward.out
+aten::_pin_memory
+aten::_pin_memory.out
+aten::_reshape_alias_copy
+aten::_reshape_alias_copy.out
+aten::_reshape_copy
+aten::_resize_output
+aten::_resize_output.out
+aten::_resize_output_
+aten::_sample_dirichlet
+aten::_sample_dirichlet.out
+aten::_scaled_dot_product_efficient_attention
+aten::_scaled_dot_product_efficient_attention_backward
+aten::_scaled_dot_product_flash_attention
+aten::_scaled_dot_product_flash_attention_backward
+aten::_segment_reduce_backward
+aten::_segment_reduce_backward.out
+aten::_slow_conv2d_backward.grad_input
+aten::_slow_conv2d_backward.output_mask
+aten::_slow_conv2d_backward.output_mask_out
+aten::_slow_conv2d_forward
+aten::_slow_conv2d_forward.output
+aten::_sparse_addmm
+aten::_sparse_addmm.out
+aten::_sparse_broadcast_to
+aten::_sparse_broadcast_to_copy
+aten::_sparse_broadcast_to_copy.out
+aten::_sparse_coo_tensor_with_dims
+aten::_sparse_coo_tensor_with_dims.out
+aten::_sparse_coo_tensor_with_dims_and_tensors
+aten::_sparse_coo_tensor_with_dims_and_tensors.out
+aten::_sparse_csr_prod.dim_dtype
+aten::_sparse_csr_prod.dim_dtype_out
+aten::_sparse_csr_sum.dim_dtype
+aten::_sparse_csr_sum.dim_dtype_out
+aten::_sparse_log_softmax
+aten::_sparse_log_softmax.out
+aten::_sparse_log_softmax_backward_data
+aten::_sparse_log_softmax_backward_data.out
+aten::_sparse_mm_reduce_impl
+aten::_sparse_mm_reduce_impl_backward
+aten::_sparse_softmax
+aten::_sparse_softmax.out
+aten::_sparse_softmax_backward_data
+aten::_sparse_softmax_backward_data.out
+aten::_sparse_sparse_matmul
+aten::_sparse_sparse_matmul.out
+aten::_sparse_sum.dim
+aten::_sparse_sum.dim_out
+aten::_sparse_sum_backward
+aten::_sparse_sum_backward.out
+aten::_spdiags
+aten::_spdiags.out
+aten::_stack
+aten::_stack.out
+aten::_standard_gamma
+aten::_standard_gamma.out
+aten::_standard_gamma_grad
+aten::_standard_gamma_grad.out
+aten::_test_autograd_multiple_dispatch.fullcoverage
+aten::_test_autograd_multiple_dispatch.fullcoverage_out
+aten::_test_autograd_multiple_dispatch_view
+aten::_test_autograd_multiple_dispatch_view_copy
+aten::_test_autograd_multiple_dispatch_view_copy.out
+aten::_test_optional_filled_intlist
+aten::_test_optional_filled_intlist.out
+aten::_test_optional_floatlist
+aten::_test_optional_floatlist.out
+aten::_test_optional_intlist
+aten::_test_optional_intlist.out
+aten::_test_warn_in_autograd
+aten::_test_warn_in_autograd.out
+aten::_thnn_fused_gru_cell
+aten::_thnn_fused_gru_cell.out
+aten::_thnn_fused_gru_cell_backward
+aten::_thnn_fused_gru_cell_backward.out
+aten::_thnn_fused_lstm_cell
+aten::_thnn_fused_lstm_cell.out
+aten::_thnn_fused_lstm_cell_backward_impl
+aten::_thnn_fused_lstm_cell_backward_impl.out
+aten::_to_dense
+aten::_to_dense.out
+aten::_transform_bias_rescale_qkv
+aten::_transform_bias_rescale_qkv.out
+aten::_transformer_decoder_only_layer_fwd
+aten::_transformer_decoder_only_layer_fwd.out
+aten::_transformer_encoder_layer_fwd
+aten::_transformer_encoder_layer_fwd.out
+aten::_trilinear
+aten::_trilinear.out
+aten::_triton_multi_head_attention
+aten::_triton_multi_head_attention.out
+aten::_triton_scaled_dot_attention
+aten::_triton_scaled_dot_attention.out
+aten::_unique
+aten::_unique.out
+aten::_unique2
+aten::_unique2.out
+aten::_upsample_bicubic2d_aa
+aten::_upsample_bicubic2d_aa.out
+aten::_upsample_bicubic2d_aa_backward
+aten::_upsample_bicubic2d_aa_backward.grad_input
+aten::_upsample_bilinear2d_aa
+aten::_upsample_bilinear2d_aa.out
+aten::_upsample_bilinear2d_aa_backward
+aten::_upsample_bilinear2d_aa_backward.grad_input
+aten::_upsample_nearest_exact1d
+aten::_upsample_nearest_exact1d.out
+aten::_upsample_nearest_exact1d_backward
+aten::_upsample_nearest_exact1d_backward.grad_input
+aten::_upsample_nearest_exact2d
+aten::_upsample_nearest_exact2d.out
+aten::_upsample_nearest_exact2d_backward
+aten::_upsample_nearest_exact2d_backward.grad_input
+aten::_upsample_nearest_exact3d
+aten::_upsample_nearest_exact3d.out
+aten::_upsample_nearest_exact3d_backward
+aten::_upsample_nearest_exact3d_backward.grad_input
+aten::_use_cudnn_ctc_loss
+aten::_use_cudnn_ctc_loss.Tensor
+aten::_validate_compressed_sparse_indices
+aten::_values
+aten::_values_copy
+aten::_values_copy.out
+aten::_weight_norm_interface
+aten::_weight_norm_interface.out
+aten::_weight_norm_interface_backward
+aten::_weight_norm_interface_backward.out
+aten::adaptive_avg_pool2d.out
+aten::adaptive_avg_pool3d.out
+aten::adaptive_avg_pool3d_backward.grad_input
+aten::adaptive_max_pool2d
+aten::adaptive_max_pool2d.out
+aten::adaptive_max_pool2d_backward
+aten::adaptive_max_pool2d_backward.grad_input
+aten::adaptive_max_pool3d
+aten::adaptive_max_pool3d.out
+aten::adaptive_max_pool3d_backward
+aten::adaptive_max_pool3d_backward.grad_input
+aten::addbmm
+aten::addbmm.out
+aten::addmv
+aten::addmv.out
+aten::addr_
+aten::affine_grid_generator
+aten::affine_grid_generator.out
+aten::alias_copy
+aten::alias_copy.out
+aten::all_gather_into_tensor
+aten::all_reduce
+aten::allclose
+aten::aminmax
+aten::aminmax.out
+aten::angle
+aten::angle.out
+aten::argmax
+aten::argmax.out
+aten::argmin
+aten::argmin.out
+aten::argsort.stable
+aten::argsort.stable_out
+aten::as_strided
+aten::as_strided_
+aten::as_strided_copy
+aten::as_strided_copy.out
+aten::avg_pool2d
+aten::avg_pool2d.out
+aten::avg_pool2d_backward
+aten::avg_pool2d_backward.grad_input
+aten::avg_pool3d
+aten::avg_pool3d.out
+aten::avg_pool3d_backward
+aten::avg_pool3d_backward.grad_input
+aten::baddbmm
+aten::baddbmm.out
+aten::bartlett_window
+aten::bartlett_window.out
+aten::bartlett_window.periodic
+aten::bartlett_window.periodic_out
+aten::batch_norm_backward_elemt
+aten::batch_norm_backward_elemt.out
+aten::batch_norm_backward_reduce
+aten::batch_norm_backward_reduce.out
+aten::batch_norm_elemt
+aten::batch_norm_elemt.out
+aten::batch_norm_gather_stats
+aten::batch_norm_gather_stats.out
+aten::batch_norm_gather_stats_with_counts
+aten::batch_norm_gather_stats_with_counts.out
+aten::batch_norm_stats
+aten::batch_norm_stats.out
+aten::batch_norm_update_stats
+aten::batch_norm_update_stats.out
+aten::bernoulli
+aten::bernoulli.Tensor
+aten::bernoulli.Tensor_out
+aten::bernoulli.float_out
+aten::bernoulli.out
+aten::bernoulli.p
+aten::bernoulli_.Tensor
+aten::bernoulli_.float
+aten::bincount
+aten::bincount.out
+aten::binomial
+aten::binomial.out
+aten::blackman_window
+aten::blackman_window.out
+aten::blackman_window.periodic
+aten::blackman_window.periodic_out
+aten::block_diag
+aten::block_diag.out
+aten::bmm
+aten::bmm.out
+aten::ccol_indices
+aten::ccol_indices_copy
+aten::ccol_indices_copy.out
+aten::channel_shuffle
+aten::channel_shuffle.out
+aten::cholesky
+aten::cholesky.out
+aten::cholesky_inverse
+aten::cholesky_inverse.out
+aten::cholesky_solve
+aten::cholesky_solve.out
+aten::col_indices
+aten::col_indices_copy
+aten::col_indices_copy.out
+aten::conv_depthwise3d
+aten::conv_depthwise3d.out
+aten::conv_tbc
+aten::conv_tbc.out
+aten::convolution
+aten::convolution.out
+aten::convolution_backward
+aten::convolution_backward.out
+aten::convolution_backward_overrideable
+aten::convolution_backward_overrideable.out
+aten::convolution_overrideable
+aten::convolution_overrideable.out
+aten::copy
+aten::copy.out
+aten::copy_
+aten::copy_sparse_to_sparse
+aten::copy_sparse_to_sparse.out
+aten::copy_sparse_to_sparse_
+aten::count_nonzero
+aten::count_nonzero.dim_IntList
+aten::count_nonzero.dim_IntList_out
+aten::count_nonzero.out
+aten::crow_indices
+aten::crow_indices_copy
+aten::crow_indices_copy.out
+aten::cudnn_affine_grid_generator
+aten::cudnn_affine_grid_generator.out
+aten::cudnn_affine_grid_generator_backward
+aten::cudnn_affine_grid_generator_backward.out
+aten::cudnn_convolution
+aten::cudnn_convolution.out
+aten::cudnn_convolution_add_relu
+aten::cudnn_convolution_add_relu.out
+aten::cudnn_convolution_relu
+aten::cudnn_convolution_relu.out
+aten::cudnn_convolution_transpose
+aten::cudnn_convolution_transpose.out
+aten::cudnn_grid_sampler
+aten::cudnn_grid_sampler.out
+aten::cudnn_grid_sampler_backward
+aten::cudnn_grid_sampler_backward.out
+aten::cummax
+aten::cummax.out
+aten::cummin
+aten::cummin.out
+aten::cumprod
+aten::cumprod.out
+aten::deg2rad
+aten::deg2rad.out
+aten::deg2rad_
+aten::dense_dim
+aten::dequantize.self
+aten::dequantize.self_out
+aten::dequantize.tensors
+aten::dequantize.tensors_out
+aten::detach_
+aten::detach_copy
+aten::detach_copy.out
+aten::dist
+aten::dist.out
+aten::embedding_renorm
+aten::embedding_renorm.out
+aten::embedding_renorm_
+aten::empty.memory_format
+aten::empty.names
+aten::empty.names_out
+aten::empty_permuted
+aten::empty_permuted.out
+aten::empty_quantized
+aten::empty_quantized.out
+aten::equal
+aten::expand_copy
+aten::expand_copy.out
+aten::fake_quantize_per_channel_affine_cachemask
+aten::fake_quantize_per_channel_affine_cachemask.out
+aten::fake_quantize_per_tensor_affine_cachemask
+aten::fake_quantize_per_tensor_affine_cachemask.out
+aten::fft_fftfreq
+aten::fft_fftfreq.out
+aten::fft_rfftfreq
+aten::fft_rfftfreq.out
+aten::fill.Scalar_out
+aten::fill.Tensor_out
+aten::fractional_max_pool2d
+aten::fractional_max_pool2d.output
+aten::fractional_max_pool2d_backward
+aten::fractional_max_pool2d_backward.grad_input
+aten::fractional_max_pool3d
+aten::fractional_max_pool3d.output
+aten::fractional_max_pool3d_backward
+aten::fractional_max_pool3d_backward.grad_input
+aten::frexp.Tensor
+aten::frexp.Tensor_out
+aten::from_file
+aten::from_file.out
+aten::full_like
+aten::full_like.out
+aten::gather
+aten::gather.out
+aten::geqrf
+aten::geqrf.a
+aten::glu_backward_jvp
+aten::glu_backward_jvp.out
+aten::glu_jvp
+aten::glu_jvp.out
+aten::grid_sampler_2d_backward
+aten::grid_sampler_2d_backward.out
+aten::grid_sampler_3d
+aten::grid_sampler_3d.out
+aten::grid_sampler_3d_backward
+aten::grid_sampler_3d_backward.out
+aten::hamming_window
+aten::hamming_window.out
+aten::hamming_window.periodic
+aten::hamming_window.periodic_alpha
+aten::hamming_window.periodic_alpha_beta
+aten::hamming_window.periodic_alpha_beta_out
+aten::hamming_window.periodic_alpha_out
+aten::hamming_window.periodic_out
+aten::hann_window
+aten::hann_window.out
+aten::hann_window.periodic
+aten::hann_window.periodic_out
+aten::histc
+aten::histc.out
+aten::histogram.bin_ct
+aten::histogram.bin_ct_out
+aten::histogram.bins_tensor
+aten::histogram.bins_tensor_out
+aten::hspmm
+aten::hspmm.out
+aten::i0
+aten::i0.out
+aten::index.Tensor
+aten::index.Tensor_out
+aten::index_put
+aten::index_put.out
+aten::index_reduce
+aten::index_reduce.out
+aten::indices
+aten::indices_copy
+aten::indices_copy.out
+aten::int_repr
+aten::int_repr.out
+aten::is_coalesced
+aten::is_pinned
+aten::is_set_to
+aten::isin.Scalar_Tensor
+aten::isin.Scalar_Tensor_out
+aten::isin.Tensor_Scalar
+aten::isin.Tensor_Scalar_out
+aten::isin.Tensor_Tensor
+aten::isin.Tensor_Tensor_out
+aten::kaiser_window
+aten::kaiser_window.beta
+aten::kaiser_window.beta_out
+aten::kaiser_window.out
+aten::kaiser_window.periodic
+aten::kaiser_window.periodic_out
+aten::kthvalue
+aten::kthvalue.values
+aten::lift_fresh_copy
+aten::lift_fresh_copy.out
+aten::linalg_cholesky_ex
+aten::linalg_cholesky_ex.L
+aten::linalg_cross
+aten::linalg_cross.out
+aten::linalg_eig
+aten::linalg_eig.out
+aten::linalg_householder_product
+aten::linalg_householder_product.out
+aten::linalg_inv_ex
+aten::linalg_inv_ex.inverse
+aten::linalg_ldl_factor_ex
+aten::linalg_ldl_factor_ex.out
+aten::linalg_ldl_solve
+aten::linalg_ldl_solve.out
+aten::linalg_lstsq
+aten::linalg_lstsq.out
+aten::linalg_lu
+aten::linalg_lu.out
+aten::linalg_lu_factor_ex
+aten::linalg_lu_factor_ex.out
+aten::linalg_lu_solve
+aten::linalg_lu_solve.out
+aten::linalg_matrix_exp
+aten::linalg_matrix_exp.out
+aten::linalg_pinv.atol_rtol_tensor
+aten::linalg_pinv.atol_rtol_tensor_out
+aten::linalg_qr
+aten::linalg_qr.out
+aten::linalg_solve_triangular
+aten::linalg_solve_triangular.out
+aten::linear.out
+aten::linear_backward
+aten::linear_backward.out
+aten::log_softmax.int_out
+aten::logaddexp2
+aten::logaddexp2.out
+aten::logcumsumexp
+aten::logcumsumexp.out
+aten::logit_backward.grad_input
+aten::lstm_mps_backward
+aten::lstm_mps_backward.out
+aten::lu_unpack
+aten::lu_unpack.out
+aten::masked_scatter
+aten::masked_scatter.out
+aten::masked_scatter_
+aten::masked_select
+aten::masked_select.out
+aten::matmul_backward
+aten::matmul_backward.out
+aten::max
+aten::max.dim
+aten::max.dim_max
+aten::max.unary_out
+aten::max_pool2d_backward
+aten::max_pool2d_backward.out
+aten::max_pool2d_with_indices
+aten::max_pool2d_with_indices.out
+aten::max_pool2d_with_indices_backward
+aten::max_pool2d_with_indices_backward.grad_input
+aten::max_pool3d_with_indices
+aten::max_pool3d_with_indices.out
+aten::max_pool3d_with_indices_backward
+aten::max_pool3d_with_indices_backward.grad_input
+aten::max_unpool2d
+aten::max_unpool2d.out
+aten::max_unpool3d
+aten::max_unpool3d.out
+aten::median
+aten::median.dim
+aten::median.dim_values
+aten::median.out
+aten::min
+aten::min.dim
+aten::min.dim_min
+aten::miopen_batch_norm
+aten::miopen_batch_norm.out
+aten::miopen_batch_norm_backward
+aten::miopen_batch_norm_backward.out
+aten::miopen_convolution
+aten::miopen_convolution.out
+aten::miopen_convolution_add_relu
+aten::miopen_convolution_relu
+aten::miopen_convolution_transpose
+aten::miopen_convolution_transpose.out
+aten::miopen_depthwise_convolution
+aten::miopen_depthwise_convolution.out
+aten::miopen_rnn
+aten::miopen_rnn.out
+aten::miopen_rnn_backward
+aten::miopen_rnn_backward.out
+aten::mkldnn_adaptive_avg_pool2d
+aten::mkldnn_adaptive_avg_pool2d.out
+aten::mkldnn_adaptive_avg_pool2d_backward
+aten::mkldnn_adaptive_avg_pool2d_backward.out
+aten::mkldnn_convolution
+aten::mkldnn_convolution.out
+aten::mkldnn_linear
+aten::mkldnn_linear.out
+aten::mkldnn_linear_backward
+aten::mkldnn_linear_backward.out
+aten::mkldnn_linear_backward_input
+aten::mkldnn_linear_backward_input.out
+aten::mkldnn_linear_backward_weights
+aten::mkldnn_linear_backward_weights.out
+aten::mkldnn_max_pool2d
+aten::mkldnn_max_pool2d.out
+aten::mkldnn_max_pool2d_backward
+aten::mkldnn_max_pool2d_backward.out
+aten::mkldnn_max_pool3d
+aten::mkldnn_max_pool3d.out
+aten::mkldnn_max_pool3d_backward
+aten::mkldnn_max_pool3d_backward.out
+aten::mkldnn_reorder_conv2d_weight
+aten::mkldnn_reorder_conv2d_weight.out
+aten::mkldnn_reorder_conv3d_weight
+aten::mkldnn_reorder_conv3d_weight.out
+aten::mkldnn_rnn_layer
+aten::mkldnn_rnn_layer.out
+aten::mkldnn_rnn_layer_backward
+aten::mkldnn_rnn_layer_backward.out
+aten::mm
+aten::mm.out
+aten::mode
+aten::mode.values
+aten::mps_convolution_backward
+aten::mps_convolution_backward.out
+aten::mps_convolution_transpose_backward
+aten::mps_convolution_transpose_backward.out
+aten::multi_margin_loss
+aten::multi_margin_loss.out
+aten::multi_margin_loss_backward
+aten::multi_margin_loss_backward.grad_input
+aten::multilabel_margin_loss_backward
+aten::multilabel_margin_loss_backward.grad_input
+aten::multilabel_margin_loss_forward
+aten::multilabel_margin_loss_forward.output
+aten::multinomial
+aten::multinomial.out
+aten::nanmedian
+aten::nanmedian.dim
+aten::nanmedian.dim_values
+aten::nanmedian.out
+aten::nansum
+aten::nansum.out
+aten::native_group_norm.out
+aten::native_norm
+aten::native_norm.ScalarOpt_dim_dtype
+aten::native_norm.ScalarOpt_dim_dtype_out
+aten::native_norm.out
+aten::nll_loss2d_forward
+aten::nll_loss2d_forward.output
+aten::nonzero
+aten::nonzero.out
+aten::normal_functional
+aten::ones.names
+aten::ones.names_out
+aten::ones.out
+aten::ormqr
+aten::ormqr.out
+aten::permute_copy
+aten::permute_copy.out
+aten::pixel_shuffle
+aten::pixel_shuffle.out
+aten::pixel_unshuffle
+aten::pixel_unshuffle.out
+aten::poisson
+aten::poisson.out
+aten::polar
+aten::polar.out
+aten::polygamma
+aten::polygamma.out
+aten::polygamma_
+aten::put
+aten::put.out
+aten::put_
+aten::q_per_channel_axis
+aten::q_per_channel_scales
+aten::q_per_channel_scales.out
+aten::q_per_channel_zero_points
+aten::q_per_channel_zero_points.out
+aten::q_scale
+aten::q_zero_point
+aten::qscheme
+aten::quantize_per_channel
+aten::quantize_per_channel.out
+aten::quantize_per_tensor
+aten::quantize_per_tensor.out
+aten::quantize_per_tensor.tensor_qparams
+aten::quantize_per_tensor.tensor_qparams_out
+aten::quantize_per_tensor.tensors
+aten::quantize_per_tensor.tensors_out
+aten::quantize_per_tensor_dynamic
+aten::quantize_per_tensor_dynamic.out
+aten::quantized_batch_norm
+aten::quantized_batch_norm.out
+aten::quantized_gru.data
+aten::quantized_gru.data_legacy
+aten::quantized_gru.input
+aten::quantized_gru.input_legacy
+aten::quantized_lstm.data
+aten::quantized_lstm.data_legacy
+aten::quantized_lstm.input
+aten::quantized_lstm.input_legacy
+aten::quantized_max_pool1d
+aten::quantized_max_pool1d.out
+aten::quantized_max_pool2d
+aten::quantized_max_pool2d.out
+aten::rad2deg
+aten::rad2deg.out
+aten::rad2deg_
+aten::rand
+aten::rand.generator
+aten::rand.generator_with_names
+aten::rand.generator_with_names_out
+aten::rand.names
+aten::rand.names_out
+aten::rand.out
+aten::rand_like
+aten::rand_like.out
+aten::randint
+aten::randint.generator
+aten::randint.generator_out
+aten::randint.low
+aten::randint.low_generator
+aten::randint.low_generator_out
+aten::randint.low_out
+aten::randint.out
+aten::randint_like
+aten::randint_like.low_dtype
+aten::randint_like.low_dtype_out
+aten::randint_like.out
+aten::randn.generator
+aten::randn.generator_with_names
+aten::randn.generator_with_names_out
+aten::randn.names
+aten::randn.names_out
+aten::randn_like
+aten::randn_like.out
+aten::random
+aten::random.from
+aten::random.from_out
+aten::random.out
+aten::random.to
+aten::random.to_out
+aten::random_
+aten::random_.from
+aten::random_.to
+aten::randperm
+aten::randperm.generator
+aten::randperm.generator_out
+aten::randperm.out
+aten::range
+aten::range.out
+aten::range.out_
+aten::range.step
+aten::record_stream
+aten::reflection_pad1d
+aten::reflection_pad1d.out
+aten::reflection_pad1d_backward
+aten::reflection_pad1d_backward.grad_input
+aten::reflection_pad2d
+aten::reflection_pad2d.out
+aten::reflection_pad2d_backward
+aten::reflection_pad2d_backward.grad_input
+aten::reflection_pad3d
+aten::reflection_pad3d.out
+aten::reflection_pad3d_backward
+aten::reflection_pad3d_backward.grad_input
+aten::renorm
+aten::renorm.out
+aten::repeat_interleave.Tensor
+aten::repeat_interleave.Tensor_out
+aten::replication_pad1d
+aten::replication_pad1d.out
+aten::replication_pad1d_backward
+aten::replication_pad1d_backward.grad_input
+aten::replication_pad2d
+aten::replication_pad2d.out
+aten::replication_pad2d_backward
+aten::replication_pad2d_backward.grad_input
+aten::replication_pad3d
+aten::replication_pad3d.out
+aten::replication_pad3d_backward
+aten::replication_pad3d_backward.grad_input
+aten::resize
+aten::resize.out
+aten::resize_
+aten::resize_as
+aten::resize_as.out
+aten::resize_as_
+aten::resize_as_sparse
+aten::resize_as_sparse.out
+aten::resize_as_sparse_
+aten::round
+aten::round.decimals
+aten::round.decimals_out
+aten::round.out
+aten::row_indices
+aten::row_indices_copy
+aten::row_indices_copy.out
+aten::rrelu_with_noise
+aten::rrelu_with_noise.out
+aten::rrelu_with_noise_
+aten::rsub.Scalar_out
+aten::rsub.Tensor_out
+aten::scalar_tensor
+aten::scalar_tensor.out
+aten::scatter.reduce
+aten::scatter.reduce_out
+aten::scatter.src
+aten::scatter.src_out
+aten::scatter.value
+aten::scatter.value_out
+aten::scatter.value_reduce
+aten::scatter.value_reduce_out
+aten::scatter_add
+aten::scatter_add.out
+aten::scatter_reduce.two
+aten::scatter_reduce.two_out
+aten::searchsorted.Scalar
+aten::searchsorted.Scalar_out
+aten::searchsorted.Tensor
+aten::searchsorted.Tensor_out
+aten::segment_reduce
+aten::segment_reduce.out
+aten::select.int
+aten::select_copy.int
+aten::select_copy.int_out
+aten::select_scatter
+aten::select_scatter.out
+aten::set
+aten::set.out
+aten::set.source_Storage
+aten::set.source_Storage_out
+aten::set.source_Storage_storage_offset
+aten::set.source_Storage_storage_offset_out
+aten::set.source_Tensor
+aten::set.source_Tensor_out
+aten::set_
+aten::set_.source_Storage
+aten::set_.source_Storage_storage_offset
+aten::set_.source_Tensor
+aten::slice_copy.Tensor
+aten::slice_copy.Tensor_out
+aten::slice_scatter
+aten::slice_scatter.out
+aten::slow_conv3d_forward
+aten::slow_conv3d_forward.output
+aten::slow_conv_dilated2d
+aten::slow_conv_dilated2d.out
+aten::slow_conv_dilated3d
+aten::slow_conv_dilated3d.out
+aten::slow_conv_transpose2d
+aten::slow_conv_transpose2d.out
+aten::slow_conv_transpose3d
+aten::slow_conv_transpose3d.out
+aten::smooth_l1_loss
+aten::smooth_l1_loss.out
+aten::smooth_l1_loss_backward
+aten::smooth_l1_loss_backward.grad_input
+aten::softmax.int_out
+aten::sort
+aten::sort.stable
+aten::sort.values
+aten::sort.values_stable
+aten::sparse_coo_tensor.size
+aten::sparse_coo_tensor.size_out
+aten::sparse_dim
+aten::sparse_mask
+aten::sparse_mask.out
+aten::sparse_resize
+aten::sparse_resize.out
+aten::sparse_resize_
+aten::sparse_resize_and_clear
+aten::sparse_resize_and_clear.out
+aten::sparse_resize_and_clear_
+aten::sparse_sampled_addmm
+aten::sparse_sampled_addmm.out
+aten::special_airy_ai
+aten::special_airy_ai.out
+aten::special_bessel_y0
+aten::special_bessel_y0.out
+aten::special_bessel_y1
+aten::special_bessel_y1.out
+aten::special_chebyshev_polynomial_t
+aten::special_chebyshev_polynomial_t.n_scalar_out
+aten::special_chebyshev_polynomial_t.out
+aten::special_chebyshev_polynomial_u
+aten::special_chebyshev_polynomial_u.n_scalar_out
+aten::special_chebyshev_polynomial_u.out
+aten::special_chebyshev_polynomial_v
+aten::special_chebyshev_polynomial_v.n_scalar_out
+aten::special_chebyshev_polynomial_v.out
+aten::special_chebyshev_polynomial_w
+aten::special_chebyshev_polynomial_w.n_scalar_out
+aten::special_chebyshev_polynomial_w.out
+aten::special_hermite_polynomial_h
+aten::special_hermite_polynomial_h.n_scalar_out
+aten::special_hermite_polynomial_h.out
+aten::special_hermite_polynomial_he
+aten::special_hermite_polynomial_he.n_scalar_out
+aten::special_hermite_polynomial_he.out
+aten::special_laguerre_polynomial_l
+aten::special_laguerre_polynomial_l.n_scalar_out
+aten::special_laguerre_polynomial_l.out
+aten::special_legendre_polynomial_p
+aten::special_legendre_polynomial_p.n_scalar_out
+aten::special_legendre_polynomial_p.out
+aten::special_modified_bessel_i0
+aten::special_modified_bessel_i0.out
+aten::special_modified_bessel_i1
+aten::special_modified_bessel_i1.out
+aten::special_modified_bessel_k0
+aten::special_modified_bessel_k0.out
+aten::special_modified_bessel_k1
+aten::special_modified_bessel_k1.out
+aten::special_scaled_modified_bessel_k0
+aten::special_scaled_modified_bessel_k0.out
+aten::special_scaled_modified_bessel_k1
+aten::special_scaled_modified_bessel_k1.out
+aten::special_shifted_chebyshev_polynomial_t
+aten::special_shifted_chebyshev_polynomial_t.n_scalar_out
+aten::special_shifted_chebyshev_polynomial_t.out
+aten::special_shifted_chebyshev_polynomial_u
+aten::special_shifted_chebyshev_polynomial_u.n_scalar_out
+aten::special_shifted_chebyshev_polynomial_u.out
+aten::special_shifted_chebyshev_polynomial_v
+aten::special_shifted_chebyshev_polynomial_v.n_scalar_out
+aten::special_shifted_chebyshev_polynomial_v.out
+aten::special_shifted_chebyshev_polynomial_w
+aten::special_shifted_chebyshev_polynomial_w.n_scalar_out
+aten::special_shifted_chebyshev_polynomial_w.out
+aten::split_copy.Tensor
+aten::split_copy.Tensor_out
+aten::split_with_sizes_copy
+aten::split_with_sizes_copy.out
+aten::squeeze_
+aten::squeeze_.dim
+aten::squeeze_.dims
+aten::squeeze_copy
+aten::squeeze_copy.dim
+aten::squeeze_copy.dim_out
+aten::squeeze_copy.dims
+aten::squeeze_copy.dims_out
+aten::squeeze_copy.out
+aten::sspaddmm.out
+aten::std_mean.correction_out
+aten::t_
+aten::t_copy
+aten::t_copy.out
+aten::take
+aten::take.out
+aten::tensordot.out
+aten::to_mkldnn
+aten::to_mkldnn.out
+aten::to_padded_tensor
+aten::to_padded_tensor.out
+aten::to_sparse
+aten::to_sparse.out
+aten::to_sparse.sparse_dim
+aten::to_sparse.sparse_dim_out
+aten::to_sparse_bsc
+aten::to_sparse_bsc.out
+aten::to_sparse_bsr
+aten::to_sparse_bsr.out
+aten::to_sparse_csc
+aten::to_sparse_csc.out
+aten::to_sparse_csr
+aten::to_sparse_csr.out
+aten::topk
+aten::topk.values
+aten::transpose_
+aten::transpose_copy.int
+aten::transpose_copy.int_out
+aten::triangular_solve
+aten::triangular_solve.X
+aten::unbind_copy.int
+aten::unbind_copy.int_out
+aten::unique_consecutive
+aten::unique_consecutive.out
+aten::unique_dim
+aten::unique_dim.out
+aten::unique_dim_consecutive
+aten::unique_dim_consecutive.out
+aten::unsafe_split.Tensor_out
+aten::unsqueeze_
+aten::unsqueeze_copy
+aten::unsqueeze_copy.out
+aten::upsample_bicubic2d.out
+aten::upsample_bicubic2d_backward
+aten::upsample_bicubic2d_backward.grad_input
+aten::upsample_bilinear2d.out
+aten::upsample_bilinear2d_backward
+aten::upsample_bilinear2d_backward.grad_input
+aten::upsample_linear1d
+aten::upsample_linear1d.out
+aten::upsample_linear1d_backward
+aten::upsample_linear1d_backward.grad_input
+aten::upsample_nearest1d.out
+aten::upsample_nearest1d_backward
+aten::upsample_nearest1d_backward.grad_input
+aten::upsample_nearest2d.out
+aten::upsample_nearest2d_backward
+aten::upsample_nearest2d_backward.grad_input
+aten::upsample_nearest3d.out
+aten::upsample_nearest3d_backward
+aten::upsample_nearest3d_backward.grad_input
+aten::upsample_trilinear3d
+aten::upsample_trilinear3d.out
+aten::upsample_trilinear3d_backward
+aten::upsample_trilinear3d_backward.grad_input
+aten::values
+aten::values_copy
+aten::values_copy.out
+aten::vdot
+aten::vdot.out
+aten::view_as_complex
+aten::view_as_complex_copy
+aten::view_as_complex_copy.out
+aten::view_as_real
+aten::view_as_real_copy
+aten::view_as_real_copy.out
+aten::view_copy
+aten::view_copy.dtype
+aten::view_copy.dtype_out
+aten::view_copy.out
+aten::wait_tensor
+aten::zeros.names
+aten::zeros.names_out
+aten::zeros.out
diff --git a/test/export/test_export.py b/test/export/test_export.py
new file mode 100644
index 000000000000..afa81736cf75
--- /dev/null
+++ b/test/export/test_export.py
@@ -0,0 +1,79 @@
+# Owner(s): ["module: dynamo"]
+from torch.testing._internal.common_utils import run_tests, TestCase
+from functorch.experimental.control_flow import cond
+from torch._export import do_not_use_experimental_export
+import torch._dynamo as torchdynamo
+import torch
+import unittest
+
+class TestExport(TestCase):
+    @unittest.skip("dynamo failure -> RuntimeError: Could not infer dtype of SymBool")
+    def test_export_cond(self):
+        def true_fn(x):
+            return x.sin()
+
+        def false_fn(x):
+            return x.cos()
+
+        def foo(x):
+            return cond(torch.tensor(x.shape[0] > 4), true_fn, false_fn, [x])
+
+        exported_program = do_not_use_experimental_export(foo, (torch.ones(6, 4, requires_grad=True),))
+        print(exported_program.graph_module.graph)
+
+    @unittest.skipIf(not torchdynamo.is_dynamo_supported(), "dynamo doesn't support")
+    def test_export_simple_model_with_attr(self):
+        class Foo(torch.nn.Module):
+            def __init__(self, float_val):
+                super().__init__()
+                self.float_val = float_val
+
+            def forward(self, x):
+                y = x + self.float_val
+                return y.cos()
+
+        inp = (torch.ones(6, 4, requires_grad=True),)
+        mod = Foo(0.5)
+
+        exported_program = do_not_use_experimental_export(mod, inp)
+        self.assertEqual(exported_program.fw_module(*inp)[0], mod(*inp))
+
+    @unittest.skipIf(not torchdynamo.is_dynamo_supported(), "dynamo doesn't support")
+    def test_export_simple_model(self):
+        class Foo(torch.nn.Module):
+            def __init__(self, float_val):
+                super().__init__()
+                self.float_val = float_val
+
+            def forward(self, x):
+                return x.cos()
+
+        inp = (torch.ones(6, 4, requires_grad=True),)
+        mod = Foo(0.5)
+
+        exported_program = do_not_use_experimental_export(mod, inp)
+        self.assertEqual(exported_program.fw_module(*inp)[0], mod(*inp))
+
+    @unittest.skipIf(not torchdynamo.is_dynamo_supported(), "dynamo doesn't support")
+    def test_export_simple_model_buffer_mutation(self):
+        class Foo(torch.nn.Module):
+            def __init__(self, float_val):
+                super().__init__()
+                self.register_buffer("buffer1", torch.ones(6, 1))
+
+            def forward(self, x):
+                self.buffer1.add_(2)
+                return x.cos() + self.buffer1.sin()
+
+        inp = (torch.ones(6, 4, requires_grad=True),)
+        mod = Foo(0.5)
+
+        exported_program = do_not_use_experimental_export(mod, inp)
+        mutated_buffer, output = exported_program.fw_module(*inp)
+        # TODO (tmanlaibaatar) enable this once we figure out
+        # how to do buffer mutation
+        # self.assertEqual(mutated_buffer.sum().item(), 30)
+        self.assertEqual(output, mod(*inp))
+
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
index b5116e373cb8..ef51743c929a 100644
--- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py
+++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
@@ -118,6 +118,9 @@
     ("aten::_nested_tensor", datetime.date(9999, 1, 1)),
     ("prepacked::unpack_prepacked_sizes_conv2d", datetime.date(9999, 1, 1)),
     ("prepacked::unpack_prepacked_sizes_linear", datetime.date(9999, 1, 1)),
+    ("aten::_symeig_helper", datetime.date(9999, 1, 1)),
+    ("aten::symeig", datetime.date(9999, 1, 1)),
+    ("aten::symeig.e", datetime.date(9999, 1, 1)),
     ("aten::linalg_solve", datetime.date(2022, 8, 31)),
     ("aten::linalg_solve.out", datetime.date(2022, 8, 31)),
     ("aten::quantile", datetime.date(2022, 9, 30)),
@@ -147,6 +150,10 @@
     ("aten::sum.SymInt", datetime.date(2022, 11, 30)),
     ("aten::mps_linear", datetime.date(9999, 1, 1)),
     ("aten::_mps_linear", datetime.date(9999, 1, 1)),
+    ("aten::_mps_max_pool2d", datetime.date(9999, 1, 1)),
+    ("aten::_mps_max_pool2d.out", datetime.date(9999, 1, 1)),
+    ("aten::mps_max_pool2d_backward", datetime.date(9999, 1, 1)),
+    ("aten::mps_max_pool2d_backward.out", datetime.date(9999, 1, 1)),
     ("aten::view_copy.SymInt", datetime.date(2022, 11, 30)),
     ("aten::view_copy.SymInt_out", datetime.date(2022, 11, 30)),
     ("aten::expand_copy.SymInt", datetime.date(2022, 11, 30)),
@@ -286,8 +293,6 @@
     ("aten::vsplit.array", datetime.date(2022, 9, 1)),
     ("aten::vsplit.int", datetime.date(2022, 9, 1)),
     ("aten::sym_numel", datetime.date(2022, 10, 1)),
-    ("aten::_flash_scaled_dot_product_attention", datetime.date(2022, 11, 1)),
-    ("aten::_scaled_dot_product_attention", datetime.date(2022, 11, 1)),
     ("aten::to_padded_tensor", datetime.date(2022, 10, 1)),
     ("aten::nested_to_padded_tensor", datetime.date(2022, 10, 1)),
     ("aten::nested_tensor", datetime.date(2022, 10, 15)),
@@ -319,10 +324,30 @@
     ("aten::_upsample_nearest_exact1d_backward", datetime.date(2022, 12, 15)),
     ("aten::_upsample_nearest_exact2d", datetime.date(2022, 12, 15)),
     ("aten::_upsample_nearest_exact2d_backward", datetime.date(2022, 12, 15)),
-    ("aten::_flash_scaled_dot_product_attention", datetime.date(2022, 12, 15)),
-    ("aten::_scaled_dot_product_attention_forward", datetime.date(2022, 12, 15)),
-    ("aten::_efficient_attention_backward", datetime.date(2022, 12, 15)),
+    ("aten::_scaled_dot_product_attention", datetime.date(2023, 3, 15)),
+    ("aten::_scaled_dot_product_flash_attention", datetime.date(2023, 3, 15)),
+    ("aten::_sparse_mask_helper", datetime.date(2023, 3, 15)),
+    ("aten::_fused_sdp_choice", datetime.date(2023, 3, 15)),
+    ("aten::_flash_attention_forward", datetime.date(2023, 3, 15)),
     ("mkldnn::_convolution_pointwise.binary", datetime.date(2022, 12, 15)),
+    ("prim::CudaFusionIvalGuard", datetime.date(2023, 2, 1)),
+    ("prim::CudaFusionGuard", datetime.date(2023, 2, 1)),
+    ("prim::CudaFusionGroup", datetime.date(2023, 2, 1)),
+    ("prim::CudaFusionViewGuard", datetime.date(2023, 2, 1)),
+    ("prim::CudaFusionSizeEq", datetime.date(2023, 2, 1)),
+    ("prim::transpose_copy.int", datetime.date(2023, 2, 1)),
+    ("prim::expand_as_copy", datetime.date(2023, 2, 1)),
+    ("prim::squeeze_copy", datetime.date(2023, 2, 1)),
+    ("prim::squeeze_copy.dim", datetime.date(2023, 2, 1)),
+    ("prim::unsqueeze_copy", datetime.date(2023, 2, 1)),
+    ("prim::expand_copy", datetime.date(2023, 2, 1)),
+    ("prim::flatten_copy", datetime.date(2023, 2, 1)),
+    ("prim::add_optional", datetime.date(2023, 2, 1)),
+    ("prim::reshape_copy", datetime.date(2023, 2, 1)),
+    ("prim::permute_copy", datetime.date(2023, 2, 1)),
+    ("prim::infer_unsqueeze_size", datetime.date(2023, 2, 1)),
+    ("prim::t_copy", datetime.date(2023, 2, 1)),
+    ("prim::view_copy", datetime.date(2023, 2, 1)),
 ]
 
 ALLOW_LIST_COMPILED = [
diff --git a/test/functorch/discover_coverage.py b/test/functorch/discover_coverage.py
index 3f4f74b9224d..aafa179bc81b 100644
--- a/test/functorch/discover_coverage.py
+++ b/test/functorch/discover_coverage.py
@@ -426,7 +426,7 @@ def remove_torch(name):
 
 def get_list_of_all_tests():
     all_tests = list(tested_overridable_outplace_ops.keys())
-    return set([remove_torch(test) for test in all_tests])
+    return {remove_torch(test) for test in all_tests}
 
 
 mytest = {
@@ -459,11 +459,11 @@ def get_jvp_coverage(subset=None):
     supports_forwardad_ops_dct = {name: op_to_opinfo[fn] for name, fn in ops_dct.items()
                                   if op_to_opinfo[fn][0].supports_forward_ad}
 
-    ops = set([remove_torch(test) for test in list(ops_dct.keys())])
-    supports_autograd = set([remove_torch(test)
-                             for test in list(supports_autograd_ops_dct.keys())])
-    supports_forward_ad = set([remove_torch(test)
-                               for test in list(supports_forwardad_ops_dct.keys())])
+    ops = {remove_torch(test) for test in list(ops_dct.keys())}
+    supports_autograd = {remove_torch(test)
+                         for test in list(supports_autograd_ops_dct.keys())}
+    supports_forward_ad = {remove_torch(test)
+                           for test in list(supports_forwardad_ops_dct.keys())}
     assert supports_forward_ad.issubset(supports_autograd)
     assert supports_autograd.issubset(ops)
 
@@ -803,7 +803,7 @@ def all(cls):
     def query(self, operator_method, filter=(Support.NO, Support.YES, Support.UNKNOWN)):
         result = {}
         for key in filter:
-            result[key] = set([])
+            result[key] = set()
         for op in self.data:
             support_status = operator_method(op)
             if support_status in filter:
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index fc083d487ec0..65938f28e1c9 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -12,9 +12,8 @@
     TestCase,
     run_tests,
     IS_ARM64,
-    IS_WINDOWS,
     compare_equal_outs_and_grads,
-    outs_and_grads
+    outs_and_grads,
 )
 import torch
 import torch.nn as nn
@@ -23,7 +22,8 @@
 import warnings
 import itertools
 from functools import partial
-from torch.testing._internal.common_device_type import instantiate_device_type_tests
+from torch.nn.utils.rnn import PackedSequence
+from torch.testing._internal.common_device_type import instantiate_device_type_tests, toleranceOverride, tol
 from torch.testing._internal.common_methods_invocations import op_db, wrapper_set_seed
 from torch.testing._internal.common_modules import module_db, modules
 from functorch import (
@@ -49,8 +49,7 @@
 )
 from torch._subclasses.fake_tensor import DynamicOutputShapeException, FakeTensorMode
 from torch.fx.experimental.proxy_tensor import is_sym_node
-from torch.fx.experimental.symbolic_shapes import ShapeEnv
-from torch._functorch.named_members_polyfill import _named_buffers, _named_parameters
+from torch.fx.experimental.symbolic_shapes import ShapeEnv, GuardOnDataDependentSymNode
 
 USE_TORCHVISION = False
 try:
@@ -70,14 +69,6 @@
     warnings.warn("Some tests use networkx but it was not installed",
                   UserWarning)
 
-try:
-    import sympy  # noqa: F401
-    # TODO(jansel): these tests fail on windows
-    HAS_SYMPY = not IS_WINDOWS
-except ImportError:
-    HAS_SYMPY = False
-skipIfNoSympy = unittest.skipIf(not HAS_SYMPY, "no sympy")
-
 # NB: numpy is a testing dependency!
 
 class AOTTestCase(TestCase):
@@ -169,12 +160,12 @@ def f(x):
             return torch.tanh(x).sum()
 
         fx_f = make_fx(grad(f))(torch.randn(5))
-        ops = set([i.target for i in fx_f.graph.nodes])
+        ops = {i.target for i in fx_f.graph.nodes}
 
         self.assertEqual(torch.ops.aten.tanh_backward in ops, True)
 
         fx_f = make_fx(grad(f), decomposition_table)(torch.randn(5))
-        ops = set([i.target for i in fx_f.graph.nodes])
+        ops = {i.target for i in fx_f.graph.nodes}
         self.assertEqual(torch.ops.aten.tanh_backward in ops, False)
 
     def test_nnc_jit(self, device):
@@ -250,81 +241,95 @@ class TestAOTAutograd(AOTTestCase):
     def verify_aot_autograd(
         self,
         f,
-        inp: Union[Callable, List[Any]],
+        inp_: Union[Callable, List[Any]],
         *,
         test_mutation: bool = False,
         decompositions: Optional[Dict] = None,
     ):
-        # Some tests pass in a callable for inp, to generate the inputs
-        # (useful if we want to generate complicated aliasing inputs)
-        if isinstance(inp, Callable):
-            inp_callable = inp
-            # The callable should return a tuple of f_inputs, f_graph_inputs
-            # (The idea is that we might want to compile a function with the graph inputs,
-            # but test autograd backprop all the way through the actual inputs)
-            inp_copy, graph_inps_copy = inp_callable()
-            inp, graph_inps = inp_callable()
-        else:
-            inp_copy = []
-            # Our input clones need to mimic when inputs are duplicates of one another
-            dupes_map = {}
-            for i, x in enumerate(inp):
-                if x in dupes_map:
-                    x_dupe_idx = dupes_map[x]
-                    inp_copy.append(inp_copy[x_dupe_idx])
-                else:
-                    dupes_map[x] = i
-                    if not isinstance(x, torch.Tensor):
-                        x_copy = x
-                    else:
-                        x_copy = x.clone().detach().requires_grad_(x.requires_grad)
-                        if x.requires_grad and not x.is_leaf:
-                            x_copy = x_copy.clone()
-                    inp_copy.append(x_copy)
-
-            if test_mutation:
-                # For graphs where we mutate inputs, need our test to make sure inputs aren't leaves
-                graph_inps = [x.add(1) for x in inp]
-                graph_inps_copy = [x.add(1) for x in inp_copy]
+        for keep_input_mutations in [True, False]:
+            # Some tests pass in a callable for inp, to generate the inputs
+            # (useful if we want to generate complicated aliasing inputs)
+            if isinstance(inp_, Callable):
+                inp_callable = inp_
+                # The callable should return a tuple of f_inputs, f_graph_inputs
+                # (The idea is that we might want to compile a function with the graph inputs,
+                # but test autograd backprop all the way through the actual inputs)
+                inp_copy, graph_inps_copy = inp_callable()
+                inp, graph_inps = inp_callable()
             else:
-                graph_inps = inp
-                graph_inps_copy = inp_copy
-
-        # Create a copy of inputs, so we can test input mutation correctness.
+                inp_copy = []
+                inp = []
+                # Our input clones need to mimic when inputs are duplicates of one another
+                dupes_map = {}
+                for i, x in enumerate(inp_):
+                    if x in dupes_map:
+                        x_dupe_idx = dupes_map[x]
+                        inp_copy.append(inp_copy[x_dupe_idx])
+                        inp.append(inp[x_dupe_idx])
+                    else:
+                        dupes_map[x] = i
+                        if not isinstance(x, torch.Tensor):
+                            x_copy = x
+                            x_copy2 = x
+                        else:
+                            x_copy = x.clone().detach().requires_grad_(x.requires_grad)
+                            x_copy2 = x.clone().detach().requires_grad_(x.requires_grad)
+                            if x.requires_grad and not x.is_leaf:
+                                x_copy = x_copy.clone()
+                                x_copy2 = x_copy2.clone()
+                        inp_copy.append(x_copy)
+                        inp.append(x_copy2)
 
-        fw_graph_cell = [None]
-        if isinstance(f, nn.Module):
-            compiled_f = aot_module(
-                f, fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell), bw_compiler=nop, decompositions=decompositions)
-        else:
-            compiled_f = aot_function(
-                f, fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell), bw_compiler=nop, decompositions=decompositions)
-        ref_out, ref_grad = outs_and_grads(f, graph_inps, inp)
-        test_out, test_grad = outs_and_grads(compiled_f, graph_inps_copy, inp_copy)
-        self.assertEqual(ref_grad, test_grad)
-
-        if isinstance(ref_out, torch.Tensor):
-            self.assertTrue(isinstance(test_out, torch.Tensor))
-            ref_out, test_out = [ref_out], [test_out]
-        for ref_o, test_o in zip(ref_out, test_out):
-            if isinstance(ref_o, torch.Tensor):
-                self.assertEqual(ref_o.requires_grad, test_o.requires_grad)
-                self.assertEqual(ref_o.is_leaf, test_o.is_leaf)
-                if ref_o.requires_grad:
-                    # _is_view() should probably unconditionally be the same,
-                    # but in practice I don't think this matters for tensors that don't require grad
-                    self.assertEqual(ref_o._is_view(), test_o._is_view())
-                self.assertEqual(ref_o, test_o)
                 if test_mutation:
-                    # This tests that autograd meta is set properly on the output we can
-                    # mutate it.
-                    ref_o.mul_(2)
-                    test_o.mul_(2)
+                    # For graphs where we mutate inputs, need our test to make sure inputs aren't leaves
+                    graph_inps = [x.add(1) for x in inp]
+                    graph_inps_copy = [x.add(1) for x in inp_copy]
+                else:
+                    graph_inps = inp
+                    graph_inps_copy = inp_copy
+            fw_graph_cell = [None]
+            if isinstance(f, nn.Module):
+                compiled_f = aot_module(
+                    f,
+                    fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell),
+                    bw_compiler=nop,
+                    decompositions=decompositions,
+                    keep_inference_input_mutations=keep_input_mutations
+                )
+            else:
+                compiled_f = aot_function(
+                    f,
+                    fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell),
+                    bw_compiler=nop,
+                    decompositions=decompositions,
+                    keep_inference_input_mutations=keep_input_mutations
+                )
+            ref_out, ref_grad = outs_and_grads(f, graph_inps, inp)
+            test_out, test_grad = outs_and_grads(compiled_f, graph_inps_copy, inp_copy)
+            self.assertEqual(ref_grad, test_grad)
+
+            if isinstance(ref_out, torch.Tensor):
+                self.assertTrue(isinstance(test_out, torch.Tensor))
+                ref_out, test_out = [ref_out], [test_out]
+            for ref_o, test_o in zip(ref_out, test_out):
+                if isinstance(ref_o, torch.Tensor):
+                    self.assertEqual(ref_o.requires_grad, test_o.requires_grad)
+                    self.assertEqual(ref_o.is_leaf, test_o.is_leaf)
+                    if ref_o.requires_grad:
+                        # _is_view() should probably unconditionally be the same,
+                        # but in practice I don't think this matters for tensors that don't require grad
+                        self.assertEqual(ref_o._is_view(), test_o._is_view())
                     self.assertEqual(ref_o, test_o)
-        for ref_i, test_i in zip(inp, inp_copy):
-            if isinstance(ref_i, torch.Tensor):
-                self.assertEqual(ref_i.requires_grad, test_i.requires_grad)
-            self.assertEqual(ref_i, test_i)
+                    if test_mutation:
+                        # This tests that autograd meta is set properly on the output we can
+                        # mutate it.
+                        ref_o.mul_(2)
+                        test_o.mul_(2)
+                        self.assertEqual(ref_o, test_o)
+            for ref_i, test_i in zip(inp, inp_copy):
+                if isinstance(ref_i, torch.Tensor):
+                    self.assertEqual(ref_i.requires_grad, test_i.requires_grad)
+                self.assertEqual(ref_i, test_i)
         return fw_graph_cell[0]
 
     def test_non_tensor_and_none_inputs(self):
@@ -333,24 +338,32 @@ def f(a, b, c):
             return a * c
         inp = [2, None, torch.ones(3, 3, dtype=torch.float32, requires_grad=True)]
         self.verify_aot_autograd(f, inp)
+        inp = [2, None, torch.ones(3, 3, dtype=torch.float32, requires_grad=False)]
+        self.verify_aot_autograd(f, inp)
 
     def test_single_output(self):
         def f(a, b):
             return a + b
         inp = [torch.randn(3, 3, requires_grad=True), torch.randn(3, 3)]
         self.verify_aot_autograd(f, inp)
+        inp = [torch.randn(3, 3, requires_grad=False), torch.randn(3, 3)]
+        self.verify_aot_autograd(f, inp)
 
     def test_multi_output(self):
         def f(a, b):
             return a + b, a - b
         inp = [torch.randn(3, 3, requires_grad=True), torch.randn(3, 3)]
         self.verify_aot_autograd(f, inp)
+        inp = [torch.randn(3, 3, requires_grad=False), torch.randn(3, 3)]
+        self.verify_aot_autograd(f, inp)
 
     def test_multi_output_list(self):
         def f(a, b):
             return [a + b, a - b]
         inp = [torch.randn(3, 3, requires_grad=True), torch.randn(3, 3)]
         self.verify_aot_autograd(f, inp)
+        inp = [torch.randn(3, 3, requires_grad=False), torch.randn(3, 3)]
+        self.verify_aot_autograd(f, inp)
 
     # Test for bug occurring at the intersection of fake tensors & functionalization.
     @patch("torch._functorch.config.use_dynamic_shapes", True)
@@ -363,6 +376,8 @@ def f(a):
 
         inp = [torch.randn(3, 1, requires_grad=True)]
         self.verify_aot_autograd(f, inp)
+        inp = [torch.randn(3, 1, requires_grad=False)]
+        self.verify_aot_autograd(f, inp)
 
     @patch("torch._functorch.config.use_dynamic_shapes", True)
     @patch("torch._functorch.config.use_fake_tensor", True)
@@ -388,8 +403,9 @@ def f(a):
             a.mul_(2)
             return a * 3
         inp = [torch.ones(3, 3, requires_grad=True)]
-
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
+        inp = [torch.ones(3, 3, requires_grad=False)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
         # Things to note:
         # - the extra clone is because we need to pass the pre-mutated input to grad(),
         #   but autograd operates above functionalization so we need to manually clone.
@@ -406,12 +422,12 @@ def test_input_mutation_simple_with_none_and_nontensor(self):
         # Tensor, None, int
         def f(a, b, c):
             return a * c
-        inp = [torch.ones(3, 3, requires_grad=True), None, 3]
-
         f_compiled = aot_function(f, nop)
-        out_ref = f(*inp)
-        out_test = f_compiled(*inp)
-        self.assertEqual(out_ref, out_test)
+        for req_grad in [True, False]:
+            inp = [torch.ones(3, 3, requires_grad=req_grad), None, 3]
+            out_ref = f(*inp)
+            out_test = f_compiled(*inp)
+            self.assertEqual(out_ref, out_test)
 
     @patch("functorch.compile.config.use_fake_tensor", True)
     def test_input_mutation_is_output(self):
@@ -419,8 +435,9 @@ def f(a):
             a.mul_(2)
             return a
         inp = [torch.ones(3, 3, requires_grad=True)]
-
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
+        inp = [torch.ones(3, 3, requires_grad=False)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1):
     clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
@@ -434,13 +451,16 @@ def f(a, b, c):
             c.mul_(2)
             return a + b + c
 
-        inp = [
-            torch.ones(3, 3, requires_grad=True),
-            torch.ones(3, 3, requires_grad=True),
-            torch.ones(3, 3, requires_grad=True),
-        ]
+        def create_inp(req_grad):
+            return [
+                torch.ones(3, 3, requires_grad=req_grad),
+                torch.ones(3, 3, requires_grad=req_grad),
+                torch.ones(3, 3, requires_grad=req_grad),
+            ]
 
-        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
+        self.verify_aot_autograd(f, create_inp(False), test_mutation=True)
+
+        fw_graph = self.verify_aot_autograd(f, create_inp(True), test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1, primals_2, primals_3):
     clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
@@ -456,12 +476,15 @@ def test_input_mutation_metadata(self):
         def f(a, b):
             a.transpose_(1, 0)
             return a + b
-        inp = [
-            torch.ones(3, 3, requires_grad=True),
-            torch.ones(3, 3, requires_grad=True),
-        ]
 
-        self.verify_aot_autograd(f, inp, test_mutation=True)
+        def create_inp(req_grad):
+            return [
+                torch.ones(3, 3, requires_grad=req_grad),
+                torch.ones(3, 3, requires_grad=req_grad),
+            ]
+
+        self.verify_aot_autograd(f, create_inp(True), test_mutation=True)
+        self.verify_aot_autograd(f, create_inp(False), test_mutation=True)
 
     @patch("functorch.compile.config.use_fake_tensor", True)
     def test_input_mutation_metadata2(self):
@@ -470,7 +493,8 @@ def f(a):
             a.mul_(2)
             return a + 1
         inp = [torch.ones(3, 3, requires_grad=True)]
-
+        self.verify_aot_autograd(f, inp, test_mutation=True)
+        inp = [torch.ones(3, 3, requires_grad=False)]
         self.verify_aot_autograd(f, inp, test_mutation=True)
 
     @patch("functorch.compile.config.use_fake_tensor", True)
@@ -483,7 +507,12 @@ def f(a, b):
             torch.ones(3, 3),
             torch.ones(2, 2, requires_grad=True),
         ]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
 
+        inp = [
+            torch.ones(3, 3),
+            torch.ones(2, 2),
+        ]
         self.verify_aot_autograd(f, inp, test_mutation=True)
 
     @patch("functorch.compile.config.use_fake_tensor", True)
@@ -494,6 +523,26 @@ def f(inpt, weight, bias, running_mean, running_var):
             # This tests that what we save for the backward is actually cloned inputs,
             # and not the original inputs that got mutated.
             return torch._native_batch_norm_legit(inpt, weight, bias, running_mean, running_var, True, 0.5, 1e-5)
+
+        def create_inp(req_grad):
+            return [
+                torch.ones(2, 5, 5, 5, requires_grad=req_grad),
+                torch.ones(5, requires_grad=req_grad),
+                torch.ones(5, requires_grad=req_grad),
+                torch.ones(5),
+                torch.ones(5),
+            ]
+
+        from torch._decomp import get_decompositions
+        # This simulates what inductor does (running the fw + bw decompositions)
+        decompositions = get_decompositions([
+            torch.ops.aten._native_batch_norm_legit_functional,
+            torch.ops.aten.native_batch_norm_backward,
+        ])
+        self.verify_aot_autograd(f, create_inp(True), test_mutation=True, decompositions=decompositions)
+        self.verify_aot_autograd(f, create_inp(False), test_mutation=True, decompositions=decompositions)
+
+    def test_batchnorm_inference(self):
         inp = [
             torch.ones(2, 5, 5, 5, requires_grad=True),
             torch.ones(5, requires_grad=True),
@@ -502,22 +551,31 @@ def f(inpt, weight, bias, running_mean, running_var):
             torch.ones(5),
         ]
 
-        from torch._decomp import get_decompositions
-        # This simulates what inductor does (running the fw + bw decompositions)
-        decompositions = get_decompositions([
-            torch.ops.aten._native_batch_norm_legit_functional,
-            torch.ops.aten.native_batch_norm_backward,
-        ])
-        self.verify_aot_autograd(f, inp, test_mutation=True, decompositions=decompositions)
+        m = torch.nn.BatchNorm2d(4, 4)
+        m.eval()
+        fw_graph_cell = [None]
+        inp = torch.ones(4, 4, 4, 4)
+        fw_graph_cell = [None]
+        compiled_m = aot_module(
+            m,
+            fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell),
+            bw_compiler=nop,
+            keep_inference_input_mutations=True,
+        )
+        inp = torch.ones(4, 4, 4, 4)
+        with torch.no_grad():
+            out = compiled_m(inp)
+        # expectation: there are no copy_() calls in the decomposed batch norm when running under training=False (eval mode)
+        code = fw_graph_cell[0].code.strip()
+        self.assertTrue("copy_" not in str(code))
 
     @patch("functorch.compile.config.use_fake_tensor", True)
     def test_input_output_view_simple(self):
         def f(a):
             return a.view(-1)
-        inp = [
-            torch.ones(2, 2, requires_grad=True).add(1),
-        ]
-
+        inp = [torch.ones(2, 2, requires_grad=False).add(1)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
+        inp = [torch.ones(2, 2, requires_grad=True).add(1)]
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
         # Outputs that alias inputs are pulled out of the graph entirely, so we don't compile anything here
         self.assertExpectedInline(fw_graph.code.strip(), """\
@@ -531,13 +589,16 @@ def f(a, b, c):
             a.mul_(2)
             c.mul_(3)
             return b.view(2, 2), c.view(2, 2)
-        inp = [
-            torch.ones(2, 2, requires_grad=True).add(1),
-            torch.ones(2, 2, requires_grad=True).add(1),
-            torch.ones(2, 2, requires_grad=True).add(1),
-        ]
 
-        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
+        def create_inp(req_grad):
+            return [
+                torch.ones(2, 2, requires_grad=req_grad).add(1),
+                torch.ones(2, 2, requires_grad=req_grad).add(1),
+                torch.ones(2, 2, requires_grad=req_grad).add(1),
+            ]
+
+        self.verify_aot_autograd(f, create_inp(False), test_mutation=True)
+        fw_graph = self.verify_aot_autograd(f, create_inp(True), test_mutation=True)
         # The original function returned two outputs, both of which aliased inputs.
         # We expect two outputs in the functional graph, a_updated and c_updated.
         # The actual aliased outputs themselves aren't in the compiled forward graph;
@@ -558,13 +619,16 @@ def f(a, b, c):
             b.mul_(3)
             c.t_()
             return a.view(2, 2), b.view(2, 2), c.view(2, 2)
-        inp = [
-            torch.ones(2, 2, requires_grad=True).add(1),
-            torch.ones(2, 2, requires_grad=True).add(1),
-            torch.ones(2, 2, requires_grad=True).add(1),
-        ]
 
-        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
+        def create_inp(req_grad):
+            return [
+                torch.ones(2, 2, requires_grad=req_grad).add(1),
+                torch.ones(2, 2, requires_grad=req_grad).add(1),
+                torch.ones(2, 2, requires_grad=req_grad).add(1),
+            ]
+
+        self.verify_aot_autograd(f, create_inp(False), test_mutation=True)
+        fw_graph = self.verify_aot_autograd(f, create_inp(True), test_mutation=True)
         # Important thing to check here: of the three inputs:
         # Only the b.mul_(3) should show up in the graph (we functionalize it and return it).
         # Everything else that does not show up in the graph includes:
@@ -586,10 +650,9 @@ def test_input_mutation_and_output_view(self):
         def f(a):
             a.add_(1)
             return a.view(-1)
-        inp = [
-            torch.ones(2, 2, requires_grad=True).add(1),
-        ]
-
+        inp = [torch.ones(2, 2, requires_grad=False).add(1)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
+        inp = [torch.ones(2, 2, requires_grad=True).add(1)]
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
         # Here, total # of outputs is 1 because:
         # - num_mutated_inps = 1 (a_updated)
@@ -608,14 +671,17 @@ def f(a, b, c, d):
             b.transpose_(1, 0)
             c.add_(1)
             return d + 1, b.diagonal(), a + c
-        inp = [
-            torch.arange(4, requires_grad=True, dtype=torch.float32).view(2, 2).add(1),
-            torch.arange(4, requires_grad=True, dtype=torch.float32).view(2, 2).add(1),
-            torch.ones(2, 2, requires_grad=True).add(1),
-            torch.ones(2, 2, requires_grad=True).add(1),
-        ]
 
-        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
+        def create_inp(req_grad):
+            return [
+                torch.arange(4, requires_grad=req_grad, dtype=torch.float32).view(2, 2).add(1),
+                torch.arange(4, requires_grad=req_grad, dtype=torch.float32).view(2, 2).add(1),
+                torch.ones(2, 2, requires_grad=req_grad).add(1),
+                torch.ones(2, 2, requires_grad=req_grad).add(1),
+            ]
+
+        self.verify_aot_autograd(f, create_inp(False), test_mutation=True)
+        fw_graph = self.verify_aot_autograd(f, create_inp(True), test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1, primals_2, primals_3, primals_4):
     view = torch.ops.aten.view.default(primals_2, [2, 2]);  primals_2 = None
@@ -632,8 +698,9 @@ def test_output_aliases_intermediate_single(self):
         def f(a):
             out = torch.mul(a, 3)
             return out.view(-1)
+        inp = [torch.ones(3, 3, requires_grad=False)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
         inp = [torch.ones(3, 3, requires_grad=True)]
-
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
         # In AOTAutograd, we are obligated to make the compiled forward directly return `out`,
         # and reconstruct `out.view(-1)` as a fresh output.
@@ -649,8 +716,9 @@ def f(a, b):
             out = torch.mul(a, 3)
             # First output is an alias of an intermediate that doesn't require grad
             return out.view(-1), b.add(1)
+        inp = [torch.ones(3, 3), torch.ones(3, 3, requires_grad=False)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
         inp = [torch.ones(3, 3), torch.ones(3, 3, requires_grad=True)]
-
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
         # important bit: we don't bother generating an intermediate base as an output in the graph,
         # because the intermediate base itself didn't require gradients.
@@ -668,8 +736,9 @@ def f(a):
             out = torch.mul(a, 3)
             out_view = out.view(-1)
             return out, out_view, out
+        inp = [torch.ones(3, 3, requires_grad=False)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
         inp = [torch.ones(3, 3, requires_grad=True)]
-
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
 
     @patch("functorch.compile.config.use_fake_tensor", True)
@@ -678,8 +747,9 @@ def f(a):
             out = torch.mul(a, 3)
             # AOTAutograd should manually generate these two output views in the epilogue.
             return out.view(-1), out.view(-1)
+        inp = [torch.ones(3, 3, requires_grad=False)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
         inp = [torch.ones(3, 3, requires_grad=True)]
-
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1):
@@ -695,8 +765,9 @@ def f(a):
             # AOTAutograd should manually generate the first output (a view of an intermediate)
             # but not the second (which is itself the intermediate for the first)
             return out.view(-1), out
+        inp = [torch.ones(3, 3, requires_grad=False)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
         inp = [torch.ones(3, 3, requires_grad=True)]
-
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1):
@@ -711,8 +782,9 @@ def f(a):
             # AOTAutograd should manually generate the first output (a view of an intermediate)
             # but not the second (which is itself the intermediate for the first)
             return out, out.view(-1)
+        inp = [torch.ones(3, 3, requires_grad=False)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
         inp = [torch.ones(3, 3, requires_grad=True)]
-
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1):
@@ -727,8 +799,9 @@ def f(a):
             # AOTAutograd should manually generate the first output (a view of an intermediate)
             # but not the second (which is itself the intermediate for the first)
             return out.view(-1), out, out[0].detach()
+        inp = [torch.ones(3, 3, requires_grad=False)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
         inp = [torch.ones(3, 3, requires_grad=True)]
-
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1):
@@ -747,7 +820,7 @@ def f(a):
         inp = [torch.ones(2, 4, requires_grad=True)]
 
         # TODO: fix this test.
-        # See <github issue link>
+        # See https://github.com/pytorch/pytorch/issues/90507
         # self.verify_aot_autograd(f, inp, test_mutation=True)
 
     @patch("functorch.compile.config.use_fake_tensor", True)
@@ -760,8 +833,9 @@ def f(a):
             # `out` will show up as having OutputType.non_alias,
             # and ._is_view() == False
             return out
+        inp = [torch.ones(2, 4, requires_grad=False)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
         inp = [torch.ones(2, 4, requires_grad=True)]
-
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1):
@@ -791,8 +865,9 @@ def f(a):
             out2 = torch.mul(a, 4)
             # AOTAutograd should manually generate these two output views in the epilogue.
             return out1.view(-1), out2.transpose(1, 0), out1.transpose(1, 0)
+        inp = [torch.ones(3, 3, requires_grad=False)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
         inp = [torch.ones(3, 3, requires_grad=True)]
-
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1):
@@ -814,9 +889,13 @@ def f(a):
             a.transpose_(1, 0)
             tmp = a.mul(2)
             return tmp.squeeze(), tmp.transpose(1, 0), a.unsqueeze(0)
-        inp = [torch.ones(1, 2, 4, requires_grad=True)]
 
-        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
+        def inp_callable(req_grad):
+            x = torch.ones(1, 2, 4, requires_grad=req_grad).clone()
+            return [(x,), (x,)]
+
+        self.verify_aot_autograd(f, partial(inp_callable, req_grad=False), test_mutation=True)
+        fw_graph = self.verify_aot_autograd(f, partial(inp_callable, req_grad=True), test_mutation=True)
         # TODO: make this test run with dynamic shapes so it is more meaningful
         # metadata output order: (a_updated_meta, out1_meta, out2_meta, out3_meta)
         self.assertExpectedInline(fw_graph.code.strip(), """\
@@ -835,8 +914,9 @@ def f(a):
             a.t_()
             a[0].mul_(2)
             return a.view(a.shape)
+        inp = [torch.ones(3, 3, requires_grad=False)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
         inp = [torch.ones(3, 3, requires_grad=True)]
-
         fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1):
@@ -857,12 +937,15 @@ def test_view_and_inplace_view(self):
         def f(a, b):
             a.t_()
             return b.view(b.shape), a.view(a.shape)
-        inp = [
-            torch.ones(3, 3, requires_grad=True),
-            torch.ones(3, 3, requires_grad=True)
-        ]
 
-        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True)
+        def create_inp(req_grad):
+            return [
+                torch.ones(3, 3, requires_grad=req_grad),
+                torch.ones(3, 3, requires_grad=req_grad)
+            ]
+
+        self.verify_aot_autograd(f, create_inp(False), test_mutation=True)
+        fw_graph = self.verify_aot_autograd(f, create_inp(True), test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1, primals_2):
     view = torch.ops.aten.view.default(primals_1, [3, 3]);  primals_1 = None
@@ -877,10 +960,9 @@ def f(a):
             tmp = a.detach()
             a.mul_(2)
             return a, tmp
-        inp = [
-            torch.ones(3, 3, requires_grad=True),
-        ]
-
+        inp = [torch.ones(3, 3, requires_grad=True)]
+        self.verify_aot_autograd(f, inp, test_mutation=True)
+        inp = [torch.ones(3, 3, requires_grad=False)]
         self.verify_aot_autograd(f, inp, test_mutation=True)
 
     @patch("functorch.compile.config.use_fake_tensor", True)
@@ -914,15 +996,44 @@ def f(a, b):
             b.t_()
             return a.mul(b)
 
-        def inp_callable():
-            base = torch.ones(2, 2, requires_grad=True)
+        def inp_callable(req_grad):
+            base = torch.ones(2, 2, requires_grad=req_grad)
             # Note: in our test, the add() is important because we need the graph inputs to be non-leaves so we can mutate them.
             x = base.add(1)
             inp1 = x[0]
             inp2 = x[1]
             return [base], [inp1, inp2]
 
-        self.verify_aot_autograd(f, inp_callable, test_mutation=True)
+        self.verify_aot_autograd(f, partial(inp_callable, req_grad=False), test_mutation=True)
+        self.verify_aot_autograd(f, partial(inp_callable, req_grad=True), test_mutation=True)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA is unavailable")
+    def test_mem_leak_from_save_for_bw(self):
+        # See a full diagnosis at this issue: https://github.com/pytorch/pytorch/issues/94990
+        # Note [Detaching saved tensors in AOTAutograd]
+        # This program creates a ref-cycle. Long term, we should fix this ref cycle
+        # (since it can arise, naturally albeit rarely, from uses of autograd.Function).
+        # But AOTAutograd makes it more likely to show up from tracing user programs,
+        # so we deal with it by manually detaching the tensors that we save for backward.
+        # This is completely wrong and would give wrong results if we were to do double backward.
+        # Fortunately today, double backward is explicitly banned in AOTAutograd.
+        def f(a, b):
+            add = a + a
+            split = torch.functional.split(add, [4, 4], dim=1)
+            getitem_2 = split[1]
+            unsqueeze = getitem_2.unsqueeze(-1)
+            mul = unsqueeze * b
+            return (getitem_2, mul)
+
+        f_compiled = aot_function(f, nop)
+        inps = [
+            torch.ones(8, 8, device='cuda', requires_grad=True),
+            torch.ones(1, 4, 1, device='cuda', requires_grad=True),
+        ]
+        mem_before = torch.cuda.memory_allocated()
+        f_compiled(*inps)
+        mem_after = torch.cuda.memory_allocated()
+        self.assertTrue(mem_after == mem_before)
 
     @patch("functorch.compile.config.use_fake_tensor", True)
     def test_output_aliases_multiple_inputs_get_correct_one(self):
@@ -931,15 +1042,16 @@ def test_output_aliases_multiple_inputs_get_correct_one(self):
         def f(a, b):
             return a.view(a.shape), b.view(b.shape)
 
-        def inp_callable():
-            base = torch.ones(2, 2, requires_grad=True)
+        def inp_callable(req_grad):
+            base = torch.ones(2, 2, requires_grad=req_grad)
             # Note: in our test, the add() is important because we need the graph inputs to be non-leaves so we can mutate them.
             x = base.mul(2)
             inp1 = x.view(-1)
             inp2 = x[0]
             return [base], [inp1, inp2]
 
-        self.verify_aot_autograd(f, inp_callable, test_mutation=True)
+        self.verify_aot_autograd(f, partial(inp_callable, req_grad=False), test_mutation=True)
+        self.verify_aot_autograd(f, partial(inp_callable, req_grad=True), test_mutation=True)
 
     @patch("functorch.compile.config.use_fake_tensor", True)
     def test_input_mutation_aliases_other_input(self):
@@ -947,15 +1059,16 @@ def f(a, b):
             a.add_(1)
             return a + b
 
-        def inp_callable():
-            base = torch.ones(2, 2, requires_grad=True)
+        def inp_callable(req_grad):
+            base = torch.ones(2, 2, requires_grad=req_grad)
             # Note: in our test, the add() is important because we need the graph inputs to be non-leaves so we can mutate them.
             x = base.add(1)
             inp1 = x[0]
             inp2 = x[1]
             return [base], [inp1, inp2]
 
-        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True)
+        self.verify_aot_autograd(f, partial(inp_callable, req_grad=False), test_mutation=True)
+        fw_graph = self.verify_aot_autograd(f, partial(inp_callable, req_grad=True), test_mutation=True)
         # Important parts of the graph:
         # - the compiled graph takes in a base, and we generate a and b (the views) off of the base
         # - clone() is still in the graph, because we need to call grad() on the original (non-mutated) inputs
@@ -977,15 +1090,16 @@ def f(a, b):
             a.add_(1)
             return a + b
 
-        def inp_callable():
-            base = torch.ones(2, 2, requires_grad=True)
+        def inp_callable(req_grad):
+            base = torch.ones(2, 2, requires_grad=req_grad)
             x = base.add(1)
             inp1 = x[0]
             # Here, one of the aliased inputs is the base itself
             inp2 = x
             return [base], [inp1, inp2]
 
-        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True)
+        self.verify_aot_autograd(f, partial(inp_callable, req_grad=False), test_mutation=True)
+        fw_graph = self.verify_aot_autograd(f, partial(inp_callable, req_grad=True), test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1):
     clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
@@ -1005,12 +1119,13 @@ def f(a, b):
             a.add_(1)
             return b.view(b.shape)
 
-        def inp_callable():
-            base = torch.ones(2, 2, requires_grad=True)
+        def inp_callable(req_grad):
+            base = torch.ones(2, 2, requires_grad=req_grad)
             x = base.add(1)
             return [base], [x.view(-1), x.view(-1)]
 
-        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True)
+        self.verify_aot_autograd(f, partial(inp_callable, req_grad=False), test_mutation=True)
+        fw_graph = self.verify_aot_autograd(f, partial(inp_callable, req_grad=True), test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1):
     clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
@@ -1033,14 +1148,15 @@ def f(a, b, c):
             #     The original fw takes in 3 args, but the compiled fw takes in only 2 args.
             return b.add(1), c.view(-1)
 
-        def inp_callable():
-            base1 = torch.ones(2, 2, requires_grad=True)
-            base2 = torch.ones(2, 2, requires_grad=True)
+        def inp_callable(req_grad):
+            base1 = torch.ones(2, 2, requires_grad=req_grad)
+            base2 = torch.ones(2, 2, requires_grad=req_grad)
             x = base1.add(1)
             y = base2.add(1)
             return [base1, base2], [x.view(-1), y, x.view(-1)]
 
-        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True)
+        self.verify_aot_autograd(f, partial(inp_callable, req_grad=False), test_mutation=True)
+        fw_graph = self.verify_aot_autograd(f, partial(inp_callable, req_grad=True), test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1, primals_2):
     clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
@@ -1062,12 +1178,13 @@ def f(a, b):
             a.t_()
             return a + b
 
-        def inp_callable():
-            base = torch.ones(2, 2, requires_grad=True)
+        def inp_callable(req_grad):
+            base = torch.ones(2, 2, requires_grad=req_grad)
             x = base.add(1)
             return [base], [x.view(-1), x.view(-1)]
 
-        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True)
+        self.verify_aot_autograd(f, partial(inp_callable, req_grad=False), test_mutation=True)
+        fw_graph = self.verify_aot_autograd(f, partial(inp_callable, req_grad=True), test_mutation=True)
         # Expectation: fwd() takes in 2 args, and we don't construct a synthetic base.
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1, primals_2):
@@ -1084,13 +1201,14 @@ def f(a, b, c):
             a.mul_(2)
             return b + 1, c + 1
 
-        def inp_callable():
+        def inp_callable(req_grad):
             base = torch.ones(2, 2)
-            c_arg = torch.ones(2, 2, requires_grad=True)
+            c_arg = torch.ones(2, 2, requires_grad=req_grad)
             x = base.add(1)
             return [base, c_arg], [x.view(-1), x.view(-1), c_arg]
 
-        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True)
+        self.verify_aot_autograd(f, partial(inp_callable, req_grad=False), test_mutation=True)
+        fw_graph = self.verify_aot_autograd(f, partial(inp_callable, req_grad=True), test_mutation=True)
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1, primals_2):
     clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
@@ -1115,15 +1233,16 @@ def f(a, b, c, d):
             d.t_()
             return a + c + d, b.view(-1)
 
-        def inp_callable():
-            base1 = torch.ones(2, 2, requires_grad=True)
-            base2 = torch.ones(2, 2, requires_grad=True)
+        def inp_callable(req_grad):
+            base1 = torch.ones(2, 2, requires_grad=req_grad)
+            base2 = torch.ones(2, 2, requires_grad=req_grad)
             x1 = base1.add(1)
             x2 = base2.add(1)
             # a and c alias, b and d alias
             return [base1, base2], [x1.view(-1), x2.view(-1), x1.view(-1), x2.view(-1)]
 
-        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True)
+        self.verify_aot_autograd(f, partial(inp_callable, req_grad=False), test_mutation=True)
+        fw_graph = self.verify_aot_autograd(f, partial(inp_callable, req_grad=True), test_mutation=True)
         # 3 graph inputs: (b_d_base, a, c)
         # 2 returns: (b_updated, a+c+d)
         # (there are 2 original fw outs, but one is a view of b so it's not part of the graph)
@@ -1143,12 +1262,29 @@ def forward(self, primals_1, primals_2, primals_3):
     view_1 = torch.ops.aten.view.default(as_strided_18, [-1]);  as_strided_18 = None
     return [as_strided_2, t_1, add_2, view_1]""")  # noqa: B950
 
-    # Mondo test that tests a combination of:
-    # input is mutated, that aliases another input (so we make a synthetic base)
-    # an output is an alias of another output
-    # an output is an alias of an intermediate
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA is unavailable")
+    def test_synthetic_base_base_attribute_is_none(self):
+        def f(a, b):
+            a.add_(1)
+            return a + b
+
+        def inp_callable():
+            base = torch.ones(4, 4, device='cuda')
+            # detach() so that none of the inputs have a ._base attribute.
+            a = base[0].detach()
+            b = base[1].detach()
+            base2 = torch.ones(2, 2, requires_grad=True)
+            return [base], [a, b]
+
+        self.verify_aot_autograd(f, inp_callable, test_mutation=True)
+
+
     @patch("functorch.compile.config.use_fake_tensor", True)
     def test_input_mutation_alias_everything(self):
+        # Mondo test that tests a combination of:
+        # input is mutated, that aliases another input (so we make a synthetic base)
+        # an output is an alias of another output
+        # an output is an alias of an intermediate
         # a and c are aliased
         def f(a, b, c):
             c.mul_(2)  # mutates c
@@ -1161,9 +1297,9 @@ def f(a, b, c):
             # out2 aliases an input, so we don't return it
             return out1, out2, out3
 
-        def inp_callable():
-            base1 = torch.ones(2, 2, requires_grad=True)
-            base2 = torch.ones(2, 2, requires_grad=True)
+        def inp_callable(req_grad):
+            base1 = torch.ones(2, 2, requires_grad=req_grad)
+            base2 = torch.ones(2, 2, requires_grad=req_grad)
             # Note: in our test, the add() is important because we need the graph inputs to be non-leaves so we can mutate them.
             base1_ = base1.add(1)
             base2_ = base2.add(1)
@@ -1172,7 +1308,8 @@ def inp_callable():
             c = base1_.view(-1)
             return [base1, base2], [a, b, c]
 
-        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True)
+        self.verify_aot_autograd(f, partial(inp_callable, req_grad=False), test_mutation=True)
+        fw_graph = self.verify_aot_autograd(f, partial(inp_callable, req_grad=True), test_mutation=True)
         # Expected:
         # - 2 inputs in the forward: synthetic_base_a_c, b
         # - 1 output in the forward: "tmp"
@@ -1367,7 +1504,7 @@ def forward(self, x, y):
         fxx = aot_module_simplified(F(), (x, x), nop)
         self.assertExpectedRaisesInline(
             AssertionError, lambda: fxx(x, y),
-            """At compilation time, graph 1 was compiled under the assumption that input 1 would be a duplicate of input 0, but at runtime this was not the case.  This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."""  # noqa: B950
+            """At compilation time, graph 2 was compiled under the assumption that input 1 would be a duplicate of input 0, but at runtime this was not the case.  This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."""  # noqa: B950
         )
 
 
@@ -1489,6 +1626,26 @@ def f(x, y):
 
         self.assertEqual(ref_out, test_out)
 
+    def test_resize_input_smaller(self):
+        def f(x, y):
+            y.resize_(4)
+            y.zero_()
+            self.assertEqual(x.shape, (4,))
+            return y
+
+        # NB: don't use verify_aot_autograd as the inputs get
+        # mutated and I don't trust verify to do it right
+
+        compiled_f = aot_function(f, nop)
+        ref_x = torch.randn(5)
+        ref_out = f(ref_x, ref_x)
+
+        test_x = torch.randn(5)
+        test_out = compiled_f(test_x, test_x)
+
+        self.assertEqual(ref_out, test_out)
+
+
     def test_custom_autograd(self):
         class CustomFn(torch.autograd.Function):
             @staticmethod
@@ -1559,7 +1716,6 @@ def bn(x):
 
     @patch("functorch.compile.config.use_dynamic_shapes", True)
     @patch("functorch.compile.config.use_fake_tensor", True)
-    @skipIfNoSympy
     def test_output_op_depending_on_symint(self):
         """
         It won't be obvious from reading this test what it's testing for.  We should probably make it into a more
@@ -1588,7 +1744,6 @@ def f(x):
 
     @patch("functorch.compile.config.use_dynamic_shapes", True)
     @patch("functorch.compile.config.use_fake_tensor", True)
-    @skipIfNoSympy
     def test_default_partitioner_saves_symints_not_tensors_for_bw(self):
         """
         In this test, the important thing is that primals_1 is **only** needed in the backward
@@ -1781,7 +1936,6 @@ def f(x, mod_weight, mod_bias):
     @patch("functorch.compile.config.use_dynamic_shapes", True)
     @patch("functorch.compile.config.use_fake_tensor", True)
     @unittest.skipIf(not USE_NETWORKX, "networkx not available")
-    @skipIfNoSympy
     def test_min_cut_partitioner_save_shape(self):
 
         def f(x):
@@ -1822,7 +1976,6 @@ def f(a, b, c):
 
     @patch("functorch.compile.config.use_dynamic_shapes", True)
     @patch("functorch.compile.config.use_fake_tensor", True)
-    @skipIfNoSympy
     def test_default_partitioner_output_tensor_shape_tensor(self):
 
         inp = [
@@ -1887,7 +2040,6 @@ def f(a, b, c, d):
     @patch("functorch.compile.config.use_dynamic_shapes", True)
     @patch("functorch.compile.config.use_fake_tensor", True)
     @unittest.skipIf(not USE_NETWORKX, "networkx not available")
-    @skipIfNoSympy
     def test_min_cut_partitioner_output_tensor_shape_tensor(self):
 
         inp = [
@@ -2118,6 +2270,23 @@ def forward(self, x, y):
         assert torch.allclose(inputs[0].grad, cloned_inputs[0].grad)
         assert torch.allclose(inputs[1].grad, cloned_inputs[1].grad)
 
+    def test_inference_python_dispatcher(self):
+        # Extracted from unet
+        class MockModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.upsample = torch.nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
+
+            def forward(self, x):
+                return (self.upsample(x), )
+
+        mod = MockModule()
+        shape_env = ShapeEnv()
+        fake_mode = FakeTensorMode(shape_env=shape_env)
+        x = torch.randn(2, 512, 40, 59)  # NB: must not require grad
+        inputs = [x]
+        fake_inputs = [fake_mode.from_tensor(x) for x in inputs]
+        compiled_f = aot_module_simplified(mod, fake_inputs, nop)
 
     def test_aot_module_simplified_preserves_stack_trace(self):
         class MockModule(torch.nn.Module):
@@ -2166,9 +2335,6 @@ def test_aot_module_simplified_fake_tensor_gm_raises(self):
         fake_z = fake_mode.from_tensor(real_z)
 
         class MockModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 # Accessing a free variable fake tensor will look like a
                 # constant to make_fx, and result in the tensor being traced
@@ -2211,7 +2377,7 @@ def forward(self, x):
 
     # Worked with real but not with fake
     xfail('cholesky_inverse'),
-    xfail('segment_reduce', 'lengths'),
+    xfail('_segment_reduce', 'lengths'),
     skip('nn.functional.nll_loss', ''),  # UBSAN failure!
 
     # Misc
@@ -2220,12 +2386,16 @@ def forward(self, x):
     xfail('cov'),
     xfail('chalf'),  # RuntimeError: "sum_cpu" not implemented for 'ComplexHalf'
     xfail('sparse.sampled_addmm'),
+    xfail('normal', 'number_mean'),  # TypeError: randn_like(): argument 'input' (position 1) must be Tensor, not float
+    xfail('sparse.mm', 'reduce'),
     skip('nn.functional.binary_cross_entropy_with_logits'),  # seems to fail sometimes?
     skip('nn.functional.margin_ranking_loss'),  # seems flaky
     skip('linalg.lu_solve'),  # flaky
     skip('linalg.householder_product'),  # flaky
     decorate('matmul', decorator=unittest.skipIf(IS_ARM64, 'flaky')),
     decorate('__rmatmul__', decorator=unittest.skipIf(IS_ARM64, 'flaky')),
+    # overrides atol=1e-4, rtol=1e-5 would do as well
+    decorate('svd_lowrank', decorator=toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-05)})),
 }
 
 symbolic_aot_autograd_failures = {
@@ -2234,7 +2404,6 @@ def forward(self, x):
     xfail('amin', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('baddbmm', ''),  # aten.baddbmm.default - couldn't find symbolic meta function/decomposition
     xfail('block_diag', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('cartesian_prod', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('cdist', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('cholesky_inverse', ''),  # could not find kernel
     xfail('cholesky_solve', ''),  # could not find kernel
@@ -2244,7 +2413,6 @@ def forward(self, x):
     xfail('cummax', ''),  # aten.cummax.default - couldn't find symbolic meta function/decomposition
     xfail('cummin', ''),  # aten.cummin.default - couldn't find symbolic meta function/decomposition
     xfail('cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
-    xfail('cumsum', ''),  # aten.cumsum.default - couldn't find symbolic meta function/decomposition
     xfail('cumulative_trapezoid', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('diff', ''),  # aten.zeros_like.default - couldn't find symbolic meta function/decomposition
     xfail('digamma', ''),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
@@ -2273,7 +2441,6 @@ def forward(self, x):
     xfail('gradient', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('hsplit', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('i0', ''),  # aten.i0.default - couldn't find symbolic meta function/decomposition
-    xfail('index_put', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('inner', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('kron', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('kthvalue', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
@@ -2318,17 +2485,13 @@ def forward(self, x):
     xfail('masked.amax', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked.amin', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked.cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.cumsum', ''),  # aten.cumsum.default - couldn't find symbolic meta function/decomposition
     xfail('masked.prod', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked_scatter', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked_select', ''),  # aten.masked_select.default - couldn't find symbolic meta function/decompos...
     xfail('matrix_exp', ''),  # aten.linalg_matrix_exp.default - couldn't find symbolic meta function/decompo...
     xfail('median', ''),  # could not find kernel
-    xfail('meshgrid', 'list_of_tensors'),  # Cannot call numel() on tensor with symbolic sizes/strides
-    xfail('meshgrid', 'variadic_tensors'),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('min', 'reduction_with_dim'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('mode', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('nn.functional._scaled_dot_product_attention', ''),  # Cannot call sizes() on tensor with symbolic ...
     xfail('nn.functional.adaptive_avg_pool3d', ''),  # aten._adaptive_avg_pool3d_backward.default - couldn't ...
     xfail('nn.functional.adaptive_max_pool1d', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.adaptive_max_pool2d', ''),  # aten.adaptive_max_pool2d.default - couldn't find symbo...
@@ -2346,11 +2509,9 @@ def forward(self, x):
     xfail('nn.functional.grid_sample', ''),  # RuntimeError: aten.grid_sampler_3d.default - couldn't find sym ...
     xfail('nn.functional.group_norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'area'),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('nn.functional.interpolate', 'bicubic'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'linear'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'trilinear'),  # Cannot call sizes() on tensor with symbolic sizes/st...
     xfail('nn.functional.max_pool1d', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('nn.functional.max_pool2d', ''),  # aten.max_pool2d_with_indices_backward.default - couldn't find s...
     xfail('nn.functional.max_pool3d', ''),  # aten.max_pool3d_with_indices.default - couldn't find symbolic m...
     xfail('nn.functional.max_unpool1d', ''),  # aten.max_unpool2d.default - couldn't find symbolic meta funct...
     xfail('nn.functional.max_unpool1d', 'grad'),  # aten.max_unpool2d.default - couldn't find symbolic meta ...
@@ -2368,9 +2529,7 @@ def forward(self, x):
     xfail('nn.functional.pixel_unshuffle', ''),  # aten.pixel_unshuffle.default - couldn't find symbolic meta...
     xfail('nn.functional.rrelu', ''),  # aten.rrelu_with_noise.default - couldn't find symbolic meta function...
     xfail('nn.functional.smooth_l1_loss', ''),  # could not find kernel
-    xfail('nn.functional.unfold', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('norm', 'nuc'),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('normal', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('normal', 'number_mean'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('ormqr', ''),  # aten.ormqr.default - couldn't find symbolic meta function/decomposition
     xfail('pca_lowrank', ''),  # could not find kernel
@@ -2386,22 +2545,16 @@ def forward(self, x):
     xfail('qr', ''),  # aten.linalg_qr.default - couldn't find symbolic meta function/decomposition
     xfail('renorm', ''),  # aten.renorm.default - couldn't find symbolic meta function/decomposition
     xfail('repeat_interleave', ''),  # aten.repeat_interleave.Te...
-    xfail('reshape_as', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('roll', ''),  # narrow() received an invalid combination of arguments - got (FakeTensor, int, torch._C...
-    xfail('segment_reduce', 'lengths'),  # aten.segment_reduce.default - couldn't find symbolic meta functio...
-    xfail('segment_reduce', 'offsets'),  # aten.segment_reduce.default - couldn't find symbolic meta functio...
+    xfail('_segment_reduce', 'lengths'),  # aten.segment_reduce.default - couldn't find symbolic meta functio...
+    xfail('_segment_reduce', 'offsets'),  # aten.segment_reduce.default - couldn't find symbolic meta functio...
     xfail('sgn', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('special.i1', ''),  # aten.i0.default - couldn't find symbolic meta function/decomposition
     xfail('special.polygamma', 'special_polygamma_n_0'),  # aten.polygamma.default - couldn't find symbolic ...
-    xfail('std', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
-    xfail('std', 'unbiased'),  # Cannot call numel() on tensor with symbolic sizes/strides
-    xfail('std_mean', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
-    xfail('std_mean', 'unbiased'),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('stft', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('sum_to_size', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('svd', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('svd_lowrank', ''),  # could not find kernel
-    xfail('symeig', ''),  # aten.symeig.default - couldn't find symbolic meta function/decomposition
     xfail('take_along_dim', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('take', ''),  # aten.take.default - couldn't find symbolic meta function/decomposition
     xfail('tensordot', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
@@ -2410,11 +2563,7 @@ def forward(self, x):
     xfail('trapz', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('triangular_solve', ''),  # aten.triangular_solve.default - couldn't find symbolic meta function/de...
     xfail('unflatten', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('var', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
-    xfail('var', 'unbiased'),  # Cannot call numel() on tensor with symbolic sizes/strides
-    xfail('var_mean', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
-    xfail('var_mean', 'unbiased'),  # Cannot call numel() on tensor with symbolic sizes/strides
-    xfail('view_as', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
+    xfail('_upsample_bilinear2d_aa'),  # RuntimeError: isIntList() INTERNAL ASSERT FAILED  Expected IntList but got GenericList
     xfail('vsplit', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
 }
 
@@ -2427,10 +2576,10 @@ def call_forwards_backwards(f):
             flat_out, _ = pytree.tree_flatten(out)
             sm = 0
             for i in flat_out:
-                sm += i.sum()
+                sm += i.sum().abs()
             sm.backward()
         else:
-            out.sum().backward()
+            out.sum().abs().backward()
 
     def reset_grads():
         def f(x):
@@ -2491,7 +2640,16 @@ def f(args):
             return op.op(*c_args, **c_kwargs)
 
         compiled_f = compiled_function(f, nop, nop)
-        _test_aot_autograd_forwards_backwards_helper(self, f, compiled_f, args)
+        try:
+            _test_aot_autograd_forwards_backwards_helper(self, f, compiled_f, args)
+        except GuardOnDataDependentSymNode:
+            # Carveout for getitem; I don't want to xfail the entire test
+            # because that will reject known to be good tests see
+            # https://github.com/pytorch/pytorch/issues/94705
+            if op.name == "__getitem__":
+                self.skipTest("Dynamic output shape operation in trace")
+            else:
+                raise
 
 def _test_aot_autograd_module_helper(self, device, dtype, training, module_info):
     module_cls = module_info.module_cls
@@ -2508,11 +2666,17 @@ def _test_aot_autograd_module_helper(self, device, dtype, training, module_info)
 
         # Lazy modules need to see an input first to initialize params.
         args, kwargs = module_input.forward_input.args, module_input.forward_input.kwargs
+        flat_args, args_spec = pytree.tree_flatten((args, kwargs))
+
+        # PackedSequence is only used for RNNs. It might be possible to fake-ify if they're pytrees but
+        # torchdynamo already doesn't support RNNs
+        if any(tuple(isinstance(flat_arg, PackedSequence) for flat_arg in flat_args)):
+            continue
+
         if issubclass(module_info.module_cls, torch.nn.modules.lazy.LazyModuleMixin):
             with torch.no_grad():
                 m(*args, **kwargs)
 
-        flat_args, args_spec = pytree.tree_flatten((args, kwargs))
         sentinel_val = -42
         is_tensor_spec = [sentinel_val if isinstance(arg, torch.Tensor)
                           else arg for arg in flat_args]
@@ -2529,8 +2693,8 @@ def f(params_buffers_args):
             params_and_buffers = {**named_params, **named_buffers}
             return torch.func.functional_call(m, params_and_buffers, c_args, c_kwargs)
 
-        named_params = dict(_named_parameters(m, remove_duplicate=False))
-        named_buffers = dict(_named_buffers(m, remove_duplicate=False))
+        named_params = dict(m.named_parameters(remove_duplicate=False))
+        named_buffers = dict(m.named_buffers(remove_duplicate=False))
         num_params_buffers = len(named_params) + len(named_buffers)
         compiled_f = aot_function(f, nop, num_params_buffers=num_params_buffers)
         params_buffers_args = [named_params, named_buffers, args]
@@ -2544,7 +2708,6 @@ def test_aot_autograd_exhaustive(self, device, dtype, op):
         _test_aot_autograd_helper(self, device, dtype, op)
 
     @ops(op_db, allowed_dtypes=(torch.float,))
-    @skipIfNoSympy
     @patch("functorch.compile.config.use_dynamic_shapes", True)
     @patch("functorch.compile.config.use_fake_tensor", True)
     @patch("functorch.compile.config.use_functionalize", True)
@@ -2580,6 +2743,7 @@ def test_aot_autograd_symbolic_exhaustive(self, device, dtype, op):
     torch.nn.GaussianNLLLoss,  # NotImplementedError: local_scalar_dense/item NYI for torch.bool
     torch.nn.CrossEntropyLoss,  # Cannot call sizes() on tensor with symbolic sizes/strides
     torch.nn.Bilinear,  # Cannot call sizes() on tensor with symbolic sizes/strides
+    torch.nn.MultiheadAttention,  # baddbmm - Cannot call sizes() on tensor with symbolic ...
 }
 
 
@@ -2590,7 +2754,6 @@ def test_aot_autograd_module_exhaustive(self, device, dtype, training, module_in
         _test_aot_autograd_module_helper(self, device, dtype, training, module_info)
 
     @modules(module_db, allowed_dtypes=(torch.float,))
-    @skipIfNoSympy
     @patch("functorch.compile.config.use_dynamic_shapes", True)
     @patch("functorch.compile.config.use_fake_tensor", True)
     @patch("functorch.compile.config.use_functionalize", True)
diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py
index 13bafaaf36a4..cdc3ceed1b1a 100644
--- a/test/functorch/test_control_flow.py
+++ b/test/functorch/test_control_flow.py
@@ -1,9 +1,10 @@
 # Owner(s): ["module: functorch"]
+import unittest
+
 import torch
 from functorch.experimental import control_flow
 from functorch.experimental.control_flow import cond
 from functorch.experimental.control_flow import UnsupportedAliasMutationException
-from functorch.experimental import functionalize
 from torch.fx.experimental.proxy_tensor import make_fx
 
 from torch.testing._internal.common_utils import run_tests, TestCase
@@ -20,6 +21,30 @@ def false_fn(x):
         result = cond(False, true_fn, false_fn, [x])
         self.assertEqual(result, torch.cos(x))
 
+    @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA.")
+    def test_cond_gpu(self):
+        def true_fn(x):
+            return x.sin()
+
+        def false_fn(x):
+            return x.cos()
+
+        x = torch.randn(4, device="cuda")
+        pred = torch.tensor(False, device="cuda")
+        result = cond(False, true_fn, false_fn, [x])
+        self.assertEqual(result, torch.cos(x))
+
+    @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA.")
+    def test_map_gpu(self):
+        def f(x, y):
+            return x + y
+
+        xs = torch.ones(3, 2, 2, device="cuda")
+        y = torch.ones(2, device="cuda")
+        res = control_flow.map(f, xs, y)
+
+        self.assertEqual(res, control_flow.map(f, torch.ones(3, 2, 2), torch.ones(2)))
+
 
 class TestControlFlowTraced(TestCase):
     def test_cond_traced_not_nested(self):
@@ -89,10 +114,10 @@ def f(x):
             return cond(pred, true_fn, false_fn, [x])
 
         example_inputs = (torch.ones(4, 5),)
-        functional_f = functionalize(f)
+        functional_f = torch.func.functionalize(f)
         self.assertEqual(functional_f(*example_inputs), f(*example_inputs))
 
-        graph_module = make_fx(functionalize(f))(*example_inputs)
+        graph_module = make_fx(torch.func.functionalize(f))(*example_inputs)
         self.assertEqual(graph_module(*example_inputs), f(*example_inputs))
 
         all_ops_in_true_branch = []
@@ -114,7 +139,7 @@ def f(x):
 
         inp = torch.ones(1, 2)
         gm_non_functional = make_fx(f, tracing_mode="real")(inp)
-        gm_functional = make_fx(functionalize(gm_non_functional), tracing_mode="real")(inp)
+        gm_functional = make_fx(torch.func.functionalize(gm_non_functional), tracing_mode="real")(inp)
         self.assertEqual(gm_functional(torch.zeros(1, 2)), f(torch.zeros(1, 2)))
 
     def test_cond_functionalized_nested(self):
@@ -138,10 +163,10 @@ def f(x):
             return cond(pred, true_fn, false_fn, [x])
 
         example_inputs = (torch.ones(4, 5),)
-        functional_f = functionalize(f)
+        functional_f = torch.func.functionalize(f)
         self.assertEqual(functional_f(*example_inputs), f(*example_inputs))
 
-        graph_module = make_fx(functionalize(f))(*example_inputs)
+        graph_module = make_fx(torch.func.functionalize(f))(*example_inputs)
         self.assertEqual(graph_module(*example_inputs), f(*example_inputs))
 
         gm_true_true_branch = graph_module.true_graph_0.true_graph_0
@@ -165,10 +190,10 @@ def f(x):
             return cond(pred, true_fn, false_fn, [x])
 
         example_inputs = (torch.ones(4, 5),)
-        functional_f = functionalize(f)
+        functional_f = torch.func.functionalize(f)
         self.assertEqual(functional_f(*example_inputs), f(*example_inputs))
 
-        graph_module = make_fx(functionalize(f))(*example_inputs)
+        graph_module = make_fx(torch.func.functionalize(f))(*example_inputs)
         self.assertEqual(graph_module(*example_inputs), f(*example_inputs))
 
     def test_cond_functionalized_input_mutation_on_true_branch(self):
@@ -185,12 +210,12 @@ def f(x):
             return cond(pred, true_fn, false_fn, [x])
 
         example_inputs = (torch.ones(4, 5),)
-        functional_f = functionalize(f)
+        functional_f = torch.func.functionalize(f)
         with self.assertRaisesRegex(UnsupportedAliasMutationException, "One of torch.cond branch"):
             functional_f(*example_inputs)
 
         with self.assertRaisesRegex(UnsupportedAliasMutationException, "One of torch.cond branch"):
-            make_fx(functionalize(f))(*example_inputs)
+            make_fx(torch.func.functionalize(f))(*example_inputs)
 
     def test_cond_functionalized_input_mutation_on_false_branch(self):
         def true_fn(x):
@@ -206,12 +231,12 @@ def f(x):
             return cond(pred, true_fn, false_fn, [x])
 
         example_inputs = (torch.ones(5, 5),)
-        functional_f = functionalize(f)
+        functional_f = torch.func.functionalize(f)
         with self.assertRaisesRegex(UnsupportedAliasMutationException, "One of torch.cond branch"):
             functional_f(*example_inputs)
 
         with self.assertRaisesRegex(UnsupportedAliasMutationException, "One of torch.cond branch"):
-            make_fx(functionalize(f))(*example_inputs)
+            make_fx(torch.func.functionalize(f))(*example_inputs)
 
     def test_cond_functionalized_output_alias_input(self):
         def true_fn(x):
@@ -226,13 +251,13 @@ def f(x):
             return cond(pred, true_fn, false_fn, [x])
 
         example_inputs = (torch.ones(5, 5),)
-        functional_f = functionalize(f)
+        functional_f = torch.func.functionalize(f)
 
         with self.assertRaisesRegex(UnsupportedAliasMutationException, "One of torch.cond branch might be aliasing"):
             functional_f(*example_inputs)
 
         with self.assertRaisesRegex(UnsupportedAliasMutationException, "One of torch.cond branch might be aliasing"):
-            make_fx(functionalize(f))(*example_inputs)
+            make_fx(torch.func.functionalize(f))(*example_inputs)
 
     def test_cond_functionalized_nested_input_mutation(self):
         def true_true_fn(x):
@@ -254,12 +279,12 @@ def f(x):
             return cond(pred, true_fn, false_fn, [x])
 
         example_inputs = (torch.ones(4, 5),)
-        functional_f = functionalize(f)
+        functional_f = torch.func.functionalize(f)
         with self.assertRaisesRegex(UnsupportedAliasMutationException, "One of torch.cond branch"):
             functional_f(*example_inputs)
 
         with self.assertRaisesRegex(UnsupportedAliasMutationException, "One of torch.cond branch"):
-            make_fx(functionalize(f))(*example_inputs)
+            make_fx(torch.func.functionalize(f))(*example_inputs)
 
     def test_cond_nested_traced_other_inputs(self):
         def true_nested(y):
@@ -570,6 +595,65 @@ def g(xs, y):
         self.assertEqual(res, g(x, y))
         self.check_map_graph(gm, "val")
 
+    def test_map_functionalized(self):
+        def map_fn(x, y):
+            z = x + y
+            z.add_(4)
+            return z
+
+        def f(xs, y):
+            return control_flow.map(map_fn, xs, y)
+
+        example_inputs = (torch.ones(3, 2, 4), torch.ones(4))
+        functional_f = torch.func.functionalize(f)
+        self.assertEqual(functional_f(*example_inputs), f(*example_inputs))
+
+        gm = make_fx(torch.func.functionalize(f))(*example_inputs)
+        self.assertEqual(gm(*example_inputs), f(*example_inputs))
+
+        for node in gm.body_graph_0.graph.nodes:
+            if node.op == "call_function":
+                self.assertTrue(not node.target._schema.is_mutable)
+
+    def test_map_functionalized_arg_mutation(self):
+        def map_fn(x, y):
+            y.add_(4)
+            return x + y
+
+        def f(xs, y):
+            return control_flow.map(map_fn, xs, y)
+
+        example_inputs = (torch.ones(3, 2, 4), torch.ones(4))
+        functional_f = torch.func.functionalize(f)
+        with self.assertRaisesRegex(UnsupportedAliasMutationException, "torch.map is mutating the input!"):
+            functional_f(*example_inputs)
+
+    def test_map_functionalized_elem_mutation(self):
+        def map_fn(x, y):
+            x.add_(4)
+            return x + y
+
+        def f(xs, y):
+            return control_flow.map(map_fn, xs, y)
+
+        example_inputs = (torch.ones(3, 2, 4), torch.ones(4))
+        functional_f = torch.func.functionalize(f)
+        with self.assertRaisesRegex(UnsupportedAliasMutationException, "torch.map is mutating the input!"):
+            functional_f(*example_inputs)
+
+    def test_map_functionalized_elem_alias(self):
+        def map_fn(x):
+            x.view(x.shape)
+            return x
+
+        def f(xs):
+            return control_flow.map(map_fn, xs)
+
+        example_inputs = (torch.ones(3, 2, 4),)
+        functional_f = torch.func.functionalize(f)
+        with self.assertRaisesRegex(UnsupportedAliasMutationException, "torch.map is aliasing the input!"):
+            functional_f(*example_inputs)
+
     def test_nested_map_cond_real(self):
         def true_fn(x, y):
             return x * y
diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py
index c52686c35182..c75ef6205bcb 100644
--- a/test/functorch/test_eager_transforms.py
+++ b/test/functorch/test_eager_transforms.py
@@ -20,8 +20,9 @@
 import unittest
 import warnings
 import math
-from torch.testing._internal.common_device_type import instantiate_device_type_tests, onlyCPU
+from torch.testing._internal.common_device_type import instantiate_device_type_tests, onlyCPU, dtypes, onlyCUDA
 from torch.testing._internal.common_dtype import get_all_fp_dtypes
+from torch.testing import make_tensor
 from torch._subclasses.fake_tensor import FakeTensorMode
 from functools import partial
 from functorch.experimental import replace_all_batch_norm_modules_
@@ -40,7 +41,7 @@
 from torch._ops import PyOperator
 from torch._functorch.utils import enable_single_level_autograd_function
 import torch.autograd.forward_ad as fwAD
-from torch.func import functional_call, stack_module_state
+from torch.func import functional_call, stack_module_state, linearize
 
 # NB: numpy is a testing dependency!
 import numpy as np
@@ -368,9 +369,10 @@ def foo(x):
             assert not x.is_conj()
             y = x.conj()
             assert y.is_conj()
-            return y
+            return y.abs()
         res = grad(foo)(x)
-        self.assertEqual(res, torch.ones_like(res))
+        with torch.no_grad():
+            self.assertEqual(res, torch.ones_like(res) * torch.sgn(x))
 
     def test_composed_with_autograd(self, device):
         x = torch.randn([], requires_grad=True, device=device)
@@ -2112,6 +2114,33 @@ def f(x, idx):
             with self.assertRaisesRegex(RuntimeError, msg):
                 jacrev(fn, chunk_size=2, _preallocate_and_copy=_preallocate_and_copy)(x, idx)
 
+    def test_complex_error(self, device):
+        # Verify complex input raises error
+        # C -> C
+        def fn(x):
+            return x.conj()
+
+        x = torch.randn(1, device=device, dtype=torch.cfloat)
+
+        with self.assertRaisesRegex(RuntimeError, "jacrev: Expected all inputs"):
+            jacrev(fn)(x)
+
+        with self.assertRaisesRegex(RuntimeError, "jacfwd: Expected all inputs"):
+            jacfwd(fn)(x)
+
+        # Verify complex output raises error
+        # R -> C
+        def fn(x):
+            return torch.conj(x * 0.5j)
+
+        x = torch.randn(1, device=device, dtype=torch.float)
+
+        with self.assertRaisesRegex(RuntimeError, "jacrev: Expected all outputs"):
+            jacrev(fn)(x)
+
+        with self.assertRaisesRegex(RuntimeError, "jacfwd: Expected all outputs"):
+            jacfwd(fn)(x)
+
 
 class TestHessian(TestCase):
     def _test_against_reference(self, f, inputs):
@@ -2500,6 +2529,102 @@ def push_jvp(dummy, x):
         vmap(vmap(push_jvp, (0, None)))(dummy, x)
 
 
+class TestLinearize(TestCase):
+    @dtypes(torch.float)
+    def test_linearize_basic(self, device, dtype):
+        x_p = make_tensor((3, 1), device=device, dtype=dtype)
+        x_t = make_tensor((3, 1), device=device, dtype=dtype)
+
+        def fn(x):
+            return x.cos()
+
+        actual_output, jvp_fn = linearize(fn, x_p)
+        actual_jvp = jvp_fn(x_t)
+        expected_output, expected_jvp = jvp(fn, (x_p,), (x_t,))
+        self.assertEqual(actual_output, expected_output)
+        self.assertEqual(actual_jvp, expected_jvp)
+
+    @dtypes(torch.float)
+    def test_linearize_return(self, device, dtype):
+        x_p = make_tensor((3, 1), device=device, dtype=dtype)
+        x_t = make_tensor((3, 1), device=device, dtype=dtype)
+
+        def fn(x):
+            return (x.cos(), x.sum())
+
+        actual_output, jvp_fn = linearize(fn, x_p)
+        actual_jvp = jvp_fn(x_t)
+        expected_output, expected_jvp = jvp(fn, (x_p,), (x_t,))
+        self.assertEqual(actual_output, expected_output)
+        self.assertEqual(actual_jvp, expected_jvp)
+
+    @dtypes(torch.float)
+    def test_linearize_composition(self, device, dtype):
+        x_p = make_tensor((3, 1), device=device, dtype=dtype)
+        x_t = make_tensor((3, 3, 1), device=device, dtype=dtype)
+
+        def fn(x):
+            return (x.cos(), x.sum())
+
+        _, jvp_fn = linearize(fn, x_p)
+        actual_batched_jvp = vmap(jvp_fn)(x_t)
+
+        def jvp_fn(x_t):
+            return jvp(fn, (x_p,), (x_t,))[1]
+        expected_batched_jvp = vmap(jvp_fn)(x_t)
+
+        self.assertEqual(actual_batched_jvp, expected_batched_jvp)
+
+    @dtypes(torch.float)
+    def test_linearize_nested_input_nested_output(self, device, dtype):
+        x_p = make_tensor((3, 1), device=device, dtype=dtype)
+        x_t = make_tensor((3, 1), device=device, dtype=dtype)
+        y_p = make_tensor((3, 1), device=device, dtype=dtype)
+        y_t = make_tensor((3, 1), device=device, dtype=dtype)
+        z_p = make_tensor((3, 1), device=device, dtype=dtype)
+        z_t = make_tensor((3, 1), device=device, dtype=dtype)
+
+        def fn(arg):
+            x = arg['x']
+            y = arg['yz'][0]
+            z = arg['yz'][1]
+
+            return {'a': x.sum(), 'b': {'c': y + z, 'd': (x * z, y.exp())}}
+
+        inp_p = {'x': x_p, 'yz': (y_p, z_p)}
+        inp_t = {'x': x_t, 'yz': (y_t, z_t)}
+        actual_output, jvp_fn = linearize(fn, inp_p)
+        actual_jvp = jvp_fn(inp_t)
+
+        expected_output, expected_jvp = jvp(fn, (inp_p,), (inp_t,))
+
+        self.assertEqual(actual_output, expected_output)
+        self.assertEqual(actual_jvp, expected_jvp)
+
+    @onlyCUDA
+    def test_linearize_errors(self):
+        dtype = torch.float
+        device = torch.device('cpu')
+        x_p = make_tensor((3, 1), device=device, dtype=dtype)
+        x_t = make_tensor((3, 1), device=device, dtype=dtype)
+
+        def fn(x):
+            return x.sin()
+
+        _, jvp_fn = linearize(fn, x_p)
+
+        with self.assertRaisesRegex(RuntimeError, "to have the same argspec as the primals"):
+            jvp_fn((x_t, x_t))
+
+        with self.assertRaisesRegex(RuntimeError, "in flattened pytree doesn't match the shape"):
+            jvp_fn(x_t.unsqueeze(0))
+
+        with self.assertRaisesRegex(RuntimeError, "in flattened pytree doesn't match the dtype"):
+            jvp_fn(x_t.to(torch.double))
+
+        with self.assertRaisesRegex(RuntimeError, "in flattened pytree doesn't match the device"):
+            jvp_fn(x_t.to(torch.device('cuda')))
+
 # The tests here follow the cases in [Forward Grad View/inplace]
 # https://github.com/pytorch/pytorch/blob/master/torch/csrc/autograd/autograd_meta.cpp#L18-L43
 class TestVmapJvpInplaceView(TestCase):
@@ -2959,7 +3084,7 @@ def test_no_warning_on_import_functorch(self, device):
             [sys.executable, "-W", "all", "-c", "import functorch"],
             stderr=subprocess.STDOUT,
             cwd=os.path.dirname(os.path.realpath(__file__)),).decode("utf-8")
-        self.assertEquals(out, "")
+        self.assertEqual(out, "")
 
     def test_requires_grad_inside_transform(self, device):
         def f(x):
@@ -3317,7 +3442,7 @@ def forward(self, x):
     def test_correctness_mnist(self, mechanism):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
                 self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
                 self.conv2_drop = nn.Dropout2d()
@@ -3476,7 +3601,7 @@ def _update_params(self, params, grads, alpha, mechanism):
     def test_maml_regression(self, device, mechanism):
         class ThreeLayerNet(nn.Module):
             def __init__(self):
-                super(ThreeLayerNet, self).__init__()
+                super().__init__()
                 self.fc1 = nn.Linear(1, 40)
                 self.relu1 = nn.ReLU()
                 self.fc2 = nn.Linear(40, 40)
@@ -4452,6 +4577,11 @@ def test_functional_call_multiple_dicts(self):
     globals(),
     only_for=only_for,
 )
+instantiate_device_type_tests(
+    TestLinearize,
+    globals(),
+    only_for=only_for,
+)
 instantiate_device_type_tests(
     TestVmapJvpInplaceView,
     globals(),
diff --git a/test/functorch/test_minifier.py b/test/functorch/test_minifier.py
index 7ed13921d907..9e6f495bcd4b 100644
--- a/test/functorch/test_minifier.py
+++ b/test/functorch/test_minifier.py
@@ -18,7 +18,7 @@ def failing_f(x, y):
         failing_f = make_fx(failing_f)(*inps)
 
         def has_mul(fx_g, inps):
-            return (torch.ops.aten.mul.Tensor in set([i.target for i in fx_g.graph.nodes]))
+            return (torch.ops.aten.mul.Tensor in (i.target for i in fx_g.graph.nodes))
 
         min_f, inps = minifier(failing_f, inps, has_mul)
         self.assertEqual(len(min_f.graph.nodes), 4)
@@ -74,7 +74,7 @@ def f(a, b):
         inps = [torch.randn(3), torch.randn(3)]
 
         def has_add(fx_g, inps):
-            return (torch.ops.aten.add.Tensor in set([i.target for i in fx_g.graph.nodes]))
+            return (torch.ops.aten.add.Tensor in (i.target for i in fx_g.graph.nodes))
 
         failing_f = make_fx(f)(*inps)
         min_f, inps = minifier(failing_f, inps, has_add)
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 83bfa9385dc2..0e4d80707234 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -10,7 +10,7 @@
 import unittest
 
 from torch.testing._internal.common_utils import TestCase, run_tests, is_iterable_of_tensors, IS_MACOS, \
-    IS_X86, parametrize, TEST_WITH_ASAN, noncontiguous_like, IS_WINDOWS
+    IS_X86, parametrize, TEST_WITH_ASAN, noncontiguous_like
 import torch
 from torch import Tensor
 import functools
@@ -370,6 +370,7 @@ class TestOperators(TestCase):
     @skipOps('TestOperators', 'test_grad', vjp_fail.union({
         xfail('chalf', '', device_type='cpu'),  # RuntimeError: "sum_cpu" not implemented for 'ComplexHalf'
         xfail('sparse.sampled_addmm', ''),  # RuntimeError: Sparse CSR tensors do not have strides
+        xfail('sparse.mm', 'reduce'),  # RuntimeError: Sparse CSR tensors do not have strides
 
         # Non-contiguous Bugs
         #
@@ -383,8 +384,9 @@ class TestOperators(TestCase):
 
         # RuntimeError: Tensor must have a last dimension with stride 1
         xfail('view_as_complex'),
-        decorate('nn.functional._scaled_dot_product_attention',
-                 decorator=expectedFailureIf(not IS_WINDOWS), device_type='cuda'),
+        # query: last dimension must be contiguous
+        # Fused attention kernels require last dim to be contiguous
+        xfail('nn.functional.scaled_dot_product_attention', device_type='cuda'),
     }))
     @opsToleranceOverride('TestOperators', 'test_grad', (
         tol1('nn.functional.binary_cross_entropy_with_logits',
@@ -392,7 +394,7 @@ class TestOperators(TestCase):
         tol1('masked.cumprod',
              {torch.float32: tol(atol=1e-05, rtol=1e-05)}),
         tol1('svd_lowrank',
-             {torch.float32: tol(atol=3e-05, rtol=3e-05)}, device_type='cuda'),
+             {torch.float32: tol(atol=3e-05, rtol=3e-04)}, device_type='cuda'),
         tol1('linalg.tensorsolve',
              {torch.float32: tol(atol=3e-04, rtol=3e-04)}, device_type='cuda'),
     ))
@@ -428,10 +430,15 @@ def wrapped_fn(*args, **kwargs):
                 if sample.output_process_fn_grad is not None:
                     result = sample.output_process_fn_grad(result)
 
+                def abs_if_complex(t):
+                    if t.dtype.is_complex:
+                        return t.abs()
+                    return t
+
                 # Reduce into single value for grad
                 if isinstance(result, torch.Tensor):
-                    return result.sum()
-                result = sum([res.sum() for res in result])
+                    return abs_if_complex(result.sum())
+                result = sum([abs_if_complex(res.sum()) for res in result])
                 return result
 
             result = grad(wrapped_fn, diff_argnums)(*args, **kwargs)
@@ -454,7 +461,7 @@ def wrapped_fn(*args, **kwargs):
         xfail("native_batch_norm"),          # TODO: fails comparing None to tensor of 0s for saved_mean/var tangents
         xfail("_native_batch_norm_legit"),    # TODO: fails comparing None to tensor of 0s for saved_mean/var tangents
 
-        xfail('nn.functional._scaled_dot_product_attention', device_type='cuda'),
+        xfail('nn.functional.scaled_dot_product_attention', device_type='cuda'),
 
         xfail('nn.functional.rrelu'),  # in-place test errors out with no formula implemented
         xfail('NumpyExpMarkDirtyAutogradFunction'),  # TODO: https://github.com/pytorch/pytorch/issues/91280
@@ -566,15 +573,15 @@ def maybe_clone_inputs():
     @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @skipOps('TestOperators', 'test_vjp', vjp_fail.union({
         xfail('sparse.sampled_addmm', ''),
+        xfail('sparse.mm', 'reduce'),
 
         # ---- Non-Contiguous Failures ----
         # This is expected to fail as the operator
         # expects last dim to have stride=1
         xfail('view_as_complex'),
         # RuntimeError: query: last dimension must be contiguous
-        # NOTE: This passes on Windows!
-        decorate('nn.functional._scaled_dot_product_attention',
-                 decorator=unittest.skipIf(not IS_WINDOWS, "expects contiguous inputs")),
+        # The fused attention kernels require the last dim to be contiguous
+        xfail('nn.functional.scaled_dot_product_attention', device_type="cuda"),
         # BUG
         # AssertionError: Tensor-likes are not close!
         xfail('as_strided'),
@@ -645,7 +652,8 @@ def f(inp, *args, **kwargs):
         xfail('nn.functional.ctc_loss'),  # Not Implemented
         xfail('native_layer_norm', ''),  # Expected a proper Tensor but got None for argument #1 'other'
         xfail('sparse.sampled_addmm', ''),  # sparse tensors have no strides
-        skip('nn.functional._scaled_dot_product_attention', device_type='cuda'),
+        xfail('sparse.mm', 'reduce'),  # sparse tensors have no strides
+        skip('nn.functional.scaled_dot_product_attention', device_type='cuda'),
         # AssertionError: Tensor-likes are not close!
         # Mismatched elements: 1 / 15 (6.7%)
         # Greatest absolute difference: 24.0 at index (2, 4) (up to 1e-05 allowed)
@@ -740,7 +748,7 @@ def fn(inp, *args, **kwargs):
         skip("nn.functional.feature_alpha_dropout", "with_train"),  # calls random op
         skip("nn.functional.fractional_max_pool2d"),  # calls random op
         skip("nn.functional.fractional_max_pool3d"),  # calls random op
-        xfail('nn.functional._scaled_dot_product_attention'),  # randomness
+        xfail('nn.functional.scaled_dot_product_attention'),  # randomness
         # It looks like you're either (1) calling .item() on a Tensor or
         # (2) attempting to use a Tensor in some data-dependent control flow or
         # (3) encountering this error in PyTorch internals.
@@ -768,6 +776,7 @@ def fn(inp, *args, **kwargs):
         xfail("quantile", device_type='cpu'),  # Batching rule not implemented for `at::equal`
         xfail("scatter_reduce", "prod"),  # vmap (looks like you are calling item/data-dependent)
         xfail("sparse.sampled_addmm"),  # RuntimeError: Sparse CSR tensors do not have strides
+        xfail("sparse.mm", "reduce"),  # RuntimeError: Sparse CSR tensors do not have strides
         xfail("svd_lowrank"),  # calls random op
         xfail("take"),  # vmap: inplace into a regular tensor
         xfail("to"),  # rank 4 tensor for channels_last
@@ -849,7 +858,7 @@ def vjp_of_vjp(*args_and_cotangents):
         skip('nn.functional.dropout2d'),  # randomness
         skip('nn.functional.dropout3d', ''),  # randomness
         skip('nn.functional.alpha_dropout'),  # randomness
-        skip('nn.functional._scaled_dot_product_attention'),  # randomness
+        skip('nn.functional.scaled_dot_product_attention'),  # randomness
         xfail('as_strided'),  # as_strided is too wild for us to support, wontfix
         xfail('index_put', ''),  # not possible due to dynamic shapes; we support a subset
         xfail('masked_scatter'),  # dynamic
@@ -894,6 +903,7 @@ def vjp_of_vjp(*args_and_cotangents):
         xfail('nn.functional.max_unpool2d', 'grad'),
 
         xfail('sparse.sampled_addmm', ''),
+        xfail('sparse.mm', 'reduce'),
         xfail('as_strided_scatter', ''),  # calls as_strided
         xfail('index_reduce', ''),  # .item() call
         # ---------------------------------------------------------------------
@@ -941,7 +951,7 @@ def test_vmapvjp(self, device, dtype, op):
         skip('nn.functional.rrelu'),  # randomness
         skip('nn.functional.dropout2d', ''),
         skip('nn.functional.dropout3d', ''),
-        skip('nn.functional._scaled_dot_product_attention'),  # randomness
+        skip('nn.functional.scaled_dot_product_attention'),  # randomness
         skip('nn.functional.alpha_dropout'),  # randomness
         skip('nn.functional.feature_alpha_dropout', 'without_train'),
         skip('nn.functional.feature_alpha_dropout', 'with_train'),
@@ -1043,11 +1053,9 @@ def test_vmapjvpall(self, device, dtype, op):
         xfail('fill'),
         skip('masked.mean'),  # ???
         xfail('masked_scatter'),
-        xfail('index_fill'),
         xfail('put'),
         xfail('take'),
         xfail('nn.functional.max_pool3d'),
-        xfail('vdot'),
         xfail('nn.functional.feature_alpha_dropout', 'without_train'),
         xfail('linalg.lu_factor', ''),
         xfail('nn.functional.dropout2d', ''),
@@ -1077,7 +1085,7 @@ def test_vmapjvpall(self, device, dtype, op):
         xfail('nn.functional.dropout3d', ''),
         xfail('as_strided_scatter', ''),
         xfail('masked.cumprod', ''),
-        xfail('linalg.vecdot', ''),
+        xfail("_upsample_bilinear2d_aa"),  # hit vmap fallback, which is disabled
     }))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     def test_vmapjvpall_has_batch_rule(self, device, dtype, op):
@@ -1116,8 +1124,6 @@ def test():
         xfail('fill'),
         xfail('narrow'),  # Batching rule not implemented for `narrow.Tensor` (and view op)
         xfail('special.log_ndtr'),
-        xfail('index_copy'),
-        xfail('index_fill'),
         xfail('linalg.householder_product'),
         xfail('lu'),
         xfail('lu_solve'),
@@ -1139,7 +1145,6 @@ def test():
         xfail('to_sparse'),
         xfail('unfold'),
         xfail('unfold_copy'),
-        xfail('vdot'),
         xfail('nn.functional.dropout'),
         xfail('fft.ihfft2'),
         xfail('fft.ihfftn'),
@@ -1182,13 +1187,14 @@ def test():
         xfail('index_reduce', ''),
         xfail('nn.functional.dropout3d', ''),
         xfail('as_strided_scatter', ''),
-        xfail('segment_reduce', 'offsets'),
-        xfail('linalg.vecdot', ''),
-        xfail('segment_reduce', 'lengths'),
+        xfail('_segment_reduce', 'offsets'),
+        xfail('_segment_reduce', 'lengths'),
         xfail('sparse.sampled_addmm', ''),
+        xfail('sparse.mm', 'reduce'),
         xfail("native_batch_norm"),
         xfail("_native_batch_norm_legit"),
         xfail("native_dropout_backward"),
+        xfail("_upsample_bilinear2d_aa"),  # hit vmap fallback, which is disabled
     }))
     def test_vmapvjp_has_batch_rule(self, device, dtype, op):
         if not op.supports_autograd:
@@ -1226,7 +1232,7 @@ def test():
         skip('nn.functional.rrelu'),  # randomness
         skip('nn.functional.feature_alpha_dropout', 'with_train'),  # randomness
         skip('nn.functional.feature_alpha_dropout', 'without_train'),  # randomness
-        skip('nn.functional._scaled_dot_product_attention', device_type='cuda'),
+        skip('nn.functional.scaled_dot_product_attention', device_type='cuda'),
         skip('nn.functional.alpha_dropout'),  # randomness
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         skip('to_sparse', ''),  # non-dense output
@@ -1259,6 +1265,7 @@ def test():
         xfail('nn.functional.dropout3d', ''),
         xfail('as_strided_scatter', ''),
         xfail('sparse.sampled_addmm', ''),
+        xfail('sparse.mm', 'reduce'),
         xfail("native_batch_norm"),
         xfail("_native_batch_norm_legit"),
         xfail('as_strided', 'partial_views'),
@@ -1347,19 +1354,19 @@ def get_vjp(cotangents, *primals):
         xfail('NumpyCubeNotComposableAutogradFunction'),  # not composable
         xfail('renorm', ''),  # NYI: forward AD for renorm
         xfail('ormqr', ''),  # NYI: forward AD for ormqr
-        xfail('symeig', ''),  # NYI: forward AD for symeig
         xfail('nn.functional.multilabel_margin_loss', ''),  # NYI: multilabel_margin_loss_forward
         xfail('nn.functional.multilabel_soft_margin_loss', ''),  # NYI: log_sigmoid_backward
         xfail('nn.functional.soft_margin_loss', ''),  # NYI: forward-AD for log_sigmoid_backward
         xfail('nn.functional.ctc_loss', ''),  # NYI: forward-AD for _ctc_loss
         xfail('nn.functional.pdist', ''),  # NYI: forward-AD with _pdist_forward
-        skip('nn.functional._scaled_dot_product_attention', device_type='cuda'),
+        skip('nn.functional.scaled_dot_product_attention', device_type='cuda'),
         xfail('nn.functional.multi_margin_loss', ''),  # NYI: forward AD with multi_margin_loss
         skip('linalg.householder_product', '', device_type='cuda'),  # flaky, I'm not sure why
         xfail('sparse.sampled_addmm', ''),  # Sparse tensors have no strides
-        xfail('segment_reduce', 'offsets'),  # NYI: forward-AD for segment_reduce
+        xfail('_segment_reduce', 'offsets'),  # NYI: forward-AD for _segment_reduce
+        xfail('sparse.mm', 'reduce'),  # Sparse tensors have no strides
         xfail('index_reduce', ''),  # NYI: forward-AD for index_reduce
-        xfail('segment_reduce', 'lengths'),  # NYI: forward-AD for segment_reduce
+        xfail('_segment_reduce', 'lengths'),  # NYI: forward-AD for _segment_reduce
         xfail('native_dropout_backward'),  # NYI
 
     }))
@@ -1477,7 +1484,7 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
         xfail('nn.functional.dropout2d'),  # calls random op
         xfail('nn.functional.dropout3d'),  # calls random op
         xfail('nn.functional.dropout'),  # calls random op
-        xfail('nn.functional._scaled_dot_product_attention'),  # randomness
+        xfail('nn.functional.scaled_dot_product_attention'),  # randomness
         xfail('nn.functional.embedding_bag'),  # Forward AD not implemented and no decomposition
         xfail('nn.functional.alpha_dropout'),  # calls randomn op
         xfail('nn.functional.feature_alpha_dropout', 'with_train'),  # calls random op
@@ -1510,11 +1517,11 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
         xfail('quantile'),  # Batching rule not implemented for aten::equal
         xfail('renorm'),  # Forward AD not implemented and no decomposition
         xfail('scatter_reduce', 'prod'),  # Forward AD not implemented and no decomposition
-        xfail('segment_reduce', 'lengths'),  # Forward AD not implemented and no decomposition
-        xfail('segment_reduce', 'offsets'),  # Forward AD not implemented and no decomposition
+        xfail('_segment_reduce', 'lengths'),  # Forward AD not implemented and no decomposition
+        xfail('_segment_reduce', 'offsets'),  # Forward AD not implemented and no decomposition
         xfail('sparse.sampled_addmm'),  # RuntimeError: Sparse CSR tensors do not have strides
+        xfail('sparse.mm', 'reduce'),  # RuntimeError: Sparse CSR tensors do not have strides
         xfail('svd_lowrank'),  # calls random op
-        xfail('symeig'),  # Forward AD not implemented and no decomposition
         xfail('take'),  # vmap: inplace into regular tensor
         xfail('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         xfail('to_sparse'),  # Forward AD not implemented and no decomposition
@@ -1762,6 +1769,7 @@ def fn(input, weight, bias):
         skip('linalg.lu_factor_ex', dtypes=(torch.float32,), device_type='cuda'),  # fails on all but windows
         skip('linalg.multi_dot', '', device_type='cpu'),
         skip('sparse.sampled_addmm', ''),
+        skip('sparse.mm', 'reduce'),
         skip('native_layer_norm', '', device_type='cpu'),
     })
     @opsToleranceOverride('TestOperators', 'test_vmap_autograd_grad', (
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 0eca8dcecc64..df00c89ee800 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -20,7 +20,7 @@
 from torch.testing._internal.common_methods_invocations import op_db
 from torch.testing._internal.common_cuda import with_tf32_off
 from torch.testing._internal.common_device_type import instantiate_device_type_tests, \
-    skipCUDAIfNoMagma, OpDTypes
+    OpDTypes
 from torch.testing._internal.common_device_type import ops
 from torch.testing._internal.common_utils import (
     parametrize,
@@ -50,6 +50,7 @@
 )
 import types
 from collections import namedtuple
+import contextlib
 
 import functorch
 from functorch import vmap, grad, grad_and_value, jvp, vjp, jacfwd
@@ -3260,16 +3261,6 @@ def f(t):
         with self.assertRaisesRegex(RuntimeError, r"Attempted to vmap over aten::where"):
             vmap(f)(x)
 
-    @skipCUDAIfNoMagma
-    @allowVmapFallbackUsage
-    def test_symeig(self, device):
-        def op(x):
-            return torch.symeig(x, eigenvectors=True)[0]
-
-        x = torch.randn(3, 3, device=device, requires_grad=True)
-        self._batched_grad_test(op, (x,), {})
-        self._batched_grad_grad_test(op, (x,), {})
-
     def test_threshold(self, device):
         x = torch.randn(2, 3, device=device, requires_grad=True)
         self._batched_grad_test(lambda x: F.threshold(x, 0.5, 0.0), (x,))
@@ -3456,7 +3447,7 @@ def test():
         xfail('__getitem__'),  # dynamic mask
         xfail('index_put'),  # dynamic mask
         xfail('nn.functional.dropout'),  # works, can't check against for loop because of randomness inconsistency
-        xfail('nn.functional._scaled_dot_product_attention'),  # randomness
+        xfail('nn.functional.scaled_dot_product_attention'),  # randomness
         xfail('masked_select'),  # dynamic op
         xfail('nonzero'),  # dynamic op
         xfail('unique', ''),  # dynamic op
@@ -3484,6 +3475,7 @@ def test():
         xfail('pca_lowrank', ''),  # random operation
         xfail('svd_lowrank', ''),  # random operation
         xfail('sparse.sampled_addmm'),  # sparse
+        xfail('sparse.mm', 'reduce'),  # sparse
         xfail("NumpyCubeNotComposableAutogradFunction"),  # Not composable autograd.Function
         skip('_softmax_backward_data'),
         skip('linalg.eigh', ''),  # not unique, see test_linalg_eigh for manual test
@@ -3612,7 +3604,6 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('native_batch_norm'),
         xfail('_native_batch_norm_legit'),
         xfail('histogram'),
-        xfail('index_fill'),
         xfail('scatter_reduce', 'sum'),
         xfail('scatter_reduce', 'mean'),
         xfail('scatter_reduce', 'amax'),
@@ -3634,13 +3625,12 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('take'),
         xfail('tensor_split'),
         xfail('to_sparse'),
-        xfail('vdot'),
         xfail('tril'),  # Exception not raised on error input
         xfail('triu'),  # Exception not raised on error input
         xfail('__getitem__', ''),
         xfail('count_nonzero'),
         xfail('nn.functional.dropout'),  # works, can't check against for loop because of randomness inconsistency
-        xfail('nn.functional._scaled_dot_product_attention'),  # randomness
+        xfail('nn.functional.scaled_dot_product_attention'),  # randomness
         xfail('resize_'),
         xfail('view_as_complex'),
         xfail('matrix_exp'),
@@ -3712,10 +3702,11 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('clamp_min', ''),
         xfail('special.bessel_j0'),
         xfail('sparse.sampled_addmm'),
+        xfail('sparse.mm', 'reduce'),
         xfail('special.bessel_y0'),
         xfail('special.chebyshev_polynomial_u'),
         xfail('special.modified_bessel_k1'),
-        xfail('segment_reduce', 'offsets'),
+        xfail('_segment_reduce', 'offsets'),
         xfail('special.bessel_j1'),
         xfail('index_reduce', ''),
         xfail('special.laguerre_polynomial_l'),
@@ -3723,7 +3714,7 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('jiterator_binary', device_type='cuda'),
         xfail('special.modified_bessel_i0'),
         xfail('jiterator_4inputs_with_extra_args', device_type='cuda'),
-        xfail('segment_reduce', 'lengths'),
+        xfail('_segment_reduce', 'lengths'),
         xfail('lu_solve', ''),
         xfail('special.bessel_y1'),
         xfail('special.hermite_polynomial_he'),
@@ -3731,7 +3722,6 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('nn.functional.dropout3d', ''),
         xfail('special.scaled_modified_bessel_k1'),
         xfail('special.modified_bessel_k0'),
-        xfail('linalg.vecdot', ''),
         xfail('linalg.ldl_factor', ''),
         xfail('special.modified_bessel_i1'),
         xfail('special.chebyshev_polynomial_t'),
@@ -3740,14 +3730,17 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('linalg.lu', ''),
         skip('linalg.ldl_solve', ''),
         skip('_softmax_backward_data'),
+        # AssertionError: Tensor-likes are not equal!
+        # Issue: https://github.com/pytorch/pytorch/issues/70904
+        xfail('bitwise_left_shift', device_type='cpu'),
+        decorate('bitwise_right_shift', device_type='cpu',
+                 decorator=expectedFailureIf(not (IS_MACOS and IS_X86))),
+        # UBSAN: runtime error: shift exponent -1 is negative
+        decorate('bitwise_left_shift', decorator=unittest.skipIf(TEST_WITH_UBSAN, "Fails with above error")),
+        decorate('bitwise_right_shift', decorator=unittest.skipIf(TEST_WITH_UBSAN, "Fails with above error")),
         # One or more of the overload doesn't have a Batch rule.
         xfail('where'),
         xfail('bincount'),
-        xfail('bitwise_and'),
-        xfail('bitwise_or'),
-        xfail('bitwise_xor'),
-        xfail('bitwise_left_shift'),
-        xfail('bitwise_right_shift'),
         xfail('float_power'),
         xfail('gt'),
         xfail('le'),
@@ -3758,6 +3751,7 @@ def test_vmap_exhaustive(self, device, dtype, op):
         # RuntimeError: Expected all tensors to be on the same device,
         # but found at least two devices, cuda:0 and cpu!
         xfail('ge', device_type='cuda'),
+        xfail('_upsample_bilinear2d_aa'),
     }))
     def test_op_has_batch_rule(self, device, dtype, op):
         # needs to be fixed
@@ -3859,11 +3853,83 @@ def test_slogdet(self, device):
         # There's no OpInfo for this
         def test():
             B = 2
-            x = torch.randn(2, 5, 5, device=device)
+            x = torch.randn(B, 5, 5, device=device)
             self.vmap_outplace_test(torch.slogdet, (x,), {}, (0,))
 
         check_vmap_fallback(self, test, torch.slogdet)
 
+    def test_index_fill(self, device):
+        # There's no OpInfo for these tests
+
+        B = 2
+
+        def test1():
+            # negative dim
+            x = torch.randn(B, 5, 5, device=device)
+            dim = -2
+            index = torch.tensor([[2, 3], [0, 4]], device=device)
+            value = 5.0
+            self.vmap_outplace_test(torch.index_fill, (x, dim, index, value), {}, (None, None, 0, None))
+
+        def test2():
+            # self batched, self logical rank 1, index logical rank 1
+            x = torch.zeros(B, 3, device=device)
+            dim = 0
+            index = torch.tensor([[0], [1]], device=device)
+            value = 1
+            self.vmap_outplace_test(torch.index_fill, (x, dim, index, value), {}, (0, None, 0, None))
+
+        def test3():
+            # self batched, self logical rank 1, index logical rank 0
+            x = torch.zeros(B, 3, device=device)
+            dim = 0
+            index = torch.tensor([0, 1], device=device)
+            value = 1
+            self.vmap_outplace_test(torch.index_fill, (x, dim, index, value), {}, (0, None, 0, None))
+
+        def test4():
+            # self not batched, self logical rank 0, index logical rank 1
+            x = torch.zeros([], device=device)
+            dim = 0
+            index = torch.tensor([[0], [0]], device=device)
+            value = 1
+            self.vmap_outplace_test(torch.index_fill, (x, dim, index, value), {}, (None, None, 0, None))
+
+        def test5():
+            # self not batched, self logical rank 0, index logical rank 0
+            x = torch.zeros([], device=device)
+            dim = 0
+            index = torch.tensor([0, 0], device=device)
+            value = 1
+            self.vmap_outplace_test(torch.index_fill, (x, dim, index, value), {}, (None, None, 0, None))
+
+        def test6():
+            # self not batched, self logical rank 0, index logical rank 1
+            x = torch.zeros(3, device=device)
+            dim = 0
+            index = torch.tensor([[0], [1]], device=device)
+            value = 1
+            self.vmap_outplace_test(torch.index_fill, (x, dim, index, value), {}, (None, None, 0, None))
+
+        def test7():
+            # self not batched, self logical rank 0, index logical rank 0
+            x = torch.zeros(3, device=device)
+            dim = 0
+            index = torch.tensor([0, 1], device=device)
+            value = 1
+            self.vmap_outplace_test(torch.index_fill, (x, dim, index, value), {}, (None, None, 0, None))
+
+        def test8():
+            # self batched, self logical rank > 1, index logical rank 0
+            x = torch.zeros(B, 3, 3, device=device)
+            dim = 0
+            index = torch.tensor([0, 1], device=device)
+            value = 1
+            self.vmap_outplace_test(torch.index_fill, (x, dim, index, value), {}, (0, None, 0, None))
+
+        for test in (test1, test2, test3, test4, test5, test6, test7, test8):
+            check_vmap_fallback(self, test, torch.index_fill)
+
     def test_fill__Tensor(self, device):
         # There's no OpInfo for fill_.Tensor, so here's an extra test for it.
         def test():
@@ -4265,6 +4331,23 @@ def f(x):
         with self.assertRaisesRegex(RuntimeError, common_message.format("gen_vmap_plumbing_no_returns")):
             torch.ops.aten._linalg_check_errors(escaped, 'linalg.inv', is_matrix=False)
 
+    def test_vmap_with_anomaly_detection(self):
+        with torch.autograd.set_detect_anomaly(True):
+            x = torch.zeros(3) - 1
+
+            def fn(x):
+                return x.sum()
+
+            per_sample_grad = vmap(grad(fn))(x)
+            self.assertEqual(per_sample_grad, torch.ones_like(x))
+
+            def bad_fn(x):
+                return x.sqrt().sum()
+
+            err_msg = "Function 'SqrtBackward0' returned nan values in its 0th output."
+            with self.assertRaisesRegex(RuntimeError, err_msg):
+                vmap(grad(bad_fn))(x)
+
 class TestRandomness(TestCase):
     def _reset_random(self, generator, orig_state, use_generator, seed):
         return generator.set_state(orig_state) if use_generator else torch.manual_seed(seed)
@@ -4908,6 +4991,21 @@ def test_jacfwd_with_random(self):
         jacfwd(torch.bernoulli, randomness="same")(x)
         jacfwd(torch.bernoulli, randomness="different")(x)
 
+    @parametrize('randomness', ['error', 'same', 'different'])
+    def test_dropout_unbatched(self, device, randomness):
+        x = torch.randn(3, device=device)
+        y = torch.randn(1, 3, device=device)
+
+        def fn(x, y):
+            # output from dropout should be a Tensor[B, 1, 3] (B=3)
+            return x + torch.nn.functional.dropout(y, p=0.5).mean(1)
+
+        # We just verify that this doesn't raise an error for
+        # `same` and `different` randomness.
+        # Ref: https://github.com/pytorch/pytorch/issues/92283
+        context = self.assertRaises(RuntimeError) if randomness == 'error' else contextlib.nullcontext()
+        with context:
+            vmap(fn, in_dims=(0, None), randomness=randomness)(x, y)
 
 class TestTransformFailure(TestCase):
     @parametrize('transform', ['vmap', 'grad', 'grad_and_value', 'vjp', 'jvp', 'jacrev', 'jacfwd'])
diff --git a/test/functorch/test_vmap_registrations.py b/test/functorch/test_vmap_registrations.py
index d52148401d13..27b1f6d47260 100644
--- a/test/functorch/test_vmap_registrations.py
+++ b/test/functorch/test_vmap_registrations.py
@@ -16,15 +16,11 @@
 
 xfail_functorch_batched = {
     "aten::flatten.using_ints",
-    "aten::gather_backward",
     "aten::imag",
     "aten::is_nonzero",
     "aten::isfinite",
     "aten::isreal",
     "aten::item",
-    "aten::linalg_matrix_power",
-    "aten::linalg_matrix_rank.atol_rtol_float",
-    "aten::linalg_matrix_rank.atol_rtol_tensor",
     "aten::linalg_pinv",
     "aten::linalg_pinv.atol_rtol_float",
     "aten::linalg_slogdet",
@@ -36,10 +32,6 @@
     "aten::movedim.intlist",
     "aten::one_hot",
     "aten::real",
-    "aten::relu6",
-    "aten::relu6_",
-    "aten::selu",
-    "aten::selu_",
     "aten::silu_backward",
     "aten::special_xlogy",
     "aten::special_xlogy.other_scalar",
@@ -86,9 +78,6 @@
     "aten::arctanh_",
     "aten::argwhere",
     "aten::bilinear",
-    "aten::bitwise_and_.Scalar",
-    "aten::bitwise_or_.Scalar",
-    "aten::bitwise_xor_.Scalar",
     "aten::can_cast",
     "aten::cat.names",
     "aten::chain_matmul",
@@ -132,7 +121,6 @@
     "aten::floor_divide_.Scalar",
     "aten::frobenius_norm",
     "aten::fused_moving_avg_obs_fake_quant",
-    "aten::gather_backward",
     "aten::get_gradients",
     "aten::greater.Scalar",
     "aten::greater_.Scalar",
@@ -165,10 +153,7 @@
     "aten::linalg_eigh.eigvals",
     "aten::linalg_ldl_factor",
     "aten::linalg_lu_factor",
-    "aten::linalg_matrix_power",
     "aten::linalg_matrix_rank",
-    "aten::linalg_matrix_rank.atol_rtol_float",
-    "aten::linalg_matrix_rank.atol_rtol_tensor",
     "aten::linalg_matrix_rank.out_tol_tensor",
     "aten::linalg_matrix_rank.tol_tensor",
     "aten::linalg_pinv",
@@ -178,7 +163,6 @@
     "aten::linalg_slogdet",
     "aten::linalg_svd.U",
     "aten::linalg_tensorsolve",
-    "aten::linalg_vecdot",
     "aten::linear",
     "aten::log_sigmoid",
     "aten::log_softmax.int",
@@ -231,8 +215,6 @@
     "aten::quantile.scalar",
     "aten::real",
     "aten::refine_names",
-    "aten::relu6",
-    "aten::relu6_",
     "aten::rename",
     "aten::rename_",
     "aten::requires_grad_",
@@ -244,9 +226,6 @@
     "aten::rnn_tanh.data",
     "aten::rnn_tanh.input",
     "aten::rnn_tanh_cell",
-    "aten::rrelu_",
-    "aten::selu",
-    "aten::selu_",
     "aten::set_.source_Tensor_storage_offset",
     "aten::set_data",
     "aten::silu_backward",
@@ -321,6 +300,7 @@
     "aten::var_mean.names_dim",
     "aten::where",
     "aten::where.Scalar",
+
 }
 
 
diff --git a/test/functorch/xfail_suggester.py b/test/functorch/xfail_suggester.py
index cdf2cca13671..cfe1460a01ac 100644
--- a/test/functorch/xfail_suggester.py
+++ b/test/functorch/xfail_suggester.py
@@ -114,7 +114,7 @@ def get_suggested_xfails(base, tests):
     tests = [test[len(base):] for test in tests if
              belongs_to_base(test, base)]
 
-    base_tests = set([remove_device_dtype(test) for test in tests])
+    base_tests = {remove_device_dtype(test) for test in tests}
     tests = set(tests)
     for base in base_tests:
         cpu_variant = base + '_cpu_float32'
diff --git a/test/fx/test_dce_pass.py b/test/fx/test_dce_pass.py
index 4f46b9982ba9..b8074049eaec 100644
--- a/test/fx/test_dce_pass.py
+++ b/test/fx/test_dce_pass.py
@@ -119,9 +119,6 @@ def test_dead_placeholder(self):
         """
 
         class TestModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 return x + 7
 
@@ -136,9 +133,6 @@ def test_dead_placeholder_with_user(self):
         """
 
         class TestModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 a = y + 2
                 return x + 7
@@ -172,9 +166,6 @@ def test_keep_torch_assert(self):
         """
 
         class TestModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, a: torch.Tensor) -> torch.Tensor:
                 torch._assert(torch.equal(a, a), "a must equal a")
                 return a * 2
diff --git a/test/fx/test_fx_const_fold.py b/test/fx/test_fx_const_fold.py
index d7f3b16f2466..9641a1f9ba97 100644
--- a/test/fx/test_fx_const_fold.py
+++ b/test/fx/test_fx_const_fold.py
@@ -133,9 +133,6 @@ def test_const_fold_basic_placeholder_reordered(self):
         """
 
         class ConstFoldTestModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 return x * 2 + y
 
@@ -703,7 +700,7 @@ def forward(self, x, y):
         for n in mod_folded.graph.nodes:
             if n.op == "get_attr":
                 attr = self._get_attr(n)
-                self.assertEquals(_extract_tensor_metadata(attr), n.meta["tensor_meta"])
+                self.assertEqual(_extract_tensor_metadata(attr), n.meta["tensor_meta"])
 
         # Now run both folded and non-folded to check results equal.
         base_result = mod(in_x, in_y)
diff --git a/test/fx/test_gradual_type.py b/test/fx/test_gradual_type.py
index 131debf149fb..23c6496b3a29 100644
--- a/test/fx/test_gradual_type.py
+++ b/test/fx/test_gradual_type.py
@@ -12,14 +12,7 @@
 from torch.fx import GraphModule
 from torch.fx.passes.shape_prop import ShapeProp
 from torch.testing._internal.common_utils import TestCase
-
-
-try:
-    import sympy
-    HAS_SYMPY = True
-except ImportError:
-    HAS_SYMPY = False
-skipIfNoSympy = unittest.skipIf(not HAS_SYMPY, "no sympy")
+import sympy
 
 
 try:
@@ -278,7 +271,7 @@ def test_type_check_batch_norm_2D(self):
         class BasicBlock(torch.nn.Module):
 
             def __init__(self, inplanes, planes):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 norm_layer = torch.nn.BatchNorm2d
                 self.bn1 = norm_layer(planes)
 
@@ -309,7 +302,7 @@ def test_type_check_batch_norm_2D_false(self):
         class BasicBlock(torch.nn.Module):
 
             def __init__(self, inplanes, planes):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 norm_layer = torch.nn.BatchNorm2d
                 self.bn1 = norm_layer(planes)
 
@@ -331,7 +324,7 @@ def test_type_check_batch_norm_2D_broadcast(self):
         class BasicBlock(torch.nn.Module):
 
             def __init__(self, inplanes, planes):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 norm_layer = torch.nn.BatchNorm2d
                 self.bn1 = norm_layer(planes)
 
@@ -368,7 +361,7 @@ def forward(self, x: Dyn):
     def test_type_check_conv2D(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self, inplanes, planes, stride=1):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 norm_layer = torch.nn.BatchNorm2d
                 self.conv1 = conv3x3(inplanes, planes, stride)
                 self.bn1 = norm_layer(planes)
@@ -398,7 +391,7 @@ def forward(self, x: Dyn):
     def test_type_check_conv2D_2(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self, inplanes, planes, stride=1):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 norm_layer = torch.nn.BatchNorm2d
                 self.conv1 = conv3x3(inplanes, planes, stride)
                 self.bn1 = norm_layer(planes)
@@ -466,7 +459,7 @@ def test_type_check_conv2D_2_fully_static(self):
 
             class BasicBlock(torch.nn.Module):
                 def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation):
-                    super(BasicBlock, self).__init__()
+                    super().__init__()
                     self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes,
                                                  kernel_size=kernel_size, stride=stride,
                                                  padding=padding, groups=groups, bias=False, dilation=dilation)
@@ -496,7 +489,7 @@ def forward(self, x):
             # test with intermediate annotations
             class BasicBlock(torch.nn.Module):
                 def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation):
-                    super(BasicBlock, self).__init__()
+                    super().__init__()
                     self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes,
                                                  kernel_size=kernel_size, stride=stride,
                                                  padding=padding, groups=groups, bias=False, dilation=dilation)
@@ -529,7 +522,7 @@ class BasicBlock(torch.nn.Module):
 
             def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                          base_width=64, dilation=1):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 norm_layer = torch.nn.BatchNorm2d
                 if groups != 1 or base_width != 64:
                     raise ValueError('BasicBlock only supports groups=1 and base_width=64')
@@ -580,7 +573,7 @@ def test_type_check_conv2D_maxpool2d_flatten(self):
 
         class BasicBlock(torch.nn.Module):
             def __init__(self):
-                super(BasicBlock, self).__init__()
+                super().__init__()
 
                 self.conv1 = torch.nn.Conv2d(3, 6, 5)
                 self.pool = torch.nn.MaxPool2d(2, 2)
@@ -664,7 +657,7 @@ def test_type_typechecl_maxpool2d_3dinput(self):
 
         class BasicBlock(torch.nn.Module):
             def __init__(self):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.pool = torch.nn.MaxPool2d(5, 8)
 
             def forward(self, x : TensorType((64, 8, 8))):
@@ -706,7 +699,7 @@ def test_type_maxpool2d_fully_static(self):
 
             class BasicBlock(torch.nn.Module):
                 def __init__(self, kernel_size, stride, padding, dilation):
-                    super(BasicBlock, self).__init__()
+                    super().__init__()
                     self.pool = torch.nn.MaxPool2d(kernel_size, stride=stride,
                                                    padding=padding, dilation=dilation,
                                                    return_indices=False, ceil_mode=False)
@@ -736,7 +729,7 @@ def forward(self, x):
             # test with intermediate annotations
             class BasicBlock(torch.nn.Module):
                 def __init__(self, kernel_size, stride, padding, dilation):
-                    super(BasicBlock, self).__init__()
+                    super().__init__()
                     self.pool = torch.nn.MaxPool2d(kernel_size, stride=stride,
                                                    padding=padding, dilation=dilation,
                                                    return_indices=False, ceil_mode=False)
@@ -787,7 +780,7 @@ def test_flatten_fully_static(self):
 
             class BasicBlock(torch.nn.Module):
                 def __init__(self, start, end):
-                    super(BasicBlock, self).__init__()
+                    super().__init__()
                     self.start = start
                     self.end = end
 
@@ -813,7 +806,6 @@ def forward(self, x):
                 if n.op == 'output':
                     assert is_consistent(n.type, TensorType(b.size()))
 
-    @skipIfNoSympy
     @skipIfNoTorchVision
     def test_resnet50(self):
         gm_run = symbolic_trace(resnet50())
@@ -860,12 +852,11 @@ def test_resnet50(self):
             batch_sizes.add(n.type.__args__[0])
         assert (len(batch_sizes) == 1)
 
-    @skipIfNoSympy
     def test_type_check_batch_norm_symbolic(self):
         class BasicBlock(torch.nn.Module):
 
             def __init__(self, inplanes, planes):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 norm_layer = torch.nn.BatchNorm2d
                 self.bn1 = norm_layer(planes)
 
@@ -892,7 +883,6 @@ def forward(self, x: Dyn):
         for n in graph.nodes:
             assert n.type == next(my_types)
 
-    @skipIfNoSympy
     def test_symbolic_add_with_broadcast(self):
         class M(torch.nn.Module):
             def forward(self, x: TensorType((1, 2, 3, Dyn)), y: TensorType((2, 3, 4))):
@@ -921,7 +911,6 @@ def forward(self, x: TensorType((1, 2, 3, Dyn)), y: TensorType((2, 3, 4))):
         for n in symbolic_traced.graph.nodes:
             assert n.type == next(expected_iter)
 
-    @skipIfNoSympy
     def test_symbolic_add_with_broadcast_2(self):
         class M(torch.nn.Module):
             def forward(self, x: TensorType((1, 2)), y: TensorType((Dyn, 2))):
@@ -943,11 +932,10 @@ def forward(self, x: TensorType((1, 2)), y: TensorType((Dyn, 2))):
         for n in symbolic_traced.graph.nodes:
             assert n.type == next(expected_iter)
 
-    @skipIfNoSympy
     def test_type_check_conv2D_types(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self, inplanes, planes, stride=1):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 norm_layer = torch.nn.BatchNorm2d
                 self.conv1 = conv3x3(inplanes, planes, stride)
                 self.bn1 = norm_layer(planes)
@@ -971,12 +959,11 @@ def forward(self, x: Dyn):
                 assert isinstance(n.type.__args__[2], sympy.floor)
                 assert isinstance(n.type.__args__[3], sympy.floor)
 
-    @skipIfNoSympy
     def test_type_check_symbolic_inferenceconv2D_maxpool2d_flatten(self):
 
         class BasicBlock(torch.nn.Module):
             def __init__(self):
-                super(BasicBlock, self).__init__()
+                super().__init__()
 
                 self.conv1 = torch.nn.Conv2d(3, 6, 5)
                 self.pool = torch.nn.MaxPool2d(2, 2)
diff --git a/test/fx/test_matcher_utils.py b/test/fx/test_matcher_utils.py
index 6240c2f4df65..3361a63ec2bd 100644
--- a/test/fx/test_matcher_utils.py
+++ b/test/fx/test_matcher_utils.py
@@ -46,3 +46,29 @@ def forward(self, x):
         subgraph_matcher = SubgraphMatcher(pattern_graph)
         match_result = subgraph_matcher.match(large_model_graph)
         self.assertEqual(len(match_result), 1)
+
+    def test_subgraph_matcher_with_list(self):
+        def original(x, y):
+            return torch.ops.aten.view(x, [5, y.shape[0]])
+        original_graph = torch.fx.symbolic_trace(original).graph
+
+        def pattern(x, y, z):
+            return torch.ops.aten.view(x, [z, y.shape[0]])
+        pattern_graph = torch.fx.symbolic_trace(pattern).graph
+
+        subgraph_matcher = SubgraphMatcher(pattern_graph)
+        match_result = subgraph_matcher.match(original_graph)
+        self.assertEqual(len(match_result), 1)
+
+    def test_subgraph_matcher_with_list_bad(self):
+        def original(x, y):
+            return torch.ops.aten._reshape_alias_copy.default(x, [1, y.shape[0]], [y.shape[1], y.shape[1]])
+        original_graph = torch.fx.symbolic_trace(original).graph
+
+        def pattern(x, y, b):
+            return torch.ops.aten._reshape_alias_copy.default(x, [b, y.shape[0], y.shape[1]], [y.shape[1]])
+        pattern_graph = torch.fx.symbolic_trace(pattern).graph
+
+        subgraph_matcher = SubgraphMatcher(pattern_graph)
+        match_result = subgraph_matcher.match(original_graph)
+        self.assertEqual(len(match_result), 0)
diff --git a/test/fx/test_pass_infra.py b/test/fx/test_pass_infra.py
index b14eddb3b982..9cb6dc3860cd 100644
--- a/test/fx/test_pass_infra.py
+++ b/test/fx/test_pass_infra.py
@@ -52,9 +52,6 @@ def replace_sub_with_add_pass(gm) -> PassResult:
 
 
 class AddModule(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
     def forward(self, x):
         y = torch.add(x, x)
         z = torch.add(y, x)
diff --git a/test/fx/test_subgraph_rewriter.py b/test/fx/test_subgraph_rewriter.py
index 77c081fe3141..da9e4c63d028 100644
--- a/test/fx/test_subgraph_rewriter.py
+++ b/test/fx/test_subgraph_rewriter.py
@@ -775,9 +775,6 @@ def gemm_bias_mul_replacement_with_c(a, b, bias, c):
 
     def test_replace_pattern_with_filters(self):
         class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, scale, zero_point):
                 # Match, second input to add is a scalar
                 x = x.dequantize()
diff --git a/test/fx/test_z3_gradual_types.py b/test/fx/test_z3_gradual_types.py
index e8b239b81538..f9f2e8e92b6d 100644
--- a/test/fx/test_z3_gradual_types.py
+++ b/test/fx/test_z3_gradual_types.py
@@ -33,9 +33,6 @@ class TorchDynamoUseCases(unittest.TestCase):
 
     def test_dim(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x: TensorType([1, 2])):
                 y = x.dim()
                 return y
@@ -56,9 +53,6 @@ def test_reshape(self):
         """
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x: Dyn):
                 y = x.view(100)
                 tmp = y.size()[0]
@@ -82,9 +76,6 @@ def test_eq_dim(self):
         test dimensions and equalities
         """
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([32, 4, 4])):
                 eq = x.dim() == 3
                 return eq
@@ -111,9 +102,6 @@ def test_conditional_ne_1(self):
 
         """
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([32, 4, 4]), y: TensorType([32, 4, 4])):
                 size_5 = x.size()
                 getitem_7 = size_5[0]
@@ -138,9 +126,6 @@ def forward(self, x: TensorType([32, 4, 4]), y: TensorType([32, 4, 4])):
 
     def test_bmm(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn, 2, 3]), y: TensorType([1, 3, 2])):
                 bmm = torch.bmm(x, y)
                 return bmm
@@ -161,9 +146,6 @@ def forward(self, x: TensorType([Dyn, 2, 3]), y: TensorType([1, 3, 2])):
 
     def test_bmm2(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: Dyn, y: TensorType([1, 3, 2])):
                 bmm = torch.bmm(x, y)
                 return bmm
@@ -183,9 +165,6 @@ def forward(self, x: Dyn, y: TensorType([1, 3, 2])):
 
     def test_bmm3(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([2, 3, 3]), y: TensorType([1, 3, 2])):
                 bmm = torch.bmm(x, y)
                 return bmm
@@ -200,9 +179,6 @@ def forward(self, x: TensorType([2, 3, 3]), y: TensorType([1, 3, 2])):
 
     def test_transpose(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([1, 2, 3, 4])):
                 transpose = x.transpose(0, 1)
                 return transpose
@@ -235,9 +211,6 @@ def forward(self, x: TensorType([1, 2, 3, 4])):
 
     def test_index_select(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([2050, 1024]), y: Dyn):
                 index_select = x.index_select(0, y)
                 return index_select
@@ -269,9 +242,6 @@ def forward(self, x: TensorType([2050, 1024]), y: Dyn):
 
     def test_get_attr(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([1, 2, 3])):
                 getattr = x.device
                 to = x.to(getattr)
@@ -291,9 +261,6 @@ def forward(self, x: TensorType([1, 2, 3])):
 
     def test_expand(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([1, 4])):
                 size = x.size()
                 getitem = size[-1]
@@ -307,7 +274,7 @@ def forward(self, x: TensorType([1, 4])):
 
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         expand_res = z3.Const(4, tensor_type)
         assert s.model()[expand_res].arg(0).arg(1) == b.shape[0]
         assert s.model()[expand_res].arg(1).arg(1) == b.shape[1]
@@ -322,15 +289,12 @@ def forward(self, x: TensorType([1, 4])):
 
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         assert s.model()[expand_res].arg(1).arg(1) == b.shape[1]
 
     def test_getitem_tensor(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([4, 4])):
                 getitem = x[(None, None, slice(None, None, None), slice(None, None, None))]
                 return getitem
@@ -343,7 +307,7 @@ def forward(self, x: TensorType([4, 4])):
 
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         get_item_res = z3.Const(2, tensor_type)
         assert s.model()[get_item_res].arg(0).arg(1) == b.shape[0]
         assert s.model()[get_item_res].arg(1).arg(1) == b.shape[1]
@@ -366,9 +330,6 @@ def forward(self, x: TensorType([4, 4])):
 
     def test_getitem_tensor2(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([4, 4])):
                 getitem = x[(None, None)]
                 return getitem
@@ -380,7 +341,7 @@ def forward(self, x: TensorType([4, 4])):
         transformed = transform_all_constraints(symbolic_traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         get_item_res = z3.Const(2, tensor_type)
         assert s.model()[get_item_res].arg(0).arg(1) == b.shape[0]
         assert s.model()[get_item_res].arg(1).arg(1) == b.shape[1]
@@ -390,9 +351,6 @@ def forward(self, x: TensorType([4, 4])):
 
     def test_getitem_tensor_3(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([4, 4])):
                 getitem = x[(None, slice(None, None, None), None, slice(None, None, None))]
                 return getitem
@@ -403,7 +361,7 @@ def forward(self, x: TensorType([4, 4])):
         transformed = transform_all_constraints(symbolic_traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         get_item_res = z3.Const(2, tensor_type)
         assert s.model()[get_item_res].arg(0).arg(1) == b.shape[0]
         assert s.model()[get_item_res].arg(1).arg(1) == b.shape[1]
@@ -416,7 +374,7 @@ def test_layer_norm(self):
 
         class BasicBlock(torch.nn.Module):
             def __init__(self):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.l = torch.nn.LayerNorm((1024,))
 
             def forward(self, x: Dyn):
@@ -429,7 +387,7 @@ def forward(self, x: Dyn):
 
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         # make the output a size 1 tensor which should result
         # in the migration of the input
@@ -472,9 +430,6 @@ def forward(self, x: Dyn):
     def test_layer_norm_functional(self):
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: Dyn):
                 return torch.nn.functional.layer_norm(x, (1024,))
 
@@ -485,7 +440,7 @@ def forward(self, x: Dyn):
 
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         # make the output a size 1 tensor which should result
         # in the migration of the input
@@ -502,9 +457,6 @@ def forward(self, x: Dyn):
     def test_ne_int_long_type_as(self):
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn, Dyn]), y: TensorType([Dyn, Dyn])):
                 ne_int = torch.ne(x, y).int()
                 type_as = ne_int.type_as(y)
@@ -515,7 +467,7 @@ def forward(self, x: TensorType([Dyn, Dyn]), y: TensorType([Dyn, Dyn])):
         transformed = transform_all_constraints(symbolic_traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         # migrate one of the parameters to a fully static shape so we can compare
 
@@ -527,7 +479,7 @@ def forward(self, x: TensorType([Dyn, Dyn]), y: TensorType([Dyn, Dyn])):
         s.add(input == tensor_type.tensor2(D(1, 2), D(1, 4)))
         s.add(input_2 == tensor_type.tensor2(D(1, s1), D(1, s2)))
 
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         actual_shape = BasicBlock().forward(torch.rand(2, 4), torch.rand(2, 4)).shape
         self.assertEqual(s.model()[output_long].arg(0).arg(1), actual_shape[0])
         self.assertEqual(s.model()[output_long].arg(1).arg(1), actual_shape[1])
@@ -539,9 +491,6 @@ def test_ne(self):
         d1, d2 = D(s11, s1), D(0, s2)
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: Dyn, y: Dyn):
                 return torch.ne(x, y)
 
@@ -552,7 +501,7 @@ def forward(self, x: Dyn, y: Dyn):
         transformed = transform_all_constraints(traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         # change the annotations
         for n in graph.nodes:
@@ -565,7 +514,7 @@ def forward(self, x: Dyn, y: Dyn):
         transformed = transform_all_constraints(traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         # force the second dimension to be Dyn
         # output should still be TensorType([2, 2])
@@ -580,9 +529,6 @@ def forward(self, x: Dyn, y: Dyn):
 
     def test_cumsum(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn, 4, 3])):
                 t = torch.cumsum(x, 3)
                 return t
@@ -634,9 +580,6 @@ def forward(self, x: TensorType([Dyn, 4, 3])):
 
     def test_cumsum_kwargs(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn, 4, 3])):
                 t = torch.cumsum(x, dim=3)
                 return t
@@ -662,9 +605,6 @@ def forward(self, x: TensorType([Dyn, 4, 3])):
 
     def test_arange(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([2, 4])):
                 size = x.size()
                 getitem = size[-1]
@@ -703,9 +643,6 @@ def forward(self, x: TensorType([2, 4])):
 
     def test_scalar_add(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([2, 4])):
                 size = x.size()
                 getitem = size[-1]
@@ -726,9 +663,6 @@ def forward(self, x: TensorType([2, 4])):
 
     def test_regular_add_2(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([2, 4])):
                 to = x.to()
                 size = to.size()
@@ -749,9 +683,6 @@ def forward(self, x: TensorType([2, 4])):
 
     def test_regular_add_3(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([2, 4])):
                 to = x.to()
                 size = to.size()
@@ -772,7 +703,7 @@ def forward(self, x: TensorType([2, 4])):
     def test_embedding(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.embedding = torch.nn.Embedding(256008, 1024, padding_idx=1)
 
             def forward(self, x: TensorType([2, 4])):
@@ -786,7 +717,7 @@ def forward(self, x: TensorType([2, 4])):
         transformed = transform_all_constraints(traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         embedding_result = z3.Const(2, tensor_type)
 
         assert s.model()[embedding_result].arg(0).arg(1) == B[0]
@@ -801,7 +732,7 @@ def forward(self, x: TensorType([2, 4])):
         transformed = transform_all_constraints(traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         assert s.model()[embedding_result].arg(0).arg(0) == 0
         assert s.model()[embedding_result].arg(1).arg(0) == 0
         assert s.model()[embedding_result].arg(2).arg(1) == B[2]
@@ -815,14 +746,11 @@ def forward(self, x: TensorType([2, 4])):
         s = z3.Solver()
         s.add(transformed)
 
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
 
     def test_embedding_2(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([2, 4]), y: TensorType([Dyn, 1024])):
                 return torch.nn.functional.embedding(x, y)
 
@@ -833,7 +761,7 @@ def forward(self, x: TensorType([2, 4]), y: TensorType([Dyn, 1024])):
         transformed = transform_all_constraints(traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         embedding_result = z3.Const(5, tensor_type)
 
         assert s.model()[embedding_result].arg(0).arg(1) == B[0]
@@ -842,9 +770,6 @@ def forward(self, x: TensorType([2, 4]), y: TensorType([Dyn, 1024])):
 
     def test_size_two_args(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn, 2, Dyn])):
                 size = x.size(-1)
                 return size
@@ -874,9 +799,6 @@ def forward(self, x: TensorType([Dyn, 2, Dyn])):
 
     def test_size_getitem(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: Dyn):
                 size = x.size()
                 getitem = size[-1]
@@ -891,7 +813,7 @@ def forward(self, x: Dyn):
         s = z3.Solver()
         s.add(transformed)
 
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         # force the input to be of size 4
 
@@ -903,18 +825,15 @@ def forward(self, x: Dyn):
         s.add(input == tensor_type.tensor4(d1, d2, d3, d4))
 
         # check if the model is still SAT
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         s1, s2 = z3.Int(23), z3.Int(3)
 
         # check that the item is correct
-        self.assertEquals(s.model()[s1], s.model()[s2])
+        self.assertEqual(s.model()[s1], s.model()[s2])
 
         # invalid index but should still be SAT because input will be Dyn
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: Dyn):
                 size = x.size()
                 getitem = size[-10]
@@ -928,14 +847,14 @@ def forward(self, x: Dyn):
         s = z3.Solver()
         s.add(transformed)
 
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         s.add(input != z3_dyn)
         self.assertEqual(s.check(), z3.unsat)
 
     def test_view_mul(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.embed_tokens = torch.nn.Embedding(256008, 1024, padding_idx=1)
 
             def forward(self, x: TensorType([2, 4])):
@@ -958,7 +877,7 @@ def forward(self, x: TensorType([2, 4])):
         transformed = transform_all_constraints(traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         # print(s.model())
 
         embedding_result = z3.Const(6, tensor_type)
@@ -974,9 +893,6 @@ def forward(self, x: TensorType([2, 4])):
 
     def test_gt(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn, 4])):
                 size = x.size()
                 getitem_1 = size[-1]
@@ -990,15 +906,12 @@ def forward(self, x: TensorType([Dyn, 4])):
         transformed = transform_all_constraints(traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         res = z3.Bool(4)
         self.assertEqual(s.model()[res], True)
 
     def test_view(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([2, 4])):
                 view = x.view(-1, 8)
                 return view
@@ -1010,13 +923,10 @@ def forward(self, x: TensorType([2, 4])):
         transformed = transform_all_constraints(traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
     def test_lt_tensor(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([2, 4]), y: Dyn):
                 lt = x > y
                 return lt
@@ -1028,7 +938,7 @@ def forward(self, x: TensorType([2, 4]), y: Dyn):
         transformed = transform_all_constraints(traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
 
     def test_conditional_wrong_assumption(self):
@@ -1036,9 +946,6 @@ def test_conditional_wrong_assumption(self):
         Test condition after making the wrong assumption about the input
         """
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: Dyn):
                 gt = x > 1
                 return gt
@@ -1067,7 +974,7 @@ def test_conditional(self):
         """
         class BasicBlock(torch.nn.Module):
             def __init__(self):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.embed_tokens = torch.nn.Embedding(256008, 1024, padding_idx=1)
 
             def forward(self, x: TensorType([Dyn, 4])):
@@ -1127,7 +1034,7 @@ def test_conditional_2(self):
         """
         class BasicBlock(torch.nn.Module):
             def __init__(self):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.embed_tokens = torch.nn.Embedding(256008, 1024, padding_idx=1)
 
             def forward(self, x: TensorType([Dyn, 4])):
@@ -1157,9 +1064,6 @@ class ComposeOperationsGradualTypes(unittest.TestCase):
 
     def test_masked_fill(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([2, 4])):
                 size = x.size()
                 getitem = size[-1]
@@ -1203,9 +1107,6 @@ def forward(self, x: TensorType([2, 4])):
 
     def test_add_reshape_1(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: Dyn, y: Dyn):
                 return torch.add(torch.reshape(x, (1, 2)), torch.reshape(y, (2, 2)))
 
@@ -1217,13 +1118,10 @@ def forward(self, x: Dyn, y: Dyn):
 
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
     def test_add_reshape_2(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: Dyn, y: Dyn):
                 return torch.add(torch.reshape(x, (-1, 2)), torch.reshape(y, (2, 2, 2)))
 
@@ -1234,12 +1132,12 @@ def forward(self, x: Dyn, y: Dyn):
         transformed = transform_all_constraints(traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
     def test_conv_reshape_add_0(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes,
                                              kernel_size=kernel_size, stride=stride,
                                              padding=padding, groups=groups, bias=False, dilation=dilation)
@@ -1254,13 +1152,13 @@ def forward(self, x: Dyn, y: Dyn):
         new_transformed_c = transform_all_constraints(traced)
         solver = z3.Solver()
         solver.add(new_transformed_c)
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
 
 
     def test_conv_reshape_add_0_2(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes,
                                              kernel_size=kernel_size, stride=stride,
                                              padding=padding, groups=groups, bias=False, dilation=dilation)
@@ -1279,7 +1177,7 @@ def forward(self, x: Dyn, y: TensorType([4, 1])):
         new_transformed_c = transform_all_constraints(traced)
         solver = z3.Solver()
         solver.add(new_transformed_c)
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
 
 
         conv_result = z3.Const(4, tensor_type)
@@ -1299,9 +1197,9 @@ def forward(self, x: Dyn, y: TensorType([4, 1])):
         assert solver.model()[s4].as_long() == res[3]
 
         solver.add(input_2 == tensor_type.tensor2(D(1, 4), D(1, 1)))
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
         solver.add(add_result == tensor_type.tensor4(d1, d2, d3, d4))
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
 
         # first dimension could be anything because we have broadcasting
         assert solver.model()[s1] == res[0]
@@ -1312,7 +1210,7 @@ def forward(self, x: Dyn, y: TensorType([4, 1])):
     def test_conv_reshape_add_0_3(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes,
                                              kernel_size=kernel_size, stride=stride,
                                              padding=padding, groups=groups, bias=False, dilation=dilation)
@@ -1327,13 +1225,13 @@ def forward(self, x: Dyn, y: TensorType([11, 1])):
         new_transformed_c = transform_all_constraints(traced)
         solver = z3.Solver()
         solver.add(new_transformed_c)
-        self.assertEquals(solver.check(), z3.unsat)
+        self.assertEqual(solver.check(), z3.unsat)
 
 
     def test_conv_reshape_add_1(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes,
                                              kernel_size=kernel_size, stride=stride,
                                              padding=padding, groups=groups, bias=False, dilation=dilation)
@@ -1348,7 +1246,7 @@ def forward(self, x: Dyn, y: TensorType([1, 2, 10, 20])):
         new_transformed_c = transform_all_constraints(traced)
         solver = z3.Solver()
         solver.add(new_transformed_c)
-        self.assertEquals(solver.check(), z3.unsat)
+        self.assertEqual(solver.check(), z3.unsat)
 
 
 class GradualTypes(unittest.TestCase):
@@ -1356,7 +1254,7 @@ def test_conv_reshape_unsat(self):
 
         class BasicBlock(torch.nn.Module):
             def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes,
                                              kernel_size=kernel_size, stride=stride,
                                              padding=padding, groups=groups, bias=False, dilation=dilation)
@@ -1371,12 +1269,12 @@ def forward(self, x: Dyn):
         new_transformed_c = transform_all_constraints(traced)
         solver = z3.Solver()
         solver.add(new_transformed_c)
-        self.assertEquals(solver.check(), z3.unsat)
+        self.assertEqual(solver.check(), z3.unsat)
 
     def test_conv_reshape0(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes,
                                              kernel_size=kernel_size, stride=stride,
                                              padding=padding, groups=groups, bias=False, dilation=dilation)
@@ -1393,7 +1291,7 @@ def forward(self, x: Dyn):
 
         solver = z3.Solver()
         solver.add(new_transformed_c)
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
         conv_result = z3.Const(3, tensor_type)
 
         s1, s2, s3, s4 = z3.Ints('x1 x2 x3 x4')
@@ -1429,7 +1327,7 @@ def forward(self, x: Dyn):
     def test_conv_reshape1(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes,
                                              kernel_size=kernel_size, stride=stride,
                                              padding=padding, groups=groups, bias=False, dilation=dilation)
@@ -1446,7 +1344,7 @@ def forward(self, x: TensorType([20, 20])):
 
         solver = z3.Solver()
         solver.add(new_transformed_c)
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
         conv_result = z3.Const(3, tensor_type)
 
         s1, s2, s3, s4 = z3.Ints('x1 x2 x3 x4')
@@ -1467,7 +1365,7 @@ class TestSingleOperation(unittest.TestCase):
     def test_conv_wrong_example(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(in_channels=2, out_channels=2,
                                              kernel_size=2, stride=2,
                                              padding=2, groups=2, bias=False, dilation=2)
@@ -1515,7 +1413,7 @@ def test_conv_dyn(self):
 
         class BasicBlock(torch.nn.Module):
             def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes,
                                              kernel_size=kernel_size, stride=stride,
                                              padding=padding, groups=groups, bias=False, dilation=dilation)
@@ -1550,13 +1448,13 @@ def forward(self, x: Dyn):
         assert solver3.model()[s22].as_long() == 0
 
         solver3.add(s22 != 0)
-        self.assertEquals(solver3.check(), z3.unsat)
+        self.assertEqual(solver3.check(), z3.unsat)
 
         solver2 = z3.Solver()
         solver2.add(transformed)
         assert solver2.check() == z3.sat
         solver2.add(x == tensor_type.tensor3(d1, d2, d3))
-        self.assertEquals(solver2.check(), z3.unsat)
+        self.assertEqual(solver2.check(), z3.unsat)
 
 
     def test_add(self):
@@ -1565,9 +1463,6 @@ def test_add(self):
         d1, d2, d3, d4 = D(s11, s1), D(s22, s2), D(s33, s3), D(s44, s4),
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: Dyn, y: Dyn):
                 return torch.add(x, y)
 
@@ -1579,25 +1474,22 @@ def forward(self, x: Dyn, y: Dyn):
 
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         # make the tensor be of size 1
         x = z3.Const(1, tensor_type)
         s.add(x == tensor_type.tensor1(D(1, s11)))
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         y = z3.Const(2, tensor_type)
         s.add(y == tensor_type.tensor1(D(1, s22)))
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         s.add(s11 == 1)  # tensor[1]
         s.add(s22 == 2)  # tensor[2]
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         class BasicBlock2(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock2, self).__init__()
-
             def forward(self, x: TensorType((Dyn,)), y: Dyn):
                 return torch.add(x, y)
 
@@ -1608,22 +1500,19 @@ def forward(self, x: TensorType((Dyn,)), y: Dyn):
         transformed = transform_all_constraints(traced)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         # make the tensor be of size 1
         x = z3.Const(1, tensor_type)
         s.add(x == tensor_type.tensor1(D(1, s11)))
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         y = z3.Const(2, tensor_type)
         s.add(y == tensor_type.tensor1(D(1, s22)))
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         s.add(s11 == 4)  # tensor[4]
         s.add(s22 == 5)  # tensor[5]
-        self.assertEquals(s.check(), z3.unsat)
+        self.assertEqual(s.check(), z3.unsat)
 
         class BasicBlock3(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock3, self).__init__()
-
             def forward(self, x: TensorType((Dyn,)), y: Dyn):
                 return torch.add(x, y)
 
@@ -1636,15 +1525,12 @@ def forward(self, x: TensorType((Dyn,)), y: Dyn):
         s.add(transformed)
         x = z3.Const(1, tensor_type)
         s.add(x == tensor_type.tensor2(d1, d2))
-        self.assertEquals(s.check(), z3.unsat)
+        self.assertEqual(s.check(), z3.unsat)
 
     def test_add_padding(self):
         s1, s2, s3, s4 = z3.Ints('s1 s2 s3 s4')
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType((Dyn,)), y: TensorType((Dyn, Dyn))):
                 return torch.add(x, y)
 
@@ -1656,12 +1542,12 @@ def forward(self, x: TensorType((Dyn,)), y: TensorType((Dyn, Dyn))):
 
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         x = z3.Const(1, tensor_type)
         s.add(x == tensor_type.tensor1(D(1, s1)))
 
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         # print(s.model())
 
@@ -1669,9 +1555,6 @@ def test_add_padding_2(self):
         s1, s2, s3, s4 = z3.Ints('s1 s2 s3 s4')
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn, Dyn]), y: TensorType([Dyn])):
                 return torch.add(x, y)
 
@@ -1683,16 +1566,16 @@ def forward(self, x: TensorType([Dyn, Dyn]), y: TensorType([Dyn])):
 
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         # print(s.model())
 
         x = z3.Const(1, tensor_type)
         s.add(x == tensor_type.tensor2(D(1, s1), D(1, s2)))
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         y = z3.Const(2, tensor_type)
         s.add(y == tensor_type.tensor1(D(0, s3)))
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         add_result = z3.Const(3, tensor_type)
         broadcast_res1, broadcast_res2 = z3.Const(4, tensor_type), z3.Const(5, tensor_type)
@@ -1720,9 +1603,6 @@ def test_add_padding_3(self):
         s1, s2, s3, s4 = z3.Ints('s1 s2 s3 s4')
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn, 1]), y: TensorType([Dyn])):
                 return torch.add(x, y)
 
@@ -1735,7 +1615,7 @@ def forward(self, x: TensorType([Dyn, 1]), y: TensorType([Dyn])):
         s = z3.Solver()
         s.add(transformed)
         # print(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         x = z3.Const(1, tensor_type)
         y = z3.Const(2, tensor_type)
@@ -1744,7 +1624,7 @@ def forward(self, x: TensorType([Dyn, 1]), y: TensorType([Dyn])):
         s.add(x == tensor_type.tensor2(D(0, s1), D(s2, 1)))
         s.add(y == tensor_type.tensor1(D(0, s3)))
 
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         # print(s.model())
 
@@ -1755,9 +1635,6 @@ def forward(self, x: TensorType([Dyn, 1]), y: TensorType([Dyn])):
 
     def test_add_padding_4(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([2, 1]), y: TensorType([3])):
                 return torch.add(x, y)
 
@@ -1770,16 +1647,13 @@ def forward(self, x: TensorType([2, 1]), y: TensorType([3])):
         s = z3.Solver()
         s.add(transformed)
 
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         add_result = z3.Const(3, tensor_type)
         assert s.model()[add_result] == tensor_type.tensor2(D(1, 2), D(1, 3))
 
     def test_add_padding_5(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([2, 2]), y: TensorType([3])):
                 return torch.add(x, y)
 
@@ -1791,14 +1665,11 @@ def forward(self, x: TensorType([2, 2]), y: TensorType([3])):
 
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.unsat)
+        self.assertEqual(s.check(), z3.unsat)
 
     def test_add_size_3(self):
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn, Dyn, Dyn]), y: TensorType([Dyn, Dyn, Dyn])):
                 return torch.add(x, y)
 
@@ -1810,7 +1681,7 @@ def forward(self, x: TensorType([Dyn, Dyn, Dyn]), y: TensorType([Dyn, Dyn, Dyn])
 
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         x = z3.Const(1, tensor_type)
         y = z3.Const(2, tensor_type)
@@ -1820,18 +1691,15 @@ def forward(self, x: TensorType([Dyn, Dyn, Dyn]), y: TensorType([Dyn, Dyn, Dyn])
         s.add(x == tensor_type.tensor3(D(1, s1), D(1, 1), D(1, s2)))
         s.add(y == tensor_type.tensor3(D(1, s3), D(1, s4), D(1, s5)))
 
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         s.add(s2 == 5)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         s.add(s5 == 6)
-        self.assertEquals(s.check(), z3.unsat)
+        self.assertEqual(s.check(), z3.unsat)
 
     def test_add_padding_6(self):
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn]), y: TensorType([Dyn, Dyn, Dyn])):
                 return torch.add(x, y)
 
@@ -1842,7 +1710,7 @@ def forward(self, x: TensorType([Dyn]), y: TensorType([Dyn, Dyn, Dyn])):
         transformed = transform_all_constraints(traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         x = z3.Const(1, tensor_type)
         y = z3.Const(2, tensor_type)
@@ -1852,19 +1720,16 @@ def forward(self, x: TensorType([Dyn]), y: TensorType([Dyn, Dyn, Dyn])):
         s.add(x == tensor_type.tensor1(D(1, s1)))
         s.add(y == tensor_type.tensor3(D(1, s2), D(1, s3), D(1, s4)))
 
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         s.add(s1 == 4)
         s.add(s4 == 5)
 
-        self.assertEquals(s.check(), z3.unsat)
+        self.assertEqual(s.check(), z3.unsat)
 
     def test_add_padding_7(self):
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn]), y: TensorType([Dyn, Dyn, Dyn, Dyn])):
                 return torch.add(x, y)
 
@@ -1875,19 +1740,16 @@ def forward(self, x: TensorType([Dyn]), y: TensorType([Dyn, Dyn, Dyn, Dyn])):
         transformed = transform_all_constraints(traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         x = z3.Const(1, tensor_type)
         s1, s2, s3, s4, s5 = z3.Ints('s1 s2 s3 s4 s5')
         s.add(x == tensor_type.tensor2(D(s1, s2), D(s2, s3)))
-        self.assertEquals(s.check(), z3.unsat)
+        self.assertEqual(s.check(), z3.unsat)
 
 
     def test_add_padding_8(self):
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn]), y: TensorType([Dyn, Dyn, Dyn, Dyn])):
                 return torch.add(x, y)
 
@@ -1898,7 +1760,7 @@ def forward(self, x: TensorType([Dyn]), y: TensorType([Dyn, Dyn, Dyn, Dyn])):
         transformed = transform_all_constraints(traced, counter=0)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         x = z3.Const(1, tensor_type)
         y = z3.Const(2, tensor_type)
 
@@ -1906,17 +1768,14 @@ def forward(self, x: TensorType([Dyn]), y: TensorType([Dyn, Dyn, Dyn, Dyn])):
         s.add(x == tensor_type.tensor1(D(s1, 1)))
         s.add(s1 >= 0)
 
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         s.add(y == tensor_type.tensor4(D(0, s2), D(0, s3), D(0, s4), D(0, s5)))
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
     def test_add_padding_9(self):
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: Dyn, y: TensorType([Dyn, Dyn, Dyn, Dyn])):
                 return torch.add(x, y)
 
@@ -1928,21 +1787,21 @@ def forward(self, x: Dyn, y: TensorType([Dyn, Dyn, Dyn, Dyn])):
         s = z3.Solver()
         s.add(transformed)
 
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         x = z3.Const(1, tensor_type)
         y = z3.Const(2, tensor_type)
 
         s1, s2, s3, s4, s5, s6, s7 = z3.Ints('s1 s2 s3 s4 s5 s6 s7')
         s.add(x == tensor_type.tensor1(D(s1, s7)))
         s.add(s1 == 1)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         s.add(y == tensor_type.tensor4(D(0, s2), D(0, s3), D(0, s4), D(s6, s5)))
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
 
         s.add(s6 == 1)
 
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         s.add(s5 != 1, s7 != 1)
         assert s.check()
 
@@ -1958,7 +1817,7 @@ def test_conv_static(self):
 
         class BasicBlock(torch.nn.Module):
             def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes,
                                              kernel_size=kernel_size, stride=stride,
                                              padding=padding, dilation=dilation)
@@ -1976,14 +1835,14 @@ def forward(self, x: TensorType((1, 2, 10, 20))):
         new_transformed_c = transform_all_constraints(traced)
         solver = z3.Solver()
         solver.add(new_transformed_c)
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
 
         x = z3.Const(1, tensor_type)
         y = z3.Const(2, tensor_type)
 
         solver.add(x == tensor_type.tensor4(d1, d2, d3, d4))
         solver.add(y == tensor_type.tensor4(b1, b2, b3, b4))
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
         # print(solver.model())
         assert solver.model()[e3].as_long() == res[2]
         assert solver.model()[e4].as_long() == res[3]
@@ -2000,7 +1859,7 @@ def forward(self, x: TensorType((1, 2, 10, 20))):
         solver.add(x == tensor_type.tensor4(d1, d2, d3, d4))
         solver.add(y == tensor_type.tensor4(b1, b2, b3, b4))
 
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
         assert solver.model()[e3].as_long() == res2[2]
         assert solver.model()[e4].as_long() == res2[3]
 
@@ -2008,9 +1867,6 @@ def test_reshape_dyn(self):
         s11, s22, s33, s44 = z3.Ints('s11 s22 s33 s44')
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: Dyn):
                 return torch.reshape(x, (2, -1))
 
@@ -2021,14 +1877,14 @@ def forward(self, x: Dyn):
         transformed = transform_all_constraints(traced)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         x = z3.Const(1, tensor_type)
         s.add(x == tensor_type.tensor1(D(1, s11)))
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         s.add(z3.Or([s11 == 2, s11 == 4, s11 == 9]))
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         s.add(s11 == 9)
-        self.assertEquals(s.check(), z3.unsat)
+        self.assertEqual(s.check(), z3.unsat)
 
 
     def test_reshape_annotated(self):
@@ -2037,9 +1893,6 @@ def test_reshape_annotated(self):
         d1, d2, d3, d4 = D(s11, s1), D(s22, s2), D(s33, s3), D(s44, s4),
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn])):
                 return torch.reshape(x, (2, -1))
 
@@ -2049,18 +1902,15 @@ def forward(self, x: TensorType([Dyn])):
         transformed = transform_all_constraints(traced)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         x = z3.Const(1, tensor_type)
         s.add(x == tensor_type.tensor2(d1, d2))
-        self.assertEquals(s.check(), z3.unsat)
+        self.assertEqual(s.check(), z3.unsat)
 
     def test_reshape_static_target(self):
         s11, s22, s33, s44 = z3.Ints('s11 s22 s33 s44')
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: TensorType([Dyn])):
                 return torch.reshape(x, (2, 3))
 
@@ -2071,21 +1921,18 @@ def forward(self, x: TensorType([Dyn])):
         # print(transformed)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         x = z3.Const(1, tensor_type)
         s.add(x == tensor_type.tensor1(D(1, s11)))
         s.check()
         assert s.model()[s11].as_long() == 6
         s.add(s11 != 6)
-        self.assertEquals(s.check(), z3.unsat)
+        self.assertEqual(s.check(), z3.unsat)
 
     def test_reshape_static_target2(self):
         s11, s22, s33, s44 = z3.Ints('s11 s22 s33 s44')
 
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: Dyn):
                 return torch.reshape(x, (2, 3, 1, 1))
 
@@ -2095,19 +1942,19 @@ def forward(self, x: Dyn):
         transformed = transform_all_constraints(traced)
         s = z3.Solver()
         s.add(transformed)
-        self.assertEquals(s.check(), z3.sat)
+        self.assertEqual(s.check(), z3.sat)
         x = z3.Const(1, tensor_type)
         s.add(x == tensor_type.tensor1(D(1, s11)))
         s.check()
         assert s.model()[s11].as_long() == 6
         s.add(s11 != 6)
-        self.assertEquals(s.check(), z3.unsat)
+        self.assertEqual(s.check(), z3.unsat)
 
 
     def test_conv2D_maxpool2d_flatten(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self):
-                super(BasicBlock, self).__init__()
+                super().__init__()
 
                 self.conv1 = torch.nn.Conv2d(3, 6, 5)
                 self.pool = torch.nn.MaxPool2d(2, 2)
@@ -2144,7 +1991,7 @@ def forward(self, x : TensorType((4, 3, 32, 32))):
     def test_conv2D_maxpool2d_flatten_unsat(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self):
-                super(BasicBlock, self).__init__()
+                super().__init__()
 
                 self.conv1 = torch.nn.Conv2d(3, 6, 5)
                 self.pool = torch.nn.MaxPool2d(2, 2)
@@ -2172,12 +2019,12 @@ def forward(self, x : TensorType((4, 3, 32, 32))):
         solver.check()
         input = z3.Const(1, tensor_type)
         solver.add(input == tensor_type.tensor4(D(1, 4), D(1, 3), D(1, 32), D(1, 45)))
-        self.assertEquals(solver.check(), z3.unsat)
+        self.assertEqual(solver.check(), z3.unsat)
 
     def test_conv2D_maxpool2d_flatten_dyn(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self):
-                super(BasicBlock, self).__init__()
+                super().__init__()
 
                 self.conv1 = torch.nn.Conv2d(3, 6, 5)
                 self.pool = torch.nn.MaxPool2d(2, 2)
@@ -2202,7 +2049,7 @@ def forward(self, x : TensorType((Dyn, 3, 32, 32))):
         constraints = transform_all_constraints(traced, counter=0)
         solver = z3.Solver()
         solver.add(constraints)
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
 
     def test_type_check_flatten(self):
         s1, s2, s3, s4 = z3.Ints('s1 s2 s3 s4')
@@ -2216,7 +2063,7 @@ def forward(self, x: TensorType([2, 3, 4, 5])):
         constraints = transform_all_constraints(symbolic_traced, counter=0)
         solver = z3.Solver()
         solver.add(constraints)
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
         flatten = z3.Const(2, tensor_type)
 
         res = M().forward(torch.rand(2, 3, 4, 5)).size()
@@ -2232,12 +2079,12 @@ def forward(self, x: TensorType([2, 3, Dyn, 5])):
         constraints = transform_all_constraints(symbolic_traced, counter=0)
         solver = z3.Solver()
         solver.add(constraints)
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
         x = z3.Const(1, tensor_type)
         y = z3.Const(2, tensor_type)
 
         solver.add(x == tensor_type.tensor4(D(1, 2), D(1, 3), D(0, s1), D(1, 5)))
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
         assert solver.model()[y].arg(1).arg(0) == 0
 
 
@@ -2251,15 +2098,12 @@ def forward(self, x: TensorType([2, 3, Dyn])):
         constraints = transform_all_constraints(symbolic_traced, counter=0)
         solver = z3.Solver()
         solver.add(constraints)
-        self.assertEquals(solver.check(), z3.unsat)
+        self.assertEqual(solver.check(), z3.unsat)
 
 class ConstraintGeneration(unittest.TestCase):
 
     def test_add_reshape(self):
         class BasicBlock(torch.nn.Module):
-            def __init__(self):
-                super(BasicBlock, self).__init__()
-
             def forward(self, x: Dyn, y: Dyn):
                 return torch.add(torch.reshape(x, (1, 2)), torch.reshape(y, (2, 2)))
 
@@ -2275,7 +2119,7 @@ def forward(self, x: Dyn, y: Dyn):
     def test_conv_reshape_add(self):
         class BasicBlock(torch.nn.Module):
             def __init__(self, in_planes, out_planes, kernel_size, stride, padding, groups, dilation):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(in_channels=in_planes, out_channels=out_planes,
                                              kernel_size=kernel_size, stride=stride,
                                              padding=padding, groups=groups, bias=False, dilation=dilation)
@@ -2338,7 +2182,7 @@ def test_resnet50_unsat(self):
         input = z3.Const(1, tensor_type)
         # input with 3 dimensions
         solver.add(input == tensor_type.tensor3(D(1, 1), D(1, 3), D(1, 224)))
-        self.assertEquals(solver.check(), z3.unsat)
+        self.assertEqual(solver.check(), z3.unsat)
 
 
 
@@ -2352,12 +2196,12 @@ def test_resnet50(self):
         constraints = transform_all_constraints(traced, counter=0)
         solver = z3.Solver()
         solver.add(constraints)
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
         linear = z3.Const(650, tensor_type)
 
         input = z3.Const(1, tensor_type)
         solver.add(input == tensor_type.tensor4(D(1, 1), D(1, 3), D(1, 224), D(1, 224)))
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
         assert solver.model()[linear] == tensor_type.tensor2(D(1, res[0]), D(1, res[1]))
 
     def test_resnet502(self):
@@ -2389,9 +2233,9 @@ def test_resnet503(self):
         batch, d1, d2 = z3.Ints('b d1 d2')
         solver.add(input == tensor_type.tensor4(D(1, batch), D(1, 3), D(1, 224), D(1, 224)))
         solver.add(linear == tensor_type.tensor2(D(1, d1), D(1, d2)))
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
         solver.add(batch != d1)
-        self.assertEquals(solver.check(), z3.unsat)
+        self.assertEqual(solver.check(), z3.unsat)
 
 @skipIfNoTorchVision
 class TestAlexNet(unittest.TestCase):
@@ -2409,11 +2253,11 @@ def test_alexnet1(self):
         constraints = transform_all_constraints(symbolic_traced, counter=0)
         solver = z3.Solver()
         solver.add(constraints)
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
         input = z3.Const(1, tensor_type)
         conv = z3.Const(2, tensor_type)
         solver.add(input == tensor_type.tensor4(D(1, 10), D(1, 3), D(1, 227), D(1, 227)))
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
         assert solver.model()[conv] == tensor_type.tensor4(D(1, 10), D(1, 64), D(1, 56), D(1, 56))
 
         relu = z3.Const(7, tensor_type)
@@ -2446,7 +2290,7 @@ def test_alexnet2(self):
         constraints = transform_all_constraints(symbolic_traced, counter=0)
         solver = z3.Solver()
         solver.add(constraints)
-        self.assertEquals(solver.check(), z3.unsat)
+        self.assertEqual(solver.check(), z3.unsat)
 
     def test_alexnet3(self):
         alexnet = models.alexnet()
@@ -2459,7 +2303,7 @@ def test_alexnet3(self):
         constraints = transform_all_constraints(symbolic_traced, counter=0)
         solver = z3.Solver()
         solver.add(constraints)
-        self.assertEquals(solver.check(), z3.sat)
+        self.assertEqual(solver.check(), z3.sat)
 
     def test_alexnet4(self):
         alexnet = models.alexnet()
@@ -2472,7 +2316,7 @@ def test_alexnet4(self):
         constraints = transform_all_constraints(symbolic_traced, counter=0)
         solver = z3.Solver()
         solver.add(constraints)
-        self.assertEquals(solver.check(), z3.unsat)
+        self.assertEqual(solver.check(), z3.unsat)
 
 
 
diff --git a/test/inductor/minifier_smoke.py b/test/inductor/minifier_smoke.py
new file mode 100644
index 000000000000..569a09b23a31
--- /dev/null
+++ b/test/inductor/minifier_smoke.py
@@ -0,0 +1,58 @@
+# Owner(s): ["module: inductor"]
+os.environ["TORCHDYNAMO_REPRO_AFTER"] = "dynamo"
+import torch
+import torch._dynamo as torchdynamo
+import torch._inductor.lowering
+import torch._ops
+
+
+def func(x):
+    x = torch.sigmoid(x)
+    x = torch.mul(x, torch.ones(2))
+    x = torch.add(x, torch.zeros(2))
+    x = torch.ops.aten.round(x)
+    return x
+
+
+error_injection_str = """
+import torch._inductor.lowering
+
+def inject_error():
+    def throw(x):
+        assert False
+    # inject an error in the lowerings
+    for x in list(torch._inductor.lowering.lowerings.keys()):
+        if 'round' in x.__name__:
+            torch._inductor.lowering.lowerings[x] = throw
+
+inject_error()
+"""
+
+exec(error_injection_str)
+
+
+def patch_launcher():
+    minifier_launcher_path = torchdynamo.debug_utils.get_minifier_repro_path()
+    with open(minifier_launcher_path, "r") as f:
+        code = f.read()
+        code = code.replace(
+            torchdynamo.debug_utils.TEST_REPLACEABLE_COMMENT, error_injection_str
+        )
+
+    with open(minifier_launcher_path, "w") as f:
+        f.write(code)
+
+    return code
+
+
+def run_internal_minifier():
+    torchdynamo.config.debug_dir_root = "."
+    try:
+        f_opt = torch.compile(func)
+        f_opt(torch.ones(2))
+    except Exception as e:
+        patch_launcher()
+        raise e
+
+
+run_internal_minifier()
diff --git a/test/inductor/test_config.py b/test/inductor/test_config.py
new file mode 100644
index 000000000000..728b696f8f6c
--- /dev/null
+++ b/test/inductor/test_config.py
@@ -0,0 +1,182 @@
+# Owner(s): ["module: inductor"]
+import logging
+import math
+import unittest
+
+import torch
+
+import torch._dynamo.config as dynamo_config
+from torch._dynamo.test_case import run_tests, TestCase
+
+from torch._inductor import config
+from torch.testing._internal.inductor_utils import HAS_CPU
+
+
+def dummy_fn(x):
+    return torch.sigmoid(x + math.pi) / 10.0
+
+
+class TestInductorConfig(TestCase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls._saved_config = config.save_config()
+
+    def tearDown(self):
+        super().tearDown()
+        config.load_config(self._saved_config)
+
+    def test_set(self):
+        config.max_fusion_size = 13337
+        self.assertEqual(config.max_fusion_size, 13337)
+        self.assertEqual(config.to_dict()["max_fusion_size"], 13337)
+        config.to_dict()["max_fusion_size"] = 32
+        self.assertEqual(config.max_fusion_size, 32)
+
+        # a nested config
+        prior = config.triton.cudagraphs
+        config.triton.cudagraphs = not prior
+        self.assertEqual(config.triton.cudagraphs, not prior)
+        self.assertEqual(config.to_dict()["triton.cudagraphs"], not prior)
+
+    def test_save_load(self):
+        config.max_fusion_size = 123
+        config.triton.cudagraphs = True
+        saved1 = config.save_config()
+        config.max_fusion_size = 321
+        config.triton.cudagraphs = False
+        saved2 = config.save_config()
+
+        self.assertEqual(config.max_fusion_size, 321)
+        self.assertEqual(config.triton.cudagraphs, False)
+        config.load_config(saved1)
+        self.assertEqual(config.max_fusion_size, 123)
+        self.assertEqual(config.triton.cudagraphs, True)
+        config.load_config(saved2)
+        self.assertEqual(config.max_fusion_size, 321)
+        self.assertEqual(config.triton.cudagraphs, False)
+
+    def test_hasattr(self):
+        self.assertTrue(hasattr(config, "max_fusion_size"))
+        self.assertFalse(hasattr(config, "missing_name"))
+
+    def test_invalid_names(self):
+        self.assertRaises(AttributeError, lambda: config.does_not_exist)
+        self.assertRaises(AttributeError, lambda: config.triton.does_not_exist)
+
+        def store1():
+            config.does_not_exist = True
+
+        def store2():
+            config.triton.does_not_exist = True
+
+        self.assertRaises(AttributeError, store1)
+        self.assertRaises(AttributeError, store2)
+
+    def test_patch(self):
+        with config.patch(max_fusion_size=456):
+            self.assertEqual(config.max_fusion_size, 456)
+            with config.patch(max_fusion_size=789):
+                self.assertEqual(config.max_fusion_size, 789)
+            self.assertEqual(config.max_fusion_size, 456)
+
+        with config.patch({"cpp.threads": 9000, "max_fusion_size": 9001}):
+            self.assertEqual(config.cpp.threads, 9000)
+            self.assertEqual(config.max_fusion_size, 9001)
+            with config.patch("cpp.threads", 8999):
+                self.assertEqual(config.cpp.threads, 8999)
+            self.assertEqual(config.cpp.threads, 9000)
+
+    def test_log_level_property(self):
+        old = dynamo_config.log_level
+        try:
+            dynamo_config.log_level = logging.CRITICAL
+            self.assertEqual(logging.getLogger("torch._dynamo").level, logging.CRITICAL)
+        finally:
+            dynamo_config.log_level = old
+
+    @unittest.skipIf(not HAS_CPU, "requires C++ compiler")
+    def test_compile_api(self):
+        # these are mostly checking config processing doesn't blow up with exceptions
+        x = torch.randn(8)
+        y = dummy_fn(x)
+        checks = [
+            {},
+            {"mode": "default"},
+            {"mode": "reduce-overhead"},
+            {"mode": "max-autotune"},
+            {
+                "options": {
+                    "max-fusion-size": 128,
+                    "unroll_reductions_threshold": 32,
+                    "triton.cudagraphs": False,
+                }
+            },
+            {"dynamic": True},
+            {"fullgraph": True, "backend": "inductor"},
+            {"disable": True},
+        ]
+
+        for kwargs in checks:
+            torch._dynamo.reset()
+            opt_fn = torch.compile(dummy_fn, **kwargs)
+            torch.testing.assert_allclose(
+                opt_fn(x), y, msg=f"torch.compile(..., **{kwargs!r}) failed"
+            )
+
+    def test_compile_api_passes_config(self):
+        # ensure configs are actually passed down to inductor
+        self.assertRaises(
+            torch._dynamo.exc.BackendCompilerFailed,
+            lambda: torch.compile(dummy_fn, options={"_raise_error_for_testing": True})(
+                torch.randn(10)
+            ),
+        )
+
+    @torch._dynamo.config.patch(raise_on_backend_change=True)
+    def test_inductor_config_changes_warning(self):
+        import torch
+
+        @torch.compile
+        def a(x):
+            return x + 1
+
+        @torch.compile
+        def b(x):
+            return x + 2
+
+        @torch.compile(mode="max-autotune")
+        def c(x):
+            return x + 3
+
+        @torch.compile(mode="max-autotune")
+        def d(x):
+            return x + 4
+
+        # no warning same config
+        a(torch.randn(10))
+        b(torch.randn(10))
+        a(torch.randn(10))
+        b(torch.randn(10))
+
+        torch._dynamo.reset()
+        # no warning after reset
+        c(torch.randn(10))
+        c(torch.randn(10))
+        d(torch.randn(10))
+        d(torch.randn(10))
+
+        self.assertRaises(torch._dynamo.exc.ResetRequired, lambda: a(torch.randn(10)))
+
+        with torch._dynamo.config.patch(
+            raise_on_backend_change=False
+        ), self.assertWarns(Warning):
+            # normally it is just a warning
+            a(torch.randn(10))
+
+        # only warn once
+        a(torch.randn(10))
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/inductor/test_minifier.py b/test/inductor/test_minifier.py
index 2c3f8787f2c8..4799b0c588b2 100644
--- a/test/inductor/test_minifier.py
+++ b/test/inductor/test_minifier.py
@@ -7,7 +7,7 @@
 import torch._dynamo
 import torch._inductor.utils
 from torch._dynamo.test_minifier_common import MinifierTestBase
-from torch.testing._internal.common_utils import IS_MACOS
+from torch.testing._internal.common_utils import IS_JETSON, IS_MACOS
 
 _HAS_TRITON = torch._inductor.utils.has_triton()
 requires_cuda = functools.partial(unittest.skipIf, not _HAS_TRITON, "requires cuda")
@@ -99,11 +99,13 @@ def inner(x):
             (test_proc.returncode, repro_proc.returncode),
         )
 
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
     def test_after_aot_cpu_compile_error(self):
         (tb1, tb2), _ = self._test_after_aot("cpu", CPP_COMPILE_ERROR, 2)
         self.assertIn("CppCompileError", tb1)
         self.assertIn("CppCompileError", tb2)
 
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
     def test_after_aot_cpu_accuracy_error(self):
         (tb1, tb2), _ = self._test_after_aot("cpu", CPP_ACCURACY_ERROR, 4)
         self.assertIn("AccuracyError", tb1)
@@ -149,6 +151,7 @@ def inner(x):
         self.assertEqual(test_proc.returncode, repro_proc.returncode)
         self.assertNotEqual(test_proc.returncode, 0)
 
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
     def test_after_aot_cpu_runtime_error(self):
         self._test_after_aot_runtime_error("cpu", CPP_RUNTIME_ERROR)
 
@@ -181,12 +184,15 @@ def inner(x):
         self.assertEqual(proc.returncode, 0)
         self.assertIsNone(repro_dir)
 
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
     def test_after_aot_cpu_compile_backend_passes(self):
         self._test_after_aot_backend_passes("cpu", 2, CPP_COMPILE_ERROR)
 
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
     def test_after_aot_cpu_runtime_backend_passes(self):
         self._test_after_aot_backend_passes("cpu", 2, CPP_RUNTIME_ERROR)
 
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
     def test_after_aot_cpu_accuracy_backend_passes(self):
         self._test_after_aot_backend_passes("cpu", 4, CPP_ACCURACY_ERROR)
 
@@ -206,6 +212,7 @@ def test_after_aot_cuda_accuracy_backend_passes(self):
 
     # Test that inductor config can be saved and restored, especially class
     # variables.
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
     def test_inductor_config_serialization(self):
         run_code = textwrap.dedent(
             """\
@@ -248,11 +255,13 @@ def inner(x):
         )
         return (test_proc.stderr.decode("utf-8"), repro_proc.stderr.decode("utf-8"))
 
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
     def test_after_aot_with_modified_config_compile_error(self):
         tb1, tb2 = self._test_after_aot_with_modified_config(CPP_COMPILE_ERROR, 2)
         self.assertIn("CppCompileError", tb1)
         self.assertIn("CppCompileError", tb2)
 
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
     def test_after_aot_with_modified_config_accuracy_error(self):
         tb1, tb2 = self._test_after_aot_with_modified_config(CPP_ACCURACY_ERROR, 4)
         self.assertIn("AccuracyError", tb1)
@@ -287,21 +296,25 @@ def inner(x):
             (test_proc.returncode, repro_proc.returncode),
         )
 
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
     def test_torch_compile_after_dynamo_compile_error(self):
         (tb1, tb2), _ = self._test_torch_compile("dynamo", 2, CPP_COMPILE_ERROR)
         self.assertIn("CppCompileError", tb1)
         self.assertIn("CppCompileError", tb2)
 
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
     def test_torch_compile_after_dynamo_accuracy_error(self):
         (tb1, tb2), _ = self._test_torch_compile("dynamo", 4, CPP_ACCURACY_ERROR)
         self.assertIn("AccuracyError", tb1)
         self.assertIn("AccuracyError", tb2)
 
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
     def test_torch_compile_after_aot_compile_error(self):
         (tb1, tb2), _ = self._test_torch_compile("aot", 2, CPP_COMPILE_ERROR)
         self.assertIn("CppCompileError", tb1)
         self.assertIn("CppCompileError", tb2)
 
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
     def test_torch_compile_after_aot_accuracy_error(self):
         (tb1, tb2), _ = self._test_torch_compile("aot", 4, CPP_ACCURACY_ERROR)
         self.assertIn("AccuracyError", tb1)
diff --git a/test/inductor/test_pattern_matcher.py b/test/inductor/test_pattern_matcher.py
new file mode 100644
index 000000000000..7bba18e6bf8c
--- /dev/null
+++ b/test/inductor/test_pattern_matcher.py
@@ -0,0 +1,117 @@
+# Owner(s): ["module: inductor"]
+import torch
+from torch._dynamo.test_case import run_tests, TestCase
+from torch._dynamo.utils import counters
+from torch.testing._internal.common_utils import IS_LINUX
+from torch.testing._internal.inductor_utils import HAS_CUDA
+
+
+class TestPaternMatcher(TestCase):
+    def test_mm_plus_mm(self):
+        def fn(a, b, c, d):
+            return torch.add(torch.mm(a, b), torch.mm(c, d))
+
+        args = [
+            torch.randn(16, 16, device="cuda"),
+            torch.randn(16, 16, device="cuda"),
+            torch.randn(16, 16, device="cuda"),
+            torch.randn(16, 16, device="cuda"),
+        ]
+        expected = fn(*args)
+        actual = torch.compile(fn)(*args)
+        torch.testing.assert_close(actual, expected)
+        self.assertEqual(counters["inductor"]["pattern_matcher_count"], 1)
+        self.assertEqual(counters["inductor"]["pattern_matcher_nodes"], 3)
+
+    def test_addmm(self):
+        def fn(a, b, c):
+            return torch.add(a, torch.mm(b, c)), torch.mm(a, b) + c
+
+        args = [
+            torch.randn(16, 16, device="cuda"),
+            torch.randn(16, 16, device="cuda"),
+            torch.randn(16, 16, device="cuda"),
+        ]
+        e1, e2 = fn(*args)
+        a1, a2 = torch.compile(fn)(*args)
+        torch.testing.assert_close(a1, e1)
+        torch.testing.assert_close(a2, e2)
+        self.assertEqual(counters["inductor"]["pattern_matcher_count"], 2)
+        self.assertEqual(counters["inductor"]["pattern_matcher_nodes"], 4)
+
+    def test_cat_mm(self):
+        def fn(a, b, c):
+            return torch.cat(
+                [
+                    torch.mm(a, b),
+                    torch.mm(b, c),
+                    torch.mm(a, c),
+                ],
+                1,
+            )
+
+        args = [
+            torch.randn(16, 16, device="cuda"),
+            torch.randn(16, 16, device="cuda"),
+            torch.randn(16, 16, device="cuda"),
+        ]
+        expected = fn(*args)
+        actual = torch.compile(fn)(*args)
+        torch.testing.assert_close(actual, expected)
+        self.assertEqual(counters["inductor"]["pattern_matcher_count"], 1)
+        self.assertEqual(counters["inductor"]["pattern_matcher_nodes"], 4)
+
+    def test_cat_addmm(self):
+        def fn(a, b, c):
+            return torch.cat(
+                [
+                    torch.addmm(a, b, c),
+                    torch.addmm(b, c, a),
+                    torch.addmm(c, a, b),
+                ],
+                1,
+            )
+
+        args = [
+            torch.randn(16, 16, device="cuda"),
+            torch.randn(16, 16, device="cuda"),
+            torch.randn(16, 16, device="cuda"),
+        ]
+        expected = fn(*args)
+        actual = torch.compile(fn)(*args)
+        torch.testing.assert_close(actual, expected)
+        self.assertEqual(counters["inductor"]["pattern_matcher_count"], 1)
+        self.assertEqual(counters["inductor"]["pattern_matcher_nodes"], 4)
+
+    def test_cat_slice_cat(self):
+        def fn(a, b):
+            cat_1 = torch.ops.aten.cat.default([a, b], 1)
+            slice_1 = torch.ops.aten.slice.Tensor(cat_1, 0, 0, 9223372036854775807)
+            slice_2 = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 19)
+            return torch.ops.aten.cat.default([cat_1, slice_2], 1)
+
+        args = [
+            torch.randn(2, 32, device="cuda"),
+            torch.randn(2, 16, device="cuda"),
+        ]
+        expected = fn(*args)
+        actual = torch.compile(fn)(*args)
+        torch.testing.assert_close(actual, expected)
+        self.assertEqual(counters["inductor"]["pattern_matcher_count"], 1)
+        self.assertEqual(counters["inductor"]["pattern_matcher_nodes"], 4)
+
+        counters.clear()
+        args = [
+            torch.randn(2, 8, device="cuda"),
+            torch.randn(2, 16, device="cuda"),
+        ]
+        expected = fn(*args)
+        actual = torch.compile(fn)(*args)
+        torch.testing.assert_close(actual, expected)
+        self.assertEqual(counters["inductor"]["pattern_matcher_count"], 1)
+        self.assertEqual(counters["inductor"]["pattern_matcher_nodes"], 4)
+
+
+if __name__ == "__main__":
+    if IS_LINUX and HAS_CUDA:
+        run_tests()
diff --git a/test/inductor/test_perf.py b/test/inductor/test_perf.py
index 2b53c163421c..9279e4a9d8a3 100644
--- a/test/inductor/test_perf.py
+++ b/test/inductor/test_perf.py
@@ -1,15 +1,17 @@
 # Owner(s): ["module: inductor"]
 import contextlib
+import sys
 from unittest.mock import patch
 
 import functorch
 
 import torch._dynamo
 import torch._inductor.config as config
-from torch._dynamo.optimizations.backends import register_backend
+from torch._dynamo.backends.registry import register_backend
 from torch._inductor import metrics
 from torch._inductor.compile_fx import compile_fx, count_bytes_inner
 from torch.testing._internal.common_utils import (
+    IS_WINDOWS,
     TEST_WITH_ROCM,
     TestCase as TorchTestCase,
 )
@@ -23,9 +25,17 @@ def count_bytes_inductor(gm, example_inputs):
     return compile_fx(gm, example_inputs, inner_compile=count_bytes_inner)
 
 
-@torch._dynamo.optimize("count_bytes_inductor")
-def f(x):
-    return torch.cat([x, x.cos()])
+# TODO remove version check once dynamo supports 3.11
+if sys.version_info < (3, 11) and not IS_WINDOWS:
+
+    @torch._dynamo.optimize("count_bytes_inductor")
+    def f(x):
+        return torch.cat([x, x.cos()])
+
+else:
+
+    def f(x):
+        return torch.cat([x, x.cos()])
 
 
 def count_numel(f, *args):
@@ -325,6 +335,16 @@ def f(a, b):
         inp = (T(10, 10), TI(20, mx=10))
         self.assertExpectedInline(count_numel(f, *inp), """140""")
 
+    def test_mutation_fusion(self):
+        def f(a, b, c):
+            a0 = a.add(c)
+            b0 = b.add(a0)
+            b.copy_(b0)
+            a.copy_(a0)
+
+        inp = (T(10, 10), T(10, 10), T(10, 10))
+        self.assertExpectedInline(count_numel(f, *inp), """500""")
+
 
 class SchedulerFusionTests(TestCase):
     """
diff --git a/test/inductor/test_select_algorithm.py b/test/inductor/test_select_algorithm.py
index ffc0003e7112..cd87461d083b 100644
--- a/test/inductor/test_select_algorithm.py
+++ b/test/inductor/test_select_algorithm.py
@@ -1,6 +1,5 @@
 # Owner(s): ["module: inductor"]
 import functools
-import logging
 from unittest.mock import patch
 
 import torch
@@ -17,17 +16,15 @@
 
 
 def patches(fn):
-    def skip_cache(self, key, generate):
-        return generate()
+    def skip_cache(self, choices, name, key, generate):
+        return {choice: generate(choice) for choice in choices}
 
     for patcher in [
-        patch.object(dynamo_config, "log_level", logging.INFO),
-        patch.object(dynamo_config, "verbose", True),
-        patch.object(inductor_config, "debug", True),
-        patch.object(inductor_config, "max_autotune", True),
-        patch.object(inductor_config, "epilogue_fusion", True),
+        dynamo_config.patch(verbose=True),
+        inductor_config.patch(debug=True, max_autotune=True, epilogue_fusion=True),
         patch.object(select_algorithm, "VERIFY", dict(atol=1e-4, rtol=1e-4)),
         patch.object(select_algorithm.AlgorithmSelectorCache, "lookup", skip_cache),
+        torch.backends.cudnn.flags(allow_tf32=False),
     ]:
         fn = patcher(fn)
 
@@ -66,11 +63,30 @@ def test_addmm(self):
         def foo(input, weight, bias):
             return torch.addmm(bias, input, weight)
 
-        foo(
+        inps = (
             torch.randn(20, 33, device="cuda"),
             torch.randn(33, 16, device="cuda"),
             torch.randn(20, 16, device="cuda"),
         )
+
+        foo(*inps)
+        # Autotuning checks correctness of each version
+        self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)
+
+    @patch.object(select_algorithm, "VERIFY", dict(atol=5e-2, rtol=5e-2))
+    @patches
+    def test_addmm_fp16(self):
+        @torch.compile
+        def foo(input, weight, bias):
+            return torch.addmm(bias, input, weight)
+
+        inps = (
+            torch.randn(2, 320, device="cuda", dtype=torch.half),
+            torch.randn(320, 320, device="cuda", dtype=torch.half).t(),
+            torch.empty(320, device="cuda", dtype=torch.half),
+        )
+
+        foo(*inps)
         # Autotuning checks correctness of each version
         self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)
 
@@ -86,6 +102,18 @@ def foo(a, b):
         )
         self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)
 
+    @patches
+    def test__int_mm(self):
+        @torch.compile
+        def foo(a, b):
+            return torch._int_mm(a, b)
+
+        foo(
+            torch.randint(-10, 10, (64, 32), device="cuda", dtype=torch.int8),
+            torch.randint(-10, 10, (32, 64), device="cuda", dtype=torch.int8),
+        )
+        self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)
+
     @patches
     def test_mm_skip(self):
         @torch.compile
@@ -138,6 +166,21 @@ def foo(a, b, c):
         # Autotuning checks correctness of each version
         self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)
 
+    @patches
+    def test_mm_plus_mm(self):
+        @torch.compile
+        def foo(a, b, c, d):
+            return (a @ b) + (c @ d)
+
+        foo(
+            torch.randn(32, 32, device="cuda"),
+            torch.randn(32, 32, device="cuda"),
+            torch.randn(32, 32, device="cuda"),
+            torch.randn(32, 32, device="cuda"),
+        )
+        # Autotuning checks correctness of each version
+        self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)
+
 
 if __name__ == "__main__":
     from torch._inductor.utils import is_big_gpu
diff --git a/test/inductor/test_smoke.py b/test/inductor/test_smoke.py
index 89079723bc22..da2b2d288d45 100644
--- a/test/inductor/test_smoke.py
+++ b/test/inductor/test_smoke.py
@@ -1,15 +1,17 @@
 # Owner(s): ["module: inductor"]
 import logging
+import unittest
 
 import torch
 import torch._dynamo as torchdynamo
 import torch._inductor.config as torchinductor_config
 from torch.testing._internal.common_utils import IS_LINUX, TestCase
+from torch.testing._internal.inductor_utils import HAS_CUDA
 
 
 class MLP(torch.nn.Module):
     def __init__(self):
-        super(MLP, self).__init__()
+        super().__init__()
         self.l1 = torch.nn.Linear(1, 6)
         self.l2 = torch.nn.Linear(6, 1)
 
@@ -24,6 +26,7 @@ def _test_f(x):
 
 
 class SmokeTest(TestCase):
+    @unittest.skipIf(not HAS_CUDA, "Triton is not available")
     def test_mlp(self):
         torchdynamo.config.log_level = logging.INFO
         torchdynamo.config.verbose = True
@@ -36,6 +39,7 @@ def test_mlp(self):
         torchdynamo.config.verbose = False
         torchinductor_config.debug = False
 
+    @unittest.skipIf(not HAS_CUDA, "Triton is not available")
     def test_compile_decorator(self):
         @torch.compile
         def foo(x):
diff --git a/test/inductor/test_standalone_compile.py b/test/inductor/test_standalone_compile.py
new file mode 100644
index 000000000000..eceddfea94da
--- /dev/null
+++ b/test/inductor/test_standalone_compile.py
@@ -0,0 +1,102 @@
+# Owner(s): ["module: inductor"]
+import torch
+from torch import _dynamo as dynamo, _inductor as inductor
+from torch._dynamo.test_case import run_tests, TestCase
+from torch.fx import symbolic_trace
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch.testing._internal.inductor_utils import HAS_CPU
+
+
+class MyModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.a = torch.nn.Linear(10, 10)
+        self.b = torch.nn.Linear(10, 10)
+        self.relu = torch.nn.ReLU()
+
+    def forward(self, x):
+        x = self.relu(self.a(x))
+        x = torch.sigmoid(self.b(x))
+        return x
+
+
+class MyModule2(MyModule):
+    def forward(self, x):  # takes a dict of list
+        a, b = x["key"]
+        return {"result": super().forward(a) + b}
+
+
+class MyModule3(MyModule):
+    def forward(self, x):
+        return (super().forward(x),)
+
+
+class TestStandaloneInductor(TestCase):
+    """
+    These test check that you can call TorchInductor directly without
+    going through TorchDynamo.
+    """
+
+    def test_inductor_via_fx(self):
+        mod = MyModule3().eval()
+        inp = torch.randn(10)
+        correct = mod(inp)
+        mod_opt = inductor.compile(symbolic_trace(mod), [inp])
+        actual = mod_opt(inp)
+        self.assertEqual(actual, correct)
+
+    def test_inductor_via_fx_tensor_return(self):
+        mod = MyModule().eval()
+        inp = torch.randn(10)
+        correct = mod(inp)
+        mod_opt = inductor.compile(symbolic_trace(mod), [inp])
+        actual = mod_opt(inp)
+        self.assertEqual(actual, correct)
+
+    def test_inductor_via_fx_dict_input(self):
+        mod = MyModule2().eval()
+        inp = {"key": [torch.randn(10), torch.randn(10)]}
+        correct = mod(inp)
+        mod_opt = inductor.compile(symbolic_trace(mod), [inp])
+        actual = mod_opt(inp)
+        self.assertEqual(actual, correct)
+
+    def test_inductor_via_make_fx(self):
+        mod = MyModule().eval()
+        inp = torch.randn(10)
+        correct = mod(inp)
+        mod_opt = inductor.compile(make_fx(mod)(inp), [inp])
+        actual = mod_opt(inp)
+        self.assertEqual(actual, correct)
+
+    def test_inductor_via_bare_module(self):
+        mod = MyModule3().eval()
+        inp = torch.randn(10)
+        correct = mod(inp)
+        # no FX graph at all (mod must return list/tuple in this case)
+        mod_opt = inductor.compile(mod, [inp])
+        actual = mod_opt(inp)
+        self.assertEqual(actual, correct)
+
+    def test_inductor_via_export1(self):
+        mod = MyModule3().eval()
+        inp = torch.randn(10)
+        correct = mod(inp)
+        gm, guards = dynamo.export(mod, inp, aten_graph=True, tracing_mode="symbolic")
+        mod_opt = inductor.compile(gm, [inp])
+        actual = mod_opt(inp)
+        self.assertEqual(actual, correct)
+
+    def test_inductor_via_export2(self):
+        mod = MyModule2().eval()
+        inp = {"key": [torch.randn(10), torch.randn(10)]}
+        correct = mod(inp)
+        gm, guards = dynamo.export(mod, inp)
+        mod_opt = inductor.compile(gm, [inp])
+        actual = mod_opt(inp)
+        self.assertEqual(actual, correct)
+
+
+if __name__ == "__main__":
+    if HAS_CPU:
+        run_tests()
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index fbeb819ee060..60acd0824918 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -2,12 +2,10 @@
 import contextlib
 import dataclasses
 import functools
-import glob
 import importlib
 import itertools
 import os
 import random
-import shutil
 import sys
 import typing
 import unittest
@@ -17,70 +15,87 @@
 
 import numpy as np
 
+import sympy
+
 import torch
 
 import torch._dynamo
+from torch._dispatch.python import enable_python_dispatcher
 from torch._dynamo.debug_utils import same_two_models
-from torch._dynamo.testing import make_test_cls_with_patches, rand_strided, same
+from torch._dynamo.testing import rand_strided, same
+from torch._inductor.codegen.cpp import CppVecKernelChecker
+from torch._inductor.graph import GraphLowering
+from torch._inductor.ir import InterpreterShim
+from torch._inductor.utils import run_and_get_triton_code
+from torch._inductor.virtualized import V
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.fx.passes.shape_prop import ShapeProp
 from torch.nn import functional as F
 from torch.testing import make_tensor
 from torch.testing._internal.common_dtype import all_types
 from torch.testing._internal.common_utils import (
+    IS_CI,
+    IS_MACOS,
+    IS_WINDOWS,
+    IS_X86,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
+    TEST_WITH_SLOW,
     TestCase as TorchTestCase,
 )
 from torch.utils._python_dispatch import TorchDispatchMode
 from torch.utils._pytree import tree_flatten, tree_unflatten
 
-try:
-    import sympy
-
-    importlib.import_module("functorch")
-    importlib.import_module("filelock")
-
-    import torch._inductor.config
-    from functorch.compile import config as functorch_config
-    from torch._decomp import get_decompositions
-    from torch._inductor import codecache, config, metrics, test_operators
-    from torch._inductor.codegen.cpp import cexpr, CppOverrides, CppVecOverrides
-    from torch._inductor.codegen.triton import texpr
-    from torch._inductor.compile_fx import compile_fx, complex_memory_overlap
-    from torch._inductor.ir import IndexingDiv, ModularIndexing
-    from torch._inductor.overrides import (
-        linear_permute_fusion,
-        linear_transpose,
-        permute_linear_fusion,
-        permute_matmul_fusion,
-        sink_cat_after_pointwise,
-        transpose_linear,
-        transpose_matmul,
+if IS_WINDOWS and IS_CI:
+    sys.stderr.write(
+        "Windows CI does not have necessary dependencies for test_torchinductor yet\n"
     )
-    from torch._inductor.sizevars import SizeVarAllocator
-    from torch._inductor.utils import has_torchvision_roi_align, timed
-
-    # This will only pass on pytorch builds newer than roughly 5/15/2022
-    assert get_decompositions([torch.ops.aten.trace])
-    # Requires functorch
-    from torch._inductor.compile_fx import compile_fx_inner
-except (ImportError, AssertionError) as e:
-    sys.stderr.write(f"{type(e)}: {e}\n")
     if __name__ == "__main__":
         sys.exit(0)
-    raise unittest.SkipTest("requires sympy/functorch/filelock") from e
+    raise unittest.SkipTest("requires sympy/functorch/filelock")
+
+importlib.import_module("functorch")
+importlib.import_module("filelock")
+
+from functorch.compile import config as functorch_config
+from torch._decomp import get_decompositions
+from torch._inductor import codecache, config, metrics, test_operators
+from torch._inductor.codegen.cpp import cexpr, CppOverrides, CppVecOverrides
+from torch._inductor.codegen.triton import texpr
+from torch._inductor.codegen.wrapper import pexpr
+
+from torch._inductor.compile_fx import (
+    compile_fx,
+    compile_fx_inner,
+    complex_memory_overlap,
+)
+from torch._inductor.ir import ModularIndexing
+from torch._inductor.overrides import (
+    linear_permute_fusion,
+    linear_transpose,
+    permute_linear_fusion,
+    permute_matmul_fusion,
+    sink_cat_after_pointwise,
+    transpose_linear,
+    transpose_matmul,
+)
+from torch._inductor.sizevars import SizeVarAllocator
+from torch._inductor.utils import has_torchvision_roi_align, timed
+from torch.fx.experimental.symbolic_shapes import FloorDiv
 
 from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
 
 HAS_MULTIGPU = HAS_CUDA and torch.cuda.device_count() >= 2
+HAS_AVX2 = "fbgemm" in torch.backends.quantized.supported_engines
 aten = torch.ops.aten
 requires_cuda = functools.partial(unittest.skipIf, not HAS_CUDA, "requires cuda")
 requires_multigpu = functools.partial(
     unittest.skipIf, not HAS_MULTIGPU, "requires multiple cuda devices"
 )
-
-torch._inductor.config.triton.autotune_pointwise = False  # too slow
+slow = functools.partial(unittest.skipIf, not TEST_WITH_SLOW, "too slow")
+skip_if_x86_mac = functools.partial(
+    unittest.skipIf, IS_MACOS and IS_X86, "Does not work on x86 Mac"
+)
 
 
 # For OneDNN bf16 path, OneDNN requires the cpu has intel avx512 with avx512bw,
@@ -108,6 +123,7 @@ def has_bf16_support():
     torch.nn.GELU(approximate="tanh"),
     torch.nn.ReLU6(),
     torch.nn.SiLU(),
+    torch.nn.Hardsigmoid(),
     lambda x: F.relu(x),
     lambda x: F.sigmoid(x),
     lambda x: F.tanh(x),
@@ -118,6 +134,13 @@ def has_bf16_support():
     lambda x: F.gelu(x, approximate="tanh"),
     lambda x: F.relu6(x),
     lambda x: F.silu(x),
+    lambda x: F.hardsigmoid(x),
+    lambda x: torch.relu(x),
+    lambda x: torch.sigmoid(x),
+    lambda x: torch.tanh(x),
+    lambda x: x.relu(),
+    lambda x: x.sigmoid(),
+    lambda x: x.tanh(),
 ]
 
 
@@ -180,8 +203,16 @@ class TestCase(TorchTestCase):
     def setUpClass(cls):
         super().setUpClass()
         cls._stack = contextlib.ExitStack()
-        cls._stack.enter_context(patch.object(config, "debug", True))
-        cls._stack.enter_context(patch.object(config.cpp, "min_chunk_size", 1))
+        cls._stack.enter_context(
+            config.patch(
+                {
+                    "debug": True,
+                    "cpp.min_chunk_size": 1,
+                    "triton.autotune_pointwise": False,  # too slow
+                    "implicit_fallbacks": False,
+                }
+            )
+        )
 
     @classmethod
     def tearDownClass(cls):
@@ -262,12 +293,26 @@ def gather_leaf_tensors(args, kwargs):
 def clone_preserve_strides(x):
     if not isinstance(x, torch.Tensor):
         return x
-    buffer = torch.as_strided(x, (x.storage().size(),), (1,), 0).clone()
+    buffer = torch.as_strided(
+        x, (x.untyped_storage().size() // x.element_size(),), (1,), 0
+    ).clone()
     out = torch.as_strided(buffer, x.size(), x.stride(), x.storage_offset())
     return out
 
 
-@patch.object(torch._inductor.config.triton, "cudagraphs", False)
+@patch.object(config, "debug", True)
+def run_and_get_cpp_code(fn, args):
+    torch._dynamo.reset()
+    import io
+    from contextlib import redirect_stdout
+
+    f = io.StringIO()
+    with redirect_stdout(f):
+        fn(*args)
+    s = f.getvalue()
+    return s
+
+
 def check_model(
     self: TestCase,
     model,
@@ -389,7 +434,6 @@ def run(*ex, **kwargs):
                     assert correct_val.dtype == actual_val.dtype
 
     if check_gradient:
-
         # generate random unit norm gradients
         grads = [
             torch.rand(r.shape, device=r.device, dtype=r.dtype)
@@ -414,7 +458,7 @@ def run(*ex, **kwargs):
     torch._dynamo.reset()
 
 
-@patch.object(torch._inductor.config.triton, "cudagraphs", False)
+@torch._inductor.config.patch("triton.cudagraphs", False)
 def check_model_cuda(
     self: TestCase,
     model,
@@ -472,6 +516,8 @@ def downcast_fn(x):
         example_inputs = list(map(downcast_fn, example_inputs))
         if hasattr(model, "to"):
             model = model.to(torch.half)
+        if rtol is not None:
+            rtol = max(2e-3, rtol)
         check_model(
             self,
             model,
@@ -530,10 +576,10 @@ def populate(cls):
 class TestIndexingSimplification(TorchTestCase):
     def test_indexing_simplification(self):
         sizevars = SizeVarAllocator()
-        i0 = sympy.Symbol("i0")
-        i1 = sympy.Symbol("i1")
-        i2 = sympy.Symbol("i2")
-        r3 = sympy.Symbol("r3")
+        i0 = sympy.Symbol("i0", integer=True)
+        i1 = sympy.Symbol("i1", integer=True)
+        i2 = sympy.Symbol("i2", integer=True)
+        r3 = sympy.Symbol("r3", integer=True)
 
         var_ranges = {i0: 3136, i1: 64, i2: 32, r3: 3}
         expr = (
@@ -552,7 +598,7 @@ def test_indexing_simplification(self):
         self.assertEqual(
             sizevars.simplify_with_ranges(expr, var_ranges), i1 + 128 * i2 + 64 * r3
         )
-        # if there are negative terms in ModularIndexing base, we cannot replace it with IndexingDiv
+        # if there are negative terms in ModularIndexing base, we cannot replace it with FloorDiv
         expr = ModularIndexing(i1 - 15, 1, 64)
         self.assertEqual(
             sizevars.simplify_with_ranges(expr, var_ranges),
@@ -560,8 +606,8 @@ def test_indexing_simplification(self):
         )
         # small terms should be kept if the rest is not guaranteed to be divisible
         self.assertEqual(
-            sizevars.simplify_with_ranges(IndexingDiv(r3 + i2 + i1, 32), var_ranges),
-            IndexingDiv(r3 + i2 + i1, 32),
+            sizevars.simplify_with_ranges(FloorDiv(r3 + i2 + i1, 32), var_ranges),
+            FloorDiv(r3 + i2 + i1, 32),
         )
 
         expr = ModularIndexing(2 * i2 + r3, 1, 64)
@@ -569,7 +615,7 @@ def test_indexing_simplification(self):
         self.assertEqual(sizevars.simplify_with_ranges(expr, var_ranges), 2 * i2 + r3)
 
         # check the same thing but with symbolic divisor
-        self.assertEqual(IndexingDiv(r3 * i0, r3), i0)
+        self.assertEqual(FloorDiv(r3 * i0, r3), i0)
         self.assertEqual(ModularIndexing(r3 * i0, r3, 10), ModularIndexing(i0, 1, 10))
 
         # (10*i) % 10 is always zero and should get optimized away
@@ -597,7 +643,7 @@ def test_indexing_simplification(self):
 
         # Constant fold from divisor into base
         self.assertEqual(ModularIndexing(i0 * 4, 2, 10), ModularIndexing(i0 * 2, 1, 10))
-        self.assertEqual(IndexingDiv(i0 * 4, 2), i0 * 2)
+        self.assertEqual(FloorDiv(i0 * 4, 2), i0 * 2)
 
         # Nested modular indexing is correctly simplified
         var_ranges = {"i1": 13, "i2": 121}
@@ -607,16 +653,16 @@ def test_indexing_simplification(self):
         self.assertEqual(sizevars.simplify_with_ranges(expr, var_ranges), expr)
         var_ranges = {"i2": 784}
         expr = ModularIndexing(ModularIndexing(i2, 1, 28), 7, 4)
-        expected = IndexingDiv(ModularIndexing(i2, 1, 28), 7)
+        expected = FloorDiv(ModularIndexing(i2, 1, 28), 7)
         self.assertEqual(sizevars.simplify_with_ranges(expr, var_ranges), expected)
         expr = ModularIndexing(ModularIndexing(i2, 1, 28) + 1, 7, 4)
         self.assertEqual(sizevars.simplify_with_ranges(expr, var_ranges), expr)
 
     def test_indexing_join(self):
         sizevars = SizeVarAllocator()
-        i0 = sympy.Symbol("i0")
-        i1 = sympy.Symbol("i1")
-        i2 = sympy.Symbol("i2")
+        i0 = sympy.Symbol("i0", integer=True)
+        i1 = sympy.Symbol("i1", integer=True)
+        i2 = sympy.Symbol("i2", integer=True)
 
         # join two ModularIndexing calls into one larger one when possible
         expr1 = ModularIndexing(i0, 1, 32) + 32 * ModularIndexing(i0, 32, 4)
@@ -654,8 +700,8 @@ def test_indexing_join(self):
             ModularIndexing(i0, 10, i1 * i2) + 10,
         )
 
-        # works for ModularIndexing + IndexingDiv
-        expr5 = 197 * IndexingDiv(i0, 197) + ModularIndexing(i0, 1, 197)
+        # works for ModularIndexing + FloorDiv
+        expr5 = 197 * FloorDiv(i0, 197) + ModularIndexing(i0, 1, 197)
         simplified = sizevars.simplify_with_ranges(expr5, {})
         self.assertEqual(simplified, i0)
         self.assertEqual(expr5.subs({i0: 39485}), simplified.subs({i0: 39485}))
@@ -667,9 +713,9 @@ def test_indexing_join(self):
         )
 
         # divisor != 1
-        expr6 = 197 * IndexingDiv(i0, 197 * 3) + ModularIndexing(i0, 3, 197)
+        expr6 = 197 * FloorDiv(i0, 197 * 3) + ModularIndexing(i0, 3, 197)
         simplified = sizevars.simplify_with_ranges(expr6, {})
-        self.assertEqual(simplified, IndexingDiv(i0, 3))
+        self.assertEqual(simplified, FloorDiv(i0, 3))
         self.assertEqual(expr6.subs({i0: 39485}), simplified.subs({i0: 39485}))
 
 
@@ -717,6 +763,16 @@ def fn(x, y):
 
         self.common(fn, (x, y))
 
+    def test_concat_add_inplace(self):
+        def fn(x, y, z):
+            return torch.cat([x, y], dim=1).add_(z)
+
+        x = torch.randn([2, 12, 14, 14])
+        y = torch.randn([2, 12, 14, 14])
+        z = torch.randn([2, 24, 14, 14])
+
+        self.common(fn, (x, y, z))
+
     def test_abs(self):
         def fn(a):
             return (a / (torch.abs(a) + 1),)
@@ -729,6 +785,16 @@ def fn(a):
 
         self.common(fn, [torch.linspace(-10, 10, 41)])
 
+    def test_randn_generator(self):
+        def fn(a, generator):
+            torch.randn([20, 20], generator=generator, device=a.device)
+
+        self.common(fn, (torch.linspace(-10, 10, 41), None))
+
+        # generator not yet supported in dynamo
+        with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, "Generator"):
+            self.common(fn, (torch.linspace(-10, 10, 41), torch.Generator(self.device)))
+
     def test_sgn_extremal(self):
         def fn(a):
             return (torch.sgn(a),)
@@ -746,6 +812,23 @@ def fn(a, b):
         t2[1] = float("nan")
         self.common(fn, (t1, t2))
 
+    def test_neg_max_uint8(self):
+        # https://github.com/pytorch/pytorch/issues/93380
+        def fn(a, b):
+            c = torch.neg(a)
+            return torch.maximum(b, c)
+
+        a = torch.randint(256, (1,), dtype=torch.uint8)
+        b = torch.randint(256, (8390,), dtype=torch.uint8)
+        self.common(fn, (a, b))
+
+    def test_compar(self):
+        def fn(x):
+            return x.gt(3.5), x.ge(3.5), x.eq(3.5), x.le(2.5), x.lt(3.5), x.ne(3.5)
+
+        a = torch.tensor([3])
+        self.common(fn, (a,))
+
     def test_horizonal_fusion1(self):
         def fn(a, b, c):
             return (a + b, a - c, b * c)
@@ -873,6 +956,7 @@ def fn(a):
 
         self.common(fn, (torch.tensor([float("-inf"), 0.0, float("inf")]),))
 
+    @skip_if_x86_mac()
     def test_reduction2(self):
         def fn(a):
             # FIXME: a.argmax
@@ -880,6 +964,7 @@ def fn(a):
 
         self.common(fn, (torch.full((4,), float("inf")),))
 
+    @skip_if_x86_mac()
     def test_reduction3(self):
         def fn(a):
             # FIXME: a.argmin
@@ -898,7 +983,16 @@ def fn(a):
         for i in inputs:
             self.common(fn, (i,))
 
-    @patch.object(config, "dynamic_shapes", False)
+    @config.patch(unroll_reductions_threshold=1)
+    def test_reduction5(self):
+        if self.device == "cpu":
+            raise unittest.SkipTest("Non-deterministic CPU results")
+
+        def fn(a):
+            return (a.sum(), a.max(), a.min(), a.argmax())
+
+        self.common(fn, (torch.full((4,), float("-inf")),))
+
     def test_unroll_small_reduction(self):
         def fn(x):
             val1, index1 = x.min(-1)
@@ -917,11 +1011,11 @@ def fn(x):
                 x.amax(-1),
             )
 
-        with patch.object(config, "unroll_reductions_threshold", 8):
+        with config.patch(unroll_reductions_threshold=8):
             # small sized reductions will get unrolled
             self.common(fn, (torch.randn(8, 3),))
         torch._dynamo.reset()
-        with patch.object(config, "unroll_reductions_threshold", 1):
+        with config.patch(unroll_reductions_threshold=1):
             # make sure things also work if they aren't unrolled
             self.common(fn, (torch.randn(8, 3),))
 
@@ -974,6 +1068,14 @@ def fn(a, b):
 
         self.common(fn, (torch.randn(8, 8), torch.randn(8, 8)))
 
+    def test_clamp_type_promotion(self):
+        def fn(a):
+            b = torch.tensor(1.0, dtype=torch.double, device=self.device)
+            c = torch.full((4,), 2, device=self.device)
+            return a.clamp(min=b, max=c)
+
+        self.common(fn, (torch.randint(4, (4,)),))
+
     def test_arange1(self):
         def fn(x):
             rng1 = torch.arange(8 * 8, dtype=torch.float32, device=x.device).view(8, 8)
@@ -1004,6 +1106,28 @@ def fn(x):
 
         self.common(fn, (torch.randn(1024),))
 
+    def test_arange5(self):
+        def fn(step, device):
+            return torch.arange(512, -512, step, device=device)
+
+        compiled_fn = torch._dynamo.optimize()(fn)
+
+        # NOTE: use assertEqual to check dtypes which self.common doesn't do
+        for step in (-1, -1.0):
+            expect = fn(step, self.device)
+            actual = compiled_fn(step, self.device)
+            self.assertEqual(expect, actual)
+        self.assertEqual(expect, actual)
+
+    def test_arange6(self):
+        def fn(x):
+            return torch.arange(0.1, 8.0001, 1, dtype=x.dtype, device=x.device)
+
+        # Test that float arguments are truncated to int when dtype is set explicitly
+        make_arg = functools.partial(make_tensor, device="cpu", requires_grad=False)
+        self.common(fn, (make_arg(1, dtype=torch.float32),))
+        self.common(fn, (make_arg(1, dtype=torch.int64),))
+
     def test_linspace1(self):
         def fn(x):
             return torch.linspace(0.125, 0.875, 7, device=x.device) + x
@@ -1102,6 +1226,22 @@ def forward(arg1, arg2):
             ),
         )
 
+    def test_views4(self):
+        # example taken from hf_BigBird
+        def forward(arg1, arg2):
+            arg1 = arg1.index_select(0, arg2)
+            arg1 = torch.ops.aten.view(arg1, [2, 3, 4, 5, 5])
+            arg1 = torch.ops.aten.view(arg1, [2, 3, 2, 10, -1])
+            return arg1
+
+        self.common(
+            forward,
+            (
+                torch.randn(12, 5, 5),
+                torch.randint(0, 11, (24,)),
+            ),
+        )
+
     def test_relu(self):
         def fn(a, b):
             return (torch.relu(a), torch.relu(a + b) / 10)
@@ -1529,9 +1669,6 @@ def fn(a, b):
 
     def test_shape_prop_torch_ones(self):
         class Model(torch.nn.Module):
-            def __init__(self):
-                super(Model, self).__init__()
-
             def forward(self, attention_scores):
                 extended_attention_mask = torch.ones(
                     8, 1, 1, 512, device=attention_scores.device
@@ -1547,6 +1684,7 @@ def forward(self, attention_scores):
                 (torch.randn(8, 12, 512, 512),),
             )
 
+    @slow()
     def test_conv_bn_fuse(self):
         # For gpu path, there is an accuracy issue
         if self.device == "cuda":
@@ -1623,7 +1761,7 @@ def __init__(
                 dtype=None,
             ):
                 factory_kwargs = {"device": device, "dtype": dtype}
-                super(BatchNorm, self).__init__(
+                super().__init__(
                     num_features,
                     eps=eps,
                     momentum=momentum,
@@ -1689,19 +1827,59 @@ def forward(self, x):
                 (v,),
             )
 
+    def test_upsample_cat_conv(self):
+        if self.device == "cuda":
+            raise unittest.SkipTest("only support cpu upsample_cat_conv test")
+
+        class M(torch.nn.Module):
+            def __init__(
+                self,
+                **kwargs,
+            ):
+                super().__init__()
+                self.upsample = torch.nn.UpsamplingNearest2d(scale_factor=2)
+                self.conv = torch.nn.Conv2d(
+                    8,
+                    5,
+                    kernel_size=1,
+                    padding=0,
+                    stride=1,
+                    dilation=1,
+                    **kwargs,
+                )
+
+            def forward(self, x, y):
+                x = self.upsample(x)
+                z = torch.cat([x, y], dim=1)
+                z = self.conv(z)
+                return z
+
+        v1 = torch.randn([8, 2, 12, 26])
+        v2 = torch.randn([8, 6, 24, 52])
+
+        with torch.no_grad():
+            self.common(
+                M().eval(),
+                (v1, v2),
+            )
+
     def test_conv2d_packed(self):
         if self.device == "cuda":
             raise unittest.SkipTest("only support cpu conv2d packed test")
 
         x_shape = (1, 3, 56, 56)
-        mod = torch.nn.Sequential(torch.nn.Conv2d(3, 64, 3, 3)).eval()
-        v = torch.randn(x_shape, dtype=torch.float32)
-        with torch.no_grad():
-            self.common(
-                mod,
-                (v,),
+        for mode_train in [True, False]:
+            mod = torch.nn.Sequential(torch.nn.Conv2d(3, 64, 3, 3)).train(
+                mode=mode_train
             )
+            v = torch.randn(x_shape, dtype=torch.float32)
+            with torch.no_grad():
+                self.common(
+                    mod,
+                    (v,),
+                )
 
+    @slow()
     def test_conv2d_unary(self):
         # For gpu path, there is an accuracy issue
         # see https://github.com/pytorch/pytorch/issues/87745
@@ -1716,7 +1894,7 @@ def __init__(
                 out_channels,
                 **kwargs,
             ):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(
                     in_channels,
                     out_channels,
@@ -1737,6 +1915,7 @@ def forward(self, x):
             [1, 4],
             ["same", 0],
             test_memory_format,
+            [True, False],
         )
 
         for (
@@ -1747,6 +1926,7 @@ def forward(self, x):
             groups,
             padding,
             memory_format,
+            mode_train,
         ) in options:
             oC = 32 * groups
             iC = 3 * groups
@@ -1760,7 +1940,7 @@ def forward(self, x):
                 dilation=dilation,
                 groups=groups,
                 bias=bias,
-            ).eval()
+            ).train(mode=mode_train)
 
             # TODO: add bf16 test for cpu path?
             # TODO: this test fails when requires_grad=False
@@ -1775,6 +1955,7 @@ def forward(self, x):
                     (v,),
                 )
 
+    @slow()
     def test_conv2d_binary(self):
         # For gpu path, there is an accuracy issue
         # see https://github.com/pytorch/pytorch/issues/87745
@@ -1794,7 +1975,7 @@ def __init__(
                 bias,
                 **kwargs,
             ):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(
                     in_channels,
                     out_channels,
@@ -1833,6 +2014,7 @@ def forward(self, x):
             [1, 4],
             ["same", 0],
             test_memory_format,
+            [True, False],
         )
 
         for (
@@ -1844,6 +2026,7 @@ def forward(self, x):
             groups,
             padding,
             memory_format,
+            mode_train,
         ) in options:
             oC = 32 * groups
             iC = 3 * groups
@@ -1858,7 +2041,7 @@ def forward(self, x):
                 padding,
                 bias,
                 kernel_size=kernel_size,
-            ).eval()
+            ).train(mode=mode_train)
             mod = mod.to(memory_format=memory_format)
             # TODO: add bf16 test
             v = torch.randn(x_shape, dtype=torch.float32).to(
@@ -1871,7 +2054,7 @@ def forward(self, x):
                 )
 
     def test_linear_packed(self):
-        options = itertools.product([[2, 3, 10], [2, 10]], [True, False])
+        options = itertools.product([[2, 3, 10], [2, 10], [10]], [True, False])
         for input_shape, bias in options:
             mod = torch.nn.Sequential(
                 torch.nn.Linear(input_shape[-1], 30, bias=bias)
@@ -1884,6 +2067,36 @@ def test_linear_packed(self):
                     (v,),
                 )
 
+    def test_linear_buffer_reuse(self):
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear1 = torch.nn.Linear(16, 16)
+                self.tanh = torch.nn.Tanh()
+                self.linear2 = torch.nn.Linear(16, 16)
+
+            def forward(self, x):
+                x = self.linear1(x)
+                x = self.tanh(x)
+                x = self.linear2(x)
+                return x
+
+        mod = M().eval()
+        v = torch.randn(1, 16)
+
+        with torch.no_grad():
+
+            def compile_fx_wrapper(model_, example_inputs_):
+                return compile_fx(model_, example_inputs_)
+
+            def run(*ex, **kwargs):
+                return mod(*ex, **kwargs)
+
+            run = torch._dynamo.optimize(compile_fx_wrapper)(run)
+            code = run_and_get_cpp_code(run, (v,))
+            self.assertFalse("= as_strided(" in code)
+            self.assertEqual(run(*v), mod(*v))
+
     def test_linear_unary(self):
         class M(torch.nn.Module):
             def __init__(
@@ -1894,7 +2107,7 @@ def __init__(
                 bias,
                 **kwargs,
             ):
-                super(M, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(
                     in_features,
                     out_features,
@@ -1924,7 +2137,7 @@ def forward(self, x):
     def test_linear_binary(self):
         class M(torch.nn.Module):
             def __init__(self, eltwise_fn, in_channels, out_channels, bias, **kwargs):
-                super(M, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(
                     in_channels, out_channels, bias=bias, **kwargs
                 )
@@ -1949,6 +2162,95 @@ def forward(self, x, y):
                 with torch.no_grad():
                     self.common(mod, (v, other), atol=2e-3, rtol=0.016)
 
+    def test_conv_transpose2d_packed(self):
+        if self.device == "cuda":
+            raise unittest.SkipTest("only support cpu conv_transpose2d packed test")
+
+        x_shape = (1, 3, 28, 28)
+        mod = torch.nn.Sequential(torch.nn.ConvTranspose2d(3, 64, 3, 3)).eval()
+        v = torch.randn(x_shape, dtype=torch.float32)
+        with torch.no_grad():
+            self.common(
+                mod,
+                (v,),
+            )
+
+    def test_conv_transpose2d_unary(self):
+        if self.device == "cuda":
+            raise unittest.SkipTest("only support cpu conv_transpose2d unary test")
+
+        class M(torch.nn.Module):
+            def __init__(
+                self,
+                unary_fn,
+                in_channels,
+                out_channels,
+                **kwargs,
+            ):
+                super().__init__()
+                self.conv_transpose2d = torch.nn.ConvTranspose2d(
+                    in_channels,
+                    out_channels,
+                    **kwargs,
+                )
+                self.unary_fn = unary_fn
+
+            def forward(self, x):
+                x = self.conv_transpose2d(x)
+                return self.unary_fn(x)
+
+        test_memory_format = [torch.contiguous_format, torch.channels_last]
+        options = itertools.product(
+            unary_list,
+            [True, False],
+            [1, 3],
+            [1, 2],
+            [1, 4],
+            [0, 1],
+            test_memory_format,
+        )
+
+        for (
+            unary_fn,
+            bias,
+            kernel_size,
+            dilation,
+            groups,
+            padding,
+            memory_format,
+        ) in options:
+            oC = 32 * groups
+            iC = 3 * groups
+            x_shape = (1, iC, 28, 28)
+            mod = M(
+                unary_fn,
+                iC,
+                oC,
+                kernel_size=kernel_size,
+                padding=padding,
+                dilation=dilation,
+                groups=groups,
+                bias=bias,
+            ).eval()
+
+            v = torch.randn(x_shape, dtype=torch.float32).to(
+                memory_format=memory_format
+            )
+            with torch.no_grad():
+                self.common(
+                    mod,
+                    (v,),
+                )
+
+    def test_view_detach(self):
+        def fn(a):
+            return a[0].detach()
+
+        self.common(
+            fn,
+            (torch.randn([4, 4], requires_grad=True),),
+        )
+
     def test_gather1(self):
         def fn(a, b):
             return (
@@ -1973,6 +2275,18 @@ def fn(a, b):
         y = torch.tensor(0)
         self.assertEqual(fn(x, y), x + x)
 
+    def test_gather3(self):
+        def fn(a, b):
+            return torch.gather(a, 1, b, sparse_grad=True)
+
+        self.common(
+            fn,
+            (
+                torch.randn([4, 5, 10, 6], requires_grad=True),
+                torch.randint(5, [4, 5, 10, 1], dtype=torch.int64),
+            ),
+        )
+
     def test_slice1(self):
         def fn(a):
             return (
@@ -2370,6 +2684,19 @@ def fn(x):
         )
         self.assertEqual(torch._inductor.metrics.generated_kernel_count, 0)
 
+    # From https://github.com/pytorch/pytorch/issues/94775
+    def test_max_pool2d7(self):
+        # ceil mode turns on
+        def fn(x):
+            return torch.nn.functional.max_pool2d(
+                x, 1, stride=(2, 2), padding=0, ceil_mode=True
+            )
+
+        self.common(
+            fn,
+            (torch.randn([1, 1, 6, 7]),),
+        )
+
     def test_avg_pool2d1(self):
         def fn(x):
             return aten.avg_pool2d(x, [3, 3], [2, 2])
@@ -2475,6 +2802,15 @@ def fn(x):
             (torch.randn([16, 16]),),
         )
 
+    def test_tan(self):
+        def fn(x):
+            return aten.tan(x) + 2, aten.tan(x + 1)
+
+        self.common(
+            fn,
+            (torch.randn([16, 16]),),
+        )
+
     def test_tanh(self):
         def fn(x):
             return aten.tanh(x) + 2, aten.tanh(x + 1)
@@ -2562,7 +2898,7 @@ def fn(x):
             (torch.randn([1, 2, 4, 8]),),
         )
 
-    @patch.object(config, "pick_loop_orders", True)
+    @config.patch(pick_loop_orders=True)
     def test_transposed_propagates(self):
         @torch._dynamo.optimize("inductor", nopython=True)
         def fn(x, y):
@@ -2634,11 +2970,12 @@ def fn(a, b):
         if self.device != "cpu":
             self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
 
-    def test_softmax_one_kernel(self):
+    @patch.object(config.triton, "persistent_reductions", True)
+    def test_softmax_one_kernel_persist(self):
         def fn(x):
             dim = 1
             x_max = torch.amax(x, dim, keepdim=True)
-            unnormalized = torch.exp(x * x_max)
+            unnormalized = torch.exp(x - x_max)
             result = unnormalized / torch.sum(unnormalized, dim, keepdim=True)
             return result
 
@@ -2646,6 +2983,18 @@ def fn(x):
         if self.device != "cpu":
             self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
 
+    @patch.object(config.triton, "persistent_reductions", False)
+    def test_softmax_one_kernel_loop(self):
+        def fn(x):
+            x_max = torch.amax(x, 1, keepdim=True)
+            unnormalized = torch.exp(x - x_max)
+            result = unnormalized / torch.sum(unnormalized, 1, keepdim=True)
+            return result
+
+        self.common(fn, (torch.randn([16, 32]),), check_lowp=False)
+        if self.device != "cpu":
+            self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
+
     def test_cauchy(self):
         def fn(x, y):
             return torch.sum(1 / (torch.unsqueeze(x, -1) - y))
@@ -2691,7 +3040,7 @@ def fn(node_feat, edge_index):
         if self.device != "cpu":
             self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
 
-    @patch.object(torch._inductor.config, "max_fusion_size", 1)
+    @config.patch(max_fusion_size=1)
     def test_no_mega_fusion_during_lowering(self):
         n = 50
 
@@ -2772,7 +3121,6 @@ def fn(mask, value):
             ),
             torch.randint(16, (16, 16), device=self.device),
         ):
-
             inputs = (
                 torch.randint(0, 1, [1, 16], dtype=torch.bool, device=self.device),
                 inp,
@@ -2814,7 +3162,13 @@ def fn(x):
 
         self.common(
             fn,
-            (torch.randn([16, 16]),),
+            # TODO: Remove dtype once https://github.com/pytorch/pytorch/issues/94010 is fixed
+            (
+                torch.randn(
+                    [16, 16],
+                    dtype=torch.float64 if self.device == "cpu" else torch.float32,
+                ),
+            ),
             # Mismatched elements: 9 / 256 (3.5%)
             # Greatest absolute difference: 2.491354329061828e+28 at index (6, 6) (up to 1e-05 allowed)
             # Greatest relative difference: 2.9793410720160818e-05 at index (4, 5) (up to 1.3e-06 allowed)
@@ -2898,7 +3252,7 @@ def __init__(
                 self,
                 **kwargs,
             ):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(
                     64,
                     5,
@@ -2924,6 +3278,19 @@ def forward(self, x, y):
         self.assertEqual(y, opt_y)
         self.assertEqual(y.stride(), opt_y.stride())
 
+    def test_cat_inplace(self):
+        def fn(x):
+            rt = torch.cat([x])
+            v = x.sin_()
+            return rt
+
+        # can't use self.common because input is modified inplace
+        inp = torch.ones(2)
+        opt_fn = torch.compile(fn)
+        res = opt_fn(inp.clone())
+        expected = fn(inp.clone())
+        self.assertEqual(res, expected)
+
     def test_stack(self):
         def fn(a, b):
             return torch.stack(
@@ -3398,6 +3765,7 @@ def fn(a):
 
         self.common(fn, (torch.randn([3, 3, 6, 12]),))
 
+    @skip_if_x86_mac()
     def test_upsample_bilinear2d_a(self):
         def fn(a):
             return (
@@ -3405,7 +3773,7 @@ def fn(a):
                 aten.upsample_bilinear2d(a, None, True, [2.0, 2.0]),
             )
 
-        self.common(fn, (torch.randn([2, 4, 37, 38]),))
+        self.common(fn, (torch.randn([2, 4, 37, 38]),), atol=2.5e-5, rtol=1.3e-6)
 
     def test_upsample_bilinear2d_b(self):
         def fn(a):
@@ -3416,6 +3784,8 @@ def fn(a):
             [
                 torch.randn([1, 2, 40, 59]),
             ],
+            atol=2.5e-5,
+            rtol=1.3e-6,
         )
 
     def test_reflection_pad2d(self):
@@ -3550,6 +3920,15 @@ def fn(a):
             fn, (torch.randint(0, 999, size=[2, 4, 4, 4], dtype=torch.float32),)
         )
 
+    def test_constant_pad_float64(self):
+        # Repro for https://github.com/pytorch/pytorch/issues/93351
+        def fn(input):
+            v1 = torch.nn.functional.pad(input, pad=(1, 0))
+            return torch.gt(v1, input)
+
+        x = torch.rand([1, 2, 2, 1], dtype=torch.float64)
+        self.common(fn, (x,))
+
     def test_l1_loss(self):
         def fn(a, b):
             return torch.nn.functional.l1_loss(a, b), torch.nn.functional.mse_loss(a, b)
@@ -3589,6 +3968,41 @@ def fn(x, y):
         self.assertTrue(same(out, inp_clone + inputs[1]))
         self.assertTrue(out is inputs[0])
 
+    # The following 2 tests are meant to check the logic that drops
+    # xmask from triton load/store if xnumel = 1
+    @requires_cuda()
+    def test_single_elem(self):
+        def fn(a):
+            b = a + 1
+            return (b,)
+
+        self.common(fn, (torch.randn(1),))
+
+    @requires_cuda()
+    def test_single_elem_indirect(self):
+        def fn(a, b):
+            c = a[b] + 1
+            return (c,)
+
+        a = torch.randn(1)
+        b = (torch.tensor([0], dtype=torch.int64),)
+
+        self.common(fn, (a, b))
+
+    # This test is meant to check for issues from the logic
+    # that drops xmask from trito load/store if XBLOCK divides xnumel
+
+    @requires_cuda()
+    def test_xblock_divides_xnumel(self):
+        def fn(a):
+            b = a + 1
+            return (b,)
+
+        # assumption is that XBLOCK is always a divisor of 1024
+        # so xmask will be dropped iff xnumel is multiple of 1024
+        self.common(fn, (torch.randn(1024),))
+        self.common(fn, (torch.randn(1025),))
+
     def test_inplace_mixed_dtype_ops(self):
         @torch._dynamo.optimize("inductor")
         def fn(x, y):
@@ -3604,8 +4018,9 @@ def fn(x, y):
         out_eager = (inputs[0] + inputs[1].float()).add_(inputs[1]).mul_(inputs[1])
         self.assertTrue(same(out, out_eager))
 
-    @patch.object(config.triton, "ordered_kernel_names", True)
-    @patch.object(config.triton, "descriptive_kernel_names", False)
+    @config.patch(
+        {"triton.ordered_kernel_names": True, "triton.descriptive_kernel_names": False}
+    )
     def test_kernel_names(self):
         @torch._dynamo.optimize("inductor")
         def fn(x):
@@ -3614,7 +4029,7 @@ def fn(x):
         inputs = (rand_strided((8,), (1,), device=self.device),)
         self.assertTrue(same(fn(*inputs), 2 * inputs[0]))
 
-    @patch.object(config.triton, "cudagraphs", True)
+    @config.patch({"triton.cudagraphs": True})
     def test_strided_inputs(self):
         @torch._dynamo.optimize("inductor")
         def fn(x, y):
@@ -3626,7 +4041,7 @@ def fn(x, y):
         )
         self.assertTrue(same(fn(*inputs), inputs[0] + inputs[1]))
 
-    @patch.object(config.triton, "cudagraphs", True)
+    @config.patch({"triton.cudagraphs": True})
     @patch.object(functorch_config, "use_fake_tensor", True)
     def test_input_mutation1(self):
         def fn(a):
@@ -3729,7 +4144,9 @@ def fn(a):
         opt_fn = torch._dynamo.optimize_assert(compile_fx)(fn)
         opt_fn(arg2)
 
-        self.assertTrue(same(arg1, arg2))
+        # TODO, fix: See https://github.com/pytorch/pytorch/issues/94693
+        if self.device != "cpu":
+            self.assertTrue(same(arg1, arg2))
 
     def test_indirect_load_broadcast(self):
         def fn(in_ptr0, in_ptr1, in_ptr2):
@@ -3842,7 +4259,7 @@ def fn(a, b, c):
             rtol=0.001,
         )
 
-    @patch.object(config.triton, "max_tiles", 2)
+    @config.patch({"triton.max_tiles": 2})
     def test_fuse_tiled(self):
         def fn(a, b, c):
             return a + b, c + 1
@@ -3992,7 +4409,7 @@ def fn(a, b, c, d, e):
             ),
         )
 
-    @patch.object(config, "fallback_random", True)
+    @config.patch(fallback_random=True)
     def test_bernoulli1(self):
         def fn(a):
             b = torch.empty_like(a)
@@ -4027,7 +4444,22 @@ def fn(x):
                 aten.as_strided(x + 1, (8, 8, 64), (8 * 64, 64, 1), 0) + 2,
             )
 
+        def fn_channels_last(x):
+            return (
+                aten.as_strided(
+                    x, (8, 384, 2, 20, 12), (153600, 1, 61440, 384, 7680), 0
+                ),
+                aten.as_strided(
+                    x + 1, (8, 384, 2, 20, 12), (153600, 1, 61440, 384, 7680), 0
+                )
+                + 2,
+            )
+
         self.common(fn, [torch.randn(64, 64)])
+        self.common(
+            fn_channels_last,
+            [torch.randn(8, 384, 20, 20).to(memory_format=torch.channels_last)],
+        )
 
     def test_as_strided_scatter(self):
         def fn(a, b):
@@ -4258,7 +4690,7 @@ def fn(a):
 
         self.common(fn, [torch.randn(55)], assert_equal=False)
 
-    @patch.object(torch._inductor.config.triton, "cudagraphs", True)
+    @config.patch({"triton.cudagraphs": True})
     def test_dropout(self):
         random.seed(1234)
         torch.manual_seed(1234)
@@ -4289,7 +4721,7 @@ def fn(a):
             return torch.nn.functional.dropout(a, 0.55, True)
 
         for cg in (False, True):
-            with patch.object(torch._inductor.config.triton, "cudagraphs", cg):
+            with patch.object(config.triton, "cudagraphs", cg):
                 torch._dynamo.reset()
 
                 x = torch.ones(1024, device=self.device, dtype=torch.float32)
@@ -4346,6 +4778,30 @@ def fn(a):
         self.assertTrue((d >= 0).all())
         self.assertTrue((d < 1).all())
 
+    def test_randn_like_empty(self):
+        class Model(torch.nn.Module):
+            def __init__(
+                self,
+            ):
+                super().__init__()
+
+            def forward(self, v1: torch.Tensor):
+                vx = v1.min(dim=1).values
+                v2 = torch.randn_like(vx)
+                return v2
+
+        model = Model()
+        x = torch.rand(10, 3, 0)
+
+        self.common(model, (x,))
+
+    @config.patch(fallback_random=True)
+    def test_like_rands(self):
+        def fn(x):
+            return torch.rand_like(x), torch.randn_like(x)
+
+        self.common(fn, [torch.zeros([20, 20])])
+
     def test_max_pool2d_with_indices_backward(self):
         def fn(a, b, c):
             return aten.max_pool2d_with_indices_backward(
@@ -4564,6 +5020,7 @@ def fn(a, b):
         )
         self.assertEqual(torch._inductor.metrics.generated_kernel_count, 0)
 
+    @config.patch(search_autotune_cache=False)
     def test_mm_views(self):
         def fn(a, b):
             return torch.mm(a.view(32, 32), b.view(32, 32))
@@ -4582,7 +5039,7 @@ def fn(a, b):
             torch._inductor.metrics.generated_kernel_count, expected_kernel
         )
 
-    @patch.object(config.triton, "cudagraphs", False)
+    @config.patch({"triton.cudagraphs": False})
     def test_lowmem_dropout1(self):
         n = 100000
         weight = torch.ones(
@@ -4636,6 +5093,7 @@ def check(r, g):
         self.assertTrue(same(r2, r3))
         self.assertTrue(same(g2, g3))
 
+    @config.patch(search_autotune_cache=False)
     def test_lowmem_dropout2(self):
         m = torch.nn.Sequential(
             torch.nn.Linear(32, 32, bias=False),
@@ -4671,6 +5129,16 @@ def fn(a):
             ],
         )
 
+    def test_argmax_min_int32(self):
+        # https://github.com/pytorch/pytorch/issues/94055
+        def fn(a, b):
+            c = a.argmax(3)
+            return torch.min(b, c)
+
+        a = torch.rand(3, 4, 2, 1).int()
+        b = torch.rand(2, 2, 1, 4, 1).int()
+        self.common(fn, (a, b))
+
     def test_argmax_argmin1(self):
         def fn(x):
             return (aten.argmax(x), aten.argmin(x))
@@ -4705,7 +5173,6 @@ def fn(x):
 
     def test_conv_backward(self):
         def fn(rank4_inps, rank3_inps, rank5_inps):
-
             out1 = aten.convolution_backward(
                 *rank4_inps,
                 [C],
@@ -4769,7 +5236,7 @@ def shrink_rank(x, rank):
         rank3_inps = [shrink_rank(x, 4) for x in [grad_out, inp, weight]]
         rank5_inps = [shrink_rank(x, 5) for x in [grad_out, inp, weight]]
 
-        with torch.backends.cudnn.flags(allow_tf32=False):
+        with torch.backends.cudnn.flags(enabled=True, allow_tf32=False):
             self.common(
                 fn,
                 [rank4_inps, rank3_inps, rank5_inps],
@@ -4815,32 +5282,22 @@ def forward(
             div_default,
             reciprocal_default,
         ):
-            var_default = torch.ops.prims.var.default(
+            var_default = torch.ops.aten.var(
                 convert_element_type_default, [2], correction=0
             )
             sub_tensor = torch.ops.aten.sub.Tensor(add_tensor, div_default)
             mul_tensor_1 = torch.ops.aten.mul.Tensor(sub_tensor, reciprocal_default)
             mul_tensor_2 = torch.ops.aten.mul.Tensor(mul_tensor_1, primals_3)
             add_tensor_2 = torch.ops.aten.add.Tensor(mul_tensor_2, primals_4)
-            convert_element_type_default_1 = (
-                torch.ops.prims.convert_element_type.default(
-                    add_tensor_2, torch.float32
-                )
-            )
-            convert_element_type_default_2 = (
-                torch.ops.prims.convert_element_type.default(
-                    convert_element_type_default_1, torch.float32
-                )
+            convert_element_type_default_1 = add_tensor_2.to(dtype=torch.float32)
+            convert_element_type_default_2 = convert_element_type_default_1.to(
+                dtype=torch.float32
             )
-            var_default_1 = torch.ops.prims.var.default(
+            var_default_1 = torch.ops.aten.var(
                 convert_element_type_default_2, [2], correction=0
             )
-            broadcast_in_dim_default_2 = torch.ops.prims.broadcast_in_dim.default(
-                var_default_1, [1, 512, 1], [0, 1]
-            )
-            sum_default_1 = torch.ops.prims.sum.default(
-                convert_element_type_default_2, [2]
-            )
+            broadcast_in_dim_default_2 = var_default_1.reshape(1, 512, 1)
+            sum_default_1 = convert_element_type_default_2.sum(2)
             add_tensor_3 = torch.ops.aten.add.Tensor(broadcast_in_dim_default_2, 1e-05)
             return (var_default, sum_default_1, add_tensor_3)
 
@@ -4867,11 +5324,13 @@ def forward(arg38_1, arg81_1, getitem_17, new_zeros_default_4):
             sum_default_7 = torch.ops.aten.sum.default(mul_tensor_24)
             return (new_zeros_default_4, sum_default_7)
 
+        # TODO: Remove once https://github.com/pytorch/pytorch/issues/94017 is resolved
+        dtype = torch.float64 if self.device == "cpu" else torch.float32
         args = [
-            ((1, 88, 40, 40), (140800, 1600, 40, 1), torch.float32),
-            ((), (), torch.float32),
-            ((1, 88, 40, 40), (140800, 1600, 40, 1), torch.float32),
-            ((3,), (1,), torch.float32),
+            ((1, 88, 40, 40), (140800, 1600, 40, 1), dtype),
+            ((), (), dtype),
+            ((1, 88, 40, 40), (140800, 1600, 40, 1), dtype),
+            ((3,), (1,), dtype),
         ]
         args = [
             rand_strided(shape, stride, dtype).requires_grad_(True).add(1)
@@ -4958,8 +5417,30 @@ def test_zero_dim_reductions(self):
 
             self.assertTrue(torch.allclose(actual, expected, atol=1e-3, rtol=1e-3))
 
-    @requires_cuda()
+    def test_lerp(self):
+        # non-contiguous inputs for lerp
+        def fn0(i0, i1):
+            x1 = i0.transpose(-2, -3)
+            return torch.lerp(i1, x1, 70000)
+
+        # contiguous inputs for lerp
+        def fn1(i0, i1):
+            return torch.lerp(i1, i0, 70000)
+
+        def compare(fn, inputs):
+            compiled = torch._dynamo.optimize("inductor")(fn)
+            expected = fn(*inputs)
+            actual = compiled(*inputs)
+            self.assertEqual(expected, actual)
+            self.assertEqual(expected.stride(), actual.stride())
+
+        compare(fn0, [torch.rand(10, 3, 10), torch.rand(3, 10, 10)])
+        compare(fn1, [torch.rand(3, 10, 10), torch.rand(3, 10, 10)])
+
     def test_unspec_inputs(self):
+        if self.device == "cpu":
+            raise unittest.SkipTest("segfault with CPU backend")
+
         def fn(x, y):
             return x + y, x * y, x / y
 
@@ -4983,13 +5464,12 @@ def fn(x, y):
             self.assertTrue(same(opt(*inputs), fn(*inputs)))
 
     def test_list_clearing(self):
-
         if self.device == "cpu":
             contexts = [contextlib.nullcontext]
         else:
             contexts = [
                 contextlib.nullcontext,
-                lambda: patch.object(config.triton, "cudagraphs", True),
+                lambda: config.patch({"triton.cudagraphs": True}),
             ]
 
         for context in contexts:
@@ -5064,7 +5544,29 @@ def fn(x, y):
             [torch.randn((4, 2)), torch.randn((4))],
         )
 
-    @patch.object(config, "profiler_mark_wrapper_call", True)
+    @torch._dynamo.config.patch(dynamic_shapes=True)
+    def test_int_input_dynamic_shapes(self):
+        @torch.compile(dynamic=True)
+        def fn(x, i):
+            y = x * i
+            return y
+
+        # Constant must not get matched as constant
+        self.common(fn, [torch.randn(3, 1, 1, 1, 1), 9132])
+
+    @unittest.skipIf(HAS_CUDA, "test in_out_ptr for CppKernel")
+    def test_in_out_buffer(self):
+        def fn(x, y):
+            z = torch.matmul(x, y.transpose(-1, -2)) / 8.0
+            return z
+
+        inps = [torch.randn(1, 2, 8, 4), torch.randn(1, 2, 8, 4)]
+        fn_opt = torch._dynamo.optimize("inductor")(fn)
+        code = run_and_get_cpp_code(fn_opt, inps)
+        self.assertTrue("in_out_ptr" in code)
+        self.assertEqual(fn_opt(*inps), fn(*inps))
+
+    @config.patch(profiler_mark_wrapper_call=True)
     def test_profiler_mark_wrapper_call(self):
         from torch.profiler import profile
 
@@ -5080,7 +5582,7 @@ def fn(a, b):
             e.name for e in prof.profiler.function_events
         )
 
-    @patch.object(config, "cpp_wrapper", True)
+    @config.patch(cpp_wrapper=True, search_autotune_cache=False)
     def test_cpp_wrapper(self):
         if self.device == "cuda":
             raise unittest.SkipTest("cpp_wrapper only supports cpu")
@@ -5110,6 +5612,7 @@ def test_cpp_wrapper(self):
             assert callable(func), "not a callable"
             func()
 
+    @unittest.skipIf(IS_X86 and not HAS_AVX2, "Requires AVX2")
     def test_pixel_shuffle_channels_last(self):
         def fn(x):
             x = torch.nn.functional.pixel_shuffle(x, 2)
@@ -5121,167 +5624,69 @@ def fn(x):
             (torch.randn(1, 16, 64, 72).to(memory_format=torch.channels_last),),
         )
 
+    def test_where_broadcast(self):
+        # https://github.com/pytorch/pytorch/issues/93374
+        def fn(x, p1, p0):
+            o = torch.where(x, p1, p0)
+            return o
+
+        # https://github.com/pytorch/pytorch/issues/94725
+        class Repro(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.register_buffer(
+                    "_tensor_constant0", torch.randn([], dtype=torch.float32)
+                )
+
+            def forward(self, arg0_1, arg1_1):
+                convert_element_type = torch.ops.prims.convert_element_type.default(
+                    arg1_1, torch.bool
+                )
+                bitwise_not = torch.ops.aten.bitwise_not.default(convert_element_type)
+                _tensor_constant0 = self._tensor_constant0
+                lift_fresh_copy = torch.ops.aten.lift_fresh_copy.default(
+                    _tensor_constant0
+                )
+                where = torch.ops.aten.where.self(bitwise_not, lift_fresh_copy, arg0_1)
+                return (where, bitwise_not)
+
+        self.common(
+            fn,
+            (torch.tensor([[True]]), torch.rand(13, 7, 3), torch.rand(1, 1)),
+        )
+
+        if not torch._dynamo.config.dynamic_shapes:
+            args = [
+                torch.randn(1, 4, 64, 64),
+                torch.zeros(1, 1, 64, 64, dtype=torch.uint8),
+            ]
+            args[1][:, :, :32, :32] = 1
+            eager_args = [x.clone() for x in args]
+            eager_mod = Repro()
+            mod = make_fx(eager_mod, tracing_mode="real")(*args)
+            compiled = compile_fx_inner(mod, args)
+            inductor_out = compiled(args)
+            eager_out = eager_mod(*eager_args)
+            self.assertEqual(inductor_out, eager_out)
+
+    def test_where_with_logical_op(self):
+        def fn_and(x, y):
+            return torch.where(torch.logical_and(x, y), 1.0, 0.0)
+
+        def fn_or(x, y):
+            return torch.where(torch.logical_or(x, y), 1.0, 0.0)
+
+        self.common(
+            fn_and,
+            (torch.randn(32), torch.randn(32)),
+        )
+        self.common(
+            fn_or,
+            (torch.randn(32), torch.randn(32)),
+        )
+
 
-test_skips = {
-    "test_add_inplace_permuted_dynamic_shapes": ("cuda",),
-    "test_addmm_dynamic_shapes": ("cuda",),
-    "test_alexnet_prefix_dynamic_shapes": ("cpu", "cuda"),
-    "test_any_dynamic_shapes": ("cuda",),
-    "test_argmax_argmin2_dynamic_shapes": ("cuda",),
-    "test_as_strided_dynamic_shapes": ("cuda",),
-    "test_as_strided_scatter_dynamic_shapes": ("cuda",),
-    "test_avg_pool2d1_dynamic_shapes": ("cuda",),
-    "test_avg_pool2d2_dynamic_shapes": ("cuda",),
-    "test_avg_pool2d3_dynamic_shapes": ("cuda",),
-    "test_avg_pool2d4_dynamic_shapes": ("cuda",),
-    "test_avg_pool2d5_dynamic_shapes": ("cuda",),
-    "test_avg_pool2d6_dynamic_shapes": ("cuda",),
-    "test_avg_pool2d_backward2_dynamic_shapes": ("cuda",),
-    "test_avg_pool2d_backward3_dynamic_shapes": ("cuda",),
-    "test_avg_pool2d_backward_dynamic_shapes": ("cuda",),
-    "test_baddbmm_dynamic_shapes": ("cpu", "cuda"),
-    "test_batch_norm_2d_dynamic_shapes": ("cuda",),
-    "test_cat_dynamic_shapes": ("cuda",),
-    "test_cat_extern_kernel_dynamic_shapes": ("cuda",),
-    "test_cat_upcasting_dynamic_shapes": ("cuda",),
-    "test_cauchy_dynamic_shapes": ("cuda",),
-    "test_clamp_dynamic_shapes": ("cuda",),
-    "test_clone_dynamic_shapes": ("cuda",),
-    "test_conv2d_binary_dynamic_shapes": ("cpu",),
-    "test_conv2d_packed_dynamic_shapes": ("cpu",),
-    "test_conv2d_unary_dynamic_shapes": ("cpu",),
-    "test_conv_bn_fuse_dynamic_shapes": ("cpu",),
-    "test_conv_functional_bn_fuse_dynamic_shapes": ("cpu",),
-    "test_cos_dynamic_shapes": ("cuda",),
-    "test_cpp_wrapper_dynamic_shapes": ("cpu",),
-    "test_cudnn_rnn_dynamic_shapes": ("cuda",),
-    "test_div1_dynamic_shapes": ("cuda",),
-    "test_div2_dynamic_shapes": ("cuda",),
-    "test_div3_dynamic_shapes": ("cuda",),
-    "test_div4_dynamic_shapes": ("cuda",),
-    "test_div5_dynamic_shapes": ("cuda",),
-    "test_div6_dynamic_shapes": ("cuda",),
-    "test_div7_dynamic_shapes": ("cuda",),
-    "test_elu_dynamic_shapes": ("cuda",),
-    "test_exp2_dynamic_shapes": ("cuda",),
-    "test_exp_dynamic_shapes": ("cuda",),
-    "test_expand_as_dynamic_shapes": ("cuda",),
-    "test_expanded_reduction_dynamic_shapes": ("cuda",),
-    "test_fill1_dynamic_shapes": ("cuda",),
-    "test_fill2_dynamic_shapes": ("cuda",),
-    "test_flip_dynamic_shapes": ("cuda",),
-    "test_fuse_tiled_dynamic_shapes": ("cuda",),
-    "test_gather_scatter_dynamic_shapes": ("cuda",),
-    "test_gelu_dynamic_shapes": ("cuda",),
-    "test_grid_sampler_2d_dynamic_shapes": ("cpu", "cuda"),
-    "test_horizonal_fusion1_dynamic_shapes": ("cuda",),
-    "test_index1_dynamic_shapes": ("cuda",),
-    "test_index2_dynamic_shapes": ("cuda",),
-    "test_index_put1_dynamic_shapes": ("cuda",),
-    "test_index_put2_dynamic_shapes": ("cuda",),
-    "test_index_put3_dynamic_shapes": ("cuda",),
-    "test_index_select_dynamic_shapes": ("cuda",),
-    "test_indirect_load_broadcast_dynamic_shapes": ("cpu", "cuda"),
-    "test_inplace_add_dynamic_shapes": ("cpu", "cuda"),
-    "test_inplace_mixed_dtype_ops_dynamic_shapes": ("cpu", "cuda"),
-    "test_input_mutation2_dynamic_shapes": ("cpu", "cuda"),
-    "test_invalid_operand_issue1_dynamic_shapes": ("cpu", "cuda"),
-    "test_kwargs_dynamic_shapes": ("cpu",),
-    "test_l1_loss_dynamic_shapes": ("cuda",),
-    "test_leaky_relu_dynamic_shapes": ("cuda",),
-    "test_lgamma_dynamic_shapes": ("cuda",),
-    "test_linear_binary_dynamic_shapes": ("cpu",),
-    "test_linear_packed_dynamic_shapes": ("cpu",),
-    "test_linear_unary_dynamic_shapes": ("cpu",),
-    "test_list_clearing_dynamic_shapes": ("cpu", "cuda"),
-    "test_log_softmax_dynamic_shapes": ("cuda",),
-    "test_logsumexp_dynamic_shapes": ("cuda",),
-    "test_long_tensor_dynamic_shapes": ("cuda",),
-    "test_lowmem_dropout1_dynamic_shapes": ("cpu", "cuda"),
-    "test_lowmem_dropout2_dynamic_shapes": ("cpu", "cuda"),
-    "test_masked_fill_dynamic_shapes": ("cuda",),
-    "test_masked_fill_promotion_dynamic_shapes": ("cuda",),
-    "test_max_pool2d1_dynamic_shapes": ("cuda",),
-    "test_max_pool2d2_dynamic_shapes": ("cuda",),
-    "test_max_pool2d3_dynamic_shapes": ("cuda",),
-    "test_max_pool2d4_dynamic_shapes": ("cuda",),
-    "test_max_pool2d5_dynamic_shapes": ("cuda",),
-    "test_max_pool2d_with_indices_backward2_dynamic_shapes": ("cuda",),
-    "test_max_pool2d_with_indices_backward3_dynamic_shapes": ("cuda",),
-    "test_max_pool2d_with_indices_backward4_dynamic_shapes": ("cuda",),
-    "test_max_pool2d_with_indices_backward_dynamic_shapes": ("cuda",),
-    "test_mean_dynamic_shapes": ("cuda",),
-    "test_min_max_reduction_dynamic_shapes": ("cuda",),
-    "test_move_arange_dynamic_shapes": ("cpu", "cuda"),
-    "test_narrow_dynamic_shapes": ("cuda",),
-    "test_nll_loss_forward_dynamic_shapes": ("cpu", "cuda"),
-    "test_output_strides_dynamic_shapes": ("cpu", "cuda"),
-    "test_permute1_dynamic_shapes": ("cuda",),
-    "test_permute2_dynamic_shapes": ("cpu", "cuda"),
-    "test_pixel_shuffle_channels_last_dynamic_shapes": ("cpu",),
-    "test_pow1_dynamic_shapes": ("cuda",),
-    "test_pow2_dynamic_shapes": ("cuda",),
-    "test_rand_like_deterministic_dynamic_shapes": ("cpu", "cuda"),
-    "test_recompile_on_index_dynamic_shapes": ("cpu", "cuda"),
-    "test_reduction4_dynamic_shapes": ("cuda",),
-    "test_relu_dynamic_shapes": ("cuda",),
-    "test_repeat_dynamic_shapes": ("cuda",),
-    "test_roi_align_dynamic_shapes": ("cpu",),
-    "test_roll_dynamic_shapes": ("cuda",),
-    "test_round_dynamic_shapes": ("cuda",),
-    "test_scatter4_dynamic_shapes": ("cuda",),
-    "test_scatter_add2_dynamic_shapes": ("cuda",),
-    "test_scatter_reduce2_dynamic_shapes": ("cuda",),
-    "test_scheduler_vertical_fusion1_dynamic_shapes": ("cuda",),
-    "test_select_scatter_dynamic_shapes": ("cuda",),
-    "test_sigmoid_dynamic_shapes": ("cuda",),
-    "test_silu_dynamic_shapes": ("cuda",),
-    "test_simplify_loops_dynamic_shapes": ("cuda",),
-    "test_sin_dynamic_shapes": ("cuda",),
-    "test_sizehint_issue1_dynamic_shapes": ("cpu", "cuda"),
-    "test_slice1_dynamic_shapes": ("cuda",),
-    "test_slice2_dynamic_shapes": ("cuda",),
-    "test_slice_mutation1_dynamic_shapes": ("cuda",),
-    "test_slice_scatter_dynamic_shapes": ("cuda",),
-    "test_softmax_dynamic_shapes": ("cuda",),
-    "test_softmax_one_kernel_dynamic_shapes": ("cuda",),
-    "test_split_with_sizes_dynamic_shapes": ("cuda",),
-    "test_squeeze2_dynamic_shapes": ("cuda",),
-    "test_std_dynamic_shapes": ("cuda",),
-    "test_strided_inputs_dynamic_shapes": ("cpu", "cuda"),
-    "test_sum1_dynamic_shapes": ("cuda",),
-    "test_sum2_dynamic_shapes": ("cuda",),
-    "test_sum3_dynamic_shapes": ("cuda",),
-    "test_sum4_dynamic_shapes": ("cuda",),
-    "test_sum5_dynamic_shapes": ("cuda",),
-    "test_sum_dtype_dynamic_shapes": ("cuda",),
-    "test_sum_keepdims_dynamic_shapes": ("cuda",),
-    "test_tanh_dynamic_shapes": ("cuda",),
-    "test_tmp_not_defined_issue1_dynamic_shapes": ("cuda",),
-    "test_tmp_not_defined_issue2_dynamic_shapes": ("cpu", "cuda"),
-    "test_to_memory_format_dynamic_shapes": ("cuda",),
-    "test_transpose_add_dynamic_shapes": ("cuda",),
-    "test_transpose_dynamic_shapes": ("cuda",),
-    "test_transposed_propagates_dynamic_shapes": ("cuda",),
-    "test_triu_dynamic_shapes": ("cuda",),
-    "test_unroll_small_reduction_dynamic_shapes": ("cpu", "cuda"),
-    "test_unspec_inputs_dynamic_shapes": ("cpu", "cuda"),
-    "test_unsqueeze_dynamic_shapes": ("cuda",),
-    "test_unsqueeze_inplace_dynamic_shapes": ("cuda",),
-    "test_upsample_bilinear2d_a_dynamic_shapes": ("cpu", "cuda"),
-    "test_upsample_bilinear2d_b_dynamic_shapes": ("cpu", "cuda"),
-    "test_upsample_nearest1d_dynamic_shapes": ("cpu", "cuda"),
-    "test_upsample_nearest2d_backward_dynamic_shapes": ("cpu", "cuda"),
-    "test_upsample_nearest2d_dynamic_shapes": ("cpu", "cuda"),
-    "test_upsample_nearest3d_dynamic_shapes": ("cpu", "cuda"),
-    "test_var_mean_dynamic_shapes": ("cuda",),
-    "test_vertical_fusion1_dynamic_shapes": ("cuda",),
-    "test_views1_dynamic_shapes": ("cuda",),
-    "test_views3_dynamic_shapes": ("cpu",),
-}
-
-
-def copy_tests(my_cls, other_cls, suffix):  # noqa: B902
+def copy_tests(my_cls, other_cls, suffix, test_skips=None):  # noqa: B902
     for name, value in my_cls.__dict__.items():
         if name.startswith("test_"):
             # You cannot copy functions in Python, so we use lambdas here to
@@ -5289,7 +5694,7 @@ def copy_tests(my_cls, other_cls, suffix):  # noqa: B902
             # would modify all methods sharing the same object id. Also, by
             # using a default argument in a lambda, we create a copy instead of
             # a reference. Otherwise, we would lose access to the value.
-            skips = test_skips.get(name)
+            skips = test_skips and test_skips.get(name)
             if skips and suffix in skips:
                 setattr(
                     other_cls,
@@ -5302,20 +5707,6 @@ def copy_tests(my_cls, other_cls, suffix):  # noqa: B902
                 )
 
 
-def make_dynamic_cls(cls):
-    return make_test_cls_with_patches(
-        cls,
-        "DynamicShapes",
-        "_dynamic_shapes",
-        (config, "dynamic_shapes", True),
-        (torch._dynamo.config, "dynamic_shapes", True),
-        (functorch_config, "use_dynamic_shapes", True),
-    )
-
-
-DynamicShapesCommonTemplate = make_dynamic_cls(CommonTemplate)
-
-
 if HAS_CPU:
 
     class SweepInputsCpuTest(SweepInputs2, TestCase):
@@ -5328,7 +5719,6 @@ class CpuTests(TestCase):
         device = "cpu"
 
     copy_tests(CommonTemplate, CpuTests, "cpu")
-    copy_tests(DynamicShapesCommonTemplate, CpuTests, "cpu")
 
     class CPUReproTests(TestCase):
         def test_conv_stride_constraints(self):
@@ -5389,6 +5779,26 @@ def fn(x):
             # aten parallel.
             assert same(result, mod(v), tol=5e-1)
 
+        @unittest.skipIf(
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
+        def test_sigmoid_with_reduction(self):
+            def fn(x):
+                x = torch.ops.aten.sigmoid.default(x)
+                return torch.ops.aten.mean.dim(x, [-1, -2], True)
+
+            x = torch.randn((1, 8, 8, 8))
+            with config.patch({"cpp.simdlen": None}):
+                torch._dynamo.reset()
+                metrics.reset()
+                opt_fn = torch._dynamo.optimize("inductor")(fn)
+                opt_fn(x)
+
+                real_out = fn(x)
+                compiled_out = opt_fn(x)
+                assert same(real_out, compiled_out, equal_nan=True)
+
         def test_inplace_add_alpha(self):
             def fn(x, y):
                 aten.add_.Tensor(x, y, alpha=0.55)
@@ -5459,15 +5869,13 @@ def test_complex_memory_overlap(self):
         @unittest.skipIf(
             not codecache.valid_vec_isa_list(), "Does not support vectorization"
         )
-        @patch.object(config, "dynamic_shapes", True)
-        @patch.object(torch._dynamo.config, "dynamic_shapes", True)
-        @patch.object(functorch_config, "use_dynamic_shapes", True)
+        @torch._dynamo.config.patch(dynamic_shapes=True)
         def test_vec_dynamic_shapes(self):
             def fn(x):
                 return torch.softmax(x, -1)
 
             value = torch.randn((2, 10))
-            with patch.object(config.cpp, "simdlen", None):
+            with config.patch({"cpp.simdlen": None}):
                 torch._dynamo.reset()
                 metrics.reset()
                 opt_fn = torch._dynamo.optimize("inductor")(fn)
@@ -5491,37 +5899,37 @@ def test_auto_simd(self):
             self.assertTrue(vec_avx512.nelements(torch.bfloat16) == 32)
             self.assertTrue(vec_avx2.nelements(torch.bfloat16) == 16)
 
-            with patch.object(config.cpp, "simdlen", None):
+            with config.patch({"cpp.simdlen": None}):
                 isa = codecache.pick_vec_isa()
                 if vec_avx512 in codecache.valid_vec_isa_list():
                     self.assertTrue(isa == vec_avx512)
                 else:
                     self.assertTrue(isa == vec_avx2)
 
-            with patch.object(config.cpp, "simdlen", 0):
+            with config.patch({"cpp.simdlen": 0}):
                 isa = codecache.pick_vec_isa()
                 self.assertFalse(isa)
 
-            with patch.object(config.cpp, "simdlen", 1):
+            with config.patch({"cpp.simdlen": 1}):
                 isa = codecache.pick_vec_isa()
                 self.assertFalse(isa)
 
-            with patch.object(config.cpp, "simdlen", 257):
+            with config.patch({"cpp.simdlen": 257}):
                 isa = codecache.pick_vec_isa()
                 self.assertFalse(isa)
 
-            with patch.object(config.cpp, "simdlen", 513):
+            with config.patch({"cpp.simdlen": 513}):
                 isa_list = codecache.valid_vec_isa_list()
                 if vec_avx512 in isa_list:
                     self.assertFalse(isa)
 
-            with patch.object(config.cpp, "simdlen", 512):
+            with config.patch({"cpp.simdlen": 512}):
                 isa_list = codecache.valid_vec_isa_list()
                 if vec_avx512 in isa_list:
                     isa = codecache.pick_vec_isa()
                     self.assertTrue(isa == vec_avx512)
 
-            with patch.object(config.cpp, "simdlen", 256):
+            with config.patch({"cpp.simdlen": 256}):
                 isa_list = codecache.valid_vec_isa_list()
                 if vec_avx2 in isa_list:
                     isa = codecache.pick_vec_isa()
@@ -5539,7 +5947,7 @@ def fn(value, mask):
 
             value = torch.randn((2, 17))
             mask = torch.randint(0, 1, size=(2, 17), dtype=torch.uint8)
-            with patch.object(config.cpp, "simdlen", None):
+            with config.patch({"cpp.simdlen": None}):
                 torch._dynamo.reset()
                 metrics.reset()
                 opt_fn = torch._dynamo.optimize("inductor")(fn)
@@ -5597,7 +6005,7 @@ def fn(x):
             x[0, 0] = torch.nan
             x[1, -1] = torch.nan
 
-            with patch.object(config.cpp, "simdlen", None):
+            with config.patch({"cpp.simdlen": None}):
                 torch._dynamo.reset()
                 metrics.reset()
                 traced = make_fx(fn)(x)
@@ -5611,7 +6019,7 @@ def fn(x):
         @patch("torch.cuda.is_available", lambda: False)
         def test_vec_cpu_only_for_all_available_isa(self):
             def fn(x):
-                return (torch.erf(x),)
+                return (torch.sin(torch.cos(torch.erf(x))),)
 
             x = torch.randn((2, 9))
             x[0, 0] = torch.nan
@@ -5621,7 +6029,7 @@ def fn(x):
                 None
             ]
             for item in bit_widths:
-                with patch.object(config.cpp, "simdlen", item):
+                with config.patch({"cpp.simdlen": item}):
                     torch._dynamo.reset()
                     metrics.reset()
                     traced = make_fx(fn)(x)
@@ -5629,6 +6037,59 @@ def fn(x):
                     assert same(fn(x)[0], compiled([x])[0], equal_nan=True)
                     assert metrics.generated_cpp_vec_kernel_count == 1
 
+        @unittest.skipIf(
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
+        def test__adaptive_avg_pool2d(self):
+            def wrap_fn(oh, ow):
+                def fn(x):
+                    return torch._adaptive_avg_pool2d(x, (oh, ow))
+
+                return fn
+
+            bit_widths = [isa._bit_width for isa in codecache.valid_vec_isa_list()]
+            ih = [16, 65]
+            iw = ih
+            oh = ih
+            ow = ih
+            for _ih, _iw, _oh, _ow, _simd_len in itertools.product(
+                ih, iw, oh, ow, bit_widths
+            ):
+                x = torch.randn(2, 3, _ih, _iw).to(memory_format=torch.channels_last)
+                _fn = wrap_fn(_oh, _ow)
+                with config.patch({"cpp.simdlen": _simd_len}):
+                    torch._dynamo.reset()
+                    metrics.reset()
+                    compiled = torch.compile(_fn)
+                    compiled(x)
+                    assert same(_fn(x), compiled(x), equal_nan=True)
+                    assert metrics.generated_cpp_vec_kernel_count == 1
+
+        @unittest.skipIf(
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
+        def test_vec_logical_and_or(self):
+            def wrap_fn(op: Callable):
+                def fn(x: torch.Tensor, y: torch.Tensor):
+                    return torch.where(op(x, y), 1.0, 0.0)
+
+                return fn
+
+            x = torch.randn(64)
+            y = torch.randn(64)
+            logical_fns = [torch.logical_and, torch.logical_or]
+            for logical_fn in logical_fns:
+                _fn = wrap_fn(logical_fn)
+                torch._dynamo.reset()
+                metrics.reset()
+                compiled = torch.compile(_fn)
+
+                compiled(x, y)
+                assert same(_fn(x, y), compiled(x, y), equal_nan=True)
+                assert metrics.generated_cpp_vec_kernel_count == 1
+
         @unittest.skipIf(
             not codecache.valid_vec_isa_list(), "Does not support vectorization"
         )
@@ -5663,7 +6124,7 @@ def fn(x):
 
             x = torch.randn((2, 9))
 
-            with patch.object(config.cpp, "simdlen", None):
+            with config.patch({"cpp.simdlen": None}):
                 torch._dynamo.reset()
                 metrics.reset()
                 traced = make_fx(fn)(x)
@@ -5675,6 +6136,218 @@ def fn(x):
                     - metrics.generated_cpp_vec_kernel_count
                 ) == 0
 
+        @unittest.skipIf(
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
+        def test_cpp_vec_constant_checker(self):
+            _graph: torch.fx.Graph = torch.fx.Graph()
+            a: torch.fx.Node = _graph.create_node("placeholder", "ops")
+            iv: torch.fx.Node = _graph.create_node("placeholder", "iv")
+            fv: torch.fx.Node = _graph.create_node("placeholder", "fv")
+            b: torch.fx.Node = _graph.create_node(
+                "call_method",
+                "constant",
+                args=(
+                    a,
+                    iv,
+                    torch.int64,
+                ),
+            )
+            c: torch.fx.Node = _graph.create_node(
+                "call_method",
+                "constant",
+                args=(
+                    a,
+                    fv,
+                    torch.double,
+                ),
+            )
+            _graph.output((b, c))
+
+            def get_index():
+                return ""
+
+            submodules = {"get_index": get_index}
+
+            graph_lowering = GraphLowering(
+                torch.fx.GraphModule(submodules, _graph),
+                shape_env=None,
+                num_static_inputs=0,
+            )
+            with patch.object(graph_lowering, "wrapper_code", ""), V.set_graph_handler(
+                graph_lowering
+            ):
+                # The moset inner loop variable is used in the index_expr
+                tiling_factor = codecache.pick_vec_isa().nelements(dtype=torch.float)
+                with CppVecKernelChecker(
+                    args=None, num_threads=1, tiling_factor=tiling_factor
+                ) as vec_checker:
+                    i32_iinfo = np.iinfo(np.int32)
+                    f32_iinfo = np.finfo(np.float32)
+                    InterpreterShim(_graph, submodules).run(
+                        V.get_ops_handler(), i32_iinfo.max, f32_iinfo.max
+                    )
+                    self.assertTrue(vec_checker.simd_vec)
+
+                    vec_checker.simd_vec = True
+                    InterpreterShim(_graph, submodules).run(
+                        V.get_ops_handler(), i32_iinfo.min, f32_iinfo.min
+                    )
+                    self.assertTrue(vec_checker.simd_vec)
+
+                    vec_checker.simd_vec = True
+                    InterpreterShim(_graph, submodules).run(
+                        V.get_ops_handler(), i32_iinfo.min, np.inf
+                    )
+                    self.assertTrue(vec_checker.simd_vec)
+
+                    vec_checker.simd_vec = True
+                    InterpreterShim(_graph, submodules).run(
+                        V.get_ops_handler(), i32_iinfo.min, -np.inf
+                    )
+                    self.assertTrue(vec_checker.simd_vec)
+
+                    vec_checker.simd_vec = True
+                    InterpreterShim(_graph, submodules).run(
+                        V.get_ops_handler(), i32_iinfo.min - 1, f32_iinfo.min
+                    )
+                    self.assertFalse(vec_checker.simd_vec)
+
+                    vec_checker.simd_vec = True
+                    InterpreterShim(_graph, submodules).run(
+                        V.get_ops_handler(), i32_iinfo.max + 1, f32_iinfo.max
+                    )
+                    self.assertFalse(vec_checker.simd_vec)
+
+                    vec_checker.simd_vec = True
+                    InterpreterShim(_graph, submodules).run(
+                        V.get_ops_handler(), i32_iinfo.min, f32_iinfo.min * (1 + 1e-5)
+                    )
+                    self.assertFalse(vec_checker.simd_vec)
+
+                    vec_checker.simd_vec = True
+                    InterpreterShim(_graph, submodules).run(
+                        V.get_ops_handler(), i32_iinfo.max, f32_iinfo.max * (1 + 1e-5)
+                    )
+                    self.assertFalse(vec_checker.simd_vec)
+
+        @unittest.skipIf(
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
+        def test_cpp_vec_index_expr_checker(self):
+            _graph: torch.fx.Graph = torch.fx.Graph()
+            a: torch.fx.Node = _graph.create_node("placeholder", "ops")
+            b: torch.fx.Node = _graph.create_node("call_module", "get_index", args=())
+            c: torch.fx.Node = _graph.create_node(
+                "call_method",
+                "index_expr",
+                args=(
+                    a,
+                    b,
+                    torch.int64,
+                ),
+            )
+            _graph.output(c)
+
+            def get_index():
+                return ""
+
+            submodules = {"get_index": get_index}
+            graph_lowering = GraphLowering(
+                torch.fx.GraphModule(submodules, _graph),
+                shape_env=None,
+                num_static_inputs=0,
+            )
+            with patch.object(graph_lowering, "wrapper_code", ""), V.set_graph_handler(
+                graph_lowering
+            ):
+                itervars = [sympy.Symbol("i"), sympy.Symbol("j"), sympy.Symbol("k")]
+
+                tiling_factor = codecache.pick_vec_isa().nelements(dtype=torch.float)
+                # The moset inner loop variable is used in the index_expr
+                with CppVecKernelChecker(
+                    args=None, num_threads=1, tiling_factor=tiling_factor
+                ) as vec_checker:
+
+                    def get_index():
+                        return -itervars[0] ** 2 + 2 * itervars[0] + itervars[1]
+
+                    ranges = [0, 100, 200]
+                    vec_checker.itervars = itervars[:2]
+                    vec_checker.ranges = ranges[:2]
+                    submodules = {"get_index": get_index}
+                    InterpreterShim(_graph, submodules).run(V.get_ops_handler())
+                    self.assertFalse(vec_checker.simd_vec)
+
+                # Most inner loop variable irrevalant
+                with CppVecKernelChecker(
+                    args=None, num_threads=1, tiling_factor=tiling_factor
+                ) as vec_checker:
+
+                    def get_index():
+                        return -itervars[0] ** 2 + 2 * itervars[0] + itervars[1]
+
+                    ranges = [0, 100, 200]
+                    vec_checker.itervars = itervars
+                    vec_checker.ranges = ranges
+                    submodules = {"get_index": get_index}
+                    InterpreterShim(_graph, submodules).run(V.get_ops_handler())
+                    self.assertTrue(vec_checker.simd_vec)
+
+                i32_iinfo = np.iinfo(np.int32)
+                _max_value = i32_iinfo.max + 1
+                ranges = [_max_value, _max_value, _max_value]
+                # Most inner loop variable irrevalant but max value is greater than
+                # the max value of INT32
+                with CppVecKernelChecker(
+                    args=None, num_threads=1, tiling_factor=tiling_factor
+                ) as vec_checker:
+
+                    def get_index():
+                        return itervars[0]
+
+                    submodules = {"get_index": get_index}
+                    vec_checker.itervars = itervars
+                    vec_checker.ranges = ranges
+                    InterpreterShim(_graph, submodules).run(V.get_ops_handler())
+                    self.assertFalse(vec_checker.simd_vec)
+
+                # Most inner loop variable irrevalant but min value is greater than
+                # the min value of INT32
+                with CppVecKernelChecker(
+                    args=None, num_threads=1, tiling_factor=tiling_factor
+                ) as vec_checker:
+
+                    def get_index():
+                        return -itervars[0] - 2
+
+                    submodules = {"get_index": get_index}
+                    vec_checker.itervars = itervars
+                    vec_checker.ranges = ranges
+                    InterpreterShim(_graph, submodules).run(V.get_ops_handler())
+                    self.assertFalse(vec_checker.simd_vec)
+
+        @unittest.skipIf(
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
+        def test_maxpool2d_cpu_only(self):
+            input = torch.randn(10, 32, 20, 20).to(memory_format=torch.channels_last)
+            maxpool = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+            def func(x):
+                return maxpool(x)
+
+            with patch.object(config.cpp, "simdlen", None):
+                torch._dynamo.reset()
+                metrics.reset()
+                graph = torch.compile(func, backend="inductor")
+                graph(input)
+                assert same(graph(input), func(input), equal_nan=True)
+                assert metrics.generated_cpp_vec_kernel_count == 1
+
         @unittest.skipIf(
             not codecache.valid_vec_isa_list(), "Does not support vectorization"
         )
@@ -5687,7 +6360,7 @@ def fn(x):
             x[0, 0] = torch.nan
             x[1, -1] = torch.nan
 
-            with patch.object(config.cpp, "simdlen", None):
+            with config.patch({"cpp.simdlen": None}):
                 torch._dynamo.reset()
                 metrics.reset()
                 traced = make_fx(fn)(x)
@@ -5739,7 +6412,7 @@ def fn(x1, x2):
             x1 = torch.randn((10, 20))
             x2 = torch.randn((10, 20))
 
-            with patch.object(config.cpp, "simdlen", 1):
+            with config.patch({"cpp.simdlen": 1}):
                 torch._dynamo.reset()
                 metrics.reset()
                 traced = make_fx(fn)(x1, x2)
@@ -5747,7 +6420,7 @@ def fn(x1, x2):
                 assert same(fn(x1, x2)[0], compiled([x1, x2])[0], equal_nan=True)
                 assert metrics.generated_cpp_vec_kernel_count == 0
 
-            with patch.object(config.cpp, "simdlen", None):
+            with config.patch({"cpp.simdlen": None}):
                 torch._dynamo.reset()
                 metrics.reset()
                 traced = make_fx(fn)(x1, x2)
@@ -5777,7 +6450,7 @@ def fn(x1, x2):
             sys.platform != "linux", "cpp kernel profile only support linux now"
         )
         @patch("torch.cuda.is_available", lambda: False)
-        @patch.object(config.cpp, "enable_kernel_profile", True)
+        @config.patch({"cpp.enable_kernel_profile": True})
         def test_cpp_kernel_profile(self):
             from torch.profiler import profile
 
@@ -5811,7 +6484,7 @@ def channel_shuffle(x, groups):
                 return x.contiguous(memory_format=torch.channels_last)
 
             for simdlen in (None, 256, 1):
-                with patch.object(config.cpp, "simdlen", simdlen):
+                with config.patch({"cpp.simdlen": simdlen}):
                     torch._dynamo.reset()
                     metrics.reset()
                     x = torch.randn(64, 58, 28, 28)
@@ -5820,6 +6493,7 @@ def channel_shuffle(x, groups):
                     if simdlen != 1:
                         assert metrics.generated_cpp_vec_kernel_count == 1
 
+        @slow()
         @unittest.skipIf(
             not codecache.valid_vec_isa_list(), "Does not support vectorization"
         )
@@ -5828,7 +6502,7 @@ def test_transpose_with_norm(self):
 
             class Model(torch.nn.Module):
                 def __init__(self):
-                    super(Model, self).__init__()
+                    super().__init__()
                     self.linear = torch.nn.Linear(
                         in_features=256, out_features=1536, bias=True
                     )
@@ -5850,7 +6524,7 @@ def forward(self, x):
 
             x = torch.randn(128, 196, 256)
             for simdlen in (None, 256, 1):
-                with patch.object(config.cpp, "simdlen", simdlen):
+                with config.patch({"cpp.simdlen": simdlen}):
                     for eval_mode in [True, False]:
                         torch._dynamo.reset()
                         metrics.reset()
@@ -5868,7 +6542,7 @@ def fn(a):
                 return a.t().contiguous()
 
             for simdlen in (None, 256, 1):
-                with patch.object(config.cpp, "simdlen", simdlen):
+                with config.patch({"cpp.simdlen": simdlen}):
                     for shape in (
                         (7, 7),
                         (8, 8),
@@ -5886,8 +6560,45 @@ def fn(a):
                         if simdlen != 1:
                             assert metrics.generated_cpp_vec_kernel_count == 1
 
+        def test_transpose_non_contiguous(self):
+            def fn(a):
+                # From part of timm HaloAttn:
+                # (https://github.com/rwightman/pytorch-image-models/blob/main/timm/layers/halo_attn.py#L97).
+                # Fixed https://github.com/pytorch/pytorch/issues/94269 accuracy issue.
+                as_strided = torch.ops.aten.as_strided.default(
+                    a, [1, 384, 2, 20, 12], [153600, 1, 61440, 384, 7680]
+                )
+                as_strided_1 = torch.ops.aten.as_strided.default(
+                    as_strided,
+                    [1, 384, 2, 2, 12, 12],
+                    [153600, 1, 61440, 3072, 7680, 384],
+                )
+                clone_1 = torch.ops.aten.clone.default(
+                    as_strided_1, memory_format=torch.contiguous_format
+                )
+                _unsafe_view_1 = torch.ops.aten._unsafe_view.default(
+                    clone_1, [8, 48, 4, 144]
+                )
+                permute_2 = torch.ops.aten.permute.default(_unsafe_view_1, [0, 2, 3, 1])
+                split_with_sizes = torch.ops.aten.split_with_sizes.default(
+                    permute_2, [16, 32], -1
+                )
+                getitem = split_with_sizes[0]
+                getitem_1 = split_with_sizes[1]
+                permute_3 = torch.ops.aten.permute.default(getitem, [0, 1, 3, 2])
+                expand_1 = torch.ops.aten.expand.default(permute_3, [8, 4, 16, 144])
+                clone_3 = torch.ops.aten.clone.default(
+                    expand_1, memory_format=torch.contiguous_format
+                )
+                return clone_3
+
+            x = torch.randn(1, 384, 20, 20).to(memory_format=torch.channels_last)
+            opt_fn = torch._dynamo.optimize("inductor")(fn)
+            same(fn(x), opt_fn(x))
+            assert metrics.generated_cpp_vec_kernel_count == 0
 
-if HAS_CUDA:
+
+if HAS_CUDA and not TEST_WITH_ASAN:
     import triton
     import triton.language as tl
 
@@ -5909,19 +6620,21 @@ def fn(a):
             )
 
         def test_sink_cat_after_pointwise(self):
-            class TestModule(torch.nn.Module):
-                def forward(self, x, y):
-                    return torch.cat([x, y], dim=-1).view(-1).view(128).tanh()
+            def test_kwarg(x, y):
+                return torch.cat([x, y], dim=-1).view(-1).view(128).tanh()
+
+            def test_arg(x, y):
+                return torch.cat([x, y], -1).view(-1).view(128).tanh()
 
             trace_func = chain_passes(torch.fx.symbolic_trace, sink_cat_after_pointwise)
             inputs = [
                 torch.randn(8, 8, device="cuda"),
                 torch.randn(8, 8, device="cuda"),
             ]
-            module = TestModule()
-            traced = trace_func(module, inputs)
-            self.assertTrue(torch.allclose(module(*inputs), traced(*inputs)))
-            self.assertEqual(count_call_method(traced, "tanh"), 2)
+            for f in [test_kwarg, test_arg]:
+                traced = trace_func(f, inputs)
+                self.assertTrue(torch.allclose(f(*inputs), traced(*inputs)))
+                self.assertEqual(count_call_method(traced, "tanh"), 2)
 
         def test_linear_permute_fusion(self):
             class TestModule(torch.nn.Module):
@@ -5947,12 +6660,9 @@ def forward(self, input: torch.Tensor):
 
             self.assertTrue(torch.allclose(module(input), traced(input)))
 
-        @patch.object(config, "permute_fusion", True)
+        @config.patch(permute_fusion=True)
         def test_permute_fusion(self):
             class Repro(torch.nn.Module):
-                def __init__(self):
-                    super().__init__()
-
                 def forward(self, view, reshape_2):
                     permute = view.permute(0, 2, 1)
                     view = None
@@ -5976,7 +6686,7 @@ def forward(self, view, reshape_2):
             res = opt_mod(*args)
             self.assertTrue(same(ref, res))
 
-        @patch.object(config.triton, "autotune_pointwise", True)
+        @config.patch({"triton.autotune_pointwise": True})
         def test_inplace_add_alpha_autotune(self):
             def fn(x, y):
                 aten.add_.Tensor(x, y, alpha=0.55)
@@ -5994,7 +6704,7 @@ def fn(x, y):
             fn_compiled([x3, y])
             assert same(x2, x3)
 
-        @patch.object(config.triton, "autotune_pointwise", True)
+        @config.patch({"triton.autotune_pointwise": True})
         def test_inplace_buffer_autotune(self):
             def foo(x, y, z):
                 a = x @ y
@@ -6061,7 +6771,6 @@ def forward(self, input: torch.Tensor):
             self.assertTrue(torch.allclose(module(input), traced(input)))
 
     copy_tests(CommonTemplate, CudaTests, "cuda")
-    copy_tests(DynamicShapesCommonTemplate, CudaTests, "cuda")
 
     class CudaReproTests(TestCase):
         common = check_model_cuda
@@ -6130,9 +6839,6 @@ def foo(m, inp):
         @requires_cuda()
         def test_unspec_inputs_interop(self):
             class Repro(torch.nn.Module):
-                def __init__(self):
-                    super().__init__()
-
                 def forward(self, x, y):
                     unsqueeze = torch.ops.aten.unsqueeze.default(x, 4)
                     permute = torch.ops.aten.permute.default(unsqueeze, [0, 1, 2, 4, 3])
@@ -6160,7 +6866,7 @@ def fn(x):
             out = opt_fn(x)
             out.backward(gO)
 
-        @patch.object(config, "fallback_random", True)
+        @config.patch(fallback_random=True)
         def test_dtype_factory_issue(self):
             def forward():
                 randn = torch.ops.aten.randn.default(
@@ -6176,7 +6882,7 @@ def forward():
             compiled = compile_fx_inner(mod, ())
             assert compiled([])[0].device.type == "cuda"
 
-        @patch.object(config.triton, "cudagraphs", True)
+        @config.patch({"triton.cudagraphs": True})
         def test_expanded_inputs_cudagraphs(self):
             @torch._dynamo.optimize("inductor")
             def fn(x, y):
@@ -6189,9 +6895,7 @@ def fn(x, y):
             self.assertTrue(same(fn(*inputs), inputs[0] + inputs[1]))
 
         # TODO: Abstract this out, test more extensively
-        @patch.object(config, "dynamic_shapes", True)
-        @patch.object(torch._dynamo.config, "dynamic_shapes", True)
-        @patch.object(functorch_config, "use_dynamic_shapes", True)
+        @torch._dynamo.config.patch(dynamic_shapes=True)
         def test_dynamic_shapes(self):
             torch._dynamo.reset()  # Needed since everywhere else uses "inductor"
 
@@ -6212,8 +6916,7 @@ def f(x):
             self.assertEqual(real_out, compiled_out)
             torch._dynamo.reset()
 
-        @patch.object(config, "size_asserts", False)
-        @patch.object(config.triton, "cudagraphs", True)
+        @config.patch({"triton.cudagraphs": True, "size_asserts": False})
         def test_expanded_inputs_cudagraphs_no_size_asserts(self):
             @torch._dynamo.optimize("inductor")
             def fn(x, y):
@@ -6225,11 +6928,42 @@ def fn(x, y):
             )
             self.assertTrue(same(fn(*inputs), inputs[0] + inputs[1]))
 
-        @patch.object(config.triton, "cudagraphs", True)
+        @config.patch(tune_layout=True)
+        def test_tune_layout(self):
+            class Repro(torch.nn.Module):
+                def forward(self, arg1_1, unsqueeze, unsqueeze_1):
+                    convolution_1 = torch.ops.aten.convolution.default(
+                        unsqueeze,
+                        unsqueeze_1,
+                        arg1_1,
+                        [1, 1],
+                        [1, 0],
+                        [1, 1],
+                        False,
+                        [0, 0],
+                        1,
+                    )
+                    unsqueeze = unsqueeze_1 = arg1_1 = None
+                    return (convolution_1,)
+
+            args = [
+                ((512,), (1,), torch.float16, "cuda"),
+                ((4096, 512, 16, 1), (8192, 16, 1, 1), torch.float16, "cuda"),
+                ((512, 512, 3, 1), (1536, 3, 1, 1), torch.float16, "cuda"),
+            ]
+            args = [rand_strided(sh, st, dt, dev) for (sh, st, dt, dev) in args]
+
+            mod = Repro()
+            opt_mod = torch._dynamo.optimize("inductor")(mod)
+            ref = mod(*args)
+            res = opt_mod(*args)
+            self.assertTrue(same(ref, res))
+
+        @config.patch({"triton.cudagraphs": True})
         def test_inplace_updates_cudagraphs(self):
             class Repro(torch.nn.Module):
                 def __init__(self):
-                    super(Repro, self).__init__()
+                    super().__init__()
                     self.weight1 = torch.nn.Parameter(
                         torch.randn(10, 20, requires_grad=True)
                     )
@@ -6251,9 +6985,7 @@ def forward(self, x):
                 output_res = model_opt(input)
                 output_ref.sum().backward()
                 output_res.sum().backward()
-                for (p_ref, p_res) in zip(
-                    model_ref.parameters(), model_opt.parameters()
-                ):
+                for p_ref, p_res in zip(model_ref.parameters(), model_opt.parameters()):
                     self.assertEqual(p_ref.grad, p_res.grad)
                 with torch.no_grad():
                     for param in model_ref.parameters():
@@ -6426,22 +7158,6 @@ def fn(x, y):
             assert same(fn(a, b), fn_optimized(a, b))
 
     class TritonCodeGenTests(TestCase):
-        counter = itertools.count(0)
-
-        class DebugDirManager(object):
-            def __init__(self):
-                self.id = next(TritonCodeGenTests.counter)
-                self.prev_debug_name = None
-
-            def __enter__(self):
-                self.prev_debug_name = torch._dynamo.config.debug_dir_root
-                self.new_name = f"{self.prev_debug_name}_tmp_{self.id}"
-                torch._dynamo.config.debug_dir_root = self.new_name
-
-            def __exit__(self, *args):
-                shutil.rmtree(self.new_name)
-                torch._dynamo.config.debug_dir_root = self.prev_debug_name
-
         from torch._inductor.triton_ops.autotune import CachingAutotuner
 
         class NoOpCompilerBackend:
@@ -6524,47 +7240,35 @@ def fn(a: torch.Tensor) -> torch.Tensor:
             self.assertEqual(arguments_that_are_divisible_by_16_in_kernel1, (0, 1))
             torch._dynamo.reset()
 
-        @staticmethod
-        def run_and_get_triton_code(fn, args):
-            from torch._inductor.debug import DebugContext
-            from torch._inductor.virtualized import V
-
-            torch._dynamo.reset()
-
-            context = DebugContext()
-
-            with TritonCodeGenTests.DebugDirManager(), patch.object(
-                config.trace, "enabled", True
-            ), context, V.set_debug_handler(context):
-
-                dir_name = "/".join(context._path.split("/")[:-1]) + "/"
-                fil = dir_name + "*inference*"
-                existing_dirs = glob.glob(fil)
-
-                fn(*args)
-
-                assert context._path is not None
-
-                dir_dbg = [x for x in glob.glob(fil) if x not in existing_dirs]
-
-                assert len(dir_dbg) == 1, f"{dir_dbg}, {context._path}"
-
-                full_name = os.path.join(dir_dbg[0], "output_code.py")
-                with open(full_name, "r") as f:
-                    return f.read()
-
         def test_optimize_indexing_dtype(self):
             def fn(x: torch.Tensor) -> torch.Tensor:
                 return aten.upsample_bilinear2d.vec(x, None, True, [2.0, 2.0])
 
             fn_opt = torch._dynamo.optimize("inductor")(fn)
             inps = [torch.randn(2, 4, 16, 16).cuda()]
-            code = self.run_and_get_triton_code(fn_opt, inps)
+            code = run_and_get_triton_code(fn_opt, *inps)
             self.assertTrue("to(tl.int32)" in code)
             self.assertFalse("to(tl.int64)" in code)
 
             self.assertEqual(fn_opt(*inps), fn(*inps))
 
+        def test_not_materialize_pointwise_reduction(self):
+            def fn(a, b):
+                return (a - b).sum(dim=-1).amax(dim=-1)
+
+            N = 16
+            K = 7
+            fn_opt = torch._dynamo.optimize("inductor")(fn)
+            inps = [
+                torch.randn(N, 1, K, device="cuda"),
+                torch.randn(1, N, K, device="cuda"),
+            ]
+            code = run_and_get_triton_code(fn_opt, *inps)
+            self.assertEqual(code.count("tl.store"), 1)
+            self.assertTrue("out_ptr1" in code)
+            self.assertFalse("out_ptr0" in code)
+            self.assertEqual(fn_opt(*inps), fn(*inps))
+
         def test_cant_optimize_compute(self):
             def ones():
                 return torch.ones([4], device="cuda")
@@ -6585,7 +7289,7 @@ def fn():
                     return suffix(foo(ones()))
 
                 fn_opt = torch._dynamo.optimize("inductor")(fn)
-                code = self.run_and_get_triton_code(fn_opt, [])
+                code = run_and_get_triton_code(fn_opt)
 
                 # this cannot be optimized away, value too large
                 self.assertTrue("to(tl.int64)" in code)
@@ -6608,7 +7312,7 @@ def fn():
                     return suffix(foo(ones()))
 
                 fn_opt = torch._dynamo.optimize("inductor")(fn)
-                code = self.run_and_get_triton_code(fn_opt, [])
+                code = run_and_get_triton_code(fn_opt)
 
                 # this can be optimized away, value too large
                 self.assertTrue("to(tl.int64)" not in code)
@@ -6616,6 +7320,18 @@ def fn():
 
                 self.assertEqual(fn_opt(), fn())
 
+        def test_split_op_with_sym(self):
+            for dynamic_shapes in [True, False]:
+                torch._dynamo.config.dynamic_shapes = dynamic_shapes
+
+                def fn(x: torch.Tensor) -> torch.Tensor:
+                    # split(tensor, sympy.Integer), split(tensor, sympy.Expr)
+                    return torch.split(x, x.shape[0]), torch.split(x, x.shape[0] // 2)
+
+                fn_opt = torch._dynamo.optimize("inductor", dynamic=dynamic_shapes)(fn)
+                inps = torch.randn([5, 5])
+                fn_opt(inps)
+
 
 class ExprPrinterTests(TestCase):
     def test_print_pow(self):
@@ -6641,8 +7357,14 @@ def test_print_pow(self):
             self.assertEqual(cexpr(expr), result)
             self.assertEqual(texpr(expr), result)
 
+    def test_print_floor(self):
+        s1 = sympy.Symbol("s1", integer=False)
+        expr = sympy.floor(s1)
+        self.assertEqual(texpr(expr), "tl.libdevice.floor(s1)")
+        self.assertEqual(pexpr(expr), "math.floor(s1)")
+
 
-if HAS_CUDA:
+if HAS_CUDA and not TEST_WITH_ASAN:
 
     class RNNTest(TestCase):
         class Model(torch.nn.Module):
@@ -6661,6 +7383,53 @@ def test_rnn_compile_safe(self):
             model(x)
 
 
+if HAS_CPU:
+
+    class TestFull(TestCase):
+        def test_full_dtype(self):
+            pytypes = (
+                bool,
+                int,
+                float,
+                # TODO: Triton's JITFunction._type_of has no support for complex
+                # complex,
+            )
+
+            dtypes = (
+                torch.bool,
+                torch.int32,
+                torch.int64,
+                torch.float32,
+                torch.float64,
+                None,
+                # torch.complex64,
+                # torch.complex128,
+            )
+
+            def fn(pytype, dtype):
+                if pytype is bool:
+                    fill_value = True
+                elif pytype is int:
+                    fill_value = 42
+                elif pytype is float:
+                    fill_value = 42.0
+                else:
+                    raise AssertionError(f"Unexpected Python type: {pytype}")
+
+                return torch.full(
+                    (4, 6), fill_value, dtype=dtype, device=torch.device("cpu")
+                )
+
+            fn_opt = torch._dynamo.optimize("inductor")(fn)
+
+            for pytype, dtype in itertools.product(pytypes, dtypes):
+                with enable_python_dispatcher():
+                    with torch.no_grad():
+                        ret_opt = fn_opt(pytype, dtype)
+
+                self.assertEqual(ret_opt, fn(pytype, dtype))
+
+
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
 
diff --git a/test/inductor/test_torchinductor_dynamic_shapes.py b/test/inductor/test_torchinductor_dynamic_shapes.py
new file mode 100644
index 000000000000..a8eb3425b5ab
--- /dev/null
+++ b/test/inductor/test_torchinductor_dynamic_shapes.py
@@ -0,0 +1,95 @@
+# Owner(s): ["module: inductor"]
+import importlib
+import os
+import sys
+import unittest
+
+import torch
+from torch._dynamo.testing import make_test_cls_with_patches
+from torch.testing._internal.common_utils import (
+    IS_CI,
+    IS_WINDOWS,
+    TEST_WITH_ASAN,
+    TEST_WITH_ROCM,
+    TestCase,
+)
+from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
+
+if IS_WINDOWS and IS_CI:
+    sys.stderr.write(
+        "Windows CI does not have necessary dependencies for test_torchinductor_dynamic_shapes yet\n"
+    )
+    if __name__ == "__main__":
+        sys.exit(0)
+    raise unittest.SkipTest("requires sympy/functorch/filelock")
+
+# Make the helper files in test/ importable
+pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+sys.path.append(pytorch_test_dir)
+from inductor.test_torchinductor import (
+    check_model,
+    check_model_cuda,
+    CommonTemplate,
+    copy_tests,
+)
+
+importlib.import_module("filelock")
+
+test_skips = {
+    "test_alexnet_prefix_dynamic_shapes": ("cuda",),
+    "test_baddbmm_dynamic_shapes": ("cpu", "cuda"),
+    "test_cpp_wrapper_dynamic_shapes": ("cpu",),
+    "test_cudnn_rnn_dynamic_shapes": ("cuda",),
+    "test_gather3_dynamic_shapes": ("cpu", "cuda"),
+    "test_kwargs_dynamic_shapes": ("cpu",),
+    "test_lowmem_dropout2_dynamic_shapes": ("cpu", "cuda"),
+    "test_rand_like_deterministic_dynamic_shapes": ("cpu", "cuda"),
+    "test_randn_like_empty_dynamic_shapes": ("cpu", "cuda"),
+    # test_roi_align uses torchvision, which doesn't work with dynamic shapes
+    "test_roi_align_dynamic_shapes": ("cpu", "cuda"),
+    "test_sizehint_issue1_dynamic_shapes": ("cpu", "cuda"),
+    "test_unroll_small_reduction_dynamic_shapes": ("cpu", "cuda"),
+    "test_upsample_bilinear2d_a_dynamic_shapes": ("cpu"),
+    "test_upsample_bilinear2d_b_dynamic_shapes": ("cpu"),
+    "test_upsample_nearest1d_dynamic_shapes": ("cpu"),
+    "test_upsample_nearest2d_backward_dynamic_shapes": ("cpu", "cuda"),
+    "test_upsample_nearest2d_dynamic_shapes": ("cpu"),
+    "test_upsample_nearest3d_dynamic_shapes": ("cpu"),
+}
+
+
+def make_dynamic_cls(cls):
+    return make_test_cls_with_patches(
+        cls,
+        "DynamicShapes",
+        "_dynamic_shapes",
+        (torch._dynamo.config, "dynamic_shapes", True),
+    )
+
+
+DynamicShapesCommonTemplate = make_dynamic_cls(CommonTemplate)
+
+
+if HAS_CPU:
+
+    class DynamicShapesCpuTests(TestCase):
+        common = check_model
+        device = "cpu"
+
+    copy_tests(DynamicShapesCommonTemplate, DynamicShapesCpuTests, "cpu", test_skips)
+
+
+if HAS_CUDA and not TEST_WITH_ASAN:
+
+    class DynamicShapesCudaTests(TestCase):
+        common = check_model_cuda
+        device = "cuda"
+
+    copy_tests(DynamicShapesCommonTemplate, DynamicShapesCudaTests, "cuda", test_skips)
+
+
+if __name__ == "__main__":
+    from torch._dynamo.test_case import run_tests
+
+    if (HAS_CPU or HAS_CUDA) and not TEST_WITH_ROCM:
+        run_tests(needs="filelock")
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 1bca433f91b2..8d9dff20780b 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -11,6 +11,7 @@
 import torch
 
 import torch._dynamo
+from torch._dynamo.test_case import run_tests
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests,
     onlyNativeDeviceTypes,
@@ -19,10 +20,11 @@
     skipCPUIf,
     skipCUDAIf,
 )
-from torch.testing._internal.common_methods_invocations import op_db
+from torch.testing._internal.common_methods_invocations import op_db, skipOps
 from torch.testing._internal.common_utils import (
     dtype_abbrs,
-    run_tests,
+    IS_MACOS,
+    IS_X86,
     skipCUDAMemoryLeakCheckIf,
     skipIfCrossRef,
     skipIfTorchDynamo,
@@ -52,14 +54,13 @@
 i64 = torch.int64
 b8 = torch.bool
 u8 = torch.uint8  # not tested
-c64 = torch.complex64
 
 _ops = partial(
     ops, dtypes=OpDTypes.supported, allowed_dtypes=[f16, f32, f64, i32, i64, b8]
 )
 
 # Success forces pass; failure forces fail; skip unconditionally skips testing
-TestExpect = Enum("TestExpect", ("SUCCESS", "XFAILURE", "SKIP"))
+ExpectedTestResult = Enum("ExpectedTestResult", ("SUCCESS", "XFAILURE", "SKIP"))
 
 COLLECT_EXPECT = os.getenv("PYTORCH_COLLECT_EXPECT", "0") == "1"
 FAIL_ON_SUCCESS = os.getenv("PYTORCH_FAIL_ON_SUCCESS", "1") == "1"
@@ -119,35 +120,21 @@ def process(device_type):
 if COLLECT_EXPECT:
     atexit.register(print_seen)
 
+# Note, in these skip/xfail dictionaries use a string as the key
+# for the default test, and a tuple of two strings for variants
+
 inductor_skips = defaultdict(dict)
 
 inductor_skips["cpu"] = {
     "linalg.ldl_solve": {b8, f16, f32, f64, i32, i64},  # segfault
     "linalg.ldl_factor": {f32, f64},  # flaky
     "__rdiv__": {b8, f16, f32, f64, i32, i64},  # flaky
-    # fft ops sometimes succeed locally and fail on CI.
-    # they return complex values which is known unsupported,
-    # so there is not much point in testing them currently.
-    "fft.fft": {b8, f16, f32, f64, i32, i64},
-    "fft.fft2": {b8, f16, f32, f64, i32, i64},
-    "fft.fftn": {b8, f16, f32, f64, i32, i64},
-    "fft.hfft": {b8, f16, f32, f64, i32, i64},
-    "fft.hfft2": {b8, f16, f32, f64, i32, i64},
-    "fft.hfftn": {b8, f16, f32, f64, i32, i64},
-    "fft.ifft": {f16, f32, f64, b8, i32, i64},
-    "fft.ifft2": {b8, f16, f32, f64, i32, i64},
-    "fft.ifftn": {b8, f16, f32, f64, i32, i64},
-    "fft.ihfft": {f16, f32, f64, c64, b8, i32, i64},
-    "fft.ihfft2": {f16, f32, f64, c64, b8, i32, i64},
-    "fft.ihfftn": {f16, f32, f64, c64, b8, i32, i64},
-    "fft.irfft": {b8, f16, f32, f64, i32, i64},
-    "fft.irfft2": {b8, f16, f32, f64, i32, i64},
-    "fft.irfftn": {b8, f16, f32, f64, i32, i64},
-    "fft.rfft": {f16, f32, f64, b8, i32, i64},
-    "fft.rfft2": {f16, f32, f64},
-    "fft.rfftn": {f16, f32, f64},
+    "nn.functional.cosine_embedding_loss": {b8},  # flaky
 }
 
+if IS_MACOS and IS_X86:
+    inductor_skips["cpu"]["rsqrt"] = {b8, i32}
+
 inductor_skips["cuda"] = {
     # Jiterator kernel is not expected to work with inductor
     "jiterator_2inputs_2outputs": {b8, f16, f32, f64, i32, i64},
@@ -156,36 +143,15 @@ def process(device_type):
     "jiterator_binary_return_by_ref": {b8, f16, f32, f64, i32, i64},
     "jiterator_unary": {b8, f16, f32, f64, i32, i64},
     # flaky
+    "nn.functional.cosine_embedding_loss": {b8},
     "native_batch_norm": {f16, f32, f64},
     "_native_batch_norm_legit": {f16, f32, f64},
-    # fft ops sometimes succeed locally and fail on CI.
-    # they return complex values which is known unsupported,
-    # so there is not much point in testing them currently.
-    "fft.fft": {b8, f16, f32, f64, i32, i64},
-    "fft.fft2": {b8, f16, f32, f64, i32, i64},
-    "fft.fftn": {b8, f16, f32, f64, i32, i64},
-    "fft.hfft": {b8, f16, f32, f64, i32, i64},
-    "fft.hfft2": {b8, f16, f32, f64, i32, i64},
-    "fft.hfftn": {b8, f16, f32, f64, i32, i64},
-    "fft.ifft": {f16, f32, f64, b8, i32, i64},
-    "fft.ifft2": {b8, f16, f32, f64, i32, i64},
-    "fft.ifftn": {b8, f16, f32, f64, i32, i64},
-    "fft.ihfft": {f16, f32, f64, c64, b8, i32, i64},
-    "fft.ihfft2": {f16, f32, f64, c64, b8, i32, i64},
-    "fft.ihfftn": {f16, f32, f64, c64, b8, i32, i64},
-    "fft.irfft": {b8, f16, f32, f64, i32, i64},
-    "fft.irfft2": {b8, f16, f32, f64, i32, i64},
-    "fft.irfftn": {b8, f16, f32, f64, i32, i64},
-    "fft.rfft": {f16, f32, f64, b8, i32, i64},
-    "fft.rfft2": {f16, f32, f64},
-    "fft.rfftn": {f16, f32, f64},
 }
 
 inductor_expected_failures_single_sample = defaultdict(dict)
 
 inductor_expected_failures_single_sample["cpu"] = {
     "__getitem__": {b8, f16, f32, f64, i32, i64},
-    "addr": {f16},
     "allclose": {f16, f32, f64},
     "amax": {f16},
     "amin": {f16},
@@ -194,12 +160,8 @@ def process(device_type):
     "bernoulli": {f32, f64},
     "bincount": {i32, i64},
     "bucketize": {b8, f16, f32, f64, i32, i64},
-    "cdouble": {b8, f16, f32, f64, i32, i64},
-    "cfloat": {b8, f16, f32, f64, i32, i64},
-    "chalf": {b8, f16, f32, f64, i32, i64},
     "cholesky": {f32, f64},
     "combinations": {b8, f16, f32, f64, i32, i64},
-    "complex": {f16, f32, f64},
     "corrcoef": {f32, f64, i32, i64},
     "cov": {f32, f64, i32, i64},
     "equal": {b8, f16, f32, f64, i32, i64},
@@ -211,21 +173,20 @@ def process(device_type):
     "linalg.eigvals": {f32, f64},
     "linalg.eigvalsh": {f32, f64},
     "linalg.lstsq": {f32, f64},
-    "linalg.lstsq.grad_oriented": {f32, f64},
-    "linalg.matrix_rank": {f32, f64},
-    "linalg.matrix_rank.hermitian": {f32, f64},
+    # This pair of strings denotes a test variant
+    ("linalg.lstsq", "grad_oriented"): {f32, f64},
     "masked.var": {f16},
     "masked_scatter": {f16, f32, f64},
     "masked_select": {b8, f16, f32, f64, i32, i64},
-    "max.reduction_no_dim": {f16},
-    "max.reduction_with_dim": {b8},
-    "min.reduction_no_dim": {f16},
-    "min.reduction_with_dim": {b8},
+    ("max", "reduction_no_dim"): {f16},
+    ("max", "reduction_with_dim"): {b8},
+    ("min", "reduction_no_dim"): {f16},
+    ("min", "reduction_with_dim"): {b8},
     "multinomial": {f32, f64},
     "nanquantile": {f32, f64},
     "nn.functional.avg_pool1d": {i64},
-    "nn.functional.avg_pool2d": {i64, f64},
-    "nn.functional.adaptive_avg_pool2d": {f16, f64},
+    "nn.functional.avg_pool2d": {i64},
+    "nn.functional.adaptive_avg_pool2d": {f16},
     "nn.functional.ctc_loss": {f32, f64},
     "nn.functional.gaussian_nll_loss": {f32, f64},
     "nn.functional.local_response_norm": {i64},
@@ -234,7 +195,7 @@ def process(device_type):
     "nn.functional.triplet_margin_with_distance_loss": {f32, f64, i32, i64},
     "nonzero": {b8, f16, f32, f64, i32, i64},
     "normal": {f16, f32, f64},
-    "normal.number_mean": {f16, f32, f64},
+    ("normal", "number_mean"): {f16, f32, f64},
     "polar": {f32, f64},
     "quantile": {f32, f64},
     "rand_like": {f16, f32, f64},
@@ -243,39 +204,67 @@ def process(device_type):
     "randn_like": {f16, f32, f64},
     "repeat_interleave": {b8, f16, f32, f64, i32, i64},
     "scatter_add": {f16},
-    "scatter_reduce.sum": {f16},
-    "scatter_reduce.prod": {f16, f32, f64},
-    "segment_reduce.lengths": {f16, f32, f64},
+    ("scatter_reduce", "sum"): {f16},
+    ("scatter_reduce", "prod"): {f16, f32, f64},
+    ("_segment_reduce", "lengths"): {f16, f32, f64},
     "sparse.sampled_addmm": {f32, f64},
+    ("sparse.mm", "reduce"): {bf16, f32, f64},
     "stft": {f32, f64},
     "tensor_split": {b8, f16, f32, f64, i32, i64},
     "to_sparse": {f32, f64},
-    "uniform": {f16, f32, f64},
-    "unique": {b8, f32, f64, i32, i64},
-    "unique_consecutive": {b8, f32, f64, i32, i64},
+    # AssertionError: Tensor-likes are not close!
+    "cauchy": {f16},
+    "exponential": {f16},
+    "geometric": {f16},
+    "log_normal": {f16},
+    ("normal", "in_place"): {f16, f32, f64},
+    "uniform": {f16},
+    "unique": {b8, f16, f32, f64, i32, i64},
+    "unique_consecutive": {b8, f16, f32, f64, i32, i64},
     "var": {f16},
     "var_mean": {f16},
     "view_as_complex": {f16},
+    ("norm", "inf"): {f16},
+    "fft.fft": {b8, f16, f32, f64, i32, i64},
+    "fft.fft2": {b8, f16, f32, f64, i32, i64},
+    "fft.fftn": {b8, f16, f32, f64, i32, i64},
+    "fft.hfft": {b8, f16, f32, f64, i32, i64},
+    "fft.hfft2": {b8, f16, f32, f64, i32, i64},
+    "fft.hfftn": {b8, f16, f32, f64, i32, i64},
+    "fft.ifft": {f16, f32, f64, b8, i32, i64},
+    "fft.ifft2": {b8, f16, f32, f64, i32, i64},
+    "fft.ifftn": {b8, f16, f32, f64, i32, i64},
+    "fft.ihfft": {f16, f32, f64, b8, i32, i64},
+    "fft.ihfft2": {f16, f32, f64, b8, i32, i64},
+    "fft.ihfftn": {f16, f32, f64, b8, i32, i64},
+    "fft.irfft": {b8, f16, f32, f64, i32, i64},
+    "fft.irfft2": {b8, f16, f32, f64, i32, i64},
+    "fft.irfftn": {b8, f16, f32, f64, i32, i64},
+    "fft.rfft": {f16, f32, f64, b8, i32, i64},
+    "fft.rfft2": {f16, f32, f64},
+    "fft.rfftn": {f16, f32, f64},
+    # These return complex tensors
+    "cdouble": {b8, i32, i64, f16, f32, f64},
+    "cfloat": {b8, i32, i64, f16, f32, f64},
+    "chalf": {b8, i32, i64, f16, f32, f64},
+    "complex": {f16, f32, f64},
 }
 
 
 inductor_expected_failures_single_sample["cuda"] = {
     "__getitem__": {b8, f16, f32, f64, i32, i64},
     "__rdiv__": {b8, f16, f32, f64, i32, i64},
+    "addr": {f16},
     "allclose": {f16, f32, f64},
     "angle": {f32, f64},
     "argwhere": {b8, f16, f32, f64, i32, i64},
-    "as_strided.partial_views": {f16, f32, f64},
+    ("as_strided", "partial_views"): {b8, f16, f32, f64, i32, i64},
     "baddbmm": {f16},
     "bernoulli": {f16, f32, f64},
     "bincount": {i32, i64},
     "bucketize": {b8, f16, f32, f64, i32, i64},
-    "cdouble": {b8, f16, f32, f64, i32, i64},
-    "cfloat": {b8, f16, f32, f64, i32, i64},
-    "chalf": {b8, f16, f32, f64, i32, i64},
     "cholesky": {f32, f64},
     "combinations": {b8, f16, f32, f64, i32, i64},
-    "complex": {f16, f32, f64},
     "corrcoef": {f16, f32, f64, i32, i64},
     "cov": {f16, f32, f64, i32, i64},
     "equal": {b8, f16, f32, f64, i32, i64},
@@ -286,15 +275,11 @@ def process(device_type):
     "linalg.eigvals": {f32, f64},
     "linalg.eigvalsh": {f32, f64},
     "linalg.lstsq": {f32, f64},
-    "linalg.lstsq.grad_oriented": {f32, f64},
-    "linalg.matrix_rank": {f32, f64},
-    "linalg.matrix_rank.hermitian": {f32, f64},
-    "masked.argmax": {f16, f32, f64, i32},
-    "masked.argmin": {f16, f32, f64, i32},
+    ("linalg.lstsq", "grad_oriented"): {f32, f64},
     "masked_scatter": {f16, f32, f64},
     "masked_select": {b8, f16, f32, f64, i32, i64},
-    "max.reduction_with_dim": {b8},
-    "min.reduction_with_dim": {b8},
+    ("max", "reduction_with_dim"): {b8},
+    ("min", "reduction_with_dim"): {b8},
     "multinomial": {f16, f32, f64},
     "nn.functional.adaptive_avg_pool2d": {f16},
     "nn.functional.ctc_loss": {f32, f64},
@@ -306,7 +291,7 @@ def process(device_type):
     "nn.functional.triplet_margin_with_distance_loss": {f16, f32, f64, i32, i64},
     "nonzero": {b8, f16, f32, f64, i32, i64},
     "normal": {f16, f32, f64},
-    "normal.number_mean": {f16, f32, f64},
+    ("normal", "number_mean"): {f16, f32, f64},
     "polar": {f32, f64},
     "pow": {i32, i64},
     "rand_like": {f16, f32, f64},
@@ -314,19 +299,57 @@ def process(device_type):
     "randint": {f16, f32, f64, i32, i64},
     "randn_like": {f16, f32, f64},
     "repeat_interleave": {b8, f16, f32, f64, i32, i64},
-    "round.decimals_3": {f16},
-    "scatter_reduce.prod": {f16, f32, f64},
-    "segment_reduce.lengths": {f16, f32, f64},
+    ("round", "decimals_3"): {f16},
+    ("scatter_reduce", "prod"): {f16, f32, f64},
+    ("_segment_reduce", "lengths"): {f16, f32, f64},
     "sparse.sampled_addmm": {f32, f64},
-    "std_mean.unbiased": {f16},
+    ("std_mean", "unbiased"): {f16},
     "stft": {f32, f64},
     "tensor_split": {b8, f16, f32, f64, i32, i64},
     "to_sparse": {f16, f32, f64},
+    # AssertionError: Tensor-likes are not close!
+    "cauchy": {f16, f32, f64},
+    "exponential": {f16, f32, f64},
+    "geometric": {f16, f32, f64, i32, i64},
+    ("normal", "in_place"): {f16, f32, f64},
+    "log_normal": {f16, f32, f64},
     "uniform": {f16, f32, f64},
     "unique": {b8, f16, f32, f64, i32, i64},
     "unique_consecutive": {b8, f16, f32, f64, i32, i64},
     # AssertionError: Tensor-likes are not close!
     "nn.functional.triplet_margin_loss": {f16},
+    # The following 3 tests fail on CUDA with AssertionError: expected size 5==5, stride 5==1 at dim=0
+    # linalg._svd's return value has different strides on CUDA vs CPU which causes this
+    # In test_meta.py there is a mechanism to skipping strides checks for some ops
+    # (including _linalg_svd), possibly we should have something similar here
+    "linalg.cond": {f32, f64},
+    "linalg.svdvals": {f32, f64},
+    ("norm", "nuc"): {f32, f64},
+    # AssertionError: Scalars are not close!
+    "nn.functional.soft_margin_loss": {f16},
+    "fft.fft": {b8, f16, f32, f64, i32, i64},
+    "fft.fft2": {b8, f16, f32, f64, i32, i64},
+    "fft.fftn": {b8, f16, f32, f64, i32, i64},
+    "fft.hfft": {b8, f16, f32, f64, i32, i64},
+    "fft.hfft2": {b8, f16, f32, f64, i32, i64},
+    "fft.hfftn": {b8, f16, f32, f64, i32, i64},
+    "fft.ifft": {f16, f32, f64, b8, i32, i64},
+    "fft.ifft2": {b8, f16, f32, f64, i32, i64},
+    "fft.ifftn": {b8, f16, f32, f64, i32, i64},
+    "fft.ihfft": {f16, f32, f64, b8, i32, i64},
+    "fft.ihfft2": {f16, f32, f64, b8, i32, i64},
+    "fft.ihfftn": {f16, f32, f64, b8, i32, i64},
+    "fft.irfft": {b8, f16, f32, f64, i32, i64},
+    "fft.irfft2": {b8, f16, f32, f64, i32, i64},
+    "fft.irfftn": {b8, f16, f32, f64, i32, i64},
+    "fft.rfft": {f16, f32, f64, b8, i32, i64},
+    "fft.rfft2": {f16, f32, f64},
+    "fft.rfftn": {f16, f32, f64},
+    # These return complex tensors
+    "cdouble": {b8, i32, i64, f16, f32, f64},
+    "cfloat": {b8, i32, i64, f16, f32, f64},
+    "chalf": {b8, i32, i64, f16, f32, f64},
+    "complex": {f16, f32, f64},
 }
 
 inductor_gradient_expected_failures_single_sample = defaultdict(dict)
@@ -337,9 +360,8 @@ def process(device_type):
     "linalg.vector_norm": {f64, f64},
     "kron": {f16},
     "nanquantile": {f32, f64},
-    "nn.functional._scaled_dot_product_attention": {f16},
     "nn.functional.avg_pool2d": {f16, f32, f64},
-    "nn.functional.batch_norm.without_cudnn": {f16},
+    ("nn.functional.batch_norm", "without_cudnn"): {f16},
     "nn.functional.batch_norm": {f16},
     "nn.functional.cosine_similarity": {f16},
     "nn.functional.instance_norm": {f16},
@@ -364,6 +386,30 @@ def process(device_type):
 }
 
 
+def get_skips_and_xfails(from_dict, xfails=True):
+    retval = set()
+    for device, d in from_dict.items():
+        for op, dtypes in d.items():
+            if type(op) is tuple:
+                op, variant_name = op
+            else:
+                variant_name = ""
+            retval.add((op, variant_name, device, tuple(dtypes), xfails))
+    return retval
+
+
+# Note: if you get a "AssertionError: Couldn't find OpInfo for ..." error for an OpInfo you are sure
+# exists, you might be trying to use a test variant and you need to replace, for example,
+# "max.reduction_no_dim" with ("max", "reduction_no_dim") as the key of one of these dictionaries
+test_skips_or_fails = (
+    get_skips_and_xfails(inductor_skips, xfails=False)
+    | get_skips_and_xfails(inductor_expected_failures_single_sample, xfails=True)
+    | get_skips_and_xfails(
+        inductor_gradient_expected_failures_single_sample, xfails=True
+    )
+)
+
+
 def wrapper_set_seed(op, *args, **kwargs):
     """Wrapper to set seed manually for some functions like dropout
     See: https://github.com/pytorch/pytorch/pull/62315#issuecomment-896143189 for more details.
@@ -383,12 +429,18 @@ def wrapper_set_seed(op, *args, **kwargs):
 inductor_override_kwargs = {
     # the return value of empty is undefined
     "empty": {"assert_equal": False},
+    "empty_permuted": {"assert_equal": False},
     "empty_like": {"assert_equal": False},
     "new_empty": {"assert_equal": False},
     "new_empty_strided": {"assert_equal": False},
     "randn": {"assert_equal": False},
+    ("masked.softmin", "cuda", f16): {"atol": 1e-4, "rtol": 0.01},
     ("nn.functional.tanhshrink", "cuda", f16): {"atol": 3e-4, "rtol": 0.001},
+    ("nn.functional.softmin", "cuda", f16): {"atol": 1e-4, "rtol": 0.01},
+    ("special.log_ndtr", "cuda", f64): {"atol": 1e-6, "rtol": 1e-5},
     ("cummax", "cuda", f16): {"atol": 5e-4, "rtol": 0.002},
+    ("softmax", "cuda", f16): {"atol": 1e-4, "rtol": 0.02},
+    ("softmax", "cpu", f16): {"atol": 1e-4, "rtol": 0.02},
     ("_softmax_backward_data", "cuda", f16): {"atol": 0.008, "rtol": 0.002},
     "gradient": {"check_gradient": False},  # segfault on check_gradient
     # Following tests failed, and causing subsequent tests failing with unrecoverable CUDA error
@@ -399,6 +451,7 @@ def wrapper_set_seed(op, *args, **kwargs):
 
 # Always test with all sample for following ops
 inductor_all_samples = {
+    "arange",
     "softmax.with_dtype",
     "index_add",
     "index_copy",
@@ -412,8 +465,13 @@ def wrapper_set_seed(op, *args, **kwargs):
     "all",
     "T",
     "H",
+    "isinf",
+    "isposinf",
+    "isneginf",
+    "nan_to_num",
     "mT",
     "mH",
+    "rsub",
 }
 
 
@@ -431,7 +489,11 @@ class TestInductorOpInfo(TestCase):
     @skipIfTorchDynamo("Test uses dynamo already")
     @skipIfCrossRef
     @_ops(op_db[START:END])
+    @skipOps("TestInductorOpInfo", "test_comprehensive", test_skips_or_fails)
     @patch("torch._dynamo.config.raise_on_unsafe_aot_autograd", True)
+    @torch._inductor.config.patch(
+        {"implicit_fallbacks": False, "triton.autotune_pointwise": False}
+    )
     def test_comprehensive(self, device, dtype, op):
         torch._dynamo.reset()
         with torch.no_grad():
@@ -450,11 +512,10 @@ def test_comprehensive(self, device, dtype, op):
         #     print(f"CONSIDERING OP {op_name} on {device_type} with {dtype} |
         # {inductor_skips[device_type].get(op_name, set())}", flush=True)
         if dtype in inductor_skips[device_type].get(op_name, set()):
-            test_expect = TestExpect.SKIP
+            test_expect = ExpectedTestResult.SKIP
             # with open("test_output.txt", "a") as f:
             #     print(f"SKIPPING OP {op_name} on {device_type}", flush=True, file=f)
             #     print(f"SKIPPING OP {op_name} on {device_type}", flush=True)
-            self.skipTest(f"{op_name} in {dtype} not supported")
         elif dtype in inductor_expected_failures_single_sample[device_type].get(
             op_name, set()
         ) or dtype in inductor_gradient_expected_failures_single_sample[
@@ -462,9 +523,9 @@ def test_comprehensive(self, device, dtype, op):
         ].get(
             op_name, set()
         ):
-            test_expect = TestExpect.XFAILURE
+            test_expect = ExpectedTestResult.XFAILURE
         else:
-            test_expect = TestExpect.SUCCESS
+            test_expect = ExpectedTestResult.SUCCESS
 
         overridden_kwargs = {}
         if op_name in inductor_override_kwargs:
@@ -539,8 +600,8 @@ def fn(*args, **kwargs):
 
         except Exception as e:
 
-            if test_expect is TestExpect.XFAILURE:
-                return
+            if test_expect is ExpectedTestResult.XFAILURE:
+                raise e
 
             seen_failed[device_type].setdefault(op_name, set()).add(dtype)
 
@@ -563,7 +624,7 @@ def fn(*args, **kwargs):
         #     print(f"SUCCEEDED OP {op_name} on {device_type} with {dtype}", flush=True, file=f)
         seen_succeeded[device_type].setdefault(op_name, set()).add(dtype)
 
-        if test_expect is TestExpect.XFAILURE and not COLLECT_EXPECT:
+        if test_expect is ExpectedTestResult.XFAILURE and not COLLECT_EXPECT:
             if FAIL_ON_SUCCESS:
                 raise RuntimeError(
                     f"unexpected success {op_name}, {dtype}, {device_type}"
diff --git a/test/jit/_imported_class_test/bar.py b/test/jit/_imported_class_test/bar.py
index 7c99373d9a20..f6bdc593109b 100644
--- a/test/jit/_imported_class_test/bar.py
+++ b/test/jit/_imported_class_test/bar.py
@@ -4,6 +4,6 @@
 
 
 @torch.jit.script  # noqa: B903
-class FooSameName(object):  # noqa: B903
+class FooSameName:  # noqa: B903
     def __init__(self, y):
         self.y = y
diff --git a/test/jit/_imported_class_test/foo.py b/test/jit/_imported_class_test/foo.py
index de231415380b..fe0123be3254 100644
--- a/test/jit/_imported_class_test/foo.py
+++ b/test/jit/_imported_class_test/foo.py
@@ -5,7 +5,7 @@
 
 
 @torch.jit.script  # noqa: B903
-class FooSameName(object):
+class FooSameName:
     def __init__(self, x):
         self.x = x
         self.nested = bar.FooSameName(x)
diff --git a/test/jit/_imported_class_test/very/very/nested.py b/test/jit/_imported_class_test/very/very/nested.py
index 12fa0e82057b..dcf8dcb40cf8 100644
--- a/test/jit/_imported_class_test/very/very/nested.py
+++ b/test/jit/_imported_class_test/very/very/nested.py
@@ -4,6 +4,6 @@
 
 
 @torch.jit.script  # noqa: B903
-class FooUniqueName(object):  # noqa: B903
+class FooUniqueName:  # noqa: B903
     def __init__(self, y):
         self.y = y
diff --git a/test/jit/fixtures/test_versioned_random_func_v10.ptl b/test/jit/fixtures/test_versioned_random_func_v10.ptl
new file mode 100644
index 000000000000..124f3824e811
Binary files /dev/null and b/test/jit/fixtures/test_versioned_random_func_v10.ptl differ
diff --git a/test/jit/fixtures/test_versioned_random_out_v10.ptl b/test/jit/fixtures/test_versioned_random_out_v10.ptl
new file mode 100644
index 000000000000..f03123f7f30e
Binary files /dev/null and b/test/jit/fixtures/test_versioned_random_out_v10.ptl differ
diff --git a/test/jit/fixtures/test_versioned_random_v10.ptl b/test/jit/fixtures/test_versioned_random_v10.ptl
new file mode 100644
index 000000000000..ce4f5e4f5beb
Binary files /dev/null and b/test/jit/fixtures/test_versioned_random_v10.ptl differ
diff --git a/test/jit/fixtures_srcs/fixtures_src.py b/test/jit/fixtures_srcs/fixtures_src.py
index dff23702311a..afba17800c9c 100644
--- a/test/jit/fixtures_srcs/fixtures_src.py
+++ b/test/jit/fixtures_srcs/fixtures_src.py
@@ -2,9 +2,6 @@
 from typing import Union
 
 class TestVersionedDivTensorExampleV7(torch.nn.Module):
-    def __init__(self):
-        super(TestVersionedDivTensorExampleV7, self).__init__()
-
     def forward(self, a, b):
         result_0 = a / b
         result_1 = torch.div(a, b)
@@ -12,48 +9,49 @@ def forward(self, a, b):
         return result_0, result_1, result_2
 
 class TestVersionedLinspaceV7(torch.nn.Module):
-    def __init__(self):
-        super(TestVersionedLinspaceV7, self).__init__()
-
     def forward(self, a: Union[int, float, complex], b: Union[int, float, complex]):
         c = torch.linspace(a, b, steps=5)
         d = torch.linspace(a, b)
         return c, d
 
 class TestVersionedLinspaceOutV7(torch.nn.Module):
-    def __init__(self):
-        super(TestVersionedLinspaceOutV7, self).__init__()
-
     def forward(self, a: Union[int, float, complex], b: Union[int, float, complex], out: torch.Tensor):
         return torch.linspace(a, b, out=out)
 
 class TestVersionedLogspaceV8(torch.nn.Module):
-    def __init__(self):
-        super(TestVersionedLogspaceV8, self).__init__()
-
     def forward(self, a: Union[int, float, complex], b: Union[int, float, complex]):
         c = torch.logspace(a, b, steps=5)
         d = torch.logspace(a, b)
         return c, d
 
 class TestVersionedLogspaceOutV8(torch.nn.Module):
-    def __init__(self):
-        super(TestVersionedLogspaceOutV8, self).__init__()
-
     def forward(self, a: Union[int, float, complex], b: Union[int, float, complex], out: torch.Tensor):
         return torch.logspace(a, b, out=out)
 
 class TestVersionedGeluV9(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
     def forward(self, x):
         return torch._C._nn.gelu(x)
 
 class TestVersionedGeluOutV9(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
     def forward(self, x):
         out = torch.zeros_like(x)
         return torch._C._nn.gelu(x, out=out)
+
+class TestVersionedRandomV10(torch.nn.Module):
+    def forward(self, x):
+        out = torch.zeros_like(x)
+        return out.random_(0, 10)
+
+
+class TestVersionedRandomFuncV10(torch.nn.Module):
+    def forward(self, x):
+        out = torch.zeros_like(x)
+        return out.random(0, 10)
+
+
+class TestVersionedRandomOutV10(torch.nn.Module):
+    def forward(self, x):
+        x = torch.zeros_like(x)
+        out = torch.zeros_like(x)
+        x.random(0, 10, out=out)
+        return out
diff --git a/test/jit/fixtures_srcs/generate_models.py b/test/jit/fixtures_srcs/generate_models.py
index e00153745138..1c7ad8958d2f 100644
--- a/test/jit/fixtures_srcs/generate_models.py
+++ b/test/jit/fixtures_srcs/generate_models.py
@@ -96,6 +96,9 @@ def div_Tensor_0_3(self: Tensor, other: Tensor) -> Tensor:
     TestVersionedLogspaceOutV8(): "aten::logspace.out",
     TestVersionedGeluV9(): "aten::gelu",
     TestVersionedGeluOutV9(): "aten::gelu.out",
+    TestVersionedRandomV10(): "aten::random_.from",
+    TestVersionedRandomFuncV10(): "aten::random.from",
+    TestVersionedRandomOutV10(): "aten::random.from_out",
 }
 
 """
diff --git a/test/jit/myexception.py b/test/jit/myexception.py
index 5937bd3c91b7..e60d30bd1769 100644
--- a/test/jit/myexception.py
+++ b/test/jit/myexception.py
@@ -4,5 +4,4 @@
 is captured correctly in suce cases.
 """
 class MyKeyError(KeyError):
-    def __init__(self, msg):
-        super(KeyError, self).__init__(msg)
+    pass
diff --git a/test/jit/test_async.py b/test/jit/test_async.py
index f8a1baea6713..36fdc01f5a7b 100644
--- a/test/jit/test_async.py
+++ b/test/jit/test_async.py
@@ -87,7 +87,7 @@ class Mod(torch.jit.ScriptModule):
             __constants__ = ['const']
 
             def __init__(self):
-                super(Mod, self).__init__()
+                super().__init__()
                 self.const = 42
                 self.param = nn.Parameter(torch.randn(2, 2))
 
@@ -244,15 +244,12 @@ def foo_script_kwargs(x1, x2):
     @_inline_everything
     def test_async_script_trace(self):
         class Traced(nn.Module):
-            def __init__(self):
-                super(Traced, self).__init__()
-
             def forward(self, x):
                 return (torch.neg(x), x)
 
         class Mod(torch.jit.ScriptModule):
             def __init__(self):
-                super(Mod, self).__init__()
+                super().__init__()
                 x = torch.rand(3, 3)
                 self.traced = torch.jit.trace(Traced(), (x), _force_outplace=True)
 
@@ -273,7 +270,7 @@ def forward(self, x: Tensor) -> Tuple[List[Tensor], Tuple[Tensor, Tensor], Tenso
 
         class TupleCl(nn.Module):
             def __init__(self):
-                super(TupleCl, self).__init__()
+                super().__init__()
                 self.module = Mod()
 
             def forward(self, x):
@@ -424,9 +421,6 @@ def add_one(input):
             return input + torch.ones(input.size())
 
         class TestListFutureModule(nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, input):
                 input_list = []
                 for i in range(3):
@@ -458,9 +452,6 @@ def add_one(input):
             return input + torch.ones(input.size())
 
         class DifferentOutputModule(nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, input):
                 fut_res = torch.jit._fork(add_one, (input))
 
diff --git a/test/jit/test_attr.py b/test/jit/test_attr.py
index 55f06383826f..1fd85be9fadc 100644
--- a/test/jit/test_attr.py
+++ b/test/jit/test_attr.py
@@ -16,7 +16,7 @@ def test_getattr_with_default(self):
 
         class A(torch.nn.Module):
             def __init__(self):
-                super(A, self).__init__()
+                super().__init__()
                 self.init_attr_val = 1.0
 
             def forward(self, x):
diff --git a/test/jit/test_autodiff.py b/test/jit/test_autodiff.py
index 3173e81f549b..a77569fb4f91 100644
--- a/test/jit/test_autodiff.py
+++ b/test/jit/test_autodiff.py
@@ -2,9 +2,12 @@
 
 import torch
 
+from torch.testing._internal.common_utils import skipIfTorchDynamo
 from torch.testing._internal.jit_utils import JitTestCase
 from typing import List
 
+
+@skipIfTorchDynamo()
 class TestAutodiffJit(JitTestCase):
     def test_undefined_tensor_lists(self):
         def fn(tensor_list: List[torch.Tensor], add_tensor):
diff --git a/test/jit/test_autodiff_subgraph_slicing.py b/test/jit/test_autodiff_subgraph_slicing.py
index f643061703ce..fbdcc1909145 100644
--- a/test/jit/test_autodiff_subgraph_slicing.py
+++ b/test/jit/test_autodiff_subgraph_slicing.py
@@ -86,7 +86,7 @@ def test_bias_as_module_attr(self):
         with enable_profiling_mode_for_profiling_tests():
             class M(torch.nn.Module):
                 def __init__(self, has_bias):
-                    super(M, self).__init__()
+                    super().__init__()
                     self.ll = torch.nn.Linear(10, 10, has_bias)
 
                 def forward(self, x, y):
diff --git a/test/jit/test_await.py b/test/jit/test_await.py
new file mode 100644
index 000000000000..1500ed27b7f2
--- /dev/null
+++ b/test/jit/test_await.py
@@ -0,0 +1,386 @@
+# Owner(s): ["oncall: jit"]
+
+import io
+import torch
+from torch.testing._internal.jit_utils import JitTestCase
+from torch.testing._internal.jit_utils import make_global
+from typing import List, Optional, Tuple
+from torch import Tensor
+from torch._awaits import _Await as Await
+
+
+class TestAwait(JitTestCase):
+    def test_await_python(self):
+        def foo(x: int) -> int:
+            return x + 13
+        aw: Await[int] = torch.jit._awaitable(foo, 13)
+        self.assertTrue(aw.fn()(*aw.args()) == torch.jit._awaitable_wait(aw))
+        nw = torch.jit._awaitable_nowait(33)
+        self.assertTrue(nw.is_nowait())
+        self.assertTrue(nw.args() == (33,))
+
+    def test_await_type_python(self):
+        def foo() -> Tensor:
+            return torch.randn()
+        awaits = torch.jit.annotate(List[Await[Tensor]], [])
+        awaits.append(torch.jit._awaitable(foo))
+
+    def test_script(self):
+        def delayed(z: int) -> int:
+            return z + 3
+
+        def fn(x: Tensor):
+            aw: Await[int] = torch.jit._awaitable(delayed, 99)
+            a = torch.eye(2)
+            b = torch.jit._awaitable_wait(aw)
+            return a + b + x
+
+        inp = torch.zeros(2)
+
+        sm = torch.jit.script(fn)
+        out = fn(inp)
+        script_out = sm(inp)
+        self.assertTrue(torch.allclose(torch.eye(2) + 102, script_out))
+        self.assertTrue(torch.allclose(script_out, out))
+
+    def test_nowait(self):
+        def fn(x: Tensor):
+            aw = torch.jit._awaitable_nowait(13)
+            a = torch.eye(2)
+            b = torch.jit._awaitable_wait(aw)
+            return a + b + x
+
+        inp = torch.zeros(2)
+
+        sm = torch.jit.script(fn)
+        out = fn(inp)
+        script_out = sm(inp)
+        self.assertTrue(torch.allclose(torch.eye(2) + 13, script_out))
+        self.assertTrue(torch.allclose(script_out, out))
+
+    def test_nowait_class(self):
+        class C:
+            def __init__(self, a: Tensor, b: Tensor):
+                self._a = a
+                self._b = b
+
+            def a(self) -> Tensor:
+                return self._a
+
+        def fn(x: Tensor):
+            aw = torch.jit._awaitable_nowait(C(torch.zeros(2), torch.ones(2)))
+            _a = torch.eye(2)
+            c = torch.jit._awaitable_wait(aw)
+            return _a + c.a() + x
+
+        make_global(C)
+        inp = torch.zeros(2)
+
+        sm = torch.jit.script(fn)
+        out = fn(inp)
+        script_out = sm(inp)
+        self.assertTrue(torch.allclose(torch.eye(2), script_out))
+        self.assertTrue(torch.allclose(script_out, out))
+
+
+    def test_await_class_arg(self):
+
+        class C:
+            def __init__(self, a: Tensor, b: Tensor):
+                self.__a = a
+                self.__b = b
+
+            def a(self) -> Tensor:
+                return self.__a
+
+        make_global(C)
+
+        def delayed(c: C) -> Tensor:
+            return c.a()
+
+        def fn(x: Tensor):
+            c = C(torch.zeros(2), torch.ones(2))
+            aw = torch.jit._awaitable(delayed, c)
+            _a = torch.eye(2)
+            c2_t = torch.jit._awaitable_wait(aw)
+            return _a + c2_t + x
+        inp = torch.zeros(2)
+
+        sm = torch.jit.script(fn)
+        out = fn(inp)
+        script_out = sm(inp)
+        self.assertTrue(torch.allclose(torch.eye(2), script_out))
+        self.assertTrue(torch.allclose(script_out, out))
+
+    def test_awaitable_to_await(self):
+        class C:
+            __slots__ = ["_a", "_b"]
+
+            def __init__(self, a: Tensor, b: Tensor):
+                self._a = a
+                self._b = b
+
+
+        make_global(C)
+
+        # Can not stay in the class as Jit does not support Recursive annotations
+        # (self in wait_impl can not be annotated as C as C is not defined by this time)
+        def C_wait_impl(self: C):
+            return self._a + self._b
+
+        def fn(x: Tensor):
+            aw = torch.jit._awaitable(C_wait_impl, C(torch.zeros(2), torch.ones(2)))
+            _a = torch.eye(2)
+            c_wait_impl_res = torch.jit._awaitable_wait(aw)
+            return _a + c_wait_impl_res + x
+
+        inp = torch.ones(2)
+
+        sm = torch.jit.script(fn)
+        out = fn(inp)
+        script_out = sm(inp)
+        self.assertTrue(torch.allclose(torch.eye(2) + 2 * torch.ones(2), script_out))
+        self.assertTrue(torch.allclose(script_out, out))
+
+    def test_await_class_return(self):
+
+        class C:
+            __slots__ = ["a", "b"]
+
+            def __init__(self, a: Tensor, b: Tensor):
+                self.a = a
+                self.b = b
+
+
+        make_global(C)
+
+        # Can not stay in the class as Jit does not support Recursive annotations
+        # (self in wait_impl can not be annotated as C as C is not defined by this time)
+        def C_wait_impl(self: C) -> C:
+            return C(self.a * 2, self.b * 3)
+
+        def fn_arg_C(x: C) -> Tensor:
+            return x.a + x.b
+
+        def fn(x: Tensor):
+            aw: Await[C] = torch.jit._awaitable(C_wait_impl, C(x, x))
+            _a = torch.eye(2)
+            y = fn_arg_C(torch.jit._awaitable_wait(aw))
+            return _a + y + x
+
+        inp = torch.ones(2)
+
+        sm = torch.jit.script(fn)
+        out = fn(inp)
+        script_out = sm(inp)
+        self.assertTrue(torch.allclose(torch.eye(2) + 6 * torch.ones(2), script_out))
+        self.assertTrue(torch.allclose(script_out, out))
+        self.assertGraphContainsExactly(sm.graph, kind='prim::awaitable_wait', num_kind_nodes=1)
+
+    def test_await_getattr_implicit_convertion(self):
+        class C:
+            def __init__(self, a: Tensor, b: Tensor):
+                self._a = a
+                self._b = b
+
+            def b(self):
+                return self._b
+
+
+        make_global(C)
+
+        # Can not stay in the class as Jit does not support Recursive annotations
+        # (self in wait_impl can not be annotated as C as C is not defined by this time)
+        def C_wait_impl(self: C) -> C:
+            return C(self._a * 2, self._b * 3)
+
+        def fn_arg_C(x: C) -> Tensor:
+            return x._a + x._b
+
+        def fn(x: Tensor):
+            aw: Await[C] = torch.jit._awaitable(C_wait_impl, C(x, x))
+            _a = torch.eye(2)
+            ai = aw._a
+            awb = aw.b()
+            c = C(2 * x, 2 * x)
+            return _a + ai + x + c._a + c.b()
+
+        inp = torch.ones(2)
+
+        sm = torch.jit.script(fn)
+        out = fn(inp)
+        script_out = sm(inp)
+        self.assertTrue(torch.allclose(torch.eye(2) + 7 * torch.ones(2), script_out))
+        self.assertTrue(torch.allclose(script_out, out))
+        self.assertGraphContainsExactly(sm.graph, kind='prim::awaitable_wait', num_kind_nodes=2)
+
+    def test_await_nested(self):
+
+        class C:
+            def __init__(self, a: Tensor, b: Tensor):
+                self.__a = a
+                self.__b = b
+
+            def a(self) -> Tensor:
+                return self.__a
+
+        make_global(C)
+
+        def delayed(c: C) -> Await[Tensor]:
+            return torch.jit._awaitable_nowait(3 * c.a())
+
+        def fn(x: Tensor) -> Await[Await[Tensor]]:
+            return torch.jit._awaitable(delayed, C(2 * x, x))
+
+        def main(x: Tensor) -> Tensor:
+            awaw = fn(x)
+            return torch.jit._awaitable_wait(torch.jit._awaitable_wait(awaw))
+
+        inp = torch.eye(2)
+
+        sm = torch.jit.script(main)
+        out = main(inp)
+        script_out = sm(inp)
+        self.assertTrue(torch.allclose(6 * torch.eye(2), script_out))
+        self.assertTrue(torch.allclose(script_out, out))
+
+    def test_eager_await_non_scriptable(self):
+        # Tree type can not be compiled (Recursive type)
+        class Tree:
+            def __init__(self, v):
+                self.parent = torch.jit.annotate(Optional[Tree], None)
+                self.v = v
+        make_global(Tree)
+
+        def delayed(t: Tree):
+            t.v = t.v + 1
+            return t
+
+        aw = torch.jit._awaitable(delayed, Tree(2))
+        t = torch.jit._awaitable_wait(aw)
+        self.assertTrue(t.v == 3)
+
+    def test_await_isinstance(self):
+        def delayed(x: Tensor) -> Tensor:
+            return 2 * (x + 1)
+
+        def main(x: Tensor) -> Tensor:
+            aw = torch.jit._awaitable(delayed, x)
+            if torch.jit.is_scripting():
+                assert isinstance(aw, torch.jit._Await)
+            return torch.jit._awaitable_wait(aw)
+
+        inp = torch.eye(2)
+
+        sm = torch.jit.script(main)
+        out = main(inp)
+        script_out = sm(inp)
+        self.assertTrue(torch.allclose(2 * torch.eye(2) + 2 * torch.ones(2), script_out))
+        self.assertTrue(torch.allclose(script_out, out))
+
+    def test_await_eager_lazy(self):
+        def delayed(x: Tensor) -> Tensor:
+            return 2 * (x + 1)
+        t = torch.ones(2, dtype=torch.int64)
+        aw = torch.jit._awaitable(delayed, t)
+        self.assertTrue(isinstance(aw, torch._C._Await))
+        self.assertTrue(t.dtype == aw.dtype)
+
+    def test_await_out_of_interpreter(self):
+        def delayed(x: Tensor) -> Tensor:
+            return 2 * (x + 1)
+
+        def main(x: Tensor) -> Await[Tensor]:
+            aw = torch.jit._awaitable(delayed, x)
+            return aw
+
+        inp = torch.eye(2)
+
+        sm = torch.jit.script(main)
+        out_aw = main(inp)
+        out = torch.jit._awaitable_wait(out_aw)
+
+        script_out_aw = sm(inp)
+        script_out = torch.jit._awaitable_wait(script_out_aw)
+        self.assertTrue(torch.allclose(2 * torch.eye(2) + 2 * torch.ones(2), script_out))
+        self.assertTrue(torch.allclose(script_out, out))
+
+    def test_jit_trace(self):
+        def gap(x: Tensor):
+            return torch.relu(x) + torch.sin(x)
+
+        def delayed(x: Tensor) -> Tensor:
+            return 2 * (torch.cos(x) + 1)
+
+        def main(x: Tensor, y: Tensor) -> Tensor:
+            aw = torch.jit._awaitable(delayed, x)
+            z = gap(y)
+            k = torch.jit._awaitable_wait(aw)
+            return y + k
+
+        inp = torch.randn(2)
+        tm = torch.jit.trace(main, (inp, inp))
+        inp_check = torch.ones(2)
+        self.assertEqual(main(inp_check, inp_check), tm(inp_check, inp_check))
+
+    def test_await_multiout_save(self):
+        def gap(x: Tensor):
+            return torch.relu(x) + torch.sin(x)
+
+        def delayed(x: Tensor) -> Tuple[Tensor, List[Tensor]]:
+            l = [x * i for i in range(5)]
+            return (100 * x, l)
+
+        def main(x: Tensor) -> Tensor:
+            aw = torch.jit._awaitable(delayed, x)
+            z = gap(x)
+            (_, l) = torch.jit._awaitable_wait(aw)
+            return l[3] + z
+
+        inp = torch.eye(2)
+
+        sm = torch.jit.script(main)
+        out = main(inp)
+        script_out = sm(inp)
+        expected = 4.8415 * torch.eye(2)
+        self.assertTrue(torch.allclose(expected, script_out))
+        self.assertTrue(torch.allclose(script_out, out))
+
+        iofile = io.BytesIO()
+        torch.jit.save(sm, iofile)
+        iofile.seek(0)
+        sm = torch.jit.load(iofile)
+        script_out_load = sm(inp)
+        self.assertTrue(torch.allclose(expected, script_out_load))
+
+    def test_await_func_arg(self):
+        def gap(x: Tensor):
+            return torch.relu(x) + torch.sin(x)
+
+        def delayed(x: Tensor) -> Tensor:
+            return -1 * x
+
+        def fn(aw: Await[Tensor]) -> Tensor:
+            return 3 * torch.jit._awaitable_wait(aw)
+
+        def main(x: Tensor) -> Tensor:
+            aw = torch.jit._awaitable(delayed, x)
+            z = gap(x)
+            y = fn(aw)
+            return y + x
+
+        inp = torch.eye(2)
+
+        sm = torch.jit.script(main)
+        out = main(inp)
+        script_out = sm(inp)
+        expected = -2 * torch.eye(2)
+        self.assertTrue(torch.allclose(expected, script_out))
+        self.assertTrue(torch.allclose(script_out, out))
+
+        iofile = io.BytesIO()
+        torch.jit.save(sm, iofile)
+        iofile.seek(0)
+        sm = torch.jit.load(iofile)
+        script_out_load = sm(inp)
+        self.assertTrue(torch.allclose(expected, script_out_load))
diff --git a/test/jit/test_backends.py b/test/jit/test_backends.py
index 1a34fca32155..e114a54ae3f2 100644
--- a/test/jit/test_backends.py
+++ b/test/jit/test_backends.py
@@ -52,9 +52,6 @@ class BasicModule(torch.nn.Module):
     A simple Module used to test to_backend lowering machinery.
     """
 
-    def __init__(self):
-        super().__init__()
-
     def forward(self, x, h):
         return self.accum(x, h), self.sub_accum(x, h)
 
@@ -476,9 +473,6 @@ class BasicModuleAdd(torch.nn.Module):
     A simple add Module used to test to_backend lowering machinery.
     """
 
-    def __init__(self):
-        super().__init__()
-
     def forward(self, x, h):
         return x + h
 
@@ -568,16 +562,10 @@ class ModuleNotSupported(torch.nn.Module):
         """
         A module with an operator that is not supported.
         """
-        def __init__(self):
-            super().__init__()
-
         def forward(self, x, h):
             return x * h
             self._loweredmodule.forward()
 
-    def setUp(self):
-        super().setUp()
-
     def test_errors(self):
         scripted_module_n = torch.jit.script(ErrorMessagesWithCompiler.ModuleNotSupported())
         # Test exception is thrown when lowering a module with an unsupported operator
@@ -600,9 +588,6 @@ class BasicModuleSub(torch.nn.Module):
         """
         A simple subtraction Module to be used in CompModule.
         """
-        def __init__(self):
-            super().__init__()
-
         def forward(self, x, h):
             return x - h
 
@@ -694,9 +679,6 @@ class ModuleAdd(torch.nn.Module):
         A simple Module used to test to_backend lowering machinery.
         """
 
-        def __init__(self):
-            super().__init__()
-
         def forward(self, x, h):
             return x + h
 
diff --git a/test/jit/test_builtins.py b/test/jit/test_builtins.py
index e3670aa79872..0009e4b78634 100644
--- a/test/jit/test_builtins.py
+++ b/test/jit/test_builtins.py
@@ -28,17 +28,17 @@ class TestBuiltins(JitTestCase):
     def test_has_attr(self):
         class HasA(torch.nn.Module):
             def __init__(self):
-                super(HasA, self).__init__()
+                super().__init__()
                 self.a = 0
 
         class HasB(torch.nn.Module):
             def __init__(self):
-                super(HasB, self).__init__()
+                super().__init__()
                 self.b = 1
 
         class Mod(torch.nn.Module):
             def __init__(self):
-                super(Mod, self).__init__()
+                super().__init__()
                 self.mods = torch.nn.ModuleList([HasA(), HasB()])
 
             def forward(self):
@@ -59,7 +59,7 @@ def forward(self):
     def test_has_attr_invalid_args(self):
         class Mod(torch.nn.Module):
             def __init__(self):
-                super(Mod, self).__init__()
+                super().__init__()
                 self.mod = torch.nn.Linear(1, 1)
 
             def forward(self, name):
@@ -70,9 +70,6 @@ def forward(self, name):
             torch.jit.script(Mod())
 
         class Mod(torch.nn.Module):
-            def __init__(self):
-                super(Mod, self).__init__()
-
             def forward(self, name):
                 # not allowed, `torch.rand` is not a class type
                 return hasattr(torch.rand(2, 3), name)
@@ -158,20 +155,20 @@ def fn(x):
             return x.{}
         """
 
-        EQUALITY_MISMATCH = set([
+        EQUALITY_MISMATCH = {
             # TorchScript doesn't have real enums so they return an int instead
             # of the actual value
             'dtype',
             'layout',
-        ])
-        MISSING_PROPERTIES = set([
+        }
+        MISSING_PROPERTIES = {
             'grad_fn',
             # This is an undocumented property so it's not included
             "output_nr",
             # This has a longer implementation, maybe not worth copying to
             # TorchScript if named tensors don't work there anyways
             'names',
-        ])
+        }
 
         for p in properties:
             if p in MISSING_PROPERTIES:
diff --git a/test/jit/test_class_type.py b/test/jit/test_class_type.py
index d01063a65a3b..80829795d0ab 100644
--- a/test/jit/test_class_type.py
+++ b/test/jit/test_class_type.py
@@ -29,7 +29,7 @@ def test_reference_semantics(self):
         Test that modifications made to a class instance in TorchScript
         are visible in eager.
         """
-        class Foo(object):
+        class Foo:
             def __init__(self, a: int):
                 self.a = a
 
@@ -59,7 +59,7 @@ def test_fn(obj: Foo):
         self.assertEqual(obj.attr, 2)
 
     def test_get_with_method(self):
-        class FooTest(object):
+        class FooTest:
             def __init__(self, x):
                 self.foo = x
 
@@ -74,7 +74,7 @@ def fn(x):
         self.assertEqual(fn(input), input)
 
     def test_get_attr(self):
-        class FooTest(object):  # noqa: B903
+        class FooTest:  # noqa: B903
             def __init__(self, x):
                 self.foo = x
 
@@ -87,7 +87,7 @@ def fn(x):
         self.assertEqual(fn(input), input)
 
     def test_in(self):
-        class FooTest(object):  # noqa: B903
+        class FooTest:  # noqa: B903
             def __init__(self):
                 pass
 
@@ -102,7 +102,7 @@ def fn():
         self.assertEqual(fn(), (True, False))
 
     def test_set_attr_in_method(self):
-        class FooTest(object):
+        class FooTest:
             def __init__(self, x: int) -> None:
                 self.foo = x
 
@@ -120,7 +120,7 @@ def fn(x: int) -> int:
     def test_set_attr_type_mismatch(self):
         with self.assertRaisesRegexWithHighlight(RuntimeError, "Wrong type for attribute assignment", "self.foo = 10"):
             @torch.jit.script
-            class FooTest(object):
+            class FooTest:
                 def __init__(self, x):
                     self.foo = x
                     self.foo = 10  # should error since int != Tensor
@@ -128,7 +128,7 @@ def __init__(self, x):
     def test_get_attr_not_initialized(self):
         with self.assertRaisesRegexWithHighlight(RuntimeError, "object has no attribute or method", "self.asdf"):
             @torch.jit.script
-            class FooTest(object):
+            class FooTest:
                 def __init__(self, x):
                     self.foo = x
 
@@ -138,7 +138,7 @@ def get_non_initialized(self):
     def test_set_attr_non_initialized(self):
         with self.assertRaisesRegexWithHighlight(RuntimeError, "Tried to set nonexistent attribute", "self.bar = y"):
             @torch.jit.script
-            class FooTest(object):
+            class FooTest:
                 def __init__(self, x):
                     self.foo = x
 
@@ -160,7 +160,7 @@ def FooTest(x):
     def test_type_annotations(self):
         with self.assertRaisesRegexWithHighlight(RuntimeError, "Expected a value of type \'bool", ""):
             @torch.jit.script  # noqa: B903
-            class FooTest(object):  # noqa: B903
+            class FooTest:  # noqa: B903
                 def __init__(self, x: bool) -> None:
                     self.foo = x
 
@@ -173,13 +173,13 @@ def fn(x):
     def test_conditional_set_attr(self):
         with self.assertRaisesRegexWithHighlight(RuntimeError, "assignment cannot be in a control-flow block", ""):
             @torch.jit.script
-            class FooTest(object):
+            class FooTest:
                 def __init__(self, x):
                     if 1 == 1:
                         self.attr = x
 
     def test_class_type_as_param(self):
-        class FooTest(object):  # noqa: B903
+        class FooTest:  # noqa: B903
             def __init__(self, x):
                 self.attr = x
 
@@ -198,7 +198,7 @@ def fn2(x):
         self.assertEqual(fn2(input), input)
 
     def test_out_of_order_methods(self):
-        class FooTest(object):
+        class FooTest:
             def __init__(self, x):
                 self.x = x
                 self.x = self.get_stuff(x)
@@ -215,7 +215,7 @@ def fn(x):
         self.assertEqual(fn(input), input + input)
 
     def test_save_load_with_classes(self):
-        class FooTest(object):
+        class FooTest:
             def __init__(self, x):
                 self.x = x
 
@@ -245,7 +245,7 @@ def forward(self, a):
         self.assertEqual(input, output)
 
     def test_save_load_with_classes_returned(self):
-        class FooTest(object):
+        class FooTest:
             def __init__(self, x):
                 self.x = x
 
@@ -277,16 +277,16 @@ def forward(self, a):
         self.assertEqual(input, output)
 
     def test_save_load_with_classes_nested(self):
-        class FooNestedTest(object):  # noqa: B903
+        class FooNestedTest:  # noqa: B903
             def __init__(self, y):
                 self.y = y
 
-        class FooNestedTest2(object):
+        class FooNestedTest2:
             def __init__(self, y):
                 self.y = y
                 self.nested = FooNestedTest(y)
 
-        class FooTest(object):
+        class FooTest:
             def __init__(self, x):
                 self.class_attr = FooNestedTest(x)
                 self.class_attr2 = FooNestedTest2(x)
@@ -315,7 +315,7 @@ def forward(self, a):
         self.assertEqual(2 * input, output)
 
     def test_python_interop(self):
-        class Foo(object):  # noqa: B903
+        class Foo:  # noqa: B903
             def __init__(self, x, y):
                 self.x = x
                 self.y = y
@@ -341,7 +341,7 @@ def use_foo(foo: Foo) -> Foo:
         self.assertEqual(y, f2.y)
 
     def test_class_specialization(self):
-        class Foo(object):  # noqa: B903
+        class Foo:  # noqa: B903
             def __init__(self, x, y):
                 self.x = x
                 self.y = y
@@ -365,7 +365,7 @@ def use_foo(foo: Foo, foo2: Foo, tup: Tuple[Foo, Foo]) -> torch.Tensor:
         FileCheck().check_count("prim::GetAttr", 4).run(graphstr)
 
     def test_class_sorting(self):
-        class Foo(object):  # noqa: B903
+        class Foo:  # noqa: B903
             def __init__(self, x: int) -> None:
                 self.x = x
 
@@ -429,7 +429,7 @@ def test():
 
         with self.assertRaisesRegexWithHighlight(RuntimeError, "must define a __lt__", ""):
             @torch.jit.script
-            class NoMethod(object):
+            class NoMethod:
                 def __init__(self):
                     pass
 
@@ -441,7 +441,7 @@ def test():
             test()
 
         @torch.jit.script
-        class WrongLt(object):
+        class WrongLt:
             def __init__(self):
                 pass
 
@@ -459,7 +459,7 @@ def test():
 
     def test_class_inheritance(self):
         @torch.jit.script
-        class Base(object):
+        class Base:
             def __init__(self):
                 self.b = 2
 
@@ -538,7 +538,7 @@ def forward(self, a):
 
     def test_interface(self):
         @torch.jit.script
-        class Foo(object):
+        class Foo:
             def __init__(self):
                 pass
 
@@ -549,7 +549,7 @@ def two(self, x):
                 return 2 * x
 
         @torch.jit.script
-        class Bar(object):
+        class Bar:
             def __init__(self):
                 pass
 
@@ -560,7 +560,7 @@ def two(self, x):
                 return 2 / x
 
         @torch.jit.interface
-        class OneTwo(object):
+        class OneTwo:
             def one(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
                 pass
 
@@ -568,7 +568,7 @@ def two(self, x: torch.Tensor) -> torch.Tensor:
                 pass
 
         @torch.jit.interface
-        class OneTwoThree(object):
+        class OneTwoThree:
             def one(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
                 pass
 
@@ -579,7 +579,7 @@ def three(self, x: torch.Tensor) -> torch.Tensor:
                 pass
 
         @torch.jit.interface
-        class OneTwoWrong(object):
+        class OneTwoWrong:
             def one(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
                 pass
 
@@ -587,7 +587,7 @@ def two(self, x: int) -> int:
                 pass
 
         @torch.jit.script
-        class NotMember(object):
+        class NotMember:
             def __init__(self):
                 pass
 
@@ -596,7 +596,7 @@ def one(self, x, y):
             # missing two
 
         @torch.jit.script
-        class NotMember2(object):
+        class NotMember2:
             def __init__(self):
                 pass
 
@@ -650,7 +650,7 @@ def wrong4(x: OneTwoWrong) -> int:
         # Test interface/class python assignment
         class TestPyAssign(nn.Module):
             def __init__(self):
-                super(TestPyAssign, self).__init__()
+                super().__init__()
                 self.proxy_mod = Foo()
 
             def forward(self, x):
@@ -665,7 +665,7 @@ def forward(self, x):
 
         class TestPyAssignError(nn.Module):
             def __init__(self, obj):
-                super(TestPyAssignError, self).__init__()
+                super().__init__()
                 self.proxy_mod = obj
 
             def forward(self, x):
@@ -678,7 +678,7 @@ def forward(self, x):
             torch.jit.script(TestPyAssignError(Foo()))
 
         # test pure python object assignment to interface fails
-        class PyClass(object):
+        class PyClass:
             def __init__(self):
                 pass
 
@@ -690,7 +690,7 @@ def __init__(self):
 
     def test_overloaded_fn(self):
         @torch.jit.script
-        class Foo(object):
+        class Foo:
             def __init__(self, x):
                 self.x = x
 
@@ -715,7 +715,7 @@ def test_overload():
 
         # TODO - support compiling classes from strings in jit.CompilationUnit
         @torch.jit.script
-        class MyClass(object):
+        class MyClass:
             def __init__(self, x: int) -> None:
                 self.x = x
 
@@ -827,7 +827,7 @@ def test():
 
     def test_cast_overloads(self):
         @torch.jit.script
-        class Foo(object):
+        class Foo:
             def __init__(self, val: float) -> None:
                 self.val = val
 
@@ -858,7 +858,7 @@ def test(foo: Foo) -> Tuple[int, float, bool]:
         self.assertTrue("0." in (str(Foo(0.0))))
 
         @torch.jit.script
-        class BadBool(object):
+        class BadBool:
             def __init__(self):
                 pass
 
@@ -874,7 +874,7 @@ def test():
 
     def test_init_compiled_first(self):
         @torch.jit.script  # noqa: B903
-        class Foo(object):  # noqa: B903
+        class Foo:  # noqa: B903
             def __before_init__(self):
                 # accessing this field should not throw, since __init__ should be compiled
                 return self.x
@@ -885,7 +885,7 @@ def __init__(self, x, y):
 
     def test_class_constructs_itself(self):
         @torch.jit.script  # noqa: B903
-        class LSTMStateStack(object):  # noqa: B903
+        class LSTMStateStack:  # noqa: B903
             def __init__(self, num_layers: int, hidden_size: int) -> None:
                 self.num_layers = num_layers
                 self.hidden_size = hidden_size
@@ -903,13 +903,13 @@ def copy(self):
 
     def test_optional_type_promotion(self):
         @torch.jit.script
-        class Leaf(object):
+        class Leaf:
             def __init__(self):
                 self.x = 1
 
         # should not throw
         @torch.jit.script  # noqa: B903
-        class Tree(object):  # noqa: B903
+        class Tree:  # noqa: B903
             def __init__(self):
                 self.child = torch.jit.annotate(Optional[Leaf], None)
 
@@ -922,7 +922,7 @@ def test_recursive_class(self):
         """
         with self.assertRaises(RuntimeError):
             @torch.jit.script  # noqa: B903
-            class Tree(object):  # noqa: B903
+            class Tree:  # noqa: B903
                 def __init__(self):
                     self.parent = torch.jit.annotate(Optional[Tree], None)
 
@@ -931,7 +931,7 @@ class M(torch.nn.Module):
             __constants__ = ["w"]
 
             def __init__(self, w):
-                super(M, self).__init__()
+                super().__init__()
                 self.w = w
 
             def forward(self, x):
@@ -953,7 +953,7 @@ def forward(self, x):
             self.assertEqual(m.w, m_loaded.w)
 
     def test_py_class_to_ivalue_missing_attribute(self):
-        class Foo(object):
+        class Foo:
             i : int
             f : float
 
@@ -977,7 +977,7 @@ def test_unused_method(self):
         Test unused methods on scripted classes.
         """
         @torch.jit.script
-        class Unused(object):
+        class Unused:
             def __init__(self):
                 self.count: int = 0
                 self.items: List[int] = []
@@ -1029,7 +1029,7 @@ def test_self_referential_method(self):
         in its type annotations.
         """
         @torch.jit.script
-        class Meta(object):
+        class Meta:
             def __init__(self, a: int):
                 self.a = a
 
@@ -1240,7 +1240,7 @@ def free_function(x: int) -> int:
             return x + 1
 
         @torch.jit.script
-        class Properties(object):
+        class Properties:
             __jit_unused_properties__ = ["unsupported"]
 
             def __init__(self, a: int):
@@ -1268,7 +1268,7 @@ def attr(self, value: int):
                 self.a = value + 3
 
         @torch.jit.script
-        class NoSetter(object):
+        class NoSetter:
             def __init__(self, a: int):
                 self.a = a
 
@@ -1277,7 +1277,7 @@ def attr(self) -> int:
                 return free_function(self.a)
 
         @torch.jit.script
-        class MethodThatUsesProperty(object):
+        class MethodThatUsesProperty:
             def __init__(self, a: int):
                 self.a = a
 
@@ -1315,7 +1315,7 @@ def test_custom_delete(self):
         Test that del can be called on an instance of a class that
         overrides __delitem__.
         """
-        class Example(object):
+        class Example:
             def __init__(self):
                 self._data: Dict[str, torch.Tensor] = {"1": torch.tensor(1.0)}
 
@@ -1333,7 +1333,7 @@ def fn() -> bool:
         self.checkScript(fn, ())
 
         # Test the case in which the class does not have __delitem__ defined.
-        class NoDelItem(object):
+        class NoDelItem:
             def __init__(self):
                 self._data: Dict[str, torch.Tensor] = {"1": torch.tensor(1.0)}
 
@@ -1359,7 +1359,7 @@ def test_recursive_script_builtin_type_resolution(self):
         device_t = torch.device
         device_ty = torch.device
 
-        class A(object):
+        class A:
             def __init__(self):
                 pass
 
@@ -1425,13 +1425,13 @@ def test_class_attribute_wrong_type(self):
         to an IValue that has an attribute of the wrong type.
         """
         @torch.jit.script  # noqa: B903
-        class ValHolder(object):  # noqa: B903
+        class ValHolder:  # noqa: B903
             def __init__(self, val):
                 self.val = val
 
         class Mod(nn.Module):
             def __init__(self):
-                super(Mod, self).__init__()
+                super().__init__()
                 self.mod1 = ValHolder("1")
                 self.mod2 = ValHolder("2")
 
@@ -1450,7 +1450,7 @@ def test_recursive_scripting(self):
         Test that class types are recursively scripted when an Python instance of one
         is encountered as a module attribute.
         """
-        class Class(object):
+        class Class:
             def __init__(self, a: int):
                 self.a = a
 
@@ -1473,7 +1473,7 @@ def test_recursive_scripting_failed(self):
         are added as failed attributes and do not cause compilation itself
         to fail unless they are used in scripted code.
         """
-        class UnscriptableClass(object):
+        class UnscriptableClass:
             def __init__(self, a: int):
                 self.a = a
 
@@ -1511,7 +1511,7 @@ def forward(self, x: int) -> int:
 
 
     def test_unresolved_class_attributes(self):
-        class UnresolvedAttrClass(object):
+        class UnresolvedAttrClass:
             def __init__(self):
                 pass
 
diff --git a/test/jit/test_complexity.py b/test/jit/test_complexity.py
index d0d24c269e3b..569a330486a0 100644
--- a/test/jit/test_complexity.py
+++ b/test/jit/test_complexity.py
@@ -44,12 +44,12 @@ def num_non_tensor_nodes(block):
 
 class TestComplexity(JitTestCase):
     def setUp(self):
-        super(TestComplexity, self).setUp()
+        super().setUp()
         self.grad_enabled = torch.is_grad_enabled()
         torch.set_grad_enabled(False)
 
     def tearDown(self):
-        super(TestComplexity, self).tearDown()
+        super().tearDown()
         torch.set_grad_enabled(self.grad_enabled)
 
     @suppress_warnings
diff --git a/test/jit/test_convert_activation.py b/test/jit/test_convert_activation.py
index 0c06fb69d349..f414459ecec4 100644
--- a/test/jit/test_convert_activation.py
+++ b/test/jit/test_convert_activation.py
@@ -109,7 +109,7 @@ def test2(x):
         # at the global scope
         class Test3(nn.Module):
             def __init__(self, x):
-                super(Test3, self).__init__()
+                super().__init__()
                 self.x = x
 
             def forward(self):
diff --git a/test/jit/test_cuda.py b/test/jit/test_cuda.py
index a151756d598f..6937af9f2927 100644
--- a/test/jit/test_cuda.py
+++ b/test/jit/test_cuda.py
@@ -44,13 +44,10 @@ class TestCUDA(JitTestCase):
     """
     A suite of tests for the CUDA API in TorchScript.
     """
-    def setUp(self):
-        super(TestCUDA, self).setUp()
-
     def tearDown(self):
         gc.collect()
         torch.cuda.empty_cache()
-        super(TestCUDA, self).tearDown()
+        super().tearDown()
 
     @skipIfRocm
     @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
diff --git a/test/jit/test_dataclasses.py b/test/jit/test_dataclasses.py
index ee674c4326d2..b8f68d7073ac 100644
--- a/test/jit/test_dataclasses.py
+++ b/test/jit/test_dataclasses.py
@@ -65,9 +65,6 @@ class TestDataclasses(JitTestCase):
     def tearDownClass(cls):
          torch._C._jit_clear_class_registry()
 
-    # We only support InitVar in JIT dataclasses for Python 3.8+ because it would be very hard
-    # to support without the `type` attribute on InitVar (see comment in _dataclass_impls.py).
-    @unittest.skipIf(sys.version_info < (3, 8), "InitVar not supported in Python < 3.8")
     def test_init_vars(self):
         @torch.jit.script
         @dataclass(order=True)
diff --git a/test/jit/test_dce.py b/test/jit/test_dce.py
index c3ca980972c1..60a18b3595ff 100644
--- a/test/jit/test_dce.py
+++ b/test/jit/test_dce.py
@@ -22,7 +22,7 @@ def forward(self):
 
     def test_setattr_removed(self):
         @torch.jit.script
-        class Thing1(object):
+        class Thing1:
             def __init__(self):
                 self.x = torch.zeros([2, 2])
 
diff --git a/test/jit/test_enum.py b/test/jit/test_enum.py
index 3221a35ea4fc..5198688c08df 100644
--- a/test/jit/test_enum.py
+++ b/test/jit/test_enum.py
@@ -244,7 +244,7 @@ class Color(Enum):
 
         class TestModule(torch.nn.Module):
             def __init__(self, e: Color):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.e = e
 
             def forward(self):
@@ -270,7 +270,7 @@ class Color(Enum):
 
         class TestModule(torch.nn.Module):
             def __init__(self, e: Color):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.e = e
 
             def forward(self):
@@ -306,7 +306,7 @@ class Color(Enum):
 
         class TestModule(torch.nn.Module):
             def __init__(self, e: Color):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.e = e
 
             def forward(self):
diff --git a/test/jit/test_exception.py b/test/jit/test_exception.py
index dce38e3be892..2cc000196291 100644
--- a/test/jit/test_exception.py
+++ b/test/jit/test_exception.py
@@ -10,7 +10,7 @@ class TestException(TestCase):
     def test_pyop_exception_message(self):
         class Foo(torch.jit.ScriptModule):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(1, 10, kernel_size=5)
 
             @torch.jit.script_method
@@ -156,8 +156,7 @@ def fn(self):
 
     def test_custom_python_exception(self):
         class MyValueError(ValueError):
-            def __init__(self, msg):
-                super(MyValueError, self).__init__(msg)
+            pass
 
         @torch.jit.script
         def fn():
diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py
index 3e2fc80be24a..70cf01fb058a 100644
--- a/test/jit/test_freezing.py
+++ b/test/jit/test_freezing.py
@@ -43,7 +43,7 @@ class TestFreezing(JitTestCase):
     def test_freeze_module(self):
         class M(nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.a = 1                      # folded
                 self.b = 1.2                    # folded
                 self.c = "hello"                # folded
@@ -101,7 +101,7 @@ def forward(self, x):
     def test_freeze_module_with_submodule(self):
         class SubModule(nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.a = 11
                 self.b = 2
 
@@ -110,7 +110,7 @@ def forward(self, x):
 
         class SubModule2(nn.Module):
             def __init__(self):
-                super(SubModule2, self).__init__()
+                super().__init__()
                 self.a = 12
                 self.b = 2
 
@@ -120,7 +120,7 @@ def forward(self, x):
 
         class TestModule(nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.sub1 = SubModule()
                 self.sub2 = SubModule2()
                 self.a = 3
@@ -166,7 +166,7 @@ def forward(self, x):
     def test_freeze_module_with_fork(self):
         class SubModule(nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.a = torch.ones(20, 20)
                 self.b = torch.ones(20, 20)
 
@@ -175,7 +175,7 @@ def forward(self, x):
 
         class TestModule(nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.sub = SubModule()
 
             def forward(self, x):
@@ -206,7 +206,7 @@ def forward(self, x):
     def test_freeze_module_with_nested_fork(self):
         class SubModule(nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.a = torch.ones(20, 20)
                 self.b = torch.ones(20, 20)
 
@@ -215,7 +215,7 @@ def forward(self, x):
 
         class SubModule2(nn.Module):
             def __init__(self):
-                super(SubModule2, self).__init__()
+                super().__init__()
                 self.sub = SubModule()
                 self.c = torch.ones(20, 20)
 
@@ -227,7 +227,7 @@ def forward(self, x):
 
         class TestModule(nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.sub = SubModule2()
                 self.d = 1
 
@@ -266,7 +266,7 @@ def foo(x):
 
         class TestModule(nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.a = torch.ones(20, 20)
                 self.b = torch.ones(20, 20)
 
@@ -307,7 +307,7 @@ def foo(x, y):
 
         class TestModule(nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.a = torch.ones(20, 20)
                 self.b = torch.ones(20, 20)
 
@@ -347,7 +347,7 @@ def forward(self, x):
     def test_freeze_module_with_sharedclasstype(self):
         class SubModule(nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1.1])
                 self.b = torch.tensor([2.2])
 
@@ -366,7 +366,7 @@ def modify_b(self, x):
 
         class SubModule2(nn.Module):
             def __init__(self):
-                super(SubModule2, self).__init__()
+                super().__init__()
                 self.sub = SubModule()
                 self.b = torch.tensor([3.3])
 
@@ -376,7 +376,7 @@ def forward(self, x):
 
         class TestModule(nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.sub1 = SubModule()  # sub1 and sub2.sub shared same class type.
                 self.sub2 = SubModule2()
                 self.a = torch.tensor([4.4])
@@ -439,7 +439,7 @@ def forward(self, x):
     def test_freeze_module_with_nestedaliasing(self):
         class SubModule(nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1.1])
                 self.b = torch.tensor([2.2])
 
@@ -459,7 +459,7 @@ def modify_b(self, x):
 
         class SubModule2(nn.Module):
             def __init__(self):
-                super(SubModule2, self).__init__()
+                super().__init__()
                 self.sub = Sub  # aliasing
 
             def forward(self, x):
@@ -467,7 +467,7 @@ def forward(self, x):
 
         class TestModule(nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.sub1 = Sub  # aliasing
                 self.sub2 = SubModule2()
 
@@ -495,7 +495,7 @@ def forward(self, x):
     def test_freeze_module_with_nestedaliasingscalar(self):
         class SubModule(nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.a = 1.1
                 self.b = 2.2
 
@@ -515,7 +515,7 @@ def modify_b(self, x):
 
         class SubModule2(nn.Module):
             def __init__(self):
-                super(SubModule2, self).__init__()
+                super().__init__()
                 self.sub = Sub  # aliasing
 
             def forward(self, x):
@@ -523,7 +523,7 @@ def forward(self, x):
 
         class TestModule(nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.sub1 = Sub  # aliasing
                 self.sub2 = SubModule2()
 
@@ -551,7 +551,7 @@ def forward(self, x):
     def test_freeze_module_with_preserve_sub_module(self):
         class SubModule(nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1.1])
                 self.b = 2.2
 
@@ -560,7 +560,7 @@ def forward(self, x):
 
         class TestModule(nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.sub1 = SubModule()  # aliasing
                 self.sub2 = SubModule()
 
@@ -584,7 +584,7 @@ def forward(self, x):
     def test_freeze_module_with_preserve_sub_module_and_mutation(self):
         class SubModule(nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1.1])
                 self.b = 2.2
 
@@ -594,7 +594,7 @@ def forward(self, x):
 
         class TestModule(nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.sub1 = SubModule()  # aliasing
                 self.sub2 = SubModule()
 
@@ -622,7 +622,7 @@ def forward(self, x):
     def test_freeze_module_with_helperfunction(self):
         class SubModule(nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.a = 11
                 self.b = 2
 
@@ -631,7 +631,7 @@ def forward(self, x):
 
         class TestModule(nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.sub = SubModule()
                 self.a = 3
                 self.b = 4
@@ -655,7 +655,7 @@ def _forward(self, x):
     def test_freeze_module_with_inplace_mutable(self):
         class FreezeMe(torch.jit.ScriptModule):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = [11, 22]
 
             @torch.jit.script_method
@@ -677,7 +677,7 @@ def forward(self, x):
     def test_freeze_module_with_mutable_list(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = [1, 2]
 
             def forward(self, x):
@@ -704,7 +704,7 @@ def forward(self, x):
     def test_freeze_module_with_mutable_dict(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = {"layer" : "4"}
 
             def forward(self, x):
@@ -733,7 +733,7 @@ def modify_a(self, x):
     def test_freeze_module_with_mutable_tensor(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1., 2., 3.])
 
             def forward(self, x):
@@ -755,7 +755,7 @@ def forward(self, x):
     def test_freeze_module_with_tuple(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = (torch.tensor([1, 2, 3, 4, 5, 6]), "hi")
 
             def forward(self, x):
@@ -777,7 +777,7 @@ def forward(self, x):
     def test_freeze_module_with_tensor(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1, 2, 3, 4, 5, 6])
 
             def forward(self, x):
@@ -799,7 +799,7 @@ def forward(self, x):
     def test_freeze_module_with_list(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = [torch.tensor([1, 2, 3, 4, 5, 6])]
 
             def forward(self, x):
@@ -820,7 +820,7 @@ def forward(self, x):
     def test_freeze_module_with_aliased_tensor_attr(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1, 2, 3, 4, 5, 6])
                 self.b = self.a.view(2, 3)
 
@@ -841,7 +841,7 @@ def forward(self, x):
     def test_freeze_module_with_aliased_tensor_attr2(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1, 2, 3, 4, 5, 6])
                 self.b = {"layer" : ([self.a.view(2, 3), torch.tensor([10])], 20)}
                 self.c = ([self.a.view(2, 3), torch.tensor([10])], 20)
@@ -862,7 +862,7 @@ def forward(self, x):
     def test_freeze_module_with_aliased_tensor_attr3(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1, 2, 3, 4, 5, 6])
                 self.b = [self.a, torch.tensor([10])]
 
@@ -885,7 +885,7 @@ def forward(self, x):
     def test_freeze_module_with_aliased_tensor_attr4(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1, 2, 3, 4, 5, 6])
                 self.b = [self.a, torch.tensor([10])]
 
@@ -907,7 +907,7 @@ def test_freeze_module_with_overlapping_attrs(self):
 
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.b = [a.view(3, 2), torch.tensor([10])]
                 self.c = (20, a.view(2, 3))
 
@@ -927,7 +927,7 @@ def forward(self, x):
     def test_freeze_module_with_aliased_attr(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = [1, 2, 3, 4, 5, 6]
                 self.b = self.a
                 self.c = (self.a, 10)
@@ -954,7 +954,7 @@ def forward(self, x):
     def test_freeze_module_with_aliased_attr2(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = [1, 2, 3, 4, 5, 6]
                 self.b = ([11], [10])
 
@@ -978,7 +978,7 @@ def forward(self, x):
     def test_freeze_module_with_aliased_attr3(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = [1, 2, 3, 4, 5, 6]
                 self.b = ([11], [10])
 
@@ -1002,7 +1002,7 @@ def forward(self, x):
     def test_freeze_module_return_self(self):
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1., 2., 3.])
 
             def forward(self, x):
@@ -1016,14 +1016,14 @@ def forward(self, x):
 
     def test_freeze_module_inlining(self):
         @torch.jit.script  # noqa: B903
-        class Obj(object):  # noqa: B903
+        class Obj:  # noqa: B903
             def __init__(self, x: int, y: int):
                 self.x = x
                 self.y = y
 
         class Mod(nn.Module):
             def __init__(self):
-                super(Mod, self).__init__()
+                super().__init__()
                 self.obj = Obj(2, 3)
 
             def forward(self, i: int):
@@ -1046,7 +1046,7 @@ def test_freeze_module_return_sub_module(self):
 
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(1, 32, 3, 1)
 
             def forward(self, x):
@@ -1062,7 +1062,7 @@ def test_freeze_module_no_forward(self):
 
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.lin = nn.Linear(10, 1)
 
             @torch.jit.export
@@ -1081,7 +1081,7 @@ def test_freeze_no_forward(self):
 
         class FreezeMe(nn.Module):
             def __init__(self):
-                super(FreezeMe, self).__init__()
+                super().__init__()
                 self.lin = nn.Linear(10, 1)
 
             @torch.jit.export
@@ -1099,7 +1099,7 @@ def foo(self, x):
     def test_freeze_module_in_training_mode(self):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(1, 32, 3, 1)
                 self.conv2 = nn.Conv2d(32, 64, 3, 1)
                 self.dropout1 = nn.Dropout2d(0.25)
@@ -1243,7 +1243,7 @@ def test_freeze_module_detach_gradient(self):
     def test_freeze_module_with_user_preserved_attr(self):
         class Module(nn.Module):
             def __init__(self):
-                super(Module, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1.1])
                 self.b = torch.tensor([2.2])
 
@@ -1260,7 +1260,7 @@ def forward(self, x):
     def test_freeze_module_with_user_preserved_method(self):
         class Module(nn.Module):
             def __init__(self):
-                super(Module, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1.1])
                 self.b = torch.tensor([2.2])
 
@@ -1291,7 +1291,7 @@ def modify_b(self, x):
     def test_freeze_module_with_user_preserved_method2(self):
         class Module(nn.Module):
             def __init__(self):
-                super(Module, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([1.1])
                 self.b = torch.tensor([2.2])
 
@@ -1313,7 +1313,7 @@ def modify_a(self, x):
     def test_freeze_module_with_user_preserved_attribute_on_submodule(self):
         class SubModule(nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.a = 1
                 self.b = 2
 
@@ -1322,7 +1322,7 @@ def forward(self):
 
         class Module(nn.Module):
             def __init__(self):
-                super(Module, self).__init__()
+                super().__init__()
                 self.sub1 = SubModule()
                 self.sub2 = SubModule()
 
@@ -1347,7 +1347,7 @@ def forward(self):
     def test_freeze_module_with_user_preserved_attribute_on_unused_submodule(self):
         class SubModule(nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.a = 1
                 self.b = 2
 
@@ -1360,7 +1360,7 @@ def method_a(self):
 
         class Module(nn.Module):
             def __init__(self):
-                super(Module, self).__init__()
+                super().__init__()
                 self.sub = SubModule()
 
             def forward(self):
@@ -1377,9 +1377,6 @@ def forward(self):
 
     def test_freeze_module_with_user_preserved_method_on_submodule(self):
         class SubModule(nn.Module):
-            def __init__(self):
-                super(SubModule, self).__init__()
-
             def forward(self, x):
                 return self.method_a(x) + self.method_b(x)
 
@@ -1391,7 +1388,7 @@ def method_b(self, x):
 
         class Module(nn.Module):
             def __init__(self):
-                super(Module, self).__init__()
+                super().__init__()
                 self.sub = SubModule()
 
             def forward(self, x):
@@ -1409,7 +1406,7 @@ def forward(self, x):
     def test_module_with_shared_type_instances(self):
         class Child(nn.Module):
             def __init__(self):
-                super(Child, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(1, 1, 1).to(dtype=torch.float32)
 
             def forward(self, x):
@@ -1418,7 +1415,7 @@ def forward(self, x):
 
         class Parent(nn.Module):
             def __init__(self):
-                super(Parent, self).__init__()
+                super().__init__()
                 self.quant = torch.ao.quantization.QuantStub()
                 self.conv1 = nn.Conv2d(1, 1, 1).to(dtype=torch.float32)
                 self.child = Child()
@@ -1459,13 +1456,13 @@ def _static_quant(model):
 
     def test_module_getattr_indirection(self):
         @torch.jit.script
-        class ValHolder(object):
+        class ValHolder:
             def __init__(self, val: int):
                 self.val: int = val
 
         class Mod(nn.Module):
             def __init__(self):
-                super(Mod, self).__init__()
+                super().__init__()
                 self.mod1 = ValHolder(1)
                 self.mod2 = ValHolder(2)
 
@@ -1536,7 +1533,7 @@ def forward(self, inp: torch.Tensor) -> torch.Tensor:
 
         class ImplementsInterface(torch.nn.Module):
             def __init__(self):
-                super(ImplementsInterface, self).__init__()
+                super().__init__()
                 self.sum = torch.zeros((2, 2))
 
             def forward(self, inp: torch.Tensor) -> torch.Tensor:
@@ -1612,7 +1609,7 @@ def forward(self, inp: torch.Tensor) -> torch.Tensor:
 
         class InnerImpl(torch.nn.Module):
             def __init__(self):
-                super(InnerImpl, self).__init__()
+                super().__init__()
                 self.x = torch.ones((2, 2))
 
             def forward(self, inp):
@@ -1622,7 +1619,7 @@ class OuterImpl(torch.nn.Module):
             inner_impl: InnerInterface
 
             def __init__(self):
-                super(OuterImpl, self).__init__()
+                super().__init__()
                 self.inner_impl = InnerImpl()
 
             def forward(self, inp):
@@ -1632,7 +1629,7 @@ class WrapperModule(torch.nn.Module):
             outer_impl: OuterInterface
 
             def __init__(self):
-                super(WrapperModule, self).__init__()
+                super().__init__()
                 self.outer_impl = OuterImpl()
 
             def forward(self, inp):
@@ -1662,7 +1659,7 @@ def forward(self, inp: torch.Tensor) -> torch.Tensor:
 
         class InnerImpl1(torch.nn.Module):
             def __init__(self):
-                super(InnerImpl1, self).__init__()
+                super().__init__()
                 self.x = torch.ones((2, 2))
 
             def forward(self, inp):
@@ -1671,7 +1668,7 @@ def forward(self, inp):
 
         class InnerImpl2(torch.nn.Module):
             def __init__(self):
-                super(InnerImpl2, self).__init__()
+                super().__init__()
                 self.x = torch.ones((2, 2)) * 2
 
             def forward(self, inp):
@@ -1681,7 +1678,7 @@ class OuterImpl(torch.nn.Module):
             inner_impl: InnerInterface
 
             def __init__(self):
-                super(OuterImpl, self).__init__()
+                super().__init__()
                 self.inner_impl = InnerImpl1()
                 self.impl1 = InnerImpl1()
                 self.impl2 = InnerImpl1()
@@ -1699,7 +1696,7 @@ class WrapperModule(torch.nn.Module):
             outer_impl: OuterInterface
 
             def __init__(self):
-                super(WrapperModule, self).__init__()
+                super().__init__()
                 self.outer_impl = OuterImpl()
 
             def forward(self, inp):
@@ -1730,7 +1727,7 @@ class WrapperModule1(torch.nn.Module):
             interface_impl: MyInterface
 
             def __init__(self):
-                super(WrapperModule1, self).__init__()
+                super().__init__()
                 self.interface_impl = Impl1()
                 self.impl1 = Impl1()
                 self.impl2 = Impl2()
@@ -1752,7 +1749,7 @@ class WrapperModule2(torch.nn.Module):
             interface_impl: MyInterface
 
             def __init__(self):
-                super(WrapperModule2, self).__init__()
+                super().__init__()
                 self.interface_impl = Impl1()
                 self.impl1 = Impl1()
                 self.impl2 = Impl2()
@@ -1795,7 +1792,7 @@ def forward(self, inp: torch.Tensor) -> torch.Tensor:
 
         class InnerImpl(torch.nn.Module):
             def __init__(self):
-                super(InnerImpl, self).__init__()
+                super().__init__()
                 self.x = torch.ones((2, 2))
 
             def forward(self, inp):
@@ -1805,7 +1802,7 @@ class OuterImpl(torch.nn.Module):
             impl: InnerInterface
 
             def __init__(self):
-                super(OuterImpl, self).__init__()
+                super().__init__()
                 self.impl = InnerImpl()
                 self.x = torch.ones((2, 2)) * 5
 
@@ -1819,7 +1816,7 @@ class WrapperModule(torch.nn.Module):
             impl: OuterInterface
 
             def __init__(self):
-                super(WrapperModule, self).__init__()
+                super().__init__()
                 self.impl = OuterImpl()
 
             def forward(self, inp):
@@ -1839,7 +1836,7 @@ def forward(self, inp):
     def test_freeze_non_interface_module_swap(self):
         class InnerModule(torch.nn.Module):
             def __init__(self, x):
-                super(InnerModule, self).__init__()
+                super().__init__()
                 self.x = x
 
             def forward(self, inp: torch.Tensor) -> torch.Tensor:
@@ -1914,7 +1911,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         self.expectEqual(expected, actual)
 
     def test_freeze_non_module_class_getattr(self):
-        class BoxCoder(object):
+        class BoxCoder:
             def __init__(self, bbox_xform_clip):
                 # type: (float) -> None
                 self.bbox_xform_clip = bbox_xform_clip
@@ -1928,7 +1925,7 @@ class MyModule(torch.nn.Module):
             }
 
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.box_coder = BoxCoder(50.)
 
             def forward(self, input):
@@ -1944,9 +1941,6 @@ def forward(self, input):
 
     def test_freeze_module_with_tupleoutput_submodule(self):
         class SubModule(nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 return (x + 1, x + 2)
 
@@ -2000,10 +1994,12 @@ def make_prediction(self, x):
 
 class TestFrozenOptimizations(JitTestCase):
     def setUp(self):
+        super().setUp()
         self.default_dtype = torch.get_default_dtype()
         torch.set_default_dtype(torch.double)
 
     def tearDown(self):
+        super().tearDown()
         torch.set_default_dtype(self.default_dtype)
 
     def test_conv_bn_folding(self):
@@ -2015,7 +2011,7 @@ def test_conv_bn_folding(self):
         for use_bias, modules, tracing, track_stats in product(conv_bias, module_pairs, use_tracing, bn_running_stats):
             class ConvBN(torch.nn.Module):
                 def __init__(self, in_channels, out_channels, **kwargs):
-                    super(ConvBN, self).__init__()
+                    super().__init__()
                     self.conv = modules[0](in_channels, out_channels, bias=use_bias, **kwargs)
                     self.bn = modules[1](out_channels, eps=0.001, track_running_stats=track_stats)
 
@@ -2060,7 +2056,7 @@ def forward(self, x):
     def test_conv_bn_folding_not_forward(self):
         class ConvBN(torch.nn.Module):
             def __init__(self, in_channels, out_channels, **kwargs):
-                super(ConvBN, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(in_channels, out_channels, bias=True, **kwargs)
                 self.bn = torch.nn.BatchNorm2d(out_channels, eps=0.001)
                 self.amt = 3.2
@@ -2092,7 +2088,7 @@ def test_conv_bn_folding_autocast_scenario_cuda(self):
 
         class ConvBN(torch.nn.Module):
             def __init__(self, in_channels, out_channels, **kwargs):
-                super(ConvBN, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(in_channels, out_channels, bias=False, dtype=torch.half, **kwargs)
                 self.bn = torch.nn.BatchNorm2d(out_channels, eps=0.001, dtype=torch.float)
 
@@ -2123,7 +2119,7 @@ class ConvOp(torch.nn.Module):
                 __constants__ = ['use_scalar']
 
                 def __init__(self, in_channels, out_channels, tensor=None, **kwargs):
-                    super(ConvOp, self).__init__()
+                    super().__init__()
                     self.conv = module(in_channels, out_channels, bias=use_bias, **kwargs)
                     self.conv2 = module(in_channels, out_channels, bias=use_bias, **kwargs)
                     self.use_scalar = scalar
@@ -2202,7 +2198,7 @@ def test_conv_mul_add_bn(self):
         class Conv_Mul_Add_Bn(nn.Module):
 
             def __init__(self, in_channels, out_channels, **kwargs):
-                super(Conv_Mul_Add_Bn, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(in_channels, out_channels, **kwargs)
                 self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
                 self.tensor1 = torch.tensor(2.2)
@@ -2231,7 +2227,7 @@ def test_linear_bn_folding(self):
         for modules, tracing, track_stats in product(module_pairs, use_tracing, bn_running_stats):
             class LinearBN(torch.nn.Module):
                 def __init__(self, in_features, out_features):
-                    super(LinearBN, self).__init__()
+                    super().__init__()
                     self.linear = modules[0](in_features, out_features)
                     self.bn = modules[1](out_features, eps=0.001, track_running_stats=track_stats)
 
@@ -2286,7 +2282,7 @@ def test_linear_bn_folding_autocast_scenario_cuda(self):
         for modules, tracing, track_stats in product(module_pairs, use_tracing, bn_running_stats):
             class LinearBN(torch.nn.Module):
                 def __init__(self, in_features, out_features):
-                    super(LinearBN, self).__init__()
+                    super().__init__()
                     self.linear = modules[0](in_features, out_features, bias=False, dtype=torch.half)
                     self.bn = modules[1](out_features, eps=0.001, dtype=torch.float)
 
@@ -2331,7 +2327,7 @@ def test_linear_concat(self):
         for w1_dim, w2_dim in out_dimms:
             class ModMultLinear(nn.Module):
                 def __init__(self, w1_dim, w2_dim):
-                    super(ModMultLinear, self).__init__()
+                    super().__init__()
                     self.w1 = nn.Parameter(torch.rand([w1_dim, 5]))
                     self.b1 = nn.Parameter(torch.rand([w1_dim]))
                     self.w2 = nn.Parameter(torch.rand([w2_dim, 5]))
@@ -2355,7 +2351,7 @@ def test_linear_concat_complex(self):
         """
         class ModMultLinear(nn.Module):
             def __init__(self):
-                super(ModMultLinear, self).__init__()
+                super().__init__()
                 w1_dim = 5
                 w2_dim = 10
                 self.w1 = nn.Parameter(torch.rand([w1_dim, 5]))
@@ -2384,7 +2380,7 @@ def test_linear_concat_different_input(self):
         # Freezing requires that the graph be a module
         class ModMultLinear(nn.Module):
             def __init__(self, w1_dim, w2_dim):
-                super(ModMultLinear, self).__init__()
+                super().__init__()
                 self.w1 = nn.Parameter(torch.rand([w1_dim, 5]))
                 self.b1 = nn.Parameter(torch.rand([w1_dim]))
                 self.w2 = nn.Parameter(torch.rand([w2_dim, 5]))
@@ -2404,7 +2400,7 @@ def forward(self, in_tensor1, in_tensor2):
     def test_linear_multiple_blocks(self):
         class ModMultLinear(nn.Module):
             def __init__(self, w1_dim, w2_dim):
-                super(ModMultLinear, self).__init__()
+                super().__init__()
                 self.w1 = nn.Parameter(torch.rand([w1_dim, 5]))
                 self.b1 = nn.Parameter(torch.rand([w1_dim]))
                 self.w2 = nn.Parameter(torch.rand([w2_dim, 5]))
@@ -2472,7 +2468,7 @@ def test_optimize_freeze_module(self):
     def test_freeze_remove_dropout(self):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.dropout = nn.Dropout(0.5)
 
             def forward(self, x):
@@ -2493,7 +2489,7 @@ def forward(self, x):
     def test_freeze_remove_feature_dropout(self):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.dropout = nn.Dropout2d(0.5)
 
             def forward(self, x):
@@ -2554,7 +2550,7 @@ def test_conv_to_mkldnn(self):
     def test_linear_transpose(self):
         class ModLinear(torch.nn.Module):
             def __init__(self):
-                super(ModLinear, self).__init__()
+                super().__init__()
                 self.bias = torch.nn.Parameter(torch.rand(30))
                 self.weight = torch.nn.Parameter(torch.rand([30, 20]))
 
@@ -2568,7 +2564,7 @@ def forward(self, x):
     def test_linear_non_constant_weight(self):
         class ModLinear(torch.nn.Module):
             def __init__(self):
-                super(ModLinear, self).__init__()
+                super().__init__()
                 self.bias = torch.nn.Parameter(torch.rand(30))
 
             def forward(self, x, weight):
@@ -2704,7 +2700,7 @@ def test_freeze_conv_relu_fusion(self):
             for use_bias, conv, add_z, tracing in product(conv_bias, conv_ops, add_z, use_tracing):
                 class Net(nn.Module):
                     def __init__(self, in_channels, out_channels, **kwargs):
-                        super(Net, self).__init__()
+                        super().__init__()
                         self.conv = conv(in_channels, out_channels, bias=use_bias, **kwargs)
                         self.relu = nn.ReLU(inplace=True)
                         self.add_z = add_z
@@ -2748,7 +2744,7 @@ def test_freeze_conv_relu_fusion_not_forward(self):
         with set_default_dtype(torch.float):
             class Net(nn.Module):
                 def __init__(self, in_channels, out_channels, **kwargs):
-                    super(Net, self).__init__()
+                    super().__init__()
                     self.conv = nn.Conv2d(in_channels, out_channels, bias=None, **kwargs)
                     self.relu = nn.ReLU(inplace=True)
 
@@ -2883,7 +2879,7 @@ def test_conv_hardswish(self):
         with set_default_dtype(torch.float):
             class Clamp(torch.nn.Module):
                 def __init__(self, min_val, max_val, **kwargs):
-                    super(Clamp, self).__init__()
+                    super().__init__()
                     self.min_val = min_val
                     self.max_val = max_val
 
@@ -2965,9 +2961,6 @@ def forward(self, x):
 
     def test_remove_detach(self):
         class Mod(nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 y = x.detach()
                 return y * y
@@ -2980,9 +2973,6 @@ def forward(self, x):
 
     def test_remove_detach_not_applied(self):
         class Mod(nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 y = x.detach()
                 return x is y
@@ -2996,10 +2986,12 @@ def forward(self, x):
 @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
 class TestMKLDNNReinplacing(JitTestCase):
     def setUp(self):
+        super().setUp()
         self.default_dtype = torch.get_default_dtype()
         torch.set_default_dtype(torch.float)
 
     def tearDown(self):
+        super().tearDown()
         torch.set_default_dtype(self.default_dtype)
 
     def getConv(self):
diff --git a/test/jit/test_graph_rewrite_passes.py b/test/jit/test_graph_rewrite_passes.py
index 95bb564da790..3ecdba6bb404 100644
--- a/test/jit/test_graph_rewrite_passes.py
+++ b/test/jit/test_graph_rewrite_passes.py
@@ -10,7 +10,7 @@ class TestGraphRewritePasses(JitTestCase):
     def test_fuse_linear(self):
         class FunctionalLinear(torch.nn.Module):
             def __init__(self, weight, bias):
-                super(FunctionalLinear, self).__init__()
+                super().__init__()
                 self.weight = weight
                 self.bias = bias
 
@@ -44,7 +44,7 @@ def forward(self, x):
         # check matmuls are not fused
         class Matmul(torch.nn.Module):
             def __init__(self, weight):
-                super(Matmul, self).__init__()
+                super().__init__()
                 self.weight = weight
 
             def forward(self, x):
diff --git a/test/jit/test_ignore_context_manager.py b/test/jit/test_ignore_context_manager.py
index c58c6c501c4f..4d0660e9eb82 100644
--- a/test/jit/test_ignore_context_manager.py
+++ b/test/jit/test_ignore_context_manager.py
@@ -21,9 +21,6 @@ class TestIgnoreContextManager(JitTestCase):
     @unittest.skipUnless(_IS_ASTUNPARSE_INSTALLED, "astunparse package is required")
     def test_with_ignore_context_manager_with_inp_out(self):
         class A(torch.nn.Module):
-            def __init__(self):
-                super(A, self).__init__()
-
             def forward(self):
                 a: int = 4
                 b: int = 5
@@ -40,9 +37,6 @@ def forward(self):
         self.assertEqual(s(), 20)
 
         class B(torch.nn.Module):
-            def __init__(self):
-                super(B, self).__init__()
-
             def forward(self):
                 a: int = 4
                 b: int = 5
@@ -57,9 +51,6 @@ def forward(self):
         self.assertEqual(s(), model())
 
         class C(torch.nn.Module):
-            def __init__(self):
-                super(C, self).__init__()
-
             def forward(self):
                 a: int = 4
                 b: int = 5
@@ -75,9 +66,6 @@ def forward(self):
     @unittest.skipUnless(_IS_ASTUNPARSE_INSTALLED, "astunparse package is required")
     def test_with_ignore_context_manager_with_just_inp(self):
         class A(torch.nn.Module):
-            def __init__(self):
-                super(A, self).__init__()
-
             def forward(self):
                 a: int = 4
                 b: int = 5
@@ -92,9 +80,6 @@ def forward(self):
     @unittest.skipUnless(_IS_ASTUNPARSE_INSTALLED, "astunparse package is required")
     def test_with_ignore_context_manager_with_just_out(self):
         class A(torch.nn.Module):
-            def __init__(self):
-                super(A, self).__init__()
-
             def forward(self):
                 with torch.jit._IgnoreContextManager(c="out:List[int]"):
                     c = [2 for i in range(7) if i > 2]
diff --git a/test/jit/test_jit_utils.py b/test/jit/test_jit_utils.py
index ceb46489f20d..c72aad3623a9 100644
--- a/test/jit/test_jit_utils.py
+++ b/test/jit/test_jit_utils.py
@@ -3,7 +3,6 @@
 import os
 import sys
 from textwrap import dedent
-import unittest
 
 import torch
 
@@ -30,7 +29,6 @@ def fn_positional_or_keyword_args_only(x, y):
             torch._jit_internal.get_callable_argument_names(fn_positional_or_keyword_args_only))
 
     # Tests that POSITIONAL_ONLY arguments are ignored.
-    @unittest.skipIf(sys.version_info < (3, 8), 'POSITIONAL_ONLY arguments are not supported before 3.8')
     def test_get_callable_argument_names_positional_only(self):
         code = dedent('''
             def fn_positional_only_arg(x, /, y):
@@ -39,7 +37,7 @@ def fn_positional_only_arg(x, /, y):
 
         fn_positional_only_arg = jit_utils._get_py3_code(code, 'fn_positional_only_arg')
         self.assertEqual(
-            [],
+            ["y"],
             torch._jit_internal.get_callable_argument_names(fn_positional_only_arg))
 
     # Tests that VAR_POSITIONAL arguments are ignored.
@@ -48,7 +46,7 @@ def test_get_callable_argument_names_var_positional(self):
         def fn_var_positional_arg(x, *arg):
             return x + arg[0]
         self.assertEqual(
-            [],
+            ["x"],
             torch._jit_internal.get_callable_argument_names(fn_var_positional_arg))
 
     # Tests that KEYWORD_ONLY arguments are ignored.
@@ -56,7 +54,7 @@ def test_get_callable_argument_names_keyword_only(self):
         def fn_keyword_only_arg(x, *, y):
             return x + y
         self.assertEqual(
-            [],
+            ["x"],
             torch._jit_internal.get_callable_argument_names(fn_keyword_only_arg))
 
     # Tests that VAR_KEYWORD arguments are ignored.
@@ -69,7 +67,6 @@ def fn_var_keyword_arg(**args):
 
     # Tests that a function signature containing various different types of
     # arguments are ignored.
-    @unittest.skipIf(sys.version_info < (3, 8), 'POSITIONAL_ONLY arguments are not supported before 3.8')
     def test_get_callable_argument_names_hybrid(self):
         code = dedent('''
             def fn_hybrid_args(x, /, y, *args, **kwargs):
@@ -77,7 +74,7 @@ def fn_hybrid_args(x, /, y, *args, **kwargs):
         ''')
         fn_hybrid_args = jit_utils._get_py3_code(code, 'fn_hybrid_args')
         self.assertEqual(
-            [],
+            ["y"],
             torch._jit_internal.get_callable_argument_names(fn_hybrid_args))
 
     def test_checkscriptassertraisesregex(self):
diff --git a/test/jit/test_list_dict.py b/test/jit/test_list_dict.py
index 686ab5236c52..f30d7f36ed7f 100644
--- a/test/jit/test_list_dict.py
+++ b/test/jit/test_list_dict.py
@@ -226,7 +226,7 @@ def foo2():
         self.checkScript(foo2, ())
 
         def foo3():
-            return list(list("abc"))
+            return list(list("abc"))  # noqa: C414
 
         self.checkScript(foo3, ())
         FileCheck().check_count("aten::list", 2, exactly=True).run(torch.jit.script(foo3).graph)
@@ -421,6 +421,7 @@ def func2():
 
         self.checkScript(func2, ())
 
+    @skipIfTorchDynamo("TorchDynamo fails to raise on this checkScriptRaisesRegex, because we trace it properly now")
     def test_list_ops(self):
         def test_equality():
             a = [1, 2, 3]
@@ -1515,7 +1516,7 @@ def specialized_list():
             li.append(3)
             return li
 
-        self.assertTrue(set(specialized_list()) == set([1, 2, 3]))
+        self.assertTrue(set(specialized_list()) == {1, 2, 3})
 
     @skipIfTorchDynamo("TorchDynamo fails for this test for unknown reason")
     def test_values(self):
@@ -1975,7 +1976,7 @@ class TheType(NamedTuple):
 
         class MyModule(types.ModuleType):
             def __init__(self):
-                super(MyModule, self).__init__('MyModule')
+                super().__init__('MyModule')
 
             def __getattr__(self, attr):
                 return TheType
@@ -2563,7 +2564,7 @@ def test_extend(self):
         """
         Test extend.
         """
-        class Iterable(object):
+        class Iterable:
             def __init__(self, limit: int):
                 self.limit = limit
                 self.value = 0
diff --git a/test/jit/test_misc.py b/test/jit/test_misc.py
index 2c2bf2ceb691..16e4d5661382 100644
--- a/test/jit/test_misc.py
+++ b/test/jit/test_misc.py
@@ -210,7 +210,7 @@ class M(nn.Module):
             sub : OneTwoModule
 
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.sub = BarMod()
 
             def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -221,11 +221,11 @@ def use_module_interface(mod_list: List[OneTwoModule], x: torch.Tensor):
 
         torch._C._enable_mobile_interface_call_export()
         scripted_M_mod = torch.jit.script(M())
-        self.assertTrue(set(['aten::mul.Scalar', 'aten::mul.Tensor', 'aten::reciprocal']).issubset(
+        self.assertTrue({'aten::mul.Scalar', 'aten::mul.Tensor', 'aten::reciprocal'}.issubset(
             set(torch.jit.export_opnames(scripted_M_mod))))
 
         scripted_M_mod.sub = torch.jit.script(FooMod())
-        self.assertTrue(set(['aten::add.Tensor', 'aten::mul.Scalar']).issubset(
+        self.assertTrue({'aten::add.Tensor', 'aten::mul.Scalar'}.issubset(
             set(torch.jit.export_opnames(scripted_M_mod))))
 
     def test_math_inf(self):
diff --git a/test/jit/test_models.py b/test/jit/test_models.py
index 2f67e27cb1d7..bc4b9d63cc79 100644
--- a/test/jit/test_models.py
+++ b/test/jit/test_models.py
@@ -31,7 +31,7 @@
 
 class MnistNet(nn.Module):
     def __init__(self):
-        super(MnistNet, self).__init__()
+        super().__init__()
         self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
         self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
         self.conv2_drop = nn.Dropout2d()
@@ -52,7 +52,7 @@ class TestModels(JitTestCase):
     def _test_dcgan_models(self, device, check_export_import=True):
         class DCGANGenerator(nn.Module):
             def __init__(self, nz, ngf, nc):
-                super(DCGANGenerator, self).__init__()
+                super().__init__()
                 self.main = nn.Sequential(
                     # input is Z, going into a convolution
                     nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False),
@@ -81,7 +81,7 @@ def forward(self, input):
 
         class DCGANDiscriminator(nn.Module):
             def __init__(self, nc, ndf):
-                super(DCGANDiscriminator, self).__init__()
+                super().__init__()
                 self.main = nn.Sequential(
                     # input is (nc) x 64 x 64
                     nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
@@ -126,7 +126,7 @@ def test_dcgan_models_cuda(self):
     def _test_neural_style(self, device, check_export_import=True):
         class TransformerNet(torch.nn.Module):
             def __init__(self):
-                super(TransformerNet, self).__init__()
+                super().__init__()
                 # Initial convolution layers
                 self.conv1 = ConvLayer(3, 32, kernel_size=9, stride=1)
                 self.in1 = torch.nn.InstanceNorm2d(32, affine=True)
@@ -165,7 +165,7 @@ def forward(self, X):
 
         class ConvLayer(torch.nn.Module):
             def __init__(self, in_channels, out_channels, kernel_size, stride):
-                super(ConvLayer, self).__init__()
+                super().__init__()
                 reflection_padding = kernel_size // 2
                 self.reflection_pad = torch.nn.ReflectionPad2d(reflection_padding)
                 self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)
@@ -182,7 +182,7 @@ class ResidualBlock(torch.nn.Module):
             """
 
             def __init__(self, channels):
-                super(ResidualBlock, self).__init__()
+                super().__init__()
                 self.conv1 = ConvLayer(channels, channels, kernel_size=3, stride=1)
                 self.in1 = torch.nn.InstanceNorm2d(channels, affine=True)
                 self.conv2 = ConvLayer(channels, channels, kernel_size=3, stride=1)
@@ -204,7 +204,7 @@ class UpsampleConvLayer(torch.nn.Module):
             """
 
             def __init__(self, in_channels, out_channels, kernel_size, stride, upsample=None):
-                super(UpsampleConvLayer, self).__init__()
+                super().__init__()
                 self.upsample = upsample
                 if upsample:
                     self.upsample_layer = torch.nn.Upsample(mode='nearest', scale_factor=upsample)
@@ -276,7 +276,7 @@ def train(iters):
     def _test_reinforcement_learning(self, device, test_export_import=True):
         class Policy(nn.Module):
             def __init__(self):
-                super(Policy, self).__init__()
+                super().__init__()
                 self.affine1 = nn.Linear(4, 128)
                 self.affine2 = nn.Linear(128, 2)
 
@@ -303,9 +303,9 @@ class Bottle(nn.Module):
 
             def forward(self, input):
                 if len(input.size()) <= 2:
-                    return super(Bottle, self).forward(input)
+                    return super().forward(input)
                 size = input.size()[:2]
-                out = super(Bottle, self).forward(input.view(size[0] * size[1], -1))
+                out = super().forward(input.view(size[0] * size[1], -1))
                 return out.view(size[0], size[1], -1)
 
         class Linear(Bottle, nn.Linear):
@@ -314,7 +314,7 @@ class Linear(Bottle, nn.Linear):
         class Encoder(nn.Module):
 
             def __init__(self, config):
-                super(Encoder, self).__init__()
+                super().__init__()
                 self.config = config
                 input_size = config.d_proj if config.projection else config.d_embed
                 dropout = 0 if config.n_layers == 1 else config.dp_ratio
@@ -332,7 +332,7 @@ def forward(self, inputs):
         class SNLIClassifier(nn.Module):
 
             def __init__(self, config):
-                super(SNLIClassifier, self).__init__()
+                super().__init__()
                 self.config = config
                 self.embed = nn.Embedding(config.n_embed, config.d_embed)
                 self.projection = Linear(config.d_embed, config.d_proj)
@@ -416,7 +416,7 @@ def _test_super_resolution(self, device, check_export_import=True):
         class Net(nn.Module):
 
             def __init__(self, upscale_factor):
-                super(Net, self).__init__()
+                super().__init__()
 
                 self.relu = nn.ReLU()
                 self.conv1 = nn.Conv2d(1, 64, (5, 5), (1, 1), (2, 2))
@@ -449,7 +449,7 @@ def test_super_resolution_cuda(self):
     def test_time_sequence_prediction(self):
         class Sequence(torch.jit.ScriptModule):
             def __init__(self):
-                super(Sequence, self).__init__()
+                super().__init__()
                 self.lstm1 = nn.LSTMCell(1, 51)
                 self.lstm2 = nn.LSTMCell(51, 51)
                 self.linear = nn.Linear(51, 1)
@@ -484,7 +484,7 @@ def forward(self, input):
 
         class Traced(nn.Module):
             def __init__(self):
-                super(Traced, self).__init__()
+                super().__init__()
                 self.seq = Sequence()
 
             def forward(self, input):
@@ -500,7 +500,7 @@ def forward(self, input):
     def _test_vae(self, device, check_export_import=True, quantized=False):
         class VAE(nn.Module):
             def __init__(self):
-                super(VAE, self).__init__()
+                super().__init__()
 
                 self.fc1 = nn.Linear(784, 400)
                 self.fc21 = nn.Linear(400, 20)
@@ -594,7 +594,7 @@ class BasicBlock(torch.jit.ScriptModule):
             __constants__ = ['downsample']
 
             def __init__(self, inplanes, planes, stride=1, downsample=None):
-                super(BasicBlock, self).__init__()
+                super().__init__()
                 self.conv1 = conv3x3(inplanes, planes, stride)
                 self.bn1 = nn.BatchNorm2d(planes)
                 self.relu = nn.ReLU(inplace=True)
@@ -626,7 +626,7 @@ class ResNet(torch.jit.ScriptModule):
             __constants__ = ['layer1', 'layer2', 'layer3', 'layer4']
 
             def __init__(self, block, layers, num_classes=1000):
-                super(ResNet, self).__init__()
+                super().__init__()
                 self.inplanes = 64
                 self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                                        bias=False)
diff --git a/test/jit/test_module_containers.py b/test/jit/test_module_containers.py
index f253c2453b3b..31b6030c97c2 100644
--- a/test/jit/test_module_containers.py
+++ b/test/jit/test_module_containers.py
@@ -21,22 +21,16 @@
 class TestModuleContainers(JitTestCase):
     def test_sequential_intermediary_types(self):
         class A(torch.nn.Module):
-            def __init__(self):
-                super(A, self).__init__()
-
             def forward(self, x):
                 return x + 3
 
         class B(torch.nn.Module):
-            def __init__(self):
-                super(B, self).__init__()
-
             def forward(self, x):
                 return {"1": x}
 
         class C(torch.nn.Module):
             def __init__(self):
-                super(C, self).__init__()
+                super().__init__()
                 self.foo = torch.nn.Sequential(A(), B())
 
             def forward(self, x):
@@ -59,7 +53,7 @@ def forward(self, x):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 modules = OrderedDict([
                     ('one', Inner()),
                     ('two', Inner2()),
@@ -90,9 +84,6 @@ def forward(self, x, skip_name):
                 return x, names
 
         class M2(M):
-            def __init__(self):
-                super(M2, self).__init__()
-
             def forward(self, x, skip_name):
                 # type: (Tensor, str)
                 names = torch.jit.annotate(List[str], [])
@@ -137,8 +128,7 @@ def forward(self, x):
 
         class CustomSequential(nn.Sequential):
             def __init__(self):
-                super(CustomSequential, self).__init__(
-                    nn.ReLU(), Inner())
+                super().__init__(nn.ReLU(), Inner())
 
             def forward(self, x):
                 x = x + 3
@@ -150,8 +140,7 @@ def forward(self, x):
 
         class CustomModuleList(nn.ModuleList):
             def __init__(self):
-                super(CustomModuleList, self).__init__(
-                    [nn.ReLU(), Inner()])
+                super().__init__([nn.ReLU(), Inner()])
 
             def forward(self, x):
                 x = x + 3
@@ -163,7 +152,7 @@ def forward(self, x):
 
         class CustomModuleDict(nn.ModuleDict):
             def __init__(self):
-                super(CustomModuleDict, self).__init__(
+                super().__init__(
                     OrderedDict([
                         ('one', Inner()),
                         ('two', nn.ReLU()),
@@ -183,7 +172,7 @@ def forward(self, x):
     def test_script_module_list_sequential(self):
         class M(torch.jit.ScriptModule):
             def __init__(self, mod_list):
-                super(M, self).__init__()
+                super().__init__()
                 self.mods = mod_list
 
             @torch.jit.script_method
@@ -199,7 +188,7 @@ def forward(self, v):
     def test_script_modulelist_index(self):
         class Sub(torch.nn.Module):
             def __init__(self, i):
-                super(Sub, self).__init__()
+                super().__init__()
                 self.i = i
 
             def forward(self, thing):
@@ -207,7 +196,7 @@ def forward(self, thing):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.mods = nn.ModuleList([Sub(i) for i in range(10)])
 
             def forward(self, v):
@@ -221,7 +210,7 @@ def forward(self, v):
 
         class MForward(torch.nn.Module):
             def __init__(self):
-                super(MForward, self).__init__()
+                super().__init__()
                 self.mods = nn.ModuleList([Sub(i) for i in range(10)])
 
             def forward(self, v):
@@ -233,9 +222,6 @@ def forward(self, v):
         self.checkModule(MForward(), (torch.tensor(1),))
 
         class M2(M):
-            def __init__(self):
-                super(M2, self).__init__()
-
             def forward(self, v):
                 return self.mods[-11].forward(v)
 
@@ -243,9 +229,6 @@ def forward(self, v):
             torch.jit.script(M2())
 
         class M3(M):
-            def __init__(self):
-                super(M3, self).__init__()
-
             def forward(self, v):
                 i = 3
                 return self.mods[i].forward(v)
@@ -255,8 +238,7 @@ def forward(self, v):
 
     def test_module_interface_special_methods(self):
         class CustomModuleInterface(torch.nn.Module):
-            def __init__(self):
-                super(CustomModuleInterface, self).__init__()
+            pass
 
         class CustomModuleList(CustomModuleInterface, torch.nn.ModuleList):
             def __init__(self, modules=None):
@@ -275,7 +257,7 @@ def __init__(self, modules=None):
 
         class MyModule(torch.nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 # work around aliasing issue for 'is' operator by scripting ReLU up front
                 self.submod = torch.jit.script(torch.nn.ReLU())
                 self.modulelist = CustomModuleList([self.submod])
@@ -321,8 +303,7 @@ def forward(self, inputs):
 
     def test_special_method_with_override(self):
         class CustomModuleInterface(torch.nn.Module):
-            def __init__(self):
-                super(CustomModuleInterface, self).__init__()
+            pass
 
         class CustomModuleList(CustomModuleInterface, torch.nn.ModuleList):
             def __init__(self, modules=None):
@@ -337,7 +318,7 @@ def __len__(self):
 
         class MyModule(torch.nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 # work around aliasing issue for 'is' operator by scripting ReLU up front
                 self.submod = torch.jit.script(torch.nn.ReLU())
                 self.modulelist = CustomModuleList([self.submod])
@@ -353,7 +334,7 @@ def forward(self, inputs):
     def test_moduledict_getitem(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.relu = torch.jit.script(torch.nn.ReLU())
                 self.tanh = torch.jit.script(torch.nn.Tanh())
                 self.moduledict = torch.nn.ModuleDict({"relu": self.relu,
@@ -370,7 +351,7 @@ def forward(self, input):
     def test_moduledict_keyerror(self):
         class BadModule(torch.nn.Module):
             def __init__(self):
-                super(BadModule, self).__init__()
+                super().__init__()
                 self.moduledict = torch.nn.ModuleDict({"foo": None,
                                                        "bar": None})
 
@@ -383,7 +364,7 @@ def forward(self, input):
 
         class AnotherBadModule(torch.nn.Module):
             def __init__(self):
-                super(AnotherBadModule, self).__init__()
+                super().__init__()
                 self.moduledict = torch.nn.ModuleDict({"foo": None,
                                                        "bar": None})
 
@@ -416,8 +397,7 @@ def forward(self):
 
     def test_empty_dict_override_contains(self):
         class CustomModuleInterface(torch.nn.Module):
-            def __init__(self):
-                super(CustomModuleInterface, self).__init__()
+            pass
 
         class CustomModuleDict(CustomModuleInterface, torch.nn.ModuleDict):
             def __init__(self, modules=None):
@@ -426,7 +406,7 @@ def __init__(self, modules=None):
 
         class MyModule(torch.nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 # work around aliasing issue for 'is' operator by scripting ReLU up front
                 self.submod = torch.jit.script(torch.nn.ReLU())
                 self.moduledict = CustomModuleDict()
diff --git a/test/jit/test_module_interface.py b/test/jit/test_module_interface.py
index 194e2abbbc2d..f9e9aea23542 100644
--- a/test/jit/test_module_interface.py
+++ b/test/jit/test_module_interface.py
@@ -18,9 +18,6 @@
                        "instead.")
 
 class OrigModule(nn.Module):
-    def __init__(self):
-        super(OrigModule, self).__init__()
-
     def one(self, inp1: Tensor, inp2: Tensor) -> Tensor:
         return inp1 + inp2 + 1
 
@@ -31,9 +28,6 @@ def forward(self, input: Tensor) -> Tensor:
         return input + self.one(input, input) + 1
 
 class NewModule(nn.Module):
-    def __init__(self):
-        super(NewModule, self).__init__()
-
     def one(self, inp1: Tensor, inp2: Tensor) -> Tensor:
         return inp1 * inp2 + 1
 
@@ -51,7 +45,7 @@ class TestNotModuleInterfaceCall(nn.Module):
             proxy_mod : ModuleInterface
 
             def __init__(self):
-                super(TestNotModuleInterfaceCall, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigModule()
 
             def forward(self, input: Tensor) -> Tensor:
@@ -73,7 +67,7 @@ def forward(self, x: Tensor) -> Tensor:
                 pass
 
         @torch.jit.interface
-        class OneTwoClass(object):
+        class OneTwoClass:
             def one(self, x: Tensor, y: Tensor) -> Tensor:
                 pass
 
@@ -144,7 +138,7 @@ class TestModule(nn.Module):
             proxy_mod : TestInterface
 
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigModule()
 
             def forward(self, input):
@@ -173,7 +167,7 @@ def as_module_interface(x: OneTwoModule) -> OneTwoModule:
             return x
 
         @torch.jit.script
-        class Foo(object):
+        class Foo:
             def one(self, x: Tensor, y: Tensor) -> Tensor:
                 return x + y
 
@@ -260,7 +254,7 @@ class TestModule(nn.Module):
             proxy_mod : ModuleInterface
 
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigModule()
 
             def forward(self, input: Tensor) -> Tensor:
@@ -288,9 +282,6 @@ def forward(self, input: Tensor) -> Tensor:
                 pass
 
         class NewModuleWrong(nn.Module):
-            def __init__(self):
-                super(NewModuleWrong, self).__init__()
-
             def forward(self, input: int) -> int:
                 return input + 1
 
@@ -298,7 +289,7 @@ class TestModule(nn.Module):
             proxy_mod : ModuleInterface
 
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigModule()
 
             def forward(self, input: Tensor) -> Tensor:
@@ -322,16 +313,13 @@ class TestModule(nn.Module):
             proxy_mod : ModuleInterface
 
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigModule()
 
             def forward(self, input: Tensor) -> Tensor:
                 return self.proxy_mod.forward(input)
 
         class NewModuleMethodNotLazyCompile(nn.Module):
-            def __init__(self):
-                super(NewModuleMethodNotLazyCompile, self).__init__()
-
             def one(self, inp1: Tensor, inp2: Tensor) -> Tensor:
                 return inp1 * inp2 + 1
 
@@ -345,9 +333,6 @@ def forward(self, input: Tensor) -> Tensor:
             scripted_mod.proxy_mod = torch.jit.script(NewModuleMethodNotLazyCompile())
 
         class NewModuleMethodManualExport(nn.Module):
-            def __init__(self):
-                super(NewModuleMethodManualExport, self).__init__()
-
             @torch.jit.export
             def one(self, inp1: Tensor, inp2: Tensor) -> Tensor:
                 return inp1 * inp2 + 1
@@ -363,7 +348,7 @@ def test_module_swap_no_module_interface(self):
         # test module swapping with no module interface
         class TestNoModuleInterface(nn.Module):
             def __init__(self):
-                super(TestNoModuleInterface, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigModule()
 
             def forward(self, input: Tensor) -> Tensor:
@@ -388,9 +373,6 @@ def forward(self, input: Tensor) -> Tensor:
                 pass
 
         class OrigScriptModule(torch.jit.ScriptModule):
-            def __init__(self):
-                super(OrigScriptModule, self).__init__()
-
             @torch.jit.script_method
             def one(self, inp1: Tensor, inp2: Tensor) -> Tensor:
                 return inp1 + inp2 + 1
@@ -400,9 +382,6 @@ def forward(self, input: Tensor) -> Tensor:
                 return input + self.one(input, input) + 1
 
         class NewScriptModule(torch.jit.ScriptModule):
-            def __init__(self):
-                super(NewScriptModule, self).__init__()
-
             @torch.jit.script_method
             def one(self, inp1: Tensor, inp2: Tensor) -> Tensor:
                 return inp1 * inp2 + 1
@@ -415,7 +394,7 @@ class TestNNModuleWithScriptModule(nn.Module):
             proxy_mod : ModuleInterface
 
             def __init__(self):
-                super(TestNNModuleWithScriptModule, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigScriptModule()
 
             def forward(self, input: Tensor) -> Tensor:
@@ -433,7 +412,7 @@ def forward(self, input: Tensor) -> Tensor:
     def test_freeze_module_with_interface(self):
         class SubModule(torch.nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.b = 20
 
             def forward(self, x):
@@ -441,7 +420,7 @@ def forward(self, x):
 
         class OrigMod(torch.nn.Module):
             def __init__(self):
-                super(OrigMod, self).__init__()
+                super().__init__()
                 self.a = 0
 
             def forward(self, x):
@@ -456,7 +435,7 @@ class TestModule(torch.nn.Module):
             proxy_mod : ModInterface
 
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigMod()
                 self.sub = SubModule()  # folded
 
@@ -476,7 +455,7 @@ def forward(self, x):
     def test_freeze_module_with_setattr_in_interface(self):
         class SubModule(torch.nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.b = 20
 
             def forward(self, x):
@@ -489,7 +468,7 @@ def getb(self, x):
 
         class OrigMod(torch.nn.Module):
             def __init__(self):
-                super(OrigMod, self).__init__()
+                super().__init__()
                 self.a = 0
 
             def forward(self, x):
@@ -504,7 +483,7 @@ class TestModule(torch.nn.Module):
             proxy_mod : ModInterface
 
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigMod()
                 self.sub = SubModule()
 
@@ -519,7 +498,7 @@ def forward(self, x):
     def test_freeze_module_with_inplace_mutation_in_interface(self):
         class SubModule(torch.nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.b = torch.tensor([1.5])
 
             def forward(self, x):
@@ -532,7 +511,7 @@ def getb(self, x):
 
         class OrigMod(torch.nn.Module):
             def __init__(self):
-                super(OrigMod, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([0.5])
 
             def forward(self, x):
@@ -547,7 +526,7 @@ class TestModule(torch.nn.Module):
             proxy_mod : ModInterface
 
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigMod()
                 self.sub = SubModule()
 
@@ -565,7 +544,7 @@ def forward(self, x):
     def test_freeze_module_with_mutated_interface(self):
         class SubModule(torch.nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.b = torch.tensor([1.5])
 
             def forward(self, x):
@@ -577,7 +556,7 @@ def getb(self, x):
 
         class OrigMod(torch.nn.Module):
             def __init__(self):
-                super(OrigMod, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([0.5])
 
             def forward(self, x):
@@ -592,7 +571,7 @@ class TestModule(torch.nn.Module):
             proxy_mod : ModInterface
 
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigMod()
                 self.sub = SubModule()
 
@@ -610,7 +589,7 @@ def forward(self, x):
     def test_freeze_module_with_interface_and_fork(self):
         class SubModule(torch.nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.b = torch.tensor([1.5])
 
             def forward(self, x):
@@ -619,7 +598,7 @@ def forward(self, x):
 
         class OrigMod(torch.nn.Module):
             def __init__(self):
-                super(OrigMod, self).__init__()
+                super().__init__()
                 self.a = torch.tensor([0.5])
 
             def forward(self, x):
@@ -634,7 +613,7 @@ class TestModule(torch.nn.Module):
             proxy_mod : ModInterface
 
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigMod()
                 self.sub = SubModule()
 
@@ -645,7 +624,7 @@ def forward(self, x):
 
         class MainModule(torch.nn.Module):
             def __init__(self):
-                super(MainModule, self).__init__()
+                super().__init__()
                 self.test = TestModule()
 
             def forward(self, x):
@@ -668,7 +647,7 @@ class TestModule(nn.Module):
             proxy_mod : ModuleInterface
 
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigModule()
 
             def forward(self, input):
diff --git a/test/jit/test_optimize_for_mobile_preserve_debug_info.py b/test/jit/test_optimize_for_mobile_preserve_debug_info.py
index a6527a3ffdff..78d3fae59371 100644
--- a/test/jit/test_optimize_for_mobile_preserve_debug_info.py
+++ b/test/jit/test_optimize_for_mobile_preserve_debug_info.py
@@ -40,7 +40,7 @@ def check_replacement(
     def test_replace_conv1d_with_conv2d(self):
         class TestConv1d(torch.nn.Module):
             def __init__(self, weight, bias):
-                super(TestConv1d, self).__init__()
+                super().__init__()
                 self.weight = weight
                 self.bias = bias
 
@@ -167,7 +167,7 @@ def __init__(
                 conv2d_weight,
                 conv2d_bias,
             ):
-                super(TestFuseActivationLinearConv2d, self).__init__()
+                super().__init__()
                 self.linear_weight = linear_weight
                 self.linear_bias = linear_bias
                 self.conv2d_weight = conv2d_weight
diff --git a/test/jit/test_pdt.py b/test/jit/test_pdt.py
index baab4c8dc444..dd8c00685114 100644
--- a/test/jit/test_pdt.py
+++ b/test/jit/test_pdt.py
@@ -28,9 +28,6 @@ class TestPDT(JitTestCase):
     """
     def test_nn_module(self):
         class TestPDTModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x) -> Any:
                 if isinstance(x, int):
                     return x + 1
@@ -49,9 +46,6 @@ def forward(self, x) -> Any:
 
     def test_nested_nn_module_class(self):
         class NestedPDTInner(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 if isinstance(x, int):
                     return x * 10
@@ -76,9 +70,6 @@ def forward(self, x):
 
     def test_nested_nn_module_class_with_args(self):
         class NestedModulePDTInner(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 if isinstance(x, int):
                     return x * 10 + y
@@ -105,9 +96,6 @@ def forward(self, x):
 
     def test_nested_function_in_forward(self):
         class NestedFunctionInForward(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 return self.fun(x) + 10
 
@@ -127,9 +115,6 @@ def fun(self, x):
 
     def test_nn_module_with_export_function(self):
         class TestModelWithExport(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             @torch.jit.export
             def fn(self, x, y) -> Any:
                 assert not (isinstance(x, bool) and isinstance(y, bool))
diff --git a/test/jit/test_peephole.py b/test/jit/test_peephole.py
index 12f7a1fc709e..e79fbf650479 100644
--- a/test/jit/test_peephole.py
+++ b/test/jit/test_peephole.py
@@ -194,7 +194,7 @@ def test_conv_dim_folding(self):
         for mod in modules:
             class ConvDim(torch.nn.Module):
                 def __init__(self):
-                    super(ConvDim, self).__init__()
+                    super().__init__()
                     self.conv = mod(3, 32, kernel_size=3, stride=2, bias=False)
 
                 def forward(self, x):
@@ -208,7 +208,7 @@ def forward(self, x):
 
             class ConvDimMutate(torch.nn.Module):
                 def __init__(self):
-                    super(ConvDimMutate, self).__init__()
+                    super().__init__()
                     self.conv = mod(3, 32, kernel_size=3, stride=2, bias=False)
 
                 def forward(self, x):
diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py
index 81df055f55b7..5389751a5bec 100644
--- a/test/jit/test_profiler.py
+++ b/test/jit/test_profiler.py
@@ -4,6 +4,7 @@
 import sys
 
 import torch
+from torch.testing._internal.common_utils import skipIfTorchDynamo
 
 # Make the helper files in test/ importable
 pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
@@ -15,6 +16,7 @@
                        "\tpython test/test_jit.py TESTNAME\n\n"
                        "instead.")
 
+@skipIfTorchDynamo()
 class TestProfiler(JitTestCase):
     def setUp(self):
         self.prev_exec = torch._C._jit_set_profiling_executor(True)
diff --git a/test/jit/test_recursive_script.py b/test/jit/test_recursive_script.py
index db073d327472..fe2a20278cc8 100644
--- a/test/jit/test_recursive_script.py
+++ b/test/jit/test_recursive_script.py
@@ -27,7 +27,7 @@ class TestRecursiveScript(JitTestCase):
     def test_inferred_nonetype(self):
         class M(nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.x = None
 
             def forward(self):
@@ -47,7 +47,7 @@ def fn2(x):
 
         class M(torch.nn.Module):
             def __init__(self, fn):
-                super(M, self).__init__()
+                super().__init__()
                 self.fn = fn
 
             def forward(self, x):
@@ -62,7 +62,7 @@ def forward(self, x):
     def test_python_function_attribute(self):
         class M(torch.nn.Module):
             def __init__(self, fn):
-                super(M, self).__init__()
+                super().__init__()
                 self.fn = fn
 
             def forward(self, x):
@@ -78,7 +78,7 @@ def fn(x):
 
         class M(torch.nn.Module):
             def __init__(self, fn):
-                super(M, self).__init__()
+                super().__init__()
                 self.fn = fn
 
             def forward(self, x):
@@ -128,7 +128,7 @@ def forward(self):
     def test_module_name(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.x = 2
 
             def forward(self, t):
@@ -186,30 +186,26 @@ def forward(self, t):
 
         self.checkModule(M2(), (torch.randn(2, 2),))
 
-        if sys.version_info[:2] >= (3, 8):
-            class M3(torch.nn.Module):
-                x : typing.Final[int]
+        class M3(torch.nn.Module):
+            x : typing.Final[int]
 
-                def __init__(self):
-                    super().__init__()
-                    self.x = 2
+            def __init__(self):
+                super().__init__()
+                self.x = 2
 
-                def forward(self, t):
-                    return t + self.x
+            def forward(self, t):
+                return t + self.x
 
-            self.checkModule(M3(), (torch.randn(2, 2),))
+        self.checkModule(M3(), (torch.randn(2, 2),))
 
     def test_ignore_class(self):
         @torch.jit.ignore
-        class MyScriptClass(object):
+        class MyScriptClass:
             def unscriptable(self):
                 return "a" + 200
 
 
         class TestModule(torch.nn.Module):
-            def __init__(self):
-                super(TestModule, self).__init__()
-
             def forward(self, x):
                 return MyScriptClass()
 
@@ -234,7 +230,7 @@ def forward(self, x):
 
         class MyModule(nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(10, 10, 3)
                 self.lin = nn.Linear(10, 10)
                 self.sub = Submodule()
@@ -271,7 +267,7 @@ def test_module_dir(mod):
 
         class MyModule(nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(10, 10, 3)
                 self.lin = nn.Linear(10, 10)
 
@@ -291,7 +287,7 @@ def test_class_compile(self):
         def other_fn(a: int, b: Tensor) -> Tensor:
             return a * b
 
-        class B(object):
+        class B:
             def __init__(self, x):
                 self.x = 2
 
@@ -300,9 +296,6 @@ def helper(self, a):
 
 
         class N(torch.nn.Module):
-            def __init__(self):
-                super(N, self).__init__()
-
             def forward(self, x):
                 b = B(x)
                 return b.helper(x)
@@ -343,15 +336,12 @@ def b(x):
             return c(x)
 
         class Submodule(torch.nn.Module):
-            def __init__(self):
-                super(Submodule, self).__init__()
-
             def forward(self, x):
                 return b(x)
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.submodule = Submodule()
 
             def some_method(self, y):
@@ -385,7 +375,7 @@ def a_script_fn(d, e, f):
         self.assertEqual(a_script_fn(t, t, t), t + t + t)
 
     def test_error_stack_class(self):
-        class X(object):
+        class X:
             def bad_fn(self):
                 import pdb  # noqa: F401
 
@@ -401,7 +391,7 @@ def fn(x) -> X:
             checker.run(str(e))
 
     def test_error_stack_annotation(self):
-        class X(object):
+        class X:
             def bad_fn(self):
                 import pdb  # noqa: F401
 
@@ -422,7 +412,7 @@ class Other(torch.nn.Module):
             __constants__ = ['x']
 
             def __init__(self, x):
-                super(Other, self).__init__()
+                super().__init__()
                 self.x = x
                 self.param = torch.nn.Parameter(torch.ones(2, 2))
 
@@ -437,7 +427,7 @@ def forward(self, t):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.other = Other(200)
 
             def forward(self, t):
@@ -450,7 +440,7 @@ class Other(torch.nn.Module):
             __constants__ = ['x']
 
             def __init__(self, x):
-                super(Other, self).__init__()
+                super().__init__()
                 self.x = x
                 self.param = torch.nn.Parameter(torch.ones(2, 2))
 
@@ -464,7 +454,7 @@ def forward(self, t):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.other = Other(200)
 
             def forward(self, t):
@@ -479,7 +469,7 @@ def forward(self, x):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.sequential = nn.Sequential(
                     Inner(),
                     Inner(),
@@ -514,7 +504,7 @@ def __prepare_scriptable__(self):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 shared = SeluButReluWhenScripted()
                 self.sequential = nn.Sequential(
                     SeluButReluWhenScripted(),
@@ -550,18 +540,18 @@ def test_prepare_scriptable_cycle(self):
 
     def test_attributes(self):
         @torch.jit.script
-        class Inner2(object):
+        class Inner2:
             def __init__(self):
                 self.b = "a string"
 
         @torch.jit.script
-        class Foo(object):
+        class Foo:
             def __init__(self):
                 self.a = 4
                 self.inner = Inner2()
 
         @torch.jit.script
-        class SFoo(object):
+        class SFoo:
             def __init__(self):
                 self.a = 4
                 self.inner = Inner2()
@@ -604,9 +594,6 @@ class M(torch.nn.Module):
             # my_empty_dict : Dict[str, int]
             # my_none : Optional[int]
 
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, x):
                 return (
                     self.my_dict,
@@ -654,7 +641,7 @@ def forward(self, x):
     def test_function_attribute_in_submodule(self):
         class N(nn.Module):
             def __init__(self, norm):
-                super(N, self).__init__()
+                super().__init__()
                 self.activation = torch.nn.functional.relu
                 self.norm = norm
 
@@ -665,7 +652,7 @@ def forward(self, src):
 
         class M(nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 encoder_norm = nn.ReLU()
                 self.encoder = N(encoder_norm)
 
@@ -682,7 +669,7 @@ def forward(self, x):
 
         class Model(nn.Module):
             def __init__(self, dummies):
-                super(Model, self).__init__()
+                super().__init__()
                 self._dummies = dummies
 
             def forward(self, x):
@@ -709,7 +696,7 @@ def forward(self, x):
 
         class ContainsLoaded(torch.nn.Module):
             def __init__(self):
-                super(ContainsLoaded, self).__init__()
+                super().__init__()
                 self.encoder = dummy
 
             def forward(self, input):
@@ -720,7 +707,7 @@ def forward(self, input):
     def test_optional_module(self):
         class Dummy(nn.Module):
             def __init__(self):
-                super(Dummy, self).__init__()
+                super().__init__()
                 self.foo = nn.Linear(2, 2)
 
             def forward(self, x):
diff --git a/test/jit/test_remove_mutation.py b/test/jit/test_remove_mutation.py
index 4c393a7f1a0f..2f7559f84e1d 100644
--- a/test/jit/test_remove_mutation.py
+++ b/test/jit/test_remove_mutation.py
@@ -268,7 +268,7 @@ def test_common_pytorch_list_ops(self):
         for op in ["cat", "stack", "vstack", "hstack", "dstack"]:
             class OpMod(torch.nn.Module):
                 def __init__(self, op):
-                    super(OpMod, self).__init__()
+                    super().__init__()
                     self.op = torch_op
 
                 def forward(self):
diff --git a/test/jit/test_save_load.py b/test/jit/test_save_load.py
index 16babb7c7a25..a21c3dc39339 100644
--- a/test/jit/test_save_load.py
+++ b/test/jit/test_save_load.py
@@ -35,7 +35,7 @@ def test_different_modules(self):
 
         class Foo(torch.nn.Module):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.foo = torch.nn.Linear(2, 2)
                 self.bar = torch.nn.Linear(2, 2)
 
@@ -53,7 +53,7 @@ def forward(self, x):
 
         class Foo(torch.nn.Module):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.foo = torch.nn.Linear(2, 2)
 
             def forward(self, x):
@@ -152,12 +152,12 @@ def test_different_interfaces(self):
         """
 
         @torch.jit.interface
-        class MyInterface(object):
+        class MyInterface:
             def bar(self, x: Tensor) -> Tensor:
                 pass
 
         @torch.jit.script
-        class ImplementInterface(object):
+        class ImplementInterface:
             def __init__(self):
                 pass
 
@@ -182,12 +182,12 @@ def forward(self, x):
         clear_class_registry()
 
         @torch.jit.interface
-        class MyInterface(object):
+        class MyInterface:
             def not_bar(self, x: Tensor) -> Tensor:
                 pass
 
         @torch.jit.script  # noqa: F811
-        class ImplementInterface(object):  # noqa: F811
+        class ImplementInterface:  # noqa: F811
             def __init__(self):
                 pass
 
@@ -238,12 +238,12 @@ class MyCoolNamedTuple(NamedTuple):
             a: int
 
         @torch.jit.interface
-        class MyInterface(object):
+        class MyInterface:
             def bar(self, x: Tensor) -> Tensor:
                 pass
 
         @torch.jit.script
-        class ImplementInterface(object):
+        class ImplementInterface:
             def __init__(self):
                 pass
 
@@ -278,12 +278,12 @@ def forward(self, x):
         clear_class_registry()
 
         @torch.jit.interface
-        class MyInterface(object):
+        class MyInterface:
             def not_bar(self, x: Tensor) -> Tensor:
                 pass
 
         @torch.jit.script  # noqa: F811
-        class ImplementInterface(object):  # noqa: F811
+        class ImplementInterface:  # noqa: F811
             def __init__(self):
                 pass
 
@@ -457,8 +457,7 @@ def test_save_load_params_buffers_submodules(self):
         """
 
         class Submodule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
+            pass
 
         class TestModule(torch.nn.Module):
             def __init__(self):
@@ -508,7 +507,7 @@ def test_save_load_meta_tensors(self):
 
         class Foo(torch.nn.Module):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.foo = torch.nn.Linear(2, 3, device="meta")
                 self.bar = torch.nn.Linear(3, 4)
                 self.register_buffer("buffer", torch.randn(4, device="meta"))
@@ -525,8 +524,8 @@ def forward(self, x):
             len(list(m.named_modules())), len(list(m_loaded.named_modules()))
         )
         self.assertEqual(
-            set(name for name, _ in m.named_modules()),
-            set(name for name, _ in m_loaded.named_modules()),
+            {name for name, _ in m.named_modules()},
+            {name for name, _ in m_loaded.named_modules()},
         )
         # Check parameters.
         m_params = dict(m.named_parameters())
@@ -550,6 +549,105 @@ def forward(self, x):
         self.assertTrue(m_buffers["buffer"].is_meta)
         self.assertTrue(m_loaded_buffers["buffer"].is_meta)
 
+    def test_save_load_with_saved_traced_inputs(self):
+        """
+        Check that saving and loading with traced inputs works as expected
+        """
+
+        class Module(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                return torch.ones(1)
+
+        def get_loaded_inputs(inputs):
+            traced_module = torch.jit.trace(module, input1)
+            traced_inputs = list(traced_module.graph.inputs())
+            with TemporaryFileName() as fname:
+                path = pathlib.Path(fname)
+                traced_module.save(path)
+                print(traced_module.graph)
+                loaded_module = torch.jit.load(path, _restore_shapes=True)
+                print(loaded_module.graph)
+                return traced_inputs, list(loaded_module.graph.inputs())
+
+        module = Module()
+        input_tensor = torch.rand(1, 3, 24, 24)
+        # Validate that with no input specified the traced inputs are stored
+        traced_module = torch.jit.trace(module, input_tensor)
+        traced_inputs = list(traced_module.graph.inputs())
+        self.assertEquals(traced_module._c._retrieve_traced_inputs()['forward'], [input_tensor])
+        with TemporaryFileName() as fname:
+            path = pathlib.Path(fname)
+            traced_module.save(path)
+            loaded_module = torch.jit.load(path, _restore_shapes=True)
+            loaded_inputs = list(loaded_module.graph.inputs())
+            self.assertEqual(traced_inputs[1].type(), loaded_inputs[1].type())
+            self.assertEqual(traced_inputs[1].type().sizes(), loaded_inputs[1].type().sizes())
+            # Validate that if no shapes are requested previous functionality remains
+            loaded_module = torch.jit.load(path)
+            loaded_inputs = list(loaded_module.graph.inputs())
+            self.assertEqual(loaded_inputs[1].type().sizes(), None)
+
+        # Validate that inputs aren't saved when requested not to
+        traced_module = torch.jit.trace(module, input_tensor, _store_inputs=False)
+        traced_inputs = list(traced_module.graph.inputs())
+        self.assertEquals(len(traced_module._c._retrieve_traced_inputs()), 0)
+
+        with TemporaryFileName() as fname:
+            path = pathlib.Path(fname)
+            traced_module.save(path)
+            loaded_module = torch.jit.load(path, _restore_shapes=True)
+            loaded_inputs = list(loaded_module.graph.inputs())
+            self.assertEqual(loaded_inputs[1].type().sizes(), None)
+            # Validate that if no shapes are requested previous functionality remains
+            loaded_module = torch.jit.load(path)
+            loaded_inputs = list(loaded_module.graph.inputs())
+            self.assertEqual(loaded_inputs[1].type().sizes(), None)
+
+        # Validate that complex inputs work
+        # Testing dict of list with empty tensors
+        input1 = {
+            "1000": (
+                torch.tensor([0]),
+                torch.tensor([], dtype=torch.int64),
+                torch.tensor([])
+            )
+        }
+        traced_inputs, loaded_inputs = get_loaded_inputs(input1)
+        self.assertEqual(traced_inputs[1].type(), loaded_inputs[1].type())
+
+        # Testing dict of list
+        input2 = {
+            "1000": (
+                torch.tensor([0]),
+                torch.tensor([1500000, 1500004], dtype=torch.int64),
+                torch.tensor([2.0, 3.0])
+            )
+        }
+        traced_inputs, loaded_inputs = get_loaded_inputs(input2)
+        self.assertEqual(traced_inputs[1].type(), loaded_inputs[1].type())
+
+        # Testing list
+        input3 = [torch.tensor([0]),
+                  torch.tensor([1500000, 1500004], dtype=torch.int64),
+                  torch.tensor([2.0, 3.0])]
+
+        traced_inputs, loaded_inputs = get_loaded_inputs(input3)
+        self.assertEqual(traced_inputs[1].type(), loaded_inputs[1].type())
+
+        # Testing list of dict of list
+        input4 = [{
+            "1000": (
+                torch.tensor([0]),
+                torch.tensor([1500000, 1500004], dtype=torch.int64),
+                torch.tensor([2.0, 3.0])
+            )
+        }]
+
+        traced_inputs, loaded_inputs = get_loaded_inputs(input4)
+        self.assertEqual(traced_inputs[1].type(), loaded_inputs[1].type())
 
 def script_module_to_buffer(script_module):
     module_buffer = io.BytesIO(
@@ -571,7 +669,7 @@ def test_different_modules(self):
 
         class Foo(torch.nn.Module):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.foo = torch.nn.Linear(2, 2)
                 self.bar = torch.nn.Linear(2, 2)
 
@@ -587,7 +685,7 @@ def forward(self, x):
 
         class Foo(torch.nn.Module):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.foo = torch.nn.Linear(2, 2)
 
             def forward(self, x):
@@ -683,12 +781,12 @@ def test_different_interfaces(self):
         """
 
         @torch.jit.interface
-        class MyInterface(object):
+        class MyInterface:
             def bar(self, x: Tensor) -> Tensor:
                 pass
 
         @torch.jit.script
-        class ImplementInterface(object):
+        class ImplementInterface:
             def __init__(self):
                 pass
 
@@ -710,12 +808,12 @@ def forward(self, x):
         clear_class_registry()
 
         @torch.jit.interface
-        class MyInterface(object):
+        class MyInterface:
             def not_bar(self, x: Tensor) -> Tensor:
                 pass
 
         @torch.jit.script  # noqa: F811
-        class ImplementInterface(object):  # noqa: F811
+        class ImplementInterface:  # noqa: F811
             def __init__(self):
                 pass
 
@@ -766,12 +864,12 @@ class MyCoolNamedTuple(NamedTuple):
             a: int
 
         @torch.jit.interface
-        class MyInterface(object):
+        class MyInterface:
             def bar(self, x: Tensor) -> Tensor:
                 pass
 
         @torch.jit.script
-        class ImplementInterface(object):
+        class ImplementInterface:
             def __init__(self):
                 pass
 
@@ -804,12 +902,12 @@ def forward(self, x):
         clear_class_registry()
 
         @torch.jit.interface
-        class MyInterface(object):
+        class MyInterface:
             def not_bar(self, x: Tensor) -> Tensor:
                 pass
 
         @torch.jit.script  # noqa: F811
-        class ImplementInterface(object):  # noqa: F811
+        class ImplementInterface:  # noqa: F811
             def __init__(self):
                 pass
 
@@ -921,7 +1019,7 @@ def forward(self) -> Optional[FooTuple]:
     def test_module_info_flatbuffer(self):
         class Foo(torch.nn.Module):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.foo = torch.nn.Linear(2, 2)
                 self.bar = torch.nn.Linear(2, 2)
 
@@ -952,8 +1050,7 @@ def test_save_load_params_buffers_submodules(self):
         """
 
         class Submodule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
+            pass
 
         class TestModule(torch.nn.Module):
             def __init__(self):
@@ -1002,9 +1099,6 @@ def test_save_load_with_extra_files(self):
         """
 
         class Module(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x: Tensor):
                 return x
 
diff --git a/test/jit/test_save_load_for_op_version.py b/test/jit/test_save_load_for_op_version.py
index b5e38b37d3eb..328f65684a70 100644
--- a/test/jit/test_save_load_for_op_version.py
+++ b/test/jit/test_save_load_for_op_version.py
@@ -75,9 +75,6 @@ def historic_div(self, other):
 
         # Tensor x Tensor
         class MyModule(torch.nn.Module):
-            def __init__(self):
-                super(MyModule, self).__init__()
-
             def forward(self, a, b):
                 result_0 = a / b
                 result_1 = torch.div(a, b)
@@ -123,9 +120,6 @@ def historic_div_(self, other):
             return self.divide_(other, rounding_mode='trunc')
 
         class MyModule(torch.nn.Module):
-            def __init__(self):
-                super(MyModule, self).__init__()
-
             def forward(self, a, b):
                 a /= b
                 return a
@@ -169,9 +163,6 @@ def historic_div_out(self, other, out):
             return torch.divide(self, other, out=out, rounding_mode='trunc')
 
         class MyModule(torch.nn.Module):
-            def __init__(self):
-                super(MyModule, self).__init__()
-
             def forward(self, a, b, out):
                 return a.div(b, out=out)
 
@@ -220,16 +211,10 @@ def historic_div_scalar_int(self, other: int):
             return torch.divide(self, other, rounding_mode='trunc')
 
         class MyModuleFloat(torch.nn.Module):
-            def __init__(self):
-                super(MyModuleFloat, self).__init__()
-
             def forward(self, a, b: float):
                 return a / b
 
         class MyModuleInt(torch.nn.Module):
-            def __init__(self):
-                super(MyModuleInt, self).__init__()
-
             def forward(self, a, b: int):
                 return a / b
 
@@ -279,16 +264,10 @@ def historic_div_scalar_int_reciprocal(self, other: int):
             return torch.divide(other, self, rounding_mode='trunc')
 
         class MyModuleFloat(torch.nn.Module):
-            def __init__(self):
-                super(MyModuleFloat, self).__init__()
-
             def forward(self, a, b: float):
                 return b / a
 
         class MyModuleInt(torch.nn.Module):
-            def __init__(self):
-                super(MyModuleInt, self).__init__()
-
             def forward(self, a, b: int):
                 return b / a
 
@@ -348,17 +327,11 @@ def historic_div_scalar_int_inplace(self, other: int):
             return self.divide_(other, rounding_mode='trunc')
 
         class MyModuleFloat(torch.nn.Module):
-            def __init__(self):
-                super(MyModuleFloat, self).__init__()
-
             def forward(self, a, b: float):
                 a /= b
                 return a
 
         class MyModuleInt(torch.nn.Module):
-            def __init__(self):
-                super(MyModuleInt, self).__init__()
-
             def forward(self, a, b: int):
                 a /= b
                 return a
@@ -396,9 +369,6 @@ def _helper(m, fn):
     #   so this test verifies the behavior is unchanged.
     def test_versioned_div_scalar_scalar(self):
         class MyModule(torch.nn.Module):
-            def __init__(self):
-                super(MyModule, self).__init__()
-
             def forward(self, a: float, b: int, c: float, d: int):
                 result_0 = a / b
                 result_1 = a / c
@@ -425,9 +395,6 @@ def _helper(m, fn):
 
     def test_versioned_linspace(self):
         class Module(torch.nn.Module):
-            def __init__(self):
-                super(Module, self).__init__()
-
             def forward(self, a: Union[int, float, complex], b: Union[int, float, complex]):
                 c = torch.linspace(a, b, steps=5)
                 d = torch.linspace(a, b, steps=100)
@@ -455,9 +422,6 @@ def forward(self, a: Union[int, float, complex], b: Union[int, float, complex]):
 
     def test_versioned_linspace_out(self):
         class Module(torch.nn.Module):
-            def __init__(self):
-                super(Module, self).__init__()
-
             def forward(self, a: Union[int, float, complex], b: Union[int, float, complex], out: torch.Tensor):
                 return torch.linspace(a, b, steps=100, out=out)
 
@@ -484,9 +448,6 @@ def forward(self, a: Union[int, float, complex], b: Union[int, float, complex],
 
     def test_versioned_logspace(self):
         class Module(torch.nn.Module):
-            def __init__(self):
-                super(Module, self).__init__()
-
             def forward(self, a: Union[int, float, complex], b: Union[int, float, complex]):
                 c = torch.logspace(a, b, steps=5)
                 d = torch.logspace(a, b, steps=100)
@@ -514,9 +475,6 @@ def forward(self, a: Union[int, float, complex], b: Union[int, float, complex]):
 
     def test_versioned_logspace_out(self):
         class Module(torch.nn.Module):
-            def __init__(self):
-                super(Module, self).__init__()
-
             def forward(self, a: Union[int, float, complex], b: Union[int, float, complex], out: torch.Tensor):
                 return torch.logspace(a, b, steps=100, out=out)
 
diff --git a/test/jit/test_script_profile.py b/test/jit/test_script_profile.py
index f350a49adf2d..438994b6a8f3 100644
--- a/test/jit/test_script_profile.py
+++ b/test/jit/test_script_profile.py
@@ -18,7 +18,7 @@
 
 class Sequence(nn.Module):
     def __init__(self):
-        super(Sequence, self).__init__()
+        super().__init__()
         self.lstm1 = nn.LSTMCell(1, 51)
         self.lstm2 = nn.LSTMCell(51, 51)
         self.linear = nn.Linear(51, 1)
diff --git a/test/jit/test_scriptmod_ann.py b/test/jit/test_scriptmod_ann.py
index 5d256bac4937..47e010e6122e 100644
--- a/test/jit/test_scriptmod_ann.py
+++ b/test/jit/test_scriptmod_ann.py
@@ -54,7 +54,7 @@ def forward(self, x: List[int]):
     def test_annotated_empty_tensor(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.x: torch.Tensor = torch.empty(0)
 
             def forward(self, x: torch.Tensor):
@@ -68,7 +68,7 @@ def forward(self, x: torch.Tensor):
     def test_annotated_with_jit_attribute(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.x = torch.jit.Attribute([], List[int])
 
             def forward(self, x: List[int]):
diff --git a/test/jit/test_slice.py b/test/jit/test_slice.py
index 5878f6c43bf2..ceb3c3b48e89 100644
--- a/test/jit/test_slice.py
+++ b/test/jit/test_slice.py
@@ -133,7 +133,7 @@ def tuple_slice(a):
         self.assertEqual(scripted_fn(torch.tensor(1)), (2, 3))
         tuple_graph = scripted_fn.graph
         slices = tuple_graph.findAllNodes("prim::TupleConstruct")
-        num_outputs = set(len(x.output().type().elements()) for x in slices)
+        num_outputs = {len(x.output().type().elements()) for x in slices}
         # there should be only one tupleSlice with length of 2
         self.assertTrue(num_outputs == {2})
         self.run_pass('lower_all_tuples', tuple_graph)
diff --git a/test/jit/test_symbolic_shape_analysis.py b/test/jit/test_symbolic_shape_analysis.py
index 3e3cb3ffed73..261b2c68a8d4 100644
--- a/test/jit/test_symbolic_shape_analysis.py
+++ b/test/jit/test_symbolic_shape_analysis.py
@@ -20,6 +20,7 @@
 # XXX: still in prototype
 class TestSymbolicShapeAnalysis(JitTestCase):
     def setUp(self):
+        super(JitTestCase, self).setUp()
         self.prev_symbolic_shapes_test_enabled = torch._C._jit_symbolic_shapes_test_mode_enabled()
         torch._C._jit_set_symbolic_shapes_test_mode(True)
 
@@ -309,7 +310,7 @@ class CatMod(nn.Module):
             __constants__ = ['dim']
 
             def __init__(self, dim=0):
-                super(CatMod, self).__init__()
+                super().__init__()
                 self.dim = dim
 
             def forward(self, x, y):
@@ -442,7 +443,7 @@ def test_partial_eval_stitching(self):
     def test_refinement_through_graph_stitching(self):
         class TwoConvs(torch.nn.Module):
             def __init__(self):
-                super(TwoConvs, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
                 self.conv2 = torch.nn.Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
 
diff --git a/test/jit/test_torchbind.py b/test/jit/test_torchbind.py
index e97767c00039..b92793eb8d94 100644
--- a/test/jit/test_torchbind.py
+++ b/test/jit/test_torchbind.py
@@ -65,7 +65,7 @@ def f():
         test_equality(f, lambda x: x)
 
         # test nn module with prepare_scriptable function
-        class NonJitableClass(object):
+        class NonJitableClass:
             def __init__(self, int1, int2):
                 self.int1 = int1
                 self.int2 = int2
@@ -75,7 +75,7 @@ def return_vals(self):
 
         class CustomWrapper(torch.nn.Module):
             def __init__(self, foo):
-                super(CustomWrapper, self).__init__()
+                super().__init__()
                 self.foo = foo
 
             def forward(self) -> None:
@@ -239,7 +239,7 @@ def foo():
     def test_torchbind_class_attr_recursive(self):
         class FooBar(torch.nn.Module):
             def __init__(self, foo_model):
-                super(FooBar, self).__init__()
+                super().__init__()
                 self.foo_mod = foo_model
 
             def forward(self) -> int:
@@ -256,7 +256,7 @@ def to_ivalue(self):
     def test_torchbind_class_attribute(self):
         class FooBar1234(torch.nn.Module):
             def __init__(self):
-                super(FooBar1234, self).__init__()
+                super().__init__()
                 self.f = torch.classes._TorchScriptTesting._StackString(["3", "4"])
 
             def forward(self):
@@ -272,7 +272,7 @@ def forward(self):
     def test_torchbind_getstate(self):
         class FooBar4321(torch.nn.Module):
             def __init__(self):
-                super(FooBar4321, self).__init__()
+                super().__init__()
                 self.f = torch.classes._TorchScriptTesting._PickleTester([3, 4])
 
             def forward(self):
@@ -293,7 +293,7 @@ def forward(self):
     def test_torchbind_deepcopy(self):
         class FooBar4321(torch.nn.Module):
             def __init__(self):
-                super(FooBar4321, self).__init__()
+                super().__init__()
                 self.f = torch.classes._TorchScriptTesting._PickleTester([3, 4])
 
             def forward(self):
@@ -309,7 +309,7 @@ def forward(self):
     def test_torchbind_python_deepcopy(self):
         class FooBar4321(torch.nn.Module):
             def __init__(self):
-                super(FooBar4321, self).__init__()
+                super().__init__()
                 self.f = torch.classes._TorchScriptTesting._PickleTester([3, 4])
 
             def forward(self):
@@ -324,7 +324,7 @@ def forward(self):
     def test_torchbind_tracing(self):
         class TryTracing(torch.nn.Module):
             def __init__(self):
-                super(TryTracing, self).__init__()
+                super().__init__()
                 self.f = torch.classes._TorchScriptTesting._PickleTester([3, 4])
 
             def forward(self):
@@ -340,12 +340,12 @@ def test_torchbind_pass_wrong_type(self):
     def test_torchbind_tracing_nested(self):
         class TryTracingNest(torch.nn.Module):
             def __init__(self):
-                super(TryTracingNest, self).__init__()
+                super().__init__()
                 self.f = torch.classes._TorchScriptTesting._PickleTester([3, 4])
 
         class TryTracing123(torch.nn.Module):
             def __init__(self):
-                super(TryTracing123, self).__init__()
+                super().__init__()
                 self.nest = TryTracingNest()
 
             def forward(self):
diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py
index b36003a2b920..170395102771 100644
--- a/test/jit/test_tracer.py
+++ b/test/jit/test_tracer.py
@@ -40,7 +40,7 @@ class TestTracer(JitTestCase):
     def test_large_nbr_kernel_args(self):
         class Recurrence(nn.Module):
             def __init__(self, seq_len):
-                super(Recurrence, self).__init__()
+                super().__init__()
                 self.seq_len = seq_len
 
             def forward(self, input):
@@ -87,9 +87,6 @@ def f(x, y):
 
     def test_trace_checking_with_global_name(self):
         class MyClass(torch.nn.Module):
-            def __init__(self):
-                super(MyClass, self).__init__()
-
             def forward(self, xs: List[Tensor]):
                 y = torch.cat(xs, dim=0)
                 return y
@@ -105,7 +102,7 @@ def forward(self, xs: List[Tensor]):
     def test_trace_aliased_parameter(self):
         class M(nn.Module):
             def __init__(self, x):
-                super(M, self).__init__()
+                super().__init__()
                 self.x = nn.Parameter(x)
 
             def forward(self, y):
@@ -622,9 +619,6 @@ def test(d):
     def test_input_dict_remembers_keys(self):
         """Check that the trace remembers which keys were in a dict input"""
         class TestModule(torch.nn.Module):
-            def __init__(self):
-                super(TestModule, self).__init__()
-
             def forward(self, dict_input):
                 return dict_input['x']
 
@@ -649,9 +643,6 @@ def forward(self, dict_input):
     def test_input_dict_insertion_order(self):
         """Check that dictionary access doesn't care about insertion order"""
         class TestModule(torch.nn.Module):
-            def __init__(self):
-                super(TestModule, self).__init__()
-
             def forward(self, dict_input):
                 return dict_input['x'], dict_input['y']
         input_x_then_y = {}
@@ -671,9 +662,6 @@ def forward(self, dict_input):
 
     def test_input_dict_recursive(self):
         class TestModule(torch.nn.Module):
-            def __init__(self):
-                super(TestModule, self).__init__()
-
             def forward(self, dict_input):
                 return dict_input['x'][1]
 
@@ -833,7 +821,7 @@ def f(x):
     def test_shared_param(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.b = self.a = nn.Parameter(torch.randn(2, 2))
 
             def forward(self, x):
@@ -852,9 +840,6 @@ def test_trace_c10_ops(self):
             self.skipTest("Skip the test since c2 ops are not registered.")
 
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super(MyModel, self).__init__()
-
             def forward(self, scores, bbox_deltas, im_info, anchors):
                 a, b = torch.ops._caffe2.GenerateProposals(
                     (scores), (bbox_deltas), (im_info), (anchors),
@@ -955,7 +940,7 @@ def foo(a):
     def test_traced_module_cuda(self):
         class Model(nn.Module):
             def __init__(self, num_features, num_layers):
-                super(Model, self).__init__()
+                super().__init__()
                 self.num_layers = num_layers
                 layers = [[nn.Linear(num_features, num_features), nn.Sigmoid()]
                           for _ in range(num_layers)]
@@ -1135,7 +1120,7 @@ def foo(x):
     def test_trace_dict_input(self):
         class Bar(torch.nn.Module):
             def __init__(self):
-                super(Bar, self).__init__()
+                super().__init__()
                 self.foo = Foo()
 
             def forward(self, a, b):
@@ -1267,7 +1252,7 @@ def forward(self):
     def test_trace_save_load_copy(self):
         class Test(torch.nn.Module):
             def __init__(self):
-                super(Test, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 3)
 
             def forward(self, x):
@@ -1285,7 +1270,7 @@ def forward(self, x):
     def test_trace_export_fns(self):
         class Foo(torch.nn.Module):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.a = 3
 
             @torch.jit.export
@@ -1316,7 +1301,7 @@ def check(mod):
     def test_trace_export_fns_recursive(self):
         class Foo(torch.nn.Module):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.a = 3
 
             @torch.jit.export
@@ -1333,7 +1318,7 @@ def forward(self, x):
 
         class Wrapper(torch.nn.Module):
             def __init__(self):
-                super(Wrapper, self).__init__()
+                super().__init__()
                 self.foo = Foo()
 
             def forward(self, x):
@@ -1354,9 +1339,6 @@ def check(mod):
 
         # Note that Bar's forward can only be traced, but not scripted
         class Bar(nn.Module):
-            def __init__(self):
-                super().__init__()
-
             @torch.jit.export
             def addTwo(self, x):
                 return x + 2
@@ -1369,7 +1351,7 @@ def forward(self, input):
         # being traced.
         class WrapperExports(torch.nn.Module):
             def __init__(self):
-                super(WrapperExports, self).__init__()
+                super().__init__()
                 self.bar = Bar()
 
             @torch.jit.export
@@ -1403,7 +1385,7 @@ def forward(self, x):
 
         class Wrapper(torch.nn.Module):
             def __init__(self):
-                super(Wrapper, self).__init__()
+                super().__init__()
                 self.tm = TracedModule()
 
             def forward(self, x):
@@ -1455,7 +1437,7 @@ def forward(self, x, y):
     def test_interpolate_trace(self):
         class test(nn.Module):
             def __init__(self):
-                super(test, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(1, 32, kernel_size=3, padding=1)
 
             def forward(self, x):
@@ -1515,7 +1497,7 @@ def traced_fn(x):
 
         class TracedModule(torch.nn.Module):
             def __init__(self):
-                super(TracedModule, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 5))
 
             def forward(self, x):
@@ -1533,7 +1515,7 @@ def forward(self, x):
     def test_call_traced_module_from_traced_module(self):
         class TracedModule1(torch.nn.Module):
             def __init__(self):
-                super(TracedModule1, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(5, 7))
 
             def forward(self, x):
@@ -1541,7 +1523,7 @@ def forward(self, x):
 
         class TracedModule(torch.nn.Module):
             def __init__(self):
-                super(TracedModule, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 5))
                 self.mod = torch.jit.trace(TracedModule1(), torch.rand(3, 5))
 
@@ -1697,7 +1679,7 @@ def foo(x):
     def test_trace_modulelist(self):
         class MySubmod(torch.nn.Module):
             def __init__(self):
-                super(MySubmod, self).__init__()
+                super().__init__()
                 self.relu = torch.nn.ReLU()
 
             def forward(self, x):
@@ -1705,7 +1687,7 @@ def forward(self, x):
 
         class MyMod(torch.nn.Module):
             def __init__(self):
-                super(MyMod, self).__init__()
+                super().__init__()
                 self.ml = torch.nn.ModuleList([
                     MySubmod(),
                     MySubmod()
@@ -1721,7 +1703,7 @@ def forward(self, x):
     def test_trace_fork_join_and_module(self):
         class MySubmod(torch.nn.Module):
             def __init__(self):
-                super(MySubmod, self).__init__()
+                super().__init__()
                 self.relu = torch.nn.ReLU()
 
             def forward(self, x):
@@ -1729,7 +1711,7 @@ def forward(self, x):
 
         class Mod(torch.nn.Module):
             def __init__(self):
-                super(Mod, self).__init__()
+                super().__init__()
                 self.ml = torch.nn.ModuleList([
                     MySubmod() for i in range(2)
                 ])
@@ -1751,7 +1733,7 @@ def forward(self, x):
     def test_trace_invert_module_hierarchy(self):
         class MySubmod(torch.nn.Module):
             def __init__(self):
-                super(MySubmod, self).__init__()
+                super().__init__()
                 self.relu = torch.nn.ReLU()
 
             def forward(self, x):
@@ -1763,7 +1745,7 @@ def forward(self, x, submod):
 
         class Mod(torch.nn.Module):
             def __init__(self):
-                super(Mod, self).__init__()
+                super().__init__()
                 self.sm = MySubmod()
                 self.fm = MyFunctionalMod()
 
@@ -1790,9 +1772,6 @@ def foo(bar, baz):
     @skipIfTorchDynamo("Not a suitable test for TorchDynamo")
     def test_tracing_hooks(self):
         class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-
             def forward(self, x):
                 return x + x
 
@@ -1851,9 +1830,6 @@ def pre_hook_ret(mod, input):
 
     def test_tracing_backward_hook_error(self):
         class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-
             def forward(self, x):
                 return x + x
 
@@ -1869,7 +1845,7 @@ def backward_hook(module, grad_input, grad_output):
     def test_tracing_multiple_methods(self):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(1, 1, 3)
 
             def forward(self, x):
@@ -1930,7 +1906,7 @@ def forward(self, inputs):
     def test_trace_with_conditional_property(self):
         class Net(nn.Module):
             def __init__(self, attr=None):
-                super(Net, self).__init__()
+                super().__init__()
                 if attr is not None:
                     self._attr = attr
                 self.attr_name = '_attr'
@@ -1964,7 +1940,7 @@ def fn(first_arg: torch.Tensor, second_arg=1) -> torch.Tensor:
     def test_trace_module_argument_names_captured(self):
         class TestModule(nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(1, 1, 3)
 
             def forward(self, first_arg: torch.Tensor, second_arg: torch.Tensor):
@@ -1983,6 +1959,20 @@ def forward(self, first_arg: torch.Tensor, second_arg: torch.Tensor):
         FileCheck().check("first_arg").check_next("second_arg") \
             .run(str(traced_module.graph))
 
+    def test_trace_checking_with_deprecated_name(self):
+        class MyClass(torch.nn.Module):
+            def __init__(self):
+                super(MyClass, self).__init__()
+
+            def forward(self, x, y, **deprecated_arguments):
+                if len(deprecated_arguments) > 0:
+                    raise RuntimeError(f"Got unexpected arguments: {deprecated_arguments}")
+                return x + y
+
+        model = MyClass()
+        m2 = torch.jit.trace(model, (torch.ones(1), torch.ones(1)))
+        m3 = torch.jit.trace(model, example_kwarg_inputs={'x': torch.ones(1), "y": torch.ones(1)}, strict=False)
+
 
 class TestMixTracingScripting(JitTestCase):
     def test_trace_script(self):
@@ -2105,7 +2095,7 @@ def test_trace_hierarchy(self):
 
         class AnotherScriptMod(torch.jit.ScriptModule):
             def __init__(self):
-                super(AnotherScriptMod, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(1, 2, 3))
 
             @torch.jit.script_method
@@ -2114,7 +2104,7 @@ def bar(self):
 
         class SomeScriptMod(torch.jit.ScriptModule):
             def __init__(self):
-                super(SomeScriptMod, self).__init__()
+                super().__init__()
                 self.asm = AnotherScriptMod()
 
             @torch.jit.script_method
@@ -2127,7 +2117,7 @@ def bar(self):
 
         class TraceMe(torch.nn.Module):
             def __init__(self):
-                super(TraceMe, self).__init__()
+                super().__init__()
                 self.ssm = SomeScriptMod()
 
             def forward(self, x):
@@ -2154,7 +2144,7 @@ def forward(self, x):
     def test_trace_parameter(self):
         class Param(nn.Module):
             def __init__(self):
-                super(Param, self).__init__()
+                super().__init__()
                 self.register_parameter("bias", nn.Parameter(torch.empty(4, 4)))
 
             def forward(self, x):
@@ -2162,7 +2152,7 @@ def forward(self, x):
 
         class M3(torch.jit.ScriptModule):
             def __init__(self, model):
-                super(M3, self).__init__()
+                super().__init__()
                 self.traced = torch.jit.trace(model, (torch.rand(3, 3)))
 
             @torch.jit.script_method
@@ -2171,7 +2161,7 @@ def forward(self, x):
 
         class M2(nn.Module):
             def __init__(self, model):
-                super(M2, self).__init__()
+                super().__init__()
                 self.module = M3(model)
 
             def forward(self, x):
@@ -2179,7 +2169,7 @@ def forward(self, x):
 
         class M1(torch.jit.ScriptModule):
             def __init__(self, model):
-                super(M1, self).__init__()
+                super().__init__()
                 self.traced = torch.jit.trace(M2(model), (torch.rand(3, 3)))
 
             @torch.jit.script_method
@@ -2199,7 +2189,7 @@ def scripted_fn(x):
 
         class TracedModule(torch.nn.Module):
             def __init__(self):
-                super(TracedModule, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 5))
 
             def forward(self, x):
@@ -2212,7 +2202,7 @@ def forward(self, x):
     def test_call_script_module_from_traced_module(self):
         class ScriptMod(torch.jit.ScriptModule):
             def __init__(self):
-                super(ScriptMod, self).__init__()
+                super().__init__()
                 self.param_foo = torch.nn.Parameter(torch.rand(5, 7))
 
             @torch.jit.script_method
@@ -2221,7 +2211,7 @@ def forward(self, x):
 
         class TracedModule(torch.nn.Module):
             def __init__(self):
-                super(TracedModule, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 5))
                 self.mod = ScriptMod()
 
@@ -2247,9 +2237,6 @@ def script_fn(x):
     def test_call_traced_mod_from_script_fn(self):
         with self.assertRaisesRegex(RuntimeError, "Cannot call a ScriptModule that is not a submodule of the caller"):
             class TracedModule(torch.nn.Module):
-                def __init__(self):
-                    super(TracedModule, self).__init__()
-
                 def forward(self, x):
                     return torch.mm(x, torch.zeros(4, 3))
 
@@ -2267,7 +2254,7 @@ def traced_fn(x):
 
         class ScriptMod(torch.jit.ScriptModule):
             def __init__(self):
-                super(ScriptMod, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 3))
 
             @torch.jit.script_method
@@ -2281,7 +2268,7 @@ def forward(self, x):
     def test_call_tracing_mod_from_script_module(self):
         class TracedMod(torch.nn.Module):
             def __init__(self):
-                super(TracedMod, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(3, 5))
 
             def forward(self, x):
@@ -2289,7 +2276,7 @@ def forward(self, x):
 
         class ScriptMod(torch.jit.ScriptModule):
             def __init__(self):
-                super(ScriptMod, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 3))
                 self.tm = torch.jit.trace(TracedMod(), torch.rand(3, 3))
 
@@ -2302,15 +2289,12 @@ def forward(self, x):
 
     def test_script_inline_trace_multiple_args(self):
         class M(torch.nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, input, input2):
                 return input + input2
 
         class M2(torch.jit.ScriptModule):
             def __init__(self):
-                super(M2, self).__init__()
+                super().__init__()
                 self.m = torch.jit.trace(M(), (torch.zeros(4, 3), torch.zeros(4, 3)))
 
             @torch.jit.script_method
@@ -2324,7 +2308,7 @@ def forward(self, inp):
     def test_trace_dict_mix_script(self):
         class testB(torch.nn.Module):
             def __init__(self):
-                super(testB, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(2, 2)
 
             def forward(self, feature_map: Dict[str, List[Tensor]]) -> Tensor:
@@ -2336,7 +2320,7 @@ def forward(self, feature_map: Dict[str, List[Tensor]]) -> Tensor:
 
         class testA(torch.nn.Module):
             def __init__(self):
-                super(testA, self).__init__()
+                super().__init__()
                 self.b = torch.jit.script(testB())
 
             def forward(self, input_map: Dict[str, List[Tensor]]) -> Tensor:
@@ -2357,9 +2341,6 @@ def test_trace_script_returning_complex_dict(self):
         The dictionary can should be able to contain other containers (like a tuple) recursively.
         """
         class ReturnsDict(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(
                 self, id_score_list: Dict[str, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]
             ) -> Dict[str, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
@@ -2373,9 +2354,6 @@ def forward(
                 return result
 
         class ChecksDict(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, input: Dict[str, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]):
                 v = input["1000"]
                 return v[1] + 1
@@ -2418,9 +2396,6 @@ def test_trace_returning_dict_with_tensor_tuples(self):
         should work.
         """
         class ReturnsDict(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(
                 self, k: torch.Tensor, v: torch.Tensor
             ) -> Dict[str, Tuple[torch.Tensor, torch.Tensor]]:
@@ -2432,9 +2407,6 @@ def forward(
                 return result
 
         class ReturnsBadDict(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(
                 self, k: torch.Tensor, v: torch.Tensor
             ) -> Dict[str, Tuple[torch.Tensor, float]]:
@@ -2473,7 +2445,7 @@ def forward(self, first_arg: torch.Tensor, second_arg: torch.Tensor) -> torch.Te
 
         class TestModule(nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(1, 1, 3)
 
             def forward(self, first_arg: torch.Tensor, second_arg: torch.Tensor) -> torch.Tensor:
@@ -2541,3 +2513,21 @@ def forward(self, input: torch.Tensor):
         top = TopModule()
         top_example_input = torch.ones(1)
         torch.jit.trace(top, top_example_input)
+
+    def test_jit_trace_callfunction_return_shapes(self):
+        # a torch.jit.script function gets inserted as a CallFunction node
+        @torch.jit.script
+        def inner_fn(x):
+            return torch.cat((x, x))
+
+        def outer_fn(x, y):
+            return inner_fn(x + y).relu()
+
+        x, y = [torch.rand((2, 2), dtype=torch.float) for _ in range(2)]
+        fn_t = torch.jit.trace(outer_fn, (x, y))
+
+        # expect that the CallFunction node return type has shape information on it.
+        FileCheck().check("Float").check("4, 2").check("CallFunction").run(fn_t.graph)
+        for n in fn_t.graph.nodes():
+            if n.kind() == "prim::CallFunction":
+                self.assertTrue(n.output().isCompleteTensor())
diff --git a/test/jit/test_type_sharing.py b/test/jit/test_type_sharing.py
index 17b61382a56b..c2b84fc4e50d 100644
--- a/test/jit/test_type_sharing.py
+++ b/test/jit/test_type_sharing.py
@@ -35,7 +35,7 @@ def assertDifferentType(self, m1, m2):
     def test_basic(self):
         class M(torch.nn.Module):
             def __init__(self, a, b, c):
-                super(M, self).__init__()
+                super().__init__()
                 self.a = a
                 self.b = b
                 self.c = c
@@ -55,7 +55,7 @@ def test_diff_attr_values(self):
         """
         class M(torch.nn.Module):
             def __init__(self, a, b, c):
-                super(M, self).__init__()
+                super().__init__()
                 self.a = a
                 self.b = b
                 self.c = c
@@ -77,7 +77,7 @@ class M(torch.nn.Module):
             __constants__ = ["const"]
 
             def __init__(self, attr, const):
-                super(M, self).__init__()
+                super().__init__()
                 self.attr = attr
                 self.const = const
 
@@ -113,7 +113,7 @@ def test_submodules(self):
         """
         class M(torch.nn.Module):
             def __init__(self, in1, out1, in2, out2):
-                super(M, self).__init__()
+                super().__init__()
                 self.submod1 = torch.nn.Linear(in1, out1)
                 self.submod2 = torch.nn.Linear(in2, out2)
 
@@ -139,7 +139,7 @@ def test_param_vs_attribute(self):
         """
         class M(torch.nn.Module):
             def __init__(self, foo):
-                super(M, self).__init__()
+                super().__init__()
                 self.foo = foo
 
             def forward(self, x):
@@ -160,7 +160,7 @@ class A(torch.nn.Module):
             __constants__ = ["const"]
 
             def __init__(self, in1, out1, in2, out2):
-                super(A, self).__init__()
+                super().__init__()
                 self.submod1 = torch.nn.Linear(in1, out1)
                 self.submod2 = torch.nn.Linear(in2, out2)
                 self.const = 5
@@ -174,7 +174,7 @@ class B(torch.nn.Module):
             __constants__ = ["const"]
 
             def __init__(self, in1, out1, in2, out2):
-                super(B, self).__init__()
+                super().__init__()
                 self.submod1 = torch.nn.Linear(in1, out1)
                 self.submod2 = torch.nn.Linear(in2, out2)
                 self.const = 5
@@ -194,7 +194,7 @@ def test_mutate_attr_value(self):
         """
         class M(torch.nn.Module):
             def __init__(self, in1, out1, in2, out2):
-                super(M, self).__init__()
+                super().__init__()
                 self.submod1 = torch.nn.Linear(in1, out1)
                 self.submod2 = torch.nn.Linear(in2, out2)
                 self.foo = torch.ones(in1, in1)
@@ -216,7 +216,7 @@ def test_assign_python_attr(self):
         """
         class M(torch.nn.Module):
             def __init__(self, in1, out1, in2, out2):
-                super(M, self).__init__()
+                super().__init__()
                 self.submod1 = torch.nn.Linear(in1, out1)
                 self.submod2 = torch.nn.Linear(in2, out2)
                 self.foo = torch.ones(in1, in1)
@@ -246,7 +246,7 @@ def test_failed_attribute_compilation(self):
         """
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 # assign a type we know can't be converted to TorchScript
                 self.foo = object
 
@@ -274,7 +274,7 @@ def fn2(x):
 
         class M(torch.nn.Module):
             def __init__(self, fn):
-                super(M, self).__init__()
+                super().__init__()
                 self.fn = fn
 
             def forward(self, x):
@@ -288,7 +288,7 @@ def forward(self, x):
     def test_builtin_function_same(self):
         class Caller(torch.nn.Module):
             def __init__(self, fn):
-                super(Caller, self).__init__()
+                super().__init__()
                 self.fn = fn
 
             def forward(self, input):
@@ -302,7 +302,7 @@ def forward(self, input):
     def test_builtin_function_different(self):
         class Caller(torch.nn.Module):
             def __init__(self, fn):
-                super(Caller, self).__init__()
+                super().__init__()
                 self.fn = fn
 
             def forward(self, input):
@@ -323,7 +323,7 @@ def fn(x):
 
         class M(torch.nn.Module):
             def __init__(self, fn):
-                super(M, self).__init__()
+                super().__init__()
                 self.fn = fn
 
             def forward(self, x):
@@ -346,7 +346,7 @@ def fn2(x):
 
         class M(torch.nn.Module):
             def __init__(self, fn):
-                super(M, self).__init__()
+                super().__init__()
                 self.fn = fn
 
             def forward(self, x):
@@ -366,7 +366,7 @@ def fn(x):
 
         class M(torch.nn.Module):
             def __init__(self, fn):
-                super(M, self).__init__()
+                super().__init__()
                 self.fn = fn
 
             def forward(self, x):
@@ -384,9 +384,6 @@ def test_tracing_gives_different_types(self):
         trace runs, tracing must always generate a unique type.
         """
         class M(torch.nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, x, y):
                 if x.sum() > y.sum():
                     return x
@@ -400,7 +397,7 @@ def forward(self, x, y):
     def test_ignored_fns(self):
         class M(torch.nn.Module):
             def __init__(self, foo):
-                super(M, self).__init__()
+                super().__init__()
                 self.foo = foo
 
             @torch.jit.ignore
@@ -418,9 +415,6 @@ def forward(self):
     @suppress_warnings
     def test_script_module_containing_traced_module(self):
         class Traced(torch.nn.Module):
-            def __init__(self):
-                super(Traced, self).__init__()
-
             def forward(self, x):
                 if x.sum() > 0:
                     return x
@@ -429,7 +423,7 @@ def forward(self, x):
 
         class M(torch.nn.Module):
             def __init__(self, input):
-                super(M, self).__init__()
+                super().__init__()
                 self.traced = torch.jit.trace(Traced(), input)
 
             def forward(self, x):
@@ -442,7 +436,7 @@ def forward(self, x):
     def test_loaded_modules_work(self):
         class AB(torch.nn.Module):
             def __init__(self):
-                super(AB, self).__init__()
+                super().__init__()
                 self.a = 1
                 self.b = 1
 
@@ -451,7 +445,7 @@ def forward(self):
 
         class A(torch.nn.Module):
             def __init__(self):
-                super(A, self).__init__()
+                super().__init__()
                 self.a = 1
 
             def forward(self):
@@ -459,7 +453,7 @@ def forward(self):
 
         class Wrapper(torch.nn.Module):
             def __init__(self, sub):
-                super(Wrapper, self).__init__()
+                super().__init__()
                 self.sub = sub
 
             def forward(self):
@@ -483,15 +477,12 @@ def test_module_dict_same_type_different_name(self):
         that have different keys but the same value types.
         """
         class A(torch.nn.Module):
-            def __init__(self):
-                super(A, self).__init__()
-
             def forward(self, x):
                 return x
 
         class Foo(torch.nn.Module):
             def __init__(self, s):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.dict = torch.nn.ModuleDict(s)
 
             def forward(self, x):
@@ -536,9 +527,6 @@ def forward(self, x):
                 return x
 
         class B(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 return x
 
diff --git a/test/jit/test_types.py b/test/jit/test_types.py
index fd28448387d9..8374afc5424d 100644
--- a/test/jit/test_types.py
+++ b/test/jit/test_types.py
@@ -1,7 +1,7 @@
 # Owner(s): ["oncall: jit"]
 
 from collections import namedtuple
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, Iterator, List, Optional, Tuple
 
 from torch.testing._internal.jit_utils import JitTestCase
 from torch.testing import FileCheck
@@ -50,9 +50,6 @@ def fn(m: torch.Tensor) -> torch.device:
         GG = namedtuple('GG', ['f', 'g'])
 
         class Foo(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             @torch.jit.ignore
             def foo(self, x: torch.Tensor, z: torch.Tensor) -> Tuple[GG, GG]:
                 return GG(x, z), GG(x, z)
@@ -64,9 +61,6 @@ def forward(self, x, z):
         y = foo(torch.randn(2, 2), torch.randn(2, 2))
 
         class Foo(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             @torch.jit.ignore
             def foo(self, x, z) -> Tuple[GG, GG]:
                 return GG(x, z)
@@ -83,9 +77,6 @@ def fn(x: Dict[str, Optional[torch.Tensor]]):
             return x + 10
 
         class M(torch.nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, in_batch: Dict[str, Optional[torch.Tensor]]) -> torch.Tensor:
                 self.dropout_modality(in_batch)
                 fn(in_batch)
@@ -99,7 +90,7 @@ def dropout_modality(self, in_batch: Dict[str, Optional[torch.Tensor]]) -> Dict[
         FileCheck().check("dropout_modality").check("in_batch").run(str(sm.graph))
 
     def test_python_callable(self):
-        class MyPythonClass(object):
+        class MyPythonClass:
             @torch.jit.ignore
             def __call__(self, *args) -> str:
                 return str(type(args[0]))
@@ -200,9 +191,6 @@ def test_ignoring_module_attributes(self):
         Test that module attributes can be ignored.
         """
         class Sub(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, a: int) -> int:
                 return sum([a])
 
@@ -244,6 +232,32 @@ def forward(self) -> int:
         with self.assertRaisesRegexWithHighlight(RuntimeError, r"attribute was ignored during compilation", "self.sub"):
             scripted_mod = torch.jit.script(mod)
 
+
+    def test_ignoring_fn_with_nonscriptable_types(self):
+        class CFX:
+            def __init__(self, a: List[torch.Tensor]) -> None:
+                self.a = a
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return torch.sin(x)
+
+            @torch.jit._drop
+            def __iter__(self) -> Iterator[torch.Tensor]:
+                return iter(self.a)
+
+            @torch.jit._drop
+            def __fx_create_arg__(self, tracer: torch.fx.Tracer) -> torch.fx.node.Argument:
+                # torch.fx classes are not scriptable
+                return tracer.create_node(
+                    "call_function",
+                    CFX,
+                    args=(tracer.create_arg(self.features),),
+                    kwargs={},
+                )
+
+        torch.jit.script(CFX)
+
+
     def test_unimported_type_resolution(self):
         # verify fallback from the python resolver to the c++ resolver
 
@@ -280,7 +294,7 @@ def test_annotate_outside_init(self):
         # Simple case
         with self.assertRaisesRegexWithHighlight(ValueError, msg, highlight):
             @torch.jit.script
-            class BadModule(object):
+            class BadModule:
                 def __init__(self, x: int):
                     self.x = x
 
@@ -290,7 +304,7 @@ def set(self, val: int):
         # Type annotation in a loop
         with self.assertRaisesRegexWithHighlight(ValueError, msg, highlight):
             @torch.jit.script
-            class BadModuleLoop(object):
+            class BadModuleLoop:
                 def __init__(self, x: int):
                     self.x = x
 
@@ -300,7 +314,7 @@ def set(self, val: int):
 
         # Type annotation in __init__, should not fail
         @torch.jit.script
-        class GoodModule(object):
+        class GoodModule:
             def __init__(self, x: int):
                 self.x: int = x
 
diff --git a/test/jit/test_typing.py b/test/jit/test_typing.py
index e0932d40ebde..fd0187a2e7a1 100644
--- a/test/jit/test_typing.py
+++ b/test/jit/test_typing.py
@@ -196,7 +196,7 @@ def stuff4(x):
         li_1, li_2, li_3 = stuff4([True])
         li_3 = li_3[0]
         for li in [li_1, li_2, li_3]:
-            self.assertTrue(type(li[0]) == type(True))
+            self.assertTrue(type(li[0]) == bool)
 
     def test_nested_list(self):
         def foo(z):
diff --git a/test/jit/test_union.py b/test/jit/test_union.py
index c5b9e59bcb9d..bee1efc0317c 100644
--- a/test/jit/test_union.py
+++ b/test/jit/test_union.py
@@ -113,7 +113,7 @@ def fn(x: Union[str, Color]) -> str:
     def test_union_in_class_constructor(self):
 
         @torch.jit.script  # noqa: B903
-        class A(object):    # noqa: B903
+        class A:    # noqa: B903
             def __init__(self, x: Union[int, str]) -> None:
                 self.x = x
 
diff --git a/test/jit/test_with.py b/test/jit/test_with.py
index ddbd90a025da..03638ed31809 100644
--- a/test/jit/test_with.py
+++ b/test/jit/test_with.py
@@ -33,7 +33,7 @@ def test_with_as(self):
         to targets work as expected.
         """
         @torch.jit.script
-        class Context(object):
+        class Context:
             """
             This class implements a basic context manager interface for use in
             the unit tests. Unlike Context, the stateful part of this class
@@ -190,7 +190,7 @@ def test_with_no_as(self):
         to targets work as expected.
         """
         @torch.jit.script
-        class Context(object):
+        class Context:
             """
             This class implements a basic context manager interface for use in
             the unit tests. Unlike Context, the stateful part of this class
@@ -346,7 +346,7 @@ def test_with_exceptions(self):
         handled correctly.
         """
         @torch.jit.script
-        class Context(object):
+        class Context:
             """
             This class implements a basic context manager interface for use in
             the unit tests. Unlike Context, the stateful part of this class
@@ -434,7 +434,7 @@ def test_with_errors(self):
         """
 
         @torch.jit.script
-        class NoEnterNoExit(object):
+        class NoEnterNoExit:
             """
             This class is missing __enter__ and __exit__ methods.
             """
@@ -443,7 +443,7 @@ def __init__(self):
                 self.count = 1
 
         @torch.jit.script
-        class BadEnter(object):
+        class BadEnter:
             """
             This class has an __enter__ method with an incorrect signature.
             """
@@ -458,7 +458,7 @@ def __exit__(self, type: Any, value: Any, tb: Any):
                 pass
 
         @torch.jit.script
-        class BadExit(object):
+        class BadExit:
             """
             This class has an __exit__ method with an incorrect signature.
             """
@@ -473,7 +473,7 @@ def __exit__(self, type: Any, value: Any):
                 pass
 
         @torch.jit.script
-        class ExitIncorrectTypes(object):
+        class ExitIncorrectTypes:
             """
             This class has an __exit__ method with unsupported argument types.
             """
@@ -581,9 +581,6 @@ def test_no_grad_assignment(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         # Check that @torch.jit.ignored functions respect no_grad when it is
         # called in JIT mode.
         class NoGradModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             @torch.jit.ignore
             def adder(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
                 w = x + y
diff --git a/test/jit/xnnpack/test_xnnpack_delegate.py b/test/jit/xnnpack/test_xnnpack_delegate.py
index c54d9ba1b088..4c7bc4aa628c 100644
--- a/test/jit/xnnpack/test_xnnpack_delegate.py
+++ b/test/jit/xnnpack/test_xnnpack_delegate.py
@@ -38,9 +38,6 @@ def forward(self, x):
 
     def test_xnnpack_lowering(self):
         class Module(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 return x + x
 
@@ -98,9 +95,6 @@ def forward(self, x):
 
     def test_xnnpack_backend_add(self):
         class AddModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 z = x + y
                 z = z + x
@@ -130,9 +124,6 @@ def forward(self, x, y):
 
     def test_xnnpack_broadcasting(self):
         class AddModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 return x + y
 
@@ -159,9 +150,6 @@ def forward(self, x, y):
 
     def test_xnnpack_unsupported(self):
         class AddSpliceModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 z = x + y[:, :, 1, :]
                 return z
diff --git a/test/lazy/test_extract_compiled_graph.py b/test/lazy/test_extract_compiled_graph.py
index 0d916952be3b..bde68ae4dcf4 100644
--- a/test/lazy/test_extract_compiled_graph.py
+++ b/test/lazy/test_extract_compiled_graph.py
@@ -16,16 +16,10 @@
 import copy
 
 class ModuleConstScale(nn.Module):
-    def __init__(self):
-        super(ModuleConstScale, self).__init__()
-
     def forward(self, a):
         return a * 2
 
 class ModuleSub(nn.Module):
-    def __init__(self):
-        super(ModuleSub, self).__init__()
-
     def forward(self, a, b):
         return a - b
 
@@ -33,16 +27,10 @@ class ModuleAddcmul(nn.Module):
     """
     addcmul function takes a at::Scalar which results in a special TSData containing a Scalar rather than a Tensor.
     """
-    def __init__(self):
-        super(ModuleAddcmul, self).__init__()
-
     def forward(self, a, b, c):
         return torch.addcmul(a, b, c, value=5)
 
 class ModuleReturnMulti(nn.Module):
-    def __init__(self):
-        super(ModuleReturnMulti, self).__init__()
-
     def forward(self, a, b):
         return (b + 1, a - 1)
 
@@ -50,7 +38,7 @@ def forward(self, a, b):
 # a custom tracer.
 # class ModuleEagerTensor(nn.Module):
 #     def __init__(self):
-#         super(ModuleEagerTensor, self).__init__()
+#         super().__init__()
 #
 #     def forward(self, a):
 #         b = torch.randn(2, 3, device="cpu") # eager device
@@ -65,7 +53,7 @@ def forward(self, a, b):
 # method to a constant.. Comment out for now
 # class ModuleReturnEagerTensorOnDefaultDevice(nn.Module):
 #     def __init__(self):
-#         super(ModuleReturnEagerTensorOnDefaultDevice, self).__init__()
+#         super().__init__()
 #
 #     def forward(self):
 #         return torch.tensor((2, 3), dtype=torch.float32)
@@ -76,17 +64,11 @@ class ModuleReturnDupTensor(nn.Module):
     returned tuple. torchbench like drq will hit this corner case when running
     thru torchdynamo..
     """
-    def __init__(self):
-        super(ModuleReturnDupTensor, self).__init__()
-
     def forward(self, a, b):
         c = a + b
         return a - b, c, a + 1, c
 
 class ModuleInplaceUpdate(nn.Module):
-    def __init__(self):
-        super(ModuleInplaceUpdate, self).__init__()
-
     def forward(self, a, b):
         a.sub_(b)
         return b - 1, b + 1
diff --git a/test/lazy/test_ts_opinfo.py b/test/lazy/test_ts_opinfo.py
index 092ba3d0388d..070d97af189d 100644
--- a/test/lazy/test_ts_opinfo.py
+++ b/test/lazy/test_ts_opinfo.py
@@ -34,8 +34,8 @@ def init_lists():
         yaml_ts = yaml.load(f, yaml.Loader)
     LAZY_OPS_LIST = set(remove_suffixes(itertools.chain(yaml_ts["full_codegen"], yaml_ts["supported"], yaml_ts["autograd"])))
     HAS_SYMINT_SUFFIX = yaml_ts["symint"]
-    FALLBACK_LIST = set(["clamp"])
-    SKIP_RUNTIME_ERROR_LIST = set([
+    FALLBACK_LIST = {"clamp"}
+    SKIP_RUNTIME_ERROR_LIST = {
         'index_select',  # Empty output_sizes is not supported
         'clone',  # is clone decomposed?
 
@@ -46,19 +46,19 @@ def init_lists():
         'all',  # ASAN failure
         'any',  # ASAN failure
         'logdet',  # ASAN failure
-    ])
-    SKIP_INCORRECT_RESULTS_LIST = set([
+    }
+    SKIP_INCORRECT_RESULTS_LIST = {
         'squeeze',  # Value out of range
         't',  # Value out of range
         'transpose',  # Value out of range
         'bernoulli',  # incorrect results
         'pow',  # incorrect results
         'addcdiv',  # incorrect results (on CI not locally?)
-    ])
+    }
     # The following ops all show up directly in ts_native_functions.yaml,
     # but run functionalized versions of the composite kernels in core.
     # This means that we don't expect the ops to show directly in the LTC metrics.
-    FUNCTIONAL_DECOMPOSE_LIST = set([
+    FUNCTIONAL_DECOMPOSE_LIST = {
         'diag_embed',
         'block_diag',
         'new_empty_strided',
@@ -70,13 +70,13 @@ def init_lists():
         'linalg_inv_ex',
         'linalg_pinv.atol_rtol_tensor',
         'logsumexp',
-    ])
+    }
     # For some ops, we don't support all variants. Here we use formatted_name
     # to uniquely identify the variant.
-    SKIP_VARIANT_LIST = set([
+    SKIP_VARIANT_LIST = {
         'norm_nuc',
         'min_reduction_with_dim'
-    ])
+    }
 
     return (LAZY_OPS_LIST,
             FALLBACK_LIST,
diff --git a/test/mkldnn_verbose.py b/test/mkldnn_verbose.py
index 804eb9a24567..60fe87bd2308 100644
--- a/test/mkldnn_verbose.py
+++ b/test/mkldnn_verbose.py
@@ -3,7 +3,7 @@
 
 class Module(torch.nn.Module):
     def __init__(self):
-        super(Module, self).__init__()
+        super().__init__()
         self.conv = torch.nn.Conv2d(1, 10, 5, 1)
 
     def forward(self, x):
diff --git a/test/mobile/lightweight_dispatch/build.sh b/test/mobile/lightweight_dispatch/build.sh
index b478f048ff8e..7e062a89ea63 100755
--- a/test/mobile/lightweight_dispatch/build.sh
+++ b/test/mobile/lightweight_dispatch/build.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# This script should be called from .jenkins/pytorch/build.sh. Assuming we are at pytorch source root directory.
+# This script should be called from .ci/pytorch/build.sh. Assuming we are at pytorch source root directory.
 
 # Required environment variable: $BUILD_ENVIRONMENT
 # (This is set by default in the Docker images we build, so you don't
diff --git a/test/mobile/lightweight_dispatch/test_codegen_unboxing.cpp b/test/mobile/lightweight_dispatch/test_codegen_unboxing.cpp
index 80f26e68d260..1b879118b5b8 100644
--- a/test/mobile/lightweight_dispatch/test_codegen_unboxing.cpp
+++ b/test/mobile/lightweight_dispatch/test_codegen_unboxing.cpp
@@ -197,15 +197,16 @@ TEST(LiteInterpreterTest, MultipleOps) {
   auto testModelFile = "ModelWithMultipleOps.ptl";
 
   // class ModelWithMultipleOps(torch.nn.Module):
-  //           def __init__(self):
-  //               super(Model, self).__init__()
-  //               self.ops = torch.nn.Sequential(
-  //                   torch.nn.ReLU(),
-  //                   torch.nn.Flatten(),
-  //               )
-  //           def forward(self, x):
-  //               x[1] = -2
-  //               return self.ops(x)
+  //     def __init__(self):
+  //         super().__init__()
+  //         self.ops = torch.nn.Sequential(
+  //             torch.nn.ReLU(),
+  //             torch.nn.Flatten(),
+  //         )
+  //
+  //     def forward(self, x):
+  //         x[1] = -2
+  //         return self.ops(x)
 
   Module bc = _load_for_mobile(testModelFile);
   auto b = at::ones({2, 2, 2, 2});
diff --git a/test/mobile/model_test/android_api_module.py b/test/mobile/model_test/android_api_module.py
index 109e3aa963e8..acada05fc2ff 100644
--- a/test/mobile/model_test/android_api_module.py
+++ b/test/mobile/model_test/android_api_module.py
@@ -5,9 +5,6 @@
 
 
 class AndroidAPIModule(torch.jit.ScriptModule):
-    def __init__(self):
-        super(AndroidAPIModule, self).__init__()
-
     @torch.jit.script_method
     def forward(self, input):
         return None
diff --git a/test/mobile/model_test/builtin_ops.py b/test/mobile/model_test/builtin_ops.py
index 75b57f7b0613..b315c4f3897c 100644
--- a/test/mobile/model_test/builtin_ops.py
+++ b/test/mobile/model_test/builtin_ops.py
@@ -5,9 +5,6 @@
 
 
 class TSBuiltinOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(TSBuiltinOpsModule, self).__init__()
-
     def forward(self):
         x = torch.tensor(1)
         y = torch.tensor(0.5)
@@ -90,9 +87,6 @@ def forward(self):
 
 
 class TSCollectionOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(TSCollectionOpsModule, self).__init__()
-
     def forward(self):
         s = "abcde"
         # list
diff --git a/test/mobile/model_test/gen_test_model.py b/test/mobile/model_test/gen_test_model.py
index 370e8d08541f..7c6b780e8d6d 100644
--- a/test/mobile/model_test/gen_test_model.py
+++ b/test/mobile/model_test/gen_test_model.py
@@ -140,7 +140,7 @@ def calcOpsCoverage(ops):
                 "_coverage": round(coverage, 2),
                 "uncovered_ops": uncovered_ops_dict,
                 "covered_ops": covered_ops_dict,
-                "all_generated_ops": sorted(list(all_generated_ops)),
+                "all_generated_ops": sorted(all_generated_ops),
             },
             f,
         )
diff --git a/test/mobile/model_test/math_ops.py b/test/mobile/model_test/math_ops.py
index 551c712ed38b..009ec2e0c0c6 100644
--- a/test/mobile/model_test/math_ops.py
+++ b/test/mobile/model_test/math_ops.py
@@ -6,9 +6,6 @@
 
 
 class PointwiseOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(PointwiseOpsModule, self).__init__()
-
     def forward(self):
         return self.pointwise_ops()
 
@@ -212,9 +209,6 @@ def pointwise_ops(self):
 
 
 class ReductionOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(ReductionOpsModule, self).__init__()
-
     def forward(self):
         return self.reduction_ops()
 
@@ -265,9 +259,6 @@ def reduction_ops(self):
 
 
 class ComparisonOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(ComparisonOpsModule, self).__init__()
-
     def forward(self):
         a = torch.tensor(0)
         b = torch.tensor(1)
@@ -313,9 +304,6 @@ def forward(self):
 
 
 class OtherMathOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(OtherMathOpsModule, self).__init__()
-
     def forward(self):
         return self.other_ops()
 
@@ -387,9 +375,6 @@ def other_ops(self):
 
 
 class SpectralOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(SpectralOpsModule, self).__init__()
-
     def forward(self):
         return self.spectral_ops()
 
@@ -409,9 +394,6 @@ def spectral_ops(self):
 
 
 class BlasLapackOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(BlasLapackOpsModule, self).__init__()
-
     def forward(self):
         return self.blas_lapack_ops()
 
diff --git a/test/mobile/model_test/nn_ops.py b/test/mobile/model_test/nn_ops.py
index 338359c96408..6389a0081268 100644
--- a/test/mobile/model_test/nn_ops.py
+++ b/test/mobile/model_test/nn_ops.py
@@ -5,7 +5,7 @@
 # https://pytorch.org/docs/stable/nn.html
 class NNConvolutionModule(torch.nn.Module):
     def __init__(self):
-        super(NNConvolutionModule, self).__init__()
+        super().__init__()
         self.input1d = torch.randn(1, 4, 36)
         self.input2d = torch.randn(1, 4, 30, 10)
         self.input3d = torch.randn(1, 4, 10, 4, 4)
@@ -40,7 +40,7 @@ def forward(self):
 
 class NNPoolingModule(torch.nn.Module):
     def __init__(self):
-        super(NNPoolingModule, self).__init__()
+        super().__init__()
         self.input1d = torch.randn(1, 16, 50)
         self.module1d = nn.ModuleList(
             [
@@ -86,7 +86,7 @@ def forward(self):
 
 class NNPaddingModule(torch.nn.Module):
     def __init__(self):
-        super(NNPaddingModule, self).__init__()
+        super().__init__()
         self.input1d = torch.randn(1, 4, 50)
         self.module1d = nn.ModuleList(
             [
@@ -125,7 +125,7 @@ def forward(self):
 
 class NNNormalizationModule(torch.nn.Module):
     def __init__(self):
-        super(NNNormalizationModule, self).__init__()
+        super().__init__()
         self.input1d = torch.randn(1, 4, 50)
         self.module1d = nn.ModuleList(
             [
@@ -164,7 +164,7 @@ def forward(self):
 
 class NNActivationModule(torch.nn.Module):
     def __init__(self):
-        super(NNActivationModule, self).__init__()
+        super().__init__()
         self.activations = nn.ModuleList(
             [
                 nn.ELU(),
@@ -209,7 +209,7 @@ def forward(self):
 
 class NNRecurrentModule(torch.nn.Module):
     def __init__(self):
-        super(NNRecurrentModule, self).__init__()
+        super().__init__()
         self.rnn = nn.ModuleList(
             [
                 nn.RNN(4, 8, 2),
@@ -239,7 +239,7 @@ def forward(self):
 
 class NNTransformerModule(torch.nn.Module):
     def __init__(self):
-        super(NNTransformerModule, self).__init__()
+        super().__init__()
         self.transformers = nn.ModuleList(
             [
                 nn.Transformer(
@@ -265,7 +265,7 @@ def forward(self):
 
 class NNLinearModule(torch.nn.Module):
     def __init__(self):
-        super(NNLinearModule, self).__init__()
+        super().__init__()
         self.linears = nn.ModuleList(
             [
                 nn.Identity(54),
@@ -284,9 +284,6 @@ def forward(self):
 
 
 class NNDropoutModule(torch.nn.Module):
-    def __init__(self):
-        super(NNDropoutModule, self).__init__()
-
     def forward(self):
         a = torch.randn(8, 4)
         b = torch.randn(8, 4, 4, 4)
@@ -301,9 +298,6 @@ def forward(self):
 
 
 class NNSparseModule(torch.nn.Module):
-    def __init__(self):
-        super(NNSparseModule, self).__init__()
-
     def forward(self):
         input = torch.tensor([[1, 2, 4, 5], [4, 3, 2, 9]])
         input2 = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])
@@ -317,9 +311,6 @@ def forward(self):
 
 
 class NNDistanceModule(torch.nn.Module):
-    def __init__(self):
-        super(NNDistanceModule, self).__init__()
-
     def forward(self):
         a = torch.randn(8, 4)
         b = torch.randn(8, 4)
@@ -332,7 +323,7 @@ def forward(self):
 
 class NNLossFunctionModule(torch.nn.Module):
     def __init__(self):
-        super(NNLossFunctionModule, self).__init__()
+        super().__init__()
         self.x = torch.FloatTensor([[0.1, 0.2, 0.4, 0.8]])
         self.y = torch.LongTensor([[3, 0, -1, 1]])
 
@@ -371,7 +362,7 @@ def forward(self):
 
 class NNVisionModule(torch.nn.Module):
     def __init__(self):
-        super(NNVisionModule, self).__init__()
+        super().__init__()
         self.input = torch.randn(1, 4, 9, 9)
         self.vision_modules = nn.ModuleList(
             [
@@ -401,7 +392,7 @@ def forward(self):
 
 class NNShuffleModule(torch.nn.Module):
     def __init__(self):
-        super(NNShuffleModule, self).__init__()
+        super().__init__()
         self.shuffle = nn.ChannelShuffle(2)
 
     def forward(self):
@@ -410,7 +401,7 @@ def forward(self):
 
 class NNUtilsModule(torch.nn.Module):
     def __init__(self):
-        super(NNUtilsModule, self).__init__()
+        super().__init__()
         self.flatten = nn.Sequential(
             nn.Linear(50, 50),
             nn.Unflatten(1, (2, 5, 5))
diff --git a/test/mobile/model_test/quantization_ops.py b/test/mobile/model_test/quantization_ops.py
index d0fdb346545e..dd34137b51a0 100644
--- a/test/mobile/model_test/quantization_ops.py
+++ b/test/mobile/model_test/quantization_ops.py
@@ -4,15 +4,15 @@
 
 class GeneralQuantModule(torch.nn.Module):
     def __init__(self):
-        super(GeneralQuantModule, self).__init__()
-        self.embedding = torch.nn.quantized.Embedding(
+        super().__init__()
+        self.embedding = torch.ao.nn.quantized.Embedding(
             num_embeddings=10, embedding_dim=12
         )
         self.embedding_input = torch.tensor([9, 6, 5, 7, 8, 8, 9, 2, 8])
-        self.func = torch.nn.quantized.QFunctional()
-        self.conv1 = torch.nn.quantized.ConvTranspose1d(16, 33, 3, stride=2)
-        self.conv2 = torch.nn.quantized.ConvTranspose2d(16, 33, 3, stride=2)
-        self.conv3 = torch.nn.quantized.ConvTranspose3d(16, 33, 3, stride=2)
+        self.func = torch.ao.nn.quantized.QFunctional()
+        self.conv1 = torch.ao.nn.quantized.ConvTranspose1d(16, 33, 3, stride=2)
+        self.conv2 = torch.ao.nn.quantized.ConvTranspose2d(16, 33, 3, stride=2)
+        self.conv3 = torch.ao.nn.quantized.ConvTranspose3d(16, 33, 3, stride=2)
 
     def forward(self):
         a = torch.quantize_per_tensor(torch.tensor([3.0]), 1.0, 0, torch.qint32)
@@ -48,11 +48,11 @@ def forward(self):
 
 class DynamicQuantModule:
     def __init__(self):
-        super(DynamicQuantModule, self).__init__()
+        super().__init__()
         self.module = self.M()
 
     def getModule(self):
-        return torch.quantization.quantize_dynamic(self.module, dtype=torch.qint8)
+        return torch.ao.quantization.quantize_dynamic(self.module, dtype=torch.qint8)
 
     class M(torch.nn.Module):
         def __init__(self):
@@ -111,21 +111,18 @@ def forward(self):
 
 
 class StaticQuantModule:
-    def __init__(self):
-        super(StaticQuantModule, self).__init__()
-
     def getModule(self):
         model_fp32 = self.M()
         model_fp32.eval()
-        model_fp32.qconfig = torch.quantization.get_default_qconfig("qnnpack")
-        model_fp32_prepared = torch.quantization.prepare(model_fp32)
-        model_int8 = torch.quantization.convert(model_fp32_prepared)
+        model_fp32.qconfig = torch.ao.quantization.get_default_qconfig("qnnpack")
+        model_fp32_prepared = torch.ao.quantization.prepare(model_fp32)
+        model_int8 = torch.ao.quantization.convert(model_fp32_prepared)
         return model_int8
 
     class M(torch.nn.Module):
         def __init__(self):
             super(StaticQuantModule.M, self).__init__()
-            self.quant = torch.quantization.QuantStub()
+            self.quant = torch.ao.quantization.QuantStub()
             self.input1d = torch.randn(4, 2, 2)
             self.input2d = torch.randn((4, 2, 4, 4))
             self.input3d = torch.randn(4, 2, 2, 4, 4)
@@ -144,7 +141,7 @@ def __init__(self):
                 nn.Conv3d(2, 2, 1), nn.BatchNorm3d(2), nn.InstanceNorm3d(1), nn.ReLU()
             )
             self.layer4 = nn.Sequential(nn.Linear(4, 3))
-            self.dequant = torch.quantization.DeQuantStub()
+            self.dequant = torch.ao.quantization.DeQuantStub()
 
         def forward(self):
             x = self.quant(self.input1d)
@@ -165,14 +162,11 @@ def forward(self):
 
 
 class FusedQuantModule:
-    def __init__(self):
-        super(FusedQuantModule, self).__init__()
-
     def getModule(self):
         model_fp32 = self.M()
         model_fp32.eval()
-        model_fp32.qconfig = torch.quantization.get_default_qconfig("qnnpack")
-        model_fp32_fused = torch.quantization.fuse_modules(
+        model_fp32.qconfig = torch.ao.quantization.get_default_qconfig("qnnpack")
+        model_fp32_fused = torch.ao.quantization.fuse_modules(
             model_fp32,
             [
                 ["conv1d", "relu1"],
@@ -181,14 +175,14 @@ def getModule(self):
                 ["linear", "relu4"],
             ],
         )
-        model_fp32_prepared = torch.quantization.prepare(model_fp32_fused)
-        model_int8 = torch.quantization.convert(model_fp32_prepared)
+        model_fp32_prepared = torch.ao.quantization.prepare(model_fp32_fused)
+        model_int8 = torch.ao.quantization.convert(model_fp32_prepared)
         return model_int8
 
     class M(torch.nn.Module):
         def __init__(self):
             super(FusedQuantModule.M, self).__init__()
-            self.quant = torch.quantization.QuantStub()
+            self.quant = torch.ao.quantization.QuantStub()
             self.input1d = torch.randn(4, 2, 2)
             self.input2d = torch.randn((4, 2, 4, 4))
             self.input3d = torch.randn(4, 2, 2, 4, 4)
@@ -200,7 +194,7 @@ def __init__(self):
             self.relu2 = nn.ReLU()
             self.relu3 = nn.ReLU()
             self.relu4 = nn.ReLU()
-            self.dequant = torch.quantization.DeQuantStub()
+            self.dequant = torch.ao.quantization.DeQuantStub()
 
         def forward(self):
             x = self.input1d
diff --git a/test/mobile/model_test/sampling_ops.py b/test/mobile/model_test/sampling_ops.py
index a1ac71a3a319..50e6d9141ca2 100644
--- a/test/mobile/model_test/sampling_ops.py
+++ b/test/mobile/model_test/sampling_ops.py
@@ -4,9 +4,6 @@
 # https://pytorch.org/docs/stable/torch.html#random-sampling
 
 class SamplingOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(SamplingOpsModule, self).__init__()
-
     def forward(self):
         a = torch.empty(3, 3).uniform_(0.0, 1.0)
         size = (1, 4)
diff --git a/test/mobile/model_test/tensor_ops.py b/test/mobile/model_test/tensor_ops.py
index 9e04c6703d27..089cf10c0f54 100644
--- a/test/mobile/model_test/tensor_ops.py
+++ b/test/mobile/model_test/tensor_ops.py
@@ -2,9 +2,6 @@
 
 
 class TensorOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(TensorOpsModule, self).__init__()
-
     def forward(self):
         return self.tensor_general_ops()
 
@@ -102,9 +99,6 @@ def tensor_general_ops(self):
 
 
 class TensorCreationOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(TensorCreationOpsModule, self).__init__()
-
     def forward(self):
         return self.tensor_creation_ops()
 
@@ -161,9 +155,6 @@ def tensor_creation_ops(self):
 
 
 class TensorIndexingOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(TensorIndexingOpsModule, self).__init__()
-
     def forward(self):
         return self.tensor_indexing_ops()
 
@@ -227,9 +218,6 @@ def tensor_indexing_ops(self):
 
 
 class TensorTypingOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(TensorTypingOpsModule, self).__init__()
-
     def forward(self):
         return self.tensor_typing_ops()
 
@@ -255,9 +243,6 @@ def tensor_typing_ops(self):
 
 
 class TensorViewOpsModule(torch.nn.Module):
-    def __init__(self):
-        super(TensorViewOpsModule, self).__init__()
-
     def forward(self):
         return self.tensor_view_ops()
 
diff --git a/test/mobile/model_test/torchvision_models.py b/test/mobile/model_test/torchvision_models.py
index 232afbc54b1e..8684724d4771 100644
--- a/test/mobile/model_test/torchvision_models.py
+++ b/test/mobile/model_test/torchvision_models.py
@@ -5,9 +5,6 @@
 
 
 class MobileNetV2Module:
-    def __init__(self):
-        super(MobileNetV2Module, self).__init__()
-
     def getModule(self):
         model = torchvision.models.mobilenet_v2(pretrained=True)
         model.eval()
diff --git a/test/mobile/nnc/aot_test_model.py b/test/mobile/nnc/aot_test_model.py
index c5e123bf374c..834b731a306f 100644
--- a/test/mobile/nnc/aot_test_model.py
+++ b/test/mobile/nnc/aot_test_model.py
@@ -3,9 +3,6 @@
 
 
 class NeuralNetwork(nn.Module):
-    def __init__(self):
-        super(NeuralNetwork, self).__init__()
-
     def forward(self, x):
         return torch.add(x, 10)
 
diff --git a/test/mobile/test_bytecode.py b/test/mobile/test_bytecode.py
index 50a4c2f3f541..b5a493e1103e 100644
--- a/test/mobile/test_bytecode.py
+++ b/test/mobile/test_bytecode.py
@@ -311,9 +311,6 @@ def test_get_model_ops_and_info(self):
 
     def test_get_mobile_model_contained_types(self):
         class MyTestModule(torch.nn.Module):
-            def __init__(self):
-                super(MyTestModule, self).__init__()
-
             def forward(self, x):
                 return x + 10
 
diff --git a/test/mobile/test_lite_script_module.py b/test/mobile/test_lite_script_module.py
index 9089977b77f1..f75a02b28c2a 100644
--- a/test/mobile/test_lite_script_module.py
+++ b/test/mobile/test_lite_script_module.py
@@ -34,9 +34,6 @@ def getScriptExportImportCopy(self, m, save_mobile_debug_info=True, also_test_fi
 
     def test_load_mobile_module(self):
         class MyTestModule(torch.nn.Module):
-            def __init__(self):
-                super(MyTestModule, self).__init__()
-
             def forward(self, x):
                 return x + 10
 
@@ -60,15 +57,12 @@ def forward(self, x):
 
     def test_save_mobile_module_with_debug_info_with_trace(self):
         class A(torch.nn.Module):
-            def __init__(self):
-                super(A, self).__init__()
-
             def forward(self, x, y):
                 return x * y
 
         class B(torch.nn.Module):
             def __init__(self):
-                super(B, self).__init__()
+                super().__init__()
                 self.A0 = A()
                 self.A1 = A()
 
@@ -103,9 +97,6 @@ def forward(self, x, y, z):
 
     def test_load_mobile_module_with_debug_info(self):
         class MyTestModule(torch.nn.Module):
-            def __init__(self):
-                super(MyTestModule, self).__init__()
-
             def forward(self, x):
                 return x + 5
 
@@ -161,7 +152,7 @@ def forward(self, arg):
     def test_method_calls_with_optional_arg(self):
         class A(torch.nn.Module):
             def __init__(self):
-                super(A, self).__init__()
+                super().__init__()
 
             # opt arg in script-to-script invocation
             def forward(self, x, two: int = 2):
@@ -169,7 +160,7 @@ def forward(self, x, two: int = 2):
 
         class B(torch.nn.Module):
             def __init__(self):
-                super(B, self).__init__()
+                super().__init__()
                 self.A0 = A()
 
             # opt arg in Python-to-script invocation
@@ -227,12 +218,11 @@ def forward(self, arg):
 
     def test_unsupported_return_list_with_module_class(self):
         class Foo(torch.nn.Module):
-            def __init__(self):
-                super(Foo, self).__init__()
+            pass
 
         class MyTestModuleForListWithModuleClass(torch.nn.Module):
             def __init__(self):
-                super(MyTestModuleForListWithModuleClass, self).__init__()
+                super().__init__()
                 self.foo = Foo()
 
             def forward(self):
@@ -250,12 +240,11 @@ def forward(self):
 
     def test_unsupported_return_dict_with_module_class(self):
         class Foo(torch.nn.Module):
-            def __init__(self):
-                super(Foo, self).__init__()
+            pass
 
         class MyTestModuleForDictWithModuleClass(torch.nn.Module):
             def __init__(self):
-                super(MyTestModuleForDictWithModuleClass, self).__init__()
+                super().__init__()
                 self.foo = Foo()
 
             def forward(self):
@@ -274,7 +263,7 @@ def forward(self):
     def test_module_export_operator_list(self):
         class Foo(torch.nn.Module):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.weight = torch.ones((20, 1, 5, 5))
                 self.bias = torch.ones(20)
 
@@ -391,7 +380,7 @@ def forward(self, x, w):
     def test_source_range_raise_exc(self):
         class FooTest5(torch.jit.ScriptModule):
             def __init__(self, val: int):
-                super(FooTest5, self).__init__()
+                super().__init__()
                 self.val = val
 
             @torch.jit.script_method
@@ -434,9 +423,6 @@ def forwardError(self, x) -> torch.Tensor:
                 pass
 
         class B(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 return x
 
@@ -496,7 +482,7 @@ def test_quantization_example(self):
         # From the example in Static Quantization section of https://pytorch.org/docs/stable/quantization.html
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.quant = torch.ao.quantization.QuantStub()
                 self.conv = torch.nn.Conv2d(1, 1, 1)
                 self.relu = torch.nn.ReLU()
@@ -524,9 +510,6 @@ def forward(self, x):
 
     def test_bundled_input_with_dynamic_type(self):
         class Model(torch.nn.Module):
-            def __init__(self):
-                super(Model, self).__init__()
-
             def forward(
                 self,
                 x: Dict[int, torch.Tensor],
diff --git a/test/mobile/test_lite_script_type.py b/test/mobile/test_lite_script_type.py
index 44eb6d4778e8..913c5271737a 100644
--- a/test/mobile/test_lite_script_type.py
+++ b/test/mobile/test_lite_script_type.py
@@ -42,7 +42,7 @@ class Foo(NamedTuple):
 
         class Bar(torch.nn.Module):
             def __init__(self):
-                super(Bar, self).__init__()
+                super().__init__()
                 self.foo = Foo(torch.tensor(1))
 
             def forward(self, a: torch.Tensor):
@@ -104,7 +104,7 @@ class Foo(NamedTuple):
 
         class Bar(torch.nn.Module):
             def __init__(self):
-                super(Bar, self).__init__()
+                super().__init__()
                 self.foo = Foo(torch.tensor(1))
 
             def forward(self, a: torch.Tensor):
@@ -153,7 +153,7 @@ class Foo(NamedTuple):
 
         class Bar(torch.nn.Module):
             def __init__(self):
-                super(Bar, self).__init__()
+                super().__init__()
                 self.foo = Foo(torch.tensor(1), Baz(torch.tensor(1)))
 
             def forward(self, a: torch.Tensor):
diff --git a/test/mobile/test_quantize_fx_lite_script_module.py b/test/mobile/test_quantize_fx_lite_script_module.py
index ebc96d17697b..06562ec99a1e 100644
--- a/test/mobile/test_quantize_fx_lite_script_module.py
+++ b/test/mobile/test_quantize_fx_lite_script_module.py
@@ -58,7 +58,7 @@ def forward(self, indices):
     def test_conv2d(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(1, 1, 1)
                 self.conv2 = nn.Conv2d(1, 1, 1)
 
diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py
index 0f2bb0c44188..f35a7779d882 100644
--- a/test/nn/test_convolution.py
+++ b/test/nn/test_convolution.py
@@ -13,14 +13,14 @@
 import torch.nn.functional as F
 from torch.testing._internal.common_dtype import floating_types_and, floating_and_complex_types_and
 from torch.testing._internal.common_utils import run_tests, \
-    skipIfRocmVersionLessThan, skipIfNotMiopenSuggestNHWC, TEST_SCIPY, TEST_WITH_ROCM, \
+    skipIfRocmVersionLessThan, TEST_SCIPY, TEST_WITH_ROCM, \
     download_file, parametrize as parametrize_test, subtest, \
     instantiate_parametrized_tests, set_default_dtype
 from torch.testing._internal.common_cuda import TEST_CUDA, TEST_CUDNN
 from torch.testing._internal.common_nn import NNTestCase, _test_module_empty_input
 from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, \
     dtypesIfCUDA, precisionOverride, skipCUDAIfNoCudnn, skipCUDAIfCudnnVersionLessThan, onlyCUDA, onlyCPU, \
-    skipCUDAIfRocm, skipCUDAIfRocmVersionLessThan, skipCUDAIfNotMiopenSuggestNHWC, \
+    skipCUDAIfRocm, skipCUDAIfRocmVersionLessThan, \
     onlyNativeDeviceTypes, largeTensorTest, skipMeta, \
     disableMkldnn, skipCPUIfNoMkldnn, disablecuDNN, skipCUDAIfMiopen, skipCUDAIfNoMiopen
 
@@ -629,7 +629,6 @@ def test_conv_tbc(self):
     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     @unittest.skipIf(not TEST_CUDNN, "needs cudnn")
     @skipIfRocmVersionLessThan((4, 3))
-    @skipIfNotMiopenSuggestNHWC
     def test_grouped_conv_cudnn_nhwc_support(self):
         # in order to catch the hols in grouped convolution in nhwc support for earlier cudnn version
         input = torch.randn((16, 16, 8, 8), dtype=torch.float16, device="cuda").to(memory_format=torch.channels_last)
@@ -920,7 +919,7 @@ def test_Conv2d_large_workspace(self, device, dtype):
         ]
 
         def run_test(benchmark):
-            with torch.backends.cudnn.flags(benchmark=benchmark):
+            with torch.backends.cudnn.flags(enabled=True, benchmark=benchmark):
                 conv = torch.nn.Conv2d(256, 256, kernel_size=3, padding=1).to(device, dtype)
                 for size in sizes:
                     x = torch.randn(size, device=device, dtype=dtype)
@@ -1058,7 +1057,7 @@ def test_noncontig_conv_grad(self, device, dtype):
     @onlyCUDA
     @dtypes(torch.double)
     def test_conv_double_backward(self, device, dtype):
-        with torch.backends.cudnn.flags(deterministic=True):
+        with torch.backends.cudnn.flags(enabled=True, deterministic=True):
             # Double backward only runs with DoubleTensor due to precision reason
             batch_size = 1
             for kern, inp_size, dilations in [(3, 5, [1, 2]), (4, 9, [1])]:
@@ -1271,24 +1270,24 @@ def test_conv1d_same_padding_backward(self, device, dtype):
 
         # Symmetric padding
         z = F.conv1d(x, y, padding=3, dilation=2)
-        z.sum().backward()
+        z.sum().abs().backward()
         gx_expect, gy_expect = x.grad, y.grad
         x.grad, y.grad = None, None
 
         z = F.conv1d(x, y, padding='same', dilation=2)
-        z.sum().backward()
+        z.sum().abs().backward()
         self.assertEqual(gx_expect, x.grad)
         self.assertEqual(gy_expect, y.grad)
         x.grad, y.grad = None, None
 
         # Asymmetric padding
         z = F.conv1d(x, y, padding=2)[..., 1:]
-        z.sum().backward()
+        z.sum().abs().backward()
         gx_expect, gy_expect = x.grad, y.grad
         x.grad, y.grad = None, None
 
         z = F.conv1d(x, y, padding='same')
-        z.sum().backward()
+        z.sum().abs().backward()
         self.assertEqual(gx_expect, x.grad)
         self.assertEqual(gy_expect, y.grad)
 
@@ -1300,12 +1299,12 @@ def test_conv2d_same_padding_backward(self, device, dtype):
 
         # Symmetric padding
         z = F.conv2d(x, y, padding=(3, 4), dilation=2)
-        z.sum().backward()
+        z.sum().abs().backward()
         gx_expect, gy_expect = x.grad, y.grad
         x.grad, y.grad = None, None
 
         z = F.conv2d(x, y, padding='same', dilation=2)
-        z.sum().backward()
+        z.sum().abs().backward()
         self.assertEqual(gx_expect, x.grad)
         self.assertEqual(gy_expect, y.grad)
         x.grad, y.grad = None, None
@@ -1313,12 +1312,12 @@ def test_conv2d_same_padding_backward(self, device, dtype):
         # Asymmetric padding
         y = torch.rand(1, 1, 4, 4, device=device, dtype=dtype, requires_grad=True)
         z = F.conv2d(x, y, padding=2)[..., 1:, 1:]
-        z.sum().backward()
+        z.sum().abs().backward()
         gx_expect, gy_expect = x.grad, y.grad
         x.grad, y.grad = None, None
 
         z = F.conv2d(x, y, padding='same')
-        z.sum().backward()
+        z.sum().abs().backward()
         self.assertEqual(gx_expect, x.grad)
         self.assertEqual(gy_expect, y.grad)
 
@@ -1332,12 +1331,12 @@ def test_conv3d_same_padding_backward(self, device, dtype):
 
         # Symmetric padding
         z = F.conv3d(x, y, padding=(0, 1, 4), dilation=2)
-        z.sum().backward()
+        z.sum().abs().backward()
         gx_expect, gy_expect = x.grad, y.grad
         x.grad, y.grad = None, None
 
         z = F.conv3d(x, y, padding='same', dilation=2)
-        z.sum().backward()
+        z.sum().abs().backward()
         self.assertEqual(gx_expect, x.grad)
         self.assertEqual(gy_expect, y.grad)
         x.grad, y.grad = None, None
@@ -1352,12 +1351,12 @@ def test_conv3d_same_padding_backward(self, device, dtype):
         # Asymmetric padding
         y = torch.rand(1, 1, 1, 4, 4, dtype=dtype, device=device, requires_grad=True)
         z = F.conv3d(x, y, padding=2)[..., 1:, 1:]
-        z.sum().backward()
+        z.sum().abs().backward()
         gx_expect, gy_expect = x.grad, y.grad
         x.grad, y.grad = None, None
 
         z = F.conv3d(x, y, padding='same')
-        z.sum().backward()
+        z.sum().abs().backward()
         self.assertEqual(gx_expect, x.grad)
         self.assertEqual(gy_expect, y.grad)
 
@@ -1373,11 +1372,11 @@ def test_conv1d_valid_padding_backward(self, device, dtype):
         # Test F.conv1d gradients work with padding='valid'
         x = torch.rand(1, 1, 10, dtype=dtype, device=device, requires_grad=True)
         y = torch.rand(1, 1, 4, dtype=dtype, device=device, requires_grad=True)
-        F.conv1d(x, y, padding=0).sum().backward()
+        F.conv1d(x, y, padding=0).sum().abs().backward()
         gx_expect, gy_expect = x.grad, y.grad
         x.grad, y.grad = None, None
 
-        F.conv1d(x, y, padding='valid').sum().backward()
+        F.conv1d(x, y, padding='valid').sum().abs().backward()
         gx_actual, gy_actual = x.grad, y.grad
         self.assertEqual(gx_expect, gx_actual)
         self.assertEqual(gy_expect, gy_actual)
@@ -1511,11 +1510,11 @@ def test_conv2d_valid_padding_backward(self, device, dtype):
         # Test F.conv2d gradients work with padding='valid'
         x = torch.rand(1, 1, 1, 10, device=device, dtype=dtype, requires_grad=True)
         y = torch.rand(1, 1, 1, 4, device=device, dtype=dtype, requires_grad=True)
-        F.conv2d(x, y, padding=0).sum().backward()
+        F.conv2d(x, y, padding=0).sum().abs().backward()
         gx_expect, gy_expect = x.grad, y.grad
         x.grad, y.grad = None, None
 
-        F.conv2d(x, y, padding='valid').sum().backward()
+        F.conv2d(x, y, padding='valid').sum().abs().backward()
         gx_actual, gy_actual = x.grad, y.grad
         self.assertEqual(gx_expect, gx_actual)
         self.assertEqual(gy_expect, gy_actual)
@@ -1527,11 +1526,11 @@ def test_conv3d_valid_padding_backward(self, device, dtype):
         # Test F.conv3d gradients work with padding='valid'
         x = torch.rand(1, 1, 1, 1, 10, dtype=dtype, device=device, requires_grad=True)
         y = torch.rand(1, 1, 1, 1, 4, dtype=dtype, device=device, requires_grad=True)
-        F.conv3d(x, y, padding=0).sum().backward()
+        F.conv3d(x, y, padding=0).sum().abs().backward()
         gx_expect, gy_expect = x.grad, y.grad
         x.grad, y.grad = None, None
 
-        F.conv3d(x, y, padding='valid').sum().backward()
+        F.conv3d(x, y, padding='valid').sum().abs().backward()
         gx_actual, gy_actual = x.grad, y.grad
         self.assertEqual(gx_expect, gx_actual)
         self.assertEqual(gy_expect, gy_actual)
@@ -2102,17 +2101,17 @@ def conv2d_depthwise(x, weight):
     @onlyCPU
     @dtypes(torch.float, torch.double)
     def test_conv_thnn_nhwc(self, device, dtype):
-        def helper(n, c, h, w, out_channels, kernel_size, dilation, groups, input_format, weight_format):
+        def helper(mod, n, c, h, w, out_channels, kernel_size, dilation, groups, input_format, weight_format):
             input = torch.randint(-3, 3, (n, c, h, w), dtype=dtype, device=device)\
                 .to(memory_format=input_format)
             input.requires_grad_()
-            conv = nn.Conv2d(c, out_channels, kernel_size, dilation=dilation, groups=groups)\
+            conv = mod(c, out_channels, kernel_size, dilation=dilation, groups=groups)\
                 .to(device='cpu', dtype=dtype, memory_format=weight_format)
             for p in conv.parameters():
                 p.data = torch.randint_like(p, -3, 3)
 
             ref_input = input.detach().clone().contiguous().requires_grad_()
-            ref_conv = nn.Conv2d(c, out_channels, kernel_size, dilation=dilation, groups=groups)
+            ref_conv = mod(c, out_channels, kernel_size, dilation=dilation, groups=groups)
             # load_state_dict will restore the stride & memory_layout on ref_conv.weight.
             ref_conv.load_state_dict(conv.state_dict())
             ref_conv = ref_conv.to(device='cpu', dtype=dtype, memory_format=torch.contiguous_format)
@@ -2139,39 +2138,49 @@ def helper(n, c, h, w, out_channels, kernel_size, dilation, groups, input_format
                        [torch.contiguous_format, torch.channels_last]]
             for input_format, weight_format in formats:
                 # non-dilated conv: thnn_conv2d normal path (with im2col)
-                helper(2, 8, 4, 4, out_channels=4, kernel_size=3, dilation=1, groups=1,
+                helper(nn.Conv2d, 2, 8, 4, 4, out_channels=4, kernel_size=3, dilation=1, groups=1,
                        input_format=input_format, weight_format=weight_format)
-                helper(2, 8, 4, 4, out_channels=8, kernel_size=3, dilation=1, groups=8,
+                helper(nn.Conv2d, 2, 8, 4, 4, out_channels=8, kernel_size=3, dilation=1, groups=8,
                        input_format=input_format, weight_format=weight_format)
                 # test when input chanels is 1 and not converted to channels last
-                helper(2, 1, 10, 10, out_channels=8, kernel_size=3, dilation=1, groups=1,
+                helper(nn.Conv2d, 2, 1, 10, 10, out_channels=8, kernel_size=3, dilation=1, groups=1,
                        input_format=torch.contiguous_format, weight_format=torch.channels_last)
                 # non-dilated conv: thnn_conv2d fast path (skip im2col)
-                helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=1,
+                helper(nn.Conv2d, 1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=1,
                        input_format=input_format, weight_format=weight_format)
                 # ic == oc == 1 here, so need to stick input to CL to activate channels last
-                helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=16,
+                helper(nn.Conv2d, 1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=16,
                        input_format=torch.channels_last, weight_format=weight_format)
                 # dilated conv: slow_conv_dilated2d
-                helper(2, 8, 11, 13, out_channels=16, kernel_size=3, dilation=2, groups=1,
+                helper(nn.Conv2d, 2, 8, 11, 13, out_channels=16, kernel_size=3, dilation=2, groups=1,
                        input_format=input_format, weight_format=weight_format)
-                helper(2, 16, 11, 13, out_channels=32, kernel_size=3, dilation=2, groups=16,
+                helper(nn.Conv2d, 2, 16, 11, 13, out_channels=32, kernel_size=3, dilation=2, groups=16,
+                       input_format=input_format, weight_format=weight_format)
+                # transposed-conv: slow_conv_transpose2d
+                helper(nn.ConvTranspose2d, 2, 8, 4, 4, out_channels=4, kernel_size=3, dilation=1, groups=1,
+                       input_format=input_format, weight_format=weight_format)
+                helper(nn.ConvTranspose2d, 2, 8, 4, 4, out_channels=8, kernel_size=3, dilation=1, groups=8,
+                       input_format=input_format, weight_format=weight_format)
+                helper(nn.ConvTranspose2d, 1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=1,
+                       input_format=input_format, weight_format=weight_format)
+                helper(nn.ConvTranspose2d, 1, 16, 56, 56, out_channels=32, kernel_size=1, dilation=1, groups=16,
                        input_format=input_format, weight_format=weight_format)
 
     @onlyCUDA
     @skipCUDAIfRocmVersionLessThan((4, 3))
-    @skipCUDAIfNotMiopenSuggestNHWC
     @skipCUDAIfCudnnVersionLessThan(7603)
+    # randint and randint_like with dtype=torch.cfloat raises
+    # RuntimeError: check_random_bounds handles only integral, floating-point and boolean types
     @dtypes(torch.half, torch.float, torch.cfloat)
     def test_conv_cudnn_nhwc(self, device, dtype):
         def helper(n, c, h, w, out_channels, kernel_size, groups):
-            input = torch.randint(-3, 3, (n, c, h, w), dtype=dtype, device=device)\
-                .to(memory_format=torch.channels_last)
+            input = torch.randint(-3, 3, (n, c, h, w), device=device)\
+                .to(memory_format=torch.channels_last, dtype=dtype)
             input.requires_grad_()
             conv = nn.Conv2d(c, out_channels, kernel_size, groups=groups)\
                 .to(device='cuda', dtype=dtype, memory_format=torch.channels_last)
             for p in conv.parameters():
-                p.data = torch.randint_like(p, -3, 3)
+                p.data = torch.randint_like(p, -3, 3, dtype=torch.int64).to(dtype=dtype)
 
             # use FP64 channels-first conv as reference
             ref_input = input.detach().clone().contiguous().double().requires_grad_()
@@ -2183,7 +2192,7 @@ def helper(n, c, h, w, out_channels, kernel_size, groups):
             out = conv(input)
             ref_out = ref_conv(ref_input)
 
-            grad = torch.randint_like(out, -3, 3)
+            grad = torch.randint_like(out, -3, 3, dtype=torch.int64).to(dtype=dtype)
             ref_grad = grad.detach().clone().double().contiguous()
 
             out.backward(grad)
@@ -2304,7 +2313,6 @@ def _test_conv_cudnn_nhwc_nchw(self, layer, n, c, h, w, k, filter_size, device):
 
     @onlyCUDA
     @skipCUDAIfRocmVersionLessThan((4, 3))
-    @skipCUDAIfNotMiopenSuggestNHWC
     @skipCUDAIfCudnnVersionLessThan(7603)
     @tf32_on_and_off(0.05)
     def test_conv_cudnn_mismatch_memory_format(self, device):
@@ -2349,6 +2357,7 @@ def test_conv2d_no_grad(self, device, dtype):
                 output = m(input)
                 self.assertEqual(output, output_ng, rtol=1e-2, atol=1e-5)
 
+    @skipCUDAIfRocm  # started failing fp16 after enabling channels last
     @onlyCUDA
     @skipCUDAIfNoCudnn
     @dtypes(torch.float, torch.float16)
@@ -2377,6 +2386,7 @@ def test_cudnn_convolution_relu(self, device, dtype):
             else:
                 self.assertEqual(conv2d_out.relu(), cudnn_out)
 
+    @skipCUDAIfRocm  # started failing fp16 after enabling channels last
     @onlyCUDA
     @skipCUDAIfNoCudnn
     @dtypes(torch.float, torch.float16)
diff --git a/test/nn/test_embedding.py b/test/nn/test_embedding.py
index f4e42aa4cfd2..b2b5323f2a4e 100644
--- a/test/nn/test_embedding.py
+++ b/test/nn/test_embedding.py
@@ -6,7 +6,7 @@
 
 import torch
 from torch.testing._internal.common_utils import run_tests, set_default_dtype, \
-    instantiate_parametrized_tests, parametrize as parametrize_test, _assertGradAndGradgradChecks
+    instantiate_parametrized_tests, parametrize as parametrize_test, _assertGradAndGradgradChecks, IS_JETSON
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_nn import NNTestCase
 from torch.testing._internal.common_device_type import onlyNativeDeviceTypes, dtypes, \
@@ -692,36 +692,38 @@ def test_embedding_bag_out_of_bounds_idx(self, device, dtypes, padding_idx, mode
                                                   mode=mode)
 
     def test_embedding_bag_dimension_errors(self, device):
-        weight = torch.full((2, 0, 0, 6, 6,), 0, dtype=torch.float64, device=device)
-        indices = torch.full((2, 0, 0, 6, 6,), 2, dtype=torch.int64, device=device)
-        offsets = torch.full((2, 0, 0, 6, 6), 0, dtype=torch.int64, device=device)
-
-        with self.assertRaisesRegex(ValueError, r'input has to be 1D or 2D Tensor'):
-            torch.nn.functional.embedding_bag(indices, weight, offsets)
-
-        with self.assertRaisesRegex(RuntimeError, r'input has to be a 1D or 2D Tensor'):
-            torch.embedding_bag(weight, indices, offsets)
-
-        with self.assertRaisesRegex(RuntimeError, r'input has to be a 1D or 2D Tensor'):
-            torch._embedding_bag(weight, indices, offsets)
-
-        with self.assertRaisesRegex(RuntimeError, r'input has to be a 1D or 2D Tensor'):
-            torch._embedding_bag_forward_only(weight, indices, offsets)
-
-        weight = torch.full((2,), 0, dtype=torch.float64, device=device)
-        indices = torch.full((2,), 2, dtype=torch.int64, device=device)
+        funcs = (
+            lambda x, y, z: torch.nn.functional.embedding_bag(y, x, z),
+            torch.embedding_bag,
+            torch._embedding_bag,
+            torch._embedding_bag_forward_only
+        )
+        for i, f in enumerate(funcs):
+            err_type = ValueError if i == 0 else RuntimeError
+
+            weight = torch.full((2, 6,), 0, dtype=torch.float64, device=device)
+            indices = torch.full((2, 0, 0, 6, 6,), 2, dtype=torch.int64, device=device)
+            offsets = torch.full((2, 0, 0, 6, 6), 0, dtype=torch.int64, device=device)
+
+            if i == 0:
+                error_msg = 'input has to be 1D or 2D Tensor'
+            else:
+                error_msg = 'input has to be a 1D or 2D Tensor'
+            with self.assertRaisesRegex(err_type, error_msg):
+                f(weight, indices, offsets)
 
-        with self.assertRaisesRegex(ValueError, r'offsets has to be a 1D Tensor'):
-            torch.nn.functional.embedding_bag(indices, weight, offsets)
+            weight = torch.full((2, 2), 0, dtype=torch.float64, device=device)
+            indices = torch.full((2,), 1, dtype=torch.int64, device=device)
 
-        with self.assertRaisesRegex(RuntimeError, r'offsets has to be a 1D Tensor'):
-            torch.embedding_bag(weight, indices, offsets)
+            with self.assertRaisesRegex(err_type, 'offsets has to be a 1D Tensor'):
+                f(weight, indices, offsets)
 
-        with self.assertRaisesRegex(RuntimeError, r'offsets has to be a 1D Tensor'):
-            torch._embedding_bag(weight, indices, offsets)
+            weight = torch.full((2, 2, 2), 0, dtype=torch.float64, device=device)
+            indices = torch.full((2,), 2, dtype=torch.int64, device=device)
+            offsets = torch.full((2,), 0, dtype=torch.int64, device=device)
 
-        with self.assertRaisesRegex(RuntimeError, r'offsets has to be a 1D Tensor'):
-            torch._embedding_bag_forward_only(weight, indices, offsets)
+            with self.assertRaisesRegex(err_type, 'weight has to be a 2D Tensor'):
+                f(weight, indices, offsets)
 
     @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long)))
     def test_EmbeddingBag_per_sample_weights_failures(self, device, dtypes):
@@ -818,7 +820,10 @@ def _embedding_bag_reference_impl(self, input, weight, offsets=None, mode='sum',
         return torch.stack(bags)
 
     @skipMeta
-    @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.half, torch.float, torch.double)))
+    @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long),
+                               (torch.half, torch.bfloat16, torch.float, torch.double)))
+    @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long),
+                                     (torch.float, torch.double, torch.half)))
     def test_EmbeddingBag_empty_per_sample_weights_and_offsets(self, device, dtypes):
         # Test empty input and per sample weight, and backward pass. There was a CUDA
         # invalid configuration bug (more context in #46572)
@@ -857,7 +862,10 @@ def test_per_sample_weights(mode, trainable_scale):
             test_per_sample_weights(mode, trainable)
 
     @skipMeta
-    @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half)))
+    @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long),
+                               (torch.float, torch.double, torch.half, torch.bfloat16)))
+    @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long),
+                                     (torch.float, torch.double, torch.half)))
     def test_EmbeddingBag_per_sample_weights_and_offsets(self, device, dtypes):
         def test_per_sample_weights(mode, trainable_scale):
             es = nn.EmbeddingBag(5, 2, mode=mode).to(dtype=dtypes[2], device=device)
@@ -891,7 +899,10 @@ def test_per_sample_weights(mode, trainable_scale):
             test_per_sample_weights(mode, trainable)
 
     @skipMeta
-    @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half)))
+    @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long),
+                               (torch.float, torch.double, torch.half, torch.bfloat16)))
+    @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long),
+                                     (torch.float, torch.double, torch.half)))
     def test_EmbeddingBag_per_sample_weights_and_new_offsets(self, device, dtypes):
         def test_per_sample_weights_new_offsets(mode, trainable_scale, include_last_offset, has_weight=True):
             es = nn.EmbeddingBag(5, 2, mode=mode, include_last_offset=include_last_offset).to(dtype=dtypes[2], device=device)
@@ -1156,8 +1167,13 @@ def _test_EmbeddingBag(
             self.assertRaises(RuntimeError, lambda: es(input.view(-1), offset))
 
     @skipMeta
-    @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half)))
+    @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long),
+                               (torch.float, torch.double, torch.half, torch.bfloat16)))
+    @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long),
+                                     (torch.float, torch.double, torch.half)))
     def test_embedding_bag_device(self, device, dtypes):
+        if IS_JETSON and torch.bfloat16 in dtypes and device == "cpu":
+            self.skipTest("bfloat16 not supported with Jetson cpu")
         with set_default_dtype(torch.double):
             self._test_EmbeddingBag(device, 'sum', False, wdtype=dtypes[2], dtype=dtypes[0], odtype=dtypes[1])
             self._test_EmbeddingBag(device, 'mean', False, wdtype=dtypes[2], dtype=dtypes[0], odtype=dtypes[1])
@@ -1192,7 +1208,10 @@ def test_embedding_bag_device(self, device, dtypes):
             )
 
     @skipMeta
-    @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half)))
+    @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long),
+                               (torch.float, torch.double, torch.half, torch.bfloat16)))
+    @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long),
+                                     (torch.float, torch.double, torch.half)))
     def test_embedding_bag_non_contiguous_weight(self, device, dtypes):
         weight_tensor = torch.randn(3, 4, dtype=dtypes[2], device=device)
 
@@ -1216,7 +1235,7 @@ def test_embedding_bag_non_contiguous_weight(self, device, dtypes):
             )
         self.assertEqual(output_non_contig, output_contig)
 
-    @onlyCUDA
+    @onlyNativeDeviceTypes  # currently fails on XLA
     @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long)))
     def test_embedding_bag_bfloat16(self, device, dtypes):
         with set_default_dtype(torch.double):
diff --git a/test/nn/test_init.py b/test/nn/test_init.py
index 9e72c1040a55..b4d0c8d998d9 100644
--- a/test/nn/test_init.py
+++ b/test/nn/test_init.py
@@ -16,7 +16,7 @@
 
 class TestNNInit(TestCase):
     def setUp(self):
-        super(TestNNInit, self).setUp()
+        super().setUp()
         random.seed(123)
 
     def _is_normal(self, tensor, mean, std):
diff --git a/test/nn/test_lazy_modules.py b/test/nn/test_lazy_modules.py
index c3a9dff20022..d3b0d58c0130 100644
--- a/test/nn/test_lazy_modules.py
+++ b/test/nn/test_lazy_modules.py
@@ -219,9 +219,6 @@ def test_lazy_pre_forward_hook(self):
         functions successfully.
         """
         class TestModule(torch.nn.modules.lazy.LazyModuleMixin, torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def initialize_parameters(self, input):
                 return None
 
@@ -242,9 +239,6 @@ def test_lazy_forward_hook(self):
         functions successfully.
         """
         class TestModule(torch.nn.modules.lazy.LazyModuleMixin, torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def initialize_parameters(self, input):
                 return None
 
@@ -551,7 +545,7 @@ def test_materialize_device(self):
     def test_chained_initialization(self):
         class MyNetwork(torch.nn.Module):
             def __init__(self):
-                super(MyNetwork, self).__init__()
+                super().__init__()
                 self.linear_1 = torch.nn.LazyLinear(15)
                 self.linear_2 = torch.nn.LazyLinear(10)
 
diff --git a/test/nn/test_module_hooks.py b/test/nn/test_module_hooks.py
index 2aa64814857e..9edabd1f1294 100644
--- a/test/nn/test_module_hooks.py
+++ b/test/nn/test_module_hooks.py
@@ -393,9 +393,6 @@ def bw_hook(module: nn.Module, _inputs, _outputs):
             counter['backward'] += 1
 
         class TestModule(nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, dict):
                 inp = dict['x']
                 x = torch.nn.functional.softmax(inp, dim=0)
@@ -478,7 +475,7 @@ def test_load_state_dict_module_pre_hook(self):
         # Test with module instance method as hook
         class MyModule(nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.foo = torch.nn.Parameter(torch.rand(10))
 
             def my_pre_load_hook(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
@@ -543,7 +540,7 @@ def test_load_state_dict_post_hook(self):
 
         class MyModule(nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.foo = torch.nn.Parameter(torch.rand(10))
 
             def my_post_load_hook(self, module, incompatible_keys):
diff --git a/test/nn/test_packed_sequence.py b/test/nn/test_packed_sequence.py
index 04856dc7096e..34362129bd76 100644
--- a/test/nn/test_packed_sequence.py
+++ b/test/nn/test_packed_sequence.py
@@ -24,7 +24,7 @@ class PackedSequenceTest(TestCase):
     }
 
     def __init__(self, *args, **kwargs):
-        super(PackedSequenceTest, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
         self.batch_size = 5
         self.max_length = 6
 
diff --git a/test/nn/test_pooling.py b/test/nn/test_pooling.py
index df8eb592dcec..455382fc129a 100644
--- a/test/nn/test_pooling.py
+++ b/test/nn/test_pooling.py
@@ -10,11 +10,11 @@
 import itertools
 import math
 
-from torch._six import inf, nan
+from torch import inf, nan
 import torch
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_UBSAN, set_default_dtype, \
-    instantiate_parametrized_tests, slowTest, parametrize as parametrize_test, subtest, skipIfMps
+    instantiate_parametrized_tests, slowTest, parametrize as parametrize_test, subtest, skipIfMps, gcIfJetson
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_nn import NNTestCase, _test_bfloat16_ops, _test_module_empty_input
 from torch.testing._internal.common_device_type import largeTensorTest, onlyNativeDeviceTypes, dtypes, \
@@ -353,6 +353,10 @@ def test_max_unpool(self):
             self.assertEqual(F.max_unpool3d(output, indices, 2), F.max_unpool3d(output, indices, 2, stride=2))
             gradcheck(F.max_unpool3d, (output, indices, 2), check_forward_ad=True)
 
+    def test_max_unpool3d_input_check(self):
+        x = torch.ones(1, 3, 1, 1, 1)
+        with self.assertRaises(RuntimeError):
+            F.max_unpool3d(x, torch.zeros(x.shape, dtype=int), [1, 1])
 
 class TestPoolingNNDeviceType(NNTestCase):
     @onlyNativeDeviceTypes
@@ -707,6 +711,7 @@ def test_adaptive_pooling_no_suppot_input(self, device, dtype):
                     output = module(input)
 
     @onlyNativeDeviceTypes
+    @gcIfJetson
     @dtypes(torch.float, torch.double)
     @dtypesIfCUDA(torch.half, torch.float, torch.double)
     def test_avg_pool2d_nhwc(self, device, dtype):
@@ -794,6 +799,7 @@ def check(x, *args, **kwargs):
         check(tensor.transpose(1, 2), 3, 2, 1, 2, ceil_mode=True)
 
     @onlyCUDA
+    @gcIfJetson
     def test_max_pool2d(self, device):
         def helper(n, c, h, w, ks):
             x = torch.randn(n, c, h, w, device='cuda', dtype=torch.float, requires_grad=True)
@@ -817,6 +823,7 @@ def helper(n, c, h, w, ks):
     @onlyNativeDeviceTypes
     @dtypes(torch.float, torch.double)
     @dtypesIfCUDA(torch.half, torch.float, torch.double)
+    @gcIfJetson
     def test_max_pool2d_nhwc(self, device, dtype):
         def helper(n, c, h, w, kernel_size, stride=None):
             if stride is None:
@@ -853,6 +860,7 @@ def helper(n, c, h, w, kernel_size, stride=None):
     @onlyNativeDeviceTypes
     @dtypes(torch.half, torch.float, torch.double)
     @onlyCUDA
+    @gcIfJetson
     def test_max_pool3d_ndhwc(self, device, dtype):
         def helper(n, c, h, w, d, kernel_size, stride=None):
             batch = n
@@ -942,6 +950,7 @@ def helper(n, c, h, w, kernel_size, stride, memory_format):
         helper(1, 19, 20, 10, 8, 2, torch.channels_last)
 
     @onlyCUDA
+    @gcIfJetson
     def test_max_pool2d_indices(self, device):
         def helper(n, c, h, w, ks):
             if n is None:
@@ -1255,6 +1264,7 @@ def test_maxpool_indices_no_batch_dim(self, device, dtype):
     @dtypesIfCUDA(torch.half, torch.float, torch.double)
     @dtypes(torch.float)
     @onlyNativeDeviceTypes  # TODO: Fails on XLA
+    @gcIfJetson
     def test_max_pool_nan_inf(self, device, dtype):
         for adaptive in ['', 'adaptive_']:
             for num_dim in [1, 2, 3]:
diff --git a/test/onnx/internal/test_diagnostics.py b/test/onnx/internal/test_diagnostics.py
index 49402204e9d2..0269d76a2681 100644
--- a/test/onnx/internal/test_diagnostics.py
+++ b/test/onnx/internal/test_diagnostics.py
@@ -192,12 +192,11 @@ def test_diagnostics_engine_records_diagnosis_reported_outside_of_export(
             self._sample_rule,
             sample_level,
         ):
-            diagnostics.context.diagnose(self._sample_rule, sample_level)
+            diagnostics.export_context().diagnose(self._sample_rule, sample_level)
 
     def test_diagnostics_records_python_call_stack(self):
-        diagnostic = diagnostics.ExportDiagnostic(
-            self._sample_rule, diagnostics.levels.NOTE
-        )
+        diagnostic = diagnostics.ExportDiagnostic(self._sample_rule, diagnostics.levels.NOTE)  # fmt: skip
+        # Do not break the above line, otherwise it will not work with Python-3.8+
         stack = diagnostic.python_call_stack
         assert stack is not None  # for mypy
         self.assertGreater(len(stack.frames), 0)
diff --git a/test/onnx/model_defs/op_test.py b/test/onnx/model_defs/op_test.py
index 56a66870c700..195e3c8dc849 100644
--- a/test/onnx/model_defs/op_test.py
+++ b/test/onnx/model_defs/op_test.py
@@ -19,17 +19,11 @@ def forward(self, x):
 
 
 class ConcatNet(nn.Module):
-    def __init__(self):
-        super().__init__()
-
     def forward(self, inputs):
         return torch.cat(inputs, 1)
 
 
 class PermuteNet(nn.Module):
-    def __init__(self):
-        super().__init__()
-
     def forward(self, input):
         return input.permute(2, 3, 0, 1)
 
diff --git a/test/onnx/onnx_test_common.py b/test/onnx/onnx_test_common.py
index fe5e2411aa38..50013fbc7dde 100644
--- a/test/onnx/onnx_test_common.py
+++ b/test/onnx/onnx_test_common.py
@@ -40,7 +40,7 @@ def run_model_test(test_suite: _TestONNXRuntime, *args, **kwargs):
     if hasattr(test_suite, "check_dtype"):
         options.check_dtype = test_suite.check_dtype
 
-    names = set([f.name for f in dataclasses.fields(options)])
+    names = {f.name for f in dataclasses.fields(options)}
     keywords_to_pop = []
     for k, v in kwargs.items():
         if k in names:
diff --git a/test/onnx/test_fx_to_onnx.py b/test/onnx/test_fx_to_onnx.py
new file mode 100644
index 000000000000..78cd7b2bd8dd
--- /dev/null
+++ b/test/onnx/test_fx_to_onnx.py
@@ -0,0 +1,81 @@
+# Owner(s): ["module: onnx"]
+import unittest
+
+import pytorch_test_common
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.onnx._internal import fx as fx_onnx
+from torch.testing._internal import common_utils
+
+
+class TestFxToOnnx(pytorch_test_common.ExportTestCase):
+    def setUp(self):
+        super().setUp()
+        self.opset_version = torch.onnx._constants.ONNX_DEFAULT_OPSET
+
+    def test_simple_function(self):
+        def func(x):
+            y = x + 1
+            z = y.relu()
+            return (y, z)
+
+        _ = fx_onnx.export(func, torch.randn(1, 1, 2), opset_version=self.opset_version)
+
+    @unittest.skip(
+        "Conv Op is not supported at the time. https://github.com/microsoft/onnx-script/issues/397"
+    )
+    def test_mnist(self):
+        class MNISTModel(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv1 = nn.Conv2d(1, 32, 3, 1, bias=False)
+                self.conv2 = nn.Conv2d(32, 64, 3, 1, bias=False)
+                self.fc1 = nn.Linear(9216, 128, bias=False)
+                self.fc2 = nn.Linear(128, 10, bias=False)
+
+            def forward(self, tensor_x: torch.Tensor):
+                tensor_x = self.conv1(tensor_x)
+                tensor_x = F.sigmoid(tensor_x)
+                tensor_x = self.conv2(tensor_x)
+                tensor_x = F.sigmoid(tensor_x)
+                tensor_x = F.max_pool2d(tensor_x, 2)
+                tensor_x = torch.flatten(tensor_x, 1)
+                tensor_x = self.fc1(tensor_x)
+                tensor_x = F.sigmoid(tensor_x)
+                tensor_x = self.fc2(tensor_x)
+                output = F.log_softmax(tensor_x, dim=1)
+                return output
+
+        tensor_x = torch.rand((64, 1, 28, 28), dtype=torch.float32)
+        _ = fx_onnx.export(MNISTModel(), tensor_x, opset_version=self.opset_version)
+
+    def test_trace_only_op_with_evaluator(self):
+        model_input = torch.tensor([[1.0, 2.0, 3.0], [1.0, 1.0, 2.0]])
+
+        class ArgminArgmaxModel(torch.nn.Module):
+            def forward(self, input):
+                return (
+                    torch.argmin(input),
+                    torch.argmax(input),
+                    torch.argmin(input, keepdim=True),
+                    torch.argmax(input, keepdim=True),
+                    torch.argmin(input, dim=0, keepdim=True),
+                    torch.argmax(input, dim=1, keepdim=True),
+                )
+
+        _ = fx_onnx.export(
+            ArgminArgmaxModel(), model_input, opset_version=self.opset_version
+        )
+
+    def test_multiple_outputs_op_with_evaluator(self):
+        class TopKModel(torch.nn.Module):
+            def forward(self, x):
+                return torch.topk(x, 3)
+
+        x = torch.arange(1.0, 6.0, requires_grad=True)
+        _ = fx_onnx.export(TopKModel(), x, opset_version=self.opset_version)
+
+
+if __name__ == "__main__":
+    common_utils.run_tests()
diff --git a/test/onnx/test_fx_to_onnx_with_onnxruntime.py b/test/onnx/test_fx_to_onnx_with_onnxruntime.py
new file mode 100644
index 000000000000..0cb3fa2ae52b
--- /dev/null
+++ b/test/onnx/test_fx_to_onnx_with_onnxruntime.py
@@ -0,0 +1,378 @@
+# Owner(s): ["module: onnx"]
+from __future__ import annotations
+
+import inspect
+
+import io
+import os
+import tempfile
+
+from typing import Any, Callable, Sequence, Tuple, Union
+
+import onnx.reference
+import onnx_test_common
+
+import onnxruntime  # type: ignore[import]
+
+import torch
+import transformers  # type: ignore[import]
+from torch import nn
+from torch._subclasses.fake_tensor import FakeTensorMode
+from torch.onnx._internal import diagnostics, fx as fx_onnx
+from torch.testing._internal import common_utils
+from torch.utils import _pytree as pytree
+
+
+def _run_onnx_reference_runtime(
+    onnx_model: Union[str, io.BytesIO],
+    pytorch_inputs: Tuple[Any, ...],
+    verbose: int = 10,
+) -> Sequence[Any]:
+    session = onnx.reference.ReferenceEvaluator(onnx_model, verbose=verbose)
+    return session.run(
+        None, {k: v.cpu().numpy() for k, v in zip(session.input_names, pytorch_inputs)}
+    )
+
+
+def _run_ort(
+    onnx_model: Union[str, io.BytesIO], pytorch_inputs: Tuple[Any, ...]
+) -> Sequence[Any]:
+    session = onnxruntime.InferenceSession(
+        onnx_model, providers=["CPUExecutionProvider"]
+    )
+    input_names = [ort_input.name for ort_input in session.get_inputs()]
+    return session.run(
+        None, {k: v.cpu().numpy() for k, v in zip(input_names, pytorch_inputs)}
+    )
+
+
+def _run_test_with_fx_to_onnx_exporter_and_onnx_runtime(
+    model: Union[torch.nn.Module, Callable],
+    input_args,
+    rtol: float = 1e-3,
+    atol: float = 1e-7,
+    opset_version: int = 17,
+    **input_kwargs,
+):
+    # Feed args and kwargs into exporter.
+    # Note that exporter should flatten kwargs into positional args the exported model;
+    # since ONNX doesn't represent kwargs.
+    onnx_model = fx_onnx.export_after_normalizing_args_and_kwargs(
+        model,
+        *input_args,
+        opset_version=opset_version,
+        use_binary_format=True,
+        **input_kwargs,
+    )
+
+    # Inspect the model's signature. It will be used
+    # to flatten kwargs.
+    if isinstance(model, torch.nn.Module):
+        signature = inspect.signature(model.forward)
+    else:
+        signature = inspect.signature(model)
+
+    # Bind args and kwargs to the model's signature to
+    # flatten kwargs into positional args since ONNX
+    # model cannot be called with kwargs.
+    bound = signature.bind(*input_args, **input_kwargs)
+    # Fill optional inputs.
+    bound.apply_defaults()
+    assert not bound.kwargs
+
+    ref_outputs, _ = pytree.tree_flatten(model(*input_args, **input_kwargs))
+    ort_outputs = _run_ort(onnx_model, bound.args)
+    for ref_output, ort_output in zip(ref_outputs, ort_outputs):
+        torch.testing.assert_close(
+            ref_output, torch.tensor(ort_output), rtol=rtol, atol=atol
+        )
+
+
+class TestFxToOnnxWithOnnxRuntime(onnx_test_common._TestONNXRuntime):
+    def setUp(self):
+        super().setUp()
+        self.diag_ctx = diagnostics.engine.create_diagnostic_context(
+            "test_fx_export", version=torch.__version__
+        )
+        self.opset_version = 17
+
+    def tearDown(self):
+        diagnostics.engine.dump(
+            f"test_report_{self._testMethodName}.sarif", compress=False
+        )
+        super().tearDown()
+
+    def test_simple_function(self):
+        def func(x):
+            # TODO(justinchuby): Replicate torch's type casting policy
+            # in the exporter for type promotion support
+            y = x + 1.0
+            z = y.relu()
+            return (y, z)
+
+        tensor_x = torch.randn(1, 1, 2, dtype=torch.float32)
+
+        _run_test_with_fx_to_onnx_exporter_and_onnx_runtime(func, (tensor_x,))
+
+    def test_func_with_args_and_kwargs(self):
+        # Non-tensor optional kwargs are always folded into constant and
+        # removed from input list in Dynamo-traced graph, so we can't
+        # define a function like
+        #   def func(x, b=1.0)
+        # here. E.g., if you change the `b` to 1.0 below, it will complain
+        # somewhere that model is called with extra args because the modified
+        # function is traced into
+        #   def forward(self, x : torch.Tensor):
+        #     add = x + 1.0;  x = None
+        #     relu = add.relu()
+        #     return (add, relu)
+        # To summarize, optional kwargs must be tensors; otherwise, they are
+        # treated as in-graph constants in Dynamo.
+        def func(x, b=torch.tensor(1.0)):
+            y = x + b
+            z = y.relu()
+            return (y, z)
+
+        tensor_x = torch.randn(1, 1, 2, dtype=torch.float32)
+
+        # Test without providing optional kwarg.
+        _run_test_with_fx_to_onnx_exporter_and_onnx_runtime(func, (tensor_x,))
+        # Test with only positional args.
+        _run_test_with_fx_to_onnx_exporter_and_onnx_runtime(
+            func, (tensor_x, torch.tensor(8.0))
+        )
+        # Test while specifying optional kwarg.
+        _run_test_with_fx_to_onnx_exporter_and_onnx_runtime(
+            func, (tensor_x,), b=torch.tensor(5.0)
+        )
+
+    def test_mnist(self):
+        class MNISTModel(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv1 = nn.Conv2d(1, 32, 3, 1, bias=True)
+                self.conv2 = nn.Conv2d(32, 64, 3, 2, bias=True)
+                self.fc1 = nn.Linear(9216, 128, bias=True)
+                self.fc2 = nn.Linear(128, 10, bias=True)
+
+            def forward(self, tensor_x: torch.Tensor):
+                tensor_x = self.conv1(tensor_x)
+                tensor_x = torch.sigmoid(tensor_x)
+                tensor_x = self.conv2(tensor_x)
+                tensor_x = torch.sigmoid(tensor_x)
+                tensor_x = torch.flatten(tensor_x, 1)
+                tensor_x = self.fc1(tensor_x)
+                tensor_x = torch.sigmoid(tensor_x)
+                output = self.fc2(tensor_x)
+                return output
+
+        tensor_x = torch.rand((64, 1, 28, 28), dtype=torch.float32)
+        _run_test_with_fx_to_onnx_exporter_and_onnx_runtime(MNISTModel(), (tensor_x,))
+
+    # test single op with no kwargs
+    def test_sigmoid(self):
+        x = torch.randn(1, 4, 2, 3)
+
+        class SigmoidModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.sigmoid = torch.nn.Sigmoid()
+
+            def forward(self, x):
+                return self.sigmoid(x)
+
+        _run_test_with_fx_to_onnx_exporter_and_onnx_runtime(SigmoidModel(), (x,))
+
+    # test single op with no kwargs
+    def test_sigmoid_add(self):
+        self.opset_version = 17
+        # TODO(titaiwang): change to randn once it's ready
+        x = torch.tensor([1.0, 2.0], dtype=torch.float)
+
+        class SigmoidAddModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.sigmoid = torch.nn.Sigmoid()
+
+            def forward(self, x):
+                x = torch.ops.aten.add(x, 1.0, alpha=2.0)
+                return self.sigmoid(x)
+
+        _run_test_with_fx_to_onnx_exporter_and_onnx_runtime(SigmoidAddModel(), (x,))
+
+    def test_gpt2_tiny(self):
+        model_name = "sshleifer/tiny-gpt2"
+        # Download pytorch model
+        model = transformers.AutoModel.from_pretrained(model_name)
+        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
+
+        # Transform input tokens
+        inputs = tokenizer("Hello world!", return_tensors="pt")
+        input_ids = inputs["input_ids"]
+        attention_mask = inputs["attention_mask"]
+
+        onnx_model = fx_onnx.export_after_normalizing_args_and_kwargs(
+            model, use_binary_format=True, opset_version=self.opset_version, **inputs
+        )
+
+        ref_outputs, _ = pytree.tree_flatten(model(**inputs, return_dict=False))
+        ort_outputs = _run_ort(onnx_model, (input_ids, attention_mask))
+        assert len(ref_outputs) == len(ort_outputs)
+        assert len(ref_outputs) == 5
+        for ref_output, ort_output in zip(ref_outputs, ort_outputs):
+            torch.testing.assert_close(ref_output, torch.tensor(ort_output))
+
+    def _test_large_scale_exporter(
+        self,
+        model_name,
+        create_model: Callable,
+        create_args: Callable,
+        create_pytorch_only_kwargs: Callable,
+    ):
+        """Test helper for large-scale exporter.
+
+        Arguments:
+            model_name: Name of the model. It used to name temporary files.
+            create_model: A function that creates a model. It should always create the same model.
+            create_args: A function that creates random input arguments for the model.
+            create_pytorch_only_kwargs: A function that creates kwargs for calling PyTorch model with real tensors.
+
+        This test contains several steps.
+
+        1. Create a toy model.
+        2. Save the toy's state (parameters) to a file. This is for simulating a checkpoint file.
+        3. Load it back and export it to ONNX with large-scale exporter.
+            All operations (including model loading) are done under
+            FakeTensorMode so no real tensor is created and no real
+            computation happens.
+        4. The ONNX model generated in step 3 doesn't contain parameters,
+            and this step adds them as external data and save a new ONNX model.
+        5. Run PyTorch and ONNX models and compare their results.
+        """
+
+        # Create the toy model.
+        model = create_model()
+
+        with tempfile.NamedTemporaryFile(
+            prefix=model_name, suffix=".pt"
+        ) as tmp_file, tempfile.TemporaryDirectory(
+            suffix="large_scale_export"
+        ) as tmp_folder:
+            # Dump state_dict to a file to simulate how HuggingFace model is initialized.
+            # The file will be loaded via .load_state_dict(...)
+            torch.save(model.state_dict(), tmp_file.name)
+
+            ftm = FakeTensorMode(
+                allow_non_fake_inputs=True, allow_fallback_kernels=False
+            )
+            ctx = fx_onnx.FxToOnnxContext()
+
+            # The following coed block does several things.
+            #  1. Create a model whose parameters and buffers are all FakeTensor's.
+            #  2. Convert nn.Module into ONNX model without initializers.
+            #  3. Record the file paths to find real initializers.
+            with ftm, ctx:
+                # Toy model with parameters and buffers as FakeTensor's.
+                fake_model = create_model()
+                fake_model.load_state_dict(torch.load(tmp_file.name))
+                # Toy inputs as FakeTensor's.
+                fake_args = create_args()
+                # Export ONNX model without initializers while ctx.paths records
+                # all files that contains real initializers.
+                (onnx_model, _, _, _) = fx_onnx.export_without_parameters_and_buffers(
+                    fake_model,
+                    *fake_args,
+                    use_binary_format=False,
+                    opset_version=self.opset_version,
+                )
+
+            # Tasks done by the following block.
+            #  1. Iterate through all tensors stored in ctx.paths (the file content is loaded torch.load)
+            #  2. If a tensor's name matches a "onnx_model"'s input name, an initializer is created and saved to
+            #     a seperated folder.
+            #  3. A new ONNX model is saved into file with the initializers saved in the previous step.
+            #  4. ORT executes the new ONNX model and compares the results with the original GPT model.
+
+            # Model saved to tmp_folder/onnx_model_location
+            # Initializers are saved to tmp_folder/onnx_initializer_location/*.onnx
+            onnx_model_location = model_name + "_external_data.onnx"
+            onnx_initializer_location = model_name + "_initializers"
+            fx_onnx.save_model_with_external_data(
+                tmp_folder,
+                onnx_model_location,
+                onnx_initializer_location,
+                tuple(ctx.paths),
+                onnx_model,
+            )
+
+            # Generate random inputs.
+            args = create_args()
+            kwargs = create_pytorch_only_kwargs()
+            # Original outputs.
+            ref_outputs, _ = pytree.tree_flatten(model(*args, **kwargs))
+            # ORT outputs.
+            ort_outputs = _run_ort(
+                os.path.join(tmp_folder, onnx_model_location),
+                (arg for arg in args if arg is not None),
+            )
+
+            assert len(ref_outputs) == len(ort_outputs)
+
+            for ref_output, ort_output in zip(ref_outputs, ort_outputs):
+                torch.testing.assert_close(ref_output, torch.tensor(ort_output))
+
+    def test_large_scale_exporter_with_toy_mlp(self):
+        class MLPModel(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.fc0 = nn.Linear(8, 8, bias=True)
+                self.fc1 = nn.Linear(8, 4, bias=True)
+                self.fc2 = nn.Linear(4, 2, bias=True)
+                self.fc3 = nn.Linear(2, 2, bias=True)
+
+            def forward(self, tensor_x: torch.Tensor):
+                tensor_x = self.fc0(tensor_x)
+                tensor_x = torch.sigmoid(tensor_x)
+                tensor_x = self.fc1(tensor_x)
+                tensor_x = torch.sigmoid(tensor_x)
+                tensor_x = self.fc2(tensor_x)
+                tensor_x = torch.sigmoid(tensor_x)
+                output = self.fc3(tensor_x)
+                return output
+
+        def create_model():
+            return MLPModel()
+
+        def create_args():
+            return (torch.rand((97, 8), dtype=torch.float32),)
+
+        def create_pytorch_only_extra_kwargs():
+            return {}
+
+        self._test_large_scale_exporter(
+            "toy_mlp1", create_model, create_args, create_pytorch_only_extra_kwargs
+        )
+
+    def test_large_scale_exporter_with_tiny_gpt2(self):
+        model_name = "sshleifer/tiny-gpt2"
+
+        def create_model():
+            return transformers.AutoModel.from_pretrained(model_name)
+
+        def create_args():
+            tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
+            kwargs = tokenizer("Hello world!", return_tensors="pt")
+            input_ids = kwargs["input_ids"]
+            attention_mask = kwargs["attention_mask"]
+            return input_ids, None, attention_mask
+
+        def create_pytorch_only_extra_kwargs():
+            return {"return_dict": False}
+
+        self._test_large_scale_exporter(
+            "tiny_gpt2", create_model, create_args, create_pytorch_only_extra_kwargs
+        )
+
+
+if __name__ == "__main__":
+    common_utils.run_tests()
diff --git a/test/onnx/test_models.py b/test/onnx/test_models.py
index 15904839957e..b50e8e903c7b 100644
--- a/test/onnx/test_models.py
+++ b/test/onnx/test_models.py
@@ -13,7 +13,7 @@
 from model_defs.srresnet import SRResNet
 from model_defs.super_resolution import SuperResolutionNet
 from pytorch_test_common import skipIfUnsupportedMinOpsetVersion, skipScriptTest
-from torch import quantization
+from torch.ao import quantization
 from torch.autograd import Variable
 from torch.onnx import OperatorExportTypes
 from torch.testing._internal import common_utils
diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py
index 4b7bdb58ae51..af259b4e1d67 100644
--- a/test/onnx/test_models_onnxruntime.py
+++ b/test/onnx/test_models_onnxruntime.py
@@ -245,6 +245,7 @@ def test_faster_rcnn(self):
             atol=1e-5,
         )
 
+    @unittest.skip("Failing after ONNX 1.13.0")
     @skipIfUnsupportedMinOpsetVersion(11)
     @skipScriptTest()
     def test_mask_rcnn(self):
diff --git a/test/onnx/test_onnx_opset.py b/test/onnx/test_onnx_opset.py
index ef79e82ee266..7c008624db4f 100644
--- a/test/onnx/test_onnx_opset.py
+++ b/test/onnx/test_onnx_opset.py
@@ -170,9 +170,6 @@ def test_maxpool(self):
 
     def test_upsample(self):
         class MyModule(Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 size = [v * 2 for v in x.size()[2:]]
                 size = [int(i) for i in size]
@@ -201,9 +198,6 @@ def forward(self, x):
 
     def test_cast_constant(self):
         class MyModule(Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 return x - 1
 
diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py
index cfb36732af4d..7bc47e8cefc4 100644
--- a/test/onnx/test_operators.py
+++ b/test/onnx/test_operators.py
@@ -880,7 +880,7 @@ def test_cumsum(self):
     #    def test_c2_op(self):
     #        class MyModel(torch.nn.Module):
     #            def __init__(self):
-    #                super(MyModel, self).__init__()
+    #                super().__init__()
     #
     #            def forward(self, scores, bbox_deltas, im_info, anchors):
     #                a, b = torch.ops._caffe2.GenerateProposals(
diff --git a/test/onnx/test_pytorch_onnx_no_runtime.py b/test/onnx/test_pytorch_onnx_no_runtime.py
index 69edf370c492..0bd78d3732ec 100644
--- a/test/onnx/test_pytorch_onnx_no_runtime.py
+++ b/test/onnx/test_pytorch_onnx_no_runtime.py
@@ -2,6 +2,8 @@
 
 """Tests for onnx export that don't run the exported model."""
 
+from __future__ import annotations
+
 import contextlib
 import io
 import itertools
@@ -97,7 +99,7 @@ def forward(self, x):
 
         class TraceMe(torch.nn.Module):
             def __init__(self):
-                super(TraceMe, self).__init__()
+                super().__init__()
                 self.foo = Foo()
 
             def forward(self, x):
@@ -118,9 +120,6 @@ def foo(x):
 
     def test_onnx_export_script_module(self):
         class ModuleToExport(torch.jit.ScriptModule):
-            def __init__(self):
-                super(ModuleToExport, self).__init__()
-
             @torch.jit.script_method
             def forward(self, x):
                 y = x - x
@@ -136,9 +135,6 @@ def func_with_warning(inp):
             return torch.nn.functional.sigmoid(inp)  # triggers a deprecation warning
 
         class WarningTest(torch.nn.Module):
-            def __init__(self):
-                super(WarningTest, self).__init__()
-
             def forward(self, x):
                 return func_with_warning(x)
 
@@ -149,16 +145,13 @@ def forward(self, x):
 
     def test_onnx_export_script_python_fail(self):
         class PythonModule(torch.jit.ScriptModule):
-            def __init__(self):
-                super(PythonModule, self).__init__()
-
             @torch.jit.ignore
             def forward(self, x):
                 return torch.neg(x)
 
         class ModuleToExport(torch.jit.ScriptModule):
             def __init__(self):
-                super(ModuleToExport, self).__init__()
+                super().__init__()
                 self.mod = PythonModule()
 
             @torch.jit.script_method
@@ -173,15 +166,12 @@ def forward(self, x):
 
     def test_onnx_export_script_inline_trace(self):
         class ModuleToInline(torch.nn.Module):
-            def __init__(self):
-                super(ModuleToInline, self).__init__()
-
             def forward(self, x):
                 return torch.neg(x)
 
         class ModuleToExport(torch.jit.ScriptModule):
             def __init__(self):
-                super(ModuleToExport, self).__init__()
+                super().__init__()
                 self.mod = torch.jit.trace(ModuleToInline(), torch.zeros(1, 2, 3))
 
             @torch.jit.script_method
@@ -194,16 +184,13 @@ def forward(self, x):
 
     def test_onnx_export_script_inline_script(self):
         class ModuleToInline(torch.jit.ScriptModule):
-            def __init__(self):
-                super(ModuleToInline, self).__init__()
-
             @torch.jit.script_method
             def forward(self, x):
                 return torch.neg(x)
 
         class ModuleToExport(torch.jit.ScriptModule):
             def __init__(self):
-                super(ModuleToExport, self).__init__()
+                super().__init__()
                 self.mod = ModuleToInline()
 
             @torch.jit.script_method
@@ -216,9 +203,6 @@ def forward(self, x):
 
     def test_onnx_export_script_module_loop(self):
         class ModuleToExport(torch.jit.ScriptModule):
-            def __init__(self):
-                super(ModuleToExport, self).__init__()
-
             @torch.jit.script_method
             def forward(self, x):
                 # test if we support end to end onnx export on loop and
@@ -234,9 +218,6 @@ def forward(self, x):
     @common_utils.suppress_warnings
     def test_onnx_export_script_truediv(self):
         class ModuleToExport(torch.jit.ScriptModule):
-            def __init__(self):
-                super(ModuleToExport, self).__init__()
-
             @torch.jit.script_method
             def forward(self, x):
                 z = x.size(0) / 2
@@ -250,9 +231,6 @@ def forward(self, x):
 
     def test_onnx_export_script_non_alpha_add_sub(self):
         class ModuleToExport(torch.jit.ScriptModule):
-            def __init__(self):
-                super(ModuleToExport, self).__init__()
-
             @torch.jit.script_method
             def forward(self, x):
                 bs = x.size(0) + 1
@@ -263,9 +241,6 @@ def forward(self, x):
 
     def test_onnx_export_script_module_if(self):
         class ModuleToExport(torch.jit.ScriptModule):
-            def __init__(self):
-                super(ModuleToExport, self).__init__()
-
             @torch.jit.script_method
             def forward(self, x):
                 if bool(torch.sum(x) > 0):
@@ -278,7 +253,7 @@ def forward(self, x):
     def test_onnx_export_script_inline_params(self):
         class ModuleToInline(torch.jit.ScriptModule):
             def __init__(self):
-                super(ModuleToInline, self).__init__()
+                super().__init__()
                 self.m = torch.nn.Parameter(torch.ones(3, 3))
                 self.unused = torch.nn.Parameter(torch.ones(1, 2, 3))
 
@@ -288,7 +263,7 @@ def forward(self, x):
 
         class ModuleToExport(torch.jit.ScriptModule):
             def __init__(self):
-                super(ModuleToExport, self).__init__()
+                super().__init__()
                 self.mod = ModuleToInline()
                 self.param = torch.nn.Parameter(torch.ones(3, 4))
 
@@ -308,7 +283,7 @@ def forward(self, x):
     def test_onnx_export_speculate(self):
         class Foo(torch.jit.ScriptModule):
             def __init__(self, m):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.m = m
 
             @torch.jit.script_method
@@ -691,9 +666,6 @@ def forward(self, x):
 
     def test_onnx_proto_checker(self):
         class Model(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 return 2 * x
 
@@ -815,9 +787,6 @@ def test_pack_padded_pad_packed_trace(self):
         T, B, C = 3, 5, 7
 
         class PadPackedWrapper(torch.nn.Module):
-            def __init__(self):
-                super(PadPackedWrapper, self).__init__()
-
             def forward(self, x, seq_lens):
                 x = pack_padded_sequence(x, seq_lens)
                 x, _ = pad_packed_sequence(x)
@@ -869,7 +838,7 @@ def test_rnn_trace_override(self):
 
         class RNNTraceWrapper(torch.nn.Module):
             def __init__(self, cell_type):
-                super(RNNTraceWrapper, self).__init__()
+                super().__init__()
                 if cell_type == "RNN":
                     self.rnn = torch.nn.RNN(
                         input_size=C, hidden_size=C, num_layers=num_layers
@@ -919,6 +888,65 @@ def forward(self, x, seq_lens):
             f = io.BytesIO()
             torch.onnx.export(m, (x, seq_lens), f, verbose=False)
 
+    def test_pushpackingpastrnn_in_peephole_create_own_gather_input(self):
+        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+
+        num_layers = 3
+        T, B, C = 11, 5, 7
+        mask_start_point = 0
+
+        class LSTMTraceWrapper(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+                self.rnn = torch.nn.LSTM(
+                    input_size=C, hidden_size=C, num_layers=num_layers
+                )
+
+            def forward(self, x, seq_lens):
+                mask = torch.arange(mask_start_point, x.shape[1])
+                seq_lens = seq_lens[mask]
+                x = pack_padded_sequence(x, seq_lens)
+                # Calculate sizes and prepare views to our zero buffer to pass as hx
+                max_batch_size = x.batch_sizes[0]
+                hx = torch.randn(num_layers, max_batch_size, C)
+                cx = torch.randn(num_layers, max_batch_size, C)
+                x, _ = self.rnn(x, (hx, cx))
+                x, _ = pad_packed_sequence(x)
+                return x
+
+        x = torch.ones(T, B, C)
+        # length 5 because of B
+        seq_lens = torch.from_numpy(np.array([11, 3, 2, 2, 1], dtype=np.int32))
+        m = LSTMTraceWrapper()
+
+        f = io.BytesIO()
+        torch.onnx.export(
+            m,
+            (x, seq_lens),
+            f,
+            verbose=True,
+            input_names=["input", "seq_len"],
+            dynamic_axes={"input": {1: "B"}},
+        )
+        onnx_proto = onnx.load_model_from_string(f.getvalue())
+        # the first argument in onnx::Range should be constant node with value 0
+        const_node = []
+        constant_input_name = None
+        for n in onnx_proto.graph.node:
+            if n.op_type == "Constant":
+                const_node.append(n)
+            elif n.op_type == "Range":
+                constant_input_name = n.input[0]
+        self.assertNotEqual(constant_input_name, None)
+        self.assertNotEqual(len(const_node), 0)
+
+        value = None
+        for n in const_node:
+            if n.output[0] == constant_input_name:
+                value = np.frombuffer(n.attribute[0].t.raw_data, dtype=np.int64)
+        self.assertEqual(value, 0)
+
     def test_trace_fork_wait_inline_onnx(self):
         def fork_body(x):
             return torch.neg(x), torch.neg(x)
@@ -1040,12 +1068,12 @@ def test_onnx_aten_fallback_must_not_fallback(self):
         # For BUILD_CAFFE2=0, aten fallback only when not exportable
         class ONNXExportable(torch.nn.Module):
             def __init__(self):
-                super(ONNXExportable, self).__init__()
-                self.quant = torch.quantization.QuantStub()
+                super().__init__()
+                self.quant = torch.ao.quantization.QuantStub()
                 self.fc1 = torch.nn.Linear(12, 8)
                 self.fc2 = torch.nn.Linear(8, 4)
                 self.fc3 = torch.nn.Linear(4, 6)
-                self.dequant = torch.quantization.DeQuantStub()
+                self.dequant = torch.ao.quantization.DeQuantStub()
 
             def forward(self, x):
                 x = self.quant(x)
@@ -1095,6 +1123,39 @@ def forward(self, x):
                     dim,
                 )
 
+    def test_col2im(self):
+        # This test can be moved to test/onnx/test_pytorch_onnx_onnxruntime.py when ORT implement ::Col2Im
+
+        # Random batched RGB 32x32 image-shaped input tensor of batch size 64
+        original_image_inputs = torch.randn((64, 3, 32, 32))
+        output_size = tuple(original_image_inputs.shape[2:])
+        kernel_size = (1, 2)
+        dilation = 3
+        padding = 2
+        stride = 1
+        model_im2col = torch.nn.Unfold(
+            kernel_size, dilation=dilation, padding=padding, stride=stride
+        )
+        blocks = model_im2col(original_image_inputs)
+
+        model = torch.nn.Fold(
+            output_size=output_size,
+            kernel_size=kernel_size,
+            dilation=dilation,
+            padding=padding,
+            stride=stride,
+        )
+        f = io.BytesIO()
+        torch.onnx.export(model, (blocks,), f, opset_version=18)
+
+        onnx_model = onnx.load(io.BytesIO(f.getvalue()))
+        self.assertEqual(onnx_model.graph.node[-1].op_type, "Col2Im")
+        self.assertEqual(onnx_model.graph.node[-1].domain, "")
+        self.assertEqual(len(onnx_model.graph.node[-1].input), 3)
+        self.assertEqual(onnx_model.graph.node[-1].attribute[0].name, "dilations")
+        self.assertEqual(onnx_model.graph.node[-1].attribute[1].name, "pads")
+        self.assertEqual(onnx_model.graph.node[-1].attribute[2].name, "strides")
+
 
 class TestQuantizeEagerONNXExport(common_utils.TestCase):
     def _test_lower_graph_impl(self, model, data):
diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index ff7fac109fe9..b35e66182e7c 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -44,7 +44,9 @@
 # The min onnx opset version to test for
 MIN_ONNX_OPSET_VERSION = 9
 # The max onnx opset version to test for
-MAX_ONNX_OPSET_VERSION = _constants.ONNX_MAX_OPSET
+MAX_ONNX_OPSET_VERSION = (
+    _constants.ONNX_MAX_OPSET - 1
+)  # TODO: ORT does not support opset 18 yet
 
 
 def _init_test_generalized_rcnn_transform():
@@ -849,9 +851,6 @@ def forward(self, x: int, y):
     @skipDtypeChecking
     def test_primitive_input_floating(self):
         class Model(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x: float, y):
                 return x + y
 
@@ -861,9 +860,6 @@ def forward(self, x: float, y):
 
     def test_primitive_input_bool(self):
         class Model(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, flag: bool, x, y):
                 if flag:
                     return x
@@ -1616,10 +1612,12 @@ def forward(self, x: int, y: int):
         y = 2
         self.run_test(ArithmeticModule(), (x, y))
 
-    # Outputs that are always None are removed.
-    # Issue 84130: ONNX ignores mustNone() node, while pytorch
-    # doesn't, and that makes Optional comparison difficult to achieve.
-    @skipScriptTest()  # TODO Use onnx::Optional to replace erase None in shape_type_inference.cpp
+    @skipScriptTest(
+        15,
+        reason="In trace: Outputs that are always None are removed. \
+                In script: Outputs that are always None are removed before opset 15. \
+                After opset 15, we replace the None in output with Optional node.",
+    )
     def test_tuple_with_none_outputs(self):
         class TupleModel(torch.nn.Module):
             def forward(self, x):
@@ -3707,6 +3705,19 @@ def forward(self, x):
         x = torch.randn(3, 3)
         self.run_test(Model(), x)
 
+    def test_norm_with_dtype(self):
+        class Model(torch.nn.Module):
+            def forward(self, x):
+                # TODO(bowbao): There is a slight gap in today's test infrastructure
+                # to directly test aten ops. OpInfo `torch.norm`` in `common_methods_invocations.py`
+                # will not decompose to below aten op.
+                return torch.ops.aten.norm(
+                    x, p=2, dim=[1], keepdim=True, dtype=torch.float64
+                )
+
+        x = torch.randn(3, 3)
+        self.run_test(Model(), x)
+
     def test_layer_norm(self):
         # As layer_norm works on the last D dimension, please keep
         # this test case at least three dimension to prevent the
@@ -4550,7 +4561,7 @@ def make_model(layers, packed_sequence):
         def make_input(batch_size, layers, packed_sequence):
             batch_first = True if packed_sequence == 2 else False
             seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size)
-            seq_lengths = list(reversed(sorted(map(int, seq_lengths))))
+            seq_lengths = sorted(map(int, seq_lengths), reverse=True)
             inputs = [torch.randn(l, RNN_INPUT_SIZE) for l in seq_lengths]
             inputs = rnn_utils.pad_sequence(inputs, batch_first=batch_first)
             inputs = [inputs]
@@ -6639,6 +6650,18 @@ def forward(self, input_mask, some_const):
         constant = torch.tensor(5, dtype=torch.float)
         self.run_test(MaskedScatterModel(), (mask, constant))
 
+    @skipIfUnsupportedMinOpsetVersion(11)
+    def test_index_put_with_1d_mask_to_masked_scatter(self):
+        class MaskedScatterModel(torch.nn.Module):
+            def forward(self, tensor, mask, some_const):
+                tensor[mask] = some_const
+                return tensor
+
+        mask = torch.tensor([0, 1, 0, 1, 0, 1, 0, 1], dtype=torch.bool)
+        tensor = torch.randn(8, 4, 5, requires_grad=True)
+        some_const = torch.randn(4, 4, 5, dtype=torch.float)
+        self.run_test(MaskedScatterModel(), (tensor, mask, some_const))
+
     @skipIfUnsupportedMinOpsetVersion(9)
     def test_pixel_shuffle(self):
         class PixelShuffle(torch.nn.Module):
@@ -9408,7 +9431,7 @@ def forward(self, input: rnn_utils.PackedSequence):
                     )
                 )
         else:
-            model = ElmanWithStateModel(
+            model = ElmanWithoutStateModel(
                 layers=layers,
                 bidirect=bidirectional,
                 nonlinearity=nonlinearity,
@@ -9422,7 +9445,7 @@ def forward(self, input: rnn_utils.PackedSequence):
 
         def make_input(batch_size):
             seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size)
-            seq_lengths = list(reversed(sorted(map(int, seq_lengths))))
+            seq_lengths = sorted(map(int, seq_lengths), reverse=True)
             inputs = [torch.randn(l, RNN_INPUT_SIZE) for l in seq_lengths]
             inputs = rnn_utils.pad_sequence(inputs, batch_first=batch_first)
             inputs = [inputs]
@@ -9489,7 +9512,7 @@ def _lstm_test(
 
         def make_input(batch_size):
             seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size)
-            seq_lengths = list(reversed(sorted(map(int, seq_lengths))))
+            seq_lengths = sorted(map(int, seq_lengths), reverse=True)
             inputs = [torch.randn(l, RNN_INPUT_SIZE) for l in seq_lengths]
             inputs = rnn_utils.pad_sequence(inputs, batch_first=batch_first)
             inputs = [inputs]
@@ -9632,7 +9655,7 @@ def forward(self, input, hx):
 
         def make_input(batch_size):
             seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size)
-            seq_lengths = list(reversed(sorted(map(int, seq_lengths))))
+            seq_lengths = sorted(map(int, seq_lengths), reverse=True)
             inputs = [torch.randn(l, RNN_INPUT_SIZE) for l in seq_lengths]
             inputs = rnn_utils.pad_sequence(inputs, batch_first=batch_first)
             inputs = [inputs]
@@ -9710,7 +9733,7 @@ def forward(self, input):
     # forward on class: __torch__.torch.nn.modules.linear.Linear
     @skipScriptTest()
     def test_fake_quantize_activation(self):
-        from torch import quantization
+        from torch.ao import quantization
 
         m = torch.nn.Linear(1, 1)
         m.qconfig = quantization.QConfig(
@@ -11920,9 +11943,6 @@ def forward(self, x):
 
     def test_tuple_output_from_if_with_raised_exception(self):
         class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, t: Tensor) -> Tuple[Tensor, Tensor]:
                 if float(t) < 0:
                     raise Exception("Negative input")
@@ -11974,7 +11994,7 @@ def test_quantized_adaptive_avg_pool2d(self):
 
     @skipIfUnsupportedMinOpsetVersion(10)
     def test_quantized_conv2d_relu(self):
-        model = torch.nn.intrinsic.quantized.ConvReLU2d(16, 33, 3, stride=2)
+        model = torch.ao.nn.intrinsic.quantized.ConvReLU2d(16, 33, 3, stride=2)
         # Manually initialize model weight and bias to random numbers.
         # By default all zeros.
         q_weight = torch.quantize_per_tensor(
@@ -11988,7 +12008,7 @@ def test_quantized_conv2d_relu(self):
 
     @skipIfUnsupportedMinOpsetVersion(10)
     def test_quantized_conv1d_relu(self):
-        model = torch.nn.intrinsic.quantized.ConvReLU1d(16, 33, 3, stride=2)
+        model = torch.ao.nn.intrinsic.quantized.ConvReLU1d(16, 33, 3, stride=2)
         # Manually initialize model weight and bias to random numbers.
         # By default all zeros.
         q_weight = torch.quantize_per_tensor(
@@ -12012,11 +12032,11 @@ def test_quantized_conv1d_relu(self):
                 name="leaky_relu",
             ),
             common_utils.subtest(
-                torch.nn.quantized.LeakyReLU(2.0, 1),
+                torch.ao.nn.quantized.LeakyReLU(2.0, 1),
                 name="quantized_leaky_relu",
             ),
             common_utils.subtest(
-                torch.nn.quantized.Hardswish(2.0, 1),
+                torch.ao.nn.quantized.Hardswish(2.0, 1),
                 name="quantized_hardswish",
             ),
             common_utils.subtest(
@@ -12024,7 +12044,7 @@ def test_quantized_conv1d_relu(self):
                 name="sigmoid",
             ),
             common_utils.subtest(
-                torch.nn.quantized.Sigmoid(2.0, 1),
+                torch.ao.nn.quantized.Sigmoid(2.0, 1),
                 name="quantized_sigmoid",
             ),
             common_utils.subtest(
@@ -12056,7 +12076,7 @@ def test_quantized_conv1d_relu(self):
                 name="select",
             ),
             common_utils.subtest(
-                torch.nn.quantized.LayerNorm(
+                torch.ao.nn.quantized.LayerNorm(
                     [4, 2, 3],
                     torch.nn.Parameter(torch.ones([4, 2, 3])),
                     torch.nn.Parameter(torch.zeros([4, 2, 3])),
@@ -12066,7 +12086,7 @@ def test_quantized_conv1d_relu(self):
                 name="layer_norm",
             ),
             common_utils.subtest(
-                torch.nn.quantized.InstanceNorm1d(
+                torch.ao.nn.quantized.InstanceNorm1d(
                     2,
                     torch.nn.Parameter(torch.ones(4)),
                     torch.nn.Parameter(torch.zeros(4)),
@@ -12076,7 +12096,7 @@ def test_quantized_conv1d_relu(self):
                 name="instance_norm",
             ),
             common_utils.subtest(
-                torch.nn.quantized.GroupNorm(
+                torch.ao.nn.quantized.GroupNorm(
                     2,
                     4,
                     torch.nn.Parameter(torch.zeros(4)),
@@ -12117,15 +12137,12 @@ def forward(self, input):
         x = torch.quantize_per_tensor(torch.randn(1, 2, 3, 4), 1, 0, torch.quint8)
         self.run_test(FlattenModel(), x)
 
-    @unittest.skip(
-        "ONNX Runtime 1.11 does not support quantized cat. Enable after ORT 1.12 is enabled in CI."
-    )
     @skipIfUnsupportedMinOpsetVersion(10)
     @skipScriptTest()  # torch.jit.frontend.FrontendError: Cannot instantiate class 'QFunctional' in a script function:
     def test_quantized_cat_when_concatinating_the_same_tensor(self):
         class QuantizedSelfConcatenationModel(torch.nn.Module):
             def forward(self, x):
-                return torch.nn.quantized.QFunctional().cat((x, x), dim=1)
+                return torch.ao.nn.quantized.QFunctional().cat((x, x), dim=1)
 
         q_input = torch.quantize_per_tensor(torch.ones(2, 3), 0.26, 128, torch.quint8)
         self.run_test(QuantizedSelfConcatenationModel(), q_input)
@@ -12173,15 +12190,12 @@ def forward(self, x):
             ),
         ],
     )
-    @unittest.skip(
-        "ONNX Runtime 1.11 does not support quantized cat. Enable after ORT 1.12 is enabled in CI."
-    )
     @skipIfUnsupportedMinOpsetVersion(10)
     @skipScriptTest()  # torch.jit.frontend.FrontendError: Cannot instantiate class 'QFunctional' in a script function:
     def test_quantized_cat(self, x: torch.Tensor, y: torch.Tensor):
         class QuantizedConcatenationModel(torch.nn.Module):
             def forward(self, x, y):
-                return torch.nn.quantized.QFunctional().cat((x, y), dim=0)
+                return torch.ao.nn.quantized.QFunctional().cat((x, y), dim=0)
 
         self.run_test(QuantizedConcatenationModel(), (x, y))
 
@@ -12240,9 +12254,9 @@ def test_qat_linear_per_channel(self):
         class M(torch.nn.Module):
             def __init__(self):
                 super().__init__()
-                self.quant = torch.quantization.QuantStub()
+                self.quant = torch.ao.quantization.QuantStub()
                 self.linear = torch.nn.Linear(4, 3)
-                self.dequant = torch.quantization.DeQuantStub()
+                self.dequant = torch.ao.quantization.DeQuantStub()
 
             def forward(self, x):
                 x = self.quant(x)
@@ -12251,27 +12265,51 @@ def forward(self, x):
                 return x
 
         model = M()
-        model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
-        model = torch.quantization.prepare_qat(model)
+        model.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+        model = torch.ao.quantization.prepare_qat(model)
         # Set fixed weight and bias to avoid flaky test.
         model.linear.weight = torch.nn.Parameter(
             _construct_tensor_for_quantization_test((3, 4))
         )
         model.linear.bias = torch.nn.Parameter(torch.arange(3, dtype=torch.float))
-        model = torch.quantization.convert(model)
+        model = torch.ao.quantization.convert(model)
 
         # Set fixed input to avoid flaky test.
         input = _construct_tensor_for_quantization_test((4, 4), offset=-8)
         self.run_test(model, input)
 
+    @unittest.skip(
+        "ORT fails with Validating no unexpected access using an invalid node_index on torch converted model"
+    )
+    @skipIfUnsupportedMinOpsetVersion(13)
+    def test_quantized_list_of_inputs_with_cat(self):
+        class TestModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.quant = torch.ao.quantization.QuantStub()
+                self.dequant = torch.ao.quantization.DeQuantStub()
+
+            def forward(self, x):
+                x = self.quant(x)
+                x = torch.cat([x, x], 1)
+                x = self.dequant(x)
+                return x
+
+        model = TestModel()
+        model.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+        model = torch.ao.quantization.prepare_qat(model)
+        model = torch.ao.quantization.convert(model)
+        x = torch.randn(2, 4, 6)
+        self.run_test(model, x)
+
     @skipIfUnsupportedMinOpsetVersion(13)
     def test_qat_relu(self):
         class M(torch.nn.Module):
             def __init__(self):
                 super().__init__()
-                self.quant = torch.quantization.QuantStub()
+                self.quant = torch.ao.quantization.QuantStub()
                 self.relu = torch.nn.ReLU()
-                self.dequant = torch.quantization.DeQuantStub()
+                self.dequant = torch.ao.quantization.DeQuantStub()
 
             def forward(self, x):
                 x = self.quant(x)
@@ -12280,9 +12318,9 @@ def forward(self, x):
                 return x
 
         model = M()
-        model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
-        model = torch.quantization.prepare_qat(model)
-        model = torch.quantization.convert(model)
+        model.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+        model = torch.ao.quantization.prepare_qat(model)
+        model = torch.ao.quantization.convert(model)
         input = torch.randn(8, 4)
         self.run_test(model, input)
 
@@ -12291,9 +12329,9 @@ def test_qat_conv2d(self):
         class M(torch.nn.Module):
             def __init__(self):
                 super().__init__()
-                self.quant = torch.quantization.QuantStub()
+                self.quant = torch.ao.quantization.QuantStub()
                 self.conv = torch.nn.Conv2d(2, 4, 3, stride=2)
-                self.dequant = torch.quantization.DeQuantStub()
+                self.dequant = torch.ao.quantization.DeQuantStub()
 
             def forward(self, x):
                 x = self.quant(x)
@@ -12302,14 +12340,14 @@ def forward(self, x):
                 return x
 
         model = M()
-        model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
-        model = torch.quantization.prepare_qat(model)
+        model.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+        model = torch.ao.quantization.prepare_qat(model)
         # Set fixed weight and bias to avoid flaky test.
         model.conv.weight = torch.nn.Parameter(
             _construct_tensor_for_quantization_test((2, 4, 3, 3), max_val=2)
         )
         model.conv.bias = torch.nn.Parameter(torch.tensor([0.0, 1.0]))
-        model = torch.quantization.convert(model)
+        model = torch.ao.quantization.convert(model)
 
         # Set fixed input to avoid flaky test.
         input = _construct_tensor_for_quantization_test(
@@ -12322,10 +12360,10 @@ def test_qat_conv2d_relu(self):
         class M(torch.nn.Module):
             def __init__(self):
                 super().__init__()
-                self.quant = torch.quantization.QuantStub()
+                self.quant = torch.ao.quantization.QuantStub()
                 self.conv = torch.nn.Conv2d(2, 4, 3, stride=2)
                 self.relu = torch.nn.ReLU()
-                self.dequant = torch.quantization.DeQuantStub()
+                self.dequant = torch.ao.quantization.DeQuantStub()
 
             def forward(self, x):
                 x = self.quant(x)
@@ -12335,14 +12373,14 @@ def forward(self, x):
                 return x
 
         model = M()
-        model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
-        model = torch.quantization.prepare_qat(model)
+        model.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+        model = torch.ao.quantization.prepare_qat(model)
         # Set fixed weight and bias to avoid flaky test.
         model.conv.weight = torch.nn.Parameter(
             _construct_tensor_for_quantization_test((2, 4, 3, 3), max_val=2)
         )
         model.conv.bias = torch.nn.Parameter(torch.tensor([0.0, 1.0]))
-        model = torch.quantization.convert(model)
+        model = torch.ao.quantization.convert(model)
 
         # Set fixed input to avoid flaky test.
         input = _construct_tensor_for_quantization_test(
@@ -12355,10 +12393,10 @@ def test_qat_conv2d_relu_fused(self):
         class M(torch.nn.Module):
             def __init__(self):
                 super().__init__()
-                self.quant = torch.quantization.QuantStub()
+                self.quant = torch.ao.quantization.QuantStub()
                 self.conv = torch.nn.Conv2d(2, 4, 3, stride=2)
                 self.relu = torch.nn.ReLU()
-                self.dequant = torch.quantization.DeQuantStub()
+                self.dequant = torch.ao.quantization.DeQuantStub()
 
             def forward(self, x):
                 x = self.quant(x)
@@ -12368,15 +12406,15 @@ def forward(self, x):
                 return x
 
         model = M()
-        model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
-        model = torch.quantization.fuse_modules(model.eval(), [["conv", "relu"]])
-        model = torch.quantization.prepare_qat(model.train())
+        model.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+        model = torch.ao.quantization.fuse_modules(model.eval(), [["conv", "relu"]])
+        model = torch.ao.quantization.prepare_qat(model.train())
         # Set fixed weight and bias to avoid flaky test.
         model.conv.weight = torch.nn.Parameter(
             _construct_tensor_for_quantization_test((2, 4, 3, 3), max_val=2)
         )
         model.conv.bias = torch.nn.Parameter(torch.tensor([0.0, 1.0]))
-        model = torch.quantization.convert(model)
+        model = torch.ao.quantization.convert(model)
 
         # Set fixed input to avoid flaky test.
         input = _construct_tensor_for_quantization_test(
@@ -12389,9 +12427,9 @@ def test_qat_maxpool2d(self):
         class M(torch.nn.Module):
             def __init__(self):
                 super().__init__()
-                self.quant = torch.quantization.QuantStub()
+                self.quant = torch.ao.quantization.QuantStub()
                 self.pool = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
-                self.dequant = torch.quantization.DeQuantStub()
+                self.dequant = torch.ao.quantization.DeQuantStub()
 
             def forward(self, x):
                 x = self.quant(x)
@@ -12400,9 +12438,9 @@ def forward(self, x):
                 return x
 
         model = M()
-        model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
-        model = torch.quantization.prepare_qat(model.train())
-        model = torch.quantization.convert(model)
+        model.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+        model = torch.ao.quantization.prepare_qat(model.train())
+        model = torch.ao.quantization.convert(model)
 
         # Set fixed input to avoid flaky test.
         input = _construct_tensor_for_quantization_test((4, 4, 3, 2))
@@ -12411,26 +12449,26 @@ def forward(self, x):
     @skipIfUnsupportedMinOpsetVersion(10)
     def test_qat_avg_pool2d(self):
         model = torch.nn.Sequential(
-            torch.quantization.QuantStub(),
+            torch.ao.quantization.QuantStub(),
             torch.nn.AvgPool2d(kernel_size=3, stride=2, padding=1),
-            torch.quantization.DeQuantStub(),
+            torch.ao.quantization.DeQuantStub(),
         )
-        model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
-        model = torch.quantization.prepare_qat(model.train())
-        model = torch.quantization.convert(model)
+        model.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+        model = torch.ao.quantization.prepare_qat(model.train())
+        model = torch.ao.quantization.convert(model)
         input = _construct_tensor_for_quantization_test((4, 4, 3, 2))
         self.run_test(model, input)
 
     @skipIfUnsupportedMinOpsetVersion(11)
     def test_qat_upsample_nearest2d(self):
         model = torch.nn.Sequential(
-            torch.quantization.QuantStub(),
+            torch.ao.quantization.QuantStub(),
             torch.nn.UpsamplingNearest2d(scale_factor=1.5),
-            torch.quantization.DeQuantStub(),
+            torch.ao.quantization.DeQuantStub(),
         )
-        model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
-        model = torch.quantization.prepare_qat(model.train())
-        model = torch.quantization.convert(model)
+        model.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+        model = torch.ao.quantization.prepare_qat(model.train())
+        model = torch.ao.quantization.convert(model)
         input = _construct_tensor_for_quantization_test((4, 3, 2, 2))
         self.run_test(model, input)
 
diff --git a/test/onnx/test_pytorch_onnx_shape_inference.py b/test/onnx/test_pytorch_onnx_shape_inference.py
index b0f47c1277fa..dd33c2ca689c 100644
--- a/test/onnx/test_pytorch_onnx_shape_inference.py
+++ b/test/onnx/test_pytorch_onnx_shape_inference.py
@@ -8,6 +8,7 @@
 import torch
 from pytorch_test_common import skipIfUnsupportedMinOpsetVersion
 from torch.onnx import _constants, symbolic_helper
+from torch.onnx._internal import jit_utils
 from torch.testing._internal import common_utils
 
 
@@ -22,6 +23,17 @@ def verify(actual_type):
     return verify
 
 
+def g_op(graph: torch.Graph, op_name: str, *args, **kwargs):
+    return jit_utils.GraphContext(
+        graph=graph,
+        block=graph.block(),
+        opset=_constants.ONNX_MAX_OPSET,
+        original_node=None,  # type: ignore[arg-type]
+        params_dict={},
+        env={},
+    ).op(op_name, *args, **kwargs)
+
+
 class TestONNXShapeInference(pytorch_test_common.ExportTestCase):
     def setUp(self):
         self.opset_version = _constants.ONNX_MAX_OPSET
@@ -43,21 +55,23 @@ def create_empty_graph(self):
         return g
 
     def insert_tensor_constant(self, g, tensor):
-        return g.op("Constant", value_t=tensor)
+        return g_op(g, "Constant", value_t=tensor)
 
     def test_cast(self):
         # Test cast with input of unknown scalar type.
         g = self.create_empty_graph()
         input = g.addInput()
-        cast_out = g.op("Cast", input, to_i=1)
+        cast_out = g_op(g, "Cast", input, to_i=1)
         self.run_test(g, cast_out.node(), expect_tensor("Float"))
 
     def test_constant_of_shape(self):
         # Test ConstantOfShape with input of onnx::Shape node.
         g = self.create_empty_graph()
         constant = self.insert_tensor_constant(g, torch.ones(1, 2, 3, 4))
-        shape = g.op("Shape", constant)
-        constant_of_shape = g.op("ConstantOfShape", shape, value_t=torch.tensor([2.0]))
+        shape = g_op(g, "Shape", constant)
+        constant_of_shape = g_op(
+            g, "ConstantOfShape", shape, value_t=torch.tensor([2.0])
+        )
         self.run_test(
             g, constant_of_shape.node(), expect_tensor("Float", shape=(1, 2, 3, 4))
         )
@@ -69,9 +83,11 @@ def test_constant_of_shape_static(self):
         constants = [
             self.insert_tensor_constant(g, torch.tensor(i + 1)) for i in range(rank)
         ]
-        shape = g.op("prim::ListConstruct", *constants)
+        shape = g_op(g, "prim::ListConstruct", *constants)
         shape.setType(torch._C.ListType.ofInts())
-        constant_of_shape = g.op("ConstantOfShape", shape, value_t=torch.tensor([2.0]))
+        constant_of_shape = g_op(
+            g, "ConstantOfShape", shape, value_t=torch.tensor([2.0])
+        )
         self.run_test(
             g, constant_of_shape.node(), expect_tensor("Float", shape=(1, 2, 3, 4))
         )
@@ -81,9 +97,11 @@ def test_constant_of_shape_dynamic(self):
         rank = 4
         g = self.create_empty_graph()
         inputs = [g.addInput() for i in range(rank)]
-        shape = g.op("prim::ListConstruct", *inputs)
+        shape = g_op(g, "prim::ListConstruct", *inputs)
         shape.setType(torch._C.ListType.ofInts())
-        constant_of_shape = g.op("ConstantOfShape", shape, value_t=torch.tensor([2.0]))
+        constant_of_shape = g_op(
+            g, "ConstantOfShape", shape, value_t=torch.tensor([2.0])
+        )
         self.run_test(
             g,
             constant_of_shape.node(),
@@ -98,7 +116,7 @@ def test_gather_dynamic_index(self):
         )
         indices = g.addInput()
         indices.setType(indices.type().with_dtype(torch.int64).with_sizes([None]))
-        output = g.op("Gather", input, indices, axis_i=1)
+        output = g_op(g, "Gather", input, indices, axis_i=1)
         self.run_test(
             g, output.node(), expect_tensor("Float", shape=([None, None, 16, 16]))
         )
@@ -110,26 +128,26 @@ def test_gather_scalar_index(self):
             input.type().with_dtype(torch.float).with_sizes([None, 3, 16, 16])
         )
         indices = self.insert_tensor_constant(g, torch.tensor(1))
-        output = g.op("Gather", input, indices, axis_i=1)
+        output = g_op(g, "Gather", input, indices, axis_i=1)
         self.run_test(g, output.node(), expect_tensor("Float", shape=([None, 16, 16])))
 
     def test_reshape(self):
         g = self.create_empty_graph()
         constant = self.insert_tensor_constant(g, torch.ones(2, 16, 5, 5))
         constant_2 = self.insert_tensor_constant(g, torch.tensor([2, 0, -1]))
-        shape = g.op("Reshape", constant, constant_2)
+        shape = g_op(g, "Reshape", constant, constant_2)
         self.run_test(g, shape.node(), expect_tensor("Float", shape=(2, 16, 25)))
 
         g = self.create_empty_graph()
         constant = self.insert_tensor_constant(g, torch.ones(2, 16, 5, 4))
         constant_2 = self.insert_tensor_constant(g, torch.tensor([-1, 0, 4]))
-        shape = g.op("Reshape", constant, constant_2)
+        shape = g_op(g, "Reshape", constant, constant_2)
         self.run_test(g, shape.node(), expect_tensor("Float", shape=(10, 16, 4)))
 
         g = self.create_empty_graph()
         constant = self.insert_tensor_constant(g, torch.ones(2, 16, 5, 4))
         constant_2 = self.insert_tensor_constant(g, torch.tensor([-1, 0, 0]))
-        shape = g.op("Reshape", constant, constant_2)
+        shape = g_op(g, "Reshape", constant, constant_2)
         self.run_test(g, shape.node(), expect_tensor("Float", shape=(8, 16, 5)))
 
     def test_reshape_symbolic(self):
@@ -137,7 +155,7 @@ def test_reshape_symbolic(self):
         input = g.addInput()
         input.setType(input.type().with_sizes([None, None, 2, 8]))
         constant = self.insert_tensor_constant(g, torch.tensor([0, 0, -1]))
-        output = g.op("Reshape", input, constant)
+        output = g_op(g, "Reshape", input, constant)
         self.run_test(g, output.node(), expect_tensor(None, shape=(None, None, 16)))
 
     @skipIfUnsupportedMinOpsetVersion(14)
@@ -146,7 +164,7 @@ def test_reshape_allowzero(self):
         input = g.addInput()
         input.setType(input.type().with_sizes([3, 4, 0]))
         constant = self.insert_tensor_constant(g, torch.tensor([0, 4, 3]))
-        output = g.op("Reshape", input, constant, allowzero_i=1)
+        output = g_op(g, "Reshape", input, constant, allowzero_i=1)
         self.run_test(g, output.node(), expect_tensor(None, shape=(0, 4, 3)))
 
     def test_slice(self):
@@ -158,35 +176,35 @@ def test_slice(self):
         end = self.insert_tensor_constant(g, torch.tensor([3]))
         axis = self.insert_tensor_constant(g, torch.tensor([0]))
         step = self.insert_tensor_constant(g, torch.tensor([1]))
-        slice = g.op("Slice", input, start_input, end, axis, step)
+        slice = g_op(g, "Slice", input, start_input, end, axis, step)
         self.run_test(g, slice.node(), expect_tensor(None, shape=(None, None)))
 
     def test_broadcast_matmul(self):
         g = self.create_empty_graph()
         constant = self.insert_tensor_constant(g, torch.ones(5, 1, 2))
         constant_2 = self.insert_tensor_constant(g, torch.ones(3, 1, 2, 1))
-        shape = g.op("MatMul", constant, constant_2)
+        shape = g_op(g, "MatMul", constant, constant_2)
         self.run_test(g, shape.node(), expect_tensor("Float", shape=(3, 5, 1, 1)))
 
         # test when first input is of rank 1
         g = self.create_empty_graph()
         constant = self.insert_tensor_constant(g, torch.ones(2))
         constant_2 = self.insert_tensor_constant(g, torch.ones(3, 1, 2, 1))
-        shape = g.op("MatMul", constant, constant_2)
+        shape = g_op(g, "MatMul", constant, constant_2)
         self.run_test(g, shape.node(), expect_tensor("Float", shape=(3, 1, 1)))
 
         # test when second input is of rank 1
         g = self.create_empty_graph()
         constant = self.insert_tensor_constant(g, torch.ones(5, 1, 2))
         constant_2 = self.insert_tensor_constant(g, torch.ones(2))
-        shape = g.op("MatMul", constant, constant_2)
+        shape = g_op(g, "MatMul", constant, constant_2)
         self.run_test(g, shape.node(), expect_tensor("Float", shape=(5, 1)))
 
         # test when both inputs are of rank 1
         g = self.create_empty_graph()
         constant = self.insert_tensor_constant(g, torch.ones(2))
         constant_2 = self.insert_tensor_constant(g, torch.ones(2))
-        shape = g.op("MatMul", constant, constant_2)
+        shape = g_op(g, "MatMul", constant, constant_2)
         self.run_test(g, shape.node(), expect_tensor("Float", shape=()))
 
     def test_expand(self):
@@ -194,8 +212,8 @@ def test_expand(self):
         input = g.addInput()
         constant = self.insert_tensor_constant(g, torch.ones(2, 4))
         input.setType(constant.type().with_sizes([None, None]))
-        shape = g.op("Shape", input)
-        expand = g.op("Expand", constant, shape)
+        shape = g_op(g, "Shape", input)
+        expand = g_op(g, "Expand", constant, shape)
         self.run_test(g, expand.node(), expect_tensor("Float", shape=(None, None)))
 
     def test_pad(self):
@@ -203,8 +221,8 @@ def test_pad(self):
         input = g.addInput()
         input.setType(input.type().with_dtype(torch.float).with_sizes([3, 320, 100]))
         constant = self.insert_tensor_constant(g, torch.ones(6, dtype=torch.long))
-        none = g.op("prim::Constant").setType(torch.NoneType.get())
-        pad = g.op("Pad", input, constant, none, mode_s="constant")
+        none = g_op(g, "prim::Constant").setType(torch.NoneType.get())
+        pad = g_op(g, "Pad", input, constant, none, mode_s="constant")
         self.run_test(g, pad.node(), expect_tensor("Float", shape=(5, 322, 102)))
 
     def test_pad_with_dynamic_input_shape(self):
@@ -212,8 +230,8 @@ def test_pad_with_dynamic_input_shape(self):
         input = g.addInput()
         input.setType(input.type().with_dtype(torch.float).with_sizes([3, None, None]))
         constant = self.insert_tensor_constant(g, torch.ones(6, dtype=torch.long))
-        none = g.op("prim::Constant").setType(torch.NoneType.get())
-        pad = g.op("Pad", input, constant, none, mode_s="constant")
+        none = g_op(g, "prim::Constant").setType(torch.NoneType.get())
+        pad = g_op(g, "Pad", input, constant, none, mode_s="constant")
         self.run_test(g, pad.node(), expect_tensor("Float", shape=(5, None, None)))
 
     def test_pad_with_dynamic_pad_size(self):
@@ -222,19 +240,20 @@ def test_pad_with_dynamic_pad_size(self):
         input.setType(input.type().with_dtype(torch.float).with_sizes([3, 320, 100]))
         pad_size = g.addInput()
         pad_size.setType(pad_size.type().with_dtype(torch.long).with_sizes([6]))
-        none = g.op("prim::Constant").setType(torch.NoneType.get())
-        pad = g.op("Pad", input, pad_size, none, mode_s="constant")
+        none = g_op(g, "prim::Constant").setType(torch.NoneType.get())
+        pad = g_op(g, "Pad", input, pad_size, none, mode_s="constant")
         self.run_test(g, pad.node(), expect_tensor("Float", shape=(None, None, None)))
 
     def test_resize(self):
         g = self.create_empty_graph()
         input = g.addInput()
         input.setType(input.type().with_dtype(torch.float).with_sizes([4, 32, 64, 64]))
-        none = g.op("prim::Constant").setType(torch.NoneType.get())
+        none = g_op(g, "prim::Constant").setType(torch.NoneType.get())
         scales = self.insert_tensor_constant(
             g, torch.tensor([1, 1, 2, 2], dtype=torch.float)
         )
-        resize = g.op(
+        resize = g_op(
+            g,
             "Resize",
             input,
             none,
@@ -250,7 +269,7 @@ def test_resize_after_concat(self):
         g = self.create_empty_graph()
         input = g.addInput()
         input.setType(input.type().with_dtype(torch.float).with_sizes([4, 32, 64, 64]))
-        none = g.op("prim::Constant").setType(torch.NoneType.get())
+        none = g_op(g, "prim::Constant").setType(torch.NoneType.get())
         scale_1 = self.insert_tensor_constant(
             g, torch.tensor([1, 1], dtype=torch.float)
         )
@@ -258,8 +277,9 @@ def test_resize_after_concat(self):
             g, torch.tensor([2, 2], dtype=torch.float)
         )
         # `scales` values should be statically known due to constant folding in shape inference.
-        scales = g.op("Concat", scale_1, scale_2, axis_i=0)
-        resize = g.op(
+        scales = g_op(g, "Concat", scale_1, scale_2, axis_i=0)
+        resize = g_op(
+            g,
             "Resize",
             input,
             none,
@@ -275,14 +295,14 @@ def test_reduce_prod_with_axes(self):
         g = self.create_empty_graph()
         input = g.addInput()
         input.setType(input.type().with_dtype(torch.long).with_sizes([2]))
-        reduce_prod = g.op("ReduceProd", input, axes_i=[0])
+        reduce_prod = g_op(g, "ReduceProd", input, axes_i=[0])
         self.run_test(g, reduce_prod.node(), expect_tensor("Long", shape=(1,)))
 
     def test_reduce_prod_without_axes(self):
         g = self.create_empty_graph()
         input = g.addInput()
         input.setType(input.type().with_dtype(torch.long).with_sizes([2]))
-        reduce_prod = g.op("ReduceProd", input)
+        reduce_prod = g_op(g, "ReduceProd", input)
         self.run_test(g, reduce_prod.node(), expect_tensor("Long", shape=(1,)))
 
     def test_proceeding_nodes_use_prim_pack_padded_output_dtype_correctly(self):
@@ -291,14 +311,14 @@ def test_proceeding_nodes_use_prim_pack_padded_output_dtype_correctly(self):
         input.setType(input.type().with_dtype(torch.float).with_sizes([4, 16]))
         length = g.addInput()
         length.setType(length.type().with_dtype(torch.long).with_sizes([4]))
-        padded, batch_size = g.op("prim::PackPadded", input, length, outputs=2)
+        padded, batch_size = g_op(g, "prim::PackPadded", input, length, outputs=2)
         # `prim::PackPadded` only occurs in tracing mode. Hence its outputs inherits
         # shape and data type from traced graph.
         padded.setType(padded.type().with_dtype(torch.float).with_sizes([None, None]))
         batch_size.setType(batch_size.type().with_dtype(torch.long).with_sizes([None]))
         # `Gather` should use the data type of `batch_size` as the data type of its output.
         gather_idx = self.insert_tensor_constant(g, torch.tensor([0], dtype=torch.long))
-        gather = g.op("Gather", batch_size, gather_idx, axis_i=0)
+        gather = g_op(g, "Gather", batch_size, gather_idx, axis_i=0)
         self.run_test(g, gather.node(), expect_tensor("Long", shape=(None,)))
 
 
diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py
index 66d694895963..e94c7bb8f4e6 100644
--- a/test/onnx/test_utility_funs.py
+++ b/test/onnx/test_utility_funs.py
@@ -984,7 +984,8 @@ def forward(self, x, y, z):
         self.assertIn("NWithOverloads.1", func_names)
         self.assertIn("NWithOverloads.2", func_names)
 
-    @skipIfUnsupportedMinOpsetVersion(15)
+    # Failing after ONNX 1.13.0
+    @skipIfUnsupportedMaxOpsetVersion(1)
     def test_local_function_infer_scopes(self):
         class M(torch.nn.Module):
             def forward(self, x):
@@ -1624,9 +1625,6 @@ def f(x: torch.Tensor, y: torch.Tensor):
             return x + z
 
         class MyModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 return f(x, y)
 
diff --git a/test/onnx/test_verification.py b/test/onnx/test_verification.py
index 9745b4dd8784..04d5678081b5 100644
--- a/test/onnx/test_verification.py
+++ b/test/onnx/test_verification.py
@@ -6,10 +6,12 @@
 import unittest
 
 import numpy as np
+import onnx
 import parameterized
 import pytorch_test_common
 
 import torch
+from packaging import version
 from torch.onnx import _constants, _experimental, verification
 from torch.testing._internal import common_utils
 
@@ -150,8 +152,12 @@ def tearDown(self):
         [
             common_utils.subtest(
                 verification.OnnxBackend.REFERENCE,
-                # TODO: enable this when ONNX submodule catches up to >= 1.13.
-                decorators=[unittest.expectedFailure],
+                decorators=[
+                    unittest.skipIf(
+                        version.Version(onnx.__version__) < version.Version("1.13"),
+                        reason="Reference Python runtime was introduced in 'onnx' 1.13.",
+                    )
+                ],
             ),
             verification.OnnxBackend.ONNX_RUNTIME_CPU,
         ],
diff --git a/test/onnx_caffe2/test_pytorch_onnx_caffe2.py b/test/onnx_caffe2/test_pytorch_onnx_caffe2.py
index 78440ac6ecb5..a3b0d0656eb8 100644
--- a/test/onnx_caffe2/test_pytorch_onnx_caffe2.py
+++ b/test/onnx_caffe2/test_pytorch_onnx_caffe2.py
@@ -138,7 +138,7 @@ def convert_cuda(self, model, input):
         cuda_model = model.cuda()
         # input might be nested - we want to move everything to GPU
         cuda_input = function._nested_map(
-            lambda o: isinstance(o, Variable) or isinstance(o, torch.Tensor),
+            lambda o: isinstance(o, (Variable, torch.Tensor)),
             lambda o: o.cuda(),
         )(input)
         return cuda_model, cuda_input
@@ -424,7 +424,7 @@ def _elman_rnn_test(
 
         def make_input(batch_size):
             seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size)
-            seq_lengths = list(reversed(sorted(map(int, seq_lengths))))
+            seq_lengths = sorted(map(int, seq_lengths), reverse=True)
             inputs = [torch.randn(l, RNN_INPUT_SIZE) for l in seq_lengths]
             inputs = rnn_utils.pad_sequence(inputs, batch_first=batch_first)
             inputs = [inputs]
@@ -485,7 +485,7 @@ def _lstm_test(
 
         def make_input(batch_size):
             seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size)
-            seq_lengths = list(reversed(sorted(map(int, seq_lengths))))
+            seq_lengths = sorted(map(int, seq_lengths), reverse=True)
             inputs = [torch.randn(l, RNN_INPUT_SIZE) for l in seq_lengths]
             inputs = rnn_utils.pad_sequence(inputs, batch_first=batch_first)
             inputs = [inputs]
@@ -540,7 +540,7 @@ def _gru_test(self, layers, bidirectional, initial_state, packed_sequence, dropo
 
         def make_input(batch_size):
             seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size)
-            seq_lengths = list(reversed(sorted(map(int, seq_lengths))))
+            seq_lengths = sorted(map(int, seq_lengths), reverse=True)
             inputs = [torch.randn(l, RNN_INPUT_SIZE) for l in seq_lengths]
             inputs = rnn_utils.pad_sequence(inputs, batch_first=batch_first)
             inputs = [inputs]
@@ -581,7 +581,7 @@ def make_input(batch_size):
     def test_rnn_init_predict_split(self):
         model = nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 3, bidirectional=True)
         seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=7)
-        seq_lengths = list(reversed(sorted(map(int, seq_lengths))))
+        seq_lengths = sorted(map(int, seq_lengths), reverse=True)
         input = [torch.randn(l, RNN_INPUT_SIZE) for l in seq_lengths]
         input = rnn_utils.pad_sequence(input)
 
@@ -814,9 +814,6 @@ def test_constant(self):
         c = torch.randn(BATCH_SIZE, 3, 224, 224)
 
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, input):
                 return input + c.type_as(input)
 
@@ -828,9 +825,6 @@ def test_consumed_bn(self):
 
     def _test_index_generic(self, fn):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, input):
                 return fn(input)
 
@@ -925,9 +919,6 @@ def test_tensor_index_advanced_indexing_masked(self):
 
     def test_chunk(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, input):
                 # TODO: Why index? This returns a tuple and test runner doesn't
                 # support tuple comparison.
@@ -937,9 +928,6 @@ def forward(self, input):
 
     def test_sqrt(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, input):
                 return input.sqrt()
 
@@ -956,9 +944,6 @@ def forward(self, input):
 
     def test_log(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, input):
                 return input.log()
 
@@ -968,9 +953,6 @@ def forward(self, input):
     @skipIfUnsupportedMinOpsetVersion(9)
     def test_erf(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, input):
                 return input.erf()
 
@@ -980,9 +962,6 @@ def forward(self, input):
     def test_trigonometry(self):
         def test_func(name):
             class MyModel(torch.nn.Module):
-                def __init__(self):
-                    super().__init__()
-
                 def forward(self, input):
                     return getattr(input, name)()
 
@@ -1000,9 +979,6 @@ def forward(self, input):
 
     def test_addconstant(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, input):
                 # TODO: Why index? This returns a tuple and test runner doesn't
                 # support tuple comparison.
@@ -1012,9 +988,6 @@ def forward(self, input):
 
     def test_subconstant(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, input):
                 # TODO: Why index? This returns a tuple and test runner doesn't
                 # support tuple comparison.
@@ -1169,9 +1142,6 @@ def test_mnist(self):
 
     def test_mm(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, m1, m2):
                 return torch.mm(m1, m2)
 
@@ -1183,9 +1153,6 @@ def forward(self, m1, m2):
 
     def test_addmm(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, ma, m1, m2):
                 return torch.addmm(ma, m1, m2)
 
@@ -1259,9 +1226,6 @@ def forward(self, x):
     # test for a pytorch optimization pass, see https://github.com/pytorch/pytorch/pull/7872
     def test_consecutive_transposes(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 return x.transpose(1, 2).transpose(2, 3)
 
@@ -1275,9 +1239,6 @@ def test_sum(self):
         for params in [{}] + [{"dim": i} for i in range(len(shape))]:
 
             class MyModel(torch.nn.Module):
-                def __init__(self):
-                    super().__init__()
-
                 def forward(self, x):
                     return torch.sum(x, **params)
 
@@ -1291,9 +1252,6 @@ def test_cumsum(self):
         for params in [{"dim": i} for i in range(len(shape))]:
 
             class MyModel(torch.nn.Module):
-                def __init__(self):
-                    super().__init__()
-
                 def forward(self, x):
                     return torch.cumsum(x, **params)
 
@@ -1412,9 +1370,6 @@ def get_GruNet_model_and_inputs(
 
     def test_repeat(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 return x.repeat(1, 2, 3, 4)
 
@@ -1434,9 +1389,6 @@ def test_upsample(self):
     @skipIfUnsupportedOpsetVersion([10])
     def test_interpolate_upsample(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 size = [v * 2 for v in x.size()[2:]]
                 # work around for now: turn the dynamic sizes into constant
@@ -1452,9 +1404,6 @@ def forward(self, x):
     @skipIfUnsupportedOpsetVersion([7, 8, 10])
     def test_interpolate_upsample_dynamic_sizes(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 size = [v * 2 for v in x.size()[2:]]
                 return nn.functional.interpolate(x, size=size, mode="nearest")
@@ -1467,9 +1416,6 @@ def forward(self, x):
 
     def test_repeat_dim_overflow(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 return x.repeat(1, 2, 3, 4)
 
@@ -1480,9 +1426,6 @@ def forward(self, x):
 
     def test_repeat_dynamic(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 return x.repeat(y.size()[0] // 2, y.size()[1] * 2)
 
@@ -1511,9 +1454,6 @@ def test_mean(self):
         for params in [{}] + [{"dim": i} for i in range(len(shape))]:
 
             class MyModel(torch.nn.Module):
-                def __init__(self):
-                    super().__init__()
-
                 def forward(self, x):
                     return torch.mean(x, **params)
 
@@ -1598,9 +1538,6 @@ def test_unsqueeze(self):
         for dim in range(-len(shape) - 1, len(shape) + 1):
 
             class MyModel(torch.nn.Module):
-                def __init__(self):
-                    super().__init__()
-
                 def forward(self, x):
                     return x.unsqueeze(dim)
 
@@ -1615,9 +1552,6 @@ def test_squeeze(self):
         for dim in range(-len(shape), len(shape)):
 
             class MyModel(torch.nn.Module):
-                def __init__(self):
-                    super().__init__()
-
                 def forward(self, x):
                     return x.squeeze(dim)
 
@@ -1644,9 +1578,6 @@ def test_pixel_shuffle(self):
 
     def test_dynamic_sizes(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 shape = torch.onnx.operators.shape_as_tensor(x)
                 new_shape = torch.cat((torch.LongTensor([-1]), shape[0].view(1)))
@@ -1659,9 +1590,6 @@ def forward(self, x):
 
     def test_advanced_broadcast(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 return torch.mul(x, y)
 
@@ -2362,9 +2290,6 @@ def forward(self, feature, im_info, anchors):
 
     def test_c2_roi_align(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, feature, rois):
                 roi_feature = torch.ops._caffe2.RoIAlign(
                     feature,
@@ -2395,9 +2320,6 @@ def rand_roi(N, C, H, W):
 
     def test_c2_generate_proposals(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, scores, bbox_deltas, im_info, anchors):
                 a, b = torch.ops._caffe2.GenerateProposals(
                     scores,
@@ -2433,9 +2355,6 @@ def forward(self, scores, bbox_deltas, im_info, anchors):
 
     def test_c2_bbox_transform(self):
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, rois, deltas, im_info):
                 a, b = torch.ops._caffe2.BBoxTransform(
                     rois,
@@ -2504,9 +2423,6 @@ def test_c2_box_with_nms_limits(self):
         topk_per_image = int(sum(roi_counts) / 2)
 
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, class_prob, pred_bbox, batch_splits):
                 a, b, c, d, e, f = torch.ops._caffe2.BoxWithNMSLimit(
                     class_prob,
@@ -2545,9 +2461,6 @@ def test_c2_inference_lstm(self):
         is_bidirectional = True
 
         class MyModel(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, lstm_in):
                 a, b, c = torch.ops._caffe2.InferenceLSTM(
                     lstm_in, num_layers, has_bias, batch_first, is_bidirectional
diff --git a/test/onnx_caffe2/test_verify.py b/test/onnx_caffe2/test_verify.py
index af8c29bbbe1f..3a5dc2714840 100644
--- a/test/onnx_caffe2/test_verify.py
+++ b/test/onnx_caffe2/test_verify.py
@@ -48,9 +48,6 @@ def forward(self, x, y):
 
     def test_jumbled_params(self):
         class MyModel(Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 y = x * x
                 self.param = Parameter(torch.tensor([2.0]))
diff --git a/test/package/package_a/fake_interface.py b/test/package/package_a/fake_interface.py
index 66802b37d075..02d343af4e1b 100644
--- a/test/package/package_a/fake_interface.py
+++ b/test/package/package_a/fake_interface.py
@@ -11,9 +11,6 @@ def one(self, inp1: Tensor, inp2: Tensor) -> Tensor:
 class OrigModule(torch.nn.Module):
     """A module that implements ModuleInterface."""
 
-    def __init__(self):
-        super(OrigModule, self).__init__()
-
     def one(self, inp1: Tensor, inp2: Tensor) -> Tensor:
         return inp1 + inp2 + 1
 
@@ -27,9 +24,6 @@ def forward(self, input: Tensor) -> Tensor:
 class NewModule(torch.nn.Module):
     """A *different* module that implements ModuleInterface."""
 
-    def __init__(self):
-        super(NewModule, self).__init__()
-
     def one(self, inp1: Tensor, inp2: Tensor) -> Tensor:
         return inp1 * inp2 + 1
 
diff --git a/test/package/package_a/fake_script_class.py b/test/package/package_a/fake_script_class.py
index f68b8352fa5d..988a726b3ed3 100644
--- a/test/package/package_a/fake_script_class.py
+++ b/test/package/package_a/fake_script_class.py
@@ -30,9 +30,6 @@ def returns_self(self) -> "IdListFeature":
 
 
 class UsesIdListFeature(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
     def forward(self, feature: Any):
         if isinstance(feature, IdListFeature):
             return feature.id_list
diff --git a/test/package/package_a/std_sys_module_hacks.py b/test/package/package_a/std_sys_module_hacks.py
index fa8df64f20df..bb7435cb1243 100644
--- a/test/package/package_a/std_sys_module_hacks.py
+++ b/test/package/package_a/std_sys_module_hacks.py
@@ -8,8 +8,5 @@
 
 
 class Module(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
     def forward(self):
         return os.path.abspath("test")
diff --git a/test/package/package_a/test_nn_module.py b/test/package/package_a/test_nn_module.py
index 17ce63000a5d..fec5fd29e64a 100644
--- a/test/package/package_a/test_nn_module.py
+++ b/test/package/package_a/test_nn_module.py
@@ -5,7 +5,7 @@
 
 class TestNnModule(torch.nn.Module):
     def __init__(self, nz=6, ngf=9, nc=3):
-        super(TestNnModule, self).__init__()
+        super().__init__()
         self.main = torch.nn.Sequential(
             # input is Z, going into a convolution
             torch.nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False),
diff --git a/test/package/test_dependency_api.py b/test/package/test_dependency_api.py
index b8350ddf8824..eb1c48c427ba 100644
--- a/test/package/test_dependency_api.py
+++ b/test/package/test_dependency_api.py
@@ -247,6 +247,8 @@ def test_intern_error(self):
                 * Module did not match against any action pattern. Extern, mock, or intern it.
                     package_a
                     package_a.subpackage
+
+                Set debug=True when invoking PackageExporter for a visualization of where broken modules are coming from!
                 """
             ),
         )
@@ -294,6 +296,8 @@ def import_module(self, module_name):
                 * Module is a C extension module. torch.package supports Python modules only.
                     foo
                     bar
+
+                Set debug=True when invoking PackageExporter for a visualization of where broken modules are coming from!
                 """
             ),
         )
@@ -313,6 +317,8 @@ def test_invalid_import(self):
                 * Dependency resolution failed.
                     foo
                       Context: attempted relative import beyond top-level package
+
+                Set debug=True when invoking PackageExporter for a visualization of where broken modules are coming from!
                 """
             ),
         )
diff --git a/test/package/test_dependency_hooks.py b/test/package/test_dependency_hooks.py
index df155ab1dea3..a4824f9a42e3 100644
--- a/test/package/test_dependency_hooks.py
+++ b/test/package/test_dependency_hooks.py
@@ -31,7 +31,7 @@ def my_extern_hook(package_exporter, module_name):
             exporter.register_extern_hook(my_extern_hook)
             exporter.save_source_string("foo", "import module_a")
 
-        self.assertEqual(my_externs, set(["module_a"]))
+        self.assertEqual(my_externs, {"module_a"})
 
     def test_multiple_extern_hooks(self):
         buffer = BytesIO()
@@ -93,7 +93,7 @@ def my_extern_hook2(package_exporter, module_name):
             exporter.save_source_string("foo", "import module_a")
 
         self.assertEqual(my_externs, set())
-        self.assertEqual(my_externs2, set(["module_a"]))
+        self.assertEqual(my_externs2, {"module_a"})
 
     def test_extern_and_mock_hook(self):
         buffer = BytesIO()
@@ -114,8 +114,8 @@ def my_mock_hook(package_exporter, module_name):
             exporter.register_mock_hook(my_mock_hook)
             exporter.save_source_string("foo", "import module_a; import package_a")
 
-        self.assertEqual(my_externs, set(["module_a"]))
-        self.assertEqual(my_mocks, set(["package_a"]))
+        self.assertEqual(my_externs, {"module_a"})
+        self.assertEqual(my_mocks, {"package_a"})
 
 
 if __name__ == "__main__":
diff --git a/test/package/test_digraph.py b/test/package/test_digraph.py
index 0ccc09bcf74c..90dc11f3a100 100644
--- a/test/package/test_digraph.py
+++ b/test/package/test_digraph.py
@@ -82,7 +82,7 @@ def test_iter(self):
         for n in g:
             nodes.add(n)
 
-        self.assertEqual(nodes, set([1, 2, 3]))
+        self.assertEqual(nodes, {1, 2, 3})
 
     def test_contains(self):
         g = DiGraph()
@@ -101,8 +101,8 @@ def test_forward_closure(self):
         g.add_edge("2", "3")
         g.add_edge("5", "4")
         g.add_edge("4", "3")
-        self.assertTrue(g.forward_transitive_closure("1") == set(["1", "2", "3"]))
-        self.assertTrue(g.forward_transitive_closure("4") == set(["4", "3"]))
+        self.assertTrue(g.forward_transitive_closure("1") == {"1", "2", "3"})
+        self.assertTrue(g.forward_transitive_closure("4") == {"4", "3"})
 
     def test_all_paths(self):
         g = DiGraph()
@@ -116,7 +116,7 @@ def test_all_paths(self):
 
         result = g.all_paths("1", "3")
         # to get rid of indeterminism
-        actual = set([i.strip("\n") for i in result.split(";")[2:-1]])
+        actual = {i.strip("\n") for i in result.split(";")[2:-1]}
         expected = {
             '"2" -> "3"',
             '"1" -> "7"',
diff --git a/test/package/test_package_script.py b/test/package/test_package_script.py
index 6dcaa2678c4d..04e3a5b2dae3 100644
--- a/test/package/test_package_script.py
+++ b/test/package/test_package_script.py
@@ -240,9 +240,6 @@ def test_save_scriptmodules_submod_redefinition(self):
         """
 
         class Submod(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, input: str):
                 input = input + "_submod"
                 return input
@@ -260,9 +257,6 @@ def forward(self, input: str):
         # redefinition is intentional, change single inner string
         # string attribute, should trigger new module type
         class Submod(torch.nn.Module):  # noqa: F811
-            def __init__(self):
-                super().__init__()
-
             def forward(self, input: str):
                 input = input + "_submod(changed)"
                 return input
diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index 84442724205a..488b4066a815 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -844,14 +844,17 @@ def _lookup_tensor_categories(
             if key.storage.allocation_id == max(ids | {-1})
         }
 
-    def _run_and_check_parameters_and_gradients(self, inner_fn, model):
+    def _run_and_check_parameters_and_gradients(self, inner_fn, model, grads_none: bool = False):
 
         with profile() as prof:
             inner_fn()
 
         memory_profile = prof._memory_profile()
 
-        def assert_category(t: torch.Tensor, category: _memory_profiler.Category):
+        def assert_category(t: torch.Tensor, category: _memory_profiler.Category, should_be_none: bool = False):
+            if should_be_none:
+                assert t is None, "tensor should be None but is not."
+                return
             self.assertIsNotNone(t)
             categories = self._lookup_tensor_categories(t, memory_profile)
             self.assertGreater(len(categories), 0)
@@ -859,7 +862,7 @@ def assert_category(t: torch.Tensor, category: _memory_profiler.Category):
 
         for p in model.parameters():
             assert_category(p, _memory_profiler.Category.PARAMETER)
-            assert_category(p.grad, _memory_profiler.Category.GRADIENT)
+            assert_category(p.grad, _memory_profiler.Category.GRADIENT, grads_none)
 
         # Rely on internal asserts
         _ = memory_profile.timeline
@@ -929,16 +932,15 @@ def fwd_only():
             _ = model(torch.ones((2, 2)))
 
         def fwd_bwd_step():
+            optimizer.zero_grad()
             y = model(torch.ones((2, 2)))
             torch.nn.functional.mse_loss(y, torch.rand((2, 1))).backward()
             optimizer.step()
-            optimizer.zero_grad()
 
         # If we profile the first step then gradients will not have been
         # created when we call `model.forward`, so if we don't call `.backward`
         # then gradients are never created.
-        with self.assertRaises(AssertionError):
-            self._run_and_check_parameters_and_gradients(inner_fn=fwd_only, model=model)
+        self._run_and_check_parameters_and_gradients(inner_fn=fwd_only, model=model, grads_none=True)
 
         # On the first step we must rely on `AccumulateGrad`, since gradients
         # did not exist when `model.forward` was called.
@@ -1078,10 +1080,10 @@ def test_lazily_initialized(self) -> None:
 
         def inner_fn():
             y = model(torch.ones((2, 2)))
-            torch.nn.functional.mse_loss(y, torch.rand((2, 1))).backward()
             optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
-            optimizer.step()
             optimizer.zero_grad()
+            torch.nn.functional.mse_loss(y, torch.rand((2, 1))).backward()
+            optimizer.step()
 
         self._run_and_check_parameters_and_gradients(inner_fn=inner_fn, model=model)
         self.assertEqual(len(list(model.parameters())), 6)
@@ -1220,9 +1222,7 @@ def step_fn(mark_region):
 
             -- Optimizer --------------------------------------------------------------------------------------------
             aten::add_.Tensor                        3 (PARAMETER), 25 (GRADIENT)                  -> 3 (PARAMETER)
-            aten::add_.Tensor                        5 (PARAMETER), 23 (GRADIENT)                  -> 5 (PARAMETER)
-            aten::zero_                              25 (GRADIENT)                                 -> 25 (GRADIENT)
-            aten::zero_                              23 (GRADIENT)                                 -> 23 (GRADIENT)""",
+            aten::add_.Tensor                        5 (PARAMETER), 23 (GRADIENT)                  -> 5 (PARAMETER)""",
         )
 
     def test_categories_e2e_simple_module_fwd(self) -> None:
@@ -1317,9 +1317,7 @@ def step_fn(mark_region):
             aten::clone                              9 (GRADIENT)                                  -> 11 (OPTIMIZER_STATE)
             aten::detach                             11 (OPTIMIZER_STATE)                          -> 11 (OPTIMIZER_STATE)
             aten::detach                             11 (OPTIMIZER_STATE)                          -> 11 (OPTIMIZER_STATE)
-            aten::add_.Tensor                        3 (PARAMETER), 11 (OPTIMIZER_STATE)           -> 3 (PARAMETER)
-            aten::zero_                              7 (GRADIENT)                                  -> 7 (GRADIENT)
-            aten::zero_                              9 (GRADIENT)                                  -> 9 (GRADIENT)""",
+            aten::add_.Tensor                        3 (PARAMETER), 11 (OPTIMIZER_STATE)           -> 3 (PARAMETER)""",
         )
 
     def test_categories_e2e_sequential_fwd(self) -> None:
@@ -1482,7 +1480,7 @@ def id_for_testing(key):
 
             # We generally don't care about tiny allocations during memory
             # profiling and they add a lot of noise to the unit test.
-            if size >= 256
+            if size >= 512
         ]
 
         self.assertExpectedInline(
@@ -1550,9 +1548,9 @@ def id_for_testing(key):
             destroy                    ???                         27(v1)            2 kB
             increment_version          PARAMETER                    2(v0)         1024 kB
             destroy                    ???                         29(v1)         1024 kB
-            increment_version          GRADIENT                    16(v0)          128 kB
-            increment_version          GRADIENT                    17(v0)            2 kB
-            increment_version          GRADIENT                    13(v0)         1024 kB""")
+            destroy                    GRADIENT                    16(v0)          128 kB
+            destroy                    GRADIENT                    17(v0)            2 kB
+            destroy                    GRADIENT                    13(v0)         1024 kB""")
 
 
 if __name__ == "__main__":
diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py
index c31b1ea164f1..93f0cf7d1cc7 100644
--- a/test/profiler/test_profiler.py
+++ b/test/profiler/test_profiler.py
@@ -7,8 +7,10 @@
 import re
 import tempfile
 import textwrap
+import threading
 import unittest
 from unittest.mock import patch
+import weakref
 from dataclasses import dataclass, field
 from typing import List, Optional
 
@@ -55,7 +57,10 @@
 from torch.testing._internal.common_cuda import TEST_MULTIGPU
 from torch.testing._internal.common_device_type import skipCUDAVersionIn
 from torch.testing._internal.common_utils import (
+    IS_JETSON,
     IS_WINDOWS,
+    instantiate_parametrized_tests,
+    parametrize,
     run_tests,
     TemporaryDirectoryName,
     TemporaryFileName,
@@ -338,10 +343,11 @@ def trace_handler(p):
                 p.step()
             eg.stop()
 
-        eg.unregister_callback()
-
         assert trace_called_num == 2
         assert fp.name == eg.get_output_file_path()
+
+        # cleanup
+        eg.unregister_callback()
         nodes = self.get_execution_graph_root(fp.name)
         loop_count = 0
         found_root_node = False
@@ -369,9 +375,9 @@ def test_execution_graph_alone(self):
             with record_function(f"## LOOP {idx} ##"):
                 self.payload(use_cuda=use_cuda)
         eg.stop()
-        eg.unregister_callback()
 
         assert fp.name == eg.get_output_file_path()
+        eg.unregister_callback()
         nodes = self.get_execution_graph_root(fp.name)
         loop_count = 0
         # Expected tensor object tuple size, in th form of:
@@ -407,13 +413,13 @@ def test_execution_graph_start_stop(self):
                 eg.start()
             elif idx == 9:
                 eg.stop()
-                eg.unregister_callback()
             if eg._execution_graph_running:
                 expected_loop_events += 1
             with record_function(f"## LOOP {idx} ##"):
                 self.payload(use_cuda=use_cuda)
 
         assert fp.name == eg.get_output_file_path()
+        eg.unregister_callback()
         nodes = self.get_execution_graph_root(fp.name)
         loop_count = 0
         found_root_node = False
@@ -465,9 +471,9 @@ def test_execution_graph_no_capture(self):
         fp.close()
         eg = ExecutionGraphObserver()
         eg.register_callback(fp.name)
-        eg.unregister_callback()
 
         assert fp.name == eg.get_output_file_path()
+        eg.unregister_callback()
         nodes = self.get_execution_graph_root(fp.name)
         for n in nodes:
             assert "name" in n
@@ -476,6 +482,7 @@ def test_execution_graph_no_capture(self):
         assert found_root_node
 
 
+@instantiate_parametrized_tests
 class TestProfiler(TestCase):
 
     @unittest.skipIf(TEST_WITH_CROSSREF, "crossref intercepts calls and changes the callsite.")
@@ -498,7 +505,7 @@ def ts_method_1(x, y, z):
 
         class DummyModule(nn.Module):
             def __init__(self):
-                super(DummyModule, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 2, kernel_size=1, stride=2, padding=3, bias=False)
 
             def forward(self, x):
@@ -547,6 +554,161 @@ def extract(pattern: str):
 
         torch._C._set_graph_executor_optimize(prev_opt)
 
+    @parametrize(
+        "name,thread_spec",
+        {
+            "basic": ((False, False),),
+            "multiple_preexisting": ((False, False), ) * 2,
+            "open_in_scope": ((True, False),),
+            "close_in_scope": ((False, True),),
+            "complex": (
+                # Large number of background threads
+                (False, False),
+                (False, False),
+                (False, False),
+                (False, False),
+
+                # some of which finish during profiling
+                (False, True),
+                (False, True),
+
+                # And the profiled section is also multithreaded
+                (True, False),
+                (True, True),
+
+            ),
+        }.items(),
+        name_fn=lambda name, thread_spec: name
+    )
+    @parametrize("work_in_main_thread", [True, False])
+    def test_source_multithreaded(self, name, thread_spec, work_in_main_thread):
+        """Test various threading configurations.
+
+        `thread_spec` is a Tuple[Tuple[bool, bool], ...] where each pair is a
+        thread. The first bool indicates if the thread should be started under
+        the profiler context and the second is if it should be joined under the
+        profiler context.
+        """
+
+        timeout = 15
+        num_threads = len(thread_spec) + 1  # Main thread
+        start_barrier = threading.Barrier(num_threads, timeout=timeout)
+        end_barrier = threading.Barrier(num_threads, timeout=timeout)
+
+        class Task(threading.Thread):
+
+            def __init__(self):
+                self._end_gate = threading.Event()
+                super().__init__(daemon=True)
+                self.start()
+                self.finished = False
+
+            def run(self):
+                self._run(self._end_gate)
+
+            def release(self):
+                self._end_gate.set()
+
+            @staticmethod
+            def _run(end_gate=None):
+
+                def known_preexisting_function():
+                    start_barrier.wait()
+
+                # Fixed point that we can use to test capture of functions
+                # which are already running when profiling is enabled.
+                known_preexisting_function()
+
+                model = torch.nn.Sequential(
+                    torch.nn.Linear(10, 10),
+                    torch.nn.ReLU(),
+                )
+
+                def invoked_during_run():
+                    pass
+
+                invoked_during_run()
+
+                _ = model(torch.rand(4, 10))
+                end_barrier.wait()
+
+                if end_gate is not None:
+                    end_gate.wait(timeout=timeout)
+
+        threads = {}
+
+        def add_threads(context: bool):
+            for idx, (start_under_profiler, _) in enumerate(thread_spec):
+                if start_under_profiler == context:
+                    assert idx not in threads
+                    threads[idx] = Task()
+
+        def join_threads(context: bool):
+            for idx, (_, end_under_profiler) in enumerate(thread_spec):
+                if end_under_profiler == context:
+                    threads[idx].release()
+
+            for idx, (_, end_under_profiler) in enumerate(thread_spec):
+                t = threads[idx]
+                if end_under_profiler == context:
+                    t.join(timeout=timeout)
+
+        try:
+            add_threads(False)
+            with torch.profiler.profile(with_stack=True) as prof:
+                # Threads added while the profiler are running will not be observed
+                # since there is no way to hook into Python's thread start call to
+                # register the observer. These are here purely to verify safety.
+                add_threads(True)
+
+                if work_in_main_thread:
+                    Task._run()
+                else:
+                    start_barrier.wait()
+                    end_barrier.wait()
+
+                join_threads(True)
+            join_threads(False)
+
+        finally:
+            # It is very important that we clean up everything because the
+            # Python tracer will detect ALL active threads. (Even orphans from
+            # prior failed tests.) If we don't clean up properly we can
+            # contaminate subsequent tests.
+            start_barrier.abort()
+            end_barrier.abort()
+            for t in threads.values():
+                t.release()
+
+            for t in threads.values():
+                t.join(timeout=timeout)
+
+            for t in threads.values():
+                self.assertFalse(t.is_alive())
+
+        roots = prof.profiler.kineto_results.experimental_event_tree()
+        nodes = [node for node in _utils.traverse_dfs(roots) if isinstance(node.extra_fields, _ExtraFields_PyCall)]
+        tid_counts = collections.Counter([node.start_tid for node in nodes])
+
+        prior_threads = sum(not start_under_profiler for start_under_profiler, _ in thread_spec)
+        expected_threads = prior_threads + 1
+        self.assertEqual(len(tid_counts), expected_threads, f"{expected_threads}, {tid_counts}")
+        self.assertEqual(len(nodes), sum(tid_counts.values()))
+
+        # Profiler uses uint64_t max as a placeholder until TID can be determined.
+        no_tid = 2 ** 64 - 1
+        self.assertFalse(no_tid in tid_counts)
+
+        worker_threads = prior_threads + (1 if work_in_main_thread else 0)
+
+        observed_preexisting = [node.start_tid for node in nodes if "known_preexisting_function" in node.name]
+        self.assertEqual(len(observed_preexisting), worker_threads)
+        self.assertEqual(len(observed_preexisting), len(set(observed_preexisting)))
+
+        observed_during_run = [node.start_tid for node in nodes if "invoked_during_run" in node.name]
+        self.assertEqual(len(observed_during_run), worker_threads)
+        self.assertEqual(len(observed_during_run), len(set(observed_during_run)))
+
     def payload(self, use_cuda=False):
         x = torch.randn(10, 10)
         if use_cuda:
@@ -763,6 +925,7 @@ def create_mkldnn_tensor():
                 ]
             )
 
+    @unittest.skipIf(IS_JETSON, "Jetson has a guard against OOM since host and gpu memory are shared")
     def test_oom_tracing(self):
         def run_profiler(tensor_creation_fn):
             with _profile(profile_memory=True, record_shapes=True) as prof:
@@ -806,9 +969,6 @@ def check_trace(fname):
     @unittest.skipIf(not kineto_available(), "Kineto is required")
     def test_module_hierarchy(self):
         class A(nn.Module):
-            def __init__(self):
-                super(A, self).__init__()
-
             def my_new_method(self, x):
                 return x * 3
 
@@ -820,15 +980,12 @@ def forward(self, x, y):
                 return self.forward_impl_(x, y)
 
         class B(nn.Module):
-            def __init__(self):
-                super(B, self).__init__()
-
             def forward(self, x):
                 return x + 2
 
         class C(nn.Module):
             def __init__(self):
-                super(C, self).__init__()
+                super().__init__()
                 self.A0 = A()
                 self.B0 = B()
 
@@ -884,7 +1041,7 @@ def __getitem__(self, idx):
 
         class TwoLayerNet(torch.nn.Module):
             def __init__(self, D_in, H, D_out):
-                super(TwoLayerNet, self).__init__()
+                super().__init__()
                 self.linear1 = torch.nn.Linear(D_in, H)
                 self.linear2 = torch.nn.Linear(H, D_out)
 
@@ -895,7 +1052,7 @@ def forward(self, x):
 
         class CustomSGD(torch.optim.SGD):
             def __init__(self, *args, **kwargs):
-                super(CustomSGD, self).__init__(*args, **kwargs)
+                super().__init__(*args, **kwargs)
 
         def train():
             for _, data in enumerate(dataloader):
@@ -2162,6 +2319,44 @@ def test_allocations(self):
         self.assertEqual(node.extra_fields.device, torch.device("cpu"))
         self.assertEqual(node.extra_fields.total_allocated, total_allocated - alloc_size)
 
+    def test_refcounts(self):
+
+        class Sentinel:
+            pass
+
+        def make():
+            outer_sentinel = Sentinel()
+
+            def outer():
+                # Python will only close over variables used in the function.
+                _ = outer_sentinel
+                inner_sentinel = Sentinel()
+
+                def inner():
+                    _ = inner_sentinel
+
+
+                with profile(with_stack=True):
+                    inner()
+
+                return weakref.ref(inner_sentinel)
+
+            return outer, weakref.ref(outer_sentinel)
+
+        # Use a factory function to ensure the test scope never sees strong
+        # references. `del` has strange semantics that interact with closures
+        # at an AST level, so this is simpler.
+        outer, outer_sentinel_ref = make()
+        inner_sentinel_ref = outer()
+
+        self.assertIsNone(inner_sentinel_ref())
+
+        # `outer` holds the last reference via closure.
+        self.assertIsNotNone(outer_sentinel_ref())
+
+        del outer
+        self.assertIsNone(outer_sentinel_ref())
+
 
 @dataclass(frozen=True)
 class MockKinetoEvent():
@@ -2492,6 +2687,7 @@ def test_utils_compute_idle_time(self):
 0 [CPU (After GPU)]
 100000 [CPU (After GPU)]""")
 
+    @unittest.skipIf(IS_JETSON, "JSON not behaving as expected on Jetson")
     def test_utils_get_optimizable_events(self):
         basic_evaluation = _utils.BasicEvaluation(self.load_mock_profile())
         optimizable_events = basic_evaluation.get_optimizable_events(
@@ -2669,10 +2865,10 @@ def test_profiler_grad_not_set_to_none_pattern(self):
         )
         optimizer = torch.optim.Adam(model.parameters())
         cases = (
-            (1, lambda: optimizer.zero_grad()),
-            (1, lambda: model.zero_grad()),
-            (0, lambda: optimizer.zero_grad(set_to_none=True)),
-            (0, lambda: model.zero_grad(set_to_none=True))
+            (0, lambda: optimizer.zero_grad()),
+            (0, lambda: model.zero_grad()),
+            (1, lambda: optimizer.zero_grad(set_to_none=False)),
+            (1, lambda: model.zero_grad(set_to_none=False))
         )
         num_matched = []
         for _, fn in cases:
diff --git a/test/quantization/ao_migration/common.py b/test/quantization/ao_migration/common.py
index 50045a39e7ab..de6e67d35a55 100644
--- a/test/quantization/ao_migration/common.py
+++ b/test/quantization/ao_migration/common.py
@@ -4,40 +4,6 @@
 from typing import List, Optional
 
 class AOMigrationTestCase(TestCase):
-    def _test_package_import(self, package_name: str,
-                             base: Optional[str] = None,
-                             skip: List[str] = None,
-                             new_package_name: Optional[str] = None):
-        r"""Tests the module import by making sure that all the internals match
-        (except the dunder methods).
-
-        Args:
-            package_name: The name of the package to be tested
-            base: The base namespace where the `package_name` resides
-            skip: The list of the subpackages/modules/functions to skip
-        """
-        skip = skip or []
-        base = base or 'quantization'
-        old_base = 'torch.' + base
-        new_base = 'torch.ao.' + base
-        if new_package_name is None:
-            new_package_name = package_name
-        old_module = importlib.import_module(f'{old_base}.{package_name}')
-        new_module = importlib.import_module(f'{new_base}.{new_package_name}')
-        old_module_dir = set(dir(old_module))
-        new_module_dir = set(dir(new_module))
-        # Remove magic modules from checking in subsets
-        for el in list(old_module_dir):
-            if el.startswith('__') and el.endswith('__'):
-                # Remove dunder
-                old_module_dir.remove(el)
-            if el in skip:
-                # Remove skips
-                old_module_dir.remove(el)
-        assert (old_module_dir <= new_module_dir), \
-            f"Importing {old_module} vs. {new_module} does not match: " \
-            f"{old_module_dir - new_module_dir}"
-
     def _test_function_import(self, package_name: str, function_list: List[str],
                               base: Optional[str] = None, new_package_name: Optional[str] = None):
         r"""Tests individual function list import by comparing the functions
diff --git a/test/quantization/ao_migration/test_ao_migration.py b/test/quantization/ao_migration/test_ao_migration.py
index f2efa1e1f04f..374fc205e375 100644
--- a/test/quantization/ao_migration/test_ao_migration.py
+++ b/test/quantization/ao_migration/test_ao_migration.py
@@ -3,127 +3,7 @@
 from .common import AOMigrationTestCase
 
 
-class TestAOMigrationQuantization(AOMigrationTestCase):
-    def test_package_import_quantize(self):
-        self._test_package_import('quantize')
-
-    def test_function_import_quantize(self):
-        function_list = [
-            '_convert',
-            '_observer_forward_hook',
-            '_propagate_qconfig_helper',
-            '_remove_activation_post_process',
-            '_remove_qconfig',
-            '_add_observer_',
-            'add_quant_dequant',
-            'convert',
-            '_get_observer_dict',
-            '_get_unique_devices_',
-            '_is_activation_post_process',
-            'prepare',
-            'prepare_qat',
-            'propagate_qconfig_',
-            'quantize',
-            'quantize_dynamic',
-            'quantize_qat',
-            '_register_activation_post_process_hook',
-            'swap_module',
-        ]
-        self._test_function_import('quantize', function_list)
-
-    def test_package_import_stubs(self):
-        self._test_package_import('stubs')
-
-    def test_function_import_stubs(self):
-        function_list = [
-            'QuantStub',
-            'DeQuantStub',
-            'QuantWrapper',
-        ]
-        self._test_function_import('stubs', function_list)
-
-    def test_package_import_quantize_jit(self):
-        self._test_package_import('quantize_jit')
-
-    def test_function_import_quantize_jit(self):
-        function_list = [
-            '_check_is_script_module',
-            '_check_forward_method',
-            'script_qconfig',
-            'script_qconfig_dict',
-            'fuse_conv_bn_jit',
-            '_prepare_jit',
-            'prepare_jit',
-            'prepare_dynamic_jit',
-            '_convert_jit',
-            'convert_jit',
-            'convert_dynamic_jit',
-            '_quantize_jit',
-            'quantize_jit',
-            'quantize_dynamic_jit',
-        ]
-        self._test_function_import('quantize_jit', function_list)
-
-    def test_package_import_fake_quantize(self):
-        self._test_package_import('fake_quantize')
-
-    def test_function_import_fake_quantize(self):
-        function_list = [
-            '_is_per_channel',
-            '_is_per_tensor',
-            '_is_symmetric_quant',
-            'FakeQuantizeBase',
-            'FakeQuantize',
-            'FixedQParamsFakeQuantize',
-            'FusedMovingAvgObsFakeQuantize',
-            'default_fake_quant',
-            'default_weight_fake_quant',
-            'default_fixed_qparams_range_neg1to1_fake_quant',
-            'default_fixed_qparams_range_0to1_fake_quant',
-            'default_per_channel_weight_fake_quant',
-            'default_histogram_fake_quant',
-            'default_fused_act_fake_quant',
-            'default_fused_wt_fake_quant',
-            'default_fused_per_channel_wt_fake_quant',
-            '_is_fake_quant_script_module',
-            'disable_fake_quant',
-            'enable_fake_quant',
-            'disable_observer',
-            'enable_observer',
-        ]
-        self._test_function_import('fake_quantize', function_list)
-
-
 class TestAOMigrationNNQuantized(AOMigrationTestCase):
-    def test_package_import_nn_quantized_modules(self):
-        r"""Tests the migration of the torch.nn.quantized.modules"""
-        self._test_package_import('modules', base='nn.quantized')
-        self._test_package_import('modules.activation', base='nn.quantized')
-        self._test_package_import('modules.batchnorm', base='nn.quantized')
-        self._test_package_import('modules.conv', base='nn.quantized')
-        self._test_package_import('modules.dropout', base='nn.quantized')
-        self._test_package_import('modules.embedding_ops', base='nn.quantized')
-        self._test_package_import('modules.functional_modules', base='nn.quantized')
-        self._test_package_import('modules.linear', base='nn.quantized')
-        self._test_package_import('modules.normalization', base='nn.quantized')
-        self._test_package_import('modules.utils', base='nn.quantized')
-
-    def test_package_import_nn_quantized(self):
-        skip = [
-            # These are added in the `torch.nn.quantized` to allow
-            # for the legacy import, s.a. `import torch.nn.quantized.conv`, etc.
-            'activation',
-            'batchnorm',
-            'conv',
-            'dropout',
-            'embedding_ops',
-            'functional_modules',
-            'linear',
-            'normalization',
-            '_reference',
-        ]
-        self._test_package_import('quantized', base='nn', skip=skip)
-
     def test_functional_import(self):
         r"""Tests the migration of the torch.nn.quantized.functional"""
         function_list = [
@@ -277,16 +157,6 @@ def test_modules_utils(self):
         self._test_function_import('utils', function_list,
                                    base='nn.quantized.modules')
 
-    def test_package_import_nn_quantized_dynamic(self):
-        self._test_package_import('dynamic', base='nn.quantized')
-
-    def test_package_import_nn_quantized_dynamic_modules(self):
-        r"""Tests the migration of the torch.nn.quantized.modules"""
-        self._test_package_import('modules', base='nn.quantized.dynamic')
-        self._test_package_import('modules.conv', base='nn.quantized.dynamic')
-        self._test_package_import('modules.linear', base='nn.quantized.dynamic')
-        self._test_package_import('modules.rnn', base='nn.quantized.dynamic')
-
     def test_import_nn_quantized_dynamic_import(self):
         module_list = [
             # Modules
@@ -305,15 +175,6 @@ def test_import_nn_quantized_dynamic_import(self):
         ]
         self._test_function_import('dynamic', module_list, base='nn.quantized')
 
-    def test_package_import_nn_quantizable(self):
-        self._test_package_import('quantizable', base='nn')
-
-    def test_package_import_nn_quantizable_modules(self):
-        r"""Tests the migration of the torch.nn.quantizable.modules"""
-        self._test_package_import('modules', base='nn.quantizable')
-        self._test_package_import('modules.activation', base='nn.quantizable')
-        self._test_package_import('modules.rnn', base='nn.quantizable')
-
     def test_import_nn_quantizable_activation(self):
         module_list = [
             # Modules
@@ -329,23 +190,6 @@ def test_import_nn_quantizable_rnn(self):
         ]
         self._test_function_import('rnn', module_list, base='nn.quantizable.modules')
 
-    # torch.nn.qat and torch.nn.qat.dynamic
-    def test_package_import_nn_qat(self):
-        self._test_package_import('qat', base='nn')
-
-    def test_package_import_nn_qat_modules(self):
-        r"""Tests the migration of the torch.nn.qat.modules"""
-        self._test_package_import('modules', base='nn.qat')
-        self._test_package_import('modules.conv', base='nn.qat')
-        self._test_package_import('modules.embedding_ops', base='nn.qat')
-        self._test_package_import('modules.linear', base='nn.qat')
-
-    def test_package_import_nn_qat_dynamic(self):
-        r"""Tests the migration of the torch.nn.qat.modules"""
-        self._test_package_import('dynamic', base='nn.qat')
-        self._test_package_import('dynamic.modules', base='nn.qat')
-        self._test_package_import('dynamic.modules.linear', base='nn.qat')
-
     def test_import_nn_qat_conv(self):
         module_list = [
             'Conv1d',
@@ -375,15 +219,6 @@ def test_import_nn_qat_dynamic_linear(self):
 
 
 class TestAOMigrationNNIntrinsic(AOMigrationTestCase):
-    def test_package_import_nn_intrinsic_modules(self):
-        r"""Tests the migration of the torch.nn.intrinsic.modules"""
-        self._test_package_import('modules', base='nn.intrinsic')
-        self._test_package_import('modules.fused', base='nn.intrinsic')
-
-    def test_package_import_nn_intrinsic(self):
-        skip = []
-        self._test_package_import('intrinsic', base='nn', skip=skip)
-
     def test_modules_import_nn_intrinsic(self):
         module_list = [
             # Modules
@@ -424,11 +259,6 @@ def test_modules_nn_intrinsic_fused(self):
         self._test_function_import('fused', function_list,
                                    base='nn.intrinsic.modules')
 
-    def test_package_import_nn_intrinsic_qat(self):
-        r"""Tests the migration of the torch.nn.intrinsic.modules"""
-        self._test_package_import('qat', base='nn.intrinsic')
-        self._test_package_import('qat.modules', base='nn.intrinsic')
-
     def test_modules_import_nn_intrinsic_qat(self):
         module_list = [
             "LinearReLU",
@@ -478,11 +308,6 @@ def test_modules_intrinsic_qat_linear_relu(self):
         self._test_function_import('linear_relu', function_list,
                                    base='nn.intrinsic.qat.modules')
 
-    def test_package_import_nn_intrinsic_quantized(self):
-        r"""Tests the migration of the torch.nn.intrinsic.quantized"""
-        self._test_package_import('quantized', base='nn.intrinsic')
-        self._test_package_import('quantized.modules', base='nn.intrinsic')
-
     def test_modules_import_nn_intrinsic_quantized(self):
         module_list = [
             'BNReLU2d',
@@ -517,3 +342,9 @@ def test_modules_intrinsic_quantized_linear_relu(self):
         ]
         self._test_function_import('linear_relu', function_list,
                                    base='nn.intrinsic.quantized.modules')
+
+    def test_modules_no_import_nn_intrinsic_quantized_dynamic(self):
+        # TODO(future PR): generalize this
+        import torch
+        _ = torch.ao.nn.intrinsic.quantized.dynamic
+        _ = torch.nn.intrinsic.quantized.dynamic
diff --git a/test/quantization/ao_migration/test_quantization.py b/test/quantization/ao_migration/test_quantization.py
index 60df1d174f6f..356ab4da0e65 100644
--- a/test/quantization/ao_migration/test_quantization.py
+++ b/test/quantization/ao_migration/test_quantization.py
@@ -7,9 +7,6 @@ class TestAOMigrationQuantization(AOMigrationTestCase):
     r"""Modules and functions related to the
     `torch/quantization` migration to `torch/ao/quantization`.
     """
-    def test_package_import_quantize(self):
-        self._test_package_import('quantize')
-
     def test_function_import_quantize(self):
         function_list = [
             '_convert',
@@ -34,9 +31,6 @@ def test_function_import_quantize(self):
         ]
         self._test_function_import('quantize', function_list)
 
-    def test_package_import_stubs(self):
-        self._test_package_import('stubs')
-
     def test_function_import_stubs(self):
         function_list = [
             'QuantStub',
@@ -45,9 +39,6 @@ def test_function_import_stubs(self):
         ]
         self._test_function_import('stubs', function_list)
 
-    def test_package_import_quantize_jit(self):
-        self._test_package_import('quantize_jit')
-
     def test_function_import_quantize_jit(self):
         function_list = [
             '_check_is_script_module',
@@ -67,9 +58,6 @@ def test_function_import_quantize_jit(self):
         ]
         self._test_function_import('quantize_jit', function_list)
 
-    def test_package_import_fake_quantize(self):
-        self._test_package_import('fake_quantize')
-
     def test_function_import_fake_quantize(self):
         function_list = [
             '_is_per_channel',
@@ -96,9 +84,6 @@ def test_function_import_fake_quantize(self):
         ]
         self._test_function_import('fake_quantize', function_list)
 
-    def test_package_import_fuse_modules(self):
-        self._test_package_import('fuse_modules')
-
     def test_function_import_fuse_modules(self):
         function_list = [
             '_fuse_modules',
@@ -112,9 +97,6 @@ def test_function_import_fuse_modules(self):
         ]
         self._test_function_import('fuse_modules', function_list)
 
-    def test_package_import_quant_type(self):
-        self._test_package_import('quant_type')
-
     def test_function_import_quant_type(self):
         function_list = [
             'QuantType',
@@ -122,9 +104,6 @@ def test_function_import_quant_type(self):
         ]
         self._test_function_import('quant_type', function_list)
 
-    def test_package_import_observer(self):
-        self._test_package_import('observer')
-
     def test_function_import_observer(self):
         function_list = [
             "_PartialWrapper",
@@ -156,9 +135,6 @@ def test_function_import_observer(self):
         ]
         self._test_function_import('observer', function_list)
 
-    def test_package_import_qconfig(self):
-        self._test_package_import('qconfig')
-
     def test_function_import_qconfig(self):
         function_list = [
             "QConfig",
@@ -184,9 +160,6 @@ def test_function_import_qconfig(self):
         ]
         self._test_function_import('qconfig', function_list)
 
-    def test_package_import_quantization_mappings(self):
-        self._test_package_import('quantization_mappings')
-
     def test_function_import_quantization_mappings(self):
         function_list = [
             "no_observer_set",
@@ -214,9 +187,6 @@ def test_function_import_quantization_mappings(self):
         self._test_function_import('quantization_mappings', function_list)
         self._test_dict_import('quantization_mappings', dict_list)
 
-    def test_package_import_fuser_method_mappings(self):
-        self._test_package_import('fuser_method_mappings')
-
     def test_function_import_fuser_method_mappings(self):
         function_list = [
             "fuse_conv_bn",
@@ -230,9 +200,6 @@ def test_function_import_fuser_method_mappings(self):
         self._test_function_import('fuser_method_mappings', function_list)
         self._test_dict_import('fuser_method_mappings', dict_list)
 
-    def test_package_import_utils(self):
-        self._test_package_import('utils')
-
     def test_function_import_utils(self):
         function_list = [
             'activation_dtype',
diff --git a/test/quantization/ao_migration/test_quantization_fx.py b/test/quantization/ao_migration/test_quantization_fx.py
index 84e966acdae3..1c4d30a39190 100644
--- a/test/quantization/ao_migration/test_quantization_fx.py
+++ b/test/quantization/ao_migration/test_quantization_fx.py
@@ -3,9 +3,6 @@
 from .common import AOMigrationTestCase
 
 class TestAOMigrationQuantizationFx(AOMigrationTestCase):
-    def test_package_import_quantize_fx(self):
-        self._test_package_import('quantize_fx')
-
     def test_function_import_quantize_fx(self):
         function_list = [
             '_check_is_graph_module',
@@ -25,12 +22,6 @@ def test_function_import_quantize_fx(self):
         ]
         self._test_function_import('quantize_fx', function_list)
 
-    def test_package_import_fx(self):
-        self._test_package_import('fx', skip=[
-            'fusion_patterns',
-            'quantization_patterns',
-        ])
-
     def test_function_import_fx(self):
         function_list = [
             'prepare',
@@ -39,9 +30,6 @@ def test_function_import_fx(self):
         ]
         self._test_function_import('fx', function_list)
 
-    def test_package_import_fx_graph_module(self):
-        self._test_package_import('fx.graph_module')
-
     def test_function_import_fx_graph_module(self):
         function_list = [
             'FusedGraphModule',
@@ -53,9 +41,6 @@ def test_function_import_fx_graph_module(self):
         ]
         self._test_function_import('fx.graph_module', function_list)
 
-    def test_package_import_fx_pattern_utils(self):
-        self._test_package_import('fx.pattern_utils')
-
     def test_function_import_fx_pattern_utils(self):
         function_list = [
             'QuantizeHandler',
@@ -67,9 +52,6 @@ def test_function_import_fx_pattern_utils(self):
         ]
         self._test_function_import('fx.pattern_utils', function_list)
 
-    def test_package_import_fx_equalize(self):
-        self._test_package_import('fx._equalize')
-
     def test_function_import_fx_equalize(self):
         function_list = [
             'reshape_scale',
@@ -101,12 +83,6 @@ def test_function_import_fx_equalize(self):
         ]
         self._test_function_import('fx._equalize', function_list)
 
-    def test_package_import_fx_quantization_patterns(self):
-        self._test_package_import(
-            'fx.quantization_patterns',
-            new_package_name='fx.quantize_handler',
-        )
-
     def test_function_import_fx_quantization_patterns(self):
         function_list = [
             'QuantizeHandler',
@@ -130,9 +106,6 @@ def test_function_import_fx_quantization_patterns(self):
             new_package_name='fx.quantize_handler',
         )
 
-    def test_package_import_fx_match_utils(self):
-        self._test_package_import('fx.match_utils')
-
     def test_function_import_fx_match_utils(self):
         function_list = [
             '_MatchResult',
@@ -142,37 +115,22 @@ def test_function_import_fx_match_utils(self):
         ]
         self._test_function_import('fx.match_utils', function_list)
 
-    def test_package_import_fx_prepare(self):
-        self._test_package_import('fx.prepare')
-
     def test_function_import_fx_prepare(self):
         function_list = [
             'prepare'
         ]
         self._test_function_import('fx.prepare', function_list)
 
-    def test_package_import_fx_convert(self):
-        self._test_package_import('fx.convert')
-
     def test_function_import_fx_convert(self):
         function_list = [
             'convert'
         ]
         self._test_function_import('fx.convert', function_list)
 
-    def test_package_import_fx_fuse(self):
-        self._test_package_import('fx.fuse')
-
     def test_function_import_fx_fuse(self):
         function_list = ['fuse']
         self._test_function_import('fx.fuse', function_list)
 
-    def test_package_import_fx_fusion_patterns(self):
-        self._test_package_import(
-            'fx.fusion_patterns',
-            new_package_name='fx.fuse_handler',
-        )
-
     def test_function_import_fx_fusion_patterns(self):
         function_list = [
             'FuseHandler',
@@ -189,9 +147,6 @@ def test_function_import_fx_fusion_patterns(self):
     # new: torch.ao.quantization.utils
     # both are valid, but we'll deprecate the old path in the future
 
-    def test_package_import_fx_utils(self):
-        self._test_package_import('fx.utils')
-
     def test_function_import_fx_utils(self):
         function_list = [
             'get_custom_module_class_keys',
diff --git a/test/quantization/bc/test_backward_compatibility.py b/test/quantization/bc/test_backward_compatibility.py
index 83f2c790a6eb..0dbe60d93166 100644
--- a/test/quantization/bc/test_backward_compatibility.py
+++ b/test/quantization/bc/test_backward_compatibility.py
@@ -11,7 +11,7 @@
 import torch.nn as nn
 import torch.ao.nn.quantized as nnq
 import torch.ao.nn.quantized.dynamic as nnqd
-import torch.nn.intrinsic.quantized as nniq
+import torch.ao.nn.intrinsic.quantized as nniq
 from torch.fx import GraphModule
 
 # Testing utils
@@ -173,7 +173,7 @@ def _do_quant_transforms(
         ) -> torch.nn.Module:
             example_inputs = (input_tensor,)
             # do the quantizaton transforms and save result
-            qconfig = torch.quantization.get_default_qconfig('fbgemm')
+            qconfig = torch.ao.quantization.get_default_qconfig('fbgemm')
             mp = quantize_fx.prepare_fx(m, {'': qconfig}, example_inputs=example_inputs)
             mp(input_tensor)
             mq = quantize_fx.convert_fx(mp)
@@ -360,7 +360,7 @@ def test_per_tensor_observer(self):
     def test_default_qat_qconfig(self):
         class Model(nn.Module):
             def __init__(self):
-                super(Model, self).__init__()
+                super().__init__()
                 self.linear = nn.Linear(5, 5)
                 self.relu = nn.ReLU()
 
diff --git a/test/quantization/core/experimental/apot_fx_graph_mode_ptq.py b/test/quantization/core/experimental/apot_fx_graph_mode_ptq.py
index c19384294734..cbf3cb675629 100644
--- a/test/quantization/core/experimental/apot_fx_graph_mode_ptq.py
+++ b/test/quantization/core/experimental/apot_fx_graph_mode_ptq.py
@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
-import torch.quantization
+import torch.ao.quantization
 from torchvision.models.quantization.resnet import resnet18
 from torch.ao.quantization.experimental.quantization_helper import (
     evaluate,
@@ -25,8 +25,8 @@
 Prepare models
 """
 
-# Note that this is temporary, we'll expose these functions to torch.quantization after official releasee
-from torch.quantization.quantize_fx import prepare_qat_fx
+# Note that this is temporary, we'll expose these functions to torch.ao.quantization after official releasee
+from torch.ao.quantization.quantize_fx import prepare_qat_fx
 
 def calibrate(model, data_loader):
     model.eval()
diff --git a/test/quantization/core/experimental/quantization_util.py b/test/quantization/core/experimental/quantization_util.py
index fcba45b765c9..b96e297994de 100644
--- a/test/quantization/core/experimental/quantization_util.py
+++ b/test/quantization/core/experimental/quantization_util.py
@@ -2,7 +2,7 @@
 import torchvision
 import torchvision.transforms.transforms as transforms
 import os
-import torch.quantization
+import torch.ao.quantization
 from torchvision.models.quantization.resnet import resnet18
 from torch.autograd import Variable
 
@@ -15,7 +15,7 @@
 )
 warnings.filterwarnings(
     action='default',
-    module=r'torch.quantization'
+    module=r'torch.ao.quantization'
 )
 
 """
@@ -28,7 +28,7 @@
 train_batch_size = 30
 eval_batch_size = 50
 
-class AverageMeter(object):
+class AverageMeter:
     """Computes and stores the average and current value"""
     def __init__(self, name, fmt=':f'):
         self.name = name
diff --git a/test/quantization/core/test_backend_config.py b/test/quantization/core/test_backend_config.py
index 7f44809b3676..3cb6dcc9c4a3 100644
--- a/test/quantization/core/test_backend_config.py
+++ b/test/quantization/core/test_backend_config.py
@@ -1,9 +1,9 @@
 # Owner(s): ["oncall: quantization"]
 
 import torch
-import torch.nn.intrinsic as nni
-import torch.nn.qat as nnqat
-import torch.nn.quantized._reference as nnqr
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.qat as nnqat
+import torch.ao.nn.quantized.reference as nnqr
 from torch.testing._internal.common_quantization import QuantizationTestCase
 
 from torch.ao.quantization.backend_config import (
@@ -137,8 +137,7 @@ def _get_backend_op_config2(self):
             ._set_root_node_getter(_default_root_node_getter) \
             ._set_extra_inputs_getter(self._extra_inputs_getter) \
             ._set_num_tensor_args_to_observation_type(self._num_tensor_args_to_observation_type) \
-            ._set_input_type_to_index(self._input_type_to_index) \
-            ._set_input_output_observed(False)
+            ._set_input_type_to_index(self._input_type_to_index)
 
     def _get_backend_pattern_config_dict1(self):
         return {
@@ -161,7 +160,6 @@ def _get_backend_pattern_config_dict2(self):
             "extra_inputs_getter": self._extra_inputs_getter,
             "num_tensor_args_to_observation_type": self._num_tensor_args_to_observation_type,
             "input_type_to_index": self._input_type_to_index,
-            "input_output_observed": False,
         }
 
     def test_backend_op_config_set_observation_type(self):
@@ -233,12 +231,6 @@ def test_backend_op_config_set_input_type_to_index(self):
         conf._set_input_type_to_index(self._input_type_to_index)
         self.assertEqual(conf._input_type_to_index, self._input_type_to_index)
 
-    def test_backend_op_config_set_input_output_observed(self):
-        conf = BackendPatternConfig(torch.nn.Embedding)
-        self.assertTrue(conf._input_output_observed is None)
-        conf._set_input_output_observed(False)
-        self.assertEqual(conf._input_output_observed, False)
-
     def test_backend_op_config_from_dict(self):
         conf_dict1 = self._get_backend_pattern_config_dict1()
         conf1 = BackendPatternConfig.from_dict(conf_dict1)
@@ -253,7 +245,6 @@ def test_backend_op_config_from_dict(self):
         self.assertTrue(conf1._extra_inputs_getter is None)
         self.assertEqual(len(conf1._num_tensor_args_to_observation_type), 0)
         self.assertEqual(len(conf1._input_type_to_index), 0)
-        self.assertTrue(conf1._input_output_observed is None)
         # Test temporary/internal keys
         conf_dict2 = self._get_backend_pattern_config_dict2()
         conf2 = BackendPatternConfig.from_dict(conf_dict2)
@@ -268,7 +259,6 @@ def test_backend_op_config_from_dict(self):
         self.assertEqual(conf2._extra_inputs_getter, self._extra_inputs_getter)
         self.assertEqual(conf2._num_tensor_args_to_observation_type, self._num_tensor_args_to_observation_type)
         self.assertEqual(conf2._input_type_to_index, self._input_type_to_index)
-        self.assertEqual(conf2._input_output_observed, False)
 
     def test_backend_op_config_to_dict(self):
         conf1 = self._get_backend_op_config1()
diff --git a/test/quantization/core/test_docs.py b/test/quantization/core/test_docs.py
index 27842b46ce7e..ab41c51388ba 100644
--- a/test/quantization/core/test_docs.py
+++ b/test/quantization/core/test_docs.py
@@ -6,7 +6,7 @@
 
 import torch
 
-# import torch.nn.quantized as nnq
+# import torch.ao.nn.quantized as nnq
 from torch.testing._internal.common_quantization import (
     QuantizationTestCase,
     SingleLayerLinearModel,
@@ -25,7 +25,7 @@ class TestQuantizationDocs(QuantizationTestCase):
 
     def run(self, result=None):
         with override_quantized_engine("qnnpack") if IS_ARM64 else contextlib.nullcontext():
-            super(TestQuantizationDocs, self).run(result)
+            super().run(result)
 
     def _get_code(
         self, path_from_pytorch, unique_identifier, offset=2, short_snippet=False
@@ -140,7 +140,7 @@ def test_quantization_doc_custom(self):
         path_from_pytorch = "docs/source/quantization.rst"
         unique_identifier = "Custom API Example::"
 
-        global_inputs = {"nnq": torch.nn.quantized}
+        global_inputs = {"nnq": torch.ao.nn.quantized}
 
         code = self._get_code(path_from_pytorch, unique_identifier)
         self._test_code(code, global_inputs)
diff --git a/test/quantization/core/test_quantized_module.py b/test/quantization/core/test_quantized_module.py
index ef971d1c874b..41d82355ce9f 100644
--- a/test/quantization/core/test_quantized_module.py
+++ b/test/quantization/core/test_quantized_module.py
@@ -2,7 +2,7 @@
 
 import torch
 import torch.nn as nn
-import torch.nn.intrinsic as nni
+import torch.ao.nn.intrinsic as nni
 import torch.ao.nn.intrinsic.quantized as nniq
 import torch.ao.nn.quantized.reference as nnqr
 import torch.ao.quantization
@@ -22,6 +22,7 @@
     skipIfNoFBGEMM,
     lengths_to_offsets,
     skipIfNoONEDNN,
+    _make_conv_add_extra_input_tensor,
 )
 from torch.testing._internal.common_quantized import (
     _calculate_dynamic_qparams,
@@ -263,7 +264,7 @@ def _test_conv_api_impl(
             in_channels_per_group, input_feature_map_size, out_channels_per_group,
             groups, kernel_size, stride, padding, padding_mode, dilation,
             X_scale, X_zero_point, W_scale, W_zero_point, Y_scale, Y_zero_point,
-            use_bias, use_fused, use_channelwise):
+            use_bias, post_op, use_channelwise, X2_scale=1.0, X2_zero_point=0):
         for i in range(len(kernel_size)):
             assume(input_feature_map_size[i] + 2 * padding[i]
                    >= dilation[i] * (kernel_size[i] - 1) + 1)
@@ -274,6 +275,14 @@ def _test_conv_api_impl(
             batch_size, in_channels_per_group, input_feature_map_size,
             out_channels_per_group, groups, kernel_size, X_scale, X_zero_point,
             W_scale, W_zero_point, use_bias, use_channelwise)
+        example_input = [X, ]
+        example_input_q = [X_q, ]
+
+        if post_op in ["add", "add_relu"]:
+            X2, X2_q = _make_conv_add_extra_input_tensor(X2_scale, X2_zero_point, conv_module[0](X).size())
+            example_input = [X, X2]
+            example_input_q = [X_q, X2_q]
+
         # Make sure the weight shape is correct
         self.assertTrue(qconv_module.weight().shape == W_q.shape)
 
@@ -281,14 +290,10 @@ def _test_conv_api_impl(
         qconv_module.scale = Y_scale
         qconv_module.zero_point = Y_zero_point
 
-        if use_fused:
-            conv_module[0].weight.data = W
-            if use_bias:
-                conv_module[0].bias.data = b
-        else:
-            conv_module.weight.data = W
-            if use_bias:
-                conv_module.bias.data = b
+        raw_conv_module = conv_module[0] if post_op in ["relu", "add", "add_relu"] else conv_module
+        raw_conv_module.weight.data = W
+        if use_bias:
+            raw_conv_module.bias.data = b
 
         # Test members
         self.assertTrue(module_name == qconv_module._get_name(), module_name + " " + qconv_module._get_name())
@@ -304,10 +309,10 @@ def _test_conv_api_impl(
         self.assertEqual(Y_zero_point, qconv_module.zero_point)
 
         # Test forward
-        Y_exp = conv_module(X)
+        Y_exp = conv_module(*example_input)
         Y_exp = torch.quantize_per_tensor(
             Y_exp, scale=Y_scale, zero_point=Y_zero_point, dtype=torch.quint8)
-        Y_act = qconv_module(X_q)
+        Y_act = qconv_module(*example_input_q)
 
         # Make sure the results match
         # assert_array_almost_equal compares using the following formula:
@@ -351,7 +356,7 @@ def _test_conv_api_impl(
         self.assertEqual(qconv_module.scale, loaded_qconv_module.scale)
         self.assertEqual(qconv_module.zero_point,
                          loaded_qconv_module.zero_point)
-        Y_loaded = loaded_qconv_module(X_q)
+        Y_loaded = loaded_qconv_module(*example_input_q)
         np.testing.assert_array_almost_equal(
             Y_exp.int_repr().numpy(), Y_loaded.int_repr().numpy(), decimal=0)
 
@@ -372,7 +377,7 @@ def _test_conv_api_impl(
         self.assertEqual(copied_conv.scale, qconv_module.scale)
         self.assertEqual(copied_conv.zero_point,
                          qconv_module.zero_point)
-        Y_copied = copied_conv(X_q)
+        Y_copied = copied_conv(*example_input_q)
         np.testing.assert_array_almost_equal(
             Y_exp.int_repr().numpy(), Y_copied.int_repr().numpy(), decimal=0)
 
@@ -381,20 +386,29 @@ def _test_conv_api_impl(
         self.assertEqual(deepcopied_conv.scale, qconv_module.scale)
         self.assertEqual(deepcopied_conv.zero_point,
                          qconv_module.zero_point)
-        Y_deepcopied = copied_conv(X_q)
+        Y_deepcopied = deepcopied_conv(*example_input_q)
         np.testing.assert_array_almost_equal(
             Y_exp.int_repr().numpy(), Y_deepcopied.int_repr().numpy(), decimal=0)
 
         # JIT testing
         self.checkScriptable(
-            qconv_module, [[X_q]],
+            qconv_module, [example_input_q],
             check_save_load=True)
 
+        class _FusedModule_two_input_args(torch.ao.nn.intrinsic._FusedModule):
+            # Help Module for ConvAdd2d since torch.ao.nn.intrinsic._FusedModule only support one input arg
+            def forward(self, x1, x2):
+                input = self[0](x1, x2)
+                return input
+
         # Test from_float
-        fused_conv_module = torch.nn.intrinsic._FusedModule(conv_module)
+        fused_conv_module = _FusedModule_two_input_args(conv_module) \
+            if post_op in ["add", "add_relu"] else torch.ao.nn.intrinsic._FusedModule(conv_module)
+
         fused_conv_module.qconfig = torch.ao.quantization.default_qconfig
         torch.ao.quantization.prepare(fused_conv_module, inplace=True)
-        fused_conv_module(X.float())
+        example_input[0] = example_input[0].float()
+        fused_conv_module(*example_input)
         converted_qconv_module = fused_conv_module
         reference_mapping = get_default_static_quant_module_mappings()
         reference_mapping[type(conv_module)] = type(qconv_module)
@@ -402,12 +416,8 @@ def _test_conv_api_impl(
 
         # Smoke test to make sure the module actually runs
         if use_bias:
-            if use_fused:
-                self.assertEqual(conv_module[0].bias,
-                                 converted_qconv_module[0].bias())
-            else:
-                self.assertEqual(conv_module.bias,
-                                 converted_qconv_module[0].bias())
+            self.assertEqual(conv_module[0].bias if (post_op in ["relu", "add", "add_relu"]) else conv_module.bias,
+                             converted_qconv_module[0].bias())
         # Smoke test extra_repr
         self.assertTrue(module_name == converted_qconv_module[0]._get_name())
 
@@ -416,10 +426,9 @@ def test_conv1d_api(self):
         options = itertools.product(
             ["zeros", "reflect"],  # pad_mode
             [True, False],  # use_bias
-            [True, False],  # use_fused
             [True, False],  # use_channelwise
         )
-        for pad_mode, use_bias, use_fused, use_channelwise in options:
+        for pad_mode, use_bias, use_channelwise in options:
             if torch.backends.quantized.engine == "qnnpack":
                 use_channelwise = False
             batch_size = 2
@@ -447,13 +456,60 @@ def test_conv1d_api(self):
             Y_zero_point = 4
             if torch.backends.quantized.engine == 'qnnpack':
                 use_channelwise = False
-            # use_fused -> quantized class
-            class_map = {
-                True: (nniq.ConvReLU1d, "QuantizedConvReLU1d"),
-                False: (nnq.Conv1d, "QuantizedConv1d")
-            }
+            qconv_cls = nnq.Conv1d
+            module_name = "QuantizedConv1d"
+            qconv_module = qconv_cls(
+                in_channels, out_channels, kernel, stride, pad,
+                dilation, groups, use_bias, padding_mode=pad_mode
+            )
 
-            qconv_cls, module_name = class_map[use_fused]
+            conv_module = nn.Conv1d(
+                in_channels, out_channels, kernel, stride, pad,
+                dilation, groups, use_bias, padding_mode=pad_mode)
+            conv_module = conv_module.float()
+
+            self._test_conv_api_impl(
+                module_name, qconv_module, conv_module, batch_size,
+                in_channels_per_group, input_feature_map_size,
+                out_channels_per_group, groups, kernel_size, stride, pad, pad_mode,
+                dilation, X_scale, X_zero_point, W_scale, W_zero_point, Y_scale,
+                Y_zero_point, use_bias, "none", use_channelwise)
+
+    @override_qengines
+    def test_conv1d_relu_api(self):
+        options = itertools.product(
+            ["zeros", "reflect"],  # pad_mode
+            [True, False],  # use_bias
+            [True, False],  # use_channelwise
+        )
+        batch_size = 2
+        in_channels_per_group = 2
+        length = 8
+        out_channels_per_group = 2
+        groups = 3
+        kernel = 3
+        stride = 2
+        pad = 1
+        dilation = 1
+        # Tests the correctness of the conv2d module.
+        in_channels = in_channels_per_group * groups
+        out_channels = out_channels_per_group * groups
+        input_feature_map_size = (length,)
+        kernel_size = (kernel, )
+        stride = (stride, )
+        pad = (pad, )
+        dilation = (dilation, )
+        X_scale = 1.3
+        X_zero_point = 2
+        W_scale = [0.5]
+        W_zero_point = [0] if qengine_is_onednn() else [3]
+        Y_scale = 5.0
+        Y_zero_point = 4
+        qconv_cls = nniq.ConvReLU1d
+        module_name = "QuantizedConvReLU1d"
+        for pad_mode, use_bias, use_channelwise in options:
+            if torch.backends.quantized.engine == 'qnnpack':
+                use_channelwise = False
             qconv_module = qconv_cls(
                 in_channels, out_channels, kernel, stride, pad,
                 dilation, groups, use_bias, padding_mode=pad_mode
@@ -462,9 +518,8 @@ def test_conv1d_api(self):
             conv_module = nn.Conv1d(
                 in_channels, out_channels, kernel, stride, pad,
                 dilation, groups, use_bias, padding_mode=pad_mode)
-            if use_fused:
-                relu_module = nn.ReLU()
-                conv_module = nni.ConvReLU1d(conv_module, relu_module)
+            relu_module = nn.ReLU()
+            conv_module = nni.ConvReLU1d(conv_module, relu_module)
             conv_module = conv_module.float()
 
             self._test_conv_api_impl(
@@ -472,17 +527,16 @@ def test_conv1d_api(self):
                 in_channels_per_group, input_feature_map_size,
                 out_channels_per_group, groups, kernel_size, stride, pad, pad_mode,
                 dilation, X_scale, X_zero_point, W_scale, W_zero_point, Y_scale,
-                Y_zero_point, use_bias, use_fused, use_channelwise)
+                Y_zero_point, use_bias, "relu", use_channelwise)
 
     @override_qengines
     def test_conv2d_api(self):
         options = itertools.product(
             ["zeros", "reflect"],  # pad_mode
             [True, False],  # use_bias
-            [True, False],  # use_fused
             [True, False],  # use_channelwise
         )
-        for pad_mode, use_bias, use_fused, use_channelwise in options:
+        for pad_mode, use_bias, use_channelwise in options:
             if torch.backends.quantized.engine == "qnnpack":
                 use_channelwise = False
             batch_size = 2
@@ -512,13 +566,64 @@ def test_conv2d_api(self):
             W_zero_point = [0] if qengine_is_onednn() else [3]
             Y_scale = 5.0
             Y_zero_point = 4
-            # use_fused -> quantized class
-            class_map = {
-                True: (nniq.ConvReLU2d, "QuantizedConvReLU2d"),
-                False: (nnq.Conv2d, "QuantizedConv2d")
-            }
+            qconv_cls = nnq.Conv2d
+            module_name = "QuantizedConv2d"
+            qconv_module = qconv_cls(
+                in_channels, out_channels, kernel_size, stride, padding,
+                dilation, groups, use_bias, padding_mode=pad_mode
+            )
+
+            conv_module = nn.Conv2d(
+                in_channels, out_channels, kernel_size, stride, padding,
+                dilation, groups, use_bias, padding_mode=pad_mode)
+            conv_module = conv_module.float()
+
+            self._test_conv_api_impl(
+                module_name, qconv_module, conv_module, batch_size,
+                in_channels_per_group, input_feature_map_size,
+                out_channels_per_group, groups, kernel_size, stride, padding,
+                pad_mode, dilation, X_scale, X_zero_point, W_scale, W_zero_point,
+                Y_scale, Y_zero_point, use_bias, "none", use_channelwise)
 
-            qconv_cls, module_name = class_map[use_fused]
+    @override_qengines
+    def test_conv2d_relu_api(self):
+        options = itertools.product(
+            ["zeros", "reflect"],  # pad_mode
+            [True, False],  # use_bias
+            [True, False],  # use_channelwise
+        )
+        batch_size = 2
+        in_channels_per_group = 2
+        H = 8
+        W = 8
+        out_channels_per_group = 2
+        groups = 3
+        kernel_h = 3
+        kernel_w = 3
+        stride_h = 2
+        stride_w = 2
+        pad_h = 1
+        pad_w = 1
+        dilation = 1
+        # Tests the correctness of the conv2d module.
+        in_channels = in_channels_per_group * groups
+        out_channels = out_channels_per_group * groups
+        input_feature_map_size = (H, W)
+        kernel_size = (kernel_h, kernel_w)
+        stride = (stride_h, stride_w)
+        padding = (pad_h, pad_w)
+        dilation = (dilation, dilation)
+        X_scale = 1.3
+        X_zero_point = 2
+        W_scale = [0.5]
+        W_zero_point = [0] if qengine_is_onednn() else [3]
+        Y_scale = 5.0
+        Y_zero_point = 4
+        qconv_cls = nniq.ConvReLU2d
+        module_name = "QuantizedConvReLU2d"
+        for pad_mode, use_bias, use_channelwise in options:
+            if torch.backends.quantized.engine == "qnnpack":
+                use_channelwise = False
             qconv_module = qconv_cls(
                 in_channels, out_channels, kernel_size, stride, padding,
                 dilation, groups, use_bias, padding_mode=pad_mode
@@ -527,9 +632,8 @@ def test_conv2d_api(self):
             conv_module = nn.Conv2d(
                 in_channels, out_channels, kernel_size, stride, padding,
                 dilation, groups, use_bias, padding_mode=pad_mode)
-            if use_fused:
-                relu_module = nn.ReLU()
-                conv_module = nni.ConvReLU2d(conv_module, relu_module)
+            relu_module = nn.ReLU()
+            conv_module = nni.ConvReLU2d(conv_module, relu_module)
             conv_module = conv_module.float()
 
             self._test_conv_api_impl(
@@ -537,78 +641,251 @@ def test_conv2d_api(self):
                 in_channels_per_group, input_feature_map_size,
                 out_channels_per_group, groups, kernel_size, stride, padding,
                 pad_mode, dilation, X_scale, X_zero_point, W_scale, W_zero_point,
-                Y_scale, Y_zero_point, use_bias, use_fused, use_channelwise)
+                Y_scale, Y_zero_point, use_bias, "relu", use_channelwise)
 
     @skipIfNoFBGEMM
     def test_conv3d_api(self):
         options = itertools.product(
             [True, False],  # use_bias
-            [True, False],  # use_fused
             [True, False],  # use_channelwise
         )
-        for use_bias, use_fused, use_channelwise in options:
+        batch_size = 2
+        in_channels_per_group = 2
+        H = 8
+        W = 8
+        D = 8
+        out_channels_per_group = 2
+        groups = 3
+        kernel_h = 3
+        kernel_w = 3
+        kernel_d = 3
+        stride_h = 2
+        stride_w = 2
+        stride_d = 2
+        pad_mode = "zeros"  # 3d doesn't support reflect padding
+        pad_h = 1
+        pad_w = 1
+        pad_d = 1
+        dilation = 1
+        # Tests the correctness of the conv3d module.
+        in_channels = in_channels_per_group * groups
+        out_channels = out_channels_per_group * groups
+        input_feature_map_size = (D, H, W)
+        kernel_size = (kernel_d, kernel_h, kernel_w)
+        stride = (stride_d, stride_h, stride_w)
+        padding = (pad_d, pad_h, pad_w)
+        dilation = (dilation, dilation, dilation)
+        X_scale = 1.3
+        X_zero_point = 2
+        W_scale = [0.5]
+        W_zero_point = [0] if qengine_is_onednn() else [3]
+        Y_scale = 5.0
+        Y_zero_point = 4
+        qconv_cls = nnq.Conv3d
+        module_name = "QuantizedConv3d"
+        for use_bias, use_channelwise in options:
+            if torch.backends.quantized.engine == "qnnpack":
+                use_channelwise = False
+            with override_quantized_engine('fbgemm'):
+                qconv_module = qconv_cls(
+                    in_channels, out_channels, kernel_size, stride, padding,
+                    dilation, groups, use_bias, padding_mode=pad_mode
+                )
+
+                conv_module = nn.Conv3d(
+                    in_channels, out_channels, kernel_size, stride, padding,
+                    dilation, groups, use_bias, padding_mode=pad_mode)
+                conv_module = conv_module.float()
+
+                self._test_conv_api_impl(
+                    module_name, qconv_module, conv_module, batch_size,
+                    in_channels_per_group, input_feature_map_size,
+                    out_channels_per_group, groups, kernel_size, stride, padding,
+                    pad_mode, dilation, X_scale, X_zero_point, W_scale,
+                    W_zero_point, Y_scale, Y_zero_point, use_bias, "none",
+                    use_channelwise)
+
+    @skipIfNoFBGEMM
+    def test_conv3d_relu_api(self):
+        options = itertools.product(
+            [True, False],  # use_bias
+            [True, False],  # use_channelwise
+        )
+        batch_size = 2
+        in_channels_per_group = 2
+        H = 8
+        W = 8
+        D = 8
+        out_channels_per_group = 2
+        groups = 3
+        kernel_h = 3
+        kernel_w = 3
+        kernel_d = 3
+        stride_h = 2
+        stride_w = 2
+        stride_d = 2
+        pad_mode = "zeros"  # 3d doesn't support reflect padding
+        pad_h = 1
+        pad_w = 1
+        pad_d = 1
+        dilation = 1
+        # Tests the correctness of the conv3d module.
+        in_channels = in_channels_per_group * groups
+        out_channels = out_channels_per_group * groups
+        input_feature_map_size = (D, H, W)
+        kernel_size = (kernel_d, kernel_h, kernel_w)
+        stride = (stride_d, stride_h, stride_w)
+        padding = (pad_d, pad_h, pad_w)
+        dilation = (dilation, dilation, dilation)
+        X_scale = 1.3
+        X_zero_point = 2
+        W_scale = [0.5]
+        W_zero_point = [0] if qengine_is_onednn() else [3]
+        Y_scale = 5.0
+        Y_zero_point = 4
+        qconv_cls = nniq.ConvReLU3d
+        module_name = "QuantizedConvReLU3d"
+        for use_bias, use_channelwise in options:
             if torch.backends.quantized.engine == "qnnpack":
                 use_channelwise = False
+            with override_quantized_engine('fbgemm'):
+                qconv_module = qconv_cls(
+                    in_channels, out_channels, kernel_size, stride, padding,
+                    dilation, groups, use_bias, padding_mode=pad_mode
+                )
+
+                conv_module = nn.Conv3d(
+                    in_channels, out_channels, kernel_size, stride, padding,
+                    dilation, groups, use_bias, padding_mode=pad_mode)
+                relu_module = nn.ReLU()
+                conv_module = nni.ConvReLU3d(conv_module, relu_module)
+                conv_module = conv_module.float()
+
+                self._test_conv_api_impl(
+                    module_name, qconv_module, conv_module, batch_size,
+                    in_channels_per_group, input_feature_map_size,
+                    out_channels_per_group, groups, kernel_size, stride, padding,
+                    pad_mode, dilation, X_scale, X_zero_point, W_scale,
+                    W_zero_point, Y_scale, Y_zero_point, use_bias, "relu",
+                    use_channelwise)
+
+    @skipIfNoONEDNN
+    def test_conv2d_add(self):
+        """test API functionality for nn.intrinsic.quantized.ConvAdd2d"""
+        with override_quantized_engine('onednn'):
+            options = itertools.product(
+                ["zeros", "reflect"],  # pad_mode
+                [True, False],  # use_bias
+                [True, False],  # use_channelwise
+            )
             batch_size = 2
             in_channels_per_group = 2
             H = 8
             W = 8
-            D = 8
             out_channels_per_group = 2
             groups = 3
             kernel_h = 3
             kernel_w = 3
-            kernel_d = 3
             stride_h = 2
             stride_w = 2
-            stride_d = 2
-            pad_mode = "zeros"  # 3d doesn't support reflect padding
             pad_h = 1
             pad_w = 1
-            pad_d = 1
             dilation = 1
-            # Tests the correctness of the conv3d module.
+            # Tests the correctness of the conv2d module.
             in_channels = in_channels_per_group * groups
             out_channels = out_channels_per_group * groups
-            input_feature_map_size = (D, H, W)
-            kernel_size = (kernel_d, kernel_h, kernel_w)
-            stride = (stride_d, stride_h, stride_w)
-            padding = (pad_d, pad_h, pad_w)
-            dilation = (dilation, dilation, dilation)
+            input_feature_map_size = (H, W)
+            kernel_size = (kernel_h, kernel_w)
+            stride = (stride_h, stride_w)
+            padding = (pad_h, pad_w)
+            dilation = (dilation, dilation)
             X_scale = 1.3
             X_zero_point = 2
+            X2_scale = 1.2
+            X2_zero_point = 1
             W_scale = [0.5]
             W_zero_point = [0] if qengine_is_onednn() else [3]
             Y_scale = 5.0
             Y_zero_point = 4
-            # use_fused -> quantized class
-            class_map = {
-                True: (nniq.ConvReLU3d, "QuantizedConvReLU3d"),
-                False: (nnq.Conv3d, "QuantizedConv3d")
-            }
+            qconv_cls = nniq.ConvAdd2d
+            module_name = "QuantizedConvAdd2d"
+            for pad_mode, use_bias, use_channelwise in options:
+                qconv_module = qconv_cls(
+                    in_channels, out_channels, kernel_size, stride, padding,
+                    dilation, groups, use_bias, padding_mode=pad_mode
+                )
 
-            with override_quantized_engine('fbgemm'):
-                qconv_cls, module_name = class_map[use_fused]
+                conv_module = nn.Conv2d(
+                    in_channels, out_channels, kernel_size, stride, padding,
+                    dilation, groups, use_bias, padding_mode=pad_mode)
+                conv_module = torch.ao.nn.intrinsic.ConvAdd2d(conv_module, torch.add)
+                conv_module = conv_module.float()
+
+                self._test_conv_api_impl(
+                    module_name, qconv_module, conv_module, batch_size,
+                    in_channels_per_group, input_feature_map_size,
+                    out_channels_per_group, groups, kernel_size, stride, padding,
+                    pad_mode, dilation, X_scale, X_zero_point, W_scale, W_zero_point,
+                    Y_scale, Y_zero_point, use_bias, "add", use_channelwise, X2_scale, X2_zero_point)
+
+    @skipIfNoONEDNN
+    def test_conv2d_add_relu(self):
+        """test API functionality for nn.intrinsic.quantized.ConvAdd2d"""
+        with override_quantized_engine('onednn'):
+            options = itertools.product(
+                ["zeros", "reflect"],  # pad_mode
+                [True, False],  # use_bias
+                [True, False],  # use_channelwise
+            )
+            batch_size = 2
+            in_channels_per_group = 2
+            H = 8
+            W = 8
+            out_channels_per_group = 2
+            groups = 3
+            kernel_h = 3
+            kernel_w = 3
+            stride_h = 2
+            stride_w = 2
+            pad_h = 1
+            pad_w = 1
+            dilation = 1
+            # Tests the correctness of the conv2d module.
+            in_channels = in_channels_per_group * groups
+            out_channels = out_channels_per_group * groups
+            input_feature_map_size = (H, W)
+            kernel_size = (kernel_h, kernel_w)
+            stride = (stride_h, stride_w)
+            padding = (pad_h, pad_w)
+            dilation = (dilation, dilation)
+            X_scale = 1.3
+            X_zero_point = 2
+            X2_scale = 1.2
+            X2_zero_point = 1
+            W_scale = [0.5]
+            W_zero_point = [0] if qengine_is_onednn() else [3]
+            Y_scale = 5.0
+            Y_zero_point = 4
+            qconv_cls = nniq.ConvAddReLU2d
+            module_name = "QuantizedConvAddReLU2d"
+            for pad_mode, use_bias, use_channelwise in options:
                 qconv_module = qconv_cls(
                     in_channels, out_channels, kernel_size, stride, padding,
                     dilation, groups, use_bias, padding_mode=pad_mode
                 )
 
-                conv_module = nn.Conv3d(
+                conv_module = nn.Conv2d(
                     in_channels, out_channels, kernel_size, stride, padding,
                     dilation, groups, use_bias, padding_mode=pad_mode)
-                if use_fused:
-                    relu_module = nn.ReLU()
-                    conv_module = nni.ConvReLU3d(conv_module, relu_module)
+                conv_module = torch.ao.nn.intrinsic.ConvAddReLU2d(conv_module, torch.add, nn.ReLU())
                 conv_module = conv_module.float()
 
                 self._test_conv_api_impl(
                     module_name, qconv_module, conv_module, batch_size,
                     in_channels_per_group, input_feature_map_size,
                     out_channels_per_group, groups, kernel_size, stride, padding,
-                    pad_mode, dilation, X_scale, X_zero_point, W_scale,
-                    W_zero_point, Y_scale, Y_zero_point, use_bias, use_fused,
-                    use_channelwise)
+                    pad_mode, dilation, X_scale, X_zero_point, W_scale, W_zero_point,
+                    Y_scale, Y_zero_point, use_bias, "add_relu", use_channelwise, X2_scale, X2_zero_point)
 
     def test_pool_api(self):
         """Tests the correctness of the pool module.
@@ -663,7 +940,7 @@ def _test_dropout_serialization(self, get_model, data1, data2):
         ref1 = mq1(data2)
 
         m2 = get_model()
-        m2.qconfig = torch.quantization.default_qconfig
+        m2.qconfig = torch.ao.quantization.default_qconfig
         mp2 = torch.ao.quantization.prepare(m2)
         mq2 = torch.ao.quantization.convert(mp2)
 
@@ -732,7 +1009,7 @@ def _test_batch_norm_serialization(self, get_model, data1, data2):
         ref1 = mq1(data2)
 
         m2 = get_model()
-        m2.qconfig = torch.quantization.default_qconfig
+        m2.qconfig = torch.ao.quantization.default_qconfig
         mp2 = torch.ao.quantization.prepare(m2)
         mq2 = torch.ao.quantization.convert(mp2)
 
@@ -926,6 +1203,25 @@ def test_leaky_relu(self):
     def test_sigmoid(self):
         self._test_activation_module_impl("Sigmoid", nn.Sigmoid, nnq.Sigmoid, {})
 
+    def _test_hard_swish_serialization(self):
+        scale_original = 10.0 / 256
+        zero_point_original = 1.0
+
+        quant_mod_original = nnq.Hardswish(scale_original, zero_point_original)
+        state_dict = quant_mod_original.state_dict()
+
+        scale_new = 5.0 / 256
+        zero_point_new = 2.0
+        quant_mod_new = nnq.Hardswish(scale_new, zero_point_new)
+        quant_mod_new.load_state_dict(state_dict)
+
+        self.assertEqual(quant_mod_original.scale, quant_mod_new.scale)
+        self.assertEqual(quant_mod_original.zero_point, quant_mod_new.zero_point)
+
+    def test_hard_swish(self):
+        self._test_activation_module_impl("Hardswish", nn.Hardswish, nnq.Hardswish, {})
+        self._test_hard_swish_serialization()
+
     @given(
         num_embeddings=st.integers(10, 50),
         embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0),
diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 0106eac52c60..ed37552e1ce9 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -330,7 +330,7 @@ def test_qhardsigmoid(self):
             {
                 'quantized_fn': [
                     torch.ao.nn.quantized.functional.hardsigmoid,
-                    torch.nn.quantized.functional.hardsigmoid,
+                    torch.ao.nn.quantized.functional.hardsigmoid,
                 ],
                 'reference_fn': torch.nn.functional.hardsigmoid,
                 'output_range': (0.0, 1.0),
@@ -339,7 +339,7 @@ def test_qhardsigmoid(self):
             {
                 'quantized_fn': [
                     torch.ao.nn.quantized.functional.hardsigmoid,
-                    torch.nn.quantized.functional.hardsigmoid,
+                    torch.ao.nn.quantized.functional.hardsigmoid,
                 ],
                 'reference_fn': torch.nn.functional.hardsigmoid,
                 'output_range': (0.0, 1.0),
@@ -661,7 +661,7 @@ def test_qthreshold(self, X, threshold, value):
         ops_under_test = {
             'native': torch.threshold,
             'nn.functional': torch.nn.functional.threshold,
-            'nn.quantized.functional': torch.nn.quantized.functional.threshold,
+            'nn.quantized.functional': torch.ao.nn.quantized.functional.threshold,
             'ao.nn.quantized.functional': torch.ao.nn.quantized.functional.threshold,
         }
 
@@ -734,7 +734,7 @@ def test_hardtanh(self, X, min_val, max_val):
 
             ops_under_test = {
                 'nn.quantized.functional.hardtanh':
-                    torch.nn.quantized.functional.hardtanh,
+                    torch.ao.nn.quantized.functional.hardtanh,
                 'ao.nn.quantized.functional.hardtanh':
                     torch.ao.nn.quantized.functional.hardtanh,
             }
@@ -745,7 +745,7 @@ def test_hardtanh(self, X, min_val, max_val):
 
             ops_under_test_inplace = {
                 'inplace nn.quantized.functional.hardtanh':
-                    torch.nn.quantized.functional.hardtanh,
+                    torch.ao.nn.quantized.functional.hardtanh,
                 'inplace ao.nn.quantized.functional.hardtanh':
                     torch.ao.nn.quantized.functional.hardtanh,
             }
@@ -1346,7 +1346,7 @@ def test_max_pool1d(self, X, kernel, stride, dilation, padding, ceil_mode):
         ops_under_test = {
             "torch": torch.max_pool1d,
             "nn.functional": torch.nn.functional.max_pool1d,
-            "nn.quantized.functional": torch.nn.quantized.functional.max_pool1d,
+            "nn.quantized.functional": torch.ao.nn.quantized.functional.max_pool1d,
             "ao.nn.quantized.functional": torch.ao.nn.quantized.functional.max_pool1d,
         }
 
@@ -1443,7 +1443,7 @@ def test_max_pool2d(self, X, kernel, stride, dilation, padding, ceil_mode):
         ops_under_test = {
             "torch": torch.max_pool2d,
             "nn.functional": torch.nn.functional.max_pool2d,
-            "nn.quantized.functional": torch.nn.quantized.functional.max_pool2d,
+            "nn.quantized.functional": torch.ao.nn.quantized.functional.max_pool2d,
             "ao.nn.quantized.functional": torch.ao.nn.quantized.functional.max_pool2d,
         }
 
@@ -1502,7 +1502,7 @@ def test_max_pool2d_nhwc(self, X, kernel, stride, dilation, padding, ceil_mode):
         ops_under_test = {
             "torch": torch.max_pool2d,
             "nn.functional": torch.nn.functional.max_pool2d,
-            "nn.quantized.functional": torch.nn.quantized.functional.max_pool2d,
+            "nn.quantized.functional": torch.ao.nn.quantized.functional.max_pool2d,
             "ao.nn.quantized.functional": torch.ao.nn.quantized.functional.max_pool2d,
         }
 
@@ -1552,7 +1552,7 @@ def test_avg_pool2d(self, X, kernel, stride, padding, ceil_mode, count_include_p
             ceil_mode=ceil_mode, count_include_pad=count_include_pad, divisor_override=divisor_override)
         ops_under_test = {
             "nn.functional": torch.nn.functional.avg_pool2d,
-            "nn.quantized.functional": torch.nn.quantized.functional.avg_pool2d,
+            "nn.quantized.functional": torch.ao.nn.quantized.functional.avg_pool2d,
             "ao.nn.quantized.functional": torch.ao.nn.quantized.functional.avg_pool2d,
         }
         error_message = r"Results are off for {}:\n\tExpected:\n{}\n\tGot:\n{}"
@@ -1614,7 +1614,7 @@ def test_avg_pool2d_nhwc(self, X, kernel, stride, padding, ceil_mode, count_incl
         self.assertTrue(qX.stride() != sorted(qX.stride()))
         ops_under_test = {
             "nn.functional": torch.nn.functional.avg_pool2d,
-            "nn.quantized.functional": torch.nn.quantized.functional.avg_pool2d,
+            "nn.quantized.functional": torch.ao.nn.quantized.functional.avg_pool2d,
             "ao.nn.quantized.functional": torch.ao.nn.quantized.functional.avg_pool2d,
         }
         error_message = r"Results are off for {}:\n\tExpected:\n{}\n\tGot:\n{}"
@@ -1669,7 +1669,7 @@ def test_avg_pool3d(self, X, kernel, stride, padding, ceil_mode, count_include_p
 
         ops_under_test = {
             "nn.functional": torch.nn.functional.avg_pool3d,
-            "nn.quantized.functional": torch.nn.quantized.functional.avg_pool3d,
+            "nn.quantized.functional": torch.ao.nn.quantized.functional.avg_pool3d,
             "ao.nn.quantized.functional": torch.ao.nn.quantized.functional.avg_pool3d,
         }
         error_message = r"Results are off for {}:\n\tExpected:\n{}\n\tGot:\n{}"
@@ -1732,7 +1732,7 @@ def test_avg_pool3d_nhwc(self, X, kernel, stride, padding, ceil_mode, count_incl
         self.assertTrue(qX.stride() != sorted(qX.stride()))
         ops_under_test = {
             "nn.functional": torch.nn.functional.avg_pool3d,
-            "nn.quantized.functional": torch.nn.quantized.functional.avg_pool3d,
+            "nn.quantized.functional": torch.ao.nn.quantized.functional.avg_pool3d,
             "ao.nn.quantized.functional": torch.ao.nn.quantized.functional.avg_pool3d,
         }
         error_message = r"Results are off for {}:\n\tExpected:\n{}\n\tGot:\n{}"
@@ -1802,7 +1802,7 @@ def test_adaptive_avg_pool2d_nhwc(self):
             ops_under_test = {
                 "nn.functional": torch.nn.functional.adaptive_avg_pool2d,
                 "nn.quantized.functional":
-                    torch.nn.quantized.functional.adaptive_avg_pool2d,
+                    torch.ao.nn.quantized.functional.adaptive_avg_pool2d,
                 "ao.nn.quantized.functional":
                     torch.ao.nn.quantized.functional.adaptive_avg_pool2d,
             }
@@ -1873,7 +1873,7 @@ def test_adaptive_avg_pool(self):
                     "nn.functional":
                         getattr(torch.nn.functional, 'adaptive_avg_pool{}d'.format(dim)),
                     "nn.quantized.functional":
-                        getattr(torch.nn.quantized.functional, 'adaptive_avg_pool{}d'.format(dim)),
+                        getattr(torch.ao.nn.quantized.functional, 'adaptive_avg_pool{}d'.format(dim)),
                     "ao.nn.quantized.functional":
                         getattr(torch.ao.nn.quantized.functional, 'adaptive_avg_pool{}d'.format(dim))
                 }
@@ -1952,7 +1952,7 @@ def test_adaptive_avg_pool3d_ndhwc(self):
             ops_under_test = {
                 "nn.functional": torch.nn.functional.adaptive_avg_pool3d,
                 "nn.quantized.functional":
-                    torch.nn.quantized.functional.adaptive_avg_pool3d,
+                    torch.ao.nn.quantized.functional.adaptive_avg_pool3d,
                 "ao.nn.quantized.functional":
                     torch.ao.nn.quantized.functional.adaptive_avg_pool3d,
             }
@@ -2100,7 +2100,7 @@ def test_interpolate(self, X, size, mode, scale_factor, align_corners, nhwc_layo
 
         ops_under_test = {
             "nn.functional": torch.nn.functional.interpolate,
-            "nn.quantized.functional": torch.nn.quantized.functional.interpolate,
+            "nn.quantized.functional": torch.ao.nn.quantized.functional.interpolate,
             "ao.nn.quantized.functional": torch.ao.nn.quantized.functional.interpolate,
         }
         error_message = r"Results are off for {}:\n\tExpected:\n{}\n\tGot:\n{}"
@@ -2154,7 +2154,7 @@ def test_interpolate3d(self, X, size, mode, scale_factor, align_corners, nhwc_la
 
         ops_under_test = {
             "nn.functional": torch.nn.functional.interpolate,
-            "nn.quantized.functional": torch.nn.quantized.functional.interpolate,
+            "nn.quantized.functional": torch.ao.nn.quantized.functional.interpolate,
             "ao.nn.quantized.functional": torch.ao.nn.quantized.functional.interpolate,
         }
 
@@ -2443,7 +2443,7 @@ def test_instance_norm(self):
         affine_list = (True, False)
         combined = [shape_list, torch_types, y_scales, y_zero_points, channels_last_list, affine_list]
         test_cases_product = itertools.product(*combined)
-        test_cases = list(test_case for test_case in test_cases_product)
+        test_cases = list(test_cases_product)
         # add just one test case to test overflow
         test_cases.append([
             [1, 4, 224, 224, 160],  # shape,
@@ -2831,7 +2831,7 @@ def test_custom_module_lstm(self):
                 lstm_prepared = torch.ao.quantization.prepare(lstm)
                 self.assertTrue(hasattr(lstm_prepared[0], 'layers'))
                 self.assertEqual(num_layers, len(lstm_prepared[0].layers))
-                assert type(lstm_prepared[0]) == torch.nn.quantizable.LSTM
+                assert type(lstm_prepared[0]) == torch.ao.nn.quantizable.LSTM
 
                 # Calibrate
                 y = lstm_prepared(x)
@@ -2839,7 +2839,7 @@ def test_custom_module_lstm(self):
 
                 # Quantize
                 lstm_quantized = torch.ao.quantization.convert(lstm_prepared)
-                assert type(lstm_quantized[0]) == torch.nn.quantized.LSTM
+                assert type(lstm_quantized[0]) == torch.ao.nn.quantized.LSTM
                 qy = lstm_quantized(qx)
 
                 snr = _snr(y, qy)
@@ -2861,7 +2861,7 @@ def test_custom_module_lstm(self):
     def test_custom_module_multi_head_attention(self):
         class MultiheadAttentionModel(torch.nn.Module):
             def __init__(self, *args, **kwargs):
-                super(MultiheadAttentionModel, self).__init__()
+                super().__init__()
                 self.layer = torch.nn.MultiheadAttention(*args, **kwargs)
 
             def forward(
@@ -3007,7 +3007,7 @@ def test_qlinear(self, batch_size, input_channels, output_channels,
         # W_scale = 1.0
         # W_zp = 0
         W_scales = np.ones(output_channels)
-        W_zps = np.zeros(output_channels).astype(np.int)
+        W_zps = np.zeros(output_channels).astype(int)
         W_value_min = -128
         W_value_max = 127
         W_q0 = np.round(
@@ -3571,9 +3571,9 @@ def _test_qlinear_impl(self, batch_size, input_channels, output_channels, use_bi
             # xnnpack forces W_zp to 0 when using symmetric quantization
             # ONEDNN only supports symmetric quantization of weight
             if dtype == torch.qint8 or qengine_is_onednn():
-                W_zps = np.zeros(output_channels).astype(np.int)
+                W_zps = np.zeros(output_channels).astype(int)
             else:
-                W_zps = np.round(np.random.rand(output_channels) * 100 - 50).astype(np.int)
+                W_zps = np.round(np.random.rand(output_channels) * 100 - 50).astype(int)
             # when using symmetric quantization
             # special restriction for xnnpack fully connected op weight
             # [-127, 127] instead of [-128, 127]
@@ -3958,6 +3958,37 @@ def test_qlinear_unpack(self, W, use_channelwise):
             np.testing.assert_equal(
                 W_q.q_zero_point(), W_q_origin.q_zero_point())
 
+    """Tests the correctness of the quantized::linear_unpack after freeing original tensor op."""
+    @skipIfNoQNNPACK
+    @given(W=hu.tensor(shapes=hu.array_shapes(2, 2,),
+                       qparams=hu.qparams(dtypes=torch.qint8)))
+    @override_qengines
+    def test_qlinear_qnnpack_free_memory_and_unpack(self, W):
+        assert(qengine_is_qnnpack)
+        W, (W_scale, W_zp, torch_type) = W
+        qlinear_prepack = torch.ops.quantized.linear_prepack
+        qlinear_unpack = torch.ops.quantized.linear_unpack
+
+        W = torch.from_numpy(W)
+        # ONEDNN only supports symmetric quantization of weight
+        if qengine_is_onednn():
+            W_zp = 0
+        W_q = torch.quantize_per_tensor(W, scale=W_scale, zero_point=W_zp, dtype=torch_type)
+        # Weight prepacking operator for quantized Linear
+        W_prepack = qlinear_prepack(W_q)
+        dummy_input = torch.randn((1, W.shape[1]))
+        # Make sure we free original tensor by running matrix multiplication in backend.
+        torch.ops.quantized.linear_dynamic(dummy_input, W_prepack)
+        torch.ops.quantized.linear_dynamic(dummy_input, W_prepack)
+        # At this step, original tensor should be recovered from a data_ptr
+        W_q_origin = qlinear_unpack(W_prepack)[0]
+        # Assert equal
+        np.testing.assert_equal(W_q.int_repr(), W_q_origin.int_repr().numpy())
+        np.testing.assert_equal(np.float32(
+            W_q.q_scale()), np.float32(W_q_origin.q_scale()))
+        np.testing.assert_equal(
+            W_q.q_zero_point(), W_q_origin.q_zero_point())
+
     @skipIfNoONEDNN
     def test_qlinear_leaky_relu(self):
         with override_quantized_engine('onednn'):
@@ -4594,11 +4625,13 @@ def _test_qconv_impl(
         input_channels_per_group, input_feature_map_shape,
         output_channels_per_group, groups, kernels, strides, pads, o_pads,
         dilations, X_scale, X_zero_point, W_scale, W_zero_point, Y_scale,
-        Y_zero_point, use_bias, use_relu, use_channelwise, use_transpose,
+        Y_zero_point, use_bias, post_op, use_channelwise, use_transpose,
         device=torch.device("cpu"),
         input_dtype=torch.quint8,
         weight_dtype=torch.qint8,
         output_dtype=torch.quint8,
+        X2_scale=1.0,
+        X2_zero_point=128
     ):
         # ONEDNN only supports symmetric quantization of weight
         if qengine_is_onednn() and W_zero_point is not None:
@@ -4618,11 +4651,36 @@ def _test_qconv_impl(
         conv_op.bias = torch.nn.Parameter(
             bias_float, requires_grad=False) if use_bias else None
         result_ref = conv_op(X)
-        if use_relu:
+        if post_op == 'relu':
             assert not use_transpose, "Cannot fuse ReLU with ConvTranspose"
             relu = torch.nn.ReLU()
             result_ref = relu(result_ref)
-
+        elif post_op == 'add':
+            (X_value_min, X_value_max) = (0, 4)
+            X2_init = torch.randint(
+                X_value_min,
+                X_value_max,
+                result_ref.size(),
+                device=device
+            )
+            X2 = X2_scale * (X2_init - X2_zero_point).float()
+            X2_q = torch.quantize_per_tensor(
+                X2, scale=X2_scale, zero_point=X2_zero_point, dtype=input_dtype)
+            result_ref = result_ref + X2
+        elif post_op == 'add_relu':
+            (X_value_min, X_value_max) = (0, 4)
+            X2_init = torch.randint(
+                X_value_min,
+                X_value_max,
+                result_ref.size(),
+                device=device
+            )
+            X2 = X2_scale * (X2_init - X2_zero_point).float()
+            X2_q = torch.quantize_per_tensor(
+                X2, scale=X2_scale, zero_point=X2_zero_point, dtype=input_dtype)
+            result_ref = result_ref + X2
+            relu = torch.nn.ReLU()
+            result_ref = relu(result_ref)
         # Quantize reference results for comparison
         result_ref_q = torch.quantize_per_tensor(
             result_ref, scale=Y_scale, zero_point=Y_zero_point,
@@ -4635,12 +4693,21 @@ def _test_qconv_impl(
             else:
                 W_prepack = qconv_prepack_fn(
                     W_q, bias_float, strides, pads, dilations, groups)
-            Y_q = qconv_fn(
-                X_q,
-                W_prepack,
-                Y_scale,
-                Y_zero_point,
-            )
+            if post_op == 'add' or post_op == 'add_relu':
+                Y_q = qconv_fn(
+                    X_q,
+                    X2_q,
+                    W_prepack,
+                    Y_scale,
+                    Y_zero_point,
+                )
+            else:
+                Y_q = qconv_fn(
+                    X_q,
+                    W_prepack,
+                    Y_scale,
+                    Y_zero_point,
+                )
         else:
             # quantized conv op without prepacking
             Y_q = qconv_fn(X_q, W_q, bias_float, strides, pads, dilations, groups, Y_scale, Y_zero_point)
@@ -4687,7 +4754,6 @@ def _test_qconv_impl(
            Y_scale=st.floats(4.2, 5.6),
            Y_zero_point=st.integers(0, 4),
            use_bias=st.booleans(),
-           use_relu=st.booleans(),
            use_channelwise=st.booleans())
     @override_qengines
     def test_qconv2d(
@@ -4712,7 +4778,6 @@ def test_qconv2d(
             Y_scale,
             Y_zero_point,
             use_bias,
-            use_relu,
             use_channelwise,
     ):
         input_channels = input_channels_per_group * groups
@@ -4723,8 +4788,6 @@ def test_qconv2d(
         dilations = (dilation, dilation)
 
         qconv = torch.ops.quantized.conv2d
-        if use_relu:
-            qconv = torch.ops.quantized.conv2d_relu
         qconv_prepack = torch.ops.quantized.conv2d_prepack
         conv_op = torch.nn.Conv2d(
             input_channels,
@@ -4750,7 +4813,201 @@ def test_qconv2d(
                 input_channels_per_group, (height, width),
                 output_channels_per_group, groups, kernels, strides, pads, None,
                 dilations, X_scale, X_zero_point, W_scale, W_zero_point,
-                Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False, input_dtype=X_qdtype, output_dtype=X_qdtype)
+                Y_scale, Y_zero_point, use_bias, "none", use_channelwise, False, input_dtype=X_qdtype, output_dtype=X_qdtype)
+
+    @given(batch_size=st.integers(1, 3),
+           input_channels_per_group=st.sampled_from([2, 4, 5, 8, 16, 32]),
+           height=st.integers(10, 16),
+           width=st.integers(7, 14),
+           output_channels_per_group=st.sampled_from([2, 4, 5, 8, 16, 32]),
+           groups=st.integers(1, 300),
+           kernel_h=st.integers(1, 7),
+           kernel_w=st.integers(1, 7),
+           stride_h=st.integers(1, 2),
+           stride_w=st.integers(1, 2),
+           pad_h=st.integers(0, 2),
+           pad_w=st.integers(0, 2),
+           dilation=st.integers(1, 2),
+           X_scale=st.floats(1.2, 1.6),
+           X_zero_point=st.integers(0, 4),
+           W_scale=st.lists(st.floats(0.2, 1.6), min_size=1, max_size=2),
+           W_zero_point=st.lists(st.integers(-5, 5), min_size=1, max_size=2),
+           Y_scale=st.floats(4.2, 5.6),
+           Y_zero_point=st.integers(0, 4),
+           use_bias=st.booleans(),
+           use_channelwise=st.booleans())
+    @override_qengines
+    def test_qconv2d_relu(
+            self,
+            batch_size,
+            input_channels_per_group,
+            height,
+            width,
+            output_channels_per_group,
+            groups,
+            kernel_h,
+            kernel_w,
+            stride_h,
+            stride_w,
+            pad_h,
+            pad_w,
+            dilation,
+            X_scale,
+            X_zero_point,
+            W_scale,
+            W_zero_point,
+            Y_scale,
+            Y_zero_point,
+            use_bias,
+            use_channelwise,
+    ):
+        input_channels = input_channels_per_group * groups
+        output_channels = output_channels_per_group * groups
+        kernels = (kernel_h, kernel_w)
+        strides = (stride_h, stride_w)
+        pads = (pad_h, pad_w)
+        dilations = (dilation, dilation)
+
+        qconv = torch.ops.quantized.conv2d_relu
+        qconv_prepack = torch.ops.quantized.conv2d_prepack
+        conv_op = torch.nn.Conv2d(
+            input_channels,
+            output_channels,
+            kernels,
+            strides,
+            pads,
+            dilations,
+            groups,
+        )
+
+        act_qdtypes = [torch.quint8]
+        # Only qnnpack qengine supportes qint8
+        if qengine_is_qnnpack() and torch.backends.xnnpack.enabled:
+            act_qdtypes.append(torch.qint8)
+
+        for X_qdtype in act_qdtypes:
+            if X_qdtype == torch.qint8:
+                W_zero_point = [0 for i in range(len(W_zero_point))]
+
+            self._test_qconv_impl(
+                qconv, qconv_prepack, conv_op, batch_size,
+                input_channels_per_group, (height, width),
+                output_channels_per_group, groups, kernels, strides, pads, None,
+                dilations, X_scale, X_zero_point, W_scale, W_zero_point,
+                Y_scale, Y_zero_point, use_bias, "relu", use_channelwise, False, input_dtype=X_qdtype, output_dtype=X_qdtype)
+
+    @skipIfNoONEDNN
+    def test_qconv2d_add(self):
+        batch_size = 3
+        groups_list = [1, 10]
+        input_channels_per_group = 2
+        output_channels_per_group = 2
+        height = 10
+        width = 10
+        kernel_h = 3
+        kernel_w = 3
+        stride_h = 2
+        stride_w = 2
+        pad_h = 1
+        pad_w = 1
+        dilation = 1
+        X_scale = 1.5
+        X_zero_point = 2
+        W_scale = [1.5]
+        W_zero_point = [-3]
+        Y_scale = 4.2
+        Y_zero_point = 0
+        use_bias_list = [False, True]
+        use_channelwise_list = [False, True]
+        X2_scale = 1.2
+        X2_zero_point_list = [0, 4]
+        options = itertools.product(groups_list, use_bias_list, use_channelwise_list, X2_zero_point_list)
+        for groups, use_bias, use_channelwise, X2_zero_point in options:
+            with override_quantized_engine('onednn'):
+                input_channels = input_channels_per_group * groups
+                output_channels = output_channels_per_group * groups
+                kernels = (kernel_h, kernel_w)
+                strides = (stride_h, stride_w)
+                pads = (pad_h, pad_w)
+                dilations = (dilation, dilation)
+
+                qconv = torch.ops.quantized.conv2d_add
+                qconv_prepack = torch.ops.quantized.conv2d_prepack
+                conv_op = torch.nn.Conv2d(
+                    input_channels,
+                    output_channels,
+                    kernels,
+                    strides,
+                    pads,
+                    dilations,
+                    groups,
+                )
+
+                X_qdtype = torch.quint8
+                self._test_qconv_impl(
+                    qconv, qconv_prepack, conv_op, batch_size,
+                    input_channels_per_group, (height, width),
+                    output_channels_per_group, groups, kernels, strides, pads, None,
+                    dilations, X_scale, X_zero_point, W_scale, W_zero_point,
+                    Y_scale, Y_zero_point, use_bias, "add", use_channelwise, False,
+                    input_dtype=X_qdtype, output_dtype=X_qdtype, X2_scale=X2_scale, X2_zero_point=X2_zero_point)
+
+    @skipIfNoONEDNN
+    def test_qconv2d_add_relu(self):
+        batch_size = 3
+        height = 10
+        width = 10
+        groups_list = [1, 10]
+        input_channels_per_group = 2
+        output_channels_per_group = 2
+        kernel_h = 3
+        kernel_w = 3
+        stride_h = 2
+        stride_w = 2
+        pad_h = 1
+        pad_w = 1
+        dilation = 1
+        X_scale = 1.5
+        X_zero_point = 2
+        W_scale = [1.5]
+        W_zero_point = [-3]
+        Y_scale = 4.2
+        Y_zero_point = 0
+        use_bias_list = [False, True]
+        use_channelwise_list = [False, True]
+        X2_scale = 1.2
+        X2_zero_point_list = [0, 4]
+
+        options = itertools.product(groups_list, use_bias_list, use_channelwise_list, X2_zero_point_list)
+        for groups, use_bias, use_channelwise, X2_zero_point in options:
+            with override_quantized_engine('onednn'):
+                input_channels = input_channels_per_group * groups
+                output_channels = output_channels_per_group * groups
+                kernels = (kernel_h, kernel_w)
+                strides = (stride_h, stride_w)
+                pads = (pad_h, pad_w)
+                dilations = (dilation, dilation)
+
+                qconv = torch.ops.quantized.conv2d_add_relu
+                qconv_prepack = torch.ops.quantized.conv2d_prepack
+                conv_op = torch.nn.Conv2d(
+                    input_channels,
+                    output_channels,
+                    kernels,
+                    strides,
+                    pads,
+                    dilations,
+                    groups,
+                )
+
+                X_qdtype = torch.quint8
+                self._test_qconv_impl(
+                    qconv, qconv_prepack, conv_op, batch_size,
+                    input_channels_per_group, (height, width),
+                    output_channels_per_group, groups, kernels, strides, pads, None,
+                    dilations, X_scale, X_zero_point, W_scale, W_zero_point,
+                    Y_scale, Y_zero_point, use_bias, "add_relu", use_channelwise, False,
+                    input_dtype=X_qdtype, output_dtype=X_qdtype, X2_scale=X2_scale, X2_zero_point=X2_zero_point)
 
     # TODO: merge this test with test_qconv2d when CUDNN runtime flags becomes available
     """Tests the correctness of quantized 2D convolution cudnn op."""
@@ -4780,7 +5037,6 @@ def test_qconv2d(
            Y_scale=st.floats(4.2, 5.6),
            Y_zero_point=st.sampled_from([0]),
            use_bias=st.booleans(),
-           use_relu=st.booleans(),
            # TODO: enable channelwise
            use_channelwise=st.sampled_from([False]))
     @skipIfNoFBGEMM
@@ -4810,7 +5066,6 @@ def test_qconv2d_cudnn(
             Y_scale,
             Y_zero_point,
             use_bias,
-            use_relu,
             use_channelwise,
     ):
         input_channels = input_channels_per_group * groups
@@ -4820,10 +5075,90 @@ def test_qconv2d_cudnn(
         pads = (pad_h, pad_w)
         dilations = (dilation, dilation)
 
-        if use_relu:
-            qconv = torch.ops.quantized.conv2d_relu
-        else:
-            qconv = torch.ops.quantized.conv2d
+        qconv = torch.ops.quantized.conv2d
+        conv_op = torch.nn.Conv2d(
+            input_channels,
+            output_channels,
+            kernels,
+            strides,
+            pads,
+            dilations,
+            groups,
+        ).to(torch.device("cuda"))
+        self._test_qconv_impl(
+            qconv, torch.ops.quantized.conv2d_prepack, conv_op, batch_size,
+            input_channels_per_group, (height, width),
+            output_channels_per_group, groups, kernels, strides, pads, None,
+            dilations, X_scale, X_zero_point, W_scale, W_zero_point,
+            Y_scale, Y_zero_point, use_bias, "none", use_channelwise, False,
+            device=torch.device("cuda"),
+            input_dtype=torch.qint8, weight_dtype=torch.qint8, output_dtype=torch.qint8)
+
+    @given(batch_size=st.integers(1, 3),
+           # cudnn only supports multiples of 4, but we have explicitly added padding on the backend
+           input_channels_per_group=st.integers(1, 32),
+           height=st.integers(10, 16),
+           width=st.integers(7, 14),
+           # cudnn only supports multiples of 4, but we have explicitly added padding on the backend
+           output_channels_per_group=st.integers(1, 32),
+           groups=st.integers(1, 1),  # currently padding only supports groups=1
+           kernel_h=st.integers(1, 7),
+           kernel_w=st.integers(1, 7),
+           stride_h=st.integers(1, 2),
+           stride_w=st.integers(1, 2),
+           pad_h=st.integers(0, 2),
+           pad_w=st.integers(0, 2),
+           # result for dilation == 2 is not correct
+           # dilation=st.integers(1, 2),
+           # currently cudnn has only been verified to work for dilation = 1
+           # TODO: check backend works for dilation > 1
+           dilation=st.integers(1, 1),
+           X_scale=st.floats(1.2, 1.6),
+           X_zero_point=st.sampled_from([0]),
+           W_scale=st.lists(st.floats(0.2, 1.6), min_size=1, max_size=2),
+           W_zero_point=st.lists(st.integers(0, 0), min_size=1, max_size=2),
+           Y_scale=st.floats(4.2, 5.6),
+           Y_zero_point=st.sampled_from([0]),
+           use_bias=st.booleans(),
+           # TODO: enable channelwise
+           use_channelwise=st.sampled_from([False]))
+    @skipIfNoFBGEMM
+    @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.")
+    @unittest.skip("Local only - currently the qconv2d_cudnn op is bulid "
+                   "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test "
+                   "after it is built by default")
+    def test_qconv2d_relu_cudnn(
+            self,
+            batch_size,
+            input_channels_per_group,
+            height,
+            width,
+            output_channels_per_group,
+            groups,
+            kernel_h,
+            kernel_w,
+            stride_h,
+            stride_w,
+            pad_h,
+            pad_w,
+            dilation,
+            X_scale,
+            X_zero_point,
+            W_scale,
+            W_zero_point,
+            Y_scale,
+            Y_zero_point,
+            use_bias,
+            use_channelwise,
+    ):
+        input_channels = input_channels_per_group * groups
+        output_channels = output_channels_per_group * groups
+        kernels = (kernel_h, kernel_w)
+        strides = (stride_h, stride_w)
+        pads = (pad_h, pad_w)
+        dilations = (dilation, dilation)
+
+        qconv = torch.ops.quantized.conv2d_relu
         conv_op = torch.nn.Conv2d(
             input_channels,
             output_channels,
@@ -4838,7 +5173,7 @@ def test_qconv2d_cudnn(
             input_channels_per_group, (height, width),
             output_channels_per_group, groups, kernels, strides, pads, None,
             dilations, X_scale, X_zero_point, W_scale, W_zero_point,
-            Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False,
+            Y_scale, Y_zero_point, use_bias, "relu", use_channelwise, False,
             device=torch.device("cuda"),
             input_dtype=torch.qint8, weight_dtype=torch.qint8, output_dtype=torch.qint8)
 
@@ -4990,7 +5325,7 @@ def test_qconv_transpose1d(self):
                     input_channels_per_group, (width, ),
                     output_channels_per_group, groups, kernels, strides, pads, o_pads,
                     dilations, X_scale, X_zero_point, W_scale, W_zero_point,
-                    Y_scale, Y_zero_point, use_bias, use_relu=False,
+                    Y_scale, Y_zero_point, use_bias, post_op="none",
                     use_channelwise=False, use_transpose=True, input_dtype=X_qdtype, output_dtype=X_qdtype)
 
                 # check that this doesn't error
@@ -5116,7 +5451,7 @@ def test_qconv_transpose2d(
                 input_channels_per_group, (height, width),
                 output_channels_per_group, groups, kernels, strides, pads, o_pads,
                 dilations, X_scale, X_zero_point, W_scale, W_zero_point,
-                Y_scale, Y_zero_point, use_bias, use_relu=False,
+                Y_scale, Y_zero_point, use_bias, post_op="none",
                 use_channelwise=False, use_transpose=True, input_dtype=X_qdtype, output_dtype=X_qdtype)
 
             # check that this doesn't error
@@ -5243,7 +5578,7 @@ def test_qconv_transpose3d(
             input_channels_per_group, (time, height, width),
             output_channels_per_group, groups, kernels, strides, pads, o_pads,
             dilations, X_scale, X_zero_point, W_scale, W_zero_point,
-            Y_scale, Y_zero_point, use_bias, use_relu=False,
+            Y_scale, Y_zero_point, use_bias, post_op="none",
             use_channelwise=False, use_transpose=True)
 
         # check that this doesn't error
@@ -5369,7 +5704,6 @@ def test_qconv2d_unpack(self, inputs, stride, pad, o_pad, channelwise):
            Y_scale=st.floats(4.2, 5.6),
            Y_zero_point=st.integers(0, 4),
            use_bias=st.booleans(),
-           use_relu=st.booleans(),
            use_channelwise=st.booleans())
     @override_qengines
     def test_qconv1d(
@@ -5390,7 +5724,6 @@ def test_qconv1d(
         Y_scale,
         Y_zero_point,
         use_bias,
-        use_relu,
         use_channelwise,
     ):
         input_channels = input_channels_per_group * groups
@@ -5408,8 +5741,6 @@ def test_qconv1d(
         )
         qconv_prepack = torch.ops.quantized.conv1d_prepack
         qconv = torch.ops.quantized.conv1d
-        if use_relu:
-            qconv = torch.ops.quantized.conv1d_relu
 
         act_qdtypes = [torch.quint8]
         # Only qnnpack qengine supportes qint8
@@ -5425,7 +5756,78 @@ def test_qconv1d(
                 input_channels_per_group, (length, ),
                 output_channels_per_group, groups, kernel, [stride], [pad], None,
                 [dilation], X_scale, X_zero_point, W_scale, W_zero_point,
-                Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False,
+                Y_scale, Y_zero_point, use_bias, "none", use_channelwise, False,
+                input_dtype=X_qdtype, output_dtype=X_qdtype)
+
+    @given(batch_size=st.integers(1, 6),
+           input_channels_per_group=st.sampled_from((2, 4, 5, 8, 16, 32)),
+           output_channels_per_group=st.sampled_from((2, 4, 5, 8, 16, 32)),
+           groups=st.integers(1, 3),
+           length=st.integers(4, 16),
+           kernel=st.integers(1, 7),
+           stride=st.integers(1, 2),
+           pad=st.integers(0, 2),
+           dilation=st.integers(1, 2),
+           X_scale=st.floats(1.2, 1.6),
+           X_zero_point=st.integers(0, 4),
+           W_scale=st.lists(st.floats(0.2, 1.6), min_size=1, max_size=2),
+           W_zero_point=st.lists(st.integers(-5, 5), min_size=1, max_size=2),
+           Y_scale=st.floats(4.2, 5.6),
+           Y_zero_point=st.integers(0, 4),
+           use_bias=st.booleans(),
+           use_channelwise=st.booleans())
+    @override_qengines
+    def test_qconv1d_relu(
+        self,
+        batch_size,
+        input_channels_per_group,
+        output_channels_per_group,
+        groups,
+        length,
+        kernel,
+        stride,
+        pad,
+        dilation,
+        X_scale,
+        X_zero_point,
+        W_scale,
+        W_zero_point,
+        Y_scale,
+        Y_zero_point,
+        use_bias,
+        use_channelwise,
+    ):
+        input_channels = input_channels_per_group * groups
+        output_channels = output_channels_per_group * groups
+        if torch.backends.quantized.engine == 'qnnpack':
+            use_channelwise = False
+        conv1d = torch.nn.Conv1d(
+            input_channels,
+            output_channels,
+            kernel,
+            stride,
+            pad,
+            dilation,
+            groups,
+        )
+        qconv_prepack = torch.ops.quantized.conv1d_prepack
+        qconv = torch.ops.quantized.conv1d_relu
+
+        act_qdtypes = [torch.quint8]
+        # Only qnnpack qengine supportes qint8
+        if qengine_is_qnnpack() and torch.backends.xnnpack.enabled:
+            act_qdtypes.append(torch.qint8)
+
+        for X_qdtype in act_qdtypes:
+            if X_qdtype == torch.qint8:
+                W_zero_point = [0 for i in range(len(W_zero_point))]
+
+            self._test_qconv_impl(
+                qconv, qconv_prepack, conv1d, batch_size,
+                input_channels_per_group, (length, ),
+                output_channels_per_group, groups, kernel, [stride], [pad], None,
+                [dilation], X_scale, X_zero_point, W_scale, W_zero_point,
+                Y_scale, Y_zero_point, use_bias, "relu", use_channelwise, False,
                 input_dtype=X_qdtype, output_dtype=X_qdtype)
 
     # TODO: merge this test with test_qconv1d when CUDNN runtime flags becomes available
@@ -5453,7 +5855,6 @@ def test_qconv1d(
            # currently conv cudnn backend is only implemented for int8 symmetric
            Y_zero_point=st.sampled_from([0]),
            use_bias=st.booleans(),
-           use_relu=st.booleans(),
            # TODO: enable channelwise
            use_channelwise=st.sampled_from([False]))
     @skipIfNoFBGEMM
@@ -5479,7 +5880,6 @@ def test_qconv1d_cudnn(
         Y_scale,
         Y_zero_point,
         use_bias,
-        use_relu,
         use_channelwise,
     ):
         input_channels = input_channels_per_group * groups
@@ -5495,17 +5895,88 @@ def test_qconv1d_cudnn(
             groups,
         ).to(torch.device("cuda"))
         qconv_prepack = torch.ops.quantized.conv1d_prepack
-        if use_relu:
-            qconv = torch.ops.quantized.conv1d_relu
-        else:
-            qconv = torch.ops.quantized.conv1d
+        qconv = torch.ops.quantized.conv1d
 
         self._test_qconv_impl(
             qconv, qconv_prepack, conv1d, batch_size,
             input_channels_per_group, (length, ),
             output_channels_per_group, groups, kernel, [stride], [pad], None,
             [dilation], X_scale, X_zero_point, W_scale, W_zero_point,
-            Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False,
+            Y_scale, Y_zero_point, use_bias, "none", use_channelwise, False,
+            device=torch.device("cuda"),
+            input_dtype=torch.qint8, weight_dtype=torch.qint8, output_dtype=torch.qint8)
+
+    @given(batch_size=st.integers(1, 6),
+           # cudnn only supports multiples of 4, but we have explicitly added padding on the backend
+           input_channels_per_group=st.integers(1, 32),
+           # cudnn only supports multiples of 4, but we have explicitly added padding on the backend
+           output_channels_per_group=st.integers(1, 32),
+           groups=st.integers(1, 1),  # currently padding only supports groups=1
+           length=st.integers(4, 16),
+           kernel=st.integers(1, 7),
+           stride=st.integers(1, 2),
+           pad=st.integers(0, 2),
+           # currently cudnn has only been verified to work for dilation = 1
+           # TODO: check backend works for dilation > 1
+           dilation=st.integers(1, 1),
+           X_scale=st.floats(1.2, 1.6),
+           # currently conv cudnn backend is only implemented for int8 symmetric
+           X_zero_point=st.sampled_from([0]),
+           W_scale=st.lists(st.floats(0.2, 1.6), min_size=1, max_size=2),
+           # currently conv cudnn backend is only implemented for int8 symmetric
+           W_zero_point=st.lists(st.integers(0, 0), min_size=1, max_size=2),
+           Y_scale=st.floats(4.2, 5.6),
+           # currently conv cudnn backend is only implemented for int8 symmetric
+           Y_zero_point=st.sampled_from([0]),
+           use_bias=st.booleans(),
+           # TODO: enable channelwise
+           use_channelwise=st.sampled_from([False]))
+    @skipIfNoFBGEMM
+    @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.")
+    @unittest.skip("Local only - currently the qconv1d_cudnn op is bulid "
+                   "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test "
+                   "after it is built by default")
+    def test_qconv1d_relu_cudnn(
+        self,
+        batch_size,
+        input_channels_per_group,
+        output_channels_per_group,
+        groups,
+        length,
+        kernel,
+        stride,
+        pad,
+        dilation,
+        X_scale,
+        X_zero_point,
+        W_scale,
+        W_zero_point,
+        Y_scale,
+        Y_zero_point,
+        use_bias,
+        use_channelwise,
+    ):
+        input_channels = input_channels_per_group * groups
+        output_channels = output_channels_per_group * groups
+
+        conv1d = torch.nn.Conv1d(
+            input_channels,
+            output_channels,
+            kernel,
+            stride,
+            pad,
+            dilation,
+            groups,
+        ).to(torch.device("cuda"))
+        qconv_prepack = torch.ops.quantized.conv1d_prepack
+        qconv = torch.ops.quantized.conv1d_relu
+
+        self._test_qconv_impl(
+            qconv, qconv_prepack, conv1d, batch_size,
+            input_channels_per_group, (length, ),
+            output_channels_per_group, groups, kernel, [stride], [pad], None,
+            [dilation], X_scale, X_zero_point, W_scale, W_zero_point,
+            Y_scale, Y_zero_point, use_bias, "relu", use_channelwise, False,
             device=torch.device("cuda"),
             input_dtype=torch.qint8, weight_dtype=torch.qint8, output_dtype=torch.qint8)
 
@@ -5533,7 +6004,6 @@ def test_qconv1d_cudnn(
            Y_scale=st.floats(4.2, 5.6),
            Y_zero_point=st.integers(0, 4),
            use_bias=st.booleans(),
-           use_relu=st.booleans(),
            use_channelwise=st.booleans(),
            qengine=st.sampled_from(("qnnpack", "fbgemm")))
     def test_qconv3d(
@@ -5562,7 +6032,6 @@ def test_qconv3d(
         Y_scale,
         Y_zero_point,
         use_bias,
-        use_relu,
         use_channelwise,
         qengine
     ):
@@ -5578,8 +6047,6 @@ def test_qconv3d(
 
         with override_quantized_engine(qengine):
             qconv = torch.ops.quantized.conv3d
-            if use_relu:
-                qconv = torch.ops.quantized.conv3d_relu
             qconv_prepack = torch.ops.quantized.conv3d_prepack
             conv_op = torch.nn.Conv3d(
                 input_channels,
@@ -5595,7 +6062,91 @@ def test_qconv3d(
                 input_channels_per_group, (D, H, W), output_channels_per_group,
                 groups, kernels, strides, pads, None, dilations, X_scale,
                 X_zero_point, W_scale, W_zero_point, Y_scale, Y_zero_point,
-                use_bias, use_relu, use_channelwise, use_transpose=False)
+                use_bias, "none", use_channelwise, use_transpose=False)
+
+    @given(batch_size=st.integers(1, 4),
+           input_channels_per_group=st.sampled_from([2, 4, 5, 8, 16]),
+           D=st.integers(4, 8),
+           H=st.integers(4, 8),
+           W=st.integers(4, 8),
+           output_channels_per_group=st.sampled_from([2, 4, 5, 8, 16]),
+           groups=st.integers(1, 3),
+           kernel_d=st.integers(1, 4),
+           kernel_h=st.integers(1, 4),
+           kernel_w=st.integers(1, 4),
+           stride_d=st.integers(1, 2),
+           stride_h=st.integers(1, 2),
+           stride_w=st.integers(1, 2),
+           pad_d=st.integers(0, 2),
+           pad_h=st.integers(0, 2),
+           pad_w=st.integers(0, 2),
+           dilation=st.integers(1, 2),
+           X_scale=st.floats(1.2, 1.6),
+           X_zero_point=st.integers(0, 4),
+           W_scale=st.lists(st.floats(0.2, 1.6), min_size=1, max_size=2),
+           W_zero_point=st.lists(st.integers(-5, 5), min_size=1, max_size=2),
+           Y_scale=st.floats(4.2, 5.6),
+           Y_zero_point=st.integers(0, 4),
+           use_bias=st.booleans(),
+           use_channelwise=st.booleans(),
+           qengine=st.sampled_from(("qnnpack", "fbgemm")))
+    def test_qconv3d_relu(
+        self,
+        batch_size,
+        input_channels_per_group,
+        D,
+        H,
+        W,
+        output_channels_per_group,
+        groups,
+        kernel_d,
+        kernel_h,
+        kernel_w,
+        stride_d,
+        stride_h,
+        stride_w,
+        pad_d,
+        pad_h,
+        pad_w,
+        dilation,
+        X_scale,
+        X_zero_point,
+        W_scale,
+        W_zero_point,
+        Y_scale,
+        Y_zero_point,
+        use_bias,
+        use_channelwise,
+        qengine
+    ):
+        if qengine not in supported_qengines:
+            return
+
+        input_channels = input_channels_per_group * groups
+        output_channels = output_channels_per_group * groups
+        kernels = (kernel_d, kernel_h, kernel_w)
+        strides = (stride_d, stride_h, stride_w)
+        pads = (pad_d, pad_h, pad_w)
+        dilations = (dilation, dilation, dilation)
+
+        with override_quantized_engine(qengine):
+            qconv = torch.ops.quantized.conv3d_relu
+            qconv_prepack = torch.ops.quantized.conv3d_prepack
+            conv_op = torch.nn.Conv3d(
+                input_channels,
+                output_channels,
+                kernels,
+                strides,
+                pads,
+                dilations,
+                groups,
+            )
+            self._test_qconv_impl(
+                qconv, qconv_prepack, conv_op, batch_size,
+                input_channels_per_group, (D, H, W), output_channels_per_group,
+                groups, kernels, strides, pads, None, dilations, X_scale,
+                X_zero_point, W_scale, W_zero_point, Y_scale, Y_zero_point,
+                use_bias, "relu", use_channelwise, use_transpose=False)
 
     """Tests the correctness of the quantized::qconv3d_unpack op."""
     @given(
@@ -5650,22 +6201,46 @@ def test_conv_reorder_issue_onednn(self):
             bs = 1
             ic, oc = 128, 512
             kh, kw = 1, 1
-            ih, iw = 28, 28
             bias = None
-            strides, paddings, dilates, groups = (1, 1), (0, 0), (1, 1), 1
-            w = torch.randn((oc, ic, kh, kw))
+            strides, paddings, dilates = (1, 1), (0, 0), (1, 1)
+            for groups in [1, 2]:
+                ih, iw = 28, 28
+                w = torch.randn((oc * groups, ic, kh, kw))
+                qw = torch.quantize_per_tensor(w, scale=1.0, zero_point=0, dtype=torch.qint8)
+                x = torch.randn((bs, ic * groups, ih, iw))
+                qx = torch.quantize_per_tensor(x, scale=1.0, zero_point=0, dtype=torch.quint8)
+                w_packed = torch.ops.quantized.conv2d_prepack(
+                    qw, bias, strides, paddings, dilates, groups
+                )
+                torch.ops.quantized.conv2d(qx, w_packed, output_scale=1.0, output_zero_point=0)
+                ih, iw = 5, 4
+                x = torch.randn((bs, ic * groups, ih, iw))
+                qx = torch.quantize_per_tensor(x, scale=1.0, zero_point=0, dtype=torch.quint8)
+                # The following should pass when input shape is changed
+                torch.ops.quantized.conv2d(qx, w_packed, output_scale=1.0, output_zero_point=0)
+
+    @skipIfNoONEDNN
+    def test_conv_transpose_reorder_issue_onednn(self):
+        with override_quantized_engine('onednn'):
+            bs = 1
+            ic, oc = 16, 33
+            kh, kw = 3, 3
+            ih, iw = 50, 100
+            bias = None
+            strides, paddings, output_paddings, dilates, groups = [2, 2], [0, 0], [0, 0], [1, 1], 1
+            w = torch.randn((ic, oc, kh, kw))
             qw = torch.quantize_per_tensor(w, scale=1.0, zero_point=0, dtype=torch.qint8)
             x = torch.randn((bs, ic, ih, iw))
             qx = torch.quantize_per_tensor(x, scale=1.0, zero_point=0, dtype=torch.quint8)
-            w_packed = torch.ops.quantized.conv2d_prepack(
-                qw, bias, strides, paddings, dilates, groups
+            w_packed = torch.ops.quantized.conv_transpose2d_prepack(
+                qw, bias, strides, paddings, output_paddings, dilates, groups
             )
-            torch.ops.quantized.conv2d(qx, w_packed, output_scale=1.0, output_zero_point=0)
+            torch.ops.quantized.conv_transpose2d(qx, w_packed, output_scale=1.0, output_zero_point=0)
             ih, iw = 5, 4
             x = torch.randn((bs, ic, ih, iw))
             qx = torch.quantize_per_tensor(x, scale=1.0, zero_point=0, dtype=torch.quint8)
             # The following should pass when input shape is changed
-            torch.ops.quantized.conv2d(qx, w_packed, output_scale=1.0, output_zero_point=0)
+            torch.ops.quantized.conv_transpose2d(qx, w_packed, output_scale=1.0, output_zero_point=0)
 
 class TestPadding(TestCase):
     @given(batch_size=st.integers(1, 64),
diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index 98e21ab30f09..96d5cea156af 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -22,7 +22,7 @@
 
 class Foo(torch.nn.Module):
     def __init__(self):
-        super(Foo, self).__init__()
+        super().__init__()
         self.qscheme = torch.per_tensor_symmetric
 
 def _calculate_dynamic_qparams(X, dtype, reduce_range=False):
@@ -777,8 +777,8 @@ def test_qtensor_permute(self):
 
                 # change memory format
                 qlast = qr.contiguous(memory_format=torch.channels_last)
-                self.assertEqual(qr.stride(), list(reversed(sorted(qr.stride()))))
-                self.assertNotEqual(qlast.stride(), list(reversed(sorted(qlast.stride()))))
+                self.assertEqual(qr.stride(), sorted(qr.stride(), reverse=True))
+                self.assertNotEqual(qlast.stride(), sorted(qlast.stride(), reverse=True))
                 self.assertEqual(qr.int_repr(), qlast.int_repr())
                 self.assertEqual(qr.q_scale(), qlast.q_scale())
                 self.assertEqual(qr.q_zero_point(), qlast.q_zero_point())
@@ -804,8 +804,8 @@ def test_qtensor_per_channel_permute(self):
 
             # but we can change memory format
             qlast = qr.contiguous(memory_format=torch.channels_last)
-            self.assertEqual(qr.stride(), list(reversed(sorted(qr.stride()))))
-            self.assertNotEqual(qlast.stride(), list(reversed(sorted(qlast.stride()))))
+            self.assertEqual(qr.stride(), sorted(qr.stride(), reverse=True))
+            self.assertNotEqual(qlast.stride(), sorted(qlast.stride(), reverse=True))
             self.assertEqual(qr.int_repr(), qlast.int_repr())
             self.assertEqual(scales.to(dtype=torch.float64), qlast.q_per_channel_scales())
             self.assertEqual(zero_points, qlast.q_per_channel_zero_points())
@@ -1404,7 +1404,7 @@ class M(torch.jit.ScriptModule):
                 __constants__ = ['fname']
 
                 def __init__(self):
-                    super(M, self).__init__()
+                    super().__init__()
                     self.fname = fname
 
                 @torch.jit.script_method
@@ -1432,7 +1432,7 @@ def test_pickle_checkpoint_qtensor(self):
     def test_jit_serialization(self):
         class SimpleQTensor(torch.jit.ScriptModule):
             def __init__(self, per_channel):
-                super(SimpleQTensor, self).__init__()
+                super().__init__()
                 x = torch.rand(5, 5).float()
                 if not per_channel:
                     x_q = torch.quantize_per_tensor(x, 0.2, 10, torch.quint8)
@@ -1513,7 +1513,7 @@ def test_decomposed_dynamic_quant_pattern(self):
 
         # Now try decomposed pattern
         (scale_decomposed, zero_point_decomposed) = torch.ops.quantized_decomposed.choose_qparams.tensor(
-            X, quant_min, quant_max, dtype)
+            X, quant_min, quant_max, torch.Tensor([torch.finfo(torch.float32).eps]), dtype)
         quantized_decomposed_X = torch.ops.quantized_decomposed.quantize_per_tensor.tensor(
             X, scale_decomposed, zero_point_decomposed, quant_min, quant_max, dtype)
 
diff --git a/test/quantization/core/test_utils.py b/test/quantization/core/test_utils.py
index 55d889f88eb3..94ae61609604 100644
--- a/test/quantization/core/test_utils.py
+++ b/test/quantization/core/test_utils.py
@@ -3,7 +3,7 @@
 import torch
 from torch.testing._internal.common_utils import TestCase
 from torch.ao.quantization.utils import get_fqn_to_example_inputs
-from torch.nn.quantized.modules.utils import _quantize_weight
+from torch.ao.nn.quantized.modules.utils import _quantize_weight
 from torch.ao.quantization import MovingAverageMinMaxObserver, MovingAveragePerChannelMinMaxObserver
 
 
diff --git a/test/quantization/core/test_workflow_module.py b/test/quantization/core/test_workflow_module.py
index 8f8ad4d50c38..87a8c31c87c9 100644
--- a/test/quantization/core/test_workflow_module.py
+++ b/test/quantization/core/test_workflow_module.py
@@ -909,7 +909,7 @@ def test_qat_convbn_fused_syncbn_replacement(self):
             # create conv-bn
             class Model(nn.Module):
                 def __init__(self):
-                    super(Model, self).__init__()
+                    super().__init__()
                     self.conv = nn.Conv2d(4, 1, 3, padding=1)
                     self.bn = nn.BatchNorm2d(1)
 
@@ -958,7 +958,7 @@ def test_device_affinity(self):
         class Model(nn.Module):
 
             def __init__(self):
-                super(Model, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(1, 1, 1)
                 self.bn = nn.BatchNorm2d(1)
                 self.relu = nn.ReLU()
@@ -1189,7 +1189,7 @@ def test_fused_mod_reduce_range(self):
     def test_embedding_bag_qat_config(self):
         class Model(nn.Module):
             def __init__(self):
-                super(Model, self).__init__()
+                super().__init__()
                 self.emb1 = torch.nn.EmbeddingBag(num_embeddings=10, embedding_dim=12,
                                                   include_last_offset=True, scale_grad_by_freq=False, mode='sum')
                 self.emb2 = torch.nn.EmbeddingBag(num_embeddings=10, embedding_dim=12,
@@ -1269,7 +1269,7 @@ def test_embedding_qat_config(self):
     def test_default_fused_qat_config(self):
         class Model(nn.Module):
             def __init__(self):
-                super(Model, self).__init__()
+                super().__init__()
                 self.linear = nn.Linear(2, 2)
                 self.relu = nn.ReLU()
 
diff --git a/test/quantization/core/test_workflow_ops.py b/test/quantization/core/test_workflow_ops.py
index a0687d88fa57..a3528098b256 100644
--- a/test/quantization/core/test_workflow_ops.py
+++ b/test/quantization/core/test_workflow_ops.py
@@ -629,7 +629,7 @@ def test_fake_quant_control(self):
     def test_fake_quant_preserves_qparam_shapes_for_activations(self):
         class Model(nn.Module):
             def __init__(self):
-                super(Model, self).__init__()
+                super().__init__()
                 self.linear = nn.Linear(4, 4)
 
             def forward(self, x):
diff --git a/test/quantization/eager/test_bias_correction_eager.py b/test/quantization/eager/test_bias_correction_eager.py
index 0fc8743e9205..d29d39bb3028 100644
--- a/test/quantization/eager/test_bias_correction_eager.py
+++ b/test/quantization/eager/test_bias_correction_eager.py
@@ -68,7 +68,7 @@ def correct_artificial_bias_quantize(self, float_model, img_data):
     def test_linear_chain(self):
         class LinearChain(nn.Module):
             def __init__(self):
-                super(LinearChain, self).__init__()
+                super().__init__()
                 self.linear1 = nn.Linear(3, 4)
                 self.linear2 = nn.Linear(4, 5)
                 self.linear3 = nn.Linear(5, 6)
@@ -87,7 +87,7 @@ def forward(self, x):
     def test_conv_chain(self):
         class ConvChain(nn.Module):
             def __init__(self):
-                super(ConvChain, self).__init__()
+                super().__init__()
                 self.conv2d1 = nn.Conv2d(3, 4, 5, 5)
                 self.conv2d2 = nn.Conv2d(4, 5, 5, 5)
                 self.conv2d3 = nn.Conv2d(5, 6, 5, 5)
diff --git a/test/quantization/eager/test_equalize_eager.py b/test/quantization/eager/test_equalize_eager.py
index 2fd8557faae9..f08ff2b8d023 100644
--- a/test/quantization/eager/test_equalize_eager.py
+++ b/test/quantization/eager/test_equalize_eager.py
@@ -73,7 +73,7 @@ def test_equalize(self):
         '''
         class ChainModule(nn.Module):
             def __init__(self):
-                super(ChainModule, self).__init__()
+                super().__init__()
                 self.linear1 = nn.Linear(3, 4)
                 self.linear2 = nn.Linear(4, 5)
                 self.linear3 = nn.Linear(5, 6)
diff --git a/test/quantization/eager/test_fuse_eager.py b/test/quantization/eager/test_fuse_eager.py
index 1ebc4bfd094e..6343d044cfed 100644
--- a/test/quantization/eager/test_fuse_eager.py
+++ b/test/quantization/eager/test_fuse_eager.py
@@ -5,8 +5,8 @@
 import torch
 import torch.nn as nn
 import torch.ao.nn.quantized as nnq
-import torch.nn.intrinsic as nni
-import torch.nn.intrinsic.quantized as nniq
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.quantized as nniq
 import torch.ao.nn.intrinsic.qat as nniqat
 from torch.ao.quantization import (
     quantize,
diff --git a/test/quantization/eager/test_model_numerics.py b/test/quantization/eager/test_model_numerics.py
index bcefb78bd752..1a1ef3b917fc 100644
--- a/test/quantization/eager/test_model_numerics.py
+++ b/test/quantization/eager/test_model_numerics.py
@@ -95,8 +95,8 @@ def test_weight_only_activation_only_fakequant(self):
                 torch.manual_seed(67)
                 calib_data = torch.rand(2048, 3, 15, 15, dtype=torch.float32)
                 eval_data = torch.rand(10, 3, 15, 15, dtype=torch.float32)
-                qconfigset = set([torch.ao.quantization.default_weight_only_qconfig,
-                                  torch.ao.quantization.default_activation_only_qconfig])
+                qconfigset = {torch.ao.quantization.default_weight_only_qconfig,
+                              torch.ao.quantization.default_activation_only_qconfig}
                 SQNRTarget = [35, 45]
                 for idx, qconfig in enumerate(qconfigset):
                     my_model = ModelMultipleOpsNoAvgPool().to(torch.float32)
diff --git a/test/quantization/eager/test_numeric_suite_eager.py b/test/quantization/eager/test_numeric_suite_eager.py
index c8cf9c3dddf8..128f7cb96a06 100644
--- a/test/quantization/eager/test_numeric_suite_eager.py
+++ b/test/quantization/eager/test_numeric_suite_eager.py
@@ -40,7 +40,7 @@
 
 class SubModule(torch.nn.Module):
     def __init__(self):
-        super(SubModule, self).__init__()
+        super().__init__()
         self.qconfig = default_qconfig
         self.mod1 = torch.nn.Conv2d(3, 3, 3, bias=False).to(dtype=torch.float)
         self.mod2 = nn.ReLU()
@@ -57,7 +57,7 @@ def forward(self, x):
 
 class ModelWithSubModules(torch.nn.Module):
     def __init__(self):
-        super(ModelWithSubModules, self).__init__()
+        super().__init__()
         self.mod1 = SubModule()
         self.conv = torch.nn.Conv2d(3, 5, 3, bias=False).to(dtype=torch.float)
 
@@ -69,7 +69,7 @@ def forward(self, x):
 
 class ModelWithFunctionals(torch.nn.Module):
     def __init__(self):
-        super(ModelWithFunctionals, self).__init__()
+        super().__init__()
         self.mycat = nnq.FloatFunctional()
         self.myadd = nnq.FloatFunctional()
         self.mymul = nnq.FloatFunctional()
@@ -542,9 +542,9 @@ def _test_vision_model(self, float_model):
         float_model.to('cpu')
         float_model.eval()
         float_model.fuse_model()
-        float_model.qconfig = torch.quantization.default_qconfig
+        float_model.qconfig = torch.ao.quantization.default_qconfig
         img_data = [(torch.rand(2, 3, 224, 224, dtype=torch.float), torch.randint(0, 1, (2,), dtype=torch.long)) for _ in range(2)]
-        qmodel = quantize(float_model, torch.quantization.default_eval_fn, [img_data], inplace=False)
+        qmodel = quantize(float_model, torch.ao.quantization.default_eval_fn, [img_data], inplace=False)
 
         wt_compare_dict = compare_weights(float_model.state_dict(), qmodel.state_dict())
 
diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py
index ae0f6f164dce..9b3e1ddd76c3 100644
--- a/test/quantization/eager/test_quantize_eager_ptq.py
+++ b/test/quantization/eager/test_quantize_eager_ptq.py
@@ -121,7 +121,7 @@ def forward(self, x):
         original_ref_m.conv.weight = torch.nn.Parameter(original_m.conv.weight.detach())
         original_ref_m.conv.bias = torch.nn.Parameter(original_m.conv.bias.detach())
 
-        original_m.qconfig = torch.quantization.default_qconfig
+        original_m.qconfig = torch.ao.quantization.default_qconfig
 
         m = prepare(original_m)
         # calibration
@@ -135,7 +135,7 @@ def forward(self, x):
 
         # quantize the reference model
         original_ref_m.eval()
-        original_ref_m.qconfig = torch.quantization.default_qconfig
+        original_ref_m.qconfig = torch.ao.quantization.default_qconfig
 
         ref_m = prepare(original_ref_m)
         ref_m(data)
@@ -365,10 +365,10 @@ def checkQuantized(model):
                 # test one line API - out of place version
                 base = AnnotatedSingleLayerLinearModel(qengine)
                 base.qconfig = qconfig
-                keys_before = set(list(base.state_dict().keys()))
+                keys_before = set(base.state_dict().keys())
                 model = quantize(base, test_only_eval_fn, [self.calib_data])
                 checkQuantized(model)
-                keys_after = set(list(base.state_dict().keys()))
+                keys_after = set(base.state_dict().keys())
                 self.assertEqual(keys_before, keys_after)  # simple check that nothing changed
 
                 # in-place version
@@ -1077,9 +1077,9 @@ def __init__(self, d_model, nhead, batch_first):
         qengine = torch.backends.quantized.engine
         for batch_first in [True, False]:
             model = TransformerDecoderLayer(512, 8, batch_first)
-            quantization_config = torch.quantization.get_default_qconfig(qengine)
+            quantization_config = torch.ao.quantization.get_default_qconfig(qengine)
             model.qconfig = quantization_config
-            prepared_model = torch.quantization.prepare(model, inplace=False)
+            prepared_model = torch.ao.quantization.prepare(model, inplace=False)
             self.assertTrue(prepared_model.self_attn.batch_first == model.self_attn.batch_first)
 
 @skipIfNoFBGEMM
@@ -1107,10 +1107,10 @@ def checkQuantized(model):
 
             # test one line API - out of place version
             base = SingleLayerLinearDynamicModel()
-            keys_before = set(list(base.state_dict().keys()))
+            keys_before = set(base.state_dict().keys())
             model = quantize_dynamic(base, qconfig_dict)
             checkQuantized(model)
-            keys_after = set(list(base.state_dict().keys()))
+            keys_after = set(base.state_dict().keys())
             self.assertEqual(keys_before, keys_after)  # simple check that nothing changed
 
             # in-place version
@@ -1120,7 +1120,7 @@ def checkQuantized(model):
 
             # Test set qconfig
             model = SingleLayerLinearDynamicModel()
-            quantize_dynamic(model, set([nn.Linear]), inplace=True, dtype=dtype)
+            quantize_dynamic(model, {nn.Linear}, inplace=True, dtype=dtype)
             checkQuantized(model)
 
     def test_two_layers(self):
@@ -1362,7 +1362,7 @@ def checkQuantized(model, module_type):
 
             class ScriptWrapperPackedLSTM(torch.nn.Module):
                 def __init__(self, cell):
-                    super(ScriptWrapperPackedLSTM, self).__init__()
+                    super().__init__()
                     self.cell = cell
 
                 def forward(self, x: PackedSequence) -> Tuple[PackedSequence, Tuple[torch.Tensor, torch.Tensor]]:
@@ -1370,7 +1370,7 @@ def forward(self, x: PackedSequence) -> Tuple[PackedSequence, Tuple[torch.Tensor
 
             class ScriptWrapperPackedGRU(torch.nn.Module):
                 def __init__(self, cell):
-                    super(ScriptWrapperPackedGRU, self).__init__()
+                    super().__init__()
                     self.cell = cell
 
                 def forward(self, x: PackedSequence) -> Tuple[PackedSequence, torch.Tensor]:
diff --git a/test/quantization/eager/test_quantize_eager_qat.py b/test/quantization/eager/test_quantize_eager_qat.py
index 44911b6d9e11..d51fcbb99971 100644
--- a/test/quantization/eager/test_quantize_eager_qat.py
+++ b/test/quantization/eager/test_quantize_eager_qat.py
@@ -120,7 +120,7 @@ def reset_bn_parameters(self):
             init.uniform_(self.bias, -bound, bound)
 
     def reset_parameters(self):
-        super(_ReferenceConvBnNd, self).reset_parameters()
+        super().reset_parameters()
         # A hack to avoid resetting on undefined parameters
         if hasattr(self, 'gamma'):
             self.reset_bn_parameters()
@@ -191,7 +191,7 @@ def _forward(self, input):
 
     def extra_repr(self):
         # TODO(jerryzh): extend
-        return super(_ReferenceConvBnNd, self).extra_repr()
+        return super().extra_repr()
 
     def forward(self, input):
         return self.activation_post_process(self._forward(input))
@@ -226,7 +226,7 @@ def from_float(cls, mod, qconfig=None):
         return qat_convbn
 
 class _ReferenceConvBn2d(_ReferenceConvBnNd, nn.Conv2d):
-    _FLOAT_MODULE = torch.nn.intrinsic.ConvBn2d
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvBn2d
 
     def __init__(self,
                  # ConvNd args
@@ -1053,7 +1053,7 @@ def test_linear_bn_numerics(self):
         m = nniqat.LinearBn1d.from_float(m_ref_copy[0])
 
         # without fake_quants, fused QAT module should match fp32 module
-        m.apply(torch.quantization.disable_fake_quant)
+        m.apply(torch.ao.quantization.disable_fake_quant)
         data = torch.randn(4, 4)
         r1 = m_ref(data)
         r2 = m(data)
@@ -1076,7 +1076,7 @@ def test_linear_bn_symm_numerics(self):
         m = nniqat.LinearBn1d.from_float(m_ref_copy[0])
 
         # without fake_quants, fused QAT module should match fp32 module
-        m.apply(torch.quantization.disable_fake_quant)
+        m.apply(torch.ao.quantization.disable_fake_quant)
         data = torch.randn(4, 4)
         r1 = m_ref(data)
         r2 = m(data)
diff --git a/test/quantization/fx/test_equalize_fx.py b/test/quantization/fx/test_equalize_fx.py
index e3560fd29149..059c5bb68b9d 100644
--- a/test/quantization/fx/test_equalize_fx.py
+++ b/test/quantization/fx/test_equalize_fx.py
@@ -3,7 +3,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import torch.nn.intrinsic.quantized as nniq
+import torch.ao.nn.intrinsic.quantized as nniq
 import torch.ao.nn.quantized as nnq
 from torch.ao.quantization import default_qconfig
 from torch.ao.quantization.observer import MinMaxObserver, PerChannelMinMaxObserver
diff --git a/test/quantization/fx/test_model_report_fx.py b/test/quantization/fx/test_model_report_fx.py
index c688946eaf8b..85f99759f540 100644
--- a/test/quantization/fx/test_model_report_fx.py
+++ b/test/quantization/fx/test_model_report_fx.py
@@ -20,7 +20,7 @@
     default_per_channel_weight_observer,
     default_observer
 )
-from torch.nn.intrinsic.modules.fused import ConvReLU2d, LinearReLU
+from torch.ao.nn.intrinsic.modules.fused import ConvReLU2d, LinearReLU
 from torch.testing._internal.common_quantization import (
     ConvModel,
     QuantizationTestCase,
@@ -434,14 +434,14 @@ def test_qat_aware_model_example(self):
         # first we want a QAT model
         class QATConvLinearReluModel(torch.nn.Module):
             def __init__(self):
-                super(QATConvLinearReluModel, self).__init__()
+                super().__init__()
                 # QuantStub converts tensors from floating point to quantized
-                self.quant = torch.quantization.QuantStub()
+                self.quant = torch.ao.quantization.QuantStub()
                 self.conv = torch.nn.Conv2d(1, 1, 1)
                 self.bn = torch.nn.BatchNorm2d(1)
                 self.relu = torch.nn.ReLU()
                 # DeQuantStub converts tensors from quantized to floating point
-                self.dequant = torch.quantization.DeQuantStub()
+                self.dequant = torch.ao.quantization.DeQuantStub()
 
             def forward(self, x):
                 x = self.quant(x)
@@ -455,17 +455,17 @@ def forward(self, x):
             # create a model instance
             model_fp32 = QATConvLinearReluModel()
 
-            model_fp32.qconfig = torch.quantization.get_default_qat_qconfig("qnnpack")
+            model_fp32.qconfig = torch.ao.quantization.get_default_qat_qconfig("qnnpack")
 
             # model must be in eval mode for fusion
             model_fp32.eval()
-            model_fp32_fused = torch.quantization.fuse_modules(model_fp32, [["conv", "bn", "relu"]])
+            model_fp32_fused = torch.ao.quantization.fuse_modules(model_fp32, [["conv", "bn", "relu"]])
 
             # model must be set to train mode for QAT logic to work
             model_fp32_fused.train()
 
             # prepare the model for QAT, different than for post training quantization
-            model_fp32_prepared = torch.quantization.prepare_qat(model_fp32_fused)
+            model_fp32_prepared = torch.ao.quantization.prepare_qat(model_fp32_fused)
 
             # run the detector
             per_channel_detector = PerChannelDetector(torch.backends.quantized.engine)
@@ -704,7 +704,7 @@ def forward(self, x):
 
         class ModifiedThreeOps(torch.nn.Module):
             def __init__(self, batch_norm_dim):
-                super(ModifiedThreeOps, self).__init__()
+                super().__init__()
                 self.obs1 = ModelReportObserver()
                 self.linear = torch.nn.Linear(7, 3, 2)
                 self.obs2 = ModelReportObserver()
@@ -728,7 +728,7 @@ def forward(self, x):
 
         class HighDimensionNet(torch.nn.Module):
             def __init__(self):
-                super(HighDimensionNet, self).__init__()
+                super().__init__()
                 self.obs1 = ModelReportObserver()
                 self.fc1 = torch.nn.Linear(3, 7)
                 self.block1 = ModifiedThreeOps(3)
@@ -787,7 +787,7 @@ class TestFxModelReportDetectDynamicStatic(QuantizationTestCase):
     def test_nested_detection_case(self):
         class SingleLinear(torch.nn.Module):
             def __init__(self):
-                super(SingleLinear, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(3, 3)
 
             def forward(self, x):
@@ -796,7 +796,7 @@ def forward(self, x):
 
         class TwoBlockNet(torch.nn.Module):
             def __init__(self):
-                super(TwoBlockNet, self).__init__()
+                super().__init__()
                 self.block1 = SingleLinear()
                 self.block2 = SingleLinear()
 
@@ -895,17 +895,17 @@ def test_constructor(self):
             model_prep = quantize_fx.prepare_fx(model, q_config_mapping, model.get_example_inputs()[0])
 
             # make an example set of detectors
-            test_detector_set = set([DynamicStaticDetector(), PerChannelDetector(backend)])
+            test_detector_set = {DynamicStaticDetector(), PerChannelDetector(backend)}
             # initialize with an empty detector
             model_report = ModelReport(model_prep, test_detector_set)
 
             # make sure internal valid reports matches
-            detector_name_set = set([detector.get_detector_name() for detector in test_detector_set])
+            detector_name_set = {detector.get_detector_name() for detector in test_detector_set}
             self.assertEqual(model_report.get_desired_reports_names(), detector_name_set)
 
             # now attempt with no valid reports, should raise error
             with self.assertRaises(ValueError):
-                model_report = ModelReport(model, set([]))
+                model_report = ModelReport(model, set())
 
             # number of expected obs of interest entries
             num_expected_entries = len(test_detector_set)
@@ -932,7 +932,7 @@ def test_prepare_model_callibration(self):
             # make an example set of detectors
             torch.backends.quantized.engine = "fbgemm"
             backend = torch.backends.quantized.engine
-            test_detector_set = set([DynamicStaticDetector(), PerChannelDetector(backend)])
+            test_detector_set = {DynamicStaticDetector(), PerChannelDetector(backend)}
             # initialize with an empty detector
 
             # prepare the model
@@ -1029,8 +1029,8 @@ def test_generate_report(self):
             torch.backends.quantized.engine = "fbgemm"
 
             # check whether the correct number of reports are being generated
-            filled_detector_set = set([DynamicStaticDetector(), PerChannelDetector(torch.backends.quantized.engine)])
-            single_detector_set = set([DynamicStaticDetector()])
+            filled_detector_set = {DynamicStaticDetector(), PerChannelDetector(torch.backends.quantized.engine)}
+            single_detector_set = {DynamicStaticDetector()}
 
             # create our models
             model_full = TwoThreeOps()
@@ -1316,7 +1316,7 @@ def test_input_weight_equalization_determine_points(self):
         # then create model report instance with detector
         with override_quantized_engine('fbgemm'):
 
-            detector_set = set([InputWeightEqualizationDetector(0.5)])
+            detector_set = {InputWeightEqualizationDetector(0.5)}
 
             # get tst model and callibrate
             non_fused = self._get_prepped_for_calibration_model(self.TwoBlockComplexNet(), detector_set)
@@ -1326,10 +1326,10 @@ def test_input_weight_equalization_determine_points(self):
             for prepared_for_callibrate_model, mod_report in [non_fused, fused]:
 
                 # supported modules to check
-                mods_to_check = set([nn.Linear, nn.Conv2d])
+                mods_to_check = {nn.Linear, nn.Conv2d}
 
                 # get the set of all nodes in the graph their fqns
-                node_fqns = set([node.target for node in prepared_for_callibrate_model.graph.nodes])
+                node_fqns = {node.target for node in prepared_for_callibrate_model.graph.nodes}
 
                 # there should be 4 node fqns that have the observer inserted
                 correct_number_of_obs_inserted = 4
@@ -1362,7 +1362,7 @@ def test_input_weight_equalization_report_gen(self):
         with override_quantized_engine('fbgemm'):
 
             test_input_weight_detector = InputWeightEqualizationDetector(0.4)
-            detector_set = set([test_input_weight_detector])
+            detector_set = {test_input_weight_detector}
             model = self.TwoBlockComplexNet()
             # prepare the model for callibration
             prepared_for_callibrate_model, model_report = self._get_prepped_for_calibration_model(
@@ -1471,7 +1471,7 @@ def test_input_weight_equalization_report_gen_empty(self):
         # then create model report instance with detector
         with override_quantized_engine('fbgemm'):
             test_input_weight_detector = InputWeightEqualizationDetector(0.4)
-            detector_set = set([test_input_weight_detector])
+            detector_set = {test_input_weight_detector}
             model = self.ReluOnly()
             # prepare the model for callibration
             prepared_for_callibrate_model, model_report = self._get_prepped_for_calibration_model(model, detector_set)
@@ -1547,7 +1547,7 @@ def test_outlier_detection_determine_points(self):
         # not explicitly testing fusion because fx workflow automatically
         with override_quantized_engine('fbgemm'):
 
-            detector_set = set([OutlierDetector(reference_percentile=0.95)])
+            detector_set = {OutlierDetector(reference_percentile=0.95)}
 
             # get tst model and callibrate
             prepared_for_callibrate_model, mod_report = self._get_prepped_for_calibration_model(
@@ -1555,7 +1555,7 @@ def test_outlier_detection_determine_points(self):
             )
 
             # supported modules to check
-            mods_to_check = set([nn.Linear, nn.Conv2d, nn.ReLU])
+            mods_to_check = {nn.Linear, nn.Conv2d, nn.ReLU}
 
             # there should be 4 node fqns that have the observer inserted
             correct_number_of_obs_inserted = 4
@@ -1590,7 +1590,7 @@ def test_no_outlier_report_gen(self):
             dynamic_static_detector = DynamicStaticDetector(tolerance=0.5)
 
             param_size: int = 4
-            detector_set = set([outlier_detector, dynamic_static_detector])
+            detector_set = {outlier_detector, dynamic_static_detector}
             model = self.LargeBatchModel(param_size=param_size)
 
             # get tst model and callibrate
@@ -1640,7 +1640,7 @@ def test_all_outlier_report_gen(self):
             outlier_detector = OutlierDetector(ratio_threshold=1, reference_percentile=0)
 
             param_size: int = 16
-            detector_set = set([outlier_detector])
+            detector_set = {outlier_detector}
             model = self.LargeBatchModel(param_size=param_size)
 
             # get tst model and callibrate
@@ -1690,7 +1690,7 @@ def test_multiple_run_consistent_spike_outlier_report_gen(self):
             outlier_detector = OutlierDetector(reference_percentile=0.95)
 
             param_size: int = 8
-            detector_set = set([outlier_detector])
+            detector_set = {outlier_detector}
             model = self.LargeBatchModel(param_size=param_size)
 
             # get tst model and callibrate
@@ -1874,8 +1874,8 @@ def test_generate_tables_match_with_report(self):
             channel_headers, channel_table = table_dict[ModelReportVisualizer.TABLE_CHANNEL_KEY]
 
             # these two together should be the same as the generated report info in terms of keys
-            tensor_info_modules = set(row[1] for row in tensor_table)
-            channel_info_modules = set(row[1] for row in channel_table)
+            tensor_info_modules = {row[1] for row in tensor_table}
+            channel_info_modules = {row[1] for row in channel_table}
             combined_modules: Set = tensor_info_modules.union(channel_info_modules)
 
             generated_report_keys: Set = set(mod_rep_visualizer.generated_reports.keys())
@@ -1901,8 +1901,8 @@ def test_generate_tables_no_match(self):
             tensor_headers, tensor_table = empty_tables_dict[ModelReportVisualizer.TABLE_TENSOR_KEY]
             channel_headers, channel_table = empty_tables_dict[ModelReportVisualizer.TABLE_CHANNEL_KEY]
 
-            tensor_info_modules = set(row[1] for row in tensor_table)
-            channel_info_modules = set(row[1] for row in channel_table)
+            tensor_info_modules = {row[1] for row in tensor_table}
+            channel_info_modules = {row[1] for row in channel_table}
             combined_modules: Set = tensor_info_modules.union(channel_info_modules)
             self.assertEqual(len(combined_modules), 0)  # should be no matching modules
 
@@ -1946,7 +1946,7 @@ def _get_prepped_for_calibration_model_helper(model, detector_set, example_input
 
     # if they passed in fusion paramter, make sure to test that
     if fused:
-        model = torch.quantization.fuse_modules(model, model.get_fusion_modules())
+        model = torch.ao.quantization.fuse_modules(model, model.get_fusion_modules())
 
     model_prep = quantize_fx.prepare_fx(model, q_config_mapping, example_input)
 
diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py
index eb7dcdfac355..f84e20487753 100644
--- a/test/quantization/fx/test_numeric_suite_fx.py
+++ b/test/quantization/fx/test_numeric_suite_fx.py
@@ -11,8 +11,9 @@
 from torch.ao.quantization import (
     default_dynamic_qconfig,
     QConfigMapping,
+    get_default_qconfig_mapping,
 )
-import torch.nn.quantized as nnq
+import torch.ao.nn.quantized as nnq
 toq = torch.ops.quantized
 from torch.ao.quantization.quantize_fx import (
     convert_fx,
@@ -26,6 +27,8 @@
     ConvModel,
     QuantizationTestCase,
     skipIfNoFBGEMM,
+    skipIfNoQNNPACK,
+    withQNNPACKBackend,
     SingleLayerLinearDynamicModel,
     SingleLayerLinearModel,
     LSTMwithHiddenDynamicModel,
@@ -82,6 +85,8 @@
     print_comparisons_n_shadows_model,
     loggers_set_enabled,
     loggers_set_save_activations,
+    _prepare_n_shadows_add_loggers_model,
+    _n_shadows_compare_weights,
 )
 from torch.ao.ns.fx.qconfig_multi_mapping import QConfigMultiMapping
 from torch.ao.quantization.backend_config import get_native_backend_config
@@ -385,9 +390,6 @@ def test_simple_mod_multi(self):
     @skipIfNoFBGEMM
     def test_simple_tensor_ops(self):
         class M(nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 z = x + y
                 return z
@@ -428,9 +430,6 @@ def test_matching_failure_node_type(self):
     def test_nodes_before_cat(self):
         # verify that nodes before cat get matched
         class M(nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x0):
                 x1 = torch.add(x0, 1.0)
                 y1 = torch.add(x0, 1.0)
@@ -463,9 +462,6 @@ def forward(self, x0):
     def test_dict_return_type(self):
         # verify that we can traverse up nodes which return dictionaries
         class M(nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x0):
                 x1 = torch.add(x0, 1.0)
                 y1 = torch.add(x0, 1.0)
@@ -934,10 +930,13 @@ def _test_match_shadow_activations(
             m.eval()
         else:
             m.train()
+        print("qconfig_dict:", qconfig_dict)
         mp = prepare_fn(copy.deepcopy(m), qconfig_dict, example_inputs=data)
+        print("prepared:", mp)
         mp(*data)
         mp_copy = copy.deepcopy(mp)
         mq = convert_fx(mp_copy)
+        print("quantized:", mq)
 
         if compare_fp32_vs_fp32_prepared:
             m_shadows_mp = add_shadow_loggers(
@@ -2072,7 +2071,7 @@ def forward(self, x):
             mt_shadows_mt_copy, OutputLogger, 'b')
         self.assertTrue(len(act_compare_dict) == 1)
 
-@skipIfNoFBGEMM
+@skipIfNoQNNPACK
 class TestFXNumericSuiteNShadows(FXNumericSuiteQuantizationTestCase):
     """
     Tests the "n shadows" workflow.
@@ -2100,6 +2099,7 @@ def _test_impl(self, m, example_input, qconfig_mappings):
         print_comparisons_n_shadows_model(results)
         return msq
 
+    @withQNNPACKBackend
     def test_linear_mod(self):
         class M(nn.Module):
             def __init__(self):
@@ -2114,9 +2114,10 @@ def forward(self, x):
         example_input = (torch.randn(2, 2),)
 
         qconfig_mappings = \
-            QConfigMultiMapping().set_global([torch.quantization.default_qconfig])
+            QConfigMultiMapping().set_global([torch.ao.quantization.default_qconfig])
         self._test_impl(m, example_input, qconfig_mappings)
 
+    @withQNNPACKBackend
     def test_linear_relu_mod(self):
         class M(nn.Module):
             def __init__(self):
@@ -2136,12 +2137,13 @@ def forward(self, x):
 
         qconfig_mappings = (
             QConfigMultiMapping().set_global([
-                torch.quantization.default_qconfig,
-                torch.quantization.default_dynamic_qconfig
+                torch.ao.quantization.default_qconfig,
+                torch.ao.quantization.default_dynamic_qconfig
             ])
         )
         self._test_impl(m, example_input, qconfig_mappings)
 
+    @withQNNPACKBackend
     def test_conv_bn_relu_mod(self):
         class M(nn.Module):
             def __init__(self):
@@ -2161,11 +2163,12 @@ def forward(self, x):
 
         qconfig_mappings = QConfigMultiMapping() \
             .set_global([
-                torch.quantization.default_qconfig,
-                torch.quantization.default_per_channel_qconfig
+                torch.ao.quantization.default_qconfig,
+                torch.ao.quantization.default_per_channel_qconfig
             ])
         self._test_impl(m, example_input, qconfig_mappings)
 
+    @withQNNPACKBackend
     def test_functions(self):
         class M(nn.Module):
             def __init__(self):
@@ -2201,9 +2204,10 @@ def forward(self, x):
         example_input = (torch.randn(2, 2),)
 
         qconfig_mappings = QConfigMultiMapping() \
-            .set_global([torch.quantization.default_qconfig])
+            .set_global([torch.ao.quantization.default_qconfig])
         self._test_impl(m, example_input, qconfig_mappings)
 
+    @withQNNPACKBackend
     def test_partial_qconfig_mapping(self):
         class M(nn.Module):
             def __init__(self):
@@ -2229,12 +2233,13 @@ def forward(self, x):
             .set_object_type(F.relu, [qconfig])
         self._test_impl(m, example_input, qconfig_mappings)
 
+    @withQNNPACKBackend
     def test_logger_enabled_and_save_activations_flags(self):
         m = nn.Sequential(nn.Linear(1, 1)).eval()
         example_input = (torch.randn(1, 1),)
 
         qconfig_mappings = QConfigMultiMapping() \
-            .set_global([torch.quantization.default_qconfig])
+            .set_global([torch.ao.quantization.default_qconfig])
         backend_config = get_native_backend_config()
 
         msp = prepare_n_shadows_model(
@@ -2277,6 +2282,7 @@ def _check_logger_count(model, exp_count_stats, exp_count_comparisons):
         _check_logger_count(msq, 0, 1)
 
     @skip_if_no_torchvision
+    @withQNNPACKBackend
     def test_mobilenet_v2(self):
         import torchvision
         m = torchvision.models.quantization.mobilenet_v2(
@@ -2284,17 +2290,19 @@ def test_mobilenet_v2(self):
         example_input = (torch.randn(1, 3, 224, 224),)
 
         qconfig_mappings = QConfigMultiMapping() \
-            .set_global([torch.quantization.default_qconfig, torch.quantization.default_dynamic_qconfig])
+            .set_global([torch.ao.quantization.default_qconfig, torch.ao.quantization.default_dynamic_qconfig])
 
         self._test_impl(m, example_input, qconfig_mappings)
 
+    @withQNNPACKBackend
     def test_qconfig_multi_mapping_deduplication(self):
         # check that insertion deduplicates qconfigs
         qconfig_multi_mapping = QConfigMultiMapping().set_global(
-            [torch.quantization.default_qconfig, torch.quantization.default_qconfig]
+            [torch.ao.quantization.default_qconfig, torch.ao.quantization.default_qconfig]
         )
         self.assertEqual(len(qconfig_multi_mapping.qconfig_mappings_list), 1)
 
+    @withQNNPACKBackend
     def test_qconfig_multi_mapping_insert_padding(self):
         # test that inserting a higher priority qconfig style with fewer elements than a lower priority qconfig will
         # result in adding None to the extra QConfigMappings at that same style+key
@@ -2302,15 +2310,15 @@ def test_qconfig_multi_mapping_insert_padding(self):
             QConfigMultiMapping()
             .set_global(
                 [
-                    torch.quantization.default_qconfig,
-                    torch.quantization.default_dynamic_qconfig,
+                    torch.ao.quantization.default_qconfig,
+                    torch.ao.quantization.default_dynamic_qconfig,
                 ]
             )
-            .set_object_type(torch.nn.Linear, [torch.quantization.default_qconfig])
-            .set_module_name_regex("fc", [torch.quantization.default_qconfig])
-            .set_module_name("fc2", [torch.quantization.default_qconfig])
+            .set_object_type(torch.nn.Linear, [torch.ao.quantization.default_qconfig])
+            .set_module_name_regex("fc", [torch.ao.quantization.default_qconfig])
+            .set_module_name("fc2", [torch.ao.quantization.default_qconfig])
             .set_module_name_object_type_order(
-                "", nn.Linear, 0, [torch.quantization.default_qconfig]
+                "", nn.Linear, 0, [torch.ao.quantization.default_qconfig]
             )
         )
 
@@ -2337,21 +2345,22 @@ def test_qconfig_multi_mapping_insert_padding(self):
             None,
         )
 
+    @withQNNPACKBackend
     def test_qconfig_multi_mapping_retroactive_padding(self):
         # test that inserting a lower priority qconfig style with more elements thhan lower priority qconfig styles
         # will result in the new QConfigMapping having None at all previously existing styles+keys
         qconfig_multi_mapping = (
             QConfigMultiMapping()
-            .set_object_type(torch.nn.Linear, [torch.quantization.default_qconfig])
-            .set_module_name_regex("fc", [torch.quantization.default_qconfig])
-            .set_module_name("fc2", [torch.quantization.default_qconfig])
+            .set_object_type(torch.nn.Linear, [torch.ao.quantization.default_qconfig])
+            .set_module_name_regex("fc", [torch.ao.quantization.default_qconfig])
+            .set_module_name("fc2", [torch.ao.quantization.default_qconfig])
             .set_module_name_object_type_order(
-                "", nn.Linear, 0, [torch.quantization.default_qconfig]
+                "", nn.Linear, 0, [torch.ao.quantization.default_qconfig]
             )
             .set_global(
                 [
-                    torch.quantization.default_qconfig,
-                    torch.quantization.default_dynamic_qconfig,
+                    torch.ao.quantization.default_qconfig,
+                    torch.ao.quantization.default_dynamic_qconfig,
                 ]
             )
         )
@@ -2379,6 +2388,7 @@ def test_qconfig_multi_mapping_retroactive_padding(self):
             None,
         )
 
+    @withQNNPACKBackend
     def test_qconfig_multi_mapping_end_to_end(self):
         # test that the prepare/convert_n_shadows_model works as expected
         # with qconfig_multi_mapping and avoids unwanted matches
@@ -2390,11 +2400,11 @@ def test_qconfig_multi_mapping_end_to_end(self):
             QConfigMultiMapping()
             .set_global(
                 [
-                    torch.quantization.default_qconfig,
-                    torch.quantization.default_dynamic_qconfig,
+                    torch.ao.quantization.default_qconfig,
+                    torch.ao.quantization.default_dynamic_qconfig,
                 ]
             )
-            .set_module_name("fc2", [None, torch.quantization.default_qconfig])
+            .set_module_name("fc2", [None, torch.ao.quantization.default_qconfig])
         )
         self.assertEqual(
             qconfig_multi_mapping.qconfig_mappings_list[1].module_name_qconfigs["fc2"],
@@ -2407,6 +2417,7 @@ def test_qconfig_multi_mapping_end_to_end(self):
         self.checkQuantizedLinear(msq.shadow_wrapper_1_1.mod_0)
         self.assertRaisesRegex(AttributeError, ".*", lambda: msq.shadow_wrapper_1_2)
 
+    @withQNNPACKBackend
     def test_qconfig_multi_mapping_from_list(self):
         # test QConfigMultiMapping.from_list_qconfig_mapping works as expected
 
@@ -2414,10 +2425,10 @@ def test_qconfig_multi_mapping_from_list(self):
         example_input = m.get_example_inputs()
 
         qconfig_mappings_list = [
-            QConfigMapping().set_global(torch.quantization.default_qconfig),
+            QConfigMapping().set_global(torch.ao.quantization.default_qconfig),
             QConfigMapping()
-            .set_global(torch.quantization.default_dynamic_qconfig)
-            .set_module_name("fc2", torch.quantization.default_qconfig),
+            .set_global(torch.ao.quantization.default_dynamic_qconfig)
+            .set_module_name("fc2", torch.ao.quantization.default_qconfig),
         ]
 
         qconfig_multi_mapping = QConfigMultiMapping().from_list_qconfig_mapping(
@@ -2435,6 +2446,7 @@ def test_qconfig_multi_mapping_from_list(self):
         self.checkQuantizedLinear(msq.shadow_wrapper_1_1.mod_0)
         self.assertRaisesRegex(AttributeError, ".*", lambda: msq.shadow_wrapper_1_2)
 
+    @withQNNPACKBackend
     def test_qconfig_multi_mapping_ordering(self):
         # test that the module ordering ignores None
 
@@ -2465,6 +2477,7 @@ def test_qconfig_multi_mapping_ordering(self):
         self.checkDynamicQuantizedLinear(msq.shadow_wrapper_1_1.mod_0, torch.qint8)
         self.checkQuantizedLinear(msq.shadow_wrapper_1_2.mod_0)
 
+    @withQNNPACKBackend
     def test_qconfig_multi_mapping_repr(self):
         qconfig_multi_mapping = (
             QConfigMultiMapping()
@@ -2485,6 +2498,7 @@ def test_qconfig_multi_mapping_repr(self):
         )
         self.assertTrue(isinstance(qconfig_multi_mapping.__repr__(), str))
 
+    @withQNNPACKBackend
     def test_custom_functions_and_tracer(self):
         class M(nn.Module):
             def __init__(self):
@@ -2501,7 +2515,7 @@ def forward(self, x):
         example_inputs = (torch.randn(2, 2),)
 
         qconfig_mappings = QConfigMultiMapping().set_global(
-            [torch.quantization.default_qat_qconfig]
+            [torch.ao.quantization.default_qat_qconfig]
         )
 
         custom_tracer = torch.ao.quantization.quantize_fx.QuantizationTracer(
@@ -2545,6 +2559,193 @@ def custom_convert_fn(module, to_print):
         results = extract_results_n_shadows_model(msq)
         print_comparisons_n_shadows_model(results)
 
+    def _test_extract_weights_impl(self, m, example_input, qconfig_mapping):
+        backend_config = get_native_backend_config()
+        results = _n_shadows_compare_weights(
+            m, example_input, qconfig_mapping, backend_config)
+        print_comparisons_n_shadows_model(results)
+
+    @withQNNPACKBackend
+    def test_extract_weights_linear(self):
+        class M(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.w1 = nn.Parameter(torch.randn(2, 2))
+                self.b1 = nn.Parameter(torch.randn(2))
+                torch.nn.init.kaiming_uniform_(self.w1, a=math.sqrt(5))
+                self.w2 = nn.Parameter(torch.randn(2, 2))
+                self.b2 = nn.Parameter(torch.randn(2))
+                torch.nn.init.kaiming_uniform_(self.w2, a=math.sqrt(5))
+                self.w3 = nn.Parameter(torch.randn(2, 2))
+                self.b3 = nn.Parameter(torch.randn(2))
+                torch.nn.init.kaiming_uniform_(self.w3, a=math.sqrt(5))
+                self.w4 = nn.Parameter(torch.randn(2, 2))
+                self.b4 = nn.Parameter(torch.randn(2))
+                torch.nn.init.kaiming_uniform_(self.w4, a=math.sqrt(5))
+
+            def forward(self, x):
+                x = F.linear(x, self.w1, self.b1)
+                x = F.linear(x, self.w2, self.b2)
+                x = F.relu(x)
+                x = F.linear(x, self.w3, self.b3)
+                x = F.linear(x, self.w4, self.b4)
+                return x
+
+        per_tensor_qconfig = torch.ao.quantization.default_qconfig
+
+        m = M().eval()
+        example_input = (torch.randn(2, 2),)
+        qconfig_mapping = get_default_qconfig_mapping()
+        # test unquantized
+        qconfig_mapping.set_module_name_object_type_order(
+            '', F.linear, 2, None)
+        # test per-tensor
+        qconfig_mapping.set_module_name_object_type_order(
+            '', F.linear, 3, per_tensor_qconfig)
+        self._test_extract_weights_impl(m, example_input, qconfig_mapping)
+
+
+    def _test_add_loggers_impl(self, m, example_input, qconfig_mapping):
+        backend_config = get_native_backend_config()
+        m_copy = copy.deepcopy(m)
+
+        # test that input is valid
+        _ = m(*example_input)
+
+        msp = _prepare_n_shadows_add_loggers_model(
+            m, example_input, qconfig_mapping, backend_config)
+        # print('msp', msp)
+
+        msp(*example_input)
+
+        msq = convert_n_shadows_model(msp)
+        # print('msq', msq)
+
+        loggers_set_enabled(msq, True)
+        output_fp32 = msq(*example_input)
+
+        results = extract_results_n_shadows_model(msq)
+        # print(results)
+        # print_comparisons_n_shadows_model(results)
+
+        # get the last quantized output from results
+        inner_results = results['model']['node_output']
+        last_subgraph = list(inner_results.keys())[-1]
+        output_shadow = inner_results[last_subgraph][0]['values'][-1]
+
+        # verify that both fp32 and quantized output matches reference
+        output_fp32_ref = m_copy(*example_input)
+        mp_ref = prepare_fx(m_copy, qconfig_mapping, example_input)
+        for _ in range(2):
+            mp_ref(*example_input)
+        mq_ref = convert_fx(mp_ref)
+        output_shadow_ref = mq_ref(*example_input)
+        self.assertTrue(
+            torch.allclose(output_fp32, output_fp32_ref),
+            f"fp32 comparison: {output_fp32} not close to {output_fp32_ref}")
+
+        # print('shadow', output_shadow.shape, output_shadow)
+        # print('shadow_ref', output_shadow_ref.shape, output_shadow_ref)
+
+        self.assertTrue(
+            torch.allclose(output_shadow, output_shadow_ref),
+            f"shadow comparison: {output_shadow} not close to {output_shadow_ref}")
+
+        return msq
+
+    @withQNNPACKBackend
+    def test_add_loggers_linear_mod_quant_quant(self):
+        m = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
+        example_input = (torch.randn(2, 2),)
+        qconfig_mapping = get_default_qconfig_mapping()
+        self._test_add_loggers_impl(m, example_input, qconfig_mapping)
+
+    @withQNNPACKBackend
+    def test_add_loggers_linear_mod_fp32_quant(self):
+        m = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
+        example_input = (torch.randn(2, 2),)
+        qconfig_mapping = get_default_qconfig_mapping()
+        qconfig_mapping.set_module_name('0', None)
+        self._test_add_loggers_impl(m, example_input, qconfig_mapping)
+
+    @withQNNPACKBackend
+    def test_add_loggers_linear_mod_quant_fp32(self):
+        m = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
+        example_input = (torch.randn(2, 2),)
+        qconfig_mapping = get_default_qconfig_mapping()
+        qconfig_mapping.set_module_name('1', None)
+        self._test_add_loggers_impl(m, example_input, qconfig_mapping)
+
+    @withQNNPACKBackend
+    def test_add_loggers_linear_mod_fp32_fp32(self):
+        m = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
+        example_input = (torch.randn(2, 2),)
+        qconfig_mapping = get_default_qconfig_mapping()
+        qconfig_mapping.set_module_name('0', None)
+        qconfig_mapping.set_module_name('1', None)
+        self._test_add_loggers_impl(m, example_input, qconfig_mapping)
+
+    @withQNNPACKBackend
+    def test_add_loggers_conv_bn_relu_fusion_quant(self):
+        m = nn.Sequential(nn.Conv2d(1, 1, 1), nn.BatchNorm2d(1), nn.ReLU())
+        m.eval()
+        example_input = (torch.randn(16, 1, 4, 4),)
+        qconfig_mapping = get_default_qconfig_mapping()
+        self._test_add_loggers_impl(m, example_input, qconfig_mapping)
+
+    @withQNNPACKBackend
+    def test_add_loggers_conv_bn_relu_fusion_fp32(self):
+        m = nn.Sequential(nn.Conv2d(1, 1, 1), nn.BatchNorm2d(1), nn.ReLU())
+        m.eval()
+        example_input = (torch.randn(16, 1, 4, 4),)
+        qconfig_mapping = get_default_qconfig_mapping()
+        qconfig_mapping.set_module_name('0', None)
+        qconfig_mapping.set_module_name('1', None)
+        qconfig_mapping.set_module_name('2', None)
+        self._test_add_loggers_impl(m, example_input, qconfig_mapping)
+
+    @withQNNPACKBackend
+    def test_add_loggers_functions(self):
+        class M(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.w1 = nn.Parameter(torch.randn(2, 2))
+                self.b1 = nn.Parameter(torch.randn(2))
+                torch.nn.init.kaiming_uniform_(self.w1, a=math.sqrt(5))
+
+            def forward(self, x):
+                x = F.linear(x, self.w1, self.b1)
+                x = F.relu(x)
+                x = x + x
+                x = x + 1
+                # TODO(future PR): support first arg being a scalar
+                # x = 1 + x
+                x = torch.cat([x, x])
+                x = torch.cat([x, x])
+                x = torch.cat(tensors=[x, x])
+                # function not matchable by quantization
+                x = torch.nn.functional.rrelu(x)
+                x = F.linear(x, self.w1, self.b1)
+                return x
+
+        m = M().eval()
+        example_input = (torch.randn(16, 2),)
+        for qconfig_mapping in (
+            get_default_qconfig_mapping(),
+            QConfigMapping(),
+        ):
+            self._test_add_loggers_impl(m, example_input, qconfig_mapping)
+
+    @skip_if_no_torchvision
+    @withQNNPACKBackend
+    def test_add_loggers_mobilenet_v2(self):
+        import torchvision
+        m = torchvision.models.quantization.mobilenet_v2(
+            pretrained=False, quantize=False).eval()
+        example_input = (torch.randn(8, 3, 224, 224),)
+        qconfig_mapping = get_default_qconfig_mapping()
+        self._test_add_loggers_impl(m, example_input, qconfig_mapping)
+
 
 class TestFXNumericSuiteCoreAPIsModels(FXNumericSuiteQuantizationTestCase):
     """
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index ecfc0f3be730..19f2d12337f3 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -10,8 +10,9 @@
 import torch.ao.nn.quantized.dynamic as nnqd
 import torch.ao.nn.intrinsic as nni
 import torch.ao.nn.intrinsic.quantized as nniq
-import torch.nn.intrinsic.quantized.dynamic as nniqd
+import torch.ao.nn.intrinsic.quantized.dynamic as nniqd
 import torch.multiprocessing as mp
+from torch.fx.graph_module import _USER_PRESERVED_ATTRIBUTES_KEY
 
 # graph mode quantization based on fx
 from torch.ao.quantization.quantize_fx import (
@@ -157,6 +158,7 @@
     LinearReluModel,
     LinearBnLeakyReluModel,
     LinearTanhModel,
+    ConvBnAddReluModel,
     QuantizationTestCase,
     skipIfNoFBGEMM,
     skipIfNoQNNPACK,
@@ -447,6 +449,114 @@ def test_linear_tanh_not_fused_by_default(self):
             expected_node_list=expected_nodes,
             expected_node_occurrence=expected_occurrence)
 
+    def test_fuse_conv_bn_add_relu_onednn(self):
+        # conv - bn - add - relu is fused for onednn backend only
+        from torch.ao.quantization.backend_config import get_onednn_backend_config
+        options = itertools.product(
+            [True, False],  # with_bn
+            [True, False],  # with_relu
+            [True, False],  # conv in the left
+            [True, False],  # with_two_conv
+            [True, False],  # use_torch_add
+        )
+        for with_bn, with_relu, left_conv, two_conv, use_torch_add in options:
+            expected_nodes = [
+                ns.call_module(nni.ConvAddReLU2d if with_relu else nni.ConvAdd2d),
+            ]
+            expected_occurrence = {
+                ns.call_module(nni.ConvAddReLU2d if with_relu else nni.ConvAdd2d): 1,
+                ns.call_module(nn.BatchNorm2d): 0,
+            }
+
+            # test eval mode
+            m = ConvBnAddReluModel(
+                with_bn=with_bn,
+                with_relu=with_relu,
+                left_conv=left_conv,
+                two_conv=two_conv,
+                use_torch_add=use_torch_add).eval()
+
+            m = fuse_fx(m,
+                        backend_config=get_onednn_backend_config())
+            self.checkGraphModuleNodes(
+                m,
+                expected_node_list=expected_nodes,
+                expected_node_occurrence=expected_occurrence)
+
+    def test_fuse_conv_bn_add_relu_by_default(self):
+        options = itertools.product(
+            [True, False],  # with_bn
+            [True, False],  # with_relu
+            [True, False],  # conv in the left
+            [True, False],  # with_two_conv
+            [True, False],  # use_torch_add
+        )
+        for with_bn, with_relu, left_conv, two_conv, use_torch_add in options:
+            # test eval mode
+            expected_nodes = [
+                ns.call_module(nn.Conv2d),
+            ]
+            expected_occurrence = {
+                ns.call_module(nni.ConvAdd2d): 0,
+            }
+            m = ConvBnAddReluModel(
+                with_bn=with_bn,
+                with_relu=with_relu,
+                left_conv=left_conv,
+                two_conv=two_conv,
+                use_torch_add=use_torch_add).eval()
+            m = fuse_fx(m)
+            self.checkGraphModuleNodes(
+                m,
+                expected_node_list=expected_nodes,
+                expected_node_occurrence=expected_occurrence)
+
+    @skipIfNoONEDNN
+    def test_fuse_conv_bn_add_relu_lowering(self):
+        """ Test fusion and lowering of Conv2d - (bn -) ReLU
+            by FX. For onednn backedn only.
+        """
+        from torch.ao.quantization.backend_config import get_onednn_backend_config
+        qconfig_mapping = get_default_qconfig_mapping('onednn')
+        with override_quantized_engine('onednn'):
+            options = itertools.product(
+                [True, False],  # with_bn
+                [True, False],  # with_relu
+                [True, False],  # conv in the left
+                [True, False],  # two_conv
+                [True, False],  # use_torch_add
+            )
+            for with_bn, with_relu, left_conv, two_conv, use_torch_add in options:
+                node_occurrence = {
+                    ns.call_function(torch.quantize_per_tensor): 1 if two_conv else 2,
+                    ns.call_method("dequantize"): 1,
+                    ns.call_module(nniq.ConvAddReLU2d if with_relu else nniq.ConvAdd2d): 1,
+                    ns.call_module(nn.Conv2d): 0,
+                    ns.call_module(nn.ReLU): 0,
+                }
+                node_occurrence_ref = {
+                    ns.call_function(torch.quantize_per_tensor): 3,
+                    ns.call_method("dequantize"): 3,
+                }
+
+                # test eval mode
+                m = ConvBnAddReluModel(
+                    with_bn=with_bn,
+                    with_relu=with_relu,
+                    left_conv=left_conv,
+                    two_conv=two_conv,
+                    use_torch_add=use_torch_add).eval()
+                example_x = m.get_example_inputs()
+                m = prepare_fx(m, qconfig_mapping,
+                               example_inputs=example_x,
+                               backend_config=get_onednn_backend_config())
+                m_copy = copy.deepcopy(m)
+                m = convert_fx(m, backend_config=get_onednn_backend_config())
+                m_ref = convert_to_reference_fx(m_copy)
+                self.checkGraphModuleNodes(m, expected_node_occurrence=node_occurrence)
+                self.checkGraphModuleNodes(m_ref, expected_node_occurrence=node_occurrence_ref)
+                m(*example_x)
+
     def test_fuse_convtranspose_bn_eval(self):
 
         m = ModelForConvTransposeBNFusion().eval()
@@ -573,7 +683,7 @@ def forward(self, x):
         }
         m = prepare_fx(model, qconfig_dict, example_inputs=(torch.randn(1, 5),))
 
-        self.checkGraphModuleNodes(m, expected_node=ns.call_module(torch.nn.intrinsic.modules.fused.LinearReLU))
+        self.checkGraphModuleNodes(m, expected_node=ns.call_module(torch.ao.nn.intrinsic.modules.fused.LinearReLU))
 
     @unittest.skip("Temporarily skipping the test case, will enable after the simple"
                    "pattern format is supported")
@@ -770,7 +880,7 @@ def conv_res_relu_extra_inputs_getter(pattern):
         m = fuse_fx(m, backend_config=backend_config)
         self.assertEqual(type(m.conv1), torch.nn.Conv2d)
         self.assertEqual(type(m.conv2), torch.nn.Conv2d)
-        # check relu are gone since we replaced the both patterns to conv
+        # check relu are gone since we replaced both patterns to conv
         self.assertFalse(hasattr(m, "relu1"))
         self.assertFalse(hasattr(m, "relu2"))
 
@@ -1338,7 +1448,7 @@ def test_qat_prepare_device_affinity(self):
         class Model(nn.Module):
 
             def __init__(self):
-                super(Model, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(1, 1, 1)
                 self.bn = nn.BatchNorm2d(1)
                 self.relu = nn.ReLU()
@@ -1590,7 +1700,7 @@ def test_standalone_module_quantized_interface(self):
     def test_qconfig_none(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(1, 1, 1)
                 self.conv2 = nn.Conv2d(1, 1, 1)
 
@@ -1688,9 +1798,6 @@ def forward(self, x):
 
     def test_qconfig_function(self):
         class M(torch.nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, x, y):
                 return x + y
 
@@ -1713,7 +1820,7 @@ def forward(self, x, y):
     def test_qconfig_module_name_regex(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(1, 1, 1)
                 self.conv2 = nn.Conv2d(1, 1, 1)
 
@@ -1742,7 +1849,7 @@ def test_qconfig_precedence(self):
         for device in get_supported_device_types():
             class M(torch.nn.Module):
                 def __init__(self):
-                    super(M, self).__init__()
+                    super().__init__()
                     self.linear = nn.Linear(1, 1)
                     self.conv = nn.Conv2d(1, 1, 1)
                     self.module_conv1 = nn.Conv2d(1, 1, 1)
@@ -1912,10 +2019,11 @@ def forward(self, x):
         self.checkGraphModuleNodes(m, expected_node_list=node_list)
 
 
+    @override_qengines
     def test_qconfig_dict_with_fused_modules(self):
         class LinearReLUModel(torch.nn.Module):
             def __init__(self, relu):
-                super(LinearReLUModel, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(3, 3)
                 self.relu = relu
 
@@ -1926,7 +2034,7 @@ def forward(self, x):
 
         class ConvReLUModel(torch.nn.Module):
             def __init__(self, relu):
-                super(ConvReLUModel, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv1d(3, 3, 3)
                 self.relu = relu
 
@@ -1937,7 +2045,7 @@ def forward(self, x):
 
         class ConvBnReLUModel(torch.nn.Module):
             def __init__(self, relu):
-                super(ConvBnReLUModel, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv1d(3, 3, 3)
                 self.bn = torch.nn.BatchNorm1d(3)
                 self.relu = relu
@@ -1951,7 +2059,8 @@ def forward(self, x):
         for model in [LinearReLUModel, ConvReLUModel, ConvBnReLUModel]:
             for relu in [torch.nn.ReLU(), torch.nn.functional.relu, torch.relu]:
                 m = model(relu).eval()
-                qconfig_dict = torch.ao.quantization.get_default_qconfig_mapping("fbgemm")
+                qengine = torch.backends.quantized.engine
+                qconfig_dict = torch.ao.quantization.get_default_qconfig_mapping(qengine)
                 # should not crash as in https://github.com/pytorch/pytorch/issues/75825
                 prepare_fx(m, qconfig_dict, example_inputs=(torch.randn(1, 3, 3, 3),))
 
@@ -2650,11 +2759,11 @@ def test_save_observer_state_dict(self):
 
         # run it through input
         model(x)
+        # save state_dict of model
+        obs_dict = torch.ao.quantization.get_observer_state_dict(model)
 
         quant = convert_fx(model)
 
-        # save state_dict of model
-        obs_dict = torch.ao.quantization.get_observer_state_dict(model)
         b = io.BytesIO()
         torch.save(obs_dict, b)
         b.seek(0)
@@ -3008,18 +3117,12 @@ def forward(self, x0):
     @skipIfNoFBGEMM
     def test_non_traceable_module(self):
         class NonTraceable(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 for k in x.keys():
                     print(x[k])
                 return x
 
         class NonTraceable2(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 # data dependent control flow is not traceable
                 for i in x:
@@ -3301,7 +3404,6 @@ def forward(self, x):
         # Expect each quantized linear op to have a scale and zero point
         self.assertTrue(scale_count == 3, "Expect each quantized linear op to have a scale in state_dict")
         self.assertTrue(zero_point_count == 3, "Expect each quantized linear op to have a zero_point in state_dict")
-        # ensure it runs
         m(*example_inputs)
         # ensure it is scriptable
         scripted = torch.jit.script(m)
@@ -3398,9 +3500,6 @@ def test_getattr_with_nontensor_result(self):
         pattern.
         """
         class M1(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 dims = x.ndim
                 dims_sub = dims - 1
@@ -3409,9 +3508,6 @@ def forward(self, x):
                 return x
 
         class M2(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 dims = x.ndim
                 dims_sub = dims - 2
@@ -3460,7 +3556,7 @@ def _check_not_observed(self, model, node_info_to_non_tensor_args):
         # this is a helper function (for easier recursion) that checks whether
         # arg_node is observed
         def _check_node_not_observed(model, arg_node, node):
-            if isinstance(arg_node, tuple) or isinstance(arg_node, list):
+            if isinstance(arg_node, (tuple, list)):
                 for new_node in arg_node:
                     _check_node_not_observed(model, new_node, node)
             elif arg_node.op == "call_module":
@@ -3643,9 +3739,6 @@ def func(x, y, z):
 
     def test_propagate_dtypes_for_known_nodes_dict_tuple_args(self):
         class reshape_module(nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y, z):
                 return x.reshape(y["shape"])
 
@@ -3889,9 +3982,6 @@ def test_not_used(self):
         """ Test quantizing a not used value"""
 
         class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 x = x + x
                 x.sigmoid_()
@@ -4110,13 +4200,19 @@ def forward(self, x):
             {"": default_qconfig},
             example_inputs=(torch.randn(1),),
             prepare_custom_config={"preserved_attributes": ["attr"]})
+        # preserved attributes are also stored in meta so that it doesn't get lost
+        # during deepcopy
         self.assertTrue(hasattr(m, "attr"))
+        self.assertTrue("attr" in m.meta[_USER_PRESERVED_ATTRIBUTES_KEY])
         m2 = copy.deepcopy(m)
         self.assertTrue(hasattr(m2, "attr"))
+        self.assertTrue("attr" in m2.meta[_USER_PRESERVED_ATTRIBUTES_KEY])
         m = convert_fx(m, convert_custom_config={"preserved_attributes": ["attr"]})
         self.assertTrue(hasattr(m, "attr"))
+        self.assertTrue("attr" in m.meta[_USER_PRESERVED_ATTRIBUTES_KEY])
         m2 = copy.deepcopy(m)
         self.assertTrue(hasattr(m2, "attr"))
+        self.assertTrue("attr" in m2.meta[_USER_PRESERVED_ATTRIBUTES_KEY])
 
     def test_output_lists_and_dicts(self):
         """Verify that specifying complicated output types does not crash.
@@ -4481,7 +4577,7 @@ def from_float(cls, other):
         prepare_custom_config = PrepareCustomConfig() \
             .set_float_to_observed_mapping(torch.nn.LSTM, UserLSTM)
         convert_custom_config = ConvertCustomConfig() \
-            .set_observed_to_quantized_mapping(UserLSTM, torch.ao.nn.quantized.LSTM)
+            .set_observed_to_quantized_mapping(torch.ao.nn.quantizable.LSTM, torch.ao.nn.quantized.LSTM)
         model = MyModel()
         model = prepare_fx(model, qconfig_mapping, example_inputs, prepare_custom_config=prepare_custom_config)
 
@@ -4503,10 +4599,11 @@ def validate_qparams(inner_module: torch.nn.Module, scale: float, zero_point: in
         validate_qparams(cell.fgate_cx_igate_cgate, 2 ** -11, 0, torch.qint32)
         validate_qparams(cell.ogate_cy, 2 ** -7, 2 ** 7, torch.quint8)
 
-        # Make sure the rest of the flow runs
+        # Ensure the final converted model is quantized
         model(*example_inputs)
         model = convert_fx(model, convert_custom_config=convert_custom_config, _remove_qconfig=False)
         model(*example_inputs)
+        self.assertEqual(type(model.my_lstm), torch.ao.nn.quantized.LSTM)
 
     def test_reroute_tuple_getitem_patterns(self):
         """
@@ -5054,9 +5151,6 @@ def forward(self, x):
                 return x
 
         class M2(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 x = x.reshape()
                 return x
@@ -5160,7 +5254,7 @@ def forward(self, x):
 
         mod = M()
 
-        qconfig_dict = {"": torch.quantization.get_default_qat_qconfig()}
+        qconfig_dict = {"": torch.ao.quantization.get_default_qat_qconfig()}
         prepare_custom_config_dict = {
             "non_traceable_module_class": [UnTraceableModuleClass],
             "non_traceable_module_name": ["untraceable_module_name"],
@@ -5194,7 +5288,7 @@ def forward(self, x):
     def test_qconfig_dict_setup(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.Conv1d = torch.nn.Conv1d(1, 1, 1)
                 self.Conv2d = torch.nn.Conv2d(1, 1, 1)
                 self.Conv3d = torch.nn.Conv3d(1, 1, 1)
@@ -5300,7 +5394,7 @@ def test_backend_config_quantization_range(self):
         """
         class MyModel(torch.nn.Module):
             def __init__(self):
-                super(MyModel, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(30, 4).float()
 
             def forward(self, x):
@@ -5363,7 +5457,7 @@ def test_backend_config_scale_min(self):
         """
         class MyModel(torch.nn.Module):
             def __init__(self):
-                super(MyModel, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(30, 4).float()
 
             def forward(self, x):
@@ -5423,7 +5517,7 @@ def test_qnnpack_backend_config(self):
         """
         class MyModel(torch.nn.Module):
             def __init__(self):
-                super(MyModel, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(30, 4).float()
 
             def forward(self, x):
@@ -5454,7 +5548,7 @@ def test_symmetric_qnnpack_qconfig_mapping(self):
 
         class MyModel(torch.nn.Module):
             def __init__(self):
-                super(MyModel, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(30, 4).float()
 
             def forward(self, x):
@@ -5796,6 +5890,16 @@ def test_linear_tanh_lowering(self):
         """
         from torch.ao.quantization.backend_config import get_onednn_backend_config
         qconfig_mapping = get_default_qconfig_mapping('onednn')
+        # TODO Currently it's required that separate ops in a fused op/module have the same qconfig.
+        #      Need to be able to support fusion of ops with different qconfigs
+        # Since tanh must have 'fixed_qparams_qconfig' while linear should use
+        # the global qconfig, we need to set qconfigs for them manually here for
+        # fusion and cannot put such configs in onednn's default qconfig_mapping.
+        # Known issue:
+        # Cannot fuse linear - tanh and quantize standalone tanh at the same time.
+        qconfig = get_default_qconfig('onednn')
+        qconfig_mapping.set_object_type(torch.nn.Linear, qconfig)
+        qconfig_mapping.set_object_type(torch.nn.Tanh, qconfig)
         with override_quantized_engine('onednn'):
             m = LinearTanhModel()
             self._test_linear_activation_fusion_lowering_helper(
@@ -5807,6 +5911,70 @@ def test_linear_tanh_lowering(self):
                 nn.Linear,
                 nn.Tanh)
 
+    @override_qengines
+    def test_linear_size_view(self):
+        class M(torch.nn.Module):
+            def __init__(self, use_relu=False):
+                super().__init__()
+                self.linear = torch.nn.Linear(16, 32)
+                self.relu = torch.nn.ReLU()
+                self.use_relu = use_relu
+
+            def forward(self, x):
+                x = self.linear(x)
+                if self.use_relu:
+                    x = self.relu(x)
+                return x.view(x.size(0), 1, 4, 8)
+
+        for use_relu in [False, True]:
+            model_fp32 = M(use_relu).eval()
+            qengine = torch.backends.quantized.engine
+            qconfig_mapping = get_default_qconfig_mapping(qengine)
+            x = torch.randn((5, 16))
+            model_fp32(x)
+            prepared_model = prepare_fx(model_fp32, qconfig_mapping, x)
+            prepared_model(x)
+            quantized_model = convert_fx(prepared_model)
+            node_occurrence = {
+                ns.call_module(nnq.Linear): 0 if use_relu else 1,
+                ns.call_module(nniq.LinearReLU): 1 if use_relu else 0,
+                ns.call_function(torch.quantize_per_tensor): 1,
+                ns.call_method("dequantize"): 1
+            }
+            self.checkGraphModuleNodes(quantized_model, expected_node_occurrence=node_occurrence)
+
+    @override_qengines
+    def test_linear_shape_view(self):
+        class M(torch.nn.Module):
+            def __init__(self, use_relu=False):
+                super().__init__()
+                self.linear = torch.nn.Linear(16, 32)
+                self.relu = torch.nn.ReLU()
+                self.use_relu = use_relu
+
+            def forward(self, x):
+                x = self.linear(x)
+                if self.use_relu:
+                    x = self.relu(x)
+                return x.view(x.shape[0], 1, 4, 8)
+
+        for use_relu in [False, True]:
+            model_fp32 = M(use_relu).eval()
+            qengine = torch.backends.quantized.engine
+            qconfig_mapping = get_default_qconfig_mapping(qengine)
+            x = torch.randn((5, 16))
+            model_fp32(x)
+            prepared_model = prepare_fx(model_fp32, qconfig_mapping, x)
+            prepared_model(x)
+            quantized_model = convert_fx(prepared_model)
+            node_occurrence = {
+                ns.call_module(nnq.Linear): 0 if use_relu else 1,
+                ns.call_module(nniq.LinearReLU): 1 if use_relu else 0,
+                ns.call_function(torch.quantize_per_tensor): 1,
+                ns.call_method("dequantize"): 1
+            }
+            self.checkGraphModuleNodes(quantized_model, expected_node_occurrence=node_occurrence)
+
 @skipIfNoFBGEMM
 class TestQuantizeFxOps(QuantizationTestCase):
     def setUp(self):
@@ -5850,7 +6018,7 @@ def test_linear_module(self):
         with override_quantized_engine('fbgemm'):
             class LinearModel(torch.nn.Module):
                 def __init__(self):
-                    super(LinearModel, self).__init__()
+                    super().__init__()
                     self.linear = torch.nn.Linear(30, 4).float()
 
                 def forward(self, x):
@@ -5858,7 +6026,7 @@ def forward(self, x):
 
             class LinearReLUModel(torch.nn.Module):
                 def __init__(self, f_relu=False):
-                    super(LinearReLUModel, self).__init__()
+                    super().__init__()
                     self.linear = torch.nn.Linear(30, 4).float()
                     if f_relu:
                         self.relu = F.relu
@@ -5872,7 +6040,7 @@ def forward(self, x):
 
             class LinearBnModel(torch.nn.Module):
                 def __init__(self):
-                    super(LinearBnModel, self).__init__()
+                    super().__init__()
                     self.linear = torch.nn.Linear(4, 4).float()
                     self.bn = torch.nn.BatchNorm1d(4)
 
@@ -5912,7 +6080,7 @@ def test_functional_linear(self):
         with override_quantized_engine('fbgemm'):
             class FuncLinear(torch.nn.Module):
                 def __init__(self, use_bias, has_relu, f_relu):
-                    super(FuncLinear, self).__init__()
+                    super().__init__()
                     self.w = torch.randn(4, 30)
                     self.b = torch.randn(4)
                     self.use_bias = use_bias
@@ -6007,7 +6175,7 @@ def test_linear_dynamic_fp16(self):
         with override_quantized_engine('fbgemm'):
             class FuncLinear(torch.nn.Module):
                 def __init__(self, use_bias, has_relu, f_relu):
-                    super(FuncLinear, self).__init__()
+                    super().__init__()
                     self.w = torch.randn(4, 30)
                     self.b = torch.randn(4)
                     self.use_bias = use_bias
@@ -6062,7 +6230,7 @@ def forward(self, x):
     def test_linear_static_fp16(self):
         class FuncLinear(torch.nn.Module):
             def __init__(self, use_bias, has_relu, f_relu):
-                super(FuncLinear, self).__init__()
+                super().__init__()
                 self.w = torch.randn(4, 30)
                 self.b = torch.randn(4)
                 self.use_bias = use_bias
@@ -6128,7 +6296,7 @@ def test_conv_module(self):
 
         class ConvWrapper(torch.nn.Module):
             def __init__(self, dim):
-                super(ConvWrapper, self).__init__()
+                super().__init__()
                 self.conv = conv_module[dim](3, 3, 3).float()
 
             def forward(self, x):
@@ -6261,7 +6429,7 @@ def test_quantized_conv_relu(self):
 
         class ConvNdRelu(torch.nn.Module):
             def __init__(self, dim, inplace):
-                super(ConvNdRelu, self).__init__()
+                super().__init__()
                 self.conv = conv_module[dim](3, 3, 3).float()
                 self.relu = torch.nn.ReLU(inplace)
 
@@ -6270,7 +6438,7 @@ def forward(self, x):
 
         class ConvNdFunctionalRelu(torch.nn.Module):
             def __init__(self, dim):
-                super(ConvNdFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv = conv_module[dim](3, 3, 3).float()
 
             def forward(self, x):
@@ -6278,7 +6446,7 @@ def forward(self, x):
 
         class ConvNdInplaceFunctionalRelu(torch.nn.Module):
             def __init__(self, dim):
-                super(ConvNdInplaceFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv = conv_module[dim](3, 3, 3).float()
 
             def forward(self, x):
@@ -6450,9 +6618,6 @@ def forward(self, x):
     @unittest.skip("This is no longer needed right now, can enable later with new api")
     def test_bmm(self):
         class BMMMethod(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 return x.bmm(y)
 
@@ -6685,7 +6850,7 @@ def test_qbatch_norm(self):
 
         class M(torch.nn.Module):
             def __init__(self, dim):
-                super(M, self).__init__()
+                super().__init__()
                 self.bn = bn_module[dim](3).to(torch.float)
 
             def forward(self, x):
@@ -6714,7 +6879,7 @@ def test_qbatch_norm_relu(self):
 
         class BNRelu(torch.nn.Module):
             def __init__(self, dim, inplace):
-                super(BNRelu, self).__init__()
+                super().__init__()
                 self.bn = bn_module[dim](3).to(torch.float)
                 self.relu = torch.nn.ReLU(inplace=inplace)
 
@@ -6723,7 +6888,7 @@ def forward(self, x):
 
         class BNFuncRelu(torch.nn.Module):
             def __init__(self, dim):
-                super(BNFuncRelu, self).__init__()
+                super().__init__()
                 self.bn = bn_module[dim](3).to(torch.float)
 
             def forward(self, x):
@@ -6731,7 +6896,7 @@ def forward(self, x):
 
         class BNFuncInplaceRelu(torch.nn.Module):
             def __init__(self, dim):
-                super(BNFuncInplaceRelu, self).__init__()
+                super().__init__()
                 self.bn = bn_module[dim](3).to(torch.float)
 
             def forward(self, x):
@@ -6762,7 +6927,7 @@ def _test_activation_impl(
         '''
         class M(torch.nn.Module):
             def __init__(self, is_module, inplace):
-                super(M, self).__init__()
+                super().__init__()
                 self.is_module = is_module
                 self.inplace = inplace
                 if self.is_module:
@@ -6807,7 +6972,7 @@ def test_leaky_relu(self):
     def test_prelu(self):
         class M(torch.nn.Module):
             def __init__(self, num_param: int):
-                super(M, self).__init__()
+                super().__init__()
                 self.op = torch.nn.PReLU(num_parameters=num_param)
 
             def forward(self, input):
@@ -6818,7 +6983,7 @@ def forward(self, input):
         quantized_nodes = {
             # is_reference
             True: ns.call_module(torch.nn.PReLU),
-            False: ns.call_module(torch.nn.quantized.PReLU),
+            False: ns.call_module(torch.ao.nn.quantized.PReLU),
         }
 
         for num_parameter, quant_type, is_reference in options:
@@ -6834,7 +6999,7 @@ def _test_norm_impl(
         '''
         class M(torch.nn.Module):
             def __init__(self, is_module):
-                super(M, self).__init__()
+                super().__init__()
                 self.is_module = is_module
                 if self.is_module:
                     self.op = float_module(*op_args)
@@ -6869,7 +7034,7 @@ def _test_norm_float16_impl(
         '''
         class M(torch.nn.Module):
             def __init__(self, is_module):
-                super(M, self).__init__()
+                super().__init__()
                 self.is_module = is_module
                 if self.is_module:
                     self.op = float_module(*op_args)
@@ -7171,7 +7336,7 @@ def forward(self, x, y):
     def test_clamp(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
                 self.relu6 = torch.nn.ReLU6()
                 self.relu6_ = torch.nn.ReLU6(True)
@@ -7300,7 +7465,7 @@ def test_general_shape_ops(self):
         """
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.maxpool1d = torch.nn.MaxPool1d(kernel_size=3)
                 self.maxpool2d = torch.nn.MaxPool2d(kernel_size=3)
                 self.maxpool3d = torch.nn.MaxPool3d(kernel_size=3)
@@ -8148,6 +8313,41 @@ def forward(self, x, y):
         # verify no crash
         res = mq(*example_inputs)
 
+    def test_pixel_shuffle(self):
+        class MyBias(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.bias = nn.Parameter(torch.randn(8))
+
+        class MyModel(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = nn.Conv2d(8, 8, 1, bias=False)
+                self.bias = MyBias()
+
+            def forward(self, x):
+                x = self.conv(x)
+                x = nn.functional.pixel_shuffle(x, 2)
+                x = x.view(-1, 8, 2, 2)
+                bias = self.bias.bias
+                return x + bias
+
+        backend_config = get_qnnpack_backend_config()
+        qconfig_mapping = get_default_qconfig_mapping("qnnpack")
+        model = MyModel()
+        m = prepare_fx(
+            model,
+            qconfig_mapping=qconfig_mapping,
+            example_inputs=(torch.randn(1, 8, 3, 3),),
+            backend_config=backend_config
+        )
+        m = convert_fx(m)
+        expected_occurrence = {
+            ns.call_function(torch.quantize_per_tensor): 2,
+            ns.call_method("dequantize"): 1,
+        }
+        self.checkGraphModuleNodes(m, expected_node_occurrence=expected_occurrence)
+
 class TestQuantizeFxModels(QuantizationTestCase):
     @skipIfNoFBGEMM
     @unittest.skipIf(not TEST_CUDA, "gpu is not available.")
@@ -8155,7 +8355,7 @@ def test_static_gpu_convert_basic(self):
 
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.relu1 = nn.ReLU()
                 self.conv1 = nn.Conv2d(1, 6, 5)
                 self.linear1 = nn.Linear(120, 1)
@@ -8181,7 +8381,7 @@ def test_switch_device_prepare_convert(self):
 
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.relu1 = nn.ReLU()
                 self.conv1 = nn.Conv2d(1, 6, 5)
                 self.linear1 = nn.Linear(120, 1)
@@ -8208,7 +8408,7 @@ def forward(self, x):
     def test_prepare_serialize_switch_device_convert(self):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(1, 6, 5)
                 self.linear1 = nn.Linear(120, 1)
 
@@ -8508,7 +8708,7 @@ def test_qat_embeddingbag_linear(self):
         for device in get_supported_device_types():
             class EmbeddingBagLinear(torch.nn.Module):
                 def __init__(self):
-                    super(EmbeddingBagLinear, self).__init__()
+                    super().__init__()
                     self.emb = torch.nn.EmbeddingBag(num_embeddings=10, embedding_dim=12, mode='sum')
                     self.linear = torch.nn.Linear(12, 1).to(dtype=torch.float)
 
@@ -8519,8 +8719,9 @@ def forward(self, input: torch.Tensor, offsets: Optional[torch.Tensor] = None,
                     return x
 
             qengine = torch.backends.quantized.engine
-            qconfig_dict = {"": get_default_qat_qconfig(qengine),
-                            "object_type": [(torch.nn.EmbeddingBag, default_embedding_qat_qconfig)]}
+            qconfig_dict = QConfigMapping() \
+                .set_global(get_default_qat_qconfig(qengine)) \
+                .set_object_type(torch.nn.EmbeddingBag, default_embedding_qat_qconfig)
 
             train_indices = [[torch.randint(0, 10, (12, 12)), torch.randn((12, 1))] for _ in range(2)]
             eval_output = [[torch.randint(0, 10, (12, 1))]]
@@ -8548,7 +8749,7 @@ def test_qat_embedding_linear(self):
         for device in get_supported_device_types():
             class EmbeddingLinear(torch.nn.Module):
                 def __init__(self):
-                    super(EmbeddingLinear, self).__init__()
+                    super().__init__()
                     self.emb = torch.nn.Embedding(num_embeddings=10, embedding_dim=12)
                     self.linear = torch.nn.Linear(12, 1).to(dtype=torch.float)
 
diff --git a/test/quantization/fx/test_quantize_pt2e.py b/test/quantization/fx/test_quantize_pt2e.py
index a5e347e0bf18..4a88627b727b 100644
--- a/test/quantization/fx/test_quantize_pt2e.py
+++ b/test/quantization/fx/test_quantize_pt2e.py
@@ -2,6 +2,7 @@
 import torch
 import torch.nn as nn
 import torch._dynamo as torchdynamo
+from torch.testing._internal.common_utils import xfailIfPython311
 from torch.testing._internal.common_quantization import (
     QuantizationTestCase,
     skip_if_no_torchvision,
@@ -28,10 +29,11 @@
 
 @skipIfNoQNNPACK
 class TestQuantizePT2E(QuantizationTestCase):
+    @xfailIfPython311
     def test_qconfig_none(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(1, 1, 1)
                 self.conv2 = nn.Conv2d(1, 1, 1)
 
@@ -76,6 +78,7 @@ def forward(self, x):
             self.checkGraphModuleNodes(
                 m, expected_node_list=node_list, expected_node_occurrence=node_occurrence)
 
+    @xfailIfPython311
     def test_qconfig_module_type(self):
         class M(torch.nn.Module):
             def __init__(self):
@@ -126,6 +129,7 @@ def forward(self, x):
 class TestQuantizePT2EModels(QuantizationTestCase):
     @skip_if_no_torchvision
     @skipIfNoQNNPACK
+    @xfailIfPython311
     def test_resnet18(self):
         import torchvision
         with override_quantized_engine("qnnpack"):
diff --git a/test/quantization/jit/test_deprecated_jit_quant.py b/test/quantization/jit/test_deprecated_jit_quant.py
index 97e361d66440..806cff230fe4 100644
--- a/test/quantization/jit/test_deprecated_jit_quant.py
+++ b/test/quantization/jit/test_deprecated_jit_quant.py
@@ -73,7 +73,7 @@ def test_rnn_cell_quantized(self):
             if isinstance(cell, torch.jit.quantized.QuantizedLSTMCell):
                 class ScriptWrapper(torch.jit.ScriptModule):
                     def __init__(self, cell):
-                        super(ScriptWrapper, self).__init__()
+                        super().__init__()
                         self.cell = cell
 
                     @torch.jit.script_method
@@ -85,7 +85,7 @@ def forward(self, x: torch.Tensor,
 
                 class ScriptWrapper(torch.jit.ScriptModule):
                     def __init__(self, cell):
-                        super(ScriptWrapper, self).__init__()
+                        super().__init__()
                         self.cell = cell
 
                     @torch.jit.script_method
@@ -197,7 +197,7 @@ def compare_quantized_unquantized(ScriptWrapper, cell):
             if isinstance(cell, torch.jit.quantized.QuantizedGRU):
                 class ScriptWrapper(torch.jit.ScriptModule):
                     def __init__(self, cell):
-                        super(ScriptWrapper, self).__init__()
+                        super().__init__()
                         self.cell = cell
 
                     @torch.jit.script_method
@@ -209,7 +209,7 @@ def forward(self, x: torch.Tensor, hiddens: torch.Tensor) -> Tuple[torch.Tensor,
                 for cell in [cell_int8, cell_fp16]:
                     class ScriptWrapper(torch.jit.ScriptModule):
                         def __init__(self, cell):
-                            super(ScriptWrapper, self).__init__()
+                            super().__init__()
                             self.cell = cell
 
                         @torch.jit.script_method
@@ -227,7 +227,7 @@ def test_quantization_modules(self):
 
             class FooBar(torch.nn.Module):
                 def __init__(self):
-                    super(FooBar, self).__init__()
+                    super().__init__()
                     self.linear1 = torch.nn.Linear(K1, N1).float()
 
                 def forward(self, x):
@@ -261,7 +261,7 @@ def forward(self, x):
     def test_erase_class_tensor_shapes(self):
         class Linear(torch.nn.Module):
             def __init__(self, in_features, out_features):
-                super(Linear, self).__init__()
+                super().__init__()
                 qweight = torch._empty_affine_quantized(
                     [out_features, in_features], scale=1, zero_point=0,
                     dtype=torch.qint8)
diff --git a/test/quantization/jit/test_fusion_passes.py b/test/quantization/jit/test_fusion_passes.py
index 1f796939429a..d35b341f05ad 100644
--- a/test/quantization/jit/test_fusion_passes.py
+++ b/test/quantization/jit/test_fusion_passes.py
@@ -9,9 +9,6 @@
 class TestFusionPasses(QuantizationTestCase):
     def test_quantized_add_relu_fusion(self):
         class MAdd(torch.nn.Module):
-            def __init__(self):
-                super(MAdd, self).__init__()
-
             def forward(self, x, y):
                 a = torch.ops.quantized.add(x, y, 1., 0)
                 relu_out = torch.relu(a)
@@ -44,9 +41,6 @@ def forward(self, x, y):
         self.assertEqual(ref_output, output)
 
         class MAddOut(torch.nn.Module):
-            def __init__(self):
-                super(MAddOut, self).__init__()
-
             def forward(self, x, y, z):
                 a = torch.ops.quantized.add_out(x, y, z)
                 relu_out = torch.relu(a)
@@ -74,9 +68,6 @@ def forward(self, x, y, z):
         self.assertEqual(ref_output, output)
 
         class MAddScalar(torch.nn.Module):
-            def __init__(self):
-                super(MAddScalar, self).__init__()
-
             def forward(self, x, y : float):
                 a = torch.ops.quantized.add_scalar(x, y)
                 relu_out = torch.relu(a)
@@ -96,9 +87,6 @@ def forward(self, x, y : float):
         self.assertEqual(ref_output, output)
 
         class MAddScalarOut(torch.nn.Module):
-            def __init__(self):
-                super(MAddScalarOut, self).__init__()
-
             def forward(self, x, y : float, z):
                 a = torch.ops.quantized.add_scalar_out(x, y, z)
                 relu_out = torch.relu(a)
diff --git a/test/quantization/jit/test_ondevice_quantization.py b/test/quantization/jit/test_ondevice_quantization.py
index fa3cfaab24b0..b3bd4b945030 100644
--- a/test/quantization/jit/test_ondevice_quantization.py
+++ b/test/quantization/jit/test_ondevice_quantization.py
@@ -33,7 +33,7 @@
 
 class myMod(torch.nn.Module):
     def __init__(self, weight):
-        super(myMod, self).__init__()
+        super().__init__()
         self.fc1 = torch.nn.Linear(5, 5).float()
         self.fc1.weight = weight
         self.fc2 = torch.nn.Linear(5, 5).float()
@@ -44,7 +44,7 @@ def forward(self, x):
 
 class MyConvLinearModule(torch.nn.Module):
     def __init__(self):
-        super(MyConvLinearModule, self).__init__()
+        super().__init__()
         self.conv = torch.nn.Conv2d(3, 5, 3)
         weight = torch.nn.Parameter(torch.ones(5, 5))
         self.weight1 = torch.nn.Parameter(torch.ones(5, 5))
@@ -60,7 +60,7 @@ def get_example_inputs(self):
         return (torch.rand(1, 3, 12, 7),)
 
 
-class OnDevicePTQUtils(object):
+class OnDevicePTQUtils:
     observer_module_name = ['MinMaxObserver', 'PerChannelMinMaxObserver']
 
     @staticmethod
diff --git a/test/quantization/jit/test_quantize_jit.py b/test/quantization/jit/test_quantize_jit.py
index 7726dc04c711..2787626d9967 100644
--- a/test/quantization/jit/test_quantize_jit.py
+++ b/test/quantization/jit/test_quantize_jit.py
@@ -89,7 +89,7 @@ class TestQuantizeJitPasses(QuantizationTestCase):
     def test_skip_dequant_constant_prop(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 5, 3).float()
 
             def forward(self, x):
@@ -133,7 +133,7 @@ def test_foldbn_trivial(self):
         # Test trivial case
         class TestModule(torch.nn.Module):
             def __init__(self, dim):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.conv = conv_module[dim](1, 20, 5, 1)
                 self.bn = bn_module[dim](num_features=20)
                 self.bn.eps = 0.0023
@@ -176,7 +176,7 @@ def test_foldbn_trivial_nobias(self):
         # Test trivial case
         class TestModule(torch.nn.Module):
             def __init__(self, dim):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.conv = conv_module[dim](1, 20, 5, 1, bias=False)
                 self.bn = bn_module[dim](num_features=20)
                 # to make sure new bias is not zero
@@ -220,7 +220,7 @@ def test_foldbn_in_submodule(self):
         # Test that we find Conv-BN patterns in submodules
         class SubModule(torch.nn.Module):
             def __init__(self, dim):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.conv = conv_module[dim](1, 20, 5, 1)
                 self.bn = bn_module[dim](num_features=20)
 
@@ -231,7 +231,7 @@ def forward(self, x):
 
         class TestModule(torch.nn.Module):
             def __init__(self, dim):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.sub = SubModule(dim)
 
             def forward(self, x):
@@ -262,7 +262,7 @@ def test_foldbn_shared_classtype(self):
 
         class TestModule(torch.nn.Module):
             def __init__(self, dim, bias=False):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.conv1 = conv_module[dim](5, 5, 3, bias=bias)
                 self.bn1 = bn_module[dim](num_features=5)
                 self.bn1.running_mean.fill_(-0.2)
@@ -296,22 +296,16 @@ def test_foldbn_no_fusion(self):
         """Test that we don't fuse the cases when module type does not match"""
 
         class CustomConv(torch.nn.Module):
-            def __init__(self):
-                super(CustomConv, self).__init__()
-
             def forward(self, x):
                 return x
 
         class CustomBn(torch.nn.Module):
-            def __init__(self):
-                super(CustomBn, self).__init__()
-
             def forward(self, x):
                 return x
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = CustomConv()
                 self.bn = CustomBn()
 
@@ -333,7 +327,7 @@ def test_foldbn_complex_cases(self):
 
         class SubModule(torch.nn.Module):
             def __init__(self, dim, num_blocks, enable_bias, enable_affine):
-                super(SubModule, self).__init__()
+                super().__init__()
                 layers = []
                 for i in range(num_blocks):
                     layers.append(conv_module[dim](20, 20, 5, 1, bias=enable_bias))
@@ -353,7 +347,7 @@ def forward(self, x):
 
         class TestModule(torch.nn.Module):
             def __init__(self, dim, num_blocks, enable_bias, enable_affine):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.sub = SubModule(dim, num_blocks, enable_bias, enable_affine)
 
             def forward(self, x):
@@ -386,7 +380,7 @@ def forward(self, x):
     def test_fuse_linear(self):
         class FunctionalLinear(torch.nn.Module):
             def __init__(self, weight, bias):
-                super(FunctionalLinear, self).__init__()
+                super().__init__()
                 self.weight = weight
                 self.bias = bias
 
@@ -430,7 +424,7 @@ def forward(self, x):
         # check matmuls are not fused
         class Matmul(torch.nn.Module):
             def __init__(self, weight):
-                super(Matmul, self).__init__()
+                super().__init__()
                 self.weight = weight
 
             def forward(self, x):
@@ -449,7 +443,7 @@ def forward(self, x):
     def test_insert_observers(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 5, 3)
 
             def forward(self, x):
@@ -471,7 +465,7 @@ def addOne(self, inp) -> torch.Tensor:
 
         class Sub(torch.nn.Module):
             def __init__(self):
-                super(Sub, self).__init__()
+                super().__init__()
                 self.fc = torch.nn.Linear(5, 5)
 
             def addOne(self, inp):
@@ -482,7 +476,7 @@ def forward(self, x):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 5, 3)
                 self.sub = Sub()
 
@@ -538,7 +532,7 @@ def forward(self, inp):
     def test_insert_observers_child_qconfig(self):
         class Sub(torch.nn.Module):
             def __init__(self):
-                super(Sub, self).__init__()
+                super().__init__()
                 self.fc = torch.nn.Linear(5, 5)
 
             def forward(self, x):
@@ -546,7 +540,7 @@ def forward(self, x):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 5, 3)
                 self.sub = Sub()
 
@@ -573,7 +567,7 @@ def forward(self, x):
     def test_insert_observers_skip_values(self):
         class ConvFunctionalReLU(torch.nn.Module):
             def __init__(self):
-                super(ConvFunctionalReLU, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 5, 3)
 
             def forward(self, x):
@@ -581,7 +575,7 @@ def forward(self, x):
 
         class ConvReLUModule(torch.nn.Module):
             def __init__(self):
-                super(ConvReLUModule, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 5, 3)
                 self.relu = torch.nn.ReLU()
 
@@ -590,7 +584,7 @@ def forward(self, x):
 
         class AddReLUModule(torch.nn.Module):
             def __init__(self):
-                super(AddReLUModule, self).__init__()
+                super().__init__()
                 self.relu = torch.nn.ReLU()
                 self.conv = torch.nn.Conv2d(3, 3, 3).float()
 
@@ -601,7 +595,7 @@ def forward(self, x):
 
         class AddFunctionalReLU(torch.nn.Module):
             def __init__(self):
-                super(AddFunctionalReLU, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 3).float()
 
             def forward(self, x):
@@ -651,7 +645,7 @@ def attrs_with_prefix(module, prefix):
     def test_insert_observers_weight_dtype(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 5, 3)
 
             def forward(self, x):
@@ -660,16 +654,16 @@ def forward(self, x):
         m = torch.jit.script(M())
         qconfig_dict = {"": default_qconfig}
         m = prepare_jit(m, qconfig_dict)
-        activation_dtypes = set(
+        activation_dtypes = {
             obs.getattr("dtype")
             for x, obs in m._modules._c.items()
             if x.startswith("_observer_")
-        )
-        weight_dtypes = set(
+        }
+        weight_dtypes = {
             obs.getattr("dtype")
             for x, obs in m.conv._modules._c.items()
             if x.startswith("_observer_")
-        )
+        }
         assert len(activation_dtypes) == 1, "Expected to have 1 activation dtype"
         assert len(weight_dtypes) == 1, "Expected to have 1 weight dtype"
         assert (
@@ -679,9 +673,6 @@ def forward(self, x):
 
     def test_insert_observers_for_reused_weight(self):
         class M(torch.nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, x, y, weight):
                 x = F.conv2d(x, weight)
                 y = F.conv2d(y, weight)
@@ -695,7 +686,7 @@ def forward(self, x, y, weight):
     def test_insert_observers_shared_class_type(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(3, 5, 3).float()
                 self.conv2 = torch.nn.Conv2d(3, 5, 3).float()
 
@@ -722,7 +713,7 @@ def test_insert_observers_for_general_ops(self):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 3).float()
 
             def forward(self, x):
@@ -754,7 +745,7 @@ def test_insert_observers_propagate_observed(self):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(3, 3, 3).float()
                 self.conv2 = torch.nn.Conv2d(3, 3, 3).float()
 
@@ -792,7 +783,7 @@ def test_insert_observers_propagate_observed_in_submodule(self):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(3, 3, 3).float()
                 self.conv2 = torch.nn.Conv2d(3, 3, 3).float()
                 self.avgpool = torch.nn.AdaptiveAvgPool2d((1, 1))
@@ -839,7 +830,7 @@ def channel_shuffle(x: torch.Tensor, groups: int) -> torch.Tensor:
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(3, 3, 1).float()
                 self.conv2 = torch.nn.Conv2d(3, 3, 1).float()
 
@@ -874,7 +865,7 @@ def forward(self, x):
     def test_insert_observers_for_if(self):
         class QuantProp(torch.nn.Module):
             def __init__(self, use_skip):
-                super(QuantProp, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 1).float()
                 self.use_skip = use_skip
 
@@ -888,7 +879,7 @@ def forward(self, x):
 
         class Res(torch.nn.Module):
             def __init__(self, use_skip):
-                super(Res, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 1).float()
                 self.use_skip = use_skip
 
@@ -900,7 +891,7 @@ def forward(self, x):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.quant_prop = QuantProp(True)
                 self.res = Res(False)
 
@@ -948,7 +939,7 @@ def forward(self, x):
     def test_insert_observers_for_nested_if(self):
         class Res(torch.nn.Module):
             def __init__(self, use_skip):
-                super(Res, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 1).float()
                 self.cond = use_skip
                 self.use_skip = use_skip
@@ -964,7 +955,7 @@ def forward(self, x):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.res1 = Res(True)
                 self.res2 = Res(False)
 
@@ -990,7 +981,7 @@ def test_insert_observers_for_if_consistent_observation(self):
 
         class M(torch.nn.Module):
             def __init__(self, cond):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 3).float()
                 self.cond = cond
 
@@ -1003,7 +994,7 @@ def forward(self, x):
 
         class M2(torch.nn.Module):
             def __init__(self, cond):
-                super(M2, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(3, 3, 3).float()
                 self.conv2 = torch.nn.Conv2d(3, 3, 3).float()
                 self.cond = cond
@@ -1041,7 +1032,7 @@ def forward(self, x):
     def test_insert_quant_dequant(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 5, 3).float()
 
             def forward(self, x):
@@ -1075,7 +1066,7 @@ def forward(self, x):
     def test_insert_quant_dequant_shared_class_type(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(3, 3, 3).float()
                 self.conv2 = torch.nn.Conv2d(3, 3, 3).float()
 
@@ -1141,7 +1132,7 @@ def forward(self, x):
     def test_dedup_module_uses(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.relu = torch.nn.ReLU()
 
             def forward(self, x):
@@ -1166,7 +1157,7 @@ def forward(self, x):
     def test_replicate_dequantize(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 1).float()
 
             def forward(self, x):
@@ -1188,7 +1179,7 @@ def forward(self, x):
     def test_replicate_dequantize_in_block(self):
         class M(torch.nn.Module):
             def __init__(self, cond):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 1).float()
 
                 self.cond = cond
@@ -1224,9 +1215,6 @@ def linear(input, weight, bias):
             return torch.nn.functional.linear(input, weight, bias)
 
         class M(torch.nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, x, weight, bias):
                 x = torch.dequantize(x)
                 weight = torch.dequantize(weight)
@@ -1259,7 +1247,7 @@ def test_replicate_quantize_for_if(self):
 
         class Res(torch.nn.Module):
             def __init__(self):
-                super(Res, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 1).float()
                 self.conv2 = torch.nn.Conv2d(3, 3, 1).float()
                 self.use_skip = True
@@ -1274,7 +1262,7 @@ def forward(self, x: torch.Tensor, cond: bool) -> torch.Tensor:
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.res1 = Res()
                 self.res2 = Res()
 
@@ -1293,7 +1281,7 @@ def forward(self, x):
     def test_finalize_for_linear(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.fc = torch.nn.Linear(5, 5).float()
 
             def forward(self, x):
@@ -1325,7 +1313,7 @@ def test_inplace_option(self):
     def test_finalize_debug(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 3).float()
                 self.avgpool = torch.nn.AvgPool2d(3)
 
@@ -1353,7 +1341,7 @@ def forward(self, x):
     def test_module_list(self):
         class SimpleLinearLayer(torch.nn.Module):
             def __init__(self):
-                super(SimpleLinearLayer, self).__init__()
+                super().__init__()
                 self.fc = torch.nn.Linear(5, 5).float()
 
             def forward(self, x):
@@ -1361,7 +1349,7 @@ def forward(self, x):
 
         class ComplexModel(torch.nn.Module):
             def __init__(self):
-                super(ComplexModel, self).__init__()
+                super().__init__()
                 self.layers = torch.nn.ModuleList(
                     [SimpleLinearLayer() for i in range(2)]
                 )
@@ -1387,7 +1375,7 @@ def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
     def test_conv_trace(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1d = torch.nn.Conv1d(3, 3, 3).float()
                 self.conv2d = torch.nn.Conv2d(3, 3, 3).float()
                 self.conv3d = torch.nn.Conv3d(3, 3, 3).float()
@@ -1419,7 +1407,7 @@ def forward(self, x, y, z):
     def test_convtranspose_trace(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.convtranspose1d = torch.nn.ConvTranspose1d(3, 3, 3).float()
                 self.convtranspose2d = torch.nn.ConvTranspose2d(3, 3, 3).float()
                 self.convtranspose3d = torch.nn.ConvTranspose3d(3, 3, 3).float()
@@ -1456,7 +1444,7 @@ def forward(self, x, y, z):
     def test_replicate_dequant_same_value(self):
         class Mul(torch.nn.Module):
             def __init__(self):
-                super(Mul, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 3).float()
 
             def forward(self, x):
@@ -1472,7 +1460,7 @@ def forward(self, x):
     def test_interface_with_fork(self):
         class SubModule(torch.nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.embedding1 = torch.nn.EmbeddingBag(
                     num_embeddings=10,
                     embedding_dim=12,
@@ -1486,7 +1474,7 @@ def forward(self, x, y):
 
         class OrigMod(torch.nn.Module):
             def __init__(self):
-                super(OrigMod, self).__init__()
+                super().__init__()
                 self.embedding1 = torch.nn.EmbeddingBag(
                     num_embeddings=10,
                     embedding_dim=12,
@@ -1507,7 +1495,7 @@ class TestModule(torch.nn.Module):
             proxy_mod: ModInterface
 
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.proxy_mod = OrigMod()
                 self.sub = SubModule()
 
@@ -1518,7 +1506,7 @@ def forward(self, x, y):
 
         class MainModule(torch.nn.Module):
             def __init__(self):
-                super(MainModule, self).__init__()
+                super().__init__()
                 self.test = TestModule()
 
             def forward(self, x, y):
@@ -1586,7 +1574,7 @@ def test_quantize_fork_wait(self):
 
         class MainModule(nn.Module):
             def __init__(self):
-                super(MainModule, self).__init__()
+                super().__init__()
                 self.fork_ops = ForkModule()
 
             def init_values(self, x):
@@ -1598,9 +1586,6 @@ def forward(self, x):
                 return val
 
         class TestModule(torch.nn.Module):
-            def __init__(self):
-                super(TestModule, self).__init__()
-
             def forward(self, x):
                 w = torch.ones(5, 5)
                 b = torch.zeros(5)
@@ -1608,7 +1593,7 @@ def forward(self, x):
 
         class ForkModule(nn.Module):
             def __init__(self):
-                super(ForkModule, self).__init__()
+                super().__init__()
                 self.test = TestModule()
 
             def forward(self, x):
@@ -1634,7 +1619,7 @@ class TestQuantizeJitOps(QuantizationTestCase):
     def test_linear(self):
         class ModuleLinear(torch.nn.Module):
             def __init__(self, has_relu=False, f_relu=False):
-                super(ModuleLinear, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(30, 4).float()
                 if has_relu:
                     if f_relu:
@@ -1649,7 +1634,7 @@ def forward(self, x):
 
         class FuncLinear(torch.nn.Module):
             def __init__(self, has_relu=False, f_relu=False):
-                super(FuncLinear, self).__init__()
+                super().__init__()
                 self.w = torch.randn(4, 30)
                 self.b = torch.randn(4)
                 if has_relu:
@@ -1696,7 +1681,7 @@ def test_quantized_conv(self):
 
         class Conv(torch.nn.Module):
             def __init__(self, dim):
-                super(Conv, self).__init__()
+                super().__init__()
                 self.conv = conv_module[dim](3, 3, 3).float()
 
             def forward(self, x):
@@ -1727,7 +1712,7 @@ def test_quantized_conv_relu(self):
 
         class ConvNdRelu(torch.nn.Module):
             def __init__(self, dim, inplace):
-                super(ConvNdRelu, self).__init__()
+                super().__init__()
                 self.conv = conv_module[dim](3, 3, 3).float()
                 self.relu = torch.nn.ReLU(inplace)
 
@@ -1736,7 +1721,7 @@ def forward(self, x):
 
         class ConvNdFunctionalRelu(torch.nn.Module):
             def __init__(self, dim):
-                super(ConvNdFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv = conv_module[dim](3, 3, 3).float()
 
             def forward(self, x):
@@ -1744,7 +1729,7 @@ def forward(self, x):
 
         class ConvNdInplaceFunctionalRelu(torch.nn.Module):
             def __init__(self, dim):
-                super(ConvNdInplaceFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv = conv_module[dim](3, 3, 3).float()
 
             def forward(self, x):
@@ -1782,7 +1767,7 @@ def test_quantized_add_alpha(self):
 
         class QuantizedAdd(torch.nn.Module):
             def __init__(self):
-                super(QuantizedAdd, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -1812,7 +1797,7 @@ def test_quantized_add_relu_alpha(self):
 
         class AddRelu(torch.nn.Module):
             def __init__(self, inplace):
-                super(AddRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
                 self.relu = torch.nn.ReLU(inplace)
@@ -1827,7 +1812,7 @@ def forward(self, x, y):
 
         class InplaceAddRelu(torch.nn.Module):
             def __init__(self, inplace):
-                super(InplaceAddRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
                 self.relu = torch.nn.ReLU(inplace)
@@ -1842,7 +1827,7 @@ def forward(self, x, y):
 
         class AddFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(AddFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -1856,7 +1841,7 @@ def forward(self, x, y):
 
         class InplaceAddFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(InplaceAddFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -1870,7 +1855,7 @@ def forward(self, x, y):
 
         class AddInplaceFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(AddInplaceFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -1884,7 +1869,7 @@ def forward(self, x, y):
 
         class InplaceAddInplaceFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(InplaceAddInplaceFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -1931,7 +1916,7 @@ def forward(self, x, y):
     def test_quantized_add(self):
         class QuantizedAdd(torch.nn.Module):
             def __init__(self):
-                super(QuantizedAdd, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -1942,7 +1927,7 @@ def forward(self, x, y):
 
         class QuantizedInplaceAdd(torch.nn.Module):
             def __init__(self):
-                super(QuantizedInplaceAdd, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -1953,16 +1938,10 @@ def forward(self, x, y):
                 return x
 
         class NonQuantizedAdd(torch.nn.Module):
-            def __init__(self):
-                super(NonQuantizedAdd, self).__init__()
-
             def forward(self, x, y):
                 return x + y
 
         class NonQuantizedInplaceAdd(torch.nn.Module):
-            def __init__(self):
-                super(NonQuantizedInplaceAdd, self).__init__()
-
             def forward(self, x, y):
                 x += y
                 return x
@@ -1994,7 +1973,7 @@ def forward(self, x, y):
     def test_quantized_add_scalar(self):
         class QuantizedAddScalar(torch.nn.Module):
             def __init__(self):
-                super(QuantizedAddScalar, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
 
             def forward(self, x):
@@ -2003,7 +1982,7 @@ def forward(self, x):
 
         class QuantizedInplaceAddScalar(torch.nn.Module):
             def __init__(self):
-                super(QuantizedInplaceAddScalar, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
 
             def forward(self, x):
@@ -2012,16 +1991,10 @@ def forward(self, x):
                 return x
 
         class NonQuantizedAddScalar(torch.nn.Module):
-            def __init__(self):
-                super(NonQuantizedAddScalar, self).__init__()
-
             def forward(self, x):
                 return x + 3
 
         class NonQuantizedInplaceAddScalar(torch.nn.Module):
-            def __init__(self):
-                super(NonQuantizedInplaceAddScalar, self).__init__()
-
             def forward(self, x):
                 x += 3
                 return x
@@ -2050,7 +2023,7 @@ def forward(self, x):
     def test_quantized_add_relu(self):
         class AddRelu(torch.nn.Module):
             def __init__(self, inplace):
-                super(AddRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
                 self.relu = torch.nn.ReLU(inplace)
@@ -2063,7 +2036,7 @@ def forward(self, x, y):
 
         class InplaceAddRelu(torch.nn.Module):
             def __init__(self, inplace):
-                super(InplaceAddRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
                 self.relu = torch.nn.ReLU(inplace)
@@ -2076,7 +2049,7 @@ def forward(self, x, y):
 
         class AddFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(AddFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -2088,7 +2061,7 @@ def forward(self, x, y):
 
         class InplaceAddFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(InplaceAddFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -2100,7 +2073,7 @@ def forward(self, x, y):
 
         class AddInplaceFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(AddInplaceFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -2112,7 +2085,7 @@ def forward(self, x, y):
 
         class InplaceAddInplaceFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(InplaceAddInplaceFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -2152,7 +2125,7 @@ def forward(self, x, y):
     def test_quantized_add_scalar_relu(self):
         class AddScalarRelu(torch.nn.Module):
             def __init__(self, inplace):
-                super(AddScalarRelu, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
                 self.relu = torch.nn.ReLU(inplace)
 
@@ -2162,7 +2135,7 @@ def forward(self, x):
 
         class InplaceAddScalarRelu(torch.nn.Module):
             def __init__(self, inplace):
-                super(InplaceAddScalarRelu, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
                 self.relu = torch.nn.ReLU(inplace)
 
@@ -2173,7 +2146,7 @@ def forward(self, x):
 
         class AddScalarFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(AddScalarFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
 
             def forward(self, x):
@@ -2182,7 +2155,7 @@ def forward(self, x):
 
         class InplaceAddScalarFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(InplaceAddScalarFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
 
             def forward(self, x):
@@ -2192,7 +2165,7 @@ def forward(self, x):
 
         class AddScalarInplaceFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(AddScalarInplaceFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
 
             def forward(self, x):
@@ -2201,7 +2174,7 @@ def forward(self, x):
 
         class InplaceAddScalarInplaceFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(InplaceAddScalarInplaceFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
 
             def forward(self, x):
@@ -2244,7 +2217,7 @@ def test_quantized_cat(self):
 
         class QuantizedCat(torch.nn.Module):
             def __init__(self):
-                super(QuantizedCat, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -2254,9 +2227,6 @@ def forward(self, x, y):
                 return torch.cat([x, y], 1)
 
         class NonQuantizedCat(torch.nn.Module):
-            def __init__(self):
-                super(NonQuantizedCat, self).__init__()
-
             def forward(self, x, y):
                 return torch.cat([x, y], 1)
 
@@ -2283,7 +2253,7 @@ def test_qbatch_norm(self):
 
         class M(torch.nn.Module):
             def __init__(self, dim):
-                super(M, self).__init__()
+                super().__init__()
                 self.bn = bn_module[dim](3).to(torch.float)
 
             def forward(self, x):
@@ -2303,7 +2273,7 @@ def test_qbatch_norm_relu_BNRelu(self):
 
         class BNRelu(torch.nn.Module):
             def __init__(self, dim, inplace):
-                super(BNRelu, self).__init__()
+                super().__init__()
                 self.bn = bn_module[dim](3).to(torch.float)
                 self.relu = torch.nn.ReLU(inplace=inplace)
 
@@ -2326,7 +2296,7 @@ def test_qbatch_norm_relu_BNFuncRelu(self):
 
         class BNFuncRelu(torch.nn.Module):
             def __init__(self, dim):
-                super(BNFuncRelu, self).__init__()
+                super().__init__()
                 self.bn = bn_module[dim](3).to(torch.float)
 
             def forward(self, x):
@@ -2348,7 +2318,7 @@ def test_qbatch_norm_relu_BNFuncInplaceRelu(self):
 
         class BNFuncInplaceRelu(torch.nn.Module):
             def __init__(self, dim):
-                super(BNFuncInplaceRelu, self).__init__()
+                super().__init__()
                 self.bn = bn_module[dim](3).to(torch.float)
 
             def forward(self, x):
@@ -2368,7 +2338,7 @@ def forward(self, x):
     def test_quantized_mul(self):
         class QuantizedMul(torch.nn.Module):
             def __init__(self):
-                super(QuantizedMul, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -2379,7 +2349,7 @@ def forward(self, x, y):
 
         class QuantizedInplaceMul(torch.nn.Module):
             def __init__(self):
-                super(QuantizedInplaceMul, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -2390,16 +2360,10 @@ def forward(self, x, y):
                 return x
 
         class NonQuantizedMul(torch.nn.Module):
-            def __init__(self):
-                super(NonQuantizedMul, self).__init__()
-
             def forward(self, x, y):
                 return x * y
 
         class NonQuantizedInplaceMul(torch.nn.Module):
-            def __init__(self):
-                super(NonQuantizedInplaceMul, self).__init__()
-
             def forward(self, x, y):
                 x *= y
                 return x
@@ -2431,7 +2395,7 @@ def forward(self, x, y):
     def test_quantized_mul_scalar(self):
         class QuantizedMulScalar(torch.nn.Module):
             def __init__(self):
-                super(QuantizedMulScalar, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
 
             def forward(self, x):
@@ -2440,7 +2404,7 @@ def forward(self, x):
 
         class QuantizedInplaceMulScalar(torch.nn.Module):
             def __init__(self):
-                super(QuantizedInplaceMulScalar, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
 
             def forward(self, x):
@@ -2449,16 +2413,10 @@ def forward(self, x):
                 return x
 
         class NonQuantizedMulScalar(torch.nn.Module):
-            def __init__(self):
-                super(NonQuantizedMulScalar, self).__init__()
-
             def forward(self, x):
                 return x * 3
 
         class NonQuantizedInplaceMulScalar(torch.nn.Module):
-            def __init__(self):
-                super(NonQuantizedInplaceMulScalar, self).__init__()
-
             def forward(self, x):
                 x *= 3
                 return x
@@ -2487,7 +2445,7 @@ def forward(self, x):
     def test_quantized_mul_relu(self):
         class MulRelu(torch.nn.Module):
             def __init__(self, inplace):
-                super(MulRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
                 self.relu = torch.nn.ReLU(inplace)
@@ -2500,7 +2458,7 @@ def forward(self, x, y):
 
         class InplaceMulRelu(torch.nn.Module):
             def __init__(self, inplace):
-                super(InplaceMulRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
                 self.relu = torch.nn.ReLU(inplace)
@@ -2513,7 +2471,7 @@ def forward(self, x, y):
 
         class MulFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(MulFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -2525,7 +2483,7 @@ def forward(self, x, y):
 
         class InplaceMulFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(InplaceMulFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -2537,7 +2495,7 @@ def forward(self, x, y):
 
         class MulInplaceFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(MulInplaceFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -2549,7 +2507,7 @@ def forward(self, x, y):
 
         class InplaceMulInplaceFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(InplaceMulInplaceFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(2, 2, 2).float()
                 self.conv2 = torch.nn.Conv2d(2, 2, 2).float()
 
@@ -2589,7 +2547,7 @@ def forward(self, x, y):
     def test_quantized_mul_scalar_relu(self):
         class MulScalarRelu(torch.nn.Module):
             def __init__(self, inplace):
-                super(MulScalarRelu, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
                 self.relu = torch.nn.ReLU(inplace)
 
@@ -2599,7 +2557,7 @@ def forward(self, x):
 
         class InplaceMulScalarRelu(torch.nn.Module):
             def __init__(self, inplace):
-                super(InplaceMulScalarRelu, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
                 self.relu = torch.nn.ReLU(inplace)
 
@@ -2610,7 +2568,7 @@ def forward(self, x):
 
         class MulScalarFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(MulScalarFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
 
             def forward(self, x):
@@ -2619,7 +2577,7 @@ def forward(self, x):
 
         class InplaceMulScalarFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(InplaceMulScalarFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
 
             def forward(self, x):
@@ -2629,7 +2587,7 @@ def forward(self, x):
 
         class MulScalarInplaceFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(MulScalarInplaceFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
 
             def forward(self, x):
@@ -2638,7 +2596,7 @@ def forward(self, x):
 
         class InplaceMulScalarInplaceFunctionalRelu(torch.nn.Module):
             def __init__(self):
-                super(InplaceMulScalarInplaceFunctionalRelu, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
 
             def forward(self, x):
@@ -2676,7 +2634,7 @@ def forward(self, x):
     def test_hardswish(self):
         class FunctionalHardswish(torch.nn.Module):
             def __init__(self, inplace):
-                super(FunctionalHardswish, self).__init__()
+                super().__init__()
                 self.inplace = inplace
 
             def forward(self, input):
@@ -2701,7 +2659,7 @@ def forward(self, input):
     def test_elu(self):
         class FunctionalELU(torch.nn.Module):
             def __init__(self, inplace=False):
-                super(FunctionalELU, self).__init__()
+                super().__init__()
                 self.inplace = inplace
 
             def forward(self, input):
@@ -2760,7 +2718,7 @@ def test_dequantize_tuple(self):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = torch.nn.Conv2d(3, 3, 3).float()
                 self.conv2 = torch.nn.Conv2d(3, 3, 3).float()
 
@@ -2776,7 +2734,7 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
     def test_clamp(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(2, 2, 2).float()
                 self.relu6 = torch.nn.ReLU6()
                 self.relu6_ = torch.nn.ReLU6(True)
@@ -2817,7 +2775,7 @@ def test_general_shape_ops(self):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.maxpool1d = torch.nn.MaxPool1d(kernel_size=3)
                 self.maxpool2d = torch.nn.MaxPool2d(kernel_size=3)
                 self.maxpool3d = torch.nn.MaxPool3d(kernel_size=3)
@@ -2933,7 +2891,7 @@ def test_general_value_ops(self):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 3, 3)
                 self.avg_pool1d = torch.nn.AvgPool1d(3)
                 self.avg_pool2d = torch.nn.AvgPool2d(3)
@@ -3058,7 +3016,7 @@ def test_conv_with_benchmark_flag(self):
     def test_cat_linear(self):
         class LinearModel(torch.nn.Module):
             def __init__(self):
-                super(LinearModel, self).__init__()
+                super().__init__()
                 self.weight = torch.randn(5, 5)
 
             def forward(self, x, y):
@@ -3082,7 +3040,7 @@ class TestQuantizeDynamicJitPasses(QuantizationTestCase):
     def test_prepare_dynamic(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.fc = torch.nn.Linear(5, 5)
 
             def forward(self, x):
@@ -3109,7 +3067,7 @@ def forward(self, x):
     def test_prepare_dynamic_child_qconfig(self):
         class Sub(torch.nn.Module):
             def __init__(self):
-                super(Sub, self).__init__()
+                super().__init__()
                 self.fc = torch.nn.Linear(5, 5)
 
             def forward(self, x):
@@ -3117,7 +3075,7 @@ def forward(self, x):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(3, 5, 3)
                 self.sub = Sub()
 
@@ -3147,7 +3105,7 @@ def forward(self, x):
     def test_insert_quant_dequant_linear_dynamic(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.fc1 = torch.nn.Linear(5, 5).float()
                 self.fc2 = torch.nn.Linear(5, 5).float()
 
@@ -3198,7 +3156,7 @@ def forward(self, x):
     def test_dynamic_multi_op(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.fc1 = torch.nn.Linear(5, 5).to(dtype=torch.float)
 
             def forward(self, x):
@@ -3217,7 +3175,7 @@ def forward(self, x):
     def test_dynamic_quant_multi_uses(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.fc = torch.nn.Linear(5, 5).float()
 
             def forward(self, x):
@@ -3245,7 +3203,7 @@ def forward(self, x):
 
         class DynamicModel(torch.nn.Module):
             def __init__(self):
-                super(DynamicModel, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.ones(5, 5))
                 self.mod1 = myMod(self.weight)
 
@@ -3278,7 +3236,7 @@ def forward(self, x):
     def test_dynamic_with_if(self):
         class Res(torch.nn.Module):
             def __init__(self):
-                super(Res, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.ones(5, 5))
 
             def forward(self, x: torch.Tensor, cond: bool) -> torch.Tensor:
@@ -3289,7 +3247,7 @@ def forward(self, x: torch.Tensor, cond: bool) -> torch.Tensor:
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.res1 = Res()
                 self.res2 = Res()
 
@@ -3334,7 +3292,7 @@ def forward(self, x):
     def test_dynamic_weight_observer(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.fc = torch.nn.Linear(5, 5).float()
                 self.fc2 = torch.nn.Linear(5, 5).float()
 
@@ -3366,7 +3324,7 @@ def forward(self, x):
     def test_convert_dynamic_fp16(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.fc = torch.nn.Linear(5, 5)
 
             def forward(self, x):
@@ -3381,7 +3339,7 @@ def forward(self, x):
     def test_quantize_dynamic_fp16(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.fc = torch.nn.Linear(5, 5)
 
             def forward(self, x):
@@ -3404,7 +3362,7 @@ class TestQuantizeDynamicJitOps(QuantizationTestCase):
     def test_linear(self):
         class FunctionalLinear(torch.nn.Module):
             def __init__(self, weight, bias):
-                super(FunctionalLinear, self).__init__()
+                super().__init__()
                 self.weight = weight
                 self.bias = bias
 
@@ -3437,7 +3395,7 @@ def forward(self, x):
     def test_embedding_bag(self):
         class M(torch.nn.Module):
             def __init__(self, weights):
-                super(M, self).__init__()
+                super().__init__()
                 self.embedding1 = torch.nn.EmbeddingBag(
                     num_embeddings=10,
                     embedding_dim=12,
@@ -3536,7 +3494,7 @@ def forward(self, indices1, offsets1, indices2, offsets2):
     def test_embedding_bag_padding_idx_error(self):
         class M(torch.nn.Module):
             def __init__(self, weights):
-                super(M, self).__init__()
+                super().__init__()
                 self.embedding = torch.nn.EmbeddingBag(
                     num_embeddings=10,
                     embedding_dim=12,
diff --git a/test/run_test.py b/test/run_test.py
index eaffb182724d..851660aefb82 100644
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -52,7 +52,7 @@
 
 # Note [ROCm parallel CI testing]
 # https://github.com/pytorch/pytorch/pull/85770 added file-granularity parallel testing.
-# In .jenkins/pytorch/test.sh, TEST_CONFIG == "default", CUDA and HIP_VISIBLE_DEVICES is set to 0.
+# In .ci/pytorch/test.sh, TEST_CONFIG == "default", CUDA and HIP_VISIBLE_DEVICES is set to 0.
 # This results in multiple test files sharing the same GPU.
 # This should be a supported use case for ROCm, but it exposed issues in the kernel driver resulting in hangs.
 # See https://github.com/pytorch/pytorch/issues/90940.
@@ -193,7 +193,7 @@ def skip_test_p(name: str) -> bool:
     "distributed/elastic/events/lib_test",
     "distributed/elastic/agent/server/test/api_test",
     "test_deploy",
-    "distributed/test_c10d_error_logger.py"
+    "distributed/test_c10d_error_logger"
 ]
 
 WINDOWS_BLOCKLIST = [
@@ -245,7 +245,6 @@ def skip_test_p(name: str) -> bool:
     "distributed/_shard/sharded_tensor/ops/test_softmax",
     "distributed/_shard/sharded_optim/test_sharded_optim",
     "distributed/_shard/test_partial_tensor",
-    "distributed/_shard/test_replicated_tensor",
 ] + FSDP_TEST
 
 ROCM_BLOCKLIST = [
@@ -272,7 +271,6 @@ def skip_test_p(name: str) -> bool:
     "distributed/_shard/sharded_tensor/ops/test_softmax",
     "distributed/_shard/sharded_optim/test_sharded_optim",
     "distributed/_shard/test_partial_tensor",
-    "distributed/_shard/test_replicated_tensor",
     "test_determination",
     "test_jit_legacy",
     "test_cuda_nvml_based_avail",
@@ -321,6 +319,7 @@ def skip_test_p(name: str) -> bool:
     'functorch/test_vmap',  # OOM
     'test_fx',  # gets SIGKILL
     'test_dataloader',  # frequently hangs for ROCm
+    'test_serialization',   # test_serialization_2gb_file allocates a tensor of 2GB, and could cause OOM
 ]
 
 # A subset of our TEST list that validates PyTorch's ops, modules, and autograd function as expected
@@ -431,18 +430,11 @@ def print_to_stderr(message):
     print(message, file=sys.stderr)
 
 
-def get_executable_command(options, allow_pytest, disable_coverage=False):
+def get_executable_command(options, disable_coverage=False):
     if options.coverage and not disable_coverage:
         executable = ["coverage", "run", "--parallel-mode", "--source=torch"]
     else:
         executable = [sys.executable, "-bb"]
-    if options.pytest:
-        if allow_pytest:
-            executable += ["-m", "pytest"]
-        else:
-            print_to_stderr(
-                "Pytest cannot be used for this test. Falling back to unittest."
-            )
     return executable
 
 
@@ -468,8 +460,9 @@ def run_test(
 
     # If using pytest, replace -f with equivalent -x
     if options.pytest:
+        unittest_args.extend(get_pytest_args(options))
         unittest_args = [arg if arg != "-f" else "-x" for arg in unittest_args]
-    elif IS_CI:
+    if IS_CI:
         ci_args = ["--import-slow-tests", "--import-disabled-tests"]
         if os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1":
             ci_args.append("--rerun-disabled-tests")
@@ -477,9 +470,7 @@ def run_test(
         unittest_args.extend(ci_args)
 
     # Extra arguments are not supported with pytest
-    executable = get_executable_command(
-        options, allow_pytest=not extra_unittest_args
-    )
+    executable = get_executable_command(options)
 
     # Can't call `python -m unittest test_*` here because it doesn't run code
     # in `if __name__ == '__main__': `. So call `python test_*.py` instead.
@@ -712,7 +703,7 @@ def run_doctests(test_module, test_directory, options):
     if enabled['qengine'] == 'auto':
         try:
             # Is there a better check if quantization is enabled?
-            import torch.nn.quantized as nnq  # NOQA
+            import torch.ao.nn.quantized as nnq  # NOQA
             torch.backends.quantized.engine = 'qnnpack'
             torch.backends.quantized.engine = 'fbgemm'
         except (ImportError, RuntimeError):
@@ -796,7 +787,7 @@ def print_log_file(test: str, file_path: str, failed: bool) -> None:
         print_to_stderr("")
 
 
-def run_test_ops(test_module, test_directory, options):
+def get_pytest_args(options):
     if os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1":
         # When under rerun-disabled-tests mode, run the same tests multiple times to determine their
         # flakiness status. Default to 50 re-runs
@@ -809,23 +800,16 @@ def run_test_ops(test_module, test_directory, options):
         # failure
         rerun_options = ["-x", "--reruns=2"]
 
-    default_unittest_args = [
+    pytest_args = [
         "--use-pytest",
         "-vv",
         "-rfEX"
     ]
-    default_unittest_args.extend(rerun_options)
+    pytest_args.extend(rerun_options)
+    return pytest_args
 
-    if 'slow-gradcheck' in os.getenv("BUILD_ENVIRONMENT", ""):
-        extra_unittest_args = default_unittest_args.copy()
-        # there are a lot of tests that take up a lot of space in slowgrad check, so don't bother parallelizing
-        # it's also on periodic so we don't care about TTS as much
-        return run_test(
-            test_module,
-            test_directory,
-            copy.deepcopy(options),
-            extra_unittest_args=extra_unittest_args,
-        )
+def run_test_ops(test_module, test_directory, options):
+    default_unittest_args = get_pytest_args(options)
 
     return_codes = []
     os.environ["NUM_PARALLEL_PROCS"] = str(NUM_PROCS)
@@ -908,7 +892,7 @@ def parse_test_module(test):
 
 class TestChoices(list):
     def __init__(self, *args, **kwargs):
-        super(TestChoices, self).__init__(args[0])
+        super().__init__(args[0])
 
     def __contains__(self, item):
         return list.__contains__(self, parse_test_module(item))
@@ -1073,6 +1057,19 @@ def parse_args():
             "Use 'all' to execute all doctests or specify a specific "
             "doctest to run")
     )
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--dynamo",
+        action="store_true",
+        help="Run tests with TorchDynamo+EagerBackend turned on",
+    )
+    group.add_argument(
+        "--inductor",
+        action="store_true",
+        help="Run tests with TorchInductor turned on",
+    )
+
     return parser.parse_args()
 
 
@@ -1208,17 +1205,6 @@ def get_selected_tests(options):
             WINDOWS_BLOCKLIST.append("jit")
             WINDOWS_BLOCKLIST.append("jit_fuser")
 
-        # This is exception that's caused by this issue https://github.com/pytorch/pytorch/issues/69460
-        # This below code should be removed once this issue is solved
-        if (
-            torch.version.cuda is not None and
-            LooseVersion(torch.version.cuda) >= "11.5" and
-            LooseVersion(torch.version.cuda) <= "11.6"
-        ):
-            WINDOWS_BLOCKLIST.append("test_cpp_extensions_aot")
-            WINDOWS_BLOCKLIST.append("test_cpp_extensions_aot_ninja")
-            WINDOWS_BLOCKLIST.append("test_cpp_extensions_aot_no_ninja")
-
         selected_tests = exclude_tests(WINDOWS_BLOCKLIST, selected_tests, "on Windows")
 
     elif TEST_WITH_ROCM:
@@ -1327,6 +1313,11 @@ def main():
         # downloading test cases configuration to local environment
         get_test_case_configs(dirpath=test_directory)
 
+    if options.dynamo:
+        os.environ["PYTORCH_TEST_WITH_DYNAMO"] = "1"
+    elif options.inductor:
+        os.environ["PYTORCH_TEST_WITH_INDUCTOR"] = "1"
+
     failure_messages = []
 
     # parallel = in parallel with other files
diff --git a/test/test_autocast.py b/test/test_autocast.py
index 1a8263a79f93..127d964f91dd 100644
--- a/test/test_autocast.py
+++ b/test/test_autocast.py
@@ -10,12 +10,12 @@
 
 class TestAutocastCPU(TestCase):
     def setUp(self):
-        super(TestAutocastCPU, self).setUp()
+        super().setUp()
         self.autocast_lists = AutocastCPUTestLists(torch.device('cpu'))
 
     def tearDown(self):
         del self.autocast_lists
-        super(TestAutocastCPU, self).tearDown()
+        super().tearDown()
 
     def _run_autocast_outofplace(self, op, args, run_as_type, out_type=None, module=torch, add_kwargs=None):
         # helper to cast args
diff --git a/test/test_autograd.py b/test/test_autograd.py
index b1084306a4bc..dda17d7bfafb 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -23,7 +23,7 @@
 import torch
 
 from torch import nn
-from torch._six import inf, nan
+from torch import inf, nan
 from torch.autograd.function import once_differentiable
 from torch.autograd.profiler import (profile, record_function, emit_nvtx, emit_itt)
 from torch.autograd.profiler_util import (_format_time, EventList, FunctionEvent, FunctionEventAvg)
@@ -3717,6 +3717,42 @@ def hook(t_):
         with self.assertRaisesRegex(RuntimeError, "expects the current backward to be executed with multithreading disabled"):
             t.backward()
 
+    def test_view_replay_enabled(self):
+        def f(x):
+            out = x.clone().view(-1)
+            # mutate the view, triggering autograd view-replay logic
+            out.add_(1)
+            return out
+
+        x = torch.ones(2, 2, requires_grad=True)
+        with torch.autograd._force_original_view_tracking(True):
+            out = f(x)
+
+        # view-replay was enabled, so we should see ViewBackward in the graph
+        # instead of AsStridedBackward.
+        self.assertTrue("ViewBackward" in str(out.grad_fn))
+
+        # Without view-replay we should as an AsStridedBackward
+        out = f(x)
+        self.assertTrue("AsStridedBackward" in str(out.grad_fn))
+
+    def test_unsafe_set_version_counter(self):
+        x = torch.ones(2, requires_grad=True).clone()
+        x.add_(1)
+        x.add_(2)
+        self.assertEqual(2, x._version)
+        with torch.autograd._unsafe_preserve_version_counter(x):
+            x.mul_(2)
+            x.mul_(3)
+        # version counter doesn't change inside of the context manager
+        self.assertEqual(2, x._version)
+
+        torch._C._autograd._unsafe_set_version_counter(x, 0)
+        self.assertEqual(0, x._version)
+        with self.assertRaisesRegex(RuntimeError, "Cannot set"):
+            torch._C._autograd._unsafe_set_version_counter(x, -1)
+
+
     def test_current_node(self):
         pr = []
 
@@ -4096,7 +4132,7 @@ def test_inplace_on_view_saved_output(self):
         # its output. Previously, this created a reference cycle.
         dealloc = [0]
 
-        class IncrementOnDelete(object):
+        class IncrementOnDelete:
             def __del__(self):
                 dealloc[0] += 1
 
@@ -4386,7 +4422,7 @@ def get_ref():
             #
             # We want to test that when grad goes out of scope at the end of this function that PyObject is destroyed
             # We can test this by seeing whether Foo is not kept alive once t is destroyed
-            class Foo(object):
+            class Foo:
                 pass
             my_obj = Foo()
             meta_dict = t.grad_fn.metadata
@@ -4443,7 +4479,7 @@ def backward(ctx, gO):
                     with detect_anomaly():
                         ginp.backward()
 
-            class Foo(object):
+            class Foo:
                 pass
             my_obj = Foo()
             meta_dict = out.grad_fn.metadata
@@ -4482,14 +4518,6 @@ def run_fn(a):
 
                 out.backward()
 
-    # TODO: update these tests to use the linalg module and move to test_linalg.py
-    @skipIfNoLapack
-    def test_symeig_no_eigenvectors(self):
-        A = torch.tensor([[1., 2.], [2., 4.]], dtype=torch.float32, requires_grad=True)
-        w, v = torch.symeig(A, eigenvectors=False)
-        with self.assertRaisesRegex(RuntimeError, 'is not differentiable'):
-            torch.autograd.backward([w, v], [torch.ones_like(w), torch.ones_like(v)])
-
     def test_no_grad_copy(self):
         # create autograd function that saves grad pointer as class static
         class MyFunc(Function):
@@ -4560,7 +4588,7 @@ def backward(ctx, grad):
                 i = torch.ones(1, 1, dtype=torch.long)
                 nv = v.expand(8, 3)
                 ni = i.expand(1, 8)
-                ngrad = torch.sparse.FloatTensor(ni, nv, torch.Size([10, 3]))
+                ngrad = torch.sparse_coo_tensor(ni, nv, (10, 3), dtype=torch.float32)
                 NonContGradFunc.static_grad_ptr = ngrad._values().data_ptr()
                 return ngrad, ngrad
 
@@ -4623,7 +4651,7 @@ def fn(sparse):
                       check_batched_grad=False, fast_mode=fast_mode)
             with self.assertRaisesRegex(RuntimeError, 'gradcheck expects all tensor inputs are dense'):
                 gradcheck(fn, torch.rand(10, dtype=torch.double).to_sparse().requires_grad_(True), check_sparse_nnz=False,
-                          check_batched_grad=False, fast_mode=fast_mode)
+                          check_batched_grad=False, fast_mode=fast_mode, masked=True)
         check(fast_mode=True)
         check(fast_mode=False)
 
@@ -4637,8 +4665,8 @@ def fn(sparse_csr):
 
             with self.assertRaisesRegex(RuntimeError, 'gradcheck expects all tensor inputs are dense'):
                 gradcheck(fn, torch.rand(2, 2, dtype=torch.double).to_sparse_csr().requires_grad_(True), check_sparse_nnz=False,
-                          check_batched_grad=False, fast_mode=fast_mode)
-        # check(fast_mode=True) # RuntimeError: sparse_mask_sparse_csr expects self to be 2D
+                          check_batched_grad=False, fast_mode=fast_mode, masked=True)
+        check(fast_mode=True)
         check(fast_mode=False)
 
     def test_gradcheck_sparse_csc_input(self):
@@ -4651,8 +4679,8 @@ def fn(sparse_csc):
 
             with self.assertRaisesRegex(RuntimeError, 'gradcheck expects all tensor inputs are dense'):
                 gradcheck(fn, torch.rand(2, 2, dtype=torch.double).to_sparse_csc().requires_grad_(True), check_sparse_nnz=False,
-                          check_batched_grad=False, fast_mode=fast_mode)
-        # check(fast_mode=True) # RuntimeError: Expected result Tensor to be of format CSR
+                          check_batched_grad=False, fast_mode=fast_mode, masked=True)
+        check(fast_mode=True)
         check(fast_mode=False)
 
     def test_gradcheck_sparse_bsr_input(self):
@@ -4665,9 +4693,8 @@ def fn(sparse_bsr):
 
             with self.assertRaisesRegex(RuntimeError, 'gradcheck expects all tensor inputs are dense'):
                 gradcheck(fn, torch.rand(2, 2, dtype=torch.double).to_sparse_bsr((2, 2)).requires_grad_(True),
-                          check_sparse_nnz=False, check_batched_grad=False, fast_mode=fast_mode)
-        # RuntimeError: "empty_sparse_compressed" expected sparse compressed (non-block) tensor layout but got SparseBsr
-        # check(fast_mode=True)
+                          check_sparse_nnz=False, check_batched_grad=False, fast_mode=fast_mode, masked=True)
+        check(fast_mode=True)
         check(fast_mode=False)
 
     def test_gradcheck_sparse_bsc_input(self):
@@ -4680,9 +4707,8 @@ def fn(sparse_bsc):
 
             with self.assertRaisesRegex(RuntimeError, 'gradcheck expects all tensor inputs are dense'):
                 gradcheck(fn, torch.rand(2, 2, dtype=torch.double).to_sparse_bsc((2, 2)).requires_grad_(True),
-                          check_sparse_nnz=False, check_batched_grad=False, fast_mode=fast_mode)
-        # RuntimeError: "empty_sparse_compressed" expected sparse compressed (non-block) tensor layout but got SparseBsc
-        # check(fast_mode=True)
+                          check_sparse_nnz=False, check_batched_grad=False, fast_mode=fast_mode, masked=True)
+        check(fast_mode=True)
         check(fast_mode=False)
 
     def test_gradcheck_nondeterministic(self):
@@ -4718,7 +4744,7 @@ def check(fast_mode):
             x = torch.rand(10, requires_grad=True).to_sparse()
             with self.assertRaisesRegex(RuntimeError, 'dense when check_sparse_nnz is set to False.'):
                 gradcheck(lambda x: x.to_dense(), (x,), check_sparse_nnz=False, check_batched_grad=False,
-                          fast_mode=fast_mode)
+                          fast_mode=fast_mode, masked=True)
             self.assertFalse(gradcheck(lambda x: x.to_dense(), (x,), check_sparse_nnz=False,
                                        check_batched_grad=False, raise_exception=False, fast_mode=fast_mode))
 
@@ -6107,6 +6133,14 @@ def test_grad_fn_attr_bindings(self):
         with self.assertRaisesRegex(RuntimeError, "after they have already been freed"):
             out.grad_fn._saved_weight
 
+        num_tensors = 3
+        input_tensors = [torch.ones(2, 2, requires_grad=True) for _ in range(num_tensors)]
+        scalars = [0.0 for _ in range(num_tensors)]                       # ArrayRef<Scalar> -> Tuple[Scalar, ...]
+        results = torch._foreach_maximum(input_tensors, scalars)
+        for t in results:
+            self.assertEqual(t.grad_fn._saved_scalars, scalars)
+
+
     def test_cant_create_saved_tensors(self):
         with self.assertRaisesRegex(RuntimeError, "Trying to create a SavedTensor object from Python is forbidden"):
             torch.autograd.SavedTensor()
@@ -6421,7 +6455,7 @@ def fn(a, b):
                         with self.assertRaisesRegex(RuntimeError, err_msg):
                             fn(a, b)
                     else:
-                        fn(a, b).backward()
+                        fn(a, b).abs().backward()
 
                     expected_called = 1
                     expected_ga_nz = True
@@ -6775,11 +6809,14 @@ def jvp(ctx, x_t):
 
     def test_named_tensor_for_complex_views(self):
         names = ["batch", "height", "width", "complex"]
-        z = torch.ones((5, 12, 14, 2), requires_grad=True)
+        z = torch.ones((2, 1, 2, 2), requires_grad=True)
         z_named = z.refine_names(*names)
         z_complex = torch.view_as_complex(z_named.rename(None)).refine_names(*names[:-1])
-        z_complex.sum().backward()
-        self.assertEqual(z.grad, torch.view_as_real(torch.ones_like(z_complex).rename(None)))
+        z_complex.sum().abs().backward()
+        expected = torch.ones_like(z_complex).rename(None)
+        abs_1_1j = abs(1 + 1j)
+        expected.fill_(complex(abs_1_1j / 2, abs_1_1j / 2))
+        self.assertEqual(z.grad, torch.view_as_real(expected))
 
     def test_custom_function_return_view_in_nograd(self):
         class Alias(Function):
@@ -7672,6 +7709,36 @@ def test_disabling_saved_tensor_hooks_nested(self):
 
         self.assertTrue(torch._C._autograd._saved_tensors_hooks_is_enabled())
 
+    def test_saved_tensor_hooks_custom_error_propagaation(self):
+        class CustomError(Exception):
+            pass
+
+        class error_on_pack_hook(torch.autograd.graph.saved_tensors_hooks):
+            def __init__(self):
+                def pack_hook(x):
+                    raise CustomError("pack")
+
+                super().__init__(pack_hook, lambda x: x)
+
+        class error_on_unpack_hook(torch.autograd.graph.saved_tensors_hooks):
+            def __init__(self):
+                def unpack_hook(x):
+                    raise CustomError("unpack")
+
+                super().__init__(lambda x: x, unpack_hook)
+
+        a = torch.tensor(1., requires_grad=True)
+
+        with error_on_pack_hook():
+            with self.assertRaisesRegex(CustomError, "pack"):
+                out = torch.sin(a)
+
+        with error_on_unpack_hook():
+            out = torch.sin(a)
+            with self.assertRaisesRegex(CustomError, "unpack"):
+                out.backward()
+
+
     def test_save_on_cpu_and_checkpoint(self):
         a = torch.randn(2, 2, requires_grad=True)
 
@@ -8858,15 +8925,15 @@ def backward(ctx, grad_x):
 
         # sparse first
         x = torch.randn(size, dtype=dtype, device=device, requires_grad=True)
-        (fn.apply(x, sparse_grad1) + fn.apply(x, dense_grad) + fn.apply(x, sparse_grad2)).sum().backward()
+        (fn.apply(x, sparse_grad1) + fn.apply(x, dense_grad) + fn.apply(x, sparse_grad2)).sum().abs().backward()
         self.assertEqual(x.grad, dense_grad + sparse_grad1 + sparse_grad2)
         # dense first
         x = torch.randn(size, dtype=dtype, device=device, requires_grad=True)
-        (fn.apply(x, dense_grad) + fn.apply(x, sparse_grad1) + fn.apply(x, sparse_grad2)).sum().backward()
+        (fn.apply(x, dense_grad) + fn.apply(x, sparse_grad1) + fn.apply(x, sparse_grad2)).sum().abs().backward()
         self.assertEqual(x.grad, dense_grad + sparse_grad1 + sparse_grad2)
         # sparse only
         x = torch.randn(size, dtype=dtype, device=device, requires_grad=True)
-        (fn.apply(x, sparse_grad1) + fn.apply(x, sparse_grad2)).sum().backward()
+        (fn.apply(x, sparse_grad1) + fn.apply(x, sparse_grad2)).sum().abs().backward()
         self.assertEqual(x.grad, sparse_grad1 + sparse_grad2)
 
     # autograd tests via common_method_invocations don't allow input tensors to
@@ -9573,8 +9640,10 @@ def test_copy_r_to_c(self, device):
 
         def do_test():
             out_c.copy_(inp_r)
-            out_c.sum().backward()
-            self.assertEqual(inp_r.grad, torch.ones_like(inp_r))
+            out_c_inter = out_c.sum()
+            out_c_inter.abs().backward()
+            with torch.no_grad():
+                self.assertEqual(inp_r.grad, torch.ones_like(inp_r) * torch.sgn(out_c_inter).real)
 
         self.assertNotWarn(do_test)
 
@@ -9583,8 +9652,10 @@ def do_test():
             inp_r = torch.randn(3, 2, dtype=torch.double, device=device,
                                 requires_grad=True)
             out = inp_r.to(torch.complex128)
-            out.sum().backward()
-            self.assertEqual(inp_r.grad, torch.ones_like(inp_r))
+            out_inter = out.sum()
+            out_inter.abs().backward()
+            with torch.no_grad():
+                self.assertEqual(inp_r.grad, torch.ones_like(inp_r) * torch.sgn(out_inter).real)
 
         self.assertNotWarn(do_test)
 
@@ -9608,6 +9679,32 @@ def test_warning_in_backward(self, device):
         with self.assertWarnsRegex(UserWarning, "Warn from backward"):
             b.backward()
 
+    def test_complex_scalar_backward(self, device):
+        a = torch.zeros(1, device=device, requires_grad=True)
+        b = a * 0.5j
+
+        msg = "grad can be implicitly created only for real scalar outputs"
+        with self.assertRaisesRegex(RuntimeError, msg):
+            b.backward()
+
+        with self.assertRaisesRegex(RuntimeError, msg):
+            torch.autograd.grad(b, a)
+
+    def test_pow_real_negative_base_complex_exponent(self, device):
+        # OpInfo doesn't naturally support input of mixed types, hence this test here.
+        base = -torch.ones(2, device=device, dtype=torch.double)
+        exponent = torch.randn(2, device=device, dtype=torch.cdouble, requires_grad=True)
+
+        def fn(exponent):
+            return torch.pow(base, exponent)
+
+        torch.autograd.gradcheck(fn, (exponent,))
+
+        def fn(exponent):
+            return torch.pow(-1, exponent)
+
+        torch.autograd.gradcheck(fn, (exponent,))
+
 class TestAllowMutationOnSaved(TestCase):
     def assertClonedLenEqual(self, ctx, n):
         self.assertEqual(len(list(ctx.cloned.items())), n)
@@ -9740,14 +9837,14 @@ def test_with_math_views(self):
             b = a.conj()
             out = (b**2).sum()
             a.sin_()
-            out.backward()
+            out.abs().backward()
 
             a = torch.tensor([1 + 1j], requires_grad=True).clone()
             b = a.conj()
             out = (b**2).sum()
             # in this case, it is no longer a view it seems
             b.sin_()
-            out.backward()
+            out.abs().backward()
 
     def test_with_out_variant(self):
         with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
@@ -9768,7 +9865,7 @@ def test_backward_out_of_context(self):
             out = (a**2).sum()
 
         msg = "Trying to backward outside of the 'allow_mutation_on_saved_tensors' context"
-        with self.assertRaisesRegex(RuntimeError, msg):
+        with self.assertRaisesRegex(AssertionError, msg):
             out.backward()
 
         # Different context
@@ -9777,7 +9874,7 @@ def test_backward_out_of_context(self):
             out = (a**2).sum()
 
         with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
-            with self.assertRaisesRegex(RuntimeError, msg):
+            with self.assertRaisesRegex(AssertionError, msg):
                 out.backward()
 
     def test_disallow_nesting(self):
@@ -10170,12 +10267,12 @@ class PropagatingThread(threading.Thread):
             def run(self):
                 self.exception = None
                 try:
-                    self.ret = super(PropagatingThread, self).run()
+                    self.ret = super().run()
                 except Exception as e:
                     self.exception = e
 
             def join(self, timeout=None):
-                super(PropagatingThread, self).join(timeout)
+                super().join(timeout)
                 if self.exception:
                     raise self.exception from self.exception
                 return self.ret
@@ -10600,6 +10697,33 @@ def backward(ctx, gO):
         TestFn.apply(inp, None).sum().backward()
         self.assertFalse(threads_eq)
 
+    @onlyCUDA
+    def test_backward_tls_stash(self):
+
+        local = threading.local()
+        local.my_obj = {}
+        local.my_obj[10] = 10
+        test_self = self
+        torch._C._stash_obj_in_tls("my_obj", local.my_obj)
+
+        class TestFn(Function):
+            @staticmethod
+            def forward(ctx, x, self):
+                return x.clone()
+
+            @staticmethod
+            def backward(ctx, gO):
+                test_self.assertTrue(torch._C._is_key_in_tls("my_obj"))
+                test_self.assertTrue(torch._C._get_obj_in_tls("my_obj")[10] == 10)
+                torch._C._get_obj_in_tls("my_obj")[10] = 5
+                return gO, None
+
+        inp = torch.rand(10, device="cuda", requires_grad=True)
+
+        TestFn.apply(inp, None).sum().backward()
+        self.assertEqual(local.my_obj[10], 5)
+
+
 # Import test cases from below autograd/ here. These are found
 # implicitly by the loader, so Flake8 thinks they are unused, hence
 # the suppressions.
diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py
index 8ffab2daa6e2..52d7c7a4ffcb 100644
--- a/test/test_binary_ufuncs.py
+++ b/test/test_binary_ufuncs.py
@@ -14,7 +14,7 @@
 from functools import partial
 
 import torch.autograd.forward_ad as fwAD
-from torch._six import inf, nan
+from torch import inf, nan
 from torch.testing._internal.common_utils import (
     TestCase,
     slowTest,
@@ -1088,6 +1088,34 @@ def test_div_rounding_numpy(self, device, dtype):
                 actual, expect, exact_device=False, exact_dtype=exact_dtype
             )
 
+    @dtypes(*complex_types())
+    def test_complex_div_underflow_overflow(self, device, dtype):
+        # test to make sure the complex division does not produce underflow or overflow
+        # in the intermediate of its calculations
+        # NOTE: the calculation still produces an error if the number is greater than
+        # finfo.max / 2, but hopefully people realized that it's a dangerous region to work with
+        finfo = torch.finfo(dtype)
+        nom_lst = [complex(finfo.min / 2, finfo.min / 2),
+                   complex(finfo.max / 2, finfo.max / 2),
+                   complex(finfo.tiny, finfo.tiny),
+                   complex(finfo.tiny, 0.0),
+                   complex(0.0, 0.0)]
+        denom_lst = [complex(finfo.min / 2, finfo.min / 2),
+                     complex(finfo.max / 2, finfo.max / 2),
+                     complex(finfo.tiny, finfo.tiny),
+                     complex(0.0, finfo.tiny),
+                     complex(finfo.tiny, finfo.tiny)]
+        expected_lst = [complex(1.0, 0.0),
+                        complex(1.0, 0.0),
+                        complex(1.0, 0.0),
+                        complex(0.0, -1.0),
+                        complex(0.0, 0.0)]
+        nom = torch.tensor(nom_lst, dtype=dtype, device=device)
+        denom = torch.tensor(denom_lst, dtype=dtype, device=device)
+        expected = torch.tensor(expected_lst, dtype=dtype, device=device)
+        res = nom / denom
+        self.assertEqual(res, expected)
+
     # Tests that trying to add, inplace, a CUDA tensor to a CPU tensor
     #   throws the correct error message
     @onlyCUDA
@@ -1529,7 +1557,7 @@ def test_pow_inplace_resizing_exception(self, device):
             ((2, 1), (2, 2)),
             ((2, 2), (2, 1, 1)),
         )
-        test_inputs = list(
+        test_inputs = [
             (
                 make_tensor(
                     base_size, dtype=torch.float64, device=device, high=10.0, low=0.0
@@ -1539,7 +1567,7 @@ def test_pow_inplace_resizing_exception(self, device):
                 ),
             )
             for base_size, exp_size in test_cases
-        )
+        ]
         for base, exponent in test_inputs:
             regex = "doesn't match the broadcast shape"
             self.assertRaisesRegex(RuntimeError, regex, base.pow_, exponent)
@@ -1577,10 +1605,10 @@ def test_float_scalar_pow_float_tensor(self, device, dtype):
             (2, 1),
             (2, 2, 2),
         )
-        tensors = list(
+        tensors = [
             make_tensor(shape, dtype=dtype, device=device, low=0)
             for shape in exponent_shapes
-        )
+        ]
         floats_tensor = torch.tensor(floats, dtype=dtype, device=device)
         for base in floats:
             self._test_pow(base, floats_tensor)
@@ -3387,6 +3415,12 @@ def _test_logaddexp(self, device, dtype, base2):
         if base2:
             ref_func = np.logaddexp2
             our_func = torch.logaddexp2
+        elif dtype in (torch.complex64, torch.complex128):
+            # numpy has not implemented logaddexp for complex
+            def _ref_func(x, y):
+                return scipy.special.logsumexp(np.stack((x, y), axis=0), axis=0)
+            ref_func = _ref_func
+            our_func = torch.logaddexp
         else:
             ref_func = np.logaddexp
             our_func = torch.logaddexp
@@ -3425,7 +3459,8 @@ def _test_helper(a, b):
         )
         _test_helper(a, b)
 
-    @dtypes(torch.float32, torch.float64, torch.bfloat16)
+    @dtypesIfCUDA(torch.float32, torch.float64, torch.bfloat16)
+    @dtypes(torch.float32, torch.float64, torch.bfloat16, torch.complex64, torch.complex128)
     def test_logaddexp(self, device, dtype):
         self._test_logaddexp(device, dtype, base2=False)
 
diff --git a/test/test_bundled_inputs.py b/test/test_bundled_inputs.py
index 0330af378746..db3c8df9b872 100644
--- a/test/test_bundled_inputs.py
+++ b/test/test_bundled_inputs.py
@@ -194,7 +194,7 @@ def foo(self, arg):
 
         # Check helper that work on all functions
         all_info = loaded.get_bundled_inputs_functions_and_info()
-        self.assertEqual(set(all_info.keys()), set(['forward', 'foo']))
+        self.assertEqual(set(all_info.keys()), {'forward', 'foo'})
         self.assertEqual(all_info['forward']['get_inputs_function_name'], ['get_all_bundled_inputs_for_forward'])
         self.assertEqual(all_info['foo']['get_inputs_function_name'], ['get_all_bundled_inputs_for_foo'])
         self.assertEqual(all_info['forward']['info'], info)
diff --git a/test/test_cpp_extensions_aot.py b/test/test_cpp_extensions_aot.py
index f29922b4a3f7..75ea8a9c7de3 100644
--- a/test/test_cpp_extensions_aot.py
+++ b/test/test_cpp_extensions_aot.py
@@ -3,8 +3,7 @@
 from itertools import repeat
 import os
 import re
-import sys
-from typing import Union
+from typing import Union, get_args, get_origin
 import unittest
 
 import torch.testing._internal.common_utils as common
@@ -14,16 +13,6 @@
 import torch.backends.cudnn
 import torch.utils.cpp_extension
 
-if sys.version_info >= (3, 8):
-    from typing import get_args, get_origin
-else:
-    def get_args(tp):
-        return tp.__args__
-
-    def get_origin(tp):
-        if hasattr(tp, "__origin__"):
-            return tp.__origin__
-
 try:
     import pytest
     HAS_PYTEST = True
@@ -202,7 +191,7 @@ def check_union(self, funcs):
         In these cases we expect to get exactly one function per python type.
         """
         # Verify that all functions have the same return type.
-        union_type = set(self.expected_return_type(f) for f in funcs)
+        union_type = {self.expected_return_type(f) for f in funcs}
         assert len(union_type) == 1
         union_type = union_type.pop()
         self.assertIs(Union, get_origin(union_type))
@@ -293,7 +282,7 @@ def test_conv_backend_override(self):
 class TestRNGExtension(common.TestCase):
 
     def setUp(self):
-        super(TestRNGExtension, self).setUp()
+        super().setUp()
 
     @skipIfTorchDynamo("https://github.com/pytorch/torchdynamo/issues/1991")
     def test_rng(self):
diff --git a/test/test_cpp_extensions_jit.py b/test/test_cpp_extensions_jit.py
index 3b6d7ee0c290..9351d5ece715 100644
--- a/test/test_cpp_extensions_jit.py
+++ b/test/test_cpp_extensions_jit.py
@@ -16,6 +16,7 @@
 import torch.utils.cpp_extension
 from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
 from torch.testing._internal.common_utils import gradcheck
+import torch.multiprocessing as mp
 
 
 TEST_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
@@ -30,12 +31,15 @@
 
 
 def remove_build_path():
-    if sys.platform == "win32":
-        print("Not wiping extensions build folder because Windows")
-        return
     default_build_root = torch.utils.cpp_extension.get_default_build_root()
     if os.path.exists(default_build_root):
-        shutil.rmtree(default_build_root)
+        if IS_WINDOWS:
+            # rmtree returns permission error: [WinError 5] Access is denied
+            # on Windows, this is a word-around
+            subprocess.run(["rm", "-rf", default_build_root], stdout=subprocess.PIPE)
+        else:
+            shutil.rmtree(default_build_root)
+
 
 # There's only one test that runs gracheck, run slow mode manually
 class TestCppExtensionJIT(common.TestCase):
@@ -145,16 +149,30 @@ def _check_cuobjdump_output(expected_values, is_ptx=False):
         old_envvar = os.environ.get('TORCH_CUDA_ARCH_LIST', None)
         try:
             os.environ['TORCH_CUDA_ARCH_LIST'] = flags
-            torch.utils.cpp_extension.load(
-                name="cudaext_archflags",
-                sources=[
+
+            params = {
+                "name": "cudaext_archflags",
+                "sources": [
                     "cpp_extensions/cuda_extension.cpp",
                     "cpp_extensions/cuda_extension.cu",
                 ],
-                extra_cuda_cflags=["-O2"],
-                verbose=True,
-                build_directory=temp_dir,
-            )
+                "extra_cuda_cflags": ["-O2"],
+                "verbose": True,
+                "build_directory": temp_dir,
+            }
+
+            if IS_WINDOWS:
+                p = mp.Process(target=torch.utils.cpp_extension.load, kwargs=params)
+
+                # Compile and load the test CUDA arch in a different Python process to avoid
+                # polluting the current one and causes test_jit_cuda_extension to fail on
+                # Windows. There is no clear way to unload a module after it has been imported
+                # and torch.utils.cpp_extension.load builds and loads the module in one go.
+                # See https://github.com/pytorch/pytorch/issues/61655 for more details
+                p.start()
+                p.join()
+            else:
+                torch.utils.cpp_extension.load(**params)
 
             # Expected output for --list-elf:
             #   ELF file    1: cudaext_archflags.1.sm_61.cubin
@@ -166,7 +184,9 @@ def _check_cuobjdump_output(expected_values, is_ptx=False):
                 _check_cuobjdump_output(expected[1], is_ptx=True)
         finally:
             if IS_WINDOWS:
-                print("Not wiping extensions build folder because Windows")
+                # rmtree returns permission error: [WinError 5] Access is denied
+                # on Windows, this is a word-around
+                subprocess.run(["rm", "-rf", temp_dir], stdout=subprocess.PIPE)
             else:
                 shutil.rmtree(temp_dir)
 
@@ -512,7 +532,7 @@ def test_cpp_frontend_module_python_inter_op(self):
         # Create a torch.nn.Module which uses the C++ module as a submodule.
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.x = torch.nn.Parameter(torch.tensor(1.0))
                 self.net = extension.Net(3, 5)
 
@@ -565,7 +585,7 @@ def forward(self, input):
         # Try calling zero_grad()
         net.zero_grad()
         for p in net.parameters():
-            self.assertEqual(p.grad, torch.zeros_like(p))
+            assert p.grad is None, "zero_grad defaults to setting grads to None"
 
         # Test train(), eval(), training (a property)
         self.assertTrue(net.training)
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 47610a979d81..201b7928ae71 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -16,18 +16,19 @@
 import threading
 import unittest
 import warnings
+import subprocess
 from random import randint
 
 import torch
 import torch.cuda
 import torch.cuda.comm as comm
+from torch import inf, nan
 from torch.nn.parallel import scatter_gather
 from torch.utils.checkpoint import checkpoint_sequential
-from torch._six import inf, nan
 from torch.testing._internal.common_utils import TestCase, freeze_rng_state, run_tests, \
     NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, IS_REMOTE_GPU, IS_SANDCASTLE, IS_WINDOWS, \
     slowTest, skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf, TEST_WITH_ROCM, TEST_NUMPY, \
-    get_cycles_per_ms, parametrize, instantiate_parametrized_tests, subtest
+    get_cycles_per_ms, parametrize, instantiate_parametrized_tests, subtest, IS_JETSON, gcIfJetson
 from torch.testing._internal.autocast_test_lists import AutocastTestLists
 
 # load_tests from common_utils is used to automatically filter tests for
@@ -48,8 +49,10 @@
 TEST_CUDAMALLOCASYNC = TEST_CUDA and (torch.cuda.get_allocator_backend() == "cudaMallocAsync")
 TEST_LARGE_TENSOR = TEST_CUDA
 TEST_MEDIUM_TENSOR = TEST_CUDA
+TEST_GRAPH = TEST_CUDA
 TEST_CUDNN = TEST_CUDA
 TEST_BF16 = False
+TEST_PYNVML = not torch.cuda._HAS_PYNVML
 if TEST_CUDA:
     torch.ones(1).cuda()  # initialize cuda context
     TEST_CUDNN = TEST_CUDA and (TEST_WITH_ROCM or
@@ -57,17 +60,8 @@
     TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 12e9
     TEST_MEDIUM_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 6e9
     TEST_BF16 = torch.cuda.is_bf16_supported()
-
-
-def make_sparse_tensor(t, n, *sizes):
-    assert t.is_sparse
-    tensor = t()
-    i = tensor._indices()
-    i = i.new(len(sizes), n).copy_(
-        torch.cat([torch.LongTensor(1, n).random_(s) for s in sizes], 0))
-    v = tensor._values()
-    v = v.new(n).copy_(torch.randn(n))
-    return t(i, v, torch.Size(sizes)).coalesce()
+    TEST_GRAPH = (torch.version.cuda and int(torch.version.cuda.split(".")[0]) >= 11) or \
+                 (torch.version.hip and float(".".join(torch.version.hip.split(".")[0:2])) >= 5.3)
 
 _cycles_per_ms = None
 
@@ -78,12 +72,12 @@ class TestCuda(TestCase):
     FIFTY_MIL_CYCLES = 50000000
 
     def setUp(self):
-        super(TestCuda, self).setUp()
+        super().setUp()
         self.autocast_lists = AutocastTestLists(torch.device('cuda:0'))
 
     def tearDown(self):
         del self.autocast_lists
-        super(TestCuda, self).tearDown()
+        super().tearDown()
 
     def _check_memory_stat_consistency(self):
         snapshot = torch.cuda.memory_snapshot()
@@ -106,6 +100,10 @@ def _check_memory_stat_consistency(self):
             expected["active_bytes.all.current"] += segment["active_size"]
             expected["active_bytes." + pool_str + ".current"] += segment["active_size"]
 
+            expected["requested_bytes.all.current"] += segment["requested_size"]
+            expected["requested_bytes." + pool_str + ".current"] += segment["requested_size"]
+
+            sum_requested = 0
             is_split = len(segment["blocks"]) > 1
             for block in segment["blocks"]:
                 if block["state"] == "active_allocated":
@@ -113,6 +111,7 @@ def _check_memory_stat_consistency(self):
                     expected["allocation." + pool_str + ".current"] += 1
 
                 if block["state"].startswith("active_"):
+                    sum_requested += block["requested_size"]
                     expected["active.all.current"] += 1
                     expected["active." + pool_str + ".current"] += 1
 
@@ -122,6 +121,8 @@ def _check_memory_stat_consistency(self):
                     expected["inactive_split_bytes.all.current"] += block["size"]
                     expected["inactive_split_bytes." + pool_str + ".current"] += block["size"]
 
+            self.assertEqual(sum_requested, segment["requested_size"])
+
         for device, expected in expected_each_device.items():
             stats = torch.cuda.memory_stats(device)
             for k, v in expected.items():
@@ -379,7 +380,7 @@ def test_out_of_memory(self):
         self.assertTrue((tensor == 1).all())
 
 
-    @unittest.skipIf(TEST_CUDAMALLOCASYNC, "Segmentation fault (core dumped)")
+    @unittest.skipIf(TEST_CUDAMALLOCASYNC or IS_JETSON, "Segmentation fault (core dumped)")
     def test_out_of_memory_retry(self):
         torch.cuda.empty_cache()
         total_memory = torch.cuda.get_device_properties(0).total_memory
@@ -1589,7 +1590,7 @@ def _spawn_test_multinomial_invalid_probs_cuda(self, probs):
             p = subprocess.Popen([sys.executable, '-c', f"""\
 import sys
 import torch
-from torch._six import inf, nan
+from torch import inf, nan
 try:
     with torch.random.fork_rng(devices=[0]):
         torch.multinomial(torch.tensor({probs}).to('cuda'), 2, replacement=True)
@@ -1737,6 +1738,10 @@ def _test(idx):
             before_free_bytes, before_available_bytes = torch.cuda.mem_get_info(idx)
             # increasing to 8MB to force acquiring a new block and overcome blocksize differences across platforms
             t = torch.randn(1024 * 1024 * 8, device='cuda:' + str(idx))
+            if IS_JETSON:
+                # w/o syncing, mem_get_info will run before memory allocated has actually increased.
+                # This race condition causes consistent failure
+                torch.cuda.synchronize()
             after_free_bytes, after_available_bytes = torch.cuda.mem_get_info(idx)
 
             self.assertTrue(after_free_bytes < before_free_bytes)
@@ -1760,9 +1765,18 @@ def leak_gpu0():
             l.append(torch.randn(1024 * 1024 * 8, device=torch.device("cuda:0")))
 
         no_leak()
-
-        with self.assertRaisesRegex(RuntimeError, r"CUDA driver API confirmed .+ on device 0.+"):
-            leak_gpu0()
+        regex = r"CUDA driver API confirmed .+ on device 0.+"
+        if IS_JETSON:
+            try:
+                leak_gpu0()
+            except RuntimeError as e:
+                import re
+                assert re.match(regex, str(e)), str(e) + "\n does not match: \n" + regex
+        else:
+            # assertRaisesRegex does not pass with Python for Jetson,
+            # even though the RuntimeError matches regex using re.match
+            with self.assertRaisesRegex(RuntimeError, regex):
+                leak_gpu0()
 
         if TEST_MULTIGPU:
             @self.wrap_with_cuda_memory_check
@@ -1791,6 +1805,7 @@ def test_cuda_kernel_loop_overflow(self):
         self.assertEqual(y[0, 0, 0, 2**30], expected)
 
     @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory")
+    @gcIfJetson
     def test_cuda_kernel_loop_overflow_large(self):
         # Make sure input.numel() > INT_MAX is handled:
         x = torch.randn(1, 1, 1, 2**31, dtype=torch.float16, device="cuda")
@@ -1865,7 +1880,7 @@ def test_streaming_backwards_multiple_streams(self):
 
         class StreamModel(torch.nn.Module):
             def __init__(self):
-                super(StreamModel, self).__init__()
+                super().__init__()
                 self.event = torch.cuda.Event()
                 self.stream0 = torch.cuda.Stream()
                 self.stream1 = torch.cuda.Stream()
@@ -2214,11 +2229,6 @@ def test_grad_scaling_unscale_sparse(self, device="cuda", dtype=torch.float):
         found_inf = torch.empty((1,), dtype=dtype, device=device)
         cur = found_inf.device
 
-        # As of d0c925f (4/16/20), docs are unclear about best API for sparse cuda tensor construction.
-        # https://pytorch.org/docs/master/tensors.html shows torch.sparse_coo_tensor(...), but it has no docstring.
-        # The same page shows several tensors with layout=torch.sparse_coo, but no constructors using that layout.
-        # Meanwhile, https://pytorch.org/docs/master/sparse.html shows torch.sparse.FloatTensor(...), which looks
-        # legacy and does not accept a device="cuda" kwarg.  Going with torch.sparse_coo_tensor.
         i = torch.tensor([[0, 1, 1],
                           [2, 0, 2]], device="cuda", dtype=torch.int64)
         v = torch.tensor([16., 32., 64.], device="cuda", dtype=torch.float)
@@ -2423,7 +2433,7 @@ def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api):
         # NOTE(mkozuki): With current way of testing, `torch.optim.Adam` is failing in spite of `foreach` and `fused`.
         #   Giving some flexibility to this test might help.
         context = contextlib.nullcontext
-        if optimizer_ctor in (torch.optim.Adam,):
+        if optimizer_ctor in (torch.optim.Adam, torch.optim.AdamW):
             from functools import partial
             context = partial(self.assertRaises, AssertionError)
         with context():
@@ -2438,25 +2448,30 @@ def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api):
             )
 
     def test_grad_scaling_autocast(self):
-        for optimizer_ctor in (torch.optim.SGD, torch.optim.Adam):
+        for optimizer_ctor in (torch.optim.SGD, torch.optim.Adam, torch.optim.AdamW):
             self._grad_scaling_autocast_test(optimizer_ctor=optimizer_ctor)
 
     def test_grad_scaling_autocast_foreach(self):
-        for optimizer_ctor in (torch.optim.SGD, torch.optim.Adam):
+        for optimizer_ctor in (torch.optim.SGD, torch.optim.Adam, torch.optim.AdamW):
             self._grad_scaling_autocast_test(optimizer_ctor=optimizer_ctor, optimizer_kwargs={"foreach": True})
 
     def test_grad_scaling_autocast_fused(self):
-        self._grad_scaling_autocast_test(optimizer_ctor=torch.optim.Adam, optimizer_kwargs={"fused": True})
+        for optimizer_ctor in (torch.optim.Adam, torch.optim.AdamW):
+            self._grad_scaling_autocast_test(optimizer_ctor=optimizer_ctor, optimizer_kwargs={"fused": True})
 
+    # Compare non-fused optimizer vs fused one as the fused one unscales gradients
+    # inside its cuda kernel unlike the other.
     def test_grad_scaling_autocast_fused_optimizers(self):
-        for optimizer_ctor, optimizer_kwargs in (
-            (torch.optim.Adam, {"fused": True, "amsgrad": False}),
-            (torch.optim.Adam, {"fused": True, "amsgrad": True}),
+        for optimizer_ctor, optimizer_kwargs, separate_unscale in product(
+            (torch.optim.Adam, torch.optim.AdamW),
+            ({"fused": True, "amsgrad": False}, {"fused": True, "amsgrad": True}),
+            (False, True),
         ):
-            self._grad_scaling_autocast_fused_optimizers(
-                optimizer_ctor=optimizer_ctor, optimizer_kwargs=optimizer_kwargs)
+            with self.subTest(optim=optimizer_ctor, kwargs=optimizer_kwargs, separate_unscale=separate_unscale):
+                self._grad_scaling_autocast_fused_optimizers(
+                    optimizer_ctor=optimizer_ctor, optimizer_kwargs=optimizer_kwargs, separate_unscale=separate_unscale)
 
-    def _grad_scaling_autocast_fused_optimizers(self, optimizer_ctor, optimizer_kwargs):
+    def _grad_scaling_autocast_fused_optimizers(self, optimizer_ctor, optimizer_kwargs, separate_unscale):
         (
             mod_control, mod_scaling, opt_control, opt_scaling, data, loss_fn, _,
         ) = self._create_scaling_case(optimizer_ctor=optimizer_ctor, optimizer_kwargs=optimizer_kwargs)
@@ -2480,6 +2495,8 @@ def _grad_scaling_autocast_fused_optimizers(self, optimizer_ctor, optimizer_kwar
                 output_scaling = mod_scaling(input)
                 loss_scaling = loss_fn(output_scaling, target)
             scaler.scale(loss_scaling).backward()
+            if separate_unscale:
+                scaler.unscale_(opt_scaling)
             scaler.step(opt_scaling)
             scaler.update()
 
@@ -2494,7 +2511,7 @@ def _grad_scaling_autocast_fused_optimizers(self, optimizer_ctor, optimizer_kwar
                     actual = state_scaling[k]
                     if k == "step":
                         actual = actual.squeeze()
-                    self.assertEqual(state_control[k], actual, msg=k)
+                    self.assertEqual(state_control[k], actual)
 
     def test_grad_scaling_clipping(self):
         def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api):
@@ -2648,6 +2665,42 @@ def run(model0, model1, optimizer0, optimizer1, try_scaling_api):
                             chain(mod_scaling0.parameters(), mod_scaling1.parameters())):
                 self.assertEqual(c, s, rtol=1e-5, atol=1e-7)
 
+    def test_grad_scaler_pass_itself(self):
+        class _PlaceHolderOptimizer(torch.optim.Optimizer):
+            tester = self
+
+            def __init__(self, params, defaults=None):
+                if defaults is None:
+                    defaults = {}
+                super().__init__(params, defaults)
+                self._step_supports_amp_scaling = True
+
+        class Optimizer1(_PlaceHolderOptimizer):
+            def step(self, closure=None, *, grad_scaler=None):
+                self.tester.assertTrue(isinstance(grad_scaler, torch.cuda.amp.GradScaler))
+                self.tester.assertFalse(hasattr(self, "grad_scale"))
+                self.tester.assertFalse(hasattr(self, "found_inf"))
+
+        class Optimizer2(_PlaceHolderOptimizer):
+            def step(self, closure=None):
+                self.tester.assertTrue(isinstance(self.grad_scale, torch.Tensor))
+                self.tester.assertTrue(isinstance(self.found_inf, torch.Tensor))
+
+        x = torch.randn(4, 4).cuda()
+        m = torch.nn.Linear(4, 1).cuda()
+        o1 = Optimizer1(m.parameters())
+        o2 = Optimizer2(m.parameters())
+        scaler = torch.cuda.amp.GradScaler(init_scale=2.0)
+
+        with torch.cuda.amp.autocast():
+            y = m(x)
+            loss = y.mean()
+        scaler.scale(loss).backward()
+        with self.assertWarns(FutureWarning):
+            scaler.step(o1)
+        scaler.step(o2)
+        scaler.update()
+
     @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
     def test_grad_scaling_multigpu(self):
         # Same as above, but runs some of the models on device 1.
@@ -3315,9 +3368,7 @@ def test_graph_is_current_stream_capturing(self):
                 self.assertTrue(torch.cuda.is_current_stream_capturing())
                 g.capture_end()
 
-    @unittest.skipIf((not TEST_CUDA) or
-                     TEST_WITH_ROCM or
-                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     def test_graph_capture_simple(self):
         s = torch.cuda.Stream()
 
@@ -3336,6 +3387,36 @@ def test_graph_capture_simple(self):
 
         self.assertTrue(b.sum().item() == 11000.)
 
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
+    def test_graph_error(self):
+        # We need to run this test in a separate thread as the error we trigger
+        # puts the cuda context in a bad state
+        script = """
+import torch
+
+g = torch.cuda.CUDAGraph()
+try:
+    g.capture_begin()
+except RuntimeError as e:
+    if "CUDA graphs must be captured on a non-default stream." in str(e):
+        exit(0)
+    else:
+        exit(1)
+exit(2)
+"""
+        try:
+            a = subprocess.check_output(
+                [sys.executable, '-c', script],
+                stderr=subprocess.STDOUT,
+                # On Windows, opening the subprocess with the default CWD makes `import torch`
+                # fail, so just set CWD to this script's directory
+                cwd=os.path.dirname(os.path.realpath(__file__)),)
+        except subprocess.CalledProcessError as e:
+            if e.returncode == 1:
+                self.assertTrue(False, "Error raise by starting capture without a stream is not the expected one")
+            elif e.returncode == 2:
+                self.assertTrue(False, "Error raised by starting capture without a stream was not caught")
+
     @unittest.skipIf((not TEST_CUDA) or
                      TEST_WITH_ROCM or
                      int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
@@ -3348,9 +3429,7 @@ def test_graph_warn_if_has_zero_nodes(self):
                 g.capture_end()
         self.assertTrue(any("The CUDA Graph is empty" in str(w.message) for w in caught))
 
-    @unittest.skipIf((not TEST_CUDA) or
-                     TEST_WITH_ROCM or
-                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     def test_graph_capture_oom(self):
         oom_regex = "would exceed allowed memory" if TEST_CUDAMALLOCASYNC else \
                     "out of memory"
@@ -3358,9 +3437,7 @@ def test_graph_capture_oom(self):
             with torch.cuda.graph(torch.cuda.CUDAGraph()):
                 torch.zeros(2 ** 40, device="cuda")
 
-    @unittest.skipIf((not TEST_CUDA) or
-                     TEST_WITH_ROCM or
-                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     def test_repeat_graph_capture_cublas_workspace_memory(self):
         (x, y, z) = 1024, 512, 64
         a = torch.rand((x, y), device='cuda')
@@ -3383,9 +3460,7 @@ def test_repeat_graph_capture_cublas_workspace_memory(self):
 
         self.assertFalse(used_gb_before + 0.1 < used_gb_after)
 
-    @unittest.skipIf((not TEST_CUDA) or
-                     TEST_WITH_ROCM or
-                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     def test_graph_rng_functional(self):
         ops_with_kwargs = ((torch.nn.functional.dropout, {"p": 0.1}),
                            (torch.nn.functional.rrelu, {"training": True}),)
@@ -3469,9 +3544,7 @@ def run(op, kwargs):
         for op, kwargs in ops_with_kwargs:
             run(op, kwargs)
 
-    @unittest.skipIf((not TEST_CUDA) or
-                     TEST_WITH_ROCM or
-                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     def test_graph_rng_distributions(self):
         size = 10000
         input = torch.rand((size,), device="cuda", dtype=torch.float)
@@ -3598,9 +3671,7 @@ def run(module, op, args, kwargs):
             # Adds an empty dict for kwargs, which none of the Tensor methods use
             run("Tensor", *(meth_with_args + ({},)))
 
-    @unittest.skipIf((not TEST_CUDA) or
-                     TEST_WITH_ROCM or
-                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     def test_graph_two_successive(self):
         torch.cuda.empty_cache()
 
@@ -3663,14 +3734,14 @@ def func_with_temps(t, val):
             torch.cuda.synchronize()
             torch.cuda.empty_cache()
 
-    @unittest.skipIf((not TEST_CUDA) or
-                     TEST_WITH_ROCM or
+    @unittest.skipIf((not TEST_GRAPH) or
                      IS_WINDOWS or  # appears to still be broken on Windows as of 11.4+
-                     int(torch.version.cuda.split(".")[0]) < 11 or
-                     (int(torch.version.cuda.split(".")[0]) == 11 and
-                      int(torch.version.cuda.split(".")[1]) < 4),
+                     (torch.version.cuda and
+                     int(torch.version.cuda.split(".")[0]) == 11 and
+                     int(torch.version.cuda.split(".")[1]) < 4),
                      "Graph bindings disallow concurrent replay for CUDA < 11.4, see " +
                      "https://github.com/pytorch/pytorch/pull/57556")
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     def test_graph_concurrent_replay(self):
         torch.cuda.empty_cache()
 
@@ -3739,9 +3810,7 @@ def func_with_temps(t, val):
             torch.cuda.synchronize()
             torch.cuda.empty_cache()
 
-    @unittest.skipIf((not TEST_CUDA) or
-                     TEST_WITH_ROCM or
-                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     def test_graph_three_successive(self):
         torch.cuda.empty_cache()
 
@@ -3801,10 +3870,8 @@ def test_graph_three_successive(self):
             torch.cuda.synchronize()
             torch.cuda.empty_cache()
 
-    @unittest.skipIf((not TEST_CUDA) or
-                     TEST_WITH_ROCM or
-                     TEST_CUDAMALLOCASYNC or
-                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    @unittest.skipIf((not TEST_GRAPH) or
+                     TEST_CUDAMALLOCASYNC , "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     def test_graph_memory_stats_and_use_result_after_destroy_graph(self):
         kSmallSize = 1048576
         kSmallBuffer = 2097152
@@ -3907,9 +3974,7 @@ def test_graph_memory_stats_and_use_result_after_destroy_graph(self):
             torch.cuda.synchronize()
             torch.cuda.empty_cache()
 
-    @unittest.skipIf((not TEST_CUDA) or
-                     TEST_WITH_ROCM or
-                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     def test_graph_record_stream(self):
         # Makes sure graph capture defers attempting to reclaim allocations used across streams. See
         # "Q. Why skip process_events if a capture might be underway?" in c10/cuda/CUDACachingAllocator.cpp
@@ -3949,9 +4014,8 @@ def test_graph_record_stream(self):
         # dummy allocation triggers process_events, Hopefully successfully processes b's end-of-life event.
         c = torch.zeros((3,), device="cuda")
 
-    @unittest.skipIf((not TEST_CUDA) or
-                     TEST_WITH_ROCM or
-                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    @skipIfRocm
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     # If this test is the first in the process to try cudnn rnns with dropout, it'll initialize
     # DropoutState's long-lived internal buffer. Calling code perceives this (correct) behavior
     # as a memory leak unless we skip the leak check.
@@ -3980,9 +4044,7 @@ def test_graph_cudnn_dropout(self):
 
         y = model(x)
 
-    @unittest.skipIf((not TEST_CUDA) or
-                     TEST_WITH_ROCM or
-                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     def test_graph_grad_scaling(self):
         torch.cuda.empty_cache()
 
@@ -4006,9 +4068,11 @@ def test_graph_grad_scaling(self):
         opt.zero_grad(set_to_none=True)
 
         # capture
-        with torch.cuda.graph(g):
+        with torch.cuda.stream(s):
+            g.capture_begin()
             loss = (weight.half() * static_input).sum()
             scaler.scale(loss).backward()
+            g.capture_end()
 
         input_vals = [5, 20000, 5, 40000]
         # If the scale gets updated properly, these are the scale, growth tracker,
@@ -4029,15 +4093,12 @@ def test_graph_grad_scaling(self):
             self.assertEqual(scaler._scale, scale)
             self.assertEqual(scaler._growth_tracker, growth_tracker)
 
-    @unittest.skipIf(
-        (not TEST_CUDA) or TEST_WITH_ROCM or int(torch.version.cuda.split(".")[0]) < 11,
-        "CUDA >= 11.0 required for graphs",
-    )
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     @parametrize(
         "with_amp,cache_enabled,allow_unused_input",
         [
-            (False, False, True),
-            (True, False, True),
+            subtest((False, False, True), decorators=[skipIfRocm]),
+            subtest((True, False, True), decorators=[skipIfRocm]),
             subtest((True, True, True), decorators=[unittest.expectedFailure]),
             subtest((False, False, False), decorators=[unittest.expectedFailure]),
         ],
@@ -4201,9 +4262,7 @@ def _test_graphed_optimizer(self, steps_warmup, steps_train, optimizer_ctor, kwa
             for p_control, p_graphed in zip(params_control, params_graphed):
                 self.assertEqual(p_control, p_graphed)
 
-    @unittest.skipIf((not TEST_CUDA) or
-                     TEST_WITH_ROCM or
-                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
     def test_graph_adam_adamw(self):
         # Needs generalization if we want to extend this test to non-Adam-like optimizers.
         cases = [
@@ -4211,22 +4270,64 @@ def test_graph_adam_adamw(self):
             for optimizer_ctor, foreach, amsgrad in product(
                 (torch.optim.Adam, torch.optim.AdamW), (False, True), (False, True),)
         ] + [
-            (torch.optim.Adam, {"lr": 0.1, "betas": (0.8, 0.7), "fused": True, "amsgrad": amsgrad})
-            for amsgrad in (False, True)
+            (optimizer_ctor, {"lr": 0.1, "betas": (0.8, 0.7), "fused": True, "amsgrad": amsgrad})
+            for optimizer_ctor, amsgrad in product((torch.optim.Adam, torch.optim.AdamW), (False, True))
         ]
 
         for optimizer_ctor, kwargs in cases:
             with self.subTest(optimizer_ctor=optimizer_ctor, kwargs=kwargs):
                 self._test_graphed_optimizer(3, 2, optimizer_ctor, kwargs)
 
-    @unittest.skipIf(
-        (not TEST_CUDA) or TEST_WITH_ROCM or int(torch.version.cuda.split(".")[0]) < 11,
-        "CUDA >= 11.0 required for graphs",
-    )
-    def test_graph_scaling_fusedadam(self):
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
+    def test_graph_adam_adamw_with_explicitly_capturable_param_groups(self):
+        # mimicking `_test_graphed_optimizer` maladroitly to pass two param_groups to optimizer.__init__
+        n_warmup, n_replay = 3, 2
+        for optimizer, second_param_group_capturable in product((torch.optim.Adam, torch.optim.AdamW), (True, False)):
+            ref_p1, param1 = [torch.nn.Parameter(torch.ones(1, device="cuda")) for _ in range(2)]
+            ref_p2, param2 = [torch.nn.Parameter(torch.ones(1, device="cuda")) for _ in range(2)]
+            grads1, grads2 = [[torch.randn_like(param1) for _ in range(n_warmup + n_replay)] for _ in range(2)]
+            ref_grads1, ref_grads2 = [[t.clone() for t in tensors] for tensors in (grads1, grads2)]
+            params = [
+                {"params": [param1], "capturable": True},
+                {"params": [param2], "capturable": second_param_group_capturable},
+            ]
+            opt = optimizer(params)
+            opt_ = optimizer([
+                {"params": [ref_p1], "capturable": False},
+                {"params": [ref_p2], "capturable": False},
+            ])
+
+            for i in range(n_warmup + n_replay):
+                ref_p1.grad = ref_grads1[i]
+                ref_p2.grad = ref_grads2[i]
+                opt_.step()
+
+            for i in range(n_warmup):
+                param1.grad = grads1[i]
+                param2.grad = grads2[i]
+                opt.step()
+
+            g = torch.cuda.CUDAGraph()
+            if not second_param_group_capturable:
+                with self.assertRaisesRegex(RuntimeError, "Attempting CUDA graph"):
+                    with torch.cuda.graph(g):
+                        opt.step()
+            else:
+                with torch.cuda.graph(g):
+                    opt.step()
+
+                for i in range(n_replay):
+                    param1.grad.copy_(grads1[n_warmup + i])
+                    param2.grad.copy_(grads2[n_warmup + i])
+                    g.replay()
+                self.assertEqual(ref_p1, param1)
+                self.assertEqual(ref_p2, param2)
+
+    @unittest.skipIf((not TEST_GRAPH), "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
+    def test_graph_scaling_fused_optimizers(self):
         cases = [
-            (torch.optim.Adam, {"lr": 0.1, "betas": (0.8, 0.7), "fused": True, "amsgrad": amsgrad})
-            for amsgrad in (False, True)
+            (optimizer_ctor, {"lr": 0.1, "betas": (0.8, 0.7), "fused": True, "amsgrad": amsgrad})
+            for optimizer_ctor, amsgrad in product((torch.optim.Adam, torch.optim.AdamW), (False, True))
         ]
 
         steps_warmup = 3
@@ -4459,16 +4560,16 @@ def test_broadcast_coalesced(self):
         numel = 5
         num_bytes = numel * 8
         tensors = [
-            make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 1, 2, 3),
+            self.genSparseTensor((2, 3), 2, 1, False, 'cuda', torch.float64)[0],
             torch.randn(numel).long().cuda(),
             torch.randn(numel).cuda(),
-            make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 10, 2, 3),
-            make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 5, 2, 3),
-            make_sparse_tensor(torch.cuda.sparse.LongTensor, 7, 3, 3),
-            make_sparse_tensor(torch.cuda.sparse.FloatTensor, 2, 2, 3),
+            self.genSparseTensor((2, 3), 2, 10, False, 'cuda', torch.float64)[0],
+            self.genSparseTensor((2, 3), 2, 5, False, 'cuda', torch.float64)[0],
+            self.genSparseTensor((3, 3), 2, 7, False, 'cuda', torch.int64)[0],
+            self.genSparseTensor((2, 3), 2, 2, False, 'cuda', torch.float32)[0],
             torch.randn(numel).long().cuda(),
             torch.randn(numel).long().cuda(),
-            make_sparse_tensor(torch.cuda.sparse.LongTensor, 3, 2, 7),
+            self.genSparseTensor((2, 7), 2, 3, False, 'cuda', torch.int64)[0],
             torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
             torch.randn(numel).cuda(),
         ]
@@ -4534,16 +4635,16 @@ def test_reduce_add_coalesced(self):
         numel = 5
         num_bytes = numel * 8
         tensors = [
-            make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 1, 2, 3),
+            self.genSparseTensor((2, 3), 2, 1, False, 'cuda', torch.float64)[0],
             torch.randn(numel).long().cuda(),
             torch.randn(numel).cuda(),
-            make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 10, 2, 3),
-            make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 5, 2, 3),
-            make_sparse_tensor(torch.cuda.sparse.LongTensor, 7, 3, 3),
-            make_sparse_tensor(torch.cuda.sparse.FloatTensor, 2, 2, 3),
+            self.genSparseTensor((2, 3), 2, 10, False, 'cuda', torch.float64)[0],
+            self.genSparseTensor((2, 3), 2, 5, False, 'cuda', torch.float64)[0],
+            self.genSparseTensor((3, 3), 2, 7, False, 'cuda', torch.int64)[0],
+            self.genSparseTensor((2, 3), 2, 2, False, 'cuda', torch.float32)[0],
             torch.randn(numel).long().cuda(),
             torch.randn(numel).long().cuda(),
-            make_sparse_tensor(torch.cuda.sparse.LongTensor, 3, 2, 7),
+            self.genSparseTensor((2, 7), 2, 3, False, 'cuda', torch.int64)[0],
             torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
             torch.randn(numel).cuda(),
         ]
@@ -4906,7 +5007,8 @@ def power2_div(size, div_factor):
             return ret
 
         torch.cuda.memory.empty_cache()
-        key = 'active_bytes.all.allocated' if not TEST_CUDAMALLOCASYNC else 'allocated_bytes.all.current'
+        key_allocated = 'active_bytes.all.allocated' if not TEST_CUDAMALLOCASYNC else 'allocated_bytes.all.current'
+        key_requested = 'requested_bytes.all.allocated'
 
         nelems = 21 * 1024 * 1024
         nbytes = 4 * nelems  # floats are 4 bytes
@@ -4914,49 +5016,52 @@ def power2_div(size, div_factor):
         nelems_big = 100 * 1024 * 1024
         nbytes_big = 4 * nelems_big  # floats are 4 bytes
 
-        start_mem = torch.cuda.memory_stats()[key]
+        start_mem = torch.cuda.memory_stats()[key_allocated]
         torch.cuda.memory._set_allocator_settings("")
         x = torch.rand(nelems, device='cuda')
 
         # test roundup_power2_divisions single value syntax
-        reg_mem = torch.cuda.memory_stats()[key]
+        reg_mem = torch.cuda.memory_stats()[key_allocated]
+        start_requested = torch.cuda.memory_stats()[key_requested]
         torch.cuda.memory._set_allocator_settings("roundup_power2_divisions:4")
         y = torch.rand(nelems, device='cuda')
 
-        pow2_div4_mem = torch.cuda.memory_stats()[key]
+        pow2_div4_mem = torch.cuda.memory_stats()[key_allocated]
+        current_requested = torch.cuda.memory_stats()[key_requested]
 
         self.assertTrue(reg_mem - start_mem == nbytes)
         if not TEST_CUDAMALLOCASYNC:
             # not supported with the cudaMallocAsync backend
             self.assertTrue(pow2_div4_mem - reg_mem == power2_div(nbytes, 4))
+            self.assertTrue(current_requested - start_requested == nbytes)
 
         torch.cuda.memory._set_allocator_settings("garbage_collection_threshold:0.5")
         torch.cuda.memory._set_allocator_settings("garbage_collection_threshold:0.5,max_split_size_mb:40")
 
         # should have reset the power2 divisions now
         torch.cuda.memory.empty_cache()
-        start_mem = torch.cuda.memory_stats()[key]
+        start_mem = torch.cuda.memory_stats()[key_allocated]
         z = torch.rand(nelems, device='cuda')
-        reg_mem = torch.cuda.memory_stats()[key]
+        reg_mem = torch.cuda.memory_stats()[key_allocated]
         self.assertTrue(reg_mem - start_mem == nbytes)
 
         # roundup_power2_divisions knob array syntax
         torch.cuda.memory.empty_cache()
         torch.cuda.memory._set_allocator_settings(
             "garbage_collection_threshold:0.5,roundup_power2_divisions:[64:8,128:2,256:2,512:2,1024:1,>:1]")
-        start_mem = torch.cuda.memory_stats()[key]
+        start_mem = torch.cuda.memory_stats()[key_allocated]
         w = torch.rand(nelems, device='cuda')
 
-        pow2_div8_mem = torch.cuda.memory_stats()[key]
+        pow2_div8_mem = torch.cuda.memory_stats()[key_allocated]
         if not TEST_CUDAMALLOCASYNC:
             # not supported with the cudaMallocAsync backend
             self.assertTrue(pow2_div8_mem - start_mem == power2_div(nbytes, 8))
 
         torch.cuda.memory.empty_cache()
-        start_mem = torch.cuda.memory_stats()[key]
+        start_mem = torch.cuda.memory_stats()[key_allocated]
         v = torch.rand(nelems_big, device='cuda')
 
-        pow2_div2_mem = torch.cuda.memory_stats()[key]
+        pow2_div2_mem = torch.cuda.memory_stats()[key_allocated]
         if not TEST_CUDAMALLOCASYNC:
             # not supported with the cudaMallocAsync backend
             self.assertTrue(pow2_div2_mem - start_mem == power2_div(nbytes_big, 2))
@@ -5020,6 +5125,22 @@ def cb(device, alloc, device_alloc, device_free):
             torch.empty(1024 * 1024 * 1024 * 1024, device='cuda')
         self.assertTrue(x)
 
+    @unittest.skipIf(TEST_PYNVML, "pynvml is not available")
+    def test_nvml_get_handler(self):
+        self.assertTrue(torch.cuda._get_pynvml_handler() is not None)
+
+    @unittest.skipIf(TEST_PYNVML, "pynvml is not available")
+    def test_temperature(self):
+        self.assertTrue(0 <= torch.cuda.temperature() <= 150)
+
+    @unittest.skipIf(TEST_PYNVML, "pynvml is not available")
+    def test_power_draw(self):
+        self.assertTrue(torch.cuda.power_draw() >= 0)
+
+    @unittest.skipIf(TEST_PYNVML, "pynvml is not available")
+    def test_clock_speed(self):
+        self.assertTrue(torch.cuda.clock_rate() >= 0)
+
 
 instantiate_parametrized_tests(TestCuda)
 
diff --git a/test/test_cuda_nvml_based_avail.py b/test/test_cuda_nvml_based_avail.py
index 26a72c361dfd..04bad0ff86af 100644
--- a/test/test_cuda_nvml_based_avail.py
+++ b/test/test_cuda_nvml_based_avail.py
@@ -13,7 +13,7 @@
     # Before executing the desired tests, we need to disable CUDA initialization and fork_handler additions that would
     # otherwise be triggered by the `torch.testing._internal.common_utils` module import
     from torch.testing._internal.common_utils import (parametrize, instantiate_parametrized_tests, run_tests, TestCase,
-                                                      IS_WINDOWS)
+                                                      IS_WINDOWS, IS_JETSON)
     # NOTE: Because `remove_device_and_dtype_suffixes` initializes CUDA context (triggered via the import of
     # `torch.testing._internal.common_device_type` which imports `torch.testing._internal.common_cuda`) we need
     # to bypass that method here which should be irrelevant to the parameterized tests in this module.
@@ -48,6 +48,8 @@ def in_bad_fork_test() -> bool:
     @parametrize("nvml_avail", [True, False])
     @parametrize("avoid_init", ['1', '0', None])
     def test_cuda_is_available(self, avoid_init, nvml_avail):
+        if IS_JETSON and nvml_avail and avoid_init == '1':
+            self.skipTest('Not working for Jetson')
         patch_env = {"PYTORCH_NVML_BASED_CUDA_CHECK": avoid_init} if avoid_init else {}
         with patch.dict(os.environ, **patch_env):
             if nvml_avail:
@@ -63,6 +65,67 @@ def test_cuda_is_available(self, avoid_init, nvml_avail):
                 assert in_bad_fork
 
 
+class TestVisibleDeviceParses(TestCase):
+
+    def test_env_var_parsing(self):
+        def _parse_visible_devices(val):
+            from torch.cuda import _parse_visible_devices as _pvd
+            with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
+                return _pvd()
+
+        # rest of the string is ignored
+        self.assertEqual(_parse_visible_devices("1gpu2,2ampere"), [1, 2])
+        # Negatives abort parsing
+        self.assertEqual(_parse_visible_devices("0, 1, 2, -1, 3"), [0, 1, 2])
+        # Double mention of ordinal returns empty set
+        self.assertEqual(_parse_visible_devices("0, 1, 2, 1"), [])
+        # Unary pluses and minuses
+        self.assertEqual(_parse_visible_devices("2, +3, -0, 5"), [2, 3, 0, 5])
+        # Random string is used as empty set
+        self.assertEqual(_parse_visible_devices("one,two,3,4"), [])
+        # Random string is used as separator
+        self.assertEqual(_parse_visible_devices("4,3,two,one"), [4, 3])
+        # GPU ids are parsed
+        self.assertEqual(_parse_visible_devices("GPU-9e8d35e3"), ["GPU-9e8d35e3"])
+        # Ordinals are not included in GPUid set
+        self.assertEqual(_parse_visible_devices("GPU-123, 2"), ["GPU-123"])
+        # MIG ids are parsed
+        self.assertEqual(_parse_visible_devices("MIG-89c850dc"), ["MIG-89c850dc"])
+
+    def test_partial_uuid_resolver(self):
+        from torch.cuda import _transform_uuid_to_ordinals
+        uuids = ['GPU-9942190a-aa31-4ff1-4aa9-c388d80f85f1',
+                 'GPU-9e8d35e3-a134-0fdd-0e01-23811fdbd293',
+                 'GPU-e429a63e-c61c-4795-b757-5132caeb8e70',
+                 'GPU-eee1dfbc-0a0f-6ad8-5ff6-dc942a8b9d98',
+                 'GPU-bbcd6503-5150-4e92-c266-97cc4390d04e',
+                 'GPU-472ea263-58d7-410d-cc82-f7fdece5bd28',
+                 'GPU-e56257c4-947f-6a5b-7ec9-0f45567ccf4e',
+                 'GPU-1c20e77d-1c1a-d9ed-fe37-18b8466a78ad']
+        self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3"], uuids), [1])
+        self.assertEqual(_transform_uuid_to_ordinals(["GPU-e4", "GPU-9e8d35e3"], uuids), [2, 1])
+        self.assertEqual(_transform_uuid_to_ordinals("GPU-9e8d35e3,GPU-1,GPU-47".split(","), uuids), [1, 7, 5])
+        # First invalid UUID aborts parsing
+        self.assertEqual(_transform_uuid_to_ordinals(["GPU-123", "GPU-9e8d35e3"], uuids), [])
+        self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-123", "GPU-47"], uuids), [1])
+        # First ambigous UUID aborts parsing
+        self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-e", "GPU-47"], uuids), [1])
+        # Duplicate UUIDs result in empty set
+        self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-47", "GPU-9e8"], uuids), [])
+
+    def test_ordinal_parse_visible_devices(self):
+        def _device_count_nvml(val):
+            from torch.cuda import _device_count_nvml as _dc
+            with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
+                return _dc()
+
+        with patch.object(torch.cuda, '_raw_device_count_nvml', return_value=2):
+            self.assertEqual(_device_count_nvml("1, 0"), 2)
+            # Ordinal out of bounds aborts parsing
+            self.assertEqual(_device_count_nvml("1, 5, 0"), 1)
+
+
+
 instantiate_parametrized_tests(TestExtendedCUDAIsAvail)
 
 if __name__ == '__main__':
diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index 0e3433bcb2e7..55d3ba666257 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -31,7 +31,7 @@
 from torch.utils.data.dataset import random_split
 from torch.utils.data.datapipes.iter import IterableWrapper
 from torch._utils import ExceptionWrapper
-from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS,
+from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS, IS_JETSON,
                                                   IS_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest,
                                                   load_tests, TEST_WITH_ASAN, TEST_WITH_TSAN, IS_SANDCASTLE,
                                                   IS_MACOS)
@@ -78,11 +78,6 @@
 # as well during the execution of this test suite, and it will cause
 # CUDA OOM error on Windows.
 TEST_CUDA = torch.cuda.is_available()
-if TEST_CUDA:
-    dev_name = torch.cuda.get_device_name(torch.cuda.current_device()).lower()
-    IS_JETSON = 'xavier' in dev_name or 'nano' in dev_name or 'jetson' in dev_name or 'tegra' in dev_name
-else:
-    IS_JETSON = False
 
 if not NO_MULTIPROCESSING_SPAWN:
     # We want to use `spawn` if able because some of our tests check that the
@@ -187,7 +182,7 @@ def __init__(self, test_object, custom_list):
                 self.test_object = test_object
 
             def __getitem__(self, key):
-                self.test_object.assertEqual(type(key), type(0))
+                self.test_object.assertEqual(type(key), int)
                 return self.data[key]
 
             def __len__(self):
@@ -283,7 +278,7 @@ def test_slicing_of_subset_of_subset(self):
 
 class CUDACountingDataset(Dataset):
     def __init__(self, n):
-        super(CUDACountingDataset, self).__init__()
+        super().__init__()
         self.n = n
 
     def __getitem__(self, i):
@@ -295,7 +290,7 @@ def __len__(self):
 
 class CountingDataset(Dataset):
     def __init__(self, n):
-        super(CountingDataset, self).__init__()
+        super().__init__()
         self.n = n
 
     def __getitem__(self, i):
@@ -307,7 +302,7 @@ def __len__(self):
 
 class CountingIterableDataset(IterableDataset):
     def __init__(self, n):
-        super(CountingIterableDataset, self).__init__()
+        super().__init__()
         self.n = n
 
     def __iter__(self):
@@ -459,7 +454,7 @@ class ErrorTrackingProcess(mp.Process):
     # Setting disable_stderr=True may generate a lot of unrelated error outputs
     # but could be helpful for debugging.
     def __init__(self, disable_stderr=True, **kwargs):
-        super(ErrorTrackingProcess, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self._pconn, self._cconn = mp.Pipe()
         self._exception = None
         self.disable_stderr = disable_stderr
@@ -471,7 +466,7 @@ def run(self):
             with open(os.devnull, 'w') as devnull:
                 os.dup2(devnull.fileno(), sys.stderr.fileno())
         try:
-            super(ErrorTrackingProcess, self).run()
+            super().run()
             self._cconn.send(None)
         except Exception:
             self._cconn.send(ExceptionWrapper(sys.exc_info()))
@@ -940,7 +935,7 @@ def filter_len(row):
 class TestDataLoader(TestCase):
 
     def setUp(self):
-        super(TestDataLoader, self).setUp()
+        super().setUp()
         self.data = torch.randn(100, 2, 3, 5)
         self.labels = torch.randperm(50).repeat(2)
         self.dataset = TensorDataset(self.data, self.labels)
@@ -1111,6 +1106,7 @@ def test_sequential_pin_memory(self):
             self.assertTrue(input.is_pinned())
             self.assertTrue(target.is_pinned())
 
+    @unittest.skipIf(IS_JETSON, "Not working on Jetson")
     def test_multiple_dataloaders(self):
         for multiprocessing_context in supported_multiprocessing_contexts:
             loader1_it = iter(self._get_data_loader(self.dataset, num_workers=1))
@@ -1361,7 +1357,7 @@ def test_iterable_style_dataset(self):
             dataloader_iter = iter(dataloader)
             fetched = list(dataloader_iter)
             self.assertEqual(len(fetched), 4)
-            fetched = set(tuple(t.tolist()) for t in fetched)
+            fetched = {tuple(t.tolist()) for t in fetched}
             self.assertEqual(fetched, {tuple(range(4)), tuple(range(7)), tuple(range(7, 14)), tuple(range(14, 20))})
 
             # [auto-batching] test that workers exit gracefully
@@ -1399,7 +1395,7 @@ def test_iterable_style_dataset(self):
             dataloader_iter = iter(dataloader)
             fetched = list(dataloader_iter)
             self.assertEqual(len(fetched), 2)
-            fetched = set(tuple(t.tolist()) for t in fetched)
+            fetched = {tuple(t.tolist()) for t in fetched}
             self.assertEqual(fetched, {tuple(range(7)), tuple(range(7, 14))})
 
             # [auto-batching & drop_last] test that workers exit gracefully
@@ -1435,6 +1431,7 @@ def test_chain_iterable_style_dataset(self):
             list(iter(ChainDataset([dataset1, self.dataset])))
 
     @unittest.skipIf(IS_MACOS, "Not working on macos")
+    @unittest.skipIf(IS_MACOS or IS_JETSON, "Not working on macos or Jetson")
     @skipIfRocm  # https://github.com/pytorch/pytorch/issues/90940
     def test_multiprocessing_contexts(self):
         reference = [
@@ -1460,6 +1457,7 @@ def test_multiprocessing_contexts(self):
                     reference, list(self._get_data_loader(ds_cls(counting_ds_n), multiprocessing_context=ctx, **dl_common_args)))
 
     @skipIfNoNumpy
+    @unittest.skipIf(IS_JETSON, "Not working on Jetson")
     def test_multiprocessing_iterdatapipe(self):
         # Testing to make sure that function from global scope (e.g. imported from library) can be serialized
         # and used with multiprocess DataLoader
@@ -1500,7 +1498,7 @@ def get_dataloader():
         num_workers = 6
         batch_size = 1
         dataset = SynchronizedSeedDataset(num_workers, batch_size, num_workers)
-        self.assertEqual(set(int(batch) for batch in get_dataloader()), set(int(batch) for batch in get_dataloader()))
+        self.assertEqual({int(batch) for batch in get_dataloader()}, {int(batch) for batch in get_dataloader()})
 
     def test_multi_epochs_reproducibility(self):
         num_workers = 2
@@ -2295,7 +2293,7 @@ def __getitem__(self, ndx):
     "fork is not supported. Dying (set die_after_fork=0 to override)")
 class TestStringDataLoader(TestCase):
     def setUp(self):
-        super(TestStringDataLoader, self).setUp()
+        super().setUp()
         self.dataset = StringDataset()
 
     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
@@ -2325,7 +2323,7 @@ def __getitem__(self, ndx):
     "fork is not supported. Dying (set die_after_fork=0 to override)")
 class TestDictDataLoader(TestCase):
     def setUp(self):
-        super(TestDictDataLoader, self).setUp()
+        super().setUp()
         self.dataset = DictDataset()
 
     def test_sequential_batch(self):
@@ -2400,7 +2398,7 @@ def __getitem__(self, idx):
 class TestDataLoaderPersistentWorkers(TestDataLoader):
 
     def setUp(self):
-        super(TestDataLoaderPersistentWorkers, self).setUp()
+        super().setUp()
         self.persistent_workers = True
 
     @unittest.skipIf(IS_SANDCASTLE, "subprocess doesn't work in FB internal CI")
@@ -2513,7 +2511,7 @@ def __getitem__(self, ndx):
     "fork is not supported. Dying (set die_after_fork=0 to override)")
 class TestNamedTupleDataLoader(TestCase):
     def setUp(self):
-        super(TestNamedTupleDataLoader, self).setUp()
+        super().setUp()
         self.dataset = NamedTupleDataset()
 
     def test_dataloader_with_namedtuple(self):
@@ -2533,7 +2531,7 @@ def test_dataloader_with_namedtuple(self):
             self.assertIsInstance(batch.data, NamedTupleDataset.Data)
             self.assertNotIsInstance(batch.data.positive, torch.Tensor)
 
-class SimpleCustomBatch(object):
+class SimpleCustomBatch:
     def __init__(self, data):
         transposed_data = list(zip(*data))
         self.inp = torch.stack(transposed_data[0], 0)
@@ -2576,7 +2574,7 @@ def collate_into_packed_sequence_batch_first(batch):
     "fork is not supported. Dying (set die_after_fork=0 to override)")
 class TestCustomPinFn(TestCase):
     def setUp(self):
-        super(TestCustomPinFn, self).setUp()
+        super().setUp()
         inps = torch.arange(10 * 5, dtype=torch.float32).view(10, 5)
         tgts = torch.arange(10 * 5, dtype=torch.float32).view(10, 5)
         self.dataset = TensorDataset(inps, tgts)
@@ -2634,7 +2632,7 @@ def __len__(self):
     "Flaky with ASAN, see https://github.com/pytorch/pytorch/issues/65727")
 class TestIndividualWorkerQueue(TestCase):
     def setUp(self):
-        super(TestIndividualWorkerQueue, self).setUp()
+        super().setUp()
         self.dataset = TestWorkerQueueDataset(list(range(128)))
 
     def _run_ind_worker_queue_test(self, batch_size, num_workers):
diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index fcbc151b4565..59abbc28260e 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -54,7 +54,7 @@
 )
 from torch.utils.data.datapipes.dataframe import CaptureDataFrame
 from torch.utils.data.datapipes.dataframe import dataframe_wrapper as df_wrapper
-from torch.utils.data.datapipes.iter.grouping import SHARDING_PRIORITIES
+from torch.utils.data.datapipes.iter.sharding import SHARDING_PRIORITIES
 
 try:
     import dill
@@ -436,6 +436,31 @@ def order_fn(data):
                 rec[i][1].close()
         self.assertEqual(count, 8)
 
+        # testing the keep_key option
+        datapipe4 = dp.iter.Grouper(datapipe1, group_key_fn=group_fn, keep_key=True, group_size=2)
+
+        def order_fn(data):
+            data[1].sort(key=lambda f: f[0], reverse=True)
+            return data
+
+        datapipe5 = dp.iter.Mapper(datapipe4, fn=order_fn)  # type: ignore[var-annotated]
+
+        expected_result = [
+            ("a", ("a.png", "a.json")), ("c", ("c.png", "c.json")), ("b", ("b.png", "b.json")),
+            ("d", ("d.png", "d.json")), ("f", ("f.png", "f.json")), ("g", ("g.png", "g.json")),
+            ("e", ("e.png", "e.json")), ("h", ("h.txt", "h.json"))]
+
+        count = 0
+        for rec, expected in zip(datapipe5, expected_result):
+            count = count + 1
+            self.assertEqual(rec[0], expected[0])
+            self.assertEqual(rec[1][0][0], expected[1][0])
+            self.assertEqual(rec[1][1][0], expected[1][1])
+            for i in [0, 1]:
+                self.assertEqual(rec[1][i][1].read(), b'12345abcde')
+                rec[1][i][1].close()
+        self.assertEqual(count, 8)
+
     def test_demux_mux_datapipe(self):
         numbers = NumbersDataset(10)
         n1, n2 = numbers.demux(2, lambda x: x % 2)
@@ -603,8 +628,7 @@ def __init__(self, input_dp):
     # Prevent in-place modification
     def __iter__(self):
         input_dp = self.input_dp if isinstance(self.input_dp, IterDataPipe) else copy.deepcopy(self.input_dp)
-        for i in input_dp:
-            yield i
+        yield from input_dp
 
 
 def _fake_fn(data):
@@ -1731,7 +1755,7 @@ def test_zip_iterdatapipe(self):
             len(zipped_dp)
 
         # Functional Test: zips the results properly
-        exp = list((i, i) for i in range(5))
+        exp = [(i, i) for i in range(5)]
         self.assertEqual(list(zipped_dp), exp)
 
         # Functional Test: zips the inputs properly even when lengths are different (zips to the shortest)
@@ -2177,8 +2201,7 @@ def __iter__(self) -> Iterator[tuple]:  # type: ignore[override]
 
         class DP2(IterDataPipe[T_co]):
             def __iter__(self) -> Iterator[T_co]:
-                for d in range(10):
-                    yield d  # type: ignore[misc]
+                yield from range(10)  # type: ignore[misc]
 
         self.assertTrue(issubclass(DP2, IterDataPipe))
         dp2 = DP2()  # type: ignore[var-annotated]
@@ -2282,8 +2305,7 @@ def __init__(self, datasource):
 
             @runtime_validation
             def __iter__(self) -> Iterator[Tuple[int, T_co]]:
-                for d in self.ds:
-                    yield d
+                yield from self.ds
 
         dss = ([(1, '1'), (2, '2')],
                [(1, 1), (2, '2')])
@@ -2319,8 +2341,7 @@ def __init__(self, ds):
 
             @runtime_validation
             def __iter__(self) -> Iterator[T]:
-                for d in self.ds:
-                    yield d
+                yield from self.ds
 
         ds = list(range(10))
         # Valid type reinforcement
@@ -2343,7 +2364,7 @@ def __iter__(self) -> Iterator[T]:
 
         # Context Manager to disable the runtime validation
         with runtime_validation_disabled():
-            self.assertEqual(list(d for d in dp3), ds)
+            self.assertEqual(list(dp3), ds)
 
 
 class NumbersDataset(IterDataPipe):
@@ -2351,8 +2372,7 @@ def __init__(self, size=10):
         self.size = size
 
     def __iter__(self):
-        for i in range(self.size):
-            yield i
+        yield from range(self.size)
 
     def __len__(self):
         return self.size
@@ -2702,6 +2722,47 @@ def construct_sharded_pipe():
         with self.assertRaises(Exception):
             dp.apply_sharding(2, 1, sharding_group=SHARDING_PRIORITIES.DEFAULT)
 
+    # Test tud.datapipes.iter.grouping.SHARDING_PRIORITIES for backward compatbility
+    # TODO: Remove this test once tud.datapipes.iter.grouping.SHARDING_PRIORITIES is deprecated
+    def test_sharding_groups_in_legacy_grouping_package(self):
+        with self.assertWarnsRegex(FutureWarning, r'Please use `SHARDING_PRIORITIES` '
+                                                  'from the `torch.utils.data.datapipes.iter.sharding`'):
+            from torch.utils.data.datapipes.iter.grouping import SHARDING_PRIORITIES as LEGACY_SHARDING_PRIORITIES
+
+        def construct_sharded_pipe():
+            sharding_pipes = []
+            dp = NumbersDataset(size=90)
+            dp = dp.sharding_filter(sharding_group_filter=LEGACY_SHARDING_PRIORITIES.DISTRIBUTED)
+            sharding_pipes.append(dp)
+            dp = dp.sharding_filter(sharding_group_filter=LEGACY_SHARDING_PRIORITIES.MULTIPROCESSING)
+            sharding_pipes.append(dp)
+            dp = dp.sharding_filter(sharding_group_filter=300)
+            sharding_pipes.append(dp)
+            return dp, sharding_pipes
+
+        dp, sharding_pipes = construct_sharded_pipe()
+
+        for pipe in sharding_pipes:
+            pipe.apply_sharding(2, 1, sharding_group=LEGACY_SHARDING_PRIORITIES.DISTRIBUTED)
+            pipe.apply_sharding(5, 3, sharding_group=LEGACY_SHARDING_PRIORITIES.MULTIPROCESSING)
+            pipe.apply_sharding(3, 1, sharding_group=300)
+
+        actual = list(dp)
+        expected = [17, 47, 77]
+        self.assertEqual(expected, actual)
+        self.assertEqual(3, len(dp))
+
+        dp, _ = construct_sharded_pipe()
+        dp.apply_sharding(2, 1, sharding_group=LEGACY_SHARDING_PRIORITIES.DEFAULT)
+        with self.assertRaises(Exception):
+            dp.apply_sharding(5, 3, sharding_group=LEGACY_SHARDING_PRIORITIES.MULTIPROCESSING)
+
+        dp, _ = construct_sharded_pipe()
+        dp.apply_sharding(5, 3, sharding_group=LEGACY_SHARDING_PRIORITIES.MULTIPROCESSING)
+        with self.assertRaises(Exception):
+            dp.apply_sharding(2, 1, sharding_group=LEGACY_SHARDING_PRIORITIES.DEFAULT)
+
+
     def test_sharding_length(self):
         numbers_dp = dp.iter.IterableWrapper(range(13))
         sharded_dp0 = numbers_dp.sharding_filter()
diff --git a/test/test_decomp.py b/test/test_decomp.py
index ddb4cedd7e5b..2ba1b5b615b6 100644
--- a/test/test_decomp.py
+++ b/test/test_decomp.py
@@ -17,6 +17,7 @@
     run_tests,
     skipIfTorchDynamo,
 )
+from torch.testing._internal.common_modules import module_db, modules
 from torch.testing._internal.common_device_type import (
     onlyNativeDeviceTypes,
     ops,
@@ -25,6 +26,7 @@
 )
 from torch.testing._internal.common_methods_invocations import op_db
 from torch._dispatch.python import enable_python_dispatcher
+from torch._ops import has_key, DispatchKey
 
 import itertools
 import functools
@@ -166,7 +168,10 @@ def op_assert_ref(test_case, op, test_dtype, i, orig, decomp, ref, args, kwargs)
         (torch.float16, torch.ops.aten._native_batch_norm_legit.no_stats): 1e-5,
         (torch.bfloat16, torch.ops.aten.linalg_vector_norm.default): 1e-4,
         (torch.float16, torch.ops.aten.linalg_vector_norm.default): 1e-4,
+        (torch.bfloat16, torch.ops.aten.var_mean.correction): 5e-7,
+        (torch.float16, torch.ops.aten.var_mean.correction): 5e-7,
         (torch.bfloat16, torch.ops.aten.var_mean.dim): 5e-7,
+        (torch.float16, torch.ops.aten.var_mean.dim): 5e-7,
         (torch.float16, torch.ops.aten.nll_loss_forward.default): 1e-2,
         (torch.bfloat16, torch.ops.aten.nll_loss_forward.default): 1e-1,
     }
@@ -324,6 +329,8 @@ def normalize_op_input_output(f, sample, requires_grad=True):
     (None, None, "norm"),
     # native_batch_norm is only implicit when python dispatcher is on (and noncomposite otherwise)
     (None, None, "native_batch_norm"),
+
+    (None, None, "_upsample_bilinear2d_aa"),
 }
 
 CROSS_REF_BACKWARD_EXCLUDE_SET = {
@@ -416,6 +423,130 @@ def test_uniform(self, device):
         res = torch._decomp.decompositions.uniform(x, low=low, high=high)
         self.assertEqual(ref, res)
 
+
+    @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN")
+    @suppress_warnings
+    # only tests RNNs since we have py dispsatcher decomps for them
+    @modules(filter(lambda m: m.module_cls in (torch.nn.RNN, torch.nn.LSTM, torch.nn.GRU), module_db))
+    def test_rnn_decomp_module(self, device, dtype, module_info, training):
+        module_cls = module_info.module_cls
+        module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
+                                                       requires_grad=True, training=training)
+        for module_input in module_inputs:
+            if module_input.forward_input is None:
+                continue
+            args, kwargs = module_input.constructor_input.args, module_input.constructor_input.kwargs
+            m = module_cls(*args, **kwargs)
+            m.to(device).to(dtype)
+
+            args, kwargs = module_input.forward_input.args, module_input.forward_input.kwargs
+            with self.DecompCrossRefMode(self, self.precision, self.rel_tol, dtype, run_all=True), enable_python_dispatcher():
+                decomp_out = m(*args, **kwargs)
+
+            non_decomp_out = m(*args, **kwargs)
+            # without this check, incorrect decomps at the python dispatcher level can still pass because
+            # they're checking aten decomps at the torch_dispatch level
+            self.assertEqual(decomp_out, non_decomp_out)
+
+    class DecompCrossRefMode(TorchDispatchMode):
+        def __init__(self, test_case, saved_precision, saved_rel_tol, dtype, run_all):
+            self.test_case = test_case
+            self.saved_precision = saved_precision
+            self.saved_rel_tol = saved_rel_tol
+            self.test_dtype = dtype
+            self.run_all = run_all
+
+            # We check the correctness of each decomposition right after running it.
+            # So, when we encounter a decomposition, we run the function normally, and
+            # then run the decomposition, and ensure they're identical.
+            self.called = set()
+            self.decomposed = set()
+
+        def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+            self.test_case.precision = self.saved_precision
+            self.test_case.rel_tol = self.saved_rel_tol
+
+            self.called.add(func)
+            all_called[func] += 1
+
+            # Stuff we shouldn't bother testing
+            # (TODO: remove detach from the decomp table?)
+            # N.b. Testing in-place ops would need dedicated logic
+            in_place = func.name()[-1] == '_'
+            if func not in decomposition_table or func in [
+                torch.ops.aten.detach.default,
+                # non-deterministic ops
+                torch.ops.aten.empty.memory_format,
+                torch.ops.aten.empty_like.default,
+                torch.ops.aten.new_empty.default,
+                torch.ops.aten.empty_strided.default,
+                torch.ops.aten.new_empty_strided.default,
+                torch.ops.aten.randn.default,
+                torch.ops.aten.native_dropout.default,
+            ] or any_unsupported(args, kwargs) or in_place:
+                return func(*args, **kwargs)
+
+            self.decomposed.add(func)
+            all_decomposed.add(func)
+
+            # We take 2 main strategies for verifying correctness/numerical stability of decompositions
+            # The first one is simply tolerance checking between decomp_out and pytorch_out
+            # However, for fp16/bf16 and reductions, this becomes very
+            # finicky, as there are not many guarantees we can make.
+            # So, for fp16/bf16, we instead compare the difference of
+            # {decomp_out, pytorch_out_64} and {pytorch_out,
+            # pytorch_out_64}. In other words, we compare how far the
+            # decomposition and pytorch are from the "ground truth" (i.e.
+            # fp64). If the decomposition results in more error, we error
+
+            # We also decompose the decomposition recursively for
+            # further coverage, as some paths not be exercised directly by
+            # OpInfos (sadly) but just by other ops
+
+            decomposition = decomposition_table[func]
+
+            do_relative_check = self.test_dtype in [torch.float16, torch.bfloat16]
+            if self.run_all:
+                # Execute recursively via DFS, to find the root of a possible error first
+                with self:
+                    decomp_out, _ = tree_flatten(decomposition(*args, **kwargs))
+            else:
+                decomp_out, _ = tree_flatten(decomposition(*args, **kwargs))
+
+            # At this stage we should not be decomposing an in-place op
+            # We'd like to have decompositions that decompose out-of-place ops into out-of-place ops
+            #  because decompositions are run after functionalisation and we would not like them to
+            #  de-functionalise the graph, as that would break AoTAutograd
+            # We run the real function *after* the decomposition to make sure that the
+            # decomposition does not modify any of the inputs in-place. If it does
+            # real_out should be differen than decom_out so we should catch this
+            real_out_unflat = func(*args, **kwargs)
+            real_out, _ = tree_flatten(real_out_unflat)
+
+            assert len(real_out) == len(decomp_out)
+
+            if do_relative_check:
+                upcast = partial(upcast_tensor, dtype=torch.float64)
+                real_out_double, _ = tree_flatten(
+                    func(*tree_map(upcast, args), **tree_map(upcast, kwargs))
+                )
+                for i, (orig, decomp, ref) in enumerate(zip(real_out, decomp_out, real_out_double)):
+                    if not isinstance(orig, torch.Tensor):
+                        assert type(orig) == type(decomp)
+                        assert orig == decomp
+                        continue
+                    op_assert_ref(self.test_case, func, self.test_dtype, i, orig, decomp, ref, args, kwargs)
+            else:
+                for orig, decomp in zip(real_out, decomp_out):
+                    if not isinstance(orig, torch.Tensor):
+                        assert type(orig) == type(decomp)
+                        assert orig == decomp
+                        continue
+                    op_assert_equal(self.test_case, func, self.test_dtype, orig, decomp, args, kwargs)
+
+            return real_out_unflat
+
+
     @skipIfTorchDynamo("Test does not work with TorchDynamo")
     def do_cross_ref(self, device, dtype, op, *, run_all):
         test_keys = [
@@ -427,102 +558,6 @@ def do_cross_ref(self, device, dtype, op, *, run_all):
             self.skipTest(f"{op.name} in {dtype} not supported")
 
         skip_decomp_vjp = any(key in CROSS_REF_BACKWARD_EXCLUDE_SET for key in test_keys)
-        test_dtype = dtype
-
-        # We check the correctness of each decomposition right after running it.
-        # So, when we encounter a decomposition, we run the function normally, and
-        # then run the decomposition, and ensure they're identical.
-        called = set()
-        decomposed = set()
-
-        saved_precision = self.precision
-        saved_rel_tol = self.rel_tol
-        test_case = self
-
-        class DecompCrossRefMode(TorchDispatchMode):
-            def __torch_dispatch__(self, func, types, args=(), kwargs=None):
-                test_case.precision = saved_precision
-                test_case.rel_tol = saved_rel_tol
-
-                called.add(func)
-                all_called[func] += 1
-
-                # Stuff we shouldn't bother testing
-                # (TODO: remove detach from the decomp table?)
-                # N.b. Testing in-place ops would need dedicated logic
-                in_place = func.name()[-1] == '_'
-                if func not in decomposition_table or func in [
-                    torch.ops.aten.detach.default,
-                    # non-deterministic ops
-                    torch.ops.aten.empty.memory_format,
-                    torch.ops.aten.empty_like.default,
-                    torch.ops.aten.new_empty.default,
-                    torch.ops.aten.empty_strided.default,
-                    torch.ops.aten.new_empty_strided.default,
-                    torch.ops.aten.randn.default,
-                    torch.ops.aten.native_dropout.default,
-                ] or any_unsupported(args, kwargs) or in_place:
-                    return func(*args, **kwargs)
-
-                decomposed.add(func)
-                all_decomposed.add(func)
-
-                # We take 2 main strategies for verifying correctness/numerical stability of decompositions
-                # The first one is simply tolerance checking between decomp_out and pytorch_out
-                # However, for fp16/bf16 and reductions, this becomes very
-                # finicky, as there are not many guarantees we can make.
-                # So, for fp16/bf16, we instead compare the difference of
-                # {decomp_out, pytorch_out_64} and {pytorch_out,
-                # pytorch_out_64}. In other words, we compare how far the
-                # decomposition and pytorch are from the "ground truth" (i.e.
-                # fp64). If the decomposition results in more error, we error
-
-                # We also decompose the decomposition recursively for
-                # further coverage, as some paths not be exercised directly by
-                # OpInfos (sadly) but just by other ops
-
-                decomposition = decomposition_table[func]
-
-                do_relative_check = test_dtype in [torch.float16, torch.bfloat16]
-                if run_all:
-                    # Execute recursively via DFS, to find the root of a possible error first
-                    with self:
-                        decomp_out, _ = tree_flatten(decomposition(*args, **kwargs))
-                else:
-                    decomp_out, _ = tree_flatten(decomposition(*args, **kwargs))
-
-                # At this stage we should not be decomposing an in-place op
-                # We'd like to have decompositions that decompose out-of-place ops into out-of-place ops
-                #  because decompositions are run after functionalisation and we would not like them to
-                #  de-functionalise the graph, as that would break AoTAutograd
-                # We run the real function *after* the decomposition to make sure that the
-                # decomposition does not modify any of the inputs in-place. If it does
-                # real_out should be differen than decom_out so we should catch this
-                real_out_unflat = func(*args, **kwargs)
-                real_out, _ = tree_flatten(real_out_unflat)
-
-                assert len(real_out) == len(decomp_out)
-
-                if do_relative_check:
-                    upcast = partial(upcast_tensor, dtype=torch.float64)
-                    real_out_double, _ = tree_flatten(
-                        func(*tree_map(upcast, args), **tree_map(upcast, kwargs))
-                    )
-                    for i, (orig, decomp, ref) in enumerate(zip(real_out, decomp_out, real_out_double)):
-                        if not isinstance(orig, torch.Tensor):
-                            assert type(orig) == type(decomp)
-                            assert orig == decomp
-                            continue
-                        op_assert_ref(test_case, func, test_dtype, i, orig, decomp, ref, args, kwargs)
-                else:
-                    for orig, decomp in zip(real_out, decomp_out):
-                        if not isinstance(orig, torch.Tensor):
-                            assert type(orig) == type(decomp)
-                            assert orig == decomp
-                            continue
-                        op_assert_equal(test_case, func, test_dtype, orig, decomp, args, kwargs)
-
-                return real_out_unflat
 
         requires_grad = (
             op.supports_autograd
@@ -533,13 +568,13 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             # but that when we do backwards we expect other ops like add to work
             and not dtype == torch.complex32
         )
-        samples = op.sample_inputs(device, test_dtype, requires_grad=requires_grad)
+        samples = op.sample_inputs(device, dtype, requires_grad=requires_grad)
 
-        def check_decomposed(aten_name):
+        def check_decomposed(aten_name, mode):
             self.assertTrue(
-                any(overload_to_aten_name(c) == aten_name for c in decomposed),
+                any(overload_to_aten_name(c) == aten_name for c in mode.decomposed),
                 msg=(f"aten.{aten_name} was not decomposed, saw calls for: "
-                     f"{', '.join(map(str, list(called)))}. If your op is  "
+                     f"{', '.join(map(str, list(mode.called)))}. If your op is  "
                      f"CompositeImplicitAutograd you should skip this test "
                      "by updating CROSS_REF_EXCLUDE_SET.")
             )
@@ -558,29 +593,29 @@ def check_decomposed(aten_name):
                 # store the called list on the mode object instance and no
                 # explicit clearing is necessary as I will create a fresh mode
                 # for each region
-                decomposed.clear()
-                with DecompCrossRefMode(), enable_python_dispatcher():
+                with self.DecompCrossRefMode(self, self.precision, self.rel_tol, dtype, run_all)\
+                     as mode, enable_python_dispatcher():
                     decomp_out, decomp_vjp_fn = ref_vjp_no_create(fn, *primals)
                 if aten_name in decomposition_names:
-                    check_decomposed(aten_name)
+                    check_decomposed(aten_name, mode)
 
                 if not skip_decomp_vjp and (op.aten_backward_name in decomposition_names or run_all):
                     cotangents = tree_map(lambda x: torch.randn_like(x), decomp_out)
 
-                    decomposed.clear()
-                    with DecompCrossRefMode(), enable_python_dispatcher():
+                    with self.DecompCrossRefMode(self, self.precision, self.rel_tol, dtype, run_all)\
+                         as mode, enable_python_dispatcher():
                         decomp_vjp_fn(cotangents)
                     if not run_all:
-                        check_decomposed(op.aten_backward_name)
+                        check_decomposed(op.aten_backward_name, mode)
 
             elif aten_name in decomposition_names or run_all:
                 args = [sample_input.input] + list(sample_input.args)
                 kwargs = sample_input.kwargs
-                decomposed.clear()
-                with DecompCrossRefMode(), enable_python_dispatcher():
+                with self.DecompCrossRefMode(self, self.precision, self.rel_tol, dtype, run_all)\
+                     as mode, enable_python_dispatcher():
                     func(*args, **kwargs)
                 if not run_all:
-                    check_decomposed(aten_name)
+                    check_decomposed(aten_name, mode)
             else:
                 assert op.supports_autograd
                 self.skipTest(
@@ -664,5 +699,54 @@ def test_amp_batch_norm_backward(self):
 
 instantiate_device_type_tests(DecompAmpTests, globals())
 
+class HasDecompTest(TestCase):
+    def setUp(self):
+        super().setUp()
+        self.maxDiff = None
+
+    def test_has_decomposition(self):
+
+        def can_appear_in_trace(op) -> bool:
+            has_tensor_arg = any(
+                "Tensor" in str(a.type)
+                for a in itertools.chain(op._schema.arguments, op._schema.returns))
+            if not has_tensor_arg:
+                return False
+
+            try:
+                # CompositeImplicitAutograd ops are transparent to the tracer, so don't need decompositions
+                return not has_key(op, DispatchKey.CompositeImplicitAutograd)
+            except RuntimeError as e:
+                # has_key fails for some jit-registered ops, which shouldn't be
+                # relevant here anyway
+                if 'does not exist' in str(e):
+                    return False
+                raise
+
+        def all_aten_overloads():
+            for name in torch._C._dispatch_get_all_op_names():
+                if not name.startswith("aten::"):
+                    continue
+
+                name = name[6:]
+                if "." in name:
+                    packet_name, overload_name = name.split(".")
+                else:
+                    packet_name, overload_name = name, "default"
+
+                packet = getattr(aten, packet_name)
+                assert isinstance(packet, torch._ops.OpOverloadPacket)
+                op = getattr(packet, overload_name)
+                yield op
+
+        # This is for operators that are only registered in some CI
+        # configurations, so would cause the test to fail
+        allow_list = {aten.get_gradients.default}
+
+        overloads_wanting_decomp = {op for op in all_aten_overloads() if can_appear_in_trace(op)}
+        ops_missing_decomp = overloads_wanting_decomp - decomposition_table.keys()
+        ops_missing_decomp -= allow_list
+        self.assertExpected("".join(sorted(op.name() + "\n" for op in ops_missing_decomp)))
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_determination.py b/test/test_determination.py
index 3de8f1cfc4e2..038339425b9b 100644
--- a/test/test_determination.py
+++ b/test/test_determination.py
@@ -6,7 +6,7 @@
 from torch.testing._internal.common_utils import TestCase, run_tests
 
 
-class DummyOptions(object):
+class DummyOptions:
     verbose = False
 
 
@@ -43,7 +43,7 @@ def test_target_det_list_is_sorted(self):
     def test_config_change_only(self):
         """CI configs trigger all tests"""
         self.assertEqual(
-            self.determined_tests([".jenkins/pytorch/test.sh"]), self.TESTS
+            self.determined_tests([".ci/pytorch/test.sh"]), self.TESTS
         )
 
     def test_run_test(self):
diff --git a/test/test_dlpack.py b/test/test_dlpack.py
index 8dbb1058abd3..3536b2edd344 100644
--- a/test/test_dlpack.py
+++ b/test/test_dlpack.py
@@ -3,7 +3,7 @@
 
 import torch
 from torch.testing import make_tensor
-from torch.testing._internal.common_utils import TestCase, run_tests
+from torch.testing._internal.common_utils import TestCase, run_tests, IS_JETSON
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCUDA, dtypes, skipMeta,
     onlyNativeDeviceTypes)
@@ -52,6 +52,10 @@ def test_dlpack_conversion_with_streams(self, device, dtype):
         # (hence data dependency) at the exchange boundary.
         # DLPack manages this synchronization for us, so we don't need to
         # explicitly wait until x is populated
+        if IS_JETSON:
+            # DLPack protocol that establishes correct stream order
+            # does not behave as expected on Jetson
+            stream.synchronize()
         stream = torch.cuda.Stream()
         with torch.cuda.stream(stream):
             z = from_dlpack(x)
diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py
index e1545708e10b..fce82fc3d9cc 100644
--- a/test/test_dynamic_shapes.py
+++ b/test/test_dynamic_shapes.py
@@ -5,34 +5,25 @@
 import torch.fx
 import torch.nn.functional as F
 from torch.testing._internal.common_utils import run_tests, TestCase, skipIfTorchDynamo, \
-    IS_WINDOWS, parametrize, instantiate_parametrized_tests
-import unittest
+    parametrize, instantiate_parametrized_tests
 import torch
 import operator
 import itertools
-import random
 import contextlib
 import math
-import atexit
-import os
+import copy
+import sympy
 from torch.utils._pytree import tree_map
 from torch.fx.experimental import symbolic_shapes
 from torch.fx.experimental.proxy_tensor import make_fx
-from torch.fx.experimental.symbolic_shapes import ShapeEnv, sym_float, guard_int, SymNode, sym_sqrt, sym_int, to_node
+from torch.fx.experimental.symbolic_shapes import SymNode, \
+    FloorDiv, ShapeEnv, sym_sqrt, sym_float, to_node, GuardOnDataDependentSymNode, \
+    guard_bool, guard_int, guard_float
 from torch.utils._python_dispatch import TorchDispatchMode
-from torch import SymInt
+from torch import SymBool, SymInt, SymFloat, sym_int
 
 aten = torch.ops.aten
 
-try:
-    import sympy
-    # TODO(jansel): these tests fail on windows
-    HAS_SYMPY = not IS_WINDOWS
-except ImportError:
-    HAS_SYMPY = False
-skipIfNoSympy = unittest.skipIf(not HAS_SYMPY, "no sympy")
-
-
 meta_funcs = {}
 
 
@@ -125,16 +116,16 @@ def create_symbolic_tensor(name, arg, shape_env):
         shape_env.create_symbolic_sizes_strides_storage_offset(arg, source=ConstantSource(name))
     return FakeSymbolicTensor(sym_shapes, sym_strides, arg.dtype, arg.layout, arg.requires_grad, arg.device, sym_storage_offset)
 
-def create_symint(shape_env, i):
+def create_symint(shape_env, i: int):
     from torch._dynamo.source import ConstantSource
     return shape_env.create_symintnode(
-        shape_env.create_symbol(i, source=ConstantSource(f"__testing_only{len(shape_env.var_to_val)}"))
+        shape_env.create_symbol(i, source=ConstantSource(f"__testing_only{len(shape_env.var_to_val)}")),
+        hint=i
     )
 
 @skipIfTorchDynamo("Creating ShapeEnv fails for confusing reasons (also we never expect dynamo to see code like this)")
 class TestPySymInt(TestCase):
 
-    @skipIfNoSympy
     def test_arith_ops(self):
         shape_env = ShapeEnv()
         symints = []
@@ -149,7 +140,6 @@ def test_arith_ops(self):
                     self.assertTrue(op(args[0][1], args[1][1]) == op(args[0][0], args[1][0]))
 
 
-    @skipIfNoSympy
     def test_reverse_arith_ops(self):
         shape_env = ShapeEnv()
 
@@ -160,7 +150,6 @@ def test_reverse_arith_ops(self):
         self.assertTrue(5 * a == 5 * 2)
 
 
-    @skipIfNoSympy
     def test_roundtrip(self):
         shape_env = ShapeEnv()
         x = create_symbolic_tensor("x", torch.randn(5, 4, 3), shape_env)
@@ -186,7 +175,6 @@ def test_roundtrip(self):
         self.assertTrue(isinstance(y.storage_offset(), SymInt))
         self.assertTrue(y.storage_offset() == 12)
 
-    @skipIfNoSympy
     def test_binary(self):
         shape_env = ShapeEnv()
         x = create_symbolic_tensor("x", torch.randn(5, 4, 3), shape_env)
@@ -204,7 +192,6 @@ def test_binary(self):
         self.assertTrue(z.shape[1] == 4)
         self.assertTrue(z.shape[2] == 3)
 
-    @skipIfNoSympy
     def test_symint_args(self):
         shape_env = ShapeEnv()
         x = create_symbolic_tensor("x", torch.randn(5, 4, 3), shape_env)
@@ -221,7 +208,6 @@ def test_symint_args(self):
         z = x.narrow_copy(LAST_DIM, 0, x.shape[LAST_DIM] - 1)
         self.assertTrue(z.shape[2] == 2)
 
-    @skipIfNoSympy
     def test_symint_vargs(self):
         shape_env = ShapeEnv()
         x = create_symbolic_tensor("x", torch.randn(5, 4, 3), shape_env)
@@ -266,13 +252,11 @@ def test_symint_vargs(self):
         z = y.expand((y.shape[1],))
         z = y.expand(y.shape[1])
 
-    @skipIfNoSympy
     def test_stride(self):
         shape_env = ShapeEnv()
         x = create_symbolic_tensor("x", torch.randn(5, 5), shape_env)
         self.assertIsInstance(x.stride()[0], SymInt)
 
-    @skipIfNoSympy
     def test_size_expressions(self):
         shape_env = ShapeEnv()
         x = create_symbolic_tensor("x", torch.randn(5), shape_env)
@@ -288,7 +272,6 @@ def test_size_expressions(self):
         self.assertTrue(str(expand_x.shape[1]), str(x.shape[0]))
         self.assertTrue(str(expand_x.shape[1]), str(result.shape[0]))
 
-    @skipIfNoSympy
     def test_numel(self):
         shape_env = ShapeEnv()
         x = create_symbolic_tensor("x", torch.randn(5), shape_env)
@@ -299,14 +282,12 @@ def test_numel(self):
         self.assertIsInstance(x.numel(), int)
         self.assertIsInstance(torch.numel(x), int)
 
-    @skipIfNoSympy
     def test_int_to_float(self):
         shape_env = ShapeEnv()
         x = create_symbolic_tensor("x", torch.randn(5), shape_env)
         r = sym_float(x.shape[0])
         self.assertIsInstance(r, torch.SymFloat, msg=type(r))
 
-    @skipIfNoSympy
     def test_aten_ops(self):
 
         shape_env = ShapeEnv()
@@ -329,21 +310,18 @@ def forward(self, x):
         # tuple of ints, not tuple
         torch.fx.symbolic_trace(m)
 
-    @skipIfNoSympy
     def test_meta_symint(self):
         shape_env = ShapeEnv()
         a0 = create_symint(shape_env, 2)
         r = torch.empty(a0, device='meta')
         self.assertIsInstance(r.shape[0], SymInt)
 
-    @skipIfNoSympy
     def test_guard_int(self):
         shape_env = ShapeEnv()
         a0 = create_symint(shape_env, 2)
         self.assertEqual(guard_int(a0), 2)
         self.assertExpectedInline(str(shape_env.guards[0][0]), """Eq(s0, 2)""")
 
-    @skipIfNoSympy
     def test_sym_int(self):
         shape_env = ShapeEnv()
         a0 = create_symint(shape_env, 5)
@@ -364,7 +342,12 @@ def test_sym_int(self):
         self.assertIsInstance(r, torch.SymInt, msg=type(r))
         self.assertExpectedInline(str(shape_env.guards[2][0]), """Eq(ceiling(-s2/2), -1)""")
 
-    @skipIfNoSympy
+        a3 = create_symint(shape_env, 3)
+        r = sym_int(2.0 * sym_float(a3))
+        self.assertEqual(guard_int(r), 6)
+        self.assertIsInstance(r, torch.SymInt, msg=type(r))
+        self.assertExpectedInline(str(shape_env.guards[3][0]), """Eq(2*s2, 6)""")
+
     def test_sym_sqrt(self):
         shape_env = ShapeEnv()
         a0 = create_symint(shape_env, 4)
@@ -373,7 +356,6 @@ def test_sym_sqrt(self):
         self.assertIsInstance(r, torch.SymFloat, msg=type(r))
         self.assertExpectedInline(str(shape_env.guards[0][0]), """Eq(sqrt(s0), 2)""")
 
-    @skipIfNoSympy
     def test_sym_floor(self):
         shape_env = ShapeEnv()
         a0 = create_symint(shape_env, 5)
@@ -381,21 +363,65 @@ def test_sym_floor(self):
         self.assertEqual(r, 2)
         self.assertIsInstance(r, torch.SymInt, msg=type(r))
         self.assertExpectedInline(str(shape_env.guards[0][0]), """Eq(floor(s0/2), 2)""")
+        r = math.floor(3.0 * a0)
+        self.assertEqual(r, 15)
+        self.assertIsInstance(r, torch.SymInt, msg=type(r))
+        self.assertExpectedInline(str(shape_env.guards[1][0]), """Eq(3*s0, 15)""")
+
+    def test_sym_ceil(self):
+        shape_env = ShapeEnv()
+        a0 = create_symint(shape_env, 5)
+        r = math.ceil(a0 / 2)
+        self.assertEqual(r, 3)
+        self.assertIsInstance(r, torch.SymInt, msg=type(r))
+        self.assertExpectedInline(str(shape_env.guards[0][0]), """Eq(ceiling(s0/2), 3)""")
+        r = math.floor(3.0 * a0)
+        self.assertEqual(r, 15)
+        self.assertIsInstance(r, torch.SymInt, msg=type(r))
+        self.assertExpectedInline(str(shape_env.guards[1][0]), """Eq(3*s0, 15)""")
+
 
-    @skipIfNoSympy
     def test_int_conversion(self):
         shape_env = ShapeEnv()
         a0 = create_symint(shape_env, 2)
-        self.assertRaisesRegex(RuntimeError, "Trying to extract", lambda: int(a0))
+        int(a0)
+        self.assertExpectedInline(str(shape_env.guards[0][0]), """Eq(s0, 2)""")
+
+    def test_data_dependent_guard(self):
+        shape_env = ShapeEnv()
+        s0 = shape_env.create_unbacked_symint()
+        self.assertRaises(GuardOnDataDependentSymNode, lambda: bool(s0 == 0))
 
-    @skipIfNoSympy
     def test_non_overlapping_and_dense(self):
         shape_env = ShapeEnv()
         a0 = create_symint(shape_env, 5)
         r = torch.empty_strided((a0, 7), (1, a0), device='meta')
         self.assertTrue(torch.ops.aten.is_non_overlapping_and_dense.default(r))
 
-    @skipIfNoSympy
+    def test_specialize_zero_one(self):
+        shape_env = ShapeEnv(specialize_zero_one=True)
+        a0 = create_symint(shape_env, 5)
+        assert a0 != 1
+        self.assertEqual(len(shape_env.guards), 0)
+
+        shape_env = ShapeEnv(specialize_zero_one=False)
+        a0 = create_symint(shape_env, 5)
+        assert a0 != 1
+        self.assertEqual(len(shape_env.guards), 1)
+
+    def test_duck_shape(self):
+        shape_env = ShapeEnv(duck_shape=True)
+        a0 = create_symint(shape_env, 5)
+        a1 = create_symint(shape_env, 5)
+        assert a0 == a1
+        self.assertEqual(len(shape_env.guards), 0)
+
+        shape_env = ShapeEnv(duck_shape=False)
+        a0 = create_symint(shape_env, 5)
+        a1 = create_symint(shape_env, 5)
+        assert a0 == a1
+        self.assertEqual(len(shape_env.guards), 1)
+
     def test_symint_as_scalar(self):
         shape_env = ShapeEnv()
         a0 = create_symint(shape_env, 2)
@@ -419,7 +445,13 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
 
         self.assertTrue(sym_int_encountered)
 
-    @skipIfNoSympy
+    def test_deepcopy(self):
+        shape_env = ShapeEnv()
+        a0 = create_symint(shape_env, 2)
+        assert a0 < 4
+        new_shape_env = copy.deepcopy(shape_env)
+        self.assertEqual(len(new_shape_env.guards), 1)
+
     def test_print_readable_with_symints(self):
         def f(a, b):
             dim0 = a.shape[0] + b.shape[0]
@@ -447,43 +479,6 @@ def forward(self, a_1: f32[s0, s1], b_1: f32[s2, s1]):
         getitem_1: b8[s0 + s2, 2*s1] = native_dropout[1];  native_dropout = None
         return (getitem, getitem_1)""")  # noqa: B950
 
-# This environment variable controls whether or not we print expected failure
-# lists at the end of a test suite run.  The intended usage looks like this:
-#
-# 1. Run `PYTORCH_COLLECT_EXPECT=1 python test/test_dynamic_shapes.py -k TestSymNumberMagicMethods`.
-# 2. Given the printed xfail list, add them to the set expected_failure_sym_magic_methods.
-COLLECT_EXPECT = os.getenv('PYTORCH_COLLECT_EXPECT', '0') == '1'
-
-seen_failed = []
-def print_seen():
-    out = []
-    for key, reason in seen_failed:
-        # Make sure the generated line is lint clean
-        msg = f"    {key},  # {reason}"
-        eol = msg.find("\n")
-        if eol != -1:
-            msg = msg[:eol]
-        out.append(msg[:120])
-
-    print("expected_failure_sym_magic_methods = {")
-    print("\n".join(out))
-    print("}")
-
-if COLLECT_EXPECT:
-    atexit.register(print_seen)
-
-expected_failure_sym_magic_methods = {
-    ('floordiv', 'SymFloat', 'float'),  # Cannot convert complex to float
-    ('floordiv', 'float', 'SymFloat'),  # Cannot convert complex to float
-    ('floordiv', 'SymFloat', 'SymFloat'),  # Cannot convert complex to float
-    ('floordiv', 'SymFloat', 'int'),  # Scalars are not close!
-    ('floordiv', 'float', 'SymInt'),  # Scalars are not close!
-    ('floordiv', 'SymFloat', 'SymInt'),  # Scalars are not close!
-    ('floordiv', 'SymInt', 'float'),  # Cannot convert complex to float
-    ('floordiv', 'int', 'SymFloat'),  # Cannot convert complex to float
-    ('floordiv', 'SymInt', 'SymFloat'),  # Cannot convert complex to float
-}
-
 @skipIfTorchDynamo("Creating ShapeEnv fails for confusing reasons (also we never expect dynamo to see code like this)")
 class TestSymNumberMagicMethods(TestCase):
     def _do_test(self, fn, inp1, inp2, shape_env, is_unary_fn):
@@ -501,18 +496,22 @@ def get_sym_inp(inp):
                 return torch.SymFloat(to_node(seed_node, inp))
 
         def maybe_xfail(inp1, inp2):
-            key = (fn, type(inp1).__name__, type(inp2).__name__)
-            if COLLECT_EXPECT:
-                @contextlib.contextmanager
-                def context():
-                    try:
-                        yield
-                    except (TypeError, AssertionError) as e:
-                        seen_failed.append((key, str(e)))
-                return context()
-
-            if key in expected_failure_sym_magic_methods:
-                return self.assertRaises((TypeError, AssertionError))
+            if fn == "sym_sqrt" and inp1 < 0:
+                # ValueError: math domain error
+                return self.assertRaises((ValueError,))
+            elif fn in ("truediv", "floordiv", "mod") and inp2 == 0:
+                # ZeroDivisionError: division by zero
+                return self.assertRaises((ZeroDivisionError,))
+            elif fn == "pow" and inp1 == 0 and inp2 < 0:
+                # ZeroDivisionError: 0.0 cannot be raised to a negative power
+                return self.assertRaises((ZeroDivisionError,))
+            elif fn == "pow" and inp1 < 0 and inp2 in (2.5, -2.5) and (
+                type(inp1) in (SymFloat, SymInt) or
+                type(inp2) in (SymFloat, SymInt)
+            ):
+                # Complex result, which we do not support:
+                # TypeError: Cannot convert complex to float
+                return self.assertRaises((TypeError,))
             else:
                 return contextlib.nullcontext()
 
@@ -525,19 +524,16 @@ def context():
         else:
             lambda_apply = getattr(operator, fn)
 
-        if fn in symbolic_shapes.always_float_magic_methods:
-            tp = "float"
-        elif fn in symbolic_shapes.always_int_magic_methods:
-            tp = "int"
-        elif fn in symbolic_shapes.always_bool_magic_methods:
-            tp = "bool"
-        elif is_unary_fn:
-            tp = "float" if isinstance(inp1, float) else "int"
-        else:
-            tp = "float" if any(isinstance(i, float) for i in [inp1, inp2]) else "int"
-
         def guard_fn(v):
-            return getattr(v.node, f"guard_{tp}")("", 0)
+            try:
+                if type(v) in (SymBool, bool):
+                    return guard_bool(v)
+                elif type(v) in (SymFloat, float):
+                    return guard_float(v)
+                else:  # SymInt, int
+                    return guard_int(v)
+            except Exception as e:
+                raise e
 
         # Get reference result
         with maybe_xfail(inp1, inp2):
@@ -553,7 +549,8 @@ def guard_fn(v):
                 out = lambda_apply(sym_inp1)
             else:
                 out = lambda_apply(sym_inp1, inp2)
-            self.assertEqual(guard_fn(out), ref_out)
+            out = guard_fn(out)
+            self.assertEqual(out, ref_out)
 
         if is_unary_fn:
             return
@@ -562,12 +559,14 @@ def guard_fn(v):
         sym_inp2 = get_sym_inp(inp2)
         with maybe_xfail(inp1, sym_inp2):
             out = lambda_apply(inp1, sym_inp2)
-            self.assertEqual(guard_fn(out), ref_out)
+            out = guard_fn(out)
+            self.assertEqual(out, ref_out)
 
         # Symified both args
         with maybe_xfail(sym_inp1, sym_inp2):
             out = lambda_apply(sym_inp1, sym_inp2)
-            self.assertEqual(guard_fn(out), ref_out)
+            out = guard_fn(out)
+            self.assertEqual(out, ref_out)
 
 
     @parametrize("fn", list(symbolic_shapes.magic_methods.keys()))
@@ -596,20 +595,211 @@ def test_method(self, fn, first_type, second_type):
         if fn in symbolic_shapes.bool_magic_methods:
             self.skipTest(f"{fn} is bool")
 
-        # We could pass int/float directly for types but then the
-        # mangled test name is bad
-        inp1 = random.random() * 2.5
-        if first_type == "int":
-            inp1 = int(inp1)
-        inp2 = random.random() * 2.5
-        if second_type == "int":
-            inp2 = int(inp2)
+        # Only floats here since these will be converted to int if necessary.
+        # We also ignore complex and bool.
+        values = (
+            0.0,
+            1.0,
+            2.5,
+        )
 
-        shape_env = ShapeEnv()
+        neg_values = tuple(-x for x in values)
 
-        self._do_test(fn, inp1, inp2, shape_env, is_unary_fn)
+        for inp1, inp2 in itertools.chain(
+            itertools.product(values, values),
+            itertools.product(values, neg_values),
+            itertools.product(neg_values, values),
+            itertools.product(neg_values, neg_values),
+        ):
+            if first_type == "int":
+                inp1 = int(inp1)
+            if second_type == "int":
+                inp2 = int(inp2)
+
+            shape_env = ShapeEnv()
+
+            self._do_test(fn, inp1, inp2, shape_env, is_unary_fn)
 
 instantiate_parametrized_tests(TestSymNumberMagicMethods)
 
+class TestFloorDiv(TestCase):
+    @staticmethod
+    def python_floordiv(x, y):
+        return x // y
+
+    @staticmethod
+    def torch_floordiv(x, y):
+        # Note: we fully evaluate here since FloorDiv might not always do
+        # that.
+        shape_env = ShapeEnv()
+        return shape_env.evaluate_expr(FloorDiv(x, y))
+
+    @staticmethod
+    def yield_test_cases(values, negate=True):
+        for x, y in values:
+            yield (x, y)
+            if negate:
+                yield (-x, y)
+                yield (x, -y)
+                yield (-x, -y)
+
+    def test_floordiv_float_int(self):
+        values = (
+            (2.5, 2.1),
+            (2.1, 2.5),
+            (2.0, 2.1),
+            (7, 2.5),
+            (2.1, 7),
+            (7, 2),
+        )
+
+        for x, y in TestFloorDiv.yield_test_cases(values):
+            self.assertEqual(TestFloorDiv.python_floordiv(x, y), TestFloorDiv.torch_floordiv(x, y))
+
+    def test_floordiv_bool(self):
+        values = (
+            (False, True),
+            (True, 2.5),
+            (2.5, True),
+            (False, 7),
+            (7, True),
+        )
+
+        for x, y in TestFloorDiv.yield_test_cases(values, negate=False):
+            # Compares to int since our FloorDiv has no bool support
+            self.assertEqual(TestFloorDiv.python_floordiv(x, y), TestFloorDiv.torch_floordiv(int(x), int(y)))
+            # Tests that our impl throws
+            self.assertRaisesRegex(
+                TypeError,
+                (rf"unsupported operand type\(s\) for //: "
+                 rf"'{type(sympy.sympify(x)).__name__}' and '{type(sympy.sympify(y)).__name__}'"
+                 rf", expected integer or real"),
+                lambda: TestFloorDiv.torch_floordiv(x, y))
+
+    def test_floordiv_complex(self):
+        values = (
+            (1.5 + 2.5j, 1.3 + 3.5j),
+            (1.5 + 2.5j, 2.5),
+            (2.5, 1.5 + 2.5j),
+            (1.5 + 2.5j, 7),
+            (7, 1.5 + 2.5j),
+        )
+
+        for x, y in TestFloorDiv.yield_test_cases(values):
+            # We don't test error messages to avoid depending on Python
+            # interpreter version
+            self.assertRaises(TypeError, lambda: TestFloorDiv.python_floordiv(x, y))
+            self.assertRaisesRegex(
+                TypeError,
+                (rf"unsupported operand type\(s\) for //: "
+                 rf"'{type(sympy.sympify(x)).__name__}' and '{type(sympy.sympify(y)).__name__}'"
+                 rf", expected integer or real"),
+                lambda: TestFloorDiv.torch_floordiv(x, y))
+
+    def test_floordiv_div_by_zero(self):
+        values = (
+            (2.5, 0),
+            (2.1, 0.0),
+            (2.3, sympy.Symbol("s", zero=True)),
+        )
+
+        for x, y in TestFloorDiv.yield_test_cases(values, negate=False):
+            # We don't test error messages to avoid depending on Python
+            # interpreter version
+            if type(y) is not sympy.Symbol:
+                self.assertRaises(ZeroDivisionError, lambda: TestFloorDiv.python_floordiv(x, y))
+            self.assertRaisesRegex(
+                ZeroDivisionError,
+                "division by zero",
+                lambda: TestFloorDiv.torch_floordiv(x, y))
+
+    def test_floordiv_zero_base(self):
+        values = (
+            (0, 2.5),
+            (0.0, 2.1),
+            (sympy.Symbol("s", zero=True), 2.3),
+        )
+
+        for x, y in TestFloorDiv.yield_test_cases(values, negate=False):
+            if type(x) is not sympy.Symbol:
+                self.assertEqual(TestFloorDiv.python_floordiv(x, y), TestFloorDiv.torch_floordiv(x, y))
+            else:
+                self.assertEqual(0, TestFloorDiv.torch_floordiv(x, y))
+
+    def test_floordiv_div_by_one(self):
+        values = (
+            (2.5, 1),
+            (2.1, 1.0),
+            (2, 1.0),
+            (2, 1),
+        )
+
+        for x, y in TestFloorDiv.yield_test_cases(values):
+            self.assertEqual(TestFloorDiv.python_floordiv(x, y), TestFloorDiv.torch_floordiv(x, y))
+
+    def test_floordiv_simplify(self):
+        # Tests how we simplify or evaluate FloorDiv without free variables
+        shape_env = ShapeEnv()
+        result = 21
+        exprs = (
+            7 * FloorDiv(6, 2),
+            7 * FloorDiv(6.28, 2),
+            7 * FloorDiv(6.28, 2.0),
+            7 * FloorDiv(6.28, (FloorDiv(6.28, 3.14))),
+        )
+
+        for expr in exprs:
+            self.assertEqual(expr, result)
+            self.assertEqual(expr.doit(deep=False), result)
+            self.assertEqual(expr.doit(deep=True), result)
+            self.assertEqual(sympy.simplify(expr), result)
+            self.assertEqual(shape_env.simplify(expr), result)
+            self.assertEqual(shape_env.evaluate_expr(expr), result)
+
+    def test_floordiv_assumptions(self):
+        # We define two Symbols (with different names) for each type to make
+        # sure the behavior is consistent regardless of whether both arguments
+        # are the same object or not.
+        cases = (
+            sympy.Symbol("i1", integer=True),
+            sympy.Symbol("i2", integer=True),
+            sympy.Symbol("r1", real=True),
+            sympy.Symbol("r2", real=True),
+            sympy.Symbol("c1", complex=True, real=False, integer=False),
+            sympy.Symbol("c2", complex=True, real=False, integer=False),
+            sympy.Symbol("s1"),
+            sympy.Symbol("s2"),
+        )
+
+        for base, divisor in itertools.product(cases, repeat=2):
+            def op():
+                return FloorDiv(base, divisor)
+
+            def is_complex(x):
+                return x.is_integer is False and x.is_real is False and x.is_complex
+
+            if is_complex(base) or is_complex(divisor):
+                self.assertRaisesRegex(
+                    TypeError,
+                    (r"unsupported operand type\(s\) for //: 'Symbol' and 'Symbol',"
+                     r" expected integer or real"),
+                    op)
+                continue
+
+            op = op()
+
+            # In regular Python, x//x == 1.0 if x is a float, but FloorDiv
+            # always returns an integer 1 when both args are the same object.
+            # This even works for Symbols with no assumptions specified.
+            if base is divisor:
+                self.assertTrue(op.is_integer)
+                self.assertTrue(op.is_real)
+            elif base.is_integer and divisor.is_integer:
+                self.assertTrue(op.is_integer)
+                self.assertTrue(op.is_real)
+            else:
+                self.assertEqual(op.is_integer, None)
+                self.assertTrue(op.is_real)
+
 if __name__ == '__main__':
     run_tests()
diff --git a/test/test_expanded_weights.py b/test/test_expanded_weights.py
index a7f4709c27d2..2cb43386b9ea 100644
--- a/test/test_expanded_weights.py
+++ b/test/test_expanded_weights.py
@@ -11,12 +11,15 @@
 from torch.nn.utils._per_sample_grad import call_for_per_sample_grads
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_device_type import OpDTypes, instantiate_device_type_tests, ops
+from torch.testing._internal.common_modules import module_db, modules
 from torch.testing._internal.common_nn import TestBase, module_tests, new_module_tests
 from torch.testing._internal.common_utils import TestCase, freeze_rng_state, make_tensor, run_tests, parametrize
 from torch.testing._internal.common_methods_invocations import SampleInput, op_db
 from torch.nn.utils._expanded_weights import ExpandedWeight
 from torch.nn.utils._expanded_weights.expanded_weights_utils import forward_helper, set_grad_sample_if_exists, \
     unpack_expanded_weight_or_tensor, sum_over_all_but_batch_and_last_n, standard_kwargs
+from torch.utils._pytree import tree_map_only
+
 
 class TestContext:
     pass
@@ -77,13 +80,14 @@ def test_forward_helper_failure_args(self, device):
 
     def test_set_grad_sample_if_exists(self, device):
         def test_fn(a):
-            return True
+            return grad_sample
 
         orig_weight = torch.randn(4, device=device, requires_grad=True)
         expanded_weight = ExpandedWeight(orig_weight, 3, loss_reduction="sum")
+        grad_sample = torch.randn(3)
         set_grad_sample_if_exists(expanded_weight, test_fn)
         self.assertTrue(hasattr(orig_weight, 'grad_sample'))
-        self.assertTrue(orig_weight.grad_sample)
+        self.assertEqual(orig_weight.grad_sample, grad_sample)
 
         basic_tensor = torch.randn(4, device=device)
         set_grad_sample_if_exists(basic_tensor, test_fn)
@@ -383,14 +387,22 @@ def test_group_norm_error(self, device):
             F.group_norm(inp, 2)  # 5 is not divisible by 2
 
 class TestExpandedWeightModule(TestCase):
-    def _do_test(self, module, input):
-        batch_size = input.shape[0]
+    def _do_test(self, module, input, args=None, kwargs=None, batch_first=True, atol=None, rtol=None):
+        args = args or ()
+        kwargs = kwargs or {}
+
+        batch_dim = 0 if batch_first else 1
+        batch_size = input.shape[batch_dim]
         diff_input = input.dtype == torch.float or input.dtype == torch.double
         if diff_input:
             input.requires_grad_()
+
         with freeze_rng_state():
             # get per sample grads with ExpandedWeights context manager
-            actual_res = call_for_per_sample_grads(module, loss_reduction="sum")(input).sum()
+            actual_res = call_for_per_sample_grads(module,
+                                                   batch_size=batch_size,
+                                                   loss_reduction="sum",
+                                                   batch_first=batch_first)(input, *args, **kwargs).sum()
             actual_res.backward()
             actual_grads = []
             for param in module.parameters():
@@ -401,20 +413,26 @@ def _do_test(self, module, input):
                 input.grad = torch.zeros_like(input.grad)
 
             # get per sample grads with a for loop
-            expected_res = torch.tensor(0., device=input.device, dtype=torch.double)
+            expected_res = torch.tensor(0., device=input.device, dtype=actual_res.dtype)
             expected_grads = []
             for i in range(batch_size):
-                input_slice = input[i]
+                input_slice = input.narrow(batch_dim, i, 1)
+                input_slice = input_slice.squeeze(batch_dim)
+
+                # h's batch dim is always the first dim. Must be contiguous for CUDA
+                sliced_args = tree_map_only(torch.Tensor, lambda t: t.narrow(1, i, 1).contiguous(), args)
                 diff_params = module.parameters()
                 if diff_input:
                     diff_params = chain(diff_params, (input_slice,))
-                res = module(input_slice.unsqueeze(0)).sum()
+                res = module(input_slice.unsqueeze(batch_dim).contiguous(), *sliced_args, **kwargs).sum()
                 out_grads = torch.autograd.grad(res, diff_params, torch.ones_like(res), allow_unused=True)
                 expected_grads.append(out_grads)
                 expected_res += res
-            expected_grads = tuple(torch.stack(grad) for grad in zip(*expected_grads))
+            expected_grads = [torch.stack(grad) for grad in zip(*expected_grads)]
+            if not batch_first:
+                expected_grads[-1] = expected_grads[-1].transpose(0, 1)
         self.assertEqual(actual_res, expected_res)
-        [self.assertEqual(actual, expected) for (actual, expected) in zip(actual_grads, expected_grads)]
+        [self.assertEqual(actual, expected, atol=atol, rtol=rtol) for (actual, expected) in zip(actual_grads, expected_grads)]
 
     def _do_test_multi_input(self, module, input):
         class TestModule(nn.Module):
@@ -457,6 +475,98 @@ def forward(self, input):
         expected_grads = tuple(expected_grad for expected_grad in expected_grads if expected_grad is not None)
         assert [self.assertEqual(actual, 2 * expected) for (actual, expected) in zip(actual_grads, expected_grads)]
 
+    def _do_test_rnn_packed_sequence(self, module, input, args=None, kwargs=None, atol=None, rtol=None):
+        args = args if args is not None else ()
+        kwargs = kwargs if kwargs is not None else {}
+
+        batch_size = max(tuple(input.batch_sizes)).item()
+
+        with freeze_rng_state():
+            # get per sample grads with ExpandedWeights context manager
+            actual_res = call_for_per_sample_grads(module,
+                                                   batch_size=batch_size,
+                                                   loss_reduction="sum")(input, *args, **kwargs).data.sum()
+            actual_res.backward()
+            actual_grads = []
+            for param in module.parameters():
+                self.assertEqual(param.grad_sample.shape[0], batch_size)
+                actual_grads.append(param.grad_sample)
+                del param.grad_sample
+
+            input.data.grad = torch.zeros_like(input.data)
+
+            # compute the per sample grads with a for loop
+            expected_res = torch.zeros_like(actual_res)
+            expected_grads = []
+            padded_input, seq_sizes = torch.nn.utils.rnn.pad_packed_sequence(input, batch_first=True)
+            for i in range(len(seq_sizes)):
+                input_slice = padded_input[i].narrow(0, 0, seq_sizes[i])
+                diff_params = module.parameters()
+                batch_dim = 0 if module.m.batch_first else 1
+                res = module(input_slice.unsqueeze(batch_dim), *args, **kwargs).sum()
+                expected_res += res
+                out_grads = torch.autograd.grad(res, diff_params, torch.ones_like(res), allow_unused=True)
+                expected_grads.append(out_grads)
+
+            expected_grads = [torch.stack(grad) for grad in zip(*expected_grads)]
+            self.assertEqual(actual_res, expected_res)
+            [self.assertEqual(actual, expected, atol=atol, rtol=rtol) for (actual, expected) in zip(actual_grads, expected_grads)]
+
+    @modules(filter(lambda m_info: m_info.module_cls in (torch.nn.RNN, torch.nn.LSTM, torch.nn.GRU), module_db))
+    def test_module(self, device, dtype, module_info, training):
+        class RNNWrapper(torch.nn.Module):
+            def __init__(self, m_cons, args, kwargs):
+                super().__init__()
+                self.m = m_cons(*args, **kwargs)
+
+            def forward(self, *inps):
+                ret = self.m(*inps)
+                assert isinstance(ret, tuple)
+                return ret[0]
+
+        def batch_hidden(h):
+            new_h_shape = [1] * (len(h.shape) + 1)
+            new_h_shape[1] = 2
+            return h.unsqueeze(1).repeat(new_h_shape)
+
+
+        module_cls = module_info.module_cls
+        atol, rtol = (1e-4, 1e-5) if module_cls == torch.nn.GRU and dtype == torch.float32 else (None, None)
+        module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
+                                                       requires_grad=True, training=training, with_packed_sequence=True)
+        for module_input in module_inputs:
+            if module_input.forward_input is None:
+                continue
+            args, kwargs = module_input.constructor_input.args, module_input.constructor_input.kwargs
+            m = RNNWrapper(module_cls, args, kwargs)
+            batch_first = m.m.batch_first
+            m.to(device).to(dtype)
+
+            args, kwargs = module_input.forward_input.args, module_input.forward_input.kwargs
+
+            # if the RNN tests use unbatched inputs--batch the inputs
+            input = args[0]
+            if isinstance(input, torch.Tensor) and input.dim() == 2:
+                input = input.detach()
+                new_input_shape = [1] * (len(input.shape) + 1)
+                if batch_first:
+                    new_input_shape[0] = 2
+                    input = input.repeat(new_input_shape)
+                else:
+                    new_input_shape[1] = 2
+                    input = input.unsqueeze(1).repeat(new_input_shape)
+
+                h = args[1] if len(args) > 1 else None
+                if h is not None:
+                    h = batch_hidden(h) if isinstance(h, torch.Tensor) else tuple(batch_hidden(hx) for hx in h)
+                    args = list(args)
+                    args[1] = h
+
+            if isinstance(input, torch.nn.utils.rnn.PackedSequence):
+                self._do_test_rnn_packed_sequence(m, input, args[1:], kwargs, atol=atol, rtol=rtol)
+            else:
+                self._do_test(m, input, args[1:], kwargs, batch_first=batch_first, atol=atol, rtol=rtol)
+
     def test_per_sample_api_failing(self):
         module = nn.Linear(10, 10)
         input = torch.randn(64, 10)
@@ -665,5 +775,6 @@ def clone_if_tensor(t):
 
 instantiate_device_type_tests(TestExpandedWeightHelperFunction, globals())
 instantiate_device_type_tests(TestExpandedWeightFunctional, globals())
+instantiate_device_type_tests(TestExpandedWeightModule, globals())
 if __name__ == '__main__':
     run_tests()
diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py
index 1a13d56fe161..5d52ef38c26d 100644
--- a/test/test_fake_tensor.py
+++ b/test/test_fake_tensor.py
@@ -22,6 +22,8 @@
 import contextlib
 import weakref
 import copy
+import torch._functorch.config
+from unittest.mock import patch
 
 from torch.utils._mode_utils import no_dispatch
 from torch.utils._python_dispatch import TorchDispatchMode
@@ -456,7 +458,7 @@ def check_copy(mod, mod_copied):
 
         class ModuleNew(torch.nn.Module):
             def __init__(self):
-                super(ModuleNew, self).__init__()
+                super().__init__()
                 self.a = torch.rand([10, 2])
                 self.b = self.a
                 self.c = self.a[0]
@@ -485,6 +487,17 @@ def test_scalar_inputs(self):
             self.assertEqual(ten.dtype, torch.float)
             self.checkType(ten, "cpu", [2])
 
+    def test_allow_meta(self):
+        def run_meta():
+            with FakeTensorMode():
+                x = torch.rand([4], device="meta")
+                return x + x
+
+        self.checkType(run_meta(), "meta", [4])
+
+        with patch.object(torch._functorch.config, "fake_tensor_allow_meta", False):
+            self.assertRaises(Exception, run_meta)
+
 
 class FakeTensorConstHandling(TestCase):
     def assertConst(self, *args):
@@ -540,7 +553,7 @@ def fn(tensors):
             return tensors[0].new_full(batch_shape, 0.0)
 
         with self.assertRaises(torch._subclasses.fake_tensor.DataDependentOutputException):
-            with torch._subclasses.fake_tensor.FakeTensorMode(throw_on_data_dependent_ops=True):
+            with torch._subclasses.fake_tensor.FakeTensorMode():
                 a = torch.randn(3, 800, 1199)
                 b = torch.randn(3, 800, 800)
                 inputs = [a, b]
@@ -571,6 +584,11 @@ def test_aliased_const_write(self):
             y[0] = 1
             self.assertNotConst(x)
 
+    def test_constant_propagate_through_functions(self):
+        with FakeTensorMode():
+            y = torch.div(4, 4, rounding_mode='trunc')
+            self.assertConst(y)
+
 def contains_type(type: torch._C.Type, maybe_contained_type: torch._C.Type):
     return maybe_contained_type.isSubtypeOf(type) or any(
         contains_type(e, maybe_contained_type) for e in type.containedTypes()
@@ -737,6 +755,11 @@ def test_sparse_new(self):
             # error
             sparse2 = sparse.new(indices, values, extra)
 
+    def test_tensor_new(self):
+        with FakeTensorMode():
+            x = torch.Tensor([1, 2, 3])
+        self.assertIsInstance(x, FakeTensor)
+
     def test_like_ops(self):
         for schema in self.get_all_aten_schemas():
             if "_like" == schema.name[-5:]:
@@ -839,6 +862,33 @@ def to_fake_tensor(x):
                     failed = True
                 self.assertTrue(failed)
 
+
+    def test_fake_tensor_prop_on_nn_module_with_optional_args(self):
+        class OptionalArgumentInBetween(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.layer1 = torch.nn.Linear(4, 3)
+                self.layer2 = torch.nn.Linear(3, 2)
+
+            def forward(self, value, another_value=None, another_optional_value=None):
+                # Mimic huggingface's `forward` methods which have several optional arguments.
+                # For example, GPT accepts forward(self, input_ids, None, attention_mask, ...).
+                # To apply FakeTensorProp, its from_real_tensor(...) needs to accept None.
+                if another_value is None:
+                    another_value = torch.rand_like(value)
+                if another_optional_value is None:
+                    another_optional_value = torch.rand_like(value)
+                value = value + another_value + another_optional_value
+                return value * value
+
+        fake_mode = FakeTensorMode(allow_non_fake_inputs=True, allow_fallback_kernels=False)
+        with fake_mode:
+            model = OptionalArgumentInBetween()
+            value = torch.randn(5, 4)
+            another_optional_value = torch.randn(5, 4)
+            graph_model = torch.fx.symbolic_trace(model, (value, None, another_optional_value))
+            FakeTensorProp(graph_model, fake_mode).propagate(value, None, another_optional_value)
+
 instantiate_parametrized_tests(FakeTensorTest)
 
 if __name__ == "__main__":
diff --git a/test/test_foreach.py b/test/test_foreach.py
index 130f010a8565..824a0b216364 100644
--- a/test/test_foreach.py
+++ b/test/test_foreach.py
@@ -65,6 +65,26 @@ def __call__(self, inputs, is_cuda, is_fastpath, **kwargs):
         return inputs[0] if self._is_inplace else actual
 
 
+def get_transform_func(num_tensors, dtype, device, is_fastpath):
+    def transform(t):
+        if not torch.is_tensor(t):
+            return t
+        return make_tensor(
+            (num_tensors, num_tensors), dtype=dtype, device=device,
+            requires_grad=True, noncontiguous=not is_fastpath,
+        )
+    return transform
+
+
+def clone(arg):
+    if isinstance(arg, (list, tuple)):
+        return [clone(a) for a in arg]
+    if torch.is_tensor(arg):
+        return arg.clone().detach().requires_grad_()
+    else:
+        return arg
+
+
 class TestForeach(TestCase):
 
     @property
@@ -82,18 +102,21 @@ def _get_funcs(self, op):
             RegularFuncWrapper(op.ref_inplace),
         )
 
-    def _binary_test(self, dtype, op, ref, inputs, is_fastpath, is_inplace, *, alpha=None):
+    def _binary_test(self, dtype, op, ref, inputs, is_fastpath, is_inplace, *, alpha=None, scalar_self_arg=False):
         ref_inputs = [[t.clone().detach() for t in inputs[0]], inputs[1]] if is_inplace else inputs
 
         try:
             actual = op(inputs, self.is_cuda, is_fastpath)
         except RuntimeError as e:
             with self.assertRaisesRegex(type(e), re.escape(str(e))):
-                ref(ref_inputs)
+                if not scalar_self_arg:
+                    ref(ref_inputs)
+                else:
+                    [ref.func(ref_inputs[0], t) for t in ref_inputs[1]]
         else:
-            expected = ref(ref_inputs)
+            expected = ref(ref_inputs) if not scalar_self_arg else [ref.func(ref_inputs[0], t) for t in ref_inputs[1]]
             self.assertEqual(actual, expected)
-        if alpha is not None:
+        if alpha is not None and not scalar_self_arg:
             kwargs = {'alpha': alpha}
             ref_inputs = inputs
             try:
@@ -112,24 +135,54 @@ def _binary_test(self, dtype, op, ref, inputs, is_fastpath, is_inplace, *, alpha
     @ops(foreach_binary_op_db)
     @parametrize("is_fastpath", (True, False))
     def test_binary_op(self, device, dtype, op, is_fastpath):
-        for sample in op.sample_inputs(device, dtype, noncontiguous=not is_fastpath):
+        scalar_self_arg_test_complete = False
+        for i, sample in enumerate(op.sample_inputs(device, dtype, noncontiguous=not is_fastpath)):
             rhs_arg, = sample.args
             kwargs = {} or sample.kwargs
             alpha = kwargs.pop("alpha", None)
             disable_fastpath = kwargs.pop("disable_fastpath") if is_fastpath else False
-
             wrapped_op, ref, inplace_op, inplace_ref = self._get_funcs(op)
             self._binary_test(
-                dtype, wrapped_op, ref, [sample.input, rhs_arg], is_fastpath and not disable_fastpath, False, alpha=alpha)
+                dtype, wrapped_op, ref, [sample.input, rhs_arg],
+                is_fastpath and not disable_fastpath, False, alpha=alpha)
             self._binary_test(
-                dtype, inplace_op, inplace_ref, [sample.input, rhs_arg], is_fastpath and not disable_fastpath, True, alpha=alpha)
+                dtype, inplace_op, inplace_ref, [sample.input, rhs_arg],
+                is_fastpath and not disable_fastpath, True, alpha=alpha)
+
+            if op.supports_autograd and dtype in floating_types():
+                transformed_sample = sample.transform(get_transform_func(len(sample.input), dtype, device, is_fastpath))
+                tensors = transformed_sample.input
+                rhs_arg, = transformed_sample.args
+                ref_tensors, ref_rhs_arg = clone(tensors), clone(rhs_arg)
+                try:
+                    sum(wrapped_op([tensors, rhs_arg], is_cuda=False, is_fastpath=False)).mean().backward()
+                except RuntimeError:
+                    with self.assertRaises(RuntimeError):
+                        sum(ref([ref_tensors, ref_rhs_arg])).mean().backward()
+                else:
+                    sum(ref([ref_tensors, ref_rhs_arg])).mean().backward()
+                    self.assertEqual([t.grad for t in tensors], [t.grad for t in ref_tensors])
+                    if isinstance(rhs_arg, list) and isinstance(rhs_arg[0], torch.Tensor):
+                        self.assertEqual([t.grad for t in rhs_arg], [t.grad for t in ref_rhs_arg])
+            if op.supports_scalar_self_arg and isinstance(rhs_arg, Number) and (not scalar_self_arg_test_complete):
+                scalar_self_arg_test_complete = True
+                self._binary_test(
+                    dtype, wrapped_op, ref, [rhs_arg, sample.input], is_fastpath, False,
+                    alpha=alpha, scalar_self_arg=True)
+                if op.supports_autograd and dtype == torch.float32:
+                    transformed_sample = sample.transform(
+                        get_transform_func(len(sample.input), dtype, device, is_fastpath))
+                    tensors = transformed_sample.input
+                    rhs_arg, = transformed_sample.args
+                    ref_tensors, ref_rhs_arg = clone(tensors), clone(rhs_arg)
+                    sum(wrapped_op([rhs_arg, tensors], is_cuda=False, is_fastpath=False)).mean().backward()
+                    sum([ref.func(ref_rhs_arg, t) for t in ref_tensors]).mean().backward()
+                    self.assertEqual([t.grad for t in tensors], [t.grad for t in ref_tensors])
 
     @ops(foreach_pointwise_op_db)
     @parametrize("is_fastpath", (True, False))
     def test_pointwise_op(self, device, dtype, op, is_fastpath):
-        for sample in op.sample_inputs(device, dtype):
-            if not is_fastpath:
-                sample = sample.noncontiguous()
+        for sample in op.sample_inputs(device, dtype, noncontiguous=not is_fastpath):
             assert isinstance(sample.args, tuple)
             assert len(sample.args) == 2
             inputs = [sample.input, *sample.args]
@@ -138,7 +191,27 @@ def test_pointwise_op(self, device, dtype, op, is_fastpath):
             wrapped_op, ref, inplace_op, inplace_ref = self._get_funcs(op)
             values = kwargs.pop("values")
             self._pointwise_test(wrapped_op, ref, inputs, is_fastpath and not disable_fastpath, False, values=values)
-            self._pointwise_test(inplace_op, inplace_ref, inputs, is_fastpath and not disable_fastpath, True, values=values)
+            self._pointwise_test(
+                inplace_op, inplace_ref, inputs, is_fastpath and not disable_fastpath,
+                True, values=values)
+
+            if op.supports_autograd and dtype in floating_types():
+                transformed_sample = sample.transform(
+                    get_transform_func(len(sample.input), dtype, device, is_fastpath))
+                tensors = transformed_sample.input
+                rhs_arg = transformed_sample.args
+                ref_tensors, ref_rhs_arg = clone(tensors), clone(rhs_arg)
+                try:
+                    sum(wrapped_op([tensors, *rhs_arg], is_cuda=False, is_fastpath=False)).mean().backward()
+                except RuntimeError:
+                    with self.assertRaises(RuntimeError):
+                        sum(ref([ref_tensors, *ref_rhs_arg])).mean().backward()
+                else:
+                    sum(ref([ref_tensors, *ref_rhs_arg])).mean().backward()
+                    self.assertEqual([t.grad for t in tensors], [t.grad for t in ref_tensors])
+                    for op_list, ref_list in zip(rhs_arg, ref_rhs_arg):
+                        if isinstance(op_list, list) and isinstance(op_list[0], torch.Tensor):
+                            self.assertEqual([t.grad for t in op_list], [t.grad for t in ref_list])
 
             if is_fastpath and isinstance(values, list):
                 sample = sample.transform(lambda t: t.clone().detach() if torch.is_tensor(t) else t)
@@ -222,24 +295,6 @@ def _inplace_unary_test(self, inplace, inplace_ref, inputs, is_fastpath):
             inplace_ref(copied_inputs),
             self.assertEqual(copied_inputs, inputs)
 
-    def _test_unary(self, device, dtype, opinfo, N, is_fastpath):
-        op, ref, inplace_op, inplace_ref = self._get_funcs(opinfo, 1)
-        inputs = opinfo.sample_inputs(device, dtype, N, noncontiguous=not is_fastpath),
-        # note(mkozuki): Complex inputs for `_foreach_abs` go through slowpath.
-        if opinfo.name == "_foreach_abs" and dtype in complex_types():
-            is_fastpath = False
-        self._regular_unary_test(dtype, op, ref, inputs, is_fastpath)
-        self._inplace_unary_test(dtype, inplace_op, inplace_ref, inputs, is_fastpath)
-
-        if opinfo.supports_autograd and dtype in floating_types():
-            tensors = opinfo.sample_inputs(device, dtype, N, noncontiguous=not is_fastpath, same_size=True)
-            tensors = [t.requires_grad_() for t in tensors]
-            ref_tensors = [t.clone().detach().requires_grad_() for t in tensors]
-
-            sum(op.func(tensors)).mean().backward()
-            sum([ref.func(t) for t in ref_tensors]).mean().backward()
-            self.assertEqual([t.grad for t in tensors], [t.grad for t in ref_tensors])
-
     @skipMeta
     @ops(foreach_unary_op_db)
     @parametrize("is_fastpath", (True, False))
@@ -257,19 +312,39 @@ def test_unary_op(self, device, dtype, op, is_fastpath):
             )
             self.assertEqual(ref(inputs), wrapped_op(inputs, self.is_cuda, is_fastpath and not disable_fastpath))
             self._inplace_unary_test(inplace_op, inplace_ref, [sample.input], is_fastpath and not disable_fastpath)
+            if op.supports_autograd and dtype in floating_types():
+                num_tensors = len(sample.input)
+                tensors = [
+                    make_tensor(
+                        (num_tensors, num_tensors), dtype=dtype, device=device,
+                        requires_grad=True, noncontiguous=not is_fastpath,
+                    )
+                    for _ in range(num_tensors)
+                ]
+                ref_tensors = [t.clone().detach().requires_grad_() for t in tensors]
+                sum(wrapped_op.func(tensors)).mean().backward()
+                sum([ref.func(t) for t in ref_tensors]).mean().backward()
+                self.assertEqual([t.grad for t in tensors], [t.grad for t in ref_tensors])
 
     @ops(foreach_reduce_op_db)
     @parametrize("is_fastpath", (True, False))
     def test_reduce_op(self, device, dtype, op, is_fastpath):
-        for sample in op.sample_inputs(device, dtype):
-            if not is_fastpath:
-                sample = sample.noncontiguous()
+        for sample in op.sample_inputs(device, dtype, noncontiguous=not is_fastpath):
             ord = sample.kwargs.pop("ord")
             disable_fastpath = sample.kwargs.pop("disable_fastpath", False)
 
             inputs = (sample.input,)
             wrapped_op, ref, _, _ = self._get_funcs(op)
             self.assertEqual(ref(inputs, ord=ord), wrapped_op(inputs, self.is_cuda, is_fastpath and not disable_fastpath, ord=ord))
+            if op.supports_autograd and dtype in floating_types():
+                transformed_sample = sample.transform(get_transform_func(len(sample.input), dtype, device, is_fastpath))
+                tensors = transformed_sample.input
+                ref_tensors = clone(tensors)
+                sum(wrapped_op((tensors,), False, False, ord=ord)).backward()
+                sum(ref((ref_tensors,), ord=ord)).backward()
+                self.assertEqual(
+                    [t.grad for t in tensors], [t.grad for t in ref_tensors],
+                )
 
     @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))
     def test_add_scalar_with_empty_list_and_empty_tensor(self, device, dtype):
@@ -283,7 +358,6 @@ def test_add_scalar_with_empty_list_and_empty_tensor(self, device, dtype):
 
     @ops(foreach_binary_op_db, dtypes=OpDTypes.supported)
     def test_binary_op_scalar_with_overlapping_tensors(self, device, dtype, op):
-        print(op, device, dtype)
         foreach_op, ref = op.method_variant, op.ref
         tensors = [torch.ones(1, 1, device=device, dtype=dtype).expand(2, 1, 3)]
 
@@ -466,7 +540,7 @@ def test_binary_op_tensors_on_different_devices(self, device, dtype, op):
         # `tensors2`: ['cuda', 'cpu']
         _cuda_tensors = list(op.sample_inputs(device, dtype, num_input_tensors=[2], same_size=True))[0].input
         _cpu_tensors = list(op.sample_inputs("cpu", dtype, num_input_tensors=[2], same_size=True))[0].input
-        tensors1, tensors2 = list(tensors for tensors in zip(_cuda_tensors, _cpu_tensors))
+        tensors1, tensors2 = list(zip(_cuda_tensors, _cpu_tensors))
 
         foreach_op, foreach_op_ = op.method_variant, op.inplace_variant
         native_op, native_op_ = op.ref, op.ref_inplace
@@ -494,7 +568,7 @@ def test_pointwise_op_tensors_on_different_devices(self, device, dtype, op):
         # tensors3: ['cuda', 'cpu]
         _cuda_tensors = list(op.sample_inputs(device, dtype, num_input_tensors=[3], same_size=True))[0].input
         _cpu_tensors = list(op.sample_inputs("cpu", dtype, num_input_tensors=[3], same_size=True))[0].input
-        tensors1, tensors2, tensors3 = list(tensors for tensors in zip(_cuda_tensors, _cpu_tensors))
+        tensors1, tensors2, tensors3 = list(zip(_cuda_tensors, _cpu_tensors))
 
         foreach_op, foreach_op_, native_op = op.method_variant, op.inplace_variant, op.ref
         actual = foreach_op(tensors1, tensors2, tensors3)
@@ -531,7 +605,6 @@ def test_foreach_l2_large_value_input(self, device, dtype, op):
     def test_lerp(self, device, dtype, op, is_fastpath):
         for sample in op.sample_inputs(device, dtype, noncontiguous=not is_fastpath):
             wrapped_op, ref, inplace_op, _ = self._get_funcs(op)
-
             args = [*sample.args]
             inputs = [sample.input, args[0]]
 
@@ -557,6 +630,24 @@ def test_lerp(self, device, dtype, op, is_fastpath):
             inplace_actual = inplace_op(inplace_inputs, self.is_cuda, is_fastpath, **kwargs)
             self.assertEqual(inplace_actual, expected)
 
+            if op.supports_autograd and dtype in floating_types():
+                transformed_sample = sample.transform(get_transform_func(len(sample.input), dtype, device, is_fastpath))
+                args = [*transformed_sample.args]
+                inputs = [transformed_sample.input, args[0]]
+
+                kwargs, ref_kwargs = {}, {}
+                if isinstance(args[1], list):
+                    inputs.append(args[1])
+                else:
+                    kwargs = ref_kwargs = {"weight": args[1]}
+                ref_tensors = clone(transformed_sample.input)
+                sum(wrapped_op((transformed_sample.input, *inputs[1:]), False, False, **kwargs)).mean().backward()
+                sum(ref((ref_tensors, *inputs[1:]), **ref_kwargs)).mean().backward()
+                self.assertEqual(
+                    [t.grad for t in transformed_sample.input], [t.grad for t in ref_tensors],
+                    msg=f"{transformed_sample.input[0].grad[:2, :2]}, {ref_tensors[0].grad[:2, :2]}"
+                )
+
     @onlyCUDA
     @ops(foreach_reduce_op_db)
     def test_foreach_reduce_large_input(self, device, dtype, op):
diff --git a/test/test_functional_optim.py b/test/test_functional_optim.py
index 24e5088be08d..98eb79f808c1 100644
--- a/test/test_functional_optim.py
+++ b/test/test_functional_optim.py
@@ -22,7 +22,7 @@ def forward(self, t1):
         return self.lin2(F.relu(self.lin1(t1)))
 
 # dummy class to showcase custom optimizer registration with functional wrapper
-class MyDummyFnOptimizer(object):
+class MyDummyFnOptimizer:
     def __init__(
         self,
         params: List[Tensor],
diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index 026740403a59..bdd01ec2a02e 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -178,7 +178,7 @@ def g(x):
             from torch._functorch.aot_autograd import setup_stacktrace_preservation_hooks
             import torch.fx.traceback as fx_traceback
             setup_stacktrace_preservation_hooks([loss.grad_fn])
-            with fx_traceback.override_stack_trace():
+            with fx_traceback.preserve_node_meta():
                 loss.backward()
             return x.grad
 
@@ -583,6 +583,21 @@ def forward(self, arg0_1):
     return diagonal_scatter
     """)
 
+    def test_channels_last_contiguous(self):
+        def f(x):
+            return x.contiguous(memory_format=torch.channels_last)
+            tmp = torch.ones(2)
+            y = x.diagonal()
+            y.add_(tmp)
+            return x
+        x = torch.randn(4, 8, 8, 3).permute(0, 3, 1, 2)
+        self.assert_functionalization(f, x)
+        logs = self.get_logs(f, x).strip()
+        # There should be no clone in the graph
+        self.assertExpectedInline(logs, """\
+def forward(self, arg0_1):
+    return arg0_1""")
+
     def test_split(self):
         def f(x):
             # test: view ops that return multiple tensors (split)
@@ -1189,6 +1204,14 @@ def forward(self, arg0_1):
     return as_strided_3
     """)
 
+    def test_resize_same_size_diff_rank(self):
+        def f(x):
+            y = x.clone()
+            y.resize_(25, 5)
+            return y
+
+        self.assert_functionalization(f, torch.ones(5, 5, 5))
+
     def test_resize_larger_valid(self):
         def f(x):
             y = x + 1
@@ -1427,7 +1450,7 @@ def forward(self, arg0_1, arg1_1, arg2_1):
     def test_batch_norm(self):
         def f(x, running_mean, running_var):
             with enable_python_dispatcher():
-                return torch.batch_norm(x, None, None, running_mean, running_var, False, 0.1, 1e-5, False)
+                return torch.batch_norm(x, None, None, running_mean, running_var, True, 0.1, 1e-5, False)
 
         self.assert_functionalization(f, torch.randn(20, 100, 35, 45), torch.zeros(100), torch.ones(100))
         logs = self.get_logs(f, torch.randn(20, 100, 35, 45), torch.zeros(100), torch.ones(100))
@@ -1437,7 +1460,7 @@ def f(x, running_mean, running_var):
 
 def forward(self, arg0_1, arg1_1, arg2_1):
     empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
-    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(arg0_1, None, None, arg1_1, arg2_1, False, 0.1, 1e-05);  arg0_1 = None
+    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(arg0_1, None, None, arg1_1, arg2_1, True, 0.1, 1e-05);  arg0_1 = None
     getitem = _native_batch_norm_legit_functional[0]
     getitem_1 = _native_batch_norm_legit_functional[1]
     getitem_2 = _native_batch_norm_legit_functional[2]
@@ -1457,7 +1480,7 @@ def forward(self, arg0_1, arg1_1, arg2_1):
 
 def forward(self, arg0_1, arg1_1, arg2_1):
     empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
-    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(arg0_1, None, None, arg1_1, arg2_1, False, 0.1, 1e-05);  arg0_1 = None
+    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(arg0_1, None, None, arg1_1, arg2_1, True, 0.1, 1e-05);  arg0_1 = None
     getitem = _native_batch_norm_legit_functional[0]
     getitem_1 = _native_batch_norm_legit_functional[1]
     getitem_2 = _native_batch_norm_legit_functional[2]
diff --git a/test/test_fx.py b/test/test_fx.py
index bb31befbf75c..49ea19a88a12 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -45,11 +45,10 @@
 from fx.test_pass_infra import TestPassManager  # noqa: F401
 from fx.test_common_passes import TestCommonPass  # noqa: F401
 from fx.test_cse_pass import TestCSEPass  # noqa: F401
+from fx.test_matcher_utils import TestMatcher  # noqa: F401
 
-if sys.version_info >= (3, 7):
-    from fx.test_gradual_type import AnnotationsTest  # noqa: F401
-if sys.version_info >= (3, 7):
-    from fx.test_gradual_type import TypeCheckerTest  # noqa: F401
+from fx.test_gradual_type import AnnotationsTest  # noqa: F401
+from fx.test_gradual_type import TypeCheckerTest  # noqa: F401
 from typing import Any, Callable, Dict, NamedTuple, List, Optional, Tuple, Union
 from torch.testing._internal.common_utils import (
     IS_FBCODE,
@@ -148,7 +147,7 @@ def _custom_fx_repr_fn(self) -> str:
         return f"Pair(x={_format_arg(self.x)}, y={_format_arg(self.y)})"
 
 # for testing pytrees
-class Foo(object):  # noqa: B209
+class Foo:  # noqa: B209
     def __init__(self, a, b):
         self.a = a
         self.b = b
@@ -443,7 +442,7 @@ def test_wrap_with_submodule(self):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.batchnorm1d = torch.nn.BatchNorm1d(2, affine=False)
 
             def forward(self, x: torch.Tensor):
@@ -1599,8 +1598,8 @@ def forward(self, x):
             if node.op == 'output':
                 output_shape = node.args[0].meta['tensor_meta'].shape
                 output_stride = node.args[0].meta['tensor_meta'].stride
-        self.assertEqual(opcodes, set(['placeholder', 'get_attr', 'call_function', 'call_method',
-                                       'call_module', 'output']))
+        self.assertEqual(opcodes, {'placeholder', 'get_attr', 'call_function', 'call_method',
+                                   'call_module', 'output'})
 
         # Test shape propagation and make sure results match actual
         self.assertEqual(output_shape, ref_out.shape)
@@ -1721,6 +1720,31 @@ def forward(self, x):
         stack_list = list(mod_stack.items())
         self.assertEqual(stack_list, expected_stack)
 
+    def test_transformer_preserves_nn_module_stack_for_get_attr(self):
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.weight = torch.nn.Parameter(torch.ones(1, 1))
+
+            def forward(self, x):
+                return self.weight + x
+
+        tracer = torch.fx.Tracer()
+        graph = tracer.trace(M())
+        gm = GraphModule(tracer.root, graph)
+        for node in gm.graph.nodes:
+            if node.op == 'get_attr':
+                node.meta["nn_module_stack"] = "self"
+                node.meta["stack_trace"] = "stack_trace"
+                node.meta["source_fn"] = "source_fn"
+        new_gm = Transformer(gm).transform()
+        for node in new_gm.graph.nodes:
+            if node.op == 'get_attr':
+                self.assertEqual(node.meta["nn_module_stack"], "self")
+                self.assertEqual(node.meta["stack_trace"], "stack_trace")
+                self.assertEqual(node.meta["source_fn"], "source_fn")
+
+
     def test_interpreter(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
@@ -1833,8 +1857,8 @@ def test_interpreter_gc_values(self):
         interp = Interpreter(symbolic_trace(rn18))
         inp = torch.rand(5, 3, 224, 224)
         out = interp.run(inp)
-        env_key_names = set(n.name for n in interp.env.keys())
-        self.assertEqual(env_key_names, set(['output']))
+        env_key_names = {n.name for n in interp.env.keys()}
+        self.assertEqual(env_key_names, {'output'})
 
     def test_interpreter_default_args(self):
         class Model(torch.nn.Module):
@@ -1974,9 +1998,6 @@ def do_nothing():
             yield
 
         class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             @do_nothing()
             def forward(self, x):
                 return torch.relu(x)
@@ -1995,9 +2016,6 @@ def test_typename_print(self):
 
     def test_layout(self):
         class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 return torch.empty_like(x, layout=torch.strided, pin_memory=False).fill_(0)
 
@@ -2007,9 +2025,6 @@ def forward(self, x):
 
     def test_ellipsis(self):
         class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y):
                 return x + y[:, 1:10, ...]
 
@@ -2053,7 +2068,7 @@ def test_deepcopy_recursion_depth(self):
 
         for orig_node, new_node in zip(g.nodes, copied_graph.nodes):
             orig_users = set(orig_node.users.keys())
-            orig_users_equiv = set(val_map[u] for u in orig_users)
+            orig_users_equiv = {val_map[u] for u in orig_users}
             new_users = set(new_node.users.keys())
             self.assertEqual(orig_users_equiv, new_users)
 
@@ -2231,7 +2246,7 @@ def test_find_uses(self):
 
         users_of_x = x.node.users
         self.assertEqual(len(users_of_x), 3)
-        expected_ops = set(['relu', 'add', 'neg'])
+        expected_ops = {'relu', 'add', 'neg'}
         for use in users_of_x:
             assert any(use.name.startswith(prefix) for prefix in expected_ops)
 
@@ -2381,9 +2396,6 @@ def forward(self, x):
 
     def test_single_default_arg(self):
         class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, y=1):
                 return y
 
@@ -2393,9 +2405,6 @@ def forward(self, y=1):
 
     def test_multiple_default_args(self):
         class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, y=1, z=2):
                 return y + z
 
@@ -2406,9 +2415,6 @@ def forward(self, y=1, z=2):
 
     def test_regular_and_default_args(self):
         class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x, y=1):
                 return x + y
 
@@ -2418,9 +2424,6 @@ def forward(self, x, y=1):
 
     def test_string_literal_return(self):
         class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self):
                 return "foo"
 
@@ -2448,7 +2451,7 @@ def test_torchbind_class_attribute_in_fx(self):
 
         class FooBar1234(torch.nn.Module):
             def __init__(self):
-                super(FooBar1234, self).__init__()
+                super().__init__()
                 self.f = torch.classes._TorchScriptTesting._StackString(["3", "4"])
 
             def forward(self):
@@ -2463,7 +2466,7 @@ def test_torchbind_class_attribute_in_fx_tensor_arg(self):
 
         class FooBar2341(torch.nn.Module):
             def __init__(self):
-                super(FooBar2341, self).__init__()
+                super().__init__()
                 self.f = torch.classes._TorchScriptTesting._ReLUClass()
 
             def forward(self, x):
@@ -2615,7 +2618,7 @@ def forward(self, x):
     def test_snake_case(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.activations = torch.nn.ModuleDict([
                     ["snake_case", torch.nn.ReLU()],
                     ["PascalCase", torch.nn.LeakyReLU()],
@@ -2681,7 +2684,7 @@ def f_higher(a, f):
     def test_custom_traceback_raised_when_exception_source_is_graphmodule(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.W = torch.nn.Parameter(torch.randn(5))
 
             def forward(self, x):
@@ -2893,7 +2896,7 @@ def to_trace(y):
     def test_ast_rewriter_wrap_with_submodule(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.batchnorm1d = torch.nn.BatchNorm1d(2, affine=False)
 
             def forward(self, x: torch.Tensor):
@@ -2912,7 +2915,7 @@ def forward(self, x: torch.Tensor):
     def test_submodule_manipulation_API(self):
         class C(torch.nn.Module):
             def __init__(self):
-                super(C, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(16, 33, 3, stride=2)
                 self.param = torch.nn.Parameter(torch.rand(2, 3))
 
@@ -2921,7 +2924,7 @@ def forward(self, x):
 
         class B(torch.nn.Module):
             def __init__(self):
-                super(B, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(100, 200)
                 self.register_buffer("buf", torch.randn(2, 3))
                 self.net_c = C()
@@ -2931,7 +2934,7 @@ def forward(self, x):
 
         class A(torch.nn.Module):
             def __init__(self):
-                super(A, self).__init__()
+                super().__init__()
                 self.net_b = B()
                 self.param = torch.nn.Parameter(torch.rand(2, 3))
 
@@ -3336,6 +3339,7 @@ def test_annotation_with_future(self):
         finally:
             del sys.modules["__future__"]
 
+    @unittest.skipIf(sys.version_info > (3, 11), "Does not work in 3.11")
     def test_annotations_empty_tuple(self):
         class Foo(torch.nn.Module):
             def forward(self, x: Tuple[()], y: Tuple[str, Tuple[()]]):
@@ -3601,6 +3605,12 @@ def test_deepcopy_graphmodule(self):
         copy_m = copy.deepcopy(m)
         self.assertEqual(copy_m.meta['hello'], 'world')
 
+    def test_deepcopy_no_recursion(self):
+        m = symbolic_trace(SimpleTest())
+        m.meta['hello'] = m  # circular reference
+        copy_m = copy.deepcopy(m)  # finishes
+        self.assertEqual(id(copy_m), id(copy_m.meta['hello']))
+
 
 def run_getitem_target():
     from torch.fx._symbolic_trace import _wrapped_methods_to_patch
@@ -3871,7 +3881,7 @@ def check_symbols_have_bc_designation(m, prefix):
                     continue
                 if isinstance(v, types.ModuleType):
                     check_symbols_have_bc_designation(v, prefix + [k])
-                elif isinstance(v, type) or isinstance(v, types.FunctionType):
+                elif isinstance(v, (type, types.FunctionType)):
                     if v not in _MARKED_WITH_COMATIBLITY:
                         non_back_compat_objects.setdefault(v)
 
@@ -3955,6 +3965,7 @@ def tearDown(self):
         "relu_": BUILT_IN_FUNC,
         "rrelu_": BUILT_IN_FUNC,
         "selu_": BUILT_IN_FUNC,
+        "scaled_dot_product_attention": BUILT_IN_FUNC,
         "softplus": BUILT_IN_FUNC,
         "softshrink": BUILT_IN_FUNC,
         "threshold_": BUILT_IN_FUNC,
@@ -4117,7 +4128,7 @@ def generate_test_func(cls, func_name, fn):
 
         def functional_test(self):
             if func_name in self.UNTRACEABLE_FUNCTIONALS_PY38 and \
-                    sys.version_info >= (3, 8) and sys.version_info < (3, 11):
+                    sys.version_info >= (3, 8) and sys.version_info < (3, 12):
                 exc, err = self.UNTRACEABLE_FUNCTIONALS_PY38[func_name]
                 with self.assertRaisesRegex(exc, err):
                     symbolic_trace(fn)
diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py
index e94c1bc7cc44..e933fe0c088b 100644
--- a/test/test_fx_experimental.py
+++ b/test/test_fx_experimental.py
@@ -247,7 +247,7 @@ def create_mlp(self, num_of_layers: int, input_size: int, output_size: int):
                 return layers
 
             def __init__(self):
-                super(MyRecommendationModule, self).__init__()
+                super().__init__()
                 layers = self.create_mlp(4, 4, 4)
                 self.bottom_layers = torch.nn.Sequential(*layers)
                 layers = self.create_mlp(3, 24, 24)
@@ -301,7 +301,7 @@ def forward(self, a, b, offset):
     def test_partition_latency(self):
         class TestModule(torch.nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(4, 4)
 
             def forward(self, a):
@@ -420,7 +420,7 @@ def get_node_to_latency_mapping(fx_module: GraphModule):
     def test_aot_based_partition(self):
         class TestModule(torch.nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.b = torch.rand(4)
                 self.c = torch.rand(4)
 
@@ -479,7 +479,7 @@ def forward(self, a, b):
     def test_saturate_host(self):
         class TestModule(torch.nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(4, 4)
 
             def forward(self, a):
@@ -535,7 +535,7 @@ def test_conv_bn_fusion(self):
     def test_conv_bn_fusion_not_running_state(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(32, 64, 3, stride=2)
                 self.bn = torch.nn.BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=False)
 
@@ -873,7 +873,7 @@ def is_leaf_module(
             ) -> bool:
                 # `leaves` contains the set of standard `nn.Modules` that are not
                 # currently symbolically traceable. Ideally this set would be empty
-                leaves = set([torch.nn.BatchNorm2d])
+                leaves = {torch.nn.BatchNorm2d}
                 return type(m) in leaves
 
         traced = torch.fx.GraphModule(m, FunctionalTracer().trace(m))
@@ -987,9 +987,6 @@ def forward(self, {params}):
 
     def test_normalize_args_preserve_meta(self):
         class MyModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, a):
                 return torch.add(a, 3)
 
@@ -1057,7 +1054,7 @@ def is_leaf_module(
             ) -> bool:
                 # `leaves` contains the set of standard `nn.Modules` that are not
                 # currently symbolically traceable. Ideally this set would be empty
-                leaves = set([torch.nn.BatchNorm2d])
+                leaves = {torch.nn.BatchNorm2d}
                 return type(m) in leaves
 
         traced_functionals = torch.fx.GraphModule(m, FunctionalTracer().trace(m))
@@ -1190,7 +1187,7 @@ def foo(x):
     def test_to_folder(self):
         class Test(torch.nn.Module):
             def __init__(self):
-                super(Test, self).__init__()
+                super().__init__()
                 self.W = torch.nn.Parameter(torch.randn(2))
                 self.seq = torch.nn.Sequential(torch.nn.BatchNorm1d(2, 2))
                 self.linear = torch.nn.Linear(2, 2)
@@ -1393,13 +1390,13 @@ def forward(self, x):
 
     def test_type_matches(self):
         should_be_equal = [
-            (int, type(5)),
-            (numbers.Number, type(5)),
-            (numbers.Number, type(5.0)),
+            (int, int),
+            (numbers.Number, int),
+            (numbers.Number, float),
             (int, type(torch.float)),
-            (Union[int, float], type(5)),
-            (Union[int, float], type(5.0)),
-            (List[int], type(5)),
+            (Union[int, float], int),
+            (Union[int, float], float),
+            (List[int], int),
             (List[int], create_type_hint([int, int])),
             (List[int], create_type_hint((int, int))),
             (List[torch.Tensor], create_type_hint([torch.Tensor, torch.Tensor])),
@@ -1511,7 +1508,7 @@ class TestNormalizeOperators(JitTestCase):
     @ops(op_db, allowed_dtypes=(torch.float,))
     def test_normalize_operator_exhaustive(self, device, dtype, op):
         # These ops currently don't trace in FX for various reasons (i.e. they take a list of tensors)
-        fx_fail = {"cat", "stack", "hstack", "vstack", "dstack", "linalg.multi_dot"}
+        fx_fail = {"cat", "stack", "hstack", "vstack", "dstack", "linalg.multi_dot", "_upsample_bilinear2d_aa"}
         sample_inputs_itr = op.sample_inputs(device, dtype, requires_grad=False)
         if isinstance(op.op, torch._ops.OpOverload):
             self.skipTest("normalize operator doesn't work on torch.ops")
diff --git a/test/test_indexing.py b/test/test_indexing.py
index 5dc23a3d5465..38bddda44690 100644
--- a/test/test_indexing.py
+++ b/test/test_indexing.py
@@ -15,7 +15,7 @@
     TestCase, run_tests, TEST_WITH_TORCHDYNAMO)
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCUDA, dtypes, dtypesIfCPU, dtypesIfCUDA,
-    onlyNativeDeviceTypes)
+    onlyNativeDeviceTypes, skipXLA)
 
 
 class TestIndexing(TestCase):
@@ -911,6 +911,13 @@ def test_index_ind_dtype(self, device):
             torch.index_put_(inp_res, (ind_int, ind_int), src, accum)
             self.assertEqual(inp_ref, inp_res)
 
+    @skipXLA
+    def test_index_put_accumulate_empty(self, device):
+        # Regression test for https://github.com/pytorch/pytorch/issues/94667
+        input = torch.rand([], dtype=torch.float32, device=device)
+        with self.assertRaises(RuntimeError):
+            input.index_put([], torch.tensor([1.0], device=device), True)
+
     def test_multiple_byte_mask(self, device):
         v = torch.randn(5, 7, 3, device=device)
         # note: these broadcast together and are transposed to the first dim
@@ -1582,6 +1589,15 @@ def test_broadcast_subspace(self, device):
         expected = b.float().unsqueeze(1).expand(100, 100)
         self.assertEqual(a, expected)
 
+    def test_truncate_leading_1s(self, device):
+        col_max = torch.randn(1, 4)
+        kernel = col_max.T * col_max  # [4, 4] tensor
+        kernel2 = kernel.clone()
+        # Set the diagonal
+        kernel[range(len(kernel)), range(len(kernel))] = torch.square(col_max)
+        torch.diagonal(kernel2).copy_(torch.square(col_max.view(4)))
+        self.assertEqual(kernel, kernel2)
+
 instantiate_device_type_tests(TestIndexing, globals(), except_for='meta')
 instantiate_device_type_tests(NumpyTests, globals(), except_for='meta')
 
diff --git a/test/test_itt.py b/test/test_itt.py
index b43df322a51a..99841e1932d5 100644
--- a/test/test_itt.py
+++ b/test/test_itt.py
@@ -10,12 +10,6 @@
 
 @unittest.skipIf(not torch.profiler.itt.is_available(), "ITT is required")
 class TestItt(TestCase):
-    def setUp(self):
-        super(TestItt, self).setUp()
-
-    def tearDown(self):
-        super(TestItt, self).tearDown()
-
     def test_itt(self):
         # Just making sure we can see the symbols
         torch.profiler.itt.range_push("foo")
diff --git a/test/test_jit.py b/test/test_jit.py
index d054fc7c59c4..e54ece07b625 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -14,6 +14,7 @@
 from jit.test_backend_nnapi import TestNnapiBackend  # noqa: F401
 from jit.test_list_dict import TestList, TestDict, TestNamedTuple, TestScriptDict, TestScriptList  # noqa: F401
 from jit.test_async import TestAsync  # noqa: F401
+from jit.test_await import TestAwait  # noqa: F401
 from jit.test_data_parallel import TestDataParallel  # noqa: F401
 from jit.test_models import TestModels  # noqa: F401
 from jit.test_modules import TestModules  # noqa: F401
@@ -338,9 +339,11 @@ def _sum_of_list(tensorlist):
 # has to be at top level or Pickle complains
 class FooToPickle(torch.nn.Module):
     def __init__(self):
-        super(FooToPickle, self).__init__()
+        super().__init__()
         self.bar = torch.jit.ScriptModule()
 
+
+@skipIfTorchDynamo()
 class TestJit(JitTestCase):
     @unittest.skip("Requires a lot of RAM")
     def test_big(self):
@@ -395,7 +398,7 @@ def fn(x: torch.Tensor) -> torch.Tensor:
     def test_restore_device(self):
         class M(torch.jit.ScriptModule):
             def __init__(self, cpu_device_str):
-                super(M, self).__init__()
+                super().__init__()
                 self.p0 = nn.Parameter(torch.tensor([0.3], dtype=torch.float,
                                                     device=cpu_device_str))
                 self.b0 = torch.tensor([0.9], dtype=torch.float,
@@ -413,7 +416,7 @@ def __init__(self, cpu_device_str):
     def test_restore_device_cuda(self):
         class MyModule(torch.jit.ScriptModule):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.register_buffer('b0', torch.randn(1, 3))
                 self.p0 = nn.Parameter(torch.randn(2, 3))
 
@@ -467,7 +470,7 @@ def forward(self, x):
     def test_restore_shared_storage_on_cuda(self):
         class Foo(torch.jit.ScriptModule):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 whole_tensor = torch.randn(4, 5, dtype=torch.float, device='cpu')
                 self.p0 = nn.Parameter(whole_tensor.narrow(0, 0, 1))
                 self.register_buffer('b0', whole_tensor.narrow(0, 3, 1))
@@ -485,7 +488,7 @@ def __init__(self):
     def test_add_relu_fusion(self):
         class M(torch.nn.Module):
             def __init__(self, relu_op):
-                super(M, self).__init__()
+                super().__init__()
                 self.relu_op = relu_op
 
             def forward(self, a, b, c):
@@ -532,7 +535,7 @@ def forward(self, a, b, c):
 
         class Madd_(torch.nn.Module):
             def __init__(self, relu_op):
-                super(Madd_, self).__init__()
+                super().__init__()
                 self.relu_op = relu_op
 
             def forward(self, a, b):
@@ -566,7 +569,7 @@ def forward(self, a, b):
 
         class Madd_out(torch.nn.Module):
             def __init__(self, relu_op):
-                super(Madd_out, self).__init__()
+                super().__init__()
                 self.relu_op = relu_op
 
             def forward(self, a, b):
@@ -833,9 +836,6 @@ def foo(x):
             return x + 2
 
         class Mod(nn.Module):
-            def __init__(self):
-                super(Mod, self).__init__()
-
             def forward(self, t):
                 return t + 2
 
@@ -887,7 +887,7 @@ def get_element_size_script(x):
     def test_Sequential(self):
         class Seq(nn.Module):
             def __init__(self):
-                super(Seq, self).__init__()
+                super().__init__()
                 self.seq = nn.Sequential(nn.Linear(10, 20), nn.Linear(20, 30))
 
             @torch.jit.script_method
@@ -902,7 +902,7 @@ def forward(self, x):
     def test_ModuleList(self):
         class Mod(nn.Module):
             def __init__(self):
-                super(Mod, self).__init__()
+                super().__init__()
                 self.model = nn.ModuleList([nn.Linear(10, 10) for _ in range(10)])
                 self.model += (nn.Linear(10, 20),)
                 self.model.append(nn.Linear(20, 30))
@@ -948,7 +948,7 @@ def forward(self, input):
 
         class MyModule(torch.jit.ScriptModule):
             def __init__(self, module):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.module = module
 
             @torch.jit.script_method
@@ -1398,7 +1398,7 @@ def test_pattern_based_module_rewrite(self):
         # Check match::module behavior
         class Test(torch.nn.Module):
             def __init__(self):
-                super(Test, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(1, 20, 5, 1)
                 self.bn = torch.nn.BatchNorm2d(num_features=20)
 
@@ -1422,9 +1422,6 @@ def forward(self, x):
 
     def test_pattern_based_rewrite_with_source_range_preserved(self):
         class TestModule1(torch.nn.Module):
-            def __init__(self):
-                super(TestModule1, self).__init__()
-
             def forward(self, x, y, z, w):
                 x = x + y
                 x = x * z
@@ -1454,9 +1451,6 @@ def forward(self, x, y, z, w):
         self.assertTrue(source_range_1 == source_range_2)
 
         class TestModule2(torch.nn.Module):
-            def __init__(self):
-                super(TestModule2, self).__init__()
-
             def forward(self, x, y, z, w):
                 x = x + y
                 x = x + z
@@ -1819,7 +1813,7 @@ def test_dropout_module_requires_grad(self):
         with enable_profiling_mode_for_profiling_tests():
             class MyModule(torch.nn.Module):
                 def __init__(self, M):
-                    super(MyModule, self).__init__()
+                    super().__init__()
                     self.dropout = torch.nn.Dropout(0.5)
                     self.linear = torch.nn.Linear(M, M)
 
@@ -2072,7 +2066,7 @@ def addmm(mat, mat1, mat2, alpha, beta):
     def test_sparse_tensors(self):
         @torch.jit.ignore
         def get_sparse():
-            return torch.sparse.FloatTensor(2, 3)
+            return torch.sparse_coo_tensor((2, 3), dtype=torch.float32)
 
         @torch.jit.script
         def test_is_sparse(input):
@@ -2450,7 +2444,7 @@ def func():
     def test_cuda_export_restore(self):
         class Sub(torch.jit.ScriptModule):
             def __init__(self):
-                super(Sub, self).__init__()
+                super().__init__()
                 self.weight = nn.Parameter(torch.randn(3, 4))
 
             @torch.jit.script_method
@@ -2459,7 +2453,7 @@ def forward(self, thing):
 
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.mod = Sub()
 
             @torch.jit.script_method
@@ -2493,7 +2487,7 @@ def test_export_rnn(self):
         for clazz in [nn.RNN(10, 20, 2), nn.GRU(10, 20, 2)]:
             class RNNTest(torch.nn.Module):
                 def __init__(self):
-                    super(RNNTest, self).__init__()
+                    super().__init__()
                     self.rnn = clazz
 
                 def forward(self, x, lengths, h0):
@@ -2515,7 +2509,7 @@ def forward(self, x, lengths, h0):
     def test_export_lstm(self):
         class LSTMTest(torch.nn.Module):
             def __init__(self):
-                super(LSTMTest, self).__init__()
+                super().__init__()
                 self.rnn = nn.LSTM(10, 20, 2)
 
             def forward(self, x, lengths, hiddens):
@@ -2538,7 +2532,7 @@ def forward(self, x, lengths, hiddens):
     def test_unique_state_dict(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 shared_param = torch.nn.Parameter(torch.ones(1))
                 self.register_parameter('w1', shared_param)
                 self.register_parameter('w2', shared_param)
@@ -2649,9 +2643,6 @@ def foo(a):
     def test_import_method(self):
         with torch._jit_internal._disable_emit_hooks():
             class Foo(torch.jit.ScriptModule):
-                def __init__(self):
-                    super(Foo, self).__init__()
-
                 @torch.jit.script_method
                 def forward(self, x, y):
                     return 2 * x + y
@@ -2668,7 +2659,7 @@ def forward(self, x, y):
     def test_non_ascii_string(self):
         class Foo(torch.jit.ScriptModule):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.a = "Over \u0e55\u0e57 57"
 
             @torch.jit.script_method
@@ -2748,9 +2739,6 @@ def test_module_default_values(self):
         four = torch.tensor(4)
 
         class Test(torch.jit.ScriptModule):
-            def __init__(self):
-                super(Test, self).__init__()
-
             @torch.jit.script_method
             def forward(self, input, other=four):
                 return input + other
@@ -2822,9 +2810,6 @@ def fn(x):
     @unittest.skipIf(True, "TODO: re-enable with https://github.com/pytorch/pytorch/pull/29339")
     def test_torch_load_error(self):
         class J(torch.jit.ScriptModule):
-            def __init__(self):
-                super(J, self).__init__()
-
             @torch.jit.script_method
             def forward(self, input):
                 return input + 100
@@ -2886,9 +2871,6 @@ def lstm(x, hx, cx, w_ih, w_hh, b_ih, b_hh):
 
     def test_export_opnames(self):
         class Foo(torch.jit.ScriptModule):
-            def __init__(self):
-                super(Foo, self).__init__()
-
             def one(self, x, y):
                 # type: (Tensor, Tensor) -> Tensor
                 return x + y
@@ -2904,7 +2886,7 @@ def forward(self, x):
 
         class Bar(torch.jit.ScriptModule):
             def __init__(self):
-                super(Bar, self).__init__()
+                super().__init__()
                 self.sub = Foo()
 
             @torch.jit.script_method
@@ -3002,19 +2984,17 @@ def foo(x):
         self.assertRegex(graph.__repr__(), source_range_regex)
 
 
+@skipIfTorchDynamo()
 class TestFrontend(JitTestCase):
 
     def test_instancing_error(self):
         @torch.jit.ignore
-        class MyScriptClass(object):
+        class MyScriptClass:
             def unscriptable(self):
                 return "a" + 200
 
 
         class TestModule(torch.nn.Module):
-            def __init__(self):
-                super(TestModule, self).__init__()
-
             def forward(self, x):
                 return MyScriptClass()
 
@@ -3028,16 +3008,10 @@ def forward(self, x):
 
     def test_dictionary_as_example_inputs_for_jit_trace(self):
         class TestModule_v1(torch.nn.Module):
-            def __init__(self):
-                super(TestModule_v1, self).__init__()
-
             def forward(self, key2=None, key3=None, key4=None, key5=None, key1=None, key6=None):
                 return key1 + key2 + key3
 
         class TestModule_v2(torch.nn.Module):
-            def __init__(self):
-                super(TestModule_v2, self).__init__()
-
             def forward(self, x, y):
                 return x + y
 
@@ -3067,6 +3041,7 @@ def test_func(x, y):
             res_2 = traced_model_2(**{'x': torch.rand([2]), 'z': torch.rand([2])})
 
 
+@skipIfTorchDynamo()
 class TestScript(JitTestCase):
 
     # Tests that calling torch.jit.script repeated on function is allowed.
@@ -3095,16 +3070,13 @@ def foo(x):
             return torch.add(x, x)
 
         class MyNestedMod(torch.nn.Module):
-            def __init__(self):
-                super(MyNestedMod, self).__init__()
-
             def forward(self, x):
                 return torch.sub(x, x)
 
 
         class MyMod(torch.nn.Module):
             def __init__(self):
-                super(MyMod, self).__init__()
+                super().__init__()
                 self.nested = MyNestedMod()
 
             def forward(self, x):
@@ -3123,9 +3095,6 @@ def test_static_method_on_module(self):
         Check that the `@staticmethod` annotation on a function on a module works.
         """
         class MyCell(torch.nn.Module):
-            def __init__(self):
-                super(MyCell, self).__init__()
-
             @staticmethod
             def do_it(x, h):
                 new_h = torch.tanh(x + h)
@@ -3152,9 +3121,6 @@ def foo(x=torch.ones(1)):
             return x
 
         class Moddy(torch.nn.Module):
-            def __init__(self):
-                super(Moddy, self).__init__()
-
             def forward(self, x):
                 return foo()
 
@@ -3173,9 +3139,6 @@ def foo(x=torch.ones(1)):
             return x
 
         class Moddy(torch.nn.Module):
-            def __init__(self):
-                super(Moddy, self).__init__()
-
             def forward(self, x):
                 return foo()
 
@@ -3386,7 +3349,7 @@ def fct_loop(z, size):
     def test_ignored_method_binding(self):
         class Bar(torch.nn.Module):
             def __init__(self):
-                super(Bar, self).__init__()
+                super().__init__()
                 self.x : int = 0
 
             @torch.jit.export
@@ -3416,7 +3379,7 @@ class A(torch.nn.Module):
             __annotations__ = {"x": Optional[torch.Tensor]}
 
             def __init__(self):
-                super(A, self).__init__()
+                super().__init__()
                 self.x = None
 
             @torch.jit.ignore
@@ -3439,7 +3402,7 @@ class M(torch.jit.ScriptModule):
             __constants__ = ["foo"]
 
             def __init__(self, foo):
-                super(M, self).__init__()
+                super().__init__()
                 self.foo = foo
 
         m = M(5)
@@ -3453,7 +3416,7 @@ class M(torch.jit.ScriptModule):
             FOO = 0
 
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.foo = self.FOO
         m = M()
         self.assertEqual(m.foo, M.FOO)
@@ -3462,9 +3425,6 @@ def test_class_attribute_in_script(self):
         class M(torch.jit.ScriptModule):
             FOO = 0
 
-            def __init__(self):
-                super(M, self).__init__()
-
             @torch.jit.script_method
             def forward(self):
                 return self.FOO
@@ -3481,7 +3441,7 @@ def __init__(self):
     def test_attribute_in_init(self):
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.foo = torch.jit.Attribute(0.1, float)
                 # we should be able to use self.foo as a float here
                 assert 0.0 < self.foo
@@ -3490,7 +3450,7 @@ def __init__(self):
     def test_scriptable_fn_as_attr(self):
         class M(torch.nn.Module):
             def __init__(self, fn):
-                super(M, self).__init__()
+                super().__init__()
                 self.fn = fn
 
             def forward(self, x):
@@ -3542,9 +3502,6 @@ def fn2():
         FileCheck().check("NamedTuple").run(fn2.graph)
 
         class MyMod(torch.nn.Module):
-            def __init__(self):
-                super(MyMod, self).__init__()
-
             @torch.jit.unused
             def fn(self):
                 # type: () -> MyTuple
@@ -3561,9 +3518,6 @@ def forward(self, x):
 
     def test_unused_decorator(self):
         class MyMod(torch.nn.Module):
-            def __init__(self):
-                super(MyMod, self).__init__()
-
             @torch.jit.unused
             @torch.no_grad()
             def fn(self, x):
@@ -3742,16 +3696,10 @@ def _test(m):
             self.assertFalse(loaded._c.getattr('training'))
 
         class M(nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, x):
                 return self.training
 
         class OldM(torch.jit.ScriptModule):
-            def __init__(self):
-                super(OldM, self).__init__()
-
             @torch.jit.script_method
             def forward(self, x):
                 return self.training
@@ -3761,17 +3709,11 @@ def forward(self, x):
 
     def test_inherit_method(self):
         class A(torch.jit.ScriptModule):
-            def __init__(self):
-                super(A, self).__init__()
-
             @torch.jit.script_method
             def forward(self, x):
                 return x + self.bar(x)
 
         class B(A):
-            def __init__(self):
-                super(B, self).__init__()
-
             @torch.jit.script_method
             def bar(self, x):
                 return x * x
@@ -3784,16 +3726,13 @@ def bar(self, x):
         self.assertEqual(b(v), v + v * v)
 
         class C(torch.jit.ScriptModule):
-            def __init__(self):
-                super(C, self).__init__()
-
             @torch.jit.script_method
             def bar(self, x):
                 return x
 
         class D(C, B):
             def __init__(self):
-                super(D, self).__init__()
+                super().__init__()
 
         self.assertEqual(D()(v), v + v)
 
@@ -3821,7 +3760,7 @@ def check_subclass_warn(input: torch.LongTensor) -> torch.LongTensor:
     def test_first_class_module(self):
         class Foo(torch.jit.ScriptModule):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.foo = nn.Parameter(torch.rand(3, 4))
 
             @torch.jit.script_method
@@ -3836,7 +3775,7 @@ def forward(self, input):
     @_tmp_donotuse_dont_inline_everything
     def test_first_class_calls(self):
         @torch.jit.script
-        class Foo(object):
+        class Foo:
             def __init__(self, x):
                 self.bar = x
 
@@ -3856,9 +3795,6 @@ def bar(x):
 
     def test_static_methods(self):
         class M(nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             @staticmethod
             def my_method(x):
                 return x + 100
@@ -3867,9 +3803,6 @@ def forward(self, x):
                 return x + M.my_method(x)
 
         class N(nn.Module):
-            def __init__(self):
-                super(N, self).__init__()
-
             @staticmethod
             def my_method(x):
                 return x * 100
@@ -3906,7 +3839,7 @@ def invalid_prefix_annotation3(a):
     def test_builtin_function_attributes(self):
         class Add(nn.Module):
             def __init__(self):
-                super(Add, self).__init__()
+                super().__init__()
                 self.add = torch.add
 
             def forward(self, input):
@@ -3923,7 +3856,7 @@ def f():
         t = node.outputsAt(0).type()
         self.assertIsNotNone(t)
 
-    @unittest.skipIf(IS_WINDOWS and sys.version_info >= (3, 8), 'TODO: need to fix the test case')
+    @unittest.skipIf(IS_WINDOWS, 'TODO: need to fix the test case')
     def test_unmatched_type_annotation(self):
         message1 = re.escape("Number of type annotations (2) did not match the number of function parameters (1):")
         message2 = 'def invalid2\\(a\\):\n\\s*~+\\.*\\s+<--- HERE\n\\s+# type: \\(Int, Int\\) -> Int\n\\s+return a \\+ 2'
@@ -4126,16 +4059,13 @@ def foo(x):
 
         class What(torch.jit.ScriptModule):
             def __init__(self, x):
-                super(What, self).__init__()
+                super().__init__()
                 self.foo = x
         a = What(foo)
         c = What(foo)
 
     def test_training_param(self):
         class What(torch.jit.ScriptModule):
-            def __init__(self):
-                super(What, self).__init__()
-
             @torch.jit.script_method
             def forward(self, x):
                 # type: (int) -> int
@@ -4156,13 +4086,13 @@ def forward(self, x):
 
     def test_class_as_attribute(self):
         @torch.jit.script
-        class Foo321(object):
+        class Foo321:
             def __init__(self):
                 self.x = 3
 
         class FooBar1234(torch.nn.Module):
             def __init__(self):
-                super(FooBar1234, self).__init__()
+                super().__init__()
                 self.f = Foo321()
 
             def forward(self, x):
@@ -4221,9 +4151,6 @@ def test_annoying_doubles(self):
 
         with torch._jit_internal._disable_emit_hooks():
             class Foo(torch.jit.ScriptModule):
-                def __init__(self):
-                    super(Foo, self).__init__()
-
                 @torch.jit.script_method
                 def forward(self):
                     return math.pi, 0.1, mod.inf, mod.ninf, 2.225073858507201e-308, mod.nan
@@ -4278,7 +4205,7 @@ def stuff(x):
 
     def test_nested_aug_assign(self):
         @torch.jit.script
-        class SomeClass(object):
+        class SomeClass:
             def __init__(self):
                 self.num = 99
 
@@ -4292,7 +4219,7 @@ def __eq__(self, other):
                 return self.num == other.num
 
         @torch.jit.script
-        class SomeOutOfPlaceClass(object):
+        class SomeOutOfPlaceClass:
             def __init__(self):
                 self.num = 99
 
@@ -4337,7 +4264,7 @@ def forward(self):
         self.assertEqual(a.child.list, sa.child.list)
 
         @torch.jit.script
-        class SomeNonAddableClass(object):
+        class SomeNonAddableClass:
             def __init__(self):
                 self.num = 99
 
@@ -4360,7 +4287,7 @@ def forward(self):
 
     def test_var_aug_assign(self):
         @torch.jit.script
-        class SomeNonAddableClass(object):
+        class SomeNonAddableClass:
             def __init__(self):
                 self.num = 99
 
@@ -4376,7 +4303,7 @@ def fn():
                 return a
 
         @torch.jit.script
-        class SomeClass(object):
+        class SomeClass:
             def __init__(self):
                 self.num = 99
 
@@ -4390,7 +4317,7 @@ def __eq__(self, other):
                 return self.num == other.num
 
         @torch.jit.script
-        class SomeOutOfPlaceClass(object):
+        class SomeOutOfPlaceClass:
             def __init__(self):
                 self.num = 99
 
@@ -4439,7 +4366,7 @@ def foobar(xyz):
             scripted = torch.jit.script(foobar)
 
     def test_file_line_error_class_defn(self):
-        class FooBar(object):
+        class FooBar:
             def baz(self, xyz):
                 return torch.blargh(xyz)
 
@@ -4644,16 +4571,13 @@ def test_circular_dependency(self):
         https://github.com/pytorch/pytorch/issues/25871
         """
         class A(torch.jit.ScriptModule):
-            def __init__(self):
-                super(A, self).__init__()
-
             @torch.jit.script_method
             def forward(self, x):
                 return x
 
         class B(torch.jit.ScriptModule):
             def __init__(self):
-                super(B, self).__init__()
+                super().__init__()
                 self.foo = torch.nn.ModuleList([A()])
 
             @torch.jit.script_method
@@ -4664,7 +4588,7 @@ def forward(self, x):
 
         class C(torch.jit.ScriptModule):
             def __init__(self):
-                super(C, self).__init__()
+                super().__init__()
                 self.foo = torch.nn.Sequential(B())
 
             @torch.jit.script_method
@@ -5266,7 +5190,7 @@ def func(x):
     def test_module_copy_with_attributes(self):
         class Vocabulary(torch.jit.ScriptModule):
             def __init__(self, vocab_list):
-                super(Vocabulary, self).__init__()
+                super().__init__()
                 self._vocab = torch.jit.Attribute(vocab_list, List[str])
                 self.some_idx = torch.jit.Attribute(2, int)
                 self.idx = torch.jit.Attribute(
@@ -6957,12 +6881,12 @@ def bar(c, b):
             return foo(c, b)
 
         @torch.jit.script
-        class Bar(object):
+        class Bar:
             def one(self, x, y):
                 return bar(x, y)
 
         @torch.jit.interface
-        class IFace(object):
+        class IFace:
             def one(self, x, y):
                 # type: (Tensor, Tensor) -> Tensor
                 pass
@@ -7210,7 +7134,7 @@ def func():
     def test_nested_select_assign(self):
         class SubSubModule(torch.nn.Module):
             def __init__(self):
-                super(SubSubModule, self).__init__()
+                super().__init__()
                 self.abc = 11
 
             def forward(self, x):
@@ -7218,7 +7142,7 @@ def forward(self, x):
 
         class SubModule(torch.nn.Module):
             def __init__(self):
-                super(SubModule, self).__init__()
+                super().__init__()
                 self.a = 11
                 self.nested = SubSubModule()
 
@@ -7227,7 +7151,7 @@ def forward(self, x):
 
         class TestModule(torch.nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.sub = SubModule()
                 self.hi = 1
 
@@ -7795,7 +7719,7 @@ def opt_func(x):
     def test_dropout_eval(self):
         class ScriptedConv2d(torch.jit.ScriptModule):
             def __init__(self, in_channels, out_channels, **kwargs):
-                super(ScriptedConv2d, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
                 self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
 
@@ -7807,7 +7731,7 @@ def forward(self, x):
 
         class ScriptMod(torch.jit.ScriptModule):
             def __init__(self):
-                super(ScriptMod, self).__init__()
+                super().__init__()
                 self.Conv2d_1a_3x3 = ScriptedConv2d(3, 32, kernel_size=3, stride=2)
 
             @torch.jit.script_method
@@ -7817,7 +7741,7 @@ def forward(self, x):
 
         class EagerConv2d(torch.nn.Module):
             def __init__(self, in_channels, out_channels, **kwargs):
-                super(EagerConv2d, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
                 self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
 
@@ -7828,7 +7752,7 @@ def forward(self, x):
 
         class EagerMod(torch.nn.Module):
             def __init__(self):
-                super(EagerMod, self).__init__()
+                super().__init__()
                 self.Conv2d_1a_3x3 = EagerConv2d(3, 32, kernel_size=3, stride=2)
 
             def forward(self, x):
@@ -8254,7 +8178,7 @@ def with_docstring(self, x):
     def test_script_module(self):
         class M1(torch.jit.ScriptModule):
             def __init__(self):
-                super(M1, self).__init__()
+                super().__init__()
                 self.weight = nn.Parameter(torch.randn(2))
 
             @torch.jit.script_method
@@ -8263,7 +8187,7 @@ def forward(self, thing):
 
         class PModule(nn.Module):
             def __init__(self):
-                super(PModule, self).__init__()
+                super().__init__()
                 self.a = nn.Parameter(torch.randn(2, 3))
 
             def forward(self, a):
@@ -8271,7 +8195,7 @@ def forward(self, a):
 
         class M2(torch.jit.ScriptModule):
             def __init__(self):
-                super(M2, self).__init__()
+                super().__init__()
                 # test submodule
                 self.sub = M1()
                 self.sub2 = PModule()
@@ -8823,7 +8747,7 @@ def test_bad_input():
     def test_script_module_call_noscript(self):
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.value = 1
 
             @torch.jit.ignore
@@ -8848,7 +8772,7 @@ def forward(self, input):
     def test_script_module_nochange_submodule(self):
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.sub = nn.Linear(5, 5)
 
             @torch.jit.script_method
@@ -8864,22 +8788,16 @@ def forward(self, input):
 
     def test_module_apis(self):
         class Sub(torch.nn.Module):
-            def __init__(self):
-                super(Sub, self).__init__()
-
             def forward(self, thing):
                 return thing - 2
 
         class Double(torch.nn.Module):
-            def __init__(self):
-                super(Double, self).__init__()
-
             def forward(self, thing):
                 return thing * 2
 
         class MyMod(torch.nn.Module):
             def __init__(self):
-                super(MyMod, self).__init__()
+                super().__init__()
                 self.mod = (Sub())
                 self.mod2 = (Sub())
                 self.mod3 = nn.Sequential(nn.Sequential(Sub()))
@@ -8918,7 +8836,7 @@ class M(torch.jit.ScriptModule):
             __constants__ = ['b', 'i', 'c', 's']
 
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.b = False
                 self.i = 1
                 self.c = 3.5
@@ -8937,9 +8855,6 @@ def forward(self):
 
     def test_script_module_fail_exist(self):
         class M(torch.jit.ScriptModule):
-            def __init__(self):
-                super(M, self).__init__()
-
             @torch.jit.script_method
             def forward(self, x):
                 return x + self.whatisgoingon
@@ -8950,7 +8865,7 @@ def forward(self, x):
     def test_script_module_none_exist_fail(self):
         class M(torch.jit.ScriptModule):
             def __init__(self, my_optional):
-                super(M, self).__init__()
+                super().__init__()
                 self.my_optional = my_optional
 
             @torch.jit.script_method
@@ -8968,7 +8883,7 @@ class Foo(torch.jit.ScriptModule):
             __constants__ = ['invalid']
 
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.invalid = [nn.Linear(3, 4)]
 
         with self.assertRaisesRegex(
@@ -8980,8 +8895,8 @@ class Foo2(torch.jit.ScriptModule):
             __constants__ = ['invalid']
 
             def __init__(self):
-                super(Foo2, self).__init__()
-                self.invalid = type(1)
+                super().__init__()
+                self.invalid = int
 
         with self.assertRaisesRegex(TypeError, "not a valid constant"):
             Foo2()
@@ -8990,7 +8905,7 @@ class Foo3(torch.jit.ScriptModule):
             __constants__ = ['invalid']
 
             def __init__(self):
-                super(Foo3, self).__init__()
+                super().__init__()
                 self.invalid = (3, 4, {})
 
         with self.assertRaisesRegex(TypeError, "not a valid constant"):
@@ -9000,7 +8915,7 @@ class Foo4(torch.jit.ScriptModule):
             __constants__ = ['invalid']
 
             def __init__(self):
-                super(Foo4, self).__init__()
+                super().__init__()
                 self.invalid = np.int64(5)
 
         # verify that we capture human understandable class name
@@ -9011,7 +8926,7 @@ def test_script_module_param_buffer_mutation(self):
         # TODO: add param mutation test case after JIT support it
         class ModuleBufferMutate(torch.jit.ScriptModule):
             def __init__(self):
-                super(ModuleBufferMutate, self).__init__()
+                super().__init__()
                 self.register_buffer('running_var', torch.tensor(0, dtype=torch.long))
 
             @torch.jit.script_method
@@ -9031,7 +8946,7 @@ class M(torch.jit.ScriptModule):
             __constants__ = ['b']
 
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.b = [1, 2, 3, 4]
 
             @torch.jit.script_method
@@ -9047,9 +8962,6 @@ def forward(self):
 
     def test_override_magic(self):
         class OverrideMagic(nn.Module):
-            def __init__(self):
-                super(OverrideMagic, self).__init__()
-
             @torch.jit.export
             def __len__(self):
                 return 10
@@ -9058,9 +8970,6 @@ def __len__(self):
         self.assertEqual(len(mod), len(torch.jit.script(mod)))
 
         class OverrideMagicSeq(nn.Sequential):
-            def __init__(self):
-                super(OverrideMagicSeq, self).__init__()
-
             @torch.jit.export
             def __len__(self):
                 return 10
@@ -9072,7 +8981,7 @@ def __len__(self):
     def test_script_module_for2(self):
         class Sub(torch.jit.ScriptModule):
             def __init__(self):
-                super(Sub, self).__init__()
+                super().__init__()
                 self.weight = nn.Parameter(torch.randn(2))
 
             @torch.jit.script_method
@@ -9081,7 +8990,7 @@ def forward(self, thing):
 
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.mods = nn.ModuleList([Sub() for i in range(10)])
 
             @torch.jit.script_method
@@ -9104,7 +9013,7 @@ def forward(self, v):
     def test_attr_qscheme_script(self):
         class Foo(torch.nn.Module):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.qscheme = torch.per_tensor_affine
 
             def forward(self):
@@ -9120,7 +9029,7 @@ def forward(self):
     def test_script_module_const_submodule_fail(self):
         class Sub(torch.jit.ScriptModule):
             def __init__(self):
-                super(Sub, self).__init__()
+                super().__init__()
                 self.weight = nn.Parameter(torch.randn(2))
 
             @torch.jit.script_method
@@ -9129,7 +9038,7 @@ def forward(self, thing):
 
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.mods = [Sub() for _ in range(10)]
 
             @torch.jit.script_method
@@ -9290,7 +9199,7 @@ def tensordot_dims_tuple(a: torch.Tensor, b: torch.Tensor, dims: Tuple[List[int]
     def test_missing_getstate(self):
         class Foo(torch.nn.Module):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.x = 1
 
             def forward(self, x):
@@ -9320,7 +9229,7 @@ def fee(x):
     def test_pack_unpack_nested(self):
         class SubSubMod(torch.jit.ScriptModule):
             def __init__(self):
-                super(SubSubMod, self).__init__()
+                super().__init__()
                 self.register_buffer('buf', torch.ones(3, 4) * 3)
 
             @torch.jit.script_method
@@ -9337,7 +9246,7 @@ def forward(self, x):
 
         class SubMod(torch.jit.ScriptModule):
             def __init__(self):
-                super(SubMod, self).__init__()
+                super().__init__()
                 self.register_buffer('buf', torch.ones(3, 4) * 2)
                 self.ssm = SubSubMod()
 
@@ -9355,7 +9264,7 @@ def forward(self, x):
 
         class Mod(torch.jit.ScriptModule):
             def __init__(self):
-                super(Mod, self).__init__()
+                super().__init__()
                 self.submod = SubMod()
                 self.register_buffer('buf', torch.ones(3, 4) * 1)
 
@@ -9428,7 +9337,7 @@ class M(torch.jit.ScriptModule):
             __constants__ = ['mods']
 
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.mods = 1
 
             @torch.jit.script_method
@@ -9442,7 +9351,7 @@ def forward(self, v):
     def test_attr_module_constants(self):
         class M2(torch.jit.ScriptModule):
             def __init__(self, mod_list):
-                super(M2, self).__init__()
+                super().__init__()
                 self.mods = mod_list
 
             @torch.jit.script_method
@@ -9456,7 +9365,7 @@ def forward(self, x):
     def test_script_sequential_for(self):
         class Sub(torch.jit.ScriptModule):
             def __init__(self):
-                super(Sub, self).__init__()
+                super().__init__()
                 self.weight = nn.Parameter(torch.randn(2))
 
             @torch.jit.script_method
@@ -9465,7 +9374,7 @@ def forward(self, thing):
 
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.mods = nn.Sequential(Sub(), Sub(), Sub())
 
             @torch.jit.script_method
@@ -9493,7 +9402,7 @@ def forward2(self, v):
     def test_script_sequential_sliced_iteration(self):
         class seq_mod(nn.Module):
             def __init__(self):
-                super(seq_mod, self).__init__()
+                super().__init__()
                 self.layers = [nn.ReLU(), nn.ReLU(), nn.ReLU()]
                 self.layers = nn.Sequential(*self.layers)
 
@@ -9511,7 +9420,7 @@ def forward(self, input):
     def test_script_sequential_orderdict(self):
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.mods = nn.Sequential(OrderedDict([
                     ("conv", nn.Conv2d(1, 20, 5)),
                     ("relu", nn.ReLU())
@@ -9527,7 +9436,7 @@ def forward(self, input):
     def test_script_sequential_multi_output_fail(self):
         class Sub(torch.jit.ScriptModule):
             def __init__(self):
-                super(Sub, self).__init__()
+                super().__init__()
                 self.weight = nn.Parameter(torch.randn(2))
 
             @torch.jit.script_method
@@ -9535,16 +9444,13 @@ def forward(self, thing):
                 return self.weight + thing
 
         class ReturnMulti(torch.jit.ScriptModule):
-            def __init__(self):
-                super(ReturnMulti, self).__init__()
-
             @torch.jit.script_method
             def forward(self, x):
                 return x, x, x
 
         class HaveSequential(torch.jit.ScriptModule):
             def __init__(self):
-                super(HaveSequential, self).__init__()
+                super().__init__()
                 self.someseq = nn.Sequential(
                     Sub(),
                     ReturnMulti(),
@@ -9565,7 +9471,7 @@ def forward(self, x):
     def test_script_sequential_in_mod_list(self):
         class Sub(torch.jit.ScriptModule):
             def __init__(self):
-                super(Sub, self).__init__()
+                super().__init__()
                 self.weight = nn.Parameter(torch.randn(2))
 
             @torch.jit.script_method
@@ -9574,7 +9480,7 @@ def forward(self, thing):
 
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.mods = nn.ModuleList([Sub(), nn.Sequential(Sub(), nn.Sequential(Sub(), Sub()), Sub())])
 
             @torch.jit.script_method
@@ -9592,7 +9498,7 @@ def forward(self, v):
     def test_script_nested_mod_list(self):
         class Sub(torch.jit.ScriptModule):
             def __init__(self):
-                super(Sub, self).__init__()
+                super().__init__()
                 self.weight = nn.Parameter(torch.randn(2))
 
             @torch.jit.script_method
@@ -9601,7 +9507,7 @@ def forward(self, thing):
 
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.mods = nn.ModuleList([nn.ModuleList([Sub()]), nn.Sequential(Sub()), nn.ModuleList([Sub(), Sub()])])
 
             @torch.jit.script_method
@@ -9621,7 +9527,7 @@ class M(torch.jit.ScriptModule):
             __constants__ = ['dim']
 
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.dim = 1
 
             @torch.jit.script_method
@@ -9652,7 +9558,7 @@ def test_script_star_expr(self):
 
         class M2(torch.jit.ScriptModule):
             def __init__(self):
-                super(M2, self).__init__()
+                super().__init__()
                 self.m = torch.jit.trace(TestScript.StarTestSumStarred(),
                                          (torch.ones(4, 3), torch.ones(4, 3), torch.ones(4, 3)))
                 self.g = torch.jit.trace(TestScript.StarTestReturnThree(), torch.ones(4, 3))
@@ -9668,7 +9574,7 @@ def forward(self, rep):
     def test_script_star_expr_string(self):
         class M2(torch.jit.ScriptModule):
             def __init__(self):
-                super(M2, self).__init__()
+                super().__init__()
                 self.m = torch.jit.trace(TestScript.StarTestSumStarred(),
                                          (torch.ones(4, 3), torch.ones(4, 3), torch.ones(4, 3)))
                 self.g = torch.jit.trace(TestScript.StarTestReturnThree(), torch.ones(4, 3))
@@ -9695,7 +9601,7 @@ def forward(self, *inputs):
     def test_script_star_assign(self):
         class M2(torch.jit.ScriptModule):
             def __init__(self):
-                super(M2, self).__init__()
+                super().__init__()
                 self.g = torch.jit.trace(TestScript.StarTestSumAndReturnThree(), torch.ones(4, 3))
                 self.define('''
             def forward(self, rep):
@@ -9709,7 +9615,7 @@ def forward(self, rep):
     def test_script_module_star_assign2(self):
         class M2(torch.jit.ScriptModule):
             def __init__(self):
-                super(M2, self).__init__()
+                super().__init__()
                 self.g = torch.jit.trace(
                     TestScript.StarTestSumAndReturnThree(),
                     (torch.ones(4, 3), torch.ones(4, 3), torch.ones(4, 3)),
@@ -9726,7 +9632,7 @@ def forward(self, rep):
     def test_script_module_star_assign2_inplace(self):
         class M2(torch.jit.ScriptModule):
             def __init__(self):
-                super(M2, self).__init__()
+                super().__init__()
                 self.g = torch.jit.trace(
                     TestScript.StarTestSumAndReturnThree(),
                     (torch.ones(4, 3), torch.ones(4, 3), torch.ones(4, 3)),
@@ -9748,7 +9654,7 @@ def test_script_module_star_assign_fail_pythonop(self):
         with self.assertRaisesRegex(RuntimeError, "cannot be used as a tuple"):
             class M2(torch.jit.ScriptModule):
                 def __init__(self):
-                    super(M2, self).__init__()
+                    super().__init__()
 
                     @torch.jit.ignore
                     def myfunc():
@@ -9767,7 +9673,7 @@ def test_script_module_star_assign_fail_builtin(self):
         with self.assertRaisesRegex(RuntimeError, "cannot be used as a tuple"):
             class M2(torch.jit.ScriptModule):
                 def __init__(self):
-                    super(M2, self).__init__()
+                    super().__init__()
 
                     self.define('''
                 def forward(self, rep):
@@ -9932,7 +9838,7 @@ class M(torch.nn.Module):
             }
 
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.val = None
 
             def some_method(self):
@@ -9951,16 +9857,13 @@ def forward(self, x):
     def test_script_forward_method_replacement(self):
         # We want to support the use case of attaching a different `forward` method
         class LowLevelModule(torch.nn.Module):
-            def __init__(self):
-                super(LowLevelModule, self).__init__()
-
             def forward(self, input: torch.Tensor):
                 # Generic forward dispatch
                 return self.forward_pytorch(input) * 2
 
         class TestModule(LowLevelModule):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 # Replace the forward method
                 self.forward = types.MethodType(LowLevelModule.forward, self)
 
@@ -10125,7 +10028,7 @@ def forward(self, x):
 
         class ScriptMod(torch.jit.ScriptModule):
             def __init__(self, mod):
-                super(ScriptMod, self).__init__()
+                super().__init__()
                 x = torch.zeros(1, 3)
                 mod_fn = lambda : mod(x)  # noqa: E731
                 self.mod = torch.jit.trace(mod_fn, tuple())
@@ -10465,7 +10368,7 @@ def foo3(a):
     def test_script_module_export_submodule(self):
         class M1(torch.jit.ScriptModule):
             def __init__(self):
-                super(M1, self).__init__()
+                super().__init__()
                 self.weight = nn.Parameter(torch.randn(2))
 
             @torch.jit.script_method
@@ -10474,7 +10377,7 @@ def forward(self, thing):
 
         class M2(torch.jit.ScriptModule):
             def __init__(self):
-                super(M2, self).__init__()
+                super().__init__()
                 # test submodule
                 self.sub = M1()
                 self.weight = nn.Parameter(torch.randn(2, 3))
@@ -10517,7 +10420,7 @@ def forward(self, input):
     def test_compile_module_with_constant(self):
         class Double(nn.Module):
             def __init__(self, downsample=None):
-                super(Double, self).__init__()
+                super().__init__()
 
             def forward(self, input):
                 return input * 2
@@ -10526,7 +10429,7 @@ class Mod(nn.Module):
             __constants__ = ['downsample']
 
             def __init__(self, downsample=None):
-                super(Mod, self).__init__()
+                super().__init__()
                 self.downsample = downsample
 
             def forward(self, input):
@@ -10549,7 +10452,7 @@ def f():
     def test_script_module_export_tensor_type(self):
         class M(torch.jit.ScriptModule):
             def __init__(self, type):
-                super(M, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.zeros((5, 5), dtype=type).random_())
 
             @torch.jit.script_method
@@ -10570,7 +10473,7 @@ def test_script_module_export_tensor_cuda(self):
         class M(torch.jit.ScriptModule):
 
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.zeros((5, 5), device='cuda:0').random_())
 
             @torch.jit.script_method
@@ -10588,7 +10491,7 @@ def foo(self):
     def test_script_module_export_blocks(self):
         class M(torch.jit.ScriptModule):
             def __init__(self, n, m):
-                super(M, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(n, m))
 
             @torch.jit.script_method
@@ -10609,7 +10512,7 @@ def test_script_module_export_shared_storage(self):
         class M(torch.jit.ScriptModule):
 
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.param1 = torch.nn.Parameter(torch.rand(5, 5))
                 self.param2 = torch.nn.Parameter(self.param1[3])
                 self.param3 = torch.nn.Parameter(torch.rand(5, 5))
@@ -10630,22 +10533,16 @@ def foo(self):
 
     def test_sequential_intermediary_types(self):
         class A(torch.nn.Module):
-            def __init__(self):
-                super(A, self).__init__()
-
             def forward(self, x):
                 return x + 3
 
         class B(torch.nn.Module):
-            def __init__(self):
-                super(B, self).__init__()
-
             def forward(self, x):
                 return {"1": x}
 
         class C(torch.nn.Module):
             def __init__(self):
-                super(C, self).__init__()
+                super().__init__()
                 self.foo = torch.nn.Sequential(A(), B())
 
             def forward(self, x):
@@ -10895,9 +10792,6 @@ def t(x):
 
     def test_torch_ignore_conversion_to_none(self):
         class A(torch.nn.Module):
-            def __init__(self):
-                super(A, self).__init__()
-
             @torch.jit.ignore
             def ignored(self, a: int) -> None:
                 l: int = len([2 for i in range(a) if i > 2])
@@ -10910,9 +10804,6 @@ def forward(self) -> int:
                 return a + b
 
         class B(torch.nn.Module):
-            def __init__(self):
-                super(B, self).__init__()
-
             @torch.jit.ignore
             def ignored(self, a: int):
                 l: int = len([2 for i in range(a) if i > 2])
@@ -10980,7 +10871,7 @@ def test_batch_norm_inference_backward_cuda(self):
         with enable_profiling_mode_for_profiling_tests():
             class MyBatchNorm(torch.nn.Module):
                 def __init__(self, num_features, affine, track_running_stats):
-                    super(MyBatchNorm, self).__init__()
+                    super().__init__()
                     self.bn = torch.nn.BatchNorm2d(
                         num_features, 1e-5, affine=affine, track_running_stats=track_running_stats).float()
 
@@ -11042,7 +10933,7 @@ class M(torch.jit.ScriptModule):
             __constants__ = ['d']
 
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.d = torch.device('cpu')
 
             @torch.jit.script_method
@@ -11231,7 +11122,7 @@ def test_remove_dropout(self):
 
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.weight_0 = torch.nn.Parameter(torch.rand(weight_0_shape))
                 self.weight_1 = torch.nn.Parameter(torch.rand(weight_1_shape))
 
@@ -11610,23 +11501,17 @@ def test_none_type_str(self):
     @skipIfTorchDynamo("TorchDynamo fails with unknown reason")
     def test_zip_enumerate_modulelist(self):
         class Sub(torch.nn.Module):
-            def __init__(self):
-                super(Sub, self).__init__()
-
             def forward(self, thing):
                 return thing - 2
 
         class Double(torch.nn.Module):
-            def __init__(self):
-                super(Double, self).__init__()
-
             def forward(self, thing):
                 return thing * 2
 
         # zipping over two
         class ZipModLists(torch.nn.Module):
             def __init__(self, mods, mods2):
-                super(ZipModLists, self).__init__()
+                super().__init__()
                 self.mods = mods
                 self.mods2 = mods2
 
@@ -11641,7 +11526,7 @@ class ZipWithValues(torch.nn.Module):
             __constants__ = ['tup_larger', 'tup_smaller']
 
             def __init__(self, mods, mods2):
-                super(ZipWithValues, self).__init__()
+                super().__init__()
                 self.mods = mods
                 self.mods2 = mods2
                 self.tup_larger = list(range(len(mods2) + 1))
@@ -11674,7 +11559,7 @@ def forward(self, thing):
 
         class Mod(torch.nn.Module):
             def __init__(self):
-                super(Mod, self).__init__()
+                super().__init__()
                 self.mods = nn.ModuleList([Double(), Double()])
 
             def forward(self, x):
@@ -11775,7 +11660,7 @@ def forward(self, x):
 
         class M(torch.nn.Module):
             def __init__(self, mod_list):
-                super(M, self).__init__()
+                super().__init__()
                 self.module_list = mod_list
 
             def forward(self, x):
@@ -11790,7 +11675,7 @@ def forward(self, x):
 
         class M2(M):
             def __init__(self, mod_list):
-                super(M2, self).__init__(mod_list)
+                super().__init__(mod_list)
 
             def forward(self, x):
                 out = [mod(x) for mod in self.module_list]
@@ -12320,7 +12205,7 @@ def traced_fn(x):
     def test_call_python_mod_from_tracing_fn(self):
         class PythonMod(torch.nn.Module):
             def __init__(self):
-                super(PythonMod, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 3), requires_grad=False)
 
             def forward(self, x):
@@ -12354,7 +12239,7 @@ def traced_fn(x):
     def test_call_traced_mod_from_tracing_fn(self):
         class TracedModule(torch.nn.Module):
             def __init__(self):
-                super(TracedModule, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 3), requires_grad=False)
 
             def forward(self, x):
@@ -12384,7 +12269,7 @@ def test_call_script_mod_from_tracing_fn(self):
         with self.assertRaisesRegex(RuntimeError, "must be registered as submodules"):
             class ScriptMod(torch.jit.ScriptModule):
                 def __init__(self):
-                    super(ScriptMod, self).__init__()
+                    super().__init__()
                     self.param = torch.nn.Parameter(torch.rand(3, 4), requires_grad=False)
 
                 @torch.jit.script_method
@@ -12406,7 +12291,7 @@ def python_fn(x):
 
         class TracedModule(torch.nn.Module):
             def __init__(self):
-                super(TracedModule, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 3))
 
             def forward(self, x):
@@ -12423,7 +12308,7 @@ def forward(self, x):
     def test_call_python_mod_from_traced_module(self):
         class PythonModule(torch.nn.Module):
             def __init__(self):
-                super(PythonModule, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(5, 7))
 
             def forward(self, x):
@@ -12431,7 +12316,7 @@ def forward(self, x):
 
         class TracedModule(torch.nn.Module):
             def __init__(self):
-                super(TracedModule, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 5))
                 self.mod = PythonModule()
 
@@ -12514,7 +12399,7 @@ def script_fn(x):
     def test_call_python_mod_from_script_fn(self):
         class PythonModule(torch.nn.Module):
             def __init__(self):
-                super(PythonModule, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(5, 7))
 
             def forward(self, x):
@@ -12545,9 +12430,6 @@ def script_fn(x):
     def test_call_script_mod_from_script_fn(self):
         with self.assertRaisesRegex(RuntimeError, "Cannot call a ScriptModule that is not a submodule of the caller"):
             class ScriptMod(torch.jit.ScriptModule):
-                def __init__(self):
-                    super(ScriptMod, self).__init__()
-
                 @torch.jit.script_method
                 def forward(self, x):
                     return torch.mm(x, torch.zeros([4, 3]))
@@ -12565,7 +12447,7 @@ def python_fn(x):
 
         class ScriptMod(torch.jit.ScriptModule):
             def __init__(self):
-                super(ScriptMod, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 3))
 
             @torch.jit.script_method
@@ -12579,7 +12461,7 @@ def forward(self, x):
     def test_call_python_mod_from_script_module(self):
         class PythonMod(torch.nn.Module):
             def __init__(self):
-                super(PythonMod, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(3, 5))
 
             @torch.jit.ignore
@@ -12588,7 +12470,7 @@ def forward(self, x):
 
         class ScriptMod(torch.jit.ScriptModule):
             def __init__(self):
-                super(ScriptMod, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 3))
                 self.pm = PythonMod()
 
@@ -12609,7 +12491,7 @@ def script_fn(x):
 
         class ScriptMod(torch.jit.ScriptModule):
             def __init__(self):
-                super(ScriptMod, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 3))
 
             @torch.jit.script_method
@@ -12624,7 +12506,7 @@ def forward(self, x):
     def test_call_script_mod_from_script_module(self):
         class ScriptMod1(torch.jit.ScriptModule):
             def __init__(self):
-                super(ScriptMod1, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(3, 5))
 
             @torch.jit.script_method
@@ -12633,7 +12515,7 @@ def forward(self, x):
 
         class ScriptMod(torch.jit.ScriptModule):
             def __init__(self):
-                super(ScriptMod, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.rand(4, 3))
                 self.tm = ScriptMod1()
 
@@ -12652,7 +12534,7 @@ def test_module_with_params_called_fails(self):
         with self.assertRaisesRegex(RuntimeError, "Cannot call a ScriptModule that is not a submodule of the caller"):
             class ScriptMod(torch.jit.ScriptModule):
                 def __init__(self):
-                    super(ScriptMod, self).__init__()
+                    super().__init__()
                     self.param = torch.nn.Parameter(torch.rand(3, 3))
 
                 @torch.jit.script_method
@@ -12946,7 +12828,7 @@ def foo(x : {input}, y : Tuple[Tensor, Tensor]) -> Tuple[{output}, {output}]:
     def test_annot_string_py3_method(self):
         class TestModule(torch.jit.ScriptModule):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
 
         code = '''
             def foo(self, x : {input}, y : Tuple[Tensor, Tensor]) -> Tuple[{output}, {output}]:
@@ -12978,7 +12860,7 @@ def foo(x, y):
     def test_annot_string_mypy_method(self):
         class TestModule(torch.jit.ScriptModule):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
 
         code = '''
         def foo(self, x, y):
@@ -13175,7 +13057,7 @@ def test_module_parameters_and_buffers(self):
 
         class TestLinear(torch.nn.Module):
             def __init__(self, in_features, out_features):
-                super(TestLinear, self).__init__()
+                super().__init__()
                 self.in_features = in_features
                 self.out_features = out_features
                 self.weight = torch.nn.Parameter(torch.empty(out_features, in_features))
@@ -13196,7 +13078,7 @@ def forward(self, input):
         # Initialize a ScriptModule that uses the weak module above multiple times
         class Strong(torch.jit.ScriptModule):
             def __init__(self):
-                super(Strong, self).__init__()
+                super().__init__()
                 self.fc1 = TestLinear(10, 10)
                 self.fc1.weight = torch.nn.Parameter(weights)
                 self.fc1.bias = torch.nn.Parameter(bias)
@@ -13225,15 +13107,12 @@ def forward(self, x):
 
     def test_module_copying(self):
         class Submodule(torch.nn.Module):
-            def __init__(self):
-                super(Submodule, self).__init__()
-
             def forward(self, x):
                 return x + 100
 
         class Weak(torch.nn.Module):
             def __init__(self, in_features, out_features):
-                super(Weak, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.ones(out_features, in_features))
                 self.bias = torch.nn.Parameter(torch.ones(out_features))
                 self.register_buffer("buffer", torch.ones(out_features))
@@ -13245,7 +13124,7 @@ def forward(self, x):
 
         class Strong(torch.jit.ScriptModule):
             def __init__(self, weak):
-                super(Strong, self).__init__()
+                super().__init__()
                 self.weak = weak
 
             @torch.jit.script_method
@@ -13318,9 +13197,6 @@ def test_ignored_props(self):
         class A(nn.Module):
             __jit_ignored_attributes__ = ["ignored", "ignored_return_val"]
 
-            def __init__(self):
-                super().__init__()
-
             @property
             def ignored(self):
                 raise ValueError("shouldn't be called")
@@ -13498,7 +13374,7 @@ def test_id_scalars():
                 return id(2) == id(None)
 
         @torch.jit.script
-        class FooTest(object):
+        class FooTest:
             def __init__(self, x):
                 self.foo = x
 
@@ -13693,7 +13569,7 @@ class Root(torch.jit.ScriptModule):
             __constants__ = ['number']
 
             def __init__(self, number):
-                super(Root, self).__init__()
+                super().__init__()
                 self.register_buffer('buffer1', torch.ones(2, 2))
                 self.register_buffer('buffer2', torch.ones(2, 2))
                 self.number = number
@@ -13712,7 +13588,7 @@ class M(torch.jit.ScriptModule):
             __constants__ = ['number']
 
             def __init__(self, number, submodule):
-                super(M, self).__init__()
+                super().__init__()
                 self.register_buffer('buffer1', torch.ones(2, 2))
                 self.register_buffer('buffer2', torch.ones(2, 2))
                 self.number = number
@@ -13749,7 +13625,7 @@ def __setstate__(self, state):
         # Check simpler module
         class NoArgState(torch.nn.Module):
             def __init__(self):
-                super(NoArgState, self).__init__()
+                super().__init__()
                 self.register_buffer('buffer1', torch.ones(2, 2))
                 self.register_buffer('buffer2', torch.ones(2, 2))
 
@@ -14273,7 +14149,7 @@ class Mod(torch.nn.Module):
             __constants__ = ['val']
 
             def __init__(self, val):
-                super(Mod, self).__init__()
+                super().__init__()
                 self.val = val
 
             def forward(self):
@@ -14358,9 +14234,6 @@ class Point(NamedTuple):
         make_global(Point)
 
         class M(torch.nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, point: Point):
                 return point
 
@@ -14384,9 +14257,6 @@ class Point(NamedTuple):
         make_global(Point)
 
         class M(torch.nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, point: Point):
                 return point
 
@@ -14411,9 +14281,6 @@ class Point(NamedTuple):
         make_global(Point)
 
         class M(torch.nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, point: Point):
                 return point
 
@@ -14438,9 +14305,6 @@ class Point(NamedTuple):
         make_global(Point)
 
         class M(torch.nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, point: Point):
                 return point
 
@@ -14452,7 +14316,6 @@ def forward(self, point: Point):
             m = torch.jit.script(M())
             m(p)
 
-    @unittest.skipIf(sys.version_info < (3, 7, 0), "defaults keyword added in Python 3.8")
     def test_namedtuple_default_values_using_factory_constructor(self):
         Pair = namedtuple("Pair", ["x", "y"], defaults=(1, 2))
 
@@ -14706,9 +14569,6 @@ def null_overload_driver():
             torch.jit.script(null_overload_driver)
 
         class OverloadMisuse(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             @torch.jit._overload_method
             def forward(self, x: int):
                 pass
@@ -14767,9 +14627,6 @@ def test_uses():
 
     def test_method_overloading(self):
         class Over(torch.nn.Module):
-            def __init__(self):
-                super(Over, self).__init__()
-
             @torch.jit._overload_method  # noqa: F811
             def forward(self, x):  # noqa: F811
                 # type: (Tuple[Tensor, Tensor]) -> Tensor
@@ -14788,7 +14645,7 @@ def forward(self, x):  # noqa: F811
 
         class S(torch.jit.ScriptModule):
             def __init__(self):
-                super(S, self).__init__()
+                super().__init__()
                 self.weak = Over()
 
             @torch.jit.script_method
@@ -14804,9 +14661,6 @@ def forward(self, x):
         self.assertEqual(over((x)), x + 20)
 
         class Unannotated(torch.nn.Module):
-            def __init__(self):
-                super(Unannotated, self).__init__()
-
             @torch.jit._overload_method  # noqa: F811
             def hello(self, x):  # noqa: F811
                 pass
@@ -14827,9 +14681,6 @@ def forward(self):
             torch.jit.script(w)
 
         class CompileOverloadError(torch.nn.Module):
-            def __init__(self):
-                super(CompileOverloadError, self).__init__()
-
             @torch.jit._overload_method  # noqa: F811
             def hello(self, x):  # noqa: F811
                 # type: (str) -> (int)
@@ -14853,9 +14704,6 @@ def forward(self):
         # testing overload declared first, then non-overload
         with self.assertRaisesRegex(Exception, "Overloads are not useable when a module"):
             class W3(torch.nn.Module):
-                def __init__(self):
-                    super(W3, self).__init__()
-
                 @torch.jit._overload_method  # noqa: F811
                 def forward(self, x):  # noqa: F811
                     # type: (int) -> int
@@ -14873,9 +14721,6 @@ def forward(self, x):  # noqa: F811
             b = torch.jit.script(a)
 
             class W3(torch.nn.Module):
-                def __init__(self):
-                    super(W3, self).__init__()
-
                 def forward(self, x):  # noqa: F811
                     return x + 5 + 10
 
@@ -14884,9 +14729,6 @@ def forward(self, x):  # noqa: F811
 
         # testing non-overload declared first, then overload
         class W2(torch.nn.Module):
-            def __init__(self):
-                super(W2, self).__init__()
-
             def hello(self, x1, x2):
                 return x1 + x2
 
@@ -14897,9 +14739,6 @@ def forward(self, x):
         self.assertEqual(a(torch.tensor(1)), torch.tensor(2))
 
         class W2(torch.nn.Module):
-            def __init__(self):
-                super(W2, self).__init__()
-
             @torch.jit._overload_method  # noqa: F811
             def hello(self, x):  # noqa: F811
                 pass
@@ -14936,7 +14775,7 @@ def foo(x):
     def test_nn_LSTM_with_layers(self):
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.rnn = nn.LSTM(2, 3, 2, dropout=0)
 
             @torch.jit.script_method
@@ -14945,7 +14784,7 @@ def forward(self, x, lengths, h0, c0):
 
         class Eager(torch.nn.Module):
             def __init__(self):
-                super(Eager, self).__init__()
+                super().__init__()
                 self.rnn = nn.LSTM(2, 3, 2, dropout=0)
 
             def forward(self, x, lengths, h0, c0):
@@ -14962,7 +14801,7 @@ def test_nn_LSTM(self):
 
         class S(torch.jit.ScriptModule):
             def __init__(self):
-                super(S, self).__init__()
+                super().__init__()
                 self.x = torch.nn.LSTM(5, 5)
 
             @torch.jit.script_method
@@ -14980,7 +14819,7 @@ def test_nn_GRU(self):
 
         class SeqLengthGRU(torch.jit.ScriptModule):
             def __init__(self):
-                super(SeqLengthGRU, self).__init__()
+                super().__init__()
                 self.x = torch.nn.GRU(5, 5)
 
             @torch.jit.script_method
@@ -14989,7 +14828,7 @@ def forward(self, input: PackedSequence) -> Tuple[PackedSequence, torch.Tensor]:
 
         class TensorGRU(torch.jit.ScriptModule):
             def __init__(self):
-                super(TensorGRU, self).__init__()
+                super().__init__()
                 self.x = torch.nn.GRU(5, 5)
 
             @torch.jit.script_method
@@ -15099,7 +14938,7 @@ def test_scriptmodule_multi_head_attn_cuda(self):
 
         class MyModule(torch.jit.ScriptModule):
             def __init__(self, embed_dim, num_heads):
-                super(MyModule, self).__init__()
+                super().__init__()
                 sample_q = torch.randn(3, 2, embed_dim)
                 sample_kv = torch.randn(3, 2, embed_dim)
                 attention = nn.MultiheadAttention(embed_dim, num_heads)
@@ -15135,7 +14974,7 @@ def test_scriptmodule_transformer_cuda(self):
 
         class MyModule(torch.jit.ScriptModule):
             def __init__(self, transformer, sample_q, sample_kv):
-                super(MyModule, self).__init__()
+                super().__init__()
                 transformer.eval()
 
                 self.mod = torch.jit.trace(transformer,
@@ -15184,7 +15023,7 @@ def fn(lst):
     def test_weak_cuda(self):
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.lstm = torch.nn.LSTM(5, 5)
                 self.lstm.cuda()
 
@@ -15201,7 +15040,7 @@ def test_ignore_decorator(self):
         with warnings.catch_warnings(record=True) as warns:
             class M(torch.jit.ScriptModule):
                 def __init__(self):
-                    super(M, self).__init__()
+                    super().__init__()
                     tensor = torch.zeros(1, requires_grad=False)
                     self.register_buffer('some_state', torch.nn.Parameter(tensor))
 
@@ -15228,9 +15067,6 @@ def ignored_code(self, x):
 
     def test_ignored_as_value(self):
         class Model(nn.Module):
-            def __init__(self):
-                super(Model, self).__init__()
-
             @torch.jit.unused
             def tuple_ignored(self, x):
                 # type: (Tensor) -> Tuple[Tensor, Tensor]
@@ -15263,9 +15099,6 @@ def forward(self, x, use_ignore_path):
 
     def test_module_error(self):
         class MyModule(torch.nn.Module):
-            def __init__(self):
-                super(MyModule, self).__init__()
-
             def forward(self, foo):
                 return foo
 
@@ -15286,7 +15119,7 @@ def fn(x, y):
     def test_module_attrs(self):
         class M(torch.jit.ScriptModule):
             def __init__(self, table):
-                super(M, self).__init__()
+                super().__init__()
                 self.table = torch.jit.Attribute(table, Dict[str, torch.Tensor])
                 self.x = torch.nn.Parameter(torch.tensor([100.0]))
 
@@ -15304,7 +15137,7 @@ def forward(self, key):
     def test_module_none_attrs(self):
         class MyMod(torch.jit.ScriptModule):
             def __init__(self):
-                super(MyMod, self).__init__()
+                super().__init__()
                 self.optional_value = None
 
             @torch.jit.script_method
@@ -15350,7 +15183,7 @@ def test_attribute_serialization(self):
 
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 for name, value, the_type in tester.get_pickle_values():
                     setattr(self, name, torch.jit.Attribute(value, the_type))
 
@@ -15390,7 +15223,7 @@ def test_attribute_unpickling(self):
 
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 for name, value, the_type in tester.get_pickle_values():
                     setattr(self, "_" + name, torch.jit.Attribute(value, the_type))
 
@@ -15469,7 +15302,7 @@ def forward(self,
     def test_submodule_attribute_serialization(self):
         class S(torch.jit.ScriptModule):
             def __init__(self, list_data):
-                super(S, self).__init__()
+                super().__init__()
                 self.table = torch.jit.Attribute({"I": "am", "a test": "test"}, Dict[str, str])
                 self.list = torch.jit.Attribute(list_data, List[Tuple[int, int]])
 
@@ -15479,7 +15312,7 @@ def forward(self):
 
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.table = torch.jit.Attribute({"this": "is", "a different": "dict"}, Dict[str, str])
                 self.tensor = torch.jit.Attribute(torch.randn(2, 2), torch.Tensor)
                 self.s1 = S([(1, 2)])
@@ -15496,7 +15329,7 @@ def forward(self):
     def test_serialization_big_ints(self):
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.int32_max = torch.jit.Attribute(2**31 - 1, int)
                 self.int32_min = torch.jit.Attribute(-2**31, int)
                 self.uint32_max = torch.jit.Attribute(2**32, int)
@@ -15528,7 +15361,7 @@ def test_script_scope(self):
     def test_serialization_sharing(self):
         class M(torch.jit.ScriptModule):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.list = torch.jit.Attribute([], List[str])
 
             @torch.jit.script_method
@@ -15565,7 +15398,7 @@ def test_sys_stdout_override(self):
         def foo():
             print('foo')
 
-        class Redirect(object):
+        class Redirect:
             def __init__(self):
                 self.s = ''
 
@@ -15585,7 +15418,7 @@ def write(self, s):
     def test_dtype_attr(self):
         class Foo(torch.nn.Module):
             def __init__(self):
-                super(Foo, self).__init__()
+                super().__init__()
                 self.dtype = torch.zeros([]).dtype
 
             def forward(self):
@@ -15598,7 +15431,7 @@ def forward(self):
     def test_named_buffers_are_iterable(self):
         class MyMod(torch.nn.Module):
             def __init__(self):
-                super(MyMod, self).__init__()
+                super().__init__()
                 self.mod = (torch.nn.ReLU())
                 self.mod2 = (torch.nn.ReLU())
                 self.mod3 = torch.nn.Sequential(torch.nn.Sequential(torch.nn.ReLU()))
@@ -15637,7 +15470,7 @@ def forward(self, x):
     def test_static_if_prop(self):
         class MaybeHasAttr(torch.nn.Module):
             def __init__(self, add_attr):
-                super(MaybeHasAttr, self).__init__()
+                super().__init__()
                 if add_attr:
                     self.maybe_attr = 1
 
@@ -15649,7 +15482,7 @@ def forward(self):
 
         class MaybeHasAttr2(torch.nn.Module):
             def __init__(self, add_attr):
-                super(MaybeHasAttr2, self).__init__()
+                super().__init__()
                 if add_attr:
                     self.maybe_attr = 1
 
@@ -15703,7 +15536,7 @@ def hi(self, x):  # noqa: F811
         self.checkModule(HasAttrMod(), ())
 
         @torch.jit.script
-        class FooTest(object):
+        class FooTest:
             def __init__(self):
                 self.x = 1
 
@@ -15724,7 +15557,7 @@ class M(torch.jit.ScriptModule):
                 __constants__ = ['fname']
 
                 def __init__(self, tensor):
-                    super(M, self).__init__()
+                    super().__init__()
                     self.fname = fname
                     self.tensor = torch.nn.Parameter(tensor)
 
@@ -15748,7 +15581,7 @@ class M(torch.jit.ScriptModule):
                 __constants__ = ['fname']
 
                 def __init__(self, tensor):
-                    super(M, self).__init__()
+                    super().__init__()
                     self.fname = fname
                     self.tensor = torch.nn.Parameter(tensor)
 
@@ -15806,7 +15639,7 @@ def test(self, a):
     def test_get_set_state_with_tensors(self):
         class M(torch.nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.tensor = torch.randn(2, 2)
 
             @torch.jit.export
@@ -15940,7 +15773,7 @@ def __init__(self,
                          b   # type: int
                          ):
                 # type: (...) -> None
-                super(M, self).__init__()
+                super().__init__()
                 self.a = a  # type: int
                 self.b = b  # type: int
 
@@ -15955,9 +15788,6 @@ def f(x):
 
     def test_module_method_reassignment(self):
         class Foo(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def _forward(self, x):
                 return x
 
@@ -15979,9 +15809,6 @@ def parameter_script(x: torch.nn.Parameter):
 
     def test_save_load_attr_error(self):
         class Inner(nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 return x
 
@@ -16061,9 +15888,6 @@ def fn(x):
     def test_signed_float_zero(self):
 
         class MyModule(torch.nn.Module):
-            def __init__(self):
-                super(MyModule, self).__init__()
-
             def forward(self, x):
                 return torch.div(x, -0.)
 
@@ -16072,9 +15896,6 @@ def forward(self, x):
 
     def test_index_with_tuple(self):
         class MyModule(torch.nn.Module):
-            def __init__(self):
-                super(MyModule, self).__init__()
-
             def forward(self, x):
                 return x[(1,)]
 
@@ -16082,9 +15903,6 @@ def forward(self, x):
 
     def test_context_manager(self):
         class MyModule(torch.nn.Module):
-            def __init__(self):
-                super(MyModule, self).__init__()
-
             def forward(self, x, y):
                 p = x + y
                 q = p + 2.0
@@ -16175,10 +15993,12 @@ def forward(self, x, y):
 }
 
 
+@skipIfTorchDynamo()
 class TestJitGeneratedModule(JitTestCase):
     pass
 
 
+@skipIfTorchDynamo()
 class TestJitGeneratedFunctional(JitTestCase):
     pass
 
@@ -16273,7 +16093,7 @@ class TheModule(torch.jit.ScriptModule):
                 __constants__ = submodule_constants
 
                 def __init__(self):
-                    super(TheModule, self).__init__()
+                    super().__init__()
                     self.submodule = nn_module(*constructor_args)
 
             def make_module(script):
diff --git a/test/test_jit_autocast.py b/test/test_jit_autocast.py
index d311eb687a76..6fbb04b6cf9d 100644
--- a/test/test_jit_autocast.py
+++ b/test/test_jit_autocast.py
@@ -664,9 +664,6 @@ def forward(self, x, y):
     @unittest.skipIf(not TEST_CUDA, "No cuda")
     def test_jit_freeze_autocast_basic(self):
         class TestModule(torch.nn.Module):
-            def __init__(self):
-                super(TestModule, self).__init__()
-
             def forward(self, x, y):
                 with torch.cuda.amp.autocast():
                     return torch.mm(x, y)
@@ -691,7 +688,7 @@ def forward(self, x, y):
     def test_jit_freeze_autocast_constants(self):
         class TestModule(torch.nn.Module):
             def __init__(self):
-                super(TestModule, self).__init__()
+                super().__init__()
                 self.x = torch.rand((3, 4), dtype=torch.float).cuda()
 
             def forward(self, y):
@@ -753,7 +750,7 @@ def foo(x):
 
 class convbn(torch.nn.Module):
     def __init__(self, bias_enabled=True):
-        super(convbn, self).__init__()
+        super().__init__()
         self.conv = torch.nn.Conv2d(3, 64, 7, stride=2, bias=bias_enabled)
         self.bn = torch.nn.BatchNorm2d(64)
 
@@ -762,7 +759,7 @@ def forward(self, x):
 
 class TestJitTraceAutocast(JitTestCase):
     def setUp(self):
-        super(TestJitTraceAutocast, self).setUp()
+        super().setUp()
         self.previous_default_dtype = torch.get_default_dtype()
         torch.set_default_dtype(torch.float32)
         self.models = [MnistNet(),
@@ -776,7 +773,7 @@ def setUp(self):
     def tearDown(self):
         torch._C._jit_set_autocast_mode(self.previous_jit_autocast_pass)
         torch.set_default_dtype(self.previous_default_dtype)
-        super(TestJitTraceAutocast, self).tearDown()
+        super().tearDown()
 
     def test_generate_autocast_jit_trace_model(self):
         def test_generate_autocast_jit_trace_model(model, x):
@@ -821,11 +818,9 @@ def test_nhwc_autocast_jit_trace_model(model, x):
 
     def test_cat_promote(self):
         class TestModel(torch.nn.Module):
-            def __init__(self):
-                super(TestModel, self).__init__()
-
             def forward(self, a, b):
                 return torch.cat([a, b], 0)
+
         with torch.jit.fuser("none"):
             # In this testcase, we will check whether cat has done the promotion in AMP with mixed dtype inputs.
             # To avoid the fusion group from TE, we will disable the fuser here.
diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py
index 4f0ea9dcd344..c735bd996abc 100644
--- a/test/test_jit_cuda_fuser.py
+++ b/test/test_jit_cuda_fuser.py
@@ -1,5333 +1,11 @@
-# Owner(s): ["oncall: jit"]
+# Owner(s): ["module: nvfuser"]
 
-import contextlib
-import unittest
-import os
-import random
-import enum
-import copy
-from functools import reduce
-import operator
-import warnings
-
-import torch
-from torch.nn import functional
-from torch.profiler import profile, ProfilerActivity
-
-from torch.testing._internal.codegen.random_topo_test import runDefaultTestWithSeed
-from torch.testing._internal.common_cuda import TEST_MULTIGPU
-from torch.testing._internal.common_device_type import instantiate_device_type_tests, ops, OpDTypes
-from torch.testing._internal.common_jit import JitCommonTestCase
-from torch.testing._internal.common_methods_invocations import op_db, SampleInput
-from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR, TEST_WITH_ROCM, slowTest, \
-    is_iterable_of_tensors, freeze_rng_state, skipIfRocm
-from torch.testing._internal.jit_utils import clone_inputs, get_traced_sample_variant_pairs, JitTestCase, RUN_CUDA
-from torch.testing._internal.jit_metaprogramming_utils import create_traced_fn
-from torch.testing import FileCheck
-
-from jit.test_fuser_common import TestFuserCommon  # noqa: F401
-
-import itertools
-import numpy as np
-import math
-
-from torch.autograd.gradcheck import gradcheck
-
-from typing import List
-
-RUN_NVFUSER = RUN_CUDA and not TEST_WITH_ROCM
-CUDA_MAJOR, CUDA_MINOR = 0, 0
-
-if RUN_NVFUSER and torch.version.cuda is not None:
-    CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.')[:2])
-
-if 'PYTORCH_NVFUSER_ENABLE' not in os.environ:
-    os.environ['PYTORCH_NVFUSER_ENABLE'] = ""
-os.environ['PYTORCH_NVFUSER_ENABLE'] = 'linear_decomposition,conv_decomposition,' + os.environ['PYTORCH_NVFUSER_ENABLE']
-if 'PYTORCH_NVFUSER_DISABLE' not in os.environ:
-    os.environ['PYTORCH_NVFUSER_DISABLE'] = ""
-os.environ['PYTORCH_NVFUSER_DISABLE'] = 'fallback,fma,' + os.environ['PYTORCH_NVFUSER_DISABLE']
-os.environ['PYTORCH_NVFUSER_JIT_OPT_LEVEL'] = '0'
-# TODO: enable complex when we fixes the extremal cases in OpInfo
-# see issue https://github.com/csarofeen/pytorch/issues/1730"
-# os.environ['PYTORCH_NVFUSER_ENABLE'] = 'complex'
-
-if GRAPH_EXECUTOR == ProfilingMode.PROFILING:
-    torch._C._jit_set_texpr_fuser_enabled(False)
-    torch._C._jit_set_profiling_executor(True)
-    torch._C._jit_set_profiling_mode(True)
-
-FUSION_GROUP = 'prim::CudaFusionGroup'
-FUSION_GUARD = 'prim::CudaFusionGuard'
-# TODO: revert disabled alias ops
-ALIAS_TEST_DISABLED = True
-
-
-@contextlib.contextmanager
-def nvfuser_singleton_fusion(flag):
-    old_value = torch._C._jit_set_nvfuser_single_node_mode(flag)
-    try:
-        yield
-    finally:
-        torch._C._jit_set_nvfuser_single_node_mode(old_value)
-
-@contextlib.contextmanager
-def nvfuser_horizontal_fusion(flag):
-    old_value = torch._C._jit_set_nvfuser_horizontal_mode(flag)
-    try:
-        yield
-    finally:
-        torch._C._jit_set_nvfuser_horizontal_mode(old_value)
-
-def is_pre_volta():
-    if not RUN_NVFUSER:
-        return False
-    prop = torch.cuda.get_device_properties(torch.cuda.current_device())
-    return prop.major < 7
-
-TEST_BF16 = RUN_NVFUSER and torch.cuda.is_bf16_supported()
-
-TEST_LARGE_TENSOR = RUN_NVFUSER
-if RUN_NVFUSER:
-    torch.ones(1).cuda()  # initialize cuda context
-    TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 12e9
-
-class CudaFuserTestOptions():
-    def __init__(self):
-        self.old_cpu_fuse = torch._C._jit_can_fuse_on_cpu()
-        self.old_gpu_fuse = torch._C._jit_can_fuse_on_gpu()
-        torch._C._jit_override_can_fuse_on_cpu(False)
-        torch._C._jit_override_can_fuse_on_gpu(False)
-        self.old_guard = torch._C._jit_set_nvfuser_guard_mode(False)
-        torch._C._debug_set_autodiff_subgraph_inlining(False)
-        self.old_value = torch._C._jit_set_autocast_mode(True)
-
-        if(RUN_CUDA):
-            self.old_nvfuser = torch._C._jit_set_nvfuser_enabled(True)
-
-    def restore(self):
-        if(RUN_CUDA):
-            torch._C._jit_set_nvfuser_enabled(self.old_nvfuser)
-        torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuse)
-        torch._C._jit_override_can_fuse_on_gpu(self.old_gpu_fuse)
-        torch._C._jit_set_nvfuser_guard_mode(self.old_guard)
-        torch._C._debug_set_autodiff_subgraph_inlining(True)
-        torch._C._jit_set_autocast_mode(self.old_value)
-
-class TestCudaFuser(JitTestCase):
-    def assertEqual(self, *args, **kwargs):
-        kwargs["exact_layout"] = True
-        super(JitTestCase, self).assertEqual(*args, **kwargs)
-
-    def _getSubgraphInFusion(self, graph):
-        num_node = 0
-        subgraph = None
-
-        def count(block, ret):
-            for n in block.nodes():
-                if n.kind() == FUSION_GROUP:
-                    ret[0] = ret[0] + 1
-                    self.assertTrue(n.hasAttribute('Subgraph'))
-                    ret[1] = n.g('Subgraph')
-                for block in n.blocks():
-                    count(block, ret)
-        ret = [num_node, subgraph]
-        count(graph, ret)
-        self.assertEqual(ret[0], 1)
-        return ret[1]
-
-    def setUp(self):
-        super(TestCudaFuser, self).setUp()
-
-        self.skip_node_list = []
-        disabled_ops = ("aten::batch_norm",
-                        "aten::_batch_norm_impl_index",
-                        "aten::_batch_norm_impl_index_backward",
-                        "aten::native_batch_norm_backward",)
-        for op in disabled_ops:
-            disabled_flag = torch._C._jit_set_nvfuser_skip_node_kind(op, False)
-            if disabled_flag:
-                torch._C._jit_set_nvfuser_skip_node_kind(op, True)
-                self.skip_node_list.append(op)
-
-        # cpu backup to avoid errors in case this is run on a CPU-only machine
-        dev = 'cuda' if RUN_NVFUSER else 'cpu'
-        self.special_values = torch.tensor(
-            [float("-inf"), -10, -math.pi,
-                -1, -0.5, 0, 1, 0.5,
-                math.pi, 10, float("inf"),
-                float("nan")], dtype=torch.float, device=dev)
-
-        self.int_types = [
-            torch.int8,
-            torch.uint8,
-            torch.int16,
-            torch.int32,
-            torch.int64
-        ]
-
-        self.support_tensor_dtypes = [
-            torch.int32,
-            torch.int64,
-            torch.float16,
-            torch.float32,
-            torch.float64,
-            torch.bool,
-            torch.complex64,
-            torch.complex128,
-        ]
-        if TEST_BF16:
-            self.support_tensor_dtypes.append(torch.bfloat16)
-
-        if(RUN_NVFUSER):
-            self.cuda_fuser_options = CudaFuserTestOptions()
-
-    def tearDown(self):
-        # restoring skip node to the configuration before tests
-        for op in self.skip_node_list:
-            disabled_flag = torch._C._jit_set_nvfuser_skip_node_kind(op, False)
-            if not disabled_flag:
-                torch._C._jit_set_nvfuser_skip_node_kind(op, True)
-
-        if(RUN_NVFUSER):
-            self.cuda_fuser_options.restore()
-        super(TestCudaFuser, self).tearDown()
-
-    def _run_helper(self, jit_op, op, *args, check_stride=False, num_fusion=1, check_runs=1):
-        seed = 123
-        torch.cuda.manual_seed_all(seed)
-        jit_o = jit_op(*args)
-
-        for i in range(check_runs):
-            torch.cuda.manual_seed_all(seed + i)
-            jit_o = jit_op(*args)
-            torch.cuda.manual_seed_all(seed + i)
-            o = op(*args)
-
-            if type(jit_o) is torch.Tensor:
-                jit_o = [jit_o, ]
-                o = [o, ]
-
-            for oo, jit_oo in zip(o, jit_o):
-                self.assertEqual(oo.dtype, jit_oo.dtype)
-                self.assertEqual(oo, jit_oo)
-                if check_stride:
-                    self.assertEqual(oo.stride(), jit_oo.stride())
-
-        self.assertGraphContainsExactly(jit_op.graph_for(*args), FUSION_GUARD, num_fusion, consider_subgraphs=True)
-
-    def _run_training_helper(self, jit_op, op, grads, *args):
-        torch.cuda.manual_seed_all(123)
-        jit_o = jit_op(*args)
-        jit_g = jit_o.backward(grads)
-        torch.cuda.manual_seed_all(123)
-        jit_o = jit_op(*args)
-        jit_g = jit_o.backward(grads)
-        torch.cuda.manual_seed_all(123)
-        jit_o = jit_op(*args)
-        jit_g = jit_o.backward(grads)
-        torch.cuda.manual_seed_all(123)
-        o = op(*args)
-        g = o.backward(grads)
-        self.assertEqual(o, jit_o)
-        self.assertEqual(g, jit_g)
-        self.assertGraphContainsExactly(jit_op.graph_for(*args), FUSION_GUARD, 1, consider_subgraphs=True)
-        bwd_graph = list(
-            list(jit_op.get_debug_state().execution_plans.values())[
-                0].code.grad_executor_states()[0].execution_plans.values()
-        )[0].graph
-        self.assertGraphContainsExactly(bwd_graph, FUSION_GUARD, 1, consider_subgraphs=True)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_half(self):
-        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, alpha: float):
-            o_16 = torch.add(x, y)
-            o_32_a = torch.add(y, z, alpha=alpha)
-            o_32_b = torch.add(o_16, z)
-            return (o_16, o_32_a, o_32_b)
-
-        t_jit = torch.jit.script(t)
-        alpha = 0.5
-        # stick to integers, this avoid the numerical difference due to our
-        # promotion
-        x = torch.randint(0, 256, (4, 8)).to(dtype=torch.float16, device="cuda")
-        y = torch.randint(0, 256, (4, 8)).to(dtype=torch.float16, device="cuda")
-        z = torch.randint(0, 256, (4, 8)).to(dtype=torch.float16, device="cuda")
-        jit_o = t_jit(x, y, z, alpha)
-        jit_o = t_jit(x, y, z, alpha)
-        o = t(x, y, z, alpha)
-        for oo, jit_oo in zip(o, jit_o):
-            self.assertEqual(oo.dtype, jit_oo.dtype)
-            self.assertEqual(oo, jit_oo)
-        self.assertGraphContains(t_jit.graph_for(x, y, z, alpha), FUSION_GUARD)
-
-
-    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_bfloat(self):
-        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, alpha: float):
-            o_16 = torch.add(x, y)
-            o_32_a = torch.add(y, z, alpha=alpha)
-            o_32_b = torch.add(o_16, z)
-            return (o_16, o_32_a, o_32_b)
-
-        t_jit = torch.jit.script(t)
-        alpha = 0.5
-        # stick to integers, this avoid the numerical difference due to our
-        # promotion
-        x = torch.randint(0, 256, (4, 8)).to(dtype=torch.bfloat16, device="cuda")
-        y = torch.randint(0, 256, (4, 8)).to(dtype=torch.bfloat16, device="cuda")
-        z = torch.randint(0, 256, (4, 8)).to(dtype=torch.bfloat16, device="cuda")
-        jit_o = t_jit(x, y, z, alpha)
-        jit_o = t_jit(x, y, z, alpha)
-        o = t(x, y, z, alpha)
-        for oo, jit_oo in zip(o, jit_o):
-            self.assertEqual(oo.dtype, jit_oo.dtype)
-            self.assertEqual(oo, jit_oo)
-        self.assertGraphContains(t_jit.graph_for(x, y, z, alpha), FUSION_GUARD)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_const(self):
-        def t(x, y):
-            o = x + y
-            o = o + 2.0
-            return o
-        t_jit = torch.jit.script(t)
-        x = torch.randn(4, 8, dtype=torch.float, device="cuda")
-        y = torch.randn(4, 8, dtype=torch.float, device="cuda")
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        o = t(x, y)
-        self.assertEqual(o, jit_o)
-        self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_chunk(self):
-        def t(x, y, z, q):
-            o = x + q
-            x0, x1 = torch.chunk(o, 2)
-            o = x0 + x1
-            o = o + y
-            o = o * z
-            o = torch.relu(o)
-            return o
-        t_jit = torch.jit.script(t)
-        x = torch.randn(4, 8, dtype=torch.float, device="cuda")
-        y = torch.randn(2, 8, dtype=torch.float, device="cuda")
-        z = torch.randn(2, 8, dtype=torch.float, device="cuda")
-        q = torch.randn(4, 8, dtype=torch.float, device="cuda")
-        jit_o = t_jit(x, y, z, q)
-        jit_o = t_jit(x, y, z, q)
-        o = t(x, y, z, q)
-        self.assertEqual(o, jit_o)
-        self.assertGraphContains(t_jit.graph_for(x, y, z, q), FUSION_GUARD)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_reduction_dtypes_axis(self):
-
-        for op in [torch.sum, torch.mean, torch.amax, torch.var, torch.std]:
-            for dtype in [torch.float16, torch.float32, torch.double]:
-                for axis in [-1, 2, 0]:
-                    def make_func(op):
-                        def func(x: torch.Tensor):
-                            o = torch.mul(x, 2.0)
-                            o = op(o, dim=[axis])
-                            return o
-                        return func
-
-                    x = torch.randn(8, 4, 16, dtype=dtype, device="cuda")
-                    t = make_func(op)
-                    t_jit = torch.jit.trace(t, x)
-                    jit_o = t_jit(x)
-                    jit_o = t_jit(x)
-                    o = t(x)
-                    self.assertEqual(o.dtype, jit_o.dtype)
-                    self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4))
-                    self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_variance(self):
-
-        for op in [torch.var, torch.std]:
-            for dtype in [torch.float16, torch.float32, torch.double]:
-                for axis in [-2, -1, 2, 1]:
-                    for unbiased in [False, True]:
-                        def make_func(op):
-                            def func(x: torch.Tensor):
-                                o = torch.mul(x, 2.0)
-                                o = op(o, dim=[axis])
-                                return o
-                            return func
-
-                        x = torch.randn(8, 4, 16, dtype=dtype, device="cuda")
-                        t = make_func(op)
-                        t_jit = torch.jit.trace(t, x)
-                        jit_o = t_jit(x)
-                        jit_o = t_jit(x)
-                        o = t(x)
-                        self.assertEqual(o.dtype, jit_o.dtype)
-                        self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4))
-                        self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_variance_profiling(self):
-        with nvfuser_singleton_fusion(True):
-            for op in [torch.var, torch.std]:
-                for dtype in [torch.float16, torch.float32, torch.double]:
-                    for axis in [-2, -1, 2, 1]:
-                        for unbiased in [False, True]:
-                            for keepdim in [False, True]:
-                                def t(x: torch.Tensor, dim: List[int], unbiased: bool, keepdim: bool):
-                                    o = torch.mul(x, 2.0)
-                                    o = op(o, dim=dim, unbiased=unbiased, keepdim=keepdim)
-                                    return o
-
-                                x = torch.randn(8, 4, 16, dtype=dtype, device="cuda")
-                                t_jit = torch.jit.script(t)
-                                self._run_helper(t_jit, t, x, [axis], unbiased, keepdim, check_stride=False, check_runs=5)
-
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_scalar_input(self):
-        def t(x: torch.Tensor, y: torch.Tensor, z: float):
-            o = x + y
-            o = o + z
-            return o
-        t_jit = torch.jit.script(t)
-        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        y = torch.randn(4, 8, 1, 32, dtype=torch.float, device="cuda")
-        y = y.expand(4, 8, 32, 32)
-        jit_o = t_jit(x, y, 2.0)
-        jit_o = t_jit(x, y, 2.0)
-        o = t(x, y, 2.0)
-        self.assertEqual(o, jit_o)
-        self.assertGraphContains(t_jit.graph_for(x, y, 2.0), FUSION_GUARD)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_broadcasting_0(self):
-
-        def t(x: torch.Tensor, y: torch.Tensor, z: float):
-            o = x + y
-            o = o + z
-            return o
-        t_jit = torch.jit.script(t)
-        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        y = torch.randn(32, 32, dtype=torch.float, device="cuda")
-        jit_o = t_jit(x, y, 2.0)
-        jit_o = t_jit(x, y, 2.0)
-        o = t(x, y, 2.0)
-        self.assertEqual(o, jit_o)
-        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0))
-        self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_broadcasting_1(self):
-
-        def t(x: torch.Tensor, y: torch.Tensor, z: float):
-            o = x + y
-            o = o + z
-            return o
-        t_jit = torch.jit.script(t)
-        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        y = torch.randn(1, 32, 32, dtype=torch.float, device="cuda")
-        jit_o = t_jit(x, y, 2.0)
-        jit_o = t_jit(x, y, 2.0)
-        o = t(x, y, 2.0)
-        self.assertEqual(o, jit_o)
-        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0))
-        self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_broadcasting_2(self):
-
-        def t(x: torch.Tensor, y: torch.Tensor, z: float):
-            o = x + y
-            o = o + z
-            return o
-        t_jit = torch.jit.script(t)
-        x = torch.randn(4, 1, 32, 32, dtype=torch.float, device="cuda")
-        y = torch.randn(8, 32, 32, dtype=torch.float, device="cuda")
-        jit_o = t_jit(x, y, 2.0)
-        jit_o = t_jit(x, y, 2.0)
-        o = t(x, y, 2.0)
-        self.assertEqual(o, jit_o)
-        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0))
-        self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_broadcasting_3(self):
-
-        def t(x: torch.Tensor, y: torch.Tensor, z: float):
-            o = x + y
-            o = o + z
-            return o
-        t_jit = torch.jit.script(t)
-        x = torch.randn(8, 17, 8, dtype=torch.float, device="cuda")
-        y = torch.randn(8, 17, 1, dtype=torch.float, device="cuda")
-        jit_o = t_jit(x, y, 2.0)
-        jit_o = t_jit(x, y, 2.0)
-        o = t(x, y, 2.0)
-        self.assertEqual(o, jit_o)
-        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0))
-        self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False)
-
-    # test_broadcasting_partition_logic_X
-    # Testing partition logic that is capable to avoid creating unsupported
-    # broadcasting semantics in CudaFusionGroup
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_broadcasting_partition_logic_0(self):
-
-        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-            x = x + 12.0
-            o1 = x + y
-            o2 = x + z
-            o = o1 + o2
-            return o
-        t_jit = torch.jit.script(t)
-        x = torch.randn(4, 8, 6, 8, dtype=torch.float32, device="cuda")
-        y = torch.randn(8, 6, 8, dtype=torch.float32, device="cuda")
-        z = torch.randn(6, 8, dtype=torch.float32, device="cuda")
-        jit_o = t_jit(x, y, z)
-        jit_o = t_jit(x, y, z)
-        o = t(x, y, z)
-        self.assertEqual(o, jit_o)
-        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, z))
-        self.assertGraphContainsExactly(subgraph, 'aten::add', 4, consider_subgraphs=False)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_broadcasting_partition_logic_1(self):
-
-        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-            x = x + 12.0
-            o1 = x + y
-            o2 = x + z
-            o = o1 + o2
-            return o
-        t_jit = torch.jit.script(t)
-        x = torch.randn(8, 6, 8, dtype=torch.float32, device="cuda")
-        y = torch.randn(4, 8, 6, 8, dtype=torch.float32, device="cuda")
-        z = torch.randn(4, 1, 6, 8, dtype=torch.float32, device="cuda")
-        jit_o = t_jit(x, y, z)
-        jit_o = t_jit(x, y, z)
-        o = t(x, y, z)
-        self.assertEqual(o, jit_o)
-        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, z))
-        self.assertGraphContainsExactly(subgraph, 'aten::add', 4, consider_subgraphs=False)
-
-    @unittest.skipIf(True, "Broadcast with different output not supported yet")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_broadcasting_multiple_output_shape(self):
-        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-            o = x + 12
-            o1 = o + y
-            o2 = o + z
-            oo = o1.sum() + o2.sum()
-            return oo
-        t_jit = torch.jit.script(t)
-        x = torch.randn(32, 32, dtype=torch.float, device="cuda")
-        y = torch.randn(2, 32, 32, dtype=torch.float, device="cuda")
-        z = torch.randn(4, 32, 32, dtype=torch.float, device="cuda")
-        jit_o = t_jit(x, y, z)
-        jit_o = t_jit(x, y, z)
-        o = t(x, y, z)
-        self.assertEqual(o, jit_o)
-        # Currently cannot fuse this
-        self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
-
-    @unittest.skipIf(True, "broadcast on branches can't be resolved yet")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_broadcasting_multiple_output(self):
-        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-            o = x + 12
-            o1 = o + y
-            o2 = o + z
-            oo = o1.sum() + o2.sum()
-            return oo
-        t_jit = torch.jit.script(t)
-        x = torch.randn(32, 32, dtype=torch.float, device="cuda")
-        y = torch.randn(4, 32, 32, dtype=torch.float, device="cuda")
-        z = torch.randn(4, 32, 32, dtype=torch.float, device="cuda")
-        jit_o = t_jit(x, y, z)
-        jit_o = t_jit(x, y, z)
-        o = t(x, y, z)
-        self.assertEqual(o, jit_o)
-        # Currently cannot fuse this
-        self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
-
-    def _unary_test_helper(self, operation, dtype, random_data):
-        gradient_check = (dtype == torch.float64) and random_data
-        shape = self.special_values.shape
-        torch.cuda.manual_seed_all(211)
-
-        # need additional def of t for boolean ops
-        def t(x: torch.Tensor, y: torch.Tensor):
-            o = x * y
-            o = o + 5e-3
-            o = operation(o)
-            return o
-
-        y = torch.rand(shape, dtype=torch.float32, device="cuda", requires_grad=gradient_check)
-        y = y.to(dtype=dtype)
-
-        if random_data:
-            x = torch.rand(shape, dtype=torch.float32, device="cuda", requires_grad=gradient_check)
-            if dtype in self.int_types:
-                # prefer a larger variance for integer types
-                x = x * 5
-            x = x.to(dtype=dtype)
-        else:
-            x = self.special_values.to(dtype=dtype)
-        try:
-            ref = t(x, y)
-        except Exception:
-            # same way as TE checker, if eager mode throws, ignore this test
-            return
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        if gradient_check:
-            if jit_o.dtype != torch.bool:
-                # bool dtype has no `-`
-                gradcheck(t_jit, [x, y], nondet_tol=1e-5)
-        elif dtype in self.support_tensor_dtypes:
-            self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
-        o = t(x, y)
-        self.assertEqual(o.dtype, jit_o.dtype)
-
-        if dtype == torch.bfloat16:
-            # compare with the actual ground truth for
-            #  bfloat16 kernels instead of eager mode
-            #  implementation, since mismatch in cast
-            #  adds excessive noise.
-            o = t(x.to(torch.float64), y.to(torch.float64))
-            if o.dtype.is_floating_point:
-                o = o.to(torch.bfloat16)
-        else:
-            o = t(x, y)
-
-        self.assertTrue(self._compare("failing case {}\n{}\n{}\n{}".format(dtype, operation, x, y), o, jit_o, 1e-2))
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_unary_ops(self):
-        data_types = [
-            *self.int_types,
-            torch.float16,
-            torch.float32,
-            torch.float64,
-            # TODO: revert this
-            # see issue https://github.com/csarofeen/pytorch/issues/1730"
-            # torch.cfloat,
-            # torch.cdouble,
-        ]
-        if TEST_BF16:
-            data_types.append(torch.bfloat16)
-        operations = [torch.neg,
-                      torch.abs,
-                      torch.log,
-                      torch.log10,
-                      torch.log1p,
-                      torch.log2,
-                      torch.lgamma,
-                      torch.exp,
-                      torch.expm1,
-                      torch.erf,
-                      torch.erfc,
-                      torch.cos,
-                      torch.acos,
-                      torch.cosh,
-                      torch.sin,
-                      torch.asin,
-                      torch.sinh,
-                      torch.tan,
-                      torch.atan,
-                      torch.sqrt,
-                      torch.rsqrt,
-                      torch.ceil,
-                      torch.floor,
-                      torch.round,
-                      torch.trunc,
-                      torch.frac,
-                      torch.reciprocal,
-                      torch.isfinite,
-                      torch.isinf,
-                      torch.isnan,
-                      torch.isneginf,
-                      torch.isposinf,
-                      torch.isreal,
-                      torch.nn.functional.softplus,
-                      torch.nn.functional.gelu,
-                      torch.nn.functional.leaky_relu,
-                      torch.nn.functional.silu,
-                      torch.relu,
-                      torch.sigmoid,
-                      torch.bitwise_not,
-                      torch.tan,
-                      torch.tanh]
-        skip_complex = {torch.rsqrt, torch.reciprocal}
-        for op, dtype in itertools.product(operations, data_types):
-            if dtype.is_complex and op in skip_complex:
-                continue
-            self._unary_test_helper(op, dtype, False)  # test special numbers
-            self._unary_test_helper(op, dtype, True)  # test random data
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_category_rule(self):
-        def run_tensor(x, z):
-            def t(x: torch.Tensor, z: torch.Tensor):
-                o = x + z
-                o = torch.abs(o)
-                return o
-            t_jit = torch.jit.script(t)
-            jit_o = t_jit(x, z)
-            jit_o = t_jit(x, z)
-            o = t(x, z)
-            self.assertEqual(o.dtype, jit_o.dtype)
-            self.assertEqual(o, jit_o)
-            self.assertGraphContains(t_jit.graph_for(x, z), FUSION_GUARD)
-
-        def run_scalar(x, z):
-            def t(x: torch.Tensor, z: float):
-                o = x + z
-                o = torch.abs(o)
-                return o
-            t_jit = torch.jit.script(t)
-            jit_o = t_jit(x, z)
-            jit_o = t_jit(x, z)
-            o = t(x, z)
-            self.assertEqual(o.dtype, jit_o.dtype)
-            self.assertEqual(o, jit_o)
-            self.assertGraphContains(t_jit.graph_for(x, z), FUSION_GUARD)
-
-        # n-dim with 0-dim (no type-promote)
-        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        z = torch.tensor(2.0, dtype=torch.double, device="cuda")
-        run_tensor(x, z)
-
-        # n-dim with 0-dim (type-promote)
-        x = torch.randn(4, 8, 32, 32, device="cuda").to(dtype=torch.long)
-        z = torch.tensor(2.0, dtype=torch.double, device="cuda")
-        run_tensor(x, z)
-
-        # n-dim with n-dim (type-promote)
-        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        z = torch.randn(4, 8, 32, 32, dtype=torch.double, device="cuda")
-        run_tensor(x, z)
-
-        # n-dim with scalar (no type-promote)
-        x = torch.randn(4, 8, 32, 32, dtype=torch.float16, device="cuda")
-        z = torch.tensor(3., dtype=torch.double)
-        run_scalar(x, z)
-        if TEST_BF16:
-            # n-dim with scalar (no type-promote)
-            x = torch.randn(4, 8, 32, 32, dtype=torch.bfloat16, device="cuda")
-            z = torch.tensor(3., dtype=torch.double)
-            run_scalar(x, z)
-
-        # n-dim with scalar (type-promote)
-        x = torch.randn(4, 8, 32, 32, device="cuda").to(dtype=torch.long)
-        z = torch.tensor(3., dtype=torch.double)
-        run_scalar(x, z)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_unary_bitwise(self):
-        def bit_not(x: torch.Tensor):
-            return ~(x + 1)
-
-        jitted = torch.jit.script(bit_not)
-        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(torch.long)
-        jit_o = jitted(x)
-        jit_o = jitted(x)
-        o = bit_not(x)
-        self.assertEqual(o, jit_o)
-        jitted.graph_for(x)  # Shows up in second instance, not first
-        self.assertGraphContains(jitted.graph_for(x), FUSION_GUARD)
-
-        def bool_not(x: torch.Tensor, y: torch.Tensor):
-            return ~(x & y)
-
-        jitted = torch.jit.script(bool_not)
-        x = torch.rand(4, 8, 32, 32, dtype=torch.float, device="cuda").round().to(torch.bool)
-        y = torch.rand(4, 8, 32, 32, dtype=torch.float, device="cuda").round().to(torch.bool)
-        jit_o = jitted(x, y)
-        jit_o = jitted(x, y)
-        o = bool_not(x, y)
-        self.assertEqual(o, jit_o)
-        jitted.graph_for(x, y)  # Shows up in second instance, not first
-        self.assertGraphContains(jitted.graph_for(x, y), FUSION_GUARD)
-
-    def _get_scalar_binary_test_fn(self, category_and_type1, category_and_type2, operation):
-        category1, dtype_arg1 = category_and_type1
-        category2, dtype_arg2 = category_and_type2
-
-        def t_intx_tensory(x: int, y: torch.Tensor):
-            o = operation(x, y)
-            o = 2 + o
-            return o
-
-        def t_doublex_tensory(x: float, y: torch.Tensor):
-            o = operation(x, y)
-            o = 2 + o
-            return o
-
-        def t_cdoublex_tensory(x: complex, y: torch.Tensor):
-            o = operation(x, y)
-            o = 2 + o
-            return o
-
-        # Omit both scalar cases and swap cases
-        assert category1 == "scalar" and category2 != "scalar"
-        if dtype_arg1.is_floating_point:
-            return t_doublex_tensory
-        if dtype_arg1 == torch.int64 or dtype_arg1 == torch.int32:
-            return t_intx_tensory
-        if dtype_arg1.is_complex or dtype_arg1 == torch.int32:
-            return t_cdoublex_tensory
-        raise NotImplementedError
-
-    def _binary_test_helper(self, operation, dtypes, random_data, categories="ndim"):
-        if isinstance(dtypes, tuple):
-            dtype_arg1, dtype_arg2 = dtypes
-        else:
-            dtype_arg1 = dtype_arg2 = dtypes
-
-        if isinstance(categories, tuple) and random_data:
-            category1, category2 = categories
-        elif not random_data:
-            category1 = category2 = "ndim"
-        else:
-            category1 = category2 = categories
-
-        def is_cpu_category(x):
-            return x == "0dimcpu" or x == "scalar"
-
-        # skip unsupported cases
-        if is_cpu_category(category1) and is_cpu_category(category2):
-            return
-
-        # only test cases with first operand as scalar
-        if category2 == "scalar":
-            return
-
-        # skip ops that doesn't support scalar inputs in eager
-        if operation in [
-            torch.atan2,
-            torch.max,
-            torch.min,
-            torch.remainder,  # unsupported in nvfuser
-        ]:
-            if category1 == "scalar" or category2 == "scalar":
-                return
-
-        if operation in [
-            torch.fmod,
-            torch.eq,
-            torch.ne,
-            torch.ge,
-            torch.gt,
-            torch.le,
-            torch.lt
-        ]:
-            if category1 == "scalar":
-                return
-
-        # operators that does not support bfloat16
-        if operation in [torch.fmod]:
-            if dtype_arg1 == torch.bfloat16 or dtype_arg2 == torch.bfloat16:
-                return
-
-        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-            o = operation(x, y)
-            o = o + z
-            return o
-
-        shape = (4, 32, 32)
-
-        shapex = shape if category1 == "ndim" else ()
-        shapey = shape if category2 == "ndim" else ()
-
-        if random_data:
-            x = (torch.randn(shapex, dtype=torch.float, device="cuda") * 5).to(dtype_arg1)
-            y = (torch.randn(shapey, dtype=torch.float, device="cuda") * 5).to(dtype_arg2)
-        else:
-            x = self.special_values.to(dtype=dtype_arg1)
-            y = (torch.rand_like(self.special_values) * 5).to(dtype_arg2)
-
-        r"""
-            Category conversion
-        """
-        has_scalar = False
-        if category1 == "scalar":
-            has_scalar = True
-            x = x.item()
-
-        if category1 == "0dimcpu":
-            x = x.to(device="cpu")
-
-        if category2 == "scalar":
-            has_scalar = True
-            y = y.item()
-
-        if category2 == "0dimcpu":
-            y = y.to(device="cpu")
-
-        z = torch.tensor([2], device="cuda").to(dtype_arg1)
-        is_dtype_arg1_int = dtype_arg1 == torch.int32 or dtype_arg1 == torch.int64
-        is_dtype_arg2_int = dtype_arg2 == torch.int32 or dtype_arg2 == torch.int64
-
-        if operation in [torch.pow]:
-            if is_dtype_arg1_int and is_dtype_arg2_int:
-                if category2 == "scalar":
-                    # RuntimeError: Integers to negative integer powers are not allowed
-                    y = abs(y)
-                if category2 == "0dimcpu" and y == -1:
-                    # https://github.com/pytorch/pytorch/issues/73196
-                    y = y - 1
-                if category2 == "0dimcpu" and y == -2:
-                    # avoid pow(0, -2), which gives inconsistent results on integer tensor
-                    y = y - 1
-
-        # Avoid division by zero for integer tensors
-        div_like = [torch.div, torch.fmod, torch.remainder]
-        if operation in div_like and (dtype_arg2 == torch.int32 or dtype_arg2 == torch.int64):
-            y[y == 0] = 1
-
-        test_value = True
-        if dtype_arg1 == torch.half or dtype_arg2 == torch.half:
-            test_value = False
-        if dtype_arg1 == torch.bfloat16 or dtype_arg2 == torch.bfloat16:
-            test_value = False
-
-        try:
-            if not has_scalar:
-                o = t(x, y, z)
-                t_jit = torch.jit.script(t)
-                jit_o = t_jit(x, y, z)
-                jit_o = t_jit(x, y, z)
-                jit_o = t_jit(x, y, z)
-
-                self.assertEqual(o.dtype, jit_o.dtype)
-                if test_value:
-                    self.assertEqual(o, jit_o)
-                self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
-
-            elif category2 != "scalar":  # only test the case where first is scalar
-                test_fn = self._get_scalar_binary_test_fn((category1, dtype_arg1), (category2, dtype_arg2), operation)
-                o = test_fn(x, y)
-                t_jit = torch.jit.script(test_fn)
-                jit_o = t_jit(x, y)
-                jit_o = t_jit(x, y)
-                jit_o = t_jit(x, y)
-
-                self.assertEqual(o.dtype, jit_o.dtype)
-                if test_value:
-                    self.assertEqual(o, jit_o)
-                self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
-        except Exception as e:
-            print("failing test for op: ", operation.__name__)
-            print("with input\n\tx: ", x)
-            print("\ty: ", y)
-            print("\tz: ", z)
-            raise e
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_binary_ops(self):
-        data_types = [
-            torch.int32,
-            torch.int64,
-            torch.float16,
-            torch.float32,
-            torch.float64,
-        ]
-        if TEST_BF16:
-            data_types.append(torch.bfloat16)
-        operations = [torch.mul,
-                      torch.div,
-                      torch.atan2,
-                      torch.max,
-                      torch.min,
-                      torch.pow,
-                      torch.remainder,
-                      torch.fmod,
-                      torch.eq,
-                      torch.ne,
-                      torch.ge,
-                      torch.gt,
-                      torch.le,
-                      torch.lt]
-
-        category_types = [
-            "scalar",
-            "0dim",
-            "0dimcpu",
-            "ndim"
-        ]
-
-        binary_dtype_combinations = list(itertools.combinations(data_types, 2))
-        category_combinations = list(itertools.combinations(category_types, 2))
-
-        for op, dtypes, categories in itertools.product(operations, binary_dtype_combinations, category_combinations):
-            self._binary_test_helper(op, dtypes, True, categories)  # random data
-
-        for op, dtypes in itertools.product(operations, binary_dtype_combinations):
-            self._binary_test_helper(op, dtypes, False)  # special numbers
-
-    # TODO: revert this
-    @unittest.skipIf(True, "see issue https://github.com/csarofeen/pytorch/issues/1730")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_binary_ops_complex(self):
-        data_types = [torch.cfloat, torch.cdouble]
-        operations = [torch.mul, torch.div, torch.pow, torch.eq, torch.ne]
-
-        category_types = [
-            "scalar",
-            "0dim",
-            "0dimcpu",
-            "ndim"
-        ]
-
-        binary_dtype_combinations = list(itertools.combinations(data_types, 2))
-        category_combinations = list(itertools.combinations(category_types, 2))
-
-        for op, dtypes, categories in itertools.product(operations, binary_dtype_combinations, category_combinations):
-            self._binary_test_helper(op, dtypes, True, categories)  # random data
-
-        for op, dtypes in itertools.product(operations, binary_dtype_combinations):
-            self._binary_test_helper(op, dtypes, False)  # special numbers
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_binary_bitwise(self):
-        dtypes = [torch.bool, torch.int32, torch.int64]
-
-        for dtype1, dtype2, dtype3 in itertools.product(dtypes, repeat=3):
-            def jit_and(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-                return torch.bitwise_and(x, y) & z
-
-            def jit_or(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-                return torch.bitwise_or(x, y) | z
-
-            def jit_xor(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-                return torch.bitwise_xor(x, y) ^ z
-
-            def jit_lshift(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-                return torch.bitwise_left_shift(x, y) << z
-
-            def jit_rshift(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-                return torch.bitwise_right_shift(x, y) >> z
-
-            for jit_func in [jit_and, jit_or, jit_xor, jit_lshift, jit_rshift]:
-                if torch.bool in {dtype1, dtype2, dtype3} and jit_func in {jit_lshift, jit_rshift}:
-                    continue
-                x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(dtype1)
-                y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(dtype2)
-                z = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(2).to(dtype3)
-
-                jitted = torch.jit.script(jit_func)
-                jit_o = jitted(x, y, z)
-                jit_o = jitted(x, y, z)
-                o = jit_func(x, y, z)
-                self.assertEqual(o, jit_o)
-                self.assertGraphContains(jitted.graph_for(x, y, z), FUSION_GUARD)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_type_as_op(self):
-        def t(x: torch.Tensor, y: torch.Tensor, z: float):
-            o = torch.lt(x, z)
-            o = o.type_as(y)
-            return o
-        t_jit = torch.jit.script(t)
-        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        jit_o = t_jit(x, y, 0.5)
-        jit_o = t_jit(x, y, 0.5)
-        o = t(x, y, 0.5)
-        self.assertEqual(o, jit_o)
-        self.assertGraphContains(t_jit.graph_for(x, y, 0.5), FUSION_GUARD)
-
-    def _ternary_integer_test_helper(self, dtype_arg1):
-        shape = (4, 8, 32, 32)
-        magnitude = 100
-        if (dtype_arg1 in self.int_types):
-            x = torch.randint(-magnitude, magnitude, shape, dtype=dtype_arg1, device="cuda")
-        else:
-            x = torch.randn(shape, dtype=dtype_arg1, device="cuda") * magnitude
-        arg2 = int(0)
-        arg3 = int(magnitude * 0.1)
-
-        def clamp0(x: torch.Tensor, f: int):
-            o = 2. * torch.clamp(x, min=f)
-            return o
-        clamp0_jit = torch.jit.script(clamp0)
-        self._run_helper(clamp0_jit, clamp0, x, arg2)
-
-        def clamp1(x: torch.Tensor, f: int, ff: int):
-            o = 2. * torch.clamp(x, min=f, max=ff)
-            return o
-        clamp1_jit = torch.jit.script(clamp1)
-        self._run_helper(clamp1_jit, clamp1, x, arg2, arg3)
-
-        def clamp2(x: torch.Tensor, f: float, ff: int):
-            o = 2. * torch.clamp(x, min=f, max=ff)
-            return o
-        clamp2_jit = torch.jit.script(clamp2)
-        self._run_helper(clamp2_jit, clamp2, x, float(arg2), arg3)
-
-        def clamp3(x: torch.Tensor, f: int, ff: float):
-            o = 2. * torch.clamp(x, min=f, max=ff)
-            return o
-        clamp3_jit = torch.jit.script(clamp3)
-        self._run_helper(clamp3_jit, clamp3, x, arg2, float(arg3))
-
-        def threshold(x: torch.Tensor, th: int, val: int):
-            o = 2. * torch.threshold(x, th, val)
-            return o
-        threshold_jit = torch.jit.script(threshold)
-        self._run_helper(threshold_jit, threshold, x, arg2, arg3)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_ternary_ops_integer_compatibility(self):
-        data_types = [
-            torch.float16,
-            torch.float32,
-            torch.float64
-        ]
-        for dtype in data_types:
-            self._ternary_integer_test_helper(dtype)
-
-    def _ternary_test_helper(self, operation, dtypes, random_data):
-        if isinstance(dtypes, tuple):
-            dtype_arg1, dtype_arg2, dtype_arg3 = dtypes
-        else:
-            dtype_arg1 = dtype_arg2 = dtype_arg3 = dtypes
-
-        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, alpha: torch.Tensor):
-            o = operation(x, y, z)
-            o = o + alpha
-            return o
-
-        shape = (4, 32, 32)
-        if operation is torch.where:
-            dtype_arg1 = torch.bool
-            if random_data:
-                x = torch.randint(0, 2, shape).to(dtype=torch.bool, device="cuda")
-                y = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg2)
-                z = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg3)
-            else:
-                x = torch.randint(0, 2, self.special_values.size()).to(dtype=torch.bool, device="cuda")
-                y = self.special_values.to(dtype=dtype_arg2)
-                z = (torch.rand_like(self.special_values) * 5).to(dtype_arg3)
-        elif random_data:
-            x = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg1)
-            y = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg2)
-            z = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg3)
-        else:
-            x = self.special_values.to(dtype=dtype_arg1)
-            y = (torch.rand_like(self.special_values) * 5).to(dtype_arg2)
-            z = (torch.rand_like(self.special_values) * 5).to(dtype_arg3)
-        alpha = torch.tensor([2], device="cuda").to(dtype_arg1)
-
-        o = t(x, y, z, alpha)
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y, z, alpha)
-        jit_o = t_jit(x, y, z, alpha)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o, jit_o)
-        self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_ternary_ops_type_promotion(self):
-        # TODO: update accuracy tolerance for bf16 / fp16 data types
-        data_types = [
-            # torch.float16,
-            torch.float32,
-            torch.float64
-        ]
-        '''
-        if TEST_BF16:
-            data_types.append(torch.bfloat16)
-        '''
-        # TODO: Add Tensor support for clamp
-        operations = [torch.clamp]
-        ternary_dtype_combinations = itertools.combinations(data_types, 3)
-        for op, dtypes in itertools.product(operations, ternary_dtype_combinations):
-            self._ternary_test_helper(op, dtypes, True)  # random data
-            self._ternary_test_helper(op, dtypes, False)  # special numbers
-
-    # We can't test the scalar version of rsub from python
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective")
-    def test_rsub(self):
-        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-
-        def rsub(x: torch.Tensor, y: torch.Tensor):
-            o = torch.rsub(x, y)
-            o = o * 2.
-            return o
-
-        rsub_jit = torch.jit.script(rsub)
-        self._run_helper(rsub_jit, rsub, x, y)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    # legacy fuser does not work for rand_like, see issue #34361
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective")
-    def test_ternary_ops(self):
-        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        z = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        cond = torch.randint(0, 2, (4, 8, 32, 32)).to(dtype=torch.bool, device="cuda")
-
-        def add(x: torch.Tensor, other: torch.Tensor, alpha: float):
-            o = torch.relu(x)
-            o = torch.add(o, other=other, alpha=alpha)
-            return o
-        add_jit = torch.jit.script(add)
-        self._run_helper(add_jit, add, x, y, 2.0)
-
-        def clamp0(x: torch.Tensor, f: float):
-            o = 2. * torch.clamp(x, min=f)
-            return o
-        clamp0_jit = torch.jit.script(clamp0)
-        self._run_helper(clamp0_jit, clamp0, x, 0.5)
-
-        def clamp1(x: torch.Tensor, f: float, ff: float):
-            o = 2. * torch.clamp(x, min=f, max=ff)
-            return o
-        clamp1_jit = torch.jit.script(clamp1)
-        self._run_helper(clamp1_jit, clamp1, x, -0.2, 0.7)
-
-        def threshold(x: torch.Tensor, th: float, val: float):
-            o = 2. * torch.threshold(x, th, val)
-            return o
-        threshold_jit = torch.jit.script(threshold)
-        self._run_helper(threshold_jit, threshold, x, 0.2, 0.9)
-
-        def where(x: torch.Tensor, y: torch.Tensor, cond: torch.Tensor):
-            o = 2. * torch.where(cond, x, y)
-            return o
-        where_jit = torch.jit.script(where)
-        self._run_helper(where_jit, where, x, y, cond)
-
-        def lerp(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-            o = 2. * torch.lerp(x, y, z)
-            return o
-        lerp_jit = torch.jit.script(lerp)
-        self._run_helper(lerp_jit, lerp, x, y, z)
-
-        def lerp_scale(x: torch.Tensor, y: torch.Tensor, z: float):
-            o = 2. * torch.lerp(x, y, z)
-            return o
-        lerp_scale_jit = torch.jit.script(lerp_scale)
-        self._run_helper(lerp_scale_jit, lerp_scale, x, y, 0.5)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires profiling node to run cuda fuser")
-    def test_addcmul_ops(self):
-        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        z = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-
-        def addcmul(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, value: float):
-            o = torch.add(x, 0.5)
-            o = torch.addcmul(o, y, z, value=value)
-            return o
-        addcmul_jit = torch.jit.script(addcmul)
-        self._run_helper(addcmul_jit, addcmul, x, y, z, 2.0)
-
-        def addcmul_no_alpha(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-            o = torch.add(x, 0.5)
-            o = torch.addcmul(o, y, z)
-            return o
-        addcmul_no_alpha_jit = torch.jit.script(addcmul_no_alpha)
-        self._run_helper(addcmul_no_alpha_jit, addcmul_no_alpha, x, y, z)
-
-        def addcmul_const_alpha(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-            o = torch.add(x, 0.5)
-            o = torch.addcmul(o, y, z, value=0.75)
-            return o
-        addcmul_const_alpha_jit = torch.jit.script(addcmul_const_alpha)
-        self._run_helper(addcmul_const_alpha_jit, addcmul_const_alpha, x, y, z)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_dynamic_size(self):
-        old_guard = torch._C._jit_set_nvfuser_guard_mode(True)
-        torch._C._jit_set_bailout_depth(20)
-
-        def t(x: torch.Tensor, y: torch.Tensor, z: float):
-            o = x + y
-            o = o + z
-            return o
-        t_jit = torch.jit.script(t)
-        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
-        y = torch.randn(32, 32, dtype=torch.float, device="cuda")
-        jit_o = t_jit(x, y, 2.0)
-        jit_o = t_jit(x, y, 2.0)
-        o = t(x, y, 2.0)
-        self.assertEqual(o, jit_o)
-        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0))
-        self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False)
-
-        # this test is not ideal, as we rely on the bailout to test it and we
-        # don't know a way to verify the bailout graph to validate the proper
-        # fusion.
-        x = torch.randn(8, 32, 16, 8, dtype=torch.float, device="cuda")
-        y = torch.randn(16, 8, dtype=torch.float, device="cuda")
-        jit_o = t_jit(x, y, 2.0)
-        jit_o = t_jit(x, y, 2.0)
-        o = t(x, y, 2.0)
-        self.assertEqual(o, jit_o)
-        self.assertGraphContains(t_jit.graph_for(x, y, 2.0), FUSION_GUARD)
-        x = torch.randn(8, 17, 8, dtype=torch.float, device="cuda")
-        y = torch.randn(8, 17, 1, dtype=torch.float, device="cuda")
-        jit_o = t_jit(x, y, 2.0)
-        jit_o = t_jit(x, y, 2.0)
-        o = t(x, y, 2.0)
-        self.assertEqual(o, jit_o)
-        self.assertGraphContains(t_jit.graph_for(x, y, 2.0), FUSION_GUARD)
-        torch._C._jit_set_nvfuser_guard_mode(old_guard)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_random_topo(self):
-        os.environ["PYTORCH_NVFUSER_DISABLE_FALLBACK"] = "1"
-        self.assertTrue(runDefaultTestWithSeed(28449))
-
-    def _compare(self, desc, inp1, inp2, error):
-        a = inp1.clone()
-        b = inp2.clone()
-        close = torch.allclose(a, b, rtol=error, atol=error, equal_nan=True)
-        if not close:
-            print(desc, close)
-            z = a - b
-            index = (torch.abs(z) >= error + error * torch.abs(b)).nonzero()
-            print("dif    : ", z[index])
-            print("inp1   : ", a[index])
-            print("inp2   : ", b[index])
-            print("maximum difference", z[index].max())
-        return close
-
-    # Permutation helper that applies binary operation between two tensors:
-    #   1. applies separate permutation `perm0` & `perm1` to two inputs
-    #   2. reduce dimension `broadcast_axis` of operand two to size 1
-    # The purpose of this test is to ensure permutation works well in
-    # complicated cases with arbitrary stride order and broadcasting dimensions
-    def _permutation_helper(self, sizes, broadcast_axis, dtype, device, perm0, perm1):
-        def t(x: torch.Tensor, y: torch.Tensor):
-            o = torch.add(x, y)
-            o = torch.relu(o)
-            return o
-
-        x = torch.randn([sizes[i] for i in perm0], dtype=dtype, device=device).permute(
-            [perm0.index(i) for i in range(len(sizes))])
-        if broadcast_axis >= 0:
-            sizes[broadcast_axis] = 1
-        y = torch.randn([sizes[i] for i in perm1], dtype=dtype, device=device).permute(
-            [perm1.index(i) for i in range(len(sizes))])
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        o = t(x, y)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o, jit_o)
-        self.assertEqual(o.stride(), jit_o.stride())
-        self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
-
-    # end-2-end test of permutation & contiguity handling in integration.
-    # we are testing inputs with all combination of permutation order, just to
-    # ensure that integration would be able to generate functionally correct
-    # kernels
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_binary_ops_permutation(self):
-        # note that num_dim is exclusive from len(x), so we are not reducing
-        # to single element (codegen limitation at this moment)
-        x = [7, 8, 12]
-        b_axes = range(-1, len(x))
-        for b_axis in b_axes:
-            for perm0 in itertools.permutations(range(len(x))):
-                for perm1 in itertools.permutations(range(len(x))):
-                    x = [7, 8, 12]
-                    self._permutation_helper(x, b_axis, torch.float32, "cuda", perm0, perm1)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_binary_ops_channels_last_with_bcast(self):
-        device = "cuda"
-        x = torch.randn([4, 3, 2, 5], device=device).to(memory_format=torch.channels_last)
-        w = torch.randn([2, 5], device=device)
-
-        def t(x: torch.Tensor, b: torch.Tensor):
-            o = x + b
-            return torch.relu(o)
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, w)
-        jit_o = t_jit(x, w)
-        jit_o = t_jit(x, w)
-        o = t(x, w)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4))
-        self.assertGraphContains(t_jit.graph_for(x, w), FUSION_GUARD)
-
-    def _reduction_helper(self, sizes, reduction_axis, dtype, device, perm0, perm1, keepdim=False):
-        class MyReduction(torch.nn.Module):
-            __constants__ = ['reduction_axis', 'keepdim']
-
-            def __init__(self):
-                super(MyReduction, self).__init__()
-                self.reduction_axis = reduction_axis
-                self.keepdim = keepdim
-
-            def forward(self, x: torch.Tensor, y: torch.Tensor):
-                o = torch.add(x, y)
-                o = torch.sum(o, dim=self.reduction_axis, keepdim=self.keepdim)
-                return o
-
-        t = MyReduction()
-
-        x = torch.randn([sizes[i] for i in perm0], dtype=dtype, device=device).permute(
-            [perm0.index(i) for i in range(len(sizes))])
-        y = torch.randn([sizes[i] for i in perm1], dtype=dtype, device=device).permute(
-            [perm1.index(i) for i in range(len(sizes))])
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        o = t(x, y)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        # numerical issues here due to our scheduling.
-        # can't use `self.assertEqual(o, jit_o)`
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4))
-        self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_reduction(self):
-        for x in ([7, 8, 12], [12, 8, 7, 9, 15], [128, 16, 8, 32]):
-            # note that num_dim is exclusive from len(x), so we are not reducing
-            # to single element (codegen limitation at this moment)
-            for num_reduce_dim in range(1, len(x)):
-                for axes in itertools.combinations(range(len(x)), num_reduce_dim):
-                    for keepdim in (True, False):
-                        perm0 = range(len(x))
-                        perm1 = range(len(x))
-                        self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1, keepdim)
-
-    def _layer_norm_autodiff_helper(self, model, grad, shapes, args):
-        jit_model = torch.jit.script(model)
-
-        eps = np.random.random() * 1e-4
-        use_cudnn = bool(np.random.randint(0, 2))
-
-        # profile/optimization runs
-        for i in range(3):
-            jit_o = jit_model(shapes, *args, eps, use_cudnn)
-            jit_o.backward(grad)
-
-        ref_args = [t.detach().clone().requires_grad_() for t in args]
-        [t.grad.zero_() for t in args]
-        jit_o = jit_model(shapes, *args, eps, use_cudnn)
-        jit_o.backward(grad)
-
-        o = model(shapes, *ref_args, eps, use_cudnn)
-        o.backward(grad)
-        self.assertEqual(jit_o, o)
-        for arg, ref_arg in zip(args, ref_args):
-            self.assertEqual(arg.grad, ref_arg.grad)
-
-        # check fusion in fw & bw
-        g = jit_model.graph_for(shapes, *args, eps, use_cudnn)
-        for node in g.nodes():
-            n = node
-        dbg_state = jit_model.get_debug_state()
-        for val in dbg_state.execution_plans.values():
-            v = val
-        state2 = v.code.grad_executor_states()
-        for val in state2[0].execution_plans.values():
-            v2 = val
-        FileCheck().check(FUSION_GUARD).run(g)
-        FileCheck().check(FUSION_GUARD).run(v2.graph)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_layer_norm_autodiff(self):
-        def t_wb(shapes: List[int], x, w, b, eps: float, cudnn: bool):
-            o = torch.layer_norm(x, shapes, w, b, eps, cudnn)
-            o = torch.relu(o)
-            return o
-
-        def t_w(shapes: List[int], x, w, eps: float, cudnn: bool):
-            o = torch.layer_norm(x, shapes, w, None, eps, cudnn)
-            o = torch.relu(o)
-            return o
-
-        def t_b(shapes: List[int], x, b, eps: float, cudnn: bool):
-            o = torch.layer_norm(x, shapes, None, b, eps, cudnn)
-            o = torch.relu(o)
-            return o
-
-        def t(shapes: List[int], x, eps: float, cudnn: bool):
-            o = torch.layer_norm(x, shapes, None, None, eps, cudnn)
-            o = torch.relu(o)
-            return o
-
-        model = {3: t_wb, 2: t_w, 1: t_b, 0: t}
-
-        for w, b in itertools.product([True, False], repeat=2):
-            batch = [2]
-            # note: awkward shape here to avoid vectorized fast kernel, which is
-            # buggy in aten
-            shapes = [2, 7, 3]
-            m = model[w * 2 + b]
-
-            grad = torch.randn(batch + shapes, dtype=torch.float32, device="cuda")
-            args = [torch.randn(batch + shapes, dtype=torch.float32, device="cuda").requires_grad_()]
-            if w:
-                args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_())
-            if b:
-                args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_())
-            self._layer_norm_autodiff_helper(m, grad, shapes, args)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_layer_norm_parser(self):
-        dtype = torch.float32
-        device = "cuda"
-        x = torch.randn([4, 4, 2], dtype=dtype, device=device)
-        w = torch.randn([4, 2], dtype=dtype, device=device)
-        b = torch.randn([4, 2], dtype=dtype, device=device)
-
-        def t(x: torch.Tensor, w: torch.Tensor, b: torch.Tensor):
-            o = torch.relu(x)
-            o = torch.layer_norm(o, [4, 2], w, b, 1e-5)
-            return o
-
-        o = t(x, w, b)
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, w, b)
-        jit_o = t_jit(x, w, b)
-        o = t(x, w, b)
-        self.assertGraphContains(t_jit.graph_for(x, w, b), FUSION_GUARD)
-
-    def _native_layer_norm_helper(self, shape, norm_shape, dtype, device, error, affine=True):
-        class MyLayerNorm(torch.nn.Module):
-            __constants__ = ['norm_shape']
-
-            def __init__(self, elementwise_affine=True):
-                super(MyLayerNorm, self).__init__()
-                self.norm_shape = norm_shape
-                if elementwise_affine:
-                    self.weight = torch.randn(norm_shape, dtype=dtype, device=device)
-                    self.bias = torch.randn(norm_shape, dtype=dtype, device=device)
-                    with torch.no_grad():
-                        self.weight.fill_(1)
-                        self.bias.fill_(0)
-                else:
-                    self.weight = None
-                    self.bias = None
-
-            def forward(self, x: torch.Tensor):
-                o = torch.relu(x)
-                o = torch.native_layer_norm(o, self.norm_shape, self.weight, self.bias, 1e-5)
-                return o
-
-        t = MyLayerNorm(affine)
-
-        x = torch.randn(shape, dtype=dtype, device=device)
-        t_jit = torch.jit.script(t)
-        jit_o, jit_mean, jit_rstd = t_jit(x)
-        jit_o, jit_mean, jit_rstd = t_jit(x)
-        o, mean, rstd = t(x)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        # numerical issues here due to our scheduling.
-        # can't use `self.assertEqual(o, jit_o)`
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
-        self.assertTrue(self._compare("comparing mean failed", mean, jit_mean, error))
-        self.assertTrue(self._compare("comparing rstd failed", rstd, jit_rstd, error))
-        self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_native_layer_norm(self):
-        dims = 4
-        rnds = 3
-        for idx in range(rnds):
-            for offset in range(1, dims):
-                for affine in (True, False):
-                    input_shape = [random.randint(10, 30) for idx in range(dims)]
-                    norm_shape = [input_shape[idx] for idx in range(dims - offset, dims)]
-                    self._native_layer_norm_helper(input_shape, norm_shape, torch.float32, "cuda", 1e-4, affine)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_native_layer_norm_half(self):
-        dims = 4
-        rnds = 3
-        for idx in range(rnds):
-            for offset in range(1, dims):
-                input_shape = [random.randint(10, 30) for idx in range(dims)]
-                norm_shape = [input_shape[idx] for idx in range(dims - offset, dims)]
-                self._native_layer_norm_helper(input_shape, norm_shape, torch.float16, "cuda", 5e-3)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
-    def test_native_layer_norm_bfloat(self):
-        dims = 4
-        rnds = 3
-        for idx in range(rnds):
-            for offset in range(1, dims):
-                input_shape = [random.randint(10, 30) for idx in range(dims)]
-                norm_shape = [input_shape[idx] for idx in range(dims - offset, dims)]
-                self._native_layer_norm_helper(input_shape, norm_shape, torch.bfloat16, "cuda", 1e-1)
-
-    def _norm_helper(self,
-                     shape,
-                     dtype,
-                     device,
-                     error,
-                     is_batch_norm_else_instance_norm,
-                     memory_format=torch.contiguous_format,
-                     *,
-                     layer_dtype=torch.float32):
-        class MyBatchNorm(torch.nn.Module):
-            def __init__(self):
-                super(MyBatchNorm, self).__init__()
-
-            def forward(self, x: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor):
-                o = torch.nn.functional.batch_norm(x, r_mean, r_var, training=True)
-                o = torch.relu(o)
-                return o
-
-        class MyInstanceNorm(torch.nn.Module):
-            def __init__(self):
-                super(MyInstanceNorm, self).__init__()
-
-            def forward(self, x: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor):
-                o = torch.nn.functional.instance_norm(x, r_mean, r_var, use_input_stats=True)
-                o = torch.relu(o)
-                return o
-
-        t = MyBatchNorm() if is_batch_norm_else_instance_norm else MyInstanceNorm()
-
-        x = torch.randn(shape, dtype=dtype, device=device).to(memory_format=memory_format)
-        running_mean = torch.zeros(shape[1], dtype=layer_dtype, device=device)
-        running_var = torch.ones(shape[1], dtype=layer_dtype, device=device)
-        t_jit = torch.jit.script(t)
-
-        eager_running_mean = running_mean.clone()
-        eager_running_var = running_var.clone()
-        jit_running_mean = running_mean.clone()
-        jit_running_var = running_var.clone()
-
-        jit_o = t_jit(x, running_mean.clone(), running_var.clone())
-
-        self.assertTrue(self._compare("prerun comparing running_mean failed", eager_running_mean, jit_running_mean, error))
-        self.assertTrue(self._compare("prerun comparing running_var failed", eager_running_var, jit_running_var, error))
-
-        jit_o = t_jit(x, jit_running_mean, jit_running_var)
-        o = t(x, eager_running_mean, eager_running_var)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o.stride(), jit_o.stride())
-        # numerical issues here due to our scheduling.
-        # can't use `self.assertEqual(o, jit_o)`
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
-        self.assertTrue(self._compare("comparing running_mean failed", eager_running_mean, jit_running_mean, error))
-        self.assertTrue(self._compare("comparing running_var failed", eager_running_var, jit_running_var, error))
-        self.assertGraphContains(t_jit.graph_for(x, running_mean, running_var), FUSION_GUARD)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_layer_norm_trivial_reduce_dim(self):
-        def t_wb(shapes: List[int], x, w, b, eps: float, cudnn: bool):
-            o = torch.layer_norm(x, shapes, w, b, eps, cudnn)
-            o = torch.relu(o)
-            return o
-
-        batch = [1]
-        shapes = [2, 7, 3]
-
-        grad = torch.randn(batch + shapes, dtype=torch.float32, device="cuda")
-        args = [torch.randn(batch + shapes, dtype=torch.float32, device="cuda").requires_grad_()]
-        args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_())
-        args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_())
-        self._layer_norm_autodiff_helper(t_wb, grad, shapes, args)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_norm_half_layer(self):
-        size = [2, 4, 2, 2]
-
-        for is_batch_norm_else_instance_norm in [False, True]:
-            for mf in [torch.channels_last, torch.contiguous_format]:
-                self._norm_helper(size, torch.float16, "cuda", 1e-3, is_batch_norm_else_instance_norm,
-                                  memory_format=mf, layer_dtype=torch.float16)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_norm_channels_last(self):
-        size = [3, 4, 5, 6]
-
-        with torch.backends.cudnn.flags(enabled=False):
-            for is_batch_norm_else_instance_norm in [False, True]:
-                for mf in [torch.channels_last, torch.contiguous_format]:
-                    self._norm_helper(size, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm, memory_format=mf)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_norm(self):
-        output_elements = 10000
-        channel_sizes = [67, 457, 1024, 4096]
-
-        with torch.backends.cudnn.flags(enabled=False):
-            for is_batch_norm_else_instance_norm in [False, True]:
-                for dims in range(3, 6):
-                    output_size = int(pow(output_elements, 1. / (dims - 1)))
-                    for C in channel_sizes:
-                        x = [output_size for idx in range(dims)]
-                        x[1] = C
-                        self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm)
-
-    @skipIfRocm
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_norm_large(self):
-        output_elements = 262144
-        channel_sizes = 67, 457, 1024
-
-        for is_batch_norm_else_instance_norm in [True, False]:
-            for dims in range(3, 6):
-                output_size = int(pow(output_elements, 1. / (dims - 1)))
-                for C in channel_sizes:
-                    x = [output_size for idx in range(dims)]
-                    x[1] = C
-                    self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_norm_half(self):
-        output_elements = 10000
-        channel_sizes = [67, 457, 1024, 4096]
-
-        with torch.backends.cudnn.flags(enabled=False):
-            # TODO instance norm on ROCm was giving ~50% incorrect results
-            for is_batch_norm_else_instance_norm in [True] if TEST_WITH_ROCM else [False, True]:
-                for dims in range(3, 6):
-                    output_size = int(pow(output_elements, 1. / (dims - 1)))
-                    for C in channel_sizes:
-                        x = [output_size for idx in range(dims)]
-                        x[1] = C
-                        self._norm_helper(x, torch.float16, "cuda", 5e-3, is_batch_norm_else_instance_norm)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
-    def test_norm_bfloat(self):
-        output_elements = 10000
-        channel_sizes = [67, 457, 1024, 4096]
-
-        with torch.backends.cudnn.flags(enabled=False):
-            # TODO instance norm on ROCm was giving ~50% incorrect results
-            for is_batch_norm_else_instance_norm in [True] if TEST_WITH_ROCM else [False, True]:
-                for dims in range(3, 6):
-                    output_size = int(pow(output_elements, 1. / (dims - 1)))
-                    for C in channel_sizes:
-                        x = [output_size for idx in range(dims)]
-                        x[1] = C
-                        self._norm_helper(x, torch.bfloat16, "cuda", 1e-1, is_batch_norm_else_instance_norm)
-
-    def _softmax_helper(self, shape, reduction_axis, is_log_softmax, dtype, device, error):
-        class MySoftmax(torch.nn.Module):
-            __constants__ = ['reduction_axis']
-
-            def __init__(self):
-                super(MySoftmax, self).__init__()
-                self.reduction_axis = reduction_axis
-
-            def forward(self, x: torch.Tensor, y: torch.Tensor):
-                o = torch.add(x, y)
-                o = torch.nn.functional.softmax(o, dim=self.reduction_axis)
-                return o
-
-        class MyLogSoftmax(torch.nn.Module):
-            __constants__ = ['reduction_axis']
-
-            def __init__(self):
-                super(MyLogSoftmax, self).__init__()
-                self.reduction_axis = reduction_axis
-
-            def forward(self, x: torch.Tensor, y: torch.Tensor):
-                o = torch.add(x, y)
-                o = torch.nn.functional.log_softmax(o, dim=self.reduction_axis)
-                return o
-
-        gradient_check = (dtype == torch.float64)
-        t = MyLogSoftmax() if is_log_softmax else MySoftmax()
-
-        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=gradient_check)
-        y = torch.randn(shape, dtype=dtype, device=device, requires_grad=gradient_check)
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-
-        if gradient_check:
-            gradcheck(t_jit.forward, [x, y], nondet_tol=1e-5)
-        else:
-            o = t(x, y)
-            self.assertEqual(o.dtype, jit_o.dtype)
-            # numerical issues here due to our scheduling.
-            # can't use `self.assertEqual(o, jit_o)`
-            self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
-            self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_softmax_dtype(self):
-        def t(x: torch.Tensor, y: torch.Tensor):
-            o = torch.mul(x, y)
-            o = torch.nn.functional.softmax(o, dim=0, dtype=torch.float32)
-            return o
-
-        x = torch.randn([4, 4], dtype=torch.float16, device="cuda").requires_grad_()
-        y = torch.randn_like(x).requires_grad_()
-        grad = torch.randn_like(x).float()
-
-        ref_x = x.detach().requires_grad_()
-        ref_y = y.detach().requires_grad_()
-        o = t(ref_x, ref_y)
-        o.backward(grad)
-
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y)
-        jit_o.backward(grad)
-        jit_o = t_jit(x, y)
-        jit_o.backward(grad)
-        jit_o = t_jit(x, y)
-        jit_o.backward(grad)
-        x.grad.zero_()
-        y.grad.zero_()
-        jit_o = t_jit(x, y)
-        jit_o.backward(grad)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(ref_x.grad, x.grad)
-        self.assertEqual(ref_y.grad, y.grad)
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-3))
-        self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True)
-        bwd_graph = list(
-            list(t_jit.get_debug_state().execution_plans.values())[
-                0].code.grad_executor_states()[0].execution_plans.values()
-        )[0].graph
-        FileCheck().check(FUSION_GUARD).run(bwd_graph)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test__softmax_function(self):
-        def t(x: torch.Tensor, y: torch.Tensor):
-            o = torch.mul(x, y)
-            o = torch._softmax(o, dim=-1, half_to_float=False)
-            return o
-
-        x = torch.randn([4, 4], dtype=torch.float16, device="cuda")
-        y = torch.randn_like(x)
-
-        o = t(x, y)
-
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-3))
-        self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test__softmax_function_half_to_float(self):
-        def t(x: torch.Tensor, y: torch.Tensor):
-            o = torch.mul(x, y)
-            o = torch._softmax(o, dim=-1, half_to_float=True)
-            return o
-
-        x = torch.randn([4, 4], dtype=torch.float16, device="cuda")
-        y = torch.randn_like(x)
-
-        o = t(x, y)
-
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-3))
-        self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_softmax(self):
-        output_size = 10000
-        dims = 4
-        output_size = int(pow(output_size, 1. / dims))
-        reduction_sizes = [67, 256, 1024, 4096]
-
-        # gradient check
-        for reduction_dim in range(dims):
-            for is_log_softmax in [False, True]:
-                shape = [output_size for idx in range(dims)]
-                self._softmax_helper(shape, reduction_dim, is_log_softmax, torch.float64, "cuda", 1e-4)
-
-        for reduction_dim in range(dims):
-            for reduction_size in reduction_sizes:
-                x = [output_size for idx in range(dims)]
-                x[reduction_dim] = reduction_size
-                for is_log_softmax in [False, True]:
-                    self._softmax_helper(x, reduction_dim, is_log_softmax, torch.float32, "cuda", 1e-4)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_softmax_half(self):
-        output_size = 10000
-        dims = 4
-        output_size = int(pow(output_size, 1. / dims))
-        reduction_sizes = [67, 256, 1024, 4096]
-
-        for reduction_dim in range(dims):
-            for reduction_size in reduction_sizes:
-                x = [output_size for idx in range(dims)]
-                x[reduction_dim] = reduction_size
-                for is_log_softmax in [False, True]:
-                    self._softmax_helper(x, reduction_dim, is_log_softmax, torch.float16, "cuda", 5e-3)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
-    def test_softmax_bfloat(self):
-        output_size = 10000
-        dims = 4
-        output_size = int(pow(output_size, 1. / dims))
-        reduction_sizes = [67, 256, 1024, 4096]
-
-        for reduction_dim in range(dims):
-            for reduction_size in reduction_sizes:
-                x = [output_size for idx in range(dims)]
-                x[reduction_dim] = reduction_size
-                for is_log_softmax in [False, True]:
-                    self._softmax_helper(x, reduction_dim, is_log_softmax, torch.bfloat16, "cuda", 1e-1)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_reduction_permutation(self):
-        x = [7, 8, 12]
-        # note that num_dim is exclusive from len(x), so we are not reducing
-        # to single element (codegen limitation at this moment)
-        for num_reduce_dim in range(1, len(x)):
-            for axes in itertools.combinations(range(len(x)), num_reduce_dim):
-                for perm0 in itertools.permutations(range(len(x))):
-                    for perm1 in itertools.permutations(range(len(x))):
-                        self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_reduction_multiple_output(self):
-        old_guard = torch._C._jit_set_nvfuser_guard_mode(True)
-        torch._C._jit_set_bailout_depth(20)
-
-        def t(x: torch.Tensor, y: torch.Tensor, scale: float, z: torch.Tensor):
-            o = torch.mul(x, y)
-            o = torch.mul(o, scale)
-            out1 = torch.mul(o, z)
-            out2 = torch.sum(out1, dim=[2])
-            return out1, out2
-
-        t_jit = torch.jit.script(t)
-        x = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda")
-        y = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda")
-        z = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda")
-        scale = 0.5
-        jit_o = t_jit(x, y, scale, z)
-        jit_o = t_jit(x, y, scale, z)
-        o = t(x, y, scale, z)
-        for oo, jit_oo in zip(o, jit_o):
-            self.assertEqual(oo.dtype, jit_oo.dtype)
-            self.assertEqual(oo, jit_oo)
-        self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GUARD)
-
-        x = x.to(memory_format=torch.channels_last)
-        y = y.to(memory_format=torch.channels_last)
-        z = z.to(memory_format=torch.channels_last)
-        jit_o = t_jit(x, y, scale, z)
-        jit_o = t_jit(x, y, scale, z)
-        o = t(x, y, scale, z)
-        for oo, jit_oo in zip(o, jit_o):
-            self.assertEqual(oo.dtype, jit_oo.dtype)
-            self.assertEqual(oo, jit_oo)
-        self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GUARD)
-        torch._C._jit_set_nvfuser_guard_mode(old_guard)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_channels_last_with_broadcast(self):
-        # setting this true forces a new graph to be generated with a new
-        # input a different broadcast shape
-        torch._C._jit_set_nvfuser_guard_mode(True)
-
-        def t(x: torch.Tensor, y: torch.Tensor):
-            o = torch.mul(x, y)
-            o = o + 2.0
-            return o
-        t_jit = torch.jit.script(t)
-
-        # Single Channel broadcasts
-        # Test 1
-        x = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda")
-        x = x.to(memory_format=torch.channels_last)
-
-        y = torch.randn(8, 4, 10, 1, dtype=torch.float, device="cuda")
-        y = y.to(memory_format=torch.channels_last)
-
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        o = t(x, y)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o.is_contiguous(memory_format=torch.channels_last),
-                         jit_o.is_contiguous(memory_format=torch.channels_last))
-        self.assertEqual(o, jit_o)
-
-        # Test 2
-        y = torch.randn(8, 4, 1, 16, dtype=torch.float, device="cuda")
-        y = y.to(memory_format=torch.channels_last)
-
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        o = t(x, y)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o.is_contiguous(memory_format=torch.channels_last),
-                         jit_o.is_contiguous(memory_format=torch.channels_last))
-        self.assertEqual(o, jit_o)
-
-        # Test 3
-        y = torch.randn(8, 1, 10, 16, dtype=torch.float, device="cuda")
-        y = y.to(memory_format=torch.channels_last)
-
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        o = t(x, y)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o.is_contiguous(memory_format=torch.channels_last),
-                         jit_o.is_contiguous(memory_format=torch.channels_last))
-        self.assertEqual(o, jit_o)
-
-        # Test 3
-        y = torch.randn(1, 4, 10, 16, dtype=torch.float, device="cuda")
-        y = y.to(memory_format=torch.channels_last)
-
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        o = t(x, y)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o.is_contiguous(memory_format=torch.channels_last),
-                         jit_o.is_contiguous(memory_format=torch.channels_last))
-        self.assertEqual(o, jit_o)
-
-        '''
-        Currently, the JIT doesn't have tensor merge logic to handle adding
-        a broadcast tensor with more than one broadcast into a non-broadcast
-        tensor.  Therefore, either of these tests can fail depending on the
-        sort implementation.  The second test is known to fail.
-
-        # Two Channel broadcasts
-        # Test 1
-        y = torch.randn(8, 4, 1, 1, dtype=torch.float, device="cuda")
-        y = y.to(memory_format=torch.channels_last)
-
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        o = t(x, y)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o.is_contiguous(memory_format=torch.channels_last),
-                         jit_o.is_contiguous(memory_format=torch.channels_last))
-        self.assertEqual(o, jit_o)
-
-        # Test 2
-        y = torch.randn(8, 4, 1, 1, dtype=torch.float, device="cuda")
-        y = y.to(memory_format=torch.channels_last).transpose(2,3)
-        x = x.transpose(2,3)
-
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        o = t(x, y)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o.is_contiguous(memory_format=torch.channels_last),
-                         jit_o.is_contiguous(memory_format=torch.channels_last))
-        self.assertEqual(o, jit_o)
-        '''
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_pw_single_reduction_partition(self):
-        sizes = [2, 2, 2]
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn(sizes, dtype=dtype, device=device)
-        y = torch.randn(sizes, dtype=dtype, device=device)
-        z = torch.randn(sizes, dtype=dtype, device=device)
-
-        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-            o = torch.add(x, y)
-            o = torch.sum(o, dim=[0])
-            o = torch.add(o, z)
-            return o
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y, z)
-        jit_o = t_jit(x, y, z)
-        o = t(x, y, z)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o, jit_o)
-        self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_permutation_preservation(self):
-        sizes = [2, 3, 4, 5]
-        dtype = torch.float
-        device = "cuda"
-
-        with nvfuser_singleton_fusion(True):
-
-            def t(x: torch.Tensor):
-                return torch.relu(x)
-
-            t_jit = torch.jit.script(t)
-            x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
-            self._run_helper(t_jit, t, x, check_stride=True)
-
-            def t(x: torch.Tensor, y: torch.Tensor):
-                return torch.add(x, y)
-
-            t_jit = torch.jit.script(t)
-            x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
-            y = torch.randn(sizes[1:], dtype=dtype, device=device)
-            self._run_helper(t_jit, t, x, y, check_stride=True)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_permutation_preservation_edge_case_0(self):
-        sizes = [2, 3, 4, 5]
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
-        # mismatch rank with *note* different permutation recognized by PE
-        bias = torch.randn(3, dtype=dtype, device=device).unsqueeze(-1).unsqueeze(-1)
-
-        def t(x, y):
-            return x + y
-
-        t_jit = torch.jit.script(t)
-        with nvfuser_singleton_fusion(True):
-            self._run_helper(t_jit, t, x, bias, check_stride=True)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_permutation_preservation_edge_case_1_broken(self):
-        sizes = [2, 3, 4, 5]
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
-        # in-compatible permutation, this will cause format propagation to break
-        bias = torch.randn(4, 5, dtype=dtype, device=device)
-
-        def t(x, y):
-            return x + y
-
-        t_jit = torch.jit.script(t)
-        with nvfuser_singleton_fusion(True):
-            for _ in range(5):
-                jit_o = t_jit(x, bias)
-
-        o = t(x, bias)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o, jit_o)
-        try:
-            # nvfuser does not support in-compatible permutation, this will throw
-            self.assertEqual(o.stride(), jit_o.stride())
-        except Exception as e:
-            warnings.warn(
-                "permutation propagation is broken, proper support should come after nvfuser permutation scheduler update")
-        self.assertGraphContains(t_jit.graph_for(x, bias), FUSION_GUARD)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_permutation_preservation_edge_case_2(self):
-        sizes = [2, 3, 4, 5]
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
-        y = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
-        z = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
-
-        def t(x, y, w):
-            tmp = torch.lerp(x, y, w)
-            tmp = torch.clamp(tmp, -1.0, 0.5)
-            tmp = torch.nn.functional.softplus(tmp)
-            return torch.threshold(tmp, -2.0, 0.5)
-
-        t_jit = torch.jit.script(t)
-        self._run_helper(t_jit, t, x, y, z, check_stride=True)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_normalization_partition(self):
-        sizes = [3, 8, 5]
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn(sizes, dtype=dtype, device=device)
-        y = torch.randn(sizes, dtype=dtype, device=device)
-        z = torch.randn(sizes, dtype=dtype, device=device)
-        r_m = torch.randn(8, dtype=dtype, device=device)
-        r_v = torch.randn(8, dtype=dtype, device=device)
-
-        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor):
-            o = torch.add(x, y)
-            o = torch.nn.functional.softmax(o, dim=0)
-            o = torch.add(o, z)
-            o = torch.nn.functional.batch_norm(o, r_mean, r_var, training=True)
-            return o
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y, z, r_m, r_v)
-        jit_o = t_jit(x, y, z, r_m, r_v)
-        o = t(x, y, z, r_m, r_v)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o, jit_o)
-        self.assertGraphContains(t_jit.graph_for(x, y, z, r_m, r_v), FUSION_GUARD)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_sum_to_one(self):
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn([4, 5, 6], dtype=dtype, device=device)
-
-        def t(x: torch.Tensor):
-            o = torch.add(x, 1)
-            o = torch.sum(o, dim=[0, 1, 2])
-            return o
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x)
-        jit_o = t_jit(x)
-        o = t(x)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o, jit_o)
-        self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_single_reduction_broadcast(self):
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn([7, 4, 8], dtype=dtype, device=device)
-        y = torch.randn([4, 8], dtype=dtype, device=device)
-        z = torch.randn([1, 4, 8], dtype=dtype, device=device)
-
-        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
-            o = torch.add(x, y)
-            o = torch.add(o, z)
-            o = torch.sum(o, dim=[0])
-            return o
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y, z)
-        jit_o = t_jit(x, y, z)
-        o = t(x, y, z)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o, jit_o)
-        self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_trivial_reduction(self):
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn([1, 4, 8], dtype=dtype, device=device)
-
-        def t(x: torch.Tensor):
-            o = torch.add(x, 1)
-            o = torch.sum(o, dim=[0])
-            o = torch.sum(o, dim=[0])
-            return o
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x)
-        jit_o = t_jit(x)
-        o = t(x)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o, jit_o)
-        self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
-
-    @unittest.skip("Skipped due to rand_like behavior change")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_profiling_node(self):
-        # TODO: should we change this test to not use rand_like, or just
-        # remove this test?
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn(4, 8, 8, 8, dtype=dtype, device=device)
-
-        def repro(x: torch.Tensor, alpha: float):
-            o = torch.rand_like(x)
-            o = torch.add(o, alpha)
-            return o
-        repro_jit = torch.jit.script(repro)
-        self._run_helper(repro_jit, repro, x, 0.6)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_reduction_sizes_op(self):
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn(2, 3, 4, 5, dtype=dtype, device=device)
-        y = torch.randn(2, 3, 4, 5, dtype=dtype, device=device)
-
-        def t(x: torch.Tensor, y: torch.Tensor):
-            o = x + y
-            o = torch.relu(o)
-            o = o.sum((1, 3))
-            return o.size()
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y)
-        jit_o = t_jit(x, y)
-        o = t(x, y)
-        self.assertEqual(o, jit_o)
-        # since the output value is not used at all, the fusion operator should
-        # have been optimized away
-        self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 0)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_profile_ivalue(self):
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn([7, 4, 7], dtype=dtype, device=device)
-        y = torch.randn([7, 4, 7], dtype=dtype, device=device)
-
-        def t(x: torch.Tensor, y: torch.Tensor, dim: List[int], keepdim: bool):
-            o = torch.add(x, y)
-            o = o.sum(dim, keepdim=keepdim)
-            return o
-
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y, (0, 1), False)
-        jit_o = t_jit(x, y, (0, 1), False)
-        o = t(x, y, (0, 1), False)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o, jit_o)
-        self.assertGraphContains(t_jit.graph_for(x, y, (0, 1), False), FUSION_GUARD)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_profile_ivalue_multiple_profiles(self):
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn([7, 4, 7], dtype=dtype, device=device)
-
-        def t(x, num: int):
-            for i in range(num):
-                # varying reduction axes should break profile_ivalue
-                tmp = x.sum(i, keepdim=True)
-                # inplace add on input/output, can't be functionalized/fused
-                x += tmp
-            return x
-
-        with nvfuser_singleton_fusion(True):
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x, 3, num_fusion=0)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_sum_to_size(self):
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn([2, 4, 4], dtype=dtype, device=device)
-        y = torch.randn([2, 4, 4], dtype=dtype, device=device)
-
-        def t(x: torch.Tensor, y: torch.Tensor, new_size: List[int]):
-            o = torch.add(x, y)
-            o = o.sum_to_size(new_size)
-            return o
-
-        t_jit = torch.jit.script(t)
-        self._run_helper(t_jit, t, x, y, (4, 1))
-
-        # update shape: old kernel should handle dynamic shape well without
-        # recompilation
-        x = torch.randn([2, 5, 8], dtype=dtype, device=device)
-        y = torch.randn([2, 5, 8], dtype=dtype, device=device)
-        # (TODO) check executed kernel, should extend autograd.profiler to fused
-        # kernels
-        self._run_helper(t_jit, t, x, y, (5, 1))
-
-        with nvfuser_singleton_fusion(True):
-            x = torch.randn([2, 5, 8], dtype=dtype, device=device)
-
-            def t(x: torch.Tensor):
-                # no-op reduction
-                return x.sum_to_size((2, 5, 8))
-
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_grad_sum_to_size(self):
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn([2, 4, 4], dtype=dtype, device=device).requires_grad_()
-        y = torch.randn([4], dtype=dtype, device=device).requires_grad_()
-        grad = torch.randn([2, 4, 4], dtype=dtype, device=device)
-
-        ref_x = x.detach().clone().requires_grad_()
-        ref_y = y.detach().clone().requires_grad_()
-
-        def t(x: torch.Tensor, y: torch.Tensor):
-            o = torch.add(x, y)
-            o = torch.relu(o)
-            return o
-
-        # profiling runs for forward & backward
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, y)
-        jit_o.backward(grad)
-        jit_o = t_jit(x, y)
-        jit_o.backward(grad)
-
-        x.grad = None
-        y.grad = None
-        jit_o = t_jit(x, y)
-        jit_o.backward(grad)
-        o = t(ref_x, ref_y)
-        o.backward(grad)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o, jit_o)
-        self.assertEqual(x.grad, ref_x.grad)
-        self.assertEqual(y.grad, ref_y.grad)
-        bwd_graph = list(
-            list(t_jit.get_debug_state().execution_plans.values())[
-                0].code.grad_executor_states()[0].execution_plans.values()
-        )[0].graph
-        FileCheck().check(FUSION_GUARD).run(bwd_graph)
-
-        # update shape: old kernel should handle dynamic shape well without
-        # recompilation
-        x = torch.randn([2, 5, 8], dtype=dtype, device=device).requires_grad_()
-        y = torch.randn([8], dtype=dtype, device=device).requires_grad_()
-        ref_x = x.detach().clone().requires_grad_()
-        ref_y = y.detach().clone().requires_grad_()
-        grad = torch.randn([2, 5, 8], dtype=dtype, device=device)
-        jit_o = t_jit(x, y)
-        # (TODO) check executed kernel, should extend autograd.profiler to fused
-        # kernels
-        jit_o.backward(grad)
-        o = t(ref_x, ref_y)
-        o.backward(grad)
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertEqual(o, jit_o)
-        self.assertEqual(x.grad, ref_x.grad)
-        self.assertEqual(y.grad, ref_y.grad)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_dropout_inference_fusion(self):
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn([10, 4, 8], dtype=dtype, device=device)
-
-        def t(x: torch.Tensor, p: float, train: bool):
-            o = torch.nn.functional.dropout(x, p, training=train)
-            o = o + 1.0
-            return o
-
-        t_jit = torch.jit.script(t)
-
-        self._run_helper(t_jit, t, x, 0.15, False)
-
-    @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_dropout_train_nograd_fusion(self):
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn([64, 128, 1024], dtype=dtype, device=device)
-
-        def t(x: torch.Tensor, p: float, train: bool):
-            o = torch.nn.functional.dropout(x, p, training=train)
-            o = o + 1.0
-            return o
-
-        t_jit = torch.jit.script(t)
-
-        self._run_helper(t_jit, t, x, 0.0, True, check_runs=20)
-        self._run_helper(t_jit, t, x, 1.0, True, check_runs=20)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_dropout_train_nograd_prob_check(self):
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn([1024, 1024], dtype=dtype, device=device)
-
-        def t(x: torch.Tensor, p: float, train: bool):
-            o = torch.nn.functional.dropout(x, p, training=train)
-            o = o * 2.0
-            return o
-
-        t_jit = torch.jit.script(t)
-
-        for prob in [0.0, 0.15, 0.5, 0.85, 1.]:
-            torch.cuda.manual_seed_all(123)
-            jit_o = t_jit(x, prob, True)
-            torch.cuda.manual_seed_all(123)
-            jit_o = t_jit(x, prob, True)
-
-            self.assertTrue(jit_o.detach().isfinite().all().item())
-
-            num_elems = x.numel()
-            num_zeros = num_elems - jit_o.detach().count_nonzero().item()
-            percent_zeros = num_zeros / num_elems
-
-            self.assertTrue((percent_zeros >= (prob - 0.01)) and (percent_zeros <= (prob + 0.01)))
-            self.assertGraphContainsExactly(t_jit.graph_for(x, prob, True), FUSION_GUARD, 1, consider_subgraphs=True)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_dropout_training_fusion(self):
-        dtype = torch.float
-        device = "cuda"
-        sizes = [2, 3, 4, 5]
-
-        def t(x: torch.Tensor, p: float, train: bool):
-            o = torch.nn.functional.dropout(x, p, training=train)
-            o = o * 2.0
-            return o
-
-        def t2(x: torch.Tensor, p: float, train: bool):
-            o = torch.nn.functional.softmax(x, dim=-1)
-            o = torch.nn.functional.dropout(o, p, training=train)
-            return o
-
-        # disabling cache so new inputs would generate new graph
-        t.__disable_jit_function_caching__ = True
-        t2.__disable_jit_function_caching__ = True
-
-        for fn in [t, t2]:
-            for m_format in [torch.contiguous_format, torch.channels_last]:
-                fn_jit = torch.jit.script(fn)
-                x = torch.randn(sizes, dtype=dtype, device=device, requires_grad=True).to(memory_format=m_format)
-                grads = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=m_format)
-
-                # The drop probability needs to be set to zero given that the order of picking random
-                # numbers between eager mode and the jit is different
-                self._run_training_helper(fn_jit, fn, grads, x, 0.0, True)
-
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_gelu(self):
-        old_guard = torch._C._jit_set_nvfuser_guard_mode(True)
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn([1024, 1024], dtype=dtype, device=device, requires_grad=True)
-        grads = torch.randn([1024, 1024], dtype=dtype, device=device, requires_grad=False)
-
-        def t(x: torch.Tensor, mode: str):
-            o = torch.nn.functional.gelu(x, approximate=mode)
-            o = o * 2.0
-            return o
-
-        t_jit = torch.jit.script(t)
-        self._run_training_helper(t_jit, t, grads, x, 'none')
-        self._run_training_helper(t_jit, t, grads, x, 'tanh')
-        torch._C._jit_set_nvfuser_guard_mode(old_guard)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_dropout_training_prob_check(self):
-        dtype = torch.float
-        device = "cuda"
-        x = torch.randn([1024, 1024], dtype=dtype, device=device, requires_grad=True)
-        x_nograd = torch.randn([1024, 1024], dtype=dtype, device=device)
-
-        def t(x: torch.Tensor, p: float, train: bool):
-            o = torch.nn.functional.dropout(x, p, training=train)
-            o = o * 2.0
-            return o
-
-        t_jit = torch.jit.script(t)
-
-        for prob in [0.0, 0.15, 0.5, 0.85, 1.]:
-            torch.cuda.manual_seed_all(123)
-            jit_o = t_jit(x, prob, True)
-            torch.cuda.manual_seed_all(123)
-            jit_o = t_jit(x, prob, True)
-            torch.cuda.manual_seed_all(123)
-            jit_o = t_jit(x, prob, True)
-
-            self.assertTrue(jit_o.detach().isfinite().all().item())
-
-            num_elems = x.numel()
-            num_zeros = num_elems - jit_o.detach().count_nonzero().item()
-            percent_zeros = num_zeros / num_elems
-
-            self.assertTrue((percent_zeros >= (prob - 0.01)) and (percent_zeros <= (prob + 0.01)))
-            self.assertGraphContainsExactly(t_jit.graph_for(x, prob, True), FUSION_GUARD, 1, consider_subgraphs=True)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_linear(self):
-        in_feature = 2
-        out_feature = 8
-        # Changing the input dims to be 3-D to avoid eager mode bias fusion
-        # The bias fusion causes some precision issues with TF-32
-        weight = torch.randn(out_feature, in_feature, dtype=torch.float32, device='cuda')
-        bias = torch.randn(out_feature, dtype=torch.float32, device='cuda')
-
-        def t(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor):
-            o = torch.nn.functional.linear(x, weight, bias)
-            o = torch.relu(o)
-            return o
-
-        # disabling cache so new inputs would generate new graph
-        t.__disable_jit_function_caching__ = True
-
-        sizes = [in_feature, ]
-        for i in range(4):
-            # increase input rank in each iteration
-            sizes.insert(0, i + 2)
-            x = torch.randn(*sizes, dtype=torch.float32, device='cuda')
-            t_jit = torch.jit.script(t)
-            # fusion only happens for input rank >= 4
-            has_fusion = 0 if len(sizes) < 4 else 1
-            self._run_helper(t_jit, t, x, weight, bias, check_stride=True, num_fusion=has_fusion)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_linear_symbolic_shapes(self):
-        def fn(x: int):
-            y = torch.zeros((3, 4, x, x + 2)).cuda()
-            for i in range(2):
-                inp = torch.rand((3, 4, x, x + i)).cuda()
-                weight = torch.rand((x + 2, x + i)).cuda()
-                bias = torch.rand((x, x + 2)).cuda()
-                y += torch.sin(torch.nn.functional.linear(inp, weight, bias))
-            return y
-
-        fn_s = torch.jit.script(fn)
-        fn_s(5)
-        fn_s(5)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_conv2d_symbolic_shapes(self):
-        def fn(x: int):
-            responses = []
-            for i in range(2):
-                inp = torch.rand((3, 3, 32, 32)).cuda()
-                weight = torch.rand((x + i, 3, 7, 7)).cuda()
-                bias = torch.rand((x + i)).cuda()
-                res = torch.nn.functional.conv2d(inp, weight, bias, padding=3)
-                responses.append(res)
-            return responses
-
-        fn_s = torch.jit.script(fn)
-        fn_s(5)
-        fn_s(5)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_backward_type(self):
-        # not super useful to check gradient of integer/bool, so skipping here
-        type_pairs = [
-            (torch.float, torch.half),
-            (torch.double, torch.half),
-            (torch.float, torch.double),
-        ]
-        if TEST_BF16:
-            type_pairs += [
-                (torch.float, torch.bfloat16),
-                (torch.double, torch.bfloat16),
-            ]
-        for x_type, y_type in type_pairs:
-            x = torch.randn(4, 2, dtype=x_type, device='cuda', requires_grad=True)
-            y = torch.randn(4, 2, dtype=y_type, device='cuda', requires_grad=True)
-            grad = torch.randn(4, 2, dtype=torch.float, device='cuda')
-
-            def test1(x: torch.Tensor, y: torch.Tensor):
-                o = torch.add(x, y)
-                o = torch.add(o, y)
-                o = torch.add(o, y)
-                o = torch.add(o, y)
-                o = o + 1.0
-                return o
-
-            test1_jit = torch.jit.script(test1)
-            for i in range(3):
-                jit_o = test1_jit(x, y)
-                jit_o.backward(grad)
-
-            bwd_graph = list(
-                list(test1_jit.get_debug_state().execution_plans.values())[
-                    0].code.grad_executor_states()[0].execution_plans.values()
-            )[0].graph
-
-            FileCheck().check(FUSION_GROUP).run(bwd_graph)
-            self.assertEqual(x.grad.dtype, x.dtype)
-            self.assertEqual(y.grad.dtype, y.dtype)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_autocast_1(self):
-        def t(x: torch.Tensor, y: torch.Tensor):
-            o = x * 2.0
-            o = torch.softmax(o, dim=-1)
-            o = o * 3.0
-            o = torch._C._nn.linear(o, y)
-            return o
-
-        x = torch.randn(8, 4, dtype=torch.half, device='cuda', requires_grad=True)
-        y = torch.randn(4, 4, dtype=torch.float, device='cuda', requires_grad=True)
-        grad = torch.randn(8, 4, dtype=torch.half, device='cuda', requires_grad=False)
-        t_jit = torch.jit.script(t)
-
-        for i in range(3):
-            with torch.cuda.amp.autocast():
-                jit_o = t_jit(x, y)
-                if i == 2:
-                    fwd_graph = t_jit.graph_for(x, y)
-            jit_o.backward(grad)
-
-        self.assertGraphContainsExactly(fwd_graph, FUSION_GUARD, 1, consider_subgraphs=True)
-
-        with torch.cuda.amp.autocast():
-            bwd_graph = list(
-                list(t_jit.get_debug_state().execution_plans.values())[
-                    0].code.grad_executor_states()[0].execution_plans.values()
-            )[0].graph
-        FileCheck().check(FUSION_GROUP).run(bwd_graph)
-
-        self.assertEqual(jit_o.dtype, torch.half)
-        self.assertEqual(x.grad.dtype, x.dtype)
-        self.assertEqual(y.grad.dtype, y.dtype)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_autocast_2(self):
-        def t(x: torch.Tensor):
-            o = x * 2.0
-            o = torch.softmax(o, dim=-1)
-            o = o * 3.0
-            o = torch.softmax(o, dim=-1)
-            o = o * 4.0
-            return o
-
-        x = torch.randn(8, 4, dtype=torch.half, device='cuda', requires_grad=True)
-        grad = torch.randn(8, 4, dtype=torch.float, device='cuda', requires_grad=False)
-        t_jit = torch.jit.script(t)
-
-        for i in range(3):
-            with torch.cuda.amp.autocast():
-                jit_o = t_jit(x)
-                if i == 2:
-                    fwd_graph = t_jit.graph_for(x)
-            jit_o.backward(grad)
-
-        self.assertGraphContainsExactly(fwd_graph, FUSION_GUARD, 1, consider_subgraphs=True)
-
-        with torch.cuda.amp.autocast():
-            bwd_graph = list(
-                list(t_jit.get_debug_state().execution_plans.values())[
-                    0].code.grad_executor_states()[0].execution_plans.values()
-            )[0].graph
-        FileCheck().check(FUSION_GROUP).run(bwd_graph)
-
-        self.assertEqual(jit_o.dtype, torch.float)
-        self.assertEqual(x.grad.dtype, x.dtype)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
-    def test_autocast_1_bfloat(self):
-        def t(x: torch.Tensor, y: torch.Tensor):
-            o = x * 2.0
-            o = torch.softmax(o, dim=-1)
-            o = o * 3.0
-            o = torch._C._nn.linear(o, y)
-            return o
-
-        x = torch.randn(8, 4, dtype=torch.bfloat16, device='cuda', requires_grad=True)
-        y = torch.randn(4, 4, dtype=torch.float, device='cuda', requires_grad=True)
-        grad = torch.randn(8, 4, dtype=torch.bfloat16, device='cuda', requires_grad=False)
-        t_jit = torch.jit.script(t)
-
-        for i in range(3):
-            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-                jit_o = t_jit(x, y)
-                if i == 2:
-                    fwd_graph = t_jit.graph_for(x, y)
-            jit_o.backward(grad)
-
-        self.assertGraphContainsExactly(fwd_graph, FUSION_GUARD, 1, consider_subgraphs=True)
-
-        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-            bwd_graph = list(
-                list(t_jit.get_debug_state().execution_plans.values())[
-                    0].code.grad_executor_states()[0].execution_plans.values()
-            )[0].graph
-        FileCheck().check(FUSION_GROUP).run(bwd_graph)
-
-        self.assertEqual(jit_o.dtype, torch.bfloat16)
-        self.assertEqual(x.grad.dtype, x.dtype)
-        self.assertEqual(y.grad.dtype, y.dtype)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
-    def test_autocast_2_bfloat(self):
-        def t(x: torch.Tensor):
-            o = x * 2.0
-            o = torch.softmax(o, dim=-1)
-            o = o * 3.0
-            o = torch.softmax(o, dim=-1)
-            o = o * 4.0
-            return o
-
-        x = torch.randn(8, 4, dtype=torch.bfloat16, device='cuda', requires_grad=True)
-        grad = torch.randn(8, 4, dtype=torch.float, device='cuda', requires_grad=False)
-        t_jit = torch.jit.script(t)
-
-        for i in range(3):
-            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-                jit_o = t_jit(x)
-                if i == 2:
-                    fwd_graph = t_jit.graph_for(x)
-            jit_o.backward(grad)
-
-        self.assertGraphContainsExactly(fwd_graph, FUSION_GUARD, 1, consider_subgraphs=True)
-
-        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-            bwd_graph = list(
-                list(t_jit.get_debug_state().execution_plans.values())[
-                    0].code.grad_executor_states()[0].execution_plans.values()
-            )[0].graph
-        FileCheck().check(FUSION_GROUP).run(bwd_graph)
-
-        self.assertEqual(jit_o.dtype, torch.float)
-        self.assertEqual(x.grad.dtype, x.dtype)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_to_dtype_fp32_to_fp16(self):
-        def t(x: torch.Tensor):
-            o = x * 2.0
-            o = o.to(dtype=torch.half)
-            o = o * 3.0
-            return o
-
-        x = torch.randn(8, 4, dtype=torch.float, device='cuda')
-        t_jit = torch.jit.script(t)
-
-        for i in range(3):
-            jit_o = t_jit(x)
-
-        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
-        self.assertEqual(jit_o.dtype, torch.half)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_to_dtype_fp16_to_fp32(self):
-        def t(x: torch.Tensor):
-            o = x * 2.0
-            o = o.to(dtype=torch.float)
-            o = o * 3.0
-            return o
-
-        x = torch.randn(8, 4, dtype=torch.half, device='cuda')
-        t_jit = torch.jit.script(t)
-
-        for i in range(3):
-            jit_o = t_jit(x)
-
-        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
-        self.assertEqual(jit_o.dtype, torch.float)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_to_dtype_fp16_to_fp16(self):
-        def t(x: torch.Tensor):
-            o = x * 2.0
-            o = o.to(dtype=torch.half)
-            o = o * 3.0
-            return o
-
-        x = torch.randn(8, 4, dtype=torch.half, device='cuda')
-        t_jit = torch.jit.script(t)
-
-        for i in range(3):
-            jit_o = t_jit(x)
-
-        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
-        self.assertEqual(jit_o.dtype, torch.half)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
-    def test_to_dtype_fp32_to_bf16(self):
-        def t(x: torch.Tensor):
-            o = x * 2.0
-            o = o.to(dtype=torch.bfloat16)
-            o = o * 3.0
-            return o
-
-        x = torch.randn(8, 4, dtype=torch.float, device='cuda')
-        t_jit = torch.jit.script(t)
-
-        for i in range(3):
-            jit_o = t_jit(x)
-
-        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
-        self.assertEqual(jit_o.dtype, torch.bfloat16)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
-    def test_to_dtype_bf16_to_fp32(self):
-        def t(x: torch.Tensor):
-            o = x * 2.0
-            o = o.to(dtype=torch.float)
-            o = o * 3.0
-            return o
-
-        x = torch.randn(8, 4, dtype=torch.bfloat16, device='cuda')
-        t_jit = torch.jit.script(t)
-
-        for i in range(3):
-            jit_o = t_jit(x)
-
-        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
-        self.assertEqual(jit_o.dtype, torch.float)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
-    def test_to_dtype_bf16_to_bf16(self):
-        def t(x: torch.Tensor):
-            o = x * 2.0
-            o = o.to(dtype=torch.bfloat16)
-            o = o * 3.0
-            return o
-
-        x = torch.randn(8, 4, dtype=torch.bfloat16, device='cuda')
-        t_jit = torch.jit.script(t)
-
-        for i in range(3):
-            jit_o = t_jit(x)
-
-        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
-        self.assertEqual(jit_o.dtype, torch.bfloat16)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(not TEST_MULTIGPU, "requires multiple CUDA device")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_multiple_device_pw(self):
-
-        def t(x):
-            o = x + 1.0
-            o = torch.relu(o)
-            return o
-
-        x = torch.randn(2, dtype=torch.float32, device="cuda")
-        t_jit = torch.jit.script(t)
-
-        for i in range(3):
-            jit_o = t_jit(x)
-
-        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
-        torch.cuda.device(1)
-        x = x.to("cuda:1")
-        jit_o = t_jit(x)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_graph_for_with_missing_optimized_engine(self):
-        x = torch.randn(8, 4, 2, dtype=torch.float, device="cuda").requires_grad_()
-
-        def t(x: torch.Tensor, flag: bool):
-            x = x + 1.0
-            x = torch.relu(x)
-            if flag:
-                o = x + 1.0
-                o = torch.relu(o)
-            else:
-                o = x + 2.0
-                o = torch.relu(o)
-            return o
-
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, False)
-        jit_o = t_jit(x, False)
-        jit_o = t_jit(x, True)
-        o = t(x, True)
-        self.assertEqual(o, jit_o)
-        # since the output value is not used at all, the fusion operator should
-        # have been optimized away
-        self.assertGraphContainsExactly(t_jit.graph_for(x, True), FUSION_GUARD, 1, True)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_branches(self):
-        in_feature = 2
-        out_feature = 4
-        x = torch.randn(4, in_feature, dtype=torch.float32, device='cuda')
-        weight = torch.randn(out_feature, in_feature, dtype=torch.float32, device='cuda')
-        bias = torch.randn(out_feature, dtype=torch.float32, device='cuda')
-
-        def t(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, flag: bool):
-            if flag:
-                o = torch.nn.functional.linear(x, weight, bias)
-                o = o + 1.0
-                o = torch.relu(o)
-            else:
-                o = x.sum()
-                o = o + 2.0
-                o = torch.relu(o)
-            return o
-
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x, weight, bias, True)
-        jit_o = t_jit(x, weight, bias, True)
-        o = t(x, weight, bias, True)
-        self.assertEqual(o, jit_o)
-        # since the output value is not used at all, the fusion operator should
-        # have been optimized away
-        self.assertGraphContainsExactly(t_jit.graph_for(x, weight, bias, True), FUSION_GUARD, 1)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_scalar_tensor(self):
-        x = torch.empty([], device="cuda", dtype=torch.float32)
-
-        def t(x: torch.Tensor):
-            o = x + 1.0
-            o = torch.nn.functional.relu(o)
-            return o
-
-        # bias set to true.
-        t_jit = torch.jit.script(t)
-        jit_o = t_jit(x)
-        jit_o = t_jit(x)
-        o = t(x)
-        self.assertEqual(o, jit_o)
-        # since the output value is not used at all, the fusion operator should
-        # have been optimized away
-        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
-
-    @unittest.skipIf(os.environ.get('PYTORCH_NO_CUDA_MEMORY_CACHING') is not None,
-                     "skipping graph_rng when caching allocator is disabled")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(CUDA_MAJOR < 11, "requires CUDA11 or above")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_graph_rng(self):
-        self.assertTrue(torch._C._jit_nvfuser_enabled())
-        size = 10000
-        a = torch.randn((size,), device="cuda", dtype=torch.float)
-
-        def t(x):
-            o = x + 1.0
-            o = torch.nn.functional.dropout(o, p=0.1)
-            o = o + 1.0
-            o = torch.nn.functional.dropout(o, p=0.1)
-            return o
-
-        t_jit = torch.jit.script(t)
-
-        for _ in range(3):
-            t_jit(a)
-
-        self.assertGraphContainsExactly(t_jit.graph_for(a), FUSION_GUARD, 1)
-
-        # Control (jitted, ungraphed)
-        torch.cuda.manual_seed(5)
-        eager_out = a.clone()
-        for _ in range(3):
-            eager_out = t_jit(eager_out)
-
-        graph_in = a.clone()
-        g = torch.cuda.CUDAGraph()
-        s = torch.cuda.Stream()
-        s.wait_stream(torch.cuda.current_stream())
-        with torch.cuda.stream(s):
-            torch.cuda.manual_seed(5)
-            g.capture_begin()
-            graph_out = t_jit(graph_in)
-            g.capture_end()
-        torch.cuda.current_stream().wait_stream(s)
-        # g is now a jitted, graphed version of t.
-
-        # Runs a (jitted, graphed) -> (jitted, ungraphed) -> (jitted, graphed) sequence.
-        # The ops in the overall sequence should be the same as Control.
-        g.replay()
-        # graph_out is now filled with g's result. Use it as ungraphed input.
-        out = t_jit(graph_out)
-        graph_in.copy_(out)
-        g.replay()
-
-        # If replay() updated RNG state correctly, graph_out should now equal eager_out
-        self.assertEqual(graph_out, eager_out)
-
-    def _test_batch_norm_impl_index_helper(self, batch, c, hw, affine=True,
-                                           track_running_stats=True, train=True,
-                                           dtype=torch.float32):
-        # enabling inlining to avoid counter increment in BN forward
-        torch._C._debug_set_autodiff_subgraph_inlining(True)
-
-        class MyModule(torch.nn.Module):
-            def __init__(self, num_features=10, affine=True, track_running_stats=True):
-                super(MyModule, self).__init__()
-                self.bn = torch.nn.BatchNorm2d(num_features,
-                                               1e-5,
-                                               affine=affine,
-                                               track_running_stats=track_running_stats).to(dtype=dtype)
-
-            def forward(self, x):
-                o = self.bn(x)
-                o = o * 2.0
-                return o
-
-        x = torch.randn(batch, c, hw, hw, dtype=torch.float, device="cuda").to(dtype=dtype).requires_grad_()
-        grad = torch.randint(-20, 20, (batch, c, hw, hw), device="cuda").to(dtype=dtype).div(-10)
-
-        my_module = MyModule(c, affine, track_running_stats).cuda()
-        ref_module = MyModule(c, affine, track_running_stats).cuda()
-
-        if not train:
-            my_module.eval()
-            ref_module.eval()
-
-        t_jit = torch.jit.script(my_module)
-        ref_module.load_state_dict(my_module.state_dict())
-
-        ref_x = x.detach().requires_grad_()
-
-        for i in range(0, 3):
-            jit_o = t_jit(x)
-            jit_o.backward(grad)
-
-        # TODO: remove this run?
-        o = ref_module(ref_x)
-        o.backward(grad)
-
-        has_affine = ref_module.bn.weight is not None
-        has_running_stats = ref_module.bn.running_mean is not None
-
-        if has_running_stats:
-            my_module.bn.running_mean.zero_()
-            my_module.bn.running_var.fill_(1.0)
-            ref_module.bn.running_mean.zero_()
-            ref_module.bn.running_var.fill_(1.0)
-
-        # Verify that when train is False, we don't have grad for weight/bias.
-        if has_affine and train:
-            my_module.bn.weight.grad.zero_()
-            my_module.bn.bias.grad.zero_()
-            ref_module.bn.weight.grad.zero_()
-            ref_module.bn.bias.grad.zero_()
-
-        x.grad.zero_()
-        ref_x.grad.zero_()
-
-        # real runs
-        jit_o = t_jit(x)
-        jit_o.backward(grad)
-
-        o = ref_module(ref_x)
-        o.backward(grad)
-
-        # assert forward graph fusion
-        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1, consider_subgraphs=True)
-        # assert backward graph fusion
-        bwd_graph = list(
-            list(t_jit.get_debug_state().execution_plans.values())[0].code.grad_executor_states()[0]
-            .execution_plans.values())[0].graph
-        self.assertGraphContainsExactly(bwd_graph, FUSION_GUARD, 1, consider_subgraphs=True)
-
-        if TEST_WITH_ROCM:
-            e0 = 1e-3
-            e1 = 1e-2
-            e2 = 1e-2
-        else:
-            e0 = 1e-5 if dtype is not torch.half else 1e-3
-            e1 = 1e-4 if dtype is not torch.half else 1e-3
-            e2 = 1e-3 if dtype is not torch.half else 1e-2
-
-        self.assertTrue(self._compare("comparing output failed", jit_o, o, e0))
-        self.assertTrue(self._compare("comparing input grad failed", x.grad, ref_x.grad, e1))
-        # TODO: switch to welford and reduce this to 1e-5
-        # The 1e-3 looks bad, but we don't have welford in codegen, so numeric
-        # is very different between reference and codegen.
-        if has_affine and train:
-            self.assertTrue(self._compare("comparing weight grad failed",
-                                          my_module.bn.weight.grad,
-                                          ref_module.bn.weight.grad,
-                                          e2))
-            self.assertTrue(self._compare("comparing bias grad failed",
-                                          my_module.bn.bias.grad,
-                                          ref_module.bn.bias.grad,
-                                          e1))
-        if has_running_stats:
-            self.assertTrue(self._compare("comparing running_mean failed",
-                                          my_module.bn.running_mean,
-                                          ref_module.bn.running_mean,
-                                          e0))
-            self.assertTrue(self._compare("comparing running_var failed",
-                                          my_module.bn.running_var,
-                                          ref_module.bn.running_var,
-                                          e0))
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_batch_norm_half(self):
-        with torch.backends.cudnn.flags(enabled=True):
-            setups = [
-                [True, True],
-                [False, False],
-                [True, False],
-                [False, True]]
-            for training_and_track, affine in itertools.product(setups, [True, False]):
-                training, track_running_stats = training_and_track
-                self._test_batch_norm_impl_index_helper(4, 8, 5, affine, track_running_stats, training, torch.half)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_batch_norm_impl_index_inner_bcast(self):
-        # the repro
-        self._test_batch_norm_impl_index_helper(2, 1, 1, False, True, True)
-
-        # running the full set
-        setups = [
-            [True, True],
-            [False, False],
-            [True, False],
-            [False, True]]
-        for training_and_track, affine in itertools.product(setups, [True, False]):
-            training, track_running_stats = training_and_track
-            self._test_batch_norm_impl_index_helper(2, 1, 1, affine, track_running_stats, training)
-
-    @skipIfRocm
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_batch_norm_impl_index_correctness(self):
-        with torch.backends.cudnn.flags(enabled=True):
-            batch = [2, 7, 16]
-            channels = [4, 89, 19, 32]
-            hw = [1, 8, 17, 32]
-
-            # avoid tolerance failure in CI
-            torch.cuda.manual_seed_all(211)
-
-            # failing sizes (2, 1, 1, 1)
-            # failing sizes (2, 89, 8, 8) training False, track True, affine: False
-            for b, c, hw in itertools.product(batch, channels, hw):
-                setups = [
-                    [True, True],
-                    [False, False],
-                    [True, False],
-                    [False, True]]
-                for training_and_track, affine in itertools.product(setups, [True, False]):
-                    training, track_running_stats = training_and_track
-                    self._test_batch_norm_impl_index_helper(b, c, hw, affine, track_running_stats, training)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_softplus_fuser(self):
-        def shifted_softplus(x: torch.Tensor, shift: float):
-            return functional.softplus(x) - shift
-
-        jitted = torch.jit.script(shifted_softplus)
-        inp = torch.randn(4, 2, dtype=torch.float32, device="cuda").requires_grad_()
-        inp_ref = inp.detach().clone().requires_grad_()
-        grad = torch.randn(4, 2, dtype=torch.float32, device="cuda")
-
-        aten_o = shifted_softplus(inp_ref, 0.693147)
-        aten_o.backward(grad)
-        aten_grad = inp_ref.grad
-
-        for i in range(3):
-            jit_o = jitted(inp, 0.693147)
-            inp.grad = None         # avoid accumulation on grad
-            jit_o.backward(grad)
-            jit_grad = inp.grad
-
-        assert torch.allclose(jit_o, aten_o)
-        assert torch.allclose(jit_grad, aten_grad)
-        self.assertGraphContains(jitted.graph_for(inp, 0.693147), FUSION_GROUP, True)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_inplace_removal(self):
-        def t(x: torch.Tensor):
-            o = torch.nn.functional.softmax(x, dim=0)
-            o += x
-            return o.relu_()
-
-        jitted = torch.jit.script(t)
-        inp = torch.randn(4, 2, dtype=torch.float32, device="cuda")
-
-        for i in range(3):
-            jit_o = jitted(inp)
-
-        graph = jitted.graph_for(inp)
-        self.assertGraphContains(graph, FUSION_GROUP, True)
-        self.assertGraphContains(graph, 'aten::add', True)
-        self.assertGraphContains(graph, 'aten::relu', True)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_conv2d_bias(self):
-        def t(x: torch.Tensor, w: torch.Tensor, bias: torch.Tensor):
-            o = torch.nn.functional.conv2d(x, w, bias)
-            return o.relu()
-
-        jitted = torch.jit.script(t)
-        inp = torch.randn(4, 5, 3, 3, dtype=torch.float32, device="cuda")
-        weight = torch.randn(2, 5, 2, 2, dtype=torch.float32, device="cuda")
-        bias = torch.randn(2, dtype=torch.float32, device="cuda")
-
-        for i in range(3):
-            jit_o = jitted(inp, weight, bias)
-
-        graph = jitted.graph_for(inp)
-        self.assertGraphContains(graph, FUSION_GROUP, True)
-
-        def t_not_fused(x: torch.Tensor, w: torch.Tensor):
-            o = torch.nn.functional.conv2d(x, w)
-            return o.relu()
-
-        jitted_not_fused = torch.jit.script(t_not_fused)
-
-        for i in range(3):
-            jit_o = jitted_not_fused(inp, weight)
-
-        graph = jitted_not_fused.graph_for(inp)
-        self.assertGraphContainsExactly(graph, FUSION_GROUP, 0)
-        self.assertGraphContains(graph, 'aten::relu', True)
-
-        def t_bias(x: torch.Tensor, w: torch.Tensor, bias: torch.Tensor):
-            o = torch.nn.functional.conv2d(x, w, bias)
-            return o.relu()
-
-        jitted_bias = torch.jit.script(t_bias)
-
-        for i in range(3):
-            jit_o = jitted_bias(inp, weight, bias)
-
-        graph = jitted_bias.graph_for(inp)
-        self.assertGraphContains(graph, FUSION_GROUP, True)
-        self.assertGraphContains(graph, 'prim::add_optional', True)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_remove_output_used_only_in_dtype(self):
-        class MyModule(torch.nn.Module):
-            def __init__(self, num_features=4):
-                super(MyModule, self).__init__()
-                self.bn0 = torch.nn.BatchNorm2d(num_features)
-                self.bn1 = torch.nn.BatchNorm2d(num_features)
-
-            def forward(self, x, y):
-                o1 = self.bn0(x)
-                o2 = self.bn1(y)
-                return torch.relu(o1 + o2)
-
-        t = MyModule(4).float().cuda()
-
-        jitted = torch.jit.script(t)
-        x = torch.randn(3, 4, 2, 5, dtype=torch.float32, device="cuda")
-        y = torch.randn(3, 4, 2, 5, dtype=torch.float32, device="cuda")
-
-        with torch.cuda.amp.autocast(True):
-            for i in range(5):
-                jit_o = jitted(x, y)
-
-            jit_o = jitted(x, y)
-            o = t(x, y)
-
-            self.assertTrue(torch.allclose(jit_o, o))
-            graph = jitted.graph_for(x, y)
-            self.assertGraphContains(graph, FUSION_GROUP, True)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_fix_shape_expression_bn(self):
-        class MyModule(torch.nn.Module):
-            def __init__(self, num_features=4):
-                super(MyModule, self).__init__()
-                self.bn = torch.nn.BatchNorm2d(num_features)
-
-            def forward(self, x, y):
-                out1 = self.bn(x)
-                out2 = out1 + y
-                out3 = torch.relu(out2)
-                return out3
-
-        t = MyModule(4).float().cuda()
-
-        jitted = torch.jit.script(t)
-        x = torch.randn(3, 4, 2, 5, dtype=torch.float32, device="cuda")
-        y = torch.randn(3, 4, 2, 5, dtype=torch.float32, device="cuda")
-
-        with torch.cuda.amp.autocast(True):
-            for i in range(5):
-                jit_o = jitted(x, y)
-
-            jit_o = jitted(x, y)
-            o = t(x, y)
-
-            self.assertTrue(torch.allclose(jit_o, o))
-            graph = jitted.graph_for(x, y)
-            self.assertGraphContains(graph, FUSION_GROUP, True)
-
-    def _run_fwd_helper(self, func, ops, *args):
-        jitted = torch.jit.script(func)
-        for i in range(3):
-            jit_o = jitted(*args)
-        jit_o = jitted(*args)
-        o = func(*args)
-        for oo, jit_oo in zip(o, jit_o):
-            self.assertEqual(oo.dtype, jit_oo.dtype)
-            self.assertEqual(oo, jit_oo)
-        graph = jitted.graph_for(*args)
-        self.assertGraphContains(graph, FUSION_GROUP, True)
-        for op in ops:
-            self.assertGraphContainsExactly(graph, op, 0)
-
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_sibling_fusion(self):
-        device = "cuda"
-        dtype = torch.float
-        x = torch.randn(2, 5, dtype=dtype, device=device)
-        y = torch.randn(2, 5, dtype=dtype, device=device)
-
-        def t(x: torch.Tensor):
-            o1 = x + 1.0
-            o2 = x * 0.5
-            return o1, o2
-        self._run_fwd_helper(t, ['aten::add', 'aten::mul'], x)
-
-        def t2(x: torch.Tensor, y: torch.Tensor):
-            o1 = x.sum(0)
-            o2 = (x * y).sum(0)
-            return o1, o2
-        self._run_fwd_helper(t2, ['aten::sum', 'aten::mul'], x, y)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_clean_profile_ivalue(self):
-        device = "cuda"
-        dtype = torch.float
-        x = torch.randn(2, 5, dtype=dtype, device=device, requires_grad=True)
-        # turn on autodiff subgraph inlining
-        # this is to verify that we clean up profile_ivalue node out side of
-        # fusion code path.
-        torch._C._debug_set_autodiff_subgraph_inlining(True)
-
-        def t(x: torch.Tensor, flag: bool):
-            return torch.dropout(x, 0.5, flag)
-
-        jit_t = torch.jit.script(t)
-        for idx in range(5):
-            out = jit_t(x, True)
-
-        graph = jit_t.graph_for(x, True)
-        out = jit_t(x, False)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_sibling_fusion_no_scalar_inputs(self):
-        device = "cuda"
-        dtype = torch.float
-        x = torch.randn(2, 5, dtype=dtype, device=device)
-        y = torch.randn(3, dtype=dtype, device=device)
-
-        # no tensor dependency between o1/o2, we shouldn't be fusing them
-        def t(x: torch.Tensor, y: torch.Tensor):
-            o1 = x + 1
-            o2 = y - 1
-            return o1, o2
-
-        jitted = torch.jit.script(t)
-        for i in range(3):
-            jit_o = jitted(x, y)
-        graph = jitted.graph_for(x, y)
-        self.assertGraphContainsExactly(graph, FUSION_GROUP, 0)
-
-    def _bias_view_relu_helper(self, shape, output_shape, dtype, device, error):
-        class BiasViewRelu(torch.nn.Module):
-            def __init__(self):
-                super(BiasViewRelu, self).__init__()
-                self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False)
-                with torch.no_grad():
-                    self.bias.fill_(10)
-
-            def forward(self, inputs: torch.Tensor, view_shape: List[int]):
-                o = inputs + self.bias
-                o = o.view(view_shape)
-                return torch.relu(o)
-
-        t = BiasViewRelu()
-        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        t_jit = torch.jit.script(t)
-
-        # profiling
-        jit_o = t_jit(x, output_shape)
-        # optimization
-        jit_o = t_jit(x, output_shape)
-        # final
-        jit_o = t_jit(x, output_shape)
-        # eager - baseline
-        o = t(x, output_shape)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
-        graph = t_jit.graph_for(x, output_shape)
-
-        has_inferred_dimension = any([dim == -1 for dim in output_shape])
-        if has_inferred_dimension:
-            # prohibit fusing when view_shape contains an inferred dimension
-            self.assertGraphContainsExactly(graph, FUSION_GROUP, 0)
-            self.assertGraphContainsExactly(graph, 'prim::view_copy', 0)
-        else:
-            self.assertGraphContains(graph, FUSION_GUARD)
-            self.assertGraphContains(graph, 'prim::view_copy', True)
-
-    def _alias_bias_view_relu_helper(self, shape, output_shape, dtype, device, error):
-        class BiasViewRelu(torch.nn.Module):
-            def __init__(self):
-                super(BiasViewRelu, self).__init__()
-                self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False)
-                with torch.no_grad():
-                    self.bias.fill_(10)
-
-            def forward(self, inputs : torch.Tensor, bias : torch.Tensor, view_shape : List[int]):
-                o = inputs.view(view_shape)
-                inputs.add_(bias)
-                return torch.relu(o)
-
-        t = BiasViewRelu()
-        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        t_jit = torch.jit.script(t)
-
-        # profiling
-        jit_o = t_jit(x.clone(), bias, output_shape)
-        # optimization
-        jit_o = t_jit(x.clone(), bias, output_shape)
-        # final
-        jit_o = t_jit(x.clone(), bias, output_shape)
-        # eager - baseline
-        o = t(x.clone(), bias, output_shape)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
-        graph = t_jit.graph_for(x, bias, output_shape)
-        self.assertGraphContainsExactly(graph, FUSION_GUARD, 0)
-        self.assertGraphContainsExactly(graph, 'prim::view_copy', 0)
-
-    # generate random view given original view
-    def _random_view(self, original_view, max_len=8, max_views=10000):
-        class Moves(enum.Enum):
-            Merge = 0
-            Split = 1
-            Broadcast = 2
-            ImplicitBroadcast = 3
-            Keep = 4
-
-        def valid(old_view, new_view):
-            old_view_size = reduce(operator.mul, old_view)
-            new_view_size = reduce(operator.mul, new_view)
-            return old_view_size == new_view_size
-
-        # given a random starting number, find the nearest divisor
-        def find_nearest_divisor(N):
-            if 2 >= (N - 1):
-                return -1
-            result = random.randint(2, N - 1)
-            while (N % result) != 0:
-                result += 1
-            return result
-
-        complete_views = set([tuple(original_view)])
-
-        to_visit = []
-        # empty new view, curent originaal view, start pos=0, move count = 0, last_move
-        to_visit.append(([], original_view, 0, [], Moves.Keep))
-
-        # depth-first search of view shapes, starting from the original view
-        while len(to_visit) > 0 and len(complete_views) < max_views:
-            new_view, old_view, odx, move_list, last_move = to_visit[-1]
-            to_visit.pop()
-
-            # iterate over each move type
-            for idx in range(len(Moves)):
-                state = Moves(idx)
-                new_view_clone = copy.deepcopy(new_view)
-                old_view_clone = copy.deepcopy(old_view)
-                new_move_list = move_list + [state]
-                new_odx = odx
-
-                # Update state using Move state
-                if state == Moves.Keep:
-                    new_size = old_view_clone[odx]
-                    new_view_clone.append(new_size)
-                    new_odx += 1
-
-                elif state == Moves.Merge:
-                    if odx + 1 < len(old_view_clone):
-                        new_size = old_view_clone[odx] * old_view_clone[odx + 1]
-                        new_view_clone.append(new_size)
-                        new_odx += 2
-                    else:
-                        continue
-
-                elif state == Moves.Broadcast and last_move != Moves.Broadcast:
-                    new_view_clone.append(1)
-
-                elif state == Moves.Split:
-                    new_size = find_nearest_divisor(old_view_clone[odx])
-                    if new_size == -1:
-                        continue
-                    new_view_clone.append(new_size)
-                    old_view_clone[odx] = int(old_view[odx] / new_size)
-
-                    if old_view_clone[odx] == 1:
-                        new_odx += 1
-
-                elif state == Moves.ImplicitBroadcast:
-                    old_view_clone.insert(odx + 1, 1)
-                    new_size = old_view[odx] * 1
-                    new_view_clone.append(new_size)
-                    new_odx += 2
-
-                if new_odx < len(old_view_clone) and len(new_move_list) < max_len:
-                    to_visit.append((new_view_clone, old_view_clone, new_odx, new_move_list, state))
-                elif (valid(original_view, new_view_clone)):
-                    final_new_view = tuple(new_view_clone)
-                    complete_views.add(final_new_view)
-        return list(complete_views)
-
-    # ndims - number of dimensions
-    # test_fn - view test function
-    def _view_test_generator(self, ndims, test_fn):
-        # create random tensor
-        # max value for each dimension
-        max_size = 10e7
-        max_value = max(int(pow(max_size, 1. / ndims)), 1)
-        sizes = [random.randint(1, max_value) for idx in range(ndims)]
-        x = torch.randn(sizes)
-
-        original_sizes = list(x.size())
-        all_views = self._random_view(original_sizes)
-        random.shuffle(all_views)
-
-        max_samples = 20
-        max_views = min(len(all_views), max_samples)
-        total = 0
-        correct = 0
-        # test random combinations of compatible views
-        for idx in range(max_views):
-            for jdx in range(idx + 1, max_views):
-                total += 1
-                test_fn(all_views[idx], all_views[jdx], torch.float, 'cuda', 1e-6)
-
-    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since view is disabled now")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_view(self):
-        torch._C._jit_set_nvfuser_guard_mode(True)
-        self._bias_view_relu_helper([2, 3, 4, 5], [-1, 4, 5], torch.float, 'cuda', 1e-6)
-        for ndims in range(1, 5):
-            self._view_test_generator(ndims, self._bias_view_relu_helper)
-        self._alias_bias_view_relu_helper([2, 3, 4, 5], [1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6)
-
-    def _bias_flatten_relu_helper(self, shape, start_dim, end_dim, dtype, device, error):
-        class BiasFlattenRelu(torch.nn.Module):
-            def __init__(self):
-                super(BiasFlattenRelu, self).__init__()
-                self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False)
-                with torch.no_grad():
-                    self.bias.fill_(10)
-
-            def forward(self, inputs : torch.Tensor, start_dim : int, end_dim : int):
-                o = inputs + self.bias
-                o = o.flatten(start_dim, end_dim)
-                return torch.relu(o)
-
-        t = BiasFlattenRelu()
-        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        t_jit = torch.jit.script(t)
-
-        self._run_helper(t_jit, t, x, start_dim, end_dim)
-        self.assertGraphContains(t_jit.graph_for(x, start_dim, end_dim), 'prim::flatten_copy', True)
-
-    def _alias_bias_flatten_relu_helper(self, shape, start_dim, end_dim, dtype, device, error):
-        class BiasFlattenRelu(torch.nn.Module):
-            def __init__(self):
-                super(BiasFlattenRelu, self).__init__()
-                self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False)
-                with torch.no_grad():
-                    self.bias.fill_(10)
-
-            def forward(self, inputs : torch.Tensor, bias : torch.Tensor, start_dim : int, end_dim : int):
-                o = inputs.flatten(start_dim, end_dim)
-                inputs.add_(bias)
-                return torch.relu(o)
-
-        t = BiasFlattenRelu()
-        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        t_jit = torch.jit.script(t)
-
-        # profiling
-        jit_o = t_jit(x.clone(), bias, start_dim, end_dim)
-        # optimization
-        jit_o = t_jit(x.clone(), bias, start_dim, end_dim)
-        # final
-        jit_o = t_jit(x.clone(), bias, start_dim, end_dim)
-        # eager - baseline
-        o = t(x.clone(), bias, start_dim, end_dim)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
-        graph = t_jit.graph_for(x, bias, start_dim, end_dim)
-
-        self.assertGraphContainsExactly(graph, FUSION_GUARD, 0)
-        self.assertGraphContainsExactly(graph, 'prim::flatten_copy', 0)
-
-    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since flatten is disabled now")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_flatten(self):
-        torch._C._jit_set_nvfuser_guard_mode(True)
-        self._bias_flatten_relu_helper([2, 3, 4, 5], 0, -1, torch.float, 'cuda', 1e-6)
-        self._bias_flatten_relu_helper([2, 3, 4, 5], 1, -1, torch.float, 'cuda', 1e-6)
-        self._bias_flatten_relu_helper([2, 3, 4, 5], 2, -1, torch.float, 'cuda', 1e-6)
-        self._bias_flatten_relu_helper([2, 3, 4, 5], 0, 3, torch.float, 'cuda', 1e-6)
-        self._bias_flatten_relu_helper([2, 3, 4, 5], 1, 2, torch.float, 'cuda', 1e-6)
-        self._bias_flatten_relu_helper([2, 3, 4, 5], 2, 2, torch.float, 'cuda', 1e-6)
-        self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 0, -1, torch.float, 'cuda', 1e-6)
-        self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 1, -1, torch.float, 'cuda', 1e-6)
-        self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 2, -1, torch.float, 'cuda', 1e-6)
-        self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 0, 3, torch.float, 'cuda', 1e-6)
-        self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 1, 2, torch.float, 'cuda', 1e-6)
-        self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 2, 2, torch.float, 'cuda', 1e-6)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_strict_fusion(self):
-        def success(x):
-            with torch.jit.strict_fusion():
-                return x + x + x
-
-        scripted = self.checkScript(success, (torch.rand([4], device='cuda'),))
-        g = torch.jit.last_executed_optimized_graph()
-        FileCheck().check_not("aten::add").check("prim::CudaFusionGroup").run(g)
-
-        def failure(x):
-            with torch.jit.strict_fusion():
-                return x + torch.mm(x, x) + x
-
-        with self.assertRaises(Exception) as error_out:
-            foo_s = torch.jit.script(failure)
-            foo_s(torch.rand([4, 4]))
-            foo_s(torch.rand([4, 4]))
-
-        fc = FileCheck().check("Found unfused operators")
-        fc.check("aten::mm").run(str(error_out.exception))
-
-    def _ltc_helper(self, shape, dtype, device, error, approximate=True):
-        # modeled after LTC linear layer
-        class LTC(torch.nn.Module):
-            def __init__(self):
-                super(LTC, self).__init__()
-                self.weight = torch.nn.Parameter(torch.randn([1024, 1024], dtype=dtype, device=device), requires_grad=False)
-                self.bias = torch.nn.Parameter(torch.randn([1, 1024], dtype=dtype, device=device), requires_grad=False)
-
-            def forward(self, inputs : torch.Tensor):
-                o = inputs.view([32768, 1024])
-                o = torch.mm(o, self.weight)
-                o = o.view([256, 128, 1024])
-                o = o + self.bias
-                o = o.view([32768, 1024])
-                o = o.view([256, 128, 1024])
-                return torch.nn.functional.gelu(o)
-
-        t = LTC()
-        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        t_jit = torch.jit.script(t)
-
-        # profile/optimization runs
-        for i in range(3):
-            jit_o = t_jit(x)
-        o = t(x)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
-        graph = t_jit.graph_for(x)
-        self.assertGraphContains(graph, FUSION_GUARD)
-        self.assertGraphContains(graph, 'prim::view_copy', True)
-
-    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since view is disabled now")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_nested_view(self):
-        self._ltc_helper([256, 128, 1024], torch.float, 'cuda', 1e-6)
-
-    def _bias_squeeze_relu_helper(self, shape, dtype, device, error):
-        class BiasSqueezeRelu(torch.nn.Module):
-            def __init__(self):
-                super(BiasSqueezeRelu, self).__init__()
-
-            def forward(self, inputs: torch.Tensor, bias: torch.Tensor):
-                o = inputs + bias
-                o = torch.squeeze(o)
-                return torch.relu(o)
-
-        t = BiasSqueezeRelu()
-        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        t_jit = torch.jit.script(t)
-
-        jit_o = t_jit(x, bias)
-        jit_o = t_jit(x, bias)
-        jit_o = t_jit(x, bias)
-        o = t(x, bias)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
-        graph = t_jit.graph_for(x, bias)
-        self.assertGraphContains(graph, FUSION_GUARD)
-        self.assertGraphContains(graph, 'prim::squeeze_copy', True)
-
-    def _alias_bias_squeeze_relu_helper(self, shape, dtype, device, error):
-        class BiasSqueezeRelu(torch.nn.Module):
-            def __init__(self):
-                super(BiasSqueezeRelu, self).__init__()
-
-            def forward(self, inputs: torch.Tensor, bias: torch.Tensor):
-                o = torch.squeeze(inputs)
-                inputs.add_(bias)
-                return torch.relu(o)
-
-        t = BiasSqueezeRelu()
-        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        t_jit = torch.jit.script(t)
-
-        jit_o = t_jit(x.clone(), bias)
-        jit_o = t_jit(x.clone(), bias)
-        jit_o = t_jit(x.clone(), bias)
-        o = t(x.clone(), bias)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
-        graph = t_jit.graph_for(x, bias)
-        self.assertGraphContainsExactly(graph, FUSION_GUARD, 0)
-        self.assertGraphContainsExactly(graph, 'prim::squeeze_copy', 0)
-
-    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_squeeze(self):
-        self._bias_squeeze_relu_helper([1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6)
-        self._alias_bias_squeeze_relu_helper([1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6)
-
-    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now")
-    # remove this after opinfo tests are enabled
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_squeeze_zero(self):
-        x = torch.tensor(1.0, dtype=torch.float, device="cuda")
-
-        def squeeze_0(x: torch.Tensor):
-            o = x + 1.
-            o = torch.squeeze(o, 0)
-            o = o * 2.
-            return o
-
-        def squeeze_1(x: torch.Tensor):
-            o = x + 1.
-            o = torch.squeeze(o, -1)
-            o = o + .5
-            return o
-
-        squeeze_0_jit = torch.jit.script(squeeze_0)
-        self._run_helper(squeeze_0_jit, squeeze_0, x)
-        squeeze_1_jit = torch.jit.script(squeeze_1)
-        self._run_helper(squeeze_1_jit, squeeze_1, x)
-
-    def _bias_unsqueeze_relu_helper(self, shape, dtype, device, error):
-        class BiasUnsqueezeRelu(torch.nn.Module):
-            def __init__(self):
-                super(BiasUnsqueezeRelu, self).__init__()
-
-            def forward(self, inputs: torch.Tensor, bias: torch.Tensor):
-                o = inputs + bias
-                o = torch.unsqueeze(o, 0)
-                return torch.relu(o)
-
-        t = BiasUnsqueezeRelu()
-        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        t_jit = torch.jit.script(t)
-
-        jit_o = t_jit(x, bias)
-        jit_o = t_jit(x, bias)
-        jit_o = t_jit(x, bias)
-        o = t(x, bias)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
-        graph = t_jit.graph_for(x, bias)
-        self.assertGraphContains(graph, FUSION_GUARD)
-        self.assertGraphContains(graph, 'prim::unsqueeze_copy', True)
-
-    def _alias_bias_unsqueeze_relu_helper(self, shape, dtype, device, error):
-        class BiasUnsqueezeRelu(torch.nn.Module):
-            def __init__(self):
-                super(BiasUnsqueezeRelu, self).__init__()
-
-            def forward(self, inputs : torch.Tensor, bias : torch.Tensor):
-                o = torch.unsqueeze(inputs, 0)
-                inputs.add_(bias)
-                return torch.relu(o)
-
-        t = BiasUnsqueezeRelu()
-        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
-        t_jit = torch.jit.script(t)
-
-        jit_o = t_jit(x.clone(), bias)
-        jit_o = t_jit(x.clone(), bias)
-        jit_o = t_jit(x.clone(), bias)
-        o = t(x.clone(), bias)
-
-        self.assertEqual(o.dtype, jit_o.dtype)
-        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
-        graph = t_jit.graph_for(x, bias)
-        self.assertGraphContainsExactly(graph, FUSION_GUARD, 0)
-        self.assertGraphContainsExactly(graph, 'prim::unsqueeze_copy', 0)
-
-    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_unsqueeze(self):
-        self._bias_unsqueeze_relu_helper([2, 3, 4, 5], torch.float, 'cuda', 1e-6)
-        self._alias_bias_unsqueeze_relu_helper([2, 3, 4, 5], torch.float, 'cuda', 1e-6)
-
-    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since unsqueeze is disabled now")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_alias_pass_fix(self):
-        x = torch.randn(4, 24, 2, 2, dtype=torch.float, device="cuda")
-        w = torch.randn(24, 24, 1, 1, dtype=torch.float, device="cuda")
-        b = torch.randn(24, dtype=torch.float, device="cuda")
-
-        def t(x, w, b):
-            b2 = b + 1.0
-            o = torch.conv2d(x, w, b2)
-            return o
-
-        t_jit = torch.jit.script(t)
-        self._run_helper(t_jit, t, x, w, b)
-
-    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_squeeze_negative_dim(self):
-        x = torch.randn(4, 24, 1, 2, dtype=torch.float, device="cuda")
-
-        def t(x):
-            o = x + 1.0
-            o = o.squeeze(-2)
-            o = o * 2.0
-            return o
-
-        t_jit = torch.jit.script(t)
-        self._run_helper(t_jit, t, x)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_singleton_fusion(self):
-        x = torch.randn(4, 2, device="cuda")
-
-        with nvfuser_singleton_fusion(True):
-            def t(x):
-                return x.relu()
-
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_issue1445_fusion(self):
-        def f(t0, t1, t2, t3):
-            masked_input = torch.where(t1, t2, t3)
-            total = masked_input.sum([0, 1, 2, 3])
-            sizes : List[int] = []
-            t10 = torch.reshape(t0, sizes)
-            t7 = total / t10
-            t4 = t7.to(dtype=torch.float)
-            return t4
-
-        x = torch.randn(1, 1, 1, 1, device='cuda').to(dtype=torch.long)
-        y = torch.randn(3, 2, 1, 1, device='cuda').to(dtype=torch.bool).expand([3, 2, 1, 2])
-        z = torch.randn(3, 2, 1, 2, device='cuda')
-        w = torch.tensor(1.5, device='cuda')
-
-        f_jit = torch.jit.script(f)
-        for i in range(5):
-            out_jit = f_jit(x, y, z, w)
-        out = f(x, y, z, w)
-        self.assertEqual(out, out_jit)
-        self.assertGraphContainsExactly(f_jit.graph_for(x, y, z, w), FUSION_GROUP, 1)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_disable_sibling_fuse(self):
-        x = torch.randn(4, 2, device="cuda")
-        y = torch.randn(8, device="cuda")
-        s = torch.tensor(1.5, device="cuda")
-
-        with nvfuser_horizontal_fusion(False):
-            def t(x, y, s):
-                o1 = x + s
-                o2 = y + s
-                return o1, o2
-
-            t_jit = torch.jit.script(t)
-            for i in range(5):
-                t_jit(x, y, s)
-
-            # sibling fusion should be disabled with the flag
-            self.assertGraphContainsExactly(t_jit.graph_for(x, y, s), FUSION_GUARD, 0)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_build_shape_expression_native_dropout(self):
-        x = torch.randn(4, 2, device="cuda")
-
-        def t(x):
-            o, mask = torch.native_dropout(x, 0.0, True)
-            o1 = o.sigmoid()
-            o2 = mask.float().sigmoid()
-            return (o1, o2)
-
-        t_jit = torch.jit.script(t)
-
-        jit_o = t_jit(x)
-        jit_o = t_jit(x)
-        o = t(x)
-        for oo, jit_oo in zip(o, jit_o):
-            self.assertEqual(oo.dtype, jit_oo.dtype)
-            self.assertEqual(oo, jit_oo)
-        self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_scalar_tensor_permuted(self):
-        x = torch.randn(4, 2, 3, device="cuda").permute([1, 2, 0])
-        y = torch.tensor(1.0, device="cuda")
-
-        with nvfuser_singleton_fusion(True):
-            def t(x, y):
-                return x + y
-
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x, y)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_cpu_scalar(self):
-        x = torch.randn(4, 2, 3, device="cuda")
-        y = torch.tensor(1.0, device="cpu")
-        z = torch.tensor(2.0, device="cpu")
-
-        with nvfuser_singleton_fusion(True):
-            # testing cpu scalar tensor promotion
-            def t(x, y):
-                return x + y
-
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x, y)
-
-            # scalar cpu tensor add should NOT be fused
-            @torch.jit.script
-            def t1(y, z):
-                return y * z
-            for _ in range(5):
-                t1(y, z)
-            self.assertGraphContainsExactly(t1.graph_for(y, z), FUSION_GUARD, 0)
-
-            # everything, including scalar cpu tensor add should be fused
-            @torch.jit.script
-            def t2(x, y, z):
-                tmp = y + z
-                return tmp + x
-            for _ in range(5):
-                t2(x, y, z)
-            self.assertGraphContainsExactly(t2.graph_for(x, y, z), 'aten::add', 0)
-            self.assertGraphContainsExactly(t2.graph_for(x, y, z), FUSION_GUARD, 1)
-
-            # 'cpu_tmp = y + z' shouldn't be fused.
-            @torch.jit.script
-            def t3(x, y, z):
-                cpu_tmp = y + z
-                out = x + y
-                return cpu_tmp, out
-            for _ in range(5):
-                t3(x, y, z)
-            self.assertGraphContainsExactly(t3.graph_for(x, y, z), FUSION_GUARD, 1)
-            self.assertGraphContainsExactly(t3.graph_for(x, y, z), 'aten::add', 1)
-
-    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_shape_expression(self):
-        x = torch.randn(4, 2, 1, 3, device="cuda")
-
-        def t_unsqueeze(x):
-            t0 = x.relu()
-            t1 = t0.unsqueeze(1)
-            t2 = t1 + 1.0
-            t3 = t1.size()
-            return t2, t3
-
-        def t_squeeze(x):
-            t0 = x.relu()
-            t1 = t0.squeeze()
-            t2 = t1 + 1.0
-            t3 = t1.size()
-            return t2, t3
-
-        def t_squeeze_dim(x):
-            t0 = x.relu()
-            t1 = t0.squeeze(-2)
-            t2 = t1 + 1.0
-            t3 = t1.size()
-            return t2, t3
-
-        # squeezing a non-size 1 dimension should be a no op
-        def t_squeeze_dim_no_op(x):
-            t0 = x.relu()
-            t1 = t0.squeeze(1)
-            t2 = t1 + 1.0
-            t3 = t1.size()
-            return t2, t3
-
-        def run(fn):
-            jit_fn = torch.jit.script(fn)
-            jit_o = jit_fn(x)
-            jit_o = jit_fn(x)
-            jit_o = jit_fn(x)
-            o = fn(x)
-            # output 0 is a tensor, so we check dtype and value
-            self.assertEqual(o[0].dtype, jit_o[0].dtype)
-            self.assertEqual(o[0], jit_o[0])
-            # output 1 is shape
-            self.assertEqual(o[1], jit_o[1])
-            self.assertGraphContainsExactly(jit_fn.graph_for(x), FUSION_GUARD, 1)
-
-        for t in [t_unsqueeze, t_squeeze, t_squeeze_dim, t_squeeze_dim_no_op]:
-            run(t)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_scalar_cuda_tensor(self):
-        x = torch.tensor(2.0, device="cuda")
-
-        with nvfuser_singleton_fusion(True):
-            def t(x):
-                return x + 1.0
-
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x)
-
-            @torch.jit.script
-            def t_jitted(x):
-                return x.sum(0)
-
-            for i in range(5):
-                t_jitted(x)
-            self.assertGraphContainsExactly(t_jitted.graph_for(x), FUSION_GUARD, 0)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_overlapped_input(self):
-        x = torch.randn(8, device="cuda").as_strided((2, 4), (1, 1))
-
-        with nvfuser_singleton_fusion(True):
-            def t(x):
-                return x + 1.0
-
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    def test_reduction_empty_axes(self):
-        x = torch.randn(4, 2, 3, device="cuda").permute([1, 2, 0])
-
-        with nvfuser_singleton_fusion(True):
-            def t(x):
-                sizes : List[int] = []
-                return x.sum(sizes)
-
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    def test_int_tensor_input(self):
-        x = torch.randn(4, 2, device="cuda").to(dtype=torch.int)
-
-        with nvfuser_singleton_fusion(True):
-            def t(x):
-                return x.amax(dim=0)
-
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_to_boolean(self):
-        x = torch.randn(4, 2, device="cuda")
-
-        with nvfuser_singleton_fusion(True):
-            def t(x):
-                return x.to(dtype=torch.bool)
-
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x)
-
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_to_copy(self):
-        x = torch.randn(4, 2, device="cuda")
-
-        with nvfuser_singleton_fusion(True):
-            def t(x, dtype : torch.dtype):
-                o = torch.ops.aten._to_copy(x, dtype=dtype)
-                return o
-
-            t.__disable_jit_function_caching__ = True
-
-            t_jit = torch.jit.script(t)
-            for dtype in [torch.float16, torch.bool, torch.float64]:
-                self._run_helper(t_jit, t, x, dtype)
-
-            def t_none(x):
-                with torch.jit.strict_fusion():
-                    o = torch.ops.aten._to_copy(x, dtype=None)
-                return o
-
-            t_jit_none = torch.jit.script(t_none)
-            self._run_helper(t_jit_none, t_none, x)
-
-
-    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since reshape is disabled now")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_view_copy_graph_guard(self):
-        x = torch.randn(4, 2, 3, device="cuda").permute([1, 2, 0])
-        y = [4, 6]
-
-        with nvfuser_singleton_fusion(True):
-            def t(x, y : List[int]):
-                t1 = x + 1.0
-                t2 = t1 * 1.0
-                out = t2.reshape(y)
-                return out.relu()
-
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x, y)
-
-    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since view is disabled now")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_view_copy_graph_guard_double_fusion(self):
-        x = torch.randn(2, 2, 5, device="cuda")
-        w = torch.randn(5, 5, device="cuda")
-
-        with nvfuser_singleton_fusion(True):
-            def t(x, w):
-                o = x.view([4, x.size()[-1]])
-                o = torch.matmul(o, w)
-                o = o.view([2, 2, o.size()[1]])
-                return o
-
-            t_jit = torch.jit.script(t)
-            for i in range(3):
-                jit_o = t_jit(x, w)
-            o = t(x, w)
-            self.assertEqual(jit_o, o)
-            self.assertGraphContainsExactly(t_jit.graph_for(x, w), FUSION_GUARD, 2, consider_subgraphs=True)
-
-    @skipIfRocm
-    # see issue here on why we disabled this test https://github.com/csarofeen/pytorch/issues/2127
-    @unittest.skipIf(is_pre_volta(), "permutation scheduling can be dangerous on pre-volta device")
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_view_before_permute(self):
-        view_examples = [[[1, 19, 1, 12, 7, 1, 99], [1, 19, 1, 3, 2772]],
-                         [[3, 17, 80, 1], [51, 1, 2, 4, 10]],
-                         [[3, 17, 80, 1, 9], [51, 1, 2, 4, 10, 9]],
-                         [[2, 3, 4, 5], [1, 6, 1, 2, 2, 5]],
-                         [[22, 22, 2], [22, 11, 1, 1, 4]],
-                         [[37, 9, 7, 6, 10], [333, 2, 2, 3, 35]],
-                         [[8, 1, 1, 8, 1, 8], [8, 2, 4, 1, 8]],
-                         [[1, 333, 1], [1, 37, 9]],
-                         [[1, 333], [1, 1, 1, 111, 1, 3]],
-                         [[1, 27454, 1, 2], [1, 7844, 1, 7]],
-                         [[1, 7844, 1, 7], [1, 27454, 2]]]
-
-        def _getTransposeAxes(sizes):
-            # broadcast do not change
-            # always move inner-most dim
-            # random permutation of other dims
-            result = []
-            valid_sizes = []
-            for idx, val in enumerate(sizes):
-                if val > 1 and idx < len(sizes) - 1:
-                    valid_sizes.append((idx, val))
-                result.append(idx)
-            idx, new_size = valid_sizes[random.randint(0, len(valid_sizes) - 1)]
-            result[idx] = len(sizes) - 1
-            result[len(sizes) - 1] = idx
-            return result
-
-        def _transposeSize(sizes, dims):
-            return [sizes[old_pos] for old_pos in dims]
-
-        for example in view_examples:
-            before_view_size, after_view_size = example
-            axes = _getTransposeAxes(after_view_size)
-            output_size = _transposeSize(after_view_size, axes)
-            self._view_before_permute_helper(before_view_size, after_view_size, output_size, axes)
-
-    def _view_before_permute_helper(self, input_shape, view_shape, output_shape, dims):
-        def t(x, y, view_shape : List[int], dims : List[int]):
-            x_v = x.view(view_shape)
-            x_t = torch.permute(x_v, dims)
-            o = torch.add(x_t, y)
-            o = torch.relu(o)
-            return o
-
-        x = torch.randn(*input_shape, device="cuda")
-        y = torch.randn(*output_shape, device="cuda")
-        t_jit = torch.jit.script(t)
-        self._run_helper(t_jit, t, x, y, view_shape, dims)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_permute(self):
-        max_dims = 4
-        for ndims in range(2, max_dims + 1):
-            shape = [idx + 2 for idx in range(ndims)]
-            for dims in itertools.permutations(range(ndims)):
-                self._permute_helper(shape, dims)
-
-    def _permute_helper(self, shape, dims):
-        def t(x, y, dims : List[int]):
-            x_t = torch.permute(x, dims)
-            y_t = torch.permute(y, dims)
-            o = torch.add(x_t, y_t)
-            o = torch.relu(o)
-            return o
-
-        x = torch.randn(*shape, device="cuda")
-        y = torch.randn(*shape, device="cuda")
-        t_jit = torch.jit.script(t)
-        self._run_helper(t_jit, t, x, y, dims)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_transpose(self):
-        max_dims = 4
-        for ndims in range(2, max_dims + 1):
-            shape = [idx + 2 for idx in range(ndims)]
-            for idx in range(1, ndims):
-                for jdx in range(idx):
-                    self._transpose_helper(shape, idx, jdx)
-
-    def _transpose_helper(self, shape, dim0, dim1):
-        def t(x, y, dim0 : int, dim1 : int):
-            x_t = torch.transpose(x, dim0, dim1)
-            y_t = torch.transpose(y, dim0, dim1)
-            o = torch.add(x_t, y_t)
-            o = torch.nn.functional.gelu(o)
-            return o
-
-        x = torch.randn(*shape, device="cuda")
-        y = torch.randn(*shape, device="cuda")
-        t_jit = torch.jit.script(t)
-        self._run_helper(t_jit, t, x, y, dim0, dim1)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_transpose_default(self):
-        def t(x, y):
-            x_t = torch.t(x)
-            y_t = torch.t(y)
-            o = torch.add(x_t, y_t)
-            o = torch.nn.functional.gelu(o)
-            return o
-
-        x = torch.randn(3, 5, device="cuda")
-        y = torch.randn(3, 5, device="cuda")
-        t_jit = torch.jit.script(t)
-        self._run_helper(t_jit, t, x, y)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_input_output_passthrough(self):
-        def t(t0, t1, t2):
-            mask = t1.to(dtype=torch.bool)
-            masked_input = torch.where(t0, mask, t2)
-            return masked_input, mask
-
-        t_jit = torch.jit.script(t)
-        # stick to integers, this avoid the numerical difference due to our
-        # promotion
-        x = torch.randn(4, 4, device='cuda').to(dtype=torch.bool)
-        y = torch.randn(4, 4, device='cuda').to(dtype=torch.bool)
-        z = torch.tensor(1.0, device='cuda').to(dtype=torch.bool)
-        jit_o = t_jit(x, y, z)
-        jit_o = t_jit(x, y, z)
-        o = t(x, y, z)
-        for oo, jit_oo in zip(o, jit_o):
-            self.assertEqual(oo.dtype, jit_oo.dtype)
-            self.assertEqual(oo, jit_oo)
-        self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_pointwise_reference_tensor(self):
-        def t(input1, input2, scalar):
-            _unsafe_view = torch.ops.aten._unsafe_view(input1, [2, 4, 16])
-            add_ = torch.ops.aten.add_(_unsafe_view, input2)
-            gelu_ = torch.ops.aten.gelu(add_)
-            view_ = torch.ops.aten.view(gelu_, [8, 16])
-            mul_ = torch.ops.aten.mul(add_, scalar)
-            return [view_, mul_]
-
-        x = torch.randn(8, 16, device="cuda")
-        bias = torch.randn(16, device="cuda")
-        scalar = torch.ones(torch.Size([]), device="cuda")
-
-        t_jit = torch.jit.script(t)
-        for i in range(3):
-            jit_o = t_jit(x, bias, scalar)
-        o = t(x, bias, scalar)
-        self.assertEqual(jit_o, o)
-        self.assertGraphContains(t_jit.graph_for(x, bias, scalar), FUSION_GUARD)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    def test_native_batch_norm_backward(self):
-        grad_output = torch.randn(4, 2, 3, device="cuda")
-        input = torch.randn(4, 2, 3, device="cuda")
-        weight = torch.randn(2, device="cuda")
-
-        r_m = torch.randn(2, device="cuda")
-        r_v = torch.randn(2, device="cuda").abs()
-
-        save_mean = torch.randn(2, device="cuda")
-        save_invstd = torch.randn(2, device="cuda").abs()
-
-        with nvfuser_singleton_fusion(True):
-            def t(grad_out, input, weight, r_m, r_v, save_mean, save_invstd, train: bool, eps: float, mask: List[bool]):
-                return torch.ops.aten.native_batch_norm_backward(grad_out, input, weight, r_m, r_v, save_mean,
-                                                                 save_invstd, train, eps, mask)
-
-            t_jit = torch.jit.script(t)
-            for i in range(4):
-                jit_o = t_jit(grad_output, input, weight, r_m.clone(), r_v.clone(),
-                              save_mean, save_invstd, True, 1e-5, [True, True, True])
-
-            ref_m = r_m.clone()
-            ref_v = r_v.clone()
-            jit_o = t_jit(grad_output, input, weight, r_m, r_v, save_mean, save_invstd, True, 1e-5, [True, True, True])
-            o = t(grad_output, input, weight, ref_m, ref_v, save_mean, save_invstd, True, 1e-5, [True, True, True])
-            for oo, jit_oo in zip(o, jit_o):
-                self.assertEqual(oo.dtype, jit_oo.dtype)
-                self.assertEqual(oo, jit_oo)
-            self.assertEqual(ref_m.dtype, r_m.dtype)
-            self.assertEqual(ref_m, r_m)
-            self.assertEqual(ref_v.dtype, r_v.dtype)
-            self.assertEqual(ref_v, r_v)
-            self.assertGraphContains(t_jit.graph_for(grad_output, input, weight, r_m.clone(), r_v.clone, save_mean,
-                                                     save_invstd, True, 1e-5, [True, True, True]), FUSION_GUARD)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_contiguous_on_broadcasted(self):
-        x = torch.randn(4, 1, device="cuda")
-        y = torch.randn(4, 128, device="cuda")
-
-        with nvfuser_singleton_fusion(True):
-            def t(x, y):
-                t1 = x.expand([4, 128])
-                t2 = t1 * y
-                return t2
-
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x, y)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_skip_parser(self):
-        x = torch.randn(4, 12, device="cuda")
-
-        with nvfuser_singleton_fusion(True):
-            def fn(x):
-                t1 = x + 1.0
-                return t1.relu()
-
-            fn_jit = torch.jit.script(fn)
-            self._run_helper(fn_jit, fn, x)
-
-            # add node should have been merged into fusion
-            self.assertGraphContains(fn_jit.graph_for(x), FUSION_GUARD)
-            self.assertGraphContainsExactly(fn_jit.graph_for(x), 'aten::add', 0)
-
-            # flips skip parse for `aten::add`, following fusion should skip the
-            # add node
-            self.assertFalse(torch._C._jit_set_nvfuser_skip_node_kind("aten::add", True))
-
-            def fn_1(x):
-                t1 = x + 2.0  # change const value so we'll not reuse plan
-                return t1.relu()
-
-            fn_1_jit = torch.jit.script(fn_1)
-            self._run_helper(fn_1_jit, fn_1, x)
-
-            # add node should have been merged into fusion
-            self.assertGraphContains(fn_1_jit.graph_for(x), FUSION_GUARD)
-            self.assertGraphContainsExactly(fn_1_jit.graph_for(x), 'aten::add', 1)
-
-            # flips skip parse for `aten::add`, next fusion should fuse add node
-            self.assertTrue(torch._C._jit_set_nvfuser_skip_node_kind("aten::add", True))
-
-            def fn_2(x):
-                t1 = x + 2.0  # change const value so we'll not reuse plan
-                return t1.relu()
-
-            fn_2_jit = torch.jit.script(fn_2)
-            self._run_helper(fn_2_jit, fn_2, x)
-
-            # add node should have been merged into fusion
-            self.assertGraphContains(fn_2_jit.graph_for(x), FUSION_GUARD)
-            self.assertGraphContainsExactly(fn_2_jit.graph_for(x), 'aten::add', 0)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_cuda_fusion_guard(self):
-        old_guard = torch._C._jit_set_nvfuser_guard_mode(True)
-
-        class ConvModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
-            def forward(self, x):
-                return x.sin().sigmoid()
-
-        mod = ConvModule().to(device="cuda")
-
-        inputs = [torch.randn(20, 16, 50, 100, device="cuda", requires_grad=True)]
-
-        def reduce_scalar(temp):
-            return temp.sum()
-
-        scripted = torch.jit.script(mod)
-        with torch.no_grad():
-            scripted(*inputs)
-        res = scripted(*inputs)
-        reduce_scalar(res).backward()
-        torch._C._jit_set_nvfuser_guard_mode(old_guard)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_nvfuser_comparison_callbacks_with_fallback(self):
-        try:
-            fused_result = None
-            unfused_result = None
-            graph_ir = None
-
-            def callback(fused_outputs, unfused_outputs, graph_str):
-                nonlocal unfused_result
-                nonlocal fused_result
-                nonlocal graph_ir
-                unfused_result = unfused_outputs[-1]
-                fused_result = fused_outputs[-1]
-                graph_ir = graph_str
-            torch._C._jit_nvfuser_set_comparison_callback(True, callback)
-
-            def fn(x, y):
-                z = torch.add(x, y)
-                return torch.relu(z)
-
-            x = torch.rand((4, 4)).cuda() - 0.5
-            y = torch.rand((4, 4)).cuda() - 0.5
-
-            fn_s = torch.jit.script(fn)
-            fn_s(x, y)
-            fn_s(x, y)
-            fn_s(x, y)
-
-            expected = fn(x, y)
-
-            self.assertEqual(expected, fused_result)
-            self.assertEqual(expected, unfused_result)
-            FileCheck().check("aten::add").run(graph_ir)
-        finally:
-            torch._C._jit_nvfuser_clear_comparison_callback()
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_nvfuser_comparison_callbacks_without_fallback(self):
-        try:
-            fused_result = None
-            unfused_result = None
-            graph_ir = None
-
-            def callback(fused_outputs, unfused_outputs, graph_str):
-                nonlocal unfused_result
-                nonlocal fused_result
-                nonlocal graph_ir
-                if len(unfused_outputs) > 0:
-                    unfused_result = unfused_outputs[-1]
-                fused_result = fused_outputs[-1]
-                graph_ir = graph_str
-            torch._C._jit_nvfuser_set_comparison_callback(False, callback)
-
-            def fn(x, y):
-                z = torch.add(x, y)
-                return torch.relu(z)
-
-            x = torch.rand((4, 4)).cuda() - 0.5
-            y = torch.rand((4, 4)).cuda() - 0.5
-
-            fn_s = torch.jit.script(fn)
-            fn_s(x, y)
-            fn_s(x, y)
-            fn_s(x, y)
-
-            expected = fn(x, y)
-
-            self.assertEqual(expected, fused_result)
-            self.assertEqual(None, unfused_result)
-            FileCheck().check("aten::add").run(graph_ir)
-        finally:
-            torch._C._jit_nvfuser_clear_comparison_callback()
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires NVFuser")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_cuda_fusion_guard_backward(self):
-        old_guard = torch._C._jit_set_nvfuser_guard_mode(True)
-
-        inp = torch.randn(10, device="cuda", requires_grad=True)
-        grad = torch.randn(10, device="cuda")
-
-        def f(x):
-            a = x.cos().cos()
-            return a
-        scripted = torch.jit.script(f)
-
-        with profile(activities=[ProfilerActivity.CPU]) as prof:
-            for _ in range(5):
-                inp.grad = None
-                out = scripted(inp)
-                out.backward(grad)
-
-        # check that we do not have fallback triggered
-        self.assertEqual(prof.events().table().find("fallback"), -1)
-        torch._C._jit_set_nvfuser_guard_mode(old_guard)
-
-    # TODO: generalize this
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
-    def test_inf_quick_patch(self):
-        inputs = [torch.tensor([-float('inf'), float('inf'), 4.0], device="cuda"),
-                  torch.tensor([1.0, float('inf'), 4.0], device="cuda"),
-                  torch.tensor([-float('inf'), -1.5, 4.0], device="cuda"),
-                  torch.tensor([1.0, -3.0, float('nan')], device="cuda"),
-                  torch.tensor([-float('inf'), -float('inf'), -float('inf')], device="cuda"),
-                  torch.tensor([float('inf'), float('inf'), float('inf')], device="cuda"),
-                  torch.tensor([float('nan'), float('nan'), float('nan')], device="cuda")]
-
-        def fn_amax(x):
-            return x.amax(dim=0)
-
-        def fn_amin(x):
-            return x.amin(dim=0)
-
-        def fn_add_nan(x):
-            return x.relu() + float('nan')
-
-        def fn_add(x):
-            return x + 1.0
-
-        with nvfuser_singleton_fusion(True):
-            for t in [fn_amax, fn_amin, fn_add, fn_add_nan]:
-                for x in inputs:
-                    t_jit = torch.jit.script(t)
-                    self._run_helper(t_jit, t, x)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_clamp_reversed_bound(self):
-        x = torch.tensor([1., -float('inf'), 2., float('inf'), float('nan')], device="cuda")
-
-        def t(x):
-            return x.clamp(min=1., max=0.5)
-
-        with nvfuser_singleton_fusion(True):
-            jit_t = torch.jit.script(t)
-            self._run_helper(jit_t, t, x)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_issue_1785(self):
-        class Fusion(torch.nn.Module):
-            def __init__(self):
-                super(Fusion, self).__init__()
-
-            def forward(self, x, a, b):
-                out = torch.mul(x.unsqueeze(-1), a)
-                out = out + b
-                return out
-
-        x = torch.randn(1024, 192, 3, device='cuda')
-        a = torch.randn(3, 128, device='cuda')
-        b = torch.randn(3, 128, device='cuda')
-
-        model = Fusion()
-        jit_model = torch.jit.script(model)
-
-        with torch.jit.fuser('fuser2'):
-            for _ in range(4):
-                out_ref = model(x, a, b)
-                out_jit = jit_model(x, a, b)
-
-        out_ref = model(x, a, b)
-        out_jit = jit_model(x, a, b)
-        self.assertTrue(self._compare("comparing output failed", out_ref, out_jit, 1e-5))
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_high_rank_fusion(self):
-        # currently we want to limit fusion to node with input where rank <= 8
-        rank_limit = 8
-        shapes = [4 for i in range(rank_limit + 1)]
-        x = torch.randn(shapes, device="cuda")
-
-        with nvfuser_singleton_fusion(True):
-            def t(x):
-                return x.relu()
-
-            jit_t = torch.jit.script(t)
-            for i in range(5):
-                jit_t(x)
-                self.assertGraphContainsExactly(jit_t.graph_for(x), FUSION_GUARD, 0)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_clamp(self):
-        x = torch.tensor([1., float('inf'), 2., float('nan'), float('-inf')], device="cuda")
-
-        def clamp_max(x):
-            return x.clamp(max=1.5)
-
-        def clamp_min_max(x):
-            return x.clamp(min=1.5)
-
-        def clamp_min(x):
-            return x.clamp(min=1., max=3.)
-
-        with nvfuser_singleton_fusion(True):
-            for t in [clamp_max, clamp_min, clamp_min_max]:
-                t_jit = torch.jit.script(t)
-                self._run_helper(t_jit, t, x)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_device_constant(self):
-        x = torch.randn(4, 2, device="cuda")
-
-        # cpu tensor shouldn't be fused
-        def t_cpu(x):
-            return torch.rand_like(x, device=torch.device(type='cpu'))
-
-        with nvfuser_singleton_fusion(True):
-            t_cpu_jit = torch.jit.script(t_cpu)
-            for _ in range(5):
-                t_cpu_jit(x)
-
-            self.assertGraphContainsExactly(t_cpu_jit.graph_for(x), FUSION_GUARD, 0)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_expand(self):
-        device = "cuda"
-        x = torch.randn(3, 5, device=device)
-        y = torch.randn(4, 2, 3, 5, device=device)
-
-        def t(x, y):
-            with torch.jit.strict_fusion():
-                x = x.relu()
-                o0 = x.expand(2, 3, 5)
-                o1 = x.expand_as(y)
-            return o0, o1
-
-        t_jit = torch.jit.script(t)
-        self._run_helper(t_jit, t, x, y, check_stride=True)
-
-        def t2(x, y):
-            o0 = x.expand(2, 3, 5)
-            o1 = x.expand_as(y)
-            x.add_(1)
-            return o0, o1
-
-        t2_jit = torch.jit.script(t2)
-        self._run_helper(t2_jit, t2, x, y, check_stride=True, num_fusion=0)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_scheduler_with_polymorphic_broadcast(self):
-        device = "cuda"
-        x0 = torch.randn(10, 128, device=device)
-        x1 = torch.rand_like(x0)
-        x2 = torch.randn(10, device=device)
-
-        def t(x0, x1, x2):
-            x3 = x2.unsqueeze(-1)
-            x4 = x3 + x0
-            x5 = x3 + x1
-            x6 = x5.sum(0)
-            return x4, x6
-
-        t_jit = torch.jit.script(t)
-        self._run_helper(t_jit, t, x0, x1, x2, check_stride=True)
-
-        x2 = torch.randn(128, device=device)
-
-        def t2(x0, x1, x2):
-            x3 = x2.unsqueeze(0)
-            x4 = x3 + x0
-            x5 = x3 + x1
-            x6 = x5.sum(1)
-            return x4, x6
-
-        t2_jit = torch.jit.script(t2)
-        self._run_helper(t2_jit, t2, x0, x1, x2, check_stride=True)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_type_inference(self):
-        device = "cuda"
-        x0 = torch.randn(10, 128, device=device)
-        x1 = torch.rand_like(x0)
-        x2 = torch.rand_like(x0)
-
-        def t(x0, x1, x2, flag : bool = True):
-            x3 = 2.0 * x0
-            x4 = 2.0 * x1
-            x5 = 2.0 * x2
-            if flag:
-                return torch.stack([x3, x4, x5], dim=-1)
-            # second code path doesn't run through profiling
-            # hence would utilize type inference with profiling information
-            return x0 + x1 + x2
-
-        t_jit = torch.jit.script(t)
-        self._run_helper(t_jit, t, x0, x1, x2, check_stride=True)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_disable_const_chunk_propagation_for_normalization(self):
-        device = "cuda"
-        x0 = torch.randn(10, 12, device=device)
-        x1 = torch.randn(10, 4, device=device)
-        w0 = torch.randn(12, device=device)
-        w1 = torch.randn(4, device=device)
-
-        def t(x, y, w0, w1):
-            ih = torch.layer_norm(x, (12,), w0)
-            i_r, i_z, i_n = ih.chunk(3, dim=1)
-            i_n = torch.layer_norm(i_n, (4,), w1)
-            r = torch.sigmoid(i_r)
-            n = torch.tanh(i_n + r * i_z)
-            h = n + r * y
-            return h
-
-        t_jit = torch.jit.script(t)
-        self._run_helper(t_jit, t, x0, x1, w0, w1, check_stride=True)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_no_tensor_input(self):
-        device = "cuda"
-        x = torch.randn(512, device=device)
-
-        def t(x):
-            tensor0 = torch.tensor(3, dtype=torch.float32, device='cuda')
-            tensor1 = torch.tensor(3, dtype=torch.float32, device='cuda')
-            o = torch.div(x.numel(), tensor0)
-            o = torch.mul(o, tensor1)
-            return o
-
-        t_jit = torch.jit.script(t)
-        self._run_helper(t_jit, t, x, check_stride=True)
-
-        # Note that curently TS embeds constant tensor in the graph
-        # this triggers memory leak check in CI
-        torch.jit._state._python_cu.drop_all_functions()
-
-
-class TestEnableDisableCudaFuser(JitTestCase):
-    def setUp(self):
-        super().setUp()
-        if RUN_NVFUSER:
-            self.is_enabled = torch._C._jit_set_nvfuser_enabled(False)
-
-    def tearDown(self):
-        if RUN_NVFUSER:
-            torch._C._jit_set_nvfuser_enabled(self.is_enabled)
-        super().tearDown()
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_context_manager_test(self):
-        x = torch.randn(4, 8, dtype=torch.float, device="cuda")
-        y = torch.randn(4, 8, dtype=torch.float, device="cuda")
-        with torch.jit.fuser('fuser2'):
-            with torch.jit.fuser('fuser2'):
-
-                def t1(x, y):
-                    o = x + y
-                    o = o + 2.0
-                    return o
-                t_jit = torch.jit.script(t1)
-                t_jit(x, y)
-                t_jit(x, y)
-                self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
-
-            def t2(x, y):
-                o = x + y
-                o = o + 3.0
-                return o
-            t_jit_2 = torch.jit.script(t2)
-            t_jit_2(x, y)
-            t_jit_2(x, y)
-            self.assertGraphContains(t_jit_2.graph_for(x, y), FUSION_GUARD)
-
-        def t3(x, y):
-            o = x + y
-            o = o + 4.0
-            return o
-        t_jit_3 = torch.jit.script(t3)
-        t_jit_3(x, y)
-        t_jit_3(x, y)
-        self.assertGraphContainsExactly(t_jit_3.graph_for(x, y), FUSION_GUARD, 0)
-
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    def test_register_fuser(self):
-        self.assertFalse(torch._C._jit_set_nvfuser_enabled(True))
-        self.assertTrue(torch._C._jit_nvfuser_enabled())
-        self.assertTrue(torch._C._jit_set_nvfuser_enabled(True))
-        self.assertTrue(torch._C._jit_nvfuser_enabled())
-        self.assertTrue(torch._C._jit_set_nvfuser_enabled(False))
-        self.assertFalse(torch._C._jit_nvfuser_enabled())
-
-    @unittest.skipIf(RUN_CUDA, "Testing on CPU only")
-    def test_register_fuser_cpu(self):
-        with self.assertRaises(RuntimeError):
-            torch._C._jit_set_nvfuser_enabled(True)
-            torch._C._jit_set_nvfuser_enabled(False)
-
-    @unittest.skipIf(not RUN_CUDA, "requires CUDA")
-    @unittest.skipIf(not TEST_WITH_ROCM, "ROCM test only")
-    def test_register_fuser_rocm(self):
-        with self.assertRaises(RuntimeError):
-            torch._C._jit_set_nvfuser_enabled(True)
-            torch._C._jit_set_nvfuser_enabled(False)
-
-    def test_can_be_enabled_nvfuser(self):
-        if TEST_WITH_ROCM:
-            expected = False
-        else:
-            expected = RUN_CUDA
-
-        self.assertEqual(expected, torch._C._jit_nvfuser_can_be_enabled())
-
-# See TestNNCOpInfoParent
-class TestCudaFuserOpInfoParent(JitCommonTestCase):
+try:
+    from _nvfuser.test_torchscript import *  # noqa: F403,F401
+except ImportError:
+    def run_tests():
+        return
     pass
 
-class TestCudaFuserOpInfo(TestCudaFuserOpInfoParent):
-    def setUp(self):
-        super(TestCudaFuserOpInfoParent, self).setUp()
-        if RUN_NVFUSER:
-            self.cuda_fuser_options = CudaFuserTestOptions()
-            # enables guard mode since tracing could change graph to violate guard.
-            torch._C._jit_set_nvfuser_guard_mode(True)
-        self.nvfuser_single_node_mode = torch._C._jit_set_nvfuser_single_node_mode(True)
-
-    def tearDown(self):
-        if RUN_NVFUSER:
-            self.cuda_fuser_options.restore()
-
-        torch._C._jit_set_nvfuser_single_node_mode(self.nvfuser_single_node_mode)
-
-        super(TestCudaFuserOpInfoParent, self).tearDown()
-
-    @slowTest
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @ops(op_db, dtypes=OpDTypes.supported)
-    def test_nvfuser_correctness(self, device, dtype, op):
-        if not op.supports_tracing:
-            self.skipTest("nvfuser requires tracing support")
-
-        variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op)
-
-        for variant, sample in variant_sample_pairs:
-            trace = create_traced_fn(self, variant, cache_traced_fn=True)
-            ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
-
-            trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
-
-            val = trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
-
-            self.assertEqual(ref, val, exact_layout=True)
-
-        # Note: Clearing CU after NVFuser tests
-        # https://github.com/pytorch/pytorch/issues/35600
-        # each torch.jit.trace adds state to the _python_cu compilation unit
-        # since this test traces a lot of functions, out-of-memory can occur
-        # if the CU is not cleared.
-        torch.jit._state._python_cu.drop_all_functions()
-
-    @skipIfRocm
-    @slowTest
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    @ops(op_db, allowed_dtypes=(torch.float16, torch.bfloat16, torch.float32,
-                                torch.float64, torch.complex64, torch.complex128))
-    def test_nvfuser_extremal_values(self, device, dtype, op):
-        if not op.supports_tracing:
-            self.skipTest("nvfuser requires tracing support")
-
-        variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op)
-
-        def _get_extremal_tensor(x, val, dtype):
-            if x.dtype != dtype:
-                return x
-            return torch.full_like(x, val)
-
-        def _get_extremal_input(x, val, dtype):
-            if isinstance(x, torch.Tensor):
-                return _get_extremal_tensor(x, val, dtype)
-            elif is_iterable_of_tensors(x):
-                return [_get_extremal_tensor(y, val, dtype) for y in x]
-            return x
-
-        def _get_extremal_sample(sample: SampleInput, val, dtype):
-            extremal_sample = SampleInput(
-                input=_get_extremal_input(sample.input, val, dtype),
-                args=tuple(_get_extremal_input(x, val, dtype) for x in sample.args),
-                kwargs={k: _get_extremal_input(v, val, dtype) for k, v in sample.kwargs.items()},
-            )
-            return extremal_sample
-
-        def _get_extremal_samples(sample: SampleInput, dtype):
-            vals = [float('inf'), float('-inf'), float('nan')]
-            if dtype.is_complex:
-                complex_vals = itertools.product(vals, vals)
-                vals = tuple(map(lambda x: complex(*x), complex_vals))
-            for val in vals:
-                yield _get_extremal_sample(sample, val, dtype)
-
-        variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op)
-
-        for variant, sample in variant_sample_pairs:
-
-            trace = create_traced_fn(self, variant, cache_traced_fn=True)
-            trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
-            trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
-
-            for extremal_sample in _get_extremal_samples(sample, dtype):
-                try:
-                    with freeze_rng_state():
-                        ref = variant(*clone_inputs((extremal_sample.input, *extremal_sample.args)),
-                                      **extremal_sample.kwargs)
-                except (torch._C._LinAlgError, RuntimeError, ValueError):
-                    # if eager errors out, then don't expect NVFuser to pass
-                    continue
-
-                with freeze_rng_state():
-                    val = trace(*clone_inputs((extremal_sample.input, *extremal_sample.args)),
-                                **extremal_sample.kwargs)
-
-                self.assertEqual(val, ref, equal_nan=True, exact_device=True)
-
-            # See [Note: Clearing CU after NVFuser tests]
-            torch.jit._state._python_cu.drop_all_functions()
-
-instantiate_device_type_tests(TestCudaFuserOpInfo, globals(), only_for=("cuda"))
-
-
 if __name__ == '__main__':
     run_tests()
diff --git a/test/test_jit_disabled.py b/test/test_jit_disabled.py
index 72d4146016d4..6bb694bc794a 100644
--- a/test/test_jit_disabled.py
+++ b/test/test_jit_disabled.py
@@ -46,9 +46,10 @@ def compare_enabled_disabled(self, src):
     def test_attribute(self):
         _program_string = """
 import torch
+
 class Foo(torch.jit.ScriptModule):
     def __init__(self, x):
-        super(Foo, self).__init__()
+        super().__init__()
         self.x = torch.jit.Attribute(x, torch.Tensor)
 
     def forward(self, input):
@@ -64,8 +65,6 @@ def test_script_module_construction(self):
 import torch
 
 class AModule(torch.jit.ScriptModule):
-    def __init__(self):
-        super(AModule, self).__init__()
     @torch.jit.script_method
     def forward(self, input):
         pass
@@ -80,9 +79,6 @@ def test_recursive_script(self):
 import torch
 
 class AModule(torch.nn.Module):
-    def __init__(self):
-        super(AModule, self).__init__()
-
     def forward(self, input):
         pass
 
diff --git a/test/test_jit_fuser.py b/test/test_jit_fuser.py
index 2ea4c81db4e1..ef3843dc01c4 100644
--- a/test/test_jit_fuser.py
+++ b/test/test_jit_fuser.py
@@ -27,7 +27,7 @@
 
 
 def strip_profiling_nodes(nodes):
-    profiling_opcodes = set(['prim::BailoutTemplate', 'prim::BailOut'])
+    profiling_opcodes = {'prim::BailoutTemplate', 'prim::BailOut'}
     return [n for n in nodes if n.kind() not in profiling_opcodes]
 
 
@@ -75,7 +75,7 @@ def test_abs_cpu_unicode_temp_dir(self):
             shell_env = os.environ.copy()
             shell_env['TMP'] = dname
             cmd = [sys.executable, os.path.basename(__file__), type(self).__name__ + '.test_abs_cpu']
-            legacy_jit_flag = '--jit_executor=legacy'
+            legacy_jit_flag = '--jit-executor=legacy'
             for v in sys.argv:
                 if v == legacy_jit_flag:
                     cmd.append(legacy_jit_flag)
@@ -512,7 +512,7 @@ def test_exp_cuda(self):
     def test_fuse_decompose_normalization(self):
         class ResLike(torch.jit.ScriptModule):
             def __init__(self, norm_module):
-                super(ResLike, self).__init__()
+                super().__init__()
                 self.nm = norm_module
 
             @torch.jit.script_method
@@ -823,7 +823,7 @@ class M(torch.jit.ScriptModule):
             __constants__ = ['d']
 
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.d = torch.device('cuda')
 
             @torch.jit.script_method
diff --git a/test/test_jit_fuser_legacy.py b/test/test_jit_fuser_legacy.py
index 5fb012ad4037..3bd8c9497ce0 100644
--- a/test/test_jit_fuser_legacy.py
+++ b/test/test_jit_fuser_legacy.py
@@ -1,7 +1,7 @@
 # Owner(s): ["oncall: jit"]
 
 import sys
-sys.argv.append("--jit_executor=legacy")
+sys.argv.append("--jit-executor=legacy")
 from test_jit_fuser import *  # noqa: F403
 
 if __name__ == '__main__':
diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index 9b1e30f27a7e..b00588ee20c3 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -46,7 +46,7 @@
 autograd_check_set = {'aten::__is__', 'prim::AutogradAllNonZero', 'prim::AutogradAllZero', 'prim::ListConstruct'}
 
 def strip_profiling_nodes(nodes):
-    profiling_opcodes = set(['prim::BailoutTemplate', 'prim::BailOut'])
+    profiling_opcodes = {'prim::BailoutTemplate', 'prim::BailOut'}
     return [n for n in nodes if n.kind() not in profiling_opcodes]
 
 def warmup_forward(f, *args, profiling_count=2):
@@ -80,6 +80,8 @@ def inline_fusion_groups():
     finally:
         torch._C._debug_set_fusion_group_inlining(old_inlining)
 
+
+@skipIfTorchDynamo()
 class TestTEFuser(JitTestCase):
     def setUp(self):
         super().setUp()
@@ -189,7 +191,7 @@ def func(x):
             return x2.sum()
 
         with texpr_reductions_enabled():
-            a = torch.tensor(list(x for x in range(0, 15)), dtype=torch.float, device='cpu')
+            a = torch.tensor(list(range(0, 15)), dtype=torch.float, device='cpu')
             a = a.reshape(5, 3)
             scripted = self.checkScript(func, (a,))
             self.assertLastGraphAllFused()
@@ -205,7 +207,7 @@ def func_neg(x):
             return x.sum((-2, )) * 2
 
         with texpr_reductions_enabled():
-            a = torch.tensor(list(x for x in range(0, 15)), dtype=torch.float, device='cpu')
+            a = torch.tensor(list(range(0, 15)), dtype=torch.float, device='cpu')
             a = a.reshape(5, 3)
             scripted = self.checkScript(func, (a,))
             self.assertLastGraphAllFused()
@@ -217,7 +219,7 @@ def func(x):
             return x.sum((0, ), keepdim=True, dtype=torch.double) * 2
 
         with texpr_reductions_enabled():
-            a = torch.tensor(list(x for x in range(0, 15)), dtype=torch.float, device='cpu')
+            a = torch.tensor(list(range(0, 15)), dtype=torch.float, device='cpu')
             a = a.reshape(5, 3)
 
             self.checkScript(func, (a,))
@@ -969,7 +971,7 @@ class M(torch.jit.ScriptModule):
             __constants__ = ['d']
 
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.d = torch.device('cuda')
 
             @torch.jit.script_method
@@ -1236,7 +1238,7 @@ def foo(x):
 
         class MyMod(torch.nn.Module):
             def __init__(self, dtype):
-                super(MyMod, self).__init__()
+                super().__init__()
                 self.dtype = dtype
 
             def forward(self, x):
@@ -2622,6 +2624,7 @@ def get_name(op):
 # super() [with no arguments] fails, presumably because of how instantiate_device_type_tests works.
 # super(TestNNCOpInfo, self) fails because TestNNCOpInfo gets deleted from global scope.
 # super(JitCommonTestCase, self).fn() would skip JitCommonTestCase.fn() implementation
+@skipIfTorchDynamo()
 class TestNNCOpInfoParent(JitCommonTestCase):
     pass
 
@@ -2739,6 +2742,7 @@ def test_nnc_correctness(self, device, dtype, op):
 instantiate_device_type_tests(TestNNCOpInfo, globals(), only_for=only_for)
 
 # Purpose of this class is to allow super() calls. (See TestNNCOpInfoParent)
+@skipIfTorchDynamo()
 class TestLoopnestRandomizationParent(JitTestCase):
     pass
 
diff --git a/test/test_jit_legacy.py b/test/test_jit_legacy.py
index e424f46ba896..5576f1645349 100644
--- a/test/test_jit_legacy.py
+++ b/test/test_jit_legacy.py
@@ -1,7 +1,7 @@
 # Owner(s): ["oncall: jit"]
 
 import sys
-sys.argv.append("--jit_executor=legacy")
+sys.argv.append("--jit-executor=legacy")
 from test_jit import *  # noqa: F403
 
 if __name__ == '__main__':
diff --git a/test/test_jit_llga_fuser.py b/test/test_jit_llga_fuser.py
index 12bd955043b9..16e1bc49701f 100644
--- a/test/test_jit_llga_fuser.py
+++ b/test/test_jit_llga_fuser.py
@@ -174,7 +174,7 @@ def test_bn2d(self, dtype):
     def test_eltwise(self, dtype):
         class M(nn.Module):
             def __init__(self, eltwise_fn):
-                super(M, self).__init__()
+                super().__init__()
                 self.eltwise = eltwise_fn
 
             def forward(self, x):
@@ -234,9 +234,6 @@ def test_avg_pool2d(self, dtype):
     @dtypes(torch.float32, torch.bfloat16)
     def test_variable_kernel_avg_pool2d(self, dtype):
         class M(nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, x):
                 x = F.avg_pool2d(x, kernel_size=(x.size(2), x.size(3)), padding=0, count_include_pad=False)
                 return x
@@ -387,7 +384,7 @@ class TestFusionPattern(JitLlgaTestCase):
     def test_conv2d_eltwise(self, dtype):
         class M(nn.Module):
             def __init__(self, eltwise_fn):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
                 self.conv2 = nn.Conv2d(32, 32, 3, padding=1, bias=False)
                 self.eltwise = eltwise_fn
@@ -419,7 +416,7 @@ def forward(self, x):
     def test_conv2d_silu(self, dtype):
         class M(nn.Module):
             def __init__(self, inplace):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
                 self.conv2 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
                 self.eltwise = nn.SiLU(inplace=inplace)
@@ -451,7 +448,7 @@ def forward(self, x):
     def test_ensure_tensor_is_rewrapped(self, dtype):
         class M(nn.Module):
             def __init__(self, eltwise_fn):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
                 self.conv2 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
                 self.conv3 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
@@ -490,7 +487,7 @@ def forward(self, x, y):
     def test_conv2d_clamp(self, dtype):
         class M(nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
                 self.conv2 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
                 self.conv3 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
@@ -523,7 +520,7 @@ def forward(self, x):
     def test_conv2d_bn(self, dtype):
         class M(nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
                 self.bn1 = nn.BatchNorm2d(32)
 
@@ -545,7 +542,7 @@ def forward(self, x):
     def test_conv2d_bn_relu(self, dtype):
         class M(nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
                 self.bn1 = nn.BatchNorm2d(32)
 
@@ -569,7 +566,7 @@ def forward(self, x):
     def test_bn2d_eltwise(self, dtype):
         class M(nn.Module):
             def __init__(self, eltwise_fn):
-                super(M, self).__init__()
+                super().__init__()
                 self.eltwise = eltwise_fn
                 self.bn = nn.BatchNorm2d(32)
 
@@ -591,7 +588,7 @@ def forward(self, x):
     def test_linear_eltwise(self, dtype):
         class M(nn.Module):
             def __init__(self, eltwise_fn, bias):
-                super(M, self).__init__()
+                super().__init__()
                 self.linear = nn.Linear(28, 64, bias)
                 self.eltwise = eltwise_fn
 
@@ -616,7 +613,7 @@ def forward(self, x):
     def test_conv2d_sum(self, dtype):
         class M(nn.Module):
             def __init__(self, bias=False):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=bias)
                 self.bn1 = nn.BatchNorm2d(32)
                 self.conv2 = nn.Conv2d(32, 32, 3, padding=1, bias=bias)
@@ -649,7 +646,7 @@ def forward(self, x, y):
     def test_wildcard(self, dtype):
         class M(nn.Module):
             def __init__(self):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
                 self.eltwise = nn.ReLU()
 
@@ -678,9 +675,6 @@ def forward(self, x):
     @dtypes(torch.int32)
     def test_wildcard_unsupported_dtype(self, dtype):
         class M(nn.Module):
-            def __init__(self):
-                super(M, self).__init__()
-
             def forward(self, x):
                 y = x // 2
                 return y
@@ -703,7 +697,7 @@ def forward(self, x):
     def test_rewrap_tensor_input_to_pytorch(self, dtype):
         class M(nn.Module):
             def __init__(self, eltwise_fn):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
                 self.conv2 = nn.Conv2d(32, 32, 3, padding=1, bias=True)
                 self.eltwise = eltwise_fn
diff --git a/test/test_jit_profiling.py b/test/test_jit_profiling.py
index fe17be9e0e3a..22fe6994831e 100644
--- a/test/test_jit_profiling.py
+++ b/test/test_jit_profiling.py
@@ -1,7 +1,7 @@
 # Owner(s): ["oncall: jit"]
 
 import sys
-sys.argv.append("--jit_executor=profiling")
+sys.argv.append("--jit-executor=profiling")
 from test_jit import *  # noqa: F403
 
 if __name__ == '__main__':
diff --git a/test/test_jit_simple.py b/test/test_jit_simple.py
index 499c6b6f8aaf..7c734434dfba 100644
--- a/test/test_jit_simple.py
+++ b/test/test_jit_simple.py
@@ -1,7 +1,7 @@
 # Owner(s): ["oncall: jit"]
 
 import sys
-sys.argv.append("--jit_executor=simple")
+sys.argv.append("--jit-executor=simple")
 from test_jit import *  # noqa: F403
 
 if __name__ == '__main__':
diff --git a/test/test_legacy_vmap.py b/test/test_legacy_vmap.py
index adc2d4bf0af0..56d6e0509577 100644
--- a/test/test_legacy_vmap.py
+++ b/test/test_legacy_vmap.py
@@ -8,8 +8,7 @@
 import functools
 import itertools
 import warnings
-from torch.testing._internal.common_device_type import instantiate_device_type_tests, \
-    skipCUDAIfNoMagma
+from torch.testing._internal.common_device_type import instantiate_device_type_tests
 import types
 
 
@@ -1871,6 +1870,7 @@ def test_sum_dim(self):
 
         # Single vmap, various in_dims / out_dims
         test(lambda x: x.sum(()), [torch.randn([B0])])
+        test(lambda x: x.sum(()), [torch.randn([B0, 2])])
         test(lambda x: x.sum(0), [torch.randn([B0])])
         test(lambda x: x.sum(-1), [torch.randn([B0])])
         test(lambda x: x.sum(0), [torch.randn([B0, 3])])
@@ -2414,16 +2414,6 @@ def test_trace(self, device):
         x = torch.randn(2, 3, device=device, requires_grad=True)
         self._batched_grad_test(Tensor.trace, (x,))
 
-    @skipCUDAIfNoMagma
-    @allowVmapFallbackUsage
-    def test_symeig(self, device):
-        def op(x):
-            return torch.symeig(x, eigenvectors=True)[0]
-
-        x = torch.randn(3, 3, device=device, requires_grad=True)
-        self._batched_grad_test(op, (x,), {})
-        self._batched_grad_grad_test(op, (x,), {})
-
     def test_threshold(self, device):
         x = torch.randn(2, 3, device=device, requires_grad=True)
         self._batched_grad_test(lambda x: F.threshold(x, 0.5, 0.0), (x,))
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 9034df7d5f73..d1e1e76762d3 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -18,18 +18,18 @@
     (TestCase, run_tests, TEST_SCIPY, IS_MACOS, IS_WINDOWS, slowTest,
      TEST_WITH_ASAN, TEST_WITH_ROCM, IS_FBCODE, IS_REMOTE_GPU, iter_indices,
      make_fullrank_matrices_with_distinct_singular_values,
-     freeze_rng_state, IS_ARM64, IS_SANDCASTLE, TEST_OPT_EINSUM)
+     freeze_rng_state, IS_ARM64, IS_SANDCASTLE, TEST_OPT_EINSUM, parametrize)
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, dtypes, has_cusolver,
      onlyCPU, skipCUDAIf, skipCUDAIfNoMagma, skipCPUIfNoLapack, precisionOverride,
      skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfRocm, onlyNativeDeviceTypes, dtypesIfCUDA,
-     onlyCUDA, skipCUDAVersionIn, skipMeta, skipCUDAIfNoCusolver, dtypesIfMPS)
+     onlyCUDA, skipCUDAVersionIn, skipMeta, skipCUDAIfNoCusolver, dtypesIfMPS, largeTensorTest)
 from torch.testing import make_tensor
 from torch.testing._internal.common_dtype import (
     all_types, all_types_and_complex_and, floating_and_complex_types, integral_types,
     floating_and_complex_types_and, floating_types_and, complex_types,
 )
-from torch.testing._internal.common_cuda import SM53OrLater, tf32_on_and_off, CUDA11OrLater, CUDA9, _get_magma_version, \
+from torch.testing._internal.common_cuda import SM53OrLater, tf32_on_and_off, _get_magma_version, \
     _get_torch_cuda_version
 from torch.distributions.binomial import Binomial
 import torch.backends.opt_einsum as opt_einsum
@@ -161,6 +161,13 @@ def test_eig_removed_error(self, device):
         with self.assertRaisesRegex(RuntimeError, "This function was deprecated since version 1.9 and is now removed"):
             a.eig()
 
+    def test_symeig_removed_error(self, device):
+        a = make_tensor(5, 5, device=device, dtype=torch.float32)
+        with self.assertRaisesRegex(RuntimeError, "This function was deprecated since version 1.9 and is now removed"):
+            torch.symeig(a)
+        with self.assertRaisesRegex(RuntimeError, "This function was deprecated since version 1.9 and is now removed"):
+            a.symeig()
+
     def test_lstsq_removed_error(self, device):
         a = make_tensor(5, 5, device=device, dtype=torch.float32)
         with self.assertRaisesRegex(RuntimeError, "This function was deprecated since version 1.9 and is now removed"):
@@ -1177,6 +1184,23 @@ def run_test_case(input_size, ord, keepdim, to_dtype):
                     continue
             run_test_case((S, S) , ord, keepdim, norm_dtype)
 
+    # This test confirms torch.linalg.norm bfloat16 and half get right result.
+    @dtypes(torch.bfloat16, torch.float16)
+    def test_norm_bfloat16_and_half(self, device, dtype):
+        make_arg = partial(make_tensor, dtype=dtype, device=device)
+
+        def run_test_case(input_size, ord, keepdim):
+            msg = (
+                f'input_size={input_size}, ord={ord}, keepdim={keepdim}, '
+                f'dtype={dtype}')
+            input = make_arg(input_size).fill_(1)
+            result_ref = torch.linalg.norm(input.float(), ord, keepdim=keepdim).to(dtype=dtype)
+            result = torch.linalg.norm(input, ord, keepdim=keepdim)
+            self.assertEqual(result_ref, result, msg=msg)
+
+        ord_vector = [0, 1, -1, 2, -2, 3, -3, 4.5, -4.5, inf, -inf, None]
+        for S, ord, keepdim in product((10, 2049), ord_vector, (True, False)):
+            run_test_case((S,) , ord, keepdim, )
 
     @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble, torch.bfloat16, torch.float16)
     def test_vector_norm(self, device, dtype):
@@ -2446,18 +2470,18 @@ def test_invariance_error_spectral_decompositions(self, device, dtype):
         A = make_arg((3, 3))
         with self.assertRaisesRegex(RuntimeError, "ill-defined"):
             U, _, Vh = torch.linalg.svd(A, full_matrices=False)
-            (U + Vh).sum().backward()
+            (U + Vh).sum().abs().backward()
 
         A = make_arg((3, 3))
         with self.assertRaisesRegex(RuntimeError, "ill-defined"):
             V = torch.linalg.eig(A).eigenvectors
-            V.sum().backward()
+            V.sum().abs().backward()
 
         A = make_arg((3, 3))
         A = A + A.mH
         with self.assertRaisesRegex(RuntimeError, "ill-defined"):
             Q = torch.linalg.eigh(A).eigenvectors
-            Q.sum().backward()
+            Q.sum().abs().backward()
 
     @skipCUDAIfNoCusolver  # MAGMA backend doesn't work in this case
     @skipCUDAIfRocm
@@ -4344,6 +4368,26 @@ def test_matmul_small_brute_force_3d_Nd(self, device, dtype):
             y = make_arg(size_y, noncontiguous=nctg_y)
             self.check_single_matmul(x, y)
 
+    # 4GB should do, but we run tests in parallel in CI, so let's be generous
+    @largeTensorTest('16GB', device='cuda')
+    def test_large_bmm_mm_backward(self, device):
+        A = torch.randn([1024, 2, 1024], device="cuda").mT.contiguous().mT
+        B = torch.randn([1024, 65536], device="cuda", requires_grad=True)
+        G = torch.randn([1024, 2, 65536], device="cuda")
+
+        # Should not create an intermediary tensor of size [1024, 1024, 65536] (256GB of memory) and OOM
+        (A @ B).backward(G)
+
+    # 4GB should do, but we run tests in parallel in CI, so let's be generous
+    @largeTensorTest('16GB', device='cuda')
+    def test_large_bmm_backward(self, device):
+        A = torch.randn([1024, 2, 1024], device="cuda").mT.contiguous().mT
+        B = torch.randn([1, 1024, 65536], device="cuda", requires_grad=True)
+        G = torch.randn([1024, 2, 65536], device="cuda")
+
+        # Should not create an intermediary tensor of size [1024, 1024, 65536] (256GB of memory) and OOM
+        (A @ B).backward(G)
+
     def test_linear_algebra_scalar_raises(self, device) -> None:
         m = torch.randn(5, 5, device=device)
         v = torch.randn(5, device=device)
@@ -4570,8 +4614,8 @@ def call_torch_fn(*args, **kwargs):
     @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6,
                         torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8})
     @dtypesIfCUDA(*floating_and_complex_types_and(
-                  *[torch.half] if not CUDA9 else [],
-                  *[torch.bfloat16] if CUDA11OrLater and SM53OrLater else []
+                  torch.half,
+                  *[torch.bfloat16] if SM53OrLater else []
                   ))
     @dtypes(*all_types_and_complex_and(torch.bfloat16))
     def test_corner_cases_of_cublasltmatmul(self, device, dtype):
@@ -4597,8 +4641,8 @@ def test_corner_cases_of_cublasltmatmul(self, device, dtype):
         torch.nn.functional.linear(m1, m2, M)
 
     @dtypesIfCUDA(*floating_and_complex_types_and(
-                  *[torch.half] if not CUDA9 else [],
-                  *[torch.bfloat16] if CUDA11OrLater and SM53OrLater else []
+                  torch.half,
+                  *[torch.bfloat16] if SM53OrLater else []
                   ))
     @dtypes(*all_types_and_complex_and(torch.bfloat16))
     def test_blas_alpha_beta_empty(self, device, dtype):
@@ -5095,7 +5139,7 @@ def lobpcg(*args, **kwargs):
                 self.assertEqual(E.shape, batches + (k,))
                 self.assertEqual(V.shape, batches + (m, k))
                 self.assertEqual(matmul(A, V), mm(V, E.diag_embed()), atol=prec, rtol=0)
-                e = torch.symeig(A)[0]
+                e = torch.linalg.eigvalsh(A)
                 e_smallest = e[..., :k]
                 self.assertEqual(E, e_smallest)
 
@@ -5365,8 +5409,8 @@ def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out=
     @precisionOverride({torch.bfloat16: 1e-0, torch.half: 5e-4, torch.float: 1e-4, torch.double: 1e-8,
                         torch.cfloat: 1e-4, torch.cdouble: 1e-8})
     @dtypesIfCUDA(*floating_and_complex_types_and(
-                  *[torch.bfloat16] if TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) else [],
-                  *[torch.half]))
+                  *[torch.bfloat16] if TEST_WITH_ROCM or SM53OrLater else [],
+                  torch.half))
     @dtypes(torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble)
     def test_addmv(self, device, dtype):
         # have to use torch.randn(...).to(bfloat16) instead of
@@ -5401,8 +5445,8 @@ def test_addmv(self, device, dtype):
         for m, v in itertools.product(ms, vs):
             self._test_addmm_addmv(torch.addmv, t, m, v, beta=0)
 
-    @dtypesIfCUDA(*floating_types_and(*[torch.bfloat16] if TEST_WITH_ROCM or (CUDA11OrLater and
-                  SM53OrLater) else []))
+    @dtypesIfCUDA(*floating_types_and(*[torch.bfloat16] if TEST_WITH_ROCM or
+                  SM53OrLater else []))
     @dtypes(torch.float, torch.double)
     def test_addmv_rowmajor_colmajor_incx_incy_lda(self, device, dtype):
         # tests (o, s)*(s).  o is output size, s is summed size.
@@ -5465,7 +5509,7 @@ def maybe_transpose(cond, m):
                         torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8})
     @dtypesIfMPS(torch.float32)
     @dtypesIfCUDA(*floating_and_complex_types_and(
-                  *[torch.bfloat16] if TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) else []))
+                  *[torch.bfloat16] if TEST_WITH_ROCM or SM53OrLater else []))
     @dtypes(*floating_and_complex_types_and(torch.bfloat16))
     @tf32_on_and_off(0.05)
     def test_addmm(self, device, dtype):
@@ -5474,7 +5518,7 @@ def test_addmm(self, device, dtype):
     @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6,
                         torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8})
     @dtypesIfCUDA(*floating_types_and(
-                  *[torch.bfloat16] if TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) else []))
+                  *[torch.bfloat16] if TEST_WITH_ROCM or SM53OrLater else []))
     @dtypes(*floating_types_and(torch.bfloat16))
     @tf32_on_and_off(0.05)
     def test_addmm_activation(self, device, dtype):
@@ -5532,6 +5576,122 @@ def test_matmul_45724(self, device):
         torch.matmul(a, b, out=c)
         self.assertEqual(c, cpu_result)
 
+    @unittest.skipIf(IS_WINDOWS, "Skipped on Windows!")
+    @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "cublas runtime error")
+    @onlyCUDA
+    @parametrize("k", [16, 32])
+    @parametrize("n", [16, 32])
+    @parametrize("use_transpose_a", [True, False])
+    @parametrize("use_transpose_b", [True, False])
+    def test__int_mm(self, device, k, n, use_transpose_a, use_transpose_b):
+        if TEST_WITH_ROCM:
+            self.skipTest("_int_mm not compiled for ROCM")
+
+        def genf_int_float(x, y, use_transpose):
+            if use_transpose:
+                x, y = y, x
+            x_int8 = torch.randint(-10, 10, (x, y), dtype=torch.int8, device=device)
+            x_float = x_int8.to(torch.float32)
+            if use_transpose:
+                return x_int8.t(), x_float.t()
+            return x_int8, x_float
+
+        def _test(m, k, n, transpose_a, transpose_b, test_equal=True):
+            a_int8, a_float = genf_int_float(m, k, transpose_a)
+            b_int8, b_float = genf_int_float(k, n, transpose_b)
+            c_int32 = torch._int_mm(a_int8, b_int8)
+            self.assertTrue(c_int32.dtype is torch.int32)
+            self.assertEqual(c_int32.device, torch.device(device))
+            if test_equal:
+                self.assertEqual(c_int32.float(), torch.mm(a_float, b_float))
+            else:
+                self.assertNotEqual(c_int32.float(), torch.mm(a_float, b_float))
+            c_int32_result = c_int32.new_empty(c_int32.size())
+            # Checking out variant
+            torch._int_mm(a_int8, b_int8, out=c_int32_result)
+            if test_equal:
+                self.assertEqual(c_int32_result.float(), torch.mm(a_float, b_float))
+            else:
+                self.assertNotEqual(c_int32_result.float(), torch.mm(a_float, b_float))
+
+        # NOTE: We're just exercising terrible failures here.
+        version = _get_torch_cuda_version()
+        SM86OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 6)
+        if version == (11, 7):
+            if not use_transpose_a and use_transpose_b:
+                if SM86OrLater:
+                    _test(17, k, n, use_transpose_a, use_transpose_b, False)
+                else:
+                    with self.assertRaisesRegex(RuntimeError,
+                                                "CUDA error: CUBLAS_STATUS_NOT_SUPPORTED when calling cublasLtMatmul"):
+                        _test(17, k, n, use_transpose_a, use_transpose_b, False)
+
+            if use_transpose_a and not use_transpose_b:
+                with self.assertRaisesRegex(RuntimeError,
+                                            "CUDA error: CUBLAS_STATUS_NOT_SUPPORTED when calling cublasLtMatmul"):
+                    _test(17, k, n, use_transpose_a, use_transpose_b)
+
+            if use_transpose_a and use_transpose_b:
+                with self.assertRaisesRegex(RuntimeError,
+                                            "CUDA error: CUBLAS_STATUS_NOT_SUPPORTED when calling cublasLtMatmul"):
+                    _test(17, k, n, use_transpose_a, use_transpose_b)
+
+            if not use_transpose_a and not use_transpose_b:
+                if SM86OrLater:
+                    _test(17, k, n, use_transpose_a, use_transpose_b)
+                else:
+                    with self.assertRaisesRegex(RuntimeError,
+                                                "CUDA error: CUBLAS_STATUS_NOT_SUPPORTED when calling cublasLtMatmul"):
+                        _test(17, k, n, use_transpose_a, use_transpose_b)
+        else:
+            with self.assertRaisesRegex(RuntimeError, "_int_mm_out_cuda not compiled for CUDA"):
+                _test(17, k, n, use_transpose_a, use_transpose_b, False)
+
+    @unittest.skipIf(IS_WINDOWS, "Skipped on Windows!")
+    @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "cublas runtime error")
+    @onlyCUDA
+    def test__int_mm_errors(self, device):
+        if TEST_WITH_ROCM:
+            self.skipTest("_int_mm not compiled for ROCM")
+
+        version = _get_torch_cuda_version()
+        if version != (11, 7):
+            self.skipTest("_int_mm only compiled for CUDA 11.7")
+
+        def genf_int(x, y):
+            return torch.empty((x, y), dtype=torch.int8, device=device)
+
+        def _gen_pair(m, k, n):
+            return genf_int(m, k), genf_int(k, n)
+
+        self.assertRaisesRegex(RuntimeError,
+                               r"self.size\(0\) needs to be greater than 16, but got 16",
+                               lambda: torch._int_mm(*_gen_pair(16, 8, 32)))
+        self.assertRaisesRegex(RuntimeError,
+                               r"self.size\(1\) needs to be greater than 0 and a multiple of 8, but got 7",
+                               lambda: torch._int_mm(*_gen_pair(17, 7, 32)))
+        self.assertRaisesRegex(RuntimeError,
+                               r"self.size\(1\) needs to match mat2.size\(0\) but got 8 and 7",
+                               lambda: torch._int_mm(genf_int(17, 8), genf_int(7, 32)))
+        self.assertRaisesRegex(RuntimeError,
+                               r"mat2.size\(1\) needs to be greater than 0 and a multiple of 8, but got 31",
+                               lambda: torch._int_mm(*_gen_pair(17, 8, 31)))
+        self.assertRaisesRegex(RuntimeError,
+                               r"expected scalar type Char but found Float",
+                               lambda: torch._int_mm(genf_int(17, 8).float(), genf_int(8, 32)))
+        self.assertRaisesRegex(RuntimeError,
+                               r"expected scalar type Char but found Float",
+                               lambda: torch._int_mm(genf_int(17, 8), genf_int(8, 32).float()))
+        self.assertRaisesRegex(RuntimeError,
+                               r"Expected result dtype to be of type kInt but got float",
+                               lambda: torch._int_mm(genf_int(17, 8), genf_int(8, 32), out=genf_int(16, 32).float()))
+        self.assertRaisesRegex(RuntimeError,
+                               r"Expected result.size\(0\) to be 17 but got 15",
+                               lambda: torch._int_mm(genf_int(17, 8), genf_int(8, 32), out=genf_int(15, 32).int()))
+        self.assertRaisesRegex(RuntimeError,
+                               r"Expected result.size\(0\) to be 17 but got 16",
+                               lambda: torch._int_mm(genf_int(17, 8), genf_int(8, 32), out=genf_int(16, 31).int()))
+
     @slowTest
     @onlyNativeDeviceTypes
     # bfloat16 doesn't have sufficient precision to pass this test
@@ -5682,7 +5842,7 @@ def test_strided_mm_bmm(self, device, dtype):
     @dtypes(*floating_and_complex_types_and(torch.bfloat16))
     @tf32_on_and_off(0.05)
     def test_bmm(self, device, dtype):
-        if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater:
+        if self.device_type == 'cuda' and dtype is torch.bfloat16 and not SM53OrLater:
             # cuBLAS does not guarantee BFloat16 support on SM < 53.
             # So on PyTorch, we consider BFloat16 support on SM < 53 as
             # undefined bahavior
@@ -5694,7 +5854,7 @@ def test_bmm(self, device, dtype):
 
         is_supported = True
         if dtype == torch.bfloat16 and self.device_type == 'cuda':
-            is_supported = TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)
+            is_supported = TEST_WITH_ROCM or SM53OrLater
 
         if not is_supported:
             for num_batches in batch_sizes:
@@ -5794,7 +5954,7 @@ def _test_addbmm_baddbmm(self, func, b1, b2, ref, out_tensor):
     @dtypes(*floating_and_complex_types_and(torch.bfloat16))
     @tf32_on_and_off(0.05)
     def test_addbmm(self, device, dtype):
-        if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater:
+        if self.device_type == 'cuda' and dtype is torch.bfloat16 and not SM53OrLater:
             # cuBLAS does not guarantee BFloat16 support on SM < 53.
             # So on PyTorch, we consider BFloat16 support on SM < 53 as
             # undefined bahavior
@@ -5808,7 +5968,7 @@ def test_addbmm(self, device, dtype):
             if self.device_type == 'cpu':
                 self.precision = 1  # 43 vs 43.75
             else:
-                is_supported = TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)
+                is_supported = TEST_WITH_ROCM or SM53OrLater
 
         if not is_supported:
             b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-1, high=1)
@@ -5867,7 +6027,7 @@ def generate_tensor():
     @dtypes(*floating_and_complex_types_and(torch.bfloat16))
     @tf32_on_and_off(0.05)
     def test_baddbmm(self, device, dtype):
-        if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater:
+        if self.device_type == 'cuda' and dtype is torch.bfloat16 and not SM53OrLater:
             # cuBLAS does not guarantee BFloat16 support on SM < 53.
             # So on PyTorch, we consider BFloat16 support on SM < 53 as
             # undefined bahavior
@@ -5878,7 +6038,7 @@ def test_baddbmm(self, device, dtype):
 
         is_supported = True
         if dtype == torch.bfloat16 and self.device_type == 'cuda':
-            is_supported = TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)
+            is_supported = TEST_WITH_ROCM or SM53OrLater
 
         if not is_supported:
             b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-1, high=1)
@@ -6038,6 +6198,12 @@ def run_test(coeff_shape, data_shape):
         run_test([3, 4], [3, 3, 3])
         run_test([3, 4], [3, 3, 3, 3])
 
+        # Regression test for https://github.com/pytorch/pytorch/issues/94124
+        with self.assertRaises(RuntimeError):
+            x = torch.rand([], device=device, dtype=dtype)
+            coeffs = torch.rand([2, 2], device=device, dtype=dtype)
+            res = torch._compute_linear_combination(x, coeffs)
+
     @onlyCPU
     @skipCPUIfNoLapack
     @dtypes(torch.complex64)
@@ -6972,98 +7138,6 @@ def run_test(A_dims, b_dims):
 
         run_test((1, 1), (1, 1, 1025))
 
-    @precisionOverride({torch.float32: 1e-5, torch.complex64: 1e-5})
-    @skipCUDAIfNoMagma
-    @skipCPUIfNoLapack
-    @dtypes(*floating_and_complex_types())
-    def test_symeig(self, device, dtype):
-        from torch.testing._internal.common_utils import random_hermitian_matrix
-
-        def run_test(dims, eigenvectors, upper):
-            x = random_hermitian_matrix(*dims, dtype=dtype, device=device)
-            if dtype.is_complex:
-                real_dtype = torch.float32 if dtype is torch.complex64 else torch.float64
-            else:
-                real_dtype = dtype
-            oute = torch.empty(dims[1:] + dims[:1], dtype=real_dtype, device=device)
-            outv = torch.empty(dims[1:] + dims[:1] * 2, dtype=dtype, device=device)
-            torch.symeig(x, eigenvectors=eigenvectors, upper=upper, out=(oute, outv))
-
-            if eigenvectors:
-                outv_ = outv.cpu().numpy()
-                x_recon = np.matmul(np.matmul(outv_, torch.diag_embed(oute.to(dtype)).cpu().numpy()),
-                                    outv_.swapaxes(-2, -1).conj())
-                self.assertEqual(x, x_recon, atol=1e-8, rtol=0, msg='Incorrect reconstruction using V @ diag(e) @ V.T')
-            else:
-                eigvals, _ = torch.symeig(x, eigenvectors=True, upper=upper)
-                self.assertEqual(eigvals, oute, msg='Eigenvalues mismatch')
-                self.assertEqual(torch.empty(0, device=device, dtype=dtype), outv, msg='Eigenvector matrix not empty')
-
-            rese, resv = x.symeig(eigenvectors=eigenvectors, upper=upper)
-            self.assertEqual(rese, oute, msg="outputs of symeig and symeig with out don't match")
-            self.assertEqual(resv, outv, msg="outputs of symeig and symeig with out don't match")
-
-            # test non-contiguous
-            x = random_hermitian_matrix(*dims, dtype=dtype, device=device)
-            n_dim = len(dims) + 1
-            # Reverse the batch dimensions and the matrix dimensions and then concat them
-            x = x.permute(tuple(range(n_dim - 3, -1, -1)) + (n_dim - 1, n_dim - 2))
-            assert not x.is_contiguous(), "x is intentionally non-contiguous"
-            rese, resv = torch.symeig(x, eigenvectors=eigenvectors, upper=upper)
-            if eigenvectors:
-                resv_ = resv.cpu().numpy()
-                x_recon = np.matmul(np.matmul(resv_, torch.diag_embed(rese.to(dtype)).cpu().numpy()),
-                                    resv_.swapaxes(-2, -1).conj())
-                self.assertEqual(x, x_recon, atol=1e-8, rtol=0, msg='Incorrect reconstruction using V @ diag(e) @ V.T')
-            else:
-                eigvals, _ = torch.symeig(x, eigenvectors=True, upper=upper)
-                self.assertEqual(eigvals, rese, msg='Eigenvalues mismatch')
-                self.assertEqual(torch.empty(0, device=device, dtype=dtype), resv, msg='Eigenvector matrix not empty')
-
-        batch_dims_set = [(), (3,), (3, 5), (5, 3, 5)]
-        for batch_dims, eigenvectors, upper in itertools.product(batch_dims_set, (True, False), (True, False)):
-            run_test((5,) + batch_dims, eigenvectors, upper)
-
-    @skipCUDAIfNoMagma
-    @skipCPUIfNoLapack
-    @dtypes(*floating_and_complex_types())
-    def test_symeig_out_errors_and_warnings(self, device, dtype):
-        from torch.testing._internal.common_utils import random_hermitian_matrix
-
-        # if non-empty out tensor with wrong shape is passed a warning is given
-        a = random_hermitian_matrix(3, dtype=dtype, device=device)
-        real_dtype = a.real.dtype if dtype.is_complex else dtype
-        out_w = torch.empty(7, 7, dtype=real_dtype, device=device)
-        out_v = torch.empty(7, 7, dtype=dtype, device=device)
-        with warnings.catch_warnings(record=True) as w:
-            # Trigger warning
-            torch.symeig(a, out=(out_w, out_v))
-            self.assertTrue("An output with one or more elements was resized" in str(w[-2].message))
-            self.assertTrue("An output with one or more elements was resized" in str(w[-1].message))
-
-        # dtypes should be safely castable
-        out_w = torch.empty(0, dtype=real_dtype, device=device)
-        out_v = torch.empty(0, dtype=torch.int, device=device)
-        with self.assertRaisesRegex(RuntimeError, "but got eigenvectors with dtype Int"):
-            torch.symeig(a, out=(out_w, out_v))
-
-        out_w = torch.empty(0, dtype=torch.int, device=device)
-        out_v = torch.empty(0, dtype=dtype, device=device)
-        with self.assertRaisesRegex(RuntimeError, "but got eigenvalues with dtype Int"):
-            torch.symeig(a, out=(out_w, out_v))
-
-        # device should match
-        if torch.cuda.is_available():
-            wrong_device = 'cpu' if self.device_type != 'cpu' else 'cuda'
-            out_w = torch.empty(0, device=wrong_device, dtype=dtype)
-            out_v = torch.empty(0, device=device, dtype=dtype)
-            with self.assertRaisesRegex(RuntimeError, "tensors to be on the same device"):
-                torch.symeig(a, out=(out_w, out_v))
-            out_w = torch.empty(0, device=device, dtype=dtype)
-            out_v = torch.empty(0, device=wrong_device, dtype=dtype)
-            with self.assertRaisesRegex(RuntimeError, "tensors to be on the same device"):
-                torch.symeig(a, out=(out_w, out_v))
-
     @skipCUDAIfNoCusolver
     @skipCPUIfNoLapack
     def test_pca_lowrank(self, device):
diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py
index 3b36d0f1996b..d83c5a459aca 100644
--- a/test/test_matmul_cuda.py
+++ b/test/test_matmul_cuda.py
@@ -6,7 +6,7 @@
 
 import torch
 from torch.testing import make_tensor
-from torch.testing._internal.common_cuda import CUDA11OrLater, SM53OrLater
+from torch.testing._internal.common_cuda import SM53OrLater
 from torch.testing._internal.common_device_type import (
     dtypes,
     instantiate_device_type_tests,
@@ -17,6 +17,7 @@
 
 from torch.testing._internal.common_utils import (
     IS_ARM64,
+    IS_JETSON,
     parametrize,
     run_tests,
     TEST_WITH_ROCM,
@@ -40,7 +41,7 @@ def tearDown(self):
         super(self.__class__, self).tearDown()
 
     @onlyCUDA
-    @unittest.skipIf(not CUDA11OrLater, "Only CUDA 11+ is supported")
+    @unittest.skipIf(TEST_WITH_ROCM, "Only CUDA 11+ is supported")
     # imported 'tol' as 'xtol' to avoid aliasing in code above
     @toleranceOverride({torch.float16: xtol(atol=1e-1, rtol=1e-1),
                         torch.bfloat16: xtol(atol=1e-1, rtol=1e-1),
@@ -113,10 +114,11 @@ def test_cublas_addmm_alignment(self):
         self.assertEqual(out, torch.matmul(X, A.transpose(1, 0)) + B)
 
     @onlyCUDA
-    @unittest.skipIf(not CUDA11OrLater, "Only CUDA 11+ is supported")
+    @unittest.skipIf(TEST_WITH_ROCM, "Only CUDA 11+ is supported")
+    @unittest.skipIf(IS_JETSON, "Too large for Jetson")
     @toleranceOverride({torch.float32: xtol(atol=1e-5, rtol=1e-5)})
     @dtypes(*([torch.float32, torch.float16] +
-              [torch.bfloat16] if TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) else []))
+              [torch.bfloat16] if TEST_WITH_ROCM or SM53OrLater else []))
     @parametrize(
         "batch_size, N, M, P",
         [(2, 100, 100, 100),
diff --git a/test/test_meta.py b/test/test_meta.py
index 16a388604b59..79399224dfa1 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -22,6 +22,7 @@
     ops,
     instantiate_device_type_tests,
     onlyCUDA,
+    onlyCPU,
     OpDTypes,
 )
 from torch.testing._internal.common_methods_invocations import op_db
@@ -608,8 +609,8 @@ def run_meta_crossref(
     torch.Tensor.item : {f64, i32, c128, i64, i16, f16, u8, c64, bf16, b8, i8, f32},
     torch.bincount : {i32, i64, u8, i16, i8},
     torch.frexp : {f64, f16, bf16, f32},
-    torch.functional.unique : {f64, i32, i64, u8, i16, bf16, b8, i8, f32},
-    torch.functional.unique_consecutive : {f64, i32, i64, u8, i16, bf16, b8, i8, f32},
+    torch.functional.unique : {f64, i32, i64, u8, i16, f16, bf16, b8, i8, f32},
+    torch.functional.unique_consecutive : {f64, i32, i64, u8, i16, f16, bf16, b8, i8, f32},
     torch.histc : {f64, bf16, f32},
     torch.histogram : {f64, f32},
     torch.histogramdd : {f64, f32},
@@ -630,9 +631,8 @@ def run_meta_crossref(
     torch.nn.functional.one_hot : {i64},
     torch.nn.functional.pdist : {f64, f32},
     torch.polar : {f64, f32},
-    torch.segment_reduce : {f64, f16, bf16, f32},
+    torch._segment_reduce : {f64, f16, bf16, f32},
     torch.searchsorted : {f64, i32, i64, f16, u8, i16, bf16, i8, f32},
-    torch.symeig : {f64, f32, c128, c64},
     torch.cholesky : {f64, f32, c128, c64},
     torch.cholesky_inverse : {f64, f32, c128, c64},
     torch.cholesky_solve : {f64, f32, c128, c64},
@@ -686,7 +686,7 @@ def run_meta_crossref(
     torch.diff : {b8},
     torch.equal : {bf16, i8, c32, i64, u8, c128, b8, f64, i16, i32, f32, f16, c64},
     torch.functional.cdist : {f64, f32},
-    torch.nanmean : {bf16, f64, f32, f16},
+    torch.nanmean : {bf16, f64, f32, f16, c32, c64, c128},
     torch.nn.functional.cross_entropy : {bf16, f64, f32},
     torch.nn.functional.interpolate : {bf16, f64, f32, u8},
     torch.nn.functional.nll_loss : {bf16, f64, f32},
@@ -698,7 +698,6 @@ def run_meta_crossref(
     # This fails for arguments dispatched to grid_sampler_3d, but succeeds
     # for grid_sampler_2d, so we can't just xfail it
     torch.nn.functional.grid_sample : {f64, f32},
-    torch.bucketize : {f64, i32, i64, f16, u8, i16, bf16, i8, f32},
     torch.Tensor.addbmm_: {bf16, c128, c64, f32, f64, i16, i32, i64, i8, u8},
 }
 
@@ -846,7 +845,6 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
     aten.ormqr.default : {c64, c128, f64, f32},
     aten.ormqr.out : {c64, c128, f64, f32},
     aten.polar.out : {f32, f64},
-    aten.symeig.default : {c64, c128, f64, f32},
     aten.take.default : {c64, f16, i8, f64, c128, i64, bf16, f32, i32, b8, i16, u8},
     aten.take.out : {c64, f16, i8, f64, c128, i64, bf16, f32, i32, b8, i16, u8},
     aten.tensordot.out : {c64, i8, f64, c128, i64, bf16, f32, i32, i16, u8},
@@ -859,7 +857,7 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
     aten._histogramdd_from_bin_tensors.default : {f32, f64},
     aten._local_scalar_dense.default : {c32, c64, f16, i8, f64, c128, i64, bf16, f32, i32, b8, i16, u8},
     aten._pdist_forward.default : {f32, f64},
-    aten._unique2.default : {i8, f64, i64, bf16, f32, i32, b8, i16, u8},
+    aten._unique2.default : {i8, f64, i64, f16, bf16, f32, i32, b8, i16, u8},
     aten.bincount.default : {i64, i8, i32, i16, u8},
     aten.equal.default : {c64, f16, i8, f64, c128, i64, bf16, f32, i32, b8, i16, u8},
     aten.frexp.Tensor : {bf16, f32, f16, f64},
@@ -887,8 +885,8 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
     aten.searchsorted.Tensor : {f16, i8, f64, i64, bf16, f32, i32, i16, u8},
     aten.searchsorted.Tensor_out : {f16, i8, f64, i64, bf16, f32, i32, i16, u8},
     aten.segment_reduce.default : {bf16, f32, f16, f64},
-    aten.unique_consecutive.default : {i8, f64, i64, bf16, f32, i32, b8, i16, u8},
-    aten.unique_dim.default : {i8, f64, i64, bf16, f32, i32, b8, i16, u8},
+    aten.unique_consecutive.default : {i8, f64, i64, f16, bf16, f32, i32, b8, i16, u8},
+    aten.unique_dim.default : {i8, f64, i64, f16, bf16, f32, i32, b8, i16, u8},
     aten.upsample_nearest3d.vec : {bf16, f32, f64, u8},
 }
 
@@ -904,8 +902,6 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
     aten.linalg_pinv.atol_rtol_tensor: {f32, f64},
     aten.linalg_pinv.atol_rtol_tensor_out: {f32, f64},
     aten.empty.memory_format: {b8, bf16, c128, c64, c32, f16, f32, f64, i16, i32, i64, i8, u8},
-    aten.bucketize.Tensor : {f16, i8, f64, i64, bf16, f32, i32, i16, u8},
-    aten.bucketize.Tensor_out : {f16, i8, f64, i64, bf16, f32, i32, i16, u8},
     aten.addbmm_.default: {bf16, c128, c64, f32, f64, i16, i32, i64, i8, u8},
 }
 
@@ -971,7 +967,7 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
 }
 
 meta_dispatch_device_skips['cpu'] = {
-    aten._embedding_bag_forward_only.default: {f16, f32, f64},
+    aten._embedding_bag_forward_only.default: {bf16, f16, f32, f64},
     aten.native_batch_norm.default: {f32, f64},
     aten._native_batch_norm_legit.default: {f32, f64},
     aten._native_batch_norm_legit.no_stats: {f32, f64},
@@ -1038,8 +1034,7 @@ def get_strided_variants(t, include_storage_offset=False):
             strided_arg_variants = [arg]
         strided_args.append(strided_arg_variants)
 
-    for result in itertools.product(*strided_args):
-        yield result
+    yield from itertools.product(*strided_args)
 
 class MetaCrossRefDispatchMode(torch.utils._python_dispatch.TorchDispatchMode):
     test_case: TestCase
@@ -1226,6 +1221,29 @@ def test_empty_quantized(self):
         r = torch.empty(2 ** 52, device='meta', dtype=torch.qint8)
         self.assertEqual(r.device.type, 'meta')
 
+    @onlyCPU
+    def test_meta_autograd_no_error(self):
+        lib = torch.library.Library("meta_test", "DEF")
+        impl_cpu = torch.library.Library("meta_test", "IMPL", "CPU")
+        impl_meta = torch.library.Library("meta_test", "IMPL", "Meta")
+
+        def foo_impl(x):
+            return x + 1
+
+        lib.define("foo(Tensor a) -> Tensor")
+        impl_meta.impl("foo", foo_impl)
+        impl_cpu.impl("foo", foo_impl)
+
+        a = torch.ones(2, device='meta')
+        # The point of the test is that this should not error:
+        # We have a fallthrough kernel registered to the AutogradMeta
+        # key for custom ops, so it's fine that `foo()` doesn't have
+        # an autograd kernel.
+        b = torch.ops.meta_test.foo.default(a)
+        del impl_meta
+        del impl_cpu
+        del lib
+
     def test_huber_loss_backward(self):
         inps = [torch.rand(2**52, device='meta') for _ in range(3)]
         r = torch.ops.aten.huber_loss_backward(*inps, 0, 1.0)
diff --git a/test/test_metal.py b/test/test_metal.py
index 35b3ed45eb19..6b9b29ea5492 100644
--- a/test/test_metal.py
+++ b/test/test_metal.py
@@ -64,7 +64,7 @@ def test_conv(self):
 
         class Conv2D(torch.nn.Module):
             def __init__(self):
-                super(Conv2D, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(conv_weight_shape), requires_grad=False)
                 self.bias = torch.nn.Parameter(torch.rand(conv_bias_shape), requires_grad=False)
                 self.strides = strides
@@ -84,7 +84,7 @@ def forward(self, x):
 
         class Conv2DRelu(torch.nn.Module):
             def __init__(self):
-                super(Conv2DRelu, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(conv_weight_shape), requires_grad=False)
                 self.bias = torch.nn.Parameter(torch.rand(conv_bias_shape), requires_grad=False)
                 self.strides = strides
@@ -123,7 +123,7 @@ def forward(self, x):
 
         class Conv2DHardtanh(torch.nn.Module):
             def __init__(self):
-                super(Conv2DHardtanh, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(conv_weight_shape), requires_grad=False)
                 self.bias = torch.nn.Parameter(torch.rand(conv_bias_shape), requires_grad=False)
                 self.strides = strides
diff --git a/test/test_mkldnn.py b/test/test_mkldnn.py
index b1e9b903129b..4b96c19208db 100644
--- a/test/test_mkldnn.py
+++ b/test/test_mkldnn.py
@@ -300,9 +300,8 @@ def test_conv2d_bf16(self):
     def test_conv3d_bf16(self):
         self._test_conv_bf16_base(dim=3)
 
-    def _test_conv2d_nhwc_base(self, weight_memory_format, dtype):
-        conv_module = torch.nn.Conv2d
-        input_shapes = (224, 224)
+    def _test_conv2d_nhwc_base(self, conv_module, weight_memory_format, dtype):
+        input_shapes = (55, 55)
         options = itertools.product([True, False], [True, False], [1, 2], [1, 4])
         for train, bias, dilation, groups in options:
             N = torch.randint(3, 10, (1,)).item()
@@ -310,8 +309,13 @@ def _test_conv2d_nhwc_base(self, weight_memory_format, dtype):
             C = torch.randint(1, 3, (1,)).item() * groups
             x_shape = (N, C) + input_shapes
             x = torch.randn(x_shape, dtype=dtype)
-            # conv1: mkldnn conv2d in contiguous memory format (nchw)
-            # conv2: mkldnn conv2d in channels last memory format (nhwc)
+
+            # TODO: remove this when group depthwise is supported:
+            if conv_module is torch.nn.ConvTranspose2d and groups > 1 and C == groups:
+                continue
+
+            # conv1: mkldnn conv in contiguous memory format (nchw)
+            # conv2: mkldnn conv in channels last memory format (nhwc)
             conv1 = conv_module(in_channels=C,
                                 out_channels=M,
                                 kernel_size=3,
@@ -342,15 +346,85 @@ def _test_conv2d_nhwc_base(self, weight_memory_format, dtype):
                 self.assertEqual(x1.grad, x2.grad)
 
     def test_conv2d_nhwc(self):
-        self._test_conv2d_nhwc_base(torch.contiguous_format, dtype=torch.float32)
-        self._test_conv2d_nhwc_base(torch.channels_last, dtype=torch.float32)
+        self._test_conv2d_nhwc_base(torch.nn.Conv2d, torch.contiguous_format, dtype=torch.float32)
+        self._test_conv2d_nhwc_base(torch.nn.Conv2d, torch.channels_last, dtype=torch.float32)
 
     @unittest.skipIf(IS_WINDOWS, "Limit support for bf16 path")
     def test_conv2d_nhwc_bf16(self):
         # when has_bf16_support() returns false, bf16 CPU conv will fall back to thnn impl
         if has_bf16_support():
-            self._test_conv2d_nhwc_base(torch.contiguous_format, dtype=torch.bfloat16)
-            self._test_conv2d_nhwc_base(torch.channels_last, dtype=torch.bfloat16)
+            self._test_conv2d_nhwc_base(torch.nn.Conv2d, torch.contiguous_format, dtype=torch.bfloat16)
+            self._test_conv2d_nhwc_base(torch.nn.Conv2d, torch.channels_last, dtype=torch.bfloat16)
+
+    def test_conv_transpose2d_nhwc(self):
+        self._test_conv2d_nhwc_base(torch.nn.ConvTranspose2d, torch.contiguous_format, dtype=torch.float32)
+        self._test_conv2d_nhwc_base(torch.nn.ConvTranspose2d, torch.channels_last, dtype=torch.float32)
+
+    @unittest.skipIf(IS_WINDOWS, "Limit support for bf16 path")
+    def test_conv_transpose2d_nhwc_bf16(self):
+        # when has_bf16_support() returns false, bf16 CPU conv will fall back to thnn impl
+        if has_bf16_support():
+            self._test_conv2d_nhwc_base(torch.nn.ConvTranspose2d, torch.contiguous_format, dtype=torch.bfloat16)
+            self._test_conv2d_nhwc_base(torch.nn.ConvTranspose2d, torch.channels_last, dtype=torch.bfloat16)
+
+    def _test_conv_transpose_base(self, dim):
+        conv_module = {
+            1: torch.nn.ConvTranspose1d,
+            2: torch.nn.ConvTranspose2d,
+            3: torch.nn.ConvTranspose3d
+        }
+        input_shapes = {1: (55,), 2: (28, 28), 3: (14, 14, 14)}
+        options = itertools.product([True, False], [True, False], [1, 2], [1, 4])
+        for train, bias, dilation, groups in options:
+            N = torch.randint(3, 10, (1,)).item()
+            M = torch.randint(1, 3, (1,)).item() * groups
+            C = torch.randint(1, 3, (1,)).item() * groups
+            x_shape = (N, C) + input_shapes[dim]
+            data = torch.randn(x_shape, dtype=torch.float32)
+            # conv: mkldnn tranpose conv fp32
+            # conv_ref: thnn transpose conv fp32
+            conv = conv_module[dim](in_channels=C,
+                                    out_channels=M,
+                                    kernel_size=3,
+                                    stride=1,
+                                    padding=1,
+                                    dilation=dilation,
+                                    bias=bias,
+                                    groups=groups).to(dtype=torch.float32)
+            x = data.clone()
+            x_ref = x.clone()
+            if train:
+                x.requires_grad_()
+                x_ref.requires_grad_()
+
+            conv_ref = copy.deepcopy(conv)
+            with torch.backends.mkldnn.flags(enabled=False):
+                y_ref = conv_ref(x_ref)
+                if train:
+                    y_ref.sum().backward()
+
+            y = conv(x)
+            if train:
+                y.sum().backward()
+
+            self.assertEqual(y, y_ref)
+            if train:
+                self.assertEqual(x.grad, x_ref.grad)
+                self.assertEqual(conv.weight.grad,
+                                 conv_ref.weight.grad,
+                                 atol=1e-3,
+                                 rtol=1e-3)
+                if bias:
+                    self.assertEqual(conv.bias.grad, conv_ref.bias.grad)
+
+    def test_conv_transpose1d(self):
+        self._test_conv_transpose_base(dim=1)
+
+    def test_conv_transpose2d(self):
+        self._test_conv_transpose_base(dim=2)
+
+    def test_conv_transpose3d(self):
+        self._test_conv_transpose_base(dim=3)
 
     def test_conv2d_legacy_jit_model(self):
         """
diff --git a/test/test_mkldnn_fusion.py b/test/test_mkldnn_fusion.py
index 9f264337d956..fad3e77dccab 100644
--- a/test/test_mkldnn_fusion.py
+++ b/test/test_mkldnn_fusion.py
@@ -20,6 +20,7 @@ class PointwisePostOp(NamedTuple):
     algorithm : str = ""
 
 CONV_MODULES = {2: torch.nn.Conv2d, 3: torch.nn.Conv3d}
+CONV_TRANSPOSE_MODULES = {2: torch.nn.ConvTranspose2d}
 
 @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
 class TestMkldnnFusion(JitTestCase):
@@ -61,7 +62,7 @@ def _check_model(self, m, x, trace=False):
     def test_single_conv(self):
         class M(nn.Module):
             def __init__(self, in_channels, out_channels, bias, **kwargs):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(in_channels, out_channels, bias=bias, **kwargs)
 
             def forward(self, x):
@@ -100,7 +101,7 @@ def forward(self, x):
     def test_conv_unary_fusion_nnc(self):
         class M(nn.Module):
             def __init__(self, unary_fn, in_channels, out_channels, bias, **kwargs):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(in_channels, out_channels, bias=bias, **kwargs)
                 self.unary = unary_fn
 
@@ -129,7 +130,7 @@ def forward(self, x):
     def test_unsupported_conv(self):
         class M(nn.Module):
             def __init__(self, m, in_channels, out_channels, bias, **kwargs):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = m(in_channels, out_channels, bias=bias, **kwargs)
 
             def forward(self, x):
@@ -192,7 +193,7 @@ def _binary_list(self):
     def test_linear_unary_fusion_ops(self):
         class M(nn.Module):
             def __init__(self, unary_fn, in_channels, out_channels, bias, **kwargs):
-                super(M, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(
                     in_channels, out_channels, bias=bias, **kwargs
                 )
@@ -222,7 +223,7 @@ def forward(self, x):
     def test_conv_unary_fusion_ops(self):
         class M(nn.Module):
             def __init__(self, unary_fn, dim, in_channels, out_channels, dilation, groups, bias, **kwargs):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = CONV_MODULES[dim](in_channels, out_channels, dilation=dilation, groups=groups, bias=bias, **kwargs)
                 self.unary = unary_fn
 
@@ -258,7 +259,7 @@ def forward(self, x):
     def test_conv_binary_fusion_ops(self):
         class M(nn.Module):
             def __init__(self, binary_fn, dim, in_channels, out_channels, dilation, groups, bias, **kwargs):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv = CONV_MODULES[dim](in_channels, out_channels, dilation=dilation, groups=groups, bias=bias, **kwargs)
                 self.binary = binary_fn
 
@@ -306,7 +307,7 @@ def forward(self, x, other):
     def test_linear_binary_fusion_ops(self):
         class M(nn.Module):
             def __init__(self, binary_fn, in_channels, out_channels, bias, **kwargs):
-                super(M, self).__init__()
+                super().__init__()
                 self.linear = torch.nn.Linear(
                     in_channels, out_channels, bias=bias, **kwargs
                 )
@@ -332,5 +333,64 @@ def forward(self, x, other):
                     )
                     self.assertEqual(ref, fused)
 
+    def test_conv_transpose_unary_fusion_ops(self):
+        class M(nn.Module):
+            def __init__(self, unary_fn, dim, in_channels, out_channels, kernel_size, **kwargs):
+                super().__init__()
+                self.conv_transpose = CONV_TRANSPOSE_MODULES[dim](in_channels, out_channels, kernel_size, **kwargs)
+                self.unary = unary_fn
+
+            def forward(self, x):
+                x = self.conv_transpose(x)
+                x = self.unary(x)
+                return x
+
+        input_shapes = {2: (28, 28)}
+        kernel_size = 3
+        for pointwise_name, pointwise_info in self._unary_list().items():
+            for dim in [2]:
+                channels_last = torch.channels_last if dim == 2 else torch.channels_last_3d
+                options = itertools.product([True, False], [1, 2], [1, 4], [torch.contiguous_format, channels_last], [False, True])
+                for bias, dilation, groups, memory_format, prepack_weight in options:
+                    oC = 32 * groups
+                    iC = 3 * groups
+                    x_shape = (1, iC) + input_shapes[dim]
+                    x = torch.randn(x_shape, dtype=torch.float32).to(memory_format=memory_format)
+                    mod = M(pointwise_info.pointwise_module, dim, iC, oC, kernel_size, dilation=dilation, groups=groups, bias=bias)
+                    mod = mod.to(memory_format=memory_format).eval()
+                    with torch.no_grad():
+                        ref = mod(x)
+                        attr = pointwise_info.attr
+                        scalars = pointwise_info.scalars
+                        algorithm = pointwise_info.algorithm
+
+                        if prepack_weight:
+                            packed_weight = torch.ops.mkldnn._reorder_convolution_transpose_weight(
+                                mod.conv_transpose.weight.to_mkldnn(),
+                                mod.conv_transpose.padding,
+                                mod.conv_transpose.output_padding,
+                                mod.conv_transpose.stride,
+                                mod.conv_transpose.dilation,
+                                mod.conv_transpose.groups,
+                                x.size())
+                            mod.conv_transpose.weight = torch.nn.Parameter(
+                                packed_weight,
+                                requires_grad=mod.conv_transpose.weight.requires_grad,
+                            )
+
+                        fused = torch.ops.mkldnn._convolution_transpose_pointwise(
+                            x,
+                            mod.conv_transpose.weight,
+                            mod.conv_transpose.bias,
+                            mod.conv_transpose.padding,
+                            mod.conv_transpose.output_padding,
+                            mod.conv_transpose.stride,
+                            mod.conv_transpose.dilation,
+                            mod.conv_transpose.groups,
+                            attr,
+                            scalars,
+                            algorithm)
+                    self.assertEqual(ref, fused)
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py
index e77fce392594..a6c0a0692d45 100644
--- a/test/test_mobile_optimizer.py
+++ b/test/test_mobile_optimizer.py
@@ -54,7 +54,7 @@ def test_optimize_for_mobile(self):
 
         class MyTestModule(torch.nn.Module):
             def __init__(self):
-                super(MyTestModule, self).__init__()
+                super().__init__()
                 self.conv_weight = torch.nn.Parameter(torch.rand(conv_weight_shape))
                 self.conv_bias = torch.nn.Parameter(torch.rand((conv_bias_shape)))
                 self.linear_weight = torch.nn.Parameter(torch.rand(linear_weight_shape))
@@ -86,7 +86,7 @@ def foo(self, x):
 
         class BNTestModule(torch.nn.Module):
             def __init__(self):
-                super(BNTestModule, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(1, 20, 5, 1)
                 self.bn = torch.nn.BatchNorm2d(num_features=20)
                 self.bn.eps = 0.0023
@@ -167,7 +167,7 @@ def forward(self, x):
 
         class MyMobileOptimizedTagTest(torch.nn.Module):
             def __init__(self):
-                super(MyMobileOptimizedTagTest, self).__init__()
+                super().__init__()
                 self.linear_weight = torch.nn.Parameter(torch.rand(linear_weight_shape))
                 self.linear_bias = torch.nn.Parameter(torch.rand((weight_output_dim)))
 
@@ -184,7 +184,7 @@ def forward(self, x):
 
         class MyPreserveMethodsTest(torch.nn.Module):
             def __init__(self):
-                super(MyPreserveMethodsTest, self).__init__()
+                super().__init__()
                 self.linear_weight = torch.nn.Parameter(torch.rand(linear_weight_shape))
                 self.linear_bias = torch.nn.Parameter(torch.rand((weight_output_dim)))
 
@@ -208,7 +208,7 @@ def preserveThis(self):
 
         class OptimizeNoForwardTest(torch.nn.Module):
             def __init__(self):
-                super(OptimizeNoForwardTest, self).__init__()
+                super().__init__()
                 self.l = nn.Linear(10, 100)
                 self.l2 = nn.Linear(100, 1)
                 self.d = nn.Dropout(p=0.2)
@@ -234,7 +234,7 @@ def foo(self, x):
 
         class BNTestNoForwardModule(torch.nn.Module):
             def __init__(self):
-                super(BNTestNoForwardModule, self).__init__()
+                super().__init__()
                 self.conv = torch.nn.Conv2d(1, 20, 5, 1)
                 self.bn = torch.nn.BatchNorm2d(num_features=20)
                 self.bn.eps = 0.0023
@@ -273,7 +273,7 @@ def test_quantized_conv_no_asan_failures(self):
 
         class Child(nn.Module):
             def __init__(self):
-                super(Child, self).__init__()
+                super().__init__()
                 self.conv2 = nn.Conv2d(1, 1, 1)
 
             def forward(self, x):
@@ -282,7 +282,7 @@ def forward(self, x):
 
         class Parent(nn.Module):
             def __init__(self):
-                super(Parent, self).__init__()
+                super().__init__()
                 self.quant = torch.ao.quantization.QuantStub()
                 self.conv1 = nn.Conv2d(1, 1, 1)
                 self.child = Child()
@@ -308,7 +308,7 @@ def forward(self, x):
     def test_generate_mobile_module_lints(self):
         class MyTestModule(torch.nn.Module):
             def __init__(self):
-                super(MyTestModule, self).__init__()
+                super().__init__()
                 self.fc = torch.nn.Linear(4, 4)
                 self.dropout = torch.nn.Dropout(p=0.5)
 
@@ -319,7 +319,7 @@ def forward(self, inputs):
 
         class MyBNModule(torch.nn.Module):
             def __init__(self):
-                super(MyBNModule, self).__init__()
+                super().__init__()
                 self.bn = torch.nn.BatchNorm2d(4, affine=True)
 
             def forward(self, inputs):
@@ -327,9 +327,6 @@ def forward(self, inputs):
                 return bn
 
         class MyBundledInputModule(torch.nn.Module):
-            def __init__(self):
-                super(MyBundledInputModule, self).__init__()
-
             def forward(self, inputs):
                 return inputs
 
@@ -359,16 +356,10 @@ def get_lint_count_by_type(lint_type, module_lint_List):
     @skipIfNoXNNPACK
     def test_preserve_bundled_inputs_methods(self):
         class MyBundledInputModule(torch.nn.Module):
-            def __init__(self):
-                super(MyBundledInputModule, self).__init__()
-
             def forward(self, inputs):
                 return inputs
 
         class MyIncompleteBundledInputModule(torch.nn.Module):
-            def __init__(self):
-                super(MyIncompleteBundledInputModule, self).__init__()
-
             def forward(self, inputs):
                 return inputs
 
@@ -419,7 +410,7 @@ def test_hoist_conv_packed_params(self):
 
         class Standalone(nn.Module):
             def __init__(self):
-                super(Standalone, self).__init__()
+                super().__init__()
                 self.quant = torch.ao.quantization.QuantStub()
                 self.conv1 = nn.Conv2d(1, 1, 1)
                 self.conv2 = nn.Conv2d(1, 1, 1)
@@ -440,7 +431,7 @@ def fuse_model(self):
 
         class Child(nn.Module):
             def __init__(self):
-                super(Child, self).__init__()
+                super().__init__()
                 self.conv1 = nn.Conv2d(1, 1, 1)
 
             def forward(self, x):
@@ -449,7 +440,7 @@ def forward(self, x):
 
         class Parent(nn.Module):
             def __init__(self):
-                super(Parent, self).__init__()
+                super().__init__()
                 self.quant = torch.ao.quantization.QuantStub()
                 self.conv1 = nn.Conv2d(1, 1, 1)
                 self.child = Child()
@@ -521,7 +512,7 @@ def test_mobilenet_optimize_for_mobile(self):
     def test_clone_module_with_class(self):
         class MyInnerTestModule(torch.nn.Module):
             def __init__(self):
-                super(MyInnerTestModule, self).__init__()
+                super().__init__()
                 self.pqr = torch.Tensor([10., 20., 30.])
 
             def forward(self, inputs):
@@ -533,7 +524,7 @@ def dummy_method_not_cloned(self):
 
         class MyTestModule(torch.nn.Module):
             def __init__(self):
-                super(MyTestModule, self).__init__()
+                super().__init__()
                 self.abc = 23
                 self.pqr = torch.Tensor([1., 2., 3.])
                 self.inner = MyInnerTestModule()
diff --git a/test/test_model_dump.py b/test/test_model_dump.py
index 3c682b6ce680..f7ae07131a99 100644
--- a/test/test_model_dump.py
+++ b/test/test_model_dump.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
 # Owner(s): ["oncall: mobile"]
 
-import sys
 import os
 import io
 import functools
@@ -85,8 +84,7 @@ def wrapper(self, *args, **kwds):
 
 class TestModelDump(TestCase):
     def needs_resources(self):
-        if sys.version_info < (3, 7):
-            self.skipTest("importlib.resources was new in 3.7")
+        pass
 
     def test_inline_skeleton(self):
         self.needs_resources()
diff --git a/test/test_module_init.py b/test/test_module_init.py
index 98dcb3ee694a..422363f748f2 100644
--- a/test/test_module_init.py
+++ b/test/test_module_init.py
@@ -185,9 +185,9 @@ def build_constructor_arg_db():
         torch.ao.nn.qat.EmbeddingBag: ((10, 12), {
             'qconfig': torch.ao.quantization.float_qparams_weight_only_qconfig,
         }),
-        torch.nn.quantizable.LSTM: ((5, 6), {}),
-        torch.nn.quantizable.LSTMCell: ((5, 6), {}),
-        torch.nn.quantizable.MultiheadAttention: ((10, 2), {}),
+        torch.ao.nn.quantizable.LSTM: ((5, 6), {}),
+        torch.ao.nn.quantizable.LSTMCell: ((5, 6), {}),
+        torch.ao.nn.quantizable.MultiheadAttention: ((10, 2), {}),
         torch.ao.nn.quantized.BatchNorm2d: ((2,), {}),
         torch.ao.nn.quantized.BatchNorm3d: ((2,), {}),
         torch.ao.nn.quantized.Dropout: ((), {}),
@@ -236,74 +236,74 @@ def build_constructor_arg_db():
         torch.ao.nn.quantized.FloatFunctional: ((), {}),
         torch.ao.nn.quantized.FXFloatFunctional: ((), {}),
         torch.ao.nn.quantized.QFunctional: ((), {}),
-        # Remove torch.nn.quantized after the migration completes:
-        torch.nn.qat.Conv1d: ((3, 3, 3), {
+        # Remove torch.ao.nn.quantized after the migration completes:
+        torch.ao.nn.qat.Conv1d: ((3, 3, 3), {
             'qconfig': torch.ao.quantization.default_qconfig,
         }),
-        torch.nn.qat.Conv2d: ((3, 3, 3), {
+        torch.ao.nn.qat.Conv2d: ((3, 3, 3), {
             'qconfig': torch.ao.quantization.default_qconfig,
         }),
-        torch.nn.qat.Conv3d: ((3, 3, 3), {
+        torch.ao.nn.qat.Conv3d: ((3, 3, 3), {
             'qconfig': torch.ao.quantization.default_qconfig,
         }),
-        torch.nn.qat.Linear: ((5, 2), {
+        torch.ao.nn.qat.Linear: ((5, 2), {
             'qconfig': torch.ao.quantization.default_qconfig,
         }),
-        torch.nn.qat.Embedding: ((10, 12), {
+        torch.ao.nn.qat.Embedding: ((10, 12), {
             'qconfig': torch.ao.quantization.float_qparams_weight_only_qconfig,
         }),
-        torch.nn.qat.EmbeddingBag: ((10, 12), {
+        torch.ao.nn.qat.EmbeddingBag: ((10, 12), {
             'qconfig': torch.ao.quantization.float_qparams_weight_only_qconfig,
         }),
-        torch.nn.quantized.BatchNorm2d: ((2,), {}),
-        torch.nn.quantized.BatchNorm3d: ((2,), {}),
-        torch.nn.quantized.Dropout: ((), {}),
-        torch.nn.quantized.Conv1d: ((3, 3, 3), {}),
-        torch.nn.quantized.Conv2d: ((3, 3, 3), {}),
-        torch.nn.quantized.Conv3d: ((3, 3, 3), {}),
-        torch.nn.quantized.ConvTranspose1d: ((3, 3, 3), {}),
-        torch.nn.quantized.ConvTranspose2d: ((3, 3, 3), {}),
-        torch.nn.quantized.ConvTranspose3d: ((16, 33, (3, 3, 5)), {
+        torch.ao.nn.quantized.BatchNorm2d: ((2,), {}),
+        torch.ao.nn.quantized.BatchNorm3d: ((2,), {}),
+        torch.ao.nn.quantized.Dropout: ((), {}),
+        torch.ao.nn.quantized.Conv1d: ((3, 3, 3), {}),
+        torch.ao.nn.quantized.Conv2d: ((3, 3, 3), {}),
+        torch.ao.nn.quantized.Conv3d: ((3, 3, 3), {}),
+        torch.ao.nn.quantized.ConvTranspose1d: ((3, 3, 3), {}),
+        torch.ao.nn.quantized.ConvTranspose2d: ((3, 3, 3), {}),
+        torch.ao.nn.quantized.ConvTranspose3d: ((16, 33, (3, 3, 5)), {
             'stride': (2, 1, 1),
             'padding': (4, 2, 2),
             'output_padding': (2, 2, 2),
             'dilation': (1, 1, 1),
         }),
-        torch.nn.quantized.DeQuantize: ((), {}),
-        torch.nn.quantized.ELU: ((0.01, 0), {}),
-        torch.nn.quantized.Embedding: ((10, 3), {
+        torch.ao.nn.quantized.DeQuantize: ((), {}),
+        torch.ao.nn.quantized.ELU: ((0.01, 0), {}),
+        torch.ao.nn.quantized.Embedding: ((10, 3), {
             'factory_kwargs': {},
         }),
-        torch.nn.quantized.EmbeddingBag: ((10, 3), {
+        torch.ao.nn.quantized.EmbeddingBag: ((10, 3), {
             'factory_kwargs': {},
         }),
-        torch.nn.quantized.GroupNorm: ((2, 4, torch.nn.Parameter(torch.tensor(2.)),
-                                        torch.nn.Parameter(torch.tensor(2.)), 0.1, 0), {}),
-        torch.nn.quantized.Hardswish: ((0.1, 0,), {}),
-        torch.nn.quantized.InstanceNorm1d: ((2, torch.nn.Parameter(torch.tensor(2.)),
-                                             torch.nn.Parameter(torch.tensor(2.)), 0.1, 0), {}),
-        torch.nn.quantized.InstanceNorm2d: ((2, torch.nn.Parameter(torch.tensor(2.)),
-                                             torch.nn.Parameter(torch.tensor(2.)), 0.1, 0), {}),
-        torch.nn.quantized.InstanceNorm3d: ((2, torch.nn.Parameter(torch.tensor(2.)),
-                                             torch.nn.Parameter(torch.tensor(2.)), 0.1, 0), {}),
-        torch.nn.quantized.LayerNorm: ((2, torch.nn.Parameter(torch.tensor(2.)),
-                                        torch.nn.Parameter(torch.tensor(2.)), 0.1, 0), {}),
-        torch.nn.quantized.LeakyReLU: ((0.01, 0), {}),
-        torch.nn.quantized.Linear: ((5, 2), {
+        torch.ao.nn.quantized.GroupNorm: ((2, 4, torch.nn.Parameter(torch.tensor(2.)),
+                                           torch.nn.Parameter(torch.tensor(2.)), 0.1, 0), {}),
+        torch.ao.nn.quantized.Hardswish: ((0.1, 0,), {}),
+        torch.ao.nn.quantized.InstanceNorm1d: ((2, torch.nn.Parameter(torch.tensor(2.)),
+                                                torch.nn.Parameter(torch.tensor(2.)), 0.1, 0), {}),
+        torch.ao.nn.quantized.InstanceNorm2d: ((2, torch.nn.Parameter(torch.tensor(2.)),
+                                                torch.nn.Parameter(torch.tensor(2.)), 0.1, 0), {}),
+        torch.ao.nn.quantized.InstanceNorm3d: ((2, torch.nn.Parameter(torch.tensor(2.)),
+                                                torch.nn.Parameter(torch.tensor(2.)), 0.1, 0), {}),
+        torch.ao.nn.quantized.LayerNorm: ((2, torch.nn.Parameter(torch.tensor(2.)),
+                                           torch.nn.Parameter(torch.tensor(2.)), 0.1, 0), {}),
+        torch.ao.nn.quantized.LeakyReLU: ((0.01, 0), {}),
+        torch.ao.nn.quantized.Linear: ((5, 2), {
             'factory_kwargs': {},
         }),
-        torch.nn.quantized.MaxPool2d: ((3,), {}),
-        torch.nn.quantized.PReLU: ((0.01, 0), {}),
-        torch.nn.quantized.Quantize: ((0.1, 0), {
+        torch.ao.nn.quantized.MaxPool2d: ((3,), {}),
+        torch.ao.nn.quantized.PReLU: ((0.01, 0), {}),
+        torch.ao.nn.quantized.Quantize: ((0.1, 0), {
             'dtype': torch.int16,
             'factory_kwargs': {},
         }),
-        torch.nn.quantized.ReLU6: ((), {}),
-        torch.nn.quantized.Sigmoid: ((0.1, 0), {}),
-        torch.nn.quantized.Softmax: ((), {}),
-        torch.nn.quantized.FloatFunctional: ((), {}),
-        torch.nn.quantized.FXFloatFunctional: ((), {}),
-        torch.nn.quantized.QFunctional: ((), {}),
+        torch.ao.nn.quantized.ReLU6: ((), {}),
+        torch.ao.nn.quantized.Sigmoid: ((0.1, 0), {}),
+        torch.ao.nn.quantized.Softmax: ((), {}),
+        torch.ao.nn.quantized.FloatFunctional: ((), {}),
+        torch.ao.nn.quantized.FXFloatFunctional: ((), {}),
+        torch.ao.nn.quantized.QFunctional: ((), {}),
     }
 
 
@@ -427,9 +427,9 @@ def generate_tests(test_cls, constructor_arg_db):
         torch.nn,
         torch.ao.nn.qat,
         torch.ao.nn.quantized,
-        torch.nn.qat,
-        torch.nn.quantizable,
-        torch.nn.quantized,
+        torch.ao.nn.qat,
+        torch.ao.nn.quantizable,
+        torch.ao.nn.quantized,
     ]
     # ...except these
     MODULES_TO_SKIP = {
@@ -440,10 +440,10 @@ def generate_tests(test_cls, constructor_arg_db):
         # See https://github.com/pytorch/pytorch/issues/55396
         torch.ao.nn.quantized.Embedding,
         torch.ao.nn.quantized.EmbeddingBag,
-        torch.nn.quantized.Embedding,
-        torch.nn.quantized.EmbeddingBag,
-        torch.nn.quantized.LSTM,
-        torch.nn.quantized.MultiheadAttention,
+        torch.ao.nn.quantized.Embedding,
+        torch.ao.nn.quantized.EmbeddingBag,
+        torch.ao.nn.quantized.LSTM,
+        torch.ao.nn.quantized.MultiheadAttention,
     }
     # no need to support kwargs for these modules even though
     # they have parameters / buffers because they are passed in
@@ -491,13 +491,13 @@ def generate_tests(test_cls, constructor_arg_db):
         torch.ao.nn.quantized.ConvTranspose3d,
         torch.ao.nn.quantized.Linear,
         # Remove the lines below after AO migration is complete
-        torch.nn.quantized.Conv1d,
-        torch.nn.quantized.Conv2d,
-        torch.nn.quantized.Conv3d,
-        torch.nn.quantized.ConvTranspose1d,
-        torch.nn.quantized.ConvTranspose2d,
-        torch.nn.quantized.ConvTranspose3d,
-        torch.nn.quantized.Linear,
+        torch.ao.nn.quantized.Conv1d,
+        torch.ao.nn.quantized.Conv2d,
+        torch.ao.nn.quantized.Conv3d,
+        torch.ao.nn.quantized.ConvTranspose1d,
+        torch.ao.nn.quantized.ConvTranspose2d,
+        torch.ao.nn.quantized.ConvTranspose3d,
+        torch.ao.nn.quantized.Linear,
     }
 
     for namespace in NAMESPACES:
diff --git a/test/test_modules.py b/test/test_modules.py
index 6a8e064b1142..2ae17f5f8cf8 100644
--- a/test/test_modules.py
+++ b/test/test_modules.py
@@ -498,7 +498,7 @@ def test_cpu_gpu_parity(self, device, dtype, module_info, training):
         # TODO: RNN / GRU / LSTM don't support backwards on eval mode for cuDNN; skip this in a
         # nicer way for eval mode only.
         # See https://github.com/pytorch/pytorch/issues/79161
-        rnn_modules = set([torch.nn.RNN, torch.nn.GRU, torch.nn.LSTM])
+        rnn_modules = {torch.nn.RNN, torch.nn.GRU, torch.nn.LSTM}
         if (module_info.module_cls in rnn_modules
                 and not training
                 and 'cuda' in device
diff --git a/test/test_mps.py b/test/test_mps.py
index d7e560e53c29..a9f5e7fb879b 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 # Owner(s): ["module: mps"]
 
+import platform
 import sys
 import math
 import random
@@ -11,12 +12,13 @@
 import os
 import pprint
 import copy
+import gc
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import itertools
 from collections import defaultdict
-from torch._six import inf
+from torch import inf
 from torch.nn import Parameter
 from torch.testing._internal import opinfo
 from torch.testing._internal.common_utils import \
@@ -31,6 +33,7 @@
 
 from torch.testing._internal.common_methods_invocations import (
     op_db,
+    DecorateInfo,
     UnaryUfuncInfo,
     ReductionOpInfo,
     SpectralFuncInfo,
@@ -55,13 +58,259 @@
     )
 )
 
+def mps_ops_modifier(ops):
+    # Those ops worked on MacOS12, but broken on MacOS13, see https://github.com/pytorch/pytorch/issues/85758
+    MACOS_13_X_XFAILLIST = {
+        'masked.softmax': [torch.float32],
+        'masked.softmin': [torch.float32],
+        'masked.log_softmax': [torch.float32],
+    }
+    MACOS_12_X_XFAILLIST = {
+        '__radd__': [torch.uint8],
+        '__rdiv__': [torch.uint8],
+        '__rmul__': [torch.uint8],
+        '__rpow__': [torch.uint8],
+        'abs': [torch.uint8],
+        'acos': [torch.uint8],
+        'acosh': [torch.uint8],
+        'add': [torch.uint8],
+        'asin': [torch.uint8],
+        'asinh': [torch.uint8],
+        'atan': [torch.uint8],
+        'atanh': [torch.uint8],
+        'cos': [torch.uint8],
+        'cosh': [torch.uint8],
+        'deg2rad': [torch.uint8],
+        'diff': [torch.uint8],
+        'equal': [torch.uint8],
+        'erf': [torch.uint8],
+        'exp2': [torch.uint8],
+        'exp': [torch.uint8],
+        'fmax': [torch.uint8],
+        'fmin': [torch.uint8],
+        'fmod': [torch.uint8],
+        'isclose': [torch.uint8],
+        'isnan': [torch.uint8],
+        'kron': [torch.uint8],
+        'log10': [torch.uint8],
+        'log1p': [torch.uint8],
+        'log2': [torch.uint8],
+        'log': [torch.uint8],
+        'logical_and': [torch.uint8],
+        'logical_or': [torch.uint8],
+        'logical_xor': [torch.uint8],
+        'logit': [torch.uint8],
+        'masked.mean': [torch.uint8],
+        'masked.std': [torch.uint8],
+        'masked.var': [torch.uint8],
+        'nn.functional.avg_pool1d': [torch.int64],
+        'nn.functional.avg_pool2d': [torch.int64],
+        'nn.functional.cosine_embedding_loss': [torch.uint8],
+        'nn.functional.poisson_nll_loss': [torch.uint8],
+        'nn.functional.softsign': [torch.uint8],
+        'nn.functional.tanhshrink': [torch.uint8],
+        'pow': [torch.int16, torch.int64, torch.uint8],
+        'rad2deg': [torch.uint8],
+        'reciprocal': [torch.uint8],
+        'remainder': [torch.uint8],
+        'rsqrt': [torch.uint8],
+        'sigmoid': [torch.uint8],
+        'sign': [torch.uint8],
+        'sin': [torch.uint8],
+        'sinh': [torch.uint8],
+        'special.ndtr': [torch.uint8],
+        'sqrt': [torch.uint8],
+        'sub': [torch.uint8],
+        'tan': [torch.uint8],
+        'tanh': [torch.uint8],
+        'true_divide': [torch.uint8],
+        'xlogy': [torch.uint8],
+        # Weird
+        'square': [torch.uint8, torch.bool, torch.int16, torch.int32, torch.int64],
+    }
+
+
+    # Those ops are not expected to work
+    XFAILLIST = {
+        '__rpow__': [torch.int16, torch.int32, torch.int64],
+        'chalf': None,
+        # Unsupported dtypes
+        'dot': [torch.int64],
+        'index_add': [torch.int64],
+        'nn.functional.conv1d': [torch.int64],
+        'nn.functional.conv2d': [torch.int64],
+        'nn.functional.conv_transpose1d': [torch.int64],
+        'nn.functional.conv_transpose2d': [torch.int64],
+        'remainder': [torch.int64],
+        'sigmoid': [torch.int64],
+        # failures due to lack of op implementation on MPS backend
+        'put': None,
+        # Weird
+        'byte': [torch.float16, torch.float32],
+        'nn.functional.adaptive_avg_pool1d': [torch.float32],
+        'nn.functional.adaptive_avg_pool2d': [torch.float32],
+    }
+
+    def addDecorator(op, d) -> None:
+        op.decorators = list(op.decorators) if op.decorators is not None else []
+        op.decorators.append(d)
+
+    for op in ops:
+        key = op.name + op.variant_test_name
+        if key in XFAILLIST:
+            addDecorator(op, DecorateInfo(
+                         unittest.expectedFailure,
+                         dtypes=XFAILLIST[key]))
+
+        if key in MACOS_13_X_XFAILLIST and torch.backends.mps.is_macos13_or_newer():
+            addDecorator(op, DecorateInfo(
+                         unittest.expectedFailure,
+                         dtypes=MACOS_13_X_XFAILLIST[key]))
+        if key in MACOS_12_X_XFAILLIST and not torch.backends.mps.is_macos13_or_newer():
+            addDecorator(op, DecorateInfo(
+                         unittest.expectedFailure,
+                         dtypes=MACOS_12_X_XFAILLIST[key]))
+        yield op
+
 # Same logic as test_cuda.py
 if not torch.backends.mps.is_available():
     print('MPS not available, skipping tests', file=sys.stderr)
     TestCase = object  # noqa: F811
     NNTestCase = object  # noqa: F811
 
-class MPSReluTest(TestCase):
+product_version = float('.'.join(platform.mac_ver()[0].split('.')[:2]))
+
+# Determine whether to enable MPS memory leak check (uses same code as CUDA).
+TEST_MPS_MEM_LEAK_CHECK = os.getenv('PYTORCH_TEST_MPS_MEM_LEAK_CHECK', '0') == '1'
+
+def skipMPSMemoryLeakCheckIf(condition):
+    def dec(fn):
+        if getattr(fn, '_do_mps_memory_leak_check', True):
+            fn._do_mps_memory_leak_check = not condition
+        return fn
+    return dec
+
+class MpsMemoryLeakCheck():
+    def __init__(self, testcase, name=None):
+        self.name = testcase.id() if name is None else name
+        self.testcase = testcase
+
+    def __enter__(self):
+        # Performs a gc if required (required if any memory is held)
+        caching_allocator_mem_allocated = torch.mps.current_allocated_memory()
+        if caching_allocator_mem_allocated > 0:
+            gc.collect()
+            torch.mps.empty_cache()
+
+        # Acquires caching allocator and driver statistics before the test is run
+        self.caching_allocator_before = torch.mps.current_allocated_memory()
+        self.driver_before = torch.mps.driver_allocated_memory()
+
+    def __exit__(self, exec_type, exec_value, traceback):
+        # Don't check for leaks if an exception was thrown
+        if exec_type is not None:
+            return
+        # Compares caching allocator before/after statistics
+        # An increase in allocated memory is a discrepancy indicating a possible memory leak
+        discrepancy_detected = False
+        caching_allocator_mem_allocated = torch.mps.current_allocated_memory()
+        if caching_allocator_mem_allocated > self.caching_allocator_before:
+            discrepancy_detected = True
+
+        # Short-circuits if no discrepancy detected
+        if not discrepancy_detected:
+            return
+        # Validates the discrepancy persists after garbage collection and
+        # is confirmed by the driver API
+        gc.collect()
+        torch.mps.empty_cache()
+
+        discrepancy_detected = True
+        # Query memory multiple items to ensure leak was not transient
+        for n in range(3):
+            caching_allocator_mem_allocated = torch.mps.current_allocated_memory()
+            driver_mem_allocated = torch.mps.driver_allocated_memory()
+
+            caching_allocator_discrepancy = False
+            driver_discrepancy = False
+
+            if caching_allocator_mem_allocated > self.caching_allocator_before:
+                caching_allocator_discrepancy = True
+
+            if driver_mem_allocated > self.driver_before:
+                driver_discrepancy = True
+
+            if not(caching_allocator_discrepancy or driver_discrepancy):
+                # Leak was false positive, exit loop
+                discrepancy_detected = False
+                break
+
+        if caching_allocator_discrepancy and not driver_discrepancy:
+            # Just raises a warning if the leak is not validated by the driver API
+            msg = ("MPS caching allocator reports a memory leak not "
+                   "verified by the driver API in {}! "
+                   "Caching allocator allocated memory was {} and is now reported as {}. "
+                   "MPS driver allocated memory was {} and is now {}.").format(
+                self.name, self.caching_allocator_before,
+                caching_allocator_mem_allocated, self.driver_before, driver_mem_allocated)
+            warnings.warn(msg)
+        elif caching_allocator_discrepancy and driver_discrepancy:
+            # A caching allocator discrepancy validated by the driver API is a failure
+            msg = ("MPS driver API confirmed a leak in {}! "
+                   "Caching allocator allocated memory was {} and is now reported as {}. "
+                   "MPS driver allocated memory was {} and is now {}.").format(
+                self.name, self.caching_allocator_before, caching_allocator_mem_allocated,
+                self.driver_before, driver_mem_allocated)
+
+            raise RuntimeError(msg)
+
+# Expand TestCase class with Memory Leak Detection on MPS device
+class TestCaseMPS(TestCase):
+    _do_mps_memory_leak_check = True
+
+    def __init__(self, method_name='runTest'):
+        super().__init__(method_name)
+        test_method = getattr(self, method_name, None)
+        if test_method is not None:
+            # Wraps the tested method if we should do MPS memory check.
+            if TEST_MPS_MEM_LEAK_CHECK:
+                if self._do_mps_memory_leak_check:
+                    self.wrap_with_mps_policy(method_name, self.assertLeaksNoMpsTensors)
+
+    def assertLeaksNoMpsTensors(self, name=None):
+        name = self.id() if name is None else name
+        return MpsMemoryLeakCheck(self, name)
+
+    def wrap_with_mps_policy(self, method_name, policy):
+        test_method = getattr(self, method_name)
+        setattr(self, method_name, super().wrap_method_with_policy(test_method, policy))
+
+    # checks for leaks even if TEST_MPS_MEM_LEAK_CHECK is 0
+    def wrap_with_mps_memory_check(self, method):
+        return super().wrap_method_with_policy(method, self.assertLeaksNoMpsTensors)
+
+class TestMemoryLeak(TestCaseMPS):
+    def test_mps_memory_leak_detection(self):
+        l = []
+
+        @self.wrap_with_mps_memory_check
+        def no_leak():
+            pass
+
+        # Trigger an intentional memory leak
+        @self.wrap_with_mps_memory_check
+        def leak_gpu0():
+            # increasing to 8MB to force acquiring a new block and overcome blocksize differences across platforms
+            l.append(torch.randn(1024 * 1024 * 8, device=torch.device("mps")))
+
+        no_leak()
+
+        # check if a runtime error for memory leak was emitted which would
+        # confirm whether memory leak detection worked successfully or not.
+        with self.assertRaisesRegex(RuntimeError, r"MPS driver API confirmed .+"):
+            leak_gpu0()
+
+class MPSReluTest(TestCaseMPS):
     def _npRelu(self, np_features):
         return np.maximum(np_features, np.zeros(np_features.shape)).astype(np_features.dtype)
 
@@ -113,7 +362,7 @@ def testNumbersGPU(self):
                 np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
                 device="mps")
 
-class MatmulTest(TestCase):
+class MatmulTest(TestCaseMPS):
     def _helper(self, shape_tensor_1, shape_tensor_2, expand_tensor_1_shape=None, expand_tensor_2_shape=None):
         if expand_tensor_1_shape:
             tensor1_mps = torch.randn(shape_tensor_1, device="mps").expand(expand_tensor_1_shape)
@@ -152,7 +401,7 @@ def test_batched_matrix_x_broadcasted_matrix(self):
         self._helper((10, 3, 4), (4, 5))
 
 
-class MPSLeakyReluTest(TestCase):
+class MPSLeakyReluTest(TestCaseMPS):
     def _npLeakyRelu(self, np_features, negative_slope=0.1):
         return np.maximum(np_features, negative_slope * np_features).astype(np_features.dtype)
 
@@ -189,7 +438,7 @@ def testNumbersCPU(self):
                 device="cpu")
 
 
-class TestAvgPool(TestCase):
+class TestAvgPool(TestCaseMPS):
     def _sum_pool2d(self, x, kernel_size):
         windows = torch.nn.functional.unfold(x, kernel_size=kernel_size, stride=kernel_size)
         return torch.sum(windows, dim=1)
@@ -239,7 +488,7 @@ def test_avg_pool2d_ceil_mode(self):
         self.assertTrue(not torch.isnan(y).any())
 
 
-class TestMPS(TestCase):
+class TestMPS(TestCaseMPS):
     def test_exp(self, device="mps", dtype=torch.float):
         for v in (2, -2) + ((1j, 1 + 1j) if dtype.is_complex else ()):
             b = torch.arange(18, device="cpu") / 3 * math.pi
@@ -251,6 +500,17 @@ def test_exp1(self, device="mps", dtype=torch.float):
         input = torch.tensor([-0.1, 3.0, -0.9]).to('mps')
         output = torch.exp(input).to('cpu')
 
+    def test_exp_strided_output(self):
+        x = torch.rand((256, 10), device='mps')
+        x_cpu = x.to("cpu")
+
+        x = x.permute(1, 0)
+        x_cpu = x_cpu.permute(1, 0)
+
+        res = x.exp()
+        res_cpu = x_cpu.exp()
+        self.assertEqual(res, res_cpu)
+
     def _testLeakyRelu(self, np_features, negative_slope, device):
         cpu_x = torch.from_numpy(np_features).requires_grad_()
         mps_x = torch.from_numpy(np_features).to('mps').requires_grad_()
@@ -290,6 +550,27 @@ def helper(val, shape):
         helper(0, [1024])
         helper(0.2, [2, 3])
 
+    def test_fill_storage_offset(self):
+        shape = [2, 10]
+        val = 0.2
+        tensor = torch.ones(shape, device="mps")
+        tensor_mps = tensor[:][1].fill_(val)
+        tensor_0 = torch.ones(shape, device="cpu")
+        tensor_cpu = tensor_0[:][1].fill_(val)
+
+        self.assertEqual(tensor_mps, tensor_cpu)
+
+        shape = [1, 10]
+        val = 0.0
+        tensor = torch.ones(shape, device="mps")
+        val_tensor_mps = torch.tensor(val, device="mps")
+        tensor_mps = tensor[:, 9].fill_(val_tensor_mps)
+        tensor_0 = torch.ones(shape, device="cpu")
+        val_tensor_cpu = torch.tensor(val, device="cpu")
+        tensor_cpu = tensor_0[:, 9].fill_(val_tensor_cpu)
+
+        self.assertEqual(tensor_mps, tensor_cpu)
+
     def test_cdist_large(self, device="mps"):
         for cm in ['use_mm_for_euclid_dist_if_necessary', 'use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:
             x = torch.randn(100, 10, device=device)
@@ -511,6 +792,13 @@ def test_bmm(self):
         self.assertEqual(output_cpu, output_mps)
         self.assertEqual(output_cpu.size(), output_mps.size())
 
+    def test_addr(self):
+        A = torch.ones(5, 10).to("mps")
+        B = torch.ones(5).to("mps")
+        C = torch.ones(10).to("mps")
+        D = torch.addr(A, B, C).to("cpu")
+        torch.testing.assert_close(D, torch.full((5, 10), 2.0))
+
     def test_trace(self):
         M_cpu = torch.randn(3, 3)
         M_mps = M_cpu.detach().clone().to("mps")
@@ -819,6 +1107,27 @@ def helper(size, memory_format):
 
         helper((2, 3, 6, 6), torch.contiguous_format)
 
+    def test_masked_scatter(self):
+        def helper(shape):
+            x_mps = torch.randn(shape, device="mps")
+            x_cpu = x_mps.detach().clone().cpu()
+
+            mask_mps = torch.rand(shape, device="mps") < 0.6
+            mask_cpu = mask_mps.detach().clone().cpu()
+
+            y_mps = torch.randn(shape, device="mps")
+            y_cpu = y_mps.detach().clone().cpu()
+
+            y_mps.masked_scatter_(mask_mps, x_mps)
+            y_cpu.masked_scatter_(mask_cpu, x_cpu)
+
+            self.assertEqual(y_mps, y_cpu)
+        helper([2, 5])
+        helper([10, 10])
+        helper([5, 10, 3])
+        helper([10, 5, 10, 3])
+        helper([10, 5, 10, 3, 20])
+
     def test_masked_fill(self):
         device = "mps"
         dtype = torch.float32
@@ -1505,6 +1814,7 @@ def helper(threshold, value, num_elems, inplace=False, requires_grad=True):
     # Test pow
     def test_pow(self):
         def helper(shape):
+            # aten::pow.Tensor_Tensor
             cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False)
             x = cpu_x.detach().clone().to('mps')
             cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False)
@@ -1514,6 +1824,7 @@ def helper(shape):
 
             self.assertEqual(z, ref_z)
 
+            # aten::pow.Tensor_Scalar
             cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False)
             x = cpu_x.detach().clone().to('mps')
             exp = random.random()
@@ -1522,6 +1833,15 @@ def helper(shape):
 
             self.assertEqual(z, ref_z)
 
+            # aten::pow.Scalar
+            x = random.random()
+            cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False)
+            y = cpu_y.detach().clone().to('mps')
+            z = torch.pow(x, y)
+            ref_z = torch.pow(x, cpu_y)
+
+            self.assertEqual(z, ref_z)
+
         helper((2, 8, 4, 5))
 
     # Test addcmul
@@ -1611,6 +1931,115 @@ def test_cpu_to_strided_mps_copy(self):
 
         self.assertEqual(a1, a2)
 
+    def test_view_slice_reshape(self):
+        x = torch.randn([1, 4, 4], device="mps")
+        y = x[0, :1, 1:]
+
+        x_cpu = x.to("cpu")
+        y_cpu = x_cpu[0, :1, 1:]
+
+        r = y + 1
+        r_cpu = y_cpu + 1
+        self.assertEqual(r, r_cpu)
+
+    def test_slice_reshape(self):
+        x = torch.randn([1, 6, 4, 2], dtype=torch.float, device="mps")
+        x_cpu = x.detach().clone().to("cpu")
+
+        x = x[:, 3:].view(2, 3, 4, 1)
+        x_cpu = x_cpu[:, 3:].view(2, 3, 4, 1)
+        self.assertEqual(x, x_cpu)
+
+        x = x + 2
+        x_cpu = x_cpu + 2
+        self.assertEqual(x, x_cpu)
+
+    def test_slice_casting(self):
+        # generate random binary numbers
+        cpu_in = torch.bernoulli(torch.empty(1, 1, 128, 128).uniform_(0, 1)).to(torch.uint8)
+        mps_in = cpu_in.detach().clone().to("mps")
+        # check copy_cast(unit8 -> bool) on tensors with storage offset
+        cpu_out = cpu_in[:, :, 11 : 12, :12].to(torch.bool)
+        mps_out = mps_in[:, :, 11 : 12, :12].to(torch.bool)
+        self.assertEqual(cpu_out, mps_out)
+
+    def test_slice_reshape_contg_view(self):
+        import torch
+
+        x_mps = torch.randn(1, 4800, 2, device="mps")
+        x_cpu = x_mps.detach().clone().cpu()
+
+        r_mps = x_mps + 2
+        r_cpu = x_cpu + 2
+
+        self.assertEqual(r_mps, r_cpu)
+
+    def test_contiguous_slice_2d(self):
+        def helper(shape):
+            for i in range(0, shape[0]):
+                for j in range(0, shape[1]):
+                    t_mps = torch.randn(shape, device="mps")
+                    t_cpu = t_mps.detach().clone().cpu()
+
+                    y_mps = t_mps[i:, :j]
+                    y_cpu = t_cpu[i:, :j]
+                    self.assertEqual(y_mps + 1, y_cpu + 1)
+
+                    y_mps = t_mps[i:, j]
+                    y_cpu = t_cpu[i:, j]
+                    self.assertEqual(y_mps + 1, y_cpu + 1)
+
+                    y_mps = t_mps[i, :j]
+                    y_cpu = t_cpu[i, :j]
+                    self.assertEqual(y_mps + 1, y_cpu + 1)
+
+                    y_mps = t_mps[:i, :j]
+                    y_cpu = t_cpu[:i, :j]
+                    self.assertEqual(y_mps + 1, y_cpu + 1)
+
+                    y_mps = t_mps[:i, j]
+                    y_cpu = t_cpu[:i, j]
+                    self.assertEqual(y_mps + 1, y_cpu + 1)
+
+                    y_mps = t_mps[:i, j:]
+                    y_cpu = t_cpu[:i, j:]
+                    self.assertEqual(y_mps + 1, y_cpu + 1)
+
+        l = []
+        for N in range(1, 3):
+            l.append(N)
+            for C in range(1, 3):
+                l.append(C)
+                helper(l)
+                for D in range(1, 3):
+                    l.append(D)
+                    helper(l)
+                    for H in range(1, 3):
+                        l.append(H)
+                        helper(l)
+                        for W in range(1, 3):
+                            l.append(W)
+                            helper(l)
+                            l.pop()
+                        l.pop()
+                    l.pop()
+                l.pop()
+            l.pop()
+
+        helper([9, 15, 4])
+        helper([9, 3, 2])
+        helper([3, 4, 18, 22])
+        helper([3, 4, 18, 22, 150])
+
+    def test_contiguous_slice_3d(self):
+        x = torch.randn(2, 3, 3, device="mps")
+        x_cpu = x.detach().clone().cpu()
+        x = x[:1]
+        x_cpu = x_cpu[:1]
+        out = x[:, 0:1, 0:1] * x[:, 1:2, 1:2]
+        out_cpu = x_cpu[:, 0:1, 0:1] * x_cpu[:, 1:2, 1:2]
+        self.assertEqual(out, out_cpu)
+
     def test_view_slice(self):
         # https://github.com/pytorch/pytorch/issues/83995
         NUM_SAMPLES = 60
@@ -1704,25 +2133,28 @@ def helper(operator):
             if operator == "<=":
                 res_mps = x_mps <= y_mps
                 res_cpu = x_cpu <= y_cpu
-            if operator == "<":
+            elif operator == "<":
                 res_mps = x_mps < y_mps
                 res_cpu = x_cpu < y_cpu
-            if operator == ">=":
+            elif operator == ">=":
                 res_mps = x_mps >= y_mps
                 res_cpu = x_cpu >= y_cpu
-            if operator == ">":
+            elif operator == ">":
                 res_mps = x_mps >= y_mps
                 res_cpu = x_cpu >= y_cpu
-            if operator == "==":
+            elif operator == "==":
                 res_mps = x_mps == y_mps
                 res_cpu = x_cpu == y_cpu
-            if operator == "!=":
+            elif operator == "!=":
                 res_mps = x_mps != y_mps
                 res_cpu = x_cpu != y_cpu
+            elif operator == "stack":
+                res_mps = torch.stack((y_mps, x_mps), dim=-1)
+                res_cpu = torch.stack((y_cpu, x_cpu), dim=-1)
 
             self.assertEqual(res_mps, res_cpu)
 
-        for op in ["<=", "<", ">=", ">", "==", "!="]:
+        for op in ["<=", "<", ">=", ">", "==", "!=", "stack"]:
             helper(op)
 
     def test_slice_of_slice(self):
@@ -1800,6 +2232,99 @@ def helper(shape, repeats):
         helper((3, 4, 5), (2, 3, 4, 5))
         helper((3, 4, 5), (2, 2, 2))
 
+    def test_torch_repeat_interleave(self, device="mps"):
+        y = torch.tensor([[1, 2], [3, 4]], device=device)
+        # exercise single argument function signature
+        temp = y.repeat_interleave(2)
+        self.assertEqual(torch.Size([8]), temp.size())
+
+        for dtype in [torch.int, torch.long]:
+            lengths = torch.tensor([1, 2], dtype=dtype, device="mps")
+            output_size = torch.sum(lengths)
+            a = torch.repeat_interleave(
+                y,
+                lengths,
+                dim=0,
+            )
+            self.assertEqual(a.dtype, y.dtype)
+            self.assertEqual(a.size(), torch.Size([3, 2]))
+
+            a_with_output = torch.repeat_interleave(
+                y,
+                lengths,
+                dim=0,
+                output_size=output_size,
+            )
+            self.assertEqual(a_with_output.dtype, y.dtype)
+            self.assertEqual(a_with_output.size(), torch.Size([3, 2]))
+
+    def test_repeat_interleave(self, device="mps"):
+        x = torch.tensor([0, 1, 2, 3], device=device)
+        expected = torch.tensor([1, 2, 2, 3, 3, 3], dtype=torch.int32, device=device)
+        self.assertEqual(torch.repeat_interleave(x), expected)
+
+        with self.assertRaises(RuntimeError):
+            torch.repeat_interleave(torch.arange(4, device=device).reshape(2, 2))
+
+        with self.assertRaises(RuntimeError):
+            torch.repeat_interleave(torch.arange(4.0, device=device))
+
+        with self.assertRaises(RuntimeError):
+            torch.repeat_interleave(torch.tensor([1, 2, -1, 3, 4], device=device))
+
+        y = torch.tensor([[1, 2], [3, 4]], device=device)
+
+        y1_v1 = torch.repeat_interleave(y, 2)
+        y1_v2 = torch.repeat_interleave(y, torch.tensor(2, device=device))
+        y1_v3 = torch.repeat_interleave(y, torch.tensor([2], device=device))
+        y1_expect = torch.tensor([1, 1, 2, 2, 3, 3, 4, 4], device=device)
+        self.assertEqual(y1_v1, y1_expect)
+        self.assertEqual(y1_v2, y1_expect)
+        self.assertEqual(y1_v3, y1_expect)
+
+        y2 = torch.repeat_interleave(y, 3, dim=1)
+        y2_expect = torch.tensor([[1, 1, 1, 2, 2, 2],
+                                  [3, 3, 3, 4, 4, 4]], device=device)
+        self.assertEqual(y2, y2_expect)
+
+        y3 = torch.repeat_interleave(y, torch.tensor([1, 2], device=device), dim=0)
+        y3_expect = torch.tensor([[1, 2],
+                                  [3, 4],
+                                  [3, 4]], device=device)
+        self.assertEqual(y3, y3_expect)
+
+        with self.assertRaises(RuntimeError):
+            torch.repeat_interleave(y, torch.tensor([1, 2, 3], device=device), dim=0)
+
+        with self.assertRaises(RuntimeError):
+            torch.repeat_interleave(y, torch.arange(9, device=device).reshape(3, 3), dim=0)
+
+        # test zero sized dimension
+        x = torch.zeros((5, 0), device=device)
+        y = torch.repeat_interleave(x, repeats=3, dim=1)
+        self.assertEqual(y, x.new_zeros(5, 0, device=device))
+
+        x = torch.tensor([], dtype=torch.int64, device=device)
+        y = torch.repeat_interleave(x, x)
+        self.assertEqual(y, x)
+
+    def test_repeat_interleave_simple(self):
+        def helper(shape, dtype=torch.float32, num_repeats=torch.Tensor(), dim=None):
+            x = torch.randn(shape, dtype=dtype, device="mps")
+            x_cpu = x.detach().clone().cpu()
+
+            num_repeats_cpu = num_repeats.detach().clone().cpu()
+
+            repeats = torch.repeat_interleave(x, num_repeats, dim)
+            repeats_cpu = torch.repeat_interleave(x_cpu, num_repeats_cpu, dim)
+
+            self.assertEqual(repeats, repeats_cpu)
+        helper(shape=3, num_repeats=torch.tensor([100], device="mps"))
+        helper(shape=(2, 2), num_repeats=torch.tensor([3, 3], device="mps"), dim=0)
+        helper(shape=(10, 15, 8), num_repeats=torch.arange(10, device="mps"), dim=0)
+        helper(shape=(10, 15, 8), num_repeats=torch.randint(0, 100, (15, ), device="mps"), dim=1)
+        helper(shape=(10, 15, 30), num_repeats=torch.randint(0, 100, (30, ), device="mps"), dim=2)
+
     def test_count_nonzero(self):
         def helper(dtype):
             n = [
@@ -1962,12 +2487,14 @@ def test_full_bugs(self):
         y_cpu = torch.full((2, 2), 247, device='cpu', dtype=torch.uint8)
         self.assertEqual(y_mps, y_cpu)
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     # See https://github.com/pytorch/pytorch/issues/84995
     def test_div_bugs(self):
         for (dtype, mode) in itertools.product(integral_types(), ['trunc', 'floor']):
-            x = torch.tensor(list(range(1, 11)), device='mps', dtype=dtype)
-            y = torch.div(x, 101, rounding_mode=mode)
-            self.assertEqual(y.sum(), 0)
+            if dtype != torch.int64:
+                x = torch.tensor(list(range(1, 11)), device='mps', dtype=dtype)
+                y = torch.div(x, 101, rounding_mode=mode)
+                self.assertEqual(y.sum(), 0)
 
     # See https://github.com/pytorch/pytorch/issues/82663
     def test_bool_expand(self):
@@ -2133,16 +2660,25 @@ def helper(x, dim, return_inverse, return_counts):
 
     # See https://github.com/pytorch/pytorch/issues/85675
     def test_cat_non_contiguous(self):
-        def rotate_subset(data):
-            return torch.concat([data[:, :2], torch.rot90(data[:, 2:])])
+        def rotate_subset(data, dim):
+            x1 = data[:, :, :2, :]
+            x2 = data[:, :, 2:, :]
+            self.assertFalse(x1.is_contiguous())
+            self.assertFalse(x2.is_contiguous())
+            return torch.concat((x1, x2), dim=dim)
         for dtype in MPS_DTYPES:
             if dtype == torch.bool:
                 continue
-            data = torch.arange(8, dtype=dtype).reshape(2, 4)
+            data = torch.arange(48, dtype=dtype).reshape(1, 2, 4, 6)
+            data = data.to(memory_format=torch.channels_last)
             mps_data = data.to("mps")
-            cpu_result = rotate_subset(data)
-            mps_result = rotate_subset(mps_data)
-            self.assertEqual(cpu_result, mps_result.to("cpu"))
+            self.assertEqual(data, mps_data)
+            for dim in range(data.dim()):
+                cpu_result = rotate_subset(data, dim)
+                mps_result = rotate_subset(mps_data, dim)
+                self.assertEqual(cpu_result, mps_result.to("cpu"))
+                # TODO: enable memory format test
+                # self.assertEqual(cpu_result.is_contiguous(), mps_result.is_contiguous())
 
     # See https://github.com/pytorch/pytorch/issues/85967
     def test_from_numpy_non_contiguous(self):
@@ -2171,6 +2707,16 @@ def test_copy_non_contiguous(self):
         y.permute(3, 2, 1, 0)[1::, ::2] = z
         self.assertEqual(x, y.to('cpu'))
 
+    # See https://github.com/pytorch/pytorch/issues/95417
+    def test_copy_storage_offset(self):
+        x_cpu = torch.zeros(5, device="cpu", dtype=torch.float32)
+        x_mps = torch.zeros(5, device="mps", dtype=torch.float32)
+        update_cpu = torch.tensor([1, 1], device="cpu", dtype=torch.int64)
+        update_mps = torch.tensor([1, 1], device="mps", dtype=torch.int64)
+        x_cpu[2:4] = update_cpu
+        x_mps[2:4] = update_mps  # implicit type casting and copy
+        self.assertEqual(x_cpu, x_mps)
+
     # See https://github.com/pytorch/pytorch/pull/84742
     # and https://github.com/pytorch/pytorch/pull/78319
     def test_binops_dtype_precedence(self):
@@ -2234,8 +2780,97 @@ def test_binops_dtype_precedence(self):
                     getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop)
                            (torch.full(full_shape, val2, dtype=dtype2, device='cpu')))
 
+    def test_nansum(self):
+        def helper(dtype, noncontiguous, dim):
+            zero_cpu = torch.zeros((), dtype=dtype)
+
+            # Randomly scale the values
+            scale = random.randint(10, 100)
+            x_cpu: torch.Tensor = make_tensor(
+                (5, 5), dtype=dtype, device='cpu',
+                low=-scale, high=scale, noncontiguous=noncontiguous)
+
+            if dtype.is_floating_point:
+                nan_mask_cpu = x_cpu < (0.2 * scale)
+                x_no_nan_cpu = torch.where(nan_mask_cpu, zero_cpu, x_cpu)
+                x_cpu[nan_mask_cpu] = np.nan
+            else:
+                x_no_nan_cpu = x_cpu
+
+            x_mps = x_cpu.to('mps')
+            actual_out_mps = torch.empty(0, dtype=dtype, device='mps')
+            expect_out_cpu = torch.empty(0, dtype=dtype)
+            dim_kwargs = {"dim": dim} if dim is not None else {}
+            expect = torch.sum(x_no_nan_cpu, **dim_kwargs)
+
+            actual_cpu = torch.nansum(x_cpu, **dim_kwargs)
+            # Sanity check on CPU
+            self.assertEqual(expect, actual_cpu)
+
+            # Test MPS
+            actual_mps = torch.nansum(x_mps, **dim_kwargs)
+            # Test out= variant
+            torch.nansum(x_mps, out=actual_out_mps, **dim_kwargs)
+            torch.nansum(x_cpu, out=expect_out_cpu, **dim_kwargs)
+            self.assertEqual(expect, actual_mps)
+            self.assertEqual(expect_out_cpu, actual_out_mps)
+
+        args = itertools.product(
+            (torch.float16, torch.float32, torch.int32, torch.int64),   # dtype
+            (True, False),                                              # noncontiguous
+            (0, 1, None),                                               # dim
+        )
+
+        for dtype, noncontiguous, dim in args:
+            with self.subTest(dtype=dtype, noncontiguous=noncontiguous, dim=dim):
+                helper(dtype, noncontiguous, dim)
+
+    def test_cumsum_all_dtypes(self):
+        def helper(dtype):
+            t = torch.tensor([1, 1, 1, 1], device="mps", dtype=dtype)
+            t_cpu = torch.tensor([1, 1, 1, 1], device="cpu")
+
+            a = t.cumsum(0, dtype=dtype)
+            a_cpu = t_cpu.cumsum(0, dtype=dtype)
+
+            self.assertEqual(a.cpu(), a_cpu)
+        [helper(dtype) for dtype in [torch.int8, torch.int16, torch.int32, torch.float32]]
+
+        try:
+            helper(torch.int64)
+        except Exception as e:
+            e_string = str(e)
+            self.assertEqual(e_string, "MPS does not support cumsum op with int64 input")
+
+    def test_cumsum_minus_one_axis(self):
+        def helper(dtype):
+            # Test with axis -1
+            cpu_x = None
+            if(dtype == torch.float32):
+                cpu_x = torch.randn(10, 3, device='cpu', dtype=torch.float32)
+            else:
+                cpu_x = torch.randint(0, 20, (10, 3), device='cpu', dtype=torch.float32)
+            x = cpu_x.detach().clone().to('mps')
 
-class TestLogical(TestCase):
+            cpu_y = cpu_x.cumsum(-1)
+            y = x.cumsum(-1)
+
+            self.assertEqual(y, cpu_y)
+
+        [helper(dtype) for dtype in [torch.float32, torch.int16, torch.int32, torch.uint8]]
+
+    def test_median_int16(self):
+        def helper(shape, dtype):
+            cpu_x = torch.randint(-9999, 9999, shape, device='cpu', dtype=dtype)
+            x = cpu_x.detach().clone().to('mps')
+
+            median_result = torch.median(x)
+            median_result_cpu = torch.median(cpu_x)
+            self.assertEqual(median_result, median_result_cpu)
+
+        helper((2, 8, 4, 5), torch.int16)
+
+class TestLogical(TestCaseMPS):
     def _wrap_tensor(self, x, device="cpu", dtype=None, requires_grad=False):
         return torch.tensor(x, device=device, dtype=dtype, requires_grad=requires_grad)
 
@@ -2347,7 +2982,7 @@ def helper(dtype):
 
         [helper(dtype) for dtype in [torch.float32, torch.float16, torch.int32, torch.int16, torch.uint8, torch.int8, torch.bool]]
 
-class TestSmoothL1Loss(TestCase):
+class TestSmoothL1Loss(TestCaseMPS):
 
     def _smooth_l1_loss_helper(self, reduction="mean", requires_grad=False):
         # CPU
@@ -2386,7 +3021,7 @@ def test_smooth_l1_loss_reduction_mean_sum_backward(self):
         self._smooth_l1_loss_helper(reduction="sum", requires_grad=True)
 
 
-class TestNLLLoss(TestCase):
+class TestNLLLoss(TestCaseMPS):
     def test_nll_loss_mismatched_batch(self, device='mps'):
         x = torch.randn((10, 3), requires_grad=True, device=device)
         # t should have size (10,)
@@ -2449,13 +3084,15 @@ def _nll_loss_helper(self, input_size, reduction, expected):
         num_channels = input_size[1]
         target_size = (input_size[0], ) + tuple(input_size[2:])
         target = torch.randint(num_channels, target_size, device='cpu')
+        weights = torch.randn(num_channels)
 
         # MPS
         input_mps = input.detach().clone().to('mps').requires_grad_()
         target_mps = target.detach().clone().to('mps')
+        weights_mps = weights.to("mps")
 
-        output_cpu = F.nll_loss(input, target, reduction=reduction)
-        output_mps = F.nll_loss(input_mps, target_mps, reduction=reduction)
+        output_cpu = F.nll_loss(input, target, weight=weights, reduction=reduction)
+        output_mps = F.nll_loss(input_mps, target_mps, weight=weights_mps, reduction=reduction)
         self.assertEqual(output_cpu, output_mps.to('cpu'))
 
         output_cpu.sum().backward()
@@ -2977,6 +3614,26 @@ def test_log_softmax(self):
 
         self.assertEqual(cpu_x.grad, mps_x.grad.to('cpu'))
 
+    def test_log_softmax_large_numbers(self):
+        values = [
+            [10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0],
+            [-10.0, -100.0, -1000.0, -10000.0, -100000.0, -1000000.0]
+        ]
+        cpu_x = torch.tensor(values, device='cpu', requires_grad=True)
+        mps_x = torch.tensor(values, device='mps', requires_grad=True)
+
+        cpu_log_softmax = F.log_softmax(cpu_x, dim=-1)
+        mps_log_softmax = F.log_softmax(mps_x, dim=-1)
+        self.assertEqual(cpu_log_softmax, mps_log_softmax.to('cpu'))
+
+        cpu_grad = torch.ones_like(cpu_log_softmax)
+        mps_grad = torch.ones_like(cpu_log_softmax).to('mps')
+
+        cpu_log_softmax.backward(gradient=cpu_grad)
+        mps_log_softmax.backward(gradient=mps_grad)
+
+        self.assertEqual(cpu_x.grad, mps_x.grad.to('cpu'))
+
     def test_eq(self):
         values1 = [[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]]
         values2 = [[[1.0, 2.0, 15.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [0.0, 11.0, 12.0]]]
@@ -2989,6 +3646,7 @@ def test_eq(self):
 
         self.assertEqual(result_cpu, result_mps.to('cpu'))
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     def test_signed_vs_unsigned_comparison(self):
         cpu_x = torch.tensor((-1, 2, 3), device='cpu', dtype=torch.uint8)
         mps_x = torch.tensor((-1, 2, 3), device='mps', dtype=torch.uint8)
@@ -4011,27 +4669,28 @@ def helper(n, c, h, w):
     def test_divmode(self):
         def helper(shape, rounding_mode):
             for dtype in [torch.float32, torch.float16, torch.int32, torch.int64]:
-                cpu_x = None
-                cpu_y = None
-                if (dtype in [torch.float32, torch.float16]):
-                    cpu_x = torch.randn(shape, device='cpu', dtype=dtype, requires_grad=False)
-                    cpu_y = torch.randn(shape, device='cpu', dtype=dtype, requires_grad=False)
-                else:
-                    cpu_x = torch.randint(-10, 0, shape, device='cpu', dtype=dtype, requires_grad=False)
-                    cpu_y = torch.randint(-10, 0, shape, device='cpu', dtype=dtype, requires_grad=False)
-
-                mps_x = cpu_x.detach().clone().to('mps')
-                # clamp to avoid division by 0
-                mps_y = cpu_y.detach().clone().to('mps')
-
-                if (rounding_mode == "floor_divide"):
-                    result_div_cpu = torch.floor_divide(cpu_x, cpu_y)
-                    result_div_mps = torch.floor_divide(mps_x, mps_y)
-                    self.assertEqual(result_div_mps, result_div_cpu)
-                else:
-                    result_div_cpu = torch.div(cpu_x, cpu_y, rounding_mode=rounding_mode)
-                    result_div_mps = torch.div(mps_x, mps_y, rounding_mode=rounding_mode)
-                    self.assertEqual(result_div_mps, result_div_cpu)
+                if (rounding_mode is not None and "floor" in rounding_mode and dtype == torch.int64) is False:
+                    cpu_x = None
+                    cpu_y = None
+                    if (dtype in [torch.float32, torch.float16]):
+                        cpu_x = torch.randn(shape, device='cpu', dtype=dtype, requires_grad=False)
+                        cpu_y = torch.randn(shape, device='cpu', dtype=dtype, requires_grad=False)
+                    else:
+                        cpu_x = torch.randint(-10, 0, shape, device='cpu', dtype=dtype, requires_grad=False)
+                        cpu_y = torch.randint(-10, 0, shape, device='cpu', dtype=dtype, requires_grad=False)
+
+                    mps_x = cpu_x.detach().clone().to('mps')
+                    # clamp to avoid division by 0
+                    mps_y = cpu_y.detach().clone().to('mps')
+
+                    if (rounding_mode == "floor_divide"):
+                        result_div_cpu = torch.floor_divide(cpu_x, cpu_y)
+                        result_div_mps = torch.floor_divide(mps_x, mps_y)
+                        self.assertEqual(result_div_mps, result_div_cpu)
+                    else:
+                        result_div_cpu = torch.div(cpu_x, cpu_y, rounding_mode=rounding_mode)
+                        result_div_mps = torch.div(mps_x, mps_y, rounding_mode=rounding_mode)
+                        self.assertEqual(result_div_mps, result_div_cpu)
 
         helper((2, 8, 4, 5), None)
         helper((2, 8, 4, 5), "floor")
@@ -4062,6 +4721,19 @@ def helper(shape):
         helper((2, 6, 3, 5))
         helper((2, 8, 4, 5))
 
+    def test_remainder(self):
+        res_cpu = torch.remainder(
+            torch.tensor([-3, -2, -1, 1, 2, 3], dtype=torch.int32, device="cpu"), torch.tensor(2, device="cpu", dtype=torch.int32))
+        res_mps = torch.remainder(
+            torch.tensor([-3, -2, -1, 1, 2, 3], dtype=torch.int32, device="mps"), torch.tensor(2, device="mps", dtype=torch.int32))
+        self.assertEqual(res_cpu, res_mps)
+
+        res_cpu = torch.remainder(
+            torch.tensor([1, 2, 3, 4, 5], dtype=torch.int32, device="cpu"), -1.5)
+        res_mps = torch.remainder(
+            torch.tensor([1, 2, 3, 4, 5], dtype=torch.int32, device="mps"), -1.5)
+        self.assertEqual(res_cpu, res_mps)
+
     def test_expand(self):
         def helper(n, c):
             values = [[1.0], [4.0], [7.0]]
@@ -4075,6 +4747,13 @@ def helper(n, c):
 
         helper(3, 1)
 
+    def test_im2col(self):
+        def helper(x):
+            return torch.nn.functional.unfold(x, kernel_size=(10, 15), dilation=2, padding=5, stride=3)
+        x_cpu = torch.rand(1, 1, 200, 100)
+        x = x_cpu.detach().clone().to('mps')
+        self.assertEqual(helper(x_cpu), helper(x))
+
     def test_select(self):
         def helper(n, c):
             cpu_x = torch.randn(n, c, device='cpu', dtype=torch.float, requires_grad=True)
@@ -4095,16 +4774,6 @@ def helper(n, c):
 
         helper(3, 3)
 
-    def test_assert_topk(self):
-        # here the k > 16 raises an error as expected
-        with self.assertRaisesRegex(RuntimeError, "Currently topk on mps works only for k<=16"):
-            xs = torch.arange(30).to('mps')
-            xs.topk(30)
-        # for k <= 16 it works fine
-        ys_cpu = torch.arange(30)
-        ys_mps = ys_cpu.to('mps')
-        self.assertEqual(ys_cpu.topk(16), ys_mps.topk(16))
-
     def test_topk(self):
         def helper(shape):
             cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False)
@@ -4129,11 +4798,32 @@ def helper(shape):
         helper((5, 1))
         helper((1, 5))
         helper((5, 9, 7, 4))
+        helper((50, 20, 7, 4))
+
+    def test_sort(self):
+        for SIZE in (4, 2049):
+            device = 'mps'
+            x = torch.rand(4, SIZE, device=device)
+            res1val, res1ind = torch.sort(x)
+
+            res2val = torch.tensor((), device=device)
+            res2ind = torch.tensor((), device=device, dtype=torch.long)
+            torch.sort(x, out=(res2val, res2ind))
+            self.assertEqual(res1val, res2val, atol=0, rtol=0)
+            self.assertEqual(res1ind, res2ind, atol=0, rtol=0)
+            self.assertEqual(torch.argsort(x), res1ind)
+            self.assertEqual(x.argsort(), res1ind)
+
+            self.assertEqual(
+                torch.sort(torch.tensor((50, 40, 30, 20, 10), device=device))[0],
+                torch.tensor((10, 20, 30, 40, 50), device=device),
+                atol=0, rtol=0
+            )
 
     def test_upsample_nearest2d(self):
-        def helper(N, C, H, W):
+        def helper(N, C, H, W, memory_format):
             inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float,
-                                    requires_grad=True).reshape(N, C, H, W)
+                                    requires_grad=True).reshape(N, C, H, W).to(memory_format=memory_format)
             inputCPU.retain_grad()
             inputMPS = inputCPU.detach().to('mps').requires_grad_()
 
@@ -4159,8 +4849,9 @@ def helper(N, C, H, W):
 
                     self.assertEqual(inputCPU.grad, inputMPS.grad)
 
-        helper(1, 1, 4, 4)
-        helper(7, 5, 3, 2)
+        for memory_format in [torch.channels_last, torch.contiguous_format]:
+            helper(1, 1, 4, 4, memory_format=memory_format)
+            helper(7, 5, 3, 2, memory_format=memory_format)
 
     def test_upsample_bilinear2d(self):
         def helper(N, C, H, W):
@@ -4602,10 +5293,11 @@ def helper(shape):
 
     # Test selu, elu, celu
     def test_elu(self):
-        def helper(shape, alpha=1.0):
-            cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
-            x = cpu_x.detach().clone().to('mps').requires_grad_()
+        def helper(shape, alpha=1.0, memory_format=torch.contiguous_format):
+            cpu_x = torch.randn(shape, device='cpu', dtype=torch.float)
+            cpu_x = cpu_x.to(memory_format=memory_format).requires_grad_()
 
+            x = cpu_x.detach().clone().to('mps').requires_grad_(True)
             for activation_func in [torch.nn.ELU(alpha=alpha), torch.nn.CELU(alpha=alpha), torch.nn.SELU()]:
                 elu_result = activation_func(x)
                 elu_result_cpu = activation_func(cpu_x)
@@ -4620,9 +5312,10 @@ def helper(shape, alpha=1.0):
                 self.assertEqual(x.grad, cpu_x.grad)
 
         # Test empty shape too
-        for shape in [[], (2, 3), (2, 8, 4, 5)]:
-            for alpha in [0.000001, 1.0, 2.3, 0.34, 23]:
-                helper(shape, alpha)
+        for memory_fromat in [torch.channels_last, torch.contiguous_format]:
+            for shape in [(2, 8, 4, 5)]:
+                for alpha in [0.000001, 1.0, 2.3, 0.34, 23]:
+                    helper(shape, alpha, memory_fromat)
 
     # Test glu
     def test_glu(self):
@@ -4649,7 +5342,7 @@ def helper(shape, dim=0):
 
     # Test softplus
     def test_softplus(self):
-        def helper(shape, beta=0.5, threshold=0.5):
+        def helper(shape, beta=1, threshold=20):
             cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
             x = cpu_x.detach().clone().to('mps').requires_grad_()
 
@@ -4667,9 +5360,9 @@ def helper(shape, beta=0.5, threshold=0.5):
 
         # Test empty shape too
         for shape in [(), (2, 3), (10, 10), (2, 3, 4, 5)]:
-            helper(shape)
-            helper(shape, beta=0.6, threshold=0.6)  # relu path
-            helper(shape, beta=1, threshold=20)  # softplus path
+            for beta in [0.5, 1, 2, 3, 4]:
+                for threshold in [0.5, 20, 30, 40, 50]:
+                    helper(shape, beta, threshold)
 
     # Test silu
 
@@ -4717,6 +5410,19 @@ def helper(src_dtype, dst_dtype):
         helper(torch.half, torch.long)
         helper(torch.float, torch.int)
 
+    def test_avg_pool2d_count_include_pad(self):
+        cpu_x = torch.randn((1, 3, 9, 9), device='cpu', dtype=torch.float, requires_grad=True)
+        x = cpu_x.detach().clone().to('mps').requires_grad_()
+        pool = torch.nn.AvgPool2d(kernel_size=(3, 3), padding=(1, 1), stride=(1, 1), ceil_mode=True, count_include_pad=True)
+        ref_y = pool(cpu_x)
+        y = pool(x)
+        self.assertEqual(y, ref_y)
+        cpu_grad = torch.randn(ref_y.shape)
+        grad = cpu_grad.to('mps')
+        ref_y.backward(gradient=cpu_grad)
+        y.backward(gradient=grad)
+        self.assertEqual(x.grad, cpu_x.grad)
+
     # Test adaptive avg pool2d - when the input size is a multiple of output size
     # Not testing for channels last right now
     def test_adaptive_avg_pool2d_simple(self):
@@ -4864,6 +5570,17 @@ def _gelu_ref(X):
         finally:
             torch.set_num_threads(num_threads)
 
+    def test_gelu_tanh(self):
+        def helper(shape):
+            cpu_x = torch.randn(shape, device='cpu', dtype=torch.float)
+            x = cpu_x.detach().clone().to('mps')
+
+            gelu_tanh_result = torch.nn.functional.gelu(x, approximate='tanh')
+            gelu_tanh_result_cpu = torch.nn.functional.gelu(cpu_x, approximate='tanh')
+            self.assertEqual(gelu_tanh_result, gelu_tanh_result_cpu)
+
+        helper((2, 8, 4, 5))
+
     # Test hardtanh
     def test_hardtanh(self):
         def helper(shape, min_val, max_val, inplace=False):
@@ -5109,6 +5826,7 @@ def helper(shape, dim, index, idx_dtype=torch.int32):
         helper((2, 8, 4, 5), 3, [2, 3, 0])
         helper((2, 3, 3), -1, [1, 2])
         helper((), 0, [0])
+        helper((5), 0, [])
 
     def test_index_select_scalar(self):
         def helper(value, dim, index, idx_dtype=torch.int32):
@@ -5123,7 +5841,7 @@ def helper(value, dim, index, idx_dtype=torch.int32):
 
             self.assertEqual(idx_result, idx_result_cpu)
 
-        helper(0.5, 0, [0, 0])
+        helper(22, 0, [])
 
     def test_embedding_dense_backward(self):
         def helper(n, d, m, idx):
@@ -5339,22 +6057,22 @@ def helper(shape, dim, idx_shape, src_shape, idx_dtype=torch.int64, reduce_str="
             self.assertEqual(scatter_result, scatter_result_cpu)
 
         # for reduce in ["sum", "prod", "amax", "amin"]:
-        for reduce in ["add", "multiply"]:
-            helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce)
-            helper((2, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce)
-            helper((8, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce)
-            helper((8, 8, 4, 5), 0, (4, 7, 3, 2), (4, 7, 3, 2), reduce_str=reduce)
-            helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (4, 7, 3, 2), reduce_str=reduce)
-            helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (8, 8, 4, 5), reduce_str=reduce)
-
-            helper((2, 8, 4, 5), 1, (2, 20, 4, 5), (2, 20, 4, 5), reduce_str=reduce)
-            helper((2, 8, 4, 5), 1, (2, 13, 3, 2), (2, 13, 3, 2), reduce_str=reduce)
-            helper((8, 8, 4, 5), 1, (6, 5, 2, 3), (6, 5, 2, 3), reduce_str=reduce)
-            helper((8, 8, 4, 5), 1, (3, 4, 2, 2), (6, 5, 2, 3), reduce_str=reduce)
-
-            helper((4, 5, 9, 8), 2, (4, 5, 13, 8), (4, 5, 13, 8), reduce_str=reduce)
-            helper((4, 5, 9, 8), 2, (3, 4, 10, 6), (3, 4, 10, 6), reduce_str=reduce)
-            helper((4, 5, 9, 8), 2, (3, 3, 7, 5), (3, 4, 10, 6), reduce_str=reduce)
+        for reduce_type in ["add", "multiply"]:
+            helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce_type)
+            helper((2, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type)
+            helper((8, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type)
+            helper((8, 8, 4, 5), 0, (4, 7, 3, 2), (4, 7, 3, 2), reduce_str=reduce_type)
+            helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (4, 7, 3, 2), reduce_str=reduce_type)
+            helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (8, 8, 4, 5), reduce_str=reduce_type)
+
+            helper((2, 8, 4, 5), 1, (2, 20, 4, 5), (2, 20, 4, 5), reduce_str=reduce_type)
+            helper((2, 8, 4, 5), 1, (2, 13, 3, 2), (2, 13, 3, 2), reduce_str=reduce_type)
+            helper((8, 8, 4, 5), 1, (6, 5, 2, 3), (6, 5, 2, 3), reduce_str=reduce_type)
+            helper((8, 8, 4, 5), 1, (3, 4, 2, 2), (6, 5, 2, 3), reduce_str=reduce_type)
+
+            helper((4, 5, 9, 8), 2, (4, 5, 13, 8), (4, 5, 13, 8), reduce_str=reduce_type)
+            helper((4, 5, 9, 8), 2, (3, 4, 10, 6), (3, 4, 10, 6), reduce_str=reduce_type)
+            helper((4, 5, 9, 8), 2, (3, 3, 7, 5), (3, 4, 10, 6), reduce_str=reduce_type)
 
     def test_is_nonzero(self):
         self.assertFalse(torch.is_nonzero(torch.tensor([0.]).to('mps')))
@@ -5494,6 +6212,21 @@ def test_arange(self):
         self.assertEqual(np.arange(1, 2, .3, dtype=np.float32), torch.arange(1, 2, .3, device='mps'))
         self.assertEqual(np.arange(6.3, dtype=np.float32), torch.arange(6.3, device='mps'))
 
+    def test_arange_empty(self):
+        out_mps = torch.tensor([], device="mps")
+        out_cpu = torch.tensor([], device="cpu")
+
+        y_mps = torch.arange(0, 0, 1, out=out_mps)
+        y_cpu = torch.arange(0, 0, 1, out=out_cpu)
+        self.assertEqual(y_mps, y_cpu)
+
+    # Test rgange
+    def test_range(self):
+        self.assertEqual(np.arange(11, dtype=np.float32), torch.range(0, 10, device='mps'))
+        self.assertEqual(np.arange(7, 0, -1, dtype=np.float32), torch.range(7, 1, -1, device='mps'))
+        self.assertEqual(np.array([1.0000, 1.3000, 1.6000, 1.9000], dtype=np.float32), torch.range(1, 2, .3, device='mps'))
+        self.assertEqual(np.arange(6.3, dtype=np.float32), torch.arange(0, 6.3, device='mps'))
+
     # Test softmax
     def test_softmax(self):
         def helper(shape, dim, channels_last=False):
@@ -5692,6 +6425,66 @@ def test_mps_generator(self):
         mps_x = torch.randn(5, device='mps', generator=g_mps)
         self.assertEqual(mps_x, mps_y)
 
+    def test_default_mps_generator(self):
+        # manual seeding on the "default" MPS generator using
+        # the global torch.manual_seed()
+        torch.manual_seed(230)
+        mps_x = torch.randn(5, device='mps')
+        # manual seeding using torch.mps.manual_seed()
+        # which should set the "default" MPS generator
+        # like the global torch.manual_seed()
+        torch.mps.manual_seed(230)
+        mps_y = torch.randn(5, device='mps')
+        # seed values were the same, so the random tensor contents should match
+        self.assertEqual(mps_x, mps_y)
+
+        # save the default generator's state to restore it later
+        g_state = torch.mps.get_rng_state()
+
+        # generate random numbers without seeding
+        mps_x = torch.randn(5, device='mps')
+        # in this case, the random results must differ from the last generated random results
+        self.assertNotEqual(mps_x, mps_y)
+
+        # restore the previously saved state, and the results should match again
+        torch.mps.set_rng_state(g_state)
+        mps_x = torch.randn(5, device='mps')
+        self.assertEqual(mps_x, mps_y)
+
+    def test_device_synchronize(self):
+        # just running some ops each followed by a synchronize to wait for
+        # MPS stream to finish running each of them
+        net1 = torch.nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)\
+            .to(device='mps', dtype=torch.float)
+
+        x = torch.rand(1, 128, 6, 6, device='mps', dtype=torch.float, requires_grad=True)
+        torch.mps.synchronize()
+        x = net1(x)
+        torch.mps.synchronize()
+        x.backward(torch.randn_like(x))
+        torch.mps.synchronize()
+
+    def test_mps_allocator_module(self):
+        # first garbage collect and empty the cached blocks
+        gc.collect()
+        torch.mps.empty_cache()
+        # measure memory allocations from MPSAllocator
+        current_alloc_before = torch.mps.current_allocated_memory()
+        # after garbage collection and emptying the cache the
+        # current_allocated_memory must be zero
+        self.assertTrue(current_alloc_before == 0)
+        # measure total memory allocations from Metal driver
+        driver_alloc_before = torch.mps.driver_allocated_memory()
+        # allocate a new 8 MB tensor to force allocation of a new Metal Heap
+        x = torch.ones(1024 * 1024 * 8, device="mps")
+        # get memory allocations after allocating tensor x
+        current_alloc_after = torch.mps.current_allocated_memory()
+        driver_alloc_after = torch.mps.driver_allocated_memory()
+        # current and driver memory allocations must have
+        # grown at this point
+        self.assertTrue(current_alloc_after > current_alloc_before)
+        self.assertTrue(driver_alloc_after > driver_alloc_before)
+
     # Test random_.to and random_.from
     def test_random(self):
         def helper(shape, low, high, dtype=torch.int32):
@@ -5858,18 +6651,65 @@ def helper(probs, compare_mean, compare_var, num_samples=5, replacement=True):
         helper(np.array([1, 1, 1, 1, 1]), (0 + 1 + 2 + 3 + 4) / 5, (6 - 2 * 2), 10000)
         helper(np.array([[1, 1, 1, 1, 1, 1, 1]]), 0, 0, 7, False)
 
+    def test_cumsum_dim_check(self):
+        x = torch.rand((3, 3), device="mps")
+        self.assertEqual(x.cumsum(1), x.cumsum(-1))
+        self.assertEqual(x.cumsum(0), x.cumsum(-2))
+        self.assertRaises(IndexError, lambda: x.cumsum(2))
+        self.assertRaises(IndexError, lambda: x.cumsum(-3))
+
+
+class TestTopK(TestCase):
+    def _test_topk(self, shape, largest):
+        cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False)
+        x = cpu_x.detach().clone().to('mps')
+        if isinstance(shape, tuple):
+            for curr_dim, dim_size in enumerate(shape):
+                for k in range(1, dim_size + 1):
+                    topk_values, topk_indices = torch.topk(x, k, dim=curr_dim, largest=largest)
+                    topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=curr_dim, largest=largest)
+                    self.assertEqual(topk_values, topk_values_cpu)
+                    self.assertEqual(topk_indices, topk_indices_cpu)
+        else:
+            for k in range(1, shape):
+                topk_values, topk_indices = torch.topk(x, k, dim=0, largest=largest)
+                topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=0, largest=largest)
+                self.assertEqual(topk_values, topk_values_cpu)
+                self.assertEqual(topk_indices, topk_indices_cpu)
+
+    def test_topk(self):
+        largest_vals = [True, False]
+        shapes = [
+            # Zero Element Tensors
+            0,
+            (1, 0),
+            (0, 1),
+            (1, 0, 1),
+            # Multiple Element Tensors
+            1,
+            2,
+            (5, 1),
+            (1, 5),
+            (5, 9, 7, 4),
+        ]
+
+        for shape in shapes:
+            for largest_val in largest_vals:
+                with self.subTest(shape=shape, largest_val=largest_val):
+                    self._test_topk(shape, largest_val)
+
 class TestNNMPS(NNTestCase):
 
     def _create_basic_net(self):
         class Layer(nn.Module):
             def __init__(self):
-                super(Layer, self).__init__()
+                super().__init__()
                 self.layer_dummy_param = Parameter(torch.empty(3, 5))
                 self.register_buffer('layer_dummy_buf', torch.zeros(1, 3, 3, 7))
 
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.l1 = Layer()
                 self.dummy_param = Parameter(torch.empty(3, 5))
                 self.register_buffer('dummy_buf', torch.zeros(7, 3, 3, 1))
@@ -5957,24 +6797,27 @@ def test_zero_grad(self):
         self.assertIsNotNone(module.weight.grad)
         self.assertGreater(module.weight.grad.data.abs().sum(), 0)
         module.zero_grad()
-        self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
+        self.assertIsNone(module.weight.grad)
 
         module.bias.requires_grad = True
         module.zero_grad()
-        self.assertIsNotNone(module.weight.grad)
+        self.assertIsNone(module.weight.grad)
         self.assertIsNone(module.bias.grad)
         module(i).sum().backward()
         self.assertIsNotNone(module.weight.grad)
         self.assertIsNotNone(module.bias.grad)
         self.assertGreater(module.weight.grad.data.abs().sum(), 0)
         self.assertGreater(module.bias.grad.data.abs().sum(), 0)
-        module.zero_grad()
+
+        # Force set to zeros.
+        module.zero_grad(set_to_none=False)
         self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
         self.assertEqual(module.bias.grad.data, module.bias.data.clone().zero_())
 
-        # Force set to None.
-        module.zero_grad(set_to_none=True)
+        module.zero_grad()
         self.assertIsNone(module.weight.grad)
+        self.assertIsNone(module.bias.grad)
+
 
     def test_no_grad(self):
         for dtype in [torch.bfloat16, torch.float, torch.double]:
@@ -6136,7 +6979,7 @@ def test_group_norm_backward(self, device='mps'):
         # self.assertEqual(expect, actual)
 
 
-class TestConstantPadNd(TestCase):
+class TestConstantPadNd(TestCaseMPS):
     def test_preserves_memory_format(self):
         nchw_tensor = torch.rand((1, 2, 5, 3))
         nchw_padded = torch.constant_pad_nd(nchw_tensor, [1, 2], 0.5)
@@ -6147,7 +6990,7 @@ def test_preserves_memory_format(self):
         self.assertTrue(nhwc_padded.is_contiguous(memory_format=torch.channels_last))
 
 
-class TestLinalgMPS(TestCase):
+class TestLinalgMPS(TestCaseMPS):
     def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out=False):
         dtype = t.dtype
         numpy_dtype = dtype
@@ -6189,7 +7032,31 @@ def maybe_transpose(cond, m):
         m2 = maybe_transpose(t3, torch.randn(50, 25, device=device).to(dtype))
         self._test_addmm_addmv(torch.addmm, M, m1, m2, transpose_out=t4)
 
-class TestGatherScatter(TestCase):
+    def _test_addr(self, f, t, m, v, alpha=None, beta=None):
+        dtype = t.dtype
+        numpy_dtype = dtype
+        alpha = 1.2 if alpha is None else alpha
+        beta = 0.8 if beta is None else beta
+        res1 = f(t, m, v, alpha=alpha, beta=beta)
+        res2 = alpha * np.outer(m.to(numpy_dtype).cpu().numpy(), v.to(numpy_dtype).cpu().numpy())
+        if beta != 0:
+            res2 += (torch.mul(t, beta)).to(numpy_dtype).cpu().numpy()
+        res2 = torch.from_numpy(res2).to(dtype)
+        self.assertEqual(res1, res2)
+
+    def test_addr(self, device="mps", dtype=torch.float32):
+        M = torch.randn(10, 25, device=device).to(dtype)
+        m1 = torch.randn(10, device=device).to(dtype)
+        m2 = torch.randn(25, device=device).to(dtype)
+        self._test_addr(torch.addr, M, m1, m2)
+
+        # Test beta=0, M=nan
+        M = torch.full((10, 25), math.nan, device=device).to(dtype)
+        m1 = torch.randn(10, device=device).to(dtype)
+        m2 = torch.randn(25, device=device).to(dtype)
+        self._test_addr(torch.addr, M, m1, m2, beta=0)
+
+class TestGatherScatter(TestCaseMPS):
     def test_slicing_with_step(self):
         # Slicing with step
         # https://github.com/pytorch/pytorch/issues/78886
@@ -6254,9 +7121,22 @@ def test_inplace_scatter(self):
 # They are subset of those tests as currently only this subset is working.
 # This whole `class` will be removed when we add generic device testing. There
 # are no additional tests added apart from what is part of test_view_ops.py
-class TestViewOpsMPS(TestCase):
+class TestViewOpsMPS(TestCaseMPS):
     exact_dtype = True
 
+    def test_permute_slicing(self):
+        # test the fix for crash reported in
+        # https://github.com/pytorch/pytorch/issues/94190
+        cpu_x = (torch.randn([3, 2, 2]).float())
+        mps_x = cpu_x.detach().clone().to('mps')
+        cpu_out = cpu_x.permute((2, 0, 1)) * 2.0
+        mps_out = mps_x.permute((2, 0, 1)) * 2.0
+        # this print caused a crash prior to fix PR#94259
+        print(torch.zeros_like(mps_out))
+        # test the fix for fill_scalar_mps() mentioned in issue #94190
+        self.assertEqual(torch.zeros_like(cpu_out), torch.zeros_like(mps_out))
+        self.assertEqual(cpu_x[:, 1, :].fill_(1), mps_x[:, 1, :].fill_(1))
+
     def is_view_of(self, base, other):
         if (not other._is_view() or
                 other is base or
@@ -7052,7 +7932,7 @@ def test_view_all_dtypes_and_devices(self, device="mps"):
             x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device)
             self.assertEqual(x.view(6).shape, [6])
 
-class TestConvolutionMPS(TestCase):
+class TestConvolutionMPS(TestCaseMPS):
     def test_conv1d_all_strides_paddings(self):
         # https://github.com/pytorch/pytorch/issues/82921
         def helper(stride, padding):
@@ -7113,7 +7993,8 @@ def test_conv_transpose_1d_nn_functional(self):
     def test_conv_backward_1d_channels_last(self):
         def helper(shape, in_channels=1, out_channels=1, kernel_size=3, groups=1):
             # https://github.com/pytorch/pytorch/issues/84511
-            conv_cpu = torch.nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups)
+            conv_cpu = torch.nn.Conv1d(
+                in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups).requires_grad_()
             conv_mps = torch.nn.Conv1d(
                 in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups).to("mps")
             conv_mps.weight.data = conv_cpu.weight.data.detach().clone().to("mps").requires_grad_(True)
@@ -7153,15 +8034,89 @@ def test_conv1d_contiguous(self):
 
     def test_conv2d_all_strides_paddings(self):
         # https://github.com/pytorch/pytorch/issues/83180
-        y_cpu = torch.randn(2, 2, 3, 6)
-        y_gpu = y_cpu.to(device='mps')
-        for strideX in range(1, 4):
-            for strideY in range(1, 4):
-                conv_cpu = torch.nn.Conv2d(in_channels=2, out_channels=2, kernel_size=3, stride=(strideX, strideY))
-                conv_gpu = copy.deepcopy(conv_cpu).to(device='mps')
-                x_cpu = conv_cpu(y_cpu)
-                x_gpu = conv_gpu(y_gpu)
-                self.assertEqual(x_cpu, x_gpu.cpu(), rtol=1e-03, atol=1e-05)
+        def helper(N, C, H, W, groups, input_mem_format, weight_mem_format, permute_data):
+            x_cpu = torch.randn(N, C, H, W).to(memory_format=input_mem_format).requires_grad_()
+            x_mps = x_cpu.detach().clone().to(device='mps').requires_grad_()
+
+            if permute_data:
+                x_cpu.permute(0, 2, 3, 1)
+                x_mps.permute(0, 2, 3, 1)
+
+            for strideX in range(1, 4):
+                for strideY in range(1, 4):
+                    conv_cpu = torch.nn.Conv2d(
+                        in_channels=N, out_channels=C, kernel_size=H, groups=groups, stride=(strideX, strideY)).requires_grad_()
+                    conv_cpu.weight.data = conv_cpu.weight.to(memory_format=weight_mem_format).requires_grad_()
+
+                    conv_mps = torch.nn.Conv2d(
+                        in_channels=N, out_channels=C, kernel_size=H, groups=groups, stride=(strideX, strideY), device="mps")
+                    conv_mps.weight.data = conv_cpu.weight.data.detach().clone().to("mps").requires_grad_()
+                    conv_mps.bias.data = conv_cpu.bias.data.detach().clone().to("mps").requires_grad_()
+
+                    res_cpu = conv_cpu(x_cpu)
+                    res_mps = conv_mps(x_mps)
+                    self.assertEqual(res_cpu, res_mps.cpu(), rtol=1e-03, atol=1e-05)
+
+                    res_cpu = res_cpu.sum().backward()
+                    res_mps = res_mps.sum().backward()
+                    self.assertEqual(res_cpu, res_mps, rtol=2.6e-05, atol=2e-04)
+                    self.assertEqual(conv_cpu.weight.grad, conv_mps.weight.grad, rtol=2.6e-05, atol=2e-04)
+                    self.assertEqual(conv_cpu.bias.grad, conv_mps.bias.grad)
+                    self.assertEqual(x_cpu.grad, x_mps.grad)
+
+        for mem_format_input in [torch.contiguous_format, torch.channels_last]:
+            for mem_format_weight in [torch.contiguous_format, torch.channels_last]:
+                for permute_data in [True, False]:
+                    helper(2, 2, 3, 6, 1, mem_format_input, mem_format_weight, permute_data)
+                    helper(10, 10, 4, 6, 2, mem_format_input, mem_format_weight, permute_data)
+                    helper(32, 32, 4, 6, 2, mem_format_input, mem_format_weight, permute_data)
+
+    def test_conv_transpose_2d_strided(self):
+        def helper(m_cpu, memory_format):
+            m_mps = copy.deepcopy(m_cpu).requires_grad_()
+            m_mps.weight.data = m_cpu.weight.data.detach().clone().to("mps").requires_grad_()
+            m_mps.bias.data = m_cpu.bias.data.detach().clone().to("mps").requires_grad_()
+
+            input_cpu = torch.randn(20, 16, 50, 100).to(memory_format=memory_format).requires_grad_()
+            input_mps = input_cpu.detach().clone().to("mps")
+
+            output_cpu = m_cpu(input_cpu)
+            output_mps = m_mps(input_mps)
+            self.assertEqual(output_cpu, output_mps)
+
+        for mem_format_input in [torch.contiguous_format, torch.channels_last]:
+            # With square kernels and equal stride
+            helper(nn.ConvTranspose2d(16, 33, 3, stride=2).requires_grad_(), mem_format_input)
+
+            # non-square kernels and unequal stride and with padding
+            helper(nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)).requires_grad_(), mem_format_input)
+
+    def test_conv_transpose_2d_specified_output(self):
+        input_cpu = torch.randn(1, 16, 12, 12)
+        input_mps = input_cpu.detach().clone().to("mps")
+
+        downsample_cpu = nn.Conv2d(16, 16, 3, stride=2, padding=1)
+        downsample_mps = nn.Conv2d(16, 16, 3, stride=2, padding=1, device="mps")
+        downsample_mps.weight.data = downsample_cpu.weight.data.detach().clone().to("mps").requires_grad_()
+        downsample_mps.bias.data = downsample_cpu.bias.data.detach().clone().to("mps").requires_grad_()
+
+        upsample_cpu = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1)
+        upsample_mps = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1, device="mps")
+        upsample_mps.weight.data = upsample_cpu.weight.data.detach().clone().to("mps").requires_grad_()
+        upsample_mps.bias.data = upsample_cpu.bias.data.detach().clone().to("mps").requires_grad_()
+
+        h_cpu = downsample_cpu(input_cpu)
+        h_mps = downsample_mps(input_mps)
+        self.assertEqual(h_cpu, h_mps)
+
+        size_cpu = h_cpu.size()
+        size_mps = h_mps.size()
+        self.assertEqual(size_cpu, size_mps)
+
+        output_cpu = upsample_cpu(h_cpu, output_size=input_cpu.size())
+        output_mps = upsample_mps(h_mps, output_size=input_mps.size())
+        self.assertEqual(output_cpu, output_mps)
+        self.assertEqual(output_cpu.size(), output_mps.size())
 
     def test_conv2d_single_stride(self):
         y_cpu = torch.randn(2, 2, 3, 6)
@@ -7173,7 +8128,245 @@ def test_conv2d_single_stride(self):
             x_gpu = conv_gpu(y_gpu)
             self.assertEqual(x_cpu, x_gpu.cpu(), rtol=1e-03, atol=1e-05)
 
-class TestAdvancedIndexing(TestCase):
+    def test_grid_sample(self):
+        def test(N, C, H, W, mode, padding_mode, align_corners, input_requires_grad):
+            def test_shape(N, C, IH, IW, H, W, mode, padding_mode, align_corners):
+                for grid_dim_contig_order in [(0, 1, 2, 3), (0, 3, 1, 2), (3, 0, 1, 2), (0, 2, 1, 3)]:
+                    # grid_dim_contig_order specifies the dimension order that can
+                    # make grid to be contiguous.
+                    # i.e., grid.permute(grid_dim_contig_order) is contiguous.
+                    # e.g., with grid_dim_contig_order=[0, 3, 1, 2], grid should be
+                    #       initialized with contiguous tensor of shape [N, 2, H, W]
+                    #       and permuted to [N, H, W, 2] afterwards.
+                    grid_shape = [N, H, W, 2]
+                    grid_init_shape = [grid_shape[d] for d in grid_dim_contig_order]
+                    grid_fwd_permute = [None, None, None, None]
+                    for i, d in enumerate(grid_dim_contig_order):
+                        grid_fwd_permute[d] = i
+
+                    def get_grid(device='cpu', data=None):
+                        if data is not None:
+                            assert list(data.shape) == grid_shape
+                            data = data.permute(grid_dim_contig_order).to(device)
+                        else:
+                            data = torch.randn(grid_init_shape, device=device)
+                        grid = data.permute(grid_fwd_permute)
+                        assert grid.permute(grid_dim_contig_order).is_contiguous()
+                        return grid
+
+                    input_cpu = torch.randn(C, N, IH, IW).transpose(0, 1).requires_grad_(input_requires_grad)
+                    grid_cpu = get_grid().requires_grad_()
+                    out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode,
+                                            align_corners=align_corners)
+                    self.assertTrue(out_cpu.size() == torch.Size([N, C, H, W]))
+
+                    gradients = torch.randn_like(out_cpu)
+                    out_cpu.backward(gradients)
+
+
+                    # Compare against unvectorized CPU fallback
+
+                    # NOTE [ grid_sample CPU fallback ]
+                    # grid_sample uses AVX for 2d images, but that requires 32-bit indexing for
+                    # 32-bit floats. So we also have a fallback that is used only for float tensors
+                    # requiring 64-bit indexing. That requires too much memory to run on CI, so we
+                    # also export the fallback and test it here to ensure feature parity with
+                    # the vectorized version.
+                    input_fallback = input_cpu.float().detach_().requires_grad_()
+                    grid_fallback = grid_cpu.float().detach_().requires_grad_()
+                    out_fallback = torch._grid_sampler_2d_cpu_fallback(
+                        input_fallback, grid_fallback,
+                        F.GRID_SAMPLE_INTERPOLATION_MODES[mode],
+                        F.GRID_SAMPLE_PADDING_MODES[padding_mode],
+                        align_corners)
+                    self.assertEqual(out_fallback, out_cpu.float(), atol=1e-5, rtol=5e-5)
+
+                    out_fallback.backward(gradients.float())
+                    if input_requires_grad:
+                        self.assertEqual(input_fallback.grad, input_cpu.grad.float(), atol=1e-4, rtol=5e-5)
+                    self.assertEqual(grid_fallback.grad, grid_cpu.grad.float(), atol=1e-4, rtol=5e-5)
+
+                    input_mps = input_cpu.detach().transpose(0, 1).to("mps").transpose(0, 1).requires_grad_(input_requires_grad)
+                    grid_mps = get_grid('mps', grid_cpu.detach()).requires_grad_()
+                    out_mps = F.grid_sample(input_mps, grid_mps, mode=mode, padding_mode=padding_mode, align_corners=align_corners)
+                    self.assertEqual(out_cpu, out_mps)
+                    out_mps.backward(gradients.to("mps"))
+                    if input_requires_grad:
+                        self.assertEqual(input_cpu.grad, input_mps.grad)
+                    self.assertEqual(grid_cpu.grad, grid_mps.grad, atol=5e-5, rtol=0)
+
+                    # check that zero-dimensional input strides don't error out
+                    base_input = torch.randn(N, C, 1, IW)
+                    input_cpu = base_input.expand_as(input_mps).requires_grad_(input_requires_grad)
+                    out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode,
+                                            align_corners=align_corners)
+
+                    input_mps = base_input.to("mps").expand_as(input_mps).requires_grad_(input_requires_grad)
+                    out_mps = F.grid_sample(input_mps, grid_mps, mode=mode, padding_mode=padding_mode, align_corners=align_corners)
+                    self.assertEqual(out_cpu, out_mps)
+
+            # test same size output
+            test_shape(N, C, H, W, H, W, mode, padding_mode, align_corners)
+
+            # test larger output
+            N = random.randint(2, 8)
+            C = random.randint(2, 8)
+            IH = random.randint(2, 8)
+            IW = random.randint(2, 8)
+            H = random.randint(IH + 1, 12)
+            W = random.randint(IW + 1, 12)
+            test_shape(N, C, IH, IW, H, W, mode, padding_mode, align_corners)
+
+            # test smaller output
+            N = random.randint(2, 8)
+            C = random.randint(2, 8)
+            IH = random.randint(2, 8)
+            IW = random.randint(2, 8)
+            H = random.randint(2, IH)
+            W = random.randint(2, IW)
+            test_shape(N, C, IH, IW, H, W, mode, padding_mode, align_corners)
+
+            # test 1x1 inpput
+            N = random.randint(2, 8)
+            C = random.randint(2, 8)
+            IH = 1
+            IW = 1
+            H = random.randint(2, 5)
+            W = random.randint(2, 5)
+            test_shape(N, C, IH, IW, H, W, mode, padding_mode, align_corners)
+
+            # testing empty grid
+            N = random.randint(2, 8)
+            C = random.randint(2, 8)
+            IH = random.randint(2, 8)
+            IW = random.randint(2, 8)
+            W = random.randint(3, IW + 2)
+            test_shape(N, C, IH, IW, 0, W, mode, padding_mode, align_corners)
+
+            # testing empty channel
+            N = random.randint(2, 8)
+            IH = random.randint(2, 8)
+            IW = random.randint(2, 8)
+            H = random.randint(3, IH + 2)
+            W = random.randint(3, IW + 2)
+            test_shape(N, 0, IH, IW, H, W, mode, padding_mode, align_corners)
+
+            # testing empty batch
+            C = random.randint(2, 8)
+            IH = random.randint(2, 8)
+            IW = random.randint(2, 8)
+            H = random.randint(3, IH + 2)
+            W = random.randint(3, IW + 2)
+            test_shape(0, C, IH, IW, H, W, mode, padding_mode, align_corners)
+
+        for mode in ('bilinear', 'nearest'):
+            for padding_mode in ('zeros', 'reflection'):
+                for align_corners in (True, False):
+                    # test known input
+                    input = torch.arange(1., 11, device="mps").view(1, 1, 2, 5)
+                    grid = torch.tensor(
+                        [[[-0.9, -4.1], [0, 0.2000], [1, -1], [-0.333, 1e-6], [0.5, 1.0]],
+                         [[-1.0, -0.5], [0, 0.3333], [1, -1], [-0.200, 1e-6], [1.5, 0.5]]], device="mps").view(1, 2, 5, 2)
+                    if mode == 'bilinear':
+                        if padding_mode == 'zeros':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[0.0000, 6.0000000000, 5.0000, 4.8340, 9.0000],
+                                     [2.2500, 6.3332500450, 5.0000, 5.1000, 0.0000]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[0.0000, 6.5000000000, 1.2500, 4.6675000191, 4.6250],
+                                     [0.5000, 7.1665000916, 1.2500, 5.0000000000, 0.0000]], device="mps").view(1, 1, 2, 5)
+                        elif padding_mode == 'border':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[1.2000, 6.0000000000, 5.0000, 4.8340, 9.0000],
+                                     [2.2500, 6.3332500450, 5.0000, 5.1000, 8.7500]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[1.0000, 6.5000000000, 5.0000, 4.6675000191, 9.2500],
+                                     [1.0000, 7.1665000916, 5.0000, 5.0000000000, 10.0000]], device="mps").view(1, 1, 2, 5)
+                        elif padding_mode == 'reflection':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[3.4500, 6.0000000000, 5.0000, 4.8340, 9.0000],
+                                     [2.2500, 6.3332500450, 5.0000, 5.1000, 7.7500]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[3.0000004768, 6.5000000000, 5.0000, 4.6675000191, 9.2500],
+                                     [1.0000000000, 7.1665000916, 5.0000, 5.0000000000, 9.2500]], device="mps").view(1, 1, 2, 5)
+                        else:
+                            raise AssertionError("missing groundtruth test for padding mode '{}'".format(padding_mode))
+                    elif mode == 'nearest':
+                        if padding_mode == 'zeros':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[0., 8., 5., 7., 9.],
+                                     [1., 8., 5., 8., 0.]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[0., 8., 5., 7., 0.],
+                                     [1., 8., 5., 8., 0.]], device="mps").view(1, 1, 2, 5)
+                        elif padding_mode == 'border':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[1., 8., 5., 7., 9.],
+                                     [1., 8., 5., 8., 10.]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[1., 8., 5., 7., 9.],
+                                     [1., 8., 5., 8., 10.]], device="mps").view(1, 1, 2, 5)
+                        elif padding_mode == 'reflection':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[1., 8., 5., 7., 9.],
+                                     [1., 8., 5., 8., 9.]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[1., 8., 5., 7., 9.],
+                                     [1., 8., 5., 8., 9.]], device="mps").view(1, 1, 2, 5)
+                        else:
+                            raise AssertionError("missing groundtruth test for padding mode '{}'".format(padding_mode))
+                    elif mode == 'bicubic':
+                        if padding_mode == 'zeros':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[-0.10424726, 7.1400003, 5.0000, 5.7842274, 9.0000],
+                                     [2.4492188, 7.4814040, 5.0000, 6.0277520, 0.0000]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[0.00000, 7.6287503, 1.0625, 5.5977230, 5.3270264],
+                                     [0.40625, 8.0288770, 1.0625, 5.9375067, -0.3515625]], device="mps").view(1, 1, 2, 5)
+                        elif padding_mode == 'border':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[1.1520010, 6.0599990, 5.0000, 4.870930, 9.0000000],
+                                     [2.1328125, 6.4258375, 5.0000, 5.076003, 8.8671875]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[0.894531, 6.6050020, 4.625, 4.7138715, 9.800781],
+                                     [0.906250, 7.2822485, 4.625, 5.0000052, 10.00000]], device="mps").view(1, 1, 2, 5)
+                        elif padding_mode == 'reflection':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[3.1822524, 6.239998, 5.0000, 4.8709273, 9.00000],
+                                     [1.7812500, 6.703594, 5.0000, 5.0760007, 8.21875]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[2.7993753, 6.6050020, 4.25, 4.7138715, 10.269531],
+                                     [0.8125000, 7.2822485, 4.25, 5.0000052, 9.332031]], device="mps").view(1, 1, 2, 5)
+                        else:
+                            raise AssertionError("missing groundtruth test for padding mode '{}'".format(padding_mode))
+
+                    else:
+                        raise AssertionError("missing groundtruth test for interpolation mode '{}'".format(mode))
+                    output = F.grid_sample(input, grid, mode=mode, padding_mode=padding_mode,
+                                           align_corners=align_corners)
+                    self.assertEqual(output, groundtruth, atol=1e-5, rtol=0,
+                                     msg="groundtruth comparison failed for mode={}, "
+                                     "padding_mode={}".format(mode, padding_mode))
+
+class TestAdvancedIndexing(TestCaseMPS):
     supported_dtypes = [torch.float32, torch.float16, torch.int64, torch.int32, torch.int16, torch.uint8]
     supported_np_dtypes = [np.float32, np.float16, np.int64, np.int32, np.int16, np.uint8]
 
@@ -7515,6 +8708,7 @@ def test_bool_indices(self, device="mps"):
             self.assertEqual(v[boolIndices], torch.tensor([True], dtype=torch.bool, device=device))
             self.assertEqual(len(w), 2)
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     def test_bool_indices_accumulate(self, device="mps"):
         mask = torch.zeros(size=(10, ), dtype=torch.uint8, device=device)
         mask = mask > 0
@@ -7705,6 +8899,7 @@ def helper(device, dtype):
             self.assertEqual(res.shape, src.shape)
         [helper(device="mps", dtype=dtype) for dtype in [torch.float, torch.int32]]
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     def test_index_src_datatype(self):
         def helper(device, dtype):
             orig_dtype = dtype
@@ -7977,66 +9172,150 @@ def test_cpu_indices(self, device="mps"):
         out = x[idx]  # index
         self.assertEqual(out, torch.zeros(2, device=device), atol=0, rtol=0)
 
-class TestRNNMPS(TestCase):
+class TestRNNMPS(TestCaseMPS):
     def test_lstm_1(self, device="mps", dtype=torch.float32):
+        for layers in [1] if product_version < 13.0 else [1, 2, 5]:
+            torch.random.manual_seed(42)
+            rnn = nn.LSTM(7, 4, layers, device="cpu")
+            input = torch.randn(2, 3, 7, device="cpu")
+            hx = torch.randn(layers, 3, 4, device="cpu")
+            cx = torch.randn(layers, 3, 4, device="cpu")
+
+            cpu_output, (cpu_hn, cpu_cn) = rnn(input, (hx, cx))
+
+            rnn = rnn.to(device)
+            input = input.to(device)
+            hx = hx.to(device)
+            cx = cx.to(device)
+            output, (hn, cn) = rnn(input, (hx, cx))
+
+            self.assertEqual(cpu_output, output)
+            self.assertEqual(cpu_hn, hn)
+            self.assertEqual(cpu_cn, cn)
+
+            # test batch_first
+            rnn = nn.LSTM(7, 4, layers, device="cpu", batch_first=True)
+            input = torch.randn(3, 2, 7, device="cpu")
+            hx = torch.randn(layers, 3, 4, device="cpu")
+            cx = torch.randn(layers, 3, 4, device="cpu")
+            cpu_output, (cpu_hn, cpu_cn) = rnn(input, (hx, cx))
+
+            rnn = rnn.to(device)
+            input = input.to(device)
+            hx = hx.to(device)
+            cx = cx.to(device)
+            output, (hn, cn) = rnn(input, (hx, cx))
+
+            self.assertEqual(cpu_output, output)
+            self.assertEqual(cpu_hn, hn)
+            self.assertEqual(cpu_cn, cn)
+
+    def test_lstm_backward(self, device="mps", dtype=torch.float32):
+        for layers in [1] if product_version < 13.0 else [1, 2, 5]:
+            lstm = nn.LSTM(2, 4, layers)  # initialized globally for consistent parameters init
+            lstm.train()
+
+            def get_results(device, inp, hx, cx):
+                rnn = lstm.to(device)
+                inp, hx, cx = inp.to(device), hx.to(device), cx.to(device)
+
+                output, _ = rnn(inp, (hx, cx))
+                f = output.sum()
+
+                param_names, params = zip(*rnn.named_parameters())
+                param_grads = zip(param_names, torch.autograd.grad(f, params, retain_graph=True))
+
+                input_grad, hx_grad, cx_grad = torch.autograd.grad(f, [inp, hx, cx])
+                return output, param_grads, input_grad, hx_grad, cx_grad
+
+            inp = torch.randn((5, 3, 2), requires_grad=True, dtype=dtype, device=device)
+            hx = torch.randn((layers, 3, 4), requires_grad=True, dtype=dtype, device=device)
+            cx = torch.randn((layers, 3, 4), requires_grad=True, dtype=dtype, device=device)
+
+            cpu_output, cpu_weights_grad, cpu_input_grad, cpu_hx_grad, cpu_cx_grad = get_results("cpu", inp, hx, cx)
+            mps_output, mps_weights_grad, mps_input_grad, mps_hx_grad, mps_cx_grad = get_results(device, inp, hx, cx)
+
+            self.assertEqual(cpu_hx_grad, mps_hx_grad)
+            self.assertEqual(cpu_cx_grad, mps_cx_grad)
+            self.assertEqual(cpu_output, mps_output)
+            self.assertEqual(cpu_input_grad, mps_input_grad)
+            for (cpu_name, cpu_weight_grad), (mps_name, mps_weight_grad) in zip(cpu_weights_grad, mps_weights_grad):
+                self.assertEqual(cpu_weight_grad, mps_weight_grad, f"mismatch in cpu:{cpu_name} vs mps:{mps_name}")
+
+            # test batch_first backward
+            lstm = nn.LSTM(2, 4, layers, batch_first=True)
+            lstm.train()
+
+            hx = torch.randn((layers, 5, 4), requires_grad=True, dtype=dtype, device=device)
+            cx = torch.randn((layers, 5, 4), requires_grad=True, dtype=dtype, device=device)
+
+            cpu_output, cpu_weights_grad, cpu_input_grad, cpu_hx_grad, cpu_cx_grad = get_results("cpu", inp, hx, cx)
+            mps_output, mps_weights_grad, mps_input_grad, mps_hx_grad, mps_cx_grad = get_results(device, inp, hx, cx)
+
+            self.assertEqual(cpu_hx_grad, mps_hx_grad)
+            self.assertEqual(cpu_cx_grad, mps_cx_grad)
+            self.assertEqual(cpu_output, mps_output)
+            self.assertEqual(cpu_input_grad, mps_input_grad)
+            for (cpu_name, cpu_weight_grad), (mps_name, mps_weight_grad) in zip(cpu_weights_grad, mps_weights_grad):
+                self.assertEqual(cpu_weight_grad, mps_weight_grad, f"mismatch in cpu:{cpu_name} vs mps:{mps_name}")
+
+
+    def test_RNN_cell_no_broadcasting(self):
+        def test(cell_module, input, hx, input_size, hidden_size):
+            cell = cell_module(input_size, hidden_size, device='mps')
+            self.assertRaises(RuntimeError, lambda: cell(input, hx))
+
+        def test_all(hidden_size, bad_hx, good_hx, input_size, input):
+            test(nn.RNNCell, input, bad_hx, input_size, hidden_size)
+            test(nn.GRUCell, input, bad_hx, input_size, hidden_size)
+            test(nn.LSTMCell, input, (bad_hx, good_hx), input_size, hidden_size)
+            test(nn.LSTMCell, input, (good_hx, bad_hx), input_size, hidden_size)
+
+        hidden_size = 20
+        input_size = 10
+        input = torch.randn(3, input_size, device='mps')
+        bad_hx = torch.randn(1, hidden_size, device='mps')
+        good_hx = torch.randn(3, hidden_size, device='mps')
+
+        # Test hidden/input batch size broadcasting
+        test_all(hidden_size, bad_hx, good_hx, input_size, input)
+
+        # Test hx's hidden_size vs module's hidden_size broadcasting
+        bad_hx = torch.randn(3, 1)
+        test_all(hidden_size, bad_hx, good_hx, input_size, input)
+
+        # Test input's input_size vs module's input_size broadcasting
+        bad_input = torch.randn(3, 1)
+        test_all(hidden_size, good_hx, good_hx, input_size, bad_input)
+
+    def test_LSTM_cell(self):
+        # this is just a smoke test; these modules are implemented through
+        # autograd so no Jacobian test is needed
+        for bias in (True, False):
+            input = torch.randn(3, 10, device='mps')
+            hx = torch.randn(3, 20, device='mps')
+            cx = torch.randn(3, 20, device='mps')
+            lstm = nn.LSTMCell(10, 20, bias=bias, device='mps')
+            for _ in range(6):
+                hx, cx = lstm(input, (hx, cx))
+
+            (hx + cx).sum().backward()
+
+    def test_LSTM_cell_forward_input_size(self):
+        input = torch.randn(3, 11, device='mps')
+        hx = torch.randn(3, 20, device='mps')
+        cx = torch.randn(3, 20, device='mps')
+        lstm = nn.LSTMCell(10, 20, device='mps')
+        self.assertRaises(Exception, lambda: lstm(input, (hx, cx)))
+
+    def test_LSTM_cell_forward_hidden_size(self):
+        input = torch.randn(3, 10, device='mps')
+        hx = torch.randn(3, 21, device='mps')
+        cx = torch.randn(3, 20, device='mps')
+        lstm = nn.LSTMCell(10, 20, device='mps')
+        self.assertRaises(Exception, lambda: lstm(input, (hx, cx)))
+        self.assertRaises(Exception, lambda: lstm(input, (cx, hx)))
 
-        rnn = nn.LSTM(1, 4, 2, device="cpu")
-        input = torch.randn(2, 3, 1, device="cpu")
-        hx = torch.zeros(2, 3, 4, device="cpu")
-        cx = torch.zeros(2, 3, 4, device="cpu")
-
-        cpu_output, (cpu_hn, cpu_cn) = rnn(input, (hx, cx))
-
-        rnn = rnn.to(device)
-        input = input.to(device)
-        hx = hx.to(device)
-        cx = cx.to(device)
-        output, (hn, cn) = rnn(input, (hx, cx))
-
-        self.assertEqual(cpu_output, output)
-        self.assertEqual(cpu_hn, hn)
-        self.assertEqual(cpu_cn, cn)
-
-        # test batch_first
-        rnn = nn.LSTM(1, 4, 2, device="cpu", batch_first=True)
-        input = torch.randn(3, 2, 1, device="cpu")
-        hx = torch.zeros(2, 3, 4, device="cpu")
-        cx = torch.zeros(2, 3, 4, device="cpu")
-        cpu_output, (cpu_hn, cpu_cn) = rnn(input, (hx, cx))
-
-        rnn = rnn.to(device)
-        input = input.to(device)
-        hx = hx.to(device)
-        cx = cx.to(device)
-        output, (hn, cn) = rnn(input, (hx, cx))
-
-        self.assertEqual(cpu_output, output)
-        self.assertEqual(cpu_hn, hn)
-        self.assertEqual(cpu_cn, cn)
-
-    @unittest.skipIf(True, "Backward of lstm returns wrong result")
-    def test_lstm_2(self, device="mps", dtype=torch.float32):
-        def get_results(device):
-            rnn = nn.LSTM(1, 4, 1, device=device)
-            inp = torch.randn(2, 3, 1, device=device, requires_grad=True)
-            hx = torch.zeros(1, 3, 4, device=device)
-            cx = torch.zeros(1, 3, 4, device=device)
-
-            output, _ = rnn(inp, (hx, cx))
-            output.sum().backward()
-
-            weight_grad = rnn.weight_ih_l0.grad.clone()
-            input_grad = inp.grad.clone()
-
-            return output, weight_grad, input_grad
-
-
-        cpu_output, cpu_weight_grad, cpu_input_grad = get_results("cpu")
-        mps_output, mps_weight_grad, mps_input_grad = get_results("mps")
-
-        self.assertEqual(cpu_output, mps_output)
-        self.assertEqual(cpu_input_grad, mps_input_grad)
-        self.assertEqual(cpu_weight_grad, mps_weight_grad)
 
 class TestFallbackWarning(TestCase):
     # TODO: Remove once test_testing.py is running on MPS devices
@@ -8183,11 +9462,11 @@ def test_serialization_map_location(self):
 
 
 MPS_DTYPES = get_all_dtypes()
-for t in [torch.double, torch.cdouble, torch.cfloat, torch.int8, torch.bfloat16]:
+for t in [torch.double, torch.cdouble, torch.cfloat, torch.bfloat16]:
     del MPS_DTYPES[MPS_DTYPES.index(t)]
 
 
-class TestConsistency(TestCase):
+class TestConsistency(TestCaseMPS):
     # TODO: This is only used while some ops are being added.
     # This list should contain all ops and dtypes eventually
     # This can be generated automatically in the `new_mps_allowlist.txt` file
@@ -8201,7 +9480,7 @@ class TestConsistency(TestCase):
         '__rmatmul__': ['f32'],
         '__rmul__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__ror__': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        '__rpow__': ['f16'],
+        '__rpow__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__rxor__': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'masked.argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked.argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8223,7 +9502,7 @@ class TestConsistency(TestCase):
         'addcmul': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'addmm': ['f32'],
         'addmv': ['f32'],
-        'addr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'addr': ['f32'],
         'all': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'allclose': ['f16', 'f32'],
         'any': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8249,8 +9528,11 @@ class TestConsistency(TestCase):
         'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'bmm': ['f32'],
         'broadcast_shapes': ['f32'],
+        'byte': None,
+        'cat': None,
         'ceil': ['f32', 'int32', 'int64', 'f16'],
-        'char': ['b8', 'u8'],
+        'chalf': None,
+        'char': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'chunk': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'clamp': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'clamp_max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8262,6 +9544,7 @@ class TestConsistency(TestCase):
         'conj_physical': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'constant_pad_nd': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'contiguous': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'copysign': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'corrcoef': ['f32'],
         'cos': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
         'cosh': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
@@ -8271,7 +9554,7 @@ class TestConsistency(TestCase):
         'diag': ['f32', 'i32'],
         'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'diagflat': ['f32', 'i32'],
-        'diagonal_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
+        'diagonal_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'diff': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'dist': ['f32'],
         'dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8286,17 +9569,23 @@ class TestConsistency(TestCase):
         'flip': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'fliplr': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'flipud': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'float': ['f32'],
+        'float': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'floor': ['f32', 'f16', 'i16', 'i32', 'i64'],
         'floor_divide': ['f32', 'f16'],
+        'fmax': ['b8', 'f32', 'f16', 'i16', 'i32', 'i64', 'u8'],
+        'fmin': ['b8', 'f32', 'f16', 'i16', 'i32', 'i64', 'u8'],
+        'fmod': ['f32', 'f16', 'i16', 'i32', 'i64', 'u8'],
         'frac': ['f16', 'f32'],
         'gather': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'gradient': ['f16', 'f32', 'i16'],
-        'half': ['f16'],
+        'ge': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'gt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'half': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'hstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'hypot': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'index_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'index_add': ['f16', 'f32', 'i16', 'i32'],
-        'int': ['i32'],
+        'index_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'int': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isclose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isfinite': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isinf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8304,6 +9593,7 @@ class TestConsistency(TestCase):
         'isreal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'kron': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'linalg.matrix_norm': ['f16'],
+        'linalg.matrix_power': ['f32'],
         'linalg.svd': ['f32'],
         'linalg.vector_norm': ['f16', 'f32'],
         'linspace': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8318,10 +9608,13 @@ class TestConsistency(TestCase):
         'logical_not': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_or': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_xor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'logit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logspace': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked_fill': ['f16', 'i16', 'i32', 'i64'],
+        'long': None,
+        'masked_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked_scatter': ['i8', 'b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'matmul': ['f32'],
         'mm': ['f32'],
         'mv': ['f32'],
@@ -8329,6 +9622,10 @@ class TestConsistency(TestCase):
         'neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'nn.functional.adaptive_max_pool1d': ['f32'],
         'nn.functional.adaptive_max_pool2d': ['f32'],
+        'nn.functional.adaptive_avg_pool1d': ['f32'],
+        'nn.functional.adaptive_avg_pool2d': ['f32'],
+        'nn.functional.avg_pool1d': ['f32', 'i64'],
+        'nn.functional.avg_pool2d': ['f32', 'i64'],
         'nn.functional.binary_cross_entropy': ['f32'],
         'nn.functional.binary_cross_entropy_with_logits': ['f32'],
         'nn.functional.celu': ['f32'],
@@ -8343,25 +9640,30 @@ class TestConsistency(TestCase):
         'nn.functional.gaussian_nll_loss': ['f32'],
         'nn.functional.glu': ['f32'],
         'nn.functional.group_norm': ['f32'],
+        'nn.functional.hardsigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.hardtanh': ['f32', 'i16', 'i32', 'i64'],
         'nn.functional.hinge_embedding_loss': ['f32'],
-        'nn.functional.huber_loss': ['f32'],
+        'nn.functional.huber_loss': ['f16', 'f32'],
         'nn.functional.instance_norm': ['f32'],
         'nn.functional.kl_div': ['f32', 'i16', 'i32', 'i64'],
         'nn.functional.l1_loss': ['f16', 'f32'],
         'nn.functional.leaky_relu': ['f32'],
         'nn.functional.linear': ['f32'],
         'nn.functional.local_response_norm': ['f32'],
+        'nn.functional.logsigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.margin_ranking_loss': ['f32', 'i16', 'i32'],
         'nn.functional.max_pool1d': ['f32'],
         'nn.functional.max_pool2d': ['f32'],
         'max_pool2d_with_indices_backward': ['f32'],
         'nn.functional.mse_loss': ['f16', 'f32'],
+        'nn.functional.nll_loss': ['f32'],
         'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.padconstant': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.padreflect': ['f32'],
         'nn.functional.padreplicate': ['f32'],
-        'nn.functional.pairwise_distance': ['f16', 'f32', 'i16', 'i32', 'i64'],
+        # TODO: add f16 test case after solve the accuracy issue,
+        # see https://github.com/pytorch/pytorch/pull/95166#issuecomment-1439359181.
+        'nn.functional.pairwise_distance': ['f32', 'i16', 'i32', 'i64'],
         'nn.functional.poisson_nll_loss': ['f32', 'i16', 'i32', 'u8'],
         'nn.functional.prelu': ['f32'],
         'nn.functional.relu': ['f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8381,34 +9683,37 @@ class TestConsistency(TestCase):
         'nn.functional.upsample_nearest': ['f32'],
         'norm': ['f32', 'f16'],
         'positive': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'pow': ['f16'],
+        'pow': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'put': None,
         'rad2deg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'real': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'repeat': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'remainder' : None,
+        'repeat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'repeat_interleave': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'resize_': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'resize_as_': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'resolve_conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'resolve_neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'rot90': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'roll': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'rot90': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'round': ['f32', 'f16', 'i16', 'i32', 'i64'],
         'rsqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'scatter_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
-        'sgn': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'short': ['i16'],
-        'sigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8', 'i64'],
+        'select_scatter': None,
+        'sgn': None,
+        'short': None,
+        'sigmoid': None,
+        'sign': None,
         'sin': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'sinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'slice_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
+        'slice_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'softmax': ['f32'],
         'special.ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'square': ['f16', 'f32'],
+        'square': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'squeeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sub': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8418,9 +9723,12 @@ class TestConsistency(TestCase):
         'tan': ['b8', 'i16', 'i32', 'u8'],
         'tanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'tensordot': ['f32'],
+        'tensor_split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'tile': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'topk': ['f32'],
+        'topk': ['f32', 'f16'],
         'trapz': ['f16', 'f32', 'i16', 'i32', 'i64'],
+        'sort': ['f32', 'i16', 'i32', 'i64'],
+        'argsort': ['f32', 'i16', 'i32', 'i64'],
         'tril': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'tril_indices': ['i32', 'i64'],
         'triu': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8435,8 +9743,8 @@ class TestConsistency(TestCase):
         'vsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'vstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'where': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nonzero': ['f32', 'i16', 'i32', 'i64'],
+        'where': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nonzero': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8448,11 +9756,28 @@ class TestConsistency(TestCase):
         'prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'mean': ['f16', 'f32'],
         'count_nonzero': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'xlogy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked.amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked.amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked.mean': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked.prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'native_layer_norm': ['torch.float32'],
+        'nn.functional.layer_norm': ['torch.float32'],
+        'nn.functional.bilinear': ['f32'],
+        'linalg.solve_triangular': ['f32'],
+        'triangular_solve': ['f32'],
+        'trace': None,
+        '_native_batch_norm_legit': ['f32'],
+        'native_batch_norm': ['f32'],
+        'minreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'maxreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.inv': ['f32'],
+        'linalg.inv_ex': ['f32'],
+        'mH': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'mT': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'T': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'H': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
     }
 
 
@@ -8461,12 +9786,13 @@ class TestConsistency(TestCase):
         '__rdiv__': ['f16', 'f32'],
         '__rmatmul__': ['f32'],
         '__rmul__': ['f16', 'f32'],
+        '__rpow__': ['f32'],
         'masked.log_softmax': ['f32'],
         'masked.logaddexp': ['f32'],
         'masked.softmax': ['f32'],
         'masked.softmin': ['f32'],
         'masked.std': ['f32'],
-        'masked.var': ['f32'],
+        'masked_scatter': ['f16', 'f32'],
         'abs': ['f16', 'f32'],
         'acos': ['f32'],
         'acosh': ['f32'],
@@ -8500,6 +9826,7 @@ class TestConsistency(TestCase):
         'conj': ['f16', 'f32'],
         'conj_physical': ['f16', 'f32'],
         'contiguous': ['f16', 'f32'],
+        'copysign': ['f16', 'f32'],
         'corrcoef': ['f32'],
         'cos': ['f32'],
         'cosh': ['f32'],
@@ -8523,9 +9850,12 @@ class TestConsistency(TestCase):
         'flipud': ['f16', 'f32'],
         'float': ['f32'],
         'floor': ['f32'],
+        'fmax': ['f16', 'f32'],
+        'fmin': ['f16', 'f32'],
         'gradient': ['f32'],
         'half': ['f16'],
         'hstack': ['f16', 'f32'],
+        'hypot': ['f16', 'f32'],
         'index_select': ['f16', 'f32'],
         'index_add': ['f16', 'f32'],
         'isclose': ['f16', 'f32'],
@@ -8544,6 +9874,7 @@ class TestConsistency(TestCase):
         'log_softmax': ['f32'],
         'logaddexp': ['f32'],
         'logical_not': ['f16', 'f32'],
+        'logit': ['f16', 'f32'],
         'logspace': ['f32'],
         'matmul': ['f32'],
         'mm': ['f32'],
@@ -8551,6 +9882,10 @@ class TestConsistency(TestCase):
         'neg': ['f16', 'f32'],
         'nn.functional.adaptive_max_pool1d': ['f32'],
         'nn.functional.adaptive_max_pool2d': ['f32'],
+        'nn.functional.adaptive_avg_pool1d': ['f32'],
+        'nn.functional.adaptive_avg_pool2d': ['f32'],
+        'nn.functional.avg_pool1d': ['f32'],
+        'nn.functional.avg_pool2d': ['f32'],
         'nn.functional.binary_cross_entropy': ['f32'],
         'nn.functional.celu': ['f32'],
         'nn.functional.conv1d': ['f32'],
@@ -8560,20 +9895,25 @@ class TestConsistency(TestCase):
         'nn.functional.elu': ['f32'],
         'nn.functional.feature_alpha_dropout': ['f16', 'f32'],
         'nn.functional.glu': ['f32'],
+        'nn.functional.hardsigmoid': ['f16', 'f32'],
         'nn.functional.hardtanh': ['f32'],
         'nn.functional.hinge_embedding_loss': ['f32'],
-        'nn.functional.huber_loss': ['f32'],
+        'nn.functional.huber_loss': ['f16', 'f32'],
         'nn.functional.instance_norm': ['f32'],
         'nn.functional.kl_div': ['f32'],
         'nn.functional.l1_loss': ['f16', 'f32'],
         'nn.functional.leaky_relu': ['f32'],
         'nn.functional.local_response_norm': ['f32'],
+        'nn.functional.logsigmoid': ['f16', 'f32'],
         'nn.functional.margin_ranking_loss': ['f32'],
         'nn.functional.max_pool1d': ['f32'],
         'nn.functional.max_pool2d': ['f32'],
         'nn.functional.mse_loss': ['f32'],
+        'nn.functional.nll_loss': ['f32'],
         'nn.functional.pad': ['f16', 'f32', 'i16', 'i32', 'i64'],
-        'nn.functional.pairwise_distance': ['f16', 'f32'],
+        # TODO: add f16 test case after solve the accuracy issue,
+        # see https://github.com/pytorch/pytorch/pull/95166#issuecomment-1439359181.
+        'nn.functional.pairwise_distance': ['f32'],
         'nn.functional.poisson_nll_loss': ['f32'],
         'nn.functional.relu': ['f32'],
         'nn.functional.relu6': ['f32'],
@@ -8583,12 +9923,14 @@ class TestConsistency(TestCase):
         'nn.functional.softmin': ['f32'],
         'nn.functional.softplus': ['f32'],
         'nn.functional.softsign': ['f16', 'f32'],
+        'nn.functional.smooth_l1_loss': ['f32'],
         'nn.functional.threshold': ['f32'],
         'nn.functional.triplet_margin_loss': ['f32'],
         'nn.functional.triplet_margin_with_distance_loss': ['f32'],
         'nn.functional.upsample_bilinear': ['f32'],
         'norm': ['f32', 'f16'],
         'positive': ['f16', 'f32'],
+        'pow': ['f32'],
         'rad2deg': ['f16', 'f32'],
         'real': ['f16', 'f32'],
         'reciprocal': ['f16', 'f32'],
@@ -8596,6 +9938,7 @@ class TestConsistency(TestCase):
         'repeat_interleave': ['f16', 'f32'],
         'resolve_conj': ['f16', 'f32'],
         'resolve_neg': ['f16', 'f32'],
+        'roll': ['f16', 'f32'],
         'round': ['f32'],
         'rsqrt': ['f32'],
         'select_scatter': ['f16', 'f32'],
@@ -8627,7 +9970,16 @@ class TestConsistency(TestCase):
         'view_as': ['f16', 'f32'],
         'vsplit': ['f16', 'f32'],
         'vstack': ['f16', 'f32'],
-        'zero_': ['f16', 'f32']
+        'xlogy': ['f16', 'f32'],
+        'zero_': ['f16', 'f32'],
+        'linalg.solve_triangular': ['f32'],
+        'triangular_solve': ['f32'],
+        '_native_batch_norm_legit': ['f32'],
+        'native_batch_norm': ['f32'],
+        'native_layer_norm': ['f32'],
+        'nn.functional.gelu': ['f32'],
+        'nn.functional.bilinear': ['f32'],
+        'nn.functional.prelu': ['f32'],
     }
 
     # These ops that are problematic. So never run them even when
@@ -8643,45 +9995,22 @@ class TestConsistency(TestCase):
         'masked.sum': [torch.bool],
 
         # Functions that hard crash
-        'nn.functional.nll_loss': [torch.float32],
         'std': [torch.float16],
         'stft': [torch.float32], 'var': [torch.float16],
         # + forward when requires_grad=True or running backward
         'nn.functional.embedding': [torch.float32, torch.float16],
-        '__rpow__': [torch.int64],
 
         'as_strided_scatter': [torch.uint8],
         'atan2': [torch.int64],
         'bfloat16': None,
         'block_diag': [torch.uint8],
-        'byte': None,
-        'chalf': None,
         'diag_embed': [torch.uint8],
         'diagonal_scatter': [torch.uint8],
-        'index_add': None,
-        'linalg.inv': [torch.float32],
-        'long': None,
-        'nn.functional.conv1d': [torch.int64],
-        'nn.functional.conv2d': [torch.int64],
-        'nn.functional.conv_transpose1d': [torch.int64],
-        'nn.functional.conv_transpose2d': [torch.int64],
         'nn.functional.conv_transpose3d': [torch.int64, torch.float32],
-        'nn.functional.huber_loss': [torch.float16],
         'nn.functional.local_response_norm': [torch.int64],
         'nn.functional.padcircular': [torch.uint8],
-        'pow': [torch.int64],
-        'select_scatter': [torch.uint8],
-        'sigmoid': [torch.int64],
-        'slice_scatter': [torch.uint8],
-        'square': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],  # moved from section below
 
-        # failure in average pooling when both ceilMode and includeZeroPadToAverage are True
-        'nn.functional.avg_pool1d': [torch.float32, torch.int64],
-        'nn.functional.avg_pool2d': [torch.float32, torch.int64],
-        'nn.functional.adaptive_avg_pool1d': [torch.float32],
-        'nn.functional.adaptive_avg_pool2d': [torch.float32],
-        # count_nonzero returns wrong results for these dtypes
-        'nonzero': [torch.uint8, torch.float16],
+
 
         # These were moved from ALLOWLIST to BLOCK as they are not working
         # locally
@@ -8696,8 +10025,6 @@ class TestConsistency(TestCase):
 
         # Functions that are flaky
         # These are detected as "ok" by the expect case but actually fail to run sometimes
-        'H': None,
-        'T': None,
         'as_strided': None,
         'broadcast_tensors': None,
         'broadcast': None,
@@ -8739,8 +10066,6 @@ class TestConsistency(TestCase):
         'maxbinary': None,
         'maximum': None,
         'minimum': None,
-        'mT': None,
-        'mH': None,
         'outer': None,
         'softmaxwith_dtype': None,
         'rounddecimals_neg_3': None,
@@ -8761,29 +10086,26 @@ class TestConsistency(TestCase):
         'take_along_dim': None,
     }
 
-    # Those ops worked on MacOS12, but broken on MacOS13
-    VENTURA_BLOCKLIST = {
-        'masked.softmax': [torch.float32],
-        'masked.softmin': [torch.float32],
-        'masked.log_softmax': [torch.float32],
-        'dot': [torch.int64],
+    FP16_LOW_PRECISION_LIST = {
+        'add', 'sub', 'div',
+        '__rdiv__', '__rmul__',
+        'nn.functional.huber_loss',
+        'true_divide', 'kron',
+        'gradient', 'var', 'std',
+        'linalg.vector_norm',
+        'masked.sum', 'masked.std',
+        'masked.var',
     }
 
     # Used for accept mode only
     NEW_ALLOW_LIST = defaultdict(list)
     NEW_ALLOW_LIST_GRAD = defaultdict(list)
 
-    @ops(op_db, allowed_dtypes=MPS_DTYPES)
+    @ops(mps_ops_modifier(op_db), allowed_dtypes=MPS_DTYPES)
     def test_output_match(self, device, dtype, op):
         self.assertEqual(device, "cpu")
-        if not torch.backends.mps.is_available():
-            self.skipTest("MPS is not available")
-
         key = op.name + op.variant_test_name
 
-        if key in self.VENTURA_BLOCKLIST and torch.backends.mps.is_macos13_or_newer():
-            if dtype in self.VENTURA_BLOCKLIST[key]:
-                self.skipTest(f"{key}_{dtype} fails on Ventura, see https://github.com/pytorch/pytorch/issues/85758")
         if key in self.BLOCKLIST:
             if self.BLOCKLIST[key] is None or dtype in self.BLOCKLIST[key]:
                 self.skipTest(f"Running test with {op.name} hangs so skipping")
@@ -8800,7 +10122,7 @@ def test_output_match(self, device, dtype, op):
         if not generate_new_truth:
             if op.name not in self.ALLOWLIST_OP:
                 self.skipTest(f"{op.name} is not in the allow list for test on MPS")
-            else:
+            elif self.ALLOWLIST_OP[op.name] is not None:
                 if dtype_abbrs[dtype] not in self.ALLOWLIST_OP[op.name]:
                     self.skipTest(f"{op.name} is in the allow list for MPS but {dtype} is excluded")
 
@@ -8829,16 +10151,28 @@ def get_samples():
                 mps_args = [mps_sample.input] + list(mps_sample.args)
                 mps_kwargs = mps_sample.kwargs
 
+                # for tensor_split(), the second tensor arg ("tensor_indices_or_sections") must be on CPU only
+                if (op.name == "tensor_split" and isinstance(mps_args[1], torch.Tensor)):
+                    mps_args[1] = cpu_args[1]
+
                 cpu_out = op(*cpu_args, **cpu_kwargs)
                 mps_out = op(*mps_args, **mps_kwargs)
 
                 if op.name == "nn.functional.conv2d" and dtype == torch.float32:
                     atol = 1e-4
                     rtol = 3e-5
-                elif (op.name == "add" or op.name == "sub" or
-                      op.name == "masked.sum" or op.name == "masked.std" or op.name == "masked.var") and dtype == torch.float16:
+                elif op.name in self.FP16_LOW_PRECISION_LIST and dtype == torch.float16:
                     atol = 1e-2
                     rtol = 1e-2
+                elif op.name == "masked.mean":
+                    atol = 7e-4
+                    rtol = 2e-3
+                elif op.name == "native_layer_norm":
+                    atol = 1e-4
+                    rtol = 1.3e-5
+                elif op.name in ["pow", "__rpow__"]:
+                    atol = 1e-6
+                    rtol = 4e-6
                 else:
                     atol = None
                     rtol = None
@@ -8897,7 +10231,7 @@ def req_grad(t):
                 cpu_grad_inputs = torch.autograd.grad(diff_cpu_out, diff_cpu_arg, grad_outputs=cpu_grad_outputs, allow_unused=True)
                 mps_grad_inputs = torch.autograd.grad(diff_mps_out, diff_mps_arg, grad_outputs=mps_grad_outputs, allow_unused=True)
 
-                self.assertEqual(cpu_grad_inputs, mps_grad_inputs)
+                self.assertEqual(cpu_grad_inputs, mps_grad_inputs, atol=atol, rtol=rtol)
             except Exception as e:
                 if not generate_new_truth:
                     raise e
diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py
index 633f43e338b7..865b1f702b55 100644
--- a/test/test_multiprocessing.py
+++ b/test/test_multiprocessing.py
@@ -16,7 +16,7 @@
 from torch.nn import Parameter
 from torch.testing._internal.common_utils import (TestCase, run_tests, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, TEST_WITH_ASAN,
                                                   load_tests, slowTest, TEST_WITH_TSAN, TEST_WITH_TORCHDYNAMO,
-                                                  TEST_WITH_ROCM)
+                                                  TEST_WITH_ROCM, IS_MACOS)
 
 # load_tests from common_utils is used to automatically filter tests for
 # sharding on sandcastle. This line silences flake warnings
@@ -24,7 +24,7 @@
 
 TEST_REPEATS = 30
 HAS_SHM_FILES = os.path.isdir('/dev/shm')
-MAX_WAITING_TIME_IN_SECONDS = 5
+MAX_WAITING_TIME_IN_SECONDS = 30
 TEST_CUDA_IPC = torch.cuda.is_available() and \
     sys.platform != 'darwin' and \
     sys.platform != 'win32' and \
@@ -34,7 +34,7 @@
 
 class SubProcess(mp.Process):
     def __init__(self, tensor):
-        super(SubProcess, self).__init__()
+        super().__init__()
         self.tensor = tensor
         self.daemon = True
 
@@ -184,7 +184,7 @@ def fs_sharing():
         mp.set_sharing_strategy(prev_strategy)
 
 
-class leak_checker(object):
+class leak_checker:
 
     def __init__(self, test_case):
         self.checked_pids = [os.getpid()]
@@ -260,26 +260,41 @@ def test_fill():
             x = torch.zeros(5, 5).to(device, dtype)
             q = ctx.Queue()
             e = ctx.Event()
+
             data = [x, x[:, 1]]
             q.put(data)
+
             p = ctx.Process(target=simple_fill, args=(q, e))
             p.daemon = True
             lc.check_pid(p.pid)
             p.start()
-            e.wait(10)
-            self.assertTrue(e.is_set())
+
+            total_waiting_time = 0
+            waiting_time = 0.5
+            is_set = False
+            # Once the child process is done, it will set the event to notify the
+            # parent accordingly
+            while total_waiting_time <= MAX_WAITING_TIME_IN_SECONDS and not is_set:
+                time.sleep(waiting_time)
+                total_waiting_time += waiting_time
+                is_set = e.is_set()
+
+            self.assertTrue(is_set)
             self.assertTrue(data[0].eq(4).all())
             self.assertTrue(data[1].eq(4).all())
+
             p.join(100)
             self.assertFalse(p.is_alive())
 
         def test_receive():
             q = ctx.Queue()
             e = ctx.Event()
+
             p = ctx.Process(target=send_tensor, args=(q, e, device, dtype))
             p.daemon = True
             lc.check_pid(p.pid)
             p.start()
+
             t1 = q.get()
             t2 = q.get()
             self.assertTrue(t1.eq(1).all())
@@ -288,9 +303,12 @@ def test_receive():
             self.assertEqual(type(s1), type(s2))
             self.assertEqual(s1.data_ptr(), s1.data_ptr())
             self.assertEqual(s1, s2)
+
             # We need to delete this tensors to allow producer (child process)
             # collect them properly
             del t1, t2
+
+            # Mark the event as done and join the process
             e.set()
             p.join(100)
             self.assertFalse(p.is_alive())
@@ -358,7 +376,10 @@ def test_fd_pool(self):
                      "Fail to clean up temporary /dev/shm/torch_* file, see https://github.com/pytorch/pytorch/issues/91467")
     def test_fs_sharing(self):
         with fs_sharing():
-            self._test_sharing(repeat=TEST_REPEATS)
+            # The test works but is very slow on MacOS, see https://github.com/pytorch/pytorch/pull/93183,
+            # so run it only once there. The delay is in waiting for the child process to terminate (join)
+            repeat = 1 if IS_MACOS else TEST_REPEATS
+            self._test_sharing(repeat=repeat)
 
     @unittest.skipIf(TEST_WITH_TORCHDYNAMO,
                      "Fail to clean up temporary /dev/shm/torch_* file, see https://github.com/pytorch/pytorch/issues/91467")
diff --git a/test/test_multiprocessing_spawn.py b/test/test_multiprocessing_spawn.py
index d8483f115f54..5160056a87f7 100644
--- a/test/test_multiprocessing_spawn.py
+++ b/test/test_multiprocessing_spawn.py
@@ -87,7 +87,7 @@ def _test_nested(i, pids_queue, nested_child_sleep, start_method):
     # Kill self. This should take down the child processes as well.
     os.kill(os.getpid(), signal.SIGTERM)
 
-class _TestMultiProcessing(object):
+class _TestMultiProcessing:
     start_method = None
 
     def test_success(self):
diff --git a/test/test_namedtuple_return_api.py b/test/test_namedtuple_return_api.py
index 48782535a598..8330a6eb9565 100644
--- a/test/test_namedtuple_return_api.py
+++ b/test/test_namedtuple_return_api.py
@@ -13,7 +13,7 @@
 path = os.path.dirname(os.path.realpath(__file__))
 aten_native_yaml = os.path.join(path, '../aten/src/ATen/native/native_functions.yaml')
 all_operators_with_namedtuple_return = {
-    'max', 'min', 'aminmax', 'median', 'nanmedian', 'mode', 'kthvalue', 'svd', 'symeig',
+    'max', 'min', 'aminmax', 'median', 'nanmedian', 'mode', 'kthvalue', 'svd',
     'qr', 'geqrf', 'slogdet', 'sort', 'topk', 'linalg_inv_ex',
     'triangular_solve', 'cummax', 'cummin', 'linalg_eigh', "_linalg_eigh", "_unpack_dual", 'linalg_qr',
     'linalg_svd', '_linalg_svd', 'linalg_slogdet', '_linalg_slogdet', 'fake_quantize_per_tensor_affine_cachemask',
@@ -24,6 +24,10 @@
     '_linalg_det', '_lu_with_info', 'linalg_ldl_factor_ex', 'linalg_ldl_factor', 'linalg_solve_ex', '_linalg_solve_ex'
 }
 
+all_operators_with_namedtuple_return_skip_list = {
+    '_scaled_dot_product_flash_attention'
+}
+
 
 class TestNamedTupleAPI(TestCase):
 
@@ -39,7 +43,7 @@ def test_native_functions_yaml(self):
                 f = f['func']
                 ret = f.split('->')[1].strip()
                 name = regex.findall(f)[0][0]
-                if name in all_operators_with_namedtuple_return:
+                if name in all_operators_with_namedtuple_return :
                     operators_found.add(name)
                     continue
                 if '_backward' in name or name.endswith('_forward'):
@@ -48,6 +52,8 @@ def test_native_functions_yaml(self):
                     continue
                 if ret == '()':
                     continue
+                if name in all_operators_with_namedtuple_return_skip_list:
+                    continue
                 ret = ret[1:-1].split(',')
                 for r in ret:
                     r = r.strip()
@@ -77,7 +83,6 @@ def test_namedtuple_return(self):
             op(operators=['_linalg_slogdet'], input=(), names=('sign', 'logabsdet', 'LU', 'pivots'), hasout=True),
             op(operators=['qr', 'linalg_qr'], input=(), names=('Q', 'R'), hasout=True),
             op(operators=['geqrf'], input=(), names=('a', 'tau'), hasout=True),
-            op(operators=['symeig'], input=(True,), names=('eigenvalues', 'eigenvectors'), hasout=True),
             op(operators=['triangular_solve'], input=(a,), names=('solution', 'cloned_coefficient'), hasout=True),
             op(operators=['linalg_eig'], input=(), names=('eigenvalues', 'eigenvectors'), hasout=True),
             op(operators=['linalg_eigh'], input=("L",), names=('eigenvalues', 'eigenvectors'), hasout=True),
@@ -162,7 +167,7 @@ def check_torch_return_type(f, names):
                     ret3 = meth(*op.input)
                     check_namedtuple(ret3, op.names)
 
-        all_covered_operators = set([x for y in operators for x in y.operators])
+        all_covered_operators = {x for y in operators for x in y.operators}
 
         self.assertEqual(all_operators_with_namedtuple_return, all_covered_operators, textwrap.dedent('''
         The set of covered operators does not match the `all_operators_with_namedtuple_return` of
diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py
index 710753886315..28c3d9cea1f5 100644
--- a/test/test_nestedtensor.py
+++ b/test/test_nestedtensor.py
@@ -21,6 +21,7 @@
     IS_FBCODE,
     parametrize,
     run_tests,
+    skipIfSlowGradcheckEnv,
     subtest,
     TestCase,
 )
@@ -917,15 +918,28 @@ def test_nested_tensor_add(self, device, dtype):
     @torch.inference_mode()
     @parametrize("embedding_dim", [8, 128, 256, 384])
     def test_nested_tensor_dense_elementwise(self, device, dtype, embedding_dim):
+        def _test_add_mul(nt, t):
+            ref_add = torch.nested.nested_tensor(
+                [t1 + t2 for (t1, t2) in zip(nt.unbind(), t.unbind())])
+            ref_mul = torch.nested.nested_tensor(
+                [t1 * t2 for (t1, t2) in zip(nt.unbind(), t.unbind())])
+            self.assertEqual(nt.add(t), ref_add)
+            self.assertEqual(nt.mul(t), ref_mul)
+
         batch_size = 32
         seq_lens = torch.randint(low=0, high=10, size=(batch_size,))
+
+        # [B, *, D], [B, 1, D] case
         ts = [torch.randn((seq_len, embedding_dim)) for seq_len in seq_lens]
         nt = torch.nested.nested_tensor(ts, device=device, dtype=dtype)
         t = torch.randn((batch_size, 1, embedding_dim), device=device, dtype=dtype)
-        ref_add = torch.nested.nested_tensor([t1 + t2 for (t1, t2) in zip(nt.unbind(), t.unbind())])
-        ref_mul = torch.nested.nested_tensor([t1 * t2 for (t1, t2) in zip(nt.unbind(), t.unbind())])
-        self.assertEqual(nt.add(t), ref_add)
-        self.assertEqual(nt.mul(t), ref_mul)
+        _test_add_mul(nt, t)
+
+        # [B, *], [B, 1] case
+        ts = [torch.randn(seq_len) for seq_len in seq_lens]
+        nt = torch.nested.nested_tensor(ts, device=device, dtype=dtype)
+        t = torch.randn((batch_size, 1), device=device, dtype=dtype)
+        _test_add_mul(nt, t)
 
     @dtypes(torch.float, torch.float16)
     @skipMeta
@@ -1358,19 +1372,20 @@ def unbind_rebind_matmul(nt1, nt2):
             return torch.nested.nested_tensor(out_ts)
 
         # [N, n_head, *, head_dim], [N, n_head, head_dim, *]
-        N = np.random.randint(2, 5)
+        Ns = [1, 2, 5]
         n_heads = np.random.randint(2, 5)
         head_dim = 3
         t1s = []
         t2s = []
-        for _ in range(N):
-            seq_len1 = np.random.randint(2, 5)
-            seq_len2 = np.random.randint(2, 5)
-            t1s.append(torch.randn(n_heads, seq_len1, head_dim))
-            t2s.append(torch.randn(n_heads, head_dim, seq_len2))
-        nt1 = torch.nested.nested_tensor(t1s, device=device, dtype=dtype)
-        nt2 = torch.nested.nested_tensor(t2s, device=device, dtype=dtype)
-        self.assertEqual(torch.matmul(nt1, nt2), unbind_rebind_matmul(nt1, nt2))
+        for N in Ns:
+            for _ in range(N):
+                seq_len1 = np.random.randint(2, 5)
+                seq_len2 = np.random.randint(2, 5)
+                t1s.append(torch.randn(n_heads, seq_len1, head_dim))
+                t2s.append(torch.randn(n_heads, head_dim, seq_len2))
+            nt1 = torch.nested.nested_tensor(t1s, device=device, dtype=dtype)
+            nt2 = torch.nested.nested_tensor(t2s, device=device, dtype=dtype)
+            self.assertEqual(torch.matmul(nt1, nt2), unbind_rebind_matmul(nt1, nt2))
 
         # test with noncontiguous
         t3s = []
@@ -1641,14 +1656,29 @@ def test_squeeze_unsqueeze(self, device, dtype):
         self.assertEqual(nt, nt2)
 
         # test cases that should work
-        for i in range(-2, 3):
+        nt_sizes = nt._nested_tensor_size()
+        nt_strides = nt._nested_tensor_strides()
+        for i in range(-2, 4):
             if (i == 0):
+                # cannot unsqueeze batch dim
                 continue
             nt_unsqueezed = nt.unsqueeze(i)
-            size_idx = i if i < 0 else i - 1
+            # negative dim will correspond to unsqueeze() applied at dim = dim + nt.dim() + 1
+            wrapped_i = i + nt.dim() + 1 if i < 0 else i
+            # col_index into nt size tensor is requires subtraction of 1 to ignore batch dim
+            size_idx = wrapped_i - 1
             self.assertEqual(nt_unsqueezed._nested_tensor_size()[:, size_idx], torch.ones(2, dtype=torch.long))
+            unsqueezed_stride = nt_unsqueezed._nested_tensor_strides()[:, size_idx]
+            if (i == nt.ndim or i == -1):
+                self.assertEqual(unsqueezed_stride, torch.ones(2, dtype=torch.long))
+            else:
+                stride_col_after = nt_strides[:, size_idx]
+                size_col_after = nt_sizes[:, size_idx]
+                self.assertEqual(unsqueezed_stride, stride_col_after * size_col_after)
             nt_squeezed = nt_unsqueezed.squeeze(i)
             self.assertEqual(nt_squeezed, nt)
+            self.assertEqual(nt_squeezed._nested_tensor_size(), nt_sizes)
+            self.assertEqual(nt_squeezed._nested_tensor_strides(), nt_strides)
 
     @dtypes(torch.float, torch.float16, torch.double)
     def test_transpose_inference_mode_interaction(self, device, dtype):
@@ -1805,7 +1835,7 @@ def test_scaled_dot_product_attention(self, device, input_dim):
         def rand_tensor(*shape):
             return torch.randn(shape, device=device)
 
-        E = 10
+        E = 8
         if input_dim == 3:
             # Shape: (N, L, E); ragged L
             query = torch.nested.nested_tensor([rand_tensor(2, E), rand_tensor(3, E), rand_tensor(4, E)])
@@ -1814,6 +1844,7 @@ def rand_tensor(*shape):
             key = torch.nested.nested_tensor([rand_tensor(3, E), rand_tensor(4, E), rand_tensor(5, E)])
             value = torch.nested.nested_tensor([rand_tensor(3, E), rand_tensor(4, E), rand_tensor(5, E)])
         elif input_dim == 4:
+            # In the 4D case the L and S is ragged
             # Shape: (N, N', L, E); ragged N' and L
             query = torch.nested.nested_tensor([rand_tensor(2, 2, E), rand_tensor(3, 3, E), rand_tensor(4, 4, E)])
             # Shape: (N, N', S, E); ragged N' and S
@@ -1829,34 +1860,28 @@ def rand_mask(size):
         attn_mask = torch.nested.nested_tensor([rand_mask((2, 3)), rand_mask((3, 4)), rand_mask((4, 5))])
 
         dropout_p = 0.0  # no dropout for reproducibility
-        need_attn_weights: bool = True
 
         # Success case: no attn_mask set and is_causal=False.
-        actual = torch.ops.aten._scaled_dot_product_attention(
-            query, key, value, attn_mask=None, dropout_p=dropout_p, need_attn_weights=need_attn_weights)
+        actual = torch.nn.functional.scaled_dot_product_attention(
+            query, key, value, attn_mask=None, is_causal=False, dropout_p=dropout_p)
 
         expected_outputs = []
-        expected_attn_weights = []
         for q, k, v in zip(query.unbind(), key.unbind(), value.unbind()):
-            (output, attn_weights) = torch.ops.aten._scaled_dot_product_attention(
-                q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0), attn_mask=None, dropout_p=dropout_p,
-                need_attn_weights=need_attn_weights)
+            output = torch.nn.functional.scaled_dot_product_attention(
+                q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0), attn_mask=None, dropout_p=dropout_p)
             expected_outputs.append(output.squeeze(0))
-            expected_attn_weights.append(attn_weights.squeeze(0))
         expected_output_nested = torch.nested.nested_tensor(expected_outputs)
-        expected_attn_weight_nested = torch.nested.nested_tensor(expected_attn_weights)
-        self.assertEqual(actual[0], expected_output_nested)
-        self.assertEqual(actual[1], expected_attn_weight_nested)
+        self.assertEqual(actual, expected_output_nested)
 
         # Error case: explicit attn_mask set.
         with self.assertRaisesRegex(RuntimeError, "not supported when an explicit attn_mask is set"):
-            torch.ops.aten._scaled_dot_product_attention(
-                query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, need_attn_weights=need_attn_weights)
+            torch.nn.functional.scaled_dot_product_attention(
+                query, key, value, attn_mask=attn_mask, dropout_p=dropout_p)
 
         # Error case: is_causal=True.
         with self.assertRaisesRegex(RuntimeError, "not supported when is_causal=True"):
-            torch.ops.aten._scaled_dot_product_attention(
-                query, key, value, dropout_p=dropout_p, need_attn_weights=need_attn_weights, is_causal=True)
+            torch.nn.functional.scaled_dot_product_attention(
+                query, key, value, dropout_p=dropout_p, is_causal=True)
 
     @dtypes(torch.float, torch.float16, torch.double)
     def test_empty_like(self, device, dtype):
@@ -2227,6 +2252,27 @@ def grad_test_func(a, b, c, weight, bias=None):
         data = (a, b, c, weight)
         assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
 
+    def test_nested_tensor_linear_plus_transpose(self, device):
+        a = torch.randn(1, 2, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(2, 2, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(3, 2, requires_grad=True, dtype=torch.float64, device=device)
+
+        weight = torch.randn(2, 2, requires_grad=True, dtype=torch.float64, device=device)
+        bias = torch.randn(2, requires_grad=True, dtype=torch.float64, device=device)
+
+        def grad_test_func(a, b, c, weight, bias=None):
+            nt = torch.nested.as_nested_tensor([a, b, c])
+            # This implicitly tests to_padded_tensor grads
+            d = torch.functional.F.linear(nt, weight, bias)
+            d = d.transpose(-1, -2).contiguous()
+            return torch.nested.to_padded_tensor(d, 0)
+        data = (a, b, c, weight, bias)
+        assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
+
+        # Test linear with no bias added
+        data = (a, b, c, weight)
+        assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
+
     def test_nested_tensor_softmax(self, device):
         a = torch.randn(1, 2, requires_grad=True, dtype=torch.float64, device=device)
         b = torch.randn(2, 2, requires_grad=True, dtype=torch.float64, device=device)
@@ -2316,6 +2362,67 @@ def test_indexing_backward(self, device):
         expected_grad = torch.nested.nested_tensor([grad_x0, torch.zeros((3, 4), device=device)])
         self.assertEqual(nt.grad, expected_grad)
 
+    def test_gelu_backward(self, device):
+        a = torch.randn(1, 2, 4, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(2, 2, 4, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(3, 2, 4, requires_grad=True, dtype=torch.float64, device=device)
+
+        def grad_test_func(a, b, c):
+            nt = torch.nested.as_nested_tensor([a, b, c])
+            nt_gelu = torch.nn.functional.gelu(nt)
+            return torch.nested.to_padded_tensor(nt_gelu, 0)
+
+        data = (a, b, c)
+        assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
+
+    def test_relu_backward(self, device):
+        a = torch.randn(1, 2, 4, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(2, 2, 4, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(3, 2, 4, requires_grad=True, dtype=torch.float64, device=device)
+
+        def grad_test_func(a, b, c):
+            nt = torch.nested.as_nested_tensor([a, b, c])
+            nt_relu = torch.nn.functional.relu(nt)
+            return torch.nested.to_padded_tensor(nt_relu, 0)
+
+        data = (a, b, c)
+        assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
+
+    # TODO: OOM https://github.com/pytorch/pytorch/issues/95562
+    @skipIfSlowGradcheckEnv
+    @parametrize("size", [1024, 1023, 513, 512, 256, 128, 32, 4, 2])
+    def test_layer_norm_backward(self, device, size):
+        a = torch.randn(1, 2, size, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(2, 2, size, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(3, 2, size, requires_grad=True, dtype=torch.float64, device=device)
+
+        def grad_test_func(a, b, c):
+            nt = torch.nested.as_nested_tensor([a, b, c])
+            layer_norm = torch.nn.LayerNorm(nt.size(-1), device=device, dtype=torch.float64)
+            nt_layer_norm = layer_norm(nt)
+            return torch.nested.to_padded_tensor(nt_layer_norm, 0)
+
+        data = (a, b, c)
+        assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
+
+    # TODO: OOM https://github.com/pytorch/pytorch/issues/95562
+    @skipIfSlowGradcheckEnv
+    # Could either mark slow or reduce size
+    @parametrize("size", [128, 32, 4, 2])
+    def test_layer_norm_backward_5d(self, device, size):
+        a = torch.randn(4, size, size, 4, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(7, size, size, 4, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(10, size, size, 4, requires_grad=True, dtype=torch.float64, device=device)
+
+        def grad_test_func(a, b, c):
+            nt = torch.nested.as_nested_tensor([a, b, c])
+            layer_norm = torch.nn.LayerNorm((size, size, nt.size(-1)), device=device, dtype=torch.float64)
+            nt_layer_norm = layer_norm(nt)
+            return torch.nested.to_padded_tensor(nt_layer_norm, 0)
+
+        data = (a, b, c)
+        assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
+
 
 instantiate_parametrized_tests(TestNestedTensor)
 instantiate_device_type_tests(TestNestedTensorDeviceType, globals())
diff --git a/test/test_nn.py b/test/test_nn.py
index 90bafbb4e59d..fe7593a33fbd 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -21,7 +21,7 @@
 # NN tests use double as the default dtype
 torch.set_default_dtype(torch.double)
 
-from torch._six import inf, nan
+from torch import inf, nan
 import torch.autograd.forward_ad as fwAD
 import torch.backends.cudnn as cudnn
 import torch.nn as nn
@@ -39,7 +39,7 @@
     download_file, get_function_arglist, load_tests, skipIfMps,\
     TEST_WITH_UBSAN, IS_PPC, \
     parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \
-    skipIfTorchDynamo, IS_WINDOWS
+    skipIfTorchDynamo, IS_WINDOWS, gcIfJetson
 from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, TEST_CUDNN_VERSION
 from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \
     module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \
@@ -165,10 +165,37 @@ def test_module_backcompat(self):
         input = torch.randn(2, 3, dtype=torch.float)
         self.assertEqual(m(input).size(), (2, 5))
 
+    def test_module_super_init(self):
+        class MyMixin:
+            def __init__(self, *a, **kw):
+                super().__init__(*a, **kw)
+                self.mixin_init = True
+
+        class MyModuleWithMixinBefore(MyMixin, nn.Module):
+            pass
+
+        class MyModuleWithMixinAfter(nn.Module, MyMixin):
+            pass
+
+        self.assertTrue(hasattr(MyModuleWithMixinBefore(), 'mixin_init'))
+        self.assertFalse(hasattr(MyModuleWithMixinAfter(), 'mixin_init'))
+
+        nn.Module.call_super_init = True
+        self.assertTrue(hasattr(MyModuleWithMixinBefore(), 'mixin_init'))
+        self.assertTrue(hasattr(MyModuleWithMixinAfter(), 'mixin_init'))
+        nn.Module.call_super_init = False
+
+        MyModuleWithMixinBefore.call_super_init = True
+        MyModuleWithMixinAfter.call_super_init = True
+        self.assertTrue(hasattr(MyModuleWithMixinBefore(), 'mixin_init'))
+        self.assertTrue(hasattr(MyModuleWithMixinAfter(), 'mixin_init'))
+        MyModuleWithMixinBefore.call_super_init = False
+        MyModuleWithMixinAfter.call_super_init = False
+
     def test_share_memory(self):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.p = nn.Parameter(torch.eye(5))
                 self.par = nn.ParameterList()
                 self.par.append(nn.Parameter(torch.randn(10)))
@@ -219,25 +246,24 @@ def test_zero_grad(self):
         self.assertIsNotNone(module.weight.grad)
         self.assertGreater(module.weight.grad.data.abs().sum(), 0)
         module.zero_grad()
-        self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
+        self.assertIsNone(module.weight.grad)
 
         module.bias.requires_grad = True
         module.zero_grad()
-        self.assertIsNotNone(module.weight.grad)
+        self.assertIsNone(module.weight.grad)
         self.assertIsNone(module.bias.grad)
         module(i).sum().backward()
         self.assertIsNotNone(module.weight.grad)
         self.assertIsNotNone(module.bias.grad)
         self.assertGreater(module.weight.grad.data.abs().sum(), 0)
         self.assertGreater(module.bias.grad.data.abs().sum(), 0)
-        module.zero_grad()
+        module.zero_grad(set_to_none=False)   # Force set to zeros.
         self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
         self.assertEqual(module.bias.grad.data, module.bias.data.clone().zero_())
 
-        # Force set to None.
-        module.zero_grad(set_to_none=True)
+        module.zero_grad()
         self.assertIsNone(module.weight.grad)
-
+        self.assertIsNone(module.bias.grad)
 
     def test_no_grad(self):
         for dtype in [torch.bfloat16, torch.float, torch.double]:
@@ -351,7 +377,7 @@ def __init__(self):
     def test_call_supports_python_dict_output(self):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.l1 = nn.Linear(10, 20)
                 self.register_backward_hook(self.hook)
                 self.check_backward_hook_flag = False
@@ -379,7 +405,7 @@ def test_children(self):
     def test_train_errors_for_invalid_mode(self):
         class SubclassNet(nn.Module):
             def __init__(self):
-                super(SubclassNet, self).__init__()
+                super().__init__()
                 self.l1 = nn.Linear(2, 2)
 
             def forward(self, inputs):
@@ -452,7 +478,7 @@ def test_named_children(self):
     def test_modules(self):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.l1 = l
                 self.l2 = l
                 self.param = torch.empty(3, 5)
@@ -465,7 +491,7 @@ def __init__(self):
     def test_named_modules(self):
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.l1 = l
                 self.l2 = l
                 self.param = torch.empty(3, 5)
@@ -2444,7 +2470,7 @@ def test_load_state_dict_custom(self):
 
         class CustomState(nn.Module):
             def __init__(self):
-                super(CustomState, self).__init__()
+                super().__init__()
                 self.param = torch.nn.Parameter(torch.ones(1))
                 self.sub = torch.nn.Linear(5, 5)
 
@@ -2534,9 +2560,6 @@ def set_extra_state(self, state):
     def test_extra_state_missing_set_extra_state(self):
 
         class MyModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def get_extra_state(self):
                 return {
                     'foo': 5
@@ -2549,9 +2572,6 @@ def get_extra_state(self):
     def test_extra_state_missing_get_extra_state(self):
 
         class MyModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def set_extra_state(self):
                 pass
 
@@ -2662,7 +2682,7 @@ def test_assignments(get_list, a, b, c):
     def test_container_copy(self):
         class Model(nn.Module):
             def __init__(self):
-                super(Model, self).__init__()
+                super().__init__()
                 self.linear = nn.Linear(4, 5)
 
             def forward(self, input):
@@ -4925,7 +4945,13 @@ def helper(self, size, dtype, mixed_dtype=False):
             helper(self, shape, torch.bfloat16, False)
             helper(self, shape, torch.bfloat16, True)
 
-    @parametrize_test('bn_module', [torch.nn.BatchNorm2d, torch.nn.SyncBatchNorm])
+    @parametrize_test(
+        'bn_module',
+        [
+            subtest(torch.nn.BatchNorm2d, name="BatchNorm2d"),
+            subtest(torch.nn.SyncBatchNorm, name="SyncBatchNorm"),
+        ],
+    )
     def test_batchnorm_non_contig_cpu(self, bn_module):
         input = torch.arange(6, dtype=torch.float).reshape(1, 3, 2, 1).cpu()
         input = input.permute(0, 2, 1, 3)
@@ -5396,8 +5422,8 @@ def test_cosine_similarity(self):
         self.assertEqual(F.cosine_similarity(input1, input2, dim=1).size(), expected_size)
 
         # Check numerical precision, issue #18057
-        vv1 = torch.tensor(list([float(i) for i in range(84)])).unsqueeze(0)
-        vv2 = torch.tensor(list([float(i) for i in range(84)])).unsqueeze(0)
+        vv1 = torch.tensor([float(i) for i in range(84)]).unsqueeze(0)
+        vv2 = torch.tensor([float(i) for i in range(84)]).unsqueeze(0)
         out = F.cosine_similarity(vv1, vv2)
         self.assertLessEqual(out, 1.0)
 
@@ -6368,6 +6394,11 @@ def test_interpolate_illegal_memory_access(self):
         self.assertEqual(out_ref, out)
         self.assertEqual(input_ref.grad, input.grad)
 
+    def test_interpolate_undefined_behavior_casting(self):
+        x = torch.ones([1, 1, 16, 16])
+        self.assertRaises(RuntimeError, lambda: F.interpolate(x, scale_factor=-1e20, mode="bilinear"))
+        self.assertRaises(RuntimeError, lambda: F.interpolate(x, scale_factor=1e20, mode="bilinear"))
+
     def test_interpolate_buffer_overflow(self):
         # Test buffer overflow issue due to inaccurate floating point
         # representation for integer values. See issue below for details.
@@ -7245,7 +7276,7 @@ def sum_reduction_constructor(*args, **kwargs):
 
 class UnpoolingNet(nn.Module):
     def __init__(self, pool, unpool):
-        super(UnpoolingNet, self).__init__()
+        super().__init__()
         self.pool = pool
         self.unpool = unpool
 
@@ -7624,10 +7655,28 @@ def _test_LayerNorm_cpu_mixed_dtype(self, device):
             # so make sure n exceeds vector length
             input = torch.empty(2, 3, 11, 3, device=device, dtype=torch.bfloat16).random_(1, 10)
             m = nn.LayerNorm([11, 3], elementwise_affine=elementwise_affine).to(device, torch.bfloat16)
-            m2 = deepcopy(m).to(device, torch.float)
-            out = m(input)
-            out2 = m2(input)
-            self.assertEqual(out, out2)
+
+            # fp32
+            m_fp32 = deepcopy(m).to(device, torch.float)
+            x_fp32 = input.clone().detach().float().requires_grad_()
+            out_fp32 = m_fp32(x_fp32)
+            out_fp32.sum().backward()
+
+            # bf16
+            m_bf16 = deepcopy(m)
+            x_bf16 = input.clone().detach().requires_grad_()
+            out_bf16 = m_bf16(x_bf16)
+            out_bf16.sum().backward()
+
+            # bf16 mixed type
+            m_mix = deepcopy(m).to(device, torch.float)
+            x_mix = input.clone().detach().requires_grad_()
+            out_mix = m_mix(x_mix)
+            out_mix.sum().backward()
+            self.assertEqual(out_fp32.bfloat16(), out_bf16)
+            self.assertEqual(out_fp32.bfloat16(), out_mix)
+            self.assertEqual(x_fp32.grad.bfloat16(), x_bf16.grad, atol=1e-1, rtol=1e-1)
+            self.assertEqual(x_fp32.grad.bfloat16(), x_mix.grad, atol=1e-1, rtol=1e-1)
 
     def _test_GroupNorm_general(self, device, dtype=torch.float):
         good_shape_g = {
@@ -8028,6 +8077,11 @@ def help(input, conv, memory_format):
         weight = weight.contiguous()
         out_ref = F.conv2d(input2d, weight, bias, (1, 1), 0, (1, 1), 1)
         self.assertEqual(out_ref, out)
+        # sigfpe reported in https://github.com/pytorch/pytorch/issues/94125
+        with self.assertRaises(RuntimeError):
+            inp = torch.empty([1, 1, 1, 0], dtype=dtype, device=device)
+            weight = torch.empty([1, 0, 1], dtype=dtype, device=device)
+            torch._C._nn.slow_conv3d(inp, weight, 1)
 
     def test_InstanceNorm1d_general(self, device):
         b = random.randint(3, 5)
@@ -8249,9 +8303,7 @@ def helper(input_format, grad_format, B=2, C=4, W=4, H=4):
             y_orig.backward(grad_orig)
 
             self.assertEqual(y, y_orig)
-            # TODO: Fix me, CPU should produce valid results here, but it is not
-            if device != "cpu":
-                self.assertEqual(x.grad, x_orig.grad)
+            self.assertEqual(x.grad, x_orig.grad)
 
         for input_format in [torch.contiguous_format, torch.channels_last]:
             for grad_format in [torch.contiguous_format, torch.channels_last]:
@@ -9330,67 +9382,67 @@ def helper(memory_format, isize, osize):
     @parametrize_test("antialias", [True, False])
     @parametrize_test("align_corners", [True, False])
     @parametrize_test("mode", ["bilinear", "bicubic"])
+    @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
     @onlyNativeDeviceTypes
-    def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode):
+    def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode, memory_format):
         # Forward AD does not support XLA because XLA tensors don't have storage
         check_forward_ad = torch.device(device).type != 'xla'
 
         kwargs = dict(mode=mode, align_corners=align_corners, antialias=antialias)
-        for memory_format in [torch.contiguous_format, torch.channels_last]:
-            # test float scale factor up & downsampling
-            for scale_factor in [0.5, 1.5, 2]:
-                in_t = torch.ones(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_()
-                out_size = int(math.floor(in_t.shape[-1] * scale_factor))
-                with warnings.catch_warnings(record=True) as w:
-                    out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs)
-                expected_out = torch.ones(2, 3, out_size, out_size, device=device)
-                self.assertEqual(expected_out, out_t)
-                # Assert that memory format is carried through to the output
-                self.assertTrue(out_t.is_contiguous(memory_format=memory_format))
-                out_t.backward(torch.randn_like(out_t))
-                self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format))
+        # test float scale factor up & downsampling
+        for scale_factor in [0.5, 1.5, 2]:
+            in_t = torch.ones(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_()
+            out_size = int(math.floor(in_t.shape[-1] * scale_factor))
+            with warnings.catch_warnings(record=True) as w:
+                out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs)
+            expected_out = torch.ones(2, 3, out_size, out_size, device=device)
+            self.assertEqual(expected_out, out_t)
+            # Assert that memory format is carried through to the output
+            self.assertTrue(out_t.is_contiguous(memory_format=memory_format))
+            out_t.backward(torch.randn_like(out_t))
+            self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format))
 
-                if torch.device(device).type == 'cuda':
-                    # Bilinear backward is nondeterministic because of atomicAdd usage
-                    nondet_tol = 1e-5
-                else:
-                    nondet_tol = 0.0
+            if torch.device(device).type == 'cuda':
+                # Bilinear backward is nondeterministic because of atomicAdd usage
+                nondet_tol = 1e-5
+            else:
+                nondet_tol = 0.0
 
-                input = torch.randn(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_()
-                gradcheck(
-                    lambda x: F.interpolate(x, out_size, **kwargs),
-                    [input],
-                    check_forward_ad=check_forward_ad, nondet_tol=nondet_tol
-                )
-                gradgradcheck(
-                    lambda x: F.interpolate(x, out_size, **kwargs),
-                    [input],
-                    check_fwd_over_rev=check_forward_ad, nondet_tol=nondet_tol
-                )
+            input = torch.randn(2, 3, 8, 8, device=device).contiguous(memory_format=memory_format).requires_grad_()
+            gradcheck(
+                lambda x: F.interpolate(x, out_size, **kwargs),
+                [input],
+                check_forward_ad=check_forward_ad, nondet_tol=nondet_tol
+            )
+            gradgradcheck(
+                lambda x: F.interpolate(x, out_size, **kwargs),
+                [input],
+                check_fwd_over_rev=check_forward_ad, nondet_tol=nondet_tol
+            )
 
-                # Assert that cpu and cuda give same results
-                if torch.device(device).type == 'cuda':
-                    for shapes in [
-                        (2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2)
-                    ]:
-                        a_cuda = torch.randn(
-                            *shapes, device=device
-                        ).contiguous(memory_format=memory_format).requires_grad_()
-                        a_cpu = a_cuda.detach().cpu().requires_grad_()
+            # Assert that cpu and cuda give same results
+            if torch.device(device).type == 'cuda':
+                for shapes in [
+                    (2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2)
+                ]:
+                    a_cuda = torch.randn(
+                        *shapes, device=device
+                    ).contiguous(memory_format=memory_format).requires_grad_()
+                    a_cpu = a_cuda.detach().cpu().requires_grad_()
 
-                        with warnings.catch_warnings(record=True):
-                            out_cuda = F.interpolate(a_cuda, scale_factor=scale_factor, **kwargs)
-                            out_cpu = F.interpolate(a_cpu, scale_factor=scale_factor, **kwargs)
+                    with warnings.catch_warnings(record=True):
+                        out_cuda = F.interpolate(a_cuda, scale_factor=scale_factor, **kwargs)
+                        out_cpu = F.interpolate(a_cpu, scale_factor=scale_factor, **kwargs)
 
-                        self.assertEqual(out_cpu, out_cuda.cpu())
+                    self.assertEqual(out_cpu, out_cuda.cpu())
 
-                        g_cuda = torch.randn_like(out_cuda)
-                        g_cpu = g_cuda.cpu()
+                    g_cuda = torch.randn_like(out_cuda)
+                    g_cpu = g_cuda.cpu()
 
-                        out_cuda.backward(g_cuda)
-                        out_cpu.backward(g_cpu)
+                    out_cuda.backward(g_cuda)
+                    out_cpu.backward(g_cpu)
 
-                        self.assertEqual(a_cuda.grad, a_cpu.grad)
+                    self.assertEqual(a_cuda.grad, a_cpu.grad)
 
     @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
     def test_upsamplingBilinear2d_aa_correctness(self, device, memory_format):
@@ -9408,6 +9460,40 @@ def test_upsamplingBilinear2d_aa_correctness(self, device, memory_format):
         t_out = F.interpolate(t_in, size=(2, 2), mode="bilinear", align_corners=False, antialias=True)
         self.assertEqual(expected_out, t_out)
 
+    @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
+    @parametrize_test("antialias", [True, False])
+    @parametrize_test("align_corners", [True, False])
+    @parametrize_test("num_channels", [3, 5])
+    @parametrize_test("output_size", [32, 600])
+    def test_upsamplingBiLinear2d_consistency(self, device, memory_format, antialias, align_corners, num_channels, output_size):
+        if torch.device(device).type == "cuda":
+            raise SkipTest("CUDA implementation is not yet supporting uint8")
+
+        mode = "bilinear"
+        # Check if Max Abs Error between resized input_uint8 and resized input_float is smaller than a tolerated value, e.g. 1.0
+        input_ui8 = torch.randint(0, 256, size=(1, num_channels, 400, 400), dtype=torch.uint8, device=device)
+        input_ui8 = input_ui8.contiguous(memory_format=memory_format)
+        input_f32 = input_ui8.float()
+
+        output_f32 = F.interpolate(
+            input_f32, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias
+        )
+        output_ui8 = F.interpolate(
+            input_ui8, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias
+        )
+
+        mae_tol = 0.5
+        max_abs_err_tol = 1.0
+        num_wrong_pixels_tol = 5
+
+        abs_diff = torch.abs(output_f32.round() - output_ui8.float())
+        mae = torch.mean(abs_diff)
+        max_abs_err = torch.max(abs_diff)
+        num_wrong_pixels = (abs_diff > max_abs_err_tol).sum()
+        self.assertTrue(mae < mae_tol, msg=f"mae={mae}")
+        self.assertTrue(max_abs_err < max_abs_err_tol + 1e-5, msg=f"max ae={max_abs_err}")
+        self.assertTrue(num_wrong_pixels < num_wrong_pixels_tol, msg=f"num_wrong_pixels={num_wrong_pixels}")
+
     def test_upsamplingBicubic2d_correctness(self, device):
         # test output against known input: align_corners=False result must match opencv
         in_t = torch.arange(8., device=device).view(1, 2, 2, 2)
@@ -9539,6 +9625,7 @@ def slow_masked_softmax(input, mask):
                     )
 
     @onlyCUDA
+    @gcIfJetson
     def test_masked_softmax_devices_parity(self):
         # Test that softmax with mask type 0 (LxL attention mask), mask type 1 (BxL padding mask),
         # and mask type 2 (BxHxLxL generic mask) gives the same result on CPU and on CUDA.
@@ -10134,6 +10221,7 @@ def test_upsamplingNearest2d_launch_config(self, device):
         self.assertEqual(out_ref, out)
 
     @onlyCUDA
+    @gcIfJetson
     def test_upsamplingNearest3d_launch_config(self, device):
         m = nn.Upsample(scale_factor=2)
         inp = torch.rand(2**25, 1, 1, 1, 1, device=device)
@@ -11427,7 +11515,7 @@ def run_test_case(norm_type, error_if_nonfinite, scalar, grad_only_one_elem, pre
     def test_clip_grad_norm_multi_device(self, devices, foreach):
         class TestModel(nn.Module):
             def __init__(self):
-                super(TestModel, self).__init__()
+                super().__init__()
                 self.layer1 = nn.Linear(10, 10)
                 self.layer2 = nn.Linear(10, 10)
 
diff --git a/test/test_nnapi.py b/test/test_nnapi.py
index 60f2c8971236..ebc066dd8ebd 100644
--- a/test/test_nnapi.py
+++ b/test/test_nnapi.py
@@ -393,9 +393,6 @@ def forward(self, x):
 
     def test_detach(self):
         class DetachModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 y = x.detach()
                 return torch.nn.functional.relu(y)
diff --git a/test/test_nvfuser_dynamo.py b/test/test_nvfuser_dynamo.py
index e59ead80fe13..d4a67db02d81 100644
--- a/test/test_nvfuser_dynamo.py
+++ b/test/test_nvfuser_dynamo.py
@@ -1,148 +1,11 @@
 # Owner(s): ["module: nvfuser"]
 
-import unittest
-import warnings
-from functools import partial
-
-import torch
-import torch._dynamo as torchdynamo
-from torch.testing import make_tensor
-from torch.testing._internal.common_utils import (
-    IS_WINDOWS,
-    run_tests,
-    skipIfTorchDynamo,
-    TEST_WITH_ROCM,
-    TestCase,
-)
-from torch.testing._internal.jit_utils import RUN_CUDA
-
-RUN_NVFUSER = RUN_CUDA and not TEST_WITH_ROCM
-
-
-def is_pre_volta():
-    if not RUN_NVFUSER:
-        return False
-    prop = torch.cuda.get_device_properties(torch.cuda.current_device())
-    return prop.major < 7
-
-
-def is_networkx_available():
-    try:
-        import networkx  # noqa: F401
-
-        return True
-    except ImportError:
-        return False
-
-
-@skipIfTorchDynamo("Not a suitable test for TorchDynamo")
-@unittest.skipIf(IS_WINDOWS, "TorchDynamo is not supported on Windows")
-@unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-@unittest.skipIf(is_pre_volta(), "Only supported on Volta and newer devices.")
-class TestNvFuserDynamo(TestCase):
-    def test_basic(self):
-        input1 = make_tensor((2, 4, 8), device="cuda", dtype=torch.float32)
-        input2 = make_tensor((2, 4, 8), device="cuda", dtype=torch.float32)
-
-        @torchdynamo.optimize("nvprims_nvfuser")
-        def func(a, b):
-            return a.sin() + b.cos()
-
-        # No warnings and no errors
-        with warnings.catch_warnings(record=True) as w:
-            nvfuser_result = func(input1, input2)
-            self.assertEqual(len(w), 0)
-        eager_result = func.__wrapped__(input1, input2)
-        self.assertEqual(eager_result, nvfuser_result)
-
-    @unittest.skipIf(not is_networkx_available(), "networkx not available")
-    def test_min_cut(self):
-        from functorch.compile import default_partition
-        from torch._dynamo.optimizations.training import nvprims_fw_bw_partition_fn
-
-        def get_fw_bw_graph(f, inps, partitioner):
-            from functorch.compile import aot_function
-
-            # Helper functions are taken from functorch/test_aotdispatch.py
-            def extract_graph(fx_g, _, graph_cell):
-                graph_cell[0] = fx_g
-                return fx_g
-
-            fw_graph_cell = [None]
-            bw_graph_cell = [None]
-            aot_function(
-                f,
-                fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell),
-                bw_compiler=partial(extract_graph, graph_cell=bw_graph_cell),
-                partition_fn=partitioner,
-            )(*inps).sum().backward()
-            return (fw_graph_cell[0], bw_graph_cell[0])
-
-        def get_ins_outs(fx_g):
-            ins = []
-            outs = []
-            for n in fx_g.graph.nodes:
-                if n.op == "placeholder":
-                    ins.append(n)
-                elif n.op == "output":
-                    outs = tuple(n.args[0])
-            return ins, outs
-
-        def get_num_ins_outs(fx_g):
-            return tuple(len(i) for i in get_ins_outs(fx_g))
-
-        def func(x):
-            return x * x * x
-
-        input1 = make_tensor(
-            (3,), device="cpu", dtype=torch.float32, requires_grad=True
-        )
-        fw_graph, bw_graph = get_fw_bw_graph(func, [input1], default_partition)
-        self.assertEqual(get_num_ins_outs(fw_graph), (1, 3))
-        self.assertEqual(get_num_ins_outs(bw_graph), (3, 1))
-
-        input1 = make_tensor(
-            (3,), device="cpu", dtype=torch.float32, requires_grad=True
-        )
-        fw_graph, bw_graph = get_fw_bw_graph(func, [input1], nvprims_fw_bw_partition_fn)
-        self.assertEqual(get_num_ins_outs(fw_graph), (1, 2))
-        self.assertEqual(get_num_ins_outs(bw_graph), (2, 1))
-
-    def test_batch_norm_implicit_dtype_promotion(self):
-        input1 = make_tensor((2, 3, 4, 5), device="cuda", dtype=torch.float32)
-        input2 = make_tensor((5, 5), device="cuda", dtype=torch.float32)
-        w = make_tensor((3), device="cuda", dtype=torch.float32)
-        b = make_tensor((3), device="cuda", dtype=torch.float32)
-
-        @torchdynamo.optimize("nvprims_nvfuser")
-        def func(mat1, mat2, w, b):
-            o = torch.matmul(mat1, mat2)
-            return torch.batch_norm(o, w, b, None, None, True, 1e-2, 1e-5, True)
-
-        # No warnings and no errors
-        with torch.cuda.amp.autocast():
-            with warnings.catch_warnings(record=True) as warning:
-                nvfuser_result = func(input1, input2, w, b)
-                self.assertEqual(len(warning), 0)
-            eager_result = func.__wrapped__(input1, input2, w, b)
-            self.assertEqual(eager_result, nvfuser_result)
-
-    def test_dtype_correctness(self):
-        input1 = make_tensor((2, 4, 8), device="cuda", dtype=torch.float16)
-
-        @torchdynamo.optimize("nvprims_nvfuser")
-        def func(a):
-            tmp = a + 1.0
-            # nvfuser would promote output to fp32 in math, FusionDefinition should cast output dtype back
-            return torch.where(tmp > 0, tmp, 0.0)
-
-        # No warnings and no errors
-        with warnings.catch_warnings(record=True) as w:
-            nvfuser_result = func(input1)
-            self.assertEqual(len(w), 0)
-        eager_result = func.__wrapped__(input1)
-        self.assertEqual(eager_result, nvfuser_result)
-
-
-if __name__ == "__main__":
+try:
+    from _nvfuser.test_dynamo import *  # noqa: F403,F401
+except ImportError:
+    def run_tests():
+        return
+    pass
+
+if __name__ == '__main__':
     run_tests()
diff --git a/test/test_nvfuser_frontend.py b/test/test_nvfuser_frontend.py
index 9974eb29c727..c530209a7a84 100644
--- a/test/test_nvfuser_frontend.py
+++ b/test/test_nvfuser_frontend.py
@@ -1,366 +1,11 @@
 # Owner(s): ["module: nvfuser"]
 
-import unittest
-from typing import List
-
-import torch
-from torch.testing._internal.common_utils import run_tests, TEST_WITH_ROCM, TestCase
-from torch.testing._internal.jit_utils import RUN_CUDA
-import torch._refs as refs
-import torch._prims as prims
-
-# Will only create the _nvfuser module if CUDA is available
-if hasattr(torch._C, "_nvfuser"):
-    from torch._C._nvfuser import Fusion, FusionCache, FusionDefinition, DataType
-
-RUN_NVFUSER = RUN_CUDA and not TEST_WITH_ROCM
-
-def is_pre_volta():
-    if not RUN_NVFUSER:
-        return False
-    prop = torch.cuda.get_device_properties(torch.cuda.current_device())
-    return prop.major < 7
-
-@unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-@unittest.skipIf(is_pre_volta(), "Only supported on Volta and newer devices.")
-class TestNvFuserFrontend(TestCase):
-    def test_basic(self) :
-        input1 = torch.ones(2, 4, 8, device='cuda')
-        input2 = torch.ones(2, 4, 8, device='cuda')
-        fc = FusionCache.get()
-        before_fusions = fc.num_fusions()
-
-        fs1 = Fusion()
-        with FusionDefinition(fs1) as fd :
-            t0 = fd.define_tensor(3)
-            t1 = fd.define_tensor(3)
-            c0 = fd.define_constant(3.0)
-
-            t2 = fd.ops.add(t0, t1)
-            t3 = fd.ops.mul(t2, c0)
-            t4 = fd.ops.sum(t3, [-1], False, DataType.Float)
-
-            fd.add_output(t4)
-
-        # Expected Output is a tensor of 48's
-        nvf_out1 = fs1.execute([input1, input2])[0]
-
-        # Create a new fusion with the same definition, it should hit the cache!
-        fs2 = Fusion()
-        with FusionDefinition(fs2) as fd :
-            t0 = fd.define_tensor(3)
-            t1 = fd.define_tensor(3)
-            c0 = fd.define_constant(3.0)
-
-            t2 = fd.ops.add(t0, t1)
-            t3 = fd.ops.mul(t2, c0)
-            t4 = fd.ops.sum(t3, [-1], False, DataType.Float)
-
-            fd.add_output(t4)
-
-        nvf_out2 = fs2.execute([input1, input2])[0]
-
-        # Check there is still only 1 cache entry
-        fc = FusionCache.get()
-        self.assertEqual(fc.num_fusions() - before_fusions, 1)
-
-        # Create a fusion from a fusion id and make sure it executes!
-        fs3 = Fusion(fs2.id())
-        nvf_out3 = fs3.execute([input1, input2])[0]
-
-        eager_out = torch.sum((input1 + input2) * 3.0, dim=-1)
-        self.assertEqual(eager_out, nvf_out1)
-        self.assertEqual(eager_out, nvf_out2)
-        self.assertEqual(eager_out, nvf_out3)
-
-    def test_basic_fp16(self) :
-        fs = Fusion()
-        with FusionDefinition(fs) as fd :
-            t0 = fd.define_tensor(3, DataType.Half)
-            t1 = fd.define_tensor(3, DataType.Half)
-            c0 = fd.define_constant(3.0)
-
-            t2 = fd.ops.add(t0, t1)
-            t3 = fd.ops.mul(t2, c0)
-            t4 = fd.ops.sum(t3, [-1], False, DataType.Float)
-
-            t5 = fd.ops.cast(t4, DataType.Half)
-            fd.add_output(t5)
-
-        input1 = torch.ones(2, 4, 8, device='cuda', dtype=torch.float16)
-        input2 = torch.ones(2, 4, 8, device='cuda', dtype=torch.float16)
-
-        # Expected Output is a tensor of 48's
-        nvf_out = fs.execute([input1, input2])[0]
-        eager_out = torch.sum((input1 + input2) * 3.0, dim=-1)
-        self.assertEqual(eager_out, nvf_out)
-
-    def test_cast_double_to_half(self) :
-        fs = Fusion()
-        with FusionDefinition(fs) as fd :
-            t0 = fd.define_tensor(2, DataType.Double)
-            t1 = fd.define_tensor(2, DataType.Double)
-
-            t0h = fd.ops.cast(t0, DataType.Half)
-            t1h = fd.ops.cast(t1, DataType.Half)
-            t2 = fd.ops.add(t0h, t1h)
-            t3 = fd.ops.relu(t2)
-            t4 = fd.ops.cast(t3, DataType.Half)
-
-            fd.add_output(t4)
-
-        input1 = torch.randn(2, 4, device='cuda', dtype=torch.float64)
-        input2 = torch.randn(2, 4, device='cuda', dtype=torch.float64)
-
-        nvf_out = fs.execute([input1, input2])[0]
-        eager_out = torch.relu(input1.to(torch.half) + input2.to(torch.half))
-        self.assertEqual(eager_out, nvf_out)
-
-    def test_promote_to_double(self) :
-        fs = Fusion()
-
-        with FusionDefinition(fs) as fd :
-            t0 = fd.define_tensor(2, DataType.Half)
-            t1 = fd.define_tensor(2, DataType.Double)
-
-            t2 = fd.ops.add(t0, t1)
-            t5 = fd.ops.relu(t2)
-
-            fd.add_output(t5)
-
-        input1 = torch.randn(2, 4, device='cuda', dtype=torch.float16)
-        input2 = torch.randn(2, 4, device='cuda', dtype=torch.float64)
-
-        nvf_out = fs.execute([input1, input2])[0]
-        eager_out = torch.relu(input1 + input2)
-        self.assertEqual(eager_out, nvf_out)
-
-    def test_implicit_broadcast_input(self) :
-        fs = Fusion()
-        with FusionDefinition(fs) as fd :
-            t0 = fd.define_tensor(1)
-            t1 = fd.define_tensor(3)
-
-            t0_b = fd.ops.broadcast_in_dim(t0, [2, 3, 4], [1])
-            t2 = fd.ops.add(t0_b, t1)
-
-            fd.add_output(t2)
-
-        input1 = torch.randn(3, device='cuda')
-        input2 = torch.randn(2, 3, 4, device='cuda')
-
-        nvf_out = fs.execute([input1, input2])[0]
-        eager_out = refs.add(prims.broadcast_in_dim(input1, [2, 3, 4], [1]), input2)
-        self.assertEqual(eager_out, nvf_out)
-
-    def test_explicit_broadcast_input(self) :
-        input1 = torch.randn(1, 1, 4, device='cuda')
-        input2 = torch.randn(2, 3, 4, device='cuda')
-
-        fs = Fusion()
-        with FusionDefinition(fs) as fd :
-            t0 = fd.define_tensor(sizes=input1.size(), strides=input1.stride())
-            t1 = fd.define_tensor(sizes=input2.size(), strides=input2.stride())
-
-            t0_b = fd.ops.broadcast_in_dim(t0, [2, 3, 4], [0, 1, 2])
-            t2 = fd.ops.add(t0_b, t1)
-
-            fd.add_output(t2)
-
-        nvf_out = fs.execute([input1, input2])[0]
-        eager_out = refs.add(prims.broadcast_in_dim(input1, [2, 3, 4], [0, 1, 2]), input2)
-        self.assertEqual(eager_out, nvf_out)
-
-    def test_broadcast_mixing(self) :
-        fs = Fusion()
-        with FusionDefinition(fs) as fd :
-            t0 = fd.define_tensor([3, 1], [1, 1])
-            t1 = fd.define_tensor(1)
-
-            t1_b = fd.ops.broadcast_in_dim(t1, [3, 3], [0])
-            t2 = fd.ops.add(t0, t1_b)
-
-            fd.add_output(t2)
-
-        input1 = torch.randn(3, 1, device='cuda')
-        input2 = torch.randn(3, device='cuda')
-
-        nvf_out = fs.execute([input1, input2])[0]
-        eager_out = refs.add(input1, prims.broadcast_in_dim(input2, [3, 3], [0]))
-        self.assertEqual(eager_out, nvf_out)
-
-    def test_ops_broadcast(self) :
-        fs = Fusion()
-        with FusionDefinition(fs) as fd :
-            t0 = fd.define_tensor(1)
-            t1 = fd.define_tensor(3)
-
-            t0_b = fd.ops.broadcast(t0, [True, False, True])
-            t2 = fd.ops.add(t0_b, t1)
-
-            fd.add_output(t2)
-
-        input1 = torch.randn(3, device='cuda')
-        input2 = torch.randn(2, 3, 4, device='cuda')
-
-        nvf_out = fs.execute([input1, input2])[0]
-        eager_out = refs.add(prims.broadcast_in_dim(input1, [2, 3, 4], [1]), input2)
-        self.assertEqual(eager_out, nvf_out)
-
-    def test_prim_layer_norm_fwd(self) :
-        def primitive_definition(
-            inputs: torch.Tensor,
-            weight: torch.Tensor,
-            bias: torch.Tensor,
-            normalization_axis: int,
-            keepdim: bool,
-        ) -> torch.Tensor:
-            mean = inputs.mean(normalization_axis, keepdim=keepdim)
-            diff = inputs - mean
-            diff_sq = diff * diff
-            var = diff_sq.mean(normalization_axis, keepdim=keepdim)
-            pre_shift_scale_norm_output = (inputs - mean) / torch.sqrt(var + 1e-12)
-            norm_output = weight * pre_shift_scale_norm_output + bias
-            return norm_output
-
-        def nvfuser_fusion(
-            fd: FusionDefinition,
-            normalization_axis: int,
-            norm_size: int,
-            input_shape: List[int],
-            eps: float,
-            keepDim: bool
-        ) -> None :
-            inputs = fd.define_tensor(symbolic_sizes=[-1, -1, -1], contiguous=[True, True, True], dtype=DataType.Float)
-            weights = fd.define_tensor(symbolic_sizes=[-1], contiguous=[True], dtype=DataType.Float)
-            bias = fd.define_tensor(symbolic_sizes=[-1], contiguous=[True], dtype=DataType.Float)
-            sum0 = fd.ops.sum(inputs, axes=[normalization_axis], keepdim=keepDim)
-            norm_const = fd.define_constant(norm_size)
-            mean = fd.ops.div(sum0, norm_const)
-            diff = fd.ops.sub(inputs, mean)
-            diff_sq = fd.ops.mul(diff, diff)
-            sum1 = fd.ops.sum(diff_sq, axes=[normalization_axis], keepdim=keepDim)
-            var = fd.ops.div(sum1, norm_const)
-            eps_const = fd.define_constant(eps)
-            var_eps = fd.ops.add(var, eps_const)
-            invstd = fd.ops.rsqrt(var_eps)
-            pre_scale_bias = fd.ops.mul(diff, invstd)
-            weights_bcast = fd.ops.broadcast_in_dim(weights, output_shape=input_shape, broadcast_dims=[2])
-            scale = fd.ops.mul(pre_scale_bias, weights_bcast)
-            bias_bcast = fd.ops.broadcast_in_dim(bias, output_shape=input_shape, broadcast_dims=[2])
-            out = fd.ops.add(scale, bias_bcast)
-            fd.add_output(out)
-            fd.add_output(mean)
-            fd.add_output(invstd)
-
-        def nvfuser_fusion_var_mean(
-            fd: FusionDefinition,
-            normalization_axis: int,
-            norm_size: int,
-            input_shape: List[int],
-            eps: float,
-            keepDim: bool
-        ) -> None :
-            inputs = fd.define_tensor(symbolic_sizes=[-1, -1, -1], contiguous=[True, True, True], dtype=DataType.Float)
-            weights = fd.define_tensor(symbolic_sizes=[-1], contiguous=[True], dtype=DataType.Float)
-            bias = fd.define_tensor(symbolic_sizes=[-1], contiguous=[True], dtype=DataType.Float)
-            var, mean = fd.ops.var_mean(inputs, axes=[normalization_axis], correction=0, keepdim=keepDim)
-            eps_const = fd.define_constant(eps)
-            var_eps = fd.ops.add(var, eps_const)
-            invstd = fd.ops.rsqrt(var_eps)
-            diff = fd.ops.sub(inputs, mean)
-            pre_scale_bias = fd.ops.mul(diff, invstd)
-            weights_bcast = fd.ops.broadcast_in_dim(weights, output_shape=input_shape, broadcast_dims=[2])
-            scale = fd.ops.mul(pre_scale_bias, weights_bcast)
-            bias_bcast = fd.ops.broadcast_in_dim(bias, output_shape=input_shape, broadcast_dims=[2])
-            out = fd.ops.add(scale, bias_bcast)
-            fd.add_output(out)
-            fd.add_output(mean)
-            fd.add_output(invstd)
-
-        input_size = [64, 128, 1024]
-        dtype = torch.float32
-        device = 'cuda'
-        inputs = torch.randn(*input_size, device=device, requires_grad=True)
-        weights = torch.nn.Parameter(torch.randn(input_size[2], dtype=dtype, device=device))
-        biases = torch.nn.Parameter(torch.randn(input_size[2], dtype=dtype, device=device))
-        fc = FusionCache.get()
-        before_fusions = fc.num_fusions()
-
-        for _ in range(5) :
-            nvf_fusion = Fusion()
-            with FusionDefinition(nvf_fusion) as fd:
-                nvfuser_fusion(fd, 2, inputs.size()[2], inputs.size(), 1e-12, True)
-            nvf_out = nvf_fusion.execute([inputs, weights, biases])
-
-        for _ in range(5) :
-            nvf_var_mean_fusion = Fusion()
-            with FusionDefinition(nvf_var_mean_fusion) as fd:
-                nvfuser_fusion_var_mean(fd, 2, inputs.size()[2], inputs.size(), 1e-12, True)
-            nvf_var_mean_out = nvf_var_mean_fusion.execute([inputs, weights, biases])
-
-        for _ in range(5) :
-            eager_out = primitive_definition(inputs, weights, biases, 2, True)
-
-        self.assertEqual(eager_out, nvf_out[0])
-        self.assertEqual(eager_out, nvf_var_mean_out[0])
-        fusion_cache = FusionCache.get()
-        self.assertEqual(fc.num_fusions() - before_fusions, 2)
-
-    def test_prim_rms_norm_fwd(self) :
-        def primitive_definition(
-            inputs: torch.Tensor,
-            weight: torch.Tensor,
-            normalization_axis: int,
-            keepdim: bool,
-        ) -> torch.Tensor:
-            var = inputs.mul(inputs).mean(normalization_axis, keepdim)
-            pre_shift_scale_norm_output = inputs / torch.sqrt(var + 1e-12)
-            norm_output = weight * pre_shift_scale_norm_output
-            return norm_output
-
-        def nvfuser_fusion(
-            fd: FusionDefinition,
-            normalization_axis: int,
-            norm_size: int,
-            input_shape: List[int],
-            eps: float,
-            keepDim: bool
-        ) -> None :
-            inputs = fd.define_tensor(symbolic_sizes=[-1, -1, -1], contiguous=[True, True, True], dtype=DataType.Float)
-            weights = fd.define_tensor(symbolic_sizes=[-1], contiguous=[True], dtype=DataType.Float)
-            inputs_sq = fd.ops.mul(inputs, inputs)
-            sum0 = fd.ops.sum(inputs_sq, axes=[normalization_axis], keepdim=keepDim)
-            norm_const = fd.define_constant(norm_size)
-            var = fd.ops.div(sum0, norm_const)
-            eps_const = fd.define_constant(eps)
-            var_eps = fd.ops.add(var, eps_const)
-            invstd = fd.ops.rsqrt(var_eps)
-            pre_scale = fd.ops.mul(inputs, invstd)
-            weights_bcast = fd.ops.broadcast_in_dim(weights, output_shape=input_shape, broadcast_dims=[2])
-            out = fd.ops.mul(pre_scale, weights_bcast)
-            fd.add_output(out)
-            fd.add_output(invstd)
-
-        input_size = [64, 128, 1024]
-        dtype = torch.float32
-        device = 'cuda'
-        inputs = torch.randn(*input_size, device=device, requires_grad=True)
-        weights = torch.nn.Parameter(torch.randn(input_size[2], dtype=dtype, device=device))
-        fc = FusionCache.get()
-        before_fusions = fc.num_fusions()
-
-        for _ in range(5) :
-            nvf_fusion = Fusion()
-            with FusionDefinition(nvf_fusion) as fd:
-                nvfuser_fusion(fd, 2, inputs.size()[2], inputs.size(), 1e-12, True)
-            nvf_out = nvf_fusion.execute([inputs, weights])
-
-        for _ in range(5) :
-            eager_out = primitive_definition(inputs, weights, 2, True)
-
-        self.assertEqual(eager_out, nvf_out[0])
-        self.assertEqual(fc.num_fusions() - before_fusions, 1)
+try:
+    from _nvfuser.test_python_frontend import *  # noqa: F403,F401
+except ImportError:
+    def run_tests():
+        return
+    pass
 
 if __name__ == '__main__':
     run_tests()
diff --git a/test/test_ops.py b/test/test_ops.py
index 2280ccfee5bf..d40b625f93ea 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -19,7 +19,6 @@
     floating_and_complex_types_and,
     all_types_and_complex_and,
 )
-from test_proxy_tensor import xfail, skip, skipOps
 
 from torch.testing._internal.common_utils import (
     TestCase,
@@ -50,6 +49,9 @@
     ops_and_refs,
     python_ref_db,
     BinaryUfuncInfo,
+    xfail,
+    skip,
+    skipOps
 )
 from torch.testing._internal.common_device_type import (
     deviceCountAtLeast,
@@ -659,6 +661,7 @@ def test_noncontiguous_samples(self, device, dtype, op):
     # Cases test here:
     #   - out= with the correct dtype and device, but the wrong shape
     @ops(_ops_and_refs, dtypes=OpDTypes.none)
+    @skipIfTorchInductor("Inductor does not support complex dtype yet")
     def test_out_warning(self, device, op):
         # Prefers running in float32 but has a fallback for the first listed supported dtype
         supported_dtypes = op.supported_dtypes(self.device_type)
@@ -787,6 +790,7 @@ def _any_nonempty(out):
     #   - if device, dtype are NOT passed, any combination of dtype/device should be OK for out
     #   - if device, dtype are passed, device and dtype should match
     @ops(_ops_and_refs, dtypes=OpDTypes.any_one)
+    @skipIfTorchInductor("Inductor does not support complex dtype yet")
     def test_out(self, device, dtype, op):
         # Prefers running in float32 but has a fallback for the first listed supported dtype
         samples = op.sample_inputs(device, dtype)
@@ -973,6 +977,7 @@ def _case_four_transform(t):
     #   same values for the cross-product of op variants (method, inplace)
     #   against eager's gold standard op function variant
     @_variant_ops(op_db)
+    @skipIfTorchInductor("Inductor does not support complex dtype yet")
     def test_variant_consistency_eager(self, device, dtype, op):
         # Acquires variants (method variant, inplace variant, operator variant, inplace_operator variant, aliases)
 
@@ -1047,7 +1052,10 @@ def _test_consistency_helper(samples, variants):
                 if isinstance(
                     expected_forward, torch.Tensor
                 ) and dtype in op.supported_backward_dtypes(torch.device(device).type):
-                    output_process_fn_grad(expected_forward).sum().backward()
+                    out = output_process_fn_grad(expected_forward).sum()
+                    if out.dtype.is_complex:
+                        out = out.abs()
+                    out.backward()
                     expected_grad = tensor.grad
 
                 # Test eager consistency
@@ -1092,7 +1100,10 @@ def _test_consistency_helper(samples, variants):
                     if expected_grad is not None and (
                         variant not in inplace_ops or op.supports_inplace_autograd
                     ):
-                        output_process_fn_grad(variant_forward).sum().backward()
+                        out = output_process_fn_grad(variant_forward).sum()
+                        if out.dtype.is_complex:
+                            out = out.abs()
+                        out.backward()
                         self.assertEqual(expected_grad, tensor.grad)
 
         _test_consistency_helper(samples, variants)
@@ -1560,8 +1571,8 @@ def clone_and_perform_view(input, **kwargs):
                     if isinstance(sample.input, torch.Tensor)
                     else sample.input[0]
                 )
-                expected_forward.sum().backward(retain_graph=True)
-                forward_with_mathview.sum().backward(retain_graph=True)
+                expected_forward.sum().abs().backward(retain_graph=True)
+                forward_with_mathview.sum().abs().backward(retain_graph=True)
                 if tensor.grad is not None:
                     cloned1_tensor = (
                         cloned1 if isinstance(cloned1, torch.Tensor) else cloned1[0]
@@ -1716,12 +1727,13 @@ class TestRefsOpsInfo(TestCase):
     module_alls = [(path, import_module(f"torch.{path}").__all__) for path in import_paths]
     ref_ops_names = tuple(itertools.chain.from_iterable(
         [f"{path}.{op}" for op in module_all] for path, module_all in module_alls))
-    ref_db_names = set(ref_op.name for ref_op in python_ref_db)
+    ref_db_names = {ref_op.name for ref_op in python_ref_db}
 
     # TODO: References that do not have an entry in python_ref_db
     skip_ref_ops = {
         '_refs.bitwise_right_shift',
         '_refs.copy_to',
+        '_refs.empty_permuted',
         '_refs.empty_strided',
         '_refs.equal',
         '_refs.full',
@@ -1834,6 +1846,8 @@ class TestRefsOpsInfo(TestCase):
         '_refs.round',  # missing "decimals"
         '_refs.scalar_tensor',  # missing "layout"
         # other
+        '_refs.empty',  # intentional; direct empty is faster and has less guards
+        '_refs.empty_permuted',  # intentional; direct empty is faster and has less guards
         '_refs.expand_as',
         '_refs.as_strided',  # _prims._as_strided_meta: "reduce() of empty sequence with no initial value"
         '_refs.copy_to',  # torch._C._jit_get_operation: No such operator aten::copy_to
@@ -1851,7 +1865,9 @@ def test_refs_are_in_python_ref_db(self, op):
         elif inplace:
             self.assertNotIn(op, self.ref_db_names, msg=f"{op} is an in-place operation and should not have an OpInfo")
         else:
-            self.assertIn(op, self.ref_db_names)
+            # Intentionally don't use assertIn to avoid printing the
+            # (very large) container
+            self.assertTrue(op in self.ref_db_names, msg="{op} not in ref_db_names")
 
     @parametrize("op", ref_ops_names)
     def test_refs_are_in_decomp_table(self, op):
@@ -1897,7 +1913,7 @@ def test_refs_are_in_decomp_table(self, op):
     "to_sparse",  # Could not run 'aten::to_sparse' with arguments from the 'Meta' backend
     "tensor_split",  # The tensor has a non-zero number of elements, but its data is not allocated yet
     "repeat_interleave",  # cannot repeat_interleave a meta tensor without output_size
-    "segment_reduce.lengths",  # Could not run 'aten::segment_reduce' with arguments from the 'Meta' backend.
+    "_segment_reduce.lengths",  # Could not run 'aten::segment_reduce' with arguments from the 'Meta' backend.
     "sparse.sampled.addmm",  # sparsity not supported
     # Can not infer total number of classes from meta. no way at present to throw DynamicOutputShapeException
     "nn.functional.one_hot",
@@ -1907,9 +1923,7 @@ def test_refs_are_in_decomp_table(self, op):
 fake_autocast_device_skips = defaultdict(dict)
 
 # TODO: investigate/fix
-fake_autocast_device_skips["cpu"] = set(
-    ("linalg.pinv",)
-)
+fake_autocast_device_skips["cpu"] = {"linalg.pinv"}
 
 
 dynamic_output_op_tests = (
@@ -1981,7 +1995,7 @@ def test_refs_are_in_decomp_table(self, op):
 }
 
 fake_backward_xfails = {xfail(stride_skip) for stride_skip in fake_backward_xfails} | {
-    xfail("segment_reduce", "lengths"),
+    xfail("_segment_reduce", "lengths"),
     xfail("norm", "nuc"),
     xfail("linalg.norm", "subgradients_at_zero"),  # can accept vector inputs
     skip('nn.functional.ctc_loss'),
@@ -2007,7 +2021,7 @@ def _test_fake_helper(self, device, dtype, op, context):
         samples = op.sample_inputs(device, dtype, requires_grad=False)
         for sample in samples:
             try:
-                mode = FakeTensorMode(throw_on_data_dependent_ops=True)
+                mode = FakeTensorMode()
 
                 def map_to_fake(e):
                     if isinstance(e, torch.Tensor):
@@ -2093,7 +2107,7 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
 
         samples = op.sample_inputs(device, dtype, requires_grad=False)
         for sample in samples:
-            mode = FakeTensorMode(throw_on_data_dependent_ops=True)
+            mode = FakeTensorMode()
 
             def map_to_fake(e):
                 if isinstance(e, torch.Tensor):
diff --git a/test/test_optim.py b/test/test_optim.py
index aeb23eaa5f51..90fce9baa8c3 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -14,7 +14,6 @@
 import torch.nn.functional as F
 from torch.nn import Parameter
 from torch.optim import Adam, SGD, Optimizer
-from torch import sparse
 from torch.optim.lr_scheduler import (
     LambdaLR,
     MultiplicativeLR,
@@ -110,7 +109,7 @@ def eval(params, sparse_grad, w):
                 i = torch.LongTensor([[1, 1]])
                 y = grad[1]
                 v = torch.tensor([y - y / 4.0, y / 4.0])
-            x = sparse.DoubleTensor(i, v, torch.Size([2])).to(dtype=v.dtype)
+            x = torch.sparse_coo_tensor(i, v, (2,), dtype=v.dtype)
             with torch.no_grad():
                 if sparse_grad:
                     params.grad = x
@@ -146,8 +145,8 @@ def _test_basic_cases_template(
         constructor_accepts_maximize=True,
         constructor_accepts_foreach=False,
     ):
-        maximize_options = set([False, constructor_accepts_maximize])
-        foreach_options = set([False, constructor_accepts_foreach])
+        maximize_options = {False, constructor_accepts_maximize}
+        foreach_options = {False, constructor_accepts_foreach}
 
         four_arg_constructor = constructor
         if constructor_accepts_maximize and constructor_accepts_foreach:
@@ -318,7 +317,7 @@ def fn_base(optimizer, weight, bias):
 
         # validate deepcopy() copies all public attributes
         def getPublicAttr(obj):
-            return set(k for k in obj.__dict__ if not k.startswith("_"))
+            return {k for k in obj.__dict__ if not k.startswith("_")}
 
         self.assertEqual(getPublicAttr(optimizer), getPublicAttr(deepcopy(optimizer)))
 
@@ -347,8 +346,8 @@ def make_two_arg_constructor(
             return constructor
 
         for maximize, foreach in itertools.product(
-            set([False, constructor_accepts_maximize]),
-            set([False, constructor_accepts_foreach]),
+            {False, constructor_accepts_maximize},
+            {False, constructor_accepts_foreach},
         ):
             self._test_state_dict(
                 torch.randn(10, 5),
@@ -431,8 +430,8 @@ def _test_complex_2d(self, optimizer_constructor, f=None):
             optim1.zero_grad()
             optim2.zero_grad()
             a2 = torch.complex(a1_real, a1_imag)
-            f(a1).backward()
-            f(a2).backward()
+            f(a1).abs().backward()
+            f(a2).abs().backward()
 
             self.assertEqual(a1.grad.real, a1_real.grad)
             self.assertEqual(a1.grad.imag, a1_imag.grad)
@@ -647,7 +646,6 @@ def test_sgd_complex(self):
                 )
             )
 
-
     def _test_derived_optimizers_varying_tensors(self, optimizer_with_kwargs, kwarg):
         if not torch.cuda.is_available():
             return
@@ -717,7 +715,6 @@ def _test_derived_optimizers_varying_tensors(self, optimizer_with_kwargs, kwarg)
                         actual = actual[0]
                     self.assertEqual(st_p_state[k], actual)
 
-
     def _test_derived_optimizers(self, optimizer_pairs_with_flags, flag):
         if not torch.cuda.is_available():
             return
@@ -730,7 +727,7 @@ def _test_derived_optimizers(self, optimizer_pairs_with_flags, flag):
         device = "cuda"
         for optimizer_constructor, params in optimizer_pairs_with_flags:
             res, state = [], []
-            for enabled in (False, True):
+            for flag_value in (False, True):
                 input = torch.tensor(
                     [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype=torch.float64, device=device
                 ).reshape(3, 2)
@@ -744,20 +741,20 @@ def _test_derived_optimizers(self, optimizer_pairs_with_flags, flag):
                 )
                 model.to(dtype=torch.float64, device=device)
                 params_with_flags = deepcopy(params)
-                params_with_flags[flag] = enabled
+                params_with_flags[flag] = flag_value
 
                 optimizer = optimizer_constructor(
                     model.parameters(), **params_with_flags
                 )
 
-                for _ in range(kIterations):
+                for i in range(kIterations):
                     optimizer.zero_grad()
                     output = model(input)
                     loss = output.sum()
                     loss.backward()
 
                     # Test that step behaves as expected (a no-op) when grads are set to None
-                    if iter == 0:
+                    if i == 0:
                         optimizer.zero_grad(set_to_none=True)
 
                     optimizer.step()
@@ -872,22 +869,28 @@ def test_multi_tensor_optimizers_with_varying_tensors(self):
         self._test_derived_optimizers_varying_tensors(optimizer_pairs_with_flags, "foreach")
 
     def test_fused_optimizers(self):
-        optimizer_pairs_with_flags = [
-            (optim.Adam, dict(weight_decay=1.0, amsgrad=False)),
-            (optim.Adam, dict(weight_decay=1.0, amsgrad=True)),
-            (optim.Adam, dict(weight_decay=0.0, amsgrad=False)),
-            (optim.Adam, dict(weight_decay=0.0, amsgrad=True)),
-        ]
+        optimizer_pairs_with_flags = tuple(itertools.product(
+            (optim.Adam, optim.AdamW),
+            (
+                dict(weight_decay=1., amsgrad=False),
+                dict(weight_decay=1., amsgrad=True),
+                dict(weight_decay=0., amsgrad=False),
+                dict(weight_decay=0., amsgrad=True),
+            ),
+        ))
         self._test_derived_optimizers(optimizer_pairs_with_flags, "fused")
 
     @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
     def test_fused_optimizers_with_varying_tensors(self):
-        optimizer_pairs_with_flags = [
-            (optim.Adam, dict(weight_decay=1.0, amsgrad=False)),
-            (optim.Adam, dict(weight_decay=1.0, amsgrad=True)),
-            (optim.Adam, dict(weight_decay=0.0, amsgrad=False)),
-            (optim.Adam, dict(weight_decay=0.0, amsgrad=True)),
-        ]
+        optimizer_pairs_with_flags = tuple(itertools.product(
+            (optim.Adam, optim.AdamW),
+            (
+                dict(weight_decay=1., amsgrad=False),
+                dict(weight_decay=1., amsgrad=True),
+                dict(weight_decay=0., amsgrad=False),
+                dict(weight_decay=0., amsgrad=True),
+            ),
+        ))
         self._test_derived_optimizers_varying_tensors(optimizer_pairs_with_flags, "fused")
 
     def test_adam(self):
@@ -1605,35 +1608,21 @@ def test_no_grad_for_all_params(self):
             opt.step()
 
     # make sure that `state_steps` is correctly either updated or not updated when `found_inf`.
-    def test_functional_fused_adam_with_foundinf(self):
+    def test_functional_fused_optimizer_with_foundinf(self):
         if not torch.cuda.is_available():
             self.skipTest("CUDA is required.")
 
-        from torch.optim import adam
+        from torch.optim import adam, adamw
 
         num_tensors = 5
-        for amsgrad in (False, True):
-            params, grads, exp_avgs, exp_avg_sqs = [
-                [torch.ones((1,), device="cuda") for _ in range(num_tensors)]
-                for _ in range(4)
-            ]
-            max_exp_avg_sqs = (
-                [torch.ones((1,), device="cuda") for _ in range(num_tensors)]
-                if amsgrad
-                else []
-            )
-            state_steps = [
-                torch.ones((1,), dtype=torch.float32, device="cuda")
-                for _ in range(num_tensors)
-            ]
-            grad_scale = torch.cuda.amp.grad_scaler._MultiDeviceReplicator(
-                torch.ones((1,), dtype=torch.float32, device="cuda")
-            )
-            found_inf = torch.cuda.amp.grad_scaler._MultiDeviceReplicator(
-                torch.ones((1,), dtype=torch.float32, device="cuda")
-            )
-
-            adam.adam(
+        for functional_optim, amsgrad in itertools.product((adam.adam, adamw.adamw), (False, True)):
+            params, grads, exp_avgs, exp_avg_sqs = [[torch.ones((1,), device="cuda") for _ in range(num_tensors)] for _ in range(4)]
+            max_exp_avg_sqs = [torch.ones((1,), device="cuda") for _ in range(num_tensors)] if amsgrad else []
+            state_steps = [torch.ones((1,), dtype=torch.float32, device="cuda") for _ in range(num_tensors)]
+            grad_scale = torch.ones((1,), dtype=torch.float32, device="cuda")
+            found_inf = torch.ones((1,), dtype=torch.float32, device="cuda")
+
+            functional_optim(
                 params,
                 grads,
                 exp_avgs,
@@ -1791,10 +1780,19 @@ def local_post_hook(opt: Optimizer, args: Tuple[Any], kwargs: Dict[Any, Any]):
         opt2.step()
         self.assertListEqual(data, [0, 1, 2, 5, 0, 1, 2, 5, 0, 1, 2, 5])
 
+    def test_fused_optimizer_raises(self):
+        if not torch.cuda.is_available():
+            self.skipTest("Requires CUDA devices")
+        for optimizer_ctor in (torch.optim.Adam, torch.optim.AdamW):
+            with self.assertRaisesRegex(RuntimeError, "`fused` and `foreach` cannot be `True` together."):
+                optimizer_ctor([torch.empty((), device="cuda")], foreach=True, fused=True)
+            with self.assertRaisesRegex(RuntimeError, "`fused` does not support `differentiable`"):
+                optimizer_ctor([torch.empty((), device="cuda")], differentiable=True, fused=True)
+
 
 class SchedulerTestNet(torch.nn.Module):
     def __init__(self):
-        super(SchedulerTestNet, self).__init__()
+        super().__init__()
         self.conv1 = torch.nn.Conv2d(1, 1, 1)
         self.conv2 = torch.nn.Conv2d(1, 1, 1)
 
@@ -1820,7 +1818,7 @@ class TestLRScheduler(TestCase):
     exact_dtype = True
 
     def setUp(self):
-        super(TestLRScheduler, self).setUp()
+        super().setUp()
         self.net = SchedulerTestNet()
         self.opt = SGD(
             [
@@ -1873,23 +1871,10 @@ def test_no_cyclic_references(self):
         scheduler = LambdaLR(optim, lambda epoch: 1.0)
         del scheduler
 
-        # Prior to Python 3.7, local variables in a function will be referred by the current frame.
-        import sys
-
-        if sys.version_info < (3, 7):
-            import inspect
-
-            referrers = gc.get_referrers(optim)
-            self.assertTrue(
-                len(referrers) == 1 and referrers[0] is inspect.currentframe(),
-                "Optimizer should contain no cyclic references (except current frame)",
-            )
-            del referrers
-        else:
-            self.assertTrue(
-                len(gc.get_referrers(optim)) == 0,
-                "Optimizer should contain no cyclic references",
-            )
+        self.assertTrue(
+            len(gc.get_referrers(optim)) == 0,
+            "Optimizer should contain no cyclic references",
+        )
 
         gc.collect()
         del optim
@@ -3857,9 +3842,7 @@ def _test_against_closed_form(self, scheduler, closed_form_scheduler, epochs=10)
     def _test_reduce_lr_on_plateau(
         self, schedulers, targets, metrics, epochs=10, verbose=False
     ):
-        if isinstance(schedulers, LRScheduler) or isinstance(
-            schedulers, ReduceLROnPlateau
-        ):
+        if isinstance(schedulers, (LRScheduler, ReduceLROnPlateau)):
             schedulers = [schedulers]
         for epoch in range(epochs):
             self.opt.step()
@@ -3984,7 +3967,7 @@ def test_cosine_then_cyclic(self):
 
 class SWATestDNN(torch.nn.Module):
     def __init__(self, input_features):
-        super(SWATestDNN, self).__init__()
+        super().__init__()
         self.n_features = 100
         self.fc1 = torch.nn.Linear(input_features, self.n_features)
         self.bn = torch.nn.BatchNorm1d(self.n_features)
@@ -4000,7 +3983,7 @@ def forward(self, x):
 
 class SWATestCNN(torch.nn.Module):
     def __init__(self, input_channels):
-        super(SWATestCNN, self).__init__()
+        super().__init__()
         self.n_features = 10
         self.conv1 = torch.nn.Conv2d(
             input_channels, self.n_features, kernel_size=3, padding=1
diff --git a/test/test_overrides.py b/test/test_overrides.py
index 66e5a181a60c..7671962e8954 100644
--- a/test/test_overrides.py
+++ b/test/test_overrides.py
@@ -84,7 +84,7 @@ def decorator(func):
         return func
     return decorator
 
-class DiagonalTensor(object):
+class DiagonalTensor:
     """A class with __torch_function__ and a specific diagonal representation
 
     This class has limited utility and is mostly useful for verifying that the
@@ -358,7 +358,7 @@ def generate_tensor_like_torch_implementations():
 
 generate_tensor_like_torch_implementations()
 
-class TensorLike(object):
+class TensorLike:
     """A class that overrides the full torch API
 
     This class is used to explicitly test that the full torch.tensor API
diff --git a/test/test_prims.py b/test/test_prims.py
index 4411eb2e6af0..f5367d527307 100644
--- a/test/test_prims.py
+++ b/test/test_prims.py
@@ -140,12 +140,46 @@ def test_cbrt_prim(self, device, dtype):
 
                 self.assertEqual(y, y_np, exact_device=False)
 
+    @dtypes(torch.float32)
+    def test_collapse(self, device, dtype):
+        t = torch.rand(2, 2, 2)
+        dim_ranges = [(0, 0), (0, 1), (1, 2), (0, 2)]
+        expected_shapes = [(2, 2, 2), (4, 2), (2, 4), (8,)]
+
+        for (start, end), shape in zip(dim_ranges, expected_shapes):
+            expect = t.reshape(shape)
+
+            copy = prims.collapse(t, start, end)
+            self.assertEqual(copy, expect)
+            self.assertFalse(copy._is_view())
+
+            view = prims.collapse_view(t, start, end)
+            self.assertEqual(view, expect)
+            self.assertTrue(view._is_view())
+
+        t_discontig = t.transpose(0, 1)
+        with self.assertRaises(ValueError, msg="no such view exists"):
+            view = prims.collapse_view(t_discontig, 0, 2)
+
+        copy = prims.collapse(t_discontig, 0, 1)
+        self.assertEqual(copy, t_discontig.reshape(4, 2))
+
+        error_dims = [(-1, 1), (0, 3), (1, -1)]
+        for start, end in error_dims:
+            for fn in [prims.collapse, prims.collapse_view]:
+                with self.assertRaises(AssertionError):
+                    fn(t, start, end)
+
     @onlyCUDA
     def test_nvfuser_impl_is_used(self, device):
         # This test is to ensure that when the nvfuser implementation exists it is used
         # Assuming one-to-one mapping between prims and nvfuser implementations
         # This test is not intended to test the correctness of the nvfuser implementation
-        from torch._C._nvfuser import FusionDefinition as fd
+        try:
+            from nvfuser import FusionDefinition as fd
+        except ImportError:
+            from nvfuser._C import FusionDefinition as fd
+
 
         prim_nvfuser_ops = set(torch._prims.__all__).intersection(dir(fd.ops))
         ops_without_nvfuser_impl = {
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 834a6854178a..88f4aa6d782f 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1,18 +1,20 @@
 # Owner(s): ["module: ProxyTensor"]
 
-from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS, xfail_inherited_tests
+from torch.testing._internal.common_utils import TestCase, run_tests, xfail_inherited_tests
 import torch
 import unittest
 import warnings
 import operator
 from collections.abc import Iterable
 from torch.testing._internal.common_device_type import instantiate_device_type_tests
-from torch.testing._internal.common_methods_invocations import DecorateInfo
-from torch.testing._internal.common_methods_invocations import op_db, wrapper_set_seed
+from torch.testing._internal.common_methods_invocations import op_db, wrapper_set_seed, skip, xfail, skipOps
 from torch._subclasses.fake_tensor import DynamicOutputShapeException, DataDependentOutputException
 
 from torch._decomp import decomposition_table
-from torch.fx.experimental.symbolic_shapes import sym_float, eval_guards, bind_symbols, fx_placeholder_vals
+from torch.fx.experimental.symbolic_shapes import (
+    sym_float, eval_guards, bind_symbols, fx_placeholder_vals, fx_placeholder_targets,
+    constrain_range, guard_int, GuardOnDataDependentSymNode
+)
 from torch.testing._internal.common_device_type import ops
 from torch._C import _disabled_torch_function_impl
 from torch.fx.experimental.proxy_tensor import make_fx, DecompositionInterpreter, get_isolated_graphmodule
@@ -25,16 +27,23 @@
 
 aten = torch.ops.aten
 
-try:
-    import sympy  # noqa: F401
-    # TODO(jansel): these tests fail on windows
-    HAS_SYMPY = not IS_WINDOWS
-except ImportError:
-    HAS_SYMPY = False
-skipIfNoSympy = unittest.skipIf(not HAS_SYMPY, "no sympy")
 HAS_CUDA = torch.cuda.is_available()
 
 
+def strip_end(s, suffix):
+    if suffix and s.endswith(suffix):
+        return s[:-len(suffix)]
+    else:
+        return s
+
+
+def show_guards(gm):
+    names = [strip_end(n, "_1") for n in fx_placeholder_targets(gm)]
+    return "\n".join(
+        gm.shape_env.produce_guards(fx_placeholder_vals(gm), names, _simplified=True)
+    )
+
+
 def process_failures():
     """
     Takes file containing failures like
@@ -69,42 +78,6 @@ def create_normalized_name(op):
     print("}")
 
 
-# Copied from functorch
-def xfail(op_name, variant_name='', *, device_type=None, dtypes=None):
-    return (op_name, variant_name, device_type, dtypes, True)
-
-
-def skip(op_name, variant_name='', *, device_type=None, dtypes=None):
-    return (op_name, variant_name, device_type, dtypes, False)
-
-
-def skipOps(test_case_name, base_test_name, to_skip):
-    all_opinfos = op_db
-    for xfail in to_skip:
-        op_name, variant_name, device_type, dtypes, expected_failure = xfail
-        matching_opinfos = [o for o in all_opinfos
-                            if o.name == op_name and o.variant_test_name == variant_name]
-        assert len(matching_opinfos) >= 1, f"Couldn't find OpInfo for {xfail}"
-        for opinfo in matching_opinfos:
-            decorators = list(opinfo.decorators)
-            if expected_failure:
-                decorator = DecorateInfo(unittest.expectedFailure,
-                                         test_case_name, base_test_name,
-                                         device_type=device_type, dtypes=dtypes)
-                decorators.append(decorator)
-            else:
-                decorator = DecorateInfo(unittest.skip("Skipped!"),
-                                         test_case_name, base_test_name,
-                                         device_type=device_type, dtypes=dtypes)
-                decorators.append(decorator)
-            opinfo.decorators = tuple(decorators)
-
-    # This decorator doesn't modify fn in any way
-    def wrapped(fn):
-        return fn
-    return wrapped
-
-
 USE_TORCHVISION = False
 try:
     import torchvision
@@ -229,7 +202,7 @@ def f2(x):
         self.assertTrue(is_any_digamma(traced))
 
         # Verify nested make_fx calls don't make factory functions to be leaked
-        # into the outer graph
+        # into the outer graph. Verify that `make_fx`` itself does not leak its execution.
         def f2(x):
             gm = make_fx(f1)(x)
             self.assertFalse(is_any_sum(gm))
@@ -238,6 +211,20 @@ def f2(x):
 
         traced = make_fx(f2)(torch.randn(3))
         self.assertFalse(is_any_sum(traced))
+        self.assertFalse(is_any_sigmoid(traced))
+        self.assertTrue(is_any_digamma(traced))
+
+        # Verify that the `forward`` function of a graph module produced as a
+        # side effect of an interior `make_fx` is still traced
+        def f3(x):
+            gm = make_fx(f1)(x)
+            self.assertFalse(is_any_sum(gm))
+            self.assertTrue(is_any_sigmoid(gm))
+            # `gm.forward`` is still traced
+            return torch.digamma(gm(x))
+
+        traced = make_fx(f3)(torch.randn(3))
+        self.assertFalse(is_any_sum(traced))
         self.assertTrue(is_any_sigmoid(traced))
         self.assertTrue(is_any_digamma(traced))
 
@@ -433,7 +420,7 @@ def test_f():
                 torch.zeros(3), torch.zeros(3)
             )
 
-        if self.tracing_mode == "symbolic":
+        if self.tracing_mode != "real":
             self.assertRaises(DataDependentOutputException, test_f)
         else:
             self.assertRaisesRegex(RuntimeError, "data-dependent", test_f)
@@ -464,10 +451,13 @@ def f():
             blowup = val.repeat(1000)
             return bool(blowup.sum().item() == 2)
 
-        self.assertRaisesRegex(
-            RuntimeError, "data-dependent",
-            lambda: make_fx(f, tracing_mode=self.tracing_mode)()
-        )
+        def test_f():
+            make_fx(f, tracing_mode=self.tracing_mode)()
+
+        if self.tracing_mode == "fake":
+            self.assertRaises(DataDependentOutputException, test_f)
+        else:
+            self.assertRaisesRegex(RuntimeError, "data-dependent", test_f)
 
     def test_constant_random(self):
         def f():
@@ -475,10 +465,13 @@ def f():
             val.normal_()
             return bool(val.item() == 2.1)
 
-        self.assertRaisesRegex(
-            RuntimeError, "data-dependent",
-            lambda: make_fx(f, tracing_mode=self.tracing_mode)()
-        )
+        def test_f():
+            make_fx(f, tracing_mode=self.tracing_mode)()
+
+        if self.tracing_mode == "fake":
+            self.assertRaises(DataDependentOutputException, test_f)
+        else:
+            self.assertRaisesRegex(RuntimeError, "data-dependent", test_f)
 
     def test_decomposition_interpreter(self):
         def fn(x):
@@ -559,7 +552,7 @@ def forward(mod_self, x):  # noqa: B902
 
 
         gm = make_fx(Emformer())(torch.randn(16, 1, 256))
-        ops = set([n.target for n in gm.graph.nodes if n.op == 'call_function'])
+        ops = {n.target for n in gm.graph.nodes if n.op == 'call_function'}
         self.assertEqual(len(ops), 2)
 
 
@@ -699,10 +692,8 @@ class TestGenericProxyTensorFake(TestGenericProxyTensor):
     tracing_mode = "fake"
 
 
-@skipIfNoSympy
 @xfail_inherited_tests([
     "test_make_fx_overloads",
-    "test_trace_subclasses",
 ])
 class TestGenericProxyTensorSymbolic(TestGenericProxyTensor):
     tracing_mode = "symbolic"
@@ -776,7 +767,6 @@ def _trace(f, *args):
     return make_fx(f, tracing_mode="symbolic")(*inps)
 
 # TODO: Need to test the guards themselves specifically as well
-@skipIfNoSympy
 class TestSymbolicTracing(TestCase):
     def _test_dynamic(self, fn, trace_inputs, test_inputs, assert_eq=True):
         """
@@ -852,9 +842,24 @@ def f(x):
         self.assertTrue(eval_guards(gm, torch.randn(4, 5)))
         self.assertEqual(repr(bind_symbols(gm, torch.randn(4, 5))), "{s0: 4, s1: 5}")
         self.assertFalse(eval_guards(gm, torch.randn(25, 5)))
-        # TODO: There should eventually be guards for contiguity, but they're
-        # not currently being done yet
-        assert len(gm.shape_env.guards) == 1, "\n" + gm.shape_env.format_guards()
+        self.assertExpectedInline(show_guards(gm), """x.size()[0] < 20""")
+
+    @unittest.skipIf(not HAS_CUDA, 'CUDA-only test')
+    def test_cpu_scalar_cuda(self):
+        # Extracted from wave2vec2
+        def f(a, b):
+            return (a * b) @ b
+
+        r = str(
+            make_fx(f, tracing_mode="symbolic")(
+                torch.tensor(1.0), torch.randn(2, 2, device='cuda')
+            ).code
+        ).strip()
+        self.assertExpectedInline(r, """\
+def forward(self, a_1, b_1):
+    mul = torch.ops.aten.mul.Tensor(a_1, b_1);  a_1 = None
+    mm = torch.ops.aten.mm.default(mul, b_1);  mul = b_1 = None
+    return mm""")
 
     def test_binary_broadcast(self):
         def f(a, b):
@@ -891,6 +896,123 @@ def forward(self, a_1):
     mul = torch.ops.aten.mul.Tensor(a_1, _local_scalar_dense);  a_1 = _local_scalar_dense = None
     return mul""")
 
+    def test_item_to_constructor(self):
+        def f(a):
+            r = a.item()
+            constrain_range(r, min=0)
+            return torch.empty(r)
+
+        r = str(make_fx(f, tracing_mode="symbolic")(torch.randint(5, (1,))).code).strip()
+        self.assertExpectedInline(
+            r, """\
+def forward(self, a_1):
+    _local_scalar_dense = torch.ops.aten._local_scalar_dense.default(a_1);  a_1 = None
+    empty = torch.ops.aten.empty.memory_format([_local_scalar_dense], device = device(type='cpu'), pin_memory = False);  _local_scalar_dense = None
+    return empty"""  # noqa: B950
+        )
+
+    def test_dynamic_pointwise_scalar(self):
+        def f(gravity, mask):
+            gravity[mask, 0] = gravity[mask, 0] * -1
+
+        r = str(make_fx(f, tracing_mode="symbolic")(
+            torch.randn((12, 4)),
+            torch.randint(0, 2, (12,), dtype=torch.bool)
+        ).code).strip()
+        self.assertExpectedInline(r, """\
+def forward(self, gravity_1, mask_1):
+    select = torch.ops.aten.select.int(gravity_1, 1, 0)
+    index = torch.ops.aten.index.Tensor(select, [mask_1]);  select = None
+    mul = torch.ops.aten.mul.Tensor(index, -1);  index = None
+    select_1 = torch.ops.aten.select.int(gravity_1, 1, 0);  gravity_1 = None
+    index_put_ = torch.ops.aten.index_put_.default(select_1, [mask_1], mul);  select_1 = mask_1 = mul = None
+    return None""")
+
+    def test_reflect_r_over_x(self):
+        def reflect_R_over_x(R):
+            reflect = torch.eye(3, device=R.device)
+            reflect[0, 0] = -1
+            return reflect @ R @ reflect
+
+        def f(crop_camera, mask):
+            crop_camera[mask] = reflect_R_over_x(crop_camera[mask])
+
+        r = str(make_fx(f, tracing_mode="symbolic")(
+            torch.randn((12, 3, 3)),
+            torch.randint(0, 2, (12,), dtype=torch.bool)
+        ).code).strip()
+        self.assertExpectedInline(r, """\
+def forward(self, crop_camera_1, mask_1):
+    index = torch.ops.aten.index.Tensor(crop_camera_1, [mask_1])
+    eye = torch.ops.aten.eye.default(3, device = device(type='cpu'), pin_memory = False)
+    _tensor_constant0 = self._tensor_constant0
+    lift_fresh_copy = torch.ops.aten.lift_fresh_copy.default(_tensor_constant0);  _tensor_constant0 = None
+    select = torch.ops.aten.select.int(eye, 0, 0)
+    select_1 = torch.ops.aten.select.int(select, 0, 0);  select = None
+    copy_ = torch.ops.aten.copy_.default(select_1, lift_fresh_copy);  select_1 = lift_fresh_copy = None
+    transpose = torch.ops.aten.transpose.int(index, -2, -1)
+    t = torch.ops.aten.t.default(eye)
+    clone = torch.ops.aten.clone.default(transpose, memory_format = torch.contiguous_format);  transpose = None
+    sym_size = torch.ops.aten.sym_size(index, 0);  index = None
+    sym_size_1 = torch.ops.aten.sym_size(crop_camera_1, 2)
+    mul = sym_size * sym_size_1
+    sym_size_2 = torch.ops.aten.sym_size(crop_camera_1, 1)
+    _unsafe_view = torch.ops.aten._unsafe_view.default(clone, [mul, sym_size_2]);  clone = mul = sym_size_2 = None
+    mm = torch.ops.aten.mm.default(_unsafe_view, t);  _unsafe_view = t = None
+    view = torch.ops.aten.view.default(mm, [sym_size, sym_size_1, 3]);  mm = sym_size_1 = None
+    transpose_1 = torch.ops.aten.transpose.int(view, -2, -1)
+    clone_1 = torch.ops.aten.clone.default(transpose_1, memory_format = torch.contiguous_format);  transpose_1 = None
+    mul_1 = sym_size * 3
+    sym_size_3 = torch.ops.aten.sym_size(view, 1);  view = None
+    view_1 = torch.ops.aten.view.default(clone_1, [mul_1, sym_size_3]);  clone_1 = mul_1 = sym_size_3 = None
+    mm_1 = torch.ops.aten.mm.default(view_1, eye);  view_1 = eye = None
+    view_2 = torch.ops.aten.view.default(mm_1, [sym_size, 3, 3]);  mm_1 = sym_size = None
+    index_put_ = torch.ops.aten.index_put_.default(crop_camera_1, [mask_1], view_2);  crop_camera_1 = mask_1 = view_2 = None
+    return None""")
+
+    @unittest.skipIf(not USE_TORCHVISION, "test requires torchvision")
+    @unittest.expectedFailure
+    def test_unbacked_batch_resnet(self):
+        mod = torchvision.models.resnet18()
+
+        def f(x, mask, params, buffers):
+            for p in itertools.chain([x, mask], params.values(), buffers.values()):
+                for s in p.shape:
+                    guard_int(s)
+            x = x[mask]
+            constrain_range(x.shape[0], min=1)
+            for p in params.values():
+                p.grad = None
+            return torch.func.functional_call(mod, {**params, **buffers}, (x,)).sum()
+
+        make_fx(f, tracing_mode="symbolic")(
+            torch.randn(3, 3, 250, 250),
+            torch.randint(0, 2, (3,), dtype=torch.bool),
+            dict(mod.named_parameters()),
+            dict(mod.named_buffers()),
+        )
+
+    def test_boolean_index(self):
+        def f(images, handedness, valid):
+            images = images[valid]
+            handedness = handedness[valid]
+            right_hand_mask = handedness == 1
+            images[right_hand_mask] = images[right_hand_mask].flip(-1)
+
+        r = str(make_fx(f, tracing_mode="symbolic")(
+            torch.randint(0, 256, (512, 1, 96, 96)),
+            torch.randint(0, 1, (512,)),
+            torch.randint(0, 2, (512,), dtype=torch.bool)
+        ).code).strip()
+        self.assertExpectedInline(r, """\
+def forward(self, images_1, handedness_1, valid_1):
+    index = torch.ops.aten.index.Tensor(images_1, [valid_1]);  images_1 = None
+    index_1 = torch.ops.aten.index.Tensor(handedness_1, [valid_1]);  handedness_1 = valid_1 = None
+    eq = torch.ops.aten.eq.Scalar(index_1, 1);  index_1 = None
+    index_2 = torch.ops.aten.index.Tensor(index, [eq])
+    flip = torch.ops.aten.flip.default(index_2, [-1]);  index_2 = None
+    index_put_ = torch.ops.aten.index_put_.default(index, [eq], flip);  index = eq = flip = None
+    return None""")
 
     def test_neg_shape(self):
         def f(a):
@@ -905,6 +1027,27 @@ def forward(self, a_1):
     empty = torch.ops.aten.empty.memory_format([add], device = device(type='cpu'), pin_memory = False);  add = None
     return empty""")
 
+    def test_invalidate_nonzero(self):
+        ok = False
+
+        def f(a):
+            nonlocal ok
+            b = a.clone()
+            x = b.nonzero()
+            x1 = b.nonzero()
+            x2 = b.nonzero()
+            assert x1.shape[0] == x2.shape[0]
+            ok = True
+            b.normal_()
+            y = b.nonzero()
+            try:
+                bool(x1.shape[0] == y.shape[0])
+                self.fail("didn't raise exception")
+            except GuardOnDataDependentSymNode:
+                pass
+
+        make_fx(f, tracing_mode="symbolic")(torch.randn(4))
+
     def test_sqrt_size(self):
         def f(a):
             return a / a.size(-1) ** 0.5
@@ -951,7 +1094,7 @@ def f(a, b):
         gm = self._test_dynamic(f, [(1, 6), (8, 1)], test_inputs)
         self.assertTrue(eval_guards(gm, torch.randn(1, 10), torch.randn(6, 1)))
         self.assertFalse(eval_guards(gm, torch.randn(1, 2), torch.randn(4, 1)))
-        assert len(gm.shape_env.guards) == 1
+        self.assertExpectedInline(show_guards(gm), """2*a.size()[1]*b.size()[0] > 20""")
 
     def test_new_empty(self):
         def f(a, b):
@@ -1044,8 +1187,8 @@ def f(a, b):
         fx_g = make_fx(f, tracing_mode="symbolic")(torch.randn(16), torch.randn(8))
         from torch._dynamo.source import LocalSource
         self.assertExpectedInline(
-            fx_g.shape_env.codegen_guards(fx_placeholder_vals(fx_g), [LocalSource("a"), LocalSource("b")]),
-            """a.size()[0] == 2*b.size()[0] and a.stride()[0] == 1 and a.storage_offset() == 0 and b.stride()[0] == 1 and b.storage_offset() == 0 and b.size()[0] != 0 and b.size()[0] != 1"""  # noqa: B950
+            str(fx_g.shape_env.produce_guards(fx_placeholder_vals(fx_g), [LocalSource("a"), LocalSource("b")])),
+            """['a.size()[0] == 2*b.size()[0]', 'a.stride()[0] == 1', 'a.storage_offset() == 0', 'b.stride()[0] == 1', 'b.storage_offset() == 0', '2 <= b.size()[0]']"""  # noqa: B950
         )
 
     def test_sym_storage_offset(self):
@@ -1119,7 +1262,7 @@ def f(a, b, c, d, e):
             return final_vals
 
         fx_g = _trace(f, 2, 4, 8, 16, 32)
-        self._assert_no_guards(fx_g, 1)
+        self.assertExpectedInline(show_guards(fx_g), """""")
 
 
 
@@ -1133,6 +1276,7 @@ def f(a, b, c, d, e):
     skip('new_empty'),
     skip('empty_like'),
     skip('empty'),
+    skip('empty_permuted'),
     # flaky
     skip('linalg.lstsq', 'grad_oriented'),
     skip('nn.functional.max_unpool1d', '', device_type='cpu'),
@@ -1152,6 +1296,7 @@ def f(a, b, c, d, e):
 
     # Seems like it's creating a sparse tensor that isn't captured by tensor.is_sparse
     xfail('sparse.sampled_addmm'),
+    xfail('sparse.mm', 'reduce'),
 
     # proxy tensor doesn't support sparse correctly right now
     skip('to_sparse'),
@@ -1161,7 +1306,7 @@ def f(a, b, c, d, e):
 
 fake_tensor_failures = {
     # FakeTensor fallback doesn't work
-    xfail('segment_reduce', 'lengths'),
+    xfail('_segment_reduce', 'lengths'),
     xfail('multinomial'),
     xfail('cholesky'),
     xfail('cholesky_inverse'),
@@ -1180,10 +1325,7 @@ def f(a, b, c, d, e):
     xfail('masked.cumprod', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
     xfail('addmv', ''),  # aten.addmv.default - couldn't find symbolic meta function/decomposition
     xfail('aminmax', ''),  # aten.aminmax.default - couldn't find symbolic meta function/decomposition
-    xfail('argwhere', ''),  # aten.nonzero.default - couldn't find symbolic meta function/decomposition
     xfail('baddbmm', ''),  # aten.baddbmm.default - couldn't find symbolic meta function/decomposition
-    xfail('bucketize', ''),  # aten.bucketize.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('cartesian_prod', ''),  # Tensors of type TensorImpl do not have numel
     xfail('cdist', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('cholesky_solve', ''),  # Could not run 'aten::_cholesky_solve_helper' with arguments from the 'Meta' back...
     xfail('column_stack', ''),  # Tensors of type TensorImpl do not have numel
@@ -1265,13 +1407,10 @@ def f(a, b, c, d, e):
     xfail('masked_select', ''),  # aten.masked_select.default - couldn't find symbolic meta function/decomposition
     xfail('matrix_exp', ''),  # aten.linalg_matrix_exp.default - couldn't find symbolic meta function/decomposition
     xfail('median', ''),  # Could not run 'aten::median' with arguments from the 'Meta' backend. This could be becau...
-    xfail('meshgrid', 'list_of_tensors'),  # Tensors of type TensorImpl do not have numel
-    xfail('meshgrid', 'variadic_tensors'),  # Tensors of type TensorImpl do not have numel
     xfail('min', 'reduction_with_dim'),  # aten.min.dim - couldn't find symbolic meta function/decomposition
     xfail('mode', ''),  # aten.mode.default - couldn't find symbolic meta function/decomposition
     xfail('nanquantile', ''),  # Could not run 'aten::equal' with arguments from the 'Meta' backend.
     xfail('narrow', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
-    xfail('max_pool2d_with_indices_backward', ''),  # (symint math failure) Given input size: (s0xs1x2). Calculated ...
     xfail('nn.functional.adaptive_max_pool1d', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.adaptive_max_pool2d', ''),  # aten.adaptive_max_pool2d.default - couldn't find symbolic meta funct...
     xfail('nn.functional.adaptive_max_pool3d', ''),  # argument 'output_size' (position 2) must be tupl...
@@ -1285,8 +1424,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.fractional_max_pool2d', ''),  # argument 'size' must be tuple of ints, but found element of t...
     xfail('nn.functional.fractional_max_pool3d', ''),  # argument 'size' must be tuple of ints, but found element of t...
     xfail('nn.functional.grid_sample', ''),  # aten.grid_sampler_2d.default - couldn't find symbolic meta function/decompos...
-    xfail('nn.functional.interpolate', 'area'),  # aten.size.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.interpolate', 'bicubic'),  # aten.upsample_bicubic2d.vec - couldn't find symbolic meta function/d...
     xfail('nn.functional.interpolate', 'linear'),  # aten.upsample_linear1d.vec - couldn't find symbolic meta function/dec...
     xfail('nn.functional.interpolate', 'trilinear'),  # aten.upsample_trilinear3d.vec - couldn't find symbolic meta functi...
     xfail('nn.functional.max_pool1d', ''),  # Trying to call aten.size on a tensor with symbolic shapes.
@@ -1301,8 +1438,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.pdist', ''),  # Could not run 'aten::_pdist_forward' with arguments from the 'Meta' backend...
     xfail('nn.functional.pixel_unshuffle', ''),  # aten.pixel_unshuffle.default - couldn't find symbolic meta function/deco...
     xfail('nn.functional.smooth_l1_loss', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
-    xfail('nonzero', ''),  # aten.nonzero.default - couldn't find symbolic meta function/decomposition
-    xfail('normal', ''),  # aten.normal.Tensor_Tensor - couldn't find symbolic meta function/decomposition
     xfail('normal', 'number_mean'),  # aten.normal.float_Tensor - couldn't find symbolic meta function/decomposition
     xfail('ormqr', ''),  # aten.ormqr.default - couldn't find symbolic meta function/decomposition
     xfail('pca_lowrank', ''),  # aten.mm.default - couldn't find symbolic meta function/decomposition
@@ -1316,12 +1451,11 @@ def f(a, b, c, d, e):
     xfail('qr', ''),  # aten.linalg_qr.default - couldn't find symbolic meta function/decomposition
     xfail('renorm', ''),  # aten.renorm.default - couldn't find symbolic meta function/decomposition
     xfail('repeat_interleave', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('reshape_as', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('resize_', ''),  # aten.clone.default - couldn't find symbolic meta function/decomposition
     xfail('resize_as_', ''),  # aten.clone.default - couldn't find symbolic meta function/decomposition
     xfail('roll', ''),  # Tensors of type TensorImpl do not have numel
     xfail('searchsorted', ''),  # Could not run 'aten::searchsorted.Tensor' with arguments from the 'Meta' backend. ...
-    xfail('segment_reduce', 'offsets'),  # aten.segment_reduce.default - couldn't find symbolic meta function/decomposition
+    xfail('_segment_reduce', 'offsets'),  # aten.segment_reduce.default - couldn't find symbolic meta function/decomposition
     xfail('special.airy_ai', ''),  # aten.special_airy_ai.default - couldn't find symbolic meta function/decomposition
     xfail('special.bessel_y0', ''),  # aten.special_bessel_y0.default - couldn't find symbolic meta function/decomposition
     xfail('special.bessel_y1', ''),  # aten.special_bessel_y1.default - couldn't find symbolic meta function/decomposition
@@ -1340,14 +1474,12 @@ def f(a, b, c, d, e):
     xfail('stft', ''),  # argument 'size' must be tuple of ints, but found element of type torch._C.SymIntNode at...
     xfail('sum_to_size', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('svd_lowrank', ''),  # aten.mm.default - couldn't find symbolic meta function/decomposition
-    xfail('symeig', ''),  # aten.symeig.default - couldn't find symbolic meta function/decomposition
     xfail('take_along_dim', ''),  # dtype of indices should be Long but got Float
     xfail('take', ''),  # aten.take.default - couldn't find symbolic meta function/decomposition
     xfail('tensordot', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('trapz', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('trapezoid', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('triangular_solve', ''),  # aten.triangular_solve.default - couldn't find symbolic meta function/decomposition
-    xfail('view_as', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('vsplit', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('unique_consecutive', ''),  # aten.unique_consecutive.default - couldn't find symbolic meta function/decomposition
     xfail('unique', ''),  # aten._unique2.default - couldn't find symbolic meta function/decomposition
@@ -1446,14 +1578,12 @@ def test_make_fx_exhaustive(self, device, dtype, op):
     def test_make_fx_fake_exhaustive(self, device, dtype, op):
         _test_make_fx_helper(self, device, dtype, op, "fake")
 
-    @skipIfNoSympy
     @ops(op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive',
              make_fx_failures | fake_tensor_failures | symbolic_tensor_failures | outplace_symbolic_tensor_failures)
     def test_make_fx_symbolic_exhaustive(self, device, dtype, op):
         _test_make_fx_helper(self, device, dtype, op, "symbolic")
 
-    @skipIfNoSympy
     @ops(op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive_inplace',
              make_fx_failures | fake_tensor_failures | symbolic_tensor_failures | inplace_symbolic_tensor_failures)
diff --git a/test/test_public_bindings.py b/test/test_public_bindings.py
index 1c25bc79cf92..f49e014d9941 100644
--- a/test/test_public_bindings.py
+++ b/test/test_public_bindings.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Owner(s): ["module: autograd"]
 
-from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS
+from torch.testing._internal.common_utils import TestCase, run_tests, IS_JETSON, IS_WINDOWS
 import pkgutil
 import torch
 import sys
@@ -50,6 +50,7 @@ def test_no_new_bindings(self):
             "AnyType",
             "Argument",
             "ArgumentSpec",
+            "AwaitType",
             "autocast_decrement_nesting",
             "autocast_increment_nesting",
             "AVG",
@@ -270,7 +271,7 @@ def test_no_new_bindings(self):
         self.assertTrue(torch_C_bindings.issubset(torch_C_allowlist_superset), msg)
 
     # AttributeError: module 'torch.distributed' has no attribute '_shard'
-    @unittest.skipIf(IS_WINDOWS, "Distributed Attribute Error")
+    @unittest.skipIf(IS_WINDOWS or IS_JETSON, "Distributed Attribute Error")
     def test_correct_module_names(self):
         '''
         An API is considered public, if  its  `__module__` starts with `torch.`
diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py
index 33465217bbbc..0741e779fcda 100644
--- a/test/test_python_dispatch.py
+++ b/test/test_python_dispatch.py
@@ -3,7 +3,10 @@
 import tempfile
 import torch
 from copy import deepcopy
-from torch.library import Library
+from torch.library import Library, impl
+from torch.fx.experimental.proxy_tensor import ShapeEnv
+from torch import SymInt
+from torch._subclasses.fake_tensor import FakeTensorMode
 from torch.cuda.jiterator import _create_jit_fn
 import unittest
 from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_ROCM, IS_WINDOWS
@@ -284,6 +287,25 @@ def test_error_for_unsupported_ns_or_kind(self) -> None:
         with self.assertRaisesRegex(ValueError, "reserved namespace"):
             my_lib1 = Library("prim", "DEF")
 
+    def test_returning_symint(self) -> None:
+        shape_env = ShapeEnv()
+        fake_tensor_mode = FakeTensorMode(shape_env=shape_env)
+
+        ft = fake_tensor_mode.from_tensor(torch.rand(2, 3))
+
+        s0, s1 = ft.shape
+
+        tlib = Library("tlib", "DEF")
+        tlib.define("sqsum(SymInt a, SymInt b) -> SymInt")
+
+        @impl(tlib, "sqsum", "CompositeExplicitAutograd")
+        def sqsum(a: SymInt, b: SymInt):
+            return a * a + b * b
+
+        out = torch.ops.tlib.sqsum.default(s0, s1)
+        out_val = shape_env.evaluate_expr(out.node.expr)
+        self.assertEquals(out_val, 13)
+
 class TestPythonDispatch(TestCase):
     def test_basic(self) -> None:
         with capture_logs() as logs:
@@ -444,7 +466,7 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
         self.assertRaisesRegex(
             RuntimeError, "Unable to cast", lambda: A(torch.zeros(1)).neg(),
         )
-        self.assertRaisesRegexp(
+        self.assertRaisesRegex(
             RuntimeError, "Unable to cast", lambda: A(torch.zeros(1)).detach(),
         )
 
@@ -1579,7 +1601,7 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
 
             err_msg = "no implementation found for 'torch.ops.aten.sym_stride'"
             e = StridesNotImplemented(torch.randn(3, 3), use_wrapper_subclass)
-            with self.assertRaisesRegex(RuntimeError, err_msg):
+            with self.assertRaisesRegex(TypeError, err_msg):
                 e.stride()
 
             e = StridesCustomReturn(torch.randn(3, 3), use_wrapper_subclass)
@@ -1631,7 +1653,7 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
 
             err_msg = "no implementation found for 'torch.ops.aten.sym_size'"
             e = SizesNotImplemented(torch.randn(3, 3), use_wrapper_subclass)
-            with self.assertRaisesRegex(RuntimeError, err_msg):
+            with self.assertRaisesRegex(TypeError, err_msg):
                 e.size()
 
             e = SizesCustomReturn(torch.randn(3, 3), use_wrapper_subclass)
diff --git a/test/test_reductions.py b/test/test_reductions.py
index a823860c22de..0b196b674cd0 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -11,7 +11,7 @@
 from itertools import product, combinations, permutations
 import warnings
 
-from torch._six import inf, nan
+from torch import inf, nan
 from torch.testing import make_tensor
 from torch.testing._internal.common_dtype import (
     all_types_and_complex_and, get_all_math_dtypes, integral_types, complex_types, floating_types_and,
@@ -80,7 +80,7 @@ def _reduced_shape(shape, dim=None, keepdim=False):
 
     # Wrap negative dims
     dim = dim if isinstance(dim, Sequence) else [dim]
-    dim = set(i if i >= 0 else len(shape) + i for i in dim)
+    dim = {i if i >= 0 else len(shape) + i for i in dim}
 
     result = []
     for i, size in enumerate(shape):
@@ -1434,7 +1434,7 @@ def test_prod_bool(self, device):
         vals = [[True, True], [True, False], [False, False], []]
         for val in vals:
             result = torch.prod(torch.tensor(val, device=device), dtype=torch.bool).item()
-            expect = np.prod(np.array(val), dtype=np.bool)
+            expect = np.prod(np.array(val), dtype=bool)
             self.assertEqual(result, expect)
 
             result = torch.prod(torch.tensor(val, device=device)).item()
@@ -1563,6 +1563,14 @@ def test_output_dtype(dtype, is_int32):
             _, sorted_idx = torch.sort(sequence)
             torch.searchsorted(sequence, values_1d, sorter=sorted_idx.to(torch.float32))
 
+        # invalid sorter value, out of bound (>= innermost size)
+        with self.assertRaisesRegex(RuntimeError, "sorter index out of range"):
+            torch.searchsorted(torch.tensor([1, 2, 3]), 2.5, sorter=torch.tensor([0, 1, 3]))
+
+        # invalid sorter value, out of bound (< 0)
+        with self.assertRaisesRegex(RuntimeError, "sorter index out of range"):
+            torch.searchsorted(torch.tensor([1, 2, 3]), 2.5, sorter=torch.tensor([-1, 1, 2]))
+
         # scalar type bfloat16
         if self.device_type == 'cpu':
             def test_dtype_bfloat16(values_bf16=False, boundaries_bf16=False):
@@ -1681,6 +1689,7 @@ def test_nansum_vs_numpy(self, device, dtype):
         self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype, with_extremal=True)
         self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype, with_keepdim=True)
 
+    @onlyCPU
     @dtypes(*complex_types())
     def test_nansum_complex(self, device, dtype):
         x = torch.randn((3, 3, 3), device=device, dtype=dtype)
diff --git a/test/test_segment_reductions.py b/test/test_segment_reductions.py
index 47eb095ee914..5e14a25784bb 100644
--- a/test/test_segment_reductions.py
+++ b/test/test_segment_reductions.py
@@ -18,17 +18,17 @@
 )
 
 
-reductions = ["amax", "mean", "amin", "sum", "prod"]
+reductions = ["max", "mean", "min", "sum", "prod"]
 
 
 def get_default_value(initial_value, reduction):
     if initial_value is not None:
         return initial_value
-    if reduction == "amax":
+    if reduction == "max":
         return -float("Inf")
     elif reduction == "mean":
         return float("nan")
-    elif reduction == "amin":
+    elif reduction == "min":
         return float("Inf")
     elif reduction == "sum":
         return 0.0
@@ -75,7 +75,7 @@ def _test_common(
                 segment_reduce_kwargs['lengths'] = lengths
             else:
                 segment_reduce_kwargs['offsets'] = offsets
-            actual_result = torch.segment_reduce(
+            actual_result = torch._segment_reduce(
                 data=data,
                 reduce=reduction,
                 **segment_reduce_kwargs
@@ -108,7 +108,7 @@ def _test_common(
                 )
                 self.assertTrue(
                     gradcheck(
-                        lambda x: torch.segment_reduce(
+                        lambda x: torch._segment_reduce(
                             data=x,
                             reduce=reduction,
                             **segment_reduce_kwargs
@@ -133,13 +133,13 @@ def test_simple_1d(self, device, dtypes):
                 check_backward = True if initial is not None else False
                 initial_value = initial
                 default_value = get_default_value(initial_value, reduction)
-                if reduction == "amax":
+                if reduction == "max":
                     expected_result = [1, float("nan"), 5, default_value]
                     expected_grad = [1, 1, 0, 0, 0.5, 0.5]
                 elif reduction == "mean":
                     expected_result = [1, float("nan"), 4.666, default_value]
                     expected_grad = [1.0, 0.5, 0.5, 0.333, 0.333, 0.333]
-                elif reduction == "amin":
+                elif reduction == "min":
                     if initial is not None:
                         initial_value = 1000  # some high number
                         default_value = get_default_value(initial_value, reduction)
@@ -385,7 +385,7 @@ def test_pytorch_scatter_test_cases(self, device, dtypes, reduce):
             lengths = torch.diff(indptr, dim=dim)
             expected = torch.tensor(test[reduce], dtype=val_dtype, device=device)
 
-            actual_result = torch.segment_reduce(
+            actual_result = torch._segment_reduce(
                 data=data,
                 reduce=reduce,
                 lengths=lengths,
@@ -395,7 +395,7 @@ def test_pytorch_scatter_test_cases(self, device, dtypes, reduce):
             self.assertEqual(actual_result, expected)
 
             # test offsets
-            actual_result = torch.segment_reduce(
+            actual_result = torch._segment_reduce(
                 data=data,
                 reduce=reduce,
                 offsets=indptr,
@@ -419,7 +419,7 @@ def fn(x, mode='lengths'):
                         segment_reduce_kwargs[mode] = lengths
                     elif mode == 'offsets':
                         segment_reduce_kwargs[mode] = indptr
-                    return torch.segment_reduce(*segment_reduce_args, **segment_reduce_kwargs)
+                    return torch._segment_reduce(*segment_reduce_args, **segment_reduce_kwargs)
                 self.assertTrue(gradcheck(partial(fn, mode='lengths'), (data.clone().detach().requires_grad_(True))))
                 self.assertTrue(gradcheck(partial(fn, mode='offsets'), (data.clone().detach().requires_grad_(True))))
 
@@ -502,13 +502,13 @@ def test_unsafe_flag(self, device, dtype):
 
         # test for error on 1-D lenghts
         with self.assertRaisesRegex(RuntimeError, "Expected all rows of lengths along axis"):
-            torch.segment_reduce(data, 'sum', lengths=lengths, axis=0, unsafe=False)
+            torch._segment_reduce(data, 'sum', lengths=lengths, axis=0, unsafe=False)
 
         # test for error on multi-D lengths
         nd_lengths = torch.tensor([[0, 3, 3, 0], [2, 3, 0, 0]], dtype=length_type, device=device)
         nd_data = torch.arange(12, dtype=torch.float, device=device).reshape(2, 6)
         with self.assertRaisesRegex(RuntimeError, "Expected all rows of lengths along axis"):
-            torch.segment_reduce(nd_data, 'sum', lengths=nd_lengths, axis=1, unsafe=False)
+            torch._segment_reduce(nd_data, 'sum', lengths=nd_lengths, axis=1, unsafe=False)
 
 
 
diff --git a/test/test_serialization.py b/test/test_serialization.py
index 4eb97e8b4404..9b9a71334bad 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -5,6 +5,7 @@
 import io
 import tempfile
 import os
+import gc
 import sys
 import zipfile
 import warnings
@@ -45,7 +46,7 @@
                 break
 
 
-class FilelikeMock(object):
+class FilelikeMock:
     def __init__(self, data, has_fileno=True, has_readinto=False):
         if has_readinto:
             self.readinto = self.readinto_opt
@@ -78,7 +79,7 @@ def was_called(self, name):
         return name in self.calls
 
 
-class SerializationMixin(object):
+class SerializationMixin:
     def _test_serialization_data(self):
         a = [torch.randn(5, 5).float() for i in range(2)]
         b = [a[i % 2] for i in range(4)]  # 0-3
@@ -312,7 +313,7 @@ def test_serialization_sparse_invalid(self):
         x[1][1] = 1
         x = x.to_sparse()
 
-        class TensorSerializationSpoofer(object):
+        class TensorSerializationSpoofer:
             def __init__(self, tensor):
                 self.tensor = tensor
 
@@ -344,7 +345,7 @@ def _test_serialization_sparse_compressed_invalid(self,
         x[1][1] = 1
         x = conversion(x)
 
-        class TensorSerializationSpoofer(object):
+        class TensorSerializationSpoofer:
             def __init__(self, tensor):
                 self.tensor = tensor
 
@@ -418,7 +419,7 @@ def _test_serialization_backwards_compat(self, weights_only):
         self.assertEqual(c[1], c[3], atol=0, rtol=0)
 
         # test some old tensor serialization mechanism
-        class OldTensorBase(object):
+        class OldTensorBase:
             def __init__(self, new_tensor):
                 self.new_tensor = new_tensor
 
@@ -735,7 +736,16 @@ def test_save_different_dtype_error(self):
             with self.assertRaisesRegex(RuntimeError, error_msg):
                 torch.save([a.storage(), s_bytes], f)
 
-class serialization_method(object):
+    def test_safe_load_basic_types(self):
+        with tempfile.NamedTemporaryFile() as f:
+            data = {"int": 123, "str": "world", "float": 3.14, "bool": False}
+            torch.save(data, f)
+            f.seek(0)
+            loaded_data = torch.load(f, weights_only=True)
+            self.assertEqual(data, loaded_data)
+
+
+class serialization_method:
     def __init__(self, use_zip):
         self.use_zip = use_zip
         self.torch_save = torch.save
@@ -872,7 +882,7 @@ def test_serialization_offset_filelike(self, weights_only):
 
     def run(self, *args, **kwargs):
         with serialization_method(use_zip=False):
-            return super(TestOldSerialization, self).run(*args, **kwargs)
+            return super().run(*args, **kwargs)
 
 
 class TestSerialization(TestCase, SerializationMixin):
@@ -905,6 +915,8 @@ def test_serialization_zipfile_actually_jit(self):
 
     # Ensure large zip64 serialization works properly
     def test_serialization_2gb_file(self):
+        # Run GC to clear up as much memory as possible before running this test
+        gc.collect()
         big_model = torch.nn.Conv2d(20000, 3200, kernel_size=3)
 
         with BytesIOContext() as f:
@@ -1009,7 +1021,7 @@ def _save_load_check(t):
 
     def run(self, *args, **kwargs):
         with serialization_method(use_zip=True):
-            return super(TestSerialization, self).run(*args, **kwargs)
+            return super().run(*args, **kwargs)
 
 
 class TestWrapperSubclass(torch.Tensor):
diff --git a/test/test_shape_ops.py b/test/test_shape_ops.py
index 9b2ff360553c..189187b58293 100644
--- a/test/test_shape_ops.py
+++ b/test/test_shape_ops.py
@@ -7,11 +7,12 @@
 from functools import partial
 import random
 import warnings
+import unittest
 
-from torch._six import nan
+from torch import nan
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
-    TestCase, run_tests, skipIfTorchDynamo, torch_to_numpy_dtype_dict)
+    TestCase, run_tests, skipIfTorchDynamo, torch_to_numpy_dtype_dict, IS_JETSON)
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCPU, onlyCUDA, dtypes, onlyNativeDeviceTypes,
     dtypesIfCUDA, largeTensorTest)
@@ -406,6 +407,34 @@ def gen_data():
             out_t = make_from_data([[3, 2, 1], [6, 5, 4]])
             yield in_t, dims, out_t
 
+            # vectorized NCHW cases (images)
+            if device == "cpu" and dtype != torch.bfloat16:
+                for mf in [torch.contiguous_format, torch.channels_last]:
+                    for c in [2, 3, 8, 16]:
+                        in_t = make_from_size((2, c, 32, 32)).contiguous(memory_format=mf)
+                        np_in_t = in_t.numpy()
+
+                        np_out_t = np_in_t[:, :, :, ::-1].copy()
+                        out_t = torch.from_numpy(np_out_t)
+                        yield in_t, 3, out_t
+
+                        np_out_t = np_in_t[:, :, ::-1, :].copy()
+                        out_t = torch.from_numpy(np_out_t)
+                        yield in_t, 2, out_t
+
+                        # non-contig cases
+                        in_tt = in_t[..., ::2, :]
+                        np_in_t = in_tt.numpy()
+                        np_out_t = np_in_t[:, :, :, ::-1].copy()
+                        out_t = torch.from_numpy(np_out_t)
+                        yield in_tt, 3, out_t
+
+                        in_tt = in_t[..., ::2]
+                        np_in_t = in_tt.numpy()
+                        np_out_t = np_in_t[:, :, :, ::-1].copy()
+                        out_t = torch.from_numpy(np_out_t)
+                        yield in_tt, 3, out_t
+
             # Noops (edge cases)
 
             # Size 0
@@ -477,6 +506,7 @@ def test_flip_numpy(self, device, dtype):
     @onlyCUDA  # CPU is too slow
     @largeTensorTest('17GB')  # 4 tensors of 4GB (in, out) x (torch, numpy) + 1GB
     @largeTensorTest("81GB", "cpu")  # even for CUDA test, sufficient system memory is required
+    @unittest.skipIf(IS_JETSON, "Too large for Jetson")
     def test_flip_large_tensor(self, device):
         t_in = torch.empty(2**32 + 1, dtype=torch.uint8).random_()
         torch_fn = partial(torch.flip, dims=(0,))
diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py
index 1343e1ae814d..d8d7e7aaed10 100644
--- a/test/test_sort_and_select.py
+++ b/test/test_sort_and_select.py
@@ -4,7 +4,7 @@
 import numpy as np
 
 import random
-from torch._six import nan
+from torch import nan
 from itertools import permutations, product
 
 from torch.testing import make_tensor
@@ -826,7 +826,7 @@ def ensure_tuple(x):
                 self.assertEqual(expected_inverse.view(additional_shape), y_inverse)
                 self.assertEqual(expected_counts, y_counts)
 
-    @dtypesIfCPU(*all_types_and(torch.bool, torch.bfloat16))
+    @dtypesIfCPU(*all_types_and(torch.bool, torch.float16, torch.bfloat16))
     @dtypes(*all_types_and(torch.half, torch.bool))
     def test_unique(self, device, dtype):
         def ensure_tuple(x):
@@ -883,7 +883,7 @@ def ensure_tuple(x):
                                 count += 1
                         self.assertEqual(j, count)
 
-    @dtypesIfCPU(*all_types_and(torch.bool, torch.bfloat16))
+    @dtypesIfCPU(*all_types_and(torch.bool, torch.float16, torch.bfloat16))
     @dtypes(*all_types_and(torch.half, torch.bool))
     def test_unique_consecutive(self, device, dtype):
         if dtype is torch.bool:
diff --git a/test/test_sparse.py b/test/test_sparse.py
index cc4611455b35..f02233941a66 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -8,25 +8,32 @@
 import unittest
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocm, do_test_dtypes, \
-    do_test_empty_full, load_tests, TEST_NUMPY, TEST_SCIPY, IS_WINDOWS, gradcheck, coalescedonoff, \
+    load_tests, TEST_NUMPY, TEST_SCIPY, IS_WINDOWS, gradcheck, coalescedonoff, \
     DeterministicGuard, first_sample, TEST_WITH_CROSSREF, TEST_WITH_ROCM, skipIfTorchDynamo, \
-    parametrize, subtest, is_coalesced_indices
+    parametrize, subtest, is_coalesced_indices, suppress_warnings, is_slow_gradcheck_env
 from torch.testing._internal.common_cuda import TEST_CUDA, _get_torch_cuda_version
 from numbers import Number
 from typing import Dict, Any
 from distutils.version import LooseVersion
 from torch.testing._internal.common_cuda import \
-    (SM53OrLater, SM80OrLater, CUDA11OrLater)
+    (SM53OrLater, SM80OrLater)
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, ops, dtypes, dtypesIfCUDA, onlyCPU, onlyCUDA, precisionOverride,
-     deviceCountAtLeast, OpDTypes)
+     deviceCountAtLeast, OpDTypes, onlyNativeDeviceTypes)
 from torch.testing._internal.common_methods_invocations import \
-    (sparse_unary_ufuncs, sparse_masked_reduction_ops)
+    (reduction_ops, sparse_unary_ufuncs, sparse_masked_reduction_ops)
 from torch.testing._internal.common_dtype import (
     all_types, all_types_and_complex, all_types_and_complex_and, floating_and_complex_types,
     floating_and_complex_types_and, integral_types, floating_types_and,
 )
 
+reduction_ops_with_sparse_support = [op for op in reduction_ops if 'masked.' not in op.name and
+                                     (op.supports_sparse
+                                      or op.supports_sparse_csr
+                                      or op.supports_sparse_csc
+                                      or op.supports_sparse_bsr
+                                      or op.supports_sparse_bsc)]
+
 if TEST_SCIPY:
     import scipy.sparse
 
@@ -39,7 +46,7 @@
 
 CUSPARSE_SPMM_COMPLEX128_SUPPORTED = (
     IS_WINDOWS and torch.version.cuda and LooseVersion(torch.version.cuda) > "11.2"
-) or (not IS_WINDOWS and CUDA11OrLater)
+) or (not IS_WINDOWS and not TEST_WITH_ROCM)
 
 def all_sparse_layouts(test_name='layout', include_strided=False):
     return parametrize(test_name, [
@@ -51,10 +58,19 @@ def all_sparse_layouts(test_name='layout', include_strided=False):
         subtest(torch.sparse_bsc, name='SparseBSC'),
     ][(0 if include_strided else 1):])
 
+def gradcheck_semantics(test_name='gradcheck'):
+    gradcheck_sparse = functools.partial(gradcheck, masked=False)
+    gradcheck_masked = functools.partial(gradcheck, masked=True, check_sparse_nnz=True)
+    gradcheck_sparse.masked = False
+    gradcheck_masked.masked = True
+    return parametrize(test_name, [
+        subtest(gradcheck_sparse, name='sparse'),
+        subtest(gradcheck_masked, name='masked')])
+
 
 class CrossRefSparseFakeMode(torch._subclasses.CrossRefFakeMode):
     def __init__(self):
-        super(CrossRefSparseFakeMode, self).__init__(
+        super().__init__(
             self.ignore_op, check_strides=False,
             check_aliasing=False,
         )  # TODO: enable stride/alias checking
@@ -75,6 +91,58 @@ def ignore_op(func):
             torch.ops.aten._values.default,
         )
 
+class TestSparseLegacyConstructors(TestCase):
+
+    def test_legacy_warnings(self):
+
+        def f1():
+            "torch.sparse.SparseTensor() is deprecated."\
+                "  Please use torch.sparse_coo_tensor((0,), dtype=)"
+            x_ref = torch.sparse_coo_tensor((0,), dtype=torch.float64)
+            x = torch.sparse.DoubleTensor()
+            self.assertEqual(x, x_ref)
+
+        def f2():
+            "torch.sparse.SparseTensor(cdata=x._cdata) is deprecated."\
+                "  Please use torch.sparse_coo_tensor(x._indices(), x._values(), x.shape)"
+            x_ref = torch.tensor([[1, 2], [3, 4]], dtype=torch.float64).to_sparse()
+            x = torch.sparse.DoubleTensor(cdata=x_ref._cdata)
+            y = torch.sparse_coo_tensor(x._indices(), x._values(), x.shape)
+            self.assertEqual(x, x_ref)
+            self.assertEqual(y, x_ref)
+
+        def f3():
+            "torch.sparse.SparseTensor(indices, values, *, device=) is deprecated."\
+                "  Please use torch.sparse_coo_tensor(indices, values, dtype=, device=)"
+            x_ref = torch.sparse_coo_tensor([[0, 0, 1, 1], [0, 1, 0, 1]], [1, 2, 3, 4], dtype=torch.float64)
+            x = torch.sparse.DoubleTensor(torch.tensor([[0, 0, 1, 1], [0, 1, 0, 1]]),
+                                          torch.tensor([1, 2, 3, 4], dtype=torch.float64))
+            self.assertEqual(x, x_ref)
+
+        def f4():
+            "torch.sparse.SparseTensor(indices, values, shape, *, device=) is deprecated."\
+                "  Please use torch.sparse_coo_tensor(indices, values, shape, dtype=, device=)"
+            x_ref = torch.sparse_coo_tensor([[0, 0, 1, 1], [0, 1, 0, 1]], [1, 2, 3, 4], (2, 3), dtype=torch.float64)
+            x = torch.sparse.DoubleTensor(torch.tensor([[0, 0, 1, 1], [0, 1, 0, 1]]),
+                                          torch.tensor([1, 2, 3, 4], dtype=torch.float64), (2, 3))
+            self.assertEqual(x, x_ref)
+
+        def f5():
+            "torch.sparse.SparseTensor(shape, *, device=) is deprecated."\
+                "  Please use torch.sparse_coo_tensor(shape, dtype=, device=)"
+            x_ref = torch.sparse_coo_tensor((2, 3), dtype=torch.float64)
+            x = torch.sparse.DoubleTensor(2, 3)
+            self.assertEqual(x, x_ref)
+
+        for test_f in [f1, f2, f3, f4, f5]:
+
+            with self.assertWarns(UserWarning, msg=test_f.__doc__) as cm:
+                test_f()
+                test_f()
+
+            # Check warn-once:
+            self.assertEqual(len(cm.warnings), 1)
+
 class TestSparseBase(TestCase):
     def run(self, result=None):
         if TEST_WITH_CROSSREF:
@@ -98,7 +166,6 @@ def sparse_empty_factory(*args, **kwargs):
         def sparse_tensor_factory(*args, **kwargs):
             return torch.sparse_coo_tensor(*args, **kwargs)
         self.sparse_tensor = sparse_tensor_factory
-        self.legacy_sparse_tensor = torch.sparse.DoubleTensor
 
     def _gen_sparse(self, sparse_dim, nnz, with_size, dtype, device, coalesced):
         if isinstance(with_size, Number):
@@ -227,11 +294,6 @@ def test_shape(sparse_dims, nnz, with_size):
         x = self.sparse_tensor(i, v, torch.Size([10, 2]), dtype=dtype, device=device)
         self.assertEqual(x.coalesce()._nnz(), 9)
 
-        # Make sure we can access empty indices / values
-        x = self.legacy_sparse_tensor()
-        self.assertEqual(x._indices().numel(), 0)
-        self.assertEqual(x._values().numel(), 0)
-
     @coalescedonoff
     @dtypes(torch.double, torch.cdouble, torch.bfloat16)
     @precisionOverride({torch.bfloat16: 1e-2})
@@ -257,7 +319,7 @@ def _test_coalesce(t):
                 else:
                     value_map[idx_tup] = val.clone() if isinstance(val, torch.Tensor) else val
 
-            new_indices = sorted(list(value_map.keys()))
+            new_indices = sorted(value_map.keys())
             _new_values = [value_map[idx] for idx in new_indices]
             if t._values().ndimension() < 2:
                 new_values = t._values().new(_new_values)
@@ -283,6 +345,7 @@ def _test_coalesce(t):
             _test_coalesce(t)  # this tests correctness
 
     @dtypes(torch.double)
+    @skipIfTorchDynamo("https://github.com/pytorch/pytorch/issues/89395")
     def test_coalesce_reference_cycle(self, device, dtype):
         # Test coalesce doesn't create autograd graph cycles (gh-52253)
 
@@ -348,7 +411,11 @@ def test_ctor_size_checks(self, device, dtype):
 
     @dtypes(*floating_and_complex_types_and(torch.float16, torch.bfloat16))
     @unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupport triggers assertion error")
-    def test_to_dense(self, device, dtype):
+    @gradcheck_semantics()
+    def test_to_dense_with_gradcheck(self, device, dtype, gradcheck):
+        if not gradcheck.masked and is_slow_gradcheck_env():
+            self.skipTest('FIXME: to_dense_backward supports masked semantics only')
+
         def test_tensor(x, res):
             x.to_dense()  # Tests triple to_dense for memory corruption
             x.to_dense()
@@ -481,7 +548,11 @@ def test_shared(self, device, dtype):
 
     @dtypes(torch.double, torch.cdouble)
     @unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupport triggers assertion error")
-    def test_to_dense_hybrid(self, device, dtype):
+    @gradcheck_semantics()
+    def test_to_dense_hybrid(self, device, dtype, gradcheck):
+        if not gradcheck.masked and is_slow_gradcheck_env():
+            self.skipTest('FIXME: to_dense_backward supports masked semantics only')
+
         def test_tensor(x, res):
             x.to_dense()  # Tests double to_dense for memory corruption
             x.to_dense()
@@ -800,16 +871,16 @@ def test_tensor(x):
             self.assertEqual(y.sparse_dim(), x.sparse_dim())
             self.assertEqual(y.dense_dim(), x.dense_dim())
 
-        x = torch.sparse.FloatTensor(2, 3, 4)
+        x = torch.sparse_coo_tensor((2, 3, 4), dtype=torch.float32)
         test_tensor(x)
 
-        x = torch.sparse.HalfTensor(2, 3, 4)
+        x = torch.sparse_coo_tensor((2, 3, 4), dtype=torch.float16)
         test_tensor(x)
 
-        x = torch.cuda.sparse.HalfTensor(2, 3, 4)
+        x = torch.sparse_coo_tensor((2, 3, 4), dtype=torch.float16)
         test_tensor(x)
 
-        x = torch.sparse.FloatTensor(2, 3, 4, 0)
+        x = torch.sparse_coo_tensor((2, 3, 4, 0), dtype=torch.float32)
         test_tensor(x)
 
     @coalescedonoff
@@ -835,7 +906,10 @@ def test_shape(sparse_dims, nnz, with_size):
     @coalescedonoff
     @dtypes(torch.double, torch.cdouble)
     @unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupport triggers assertion error")
-    def test_permute(self, device, dtype, coalesced):
+    @gradcheck_semantics()
+    def test_permute(self, device, dtype, coalesced, gradcheck):
+        if not gradcheck.masked and is_slow_gradcheck_env():
+            self.skipTest('FIXME: to_dense_backward supports masked semantics only')
         # trivial checks
         s = torch.rand(3, 3, 3, device=device, dtype=dtype).to_sparse()
         with self.assertRaisesRegex(RuntimeError, "does not match the length"):
@@ -939,7 +1013,7 @@ def test_not_in_place(x):
     def test_add_zeros(self, device, dtype, coalesced):
         def test_shape(sparse_dims, nnz, sizes):
             x, _, _ = self._gen_sparse(sparse_dims, nnz, sizes, dtype, device, coalesced)
-            zeros = torch.zeros(sizes, layout=torch.sparse_coo).to(x.device)
+            zeros = torch.sparse_coo_tensor(sizes, device=x.device)
             r1 = zeros + x
             r2 = x + zeros
             self.assertEqual(r1, x)
@@ -1459,7 +1533,8 @@ def test_shape(di, dj, dk, nnz):
     @coalescedonoff
     @unittest.skip("See https://github.com/pytorch/pytorch/issues/73145")
     @dtypes(torch.double, torch.cdouble, torch.bfloat16)
-    def test_sparse_addmm(self, device, dtype, coalesced):
+    @gradcheck_semantics()
+    def test_sparse_addmm(self, device, dtype, coalesced, gradcheck):
         def test_shape(m, n, p, nnz, broadcast, alpha_beta=None):
             if alpha_beta is None:
                 alpha = random.random()
@@ -1506,7 +1581,7 @@ def test_shape(d1, d2, d3, nnz, transposed):
 
             def fn(S, D):
                 return torch.sparse.mm(S, D)
-            gradcheck(fn, (S, D), check_sparse_nnz=True)
+            gradcheck(fn, (S, D), check_sparse_nnz=True, masked=True)
 
         test_shape(7, 8, 9, 20, False)
         test_shape(7, 8, 9, 20, True)
@@ -1514,7 +1589,8 @@ def fn(S, D):
     @coalescedonoff
     @dtypes(torch.double)
     @unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupport triggers assertion error")
-    def test_sparse_mul(self, device, dtype, coalesced):
+    @gradcheck_semantics()
+    def test_sparse_mul(self, device, dtype, coalesced, gradcheck):
         # https://github.com/pytorch/pytorch/issues/79914
         a = torch.tensor([[0., 1]], dtype=dtype, device=device).to_sparse().requires_grad_(True)
         b = torch.tensor([[0., 1]], dtype=dtype, device=device).to_sparse().requires_grad_(True)
@@ -1706,7 +1782,7 @@ def fn(S):
                     if res.is_sparse:
                         res = res.to_dense()
                     return res
-                gradcheck(fn, (S,), check_sparse_nnz=True)
+                gradcheck(fn, (S,), check_sparse_nnz=True, masked=True)
             else:
                 S_sum = torch.sparse.sum(S, td)
                 D_sum = D.sum(td)
@@ -1717,7 +1793,7 @@ def fn(S):
                     if res.is_sparse:
                         res = res.to_dense()
                     return res
-                gradcheck(fn, (S,), check_sparse_nnz=True)
+                gradcheck(fn, (S,), check_sparse_nnz=True, masked=True)
 
         nnz = 10
         sparse_dims = 2
@@ -1922,9 +1998,14 @@ def _test_sparse_mask_fixed():
                 [17, 18, 19, 20],
             ], dtype=dtype, device=device)
             exp_v = torch.tensor([7, 14, 3, 20], dtype=dtype, device=device)
-            res = dense.sparse_mask(x)
+            res_dense_lhs = dense.sparse_mask(x)
+            sparse = dense.to_sparse()
+            res_sparse_lhs = sparse.sparse_mask(x)
             expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4]), dtype=dtype, device=device)
-            self.assertEqual(res.coalesce(), expected.coalesce())
+            self.assertEqual(res_dense_lhs.coalesce(), expected.coalesce())
+            # check no side effects for the coalesce flag.
+            self.assertTrue(sparse.is_coalesced())
+            self.assertEqual(res_sparse_lhs.coalesce(), expected.coalesce())
 
             i = self.index_tensor([
                 [1, 3, 0, 4],
@@ -1934,9 +2015,14 @@ def _test_sparse_mask_fixed():
             x = self.sparse_tensor(i, v, torch.Size([5, 4, 0])).coalesce()
             dense = torch.empty([5, 4, 0], dtype=dtype, device=device)
             exp_v = torch.empty([4, 0], dtype=dtype, device=device)
-            res = dense.sparse_mask(x)
+            res_dense_lhs = dense.sparse_mask(x)
+            sparse = dense.to_sparse(2)
+            res_sparse_lhs = sparse.sparse_mask(x)
             expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4, 0]), dtype=dtype, device=device)
-            self.assertEqual(res.coalesce(), expected.coalesce())
+            self.assertEqual(res_dense_lhs.coalesce(), expected.coalesce())
+            # check no side effects for the coalesce flag.
+            self.assertTrue(sparse.is_coalesced())
+            self.assertEqual(res_sparse_lhs.coalesce(), expected.coalesce())
 
         _test_sparse_mask_fixed()
 
@@ -1949,6 +2035,11 @@ def _test_sparse_mask_fixed():
         self._test_sparse_mask_shape(0, 0, [10, 10, 10], [], dtype, device, coalesced)
         self._test_sparse_mask_shape(0, 0, [10, 10, 0], [], dtype, device, coalesced)
 
+        # check repetitions and matchings in the intersection
+        lhs = torch.randint(0, 5, (100,), device=device)
+        rhs = torch.randint(0, 5, (100,), device=device).to_sparse()
+        self.assertEqual(lhs.to_sparse().sparse_mask(rhs), lhs.sparse_mask(rhs))
+
     @coalescedonoff
     @dtypes(torch.double, torch.cdouble)
     def test_sparse_mask_hybrid(self, device, dtype, coalesced):
@@ -1969,10 +2060,15 @@ def _test_sparse_mask_hybrid_fixed():
                 [[13, 5], [14, 1], [15, 1], [16, 6]],
                 [[17, 7], [18, 2], [19, 7], [20, 1]],
             ])
-            res = dense.sparse_mask(x)
+            res_dense_lhs = dense.sparse_mask(x)
+            sparse = dense.to_sparse(2)
+            res_sparse_lhs = sparse.sparse_mask(x)
             exp_v = torch.tensor([[7, 9], [14, 1], [3, 3], [20, 1]])
             expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4, 2]))
-            self.assertEqual(res.coalesce(), expected.coalesce())
+            self.assertEqual(res_dense_lhs.coalesce(), expected.coalesce())
+            # check no side effects for the coalesce flag
+            self.assertTrue(sparse.is_coalesced())
+            self.assertEqual(res_sparse_lhs.coalesce(), expected.coalesce())
 
             i = self.index_tensor([
                 [1, 3, 0, 4],
@@ -1981,10 +2077,15 @@ def _test_sparse_mask_hybrid_fixed():
             v = torch.empty(4, 2, 0)
             x = self.sparse_tensor(i, v, torch.Size([5, 4, 2, 0])).coalesce()
             dense = torch.empty(5, 4, 2, 0)
-            res = dense.sparse_mask(x)
+            res_dense_lhs = dense.sparse_mask(x)
+            sparse = dense.to_sparse(2)
+            res_sparse_lhs = sparse.sparse_mask(x)
             exp_v = torch.empty(4, 2, 0)
             expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4, 2, 0]))
-            self.assertEqual(res.coalesce(), expected.coalesce())
+            self.assertEqual(res_dense_lhs.coalesce(), expected.coalesce())
+            # check no side effects for the coalesce flag
+            self.assertTrue(sparse.is_coalesced())
+            self.assertEqual(res_sparse_lhs.coalesce(), expected.coalesce())
 
         _test_sparse_mask_hybrid_fixed()
 
@@ -2068,7 +2169,7 @@ def test_shape(i_shapes, v_shapes, nnzs):
         self.assertEqual(dense_tensor.shape, result.shape)
         self.assertEqual(result.layout, torch.sparse_coo)
 
-        sparse_zeros = torch.zeros(dense_tensor.shape, layout=torch.sparse_coo)
+        sparse_zeros = torch.sparse_coo_tensor(dense_tensor.shape)
         self.assertEqual(result._indices().shape, sparse_zeros._indices().shape)
         self.assertEqual(result._values().shape, sparse_zeros._values().shape)
 
@@ -2444,11 +2545,11 @@ def test_sparse_add_coalesce(self, device, dtype):
         self.assertFalse(z._indices().numel() != 2 and z.is_coalesced())
 
     @onlyCUDA
-    def test_storage_not_null(self):
-        x = torch.cuda.sparse.FloatTensor(2)
+    def test_storage_not_null(self, device):
+        x = torch.sparse_coo_tensor((2,), dtype=torch.float32, device=device)
         self.assertNotEqual(x.get_device(), -1)
 
-        x = torch.cuda.sparse.FloatTensor(2, 0)
+        x = torch.sparse_coo_tensor((2, 0), dtype=torch.float32, device=device)
         self.assertNotEqual(x.get_device(), -1)
 
     @onlyCUDA
@@ -2477,19 +2578,9 @@ def check_device(x, device_id):
         x = self.sparse_empty(3, 0, device=1)
         check_device(x, 1)
 
-        i = self.index_tensor([[2]], device=dev2)
-        v = torch.tensor([5], device=dev1)
-        # NB: non-legacy constructor allows this and moves indices
-        self.assertRaises(RuntimeError, lambda: self.legacy_sparse_tensor(i, v, torch.Size([3])))
-
-        i = self.index_tensor([[2]], device=dev2)
-        v = torch.empty(1, 0, device=dev1)
-        # NB: non-legacy constructor allows this and moves indices
-        self.assertRaises(RuntimeError, lambda: self.legacy_sparse_tensor(i, v, torch.Size([3, 0])))
-
     def _test_new_device(self, size, device=torch.cuda):
         with torch.cuda.device(device):
-            x = torch.cuda.sparse.DoubleTensor(*size)
+            x = torch.sparse_coo_tensor(size, device='cuda', dtype=torch.float64)
         self.assertEqual(x.get_device(), device)
         x1 = x.new()
         x2 = x.new(2, 3)
@@ -2609,18 +2700,7 @@ def test_factory_size_check(self, device, dtype):
         with self.assertRaisesRegex(RuntimeError, "values has incorrect size"):
             torch.sparse_coo_tensor(indices, values, sizes, dtype=dtype, device=device)
 
-    def test_factory_default(self, device):
-        tensor = self.legacy_sparse_tensor()
-        expected_indices = self.index_tensor([[]], device=device)
-        expected_size = torch.Size([0])
-        self.assertEqual(tensor._indices(), expected_indices)
-        self.assertEqual(tensor.shape, expected_size)
-
     def test_factory_empty_indices(self, device):
-        tensor = self.legacy_sparse_tensor()
-        expected_indices = torch.empty((1, 0), dtype=torch.long, device=device)
-        self.assertEqual(tensor._indices(), expected_indices)
-
         tensor = torch.sparse_coo_tensor(torch.Size([2, 0]), device=device)
         expected_indices = torch.empty((2, 0), dtype=torch.long, device=device)
         self.assertEqual(tensor._indices(), expected_indices)
@@ -2788,18 +2868,12 @@ def test_tensor(indices, values, indices_equal, values_equal):
         values = make_tensor([1, 1], dtype=torch.cdouble, device=device)
         test_tensor(indices, values, False, False)
 
-
     @onlyCPU  # just run once, we test both cpu and cuda
-    def test_constructor_device_legacy(self, device):
+    def test_legacy_new_device(self, device):
         i = torch.tensor([[0, 1, 1], [2, 0, 2]])
         v = torch.tensor([3., 4., 5.])
         size = torch.Size([2, 3])
 
-        self.assertRaises(RuntimeError, lambda: torch.sparse.FloatTensor(device='cuda'))
-        self.assertRaises(RuntimeError, lambda: torch.sparse.FloatTensor(i, v, device='cuda'))
-        self.assertRaises(RuntimeError, lambda: torch.sparse.FloatTensor(i, v, size, device='cuda'))
-        self.assertRaises(RuntimeError, lambda: torch.sparse.FloatTensor(torch.Size([2, 3, 4]), device='cuda'))
-
         x = torch.sparse_coo_tensor(i, v, size, device='cpu')
         self.assertRaises(RuntimeError, lambda: x.new(device='cuda'))
         self.assertRaises(RuntimeError, lambda: x.new(i, v, device='cuda'))
@@ -2807,27 +2881,12 @@ def test_constructor_device_legacy(self, device):
         self.assertRaises(RuntimeError, lambda: x.new(torch.Size([2, 3, 4]), device='cuda'))
 
         if torch.cuda.is_available():
-            self.assertRaises(RuntimeError, lambda: torch.cuda.sparse.FloatTensor(device='cpu'))
-            self.assertRaises(RuntimeError, lambda: torch.cuda.sparse.FloatTensor(i, v, device='cpu'))
-            self.assertRaises(RuntimeError, lambda: torch.cuda.sparse.FloatTensor(i, v, size, device='cpu'))
-            self.assertRaises(RuntimeError, lambda: torch.cuda.sparse.FloatTensor(torch.Size([2, 3, 4]), device='cpu'))
-
             x = torch.sparse_coo_tensor(i, v, size, device='cuda')
             self.assertRaises(RuntimeError, lambda: x.new(device='cpu'))
             self.assertRaises(RuntimeError, lambda: x.new(i, v, device='cpu'))
             self.assertRaises(RuntimeError, lambda: x.new(i, v, size, device='cpu'))
             self.assertRaises(RuntimeError, lambda: x.new(torch.Size([2, 3, 4]), device='cpu'))
 
-    def test_legacy_constructor(self, device):
-        i = torch.tensor([[0, 1, 1], [2, 0, 2]])
-        v = torch.tensor([3., 4., 5.])
-        size = torch.Size([2, 3])
-
-        self.assertRaises(TypeError, lambda: torch.sparse.FloatTensor(v.storage()))
-        self.assertRaises(TypeError, lambda: torch.sparse.FloatTensor(v))
-        self.assertEqual(torch.sparse_coo, torch.sparse.FloatTensor(torch.Size([2, 3])).layout)
-        self.assertRaises(TypeError, lambda: torch.sparse.FloatTensor([6]))
-
     def test_legacy_new(self, device):
         i = torch.tensor([[0, 1, 1], [2, 0, 2]])
         v = torch.tensor([3., 4., 5.])
@@ -2835,7 +2894,7 @@ def test_legacy_new(self, device):
         s = torch.sparse_coo_tensor(i, v, size)
 
         self.assertEqual(torch.sparse_coo, s.new(device='cpu').layout)
-        self.assertRaises(TypeError, lambda: s.new(v.storage()))
+        self.assertRaises(TypeError, lambda: s.new(v.untyped_storage()))
         self.assertRaises(TypeError, lambda: s.new(v))
         self.assertEqual(torch.sparse_coo, s.new(torch.Size([2, 3])).layout)
         self.assertRaises(TypeError, lambda: s.new([6]))
@@ -2847,13 +2906,46 @@ def test_dtypes(self, device):
         if torch.cuda.is_available():
             do_test_dtypes(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cuda:0'))
 
+    def _test_empty_full(self, device, dtype, requires_grad):
+        shape = (2, 3)
+        layout = torch.sparse_coo
+
+        def check_value(tensor, value=None, dtype=dtype, requires_grad=requires_grad):
+            self.assertEqual(shape, tensor.shape)
+            self.assertIs(dtype, tensor.dtype)
+            self.assertIs(layout, tensor.layout)
+            self.assertEqual(tensor.requires_grad, requires_grad)
+            if tensor.is_cuda and device is not None:
+                self.assertEqual(device, tensor.device)
+            if value is not None:
+                fill = tensor.empty(shape, dtype=dtype).fill_(value)
+                self.assertEqual(tensor, fill)
+
+        v = torch.sparse_coo_tensor(shape, dtype=dtype, device=device, requires_grad=requires_grad)
+        check_value(v)
+
+        out = v.new()
+        check_value(torch.zeros(shape, out=out, device=device, requires_grad=requires_grad))
+
+        int64_dtype = torch.int64
+        check_value(v.new_empty(shape), requires_grad=False)
+        check_value(v.new_empty(shape, dtype=int64_dtype, device=device, requires_grad=False),
+                    dtype=int64_dtype, requires_grad=False)
+        check_value(torch.empty_like(v), requires_grad=False)
+        check_value(torch.empty_like(v, dtype=int64_dtype, layout=layout, device=device, requires_grad=False),
+                    dtype=int64_dtype, requires_grad=False)
+
     @onlyCPU  # not really, but we only really want to run this once
-    def test_empty_full(self, device):
-        all_sparse_dtypes = all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)
-        do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cpu'))
+    @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
+    @parametrize('requires_grad', (True, False))
+    def test_empty_full(self, device, dtype, requires_grad):
+        if requires_grad and not (dtype.is_floating_point or dtype.is_complex):
+            self.skipTest(f'requires_grad==True requires float or complex dtype, got {dtype}')
+
+        self._test_empty_full(device, dtype, requires_grad)
         if torch.cuda.device_count() > 0:
-            do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, None)
-            do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cuda:0'))
+            self._test_empty_full(None, dtype, requires_grad)
+            self._test_empty_full(torch.device('cuda:0'), dtype, requires_grad)
 
     def test_is_sparse(self, device):
         x = torch.randn(3, 3)
@@ -2862,9 +2954,6 @@ def test_is_sparse(self, device):
         x = torch.randn(3, 3, 0)
         self.assertFalse(x.is_sparse)
 
-        x = self.legacy_sparse_tensor()
-        self.assertTrue(x.is_sparse)
-
         x = self.sparse_empty(1, 0, device=device)
         self.assertTrue(x.is_sparse)
 
@@ -2876,7 +2965,6 @@ def do_test(t):
             # sparse_dim and dense_dim match.
             self.assertEqual(t, t + y)
 
-        do_test(self.legacy_sparse_tensor())
         do_test(self.sparse_empty([3, 0], device=device))
         do_test(self.sparse_empty([3, 3], device=device))
 
@@ -3409,19 +3497,36 @@ def sparse_log(x):
         test_op(4, 100, [3, 4, 2, 3, 5, 2], coalesced)
 
 
-    @dtypes(torch.double)
+    def _check_zero_nnz_softmax_op(self, func, ndim, device, dtype):
+        # create a sparse tensor with shape (0,..., 3) it has no materialize values
+        t = torch.sparse_coo_tensor([[] for _ in range(ndim)], [], (0,) * (ndim - 1) + (3,), device=device, dtype=dtype)
+        out = func(t, 0)
+        self.assertEqual(out, torch.zeros_like(t))
+
+        # gradient
+        t = t.requires_grad_()
+        gradcheck(lambda x: func(x, 0).to_dense(), (t,), masked=True, check_sparse_nnz=True)
+
+
+    @dtypes(torch.double, torch.float)
+    @unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupport triggers assertion error")
     def test_softmax_zero_nnz(self, device, dtype):
-        t = torch.sparse_coo_tensor([[]], [], (3,), device=device, dtype=dtype)
-        out = torch.sparse.softmax(t, 0)
-        self.assertEqual(out.to_dense(), torch.zeros_like(t))
+        self._check_zero_nnz_softmax_op(torch.sparse.softmax, 1, device, dtype)
+        self._check_zero_nnz_softmax_op(torch.sparse.softmax, 10, device, dtype)
+
+    @dtypes(torch.double, torch.float)
+    @unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupport triggers assertion error")
+    def test_log_softmax_zero_nnz(self, device, dtype):
+        self._check_zero_nnz_softmax_op(torch.sparse.log_softmax, 1, device, dtype)
+        self._check_zero_nnz_softmax_op(torch.sparse.log_softmax, 10, device, dtype)
 
     # TODO: Check after why ROCm's cusparseXcsrgemm2Nnz function doesn't return the same nnz value as CUDA
     @skipIfRocm
     @coalescedonoff
     @dtypes(*floating_and_complex_types())
-    @dtypesIfCUDA(*floating_types_and(*[torch.half] if CUDA11OrLater and SM53OrLater else [],
-                                      *[torch.bfloat16] if CUDA11OrLater and SM80OrLater else [],
-                                      *[torch.complex64] if CUDA11OrLater else [],
+    @dtypesIfCUDA(*floating_types_and(*[torch.half] if SM53OrLater else [],
+                                      *[torch.bfloat16] if SM80OrLater else [],
+                                      torch.complex64,
                                       *[torch.complex128] if CUSPARSE_SPMM_COMPLEX128_SUPPORTED else []))
     @unittest.skipIf(TEST_WITH_CROSSREF, "not working with fake tensor")
     @precisionOverride({torch.bfloat16: 1e-2, torch.float16: 1e-2, torch.complex64: 1e-2, torch.float32: 1e-2})
@@ -3458,8 +3563,11 @@ def test_grad_dense(a_s, b_s, g_s):
             c.backward(g)
 
             a_grad, b_grad = test_grad_dense(a, b, g)
-            self.assertEqual(a.grad, a_grad)
-            self.assertEqual(b.grad, b_grad)
+
+            # We convert grad to dense since dense and sparse mm
+            # implementations handle materialized zeroes differently.
+            self.assertEqual(a.grad.to_dense(), a_grad.to_dense())
+            self.assertEqual(b.grad.to_dense(), b_grad.to_dense())
 
         def test_sparse_matmul(sparse_dims, nnz, shape_a, shape_b):
             a, i_a, v_a = self._gen_sparse(sparse_dims, nnz, shape_a, dtype, device, coalesced)
@@ -3488,9 +3596,9 @@ def fn(D1, D2):
                     # This is because cuSparse sometimes returns approximate zero values like `~e-323`
                     # TODO: Check this cuSparse issue.
                     # This happens when you do chain multiplication `torch.sparse.mm` operations
-                    gradcheck(fn, (a, b), check_sparse_nnz=True, nondet_tol=1e-5)
+                    gradcheck(fn, (a, b), check_sparse_nnz=True, nondet_tol=1e-5, masked=True)
                 else:
-                    gradcheck(fn, (a, b), check_sparse_nnz=True)
+                    gradcheck(fn, (a, b), check_sparse_nnz=True, masked=True)
                 grad_with_custom_sparsity_pattern_test_helper(sparse_dims, nnz, shape_a, shape_b)
 
         def test_error_cases():
@@ -3835,8 +3943,10 @@ def run_test(shape, nnz):
             self.assertEqual(a.sum(), a._values().sum())
             if dtype.is_floating_point or dtype.is_complex:
                 a.requires_grad_(True)
-                a.sum().backward()
-                self.assertEqual(a.grad, torch.ones(shape, dtype=dtype, device=device))
+                a_inter = a.sum()
+                a_inter.abs().backward()
+                with torch.no_grad():
+                    self.assertEqual(a.grad, torch.ones(shape, dtype=dtype, device=device) * torch.sgn(a_inter))
         for shape in [(10, 5), (10, 10)]:
             run_test(shape, 0)
             run_test(shape, max(shape))
@@ -3849,44 +3959,44 @@ def test_cuda_from_cpu(self):
         with self.assertRaisesRegex(
                 RuntimeError,
                 "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"):
-            torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(),
-                                     torch.randn(4, 4, 4),
-                                     [3, 4, 4])
+            torch.sparse_coo_tensor(torch.zeros(1, 4).long().cuda(),
+                                    torch.randn(4, 4, 4),
+                                    [3, 4, 4])
 
         with self.assertRaisesRegex(
                 RuntimeError,
                 "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"):
-            torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(),
-                                     torch.randn(4, 4, 4, 0),
-                                     [3, 4, 4, 0])
+            torch.sparse_coo_tensor(torch.zeros(1, 4).long().cuda(),
+                                    torch.randn(4, 4, 4, 0),
+                                    [3, 4, 4, 0])
 
         with self.assertRaisesRegex(
                 RuntimeError,
                 "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"):
-            torch.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(),
-                                     torch.randn(0, 4, 4, 0),
-                                     [0, 4, 4, 0])
+            torch.sparse_coo_tensor(torch.empty(1, 0).long().cuda(),
+                                    torch.randn(0, 4, 4, 0),
+                                    [0, 4, 4, 0])
 
     @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
     def test_cuda_sparse_cpu_dense_add(self):
         x = torch.zeros(3, 4, 4)
-        sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(),
-                                                 torch.randn(4, 4, 4).cuda(),
-                                                 [3, 4, 4])
+        sparse_y = torch.sparse_coo_tensor(torch.zeros(1, 4).long().cuda(),
+                                           torch.randn(4, 4, 4).cuda(),
+                                           [3, 4, 4])
         with self.assertRaisesRegex(RuntimeError, "add: expected 'self' to be a CUDA tensor, but got a CPU tensor"):
             x + sparse_y
 
         x = torch.zeros(3, 4, 4, 0)
-        sparse_y = torch.cuda.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(),
-                                                 torch.randn(4, 4, 4, 0).cuda(),
-                                                 [3, 4, 4, 0])
+        sparse_y = torch.sparse_coo_tensor(torch.zeros(1, 4).long().cuda(),
+                                           torch.randn(4, 4, 4, 0).cuda(),
+                                           [3, 4, 4, 0])
         with self.assertRaisesRegex(RuntimeError, "add: expected 'self' to be a CUDA tensor, but got a CPU tensor"):
             x + sparse_y
 
         x = torch.zeros(0, 4, 4, 0)
-        sparse_y = torch.cuda.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(),
-                                                 torch.randn(0, 4, 4, 0).cuda(),
-                                                 [0, 4, 4, 0])
+        sparse_y = torch.sparse_coo_tensor(torch.empty(1, 0).long().cuda(),
+                                           torch.randn(0, 4, 4, 0).cuda(),
+                                           [0, 4, 4, 0])
         with self.assertRaisesRegex(RuntimeError, "add: expected 'self' to be a CUDA tensor, but got a CPU tensor"):
             x + sparse_y
 
@@ -3925,8 +4035,8 @@ def test_out(self, device, dtype, op):
         sample.input = sample.input.to_sparse()
         expect = op(sample.input, *sample.args, **sample.kwargs)
 
-        out = torch.zeros(sample.input.shape, device=device,
-                          dtype=expect.dtype, layout=torch.sparse_coo)
+        out = torch.sparse_coo_tensor(sample.input.shape, device=device,
+                                      dtype=expect.dtype)
         op(sample.input, *sample.args, **sample.kwargs, out=out)
         self.assertEqual(out, expect)
 
@@ -3963,8 +4073,7 @@ def test_sparse_zeros(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype)
 
         zero_input = torch.zeros((), device=device, dtype=dtype)
-        sparse_input = torch.zeros((), dtype=dtype, device=device,
-                                   layout=torch.sparse_coo)
+        sparse_input = torch.sparse_coo_tensor((), dtype=dtype, device=device)
 
         expect = op(zero_input)
         actual = op(sparse_input)
@@ -3990,7 +4099,8 @@ def fn(x):
                 check_grad_dtypes=True,
                 check_sparse_nnz=True,
                 nondet_tol=op.gradcheck_nondet_tol,
-                fast_mode=op.gradcheck_fast_mode))
+                fast_mode=op.gradcheck_fast_mode,
+                masked=True))
 
 
 class TestSparseMaskedReductions(TestCase):
@@ -4180,6 +4290,33 @@ def create_invalid_tensor(check_invariants=None):
         # local context:
         self.assertFalse(torch.sparse.check_sparse_tensor_invariants.is_enabled())
 
+        # Test nesting of pre-defined context managers
+        check_ctx = torch.sparse.check_sparse_tensor_invariants(True)
+        no_check_ctx = torch.sparse.check_sparse_tensor_invariants(False)
+        with check_ctx:
+            self.assertTrue(torch.sparse.check_sparse_tensor_invariants.is_enabled())
+            with no_check_ctx:
+                self.assertFalse(torch.sparse.check_sparse_tensor_invariants.is_enabled())
+            self.assertTrue(torch.sparse.check_sparse_tensor_invariants.is_enabled())
+        self.assertFalse(torch.sparse.check_sparse_tensor_invariants.is_enabled())
+
+        # Test an attempt to re-use an activate context manager instance
+        check_ctx2 = torch.sparse.check_sparse_tensor_invariants(True)
+        with check_ctx:
+            self.assertTrue(torch.sparse.check_sparse_tensor_invariants.is_enabled())
+            with no_check_ctx:
+                self.assertFalse(torch.sparse.check_sparse_tensor_invariants.is_enabled())
+                with self.assertRaisesRegex(RuntimeError, "This context manager instance is already activated."
+                                            " Use a different context manager instance for context nesting"):
+                    with check_ctx:
+                        self.assertTrue(torch.sparse.check_sparse_tensor_invariants.is_enabled())
+                self.assertFalse(torch.sparse.check_sparse_tensor_invariants.is_enabled())
+                with check_ctx2:
+                    self.assertTrue(torch.sparse.check_sparse_tensor_invariants.is_enabled())
+                self.assertFalse(torch.sparse.check_sparse_tensor_invariants.is_enabled())
+            self.assertTrue(torch.sparse.check_sparse_tensor_invariants.is_enabled())
+        self.assertFalse(torch.sparse.check_sparse_tensor_invariants.is_enabled())
+
     def test_generate_simple_inputs(self):
         layouts = [torch.strided, torch.sparse_coo, torch.sparse_csr, torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc]
 
@@ -4257,7 +4394,7 @@ def test_generate_simple_inputs(self):
     @parametrize("index_dtype", [torch.int32, torch.int64])
     def test_to_dense(self, from_layout, device, dtype, index_dtype):
         """
-        This test tests conversion from any layout to any sparse layout.
+        This test tests conversion from any layout to strided layout.
         """
         for t in self.generate_simple_inputs(
                 from_layout, device=device, dtype=dtype, index_dtype=index_dtype):
@@ -4265,6 +4402,34 @@ def test_to_dense(self, from_layout, device, dtype, index_dtype):
             self.assertEqual(r.layout, torch.strided)
             self.assertEqual(r, t)
 
+    @all_sparse_layouts('from_layout', include_strided=False)
+    @dtypes(torch.float64, torch.complex128)
+    @parametrize("index_dtype", [torch.int64])
+    @gradcheck_semantics()
+    def test_gradcheck_to_dense(self, from_layout, device, dtype, index_dtype, gradcheck):
+        for t in self.generate_simple_inputs(
+                from_layout, device=device, dtype=dtype, index_dtype=index_dtype):
+            batch_dim = t.dim() - t.dense_dim() - t.sparse_dim()
+            if batch_dim > 0:
+                # TODO: implement batch support in _convert_indices_from_csr_to_coo
+                continue
+            t = t.clone().detach().requires_grad_(True)
+            if is_slow_gradcheck_env() and not gradcheck.masked:
+                # TODO: remove this if-block when TODO items below are resolved
+                try:
+                    gradcheck(torch.Tensor.to_dense, t)
+                except RuntimeError as msg:
+                    # TODO: implement non-masked semantics support in to_dense_backward
+                    with self.assertRaisesRegex(RuntimeError, "Jacobian mismatch"):
+                        gradcheck(torch.Tensor.to_dense, t)
+                    self.skipTest('non-masked semantics not supported')
+            r = gradcheck(torch.Tensor.to_dense, t)
+            self.assertTrue(r)
+
+        # when the following assert fails, it means that the if-block
+        # above and the assertFalse test below can be safely removed
+        self.assertFalse(is_slow_gradcheck_env() and not gradcheck.masked)
+
     @all_sparse_layouts('from_layout', include_strided=True)
     @all_sparse_layouts('to_layout', include_strided=False)
     @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
@@ -4321,17 +4486,13 @@ def explicit_to_sparse(x):
 
             # TODO: The following exception cases all correspond to
             # not implemented conversions
-            if from_layout is torch.sparse_coo and to_layout in {
-                    torch.sparse_bsr, torch.sparse_bsc} and t.sparse_dim() == 2 and is_hybrid:
-                with self.assertRaisesRegex(RuntimeError, "conversion from Csr to Bsr is only possible for 2d inputs"):
-                    t.to_sparse(layout=to_layout, blocksize=blocksize)
-                with self.assertRaisesRegex(RuntimeError, "conversion from Csr to Bsr is only possible for 2d inputs"):
-                    explicit_to_sparse(t)
-                continue
-            elif from_layout is torch.sparse_csr and to_layout in {torch.sparse_bsr} and (is_batch or is_hybrid):
-                with self.assertRaisesRegex(RuntimeError, "conversion from Csr to Bsr is only possible for 2d inputs"):
+            if from_layout in {
+                    torch.sparse_csr, torch.sparse_csc} and to_layout in {torch.sparse_bsr, torch.sparse_bsc} and is_batch:
+                with self.assertRaisesRegex(RuntimeError,
+                                            r"conversion from (Csr|Csc) to (Bsr|Bsc) for batched inputs is not implemented"):
                     t.to_sparse(layout=to_layout, blocksize=blocksize)
-                with self.assertRaisesRegex(RuntimeError, "conversion from Csr to Bsr is only possible for 2d inputs"):
+                with self.assertRaisesRegex(RuntimeError,
+                                            r"conversion from (Csr|Csc) to (Bsr|Bsc) for batched inputs is not implemented"):
                     explicit_to_sparse(t)
                 continue
             elif from_layout is torch.sparse_coo and to_layout in {
@@ -4353,17 +4514,15 @@ def explicit_to_sparse(x):
                     explicit_to_sparse(t)
                 continue
             elif (from_layout, to_layout) in {(torch.sparse_bsc, torch.sparse_csr), (torch.sparse_bsc, torch.sparse_csc),
-                                              (torch.sparse_bsr, torch.sparse_csr), (torch.sparse_bsr, torch.sparse_csc),
-                                              (torch.sparse_csc, torch.sparse_bsr), (torch.sparse_csc, torch.sparse_bsc),
-                                              (torch.sparse_csr, torch.sparse_bsc)}:
+                                              (torch.sparse_bsr, torch.sparse_csr), (torch.sparse_bsr, torch.sparse_csc)}:
                 with self.assertRaisesRegex(
                         RuntimeError,
-                        r"sparse_compressed_to_sparse_(csr|csc|bsr|bsc) expected\s*(SparseCsr[,]|)\s*Sparse(Csr|Bsr)"
+                        r"sparse_compressed_to_sparse_(csr|csc|bsr|bsc) expected\s*(Sparse(Csc|Csr)[,]|)\s*Sparse(Csr|Bsr)"
                         " or Sparse(Csc|Bsc) layout but got Sparse(Csr|Csc|Bsr|Bsc)"):
                     t.to_sparse(layout=to_layout, blocksize=blocksize)
                 with self.assertRaisesRegex(
                         RuntimeError,
-                        r"sparse_compressed_to_sparse_(csr|csc|bsr|bsc) expected\s*(SparseCsr[,]|)\s*Sparse(Csr|Bsr)"
+                        r"sparse_compressed_to_sparse_(csr|csc|bsr|bsc) expected\s*(Sparse(Csc|Csr)[,]|)\s*Sparse(Csr|Bsr)"
                         " or Sparse(Csc|Bsc) layout but got Sparse(Csr|Csc|Bsr|Bsc)"):
                     explicit_to_sparse(t)
                 self.skipTest('NOT IMPL')
@@ -4411,6 +4570,21 @@ def explicit_to_sparse(x):
                 r2 = explicit_to_sparse(t)
                 self.assertEqual(r2, r)
 
+                # Check inverse conversion from sparse compressed block tensors
+                if from_layout == torch.sparse_bsr:
+                    batch_ndim = t.crow_indices().dim() - 1
+                    from_blocksize = t.values().shape[batch_ndim + 1:batch_ndim + 3]
+                elif from_layout == torch.sparse_bsc:
+                    batch_ndim = t.ccol_indices().dim() - 1
+                    from_blocksize = t.values().shape[batch_ndim + 1:batch_ndim + 3]
+                else:
+                    continue
+                if r.ndim != 2:
+                    continue
+
+                t2 = r.to_sparse(layout=from_layout, blocksize=from_blocksize)
+                self.assertEqual(t2, t)
+
         # extra tests
         if (from_layout, to_layout) == (torch.sparse_csr, torch.sparse_bsr):
             # See gh-90910
@@ -4434,6 +4608,148 @@ def explicit_to_sparse(x):
             torch._validate_sparse_compressed_tensor_args(compressed_indices, plain_indices, r.values(), r.shape, r.layout)
             self.assertEqual(r, t)
 
+    @onlyNativeDeviceTypes
+    @suppress_warnings
+    @ops(reduction_ops_with_sparse_support)
+    @precisionOverride({torch.bfloat16: 5e-4, torch.float16: 5e-3})
+    @all_sparse_layouts('layout', include_strided=False)
+    def test_reductions(self, layout, device, dtype, op):
+        count = 0
+        for sample in op.sample_inputs_sparse(layout, device, dtype):
+            count += 1
+
+            t_inp, t_args, t_kwargs = sample.input, sample.args, sample.kwargs
+            result = op.op(t_inp, *t_args, **t_kwargs)
+
+            #  Checking invariant rop(inp, ...).to_dense() == rop(inp.to_dense(), ...)
+            dense = op.op(t_inp.to_dense(), *t_args, **t_kwargs)
+            self.assertEqual(result, dense)
+
+        if count == 0:
+            # we count samples to avoid false-positive test reports
+            self.skipTest('no sample inputs')
+
+    @onlyNativeDeviceTypes
+    @suppress_warnings
+    @ops(reduction_ops_with_sparse_support, allowed_dtypes=(torch.float32, torch.float64, torch.complex64, torch.complex128))
+    @all_sparse_layouts('layout', include_strided=False)
+    def test_reductions_backward(self, layout, device, dtype, op):
+        count = 0
+        for sample in op.sample_inputs_sparse(layout, device, dtype, requires_grad=True):
+            t_inp, t_args, t_kwargs = sample.input, sample.args, sample.kwargs
+            r = op.op(t_inp, *t_args, **t_kwargs)
+            if r.numel() != 0:
+                r = r.sum()
+
+            if op.name == 'sum':
+                count += 1
+                r.abs().backward()
+                self.assertEqual(t_inp.grad, torch.ones(t_inp.shape, dtype=dtype, device=device) * torch.sgn(r))
+            else:
+                self.skipTest('NOT IMPL')
+
+        if count == 0:
+            # we count samples to avoid false-positive test reports
+            self.skipTest('no sample inputs')
+
+    @onlyNativeDeviceTypes
+    @suppress_warnings
+    @parametrize("mth", [subtest(mth, name=mth.__name__)
+                         for mth in [torch.Tensor.is_coalesced,
+                                     torch.Tensor.coalesce,
+                                     torch.Tensor.indices,
+                                     torch.Tensor.values,
+                                     torch.Tensor.crow_indices,
+                                     torch.Tensor.col_indices,
+                                     torch.Tensor.ccol_indices,
+                                     torch.Tensor.row_indices,
+                                     ]])
+    @all_sparse_layouts('layout', include_strided=True)
+    def test_unsupported_backend_error_message(self, mth, layout, device):
+        inp = torch.tensor([[1, 2], [3, 4]], device=device).to_sparse(
+            layout=layout,
+            blocksize=(1, 1) if layout in {torch.sparse_bsr, torch.sparse_bsc} else None)
+        assert inp.layout is layout
+
+        expected_behaviour = dict(
+            # <mth name> = (<supported layouts>, <exception message on other layouts>)
+            is_coalesced=({torch.sparse_coo},
+                          "is_coalesced expected sparse coordinate tensor layout but got (Sparse(Csr|Csc|Bsr|Bsc)|Strided)"),
+            coalesce=({torch.sparse_coo},
+                      "coalesce expected sparse coordinate tensor layout but got (Sparse(Csr|Csc|Bsr|Bsc)|Strided)"),
+            indices=({torch.sparse_coo},
+                     "indices expected sparse coordinate tensor layout but got (Sparse(Csr|Csc|Bsr|Bsc)|Strided)"),
+            values=({torch.sparse_coo, torch.sparse_csr, torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc},
+                    "values expected sparse tensor layout but got Strided"),
+            crow_indices=({torch.sparse_csr, torch.sparse_bsr},
+                          "crow_indices expected sparse row compressed tensor layout but got (Sparse(Csc|Bsc|)|Strided)"),
+            col_indices=({torch.sparse_csr, torch.sparse_bsr},
+                         "col_indices expected sparse row compressed tensor layout but got (Sparse(Csc|Bsc|)|Strided)"),
+            ccol_indices=({torch.sparse_csc, torch.sparse_bsc},
+                          "ccol_indices expected sparse column compressed tensor layout but got (Sparse(Csr|Bsr|)|Strided)"),
+            row_indices=({torch.sparse_csc, torch.sparse_bsc},
+                         "row_indices expected sparse column compressed tensor layout but got (Sparse(Csr|Bsr|)|Strided)"),
+        )[mth.__name__]
+
+        if layout in expected_behaviour[0]:
+            mth(inp)
+        else:
+            with self.assertRaisesRegex(RuntimeError, expected_behaviour[1]):
+                mth(inp)
+
+    @onlyNativeDeviceTypes
+    @all_sparse_layouts('layout', include_strided=not True)
+    @dtypes(torch.float64, torch.cdouble)
+    @parametrize("masked", [subtest(False, name='sparse'), subtest(True, name='masked')])
+    @parametrize("fast_mode", [subtest(False, name='slow'), subtest(True, name='fast')])
+    def test_gradcheck_mm(self, layout, dtype, device, masked, fast_mode):
+        # This function does not check the following cases:
+        # - batch or hybrid tensors because addmm does not support
+        #   such inputs yet
+        # - check_forward_ad=True because of the lack of sparse tensor
+        #   support in aten::view_as_real, torch._VF._make_dual, etc.
+
+        ref_x = torch.tensor([[1, 2, 0, 0],
+                              [0, 6, 0, 0],
+                              [0, 0, 0, 0],
+                              [13, 14, 0, 15]], dtype=dtype, device=device)
+        ref_y = torch.tensor([[11, 12, 13, 14],
+                              [21, 22, 23, 24],
+                              [31, 32, 33, 34],
+                              [41, 42, 43, 44]],
+                             dtype=dtype, device=device)
+
+        mm = torch.sparse.mm if masked else torch.mm
+
+        blocksize = (2, 2) if layout in {torch.sparse_bsr, torch.sparse_bsc} else None
+        x = ref_x.to_sparse(layout=layout, blocksize=blocksize).requires_grad_(True)
+        y = ref_y.requires_grad_(True)
+
+        if layout is torch.sparse_bsr and not masked or layout is torch.sparse_bsc:
+            with self.assertRaisesRegex(
+                    RuntimeError,
+                    r"addmm: computation on (CPU|CUDA) is not implemented for Strided \+ Sparse(Bsr|Bsc) @ Strided"):
+                torch.autograd.gradcheck(mm, (x, y), check_sparse_nnz=True, fast_mode=fast_mode, masked=masked)
+            self.skipTest('NOT IMPL')
+        elif layout in {torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc} and masked:
+            with self.assertRaisesRegex(
+                    RuntimeError,
+                    r"(sparse_addmm_sparse_backward: unsupported combination of layouts,"
+                    r" grad: Strided, mat1: Sparse(Csc|Bsr|Bsc), mat2: Strided"
+                    r"|addmm: computation on (CPU|CUDA) is not implemented for "
+                    r"Strided \+ Sparse(Csc|Bsr|Bsc) @ Strided without MKL)"):
+                torch.autograd.gradcheck(mm, (x, y), check_sparse_nnz=True, fast_mode=fast_mode, masked=masked)
+            self.skipTest('NOT IMPL')
+        else:
+            if masked:
+                r = torch.autograd.gradcheck(mm, (x, y), check_sparse_nnz=True, fast_mode=fast_mode, masked=masked)
+            else:
+                # Specifying check_sparse_nnz is unnecessary in
+                # non-masked/sparse semantics
+                r = torch.autograd.gradcheck(mm, (x, y), fast_mode=fast_mode, masked=masked)
+            self.assertTrue(r)
+
+
 # e.g., TestSparseUnaryUfuncsCPU and TestSparseUnaryUfuncsCUDA
 instantiate_device_type_tests(TestSparseUnaryUfuncs, globals(), except_for='meta')
 
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index eb0270058ea1..3e38ce6f7bd0 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -15,7 +15,7 @@
      precisionOverride, skipMeta, skipCUDAIf, skipCUDAIfRocm, skipCPUIfNoMklSparse, skipCUDAIfRocmVersionLessThan)
 from torch.testing._internal.common_methods_invocations import \
     (op_db, sparse_csr_unary_ufuncs, ReductionOpInfo)
-from torch.testing._internal.common_cuda import _get_torch_cuda_version, CUDA11OrLater, TEST_CUDA
+from torch.testing._internal.common_cuda import _get_torch_cuda_version, TEST_CUDA
 from torch.testing._internal.common_dtype import (
     floating_types, all_types_and_complex_and, floating_and_complex_types, floating_types_and,
     all_types_and_complex, floating_and_complex_types_and
@@ -1140,13 +1140,13 @@ def _get_compressed_plain_inds(t):
             # error on a strided
             a_strided = a.to_dense()
             with self.assertRaisesRegex(
-                    RuntimeError, r'"resize_as_sparse_compressed_: src " expected sparse compressed tensor layout'):
+                    RuntimeError, r'resize_as_sparse_compressed_: src  expected sparse compressed tensor layout'):
                 b.resize_as_sparse_(a_strided)
 
             # error on b strided
             b_strided = b.to_dense()
             with self.assertRaisesRegex(
-                    RuntimeError, r'"resize_as_sparse_compressed_: self " expected sparse compressed tensor layout'):
+                    RuntimeError, r'resize_as_sparse_compressed_: self  expected sparse compressed tensor layout'):
                 b_strided.resize_as_sparse_(a)
 
             # error if layout does not match, transpose induces layout flip
@@ -1315,8 +1315,6 @@ def test_csr_to_block_csr_errors(self, device, dtype):
             nnz = 15
             t = self.genSparseCSRTensor((16, 16), nnz, dtype=dtype,
                                         device=device, index_dtype=index_dtype)
-            with self.assertRaisesRegex(RuntimeError, "must be square."):
-                block_t = t.to_sparse_bsr((2, 3))
 
             with self.assertRaisesRegex(RuntimeError, r"size \(16, 16\) with block size \(5, 5\)"):
                 block_t = t.to_sparse_bsr((5, 5))
@@ -1363,7 +1361,6 @@ def test_csr_matvec(self, device, dtype):
                 csr.matmul(bad_vec)
 
     @onlyCUDA
-    @unittest.skipIf(not (CUDA11OrLater or TEST_WITH_ROCM), "Only CUDA 11+ is supported")
     # hmm, the test passes ok on CUDA when Rocm is not available:
     @skipCUDAIfRocmVersionLessThan((5, 2))
     @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128)
@@ -1406,7 +1403,7 @@ def run_test(c, a, a_batched, b, op_b=False, op_out=False, *, dtype=None, device
                     run_test(c, a, a_batched, b, op_b, op_out, dtype=dtype, device=device)
 
     @onlyCUDA
-    @unittest.skipIf(not CUDA11OrLater, "Only CUDA 11+ is supported")
+    @unittest.skipIf(TEST_WITH_ROCM, "Only CUDA 11+ is supported")
     @skipCUDAIfNoSparseGeneric
     @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128)
     def test_bmm(self, device, dtype):
@@ -1679,7 +1676,7 @@ def run_test(a, b, upper, transpose, unitriangular, op_out):
                 run_test(a, b, upper, unitriangular, transpose, op_out)
 
     @skipCPUIfNoMklSparse
-    @unittest.skipIf(not CUDA11OrLater, "Only CUDA 11+ is supported")
+    @unittest.skipIf(TEST_WITH_ROCM, "Only CUDA 11+ is supported")
     @dtypes(torch.double)
     def test_mm(self, device, dtype):
         def test_shape(di, dj, dk, nnz0=None, nnz1=None):
@@ -2144,12 +2141,23 @@ def run_test(m, n, index_dtype):
             S1 = self.genSparseCSRTensor([m, n], nnz1, dtype=dtype, device=device, index_dtype=index_dtype)
             S2 = self.genSparseCSRTensor([m, n], nnz2, dtype=dtype, device=device, index_dtype=index_dtype)
             S3 = self.genSparseCSRTensor([m, n], nnz3, dtype=dtype, device=device, index_dtype=index_dtype)
-
-            expected = torch.add(S1.to_dense(), S2.to_dense(), alpha=alpha)
-            actual = torch.add(S1, S2, alpha=alpha, out=S3)
-
-            self.assertEqual(actual.to_dense(), expected)
-            self.assertEqual(S3.to_dense(), expected)
+            sparse_args = [S1, S2, S3]
+            dense_args = [t.to_dense() for t in sparse_args]
+            arg_idx = list(range(len(sparse_args)))
+            out_idx = arg_idx + [None]
+
+            for idx1, idx2, idx3 in itertools.product(arg_idx, arg_idx, out_idx):
+                s1 = sparse_args[idx1]
+                s2 = sparse_args[idx2]
+                s3 = None if idx3 is None else sparse_args[idx3]
+                d1 = dense_args[idx1]
+                d2 = dense_args[idx2]
+                d3 = None if idx3 is None else dense_args[idx3]
+
+                expected = torch.add(d1, d2, alpha=alpha, out=d3)
+                actual = torch.add(s1, s2, alpha=alpha, out=s3)
+                self.assertEqual(actual, expected)
+                self.assertEqual(s3, d3)
 
         for index_dtype in [torch.int32, torch.int64]:
             for m, n in itertools.product([3, 5], [3, 5]):
@@ -2177,7 +2185,22 @@ def run_test(index_type):
     def test_sparse_triangular_solve(self, device, dtype):
 
         def run_test(n, k, upper, unitriangular, transpose, zero):
-            triangle_function = torch.triu if upper else torch.tril
+            if not unitriangular:
+                triangle_function = torch.triu if upper else torch.tril
+            else:
+                # Make sure diagonal elements are not materialized.
+                # This is to exercise `unitriangular=True` not relying on
+                # explicit presence of these indices.
+                if upper:
+                    def remove_diagonal(t):
+                        return t.triu(-1)
+
+                else:
+                    def remove_diagonal(t):
+                        return t.tril(-1)
+
+                triangle_function = remove_diagonal
+
             make_A = torch.zeros if zero else make_tensor
             A = make_A((n, n), dtype=dtype, device=device)
             A = triangle_function(A)
@@ -2382,6 +2405,100 @@ def test_sampled_addmm_errors(self, device, dtype):
         with self.assertRaisesRegex(RuntimeError, r"Expected mat2 to have strided layout"):
             torch.sparse.sampled_addmm(a_sparse, a, a_sparse)
 
+    @onlyCPU
+    @dtypes(torch.float32, torch.float64, torch.bfloat16)
+    def test_sparse_mm_reduce_sum(self, device, dtype):
+        def run_test(m, n, k, nnz, train):
+            sparse = self.genSparseCSRTensor((m, k), nnz, dtype=dtype, device=device, index_dtype=torch.int64)
+            dense = sparse.to_dense()
+
+            mat = torch.randn(k, n, dtype=dtype)
+            ref_mat = mat.clone()
+
+            if train:
+                sparse.requires_grad_()
+                mat.requires_grad_()
+                dense.requires_grad_()
+                ref_mat.requires_grad_()
+
+            ref_out = torch.mm(dense, ref_mat)
+            out = torch.sparse.mm(sparse, mat, 'sum')
+
+            self.assertEqual(out, ref_out)
+
+            if train:
+                ref_out.sum().backward()
+                out.sum().backward()
+
+                grad_input = sparse.grad
+                ref_grad_input = dense.grad
+                grad_mat = mat.grad
+                ref_grad_mat = ref_mat.grad
+
+                self.assertEqual(grad_input.to_dense(), ref_grad_input)
+                self.assertEqual(grad_mat, ref_grad_mat)
+
+        run_test(4, 5, 4, 10, False)
+        run_test(4, 4, 4, 16, True)
+
+    @onlyCPU
+    @dtypes(torch.float32, torch.float64, torch.bfloat16)
+    def test_sparse_mm_reduce(self, device, dtype):
+        def run_test(m, n, k, nnz, reduce_type, index_dtype, train):
+            csr = self.genSparseCSRTensor((m, n), nnz, dtype=dtype, device=device, index_dtype=index_dtype)
+            mat = torch.randn(n, k, dtype=dtype)
+            ref_mat = mat.clone()
+            ref_values = csr.values().clone()
+
+            out_int32 = index_dtype == torch.int32
+            coo_indices = torch._convert_indices_from_csr_to_coo(
+                csr.crow_indices(),
+                csr.col_indices(),
+                out_int32=out_int32)
+            row, col = coo_indices[0], coo_indices[1]
+
+            def ref(row, col, val, mat):
+                out = torch.zeros([m, k], dtype=dtype)
+                weight = mat.index_select(0, col)
+                src = weight.mul(val.view(-1, 1))
+                index = row.view(-1, 1).expand_as(weight)
+                index = index.to(dtype=torch.int64)
+                # scatter_reduce expect index to be int64
+                out.scatter_reduce_(0, index, src, reduce=reduce_type, include_self=False)
+                return out
+
+            if train:
+                csr.requires_grad_()
+                mat.requires_grad_()
+                ref_values.requires_grad_()
+                ref_mat.requires_grad_()
+
+            ref_out = ref(row, col, ref_values, ref_mat)
+            out = torch.sparse.mm(csr, mat, reduce_type)
+            self.assertEqual(out, ref_out)
+
+            if train and dtype is not torch.bfloat16:
+                ref_out.sum().backward()
+                out.sum().backward()
+
+                grad_values = csr.grad.values()
+                grad_weight = mat.grad
+                ref_grad_values = ref_values.grad
+                ref_grad_weight = ref_mat.grad
+                self.assertEqual(grad_values, ref_grad_values)
+                self.assertEqual(grad_weight, ref_grad_weight)
+
+        for train in [False, True]:
+            for index_dtype in [torch.int32, torch.int64]:
+                for reduce_type in ["sum", "mean", "amax", "amin"]:
+                    # by setting nnz < M, create empty rows
+                    run_test(3, 4, 11, 1, reduce_type, index_dtype, train)
+                    run_test(3, 4, 11, 6, reduce_type, index_dtype, train)
+                    run_test(3, 4, 11, 12, reduce_type, index_dtype, train)
+                    # we are doing blocking with 4x vector length in the kernel,
+                    # so need to test when K > 4x vector length
+                    run_test(4, 7, 33, 13, reduce_type, index_dtype, train)
+
     @skipMeta
     @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
     def test_coo_csr_conversion(self, device, dtype):
@@ -2858,10 +2975,13 @@ def test_compressed_layout_conversions_coverage(self, device, from_layout, to_la
             frozenset({torch.sparse_csc}),
             frozenset({torch.sparse_csr}),
             frozenset({torch.sparse_csc, torch.sparse_csr}),
+            frozenset({torch.sparse_csc, torch.sparse_bsc}),
+            frozenset({torch.sparse_csc, torch.sparse_bsr}),
+            frozenset({torch.sparse_csr, torch.sparse_bsc}),
+            frozenset({torch.sparse_csr, torch.sparse_bsr}),
             frozenset({torch.sparse_bsc}),
             frozenset({torch.sparse_bsr}),
             frozenset({torch.sparse_bsc, torch.sparse_bsr}),
-            frozenset({torch.sparse_csr, torch.sparse_bsr}),
         }
         block_layouts = (torch.sparse_bsr, torch.sparse_bsc)
 
@@ -2873,10 +2993,31 @@ def _to_from_layout(layout_a, layout_b, a):
             # BSR -> CSR is not yet supported
             if (layout_a, layout_b) == (torch.sparse_bsr, torch.sparse_csr):
                 expect_error = True
+            # BSR -> CSC is not yet supported
+            if (layout_a, layout_b) == (torch.sparse_bsr, torch.sparse_csc):
+                expect_error = True
+            # BSC -> CSR is not yet supported
+            if (layout_a, layout_b) == (torch.sparse_bsc, torch.sparse_csr):
+                expect_error = True
+            # BSC -> CSC is not yet supported
+            if (layout_a, layout_b) == (torch.sparse_bsc, torch.sparse_csc):
+                expect_error = True
             # CSR -> BSR only works for non-batched inputs
             if (layout_a, layout_b) == (torch.sparse_csr, torch.sparse_bsr):
                 if a.dim() > 2:
                     expect_error = True
+            # CSR -> BSC only works for non-batched inputs
+            if (layout_a, layout_b) == (torch.sparse_csr, torch.sparse_bsc):
+                if a.dim() > 2:
+                    expect_error = True
+            # CSC -> BSR only works for non-batched inputs
+            if (layout_a, layout_b) == (torch.sparse_csc, torch.sparse_bsr):
+                if a.dim() > 2:
+                    expect_error = True
+            # CSC -> BSC only works for non-batched inputs
+            if (layout_a, layout_b) == (torch.sparse_csc, torch.sparse_bsc):
+                if a.dim() > 2:
+                    expect_error = True
 
             blocksize_a = (1, 1) if layout_a in {torch.sparse_bsr, torch.sparse_bsc} else None
             blocksize_b = (1, 1) if layout_b in {torch.sparse_bsr, torch.sparse_bsc} else None
diff --git a/test/test_stateless.py b/test/test_stateless.py
index eaec5f9af364..9b431beb5336 100644
--- a/test/test_stateless.py
+++ b/test/test_stateless.py
@@ -1,12 +1,13 @@
 # Owner(s): ["module: nn"]
 
-import unittest
-import sys
+import contextlib
 import os
+import re
 import subprocess
+import sys
+import unittest
 
 import torch
-
 import torch.nn.utils.stateless as stateless
 from torch.testing._internal.common_cuda import TEST_MULTIGPU
 from torch.testing._internal.common_utils import run_tests, TestCase, parametrize, instantiate_parametrized_tests, \
@@ -18,10 +19,12 @@ def __init__(self):
         super().__init__()
         self.l1 = torch.nn.Linear(1, 1)
         self.register_buffer('buffer', torch.ones(1))
+        self.foo = 0.0
 
     def forward(self, x):
         return self.l1(x) + self.buffer
 
+
 class MockTiedModule(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -65,6 +68,29 @@ def _run_call_with_mock_module(self, module, functional_call, device='cpu', pref
         self.assertEqual(cur_weight, prev_weight)
         self.assertEqual(cur_buffer, prev_buffer)
 
+    @contextlib.contextmanager
+    def _ensure_module_unchanged(self, module, message):
+        orig_parameters, orig_buffers = tuple(module.parameters()), tuple(module.buffers())
+        orig_tensors = orig_parameters + orig_buffers
+        orig_tensors_values = tuple(t.clone() for t in orig_tensors)
+        try:
+            yield module
+        finally:
+            parameters, buffers = tuple(module.parameters()), tuple(module.buffers())
+            self.assertTrue(
+                len(parameters) == len(orig_parameters)
+                and len(buffers) == len(orig_buffers)
+                and all(
+                    t1 is t2 and torch.allclose(t1, t3)
+                    for t1, t2, t3 in zip(
+                        orig_tensors,
+                        parameters + buffers,
+                        orig_tensors_values,
+                    )
+                ),
+                message,
+            )
+
     @parametrize("functional_call", [
         subtest(torch.func.functional_call, "torch_func"),
         subtest(stateless.functional_call, "stateless")
@@ -201,7 +227,7 @@ def test_reparametrized_module_change_parametrization_original(self, functional_
         subtest(torch.func.functional_call, "torch_func"),
         subtest(stateless.functional_call, "stateless")
     ])
-    def test_reparamertize_module_fail_reset_to_original(self, functional_call):
+    def test_reparametrize_module_fail_reset_to_original(self, functional_call):
         module = MockModule()
         torch.nn.utils.parametrizations.spectral_norm(module.l1)
         self.assertTrue('l1.parametrizations.weight.original' in dict(module.named_parameters()))
@@ -220,6 +246,161 @@ def test_reparamertize_module_fail_reset_to_original(self, functional_call):
         self.assertTrue('l1.parametrizations.weight.original' in dict(module.named_parameters()))
         self.assertEqual(orig_sn_weight, module.l1.weight)
 
+    @parametrize("functional_call", [
+        subtest(torch.func.functional_call, "torch_func"),
+        subtest(stateless.functional_call, "stateless")
+    ])
+    def test_reparametrize_some_weights(self, functional_call):
+        module = MockModule()
+        weight = torch.tensor([[2.0]])
+        bias = torch.tensor([5.0])
+        buffer = torch.tensor([3.0])
+        extra = torch.tensor([1.0])
+
+        parameters = {'l1.weight': weight}
+        x = torch.randn(1, 1)
+        out = functional_call(module, parameters, x)
+        self.assertEqual(out, x * weight + module.l1.bias + module.buffer)
+
+        parameters = {'l1.weight': weight,
+                      'extra': extra}
+        x = torch.randn(1, 1)
+        out = functional_call(module, parameters, x)
+        self.assertEqual(out, x * weight + module.l1.bias + module.buffer)
+
+    @parametrize("functional_call", [
+        subtest(torch.func.functional_call, "torch_func"),
+        subtest(stateless.functional_call, "stateless")
+    ])
+    def test_reparametrize_strict(self, functional_call):
+        module = MockModule()
+        weight = torch.tensor([[2.0]])
+        bias = torch.tensor([5.0])
+        buffer = torch.tensor([3.0])
+        extra = torch.tensor([1.0])
+
+        # All weights no error
+        parameters = {'l1.weight': weight,
+                      'l1.bias': bias,
+                      'buffer': buffer}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a successful call',
+        ):
+            out = functional_call(module, parameters, x, strict=True)
+            self.assertEqual(out, x * weight + bias + buffer)
+
+        # Some weights
+        parameters = {'l1.weight': weight}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a failed call',
+        ):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                re.escape("Missing key(s): 'buffer', 'l1.bias'."),
+            ):
+                out = functional_call(module, parameters, x, strict=True)
+
+        # Extra keys
+        parameters = {'l1.weight': weight,
+                      'l1.bias': bias,
+                      'buffer': buffer,
+                      'extra': extra}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a failed call',
+        ):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                re.escape("Unexpected key(s): 'extra'."),
+            ):
+                out = functional_call(module, parameters, x, strict=True)
+
+        # Some weights with extra keys
+        parameters = {'l1.weight': weight,
+                      'extra': extra}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a failed call',
+        ):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                re.escape("Unexpected key(s): 'extra'.") + r'\s+' + re.escape("Missing key(s): 'buffer', 'l1.bias'."),
+            ):
+                out = functional_call(module, parameters, x, strict=True)
+
+    @parametrize("functional_call", [
+        subtest(torch.func.functional_call, "torch_func"),
+        subtest(stateless.functional_call, "stateless")
+    ])
+    def test_reparametrize_special(self, functional_call):
+        class NonTensor:
+            def __repr__(self):
+                return f'<{self.__class__.__name__}>'
+
+        module = MockModule()
+        weight = torch.tensor([[2.0]])
+        bias = torch.tensor([5.0])
+        buffer = torch.tensor([3.0])
+        non_tensor = NonTensor()
+
+        # Set to None
+        parameters = {'l1.weight': weight,
+                      'l1.bias': None,
+                      'buffer': buffer}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a successful call',
+        ):
+            out = functional_call(module, parameters, x)
+            self.assertEqual(out, x * weight + buffer)
+
+        # Set non-tensor
+        parameters = {'l1.weight': non_tensor}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a failed call',
+        ):
+            with self.assertRaisesRegex(
+                TypeError,
+                re.escape("<NonTensor> is not an instance of torch.Tensor"),
+            ):
+                out = functional_call(module, parameters, x)
+
+        # Set non-tensor attribute
+        parameters = {'l1.weight': weight, 'foo': torch.tensor([1.0])}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a failed call',
+        ):
+            with self.assertRaisesRegex(
+                TypeError,
+                re.escape("attribute `foo`: 0.0 is not an instance of torch.Tensor"),
+            ):
+                out = functional_call(module, parameters, x)
+
+        # Set non-exist submodule
+        parameters = {'l1.weight': weight,
+                      'l2.bias': bias}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a failed call',
+        ):
+            with self.assertRaisesRegex(
+                AttributeError,
+                re.escape("MockModule has no attribute `l2`"),
+            ):
+                out = functional_call(module, parameters, x)
+
     @parametrize("functional_call", [
         subtest(torch.func.functional_call, "torch_func"),
         subtest(stateless.functional_call, "stateless")
@@ -233,11 +414,12 @@ def test_tied_weights_warns(self, functional_call):
         subtest(torch.func.functional_call, "torch_func"),
         subtest(stateless.functional_call, "stateless")
     ])
-    def test_reparamertize_tie_weights(self, functional_call):
+    def test_reparametrize_tie_weights(self, functional_call):
         module = MockTiedModule()
-        weight = torch.tensor([[2.0]],)
+        weight = torch.tensor([[2.0]])
         bias = torch.tensor([5.0])
         buffer = torch.tensor([3.0])
+        extra = torch.tensor([1.0])
 
         parameters = {'l1.weight': weight,
                       'l1.bias': bias,
@@ -246,14 +428,21 @@ def test_reparamertize_tie_weights(self, functional_call):
         out = functional_call(module, parameters, x, tie_weights=True)
         self.assertEqual(out, x * weight + bias + bias + buffer + buffer)
 
+        parameters = {'l1.weight': weight,
+                      'l1.bias': bias,
+                      'buffer': buffer,
+                      'extra': extra}
+        x = torch.randn(1, 1)
+        out = functional_call(module, parameters, x, tie_weights=True)
+        self.assertEqual(out, x * weight + bias + bias + buffer + buffer)
 
     @parametrize("functional_call", [
         subtest(torch.func.functional_call, "torch_func"),
         subtest(stateless.functional_call, "stateless")
     ])
-    def test_reparamertize_tie_some_weights(self, functional_call):
+    def test_reparametrize_tie_some_weights(self, functional_call):
         module = MockTiedModule()
-        weight = torch.tensor([[2.0]],)
+        weight = torch.tensor([[2.0]])
         buffer = torch.tensor([3.0])
 
         parameters = {'l1.weight': weight,
@@ -268,7 +457,7 @@ def test_reparamertize_tie_some_weights(self, functional_call):
     ])
     def test_tied_weights_errors(self, functional_call):
         module = MockTiedModule()
-        weight = torch.tensor([[1.0]],)
+        weight = torch.tensor([[1.0]])
         bias = torch.tensor([0.0])
         buffer = torch.tensor([0.0])
 
@@ -285,19 +474,24 @@ def test_tied_weights_errors(self, functional_call):
         del parameters['tied_bias']
         del parameters['tied_buffer']
 
-        with self.assertRaisesRegex(ValueError, "functional_call got values for both (l1.bias|tied_bias)"):
+        with self.assertRaisesRegex(
+            ValueError,
+            re.escape("functional_call got multiple values for keys ['l1.bias', 'tied_bias']"),
+        ):
             parameters['tied_bias'] = torch.tensor([5.0])
             functional_call(module, parameters, x, tie_weights=True)
         del parameters['tied_bias']
 
-        with self.assertRaisesRegex(ValueError, "functional_call got values for both (buffer|tied_buffer)"):
+        with self.assertRaisesRegex(
+            ValueError,
+            re.escape("functional_call got multiple values for keys ['buffer', 'tied_buffer']"),
+        ):
             parameters['tied_buffer'] = torch.tensor([5.0])
             functional_call(module, parameters, x, tie_weights=True)
 
-
     def test_tied_weights_no_error_without_flag(self):
         module = MockTiedModule()
-        weight = torch.tensor([[1.0]],)
+        weight = torch.tensor([[1.0]])
         bias = torch.tensor([0.0])
         buffer = torch.tensor([0.0])
 
@@ -312,6 +506,105 @@ def test_tied_weights_no_error_without_flag(self):
         parameters['tied_buffer'] = torch.tensor([5.0])
         self.assertNotWarn(lambda: stateless._functional_call(module, parameters, x, tie_weights=False))
 
+    @parametrize("functional_call", [
+        subtest(torch.func.functional_call, "torch_func"),
+        subtest(stateless.functional_call, "stateless")
+    ])
+    def test_reparametrize_tie_weights_strict(self, functional_call):
+        module = MockTiedModule()
+        weight = torch.tensor([[2.0]])
+        bias = torch.tensor([5.0])
+        buffer = torch.tensor([3.0])
+        extra = torch.tensor([1.0])
+
+        # Tie weights no error
+        parameters = {'l1.weight': weight,
+                      'l1.bias': bias,
+                      'buffer': buffer}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a successful call',
+        ):
+            out = functional_call(module, parameters, x, tie_weights=True, strict=True)
+            self.assertEqual(out, x * weight + bias + bias + buffer + buffer)
+
+        # Tie weights without flag
+        parameters = {'l1.weight': weight,
+                      'l1.bias': bias,
+                      'buffer': buffer}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a failed call',
+        ):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                re.escape("Missing key(s): 'tied_bias', 'tied_buffer'."),
+            ):
+                out = functional_call(module, parameters, x, tie_weights=False, strict=True)
+
+        # Tie some weights
+        parameters = {'l1.weight': weight,
+                      'buffer': buffer}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a failed call',
+        ):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                re.escape("Missing key(s): 'l1.bias', 'tied_bias'."),
+            ):
+                out = stateless.functional_call(module, parameters, x, tie_weights=True, strict=True)
+
+        # Tie weights with extra keys
+        parameters = {'l1.weight': weight,
+                      'l1.bias': bias,
+                      'buffer': buffer,
+                      'extra': extra}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a failed call',
+        ):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                re.escape("Unexpected key(s): 'extra'."),
+            ):
+                out = stateless.functional_call(module, parameters, x, tie_weights=True, strict=True)
+
+        # Tie weights with extra keys and without flag
+        parameters = {'l1.weight': weight,
+                      'l1.bias': bias,
+                      'buffer': buffer,
+                      'extra': extra}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a failed call',
+        ):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                re.escape("Unexpected key(s): 'extra'.") + r'\s+' + re.escape("Missing key(s): 'tied_bias', 'tied_buffer'."),
+            ):
+                out = stateless.functional_call(module, parameters, x, tie_weights=False, strict=True)
+
+        # Tie some weights with extra keys
+        parameters = {'l1.weight': weight,
+                      'buffer': buffer,
+                      'extra': extra}
+        x = torch.randn(1, 1)
+        with self._ensure_module_unchanged(
+            module,
+            'the module should not have been modified by a failed call',
+        ):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                re.escape("Unexpected key(s): 'extra'.") + r'\s+' + re.escape("Missing key(s): 'l1.bias', 'tied_bias'."),
+            ):
+                out = stateless.functional_call(module, parameters, x, tie_weights=True, strict=True)
+
     @parametrize("functional_call", [
         subtest(torch.func.functional_call, "torch_func"),
         subtest(stateless.functional_call, "stateless")
@@ -320,17 +613,89 @@ def test_setattr(self, functional_call):
         class Foo(torch.nn.Module):
             def __init__(self):
                 super().__init__()
-                self.register_buffer('foo', torch.zeros(()))
+                self.register_buffer('foo', torch.tensor([0.0]))
 
             def forward(self, x):
                 self.foo = self.foo + 1
                 return x + self.foo
 
-        a = {'foo': torch.zeros(())}
+        foo = torch.tensor([2.0])
+        x = torch.randn(1)
+        a = {'foo': foo}
         mod = Foo()
-        functional_call(mod, a, torch.ones(()))
-        self.assertEqual(mod.foo, torch.zeros(()))
-        self.assertEqual(a['foo'], torch.ones(()))
+        functional_call(mod, a, x)
+        self.assertEqual(mod.foo, torch.tensor([0.0]))
+        self.assertEqual(a['foo'], torch.tensor([3.0]))
+        self.assertEqual(foo, torch.tensor([2.0]))
+        self.assertTrue(a['foo'] is not foo)
+
+    @parametrize("functional_call", [
+        subtest(torch.func.functional_call, "torch_func"),
+        subtest(stateless.functional_call, "stateless")
+    ])
+    def test_in_place_operator(self, functional_call):
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.register_buffer('foo', torch.tensor([0.0]))
+
+            def forward(self, x):
+                self.foo.add_(1)
+                return x + self.foo
+
+        foo = torch.tensor([2.0])
+        x = torch.randn(1)
+        a = {'foo': foo}
+        mod = Foo()
+        functional_call(mod, a, x)
+        self.assertEqual(mod.foo, torch.tensor([0.0]))
+        self.assertEqual(a['foo'], torch.tensor([3.0]))
+        self.assertEqual(foo, torch.tensor([3.0]))
+        self.assertTrue(a['foo'] is foo)
+
+    @parametrize("functional_call", [
+        subtest(torch.func.functional_call, "torch_func"),
+        subtest(stateless.functional_call, "stateless")
+    ])
+    def test_setattr_strict(self, functional_call):
+        class Bar(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                assert not hasattr(self, 'extra')
+
+            def forward(self, x):
+                return x + self.extra
+
+        a = {'extra': torch.zeros(())}
+        mod = Bar()
+        self.assertTrue(not hasattr(mod, 'extra'))
+        out = functional_call(mod, a, torch.ones(()))
+        self.assertEqual(out, torch.ones(()))
+        self.assertTrue(not hasattr(mod, 'extra'))
+
+        a = {'extra': torch.zeros(())}
+        with self.assertRaisesRegex(
+            RuntimeError,
+            re.escape("Unexpected key(s): 'extra'."),
+        ):
+            out = functional_call(mod, a, torch.ones(()), strict=True)
+        self.assertTrue(not hasattr(mod, 'extra'))
+
+        a = {}
+        with self.assertRaisesRegex(
+            AttributeError,
+            re.escape("'Bar' object has no attribute 'extra'"),
+        ):
+            out = functional_call(mod, a, torch.ones(()))
+        self.assertTrue(not hasattr(mod, 'extra'))
+
+        a = {}
+        with self.assertRaisesRegex(
+            AttributeError,
+            re.escape("'Bar' object has no attribute 'extra'"),
+        ):
+            out = functional_call(mod, a, torch.ones(()), strict=True)
+        self.assertTrue(not hasattr(mod, 'extra'))
 
     @parametrize("functional_call", [
         subtest(torch.func.functional_call, "torch_func"),
@@ -355,7 +720,6 @@ def forward(self, inp, *, other_inp):
         res_1 = functional_call(mod, a, (), {'inp': inp, 'other_inp': other_inp})
         self.assertEqual(res, res_1)
 
-
     def test_functional_call_tuple_dicts(self):
         mod = MockModule()
         x = torch.rand((1, 1))
@@ -375,15 +739,121 @@ def test_functional_call_tuple_dicts(self):
         res = torch.func.functional_call(mod, a, x)
         self.assertEqual(res, x + 1)
 
-
     def test_functional_call_multiple_dicts_error(self):
         mod = MockModule()
         x = torch.rand((1, 1))
         parameters = {'l1.weight': torch.zeros((1, 1)), 'l1.bias': torch.zeros((1, 1))}
         repeated_parameters = {'l1.weight': torch.ones((1, 1))}
-        with self.assertRaisesRegex(ValueError, "l1.weight appeared in multiple dictionaries"):
+        with self.assertRaisesRegex(
+            ValueError,
+            re.escape("['l1.weight'] appeared in multiple dictionaries"),
+        ):
             torch.func.functional_call(mod, (parameters, repeated_parameters), x)
 
+    @parametrize("functional_call", [
+        subtest(torch.func.functional_call, "torch_func"),
+        subtest(stateless.functional_call, "stateless")
+    ])
+    def test_functional_call_member_reference(self, functional_call):
+        class Module(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.l1 = torch.nn.Linear(1, 1)
+                self.register_buffer('buffer', torch.ones(1))
+
+            def forward(self, x):
+                parameters = tuple(self.parameters())
+                buffers = tuple(self.buffers())
+                return self.l1(x) + self.buffer, parameters, buffers
+
+        module = Module()
+        weight = torch.tensor([[2.0]])
+        bias = torch.tensor([5.0])
+        buffer = torch.tensor([3.0])
+        extra = torch.tensor([1.0])
+        extra_p = torch.nn.Parameter(extra)
+
+        # All weights
+        parameters = {'l1.weight': weight,
+                      'l1.bias': bias,
+                      'buffer': buffer}
+        x = torch.randn(1, 1)
+        out, parameters, buffers = functional_call(module, parameters, x)
+        self.assertEqual(out, x * weight + bias + buffer)
+        self.assertEqual(parameters, (weight, bias))
+        self.assertEqual(buffers, (buffer,))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(parameters, (weight, bias))))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(buffers, (buffer,))))
+
+        # Some weights
+        parameters = {'l1.weight': weight}
+        x = torch.randn(1, 1)
+        out, parameters, buffers = functional_call(module, parameters, x)
+        self.assertEqual(out, x * weight + module.l1.bias + module.buffer)
+        self.assertEqual(parameters, (weight, module.l1.bias))
+        self.assertEqual(buffers, (module.buffer,))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(parameters, (weight, module.l1.bias))))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(buffers, (module.buffer,))))
+
+        # All weights with extra keys
+        parameters = {'l1.weight': weight,
+                      'l1.bias': bias,
+                      'buffer': buffer,
+                      'l1.extra': extra}
+        x = torch.randn(1, 1)
+        out, parameters, buffers = functional_call(module, parameters, x)
+        self.assertEqual(out, x * weight + bias + buffer)
+        self.assertEqual(parameters, (weight, bias))
+        self.assertEqual(buffers, (buffer,))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(parameters, (weight, bias))))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(buffers, (buffer,))))
+
+        # All weights with extra keys with parameters
+        parameters = {'l1.weight': weight,
+                      'l1.bias': bias,
+                      'buffer': buffer,
+                      'l1.extra': extra_p}
+        x = torch.randn(1, 1)
+        out, parameters, buffers = functional_call(module, parameters, x)
+        self.assertEqual(out, x * weight + bias + buffer)
+        self.assertEqual(parameters, (weight, bias, extra_p))
+        self.assertEqual(buffers, (buffer,))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(parameters, (weight, bias, extra_p))))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(buffers, (buffer,))))
+
+        # Some weights with extra keys
+        parameters = {'l1.weight': weight,
+                      'l1.extra': extra}
+        x = torch.randn(1, 1)
+        out, parameters, buffers = functional_call(module, parameters, x)
+        self.assertEqual(out, x * weight + module.l1.bias + module.buffer)
+        self.assertEqual(parameters, (weight, module.l1.bias))
+        self.assertEqual(buffers, (module.buffer))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(parameters, (weight, module.l1.bias))))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(buffers, (module.buffer,))))
+
+        # Some weights with extra keys with parameters
+        parameters = {'l1.weight': weight,
+                      'l1.extra': extra_p}
+        x = torch.randn(1, 1)
+        out, parameters, buffers = functional_call(module, parameters, x)
+        self.assertEqual(out, x * weight + module.l1.bias + module.buffer)
+        self.assertEqual(parameters, (weight, module.l1.bias, extra_p))
+        self.assertEqual(buffers, (module.buffer))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(parameters, (weight, module.l1.bias, extra_p))))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(buffers, (module.buffer,))))
+
+        # Set None
+        parameters = {'l1.weight': weight,
+                      'l1.bias': None}
+        x = torch.randn(1, 1)
+        out, parameters, buffers = functional_call(module, parameters, x)
+        self.assertEqual(out, x * weight + module.buffer)
+        self.assertEqual(parameters, (weight,))
+        self.assertEqual(buffers, (module.buffer))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(parameters, (weight,))))
+        self.assertTrue(all(t1 is t2 for t1, t2 in zip(buffers, (module.buffer,))))
+
 
 class TestStatelessDeprecation(TestCase):
     def test_private_stateless_warns(self):
diff --git a/test/test_static_runtime.py b/test/test_static_runtime.py
index b3087eee18e0..032e67764072 100644
--- a/test/test_static_runtime.py
+++ b/test/test_static_runtime.py
@@ -178,7 +178,7 @@ def output_graph(a, b, c, iters: int):
 
 class SubModule(nn.Module):
     def __init__(self):
-        super(SubModule, self).__init__()
+        super().__init__()
         self.a = 11
         self.b = 2
 
@@ -188,7 +188,7 @@ def forward(self, x):
 
 class SubModule2(nn.Module):
     def __init__(self):
-        super(SubModule2, self).__init__()
+        super().__init__()
         self.a = 12
         self.b = 2
 
@@ -199,7 +199,7 @@ def forward(self, x):
 
 class TestModule(nn.Module):
     def __init__(self):
-        super(TestModule, self).__init__()
+        super().__init__()
         self.sub1 = SubModule()
         self.sub2 = SubModule2()
         self.a = 3
diff --git a/test/test_sympy_utils.py b/test/test_sympy_utils.py
new file mode 100644
index 000000000000..03b0bc99dba8
--- /dev/null
+++ b/test/test_sympy_utils.py
@@ -0,0 +1,243 @@
+# -*- coding: utf-8 -*-
+# Owner(s): ["oncall: pt2"]
+
+import itertools
+
+import sympy
+from torch.testing._internal.common_utils import (
+    instantiate_parametrized_tests,
+    parametrize,
+    run_tests,
+    TestCase,
+)
+from torch.utils._sympy.value_ranges import ValueRangeAnalysis, ValueRanges
+from torch.utils._sympy.reference import ReferenceAnalysis
+from torch.utils._sympy.interp import sympy_interp
+
+
+UNARY_OPS = [
+    "reciprocal",
+    "square",
+    "abs",
+    "neg",
+    "exp",
+    "log",
+    "sqrt",
+    "floor",
+    "ceil",
+]
+BINARY_OPS = ["truediv", "div", "add", "mul", "sub", "pow", "minimum", "maximum", "mod"]
+
+UNARY_BOOL_OPS = ["not_"]
+BINARY_BOOL_OPS = ["or_", "and_"]
+COMPARE_OPS = ["eq", "ne", "lt", "gt", "le", "ge"]
+
+# a mix of constants, powers of two, primes
+CONSTANTS = [
+    -1,
+    0,
+    1,
+    2,
+    3,
+    4,
+    5,
+    8,
+    16,
+    32,
+    64,
+    100,
+    101,
+    2**24,
+    2**32,
+    2**37 - 1,
+]
+# less constants for N^2 situations
+LESS_CONSTANTS = [-1, 0, 1, 2, 100]
+
+
+def valid_unary(fn, v):
+    if fn == "log" and v <= 0:
+        return False
+    elif fn == "reciprocal" and v == 0:
+        return False
+    elif fn == "sqrt" and v < 0:
+        return False
+    return True
+
+
+def valid_binary(fn, a, b):
+    if fn == "pow" and (
+        b > 4
+        or (  # sympy will expand to x*x*... for integral b; don't do it if it's big
+            a <= 0 and b == -1
+        )
+        or (a == b == 0)  # no imaginary numbers  # 0**0 is undefined
+    ):
+        return False
+    elif fn == "mod" and b == 0:
+        return False
+    elif (fn == "div" or fn == "truediv") and b == 0:
+        return False
+    return True
+
+
+def generate_range(vals):
+    for a1, a2 in itertools.product(vals, repeat=2):
+        if a1 in [sympy.true, sympy.false]:
+            if a1 == sympy.true and a2 == sympy.false:
+                continue
+        else:
+            if a1 > a2:
+                continue
+        # ranges that only admit infinite values are not interesting
+        if a1 == sympy.oo or a2 == -sympy.oo:
+            continue
+        yield ValueRanges(a1, a2)
+
+
+class TestValueRanges(TestCase):
+    @parametrize("fn", UNARY_OPS)
+    def test_unary_ref(self, fn):
+        for v in CONSTANTS:
+            if not valid_unary(fn, v):
+                continue
+            with self.subTest(v=v):
+                ref_r = getattr(ReferenceAnalysis, fn)(sympy.Integer(v))
+                r = getattr(ValueRangeAnalysis, fn)(ValueRanges.wrap(v))
+                self.assertEqual(r.lower, r.upper)
+                self.assertEqual(ref_r, r.lower)
+
+    def test_pow_half(self):
+        ValueRangeAnalysis.pow(ValueRanges.unknown(), ValueRanges.wrap(0.5))
+
+    @parametrize("fn", BINARY_OPS)
+    def test_binary_ref(self, fn):
+        for a, b in itertools.product(CONSTANTS, repeat=2):
+            if not valid_binary(fn, a, b):
+                continue
+            with self.subTest(a=a, b=b):
+                ref_r = getattr(ReferenceAnalysis, fn)(
+                    sympy.Integer(a), sympy.Integer(b)
+                )
+                r = getattr(ValueRangeAnalysis, fn)(
+                    ValueRanges.wrap(a),
+                    ValueRanges.wrap(b),
+                )
+                self.assertEqual(r.lower, r.upper)
+                self.assertEqual(ref_r, r.lower)
+
+    def test_mul_zero_unknown(self):
+        self.assertEqual(
+            ValueRangeAnalysis.mul(ValueRanges.wrap(0), ValueRanges.unknown()),
+            ValueRanges.wrap(0),
+        )
+
+    @parametrize("fn", UNARY_BOOL_OPS)
+    def test_unary_bool_ref_range(self, fn):
+        vals = [sympy.false, sympy.true]
+        for a in generate_range(vals):
+            with self.subTest(a=a):
+                ref_r = getattr(ValueRangeAnalysis, fn)(a)
+                unique = set()
+                for a0 in vals:
+                    if a0 not in a:
+                        continue
+                    with self.subTest(a0=a0):
+                        r = getattr(ReferenceAnalysis, fn)(a0)
+                        self.assertIn(r, ref_r)
+                        unique.add(r)
+                if ref_r.lower == ref_r.upper:
+                    self.assertEqual(len(unique), 1)
+                else:
+                    self.assertEqual(len(unique), 2)
+
+    @parametrize("fn", BINARY_BOOL_OPS)
+    def test_binary_bool_ref_range(self, fn):
+        vals = [sympy.false, sympy.true]
+        for a, b in itertools.product(generate_range(vals), repeat=2):
+            with self.subTest(a=a, b=b):
+                ref_r = getattr(ValueRangeAnalysis, fn)(a, b)
+                unique = set()
+                for a0, b0 in itertools.product(vals, repeat=2):
+                    if a0 not in a or b0 not in b:
+                        continue
+                    with self.subTest(a0=a0, b0=b0):
+                        r = getattr(ReferenceAnalysis, fn)(a0, b0)
+                        self.assertIn(r, ref_r)
+                        unique.add(r)
+                if ref_r.lower == ref_r.upper:
+                    self.assertEqual(len(unique), 1)
+                else:
+                    self.assertEqual(len(unique), 2)
+
+    @parametrize("fn", UNARY_OPS)
+    def test_unary_ref_range(self, fn):
+        vals = [-sympy.oo, *CONSTANTS, sympy.oo]
+        for a in generate_range(vals):
+            with self.subTest(a=a):
+                ref_r = getattr(ValueRangeAnalysis, fn)(a)
+                for a0 in CONSTANTS:
+                    if a0 not in a:
+                        continue
+                    if not valid_unary(fn, a0):
+                        continue
+                    with self.subTest(a0=a0):
+                        r = getattr(ReferenceAnalysis, fn)(sympy.Integer(a0))
+                        self.assertIn(r, ref_r)
+
+    # This takes about 4s for all the variants
+    @parametrize("fn", BINARY_OPS + COMPARE_OPS)
+    def test_binary_ref_range(self, fn):
+        vals = [-sympy.oo, *LESS_CONSTANTS, sympy.oo]
+        for a, b in itertools.product(generate_range(vals), repeat=2):
+            # don't attempt pow on exponents that are too large (but oo is OK)
+            if fn == "pow" and b.upper > 4 and b.upper != sympy.oo:
+                continue
+            with self.subTest(a=a, b=b):
+                ref_r = getattr(ValueRangeAnalysis, fn)(a, b)
+                for a0, b0 in itertools.product(LESS_CONSTANTS, repeat=2):
+                    if a0 not in a or b0 not in b:
+                        continue
+                    if not valid_binary(fn, a0, b0):
+                        continue
+                    with self.subTest(a0=a0, b0=b0):
+                        r = getattr(ReferenceAnalysis, fn)(
+                            sympy.Integer(a0), sympy.Integer(b0)
+                        )
+                        self.assertIn(r, ref_r)
+
+
+class TestSympyInterp(TestCase):
+    @parametrize("fn", UNARY_OPS + BINARY_OPS + UNARY_BOOL_OPS + BINARY_BOOL_OPS + COMPARE_OPS)
+    def test_interp(self, fn):
+        from sympy.abc import x, y
+        vals = CONSTANTS
+        if fn in {*UNARY_BOOL_OPS, *BINARY_BOOL_OPS}:
+            vals = [True, False]
+        arity = 1
+        if fn in {*BINARY_OPS, *BINARY_BOOL_OPS, *COMPARE_OPS}:
+            arity = 2
+        symbols = [x]
+        if arity == 2:
+            symbols = [x, y]
+        for args in itertools.product(vals, repeat=arity):
+            if arity == 1 and not valid_unary(fn, *args):
+                continue
+            elif arity == 2 and not valid_binary(fn, *args):
+                continue
+            with self.subTest(args=args):
+                sargs = [sympy.sympify(a) for a in args]
+                sympy_expr = getattr(ReferenceAnalysis, fn)(*symbols)
+                ref_r = getattr(ReferenceAnalysis, fn)(*sargs)
+                # Yes, I know this is a longwinded way of saying xreplace; the
+                # point is to test sympy_interp
+                r = sympy_interp(ReferenceAnalysis, dict(zip(symbols, sargs)), sympy_expr)
+                self.assertEqual(ref_r, r)
+
+
+instantiate_parametrized_tests(TestValueRanges)
+instantiate_parametrized_tests(TestSympyInterp)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py
index 67accdecb174..dfc0002ab4ee 100644
--- a/test/test_tensor_creation_ops.py
+++ b/test/test_tensor_creation_ops.py
@@ -14,7 +14,7 @@
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, do_test_empty_full, TEST_WITH_ROCM, suppress_warnings,
     torch_to_numpy_dtype_dict, numpy_to_torch_dtype_dict, slowTest,
-    TEST_SCIPY, IS_MACOS, IS_PPC, IS_WINDOWS, parametrize, skipIfTorchDynamo)
+    TEST_SCIPY, IS_MACOS, IS_PPC, IS_JETSON, IS_WINDOWS, parametrize, skipIfTorchDynamo)
 from torch.testing._internal.common_device_type import (
     expectedFailureMeta, instantiate_device_type_tests, deviceCountAtLeast, onlyNativeDeviceTypes,
     onlyCPU, largeTensorTest, precisionOverride, dtypes,
@@ -953,8 +953,9 @@ def _float_to_int_conversion_helper(self, vals, device, dtype):
     # errors with UBSAN. These casts are deliberate in PyTorch, however, and
     # NumPy has the same behavior.
     @onlyNativeDeviceTypes
-    @unittest.skipIf(IS_MACOS, "Test is broken on MacOS, see https://github.com/pytorch/pytorch/issues/38752")
-    @unittest.skipIf(IS_PPC, "Test is borken on PowerPC, see https://github.com/pytorch/pytorch/issues/39671")
+    @unittest.skipIf(IS_MACOS or IS_JETSON, "Test is broken on MacOS and Jetson, \
+        see https://github.com/pytorch/pytorch/issues/38752")
+    @unittest.skipIf(IS_PPC, "Test is broken on PowerPC, see https://github.com/pytorch/pytorch/issues/39671")
     @dtypes(torch.bool, torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64)
     def test_float_to_int_conversion_finite(self, device, dtype):
         min = torch.finfo(torch.float).min
@@ -1155,7 +1156,7 @@ def test_new_methods_requires_grad(self, device):
     # TODO: update to work on CUDA, too?
     @onlyCPU
     def test_tensor_from_sequence(self, device):
-        class MockSequence(object):
+        class MockSequence:
             def __init__(self, lst):
                 self.lst = lst
 
@@ -1444,14 +1445,14 @@ def test_linlogspace_mem_overlap(self, device):
     def test_ctor_with_numpy_array(self, device):
         correct_dtypes = [
             np.double,
-            np.float,
+            float,
             np.float16,
             np.int64,
             np.int32,
             np.int16,
             np.int8,
             np.uint8,
-            np.bool,
+            bool,
         ]
 
         incorrect_byteorder = '>' if sys.byteorder == 'little' else '<'
@@ -3936,6 +3937,18 @@ def test_astensor_consistency(self, device):
             t = torch.asarray(e)
             self.assertEqual(t, original)
 
+    @onlyCPU
+    def test_numpy_scalars(self, device):
+        scalar = np.float64(0.5)
+
+        with self.assertRaisesRegex(RuntimeError, "can't alias NumPy scalars."):
+            torch.asarray(scalar, copy=False)
+
+        tensor = torch.asarray(scalar)
+        self.assertEqual(tensor.dim(), 0)
+        self.assertEqual(tensor.item(), scalar.item())
+        self.assertEqual(tensor.dtype, torch.float64)
+
 instantiate_device_type_tests(TestTensorCreation, globals())
 instantiate_device_type_tests(TestRandomTensorCreation, globals())
 instantiate_device_type_tests(TestLikeTensorCreation, globals())
diff --git a/test/test_tensorboard.py b/test/test_tensorboard.py
index f69e79cca9ed..5d2ef1ee4dfd 100644
--- a/test/test_tensorboard.py
+++ b/test/test_tensorboard.py
@@ -53,6 +53,7 @@ def tensor_N(shape, dtype=float):
 class BaseTestCase(TestCase):
     """ Base class used for all TensorBoard tests """
     def setUp(self):
+        super().setUp()
         if not TEST_TENSORBOARD:
             return self.skipTest("Skip the test since TensorBoard is not installed")
         if TEST_WITH_CROSSREF:
@@ -67,7 +68,7 @@ def createSummaryWriter(self):
         return SummaryWriter(temp_dir)
 
     def tearDown(self):
-        super(BaseTestCase, self).tearDown()
+        super().tearDown()
         # Remove directories created by SummaryWriter
         for temp_dir in self.temp_dirs:
             if os.path.exists(temp_dir):
@@ -562,7 +563,7 @@ def test_pytorch_graph(self):
 
         class myLinear(torch.nn.Module):
             def __init__(self):
-                super(myLinear, self).__init__()
+                super().__init__()
                 self.l = torch.nn.Linear(3, 5)
 
             def forward(self, x):
@@ -682,7 +683,7 @@ def test_mlp_graph(self):
         # the add_graph call and still continue.
         class myMLP(torch.nn.Module):
             def __init__(self):
-                super(myMLP, self).__init__()
+                super().__init__()
                 self.input_len = 1 * 28 * 28
                 self.fc1 = torch.nn.Linear(self.input_len, 1200)
                 self.fc2 = torch.nn.Linear(1200, 1200)
@@ -806,7 +807,7 @@ def test_caffe2_simple_model(self):
         model = ModelHelper(name="mnist")
         # how come those inputs don't break the forward pass =.=a
         workspace.FeedBlob("data", np.random.randn(1, 3, 64, 64).astype(np.float32))
-        workspace.FeedBlob("label", np.random.randn(1, 1000).astype(np.int))
+        workspace.FeedBlob("label", np.random.randn(1, 1000).astype(int))
 
         with core.NameScope("conv1"):
             conv1 = brew.conv(model, "data", 'conv1', dim_in=1, dim_out=20, kernel=5)
@@ -841,7 +842,7 @@ def test_caffe2_simple_model(self):
     def test_caffe2_simple_cnnmodel(self):
         model = cnn.CNNModelHelper("NCHW", name="overfeat")
         workspace.FeedBlob("data", np.random.randn(1, 3, 64, 64).astype(np.float32))
-        workspace.FeedBlob("label", np.random.randn(1, 1000).astype(np.int))
+        workspace.FeedBlob("label", np.random.randn(1, 1000).astype(int))
         with core.NameScope("conv1"):
             conv1 = model.Conv("data", "conv1", 3, 96, 11, stride=4)
             relu1 = model.Relu(conv1, conv1)
diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py
index cf894f3749eb..d60376f19296 100644
--- a/test/test_tensorexpr.py
+++ b/test/test_tensorexpr.py
@@ -7,7 +7,7 @@
 import unittest
 import itertools
 
-from torch.testing._internal.common_utils import suppress_warnings, num_profiled_runs, run_tests
+from torch.testing._internal.common_utils import suppress_warnings, num_profiled_runs, run_tests, skipIfTorchDynamo
 
 from torch.testing._internal.jit_utils import JitTestCase, TensorExprTestOptions
 
@@ -15,14 +15,14 @@
 
 class BaseTestClass(JitTestCase):
     def setUp(self):
-        super(BaseTestClass, self).setUp()
+        super().setUp()
         self.tensorexpr_options = TensorExprTestOptions()
         self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
         self.dtypes = [torch.float32, torch.bfloat16] if LLVM_ENABLED else [torch.float32]
 
     def tearDown(self):
         self.tensorexpr_options.restore()
-        super(BaseTestClass, self).tearDown()
+        super().tearDown()
 
     def assertLastGraphAllFused(self):
         self.assertAllFused(torch.jit.last_executed_optimized_graph())
@@ -34,6 +34,7 @@ def warmup_and_run_forward(f, *args):
     return results
 
 
+@skipIfTorchDynamo()
 class TestTensorExprFuser(BaseTestClass):
     def test_easy(self):
         def easy(x, y):
@@ -1532,7 +1533,7 @@ def foo(a, b):
     def test_alias_analysis_module(self):
         class AliasModule(nn.Module):
             def __init__(self):
-                super(AliasModule, self).__init__()
+                super().__init__()
                 torch.manual_seed(1337)
                 self.a = torch.randn(128, 128)
                 self.b = torch.randn(128, 128)
@@ -1570,7 +1571,7 @@ def getModule(script):
     def test_alias_analysis_inputs(self):
         class AliasModule(nn.Module):
             def __init__(self):
-                super(AliasModule, self).__init__()
+                super().__init__()
                 torch.manual_seed(1337)
                 self.a = torch.randn(128, 128)
                 self.b = torch.randn(128, 128)
@@ -1603,7 +1604,7 @@ def getModule(script):
     def test_alias_analysis_input_and_module(self):
         class AliasModule(nn.Module):
             def __init__(self):
-                super(AliasModule, self).__init__()
+                super().__init__()
                 torch.manual_seed(1337)
                 self.a = torch.randn(128, 128)
                 self.b = torch.randn(128, 128)
diff --git a/test/test_testing.py b/test/test_testing.py
index 5ca7d9acb650..02fbb930bb55 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -18,7 +18,7 @@
 
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import \
-    (IS_FBCODE, IS_MACOS, IS_SANDCASTLE, IS_WINDOWS, TestCase, run_tests, skipIfRocm, slowTest,
+    (IS_FBCODE, IS_JETSON, IS_MACOS, IS_SANDCASTLE, IS_WINDOWS, TestCase, run_tests, skipIfRocm, slowTest,
      parametrize, subtest, instantiate_parametrized_tests, dtype_name, TEST_WITH_ROCM)
 from torch.testing._internal.common_device_type import \
     (PYTORCH_TESTING_DEVICE_EXCEPT_FOR_KEY, PYTORCH_TESTING_DEVICE_ONLY_FOR_KEY, dtypes,
@@ -1987,13 +1987,14 @@ def test_circular_dependencies(self) -> None:
                            "torch.contrib.",  # something weird
                            "torch.testing._internal.distributed.",  # just fails
                            "torch.ao.pruning._experimental.",  # depends on pytorch_lightning, not user-facing
+                           "torch.onnx._internal.fx",  # depends on onnx-script
                            ]
         # See https://github.com/pytorch/pytorch/issues/77801
         if not sys.version_info >= (3, 9):
             ignored_modules.append("torch.utils.benchmark")
-        if IS_WINDOWS or IS_MACOS:
+        if IS_WINDOWS or IS_MACOS or IS_JETSON:
             # Distributed should be importable on Windows(except nn.api.), but not on Mac
-            if IS_MACOS:
+            if IS_MACOS or IS_JETSON:
                 ignored_modules.append("torch.distributed.")
             else:
                 ignored_modules.append("torch.distributed.nn.api.")
@@ -2032,7 +2033,7 @@ def test_no_warning_on_import(self) -> None:
             # On Windows, opening the subprocess with the default CWD makes `import torch`
             # fail, so just set CWD to this script's directory
             cwd=os.path.dirname(os.path.realpath(__file__)),).decode("utf-8")
-        self.assertEquals(out, "")
+        self.assertEqual(out, "")
 
     @unittest.skipIf(IS_WINDOWS, "importing torch+CUDA on CPU results in warning")
     @parametrize('path', ['torch', 'functorch'])
diff --git a/test/test_throughput_benchmark.py b/test/test_throughput_benchmark.py
index 75003c9fa2f8..1bfdab982f32 100644
--- a/test/test_throughput_benchmark.py
+++ b/test/test_throughput_benchmark.py
@@ -7,7 +7,7 @@
 
 class TwoLayerNet(torch.jit.ScriptModule):
     def __init__(self, D_in, H, D_out):
-        super(TwoLayerNet, self).__init__()
+        super().__init__()
         self.linear1 = torch.nn.Linear(D_in, H)
         self.linear2 = torch.nn.Linear(2 * H, D_out)
 
@@ -21,7 +21,7 @@ def forward(self, x1, x2):
 
 class TwoLayerNetModule(torch.nn.Module):
     def __init__(self, D_in, H, D_out):
-        super(TwoLayerNetModule, self).__init__()
+        super().__init__()
         self.linear1 = torch.nn.Linear(D_in, H)
         self.linear2 = torch.nn.Linear(2 * H, D_out)
 
diff --git a/test/test_torch.py b/test/test_torch.py
index e00ac9fc637f..3d8b8f2fc76d 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -24,13 +24,13 @@
 import subprocess
 import weakref
 import sys
-from torch._six import inf, nan, string_classes
+from torch import inf, nan
 from itertools import product, combinations, permutations
 from functools import partial
 from torch import multiprocessing as mp
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
-    TEST_WITH_TORCHINDUCTOR, TestCase, TEST_WITH_ROCM, run_tests,
+    TEST_WITH_TORCHINDUCTOR, TestCase, TEST_WITH_ROCM, run_tests, IS_JETSON,
     IS_WINDOWS, IS_FILESYSTEM_UTF8_ENCODING, NO_MULTIPROCESSING_SPAWN,
     IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, load_tests, skipIfTorchInductor, slowTest,
     TEST_WITH_CROSSREF, skipIfTorchDynamo,
@@ -1105,7 +1105,10 @@ def _test_in_place_broadcastable(t0, t1, t2=None):
             if not broadcastable(t0, t1, t2):
                 same_size = t0.numel() == t1.numel() and (t0.numel() == t2.numel() if t2 is not None else True)
                 if not same_size:
-                    self.assertRaises(RuntimeError, lambda: tensorfn_inplace(t0, t1, t2))
+                    # Functionalization converts the inplace to an out-of-place, which causes us to error.
+                    # We should fix this, but "error probably on bad inputs" isn't a hi-pri PT2 item.
+                    if not TEST_WITH_TORCHINDUCTOR:
+                        self.assertRaises(RuntimeError, lambda: tensorfn_inplace(t0, t1, t2))
             else:
                 tensorfn_inplace(t0, t1, t2)
 
@@ -1525,6 +1528,10 @@ def test_nondeterministic_alert_put(self, device):
                 lambda: op_call(a, indices, values, accumulate=False),
                 'put_')
 
+    # warn_only=False correctly raises RuntimeError: put_ does not have a deterministic implementation
+    # warn_only=True logs warning from the FallbackKernel: torch.ops.aten.put_.default, instead of as UserWarning:
+    # [W Context.cpp:%(lineno)] Warning: put_ does not have a deterministic implementation
+    @skipIfTorchInductor("warning is logged from the FallbackKernel: torch.ops.aten.put_.default when warn_only=True")
     def test_nondeterministic_alert_put_accumulate(self, device):
         a = torch.randn(10, device=device)
         indices = torch.tensor([0, 0], device=device)
@@ -1940,10 +1947,8 @@ def test_exponential(self, device, dtype):
         self.assertEqual(a.size(), torch.Size([1]))
 
         # Tests extremal behavior
-        tests = ((-0, float('inf')), (0, float('inf')), (float('inf'), 0))
-        for test in tests:
-            t = torch.empty((1,), device=device, dtype=dtype).exponential_(test[0])
-            self.assertTrue(t.item() == test[1])
+        t = torch.empty((1,), device=device, dtype=dtype).exponential_(float('inf'))
+        self.assertTrue(t.item() == 0)
 
         # Tests that negative lambda fails
         with self.assertRaises(RuntimeError):
@@ -2071,6 +2076,20 @@ def test_cauchy_no_inf(self, device, dtype):
             x.cauchy_()
             self.assertFalse(x.isinf().sum())
 
+    @dtypes(*floating_types_and(torch.half, torch.bfloat16))
+    def test_cauchy(self, device, dtype):
+        a = torch.tensor([10], dtype=dtype, device=device).cauchy_(0.0, 0.5)
+        self.assertEqual(a.dtype, dtype)
+        self.assertEqual(a.size(), torch.Size([1]))
+
+        # Tests extremal behavior
+        t = torch.empty((1,), device=device, dtype=dtype).cauchy_(float('inf'), 0.5)
+        self.assertTrue(t.item() == float('inf'))
+
+        # Tests non-positive rate fails
+        with self.assertRaises(RuntimeError):
+            torch.empty((1,), device=device, dtype=dtype).cauchy_(0.0, 0.0)
+
     @skipIfMps
     @skipIfNoSciPy
     @dtypes(*all_types_and(torch.half, torch.bfloat16))
@@ -2509,7 +2528,6 @@ def logcumsumexp(a, axis):
         for inp in (x, x2d):
             actual = inp.logcumsumexp(axis)
             expected = logcumsumexp(inp, axis)
-            print(actual, expected)
             self.assertEqual(expected, actual)
 
         # Check that out is actually inplace
@@ -2763,6 +2781,7 @@ def _test_large_cum_fn_helper(self, x, fn):
         torch.testing.assert_close(expected, actual)
 
     @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "sandcastle OOM with current tpx gpu/re configuration")
+    @unittest.skipIf(IS_JETSON, "psutil issue for largeTensorTest. Too large for Jetson.")
     @onlyCUDA
     @dtypes(torch.half)  # only small dtype not to get oom
     @largeTensorTest('25GB', device='cpu')
@@ -2779,6 +2798,7 @@ def test_large_cumsum(self, device, dtype):
     @dtypes(torch.half)  # only small dtype not to get oom
     @largeTensorTest('48GB', device='cpu')
     @largeTensorTest('4GB', device='cuda')
+    @unittest.skipIf(IS_JETSON, "psutil issue for largeTensorTest. Too large for Jetson.")
     def test_large_cumprod(self, device, dtype):
         # initialization to avoid overflow and half caveats
         x = torch.empty(2**30 + 200, device=device, dtype=dtype)
@@ -2804,8 +2824,12 @@ def _test_cumminmax_helper(self, x, fn, expected_val, expected_ind):
         out_val = torch.empty_like(val).t().contiguous().t()
         out_ind = torch.empty_like(ind).t().contiguous().t()
         fn(x, -1, out=(out_val, out_ind))
-        self.assertFalse(out_val.is_contiguous())
-        self.assertFalse(out_ind.is_contiguous())
+        # TODO: Fix this. It reproduces with aot_eager too, and looks like a functionalization bug.
+        # (the problematic case seems rare, as we're calling an out= op directly from user code,
+        # where the passed-in out tensors are non-contiguous).
+        if not TEST_WITH_TORCHINDUCTOR:
+            self.assertFalse(out_val.is_contiguous())
+            self.assertFalse(out_ind.is_contiguous())
         self.assertEqual(out_val, expected_val, atol=0, rtol=0)
         self.assertEqual(out_ind, expected_ind, atol=0, rtol=0)
 
@@ -3985,6 +4009,7 @@ def test_dim_function_empty(self, device):
             ind_05 = torch.tensor([0, 5], dtype=torch.int64, device=device)
             with self.assertRaisesRegex(RuntimeError, "INDICES element is out of DATA bounds"):
                 torch.index_select(w, 1, ind_05)
+        self.assertRaises(RuntimeError, lambda: torch.ones([]).index_select(0, torch.Tensor([0, 0]).int()))
 
     # FIXME: find a test suite for the pdist operator
     @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "sandcastle OOM with current tpx gpu/re configuration")
@@ -6060,7 +6085,7 @@ def test_contains(self):
 
         self.assertRaisesRegex(
             RuntimeError,
-            "Tensor.__contains__ only supports Tensor or scalar, but you passed in a {}.".format(type("foo")),
+            "Tensor.__contains__ only supports Tensor or scalar, but you passed in a {}.".format(str),
             lambda: "foo" in x)
         self.assertRaisesRegex(
             RuntimeError,
@@ -6344,7 +6369,7 @@ def test_parsing_intlist(self):
         # fail parse with float variables
         self.assertRaises(TypeError, lambda: torch.ones((torch.tensor(3.), torch.tensor(4))))
         # fail parse with numpy floats
-        self.assertRaises(TypeError, lambda: torch.ones((np.float(3.), torch.tensor(4))))
+        self.assertRaises(TypeError, lambda: torch.ones((3., torch.tensor(4))))
         self.assertRaises(TypeError, lambda: torch.ones((np.array(3.), torch.tensor(4))))
 
         # fail parse with > 1 element variables
@@ -6361,7 +6386,6 @@ def test_parsing_intlist(self):
                                "missing 1 required positional arguments",
                                lambda: torch.tensor().new_zeros((5, 5), 0))
 
-    @skipIfTorchDynamo("will be re-enabled after #90892")
     def test_from_buffer(self):
         a = bytearray([1, 2, 3, 4])
         self.assertEqual(torch.ByteStorage.from_buffer(a).tolist(), [1, 2, 3, 4])
@@ -7426,6 +7450,7 @@ def test_batch_norm_cpu_inference(self):
 
     # FIXME: move these meta tests to their own test suite/class or
     #   distribute them among the appropriate test suites for their ops
+    @skipIfTorchDynamo("Fails after Triton update, see https://github.com/pytorch/pytorch/issues/94687")
     def test_empty_meta(self):
         x = torch.empty(2 ** 20, 2 ** 20, device='meta')
         y = torch.empty(2 ** 20, device='meta')
@@ -7433,6 +7458,7 @@ def test_empty_meta(self):
         self.assertEqual(z.size(), (2 ** 20, 2 ** 20))
         self.assertRaises(RuntimeError, lambda: z[0][0].item())
 
+    @skipIfTorchDynamo("Fails after Triton update, see https://github.com/pytorch/pytorch/issues/94687")
     def test_format_scalar_meta(self):
         x = torch.empty((), device='meta')
         self.assertEqual(format(x), repr(x))
@@ -7495,10 +7521,12 @@ def test_upsample_nearest2d_meta(self):
         # Complain if out device mismatch
         x = torch.empty(0, 3, 8, 8, device='meta')
         out = torch.empty(0, 3, 16, 16, device='cpu')
-        self.assertExpectedRaisesInline(
-            RuntimeError, lambda: torch._C._nn.upsample_nearest2d(x, (16, 16), out=out),
-            """Expected out tensor to have device meta, but got cpu instead"""
-        )
+        # FIXME: compiling should properly error with a device mismatch.
+        if not TEST_WITH_TORCHINDUCTOR:
+            self.assertExpectedRaisesInline(
+                RuntimeError, lambda: torch._C._nn.upsample_nearest2d(x, (16, 16), out=out),
+                """Expected out tensor to have device meta, but got cpu instead"""
+            )
 
     def test_add_meta_scalar(self):
         # From https://github.com/pytorch/pytorch/issues/53815
@@ -7809,6 +7837,9 @@ def test_copy_broadcast(self):
         self.assertRaises(RuntimeError, lambda: torch.zeros(5, 6).copy_(torch.zeros(30)))
 
     # FIXME: Port to a more appropriate test suite
+    # Fails with inductor (and aot_eager) because functionalization replaces copy_ with copy,
+    # which doesn't properly error on bad inputs.
+    @skipIfTorchInductor("FIXME")
     def test_copy_many_to_one(self):
         # Testing in-place copy where it attempt to write from many memory
         # storage to a single storage would cause RuntimeError to be thrown
@@ -8259,7 +8290,7 @@ def _test_namespace(ns, *skips):
                 ns_name = ns.__name__
             skip_regexes = []
             for r in skips:
-                if isinstance(r, string_classes):
+                if isinstance(r, str):
                     skip_regexes.append(re.compile('^{}$'.format(re.escape(r))))
                 else:
                     skip_regexes.append(r)
@@ -8661,6 +8692,18 @@ def test_no_cuda_monkeypatch(self):
         with self.assertRaisesRegex(RuntimeError, "Tried to instantiate dummy base class CUDAGraph"):
             torch.cuda.graphs.CUDAGraph()
 
+    def test_tensor_where_scalar(self):
+
+        a = torch.arange(4.0)
+        not_zero = 0.001
+
+        # b is generated through torch.where function with not_zero being a scalar parameter
+        b = torch.where(a != 0, a, not_zero)
+        # c is generated through Tensor.where method with not_zero being a scalar parameter
+        c = a.where(a != 0, not_zero)
+
+        self.assertEqual(b, c)
+
 # The following block extends TestTorch with negative dim wrapping tests
 # FIXME: replace these with OpInfo sample inputs or systemic OpInfo tests
 # Functions to test negative dimension wrapping
diff --git a/test/test_transformers.py b/test/test_transformers.py
index 31cb08a6f81f..7b866c4ab7cf 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -13,6 +13,7 @@
 import torch.optim as optim
 from torch.testing._internal.common_dtype import floating_types_and_half
 
+from typing import Tuple
 from torch.testing._internal.common_nn import NNTestCase
 from torch.testing._internal.common_utils import (
     TEST_FAIRSEQ,
@@ -21,20 +22,18 @@
     instantiate_parametrized_tests,
     freeze_rng_state,
     TEST_WITH_CROSSREF,
-    TEST_WITH_ROCM,
-    IS_WINDOWS,
     slowTest,
     set_default_dtype,
     gradcheck
 )
 
+
 from torch.testing._internal.common_methods_invocations import wrapper_set_seed
-from torch.testing._internal.common_cuda import TEST_CUDA, SM80OrLater
+from torch.testing._internal.common_cuda import TEST_CUDA, SM80OrLater, PLATFORM_SUPPORTS_FUSED_SDPA
 
 if TEST_FAIRSEQ:
     import fairseq.models.transformer as fairseq_transformer
 
-PLATFORM_SUPPORTS_FUSED_SDPA: bool = TEST_CUDA and not TEST_WITH_ROCM and not IS_WINDOWS
 
 @contextlib.contextmanager
 def use_deterministic_algorithims(mode: bool, warn_only: bool):
@@ -52,6 +51,21 @@ def use_deterministic_algorithims(mode: bool, warn_only: bool):
     finally:
         torch.use_deterministic_algorithms(previous_mode, warn_only=previous_warn_only)
 
+
+# Found in torch/testing/_comparison.py
+default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float32: 1e-5}
+default_rtol = {torch.float16: 1e-3, torch.bfloat16: 1.6e-2, torch.float32: 1.3e-6}
+
+isSM86Device = torch.cuda.is_available() and torch.cuda.get_device_capability() == (8, 6)
+
+
+def get_rtol(true_value: torch.Tensor, computed_value: torch.Tensor) -> float:
+    deviation = true_value - computed_value
+    deviation = torch.abs(deviation / true_value)
+    # Fill in the nans with the default rtol
+    torch.nan_to_num_(deviation, nan=default_rtol[computed_value.dtype])
+    return deviation.max().item()
+
 class TestTransformers(NNTestCase):
     _do_cuda_memory_leak_check = True
     _do_cuda_non_default_stream = True
@@ -446,7 +460,8 @@ def perm_fn(x):
             # test case 3, multiple layers with norm
             # d_model = 4
             norm = nn.LayerNorm(4)
-            model = nn.TransformerEncoder(encoder_layer, 2, norm=norm, enable_nested_tensor=enable_nested_tensor).to(device)
+            model = nn.TransformerEncoder(encoder_layer, 2, norm=norm,
+                                          enable_nested_tensor=enable_nested_tensor).to(device)
             if not training:
                 model = model.eval()
             result = model(encoder_input, src_key_padding_mask=mask)
@@ -464,7 +479,8 @@ def perm_fn(x):
             self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
             torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5)
 
-            model = nn.TransformerEncoder(encoder_layer, 6, norm=norm, enable_nested_tensor=enable_nested_tensor).to(device)
+            model = nn.TransformerEncoder(encoder_layer, 6, norm=norm,
+                                          enable_nested_tensor=enable_nested_tensor).to(device)
             if not training:
                 model = model.eval()
             result = model(encoder_input, src_key_padding_mask=mask)
@@ -601,7 +617,6 @@ def forward(
 
                 norm_first = one_encoder_layer.norm_first
 
-
                 # TODO: make this a bit less janky. but for now we initialize with an empty tensor.
                 if(not is_incremental_decoding):
                     assert len(incr_key_lst) == 0 or incr_key_lst[0] is None
@@ -826,7 +841,7 @@ def sdp_ref(
                 attn = torch.nn.functional.dropout(attn, p=dropout_p)
             # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
             output = torch.bmm(attn, v)
-            return output, attn
+            return output
         # TODO: Support cross-device / dtype testing properly when instantiate_device_type_tests() is used.
         dtypes = [torch.double, torch.float]
         for dtype in dtypes:
@@ -866,23 +881,22 @@ def rand_tensor(*shape):
                     a = a.view(-1, L, S)
                 expected = sdp_ref(q, k, v, attn_mask=a, dropout_p=dropout_p)
                 if input_dim > 3:
-                    expected = (expected[0].view(-1, N_prime, L, E), expected[1].view(-1, N_prime, L, S))
+                    expected = expected.view(-1, N_prime, L, E)
 
-            need_attn_weights: bool = True
             with freeze_rng_state():
                 if is_causal:
                     # NB: Don't pass attn_mask here
-                    actual = torch.ops.aten._scaled_dot_product_attention(
-                        query, key, value, None, dropout_p, need_attn_weights, is_causal)
+                    actual = torch.nn.functional.scaled_dot_product_attention(
+                        query, key, value, None, dropout_p, is_causal)
 
                     # Error case: both explicit attn_mask and is_causal are set
                     with self.assertRaisesRegex(RuntimeError,
                                                 "Explicit attn_mask should not be set when is_causal=True"):
-                        torch.ops.aten._scaled_dot_product_attention(
-                            query, key, value, attn_mask, dropout_p, need_attn_weights, is_causal)
+                        torch.nn.functional.scaled_dot_product_attention(
+                            query, key, value, attn_mask, dropout_p, is_causal)
                 else:
-                    actual = torch.ops.aten._scaled_dot_product_attention(
-                        query, key, value, attn_mask, dropout_p, need_attn_weights, is_causal)
+                    actual = torch.nn.functional.scaled_dot_product_attention(
+                        query, key, value, attn_mask, dropout_p, is_causal)
 
                 self.assertEqual(actual, expected)
 
@@ -897,7 +911,7 @@ def rand_tensor(*shape):
             assert gradcheck(lambda *args, **kwargs: wrapper_set_seed(sdp_ref, *args, **kwargs),
                              (q, k, v, attn_mask, dropout_p))
             assert gradcheck(lambda *args, **kwargs:
-                             wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, *args, **kwargs),
+                             wrapper_set_seed(torch.nn.functional.scaled_dot_product_attention, *args, **kwargs),
                              (q, k, v, attn_mask, dropout_p))
 
     @unittest.skipIf(TEST_WITH_CROSSREF, 'Fastpath not available with crossref')
@@ -957,35 +971,226 @@ def _test_fastpath(model, key_padding_mask, mock_return_value, attn_mask=None, n
         _test_fastpath(model, aligned_key_padding_mask, nested_tensor_return_value, nested_tensors=True)
         _test_fastpath(model, not_aligned_key_padding_mask, nested_tensor_return_value, nested_tensors=True)
 
-    def rand_nt(self, shape, device, dtype, requires_grad=False, packed=False):
-        batch, seq_len, num_heads, head_dim = shape
-        size = (seq_len, num_heads, head_dim) if not packed else (seq_len, 3 * num_heads * head_dim)
-        return torch.nested.nested_tensor([
-            torch.randn(size, device=device, dtype=dtype, requires_grad=requires_grad)
-            for _ in range(batch)])
+    # Test failing MHA when bias was NoneType
+    def test_bias_is_none(self):
+        x = torch.rand((1, 5, 10))
+        model = torch.nn.modules.activation.MultiheadAttention(10, 1, bias=False, batch_first=True)
+        model.eval()
+        model(x, x, x)
+        # completes without error
+
+    @parametrize("device", device_list)
+    def test_train_with_is_causal(self, device):
+        # training with is_causal
+        S, L, E, H = 1, 2, 2, 1
+        layer = nn.TransformerEncoderLayer(
+            d_model=2,
+            dim_feedforward=4,
+            nhead=H,
+            batch_first=True,
+            activation="gelu",
+            dropout=0,
+        )
+        criterion = nn.MSELoss()
+        encoder = nn.TransformerEncoder(layer, 2).to(device)
+        optimizer = optim.SGD(encoder.parameters(), lr=0.1, momentum=0.9)
+        encoder.train()
 
-    def rand_tensor(self, shape, device, dtype, requires_grad=False, packed=False):
+        encoder.train()
+        optimizer.zero_grad()
+        inputs = torch.randn(S, L, E).to(device)
+
+        outputs = encoder(inputs, is_causal=True)
+
+        loss = criterion(outputs[:, 0:2, :], inputs[:, 0:2, :])
+        loss.backward()
+        optimizer.step()
+
+        # inference with is_causal
+        t_qvk = torch.randn((S, L, E), device=device, dtype=torch.float32)
+        mha = nn.MultiheadAttention(E, H).to(device)
+        attn_out, _ = mha(t_qvk, t_qvk, t_qvk, is_causal=True)
+
+        # Can't give both attn_mask AND is_causal
+        attn_mask = torch.randint(0, 2, size=(L, L), device=device, dtype=torch.bool)
+        with self.assertRaisesRegex(AssertionError, "Only allow causal mask or attn_mask"):
+            _ = mha(t_qvk, t_qvk, t_qvk, attn_mask=attn_mask, is_causal=True)
+
+        # # Passing a causal mask sets is_causal to 1
+        causal_mask = torch.triu(
+            torch.ones(L, L, device=inputs.device) * float('-inf'), diagonal=1
+        ).to(torch.bool)
+
+        mock_layer = MagicMock(torch.nn.MultiheadAttention(E, H), return_value=inputs)
+        encoder.layers[0] = mock_layer
+        outputs = encoder(inputs, mask=causal_mask)
+        mock_layer.assert_called_with(ANY, src_mask=ANY, is_causal=True, src_key_padding_mask=ANY)
+
+        # check expected numerical values with all kernels
+        self.is_causal_kernels(["math"], device)
+
+    def is_causal_kernels(self, kernels, device):
+        def ones_tensor(*shape):
+            return torch.ones(shape, device=device, dtype=torch.float32).to(device)
+        S, L, E, H = 1, 2, 4, 1
+        qkv = ones_tensor(S, L, E)
+
+        mha = nn.MultiheadAttention(E, H).to(device)
+        mha.in_proj_weight = Parameter(torch.ones((E * 3, E), device=device))
+        mha.out_proj.weight = Parameter(torch.ones((E, E), device=device))
+        expected = torch.ones(size=(S, L, E)).to(device) * 16
+
+        for kernel in kernels:
+            with torch.backends.cuda.sdp_kernel(
+                enable_math=(kernel == 'math'),
+                enable_flash=(kernel == 'flash'),
+                enable_mem_efficient=(kernel == 'meff')
+            ):
+                actual, _ = mha(qkv, qkv, qkv, need_weights=False, is_causal=True)
+                self.assertTrue(torch.equal(actual, expected))
+
+                if kernel != 'math':
+                    # fails with embedding size not multiple of 4
+                    with self.assertRaisesRegex(RuntimeError, "No available kernel"):
+                        qkv_f, mha_f = ones_tensor(S, L, 2), nn.MultiheadAttention(2, H).to(device)
+                        _ = mha_f(qkv_f, qkv_f, qkv_f, need_weights=False, is_causal=True)
+                        torch.cuda.synchronize()
+
+    @unittest.skipIf(
+        not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "Platform does not supposrt fused SDPA or pre-SM80 hardware"
+    )
+    def test_is_causal_gpu(self):
+        device = 'cuda'
+        self.is_causal_kernels(["math", "meff"], device)
+
+    def test_script_mha_in_proj_weight_none(self):
+        mha = torch.nn.MultiheadAttention(
+            embed_dim=128, num_heads=8, kdim=256, vdim=256
+        ).eval()
+
+        torch.jit.script(mha)
+
+
+class TestSDPA(NNTestCase):
+    """ Used to test the functionality of scaled_dot_product_attention
+    Quarks:
+        There is some trickiness with this function. It's runtime behavior
+        is dependent on the CUDA architecture you are testing it on. See
+        `PLATFORM_SUPPORTS_FUSED_SDPA` at the top of the file.
+        Summary:
+            Math: always supported
+            FlashAttention: Supported on sm80 or newer hardware
+            MemEfficientAttention: Supported on sm50 or newer hardware
+    """
+    _do_cuda_memory_leak_check = True
+    _do_cuda_non_default_stream = True
+
+    backend_map = {
+        SDPBackend.MATH: {"enable_math": True, "enable_flash": False, "enable_mem_efficient": False},
+        SDPBackend.FLASH_ATTENTION: {"enable_math": False, "enable_flash": True, "enable_mem_efficient": False},
+        SDPBackend.EFFICIENT_ATTENTION: {
+            "enable_math": False, "enable_flash": False, "enable_mem_efficient": True}
+    }
+
+    def rand_tensor(self, shape: Tuple[int], device: str, dtype: torch.dtype,
+                    type: str, requires_grad: bool = False, packed: bool = False) -> torch.Tensor:
+        """Creates rand dense or nested tensor with given shape and type.
+
+        Args:
+            shape (Tuple[int]): _description_
+            device (str): _description_
+            dtype (torch.dtype): _description_
+            type (str): _description_
+            requires_grad (bool, optional): _description_. Defaults to False.
+            packed (bool, optional): _description_. Defaults to False.
+
+        Returns:
+            torch.Tensor: _description_
+        """
         batch, seq_len, num_heads, head_dim = shape
-        size = (batch, seq_len, num_heads, head_dim) if not packed else (batch, seq_len, 3 * num_heads * head_dim)
-        return torch.randn(size, device=device, dtype=dtype, requires_grad=requires_grad)
+        if type == "nested":
+            size = (seq_len, num_heads, head_dim) if not packed else (seq_len, 3 * num_heads * head_dim)
+            return torch.nested.nested_tensor([
+                torch.randn(size, device=device, dtype=dtype, requires_grad=requires_grad)
+                for _ in range(batch)])
+        else:
+            size = (batch, seq_len, num_heads, head_dim) if not packed else (batch, seq_len, 3 * num_heads * head_dim)
+            return torch.randn(size, device=device, dtype=dtype, requires_grad=requires_grad)
+
+    def convert_flash_attn_S_to_softmax(self, S, query_padding_mask, key_padding_mask, head_dim, causal=False):
+        """FlashAttention stores the S matrix in a different way.
+        Arguments:
+            S: (batch_size, nheads, seqlen_q, seqlen_k)
+            query_padding_mask: (batch_size, seqlen_q)
+            key_padding_mask: (batch_size, seqlen_k)
+        """
+        def _get_block_size(head_dim):
+            assert head_dim % 8 == 0 and head_dim <= 128
+            return 256 if head_dim <= 64 else 128
+        S_flat = S.view(S.shape[0], S.shape[1], S.shape[2] * S.shape[3])
+        seqlen_q, seqlen_k = S.shape[-2:]
+        block_size = _get_block_size(head_dim)
+        loop_steps = math.ceil(seqlen_k / block_size)
+        warps_n = 4
+        mmas_n = (seqlen_k // warps_n //
+                  16) if seqlen_k <= block_size else (block_size // warps_n // 16)
+
+        S_converted = S_flat.view(S_flat.shape[0], S_flat.shape[1], loop_steps,
+                                  seqlen_q // 16, mmas_n, warps_n, 8, 4, 2, 2, 2)
+        S_converted = S_converted.permute(0, 1, 3, 8, 6, 2, 4, 5, 9, 7, 10)
+        S_converted = S_converted.reshape(S_flat.shape[0],
+                                          S_flat.shape[1], (seqlen_q // 16 * 2 * 8), (loop_steps * mmas_n * warps_n * 2 * 4 * 2))
+        # Need to zero out things not in attention_mask in case S was initialized with random values
+        # and some of those values aren't overwritten.
+        seqlen_q_og = query_padding_mask.shape[-1]
+        if seqlen_q_og < seqlen_q:
+            query_padding_mask = F.pad(
+                query_padding_mask, (0, seqlen_q - seqlen_q_og))
+        else:
+            query_padding_mask = query_padding_mask[:, :seqlen_q]
+        q_mask_fill = ~query_padding_mask.view(query_padding_mask.shape[0], 1, query_padding_mask.shape[1], 1)
+        S_converted = S_converted.masked_fill(q_mask_fill, 0.0)
+        seqlen_k_og = key_padding_mask.shape[-1]
+        if seqlen_k_og < seqlen_k:
+            key_padding_mask = F.pad(key_padding_mask, (0, seqlen_k - seqlen_k_og))
+        else:
+            key_padding_mask = key_padding_mask[:, :seqlen_k]
 
-    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Flash Attention was not built for this system")
+        k_mask_fill = ~key_padding_mask.view(key_padding_mask.shape[0], 1, 1, key_padding_mask.shape[1])
+        S_converted = S_converted.masked_fill(k_mask_fill, 0.0)
+
+        if causal:
+            causal_mask = torch.triu(torch.ones(
+                seqlen_q, seqlen_k, dtype=torch.bool, device=S.device), 1)
+            S_converted.masked_fill_(causal_mask, 0.0)
+        if seqlen_q_og < seqlen_q:
+            S_converted = S_converted[:, :, :seqlen_q_og, :]
+        else:
+            S_converted = F.pad(S_converted, (0, 0, 0, seqlen_q_og - seqlen_q))
+        if seqlen_k_og < seqlen_k:
+            S_converted = S_converted[:, :, :, :seqlen_k_og]
+        else:
+            S_converted = F.pad(S_converted, (0, seqlen_k_og - seqlen_k))
+        return S_converted
+
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Fused SDPA was not built for this system")
     @parametrize("type", ["dense", "nested"])
     @parametrize("is_contiguous", [True, False])
-    def test_scaled_dot_product_attention_fused_kernels(self, type: str, is_contiguous: bool):
-        rand_nt = partial(self.rand_nt, device="cuda", dtype=torch.float16)
-        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float16)
+    @parametrize("head_dims_match", [True, False])
+    def test_scaled_dot_product_attention_fused_kernels(self, type: str, is_contiguous: bool, head_dims_match: bool):
+        rand_tensor = partial(self.rand_tensor, type=type, device="cuda", dtype=torch.float16)
 
         batch, seq_len, num_heads, head_dim = 32, 64, 16, 64
         shape = (batch, seq_len, num_heads, head_dim)
-        if type == "dense":
-            query = rand_tensor(shape)
-            key = rand_tensor(shape)
-            value = rand_tensor(shape)
-        elif type == "nested":
-            query = rand_nt(shape)
-            key = rand_nt(shape)
-            value = rand_nt(shape)
+        if head_dims_match:
+            shape_v = shape
+        else:
+            head_dim_v = 96
+            shape_v = (batch, seq_len, num_heads, head_dim_v)
+
+        query = rand_tensor(shape)
+        key = rand_tensor(shape)
+        value = rand_tensor(shape_v)
 
         # Lets switch seq_len and num_heads
         # B x S X H X D -> B x H x S x D
@@ -999,32 +1204,26 @@ def test_scaled_dot_product_attention_fused_kernels(self, type: str, is_contiguo
             value = value.contiguous()
 
         with sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=True):
-            actual = torch.nn.functional._scaled_dot_product_attention(
-                query, key, value, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
+            actual = torch.nn.functional.scaled_dot_product_attention(
+                query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False)
         with sdp_kernel(enable_flash=False, enable_math=True, enable_mem_efficient=False):
-            math_ref = torch.nn.functional._scaled_dot_product_attention(
+            math_ref = torch.nn.functional.scaled_dot_product_attention(
                 query.contiguous(), key.contiguous(), value.contiguous(),
-                attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
-
-        # Since we are setting need weights to false lets check that the returned values are of size 0
-        if type == "dense":
-            assert actual[1].numel() == 0
-            assert math_ref[1].numel() == 0
+                attn_mask=None, dropout_p=0.0, is_causal=False)
 
         self.assertEqual(actual[0].contiguous(), math_ref[0].contiguous(), atol=1e-3, rtol=1e-2)
 
-    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Flash Attention was not built for this system")
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Fused SDPA was not built for this system")
     @parametrize("type", ["dense", "nested"])
     @parametrize("is_contiguous", [True, False])
     def test_scaled_dot_product_attention_fused_kernels_packed(self, type: str, is_contiguous: bool):
-        rand_nt = partial(self.rand_nt, device="cuda", dtype=torch.float16, packed=True)
-        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float16, packed=True)
+        rand_tensor = partial(self.rand_tensor, type=type, device="cuda", dtype=torch.float16, packed=True)
 
         batch_size, seq_len, num_heads, head_dim = 32, 64, 16, 64
         shape = (batch_size, seq_len, num_heads, head_dim)
 
         # Test Packed
-        qkv = rand_tensor(shape) if type == "dense" else rand_nt(shape)
+        qkv = rand_tensor(shape)
         query, key, value = qkv.chunk(3, dim=-1)
 
         query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
@@ -1037,20 +1236,20 @@ def test_scaled_dot_product_attention_fused_kernels_packed(self, type: str, is_c
             value = value.contiguous()
 
         with sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=True):
-            actual = torch.nn.functional._scaled_dot_product_attention(
-                query, key, value, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
+            actual = torch.nn.functional.scaled_dot_product_attention(
+                query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False)
         with sdp_kernel(enable_flash=False, enable_math=True, enable_mem_efficient=False):
-            math_ref = torch.nn.functional._scaled_dot_product_attention(
+            math_ref = torch.nn.functional.scaled_dot_product_attention(
                 query.contiguous(), key.contiguous(), value.contiguous(),
-                attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
+                attn_mask=None, dropout_p=0.0, is_causal=False)
 
-        self.assertEqual(actual[0].contiguous(), math_ref[0].contiguous(), atol=2e-3, rtol=1e-2)
+        self.assertEqual(actual.contiguous(), math_ref.contiguous(), atol=2e-3, rtol=1e-2)
 
-    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Flash Attention was not built for this system")
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Fused SDPA was not built for this system")
     @parametrize("type", ["dense", "nested"])
-    @parametrize("fused_kernel", ["flash", "mem_efficient"])
+    @parametrize("fused_kernel", [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION])
     def test_scaled_dot_product_attention_fused_kernels_packed_accuracy(self, type: str, fused_kernel: str):
-        if (not SM80OrLater) and fused_kernel == "flash":
+        if (not SM80OrLater) and fused_kernel == SDPBackend.FLASH_ATTENTION:
             return
 
         def rand_nt(shape):
@@ -1081,36 +1280,25 @@ def rand_tensor(shape):
         key_lp = key_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
         value_lp = value_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
 
-        if fused_kernel == "flash":
-            with sdp_kernel(enable_mem_efficient=False, enable_math=False):
-                # TODO Flash for the nested path is currently not working due to cuda memory issues
-                if type == "nested":
-                    self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
-                        query_lp, key_lp, value_lp, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False))
-                    return
-                actual = torch.nn.functional._scaled_dot_product_attention(
-                    query_lp, key_lp, value_lp, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
-        elif fused_kernel == "mem_efficient":
-            with sdp_kernel(enable_flash=False, enable_math=False):
-                actual = torch.nn.functional._scaled_dot_product_attention(
-                    query_lp, key_lp, value_lp, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
-
-        with sdp_kernel(enable_flash=False, enable_mem_efficient=False):
-            math_ref_lp = torch.nn.functional._scaled_dot_product_attention(
+        with sdp_kernel(**self.backend_map[fused_kernel]):
+            actual = torch.nn.functional.scaled_dot_product_attention(
+                query_lp, key_lp, value_lp, attn_mask=None, dropout_p=0.0, is_causal=False)
+
+        with sdp_kernel(**self.backend_map[SDPBackend.MATH]):
+            math_ref_lp = torch.nn.functional.scaled_dot_product_attention(
                 query_lp.contiguous(), key_lp.contiguous(), value_lp.contiguous(),
-                attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
+                attn_mask=None, dropout_p=0.0, is_causal=False)
 
-        with sdp_kernel(enable_flash=False, enable_mem_efficient=False):
             math_query = query.contiguous()
             math_key = key.contiguous()
             math_value = value.contiguous()
 
-            math_ref = torch.nn.functional._scaled_dot_product_attention(
-                math_query, math_key, math_value, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
+            math_ref = torch.nn.functional.scaled_dot_product_attention(
+                math_query, math_key, math_value, attn_mask=None, dropout_p=0.0, is_causal=False)
 
-        actual_test = actual[0]
-        math_ref_test = math_ref[0]
-        math_ref_lp_test = math_ref_lp[0]
+        actual_test = actual
+        math_ref_test = math_ref
+        math_ref_lp_test = math_ref_lp
 
         if actual_test.is_nested:
             actual_test = torch.nested.to_padded_tensor(actual_test.contiguous(), padding=0.0)
@@ -1124,12 +1312,13 @@ def rand_tensor(shape):
         self.assertEqual(math_ref_test, math_ref_lp_test, atol=7e-3, rtol=7e-3)
         self.assertEqual(actual_test, math_ref_test, atol=5e-3, rtol=5e-3)
 
-    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Flash Attention was not built for this system")
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Fused SDPA was not built for this system")
     @parametrize("contiguous_inputs", [True, False])
     def test_sdp_math_gradcheck(self, contiguous_inputs: bool):
 
         batch_size, seq_len, num_heads, head_dim = 4, 4, 2, 16
-        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float64, requires_grad=True, packed=True)
+        rand_tensor = partial(self.rand_tensor, type="dense", device="cuda",
+                              dtype=torch.float64, requires_grad=True, packed=True)
 
         qkv = rand_tensor((batch_size, seq_len, num_heads, head_dim))
         query, key, value = qkv.chunk(3, dim=-1)
@@ -1145,15 +1334,17 @@ def test_sdp_math_gradcheck(self, contiguous_inputs: bool):
 
         with sdp_kernel(enable_math=True, enable_mem_efficient=False, enable_flash=False):
             assert gradcheck(lambda *args, **kwargs:
-                             wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, *args, **kwargs),
-                             (query, key, value, None, 0.0, False, False)
+                             wrapper_set_seed(torch.nn.functional.scaled_dot_product_attention, *args, **kwargs),
+                             (query, key, value, None, 0.0, False)
                              )
 
     @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Flash Attention was not built for this system")
     @parametrize("contiguous_inputs", [True, False])
-    def test_sdp_fused_grad_against_math(self, contiguous_inputs: bool):
+    @parametrize("is_causal", [True, False])
+    def test_sdp_mem_efficient_grad_against_math(self, contiguous_inputs: bool, is_causal: bool):
         batch_size, seq_len, num_heads, head_dim = 4, 4, 2, 16
-        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float64, requires_grad=True, packed=True)
+        rand_tensor = partial(self.rand_tensor, type="dense", device="cuda",
+                              dtype=torch.float64, requires_grad=True, packed=True)
 
         qkv = rand_tensor((batch_size, seq_len, num_heads, head_dim))
         qkv_lp = qkv.detach().clone().to(torch.float32).requires_grad_()
@@ -1179,11 +1370,11 @@ def test_sdp_fused_grad_against_math(self, contiguous_inputs: bool):
             value_lp = value_lp.contiguous()
 
         with sdp_kernel(enable_math=True, enable_mem_efficient=False, enable_flash=False):
-            out, atten = torch.nn.functional._scaled_dot_product_attention(query, key, value, None, 0.0, False, False)
+            out = torch.nn.functional.scaled_dot_product_attention(query, key, value, None, 0.0, is_causal)
 
         with sdp_kernel(enable_math=False, enable_mem_efficient=True, enable_flash=False):
-            out_lp, atten_lp = torch.nn.functional._scaled_dot_product_attention(
-                query_lp, key_lp, value_lp, None, 0.0, False, False)
+            out_lp = torch.nn.functional.scaled_dot_product_attention(
+                query_lp, key_lp, value_lp, None, 0.0, is_causal)
 
         rand_upward = torch.rand_like(out)
         rand_upward_lp = rand_upward.to(torch.float32)
@@ -1194,12 +1385,64 @@ def test_sdp_fused_grad_against_math(self, contiguous_inputs: bool):
         # Cast up and compare
         self.assertEqual(qkv.grad, qkv_lp.grad.to(torch.float64), atol=1e-5, rtol=1e-5)
 
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "Flash Attention was not built for this system")
+    @parametrize("contiguous_inputs", [True, False])
+    @parametrize("is_causal", [True, False])
+    @parametrize("dtype", [torch.float16, torch.bfloat16])
+    def test_sdp_flash_attention_grad_against_math(self, contiguous_inputs: bool, is_causal: bool, dtype: torch.dtype):
+        batch_size, seq_len, num_heads, head_dim = 4, 4, 2, 16
+        rand_tensor = partial(self.rand_tensor, type="dense", device="cuda",
+                              dtype=torch.float64, requires_grad=True, packed=True)
+
+        qkv = rand_tensor((batch_size, seq_len, num_heads, head_dim))
+        qkv_lp = qkv.detach().clone().to(dtype).requires_grad_()
+
+        query, key, value = qkv.chunk(3, dim=-1)
+        query_lp, key_lp, value_lp = qkv_lp.chunk(3, dim=-1)
+
+        query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+
+        query_lp = query_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        key_lp = key_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        value_lp = value_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+
+        if contiguous_inputs:
+            query = query.contiguous()
+            key = key.contiguous()
+            value = value.contiguous()
+
+            query_lp = query_lp.contiguous()
+            key_lp = key_lp.contiguous()
+            value_lp = value_lp.contiguous()
+
+        with sdp_kernel(enable_math=True, enable_mem_efficient=False, enable_flash=False):
+            out = torch.nn.functional.scaled_dot_product_attention(query, key, value, None, 0.0, is_causal)
+
+        with sdp_kernel(enable_math=False, enable_mem_efficient=False, enable_flash=True):
+            out_lp = torch.nn.functional.scaled_dot_product_attention(
+                query_lp, key_lp, value_lp, None, 0.0, is_causal)
+
+        rand_upward = torch.rand_like(out)
+        rand_upward_lp = rand_upward.to(dtype)
+
+        out.backward(rand_upward)
+        out_lp.backward(rand_upward_lp)
+
+        # Cast up and compare
+        # Since we are doing the compute on fp16 we have to bump the tolerance
+        # Bump down the tolearnce for blfoat16
+        atol = 7e-4 if dtype == torch.float16 else 7e-3
+        rtol = 7e-4 if dtype == torch.float16 else 7e-3
+        self.assertEqual(qkv.grad, qkv_lp.grad.to(torch.float64), atol=atol, rtol=rtol)
+
     @parametrize("type", ["dense", "nested"])
     def test_fused_sdp_choice(self, type: str):
         device = "cpu"
         # Test that cpu and nestedtensor cpu return MATH backend
         for dtype in floating_types_and_half():
-            make_tensor = partial(self.rand_tensor, device=device, dtype=dtype)
+            make_tensor = partial(self.rand_tensor, type=type, device=device, dtype=dtype)
             size = (2, 2, 3, 4)
             q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
             assert torch._fused_sdp_choice(q, k, v) == SDPBackend.MATH
@@ -1209,9 +1452,8 @@ def test_fused_sdp_choice(self, type: str):
             shape = (batch_size, seq_len, num_heads, head_dim)
             device = "cuda"
             make_tensor = partial(self.rand_tensor, device=device, dtype=torch.float16, packed=True)
-            make_nt = partial(self.rand_nt, device=device, dtype=torch.float16, packed=True)
 
-            qkv = make_tensor(shape) if type == "dense" else make_nt(shape)
+            qkv = make_tensor(shape, type=type)
             query, key, value = qkv.chunk(3, dim=-1)
 
             query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
@@ -1225,9 +1467,8 @@ def test_fused_sdp_choice(self, type: str):
 
             # Change dtype to float32 so that efficient attention should get chosen
             make_tensor = partial(self.rand_tensor, device=device, dtype=torch.float32, packed=True)
-            make_nt = partial(self.rand_nt, device=device, dtype=torch.float32, packed=True)
 
-            qkv = make_tensor(shape) if type == "dense" else make_nt(shape)
+            qkv = make_tensor(shape, type=type)
             query, key, value = qkv.chunk(3, dim=-1)
 
             query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
@@ -1236,13 +1477,13 @@ def test_fused_sdp_choice(self, type: str):
 
             assert torch._fused_sdp_choice(query, key, value) == SDPBackend.EFFICIENT_ATTENTION
 
-    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "CUDA unavailable")
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Platform does not support fused SDPA")
     @parametrize("warn_only", [True, False])
     def test_sdp_choice_with_determinism(self, warn_only):
         # If we are only warning we still expect that efficient_attention will still be called.
         batch_size, seq_len, num_heads, head_dim = 1, 64, 8, 64
         shape = (batch_size, seq_len, num_heads, head_dim)
-        make_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float32, packed=False)
+        make_tensor = partial(self.rand_tensor, type="dense", device="cuda", dtype=torch.float32, packed=False)
         query, key, value = make_tensor(shape), make_tensor(shape), make_tensor(shape)
 
         with use_deterministic_algorithims(True, warn_only=warn_only):
@@ -1250,15 +1491,41 @@ def test_sdp_choice_with_determinism(self, warn_only):
                 assert torch._fused_sdp_choice(query, key, value) == (
                     SDPBackend.EFFICIENT_ATTENTION if warn_only else SDPBackend.MATH)
 
-    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "CUDA unavailable")
-    def test_sdp_runtime_dispatch(self):
-        # We will test all the constraints that we know will cause a failure
-        # The problem is that any code path that goes down flash_attention
-        # will fail on CI/CD becuase it is not compiled with the right flags
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not isSM86Device, "Does not support fused SDPA or not SM86 hardware")
+    def test_memory_efficeint_sm86_failure(self):
         device = 'cuda'
         dtype = torch.float16
-        make_tensor = partial(self.rand_tensor, device=device, dtype=dtype)
-
+        make_tensor = partial(self.rand_tensor, type="dense", device=device, dtype=dtype)
+        # See check_gpu_sm86_head_dim_128 in pytorch/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+        size = (2, 2, 4, 128)
+        q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
+        with sdp_kernel(enable_mem_efficient=True, enable_flash=False, enable_math=False):
+            self.assertRaises(RuntimeError, lambda: torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, None, 0.0, False))
+
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not isSM86Device, "Does not support fused SDPA or not SM86 hardware")
+    def test_flash_backward_sm86_headdim128(self):
+        device = 'cuda'
+        dtype = torch.float16
+        make_tensor = partial(self.rand_tensor, type="dense", device=device, dtype=dtype)
+        # See check_gpu_sm86_head_dim_128 in pytorch/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+        size = (2, 2, 4, 128)
+        q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
+        with sdp_kernel(enable_mem_efficient=False, enable_flash=True, enable_math=False):
+            # Should not fail because inputs don't require grad
+            torch.nn.functional.scaled_dot_product_attention(q, k, v, None, 0.0, False)
+
+            # Should fail because inputs require grad
+            q = make_tensor(size, requires_grad=True)
+            k = make_tensor(size, requires_grad=True)
+            v = make_tensor(size, requires_grad=True)
+            self.assertRaises(RuntimeError, lambda: torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, None, 0.0, False))
+
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Platform does not support fused scaled dot product attention")
+    def test_dispatch_fails_no_backend(self):
+        dtype = torch.float16
+        device = "cuda"
         with sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=False):
             size = (2, 3, 4)
             q = torch.randn(size, device=device, dtype=dtype)
@@ -1267,196 +1534,374 @@ def test_sdp_runtime_dispatch(self):
             self.assertRaisesRegex(RuntimeError, "No viable backend for scaled_dot_product_attention was found.",
                                    lambda: torch._fused_sdp_choice(q, k, v))
             self.assertRaisesRegex(RuntimeError, "No viable backend for scaled_dot_product_attention was found.",
-                                   lambda: torch.nn.functional._scaled_dot_product_attention(q, k, v))
-
-        with sdp_kernel(enable_flash=True, enable_mem_efficient=False, enable_math=False):
-            # Failures for invalid input
-
+                                   lambda: torch.nn.functional.scaled_dot_product_attention(q, k, v))
+
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Does not support fused scaled dot product attention")
+    @parametrize(
+        "kernel",
+        [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION]
+        if SM80OrLater
+        else [SDPBackend.EFFICIENT_ATTENTION],
+    )
+    def test_invalid_fused_inputs_dim_3(self, kernel: SDPBackend):
+        with sdp_kernel(**self.backend_map[kernel]):
             # Dim is not 4
+            device = "cuda"
+            size = (2, 3, 8)
+            dtype = torch.float16
             q = torch.randn(size, device=device, dtype=dtype)
             k = torch.randn(size, device=device, dtype=dtype)
             v = torch.randn(size, device=device, dtype=dtype)
-            self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
-                q, k, v, None, 0.0, False, False))
-
-            # Xformers can now cover this case but will add back in next PR
-            # Invalid last_dim size
-            size = (2, 2, 3, 4)
+            self.assertRaises(RuntimeError, lambda: torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, None, 0.0, False))
+
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Does not support fused scaled dot product attention")
+    @parametrize(
+        "kernel",
+        [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION]
+        if SM80OrLater
+        else [SDPBackend.EFFICIENT_ATTENTION],
+    )
+    def test_invalid_fused_inputs_broadcast(self, kernel: SDPBackend):
+        with sdp_kernel(**self.backend_map[kernel]):
+            #  Fused Kernels don't support broadcasting
+            device = "cuda"
+            dtype = torch.float16
+            size = (2, 4, 3, 8)
+            size_broadcast = (1, 4, 3, 8)
+            q = torch.randn(size_broadcast, device=device, dtype=dtype)
+            k = torch.randn(size, device=device, dtype=dtype)
+            v = torch.randn(size, device=device, dtype=dtype)
+            self.assertRaises(RuntimeError, lambda: torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, None, 0.0, False))
+
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "Does not support fused scaled dot product attention")
+    @parametrize("kernel", [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION])
+    def test_invalid_fused_inputs_head_dim(self, kernel: SDPBackend):
+        with sdp_kernel(**self.backend_map[kernel]):
+            # The embed dim per head is not divisible by 8 for flash attention
+            device = "cuda"
+            dtype = torch.float16
+            make_tensor = partial(self.rand_tensor, type="dense", device=device, dtype=dtype)
+            size = (2, 2, 3, 9)
             q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
-            self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
-                q, k, v, None, 0.0, False, False))
-
-            # Invalid dtype
+            self.assertRaises(RuntimeError, lambda: torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, None, 0.0, False))
+
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Does not support fused scaled dot product attention")
+    @parametrize(
+        "kernel",
+        [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION]
+        if SM80OrLater
+        else [SDPBackend.EFFICIENT_ATTENTION],
+    )
+    def test_invalid_fused_inputs_invalid_dtype(self, kernel: SDPBackend):
+        with sdp_kernel(**self.backend_map[kernel]):
+            # Invalid dtype for both Flash Attention and Mem Efficient Attention
+            device = "cuda"
             size = (2, 2, 3, 16)
-            make_tensor = partial(self.rand_tensor, device=device, dtype=torch.float64)
+            make_tensor = partial(self.rand_tensor, type="dense", device=device, dtype=torch.float64)
             q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
-            self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
-                q, k, v, None, 0.0, False, False))
-
-            make_tensor = partial(self.rand_tensor, device=device, dtype=torch.float32)
-            q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
-            self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
-                q, k, v, None, 0.0, False, False))
-
+            self.assertRaises(RuntimeError, lambda: torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, None, 0.0, False))
+
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA, "Does not support fused scaled dot product attention")
+    @parametrize(
+        "kernel",
+        [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION]
+        if SM80OrLater
+        else [SDPBackend.EFFICIENT_ATTENTION],
+    )
+    def test_invalid_fused_inputs_attn_mask_present(self, kernel: SDPBackend):
+        with sdp_kernel(**self.backend_map[kernel]):
             # Failures for unsupported SDP args
+            device = "cuda"
+            size = (2, 2, 3, 16)
+            make_tensor = partial(self.rand_tensor, type="dense", device=device, dtype=torch.float16)
             q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
-
-            # Needs attention weights
-            self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
-                q, k, v, None, 0.0, True, False))
-
             # Non-None attention mask
-            self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
-                q, k, v, torch.ones_like(q), 0.0, False, False))
+            self.assertRaises(RuntimeError, lambda: torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, torch.ones_like(q), 0.0, False))
 
-    # Test failing MHA when bias was NoneType
-    def test_bias_is_none(self):
-        x = torch.rand((1, 5, 10))
-        model = torch.nn.modules.activation.MultiheadAttention(10, 1, bias=False, batch_first=True)
-        model.eval()
-        model(x, x, x)
-        # completes without error
-
-    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "CUDA unavailable")
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "Does not support fused SDPA or pre-SM80 hardware")
     def test_unaligned_tensors(self):
+        # The alignment is depdent on arch so we specifiy SM80OrLater
         device = 'cuda'
         dtype = torch.float16
-        size = (2, 2, 8, 5)
-        q = torch.randn(size, device=device, dtype=dtype)
-        k = torch.randn(size, device=device, dtype=dtype)
-        v = torch.randn(size, device=device, dtype=dtype)
+        shape = (2, 2, 8, 5)
+        make_tensor = partial(self.rand_tensor, shape=shape, type=type, device=device, dtype=dtype)
+        q, k, v = make_tensor(), make_tensor(), make_tensor()
         with sdp_kernel(enable_flash=False, enable_mem_efficient=True, enable_math=False):
-            self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
-                q, k, v, None, 0.0, False, False))
+            self.assertRaises(RuntimeError, lambda: torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, None, 0.0, False))
 
-    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "CUDA unavailable")
-    def test_flash_fail_fp32t(self):
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "Does not support fused SDPA or pre-SM80 hardware")
+    def test_flash_fail_fp32(self):
         device = 'cuda'
         dtype = torch.float
-        size = (16, 16, 32, 32)
-        q = torch.randn(size, device=device, dtype=dtype)
-        k = torch.randn(size, device=device, dtype=dtype)
-        v = torch.randn(size, device=device, dtype=dtype)
+        shape = (16, 16, 32, 32)
+        make_tensor = partial(self.rand_tensor, shape=shape, type=type, device=device, dtype=dtype)
+        q, k, v = make_tensor(), make_tensor(), make_tensor()
         with sdp_kernel(enable_flash=True, enable_mem_efficient=False, enable_math=False):
-            self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
-                q, k, v, None, 0.0, False, False))
+            self.assertRaises(RuntimeError, lambda: torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, None, 0.0, False))
 
-    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "CUDA unavailable")
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "Does not support SDPA or pre-SM80 hardware")
     def test_flash_autocast_fp32_float16(self):
         device = 'cuda'
         dtype = torch.float
-        size = (16, 16, 32, 32)
-        q = torch.randn(size, device=device, dtype=dtype)
-        k = torch.randn(size, device=device, dtype=dtype)
-        v = torch.randn(size, device=device, dtype=dtype)
+        shape = (16, 16, 32, 32)
+        make_tensor = partial(self.rand_tensor, shape=shape, type=type, device=device, dtype=dtype)
+        q, k, v = make_tensor(), make_tensor(), make_tensor()
         with torch.autocast(device_type='cuda', dtype=torch.float16):
             with sdp_kernel(enable_flash=True, enable_mem_efficient=False, enable_math=False):
-                _ = torch.nn.functional._scaled_dot_product_attention(
-                    q, k, v, None, 0.0, False, False)
+                _ = torch.nn.functional.scaled_dot_product_attention(
+                    q, k, v, None, 0.0, False)
 
-    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "CUDA unavailable")
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "Does not support SDPA or pre-SM80 hardware")
     def test_flash_autocast_fp32_bfloat16(self):
         device = 'cuda'
         dtype = torch.float
-        size = (16, 16, 32, 32)
-        q = torch.randn(size, device=device, dtype=dtype)
-        k = torch.randn(size, device=device, dtype=dtype)
-        v = torch.randn(size, device=device, dtype=dtype)
+        shape = (16, 16, 32, 32)
+        make_tensor = partial(self.rand_tensor, shape=shape, type=type, device=device, dtype=dtype)
+        q, k, v = make_tensor(), make_tensor(), make_tensor()
         with torch.autocast(device_type=device, dtype=torch.bfloat16):
             with sdp_kernel(enable_flash=True, enable_mem_efficient=False, enable_math=False):
-                _ = torch.nn.functional._scaled_dot_product_attention(
-                    q, k, v, None, 0.0, False, False)
+                _ = torch.nn.functional.scaled_dot_product_attention(
+                    q, k, v, None, 0.0, False)
 
-    @parametrize("device", device_list)
-    def test_train_with_is_causal(self, device):
-        # training with is_causal
-        S, L, E, H = 1, 2, 2, 1
-        layer = nn.TransformerEncoderLayer(
-            d_model=2,
-            dim_feedforward=4,
-            nhead=H,
-            batch_first=True,
-            activation="gelu",
-            dropout=0,
-        )
-        criterion = nn.MSELoss()
-        encoder = nn.TransformerEncoder(layer, 2).to(device)
-        optimizer = optim.SGD(encoder.parameters(), lr=0.1, momentum=0.9)
-        encoder.train()
-
-        encoder.train()
-        optimizer.zero_grad()
-        inputs = torch.randn(S, L, E).to(device)
-
-        outputs = encoder(inputs, is_causal=True)
-
-        loss = criterion(outputs[:, 0:2, :], inputs[:, 0:2, :])
-        loss.backward()
-        optimizer.step()
-
-        # inference with is_causal
-        t_qvk = torch.randn((S, L, E), device=device, dtype=torch.float32)
-        mha = nn.MultiheadAttention(E, H).to(device)
-        attn_out, _ = mha(t_qvk, t_qvk, t_qvk, is_causal=True)
-
-        # Can't give both attn_mask AND is_causal
-        attn_mask = torch.randint(0, 2, size=(L, L), device=device, dtype=torch.bool)
-        with self.assertRaisesRegex(AssertionError, "Only allow causal mask or attn_mask"):
-            _ = mha(t_qvk, t_qvk, t_qvk, attn_mask=attn_mask, is_causal=True)
-
-        # # Passing a causal mask sets is_causal to 1
-        causal_mask = torch.triu(
-            torch.ones(L, L, device=inputs.device) * float('-inf'), diagonal=1
-        ).to(torch.bool)
-
-        mock_layer = MagicMock(torch.nn.MultiheadAttention(E, H), return_value=inputs)
-        encoder.layers[0] = mock_layer
-        outputs = encoder(inputs, mask=causal_mask)
-        mock_layer.assert_called_with(ANY, src_mask=ANY, is_causal=True, src_key_padding_mask=ANY)
-
-
-        # check expected numerical values with all kernels
-        self.is_causal_kernels(["math"], device)
-
-
-    def is_causal_kernels(self, kernels, device):
+    def test_incompatible_mask(self):
         def ones_tensor(*shape):
-            return torch.ones(shape, device=device, dtype=torch.float32).to(device)
+            return torch.ones(shape, dtype=torch.float32)
         S, L, E, H = 1, 2, 4, 1
         qkv = ones_tensor(S, L, E)
 
-        mha = nn.MultiheadAttention(E, H).to(device)
-        mha.in_proj_weight = Parameter(torch.ones((E * 3, E), device=device))
-        mha.out_proj.weight = Parameter(torch.ones((E, E), device=device))
-        expected = torch.ones(size=(S, L, E)).to(device) * 16
-
-        for kernel in kernels:
-            with torch.backends.cuda.sdp_kernel(
-                enable_math=(kernel == 'math'),
-                enable_flash=(kernel == 'flash'),
-                enable_mem_efficient=(kernel == 'meff')
-            ):
-                actual, _ = mha(qkv, qkv, qkv, need_weights=False, is_causal=True)
-                self.assertTrue(torch.equal(actual, expected))
-
-                if kernel != 'math':
-                    # fails if need_weights=False
-                    with self.assertRaisesRegex(RuntimeError, "No available kernel"):
-                        _ = mha(qkv, qkv, qkv, is_causal=True)
-                    # fails with embedding size not multiple of 4
-                    with self.assertRaisesRegex(RuntimeError, "No available kernel"):
-                        qkv_f, mha_f = ones_tensor(S, L, 2), nn.MultiheadAttention(2, H).to(device)
-                        _ = mha_f(qkv_f, qkv_f, qkv_f, need_weights=False, is_causal=True)
-                        torch.cuda.synchronize()
+        mha = nn.MultiheadAttention(E, H)
+        mha.in_proj_weight = Parameter(torch.ones((E * 3, E)))
+        mha.out_proj.weight = Parameter(torch.ones((E, E)))
+        qkv = qkv.to(float)
+        kpm = ones_tensor(S, L) * float("-inf")
+        am = ones_tensor(L, L).to(bool)
+
+        def func():
+            return mha(qkv, qkv, qkv, need_weights=False, key_padding_mask=kpm, attn_mask=am)
+
+        self.assertRaises(RuntimeError, func)
+
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "Does not support SDPA or pre-SM80 hardware")
+    @parametrize("batch_size", [1, 8])
+    @parametrize("seq_len_q", [4, 8, 64, 128, 256, 512, 1024, 2048])
+    @parametrize("seq_len_k", [4, 8, 64, 128, 256, 512, 1024, 2048])
+    @parametrize("head_dim", [8, 16, 32, 64, 128])
+    @parametrize("is_causal", [True, False])
+    @parametrize("dropout_p", [0.0])  # mem_efficient_attention does not support dropout
+    @parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])
+    def test_mem_efficient_attention_vs_math_ref_grads(self, batch_size: int, seq_len_q: int, seq_len_k: int,
+                                                       head_dim: int, is_causal: bool, dropout_p: float, dtype: torch.dtype):
+        n_heads = 4
+        query = torch.rand(batch_size, n_heads, seq_len_q, head_dim,
+                           device="cuda", dtype=dtype, requires_grad=True)
+        key = torch.rand(batch_size, n_heads, seq_len_k, head_dim, device="cuda",
+                         dtype=dtype, requires_grad=True)
+        value = torch.rand(batch_size, n_heads, seq_len_k, head_dim,
+                           device="cuda", dtype=dtype, requires_grad=True)
+
+        # Run the math kernel on low precision references
+        query_ref_lp = query.clone().detach().requires_grad_(True)
+        key_ref_lp = key.clone().detach().requires_grad_(True)
+        value_ref_lp = value.clone().detach().requires_grad_(True)
+
+        higher_precision_dtype = torch.float64 if dtype == torch.float32 else torch.float32
+
+        query_ref = query.clone().detach().to(higher_precision_dtype).requires_grad_(True)
+        key_ref = key.clone().detach().to(higher_precision_dtype).requires_grad_(True)
+        value_ref = value.clone().detach().to(higher_precision_dtype).requires_grad_(True)
+
+        # Create real output
+        with sdp_kernel(enable_mem_efficient=True, enable_flash=False, enable_math=False):
+            # See check_gpu_sm86_head_dim_128 in pytorch/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+            if isSM86Device and head_dim == 128:
+                self.assertRaises(RuntimeError, lambda: F.scaled_dot_product_attention(query, key, value,
+                                                                                       dropout_p=dropout_p, is_causal=is_causal))
+                return
+            else:
+                out = F.scaled_dot_product_attention(query, key, value, dropout_p=dropout_p, is_causal=is_causal)
+
+        with sdp_kernel(enable_math=True, enable_flash=False, enable_mem_efficient=False):
+            # High Precision Math Reference
+            out_ref = F.scaled_dot_product_attention(query_ref, key_ref, value_ref,
+                                                     dropout_p=dropout_p, is_causal=is_causal)
+            # Low Precision Math Reference
+            out_lp_ref = F.scaled_dot_product_attention(query_ref_lp, key_ref_lp, value_ref_lp,
+                                                        dropout_p=dropout_p, is_causal=is_causal)
+
+        upstream_grad = torch.rand_like(out, requires_grad=False)
+
+        out.backward(upstream_grad)
+        out_ref.backward(upstream_grad.to(out_ref.dtype))
+        out_lp_ref.backward(upstream_grad.to(out_lp_ref.dtype))
+
+        # [Note] Fused Tolerances
+        # Establish the numerical error between the "true" high precision math output
+        # and the low precision math reference. We use this reference for the atol
+        # And we use the default rtol for the low precision type.
+        # We then provide a fudge factor for gradients respectively to account
+        # for the use of the fused kernel rather than the eager implemntation.
+        out_deviation = out_ref - out_lp_ref
+        output_ref_atol = max(torch.abs(out_deviation).max().item(), default_atol[out.dtype])
+        output_ref_rtol = max(get_rtol(out_ref, out_lp_ref), default_rtol[out.dtype])
+
+        grad_q_deviation = query_ref.grad - query_ref_lp.grad
+        grad_q_ref_atol = max(torch.abs(grad_q_deviation).max().item(), default_atol[out.dtype])
+        grad_q_ref_rtol = max(get_rtol(query_ref.grad, query_ref_lp.grad), default_rtol[out.dtype])
+
+        # TODO: Investigate why grad_k needs larger tolerances
+        grad_k_deviation = key_ref.grad - key_ref_lp.grad
+        grad_k_ref_atol = max(7 * torch.abs(grad_k_deviation).max().item(), 7 * default_atol[out.dtype])
+        grad_k_ref_rtol = max(7 * get_rtol(key_ref.grad, key_ref_lp.grad), 7 * default_rtol[out.dtype])
+
+        grad_v_deviation = value_ref.grad - value_ref_lp.grad
+        grad_v_ref_atol = max(torch.abs(grad_v_deviation).max().item(), default_atol[out.dtype])
+        grad_v_ref_rtol = max(get_rtol(value_ref.grad, value_ref_lp.grad), default_rtol[out.dtype])
+
+        self.assertEqual(out, out_ref.to(out.dtype), atol=output_ref_atol, rtol=output_ref_rtol)
+        self.assertEqual(query.grad, query_ref.grad.to(query.grad.dtype),
+                         atol=grad_q_ref_atol, rtol=grad_q_ref_rtol)
+        self.assertEqual(key.grad, key_ref.grad.to(key.grad.dtype),
+                         atol=grad_k_ref_atol, rtol=grad_k_ref_rtol)
+        self.assertEqual(value.grad, value_ref.grad.to(value.grad.dtype),
+                         atol=grad_v_ref_atol, rtol=grad_v_ref_rtol)
+
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "Does not support SDPA or pre-SM80 hardware")
+    @parametrize("batch_size", [1, 8])
+    @parametrize("seq_len_q", [4, 8, 64, 128, 256, 512, 1024, 2048])
+    @parametrize("seq_len_k", [4, 8, 64, 128, 256, 512, 1024, 2048])
+    @parametrize("head_dim", [8, 16, 32, 64])
+    @parametrize("is_causal", [True, False])
+    @parametrize("dropout_p", [0.0, 0.22, 0.48])
+    @parametrize("dtype", [torch.float16, torch.bfloat16])
+    def test_flash_attention_vs_math_ref_grads(self, batch_size: int, seq_len_q: int, seq_len_k: int,
+                                               head_dim: int, is_causal: bool, dropout_p: float, dtype: torch.dtype):
+        n_heads = 4
+        query = torch.rand(batch_size, n_heads, seq_len_q, head_dim,
+                           device="cuda", dtype=dtype, requires_grad=True)
+        key = torch.rand(batch_size, n_heads, seq_len_k, head_dim, device="cuda",
+                         dtype=dtype, requires_grad=True)
+        value = torch.rand(batch_size, n_heads, seq_len_k, head_dim,
+                           device="cuda", dtype=dtype, requires_grad=True)
+
+        # Run the math kernel on low precision references
+        query_ref_lp = query.clone().detach().requires_grad_(True)
+        key_ref_lp = key.clone().detach().requires_grad_(True)
+        value_ref_lp = value.clone().detach().requires_grad_(True)
+
+        query_ref = query.clone().detach().to(torch.float32).requires_grad_(True)
+        key_ref = key.clone().detach().to(torch.float32).requires_grad_(True)
+        value_ref = value.clone().detach().to(torch.float32).requires_grad_(True)
+
+        is_dropout = dropout_p > 0.0
+
+        # Create real output
+        output_tuple = torch.ops.aten._scaled_dot_product_flash_attention(
+            query, key, value, dropout_p=dropout_p, is_causal=is_causal, return_debug_mask=True)
+        out = output_tuple[0]
+        dbug_mask = output_tuple[-1]
+
+        query_padding_mask = torch.ones(
+            1, seq_len_q, device="cuda", dtype=torch.bool)
+        key_padding_mask = torch.ones(
+            1, seq_len_k, device="cuda", dtype=torch.bool)
+
+        softmax_mask = self.convert_flash_attn_S_to_softmax(
+            dbug_mask, query_padding_mask, key_padding_mask, head_dim=head_dim, causal=is_causal)
+        dropout_mask = softmax_mask >= 0
+
+        if not is_dropout:
+            with sdp_kernel(enable_math=True, enable_flash=False, enable_mem_efficient=False):
+                # High Precision Math Reference
+                out_ref = F.scaled_dot_product_attention(
+                    query_ref, key_ref, value_ref, is_causal=is_causal)
+                # Low Precision Math Reference
+                out_lp_ref = F.scaled_dot_product_attention(
+                    query_ref_lp, key_ref_lp, value_ref_lp, is_causal=is_causal)
+        else:
+            # High Precision Math Reference
+            out_ref = torch.ops.aten._scaled_dot_product_attention_math(
+                query_ref, key_ref, value_ref, dropout_p=dropout_p, is_causal=is_causal, dropout_mask=dropout_mask)[0]
+            # Low Precision Math Reference
+            out_lp_ref = torch.ops.aten._scaled_dot_product_attention_math(
+                query_ref_lp, key_ref_lp, value_ref_lp, dropout_p=dropout_p, is_causal=is_causal, dropout_mask=dropout_mask)[0]
+
+        upstream_grad = torch.rand_like(out, requires_grad=False)
+
+        out.backward(upstream_grad)
+        out_ref.backward(upstream_grad.to(out_ref.dtype))
+        out_lp_ref.backward(upstream_grad.to(out_lp_ref.dtype))
+
+        # See [Note] Fused Tolerances above
+        out_deviation = out_ref - out_lp_ref
+        output_ref_atol = max(torch.abs(out_deviation).max().item(), default_atol[out.dtype])
+        output_ref_rtol = max(get_rtol(out_ref, out_lp_ref), default_rtol[out.dtype])
+
+        # TODO: Investigate why grad_q needs larger tolerances
+        grad_q_deviation = query_ref.grad - query_ref_lp.grad
+        grad_q_ref_atol = max(2 * torch.abs(grad_q_deviation).max().item(), default_atol[out.dtype])
+        grad_q_ref_rtol = max(get_rtol(query_ref.grad, query_ref_lp.grad), default_rtol[out.dtype])
+
+        grad_k_deviation = key_ref.grad - key_ref_lp.grad
+        grad_k_ref_atol = max(torch.abs(grad_k_deviation).max().item(), default_atol[out.dtype])
+        grad_k_ref_rtol = max(get_rtol(key_ref.grad, key_ref_lp.grad), default_rtol[out.dtype])
+
+        grad_v_deviation = value_ref.grad - value_ref_lp.grad
+        grad_v_ref_atol = max(torch.abs(grad_v_deviation).max().item(), default_atol[out.dtype])
+        grad_v_ref_rtol = max(get_rtol(value_ref.grad, value_ref_lp.grad), default_rtol[out.dtype])
+
+        self.assertEqual(out, out_ref.to(out.dtype), atol=output_ref_atol, rtol=output_ref_rtol)
+        self.assertEqual(query.grad, query_ref.grad.to(query.grad.dtype),
+                         atol=grad_q_ref_atol, rtol=grad_q_ref_rtol)
+        self.assertEqual(key.grad, key_ref.grad.to(key.grad.dtype),
+                         atol=grad_k_ref_atol, rtol=grad_k_ref_rtol)
+        self.assertEqual(value.grad, value_ref.grad.to(value.grad.dtype),
+                         atol=grad_v_ref_atol, rtol=grad_v_ref_rtol)
+
+    @parametrize("kernel", [SDPBackend.MATH, SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION])
+    @parametrize("device", ["cpu", "cuda"] if TEST_CUDA else ["cpu"])
+    def test_invalid_inputs_different_datatypes(self, kernel: SDPBackend, device: str):
+        with sdp_kernel(**self.backend_map[kernel]):
+            # Different datatypes
+            shape = (1, 4, 8, 16)
+            query = torch.randn(shape, dtype=torch.float32, device=device)
+            key = torch.randn(shape, dtype=torch.float16, device=device)
+            value = torch.randn(shape, dtype=torch.float16, device=device)
+            self.assertRaises(RuntimeError, lambda: F.scaled_dot_product_attention(query, key, value))
+
+    @parametrize("kernel", [SDPBackend.MATH, SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION])
+    @parametrize("device", ["cpu", "cuda"] if TEST_CUDA else ["cpu"])
+    def test_invalid_inputs_different_devices(self, kernel: SDPBackend, device: str):
+        # Different devices
+        shape = (1, 4, 8, 16)
+        if device == "cuda":
+            query = torch.randn(shape, dtype=torch.float32, device=device)
+            key = torch.randn(shape, dtype=torch.float16, device='cpu')
+            value = torch.randn(shape, dtype=torch.float16, device='cpu')
+            self.assertRaises(RuntimeError, lambda: F.scaled_dot_product_attention(query, key, value))
+
+    @parametrize("kernel", [SDPBackend.MATH, SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION])
+    @parametrize("device", ["cpu", "cuda"] if TEST_CUDA else ["cpu"])
+    def test_invalid_inputs_1_dimensional_inputs(self, kernel: SDPBackend, device: str):
+        with sdp_kernel(**self.backend_map[kernel]):
+            # 1 dimensional input
+            shape = (1, 4)
+            query = torch.randn(4, dtype=torch.float16, device=device)
+            key = torch.randn(shape, dtype=torch.float16, device=device)
+            value = torch.randn(shape, dtype=torch.float16, device=device)
+            self.assertRaises(RuntimeError, lambda: F.scaled_dot_product_attention(query, key, value))
 
-    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater, "CUDA unavailable")
-    def test_is_causal_gpu(self):
-        device = 'cuda'
-        self.is_causal_kernels(["math", "meff"], device)
 
 # TODO: Replace this with instantiate_device_type_tests() to take advantage of test framework support for
 # cross device / dtype testing.
 instantiate_parametrized_tests(TestTransformers)
+instantiate_parametrized_tests(TestSDPA)
 
 if __name__ == '__main__':
     run_tests()
diff --git a/test/test_typing.py b/test/test_typing.py
index dc083e12d603..3f6589e6bf08 100644
--- a/test/test_typing.py
+++ b/test/test_typing.py
@@ -10,6 +10,8 @@
 
 import pytest
 
+from torch.testing._internal.common_utils import run_tests
+
 try:
     from mypy import api
 except ImportError:
@@ -232,5 +234,5 @@ def _test_reveal(path: str, reveal: str, expected_reveal: str, lineno: int) -> N
         raise AssertionError(_REVEAL_MSG.format(lineno, expected_reveal, reveal))
 
 
-if __name__ == '__main__':
-    pytest.main([__file__])
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py
index 77a1940a7f50..bb9107b61812 100644
--- a/test/test_unary_ufuncs.py
+++ b/test/test_unary_ufuncs.py
@@ -8,7 +8,7 @@
 import random
 import unittest
 
-from torch._six import inf, nan
+from torch import inf, nan
 from torch.testing._internal.common_utils import (
     TestCase,
     run_tests,
diff --git a/test/test_utils.py b/test/test_utils.py
index cb65e0c8b59b..184e2d33f5ba 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -104,7 +104,7 @@ def test_checkpoint_trigger(self):
         class Net(nn.Module):
 
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.counter = 0
 
             def forward(self, input_var):
@@ -190,7 +190,7 @@ def test_checkpoint(self):
     def test_checkpoint_module_list(self):
         class ModuleListNet(nn.Module):
             def __init__(self):
-                super(ModuleListNet, self).__init__()
+                super().__init__()
                 module_list = [
                     nn.Linear(100, 50),
                     nn.ReLU(),
@@ -779,7 +779,7 @@ def test_load_standalone(self):
             shutil.rmtree(build_dir)
 
 
-class DummyXPUModule(object):
+class DummyXPUModule:
     @staticmethod
     def is_available():
         return True
@@ -885,6 +885,11 @@ def test_cc_compiler_is_ok(self):
 
 class TestTraceback(TestCase):
     def test_basic(self):
+        # We can't xfail this test as it leaves the traceback in such a bad
+        # state that xfail itself fails.
+        if sys.version_info >= (3, 11):
+            self.skipTest("Fails on 3.11")
+
         source = '''\
 def f(x):
     x = x * 3
diff --git a/test/test_vulkan.py b/test/test_vulkan.py
index 37b52d3fc98c..a9093f4191f5 100644
--- a/test/test_vulkan.py
+++ b/test/test_vulkan.py
@@ -67,7 +67,7 @@ def test_conv(self):
 
         class Conv2D(torch.nn.Module):
             def __init__(self):
-                super(Conv2D, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(conv_weight_shape), requires_grad=False)
                 self.bias = torch.nn.Parameter(torch.rand(conv_bias_shape), requires_grad=False)
                 self.strides = strides
@@ -87,7 +87,7 @@ def forward(self, x):
 
         class Conv2DRelu(torch.nn.Module):
             def __init__(self):
-                super(Conv2DRelu, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(conv_weight_shape), requires_grad=False)
                 self.bias = torch.nn.Parameter(torch.rand(conv_bias_shape), requires_grad=False)
                 self.strides = strides
@@ -126,7 +126,7 @@ def forward(self, x):
 
         class Conv2DHardtanh(torch.nn.Module):
             def __init__(self):
-                super(Conv2DHardtanh, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(conv_weight_shape), requires_grad=False)
                 self.bias = torch.nn.Parameter(torch.rand(conv_bias_shape), requires_grad=False)
                 self.strides = strides
diff --git a/test/test_weak.py b/test/test_weak.py
index 6e2b77e3026b..a59dc491c135 100644
--- a/test/test_weak.py
+++ b/test/test_weak.py
@@ -512,7 +512,7 @@ def __getitem__(self, key):
 
         d = self._empty_mapping()
 
-        class badseq(object):
+        class badseq:
             def __iter__(self):
                 return self
 
diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py
index 17ac2d9e7fc3..ab764a61d8a9 100644
--- a/test/test_xnnpack_integration.py
+++ b/test/test_xnnpack_integration.py
@@ -191,7 +191,7 @@ class TestXNNPACKSerDes(TestCase):
     def test_linear(self, batch_size, data_shape, weight_output_dim, use_bias):
         class Linear(torch.nn.Module):
             def __init__(self, weight, bias=None):
-                super(Linear, self).__init__()
+                super().__init__()
                 self.weight = weight
                 self.bias = bias
 
@@ -200,7 +200,7 @@ def forward(self, x):
 
         class LinearPrePacked(torch.nn.Module):
             def __init__(self, weight, bias=None):
-                super(LinearPrePacked, self).__init__()
+                super().__init__()
                 self.packed_weight_bias = torch.ops.prepacked.linear_clamp_prepack(weight, bias)
 
             def forward(self, x):
@@ -266,7 +266,7 @@ def test_conv2d(self,
                     format):
         class Conv2D(torch.nn.Module):
             def __init__(self, weight, bias, strides, paddings, dilations, groups):
-                super(Conv2D, self).__init__()
+                super().__init__()
                 self.weight = weight
                 self.bias = bias
                 self.strides = strides
@@ -280,7 +280,7 @@ def forward(self, x):
 
         class Conv2DPrePacked(torch.nn.Module):
             def __init__(self, weight, bias, strides, paddings, dilations, groups):
-                super(Conv2DPrePacked, self).__init__()
+                super().__init__()
                 self.packed_weight_bias = torch.ops.prepacked.conv2d_clamp_prepack(weight, bias,
                                                                                    strides, paddings, dilations, groups)
 
@@ -367,7 +367,7 @@ def test_conv2d_transpose(self,
                               format):
         class Conv2DT(torch.nn.Module):
             def __init__(self, weight, bias, strides, paddings, output_paddings, dilations, groups):
-                super(Conv2DT, self).__init__()
+                super().__init__()
                 self.weight = weight
                 self.bias = bias
                 self.strides = strides
@@ -382,7 +382,7 @@ def forward(self, x):
 
         class Conv2DTPrePacked(torch.nn.Module):
             def __init__(self, weight, bias, strides, paddings, output_paddings, dilations, groups):
-                super(Conv2DTPrePacked, self).__init__()
+                super().__init__()
                 self.packed_weight_bias = torch.ops.prepacked.conv2d_transpose_clamp_prepack(weight, bias,
                                                                                              strides, paddings,
                                                                                              output_paddings,
@@ -475,7 +475,7 @@ def test_combined_model(self,
         class M(torch.nn.Module):
             def __init__(self, conv_weight, conv_bias, linear_weight, linear_bias,
                          strides, paddings, dilations, groups):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv_weight = conv_weight
                 self.conv_bias = conv_bias
                 self.linear_weight = linear_weight
@@ -495,7 +495,7 @@ def forward(self, x):
         class MPrePacked(torch.nn.Module):
             def __init__(self, conv_weight, conv_bias, linear_weight, linear_bias,
                          strides, paddings, dilations, groups):
-                super(MPrePacked, self).__init__()
+                super().__init__()
                 self.conv2d_clamp_run_weight_bias = \
                     torch.ops.prepacked.conv2d_clamp_prepack(conv_weight, conv_bias,
                                                              strides, paddings, dilations, groups)
@@ -623,7 +623,7 @@ def test_linear(self):
 
         class Linear(torch.nn.Module):
             def __init__(self):
-                super(Linear, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(weight_shape), requires_grad=False)
                 self.bias = torch.nn.Parameter(torch.rand((weight_output_dim)), requires_grad=False)
 
@@ -632,7 +632,7 @@ def forward(self, x):
 
         class LinearNoBias(torch.nn.Module):
             def __init__(self):
-                super(LinearNoBias, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(weight_shape), requires_grad=False)
 
             def forward(self, x):
@@ -670,7 +670,7 @@ def forward(self, x):
 
         class Conv2D(torch.nn.Module):
             def __init__(self):
-                super(Conv2D, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(conv_weight_shape), requires_grad=False)
                 self.bias = torch.nn.Parameter(torch.rand(conv_bias_shape), requires_grad=False)
                 self.strides = strides
@@ -684,7 +684,7 @@ def forward(self, x):
 
         class Conv2DT(torch.nn.Module):
             def __init__(self):
-                super(Conv2DT, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(conv_transpose_weight_shape), requires_grad=False)
                 self.bias = torch.nn.Parameter(torch.rand(conv_bias_shape), requires_grad=False)
                 self.strides = strides
@@ -720,7 +720,7 @@ def forward(self, x):
 
         class M(torch.nn.Module):
             def __init__(self, activation_fn=F.relu):
-                super(M, self).__init__()
+                super().__init__()
                 self.conv_weight = torch.nn.Parameter(torch.rand(conv_weight_shape), requires_grad=False)
                 self.conv_bias = torch.nn.Parameter(torch.rand((conv_bias_shape)), requires_grad=False)
                 self.linear_weight = torch.nn.Parameter(torch.rand(linear_weight_shape), requires_grad=False)
@@ -832,7 +832,7 @@ def forward(self, x):
 
         class MFusionAntiPattern(torch.nn.Module):
             def __init__(self):
-                super(MFusionAntiPattern, self).__init__()
+                super().__init__()
                 self.linear_weight = torch.nn.Parameter(torch.rand(linear_weight_shape), requires_grad=False)
                 self.linear_bias = torch.nn.Parameter(torch.rand((weight_output_dim)), requires_grad=False)
                 self.strides = strides
@@ -860,7 +860,7 @@ def forward(self, x):
 
         class MFusionAntiPatternParamMinMax(torch.nn.Module):
             def __init__(self):
-                super(MFusionAntiPatternParamMinMax, self).__init__()
+                super().__init__()
                 self.linear_weight = torch.nn.Parameter(torch.rand(linear_weight_shape), requires_grad=False)
                 self.linear_bias = torch.nn.Parameter(torch.rand((weight_output_dim)), requires_grad=False)
                 self.strides = strides
@@ -893,7 +893,7 @@ def test_decomposed_linear(self):
 
         class DecomposedLinearAddmm(torch.nn.Module):
             def __init__(self):
-                super(DecomposedLinearAddmm, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(weight_shape), requires_grad=False)
                 self.bias = torch.nn.Parameter(torch.rand((weight_output_dim)), requires_grad=False)
 
@@ -903,7 +903,7 @@ def forward(self, x):
 
         class DecomposedLinearMatmulAdd(torch.nn.Module):
             def __init__(self):
-                super(DecomposedLinearMatmulAdd, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(weight_shape), requires_grad=False)
                 self.bias = torch.nn.Parameter(torch.rand((weight_output_dim)), requires_grad=False)
 
@@ -915,7 +915,7 @@ def forward(self, x):
 
         class DecomposedLinearMatmul(torch.nn.Module):
             def __init__(self):
-                super(DecomposedLinearMatmul, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(weight_shape), requires_grad=False)
                 self.bias = torch.nn.Parameter(torch.rand((weight_output_dim)), requires_grad=False)
 
@@ -1018,7 +1018,7 @@ def test_conv1d_basic(self):
 
             class Conv1D(torch.nn.Module):
                 def __init__(self):
-                    super(Conv1D, self).__init__()
+                    super().__init__()
                     self.weight = torch.nn.Parameter(torch.rand(conv_weight_shape), requires_grad=False)
                     self.bias = torch.nn.Parameter(torch.rand(conv_bias_shape), requires_grad=False)
                     self.stride = stride
@@ -1080,7 +1080,7 @@ def test_conv1d_with_relu_fc(self):
 
             class Net(torch.nn.Module):
                 def __init__(self):
-                    super(Net, self).__init__()
+                    super().__init__()
                     self.conv_weight = torch.nn.Parameter(torch.rand(conv_weight_shape), requires_grad=False)
                     self.conv_bias = torch.nn.Parameter(torch.rand(conv_bias_shape), requires_grad=False)
                     self.stride = stride
diff --git a/third_party/LICENSES_BUNDLED.txt b/third_party/LICENSES_BUNDLED.txt
index d03c1c2137e8..45b7a2c2c4de 100644
--- a/third_party/LICENSES_BUNDLED.txt
+++ b/third_party/LICENSES_BUNDLED.txt
@@ -1,6 +1,11 @@
 The Pytorch repository and source distributions bundle several libraries that are 
 compatibly licensed.  We list these here.
 
+Name: DCGM
+License: Apache-2.0
+Files: third_party/kineto/libkineto/third_party/dynolog/third_party/DCGM
+  For details, see: third_party/kineto/libkineto/third_party/dynolog/third_party/DCGM/LICENSE
+
 Name: FP16
 License: MIT
 Files: third_party/FP16
@@ -21,6 +26,11 @@ License: BSD-3-Clause
 Files: third_party/QNNPACK
   For details, see: third_party/QNNPACK/LICENSE
 
+Name: VulkanMemoryAllocator
+License: MIT
+Files: third_party/VulkanMemoryAllocator
+  For details, see: third_party/VulkanMemoryAllocator/LICENSE.txt
+
 Name: XNNPACK
 License: BSD-3-Clause
 Files: third_party/XNNPACK
@@ -29,27 +39,39 @@ Files: third_party/XNNPACK
 Name: benchmark
 License: Apache-2.0
 Files: third_party/benchmark,
-     third_party/protobuf/third_party/benchmark,
+     third_party/onnx/third_party/benchmark,
      third_party/onnx-tensorrt/third_party/onnx/third_party/benchmark,
-     third_party/onnx/third_party/benchmark
+     third_party/protobuf/third_party/benchmark
   For details, see: third_party/benchmark/LICENSE,
-     third_party/protobuf/third_party/benchmark/LICENSE,
+     third_party/onnx/third_party/benchmark/LICENSE,
      third_party/onnx-tensorrt/third_party/onnx/third_party/benchmark/LICENSE,
-     third_party/onnx/third_party/benchmark/LICENSE
+     third_party/protobuf/third_party/benchmark/LICENSE
 
 Name: clog
 License: BSD-2-Clause
-Files: third_party/cpuinfo/deps/clog,
-     third_party/fbgemm/third_party/cpuinfo/deps/clog,
-     third_party/QNNPACK/deps/clog
-  For details, see: third_party/cpuinfo/deps/clog/LICENSE,
-     third_party/fbgemm/third_party/cpuinfo/deps/clog/LICENSE,
-     third_party/QNNPACK/deps/clog/LICENSE
+Files: third_party/QNNPACK/deps/clog,
+     third_party/cpuinfo/deps/clog,
+     third_party/fbgemm/third_party/cpuinfo/deps/clog
+  For details, see: third_party/QNNPACK/deps/clog/LICENSE,
+     third_party/cpuinfo/deps/clog/LICENSE,
+     third_party/fbgemm/third_party/cpuinfo/deps/clog/LICENSE
+
+Name: colorama
+License: BSD-3-Clause
+Files: third_party/kineto/libkineto/third_party/dynolog/third_party/DCGM/testing/python3/libs_3rdparty/colorama
+  For details, see: third_party/kineto/libkineto/third_party/dynolog/third_party/DCGM/testing/python3/libs_3rdparty/colorama/LICENSE.txt
 
 Name: cpplint
 License: BSD-3-Clause
-Files: third_party/nlohmann/tools/cpplint
-  For details, see: third_party/nlohmann/tools/cpplint/LICENSE
+Files: third_party/kineto/libkineto/third_party/dynolog/third_party/json/third_party/cpplint,
+     third_party/nlohmann/tools/cpplint
+  For details, see: third_party/kineto/libkineto/third_party/dynolog/third_party/json/third_party/cpplint/LICENSE,
+     third_party/nlohmann/tools/cpplint/LICENSE
+
+Name: cpr
+License: MIT
+Files: third_party/kineto/libkineto/third_party/dynolog/third_party/cpr
+  For details, see: third_party/kineto/libkineto/third_party/dynolog/third_party/cpr/LICENSE
 
 Name: cpuinfo
 License: BSD-2-Clause
@@ -63,6 +85,13 @@ License: MIT
 Files: third_party/cudnn_frontend
   For details, see: third_party/cudnn_frontend/LICENSE.txt
 
+Name: cutlass
+License: BSD-3-Clause
+Files: third_party/cutlass,
+     third_party/fbgemm/third_party/cutlass
+  For details, see: third_party/cutlass/LICENSE.txt,
+     third_party/fbgemm/third_party/cutlass/LICENSE.txt
+
 Name: dart
 License: Apache-2.0
 Files: third_party/flatbuffers/dart
@@ -70,8 +99,15 @@ Files: third_party/flatbuffers/dart
 
 Name: doctest
 License: MIT
-Files: third_party/nlohmann/tests/thirdparty/doctest
-  For details, see: third_party/nlohmann/tests/thirdparty/doctest/LICENSE.txt
+Files: third_party/kineto/libkineto/third_party/dynolog/third_party/json/test/thirdparty/doctest,
+     third_party/nlohmann/tests/thirdparty/doctest
+  For details, see: third_party/kineto/libkineto/third_party/dynolog/third_party/json/test/thirdparty/doctest/LICENSE.txt,
+     third_party/nlohmann/tests/thirdparty/doctest/LICENSE.txt
+
+Name: dynolog
+License: MIT
+Files: third_party/kineto/libkineto/third_party/dynolog
+  For details, see: third_party/kineto/libkineto/third_party/dynolog/LICENSE
 
 Name: eigen
 License: BSD-3-Clause
@@ -95,10 +131,12 @@ Files: third_party/flatbuffers
 
 Name: fmt
 License: MIT with exception
-Files: third_party/kineto/libkineto/third_party/fmt,
-     third_party/fmt
-  For details, see: third_party/kineto/libkineto/third_party/fmt/LICENSE.rst,
-     third_party/fmt/LICENSE.rst
+Files: third_party/fmt,
+     third_party/kineto/libkineto/third_party/dynolog/third_party/fmt,
+     third_party/kineto/libkineto/third_party/fmt
+  For details, see: third_party/fmt/LICENSE.rst,
+     third_party/kineto/libkineto/third_party/dynolog/third_party/fmt/LICENSE.rst,
+     third_party/kineto/libkineto/third_party/fmt/LICENSE.rst
 
 Name: foxi
 License: MIT
@@ -112,14 +150,14 @@ Files: third_party/gemmlowp/gemmlowp
 
 Name: generator
 License: Apache-2.0
-Files: third_party/kineto/libkineto/third_party/googletest/googlemock/scripts/generator,
+Files: third_party/fbgemm/third_party/googletest/googlemock/scripts/generator,
      third_party/googletest/googlemock/scripts/generator,
-     third_party/fbgemm/third_party/googletest/googlemock/scripts/generator,
+     third_party/kineto/libkineto/third_party/googletest/googlemock/scripts/generator,
      third_party/protobuf/third_party/googletest/googlemock/scripts/generator,
      third_party/tensorpipe/third_party/googletest/googlemock/scripts/generator
-  For details, see: third_party/kineto/libkineto/third_party/googletest/googlemock/scripts/generator/LICENSE,
+  For details, see: third_party/fbgemm/third_party/googletest/googlemock/scripts/generator/LICENSE,
      third_party/googletest/googlemock/scripts/generator/LICENSE,
-     third_party/fbgemm/third_party/googletest/googlemock/scripts/generator/LICENSE,
+     third_party/kineto/libkineto/third_party/googletest/googlemock/scripts/generator/LICENSE,
      third_party/protobuf/third_party/googletest/googlemock/scripts/generator/LICENSE,
      third_party/tensorpipe/third_party/googletest/googlemock/scripts/generator/LICENSE
 
@@ -130,31 +168,33 @@ Files: third_party/gloo
 
 Name: googlemock
 License: BSD-3-Clause
-Files: third_party/kineto/libkineto/third_party/googletest/googlemock,
-     third_party/fbgemm/third_party/googletest/googlemock,
+Files: third_party/fbgemm/third_party/googletest/googlemock,
+     third_party/kineto/libkineto/third_party/googletest/googlemock,
      third_party/protobuf/third_party/googletest/googlemock,
      third_party/tensorpipe/third_party/googletest/googlemock
-  For details, see: third_party/kineto/libkineto/third_party/googletest/googlemock/LICENSE,
-     third_party/fbgemm/third_party/googletest/googlemock/LICENSE,
+  For details, see: third_party/fbgemm/third_party/googletest/googlemock/LICENSE,
+     third_party/kineto/libkineto/third_party/googletest/googlemock/LICENSE,
      third_party/protobuf/third_party/googletest/googlemock/LICENSE,
      third_party/tensorpipe/third_party/googletest/googlemock/LICENSE
 
 Name: googletest
 License: BSD-3-Clause
-Files: third_party/kineto/libkineto/third_party/googletest,
-     third_party/kineto/libkineto/third_party/googletest/googletest,
-     third_party/googletest,
-     third_party/fbgemm/third_party/googletest,
+Files: third_party/fbgemm/third_party/googletest,
      third_party/fbgemm/third_party/googletest/googletest,
+     third_party/googletest,
+     third_party/kineto/libkineto/third_party/dynolog/third_party/googletest,
+     third_party/kineto/libkineto/third_party/googletest,
+     third_party/kineto/libkineto/third_party/googletest/googletest,
      third_party/protobuf/third_party/googletest,
      third_party/protobuf/third_party/googletest/googletest,
      third_party/tensorpipe/third_party/googletest,
      third_party/tensorpipe/third_party/googletest/googletest
-  For details, see: third_party/kineto/libkineto/third_party/googletest/LICENSE,
-     third_party/kineto/libkineto/third_party/googletest/googletest/LICENSE,
-     third_party/googletest/LICENSE,
-     third_party/fbgemm/third_party/googletest/LICENSE,
+  For details, see: third_party/fbgemm/third_party/googletest/LICENSE,
      third_party/fbgemm/third_party/googletest/googletest/LICENSE,
+     third_party/googletest/LICENSE,
+     third_party/kineto/libkineto/third_party/dynolog/third_party/googletest/LICENSE,
+     third_party/kineto/libkineto/third_party/googletest/LICENSE,
+     third_party/kineto/libkineto/third_party/googletest/googletest/LICENSE,
      third_party/protobuf/third_party/googletest/LICENSE,
      third_party/protobuf/third_party/googletest/googletest/LICENSE,
      third_party/tensorpipe/third_party/googletest/LICENSE,
@@ -167,6 +207,11 @@ Files: third_party/ideep/mkl-dnn/tests/gtest,
   For details, see: third_party/ideep/mkl-dnn/tests/gtest/LICENSE,
      third_party/ideep/mkl-dnn/third_party/oneDNN/tests/gtests/gtest/LICENSE
 
+Name: hipify_torch
+License: MIT
+Files: third_party/fbgemm/third_party/hipify_torch
+  For details, see: third_party/fbgemm/third_party/hipify_torch/LICENSE.txt
+
 Name: ideep
 License: MIT
 Files: third_party/ideep
@@ -222,21 +267,26 @@ License: Apache-2.0
 Files: third_party/ideep/mkl-dnn/third_party/oneDNN
   For details, see: third_party/ideep/mkl-dnn/third_party/oneDNN/LICENSE
 
-Name: onnx
-License: MIT
-Files: third_party/onnx-tensorrt/third_party/onnx
-  For details, see: third_party/onnx-tensorrt/third_party/onnx/LICENSE
-
 Name: onnx
 License: Apache-2.0
 Files: third_party/onnx
   For details, see: third_party/onnx/LICENSE
 
+Name: onnx
+License: MIT
+Files: third_party/onnx-tensorrt/third_party/onnx
+  For details, see: third_party/onnx-tensorrt/third_party/onnx/LICENSE
+
 Name: onnx-tensorrt
 License: MIT
 Files: third_party/onnx-tensorrt
   For details, see: third_party/onnx-tensorrt/LICENSE
 
+Name: pfs
+License: Apache-2.0
+Files: third_party/kineto/libkineto/third_party/dynolog/third_party/pfs
+  For details, see: third_party/kineto/libkineto/third_party/dynolog/third_party/pfs/LICENSE
+
 Name: protobuf
 License: BSD-3-Clause
 Files: third_party/protobuf
@@ -254,13 +304,13 @@ Files: third_party/pthreadpool
 
 Name: pybind11
 License: BSD-3-Clause
-Files: third_party/pybind11,
+Files: third_party/onnx/third_party/pybind11,
      third_party/onnx-tensorrt/third_party/onnx/third_party/pybind11,
-     third_party/onnx/third_party/pybind11,
+     third_party/pybind11,
      third_party/tensorpipe/third_party/pybind11
-  For details, see: third_party/pybind11/LICENSE,
+  For details, see: third_party/onnx/third_party/pybind11/LICENSE,
      third_party/onnx-tensorrt/third_party/onnx/third_party/pybind11/LICENSE,
-     third_party/onnx/third_party/pybind11/LICENSE,
+     third_party/pybind11/LICENSE,
      third_party/tensorpipe/third_party/pybind11/LICENSE
 
 Name: python-peachpy
@@ -298,6 +348,11 @@ License: BSD-3-Clause
 Files: third_party/tensorpipe
   For details, see: third_party/tensorpipe/LICENSE.txt
 
+Name: test
+License: MIT with exception
+Files: third_party/kineto/libkineto/third_party/dynolog/third_party/cpr/test
+  For details, see: third_party/kineto/libkineto/third_party/dynolog/third_party/cpr/test/LICENSE
+
 Name: zstd
 License: BSD-3-Clause
 Files: third_party/zstd
diff --git a/third_party/XNNPACK b/third_party/XNNPACK
index ae108ef49aa5..51a987591a6f 160000
--- a/third_party/XNNPACK
+++ b/third_party/XNNPACK
@@ -1 +1 @@
-Subproject commit ae108ef49aa5623b896fc93d4298c49d1750d9ba
+Subproject commit 51a987591a6fc9f0fc0707077f53d763ac132cbf
diff --git a/third_party/cudnn_frontend b/third_party/cudnn_frontend
index 171a7a986f7f..81a041a68245 160000
--- a/third_party/cudnn_frontend
+++ b/third_party/cudnn_frontend
@@ -1 +1 @@
-Subproject commit 171a7a986f7fbd9ed71bd0cf3c7ad4f55843d6b3
+Subproject commit 81a041a68245cd8f871c43bbbbd5b6b627979a30
diff --git a/third_party/fbgemm b/third_party/fbgemm
index 80d64206c078..03b204667670 160000
--- a/third_party/fbgemm
+++ b/third_party/fbgemm
@@ -1 +1 @@
-Subproject commit 80d64206c07879fd4683be66873de7cefa1a0a71
+Subproject commit 03b2046676707da64504e898490ab46104d4682a
diff --git a/third_party/fmt b/third_party/fmt
index 7bdf0628b127..a33701196adf 160000
--- a/third_party/fmt
+++ b/third_party/fmt
@@ -1 +1 @@
-Subproject commit 7bdf0628b1276379886c7f6dda2cef2b3b374f0b
+Subproject commit a33701196adfad74917046096bf5a2aa0ab0bb50
diff --git a/third_party/generate-xnnpack-wrappers.py b/third_party/generate-xnnpack-wrappers.py
index c1bb51ad9cf5..8df048992c01 100644
--- a/third_party/generate-xnnpack-wrappers.py
+++ b/third_party/generate-xnnpack-wrappers.py
@@ -4,6 +4,7 @@
 import collections
 import os
 import sys
+import logging
 
 BANNER = "Auto-generated by generate-wrappers.py script. Do not modify"
 WRAPPER_SRC_NAMES = {
@@ -11,6 +12,7 @@
     "PROD_SCALAR_AARCH32_MICROKERNEL_SRCS" : "defined(__arm__)",
     "PROD_NEON_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
     "PROD_NEONFP16_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
+    "PROD_NEON_AARCH64_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
     "PROD_NEONFMA_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
     "PROD_AARCH64_NEON_MICROKERNEL_SRCS": "defined(__aarch64__)",
     "PROD_NEONV8_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
@@ -27,14 +29,50 @@
     "PROD_AVX2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    "PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)",
     "AARCH64_ASM_MICROKERNEL_SRCS": "defined(__aarch64__)",
+
+    # add additoonal:
+    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
+    "ALL_ARMSIMD32_MICROKERNEL_SRCS": "defined(__arm__)",
+    "ALL_AVX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    "ALL_AVX2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    "ALL_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+
+    'ALL_AVX512SKX_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_AVX512VBMI_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_F16C_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_FMA3_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_FP16ARITH_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEON_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEON_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
+    'ALL_NEONBF16_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONDOT_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONFMA_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
+    'ALL_NEONFP16_MICROKERNEL_SRCS':"defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONFP16ARITH_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
+    'ALL_NEONV8_MICROKERNEL_SRCS': "defined(__aarch64__)",
+    'ALL_SCALAR_MICROKERNEL_SRCS': "defined(__arm__)",
+    'ALL_SSE_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_SSE2_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_SSE41_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_SSSE3_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_XOP_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'AARCH32_ASM_MICROKERNEL_SRCS': "defined(__arm__)",
+    "PROD_FP16ARITH_MICROKERNEL_SRCS": "defined(__aarch64__)",
+    "PROD_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
+    "PROD_SCALAR_MICROKERNEL_SRCS": "defined(__arm__)",
+
 }
 
-SRC_NAMES = [
+SRC_NAMES = set([
     "OPERATOR_SRCS",
     "SUBGRAPH_SRCS",
     "LOGGING_SRCS",
+    "XNNPACK_SRCS",
     "HOT_SRCS",
     "TABLE_SRCS",
     "JIT_SRCS",
@@ -52,15 +90,83 @@
     "PROD_AVX2_MICROKERNEL_SRCS",
     "PROD_AVX512F_MICROKERNEL_SRCS",
     "PROD_AVX512SKX_MICROKERNEL_SRCS",
-]
+    "PROD_SCALAR_MICROKERNEL_SRCS",
+    "PROD_SCALAR_AARCH32_MICROKERNEL_SRCS",
+    "PROD_SCALAR_RISCV_MICROKERNEL_SRCS",
+    "PROD_ARMSIMD32_MICROKERNEL_SRCS",
+    "PROD_FP16ARITH_MICROKERNEL_SRCS",
+    "PROD_NEON_MICROKERNEL_SRCS",
+    "PROD_NEONFP16_MICROKERNEL_SRCS",
+    "PROD_NEONFMA_MICROKERNEL_SRCS",
+    "PROD_NEON_AARCH64_MICROKERNEL_SRCS",
+    "PROD_NEONV8_MICROKERNEL_SRCS",
+    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS",
+    "PROD_NEONDOT_MICROKERNEL_SRCS",
+    "PROD_SSE2_MICROKERNEL_SRCS",
+    "PROD_SSSE3_MICROKERNEL_SRCS",
+    "PROD_SSE41_MICROKERNEL_SRCS",
+    "PROD_AVX_MICROKERNEL_SRCS",
+    "PROD_F16C_MICROKERNEL_SRCS",
+    "PROD_AVX512VBMI_MICROKERNEL_SRCS",
+    "PROD_NEONFP16ARITH_MICROKERNEL_SRCS",
 
-def update_sources(xnnpack_path):
+    # new adding libs:
+    'ALL_ARMSIMD32_MICROKERNEL_SRCS',
+    'ALL_AVX_MICROKERNEL_SRCS',
+    'ALL_AVX2_MICROKERNEL_SRCS',
+    'ALL_AVX512F_MICROKERNEL_SRCS',
+    'ALL_AVX512SKX_MICROKERNEL_SRCS',
+    'ALL_AVX512VBMI_MICROKERNEL_SRCS',
+    'ALL_F16C_MICROKERNEL_SRCS',
+    'ALL_FMA3_MICROKERNEL_SRCS',
+    'ALL_FP16ARITH_MICROKERNEL_SRCS',
+    'ALL_HEXAGON_MICROKERNEL_SRCS',
+    'ALL_NEON_MICROKERNEL_SRCS',
+    'ALL_NEON_AARCH64_MICROKERNEL_SRCS',
+    'ALL_NEONBF16_MICROKERNEL_SRCS',
+    'ALL_NEONBF16_AARCH64_MICROKERNEL_SRCS',
+    'ALL_NEONDOT_MICROKERNEL_SRCS',
+    'ALL_NEONFMA_MICROKERNEL_SRCS',
+    'ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS',
+    'ALL_NEONFP16_MICROKERNEL_SRCS',
+    'ALL_NEONFP16ARITH_MICROKERNEL_SRCS',
+    'ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS',
+    'ALL_NEONV8_MICROKERNEL_SRCS',
+    'ALL_SCALAR_MICROKERNEL_SRCS',
+    'ALL_SSE_MICROKERNEL_SRCS',
+    'ALL_SSE2_MICROKERNEL_SRCS',
+    'ALL_SSE41_MICROKERNEL_SRCS',
+    'ALL_SSSE3_MICROKERNEL_SRCS',
+    'ALL_WASM_MICROKERNEL_SRCS',
+    'ALL_WASMRELAXEDSIMD_MICROKERNEL_SRCS',
+    'ALL_WASMSIMD_MICROKERNEL_SRCS',
+    'ALL_XOP_MICROKERNEL_SRCS',
+    'AARCH32_ASM_MICROKERNEL_SRCS',
+    'AARCH64_ASM_MICROKERNEL_SRCS',
+])
+
+def handle_singleline_parse(line):
+    start_index = line.find("(")
+    end_index = line.find(")")
+    line = line[start_index+1:end_index]
+    key_val = line.split(" ")
+    return key_val[0], key_val[1][4:]
+
+def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"):
     sources = collections.defaultdict(list)
-    with open(os.path.join(xnnpack_path, "XNNPACK/CMakeLists.txt")) as cmake:
+    count = 0
+    with open(os.path.join(xnnpack_path, cmakefile)) as cmake:
         lines = cmake.readlines()
         i = 0
         while i < len(lines):
             line = lines[i]
+
+            if lines[i].startswith("SET") and "src/" in lines[i]:
+                name, val = handle_singleline_parse(line)
+                sources[name].append(val)
+                i+=1
+                continue
+
             if line.startswith("SET") and line.split('(')[1].strip(' \t\n\r') in set(WRAPPER_SRC_NAMES.keys()) | set(SRC_NAMES):
                 name = line.split('(')[1].strip(' \t\n\r')
                 i += 1
@@ -80,11 +186,19 @@ def update_sources(xnnpack_path):
 def gen_wrappers(xnnpack_path):
     xnnpack_sources = collections.defaultdict(list)
     sources = update_sources(xnnpack_path)
+
+    microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/microkernels.cmake")
+    for key in  microkernels_sources:
+        sources[key] = microkernels_sources[key]
+
     for name in WRAPPER_SRC_NAMES:
         xnnpack_sources[WRAPPER_SRC_NAMES[name]].extend(sources[name])
+
     for condition, filenames in xnnpack_sources.items():
+        print(condition)
         for filename in filenames:
             filepath = os.path.join(xnnpack_path, "xnnpack_wrappers", filename)
+
             if not os.path.isdir(os.path.dirname(filepath)):
                 os.makedirs(os.path.dirname(filepath))
             with open(filepath, "w") as wrapper:
diff --git a/third_party/gloo.BUILD b/third_party/gloo.BUILD
index b38da098b461..daa17f15e765 100644
--- a/third_party/gloo.BUILD
+++ b/third_party/gloo.BUILD
@@ -20,6 +20,8 @@ template_rule(
         "cmakedefine01 GLOO_USE_MPI": "define GLOO_USE_MPI 0",
         "cmakedefine01 GLOO_USE_AVX": "define GLOO_USE_AVX 0",
         "cmakedefine01 GLOO_USE_LIBUV": "define GLOO_USE_LIBUV 0",
+        # The `GLOO_HAVE_TRANSPORT_TCP_TLS` line should go above the `GLOO_HAVE_TRANSPORT_TCP` in order to properly substitute the template.
+        "cmakedefine01 GLOO_HAVE_TRANSPORT_TCP_TLS": "define GLOO_HAVE_TRANSPORT_TCP_TLS 1",
         "cmakedefine01 GLOO_HAVE_TRANSPORT_TCP": "define GLOO_HAVE_TRANSPORT_TCP 1",
         "cmakedefine01 GLOO_HAVE_TRANSPORT_IBVERBS": "define GLOO_HAVE_TRANSPORT_IBVERBS 0",
         "cmakedefine01 GLOO_HAVE_TRANSPORT_UV": "define GLOO_HAVE_TRANSPORT_UV 0",
@@ -35,6 +37,7 @@ cc_library(
             "gloo/rendezvous/*.h",
             "gloo/transport/*.h",
             "gloo/transport/tcp/*.h",
+            "gloo/transport/tcp/tls/*.h",
         ],
         exclude = [
             "gloo/rendezvous/redis_store.h",
diff --git a/third_party/ideep b/third_party/ideep
index 7201315611be..7bc3e12f7c0c 160000
--- a/third_party/ideep
+++ b/third_party/ideep
@@ -1 +1 @@
-Subproject commit 7201315611bebbb041f2ca7a0cdb3c6f4ccd17a3
+Subproject commit 7bc3e12f7c0cad7fb24f8d4ab63dcd467ffa60c7
diff --git a/third_party/kineto b/third_party/kineto
index 88c1367ff1dc..e121ba84c711 160000
--- a/third_party/kineto
+++ b/third_party/kineto
@@ -1 +1 @@
-Subproject commit 88c1367ff1dccf045f39f07d2e08e9e2a829ddab
+Subproject commit e121ba84c71102656d011338bcb616419a241ad1
diff --git a/third_party/nvfuser/CMakeLists.txt b/third_party/nvfuser/CMakeLists.txt
new file mode 100644
index 000000000000..2c72ca34e7a5
--- /dev/null
+++ b/third_party/nvfuser/CMakeLists.txt
@@ -0,0 +1,336 @@
+if(NOT BUILD_NVFUSER)
+  return()
+endif()
+
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+project(nvfuser)
+
+if(NOT USE_ROCM)
+  set(TORCHLIB_FLAVOR torch_cuda)
+else()
+  set(TORCHLIB_FLAVOR torch_hip)
+endif()
+
+# --- project
+
+file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/nvfuser")
+
+set(NVFUSER_ROOT ${PROJECT_SOURCE_DIR})
+set(NVFUSER_SRCS_DIR "${NVFUSER_ROOT}/csrc")
+set(TORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../..")
+set(TORCH_INSTALL_LIB_DIR ${TORCH_ROOT}/torch/lib)
+
+# --- build nvfuser_codegen library
+
+set(NVFUSER_SRCS)
+set(NVFUSER_CODEGEN ${PROJECT_NAME}_codegen)
+list(APPEND NVFUSER_SRCS
+    ${NVFUSER_SRCS_DIR}/arith.cpp
+    ${NVFUSER_SRCS_DIR}/compute_at.cpp
+    ${NVFUSER_SRCS_DIR}/inlining.cpp
+    ${NVFUSER_SRCS_DIR}/compute_at_map.cpp
+    ${NVFUSER_SRCS_DIR}/codegen.cpp
+    ${NVFUSER_SRCS_DIR}/contiguity.cpp
+    ${NVFUSER_SRCS_DIR}/dispatch.cpp
+    ${NVFUSER_SRCS_DIR}/expr_evaluator.cpp
+    ${NVFUSER_SRCS_DIR}/kernel_expr_evaluator.cpp
+    ${NVFUSER_SRCS_DIR}/executor.cpp
+    ${NVFUSER_SRCS_DIR}/executor_kernel_arg.cpp
+    ${NVFUSER_SRCS_DIR}/executor_launch_params.cpp
+    ${NVFUSER_SRCS_DIR}/evaluator_common.cpp
+    ${NVFUSER_SRCS_DIR}/executor_utils.cpp
+    ${NVFUSER_SRCS_DIR}/fusion.cpp
+    ${NVFUSER_SRCS_DIR}/graph_fuser.cpp
+    ${NVFUSER_SRCS_DIR}/grouped_reduction.cpp
+    ${NVFUSER_SRCS_DIR}/index_compute.cpp
+    ${NVFUSER_SRCS_DIR}/lower_index_compute.cpp
+    ${NVFUSER_SRCS_DIR}/instrumentation.cpp
+    ${NVFUSER_SRCS_DIR}/ir_base_nodes.cpp
+    ${NVFUSER_SRCS_DIR}/ir_builder.cpp
+    ${NVFUSER_SRCS_DIR}/ir_cloner.cpp
+    ${NVFUSER_SRCS_DIR}/ir_container.cpp
+    ${NVFUSER_SRCS_DIR}/ir_graphviz.cpp
+    ${NVFUSER_SRCS_DIR}/ir_nodes.cpp
+    ${NVFUSER_SRCS_DIR}/ir_iostream.cpp
+    ${NVFUSER_SRCS_DIR}/ir_utils.cpp
+    ${NVFUSER_SRCS_DIR}/iter_visitor.cpp
+    ${NVFUSER_SRCS_DIR}/kernel.cpp
+    ${NVFUSER_SRCS_DIR}/kernel_cache.cpp
+    ${NVFUSER_SRCS_DIR}/kernel_ir.cpp
+    ${NVFUSER_SRCS_DIR}/kernel_ir_dispatch.cpp
+    ${NVFUSER_SRCS_DIR}/lower_alias_memory.cpp
+    ${NVFUSER_SRCS_DIR}/lower_allocation.cpp
+    ${NVFUSER_SRCS_DIR}/lower_double_buffer.cpp
+    ${NVFUSER_SRCS_DIR}/lower_divisible_split.cpp
+    ${NVFUSER_SRCS_DIR}/lower_expr_sort.cpp
+    ${NVFUSER_SRCS_DIR}/lower_fused_reduction.cpp
+    ${NVFUSER_SRCS_DIR}/lower_fusion_simplifier.cpp
+    ${NVFUSER_SRCS_DIR}/lower_index.cpp
+    ${NVFUSER_SRCS_DIR}/lower_index_hoist.cpp
+    ${NVFUSER_SRCS_DIR}/lower_insert_syncs.cpp
+    ${NVFUSER_SRCS_DIR}/lower_instrument.cpp
+    ${NVFUSER_SRCS_DIR}/lower_loops.cpp
+    ${NVFUSER_SRCS_DIR}/lower_magic_zero.cpp
+    ${NVFUSER_SRCS_DIR}/lower_misaligned_vectorization.cpp
+    ${NVFUSER_SRCS_DIR}/lower_predicate.cpp
+    ${NVFUSER_SRCS_DIR}/lower_predicate_elimination.cpp
+    ${NVFUSER_SRCS_DIR}/lower_replace_size.cpp
+    ${NVFUSER_SRCS_DIR}/lower_shift.cpp
+    ${NVFUSER_SRCS_DIR}/lower_sync_information.cpp
+    ${NVFUSER_SRCS_DIR}/lower_thread_predicate.cpp
+    ${NVFUSER_SRCS_DIR}/lower_trivial_broadcast.cpp
+    ${NVFUSER_SRCS_DIR}/lower_trivial_reductions.cpp
+    ${NVFUSER_SRCS_DIR}/lower_unroll.cpp
+    ${NVFUSER_SRCS_DIR}/lower_utils.cpp
+    ${NVFUSER_SRCS_DIR}/lower_validation.cpp
+    ${NVFUSER_SRCS_DIR}/lower_warp_reduce.cpp
+    ${NVFUSER_SRCS_DIR}/lower2device.cpp
+    ${NVFUSER_SRCS_DIR}/lower_bank_conflict.cpp
+    ${NVFUSER_SRCS_DIR}/manager.cpp
+    ${NVFUSER_SRCS_DIR}/maxinfo_propagator.cpp
+    ${NVFUSER_SRCS_DIR}/mutator.cpp
+    ${NVFUSER_SRCS_DIR}/non_divisible_split.cpp
+    ${NVFUSER_SRCS_DIR}/ops/alias.cpp
+    ${NVFUSER_SRCS_DIR}/ops/composite.cpp
+    ${NVFUSER_SRCS_DIR}/ops/normalization.cpp
+    ${NVFUSER_SRCS_DIR}/parallel_dimension_map.cpp
+    ${NVFUSER_SRCS_DIR}/parallel_type_bitmap.cpp
+    ${NVFUSER_SRCS_DIR}/parser.cpp
+    ${NVFUSER_SRCS_DIR}/partial_split_map.cpp
+    ${NVFUSER_SRCS_DIR}/partition.cpp
+    ${NVFUSER_SRCS_DIR}/predicate_compute.cpp
+    ${NVFUSER_SRCS_DIR}/python_frontend/fusion_cache.cpp
+    ${NVFUSER_SRCS_DIR}/python_frontend/fusion_definition.cpp
+    ${NVFUSER_SRCS_DIR}/python_frontend/fusion_interface.cpp
+    ${NVFUSER_SRCS_DIR}/register_interface.cpp
+    ${NVFUSER_SRCS_DIR}/root_domain_map.cpp
+    ${NVFUSER_SRCS_DIR}/scheduler/pointwise.cpp
+    ${NVFUSER_SRCS_DIR}/scheduler/pointwise_utils.cpp
+    ${NVFUSER_SRCS_DIR}/scheduler/transpose.cpp
+    ${NVFUSER_SRCS_DIR}/scheduler/normalization.cpp
+    ${NVFUSER_SRCS_DIR}/scheduler/reduction.cpp
+    ${NVFUSER_SRCS_DIR}/scheduler/matmul.cpp
+    ${NVFUSER_SRCS_DIR}/scheduler/reduction_utils.cpp
+    ${NVFUSER_SRCS_DIR}/scheduler/registry.cpp
+    ${NVFUSER_SRCS_DIR}/scheduler/utils.cpp
+    ${NVFUSER_SRCS_DIR}/scheduler/vectorize_helper.cpp
+    ${NVFUSER_SRCS_DIR}/type_inference.cpp
+    ${NVFUSER_SRCS_DIR}/type_promotion.cpp
+    ${NVFUSER_SRCS_DIR}/fusion_segmenter.cpp
+    ${NVFUSER_SRCS_DIR}/tensor_view.cpp
+    ${NVFUSER_SRCS_DIR}/transform_iter.cpp
+    ${NVFUSER_SRCS_DIR}/transform_replay.cpp
+    ${NVFUSER_SRCS_DIR}/transform_rfactor.cpp
+    ${NVFUSER_SRCS_DIR}/transform_view.cpp
+    ${NVFUSER_SRCS_DIR}/type.cpp
+    ${NVFUSER_SRCS_DIR}/utils.cpp
+    ${NVFUSER_SRCS_DIR}/mma_type.cpp
+    ${NVFUSER_SRCS_DIR}/scheduler/mma_utils.cpp
+)
+
+add_library(${NVFUSER_CODEGEN} SHARED ${NVFUSER_SRCS})
+
+if(NOT USE_ROCM)
+  target_compile_options(${NVFUSER_CODEGEN} PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
+  # NB: This must be target_compile_definitions, not target_compile_options,
+  # as the latter is not respected by nvcc
+  target_compile_definitions(${NVFUSER_CODEGEN} PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
+else()
+  target_compile_options(${NVFUSER_CODEGEN} PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
+  target_compile_definitions(${NVFUSER_CODEGEN} PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
+  target_compile_definitions(${NVFUSER_CODEGEN} PRIVATE
+    USE_ROCM
+    __HIP_PLATFORM_HCC__
+    )
+endif()
+
+target_link_libraries(${NVFUSER_CODEGEN} PRIVATE torch ${TORCHLIB_FLAVOR})
+if(NOT USE_ROCM)
+  target_link_libraries(${NVFUSER_CODEGEN} PRIVATE ${CUDA_NVRTC_LIB} torch::nvtoolsext)
+  target_include_directories(${NVFUSER_CODEGEN} PRIVATE ${CUDA_INCLUDE_DIRS})
+else()
+  target_link_libraries(${NVFUSER_CODEGEN} PRIVATE ${ROCM_HIPRTC_LIB})
+  target_include_directories(${NVFUSER_CODEGEN} PRIVATE ${Caffe2_HIP_INCLUDE})
+endif()
+if(NOT MSVC)
+  target_compile_options(${NVFUSER_CODEGEN} PRIVATE -Wno-unused-variable)
+endif()
+target_include_directories(${NVFUSER_CODEGEN}
+                           PUBLIC $<BUILD_INTERFACE:${NVFUSER_SRCS_DIR}>)
+set_property(TARGET ${NVFUSER_CODEGEN} PROPERTY CXX_STANDARD 17)
+install(TARGETS ${NVFUSER_CODEGEN} EXPORT NvfuserTargets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+# installing nvfuser python tests
+install(DIRECTORY "${NVFUSER_ROOT}/python_tests/"
+        DESTINATION "${TORCH_ROOT}/test/_nvfuser"
+        FILES_MATCHING PATTERN "*.py" )
+
+file(WRITE "${TORCH_ROOT}/test/_nvfuser/.gitignore" "*")
+# --- build nvfuser_python library
+
+if(BUILD_PYTHON)
+  set(NVFUSER "${PROJECT_NAME}")
+  #find_package(pybind11 REQUIRED)
+
+  set(NVFUSER_PYTHON_SRCS)
+  list(APPEND NVFUSER_PYTHON_SRCS
+      ${NVFUSER_SRCS_DIR}/python_frontend/python_bindings.cpp
+      ${NVFUSER_SRCS_DIR}/python_frontend/python_bindings_extension.cpp
+  )
+
+  add_library(${NVFUSER} MODULE ${NVFUSER_PYTHON_SRCS})
+  if(NOT USE_ROCM)
+    target_compile_options(${NVFUSER} PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
+    # NB: This must be target_compile_definitions, not target_compile_options,
+    # as the latter is not respected by nvcc
+    target_compile_definitions(${NVFUSER} PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
+    target_link_libraries(${NVFUSER} PRIVATE torch::nvtoolsext)
+  else()
+    target_compile_options(${NVFUSER} PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
+    target_compile_definitions(${NVFUSER} PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
+    target_compile_definitions(${NVFUSER} PRIVATE
+      USE_ROCM
+      __HIP_PLATFORM_HCC__
+      )
+    target_include_directories(${NVFUSER_CODEGEN} PRIVATE ${Caffe2_HIP_INCLUDE})
+  endif()
+
+  target_link_libraries(${NVFUSER} PRIVATE ${NVFUSER_CODEGEN})
+  target_link_libraries(${NVFUSER} PRIVATE torch torch_python ${TORCHLIB_FLAVOR})
+  target_link_libraries(${NVFUSER} PRIVATE pybind::pybind11)
+  target_include_directories(${NVFUSER} PRIVATE ${TORCH_ROOT})
+  target_compile_definitions(${NVFUSER} PRIVATE EXTENSION_NAME=_C)
+  target_compile_options(${NVFUSER} PRIVATE ${TORCH_PYTHON_COMPILE_OPTIONS})
+
+  # avoid using Python3_add_library, copied from functorch
+  set_target_properties(${NVFUSER} PROPERTIES PREFIX "" DEBUG_POSTFIX "")
+  if(NOT MSVC)
+    target_compile_options(${NVFUSER} PRIVATE -Wno-unused-variable)
+    set_target_properties(${NVFUSER} PROPERTIES SUFFIX ".so")
+  else()
+    set_target_properties(${NVFUSER} PROPERTIES SUFFIX ".pyd")
+  endif()
+
+  set_target_properties(${NVFUSER} PROPERTIES LIBRARY_OUTPUT_DIRECTORY
+        ${CMAKE_BINARY_DIR}/nvfuser)
+  set_target_properties(${NVFUSER} PROPERTIES INSTALL_RPATH "${_rpath_portable_origin}/../torch/lib")
+
+  if(TORCH_PYTHON_LINK_FLAGS AND NOT TORCH_PYTHON_LINK_FLAGS STREQUAL "")
+    message(STATUS "somehow this is happening")
+    set_target_properties(${NVFUSER} PROPERTIES LINK_FLAGS ${TORCH_PYTHON_LINK_FLAGS})
+  endif()
+  install(TARGETS ${NVFUSER} EXPORT NvfuserTargets DESTINATION ${TORCH_ROOT}/nvfuser/)
+
+  # install nvfuser python files
+  install(DIRECTORY "${NVFUSER_ROOT}/python/"
+          DESTINATION "${TORCH_ROOT}/nvfuser"
+          FILES_MATCHING PATTERN "*.py" )
+  
+  file(WRITE "${TORCH_ROOT}/nvfuser/.gitignore" "*")
+endif()
+
+# --- generate runtime files
+
+# The list of NVFUSER runtime files
+list(APPEND NVFUSER_RUNTIME_FILES
+  ${NVFUSER_ROOT}/runtime/array.cu
+  ${NVFUSER_ROOT}/runtime/block_reduction.cu
+  ${NVFUSER_ROOT}/runtime/block_sync_atomic.cu
+  ${NVFUSER_ROOT}/runtime/block_sync_default.cu
+  ${NVFUSER_ROOT}/runtime/broadcast.cu
+  ${NVFUSER_ROOT}/runtime/fp16_support.cu
+  ${NVFUSER_ROOT}/runtime/fused_reduction.cu
+  ${NVFUSER_ROOT}/runtime/fused_welford_helper.cu
+  ${NVFUSER_ROOT}/runtime/fused_welford_impl.cu
+  ${NVFUSER_ROOT}/runtime/bf16_support.cu
+  ${NVFUSER_ROOT}/runtime/grid_broadcast.cu
+  ${NVFUSER_ROOT}/runtime/grid_reduction.cu
+  ${NVFUSER_ROOT}/runtime/grid_sync.cu
+  ${NVFUSER_ROOT}/runtime/helpers.cu
+  ${NVFUSER_ROOT}/runtime/index_utils.cu
+  ${NVFUSER_ROOT}/runtime/random_numbers.cu
+  ${NVFUSER_ROOT}/runtime/swizzle.cu
+  ${NVFUSER_ROOT}/runtime/tensor.cu
+  ${NVFUSER_ROOT}/runtime/tuple.cu
+  ${NVFUSER_ROOT}/runtime/type_traits.cu
+  ${NVFUSER_ROOT}/runtime/welford.cu
+  ${NVFUSER_ROOT}/runtime/warp.cu
+  ${NVFUSER_ROOT}/runtime/tensorcore.cu
+  ${NVFUSER_ROOT}/runtime/memory.cu
+  ${TORCH_ROOT}/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh
+  ${TORCH_ROOT}/aten/src/ATen/cuda/detail/UnpackRaw.cuh
+)
+
+if(USE_ROCM)
+list(APPEND NVFUSER_RUNTIME_FILES
+  ${NVFUSER_ROOT}/runtime/array_rocm.cu
+  ${NVFUSER_ROOT}/runtime/bf16_support_rocm.cu
+  ${NVFUSER_ROOT}/runtime/block_sync_default_rocm.cu
+  ${NVFUSER_ROOT}/runtime/warp_rocm.cu
+)
+endif()
+
+file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/include/nvfuser_resources")
+
+# "stringify" NVFUSER runtime sources
+# (generate C++ header files embedding the original input as a string literal)
+set(NVFUSER_STRINGIFY_TOOL "${NVFUSER_ROOT}/tools/stringify_file.py")
+foreach(src ${NVFUSER_RUNTIME_FILES})
+  get_filename_component(filename ${src} NAME_WE)
+  set(dst "${CMAKE_BINARY_DIR}/include/nvfuser_resources/${filename}.h")
+  add_custom_command(
+    COMMENT "Stringify NVFUSER runtime source file"
+    OUTPUT ${dst}
+    DEPENDS ${src} "${NVFUSER_STRINGIFY_TOOL}"
+    COMMAND ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst}
+  )
+  add_custom_target(nvfuser_rt_${filename} DEPENDS ${dst})
+  add_dependencies(${NVFUSER_CODEGEN} nvfuser_rt_${filename})
+
+  # also generate the resource headers during the configuration step
+  # (so tools like clang-tidy can run w/o requiring a real build)
+  execute_process(COMMAND
+    ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst})
+endforeach()
+
+target_include_directories(${NVFUSER_CODEGEN} PRIVATE "${CMAKE_BINARY_DIR}/include")
+
+# -- build tests
+
+# note: ideally we don't need USE_CUDA here, but our cpp tests are not ROCM compatible.
+if(BUILD_TEST AND USE_CUDA)
+  set(NVFUSER_TESTS "${PROJECT_NAME}_tests")
+  set(JIT_TEST_SRCS)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_SRCS_DIR}/python_frontend/test/test_nvfuser_fusion_definition.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_SRCS_DIR}/python_frontend/test/test_nvfuser_fusion_cache.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_SRCS_DIR}/python_frontend/test/test_nvfuser_fusion_record.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu1.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu2.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu3.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_tensor_factories.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_fused_reduction.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_shift.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_tensorcore.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_view.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_transpose.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_utils.cpp)
+
+  add_executable(${NVFUSER_TESTS}
+             ${TORCH_ROOT}/test/cpp/common/main.cpp
+             ${TORCH_ROOT}/test/cpp/jit/test_utils.cpp
+             ${JIT_TEST_SRCS})
+
+  target_compile_definitions(${NVFUSER_TESTS} PRIVATE USE_GTEST)
+  if(NOT USE_ROCM)
+    target_compile_definitions(${NVFUSER_TESTS} PRIVATE USE_CUDA)
+  else()
+    target_compile_definitions(${NVFUSER_TESTS} PRIVATE USE_ROCM)
+  endif()
+  target_include_directories(${NVFUSER_TESTS} PRIVATE "${NVFUSER_ROOT}" "${TORCH_ROOT}/torch/csrc/api/include/")
+  target_link_libraries(${NVFUSER_TESTS} PRIVATE ${NVFUSER_CODEGEN} torch ${TORCHLIB_FLAVOR} gtest_main gmock_main)
+  if(NOT MSVC)
+    target_compile_options(${NVFUSER_TESTS} PRIVATE -Wno-unused-variable)
+  endif()
+
+  install(TARGETS ${NVFUSER_TESTS} DESTINATION bin)
+endif()
diff --git a/torch/csrc/jit/codegen/cuda/arith.cpp b/third_party/nvfuser/csrc/arith.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/arith.cpp
rename to third_party/nvfuser/csrc/arith.cpp
index d4e1348ee693..d1759f5bcc47 100644
--- a/torch/csrc/jit/codegen/cuda/arith.cpp
+++ b/third_party/nvfuser/csrc/arith.cpp
@@ -1,15 +1,15 @@
-#include <torch/csrc/jit/codegen/cuda/arith.h>
+#include <arith.h>
 
 #include <c10/util/BFloat16.h>
 #include <c10/util/Exception.h>
 #include <c10/util/Half.h>
 #include <c10/util/irange.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
-#include <torch/csrc/jit/codegen/cuda/type_promotion.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <type.h>
+#include <type_promotion.h>
 #include <cfloat>
 
 namespace torch {
@@ -2171,7 +2171,7 @@ TensorView* gather(
   return out_tv;
 }
 
-TORCH_CUDA_CU_API TensorView* viewAsScalar(TensorView* inp) {
+TensorView* viewAsScalar(TensorView* inp) {
   auto inp_type = inp->getDataType().value();
   TORCH_CHECK(
       isVectorType(inp_type),
diff --git a/torch/csrc/jit/codegen/cuda/arith.h b/third_party/nvfuser/csrc/arith.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/arith.h
rename to third_party/nvfuser/csrc/arith.h
index 66344c74880c..04f5dd076033 100644
--- a/torch/csrc/jit/codegen/cuda/arith.h
+++ b/third_party/nvfuser/csrc/arith.h
@@ -2,9 +2,9 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
-#include <torch/csrc/jit/codegen/cuda/type_promotion.h>
+#include <ir_interface_nodes.h>
+#include <type.h>
+#include <type_promotion.h>
 
 class Val;
 
diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/third_party/nvfuser/csrc/codegen.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/codegen.cpp
rename to third_party/nvfuser/csrc/codegen.cpp
index a13c282a7713..da19576dbdd6 100644
--- a/torch/csrc/jit/codegen/cuda/codegen.cpp
+++ b/third_party/nvfuser/csrc/codegen.cpp
@@ -1,12 +1,12 @@
-#include <torch/csrc/jit/codegen/cuda/codegen.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/mma_utils.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <codegen.h>
+#include <expr_evaluator.h>
+#include <instrumentation.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <scheduler/mma_utils.h>
+#include <type.h>
+#include <utils.h>
 
 #include <array>
 #include <cmath>
diff --git a/torch/csrc/jit/codegen/cuda/codegen.h b/third_party/nvfuser/csrc/codegen.h
similarity index 90%
rename from torch/csrc/jit/codegen/cuda/codegen.h
rename to third_party/nvfuser/csrc/codegen.h
index 31e4fb707363..fa52748615e9 100644
--- a/torch/csrc/jit/codegen/cuda/codegen.h
+++ b/third_party/nvfuser/csrc/codegen.h
@@ -1,7 +1,7 @@
 #pragma once
 
 #include <c10/macros/Export.h>
-#include <torch/csrc/jit/codegen/cuda/kernel.h>
+#include <kernel.h>
 
 #include <string>
 
diff --git a/torch/csrc/jit/codegen/cuda/compute_at.cpp b/third_party/nvfuser/csrc/compute_at.cpp
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/compute_at.cpp
rename to third_party/nvfuser/csrc/compute_at.cpp
index d8f950848f8f..b2f681323fd7 100644
--- a/torch/csrc/jit/codegen/cuda/compute_at.cpp
+++ b/third_party/nvfuser/csrc/compute_at.cpp
@@ -1,11 +1,11 @@
-#include <torch/csrc/jit/codegen/cuda/compute_at.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
+#include <compute_at.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <lower_utils.h>
+#include <root_domain_map.h>
+#include <transform_iter.h>
 
 #include <c10/util/irange.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/compute_at.h b/third_party/nvfuser/csrc/compute_at.h
similarity index 85%
rename from torch/csrc/jit/codegen/cuda/compute_at.h
rename to third_party/nvfuser/csrc/compute_at.h
index d3d3fdb299dd..1d8c739c022d 100644
--- a/torch/csrc/jit/codegen/cuda/compute_at.h
+++ b/third_party/nvfuser/csrc/compute_at.h
@@ -1,8 +1,8 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/inlining.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <inlining.h>
+#include <root_domain_map.h>
+#include <transform_replay.h>
 
 #include <c10/macros/Export.h>
 #include <c10/util/Exception.h>
diff --git a/torch/csrc/jit/codegen/cuda/compute_at_map.cpp b/third_party/nvfuser/csrc/compute_at_map.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/compute_at_map.cpp
rename to third_party/nvfuser/csrc/compute_at_map.cpp
index 1c2ac627b575..50d21277e48b 100644
--- a/torch/csrc/jit/codegen/cuda/compute_at_map.cpp
+++ b/third_party/nvfuser/csrc/compute_at_map.cpp
@@ -1,10 +1,10 @@
-#include <torch/csrc/jit/codegen/cuda/compute_at_map.h>
+#include <compute_at_map.h>
 
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
+#include <disjoint_set.h>
+#include <ir_utils.h>
+#include <lower2device.h>
+#include <root_domain_map.h>
+#include <transform_iter.h>
 
 #include <tuple>
 
@@ -431,7 +431,7 @@ void IterDomainGraph::build(Fusion* fusion) {
             // might not be a compute at leaf domain of `p_tv`, but it actually
             // has an equivalent compute at leaf domain. For that case, we map
             // the equivalent compute at leaf domain.
-            for (int i = 0; i < p_tv->getComputeAtPosition(); i++) {
+            for (unsigned int i = 0; i < p_tv->getComputeAtPosition(); i++) {
               auto id = p_tv->axis(i);
               if (permissive_disjoint_sets.permissiveAreMapped(p_id, id)) {
                 loop_nodes_.mapEntries(c_id, id);
diff --git a/torch/csrc/jit/codegen/cuda/compute_at_map.h b/third_party/nvfuser/csrc/compute_at_map.h
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/compute_at_map.h
rename to third_party/nvfuser/csrc/compute_at_map.h
index 5ea92dff1644..66ca4d5ae5f7 100644
--- a/torch/csrc/jit/codegen/cuda/compute_at_map.h
+++ b/third_party/nvfuser/csrc/compute_at_map.h
@@ -1,9 +1,9 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h>
+#include <disjoint_set.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir.h>
+#include <lower_trivial_reductions.h>
 
 #include <deque>
 #include <unordered_map>
diff --git a/torch/csrc/jit/codegen/cuda/contiguity.cpp b/third_party/nvfuser/csrc/contiguity.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/contiguity.cpp
rename to third_party/nvfuser/csrc/contiguity.cpp
index dcb39d948c67..808a1a2ec0ab 100644
--- a/torch/csrc/jit/codegen/cuda/contiguity.cpp
+++ b/third_party/nvfuser/csrc/contiguity.cpp
@@ -1,8 +1,8 @@
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <lower2device.h>
 
-#include <torch/csrc/jit/codegen/cuda/contiguity.h>
+#include <contiguity.h>
 
 namespace torch {
 namespace jit {
@@ -135,7 +135,7 @@ void OrderedIdInformation::handle(Merge* merge) {
   // Update maps
   // Find the position inner would have to have to be considered ordered
   auto pos_after_outer = outer_pos + 1;
-  for (; pos_after_outer < active_ids_.size(); pos_after_outer++) {
+  for (; pos_after_outer < int64_t(active_ids_.size()); pos_after_outer++) {
     if (active_ids_[pos_after_outer] == nullptr) {
       // Can't be considered ordered after a nullptr
       break;
diff --git a/torch/csrc/jit/codegen/cuda/contiguity.h b/third_party/nvfuser/csrc/contiguity.h
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/contiguity.h
rename to third_party/nvfuser/csrc/contiguity.h
index e3be65a5bbc0..f3b0cf509762 100644
--- a/torch/csrc/jit/codegen/cuda/contiguity.h
+++ b/third_party/nvfuser/csrc/contiguity.h
@@ -2,11 +2,11 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/compute_at_map.h>
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/lower_shift.h>
-#include <torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h>
+#include <compute_at_map.h>
+#include <disjoint_set.h>
+#include <ir_all_nodes.h>
+#include <lower_shift.h>
+#include <lower_trivial_broadcast.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/disjoint_set.h b/third_party/nvfuser/csrc/disjoint_set.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/disjoint_set.h
rename to third_party/nvfuser/csrc/disjoint_set.h
index 8fd60dab5bd2..f62c4b4d77aa 100644
--- a/torch/csrc/jit/codegen/cuda/disjoint_set.h
+++ b/third_party/nvfuser/csrc/disjoint_set.h
@@ -9,7 +9,7 @@
 #include <vector>
 
 // For printing of the set when using a Statement as the type for the set
-#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
+#include <ir_base_nodes.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/dispatch.cpp b/third_party/nvfuser/csrc/dispatch.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/dispatch.cpp
rename to third_party/nvfuser/csrc/dispatch.cpp
index 70e9ae16375e..d9c02c7f0b29 100644
--- a/torch/csrc/jit/codegen/cuda/dispatch.cpp
+++ b/third_party/nvfuser/csrc/dispatch.cpp
@@ -1,8 +1,8 @@
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <type.h>
 
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
+#include <dispatch.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/dispatch.h b/third_party/nvfuser/csrc/dispatch.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/dispatch.h
rename to third_party/nvfuser/csrc/dispatch.h
index 4fea698191ec..e52028b0d213 100644
--- a/torch/csrc/jit/codegen/cuda/dispatch.h
+++ b/third_party/nvfuser/csrc/dispatch.h
@@ -3,7 +3,7 @@
 #include <c10/macros/Export.h>
 #include <c10/util/Exception.h>
 
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <utils.h>
 
 #include <unordered_map>
 
diff --git a/torch/csrc/jit/codegen/cuda/docs/.gitignore b/third_party/nvfuser/csrc/docs/.gitignore
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/docs/.gitignore
rename to third_party/nvfuser/csrc/docs/.gitignore
diff --git a/torch/csrc/jit/codegen/cuda/docs/documentation.h b/third_party/nvfuser/csrc/docs/documentation.h
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/docs/documentation.h
rename to third_party/nvfuser/csrc/docs/documentation.h
diff --git a/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen b/third_party/nvfuser/csrc/docs/fuser.doxygen
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/docs/fuser.doxygen
rename to third_party/nvfuser/csrc/docs/fuser.doxygen
diff --git a/torch/csrc/jit/codegen/cuda/docs/images/ir_architecture.png b/third_party/nvfuser/csrc/docs/images/ir_architecture.png
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/docs/images/ir_architecture.png
rename to third_party/nvfuser/csrc/docs/images/ir_architecture.png
index 48616c381bc5..f21c4fcd467f 100644
Binary files a/torch/csrc/jit/codegen/cuda/docs/images/ir_architecture.png and b/third_party/nvfuser/csrc/docs/images/ir_architecture.png differ
diff --git a/torch/csrc/jit/codegen/cuda/docs/main_page.md b/third_party/nvfuser/csrc/docs/main_page.md
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/docs/main_page.md
rename to third_party/nvfuser/csrc/docs/main_page.md
diff --git a/torch/csrc/jit/codegen/cuda/dynamic_type.h b/third_party/nvfuser/csrc/dynamic_type.h
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/dynamic_type.h
rename to third_party/nvfuser/csrc/dynamic_type.h
diff --git a/torch/csrc/jit/codegen/cuda/evaluator_common.cpp b/third_party/nvfuser/csrc/evaluator_common.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/evaluator_common.cpp
rename to third_party/nvfuser/csrc/evaluator_common.cpp
index ae280b4ac44c..094dd54c1595 100644
--- a/torch/csrc/jit/codegen/cuda/evaluator_common.cpp
+++ b/third_party/nvfuser/csrc/evaluator_common.cpp
@@ -1,10 +1,10 @@
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <expr_evaluator.h>
+#include <instrumentation.h>
+#include <ir_utils.h>
+#include <kernel_expr_evaluator.h>
+#include <lower2device.h>
 
-#include <torch/csrc/jit/codegen/cuda/evaluator_common.h>
+#include <evaluator_common.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/evaluator_common.h b/third_party/nvfuser/csrc/evaluator_common.h
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/evaluator_common.h
rename to third_party/nvfuser/csrc/evaluator_common.h
index 528b1f1b2e0a..349ae22de15a 100644
--- a/torch/csrc/jit/codegen/cuda/evaluator_common.h
+++ b/third_party/nvfuser/csrc/evaluator_common.h
@@ -1,10 +1,10 @@
 #pragma once
-#include <torch/csrc/jit/codegen/cuda/dynamic_type.h>
-#include <torch/csrc/jit/codegen/cuda/executor_kernel_arg.h>
-#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <dynamic_type.h>
+#include <executor_kernel_arg.h>
+#include <executor_launch_params.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <lower2device.h>
 
 #include <c10/core/DeviceType.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/third_party/nvfuser/csrc/executor.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/executor.cpp
rename to third_party/nvfuser/csrc/executor.cpp
index 23be5f4232aa..0ab2951bda63 100644
--- a/torch/csrc/jit/codegen/cuda/executor.cpp
+++ b/third_party/nvfuser/csrc/executor.cpp
@@ -1,21 +1,22 @@
 
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-
-#include <torch/csrc/jit/codegen/cuda/codegen.h>
-#include <torch/csrc/jit/codegen/cuda/executor_kernel_arg.h>
-#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/lower_bank_conflict.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <executor.h>
+
+#include <codegen.h>
+#include <executor_kernel_arg.h>
+#include <executor_utils.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <kernel_ir.h>
+#include <lower_bank_conflict.h>
+#include <utils.h>
 
 #include <ATen/core/LegacyTypeDispatch.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/llvm_jit_strings.h>
 #include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
+#include <ATen/native/cuda/jit_utils.h>
 #include <c10/core/DeviceGuard.h>
 #include <c10/cuda/CUDAFunctions.h>
 #include <c10/cuda/CUDAStream.h>
@@ -877,7 +878,7 @@ KernelArgumentHolder FusionExecutor::inferOutputSizes(
     executor_entry = &executor_entry_lookup_[*opt_code];
   }
 
-  executor_utils::initializeCudaContext();
+  at::cuda::jit::initializeCudaContext();
   TORCH_INTERNAL_ASSERT(lowered_);
 
   TORCH_INTERNAL_ASSERT(
@@ -975,7 +976,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
 
   c10::DeviceGuard dg(options_.device);
   auto stream = at::cuda::getCurrentCUDAStream();
-  executor_utils::initializeCudaContext();
+  at::cuda::jit::initializeCudaContext();
   TORCH_INTERNAL_ASSERT(lowered_);
   launch_params_ = LaunchParams();
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -1258,7 +1259,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
 
   if (execute_kernel_) {
     if (maybe_available_dynamic_smem_.has_value() &&
-        launch_params_.smem() > maybe_available_dynamic_smem_.value()) {
+        size_t(launch_params_.smem()) > maybe_available_dynamic_smem_.value()) {
 #ifndef USE_ROCM
       // Increase limit of dynamic shared memory if needed.
       AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuFuncSetAttribute(
diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/third_party/nvfuser/csrc/executor.h
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/executor.h
rename to third_party/nvfuser/csrc/executor.h
index 9d4775b37ca9..4ec71666ba66 100644
--- a/torch/csrc/jit/codegen/cuda/executor.h
+++ b/third_party/nvfuser/csrc/executor.h
@@ -1,13 +1,13 @@
 #pragma once
-#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
-#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
-#include <torch/csrc/jit/codegen/cuda/ir_printer.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <executor_launch_params.h>
+#include <executor_utils.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <ir_cloner.h>
+#include <ir_printer.h>
+#include <kernel_expr_evaluator.h>
+#include <lower2device.h>
+#include <utils.h>
 
 #include <c10/core/DeviceType.h>
 
@@ -261,7 +261,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
   // See:
   // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
   // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-8-x
-  const int max_static_smem_ = 48 << 10;
+  const uint64_t max_static_smem_ = 48 << 10;
   int warp_size_ = 0;
   executor_utils::NvrtcFunction compiled_kernel_;
 
diff --git a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp b/third_party/nvfuser/csrc/executor_kernel_arg.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp
rename to third_party/nvfuser/csrc/executor_kernel_arg.cpp
index bc1ce2a4b7bc..3454146c7eef 100644
--- a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp
+++ b/third_party/nvfuser/csrc/executor_kernel_arg.cpp
@@ -1,9 +1,9 @@
 #include <c10/util/irange.h>
 
 // Extract size and strides
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
+#include <kernel_cache.h>
 
-#include <torch/csrc/jit/codegen/cuda/executor_kernel_arg.h>
+#include <executor_kernel_arg.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h b/third_party/nvfuser/csrc/executor_kernel_arg.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/executor_kernel_arg.h
rename to third_party/nvfuser/csrc/executor_kernel_arg.h
index 32f0eb021821..620f0600fe86 100644
--- a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h
+++ b/third_party/nvfuser/csrc/executor_kernel_arg.h
@@ -3,7 +3,7 @@
 #include <ATen/core/ivalue.h>
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <c10/util/Exception.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <type.h>
 #include <torch/csrc/jit/ir/ir.h>
 #include <array>
 
diff --git a/torch/csrc/jit/codegen/cuda/executor_launch_params.cpp b/third_party/nvfuser/csrc/executor_launch_params.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/executor_launch_params.cpp
rename to third_party/nvfuser/csrc/executor_launch_params.cpp
index 167202b52e83..806ceb963715 100644
--- a/torch/csrc/jit/codegen/cuda/executor_launch_params.cpp
+++ b/third_party/nvfuser/csrc/executor_launch_params.cpp
@@ -1,4 +1,4 @@
-#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
+#include <executor_launch_params.h>
 
 #include <ATen/cuda/CUDAContext.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/executor_launch_params.h b/third_party/nvfuser/csrc/executor_launch_params.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/executor_launch_params.h
rename to third_party/nvfuser/csrc/executor_launch_params.h
index 66bafb250774..9c413f71293a 100644
--- a/torch/csrc/jit/codegen/cuda/executor_launch_params.h
+++ b/third_party/nvfuser/csrc/executor_launch_params.h
@@ -1,5 +1,5 @@
 #pragma once
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <type.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/third_party/nvfuser/csrc/executor_utils.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/executor_utils.cpp
rename to third_party/nvfuser/csrc/executor_utils.cpp
index f32c257708a9..34cc176de9dd 100644
--- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp
+++ b/third_party/nvfuser/csrc/executor_utils.cpp
@@ -1,15 +1,16 @@
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
+#include <ATen/native/cuda/jit_utils.h>
 
 #include <c10/util/irange.h>
 
-#include <torch/csrc/jit/codegen/cuda/contiguity.h>
-#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
+#include <contiguity.h>
+#include <executor_utils.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
 #include <torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h>
 #include <torch/csrc/jit/resource_guard.h>
 
@@ -926,18 +927,6 @@ ExpressionEvaluator bindFusionInputs(
   return expr_eval;
 }
 
-void initializeCudaContext() {
-  // lazily construct context if non-existing yet;
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  CUcontext pctx = nullptr;
-  AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuCtxGetCurrent(&pctx));
-  if (!pctx) {
-    std::unique_lock<std::mutex> cudaFreeMutexLock(
-        *(c10::cuda::getFreeMutex()));
-    C10_CUDA_CHECK(cudaFree(nullptr));
-  }
-}
-
 namespace {
 
 // Dump PTX or CUBIN to a file
@@ -979,7 +968,7 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
         "NVFuser Compile: arch check disabled, should not compile any kernel");
   }
 
-  initializeCudaContext();
+  at::cuda::jit::initializeCudaContext();
 
   std::stringstream ptxas_log;
 
diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.h b/third_party/nvfuser/csrc/executor_utils.h
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/executor_utils.h
rename to third_party/nvfuser/csrc/executor_utils.h
index af3b4d9372d4..9a2c2eafd451 100644
--- a/torch/csrc/jit/codegen/cuda/executor_utils.h
+++ b/third_party/nvfuser/csrc/executor_utils.h
@@ -9,13 +9,13 @@
 
 #include <torch/csrc/jit/ir/ir.h>
 
-#include <torch/csrc/jit/codegen/cuda/executor_kernel_arg.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <executor_kernel_arg.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <kernel.h>
+#include <kernel_expr_evaluator.h>
+#include <lower2device.h>
 
 #include <string>
 #include <vector>
@@ -54,8 +54,6 @@ struct NvrtcFunction {
   CUfunction function = CUfunction();
 };
 
-void initializeCudaContext();
-
 // Returns executable function and the ptxas log from compilation
 std::pair<NvrtcFunction, std::string> nvrtcCompile(
     const std::string& code,
diff --git a/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp b/third_party/nvfuser/csrc/expr_evaluator.cpp
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/expr_evaluator.cpp
rename to third_party/nvfuser/csrc/expr_evaluator.cpp
index 6e1c62811111..4e9948ca8234 100644
--- a/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp
+++ b/third_party/nvfuser/csrc/expr_evaluator.cpp
@@ -1,10 +1,10 @@
 
-#include <torch/csrc/jit/codegen/cuda/evaluator_common.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
+#include <evaluator_common.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_iostream.h>
 
 #include <iostream>
 
diff --git a/torch/csrc/jit/codegen/cuda/expr_evaluator.h b/third_party/nvfuser/csrc/expr_evaluator.h
similarity index 90%
rename from torch/csrc/jit/codegen/cuda/expr_evaluator.h
rename to third_party/nvfuser/csrc/expr_evaluator.h
index 4329f9604304..ecc8cb59f9ff 100644
--- a/torch/csrc/jit/codegen/cuda/expr_evaluator.h
+++ b/third_party/nvfuser/csrc/expr_evaluator.h
@@ -1,9 +1,9 @@
 #pragma once
 
 #include <c10/macros/Export.h>
-#include <torch/csrc/jit/codegen/cuda/dynamic_type.h>
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
+#include <dynamic_type.h>
+#include <ir_interface_nodes.h>
+#include <iter_visitor.h>
 
 #include <c10/util/Optional.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/third_party/nvfuser/csrc/fusion.cpp
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/fusion.cpp
rename to third_party/nvfuser/csrc/fusion.cpp
index e4f24f0473a1..55343043e618 100644
--- a/torch/csrc/jit/codegen/cuda/fusion.cpp
+++ b/third_party/nvfuser/csrc/fusion.cpp
@@ -1,17 +1,17 @@
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/codegen.h>
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
-#include <torch/csrc/jit/codegen/cuda/ir_printer.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_bank_conflict.h>
+#include <arith.h>
+#include <codegen.h>
+#include <disjoint_set.h>
+#include <fusion.h>
+#include <fusion_segmenter.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_cloner.h>
+#include <ir_printer.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <kernel.h>
+#include <lower2device.h>
+#include <lower_bank_conflict.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/fusion.h b/third_party/nvfuser/csrc/fusion.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/fusion.h
rename to third_party/nvfuser/csrc/fusion.h
index 2c0c59fae2b9..56985f1546f2 100644
--- a/torch/csrc/jit/codegen/cuda/fusion.h
+++ b/third_party/nvfuser/csrc/fusion.h
@@ -4,9 +4,9 @@
 #include <c10/macros/Export.h>
 #include <c10/util/Exception.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_container.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
+#include <ir_base_nodes.h>
+#include <ir_container.h>
+#include <iter_visitor.h>
 
 #include <unordered_map>
 #include <unordered_set>
diff --git a/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp b/third_party/nvfuser/csrc/fusion_segmenter.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp
rename to third_party/nvfuser/csrc/fusion_segmenter.cpp
index c0bf81dc688b..5149db603ccd 100644
--- a/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp
+++ b/third_party/nvfuser/csrc/fusion_segmenter.cpp
@@ -1,13 +1,13 @@
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
-#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/debug_utils.h>
+#include <arith.h>
+#include <fusion.h>
+#include <fusion_segmenter.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_cloner.h>
+#include <ir_graphviz.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <scheduler/debug_utils.h>
 
 #include <sstream>
 
diff --git a/torch/csrc/jit/codegen/cuda/fusion_segmenter.h b/third_party/nvfuser/csrc/fusion_segmenter.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/fusion_segmenter.h
rename to third_party/nvfuser/csrc/fusion_segmenter.h
index 5014e708cb95..4e221d2072e1 100644
--- a/torch/csrc/jit/codegen/cuda/fusion_segmenter.h
+++ b/third_party/nvfuser/csrc/fusion_segmenter.h
@@ -1,11 +1,11 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <fusion.h>
+#include <ir_base_nodes.h>
+#include <kernel_cache.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/registry.h>
+#include <utils.h>
 
 #include <deque>
 #include <list>
diff --git a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp b/third_party/nvfuser/csrc/graph_fuser.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/graph_fuser.cpp
rename to third_party/nvfuser/csrc/graph_fuser.cpp
index c2427f938627..6e486d05b7c2 100644
--- a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp
+++ b/third_party/nvfuser/csrc/graph_fuser.cpp
@@ -2,12 +2,11 @@
 
 #include <c10/util/Exception.h>
 #include <c10/util/irange.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/interface.h>
-#include <torch/csrc/jit/codegen/cuda/parser.h>
-#include <torch/csrc/jit/codegen/cuda/partition.h>
-#include <torch/csrc/jit/codegen/cuda/transform_view.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <instrumentation.h>
+#include <parser.h>
+#include <partition.h>
+#include <transform_view.h>
+#include <utils.h>
 #include <torch/csrc/jit/frontend/ir_emitter.h>
 #include <torch/csrc/jit/ir/alias_analysis.h>
 #include <torch/csrc/jit/jit_log.h>
@@ -729,9 +728,7 @@ struct CudaGraphFuser {
     }
 
     bchunk->removeInput(producer_index);
-    // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores,clang-diagnostic-unused-variable)
-    for (const auto i : c10::irange(nchunks)) {
-      (void)i; // Suppress unused variable warning
+    for (const auto _ : c10::irange(nchunks)) {
       bchunk->eraseOutput(nchunks * producer_index);
     }
 
diff --git a/torch/csrc/jit/codegen/cuda/grouped_reduction.cpp b/third_party/nvfuser/csrc/grouped_reduction.cpp
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/grouped_reduction.cpp
rename to third_party/nvfuser/csrc/grouped_reduction.cpp
index d907a0665e9f..7a325601d70c 100644
--- a/torch/csrc/jit/codegen/cuda/grouped_reduction.cpp
+++ b/third_party/nvfuser/csrc/grouped_reduction.cpp
@@ -1,9 +1,9 @@
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
+#include <ir_builder.h>
+#include <ir_utils.h>
+#include <root_domain_map.h>
+#include <transform_iter.h>
 
-#include <torch/csrc/jit/codegen/cuda/grouped_reduction.h>
+#include <grouped_reduction.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/grouped_reduction.h b/third_party/nvfuser/csrc/grouped_reduction.h
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/grouped_reduction.h
rename to third_party/nvfuser/csrc/grouped_reduction.h
index 330a6018446b..52395f01b91c 100644
--- a/torch/csrc/jit/codegen/cuda/grouped_reduction.h
+++ b/third_party/nvfuser/csrc/grouped_reduction.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <ir_all_nodes.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/third_party/nvfuser/csrc/index_compute.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/index_compute.cpp
rename to third_party/nvfuser/csrc/index_compute.cpp
index 9028f93e9a20..4f377e85cb87 100644
--- a/torch/csrc/jit/codegen/cuda/index_compute.cpp
+++ b/third_party/nvfuser/csrc/index_compute.cpp
@@ -1,26 +1,26 @@
-#include <torch/csrc/jit/codegen/cuda/index_compute.h>
+#include <index_compute.h>
 
 #include <c10/util/Exception.h>
 #include <c10/util/irange.h>
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/contiguity.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_double_buffer.h>
-#include <torch/csrc/jit/codegen/cuda/lower_index_compute.h>
-#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
-#include <torch/csrc/jit/codegen/cuda/lower_shift.h>
-#include <torch/csrc/jit/codegen/cuda/lower_unroll.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower_validation.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <arith.h>
+#include <contiguity.h>
+#include <expr_evaluator.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <kernel_expr_evaluator.h>
+#include <lower2device.h>
+#include <lower_double_buffer.h>
+#include <lower_index_compute.h>
+#include <lower_magic_zero.h>
+#include <lower_shift.h>
+#include <lower_unroll.h>
+#include <lower_utils.h>
+#include <lower_validation.h>
+#include <root_domain_map.h>
+#include <transform_iter.h>
+#include <transform_replay.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/index_compute.h b/third_party/nvfuser/csrc/index_compute.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/index_compute.h
rename to third_party/nvfuser/csrc/index_compute.h
index 9a94ee94ac09..00288136a2a9 100644
--- a/torch/csrc/jit/codegen/cuda/index_compute.h
+++ b/third_party/nvfuser/csrc/index_compute.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <iter_visitor.h>
+#include <root_domain_map.h>
 
 #include <unordered_map>
 #include <unordered_set>
diff --git a/torch/csrc/jit/codegen/cuda/inlining.cpp b/third_party/nvfuser/csrc/inlining.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/inlining.cpp
rename to third_party/nvfuser/csrc/inlining.cpp
index da6d229c68f8..50782b37d893 100644
--- a/torch/csrc/jit/codegen/cuda/inlining.cpp
+++ b/third_party/nvfuser/csrc/inlining.cpp
@@ -1,7 +1,7 @@
-#include <torch/csrc/jit/codegen/cuda/inlining.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
+#include <inlining.h>
+#include <ir_utils.h>
+#include <root_domain_map.h>
+#include <transform_iter.h>
 
 #include <utility>
 
@@ -210,7 +210,7 @@ FindMappedPositions::FindMappedPositions(
     reference_pos += int64_t(reference->nDims()) + 1;
   }
   TORCH_CHECK(
-      reference_pos >= 0 && reference_pos <= reference->nDims(),
+      reference_pos >= 0 && reference_pos <= int64_t(reference->nDims()),
       "Invalid axis received ",
       reference_pos,
       " but should be > -",
diff --git a/torch/csrc/jit/codegen/cuda/inlining.h b/third_party/nvfuser/csrc/inlining.h
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/inlining.h
rename to third_party/nvfuser/csrc/inlining.h
index 3b15eb23f987..7e9600f023d0 100644
--- a/torch/csrc/jit/codegen/cuda/inlining.h
+++ b/third_party/nvfuser/csrc/inlining.h
@@ -1,8 +1,8 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/maxinfo_propagator.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <ir_interface_nodes.h>
+#include <maxinfo_propagator.h>
+#include <transform_replay.h>
 
 #include <memory>
 #include <unordered_set>
diff --git a/torch/csrc/jit/codegen/cuda/instrumentation.cpp b/third_party/nvfuser/csrc/instrumentation.cpp
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/instrumentation.cpp
rename to third_party/nvfuser/csrc/instrumentation.cpp
index 2d570ce5b9d4..121a8a2d398b 100644
--- a/torch/csrc/jit/codegen/cuda/instrumentation.cpp
+++ b/third_party/nvfuser/csrc/instrumentation.cpp
@@ -1,4 +1,4 @@
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
+#include <instrumentation.h>
 
 #include <c10/macros/Export.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/instrumentation.h b/third_party/nvfuser/csrc/instrumentation.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/instrumentation.h
rename to third_party/nvfuser/csrc/instrumentation.h
index ef89fcd66090..cd57825a248e 100644
--- a/torch/csrc/jit/codegen/cuda/instrumentation.h
+++ b/third_party/nvfuser/csrc/instrumentation.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <utils.h>
 
 #include <nvToolsExt.h>
 
diff --git a/third_party/nvfuser/csrc/ir_all_nodes.h b/third_party/nvfuser/csrc/ir_all_nodes.h
new file mode 100644
index 000000000000..f80c4d714c08
--- /dev/null
+++ b/third_party/nvfuser/csrc/ir_all_nodes.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include <ir_base_nodes.h>
+#include <ir_interface_nodes.h>
+#include <ir_internal_nodes.h>
+
+// TODO: remove this once the Kernel IR split is complete
+#include <kernel_ir.h>
diff --git a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp b/third_party/nvfuser/csrc/ir_base_nodes.cpp
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp
rename to third_party/nvfuser/csrc/ir_base_nodes.cpp
index ff00f659da63..4b53af45e762 100644
--- a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp
+++ b/third_party/nvfuser/csrc/ir_base_nodes.cpp
@@ -1,14 +1,14 @@
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
-#include <torch/csrc/jit/codegen/cuda/ir_printer.h>
-#include <torch/csrc/jit/codegen/cuda/kernel.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/mutator.h>
+#include <dispatch.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <ir_cloner.h>
+#include <ir_printer.h>
+#include <kernel.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <mutator.h>
 
 #include <torch/csrc/jit/ir/ir.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/ir_base_nodes.h b/third_party/nvfuser/csrc/ir_base_nodes.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/ir_base_nodes.h
rename to third_party/nvfuser/csrc/ir_base_nodes.h
index dadabe167ebf..c46d4389596e 100644
--- a/torch/csrc/jit/codegen/cuda/ir_base_nodes.h
+++ b/third_party/nvfuser/csrc/ir_base_nodes.h
@@ -5,8 +5,8 @@
 #include <c10/util/Exception.h>
 #include <c10/util/Optional.h>
 
-#include <torch/csrc/jit/codegen/cuda/type.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <type.h>
+#include <utils.h>
 
 #include <cstdint>
 #include <iostream>
diff --git a/torch/csrc/jit/codegen/cuda/ir_builder.cpp b/third_party/nvfuser/csrc/ir_builder.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/ir_builder.cpp
rename to third_party/nvfuser/csrc/ir_builder.cpp
index f0fd438c1567..cfbb455e0a4a 100644
--- a/torch/csrc/jit/codegen/cuda/ir_builder.cpp
+++ b/third_party/nvfuser/csrc/ir_builder.cpp
@@ -1,7 +1,7 @@
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
-#include <torch/csrc/jit/codegen/cuda/kernel.h>
+#include <fusion.h>
+#include <ir_builder.h>
+#include <ir_cloner.h>
+#include <kernel.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/ir_builder.h b/third_party/nvfuser/csrc/ir_builder.h
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/ir_builder.h
rename to third_party/nvfuser/csrc/ir_builder.h
index af0e8cb1cc35..21031997ab56 100644
--- a/torch/csrc/jit/codegen/cuda/ir_builder.h
+++ b/third_party/nvfuser/csrc/ir_builder.h
@@ -1,8 +1,8 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_container.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <ir_container.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/ir_cloner.cpp b/third_party/nvfuser/csrc/ir_cloner.cpp
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/ir_cloner.cpp
rename to third_party/nvfuser/csrc/ir_cloner.cpp
index 489be49ddfc7..8d2f6babaa78 100644
--- a/torch/csrc/jit/codegen/cuda/ir_cloner.cpp
+++ b/third_party/nvfuser/csrc/ir_cloner.cpp
@@ -1,8 +1,8 @@
-#include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
+#include <ir_cloner.h>
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/ir_cloner.h b/third_party/nvfuser/csrc/ir_cloner.h
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/ir_cloner.h
rename to third_party/nvfuser/csrc/ir_cloner.h
index 06e1ec3359d9..116f8074beae 100644
--- a/torch/csrc/jit/codegen/cuda/ir_cloner.h
+++ b/third_party/nvfuser/csrc/ir_cloner.h
@@ -1,8 +1,8 @@
 #pragma once
 
 #include <c10/macros/Export.h>
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
+#include <dispatch.h>
+#include <ir_builder.h>
 
 #include <unordered_map>
 #include <vector>
diff --git a/torch/csrc/jit/codegen/cuda/ir_container.cpp b/third_party/nvfuser/csrc/ir_container.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/ir_container.cpp
rename to third_party/nvfuser/csrc/ir_container.cpp
index e84418eb9733..2d7f8f8e6733 100644
--- a/torch/csrc/jit/codegen/cuda/ir_container.cpp
+++ b/third_party/nvfuser/csrc/ir_container.cpp
@@ -1,7 +1,7 @@
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
-#include <torch/csrc/jit/codegen/cuda/ir_container.h>
+#include <instrumentation.h>
+#include <ir_builder.h>
+#include <ir_cloner.h>
+#include <ir_container.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/ir_container.h b/third_party/nvfuser/csrc/ir_container.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/ir_container.h
rename to third_party/nvfuser/csrc/ir_container.h
index fb1aaeaf383c..43aabaeb8aee 100644
--- a/torch/csrc/jit/codegen/cuda/ir_container.h
+++ b/third_party/nvfuser/csrc/ir_container.h
@@ -2,8 +2,8 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <ir_base_nodes.h>
+#include <utils.h>
 
 #include <deque>
 #include <unordered_map>
diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp b/third_party/nvfuser/csrc/ir_graphviz.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/ir_graphviz.cpp
rename to third_party/nvfuser/csrc/ir_graphviz.cpp
index 6c04e4214b07..6f6391dcea2e 100644
--- a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp
+++ b/third_party/nvfuser/csrc/ir_graphviz.cpp
@@ -1,9 +1,9 @@
-#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
+#include <ir_graphviz.h>
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <type.h>
 
 #include <fstream>
 
diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.h b/third_party/nvfuser/csrc/ir_graphviz.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/ir_graphviz.h
rename to third_party/nvfuser/csrc/ir_graphviz.h
index 1f555ed31ec0..73c2282f7e1f 100644
--- a/torch/csrc/jit/codegen/cuda/ir_graphviz.h
+++ b/third_party/nvfuser/csrc/ir_graphviz.h
@@ -1,7 +1,7 @@
 #pragma once
 
 #include <c10/macros/Export.h>
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
+#include <dispatch.h>
 
 #include <sstream>
 #include <string>
diff --git a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h b/third_party/nvfuser/csrc/ir_interface_nodes.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/ir_interface_nodes.h
rename to third_party/nvfuser/csrc/ir_interface_nodes.h
index dbefc4858d11..1dd8879faf8b 100644
--- a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h
+++ b/third_party/nvfuser/csrc/ir_interface_nodes.h
@@ -2,10 +2,10 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_internal_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/mma_type.h>
+#include <fusion.h>
+#include <ir_base_nodes.h>
+#include <ir_internal_nodes.h>
+#include <mma_type.h>
 
 #include <torch/csrc/jit/ir/ir.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h b/third_party/nvfuser/csrc/ir_internal_nodes.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/ir_internal_nodes.h
rename to third_party/nvfuser/csrc/ir_internal_nodes.h
index d34b3a9f89c5..5c13efc79526 100644
--- a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h
+++ b/third_party/nvfuser/csrc/ir_internal_nodes.h
@@ -2,10 +2,10 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/mma_type.h>
-#include <torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h>
+#include <fusion.h>
+#include <ir_base_nodes.h>
+#include <mma_type.h>
+#include <parallel_type_bitmap.h>
 
 //! Nodes in here should generally not be used by users. They should be behind
 //! the scenes and users shouldn't have to be aware of what they do to use the
diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp b/third_party/nvfuser/csrc/ir_iostream.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/ir_iostream.cpp
rename to third_party/nvfuser/csrc/ir_iostream.cpp
index e13273c8e75e..d9de6bb8a257 100644
--- a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp
+++ b/third_party/nvfuser/csrc/ir_iostream.cpp
@@ -1,12 +1,12 @@
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_printer.h>
-
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/kernel.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
+#include <ir_iostream.h>
+#include <ir_printer.h>
+
+#include <fusion.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_utils.h>
+#include <kernel.h>
+#include <lower_utils.h>
 
 #include <c10/util/irange.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.h b/third_party/nvfuser/csrc/ir_iostream.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/ir_iostream.h
rename to third_party/nvfuser/csrc/ir_iostream.h
index 599e50286d29..80d2311f1f59 100644
--- a/torch/csrc/jit/codegen/cuda/ir_iostream.h
+++ b/third_party/nvfuser/csrc/ir_iostream.h
@@ -2,7 +2,7 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
+#include <dispatch.h>
 
 #include <c10/util/irange.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp b/third_party/nvfuser/csrc/ir_nodes.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/ir_nodes.cpp
rename to third_party/nvfuser/csrc/ir_nodes.cpp
index c4d994f272be..3a14887d2866 100644
--- a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp
+++ b/third_party/nvfuser/csrc/ir_nodes.cpp
@@ -1,16 +1,16 @@
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/kernel.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
-#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
-#include <torch/csrc/jit/codegen/cuda/transform_view.h>
+#include <arith.h>
+#include <disjoint_set.h>
+#include <ir_cloner.h>
+#include <ir_interface_nodes.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <kernel.h>
+#include <kernel_ir.h>
+#include <lower2device.h>
+#include <root_domain_map.h>
+#include <transform_iter.h>
+#include <transform_rfactor.h>
+#include <transform_view.h>
 
 #include <c10/util/irange.h>
 
@@ -2560,17 +2560,19 @@ TensorDomain* TensorDomain::flatten(int64_t start_dim, int64_t end_dim) {
     end_dim += inp_domain.size();
   }
   TORCH_CHECK(
-      start_dim >= 0 && start_dim < inp_domain.size(),
+      start_dim >= 0 && start_dim < int64_t(inp_domain.size()),
       "Invalid start_dim ",
       start_dim);
   TORCH_CHECK(
-      end_dim >= 0 && end_dim < inp_domain.size(), "Invalid end_dim ", end_dim);
+      end_dim >= 0 && end_dim < int64_t(inp_domain.size()),
+      "Invalid end_dim ",
+      end_dim);
   TORCH_CHECK(start_dim <= end_dim, "start_dim must be <= end_dim");
 
   std::vector<IterDomain*> new_root_domain;
   new_root_domain.reserve(inp_domain.size());
   for (auto i : c10::irange(inp_domain.size())) {
-    bool is_rfactor_dim = i >= start_dim && i <= end_dim;
+    bool is_rfactor_dim = i >= size_t(start_dim) && i <= size_t(end_dim);
     auto inp_id = inp_domain[i];
     auto out_id = IterDomainBuilder(inp_id)
                       .is_rfactor_domain(is_rfactor_dim)
diff --git a/torch/csrc/jit/codegen/cuda/ir_printer.h b/third_party/nvfuser/csrc/ir_printer.h
similarity index 93%
rename from torch/csrc/jit/codegen/cuda/ir_printer.h
rename to third_party/nvfuser/csrc/ir_printer.h
index 2cc0177787fb..d95895022e8e 100644
--- a/torch/csrc/jit/codegen/cuda/ir_printer.h
+++ b/third_party/nvfuser/csrc/ir_printer.h
@@ -2,8 +2,8 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
+#include <ir_iostream.h>
+#include <iter_visitor.h>
 
 #include <iostream>
 
diff --git a/torch/csrc/jit/codegen/cuda/ir_utils.cpp b/third_party/nvfuser/csrc/ir_utils.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/ir_utils.cpp
rename to third_party/nvfuser/csrc/ir_utils.cpp
index dba5ee10adab..7863aca74daa 100644
--- a/torch/csrc/jit/codegen/cuda/ir_utils.cpp
+++ b/third_party/nvfuser/csrc/ir_utils.cpp
@@ -1,9 +1,9 @@
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
+#include <arith.h>
+#include <fusion.h>
+#include <ir_builder.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <lower_utils.h>
 
 #include <set>
 
@@ -569,7 +569,7 @@ std::vector<T*> uniqueEntries(const std::vector<T*>& tv_deuqe) {
 } // namespace
 
 // Return immediate producers of val
-TORCH_CUDA_CU_API std::vector<Val*> producerValsOf(Val* val) {
+std::vector<Val*> producerValsOf(Val* val) {
   if (val->definition() == nullptr) {
     return {};
   }
@@ -578,7 +578,7 @@ TORCH_CUDA_CU_API std::vector<Val*> producerValsOf(Val* val) {
 }
 
 // Return immediate consumers of val
-TORCH_CUDA_CU_API std::vector<Val*> consumerValsOf(Val* val) {
+std::vector<Val*> consumerValsOf(Val* val) {
   std::vector<Val*> consumer_vals;
   for (auto use_expr : val->uses()) {
     auto outputs = use_expr->outputs();
@@ -588,7 +588,7 @@ TORCH_CUDA_CU_API std::vector<Val*> consumerValsOf(Val* val) {
 }
 
 // Return immediate siblings of val
-TORCH_CUDA_CU_API std::vector<Val*> siblingValsOf(Val* val) {
+std::vector<Val*> siblingValsOf(Val* val) {
   std::vector<Val*> sibling_vals;
   auto def = val->definition();
   if (def != nullptr) {
@@ -604,8 +604,7 @@ TORCH_CUDA_CU_API std::vector<Val*> siblingValsOf(Val* val) {
 }
 
 // Return immediate producers of val
-TORCH_CUDA_CU_API std::vector<Val*> producerValsOf(
-    const std::vector<Val*>& vals) {
+std::vector<Val*> producerValsOf(const std::vector<Val*>& vals) {
   std::vector<Val*> all_producer_vals;
   for (auto val : vals) {
     auto producer_vals = producerValsOf(val);
@@ -617,8 +616,7 @@ TORCH_CUDA_CU_API std::vector<Val*> producerValsOf(
 }
 
 // Return immediate consumers of val
-TORCH_CUDA_CU_API std::vector<Val*> consumerValsOf(
-    const std::vector<Val*>& vals) {
+std::vector<Val*> consumerValsOf(const std::vector<Val*>& vals) {
   std::vector<Val*> all_consumer_vals;
   for (auto val : vals) {
     auto consumer_vals = consumerValsOf(val);
@@ -641,7 +639,7 @@ std::vector<TensorView*> consumerTvsOf(TensorView* tv) {
   return {consumer_tvs.begin(), consumer_tvs.end()};
 }
 
-TORCH_CUDA_CU_API std::vector<TensorView*> siblingTvsOf(TensorView* tv) {
+std::vector<TensorView*> siblingTvsOf(TensorView* tv) {
   auto sibling_vals = siblingValsOf(tv);
   auto sibling_tvs = ir_utils::filterByType<TensorView>(sibling_vals);
   return {sibling_tvs.begin(), sibling_tvs.end()};
@@ -879,7 +877,7 @@ bool isReductionTvOp(const Expr* expr) {
   return ir_utils::isTvOp(expr) && isReductionOp(expr);
 }
 
-TORCH_CUDA_CU_API std::vector<ViewOp*> getViewOps(Fusion* fusion) {
+std::vector<ViewOp*> getViewOps(Fusion* fusion) {
   auto all_exprs = fusion->exprs();
 
   auto all_view_ops = ir_utils::filterByType<ViewOp>(all_exprs);
diff --git a/torch/csrc/jit/codegen/cuda/ir_utils.h b/third_party/nvfuser/csrc/ir_utils.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/ir_utils.h
rename to third_party/nvfuser/csrc/ir_utils.h
index adfc64fc74ad..cfad4b849a8a 100644
--- a/torch/csrc/jit/codegen/cuda/ir_utils.h
+++ b/third_party/nvfuser/csrc/ir_utils.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <ir_all_nodes.h>
+#include <type.h>
 
 #include <iterator>
 #include <unordered_map>
diff --git a/torch/csrc/jit/codegen/cuda/iter_visitor.cpp b/third_party/nvfuser/csrc/iter_visitor.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/iter_visitor.cpp
rename to third_party/nvfuser/csrc/iter_visitor.cpp
index 984a22194a20..4599b41f0890 100644
--- a/torch/csrc/jit/codegen/cuda/iter_visitor.cpp
+++ b/third_party/nvfuser/csrc/iter_visitor.cpp
@@ -1,10 +1,10 @@
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
+#include <iter_visitor.h>
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <type.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/iter_visitor.h b/third_party/nvfuser/csrc/iter_visitor.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/iter_visitor.h
rename to third_party/nvfuser/csrc/iter_visitor.h
index 3ad485f1a17b..53a686f82605 100644
--- a/torch/csrc/jit/codegen/cuda/iter_visitor.h
+++ b/third_party/nvfuser/csrc/iter_visitor.h
@@ -2,8 +2,8 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <dispatch.h>
+#include <type.h>
 
 #include <deque>
 #include <unordered_set>
diff --git a/torch/csrc/jit/codegen/cuda/kernel.cpp b/third_party/nvfuser/csrc/kernel.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/kernel.cpp
rename to third_party/nvfuser/csrc/kernel.cpp
index 9e5211604972..5e3232f81ceb 100644
--- a/torch/csrc/jit/codegen/cuda/kernel.cpp
+++ b/third_party/nvfuser/csrc/kernel.cpp
@@ -1,9 +1,9 @@
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/kernel.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <kernel.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
 
 #include <ATen/cuda/CUDAContext.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/kernel.h b/third_party/nvfuser/csrc/kernel.h
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/kernel.h
rename to third_party/nvfuser/csrc/kernel.h
index e2a0e57ed68f..9da44bfe2745 100644
--- a/torch/csrc/jit/codegen/cuda/kernel.h
+++ b/third_party/nvfuser/csrc/kernel.h
@@ -2,14 +2,14 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/lower_sync_information.h>
-#include <torch/csrc/jit/codegen/cuda/lower_warp_reduce.h>
-#include <torch/csrc/jit/codegen/cuda/parallel_dimension_map.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
-#include <torch/csrc/jit/codegen/cuda/vectorization_info.h>
+#include <fusion.h>
+#include <ir_base_nodes.h>
+#include <ir_builder.h>
+#include <lower_sync_information.h>
+#include <lower_warp_reduce.h>
+#include <parallel_dimension_map.h>
+#include <utils.h>
+#include <vectorization_info.h>
 
 #include <memory>
 #include <unordered_map>
diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/third_party/nvfuser/csrc/kernel_cache.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/kernel_cache.cpp
rename to third_party/nvfuser/csrc/kernel_cache.cpp
index c4604042bfae..4c7c86c6f5a7 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp
+++ b/third_party/nvfuser/csrc/kernel_cache.cpp
@@ -1,10 +1,10 @@
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
+#include <kernel_cache.h>
 
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/parser.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/debug_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
+#include <instrumentation.h>
+#include <ir_utils.h>
+#include <parser.h>
+#include <scheduler/debug_utils.h>
+#include <scheduler/registry.h>
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/runtime/graph_executor.h>
 
@@ -209,7 +209,7 @@ std::vector<at::Tensor> FusionExecutorCache::runFusionWithInputs(
   // permute output tensor returned by kernel execution. See Part_3 in Note [
   // Permutation support in nvfuser ]
   for (const auto& pair : fusion_->getPermutationOutputMap()) {
-    if (pair.first < outputs.size()) {
+    if (size_t(pair.first) < outputs.size()) {
       outputs[pair.first] = outputs[pair.first].permute(pair.second);
     }
   }
diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.h b/third_party/nvfuser/csrc/kernel_cache.h
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/kernel_cache.h
rename to third_party/nvfuser/csrc/kernel_cache.h
index a8a0f1cf4f62..12820dcf12d2 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_cache.h
+++ b/third_party/nvfuser/csrc/kernel_cache.h
@@ -1,11 +1,11 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/evaluator_common.h>
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
+#include <evaluator_common.h>
+#include <executor.h>
+#include <fusion.h>
+#include <fusion_segmenter.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/registry.h>
 
 #include <c10/macros/Export.h>
 #include <c10/util/ArrayRef.h>
diff --git a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp b/third_party/nvfuser/csrc/kernel_expr_evaluator.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp
rename to third_party/nvfuser/csrc/kernel_expr_evaluator.cpp
index 15a18a6bca83..9eb518159c22 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp
+++ b/third_party/nvfuser/csrc/kernel_expr_evaluator.cpp
@@ -1,6 +1,6 @@
 
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
+#include <instrumentation.h>
+#include <kernel_expr_evaluator.h>
 
 #include <iostream>
 
diff --git a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h b/third_party/nvfuser/csrc/kernel_expr_evaluator.h
similarity index 89%
rename from torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h
rename to third_party/nvfuser/csrc/kernel_expr_evaluator.h
index 8df365dfdc58..82dcd5179a6a 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h
+++ b/third_party/nvfuser/csrc/kernel_expr_evaluator.h
@@ -3,10 +3,10 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/dynamic_type.h>
-#include <torch/csrc/jit/codegen/cuda/evaluator_common.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <dispatch.h>
+#include <dynamic_type.h>
+#include <evaluator_common.h>
+#include <kernel_ir.h>
 
 #include <c10/util/Optional.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp b/third_party/nvfuser/csrc/kernel_ir.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/kernel_ir.cpp
rename to third_party/nvfuser/csrc/kernel_ir.cpp
index 7e69f0307a7a..e6dcbf5d773f 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp
+++ b/third_party/nvfuser/csrc/kernel_ir.cpp
@@ -1,10 +1,10 @@
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/kernel.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <ir_builder.h>
+#include <kernel.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir.h>
+#include <lower2device.h>
+#include <lower_utils.h>
+#include <type.h>
 
 #include <iostream>
 
diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.h b/third_party/nvfuser/csrc/kernel_ir.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/kernel_ir.h
rename to third_party/nvfuser/csrc/kernel_ir.h
index cd44e8d8e21b..6650ebd873e9 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_ir.h
+++ b/third_party/nvfuser/csrc/kernel_ir.h
@@ -1,10 +1,10 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <ir_all_nodes.h>
+#include <ir_base_nodes.h>
+#include <parallel_type_bitmap.h>
+#include <type.h>
+#include <utils.h>
 
 #include <c10/macros/Export.h>
 #include <c10/util/Optional.h>
diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp b/third_party/nvfuser/csrc/kernel_ir_dispatch.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp
rename to third_party/nvfuser/csrc/kernel_ir_dispatch.cpp
index 665e8d81532e..a46d3f4dcb86 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp
+++ b/third_party/nvfuser/csrc/kernel_ir_dispatch.cpp
@@ -1,5 +1,5 @@
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h b/third_party/nvfuser/csrc/kernel_ir_dispatch.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h
rename to third_party/nvfuser/csrc/kernel_ir_dispatch.h
index 139b4c37d45f..15a25ef4c967 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h
+++ b/third_party/nvfuser/csrc/kernel_ir_dispatch.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
+#include <dispatch.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower2device.cpp b/third_party/nvfuser/csrc/lower2device.cpp
similarity index 89%
rename from torch/csrc/jit/codegen/cuda/lower2device.cpp
rename to third_party/nvfuser/csrc/lower2device.cpp
index 142ee1b7a02f..ec4c68cf50b9 100644
--- a/torch/csrc/jit/codegen/cuda/lower2device.cpp
+++ b/third_party/nvfuser/csrc/lower2device.cpp
@@ -1,31 +1,31 @@
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <lower2device.h>
 
 #include <ATen/cuda/CUDAContext.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower_alias_memory.h>
-#include <torch/csrc/jit/codegen/cuda/lower_allocation.h>
-#include <torch/csrc/jit/codegen/cuda/lower_divisible_split.h>
-#include <torch/csrc/jit/codegen/cuda/lower_double_buffer.h>
-#include <torch/csrc/jit/codegen/cuda/lower_expr_sort.h>
-#include <torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h>
-#include <torch/csrc/jit/codegen/cuda/lower_index.h>
-#include <torch/csrc/jit/codegen/cuda/lower_insert_syncs.h>
-#include <torch/csrc/jit/codegen/cuda/lower_instrument.h>
-#include <torch/csrc/jit/codegen/cuda/lower_loops.h>
-#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
-#include <torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h>
-#include <torch/csrc/jit/codegen/cuda/lower_predicate.h>
-#include <torch/csrc/jit/codegen/cuda/lower_replace_size.h>
-#include <torch/csrc/jit/codegen/cuda/lower_shift.h>
-#include <torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h>
-#include <torch/csrc/jit/codegen/cuda/lower_unroll.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower_validation.h>
-#include <torch/csrc/jit/codegen/cuda/lower_warp_reduce.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <lower_alias_memory.h>
+#include <lower_allocation.h>
+#include <lower_divisible_split.h>
+#include <lower_double_buffer.h>
+#include <lower_expr_sort.h>
+#include <lower_fusion_simplifier.h>
+#include <lower_index.h>
+#include <lower_insert_syncs.h>
+#include <lower_instrument.h>
+#include <lower_loops.h>
+#include <lower_magic_zero.h>
+#include <lower_misaligned_vectorization.h>
+#include <lower_predicate.h>
+#include <lower_replace_size.h>
+#include <lower_shift.h>
+#include <lower_trivial_reductions.h>
+#include <lower_unroll.h>
+#include <lower_utils.h>
+#include <lower_validation.h>
+#include <lower_warp_reduce.h>
 
 #include <list>
 #include <unordered_map>
diff --git a/torch/csrc/jit/codegen/cuda/lower2device.h b/third_party/nvfuser/csrc/lower2device.h
similarity index 83%
rename from torch/csrc/jit/codegen/cuda/lower2device.h
rename to third_party/nvfuser/csrc/lower2device.h
index 250b06a6495f..9dbbd67f055e 100644
--- a/torch/csrc/jit/codegen/cuda/lower2device.h
+++ b/third_party/nvfuser/csrc/lower2device.h
@@ -2,27 +2,27 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/compute_at_map.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/lower_allocation.h>
-#include <torch/csrc/jit/codegen/cuda/lower_double_buffer.h>
-#include <torch/csrc/jit/codegen/cuda/lower_fused_reduction.h>
-#include <torch/csrc/jit/codegen/cuda/lower_index_hoist.h>
-#include <torch/csrc/jit/codegen/cuda/lower_predicate.h>
-#include <torch/csrc/jit/codegen/cuda/lower_predicate_elimination.h>
-#include <torch/csrc/jit/codegen/cuda/lower_shift.h>
-#include <torch/csrc/jit/codegen/cuda/lower_sync_information.h>
-#include <torch/csrc/jit/codegen/cuda/lower_thread_predicate.h>
-#include <torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h>
-#include <torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h>
-#include <torch/csrc/jit/codegen/cuda/lower_warp_reduce.h>
-#include <torch/csrc/jit/codegen/cuda/non_divisible_split.h>
-#include <torch/csrc/jit/codegen/cuda/parallel_dimension_map.h>
-#include <torch/csrc/jit/codegen/cuda/partial_split_map.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/vectorization_info.h>
+#include <compute_at_map.h>
+#include <ir_all_nodes.h>
+#include <kernel.h>
+#include <kernel_ir.h>
+#include <lower_allocation.h>
+#include <lower_double_buffer.h>
+#include <lower_fused_reduction.h>
+#include <lower_index_hoist.h>
+#include <lower_predicate.h>
+#include <lower_predicate_elimination.h>
+#include <lower_shift.h>
+#include <lower_sync_information.h>
+#include <lower_thread_predicate.h>
+#include <lower_trivial_broadcast.h>
+#include <lower_trivial_reductions.h>
+#include <lower_warp_reduce.h>
+#include <non_divisible_split.h>
+#include <parallel_dimension_map.h>
+#include <partial_split_map.h>
+#include <root_domain_map.h>
+#include <vectorization_info.h>
 
 #include <memory>
 #include <ostream>
diff --git a/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp b/third_party/nvfuser/csrc/lower_alias_memory.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp
rename to third_party/nvfuser/csrc/lower_alias_memory.cpp
index ef12cce8fd46..e66ba4f474dc 100644
--- a/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp
+++ b/third_party/nvfuser/csrc/lower_alias_memory.cpp
@@ -1,12 +1,12 @@
-#include <torch/csrc/jit/codegen/cuda/lower_alias_memory.h>
-
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
+#include <lower_alias_memory.h>
+
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir.h>
+#include <lower2device.h>
+#include <lower_utils.h>
 
 #include <sstream>
 #include <unordered_map>
diff --git a/torch/csrc/jit/codegen/cuda/lower_alias_memory.h b/third_party/nvfuser/csrc/lower_alias_memory.h
similarity index 89%
rename from torch/csrc/jit/codegen/cuda/lower_alias_memory.h
rename to third_party/nvfuser/csrc/lower_alias_memory.h
index 0d144b9f2f40..105484a57d81 100644
--- a/torch/csrc/jit/codegen/cuda/lower_alias_memory.h
+++ b/third_party/nvfuser/csrc/lower_alias_memory.h
@@ -2,8 +2,8 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <dispatch.h>
+#include <ir_all_nodes.h>
 
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_allocation.cpp b/third_party/nvfuser/csrc/lower_allocation.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_allocation.cpp
rename to third_party/nvfuser/csrc/lower_allocation.cpp
index 264905cfa213..ae3ef4f94b4e 100644
--- a/torch/csrc/jit/codegen/cuda/lower_allocation.cpp
+++ b/third_party/nvfuser/csrc/lower_allocation.cpp
@@ -1,10 +1,10 @@
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_allocation.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_allocation.h>
 
 #include <unordered_set>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_allocation.h b/third_party/nvfuser/csrc/lower_allocation.h
similarity index 86%
rename from torch/csrc/jit/codegen/cuda/lower_allocation.h
rename to third_party/nvfuser/csrc/lower_allocation.h
index 45ebeac03f77..cbac9f9eefcd 100644
--- a/torch/csrc/jit/codegen/cuda/lower_allocation.h
+++ b/third_party/nvfuser/csrc/lower_allocation.h
@@ -2,8 +2,8 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir.h>
 
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_bank_conflict.cpp b/third_party/nvfuser/csrc/lower_bank_conflict.cpp
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/lower_bank_conflict.cpp
rename to third_party/nvfuser/csrc/lower_bank_conflict.cpp
index 0b97b973f786..9ed567c4d56b 100644
--- a/torch/csrc/jit/codegen/cuda/lower_bank_conflict.cpp
+++ b/third_party/nvfuser/csrc/lower_bank_conflict.cpp
@@ -1,10 +1,10 @@
-#include <torch/csrc/jit/codegen/cuda/lower_bank_conflict.h>
+#include <lower_bank_conflict.h>
 
-#include <torch/csrc/jit/codegen/cuda/dynamic_type.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <dynamic_type.h>
+#include <expr_evaluator.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <type.h>
 
 #include <unordered_set>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_bank_conflict.h b/third_party/nvfuser/csrc/lower_bank_conflict.h
similarity index 87%
rename from torch/csrc/jit/codegen/cuda/lower_bank_conflict.h
rename to third_party/nvfuser/csrc/lower_bank_conflict.h
index b651c4ed33e2..a82c40c7ad38 100644
--- a/torch/csrc/jit/codegen/cuda/lower_bank_conflict.h
+++ b/third_party/nvfuser/csrc/lower_bank_conflict.h
@@ -1,9 +1,9 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/dynamic_type.h>
-#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
-#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel.h>
+#include <dynamic_type.h>
+#include <executor_launch_params.h>
+#include <ir_base_nodes.h>
+#include <kernel.h>
 
 #include <unordered_map>
 #include <utility>
diff --git a/torch/csrc/jit/codegen/cuda/lower_divisible_split.cpp b/third_party/nvfuser/csrc/lower_divisible_split.cpp
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/lower_divisible_split.cpp
rename to third_party/nvfuser/csrc/lower_divisible_split.cpp
index c1de1201e5d1..4a93be69d0f7 100644
--- a/torch/csrc/jit/codegen/cuda/lower_divisible_split.cpp
+++ b/third_party/nvfuser/csrc/lower_divisible_split.cpp
@@ -1,8 +1,8 @@
 
-#include <torch/csrc/jit/codegen/cuda/lower_divisible_split.h>
+#include <lower_divisible_split.h>
 
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
+#include <disjoint_set.h>
+#include <ir_utils.h>
 
 #include <unordered_set>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_divisible_split.h b/third_party/nvfuser/csrc/lower_divisible_split.h
similarity index 82%
rename from torch/csrc/jit/codegen/cuda/lower_divisible_split.h
rename to third_party/nvfuser/csrc/lower_divisible_split.h
index f2c4a78e4895..f69a9f14e6fa 100644
--- a/torch/csrc/jit/codegen/cuda/lower_divisible_split.h
+++ b/third_party/nvfuser/csrc/lower_divisible_split.h
@@ -2,9 +2,9 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/compute_at_map.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <compute_at_map.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp b/third_party/nvfuser/csrc/lower_double_buffer.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp
rename to third_party/nvfuser/csrc/lower_double_buffer.cpp
index 9d3482c2d1d4..cf154f59e37a 100644
--- a/torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp
+++ b/third_party/nvfuser/csrc/lower_double_buffer.cpp
@@ -1,8 +1,8 @@
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <ir_utils.h>
+#include <kernel_ir.h>
+#include <lower2device.h>
 
-#include <torch/csrc/jit/codegen/cuda/lower_double_buffer.h>
+#include <lower_double_buffer.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_double_buffer.h b/third_party/nvfuser/csrc/lower_double_buffer.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_double_buffer.h
rename to third_party/nvfuser/csrc/lower_double_buffer.h
index 6f961451d0b4..d7741bbc8276 100644
--- a/torch/csrc/jit/codegen/cuda/lower_double_buffer.h
+++ b/third_party/nvfuser/csrc/lower_double_buffer.h
@@ -2,9 +2,9 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
 
 // Double buffering a tensor doubles its allocation size and uses two
 // buffers to facilitate computation and memory access
diff --git a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp b/third_party/nvfuser/csrc/lower_expr_sort.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp
rename to third_party/nvfuser/csrc/lower_expr_sort.cpp
index 5b659e3e9460..312f8770ca1c 100644
--- a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp
+++ b/third_party/nvfuser/csrc/lower_expr_sort.cpp
@@ -1,12 +1,12 @@
-#include <torch/csrc/jit/codegen/cuda/compute_at_map.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_expr_sort.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
+#include <compute_at_map.h>
+#include <fusion.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <lower2device.h>
+#include <lower_expr_sort.h>
+#include <lower_utils.h>
 
 #include <deque>
 #include <list>
diff --git a/torch/csrc/jit/codegen/cuda/lower_expr_sort.h b/third_party/nvfuser/csrc/lower_expr_sort.h
similarity index 79%
rename from torch/csrc/jit/codegen/cuda/lower_expr_sort.h
rename to third_party/nvfuser/csrc/lower_expr_sort.h
index 4b44541c6fb4..b23b45f92fe1 100644
--- a/torch/csrc/jit/codegen/cuda/lower_expr_sort.h
+++ b/third_party/nvfuser/csrc/lower_expr_sort.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
+#include <ir_base_nodes.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_fused_reduction.cpp b/third_party/nvfuser/csrc/lower_fused_reduction.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/lower_fused_reduction.cpp
rename to third_party/nvfuser/csrc/lower_fused_reduction.cpp
index 744feab598b3..87db1d5ca625 100644
--- a/torch/csrc/jit/codegen/cuda/lower_fused_reduction.cpp
+++ b/third_party/nvfuser/csrc/lower_fused_reduction.cpp
@@ -1,9 +1,9 @@
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
 
-#include <torch/csrc/jit/codegen/cuda/lower_fused_reduction.h>
+#include <lower_fused_reduction.h>
 
 #include <algorithm>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_fused_reduction.h b/third_party/nvfuser/csrc/lower_fused_reduction.h
similarity index 93%
rename from torch/csrc/jit/codegen/cuda/lower_fused_reduction.h
rename to third_party/nvfuser/csrc/lower_fused_reduction.h
index 4307a30bc512..332f49d253a1 100644
--- a/torch/csrc/jit/codegen/cuda/lower_fused_reduction.h
+++ b/third_party/nvfuser/csrc/lower_fused_reduction.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <ir_all_nodes.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp b/third_party/nvfuser/csrc/lower_fusion_simplifier.cpp
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp
rename to third_party/nvfuser/csrc/lower_fusion_simplifier.cpp
index a82ef0ae52f6..34849ffe39b5 100644
--- a/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp
+++ b/third_party/nvfuser/csrc/lower_fusion_simplifier.cpp
@@ -1,8 +1,8 @@
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
+#include <ir_builder.h>
+#include <kernel_ir_dispatch.h>
+#include <lower_utils.h>
 
-#include <torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h>
+#include <lower_fusion_simplifier.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h b/third_party/nvfuser/csrc/lower_fusion_simplifier.h
similarity index 68%
rename from torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h
rename to third_party/nvfuser/csrc/lower_fusion_simplifier.h
index e18f4a8f0778..03019ea63865 100644
--- a/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h
+++ b/third_party/nvfuser/csrc/lower_fusion_simplifier.h
@@ -2,10 +2,10 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h>
+#include <dispatch.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <lower_trivial_reductions.h>
 
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_index.cpp b/third_party/nvfuser/csrc/lower_index.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_index.cpp
rename to third_party/nvfuser/csrc/lower_index.cpp
index e83a0e9fce99..3a480f7813b3 100644
--- a/torch/csrc/jit/codegen/cuda/lower_index.cpp
+++ b/third_party/nvfuser/csrc/lower_index.cpp
@@ -1,12 +1,12 @@
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/index_compute.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/predicate_compute.h>
-
-#include <torch/csrc/jit/codegen/cuda/lower_index.h>
+#include <arith.h>
+#include <index_compute.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <lower2device.h>
+#include <lower_utils.h>
+#include <predicate_compute.h>
+
+#include <lower_index.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_index.h b/third_party/nvfuser/csrc/lower_index.h
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/lower_index.h
rename to third_party/nvfuser/csrc/lower_index.h
index 6c08eeb195ea..2990bc5883c8 100644
--- a/torch/csrc/jit/codegen/cuda/lower_index.h
+++ b/third_party/nvfuser/csrc/lower_index.h
@@ -2,10 +2,10 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <instrumentation.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <root_domain_map.h>
 
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp b/third_party/nvfuser/csrc/lower_index_compute.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/lower_index_compute.cpp
rename to third_party/nvfuser/csrc/lower_index_compute.cpp
index 140fecc0f8af..b687cfe10279 100644
--- a/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp
+++ b/third_party/nvfuser/csrc/lower_index_compute.cpp
@@ -1,12 +1,12 @@
-#include <torch/csrc/jit/codegen/cuda/contiguity.h>
-#include <torch/csrc/jit/codegen/cuda/index_compute.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_index_compute.h>
-#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower_validation.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
+#include <contiguity.h>
+#include <index_compute.h>
+#include <ir_utils.h>
+#include <lower2device.h>
+#include <lower_index_compute.h>
+#include <lower_magic_zero.h>
+#include <lower_utils.h>
+#include <lower_validation.h>
+#include <transform_iter.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_index_compute.h b/third_party/nvfuser/csrc/lower_index_compute.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_index_compute.h
rename to third_party/nvfuser/csrc/lower_index_compute.h
index 4b81fd0dec0c..fc5c91ddcc97 100644
--- a/torch/csrc/jit/codegen/cuda/lower_index_compute.h
+++ b/third_party/nvfuser/csrc/lower_index_compute.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/index_compute.h>
+#include <fusion.h>
+#include <index_compute.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp b/third_party/nvfuser/csrc/lower_index_hoist.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp
rename to third_party/nvfuser/csrc/lower_index_hoist.cpp
index b6af97378e79..77dbac598783 100644
--- a/torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp
+++ b/third_party/nvfuser/csrc/lower_index_hoist.cpp
@@ -1,9 +1,9 @@
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
+#include <iter_visitor.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_magic_zero.h>
 
-#include <torch/csrc/jit/codegen/cuda/lower_index_hoist.h>
+#include <lower_index_hoist.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_index_hoist.h b/third_party/nvfuser/csrc/lower_index_hoist.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_index_hoist.h
rename to third_party/nvfuser/csrc/lower_index_hoist.h
index b3bf36248f8b..a22d2ce68ab3 100644
--- a/torch/csrc/jit/codegen/cuda/lower_index_hoist.h
+++ b/third_party/nvfuser/csrc/lower_index_hoist.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <ir_all_nodes.h>
 
 #include <functional>
 #include <unordered_map>
@@ -27,7 +27,7 @@ namespace cuda {
 //! Class to represent unique indexed domains for index
 //! hoisting. Uniquenesss is determined with the indexed domain
 //! itself, the for-loops and their index values.
-class CommonIndexKey {
+class TORCH_CUDA_CU_API CommonIndexKey {
   friend struct CommonIndexKeyHash;
 
  public:
diff --git a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp b/third_party/nvfuser/csrc/lower_insert_syncs.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp
rename to third_party/nvfuser/csrc/lower_insert_syncs.cpp
index 86ca9d8427e7..709ec1afef11 100644
--- a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp
+++ b/third_party/nvfuser/csrc/lower_insert_syncs.cpp
@@ -1,11 +1,11 @@
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_insert_syncs.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
+#include <dispatch.h>
+#include <instrumentation.h>
+#include <ir_utils.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_insert_syncs.h>
+#include <lower_utils.h>
 
 #include <unordered_set>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h b/third_party/nvfuser/csrc/lower_insert_syncs.h
similarity index 88%
rename from torch/csrc/jit/codegen/cuda/lower_insert_syncs.h
rename to third_party/nvfuser/csrc/lower_insert_syncs.h
index 756462f0bd7c..ab35a3a68530 100644
--- a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h
+++ b/third_party/nvfuser/csrc/lower_insert_syncs.h
@@ -2,8 +2,8 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir.h>
 
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_instrument.cpp b/third_party/nvfuser/csrc/lower_instrument.cpp
similarity index 90%
rename from torch/csrc/jit/codegen/cuda/lower_instrument.cpp
rename to third_party/nvfuser/csrc/lower_instrument.cpp
index cb7402bb752a..ba81be622255 100644
--- a/torch/csrc/jit/codegen/cuda/lower_instrument.cpp
+++ b/third_party/nvfuser/csrc/lower_instrument.cpp
@@ -1,9 +1,9 @@
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
+#include <iter_visitor.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_magic_zero.h>
 
-#include <torch/csrc/jit/codegen/cuda/lower_instrument.h>
+#include <lower_instrument.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_instrument.h b/third_party/nvfuser/csrc/lower_instrument.h
similarity index 93%
rename from torch/csrc/jit/codegen/cuda/lower_instrument.h
rename to third_party/nvfuser/csrc/lower_instrument.h
index 6ad39737b440..6caa0a952f4c 100644
--- a/torch/csrc/jit/codegen/cuda/lower_instrument.h
+++ b/third_party/nvfuser/csrc/lower_instrument.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <ir_all_nodes.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.cpp b/third_party/nvfuser/csrc/lower_loops.cpp
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/lower_loops.cpp
rename to third_party/nvfuser/csrc/lower_loops.cpp
index 0653296366cc..44d4b048b5ee 100644
--- a/torch/csrc/jit/codegen/cuda/lower_loops.cpp
+++ b/third_party/nvfuser/csrc/lower_loops.cpp
@@ -1,13 +1,13 @@
-#include <torch/csrc/jit/codegen/cuda/lower_loops.h>
-
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <lower_loops.h>
+
+#include <arith.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <kernel_expr_evaluator.h>
+#include <lower2device.h>
+#include <lower_utils.h>
+#include <transform_replay.h>
 
 #include <algorithm>
 #include <deque>
diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.h b/third_party/nvfuser/csrc/lower_loops.h
similarity index 85%
rename from torch/csrc/jit/codegen/cuda/lower_loops.h
rename to third_party/nvfuser/csrc/lower_loops.h
index 9b480d7eb6f8..ed806aa5d539 100644
--- a/torch/csrc/jit/codegen/cuda/lower_loops.h
+++ b/third_party/nvfuser/csrc/lower_loops.h
@@ -3,11 +3,11 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/compute_at_map.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/lower_thread_predicate.h>
+#include <compute_at_map.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir.h>
+#include <lower_thread_predicate.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp b/third_party/nvfuser/csrc/lower_magic_zero.cpp
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp
rename to third_party/nvfuser/csrc/lower_magic_zero.cpp
index 717d43d4c5ca..c28f50f2b59a 100644
--- a/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp
+++ b/third_party/nvfuser/csrc/lower_magic_zero.cpp
@@ -1,11 +1,11 @@
-#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
-
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_index_compute.h>
+#include <lower_magic_zero.h>
+
+#include <dispatch.h>
+#include <instrumentation.h>
+#include <ir_utils.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_index_compute.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_magic_zero.h b/third_party/nvfuser/csrc/lower_magic_zero.h
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/lower_magic_zero.h
rename to third_party/nvfuser/csrc/lower_magic_zero.h
index 8ee4d49fc0b4..556030f995e1 100644
--- a/torch/csrc/jit/codegen/cuda/lower_magic_zero.h
+++ b/third_party/nvfuser/csrc/lower_magic_zero.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir.h>
 
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp b/third_party/nvfuser/csrc/lower_misaligned_vectorization.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp
rename to third_party/nvfuser/csrc/lower_misaligned_vectorization.cpp
index 9e713f4cf3a2..f69f4420e250 100644
--- a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp
+++ b/third_party/nvfuser/csrc/lower_misaligned_vectorization.cpp
@@ -1,14 +1,14 @@
-#include <torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h>
-
-#include <torch/csrc/jit/codegen/cuda/index_compute.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/predicate_compute.h>
+#include <lower_misaligned_vectorization.h>
+
+#include <index_compute.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_utils.h>
+#include <predicate_compute.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h b/third_party/nvfuser/csrc/lower_misaligned_vectorization.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h
rename to third_party/nvfuser/csrc/lower_misaligned_vectorization.h
index bd7ae19d93a8..5c07fe154578 100644
--- a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h
+++ b/third_party/nvfuser/csrc/lower_misaligned_vectorization.h
@@ -1,7 +1,7 @@
 #pragma once
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <ir_all_nodes.h>
 
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate.cpp b/third_party/nvfuser/csrc/lower_predicate.cpp
similarity index 91%
rename from torch/csrc/jit/codegen/cuda/lower_predicate.cpp
rename to third_party/nvfuser/csrc/lower_predicate.cpp
index 7b0393d49157..1cb4a3e17003 100644
--- a/torch/csrc/jit/codegen/cuda/lower_predicate.cpp
+++ b/third_party/nvfuser/csrc/lower_predicate.cpp
@@ -1,17 +1,17 @@
-#include <torch/csrc/jit/codegen/cuda/lower_predicate.h>
-
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/index_compute.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/predicate_compute.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <lower_predicate.h>
+
+#include <arith.h>
+#include <index_compute.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_utils.h>
+#include <predicate_compute.h>
+#include <transform_iter.h>
+#include <transform_replay.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate.h b/third_party/nvfuser/csrc/lower_predicate.h
similarity index 77%
rename from torch/csrc/jit/codegen/cuda/lower_predicate.h
rename to third_party/nvfuser/csrc/lower_predicate.h
index 7f4926dad917..cc94d9ae67b2 100644
--- a/torch/csrc/jit/codegen/cuda/lower_predicate.h
+++ b/third_party/nvfuser/csrc/lower_predicate.h
@@ -1,8 +1,8 @@
 #pragma once
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir.h>
 
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp b/third_party/nvfuser/csrc/lower_predicate_elimination.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp
rename to third_party/nvfuser/csrc/lower_predicate_elimination.cpp
index 294a2327bbba..5fc271c6ecf8 100644
--- a/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp
+++ b/third_party/nvfuser/csrc/lower_predicate_elimination.cpp
@@ -1,16 +1,16 @@
-#include <torch/csrc/jit/codegen/cuda/lower_predicate_elimination.h>
-
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_shift.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/predicate_compute.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <lower_predicate_elimination.h>
+
+#include <arith.h>
+#include <expr_evaluator.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <lower2device.h>
+#include <lower_shift.h>
+#include <lower_utils.h>
+#include <predicate_compute.h>
+#include <transform_iter.h>
+#include <transform_replay.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.h b/third_party/nvfuser/csrc/lower_predicate_elimination.h
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/lower_predicate_elimination.h
rename to third_party/nvfuser/csrc/lower_predicate_elimination.h
index 557796ce9d4d..2eb094d7c34c 100644
--- a/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.h
+++ b/third_party/nvfuser/csrc/lower_predicate_elimination.h
@@ -1,8 +1,8 @@
 #pragma once
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir.h>
 
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_replace_size.cpp b/third_party/nvfuser/csrc/lower_replace_size.cpp
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/lower_replace_size.cpp
rename to third_party/nvfuser/csrc/lower_replace_size.cpp
index 02b2e9a70edc..a94de103ba92 100644
--- a/torch/csrc/jit/codegen/cuda/lower_replace_size.cpp
+++ b/third_party/nvfuser/csrc/lower_replace_size.cpp
@@ -1,11 +1,11 @@
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-
-#include <torch/csrc/jit/codegen/cuda/lower_replace_size.h>
+#include <instrumentation.h>
+#include <ir_builder.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <lower_utils.h>
+#include <root_domain_map.h>
+
+#include <lower_replace_size.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_replace_size.h b/third_party/nvfuser/csrc/lower_replace_size.h
similarity index 81%
rename from torch/csrc/jit/codegen/cuda/lower_replace_size.h
rename to third_party/nvfuser/csrc/lower_replace_size.h
index 81cee9f6ffe0..91e60f8b2f7b 100644
--- a/torch/csrc/jit/codegen/cuda/lower_replace_size.h
+++ b/third_party/nvfuser/csrc/lower_replace_size.h
@@ -2,9 +2,9 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <dispatch.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_shift.cpp b/third_party/nvfuser/csrc/lower_shift.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/lower_shift.cpp
rename to third_party/nvfuser/csrc/lower_shift.cpp
index 2a7c04243f4c..e3d10620cff5 100644
--- a/torch/csrc/jit/codegen/cuda/lower_shift.cpp
+++ b/third_party/nvfuser/csrc/lower_shift.cpp
@@ -1,14 +1,14 @@
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/index_compute.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_index_compute.h>
-#include <torch/csrc/jit/codegen/cuda/lower_shift.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
+#include <arith.h>
+#include <index_compute.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir.h>
+#include <lower2device.h>
+#include <lower_index_compute.h>
+#include <lower_shift.h>
+#include <lower_utils.h>
 
 #include <functional>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_shift.h b/third_party/nvfuser/csrc/lower_shift.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_shift.h
rename to third_party/nvfuser/csrc/lower_shift.h
index f12410703d99..ba03907c6315 100644
--- a/torch/csrc/jit/codegen/cuda/lower_shift.h
+++ b/third_party/nvfuser/csrc/lower_shift.h
@@ -2,9 +2,9 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <dispatch.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir.h>
 
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_sync_information.cpp b/third_party/nvfuser/csrc/lower_sync_information.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_sync_information.cpp
rename to third_party/nvfuser/csrc/lower_sync_information.cpp
index 9b8ccd4a77ae..6d015e9212e7 100644
--- a/torch/csrc/jit/codegen/cuda/lower_sync_information.cpp
+++ b/third_party/nvfuser/csrc/lower_sync_information.cpp
@@ -1,9 +1,9 @@
 
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <instrumentation.h>
+#include <ir_utils.h>
+#include <lower2device.h>
 
-#include <torch/csrc/jit/codegen/cuda/lower_sync_information.h>
+#include <lower_sync_information.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_sync_information.h b/third_party/nvfuser/csrc/lower_sync_information.h
similarity index 90%
rename from torch/csrc/jit/codegen/cuda/lower_sync_information.h
rename to third_party/nvfuser/csrc/lower_sync_information.h
index 09fcf9eabd7f..42199828675b 100644
--- a/torch/csrc/jit/codegen/cuda/lower_sync_information.h
+++ b/third_party/nvfuser/csrc/lower_sync_information.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h>
+#include <ir_all_nodes.h>
+#include <parallel_type_bitmap.h>
 
 #include <unordered_map>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp b/third_party/nvfuser/csrc/lower_thread_predicate.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp
rename to third_party/nvfuser/csrc/lower_thread_predicate.cpp
index dc10224a165c..9e691589edca 100644
--- a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp
+++ b/third_party/nvfuser/csrc/lower_thread_predicate.cpp
@@ -1,11 +1,11 @@
-#include <torch/csrc/jit/codegen/cuda/lower_thread_predicate.h>
-
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
+#include <lower_thread_predicate.h>
+
+#include <arith.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <lower2device.h>
+#include <lower_utils.h>
 
 #include <c10/util/irange.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h b/third_party/nvfuser/csrc/lower_thread_predicate.h
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/lower_thread_predicate.h
rename to third_party/nvfuser/csrc/lower_thread_predicate.h
index e8a895efb56d..8ca62291ab05 100644
--- a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h
+++ b/third_party/nvfuser/csrc/lower_thread_predicate.h
@@ -3,9 +3,9 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h>
+#include <ir_all_nodes.h>
+#include <lower_utils.h>
+#include <parallel_type_bitmap.h>
 
 #include <unordered_map>
 #include <unordered_set>
diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp b/third_party/nvfuser/csrc/lower_trivial_broadcast.cpp
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp
rename to third_party/nvfuser/csrc/lower_trivial_broadcast.cpp
index 88a84aa3c587..f60564b48c98 100644
--- a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp
+++ b/third_party/nvfuser/csrc/lower_trivial_broadcast.cpp
@@ -1,8 +1,8 @@
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <root_domain_map.h>
 
-#include <torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h>
+#include <lower_trivial_broadcast.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h b/third_party/nvfuser/csrc/lower_trivial_broadcast.h
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h
rename to third_party/nvfuser/csrc/lower_trivial_broadcast.h
index c30fa9951404..5df0c084bbec 100644
--- a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h
+++ b/third_party/nvfuser/csrc/lower_trivial_broadcast.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <ir_all_nodes.h>
+#include <root_domain_map.h>
 
 #include <c10/macros/Export.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp b/third_party/nvfuser/csrc/lower_trivial_reductions.cpp
similarity index 88%
rename from torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp
rename to third_party/nvfuser/csrc/lower_trivial_reductions.cpp
index 4043df60e5c9..e12ff8f31911 100644
--- a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp
+++ b/third_party/nvfuser/csrc/lower_trivial_reductions.cpp
@@ -1,12 +1,12 @@
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <dispatch.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <lower2device.h>
+#include <lower_trivial_reductions.h>
+#include <lower_utils.h>
+#include <root_domain_map.h>
 
 #include <unordered_set>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h b/third_party/nvfuser/csrc/lower_trivial_reductions.h
similarity index 89%
rename from torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h
rename to third_party/nvfuser/csrc/lower_trivial_reductions.h
index caf0bd029d68..2467bc462f98 100644
--- a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h
+++ b/third_party/nvfuser/csrc/lower_trivial_reductions.h
@@ -2,9 +2,9 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <dispatch.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir.h>
 
 #include <unordered_set>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_unroll.cpp b/third_party/nvfuser/csrc/lower_unroll.cpp
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/lower_unroll.cpp
rename to third_party/nvfuser/csrc/lower_unroll.cpp
index 63dbbf83d775..b2eeedfb4510 100644
--- a/torch/csrc/jit/codegen/cuda/lower_unroll.cpp
+++ b/third_party/nvfuser/csrc/lower_unroll.cpp
@@ -1,15 +1,15 @@
-#include <torch/csrc/jit/codegen/cuda/lower_unroll.h>
-
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/index_compute.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/predicate_compute.h>
+#include <lower_unroll.h>
+
+#include <arith.h>
+#include <index_compute.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <kernel_expr_evaluator.h>
+#include <lower2device.h>
+#include <lower_misaligned_vectorization.h>
+#include <lower_utils.h>
+#include <predicate_compute.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_unroll.h b/third_party/nvfuser/csrc/lower_unroll.h
similarity index 90%
rename from torch/csrc/jit/codegen/cuda/lower_unroll.h
rename to third_party/nvfuser/csrc/lower_unroll.h
index 786e45115ba6..dc69d0ee2d60 100644
--- a/torch/csrc/jit/codegen/cuda/lower_unroll.h
+++ b/third_party/nvfuser/csrc/lower_unroll.h
@@ -1,11 +1,11 @@
 #pragma once
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower_thread_predicate.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <lower_thread_predicate.h>
+#include <lower_utils.h>
+#include <root_domain_map.h>
 
 #include <bitset>
 #include <unordered_map>
diff --git a/torch/csrc/jit/codegen/cuda/lower_utils.cpp b/third_party/nvfuser/csrc/lower_utils.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/lower_utils.cpp
rename to third_party/nvfuser/csrc/lower_utils.cpp
index 3e92269f278a..a239c6a3b109 100644
--- a/torch/csrc/jit/codegen/cuda/lower_utils.cpp
+++ b/third_party/nvfuser/csrc/lower_utils.cpp
@@ -1,15 +1,15 @@
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
+#include <lower_utils.h>
 
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/util/irange.h>
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_thread_predicate.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <arith.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_thread_predicate.h>
+#include <root_domain_map.h>
 
 #include <algorithm>
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_utils.h b/third_party/nvfuser/csrc/lower_utils.h
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/lower_utils.h
rename to third_party/nvfuser/csrc/lower_utils.h
index 4807c1e5520e..c7e925246c21 100644
--- a/torch/csrc/jit/codegen/cuda/lower_utils.h
+++ b/third_party/nvfuser/csrc/lower_utils.h
@@ -3,10 +3,10 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/compute_at_map.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h>
+#include <compute_at_map.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir.h>
+#include <parallel_type_bitmap.h>
 
 #include <bitset>
 #include <map>
diff --git a/torch/csrc/jit/codegen/cuda/lower_validation.cpp b/third_party/nvfuser/csrc/lower_validation.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_validation.cpp
rename to third_party/nvfuser/csrc/lower_validation.cpp
index f6f71c2ec123..259e5111dacf 100644
--- a/torch/csrc/jit/codegen/cuda/lower_validation.cpp
+++ b/third_party/nvfuser/csrc/lower_validation.cpp
@@ -1,16 +1,16 @@
-#include <torch/csrc/jit/codegen/cuda/lower_validation.h>
-
-#include <torch/csrc/jit/codegen/cuda/contiguity.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <lower_validation.h>
+
+#include <contiguity.h>
+#include <expr_evaluator.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <lower2device.h>
+#include <lower_utils.h>
+#include <transform_iter.h>
+#include <transform_replay.h>
+#include <type.h>
 
 #include <ATen/cuda/CUDAContext.h>
 #include <limits>
diff --git a/torch/csrc/jit/codegen/cuda/lower_validation.h b/third_party/nvfuser/csrc/lower_validation.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/lower_validation.h
rename to third_party/nvfuser/csrc/lower_validation.h
index 47305ac25ef4..69ed4cced8b5 100644
--- a/torch/csrc/jit/codegen/cuda/lower_validation.h
+++ b/third_party/nvfuser/csrc/lower_validation.h
@@ -2,7 +2,7 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <ir_all_nodes.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp b/third_party/nvfuser/csrc/lower_warp_reduce.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp
rename to third_party/nvfuser/csrc/lower_warp_reduce.cpp
index ff603c1d18f6..960b84aa0dcb 100644
--- a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp
+++ b/third_party/nvfuser/csrc/lower_warp_reduce.cpp
@@ -1,10 +1,10 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower_warp_reduce.h>
+#include <expr_evaluator.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_utils.h>
+#include <lower_warp_reduce.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h b/third_party/nvfuser/csrc/lower_warp_reduce.h
similarity index 87%
rename from torch/csrc/jit/codegen/cuda/lower_warp_reduce.h
rename to third_party/nvfuser/csrc/lower_warp_reduce.h
index 7480809c7dce..52d017943b15 100644
--- a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h
+++ b/third_party/nvfuser/csrc/lower_warp_reduce.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <kernel_ir.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/manager.cpp b/third_party/nvfuser/csrc/manager.cpp
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/manager.cpp
rename to third_party/nvfuser/csrc/manager.cpp
index 4eb61c78b749..d9186ab2254f 100644
--- a/torch/csrc/jit/codegen/cuda/manager.cpp
+++ b/third_party/nvfuser/csrc/manager.cpp
@@ -1,13 +1,13 @@
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/manager.h>
-#include <torch/csrc/jit/codegen/cuda/parser.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/type_inference.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <executor.h>
+#include <fusion.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <kernel_cache.h>
+#include <manager.h>
+#include <parser.h>
+#include <scheduler/all_schedulers.h>
+#include <type_inference.h>
+#include <utils.h>
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/canonicalize.h>
 #include <torch/csrc/jit/passes/cuda_graph_fuser.h>
@@ -285,7 +285,7 @@ void runCudaFusionGroup(const Node* fusion_node, Stack& stack) {
     // make a copy of the stack
     int64_t inputs_size =
         static_cast<int64_t>(fusion_node->g(attr::Subgraph)->inputs().size());
-    TORCH_INTERNAL_ASSERT(stack.size() >= inputs_size);
+    TORCH_INTERNAL_ASSERT(int64_t(stack.size()) >= inputs_size);
     stack_copy = Stack();
     stack_copy->insert(
         stack_copy->end(), stack.begin(), stack.end() - inputs_size);
@@ -350,7 +350,7 @@ void runCudaFusionGroup(const Node* fusion_node, Stack& stack) {
     int64_t output_count =
         static_cast<int64_t>(fusion_node->g(attr::Subgraph)->outputs().size());
     TORCH_CHECK(
-        output_count <= stack.size(),
+        output_count <= int64_t(stack.size()),
         "Expected ",
         output_count,
         " outputs but found only ",
diff --git a/torch/csrc/jit/codegen/cuda/manager.h b/third_party/nvfuser/csrc/manager.h
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/manager.h
rename to third_party/nvfuser/csrc/manager.h
diff --git a/torch/csrc/jit/codegen/cuda/maxinfo_propagator.cpp b/third_party/nvfuser/csrc/maxinfo_propagator.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/maxinfo_propagator.cpp
rename to third_party/nvfuser/csrc/maxinfo_propagator.cpp
index 6df8d3f95dd7..20c83084dcca 100644
--- a/torch/csrc/jit/codegen/cuda/maxinfo_propagator.cpp
+++ b/third_party/nvfuser/csrc/maxinfo_propagator.cpp
@@ -1,5 +1,5 @@
-#include <torch/csrc/jit/codegen/cuda/maxinfo_propagator.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <maxinfo_propagator.h>
+#include <root_domain_map.h>
 
 namespace torch {
 namespace jit {
@@ -373,7 +373,7 @@ MaxRootDomainInfoSpanningTree::getReferenceRootIDInfo(
     leaf_pos += int64_t(tv->nDims()) + 1;
   }
   TORCH_CHECK(
-      leaf_pos >= 0 && leaf_pos <= tv->nDims(),
+      leaf_pos >= 0 && leaf_pos <= int64_t(tv->nDims()),
       "MaxRootDomainInfoSpanningTree called on an leaf_pos outside valid range.");
   RootDomainInfo result;
   const auto& root_domain = tv->getMaybeRFactorDomain();
diff --git a/torch/csrc/jit/codegen/cuda/maxinfo_propagator.h b/third_party/nvfuser/csrc/maxinfo_propagator.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/maxinfo_propagator.h
rename to third_party/nvfuser/csrc/maxinfo_propagator.h
index 620096fe7d88..83228477ef05 100644
--- a/torch/csrc/jit/codegen/cuda/maxinfo_propagator.h
+++ b/third_party/nvfuser/csrc/maxinfo_propagator.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
+#include <ir_interface_nodes.h>
+#include <ir_utils.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/mma_type.cpp b/third_party/nvfuser/csrc/mma_type.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/mma_type.cpp
rename to third_party/nvfuser/csrc/mma_type.cpp
index 8588d6845554..2c0be9a9b313 100644
--- a/torch/csrc/jit/codegen/cuda/mma_type.cpp
+++ b/third_party/nvfuser/csrc/mma_type.cpp
@@ -1,6 +1,6 @@
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/mma_type.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <mma_type.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/mma_type.h b/third_party/nvfuser/csrc/mma_type.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/mma_type.h
rename to third_party/nvfuser/csrc/mma_type.h
index 7874573a3d01..29faefdf1920 100644
--- a/torch/csrc/jit/codegen/cuda/mma_type.h
+++ b/third_party/nvfuser/csrc/mma_type.h
@@ -1,6 +1,6 @@
 #pragma once
 #include <c10/macros/Export.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
+#include <fusion.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/mutator.cpp b/third_party/nvfuser/csrc/mutator.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/mutator.cpp
rename to third_party/nvfuser/csrc/mutator.cpp
index 12a3de15f4a7..5338573fb0f7 100644
--- a/torch/csrc/jit/codegen/cuda/mutator.cpp
+++ b/third_party/nvfuser/csrc/mutator.cpp
@@ -1,8 +1,8 @@
 #include <c10/util/irange.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/mutator.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <mutator.h>
 
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/mutator.h b/third_party/nvfuser/csrc/mutator.h
similarity index 88%
rename from torch/csrc/jit/codegen/cuda/mutator.h
rename to third_party/nvfuser/csrc/mutator.h
index 433de485cf19..f2a983b2fdd2 100644
--- a/torch/csrc/jit/codegen/cuda/mutator.h
+++ b/third_party/nvfuser/csrc/mutator.h
@@ -2,8 +2,8 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
+#include <dispatch.h>
+#include <ir_base_nodes.h>
 
 #include <unordered_map>
 
diff --git a/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp b/third_party/nvfuser/csrc/non_divisible_split.cpp
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/non_divisible_split.cpp
rename to third_party/nvfuser/csrc/non_divisible_split.cpp
index eaff9274892d..339d0874e6a5 100644
--- a/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp
+++ b/third_party/nvfuser/csrc/non_divisible_split.cpp
@@ -1,9 +1,9 @@
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/non_divisible_split.h>
+#include <expr_evaluator.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <lower2device.h>
+#include <lower_utils.h>
+#include <non_divisible_split.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/non_divisible_split.h b/third_party/nvfuser/csrc/non_divisible_split.h
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/non_divisible_split.h
rename to third_party/nvfuser/csrc/non_divisible_split.h
index 6706c9f072d3..4a02e16a6ded 100644
--- a/torch/csrc/jit/codegen/cuda/non_divisible_split.h
+++ b/third_party/nvfuser/csrc/non_divisible_split.h
@@ -2,8 +2,8 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
+#include <ir_all_nodes.h>
+#include <iter_visitor.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/ops/alias.cpp b/third_party/nvfuser/csrc/ops/alias.cpp
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/ops/alias.cpp
rename to third_party/nvfuser/csrc/ops/alias.cpp
index 20c6ee533063..8f0793781ce0 100644
--- a/torch/csrc/jit/codegen/cuda/ops/alias.cpp
+++ b/third_party/nvfuser/csrc/ops/alias.cpp
@@ -1,9 +1,9 @@
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/ops/alias.h>
-#include <torch/csrc/jit/codegen/cuda/transform_view.h>
-#include <torch/csrc/jit/codegen/cuda/type_promotion.h>
+#include <arith.h>
+#include <ir_builder.h>
+#include <ir_utils.h>
+#include <ops/alias.h>
+#include <transform_view.h>
+#include <type_promotion.h>
 
 namespace torch {
 namespace jit {
@@ -120,11 +120,13 @@ TensorView* flatten(TensorView* x, int64_t start_dim, int64_t end_dim) {
     end_dim += inp_domain.size();
   }
   TORCH_CHECK(
-      start_dim >= 0 && start_dim < inp_domain.size(),
+      start_dim >= 0 && start_dim < int64_t(inp_domain.size()),
       "Invalid start_dim ",
       start_dim);
   TORCH_CHECK(
-      end_dim >= 0 && end_dim < inp_domain.size(), "Invalid end_dim ", end_dim);
+      end_dim >= 0 && end_dim < int64_t(inp_domain.size()),
+      "Invalid end_dim ",
+      end_dim);
   TORCH_CHECK(start_dim <= end_dim, "start_dim must be <= end_dim");
 
   if (start_dim == end_dim) {
@@ -145,7 +147,7 @@ TensorView* squeeze(TensorView* x, const std::vector<int64_t>& sizes) {
   const auto ndims = static_cast<int>(x->domain()->noReductions().size());
 
   TORCH_INTERNAL_ASSERT(
-      ndims == sizes.size(),
+      ndims == int(sizes.size()),
       "Invalid sizes for squeeze: ",
       sizes,
       ". Input tensor: ",
@@ -169,7 +171,7 @@ TensorView* squeeze(TensorView* x, const std::vector<int64_t>& sizes, int dim) {
   const auto ndims = static_cast<int>(x->domain()->noReductions().size());
 
   TORCH_INTERNAL_ASSERT(
-      ndims == sizes.size(),
+      ndims == int(sizes.size()),
       "Invalid sizes for squeeze: ",
       sizes,
       ". Input tensor: ",
diff --git a/torch/csrc/jit/codegen/cuda/ops/alias.h b/third_party/nvfuser/csrc/ops/alias.h
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/ops/alias.h
rename to third_party/nvfuser/csrc/ops/alias.h
index f363f01bb409..c9821ba9d107 100644
--- a/torch/csrc/jit/codegen/cuda/ops/alias.h
+++ b/third_party/nvfuser/csrc/ops/alias.h
@@ -2,8 +2,8 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <ir_interface_nodes.h>
+#include <type.h>
 
 //
 // The operations defined in this header is intended as user facing functions.
diff --git a/third_party/nvfuser/csrc/ops/all_ops.h b/third_party/nvfuser/csrc/ops/all_ops.h
new file mode 100644
index 000000000000..21f8437702ed
--- /dev/null
+++ b/third_party/nvfuser/csrc/ops/all_ops.h
@@ -0,0 +1,5 @@
+#pragma once
+#include <arith.h>
+#include <ops/alias.h>
+#include <ops/composite.h>
+#include <ops/normalization.h>
diff --git a/torch/csrc/jit/codegen/cuda/ops/composite.cpp b/third_party/nvfuser/csrc/ops/composite.cpp
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/ops/composite.cpp
rename to third_party/nvfuser/csrc/ops/composite.cpp
index a7905c4894c1..50cf9f89c762 100644
--- a/torch/csrc/jit/codegen/cuda/ops/composite.cpp
+++ b/third_party/nvfuser/csrc/ops/composite.cpp
@@ -1,7 +1,7 @@
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ops/composite.h>
-#include <torch/csrc/jit/codegen/cuda/transform_view.h>
+#include <arith.h>
+#include <ir_builder.h>
+#include <ops/composite.h>
+#include <transform_view.h>
 
 namespace torch {
 namespace jit {
@@ -75,7 +75,7 @@ LstmResult lstm(
 
 namespace {
 template <typename T>
-TORCH_CUDA_CU_API T* sign(T* x) {
+T* sign(T* x) {
   TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");
   auto zero = IrBuilder::create<Double>(x->container(), 0.);
   auto one = IrBuilder::create<Double>(x->container(), 1.);
@@ -85,11 +85,11 @@ TORCH_CUDA_CU_API T* sign(T* x) {
 }
 } // namespace
 
-TORCH_CUDA_CU_API TensorView* sign(TensorView* x) {
+TensorView* sign(TensorView* x) {
   return sign<TensorView>(x);
 }
 
-TORCH_CUDA_CU_API Val* sign(Val* x) {
+Val* sign(Val* x) {
   return sign<Val>(x);
 }
 
diff --git a/torch/csrc/jit/codegen/cuda/ops/composite.h b/third_party/nvfuser/csrc/ops/composite.h
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/ops/composite.h
rename to third_party/nvfuser/csrc/ops/composite.h
index 23aee5b20c47..c1c9251301c1 100644
--- a/torch/csrc/jit/codegen/cuda/ops/composite.h
+++ b/third_party/nvfuser/csrc/ops/composite.h
@@ -2,8 +2,8 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <ir_interface_nodes.h>
+#include <type.h>
 
 //
 // The operations defined in this header is intended as user facing functions.
diff --git a/torch/csrc/jit/codegen/cuda/ops/normalization.cpp b/third_party/nvfuser/csrc/ops/normalization.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/ops/normalization.cpp
rename to third_party/nvfuser/csrc/ops/normalization.cpp
index f1739c665f03..0194100f5e0a 100644
--- a/torch/csrc/jit/codegen/cuda/ops/normalization.cpp
+++ b/third_party/nvfuser/csrc/ops/normalization.cpp
@@ -1,6 +1,6 @@
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ops/normalization.h>
+#include <arith.h>
+#include <ir_builder.h>
+#include <ops/normalization.h>
 
 namespace torch {
 namespace jit {
@@ -37,14 +37,14 @@ TensorView* variance(
     bool unbiased,
     bool keepdim) {
   TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");
-  int64_t correction = unbiased ? 1 : 0;
+  double correction = unbiased ? 1 : 0;
   return variance(x, dims, correction, keepdim);
 }
 
 TensorView* variance(
     TensorView* x,
     const std::vector<int>& dims,
-    int64_t correction,
+    double correction,
     bool keepdim) {
   TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");
 
@@ -62,17 +62,17 @@ TensorView* variance(
   auto num_features = numFeatures(x, dims, kNumberOfDims);
   if (correction > 0) {
     num_features =
-        sub(num_features, IrBuilder::create<Int>(x->container(), correction));
+        sub(num_features, IrBuilder::create<Double>(x->container(), correction));
   }
   auto y = div(sum_x_mean_sub_sq, num_features);
 
   return y;
 }
 
-TORCH_CUDA_CU_API VarMeanResult variance_mean(
+VarMeanResult variance_mean(
     TensorView* x,
     const std::vector<int>& dims,
-    int64_t correction,
+    double correction,
     bool keepdim) {
   TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");
 
@@ -108,7 +108,7 @@ TORCH_CUDA_CU_API VarMeanResult variance_mean(
   auto num_features = numFeatures(x, dims, kNumberOfDims);
   if (correction > 0) {
     num_features =
-        sub(num_features, IrBuilder::create<Int>(x->container(), correction));
+        sub(num_features, IrBuilder::create<Double>(x->container(), correction));
   }
 
   auto welford_out = Welford(x, dims);
diff --git a/torch/csrc/jit/codegen/cuda/ops/normalization.h b/third_party/nvfuser/csrc/ops/normalization.h
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/ops/normalization.h
rename to third_party/nvfuser/csrc/ops/normalization.h
index d0283525d19a..f3b6a2784738 100644
--- a/torch/csrc/jit/codegen/cuda/ops/normalization.h
+++ b/third_party/nvfuser/csrc/ops/normalization.h
@@ -2,8 +2,8 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <ir_interface_nodes.h>
+#include <type.h>
 
 //
 // The operations defined in this header is intended as user facing functions.
@@ -57,13 +57,13 @@ TORCH_CUDA_CU_API TensorView* variance(
 TORCH_CUDA_CU_API TensorView* variance(
     TensorView* x,
     const std::vector<int>& dims,
-    int64_t correction,
+    double correction,
     bool keepdim);
 
 TORCH_CUDA_CU_API VarMeanResult variance_mean(
     TensorView* x,
     const std::vector<int>& dims,
-    int64_t correction,
+    double correction,
     bool keepdim);
 
 TORCH_CUDA_CU_API TensorView* standard_deviation(
diff --git a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp b/third_party/nvfuser/csrc/parallel_dimension_map.cpp
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp
rename to third_party/nvfuser/csrc/parallel_dimension_map.cpp
index c562b206652d..79299c6e9371 100644
--- a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp
+++ b/third_party/nvfuser/csrc/parallel_dimension_map.cpp
@@ -1,11 +1,11 @@
-#include <torch/csrc/jit/codegen/cuda/parallel_dimension_map.h>
+#include <parallel_dimension_map.h>
 
 #include <ATen/cuda/CUDAContext.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <expr_evaluator.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <kernel_expr_evaluator.h>
+#include <lower2device.h>
 
 #include <sstream>
 
diff --git a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h b/third_party/nvfuser/csrc/parallel_dimension_map.h
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/parallel_dimension_map.h
rename to third_party/nvfuser/csrc/parallel_dimension_map.h
index 03bd513396f9..5ecd319baa43 100644
--- a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h
+++ b/third_party/nvfuser/csrc/parallel_dimension_map.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir.h>
 
 #include <deque>
 #include <unordered_map>
diff --git a/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.cpp b/third_party/nvfuser/csrc/parallel_type_bitmap.cpp
similarity index 90%
rename from torch/csrc/jit/codegen/cuda/parallel_type_bitmap.cpp
rename to third_party/nvfuser/csrc/parallel_type_bitmap.cpp
index 9e3ff2046c0f..9a8a37653217 100644
--- a/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.cpp
+++ b/third_party/nvfuser/csrc/parallel_type_bitmap.cpp
@@ -1,4 +1,4 @@
-#include <torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h>
+#include <parallel_type_bitmap.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h b/third_party/nvfuser/csrc/parallel_type_bitmap.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h
rename to third_party/nvfuser/csrc/parallel_type_bitmap.h
index 642017a3c097..ce058e26ff55 100644
--- a/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h
+++ b/third_party/nvfuser/csrc/parallel_type_bitmap.h
@@ -1,7 +1,7 @@
 #pragma once
 
 #include <c10/macros/Export.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <type.h>
 
 #include <array>
 #include <bitset>
diff --git a/torch/csrc/jit/codegen/cuda/parser.cpp b/third_party/nvfuser/csrc/parser.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/parser.cpp
rename to third_party/nvfuser/csrc/parser.cpp
index e78d5effbee3..3d61c50c66d5 100644
--- a/torch/csrc/jit/codegen/cuda/parser.cpp
+++ b/third_party/nvfuser/csrc/parser.cpp
@@ -1,14 +1,14 @@
-#include <torch/csrc/jit/codegen/cuda/parser.h>
-
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
-#include <torch/csrc/jit/codegen/cuda/type_inference.h>
-#include <torch/csrc/jit/codegen/cuda/type_promotion.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <parser.h>
+
+#include <arith.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <ir_iostream.h>
+#include <ops/all_ops.h>
+#include <type_inference.h>
+#include <type_promotion.h>
+#include <utils.h>
 
 #include <torch/csrc/jit/frontend/function_schema_parser.h>
 #include <torch/csrc/jit/ir/constants.h>
diff --git a/torch/csrc/jit/codegen/cuda/parser.h b/third_party/nvfuser/csrc/parser.h
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/parser.h
rename to third_party/nvfuser/csrc/parser.h
index ddfbf7762742..929d0e5ef3b3 100644
--- a/torch/csrc/jit/codegen/cuda/parser.h
+++ b/third_party/nvfuser/csrc/parser.h
@@ -4,7 +4,7 @@
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/runtime/profiling_record.h>
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
+#include <fusion.h>
 
 /*
  * This file handles Parsing PyTorch jit ir;
diff --git a/torch/csrc/jit/codegen/cuda/partial_split_map.cpp b/third_party/nvfuser/csrc/partial_split_map.cpp
similarity index 90%
rename from torch/csrc/jit/codegen/cuda/partial_split_map.cpp
rename to third_party/nvfuser/csrc/partial_split_map.cpp
index dd8fb05a0493..2a0b6b2573f0 100644
--- a/torch/csrc/jit/codegen/cuda/partial_split_map.cpp
+++ b/third_party/nvfuser/csrc/partial_split_map.cpp
@@ -1,6 +1,6 @@
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/partial_split_map.h>
+#include <ir_utils.h>
+#include <lower2device.h>
+#include <partial_split_map.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/partial_split_map.h b/third_party/nvfuser/csrc/partial_split_map.h
similarity index 80%
rename from torch/csrc/jit/codegen/cuda/partial_split_map.h
rename to third_party/nvfuser/csrc/partial_split_map.h
index 8ec489915b79..ae3de67786d8 100644
--- a/torch/csrc/jit/codegen/cuda/partial_split_map.h
+++ b/third_party/nvfuser/csrc/partial_split_map.h
@@ -2,9 +2,9 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <dispatch.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir.h>
 
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/partition.cpp b/third_party/nvfuser/csrc/partition.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/partition.cpp
rename to third_party/nvfuser/csrc/partition.cpp
index e9c809101b07..77dc230ea1a1 100644
--- a/torch/csrc/jit/codegen/cuda/partition.cpp
+++ b/third_party/nvfuser/csrc/partition.cpp
@@ -1,11 +1,11 @@
-#include <torch/csrc/jit/codegen/cuda/partition.h>
+#include <partition.h>
 
 #include <ATen/core/jit_type.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/util/irange.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/parser.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <instrumentation.h>
+#include <parser.h>
+#include <utils.h>
 #include <torch/csrc/jit/jit_log.h>
 
 namespace torch {
diff --git a/torch/csrc/jit/codegen/cuda/partition.h b/third_party/nvfuser/csrc/partition.h
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/partition.h
rename to third_party/nvfuser/csrc/partition.h
diff --git a/torch/csrc/jit/codegen/cuda/predicate_compute.cpp b/third_party/nvfuser/csrc/predicate_compute.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/predicate_compute.cpp
rename to third_party/nvfuser/csrc/predicate_compute.cpp
index 2941b96fdae1..6a4bf17493ad 100644
--- a/torch/csrc/jit/codegen/cuda/predicate_compute.cpp
+++ b/third_party/nvfuser/csrc/predicate_compute.cpp
@@ -1,13 +1,13 @@
-#include <torch/csrc/jit/codegen/cuda/predicate_compute.h>
-
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/index_compute.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
+#include <predicate_compute.h>
+
+#include <arith.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <index_compute.h>
+#include <instrumentation.h>
+#include <ir_utils.h>
+#include <lower2device.h>
+#include <transform_iter.h>
 
 #include <c10/util/irange.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/predicate_compute.h b/third_party/nvfuser/csrc/predicate_compute.h
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/predicate_compute.h
rename to third_party/nvfuser/csrc/predicate_compute.h
index 6cf3609d3151..b390d299777b 100644
--- a/torch/csrc/jit/codegen/cuda/predicate_compute.h
+++ b/third_party/nvfuser/csrc/predicate_compute.h
@@ -1,10 +1,10 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/index_compute.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/lower_thread_predicate.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <index_compute.h>
+#include <kernel_ir.h>
+#include <lower_thread_predicate.h>
+#include <lower_utils.h>
+#include <root_domain_map.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/README.md b/third_party/nvfuser/csrc/python_frontend/README.md
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/python_frontend/README.md
rename to third_party/nvfuser/csrc/python_frontend/README.md
index d519e69bcda3..c1b65e45dfb8 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/README.md
+++ b/third_party/nvfuser/csrc/python_frontend/README.md
@@ -8,7 +8,7 @@ This frontend allows for a user to describe the set of operations for nvFuser to
 
 ```python
 import torch
-from torch._C._nvfuser import Fusion, FusionDefinition, DataType
+from nvfuser._C import Fusion, FusionDefinition, DataType
 
 fs = Fusion()
 with FusionDefinition(fs) as fd :
@@ -104,7 +104,7 @@ output = fd.ops.foo(arg1, ... )
 ```
 You can see a supported list of operations with the following query:
 ```python
-python -c "from torch._C._nvfuser import FusionDefinition; help(FusionDefinition.Operators)"
+python -c "from nvfuser._C import FusionDefinition; help(FusionDefinition.Operators)"
 ```
 #### Notating Outputs
 
@@ -119,7 +119,7 @@ add_output(output: Scalar)
 # Debug Information
 **Query a list of supported operations:**
 ```python
-python -c "from torch._C._nvfuser import FusionDefinition; help(FusionDefinition.Operators)"
+python -c "from nvfuser._C import FusionDefinition; help(FusionDefinition.Operators)"
 ```
 **View the fusion definitions that are executed by setting an environment variable:**
 ```python
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_cache.cpp b/third_party/nvfuser/csrc/python_frontend/fusion_cache.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/python_frontend/fusion_cache.cpp
rename to third_party/nvfuser/csrc/python_frontend/fusion_cache.cpp
index 0efc4a0f0cfc..f96fe9e14df5 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_cache.cpp
+++ b/third_party/nvfuser/csrc/python_frontend/fusion_cache.cpp
@@ -1,5 +1,5 @@
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_cache.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h>
+#include <python_frontend/fusion_cache.h>
+#include <python_frontend/fusion_record.h>
 #include <mutex>
 
 namespace nvfuser {
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_cache.h b/third_party/nvfuser/csrc/python_frontend/fusion_cache.h
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/python_frontend/fusion_cache.h
rename to third_party/nvfuser/csrc/python_frontend/fusion_cache.h
index 7d18d78f6720..6c0c1e8d214b 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_cache.h
+++ b/third_party/nvfuser/csrc/python_frontend/fusion_cache.h
@@ -1,8 +1,8 @@
 #pragma once
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h>
+#include <kernel_cache.h>
+#include <python_frontend/fusion_record.h>
 
 #include <memory>
 
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_definition.cpp b/third_party/nvfuser/csrc/python_frontend/fusion_definition.cpp
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/python_frontend/fusion_definition.cpp
rename to third_party/nvfuser/csrc/python_frontend/fusion_definition.cpp
index cf467d9ae5ca..33e07cea5608 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_definition.cpp
+++ b/third_party/nvfuser/csrc/python_frontend/fusion_definition.cpp
@@ -1,8 +1,8 @@
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_cache.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_definition.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <instrumentation.h>
+#include <python_frontend/fusion_cache.h>
+#include <python_frontend/fusion_definition.h>
+#include <python_frontend/fusion_interface.h>
+#include <utils.h>
 
 // Require namespace for perf scope instrumentation
 using namespace torch::jit::fuser::cuda::inst;
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_definition.h b/third_party/nvfuser/csrc/python_frontend/fusion_definition.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/python_frontend/fusion_definition.h
rename to third_party/nvfuser/csrc/python_frontend/fusion_definition.h
index 68723813ea2c..c61dc2335d2e 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_definition.h
+++ b/third_party/nvfuser/csrc/python_frontend/fusion_definition.h
@@ -1,7 +1,7 @@
 #pragma once
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
+#include <kernel_cache.h>
 
 //! nvFuser Fusion IR namespace abbreviation
 namespace Nvf = torch::jit::fuser::cuda;
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.cpp b/third_party/nvfuser/csrc/python_frontend/fusion_interface.cpp
similarity index 93%
rename from torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.cpp
rename to third_party/nvfuser/csrc/python_frontend/fusion_interface.cpp
index b9e3b65116af..1618b40b8cf3 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.cpp
+++ b/third_party/nvfuser/csrc/python_frontend/fusion_interface.cpp
@@ -1,5 +1,5 @@
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_cache.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.h>
+#include <python_frontend/fusion_cache.h>
+#include <python_frontend/fusion_interface.h>
 
 namespace nvfuser {
 
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.h b/third_party/nvfuser/csrc/python_frontend/fusion_interface.h
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.h
rename to third_party/nvfuser/csrc/python_frontend/fusion_interface.h
index 60d55f16104f..7bdbaab73698 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.h
+++ b/third_party/nvfuser/csrc/python_frontend/fusion_interface.h
@@ -1,7 +1,7 @@
 #pragma once
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
+#include <kernel_cache.h>
 
 //! nvFuser Fusion IR namespace abbreviation
 namespace Nvf = torch::jit::fuser::cuda;
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h b/third_party/nvfuser/csrc/python_frontend/fusion_record.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
rename to third_party/nvfuser/csrc/python_frontend/fusion_record.h
index 771b374db7d5..daea184a2309 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
+++ b/third_party/nvfuser/csrc/python_frontend/fusion_record.h
@@ -1,10 +1,10 @@
 #pragma once
 #include <c10/util/complex.h>
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/ops/alias.h>
-#include <torch/csrc/jit/codegen/cuda/ops/normalization.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_definition.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <arith.h>
+#include <ops/alias.h>
+#include <ops/normalization.h>
+#include <python_frontend/fusion_definition.h>
+#include <utils.h>
 
 #include <algorithm>
 
@@ -1367,7 +1367,7 @@ struct NormOpRecord : RecordFunctor {
       std::string name,
       RecordType type,
       std::vector<int>& axes,
-      int64_t correction,
+      double correction,
       bool keep_dim)
       : RecordFunctor(std::move(args), std::move(outputs), name, type),
         axes_(axes),
@@ -1441,7 +1441,7 @@ struct NormOpRecord : RecordFunctor {
   //! Dimensions of tensor to reduce for variance calculation
   std::vector<int> axes_;
   //! Bessel's correction value
-  int64_t correction_;
+  double correction_;
   //! Indicates whether to keep the reduced dimension(s).
   bool keep_dim_;
 };
@@ -1451,7 +1451,7 @@ struct VarianceOpRecord : NormOpRecord {
       std::vector<State> args,
       std::vector<State> outputs,
       std::vector<int>& axes,
-      int64_t correction,
+      double correction,
       bool keep_dim)
       : NormOpRecord(
             std::move(args),
@@ -1480,7 +1480,7 @@ struct VarianceMeanOpRecord : NormOpRecord {
       std::vector<State> args,
       std::vector<State> outputs,
       std::vector<int>& axes,
-      int64_t correction,
+      double correction,
       bool keep_dim)
       : NormOpRecord(
             std::move(args),
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp b/third_party/nvfuser/csrc/python_frontend/python_bindings.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
rename to third_party/nvfuser/csrc/python_frontend/python_bindings.cpp
index fc9d105100b9..4ecb52b2fdfe 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
+++ b/third_party/nvfuser/csrc/python_frontend/python_bindings.cpp
@@ -1,19 +1,18 @@
-#include <torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.h>
+#include <python_frontend/python_bindings.h>
 
-#ifdef USE_CUDA
 #include <c10/util/ArrayRef.h>
 #include <c10/util/Optional.h>
 #include <c10/util/irange.h>
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ops/composite.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_cache.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_definition.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.h>
+#include <arith.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <ops/composite.h>
+#include <python_frontend/fusion_cache.h>
+#include <python_frontend/fusion_definition.h>
+#include <python_frontend/fusion_interface.h>
+#include <python_frontend/fusion_record.h>
+#include <python_frontend/python_bindings.h>
 #include <torch/csrc/jit/python/pybind_utils.h>
 #include <iostream>
 #include <tuple>
@@ -22,10 +21,7 @@ namespace torch {
 namespace jit {
 
 void initNvFuserPythonBindings(PyObject* module) {
-  auto m = py::handle(module).cast<py::module>();
-
-  //! Top Level nvFuser Python submodule
-  auto nvfuser = m.def_submodule("_nvfuser");
+  auto nvfuser = py::handle(module).cast<py::module>();
 
   //! DataTypes supported by nvFuser in the FusionDefinition
   py::enum_<Nvf::DataType>(nvfuser, "DataType")
@@ -1273,7 +1269,7 @@ void initNvFuserPythonBindings(PyObject* module) {
       [](nvfuser::FusionDefinition::Operators& self,
          nvfuser::Tensor arg,
          std::vector<int>& axes,
-         int64_t correction,
+         double correction,
          bool keepdim) -> nvfuser::Tensor {
         FUSER_PERF_SCOPE("Operators.var");
         nvfuser::FusionDefinition* fd = self.fusion_definition;
@@ -1296,7 +1292,7 @@ void initNvFuserPythonBindings(PyObject* module) {
       [](nvfuser::FusionDefinition::Operators& self,
          nvfuser::Tensor arg,
          std::vector<int>& axes,
-         int64_t correction,
+         double correction,
          bool keepdim) -> decltype(auto) {
         FUSER_PERF_SCOPE("Operators.var_mean");
         nvfuser::FusionDefinition* fd = self.fusion_definition;
@@ -1415,15 +1411,3 @@ void initNvFuserPythonBindings(PyObject* module) {
 
 } // namespace jit
 } // namespace torch
-
-#else
-
-namespace torch {
-namespace jit {
-
-void initNvFuserPythonBindings(PyObject* module) {}
-
-} // namespace jit
-} // namespace torch
-
-#endif // USE_CUDA
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.h b/third_party/nvfuser/csrc/python_frontend/python_bindings.h
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.h
rename to third_party/nvfuser/csrc/python_frontend/python_bindings.h
diff --git a/third_party/nvfuser/csrc/python_frontend/python_bindings_extension.cpp b/third_party/nvfuser/csrc/python_frontend/python_bindings_extension.cpp
new file mode 100644
index 000000000000..d488d0966b9f
--- /dev/null
+++ b/third_party/nvfuser/csrc/python_frontend/python_bindings_extension.cpp
@@ -0,0 +1,7 @@
+#include <python_frontend/python_bindings.h>
+#include <torch/extension.h>
+
+PYBIND11_MODULE(EXTENSION_NAME, m) {
+  m.doc() = "nvfuser C API python binding"; // optional module docstring
+  torch::jit::initNvFuserPythonBindings(m.ptr());
+}
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_cache.cpp b/third_party/nvfuser/csrc/python_frontend/test/test_nvfuser_fusion_cache.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_cache.cpp
rename to third_party/nvfuser/csrc/python_frontend/test/test_nvfuser_fusion_cache.cpp
index 607c560dab74..1eff6648fff6 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_cache.cpp
+++ b/third_party/nvfuser/csrc/python_frontend/test/test_nvfuser_fusion_cache.cpp
@@ -4,9 +4,9 @@
 
 #include <torch/torch.h>
 
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_cache.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
+#include <python_frontend/fusion_cache.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
 
 // Tests go in torch::jit
 namespace torch {
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_definition.cpp b/third_party/nvfuser/csrc/python_frontend/test/test_nvfuser_fusion_definition.cpp
similarity index 94%
rename from torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_definition.cpp
rename to third_party/nvfuser/csrc/python_frontend/test/test_nvfuser_fusion_definition.cpp
index bae9cf6def81..8686b0488a7a 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_definition.cpp
+++ b/third_party/nvfuser/csrc/python_frontend/test/test_nvfuser_fusion_definition.cpp
@@ -4,11 +4,11 @@
 
 #include <torch/torch.h>
 
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_definition.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
+#include <python_frontend/fusion_definition.h>
+#include <python_frontend/fusion_interface.h>
+#include <python_frontend/fusion_record.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
 
 // Tests go in torch::jit
 namespace torch {
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_record.cpp b/third_party/nvfuser/csrc/python_frontend/test/test_nvfuser_fusion_record.cpp
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_record.cpp
rename to third_party/nvfuser/csrc/python_frontend/test/test_nvfuser_fusion_record.cpp
index 5ae2db7db880..14c1a0c9e66e 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_record.cpp
+++ b/third_party/nvfuser/csrc/python_frontend/test/test_nvfuser_fusion_record.cpp
@@ -4,9 +4,9 @@
 
 #include <torch/torch.h>
 
-#include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
+#include <python_frontend/fusion_record.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
 
 // Tests go in torch::jit
 namespace torch {
diff --git a/third_party/nvfuser/csrc/register_interface.cpp b/third_party/nvfuser/csrc/register_interface.cpp
new file mode 100644
index 000000000000..ffb19a18559a
--- /dev/null
+++ b/third_party/nvfuser/csrc/register_interface.cpp
@@ -0,0 +1,745 @@
+#include <manager.h>
+#include <parser.h>
+#include <partition.h>
+#include <register_interface.h>
+
+#include <ATen/core/dispatch/OperatorOptions.h>
+#include <ATen/native/NonSymbolicBC.h>
+#include <ATen/native/TensorShape.h>
+#include <c10/util/CallOnce.h>
+#include <c10/util/irange.h>
+#include <torch/csrc/jit/runtime/custom_operator.h>
+#include <torch/csrc/jit/runtime/profiling_record.h>
+#include <torch/csrc/jit/runtime/register_ops_utils.h>
+
+/*
+ * Registers function pointers in interface.h
+ */
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cuda {
+
+namespace {
+class RegisterInterface {
+ public:
+  RegisterInterface() {
+    auto ptr = getFuserInterface();
+    ptr->fn_compile_n = &compileCudaFusionGroup;
+    ptr->fn_run_n_s = &runCudaFusionGroup;
+    ptr->fn_fuse_graph = &CudaFuseGraph;
+    ptr->fn_can_fuse_n = &isFusibleCudaFusionGroup;
+    ptr->fn_insert_profile_inodes = &InsertProfileNodes;
+    ptr->fn_profile_n = &shouldProfileNode;
+    ptr->fn_skip_n = &skipNodeKind;
+  }
+};
+
+static RegisterInterface register_interface_;
+
+class RegisterNVFuserPass {
+ public:
+  RegisterNVFuserPass() {
+    NVFuserPassManager::registerPass(true);
+  }
+};
+
+static RegisterNVFuserPass register_nvfuser_pass_;
+
+} // namespace
+
+//! [ Note -- type guard logic in CudaFusionGuard ]
+//!
+//! CudaFusionGuard is used to Guard input tensor to `CudaFusionGroup` so that
+//! we would not feed inputs that violates the graph defined in `GraphCache`.
+//!
+//! see [ Note -- 2 level cache implementation ] for definition of unique
+//! computational graph.
+//! see [ Note -- CudaFusionGuard implementation] for details on how guard works
+//! in profiling executor
+//!
+//! Type guard logic is used to query whether a runtime input `tensor` compiles
+//! with profiled `guard_tensor_type`. `guard_tensor_type` is the observed
+//! tensor type during profiling runs.
+//!
+//! At this moment, we only do single profiling run, so `guard_tensor_type` has
+//! static shape / stride / scalarType. *This might be a little confusing as our
+//! implementation is actually more relaxed.
+//!
+//! Things that we check:
+//!   a. identical rank & scalar type
+//!   b. stride check:
+//!        b.1. identical stride order
+//!        b.2. identical contiguity
+//!             note that contiguity here is used for tensor collapsing. So
+//!             extra attention should be paid to contiguity across size-1
+//!             dimensions.
+//!   c. size check:
+//!        c.1 broadcast check:
+//!        making sure that broadcast semantics are identical. So we want to
+//!        make sure a given dimension either are both size-1 for `tensor` &
+//!        `guard_tensor_type`, or are both non-size-1.
+//!        This is due to the fact that we specialize size-1 dimension as
+//!        broadcasted dimension while translating PyTorch tensor to Fusion IR.
+//!        c.1 size-0 check:
+//!        we don't specialize this on codegen, but we do specialize fusion
+//!        logic for size-0 on reductoins, hence the check
+//!
+bool complyWith(
+    const at::Tensor& tensor,
+    const c10::TensorTypePtr& guard_tensor_type) {
+  // guard broadcast semantics, contiguity & stride order;
+  TORCH_INTERNAL_ASSERT(
+      guard_tensor_type && guard_tensor_type->dim().has_value());
+
+  // check a. if num_dimension check fails or scalar type check fails
+  if (*guard_tensor_type->dim() != static_cast<size_t>(tensor.ndimension()) ||
+      (guard_tensor_type->scalarType().has_value() &&
+       (guard_tensor_type->scalarType().value() != tensor.scalar_type())) ||
+      (guard_tensor_type->device().has_value() &&
+       (guard_tensor_type->device().value() != tensor.device())) ||
+      (guard_tensor_type->requiresGrad().has_value() &&
+       guard_tensor_type->requiresGrad().value() !=
+           (tensor.requires_grad() && at::GradMode::is_enabled()))) {
+    return false;
+  }
+
+  // TODO: should we get symbolic_size instead and check for size
+  // consistency across tensors as well?
+  const auto& sizes = guard_tensor_type->sizes();
+  // see [ Note -- stirde_properties in tensor type ]
+  const auto& stride_properties = guard_tensor_type->stride_properties();
+
+  const auto& t_sizes = tensor.sizes();
+  const auto& t_strides = tensor.strides();
+  int inner_dim = -1;
+  for (const auto j : c10::irange(*guard_tensor_type->dim())) {
+    // check b. for stride check, we go along dimensions from fastest stride to
+    // slowest stride
+    int sorted_index = stride_properties[j]->stride_index_
+        ? static_cast<int>(*stride_properties[j]->stride_index_)
+        : -1;
+
+    // only apply stride check when we have stride_properties
+    if (sorted_index != -1) {
+      // check b.1. stride order [current dimension has stride larger
+      // than its inner dimension(s)], check only applies when both:
+      //     i. already encountered an inner dimension
+      //    ii. not at the fastest dimension
+      if (j != 0 && inner_dim != -1) {
+        // we are not looking at dim-j, but dim-sorted_index, which
+        // is the j-th fastest dim;
+        // Note: we ignore 0-stride dimension, since eager logic on stride
+        // indices is ambiguous
+        if (t_strides[sorted_index] != 0 && t_strides[inner_dim] != 0 &&
+            t_strides[sorted_index] < t_strides[inner_dim]) {
+          return false;
+        }
+      }
+
+      // check b.2. contiguity, we only check when it's marked as
+      // contiguous.
+      if (stride_properties[j]->contiguous_ &&
+          *stride_properties[j]->contiguous_) {
+        if (j != 0) {
+          // we use contiguity to collapse dimension, if size == 1, it is
+          // always collapsible
+          // computeStrideProps also default to contiguous when stride == 1
+          if (t_sizes[sorted_index] != 1 && t_strides[sorted_index] != 1) {
+            TORCH_INTERNAL_ASSERT(
+                stride_properties[j - 1]->stride_index_.has_value(),
+                "Counknown index is meaningless");
+            // TODO: merge this check up
+            if (t_strides[sorted_index] !=
+                t_strides[inner_dim] * t_sizes[inner_dim]) {
+              return false;
+            }
+          }
+        } else {
+          // TODO: merge this check up
+          if (t_strides[sorted_index] != 1) {
+            return false;
+          }
+        }
+      }
+
+      // update inner_dim to be current dim. Note that we try to skip update
+      // when current `t_size[sorted_index] == 1`, because:
+      //   1. stride comparison on a size-1 dimension is meaningless
+      //      [check b.1]
+      //   2. contiguity on a size-1 dimension is misleading. For collapsing,
+      //      we should actually look at the next non-size-1 dimension
+      //      [check b.2]
+      if (inner_dim == -1 || t_sizes[sorted_index] != 1) {
+        inner_dim = sorted_index;
+      }
+    }
+
+    // check c.1, we go along semantic ordered dimensions
+    // check broadcast / size-1:
+    bool guard_bcast = sizes[j].has_value() && sizes[j].value() == 1;
+    if (guard_bcast != (t_sizes[j] == 1)) {
+      return false;
+    }
+
+    // check c.2, check for size-0
+    bool guard_size_0 = sizes[j].has_value() && sizes[j].value() == 0;
+    if (guard_size_0 != (t_sizes[j] == 0)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+} // namespace cuda
+} // namespace fuser
+
+namespace {
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators size_eq_guard({
+    Operator(
+        //"prim::CudaFusionSizeEq(int[] size, int[] ref) -> bool",
+        "prim::CudaFusionSizeEq(...) -> bool",
+        // prim::CudaFusionGuard returns a fresh Boolean type without aliasing.
+        // if we would ever return refined tensor, which would change aliasing
+        // analysis, we should update aliasdb pass.
+        [](const Node* node) -> Operation {
+          return [](Stack& stack) {
+            at::ArrayRef<IValue> inputs = last(stack, 2);
+            drop(stack, 2);
+
+            if (!fuser::cuda::getCudaFusionGuardMode()) {
+              push(stack, IValue(true));
+              return;
+            }
+
+            // auto inp = inputs[0].toIntList();
+            TORCH_INTERNAL_ASSERT(
+                inputs[1].isIntList(), "reference needs to be of int list");
+            auto ref = inputs[1].toIntList();
+
+            auto ret = true;
+            if (ref.empty()) {
+              ret = inputs[0].isNone();
+            } else {
+              if (inputs[0].isIntList()) {
+                auto inp = inputs[0].toIntList();
+                if (inp.size() != ref.size()) {
+                  push(stack, IValue(false));
+                  return;
+                }
+
+                for (const auto i : c10::irange(inp.size())) {
+                  if (((inp[i] == 1) != (ref[i] == 1))) {
+                    ret = false;
+                    break;
+                  }
+                }
+              } else {
+                ret = false;
+              }
+            }
+
+            push(stack, IValue(ret));
+            return;
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_fusion({
+    Operator(
+        prim::CudaFusionGroup,
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            fuser::cuda::runFusionGroup(node, stack);
+          };
+        },
+        aliasAnalysisSpecialCase()),
+});
+
+RegisterOperators reg_guard({
+    Operator(
+        "prim::CudaFusionGuard(...) -> bool",
+        // prim::CudaFusionGuard returns a fresh Boolean type without aliasing.
+        // if we would ever return refined tensor, which would change aliasing
+        // analysis, we should update aliasdb pass.
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            // TODO: check latency here!!!!
+            std::vector<TypePtr> types = node->tys(attr::types);
+            const auto num_inputs = types.size();
+            at::ArrayRef<IValue> inputs = last(stack, num_inputs);
+            drop(stack, num_inputs);
+
+            if (!fuser::cuda::getCudaFusionGuardMode()) {
+              push(stack, IValue(true));
+              return;
+            }
+
+            for (const auto i : c10::irange(num_inputs)) {
+              const c10::TensorTypePtr& guard_tensor_type =
+                  types[i]->cast<TensorType>();
+
+              // TODO: maybe we should just push false and fallback
+              TORCH_INTERNAL_ASSERT(inputs[i].isTensor());
+              const at::Tensor& tensor = inputs[i].toTensor();
+
+              if (!fuser::cuda::complyWith(tensor, guard_tensor_type)) {
+                push(stack, IValue(false));
+                return;
+              }
+            }
+
+            // TODO: check type and return the right flag
+            // naively return true;
+            push(stack, IValue(true));
+            return;
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// Infer dynamic axis (-1) in view_sizes given tensor_sizes
+bool inferViewShape(
+    c10::List<int64_t> tensor_sizes,
+    c10::List<int64_t> view_sizes) {
+  int64_t dynamic_index = -1;
+  size_t view_size_num_elements = 1;
+  for (size_t idx = 0; idx < view_sizes.size(); ++idx) {
+    if (view_sizes[idx] == -1) {
+      TORCH_INTERNAL_ASSERT(
+          dynamic_index == -1, "Only one dimension can by inferred.")
+      dynamic_index = idx;
+    } else {
+      TORCH_INTERNAL_ASSERT(view_sizes[idx] > 0);
+      view_size_num_elements *= view_sizes[idx];
+    }
+  }
+  const size_t kNumElements = std::accumulate(
+      tensor_sizes.begin(), tensor_sizes.end(), 1, std::multiplies<>());
+
+  if (kNumElements % view_size_num_elements != 0) {
+    return false;
+  }
+
+  if (dynamic_index != -1) {
+    view_sizes[dynamic_index] = kNumElements / view_size_num_elements;
+  }
+
+  return true;
+}
+
+//!
+//! CudaFusionViewGuard Example Graph:
+//!
+//! graph(%self : __torch__.BiasViewRelu,
+//!       %inputs.1 : Tensor):
+//!   %2 : int = prim::Constant[value=-1]() # dynamic_bvg.py:50:40
+//!   %3 : int = prim::Constant[value=1]() # dynamic_bvg.py:50:25
+//!   %4 : NoneType = prim::Constant()
+//!   %5 : int[] = prim::Constant[value=[2, 3]]()
+//!   %6 : int[] = aten::size(%inputs.1) # dynamic_bvg.py:50:25
+//!   %7 : int[] = aten::slice(%6, %4, %2, %3) # dynamic_bvg.py:50:25
+//!   %view_shape.1 : int[] = aten::add(%7, %5) # dynamic_bvg.py:50:25
+//!   %bias : Tensor = prim::GetAttr[name="bias"](%self)
+//!   %10 : int[] = aten::size(%bias)
+//!   %11 : int[] = prim::BroadcastSizes(%6, %10)
+//!   %12 : bool = prim::CudaFusionGuard[types=[...]](%inputs.1, %bias)
+//!   %13 : int[] = prim::Constant[value=[-1, -1, -1, 6]]()
+//!   %14 : int[] = prim::Constant[value=[-1, -1, -1, 2, 3]]()
+//!   %15 : bool = prim::CudaFusionViewGuard(%11, %view_shape.1, %13, %14)
+//!   %16 : bool[] = prim::ListConstruct(%15, %12)
+//!   %17 : bool = aten::all(%16)
+//!   %18 : Tensor = prim::If(%17)
+//!     block0():
+//!       %19 : Tensor = prim::CudaFusionGroup_0[cache_id=0](%inputs.1, %bias)
+//!       -> (%19)
+//!     block1():
+//!       %20 : Function = prim::Constant[name="fallback_fn", fallback=1]()
+//!       %21 : (...) = prim::CallFunction(%20, %inputs.1, %bias, %view_shape.1)
+//!       %22 : Float(...) = prim::TupleUnpack(%21)
+//!       -> (%22)
+//!   return (%18)
+//! with prim::CudaFusionGroup_0 = graph(%0 : Float(...),
+//!       %1 : Float(...)):
+//!   %2 : int[] = prim::Constant[value=[2, 3, 4, 2, 3]]()
+//!   %3 : int = prim::Constant[value=1]() # dynamic_bvg.py:50:25
+//!   %o.1 : Float(...) = aten::add(%0, %1, %3) # dynamic_bvg.py:51:16
+//!   %5 : Float(...) = prim::view_copy(%o.1, %2)
+//!   %6 : Float(...) = aten::relu(%5) # dynamic_bvg.py:53:19
+//!   return (%6)
+//!
+RegisterOperators view_guard({
+    Operator(
+        "prim::CudaFusionViewGuard(...) -> bool",
+        // prim::CudaFusionViewGuard returns a fresh Boolean type without
+        // aliasing. if we would ever return refined tensor, which would change
+        // aliasing analysis, we should update aliasdb pass.
+        [](const Node* node) -> Operation {
+          return [](Stack& stack) {
+            // view_sizes_constraint - Constant List[Int]
+            at::ArrayRef<IValue> inputs = last(stack, 3);
+
+            // tensor_sizes is the runtime size for the self tensor
+            // tensor_sizes - dynamic size List[Int]
+            TORCH_INTERNAL_ASSERT(
+                inputs[0].isIntList(), "tensor_sizes needs to be Int List");
+            auto tensor_sizes = inputs[0].toIntList();
+
+            // profiled_view_sizes is the runtime view size
+            // profiled_view_sizes - profile_ivalue List[Int]
+            TORCH_INTERNAL_ASSERT(
+                inputs[1].isIntList(),
+                "profiled_view_sizes needs to be Int list");
+            auto profiled_view_sizes = inputs[1].toIntList();
+
+            // tensor_constraints is a constant List[Int]
+            // used to guard tensor_sizes
+            TORCH_INTERNAL_ASSERT(
+                inputs[2].isIntList(),
+                "tensor constraint needs to be Int List");
+            auto tensor_constraints = inputs[2].toIntList();
+
+            // Drop after gather all input arguments
+            // If an argument is moved, it is destroyed when dropped from stack
+            drop(stack, 3);
+
+            auto status = inferViewShape(tensor_sizes, profiled_view_sizes);
+            if (!status) {
+              push(stack, IValue(false));
+              return;
+            }
+
+            if (!fuser::cuda::getCudaFusionGuardMode()) {
+              push(stack, IValue(true));
+              return;
+            }
+            std::vector<int64_t> tensor_sizes_int_vec = tensor_sizes.vec();
+            std::vector<int64_t> view_sizes_int_vec = tensor_sizes.vec();
+            std::vector<int64_t> previous_constraints =
+                tensor_constraints.vec();
+            auto new_constraints =
+                torch::jit::fuser::cuda::analyzeViewConstraint(
+                    tensor_sizes_int_vec, view_sizes_int_vec);
+            bool guard_status =
+                (new_constraints.conglomerateString() == previous_constraints);
+            push(stack, IValue(guard_status));
+            return;
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+RegisterOperators ivalue_guard({
+    Operator(
+        "prim::CudaFusionIvalGuard(...) -> bool",
+        [](const Node* node) -> Operation {
+          return [](Stack& stack) {
+            at::ArrayRef<IValue> inputs = last(stack, 2);
+            drop(stack, 2);
+            if (!fuser::cuda::getCudaFusionGuardMode()) {
+              push(stack, IValue(true));
+              return;
+            }
+            push(stack, inputs[0].equals(inputs[1]));
+            return;
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_add_optional({
+    Operator(
+        "prim::add_optional(Tensor(a) input, Tensor? bias) -> Tensor(a)",
+        [](const Node* node) -> Operation {
+          return [](Stack& stack) {
+            IValue input, bias;
+            pop(stack, input, bias);
+            if (bias.isNone()) {
+              push(stack, std::move(input));
+            } else {
+              push(stack, at::add(input.toTensor(), bias.toTensor(), 1.0));
+            }
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_permute_copy({
+    Operator(
+        "prim::permute_copy(Tensor(a) self, int[] dims) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "permute_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self, dims;
+            pop(stack, self, dims);
+            push(stack, at::native::view(self.toTensor(), dims.toIntVector()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_transpose_copy({
+    Operator(
+        "prim::transpose_copy.int(Tensor(a) self, int dim0, int dim1) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "transpose_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self, dim0, dim1;
+            pop(stack, self, dim0, dim1);
+            push(
+                stack,
+                at::transpose(self.toTensor(), dim0.toInt(), dim1.toInt()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_t_copy({
+    Operator(
+        "prim::t_copy(Tensor(a) self) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "t_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self;
+            pop(stack, self);
+            push(stack, at::t(self.toTensor()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_view_copy({
+    Operator(
+        "prim::view_copy(Tensor self, int[] size) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "view_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self, size;
+            pop(stack, self, size);
+            push(stack, at::native::view(self.toTensor(), size.toIntVector()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_flatten_copy({
+    Operator(
+        "prim::flatten_copy(Tensor self, int start_dim, int end_dim) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "flatten_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self, start_dim, end_dim;
+            pop(stack, self, start_dim, end_dim);
+            push(
+                stack,
+                at::native::flatten(
+                    self.toTensor(), start_dim.toInt(), end_dim.toInt()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_reshape_copy({
+    Operator(
+        "prim::reshape_copy(Tensor self, int[] shape) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "reshape_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self, shape;
+            pop(stack, self, shape);
+            push(
+                stack,
+                at::native::reshape(self.toTensor(), shape.toIntVector()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_squeeze_copy({
+    Operator(
+        "prim::squeeze_copy(Tensor self) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "squeeze_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self;
+            pop(stack, self);
+            push(stack, at::squeeze(self.toTensor()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_squeeze_dim_copy({
+    Operator(
+        "prim::squeeze_copy.dim(Tensor self, int dim) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "squeeze_dim_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self, dim;
+            pop(stack, self, dim);
+            push(stack, at::squeeze(self.toTensor(), dim.toInt()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_unsqueeze_copy({
+    Operator(
+        "prim::unsqueeze_copy(Tensor self, int dim) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "unsqueeze_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self, dim;
+            pop(stack, self, dim);
+            push(stack, at::unsqueeze(self.toTensor(), dim.toInt()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_infer_unsqueeze_size({
+    Operator(
+        "prim::infer_unsqueeze_size(int[] a, int dim) -> int[]",
+        [](const Node* node) -> Operation {
+          return [](Stack& stack) {
+            auto dim = pop(stack).toInt();
+            auto size = pop(stack).toIntVector();
+            if (dim < 0) {
+              dim = dim + 1 + size.size();
+            }
+            auto it = size.begin() + dim;
+            size.insert(it, 1);
+            push(stack, IValue(size));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_infer_squeeze_dim_size({
+    Operator(
+        "prim::infer_squeeze_size.dim(int[] a, int dim) -> int[]",
+        [](const Node* node) -> Operation {
+          return [](Stack& stack) {
+            auto dim = pop(stack).toInt();
+            auto size = pop(stack).toIntVector();
+            if (dim < 0) {
+              dim = dim + size.size();
+            }
+            auto it = size.begin() + dim;
+            if (*it == 1) {
+              size.erase(it);
+            }
+            push(stack, IValue(size));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_infer_squeeze_size({
+    Operator(
+        "prim::infer_squeeze_size(int[] a) -> int[]",
+        [](const Node* node) -> Operation {
+          return [](Stack& stack) {
+            auto size = pop(stack).toIntVector();
+
+            for (auto it = size.begin(); it != size.end(); it++) {
+              if (*it == 1) {
+                auto pre = it - 1;
+                size.erase(it);
+                it = pre;
+              }
+            }
+            push(stack, IValue(size));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_expand_copy({
+    Operator(
+        "prim::expand_copy(Tensor self, int[] size, *, bool implicit=False) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "expand_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self, size, implicit;
+            pop(stack, self, size, implicit);
+            push(stack, self.toTensor().expand(size.toIntVector()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_expand_as_copy({
+    Operator(
+        "prim::expand_as_copy(Tensor self, Tensor other) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "expand_as_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self, other;
+            pop(stack, self, other);
+            push(
+                stack,
+                at::native::expand_as(self.toTensor(), other.toTensor()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+} // namespace
+
+} // namespace jit
+} // namespace torch
diff --git a/third_party/nvfuser/csrc/register_interface.h b/third_party/nvfuser/csrc/register_interface.h
new file mode 100644
index 000000000000..9ad6e8a15c6b
--- /dev/null
+++ b/third_party/nvfuser/csrc/register_interface.h
@@ -0,0 +1,48 @@
+#pragma once
+#include <manager.h>
+#include <transform_view.h>
+
+#include <c10/macros/Export.h>
+#include <torch/csrc/jit/codegen/cuda/interface.h>
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/passes/pass_manager.h>
+#include <torch/csrc/jit/runtime/profiling_record.h>
+
+/*
+ * This file contains APIs for cuda fuser;
+ *
+ * We use an empty static struct to hold the function pointers, which are
+ * registered separately. This is to support cpu-only compilation.
+ * Registration is done in torch/csrc/jit/codegen/cuda/register_interface.cpp
+ */
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cuda {
+
+TORCH_CUDA_CU_API bool complyWith(
+    const at::Tensor& tensor,
+    const c10::TensorTypePtr& guard_tensor_type);
+
+struct TORCH_CUDA_CU_API NVFuserPassManager
+    : public PassManager<NVFuserPassManager> {
+  static bool registerPass(bool enabled) {
+    bool old_value = PassManager::isRegistered();
+    if (enabled) {
+      PassManager::registerPass(fuseGraph);
+    } else {
+      PassManager::clearPass();
+    }
+    return old_value;
+  }
+
+  static bool isRegistered() {
+    return PassManager::isRegistered();
+  }
+};
+
+} // namespace cuda
+} // namespace fuser
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/root_domain_map.cpp b/third_party/nvfuser/csrc/root_domain_map.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/root_domain_map.cpp
rename to third_party/nvfuser/csrc/root_domain_map.cpp
index 235d257e2351..776316858985 100644
--- a/torch/csrc/jit/codegen/cuda/root_domain_map.cpp
+++ b/third_party/nvfuser/csrc/root_domain_map.cpp
@@ -1,7 +1,7 @@
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <root_domain_map.h>
 
 #include <sstream>
 
diff --git a/torch/csrc/jit/codegen/cuda/root_domain_map.h b/third_party/nvfuser/csrc/root_domain_map.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/root_domain_map.h
rename to third_party/nvfuser/csrc/root_domain_map.h
index fa3d323ba6d2..b4bce99f9584 100644
--- a/torch/csrc/jit/codegen/cuda/root_domain_map.h
+++ b/third_party/nvfuser/csrc/root_domain_map.h
@@ -1,9 +1,9 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <disjoint_set.h>
+#include <ir_all_nodes.h>
+#include <iter_visitor.h>
+#include <utils.h>
 
 #include <c10/macros/Export.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h b/third_party/nvfuser/csrc/scheduler/all_schedulers.h
similarity index 51%
rename from torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h
rename to third_party/nvfuser/csrc/scheduler/all_schedulers.h
index d01d226efe42..7c5f51c31759 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h
+++ b/third_party/nvfuser/csrc/scheduler/all_schedulers.h
@@ -1,8 +1,8 @@
 #pragma once
-#include <torch/csrc/jit/codegen/cuda/scheduler/normalization.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/pointwise.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/transpose.h>
+#include <scheduler/normalization.h>
+#include <scheduler/pointwise.h>
+#include <scheduler/reduction.h>
+#include <scheduler/transpose.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h b/third_party/nvfuser/csrc/scheduler/compile_time_info.h
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h
rename to third_party/nvfuser/csrc/scheduler/compile_time_info.h
index 6453962bfec8..b8adc34db455 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h
+++ b/third_party/nvfuser/csrc/scheduler/compile_time_info.h
@@ -1,9 +1,9 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
+#include <fusion.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/pointwise_utils.h>
+#include <scheduler/utils.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/debug_utils.h b/third_party/nvfuser/csrc/scheduler/debug_utils.h
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/scheduler/debug_utils.h
rename to third_party/nvfuser/csrc/scheduler/debug_utils.h
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/heuristic.h b/third_party/nvfuser/csrc/scheduler/heuristic.h
similarity index 86%
rename from torch/csrc/jit/codegen/cuda/scheduler/heuristic.h
rename to third_party/nvfuser/csrc/scheduler/heuristic.h
index a828d66fdf03..0fb187506174 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/heuristic.h
+++ b/third_party/nvfuser/csrc/scheduler/heuristic.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <executor_launch_params.h>
+#include <utils.h>
 
 #include <string>
 
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/matmul.cpp b/third_party/nvfuser/csrc/scheduler/matmul.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/scheduler/matmul.cpp
rename to third_party/nvfuser/csrc/scheduler/matmul.cpp
index ca3abc75aabd..0e44400e0505 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/matmul.cpp
+++ b/third_party/nvfuser/csrc/scheduler/matmul.cpp
@@ -1,6 +1,6 @@
-#include <torch/csrc/jit/codegen/cuda/scheduler/matmul.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/mma_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
+#include <scheduler/matmul.h>
+#include <scheduler/mma_utils.h>
+#include <scheduler/utils.h>
 
 namespace torch {
 namespace jit {
@@ -13,7 +13,7 @@ namespace {
 //      [... I0, B, I1] -> [... B, I0, I1]
 //  should probably be only used to order innermost mnk axes.
 void moveInnerBroadcastLeft(TensorView* tv, int number_of_inner_pos = 3) {
-  TORCH_INTERNAL_ASSERT(tv->nDims() >= number_of_inner_pos);
+  TORCH_INTERNAL_ASSERT(int(tv->nDims()) >= number_of_inner_pos);
   std::vector<int> broadcast_pos;
   std::vector<int> nonbroadcast_pos;
 
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/matmul.h b/third_party/nvfuser/csrc/scheduler/matmul.h
similarity index 92%
rename from torch/csrc/jit/codegen/cuda/scheduler/matmul.h
rename to third_party/nvfuser/csrc/scheduler/matmul.h
index cade826a2679..a487d9313e03 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/matmul.h
+++ b/third_party/nvfuser/csrc/scheduler/matmul.h
@@ -2,8 +2,8 @@
 
 #include <ATen/core/ivalue.h>
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/mma_type.h>
+#include <fusion.h>
+#include <mma_type.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp b/third_party/nvfuser/csrc/scheduler/mma_utils.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp
rename to third_party/nvfuser/csrc/scheduler/mma_utils.cpp
index ddf1061591ed..3b11292d34df 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp
+++ b/third_party/nvfuser/csrc/scheduler/mma_utils.cpp
@@ -1,10 +1,10 @@
 
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/ir_printer.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/mma_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
+#include <expr_evaluator.h>
+#include <ir_printer.h>
+#include <lower_utils.h>
+#include <root_domain_map.h>
+#include <scheduler/mma_utils.h>
+#include <scheduler/utils.h>
 
 namespace torch {
 namespace jit {
@@ -921,7 +921,8 @@ void scheduler_utils::matmul_utils::canonicalizeMmaTvOrdering(TensorView* tv) {
 
   // Validate that all of the root ids are covered by
   //  the inserted categories.
-  TORCH_INTERNAL_ASSERT(current_pos == ndims, "Id not completely categorized");
+  TORCH_INTERNAL_ASSERT(
+      current_pos == (int)ndims, "Id not completely categorized");
 
   // Apply the new ordering
   tv->reorder(order_map);
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/mma_utils.h b/third_party/nvfuser/csrc/scheduler/mma_utils.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/scheduler/mma_utils.h
rename to third_party/nvfuser/csrc/scheduler/mma_utils.h
index 03cbea6d3cff..f6835b096f84 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/mma_utils.h
+++ b/third_party/nvfuser/csrc/scheduler/mma_utils.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/mma_type.h>
+#include <fusion.h>
+#include <mma_type.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp b/third_party/nvfuser/csrc/scheduler/normalization.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp
rename to third_party/nvfuser/csrc/scheduler/normalization.cpp
index 459974b8d288..114d7e457e92 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp
+++ b/third_party/nvfuser/csrc/scheduler/normalization.cpp
@@ -1,15 +1,15 @@
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction.h>
-
-#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <scheduler/reduction.h>
+
+#include <executor_utils.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <scheduler/reduction_utils.h>
+#include <scheduler/registry.h>
+#include <scheduler/utils.h>
+#include <scheduler/vectorize_helper.h>
+#include <transform_replay.h>
 
 #include <ATen/cuda/CUDAContext.h>
 
@@ -791,7 +791,7 @@ std::shared_ptr<ReductionParams> persistentHeuristic(
   return rparams;
 }
 
-TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getPersistentHeuristics(
+std::shared_ptr<ReductionParams> getPersistentHeuristics(
     Fusion* fusion,
     SchedulerRuntimeInfo& runtime_info,
     HeuristicSummary* data_cache) {
@@ -946,7 +946,7 @@ TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getPersistentHeuristics(
       project_persistent_buffers);
 }
 
-TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getPersistentHeuristics(
+std::shared_ptr<ReductionParams> getPersistentHeuristics(
     Fusion* fusion,
     const at::ArrayRef<c10::IValue>& runtime_inputs,
     HeuristicSummary* data_cache) {
@@ -956,9 +956,7 @@ TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getPersistentHeuristics(
 }
 
 // fusion is the input IR that will be modified by this function
-TORCH_CUDA_CU_API void schedulePersistentKernel(
-    Fusion* fusion,
-    const ReductionParams& rparams) {
+void schedulePersistentKernel(Fusion* fusion, const ReductionParams& rparams) {
   FUSER_PERF_SCOPE("schedulePersistentKernel");
 
   FusionGuard fg(fusion);
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/normalization.h b/third_party/nvfuser/csrc/scheduler/normalization.h
similarity index 89%
rename from torch/csrc/jit/codegen/cuda/scheduler/normalization.h
rename to third_party/nvfuser/csrc/scheduler/normalization.h
index dbf2eb895f0f..ba5fea609027 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/normalization.h
+++ b/third_party/nvfuser/csrc/scheduler/normalization.h
@@ -2,8 +2,8 @@
 
 #include <ATen/core/ivalue.h>
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_heuristic.h>
+#include <fusion.h>
+#include <scheduler/reduction_heuristic.h>
 
 // TODO: If caching inputs would require persistence we are sending it to the
 // persistent kerenl scheduler. This isn't necessary if the only persistent
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp b/third_party/nvfuser/csrc/scheduler/pointwise.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp
rename to third_party/nvfuser/csrc/scheduler/pointwise.cpp
index b40e6fbf7cf7..d05f4a02d701 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp
+++ b/third_party/nvfuser/csrc/scheduler/pointwise.cpp
@@ -1,17 +1,17 @@
-#include <torch/csrc/jit/codegen/cuda/scheduler/pointwise.h>
-
-#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
-#include <torch/csrc/jit/codegen/cuda/inlining.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <scheduler/pointwise.h>
+
+#include <executor_utils.h>
+#include <inlining.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <lower_utils.h>
+#include <scheduler/pointwise_utils.h>
+#include <scheduler/registry.h>
+#include <scheduler/utils.h>
+#include <scheduler/vectorize_helper.h>
+#include <transform_replay.h>
+#include <utils.h>
 
 #include <ATen/cuda/CUDAContext.h>
 
@@ -55,7 +55,9 @@ class DomainMap : public pointwise_utils::DomainMap {
  private:
   bool hasMinimumSize(TensorView* tv, int num_axes) const {
     TORCH_INTERNAL_ASSERT(tv != nullptr);
-    return (num_axes == 0 || tv->getMaybeRFactorDomain().size() > num_axes);
+    return (
+        num_axes == 0 ||
+        (int64_t)tv->getMaybeRFactorDomain().size() > num_axes);
   }
 };
 
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h b/third_party/nvfuser/csrc/scheduler/pointwise.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/scheduler/pointwise.h
rename to third_party/nvfuser/csrc/scheduler/pointwise.h
index f3a1da7bcff5..a0bcf4a17818 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h
+++ b/third_party/nvfuser/csrc/scheduler/pointwise.h
@@ -2,8 +2,8 @@
 
 #include <ATen/core/ivalue.h>
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/pointwise_heuristic.h>
+#include <fusion.h>
+#include <scheduler/pointwise_heuristic.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_heuristic.h b/third_party/nvfuser/csrc/scheduler/pointwise_heuristic.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/scheduler/pointwise_heuristic.h
rename to third_party/nvfuser/csrc/scheduler/pointwise_heuristic.h
index 3d2cb5ee9521..dc67ba1fdb23 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_heuristic.h
+++ b/third_party/nvfuser/csrc/scheduler/pointwise_heuristic.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/scheduler/heuristic.h>
+#include <scheduler/heuristic.h>
 
 #include <sstream>
 
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.cpp b/third_party/nvfuser/csrc/scheduler/pointwise_utils.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.cpp
rename to third_party/nvfuser/csrc/scheduler/pointwise_utils.cpp
index cf823322078f..d6329202a4bd 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.cpp
+++ b/third_party/nvfuser/csrc/scheduler/pointwise_utils.cpp
@@ -1,4 +1,4 @@
-#include <torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h>
+#include <scheduler/pointwise_utils.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h b/third_party/nvfuser/csrc/scheduler/pointwise_utils.h
similarity index 89%
rename from torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
rename to third_party/nvfuser/csrc/scheduler/pointwise_utils.h
index 6cc4b1b8b93b..c6dbe91c96b2 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
+++ b/third_party/nvfuser/csrc/scheduler/pointwise_utils.h
@@ -1,9 +1,9 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/compute_at_map.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
+#include <compute_at_map.h>
+#include <ir_all_nodes.h>
+#include <ir_utils.h>
+#include <scheduler/utils.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp b/third_party/nvfuser/csrc/scheduler/reduction.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp
rename to third_party/nvfuser/csrc/scheduler/reduction.cpp
index 3037f8469dad..a1b6c33cd1bb 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp
+++ b/third_party/nvfuser/csrc/scheduler/reduction.cpp
@@ -1,16 +1,16 @@
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction.h>
+#include <scheduler/reduction.h>
 
-#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <executor_utils.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_utils.h>
+#include <scheduler/reduction_utils.h>
+#include <scheduler/registry.h>
+#include <scheduler/utils.h>
+#include <scheduler/vectorize_helper.h>
+#include <transform_replay.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
+#include <ir_iostream.h>
 
 #include <ATen/cuda/CUDAContext.h>
 
@@ -568,7 +568,7 @@ std::shared_ptr<ReductionParams> outerReductionHeuristic(
           // There's a place to put it in the device
           || target_blocks < device_multiprocessor_count * 4
           // There's a place to put it in unrolling
-          || target_unroll < vectorize_factor)) {
+          || target_unroll < int64_t(vectorize_factor))) {
     if (target_threads_in_block <
         ceilDiv(device_max_threads_per_multiprocessor, (int64_t)4)) {
       target_threads_in_block *= 2;
@@ -584,7 +584,8 @@ std::shared_ptr<ReductionParams> outerReductionHeuristic(
     if (target_blocks > device_multiprocessor_count &&
         target_threads_in_block >
             ceilDiv(device_max_threads_per_multiprocessor, (int64_t)16) &&
-        target_unroll < vectorize_factor && available_parallelism() > 1) {
+        target_unroll < int64_t(vectorize_factor) &&
+        available_parallelism() > 1) {
       target_unroll *= 2;
     }
   }
@@ -668,7 +669,8 @@ std::shared_ptr<ReductionParams> outerReductionHeuristic(
     iter_unroll_factor = std::min(iter_unroll_factor, iDimAvail());
     iter_unroll_factor = std::min(iter_unroll_factor, target_unroll);
     iter_unroll_factor = scheduler_utils::lastPow2(iter_unroll_factor);
-    if (vectorize_factor > 1 && iter_unroll_factor <= vectorize_factor) {
+    if (vectorize_factor > 1 &&
+        iter_unroll_factor <= (int64_t)vectorize_factor) {
       iter_unroll_factor =
           std::min(iter_unroll_factor, (int64_t)vectorize_factor);
       vectorize = true;
@@ -867,7 +869,7 @@ std::shared_ptr<ReductionParams> reductionHeuristic(
   }
 }
 
-TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getReductionHeuristics(
+std::shared_ptr<ReductionParams> getReductionHeuristics(
     Fusion* fusion,
     const at::ArrayRef<c10::IValue>& runtime_inputs,
     HeuristicSummary* data_cache) {
@@ -878,7 +880,7 @@ TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getReductionHeuristics(
   return getReductionHeuristics(fusion, runtime_info, data_cache);
 }
 
-TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getReductionHeuristics(
+std::shared_ptr<ReductionParams> getReductionHeuristics(
     Fusion* fusion,
     SchedulerRuntimeInfo& runtime_info,
     HeuristicSummary* data_cache) {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/reduction.h b/third_party/nvfuser/csrc/scheduler/reduction.h
similarity index 85%
rename from torch/csrc/jit/codegen/cuda/scheduler/reduction.h
rename to third_party/nvfuser/csrc/scheduler/reduction.h
index c09608e74b07..78eaef592df9 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/reduction.h
+++ b/third_party/nvfuser/csrc/scheduler/reduction.h
@@ -2,8 +2,8 @@
 
 #include <ATen/core/ivalue.h>
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_heuristic.h>
+#include <fusion.h>
+#include <scheduler/reduction_heuristic.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/reduction_heuristic.h b/third_party/nvfuser/csrc/scheduler/reduction_heuristic.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/scheduler/reduction_heuristic.h
rename to third_party/nvfuser/csrc/scheduler/reduction_heuristic.h
index 5349b64aeaff..712bd006b3ec 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/reduction_heuristic.h
+++ b/third_party/nvfuser/csrc/scheduler/reduction_heuristic.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/scheduler/heuristic.h>
+#include <scheduler/heuristic.h>
 
 #include <sstream>
 
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.cpp b/third_party/nvfuser/csrc/scheduler/reduction_utils.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.cpp
rename to third_party/nvfuser/csrc/scheduler/reduction_utils.cpp
index ae9ecd88bbdc..45822222190b 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.cpp
+++ b/third_party/nvfuser/csrc/scheduler/reduction_utils.cpp
@@ -1,13 +1,13 @@
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
-
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/inlining.h>
-#include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/maxinfo_propagator.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <scheduler/reduction_utils.h>
+
+#include <expr_evaluator.h>
+#include <inlining.h>
+#include <ir_cloner.h>
+#include <ir_utils.h>
+#include <maxinfo_propagator.h>
+#include <scheduler/registry.h>
+#include <scheduler/utils.h>
+#include <transform_replay.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h b/third_party/nvfuser/csrc/scheduler/reduction_utils.h
similarity index 91%
rename from torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h
rename to third_party/nvfuser/csrc/scheduler/reduction_utils.h
index cd091cde21a0..0427aa9cedd7 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h
+++ b/third_party/nvfuser/csrc/scheduler/reduction_utils.h
@@ -1,8 +1,8 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_heuristic.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <scheduler/reduction_heuristic.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/third_party/nvfuser/csrc/scheduler/registry.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
rename to third_party/nvfuser/csrc/scheduler/registry.cpp
index 5d5bc84ef3b4..67a03525f258 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
+++ b/third_party/nvfuser/csrc/scheduler/registry.cpp
@@ -1,16 +1,16 @@
 #include <c10/util/irange.h>
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/debug_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/pointwise.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/transpose.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
+#include <disjoint_set.h>
+#include <executor_utils.h>
+#include <expr_evaluator.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <root_domain_map.h>
+#include <scheduler/debug_utils.h>
+#include <scheduler/pointwise.h>
+#include <scheduler/registry.h>
+#include <scheduler/transpose.h>
+#include <scheduler/utils.h>
 
 #include <limits>
 
@@ -646,8 +646,8 @@ size_t SchedulerRuntimeInfo::getMaxVectorizableWidth(TensorView* tv) {
   //  innermost dimension size for the word size of vectorizaiton
   size_t vector_size = 1;
   size_t next_vector_size = 2;
-  while (next_vector_size <= max_vector_size && next_vector_size <= numel &&
-         numel % next_vector_size == 0) {
+  while (next_vector_size <= max_vector_size &&
+         next_vector_size <= (size_t)numel && numel % next_vector_size == 0) {
     vector_size = next_vector_size;
     next_vector_size *= 2;
   }
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.h b/third_party/nvfuser/csrc/scheduler/registry.h
similarity index 93%
rename from torch/csrc/jit/codegen/cuda/scheduler/registry.h
rename to third_party/nvfuser/csrc/scheduler/registry.h
index 8b3409447634..85a4dcb54946 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/registry.h
+++ b/third_party/nvfuser/csrc/scheduler/registry.h
@@ -1,13 +1,13 @@
 #pragma once
-#include <torch/csrc/jit/codegen/cuda/executor_kernel_arg.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/heuristic.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/pointwise_heuristic.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_heuristic.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <executor_kernel_arg.h>
+#include <fusion.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/compile_time_info.h>
+#include <scheduler/heuristic.h>
+#include <scheduler/pointwise_heuristic.h>
+#include <scheduler/reduction_heuristic.h>
+#include <scheduler/utils.h>
+#include <utils.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/third_party/nvfuser/csrc/scheduler/transpose.cpp
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
rename to third_party/nvfuser/csrc/scheduler/transpose.cpp
index b7e85cbc1c5e..90e2d82e766e 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/third_party/nvfuser/csrc/scheduler/transpose.cpp
@@ -1,17 +1,17 @@
-#include <torch/csrc/jit/codegen/cuda/scheduler/transpose.h>
-
-#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
-#include <torch/csrc/jit/codegen/cuda/inlining.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <scheduler/transpose.h>
+
+#include <executor_utils.h>
+#include <inlining.h>
+#include <instrumentation.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <lower_utils.h>
+#include <scheduler/pointwise_utils.h>
+#include <scheduler/registry.h>
+#include <scheduler/utils.h>
+#include <scheduler/vectorize_helper.h>
+#include <transform_replay.h>
+#include <utils.h>
 
 #include <ATen/cuda/CUDAContext.h>
 
@@ -251,13 +251,15 @@ void maybeBuildVirtualInnerDims(
   // merge inner_most1 and inner_most2 left until we are done or we can no
   // longer do so
   int64_t dim = inner_most1 - 1;
-  while (dim >= 0 && dim != inner_most2 && merged_size1 < params.tile_size1) {
+  while (dim >= 0 && dim != inner_most2 &&
+         merged_size1 < (int64_t)params.tile_size1) {
     params.dims_merged_with_1.push_back(dim);
     merged_size1 *= shape_in_ref1[dim];
     dim--;
   }
   dim = inner_most2 - 1;
-  while (dim >= 0 && dim != inner_most1 && merged_size2 < params.tile_size2) {
+  while (dim >= 0 && dim != inner_most1 &&
+         merged_size2 < (int64_t)params.tile_size2) {
     params.dims_merged_with_2.push_back(dim);
     merged_size2 *= shape_in_ref1[dim];
     dim--;
@@ -275,7 +277,7 @@ void maybeBuildVirtualInnerDims(
     unavailable_dims.insert(i);
   }
   dim = shape_in_ref1.size() - 1;
-  while (dim >= 0 && merged_size1 < params.tile_size1) {
+  while (dim >= 0 && merged_size1 < (int64_t)params.tile_size1) {
     if (unavailable_dims.count(dim) == 0) {
       params.dims_merged_with_1.push_back(dim);
       merged_size1 *= shape_in_ref1[dim];
@@ -284,7 +286,7 @@ void maybeBuildVirtualInnerDims(
     dim--;
   }
   dim = shape_in_ref1.size() - 1;
-  while (dim >= 0 && merged_size2 < params.tile_size2) {
+  while (dim >= 0 && merged_size2 < (int64_t)params.tile_size2) {
     if (unavailable_dims.count(dim) == 0) {
       params.dims_merged_with_2.push_back(dim);
       merged_size2 *= shape_in_ref1[dim];
@@ -294,8 +296,8 @@ void maybeBuildVirtualInnerDims(
   }
   // If both are satisfied, then we are done. If neither are satisfied, then it
   // is impossible to satisfy both of them, also done.
-  if ((merged_size1 < params.tile_size1) ==
-      (merged_size2 < params.tile_size2)) {
+  if ((merged_size1 < (int64_t)params.tile_size1) ==
+      (merged_size2 < (int64_t)params.tile_size2)) {
     return; // no need to split
   }
   // If one of them are not satisfied, there might be two cases:
@@ -309,7 +311,7 @@ void maybeBuildVirtualInnerDims(
   int64_t large_dim;
   int64_t split_factor;
   bool split_inner_most;
-  if (merged_size1 < params.tile_size1) {
+  if (merged_size1 < (int64_t)params.tile_size1) {
     if (params.dims_merged_with_2.empty()) {
 #if SUPPORT_SPLITTING_INNERMOST_DIM
       // https://github.com/csarofeen/pytorch/issues/1964
@@ -351,17 +353,17 @@ void maybeBuildVirtualInnerDims(
   params.split_before_tiling.push_back({large_dim, split_factor});
   // adjust all dims to after-split
   for (auto& i : params.dims_merged_with_1) {
-    if (i > large_dim) {
+    if ((int64_t)i > large_dim) {
       i++;
     }
   }
   for (auto& i : params.dims_merged_with_2) {
-    if (i > large_dim) {
+    if ((int64_t)i > large_dim) {
       i++;
     }
   }
   // Give the split-out dim to the unsatisfied one, so that both are satisfied.
-  if (merged_size1 < params.tile_size1) {
+  if (merged_size1 < (int64_t)params.tile_size1) {
     if (!split_inner_most) {
       params.dims_merged_with_2.pop_back();
       params.dims_merged_with_2.push_back(large_dim + 1);
@@ -508,7 +510,7 @@ std::string getTransposeRuntimeRejectReason(
   const int64_t device_multiprocessor_count =
       (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
   auto elements_per_wave = device_multiprocessor_count * default_tile_elements;
-  if (elements_per_wave > n_elems) {
+  if ((int64_t)elements_per_wave > n_elems) {
     return "Transpose scheduler does not perform well on small problem sizes.";
   }
 
@@ -522,7 +524,7 @@ std::string getTransposeRuntimeRejectReason(
   //   transpose(T0[1000000000, 2, 2], 1, 2)
   // the pointwise scheduler should provide better performance, because it
   // provides coalesced memory access
-  if (inner_size1 * inner_size2 < default_tile_elements) {
+  if (inner_size1 * inner_size2 < (int64_t)default_tile_elements) {
     auto inner_elements = inner_size1 * inner_size2;
     for (int64_t i = inner_most_pos2_in_ref1 + 1; i < inner_most_pos1_in_ref1;
          i++) {
@@ -539,15 +541,15 @@ std::string getTransposeRuntimeRejectReason(
     //   T3[2, 10000000, 3] input/output
     //   T4[3, 10000000, 2] input/output
     //   T5[3, 10000000, 2] input/output
-    if (inner_elements < default_tile_elements) {
+    if (inner_elements < (int64_t)default_tile_elements) {
       return "Inner transpose of small dimensions should be scheduled by the "
              "pointwise scheduler because it provides better memory coalescing";
     }
   }
 
 #if !SUPPORT_SPLITTING_INNERMOST_DIM
-  if (n_elems / inner_size1 < TransposeParams::getDefaultTileSize() ||
-      n_elems / inner_size2 < TransposeParams::getDefaultTileSize()) {
+  if (n_elems / inner_size1 < (int64_t)TransposeParams::getDefaultTileSize() ||
+      n_elems / inner_size2 < (int64_t)TransposeParams::getDefaultTileSize()) {
     return "Splitting of inner most dim for the creation of virtual inner most dim "
            "is disabled due to indexing bug, skipping this case at runtime for now"
            "See: https://github.com/csarofeen/pytorch/issues/1964";
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h b/third_party/nvfuser/csrc/scheduler/transpose.h
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/scheduler/transpose.h
rename to third_party/nvfuser/csrc/scheduler/transpose.h
index c1a4ab6efb6a..83b9c828b20c 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h
+++ b/third_party/nvfuser/csrc/scheduler/transpose.h
@@ -2,8 +2,8 @@
 
 #include <ATen/core/ivalue.h>
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h>
+#include <fusion.h>
+#include <scheduler/transpose_heuristic.h>
 
 #define SUPPORT_SPLITTING_INNERMOST_DIM 0
 
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h b/third_party/nvfuser/csrc/scheduler/transpose_heuristic.h
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h
rename to third_party/nvfuser/csrc/scheduler/transpose_heuristic.h
index 5e56278a7f16..0d9ece670e66 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h
+++ b/third_party/nvfuser/csrc/scheduler/transpose_heuristic.h
@@ -1,8 +1,8 @@
 #pragma once
 
 #include <c10/util/hash.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/heuristic.h>
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <scheduler/heuristic.h>
+#include <utils.h>
 
 #include <sstream>
 
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp b/third_party/nvfuser/csrc/scheduler/utils.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
rename to third_party/nvfuser/csrc/scheduler/utils.cpp
index 4ba6b241e455..28d2b7ff117e 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
+++ b/third_party/nvfuser/csrc/scheduler/utils.cpp
@@ -1,15 +1,15 @@
-#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h>
-
-#include <torch/csrc/jit/codegen/cuda/compute_at_map.h>
-#include <torch/csrc/jit/codegen/cuda/contiguity.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/mma_utils.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <scheduler/registry.h>
+#include <scheduler/utils.h>
+#include <scheduler/vectorize_helper.h>
+
+#include <compute_at_map.h>
+#include <contiguity.h>
+#include <expr_evaluator.h>
+#include <instrumentation.h>
+#include <ir_utils.h>
+#include <root_domain_map.h>
+#include <scheduler/mma_utils.h>
+#include <transform_replay.h>
 
 #include <algorithm>
 
@@ -152,7 +152,7 @@ void splitDims(
   for (auto entry : to_split) {
     size_t dim = entry.first;
     size_t size = entry.second;
-    if (dim != prev_dim) {
+    if ((int64_t)dim != prev_dim) {
       dim_offset += pending_dim_offset;
       pending_dim_offset = 0;
     }
@@ -258,7 +258,7 @@ void parallelizeAllLike(
     pos += reference_tv->nDims() + 1;
   }
   TORCH_CHECK(
-      pos >= 0 && pos <= reference_tv->nDims(),
+      pos >= 0 && pos <= (int64_t)reference_tv->nDims(),
       "parallelizeAllLike called on an position outside valid range.");
 
   std::unordered_map<IterDomain*, IterDomain*> concrete_to_reference_map;
@@ -1942,7 +1942,8 @@ void orderTiledConcreteIdAsRoot(TensorView* tv) {
 
   // Validate that we have processed all inner ids or broadcast/reduction
   //  ids we have registered.
-  TORCH_INTERNAL_ASSERT(current_pos == ndims, "Inconsistent ordering logic");
+  TORCH_INTERNAL_ASSERT(
+      current_pos == (int)ndims, "Inconsistent ordering logic");
 
   // Apply the new order:
   tv->reorder(reorder_map_old_to_new);
@@ -2302,19 +2303,19 @@ bool breakIsDisjoint(std::vector<int> group_ids, int pos) {
     pos += group_ids.size();
   }
   TORCH_INTERNAL_ASSERT(
-      pos >= 0 && pos <= group_ids.size(),
+      pos >= 0 && pos <= (int)group_ids.size(),
       "Invalid position, size of vec is ",
       group_ids.size(),
       " but position is ",
       pos);
 
-  if (pos == 0 || pos == group_ids.size()) {
+  if (pos == 0 || pos == (int)group_ids.size()) {
     return true;
   }
 
   std::unordered_set<int> left_ints(group_ids.begin(), group_ids.begin() + pos);
 
-  for (auto i = pos; i < group_ids.size(); i++) {
+  for (auto i = pos; i < (int)group_ids.size(); i++) {
     if (left_ints.count(group_ids[i]) > 0) {
       return false;
     }
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/utils.h b/third_party/nvfuser/csrc/scheduler/utils.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/scheduler/utils.h
rename to third_party/nvfuser/csrc/scheduler/utils.h
index 373a879f740d..3cba54d5ae46 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/utils.h
+++ b/third_party/nvfuser/csrc/scheduler/utils.h
@@ -1,10 +1,10 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/maxinfo_propagator.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_heuristic.h>
+#include <disjoint_set.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <maxinfo_propagator.h>
+#include <scheduler/reduction_heuristic.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.cpp b/third_party/nvfuser/csrc/scheduler/vectorize_helper.cpp
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.cpp
rename to third_party/nvfuser/csrc/scheduler/vectorize_helper.cpp
index 2c3c848c7f5c..7d72f240000c 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.cpp
+++ b/third_party/nvfuser/csrc/scheduler/vectorize_helper.cpp
@@ -1,11 +1,11 @@
-#include <torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h>
+#include <scheduler/vectorize_helper.h>
 
-#include <torch/csrc/jit/codegen/cuda/compute_at_map.h>
-#include <torch/csrc/jit/codegen/cuda/contiguity.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/lower_divisible_split.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
+#include <compute_at_map.h>
+#include <contiguity.h>
+#include <expr_evaluator.h>
+#include <iter_visitor.h>
+#include <lower_divisible_split.h>
+#include <scheduler/registry.h>
 
 #include <c10/util/irange.h>
 
@@ -239,7 +239,7 @@ size_t expandVectorizationToContigMergedDomains(
 
     int tv_num_merged_domains = 0;
     for (const auto i : c10::irange(max_num_merged_domains)) {
-      if (i == tv_root.size()) {
+      if (i == (int)tv_root.size()) {
         break;
       }
       auto ref_id = ref_root.at(ref_root.size() - 1 - i);
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h b/third_party/nvfuser/csrc/scheduler/vectorize_helper.h
similarity index 89%
rename from torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h
rename to third_party/nvfuser/csrc/scheduler/vectorize_helper.h
index a9b959b495d6..8b5f8b81dc7a 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h
+++ b/third_party/nvfuser/csrc/scheduler/vectorize_helper.h
@@ -1,8 +1,8 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <scheduler/registry.h>
 
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/tensor_view.cpp b/third_party/nvfuser/csrc/tensor_view.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/tensor_view.cpp
rename to third_party/nvfuser/csrc/tensor_view.cpp
index 85f320fef2e4..3b6ad4bbb40d 100644
--- a/torch/csrc/jit/codegen/cuda/tensor_view.cpp
+++ b/third_party/nvfuser/csrc/tensor_view.cpp
@@ -1,22 +1,22 @@
 #include <c10/util/irange.h>
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/compute_at.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/inlining.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_double_buffer.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/mma_utils.h>
+#include <arith.h>
+#include <compute_at.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <inlining.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <ir_cloner.h>
+#include <ir_interface_nodes.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <lower2device.h>
+#include <lower_double_buffer.h>
+#include <scheduler/mma_utils.h>
 
 // Cleanup
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <transform_iter.h>
+#include <transform_replay.h>
 
 namespace torch {
 namespace jit {
@@ -310,7 +310,7 @@ void TensorView::inlineAt(
   }
 
   TORCH_INTERNAL_ASSERT(
-      pos >= 0 && pos <= nDims(),
+      pos >= 0 && pos <= (int64_t)nDims(),
       "Invalid inline position for T",
       name(),
       ": ",
@@ -328,7 +328,7 @@ void TensorView::inlineAt(
   }
 
   TORCH_INTERNAL_ASSERT(
-      pos <= max_inline_pos,
+      pos <= (int64_t)max_inline_pos,
       "Invalid inline position for T",
       name(),
       ": ",
diff --git a/torch/csrc/jit/codegen/cuda/transform_iter.cpp b/third_party/nvfuser/csrc/transform_iter.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/transform_iter.cpp
rename to third_party/nvfuser/csrc/transform_iter.cpp
index ab683e79ce9a..32475f56ece9 100644
--- a/torch/csrc/jit/codegen/cuda/transform_iter.cpp
+++ b/third_party/nvfuser/csrc/transform_iter.cpp
@@ -1,7 +1,7 @@
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
+#include <transform_iter.h>
 
 #include <c10/util/irange.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
+#include <ir_utils.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/transform_iter.h b/third_party/nvfuser/csrc/transform_iter.h
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/transform_iter.h
rename to third_party/nvfuser/csrc/transform_iter.h
index 554c6fbfdf83..c68d7f5cc236 100644
--- a/torch/csrc/jit/codegen/cuda/transform_iter.h
+++ b/third_party/nvfuser/csrc/transform_iter.h
@@ -2,11 +2,11 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <disjoint_set.h>
+#include <ir_all_nodes.h>
+#include <ir_iostream.h>
+#include <iter_visitor.h>
+#include <root_domain_map.h>
 #include <unordered_map>
 #include <vector>
 
diff --git a/torch/csrc/jit/codegen/cuda/transform_replay.cpp b/third_party/nvfuser/csrc/transform_replay.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/transform_replay.cpp
rename to third_party/nvfuser/csrc/transform_replay.cpp
index e00a4b840eaa..3e351d897444 100644
--- a/torch/csrc/jit/codegen/cuda/transform_replay.cpp
+++ b/third_party/nvfuser/csrc/transform_replay.cpp
@@ -1,16 +1,16 @@
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
-
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/maxinfo_propagator.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
+#include <transform_replay.h>
+
+#include <arith.h>
+#include <disjoint_set.h>
+#include <fusion.h>
+#include <instrumentation.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <maxinfo_propagator.h>
+#include <root_domain_map.h>
+#include <transform_iter.h>
 
 #include <deque>
 
@@ -818,7 +818,7 @@ bool TransformReplay::fullSelfMatching(
       [](auto a, auto b) { return std::make_pair(a, b); });
   BestEffortReplay replay_(replay_dom, target_dom, target2replay_map);
   auto r = replay_.getReplay();
-  for (int64_t i = 0; i < replay_dom.size(); i++) {
+  for (int64_t i = 0; i < (int64_t)replay_dom.size(); i++) {
     auto target_id = target_dom[i];
     auto replay_it = r.find(target_id);
     if (replay_it == r.end() || replay_it->second != replay_dom[i]) {
@@ -943,7 +943,7 @@ TransformPropagator::TransformPropagator(TensorView* from, int64_t pos) {
     pos += int64_t(from->nDims()) + 1;
   }
   TORCH_CHECK(
-      pos >= 0 && pos <= from->nDims(),
+      pos >= 0 && pos <= (int64_t)from->nDims(),
       "TransformPropagator called on an pos outside valid range.");
   replayed_pos_[from] = pos;
 }
diff --git a/torch/csrc/jit/codegen/cuda/transform_replay.h b/third_party/nvfuser/csrc/transform_replay.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/transform_replay.h
rename to third_party/nvfuser/csrc/transform_replay.h
index 3dace83adab7..b476efb95f34 100644
--- a/torch/csrc/jit/codegen/cuda/transform_replay.h
+++ b/third_party/nvfuser/csrc/transform_replay.h
@@ -2,8 +2,8 @@
 
 #include <c10/macros/Export.h>
 #include <c10/util/Exception.h>
-#include <torch/csrc/jit/codegen/cuda/ir_internal_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/maxinfo_propagator.h>
+#include <ir_internal_nodes.h>
+#include <maxinfo_propagator.h>
 
 #include <algorithm>
 #include <unordered_map>
diff --git a/torch/csrc/jit/codegen/cuda/transform_rfactor.cpp b/third_party/nvfuser/csrc/transform_rfactor.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/transform_rfactor.cpp
rename to third_party/nvfuser/csrc/transform_rfactor.cpp
index 8d5151074563..8fc152c4f967 100644
--- a/torch/csrc/jit/codegen/cuda/transform_rfactor.cpp
+++ b/third_party/nvfuser/csrc/transform_rfactor.cpp
@@ -1,12 +1,12 @@
-#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
-
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
+#include <transform_rfactor.h>
+
+#include <arith.h>
+#include <fusion.h>
+#include <instrumentation.h>
+#include <ir_builder.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/transform_rfactor.h b/third_party/nvfuser/csrc/transform_rfactor.h
similarity index 88%
rename from torch/csrc/jit/codegen/cuda/transform_rfactor.h
rename to third_party/nvfuser/csrc/transform_rfactor.h
index b03fc53b6d48..c910740c9c98 100644
--- a/torch/csrc/jit/codegen/cuda/transform_rfactor.h
+++ b/third_party/nvfuser/csrc/transform_rfactor.h
@@ -2,8 +2,8 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
+#include <ir_all_nodes.h>
+#include <transform_iter.h>
 
 #include <algorithm>
 #include <vector>
diff --git a/torch/csrc/jit/codegen/cuda/transform_view.cpp b/third_party/nvfuser/csrc/transform_view.cpp
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/transform_view.cpp
rename to third_party/nvfuser/csrc/transform_view.cpp
index 3c209f6b4dd7..d26873100e07 100644
--- a/torch/csrc/jit/codegen/cuda/transform_view.cpp
+++ b/third_party/nvfuser/csrc/transform_view.cpp
@@ -1,13 +1,13 @@
-#include <torch/csrc/jit/codegen/cuda/transform_view.h>
+#include <transform_view.h>
 
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_internal_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
+#include <arith.h>
+#include <fusion.h>
+#include <instrumentation.h>
+#include <ir_builder.h>
+#include <ir_internal_nodes.h>
+#include <ir_iostream.h>
+#include <iter_visitor.h>
+#include <transform_iter.h>
 
 namespace torch {
 namespace jit {
@@ -170,7 +170,7 @@ class MergeTransform final : public ViewTransform {
       std::vector<IterDomain*>& root_domain,
       std::vector<IterDomain*>& current_transformed_domain) override {
     TORCH_INTERNAL_ASSERT(
-        (index_ + 1) < current_transformed_domain.size(),
+        (index_ + 1) < (int64_t)current_transformed_domain.size(),
         "Tried to apply: ",
         toString(),
         "\t To domain: \t",
@@ -232,7 +232,7 @@ class SplitTransform final : public ViewTransform {
       std::vector<IterDomain*>& root_domain,
       std::vector<IterDomain*>& current_transformed_domain) override {
     TORCH_INTERNAL_ASSERT(
-        index_ < current_transformed_domain.size(),
+        index_ < (int64_t)current_transformed_domain.size(),
         "Index: \t",
         index_,
         "\t Domain Size:\t",
@@ -458,7 +458,7 @@ class AnalyzeViewTransformation {
     if (root_domain_not_provided_) {
       return original_view_[original_view_index] == 1;
     } else {
-      TORCH_INTERNAL_ASSERT(original_view_index < root_domain_.size());
+      TORCH_INTERNAL_ASSERT(original_view_index < (int64_t)root_domain_.size());
       return root_domain_[original_view_index]->isImplicitBroadcast() &&
           !root_domain_[original_view_index]->hasExpandedExtent();
     }
@@ -493,8 +493,8 @@ class AnalyzeViewTransformation {
 
     // Iterate until original view is completely consumed and new view is
     // completely generated.
-    while (original_view_index < original_view_.size() ||
-           new_view_index < new_view_.size()) {
+    while (original_view_index < (int64_t)original_view_.size() ||
+           new_view_index < (int64_t)new_view_.size()) {
       TORCH_INTERNAL_ASSERT(
           !(prev_new_view_index == new_view_index &&
             prev_original_view_index == original_view_index),
@@ -503,15 +503,15 @@ class AnalyzeViewTransformation {
       prev_new_view_index = new_view_index;
       prev_original_view_index = original_view_index;
 
-      if (new_view_index >= new_view_.size()) {
+      if (new_view_index >= (int64_t)new_view_.size()) {
         TORCH_INTERNAL_ASSERT(
             current_size == 1,
             "View is complete, but there's still some elements to distribute.");
       }
 
-      if ((new_view_index + 1 >= new_view_.size() ||
+      if ((new_view_index + 1 >= (int64_t)new_view_.size() ||
            (new_view_[new_view_index + 1] != 1)) &&
-          original_view_index + 1 < original_view_.size() &&
+          original_view_index + 1 < (int64_t)original_view_.size() &&
           original_view_[original_view_index + 1] == 1 &&
           !isImplicitBroadcast(original_view_index + 1)) {
         // Next index in original_view is runtime size 1 and next new view is
@@ -524,7 +524,7 @@ class AnalyzeViewTransformation {
         continue;
       }
 
-      if (new_view_index < new_view_.size() &&
+      if (new_view_index < (int64_t)new_view_.size() &&
           // Still new dimensions to resolve and current size does resolve it.
           current_size == new_view_[new_view_index]) {
         // Keep this dimension, it's good to go, we hit a boundary where there's
@@ -536,7 +536,7 @@ class AnalyzeViewTransformation {
         ++original_view_index;
 
         // Update current_size with the next size in original view
-        if (original_view_index < original_view_.size()) {
+        if (original_view_index < (int64_t)original_view_.size()) {
           current_size = original_view_[original_view_index];
         } else {
           current_size = 0;
@@ -548,7 +548,8 @@ class AnalyzeViewTransformation {
       // view. Insert broadcast and increment new_view. Size 1 dimensions in
       // new_view that don't match up with runtime size 1's in original view are
       // assumed to be broadcast (not a split from a runtime domain).
-      if (new_view_index < new_view_.size() && new_view_[new_view_index] == 1) {
+      if (new_view_index < (int64_t)new_view_.size() &&
+          new_view_[new_view_index] == 1) {
         broadcast_transforms_.push_back(
             std::make_shared<BroadcastTransform>(new_view_index));
         ++new_view_index;
@@ -571,7 +572,7 @@ class AnalyzeViewTransformation {
         ++original_view_index;
 
         // Update original position and current size.
-        if (original_view_index < original_view_.size()) {
+        if (original_view_index < (int64_t)original_view_.size()) {
           current_size = original_view_[original_view_index];
         } else {
           current_size = 0;
@@ -580,7 +581,7 @@ class AnalyzeViewTransformation {
         continue;
       }
 
-      if (original_view_index + 1 < original_view_.size() &&
+      if (original_view_index + 1 < (int64_t)original_view_.size() &&
           isImplicitBroadcast(original_view_index + 1)) {
         // Original view has a compile time size 1 dimension, and it's
         // interfering with necessary transformations. Do a trivial reduction.
@@ -594,10 +595,10 @@ class AnalyzeViewTransformation {
       // We're only left with performing transformations to match a new_view
       // dimension, there must be an activew new_view.
       TORCH_INTERNAL_ASSERT(
-          new_view_index < new_view_.size(),
+          new_view_index < (int64_t)new_view_.size(),
           "Expecting to still have new dimensions to work on in view, but none left.");
 
-      if (new_view_index < new_view_.size() &&
+      if (new_view_index < (int64_t)new_view_.size() &&
           current_size % new_view_[new_view_index] == 0) {
         // Insert split to generate the next new_view domain.
         view_transforms_.push_back(std::make_shared<SplitTransform>(
@@ -614,7 +615,7 @@ class AnalyzeViewTransformation {
       // Need more of the original_view dimension to resolve the new_view
       // dimension, merge the next dimension in.
       TORCH_INTERNAL_ASSERT(
-          original_view_index + 1 < original_view_.size(),
+          original_view_index + 1 < (int64_t)original_view_.size(),
           "Expecting to still have original dimensions to work on in view, but none left.");
 
       view_transforms_.push_back(
@@ -702,7 +703,7 @@ std::pair<std::vector<int64_t>, std::vector<int64_t>> inferViewShapes(
   // TODO: refactor
   int64_t dynamic_index = -1;
   int64_t new_size_num_elements = 1;
-  for (int64_t idx = 0; idx < new_sizes.size(); ++idx) {
+  for (int64_t idx = 0; idx < (int64_t)new_sizes.size(); ++idx) {
     if (new_sizes[idx] == -1) {
       TORCH_INTERNAL_ASSERT(
           dynamic_index == -1, "Only one dimension can by inferred.")
diff --git a/torch/csrc/jit/codegen/cuda/transform_view.h b/third_party/nvfuser/csrc/transform_view.h
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/transform_view.h
rename to third_party/nvfuser/csrc/transform_view.h
index c3eb0ac34bea..b280141e45ef 100644
--- a/torch/csrc/jit/codegen/cuda/transform_view.h
+++ b/third_party/nvfuser/csrc/transform_view.h
@@ -2,7 +2,7 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <ir_all_nodes.h>
 
 #include <memory>
 #include <vector>
@@ -39,7 +39,7 @@ struct AnalyzeViewResult {
   std::vector<std::shared_ptr<ViewTransform>> transforms;
 };
 
-struct TORCH_API AnalyzeViewConstraint {
+struct TORCH_CUDA_CU_API AnalyzeViewConstraint {
   // 1 if size 1 dimension, otherwise 0;
   std::vector<int64_t> original_constraint;
   std::vector<int64_t> new_constraint;
diff --git a/torch/csrc/jit/codegen/cuda/type.cpp b/third_party/nvfuser/csrc/type.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/type.cpp
rename to third_party/nvfuser/csrc/type.cpp
index 3b8f380683ed..8bd1fd2f4293 100644
--- a/torch/csrc/jit/codegen/cuda/type.cpp
+++ b/third_party/nvfuser/csrc/type.cpp
@@ -1,4 +1,4 @@
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <type.h>
 
 #include <ATen/cuda/CUDAContext.h>
 
@@ -1054,15 +1054,11 @@ std::ostream& operator<<(
   return out << load_store_type2string(load_store_type);
 }
 
-TORCH_CUDA_CU_API std::ostream& operator<<(
-    std::ostream& out,
-    const IterType bt) {
+std::ostream& operator<<(std::ostream& out, const IterType bt) {
   return out << iter_type2string(bt);
 }
 
-TORCH_CUDA_CU_API std::ostream& operator<<(
-    std::ostream& os,
-    const Swizzle2DType& swizzle) {
+std::ostream& operator<<(std::ostream& os, const Swizzle2DType& swizzle) {
   switch (swizzle) {
     case Swizzle2DType::NoSwizzle:
       os << "NoSwizzle";
@@ -1086,9 +1082,7 @@ TORCH_CUDA_CU_API std::ostream& operator<<(
   return os;
 }
 
-TORCH_CUDA_CU_API std::ostream& operator<<(
-    std::ostream& os,
-    const SwizzleMode& swizzle) {
+std::ostream& operator<<(std::ostream& os, const SwizzleMode& swizzle) {
   switch (swizzle) {
     case SwizzleMode::NoSwizzle:
       os << "NoSwizzle";
@@ -1106,8 +1100,7 @@ TORCH_CUDA_CU_API std::ostream& operator<<(
   return os;
 }
 
-TORCH_CUDA_CU_API c10::optional<std::string> inline_op_str(
-    const UnaryOpType uotype) {
+c10::optional<std::string> inline_op_str(const UnaryOpType uotype) {
   const char* str = unary_op_type_inline_op2string(uotype);
   return str != nullptr ? c10::optional<std::string>(std::string(str))
                         : c10::nullopt;
@@ -1236,7 +1229,7 @@ size_t dataTypeSize(DataType type, DataType index_type) {
   return dataTypeSize(type);
 }
 
-TORCH_CUDA_CU_API std::ostream& operator<<(
+std::ostream& operator<<(
     std::ostream& os,
     const DoubleBufferLoopStage loop_stage) {
   switch (loop_stage) {
diff --git a/torch/csrc/jit/codegen/cuda/type.h b/third_party/nvfuser/csrc/type.h
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/type.h
rename to third_party/nvfuser/csrc/type.h
diff --git a/torch/csrc/jit/codegen/cuda/type_inference.cpp b/third_party/nvfuser/csrc/type_inference.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/type_inference.cpp
rename to third_party/nvfuser/csrc/type_inference.cpp
index 7422cf20d7c2..a3a94522bd54 100644
--- a/torch/csrc/jit/codegen/cuda/type_inference.cpp
+++ b/third_party/nvfuser/csrc/type_inference.cpp
@@ -1,8 +1,8 @@
-#include <torch/csrc/jit/codegen/cuda/type_inference.h>
+#include <type_inference.h>
 
 #include <ATen/AccumulateType.h>
 #include <c10/core/ScalarType.h>
-#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
+#include <instrumentation.h>
 #include <torch/csrc/jit/ir/constants.h>
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/runtime/operator.h>
@@ -10,7 +10,7 @@
 #include <ATen/ExpandUtils.h>
 #include <ATen/core/jit_type.h>
 #include <ATen/native/TypeProperties.h>
-#include <torch/csrc/jit/codegen/cuda/type_promotion.h>
+#include <type_promotion.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/type_inference.h b/third_party/nvfuser/csrc/type_inference.h
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/type_inference.h
rename to third_party/nvfuser/csrc/type_inference.h
diff --git a/torch/csrc/jit/codegen/cuda/type_promotion.cpp b/third_party/nvfuser/csrc/type_promotion.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/type_promotion.cpp
rename to third_party/nvfuser/csrc/type_promotion.cpp
index bfc3f7451a38..3462e2fd3aae 100644
--- a/torch/csrc/jit/codegen/cuda/type_promotion.cpp
+++ b/third_party/nvfuser/csrc/type_promotion.cpp
@@ -1,7 +1,7 @@
-#include <torch/csrc/jit/codegen/cuda/type_promotion.h>
+#include <type_promotion.h>
 
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
+#include <arith.h>
+#include <ir_interface_nodes.h>
 
 #include <ATen/native/TypeProperties.h>
 #include <c10/core/ScalarType.h>
diff --git a/torch/csrc/jit/codegen/cuda/type_promotion.h b/third_party/nvfuser/csrc/type_promotion.h
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/type_promotion.h
rename to third_party/nvfuser/csrc/type_promotion.h
index 37f403cbaaeb..fb9f241a7f66 100644
--- a/torch/csrc/jit/codegen/cuda/type_promotion.h
+++ b/third_party/nvfuser/csrc/type_promotion.h
@@ -3,7 +3,7 @@
 #include <ATen/Context.h>
 #include <ATen/native/TypeProperties.h>
 #include <c10/core/ScalarType.h>
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
+#include <ir_interface_nodes.h>
 #include <torch/csrc/jit/ir/ir.h>
 
 namespace torch {
diff --git a/torch/csrc/jit/codegen/cuda/utils.cpp b/third_party/nvfuser/csrc/utils.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/utils.cpp
rename to third_party/nvfuser/csrc/utils.cpp
index 33395692fb39..9153b64d1f7e 100644
--- a/torch/csrc/jit/codegen/cuda/utils.cpp
+++ b/third_party/nvfuser/csrc/utils.cpp
@@ -1,5 +1,5 @@
 
-#include <torch/csrc/jit/codegen/cuda/utils.h>
+#include <utils.h>
 
 #include <c10/util/string_view.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/third_party/nvfuser/csrc/utils.h
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/utils.h
rename to third_party/nvfuser/csrc/utils.h
index 61f7fee7cd4c..01d08735b48d 100644
--- a/torch/csrc/jit/codegen/cuda/utils.h
+++ b/third_party/nvfuser/csrc/utils.h
@@ -2,7 +2,7 @@
 
 #include <ATen/ATen.h>
 #include <c10/util/Exception.h>
-#include <torch/csrc/jit/codegen/cuda/type.h>
+#include <type.h>
 #include <torch/csrc/jit/ir/ir.h>
 
 namespace torch {
diff --git a/torch/csrc/jit/codegen/cuda/vectorization_info.h b/third_party/nvfuser/csrc/vectorization_info.h
similarity index 93%
rename from torch/csrc/jit/codegen/cuda/vectorization_info.h
rename to third_party/nvfuser/csrc/vectorization_info.h
index 14b5662ab3c5..8699a756fd92 100644
--- a/torch/csrc/jit/codegen/cuda/vectorization_info.h
+++ b/third_party/nvfuser/csrc/vectorization_info.h
@@ -2,7 +2,7 @@
 
 #include <c10/macros/Export.h>
 
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <ir_all_nodes.h>
 
 namespace torch {
 namespace jit {
diff --git a/torch/csrc/jit/codegen/cuda/examples/sinh_extension/README.md b/third_party/nvfuser/examples/sinh_extension/README.md
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/examples/sinh_extension/README.md
rename to third_party/nvfuser/examples/sinh_extension/README.md
diff --git a/torch/csrc/jit/codegen/cuda/examples/sinh_extension/main.cpp b/third_party/nvfuser/examples/sinh_extension/main.cpp
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/examples/sinh_extension/main.cpp
rename to third_party/nvfuser/examples/sinh_extension/main.cpp
diff --git a/torch/csrc/jit/codegen/cuda/examples/sinh_extension/setup.py b/third_party/nvfuser/examples/sinh_extension/setup.py
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/examples/sinh_extension/setup.py
rename to third_party/nvfuser/examples/sinh_extension/setup.py
diff --git a/torch/csrc/jit/codegen/cuda/examples/sinh_extension/test.py b/third_party/nvfuser/examples/sinh_extension/test.py
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/examples/sinh_extension/test.py
rename to third_party/nvfuser/examples/sinh_extension/test.py
diff --git a/torch/csrc/jit/codegen/cuda/examples/sinh_libtorch/CMakeLists.txt b/third_party/nvfuser/examples/sinh_libtorch/CMakeLists.txt
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/examples/sinh_libtorch/CMakeLists.txt
rename to third_party/nvfuser/examples/sinh_libtorch/CMakeLists.txt
diff --git a/torch/csrc/jit/codegen/cuda/examples/sinh_libtorch/README.md b/third_party/nvfuser/examples/sinh_libtorch/README.md
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/examples/sinh_libtorch/README.md
rename to third_party/nvfuser/examples/sinh_libtorch/README.md
diff --git a/torch/csrc/jit/codegen/cuda/examples/sinh_libtorch/main.cpp b/third_party/nvfuser/examples/sinh_libtorch/main.cpp
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/examples/sinh_libtorch/main.cpp
rename to third_party/nvfuser/examples/sinh_libtorch/main.cpp
diff --git a/third_party/nvfuser/python/__init__.py b/third_party/nvfuser/python/__init__.py
new file mode 100644
index 000000000000..945903c11006
--- /dev/null
+++ b/third_party/nvfuser/python/__init__.py
@@ -0,0 +1 @@
+from . import _C
diff --git a/third_party/nvfuser/python_tests/__init__.py b/third_party/nvfuser/python_tests/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/third_party/nvfuser/python_tests/test_dynamo.py b/third_party/nvfuser/python_tests/test_dynamo.py
new file mode 100644
index 000000000000..57918486d6f2
--- /dev/null
+++ b/third_party/nvfuser/python_tests/test_dynamo.py
@@ -0,0 +1,148 @@
+# Owner(s): ["module: nvfuser"]
+
+import unittest
+import warnings
+from functools import partial
+
+import torch
+import torch._dynamo as torchdynamo
+from torch.testing import make_tensor
+from torch.testing._internal.common_utils import (
+    IS_WINDOWS,
+    run_tests,
+    skipIfTorchDynamo,
+    TEST_WITH_ROCM,
+    TestCase,
+)
+from torch.testing._internal.jit_utils import RUN_CUDA
+
+RUN_NVFUSER = RUN_CUDA and not TEST_WITH_ROCM
+
+
+def is_pre_volta():
+    if not RUN_NVFUSER:
+        return False
+    prop = torch.cuda.get_device_properties(torch.cuda.current_device())
+    return prop.major < 7
+
+
+def is_networkx_available():
+    try:
+        import networkx  # noqa: F401
+
+        return True
+    except ImportError:
+        return False
+
+
+@skipIfTorchDynamo("Not a suitable test for TorchDynamo")
+@unittest.skipIf(IS_WINDOWS, "TorchDynamo is not supported on Windows")
+@unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+@unittest.skipIf(is_pre_volta(), "Only supported on Volta and newer devices.")
+class TestNvFuserDynamo(TestCase):
+    def test_basic(self):
+        input1 = make_tensor((2, 4, 8), device="cuda", dtype=torch.float32)
+        input2 = make_tensor((2, 4, 8), device="cuda", dtype=torch.float32)
+
+        @torchdynamo.optimize("nvprims_nvfuser")
+        def func(a, b):
+            return a.sin() + b.cos()
+
+        # No warnings and no errors
+        with warnings.catch_warnings(record=True) as w:
+            nvfuser_result = func(input1, input2)
+            self.assertEqual(len(w), 0)
+        eager_result = func.__wrapped__(input1, input2)
+        self.assertEqual(eager_result, nvfuser_result)
+
+    @unittest.skipIf(not is_networkx_available(), "networkx not available")
+    def test_min_cut(self):
+        from functorch.compile import default_partition
+        from torch._dynamo.backends.nvfuser import nvprims_fw_bw_partition_fn
+
+        def get_fw_bw_graph(f, inps, partitioner):
+            from functorch.compile import aot_function
+
+            # Helper functions are taken from functorch/test_aotdispatch.py
+            def extract_graph(fx_g, _, graph_cell):
+                graph_cell[0] = fx_g
+                return fx_g
+
+            fw_graph_cell = [None]
+            bw_graph_cell = [None]
+            aot_function(
+                f,
+                fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell),
+                bw_compiler=partial(extract_graph, graph_cell=bw_graph_cell),
+                partition_fn=partitioner,
+            )(*inps).sum().backward()
+            return (fw_graph_cell[0], bw_graph_cell[0])
+
+        def get_ins_outs(fx_g):
+            ins = []
+            outs = []
+            for n in fx_g.graph.nodes:
+                if n.op == "placeholder":
+                    ins.append(n)
+                elif n.op == "output":
+                    outs = tuple(n.args[0])
+            return ins, outs
+
+        def get_num_ins_outs(fx_g):
+            return tuple(len(i) for i in get_ins_outs(fx_g))
+
+        def func(x):
+            return x * x * x
+
+        input1 = make_tensor(
+            (3,), device="cpu", dtype=torch.float32, requires_grad=True
+        )
+        fw_graph, bw_graph = get_fw_bw_graph(func, [input1], default_partition)
+        self.assertEqual(get_num_ins_outs(fw_graph), (1, 3))
+        self.assertEqual(get_num_ins_outs(bw_graph), (3, 1))
+
+        input1 = make_tensor(
+            (3,), device="cpu", dtype=torch.float32, requires_grad=True
+        )
+        fw_graph, bw_graph = get_fw_bw_graph(func, [input1], nvprims_fw_bw_partition_fn)
+        self.assertEqual(get_num_ins_outs(fw_graph), (1, 2))
+        self.assertEqual(get_num_ins_outs(bw_graph), (2, 1))
+
+    def test_batch_norm_implicit_dtype_promotion(self):
+        input1 = make_tensor((2, 3, 4, 5), device="cuda", dtype=torch.float32)
+        input2 = make_tensor((5, 5), device="cuda", dtype=torch.float32)
+        w = make_tensor((3), device="cuda", dtype=torch.float32)
+        b = make_tensor((3), device="cuda", dtype=torch.float32)
+
+        @torchdynamo.optimize("nvprims_nvfuser")
+        def func(mat1, mat2, w, b):
+            o = torch.matmul(mat1, mat2)
+            return torch.batch_norm(o, w, b, None, None, True, 1e-2, 1e-5, True)
+
+        # No warnings and no errors
+        with torch.cuda.amp.autocast():
+            with warnings.catch_warnings(record=True) as warning:
+                nvfuser_result = func(input1, input2, w, b)
+                self.assertEqual(len(warning), 0)
+            eager_result = func.__wrapped__(input1, input2, w, b)
+            self.assertEqual(eager_result, nvfuser_result)
+
+    def test_dtype_correctness(self):
+        input1 = make_tensor((2, 4, 8), device="cuda", dtype=torch.float16)
+
+        @torchdynamo.optimize("nvprims_nvfuser")
+        def func(a):
+            tmp = a + 1.0
+            # nvfuser would promote output to fp32 in math, FusionDefinition should cast output dtype back
+            return torch.where(tmp > 0, tmp, 0.0)
+
+        # No warnings and no errors
+        with warnings.catch_warnings(record=True) as w:
+            nvfuser_result = func(input1)
+            self.assertEqual(len(w), 0)
+        eager_result = func.__wrapped__(input1)
+        self.assertEqual(eager_result, nvfuser_result)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/third_party/nvfuser/python_tests/test_python_frontend.py b/third_party/nvfuser/python_tests/test_python_frontend.py
new file mode 100644
index 000000000000..cb367c4e4b09
--- /dev/null
+++ b/third_party/nvfuser/python_tests/test_python_frontend.py
@@ -0,0 +1,368 @@
+# Owner(s): ["module: nvfuser"]
+
+import unittest
+from typing import List
+
+import torch
+from torch.testing._internal.common_utils import run_tests, TEST_WITH_ROCM, TestCase
+from torch.testing._internal.jit_utils import RUN_CUDA
+import torch._refs as refs
+import torch._prims as prims
+
+# Will only create the nvfuser module if CUDA is available
+try:
+    from nvfuser._C import Fusion, FusionCache, FusionDefinition, DataType
+except ImportError:
+    pass
+
+RUN_NVFUSER = RUN_CUDA and not TEST_WITH_ROCM
+
+def is_pre_volta():
+    if not RUN_NVFUSER:
+        return False
+    prop = torch.cuda.get_device_properties(torch.cuda.current_device())
+    return prop.major < 7
+
+@unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+@unittest.skipIf(is_pre_volta(), "Only supported on Volta and newer devices.")
+class TestNvFuserFrontend(TestCase):
+    def test_basic(self) :
+        input1 = torch.ones(2, 4, 8, device='cuda')
+        input2 = torch.ones(2, 4, 8, device='cuda')
+        fc = FusionCache.get()
+        before_fusions = fc.num_fusions()
+
+        fs1 = Fusion()
+        with FusionDefinition(fs1) as fd :
+            t0 = fd.define_tensor(3)
+            t1 = fd.define_tensor(3)
+            c0 = fd.define_constant(3.0)
+
+            t2 = fd.ops.add(t0, t1)
+            t3 = fd.ops.mul(t2, c0)
+            t4 = fd.ops.sum(t3, [-1], False, DataType.Float)
+
+            fd.add_output(t4)
+
+        # Expected Output is a tensor of 48's
+        nvf_out1 = fs1.execute([input1, input2])[0]
+
+        # Create a new fusion with the same definition, it should hit the cache!
+        fs2 = Fusion()
+        with FusionDefinition(fs2) as fd :
+            t0 = fd.define_tensor(3)
+            t1 = fd.define_tensor(3)
+            c0 = fd.define_constant(3.0)
+
+            t2 = fd.ops.add(t0, t1)
+            t3 = fd.ops.mul(t2, c0)
+            t4 = fd.ops.sum(t3, [-1], False, DataType.Float)
+
+            fd.add_output(t4)
+
+        nvf_out2 = fs2.execute([input1, input2])[0]
+
+        # Check there is still only 1 cache entry
+        fc = FusionCache.get()
+        self.assertEqual(fc.num_fusions() - before_fusions, 1)
+
+        # Create a fusion from a fusion id and make sure it executes!
+        fs3 = Fusion(fs2.id())
+        nvf_out3 = fs3.execute([input1, input2])[0]
+
+        eager_out = torch.sum((input1 + input2) * 3.0, dim=-1)
+        self.assertEqual(eager_out, nvf_out1)
+        self.assertEqual(eager_out, nvf_out2)
+        self.assertEqual(eager_out, nvf_out3)
+
+    def test_basic_fp16(self) :
+        fs = Fusion()
+        with FusionDefinition(fs) as fd :
+            t0 = fd.define_tensor(3, DataType.Half)
+            t1 = fd.define_tensor(3, DataType.Half)
+            c0 = fd.define_constant(3.0)
+
+            t2 = fd.ops.add(t0, t1)
+            t3 = fd.ops.mul(t2, c0)
+            t4 = fd.ops.sum(t3, [-1], False, DataType.Float)
+
+            t5 = fd.ops.cast(t4, DataType.Half)
+            fd.add_output(t5)
+
+        input1 = torch.ones(2, 4, 8, device='cuda', dtype=torch.float16)
+        input2 = torch.ones(2, 4, 8, device='cuda', dtype=torch.float16)
+
+        # Expected Output is a tensor of 48's
+        nvf_out = fs.execute([input1, input2])[0]
+        eager_out = torch.sum((input1 + input2) * 3.0, dim=-1)
+        self.assertEqual(eager_out, nvf_out)
+
+    def test_cast_double_to_half(self) :
+        fs = Fusion()
+        with FusionDefinition(fs) as fd :
+            t0 = fd.define_tensor(2, DataType.Double)
+            t1 = fd.define_tensor(2, DataType.Double)
+
+            t0h = fd.ops.cast(t0, DataType.Half)
+            t1h = fd.ops.cast(t1, DataType.Half)
+            t2 = fd.ops.add(t0h, t1h)
+            t3 = fd.ops.relu(t2)
+            t4 = fd.ops.cast(t3, DataType.Half)
+
+            fd.add_output(t4)
+
+        input1 = torch.randn(2, 4, device='cuda', dtype=torch.float64)
+        input2 = torch.randn(2, 4, device='cuda', dtype=torch.float64)
+
+        nvf_out = fs.execute([input1, input2])[0]
+        eager_out = torch.relu(input1.to(torch.half) + input2.to(torch.half))
+        self.assertEqual(eager_out, nvf_out)
+
+    def test_promote_to_double(self) :
+        fs = Fusion()
+
+        with FusionDefinition(fs) as fd :
+            t0 = fd.define_tensor(2, DataType.Half)
+            t1 = fd.define_tensor(2, DataType.Double)
+
+            t2 = fd.ops.add(t0, t1)
+            t5 = fd.ops.relu(t2)
+
+            fd.add_output(t5)
+
+        input1 = torch.randn(2, 4, device='cuda', dtype=torch.float16)
+        input2 = torch.randn(2, 4, device='cuda', dtype=torch.float64)
+
+        nvf_out = fs.execute([input1, input2])[0]
+        eager_out = torch.relu(input1 + input2)
+        self.assertEqual(eager_out, nvf_out)
+
+    def test_implicit_broadcast_input(self) :
+        fs = Fusion()
+        with FusionDefinition(fs) as fd :
+            t0 = fd.define_tensor(1)
+            t1 = fd.define_tensor(3)
+
+            t0_b = fd.ops.broadcast_in_dim(t0, [2, 3, 4], [1])
+            t2 = fd.ops.add(t0_b, t1)
+
+            fd.add_output(t2)
+
+        input1 = torch.randn(3, device='cuda')
+        input2 = torch.randn(2, 3, 4, device='cuda')
+
+        nvf_out = fs.execute([input1, input2])[0]
+        eager_out = refs.add(prims.broadcast_in_dim(input1, [2, 3, 4], [1]), input2)
+        self.assertEqual(eager_out, nvf_out)
+
+    def test_explicit_broadcast_input(self) :
+        input1 = torch.randn(1, 1, 4, device='cuda')
+        input2 = torch.randn(2, 3, 4, device='cuda')
+
+        fs = Fusion()
+        with FusionDefinition(fs) as fd :
+            t0 = fd.define_tensor(sizes=input1.size(), strides=input1.stride())
+            t1 = fd.define_tensor(sizes=input2.size(), strides=input2.stride())
+
+            t0_b = fd.ops.broadcast_in_dim(t0, [2, 3, 4], [0, 1, 2])
+            t2 = fd.ops.add(t0_b, t1)
+
+            fd.add_output(t2)
+
+        nvf_out = fs.execute([input1, input2])[0]
+        eager_out = refs.add(prims.broadcast_in_dim(input1, [2, 3, 4], [0, 1, 2]), input2)
+        self.assertEqual(eager_out, nvf_out)
+
+    def test_broadcast_mixing(self) :
+        fs = Fusion()
+        with FusionDefinition(fs) as fd :
+            t0 = fd.define_tensor([3, 1], [1, 1])
+            t1 = fd.define_tensor(1)
+
+            t1_b = fd.ops.broadcast_in_dim(t1, [3, 3], [0])
+            t2 = fd.ops.add(t0, t1_b)
+
+            fd.add_output(t2)
+
+        input1 = torch.randn(3, 1, device='cuda')
+        input2 = torch.randn(3, device='cuda')
+
+        nvf_out = fs.execute([input1, input2])[0]
+        eager_out = refs.add(input1, prims.broadcast_in_dim(input2, [3, 3], [0]))
+        self.assertEqual(eager_out, nvf_out)
+
+    def test_ops_broadcast(self) :
+        fs = Fusion()
+        with FusionDefinition(fs) as fd :
+            t0 = fd.define_tensor(1)
+            t1 = fd.define_tensor(3)
+
+            t0_b = fd.ops.broadcast(t0, [True, False, True])
+            t2 = fd.ops.add(t0_b, t1)
+
+            fd.add_output(t2)
+
+        input1 = torch.randn(3, device='cuda')
+        input2 = torch.randn(2, 3, 4, device='cuda')
+
+        nvf_out = fs.execute([input1, input2])[0]
+        eager_out = refs.add(prims.broadcast_in_dim(input1, [2, 3, 4], [1]), input2)
+        self.assertEqual(eager_out, nvf_out)
+
+    def test_prim_layer_norm_fwd(self) :
+        def primitive_definition(
+            inputs: torch.Tensor,
+            weight: torch.Tensor,
+            bias: torch.Tensor,
+            normalization_axis: int,
+            keepdim: bool,
+        ) -> torch.Tensor:
+            mean = inputs.mean(normalization_axis, keepdim=keepdim)
+            diff = inputs - mean
+            diff_sq = diff * diff
+            var = diff_sq.mean(normalization_axis, keepdim=keepdim)
+            pre_shift_scale_norm_output = (inputs - mean) / torch.sqrt(var + 1e-12)
+            norm_output = weight * pre_shift_scale_norm_output + bias
+            return norm_output
+
+        def nvfuser_fusion(
+            fd: FusionDefinition,
+            normalization_axis: int,
+            norm_size: int,
+            input_shape: List[int],
+            eps: float,
+            keepDim: bool
+        ) -> None :
+            inputs = fd.define_tensor(symbolic_sizes=[-1, -1, -1], contiguous=[True, True, True], dtype=DataType.Float)
+            weights = fd.define_tensor(symbolic_sizes=[-1], contiguous=[True], dtype=DataType.Float)
+            bias = fd.define_tensor(symbolic_sizes=[-1], contiguous=[True], dtype=DataType.Float)
+            sum0 = fd.ops.sum(inputs, axes=[normalization_axis], keepdim=keepDim)
+            norm_const = fd.define_constant(norm_size)
+            mean = fd.ops.div(sum0, norm_const)
+            diff = fd.ops.sub(inputs, mean)
+            diff_sq = fd.ops.mul(diff, diff)
+            sum1 = fd.ops.sum(diff_sq, axes=[normalization_axis], keepdim=keepDim)
+            var = fd.ops.div(sum1, norm_const)
+            eps_const = fd.define_constant(eps)
+            var_eps = fd.ops.add(var, eps_const)
+            invstd = fd.ops.rsqrt(var_eps)
+            pre_scale_bias = fd.ops.mul(diff, invstd)
+            weights_bcast = fd.ops.broadcast_in_dim(weights, output_shape=input_shape, broadcast_dims=[2])
+            scale = fd.ops.mul(pre_scale_bias, weights_bcast)
+            bias_bcast = fd.ops.broadcast_in_dim(bias, output_shape=input_shape, broadcast_dims=[2])
+            out = fd.ops.add(scale, bias_bcast)
+            fd.add_output(out)
+            fd.add_output(mean)
+            fd.add_output(invstd)
+
+        def nvfuser_fusion_var_mean(
+            fd: FusionDefinition,
+            normalization_axis: int,
+            norm_size: int,
+            input_shape: List[int],
+            eps: float,
+            keepDim: bool
+        ) -> None :
+            inputs = fd.define_tensor(symbolic_sizes=[-1, -1, -1], contiguous=[True, True, True], dtype=DataType.Float)
+            weights = fd.define_tensor(symbolic_sizes=[-1], contiguous=[True], dtype=DataType.Float)
+            bias = fd.define_tensor(symbolic_sizes=[-1], contiguous=[True], dtype=DataType.Float)
+            var, mean = fd.ops.var_mean(inputs, axes=[normalization_axis], correction=0, keepdim=keepDim)
+            eps_const = fd.define_constant(eps)
+            var_eps = fd.ops.add(var, eps_const)
+            invstd = fd.ops.rsqrt(var_eps)
+            diff = fd.ops.sub(inputs, mean)
+            pre_scale_bias = fd.ops.mul(diff, invstd)
+            weights_bcast = fd.ops.broadcast_in_dim(weights, output_shape=input_shape, broadcast_dims=[2])
+            scale = fd.ops.mul(pre_scale_bias, weights_bcast)
+            bias_bcast = fd.ops.broadcast_in_dim(bias, output_shape=input_shape, broadcast_dims=[2])
+            out = fd.ops.add(scale, bias_bcast)
+            fd.add_output(out)
+            fd.add_output(mean)
+            fd.add_output(invstd)
+
+        input_size = [64, 128, 1024]
+        dtype = torch.float32
+        device = 'cuda'
+        inputs = torch.randn(*input_size, device=device, requires_grad=True)
+        weights = torch.nn.Parameter(torch.randn(input_size[2], dtype=dtype, device=device))
+        biases = torch.nn.Parameter(torch.randn(input_size[2], dtype=dtype, device=device))
+        fc = FusionCache.get()
+        before_fusions = fc.num_fusions()
+
+        for _ in range(5) :
+            nvf_fusion = Fusion()
+            with FusionDefinition(nvf_fusion) as fd:
+                nvfuser_fusion(fd, 2, inputs.size()[2], inputs.size(), 1e-12, True)
+            nvf_out = nvf_fusion.execute([inputs, weights, biases])
+
+        for _ in range(5) :
+            nvf_var_mean_fusion = Fusion()
+            with FusionDefinition(nvf_var_mean_fusion) as fd:
+                nvfuser_fusion_var_mean(fd, 2, inputs.size()[2], inputs.size(), 1e-12, True)
+            nvf_var_mean_out = nvf_var_mean_fusion.execute([inputs, weights, biases])
+
+        for _ in range(5) :
+            eager_out = primitive_definition(inputs, weights, biases, 2, True)
+
+        self.assertEqual(eager_out, nvf_out[0])
+        self.assertEqual(eager_out, nvf_var_mean_out[0])
+        fusion_cache = FusionCache.get()
+        self.assertEqual(fc.num_fusions() - before_fusions, 2)
+
+    def test_prim_rms_norm_fwd(self) :
+        def primitive_definition(
+            inputs: torch.Tensor,
+            weight: torch.Tensor,
+            normalization_axis: int,
+            keepdim: bool,
+        ) -> torch.Tensor:
+            var = inputs.mul(inputs).mean(normalization_axis, keepdim)
+            pre_shift_scale_norm_output = inputs / torch.sqrt(var + 1e-12)
+            norm_output = weight * pre_shift_scale_norm_output
+            return norm_output
+
+        def nvfuser_fusion(
+            fd: FusionDefinition,
+            normalization_axis: int,
+            norm_size: int,
+            input_shape: List[int],
+            eps: float,
+            keepDim: bool
+        ) -> None :
+            inputs = fd.define_tensor(symbolic_sizes=[-1, -1, -1], contiguous=[True, True, True], dtype=DataType.Float)
+            weights = fd.define_tensor(symbolic_sizes=[-1], contiguous=[True], dtype=DataType.Float)
+            inputs_sq = fd.ops.mul(inputs, inputs)
+            sum0 = fd.ops.sum(inputs_sq, axes=[normalization_axis], keepdim=keepDim)
+            norm_const = fd.define_constant(norm_size)
+            var = fd.ops.div(sum0, norm_const)
+            eps_const = fd.define_constant(eps)
+            var_eps = fd.ops.add(var, eps_const)
+            invstd = fd.ops.rsqrt(var_eps)
+            pre_scale = fd.ops.mul(inputs, invstd)
+            weights_bcast = fd.ops.broadcast_in_dim(weights, output_shape=input_shape, broadcast_dims=[2])
+            out = fd.ops.mul(pre_scale, weights_bcast)
+            fd.add_output(out)
+            fd.add_output(invstd)
+
+        input_size = [64, 128, 1024]
+        dtype = torch.float32
+        device = 'cuda'
+        inputs = torch.randn(*input_size, device=device, requires_grad=True)
+        weights = torch.nn.Parameter(torch.randn(input_size[2], dtype=dtype, device=device))
+        fc = FusionCache.get()
+        before_fusions = fc.num_fusions()
+
+        for _ in range(5) :
+            nvf_fusion = Fusion()
+            with FusionDefinition(nvf_fusion) as fd:
+                nvfuser_fusion(fd, 2, inputs.size()[2], inputs.size(), 1e-12, True)
+            nvf_out = nvf_fusion.execute([inputs, weights])
+
+        for _ in range(5) :
+            eager_out = primitive_definition(inputs, weights, 2, True)
+
+        self.assertEqual(eager_out, nvf_out[0])
+        self.assertEqual(fc.num_fusions() - before_fusions, 1)
+
+if __name__ == '__main__':
+    run_tests()
diff --git a/third_party/nvfuser/python_tests/test_torchscript.py b/third_party/nvfuser/python_tests/test_torchscript.py
new file mode 100644
index 000000000000..7eccdc0f21f0
--- /dev/null
+++ b/third_party/nvfuser/python_tests/test_torchscript.py
@@ -0,0 +1,5306 @@
+# Owner(s): ["oncall: jit"]
+
+import contextlib
+import unittest
+import os
+import random
+import enum
+import copy
+from functools import reduce
+import operator
+import warnings
+
+import torch
+from torch.nn import functional
+from torch.profiler import profile, ProfilerActivity
+
+from torch.testing._internal.codegen.random_topo_test import runDefaultTestWithSeed
+from torch.testing._internal.common_cuda import TEST_MULTIGPU
+from torch.testing._internal.common_device_type import instantiate_device_type_tests, ops, OpDTypes
+from torch.testing._internal.common_jit import JitCommonTestCase
+from torch.testing._internal.common_methods_invocations import op_db, SampleInput
+from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR, TEST_WITH_ROCM, slowTest, \
+    is_iterable_of_tensors, freeze_rng_state, skipIfRocm
+from torch.testing._internal.jit_utils import clone_inputs, get_traced_sample_variant_pairs, JitTestCase, RUN_CUDA
+from torch.testing._internal.jit_metaprogramming_utils import create_traced_fn
+from torch.testing import FileCheck
+
+import itertools
+import numpy as np
+import math
+
+from torch.autograd.gradcheck import gradcheck
+
+from typing import List
+
+RUN_NVFUSER = RUN_CUDA and not TEST_WITH_ROCM
+CUDA_MAJOR, CUDA_MINOR = 0, 0
+
+if RUN_NVFUSER and torch.version.cuda is not None:
+    CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.')[:2])
+
+if 'PYTORCH_NVFUSER_ENABLE' not in os.environ:
+    os.environ['PYTORCH_NVFUSER_ENABLE'] = ""
+os.environ['PYTORCH_NVFUSER_ENABLE'] = 'linear_decomposition,conv_decomposition,' + os.environ['PYTORCH_NVFUSER_ENABLE']
+if 'PYTORCH_NVFUSER_DISABLE' not in os.environ:
+    os.environ['PYTORCH_NVFUSER_DISABLE'] = ""
+os.environ['PYTORCH_NVFUSER_DISABLE'] = 'fallback,fma,' + os.environ['PYTORCH_NVFUSER_DISABLE']
+os.environ['PYTORCH_NVFUSER_JIT_OPT_LEVEL'] = '0'
+# TODO: enable complex when we fixes the extremal cases in OpInfo
+# see issue https://github.com/csarofeen/pytorch/issues/1730"
+# os.environ['PYTORCH_NVFUSER_ENABLE'] = 'complex'
+
+if GRAPH_EXECUTOR == ProfilingMode.PROFILING:
+    torch._C._jit_set_texpr_fuser_enabled(False)
+    torch._C._jit_set_profiling_executor(True)
+    torch._C._jit_set_profiling_mode(True)
+
+FUSION_GROUP = 'prim::CudaFusionGroup'
+FUSION_GUARD = 'prim::CudaFusionGuard'
+# TODO: revert disabled alias ops
+ALIAS_TEST_DISABLED = True
+
+
+@contextlib.contextmanager
+def nvfuser_singleton_fusion(flag):
+    old_value = torch._C._jit_set_nvfuser_single_node_mode(flag)
+    try:
+        yield
+    finally:
+        torch._C._jit_set_nvfuser_single_node_mode(old_value)
+
+@contextlib.contextmanager
+def nvfuser_horizontal_fusion(flag):
+    old_value = torch._C._jit_set_nvfuser_horizontal_mode(flag)
+    try:
+        yield
+    finally:
+        torch._C._jit_set_nvfuser_horizontal_mode(old_value)
+
+def is_pre_volta():
+    if not RUN_NVFUSER:
+        return False
+    prop = torch.cuda.get_device_properties(torch.cuda.current_device())
+    return prop.major < 7
+
+TEST_BF16 = RUN_NVFUSER and torch.cuda.is_bf16_supported()
+
+TEST_LARGE_TENSOR = RUN_NVFUSER
+if RUN_NVFUSER:
+    torch.ones(1).cuda()  # initialize cuda context
+    TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 12e9
+
+class CudaFuserTestOptions():
+    def __init__(self):
+        self.old_cpu_fuse = torch._C._jit_can_fuse_on_cpu()
+        self.old_gpu_fuse = torch._C._jit_can_fuse_on_gpu()
+        torch._C._jit_override_can_fuse_on_cpu(False)
+        torch._C._jit_override_can_fuse_on_gpu(False)
+        self.old_guard = torch._C._jit_set_nvfuser_guard_mode(False)
+        torch._C._debug_set_autodiff_subgraph_inlining(False)
+        self.old_value = torch._C._jit_set_autocast_mode(True)
+
+        if(RUN_CUDA):
+            self.old_nvfuser = torch._C._jit_set_nvfuser_enabled(True)
+
+    def restore(self):
+        if(RUN_CUDA):
+            torch._C._jit_set_nvfuser_enabled(self.old_nvfuser)
+        torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuse)
+        torch._C._jit_override_can_fuse_on_gpu(self.old_gpu_fuse)
+        torch._C._jit_set_nvfuser_guard_mode(self.old_guard)
+        torch._C._debug_set_autodiff_subgraph_inlining(True)
+        torch._C._jit_set_autocast_mode(self.old_value)
+
+class TestCudaFuser(JitTestCase):
+    def assertEqual(self, *args, **kwargs):
+        kwargs["exact_layout"] = True
+        super().assertEqual(*args, **kwargs)
+
+    def _getSubgraphInFusion(self, graph):
+        num_node = 0
+        subgraph = None
+
+        def count(block, ret):
+            for n in block.nodes():
+                if n.kind() == FUSION_GROUP:
+                    ret[0] = ret[0] + 1
+                    self.assertTrue(n.hasAttribute('Subgraph'))
+                    ret[1] = n.g('Subgraph')
+                for block in n.blocks():
+                    count(block, ret)
+        ret = [num_node, subgraph]
+        count(graph, ret)
+        self.assertEqual(ret[0], 1)
+        return ret[1]
+
+    def setUp(self):
+        super().setUp()
+
+        self.skip_node_list = []
+        disabled_ops = ("aten::batch_norm",
+                        "aten::_batch_norm_impl_index",
+                        "aten::_batch_norm_impl_index_backward",
+                        "aten::native_batch_norm_backward",)
+        for op in disabled_ops:
+            disabled_flag = torch._C._jit_set_nvfuser_skip_node_kind(op, False)
+            if disabled_flag:
+                torch._C._jit_set_nvfuser_skip_node_kind(op, True)
+                self.skip_node_list.append(op)
+
+        # cpu backup to avoid errors in case this is run on a CPU-only machine
+        dev = 'cuda' if RUN_NVFUSER else 'cpu'
+        self.special_values = torch.tensor(
+            [float("-inf"), -10, -math.pi,
+                -1, -0.5, 0, 1, 0.5,
+                math.pi, 10, float("inf"),
+                float("nan")], dtype=torch.float, device=dev)
+
+        self.int_types = [
+            torch.int8,
+            torch.uint8,
+            torch.int16,
+            torch.int32,
+            torch.int64
+        ]
+
+        self.support_tensor_dtypes = [
+            torch.int32,
+            torch.int64,
+            torch.float16,
+            torch.float32,
+            torch.float64,
+            torch.bool,
+            torch.complex64,
+            torch.complex128,
+        ]
+        if TEST_BF16:
+            self.support_tensor_dtypes.append(torch.bfloat16)
+
+        if(RUN_NVFUSER):
+            self.cuda_fuser_options = CudaFuserTestOptions()
+
+    def tearDown(self):
+        # restoring skip node to the configuration before tests
+        for op in self.skip_node_list:
+            disabled_flag = torch._C._jit_set_nvfuser_skip_node_kind(op, False)
+            if not disabled_flag:
+                torch._C._jit_set_nvfuser_skip_node_kind(op, True)
+
+        if(RUN_NVFUSER):
+            self.cuda_fuser_options.restore()
+        super().tearDown()
+
+    def _run_helper(self, jit_op, op, *args, check_stride=False, num_fusion=1, check_runs=1):
+        seed = 123
+        torch.cuda.manual_seed_all(seed)
+        jit_o = jit_op(*args)
+
+        for i in range(check_runs):
+            torch.cuda.manual_seed_all(seed + i)
+            jit_o = jit_op(*args)
+            torch.cuda.manual_seed_all(seed + i)
+            o = op(*args)
+
+            if type(jit_o) is torch.Tensor:
+                jit_o = [jit_o, ]
+                o = [o, ]
+
+            for oo, jit_oo in zip(o, jit_o):
+                self.assertEqual(oo.dtype, jit_oo.dtype)
+                self.assertEqual(oo, jit_oo)
+                if check_stride:
+                    self.assertEqual(oo.stride(), jit_oo.stride())
+
+        self.assertGraphContainsExactly(jit_op.graph_for(*args), FUSION_GUARD, num_fusion, consider_subgraphs=True)
+
+    def _run_training_helper(self, jit_op, op, grads, *args):
+        torch.cuda.manual_seed_all(123)
+        jit_o = jit_op(*args)
+        jit_g = jit_o.backward(grads)
+        torch.cuda.manual_seed_all(123)
+        jit_o = jit_op(*args)
+        jit_g = jit_o.backward(grads)
+        torch.cuda.manual_seed_all(123)
+        jit_o = jit_op(*args)
+        jit_g = jit_o.backward(grads)
+        torch.cuda.manual_seed_all(123)
+        o = op(*args)
+        g = o.backward(grads)
+        self.assertEqual(o, jit_o)
+        self.assertEqual(g, jit_g)
+        self.assertGraphContainsExactly(jit_op.graph_for(*args), FUSION_GUARD, 1, consider_subgraphs=True)
+        bwd_graph = list(
+            list(jit_op.get_debug_state().execution_plans.values())[
+                0].code.grad_executor_states()[0].execution_plans.values()
+        )[0].graph
+        self.assertGraphContainsExactly(bwd_graph, FUSION_GUARD, 1, consider_subgraphs=True)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_half(self):
+        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, alpha: float):
+            o_16 = torch.add(x, y)
+            o_32_a = torch.add(y, z, alpha=alpha)
+            o_32_b = torch.add(o_16, z)
+            return (o_16, o_32_a, o_32_b)
+
+        t_jit = torch.jit.script(t)
+        alpha = 0.5
+        # stick to integers, this avoid the numerical difference due to our
+        # promotion
+        x = torch.randint(0, 256, (4, 8)).to(dtype=torch.float16, device="cuda")
+        y = torch.randint(0, 256, (4, 8)).to(dtype=torch.float16, device="cuda")
+        z = torch.randint(0, 256, (4, 8)).to(dtype=torch.float16, device="cuda")
+        jit_o = t_jit(x, y, z, alpha)
+        jit_o = t_jit(x, y, z, alpha)
+        o = t(x, y, z, alpha)
+        for oo, jit_oo in zip(o, jit_o):
+            self.assertEqual(oo.dtype, jit_oo.dtype)
+            self.assertEqual(oo, jit_oo)
+        self.assertGraphContains(t_jit.graph_for(x, y, z, alpha), FUSION_GUARD)
+
+
+    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_bfloat(self):
+        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, alpha: float):
+            o_16 = torch.add(x, y)
+            o_32_a = torch.add(y, z, alpha=alpha)
+            o_32_b = torch.add(o_16, z)
+            return (o_16, o_32_a, o_32_b)
+
+        t_jit = torch.jit.script(t)
+        alpha = 0.5
+        # stick to integers, this avoid the numerical difference due to our
+        # promotion
+        x = torch.randint(0, 256, (4, 8)).to(dtype=torch.bfloat16, device="cuda")
+        y = torch.randint(0, 256, (4, 8)).to(dtype=torch.bfloat16, device="cuda")
+        z = torch.randint(0, 256, (4, 8)).to(dtype=torch.bfloat16, device="cuda")
+        jit_o = t_jit(x, y, z, alpha)
+        jit_o = t_jit(x, y, z, alpha)
+        o = t(x, y, z, alpha)
+        for oo, jit_oo in zip(o, jit_o):
+            self.assertEqual(oo.dtype, jit_oo.dtype)
+            self.assertEqual(oo, jit_oo)
+        self.assertGraphContains(t_jit.graph_for(x, y, z, alpha), FUSION_GUARD)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_const(self):
+        def t(x, y):
+            o = x + y
+            o = o + 2.0
+            return o
+        t_jit = torch.jit.script(t)
+        x = torch.randn(4, 8, dtype=torch.float, device="cuda")
+        y = torch.randn(4, 8, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        o = t(x, y)
+        self.assertEqual(o, jit_o)
+        self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_chunk(self):
+        def t(x, y, z, q):
+            o = x + q
+            x0, x1 = torch.chunk(o, 2)
+            o = x0 + x1
+            o = o + y
+            o = o * z
+            o = torch.relu(o)
+            return o
+        t_jit = torch.jit.script(t)
+        x = torch.randn(4, 8, dtype=torch.float, device="cuda")
+        y = torch.randn(2, 8, dtype=torch.float, device="cuda")
+        z = torch.randn(2, 8, dtype=torch.float, device="cuda")
+        q = torch.randn(4, 8, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, z, q)
+        jit_o = t_jit(x, y, z, q)
+        o = t(x, y, z, q)
+        self.assertEqual(o, jit_o)
+        self.assertGraphContains(t_jit.graph_for(x, y, z, q), FUSION_GUARD)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_reduction_dtypes_axis(self):
+
+        for op in [torch.sum, torch.mean, torch.amax, torch.var, torch.std]:
+            for dtype in [torch.float16, torch.float32, torch.double]:
+                for axis in [-1, 2, 0]:
+                    def make_func(op):
+                        def func(x: torch.Tensor):
+                            o = torch.mul(x, 2.0)
+                            o = op(o, dim=[axis])
+                            return o
+                        return func
+
+                    x = torch.randn(8, 4, 16, dtype=dtype, device="cuda")
+                    t = make_func(op)
+                    t_jit = torch.jit.trace(t, x)
+                    jit_o = t_jit(x)
+                    jit_o = t_jit(x)
+                    o = t(x)
+                    self.assertEqual(o.dtype, jit_o.dtype)
+                    self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4))
+                    self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_variance(self):
+
+        for op in [torch.var, torch.std]:
+            for dtype in [torch.float16, torch.float32, torch.double]:
+                for axis in [-2, -1, 2, 1]:
+                    for unbiased in [False, True]:
+                        def make_func(op):
+                            def func(x: torch.Tensor):
+                                o = torch.mul(x, 2.0)
+                                o = op(o, dim=[axis])
+                                return o
+                            return func
+
+                        x = torch.randn(8, 4, 16, dtype=dtype, device="cuda")
+                        t = make_func(op)
+                        t_jit = torch.jit.trace(t, x)
+                        jit_o = t_jit(x)
+                        jit_o = t_jit(x)
+                        o = t(x)
+                        self.assertEqual(o.dtype, jit_o.dtype)
+                        self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4))
+                        self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_variance_profiling(self):
+        with nvfuser_singleton_fusion(True):
+            for op in [torch.var, torch.std]:
+                for dtype in [torch.float16, torch.float32, torch.double]:
+                    for axis in [-2, -1, 2, 1]:
+                        for unbiased in [False, True]:
+                            for keepdim in [False, True]:
+                                def t(x: torch.Tensor, dim: List[int], unbiased: bool, keepdim: bool):
+                                    o = torch.mul(x, 2.0)
+                                    o = op(o, dim=dim, unbiased=unbiased, keepdim=keepdim)
+                                    return o
+
+                                x = torch.randn(8, 4, 16, dtype=dtype, device="cuda")
+                                t_jit = torch.jit.script(t)
+                                self._run_helper(t_jit, t, x, [axis], unbiased, keepdim, check_stride=False, check_runs=5)
+
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_scalar_input(self):
+        def t(x: torch.Tensor, y: torch.Tensor, z: float):
+            o = x + y
+            o = o + z
+            return o
+        t_jit = torch.jit.script(t)
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(4, 8, 1, 32, dtype=torch.float, device="cuda")
+        y = y.expand(4, 8, 32, 32)
+        jit_o = t_jit(x, y, 2.0)
+        jit_o = t_jit(x, y, 2.0)
+        o = t(x, y, 2.0)
+        self.assertEqual(o, jit_o)
+        self.assertGraphContains(t_jit.graph_for(x, y, 2.0), FUSION_GUARD)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_broadcasting_0(self):
+
+        def t(x: torch.Tensor, y: torch.Tensor, z: float):
+            o = x + y
+            o = o + z
+            return o
+        t_jit = torch.jit.script(t)
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(32, 32, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, 2.0)
+        jit_o = t_jit(x, y, 2.0)
+        o = t(x, y, 2.0)
+        self.assertEqual(o, jit_o)
+        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0))
+        self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_broadcasting_1(self):
+
+        def t(x: torch.Tensor, y: torch.Tensor, z: float):
+            o = x + y
+            o = o + z
+            return o
+        t_jit = torch.jit.script(t)
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(1, 32, 32, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, 2.0)
+        jit_o = t_jit(x, y, 2.0)
+        o = t(x, y, 2.0)
+        self.assertEqual(o, jit_o)
+        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0))
+        self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_broadcasting_2(self):
+
+        def t(x: torch.Tensor, y: torch.Tensor, z: float):
+            o = x + y
+            o = o + z
+            return o
+        t_jit = torch.jit.script(t)
+        x = torch.randn(4, 1, 32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(8, 32, 32, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, 2.0)
+        jit_o = t_jit(x, y, 2.0)
+        o = t(x, y, 2.0)
+        self.assertEqual(o, jit_o)
+        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0))
+        self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_broadcasting_3(self):
+
+        def t(x: torch.Tensor, y: torch.Tensor, z: float):
+            o = x + y
+            o = o + z
+            return o
+        t_jit = torch.jit.script(t)
+        x = torch.randn(8, 17, 8, dtype=torch.float, device="cuda")
+        y = torch.randn(8, 17, 1, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, 2.0)
+        jit_o = t_jit(x, y, 2.0)
+        o = t(x, y, 2.0)
+        self.assertEqual(o, jit_o)
+        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0))
+        self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False)
+
+    # test_broadcasting_partition_logic_X
+    # Testing partition logic that is capable to avoid creating unsupported
+    # broadcasting semantics in CudaFusionGroup
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_broadcasting_partition_logic_0(self):
+
+        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+            x = x + 12.0
+            o1 = x + y
+            o2 = x + z
+            o = o1 + o2
+            return o
+        t_jit = torch.jit.script(t)
+        x = torch.randn(4, 8, 6, 8, dtype=torch.float32, device="cuda")
+        y = torch.randn(8, 6, 8, dtype=torch.float32, device="cuda")
+        z = torch.randn(6, 8, dtype=torch.float32, device="cuda")
+        jit_o = t_jit(x, y, z)
+        jit_o = t_jit(x, y, z)
+        o = t(x, y, z)
+        self.assertEqual(o, jit_o)
+        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, z))
+        self.assertGraphContainsExactly(subgraph, 'aten::add', 4, consider_subgraphs=False)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_broadcasting_partition_logic_1(self):
+
+        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+            x = x + 12.0
+            o1 = x + y
+            o2 = x + z
+            o = o1 + o2
+            return o
+        t_jit = torch.jit.script(t)
+        x = torch.randn(8, 6, 8, dtype=torch.float32, device="cuda")
+        y = torch.randn(4, 8, 6, 8, dtype=torch.float32, device="cuda")
+        z = torch.randn(4, 1, 6, 8, dtype=torch.float32, device="cuda")
+        jit_o = t_jit(x, y, z)
+        jit_o = t_jit(x, y, z)
+        o = t(x, y, z)
+        self.assertEqual(o, jit_o)
+        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, z))
+        self.assertGraphContainsExactly(subgraph, 'aten::add', 4, consider_subgraphs=False)
+
+    @unittest.skipIf(True, "Broadcast with different output not supported yet")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_broadcasting_multiple_output_shape(self):
+        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+            o = x + 12
+            o1 = o + y
+            o2 = o + z
+            oo = o1.sum() + o2.sum()
+            return oo
+        t_jit = torch.jit.script(t)
+        x = torch.randn(32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(2, 32, 32, dtype=torch.float, device="cuda")
+        z = torch.randn(4, 32, 32, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, z)
+        jit_o = t_jit(x, y, z)
+        o = t(x, y, z)
+        self.assertEqual(o, jit_o)
+        # Currently cannot fuse this
+        self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
+
+    @unittest.skipIf(True, "broadcast on branches can't be resolved yet")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_broadcasting_multiple_output(self):
+        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+            o = x + 12
+            o1 = o + y
+            o2 = o + z
+            oo = o1.sum() + o2.sum()
+            return oo
+        t_jit = torch.jit.script(t)
+        x = torch.randn(32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(4, 32, 32, dtype=torch.float, device="cuda")
+        z = torch.randn(4, 32, 32, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, z)
+        jit_o = t_jit(x, y, z)
+        o = t(x, y, z)
+        self.assertEqual(o, jit_o)
+        # Currently cannot fuse this
+        self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
+
+    def _unary_test_helper(self, operation, dtype, random_data):
+        gradient_check = (dtype == torch.float64) and random_data
+        shape = self.special_values.shape
+        torch.cuda.manual_seed_all(211)
+
+        # need additional def of t for boolean ops
+        def t(x: torch.Tensor, y: torch.Tensor):
+            o = x * y
+            o = o + 5e-3
+            o = operation(o)
+            return o
+
+        y = torch.rand(shape, dtype=torch.float32, device="cuda", requires_grad=gradient_check)
+        y = y.to(dtype=dtype)
+
+        if random_data:
+            x = torch.rand(shape, dtype=torch.float32, device="cuda", requires_grad=gradient_check)
+            if dtype in self.int_types:
+                # prefer a larger variance for integer types
+                x = x * 5
+            x = x.to(dtype=dtype)
+        else:
+            x = self.special_values.to(dtype=dtype)
+        try:
+            ref = t(x, y)
+        except Exception:
+            # same way as TE checker, if eager mode throws, ignore this test
+            return
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        if gradient_check:
+            if jit_o.dtype != torch.bool:
+                # bool dtype has no `-`
+                gradcheck(t_jit, [x, y], nondet_tol=1e-5)
+        elif dtype in self.support_tensor_dtypes:
+            self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
+        o = t(x, y)
+        self.assertEqual(o.dtype, jit_o.dtype)
+
+        if dtype == torch.bfloat16:
+            # compare with the actual ground truth for
+            #  bfloat16 kernels instead of eager mode
+            #  implementation, since mismatch in cast
+            #  adds excessive noise.
+            o = t(x.to(torch.float64), y.to(torch.float64))
+            if o.dtype.is_floating_point:
+                o = o.to(torch.bfloat16)
+        else:
+            o = t(x, y)
+
+        self.assertTrue(self._compare("failing case {}\n{}\n{}\n{}".format(dtype, operation, x, y), o, jit_o, 1e-2))
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_unary_ops(self):
+        data_types = [
+            *self.int_types,
+            torch.float16,
+            torch.float32,
+            torch.float64,
+            # TODO: revert this
+            # see issue https://github.com/csarofeen/pytorch/issues/1730"
+            # torch.cfloat,
+            # torch.cdouble,
+        ]
+        if TEST_BF16:
+            data_types.append(torch.bfloat16)
+        operations = [torch.neg,
+                      torch.abs,
+                      torch.log,
+                      torch.log10,
+                      torch.log1p,
+                      torch.log2,
+                      torch.lgamma,
+                      torch.exp,
+                      torch.expm1,
+                      torch.erf,
+                      torch.erfc,
+                      torch.cos,
+                      torch.acos,
+                      torch.cosh,
+                      torch.sin,
+                      torch.asin,
+                      torch.sinh,
+                      torch.tan,
+                      torch.atan,
+                      torch.sqrt,
+                      torch.rsqrt,
+                      torch.ceil,
+                      torch.floor,
+                      torch.round,
+                      torch.trunc,
+                      torch.frac,
+                      torch.reciprocal,
+                      torch.isfinite,
+                      torch.isinf,
+                      torch.isnan,
+                      torch.isneginf,
+                      torch.isposinf,
+                      torch.isreal,
+                      torch.nn.functional.softplus,
+                      torch.nn.functional.gelu,
+                      torch.nn.functional.leaky_relu,
+                      torch.nn.functional.silu,
+                      torch.relu,
+                      torch.sigmoid,
+                      torch.bitwise_not,
+                      torch.tan,
+                      torch.tanh]
+        skip_complex = {torch.rsqrt, torch.reciprocal}
+        for op, dtype in itertools.product(operations, data_types):
+            if dtype.is_complex and op in skip_complex:
+                continue
+            self._unary_test_helper(op, dtype, False)  # test special numbers
+            self._unary_test_helper(op, dtype, True)  # test random data
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_category_rule(self):
+        def run_tensor(x, z):
+            def t(x: torch.Tensor, z: torch.Tensor):
+                o = x + z
+                o = torch.abs(o)
+                return o
+            t_jit = torch.jit.script(t)
+            jit_o = t_jit(x, z)
+            jit_o = t_jit(x, z)
+            o = t(x, z)
+            self.assertEqual(o.dtype, jit_o.dtype)
+            self.assertEqual(o, jit_o)
+            self.assertGraphContains(t_jit.graph_for(x, z), FUSION_GUARD)
+
+        def run_scalar(x, z):
+            def t(x: torch.Tensor, z: float):
+                o = x + z
+                o = torch.abs(o)
+                return o
+            t_jit = torch.jit.script(t)
+            jit_o = t_jit(x, z)
+            jit_o = t_jit(x, z)
+            o = t(x, z)
+            self.assertEqual(o.dtype, jit_o.dtype)
+            self.assertEqual(o, jit_o)
+            self.assertGraphContains(t_jit.graph_for(x, z), FUSION_GUARD)
+
+        # n-dim with 0-dim (no type-promote)
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        z = torch.tensor(2.0, dtype=torch.double, device="cuda")
+        run_tensor(x, z)
+
+        # n-dim with 0-dim (type-promote)
+        x = torch.randn(4, 8, 32, 32, device="cuda").to(dtype=torch.long)
+        z = torch.tensor(2.0, dtype=torch.double, device="cuda")
+        run_tensor(x, z)
+
+        # n-dim with n-dim (type-promote)
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        z = torch.randn(4, 8, 32, 32, dtype=torch.double, device="cuda")
+        run_tensor(x, z)
+
+        # n-dim with scalar (no type-promote)
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float16, device="cuda")
+        z = torch.tensor(3., dtype=torch.double)
+        run_scalar(x, z)
+        if TEST_BF16:
+            # n-dim with scalar (no type-promote)
+            x = torch.randn(4, 8, 32, 32, dtype=torch.bfloat16, device="cuda")
+            z = torch.tensor(3., dtype=torch.double)
+            run_scalar(x, z)
+
+        # n-dim with scalar (type-promote)
+        x = torch.randn(4, 8, 32, 32, device="cuda").to(dtype=torch.long)
+        z = torch.tensor(3., dtype=torch.double)
+        run_scalar(x, z)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_unary_bitwise(self):
+        def bit_not(x: torch.Tensor):
+            return ~(x + 1)
+
+        jitted = torch.jit.script(bit_not)
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(torch.long)
+        jit_o = jitted(x)
+        jit_o = jitted(x)
+        o = bit_not(x)
+        self.assertEqual(o, jit_o)
+        jitted.graph_for(x)  # Shows up in second instance, not first
+        self.assertGraphContains(jitted.graph_for(x), FUSION_GUARD)
+
+        def bool_not(x: torch.Tensor, y: torch.Tensor):
+            return ~(x & y)
+
+        jitted = torch.jit.script(bool_not)
+        x = torch.rand(4, 8, 32, 32, dtype=torch.float, device="cuda").round().to(torch.bool)
+        y = torch.rand(4, 8, 32, 32, dtype=torch.float, device="cuda").round().to(torch.bool)
+        jit_o = jitted(x, y)
+        jit_o = jitted(x, y)
+        o = bool_not(x, y)
+        self.assertEqual(o, jit_o)
+        jitted.graph_for(x, y)  # Shows up in second instance, not first
+        self.assertGraphContains(jitted.graph_for(x, y), FUSION_GUARD)
+
+    def _get_scalar_binary_test_fn(self, category_and_type1, category_and_type2, operation):
+        category1, dtype_arg1 = category_and_type1
+        category2, dtype_arg2 = category_and_type2
+
+        def t_intx_tensory(x: int, y: torch.Tensor):
+            o = operation(x, y)
+            o = 2 + o
+            return o
+
+        def t_doublex_tensory(x: float, y: torch.Tensor):
+            o = operation(x, y)
+            o = 2 + o
+            return o
+
+        def t_cdoublex_tensory(x: complex, y: torch.Tensor):
+            o = operation(x, y)
+            o = 2 + o
+            return o
+
+        # Omit both scalar cases and swap cases
+        assert category1 == "scalar" and category2 != "scalar"
+        if dtype_arg1.is_floating_point:
+            return t_doublex_tensory
+        if dtype_arg1 == torch.int64 or dtype_arg1 == torch.int32:
+            return t_intx_tensory
+        if dtype_arg1.is_complex or dtype_arg1 == torch.int32:
+            return t_cdoublex_tensory
+        raise NotImplementedError
+
+    def _binary_test_helper(self, operation, dtypes, random_data, categories="ndim"):
+        if isinstance(dtypes, tuple):
+            dtype_arg1, dtype_arg2 = dtypes
+        else:
+            dtype_arg1 = dtype_arg2 = dtypes
+
+        if isinstance(categories, tuple) and random_data:
+            category1, category2 = categories
+        elif not random_data:
+            category1 = category2 = "ndim"
+        else:
+            category1 = category2 = categories
+
+        def is_cpu_category(x):
+            return x == "0dimcpu" or x == "scalar"
+
+        # skip unsupported cases
+        if is_cpu_category(category1) and is_cpu_category(category2):
+            return
+
+        # only test cases with first operand as scalar
+        if category2 == "scalar":
+            return
+
+        # skip ops that doesn't support scalar inputs in eager
+        if operation in [
+            torch.atan2,
+            torch.max,
+            torch.min,
+            torch.remainder,  # unsupported in nvfuser
+        ]:
+            if category1 == "scalar" or category2 == "scalar":
+                return
+
+        if operation in [
+            torch.fmod,
+            torch.eq,
+            torch.ne,
+            torch.ge,
+            torch.gt,
+            torch.le,
+            torch.lt
+        ]:
+            if category1 == "scalar":
+                return
+
+        # operators that does not support bfloat16
+        if operation in [torch.fmod]:
+            if dtype_arg1 == torch.bfloat16 or dtype_arg2 == torch.bfloat16:
+                return
+
+        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+            o = operation(x, y)
+            o = o + z
+            return o
+
+        shape = (4, 32, 32)
+
+        shapex = shape if category1 == "ndim" else ()
+        shapey = shape if category2 == "ndim" else ()
+
+        if random_data:
+            x = (torch.randn(shapex, dtype=torch.float, device="cuda") * 5).to(dtype_arg1)
+            y = (torch.randn(shapey, dtype=torch.float, device="cuda") * 5).to(dtype_arg2)
+        else:
+            x = self.special_values.to(dtype=dtype_arg1)
+            y = (torch.rand_like(self.special_values) * 5).to(dtype_arg2)
+
+        r"""
+            Category conversion
+        """
+        has_scalar = False
+        if category1 == "scalar":
+            has_scalar = True
+            x = x.item()
+
+        if category1 == "0dimcpu":
+            x = x.to(device="cpu")
+
+        if category2 == "scalar":
+            has_scalar = True
+            y = y.item()
+
+        if category2 == "0dimcpu":
+            y = y.to(device="cpu")
+
+        z = torch.tensor([2], device="cuda").to(dtype_arg1)
+        is_dtype_arg1_int = dtype_arg1 == torch.int32 or dtype_arg1 == torch.int64
+        is_dtype_arg2_int = dtype_arg2 == torch.int32 or dtype_arg2 == torch.int64
+
+        if operation in [torch.pow]:
+            if is_dtype_arg1_int and is_dtype_arg2_int:
+                if category2 == "scalar":
+                    # RuntimeError: Integers to negative integer powers are not allowed
+                    y = abs(y)
+                if category2 == "0dimcpu" and y == -1:
+                    # https://github.com/pytorch/pytorch/issues/73196
+                    y = y - 1
+                if category2 == "0dimcpu" and y == -2:
+                    # avoid pow(0, -2), which gives inconsistent results on integer tensor
+                    y = y - 1
+
+        # Avoid division by zero for integer tensors
+        div_like = [torch.div, torch.fmod, torch.remainder]
+        if operation in div_like and (dtype_arg2 == torch.int32 or dtype_arg2 == torch.int64):
+            y[y == 0] = 1
+
+        test_value = True
+        if dtype_arg1 == torch.half or dtype_arg2 == torch.half:
+            test_value = False
+        if dtype_arg1 == torch.bfloat16 or dtype_arg2 == torch.bfloat16:
+            test_value = False
+
+        try:
+            if not has_scalar:
+                o = t(x, y, z)
+                t_jit = torch.jit.script(t)
+                jit_o = t_jit(x, y, z)
+                jit_o = t_jit(x, y, z)
+                jit_o = t_jit(x, y, z)
+
+                self.assertEqual(o.dtype, jit_o.dtype)
+                if test_value:
+                    self.assertEqual(o, jit_o)
+                self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
+
+            elif category2 != "scalar":  # only test the case where first is scalar
+                test_fn = self._get_scalar_binary_test_fn((category1, dtype_arg1), (category2, dtype_arg2), operation)
+                o = test_fn(x, y)
+                t_jit = torch.jit.script(test_fn)
+                jit_o = t_jit(x, y)
+                jit_o = t_jit(x, y)
+                jit_o = t_jit(x, y)
+
+                self.assertEqual(o.dtype, jit_o.dtype)
+                if test_value:
+                    self.assertEqual(o, jit_o)
+                self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
+        except Exception as e:
+            print("failing test for op: ", operation.__name__)
+            print("with input\n\tx: ", x)
+            print("\ty: ", y)
+            print("\tz: ", z)
+            raise e
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_binary_ops(self):
+        data_types = [
+            torch.int32,
+            torch.int64,
+            torch.float16,
+            torch.float32,
+            torch.float64,
+        ]
+        if TEST_BF16:
+            data_types.append(torch.bfloat16)
+        operations = [torch.mul,
+                      torch.div,
+                      torch.atan2,
+                      torch.max,
+                      torch.min,
+                      torch.pow,
+                      torch.remainder,
+                      torch.fmod,
+                      torch.eq,
+                      torch.ne,
+                      torch.ge,
+                      torch.gt,
+                      torch.le,
+                      torch.lt]
+
+        category_types = [
+            "scalar",
+            "0dim",
+            "0dimcpu",
+            "ndim"
+        ]
+
+        binary_dtype_combinations = list(itertools.combinations(data_types, 2))
+        category_combinations = list(itertools.combinations(category_types, 2))
+
+        for op, dtypes, categories in itertools.product(operations, binary_dtype_combinations, category_combinations):
+            self._binary_test_helper(op, dtypes, True, categories)  # random data
+
+        for op, dtypes in itertools.product(operations, binary_dtype_combinations):
+            self._binary_test_helper(op, dtypes, False)  # special numbers
+
+    # TODO: revert this
+    @unittest.skipIf(True, "see issue https://github.com/csarofeen/pytorch/issues/1730")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_binary_ops_complex(self):
+        data_types = [torch.cfloat, torch.cdouble]
+        operations = [torch.mul, torch.div, torch.pow, torch.eq, torch.ne]
+
+        category_types = [
+            "scalar",
+            "0dim",
+            "0dimcpu",
+            "ndim"
+        ]
+
+        binary_dtype_combinations = list(itertools.combinations(data_types, 2))
+        category_combinations = list(itertools.combinations(category_types, 2))
+
+        for op, dtypes, categories in itertools.product(operations, binary_dtype_combinations, category_combinations):
+            self._binary_test_helper(op, dtypes, True, categories)  # random data
+
+        for op, dtypes in itertools.product(operations, binary_dtype_combinations):
+            self._binary_test_helper(op, dtypes, False)  # special numbers
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_binary_bitwise(self):
+        dtypes = [torch.bool, torch.int32, torch.int64]
+
+        for dtype1, dtype2, dtype3 in itertools.product(dtypes, repeat=3):
+            def jit_and(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+                return torch.bitwise_and(x, y) & z
+
+            def jit_or(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+                return torch.bitwise_or(x, y) | z
+
+            def jit_xor(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+                return torch.bitwise_xor(x, y) ^ z
+
+            def jit_lshift(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+                return torch.bitwise_left_shift(x, y) << z
+
+            def jit_rshift(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+                return torch.bitwise_right_shift(x, y) >> z
+
+            for jit_func in [jit_and, jit_or, jit_xor, jit_lshift, jit_rshift]:
+                if torch.bool in {dtype1, dtype2, dtype3} and jit_func in {jit_lshift, jit_rshift}:
+                    continue
+                x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(dtype1)
+                y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(dtype2)
+                z = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(2).to(dtype3)
+
+                jitted = torch.jit.script(jit_func)
+                jit_o = jitted(x, y, z)
+                jit_o = jitted(x, y, z)
+                o = jit_func(x, y, z)
+                self.assertEqual(o, jit_o)
+                self.assertGraphContains(jitted.graph_for(x, y, z), FUSION_GUARD)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_type_as_op(self):
+        def t(x: torch.Tensor, y: torch.Tensor, z: float):
+            o = torch.lt(x, z)
+            o = o.type_as(y)
+            return o
+        t_jit = torch.jit.script(t)
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, 0.5)
+        jit_o = t_jit(x, y, 0.5)
+        o = t(x, y, 0.5)
+        self.assertEqual(o, jit_o)
+        self.assertGraphContains(t_jit.graph_for(x, y, 0.5), FUSION_GUARD)
+
+    def _ternary_integer_test_helper(self, dtype_arg1):
+        shape = (4, 8, 32, 32)
+        magnitude = 100
+        if (dtype_arg1 in self.int_types):
+            x = torch.randint(-magnitude, magnitude, shape, dtype=dtype_arg1, device="cuda")
+        else:
+            x = torch.randn(shape, dtype=dtype_arg1, device="cuda") * magnitude
+        arg2 = int(0)
+        arg3 = int(magnitude * 0.1)
+
+        def clamp0(x: torch.Tensor, f: int):
+            o = 2. * torch.clamp(x, min=f)
+            return o
+        clamp0_jit = torch.jit.script(clamp0)
+        self._run_helper(clamp0_jit, clamp0, x, arg2)
+
+        def clamp1(x: torch.Tensor, f: int, ff: int):
+            o = 2. * torch.clamp(x, min=f, max=ff)
+            return o
+        clamp1_jit = torch.jit.script(clamp1)
+        self._run_helper(clamp1_jit, clamp1, x, arg2, arg3)
+
+        def clamp2(x: torch.Tensor, f: float, ff: int):
+            o = 2. * torch.clamp(x, min=f, max=ff)
+            return o
+        clamp2_jit = torch.jit.script(clamp2)
+        self._run_helper(clamp2_jit, clamp2, x, float(arg2), arg3)
+
+        def clamp3(x: torch.Tensor, f: int, ff: float):
+            o = 2. * torch.clamp(x, min=f, max=ff)
+            return o
+        clamp3_jit = torch.jit.script(clamp3)
+        self._run_helper(clamp3_jit, clamp3, x, arg2, float(arg3))
+
+        def threshold(x: torch.Tensor, th: int, val: int):
+            o = 2. * torch.threshold(x, th, val)
+            return o
+        threshold_jit = torch.jit.script(threshold)
+        self._run_helper(threshold_jit, threshold, x, arg2, arg3)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_ternary_ops_integer_compatibility(self):
+        data_types = [
+            torch.float16,
+            torch.float32,
+            torch.float64
+        ]
+        for dtype in data_types:
+            self._ternary_integer_test_helper(dtype)
+
+    def _ternary_test_helper(self, operation, dtypes, random_data):
+        if isinstance(dtypes, tuple):
+            dtype_arg1, dtype_arg2, dtype_arg3 = dtypes
+        else:
+            dtype_arg1 = dtype_arg2 = dtype_arg3 = dtypes
+
+        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, alpha: torch.Tensor):
+            o = operation(x, y, z)
+            o = o + alpha
+            return o
+
+        shape = (4, 32, 32)
+        if operation is torch.where:
+            dtype_arg1 = torch.bool
+            if random_data:
+                x = torch.randint(0, 2, shape).to(dtype=torch.bool, device="cuda")
+                y = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg2)
+                z = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg3)
+            else:
+                x = torch.randint(0, 2, self.special_values.size()).to(dtype=torch.bool, device="cuda")
+                y = self.special_values.to(dtype=dtype_arg2)
+                z = (torch.rand_like(self.special_values) * 5).to(dtype_arg3)
+        elif random_data:
+            x = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg1)
+            y = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg2)
+            z = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg3)
+        else:
+            x = self.special_values.to(dtype=dtype_arg1)
+            y = (torch.rand_like(self.special_values) * 5).to(dtype_arg2)
+            z = (torch.rand_like(self.special_values) * 5).to(dtype_arg3)
+        alpha = torch.tensor([2], device="cuda").to(dtype_arg1)
+
+        o = t(x, y, z, alpha)
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y, z, alpha)
+        jit_o = t_jit(x, y, z, alpha)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o, jit_o)
+        self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_ternary_ops_type_promotion(self):
+        # TODO: update accuracy tolerance for bf16 / fp16 data types
+        data_types = [
+            # torch.float16,
+            torch.float32,
+            torch.float64
+        ]
+        '''
+        if TEST_BF16:
+            data_types.append(torch.bfloat16)
+        '''
+        # TODO: Add Tensor support for clamp
+        operations = [torch.clamp]
+        ternary_dtype_combinations = itertools.combinations(data_types, 3)
+        for op, dtypes in itertools.product(operations, ternary_dtype_combinations):
+            self._ternary_test_helper(op, dtypes, True)  # random data
+            self._ternary_test_helper(op, dtypes, False)  # special numbers
+
+    # We can't test the scalar version of rsub from python
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective")
+    def test_rsub(self):
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+
+        def rsub(x: torch.Tensor, y: torch.Tensor):
+            o = torch.rsub(x, y)
+            o = o * 2.
+            return o
+
+        rsub_jit = torch.jit.script(rsub)
+        self._run_helper(rsub_jit, rsub, x, y)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    # legacy fuser does not work for rand_like, see issue #34361
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective")
+    def test_ternary_ops(self):
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        z = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        cond = torch.randint(0, 2, (4, 8, 32, 32)).to(dtype=torch.bool, device="cuda")
+
+        def add(x: torch.Tensor, other: torch.Tensor, alpha: float):
+            o = torch.relu(x)
+            o = torch.add(o, other=other, alpha=alpha)
+            return o
+        add_jit = torch.jit.script(add)
+        self._run_helper(add_jit, add, x, y, 2.0)
+
+        def clamp0(x: torch.Tensor, f: float):
+            o = 2. * torch.clamp(x, min=f)
+            return o
+        clamp0_jit = torch.jit.script(clamp0)
+        self._run_helper(clamp0_jit, clamp0, x, 0.5)
+
+        def clamp1(x: torch.Tensor, f: float, ff: float):
+            o = 2. * torch.clamp(x, min=f, max=ff)
+            return o
+        clamp1_jit = torch.jit.script(clamp1)
+        self._run_helper(clamp1_jit, clamp1, x, -0.2, 0.7)
+
+        def threshold(x: torch.Tensor, th: float, val: float):
+            o = 2. * torch.threshold(x, th, val)
+            return o
+        threshold_jit = torch.jit.script(threshold)
+        self._run_helper(threshold_jit, threshold, x, 0.2, 0.9)
+
+        def where(x: torch.Tensor, y: torch.Tensor, cond: torch.Tensor):
+            o = 2. * torch.where(cond, x, y)
+            return o
+        where_jit = torch.jit.script(where)
+        self._run_helper(where_jit, where, x, y, cond)
+
+        def lerp(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+            o = 2. * torch.lerp(x, y, z)
+            return o
+        lerp_jit = torch.jit.script(lerp)
+        self._run_helper(lerp_jit, lerp, x, y, z)
+
+        def lerp_scale(x: torch.Tensor, y: torch.Tensor, z: float):
+            o = 2. * torch.lerp(x, y, z)
+            return o
+        lerp_scale_jit = torch.jit.script(lerp_scale)
+        self._run_helper(lerp_scale_jit, lerp_scale, x, y, 0.5)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires profiling node to run cuda fuser")
+    def test_addcmul_ops(self):
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        z = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+
+        def addcmul(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, value: float):
+            o = torch.add(x, 0.5)
+            o = torch.addcmul(o, y, z, value=value)
+            return o
+        addcmul_jit = torch.jit.script(addcmul)
+        self._run_helper(addcmul_jit, addcmul, x, y, z, 2.0)
+
+        def addcmul_no_alpha(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+            o = torch.add(x, 0.5)
+            o = torch.addcmul(o, y, z)
+            return o
+        addcmul_no_alpha_jit = torch.jit.script(addcmul_no_alpha)
+        self._run_helper(addcmul_no_alpha_jit, addcmul_no_alpha, x, y, z)
+
+        def addcmul_const_alpha(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+            o = torch.add(x, 0.5)
+            o = torch.addcmul(o, y, z, value=0.75)
+            return o
+        addcmul_const_alpha_jit = torch.jit.script(addcmul_const_alpha)
+        self._run_helper(addcmul_const_alpha_jit, addcmul_const_alpha, x, y, z)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_dynamic_size(self):
+        old_guard = torch._C._jit_set_nvfuser_guard_mode(True)
+        torch._C._jit_set_bailout_depth(20)
+
+        def t(x: torch.Tensor, y: torch.Tensor, z: float):
+            o = x + y
+            o = o + z
+            return o
+        t_jit = torch.jit.script(t)
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(32, 32, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, 2.0)
+        jit_o = t_jit(x, y, 2.0)
+        o = t(x, y, 2.0)
+        self.assertEqual(o, jit_o)
+        subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0))
+        self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False)
+
+        # this test is not ideal, as we rely on the bailout to test it and we
+        # don't know a way to verify the bailout graph to validate the proper
+        # fusion.
+        x = torch.randn(8, 32, 16, 8, dtype=torch.float, device="cuda")
+        y = torch.randn(16, 8, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, 2.0)
+        jit_o = t_jit(x, y, 2.0)
+        o = t(x, y, 2.0)
+        self.assertEqual(o, jit_o)
+        self.assertGraphContains(t_jit.graph_for(x, y, 2.0), FUSION_GUARD)
+        x = torch.randn(8, 17, 8, dtype=torch.float, device="cuda")
+        y = torch.randn(8, 17, 1, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, 2.0)
+        jit_o = t_jit(x, y, 2.0)
+        o = t(x, y, 2.0)
+        self.assertEqual(o, jit_o)
+        self.assertGraphContains(t_jit.graph_for(x, y, 2.0), FUSION_GUARD)
+        torch._C._jit_set_nvfuser_guard_mode(old_guard)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_random_topo(self):
+        os.environ["PYTORCH_NVFUSER_DISABLE_FALLBACK"] = "1"
+        self.assertTrue(runDefaultTestWithSeed(28449))
+
+    def _compare(self, desc, inp1, inp2, error):
+        a = inp1.clone()
+        b = inp2.clone()
+        close = torch.allclose(a, b, rtol=error, atol=error, equal_nan=True)
+        if not close:
+            print(desc, close)
+            z = a - b
+            index = (torch.abs(z) >= error + error * torch.abs(b)).nonzero()
+            print("dif    : ", z[index])
+            print("inp1   : ", a[index])
+            print("inp2   : ", b[index])
+            print("maximum difference", z[index].max())
+        return close
+
+    # Permutation helper that applies binary operation between two tensors:
+    #   1. applies separate permutation `perm0` & `perm1` to two inputs
+    #   2. reduce dimension `broadcast_axis` of operand two to size 1
+    # The purpose of this test is to ensure permutation works well in
+    # complicated cases with arbitrary stride order and broadcasting dimensions
+    def _permutation_helper(self, sizes, broadcast_axis, dtype, device, perm0, perm1):
+        def t(x: torch.Tensor, y: torch.Tensor):
+            o = torch.add(x, y)
+            o = torch.relu(o)
+            return o
+
+        x = torch.randn([sizes[i] for i in perm0], dtype=dtype, device=device).permute(
+            [perm0.index(i) for i in range(len(sizes))])
+        if broadcast_axis >= 0:
+            sizes[broadcast_axis] = 1
+        y = torch.randn([sizes[i] for i in perm1], dtype=dtype, device=device).permute(
+            [perm1.index(i) for i in range(len(sizes))])
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        o = t(x, y)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o, jit_o)
+        self.assertEqual(o.stride(), jit_o.stride())
+        self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
+
+    # end-2-end test of permutation & contiguity handling in integration.
+    # we are testing inputs with all combination of permutation order, just to
+    # ensure that integration would be able to generate functionally correct
+    # kernels
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_binary_ops_permutation(self):
+        # note that num_dim is exclusive from len(x), so we are not reducing
+        # to single element (codegen limitation at this moment)
+        x = [7, 8, 12]
+        b_axes = range(-1, len(x))
+        for b_axis in b_axes:
+            for perm0 in itertools.permutations(range(len(x))):
+                for perm1 in itertools.permutations(range(len(x))):
+                    x = [7, 8, 12]
+                    self._permutation_helper(x, b_axis, torch.float32, "cuda", perm0, perm1)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_binary_ops_channels_last_with_bcast(self):
+        device = "cuda"
+        x = torch.randn([4, 3, 2, 5], device=device).to(memory_format=torch.channels_last)
+        w = torch.randn([2, 5], device=device)
+
+        def t(x: torch.Tensor, b: torch.Tensor):
+            o = x + b
+            return torch.relu(o)
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, w)
+        jit_o = t_jit(x, w)
+        jit_o = t_jit(x, w)
+        o = t(x, w)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4))
+        self.assertGraphContains(t_jit.graph_for(x, w), FUSION_GUARD)
+
+    def _reduction_helper(self, sizes, reduction_axis, dtype, device, perm0, perm1, keepdim=False):
+        class MyReduction(torch.nn.Module):
+            __constants__ = ['reduction_axis', 'keepdim']
+
+            def __init__(self):
+                super().__init__()
+                self.reduction_axis = reduction_axis
+                self.keepdim = keepdim
+
+            def forward(self, x: torch.Tensor, y: torch.Tensor):
+                o = torch.add(x, y)
+                o = torch.sum(o, dim=self.reduction_axis, keepdim=self.keepdim)
+                return o
+
+        t = MyReduction()
+
+        x = torch.randn([sizes[i] for i in perm0], dtype=dtype, device=device).permute(
+            [perm0.index(i) for i in range(len(sizes))])
+        y = torch.randn([sizes[i] for i in perm1], dtype=dtype, device=device).permute(
+            [perm1.index(i) for i in range(len(sizes))])
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        o = t(x, y)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        # numerical issues here due to our scheduling.
+        # can't use `self.assertEqual(o, jit_o)`
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4))
+        self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_reduction(self):
+        for x in ([7, 8, 12], [12, 8, 7, 9, 15], [128, 16, 8, 32]):
+            # note that num_dim is exclusive from len(x), so we are not reducing
+            # to single element (codegen limitation at this moment)
+            for num_reduce_dim in range(1, len(x)):
+                for axes in itertools.combinations(range(len(x)), num_reduce_dim):
+                    for keepdim in (True, False):
+                        perm0 = range(len(x))
+                        perm1 = range(len(x))
+                        self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1, keepdim)
+
+    def _layer_norm_autodiff_helper(self, model, grad, shapes, args):
+        jit_model = torch.jit.script(model)
+
+        eps = np.random.random() * 1e-4
+        use_cudnn = bool(np.random.randint(0, 2))
+
+        # profile/optimization runs
+        for i in range(3):
+            jit_o = jit_model(shapes, *args, eps, use_cudnn)
+            jit_o.backward(grad)
+
+        ref_args = [t.detach().clone().requires_grad_() for t in args]
+        [t.grad.zero_() for t in args]
+        jit_o = jit_model(shapes, *args, eps, use_cudnn)
+        jit_o.backward(grad)
+
+        o = model(shapes, *ref_args, eps, use_cudnn)
+        o.backward(grad)
+        self.assertEqual(jit_o, o)
+        for arg, ref_arg in zip(args, ref_args):
+            self.assertEqual(arg.grad, ref_arg.grad)
+
+        # check fusion in fw & bw
+        g = jit_model.graph_for(shapes, *args, eps, use_cudnn)
+        for node in g.nodes():
+            n = node
+        dbg_state = jit_model.get_debug_state()
+        for val in dbg_state.execution_plans.values():
+            v = val
+        state2 = v.code.grad_executor_states()
+        for val in state2[0].execution_plans.values():
+            v2 = val
+        FileCheck().check(FUSION_GUARD).run(g)
+        FileCheck().check(FUSION_GUARD).run(v2.graph)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_layer_norm_autodiff(self):
+        def t_wb(shapes: List[int], x, w, b, eps: float, cudnn: bool):
+            o = torch.layer_norm(x, shapes, w, b, eps, cudnn)
+            o = torch.relu(o)
+            return o
+
+        def t_w(shapes: List[int], x, w, eps: float, cudnn: bool):
+            o = torch.layer_norm(x, shapes, w, None, eps, cudnn)
+            o = torch.relu(o)
+            return o
+
+        def t_b(shapes: List[int], x, b, eps: float, cudnn: bool):
+            o = torch.layer_norm(x, shapes, None, b, eps, cudnn)
+            o = torch.relu(o)
+            return o
+
+        def t(shapes: List[int], x, eps: float, cudnn: bool):
+            o = torch.layer_norm(x, shapes, None, None, eps, cudnn)
+            o = torch.relu(o)
+            return o
+
+        model = {3: t_wb, 2: t_w, 1: t_b, 0: t}
+
+        for w, b in itertools.product([True, False], repeat=2):
+            batch = [2]
+            # note: awkward shape here to avoid vectorized fast kernel, which is
+            # buggy in aten
+            shapes = [2, 7, 3]
+            m = model[w * 2 + b]
+
+            grad = torch.randn(batch + shapes, dtype=torch.float32, device="cuda")
+            args = [torch.randn(batch + shapes, dtype=torch.float32, device="cuda").requires_grad_()]
+            if w:
+                args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_())
+            if b:
+                args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_())
+            self._layer_norm_autodiff_helper(m, grad, shapes, args)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_layer_norm_parser(self):
+        dtype = torch.float32
+        device = "cuda"
+        x = torch.randn([4, 4, 2], dtype=dtype, device=device)
+        w = torch.randn([4, 2], dtype=dtype, device=device)
+        b = torch.randn([4, 2], dtype=dtype, device=device)
+
+        def t(x: torch.Tensor, w: torch.Tensor, b: torch.Tensor):
+            o = torch.relu(x)
+            o = torch.layer_norm(o, [4, 2], w, b, 1e-5)
+            return o
+
+        o = t(x, w, b)
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, w, b)
+        jit_o = t_jit(x, w, b)
+        o = t(x, w, b)
+        self.assertGraphContains(t_jit.graph_for(x, w, b), FUSION_GUARD)
+
+    def _native_layer_norm_helper(self, shape, norm_shape, dtype, device, error, affine=True):
+        class MyLayerNorm(torch.nn.Module):
+            __constants__ = ['norm_shape']
+
+            def __init__(self, elementwise_affine=True):
+                super().__init__()
+                self.norm_shape = norm_shape
+                if elementwise_affine:
+                    self.weight = torch.randn(norm_shape, dtype=dtype, device=device)
+                    self.bias = torch.randn(norm_shape, dtype=dtype, device=device)
+                    with torch.no_grad():
+                        self.weight.fill_(1)
+                        self.bias.fill_(0)
+                else:
+                    self.weight = None
+                    self.bias = None
+
+            def forward(self, x: torch.Tensor):
+                o = torch.relu(x)
+                o = torch.native_layer_norm(o, self.norm_shape, self.weight, self.bias, 1e-5)
+                return o
+
+        t = MyLayerNorm(affine)
+
+        x = torch.randn(shape, dtype=dtype, device=device)
+        t_jit = torch.jit.script(t)
+        jit_o, jit_mean, jit_rstd = t_jit(x)
+        jit_o, jit_mean, jit_rstd = t_jit(x)
+        o, mean, rstd = t(x)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        # numerical issues here due to our scheduling.
+        # can't use `self.assertEqual(o, jit_o)`
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
+        self.assertTrue(self._compare("comparing mean failed", mean, jit_mean, error))
+        self.assertTrue(self._compare("comparing rstd failed", rstd, jit_rstd, error))
+        self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_native_layer_norm(self):
+        dims = 4
+        rnds = 3
+        for idx in range(rnds):
+            for offset in range(1, dims):
+                for affine in (True, False):
+                    input_shape = [random.randint(10, 30) for idx in range(dims)]
+                    norm_shape = [input_shape[idx] for idx in range(dims - offset, dims)]
+                    self._native_layer_norm_helper(input_shape, norm_shape, torch.float32, "cuda", 1e-4, affine)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_native_layer_norm_half(self):
+        dims = 4
+        rnds = 3
+        for idx in range(rnds):
+            for offset in range(1, dims):
+                input_shape = [random.randint(10, 30) for idx in range(dims)]
+                norm_shape = [input_shape[idx] for idx in range(dims - offset, dims)]
+                self._native_layer_norm_helper(input_shape, norm_shape, torch.float16, "cuda", 5e-3)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
+    def test_native_layer_norm_bfloat(self):
+        dims = 4
+        rnds = 3
+        for idx in range(rnds):
+            for offset in range(1, dims):
+                input_shape = [random.randint(10, 30) for idx in range(dims)]
+                norm_shape = [input_shape[idx] for idx in range(dims - offset, dims)]
+                self._native_layer_norm_helper(input_shape, norm_shape, torch.bfloat16, "cuda", 1e-1)
+
+    def _norm_helper(self,
+                     shape,
+                     dtype,
+                     device,
+                     error,
+                     is_batch_norm_else_instance_norm,
+                     memory_format=torch.contiguous_format,
+                     *,
+                     layer_dtype=torch.float32):
+        class MyBatchNorm(torch.nn.Module):
+            def forward(self, x: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor):
+                o = torch.nn.functional.batch_norm(x, r_mean, r_var, training=True)
+                o = torch.relu(o)
+                return o
+
+        class MyInstanceNorm(torch.nn.Module):
+            def forward(self, x: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor):
+                o = torch.nn.functional.instance_norm(x, r_mean, r_var, use_input_stats=True)
+                o = torch.relu(o)
+                return o
+
+        t = MyBatchNorm() if is_batch_norm_else_instance_norm else MyInstanceNorm()
+
+        x = torch.randn(shape, dtype=dtype, device=device).to(memory_format=memory_format)
+        running_mean = torch.zeros(shape[1], dtype=layer_dtype, device=device)
+        running_var = torch.ones(shape[1], dtype=layer_dtype, device=device)
+        t_jit = torch.jit.script(t)
+
+        eager_running_mean = running_mean.clone()
+        eager_running_var = running_var.clone()
+        jit_running_mean = running_mean.clone()
+        jit_running_var = running_var.clone()
+
+        jit_o = t_jit(x, running_mean.clone(), running_var.clone())
+
+        self.assertTrue(self._compare("prerun comparing running_mean failed", eager_running_mean, jit_running_mean, error))
+        self.assertTrue(self._compare("prerun comparing running_var failed", eager_running_var, jit_running_var, error))
+
+        jit_o = t_jit(x, jit_running_mean, jit_running_var)
+        o = t(x, eager_running_mean, eager_running_var)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o.stride(), jit_o.stride())
+        # numerical issues here due to our scheduling.
+        # can't use `self.assertEqual(o, jit_o)`
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
+        self.assertTrue(self._compare("comparing running_mean failed", eager_running_mean, jit_running_mean, error))
+        self.assertTrue(self._compare("comparing running_var failed", eager_running_var, jit_running_var, error))
+        self.assertGraphContains(t_jit.graph_for(x, running_mean, running_var), FUSION_GUARD)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_layer_norm_trivial_reduce_dim(self):
+        def t_wb(shapes: List[int], x, w, b, eps: float, cudnn: bool):
+            o = torch.layer_norm(x, shapes, w, b, eps, cudnn)
+            o = torch.relu(o)
+            return o
+
+        batch = [1]
+        shapes = [2, 7, 3]
+
+        grad = torch.randn(batch + shapes, dtype=torch.float32, device="cuda")
+        args = [torch.randn(batch + shapes, dtype=torch.float32, device="cuda").requires_grad_()]
+        args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_())
+        args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_())
+        self._layer_norm_autodiff_helper(t_wb, grad, shapes, args)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_norm_half_layer(self):
+        size = [2, 4, 2, 2]
+
+        for is_batch_norm_else_instance_norm in [False, True]:
+            for mf in [torch.channels_last, torch.contiguous_format]:
+                self._norm_helper(size, torch.float16, "cuda", 1e-3, is_batch_norm_else_instance_norm,
+                                  memory_format=mf, layer_dtype=torch.float16)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_norm_channels_last(self):
+        size = [3, 4, 5, 6]
+
+        with torch.backends.cudnn.flags(enabled=False):
+            for is_batch_norm_else_instance_norm in [False, True]:
+                for mf in [torch.channels_last, torch.contiguous_format]:
+                    self._norm_helper(size, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm, memory_format=mf)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_norm(self):
+        output_elements = 10000
+        channel_sizes = [67, 457, 1024, 4096]
+
+        with torch.backends.cudnn.flags(enabled=False):
+            for is_batch_norm_else_instance_norm in [False, True]:
+                for dims in range(3, 6):
+                    output_size = int(pow(output_elements, 1. / (dims - 1)))
+                    for C in channel_sizes:
+                        x = [output_size for idx in range(dims)]
+                        x[1] = C
+                        self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm)
+
+    @skipIfRocm
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_norm_large(self):
+        output_elements = 262144
+        channel_sizes = 67, 457, 1024
+
+        for is_batch_norm_else_instance_norm in [True, False]:
+            for dims in range(3, 6):
+                output_size = int(pow(output_elements, 1. / (dims - 1)))
+                for C in channel_sizes:
+                    x = [output_size for idx in range(dims)]
+                    x[1] = C
+                    self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_norm_half(self):
+        output_elements = 10000
+        channel_sizes = [67, 457, 1024, 4096]
+
+        with torch.backends.cudnn.flags(enabled=False):
+            # TODO instance norm on ROCm was giving ~50% incorrect results
+            for is_batch_norm_else_instance_norm in [True] if TEST_WITH_ROCM else [False, True]:
+                for dims in range(3, 6):
+                    output_size = int(pow(output_elements, 1. / (dims - 1)))
+                    for C in channel_sizes:
+                        x = [output_size for idx in range(dims)]
+                        x[1] = C
+                        self._norm_helper(x, torch.float16, "cuda", 5e-3, is_batch_norm_else_instance_norm)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
+    def test_norm_bfloat(self):
+        output_elements = 10000
+        channel_sizes = [67, 457, 1024, 4096]
+
+        with torch.backends.cudnn.flags(enabled=False):
+            # TODO instance norm on ROCm was giving ~50% incorrect results
+            for is_batch_norm_else_instance_norm in [True] if TEST_WITH_ROCM else [False, True]:
+                for dims in range(3, 6):
+                    output_size = int(pow(output_elements, 1. / (dims - 1)))
+                    for C in channel_sizes:
+                        x = [output_size for idx in range(dims)]
+                        x[1] = C
+                        self._norm_helper(x, torch.bfloat16, "cuda", 1e-1, is_batch_norm_else_instance_norm)
+
+    def _softmax_helper(self, shape, reduction_axis, is_log_softmax, dtype, device, error):
+        class MySoftmax(torch.nn.Module):
+            __constants__ = ['reduction_axis']
+
+            def __init__(self):
+                super().__init__()
+                self.reduction_axis = reduction_axis
+
+            def forward(self, x: torch.Tensor, y: torch.Tensor):
+                o = torch.add(x, y)
+                o = torch.nn.functional.softmax(o, dim=self.reduction_axis)
+                return o
+
+        class MyLogSoftmax(torch.nn.Module):
+            __constants__ = ['reduction_axis']
+
+            def __init__(self):
+                super().__init__()
+                self.reduction_axis = reduction_axis
+
+            def forward(self, x: torch.Tensor, y: torch.Tensor):
+                o = torch.add(x, y)
+                o = torch.nn.functional.log_softmax(o, dim=self.reduction_axis)
+                return o
+
+        gradient_check = (dtype == torch.float64)
+        t = MyLogSoftmax() if is_log_softmax else MySoftmax()
+
+        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=gradient_check)
+        y = torch.randn(shape, dtype=dtype, device=device, requires_grad=gradient_check)
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+
+        if gradient_check:
+            gradcheck(t_jit.forward, [x, y], nondet_tol=1e-5)
+        else:
+            o = t(x, y)
+            self.assertEqual(o.dtype, jit_o.dtype)
+            # numerical issues here due to our scheduling.
+            # can't use `self.assertEqual(o, jit_o)`
+            self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
+            self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_softmax_dtype(self):
+        def t(x: torch.Tensor, y: torch.Tensor):
+            o = torch.mul(x, y)
+            o = torch.nn.functional.softmax(o, dim=0, dtype=torch.float32)
+            return o
+
+        x = torch.randn([4, 4], dtype=torch.float16, device="cuda").requires_grad_()
+        y = torch.randn_like(x).requires_grad_()
+        grad = torch.randn_like(x).float()
+
+        ref_x = x.detach().requires_grad_()
+        ref_y = y.detach().requires_grad_()
+        o = t(ref_x, ref_y)
+        o.backward(grad)
+
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y)
+        jit_o.backward(grad)
+        jit_o = t_jit(x, y)
+        jit_o.backward(grad)
+        jit_o = t_jit(x, y)
+        jit_o.backward(grad)
+        x.grad.zero_()
+        y.grad.zero_()
+        jit_o = t_jit(x, y)
+        jit_o.backward(grad)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(ref_x.grad, x.grad)
+        self.assertEqual(ref_y.grad, y.grad)
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-3))
+        self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True)
+        bwd_graph = list(
+            list(t_jit.get_debug_state().execution_plans.values())[
+                0].code.grad_executor_states()[0].execution_plans.values()
+        )[0].graph
+        FileCheck().check(FUSION_GUARD).run(bwd_graph)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test__softmax_function(self):
+        def t(x: torch.Tensor, y: torch.Tensor):
+            o = torch.mul(x, y)
+            o = torch._softmax(o, dim=-1, half_to_float=False)
+            return o
+
+        x = torch.randn([4, 4], dtype=torch.float16, device="cuda")
+        y = torch.randn_like(x)
+
+        o = t(x, y)
+
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-3))
+        self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test__softmax_function_half_to_float(self):
+        def t(x: torch.Tensor, y: torch.Tensor):
+            o = torch.mul(x, y)
+            o = torch._softmax(o, dim=-1, half_to_float=True)
+            return o
+
+        x = torch.randn([4, 4], dtype=torch.float16, device="cuda")
+        y = torch.randn_like(x)
+
+        o = t(x, y)
+
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-3))
+        self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_softmax(self):
+        output_size = 10000
+        dims = 4
+        output_size = int(pow(output_size, 1. / dims))
+        reduction_sizes = [67, 256, 1024, 4096]
+
+        # gradient check
+        for reduction_dim in range(dims):
+            for is_log_softmax in [False, True]:
+                shape = [output_size for idx in range(dims)]
+                self._softmax_helper(shape, reduction_dim, is_log_softmax, torch.float64, "cuda", 1e-4)
+
+        for reduction_dim in range(dims):
+            for reduction_size in reduction_sizes:
+                x = [output_size for idx in range(dims)]
+                x[reduction_dim] = reduction_size
+                for is_log_softmax in [False, True]:
+                    self._softmax_helper(x, reduction_dim, is_log_softmax, torch.float32, "cuda", 1e-4)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_softmax_half(self):
+        output_size = 10000
+        dims = 4
+        output_size = int(pow(output_size, 1. / dims))
+        reduction_sizes = [67, 256, 1024, 4096]
+
+        for reduction_dim in range(dims):
+            for reduction_size in reduction_sizes:
+                x = [output_size for idx in range(dims)]
+                x[reduction_dim] = reduction_size
+                for is_log_softmax in [False, True]:
+                    self._softmax_helper(x, reduction_dim, is_log_softmax, torch.float16, "cuda", 5e-3)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
+    def test_softmax_bfloat(self):
+        output_size = 10000
+        dims = 4
+        output_size = int(pow(output_size, 1. / dims))
+        reduction_sizes = [67, 256, 1024, 4096]
+
+        for reduction_dim in range(dims):
+            for reduction_size in reduction_sizes:
+                x = [output_size for idx in range(dims)]
+                x[reduction_dim] = reduction_size
+                for is_log_softmax in [False, True]:
+                    self._softmax_helper(x, reduction_dim, is_log_softmax, torch.bfloat16, "cuda", 1e-1)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_reduction_permutation(self):
+        x = [7, 8, 12]
+        # note that num_dim is exclusive from len(x), so we are not reducing
+        # to single element (codegen limitation at this moment)
+        for num_reduce_dim in range(1, len(x)):
+            for axes in itertools.combinations(range(len(x)), num_reduce_dim):
+                for perm0 in itertools.permutations(range(len(x))):
+                    for perm1 in itertools.permutations(range(len(x))):
+                        self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_reduction_multiple_output(self):
+        old_guard = torch._C._jit_set_nvfuser_guard_mode(True)
+        torch._C._jit_set_bailout_depth(20)
+
+        def t(x: torch.Tensor, y: torch.Tensor, scale: float, z: torch.Tensor):
+            o = torch.mul(x, y)
+            o = torch.mul(o, scale)
+            out1 = torch.mul(o, z)
+            out2 = torch.sum(out1, dim=[2])
+            return out1, out2
+
+        t_jit = torch.jit.script(t)
+        x = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda")
+        y = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda")
+        z = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda")
+        scale = 0.5
+        jit_o = t_jit(x, y, scale, z)
+        jit_o = t_jit(x, y, scale, z)
+        o = t(x, y, scale, z)
+        for oo, jit_oo in zip(o, jit_o):
+            self.assertEqual(oo.dtype, jit_oo.dtype)
+            self.assertEqual(oo, jit_oo)
+        self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GUARD)
+
+        x = x.to(memory_format=torch.channels_last)
+        y = y.to(memory_format=torch.channels_last)
+        z = z.to(memory_format=torch.channels_last)
+        jit_o = t_jit(x, y, scale, z)
+        jit_o = t_jit(x, y, scale, z)
+        o = t(x, y, scale, z)
+        for oo, jit_oo in zip(o, jit_o):
+            self.assertEqual(oo.dtype, jit_oo.dtype)
+            self.assertEqual(oo, jit_oo)
+        self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GUARD)
+        torch._C._jit_set_nvfuser_guard_mode(old_guard)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_channels_last_with_broadcast(self):
+        # setting this true forces a new graph to be generated with a new
+        # input a different broadcast shape
+        torch._C._jit_set_nvfuser_guard_mode(True)
+
+        def t(x: torch.Tensor, y: torch.Tensor):
+            o = torch.mul(x, y)
+            o = o + 2.0
+            return o
+        t_jit = torch.jit.script(t)
+
+        # Single Channel broadcasts
+        # Test 1
+        x = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda")
+        x = x.to(memory_format=torch.channels_last)
+
+        y = torch.randn(8, 4, 10, 1, dtype=torch.float, device="cuda")
+        y = y.to(memory_format=torch.channels_last)
+
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        o = t(x, y)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o.is_contiguous(memory_format=torch.channels_last),
+                         jit_o.is_contiguous(memory_format=torch.channels_last))
+        self.assertEqual(o, jit_o)
+
+        # Test 2
+        y = torch.randn(8, 4, 1, 16, dtype=torch.float, device="cuda")
+        y = y.to(memory_format=torch.channels_last)
+
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        o = t(x, y)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o.is_contiguous(memory_format=torch.channels_last),
+                         jit_o.is_contiguous(memory_format=torch.channels_last))
+        self.assertEqual(o, jit_o)
+
+        # Test 3
+        y = torch.randn(8, 1, 10, 16, dtype=torch.float, device="cuda")
+        y = y.to(memory_format=torch.channels_last)
+
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        o = t(x, y)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o.is_contiguous(memory_format=torch.channels_last),
+                         jit_o.is_contiguous(memory_format=torch.channels_last))
+        self.assertEqual(o, jit_o)
+
+        # Test 3
+        y = torch.randn(1, 4, 10, 16, dtype=torch.float, device="cuda")
+        y = y.to(memory_format=torch.channels_last)
+
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        o = t(x, y)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o.is_contiguous(memory_format=torch.channels_last),
+                         jit_o.is_contiguous(memory_format=torch.channels_last))
+        self.assertEqual(o, jit_o)
+
+        '''
+        Currently, the JIT doesn't have tensor merge logic to handle adding
+        a broadcast tensor with more than one broadcast into a non-broadcast
+        tensor.  Therefore, either of these tests can fail depending on the
+        sort implementation.  The second test is known to fail.
+
+        # Two Channel broadcasts
+        # Test 1
+        y = torch.randn(8, 4, 1, 1, dtype=torch.float, device="cuda")
+        y = y.to(memory_format=torch.channels_last)
+
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        o = t(x, y)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o.is_contiguous(memory_format=torch.channels_last),
+                         jit_o.is_contiguous(memory_format=torch.channels_last))
+        self.assertEqual(o, jit_o)
+
+        # Test 2
+        y = torch.randn(8, 4, 1, 1, dtype=torch.float, device="cuda")
+        y = y.to(memory_format=torch.channels_last).transpose(2,3)
+        x = x.transpose(2,3)
+
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        o = t(x, y)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o.is_contiguous(memory_format=torch.channels_last),
+                         jit_o.is_contiguous(memory_format=torch.channels_last))
+        self.assertEqual(o, jit_o)
+        '''
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_pw_single_reduction_partition(self):
+        sizes = [2, 2, 2]
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn(sizes, dtype=dtype, device=device)
+        y = torch.randn(sizes, dtype=dtype, device=device)
+        z = torch.randn(sizes, dtype=dtype, device=device)
+
+        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+            o = torch.add(x, y)
+            o = torch.sum(o, dim=[0])
+            o = torch.add(o, z)
+            return o
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y, z)
+        jit_o = t_jit(x, y, z)
+        o = t(x, y, z)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o, jit_o)
+        self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_permutation_preservation(self):
+        sizes = [2, 3, 4, 5]
+        dtype = torch.float
+        device = "cuda"
+
+        with nvfuser_singleton_fusion(True):
+
+            def t(x: torch.Tensor):
+                return torch.relu(x)
+
+            t_jit = torch.jit.script(t)
+            x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
+            self._run_helper(t_jit, t, x, check_stride=True)
+
+            def t(x: torch.Tensor, y: torch.Tensor):
+                return torch.add(x, y)
+
+            t_jit = torch.jit.script(t)
+            x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
+            y = torch.randn(sizes[1:], dtype=dtype, device=device)
+            self._run_helper(t_jit, t, x, y, check_stride=True)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_permutation_preservation_edge_case_0(self):
+        sizes = [2, 3, 4, 5]
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
+        # mismatch rank with *note* different permutation recognized by PE
+        bias = torch.randn(3, dtype=dtype, device=device).unsqueeze(-1).unsqueeze(-1)
+
+        def t(x, y):
+            return x + y
+
+        t_jit = torch.jit.script(t)
+        with nvfuser_singleton_fusion(True):
+            self._run_helper(t_jit, t, x, bias, check_stride=True)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_permutation_preservation_edge_case_1_broken(self):
+        sizes = [2, 3, 4, 5]
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
+        # in-compatible permutation, this will cause format propagation to break
+        bias = torch.randn(4, 5, dtype=dtype, device=device)
+
+        def t(x, y):
+            return x + y
+
+        t_jit = torch.jit.script(t)
+        with nvfuser_singleton_fusion(True):
+            for _ in range(5):
+                jit_o = t_jit(x, bias)
+
+        o = t(x, bias)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o, jit_o)
+        try:
+            # nvfuser does not support in-compatible permutation, this will throw
+            self.assertEqual(o.stride(), jit_o.stride())
+        except Exception as e:
+            warnings.warn(
+                "permutation propagation is broken, proper support should come after nvfuser permutation scheduler update")
+        self.assertGraphContains(t_jit.graph_for(x, bias), FUSION_GUARD)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_permutation_preservation_edge_case_2(self):
+        sizes = [2, 3, 4, 5]
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
+        y = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
+        z = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last)
+
+        def t(x, y, w):
+            tmp = torch.lerp(x, y, w)
+            tmp = torch.clamp(tmp, -1.0, 0.5)
+            tmp = torch.nn.functional.softplus(tmp)
+            return torch.threshold(tmp, -2.0, 0.5)
+
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, y, z, check_stride=True)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_normalization_partition(self):
+        sizes = [3, 8, 5]
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn(sizes, dtype=dtype, device=device)
+        y = torch.randn(sizes, dtype=dtype, device=device)
+        z = torch.randn(sizes, dtype=dtype, device=device)
+        r_m = torch.randn(8, dtype=dtype, device=device)
+        r_v = torch.randn(8, dtype=dtype, device=device)
+
+        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor):
+            o = torch.add(x, y)
+            o = torch.nn.functional.softmax(o, dim=0)
+            o = torch.add(o, z)
+            o = torch.nn.functional.batch_norm(o, r_mean, r_var, training=True)
+            return o
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y, z, r_m, r_v)
+        jit_o = t_jit(x, y, z, r_m, r_v)
+        o = t(x, y, z, r_m, r_v)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o, jit_o)
+        self.assertGraphContains(t_jit.graph_for(x, y, z, r_m, r_v), FUSION_GUARD)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_sum_to_one(self):
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn([4, 5, 6], dtype=dtype, device=device)
+
+        def t(x: torch.Tensor):
+            o = torch.add(x, 1)
+            o = torch.sum(o, dim=[0, 1, 2])
+            return o
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x)
+        jit_o = t_jit(x)
+        o = t(x)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o, jit_o)
+        self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_single_reduction_broadcast(self):
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn([7, 4, 8], dtype=dtype, device=device)
+        y = torch.randn([4, 8], dtype=dtype, device=device)
+        z = torch.randn([1, 4, 8], dtype=dtype, device=device)
+
+        def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor):
+            o = torch.add(x, y)
+            o = torch.add(o, z)
+            o = torch.sum(o, dim=[0])
+            return o
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y, z)
+        jit_o = t_jit(x, y, z)
+        o = t(x, y, z)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o, jit_o)
+        self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_trivial_reduction(self):
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn([1, 4, 8], dtype=dtype, device=device)
+
+        def t(x: torch.Tensor):
+            o = torch.add(x, 1)
+            o = torch.sum(o, dim=[0])
+            o = torch.sum(o, dim=[0])
+            return o
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x)
+        jit_o = t_jit(x)
+        o = t(x)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o, jit_o)
+        self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
+
+    @unittest.skip("Skipped due to rand_like behavior change")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_profiling_node(self):
+        # TODO: should we change this test to not use rand_like, or just
+        # remove this test?
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn(4, 8, 8, 8, dtype=dtype, device=device)
+
+        def repro(x: torch.Tensor, alpha: float):
+            o = torch.rand_like(x)
+            o = torch.add(o, alpha)
+            return o
+        repro_jit = torch.jit.script(repro)
+        self._run_helper(repro_jit, repro, x, 0.6)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_reduction_sizes_op(self):
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn(2, 3, 4, 5, dtype=dtype, device=device)
+        y = torch.randn(2, 3, 4, 5, dtype=dtype, device=device)
+
+        def t(x: torch.Tensor, y: torch.Tensor):
+            o = x + y
+            o = torch.relu(o)
+            o = o.sum((1, 3))
+            return o.size()
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y)
+        jit_o = t_jit(x, y)
+        o = t(x, y)
+        self.assertEqual(o, jit_o)
+        # since the output value is not used at all, the fusion operator should
+        # have been optimized away
+        self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 0)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_profile_ivalue(self):
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn([7, 4, 7], dtype=dtype, device=device)
+        y = torch.randn([7, 4, 7], dtype=dtype, device=device)
+
+        def t(x: torch.Tensor, y: torch.Tensor, dim: List[int], keepdim: bool):
+            o = torch.add(x, y)
+            o = o.sum(dim, keepdim=keepdim)
+            return o
+
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y, (0, 1), False)
+        jit_o = t_jit(x, y, (0, 1), False)
+        o = t(x, y, (0, 1), False)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o, jit_o)
+        self.assertGraphContains(t_jit.graph_for(x, y, (0, 1), False), FUSION_GUARD)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_profile_ivalue_multiple_profiles(self):
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn([7, 4, 7], dtype=dtype, device=device)
+
+        def t(x, num: int):
+            for i in range(num):
+                # varying reduction axes should break profile_ivalue
+                tmp = x.sum(i, keepdim=True)
+                # inplace add on input/output, can't be functionalized/fused
+                x += tmp
+            return x
+
+        with nvfuser_singleton_fusion(True):
+            t_jit = torch.jit.script(t)
+            self._run_helper(t_jit, t, x, 3, num_fusion=0)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_sum_to_size(self):
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn([2, 4, 4], dtype=dtype, device=device)
+        y = torch.randn([2, 4, 4], dtype=dtype, device=device)
+
+        def t(x: torch.Tensor, y: torch.Tensor, new_size: List[int]):
+            o = torch.add(x, y)
+            o = o.sum_to_size(new_size)
+            return o
+
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, y, (4, 1))
+
+        # update shape: old kernel should handle dynamic shape well without
+        # recompilation
+        x = torch.randn([2, 5, 8], dtype=dtype, device=device)
+        y = torch.randn([2, 5, 8], dtype=dtype, device=device)
+        # (TODO) check executed kernel, should extend autograd.profiler to fused
+        # kernels
+        self._run_helper(t_jit, t, x, y, (5, 1))
+
+        with nvfuser_singleton_fusion(True):
+            x = torch.randn([2, 5, 8], dtype=dtype, device=device)
+
+            def t(x: torch.Tensor):
+                # no-op reduction
+                return x.sum_to_size((2, 5, 8))
+
+            t_jit = torch.jit.script(t)
+            self._run_helper(t_jit, t, x)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_grad_sum_to_size(self):
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn([2, 4, 4], dtype=dtype, device=device).requires_grad_()
+        y = torch.randn([4], dtype=dtype, device=device).requires_grad_()
+        grad = torch.randn([2, 4, 4], dtype=dtype, device=device)
+
+        ref_x = x.detach().clone().requires_grad_()
+        ref_y = y.detach().clone().requires_grad_()
+
+        def t(x: torch.Tensor, y: torch.Tensor):
+            o = torch.add(x, y)
+            o = torch.relu(o)
+            return o
+
+        # profiling runs for forward & backward
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, y)
+        jit_o.backward(grad)
+        jit_o = t_jit(x, y)
+        jit_o.backward(grad)
+
+        x.grad = None
+        y.grad = None
+        jit_o = t_jit(x, y)
+        jit_o.backward(grad)
+        o = t(ref_x, ref_y)
+        o.backward(grad)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o, jit_o)
+        self.assertEqual(x.grad, ref_x.grad)
+        self.assertEqual(y.grad, ref_y.grad)
+        bwd_graph = list(
+            list(t_jit.get_debug_state().execution_plans.values())[
+                0].code.grad_executor_states()[0].execution_plans.values()
+        )[0].graph
+        FileCheck().check(FUSION_GUARD).run(bwd_graph)
+
+        # update shape: old kernel should handle dynamic shape well without
+        # recompilation
+        x = torch.randn([2, 5, 8], dtype=dtype, device=device).requires_grad_()
+        y = torch.randn([8], dtype=dtype, device=device).requires_grad_()
+        ref_x = x.detach().clone().requires_grad_()
+        ref_y = y.detach().clone().requires_grad_()
+        grad = torch.randn([2, 5, 8], dtype=dtype, device=device)
+        jit_o = t_jit(x, y)
+        # (TODO) check executed kernel, should extend autograd.profiler to fused
+        # kernels
+        jit_o.backward(grad)
+        o = t(ref_x, ref_y)
+        o.backward(grad)
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertEqual(o, jit_o)
+        self.assertEqual(x.grad, ref_x.grad)
+        self.assertEqual(y.grad, ref_y.grad)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_dropout_inference_fusion(self):
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn([10, 4, 8], dtype=dtype, device=device)
+
+        def t(x: torch.Tensor, p: float, train: bool):
+            o = torch.nn.functional.dropout(x, p, training=train)
+            o = o + 1.0
+            return o
+
+        t_jit = torch.jit.script(t)
+
+        self._run_helper(t_jit, t, x, 0.15, False)
+
+    @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_dropout_train_nograd_fusion(self):
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn([64, 128, 1024], dtype=dtype, device=device)
+
+        def t(x: torch.Tensor, p: float, train: bool):
+            o = torch.nn.functional.dropout(x, p, training=train)
+            o = o + 1.0
+            return o
+
+        t_jit = torch.jit.script(t)
+
+        self._run_helper(t_jit, t, x, 0.0, True, check_runs=20)
+        self._run_helper(t_jit, t, x, 1.0, True, check_runs=20)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_dropout_train_nograd_prob_check(self):
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn([1024, 1024], dtype=dtype, device=device)
+
+        def t(x: torch.Tensor, p: float, train: bool):
+            o = torch.nn.functional.dropout(x, p, training=train)
+            o = o * 2.0
+            return o
+
+        t_jit = torch.jit.script(t)
+
+        for prob in [0.0, 0.15, 0.5, 0.85, 1.]:
+            torch.cuda.manual_seed_all(123)
+            jit_o = t_jit(x, prob, True)
+            torch.cuda.manual_seed_all(123)
+            jit_o = t_jit(x, prob, True)
+
+            self.assertTrue(jit_o.detach().isfinite().all().item())
+
+            num_elems = x.numel()
+            num_zeros = num_elems - jit_o.detach().count_nonzero().item()
+            percent_zeros = num_zeros / num_elems
+
+            self.assertTrue((percent_zeros >= (prob - 0.01)) and (percent_zeros <= (prob + 0.01)))
+            self.assertGraphContainsExactly(t_jit.graph_for(x, prob, True), FUSION_GUARD, 1, consider_subgraphs=True)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_dropout_training_fusion(self):
+        dtype = torch.float
+        device = "cuda"
+        sizes = [2, 3, 4, 5]
+
+        def t(x: torch.Tensor, p: float, train: bool):
+            o = torch.nn.functional.dropout(x, p, training=train)
+            o = o * 2.0
+            return o
+
+        def t2(x: torch.Tensor, p: float, train: bool):
+            o = torch.nn.functional.softmax(x, dim=-1)
+            o = torch.nn.functional.dropout(o, p, training=train)
+            return o
+
+        # disabling cache so new inputs would generate new graph
+        t.__disable_jit_function_caching__ = True
+        t2.__disable_jit_function_caching__ = True
+
+        for fn in [t, t2]:
+            for m_format in [torch.contiguous_format, torch.channels_last]:
+                fn_jit = torch.jit.script(fn)
+                x = torch.randn(sizes, dtype=dtype, device=device, requires_grad=True).to(memory_format=m_format)
+                grads = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=m_format)
+
+                # The drop probability needs to be set to zero given that the order of picking random
+                # numbers between eager mode and the jit is different
+                self._run_training_helper(fn_jit, fn, grads, x, 0.0, True)
+
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_gelu(self):
+        old_guard = torch._C._jit_set_nvfuser_guard_mode(True)
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn([1024, 1024], dtype=dtype, device=device, requires_grad=True)
+        grads = torch.randn([1024, 1024], dtype=dtype, device=device, requires_grad=False)
+
+        def t(x: torch.Tensor, mode: str):
+            o = torch.nn.functional.gelu(x, approximate=mode)
+            o = o * 2.0
+            return o
+
+        t_jit = torch.jit.script(t)
+        self._run_training_helper(t_jit, t, grads, x, 'none')
+        self._run_training_helper(t_jit, t, grads, x, 'tanh')
+        torch._C._jit_set_nvfuser_guard_mode(old_guard)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_dropout_training_prob_check(self):
+        dtype = torch.float
+        device = "cuda"
+        x = torch.randn([1024, 1024], dtype=dtype, device=device, requires_grad=True)
+        x_nograd = torch.randn([1024, 1024], dtype=dtype, device=device)
+
+        def t(x: torch.Tensor, p: float, train: bool):
+            o = torch.nn.functional.dropout(x, p, training=train)
+            o = o * 2.0
+            return o
+
+        t_jit = torch.jit.script(t)
+
+        for prob in [0.0, 0.15, 0.5, 0.85, 1.]:
+            torch.cuda.manual_seed_all(123)
+            jit_o = t_jit(x, prob, True)
+            torch.cuda.manual_seed_all(123)
+            jit_o = t_jit(x, prob, True)
+            torch.cuda.manual_seed_all(123)
+            jit_o = t_jit(x, prob, True)
+
+            self.assertTrue(jit_o.detach().isfinite().all().item())
+
+            num_elems = x.numel()
+            num_zeros = num_elems - jit_o.detach().count_nonzero().item()
+            percent_zeros = num_zeros / num_elems
+
+            self.assertTrue((percent_zeros >= (prob - 0.01)) and (percent_zeros <= (prob + 0.01)))
+            self.assertGraphContainsExactly(t_jit.graph_for(x, prob, True), FUSION_GUARD, 1, consider_subgraphs=True)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_linear(self):
+        in_feature = 2
+        out_feature = 8
+        # Changing the input dims to be 3-D to avoid eager mode bias fusion
+        # The bias fusion causes some precision issues with TF-32
+        weight = torch.randn(out_feature, in_feature, dtype=torch.float32, device='cuda')
+        bias = torch.randn(out_feature, dtype=torch.float32, device='cuda')
+
+        def t(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor):
+            o = torch.nn.functional.linear(x, weight, bias)
+            o = torch.relu(o)
+            return o
+
+        # disabling cache so new inputs would generate new graph
+        t.__disable_jit_function_caching__ = True
+
+        sizes = [in_feature, ]
+        for i in range(4):
+            # increase input rank in each iteration
+            sizes.insert(0, i + 2)
+            x = torch.randn(*sizes, dtype=torch.float32, device='cuda')
+            t_jit = torch.jit.script(t)
+            # fusion only happens for input rank >= 4
+            has_fusion = 0 if len(sizes) < 4 else 1
+            self._run_helper(t_jit, t, x, weight, bias, check_stride=True, num_fusion=has_fusion)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_linear_symbolic_shapes(self):
+        def fn(x: int):
+            y = torch.zeros((3, 4, x, x + 2)).cuda()
+            for i in range(2):
+                inp = torch.rand((3, 4, x, x + i)).cuda()
+                weight = torch.rand((x + 2, x + i)).cuda()
+                bias = torch.rand((x, x + 2)).cuda()
+                y += torch.sin(torch.nn.functional.linear(inp, weight, bias))
+            return y
+
+        fn_s = torch.jit.script(fn)
+        fn_s(5)
+        fn_s(5)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_conv2d_symbolic_shapes(self):
+        def fn(x: int):
+            responses = []
+            for i in range(2):
+                inp = torch.rand((3, 3, 32, 32)).cuda()
+                weight = torch.rand((x + i, 3, 7, 7)).cuda()
+                bias = torch.rand((x + i)).cuda()
+                res = torch.nn.functional.conv2d(inp, weight, bias, padding=3)
+                responses.append(res)
+            return responses
+
+        fn_s = torch.jit.script(fn)
+        fn_s(5)
+        fn_s(5)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_backward_type(self):
+        # not super useful to check gradient of integer/bool, so skipping here
+        type_pairs = [
+            (torch.float, torch.half),
+            (torch.double, torch.half),
+            (torch.float, torch.double),
+        ]
+        if TEST_BF16:
+            type_pairs += [
+                (torch.float, torch.bfloat16),
+                (torch.double, torch.bfloat16),
+            ]
+        for x_type, y_type in type_pairs:
+            x = torch.randn(4, 2, dtype=x_type, device='cuda', requires_grad=True)
+            y = torch.randn(4, 2, dtype=y_type, device='cuda', requires_grad=True)
+            grad = torch.randn(4, 2, dtype=torch.float, device='cuda')
+
+            def test1(x: torch.Tensor, y: torch.Tensor):
+                o = torch.add(x, y)
+                o = torch.add(o, y)
+                o = torch.add(o, y)
+                o = torch.add(o, y)
+                o = o + 1.0
+                return o
+
+            test1_jit = torch.jit.script(test1)
+            for i in range(3):
+                jit_o = test1_jit(x, y)
+                jit_o.backward(grad)
+
+            bwd_graph = list(
+                list(test1_jit.get_debug_state().execution_plans.values())[
+                    0].code.grad_executor_states()[0].execution_plans.values()
+            )[0].graph
+
+            FileCheck().check(FUSION_GROUP).run(bwd_graph)
+            self.assertEqual(x.grad.dtype, x.dtype)
+            self.assertEqual(y.grad.dtype, y.dtype)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_autocast_1(self):
+        def t(x: torch.Tensor, y: torch.Tensor):
+            o = x * 2.0
+            o = torch.softmax(o, dim=-1)
+            o = o * 3.0
+            o = torch._C._nn.linear(o, y)
+            return o
+
+        x = torch.randn(8, 4, dtype=torch.half, device='cuda', requires_grad=True)
+        y = torch.randn(4, 4, dtype=torch.float, device='cuda', requires_grad=True)
+        grad = torch.randn(8, 4, dtype=torch.half, device='cuda', requires_grad=False)
+        t_jit = torch.jit.script(t)
+
+        for i in range(3):
+            with torch.cuda.amp.autocast():
+                jit_o = t_jit(x, y)
+                if i == 2:
+                    fwd_graph = t_jit.graph_for(x, y)
+            jit_o.backward(grad)
+
+        self.assertGraphContainsExactly(fwd_graph, FUSION_GUARD, 1, consider_subgraphs=True)
+
+        with torch.cuda.amp.autocast():
+            bwd_graph = list(
+                list(t_jit.get_debug_state().execution_plans.values())[
+                    0].code.grad_executor_states()[0].execution_plans.values()
+            )[0].graph
+        FileCheck().check(FUSION_GROUP).run(bwd_graph)
+
+        self.assertEqual(jit_o.dtype, torch.half)
+        self.assertEqual(x.grad.dtype, x.dtype)
+        self.assertEqual(y.grad.dtype, y.dtype)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_autocast_2(self):
+        def t(x: torch.Tensor):
+            o = x * 2.0
+            o = torch.softmax(o, dim=-1)
+            o = o * 3.0
+            o = torch.softmax(o, dim=-1)
+            o = o * 4.0
+            return o
+
+        x = torch.randn(8, 4, dtype=torch.half, device='cuda', requires_grad=True)
+        grad = torch.randn(8, 4, dtype=torch.float, device='cuda', requires_grad=False)
+        t_jit = torch.jit.script(t)
+
+        for i in range(3):
+            with torch.cuda.amp.autocast():
+                jit_o = t_jit(x)
+                if i == 2:
+                    fwd_graph = t_jit.graph_for(x)
+            jit_o.backward(grad)
+
+        self.assertGraphContainsExactly(fwd_graph, FUSION_GUARD, 1, consider_subgraphs=True)
+
+        with torch.cuda.amp.autocast():
+            bwd_graph = list(
+                list(t_jit.get_debug_state().execution_plans.values())[
+                    0].code.grad_executor_states()[0].execution_plans.values()
+            )[0].graph
+        FileCheck().check(FUSION_GROUP).run(bwd_graph)
+
+        self.assertEqual(jit_o.dtype, torch.float)
+        self.assertEqual(x.grad.dtype, x.dtype)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
+    def test_autocast_1_bfloat(self):
+        def t(x: torch.Tensor, y: torch.Tensor):
+            o = x * 2.0
+            o = torch.softmax(o, dim=-1)
+            o = o * 3.0
+            o = torch._C._nn.linear(o, y)
+            return o
+
+        x = torch.randn(8, 4, dtype=torch.bfloat16, device='cuda', requires_grad=True)
+        y = torch.randn(4, 4, dtype=torch.float, device='cuda', requires_grad=True)
+        grad = torch.randn(8, 4, dtype=torch.bfloat16, device='cuda', requires_grad=False)
+        t_jit = torch.jit.script(t)
+
+        for i in range(3):
+            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                jit_o = t_jit(x, y)
+                if i == 2:
+                    fwd_graph = t_jit.graph_for(x, y)
+            jit_o.backward(grad)
+
+        self.assertGraphContainsExactly(fwd_graph, FUSION_GUARD, 1, consider_subgraphs=True)
+
+        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+            bwd_graph = list(
+                list(t_jit.get_debug_state().execution_plans.values())[
+                    0].code.grad_executor_states()[0].execution_plans.values()
+            )[0].graph
+        FileCheck().check(FUSION_GROUP).run(bwd_graph)
+
+        self.assertEqual(jit_o.dtype, torch.bfloat16)
+        self.assertEqual(x.grad.dtype, x.dtype)
+        self.assertEqual(y.grad.dtype, y.dtype)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
+    def test_autocast_2_bfloat(self):
+        def t(x: torch.Tensor):
+            o = x * 2.0
+            o = torch.softmax(o, dim=-1)
+            o = o * 3.0
+            o = torch.softmax(o, dim=-1)
+            o = o * 4.0
+            return o
+
+        x = torch.randn(8, 4, dtype=torch.bfloat16, device='cuda', requires_grad=True)
+        grad = torch.randn(8, 4, dtype=torch.float, device='cuda', requires_grad=False)
+        t_jit = torch.jit.script(t)
+
+        for i in range(3):
+            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                jit_o = t_jit(x)
+                if i == 2:
+                    fwd_graph = t_jit.graph_for(x)
+            jit_o.backward(grad)
+
+        self.assertGraphContainsExactly(fwd_graph, FUSION_GUARD, 1, consider_subgraphs=True)
+
+        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+            bwd_graph = list(
+                list(t_jit.get_debug_state().execution_plans.values())[
+                    0].code.grad_executor_states()[0].execution_plans.values()
+            )[0].graph
+        FileCheck().check(FUSION_GROUP).run(bwd_graph)
+
+        self.assertEqual(jit_o.dtype, torch.float)
+        self.assertEqual(x.grad.dtype, x.dtype)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_to_dtype_fp32_to_fp16(self):
+        def t(x: torch.Tensor):
+            o = x * 2.0
+            o = o.to(dtype=torch.half)
+            o = o * 3.0
+            return o
+
+        x = torch.randn(8, 4, dtype=torch.float, device='cuda')
+        t_jit = torch.jit.script(t)
+
+        for i in range(3):
+            jit_o = t_jit(x)
+
+        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
+        self.assertEqual(jit_o.dtype, torch.half)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_to_dtype_fp16_to_fp32(self):
+        def t(x: torch.Tensor):
+            o = x * 2.0
+            o = o.to(dtype=torch.float)
+            o = o * 3.0
+            return o
+
+        x = torch.randn(8, 4, dtype=torch.half, device='cuda')
+        t_jit = torch.jit.script(t)
+
+        for i in range(3):
+            jit_o = t_jit(x)
+
+        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
+        self.assertEqual(jit_o.dtype, torch.float)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_to_dtype_fp16_to_fp16(self):
+        def t(x: torch.Tensor):
+            o = x * 2.0
+            o = o.to(dtype=torch.half)
+            o = o * 3.0
+            return o
+
+        x = torch.randn(8, 4, dtype=torch.half, device='cuda')
+        t_jit = torch.jit.script(t)
+
+        for i in range(3):
+            jit_o = t_jit(x)
+
+        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
+        self.assertEqual(jit_o.dtype, torch.half)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
+    def test_to_dtype_fp32_to_bf16(self):
+        def t(x: torch.Tensor):
+            o = x * 2.0
+            o = o.to(dtype=torch.bfloat16)
+            o = o * 3.0
+            return o
+
+        x = torch.randn(8, 4, dtype=torch.float, device='cuda')
+        t_jit = torch.jit.script(t)
+
+        for i in range(3):
+            jit_o = t_jit(x)
+
+        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
+        self.assertEqual(jit_o.dtype, torch.bfloat16)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
+    def test_to_dtype_bf16_to_fp32(self):
+        def t(x: torch.Tensor):
+            o = x * 2.0
+            o = o.to(dtype=torch.float)
+            o = o * 3.0
+            return o
+
+        x = torch.randn(8, 4, dtype=torch.bfloat16, device='cuda')
+        t_jit = torch.jit.script(t)
+
+        for i in range(3):
+            jit_o = t_jit(x)
+
+        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
+        self.assertEqual(jit_o.dtype, torch.float)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    @unittest.skipIf(not TEST_BF16, "device does not support BFloat16")
+    def test_to_dtype_bf16_to_bf16(self):
+        def t(x: torch.Tensor):
+            o = x * 2.0
+            o = o.to(dtype=torch.bfloat16)
+            o = o * 3.0
+            return o
+
+        x = torch.randn(8, 4, dtype=torch.bfloat16, device='cuda')
+        t_jit = torch.jit.script(t)
+
+        for i in range(3):
+            jit_o = t_jit(x)
+
+        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
+        self.assertEqual(jit_o.dtype, torch.bfloat16)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(not TEST_MULTIGPU, "requires multiple CUDA device")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_multiple_device_pw(self):
+
+        def t(x):
+            o = x + 1.0
+            o = torch.relu(o)
+            return o
+
+        x = torch.randn(2, dtype=torch.float32, device="cuda")
+        t_jit = torch.jit.script(t)
+
+        for i in range(3):
+            jit_o = t_jit(x)
+
+        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
+        torch.cuda.device(1)
+        x = x.to("cuda:1")
+        jit_o = t_jit(x)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_graph_for_with_missing_optimized_engine(self):
+        x = torch.randn(8, 4, 2, dtype=torch.float, device="cuda").requires_grad_()
+
+        def t(x: torch.Tensor, flag: bool):
+            x = x + 1.0
+            x = torch.relu(x)
+            if flag:
+                o = x + 1.0
+                o = torch.relu(o)
+            else:
+                o = x + 2.0
+                o = torch.relu(o)
+            return o
+
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, False)
+        jit_o = t_jit(x, False)
+        jit_o = t_jit(x, True)
+        o = t(x, True)
+        self.assertEqual(o, jit_o)
+        # since the output value is not used at all, the fusion operator should
+        # have been optimized away
+        self.assertGraphContainsExactly(t_jit.graph_for(x, True), FUSION_GUARD, 1, True)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_branches(self):
+        in_feature = 2
+        out_feature = 4
+        x = torch.randn(4, in_feature, dtype=torch.float32, device='cuda')
+        weight = torch.randn(out_feature, in_feature, dtype=torch.float32, device='cuda')
+        bias = torch.randn(out_feature, dtype=torch.float32, device='cuda')
+
+        def t(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, flag: bool):
+            if flag:
+                o = torch.nn.functional.linear(x, weight, bias)
+                o = o + 1.0
+                o = torch.relu(o)
+            else:
+                o = x.sum()
+                o = o + 2.0
+                o = torch.relu(o)
+            return o
+
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x, weight, bias, True)
+        jit_o = t_jit(x, weight, bias, True)
+        o = t(x, weight, bias, True)
+        self.assertEqual(o, jit_o)
+        # since the output value is not used at all, the fusion operator should
+        # have been optimized away
+        self.assertGraphContainsExactly(t_jit.graph_for(x, weight, bias, True), FUSION_GUARD, 1)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_scalar_tensor(self):
+        x = torch.empty([], device="cuda", dtype=torch.float32)
+
+        def t(x: torch.Tensor):
+            o = x + 1.0
+            o = torch.nn.functional.relu(o)
+            return o
+
+        # bias set to true.
+        t_jit = torch.jit.script(t)
+        jit_o = t_jit(x)
+        jit_o = t_jit(x)
+        o = t(x)
+        self.assertEqual(o, jit_o)
+        # since the output value is not used at all, the fusion operator should
+        # have been optimized away
+        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1)
+
+    @unittest.skipIf(os.environ.get('PYTORCH_NO_CUDA_MEMORY_CACHING') is not None,
+                     "skipping graph_rng when caching allocator is disabled")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_graph_rng(self):
+        self.assertTrue(torch._C._jit_nvfuser_enabled())
+        size = 10000
+        a = torch.randn((size,), device="cuda", dtype=torch.float)
+
+        def t(x):
+            o = x + 1.0
+            o = torch.nn.functional.dropout(o, p=0.1)
+            o = o + 1.0
+            o = torch.nn.functional.dropout(o, p=0.1)
+            return o
+
+        t_jit = torch.jit.script(t)
+
+        for _ in range(3):
+            t_jit(a)
+
+        self.assertGraphContainsExactly(t_jit.graph_for(a), FUSION_GUARD, 1)
+
+        # Control (jitted, ungraphed)
+        torch.cuda.manual_seed(5)
+        eager_out = a.clone()
+        for _ in range(3):
+            eager_out = t_jit(eager_out)
+
+        graph_in = a.clone()
+        g = torch.cuda.CUDAGraph()
+        s = torch.cuda.Stream()
+        s.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(s):
+            torch.cuda.manual_seed(5)
+            g.capture_begin()
+            graph_out = t_jit(graph_in)
+            g.capture_end()
+        torch.cuda.current_stream().wait_stream(s)
+        # g is now a jitted, graphed version of t.
+
+        # Runs a (jitted, graphed) -> (jitted, ungraphed) -> (jitted, graphed) sequence.
+        # The ops in the overall sequence should be the same as Control.
+        g.replay()
+        # graph_out is now filled with g's result. Use it as ungraphed input.
+        out = t_jit(graph_out)
+        graph_in.copy_(out)
+        g.replay()
+
+        # If replay() updated RNG state correctly, graph_out should now equal eager_out
+        self.assertEqual(graph_out, eager_out)
+
+    def _test_batch_norm_impl_index_helper(self, batch, c, hw, affine=True,
+                                           track_running_stats=True, train=True,
+                                           dtype=torch.float32):
+        # enabling inlining to avoid counter increment in BN forward
+        torch._C._debug_set_autodiff_subgraph_inlining(True)
+
+        class MyModule(torch.nn.Module):
+            def __init__(self, num_features=10, affine=True, track_running_stats=True):
+                super().__init__()
+                self.bn = torch.nn.BatchNorm2d(num_features,
+                                               1e-5,
+                                               affine=affine,
+                                               track_running_stats=track_running_stats).to(dtype=dtype)
+
+            def forward(self, x):
+                o = self.bn(x)
+                o = o * 2.0
+                return o
+
+        x = torch.randn(batch, c, hw, hw, dtype=torch.float, device="cuda").to(dtype=dtype).requires_grad_()
+        grad = torch.randint(-20, 20, (batch, c, hw, hw), device="cuda").to(dtype=dtype).div(-10)
+
+        my_module = MyModule(c, affine, track_running_stats).cuda()
+        ref_module = MyModule(c, affine, track_running_stats).cuda()
+
+        if not train:
+            my_module.eval()
+            ref_module.eval()
+
+        t_jit = torch.jit.script(my_module)
+        ref_module.load_state_dict(my_module.state_dict())
+
+        ref_x = x.detach().requires_grad_()
+
+        for i in range(0, 3):
+            jit_o = t_jit(x)
+            jit_o.backward(grad)
+
+        # TODO: remove this run?
+        o = ref_module(ref_x)
+        o.backward(grad)
+
+        has_affine = ref_module.bn.weight is not None
+        has_running_stats = ref_module.bn.running_mean is not None
+
+        if has_running_stats:
+            my_module.bn.running_mean.zero_()
+            my_module.bn.running_var.fill_(1.0)
+            ref_module.bn.running_mean.zero_()
+            ref_module.bn.running_var.fill_(1.0)
+
+        # Verify that when train is False, we don't have grad for weight/bias.
+        if has_affine and train:
+            my_module.bn.weight.grad.zero_()
+            my_module.bn.bias.grad.zero_()
+            ref_module.bn.weight.grad.zero_()
+            ref_module.bn.bias.grad.zero_()
+
+        x.grad.zero_()
+        ref_x.grad.zero_()
+
+        # real runs
+        jit_o = t_jit(x)
+        jit_o.backward(grad)
+
+        o = ref_module(ref_x)
+        o.backward(grad)
+
+        # assert forward graph fusion
+        self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1, consider_subgraphs=True)
+        # assert backward graph fusion
+        bwd_graph = list(
+            list(t_jit.get_debug_state().execution_plans.values())[0].code.grad_executor_states()[0]
+            .execution_plans.values())[0].graph
+        self.assertGraphContainsExactly(bwd_graph, FUSION_GUARD, 1, consider_subgraphs=True)
+
+        if TEST_WITH_ROCM:
+            e0 = 1e-3
+            e1 = 1e-2
+            e2 = 1e-2
+        else:
+            e0 = 1e-5 if dtype is not torch.half else 1e-3
+            e1 = 1e-4 if dtype is not torch.half else 1e-3
+            e2 = 1e-3 if dtype is not torch.half else 1e-2
+
+        self.assertTrue(self._compare("comparing output failed", jit_o, o, e0))
+        self.assertTrue(self._compare("comparing input grad failed", x.grad, ref_x.grad, e1))
+        # TODO: switch to welford and reduce this to 1e-5
+        # The 1e-3 looks bad, but we don't have welford in codegen, so numeric
+        # is very different between reference and codegen.
+        if has_affine and train:
+            self.assertTrue(self._compare("comparing weight grad failed",
+                                          my_module.bn.weight.grad,
+                                          ref_module.bn.weight.grad,
+                                          e2))
+            self.assertTrue(self._compare("comparing bias grad failed",
+                                          my_module.bn.bias.grad,
+                                          ref_module.bn.bias.grad,
+                                          e1))
+        if has_running_stats:
+            self.assertTrue(self._compare("comparing running_mean failed",
+                                          my_module.bn.running_mean,
+                                          ref_module.bn.running_mean,
+                                          e0))
+            self.assertTrue(self._compare("comparing running_var failed",
+                                          my_module.bn.running_var,
+                                          ref_module.bn.running_var,
+                                          e0))
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_batch_norm_half(self):
+        with torch.backends.cudnn.flags(enabled=True):
+            setups = [
+                [True, True],
+                [False, False],
+                [True, False],
+                [False, True]]
+            for training_and_track, affine in itertools.product(setups, [True, False]):
+                training, track_running_stats = training_and_track
+                self._test_batch_norm_impl_index_helper(4, 8, 5, affine, track_running_stats, training, torch.half)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_batch_norm_impl_index_inner_bcast(self):
+        # the repro
+        self._test_batch_norm_impl_index_helper(2, 1, 1, False, True, True)
+
+        # running the full set
+        setups = [
+            [True, True],
+            [False, False],
+            [True, False],
+            [False, True]]
+        for training_and_track, affine in itertools.product(setups, [True, False]):
+            training, track_running_stats = training_and_track
+            self._test_batch_norm_impl_index_helper(2, 1, 1, affine, track_running_stats, training)
+
+    @skipIfRocm
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_batch_norm_impl_index_correctness(self):
+        with torch.backends.cudnn.flags(enabled=True):
+            batch = [2, 7, 16]
+            channels = [4, 89, 19, 32]
+            hw = [1, 8, 17, 32]
+
+            # avoid tolerance failure in CI
+            torch.cuda.manual_seed_all(211)
+
+            # failing sizes (2, 1, 1, 1)
+            # failing sizes (2, 89, 8, 8) training False, track True, affine: False
+            for b, c, hw in itertools.product(batch, channels, hw):
+                setups = [
+                    [True, True],
+                    [False, False],
+                    [True, False],
+                    [False, True]]
+                for training_and_track, affine in itertools.product(setups, [True, False]):
+                    training, track_running_stats = training_and_track
+                    self._test_batch_norm_impl_index_helper(b, c, hw, affine, track_running_stats, training)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_softplus_fuser(self):
+        def shifted_softplus(x: torch.Tensor, shift: float):
+            return functional.softplus(x) - shift
+
+        jitted = torch.jit.script(shifted_softplus)
+        inp = torch.randn(4, 2, dtype=torch.float32, device="cuda").requires_grad_()
+        inp_ref = inp.detach().clone().requires_grad_()
+        grad = torch.randn(4, 2, dtype=torch.float32, device="cuda")
+
+        aten_o = shifted_softplus(inp_ref, 0.693147)
+        aten_o.backward(grad)
+        aten_grad = inp_ref.grad
+
+        for i in range(3):
+            jit_o = jitted(inp, 0.693147)
+            inp.grad = None         # avoid accumulation on grad
+            jit_o.backward(grad)
+            jit_grad = inp.grad
+
+        assert torch.allclose(jit_o, aten_o)
+        assert torch.allclose(jit_grad, aten_grad)
+        self.assertGraphContains(jitted.graph_for(inp, 0.693147), FUSION_GROUP, True)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_inplace_removal(self):
+        def t(x: torch.Tensor):
+            o = torch.nn.functional.softmax(x, dim=0)
+            o += x
+            return o.relu_()
+
+        jitted = torch.jit.script(t)
+        inp = torch.randn(4, 2, dtype=torch.float32, device="cuda")
+
+        for i in range(3):
+            jit_o = jitted(inp)
+
+        graph = jitted.graph_for(inp)
+        self.assertGraphContains(graph, FUSION_GROUP, True)
+        self.assertGraphContains(graph, 'aten::add', True)
+        self.assertGraphContains(graph, 'aten::relu', True)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_conv2d_bias(self):
+        def t(x: torch.Tensor, w: torch.Tensor, bias: torch.Tensor):
+            o = torch.nn.functional.conv2d(x, w, bias)
+            return o.relu()
+
+        jitted = torch.jit.script(t)
+        inp = torch.randn(4, 5, 3, 3, dtype=torch.float32, device="cuda")
+        weight = torch.randn(2, 5, 2, 2, dtype=torch.float32, device="cuda")
+        bias = torch.randn(2, dtype=torch.float32, device="cuda")
+
+        for i in range(3):
+            jit_o = jitted(inp, weight, bias)
+
+        graph = jitted.graph_for(inp)
+        self.assertGraphContains(graph, FUSION_GROUP, True)
+
+        def t_not_fused(x: torch.Tensor, w: torch.Tensor):
+            o = torch.nn.functional.conv2d(x, w)
+            return o.relu()
+
+        jitted_not_fused = torch.jit.script(t_not_fused)
+
+        for i in range(3):
+            jit_o = jitted_not_fused(inp, weight)
+
+        graph = jitted_not_fused.graph_for(inp)
+        self.assertGraphContainsExactly(graph, FUSION_GROUP, 0)
+        self.assertGraphContains(graph, 'aten::relu', True)
+
+        def t_bias(x: torch.Tensor, w: torch.Tensor, bias: torch.Tensor):
+            o = torch.nn.functional.conv2d(x, w, bias)
+            return o.relu()
+
+        jitted_bias = torch.jit.script(t_bias)
+
+        for i in range(3):
+            jit_o = jitted_bias(inp, weight, bias)
+
+        graph = jitted_bias.graph_for(inp)
+        self.assertGraphContains(graph, FUSION_GROUP, True)
+        self.assertGraphContains(graph, 'prim::add_optional', True)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_remove_output_used_only_in_dtype(self):
+        class MyModule(torch.nn.Module):
+            def __init__(self, num_features=4):
+                super().__init__()
+                self.bn0 = torch.nn.BatchNorm2d(num_features)
+                self.bn1 = torch.nn.BatchNorm2d(num_features)
+
+            def forward(self, x, y):
+                o1 = self.bn0(x)
+                o2 = self.bn1(y)
+                return torch.relu(o1 + o2)
+
+        t = MyModule(4).float().cuda()
+
+        jitted = torch.jit.script(t)
+        x = torch.randn(3, 4, 2, 5, dtype=torch.float32, device="cuda")
+        y = torch.randn(3, 4, 2, 5, dtype=torch.float32, device="cuda")
+
+        with torch.cuda.amp.autocast(True):
+            for i in range(5):
+                jit_o = jitted(x, y)
+
+            jit_o = jitted(x, y)
+            o = t(x, y)
+
+            self.assertTrue(torch.allclose(jit_o, o))
+            graph = jitted.graph_for(x, y)
+            self.assertGraphContains(graph, FUSION_GROUP, True)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_fix_shape_expression_bn(self):
+        class MyModule(torch.nn.Module):
+            def __init__(self, num_features=4):
+                super().__init__()
+                self.bn = torch.nn.BatchNorm2d(num_features)
+
+            def forward(self, x, y):
+                out1 = self.bn(x)
+                out2 = out1 + y
+                out3 = torch.relu(out2)
+                return out3
+
+        t = MyModule(4).float().cuda()
+
+        jitted = torch.jit.script(t)
+        x = torch.randn(3, 4, 2, 5, dtype=torch.float32, device="cuda")
+        y = torch.randn(3, 4, 2, 5, dtype=torch.float32, device="cuda")
+
+        with torch.cuda.amp.autocast(True):
+            for i in range(5):
+                jit_o = jitted(x, y)
+
+            jit_o = jitted(x, y)
+            o = t(x, y)
+
+            self.assertTrue(torch.allclose(jit_o, o))
+            graph = jitted.graph_for(x, y)
+            self.assertGraphContains(graph, FUSION_GROUP, True)
+
+    def _run_fwd_helper(self, func, ops, *args):
+        jitted = torch.jit.script(func)
+        for i in range(3):
+            jit_o = jitted(*args)
+        jit_o = jitted(*args)
+        o = func(*args)
+        for oo, jit_oo in zip(o, jit_o):
+            self.assertEqual(oo.dtype, jit_oo.dtype)
+            self.assertEqual(oo, jit_oo)
+        graph = jitted.graph_for(*args)
+        self.assertGraphContains(graph, FUSION_GROUP, True)
+        for op in ops:
+            self.assertGraphContainsExactly(graph, op, 0)
+
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_sibling_fusion(self):
+        device = "cuda"
+        dtype = torch.float
+        x = torch.randn(2, 5, dtype=dtype, device=device)
+        y = torch.randn(2, 5, dtype=dtype, device=device)
+
+        def t(x: torch.Tensor):
+            o1 = x + 1.0
+            o2 = x * 0.5
+            return o1, o2
+        self._run_fwd_helper(t, ['aten::add', 'aten::mul'], x)
+
+        def t2(x: torch.Tensor, y: torch.Tensor):
+            o1 = x.sum(0)
+            o2 = (x * y).sum(0)
+            return o1, o2
+        self._run_fwd_helper(t2, ['aten::sum', 'aten::mul'], x, y)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_clean_profile_ivalue(self):
+        device = "cuda"
+        dtype = torch.float
+        x = torch.randn(2, 5, dtype=dtype, device=device, requires_grad=True)
+        # turn on autodiff subgraph inlining
+        # this is to verify that we clean up profile_ivalue node out side of
+        # fusion code path.
+        torch._C._debug_set_autodiff_subgraph_inlining(True)
+
+        def t(x: torch.Tensor, flag: bool):
+            return torch.dropout(x, 0.5, flag)
+
+        jit_t = torch.jit.script(t)
+        for idx in range(5):
+            out = jit_t(x, True)
+
+        graph = jit_t.graph_for(x, True)
+        out = jit_t(x, False)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_sibling_fusion_no_scalar_inputs(self):
+        device = "cuda"
+        dtype = torch.float
+        x = torch.randn(2, 5, dtype=dtype, device=device)
+        y = torch.randn(3, dtype=dtype, device=device)
+
+        # no tensor dependency between o1/o2, we shouldn't be fusing them
+        def t(x: torch.Tensor, y: torch.Tensor):
+            o1 = x + 1
+            o2 = y - 1
+            return o1, o2
+
+        jitted = torch.jit.script(t)
+        for i in range(3):
+            jit_o = jitted(x, y)
+        graph = jitted.graph_for(x, y)
+        self.assertGraphContainsExactly(graph, FUSION_GROUP, 0)
+
+    def _bias_view_relu_helper(self, shape, output_shape, dtype, device, error):
+        class BiasViewRelu(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False)
+                with torch.no_grad():
+                    self.bias.fill_(10)
+
+            def forward(self, inputs: torch.Tensor, view_shape: List[int]):
+                o = inputs + self.bias
+                o = o.view(view_shape)
+                return torch.relu(o)
+
+        t = BiasViewRelu()
+        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        t_jit = torch.jit.script(t)
+
+        # profiling
+        jit_o = t_jit(x, output_shape)
+        # optimization
+        jit_o = t_jit(x, output_shape)
+        # final
+        jit_o = t_jit(x, output_shape)
+        # eager - baseline
+        o = t(x, output_shape)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
+        graph = t_jit.graph_for(x, output_shape)
+
+        has_inferred_dimension = any([dim == -1 for dim in output_shape])
+        if has_inferred_dimension:
+            # prohibit fusing when view_shape contains an inferred dimension
+            self.assertGraphContainsExactly(graph, FUSION_GROUP, 0)
+            self.assertGraphContainsExactly(graph, 'prim::view_copy', 0)
+        else:
+            self.assertGraphContains(graph, FUSION_GUARD)
+            self.assertGraphContains(graph, 'prim::view_copy', True)
+
+    def _alias_bias_view_relu_helper(self, shape, output_shape, dtype, device, error):
+        class BiasViewRelu(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False)
+                with torch.no_grad():
+                    self.bias.fill_(10)
+
+            def forward(self, inputs : torch.Tensor, bias : torch.Tensor, view_shape : List[int]):
+                o = inputs.view(view_shape)
+                inputs.add_(bias)
+                return torch.relu(o)
+
+        t = BiasViewRelu()
+        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        t_jit = torch.jit.script(t)
+
+        # profiling
+        jit_o = t_jit(x.clone(), bias, output_shape)
+        # optimization
+        jit_o = t_jit(x.clone(), bias, output_shape)
+        # final
+        jit_o = t_jit(x.clone(), bias, output_shape)
+        # eager - baseline
+        o = t(x.clone(), bias, output_shape)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
+        graph = t_jit.graph_for(x, bias, output_shape)
+        self.assertGraphContainsExactly(graph, FUSION_GUARD, 0)
+        self.assertGraphContainsExactly(graph, 'prim::view_copy', 0)
+
+    # generate random view given original view
+    def _random_view(self, original_view, max_len=8, max_views=10000):
+        class Moves(enum.Enum):
+            Merge = 0
+            Split = 1
+            Broadcast = 2
+            ImplicitBroadcast = 3
+            Keep = 4
+
+        def valid(old_view, new_view):
+            old_view_size = reduce(operator.mul, old_view)
+            new_view_size = reduce(operator.mul, new_view)
+            return old_view_size == new_view_size
+
+        # given a random starting number, find the nearest divisor
+        def find_nearest_divisor(N):
+            if 2 >= (N - 1):
+                return -1
+            result = random.randint(2, N - 1)
+            while (N % result) != 0:
+                result += 1
+            return result
+
+        complete_views = {tuple(original_view)}
+
+        to_visit = []
+        # empty new view, curent originaal view, start pos=0, move count = 0, last_move
+        to_visit.append(([], original_view, 0, [], Moves.Keep))
+
+        # depth-first search of view shapes, starting from the original view
+        while len(to_visit) > 0 and len(complete_views) < max_views:
+            new_view, old_view, odx, move_list, last_move = to_visit[-1]
+            to_visit.pop()
+
+            # iterate over each move type
+            for idx in range(len(Moves)):
+                state = Moves(idx)
+                new_view_clone = copy.deepcopy(new_view)
+                old_view_clone = copy.deepcopy(old_view)
+                new_move_list = move_list + [state]
+                new_odx = odx
+
+                # Update state using Move state
+                if state == Moves.Keep:
+                    new_size = old_view_clone[odx]
+                    new_view_clone.append(new_size)
+                    new_odx += 1
+
+                elif state == Moves.Merge:
+                    if odx + 1 < len(old_view_clone):
+                        new_size = old_view_clone[odx] * old_view_clone[odx + 1]
+                        new_view_clone.append(new_size)
+                        new_odx += 2
+                    else:
+                        continue
+
+                elif state == Moves.Broadcast and last_move != Moves.Broadcast:
+                    new_view_clone.append(1)
+
+                elif state == Moves.Split:
+                    new_size = find_nearest_divisor(old_view_clone[odx])
+                    if new_size == -1:
+                        continue
+                    new_view_clone.append(new_size)
+                    old_view_clone[odx] = int(old_view[odx] / new_size)
+
+                    if old_view_clone[odx] == 1:
+                        new_odx += 1
+
+                elif state == Moves.ImplicitBroadcast:
+                    old_view_clone.insert(odx + 1, 1)
+                    new_size = old_view[odx] * 1
+                    new_view_clone.append(new_size)
+                    new_odx += 2
+
+                if new_odx < len(old_view_clone) and len(new_move_list) < max_len:
+                    to_visit.append((new_view_clone, old_view_clone, new_odx, new_move_list, state))
+                elif (valid(original_view, new_view_clone)):
+                    final_new_view = tuple(new_view_clone)
+                    complete_views.add(final_new_view)
+        return list(complete_views)
+
+    # ndims - number of dimensions
+    # test_fn - view test function
+    def _view_test_generator(self, ndims, test_fn):
+        # create random tensor
+        # max value for each dimension
+        max_size = 10e7
+        max_value = max(int(pow(max_size, 1. / ndims)), 1)
+        sizes = [random.randint(1, max_value) for idx in range(ndims)]
+        x = torch.randn(sizes)
+
+        original_sizes = list(x.size())
+        all_views = self._random_view(original_sizes)
+        random.shuffle(all_views)
+
+        max_samples = 20
+        max_views = min(len(all_views), max_samples)
+        total = 0
+        correct = 0
+        # test random combinations of compatible views
+        for idx in range(max_views):
+            for jdx in range(idx + 1, max_views):
+                total += 1
+                test_fn(all_views[idx], all_views[jdx], torch.float, 'cuda', 1e-6)
+
+    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since view is disabled now")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_view(self):
+        torch._C._jit_set_nvfuser_guard_mode(True)
+        self._bias_view_relu_helper([2, 3, 4, 5], [-1, 4, 5], torch.float, 'cuda', 1e-6)
+        for ndims in range(1, 5):
+            self._view_test_generator(ndims, self._bias_view_relu_helper)
+        self._alias_bias_view_relu_helper([2, 3, 4, 5], [1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6)
+
+    def _bias_flatten_relu_helper(self, shape, start_dim, end_dim, dtype, device, error):
+        class BiasFlattenRelu(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False)
+                with torch.no_grad():
+                    self.bias.fill_(10)
+
+            def forward(self, inputs : torch.Tensor, start_dim : int, end_dim : int):
+                o = inputs + self.bias
+                o = o.flatten(start_dim, end_dim)
+                return torch.relu(o)
+
+        t = BiasFlattenRelu()
+        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        t_jit = torch.jit.script(t)
+
+        self._run_helper(t_jit, t, x, start_dim, end_dim)
+        self.assertGraphContains(t_jit.graph_for(x, start_dim, end_dim), 'prim::flatten_copy', True)
+
+    def _alias_bias_flatten_relu_helper(self, shape, start_dim, end_dim, dtype, device, error):
+        class BiasFlattenRelu(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False)
+                with torch.no_grad():
+                    self.bias.fill_(10)
+
+            def forward(self, inputs : torch.Tensor, bias : torch.Tensor, start_dim : int, end_dim : int):
+                o = inputs.flatten(start_dim, end_dim)
+                inputs.add_(bias)
+                return torch.relu(o)
+
+        t = BiasFlattenRelu()
+        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        t_jit = torch.jit.script(t)
+
+        # profiling
+        jit_o = t_jit(x.clone(), bias, start_dim, end_dim)
+        # optimization
+        jit_o = t_jit(x.clone(), bias, start_dim, end_dim)
+        # final
+        jit_o = t_jit(x.clone(), bias, start_dim, end_dim)
+        # eager - baseline
+        o = t(x.clone(), bias, start_dim, end_dim)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
+        graph = t_jit.graph_for(x, bias, start_dim, end_dim)
+
+        self.assertGraphContainsExactly(graph, FUSION_GUARD, 0)
+        self.assertGraphContainsExactly(graph, 'prim::flatten_copy', 0)
+
+    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since flatten is disabled now")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_flatten(self):
+        torch._C._jit_set_nvfuser_guard_mode(True)
+        self._bias_flatten_relu_helper([2, 3, 4, 5], 0, -1, torch.float, 'cuda', 1e-6)
+        self._bias_flatten_relu_helper([2, 3, 4, 5], 1, -1, torch.float, 'cuda', 1e-6)
+        self._bias_flatten_relu_helper([2, 3, 4, 5], 2, -1, torch.float, 'cuda', 1e-6)
+        self._bias_flatten_relu_helper([2, 3, 4, 5], 0, 3, torch.float, 'cuda', 1e-6)
+        self._bias_flatten_relu_helper([2, 3, 4, 5], 1, 2, torch.float, 'cuda', 1e-6)
+        self._bias_flatten_relu_helper([2, 3, 4, 5], 2, 2, torch.float, 'cuda', 1e-6)
+        self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 0, -1, torch.float, 'cuda', 1e-6)
+        self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 1, -1, torch.float, 'cuda', 1e-6)
+        self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 2, -1, torch.float, 'cuda', 1e-6)
+        self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 0, 3, torch.float, 'cuda', 1e-6)
+        self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 1, 2, torch.float, 'cuda', 1e-6)
+        self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 2, 2, torch.float, 'cuda', 1e-6)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_strict_fusion(self):
+        def success(x):
+            with torch.jit.strict_fusion():
+                return x + x + x
+
+        scripted = self.checkScript(success, (torch.rand([4], device='cuda'),))
+        g = torch.jit.last_executed_optimized_graph()
+        FileCheck().check_not("aten::add").check("prim::CudaFusionGroup").run(g)
+
+        def failure(x):
+            with torch.jit.strict_fusion():
+                return x + torch.mm(x, x) + x
+
+        with self.assertRaises(Exception) as error_out:
+            foo_s = torch.jit.script(failure)
+            foo_s(torch.rand([4, 4]))
+            foo_s(torch.rand([4, 4]))
+
+        fc = FileCheck().check("Found unfused operators")
+        fc.check("aten::mm").run(str(error_out.exception))
+
+    def _ltc_helper(self, shape, dtype, device, error, approximate=True):
+        # modeled after LTC linear layer
+        class LTC(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.weight = torch.nn.Parameter(torch.randn([1024, 1024], dtype=dtype, device=device), requires_grad=False)
+                self.bias = torch.nn.Parameter(torch.randn([1, 1024], dtype=dtype, device=device), requires_grad=False)
+
+            def forward(self, inputs : torch.Tensor):
+                o = inputs.view([32768, 1024])
+                o = torch.mm(o, self.weight)
+                o = o.view([256, 128, 1024])
+                o = o + self.bias
+                o = o.view([32768, 1024])
+                o = o.view([256, 128, 1024])
+                return torch.nn.functional.gelu(o)
+
+        t = LTC()
+        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        t_jit = torch.jit.script(t)
+
+        # profile/optimization runs
+        for i in range(3):
+            jit_o = t_jit(x)
+        o = t(x)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
+        graph = t_jit.graph_for(x)
+        self.assertGraphContains(graph, FUSION_GUARD)
+        self.assertGraphContains(graph, 'prim::view_copy', True)
+
+    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since view is disabled now")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_nested_view(self):
+        self._ltc_helper([256, 128, 1024], torch.float, 'cuda', 1e-6)
+
+    def _bias_squeeze_relu_helper(self, shape, dtype, device, error):
+        class BiasSqueezeRelu(torch.nn.Module):
+            def forward(self, inputs: torch.Tensor, bias: torch.Tensor):
+                o = inputs + bias
+                o = torch.squeeze(o)
+                return torch.relu(o)
+
+        t = BiasSqueezeRelu()
+        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        t_jit = torch.jit.script(t)
+
+        jit_o = t_jit(x, bias)
+        jit_o = t_jit(x, bias)
+        jit_o = t_jit(x, bias)
+        o = t(x, bias)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
+        graph = t_jit.graph_for(x, bias)
+        self.assertGraphContains(graph, FUSION_GUARD)
+        self.assertGraphContains(graph, 'prim::squeeze_copy', True)
+
+    def _alias_bias_squeeze_relu_helper(self, shape, dtype, device, error):
+        class BiasSqueezeRelu(torch.nn.Module):
+            def forward(self, inputs: torch.Tensor, bias: torch.Tensor):
+                o = torch.squeeze(inputs)
+                inputs.add_(bias)
+                return torch.relu(o)
+
+        t = BiasSqueezeRelu()
+        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        t_jit = torch.jit.script(t)
+
+        jit_o = t_jit(x.clone(), bias)
+        jit_o = t_jit(x.clone(), bias)
+        jit_o = t_jit(x.clone(), bias)
+        o = t(x.clone(), bias)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
+        graph = t_jit.graph_for(x, bias)
+        self.assertGraphContainsExactly(graph, FUSION_GUARD, 0)
+        self.assertGraphContainsExactly(graph, 'prim::squeeze_copy', 0)
+
+    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_squeeze(self):
+        self._bias_squeeze_relu_helper([1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6)
+        self._alias_bias_squeeze_relu_helper([1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6)
+
+    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now")
+    # remove this after opinfo tests are enabled
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_squeeze_zero(self):
+        x = torch.tensor(1.0, dtype=torch.float, device="cuda")
+
+        def squeeze_0(x: torch.Tensor):
+            o = x + 1.
+            o = torch.squeeze(o, 0)
+            o = o * 2.
+            return o
+
+        def squeeze_1(x: torch.Tensor):
+            o = x + 1.
+            o = torch.squeeze(o, -1)
+            o = o + .5
+            return o
+
+        squeeze_0_jit = torch.jit.script(squeeze_0)
+        self._run_helper(squeeze_0_jit, squeeze_0, x)
+        squeeze_1_jit = torch.jit.script(squeeze_1)
+        self._run_helper(squeeze_1_jit, squeeze_1, x)
+
+    def _bias_unsqueeze_relu_helper(self, shape, dtype, device, error):
+        class BiasUnsqueezeRelu(torch.nn.Module):
+            def forward(self, inputs: torch.Tensor, bias: torch.Tensor):
+                o = inputs + bias
+                o = torch.unsqueeze(o, 0)
+                return torch.relu(o)
+
+        t = BiasUnsqueezeRelu()
+        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        t_jit = torch.jit.script(t)
+
+        jit_o = t_jit(x, bias)
+        jit_o = t_jit(x, bias)
+        jit_o = t_jit(x, bias)
+        o = t(x, bias)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
+        graph = t_jit.graph_for(x, bias)
+        self.assertGraphContains(graph, FUSION_GUARD)
+        self.assertGraphContains(graph, 'prim::unsqueeze_copy', True)
+
+    def _alias_bias_unsqueeze_relu_helper(self, shape, dtype, device, error):
+        class BiasUnsqueezeRelu(torch.nn.Module):
+            def forward(self, inputs : torch.Tensor, bias : torch.Tensor):
+                o = torch.unsqueeze(inputs, 0)
+                inputs.add_(bias)
+                return torch.relu(o)
+
+        t = BiasUnsqueezeRelu()
+        x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False)
+        t_jit = torch.jit.script(t)
+
+        jit_o = t_jit(x.clone(), bias)
+        jit_o = t_jit(x.clone(), bias)
+        jit_o = t_jit(x.clone(), bias)
+        o = t(x.clone(), bias)
+
+        self.assertEqual(o.dtype, jit_o.dtype)
+        self.assertTrue(self._compare("comparing output failed", o, jit_o, error))
+        graph = t_jit.graph_for(x, bias)
+        self.assertGraphContainsExactly(graph, FUSION_GUARD, 0)
+        self.assertGraphContainsExactly(graph, 'prim::unsqueeze_copy', 0)
+
+    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_unsqueeze(self):
+        self._bias_unsqueeze_relu_helper([2, 3, 4, 5], torch.float, 'cuda', 1e-6)
+        self._alias_bias_unsqueeze_relu_helper([2, 3, 4, 5], torch.float, 'cuda', 1e-6)
+
+    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since unsqueeze is disabled now")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_alias_pass_fix(self):
+        x = torch.randn(4, 24, 2, 2, dtype=torch.float, device="cuda")
+        w = torch.randn(24, 24, 1, 1, dtype=torch.float, device="cuda")
+        b = torch.randn(24, dtype=torch.float, device="cuda")
+
+        def t(x, w, b):
+            b2 = b + 1.0
+            o = torch.conv2d(x, w, b2)
+            return o
+
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, w, b)
+
+    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_squeeze_negative_dim(self):
+        x = torch.randn(4, 24, 1, 2, dtype=torch.float, device="cuda")
+
+        def t(x):
+            o = x + 1.0
+            o = o.squeeze(-2)
+            o = o * 2.0
+            return o
+
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_singleton_fusion(self):
+        x = torch.randn(4, 2, device="cuda")
+
+        with nvfuser_singleton_fusion(True):
+            def t(x):
+                return x.relu()
+
+            t_jit = torch.jit.script(t)
+            self._run_helper(t_jit, t, x)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_issue1445_fusion(self):
+        def f(t0, t1, t2, t3):
+            masked_input = torch.where(t1, t2, t3)
+            total = masked_input.sum([0, 1, 2, 3])
+            sizes : List[int] = []
+            t10 = torch.reshape(t0, sizes)
+            t7 = total / t10
+            t4 = t7.to(dtype=torch.float)
+            return t4
+
+        x = torch.randn(1, 1, 1, 1, device='cuda').to(dtype=torch.long)
+        y = torch.randn(3, 2, 1, 1, device='cuda').to(dtype=torch.bool).expand([3, 2, 1, 2])
+        z = torch.randn(3, 2, 1, 2, device='cuda')
+        w = torch.tensor(1.5, device='cuda')
+
+        f_jit = torch.jit.script(f)
+        for i in range(5):
+            out_jit = f_jit(x, y, z, w)
+        out = f(x, y, z, w)
+        self.assertEqual(out, out_jit)
+        self.assertGraphContainsExactly(f_jit.graph_for(x, y, z, w), FUSION_GROUP, 1)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_disable_sibling_fuse(self):
+        x = torch.randn(4, 2, device="cuda")
+        y = torch.randn(8, device="cuda")
+        s = torch.tensor(1.5, device="cuda")
+
+        with nvfuser_horizontal_fusion(False):
+            def t(x, y, s):
+                o1 = x + s
+                o2 = y + s
+                return o1, o2
+
+            t_jit = torch.jit.script(t)
+            for i in range(5):
+                t_jit(x, y, s)
+
+            # sibling fusion should be disabled with the flag
+            self.assertGraphContainsExactly(t_jit.graph_for(x, y, s), FUSION_GUARD, 0)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_build_shape_expression_native_dropout(self):
+        x = torch.randn(4, 2, device="cuda")
+
+        def t(x):
+            o, mask = torch.native_dropout(x, 0.0, True)
+            o1 = o.sigmoid()
+            o2 = mask.float().sigmoid()
+            return (o1, o2)
+
+        t_jit = torch.jit.script(t)
+
+        jit_o = t_jit(x)
+        jit_o = t_jit(x)
+        o = t(x)
+        for oo, jit_oo in zip(o, jit_o):
+            self.assertEqual(oo.dtype, jit_oo.dtype)
+            self.assertEqual(oo, jit_oo)
+        self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_scalar_tensor_permuted(self):
+        x = torch.randn(4, 2, 3, device="cuda").permute([1, 2, 0])
+        y = torch.tensor(1.0, device="cuda")
+
+        with nvfuser_singleton_fusion(True):
+            def t(x, y):
+                return x + y
+
+            t_jit = torch.jit.script(t)
+            self._run_helper(t_jit, t, x, y)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_cpu_scalar(self):
+        x = torch.randn(4, 2, 3, device="cuda")
+        y = torch.tensor(1.0, device="cpu")
+        z = torch.tensor(2.0, device="cpu")
+
+        with nvfuser_singleton_fusion(True):
+            # testing cpu scalar tensor promotion
+            def t(x, y):
+                return x + y
+
+            t_jit = torch.jit.script(t)
+            self._run_helper(t_jit, t, x, y)
+
+            # scalar cpu tensor add should NOT be fused
+            @torch.jit.script
+            def t1(y, z):
+                return y * z
+            for _ in range(5):
+                t1(y, z)
+            self.assertGraphContainsExactly(t1.graph_for(y, z), FUSION_GUARD, 0)
+
+            # everything, including scalar cpu tensor add should be fused
+            @torch.jit.script
+            def t2(x, y, z):
+                tmp = y + z
+                return tmp + x
+            for _ in range(5):
+                t2(x, y, z)
+            self.assertGraphContainsExactly(t2.graph_for(x, y, z), 'aten::add', 0)
+            self.assertGraphContainsExactly(t2.graph_for(x, y, z), FUSION_GUARD, 1)
+
+            # 'cpu_tmp = y + z' shouldn't be fused.
+            @torch.jit.script
+            def t3(x, y, z):
+                cpu_tmp = y + z
+                out = x + y
+                return cpu_tmp, out
+            for _ in range(5):
+                t3(x, y, z)
+            self.assertGraphContainsExactly(t3.graph_for(x, y, z), FUSION_GUARD, 1)
+            self.assertGraphContainsExactly(t3.graph_for(x, y, z), 'aten::add', 1)
+
+    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_shape_expression(self):
+        x = torch.randn(4, 2, 1, 3, device="cuda")
+
+        def t_unsqueeze(x):
+            t0 = x.relu()
+            t1 = t0.unsqueeze(1)
+            t2 = t1 + 1.0
+            t3 = t1.size()
+            return t2, t3
+
+        def t_squeeze(x):
+            t0 = x.relu()
+            t1 = t0.squeeze()
+            t2 = t1 + 1.0
+            t3 = t1.size()
+            return t2, t3
+
+        def t_squeeze_dim(x):
+            t0 = x.relu()
+            t1 = t0.squeeze(-2)
+            t2 = t1 + 1.0
+            t3 = t1.size()
+            return t2, t3
+
+        # squeezing a non-size 1 dimension should be a no op
+        def t_squeeze_dim_no_op(x):
+            t0 = x.relu()
+            t1 = t0.squeeze(1)
+            t2 = t1 + 1.0
+            t3 = t1.size()
+            return t2, t3
+
+        def run(fn):
+            jit_fn = torch.jit.script(fn)
+            jit_o = jit_fn(x)
+            jit_o = jit_fn(x)
+            jit_o = jit_fn(x)
+            o = fn(x)
+            # output 0 is a tensor, so we check dtype and value
+            self.assertEqual(o[0].dtype, jit_o[0].dtype)
+            self.assertEqual(o[0], jit_o[0])
+            # output 1 is shape
+            self.assertEqual(o[1], jit_o[1])
+            self.assertGraphContainsExactly(jit_fn.graph_for(x), FUSION_GUARD, 1)
+
+        for t in [t_unsqueeze, t_squeeze, t_squeeze_dim, t_squeeze_dim_no_op]:
+            run(t)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_scalar_cuda_tensor(self):
+        x = torch.tensor(2.0, device="cuda")
+
+        with nvfuser_singleton_fusion(True):
+            def t(x):
+                return x + 1.0
+
+            t_jit = torch.jit.script(t)
+            self._run_helper(t_jit, t, x)
+
+            @torch.jit.script
+            def t_jitted(x):
+                return x.sum(0)
+
+            for i in range(5):
+                t_jitted(x)
+            self.assertGraphContainsExactly(t_jitted.graph_for(x), FUSION_GUARD, 0)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_overlapped_input(self):
+        x = torch.randn(8, device="cuda").as_strided((2, 4), (1, 1))
+
+        with nvfuser_singleton_fusion(True):
+            def t(x):
+                return x + 1.0
+
+            t_jit = torch.jit.script(t)
+            self._run_helper(t_jit, t, x)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    def test_reduction_empty_axes(self):
+        x = torch.randn(4, 2, 3, device="cuda").permute([1, 2, 0])
+
+        with nvfuser_singleton_fusion(True):
+            def t(x):
+                sizes : List[int] = []
+                return x.sum(sizes)
+
+            t_jit = torch.jit.script(t)
+            self._run_helper(t_jit, t, x)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    def test_int_tensor_input(self):
+        x = torch.randn(4, 2, device="cuda").to(dtype=torch.int)
+
+        with nvfuser_singleton_fusion(True):
+            def t(x):
+                return x.amax(dim=0)
+
+            t_jit = torch.jit.script(t)
+            self._run_helper(t_jit, t, x)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_to_boolean(self):
+        x = torch.randn(4, 2, device="cuda")
+
+        with nvfuser_singleton_fusion(True):
+            def t(x):
+                return x.to(dtype=torch.bool)
+
+            t_jit = torch.jit.script(t)
+            self._run_helper(t_jit, t, x)
+
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_to_copy(self):
+        x = torch.randn(4, 2, device="cuda")
+
+        with nvfuser_singleton_fusion(True):
+            def t(x, dtype : torch.dtype):
+                o = torch.ops.aten._to_copy(x, dtype=dtype)
+                return o
+
+            t.__disable_jit_function_caching__ = True
+
+            t_jit = torch.jit.script(t)
+            for dtype in [torch.float16, torch.bool, torch.float64]:
+                self._run_helper(t_jit, t, x, dtype)
+
+            def t_none(x):
+                with torch.jit.strict_fusion():
+                    o = torch.ops.aten._to_copy(x, dtype=None)
+                return o
+
+            t_jit_none = torch.jit.script(t_none)
+            self._run_helper(t_jit_none, t_none, x)
+
+
+    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since reshape is disabled now")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_view_copy_graph_guard(self):
+        x = torch.randn(4, 2, 3, device="cuda").permute([1, 2, 0])
+        y = [4, 6]
+
+        with nvfuser_singleton_fusion(True):
+            def t(x, y : List[int]):
+                t1 = x + 1.0
+                t2 = t1 * 1.0
+                out = t2.reshape(y)
+                return out.relu()
+
+            t_jit = torch.jit.script(t)
+            self._run_helper(t_jit, t, x, y)
+
+    @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since view is disabled now")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_view_copy_graph_guard_double_fusion(self):
+        x = torch.randn(2, 2, 5, device="cuda")
+        w = torch.randn(5, 5, device="cuda")
+
+        with nvfuser_singleton_fusion(True):
+            def t(x, w):
+                o = x.view([4, x.size()[-1]])
+                o = torch.matmul(o, w)
+                o = o.view([2, 2, o.size()[1]])
+                return o
+
+            t_jit = torch.jit.script(t)
+            for i in range(3):
+                jit_o = t_jit(x, w)
+            o = t(x, w)
+            self.assertEqual(jit_o, o)
+            self.assertGraphContainsExactly(t_jit.graph_for(x, w), FUSION_GUARD, 2, consider_subgraphs=True)
+
+    @skipIfRocm
+    # see issue here on why we disabled this test https://github.com/csarofeen/pytorch/issues/2127
+    @unittest.skipIf(is_pre_volta(), "permutation scheduling can be dangerous on pre-volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_view_before_permute(self):
+        view_examples = [[[1, 19, 1, 12, 7, 1, 99], [1, 19, 1, 3, 2772]],
+                         [[3, 17, 80, 1], [51, 1, 2, 4, 10]],
+                         [[3, 17, 80, 1, 9], [51, 1, 2, 4, 10, 9]],
+                         [[2, 3, 4, 5], [1, 6, 1, 2, 2, 5]],
+                         [[22, 22, 2], [22, 11, 1, 1, 4]],
+                         [[37, 9, 7, 6, 10], [333, 2, 2, 3, 35]],
+                         [[8, 1, 1, 8, 1, 8], [8, 2, 4, 1, 8]],
+                         [[1, 333, 1], [1, 37, 9]],
+                         [[1, 333], [1, 1, 1, 111, 1, 3]],
+                         [[1, 27454, 1, 2], [1, 7844, 1, 7]],
+                         [[1, 7844, 1, 7], [1, 27454, 2]]]
+
+        def _getTransposeAxes(sizes):
+            # broadcast do not change
+            # always move inner-most dim
+            # random permutation of other dims
+            result = []
+            valid_sizes = []
+            for idx, val in enumerate(sizes):
+                if val > 1 and idx < len(sizes) - 1:
+                    valid_sizes.append((idx, val))
+                result.append(idx)
+            idx, new_size = valid_sizes[random.randint(0, len(valid_sizes) - 1)]
+            result[idx] = len(sizes) - 1
+            result[len(sizes) - 1] = idx
+            return result
+
+        def _transposeSize(sizes, dims):
+            return [sizes[old_pos] for old_pos in dims]
+
+        for example in view_examples:
+            before_view_size, after_view_size = example
+            axes = _getTransposeAxes(after_view_size)
+            output_size = _transposeSize(after_view_size, axes)
+            self._view_before_permute_helper(before_view_size, after_view_size, output_size, axes)
+
+    def _view_before_permute_helper(self, input_shape, view_shape, output_shape, dims):
+        def t(x, y, view_shape : List[int], dims : List[int]):
+            x_v = x.view(view_shape)
+            x_t = torch.permute(x_v, dims)
+            o = torch.add(x_t, y)
+            o = torch.relu(o)
+            return o
+
+        x = torch.randn(*input_shape, device="cuda")
+        y = torch.randn(*output_shape, device="cuda")
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, y, view_shape, dims)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_permute(self):
+        max_dims = 4
+        for ndims in range(2, max_dims + 1):
+            shape = [idx + 2 for idx in range(ndims)]
+            for dims in itertools.permutations(range(ndims)):
+                self._permute_helper(shape, dims)
+
+    def _permute_helper(self, shape, dims):
+        def t(x, y, dims : List[int]):
+            x_t = torch.permute(x, dims)
+            y_t = torch.permute(y, dims)
+            o = torch.add(x_t, y_t)
+            o = torch.relu(o)
+            return o
+
+        x = torch.randn(*shape, device="cuda")
+        y = torch.randn(*shape, device="cuda")
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, y, dims)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_transpose(self):
+        max_dims = 4
+        for ndims in range(2, max_dims + 1):
+            shape = [idx + 2 for idx in range(ndims)]
+            for idx in range(1, ndims):
+                for jdx in range(idx):
+                    self._transpose_helper(shape, idx, jdx)
+
+    def _transpose_helper(self, shape, dim0, dim1):
+        def t(x, y, dim0 : int, dim1 : int):
+            x_t = torch.transpose(x, dim0, dim1)
+            y_t = torch.transpose(y, dim0, dim1)
+            o = torch.add(x_t, y_t)
+            o = torch.nn.functional.gelu(o)
+            return o
+
+        x = torch.randn(*shape, device="cuda")
+        y = torch.randn(*shape, device="cuda")
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, y, dim0, dim1)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_transpose_default(self):
+        def t(x, y):
+            x_t = torch.t(x)
+            y_t = torch.t(y)
+            o = torch.add(x_t, y_t)
+            o = torch.nn.functional.gelu(o)
+            return o
+
+        x = torch.randn(3, 5, device="cuda")
+        y = torch.randn(3, 5, device="cuda")
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, y)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_input_output_passthrough(self):
+        def t(t0, t1, t2):
+            mask = t1.to(dtype=torch.bool)
+            masked_input = torch.where(t0, mask, t2)
+            return masked_input, mask
+
+        t_jit = torch.jit.script(t)
+        # stick to integers, this avoid the numerical difference due to our
+        # promotion
+        x = torch.randn(4, 4, device='cuda').to(dtype=torch.bool)
+        y = torch.randn(4, 4, device='cuda').to(dtype=torch.bool)
+        z = torch.tensor(1.0, device='cuda').to(dtype=torch.bool)
+        jit_o = t_jit(x, y, z)
+        jit_o = t_jit(x, y, z)
+        o = t(x, y, z)
+        for oo, jit_oo in zip(o, jit_o):
+            self.assertEqual(oo.dtype, jit_oo.dtype)
+            self.assertEqual(oo, jit_oo)
+        self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_pointwise_reference_tensor(self):
+        def t(input1, input2, scalar):
+            _unsafe_view = torch.ops.aten._unsafe_view(input1, [2, 4, 16])
+            add_ = torch.ops.aten.add_(_unsafe_view, input2)
+            gelu_ = torch.ops.aten.gelu(add_)
+            view_ = torch.ops.aten.view(gelu_, [8, 16])
+            mul_ = torch.ops.aten.mul(add_, scalar)
+            return [view_, mul_]
+
+        x = torch.randn(8, 16, device="cuda")
+        bias = torch.randn(16, device="cuda")
+        scalar = torch.ones(torch.Size([]), device="cuda")
+
+        t_jit = torch.jit.script(t)
+        for i in range(3):
+            jit_o = t_jit(x, bias, scalar)
+        o = t(x, bias, scalar)
+        self.assertEqual(jit_o, o)
+        self.assertGraphContains(t_jit.graph_for(x, bias, scalar), FUSION_GUARD)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    def test_native_batch_norm_backward(self):
+        grad_output = torch.randn(4, 2, 3, device="cuda")
+        input = torch.randn(4, 2, 3, device="cuda")
+        weight = torch.randn(2, device="cuda")
+
+        r_m = torch.randn(2, device="cuda")
+        r_v = torch.randn(2, device="cuda").abs()
+
+        save_mean = torch.randn(2, device="cuda")
+        save_invstd = torch.randn(2, device="cuda").abs()
+
+        with nvfuser_singleton_fusion(True):
+            def t(grad_out, input, weight, r_m, r_v, save_mean, save_invstd, train: bool, eps: float, mask: List[bool]):
+                return torch.ops.aten.native_batch_norm_backward(grad_out, input, weight, r_m, r_v, save_mean,
+                                                                 save_invstd, train, eps, mask)
+
+            t_jit = torch.jit.script(t)
+            for i in range(4):
+                jit_o = t_jit(grad_output, input, weight, r_m.clone(), r_v.clone(),
+                              save_mean, save_invstd, True, 1e-5, [True, True, True])
+
+            ref_m = r_m.clone()
+            ref_v = r_v.clone()
+            jit_o = t_jit(grad_output, input, weight, r_m, r_v, save_mean, save_invstd, True, 1e-5, [True, True, True])
+            o = t(grad_output, input, weight, ref_m, ref_v, save_mean, save_invstd, True, 1e-5, [True, True, True])
+            for oo, jit_oo in zip(o, jit_o):
+                self.assertEqual(oo.dtype, jit_oo.dtype)
+                self.assertEqual(oo, jit_oo)
+            self.assertEqual(ref_m.dtype, r_m.dtype)
+            self.assertEqual(ref_m, r_m)
+            self.assertEqual(ref_v.dtype, r_v.dtype)
+            self.assertEqual(ref_v, r_v)
+            self.assertGraphContains(t_jit.graph_for(grad_output, input, weight, r_m.clone(), r_v.clone, save_mean,
+                                                     save_invstd, True, 1e-5, [True, True, True]), FUSION_GUARD)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_contiguous_on_broadcasted(self):
+        x = torch.randn(4, 1, device="cuda")
+        y = torch.randn(4, 128, device="cuda")
+
+        with nvfuser_singleton_fusion(True):
+            def t(x, y):
+                t1 = x.expand([4, 128])
+                t2 = t1 * y
+                return t2
+
+            t_jit = torch.jit.script(t)
+            self._run_helper(t_jit, t, x, y)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_skip_parser(self):
+        x = torch.randn(4, 12, device="cuda")
+
+        with nvfuser_singleton_fusion(True):
+            def fn(x):
+                t1 = x + 1.0
+                return t1.relu()
+
+            fn_jit = torch.jit.script(fn)
+            self._run_helper(fn_jit, fn, x)
+
+            # add node should have been merged into fusion
+            self.assertGraphContains(fn_jit.graph_for(x), FUSION_GUARD)
+            self.assertGraphContainsExactly(fn_jit.graph_for(x), 'aten::add', 0)
+
+            # flips skip parse for `aten::add`, following fusion should skip the
+            # add node
+            self.assertFalse(torch._C._jit_set_nvfuser_skip_node_kind("aten::add", True))
+
+            def fn_1(x):
+                t1 = x + 2.0  # change const value so we'll not reuse plan
+                return t1.relu()
+
+            fn_1_jit = torch.jit.script(fn_1)
+            self._run_helper(fn_1_jit, fn_1, x)
+
+            # add node should have been merged into fusion
+            self.assertGraphContains(fn_1_jit.graph_for(x), FUSION_GUARD)
+            self.assertGraphContainsExactly(fn_1_jit.graph_for(x), 'aten::add', 1)
+
+            # flips skip parse for `aten::add`, next fusion should fuse add node
+            self.assertTrue(torch._C._jit_set_nvfuser_skip_node_kind("aten::add", True))
+
+            def fn_2(x):
+                t1 = x + 2.0  # change const value so we'll not reuse plan
+                return t1.relu()
+
+            fn_2_jit = torch.jit.script(fn_2)
+            self._run_helper(fn_2_jit, fn_2, x)
+
+            # add node should have been merged into fusion
+            self.assertGraphContains(fn_2_jit.graph_for(x), FUSION_GUARD)
+            self.assertGraphContainsExactly(fn_2_jit.graph_for(x), 'aten::add', 0)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_cuda_fusion_guard(self):
+        old_guard = torch._C._jit_set_nvfuser_guard_mode(True)
+
+        class ConvModule(torch.nn.Module):
+            def forward(self, x):
+                return x.sin().sigmoid()
+
+        mod = ConvModule().to(device="cuda")
+
+        inputs = [torch.randn(20, 16, 50, 100, device="cuda", requires_grad=True)]
+
+        def reduce_scalar(temp):
+            return temp.sum()
+
+        scripted = torch.jit.script(mod)
+        with torch.no_grad():
+            scripted(*inputs)
+        res = scripted(*inputs)
+        reduce_scalar(res).backward()
+        torch._C._jit_set_nvfuser_guard_mode(old_guard)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_nvfuser_comparison_callbacks_with_fallback(self):
+        try:
+            fused_result = None
+            unfused_result = None
+            graph_ir = None
+
+            def callback(fused_outputs, unfused_outputs, graph_str):
+                nonlocal unfused_result
+                nonlocal fused_result
+                nonlocal graph_ir
+                unfused_result = unfused_outputs[-1]
+                fused_result = fused_outputs[-1]
+                graph_ir = graph_str
+            torch._C._jit_nvfuser_set_comparison_callback(True, callback)
+
+            def fn(x, y):
+                z = torch.add(x, y)
+                return torch.relu(z)
+
+            x = torch.rand((4, 4)).cuda() - 0.5
+            y = torch.rand((4, 4)).cuda() - 0.5
+
+            fn_s = torch.jit.script(fn)
+            fn_s(x, y)
+            fn_s(x, y)
+            fn_s(x, y)
+
+            expected = fn(x, y)
+
+            self.assertEqual(expected, fused_result)
+            self.assertEqual(expected, unfused_result)
+            FileCheck().check("aten::add").run(graph_ir)
+        finally:
+            torch._C._jit_nvfuser_clear_comparison_callback()
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_nvfuser_comparison_callbacks_without_fallback(self):
+        try:
+            fused_result = None
+            unfused_result = None
+            graph_ir = None
+
+            def callback(fused_outputs, unfused_outputs, graph_str):
+                nonlocal unfused_result
+                nonlocal fused_result
+                nonlocal graph_ir
+                if len(unfused_outputs) > 0:
+                    unfused_result = unfused_outputs[-1]
+                fused_result = fused_outputs[-1]
+                graph_ir = graph_str
+            torch._C._jit_nvfuser_set_comparison_callback(False, callback)
+
+            def fn(x, y):
+                z = torch.add(x, y)
+                return torch.relu(z)
+
+            x = torch.rand((4, 4)).cuda() - 0.5
+            y = torch.rand((4, 4)).cuda() - 0.5
+
+            fn_s = torch.jit.script(fn)
+            fn_s(x, y)
+            fn_s(x, y)
+            fn_s(x, y)
+
+            expected = fn(x, y)
+
+            self.assertEqual(expected, fused_result)
+            self.assertEqual(None, unfused_result)
+            FileCheck().check("aten::add").run(graph_ir)
+        finally:
+            torch._C._jit_nvfuser_clear_comparison_callback()
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires NVFuser")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_cuda_fusion_guard_backward(self):
+        old_guard = torch._C._jit_set_nvfuser_guard_mode(True)
+
+        inp = torch.randn(10, device="cuda", requires_grad=True)
+        grad = torch.randn(10, device="cuda")
+
+        def f(x):
+            a = x.cos().cos()
+            return a
+        scripted = torch.jit.script(f)
+
+        with profile(activities=[ProfilerActivity.CPU]) as prof:
+            for _ in range(5):
+                inp.grad = None
+                out = scripted(inp)
+                out.backward(grad)
+
+        # check that we do not have fallback triggered
+        self.assertEqual(prof.events().table().find("fallback"), -1)
+        torch._C._jit_set_nvfuser_guard_mode(old_guard)
+
+    # TODO: generalize this
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    def test_inf_quick_patch(self):
+        inputs = [torch.tensor([-float('inf'), float('inf'), 4.0], device="cuda"),
+                  torch.tensor([1.0, float('inf'), 4.0], device="cuda"),
+                  torch.tensor([-float('inf'), -1.5, 4.0], device="cuda"),
+                  torch.tensor([1.0, -3.0, float('nan')], device="cuda"),
+                  torch.tensor([-float('inf'), -float('inf'), -float('inf')], device="cuda"),
+                  torch.tensor([float('inf'), float('inf'), float('inf')], device="cuda"),
+                  torch.tensor([float('nan'), float('nan'), float('nan')], device="cuda")]
+
+        def fn_amax(x):
+            return x.amax(dim=0)
+
+        def fn_amin(x):
+            return x.amin(dim=0)
+
+        def fn_add_nan(x):
+            return x.relu() + float('nan')
+
+        def fn_add(x):
+            return x + 1.0
+
+        with nvfuser_singleton_fusion(True):
+            for t in [fn_amax, fn_amin, fn_add, fn_add_nan]:
+                for x in inputs:
+                    t_jit = torch.jit.script(t)
+                    self._run_helper(t_jit, t, x)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_clamp_reversed_bound(self):
+        x = torch.tensor([1., -float('inf'), 2., float('inf'), float('nan')], device="cuda")
+
+        def t(x):
+            return x.clamp(min=1., max=0.5)
+
+        with nvfuser_singleton_fusion(True):
+            jit_t = torch.jit.script(t)
+            self._run_helper(jit_t, t, x)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_issue_1785(self):
+        class Fusion(torch.nn.Module):
+            def forward(self, x, a, b):
+                out = torch.mul(x.unsqueeze(-1), a)
+                out = out + b
+                return out
+
+        x = torch.randn(1024, 192, 3, device='cuda')
+        a = torch.randn(3, 128, device='cuda')
+        b = torch.randn(3, 128, device='cuda')
+
+        model = Fusion()
+        jit_model = torch.jit.script(model)
+
+        with torch.jit.fuser('fuser2'):
+            for _ in range(4):
+                out_ref = model(x, a, b)
+                out_jit = jit_model(x, a, b)
+
+        out_ref = model(x, a, b)
+        out_jit = jit_model(x, a, b)
+        self.assertTrue(self._compare("comparing output failed", out_ref, out_jit, 1e-5))
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_high_rank_fusion(self):
+        # currently we want to limit fusion to node with input where rank <= 8
+        rank_limit = 8
+        shapes = [4 for i in range(rank_limit + 1)]
+        x = torch.randn(shapes, device="cuda")
+
+        with nvfuser_singleton_fusion(True):
+            def t(x):
+                return x.relu()
+
+            jit_t = torch.jit.script(t)
+            for i in range(5):
+                jit_t(x)
+                self.assertGraphContainsExactly(jit_t.graph_for(x), FUSION_GUARD, 0)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_clamp(self):
+        x = torch.tensor([1., float('inf'), 2., float('nan'), float('-inf')], device="cuda")
+
+        def clamp_max(x):
+            return x.clamp(max=1.5)
+
+        def clamp_min_max(x):
+            return x.clamp(min=1.5)
+
+        def clamp_min(x):
+            return x.clamp(min=1., max=3.)
+
+        with nvfuser_singleton_fusion(True):
+            for t in [clamp_max, clamp_min, clamp_min_max]:
+                t_jit = torch.jit.script(t)
+                self._run_helper(t_jit, t, x)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_device_constant(self):
+        x = torch.randn(4, 2, device="cuda")
+
+        # cpu tensor shouldn't be fused
+        def t_cpu(x):
+            return torch.rand_like(x, device=torch.device(type='cpu'))
+
+        with nvfuser_singleton_fusion(True):
+            t_cpu_jit = torch.jit.script(t_cpu)
+            for _ in range(5):
+                t_cpu_jit(x)
+
+            self.assertGraphContainsExactly(t_cpu_jit.graph_for(x), FUSION_GUARD, 0)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_expand(self):
+        device = "cuda"
+        x = torch.randn(3, 5, device=device)
+        y = torch.randn(4, 2, 3, 5, device=device)
+
+        def t(x, y):
+            with torch.jit.strict_fusion():
+                x = x.relu()
+                o0 = x.expand(2, 3, 5)
+                o1 = x.expand_as(y)
+            return o0, o1
+
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, y, check_stride=True)
+
+        def t2(x, y):
+            o0 = x.expand(2, 3, 5)
+            o1 = x.expand_as(y)
+            x.add_(1)
+            return o0, o1
+
+        t2_jit = torch.jit.script(t2)
+        self._run_helper(t2_jit, t2, x, y, check_stride=True, num_fusion=0)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_scheduler_with_polymorphic_broadcast(self):
+        device = "cuda"
+        x0 = torch.randn(10, 128, device=device)
+        x1 = torch.rand_like(x0)
+        x2 = torch.randn(10, device=device)
+
+        def t(x0, x1, x2):
+            x3 = x2.unsqueeze(-1)
+            x4 = x3 + x0
+            x5 = x3 + x1
+            x6 = x5.sum(0)
+            return x4, x6
+
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x0, x1, x2, check_stride=True)
+
+        x2 = torch.randn(128, device=device)
+
+        def t2(x0, x1, x2):
+            x3 = x2.unsqueeze(0)
+            x4 = x3 + x0
+            x5 = x3 + x1
+            x6 = x5.sum(1)
+            return x4, x6
+
+        t2_jit = torch.jit.script(t2)
+        self._run_helper(t2_jit, t2, x0, x1, x2, check_stride=True)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_type_inference(self):
+        device = "cuda"
+        x0 = torch.randn(10, 128, device=device)
+        x1 = torch.rand_like(x0)
+        x2 = torch.rand_like(x0)
+
+        def t(x0, x1, x2, flag : bool = True):
+            x3 = 2.0 * x0
+            x4 = 2.0 * x1
+            x5 = 2.0 * x2
+            if flag:
+                return torch.stack([x3, x4, x5], dim=-1)
+            # second code path doesn't run through profiling
+            # hence would utilize type inference with profiling information
+            return x0 + x1 + x2
+
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x0, x1, x2, check_stride=True)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_disable_const_chunk_propagation_for_normalization(self):
+        device = "cuda"
+        x0 = torch.randn(10, 12, device=device)
+        x1 = torch.randn(10, 4, device=device)
+        w0 = torch.randn(12, device=device)
+        w1 = torch.randn(4, device=device)
+
+        def t(x, y, w0, w1):
+            ih = torch.layer_norm(x, (12,), w0)
+            i_r, i_z, i_n = ih.chunk(3, dim=1)
+            i_n = torch.layer_norm(i_n, (4,), w1)
+            r = torch.sigmoid(i_r)
+            n = torch.tanh(i_n + r * i_z)
+            h = n + r * y
+            return h
+
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x0, x1, w0, w1, check_stride=True)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_no_tensor_input(self):
+        device = "cuda"
+        x = torch.randn(512, device=device)
+
+        def t(x):
+            tensor0 = torch.tensor(3, dtype=torch.float32, device='cuda')
+            tensor1 = torch.tensor(3, dtype=torch.float32, device='cuda')
+            o = torch.div(x.numel(), tensor0)
+            o = torch.mul(o, tensor1)
+            return o
+
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, check_stride=True)
+
+        # Note that curently TS embeds constant tensor in the graph
+        # this triggers memory leak check in CI
+        torch.jit._state._python_cu.drop_all_functions()
+
+
+class TestEnableDisableCudaFuser(JitTestCase):
+    def setUp(self):
+        super().setUp()
+        if RUN_NVFUSER:
+            self.is_enabled = torch._C._jit_set_nvfuser_enabled(False)
+
+    def tearDown(self):
+        if RUN_NVFUSER:
+            torch._C._jit_set_nvfuser_enabled(self.is_enabled)
+        super().tearDown()
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_context_manager_test(self):
+        x = torch.randn(4, 8, dtype=torch.float, device="cuda")
+        y = torch.randn(4, 8, dtype=torch.float, device="cuda")
+        with torch.jit.fuser('fuser2'):
+            with torch.jit.fuser('fuser2'):
+
+                def t1(x, y):
+                    o = x + y
+                    o = o + 2.0
+                    return o
+                t_jit = torch.jit.script(t1)
+                t_jit(x, y)
+                t_jit(x, y)
+                self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD)
+
+            def t2(x, y):
+                o = x + y
+                o = o + 3.0
+                return o
+            t_jit_2 = torch.jit.script(t2)
+            t_jit_2(x, y)
+            t_jit_2(x, y)
+            self.assertGraphContains(t_jit_2.graph_for(x, y), FUSION_GUARD)
+
+        def t3(x, y):
+            o = x + y
+            o = o + 4.0
+            return o
+        t_jit_3 = torch.jit.script(t3)
+        t_jit_3(x, y)
+        t_jit_3(x, y)
+        self.assertGraphContainsExactly(t_jit_3.graph_for(x, y), FUSION_GUARD, 0)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    def test_register_fuser(self):
+        self.assertFalse(torch._C._jit_set_nvfuser_enabled(True))
+        self.assertTrue(torch._C._jit_nvfuser_enabled())
+        self.assertTrue(torch._C._jit_set_nvfuser_enabled(True))
+        self.assertTrue(torch._C._jit_nvfuser_enabled())
+        self.assertTrue(torch._C._jit_set_nvfuser_enabled(False))
+        self.assertFalse(torch._C._jit_nvfuser_enabled())
+
+    @unittest.skipIf(RUN_CUDA, "Testing on CPU only")
+    def test_register_fuser_cpu(self):
+        with self.assertRaises(RuntimeError):
+            torch._C._jit_set_nvfuser_enabled(True)
+            torch._C._jit_set_nvfuser_enabled(False)
+
+    @unittest.skipIf(not RUN_CUDA, "requires CUDA")
+    @unittest.skipIf(not TEST_WITH_ROCM, "ROCM test only")
+    def test_register_fuser_rocm(self):
+        with self.assertRaises(RuntimeError):
+            torch._C._jit_set_nvfuser_enabled(True)
+            torch._C._jit_set_nvfuser_enabled(False)
+
+    def test_can_be_enabled_nvfuser(self):
+        if TEST_WITH_ROCM:
+            expected = False
+        else:
+            expected = RUN_CUDA
+
+        self.assertEqual(expected, torch._C._jit_nvfuser_can_be_enabled())
+
+# See TestNNCOpInfoParent
+class TestCudaFuserOpInfoParent(JitCommonTestCase):
+    pass
+
+class TestCudaFuserOpInfo(TestCudaFuserOpInfoParent):
+    def setUp(self):
+        super(TestCudaFuserOpInfoParent, self).setUp()
+        if RUN_NVFUSER:
+            self.cuda_fuser_options = CudaFuserTestOptions()
+            # enables guard mode since tracing could change graph to violate guard.
+            torch._C._jit_set_nvfuser_guard_mode(True)
+        self.nvfuser_single_node_mode = torch._C._jit_set_nvfuser_single_node_mode(True)
+
+    def tearDown(self):
+        if RUN_NVFUSER:
+            self.cuda_fuser_options.restore()
+
+        torch._C._jit_set_nvfuser_single_node_mode(self.nvfuser_single_node_mode)
+
+        super(TestCudaFuserOpInfoParent, self).tearDown()
+
+    @slowTest
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @ops(op_db, dtypes=OpDTypes.supported)
+    def test_nvfuser_correctness(self, device, dtype, op):
+        if not op.supports_tracing:
+            self.skipTest("nvfuser requires tracing support")
+
+        variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op)
+
+        for variant, sample in variant_sample_pairs:
+            trace = create_traced_fn(self, variant, cache_traced_fn=True)
+            ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
+
+            trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
+
+            val = trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
+
+            self.assertEqual(ref, val, exact_layout=True)
+
+        # Note: Clearing CU after NVFuser tests
+        # https://github.com/pytorch/pytorch/issues/35600
+        # each torch.jit.trace adds state to the _python_cu compilation unit
+        # since this test traces a lot of functions, out-of-memory can occur
+        # if the CU is not cleared.
+        torch.jit._state._python_cu.drop_all_functions()
+
+    @skipIfRocm
+    @slowTest
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    @ops(op_db, allowed_dtypes=(torch.float16, torch.bfloat16, torch.float32,
+                                torch.float64, torch.complex64, torch.complex128))
+    def test_nvfuser_extremal_values(self, device, dtype, op):
+        if not op.supports_tracing:
+            self.skipTest("nvfuser requires tracing support")
+
+        variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op)
+
+        def _get_extremal_tensor(x, val, dtype):
+            if x.dtype != dtype:
+                return x
+            return torch.full_like(x, val)
+
+        def _get_extremal_input(x, val, dtype):
+            if isinstance(x, torch.Tensor):
+                return _get_extremal_tensor(x, val, dtype)
+            elif is_iterable_of_tensors(x):
+                return [_get_extremal_tensor(y, val, dtype) for y in x]
+            return x
+
+        def _get_extremal_sample(sample: SampleInput, val, dtype):
+            extremal_sample = SampleInput(
+                input=_get_extremal_input(sample.input, val, dtype),
+                args=tuple(_get_extremal_input(x, val, dtype) for x in sample.args),
+                kwargs={k: _get_extremal_input(v, val, dtype) for k, v in sample.kwargs.items()},
+            )
+            return extremal_sample
+
+        def _get_extremal_samples(sample: SampleInput, dtype):
+            vals = [float('inf'), float('-inf'), float('nan')]
+            if dtype.is_complex:
+                complex_vals = itertools.product(vals, vals)
+                vals = tuple(map(lambda x: complex(*x), complex_vals))
+            for val in vals:
+                yield _get_extremal_sample(sample, val, dtype)
+
+        variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op)
+
+        for variant, sample in variant_sample_pairs:
+
+            trace = create_traced_fn(self, variant, cache_traced_fn=True)
+            trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
+            trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs)
+
+            for extremal_sample in _get_extremal_samples(sample, dtype):
+                try:
+                    with freeze_rng_state():
+                        ref = variant(*clone_inputs((extremal_sample.input, *extremal_sample.args)),
+                                      **extremal_sample.kwargs)
+                except (torch._C._LinAlgError, RuntimeError, ValueError):
+                    # if eager errors out, then don't expect NVFuser to pass
+                    continue
+
+                with freeze_rng_state():
+                    val = trace(*clone_inputs((extremal_sample.input, *extremal_sample.args)),
+                                **extremal_sample.kwargs)
+
+                self.assertEqual(val, ref, equal_nan=True, exact_device=True)
+
+            # See [Note: Clearing CU after NVFuser tests]
+            torch.jit._state._python_cu.drop_all_functions()
+
+instantiate_device_type_tests(TestCudaFuserOpInfo, globals(), only_for=("cuda"))
+
+
+if __name__ == '__main__':
+    run_tests()
diff --git a/torch/csrc/jit/codegen/cuda/runtime/array.cu b/third_party/nvfuser/runtime/array.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/array.cu
rename to third_party/nvfuser/runtime/array.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/array_rocm.cu b/third_party/nvfuser/runtime/array_rocm.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/array_rocm.cu
rename to third_party/nvfuser/runtime/array_rocm.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/bf16_support.cu b/third_party/nvfuser/runtime/bf16_support.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/bf16_support.cu
rename to third_party/nvfuser/runtime/bf16_support.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/bf16_support_rocm.cu b/third_party/nvfuser/runtime/bf16_support_rocm.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/bf16_support_rocm.cu
rename to third_party/nvfuser/runtime/bf16_support_rocm.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/block_reduction.cu b/third_party/nvfuser/runtime/block_reduction.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/block_reduction.cu
rename to third_party/nvfuser/runtime/block_reduction.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu b/third_party/nvfuser/runtime/block_sync_atomic.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu
rename to third_party/nvfuser/runtime/block_sync_atomic.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/block_sync_default.cu b/third_party/nvfuser/runtime/block_sync_default.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/block_sync_default.cu
rename to third_party/nvfuser/runtime/block_sync_default.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/block_sync_default_rocm.cu b/third_party/nvfuser/runtime/block_sync_default_rocm.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/block_sync_default_rocm.cu
rename to third_party/nvfuser/runtime/block_sync_default_rocm.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/broadcast.cu b/third_party/nvfuser/runtime/broadcast.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/broadcast.cu
rename to third_party/nvfuser/runtime/broadcast.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu b/third_party/nvfuser/runtime/fp16_support.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu
rename to third_party/nvfuser/runtime/fp16_support.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu b/third_party/nvfuser/runtime/fused_reduction.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu
rename to third_party/nvfuser/runtime/fused_reduction.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/fused_welford_helper.cu b/third_party/nvfuser/runtime/fused_welford_helper.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/fused_welford_helper.cu
rename to third_party/nvfuser/runtime/fused_welford_helper.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu b/third_party/nvfuser/runtime/fused_welford_impl.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu
rename to third_party/nvfuser/runtime/fused_welford_impl.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu b/third_party/nvfuser/runtime/grid_broadcast.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu
rename to third_party/nvfuser/runtime/grid_broadcast.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu b/third_party/nvfuser/runtime/grid_reduction.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu
rename to third_party/nvfuser/runtime/grid_reduction.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu b/third_party/nvfuser/runtime/grid_sync.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu
rename to third_party/nvfuser/runtime/grid_sync.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/helpers.cu b/third_party/nvfuser/runtime/helpers.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/helpers.cu
rename to third_party/nvfuser/runtime/helpers.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/index_utils.cu b/third_party/nvfuser/runtime/index_utils.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/index_utils.cu
rename to third_party/nvfuser/runtime/index_utils.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/memory.cu b/third_party/nvfuser/runtime/memory.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/memory.cu
rename to third_party/nvfuser/runtime/memory.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu b/third_party/nvfuser/runtime/random_numbers.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu
rename to third_party/nvfuser/runtime/random_numbers.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/swizzle.cu b/third_party/nvfuser/runtime/swizzle.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/swizzle.cu
rename to third_party/nvfuser/runtime/swizzle.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/tensor.cu b/third_party/nvfuser/runtime/tensor.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/tensor.cu
rename to third_party/nvfuser/runtime/tensor.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/tensorcore.cu b/third_party/nvfuser/runtime/tensorcore.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/tensorcore.cu
rename to third_party/nvfuser/runtime/tensorcore.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/tuple.cu b/third_party/nvfuser/runtime/tuple.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/tuple.cu
rename to third_party/nvfuser/runtime/tuple.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/type_traits.cu b/third_party/nvfuser/runtime/type_traits.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/type_traits.cu
rename to third_party/nvfuser/runtime/type_traits.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/warp.cu b/third_party/nvfuser/runtime/warp.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/warp.cu
rename to third_party/nvfuser/runtime/warp.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/warp_rocm.cu b/third_party/nvfuser/runtime/warp_rocm.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/warp_rocm.cu
rename to third_party/nvfuser/runtime/warp_rocm.cu
diff --git a/torch/csrc/jit/codegen/cuda/runtime/welford.cu b/third_party/nvfuser/runtime/welford.cu
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/runtime/welford.cu
rename to third_party/nvfuser/runtime/welford.cu
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp b/third_party/nvfuser/test/test_gpu1.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp
rename to third_party/nvfuser/test/test_gpu1.cpp
index 2a14695b53ff..3d75ab3e04c9 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp
+++ b/third_party/nvfuser/test/test_gpu1.cpp
@@ -2,43 +2,43 @@
 #include <gmock/gmock-matchers.h>
 #include <gtest/gtest.h>
 
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/codegen.h>
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
-#include <torch/csrc/jit/codegen/cuda/grouped_reduction.h>
-#include <torch/csrc/jit/codegen/cuda/inlining.h>
-#include <torch/csrc/jit/codegen/cuda/interface.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
-#include <torch/csrc/jit/codegen/cuda/mutator.h>
-#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
-#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
+#include <arith.h>
+#include <codegen.h>
+#include <disjoint_set.h>
+#include <executor.h>
+#include <executor_launch_params.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <fusion_segmenter.h>
+#include <grouped_reduction.h>
+#include <inlining.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <ir_graphviz.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <kernel_cache.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_magic_zero.h>
+#include <mutator.h>
+#include <ops/all_ops.h>
+#include <register_interface.h>
+#include <root_domain_map.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/reduction_utils.h>
+#include <scheduler/utils.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
+#include <transform_replay.h>
+#include <transform_rfactor.h>
 
 #include <test/cpp/jit/test_utils.h>
 #include <torch/csrc/jit/api/function_impl.h>
-#include <torch/csrc/jit/codegen/cuda/parser.h>
+#include <parser.h>
 #include <torch/csrc/jit/ir/irparser.h>
 #include <torch/torch.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp b/third_party/nvfuser/test/test_gpu2.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp
rename to third_party/nvfuser/test/test_gpu2.cpp
index 9cc3bf195c47..87781f4f48d0 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp
+++ b/third_party/nvfuser/test/test_gpu2.cpp
@@ -2,43 +2,43 @@
 #include <gmock/gmock-matchers.h>
 #include <gtest/gtest.h>
 
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/codegen.h>
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
-#include <torch/csrc/jit/codegen/cuda/grouped_reduction.h>
-#include <torch/csrc/jit/codegen/cuda/inlining.h>
-#include <torch/csrc/jit/codegen/cuda/interface.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
-#include <torch/csrc/jit/codegen/cuda/mutator.h>
-#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
-#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
+#include <arith.h>
+#include <codegen.h>
+#include <disjoint_set.h>
+#include <executor.h>
+#include <executor_launch_params.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <fusion_segmenter.h>
+#include <grouped_reduction.h>
+#include <inlining.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <ir_graphviz.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <kernel_cache.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_magic_zero.h>
+#include <mutator.h>
+#include <ops/all_ops.h>
+#include <register_interface.h>
+#include <root_domain_map.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/reduction_utils.h>
+#include <scheduler/utils.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
+#include <transform_replay.h>
+#include <transform_rfactor.h>
 
 #include <test/cpp/jit/test_utils.h>
 #include <torch/csrc/jit/api/function_impl.h>
-#include <torch/csrc/jit/codegen/cuda/parser.h>
+#include <parser.h>
 #include <torch/csrc/jit/ir/irparser.h>
 #include <torch/torch.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp b/third_party/nvfuser/test/test_gpu3.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
rename to third_party/nvfuser/test/test_gpu3.cpp
index 8d24cc380374..76702159ec53 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
+++ b/third_party/nvfuser/test/test_gpu3.cpp
@@ -2,43 +2,44 @@
 #include <gmock/gmock-matchers.h>
 #include <gtest/gtest.h>
 
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/codegen.h>
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
-#include <torch/csrc/jit/codegen/cuda/grouped_reduction.h>
-#include <torch/csrc/jit/codegen/cuda/inlining.h>
-#include <torch/csrc/jit/codegen/cuda/interface.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
-#include <torch/csrc/jit/codegen/cuda/mutator.h>
-#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
-#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
+#include <arith.h>
+#include <codegen.h>
+#include <disjoint_set.h>
+#include <executor.h>
+#include <executor_launch_params.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <fusion_segmenter.h>
+#include <grouped_reduction.h>
+#include <inlining.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <ir_graphviz.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <kernel_cache.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_magic_zero.h>
+#include <mutator.h>
+#include <ops/all_ops.h>
+#include <parser.h>
+#include <register_interface.h>
+#include <root_domain_map.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/reduction_utils.h>
+#include <scheduler/utils.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
+#include <transform_replay.h>
+#include <transform_rfactor.h>
 
 #include <test/cpp/jit/test_utils.h>
 #include <torch/csrc/jit/api/function_impl.h>
-#include <torch/csrc/jit/codegen/cuda/parser.h>
+#include <torch/csrc/jit/codegen/cuda/interface.h>
 #include <torch/csrc/jit/ir/irparser.h>
 #include <torch/torch.h>
 
@@ -4492,7 +4493,7 @@ void checkSiblingConsistency(TensorView* replay, TensorView* target) {
       [](auto a, auto b) { return std::make_pair(a, b); });
   BestEffortReplay replay_(replay_dom, target_dom, target2replay_map);
   auto r = replay_.getReplay();
-  for (int64_t i = 0; i < replay_dom.size(); i++) {
+  for (int64_t i = 0; i < (int64_t)replay_dom.size(); i++) {
     auto target_id = target_dom[i];
     auto replay_it = r.find(target_id);
     TORCH_CHECK(replay_it != r.end());
@@ -6347,7 +6348,7 @@ TEST_F(NVFuserTest, FusionVectorizeStrideContiguity2D_CUDA) {
     at::Tensor t0 = at::randn({1000000, size}, options).narrow(1, 0, 16);
     auto cg_outputs = fec.runFusionWithInputs({t0});
 
-    TORCH_CHECK(getVecSizeForPointwise(fec) == vec);
+    TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);
 
     testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
   }
@@ -6376,7 +6377,7 @@ TEST_F(NVFuserTest, FusionVectorizeStrideContiguity3D_CUDA) {
     at::Tensor t0 = at::randn({1000000, size, 3}, options).narrow(1, 0, 8);
     auto cg_outputs = fec.runFusionWithInputs({t0});
 
-    TORCH_CHECK(getVecSizeForPointwise(fec) == vec);
+    TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);
 
     testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
   }
@@ -6412,7 +6413,7 @@ TEST_F(NVFuserTest, FusionVectorizeStrideContiguity5D_CUDA) {
                         .narrow(3, 0, 4);
     auto cg_outputs = fec.runFusionWithInputs({t0});
 
-    TORCH_CHECK(getVecSizeForPointwise(fec) == vec);
+    TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);
 
     testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
   }
@@ -6459,7 +6460,7 @@ TEST_F(NVFuserTest, FusionVectorizeStrideContiguitySelfOverlapping_CUDA) {
     at::Tensor t0 = at::empty_strided(shape, stride, options);
     t0.random_();
     auto cg_outputs = fec.runFusionWithInputs({t0});
-    TORCH_CHECK(getVecSizeForPointwise(fec) == vec);
+    TORCH_CHECK(getVecSizeForPointwise(fec) == (size_t)vec);
     testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
   }
 }
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp b/third_party/nvfuser/test/test_gpu_fused_reduction.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp
rename to third_party/nvfuser/test/test_gpu_fused_reduction.cpp
index e827de56e56b..55b11f5790e0 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp
+++ b/third_party/nvfuser/test/test_gpu_fused_reduction.cpp
@@ -1,36 +1,36 @@
 #if defined(USE_CUDA)
 #include <gtest/gtest.h>
 
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/codegen.h>
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
-#include <torch/csrc/jit/codegen/cuda/grouped_reduction.h>
-#include <torch/csrc/jit/codegen/cuda/inlining.h>
-#include <torch/csrc/jit/codegen/cuda/interface.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/mutator.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
-#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
+#include <arith.h>
+#include <codegen.h>
+#include <disjoint_set.h>
+#include <executor.h>
+#include <executor_launch_params.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <fusion_segmenter.h>
+#include <grouped_reduction.h>
+#include <inlining.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <ir_graphviz.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <kernel_cache.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir.h>
+#include <lower2device.h>
+#include <mutator.h>
+#include <register_interface.h>
+#include <root_domain_map.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/reduction_utils.h>
+#include <scheduler/utils.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
+#include <transform_replay.h>
+#include <transform_rfactor.h>
 
 // fuser and IR parser
 #include <ATen/cuda/CUDAContext.h>
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu b/third_party/nvfuser/test/test_gpu_rng.cu
similarity index 96%
rename from torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu
rename to third_party/nvfuser/test/test_gpu_rng.cu
index a1ff6562e6bd..211e83d70729 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu
+++ b/third_party/nvfuser/test/test_gpu_rng.cu
@@ -3,13 +3,13 @@
 
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <c10/util/Optional.h>
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
+#include <arith.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <kernel_cache.h>
+#include <scheduler/all_schedulers.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
 #include <ATen/cuda/CUDAGraphsUtils.cuh>
 
 #include <cassert>
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp b/third_party/nvfuser/test/test_gpu_shift.cpp
similarity index 99%
rename from torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp
rename to third_party/nvfuser/test/test_gpu_shift.cpp
index d1f185011826..cda9b713c5bb 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp
+++ b/third_party/nvfuser/test/test_gpu_shift.cpp
@@ -1,34 +1,34 @@
 #if defined(USE_CUDA)
 #include <gtest/gtest.h>
 
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/codegen.h>
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
-#include <torch/csrc/jit/codegen/cuda/interface.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/mutator.h>
-#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
-#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
+#include <arith.h>
+#include <codegen.h>
+#include <disjoint_set.h>
+#include <executor.h>
+#include <executor_launch_params.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <fusion_segmenter.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <ir_graphviz.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <kernel_cache.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir.h>
+#include <lower2device.h>
+#include <mutator.h>
+#include <ops/all_ops.h>
+#include <register_interface.h>
+#include <root_domain_map.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/utils.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
+#include <transform_replay.h>
+#include <transform_rfactor.h>
 
 // fuser and IR parser
 #include <ATen/cuda/CUDAContext.h>
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensor_factories.cpp b/third_party/nvfuser/test/test_gpu_tensor_factories.cpp
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/test/test_gpu_tensor_factories.cpp
rename to third_party/nvfuser/test/test_gpu_tensor_factories.cpp
index 06e93fcd579e..fb11208fc337 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensor_factories.cpp
+++ b/third_party/nvfuser/test/test_gpu_tensor_factories.cpp
@@ -2,15 +2,15 @@
 #include <gmock/gmock-matchers.h>
 #include <gtest/gtest.h>
 
-#include <torch/csrc/jit/codegen/cuda/codegen.h>
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
+#include <codegen.h>
+#include <executor.h>
+#include <fusion.h>
+#include <ir_all_nodes.h>
+#include <ir_iostream.h>
+#include <kernel_cache.h>
+#include <ops/all_ops.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
 
 // Tests go in torch::jit
 namespace torch {
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp b/third_party/nvfuser/test/test_gpu_tensorcore.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp
rename to third_party/nvfuser/test/test_gpu_tensorcore.cpp
index c00d02c8a40d..f395b8dad644 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp
+++ b/third_party/nvfuser/test/test_gpu_tensorcore.cpp
@@ -1,37 +1,37 @@
 #if defined(USE_CUDA)
 #include <gtest/gtest.h>
 
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/codegen.h>
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
-#include <torch/csrc/jit/codegen/cuda/interface.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_printer.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/mma_type.h>
-#include <torch/csrc/jit/codegen/cuda/mutator.h>
-#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/matmul.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
-#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
+#include <arith.h>
+#include <codegen.h>
+#include <disjoint_set.h>
+#include <executor.h>
+#include <executor_launch_params.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <fusion_segmenter.h>
+#include <ir_all_nodes.h>
+#include <ir_graphviz.h>
+#include <ir_iostream.h>
+#include <ir_printer.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <kernel_cache.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir.h>
+#include <lower2device.h>
+#include <mma_type.h>
+#include <mutator.h>
+#include <ops/all_ops.h>
+#include <register_interface.h>
+#include <root_domain_map.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/matmul.h>
+#include <scheduler/reduction_utils.h>
+#include <scheduler/utils.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
+#include <transform_replay.h>
+#include <transform_rfactor.h>
 
 // fuser and IR parser
 #include <ATen/cuda/CUDAContext.h>
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp b/third_party/nvfuser/test/test_gpu_transpose.cpp
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
rename to third_party/nvfuser/test/test_gpu_transpose.cpp
index b10360f00315..5366e1df3ebc 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
+++ b/third_party/nvfuser/test/test_gpu_transpose.cpp
@@ -2,15 +2,15 @@
 #include <gmock/gmock-matchers.h>
 #include <gtest/gtest.h>
 
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/inlining.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/transpose.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
+#include <executor.h>
+#include <inlining.h>
+#include <kernel_cache.h>
+#include <ops/all_ops.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/transpose.h>
+#include <scheduler/utils.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
 
 // Tests go in torch::jit
 namespace torch {
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_utils.cpp b/third_party/nvfuser/test/test_gpu_utils.cpp
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/test/test_gpu_utils.cpp
rename to third_party/nvfuser/test/test_gpu_utils.cpp
index 19c3c6f9bf6d..dacb9043d870 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_utils.cpp
+++ b/third_party/nvfuser/test/test_gpu_utils.cpp
@@ -2,12 +2,12 @@
 #include <gmock/gmock-matchers.h>
 #include <gtest/gtest.h>
 
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
-#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
+#include <fusion.h>
+#include <lower_utils.h>
+#include <ops/all_ops.h>
+#include <scheduler/utils.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
 
 // Tests go in torch::jit
 namespace torch {
@@ -42,7 +42,7 @@ TEST_F(NVFuserTest, FusionMergeDims_CUDA) {
       {p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10]});
   std::vector<size_t> dims{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
   auto merged = scheduler_utils::mergeDims(tv, {2, 3, 7, 8, 9}, dims);
-  TORCH_CHECK(merged == 2);
+  TORCH_CHECK(merged == (size_t)2);
   std::vector<int64_t> expect_shape{
       p[0], p[1], p[2] * p[3] * p[7] * p[8] * p[9], p[4], p[5], p[6], p[10]};
   TORCH_CHECK(tv->nDims() == expect_shape.size());
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h b/third_party/nvfuser/test/test_gpu_validator.h
similarity index 98%
rename from torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h
rename to third_party/nvfuser/test/test_gpu_validator.h
index f70c7a80f76f..769afc1d7f20 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h
+++ b/third_party/nvfuser/test/test_gpu_validator.h
@@ -1,10 +1,10 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
+#include <executor_utils.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <ir_iostream.h>
+#include <lower_utils.h>
 
 #include <ATen/cuda/CUDAContext.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_view.cpp b/third_party/nvfuser/test/test_gpu_view.cpp
similarity index 97%
rename from torch/csrc/jit/codegen/cuda/test/test_gpu_view.cpp
rename to third_party/nvfuser/test/test_gpu_view.cpp
index 9785e089052a..3c2303c7e502 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_view.cpp
+++ b/third_party/nvfuser/test/test_gpu_view.cpp
@@ -2,41 +2,41 @@
 #include <gmock/gmock-matchers.h>
 #include <gtest/gtest.h>
 
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/codegen.h>
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
-#include <torch/csrc/jit/codegen/cuda/inlining.h>
-#include <torch/csrc/jit/codegen/cuda/interface.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_divisible_split.h>
-#include <torch/csrc/jit/codegen/cuda/mutator.h>
-#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
-#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
+#include <arith.h>
+#include <codegen.h>
+#include <disjoint_set.h>
+#include <executor.h>
+#include <executor_launch_params.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <fusion_segmenter.h>
+#include <inlining.h>
+#include <ir_all_nodes.h>
+#include <ir_builder.h>
+#include <ir_graphviz.h>
+#include <ir_iostream.h>
+#include <ir_utils.h>
+#include <iter_visitor.h>
+#include <kernel_cache.h>
+#include <kernel_expr_evaluator.h>
+#include <kernel_ir.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_divisible_split.h>
+#include <mutator.h>
+#include <ops/all_ops.h>
+#include <register_interface.h>
+#include <root_domain_map.h>
+#include <scheduler/all_schedulers.h>
+#include <scheduler/reduction_utils.h>
+#include <scheduler/utils.h>
+#include <test/test_gpu_validator.h>
+#include <test/test_utils.h>
+#include <transform_replay.h>
+#include <transform_rfactor.h>
 
 // fuser and IR parser
-#include <torch/csrc/jit/codegen/cuda/parser.h>
+#include <parser.h>
 #include <torch/csrc/jit/ir/irparser.h>
 
 #include <ATen/cuda/CUDAContext.h>
diff --git a/torch/csrc/jit/codegen/cuda/test/test_utils.h b/third_party/nvfuser/test/test_utils.h
similarity index 95%
rename from torch/csrc/jit/codegen/cuda/test/test_utils.h
rename to third_party/nvfuser/test/test_utils.h
index 8b199b930f24..a237510d4e56 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_utils.h
+++ b/third_party/nvfuser/test/test_utils.h
@@ -1,12 +1,12 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <executor.h>
+#include <expr_evaluator.h>
+#include <ir_all_nodes.h>
+#include <kernel_ir_dispatch.h>
+#include <lower2device.h>
+#include <lower_magic_zero.h>
+#include <transform_replay.h>
 
 #include <ATen/Context.h>
 #include <ATen/cuda/CUDAContext.h>
@@ -319,7 +319,7 @@ struct TransformPropagatorWithCheck : public TransformPropagator {
     auto to_pos = replayed_pos_.at(to);
     TORCH_CHECK(
         TransformReplay::getMatchedLeafPosWithoutReplayPasC(
-            to, from, from_pos) == to_pos);
+            to, from, from_pos) == (int)to_pos);
   }
   virtual void propagateP2C(TensorView* from, TensorView* to) override {
     TransformPropagator::propagateP2C(from, to);
@@ -327,7 +327,7 @@ struct TransformPropagatorWithCheck : public TransformPropagator {
     auto to_pos = replayed_pos_.at(to);
     TORCH_CHECK(
         TransformReplay::getMatchedLeafPosWithoutReplayCasP(
-            to, from, from_pos) == to_pos);
+            to, from, from_pos) == (int)to_pos);
   }
   virtual void propagateSibling(TensorView* from, TensorView* to) override {
     TransformPropagator::propagateSibling(from, to);
diff --git a/torch/csrc/jit/codegen/cuda/tools/stringify_file.py b/third_party/nvfuser/tools/stringify_file.py
similarity index 100%
rename from torch/csrc/jit/codegen/cuda/tools/stringify_file.py
rename to third_party/nvfuser/tools/stringify_file.py
diff --git a/third_party/onnx b/third_party/onnx
index f7ee1ac60d06..e192ba01e438 160000
--- a/third_party/onnx
+++ b/third_party/onnx
@@ -1 +1 @@
-Subproject commit f7ee1ac60d06abe8e26c9b6bbe1e3db5286b614b
+Subproject commit e192ba01e438d22ca2dedd7956e28e3551626c91
diff --git a/third_party/onnx.BUILD b/third_party/onnx.BUILD
index df5e09cad684..c5bf8c65ac05 100644
--- a/third_party/onnx.BUILD
+++ b/third_party/onnx.BUILD
@@ -76,6 +76,8 @@ cc_library(
         "onnx/version_converter/*.h",
         "onnx/common/*.h",
         "onnx/defs/*.h",
+        "onnx/defs/math/*.h",
+        "onnx/defs/reduction/*.h",
         "onnx/defs/tensor/*.h",
         "onnx/shape_inference/*.h",
         "onnx/version_converter/adapters/*.h",
diff --git a/third_party/xnnpack.buck.bzl b/third_party/xnnpack.buck.bzl
index 41f6e2e7c815..75228dc38f71 100644
--- a/third_party/xnnpack.buck.bzl
+++ b/third_party/xnnpack.buck.bzl
@@ -35,6 +35,10 @@ load(
     "PROD_SSE_MICROKERNEL_SRCS",
     "PROD_SSSE3_MICROKERNEL_SRCS",
     "PROD_XOP_MICROKERNEL_SRCS",
+    "ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS",
+    "ALL_NEON_AARCH64_MICROKERNEL_SRCS",
+    "PROD_AVX512VBMI_MICROKERNEL_SRCS",
+    "ALL_AVX512VBMI_MICROKERNEL_SRCS",
 )
 
 # This defines XNNPACK targets for both fbsource BUCK and OSS BUCK
@@ -82,7 +86,17 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
     fb_xplat_cxx_library(
         name = "operators",
         # srcs have to include HOT_SRCS to be able to build on ARVR
-        srcs = OPERATOR_SRCS + HOT_SRCS,
+        srcs = OPERATOR_SRCS + [
+            "XNNPACK/src/binary-elementwise-config.c",
+            "XNNPACK/src/packing.c",
+            "XNNPACK/src/cache.c",
+            "XNNPACK/src/indirection.c",
+            "XNNPACK/src/operator-utils.c",
+            "XNNPACK/src/normalization.c",
+            "XNNPACK/src/allocator.c",
+            "XNNPACK/src/memory.c",
+            "XNNPACK/src/mutex.c",
+        ],
         headers = subdir_glob([
             ("XNNPACK/src", "**/*.h"),
         ]),
@@ -99,12 +113,14 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
         preferred_linkage = "static",
         preprocessor_flags = [
             "-DXNN_LOG_LEVEL=0",
+            "-DXNN_ENABLE_GEMM_M_SPECIALIZATION=0",
         ],
         visibility = ["PUBLIC"],
         windows_clang_compiler_flags_override = WINDOWS_FLAGS + WINDOWS_CLANG_COMPILER_FLAGS,
         windows_compiler_flags_override = WINDOWS_FLAGS,
         deps = [
             ":interface",
+            ":ukernels_f16c",
             third_party("cpuinfo"),
             third_party("FP16"),
             third_party("FXdiv"),
@@ -131,6 +147,9 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
         preferred_linkage = "static",
         preprocessor_flags = [
             "-DXNN_LOG_LEVEL=0",
+            "-DXNN_ENABLE_JIT=0",
+            "-DXNN_ENABLE_SPARSE=0",
+            "-DXNN_ENABLE_MEMOPT",
         ],
         visibility = ["PUBLIC"],
         windows_clang_compiler_flags_override = WINDOWS_FLAGS + WINDOWS_CLANG_COMPILER_FLAGS,
@@ -1088,6 +1107,78 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
         ],
     )
 
+    fb_xplat_cxx_library(
+        name = "ukernels_avx512vbmi",
+        srcs = (select({
+            "DEFAULT": [],
+            "ovr_config//os:macos-x86_64": PROD_AVX512VBMI_MICROKERNEL_SRCS,
+        }) if is_arvr_mode() else []),
+        headers = subdir_glob([
+            ("XNNPACK/src", "**/*.c"),
+            ("XNNPACK/src", "**/*.h"),
+        ]),
+        header_namespace = "",
+        apple_sdks = (IOS, MACOSX, APPLETVOS),
+        compiler_flags = [
+            "-O2",
+            "-mavx512f",
+            "-mavx512cd",
+            "-mavx512bw",
+            "-mavx512dq",
+            "-mavx512vl",
+            "-mavx512vbmi",
+        ],
+        fbobjc_preprocessor_flags = [
+            "-DXNN_PRIVATE=",
+            "-DXNN_INTERNAL=",
+        ],
+        labels = labels,
+        platform_compiler_flags = [
+            (
+                "^(i[3-6]86|x86|x86_64|AMD64)$",
+                [
+                    "-mavx512f",
+                    "-mavx512cd",
+                    "-mavx512bw",
+                    "-mavx512dq",
+                    "-mavx512vl",
+                    "-mavx512vbmi",
+                ],
+            ),
+        ],
+        platform_srcs = ([
+            (
+                "x86|x86_64|platform009|platform010",
+                PROD_AVX512VBMI_MICROKERNEL_SRCS,
+            ),
+        ] if not is_arvr_mode() else []),
+        preferred_linkage = "static",
+        preprocessor_flags = [
+            "-DXNN_LOG_LEVEL=0",
+        ],
+        visibility = ["PUBLIC"],
+        windows_clang_compiler_flags_override = WINDOWS_FLAGS + WINDOWS_CLANG_COMPILER_FLAGS + [
+            "-mavx512f",
+            "-mavx512cd",
+            "-mavx512bw",
+            "-mavx512dq",
+            "-mavx512vl",
+            "-mavx512vbmi",
+        ],
+        windows_compiler_flags_override = WINDOWS_FLAGS + [
+            "-mavx512f",
+            "-mavx512cd",
+            "-mavx512bw",
+            "-mavx512dq",
+            "-mavx512vl",
+            "-mavx512vbmi",
+        ],
+        deps = [
+            ":interface",
+        ],
+    )
+
+
     fb_xplat_cxx_library(
         name = "ukernels_avx512_ovr_win32",
         headers = subdir_glob([
@@ -1474,7 +1565,7 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
 
     fb_xplat_cxx_library(
         name = "ukernels_neon_aarch64",
-        srcs = PROD_AARCH64_NEON_MICROKERNEL_SRCS,
+        srcs = ALL_NEON_AARCH64_MICROKERNEL_SRCS,
         headers = subdir_glob([
             ("XNNPACK/src", "**/*.c"),
             ("XNNPACK/src", "**/*.h"),
@@ -1589,6 +1680,47 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
         ],
     )
 
+    fb_xplat_cxx_library(
+        name = "ukernels_neonfma_aarch64",
+        srcs = ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS,
+        headers = subdir_glob([
+            ("XNNPACK/src", "**/*.h"),
+            ("XNNPACK/src", "**/*.c"),
+        ]),
+        header_namespace = "",
+        apple_sdks = (IOS, MACOSX, APPLETVOS),
+        compiler_flags = [
+            "-O2",
+        ],
+        fbobjc_preprocessor_flags = [
+            "-DXNN_PRIVATE=",
+            "-DXNN_INTERNAL=",
+        ],
+        labels = labels,
+        platform_compiler_flags = [
+            (
+                "^(android-armv8|iphoneos-armv8)$",
+                [
+                    "-march=armv8-a",
+                    "-mfpu=neon-fp-armv8",
+                    "-mfloat-abi=softfp",
+                ],
+            ),
+        ],
+        platforms = (APPLE, ANDROID, CXX, WINDOWS),
+        preferred_linkage = "static",
+        preprocessor_flags = [
+            "-DXNN_LOG_LEVEL=0",
+        ],
+        visibility = ["PUBLIC"],
+        windows_clang_compiler_flags_override = WINDOWS_FLAGS + WINDOWS_CLANG_COMPILER_FLAGS,
+        windows_compiler_flags_override = WINDOWS_FLAGS,
+        deps = [
+            ":interface",
+            third_party("FP16"),
+        ],
+    )
+
     fb_xplat_cxx_library(
         name = "ukernels_asm_aarch32",
         srcs = AARCH32_ASM_MICROKERNEL_SRCS,
@@ -1686,6 +1818,7 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
             ":ukernels_neon_fp16",
             ":ukernels_neon_fp16arith_aarch64",
             ":ukernels_neon_v8",
+            ":ukernels_neonfma_aarch64",
         ],
     )
 
@@ -1707,6 +1840,7 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
             ":ukernels_sse41",
             ":ukernels_ssse3",
             ":ukernels_xop",
+            ":ukernels_avx512vbmi",
         ],
     )
 
@@ -1728,6 +1862,7 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
             ":ukernels_sse_ovr_win32",
             ":ukernels_ssse3_ovr_win32",
             ":ukernels_xop_ovr_win32",
+            ":ukernels_avx512vbmi",
         ],
     )
 
@@ -1749,6 +1884,7 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
             ":ukernels_neon_fp16arith_aarch64",
             ":ukernels_neon_v8",
             ":ukernels_scalar_aarch32",
+            ":ukernels_neonfma_aarch64",
         ],
     )
 
@@ -1791,7 +1927,6 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
             ],
             # doesn't cover iphonesimulator-x86_64
             "ovr_config//runtime:arm64-linux-ubuntu-neon": [":arm64_lib"],
-            "ovr_config//runtime:platform009": [":x86_and_x86_64_lib"],
             "ovr_config//runtime:platform010": [":x86_and_x86_64_lib"],
         }),
         exported_headers = {
@@ -1820,15 +1955,23 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
             "-DXNN_NO_X32_OPERATORS",
             "-DXNN_NO_X8_OPERATORS",
             "-DXNN_NO_XX_OPERATORS",
+            "-DXNN_ENABLE_MEMOPT",
+            "-DXNN_ENABLE_SPARSE=0",
+            "-DXNN_ENABLE_JIT=0",
+            "-DXNN_ENABLE_ASSEMBLY",
+            "-DXNN_ENABLE_GEMM_M_SPECIALIZATION",
+            "-DXNN_ENABLE_ARM_DOTPROD",
         ],
         srcs = [
-            "XNNPACK/src/allocator.c",
             "XNNPACK/src/init.c",
-            "XNNPACK/src/memory-planner.c",
-            "XNNPACK/src/operator-delete.c",
-            "XNNPACK/src/runtime.c",
-            "XNNPACK/src/subgraph.c",
-            "XNNPACK/src/tensor.c",
+            "XNNPACK/src/params.c",
+            "XNNPACK/src/operator-run.c",
+            "XNNPACK/src/microparams-init.c",
+            "XNNPACK/src/x8-lut-config.c",
+            "XNNPACK/src/hardware-config.c",
+            "XNNPACK/src/transpose-config.c",
+            "XNNPACK/src/amalgam/scalar.c",
+            "XNNPACK/src/operators/post-operation.c",
         ] + LOGGING_SRCS,
         visibility = ["PUBLIC"],
         windows_clang_compiler_flags_override = (WINDOWS_FLAGS + WINDOWS_CLANG_COMPILER_FLAGS) if XNNPACK_WINDOWS_AVX512F_ENABLED else WINDOWS_FLAGS,
diff --git a/third_party/xnnpack_src_defs.bzl b/third_party/xnnpack_src_defs.bzl
index d7586e9463cd..7706bf6875de 100644
--- a/third_party/xnnpack_src_defs.bzl
+++ b/third_party/xnnpack_src_defs.bzl
@@ -2,31 +2,10 @@
 Auto-generated by generate-wrappers.py script. Do not modify
 """
 
-OPERATOR_SRCS = [
-    "XNNPACK/src/operators/argmax-pooling-nhwc.c",
-    "XNNPACK/src/operators/average-pooling-nhwc.c",
-    "XNNPACK/src/operators/binary-elementwise-nd.c",
-    "XNNPACK/src/operators/channel-shuffle-nc.c",
-    "XNNPACK/src/operators/constant-pad-nd.c",
-    "XNNPACK/src/operators/convolution-nchw.c",
-    "XNNPACK/src/operators/convolution-nhwc.c",
-    "XNNPACK/src/operators/deconvolution-nhwc.c",
-    "XNNPACK/src/operators/depth-to-space-nchw2nhwc.c",
-    "XNNPACK/src/operators/depth-to-space-nhwc.c",
-    "XNNPACK/src/operators/fully-connected-nc.c",
-    "XNNPACK/src/operators/global-average-pooling-ncw.c",
-    "XNNPACK/src/operators/global-average-pooling-nwc.c",
-    "XNNPACK/src/operators/lut-elementwise-nc.c",
-    "XNNPACK/src/operators/max-pooling-nhwc.c",
-    "XNNPACK/src/operators/prelu-nc.c",
-    "XNNPACK/src/operators/resize-bilinear-nchw.c",
-    "XNNPACK/src/operators/resize-bilinear-nhwc.c",
-    "XNNPACK/src/operators/softmax-nc.c",
-    "XNNPACK/src/operators/unary-elementwise-nc.c",
-    "XNNPACK/src/operators/unpooling-nhwc.c",
-]
-
 SUBGRAPH_SRCS = [
+    "XNNPACK/src/memory-planner.c",
+    "XNNPACK/src/runtime.c",
+    "XNNPACK/src/subgraph.c",
     "XNNPACK/src/subgraph/abs.c",
     "XNNPACK/src/subgraph/add2.c",
     "XNNPACK/src/subgraph/argmax-pooling-2d.c",
@@ -34,16 +13,19 @@ SUBGRAPH_SRCS = [
     "XNNPACK/src/subgraph/bankers-rounding.c",
     "XNNPACK/src/subgraph/ceiling.c",
     "XNNPACK/src/subgraph/clamp.c",
+    "XNNPACK/src/subgraph/concatenate.c",
     "XNNPACK/src/subgraph/convert.c",
     "XNNPACK/src/subgraph/convolution-2d.c",
+    "XNNPACK/src/subgraph/copy.c",
     "XNNPACK/src/subgraph/deconvolution-2d.c",
     "XNNPACK/src/subgraph/depth-to-space.c",
     "XNNPACK/src/subgraph/depthwise-convolution-2d.c",
     "XNNPACK/src/subgraph/divide.c",
     "XNNPACK/src/subgraph/elu.c",
+    "XNNPACK/src/subgraph/even-split.c",
     "XNNPACK/src/subgraph/floor.c",
     "XNNPACK/src/subgraph/fully-connected.c",
-    "XNNPACK/src/subgraph/global-average-pooling-2d.c",
+    "XNNPACK/src/subgraph/global-average-pooling.c",
     "XNNPACK/src/subgraph/hardswish.c",
     "XNNPACK/src/subgraph/leaky-relu.c",
     "XNNPACK/src/subgraph/max-pooling-2d.c",
@@ -54,26 +36,548 @@ SUBGRAPH_SRCS = [
     "XNNPACK/src/subgraph/prelu.c",
     "XNNPACK/src/subgraph/sigmoid.c",
     "XNNPACK/src/subgraph/softmax.c",
+    "XNNPACK/src/subgraph/space-to-depth-2d.c",
     "XNNPACK/src/subgraph/square-root.c",
     "XNNPACK/src/subgraph/square.c",
     "XNNPACK/src/subgraph/squared-difference.c",
     "XNNPACK/src/subgraph/static-constant-pad.c",
     "XNNPACK/src/subgraph/static-reshape.c",
     "XNNPACK/src/subgraph/static-resize-bilinear-2d.c",
+    "XNNPACK/src/subgraph/static-slice.c",
+    "XNNPACK/src/subgraph/static-transpose.c",
     "XNNPACK/src/subgraph/subtract.c",
     "XNNPACK/src/subgraph/unpooling-2d.c",
+    "XNNPACK/src/subgraph/validation.c",
+    "XNNPACK/src/tensor.c",
 ]
 
-LOGGING_SRCS = [
-    "XNNPACK/src/datatype-strings.c",
-    "XNNPACK/src/operator-strings.c",
-    "XNNPACK/src/subgraph-strings.c",
+HOT_SRCS = [
 ]
 
-HOT_SRCS = [
-    "XNNPACK/src/indirection.c",
-    "XNNPACK/src/operator-run.c",
-    "XNNPACK/src/packing.c",
+ALL_AVX512F_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c16s4r-minmax-avx512f-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c16s4r-minmax-avx512f.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l32c16s4r-minmax-avx512f-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l32c16s4r-minmax-avx512f.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l64c16s4r-minmax-avx512f-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l64c16s4r-minmax-avx512f.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p16c-minmax-avx512f-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p16c-minmax-avx512f.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p32c-minmax-avx512f-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p32c-minmax-avx512f.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p16c-minmax-avx512f-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p16c-minmax-avx512f.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p32c-minmax-avx512f-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p32c-minmax-avx512f.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p16c-minmax-avx512f-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p16c-minmax-avx512f.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p32c-minmax-avx512f-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p32c-minmax-avx512f.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p16c-minmax-avx512f-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p16c-minmax-avx512f.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p32c-minmax-avx512f-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p32c-minmax-avx512f.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-7x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-8x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-7x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-8x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-7x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-8x16-minmax-avx512f-broadcast.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-avx512f-2x16.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-avx512f-2x32.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x128-acc2.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x128-acc4.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x128.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x144-acc3.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x144.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x160-acc2.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x160-acc5.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x160.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x192-acc2.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x192-acc3.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x192-acc6.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x192.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x128-acc2.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x128-acc4.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x128.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x144-acc3.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x144.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x160-acc2.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x160-acc5.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x160.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x192-acc2.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x192-acc3.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x192-acc6.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x192.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x128-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x128-acc4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x128.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x144-acc3.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x144.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x160-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x160-acc5.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x160.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x192-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x192-acc3.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x192-acc6.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x192.c",
+    "XNNPACK/src/f32-rmax/f32-rmax-avx512f.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-avx512f-x32.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-avx512f-x32.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-avx512f-x16.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-avx512f-x32.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x32.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x48.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x64.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x80.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x96.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x112.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x128.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-p6-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-p6-x32.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-p6-x48.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-p6-x64.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-p6-x80.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-p6-x96.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-p6-x112.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx512f-rr1-p6-x128.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-avx512f-x16.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-avx512f-x32.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-avx512f-x16.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-avx512f-x32.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-avx512f-x16.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-avx512f-x32.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-avx512f-x16.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-avx512f-x32.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-avx512f-x16.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-avx512f-x32.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-avx512f-x16.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-avx512f-x32.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-avx512f-x16.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-avx512f-x32.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x16.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x32.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x48.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x64.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x80.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x96.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x112.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x128.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x144.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x160.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x176.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x192.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x16.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x32.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x48.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x64.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x80.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x96.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x112.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x128.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x144.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x160.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x176.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x192.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x32.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x48.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x64.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x80.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x96.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x112.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x128.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x32.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x48.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x64.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x80.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x96.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x112.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x128.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x32.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x48.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x64.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x80.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x96.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x112.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x128.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x32.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x48.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x64.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x80.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x96.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x112.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x128.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x32.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x48.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x64.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x80.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x96.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x112.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x128.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x32.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x48.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x64.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x80.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x96.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x112.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x128.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x16.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x32.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x48.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x64.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x80.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x96.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x112.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x128.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vabs-avx512f-x16.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vabs-avx512f-x32.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vneg-avx512f-x16.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vneg-avx512f-x32.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vsqr-avx512f-x16.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vsqr-avx512f-x32.c",
+    "XNNPACK/src/math/exp-f32-avx512f-rr2-lut16-p3-perm-scalef.c",
+    "XNNPACK/src/math/exp-f32-avx512f-rr2-lut16-p3-perm.c",
+    "XNNPACK/src/math/exp-f32-avx512f-rr2-lut32-p2-perm2-scalef.c",
+    "XNNPACK/src/math/exp-f32-avx512f-rr2-lut32-p2-perm2.c",
+    "XNNPACK/src/math/exp-f32-avx512f-rr2-p5-scalef.c",
+    "XNNPACK/src/math/exp-f32-avx512f-rr2-p5.c",
+    "XNNPACK/src/math/expm1minus-f32-avx512f-rr1-lut16-p3-perm.c",
+    "XNNPACK/src/math/expm1minus-f32-avx512f-rr1-p6.c",
+    "XNNPACK/src/math/extexp-avx512f-p5.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr1-lut16-p3-perm-scalef-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr1-lut16-p3-perm-scalef-nr1fma1adj.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr1-lut16-p3-perm-scalef-nr1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr1-lut32-p2-perm2-scalef-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr1-lut32-p2-perm2-scalef-nr1fma1adj.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr1-lut32-p2-perm2-scalef-nr1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr1-lut64-p2-gather-scalef-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr1-lut64-p2-gather-scalef-nr1fma1adj.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr1-lut64-p2-gather-scalef-nr1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr1-p5-scalef-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr1-p5-scalef-nr1fma1adj.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr1-p5-scalef-nr1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr2-lut16-p3-perm-scalef-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr2-lut16-p3-perm-scalef-nr1fma1adj.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr2-lut16-p3-perm-scalef-nr1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr2-lut32-p2-perm2-scalef-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma1adj.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr2-lut64-p2-gather-scalef-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr2-lut64-p2-gather-scalef-nr1fma1adj.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr2-lut64-p2-gather-scalef-nr1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr2-p5-scalef-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr2-p5-scalef-nr1fma1adj.c",
+    "XNNPACK/src/math/sigmoid-f32-avx512f-rr2-p5-scalef-nr1fma.c",
+    "XNNPACK/src/math/sqrt-f32-avx512f-nr1fma1adj.c",
+    "XNNPACK/src/math/sqrt-f32-avx512f-nr1fma.c",
+    "XNNPACK/src/math/sqrt-f32-avx512f-nr2fma.c",
+]
+
+PROD_AVX2_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/avx2.c",
+]
+
+PROD_SCALAR_AARCH32_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/scalar-aarch32.c",
+]
+
+ALL_WASM_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f32-avgpool/f32-avgpool-9p8x-minmax-wasm-c1.c",
+    "XNNPACK/src/f32-avgpool/f32-avgpool-9x-minmax-wasm-c1.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p1c-minmax-wasm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p1c-minmax-wasm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p2c-minmax-wasm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p2c-minmax-wasm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p1c-minmax-wasm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p1c-minmax-wasm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p2c-minmax-wasm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p2c-minmax-wasm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p1c-minmax-wasm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p1c-minmax-wasm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p2c-minmax-wasm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p2c-minmax-wasm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p1c-minmax-wasm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p1c-minmax-wasm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p2c-minmax-wasm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p2c-minmax-wasm.c",
+    "XNNPACK/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasm-c1.c",
+    "XNNPACK/src/f32-gavgpool/f32-gavgpool-7x-minmax-wasm-c1.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x4-minmax-wasm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x4-relu-wasm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-2x4-minmax-wasm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-2x4-relu-wasm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-minmax-wasm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-relu-wasm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x4-minmax-wasm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x4-relu-wasm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x4-minmax-wasm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-2x4-minmax-wasm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x4-minmax-wasm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x4-minmax-wasm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x4-relu-wasm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-2x4-minmax-wasm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-2x4-relu-wasm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2-minmax-wasm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2-relu-wasm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x4-minmax-wasm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x4-relu-wasm.c",
+    "XNNPACK/src/f32-maxpool/f32-maxpool-9p8x-minmax-wasm-c1.c",
+    "XNNPACK/src/f32-pavgpool/f32-pavgpool-9p8x-minmax-wasm-c1.c",
+    "XNNPACK/src/f32-pavgpool/f32-pavgpool-9x-minmax-wasm-c1.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasm-2x1.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasm-2x4.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-x1.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-x2.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-x3.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasm-fmagic-x4.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-x1.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-x2.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-x3.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasm-fmagic-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-relu-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-relu-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-relu-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-relu-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-relu-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-relu-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-relu-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-relu-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-relu-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-relu-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-relu-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-relu-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-relu-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-relu-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-relu-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-relu-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-relu-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-relu-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-relu-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-relu-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-relu-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-relu-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-relu-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-relu-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-relu-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-relu-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-relu-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-relu-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-relu-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-relu-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-relu-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-relu-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-relu-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-relu-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-relu-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-relu-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-wasm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-relu-wasm-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-relu-wasm-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-relu-wasm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-relu-wasm-x8.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-wasm-x1.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-wasm-x2.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-wasm-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasm-rr2-lut16-p3-x1.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasm-rr2-lut16-p3-x2.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasm-rr2-lut16-p3-x3.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasm-rr2-lut16-p3-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasm-rr2-lut16-p3-x5.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasm-rr2-lut16-p3-x6.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasm-rr2-p6-x1.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasm-rr2-p6-x2.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasm-rr2-p6-x3.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasm-rr2-p6-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasm-rr2-p6-x5.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasm-rr2-p6-x6.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-wasm-x1.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-wasm-x2.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-wasm-x4.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-wasm-x1.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-wasm-x2.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-wasm-x4.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c1-minmax-wasm-2x.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c2-minmax-wasm-2x.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c4-minmax-wasm-2x.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-wasm-x1.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-wasm-x2.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-wasm-x4.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-wasm-x8.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p2c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x2-minmax-fp32-wasm-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4-minmax-fp32-wasm-fmagic.c",
+]
+
+PROD_AVX512F_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/avx512f.c",
 ]
 
 TABLE_SRCS = [
@@ -84,471 +588,7118 @@ TABLE_SRCS = [
     "XNNPACK/src/tables/exp2minus-k-over-16.c",
     "XNNPACK/src/tables/exp2minus-k-over-64.c",
     "XNNPACK/src/tables/exp2minus-k-over-2048.c",
+    "XNNPACK/src/tables/vlog.c",
+]
+
+ALL_AVX_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-x24.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-x32.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-x24.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-x32.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c8s4r-minmax-avx-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c8s4r-minmax-avx.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c8s4r-minmax-avx-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c8s4r-minmax-avx.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l32c8s4r-minmax-avx-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l32c8s4r-minmax-avx.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-avx-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-avx.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p16c-minmax-avx-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p16c-minmax-avx.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-avx-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-avx.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p16c-minmax-avx-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p16c-minmax-avx.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-avx-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-avx.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p16c-minmax-avx-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p16c-minmax-avx.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-avx-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-avx.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p16c-minmax-avx-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p16c-minmax-avx.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-avx-x8.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-avx-x16.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-avx-x24.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-avx-x32.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x16-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x16-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x16-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x16-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-7x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x16-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x16-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x16-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x16-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-7x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x16-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x16-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x16-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x16-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-7x8-minmax-avx-broadcast.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-avx-2x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-avx-2x16.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-x8.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-x16.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-x24.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-x32.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-x8.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-x16.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-x24.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-x32.c",
+    "XNNPACK/src/f32-rmax/f32-rmax-avx.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-avx-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-avx-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-avx-x16.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-avx-x8.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-avx-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-x32.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-x40.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-x48.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-lut16-p3-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-lut16-p3-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-lut16-p3-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-lut16-p3-x32.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-lut16-p3-x40.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-lut16-p3-x48.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-p6-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-p6-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-p6-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-p6-x32.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-p6-x40.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx-rr2-p6-x48.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-avx-x8.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-avx-x16.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-avx-x8.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-avx-x16.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-avx-x8.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-avx-x16.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-avx-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-avx-x16.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-avx-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-avx-x16.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-avx-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-avx-x16.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-avx-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-avx-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x32.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x40.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x48.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x56.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x64.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x72.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x80.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x32.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x40.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x48.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x56.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x64.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x72.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x80.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-x8.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-x16.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vabs-avx-x8.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vabs-avx-x16.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vneg-avx-x8.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vneg-avx-x16.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vsqr-avx-x8.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vsqr-avx-x16.c",
+    "XNNPACK/src/math/exp-f32-avx-rr2-p5.c",
+    "XNNPACK/src/math/expm1minus-f32-avx-rr2-lut4-p4-perm.c",
+    "XNNPACK/src/math/expm1minus-f32-avx-rr2-lut16-p3.c",
+    "XNNPACK/src/math/expm1minus-f32-avx-rr2-p6.c",
+    "XNNPACK/src/math/sigmoid-f32-avx-rr2-lut64-p2-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx-rr2-p5-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx-rr2-p5-nr1.c",
+    "XNNPACK/src/math/sigmoid-f32-avx-rr2-p5-nr2.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-avx-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-avx-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-avx-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-avx-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-avx-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx-x8.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx-x16.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx-x24.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx-x32.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-xw-minmax-fp32-avx.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-xw-minmax-fp32-avx.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-avx.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-xw-minmax-fp32-avx.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-xw-minmax-fp32-avx.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-avx.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-xw-minmax-fp32-avx.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-xw-minmax-fp32-avx.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-avx.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-xw-minmax-fp32-avx.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-xw-minmax-fp32-avx.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx-mul16-ld64-x8.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx-mul16-ld64-x16.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx-mul16-ld64-x24.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx-mul16-ld64-x32.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx-mul32-ld32-x8.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx-mul32-ld32-x16.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx-mul32-ld32-x24.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx-mul32-ld32-x32.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul16-ld64-x8.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul16-ld64-x16.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul16-ld64-x24.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul16-ld64-x32.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul32-ld32-x8.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul32-ld32-x16.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul32-ld32-x24.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul32-ld32-x32.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-avx-x8.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-avx-x16.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-avx-x32.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-avx-x8.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-avx-x16.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-avx-x32.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-avx-mul16-ld64-x8.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-avx-mul16-ld64-x16.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-avx-mul16-ld64-x8.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-avx-mul16-ld64-x16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-avx-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-avx-mul32.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx-x8.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx-x16.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx-x24.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx-x32.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-avx-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-avx-ld128.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-avx-mul16-ld64-x8.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-avx-mul16-ld64-x16.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-avx-mul32-ld32-x8.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-avx-mul32-ld32-x16.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-avx-mul16-ld64-x8.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-avx-mul16-ld64-x16.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-avx-mul32-ld32-x8.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-avx-mul32-ld32-x16.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-avx-x8.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-avx-x16.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-avx-x32.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-avx-x8.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-avx-x16.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-avx-x32.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-avx-mul16-ld64-x8.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-avx-mul16-ld64-x16.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-avx-mul16-ld64-x8.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-avx-mul16-ld64-x16.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx-x16.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx-x32.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx-x48.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx-x64.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-8x8-multi-mov-avx.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-8x8-multi-switch-avx.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-8x8-reuse-mov-avx.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-8x8-reuse-multi-avx.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-8x8-reuse-switch-avx.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-4x4-multi-mov-avx.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-4x4-multi-multi-avx.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-4x4-multi-switch-avx.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-4x4-reuse-mov-avx.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-4x4-reuse-multi-avx.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-4x4-reuse-switch-avx.c",
+]
+
+AARCH64_ASM_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "XNNPACK/src/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "XNNPACK/src/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S",
+    "XNNPACK/src/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S",
+    "XNNPACK/src/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S",
+    "XNNPACK/src/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "XNNPACK/src/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "XNNPACK/src/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "XNNPACK/src/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-prfm-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-prfm-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-prfm-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld128.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld128.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-igemm/f32-igemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-igemm/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "XNNPACK/src/f32-igemm/f32-igemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "XNNPACK/src/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-prfm-cortex-a53.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-ld64.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld128.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-prfm-cortex-a53.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld128.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-prfm-cortex-a53.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mull.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mull.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mull.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-rndnu-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c16-minmax-rndnu-asm-aarch64-neon-mlal.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-cortex-a75.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-cortex-a75.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S",
+]
+
+ALL_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-x24.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-x32.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-x24.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-x32.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-wasmrelaxedsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-wasmrelaxedsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-wasmrelaxedsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-wasmrelaxedsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-wasmrelaxedsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-wasmrelaxedsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-wasmrelaxedsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-wasmrelaxedsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-wasmrelaxedsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-wasmrelaxedsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-wasmrelaxedsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-wasmrelaxedsimd-fma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-wasmrelaxedsimd-x8.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-wasmrelaxedsimd-x16.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-wasmrelaxedsimd-x24.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-wasmrelaxedsimd-x32.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-relu-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-relu-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8s4-relu-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8s4-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-relu-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-relu-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8s4-relu-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8s4-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2c4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2c4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2c4-relu-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2c4-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-relu-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-relu-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8s4-relu-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8s4-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-relu-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-relu-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8s4-relu-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8s4-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-relu-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-relu-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8s4-relu-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8s4-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-ibilinear/gen/f32-ibilinear-wasmrelaxedsimd-c4.c",
+    "XNNPACK/src/f32-ibilinear/gen/f32-ibilinear-wasmrelaxedsimd-c8.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-relu-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-relu-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8s4-relu-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8s4-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-relu-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-relu-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8s4-relu-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8s4-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2c4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2c4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2c4-relu-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2c4-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-relu-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-relu-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8s4-relu-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8s4-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-relu-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-relu-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8s4-relu-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8s4-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-wasmrelaxedsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-wasmrelaxedsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-relu-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-relu-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-wasmrelaxedsimd-fma-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-wasmrelaxedsimd-fma-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8s4-minmax-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8s4-minmax-wasmrelaxedsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8s4-relu-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8s4-wasmrelaxedsimd-fma.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-1x16.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-2x16.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-iminmax-4x16.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-1x16.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-2x16.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmrelaxedsimd-laneselect-4x16.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-x4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-x8-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-x8.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-x12-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-x12-acc3.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-x12.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-x16-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-x16-acc4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-x16.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-x20-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-x20-acc5.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmrelaxedsimd-rr2-p5-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-fma-rr2-lut16-p3-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-fma-rr2-lut16-p3-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-fma-rr2-lut16-p3-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-fma-rr2-lut16-p3-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-fma-rr2-lut16-p3-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-fma-rr2-lut16-p3-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-fma-rr2-p6-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-fma-rr2-p6-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-fma-rr2-p6-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-fma-rr2-p6-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-fma-rr2-p6-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-fma-rr2-p6-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-rr2-lut16-p3-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-rr2-lut16-p3-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-rr2-lut16-p3-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-rr2-lut16-p3-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-rr2-lut16-p3-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-rr2-lut16-p3-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-rr2-p6-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-rr2-p6-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-rr2-p6-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-rr2-p6-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-rr2-p6-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmrelaxedsimd-rr2-p6-x24.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-wasmrelaxedsimd-iminmax-x4.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-wasmrelaxedsimd-iminmax-x8.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-wasmrelaxedsimd-laneselect-x4.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-wasmrelaxedsimd-laneselect-x8.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c4-minmax-wasmrelaxedsimd-2x.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c4-minmax-wasmrelaxedsimd-fma-2x.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c8-minmax-wasmrelaxedsimd-2x.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c8-minmax-wasmrelaxedsimd-fma-2x.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-fma-rr2-lut64-p2-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-fma-rr2-lut64-p2-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-fma-rr2-lut64-p2-div-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-fma-rr2-lut64-p2-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-fma-rr2-lut64-p2-div-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-fma-rr2-lut64-p2-div-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-fma-rr2-p5-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-fma-rr2-p5-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-fma-rr2-p5-div-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-fma-rr2-p5-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-fma-rr2-p5-div-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-fma-rr2-p5-div-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-lut64-p2-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-lut64-p2-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-lut64-p2-div-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-lut64-p2-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-lut64-p2-div-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-lut64-p2-div-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-x24.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-wasmrelaxedsimd-x8.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-wasmrelaxedsimd-x16.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-wasmrelaxedsimd-x32.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-wasmrelaxedsimd-arm-x16.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-wasmrelaxedsimd-arm-x32.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-wasmrelaxedsimd-x86-x8.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-wasmrelaxedsimd-x86-x16.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-wasmrelaxedsimd-x86-x32.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-wasmrelaxedsimd-x8.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-wasmrelaxedsimd-x16.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-wasmrelaxedsimd-x32.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-wasmrelaxedsimd-arm-x16.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-wasmrelaxedsimd-arm-x32.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-wasmrelaxedsimd-x86-x8.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-wasmrelaxedsimd-x86-x16.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-wasmrelaxedsimd-x86-x32.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-wasmpshufb-x16.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-wasmpshufb-x32.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-wasmpshufb-x48.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-wasmpshufb-x64.c",
+]
+
+ALL_XOP_MICROKERNEL_SRCS = [
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-xop-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-xop-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-xop-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-xop-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-xop-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-xop-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-xop-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-xop-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-xop-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-xw-minmax-fp32-xop.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-xw-minmax-fp32-xop.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-xop.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-xw-minmax-fp32-xop.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-xw-minmax-fp32-xop.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-xop.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-xw-minmax-fp32-xop.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-xw-minmax-fp32-xop.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-xop.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-xw-minmax-fp32-xop.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-xw-minmax-fp32-xop.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-xop-mul32-ld32-x8.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-xop-mul32-ld32-x16.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-xop-mul32-ld32-x24.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-xop-mul32-ld32-x32.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-xop-mul32-ld32-x8.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-xop-mul32-ld32-x16.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-xop-mul32-ld32-x24.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-xop-mul32-ld32-x32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-xop-mul32.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-xop-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-xop-ld128.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-xop-mul32-ld32-x8.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-xop-mul32-ld32-x16.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-xop-mul32-ld32-x8.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-xop-mul32-ld32-x16.c",
+]
+
+ALL_FMA3_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-3p8c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-3p8c-minmax-fma3.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-3p16c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-3p16c-minmax-fma3.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-3p32c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-3p32c-minmax-fma3.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-4p8c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-4p8c-minmax-fma3.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-4p16c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-4p16c-minmax-fma3.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-4p32c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-4p32c-minmax-fma3.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-9p8c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-9p8c-minmax-fma3.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-9p16c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-9p16c-minmax-fma3.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-9p32c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-9p32c-minmax-fma3.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-25p8c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-25p8c-minmax-fma3.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-25p16c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-25p16c-minmax-fma3.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-25p32c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-25p32c-minmax-fma3.c",
+    "XNNPACK/src/f16-ibilinear/gen/f16-ibilinear-fma3-c8.c",
+    "XNNPACK/src/f16-ibilinear/gen/f16-ibilinear-fma3-c16.c",
+    "XNNPACK/src/f16-vmulcaddc/gen/f16-vmulcaddc-c8-minmax-fma3-2x.c",
+    "XNNPACK/src/f16-vmulcaddc/gen/f16-vmulcaddc-c16-minmax-fma3-2x.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c8s4r-minmax-fma3-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c8s4r-minmax-fma3.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c8s4r-minmax-fma3-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c8s4r-minmax-fma3.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l32c8s4r-minmax-fma3-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l32c8s4r-minmax-fma3.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-fma3.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p16c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p16c-minmax-fma3.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-fma3.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p16c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p16c-minmax-fma3.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-fma3.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p16c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p16c-minmax-fma3.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-fma3.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p16c-minmax-fma3-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p16c-minmax-fma3.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x16-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x16s4-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x16-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x16s4-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x16-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x16s4-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x16-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x16s4-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-7x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-8x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x16-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x16s4-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x16-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x16s4-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x16-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x16s4-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x16-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x16s4-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-7x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-8x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x16-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x16s4-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x16-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x16s4-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x16-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x16s4-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x16-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x16s4-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-7x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-8x8-minmax-fma3-broadcast.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-fma3-x8.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-fma3-x16.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x8.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x16.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x24.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x32.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x40.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x48.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x56.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x64.c",
+    "XNNPACK/src/math/sqrt-f32-fma3-nr1fma1adj.c",
+    "XNNPACK/src/math/sqrt-f32-fma3-nr1fma.c",
+    "XNNPACK/src/math/sqrt-f32-fma3-nr2fma.c",
+]
+
+PROD_ARMSIMD32_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/armsimd32.c",
+]
+
+PROD_XOP_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/xop.c",
+]
+
+ALL_SSE2_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-x24.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-x32.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-x24.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-x32.c",
+    "XNNPACK/src/f16-vunary/gen/f16-vabs-sse2-x8.c",
+    "XNNPACK/src/f16-vunary/gen/f16-vabs-sse2-x16.c",
+    "XNNPACK/src/f16-vunary/gen/f16-vneg-sse2-x8.c",
+    "XNNPACK/src/f16-vunary/gen/f16-vneg-sse2-x16.c",
+    "XNNPACK/src/f32-argmaxpool/f32-argmaxpool-4x-sse2-c4.c",
+    "XNNPACK/src/f32-argmaxpool/f32-argmaxpool-9p8x-sse2-c4.c",
+    "XNNPACK/src/f32-argmaxpool/f32-argmaxpool-9x-sse2-c4.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-x8.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-x16.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-x24.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-x32.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-sse2-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-minmax-sse2-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-sse2-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-sse2-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-sse2-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8-minmax-sse2-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-sse2-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-sse2-dup.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-sse2-dup.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-minmax-sse2-dup.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-sse2-dup.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-sse2-dup.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-sse2-2x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-sse2-2x8.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-x8.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-x16.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-x24.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-x32.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-x8.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-x16.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-x24.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-x32.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x8-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x8.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x12-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x12-acc3.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x12.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x16-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x16-acc4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x16.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x20-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x20-acc5.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse2-rr2-p6-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse2-rr2-p6-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse2-rr2-p6-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse2-rr2-p6-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse2-rr2-p6-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse2-rr2-p6-x24.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-sse2-x4.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-sse2-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-sse2-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-sse2-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-sse2-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-sse2-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-sse2-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-sse2-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-sse2-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-sse2-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-lut64-p2-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-lut64-p2-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-lut64-p2-div-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-lut64-p2-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-lut64-p2-div-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-lut64-p2-div-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-p5-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-p5-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-p5-div-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-p5-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-p5-div-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-p5-div-x24.c",
+    "XNNPACK/src/math/cvt-f16-f32-sse2-int16.c",
+    "XNNPACK/src/math/cvt-f16-f32-sse2-int32.c",
+    "XNNPACK/src/math/cvt-f32-f16-sse2.c",
+    "XNNPACK/src/math/exp-f32-sse2-rr2-lut64-p2.c",
+    "XNNPACK/src/math/exp-f32-sse2-rr2-p5.c",
+    "XNNPACK/src/math/expm1minus-f32-sse2-rr2-lut16-p3.c",
+    "XNNPACK/src/math/expm1minus-f32-sse2-rr2-p6.c",
+    "XNNPACK/src/math/expminus-f32-sse2-rr2-p5.c",
+    "XNNPACK/src/math/roundd-sse2-cvt.c",
+    "XNNPACK/src/math/roundne-sse2-cvt.c",
+    "XNNPACK/src/math/roundu-sse2-cvt.c",
+    "XNNPACK/src/math/roundz-sse2-cvt.c",
+    "XNNPACK/src/math/sigmoid-f32-sse2-rr2-lut64-p2-div.c",
+    "XNNPACK/src/math/sigmoid-f32-sse2-rr2-lut64-p2-nr1.c",
+    "XNNPACK/src/math/sigmoid-f32-sse2-rr2-lut64-p2-nr2.c",
+    "XNNPACK/src/math/sigmoid-f32-sse2-rr2-p5-div.c",
+    "XNNPACK/src/math/sigmoid-f32-sse2-rr2-p5-nr1.c",
+    "XNNPACK/src/math/sigmoid-f32-sse2-rr2-p5-nr2.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p8c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-sse2-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-sse2-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-sse2-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-sse2-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse2-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-sse2-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse2-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-sse2-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-x8.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-x16.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-x24.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-x32.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c8.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c16.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c24.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c8.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c16.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c24.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-xw-minmax-fp32-sse2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-xw-minmax-fp32-sse2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-sse2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-xw-minmax-fp32-sse2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-xw-minmax-fp32-sse2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-sse2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-xw-minmax-fp32-sse2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-xw-minmax-fp32-sse2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-sse2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-xw-minmax-fp32-sse2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-xw-minmax-fp32-sse2.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-fp32-sse2.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-gemmlowp-sse2.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-rndna-sse2.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-x8.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-x16.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-x24.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-x32.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-sse2-mul16-ld64-x8.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-sse2-mul16-ld64-x16.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-sse2-mul16-ld64-x24.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-sse2-mul16-ld64-x32.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-sse2-x16.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-sse2-x32.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-x16.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-sse2-x32.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-sse2-mul16-ld64-x8.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-sse2-mul16-ld64-x16.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-sse2-mul16-ld64-x8.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-sse2-mul16-ld64-x16.c",
+    "XNNPACK/src/qu8-avgpool/qu8-avgpool-9p8x-minmax-fp32-sse2-c8.c",
+    "XNNPACK/src/qu8-avgpool/qu8-avgpool-9x-minmax-fp32-sse2-c8.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-sse2-mul16.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-x8.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-x16.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-x24.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-x32.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c8.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c16.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c24.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c8.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c16.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c24.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-fp32-sse2.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-gemmlowp-sse2.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-rndna-sse2.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-sse2-mul16-ld64-x8.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-sse2-mul16-ld64-x16.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-sse2-mul16-ld64-x8.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-sse2-mul16-ld64-x16.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-sse2-x16.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-sse2-x32.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-x16.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-sse2-x32.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-sse2-mul16-ld64-x8.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-sse2-mul16-ld64-x16.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-sse2-mul16-ld64-x8.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-sse2-mul16-ld64-x16.c",
+    "XNNPACK/src/s8-ibilinear/gen/s8-ibilinear-sse2-c8.c",
+    "XNNPACK/src/s8-ibilinear/gen/s8-ibilinear-sse2-c16.c",
+    "XNNPACK/src/s8-maxpool/s8-maxpool-9p8x-minmax-sse2-c16.c",
+    "XNNPACK/src/s8-vclamp/s8-vclamp-sse2-x64.c",
+    "XNNPACK/src/u8-ibilinear/gen/u8-ibilinear-sse2-c8.c",
+    "XNNPACK/src/u8-ibilinear/gen/u8-ibilinear-sse2-c16.c",
+    "XNNPACK/src/u8-maxpool/u8-maxpool-9p8x-minmax-sse2-c16.c",
+    "XNNPACK/src/u8-rmax/u8-rmax-sse2.c",
+    "XNNPACK/src/u8-vclamp/u8-vclamp-sse2-x64.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-16x16-reuse-mov-sse2.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-16x16-reuse-switch-sse2.c",
+    "XNNPACK/src/x8-zip/x8-zip-x2-sse2.c",
+    "XNNPACK/src/x8-zip/x8-zip-x3-sse2.c",
+    "XNNPACK/src/x8-zip/x8-zip-x4-sse2.c",
+    "XNNPACK/src/x8-zip/x8-zip-xm-sse2.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-multi-mov-sse2.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-multi-switch-sse2.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-reuse-mov-sse2.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-reuse-multi-sse2.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-reuse-switch-sse2.c",
+    "XNNPACK/src/x16-transposec/x16-transposec-4x8-sse2.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-multi-mov-sse2.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-multi-multi-sse2.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-multi-switch-sse2.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-reuse-mov-sse2.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-reuse-multi-sse2.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-reuse-switch-sse2.c",
+    "XNNPACK/src/x32-unpool/x32-unpool-sse2.c",
+    "XNNPACK/src/x32-zip/x32-zip-x2-sse2.c",
+    "XNNPACK/src/x32-zip/x32-zip-x3-sse2.c",
+    "XNNPACK/src/x32-zip/x32-zip-x4-sse2.c",
+    "XNNPACK/src/x32-zip/x32-zip-xm-sse2.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-multi-mov-sse2.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-multi-multi-sse2.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-multi-switch-sse2.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-reuse-mov-sse2.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-reuse-multi-sse2.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-reuse-switch-sse2.c",
+    "XNNPACK/src/xx-fill/xx-fill-sse2-x64.c",
+    "XNNPACK/src/xx-pad/xx-pad-sse2.c",
+]
+
+LOGGING_SRCS = [
+    "XNNPACK/src/enums/datatype-strings.c",
+    "XNNPACK/src/enums/microkernel-type.c",
+    "XNNPACK/src/enums/node-type.c",
+    "XNNPACK/src/enums/operator-type.c",
+    "XNNPACK/src/log.c",
+]
+
+PROD_AVX512VBMI_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/avx512vbmi.c",
+]
+
+PROD_FP16ARITH_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/fp16arith.c",
+]
+
+AARCH32_ASM_MICROKERNEL_SRCS = [
+    "XNNPACK/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x1.S",
+    "XNNPACK/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x2.S",
+    "XNNPACK/src/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x4.S",
+    "XNNPACK/src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S",
+    "XNNPACK/src/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S",
+    "XNNPACK/src/f32-gemm/f32-gemm-4x4-asm-aarch32-vfp-ld64.S",
+    "XNNPACK/src/f32-gemm/f32-gemm-4x4-minmax-asm-aarch32-vfp-ld64.S",
+    "XNNPACK/src/f32-gemm/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-ld64.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-prfm-cortex-a53.S",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-prfm-cortex-a75.S",
+    "XNNPACK/src/f32-igemm/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-ld64.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-prfm-cortex-a53.S",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-prfm-cortex-a75.S",
+    "XNNPACK/src/qc8-dwconv/qc8-dwconv-3p8c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S",
+    "XNNPACK/src/qc8-dwconv/qc8-dwconv-3p16c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-cortex-a35.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-cortex-a35.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8c4-minmax-fp32-asm-aarch32-neondot-cortex-a55.S",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8c4-minmax-fp32-asm-aarch32-neondot-ld64.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-cortex-a35.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-cortex-a35.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8c4-minmax-fp32-asm-aarch32-neondot-cortex-a55.S",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8c4-minmax-fp32-asm-aarch32-neondot-ld64.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-cortex-a55.S",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-ld64.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-cortex-a55.S",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-ld64.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "XNNPACK/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-arm-x1.S",
+    "XNNPACK/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x1.S",
+    "XNNPACK/src/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S",
+]
+
+PROD_F16C_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/f16c.c",
+]
+
+ALL_F16C_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-avgpool/f16-avgpool-9p8x-minmax-f16c-c8.c",
+    "XNNPACK/src/f16-avgpool/f16-avgpool-9x-minmax-f16c-c8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-x16.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c8.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c16.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c24.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c32.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c8.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c16.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c",
+    "XNNPACK/src/f16-maxpool/f16-maxpool-9p8x-minmax-f16c-c8.c",
+    "XNNPACK/src/f16-prelu/gen/f16-prelu-f16c-2x8.c",
+    "XNNPACK/src/f16-prelu/gen/f16-prelu-f16c-2x16.c",
+    "XNNPACK/src/f16-rmax/f16-rmax-f16c.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vadd-minmax-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vadd-minmax-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vaddc-minmax-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vaddc-minmax-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vdiv-minmax-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vdiv-minmax-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vdivc-minmax-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vdivc-minmax-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmax-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmax-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmaxc-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmaxc-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmin-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmin-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vminc-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vminc-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmul-minmax-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmul-minmax-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmulc-minmax-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmulc-minmax-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrdivc-minmax-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrsubc-minmax-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiff-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiff-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiffc-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiffc-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsub-minmax-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsub-minmax-f16c-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsubc-minmax-f16c-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsubc-minmax-f16c-x16.c",
+    "XNNPACK/src/f16-vclamp/gen/f16-vclamp-f16c-x8.c",
+    "XNNPACK/src/f16-vclamp/gen/f16-vclamp-f16c-x16.c",
+    "XNNPACK/src/f16-vhswish/gen/f16-vhswish-f16c-x8.c",
+    "XNNPACK/src/f16-vhswish/gen/f16-vhswish-f16c-x16.c",
+    "XNNPACK/src/f16-vlrelu/gen/f16-vlrelu-f16c-x8.c",
+    "XNNPACK/src/f16-vlrelu/gen/f16-vlrelu-f16c-x16.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndd-f16c-x8.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndd-f16c-x16.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndne-f16c-x8.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndne-f16c-x16.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndu-f16c-x8.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndu-f16c-x16.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndz-f16c-x8.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndz-f16c-x16.c",
+    "XNNPACK/src/f16-vsqrt/gen/f16-vsqrt-f16c-sqrt-x8.c",
+    "XNNPACK/src/f16-vsqrt/gen/f16-vsqrt-f16c-sqrt-x16.c",
+    "XNNPACK/src/f16-vunary/gen/f16-vsqr-f16c-x8.c",
+    "XNNPACK/src/f16-vunary/gen/f16-vsqr-f16c-x16.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-f16c-x8.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-f16c-x16.c",
+    "XNNPACK/src/math/cvt-f16-f32-f16c.c",
+    "XNNPACK/src/math/cvt-f32-f16-f16c.c",
+]
+
+PROD_AVX_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/avx.c",
+]
+
+ALL_ARMSIMD32_MICROKERNEL_SRCS = [
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x1c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x2c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x1c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x2c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x1c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x2c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x1c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x2c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x1c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x2c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x1c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x2c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x1c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x2c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x1c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x2c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-armsimd32-x4.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-armsimd32-x8.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-x4.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-x8.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x1c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x2c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x1c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x2c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x1c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x2c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x1c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x2c4-minmax-fp32-armsimd32.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-armsimd32-x4.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-armsimd32-x8.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-x4.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-x8.c",
+]
+
+PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS = [
+]
+
+ALL_NEONFMA_MICROKERNEL_SRCS = [
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-1x4c8-minmax-neonfma-shland.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-1x4c8-minmax-neonfma-zip.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-2x4c8-minmax-neonfma-shland.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-2x4c8-minmax-neonfma-zip.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-3x4c8-minmax-neonfma-shland.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-3x4c8-minmax-neonfma-zip.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-4x4c8-minmax-neonfma-shland.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-4x4c8-minmax-neonfma-zip.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-5x4c8-minmax-neonfma-shland.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-5x4c8-minmax-neonfma-zip.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p16c-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p16c-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p16c-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p16c-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p16c-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p16c-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-neonfma.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p16c-minmax-neonfma-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p16c-minmax-neonfma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-neonfma-dup-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8s4-minmax-neonfma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-neonfma-dup-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-neonfma-dup-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8s4-minmax-neonfma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-neonfma-dup-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-neonfma-dup-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8s4-minmax-neonfma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-8x8s4-minmax-neonfma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-neonfma-dup-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8s4-minmax-neonfma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-neonfma-dup-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-neonfma-dup-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8s4-minmax-neonfma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-neonfma-dup-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-neonfma-dup-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8s4-minmax-neonfma.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-8x8s4-minmax-neonfma.c",
+    "XNNPACK/src/f32-ibilinear-chw/gen/f32-ibilinear-chw-neonfma-p4.c",
+    "XNNPACK/src/f32-ibilinear-chw/gen/f32-ibilinear-chw-neonfma-p8.c",
+    "XNNPACK/src/f32-ibilinear-chw/gen/f32-ibilinear-chw-neonfma-p16.c",
+    "XNNPACK/src/f32-ibilinear/gen/f32-ibilinear-neonfma-c4.c",
+    "XNNPACK/src/f32-ibilinear/gen/f32-ibilinear-neonfma-c8.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-neonfma-dup-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8s4-minmax-neonfma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-neonfma-dup-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-neonfma-dup-ld128.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8s4-minmax-neonfma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-neonfma-dup-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-neonfma-dup-ld128.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8s4-minmax-neonfma.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-8x8s4-minmax-neonfma.c",
+    "XNNPACK/src/f32-ppmm/gen/f32-ppmm-4x8-minmax-neonfma.c",
+    "XNNPACK/src/f32-ppmm/gen/f32-ppmm-8x8-minmax-neonfma.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x8-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x8.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x12-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x12-acc3.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x12.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x16-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x16-acc4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x16.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x20-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x20-acc5.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x20.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x8-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x8.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x12-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x12-acc3.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x12.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x16-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x16-acc4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x16.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x20-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x20-acc5.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x20.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-neonfma-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-neonfma-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-neonfma.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-neonfma-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-neonfma-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-neonfma.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-12x1-minmax-neonfma.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-neonfma-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-neonfma-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-neonfma.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-neonfma-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-neonfma-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-neonfma.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neonfma-rr1-p6-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neonfma-rr1-p6-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neonfma-rr1-p6-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neonfma-rr1-p6-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neonfma-rr1-p6-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neonfma-rr1-p6-x24.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c4-minmax-neonfma-2x.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c8-minmax-neonfma-2x.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr1recps1fma-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr1recps1fma-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr1recps1fma-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr1recps1fma-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr1recps1fma-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr1recps1fma-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2fma-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2fma-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2fma-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2fma-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2fma-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2fma-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr1recps1fma-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr1recps1fma-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr1recps1fma-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr1recps1fma-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr1recps1fma-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr1recps1fma-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2fma-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2fma-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2fma-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2fma-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2fma-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2fma-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2recps-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2recps-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2recps-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2recps-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2recps-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2recps-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr1recps1fma-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr1recps1fma-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr1recps1fma-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr1recps1fma-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr1recps1fma-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr1recps1fma-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2fma-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2fma-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2fma-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2fma-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2fma-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2fma-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2recps-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2recps-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2recps-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2recps-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2recps-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2recps-x24.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x4.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x8.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x12.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x16.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x20.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x24.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x28.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x32.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x36.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x40.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x4.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x8.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x12.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x16.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x20.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x24.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x28.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x32.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x36.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x40.c",
+    "XNNPACK/src/math/exp-f32-neonfma-rr2-lut64-p2.c",
+    "XNNPACK/src/math/exp-f32-neonfma-rr2-p5.c",
+    "XNNPACK/src/math/expm1minus-f32-neonfma-rr1-lut16-p3.c",
+    "XNNPACK/src/math/expm1minus-f32-neonfma-rr1-p6.c",
+    "XNNPACK/src/math/expminus-f32-neonfma-rr2-lut64-p2.c",
+    "XNNPACK/src/math/expminus-f32-neonfma-rr2-lut2048-p1.c",
+    "XNNPACK/src/math/expminus-f32-neonfma-rr2-p5.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr1-lut64-p2-nr1recps1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr1-lut64-p2-nr2fma.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr1-lut64-p2-nr2recps.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr1-lut2048-p1-nr1recps1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr1-lut2048-p1-nr2fma.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr1-lut2048-p1-nr2recps.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr1-p5-nr1recps1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr1-p5-nr2fma.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr1-p5-nr2recps.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr2-lut64-p2-nr1recps1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr2-lut64-p2-nr2fma.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr2-lut64-p2-nr2recps.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr2-lut2048-p1-nr1recps1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr2-lut2048-p1-nr2fma.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr2-lut2048-p1-nr2recps.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr2-p5-nr1recps1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr2-p5-nr2fma.c",
+    "XNNPACK/src/math/sigmoid-f32-neonfma-rr2-p5-nr2recps.c",
+    "XNNPACK/src/math/sqrt-f32-neonfma-nr1fma.c",
+    "XNNPACK/src/math/sqrt-f32-neonfma-nr1rsqrts1fma1adj.c",
+    "XNNPACK/src/math/sqrt-f32-neonfma-nr2fma1adj.c",
+    "XNNPACK/src/math/sqrt-f32-neonfma-nr2fma.c",
+    "XNNPACK/src/math/sqrt-f32-neonfma-nr3fma.c",
+]
+
+ALL_SCALAR_MICROKERNEL_SRCS = [
+    "XNNPACK/src/cs16-bfly4/cs16-bfly4-samples1-scalar.c",
+    "XNNPACK/src/cs16-bfly4/cs16-bfly4-samples4-scalar.c",
+    "XNNPACK/src/cs16-bfly4/gen/cs16-bfly4-scalar-x1.c",
+    "XNNPACK/src/cs16-bfly4/gen/cs16-bfly4-scalar-x2.c",
+    "XNNPACK/src/cs16-bfly4/gen/cs16-bfly4-scalar-x4.c",
+    "XNNPACK/src/cs16-fftr/gen/cs16-fftr-scalar-x1.c",
+    "XNNPACK/src/cs16-fftr/gen/cs16-fftr-scalar-x2.c",
+    "XNNPACK/src/cs16-fftr/gen/cs16-fftr-scalar-x4.c",
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x1.c",
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x2.c",
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x3.c",
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x4.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-x1.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-x2.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-x3.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-x4.c",
+    "XNNPACK/src/f32-argmaxpool/f32-argmaxpool-4x-scalar-c1.c",
+    "XNNPACK/src/f32-argmaxpool/f32-argmaxpool-9p8x-scalar-c1.c",
+    "XNNPACK/src/f32-argmaxpool/f32-argmaxpool-9x-scalar-c1.c",
+    "XNNPACK/src/f32-avgpool/f32-avgpool-9p8x-minmax-scalar-c1.c",
+    "XNNPACK/src/f32-avgpool/f32-avgpool-9x-minmax-scalar-c1.c",
+    "XNNPACK/src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-scalar-1x1.c",
+    "XNNPACK/src/f32-conv-hwc/f32-conv-hwc-3x3s2p0p1c3x4-scalar-1x1.c",
+    "XNNPACK/src/f32-conv-hwc/f32-conv-hwc-3x3s2p1c3x4-scalar-1x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-1x1-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-1x1-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-1x1-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-1x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-2x1-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-2x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-3x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-4x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-5x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-6x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-1x1-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-1x1-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-1x1-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-1x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-2x1-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-2x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-3x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-4x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-1x1-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-1x1-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-1x1-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-1x1-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-1x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-2x1-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-2x1-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-2x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-3x1-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-3x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-1x1-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-1x1-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-1x1-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-1x1-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-1x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-2x1-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-2x1-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-2x1.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-3x1-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-3x1.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p1c-minmax-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p1c-minmax-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p1c-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p1c-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p2c-minmax-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p2c-minmax-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p2c-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p2c-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p1c-minmax-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p1c-minmax-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p1c-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p1c-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p2c-minmax-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p2c-minmax-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p2c-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p2c-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p1c-minmax-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p1c-minmax-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p1c-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p1c-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p2c-minmax-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p2c-minmax-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p2c-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p2c-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p1c-minmax-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p1c-minmax-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p1c-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p1c-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p2c-minmax-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p2c-minmax-scalar.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p2c-scalar-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p2c-scalar.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-bitcast-x1.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-bitcast-x2.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-bitcast-x3.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-bitcast-x4.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-fabsf-x1.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-fabsf-x2.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-fabsf-x3.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-fabsf-x4.c",
+    "XNNPACK/src/f32-gavgpool-cw/f32-gavgpool-cw-scalar-x1.c",
+    "XNNPACK/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-scalar-c1.c",
+    "XNNPACK/src/f32-gavgpool/f32-gavgpool-7x-minmax-scalar-c1.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x4-minmax-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x4-relu-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x4-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-2x4-minmax-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-2x4-relu-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-2x4-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-minmax-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-relu-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x4-minmax-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x4-relu-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x4-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x4-minmax-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-2x4-minmax-scalar.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x4-minmax-scalar.c",
+    "XNNPACK/src/f32-ibilinear-chw/gen/f32-ibilinear-chw-scalar-p1.c",
+    "XNNPACK/src/f32-ibilinear-chw/gen/f32-ibilinear-chw-scalar-p2.c",
+    "XNNPACK/src/f32-ibilinear-chw/gen/f32-ibilinear-chw-scalar-p4.c",
+    "XNNPACK/src/f32-ibilinear/gen/f32-ibilinear-scalar-c1.c",
+    "XNNPACK/src/f32-ibilinear/gen/f32-ibilinear-scalar-c2.c",
+    "XNNPACK/src/f32-ibilinear/gen/f32-ibilinear-scalar-c4.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x4-minmax-scalar.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x4-relu-scalar.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x4-scalar.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-2x4-minmax-scalar.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-2x4-relu-scalar.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-2x4-scalar.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2-minmax-scalar.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2-relu-scalar.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2-scalar.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x4-minmax-scalar.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x4-relu-scalar.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x4-scalar.c",
+    "XNNPACK/src/f32-maxpool/f32-maxpool-9p8x-minmax-scalar-c1.c",
+    "XNNPACK/src/f32-pavgpool/f32-pavgpool-9p8x-minmax-scalar-c1.c",
+    "XNNPACK/src/f32-pavgpool/f32-pavgpool-9x-minmax-scalar-c1.c",
+    "XNNPACK/src/f32-ppmm/gen/f32-ppmm-2x4-minmax-scalar.c",
+    "XNNPACK/src/f32-ppmm/gen/f32-ppmm-3x3-minmax-scalar.c",
+    "XNNPACK/src/f32-ppmm/gen/f32-ppmm-4x2-minmax-scalar.c",
+    "XNNPACK/src/f32-ppmm/gen/f32-ppmm-4x4-minmax-scalar.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-scalar-2x1.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-scalar-2x4.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-x1.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-x2.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-x3.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-x4.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-x1.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-x2.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-x3.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-x4.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-x1.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-x2.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-x3.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-x4.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-x1.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-x2.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-x3.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-x4.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-x1.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-x2.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-x3.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-x4.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-x1.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-x2.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-x3.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-x4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-x1.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-x2-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-x2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-x4-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-x4-acc4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-x4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-x1.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-x2-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-x2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-x4-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-x4-acc4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-x4.c",
+    "XNNPACK/src/f32-rmax/f32-rmax-scalar.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-1x1-minmax-scalar-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-1x1-minmax-scalar.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-2x1-minmax-scalar-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-2x1-minmax-scalar.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-scalar-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-scalar.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-scalar-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-scalar.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x2-minmax-scalar.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x4-minmax-scalar.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-relu-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-relu-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-relu-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-relu-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-relu-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-relu-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-relu-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-relu-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-relu-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-relu-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-relu-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-relu-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-relu-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-relu-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-relu-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-relu-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-relu-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-relu-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-relu-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-relu-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-relu-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-relu-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-relu-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-relu-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-relu-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-relu-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-relu-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-relu-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-relu-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-relu-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-relu-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-relu-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-relu-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-relu-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-relu-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-relu-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-relu-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-relu-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-relu-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-relu-scalar-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-scalar-x1.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-scalar-x2.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-scalar-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-scalar-x8.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-scalar-x1.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-scalar-x2.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-scalar-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-scalar-rr2-lut16-p3-x1.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-scalar-rr2-lut16-p3-x2.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-scalar-rr2-lut16-p3-x3.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-scalar-rr2-lut16-p3-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-scalar-rr2-lut16-p3-x5.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-scalar-rr2-lut16-p3-x6.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-scalar-rr2-p6-x1.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-scalar-rr2-p6-x2.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-scalar-rr2-p6-x3.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-scalar-rr2-p6-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-scalar-rr2-p6-x5.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-scalar-rr2-p6-x6.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-scalar-x1.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-scalar-x2.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-scalar-x4.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-scalar-x1.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-scalar-x2.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-scalar-x4.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c1-minmax-scalar-2x.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c2-minmax-scalar-2x.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c4-minmax-scalar-2x.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-scalar-x1.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-scalar-x2.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-scalar-x4.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-scalar-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-scalar-libm-x1.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-scalar-libm-x2.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-scalar-libm-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-scalar-libm-x1.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-scalar-libm-x2.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-scalar-libm-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-scalar-libm-x1.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-scalar-libm-x2.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-scalar-libm-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-scalar-libm-x1.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-scalar-libm-x2.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-scalar-libm-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-lut64-p2-div-x1.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-lut64-p2-div-x2.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-lut64-p2-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-lut2048-p1-div-x1.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-lut2048-p1-div-x2.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-lut2048-p1-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-p5-div-x1.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-p5-div-x2.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-p5-div-x4.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-scalar-sqrt-x1.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-scalar-sqrt-x2.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-scalar-sqrt-x4.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vabs-scalar-x1.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vabs-scalar-x2.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vabs-scalar-x4.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vneg-scalar-x1.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vneg-scalar-x2.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vneg-scalar-x4.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vsqr-scalar-x1.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vsqr-scalar-x2.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vsqr-scalar-x4.c",
+    "XNNPACK/src/i16-vlshift/gen/i16-vlshift-scalar-x1.c",
+    "XNNPACK/src/i16-vlshift/gen/i16-vlshift-scalar-x2.c",
+    "XNNPACK/src/i16-vlshift/gen/i16-vlshift-scalar-x3.c",
+    "XNNPACK/src/i16-vlshift/gen/i16-vlshift-scalar-x4.c",
+    "XNNPACK/src/math/cvt-f32-f16-scalar-bitcast.c",
+    "XNNPACK/src/math/cvt-f32-f16-scalar-fabsf.c",
+    "XNNPACK/src/math/expm1minus-f32-scalar-rr2-lut4-p4.c",
+    "XNNPACK/src/math/expm1minus-f32-scalar-rr2-lut8-p3.c",
+    "XNNPACK/src/math/expm1minus-f32-scalar-rr2-lut8-p4.c",
+    "XNNPACK/src/math/expm1minus-f32-scalar-rr2-lut16-p3.c",
+    "XNNPACK/src/math/expm1minus-f32-scalar-rr2-lut16-p4.c",
+    "XNNPACK/src/math/expm1minus-f32-scalar-rr2-p5.c",
+    "XNNPACK/src/math/expm1minus-f32-scalar-rr2-p6.c",
+    "XNNPACK/src/math/expminus-f32-scalar-rr2-lut64-p2.c",
+    "XNNPACK/src/math/expminus-f32-scalar-rr2-lut2048-p1.c",
+    "XNNPACK/src/math/expminus-f32-scalar-rr2-p5.c",
+    "XNNPACK/src/math/roundd-scalar-addsub.c",
+    "XNNPACK/src/math/roundd-scalar-cvt.c",
+    "XNNPACK/src/math/roundd-scalar-floor.c",
+    "XNNPACK/src/math/roundne-scalar-addsub.c",
+    "XNNPACK/src/math/roundne-scalar-nearbyint.c",
+    "XNNPACK/src/math/roundne-scalar-rint.c",
+    "XNNPACK/src/math/roundu-scalar-addsub.c",
+    "XNNPACK/src/math/roundu-scalar-ceil.c",
+    "XNNPACK/src/math/roundu-scalar-cvt.c",
+    "XNNPACK/src/math/roundz-scalar-addsub.c",
+    "XNNPACK/src/math/roundz-scalar-cvt.c",
+    "XNNPACK/src/math/roundz-scalar-trunc.c",
+    "XNNPACK/src/math/sigmoid-f32-scalar-rr2-lut64-p2-div.c",
+    "XNNPACK/src/math/sigmoid-f32-scalar-rr2-lut2048-p1-div.c",
+    "XNNPACK/src/math/sigmoid-f32-scalar-rr2-p5-div.c",
+    "XNNPACK/src/math/sqrt-u32-scalar-bitmanip.c",
+    "XNNPACK/src/math/sqrt-u32-scalar-clz-binsearch.c",
+    "XNNPACK/src/math/sqrt-u32-scalar-clz-newton.c",
+    "XNNPACK/src/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c",
+    "XNNPACK/src/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c",
+    "XNNPACK/src/math/sqrt-u32-scalar-cvti64-sqrtf-lrintf.c",
+    "XNNPACK/src/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c",
+    "XNNPACK/src/math/sqrt-u32-scalar-cvtu32-sqrtf-lrintf.c",
+    "XNNPACK/src/math/sqrt-u32-scalar-hashemian.c",
+    "XNNPACK/src/math/sqrt-u32-scalar-tflm.c",
+    "XNNPACK/src/math/sqrt-u64-scalar-cvtu32-sqrt-cvtsatu32f64.c",
+    "XNNPACK/src/math/sqrt-u64-scalar-cvtu32-sqrt-llrint.c",
+    "XNNPACK/src/math/sqrt-u64-scalar-cvtu64-sqrt-llrint.c",
+    "XNNPACK/src/math/tanh-f32-scalar-rr1-p6-div.c",
+    "XNNPACK/src/math/tanh-f32-scalar-rr2-p6-div.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p1c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p2c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p2c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p1c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p2c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p4c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p1c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p2c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p4c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-x1.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-x2.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-x3.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-x4.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-gemmlowp-scalar.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-rndna-scalar-signed64.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-rndnu-scalar.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-scalar-x1.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-scalar-x2.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-scalar-x4.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-scalar-x1.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-scalar-x2.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-scalar-x4.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-scalar-x1.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-scalar-x2.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-scalar-x4.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-x1.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-x2.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-x4.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-x1.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-x2.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-x4.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-x1.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-x2.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-x4.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-x1.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-x2.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-x4.c",
+    "XNNPACK/src/qu8-avgpool/qu8-avgpool-9p8x-minmax-fp32-scalar-imagic-c1.c",
+    "XNNPACK/src/qu8-avgpool/qu8-avgpool-9x-minmax-fp32-scalar-imagic-c1.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-x1.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-x2.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-x3.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-x4.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x2-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x2-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x2-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x2-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4-minmax-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4-minmax-fp32-scalar-imagic.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4-minmax-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4-minmax-rndnu-scalar.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-fp32-scalar-fmagic.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-fp32-scalar-lrintf.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-gemmlowp-scalar.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-rndna-scalar-signed64.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned32.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-rndna-scalar-unsigned64.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-scalar-x1.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-scalar-x2.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-scalar-x4.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-scalar-x1.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-scalar-x2.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-scalar-x4.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-scalar-x1.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-scalar-x2.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-scalar-x4.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-x1.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-x2.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-x4.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-x1.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-x2.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-x4.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-x1.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-x2.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-x4.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-x1.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-x2.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-x4.c",
+    "XNNPACK/src/s8-ibilinear/gen/s8-ibilinear-scalar-c1.c",
+    "XNNPACK/src/s8-ibilinear/gen/s8-ibilinear-scalar-c2.c",
+    "XNNPACK/src/s8-ibilinear/gen/s8-ibilinear-scalar-c4.c",
+    "XNNPACK/src/s8-maxpool/s8-maxpool-9p8x-minmax-scalar-c1.c",
+    "XNNPACK/src/s8-vclamp/s8-vclamp-scalar-x4.c",
+    "XNNPACK/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x1.c",
+    "XNNPACK/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x2.c",
+    "XNNPACK/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x3.c",
+    "XNNPACK/src/s16-rmaxabs/gen/s16-rmaxabs-scalar-x4.c",
+    "XNNPACK/src/s16-window/gen/s16-window-scalar-x1.c",
+    "XNNPACK/src/s16-window/gen/s16-window-scalar-x2.c",
+    "XNNPACK/src/s16-window/gen/s16-window-scalar-x3.c",
+    "XNNPACK/src/s16-window/gen/s16-window-scalar-x4.c",
+    "XNNPACK/src/u8-ibilinear/gen/u8-ibilinear-scalar-c1.c",
+    "XNNPACK/src/u8-ibilinear/gen/u8-ibilinear-scalar-c2.c",
+    "XNNPACK/src/u8-ibilinear/gen/u8-ibilinear-scalar-c4.c",
+    "XNNPACK/src/u8-lut32norm/u8-lut32norm-scalar.c",
+    "XNNPACK/src/u8-maxpool/u8-maxpool-9p8x-minmax-scalar-c1.c",
+    "XNNPACK/src/u8-rmax/u8-rmax-scalar.c",
+    "XNNPACK/src/u8-vclamp/u8-vclamp-scalar-x4.c",
+    "XNNPACK/src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-scalar-x1.c",
+    "XNNPACK/src/u32-filterbank-subtract/u32-filterbank-subtract-scalar-x2.c",
+    "XNNPACK/src/u32-vlog/gen/u32-vlog-scalar-x1.c",
+    "XNNPACK/src/u32-vlog/gen/u32-vlog-scalar-x2.c",
+    "XNNPACK/src/u32-vlog/gen/u32-vlog-scalar-x3.c",
+    "XNNPACK/src/u32-vlog/gen/u32-vlog-scalar-x4.c",
+    "XNNPACK/src/u64-u32-vsqrtshift/u64-u32-vsqrtshift-scalar-cvtu32-sqrt-cvtu32f64-x1.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-scalar-x1.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-scalar-x2.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-scalar-x4.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-scalar-x8.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-scalar-x16.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-1x2-scalar-int.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-1x4-scalar-int.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-2x1-scalar-int.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-2x2-scalar-int.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-2x4-scalar-int.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-4x1-scalar-int.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-4x2-scalar-int.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-4x4-scalar-int.c",
+    "XNNPACK/src/x8-zip/x8-zip-x2-scalar.c",
+    "XNNPACK/src/x8-zip/x8-zip-x3-scalar.c",
+    "XNNPACK/src/x8-zip/x8-zip-x4-scalar.c",
+    "XNNPACK/src/x8-zip/x8-zip-xm-scalar.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-1x2-scalar-int.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-1x4-scalar-int.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-2x1-scalar-int.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-2x2-scalar-int.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-2x4-scalar-int.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-4x1-scalar-int.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-4x2-scalar-int.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-4x4-scalar-int.c",
+    "XNNPACK/src/x24-transposec/gen/x24-transposec-1x2-scalar.c",
+    "XNNPACK/src/x24-transposec/gen/x24-transposec-1x4-scalar.c",
+    "XNNPACK/src/x24-transposec/gen/x24-transposec-2x1-scalar.c",
+    "XNNPACK/src/x24-transposec/gen/x24-transposec-2x2-scalar.c",
+    "XNNPACK/src/x24-transposec/gen/x24-transposec-2x4-scalar.c",
+    "XNNPACK/src/x24-transposec/gen/x24-transposec-4x1-scalar.c",
+    "XNNPACK/src/x24-transposec/gen/x24-transposec-4x2-scalar.c",
+    "XNNPACK/src/x24-transposec/gen/x24-transposec-4x4-scalar.c",
+    "XNNPACK/src/x32-packx/x32-packx-x2-scalar.c",
+    "XNNPACK/src/x32-packx/x32-packx-x3-scalar.c",
+    "XNNPACK/src/x32-packx/x32-packx-x4-scalar.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-1x2-scalar-float.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-1x2-scalar-int.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-1x4-scalar-float.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-1x4-scalar-int.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x1-scalar-float.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x1-scalar-int.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x2-scalar-float.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x2-scalar-int.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x4-scalar-float.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x4-scalar-int.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x1-scalar-float.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x1-scalar-int.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x2-scalar-float.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x2-scalar-int.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-scalar-float.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-scalar-int.c",
+    "XNNPACK/src/x32-unpool/x32-unpool-scalar.c",
+    "XNNPACK/src/x32-zip/x32-zip-x2-scalar.c",
+    "XNNPACK/src/x32-zip/x32-zip-x3-scalar.c",
+    "XNNPACK/src/x32-zip/x32-zip-x4-scalar.c",
+    "XNNPACK/src/x32-zip/x32-zip-xm-scalar.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-1x2-scalar-float.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-1x2-scalar-int.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x1-scalar-float.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x1-scalar-int.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-scalar-float.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-scalar-int.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-4x1-scalar-float.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-4x1-scalar-int.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-4x2-scalar-float.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-4x2-scalar-int.c",
+    "XNNPACK/src/xx-copy/xx-copy-scalar-memcpy.c",
+    "XNNPACK/src/xx-fill/xx-fill-scalar-x16.c",
+    "XNNPACK/src/xx-pad/xx-pad-scalar.c",
+    "XNNPACK/src/xx-transpose/xx-transpose-1x1-scalar-memcpy.c",
+]
+
+ALL_NEONBF16_MICROKERNEL_SRCS = [
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-1x4c8-minmax-neonbf16-bfdot.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-1x4c8-minmax-neonbf16-bfmlal.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-1x8c2-minmax-neonbf16-bfdot-lane-ld128.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-2x4c8-minmax-neonbf16-bfdot.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-2x4c8-minmax-neonbf16-bfmlal.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-3x4c8-minmax-neonbf16-bfdot.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-3x4c8-minmax-neonbf16-bfmlal.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-4x4c8-minmax-neonbf16-bfdot.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-4x4c8-minmax-neonbf16-bfmlal.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-4x8c2-minmax-neonbf16-bfdot-lane-ld128.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-5x4c8-minmax-neonbf16-bfdot.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-5x4c8-minmax-neonbf16-bfmlal.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-5x8c2-minmax-neonbf16-bfdot-lane-ld128.c",
+    "XNNPACK/src/bf16-gemm/gen/bf16-gemm-6x8c2-minmax-neonbf16-bfdot-lane-ld128.c",
+]
+
+ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x8.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x16.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x24.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x32.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x40.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x48.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x56.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x64.c",
+    "XNNPACK/src/f16-vsqrt/gen/f16-vsqrt-aarch64-neonfp16arith-sqrt-x8.c",
+    "XNNPACK/src/f16-vsqrt/gen/f16-vsqrt-aarch64-neonfp16arith-sqrt-x16.c",
+    "XNNPACK/src/math/sigmoid-f16-aarch64-neonfp16arith-rr1-p2-div.c",
+    "XNNPACK/src/math/sigmoid-f16-aarch64-neonfp16arith-rr1-p3-div.c",
+    "XNNPACK/src/math/sigmoid-f16-aarch64-neonfp16arith-rr2-p2-div.c",
+    "XNNPACK/src/math/sigmoid-f16-aarch64-neonfp16arith-rr2-p3-div.c",
+    "XNNPACK/src/math/sqrt-f16-aarch64-neonfp16arith-sqrt.c",
+]
+
+PROD_FMA3_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/fma3.c",
+]
+
+XNNPACK_SRCS = [
+    "XNNPACK/src/binary-elementwise-config.c",
+    "XNNPACK/src/init.c",
+    "XNNPACK/src/params.c",
+    "XNNPACK/src/transpose-config.c",
+    "XNNPACK/src/x8-lut-config.c",
+]
+
+ALL_FP16ARITH_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vadd-minmax-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmax-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmax-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmax-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmaxc-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmaxc-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmaxc-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmin-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmin-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmin-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vminc-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vminc-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vminc-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmul-minmax-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiff-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsub-minmax-fp16arith-x4.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-x1.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-x2.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-x4.c",
+    "XNNPACK/src/f16-vsqrt/gen/f16-vsqrt-fp16arith-sqrt-x1.c",
+    "XNNPACK/src/f16-vsqrt/gen/f16-vsqrt-fp16arith-sqrt-x2.c",
+    "XNNPACK/src/f16-vsqrt/gen/f16-vsqrt-fp16arith-sqrt-x4.c",
+]
+
+ALL_NEONDOT_MICROKERNEL_SRCS = [
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-6x8c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-6x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-8x8c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-8x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-6x8c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-6x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-8x8c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-8x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-6x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-6x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-8x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-8x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-6x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-6x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-8x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-8x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x32c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x32c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x32c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-5x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-5x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-6x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-6x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-8x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-8x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x32c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x32c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x32c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-5x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-5x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-6x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-6x16c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-8x8c4-minmax-rndnu-neondot.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-8x16c4-minmax-rndnu-neondot.c",
+]
+
+PROD_NEON_AARCH64_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/neon-aarch64.c",
+]
+
+JIT_AARCH32_SRCS = [
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-aarch32-neon-cortex-a7.cc",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-aarch32-neon-cortex-a53.cc",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-aarch32-neon-cortex-a55.cc",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-aarch32-neon-cortex-a75.cc",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-aarch32-neon-ld64.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-aarch32-neon-cortex-a7.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-aarch32-neon-cortex-a53.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-aarch32-neon-cortex-a55.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-aarch32-neon-cortex-a75.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-aarch32-neon-ld64.cc",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-fp32-aarch32-neonv8-mlal-lane-ld64.cc",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8c4-fp32-aarch32-neondot-ld64.cc",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-fp32-aarch32-neonv8-mlal-lane-ld64.cc",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8c4-fp32-aarch32-neondot-ld64.cc",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-rndnu-aarch32-neon-mlal-lane-ld64.cc",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4-rndnu-aarch32-neondot-ld64.cc",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-rndnu-aarch32-neon-mlal-lane-ld64.cc",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4-rndnu-aarch32-neondot-ld64.cc",
+]
+
+PROD_NEON_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/neon.c",
+]
+
+ALL_NEON_MICROKERNEL_SRCS = [
+    "XNNPACK/src/cs16-bfly4/cs16-bfly4-neon-x1.c",
+    "XNNPACK/src/cs16-bfly4/cs16-bfly4-neon-x4.c",
+    "XNNPACK/src/cs16-bfly4/cs16-bfly4-samples1-neon.c",
+    "XNNPACK/src/cs16-bfly4/cs16-bfly4-samples4-neon.c",
+    "XNNPACK/src/cs16-fftr/cs16-fftr-neon-x4.c",
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x4.c",
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x8.c",
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x12.c",
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-x24.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-x32.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-x24.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-x32.c",
+    "XNNPACK/src/f32-argmaxpool/f32-argmaxpool-4x-neon-c4.c",
+    "XNNPACK/src/f32-argmaxpool/f32-argmaxpool-9p8x-neon-c4.c",
+    "XNNPACK/src/f32-argmaxpool/f32-argmaxpool-9x-neon-c4.c",
+    "XNNPACK/src/f32-avgpool/f32-avgpool-9p8x-minmax-neon-c4.c",
+    "XNNPACK/src/f32-avgpool/f32-avgpool-9x-minmax-neon-c4.c",
+    "XNNPACK/src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-neon-2x2.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x4-neon-2x1.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x4-neon-2x2.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x8-neon-2x1.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x8-neon-2x2.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x4-neon-2x1.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x4-neon-2x2.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x8-neon-2x1.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x8-neon-2x2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-6x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-4x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-3x4.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p16c-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p16c-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p16c-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p16c-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p16c-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p16c-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-neon.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p16c-minmax-neon-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p16c-minmax-neon.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-neon-x8.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-neon-x16.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-neon-x24.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-neon-x32.c",
+    "XNNPACK/src/f32-gavgpool-cw/f32-gavgpool-cw-neon-x4.c",
+    "XNNPACK/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-neon-c4.c",
+    "XNNPACK/src/f32-gavgpool/f32-gavgpool-7x-minmax-neon-c4.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-neon-dup-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8s4-minmax-neon.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-neon-dup-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-neon-dup-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-neon-lane-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8s4-minmax-neon.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x2-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-neon-dup-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-neon-dup-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-neon-lane-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8s4-minmax-neon.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-8x8s4-minmax-neon.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-neon-dup-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8s4-minmax-neon.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-neon-dup-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-neon-dup-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-neon-lane-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8s4-minmax-neon.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-neon-dup-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-neon-dup-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-neon-lane-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8s4-minmax-neon.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-8x8s4-minmax-neon.c",
+    "XNNPACK/src/f32-ibilinear-chw/gen/f32-ibilinear-chw-neon-p4.c",
+    "XNNPACK/src/f32-ibilinear-chw/gen/f32-ibilinear-chw-neon-p8.c",
+    "XNNPACK/src/f32-ibilinear-chw/gen/f32-ibilinear-chw-neon-p16.c",
+    "XNNPACK/src/f32-ibilinear/gen/f32-ibilinear-neon-c4.c",
+    "XNNPACK/src/f32-ibilinear/gen/f32-ibilinear-neon-c8.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-neon-dup-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8s4-minmax-neon.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x4-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-neon-dup-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-neon-dup-ld128.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-neon-lane-ld128.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8s4-minmax-neon.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x2-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-neon-dup-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-neon-dup-ld128.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-neon-lane-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-neon-lane-ld128.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8s4-minmax-neon.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-8x8s4-minmax-neon.c",
+    "XNNPACK/src/f32-maxpool/f32-maxpool-9p8x-minmax-neon-c4.c",
+    "XNNPACK/src/f32-pavgpool/f32-pavgpool-9p8x-minmax-neon-c4.c",
+    "XNNPACK/src/f32-pavgpool/f32-pavgpool-9x-minmax-neon-c4.c",
+    "XNNPACK/src/f32-ppmm/gen/f32-ppmm-4x8-minmax-neon.c",
+    "XNNPACK/src/f32-ppmm/gen/f32-ppmm-8x8-minmax-neon.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-neon-1x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-neon-1x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-neon-1x16.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-neon-2x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-neon-2x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-neon-2x16.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-neon-4x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-neon-4x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-neon-4x16.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-x8.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-x16.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-x24.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-x32.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-x8.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-x16.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-x24.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-x32.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x8-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x8.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x12-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x12-acc3.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x12.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x16-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x16-acc4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x16.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x20-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x20-acc5.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x20.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x8-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x8.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x12-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x12-acc3.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x12.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x16-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x16-acc4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x16.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x20-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x20-acc5.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x20.c",
+    "XNNPACK/src/f32-rmax/f32-rmax-neon.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-neon-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-neon-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-neon.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-neon-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-neon-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-neon.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-12x1-minmax-neon.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-neon-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-neon-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-neon.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-neon-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-neon-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-neon.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-neon-x8.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-neon-x4.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-neon-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neon-rr2-lut16-p3-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neon-rr2-lut16-p3-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neon-rr2-lut16-p3-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neon-rr2-lut16-p3-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neon-rr2-lut16-p3-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neon-rr2-lut16-p3-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neon-rr2-p6-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neon-rr2-p6-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neon-rr2-p6-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neon-rr2-p6-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neon-rr2-p6-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-neon-rr2-p6-x24.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-neon-x4.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-neon-x8.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-neon-x16.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-neon-x4.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-neon-x8.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c4-minmax-neon-2x.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c8-minmax-neon-2x.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-neon-x4.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-neon-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-neon-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-neon-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-neon-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-neon-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-neon-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-neon-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-neon-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-neon-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut64-p2-nr2recps-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut64-p2-nr2recps-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut64-p2-nr2recps-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut64-p2-nr2recps-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut64-p2-nr2recps-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut64-p2-nr2recps-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut2048-p1-nr2recps-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut2048-p1-nr2recps-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut2048-p1-nr2recps-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut2048-p1-nr2recps-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut2048-p1-nr2recps-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut2048-p1-nr2recps-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-x24.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vabs-neon-x4.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vabs-neon-x8.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vneg-neon-x4.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vneg-neon-x8.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vsqr-neon-x4.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vsqr-neon-x8.c",
+    "XNNPACK/src/i16-vlshift/gen/i16-vlshift-neon-x8.c",
+    "XNNPACK/src/i16-vlshift/gen/i16-vlshift-neon-x16.c",
+    "XNNPACK/src/i16-vlshift/gen/i16-vlshift-neon-x24.c",
+    "XNNPACK/src/i16-vlshift/gen/i16-vlshift-neon-x32.c",
+    "XNNPACK/src/math/cvt-f16-f32-neon-int16.c",
+    "XNNPACK/src/math/cvt-f16-f32-neon-int32.c",
+    "XNNPACK/src/math/cvt-f32-f16-neon.c",
+    "XNNPACK/src/math/cvt-f32-qs8-neon.c",
+    "XNNPACK/src/math/cvt-f32-qu8-neon.c",
+    "XNNPACK/src/math/expm1minus-f32-neon-rr2-lut16-p3.c",
+    "XNNPACK/src/math/expm1minus-f32-neon-rr2-p6.c",
+    "XNNPACK/src/math/roundd-neon-addsub.c",
+    "XNNPACK/src/math/roundd-neon-cvt.c",
+    "XNNPACK/src/math/roundne-neon-addsub.c",
+    "XNNPACK/src/math/roundu-neon-addsub.c",
+    "XNNPACK/src/math/roundu-neon-cvt.c",
+    "XNNPACK/src/math/roundz-neon-addsub.c",
+    "XNNPACK/src/math/roundz-neon-cvt.c",
+    "XNNPACK/src/math/sigmoid-f32-neon-rr2-lut64-p2-nr2recps.c",
+    "XNNPACK/src/math/sigmoid-f32-neon-rr2-lut2048-p1-nr2recps.c",
+    "XNNPACK/src/math/sigmoid-f32-neon-rr2-p5-nr2recps.c",
+    "XNNPACK/src/math/sqrt-f32-neon-nr1rsqrts.c",
+    "XNNPACK/src/math/sqrt-f32-neon-nr2rsqrts.c",
+    "XNNPACK/src/math/sqrt-f32-neon-nr3rsqrts.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p8c-minmax-fp32-neon-mla8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-neon-mla8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-neon-mla8-ld128.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-neon-mla8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-neon-mul8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neon-mla8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neon-mla8-ld128.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neon-mul8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neon-mul8-ld128.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-neon-mla8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-neon-mul8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neon-mla8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neon-mla8-ld128.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neon-mul8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neon-mul8-ld128.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c2s4-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c4s2-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c2s4-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c4-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c4s2-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-6x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-6x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-6x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-6x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c2s4-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c4s2-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c2s4-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c4-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c4s2-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-6x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-6x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-6x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-6x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-rndnu-neon-mla8-ld64.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-rndnu-neon-mul8-ld64.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-rndnu-neon-mla8-ld64.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-rndnu-neon-mla8-ld128.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-rndnu-neon-mul8-ld64.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-rndnu-neon-mul8-ld128.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-rndnu-neon-mla8-ld64.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-rndnu-neon-mul8-ld64.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-rndnu-neon-mla8-ld64.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-rndnu-neon-mla8-ld128.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-rndnu-neon-mul8-ld64.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-rndnu-neon-mul8-ld128.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-x8.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-x16.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-x24.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-x32.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c8.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c16.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c24.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c32.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c8.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c16.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c24.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c32.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c8.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c16.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c24.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c32.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c8.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c16.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c24.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c32.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2s4-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4s2-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2s4-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4s2-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x8c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-6x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-6x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-6x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-6x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2s4-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4s2-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2s4-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-fp32-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4s2-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x8c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c2s4-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c2s4-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neon-mull-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4s2-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c4s2-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c8-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c8-minmax-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c16-minmax-rndnu-neon-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-6x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-6x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-6x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-6x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-fp32-neon.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-gemmlowp-neon.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-rndna-neon.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-rndnu-neon-mull.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-rndnu-neon-qdmulh.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-x8.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-x16.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-x24.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-x32.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-neon-ld128-x16.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-neon-ld128-x32.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-neon-ld64-x8.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-neon-ld64-x16.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-neon-ld64-x24.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-neon-ld64-x32.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-neon-ld128-x16.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-neon-ld128-x32.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-neon-x8.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-neon-x16.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-neon-x32.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-neon-x8.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-neon-x16.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-neon-x32.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-neon-ld64-x8.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-neon-ld64-x16.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-neon-ld128-x16.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-rndnu-neon-ld64-x8.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-rndnu-neon-ld64-x16.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-rndnu-neon-ld128-x16.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-neon-ld64-x8.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-neon-ld64-x16.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-neon-ld128-x16.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-rndnu-neon-ld64-x8.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-rndnu-neon-ld64-x16.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-rndnu-neon-ld128-x16.c",
+    "XNNPACK/src/qu8-avgpool/qu8-avgpool-9p8x-minmax-fp32-neon-c8.c",
+    "XNNPACK/src/qu8-avgpool/qu8-avgpool-9x-minmax-fp32-neon-c8.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-rndnu-neon-mul8.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-rndnu-neon-mul8.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p24c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p24c-minmax-rndnu-neon-mul8.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p24c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p32c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p32c-minmax-rndnu-neon-mul8.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p32c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-rndnu-neon-mul8.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-rndnu-neon-mul8.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p24c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p24c-minmax-rndnu-neon-mul8.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p24c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-fp32-neon-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-rndnu-neon-mul8.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-rndnu-neon-mul16.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-x8.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-x16.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-x24.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-x32.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c8.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c16.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c24.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c32.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c8.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c16.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c24.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c32.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c8.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c16.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c24.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c32.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c8.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c16.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c24.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c32.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-6x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-6x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-fp32-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-6x8-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-6x16-minmax-rndnu-neon-mlal-lane.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-fp32-neon.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-gemmlowp-neon.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-rndna-neon.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-neon-ld64-x8.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-neon-ld64-x16.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-neon-ld64-x32.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-neon-ld128-x16.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-neon-ld64-x8.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-neon-ld64-x16.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-neon-ld64-x32.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-neon-ld128-x16.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-neon-x8.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-neon-x16.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-neon-x32.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-neon-x8.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-neon-x16.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-neon-x32.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-neon-ld64-x8.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-neon-ld64-x16.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-neon-ld128-x16.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-rndnu-neon-ld64-x8.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-rndnu-neon-ld64-x16.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-rndnu-neon-ld128-x16.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-neon-ld64-x8.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-neon-ld64-x16.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-neon-ld128-x16.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-rndnu-neon-ld64-x8.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-rndnu-neon-ld64-x16.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-rndnu-neon-ld128-x16.c",
+    "XNNPACK/src/s8-ibilinear/gen/s8-ibilinear-neon-c8.c",
+    "XNNPACK/src/s8-ibilinear/gen/s8-ibilinear-neon-c16.c",
+    "XNNPACK/src/s8-maxpool/s8-maxpool-2p2x-minmax-neon-c16.c",
+    "XNNPACK/src/s8-maxpool/s8-maxpool-4p3x-minmax-neon-c16.c",
+    "XNNPACK/src/s8-maxpool/s8-maxpool-9p8x-minmax-neon-c16.c",
+    "XNNPACK/src/s8-vclamp/s8-vclamp-neon-x64.c",
+    "XNNPACK/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x8.c",
+    "XNNPACK/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x16.c",
+    "XNNPACK/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x24.c",
+    "XNNPACK/src/s16-rmaxabs/gen/s16-rmaxabs-neon-x32.c",
+    "XNNPACK/src/s16-window/gen/s16-window-neon-x8.c",
+    "XNNPACK/src/s16-window/gen/s16-window-neon-x16.c",
+    "XNNPACK/src/s16-window/gen/s16-window-neon-x24.c",
+    "XNNPACK/src/s16-window/gen/s16-window-neon-x32.c",
+    "XNNPACK/src/s16-window/gen/s16-window-shift12-neon-x8.c",
+    "XNNPACK/src/s16-window/gen/s16-window-shift12-neon-x16.c",
+    "XNNPACK/src/s16-window/gen/s16-window-shift12-neon-x24.c",
+    "XNNPACK/src/s16-window/gen/s16-window-shift12-neon-x32.c",
+    "XNNPACK/src/s16-window/gen/s16-window-shift15-neon-x8.c",
+    "XNNPACK/src/s16-window/gen/s16-window-shift15-neon-x16.c",
+    "XNNPACK/src/s16-window/gen/s16-window-shift15-neon-x24.c",
+    "XNNPACK/src/s16-window/gen/s16-window-shift15-neon-x32.c",
+    "XNNPACK/src/u8-ibilinear/gen/u8-ibilinear-neon-c8.c",
+    "XNNPACK/src/u8-ibilinear/gen/u8-ibilinear-neon-c16.c",
+    "XNNPACK/src/u8-maxpool/u8-maxpool-9p8x-minmax-neon-c16.c",
+    "XNNPACK/src/u8-rmax/u8-rmax-neon.c",
+    "XNNPACK/src/u8-vclamp/u8-vclamp-neon-x64.c",
+    "XNNPACK/src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x1.c",
+    "XNNPACK/src/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x2.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-8x8-multi-dec-zip-neon.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-8x8-multi-mov-zip-neon.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-8x8-multi-switch-zip-neon.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-8x8-reuse-dec-zip-neon.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-8x8-reuse-mov-zip-neon.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-8x8-reuse-multi-zip-neon.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-8x8-reuse-switch-zip-neon.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-16x16-reuse-dec-zip-neon.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-16x16-reuse-mov-zip-neon.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-16x16-reuse-switch-zip-neon.c",
+    "XNNPACK/src/x8-zip/x8-zip-x2-neon.c",
+    "XNNPACK/src/x8-zip/x8-zip-x3-neon.c",
+    "XNNPACK/src/x8-zip/x8-zip-x4-neon.c",
+    "XNNPACK/src/x8-zip/x8-zip-xm-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-4x4-multi-dec-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-4x4-multi-mov-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-4x4-multi-multi-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-4x4-multi-switch-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-4x4-reuse-dec-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-4x4-reuse-mov-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-4x4-reuse-multi-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-4x4-reuse-switch-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-multi-dec-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-multi-mov-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-multi-switch-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-reuse-dec-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-reuse-mov-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-reuse-multi-zip-neon.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-reuse-switch-zip-neon.c",
+    "XNNPACK/src/x24-transposec/x24-transposec-2x2-neon-tbl64.c",
+    "XNNPACK/src/x32-packx/x32-packx-x4-neon-st4.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x2-multi-dec-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x2-multi-mov-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x2-multi-multi-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x2-multi-switch-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x2-reuse-dec-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x2-reuse-mov-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x2-reuse-multi-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-2x2-reuse-switch-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-multi-dec-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-multi-mov-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-multi-multi-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-multi-switch-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-reuse-dec-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-reuse-mov-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-reuse-multi-zip-neon.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-reuse-switch-zip-neon.c",
+    "XNNPACK/src/x32-unpool/x32-unpool-neon.c",
+    "XNNPACK/src/x32-zip/x32-zip-x2-neon.c",
+    "XNNPACK/src/x32-zip/x32-zip-x3-neon.c",
+    "XNNPACK/src/x32-zip/x32-zip-x4-neon.c",
+    "XNNPACK/src/x32-zip/x32-zip-xm-neon.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-multi-dec-zip-neon.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-multi-mov-zip-neon.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-multi-multi-zip-neon.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-multi-switch-zip-neon.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-reuse-dec-zip-neon.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-reuse-mov-zip-neon.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-reuse-multi-zip-neon.c",
+    "XNNPACK/src/x64-transposec/gen/x64-transposec-2x2-reuse-switch-zip-neon.c",
+    "XNNPACK/src/xx-fill/xx-fill-neon-x64.c",
+    "XNNPACK/src/xx-pad/xx-pad-neon.c",
+]
+
+ALL_NEONBF16_AARCH64_MICROKERNEL_SRCS = [
+]
+
+ALL_AVX2_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-1x8-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-1x16-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-3x16-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-4x8-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-4x16-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-5x8-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-5x16-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-6x8-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-7x8-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-1x8-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-1x16-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-3x16-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-4x8-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-4x16-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-5x8-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-5x16-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-6x8-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-7x8-minmax-avx2-broadcast.c",
+    "XNNPACK/src/f16-pavgpool/f16-pavgpool-9p8x-minmax-avx2-c8.c",
+    "XNNPACK/src/f16-pavgpool/f16-pavgpool-9x-minmax-avx2-c8.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x32-acc2.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x32-acc4.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x32.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x40-acc2.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x40-acc5.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x40.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x48-acc2.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x48-acc3.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x48.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x64-acc2.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x64-acc4.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x64.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x72-acc3.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x72.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x80-acc2.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x80-acc5.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x80.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x96-acc2.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x96-acc3.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x96-acc6.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x96.c",
+    "XNNPACK/src/f16-velu/gen/f16-velu-avx2-rr1-p3-x8.c",
+    "XNNPACK/src/f16-velu/gen/f16-velu-avx2-rr1-p3-x16.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x8.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x16.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x24.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x32.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x40.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x48.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x56.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x64.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x8.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x16.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x24.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x32.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x40.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x48.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x56.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x64.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-x16.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-x32.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-x48.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-x64.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-x16.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-x32.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-x48.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-x64.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x64-acc2.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x64-acc4.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x64.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x72-acc3.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x72.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x80-acc2.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x80-acc5.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x80.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x96-acc2.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x96-acc3.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x96-acc6.c",
+    "XNNPACK/src/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x96.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x64-acc2.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x64-acc4.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x64.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x72-acc3.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x72.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x80-acc2.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x80-acc5.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x80.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x96-acc2.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x96-acc3.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x96-acc6.c",
+    "XNNPACK/src/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x96.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x64-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x64-acc4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x64.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x72-acc3.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x72.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x80-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x80-acc5.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x80.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x96-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x96-acc3.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x96-acc6.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x96.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x32.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x40.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x48.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x56.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x64.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x72.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x80.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x32.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x40.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x48.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x56.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x64.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x72.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x80.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x32.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x40.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x48.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x56.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x64.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x72.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x80.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-p6-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-p6-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-p6-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-p6-x32.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-p6-x40.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-p6-x48.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-p6-x56.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-p6-x64.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-p6-x72.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-avx2-rr1-p6-x80.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x8.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x16.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x24.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x32.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x40.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x48.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x56.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x64.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x72.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x80.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x88.c",
+    "XNNPACK/src/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x96.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x8.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x16.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x24.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x32.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x40.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x48.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x56.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x64.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x72.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x80.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x88.c",
+    "XNNPACK/src/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x96.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x32.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x40.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x48.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x56.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x64.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x72.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x80.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x32.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x40.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x48.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x56.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x64.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x72.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x80.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x32.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x40.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x48.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x56.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x64.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x72.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x80.c",
+    "XNNPACK/src/math/exp-f32-avx2-rr2-lut8-p3-perm.c",
+    "XNNPACK/src/math/exp-f32-avx2-rr2-lut8-p4-perm.c",
+    "XNNPACK/src/math/exp-f32-avx2-rr2-p5.c",
+    "XNNPACK/src/math/expm1minus-f16-avx2-rr1-p3.c",
+    "XNNPACK/src/math/expm1minus-f32-avx2-rr1-lut4-p4-perm.c",
+    "XNNPACK/src/math/expm1minus-f32-avx2-rr1-lut8-p4-perm.c",
+    "XNNPACK/src/math/expm1minus-f32-avx2-rr1-lut16-p3-gather.c",
+    "XNNPACK/src/math/expm1minus-f32-avx2-rr1-p6.c",
+    "XNNPACK/src/math/expminus-f16-avx2-rr1-p2.c",
+    "XNNPACK/src/math/expminus-f16-avx2-rr1-p3.c",
+    "XNNPACK/src/math/expminus-f32-avx2-rr1-p5.c",
+    "XNNPACK/src/math/expminus-f32-avx2-rr2-p5.c",
+    "XNNPACK/src/math/extexp-avx2-p5.c",
+    "XNNPACK/src/math/sigmoid-f16-avx2-rr1-p2-div.c",
+    "XNNPACK/src/math/sigmoid-f16-avx2-rr1-p2-rcp.c",
+    "XNNPACK/src/math/sigmoid-f16-avx2-rr1-p3-div.c",
+    "XNNPACK/src/math/sigmoid-f16-avx2-rr1-p3-rcp.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr1-lut64-p2-gather-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr1-lut64-p2-gather-nr1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr1-lut64-p2-gather-nr2fma1adj.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr1-lut64-p2-gather-nr2fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr1-p5-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr1-p5-nr1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr1-p5-nr2fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr2-lut64-p2-gather-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr2-lut64-p2-gather-nr1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr2-lut64-p2-gather-nr2fma1adj.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr2-lut64-p2-gather-nr2fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr2-p5-div.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr2-p5-nr1fma.c",
+    "XNNPACK/src/math/sigmoid-f32-avx2-rr2-p5-nr2fma.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c8-xw-minmax-fp32-avx2.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c8-xw-minmax-fp32-avx2.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x8c8-xw-minmax-fp32-avx2.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx2-x8.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx2-x16.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx2-x24.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx2-x32.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-xw-minmax-fp32-avx2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-xw-minmax-fp32-avx2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x8c8-xw-minmax-fp32-avx2.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx2-mul32-ld64-x8.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx2-mul32-ld64-x16.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx2-mul32-ld64-x24.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx2-mul32-ld64-x32.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx2-mul32-ld64-x8.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx2-mul32-ld64-x16.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx2-mul32-ld64-x24.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx2-mul32-ld64-x32.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-avx2-x16.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-avx2-x32.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-avx2-x64.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-x16.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-x32.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-avx2-x64.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p32c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-fp32-avx2-mul32.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx2-x8.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx2-x16.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx2-x24.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx2-x32.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x8c8-minmax-fp32-avx2.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-avx2-mul32-ld64-x8.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-avx2-mul32-ld64-x16.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-avx2-mul32-ld64-x8.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-avx2-mul32-ld64-x16.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-avx2-x16.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-avx2-x32.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-avx2-x64.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-x16.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-x32.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-avx2-x64.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx2-x32.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx2-x64.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx2-x96.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx2-x128.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-32x32-reuse-mov-avx2.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-32x32-reuse-switch-avx2.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-16x16-reuse-mov-avx2.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-16x16-reuse-switch-avx2.c",
+]
+
+PROD_SSSE3_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/ssse3.c",
+]
+
+ALL_NEONFP16ARITH_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-avgpool/f16-avgpool-9p8x-minmax-neonfp16arith-c8.c",
+    "XNNPACK/src/f16-avgpool/f16-avgpool-9x-minmax-neonfp16arith-c8.c",
+    "XNNPACK/src/f16-conv-hwc2chw/f16-conv-hwc2chw-3x3s2p1c3x4-neonfp16arith-2x2.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-1x8-acc2.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-1x8-acc3.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-1x8-acc4.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-1x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-2x8-acc2.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-2x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-3x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-4x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-5x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-6x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-1x8-acc2.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-1x8-acc3.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-1x8-acc4.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-1x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-2x8-acc2.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-2x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-3x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-4x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-1x8-acc2.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-1x8-acc3.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-1x8-acc4.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-1x8-acc5.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-1x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-2x8-acc2.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-2x8-acc3.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-2x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-3x8-acc2.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-3x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-4x8-acc2.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-4x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-5x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-1x8-acc2.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-1x8-acc3.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-1x8-acc4.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-1x8-acc5.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-1x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-2x8-acc2.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-2x8-acc3.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-2x8.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-3x8-acc2.c",
+    "XNNPACK/src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-3x8.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-3p8c-minmax-neonfp16arith-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-3p8c-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-3p16c-minmax-neonfp16arith-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-3p16c-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-3p32c-minmax-neonfp16arith-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-3p32c-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-4p8c-minmax-neonfp16arith-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-4p8c-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-4p16c-minmax-neonfp16arith-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-4p16c-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-4p32c-minmax-neonfp16arith-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-4p32c-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-9p8c-minmax-neonfp16arith-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-9p8c-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-9p16c-minmax-neonfp16arith-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-9p16c-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-9p32c-minmax-neonfp16arith-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-9p32c-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-25p8c-minmax-neonfp16arith-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-25p8c-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-25p16c-minmax-neonfp16arith-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-25p16c-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-25p32c-minmax-neonfp16arith-acc2.c",
+    "XNNPACK/src/f16-dwconv/gen/f16-dwconv-25p32c-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-gavgpool-cw/f16-gavgpool-cw-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c8.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c16.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c24.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c32.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c8.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c16.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c24.c",
+    "XNNPACK/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c32.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-1x8-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-1x16-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-4x8-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-4x16-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-6x8-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-6x16-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-8x8-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemm-8x16-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-1x8-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-1x16-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-4x8-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-4x16-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-6x8-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-6x16-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-8x8-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-gemm/gen/f16-gemminc-8x16-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-ibilinear-chw/gen/f16-ibilinear-chw-neonfp16arith-p4.c",
+    "XNNPACK/src/f16-ibilinear-chw/gen/f16-ibilinear-chw-neonfp16arith-p8.c",
+    "XNNPACK/src/f16-ibilinear-chw/gen/f16-ibilinear-chw-neonfp16arith-p16.c",
+    "XNNPACK/src/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c8.c",
+    "XNNPACK/src/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c16.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-1x8-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-1x16-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-4x8-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-4x16-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-6x8-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-6x16-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-8x8-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-igemm/gen/f16-igemm-8x16-minmax-neonfp16arith-ld64.c",
+    "XNNPACK/src/f16-maxpool/f16-maxpool-9p8x-minmax-neonfp16arith-c8.c",
+    "XNNPACK/src/f16-pavgpool/f16-pavgpool-9p8x-minmax-neonfp16arith-c8.c",
+    "XNNPACK/src/f16-pavgpool/f16-pavgpool-9x-minmax-neonfp16arith-c8.c",
+    "XNNPACK/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c",
+    "XNNPACK/src/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x32-acc2.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x32-acc4.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x32.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x40-acc2.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x40-acc5.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x40.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x48-acc2.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x48-acc3.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x48.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x64-acc2.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x64-acc4.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x64.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x72-acc3.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x72.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x80-acc2.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x80-acc5.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x80.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x96-acc2.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x96-acc3.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x96-acc6.c",
+    "XNNPACK/src/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x96.c",
+    "XNNPACK/src/f16-rmax/f16-rmax-neonfp16arith.c",
+    "XNNPACK/src/f16-spmm/gen/f16-spmm-8x1-minmax-neonfp16arith-pipelined.c",
+    "XNNPACK/src/f16-spmm/gen/f16-spmm-8x1-minmax-neonfp16arith-x2.c",
+    "XNNPACK/src/f16-spmm/gen/f16-spmm-8x1-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-spmm/gen/f16-spmm-16x1-minmax-neonfp16arith-pipelined.c",
+    "XNNPACK/src/f16-spmm/gen/f16-spmm-16x1-minmax-neonfp16arith-x2.c",
+    "XNNPACK/src/f16-spmm/gen/f16-spmm-16x1-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-spmm/gen/f16-spmm-24x1-minmax-neonfp16arith-pipelined.c",
+    "XNNPACK/src/f16-spmm/gen/f16-spmm-24x1-minmax-neonfp16arith-x2.c",
+    "XNNPACK/src/f16-spmm/gen/f16-spmm-24x1-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith-pipelined.c",
+    "XNNPACK/src/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith-x2.c",
+    "XNNPACK/src/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmax-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmax-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmaxc-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmin-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmin-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vminc-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vminc-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vclamp/gen/f16-vclamp-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vclamp/gen/f16-vclamp-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-x8.c",
+    "XNNPACK/src/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-x16.c",
+    "XNNPACK/src/f16-vhswish/gen/f16-vhswish-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vhswish/gen/f16-vhswish-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vmulcaddc/gen/f16-vmulcaddc-c8-minmax-neonfp16arith-2x.c",
+    "XNNPACK/src/f16-vmulcaddc/gen/f16-vmulcaddc-c16-minmax-neonfp16arith-2x.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndd-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndd-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndne-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndne-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndu-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndu-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndz-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vrnd/gen/f16-vrndz-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x8.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x16.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x24.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x32.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x40.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x48.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x56.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x64.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x8.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x16.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x24.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x32.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x40.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x48.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x56.c",
+    "XNNPACK/src/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x64.c",
+    "XNNPACK/src/f16-vsqrt/gen/f16-vsqrt-neonfp16arith-nr1fma1adj-x8.c",
+    "XNNPACK/src/f16-vsqrt/gen/f16-vsqrt-neonfp16arith-nr1fma1adj-x16.c",
+    "XNNPACK/src/f16-vsqrt/gen/f16-vsqrt-neonfp16arith-nr1fma1adj-x24.c",
+    "XNNPACK/src/f16-vsqrt/gen/f16-vsqrt-neonfp16arith-nr1fma1adj-x32.c",
+    "XNNPACK/src/f16-vunary/gen/f16-vabs-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vunary/gen/f16-vabs-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vunary/gen/f16-vneg-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vunary/gen/f16-vneg-neonfp16arith-x16.c",
+    "XNNPACK/src/f16-vunary/gen/f16-vsqr-neonfp16arith-x8.c",
+    "XNNPACK/src/f16-vunary/gen/f16-vsqr-neonfp16arith-x16.c",
+    "XNNPACK/src/math/exp-f16-neonfp16arith-rr2-p3.c",
+    "XNNPACK/src/math/expm1minus-f16-neonfp16arith-rr1-p3.c",
+    "XNNPACK/src/math/expm1minus-f16-neonfp16arith-rr2-p3.c",
+    "XNNPACK/src/math/expminus-f16-neonfp16arith-rr1-p2.c",
+    "XNNPACK/src/math/expminus-f16-neonfp16arith-rr1-p3.c",
+    "XNNPACK/src/math/expminus-f16-neonfp16arith-rr2-p2.c",
+    "XNNPACK/src/math/expminus-f16-neonfp16arith-rr2-p3.c",
+    "XNNPACK/src/math/sigmoid-f16-neonfp16arith-rr2-p2-nr1fma.c",
+    "XNNPACK/src/math/sigmoid-f16-neonfp16arith-rr2-p2-nr1recps.c",
+    "XNNPACK/src/math/sigmoid-f16-neonfp16arith-rr2-p2-recpe.c",
+    "XNNPACK/src/math/sigmoid-f16-neonfp16arith-rr2-p3-nr1fma.c",
+    "XNNPACK/src/math/sigmoid-f16-neonfp16arith-rr2-p3-nr1recps.c",
+    "XNNPACK/src/math/sigmoid-f16-neonfp16arith-rr2-p3-recpe.c",
+    "XNNPACK/src/math/sqrt-f16-neonfp16arith-nr1fma1adj.c",
+    "XNNPACK/src/math/sqrt-f16-neonfp16arith-nr1fma.c",
+    "XNNPACK/src/math/sqrt-f16-neonfp16arith-nr1rsqrts.c",
+]
+
+ALL_HEXAGON_MICROKERNEL_SRCS = [
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x2.c",
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x4.c",
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x6.c",
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x8.c",
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x10.c",
+    "XNNPACK/src/cs16-vsquareabs/gen/cs16-vsquareabs-hexagon-x12.c",
+]
+
+PROD_SSE2_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/sse2.c",
+]
+
+PROD_NEONDOT_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/neondot.c",
+]
+
+ALL_NEON_AARCH64_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-x8.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-aarch64-neon-sqrt-x4.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-aarch64-neon-sqrt-x8.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-aarch64-neon-tbx128x4-x16.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-aarch64-neon-tbx128x4-x32.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-aarch64-neon-tbx128x4-x48.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-aarch64-neon-tbx128x4-x64.c",
+    "XNNPACK/src/x24-transposec/x24-transposec-4x4-aarch64-neon-tbl128.c",
+    "XNNPACK/src/x32-transposec/x32-transposec-4x4-aarch64-neon-tbl128.c",
+]
+
+PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/neonfp16arith.c",
+]
+
+ALL_AVX512VBMI_MICROKERNEL_SRCS = [
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx512vbmi-vpermx2b-x64.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx512vbmi-vpermx2b-x128.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx512vbmi-vpermx2b-x192.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx512vbmi-vpermx2b-x256.c",
+]
+
+PROD_SCALAR_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/scalar.c",
+]
+
+ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-aarch64-neonfma-2x2.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x4-aarch64-neonfma-2x1.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x4-aarch64-neonfma-2x2.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x8-aarch64-neonfma-2x1.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x8-aarch64-neonfma-2x2.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x4-aarch64-neonfma-2x1.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x4-aarch64-neonfma-2x2.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x8-aarch64-neonfma-2x1.c",
+    "XNNPACK/src/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x8-aarch64-neonfma-2x2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-6x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-4x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-3x4.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-aarch64-neonfma-lane-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x2-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-aarch64-neonfma-lane-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-aarch64-neonfma-lane-ld128.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-aarch64-neonfma-lane-ld128.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x4-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-aarch64-neonfma-lane-ld128.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x2-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-aarch64-neonfma-lane-ld128.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x2-minmax-aarch64-neonfma.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x4-minmax-aarch64-neonfma.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x2-minmax-aarch64-neonfma.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x4-minmax-aarch64-neonfma.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-12x2-minmax-aarch64-neonfma.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-12x4-minmax-aarch64-neonfma.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x2-minmax-aarch64-neonfma.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x4-minmax-aarch64-neonfma.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x2-minmax-aarch64-neonfma.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x4-minmax-aarch64-neonfma.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut64-p2-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut64-p2-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut64-p2-div-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut64-p2-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut64-p2-div-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut64-p2-div-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut2048-p1-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut2048-p1-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut2048-p1-div-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut2048-p1-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut2048-p1-div-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut2048-p1-div-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-p5-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-p5-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-p5-div-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-p5-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-p5-div-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-p5-div-x24.c",
+    "XNNPACK/src/math/sigmoid-f32-aarch64-neonfma-rr1-lut64-p2-div.c",
+    "XNNPACK/src/math/sigmoid-f32-aarch64-neonfma-rr1-lut2048-p1-div.c",
+    "XNNPACK/src/math/sigmoid-f32-aarch64-neonfma-rr1-p5-div.c",
+    "XNNPACK/src/math/sigmoid-f32-aarch64-neonfma-rr2-lut64-p2-div.c",
+    "XNNPACK/src/math/sigmoid-f32-aarch64-neonfma-rr2-lut2048-p1-div.c",
+    "XNNPACK/src/math/sigmoid-f32-aarch64-neonfma-rr2-p5-div.c",
+    "XNNPACK/src/math/tanh-f32-aarch64-neonfma-rr1-p6-div.c",
+]
+
+ALL_SSSE3_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-6x4.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-ssse3-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-ssse3-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-ssse3.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-ssse3-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-ssse3-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-ssse3.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-ssse3-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-ssse3-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-ssse3.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-ssse3-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-ssse3-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-ssse3-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-ssse3-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-ssse3-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-ssse3-ld128.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-gemmlowp-ssse3.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-rndna-ssse3.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-ssse3-x16.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-ssse3-x32.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-x16.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-ssse3-x32.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-gemmlowp-ssse3.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-rndna-ssse3.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-ssse3-x16.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-ssse3-x32.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-x16.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-ssse3-x32.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-ssse3-x16.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-ssse3-x32.c",
+    "XNNPACK/src/x24-transposec/x24-transposec-4x4-ssse3.c",
+]
+
+PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/neonfp16arith-aarch64.c",
 ]
 
 JIT_SRCS = [
     "XNNPACK/src/jit/aarch32-assembler.cc",
     "XNNPACK/src/jit/aarch64-assembler.cc",
     "XNNPACK/src/jit/assembler.cc",
-    "XNNPACK/src/jit/memory.c",
 ]
 
-JIT_AARCH32_SRCS = [
-    "XNNPACK/src/f32-gemm/4x8-aarch32-neon-cortex-a7.cc",
-    "XNNPACK/src/f32-gemm/4x8-aarch32-neon-cortex-a53.cc",
-    "XNNPACK/src/f32-gemm/4x8-aarch32-neon-cortex-a55.cc",
-    "XNNPACK/src/f32-gemm/4x8-aarch32-neon-cortex-a75.cc",
-    "XNNPACK/src/f32-gemm/4x8-aarch32-neon-ld64.cc",
-    "XNNPACK/src/f32-igemm/4x8-aarch32-neon-cortex-a7.cc",
-    "XNNPACK/src/f32-igemm/4x8-aarch32-neon-cortex-a53.cc",
-    "XNNPACK/src/f32-igemm/4x8-aarch32-neon-cortex-a55.cc",
-    "XNNPACK/src/f32-igemm/4x8-aarch32-neon-cortex-a75.cc",
-    "XNNPACK/src/f32-igemm/4x8-aarch32-neon-ld64.cc",
-    "XNNPACK/src/qc8-gemm/4x8-fp32-aarch32-neonv8-mlal-lane-ld64.cc",
-    "XNNPACK/src/qc8-gemm/4x8c4-fp32-aarch32-neondot-ld64.cc",
-    "XNNPACK/src/qc8-igemm/4x8-fp32-aarch32-neonv8-mlal-lane-ld64.cc",
-    "XNNPACK/src/qc8-igemm/4x8c4-fp32-aarch32-neondot-ld64.cc",
-    "XNNPACK/src/qs8-gemm/4x8-rndnu-aarch32-neon-mlal-lane-ld64.cc",
-    "XNNPACK/src/qs8-gemm/4x8c4-rndnu-aarch32-neondot-ld64.cc",
-    "XNNPACK/src/qs8-igemm/4x8-rndnu-aarch32-neon-mlal-lane-ld64.cc",
-    "XNNPACK/src/qs8-igemm/4x8c4-rndnu-aarch32-neondot-ld64.cc",
+OPERATOR_SRCS = [
+    "XNNPACK/src/operator-delete.c",
+    "XNNPACK/src/operators/argmax-pooling-nhwc.c",
+    "XNNPACK/src/operators/average-pooling-nhwc.c",
+    "XNNPACK/src/operators/binary-elementwise-nd.c",
+    "XNNPACK/src/operators/channel-shuffle-nc.c",
+    "XNNPACK/src/operators/constant-pad-nd.c",
+    "XNNPACK/src/operators/convolution-nchw.c",
+    "XNNPACK/src/operators/convolution-nhwc.c",
+    "XNNPACK/src/operators/deconvolution-nhwc.c",
+    "XNNPACK/src/operators/fully-connected-nc.c",
+    "XNNPACK/src/operators/global-average-pooling-ncw.c",
+    "XNNPACK/src/operators/global-average-pooling-nwc.c",
+    "XNNPACK/src/operators/lut-elementwise-nc.c",
+    "XNNPACK/src/operators/max-pooling-nhwc.c",
+    "XNNPACK/src/operators/prelu-nc.c",
+    "XNNPACK/src/operators/resize-bilinear-nchw.c",
+    "XNNPACK/src/operators/resize-bilinear-nhwc.c",
+    "XNNPACK/src/operators/slice-nd.c",
+    "XNNPACK/src/operators/softmax-nc.c",
+    "XNNPACK/src/operators/transpose-nd.c",
+    "XNNPACK/src/operators/unary-elementwise-nc.c",
+    "XNNPACK/src/operators/unpooling-nhwc.c",
 ]
 
-JIT_AARCH64_SRCS = [
-    "XNNPACK/src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.cc",
-    "XNNPACK/src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.cc",
-    "XNNPACK/src/f32-igemm/1x8-aarch64-neonfma-cortex-a75.cc",
-    "XNNPACK/src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.cc",
+PROD_SSE41_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/sse41.c",
 ]
 
-PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS = [
-    "XNNPACK/src/params-init.c",
-    "XNNPACK/src/u8-lut32norm/scalar.c",
-    "XNNPACK/src/xx-copy/memcpy.c",
-    "XNNPACK/src/x8-lut/gen/lut-scalar-x4.c",
-    "XNNPACK/src/x32-depthtospace2d-chw2hwc/scalar.c",
+ALL_NEONFP16_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-x16.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-neonfp16-x8.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-neonfp16-x16.c",
+    "XNNPACK/src/math/cvt-f16-f32-neonfp16.c",
+    "XNNPACK/src/math/cvt-f32-f16-neonfp16.c",
 ]
 
-PROD_SSE_MICROKERNEL_SRCS = [
-    "XNNPACK/src/f32-avgpool/9p8x-minmax-sse-c4.c",
-    "XNNPACK/src/f32-avgpool/9x-minmax-sse-c4.c",
-    "XNNPACK/src/f32-conv-hwc2chw/3x3s2p1c3x4-sse-2x2.c",
-    "XNNPACK/src/f32-dwconv/gen/up8x3-minmax-sse.c",
-    "XNNPACK/src/f32-dwconv/gen/up8x4-minmax-sse.c",
-    "XNNPACK/src/f32-dwconv/gen/up8x9-minmax-sse.c",
-    "XNNPACK/src/f32-dwconv/gen/up8x25-minmax-sse.c",
-    "XNNPACK/src/f32-dwconv2d-chw/gen/3x3p1-minmax-sse-2x4-acc2.c",
-    "XNNPACK/src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-sse-1x4-acc3.c",
-    "XNNPACK/src/f32-dwconv2d-chw/gen/5x5p2-minmax-sse-4x4.c",
-    "XNNPACK/src/f32-dwconv2d-chw/gen/5x5s2p2-minmax-sse-2x4.c",
-    "XNNPACK/src/f32-gavgpool-cw/sse-x4.c",
-    "XNNPACK/src/f32-gavgpool/7p7x-minmax-sse-c4.c",
-    "XNNPACK/src/f32-gavgpool/7x-minmax-sse-c4.c",
-    "XNNPACK/src/f32-gemm/gen/1x8-minmax-sse-load1.c",
-    "XNNPACK/src/f32-gemm/gen/4x2c4-minmax-sse.c",
-    "XNNPACK/src/f32-gemm/gen/4x8-minmax-sse-load1.c",
-    "XNNPACK/src/f32-ibilinear-chw/gen/sse-p8.c",
-    "XNNPACK/src/f32-ibilinear/gen/sse-c8.c",
-    "XNNPACK/src/f32-igemm/gen/1x8-minmax-sse-load1.c",
-    "XNNPACK/src/f32-igemm/gen/4x2c4-minmax-sse.c",
-    "XNNPACK/src/f32-igemm/gen/4x8-minmax-sse-load1.c",
-    "XNNPACK/src/f32-maxpool/9p8x-minmax-sse-c4.c",
-    "XNNPACK/src/f32-pavgpool/9p8x-minmax-sse-c4.c",
-    "XNNPACK/src/f32-pavgpool/9x-minmax-sse-c4.c",
-    "XNNPACK/src/f32-rmax/sse.c",
-    "XNNPACK/src/f32-spmm/gen/32x1-minmax-sse.c",
-    "XNNPACK/src/f32-vbinary/gen/vadd-minmax-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vaddc-minmax-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vdiv-minmax-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vdivc-minmax-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vmax-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vmaxc-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vmin-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vminc-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vmul-minmax-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vmulc-minmax-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vrdivc-minmax-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vrsubc-minmax-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vsqrdiff-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vsqrdiffc-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vsub-minmax-sse-x8.c",
-    "XNNPACK/src/f32-vbinary/gen/vsubc-minmax-sse-x8.c",
-    "XNNPACK/src/f32-vclamp/gen/vclamp-sse-x8.c",
-    "XNNPACK/src/f32-vhswish/gen/vhswish-sse-x8.c",
-    "XNNPACK/src/f32-vlrelu/gen/vlrelu-sse-x8.c",
-    "XNNPACK/src/f32-vmulcaddc/gen/c4-minmax-sse-2x.c",
-    "XNNPACK/src/f32-vsqrt/gen/sse-sqrt-x4.c",
-    "XNNPACK/src/f32-vunary/gen/vabs-sse-x8.c",
-    "XNNPACK/src/f32-vunary/gen/vneg-sse-x8.c",
-    "XNNPACK/src/f32-vunary/gen/vsqr-sse-x8.c",
-    "XNNPACK/src/x32-packx/x4-sse.c",
+PROD_AVX512SKX_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/avx512skx.c",
 ]
 
-PROD_SSE2_MICROKERNEL_SRCS = [
-    "XNNPACK/src/f16-f32-vcvt/gen/vcvt-sse2-int16-x32.c",
-    "XNNPACK/src/f32-argmaxpool/4x-sse2-c4.c",
-    "XNNPACK/src/f32-argmaxpool/9p8x-sse2-c4.c",
-    "XNNPACK/src/f32-argmaxpool/9x-sse2-c4.c",
-    "XNNPACK/src/f32-f16-vcvt/gen/vcvt-sse2-x16.c",
-    "XNNPACK/src/f32-prelu/gen/sse2-2x8.c",
-    "XNNPACK/src/f32-qs8-vcvt/gen/vcvt-sse2-x32.c",
-    "XNNPACK/src/f32-qu8-vcvt/gen/vcvt-sse2-x32.c",
-    "XNNPACK/src/f32-raddstoreexpminusmax/gen/sse2-rr2-p5-x20-acc2.c",
-    "XNNPACK/src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x12.c",
-    "XNNPACK/src/f32-vlrelu/gen/vlrelu-sse2-x8.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndd-sse2-x8.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndne-sse2-x8.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndu-sse2-x8.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndz-sse2-x8.c",
-    "XNNPACK/src/f32-vsigmoid/gen/vsigmoid-sse2-rr2-lut64-p2-div-x8.c",
-    "XNNPACK/src/qc8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16.c",
-    "XNNPACK/src/qc8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16.c",
-    "XNNPACK/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
-    "XNNPACK/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
-    "XNNPACK/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
-    "XNNPACK/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
-    "XNNPACK/src/qs8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16-add16.c",
-    "XNNPACK/src/qs8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16-add16.c",
-    "XNNPACK/src/qs8-f32-vcvt/gen/vcvt-sse2-x32.c",
-    "XNNPACK/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c",
-    "XNNPACK/src/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c",
-    "XNNPACK/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
-    "XNNPACK/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
-    "XNNPACK/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
-    "XNNPACK/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
-    "XNNPACK/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c",
-    "XNNPACK/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c",
-    "XNNPACK/src/qs8-vmul/gen/minmax-fp32-sse2-mul16-ld64-x8.c",
-    "XNNPACK/src/qs8-vmulc/gen/minmax-fp32-sse2-mul16-ld64-x8.c",
-    "XNNPACK/src/qu8-avgpool/9p8x-minmax-sse2-c8.c",
-    "XNNPACK/src/qu8-avgpool/9x-minmax-sse2-c8.c",
-    "XNNPACK/src/qu8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16.c",
-    "XNNPACK/src/qu8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16.c",
-    "XNNPACK/src/qu8-f32-vcvt/gen/vcvt-sse2-x32.c",
-    "XNNPACK/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c",
-    "XNNPACK/src/qu8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c",
-    "XNNPACK/src/qu8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
-    "XNNPACK/src/qu8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
-    "XNNPACK/src/qu8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
-    "XNNPACK/src/qu8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
-    "XNNPACK/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x8.c",
-    "XNNPACK/src/qu8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c",
-    "XNNPACK/src/qu8-vmul/gen/minmax-fp32-sse2-mul16-ld64-x8.c",
-    "XNNPACK/src/qu8-vmulc/gen/minmax-fp32-sse2-mul16-ld64-x8.c",
-    "XNNPACK/src/s8-ibilinear/gen/sse2-c8.c",
-    "XNNPACK/src/s8-maxpool/9p8x-minmax-sse2-c16.c",
-    "XNNPACK/src/s8-vclamp/sse2-x64.c",
-    "XNNPACK/src/u8-ibilinear/gen/sse2-c8.c",
-    "XNNPACK/src/u8-maxpool/9p8x-minmax-sse2-c16.c",
-    "XNNPACK/src/u8-rmax/sse2.c",
-    "XNNPACK/src/u8-vclamp/sse2-x64.c",
-    "XNNPACK/src/xx-fill/sse2-x64.c",
-    "XNNPACK/src/xx-pad/sse2.c",
-    "XNNPACK/src/x8-zip/xm-sse2.c",
-    "XNNPACK/src/x8-zip/x2-sse2.c",
-    "XNNPACK/src/x8-zip/x3-sse2.c",
-    "XNNPACK/src/x8-zip/x4-sse2.c",
-    "XNNPACK/src/x32-unpool/sse2.c",
-    "XNNPACK/src/x32-zip/xm-sse2.c",
-    "XNNPACK/src/x32-zip/x2-sse2.c",
-    "XNNPACK/src/x32-zip/x3-sse2.c",
-    "XNNPACK/src/x32-zip/x4-sse2.c",
+JIT_AARCH64_SRCS = [
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-aarch64-neonfma-cortex-a53.cc",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-aarch64-neonfma-cortex-a75.cc",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-aarch64-neonfma-cortex-a53.cc",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-aarch64-neonfma-cortex-a55.cc",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-aarch64-neonfma-cortex-a75.cc",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-aarch64-neonfma-cortex-a53.cc",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-aarch64-neonfma-cortex-a55.cc",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-aarch64-neonfma-cortex-a75.cc",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-aarch64-neonfma-ld128.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-aarch64-neonfma-cortex-a53.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-aarch64-neonfma-cortex-a75.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-aarch64-neonfma-cortex-a53.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-aarch64-neonfma-cortex-a55.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-aarch64-neonfma-cortex-a75.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-aarch64-neonfma-cortex-a53.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-aarch64-neonfma-cortex-a55.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-aarch64-neonfma-cortex-a75.cc",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-aarch64-neonfma-ld128.cc",
 ]
 
-PROD_SSSE3_MICROKERNEL_SRCS = [
-    "XNNPACK/src/f32-dwconv2d-chw/gen/3x3p1-minmax-ssse3-2x4-acc2.c",
+PROD_SCALAR_RISCV_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/scalar-riscv.c",
 ]
 
-PROD_SSE41_MICROKERNEL_SRCS = [
-    "XNNPACK/src/f16-f32-vcvt/gen/vcvt-sse41-int16-x16.c",
-    "XNNPACK/src/f32-f16-vcvt/gen/vcvt-sse41-x8.c",
-    "XNNPACK/src/f32-prelu/gen/sse41-2x8.c",
-    "XNNPACK/src/f32-qs8-vcvt/gen/vcvt-sse41-x32.c",
-    "XNNPACK/src/f32-vlrelu/gen/vlrelu-sse41-x8.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndd-sse41-x8.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndne-sse41-x8.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndu-sse41-x8.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndz-sse41-x8.c",
-    "XNNPACK/src/f32-vsigmoid/gen/vsigmoid-sse41-rr2-lut64-p2-div-x8.c",
-    "XNNPACK/src/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16.c",
-    "XNNPACK/src/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16.c",
-    "XNNPACK/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
-    "XNNPACK/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
-    "XNNPACK/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
-    "XNNPACK/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
-    "XNNPACK/src/qs8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16-add16.c",
-    "XNNPACK/src/qs8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16-add16.c",
-    "XNNPACK/src/qs8-f32-vcvt/gen/vcvt-sse41-x16.c",
-    "XNNPACK/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c",
-    "XNNPACK/src/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c",
-    "XNNPACK/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
-    "XNNPACK/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
-    "XNNPACK/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
-    "XNNPACK/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
-    "XNNPACK/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c",
-    "XNNPACK/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c",
-    "XNNPACK/src/qs8-vmul/gen/minmax-fp32-sse41-mul16-ld64-x16.c",
-    "XNNPACK/src/qs8-vmulc/gen/minmax-fp32-sse41-mul16-ld64-x16.c",
-    "XNNPACK/src/qu8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16.c",
-    "XNNPACK/src/qu8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16.c",
-    "XNNPACK/src/qu8-f32-vcvt/gen/vcvt-sse41-x16.c",
-    "XNNPACK/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c",
-    "XNNPACK/src/qu8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c",
-    "XNNPACK/src/qu8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
-    "XNNPACK/src/qu8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
-    "XNNPACK/src/qu8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
-    "XNNPACK/src/qu8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
-    "XNNPACK/src/qu8-vadd/gen/minmax-sse41-mul16-ld64-x8.c",
-    "XNNPACK/src/qu8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c",
-    "XNNPACK/src/qu8-vmul/gen/minmax-fp32-sse41-mul16-ld64-x16.c",
-    "XNNPACK/src/qu8-vmulc/gen/minmax-fp32-sse41-mul16-ld64-x16.c",
-    "XNNPACK/src/s8-ibilinear/gen/sse41-c16.c",
-    "XNNPACK/src/s8-maxpool/9p8x-minmax-sse41-c16.c",
-    "XNNPACK/src/s8-vclamp/sse41-x64.c",
-    "XNNPACK/src/u8-ibilinear/gen/sse41-c16.c",
+PROD_NEONFMA_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/neonfma.c",
 ]
 
-PROD_AVX_MICROKERNEL_SRCS = [
-    "XNNPACK/src/f16-f32-vcvt/gen/vcvt-avx-int16-x16.c",
-    "XNNPACK/src/f32-dwconv/gen/up8x25-minmax-avx.c",
-    "XNNPACK/src/f32-dwconv/gen/up16x3-minmax-avx.c",
-    "XNNPACK/src/f32-dwconv/gen/up16x4-minmax-avx.c",
-    "XNNPACK/src/f32-dwconv/gen/up16x9-minmax-avx.c",
-    "XNNPACK/src/f32-f16-vcvt/gen/vcvt-avx-x24.c",
-    "XNNPACK/src/f32-gemm/gen/1x16-minmax-avx-broadcast.c",
-    "XNNPACK/src/f32-gemm/gen/5x16-minmax-avx-broadcast.c",
-    "XNNPACK/src/f32-igemm/gen/1x16-minmax-avx-broadcast.c",
-    "XNNPACK/src/f32-igemm/gen/5x16-minmax-avx-broadcast.c",
-    "XNNPACK/src/f32-prelu/gen/avx-2x16.c",
-    "XNNPACK/src/f32-qs8-vcvt/gen/vcvt-avx-x32.c",
-    "XNNPACK/src/f32-qu8-vcvt/gen/vcvt-avx-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vadd-minmax-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vaddc-minmax-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vdiv-minmax-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vdivc-minmax-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vmax-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vmaxc-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vmin-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vminc-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vmul-minmax-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vmulc-minmax-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vrdivc-minmax-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vrsubc-minmax-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vsqrdiff-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vsqrdiffc-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vsub-minmax-avx-x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vsubc-minmax-avx-x16.c",
-    "XNNPACK/src/f32-vclamp/gen/vclamp-avx-x16.c",
-    "XNNPACK/src/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x32.c",
-    "XNNPACK/src/f32-vhswish/gen/vhswish-avx-x16.c",
-    "XNNPACK/src/f32-vlrelu/gen/vlrelu-avx-x16.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndd-avx-x16.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndne-avx-x16.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndu-avx-x16.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndz-avx-x16.c",
-    "XNNPACK/src/f32-vsigmoid/gen/vsigmoid-avx-rr2-p5-nr2-x40.c",
-    "XNNPACK/src/f32-vsqrt/gen/avx-sqrt-x8.c",
-    "XNNPACK/src/f32-vunary/gen/vabs-avx-x16.c",
-    "XNNPACK/src/f32-vunary/gen/vneg-avx-x16.c",
-    "XNNPACK/src/f32-vunary/gen/vsqr-avx-x16.c",
-    "XNNPACK/src/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul16-add16.c",
-    "XNNPACK/src/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul16-add16.c",
-    "XNNPACK/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
-    "XNNPACK/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
-    "XNNPACK/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
-    "XNNPACK/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
-    "XNNPACK/src/qs8-dwconv/gen/up16x9-minmax-fp32-avx-mul16-add16.c",
-    "XNNPACK/src/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul16-add16.c",
-    "XNNPACK/src/qs8-f32-vcvt/gen/vcvt-avx-x32.c",
-    "XNNPACK/src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
-    "XNNPACK/src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
-    "XNNPACK/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
-    "XNNPACK/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
-    "XNNPACK/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x8.c",
-    "XNNPACK/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x8.c",
-    "XNNPACK/src/qs8-vmul/gen/minmax-fp32-avx-mul16-ld64-x16.c",
-    "XNNPACK/src/qs8-vmulc/gen/minmax-fp32-avx-mul16-ld64-x16.c",
-    "XNNPACK/src/qu8-dwconv/gen/up16x9-minmax-fp32-avx-mul16.c",
-    "XNNPACK/src/qu8-dwconv/gen/up16x25-minmax-fp32-avx-mul16.c",
-    "XNNPACK/src/qu8-f32-vcvt/gen/vcvt-avx-x32.c",
-    "XNNPACK/src/qu8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
-    "XNNPACK/src/qu8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
-    "XNNPACK/src/qu8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
-    "XNNPACK/src/qu8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
-    "XNNPACK/src/qu8-vadd/gen/minmax-avx-mul32-ld32-x8.c",
-    "XNNPACK/src/qu8-vaddc/gen/minmax-avx-mul32-ld32-x8.c",
-    "XNNPACK/src/qu8-vmul/gen/minmax-fp32-avx-mul16-ld64-x16.c",
-    "XNNPACK/src/qu8-vmulc/gen/minmax-fp32-avx-mul16-ld64-x16.c",
-    "XNNPACK/src/x8-lut/gen/lut-avx-x64.c",
+ALL_AVX512SKX_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-x32.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-avx512skx-x16.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-avx512skx-x32.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-x32.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-x64.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-x96.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-x128.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-x32.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-x64.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-x96.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-x128.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p32c-minmax-fp32-avx512skx-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx512skx-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-avx512skx-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx512skx-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-avx512skx-mul32.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx512skx-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-avx512skx-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx512skx-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-avx512skx-mul32.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx512skx-x16.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx512skx-x32.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx512skx-x48.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx512skx-x64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx512skx-mul32-ld128-x16.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-avx512skx-mul32-ld128-x32.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx512skx-mul32-ld128-x16.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-avx512skx-mul32-ld128-x32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-avx512skx-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p32c-minmax-fp32-avx512skx-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-avx512skx-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-fp32-avx512skx-mul32.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx512skx-x16.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx512skx-x32.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx512skx-x48.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx512skx-x64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c8-minmax-fp32-avx512skx.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-avx512skx-mul32-ld128-x16.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-avx512skx-mul32-ld128-x32.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-avx512skx-mul32-ld128-x16.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-avx512skx-mul32-ld128-x32.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx512skx-vpshufb-x64.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx512skx-vpshufb-x128.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx512skx-vpshufb-x192.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-avx512skx-vpshufb-x256.c",
 ]
 
-PROD_F16C_MICROKERNEL_SRCS = [
-    "XNNPACK/src/f16-f32-vcvt/gen/vcvt-f16c-x16.c",
-    "XNNPACK/src/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c",
-    "XNNPACK/src/f16-gavgpool/gen/7x-minmax-f16c-c8.c",
-    "XNNPACK/src/f16-maxpool/9p8x-minmax-f16c-c8.c",
-    "XNNPACK/src/f16-prelu/gen/f16c-2x16.c",
-    "XNNPACK/src/f16-vbinary/gen/vadd-minmax-f16c-x16.c",
-    "XNNPACK/src/f16-vbinary/gen/vaddc-minmax-f16c-x16.c",
-    "XNNPACK/src/f16-vbinary/gen/vmul-minmax-f16c-x16.c",
-    "XNNPACK/src/f16-vbinary/gen/vmulc-minmax-f16c-x16.c",
-    "XNNPACK/src/f16-vclamp/gen/vclamp-f16c-x16.c",
-    "XNNPACK/src/f16-vhswish/gen/vhswish-f16c-x16.c",
-    "XNNPACK/src/f16-vlrelu/gen/vlrelu-f16c-x16.c",
-    "XNNPACK/src/f32-f16-vcvt/gen/vcvt-f16c-x16.c",
+ALL_SSE41_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-x24.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-x32.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-x24.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-x32.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-x8.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-x16.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-x24.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-x32.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-sse41-2x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-sse41-2x8.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-x8.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-x16.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-x24.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-x32.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse41-rr2-lut16-p3-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse41-rr2-lut16-p3-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse41-rr2-lut16-p3-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse41-rr2-lut16-p3-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse41-rr2-lut16-p3-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse41-rr2-lut16-p3-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse41-rr2-p6-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse41-rr2-p6-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse41-rr2-p6-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse41-rr2-p6-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse41-rr2-p6-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-sse41-rr2-p6-x24.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-sse41-x4.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-sse41-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-sse41-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-sse41-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-sse41-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-sse41-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-sse41-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-sse41-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-sse41-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-sse41-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-lut64-p2-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-lut64-p2-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-lut64-p2-div-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-lut64-p2-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-lut64-p2-div-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-lut64-p2-div-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-p5-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-p5-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-p5-div-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-p5-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-p5-div-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-p5-div-x24.c",
+    "XNNPACK/src/math/cvt-f16-f32-sse41-int16.c",
+    "XNNPACK/src/math/cvt-f16-f32-sse41-int32.c",
+    "XNNPACK/src/math/cvt-f32-f16-sse41.c",
+    "XNNPACK/src/math/roundd-sse41.c",
+    "XNNPACK/src/math/roundne-sse41.c",
+    "XNNPACK/src/math/roundu-sse41.c",
+    "XNNPACK/src/math/roundz-sse41.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p8c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-sse41-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-sse41-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-sse41-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-sse41-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse41-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-sse41-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse41-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-sse41-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-x8.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-x16.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-x24.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-x32.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c8.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c16.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c24.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c8.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c16.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c24.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-xw-minmax-fp32-sse41.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-xw-minmax-fp32-sse41.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-sse41.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-xw-minmax-fp32-sse41.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-xw-minmax-fp32-sse41.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-sse41.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-xw-minmax-fp32-sse41.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-xw-minmax-fp32-sse41.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-sse41.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-xw-minmax-fp32-sse41.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-xw-minmax-fp32-sse41.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-fp32-sse41.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-gemmlowp-sse41.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-rndna-sse41.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-rndnu-sse41-sra.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-rndnu-sse41-srl.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-x8.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-x16.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-x24.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-x32.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul32-ld32-x8.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul32-ld32-x16.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul32-ld32-x24.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul32-ld32-x32.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul16-ld64-x8.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul16-ld64-x16.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul16-ld64-x24.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul16-ld64-x32.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul32-ld32-x8.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul32-ld32-x16.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul32-ld32-x24.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul32-ld32-x32.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-sse41-x8.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-sse41-x16.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-sse41-x32.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-x8.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-x16.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-sse41-x32.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-sse41-mul16-ld64-x8.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-sse41-mul16-ld64-x16.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-sse41-mul16-ld64-x8.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-sse41-mul16-ld64-x16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-sse41-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-sse41-mul32.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-x8.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-x16.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-x24.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-x32.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c8.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c16.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c24.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c8.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c16.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c24.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse41-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse41-ld128.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-gemmlowp-sse41.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-rndna-sse41.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul16-ld64-x8.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul16-ld64-x16.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul32-ld32-x8.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul32-ld32-x16.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-sse41-mul16-ld64-x8.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-sse41-mul16-ld64-x16.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-sse41-mul32-ld32-x8.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-sse41-mul32-ld32-x16.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-sse41-x8.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-sse41-x16.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-sse41-x32.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-x8.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-x16.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-sse41-x32.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-sse41-mul16-ld64-x8.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-sse41-mul16-ld64-x16.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-sse41-mul16-ld64-x8.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-sse41-mul16-ld64-x16.c",
+    "XNNPACK/src/s8-ibilinear/gen/s8-ibilinear-sse41-c8.c",
+    "XNNPACK/src/s8-ibilinear/gen/s8-ibilinear-sse41-c16.c",
+    "XNNPACK/src/s8-maxpool/s8-maxpool-9p8x-minmax-sse41-c16.c",
+    "XNNPACK/src/s8-vclamp/s8-vclamp-sse41-x64.c",
+    "XNNPACK/src/u8-ibilinear/gen/u8-ibilinear-sse41-c8.c",
+    "XNNPACK/src/u8-ibilinear/gen/u8-ibilinear-sse41-c16.c",
 ]
 
-PROD_XOP_MICROKERNEL_SRCS = [
-    "XNNPACK/src/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul16-add16.c",
-    "XNNPACK/src/qc8-dwconv/gen/up16x25-minmax-fp32-xop-mul16-add16.c",
-    "XNNPACK/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
-    "XNNPACK/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
-    "XNNPACK/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
-    "XNNPACK/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
-    "XNNPACK/src/qs8-dwconv/gen/up16x9-minmax-fp32-xop-mul16-add16.c",
-    "XNNPACK/src/qs8-dwconv/gen/up16x25-minmax-fp32-xop-mul16-add16.c",
-    "XNNPACK/src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
-    "XNNPACK/src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
-    "XNNPACK/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
-    "XNNPACK/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
-    "XNNPACK/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x8.c",
-    "XNNPACK/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x8.c",
-    "XNNPACK/src/qu8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c",
-    "XNNPACK/src/qu8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c",
-    "XNNPACK/src/qu8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
-    "XNNPACK/src/qu8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
-    "XNNPACK/src/qu8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
-    "XNNPACK/src/qu8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
-    "XNNPACK/src/qu8-vadd/gen/minmax-xop-mul32-ld32-x8.c",
-    "XNNPACK/src/qu8-vaddc/gen/minmax-xop-mul32-ld32-x8.c",
+ALL_NEONV8_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-x8.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-x16.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-x24.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-x32.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-x8.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-x16.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-x24.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-x32.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-neonv8-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-neonv8-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-neonv8-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-neonv8-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-neonv8-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-neonv8-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-neonv8-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-neonv8-x8.c",
+    "XNNPACK/src/math/cvt-f32-qs8-neonv8.c",
+    "XNNPACK/src/math/cvt-f32-qu8-neonv8.c",
+    "XNNPACK/src/math/roundd-neonv8.c",
+    "XNNPACK/src/math/roundne-neonv8.c",
+    "XNNPACK/src/math/roundu-neonv8.c",
+    "XNNPACK/src/math/roundz-neonv8.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p8c-minmax-fp32-neonv8-mla8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-neonv8-mla8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-neonv8-mla8-ld128.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-neonv8-mla8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-neonv8-mul8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neonv8-mla8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neonv8-mla8-ld128.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neonv8-mul8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neonv8-mul8-ld128.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-neonv8-mla8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-neonv8-mul8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neonv8-mla8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neonv8-mla8-ld128.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neonv8-mul8-ld64.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neonv8-mul8-ld128.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x8-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-6x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-6x8-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-6x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-6x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x8-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-6x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-6x8-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-6x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-6x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c8.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c16.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c24.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c32.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-neonv8-mlal.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-neonv8-ld64-x8.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-neonv8-ld64-x16.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-neonv8-ld128-x16.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-neonv8-ld64-x8.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-neonv8-ld64-x16.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-neonv8-ld128-x16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p24c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p32c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p24c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-fp32-neonv8-mul16.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c8.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c16.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c24.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c32.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-neonv8-ld64-x8.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-neonv8-ld64-x16.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-neonv8-ld128-x16.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-neonv8-ld64-x8.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-neonv8-ld64-x16.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-neonv8-ld128-x16.c",
 ]
 
-PROD_FMA3_MICROKERNEL_SRCS = [
-    "XNNPACK/src/f16-dwconv/gen/up8x25-minmax-fma3-acc2.c",
-    "XNNPACK/src/f16-dwconv/gen/up16x3-minmax-fma3.c",
-    "XNNPACK/src/f16-dwconv/gen/up16x4-minmax-fma3.c",
-    "XNNPACK/src/f16-dwconv/gen/up16x9-minmax-fma3.c",
-    "XNNPACK/src/f16-ibilinear/gen/fma3-c8.c",
-    "XNNPACK/src/f16-vmulcaddc/gen/c8-minmax-fma3-2x.c",
-    "XNNPACK/src/f32-dwconv/gen/up8x25-minmax-fma3.c",
-    "XNNPACK/src/f32-dwconv/gen/up16x3-minmax-fma3.c",
-    "XNNPACK/src/f32-dwconv/gen/up16x4-minmax-fma3.c",
-    "XNNPACK/src/f32-dwconv/gen/up16x9-minmax-fma3.c",
-    "XNNPACK/src/f32-gemm/gen/1x16-minmax-fma3-broadcast.c",
-    "XNNPACK/src/f32-gemm/gen/1x16s4-minmax-fma3-broadcast.c",
-    "XNNPACK/src/f32-gemm/gen/4x16s4-minmax-fma3-broadcast.c",
-    "XNNPACK/src/f32-gemm/gen/5x16-minmax-fma3-broadcast.c",
-    "XNNPACK/src/f32-igemm/gen/1x16-minmax-fma3-broadcast.c",
-    "XNNPACK/src/f32-igemm/gen/1x16s4-minmax-fma3-broadcast.c",
-    "XNNPACK/src/f32-igemm/gen/4x16s4-minmax-fma3-broadcast.c",
-    "XNNPACK/src/f32-igemm/gen/5x16-minmax-fma3-broadcast.c",
-    "XNNPACK/src/f32-vhswish/gen/vhswish-fma3-x16.c",
+ALL_WASMSIMD_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-x24.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-x32.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-x8.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-x16.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-x24.c",
+    "XNNPACK/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-x32.c",
+    "XNNPACK/src/f32-argmaxpool/f32-argmaxpool-4x-wasmsimd-c4.c",
+    "XNNPACK/src/f32-argmaxpool/f32-argmaxpool-9p8x-wasmsimd-c4.c",
+    "XNNPACK/src/f32-argmaxpool/f32-argmaxpool-9x-wasmsimd-c4.c",
+    "XNNPACK/src/f32-avgpool/f32-avgpool-9p8x-minmax-wasmsimd-arm-c4.c",
+    "XNNPACK/src/f32-avgpool/f32-avgpool-9p8x-minmax-wasmsimd-x86-c4.c",
+    "XNNPACK/src/f32-avgpool/f32-avgpool-9x-minmax-wasmsimd-arm-c4.c",
+    "XNNPACK/src/f32-avgpool/f32-avgpool-9x-minmax-wasmsimd-x86-c4.c",
+    "XNNPACK/src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-wasmsimd-2x2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-loadsplat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-loadsplat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-loadsplat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-loadsplat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-loadsplat-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-splat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-splat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-splat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-splat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-splat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-splat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-splat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-splat-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-splat-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-arm-splat-6x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-loadsplat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-loadsplat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-loadsplat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-loadsplat-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-loadsplat-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-splat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-splat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-splat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-splat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-splat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-splat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-splat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-splat-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-splat-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-wasmsimd-x86-splat-6x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-loadsplat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-loadsplat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-loadsplat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-loadsplat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-loadsplat-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-splat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-splat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-splat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-splat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-splat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-splat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-splat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-arm-splat-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-loadsplat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-loadsplat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-loadsplat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-loadsplat-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-splat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-splat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-splat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-splat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-splat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-splat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-splat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-wasmsimd-x86-splat-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-loadsplat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-loadsplat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-splat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-splat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-splat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-splat-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-splat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-splat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-splat-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-splat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-splat-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-splat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-splat-4x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-splat-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-arm-splat-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-loadsplat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-loadsplat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-splat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-splat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-splat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-splat-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-splat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-splat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-splat-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-splat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-splat-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-splat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-splat-4x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-splat-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-wasmsimd-x86-splat-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-loadsplat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-loadsplat-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-loadsplat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-loadsplat-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-loadsplat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-loadsplat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-splat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-splat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-splat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-splat-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-splat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-splat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-splat-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-splat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-splat-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-splat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-loadsplat-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-loadsplat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-loadsplat-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-loadsplat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-loadsplat-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-splat-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-splat-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-splat-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-splat-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-splat-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-splat-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-splat-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-splat-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-splat-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-splat-3x4.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-wasmsimd-arm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-wasmsimd-x86-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-wasmsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-wasmsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-wasmsimd-arm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-wasmsimd-x86-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-wasmsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-wasmsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-wasmsimd-arm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-wasmsimd-x86-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-wasmsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-wasmsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-wasmsimd-arm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-wasmsimd-x86-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-wasmsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-wasmsimd-arm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-wasmsimd-x86-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-wasmsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-wasmsimd-arm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-wasmsimd-x86-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-wasmsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-wasmsimd-arm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-wasmsimd-x86-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-wasmsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-wasmsimd-arm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-wasmsimd-x86-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-wasmsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-wasmsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-wasmsimd-arm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-wasmsimd-x86-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-wasmsimd-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-wasmsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-wasmsimd-arm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-wasmsimd-x86-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-wasmsimd.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-wasmsimd-arm-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-wasmsimd-x86-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-wasmsimd.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-wasmsimd-x8.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-wasmsimd-x16.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-wasmsimd-x24.c",
+    "XNNPACK/src/f32-f16-vcvt/gen/f32-f16-vcvt-wasmsimd-x32.c",
+    "XNNPACK/src/f32-gavgpool-cw/f32-gavgpool-cw-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-gavgpool-cw/f32-gavgpool-cw-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-arm-c4.c",
+    "XNNPACK/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-x86-c4.c",
+    "XNNPACK/src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-arm-c4.c",
+    "XNNPACK/src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-x86-c4.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-relu-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-relu-wasmsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-wasmsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8s4-relu-wasmsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8s4-wasmsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-relu-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-relu-wasmsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-wasmsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8s4-relu-wasmsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8s4-wasmsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2c4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2c4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2c4-relu-wasmsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2c4-wasmsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-relu-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-relu-wasmsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-wasmsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8s4-relu-wasmsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8s4-wasmsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-relu-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-relu-wasmsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-wasmsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8s4-relu-wasmsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8s4-wasmsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-relu-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-relu-wasmsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8-wasmsimd-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8s4-relu-wasmsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-6x8s4-wasmsimd.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-6x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-ibilinear-chw/gen/f32-ibilinear-chw-wasmsimd-p4.c",
+    "XNNPACK/src/f32-ibilinear-chw/gen/f32-ibilinear-chw-wasmsimd-p8.c",
+    "XNNPACK/src/f32-ibilinear/gen/f32-ibilinear-wasmsimd-c4.c",
+    "XNNPACK/src/f32-ibilinear/gen/f32-ibilinear-wasmsimd-c8.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-relu-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-relu-wasmsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-wasmsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8s4-relu-wasmsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8s4-wasmsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-relu-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-relu-wasmsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-wasmsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8s4-relu-wasmsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8s4-wasmsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2c4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2c4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2c4-relu-wasmsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2c4-wasmsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-relu-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-relu-wasmsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-wasmsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8s4-relu-wasmsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8s4-wasmsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-relu-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-relu-wasmsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-wasmsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8s4-relu-wasmsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8s4-wasmsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-wasmsimd-arm-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-wasmsimd-x86-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-relu-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-relu-wasmsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-wasmsimd-loadsplat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8-wasmsimd-splat.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8s4-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8s4-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8s4-relu-wasmsimd.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-6x8s4-wasmsimd.c",
+    "XNNPACK/src/f32-maxpool/f32-maxpool-9p8x-minmax-wasmsimd-arm-c4.c",
+    "XNNPACK/src/f32-maxpool/f32-maxpool-9p8x-minmax-wasmsimd-x86-c4.c",
+    "XNNPACK/src/f32-pavgpool/f32-pavgpool-9p8x-minmax-wasmsimd-arm-c4.c",
+    "XNNPACK/src/f32-pavgpool/f32-pavgpool-9p8x-minmax-wasmsimd-x86-c4.c",
+    "XNNPACK/src/f32-pavgpool/f32-pavgpool-9x-minmax-wasmsimd-arm-c4.c",
+    "XNNPACK/src/f32-pavgpool/f32-pavgpool-9x-minmax-wasmsimd-x86-c4.c",
+    "XNNPACK/src/f32-ppmm/gen/f32-ppmm-4x8-minmax-wasmsimd-arm-splat.c",
+    "XNNPACK/src/f32-ppmm/gen/f32-ppmm-4x8-minmax-wasmsimd-x86-splat.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-1x16.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-2x16.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-iminmax-4x16.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-1x16.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-2x16.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x8.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-wasmsimd-laneselect-4x16.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-x8.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-x16.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-x24.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-cvt-x32.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-x8.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-x16.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-x24.c",
+    "XNNPACK/src/f32-qs8-vcvt/gen/f32-qs8-vcvt-wasmsimd-magic-x32.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-x8.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-x16.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-x24.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-cvt-x32.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-x8.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-x16.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-x24.c",
+    "XNNPACK/src/f32-qu8-vcvt/gen/f32-qu8-vcvt-wasmsimd-magic-x32.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-x4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-x8-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-x8.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-x12-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-x12-acc3.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-x12.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-x16-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-x16-acc4.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-x16.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-x20-acc2.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-x20-acc5.c",
+    "XNNPACK/src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-wasmsimd-rr2-p5-x20.c",
+    "XNNPACK/src/f32-rmax/f32-rmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-rmax/f32-rmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmsimd-arm-pipelined-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmsimd-arm-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmsimd-arm-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmsimd-x86-pipelined-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmsimd-x86-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmsimd-x86-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-wasmsimd-arm-pipelined-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-wasmsimd-arm-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-wasmsimd-arm-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-wasmsimd-x86-pipelined-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-wasmsimd-x86-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-wasmsimd-x86-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-wasmsimd-arm-pipelined-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-wasmsimd-arm-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-wasmsimd-arm-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-wasmsimd-x86-pipelined-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-wasmsimd-x86-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-wasmsimd-x86-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-arm-pipelined-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-arm-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-arm-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-arm.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-x86-pipelined-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-x86-pipelined.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-x86-x2.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-wasmsimd-x86.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-relu-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-relu-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-relu-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-relu-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-relu-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-relu-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-relu-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-relu-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-relu-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-relu-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-relu-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-relu-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-relu-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-relu-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-relu-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-relu-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-relu-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-relu-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-relu-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-relu-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-relu-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-relu-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-relu-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-relu-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-relu-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-relu-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-relu-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-arm-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-wasmsimd-x86-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-relu-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-relu-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-relu-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-wasmsimd-arm-x4.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-wasmsimd-arm-x8.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-wasmsimd-x86-x4.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-wasmsimd-x86-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-arm-rr2-lut16-p3-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-arm-rr2-lut16-p3-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-arm-rr2-lut16-p3-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-arm-rr2-lut16-p3-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-arm-rr2-lut16-p3-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-arm-rr2-lut16-p3-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-arm-rr2-p6-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-arm-rr2-p6-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-arm-rr2-p6-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-arm-rr2-p6-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-arm-rr2-p6-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-arm-rr2-p6-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-x86-rr2-lut16-p3-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-x86-rr2-lut16-p3-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-x86-rr2-lut16-p3-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-x86-rr2-lut16-p3-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-x86-rr2-lut16-p3-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-x86-rr2-lut16-p3-x24.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-x86-rr2-p6-x4.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-x86-rr2-p6-x8.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-x86-rr2-p6-x12.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-x86-rr2-p6-x16.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-x86-rr2-p6-x20.c",
+    "XNNPACK/src/f32-velu/gen/f32-velu-wasmsimd-x86-rr2-p6-x24.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-wasmsimd-iminmax-x4.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-wasmsimd-iminmax-x8.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-wasmsimd-laneselect-x4.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-wasmsimd-laneselect-x8.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c4-minmax-wasmsimd-arm-2x.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c4-minmax-wasmsimd-x86-2x.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c8-minmax-wasmsimd-arm-2x.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c8-minmax-wasmsimd-x86-2x.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-wasmsimd-x16.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndd-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndne-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndu-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vrnd/gen/f32-vrndz-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmsimd-rr2-lut64-p2-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmsimd-rr2-lut64-p2-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmsimd-rr2-lut64-p2-div-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmsimd-rr2-lut64-p2-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmsimd-rr2-lut64-p2-div-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmsimd-rr2-lut64-p2-div-x24.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmsimd-rr2-p5-div-x4.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmsimd-rr2-p5-div-x8.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmsimd-rr2-p5-div-x12.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmsimd-rr2-p5-div-x16.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmsimd-rr2-p5-div-x20.c",
+    "XNNPACK/src/f32-vsigmoid/gen/f32-vsigmoid-wasmsimd-rr2-p5-div-x24.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-wasmsimd-sqrt-x4.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-wasmsimd-sqrt-x8.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vabs-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vabs-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vneg-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vneg-wasmsimd-x8.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vsqr-wasmsimd-x4.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vsqr-wasmsimd-x8.c",
+    "XNNPACK/src/math/cvt-f16-f32-wasmsimd-int16.c",
+    "XNNPACK/src/math/cvt-f16-f32-wasmsimd-int32.c",
+    "XNNPACK/src/math/cvt-f32-f16-wasmsimd.c",
+    "XNNPACK/src/math/cvt-f32-qs8-wasmsimd.c",
+    "XNNPACK/src/math/cvt-f32-qu8-wasmsimd.c",
+    "XNNPACK/src/math/expm1minus-f32-wasmsimd-rr2-lut16-p3-andnot.c",
+    "XNNPACK/src/math/expm1minus-f32-wasmsimd-rr2-lut16-p3-max.c",
+    "XNNPACK/src/math/expm1minus-f32-wasmsimd-rr2-p6-andnot.c",
+    "XNNPACK/src/math/expm1minus-f32-wasmsimd-rr2-p6-max.c",
+    "XNNPACK/src/math/roundd-wasmsimd-addsub.c",
+    "XNNPACK/src/math/roundd-wasmsimd-cvt.c",
+    "XNNPACK/src/math/roundd-wasmsimd-native.c",
+    "XNNPACK/src/math/roundne-wasmsimd-addsub.c",
+    "XNNPACK/src/math/roundne-wasmsimd-native.c",
+    "XNNPACK/src/math/roundu-wasmsimd-addsub.c",
+    "XNNPACK/src/math/roundu-wasmsimd-cvt.c",
+    "XNNPACK/src/math/roundu-wasmsimd-native.c",
+    "XNNPACK/src/math/roundz-wasmsimd-addsub.c",
+    "XNNPACK/src/math/roundz-wasmsimd-cvt.c",
+    "XNNPACK/src/math/roundz-wasmsimd-native.c",
+    "XNNPACK/src/math/sigmoid-f32-wasmsimd-rr2-lut64-p2-div.c",
+    "XNNPACK/src/math/sigmoid-f32-wasmsimd-rr2-p5-div.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-wasmsimd-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-wasmsimd-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-wasmsimd-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-wasmsimd-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-wasmsimd-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-wasmsimd-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-wasmsimd-mul16-add16.c",
+    "XNNPACK/src/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-gemm/gen/qc8-gemm-4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qc8-igemm/gen/qc8-igemm-4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-wasmsimd-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-wasmsimd-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-wasmsimd-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-wasmsimd-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-wasmsimd-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-wasmsimd-mul16-add16.c",
+    "XNNPACK/src/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-wasmsimd-x8.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-wasmsimd-x16.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-wasmsimd-x24.c",
+    "XNNPACK/src/qs8-f32-vcvt/gen/qs8-f32-vcvt-wasmsimd-x32.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c",
+    "XNNPACK/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-gemm/gen/qs8-gemm-4x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qs8-igemm/gen/qs8-igemm-4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-fp32-wasmsimd.c",
+    "XNNPACK/src/qs8-requantization/qs8-requantization-gemmlowp-wasmsimd.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-wasmsimd-x8.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-wasmsimd-x16.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-wasmsimd-x24.c",
+    "XNNPACK/src/qs8-vadd/gen/qs8-vadd-minmax-wasmsimd-x32.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-wasmsimd-x8.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-wasmsimd-x16.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-wasmsimd-x24.c",
+    "XNNPACK/src/qs8-vaddc/gen/qs8-vaddc-minmax-wasmsimd-x32.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-wasmsimd-x8.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-wasmsimd-x16.c",
+    "XNNPACK/src/qs8-vcvt/gen/qs8-vcvt-wasmsimd-x32.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-wasmsimd-arm-x16.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-wasmsimd-arm-x32.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-wasmsimd-x86-x8.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-wasmsimd-x86-x16.c",
+    "XNNPACK/src/qs8-vlrelu/gen/qs8-vlrelu-wasmsimd-x86-x32.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-wasmsimd-mul32-ld64-x8.c",
+    "XNNPACK/src/qs8-vmul/gen/qs8-vmul-minmax-fp32-wasmsimd-mul32-ld64-x16.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-wasmsimd-mul32-ld64-x8.c",
+    "XNNPACK/src/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-wasmsimd-mul32-ld64-x16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-9p24c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qu8-dwconv/gen/qu8-dwconv-25p24c-minmax-fp32-wasmsimd-mul16.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-wasmsimd-x8.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-wasmsimd-x16.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-wasmsimd-x24.c",
+    "XNNPACK/src/qu8-f32-vcvt/gen/qu8-f32-vcvt-wasmsimd-x32.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c",
+    "XNNPACK/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
+    "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-fp32-wasmsimd.c",
+    "XNNPACK/src/qu8-requantization/qu8-requantization-gemmlowp-wasmsimd.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-wasmsimd-x8.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-wasmsimd-x16.c",
+    "XNNPACK/src/qu8-vadd/gen/qu8-vadd-minmax-wasmsimd-x32.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-wasmsimd-x8.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-wasmsimd-x16.c",
+    "XNNPACK/src/qu8-vaddc/gen/qu8-vaddc-minmax-wasmsimd-x32.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-wasmsimd-x8.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-wasmsimd-x16.c",
+    "XNNPACK/src/qu8-vcvt/gen/qu8-vcvt-wasmsimd-x32.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-wasmsimd-arm-x16.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-wasmsimd-arm-x32.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-wasmsimd-x86-x8.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-wasmsimd-x86-x16.c",
+    "XNNPACK/src/qu8-vlrelu/gen/qu8-vlrelu-wasmsimd-x86-x32.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-wasmsimd-mul32-ld64-x8.c",
+    "XNNPACK/src/qu8-vmul/gen/qu8-vmul-minmax-fp32-wasmsimd-mul32-ld64-x16.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-wasmsimd-mul32-ld64-x8.c",
+    "XNNPACK/src/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-wasmsimd-mul32-ld64-x16.c",
+    "XNNPACK/src/s8-ibilinear/gen/s8-ibilinear-wasmsimd-dot16x2-c8.c",
+    "XNNPACK/src/s8-ibilinear/gen/s8-ibilinear-wasmsimd-dot16x2-c16.c",
+    "XNNPACK/src/s8-ibilinear/gen/s8-ibilinear-wasmsimd-mul32-c8.c",
+    "XNNPACK/src/s8-ibilinear/gen/s8-ibilinear-wasmsimd-mul32-c16.c",
+    "XNNPACK/src/s8-maxpool/s8-maxpool-9p8x-minmax-wasmsimd-c16.c",
+    "XNNPACK/src/s8-vclamp/s8-vclamp-wasmsimd-x64.c",
+    "XNNPACK/src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c8.c",
+    "XNNPACK/src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c16.c",
+    "XNNPACK/src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-mul32-c8.c",
+    "XNNPACK/src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-mul32-c16.c",
+    "XNNPACK/src/u8-maxpool/u8-maxpool-9p8x-minmax-wasmsimd-c16.c",
+    "XNNPACK/src/u8-vclamp/u8-vclamp-wasmsimd-x64.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-wasmsimd-x16.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-wasmsimd-x32.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-wasmsimd-x48.c",
+    "XNNPACK/src/x8-lut/gen/x8-lut-wasmsimd-x64.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-16x16-reuse-mov-wasmsimd.c",
+    "XNNPACK/src/x8-transposec/gen/x8-transposec-16x16-reuse-switch-wasmsimd.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-multi-mov-wasmsimd.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-multi-switch-wasmsimd.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-reuse-mov-wasmsimd.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-reuse-multi-wasmsimd.c",
+    "XNNPACK/src/x16-transposec/gen/x16-transposec-8x8-reuse-switch-wasmsimd.c",
+    "XNNPACK/src/x32-packx/x32-packx-x4-wasmsimd.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-multi-mov-wasmsimd.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-multi-multi-wasmsimd.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-multi-switch-wasmsimd.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-reuse-mov-wasmsimd.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-reuse-multi-wasmsimd.c",
+    "XNNPACK/src/x32-transposec/gen/x32-transposec-4x4-reuse-switch-wasmsimd.c",
+    "XNNPACK/src/x32-unpool/x32-unpool-wasmsimd.c",
+    "XNNPACK/src/x32-zip/x32-zip-x2-wasmsimd.c",
+    "XNNPACK/src/x32-zip/x32-zip-x3-wasmsimd.c",
+    "XNNPACK/src/x32-zip/x32-zip-x4-wasmsimd.c",
+    "XNNPACK/src/x32-zip/x32-zip-xm-wasmsimd.c",
+    "XNNPACK/src/xx-fill/xx-fill-wasmsimd-x64.c",
+    "XNNPACK/src/xx-pad/xx-pad-wasmsimd.c",
 ]
 
-PROD_AVX2_MICROKERNEL_SRCS = [
-    "XNNPACK/src/f16-gemm/gen/1x16-minmax-avx2-broadcast.c",
-    "XNNPACK/src/f16-gemm/gen/4x16-minmax-avx2-broadcast.c",
-    "XNNPACK/src/f16-igemm/gen/1x16-minmax-avx2-broadcast.c",
-    "XNNPACK/src/f16-igemm/gen/4x16-minmax-avx2-broadcast.c",
-    "XNNPACK/src/f32-qs8-vcvt/gen/vcvt-avx2-x64.c",
-    "XNNPACK/src/f32-qu8-vcvt/gen/vcvt-avx2-x64.c",
-    "XNNPACK/src/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x56.c",
-    "XNNPACK/src/f32-vsigmoid/gen/vsigmoid-avx2-rr1-p5-div-x40.c",
-    "XNNPACK/src/qc8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c",
-    "XNNPACK/src/qc8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
-    "XNNPACK/src/qc8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
-    "XNNPACK/src/qc8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
-    "XNNPACK/src/qc8-igemm/gen/1x8c8-minmax-fp32-avx2.c",
-    "XNNPACK/src/qc8-igemm/gen/3x8c8-minmax-fp32-avx2.c",
-    "XNNPACK/src/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c",
-    "XNNPACK/src/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
-    "XNNPACK/src/qs8-f32-vcvt/gen/vcvt-avx2-x16.c",
-    "XNNPACK/src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
-    "XNNPACK/src/qs8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
-    "XNNPACK/src/qs8-igemm/gen/1x8c8-minmax-fp32-avx2.c",
-    "XNNPACK/src/qs8-igemm/gen/3x8c8-minmax-fp32-avx2.c",
-    "XNNPACK/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x16.c",
-    "XNNPACK/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c",
-    "XNNPACK/src/qu8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c",
-    "XNNPACK/src/qu8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
-    "XNNPACK/src/qu8-f32-vcvt/gen/vcvt-avx2-x16.c",
-    "XNNPACK/src/qu8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
-    "XNNPACK/src/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
-    "XNNPACK/src/qu8-igemm/gen/1x8c8-minmax-fp32-avx2.c",
-    "XNNPACK/src/qu8-igemm/gen/3x8c8-minmax-fp32-avx2.c",
-    "XNNPACK/src/qu8-vadd/gen/minmax-avx2-mul32-ld64-x16.c",
-    "XNNPACK/src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c",
-    "XNNPACK/src/x8-lut/gen/lut-avx2-x128.c",
+PROD_NEONV8_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/neonv8.c",
 ]
 
-PROD_AVX512F_MICROKERNEL_SRCS = [
-    "XNNPACK/src/f32-dwconv/gen/up16x3-minmax-avx512f.c",
-    "XNNPACK/src/f32-dwconv/gen/up16x4-minmax-avx512f.c",
-    "XNNPACK/src/f32-dwconv/gen/up16x9-minmax-avx512f.c",
-    "XNNPACK/src/f32-dwconv/gen/up16x25-minmax-avx512f.c",
-    "XNNPACK/src/f32-gemm/gen/1x16-minmax-avx512f-broadcast.c",
-    "XNNPACK/src/f32-gemm/gen/7x16-minmax-avx512f-broadcast.c",
-    "XNNPACK/src/f32-igemm/gen/1x16-minmax-avx512f-broadcast.c",
-    "XNNPACK/src/f32-igemm/gen/7x16-minmax-avx512f-broadcast.c",
-    "XNNPACK/src/f32-prelu/gen/avx512f-2x16.c",
-    "XNNPACK/src/f32-vbinary/gen/vadd-minmax-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vaddc-minmax-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vdiv-minmax-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vdivc-minmax-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vmax-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vmaxc-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vmin-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vminc-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vmul-minmax-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vmulc-minmax-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vrdivc-minmax-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vrsubc-minmax-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vsqrdiff-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vsqrdiffc-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vsub-minmax-avx512f-x32.c",
-    "XNNPACK/src/f32-vbinary/gen/vsubc-minmax-avx512f-x32.c",
-    "XNNPACK/src/f32-vclamp/gen/vclamp-avx512f-x16.c",
-    "XNNPACK/src/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x64.c",
-    "XNNPACK/src/f32-vhswish/gen/vhswish-avx512f-x16.c",
-    "XNNPACK/src/f32-vlrelu/gen/vlrelu-avx512f-x16.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndd-avx512f-x16.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndne-avx512f-x16.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndu-avx512f-x16.c",
-    "XNNPACK/src/f32-vrnd/gen/vrndz-avx512f-x16.c",
-    "XNNPACK/src/f32-vsigmoid/gen/vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x64.c",
-    "XNNPACK/src/f32-vunary/gen/vabs-avx512f-x16.c",
-    "XNNPACK/src/f32-vunary/gen/vneg-avx512f-x16.c",
-    "XNNPACK/src/f32-vunary/gen/vsqr-avx512f-x16.c",
+ALL_SSE_MICROKERNEL_SRCS = [
+    "XNNPACK/src/f32-avgpool/f32-avgpool-9p8x-minmax-sse-c4.c",
+    "XNNPACK/src/f32-avgpool/f32-avgpool-9x-minmax-sse-c4.c",
+    "XNNPACK/src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-sse-1x1.c",
+    "XNNPACK/src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-sse-2x2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-6x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-3x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-4x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-4x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-5x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-1x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-1x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-1x4-acc4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-1x4-acc5.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-1x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-2x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-2x4-acc3.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-2x4.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-3x4-acc2.c",
+    "XNNPACK/src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-3x4.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-sse-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p4c-minmax-sse.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-sse-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-3p8c-minmax-sse.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-sse-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p4c-minmax-sse.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-sse-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-4p8c-minmax-sse.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-5f5m5l8c4s4r-minmax-sse-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-5f5m5l8c4s4r-minmax-sse.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-7f6m6l8c4s4r-minmax-sse-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-7f6m6l8c4s4r-minmax-sse.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-sse-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p4c-minmax-sse.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-sse-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-9p8c-minmax-sse.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-sse-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p4c-minmax-sse.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-sse-acc2.c",
+    "XNNPACK/src/f32-dwconv/gen/f32-dwconv-25p8c-minmax-sse.c",
+    "XNNPACK/src/f32-gavgpool-cw/f32-gavgpool-cw-sse-x4.c",
+    "XNNPACK/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-sse-c4.c",
+    "XNNPACK/src/f32-gavgpool/f32-gavgpool-7x-minmax-sse-c4.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-sse-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8-minmax-sse-load1.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-1x8s4-minmax-sse.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-minmax-sse-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8-minmax-sse-load1.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-3x8s4-minmax-sse.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x2c4-minmax-sse.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-sse-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8-minmax-sse-load1.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-4x8s4-minmax-sse.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-sse-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8-minmax-sse-load1.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemm-5x8s4-minmax-sse.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-sse-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8-minmax-sse-load1.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-1x8s4-minmax-sse.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8-minmax-sse-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8-minmax-sse-load1.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-3x8s4-minmax-sse.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-sse-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8-minmax-sse-load1.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-4x8s4-minmax-sse.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-sse-dup.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8-minmax-sse-load1.c",
+    "XNNPACK/src/f32-gemm/gen/f32-gemminc-5x8s4-minmax-sse.c",
+    "XNNPACK/src/f32-ibilinear-chw/gen/f32-ibilinear-chw-sse-p4.c",
+    "XNNPACK/src/f32-ibilinear-chw/gen/f32-ibilinear-chw-sse-p8.c",
+    "XNNPACK/src/f32-ibilinear/gen/f32-ibilinear-sse-c4.c",
+    "XNNPACK/src/f32-ibilinear/gen/f32-ibilinear-sse-c8.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-sse-dup.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8-minmax-sse-load1.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-1x8s4-minmax-sse.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-minmax-sse-dup.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8-minmax-sse-load1.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-3x8s4-minmax-sse.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x2c4-minmax-sse.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-sse-dup.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8-minmax-sse-load1.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-4x8s4-minmax-sse.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-sse-dup.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8-minmax-sse-load1.c",
+    "XNNPACK/src/f32-igemm/gen/f32-igemm-5x8s4-minmax-sse.c",
+    "XNNPACK/src/f32-maxpool/f32-maxpool-9p8x-minmax-sse-c4.c",
+    "XNNPACK/src/f32-pavgpool/f32-pavgpool-9p8x-minmax-sse-c4.c",
+    "XNNPACK/src/f32-pavgpool/f32-pavgpool-9x-minmax-sse-c4.c",
+    "XNNPACK/src/f32-ppmm/gen/f32-ppmm-4x8-minmax-sse.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-sse-2x4.c",
+    "XNNPACK/src/f32-prelu/gen/f32-prelu-sse-2x8.c",
+    "XNNPACK/src/f32-rmax/f32-rmax-sse.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-4x1-minmax-sse.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-8x1-minmax-sse.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-16x1-minmax-sse.c",
+    "XNNPACK/src/f32-spmm/gen/f32-spmm-32x1-minmax-sse.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vadd-minmax-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vaddc-minmax-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdiv-minmax-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vdivc-minmax-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmax-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmaxc-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmin-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vminc-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmul-minmax-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vmulc-minmax-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrdivc-minmax-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vrsubc-minmax-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiff-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsqrdiffc-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsub-minmax-sse-x8.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-sse-x4.c",
+    "XNNPACK/src/f32-vbinary/gen/f32-vsubc-minmax-sse-x8.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-sse-x4.c",
+    "XNNPACK/src/f32-vclamp/gen/f32-vclamp-sse-x8.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-sse-x4.c",
+    "XNNPACK/src/f32-vhswish/gen/f32-vhswish-sse-x8.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-sse-x4.c",
+    "XNNPACK/src/f32-vlrelu/gen/f32-vlrelu-sse-x8.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c4-minmax-sse-2x.c",
+    "XNNPACK/src/f32-vmulcaddc/gen/f32-vmulcaddc-c8-minmax-sse-2x.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-sse-x4.c",
+    "XNNPACK/src/f32-vrelu/gen/f32-vrelu-sse-x8.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-sse-sqrt-x4.c",
+    "XNNPACK/src/f32-vsqrt/gen/f32-vsqrt-sse-sqrt-x8.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vabs-sse-x4.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vabs-sse-x8.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vneg-sse-x4.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vneg-sse-x8.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vsqr-sse-x4.c",
+    "XNNPACK/src/f32-vunary/gen/f32-vsqr-sse-x8.c",
+    "XNNPACK/src/math/roundd-sse-addsub.c",
+    "XNNPACK/src/math/roundne-sse-addsub.c",
+    "XNNPACK/src/math/roundu-sse-addsub.c",
+    "XNNPACK/src/math/roundz-sse-addsub.c",
+    "XNNPACK/src/math/sqrt-f32-sse-hh1mac.c",
+    "XNNPACK/src/math/sqrt-f32-sse-nr1mac.c",
+    "XNNPACK/src/math/sqrt-f32-sse-nr2mac.c",
+    "XNNPACK/src/x32-packx/x32-packx-x4-sse.c",
+    "XNNPACK/src/x32-transposec/x32-transposec-4x4-sse.c",
 ]
 
-PROD_AVX512SKX_MICROKERNEL_SRCS = [
-    "XNNPACK/src/f16-f32-vcvt/gen/vcvt-avx512skx-x16.c",
-    "XNNPACK/src/f32-f16-vcvt/gen/vcvt-avx512skx-x16.c",
-    "XNNPACK/src/f32-qs8-vcvt/gen/vcvt-avx512skx-x128.c",
-    "XNNPACK/src/f32-qu8-vcvt/gen/vcvt-avx512skx-x128.c",
-    "XNNPACK/src/qc8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c",
-    "XNNPACK/src/qc8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c",
-    "XNNPACK/src/qc8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c",
-    "XNNPACK/src/qc8-gemm/gen/4x16c8-minmax-fp32-avx512skx.c",
-    "XNNPACK/src/qc8-igemm/gen/1x16c8-minmax-fp32-avx512skx.c",
-    "XNNPACK/src/qc8-igemm/gen/4x16c8-minmax-fp32-avx512skx.c",
-    "XNNPACK/src/qs8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c",
-    "XNNPACK/src/qs8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c",
-    "XNNPACK/src/qs8-f32-vcvt/gen/vcvt-avx512skx-x32.c",
-    "XNNPACK/src/qs8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c",
-    "XNNPACK/src/qs8-gemm/gen/4x16c8-minmax-fp32-avx512skx.c",
-    "XNNPACK/src/qs8-igemm/gen/1x16c8-minmax-fp32-avx512skx.c",
-    "XNNPACK/src/qs8-igemm/gen/4x16c8-minmax-fp32-avx512skx.c",
-    "XNNPACK/src/qs8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c",
-    "XNNPACK/src/qs8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c",
-    "XNNPACK/src/qu8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c",
-    "XNNPACK/src/qu8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c",
-    "XNNPACK/src/qu8-f32-vcvt/gen/vcvt-avx512skx-x32.c",
-    "XNNPACK/src/qu8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c",
-    "XNNPACK/src/qu8-gemm/gen/4x16c8-minmax-fp32-avx512skx.c",
-    "XNNPACK/src/qu8-igemm/gen/1x16c8-minmax-fp32-avx512skx.c",
-    "XNNPACK/src/qu8-igemm/gen/4x16c8-minmax-fp32-avx512skx.c",
-    "XNNPACK/src/qu8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c",
-    "XNNPACK/src/qu8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c",
-    "XNNPACK/src/x8-lut/gen/lut-avx512skx-vpshufb-x64.c",
+PROD_SSE_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/sse.c",
+]
+
+PROD_NEONFP16_MICROKERNEL_SRCS = [
+    "XNNPACK/src/amalgam/neonfp16.c",
 ]
diff --git a/third_party/xnnpack_wrapper_defs.bzl b/third_party/xnnpack_wrapper_defs.bzl
index 26556a7fbfa2..9ecc08885d57 100644
--- a/third_party/xnnpack_wrapper_defs.bzl
+++ b/third_party/xnnpack_wrapper_defs.bzl
@@ -2,1130 +2,5914 @@
 Auto-generated by generate-wrappers.py script. Do not modify
 """
 
-AARCH32_ASM_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-ld64.S",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/4x4-aarch32-vfp-ld64.S",
-    "xnnpack_wrappers/f32-gemm/4x4-minmax-aarch32-vfp-ld64.S",
-    "xnnpack_wrappers/f32-gemm/4x8-minmax-aarch32-neon-cortex-a55.S",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a53.S",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-ld64.S",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a53.S",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-ld64.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x8c4-minmax-fp32-aarch32-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x8c4-minmax-fp32-aarch32-neondot-ld64.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S",
+PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS = [
 ]
 
-PROD_NEONDOT_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/qc8-gemm/gen/1x8c4-minmax-fp32-neondot.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x16c4-minmax-fp32-neondot.c",
-    "xnnpack_wrappers/qc8-gemm/gen/4x8c4-minmax-fp32-neondot.c",
-    "xnnpack_wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-neondot.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x8c4-minmax-fp32-neondot.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x16c4-minmax-fp32-neondot.c",
-    "xnnpack_wrappers/qc8-igemm/gen/4x8c4-minmax-fp32-neondot.c",
-    "xnnpack_wrappers/qc8-igemm/gen/4x16c4-minmax-fp32-neondot.c",
-    "xnnpack_wrappers/qs8-gemm/gen/1x8c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qs8-gemm/gen/1x16c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qs8-gemm/gen/4x8c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qs8-igemm/gen/1x8c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qs8-igemm/gen/1x16c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qs8-igemm/gen/4x8c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qu8-gemm/gen/1x8c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qu8-gemm/gen/1x16c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qu8-gemm/gen/4x8c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qu8-gemm/gen/4x16c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qu8-igemm/gen/1x8c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qu8-igemm/gen/1x16c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qu8-igemm/gen/4x8c4-minmax-rndnu-neondot.c",
-    "xnnpack_wrappers/qu8-igemm/gen/4x16c4-minmax-rndnu-neondot.c",
+PROD_SCALAR_AARCH32_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/scalar-aarch32.c",
+]
+
+PROD_NEON_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/neon.c",
+]
+
+PROD_NEONFP16_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/neonfp16.c",
+]
+
+PROD_NEON_AARCH64_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/neon-aarch64.c",
 ]
 
 PROD_NEONFMA_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f32-dwconv/gen/up8x3-minmax-neonfma.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up8x4-minmax-neonfma.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up8x9-minmax-neonfma.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up8x25-minmax-neonfma-acc2.c",
-    "xnnpack_wrappers/f32-gemm/gen/1x8s4-minmax-neonfma.c",
-    "xnnpack_wrappers/f32-gemm/gen/6x8s4-minmax-neonfma.c",
-    "xnnpack_wrappers/f32-ibilinear-chw/gen/neonfma-p8.c",
-    "xnnpack_wrappers/f32-ibilinear/gen/neonfma-c8.c",
-    "xnnpack_wrappers/f32-igemm/gen/1x8s4-minmax-neonfma.c",
-    "xnnpack_wrappers/f32-igemm/gen/6x8s4-minmax-neonfma.c",
-    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/neonfma-rr1-lut64-p2-x16.c",
-    "xnnpack_wrappers/f32-spmm/gen/32x1-minmax-neonfma-pipelined.c",
-    "xnnpack_wrappers/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x16.c",
-    "xnnpack_wrappers/f32-velu/gen/velu-neonfma-rr1-p6-x8.c",
-    "xnnpack_wrappers/f32-vmulcaddc/gen/c4-minmax-neonfma-2x.c",
-    "xnnpack_wrappers/f32-vsigmoid/gen/vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x16.c",
+    "xnnpack_wrappers/amalgam/neonfma.c",
+]
+
+PROD_AARCH64_NEON_MICROKERNEL_SRCS = [
+]
+
+PROD_NEONV8_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/neonv8.c",
+]
+
+PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS = [
+]
+
+PROD_NEONDOT_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/neondot.c",
+]
+
+PROD_SSE_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/sse.c",
+]
+
+PROD_SSE2_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/sse2.c",
 ]
 
 PROD_SSSE3_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-ssse3-2x4-acc2.c",
+    "xnnpack_wrappers/amalgam/ssse3.c",
 ]
 
-PROD_SCALAR_AARCH32_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f16-f32-vcvt/gen/vcvt-scalar-x4.c",
-    "xnnpack_wrappers/f32-argmaxpool/4x-scalar-c1.c",
-    "xnnpack_wrappers/f32-argmaxpool/9p8x-scalar-c1.c",
-    "xnnpack_wrappers/f32-argmaxpool/9x-scalar-c1.c",
-    "xnnpack_wrappers/f32-avgpool/9p8x-minmax-scalar-c1.c",
-    "xnnpack_wrappers/f32-avgpool/9x-minmax-scalar-c1.c",
-    "xnnpack_wrappers/f32-conv-hwc/3x3s2p0p1c3x4-scalar-1x1.c",
-    "xnnpack_wrappers/f32-conv-hwc/3x3s2p1c3x4-scalar-1x1.c",
-    "xnnpack_wrappers/f32-conv-hwc2chw/3x3s2p1c3x4-scalar-1x1.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up1x3-minmax-scalar-acc2.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up1x3-scalar-acc2.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up1x4-minmax-scalar-acc2.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up1x4-scalar-acc2.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up1x9-minmax-scalar-acc2.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up1x9-scalar-acc2.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up1x25-minmax-scalar-acc2.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up1x25-scalar-acc2.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-scalar-4x1.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/3x3s2p1-minmax-scalar-2x1-acc2.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/5x5p2-minmax-scalar-2x1-acc2.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/5x5s2p2-minmax-scalar-2x1-acc2.c",
-    "xnnpack_wrappers/f32-f16-vcvt/gen/vcvt-scalar-fabsf-x2.c",
-    "xnnpack_wrappers/f32-gavgpool-cw/scalar-x1.c",
-    "xnnpack_wrappers/f32-gavgpool/7p7x-minmax-scalar-c1.c",
-    "xnnpack_wrappers/f32-gavgpool/7x-minmax-scalar-c1.c",
-    "xnnpack_wrappers/f32-gemm/gen/1x4-minmax-scalar.c",
-    "xnnpack_wrappers/f32-gemm/gen/1x4-relu-scalar.c",
-    "xnnpack_wrappers/f32-gemm/gen/1x4-scalar.c",
-    "xnnpack_wrappers/f32-gemm/gen/4x2-minmax-scalar.c",
-    "xnnpack_wrappers/f32-gemm/gen/4x2-scalar.c",
-    "xnnpack_wrappers/f32-gemm/gen/4x4-minmax-scalar.c",
-    "xnnpack_wrappers/f32-gemm/gen/4x4-relu-scalar.c",
-    "xnnpack_wrappers/f32-gemm/gen/4x4-scalar.c",
-    "xnnpack_wrappers/f32-ibilinear-chw/gen/scalar-p4.c",
-    "xnnpack_wrappers/f32-ibilinear/gen/scalar-c2.c",
-    "xnnpack_wrappers/f32-igemm/gen/1x4-minmax-scalar.c",
-    "xnnpack_wrappers/f32-igemm/gen/1x4-relu-scalar.c",
-    "xnnpack_wrappers/f32-igemm/gen/1x4-scalar.c",
-    "xnnpack_wrappers/f32-igemm/gen/4x2-minmax-scalar.c",
-    "xnnpack_wrappers/f32-igemm/gen/4x2-scalar.c",
-    "xnnpack_wrappers/f32-igemm/gen/4x4-minmax-scalar.c",
-    "xnnpack_wrappers/f32-igemm/gen/4x4-relu-scalar.c",
-    "xnnpack_wrappers/f32-igemm/gen/4x4-scalar.c",
-    "xnnpack_wrappers/f32-maxpool/9p8x-minmax-scalar-c1.c",
-    "xnnpack_wrappers/f32-pavgpool/9p8x-minmax-scalar-c1.c",
-    "xnnpack_wrappers/f32-pavgpool/9x-minmax-scalar-c1.c",
-    "xnnpack_wrappers/f32-prelu/gen/scalar-2x4.c",
-    "xnnpack_wrappers/f32-qs8-vcvt/gen/vcvt-scalar-imagic-x4.c",
-    "xnnpack_wrappers/f32-qu8-vcvt/gen/vcvt-scalar-imagic-x4.c",
-    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/scalar-rr2-p5-x4-acc2.c",
-    "xnnpack_wrappers/f32-rmax/scalar.c",
-    "xnnpack_wrappers/f32-spmm/gen/8x1-minmax-scalar.c",
-    "xnnpack_wrappers/f32-spmm/gen/8x2-minmax-scalar.c",
-    "xnnpack_wrappers/f32-spmm/gen/8x4-minmax-scalar.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vadd-minmax-scalar-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vaddc-minmax-scalar-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vdiv-minmax-scalar-x2.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vdivc-minmax-scalar-x2.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmax-scalar-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmaxc-scalar-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmin-scalar-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vminc-scalar-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmul-minmax-scalar-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmulc-minmax-scalar-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vrdivc-minmax-scalar-x2.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vrsubc-minmax-scalar-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsqrdiff-scalar-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsqrdiffc-scalar-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsub-minmax-scalar-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsubc-minmax-scalar-x8.c",
-    "xnnpack_wrappers/f32-vclamp/gen/vclamp-scalar-x4.c",
-    "xnnpack_wrappers/f32-velu/gen/velu-scalar-rr2-lut16-p3-x4.c",
-    "xnnpack_wrappers/f32-vhswish/gen/vhswish-scalar-x4.c",
-    "xnnpack_wrappers/f32-vlrelu/gen/vlrelu-scalar-x4.c",
-    "xnnpack_wrappers/f32-vmulcaddc/gen/c1-minmax-scalar-2x.c",
-    "xnnpack_wrappers/f32-vrelu/gen/vrelu-scalar-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndd-scalar-libm-x1.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndne-scalar-libm-x1.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndu-scalar-libm-x1.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndz-scalar-libm-x1.c",
-    "xnnpack_wrappers/f32-vsigmoid/gen/vsigmoid-scalar-rr2-lut64-p2-div-x2.c",
-    "xnnpack_wrappers/f32-vsqrt/gen/scalar-sqrt-x1.c",
-    "xnnpack_wrappers/f32-vunary/gen/vabs-scalar-x4.c",
-    "xnnpack_wrappers/f32-vunary/gen/vneg-scalar-x4.c",
-    "xnnpack_wrappers/f32-vunary/gen/vsqr-scalar-x4.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up2x9-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up2x25-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x2-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x8-minmax-fp32-neon-mlal-lane.c",
-    "xnnpack_wrappers/qc8-gemm/gen/2x2-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x2-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x8-minmax-fp32-neon-mlal-lane.c",
-    "xnnpack_wrappers/qc8-igemm/gen/2x2-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up1x9-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up1x25-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qs8-f32-vcvt/gen/vcvt-scalar-x4.c",
-    "xnnpack_wrappers/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c",
-    "xnnpack_wrappers/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c1.c",
-    "xnnpack_wrappers/qs8-gemm/gen/1x2-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qs8-gemm/gen/2x2-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qs8-igemm/gen/1x2-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qs8-igemm/gen/2x2-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qs8-vadd/gen/minmax-scalar-x1.c",
-    "xnnpack_wrappers/qs8-vaddc/gen/minmax-scalar-x1.c",
-    "xnnpack_wrappers/qs8-vmul/gen/minmax-fp32-scalar-x4.c",
-    "xnnpack_wrappers/qs8-vmulc/gen/minmax-fp32-scalar-x4.c",
-    "xnnpack_wrappers/qu8-avgpool/9p8x-minmax-scalar-c1.c",
-    "xnnpack_wrappers/qu8-avgpool/9x-minmax-scalar-c1.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up1x9-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up1x25-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qu8-f32-vcvt/gen/vcvt-scalar-x4.c",
-    "xnnpack_wrappers/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c",
-    "xnnpack_wrappers/qu8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c1.c",
-    "xnnpack_wrappers/qu8-gemm/gen/1x2-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qu8-gemm/gen/2x2-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qu8-igemm/gen/1x2-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qu8-igemm/gen/2x2-minmax-fp32-scalar-fmagic.c",
-    "xnnpack_wrappers/qu8-vadd/gen/minmax-scalar-x1.c",
-    "xnnpack_wrappers/qu8-vaddc/gen/minmax-scalar-x1.c",
-    "xnnpack_wrappers/qu8-vmul/gen/minmax-fp32-scalar-x4.c",
-    "xnnpack_wrappers/qu8-vmulc/gen/minmax-fp32-scalar-x4.c",
-    "xnnpack_wrappers/s8-ibilinear/gen/scalar-c1.c",
-    "xnnpack_wrappers/s8-maxpool/9p8x-minmax-scalar-c1.c",
-    "xnnpack_wrappers/s8-vclamp/scalar-x4.c",
-    "xnnpack_wrappers/u8-ibilinear/gen/scalar-c1.c",
-    "xnnpack_wrappers/u8-maxpool/9p8x-minmax-scalar-c1.c",
-    "xnnpack_wrappers/u8-rmax/scalar.c",
-    "xnnpack_wrappers/u8-vclamp/scalar-x4.c",
-    "xnnpack_wrappers/xx-fill/scalar-x16.c",
-    "xnnpack_wrappers/xx-pad/scalar.c",
-    "xnnpack_wrappers/x8-zip/xm-scalar.c",
-    "xnnpack_wrappers/x8-zip/x2-scalar.c",
-    "xnnpack_wrappers/x8-zip/x3-scalar.c",
-    "xnnpack_wrappers/x8-zip/x4-scalar.c",
-    "xnnpack_wrappers/x32-packx/x2-scalar.c",
-    "xnnpack_wrappers/x32-packx/x3-scalar.c",
-    "xnnpack_wrappers/x32-packx/x4-scalar.c",
-    "xnnpack_wrappers/x32-unpool/scalar.c",
-    "xnnpack_wrappers/x32-zip/xm-scalar.c",
-    "xnnpack_wrappers/x32-zip/x2-scalar.c",
-    "xnnpack_wrappers/x32-zip/x3-scalar.c",
-    "xnnpack_wrappers/x32-zip/x4-scalar.c",
+PROD_SSE41_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/sse41.c",
+]
+
+PROD_AVX_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/avx.c",
+]
+
+PROD_F16C_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/f16c.c",
 ]
 
 PROD_XOP_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul16-add16.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-xop-mul16-add16.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
-    "xnnpack_wrappers/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
-    "xnnpack_wrappers/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up16x9-minmax-fp32-xop-mul16-add16.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up16x25-minmax-fp32-xop-mul16-add16.c",
-    "xnnpack_wrappers/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
-    "xnnpack_wrappers/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
-    "xnnpack_wrappers/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
-    "xnnpack_wrappers/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
-    "xnnpack_wrappers/qs8-vadd/gen/minmax-xop-mul32-ld32-x8.c",
-    "xnnpack_wrappers/qs8-vaddc/gen/minmax-xop-mul32-ld32-x8.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c",
-    "xnnpack_wrappers/qu8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
-    "xnnpack_wrappers/qu8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
-    "xnnpack_wrappers/qu8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
-    "xnnpack_wrappers/qu8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
-    "xnnpack_wrappers/qu8-vadd/gen/minmax-xop-mul32-ld32-x8.c",
-    "xnnpack_wrappers/qu8-vaddc/gen/minmax-xop-mul32-ld32-x8.c",
+    "xnnpack_wrappers/amalgam/xop.c",
 ]
 
 PROD_FMA3_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f16-dwconv/gen/up8x25-minmax-fma3-acc2.c",
-    "xnnpack_wrappers/f16-dwconv/gen/up16x3-minmax-fma3.c",
-    "xnnpack_wrappers/f16-dwconv/gen/up16x4-minmax-fma3.c",
-    "xnnpack_wrappers/f16-dwconv/gen/up16x9-minmax-fma3.c",
-    "xnnpack_wrappers/f16-ibilinear/gen/fma3-c8.c",
-    "xnnpack_wrappers/f16-vmulcaddc/gen/c8-minmax-fma3-2x.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up8x25-minmax-fma3.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up16x3-minmax-fma3.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up16x4-minmax-fma3.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up16x9-minmax-fma3.c",
-    "xnnpack_wrappers/f32-gemm/gen/1x16-minmax-fma3-broadcast.c",
-    "xnnpack_wrappers/f32-gemm/gen/1x16s4-minmax-fma3-broadcast.c",
-    "xnnpack_wrappers/f32-gemm/gen/4x16s4-minmax-fma3-broadcast.c",
-    "xnnpack_wrappers/f32-gemm/gen/5x16-minmax-fma3-broadcast.c",
-    "xnnpack_wrappers/f32-igemm/gen/1x16-minmax-fma3-broadcast.c",
-    "xnnpack_wrappers/f32-igemm/gen/1x16s4-minmax-fma3-broadcast.c",
-    "xnnpack_wrappers/f32-igemm/gen/4x16s4-minmax-fma3-broadcast.c",
-    "xnnpack_wrappers/f32-igemm/gen/5x16-minmax-fma3-broadcast.c",
-    "xnnpack_wrappers/f32-vhswish/gen/vhswish-fma3-x16.c",
+    "xnnpack_wrappers/amalgam/fma3.c",
 ]
 
-PROD_AARCH64_NEON_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f32-conv-hwc2chw/3x3s2p1c3x4-neonfma-2x2.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-neonfma-3x4.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/3x3s2p1-minmax-neonfma-2x4-acc2.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/5x5p2-minmax-neonfma-4x4.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/5x5s2p2-minmax-neonfma-1x4-acc2.c",
-    "xnnpack_wrappers/f32-gemm/gen/1x8-minmax-neonfma-lane-ld64.c",
-    "xnnpack_wrappers/f32-gemm/gen/4x2-minmax-neonfma-lane-ld64.c",
-    "xnnpack_wrappers/f32-gemm/gen/6x8-minmax-neonfma-lane-ld64.c",
-    "xnnpack_wrappers/f32-igemm/gen/1x8-minmax-neonfma-lane-ld64.c",
-    "xnnpack_wrappers/f32-igemm/gen/4x2-minmax-neonfma-lane-ld64.c",
-    "xnnpack_wrappers/f32-igemm/gen/6x8-minmax-neonfma-lane-ld64.c",
-    "xnnpack_wrappers/f32-spmm/gen/32x2-minmax-neonfma.c",
-    "xnnpack_wrappers/f32-spmm/gen/32x4-minmax-neonfma.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vdiv-minmax-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vdivc-minmax-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vrdivc-minmax-neon-x8.c",
-    "xnnpack_wrappers/f32-vsqrt/gen/neon-sqrt-x4.c",
-    "xnnpack_wrappers/x8-lut/gen/lut-neon-tbx128x4-x64.c",
+PROD_AVX2_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/avx2.c",
 ]
 
-PROD_NEONFP16_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f16-f32-vcvt/gen/vcvt-neonfp16-x16.c",
-    "xnnpack_wrappers/f32-f16-vcvt/gen/vcvt-neonfp16-x16.c",
+PROD_AVX512F_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/avx512f.c",
 ]
 
-PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/params-init.c",
-    "xnnpack_wrappers/u8-lut32norm/scalar.c",
-    "xnnpack_wrappers/xx-copy/memcpy.c",
-    "xnnpack_wrappers/x8-lut/gen/lut-scalar-x4.c",
-    "xnnpack_wrappers/x32-depthtospace2d-chw2hwc/scalar.c",
+PROD_AVX512SKX_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/avx512skx.c",
 ]
 
-PROD_AVX_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f16-f32-vcvt/gen/vcvt-avx-int16-x16.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up8x25-minmax-avx.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up16x3-minmax-avx.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up16x4-minmax-avx.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up16x9-minmax-avx.c",
-    "xnnpack_wrappers/f32-f16-vcvt/gen/vcvt-avx-x24.c",
-    "xnnpack_wrappers/f32-gemm/gen/1x16-minmax-avx-broadcast.c",
-    "xnnpack_wrappers/f32-gemm/gen/5x16-minmax-avx-broadcast.c",
-    "xnnpack_wrappers/f32-igemm/gen/1x16-minmax-avx-broadcast.c",
-    "xnnpack_wrappers/f32-igemm/gen/5x16-minmax-avx-broadcast.c",
-    "xnnpack_wrappers/f32-prelu/gen/avx-2x16.c",
-    "xnnpack_wrappers/f32-qs8-vcvt/gen/vcvt-avx-x32.c",
-    "xnnpack_wrappers/f32-qu8-vcvt/gen/vcvt-avx-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vadd-minmax-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vaddc-minmax-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vdiv-minmax-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vdivc-minmax-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmax-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmaxc-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmin-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vminc-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmul-minmax-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmulc-minmax-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vrdivc-minmax-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vrsubc-minmax-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsqrdiff-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsqrdiffc-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsub-minmax-avx-x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsubc-minmax-avx-x16.c",
-    "xnnpack_wrappers/f32-vclamp/gen/vclamp-avx-x16.c",
-    "xnnpack_wrappers/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x32.c",
-    "xnnpack_wrappers/f32-vhswish/gen/vhswish-avx-x16.c",
-    "xnnpack_wrappers/f32-vlrelu/gen/vlrelu-avx-x16.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndd-avx-x16.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndne-avx-x16.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndu-avx-x16.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndz-avx-x16.c",
-    "xnnpack_wrappers/f32-vsigmoid/gen/vsigmoid-avx-rr2-p5-nr2-x40.c",
-    "xnnpack_wrappers/f32-vsqrt/gen/avx-sqrt-x8.c",
-    "xnnpack_wrappers/f32-vunary/gen/vabs-avx-x16.c",
-    "xnnpack_wrappers/f32-vunary/gen/vneg-avx-x16.c",
-    "xnnpack_wrappers/f32-vunary/gen/vsqr-avx-x16.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul16-add16.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul16-add16.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
-    "xnnpack_wrappers/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
-    "xnnpack_wrappers/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up16x9-minmax-fp32-avx-mul16-add16.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul16-add16.c",
-    "xnnpack_wrappers/qs8-f32-vcvt/gen/vcvt-avx-x32.c",
-    "xnnpack_wrappers/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
-    "xnnpack_wrappers/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
-    "xnnpack_wrappers/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
-    "xnnpack_wrappers/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
-    "xnnpack_wrappers/qs8-vadd/gen/minmax-avx-mul32-ld32-x8.c",
-    "xnnpack_wrappers/qs8-vaddc/gen/minmax-avx-mul32-ld32-x8.c",
-    "xnnpack_wrappers/qs8-vmul/gen/minmax-fp32-avx-mul16-ld64-x16.c",
-    "xnnpack_wrappers/qs8-vmulc/gen/minmax-fp32-avx-mul16-ld64-x16.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up16x9-minmax-fp32-avx-mul16.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up16x25-minmax-fp32-avx-mul16.c",
-    "xnnpack_wrappers/qu8-f32-vcvt/gen/vcvt-avx-x32.c",
-    "xnnpack_wrappers/qu8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
-    "xnnpack_wrappers/qu8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
-    "xnnpack_wrappers/qu8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
-    "xnnpack_wrappers/qu8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
-    "xnnpack_wrappers/qu8-vadd/gen/minmax-avx-mul32-ld32-x8.c",
-    "xnnpack_wrappers/qu8-vaddc/gen/minmax-avx-mul32-ld32-x8.c",
-    "xnnpack_wrappers/qu8-vmul/gen/minmax-fp32-avx-mul16-ld64-x16.c",
-    "xnnpack_wrappers/qu8-vmulc/gen/minmax-fp32-avx-mul16-ld64-x16.c",
-    "xnnpack_wrappers/x8-lut/gen/lut-avx-x64.c",
+PROD_AVX512VBMI_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/avx512vbmi.c",
 ]
 
-PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c",
-    "xnnpack_wrappers/f16-dwconv/gen/up16x3-minmax-neonfp16arith.c",
-    "xnnpack_wrappers/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c",
-    "xnnpack_wrappers/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c",
-    "xnnpack_wrappers/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c",
-    "xnnpack_wrappers/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c",
-    "xnnpack_wrappers/f16-gemm/gen/1x16-minmax-neonfp16arith-ld64.c",
-    "xnnpack_wrappers/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c",
-    "xnnpack_wrappers/f16-ibilinear/gen/neonfp16arith-c8.c",
-    "xnnpack_wrappers/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c",
-    "xnnpack_wrappers/f16-igemm/gen/6x16-minmax-neonfp16arith-ld64.c",
-    "xnnpack_wrappers/f16-maxpool/9p8x-minmax-neonfp16arith-c8.c",
-    "xnnpack_wrappers/f16-prelu/gen/neonfp16arith-2x16.c",
-    "xnnpack_wrappers/f16-vbinary/gen/vadd-minmax-neonfp16arith-x16.c",
-    "xnnpack_wrappers/f16-vbinary/gen/vaddc-minmax-neonfp16arith-x16.c",
-    "xnnpack_wrappers/f16-vbinary/gen/vmul-minmax-neonfp16arith-x16.c",
-    "xnnpack_wrappers/f16-vbinary/gen/vmulc-minmax-neonfp16arith-x16.c",
-    "xnnpack_wrappers/f16-vclamp/gen/vclamp-neonfp16arith-x16.c",
-    "xnnpack_wrappers/f16-vhswish/gen/vhswish-neonfp16arith-x16.c",
-    "xnnpack_wrappers/f16-vlrelu/gen/vlrelu-neonfp16arith-x16.c",
-    "xnnpack_wrappers/f16-vmulcaddc/gen/c8-minmax-neonfp16arith-2x.c",
+AARCH32_ASM_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x1.S",
+    "xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x2.S",
+    "xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-asm-aarch32-neon-x4.S",
+    "xnnpack_wrappers/cs16-fftr/cs16-fftr-asm-aarch32-neon-x1.S",
+    "xnnpack_wrappers/cs16-fftr/cs16-fftr-asm-aarch32-neon-x4.S",
+    "xnnpack_wrappers/f32-gemm/f32-gemm-4x4-asm-aarch32-vfp-ld64.S",
+    "xnnpack_wrappers/f32-gemm/f32-gemm-4x4-minmax-asm-aarch32-vfp-ld64.S",
+    "xnnpack_wrappers/f32-gemm/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-ld64.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-prfm-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch32-neon-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-igemm/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a55.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a7.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a53.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-cortex-a75.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-ld64.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-prfm-cortex-a53.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch32-neon-prfm-cortex-a75.S",
+    "xnnpack_wrappers/qc8-dwconv/qc8-dwconv-3p8c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S",
+    "xnnpack_wrappers/qc8-dwconv/qc8-dwconv-3p16c-minmax-fp32-asm-aarch32-neonv8-mla8-cortex-a35.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-cortex-a35.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-cortex-a35.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8c4-minmax-fp32-asm-aarch32-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8c4-minmax-fp32-asm-aarch32-neondot-ld64.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-cortex-a35.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a35.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-cortex-a35.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-asm-aarch32-neonv8-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8c4-minmax-fp32-asm-aarch32-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8c4-minmax-fp32-asm-aarch32-neondot-ld64.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-ld64.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-asm-aarch32-neondot-ld64.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a7.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a7.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-asm-aarch32-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-arm-x1.S",
+    "xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x1.S",
+    "xnnpack_wrappers/u32-filterbank-accumulate/u32-filterbank-accumulate-asm-aarch32-neon-x2.S",
 ]
 
-PROD_F16C_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f16-f32-vcvt/gen/vcvt-f16c-x16.c",
-    "xnnpack_wrappers/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c",
-    "xnnpack_wrappers/f16-gavgpool/gen/7x-minmax-f16c-c8.c",
-    "xnnpack_wrappers/f16-maxpool/9p8x-minmax-f16c-c8.c",
-    "xnnpack_wrappers/f16-prelu/gen/f16c-2x16.c",
-    "xnnpack_wrappers/f16-vbinary/gen/vadd-minmax-f16c-x16.c",
-    "xnnpack_wrappers/f16-vbinary/gen/vaddc-minmax-f16c-x16.c",
-    "xnnpack_wrappers/f16-vbinary/gen/vmul-minmax-f16c-x16.c",
-    "xnnpack_wrappers/f16-vbinary/gen/vmulc-minmax-f16c-x16.c",
-    "xnnpack_wrappers/f16-vclamp/gen/vclamp-f16c-x16.c",
-    "xnnpack_wrappers/f16-vhswish/gen/vhswish-f16c-x16.c",
-    "xnnpack_wrappers/f16-vlrelu/gen/vlrelu-f16c-x16.c",
-    "xnnpack_wrappers/f32-f16-vcvt/gen/vcvt-f16c-x16.c",
+AARCH64_ASM_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-1x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-4x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-8x8-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "xnnpack_wrappers/f16-igemm/f16-igemm-1x16-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "xnnpack_wrappers/f16-igemm/f16-igemm-4x16-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55.S",
+    "xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a55r0.S",
+    "xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-cortex-a75.S",
+    "xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld32.S",
+    "xnnpack_wrappers/f16-igemm/f16-igemm-6x16-minmax-asm-aarch64-neonfp16arith-ld64.S",
+    "xnnpack_wrappers/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "xnnpack_wrappers/f32-dwconv/f32-dwconv-9p4c-minmax-asm-aarch64-neonfma.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-prfm-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld64.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld128.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-prfm-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-prfm-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-ld128.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-ld128.S",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-igemm/f32-igemm-1x12-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-igemm/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "xnnpack_wrappers/f32-igemm/f32-igemm-4x12-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
+    "xnnpack_wrappers/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-prfm-cortex-a53.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-ld64.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-ld128.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-prfm-cortex-a53.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a53.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld64.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-ld128.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-prfm-cortex-a53.S",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-asm-aarch64-neonfma-prfm-cortex-a75.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mull.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-asm-aarch64-neon-mull.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mull.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-rndnu-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld32.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld32.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal-prfm.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal-prfm.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c16-minmax-fp32-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c16-minmax-rndnu-asm-aarch64-neon-mlal.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld64.S",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-cortex-a75.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-cortex-a75.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S",
 ]
 
-PROD_NEONV8_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f32-qs8-vcvt/gen/vcvt-neonv8-x32.c",
-    "xnnpack_wrappers/f32-qu8-vcvt/gen/vcvt-neonv8-x32.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndd-neonv8-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndne-neonv8-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndu-neonv8-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndz-neonv8-x8.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up8x25-minmax-fp32-neonv8-mla8-ld64.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-neonv8-mla8-ld64.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-neonv8-mla8-ld64.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x8-minmax-fp32-neonv8-mlal-lane.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x8c2s4-minmax-fp32-neonv8-mlal.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-neonv8-mlal.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c",
-    "xnnpack_wrappers/qc8-gemm/gen/2x8c2s4-minmax-fp32-neonv8-mlal.c",
-    "xnnpack_wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-neonv8-mlal.c",
-    "xnnpack_wrappers/qc8-gemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x8-minmax-fp32-neonv8-mlal-lane.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x8c2s4-minmax-fp32-neonv8-mlal.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-neonv8-mlal.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c",
-    "xnnpack_wrappers/qc8-igemm/gen/2x8c2s4-minmax-fp32-neonv8-mlal.c",
-    "xnnpack_wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-neonv8-mlal.c",
-    "xnnpack_wrappers/qc8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c",
+PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/neonfp16arith-aarch64.c",
 ]
 
-PROD_AVX512SKX_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f16-f32-vcvt/gen/vcvt-avx512skx-x16.c",
-    "xnnpack_wrappers/f32-f16-vcvt/gen/vcvt-avx512skx-x16.c",
-    "xnnpack_wrappers/f32-qs8-vcvt/gen/vcvt-avx512skx-x128.c",
-    "xnnpack_wrappers/f32-qu8-vcvt/gen/vcvt-avx512skx-x128.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c",
-    "xnnpack_wrappers/qc8-gemm/gen/4x16c8-minmax-fp32-avx512skx.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x16c8-minmax-fp32-avx512skx.c",
-    "xnnpack_wrappers/qc8-igemm/gen/4x16c8-minmax-fp32-avx512skx.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c",
-    "xnnpack_wrappers/qs8-f32-vcvt/gen/vcvt-avx512skx-x32.c",
-    "xnnpack_wrappers/qs8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16c8-minmax-fp32-avx512skx.c",
-    "xnnpack_wrappers/qs8-igemm/gen/1x16c8-minmax-fp32-avx512skx.c",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16c8-minmax-fp32-avx512skx.c",
-    "xnnpack_wrappers/qs8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c",
-    "xnnpack_wrappers/qs8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c",
-    "xnnpack_wrappers/qu8-f32-vcvt/gen/vcvt-avx512skx-x32.c",
-    "xnnpack_wrappers/qu8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c",
-    "xnnpack_wrappers/qu8-gemm/gen/4x16c8-minmax-fp32-avx512skx.c",
-    "xnnpack_wrappers/qu8-igemm/gen/1x16c8-minmax-fp32-avx512skx.c",
-    "xnnpack_wrappers/qu8-igemm/gen/4x16c8-minmax-fp32-avx512skx.c",
-    "xnnpack_wrappers/qu8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c",
-    "xnnpack_wrappers/qu8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c",
-    "xnnpack_wrappers/x8-lut/gen/lut-avx512skx-vpshufb-x64.c",
+ALL_ARMSIMD32_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x1c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x2c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x1c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x2c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x1c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x2c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x1c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x2c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x1c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x2c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x1c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x2c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x1c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x2c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x1c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x2c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-armsimd32-x4.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-armsimd32-x8.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-x4.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-armsimd32-x8.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x1c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x2c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x1c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x2c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x1c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x2c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x1c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x2c4-minmax-fp32-armsimd32.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-armsimd32-x4.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-armsimd32-x8.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-x4.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-armsimd32-x8.c",
 ]
 
-PROD_NEON_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f16-f32-vcvt/gen/vcvt-neon-int16-x16.c",
-    "xnnpack_wrappers/f32-argmaxpool/4x-neon-c4.c",
-    "xnnpack_wrappers/f32-argmaxpool/9p8x-neon-c4.c",
-    "xnnpack_wrappers/f32-argmaxpool/9x-neon-c4.c",
-    "xnnpack_wrappers/f32-avgpool/9p8x-minmax-neon-c4.c",
-    "xnnpack_wrappers/f32-avgpool/9x-minmax-neon-c4.c",
-    "xnnpack_wrappers/f32-conv-hwc2chw/3x3s2p1c3x4-neon-2x2.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up8x3-minmax-neon.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up8x4-minmax-neon.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up8x9-minmax-neon.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up8x25-minmax-neon-acc2.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-neon-2x4.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/3x3s2p1-minmax-neon-1x4.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/5x5p2-minmax-neon-1x4.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/5x5s2p2-minmax-neon-1x4.c",
-    "xnnpack_wrappers/f32-f16-vcvt/gen/vcvt-neon-x8.c",
-    "xnnpack_wrappers/f32-gavgpool-cw/neon-x4.c",
-    "xnnpack_wrappers/f32-gavgpool/7p7x-minmax-neon-c4.c",
-    "xnnpack_wrappers/f32-gavgpool/7x-minmax-neon-c4.c",
-    "xnnpack_wrappers/f32-gemm/gen/1x8-minmax-neon-lane-ld64.c",
-    "xnnpack_wrappers/f32-gemm/gen/4x2-minmax-neon-lane-ld64.c",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-neon-lane-ld64.c",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-neon-lane-ld128.c",
-    "xnnpack_wrappers/f32-ibilinear-chw/gen/neon-p8.c",
-    "xnnpack_wrappers/f32-ibilinear/gen/neon-c8.c",
-    "xnnpack_wrappers/f32-igemm/gen/1x8-minmax-neon-lane-ld64.c",
-    "xnnpack_wrappers/f32-igemm/gen/4x2-minmax-neon-lane-ld64.c",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-neon-lane-ld64.c",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-neon-lane-ld128.c",
-    "xnnpack_wrappers/f32-maxpool/9p8x-minmax-neon-c4.c",
-    "xnnpack_wrappers/f32-pavgpool/9p8x-minmax-neon-c4.c",
-    "xnnpack_wrappers/f32-pavgpool/9x-minmax-neon-c4.c",
-    "xnnpack_wrappers/f32-prelu/gen/neon-2x8.c",
-    "xnnpack_wrappers/f32-qs8-vcvt/gen/vcvt-neon-x32.c",
-    "xnnpack_wrappers/f32-qu8-vcvt/gen/vcvt-neon-x32.c",
-    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/neon-rr2-lut64-p2-x8.c",
-    "xnnpack_wrappers/f32-rmax/neon.c",
-    "xnnpack_wrappers/f32-spmm/gen/32x1-minmax-neon.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vadd-minmax-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vaddc-minmax-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmax-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmaxc-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmin-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vminc-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmul-minmax-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmulc-minmax-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vrsubc-minmax-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsqrdiff-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsqrdiffc-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsub-minmax-neon-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsubc-minmax-neon-x8.c",
-    "xnnpack_wrappers/f32-vclamp/gen/vclamp-neon-x8.c",
-    "xnnpack_wrappers/f32-velu/gen/velu-neon-rr2-lut16-p3-x8.c",
-    "xnnpack_wrappers/f32-vhswish/gen/vhswish-neon-x16.c",
-    "xnnpack_wrappers/f32-vlrelu/gen/vlrelu-neon-x8.c",
-    "xnnpack_wrappers/f32-vmulcaddc/gen/c4-minmax-neon-2x.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndd-neon-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndne-neon-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndu-neon-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndz-neon-x8.c",
-    "xnnpack_wrappers/f32-vsigmoid/gen/vsigmoid-neon-rr2-lut64-p2-nr2recps-x8.c",
-    "xnnpack_wrappers/f32-vunary/gen/vabs-neon-x8.c",
-    "xnnpack_wrappers/f32-vunary/gen/vneg-neon-x8.c",
-    "xnnpack_wrappers/f32-vunary/gen/vsqr-neon-x8.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up8x25-minmax-fp32-neon-mla8-ld64.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-neon-mla8-ld64.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-neon-mla8-ld64.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x8c2s4-minmax-fp32-neon-mlal.c",
-    "xnnpack_wrappers/qc8-gemm/gen/2x8c2s4-minmax-fp32-neon-mlal.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x8c2s4-minmax-fp32-neon-mlal.c",
-    "xnnpack_wrappers/qc8-igemm/gen/2x8c2s4-minmax-fp32-neon-mlal.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up8x25-minmax-rndnu-neon-mla8-ld64.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up16x9-minmax-rndnu-neon-mla8-ld64.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up16x25-minmax-rndnu-neon-mla8-ld64.c",
-    "xnnpack_wrappers/qs8-f32-vcvt/gen/vcvt-neon-x32.c",
-    "xnnpack_wrappers/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c",
-    "xnnpack_wrappers/qs8-gavgpool/gen/7x-minmax-rndnu-neon-c8.c",
-    "xnnpack_wrappers/qs8-gemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c",
-    "xnnpack_wrappers/qs8-gemm/gen/1x8c2s4-minmax-rndnu-neon-mlal.c",
-    "xnnpack_wrappers/qs8-gemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c",
-    "xnnpack_wrappers/qs8-gemm/gen/2x8c2s4-minmax-rndnu-neon-mlal.c",
-    "xnnpack_wrappers/qs8-igemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c",
-    "xnnpack_wrappers/qs8-igemm/gen/1x8c2s4-minmax-rndnu-neon-mlal.c",
-    "xnnpack_wrappers/qs8-igemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c",
-    "xnnpack_wrappers/qs8-igemm/gen/2x8c2s4-minmax-rndnu-neon-mlal.c",
-    "xnnpack_wrappers/qs8-vadd/gen/minmax-neon-ld64-x16.c",
-    "xnnpack_wrappers/qs8-vadd/gen/minmax-neon-ld64-x32.c",
-    "xnnpack_wrappers/qs8-vaddc/gen/minmax-neon-ld64-x16.c",
-    "xnnpack_wrappers/qs8-vaddc/gen/minmax-neon-ld64-x32.c",
-    "xnnpack_wrappers/qs8-vmul/gen/minmax-rndnu-neon-ld64-x16.c",
-    "xnnpack_wrappers/qs8-vmulc/gen/minmax-rndnu-neon-ld64-x16.c",
-    "xnnpack_wrappers/qu8-avgpool/9p8x-minmax-neon-c8.c",
-    "xnnpack_wrappers/qu8-avgpool/9x-minmax-neon-c8.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up8x25-minmax-rndnu-neon-mul8.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up16x9-minmax-rndnu-neon-mul8.c",
-    "xnnpack_wrappers/qu8-f32-vcvt/gen/vcvt-neon-x32.c",
-    "xnnpack_wrappers/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c",
-    "xnnpack_wrappers/qu8-gavgpool/gen/7x-minmax-rndnu-neon-c8.c",
-    "xnnpack_wrappers/qu8-gemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c",
-    "xnnpack_wrappers/qu8-gemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c",
-    "xnnpack_wrappers/qu8-gemm/gen/3x8-minmax-rndnu-neon-mlal-lane.c",
-    "xnnpack_wrappers/qu8-gemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c",
-    "xnnpack_wrappers/qu8-igemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c",
-    "xnnpack_wrappers/qu8-igemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c",
-    "xnnpack_wrappers/qu8-igemm/gen/3x8-minmax-rndnu-neon-mlal-lane.c",
-    "xnnpack_wrappers/qu8-igemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c",
-    "xnnpack_wrappers/qu8-vadd/gen/minmax-neon-ld64-x16.c",
-    "xnnpack_wrappers/qu8-vadd/gen/minmax-neon-ld64-x32.c",
-    "xnnpack_wrappers/qu8-vaddc/gen/minmax-neon-ld64-x16.c",
-    "xnnpack_wrappers/qu8-vaddc/gen/minmax-neon-ld64-x32.c",
-    "xnnpack_wrappers/qu8-vmul/gen/minmax-rndnu-neon-ld64-x16.c",
-    "xnnpack_wrappers/qu8-vmulc/gen/minmax-rndnu-neon-ld64-x16.c",
-    "xnnpack_wrappers/s8-ibilinear/gen/neon-c8.c",
-    "xnnpack_wrappers/s8-ibilinear/gen/neon-c16.c",
-    "xnnpack_wrappers/s8-maxpool/9p8x-minmax-neon-c16.c",
-    "xnnpack_wrappers/s8-vclamp/neon-x64.c",
-    "xnnpack_wrappers/u8-ibilinear/gen/neon-c8.c",
-    "xnnpack_wrappers/u8-ibilinear/gen/neon-c16.c",
-    "xnnpack_wrappers/u8-maxpool/9p8x-minmax-neon-c16.c",
-    "xnnpack_wrappers/u8-rmax/neon.c",
-    "xnnpack_wrappers/u8-vclamp/neon-x64.c",
-    "xnnpack_wrappers/xx-fill/neon-x64.c",
-    "xnnpack_wrappers/xx-pad/neon.c",
-    "xnnpack_wrappers/x8-zip/xm-neon.c",
-    "xnnpack_wrappers/x8-zip/x2-neon.c",
-    "xnnpack_wrappers/x8-zip/x3-neon.c",
-    "xnnpack_wrappers/x8-zip/x4-neon.c",
-    "xnnpack_wrappers/x32-packx/x4-neon-st4.c",
-    "xnnpack_wrappers/x32-unpool/neon.c",
-    "xnnpack_wrappers/x32-zip/xm-neon.c",
-    "xnnpack_wrappers/x32-zip/x2-neon.c",
-    "xnnpack_wrappers/x32-zip/x3-neon.c",
-    "xnnpack_wrappers/x32-zip/x4-neon.c",
+ALL_AVX_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-x8.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-x16.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-x24.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-x32.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-x8.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-x16.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-x24.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-x32.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l8c8s4r-minmax-avx-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l8c8s4r-minmax-avx.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l16c8s4r-minmax-avx-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l16c8s4r-minmax-avx.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l32c8s4r-minmax-avx-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l32c8s4r-minmax-avx.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p8c-minmax-avx-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p8c-minmax-avx.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p16c-minmax-avx-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p16c-minmax-avx.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p8c-minmax-avx-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p8c-minmax-avx.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p16c-minmax-avx-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p16c-minmax-avx.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p8c-minmax-avx-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p8c-minmax-avx.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p16c-minmax-avx-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p16c-minmax-avx.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p8c-minmax-avx-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p8c-minmax-avx.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p16c-minmax-avx-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p16c-minmax-avx.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-avx-x8.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-avx-x16.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-avx-x24.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-avx-x32.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x16-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-3x16-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x16-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x16-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-7x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x16-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-3x16-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x16-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x16-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-7x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x16-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-3x16-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x16-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x16-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-7x8-minmax-avx-broadcast.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-avx-2x8.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-avx-2x16.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-x8.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-x16.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-x24.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx-x32.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-x8.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-x16.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-x24.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-x32.c",
+    "xnnpack_wrappers/f32-rmax/f32-rmax-avx.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-minmax-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-minmax-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-minmax-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-minmax-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-minmax-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-minmax-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-minmax-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-minmax-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmax-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmax-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmaxc-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmaxc-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmin-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmin-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vminc-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vminc-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-minmax-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-minmax-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-minmax-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-minmax-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-minmax-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-minmax-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-minmax-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-minmax-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiff-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiff-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiffc-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiffc-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-minmax-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-minmax-avx-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-minmax-avx-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-minmax-avx-x16.c",
+    "xnnpack_wrappers/f32-vclamp/gen/f32-vclamp-avx-x8.c",
+    "xnnpack_wrappers/f32-vclamp/gen/f32-vclamp-avx-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-x24.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-x32.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-x40.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-lut4-p4-perm-x48.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-lut16-p3-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-lut16-p3-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-lut16-p3-x24.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-lut16-p3-x32.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-lut16-p3-x40.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-lut16-p3-x48.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-p6-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-p6-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-p6-x24.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-p6-x32.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-p6-x40.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx-rr2-p6-x48.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-avx-x8.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-avx-x16.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-avx-x8.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-avx-x16.c",
+    "xnnpack_wrappers/f32-vrelu/gen/f32-vrelu-avx-x8.c",
+    "xnnpack_wrappers/f32-vrelu/gen/f32-vrelu-avx-x16.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-avx-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-avx-x16.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-avx-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-avx-x16.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-avx-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-avx-x16.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-avx-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-avx-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x32.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x40.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x48.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x56.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x64.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x72.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-div-x80.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x32.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x40.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x48.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x56.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x64.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x72.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx-rr2-p5-nr2-x80.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-x8.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-x16.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vabs-avx-x8.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vabs-avx-x16.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vneg-avx-x8.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vneg-avx-x16.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vsqr-avx-x8.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vsqr-avx-x16.c",
+    "xnnpack_wrappers/math/exp-f32-avx-rr2-p5.c",
+    "xnnpack_wrappers/math/expm1minus-f32-avx-rr2-lut4-p4-perm.c",
+    "xnnpack_wrappers/math/expm1minus-f32-avx-rr2-lut16-p3.c",
+    "xnnpack_wrappers/math/expm1minus-f32-avx-rr2-p6.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx-rr2-lut64-p2-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx-rr2-p5-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx-rr2-p5-nr1.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx-rr2-p5-nr2.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-avx-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-avx-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-avx-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-avx-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-avx-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx-x8.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx-x16.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx-x24.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx-x32.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2-xw-minmax-fp32-avx.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2s4-xw-minmax-fp32-avx.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-avx.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2-xw-minmax-fp32-avx.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2s4-xw-minmax-fp32-avx.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-avx.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2-xw-minmax-fp32-avx.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2s4-xw-minmax-fp32-avx.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-avx.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2-xw-minmax-fp32-avx.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2s4-xw-minmax-fp32-avx.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx-mul16-ld64-x24.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx-mul16-ld64-x32.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx-mul32-ld32-x8.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx-mul32-ld32-x16.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx-mul32-ld32-x24.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx-mul32-ld32-x32.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul16-ld64-x24.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul16-ld64-x32.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul32-ld32-x8.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul32-ld32-x16.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul32-ld32-x24.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx-mul32-ld32-x32.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-avx-x8.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-avx-x16.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-avx-x32.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-avx-x8.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-avx-x16.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-avx-x32.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-avx-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-avx-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-avx-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-avx-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-avx-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-avx-mul32.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx-x8.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx-x16.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx-x24.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx-x32.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-avx-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-avx-ld128.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-avx-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-avx-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-avx-mul32-ld32-x8.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-avx-mul32-ld32-x16.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-avx-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-avx-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-avx-mul32-ld32-x8.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-avx-mul32-ld32-x16.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-avx-x8.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-avx-x16.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-avx-x32.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-avx-x8.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-avx-x16.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-avx-x32.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-avx-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-avx-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-avx-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-avx-mul16-ld64-x16.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx-x16.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx-x32.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx-x48.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx-x64.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-8x8-multi-mov-avx.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-8x8-multi-switch-avx.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-8x8-reuse-mov-avx.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-8x8-reuse-multi-avx.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-8x8-reuse-switch-avx.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-4x4-multi-mov-avx.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-4x4-multi-multi-avx.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-4x4-multi-switch-avx.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-4x4-reuse-mov-avx.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-4x4-reuse-multi-avx.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-4x4-reuse-switch-avx.c",
 ]
 
-PROD_AVX2_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f16-gemm/gen/1x16-minmax-avx2-broadcast.c",
-    "xnnpack_wrappers/f16-gemm/gen/4x16-minmax-avx2-broadcast.c",
-    "xnnpack_wrappers/f16-igemm/gen/1x16-minmax-avx2-broadcast.c",
-    "xnnpack_wrappers/f16-igemm/gen/4x16-minmax-avx2-broadcast.c",
-    "xnnpack_wrappers/f32-qs8-vcvt/gen/vcvt-avx2-x64.c",
-    "xnnpack_wrappers/f32-qu8-vcvt/gen/vcvt-avx2-x64.c",
-    "xnnpack_wrappers/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x56.c",
-    "xnnpack_wrappers/f32-vsigmoid/gen/vsigmoid-avx2-rr1-p5-div-x40.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
-    "xnnpack_wrappers/qc8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-avx2.c",
-    "xnnpack_wrappers/qc8-igemm/gen/3x8c8-minmax-fp32-avx2.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
-    "xnnpack_wrappers/qs8-f32-vcvt/gen/vcvt-avx2-x16.c",
-    "xnnpack_wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
-    "xnnpack_wrappers/qs8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
-    "xnnpack_wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-avx2.c",
-    "xnnpack_wrappers/qs8-igemm/gen/3x8c8-minmax-fp32-avx2.c",
-    "xnnpack_wrappers/qs8-vadd/gen/minmax-avx2-mul32-ld64-x16.c",
-    "xnnpack_wrappers/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c",
-    "xnnpack_wrappers/qu8-f32-vcvt/gen/vcvt-avx2-x16.c",
-    "xnnpack_wrappers/qu8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
-    "xnnpack_wrappers/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
-    "xnnpack_wrappers/qu8-igemm/gen/1x8c8-minmax-fp32-avx2.c",
-    "xnnpack_wrappers/qu8-igemm/gen/3x8c8-minmax-fp32-avx2.c",
-    "xnnpack_wrappers/qu8-vadd/gen/minmax-avx2-mul32-ld64-x16.c",
-    "xnnpack_wrappers/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c",
-    "xnnpack_wrappers/x8-lut/gen/lut-avx2-x128.c",
+ALL_AVX2_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x8-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x16-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-3x16-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x8-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x16-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-5x8-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-5x16-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x8-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-7x8-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-1x8-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-1x16-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-3x16-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-4x8-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-4x16-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-5x8-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-5x16-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-6x8-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-7x8-minmax-avx2-broadcast.c",
+    "xnnpack_wrappers/f16-pavgpool/f16-pavgpool-9p8x-minmax-avx2-c8.c",
+    "xnnpack_wrappers/f16-pavgpool/f16-pavgpool-9x-minmax-avx2-c8.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x32-acc2.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x32-acc4.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x32.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x40-acc2.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x40-acc5.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x40.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x48-acc2.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x48-acc3.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x48.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x64-acc2.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x64-acc4.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x64.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x72-acc3.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x72.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x80-acc2.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x80-acc5.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x80.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x96-acc2.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x96-acc3.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x96-acc6.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-avx2-rr1-p2-x96.c",
+    "xnnpack_wrappers/f16-velu/gen/f16-velu-avx2-rr1-p3-x8.c",
+    "xnnpack_wrappers/f16-velu/gen/f16-velu-avx2-rr1-p3-x16.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x8.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x16.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x24.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x32.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x40.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x48.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x56.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-div-x64.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x8.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x16.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x24.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x32.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x40.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x48.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x56.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-avx2-rr1-p2-rcp-x64.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-x16.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-x32.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-x48.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx2-x64.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-x16.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-x32.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-x48.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx2-x64.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x64-acc2.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x64-acc4.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x64.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x72-acc3.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x72.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x80-acc2.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x80-acc5.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x80.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x96-acc2.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x96-acc3.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x96-acc6.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx2-p5-x96.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x64-acc2.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x64-acc4.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x64.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x72-acc3.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x72.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x80-acc2.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x80-acc5.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x80.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x96-acc2.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x96-acc3.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x96-acc6.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx2-p5-x96.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x64-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x64-acc4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x64.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x72-acc3.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x72.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x80-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x80-acc5.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x80.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x96-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x96-acc3.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x96-acc6.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx2-rr1-p5-x96.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x24.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x32.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x40.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x48.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x56.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x64.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x72.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut4-p4-perm-x80.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x24.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x32.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x40.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x48.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x56.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x64.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x72.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut8-p4-perm-x80.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x24.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x32.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x40.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x48.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x56.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x64.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x72.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-lut16-p3-gather-x80.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-p6-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-p6-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-p6-x24.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-p6-x32.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-p6-x40.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-p6-x48.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-p6-x56.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-p6-x64.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-p6-x72.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx2-rr1-p6-x80.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x8.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x16.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x24.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x32.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x40.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x48.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x56.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x64.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x72.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x80.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x88.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx2-p5-x96.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x8.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x16.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x24.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x32.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x40.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x48.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x56.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x64.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x72.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x80.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x88.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx2-p5-x96.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x32.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x40.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x48.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x56.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x64.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x72.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-div-x80.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x32.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x40.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x48.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x56.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x64.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x72.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr1fma-x80.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x32.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x40.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x48.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x56.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x64.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x72.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx2-rr1-p5-nr2fma-x80.c",
+    "xnnpack_wrappers/math/exp-f32-avx2-rr2-lut8-p3-perm.c",
+    "xnnpack_wrappers/math/exp-f32-avx2-rr2-lut8-p4-perm.c",
+    "xnnpack_wrappers/math/exp-f32-avx2-rr2-p5.c",
+    "xnnpack_wrappers/math/expm1minus-f16-avx2-rr1-p3.c",
+    "xnnpack_wrappers/math/expm1minus-f32-avx2-rr1-lut4-p4-perm.c",
+    "xnnpack_wrappers/math/expm1minus-f32-avx2-rr1-lut8-p4-perm.c",
+    "xnnpack_wrappers/math/expm1minus-f32-avx2-rr1-lut16-p3-gather.c",
+    "xnnpack_wrappers/math/expm1minus-f32-avx2-rr1-p6.c",
+    "xnnpack_wrappers/math/expminus-f16-avx2-rr1-p2.c",
+    "xnnpack_wrappers/math/expminus-f16-avx2-rr1-p3.c",
+    "xnnpack_wrappers/math/expminus-f32-avx2-rr1-p5.c",
+    "xnnpack_wrappers/math/expminus-f32-avx2-rr2-p5.c",
+    "xnnpack_wrappers/math/extexp-avx2-p5.c",
+    "xnnpack_wrappers/math/sigmoid-f16-avx2-rr1-p2-div.c",
+    "xnnpack_wrappers/math/sigmoid-f16-avx2-rr1-p2-rcp.c",
+    "xnnpack_wrappers/math/sigmoid-f16-avx2-rr1-p3-div.c",
+    "xnnpack_wrappers/math/sigmoid-f16-avx2-rr1-p3-rcp.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr1-lut64-p2-gather-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr1-lut64-p2-gather-nr1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr1-lut64-p2-gather-nr2fma1adj.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr1-lut64-p2-gather-nr2fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr1-p5-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr1-p5-nr1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr1-p5-nr2fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr2-lut64-p2-gather-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr2-lut64-p2-gather-nr1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr2-lut64-p2-gather-nr2fma1adj.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr2-lut64-p2-gather-nr2fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr2-p5-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr2-p5-nr1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx2-rr2-p5-nr2fma.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c8-xw-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c8-xw-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x8c8-xw-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-avx2-mul16-add16-vpunpck.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-avx2-mul16-vpmovsx.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-avx2-mul16-vpunpck.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx2-x8.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx2-x16.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx2-x24.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx2-x32.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-xw-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-xw-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c8-xw-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx2-mul32-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx2-mul32-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx2-mul32-ld64-x24.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx2-mul32-ld64-x32.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx2-mul32-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx2-mul32-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx2-mul32-ld64-x24.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx2-mul32-ld64-x32.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-avx2-x16.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-avx2-x32.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-avx2-x64.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-avx2-x16.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-avx2-x32.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-avx2-x64.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p32c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-fp32-avx2-mul32.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx2-x8.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx2-x16.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx2-x24.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx2-x32.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x8c8-minmax-fp32-avx2.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-avx2-mul32-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-avx2-mul32-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-avx2-mul32-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-avx2-mul32-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-avx2-x16.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-avx2-x32.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-avx2-x64.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-avx2-x16.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-avx2-x32.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-avx2-x64.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx2-x32.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx2-x64.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx2-x96.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx2-x128.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-32x32-reuse-mov-avx2.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-32x32-reuse-switch-avx2.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-16x16-reuse-mov-avx2.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-16x16-reuse-switch-avx2.c",
 ]
 
-PROD_SSE_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f32-avgpool/9p8x-minmax-sse-c4.c",
-    "xnnpack_wrappers/f32-avgpool/9x-minmax-sse-c4.c",
-    "xnnpack_wrappers/f32-conv-hwc2chw/3x3s2p1c3x4-sse-2x2.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up8x3-minmax-sse.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up8x4-minmax-sse.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up8x9-minmax-sse.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up8x25-minmax-sse.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-sse-2x4-acc2.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/3x3s2p1-minmax-sse-1x4-acc3.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/5x5p2-minmax-sse-4x4.c",
-    "xnnpack_wrappers/f32-dwconv2d-chw/gen/5x5s2p2-minmax-sse-2x4.c",
-    "xnnpack_wrappers/f32-gavgpool-cw/sse-x4.c",
-    "xnnpack_wrappers/f32-gavgpool/7p7x-minmax-sse-c4.c",
-    "xnnpack_wrappers/f32-gavgpool/7x-minmax-sse-c4.c",
-    "xnnpack_wrappers/f32-gemm/gen/1x8-minmax-sse-load1.c",
-    "xnnpack_wrappers/f32-gemm/gen/4x2c4-minmax-sse.c",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-sse-load1.c",
-    "xnnpack_wrappers/f32-ibilinear-chw/gen/sse-p8.c",
-    "xnnpack_wrappers/f32-ibilinear/gen/sse-c8.c",
-    "xnnpack_wrappers/f32-igemm/gen/1x8-minmax-sse-load1.c",
-    "xnnpack_wrappers/f32-igemm/gen/4x2c4-minmax-sse.c",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-sse-load1.c",
-    "xnnpack_wrappers/f32-maxpool/9p8x-minmax-sse-c4.c",
-    "xnnpack_wrappers/f32-pavgpool/9p8x-minmax-sse-c4.c",
-    "xnnpack_wrappers/f32-pavgpool/9x-minmax-sse-c4.c",
-    "xnnpack_wrappers/f32-rmax/sse.c",
-    "xnnpack_wrappers/f32-spmm/gen/32x1-minmax-sse.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vadd-minmax-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vaddc-minmax-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vdiv-minmax-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vdivc-minmax-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmax-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmaxc-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmin-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vminc-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmul-minmax-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmulc-minmax-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vrdivc-minmax-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vrsubc-minmax-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsqrdiff-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsqrdiffc-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsub-minmax-sse-x8.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsubc-minmax-sse-x8.c",
-    "xnnpack_wrappers/f32-vclamp/gen/vclamp-sse-x8.c",
-    "xnnpack_wrappers/f32-vhswish/gen/vhswish-sse-x8.c",
-    "xnnpack_wrappers/f32-vlrelu/gen/vlrelu-sse-x8.c",
-    "xnnpack_wrappers/f32-vmulcaddc/gen/c4-minmax-sse-2x.c",
-    "xnnpack_wrappers/f32-vsqrt/gen/sse-sqrt-x4.c",
-    "xnnpack_wrappers/f32-vunary/gen/vabs-sse-x8.c",
-    "xnnpack_wrappers/f32-vunary/gen/vneg-sse-x8.c",
-    "xnnpack_wrappers/f32-vunary/gen/vsqr-sse-x8.c",
-    "xnnpack_wrappers/x32-packx/x4-sse.c",
+ALL_AVX512F_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l16c16s4r-minmax-avx512f-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l16c16s4r-minmax-avx512f.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l32c16s4r-minmax-avx512f-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l32c16s4r-minmax-avx512f.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l64c16s4r-minmax-avx512f-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l64c16s4r-minmax-avx512f.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p16c-minmax-avx512f-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p16c-minmax-avx512f.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p32c-minmax-avx512f-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p32c-minmax-avx512f.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p16c-minmax-avx512f-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p16c-minmax-avx512f.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p32c-minmax-avx512f-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p32c-minmax-avx512f.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p16c-minmax-avx512f-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p16c-minmax-avx512f.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p32c-minmax-avx512f-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p32c-minmax-avx512f.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p16c-minmax-avx512f-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p16c-minmax-avx512f.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p32c-minmax-avx512f-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p32c-minmax-avx512f.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-7x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-8x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-7x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-8x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-7x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-8x16-minmax-avx512f-broadcast.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-avx512f-2x16.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-avx512f-2x32.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x128-acc2.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x128-acc4.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x128.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x144-acc3.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x144.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x160-acc2.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x160-acc5.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x160.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x192-acc2.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x192-acc3.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x192-acc6.c",
+    "xnnpack_wrappers/f32-raddexpminusmax/gen/f32-raddexpminusmax-avx512f-p5-scalef-x192.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x128-acc2.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x128-acc4.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x128.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x144-acc3.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x144.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x160-acc2.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x160-acc5.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x160.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x192-acc2.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x192-acc3.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x192-acc6.c",
+    "xnnpack_wrappers/f32-raddextexp/gen/f32-raddextexp-avx512f-p5-scalef-x192.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x128-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x128-acc4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x128.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x144-acc3.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x144.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x160-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x160-acc5.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x160.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x192-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x192-acc3.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x192-acc6.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-avx512f-rr1-p5-scalef-x192.c",
+    "xnnpack_wrappers/f32-rmax/f32-rmax-avx512f.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-minmax-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-minmax-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-minmax-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-minmax-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-minmax-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-minmax-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-minmax-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-minmax-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmax-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmax-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmaxc-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmaxc-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmin-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmin-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vminc-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vminc-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-minmax-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-minmax-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-minmax-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-minmax-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-minmax-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-minmax-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiff-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiff-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiffc-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiffc-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-minmax-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-minmax-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-minmax-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-minmax-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vclamp/gen/f32-vclamp-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vclamp/gen/f32-vclamp-avx512f-x32.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x32.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x48.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x64.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x80.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x96.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x112.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-lut16-p3-perm-x128.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-p6-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-p6-x32.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-p6-x48.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-p6-x64.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-p6-x80.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-p6-x96.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-p6-x112.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-avx512f-rr1-p6-x128.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vrelu/gen/f32-vrelu-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vrelu/gen/f32-vrelu-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x16.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x32.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x48.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x64.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x80.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x96.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x112.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x128.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x144.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x160.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x176.c",
+    "xnnpack_wrappers/f32-vscaleexpminusmax/gen/f32-vscaleexpminusmax-avx512f-p5-scalef-x192.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x16.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x32.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x48.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x64.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x80.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x96.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x112.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x128.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x144.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x160.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x176.c",
+    "xnnpack_wrappers/f32-vscaleextexp/gen/f32-vscaleextexp-avx512f-p5-scalef-x192.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x32.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x48.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x64.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x80.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x96.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x112.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-div-x128.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x32.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x48.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x64.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x80.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x96.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x112.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-lut16-p3-perm-scalef-nr1fma-x128.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x32.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x48.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x64.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x80.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x96.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x112.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-div-x128.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x32.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x48.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x64.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x80.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x96.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x112.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr1-p5-scalef-nr1fma-x128.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x32.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x48.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x64.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x80.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x96.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x112.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x128.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x32.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x48.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x64.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x80.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x96.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x112.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma-x128.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x16.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x32.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x48.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x64.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x80.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x96.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x112.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-avx512f-nr1fma1adj-x128.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vabs-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vabs-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vneg-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vneg-avx512f-x32.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vsqr-avx512f-x16.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vsqr-avx512f-x32.c",
+    "xnnpack_wrappers/math/exp-f32-avx512f-rr2-lut16-p3-perm-scalef.c",
+    "xnnpack_wrappers/math/exp-f32-avx512f-rr2-lut16-p3-perm.c",
+    "xnnpack_wrappers/math/exp-f32-avx512f-rr2-lut32-p2-perm2-scalef.c",
+    "xnnpack_wrappers/math/exp-f32-avx512f-rr2-lut32-p2-perm2.c",
+    "xnnpack_wrappers/math/exp-f32-avx512f-rr2-p5-scalef.c",
+    "xnnpack_wrappers/math/exp-f32-avx512f-rr2-p5.c",
+    "xnnpack_wrappers/math/expm1minus-f32-avx512f-rr1-lut16-p3-perm.c",
+    "xnnpack_wrappers/math/expm1minus-f32-avx512f-rr1-p6.c",
+    "xnnpack_wrappers/math/extexp-avx512f-p5.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr1-lut16-p3-perm-scalef-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr1-lut16-p3-perm-scalef-nr1fma1adj.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr1-lut16-p3-perm-scalef-nr1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr1-lut32-p2-perm2-scalef-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr1-lut32-p2-perm2-scalef-nr1fma1adj.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr1-lut32-p2-perm2-scalef-nr1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr1-lut64-p2-gather-scalef-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr1-lut64-p2-gather-scalef-nr1fma1adj.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr1-lut64-p2-gather-scalef-nr1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr1-p5-scalef-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr1-p5-scalef-nr1fma1adj.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr1-p5-scalef-nr1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr2-lut16-p3-perm-scalef-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr2-lut16-p3-perm-scalef-nr1fma1adj.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr2-lut16-p3-perm-scalef-nr1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr2-lut32-p2-perm2-scalef-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma1adj.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr2-lut32-p2-perm2-scalef-nr1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr2-lut64-p2-gather-scalef-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr2-lut64-p2-gather-scalef-nr1fma1adj.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr2-lut64-p2-gather-scalef-nr1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr2-p5-scalef-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr2-p5-scalef-nr1fma1adj.c",
+    "xnnpack_wrappers/math/sigmoid-f32-avx512f-rr2-p5-scalef-nr1fma.c",
+    "xnnpack_wrappers/math/sqrt-f32-avx512f-nr1fma1adj.c",
+    "xnnpack_wrappers/math/sqrt-f32-avx512f-nr1fma.c",
+    "xnnpack_wrappers/math/sqrt-f32-avx512f-nr2fma.c",
 ]
 
-PROD_SSE41_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f16-f32-vcvt/gen/vcvt-sse41-int16-x16.c",
-    "xnnpack_wrappers/f32-f16-vcvt/gen/vcvt-sse41-x8.c",
-    "xnnpack_wrappers/f32-prelu/gen/sse41-2x8.c",
-    "xnnpack_wrappers/f32-qs8-vcvt/gen/vcvt-sse41-x32.c",
-    "xnnpack_wrappers/f32-vlrelu/gen/vlrelu-sse41-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndd-sse41-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndne-sse41-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndu-sse41-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndz-sse41-x8.c",
-    "xnnpack_wrappers/f32-vsigmoid/gen/vsigmoid-sse41-rr2-lut64-p2-div-x8.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
-    "xnnpack_wrappers/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
-    "xnnpack_wrappers/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16-add16.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16-add16.c",
-    "xnnpack_wrappers/qs8-f32-vcvt/gen/vcvt-sse41-x16.c",
-    "xnnpack_wrappers/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c",
-    "xnnpack_wrappers/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c",
-    "xnnpack_wrappers/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
-    "xnnpack_wrappers/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
-    "xnnpack_wrappers/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
-    "xnnpack_wrappers/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
-    "xnnpack_wrappers/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c",
-    "xnnpack_wrappers/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c",
-    "xnnpack_wrappers/qs8-vmul/gen/minmax-fp32-sse41-mul16-ld64-x16.c",
-    "xnnpack_wrappers/qs8-vmulc/gen/minmax-fp32-sse41-mul16-ld64-x16.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16.c",
-    "xnnpack_wrappers/qu8-f32-vcvt/gen/vcvt-sse41-x16.c",
-    "xnnpack_wrappers/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c",
-    "xnnpack_wrappers/qu8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c",
-    "xnnpack_wrappers/qu8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
-    "xnnpack_wrappers/qu8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
-    "xnnpack_wrappers/qu8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
-    "xnnpack_wrappers/qu8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
-    "xnnpack_wrappers/qu8-vadd/gen/minmax-sse41-mul16-ld64-x8.c",
-    "xnnpack_wrappers/qu8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c",
-    "xnnpack_wrappers/qu8-vmul/gen/minmax-fp32-sse41-mul16-ld64-x16.c",
-    "xnnpack_wrappers/qu8-vmulc/gen/minmax-fp32-sse41-mul16-ld64-x16.c",
-    "xnnpack_wrappers/s8-ibilinear/gen/sse41-c16.c",
-    "xnnpack_wrappers/s8-maxpool/9p8x-minmax-sse41-c16.c",
-    "xnnpack_wrappers/s8-vclamp/sse41-x64.c",
-    "xnnpack_wrappers/u8-ibilinear/gen/sse41-c16.c",
+ALL_AVX512SKX_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-x16.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-x32.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-avx512skx-x16.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-avx512skx-x32.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-x32.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-x64.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-x96.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-avx512skx-x128.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-x32.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-x64.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-x96.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx512skx-x128.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p32c-minmax-fp32-avx512skx-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-avx512skx-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-avx512skx-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-avx512skx-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-avx512skx-mul32.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-avx512skx-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-avx512skx-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-avx512skx-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-avx512skx-mul32.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx512skx-x16.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx512skx-x32.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx512skx-x48.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-avx512skx-x64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx512skx-mul32-ld128-x16.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-avx512skx-mul32-ld128-x32.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx512skx-mul32-ld128-x16.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-avx512skx-mul32-ld128-x32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-avx512skx-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p32c-minmax-fp32-avx512skx-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-avx512skx-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-fp32-avx512skx-mul32.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx512skx-x16.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx512skx-x32.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx512skx-x48.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-avx512skx-x64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c8-minmax-fp32-avx512skx.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-avx512skx-mul32-ld128-x16.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-avx512skx-mul32-ld128-x32.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-avx512skx-mul32-ld128-x16.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-avx512skx-mul32-ld128-x32.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx512skx-vpshufb-x64.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx512skx-vpshufb-x128.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx512skx-vpshufb-x192.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx512skx-vpshufb-x256.c",
 ]
 
-PROD_SSE2_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f16-f32-vcvt/gen/vcvt-sse2-int16-x32.c",
-    "xnnpack_wrappers/f32-argmaxpool/4x-sse2-c4.c",
-    "xnnpack_wrappers/f32-argmaxpool/9p8x-sse2-c4.c",
-    "xnnpack_wrappers/f32-argmaxpool/9x-sse2-c4.c",
-    "xnnpack_wrappers/f32-f16-vcvt/gen/vcvt-sse2-x16.c",
-    "xnnpack_wrappers/f32-prelu/gen/sse2-2x8.c",
-    "xnnpack_wrappers/f32-qs8-vcvt/gen/vcvt-sse2-x32.c",
-    "xnnpack_wrappers/f32-qu8-vcvt/gen/vcvt-sse2-x32.c",
-    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/sse2-rr2-p5-x20-acc2.c",
-    "xnnpack_wrappers/f32-velu/gen/velu-sse2-rr2-lut16-p3-x12.c",
-    "xnnpack_wrappers/f32-vlrelu/gen/vlrelu-sse2-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndd-sse2-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndne-sse2-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndu-sse2-x8.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndz-sse2-x8.c",
-    "xnnpack_wrappers/f32-vsigmoid/gen/vsigmoid-sse2-rr2-lut64-p2-div-x8.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16.c",
-    "xnnpack_wrappers/qc8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16.c",
-    "xnnpack_wrappers/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
-    "xnnpack_wrappers/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
-    "xnnpack_wrappers/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
-    "xnnpack_wrappers/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16-add16.c",
-    "xnnpack_wrappers/qs8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16-add16.c",
-    "xnnpack_wrappers/qs8-f32-vcvt/gen/vcvt-sse2-x32.c",
-    "xnnpack_wrappers/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c",
-    "xnnpack_wrappers/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c",
-    "xnnpack_wrappers/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
-    "xnnpack_wrappers/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
-    "xnnpack_wrappers/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
-    "xnnpack_wrappers/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
-    "xnnpack_wrappers/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c",
-    "xnnpack_wrappers/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c",
-    "xnnpack_wrappers/qs8-vmul/gen/minmax-fp32-sse2-mul16-ld64-x8.c",
-    "xnnpack_wrappers/qs8-vmulc/gen/minmax-fp32-sse2-mul16-ld64-x8.c",
-    "xnnpack_wrappers/qu8-avgpool/9p8x-minmax-sse2-c8.c",
-    "xnnpack_wrappers/qu8-avgpool/9x-minmax-sse2-c8.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16.c",
-    "xnnpack_wrappers/qu8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16.c",
-    "xnnpack_wrappers/qu8-f32-vcvt/gen/vcvt-sse2-x32.c",
-    "xnnpack_wrappers/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c",
-    "xnnpack_wrappers/qu8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c",
-    "xnnpack_wrappers/qu8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
-    "xnnpack_wrappers/qu8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
-    "xnnpack_wrappers/qu8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
-    "xnnpack_wrappers/qu8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
-    "xnnpack_wrappers/qu8-vadd/gen/minmax-sse2-mul16-ld64-x8.c",
-    "xnnpack_wrappers/qu8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c",
-    "xnnpack_wrappers/qu8-vmul/gen/minmax-fp32-sse2-mul16-ld64-x8.c",
-    "xnnpack_wrappers/qu8-vmulc/gen/minmax-fp32-sse2-mul16-ld64-x8.c",
-    "xnnpack_wrappers/s8-ibilinear/gen/sse2-c8.c",
-    "xnnpack_wrappers/s8-maxpool/9p8x-minmax-sse2-c16.c",
-    "xnnpack_wrappers/s8-vclamp/sse2-x64.c",
-    "xnnpack_wrappers/u8-ibilinear/gen/sse2-c8.c",
-    "xnnpack_wrappers/u8-maxpool/9p8x-minmax-sse2-c16.c",
-    "xnnpack_wrappers/u8-rmax/sse2.c",
-    "xnnpack_wrappers/u8-vclamp/sse2-x64.c",
-    "xnnpack_wrappers/xx-fill/sse2-x64.c",
-    "xnnpack_wrappers/xx-pad/sse2.c",
-    "xnnpack_wrappers/x8-zip/xm-sse2.c",
-    "xnnpack_wrappers/x8-zip/x2-sse2.c",
-    "xnnpack_wrappers/x8-zip/x3-sse2.c",
-    "xnnpack_wrappers/x8-zip/x4-sse2.c",
-    "xnnpack_wrappers/x32-unpool/sse2.c",
-    "xnnpack_wrappers/x32-zip/xm-sse2.c",
-    "xnnpack_wrappers/x32-zip/x2-sse2.c",
-    "xnnpack_wrappers/x32-zip/x3-sse2.c",
-    "xnnpack_wrappers/x32-zip/x4-sse2.c",
+ALL_AVX512VBMI_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx512vbmi-vpermx2b-x64.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx512vbmi-vpermx2b-x128.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx512vbmi-vpermx2b-x192.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-avx512vbmi-vpermx2b-x256.c",
 ]
 
-PROD_AVX512F_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f32-dwconv/gen/up16x3-minmax-avx512f.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up16x4-minmax-avx512f.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up16x9-minmax-avx512f.c",
-    "xnnpack_wrappers/f32-dwconv/gen/up16x25-minmax-avx512f.c",
-    "xnnpack_wrappers/f32-gemm/gen/1x16-minmax-avx512f-broadcast.c",
-    "xnnpack_wrappers/f32-gemm/gen/7x16-minmax-avx512f-broadcast.c",
-    "xnnpack_wrappers/f32-igemm/gen/1x16-minmax-avx512f-broadcast.c",
-    "xnnpack_wrappers/f32-igemm/gen/7x16-minmax-avx512f-broadcast.c",
-    "xnnpack_wrappers/f32-prelu/gen/avx512f-2x16.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vadd-minmax-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vaddc-minmax-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vdiv-minmax-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vdivc-minmax-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmax-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmaxc-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmin-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vminc-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmul-minmax-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vmulc-minmax-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vrdivc-minmax-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vrsubc-minmax-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsqrdiff-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsqrdiffc-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsub-minmax-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vbinary/gen/vsubc-minmax-avx512f-x32.c",
-    "xnnpack_wrappers/f32-vclamp/gen/vclamp-avx512f-x16.c",
-    "xnnpack_wrappers/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x64.c",
-    "xnnpack_wrappers/f32-vhswish/gen/vhswish-avx512f-x16.c",
-    "xnnpack_wrappers/f32-vlrelu/gen/vlrelu-avx512f-x16.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndd-avx512f-x16.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndne-avx512f-x16.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndu-avx512f-x16.c",
-    "xnnpack_wrappers/f32-vrnd/gen/vrndz-avx512f-x16.c",
-    "xnnpack_wrappers/f32-vsigmoid/gen/vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x64.c",
-    "xnnpack_wrappers/f32-vunary/gen/vabs-avx512f-x16.c",
-    "xnnpack_wrappers/f32-vunary/gen/vneg-avx512f-x16.c",
-    "xnnpack_wrappers/f32-vunary/gen/vsqr-avx512f-x16.c",
+ALL_F16C_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f16-avgpool/f16-avgpool-9p8x-minmax-f16c-c8.c",
+    "xnnpack_wrappers/f16-avgpool/f16-avgpool-9x-minmax-f16c-c8.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-x8.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-x16.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c8.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c16.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c24.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c32.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c8.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c16.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c",
+    "xnnpack_wrappers/f16-maxpool/f16-maxpool-9p8x-minmax-f16c-c8.c",
+    "xnnpack_wrappers/f16-prelu/gen/f16-prelu-f16c-2x8.c",
+    "xnnpack_wrappers/f16-prelu/gen/f16-prelu-f16c-2x16.c",
+    "xnnpack_wrappers/f16-rmax/f16-rmax-f16c.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vadd-minmax-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vadd-minmax-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vaddc-minmax-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vaddc-minmax-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdiv-minmax-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdiv-minmax-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdivc-minmax-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdivc-minmax-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmax-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmax-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmaxc-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmaxc-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmin-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmin-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vminc-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vminc-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmul-minmax-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmul-minmax-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmulc-minmax-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmulc-minmax-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrdivc-minmax-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrdivc-minmax-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrsubc-minmax-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrsubc-minmax-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiff-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiff-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiffc-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiffc-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsub-minmax-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsub-minmax-f16c-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsubc-minmax-f16c-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsubc-minmax-f16c-x16.c",
+    "xnnpack_wrappers/f16-vclamp/gen/f16-vclamp-f16c-x8.c",
+    "xnnpack_wrappers/f16-vclamp/gen/f16-vclamp-f16c-x16.c",
+    "xnnpack_wrappers/f16-vhswish/gen/f16-vhswish-f16c-x8.c",
+    "xnnpack_wrappers/f16-vhswish/gen/f16-vhswish-f16c-x16.c",
+    "xnnpack_wrappers/f16-vlrelu/gen/f16-vlrelu-f16c-x8.c",
+    "xnnpack_wrappers/f16-vlrelu/gen/f16-vlrelu-f16c-x16.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndd-f16c-x8.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndd-f16c-x16.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndne-f16c-x8.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndne-f16c-x16.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndu-f16c-x8.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndu-f16c-x16.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndz-f16c-x8.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndz-f16c-x16.c",
+    "xnnpack_wrappers/f16-vsqrt/gen/f16-vsqrt-f16c-sqrt-x8.c",
+    "xnnpack_wrappers/f16-vsqrt/gen/f16-vsqrt-f16c-sqrt-x16.c",
+    "xnnpack_wrappers/f16-vunary/gen/f16-vsqr-f16c-x8.c",
+    "xnnpack_wrappers/f16-vunary/gen/f16-vsqr-f16c-x16.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-f16c-x8.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-f16c-x16.c",
+    "xnnpack_wrappers/math/cvt-f16-f32-f16c.c",
+    "xnnpack_wrappers/math/cvt-f32-f16-f16c.c",
 ]
 
-AARCH64_ASM_MICROKERNEL_SRCS = [
-    "xnnpack_wrappers/f16-gemm/gen-inc/1x8inc-minmax-aarch64-neonfp16arith-ld64.S",
-    "xnnpack_wrappers/f16-gemm/gen-inc/1x16inc-minmax-aarch64-neonfp16arith-ld32.S",
-    "xnnpack_wrappers/f16-gemm/gen-inc/4x8inc-minmax-aarch64-neonfp16arith-ld64.S",
-    "xnnpack_wrappers/f16-gemm/gen-inc/4x16inc-minmax-aarch64-neonfp16arith-ld32.S",
-    "xnnpack_wrappers/f16-gemm/gen-inc/6x8inc-minmax-aarch64-neonfp16arith-ld64.S",
-    "xnnpack_wrappers/f16-gemm/gen-inc/6x16inc-minmax-aarch64-neonfp16arith-cortex-a55.S",
-    "xnnpack_wrappers/f16-gemm/gen-inc/6x16inc-minmax-aarch64-neonfp16arith-cortex-a75.S",
-    "xnnpack_wrappers/f16-gemm/gen-inc/6x16inc-minmax-aarch64-neonfp16arith-ld32.S",
-    "xnnpack_wrappers/f16-gemm/gen-inc/8x8inc-minmax-aarch64-neonfp16arith-ld64.S",
-    "xnnpack_wrappers/f16-gemm/gen/1x8-minmax-aarch64-neonfp16arith-ld64.S",
-    "xnnpack_wrappers/f16-gemm/gen/1x16-minmax-aarch64-neonfp16arith-ld32.S",
-    "xnnpack_wrappers/f16-gemm/gen/4x8-minmax-aarch64-neonfp16arith-ld64.S",
-    "xnnpack_wrappers/f16-gemm/gen/4x16-minmax-aarch64-neonfp16arith-ld32.S",
-    "xnnpack_wrappers/f16-gemm/gen/6x8-minmax-aarch64-neonfp16arith-ld64.S",
-    "xnnpack_wrappers/f16-gemm/gen/6x16-minmax-aarch64-neonfp16arith-cortex-a55.S",
-    "xnnpack_wrappers/f16-gemm/gen/6x16-minmax-aarch64-neonfp16arith-cortex-a75.S",
-    "xnnpack_wrappers/f16-gemm/gen/6x16-minmax-aarch64-neonfp16arith-ld32.S",
-    "xnnpack_wrappers/f16-gemm/gen/8x8-minmax-aarch64-neonfp16arith-ld64.S",
-    "xnnpack_wrappers/f16-igemm/4x16-minmax-aarch64-neonfp16arith-ld32.S",
-    "xnnpack_wrappers/f32-dwconv/up4x9-minmax-aarch64-neonfma-cortex-a55.S",
-    "xnnpack_wrappers/f32-dwconv/up4x9-minmax-aarch64-neonfma.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-ld64.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/1x12inc-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a55.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-ld64.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-ld128.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/4x12inc-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/5x8inc-minmax-aarch64-neonfma-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/5x8inc-minmax-aarch64-neonfma-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a55.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a73.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-ld64.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-ld128.S",
-    "xnnpack_wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen/1x8-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen/1x8-minmax-aarch64-neonfma-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen/1x8-minmax-aarch64-neonfma-ld64.S",
-    "xnnpack_wrappers/f32-gemm/gen/1x8-minmax-aarch64-neonfma-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen/1x12-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a55.S",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-ld64.S",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-ld128.S",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen/4x12-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen/5x8-minmax-aarch64-neonfma-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen/5x8-minmax-aarch64-neonfma-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a55.S",
-    "xnnpack_wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a73.S",
-    "xnnpack_wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S",
-    "xnnpack_wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-ld64.S",
-    "xnnpack_wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-ld128.S",
-    "xnnpack_wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a53.S",
-    "xnnpack_wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-igemm/gen/1x8-minmax-aarch64-neonfma-cortex-a75.S",
-    "xnnpack_wrappers/f32-igemm/gen/1x8-minmax-aarch64-neonfma-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-cortex-a75.S",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-ld64.S",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-ld128.S",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a53.S",
-    "xnnpack_wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-igemm/gen/5x8-minmax-aarch64-neonfma-cortex-a75.S",
-    "xnnpack_wrappers/f32-igemm/gen/5x8-minmax-aarch64-neonfma-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S",
-    "xnnpack_wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-ld64.S",
-    "xnnpack_wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-ld128.S",
-    "xnnpack_wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a53.S",
-    "xnnpack_wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a75.S",
-    "xnnpack_wrappers/f32-igemm/1x8-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-igemm/1x12-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-igemm/4x8-minmax-aarch64-neonfma-cortex-a55.S",
-    "xnnpack_wrappers/f32-igemm/4x12-minmax-aarch64-neonfma-cortex-a53.S",
-    "xnnpack_wrappers/f32-igemm/6x8-minmax-aarch64-neonfma-cortex-a55.S",
-    "xnnpack_wrappers/f32-igemm/6x8-minmax-aarch64-neonfma-cortex-a73.S",
-    "xnnpack_wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S",
-    "xnnpack_wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S",
-    "xnnpack_wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qc8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S",
-    "xnnpack_wrappers/qc8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S",
-    "xnnpack_wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S",
-    "xnnpack_wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S",
-    "xnnpack_wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mull.S",
-    "xnnpack_wrappers/qc8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S",
-    "xnnpack_wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
-    "xnnpack_wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S",
-    "xnnpack_wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S",
-    "xnnpack_wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S",
-    "xnnpack_wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S",
-    "xnnpack_wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qc8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S",
-    "xnnpack_wrappers/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
-    "xnnpack_wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S",
-    "xnnpack_wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm.S",
-    "xnnpack_wrappers/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S",
-    "xnnpack_wrappers/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S",
-    "xnnpack_wrappers/qs8-gemm/gen/1x16c4-minmax-rndnu-aarch64-neondot-ld32.S",
-    "xnnpack_wrappers/qs8-gemm/gen/1x16c4-minmax-rndnu-aarch64-neondot-ld64.S",
-    "xnnpack_wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S",
-    "xnnpack_wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mull.S",
-    "xnnpack_wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm.S",
-    "xnnpack_wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mull.S",
-    "xnnpack_wrappers/qs8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qs8-gemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld32.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld64.S",
-    "xnnpack_wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S",
-    "xnnpack_wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S",
-    "xnnpack_wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm.S",
-    "xnnpack_wrappers/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S",
-    "xnnpack_wrappers/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm.S",
-    "xnnpack_wrappers/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qs8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qs8-igemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld64.S",
-    "xnnpack_wrappers/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-ld128.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-ld128.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
-    "xnnpack_wrappers/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S",
+ALL_FMA3_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-3p8c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-3p8c-minmax-fma3.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-3p16c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-3p16c-minmax-fma3.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-3p32c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-3p32c-minmax-fma3.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-4p8c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-4p8c-minmax-fma3.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-4p16c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-4p16c-minmax-fma3.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-4p32c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-4p32c-minmax-fma3.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-9p8c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-9p8c-minmax-fma3.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-9p16c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-9p16c-minmax-fma3.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-9p32c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-9p32c-minmax-fma3.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-25p8c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-25p8c-minmax-fma3.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-25p16c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-25p16c-minmax-fma3.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-25p32c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-25p32c-minmax-fma3.c",
+    "xnnpack_wrappers/f16-ibilinear/gen/f16-ibilinear-fma3-c8.c",
+    "xnnpack_wrappers/f16-ibilinear/gen/f16-ibilinear-fma3-c16.c",
+    "xnnpack_wrappers/f16-vmulcaddc/gen/f16-vmulcaddc-c8-minmax-fma3-2x.c",
+    "xnnpack_wrappers/f16-vmulcaddc/gen/f16-vmulcaddc-c16-minmax-fma3-2x.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l8c8s4r-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l8c8s4r-minmax-fma3.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l16c8s4r-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l16c8s4r-minmax-fma3.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l32c8s4r-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l32c8s4r-minmax-fma3.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p8c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p8c-minmax-fma3.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p16c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p16c-minmax-fma3.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p8c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p8c-minmax-fma3.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p16c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p16c-minmax-fma3.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p8c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p8c-minmax-fma3.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p16c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p16c-minmax-fma3.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p8c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p8c-minmax-fma3.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p16c-minmax-fma3-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p16c-minmax-fma3.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x16-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x16s4-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-3x16-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-3x16s4-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x16-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x16s4-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x16-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x16s4-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-7x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-8x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x16-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x16s4-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-3x16-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-3x16s4-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x16-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x16s4-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x16-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x16s4-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-7x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-8x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x16-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x16s4-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-3x16-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-3x16s4-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x16-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x16s4-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x16-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x16s4-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-7x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-8x8-minmax-fma3-broadcast.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-fma3-x8.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-fma3-x16.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x8.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x16.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x24.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x32.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x40.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x48.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x56.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-fma3-nr1fma1adj-x64.c",
+    "xnnpack_wrappers/math/sqrt-f32-fma3-nr1fma1adj.c",
+    "xnnpack_wrappers/math/sqrt-f32-fma3-nr1fma.c",
+    "xnnpack_wrappers/math/sqrt-f32-fma3-nr2fma.c",
+]
+
+ALL_FP16ARITH_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vadd-minmax-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vadd-minmax-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vadd-minmax-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vaddc-minmax-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdiv-minmax-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdivc-minmax-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmax-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmax-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmax-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmaxc-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmaxc-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmaxc-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmin-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmin-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmin-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vminc-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vminc-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vminc-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmul-minmax-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmul-minmax-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmul-minmax-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmulc-minmax-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrdivc-minmax-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrsubc-minmax-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiff-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiff-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiff-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiffc-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsub-minmax-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsub-minmax-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsub-minmax-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-x1.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-x2.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsubc-minmax-fp16arith-x4.c",
+    "xnnpack_wrappers/f16-vsqrt/gen/f16-vsqrt-fp16arith-sqrt-x1.c",
+    "xnnpack_wrappers/f16-vsqrt/gen/f16-vsqrt-fp16arith-sqrt-x2.c",
+    "xnnpack_wrappers/f16-vsqrt/gen/f16-vsqrt-fp16arith-sqrt-x4.c",
+]
+
+ALL_NEON_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/cs16-bfly4/cs16-bfly4-neon-x1.c",
+    "xnnpack_wrappers/cs16-bfly4/cs16-bfly4-neon-x4.c",
+    "xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-neon.c",
+    "xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples4-neon.c",
+    "xnnpack_wrappers/cs16-fftr/cs16-fftr-neon-x4.c",
+    "xnnpack_wrappers/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x4.c",
+    "xnnpack_wrappers/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x8.c",
+    "xnnpack_wrappers/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x12.c",
+    "xnnpack_wrappers/cs16-vsquareabs/gen/cs16-vsquareabs-neon-mlal-ld128-x16.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-x8.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-x16.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-x24.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-x32.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-x8.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-x16.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-x24.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-x32.c",
+    "xnnpack_wrappers/f32-argmaxpool/f32-argmaxpool-4x-neon-c4.c",
+    "xnnpack_wrappers/f32-argmaxpool/f32-argmaxpool-9p8x-neon-c4.c",
+    "xnnpack_wrappers/f32-argmaxpool/f32-argmaxpool-9x-neon-c4.c",
+    "xnnpack_wrappers/f32-avgpool/f32-avgpool-9p8x-minmax-neon-c4.c",
+    "xnnpack_wrappers/f32-avgpool/f32-avgpool-9x-minmax-neon-c4.c",
+    "xnnpack_wrappers/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-neon-2x2.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x4-neon-2x1.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x4-neon-2x2.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x8-neon-2x1.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x8-neon-2x2.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x4-neon-2x1.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x4-neon-2x2.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x8-neon-2x1.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x8-neon-2x2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-1x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-1x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-1x4-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-1x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-2x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-2x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-3x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-4x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-5x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-neon-6x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-1x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-1x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-1x4-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-1x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-2x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-2x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-3x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-neon-4x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-1x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-1x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-1x4-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-1x4-acc5.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-1x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-2x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-2x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-2x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-3x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-3x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-4x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-4x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-5x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-1x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-1x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-1x4-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-1x4-acc5.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-1x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-2x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-2x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-2x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-3x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-3x4.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p4c-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p4c-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p8c-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p8c-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p16c-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p16c-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p4c-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p4c-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p8c-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p8c-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p16c-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p16c-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p4c-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p4c-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p8c-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p8c-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p16c-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p16c-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p4c-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p4c-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p8c-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p8c-minmax-neon.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p16c-minmax-neon-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p16c-minmax-neon.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-neon-x8.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-neon-x16.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-neon-x24.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-neon-x32.c",
+    "xnnpack_wrappers/f32-gavgpool-cw/f32-gavgpool-cw-neon-x4.c",
+    "xnnpack_wrappers/f32-gavgpool/f32-gavgpool-7p7x-minmax-neon-c4.c",
+    "xnnpack_wrappers/f32-gavgpool/f32-gavgpool-7x-minmax-neon-c4.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-neon-dup-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8s4-minmax-neon.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-neon-dup-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-neon-dup-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-neon-lane-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8s4-minmax-neon.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x2-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-neon-dup-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-neon-dup-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-neon-lane-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8s4-minmax-neon.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-8x8s4-minmax-neon.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8-minmax-neon-dup-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8s4-minmax-neon.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-neon-dup-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-neon-dup-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-neon-lane-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8s4-minmax-neon.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x8-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-neon-dup-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-neon-dup-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-neon-lane-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8s4-minmax-neon.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-8x8s4-minmax-neon.c",
+    "xnnpack_wrappers/f32-ibilinear-chw/gen/f32-ibilinear-chw-neon-p4.c",
+    "xnnpack_wrappers/f32-ibilinear-chw/gen/f32-ibilinear-chw-neon-p8.c",
+    "xnnpack_wrappers/f32-ibilinear-chw/gen/f32-ibilinear-chw-neon-p16.c",
+    "xnnpack_wrappers/f32-ibilinear/gen/f32-ibilinear-neon-c4.c",
+    "xnnpack_wrappers/f32-ibilinear/gen/f32-ibilinear-neon-c8.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-neon-dup-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8s4-minmax-neon.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x4-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-neon-dup-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-neon-dup-ld128.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-neon-lane-ld128.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8s4-minmax-neon.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x2-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-neon-dup-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-neon-dup-ld128.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-neon-lane-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-neon-lane-ld128.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8s4-minmax-neon.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-8x8s4-minmax-neon.c",
+    "xnnpack_wrappers/f32-maxpool/f32-maxpool-9p8x-minmax-neon-c4.c",
+    "xnnpack_wrappers/f32-pavgpool/f32-pavgpool-9p8x-minmax-neon-c4.c",
+    "xnnpack_wrappers/f32-pavgpool/f32-pavgpool-9x-minmax-neon-c4.c",
+    "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-neon.c",
+    "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-neon.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-neon-1x4.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-neon-1x8.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-neon-1x16.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-neon-2x4.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-neon-2x8.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-neon-2x16.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-neon-4x4.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-neon-4x8.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-neon-4x16.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-x8.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-x16.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-x24.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-neon-x32.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-x8.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-x16.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-x24.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-neon-x32.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x8-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x8.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x12-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x12-acc3.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x12.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x16-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x16-acc4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x16.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x20-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x20-acc5.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-lut64-p2-x20.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x8-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x8.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x12-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x12-acc3.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x12.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x16-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x16-acc4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x16.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x20-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x20-acc5.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neon-rr2-p5-x20.c",
+    "xnnpack_wrappers/f32-rmax/f32-rmax-neon.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-4x1-minmax-neon-pipelined.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-4x1-minmax-neon-x2.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-4x1-minmax-neon.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-8x1-minmax-neon-pipelined.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-8x1-minmax-neon-x2.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-8x1-minmax-neon.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-12x1-minmax-neon.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-16x1-minmax-neon-pipelined.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-16x1-minmax-neon-x2.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-16x1-minmax-neon.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-32x1-minmax-neon-pipelined.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-32x1-minmax-neon-x2.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-32x1-minmax-neon.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-minmax-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-minmax-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-minmax-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-minmax-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmax-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmax-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmaxc-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmaxc-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmin-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmin-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vminc-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vminc-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-minmax-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-minmax-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-minmax-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-minmax-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-minmax-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-minmax-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiff-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiff-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiffc-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiffc-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-minmax-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-minmax-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-minmax-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-minmax-neon-x8.c",
+    "xnnpack_wrappers/f32-vclamp/gen/f32-vclamp-neon-x4.c",
+    "xnnpack_wrappers/f32-vclamp/gen/f32-vclamp-neon-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neon-rr2-lut16-p3-x4.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neon-rr2-lut16-p3-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neon-rr2-lut16-p3-x12.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neon-rr2-lut16-p3-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neon-rr2-lut16-p3-x20.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neon-rr2-lut16-p3-x24.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neon-rr2-p6-x4.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neon-rr2-p6-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neon-rr2-p6-x12.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neon-rr2-p6-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neon-rr2-p6-x20.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neon-rr2-p6-x24.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-neon-x4.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-neon-x8.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-neon-x16.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-neon-x4.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-neon-x8.c",
+    "xnnpack_wrappers/f32-vmulcaddc/gen/f32-vmulcaddc-c4-minmax-neon-2x.c",
+    "xnnpack_wrappers/f32-vmulcaddc/gen/f32-vmulcaddc-c8-minmax-neon-2x.c",
+    "xnnpack_wrappers/f32-vrelu/gen/f32-vrelu-neon-x4.c",
+    "xnnpack_wrappers/f32-vrelu/gen/f32-vrelu-neon-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-neon-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-neon-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-neon-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-neon-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-neon-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-neon-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-neon-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-neon-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut64-p2-nr2recps-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut64-p2-nr2recps-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut64-p2-nr2recps-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut64-p2-nr2recps-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut64-p2-nr2recps-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut64-p2-nr2recps-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut2048-p1-nr2recps-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut2048-p1-nr2recps-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut2048-p1-nr2recps-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut2048-p1-nr2recps-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut2048-p1-nr2recps-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-lut2048-p1-nr2recps-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neon-rr2-p5-nr2recps-x24.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vabs-neon-x4.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vabs-neon-x8.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vneg-neon-x4.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vneg-neon-x8.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vsqr-neon-x4.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vsqr-neon-x8.c",
+    "xnnpack_wrappers/i16-vlshift/gen/i16-vlshift-neon-x8.c",
+    "xnnpack_wrappers/i16-vlshift/gen/i16-vlshift-neon-x16.c",
+    "xnnpack_wrappers/i16-vlshift/gen/i16-vlshift-neon-x24.c",
+    "xnnpack_wrappers/i16-vlshift/gen/i16-vlshift-neon-x32.c",
+    "xnnpack_wrappers/math/cvt-f16-f32-neon-int16.c",
+    "xnnpack_wrappers/math/cvt-f16-f32-neon-int32.c",
+    "xnnpack_wrappers/math/cvt-f32-f16-neon.c",
+    "xnnpack_wrappers/math/cvt-f32-qs8-neon.c",
+    "xnnpack_wrappers/math/cvt-f32-qu8-neon.c",
+    "xnnpack_wrappers/math/expm1minus-f32-neon-rr2-lut16-p3.c",
+    "xnnpack_wrappers/math/expm1minus-f32-neon-rr2-p6.c",
+    "xnnpack_wrappers/math/roundd-neon-addsub.c",
+    "xnnpack_wrappers/math/roundd-neon-cvt.c",
+    "xnnpack_wrappers/math/roundne-neon-addsub.c",
+    "xnnpack_wrappers/math/roundu-neon-addsub.c",
+    "xnnpack_wrappers/math/roundu-neon-cvt.c",
+    "xnnpack_wrappers/math/roundz-neon-addsub.c",
+    "xnnpack_wrappers/math/roundz-neon-cvt.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neon-rr2-lut64-p2-nr2recps.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neon-rr2-lut2048-p1-nr2recps.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neon-rr2-p5-nr2recps.c",
+    "xnnpack_wrappers/math/sqrt-f32-neon-nr1rsqrts.c",
+    "xnnpack_wrappers/math/sqrt-f32-neon-nr2rsqrts.c",
+    "xnnpack_wrappers/math/sqrt-f32-neon-nr3rsqrts.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p8c-minmax-fp32-neon-mla8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-neon-mla8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-neon-mla8-ld128.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-neon-mla8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-neon-mul8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neon-mla8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neon-mla8-ld128.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neon-mul8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neon-mul8-ld128.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-neon-mla8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-neon-mul8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neon-mla8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neon-mla8-ld128.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neon-mul8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neon-mul8-ld128.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c2s4-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c4s2-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c2s4-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c4-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c4s2-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-6x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-6x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-6x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-6x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c2s4-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c4s2-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c2s4-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c4-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c4s2-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-6x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-6x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-6x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-6x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-rndnu-neon-mla8-ld64.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-rndnu-neon-mul8-ld64.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-rndnu-neon-mla8-ld64.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-rndnu-neon-mla8-ld128.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-rndnu-neon-mul8-ld64.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-rndnu-neon-mul8-ld128.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-rndnu-neon-mla8-ld64.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-rndnu-neon-mul8-ld64.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-rndnu-neon-mla8-ld64.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-rndnu-neon-mla8-ld128.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-rndnu-neon-mul8-ld64.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-rndnu-neon-mul8-ld128.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-x8.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-x16.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-x24.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-x32.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c8.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c16.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c24.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c32.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c8.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c16.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c24.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c32.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c8.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c16.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c24.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c32.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c8.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c16.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c24.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c32.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2s4-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4s2-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2s4-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4s2-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x16c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x8c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x16c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-6x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-6x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-6x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-6x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2s4-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4s2-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2s4-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-fp32-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-fp32-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-fp32-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4s2-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x16c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x8c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x16c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-rndnu-neon-mull-addw-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c2-minmax-rndnu-neon-mull-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c2s4-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c2s4-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neon-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neon-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neon-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neon-mull-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neon-mull-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neon-mull-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4s2-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4s2-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c8-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c8-minmax-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c16-minmax-rndnu-neon-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-6x8-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-6x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-6x16-minmax-rndnu-neon-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-6x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-fp32-neon.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-gemmlowp-neon.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-rndna-neon.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-rndnu-neon-mull.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-rndnu-neon-qdmulh.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-x24.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-x32.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-neon-ld128-x16.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-neon-ld128-x32.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-neon-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-neon-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-neon-ld64-x24.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-neon-ld64-x32.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-neon-ld128-x16.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-neon-ld128-x32.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-neon-x8.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-neon-x16.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-neon-x32.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-neon-x8.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-neon-x16.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-neon-x32.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-neon-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-neon-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-neon-ld128-x16.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-rndnu-neon-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-rndnu-neon-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-rndnu-neon-ld128-x16.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-neon-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-neon-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-neon-ld128-x16.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-rndnu-neon-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-rndnu-neon-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-rndnu-neon-ld128-x16.c",
+    "xnnpack_wrappers/qu8-avgpool/qu8-avgpool-9p8x-minmax-fp32-neon-c8.c",
+    "xnnpack_wrappers/qu8-avgpool/qu8-avgpool-9x-minmax-fp32-neon-c8.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-rndnu-neon-mul8.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-rndnu-neon-mul8.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p24c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p24c-minmax-rndnu-neon-mul8.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p24c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p32c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p32c-minmax-rndnu-neon-mul8.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p32c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-rndnu-neon-mul8.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-rndnu-neon-mul8.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p24c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p24c-minmax-rndnu-neon-mul8.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p24c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-fp32-neon-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-rndnu-neon-mul8.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-rndnu-neon-mul16.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-x8.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-x16.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-x24.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-x32.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c8.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c16.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c24.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c32.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c8.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c16.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c24.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c32.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c8.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c16.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c24.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c32.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c8.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c16.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c24.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c32.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-6x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-6x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-fp32-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-6x8-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-6x16-minmax-rndnu-neon-mlal-lane.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-fp32-neon.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-gemmlowp-neon.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-rndna-neon.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-neon-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-neon-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-neon-ld64-x32.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-neon-ld128-x16.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-neon-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-neon-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-neon-ld64-x32.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-neon-ld128-x16.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-neon-x8.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-neon-x16.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-neon-x32.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-neon-x8.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-neon-x16.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-neon-x32.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-neon-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-neon-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-neon-ld128-x16.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-rndnu-neon-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-rndnu-neon-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-rndnu-neon-ld128-x16.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-neon-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-neon-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-neon-ld128-x16.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-rndnu-neon-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-rndnu-neon-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-rndnu-neon-ld128-x16.c",
+    "xnnpack_wrappers/s8-ibilinear/gen/s8-ibilinear-neon-c8.c",
+    "xnnpack_wrappers/s8-ibilinear/gen/s8-ibilinear-neon-c16.c",
+    "xnnpack_wrappers/s8-maxpool/s8-maxpool-2p2x-minmax-neon-c16.c",
+    "xnnpack_wrappers/s8-maxpool/s8-maxpool-4p3x-minmax-neon-c16.c",
+    "xnnpack_wrappers/s8-maxpool/s8-maxpool-9p8x-minmax-neon-c16.c",
+    "xnnpack_wrappers/s8-vclamp/s8-vclamp-neon-x64.c",
+    "xnnpack_wrappers/s16-rmaxabs/gen/s16-rmaxabs-neon-x8.c",
+    "xnnpack_wrappers/s16-rmaxabs/gen/s16-rmaxabs-neon-x16.c",
+    "xnnpack_wrappers/s16-rmaxabs/gen/s16-rmaxabs-neon-x24.c",
+    "xnnpack_wrappers/s16-rmaxabs/gen/s16-rmaxabs-neon-x32.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-neon-x8.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-neon-x16.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-neon-x24.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-neon-x32.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-shift12-neon-x8.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-shift12-neon-x16.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-shift12-neon-x24.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-shift12-neon-x32.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-shift15-neon-x8.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-shift15-neon-x16.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-shift15-neon-x24.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-shift15-neon-x32.c",
+    "xnnpack_wrappers/u8-ibilinear/gen/u8-ibilinear-neon-c8.c",
+    "xnnpack_wrappers/u8-ibilinear/gen/u8-ibilinear-neon-c16.c",
+    "xnnpack_wrappers/u8-maxpool/u8-maxpool-9p8x-minmax-neon-c16.c",
+    "xnnpack_wrappers/u8-rmax/u8-rmax-neon.c",
+    "xnnpack_wrappers/u8-vclamp/u8-vclamp-neon-x64.c",
+    "xnnpack_wrappers/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x1.c",
+    "xnnpack_wrappers/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-neon-x2.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-8x8-multi-dec-zip-neon.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-8x8-multi-mov-zip-neon.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-8x8-multi-switch-zip-neon.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-8x8-reuse-dec-zip-neon.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-8x8-reuse-mov-zip-neon.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-8x8-reuse-multi-zip-neon.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-8x8-reuse-switch-zip-neon.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-16x16-reuse-dec-zip-neon.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-16x16-reuse-mov-zip-neon.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-16x16-reuse-switch-zip-neon.c",
+    "xnnpack_wrappers/x8-zip/x8-zip-x2-neon.c",
+    "xnnpack_wrappers/x8-zip/x8-zip-x3-neon.c",
+    "xnnpack_wrappers/x8-zip/x8-zip-x4-neon.c",
+    "xnnpack_wrappers/x8-zip/x8-zip-xm-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-4x4-multi-dec-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-4x4-multi-mov-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-4x4-multi-multi-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-4x4-multi-switch-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-4x4-reuse-dec-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-4x4-reuse-mov-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-4x4-reuse-multi-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-4x4-reuse-switch-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-8x8-multi-dec-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-8x8-multi-mov-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-8x8-multi-switch-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-8x8-reuse-dec-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-8x8-reuse-mov-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-8x8-reuse-multi-zip-neon.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-8x8-reuse-switch-zip-neon.c",
+    "xnnpack_wrappers/x24-transposec/x24-transposec-2x2-neon-tbl64.c",
+    "xnnpack_wrappers/x32-packx/x32-packx-x4-neon-st4.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x2-multi-dec-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x2-multi-mov-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x2-multi-multi-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x2-multi-switch-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x2-reuse-dec-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x2-reuse-mov-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x2-reuse-multi-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x2-reuse-switch-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-multi-dec-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-multi-mov-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-multi-multi-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-multi-switch-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-reuse-dec-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-reuse-mov-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-reuse-multi-zip-neon.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-reuse-switch-zip-neon.c",
+    "xnnpack_wrappers/x32-unpool/x32-unpool-neon.c",
+    "xnnpack_wrappers/x32-zip/x32-zip-x2-neon.c",
+    "xnnpack_wrappers/x32-zip/x32-zip-x3-neon.c",
+    "xnnpack_wrappers/x32-zip/x32-zip-x4-neon.c",
+    "xnnpack_wrappers/x32-zip/x32-zip-xm-neon.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-multi-dec-zip-neon.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-multi-mov-zip-neon.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-multi-multi-zip-neon.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-multi-switch-zip-neon.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-reuse-dec-zip-neon.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-reuse-mov-zip-neon.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-reuse-multi-zip-neon.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-reuse-switch-zip-neon.c",
+    "xnnpack_wrappers/xx-fill/xx-fill-neon-x64.c",
+    "xnnpack_wrappers/xx-pad/xx-pad-neon.c",
+]
+
+ALL_NEON_AARCH64_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-minmax-aarch64-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-minmax-aarch64-neon-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-minmax-aarch64-neon-x8.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-aarch64-neon-sqrt-x4.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-aarch64-neon-sqrt-x8.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-aarch64-neon-tbx128x4-x16.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-aarch64-neon-tbx128x4-x32.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-aarch64-neon-tbx128x4-x48.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-aarch64-neon-tbx128x4-x64.c",
+    "xnnpack_wrappers/x24-transposec/x24-transposec-4x4-aarch64-neon-tbl128.c",
+    "xnnpack_wrappers/x32-transposec/x32-transposec-4x4-aarch64-neon-tbl128.c",
+]
+
+ALL_NEONBF16_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-1x4c8-minmax-neonbf16-bfdot.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-1x4c8-minmax-neonbf16-bfmlal.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-1x8c2-minmax-neonbf16-bfdot-lane-ld128.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-2x4c8-minmax-neonbf16-bfdot.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-2x4c8-minmax-neonbf16-bfmlal.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-3x4c8-minmax-neonbf16-bfdot.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-3x4c8-minmax-neonbf16-bfmlal.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-4x4c8-minmax-neonbf16-bfdot.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-4x4c8-minmax-neonbf16-bfmlal.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-4x8c2-minmax-neonbf16-bfdot-lane-ld128.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-5x4c8-minmax-neonbf16-bfdot.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-5x4c8-minmax-neonbf16-bfmlal.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-5x8c2-minmax-neonbf16-bfdot-lane-ld128.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-6x8c2-minmax-neonbf16-bfdot-lane-ld128.c",
+]
+
+ALL_NEONDOT_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-6x8c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-6x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-8x8c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-8x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-6x8c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-6x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-8x8c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-8x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-6x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-6x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-8x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-8x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-6x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-6x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-8x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-8x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x32c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x32c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x32c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-5x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-5x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-6x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-6x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-8x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-8x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x32c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x32c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x32c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-5x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-5x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-6x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-6x16c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-8x8c4-minmax-rndnu-neondot.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-8x16c4-minmax-rndnu-neondot.c",
+]
+
+ALL_NEONFMA_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-1x4c8-minmax-neonfma-shland.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-1x4c8-minmax-neonfma-zip.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-2x4c8-minmax-neonfma-shland.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-2x4c8-minmax-neonfma-zip.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-3x4c8-minmax-neonfma-shland.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-3x4c8-minmax-neonfma-zip.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-4x4c8-minmax-neonfma-shland.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-4x4c8-minmax-neonfma-zip.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-5x4c8-minmax-neonfma-shland.c",
+    "xnnpack_wrappers/bf16-gemm/gen/bf16-gemm-5x4c8-minmax-neonfma-zip.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l4c4s4r-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l8c4s4r-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-2f2m2l16c4s4r-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p4c-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p4c-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p8c-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p8c-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p16c-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p16c-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p4c-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p4c-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p8c-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p8c-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p16c-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p16c-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p4c-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p4c-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p8c-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p8c-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p16c-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p16c-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p4c-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p4c-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p8c-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p8c-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p16c-minmax-neonfma-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p16c-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-neonfma-dup-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8s4-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-neonfma-dup-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-neonfma-dup-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8s4-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-neonfma-dup-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-neonfma-dup-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8s4-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-8x8s4-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8-minmax-neonfma-dup-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8s4-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-neonfma-dup-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-neonfma-dup-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8s4-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-neonfma-dup-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-neonfma-dup-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8s4-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-8x8s4-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-ibilinear-chw/gen/f32-ibilinear-chw-neonfma-p4.c",
+    "xnnpack_wrappers/f32-ibilinear-chw/gen/f32-ibilinear-chw-neonfma-p8.c",
+    "xnnpack_wrappers/f32-ibilinear-chw/gen/f32-ibilinear-chw-neonfma-p16.c",
+    "xnnpack_wrappers/f32-ibilinear/gen/f32-ibilinear-neonfma-c4.c",
+    "xnnpack_wrappers/f32-ibilinear/gen/f32-ibilinear-neonfma-c8.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-neonfma-dup-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8s4-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-neonfma-dup-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-neonfma-dup-ld128.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8s4-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-neonfma-dup-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-neonfma-dup-ld128.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8s4-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-8x8s4-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-8x8-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x8-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x8.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x12-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x12-acc3.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x12.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x16-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x16-acc4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x16.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x20-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x20-acc5.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-lut64-p2-x20.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x8-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x8.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x12-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x12-acc3.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x12.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x16-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x16-acc4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x16.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x20-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x20-acc5.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-neonfma-rr1-p5-x20.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-4x1-minmax-neonfma-pipelined.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-4x1-minmax-neonfma-x2.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-4x1-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-8x1-minmax-neonfma-pipelined.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-8x1-minmax-neonfma-x2.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-8x1-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-12x1-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-16x1-minmax-neonfma-pipelined.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-16x1-minmax-neonfma-x2.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-16x1-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-32x1-minmax-neonfma-pipelined.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-32x1-minmax-neonfma-x2.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-32x1-minmax-neonfma.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-x4.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-x12.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-x20.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neonfma-rr1-lut16-p3-x24.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neonfma-rr1-p6-x4.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neonfma-rr1-p6-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neonfma-rr1-p6-x12.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neonfma-rr1-p6-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neonfma-rr1-p6-x20.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-neonfma-rr1-p6-x24.c",
+    "xnnpack_wrappers/f32-vmulcaddc/gen/f32-vmulcaddc-c4-minmax-neonfma-2x.c",
+    "xnnpack_wrappers/f32-vmulcaddc/gen/f32-vmulcaddc-c8-minmax-neonfma-2x.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr1recps1fma-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr1recps1fma-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr1recps1fma-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr1recps1fma-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr1recps1fma-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr1recps1fma-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2fma-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2fma-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2fma-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2fma-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2fma-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2fma-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr1recps1fma-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr1recps1fma-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr1recps1fma-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr1recps1fma-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr1recps1fma-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr1recps1fma-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2fma-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2fma-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2fma-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2fma-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2fma-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2fma-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2recps-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2recps-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2recps-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2recps-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2recps-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-lut2048-p1-nr2recps-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr1recps1fma-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr1recps1fma-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr1recps1fma-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr1recps1fma-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr1recps1fma-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr1recps1fma-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2fma-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2fma-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2fma-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2fma-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2fma-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2fma-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2recps-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2recps-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2recps-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2recps-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2recps-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-neonfma-rr1-p5-nr2recps-x24.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x4.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x8.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x12.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x16.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x20.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x24.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x28.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x32.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x36.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr1rsqrts1fma1adj-x40.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x4.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x8.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x12.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x16.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x20.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x24.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x28.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x32.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x36.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-neonfma-nr2fma1adj-x40.c",
+    "xnnpack_wrappers/math/exp-f32-neonfma-rr2-lut64-p2.c",
+    "xnnpack_wrappers/math/exp-f32-neonfma-rr2-p5.c",
+    "xnnpack_wrappers/math/expm1minus-f32-neonfma-rr1-lut16-p3.c",
+    "xnnpack_wrappers/math/expm1minus-f32-neonfma-rr1-p6.c",
+    "xnnpack_wrappers/math/expminus-f32-neonfma-rr2-lut64-p2.c",
+    "xnnpack_wrappers/math/expminus-f32-neonfma-rr2-lut2048-p1.c",
+    "xnnpack_wrappers/math/expminus-f32-neonfma-rr2-p5.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr1-lut64-p2-nr1recps1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr1-lut64-p2-nr2fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr1-lut64-p2-nr2recps.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr1-lut2048-p1-nr1recps1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr1-lut2048-p1-nr2fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr1-lut2048-p1-nr2recps.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr1-p5-nr1recps1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr1-p5-nr2fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr1-p5-nr2recps.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr2-lut64-p2-nr1recps1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr2-lut64-p2-nr2fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr2-lut64-p2-nr2recps.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr2-lut2048-p1-nr1recps1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr2-lut2048-p1-nr2fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr2-lut2048-p1-nr2recps.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr2-p5-nr1recps1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr2-p5-nr2fma.c",
+    "xnnpack_wrappers/math/sigmoid-f32-neonfma-rr2-p5-nr2recps.c",
+    "xnnpack_wrappers/math/sqrt-f32-neonfma-nr1fma.c",
+    "xnnpack_wrappers/math/sqrt-f32-neonfma-nr1rsqrts1fma1adj.c",
+    "xnnpack_wrappers/math/sqrt-f32-neonfma-nr2fma1adj.c",
+    "xnnpack_wrappers/math/sqrt-f32-neonfma-nr2fma.c",
+    "xnnpack_wrappers/math/sqrt-f32-neonfma-nr3fma.c",
+]
+
+ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-aarch64-neonfma-2x2.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x4-aarch64-neonfma-2x1.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x4-aarch64-neonfma-2x2.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x8-aarch64-neonfma-2x1.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p0p1c3x8-aarch64-neonfma-2x2.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x4-aarch64-neonfma-2x1.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x4-aarch64-neonfma-2x2.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x8-aarch64-neonfma-2x1.c",
+    "xnnpack_wrappers/f32-conv-hwc/gen/f32-conv-hwc-3x3s2p1c3x8-aarch64-neonfma-2x2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-1x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-1x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-1x4-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-1x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-2x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-2x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-3x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-4x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-5x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-aarch64-neonfma-6x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-1x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-1x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-1x4-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-1x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-2x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-2x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-3x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-aarch64-neonfma-4x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-1x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-1x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-1x4-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-1x4-acc5.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-1x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-2x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-2x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-2x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-3x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-3x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-4x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-4x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-aarch64-neonfma-5x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-1x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-1x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-1x4-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-1x4-acc5.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-1x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-2x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-2x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-2x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-3x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-aarch64-neonfma-3x4.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-aarch64-neonfma-lane-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x2-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-6x8-minmax-aarch64-neonfma-lane-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-aarch64-neonfma-lane-ld128.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-6x8-minmax-aarch64-neonfma-lane-ld128.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x4-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-aarch64-neonfma-lane-ld128.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x2-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-aarch64-neonfma-lane-ld64.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-6x8-minmax-aarch64-neonfma-lane-ld128.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-4x2-minmax-aarch64-neonfma.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-4x4-minmax-aarch64-neonfma.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-8x2-minmax-aarch64-neonfma.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-8x4-minmax-aarch64-neonfma.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-12x2-minmax-aarch64-neonfma.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-12x4-minmax-aarch64-neonfma.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-16x2-minmax-aarch64-neonfma.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-16x4-minmax-aarch64-neonfma.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-32x2-minmax-aarch64-neonfma.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-32x4-minmax-aarch64-neonfma.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut64-p2-div-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut64-p2-div-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut64-p2-div-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut64-p2-div-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut64-p2-div-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut64-p2-div-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut2048-p1-div-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut2048-p1-div-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut2048-p1-div-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut2048-p1-div-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut2048-p1-div-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-lut2048-p1-div-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-p5-div-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-p5-div-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-p5-div-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-p5-div-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-p5-div-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-aarch64-neonfma-rr1-p5-div-x24.c",
+    "xnnpack_wrappers/math/sigmoid-f32-aarch64-neonfma-rr1-lut64-p2-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-aarch64-neonfma-rr1-lut2048-p1-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-aarch64-neonfma-rr1-p5-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-aarch64-neonfma-rr2-lut64-p2-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-aarch64-neonfma-rr2-lut2048-p1-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-aarch64-neonfma-rr2-p5-div.c",
+    "xnnpack_wrappers/math/tanh-f32-aarch64-neonfma-rr1-p6-div.c",
+]
+
+ALL_NEONFP16_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-x8.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-x16.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-neonfp16-x8.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-neonfp16-x16.c",
+    "xnnpack_wrappers/math/cvt-f16-f32-neonfp16.c",
+    "xnnpack_wrappers/math/cvt-f32-f16-neonfp16.c",
+]
+
+ALL_NEONFP16ARITH_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f16-avgpool/f16-avgpool-9p8x-minmax-neonfp16arith-c8.c",
+    "xnnpack_wrappers/f16-avgpool/f16-avgpool-9x-minmax-neonfp16arith-c8.c",
+    "xnnpack_wrappers/f16-conv-hwc2chw/f16-conv-hwc2chw-3x3s2p1c3x4-neonfp16arith-2x2.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-1x8-acc2.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-1x8-acc3.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-1x8-acc4.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-1x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-2x8-acc2.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-2x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-3x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-4x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-5x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3p1-minmax-neonfp16arith-6x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-1x8-acc2.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-1x8-acc3.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-1x8-acc4.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-1x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-2x8-acc2.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-2x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-3x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-3x3s2p1-minmax-neonfp16arith-4x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-1x8-acc2.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-1x8-acc3.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-1x8-acc4.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-1x8-acc5.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-1x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-2x8-acc2.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-2x8-acc3.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-2x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-3x8-acc2.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-3x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-4x8-acc2.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-4x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-5x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-1x8-acc2.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-1x8-acc3.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-1x8-acc4.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-1x8-acc5.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-1x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-2x8-acc2.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-2x8-acc3.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-2x8.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-3x8-acc2.c",
+    "xnnpack_wrappers/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-3x8.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-3p8c-minmax-neonfp16arith-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-3p8c-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-3p16c-minmax-neonfp16arith-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-3p16c-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-3p32c-minmax-neonfp16arith-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-3p32c-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-4p8c-minmax-neonfp16arith-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-4p8c-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-4p16c-minmax-neonfp16arith-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-4p16c-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-4p32c-minmax-neonfp16arith-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-4p32c-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-9p8c-minmax-neonfp16arith-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-9p8c-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-9p16c-minmax-neonfp16arith-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-9p16c-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-9p32c-minmax-neonfp16arith-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-9p32c-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-25p8c-minmax-neonfp16arith-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-25p8c-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-25p16c-minmax-neonfp16arith-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-25p16c-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-25p32c-minmax-neonfp16arith-acc2.c",
+    "xnnpack_wrappers/f16-dwconv/gen/f16-dwconv-25p32c-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-gavgpool-cw/f16-gavgpool-cw-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c8.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c16.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c24.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c32.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c8.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c16.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c24.c",
+    "xnnpack_wrappers/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c32.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x8-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-1x16-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x8-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-4x16-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x8-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-6x16-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-8x8-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemm-8x16-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-1x8-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-1x16-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-4x8-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-4x16-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x8-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-6x16-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-8x8-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-gemm/gen/f16-gemminc-8x16-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-ibilinear-chw/gen/f16-ibilinear-chw-neonfp16arith-p4.c",
+    "xnnpack_wrappers/f16-ibilinear-chw/gen/f16-ibilinear-chw-neonfp16arith-p8.c",
+    "xnnpack_wrappers/f16-ibilinear-chw/gen/f16-ibilinear-chw-neonfp16arith-p16.c",
+    "xnnpack_wrappers/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c8.c",
+    "xnnpack_wrappers/f16-ibilinear/gen/f16-ibilinear-neonfp16arith-c16.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-1x8-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-1x16-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-4x8-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-4x16-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-6x8-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-6x16-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-8x8-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-igemm/gen/f16-igemm-8x16-minmax-neonfp16arith-ld64.c",
+    "xnnpack_wrappers/f16-maxpool/f16-maxpool-9p8x-minmax-neonfp16arith-c8.c",
+    "xnnpack_wrappers/f16-pavgpool/f16-pavgpool-9p8x-minmax-neonfp16arith-c8.c",
+    "xnnpack_wrappers/f16-pavgpool/f16-pavgpool-9x-minmax-neonfp16arith-c8.c",
+    "xnnpack_wrappers/f16-prelu/gen/f16-prelu-neonfp16arith-2x8.c",
+    "xnnpack_wrappers/f16-prelu/gen/f16-prelu-neonfp16arith-2x16.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x32-acc2.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x32-acc4.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x32.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x40-acc2.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x40-acc5.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x40.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x48-acc2.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x48-acc3.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x48.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x64-acc2.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x64-acc4.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x64.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x72-acc3.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x72.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x80-acc2.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x80-acc5.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x80.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x96-acc2.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x96-acc3.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x96-acc6.c",
+    "xnnpack_wrappers/f16-raddstoreexpminusmax/gen/f16-raddstoreexpminusmax-neonfp16arith-rr2-p2-x96.c",
+    "xnnpack_wrappers/f16-rmax/f16-rmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-spmm/gen/f16-spmm-8x1-minmax-neonfp16arith-pipelined.c",
+    "xnnpack_wrappers/f16-spmm/gen/f16-spmm-8x1-minmax-neonfp16arith-x2.c",
+    "xnnpack_wrappers/f16-spmm/gen/f16-spmm-8x1-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-spmm/gen/f16-spmm-16x1-minmax-neonfp16arith-pipelined.c",
+    "xnnpack_wrappers/f16-spmm/gen/f16-spmm-16x1-minmax-neonfp16arith-x2.c",
+    "xnnpack_wrappers/f16-spmm/gen/f16-spmm-16x1-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-spmm/gen/f16-spmm-24x1-minmax-neonfp16arith-pipelined.c",
+    "xnnpack_wrappers/f16-spmm/gen/f16-spmm-24x1-minmax-neonfp16arith-x2.c",
+    "xnnpack_wrappers/f16-spmm/gen/f16-spmm-24x1-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith-pipelined.c",
+    "xnnpack_wrappers/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith-x2.c",
+    "xnnpack_wrappers/f16-spmm/gen/f16-spmm-32x1-minmax-neonfp16arith.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vadd-minmax-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vaddc-minmax-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmax-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmax-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmaxc-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmaxc-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmin-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmin-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vminc-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vminc-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmul-minmax-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vmulc-minmax-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrsubc-minmax-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiff-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsqrdiffc-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsub-minmax-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vsubc-minmax-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vclamp/gen/f16-vclamp-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vclamp/gen/f16-vclamp-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-x8.c",
+    "xnnpack_wrappers/f16-velu/gen/f16-velu-neonfp16arith-rr1-p3-x16.c",
+    "xnnpack_wrappers/f16-vhswish/gen/f16-vhswish-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vhswish/gen/f16-vhswish-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vlrelu/gen/f16-vlrelu-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vmulcaddc/gen/f16-vmulcaddc-c8-minmax-neonfp16arith-2x.c",
+    "xnnpack_wrappers/f16-vmulcaddc/gen/f16-vmulcaddc-c16-minmax-neonfp16arith-2x.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndd-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndd-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndne-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndne-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndu-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndu-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndz-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vrnd/gen/f16-vrndz-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x8.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x16.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x24.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x32.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x40.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x48.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x56.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1fma-x64.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x8.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x16.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x24.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x32.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x40.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x48.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x56.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-neonfp16arith-rr2-p2-nr1recps-x64.c",
+    "xnnpack_wrappers/f16-vsqrt/gen/f16-vsqrt-neonfp16arith-nr1fma1adj-x8.c",
+    "xnnpack_wrappers/f16-vsqrt/gen/f16-vsqrt-neonfp16arith-nr1fma1adj-x16.c",
+    "xnnpack_wrappers/f16-vsqrt/gen/f16-vsqrt-neonfp16arith-nr1fma1adj-x24.c",
+    "xnnpack_wrappers/f16-vsqrt/gen/f16-vsqrt-neonfp16arith-nr1fma1adj-x32.c",
+    "xnnpack_wrappers/f16-vunary/gen/f16-vabs-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vunary/gen/f16-vabs-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vunary/gen/f16-vneg-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vunary/gen/f16-vneg-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vunary/gen/f16-vsqr-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vunary/gen/f16-vsqr-neonfp16arith-x16.c",
+    "xnnpack_wrappers/math/exp-f16-neonfp16arith-rr2-p3.c",
+    "xnnpack_wrappers/math/expm1minus-f16-neonfp16arith-rr1-p3.c",
+    "xnnpack_wrappers/math/expm1minus-f16-neonfp16arith-rr2-p3.c",
+    "xnnpack_wrappers/math/expminus-f16-neonfp16arith-rr1-p2.c",
+    "xnnpack_wrappers/math/expminus-f16-neonfp16arith-rr1-p3.c",
+    "xnnpack_wrappers/math/expminus-f16-neonfp16arith-rr2-p2.c",
+    "xnnpack_wrappers/math/expminus-f16-neonfp16arith-rr2-p3.c",
+    "xnnpack_wrappers/math/sigmoid-f16-neonfp16arith-rr2-p2-nr1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f16-neonfp16arith-rr2-p2-nr1recps.c",
+    "xnnpack_wrappers/math/sigmoid-f16-neonfp16arith-rr2-p2-recpe.c",
+    "xnnpack_wrappers/math/sigmoid-f16-neonfp16arith-rr2-p3-nr1fma.c",
+    "xnnpack_wrappers/math/sigmoid-f16-neonfp16arith-rr2-p3-nr1recps.c",
+    "xnnpack_wrappers/math/sigmoid-f16-neonfp16arith-rr2-p3-recpe.c",
+    "xnnpack_wrappers/math/sqrt-f16-neonfp16arith-nr1fma1adj.c",
+    "xnnpack_wrappers/math/sqrt-f16-neonfp16arith-nr1fma.c",
+    "xnnpack_wrappers/math/sqrt-f16-neonfp16arith-nr1rsqrts.c",
+]
+
+ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdiv-minmax-aarch64-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vdivc-minmax-aarch64-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-x8.c",
+    "xnnpack_wrappers/f16-vbinary/gen/f16-vrdivc-minmax-aarch64-neonfp16arith-x16.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x8.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x16.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x24.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x32.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x40.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x48.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x56.c",
+    "xnnpack_wrappers/f16-vsigmoid/gen/f16-vsigmoid-aarch64-neonfp16arith-rr2-p2-div-x64.c",
+    "xnnpack_wrappers/f16-vsqrt/gen/f16-vsqrt-aarch64-neonfp16arith-sqrt-x8.c",
+    "xnnpack_wrappers/f16-vsqrt/gen/f16-vsqrt-aarch64-neonfp16arith-sqrt-x16.c",
+    "xnnpack_wrappers/math/sigmoid-f16-aarch64-neonfp16arith-rr1-p2-div.c",
+    "xnnpack_wrappers/math/sigmoid-f16-aarch64-neonfp16arith-rr1-p3-div.c",
+    "xnnpack_wrappers/math/sigmoid-f16-aarch64-neonfp16arith-rr2-p2-div.c",
+    "xnnpack_wrappers/math/sigmoid-f16-aarch64-neonfp16arith-rr2-p3-div.c",
+    "xnnpack_wrappers/math/sqrt-f16-aarch64-neonfp16arith-sqrt.c",
+]
+
+ALL_NEONV8_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-x8.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-x16.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-x24.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-neonv8-x32.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-x8.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-x16.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-x24.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-neonv8-x32.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-neonv8-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-neonv8-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-neonv8-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-neonv8-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-neonv8-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-neonv8-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-neonv8-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-neonv8-x8.c",
+    "xnnpack_wrappers/math/cvt-f32-qs8-neonv8.c",
+    "xnnpack_wrappers/math/cvt-f32-qu8-neonv8.c",
+    "xnnpack_wrappers/math/roundd-neonv8.c",
+    "xnnpack_wrappers/math/roundne-neonv8.c",
+    "xnnpack_wrappers/math/roundu-neonv8.c",
+    "xnnpack_wrappers/math/roundz-neonv8.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p8c-minmax-fp32-neonv8-mla8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-neonv8-mla8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-neonv8-mla8-ld128.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-neonv8-mla8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-neonv8-mul8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neonv8-mla8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neonv8-mla8-ld128.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neonv8-mul8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neonv8-mul8-ld128.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p32c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-neonv8-mla8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-neonv8-mul8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neonv8-mla8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neonv8-mla8-ld128.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neonv8-mul8-ld64.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neonv8-mul8-ld128.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p32c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x8c8-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x8c8-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x8-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x8-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-6x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-6x8-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-6x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-6x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x8c8-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x8c8-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x8-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x8-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-6x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-6x8-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-6x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-6x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p32c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c8.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c16.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c24.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c32.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x8c8-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x8c8-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x8c8-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2-minmax-fp32-neonv8-mlal-ld4r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c2s4-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-fp32-neonv8-mlal-dup.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-fp32-neonv8-mlal-ld1r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c4s2-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x8c8-minmax-fp32-neonv8-mlal.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-neonv8-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-neonv8-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-neonv8-ld128-x16.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-neonv8-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-neonv8-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-neonv8-ld128-x16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p24c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p32c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p24c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-fp32-neonv8-mul16.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c8.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c16.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c24.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c32.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-neonv8-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-neonv8-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-neonv8-ld128-x16.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-neonv8-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-neonv8-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-neonv8-ld128-x16.c",
+]
+
+ALL_SCALAR_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples1-scalar.c",
+    "xnnpack_wrappers/cs16-bfly4/cs16-bfly4-samples4-scalar.c",
+    "xnnpack_wrappers/cs16-bfly4/gen/cs16-bfly4-scalar-x1.c",
+    "xnnpack_wrappers/cs16-bfly4/gen/cs16-bfly4-scalar-x2.c",
+    "xnnpack_wrappers/cs16-bfly4/gen/cs16-bfly4-scalar-x4.c",
+    "xnnpack_wrappers/cs16-fftr/gen/cs16-fftr-scalar-x1.c",
+    "xnnpack_wrappers/cs16-fftr/gen/cs16-fftr-scalar-x2.c",
+    "xnnpack_wrappers/cs16-fftr/gen/cs16-fftr-scalar-x4.c",
+    "xnnpack_wrappers/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x1.c",
+    "xnnpack_wrappers/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x2.c",
+    "xnnpack_wrappers/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x3.c",
+    "xnnpack_wrappers/cs16-vsquareabs/gen/cs16-vsquareabs-scalar-x4.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-x1.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-x2.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-x3.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-x4.c",
+    "xnnpack_wrappers/f32-argmaxpool/f32-argmaxpool-4x-scalar-c1.c",
+    "xnnpack_wrappers/f32-argmaxpool/f32-argmaxpool-9p8x-scalar-c1.c",
+    "xnnpack_wrappers/f32-argmaxpool/f32-argmaxpool-9x-scalar-c1.c",
+    "xnnpack_wrappers/f32-avgpool/f32-avgpool-9p8x-minmax-scalar-c1.c",
+    "xnnpack_wrappers/f32-avgpool/f32-avgpool-9x-minmax-scalar-c1.c",
+    "xnnpack_wrappers/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-scalar-1x1.c",
+    "xnnpack_wrappers/f32-conv-hwc/f32-conv-hwc-3x3s2p0p1c3x4-scalar-1x1.c",
+    "xnnpack_wrappers/f32-conv-hwc/f32-conv-hwc-3x3s2p1c3x4-scalar-1x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-1x1-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-1x1-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-1x1-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-1x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-2x1-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-2x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-3x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-4x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-5x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-scalar-6x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-1x1-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-1x1-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-1x1-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-1x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-2x1-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-2x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-3x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-scalar-4x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-1x1-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-1x1-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-1x1-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-1x1-acc5.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-1x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-2x1-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-2x1-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-2x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-3x1-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-scalar-3x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-1x1-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-1x1-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-1x1-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-1x1-acc5.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-1x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-2x1-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-2x1-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-2x1.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-3x1-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-3x1.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p1c-minmax-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p1c-minmax-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p1c-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p1c-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p2c-minmax-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p2c-minmax-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p2c-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p2c-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p1c-minmax-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p1c-minmax-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p1c-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p1c-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p2c-minmax-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p2c-minmax-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p2c-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p2c-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p1c-minmax-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p1c-minmax-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p1c-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p1c-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p2c-minmax-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p2c-minmax-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p2c-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p2c-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p1c-minmax-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p1c-minmax-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p1c-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p1c-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p2c-minmax-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p2c-minmax-scalar.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p2c-scalar-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p2c-scalar.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-bitcast-x1.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-bitcast-x2.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-bitcast-x3.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-bitcast-x4.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-fabsf-x1.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-fabsf-x2.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-fabsf-x3.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-fabsf-x4.c",
+    "xnnpack_wrappers/f32-gavgpool-cw/f32-gavgpool-cw-scalar-x1.c",
+    "xnnpack_wrappers/f32-gavgpool/f32-gavgpool-7p7x-minmax-scalar-c1.c",
+    "xnnpack_wrappers/f32-gavgpool/f32-gavgpool-7x-minmax-scalar-c1.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x4-minmax-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x4-relu-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x4-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-2x4-minmax-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-2x4-relu-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-2x4-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-minmax-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-relu-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x4-minmax-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x4-relu-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x4-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x4-minmax-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-2x4-minmax-scalar.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x4-minmax-scalar.c",
+    "xnnpack_wrappers/f32-ibilinear-chw/gen/f32-ibilinear-chw-scalar-p1.c",
+    "xnnpack_wrappers/f32-ibilinear-chw/gen/f32-ibilinear-chw-scalar-p2.c",
+    "xnnpack_wrappers/f32-ibilinear-chw/gen/f32-ibilinear-chw-scalar-p4.c",
+    "xnnpack_wrappers/f32-ibilinear/gen/f32-ibilinear-scalar-c1.c",
+    "xnnpack_wrappers/f32-ibilinear/gen/f32-ibilinear-scalar-c2.c",
+    "xnnpack_wrappers/f32-ibilinear/gen/f32-ibilinear-scalar-c4.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x4-minmax-scalar.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x4-relu-scalar.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x4-scalar.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-2x4-minmax-scalar.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-2x4-relu-scalar.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-2x4-scalar.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-minmax-scalar.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-relu-scalar.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2-scalar.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x4-minmax-scalar.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x4-relu-scalar.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x4-scalar.c",
+    "xnnpack_wrappers/f32-maxpool/f32-maxpool-9p8x-minmax-scalar-c1.c",
+    "xnnpack_wrappers/f32-pavgpool/f32-pavgpool-9p8x-minmax-scalar-c1.c",
+    "xnnpack_wrappers/f32-pavgpool/f32-pavgpool-9x-minmax-scalar-c1.c",
+    "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-2x4-minmax-scalar.c",
+    "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-3x3-minmax-scalar.c",
+    "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x2-minmax-scalar.c",
+    "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x4-minmax-scalar.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-scalar-2x1.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-scalar-2x4.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-x1.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-x2.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-x3.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-fmagic-x4.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-x1.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-x2.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-x3.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-imagic-x4.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-x1.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-x2.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-x3.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-scalar-lrintf-x4.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-x1.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-x2.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-x3.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-fmagic-x4.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-x1.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-x2.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-x3.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-imagic-x4.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-x1.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-x2.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-x3.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-scalar-lrintf-x4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-x1.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-x2-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-x2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-x4-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-x4-acc4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-lut64-p2-x4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-x1.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-x2-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-x2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-x4-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-x4-acc4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-scalar-rr2-p5-x4.c",
+    "xnnpack_wrappers/f32-rmax/f32-rmax-scalar.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-1x1-minmax-scalar-pipelined.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-1x1-minmax-scalar.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-2x1-minmax-scalar-pipelined.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-2x1-minmax-scalar.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-4x1-minmax-scalar-pipelined.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-4x1-minmax-scalar.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-8x1-minmax-scalar-pipelined.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-8x1-minmax-scalar.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-8x2-minmax-scalar.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-8x4-minmax-scalar.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-minmax-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-minmax-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-minmax-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-minmax-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-relu-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-relu-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-relu-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-relu-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-minmax-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-minmax-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-minmax-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-minmax-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-relu-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-relu-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-relu-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-relu-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-minmax-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-minmax-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-minmax-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-minmax-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-relu-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-relu-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-relu-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-relu-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-minmax-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-minmax-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-minmax-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-minmax-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-relu-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-relu-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-relu-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-relu-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmax-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmax-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmax-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmax-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmaxc-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmaxc-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmaxc-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmaxc-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmin-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmin-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmin-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmin-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vminc-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vminc-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vminc-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vminc-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-minmax-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-minmax-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-minmax-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-minmax-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-relu-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-relu-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-relu-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-relu-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-minmax-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-minmax-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-minmax-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-minmax-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-relu-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-relu-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-relu-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-relu-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-minmax-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-minmax-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-minmax-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-minmax-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-relu-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-relu-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-relu-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-relu-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-minmax-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-minmax-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-minmax-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-minmax-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-relu-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-relu-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-relu-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-relu-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiff-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiff-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiff-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiff-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiffc-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiffc-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiffc-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiffc-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-minmax-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-minmax-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-minmax-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-minmax-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-relu-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-relu-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-relu-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-relu-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-minmax-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-minmax-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-minmax-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-minmax-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-relu-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-relu-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-relu-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-relu-scalar-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-scalar-x1.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-scalar-x2.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-scalar-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-scalar-x8.c",
+    "xnnpack_wrappers/f32-vclamp/gen/f32-vclamp-scalar-x1.c",
+    "xnnpack_wrappers/f32-vclamp/gen/f32-vclamp-scalar-x2.c",
+    "xnnpack_wrappers/f32-vclamp/gen/f32-vclamp-scalar-x4.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-scalar-rr2-lut16-p3-x1.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-scalar-rr2-lut16-p3-x2.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-scalar-rr2-lut16-p3-x3.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-scalar-rr2-lut16-p3-x4.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-scalar-rr2-lut16-p3-x5.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-scalar-rr2-lut16-p3-x6.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-scalar-rr2-p6-x1.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-scalar-rr2-p6-x2.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-scalar-rr2-p6-x3.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-scalar-rr2-p6-x4.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-scalar-rr2-p6-x5.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-scalar-rr2-p6-x6.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-scalar-x1.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-scalar-x2.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-scalar-x4.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-scalar-x1.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-scalar-x2.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-scalar-x4.c",
+    "xnnpack_wrappers/f32-vmulcaddc/gen/f32-vmulcaddc-c1-minmax-scalar-2x.c",
+    "xnnpack_wrappers/f32-vmulcaddc/gen/f32-vmulcaddc-c2-minmax-scalar-2x.c",
+    "xnnpack_wrappers/f32-vmulcaddc/gen/f32-vmulcaddc-c4-minmax-scalar-2x.c",
+    "xnnpack_wrappers/f32-vrelu/gen/f32-vrelu-scalar-x1.c",
+    "xnnpack_wrappers/f32-vrelu/gen/f32-vrelu-scalar-x2.c",
+    "xnnpack_wrappers/f32-vrelu/gen/f32-vrelu-scalar-x4.c",
+    "xnnpack_wrappers/f32-vrelu/gen/f32-vrelu-scalar-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-scalar-libm-x1.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-scalar-libm-x2.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-scalar-libm-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-scalar-libm-x1.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-scalar-libm-x2.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-scalar-libm-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-scalar-libm-x1.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-scalar-libm-x2.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-scalar-libm-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-scalar-libm-x1.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-scalar-libm-x2.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-scalar-libm-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-lut64-p2-div-x1.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-lut64-p2-div-x2.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-lut64-p2-div-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-lut2048-p1-div-x1.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-lut2048-p1-div-x2.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-lut2048-p1-div-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-p5-div-x1.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-p5-div-x2.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-scalar-rr2-p5-div-x4.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-scalar-sqrt-x1.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-scalar-sqrt-x2.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-scalar-sqrt-x4.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vabs-scalar-x1.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vabs-scalar-x2.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vabs-scalar-x4.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vneg-scalar-x1.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vneg-scalar-x2.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vneg-scalar-x4.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vsqr-scalar-x1.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vsqr-scalar-x2.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vsqr-scalar-x4.c",
+    "xnnpack_wrappers/i16-vlshift/gen/i16-vlshift-scalar-x1.c",
+    "xnnpack_wrappers/i16-vlshift/gen/i16-vlshift-scalar-x2.c",
+    "xnnpack_wrappers/i16-vlshift/gen/i16-vlshift-scalar-x3.c",
+    "xnnpack_wrappers/i16-vlshift/gen/i16-vlshift-scalar-x4.c",
+    "xnnpack_wrappers/math/cvt-f32-f16-scalar-bitcast.c",
+    "xnnpack_wrappers/math/cvt-f32-f16-scalar-fabsf.c",
+    "xnnpack_wrappers/math/expm1minus-f32-scalar-rr2-lut4-p4.c",
+    "xnnpack_wrappers/math/expm1minus-f32-scalar-rr2-lut8-p3.c",
+    "xnnpack_wrappers/math/expm1minus-f32-scalar-rr2-lut8-p4.c",
+    "xnnpack_wrappers/math/expm1minus-f32-scalar-rr2-lut16-p3.c",
+    "xnnpack_wrappers/math/expm1minus-f32-scalar-rr2-lut16-p4.c",
+    "xnnpack_wrappers/math/expm1minus-f32-scalar-rr2-p5.c",
+    "xnnpack_wrappers/math/expm1minus-f32-scalar-rr2-p6.c",
+    "xnnpack_wrappers/math/expminus-f32-scalar-rr2-lut64-p2.c",
+    "xnnpack_wrappers/math/expminus-f32-scalar-rr2-lut2048-p1.c",
+    "xnnpack_wrappers/math/expminus-f32-scalar-rr2-p5.c",
+    "xnnpack_wrappers/math/roundd-scalar-addsub.c",
+    "xnnpack_wrappers/math/roundd-scalar-cvt.c",
+    "xnnpack_wrappers/math/roundd-scalar-floor.c",
+    "xnnpack_wrappers/math/roundne-scalar-addsub.c",
+    "xnnpack_wrappers/math/roundne-scalar-nearbyint.c",
+    "xnnpack_wrappers/math/roundne-scalar-rint.c",
+    "xnnpack_wrappers/math/roundu-scalar-addsub.c",
+    "xnnpack_wrappers/math/roundu-scalar-ceil.c",
+    "xnnpack_wrappers/math/roundu-scalar-cvt.c",
+    "xnnpack_wrappers/math/roundz-scalar-addsub.c",
+    "xnnpack_wrappers/math/roundz-scalar-cvt.c",
+    "xnnpack_wrappers/math/roundz-scalar-trunc.c",
+    "xnnpack_wrappers/math/sigmoid-f32-scalar-rr2-lut64-p2-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-scalar-rr2-lut2048-p1-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-scalar-rr2-p5-div.c",
+    "xnnpack_wrappers/math/sqrt-u32-scalar-bitmanip.c",
+    "xnnpack_wrappers/math/sqrt-u32-scalar-clz-binsearch.c",
+    "xnnpack_wrappers/math/sqrt-u32-scalar-clz-newton.c",
+    "xnnpack_wrappers/math/sqrt-u32-scalar-cvti32-sqrt-lrint.c",
+    "xnnpack_wrappers/math/sqrt-u32-scalar-cvti64-sqrt-lrint.c",
+    "xnnpack_wrappers/math/sqrt-u32-scalar-cvti64-sqrtf-lrintf.c",
+    "xnnpack_wrappers/math/sqrt-u32-scalar-cvtu32-sqrt-lrint.c",
+    "xnnpack_wrappers/math/sqrt-u32-scalar-cvtu32-sqrtf-lrintf.c",
+    "xnnpack_wrappers/math/sqrt-u32-scalar-hashemian.c",
+    "xnnpack_wrappers/math/sqrt-u32-scalar-tflm.c",
+    "xnnpack_wrappers/math/sqrt-u64-scalar-cvtu32-sqrt-cvtsatu32f64.c",
+    "xnnpack_wrappers/math/sqrt-u64-scalar-cvtu32-sqrt-llrint.c",
+    "xnnpack_wrappers/math/sqrt-u64-scalar-cvtu64-sqrt-llrint.c",
+    "xnnpack_wrappers/math/tanh-f32-scalar-rr1-p6-div.c",
+    "xnnpack_wrappers/math/tanh-f32-scalar-rr2-p6-div.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p1c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p2c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p2c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p1c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p2c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p4c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p1c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p2c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p4c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p1c-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p2c-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p4c-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-x1.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-x2.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-x3.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-x4.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-gemmlowp-scalar.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-rndna-scalar-signed64.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-rndnu-scalar.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-scalar-x1.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-scalar-x2.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-scalar-x4.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-scalar-x1.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-scalar-x2.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-scalar-x4.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-scalar-x1.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-scalar-x2.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-scalar-x4.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-x1.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-x2.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-scalar-andxor-x4.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-x1.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-x2.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-scalar-select-x4.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-x1.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-x2.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-scalar-x4.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-x1.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-x2.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-scalar-x4.c",
+    "xnnpack_wrappers/qu8-avgpool/qu8-avgpool-9p8x-minmax-fp32-scalar-imagic-c1.c",
+    "xnnpack_wrappers/qu8-avgpool/qu8-avgpool-9x-minmax-fp32-scalar-imagic-c1.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p1c-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p2c-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p4c-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p1c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-x1.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-x2.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-x3.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-x4.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x2-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x2-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x2-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x2-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4-minmax-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4-minmax-fp32-scalar-imagic.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4-minmax-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4-minmax-rndnu-scalar.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-fp32-scalar-fmagic.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-fp32-scalar-lrintf.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-gemmlowp-scalar.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-rndna-scalar-signed64.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-rndna-scalar-unsigned32.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-rndna-scalar-unsigned64.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-scalar-x1.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-scalar-x2.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-scalar-x4.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-scalar-x1.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-scalar-x2.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-scalar-x4.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-scalar-x1.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-scalar-x2.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-scalar-x4.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-x1.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-x2.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-scalar-andxor-x4.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-x1.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-x2.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-scalar-select-x4.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-x1.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-x2.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-scalar-x4.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-x1.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-x2.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-scalar-x4.c",
+    "xnnpack_wrappers/s8-ibilinear/gen/s8-ibilinear-scalar-c1.c",
+    "xnnpack_wrappers/s8-ibilinear/gen/s8-ibilinear-scalar-c2.c",
+    "xnnpack_wrappers/s8-ibilinear/gen/s8-ibilinear-scalar-c4.c",
+    "xnnpack_wrappers/s8-maxpool/s8-maxpool-9p8x-minmax-scalar-c1.c",
+    "xnnpack_wrappers/s8-vclamp/s8-vclamp-scalar-x4.c",
+    "xnnpack_wrappers/s16-rmaxabs/gen/s16-rmaxabs-scalar-x1.c",
+    "xnnpack_wrappers/s16-rmaxabs/gen/s16-rmaxabs-scalar-x2.c",
+    "xnnpack_wrappers/s16-rmaxabs/gen/s16-rmaxabs-scalar-x3.c",
+    "xnnpack_wrappers/s16-rmaxabs/gen/s16-rmaxabs-scalar-x4.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-scalar-x1.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-scalar-x2.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-scalar-x3.c",
+    "xnnpack_wrappers/s16-window/gen/s16-window-scalar-x4.c",
+    "xnnpack_wrappers/u8-ibilinear/gen/u8-ibilinear-scalar-c1.c",
+    "xnnpack_wrappers/u8-ibilinear/gen/u8-ibilinear-scalar-c2.c",
+    "xnnpack_wrappers/u8-ibilinear/gen/u8-ibilinear-scalar-c4.c",
+    "xnnpack_wrappers/u8-lut32norm/u8-lut32norm-scalar.c",
+    "xnnpack_wrappers/u8-maxpool/u8-maxpool-9p8x-minmax-scalar-c1.c",
+    "xnnpack_wrappers/u8-rmax/u8-rmax-scalar.c",
+    "xnnpack_wrappers/u8-vclamp/u8-vclamp-scalar-x4.c",
+    "xnnpack_wrappers/u32-filterbank-accumulate/gen/u32-filterbank-accumulate-scalar-x1.c",
+    "xnnpack_wrappers/u32-filterbank-subtract/u32-filterbank-subtract-scalar-x2.c",
+    "xnnpack_wrappers/u32-vlog/gen/u32-vlog-scalar-x1.c",
+    "xnnpack_wrappers/u32-vlog/gen/u32-vlog-scalar-x2.c",
+    "xnnpack_wrappers/u32-vlog/gen/u32-vlog-scalar-x3.c",
+    "xnnpack_wrappers/u32-vlog/gen/u32-vlog-scalar-x4.c",
+    "xnnpack_wrappers/u64-u32-vsqrtshift/u64-u32-vsqrtshift-scalar-cvtu32-sqrt-cvtu32f64-x1.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-scalar-x1.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-scalar-x2.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-scalar-x4.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-scalar-x8.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-scalar-x16.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-1x2-scalar-int.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-1x4-scalar-int.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-2x1-scalar-int.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-2x2-scalar-int.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-2x4-scalar-int.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-4x1-scalar-int.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-4x2-scalar-int.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-4x4-scalar-int.c",
+    "xnnpack_wrappers/x8-zip/x8-zip-x2-scalar.c",
+    "xnnpack_wrappers/x8-zip/x8-zip-x3-scalar.c",
+    "xnnpack_wrappers/x8-zip/x8-zip-x4-scalar.c",
+    "xnnpack_wrappers/x8-zip/x8-zip-xm-scalar.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-1x2-scalar-int.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-1x4-scalar-int.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-2x1-scalar-int.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-2x2-scalar-int.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-2x4-scalar-int.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-4x1-scalar-int.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-4x2-scalar-int.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-4x4-scalar-int.c",
+    "xnnpack_wrappers/x24-transposec/gen/x24-transposec-1x2-scalar.c",
+    "xnnpack_wrappers/x24-transposec/gen/x24-transposec-1x4-scalar.c",
+    "xnnpack_wrappers/x24-transposec/gen/x24-transposec-2x1-scalar.c",
+    "xnnpack_wrappers/x24-transposec/gen/x24-transposec-2x2-scalar.c",
+    "xnnpack_wrappers/x24-transposec/gen/x24-transposec-2x4-scalar.c",
+    "xnnpack_wrappers/x24-transposec/gen/x24-transposec-4x1-scalar.c",
+    "xnnpack_wrappers/x24-transposec/gen/x24-transposec-4x2-scalar.c",
+    "xnnpack_wrappers/x24-transposec/gen/x24-transposec-4x4-scalar.c",
+    "xnnpack_wrappers/x32-packx/x32-packx-x2-scalar.c",
+    "xnnpack_wrappers/x32-packx/x32-packx-x3-scalar.c",
+    "xnnpack_wrappers/x32-packx/x32-packx-x4-scalar.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-1x2-scalar-float.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-1x2-scalar-int.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-1x4-scalar-float.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-1x4-scalar-int.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x1-scalar-float.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x1-scalar-int.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x2-scalar-float.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x2-scalar-int.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x4-scalar-float.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-2x4-scalar-int.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x1-scalar-float.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x1-scalar-int.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x2-scalar-float.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x2-scalar-int.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-scalar-float.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-scalar-int.c",
+    "xnnpack_wrappers/x32-unpool/x32-unpool-scalar.c",
+    "xnnpack_wrappers/x32-zip/x32-zip-x2-scalar.c",
+    "xnnpack_wrappers/x32-zip/x32-zip-x3-scalar.c",
+    "xnnpack_wrappers/x32-zip/x32-zip-x4-scalar.c",
+    "xnnpack_wrappers/x32-zip/x32-zip-xm-scalar.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-1x2-scalar-float.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-1x2-scalar-int.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x1-scalar-float.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x1-scalar-int.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-scalar-float.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-scalar-int.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-4x1-scalar-float.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-4x1-scalar-int.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-4x2-scalar-float.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-4x2-scalar-int.c",
+    "xnnpack_wrappers/xx-copy/xx-copy-scalar-memcpy.c",
+    "xnnpack_wrappers/xx-fill/xx-fill-scalar-x16.c",
+    "xnnpack_wrappers/xx-pad/xx-pad-scalar.c",
+    "xnnpack_wrappers/xx-transpose/xx-transpose-1x1-scalar-memcpy.c",
+]
+
+ALL_SSE_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f32-avgpool/f32-avgpool-9p8x-minmax-sse-c4.c",
+    "xnnpack_wrappers/f32-avgpool/f32-avgpool-9x-minmax-sse-c4.c",
+    "xnnpack_wrappers/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-sse-1x1.c",
+    "xnnpack_wrappers/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x4-sse-2x2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-1x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-1x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-1x4-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-1x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-2x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-2x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-3x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-4x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-5x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-sse-6x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-1x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-1x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-1x4-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-1x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-2x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-2x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-3x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-4x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-1x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-1x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-1x4-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-1x4-acc5.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-1x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-2x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-2x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-2x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-3x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-3x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-4x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-4x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-5x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-1x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-1x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-1x4-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-1x4-acc5.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-1x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-2x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-2x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-2x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-3x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-3x4.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p4c-minmax-sse-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p4c-minmax-sse.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p8c-minmax-sse-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-3p8c-minmax-sse.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p4c-minmax-sse-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p4c-minmax-sse.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p8c-minmax-sse-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-4p8c-minmax-sse.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-5f5m5l8c4s4r-minmax-sse-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-5f5m5l8c4s4r-minmax-sse.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-7f6m6l8c4s4r-minmax-sse-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-7f6m6l8c4s4r-minmax-sse.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p4c-minmax-sse-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p4c-minmax-sse.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p8c-minmax-sse-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-9p8c-minmax-sse.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p4c-minmax-sse-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p4c-minmax-sse.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p8c-minmax-sse-acc2.c",
+    "xnnpack_wrappers/f32-dwconv/gen/f32-dwconv-25p8c-minmax-sse.c",
+    "xnnpack_wrappers/f32-gavgpool-cw/f32-gavgpool-cw-sse-x4.c",
+    "xnnpack_wrappers/f32-gavgpool/f32-gavgpool-7p7x-minmax-sse-c4.c",
+    "xnnpack_wrappers/f32-gavgpool/f32-gavgpool-7x-minmax-sse-c4.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-sse-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-sse-load1.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8s4-minmax-sse.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-3x8-minmax-sse-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-3x8-minmax-sse-load1.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-3x8s4-minmax-sse.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x2c4-minmax-sse.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-sse-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-sse-load1.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8s4-minmax-sse.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-sse-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-sse-load1.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8s4-minmax-sse.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8-minmax-sse-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8-minmax-sse-load1.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8s4-minmax-sse.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-3x8-minmax-sse-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-3x8-minmax-sse-load1.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-3x8s4-minmax-sse.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-sse-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-sse-load1.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8s4-minmax-sse.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x8-minmax-sse-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x8-minmax-sse-load1.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x8s4-minmax-sse.c",
+    "xnnpack_wrappers/f32-ibilinear-chw/gen/f32-ibilinear-chw-sse-p4.c",
+    "xnnpack_wrappers/f32-ibilinear-chw/gen/f32-ibilinear-chw-sse-p8.c",
+    "xnnpack_wrappers/f32-ibilinear/gen/f32-ibilinear-sse-c4.c",
+    "xnnpack_wrappers/f32-ibilinear/gen/f32-ibilinear-sse-c8.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-sse-dup.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-sse-load1.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8s4-minmax-sse.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-3x8-minmax-sse-dup.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-3x8-minmax-sse-load1.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-3x8s4-minmax-sse.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x2c4-minmax-sse.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-sse-dup.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-sse-load1.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8s4-minmax-sse.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-sse-dup.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-sse-load1.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8s4-minmax-sse.c",
+    "xnnpack_wrappers/f32-maxpool/f32-maxpool-9p8x-minmax-sse-c4.c",
+    "xnnpack_wrappers/f32-pavgpool/f32-pavgpool-9p8x-minmax-sse-c4.c",
+    "xnnpack_wrappers/f32-pavgpool/f32-pavgpool-9x-minmax-sse-c4.c",
+    "xnnpack_wrappers/f32-ppmm/gen/f32-ppmm-4x8-minmax-sse.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-sse-2x4.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-sse-2x8.c",
+    "xnnpack_wrappers/f32-rmax/f32-rmax-sse.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-4x1-minmax-sse.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-8x1-minmax-sse.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-16x1-minmax-sse.c",
+    "xnnpack_wrappers/f32-spmm/gen/f32-spmm-32x1-minmax-sse.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-minmax-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vadd-minmax-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-minmax-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vaddc-minmax-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-minmax-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdiv-minmax-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-minmax-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vdivc-minmax-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmax-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmax-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmaxc-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmaxc-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmin-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmin-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vminc-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vminc-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-minmax-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmul-minmax-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-minmax-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vmulc-minmax-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-minmax-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrdivc-minmax-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-minmax-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vrsubc-minmax-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiff-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiff-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiffc-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsqrdiffc-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-minmax-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsub-minmax-sse-x8.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-minmax-sse-x4.c",
+    "xnnpack_wrappers/f32-vbinary/gen/f32-vsubc-minmax-sse-x8.c",
+    "xnnpack_wrappers/f32-vclamp/gen/f32-vclamp-sse-x4.c",
+    "xnnpack_wrappers/f32-vclamp/gen/f32-vclamp-sse-x8.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-sse-x4.c",
+    "xnnpack_wrappers/f32-vhswish/gen/f32-vhswish-sse-x8.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-sse-x4.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-sse-x8.c",
+    "xnnpack_wrappers/f32-vmulcaddc/gen/f32-vmulcaddc-c4-minmax-sse-2x.c",
+    "xnnpack_wrappers/f32-vmulcaddc/gen/f32-vmulcaddc-c8-minmax-sse-2x.c",
+    "xnnpack_wrappers/f32-vrelu/gen/f32-vrelu-sse-x4.c",
+    "xnnpack_wrappers/f32-vrelu/gen/f32-vrelu-sse-x8.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-sse-sqrt-x4.c",
+    "xnnpack_wrappers/f32-vsqrt/gen/f32-vsqrt-sse-sqrt-x8.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vabs-sse-x4.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vabs-sse-x8.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vneg-sse-x4.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vneg-sse-x8.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vsqr-sse-x4.c",
+    "xnnpack_wrappers/f32-vunary/gen/f32-vsqr-sse-x8.c",
+    "xnnpack_wrappers/math/roundd-sse-addsub.c",
+    "xnnpack_wrappers/math/roundne-sse-addsub.c",
+    "xnnpack_wrappers/math/roundu-sse-addsub.c",
+    "xnnpack_wrappers/math/roundz-sse-addsub.c",
+    "xnnpack_wrappers/math/sqrt-f32-sse-hh1mac.c",
+    "xnnpack_wrappers/math/sqrt-f32-sse-nr1mac.c",
+    "xnnpack_wrappers/math/sqrt-f32-sse-nr2mac.c",
+    "xnnpack_wrappers/x32-packx/x32-packx-x4-sse.c",
+    "xnnpack_wrappers/x32-transposec/x32-transposec-4x4-sse.c",
+]
+
+ALL_SSE2_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-x8.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-x16.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-x24.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-x32.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-x8.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-x16.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-x24.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-x32.c",
+    "xnnpack_wrappers/f16-vunary/gen/f16-vabs-sse2-x8.c",
+    "xnnpack_wrappers/f16-vunary/gen/f16-vabs-sse2-x16.c",
+    "xnnpack_wrappers/f16-vunary/gen/f16-vneg-sse2-x8.c",
+    "xnnpack_wrappers/f16-vunary/gen/f16-vneg-sse2-x16.c",
+    "xnnpack_wrappers/f32-argmaxpool/f32-argmaxpool-4x-sse2-c4.c",
+    "xnnpack_wrappers/f32-argmaxpool/f32-argmaxpool-9p8x-sse2-c4.c",
+    "xnnpack_wrappers/f32-argmaxpool/f32-argmaxpool-9x-sse2-c4.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-x8.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-x16.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-x24.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-sse2-x32.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-1x8-minmax-sse2-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-3x8-minmax-sse2-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-4x8-minmax-sse2-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemm-5x8-minmax-sse2-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-1x8-minmax-sse2-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-3x8-minmax-sse2-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-4x8-minmax-sse2-dup.c",
+    "xnnpack_wrappers/f32-gemm/gen/f32-gemminc-5x8-minmax-sse2-dup.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-1x8-minmax-sse2-dup.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-3x8-minmax-sse2-dup.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-4x8-minmax-sse2-dup.c",
+    "xnnpack_wrappers/f32-igemm/gen/f32-igemm-5x8-minmax-sse2-dup.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-sse2-2x4.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-sse2-2x8.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-x8.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-x16.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-x24.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse2-x32.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-x8.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-x16.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-x24.c",
+    "xnnpack_wrappers/f32-qu8-vcvt/gen/f32-qu8-vcvt-sse2-x32.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x8-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x8.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x12-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x12-acc3.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x12.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x16-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x16-acc4.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x16.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x20-acc2.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x20-acc5.c",
+    "xnnpack_wrappers/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-sse2-rr2-p5-x20.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-x4.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-x12.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-x20.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse2-rr2-lut16-p3-x24.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse2-rr2-p6-x4.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse2-rr2-p6-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse2-rr2-p6-x12.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse2-rr2-p6-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse2-rr2-p6-x20.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse2-rr2-p6-x24.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-sse2-x4.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-sse2-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-sse2-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-sse2-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-sse2-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-sse2-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-sse2-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-sse2-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-sse2-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-sse2-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-lut64-p2-div-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-lut64-p2-div-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-lut64-p2-div-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-lut64-p2-div-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-lut64-p2-div-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-lut64-p2-div-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-p5-div-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-p5-div-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-p5-div-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-p5-div-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-p5-div-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse2-rr2-p5-div-x24.c",
+    "xnnpack_wrappers/math/cvt-f16-f32-sse2-int16.c",
+    "xnnpack_wrappers/math/cvt-f16-f32-sse2-int32.c",
+    "xnnpack_wrappers/math/cvt-f32-f16-sse2.c",
+    "xnnpack_wrappers/math/exp-f32-sse2-rr2-lut64-p2.c",
+    "xnnpack_wrappers/math/exp-f32-sse2-rr2-p5.c",
+    "xnnpack_wrappers/math/expm1minus-f32-sse2-rr2-lut16-p3.c",
+    "xnnpack_wrappers/math/expm1minus-f32-sse2-rr2-p6.c",
+    "xnnpack_wrappers/math/expminus-f32-sse2-rr2-p5.c",
+    "xnnpack_wrappers/math/roundd-sse2-cvt.c",
+    "xnnpack_wrappers/math/roundne-sse2-cvt.c",
+    "xnnpack_wrappers/math/roundu-sse2-cvt.c",
+    "xnnpack_wrappers/math/roundz-sse2-cvt.c",
+    "xnnpack_wrappers/math/sigmoid-f32-sse2-rr2-lut64-p2-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-sse2-rr2-lut64-p2-nr1.c",
+    "xnnpack_wrappers/math/sigmoid-f32-sse2-rr2-lut64-p2-nr2.c",
+    "xnnpack_wrappers/math/sigmoid-f32-sse2-rr2-p5-div.c",
+    "xnnpack_wrappers/math/sigmoid-f32-sse2-rr2-p5-nr1.c",
+    "xnnpack_wrappers/math/sigmoid-f32-sse2-rr2-p5-nr2.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p8c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-sse2-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-sse2-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-sse2-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-sse2-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse2-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-sse2-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse2-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-sse2-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-x8.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-x16.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-x24.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-x32.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c8.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c16.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c24.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c8.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c16.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c24.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2-xw-minmax-fp32-sse2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2s4-xw-minmax-fp32-sse2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-sse2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2-xw-minmax-fp32-sse2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2s4-xw-minmax-fp32-sse2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-sse2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2-xw-minmax-fp32-sse2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2s4-xw-minmax-fp32-sse2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-sse2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2-xw-minmax-fp32-sse2.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2s4-xw-minmax-fp32-sse2.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-fp32-sse2.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-gemmlowp-sse2.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-rndna-sse2.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-x24.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-sse2-mul16-ld64-x32.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-sse2-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-sse2-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-sse2-mul16-ld64-x24.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-sse2-mul16-ld64-x32.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-sse2-x16.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-sse2-x32.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-sse2-x16.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-sse2-x32.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-sse2-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-sse2-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-sse2-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-sse2-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qu8-avgpool/qu8-avgpool-9p8x-minmax-fp32-sse2-c8.c",
+    "xnnpack_wrappers/qu8-avgpool/qu8-avgpool-9x-minmax-fp32-sse2-c8.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-sse2-mul16.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-x8.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-x16.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-x24.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-x32.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c8.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c16.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c24.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c8.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c16.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c24.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse2-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse2-ld128.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-fp32-sse2.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-gemmlowp-sse2.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-rndna-sse2.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-sse2-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-sse2-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-sse2-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-sse2-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-sse2-x16.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-sse2-x32.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-sse2-x16.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-sse2-x32.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-sse2-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-sse2-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-sse2-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-sse2-mul16-ld64-x16.c",
+    "xnnpack_wrappers/s8-ibilinear/gen/s8-ibilinear-sse2-c8.c",
+    "xnnpack_wrappers/s8-ibilinear/gen/s8-ibilinear-sse2-c16.c",
+    "xnnpack_wrappers/s8-maxpool/s8-maxpool-9p8x-minmax-sse2-c16.c",
+    "xnnpack_wrappers/s8-vclamp/s8-vclamp-sse2-x64.c",
+    "xnnpack_wrappers/u8-ibilinear/gen/u8-ibilinear-sse2-c8.c",
+    "xnnpack_wrappers/u8-ibilinear/gen/u8-ibilinear-sse2-c16.c",
+    "xnnpack_wrappers/u8-maxpool/u8-maxpool-9p8x-minmax-sse2-c16.c",
+    "xnnpack_wrappers/u8-rmax/u8-rmax-sse2.c",
+    "xnnpack_wrappers/u8-vclamp/u8-vclamp-sse2-x64.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-16x16-reuse-mov-sse2.c",
+    "xnnpack_wrappers/x8-transposec/gen/x8-transposec-16x16-reuse-switch-sse2.c",
+    "xnnpack_wrappers/x8-zip/x8-zip-x2-sse2.c",
+    "xnnpack_wrappers/x8-zip/x8-zip-x3-sse2.c",
+    "xnnpack_wrappers/x8-zip/x8-zip-x4-sse2.c",
+    "xnnpack_wrappers/x8-zip/x8-zip-xm-sse2.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-8x8-multi-mov-sse2.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-8x8-multi-switch-sse2.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-8x8-reuse-mov-sse2.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-8x8-reuse-multi-sse2.c",
+    "xnnpack_wrappers/x16-transposec/gen/x16-transposec-8x8-reuse-switch-sse2.c",
+    "xnnpack_wrappers/x16-transposec/x16-transposec-4x8-sse2.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-multi-mov-sse2.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-multi-multi-sse2.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-multi-switch-sse2.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-reuse-mov-sse2.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-reuse-multi-sse2.c",
+    "xnnpack_wrappers/x32-transposec/gen/x32-transposec-4x4-reuse-switch-sse2.c",
+    "xnnpack_wrappers/x32-unpool/x32-unpool-sse2.c",
+    "xnnpack_wrappers/x32-zip/x32-zip-x2-sse2.c",
+    "xnnpack_wrappers/x32-zip/x32-zip-x3-sse2.c",
+    "xnnpack_wrappers/x32-zip/x32-zip-x4-sse2.c",
+    "xnnpack_wrappers/x32-zip/x32-zip-xm-sse2.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-multi-mov-sse2.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-multi-multi-sse2.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-multi-switch-sse2.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-reuse-mov-sse2.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-reuse-multi-sse2.c",
+    "xnnpack_wrappers/x64-transposec/gen/x64-transposec-2x2-reuse-switch-sse2.c",
+    "xnnpack_wrappers/xx-fill/xx-fill-sse2-x64.c",
+    "xnnpack_wrappers/xx-pad/xx-pad-sse2.c",
+]
+
+ALL_SSE41_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-x8.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-x16.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-x24.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-x32.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-x8.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-x16.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-x24.c",
+    "xnnpack_wrappers/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-x32.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-x8.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-x16.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-x24.c",
+    "xnnpack_wrappers/f32-f16-vcvt/gen/f32-f16-vcvt-sse41-x32.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-sse41-2x4.c",
+    "xnnpack_wrappers/f32-prelu/gen/f32-prelu-sse41-2x8.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-x8.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-x16.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-x24.c",
+    "xnnpack_wrappers/f32-qs8-vcvt/gen/f32-qs8-vcvt-sse41-x32.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse41-rr2-lut16-p3-x4.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse41-rr2-lut16-p3-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse41-rr2-lut16-p3-x12.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse41-rr2-lut16-p3-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse41-rr2-lut16-p3-x20.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse41-rr2-lut16-p3-x24.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse41-rr2-p6-x4.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse41-rr2-p6-x8.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse41-rr2-p6-x12.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse41-rr2-p6-x16.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse41-rr2-p6-x20.c",
+    "xnnpack_wrappers/f32-velu/gen/f32-velu-sse41-rr2-p6-x24.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-sse41-x4.c",
+    "xnnpack_wrappers/f32-vlrelu/gen/f32-vlrelu-sse41-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-sse41-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndd-sse41-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-sse41-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndne-sse41-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-sse41-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndu-sse41-x8.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-sse41-x4.c",
+    "xnnpack_wrappers/f32-vrnd/gen/f32-vrndz-sse41-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-lut64-p2-div-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-lut64-p2-div-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-lut64-p2-div-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-lut64-p2-div-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-lut64-p2-div-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-lut64-p2-div-x24.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-p5-div-x4.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-p5-div-x8.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-p5-div-x12.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-p5-div-x16.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-p5-div-x20.c",
+    "xnnpack_wrappers/f32-vsigmoid/gen/f32-vsigmoid-sse41-rr2-p5-div-x24.c",
+    "xnnpack_wrappers/math/cvt-f16-f32-sse41-int16.c",
+    "xnnpack_wrappers/math/cvt-f16-f32-sse41-int32.c",
+    "xnnpack_wrappers/math/cvt-f32-f16-sse41.c",
+    "xnnpack_wrappers/math/roundd-sse41.c",
+    "xnnpack_wrappers/math/roundne-sse41.c",
+    "xnnpack_wrappers/math/roundu-sse41.c",
+    "xnnpack_wrappers/math/roundz-sse41.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p8c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-sse41-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-sse41-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-sse41-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-sse41-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse41-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-sse41-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse41-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-sse41-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-x8.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-x16.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-x24.c",
+    "xnnpack_wrappers/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-x32.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c8.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c16.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c24.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c8.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c16.c",
+    "xnnpack_wrappers/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c24.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2-xw-minmax-fp32-sse41.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2s4-xw-minmax-fp32-sse41.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-sse41.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2-xw-minmax-fp32-sse41.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2s4-xw-minmax-fp32-sse41.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-sse41.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2-xw-minmax-fp32-sse41.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2s4-xw-minmax-fp32-sse41.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-sse41.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2-xw-minmax-fp32-sse41.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2s4-xw-minmax-fp32-sse41.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-fp32-sse41.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-gemmlowp-sse41.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-rndna-sse41.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-rndnu-sse41-sra.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-rndnu-sse41-srl.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-x24.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-x32.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul32-ld32-x8.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul32-ld32-x16.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul32-ld32-x24.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul32-ld32-x32.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul16-ld64-x24.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul16-ld64-x32.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul32-ld32-x8.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul32-ld32-x16.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul32-ld32-x24.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-sse41-mul32-ld32-x32.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-sse41-x8.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-sse41-x16.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-sse41-x32.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-sse41-x8.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-sse41-x16.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-sse41-x32.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-sse41-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vmul/gen/qs8-vmul-minmax-fp32-sse41-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-sse41-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qs8-vmulc/gen/qs8-vmulc-minmax-fp32-sse41-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-sse41-mul16.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-sse41-mul32.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-x8.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-x16.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-x24.c",
+    "xnnpack_wrappers/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-x32.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c8.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c16.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c24.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c8.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c16.c",
+    "xnnpack_wrappers/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c24.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse41-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-sse41-ld128.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-gemmlowp-sse41.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-rndna-sse41.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul32-ld32-x8.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-sse41-mul32-ld32-x16.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-sse41-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-sse41-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-sse41-mul32-ld32-x8.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-sse41-mul32-ld32-x16.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-sse41-x8.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-sse41-x16.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-sse41-x32.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-sse41-x8.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-sse41-x16.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-sse41-x32.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-sse41-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vmul/gen/qu8-vmul-minmax-fp32-sse41-mul16-ld64-x16.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-sse41-mul16-ld64-x8.c",
+    "xnnpack_wrappers/qu8-vmulc/gen/qu8-vmulc-minmax-fp32-sse41-mul16-ld64-x16.c",
+    "xnnpack_wrappers/s8-ibilinear/gen/s8-ibilinear-sse41-c8.c",
+    "xnnpack_wrappers/s8-ibilinear/gen/s8-ibilinear-sse41-c16.c",
+    "xnnpack_wrappers/s8-maxpool/s8-maxpool-9p8x-minmax-sse41-c16.c",
+    "xnnpack_wrappers/s8-vclamp/s8-vclamp-sse41-x64.c",
+    "xnnpack_wrappers/u8-ibilinear/gen/u8-ibilinear-sse41-c8.c",
+    "xnnpack_wrappers/u8-ibilinear/gen/u8-ibilinear-sse41-c16.c",
+]
+
+ALL_SSSE3_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-1x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-1x4-acc3.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-1x4-acc4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-1x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-2x4-acc2.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-2x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-3x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-4x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-5x4.c",
+    "xnnpack_wrappers/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3p1-minmax-ssse3-6x4.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-ssse3-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-ssse3-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-ssse3.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-ssse3-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-ssse3-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-ssse3.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-ssse3-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-ssse3-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-ssse3.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-ssse3-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-ssse3-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-ssse3-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-ssse3-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-ssse3-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-ssse3-ld128.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-gemmlowp-ssse3.c",
+    "xnnpack_wrappers/qs8-requantization/qs8-requantization-rndna-ssse3.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-ssse3-x16.c",
+    "xnnpack_wrappers/qs8-vcvt/gen/qs8-vcvt-ssse3-x32.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-ssse3-x16.c",
+    "xnnpack_wrappers/qs8-vlrelu/gen/qs8-vlrelu-ssse3-x32.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-gemmlowp-ssse3.c",
+    "xnnpack_wrappers/qu8-requantization/qu8-requantization-rndna-ssse3.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-ssse3-x16.c",
+    "xnnpack_wrappers/qu8-vcvt/gen/qu8-vcvt-ssse3-x32.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-ssse3-x16.c",
+    "xnnpack_wrappers/qu8-vlrelu/gen/qu8-vlrelu-ssse3-x32.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-ssse3-x16.c",
+    "xnnpack_wrappers/x8-lut/gen/x8-lut-ssse3-x32.c",
+    "xnnpack_wrappers/x24-transposec/x24-transposec-4x4-ssse3.c",
+]
+
+ALL_XOP_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-3p16c-minmax-fp32-xop-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-xop-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p8c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-xop-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p16c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-9p24c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-xop-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p8c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-xop-mul16-add16.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p16c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qc8-dwconv/gen/qc8-dwconv-25p24c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-1x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-2x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-3x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-gemm/gen/qc8-gemm-4x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-1x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-2x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-3x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qc8-igemm/gen/qc8-igemm-4x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-xop-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-xop-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-9p24c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-xop-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-xop-mul16-add16.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qs8-dwconv/gen/qs8-dwconv-25p24c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2-xw-minmax-fp32-xop.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c2s4-xw-minmax-fp32-xop.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-1x4c8-xw-minmax-fp32-xop.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2-xw-minmax-fp32-xop.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c2s4-xw-minmax-fp32-xop.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-2x4c8-xw-minmax-fp32-xop.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2-xw-minmax-fp32-xop.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c2s4-xw-minmax-fp32-xop.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-3x4c8-xw-minmax-fp32-xop.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2-xw-minmax-fp32-xop.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-gemm/gen/qs8-gemm-4x4c2s4-xw-minmax-fp32-xop.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-1x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-2x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-3x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qs8-igemm/gen/qs8-igemm-4x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-xop-mul32-ld32-x8.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-xop-mul32-ld32-x16.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-xop-mul32-ld32-x24.c",
+    "xnnpack_wrappers/qs8-vadd/gen/qs8-vadd-minmax-xop-mul32-ld32-x32.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-xop-mul32-ld32-x8.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-xop-mul32-ld32-x16.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-xop-mul32-ld32-x24.c",
+    "xnnpack_wrappers/qs8-vaddc/gen/qs8-vaddc-minmax-xop-mul32-ld32-x32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-xop-mul32.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-2x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-2x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-3x4c8-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-xop-ld64.c",
+    "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x4c2s4-minmax-fp32-xop-ld128.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-xop-mul32-ld32-x8.c",
+    "xnnpack_wrappers/qu8-vadd/gen/qu8-vadd-minmax-xop-mul32-ld32-x16.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-xop-mul32-ld32-x8.c",
+    "xnnpack_wrappers/qu8-vaddc/gen/qu8-vaddc-minmax-xop-mul32-ld32-x16.c",
+]
+
+PROD_FP16ARITH_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/fp16arith.c",
+]
+
+PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/neonfp16arith.c",
+]
+
+PROD_SCALAR_MICROKERNEL_SRCS = [
+    "xnnpack_wrappers/amalgam/scalar.c",
 ]
diff --git a/tools/README.md b/tools/README.md
index 6d20bda05017..9ded063f4554 100644
--- a/tools/README.md
+++ b/tools/README.md
@@ -39,8 +39,6 @@ Developer tools which you might find useful:
   can conveniently run diffs on them when working on code-generation.
   (See also [generated_dirs.txt](generated_dirs.txt) which
   specifies the list of directories with generated files.)
-* [stats/test_history.py](stats/test_history.py) - Query S3 to display history of a single
-  test across multiple jobs over time.
 
 Important if you want to run on AMD GPU:
 
diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py
index a5416ca037e5..dba7f3c55710 100755
--- a/tools/amd_build/build_amd.py
+++ b/tools/amd_build/build_amd.py
@@ -80,6 +80,7 @@
     "c10/cuda/*",
     "c10/cuda/test/CMakeLists.txt",
     "modules/*",
+    "third_party/nvfuser/*",
     # PyTorch paths
     # Keep this synchronized with is_pytorch_file in hipify_python.py
     "aten/src/ATen/cuda/*",
@@ -116,13 +117,13 @@
     # Correct path to generate HIPConfig.h:
     #   CUDAConfig.h.in -> (amd_build) HIPConfig.h.in -> (cmake) HIPConfig.h
     "aten/src/ATen/cuda/CUDAConfig.h",
-    "torch/csrc/jit/codegen/cuda/codegen.cpp",
-    "torch/csrc/jit/codegen/cuda/runtime/block_reduction.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/block_sync_default_rocm.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/broadcast.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu",
-    "torch/csrc/jit/codegen/cuda/runtime/helpers.cu",
+    "third_party/nvfuser/csrc/codegen.cpp",
+    "third_party/nvfuser/runtime/block_reduction.cu",
+    "third_party/nvfuser/runtime/block_sync_atomic.cu",
+    "third_party/nvfuser/runtime/block_sync_default_rocm.cu",
+    "third_party/nvfuser/runtime/broadcast.cu",
+    "third_party/nvfuser/runtime/grid_reduction.cu",
+    "third_party/nvfuser/runtime/helpers.cu",
     "torch/csrc/jit/codegen/fuser/cuda/resource_strings.h",
     "torch/csrc/jit/tensorexpr/ir_printer.cpp",
     # generated files we shouldn't frob
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 9ec2bb38e032..de13e63d75d0 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -554,7 +554,7 @@
   result: (self_t - other_t * result) / other_p
 
 - name: div.Scalar(Tensor self, Scalar other) -> Tensor
-  self: div_tensor_self_backward(grad, at::lift_fresh(at::scalar_to_tensor(other)), self.scalar_type())
+  self: div_tensor_self_backward(grad, other, self.scalar_type())
   result: self_t / other
 
 - name: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
@@ -563,7 +563,7 @@
   result: "rounding_mode.has_value() ? result.new_zeros_symint(result.sym_sizes()) : self_t / other_p - other_t * (self_p / other_p) / other_p"
 
 - name: div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor
-  self: div_tensor_self_backward(grad, at::lift_fresh(at::scalar_to_tensor(other)), self.scalar_type(), rounding_mode)
+  self: div_tensor_self_backward(grad, other, self.scalar_type(), rounding_mode)
   result: "rounding_mode.has_value() ? result.new_zeros_symint(result.sym_sizes()) : self_t / other"
 
 - name: dot(Tensor self, Tensor tensor) -> Tensor
@@ -903,8 +903,8 @@
   result: auto_element_wise
 
 - name: logaddexp(Tensor self, Tensor other) -> Tensor
-  self: grad / (1 + exp(other - self))
-  other: grad / (1 + exp(self - other))
+  self: grad / (1 + exp(other - self)).conj()
+  other: grad / (1 + exp(self - other)).conj()
   result: self_t / (1 + exp(other_p - self_p)) + other_t / (1 + exp(self_p - other_p))
 
 - name: logaddexp2(Tensor self, Tensor other) -> Tensor
@@ -1130,7 +1130,7 @@
   result: other_t * self_p + self_t * other_p
 
 - name: mul.Scalar(Tensor self, Scalar other) -> Tensor
-  self: mul_tensor_backward(grad, at::lift_fresh(at::scalar_to_tensor(other)), self.scalar_type())
+  self: mul_tensor_backward(grad, other, self.scalar_type())
   result: self_t * other
 
 - name: mv(Tensor self, Tensor vec) -> Tensor
@@ -1154,6 +1154,10 @@
   input, weight, bias: "grad.defined() ? native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, eps, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
   result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, training, eps)
 
+- name: _native_batch_norm_legit_no_training(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+  input, weight, bias: "grad.defined() ? native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, /*training=*/false, eps, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+  result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, /*training=*/false, eps)
+
 - name: _native_batch_norm_legit.no_stats(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
   input, weight, bias: "grad.defined() ? native_batch_norm_backward(grad, input, weight, Tensor(), Tensor(), result1, result2, training, eps, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
   result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, Tensor(), Tensor(), result1, result2, training, eps)
@@ -1174,7 +1178,7 @@
   rstd: not_implemented("native_layer_norm_backward rstd")
 
 - name: native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor)
-  input, weight, bias: "GradMode::is_enabled() || grads[1].defined() || grads[2].defined() ? infinitely_differentiable_native_group_norm_backward(grads[0], grads[1], grads[2], input, result1, result2, weight, N, C, HxW, group, eps, grad_input_mask) : (grads[0].defined() ? native_group_norm_backward_symint(grads[0].device().is_xpu() ? grads[0] : grads[0].contiguous(grads[0].device().is_cpu() ? grads[0].suggest_memory_format() : c10::MemoryFormat::Contiguous), input.device().is_xpu() ? input : input.contiguous(input.device().is_cpu() ? input.suggest_memory_format() : c10::MemoryFormat::Contiguous), result1, result2, weight, N, C, HxW, group, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>())"
+  input, weight, bias: "GradMode::is_enabled() || grads[1].defined() || grads[2].defined() ? infinitely_differentiable_native_group_norm_backward(grads[0], grads[1], grads[2], input, result1, result2, weight, N, C, HxW, group, eps, grad_input_mask) : (grads[0].defined() ? native_group_norm_backward_symint(grads[0].device().is_xpu() ? grads[0] : grads[0].contiguous(grads[0].device().is_cpu() ? input.suggest_memory_format() : c10::MemoryFormat::Contiguous), input.device().is_xpu() ? input : input.contiguous(input.device().is_cpu() ? input.suggest_memory_format() : c10::MemoryFormat::Contiguous), result1, result2, weight, N, C, HxW, group, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>())"
   result0: group_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, result1, result2, group)
   result1: group_norm_mean_jvp(input_t, result1, group)
   result2: group_norm_invstd_jvp(input_p, input_t, result1, result2, group)
@@ -1532,12 +1536,12 @@
   self: unsqueeze_to(grad, dim, self.sym_sizes())
   result: auto_linear
 
-- name: std.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
+- name: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
   self: std_backward(result, grad, self, dim, correction, keepdim)
   # pointwise (variance) + sum + sqrt
   result: (at::real(var_backward(self_t.conj(), self_p, dim, correction, true).sum(dim.value_or(IntArrayRef({})), keepdim)) / (2. * result)).masked_fill_(result == 0, 0)
 
-- name: std_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+- name: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
   self: std_mean_backward(grads[0], grads[1], self, result0, dim, correction, keepdim)
   result0: (at::real(var_backward(self_t.conj(), self_p, dim, correction, true).sum(dim.value_or(IntArrayRef({})), keepdim)) / (2. * result0)).masked_fill_(result0 == 0, 0)
   # linear
@@ -1588,9 +1592,6 @@
                    full_matrices ? Vh.narrow_symint(-2, 0, S.sym_size(-1)) : Vh)"
   U, S, Vh: linalg_svd_jvp(A_t, U, S, Vh, full_matrices)
 
-- name: symeig(Tensor self, bool eigenvectors=False, bool upper=True) -> (Tensor eigenvalues, Tensor eigenvectors)
-  self: linalg_eig_backward(grads[0], grads[1], eigenvalues, eigenvectors_return, /*is_hermitian=*/true, /*symeig_eigenvector=*/eigenvectors)
-
 - name: _linalg_eigh(Tensor A, str UPLO="L", bool compute_v=True) -> (Tensor eigenvalues, Tensor eigenvectors)
   A: linalg_eig_backward(grads[0], grads[1], eigenvalues, eigenvectors, /*is_hermitian=*/true)
   eigenvalues, eigenvectors: linalg_eig_jvp(A_t, eigenvalues, eigenvectors, /*is_hermitian=*/true)
@@ -1635,7 +1636,7 @@
   self: tanh_backward(grad, result)
   result: auto_element_wise
 
-- name: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
+- name: topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
   self: value_selecting_reduction_backward_symint(grad, dim, indices, self.sym_sizes(), true)
   output_differentiability: [True, False]
   values: gather(self_t, dim, indices)
@@ -1753,12 +1754,12 @@
   self: grad.squeeze(dim)
   result: auto_linear
 
-- name: var.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
+- name: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
   self: var_backward(grad, self, dim, correction, keepdim)
   # pointwise + sum
   result: at::real(var_backward(self_t.conj(), self_p, dim, correction, true).sum(dim.value_or(IntArrayRef({})), keepdim))
 
-- name: var_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+- name: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
   self: var_mean_backward(grads[0], grads[1], self, dim, correction, keepdim)
   result0: at::real(var_backward(self_t.conj(), self_p, dim, correction, true).sum(dim.value_or(IntArrayRef({})), keepdim))
   # linear
@@ -1805,7 +1806,7 @@
   mask: non_differentiable
 
 - name: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
-  values: sparse_constructor_values_backward(grad, indices)
+  values: grad.sparse_mask(result)._values()
 
 - name: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor
   self: at::_sparse_sum_backward(grad, self, dim)
@@ -2173,8 +2174,8 @@
   input, weight, bias: linear_backward(input, grad, weight, grad_input_mask)
 
 #mps
-- name: _mps_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
-  self: mps_max_pool2d_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode)
+- name: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+  self: max_pool2d_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode)
 
 - name: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor
   self, weight, bias: "grad.defined() ? mps_convolution_backward(self, grad, weight, padding, stride, dilation, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
@@ -2418,6 +2419,10 @@
   mat1: maybe_multiply(grad.sparse_mask(self).mm(mat2.mH()), alpha.conj())
   mat2: maybe_multiply(mat1.mH().mm(grad.sparse_mask(self)), alpha.conj())
 
+- name: _sparse_mm_reduce_impl(Tensor self, Tensor other, str reduce) -> (Tensor, Tensor)
+  output_differentiability: [True, False]
+  self, other: "grad.defined() ? _sparse_mm_reduce_impl_backward(self, grad, other, reduce, result1, grad_input_mask) :  std::tuple<Tensor, Tensor>()"
+
 - name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor
   grad_output: smooth_l1_loss_backward(grad, self, target, reduction, beta)
   self: smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta)
@@ -2563,11 +2568,11 @@
   input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector<int64_t>(padding.size(), 1), false, std::vector<c10::SymInt>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 #LSTM MPS
-- name: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
-  output_differentiability: [True, True, True, False, False]
-  input, hx, params: "lstm_mps_backward(grads[0], grads[1], grads[2], result3, result4, input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first)"
+- name: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
+  output_differentiability: [True, True, True, False, False, False]
+  input, hx, params: "lstm_mps_backward(grads[0], grads[1], grads[2], result3, result4, input, result5, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first)"
 
-- name: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
+- name: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
 
 
 
@@ -2667,9 +2672,18 @@
   output_differentiability: [True, False]
   query, key, value: _scaled_dot_product_efficient_attention_backward(grad, query, key, value, result0, result1, is_causal, at::_chunk_grad_outputs_efficient_attention(query, key, value, is_causal))
 
-- name:  _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
+- name: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
   output_differentiability: [True, False]
   query, key, value: _efficient_attention_backward(grad, query, key, value, result0, result1, causal, at::_chunk_grad_outputs_efficient_attention(query, key, value, causal))
+# Returns ouput, softmax_logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, rng_state
+
+- name: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False) -> (Tensor ouput, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, int philox_seed, int philox_offset, Tensor debug_attn_mask)
+  output_differentiability: [True, False, False, False, False, False, False, False, False]
+  query, key, value: _scaled_dot_product_flash_attention_backward(grad, query, key, value, ouput, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, philox_seed, philox_offset)
+
+- name:  _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal, bool return_debug_mask) -> (Tensor output, Tensor softmax_logsumexp, int philox_seed, int philox_offset, Tensor debug_attn_mask)
+  output_differentiability: [True, False, False, False, False]
+  query, key, value: _flash_attention_backward(grad, query, key, value, output, softmax_logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, philox_seed, philox_offset)
 
 # fft
 - name: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
@@ -2953,3 +2967,66 @@
 - name: _reshape_copy(Tensor self, SymInt[] size) -> Tensor
   self: grad.reshape_symint(self.sym_sizes())
   result: auto_linear
+
+# note(crcrpar): `torchgen/api/autograd` logic would unwantedly replace substrings of `self` and `other` of function names.
+- name: _foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[]
+  self: div_tensor_self_backward(grads[i], other[i], self[i].scalar_type())
+  other: div_tensor_other_backward(grads[i], self[i], other[i])
+
+- name: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[]
+  self: pow_backward_self(grads[i], self[i], exponent[i])
+  exponent: pow_backward_exponent(grads[i], self[i], exponent[i], result[i])
+
+- name: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]
+  self: pow_backward(grads[i], self[i], exponent[i])
+
+- name: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]
+  exponent: pow_backward_exponent(grads[i], self, exponent[i], result[i])
+
+# Definitions below would be able to be generated by `torchgen` e.g. , but currently I see some weird numerical errors.
+- name: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[]
+  self: mul_tensor_backward(grads[i], other[i], self[i].scalar_type())
+  other: mul_tensor_backward(grads[i], self[i], other[i].scalar_type())
+
+- name: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
+  self: handle_r_to_c(self[i].scalar_type(), grads[i])
+  other: handle_r_to_c(other[i].scalar_type(), maybe_multiply(-grads[i], alpha.conj()))
+
+- name: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[]
+  self: where(self[i] >= other[i], grads[i], at::scalar_tensor(0., grads[i].options()))
+  other: where(self[i] < other[i], grads[i], at::scalar_tensor(0., grads[i].options()))
+
+- name: _foreach_clamp_max.List(Tensor[] self, Tensor[] other) -> Tensor[]
+  self: where(self[i] <= other[i], grads[i], at::scalar_tensor(0., grads[i].options()))
+  other: where(self[i] > other[i], grads[i], at::scalar_tensor(0., grads[i].options()))
+
+- name: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]
+  self: at::where(self[i] == other[i], grads[i] / 2, grads[i]).masked_fill_(self[i] > other[i], 0)
+  other: at::where(self[i] == other[i], grads[i] / 2, grads[i]).masked_fill_(self[i] < other[i], 0)
+
+- name: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[]
+  self: at::where(self[i] == other[i], grads[i] / 2, grads[i]).masked_fill_(self[i] < other[i], 0)
+  other: at::where(self[i] == other[i], grads[i] / 2, grads[i]).masked_fill_(self[i] > other[i], 0)
+
+- name: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[]
+  self: grads[i] * (1 - weights[i]).conj()
+  tensors1: grads[i] * weights[i].conj()
+  weights: grads[i] * (tensors1[i] - self[i]).conj()
+
+- name: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[]
+  self: "weight.isComplex() ? grads[i] * (1 - weight.conj().toComplexDouble()) : grads[i] * (1 - weight.toDouble())"
+  tensors1: grads[i] * weight.conj()
+
+# note(crcrpar): following definitions seem necessary because the reference native functions
+# of `maximum` and `minimum` don't have the overload def with Scalar as their second argument.
+- name: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+  self: at::where(self[i] == scalar, grads[i] / 2, grads[i]).masked_fill_(self[i] > scalar, 0)
+
+- name: _foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+  self: at::where(self[i] == scalars[i], grads[i] / 2, grads[i]).masked_fill_(self[i] > scalars[i], 0)
+
+- name: _foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+  self: at::where(self[i] == scalar, grads[i] / 2, grads[i]).masked_fill_(self[i] < scalar, 0)
+
+- name: _foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+  self: at::where(self[i] == scalars[i], grads[i] / 2, grads[i]).masked_fill_(self[i] < scalars[i], 0)
diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py
index 6c78af9caa48..c4d1df00a95d 100644
--- a/tools/autograd/gen_autograd.py
+++ b/tools/autograd/gen_autograd.py
@@ -62,13 +62,11 @@ def gen_autograd(
     template_path = os.path.join(autograd_dir, "templates")
 
     native_funcs = parse_native_yaml(native_functions_path, tags_path).native_functions
-    fns = list(
-        sorted(
-            filter(
-                operator_selector.is_native_function_selected_for_training, native_funcs
-            ),
-            key=lambda f: cpp.name(f.func),
-        )
+    fns = sorted(
+        filter(
+            operator_selector.is_native_function_selected_for_training, native_funcs
+        ),
+        key=lambda f: cpp.name(f.func),
     )
     fns_with_diff_infos: List[
         NativeFunctionWithDifferentiabilityInfo
diff --git a/tools/autograd/gen_autograd_functions.py b/tools/autograd/gen_autograd_functions.py
index f7b30cf18ce7..6c2b5aeb301a 100644
--- a/tools/autograd/gen_autograd_functions.py
+++ b/tools/autograd/gen_autograd_functions.py
@@ -15,6 +15,7 @@
 )
 from torchgen.api.types import (
     ArrayRefCType,
+    BaseCppType,
     BaseCType,
     Binding,
     boolT,
@@ -369,6 +370,34 @@
 }
 """
 
+
+GETTER_BODY_VEC_SCALAR = """\
+PyObject* tup = PyTuple_New((Py_ssize_t) prop.size());
+for (auto i: c10::irange(prop.size())) {
+  if (prop[i].isComplex()) {
+    auto cprop = prop[i].to<c10::complex<double>>();
+    PyTuple_SetItem(tup, (Py_ssize_t) i, PyComplex_FromDoubles(cprop.real(), cprop.imag()));
+  } else if (prop[i].isFloatingPoint()) {
+    auto double_prop = prop[i].to<double>();
+    PyTuple_SetItem(tup, (Py_ssize_t) i, PyFloat_FromDouble(double_prop));
+  } else if (prop[i].isIntegral(/*includeBool=*/false)) {
+    auto long_prop = prop[i].to<int64_t>();
+    PyTuple_SetItem(tup, (Py_ssize_t) i, PyLong_FromLong(long_prop));
+  } else if (prop[i].isBoolean()) {
+    if (prop[i].to<bool>()) {
+      PyTuple_SetItem(tup, (Py_ssize_t) i, Py_True);
+    } else {
+      PyTuple_SetItem(tup, (Py_ssize_t) i, Py_False);
+    }
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, "Unknown scalar type");
+    return nullptr;
+  }
+}
+return tup;
+"""
+
+
 MISC_GETTER_DEFS = {
     OptionalCType(BaseCType(longT)): (GETTER_DEFINITION_OPT, GETTER_BODY_INT64_T),
     OptionalCType(BaseCType(SymIntT)): (GETTER_DEFINITION_OPT, GETTER_BODY_SYMINT),
@@ -645,6 +674,38 @@ def save_var(var: SavedAttribute, is_output: bool) -> None:
                     op=info.op, name=name, body=GETTER_BODY_STRING
                 )
             )
+        elif type == ArrayRefCType(
+            elem=BaseCType(type=BaseCppType(ns="at", name="Scalar"))
+        ):
+            saved_variables.append(f"std::vector<at::Scalar> {name};")
+            saved_variables.append(f"bool {name}_released_ = false;")
+            # Just clear() is sufficient, we don't need to loop and clear each variable.
+            # Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well.
+            release_variables.append(f"{name}.clear();")
+            # release_variables.append(f"{name}_released_ = true;")
+            # unpack.append(f"auto {name} = unpack_list({name}_);")
+            # asserts.append(f"TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);")
+            getter_definitions.append(
+                CodeTemplate(
+                    """\
+PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) {
+  HANDLE_TH_ERRORS
+  const auto *node = static_cast<${op}*>(self->cdata.get());
+  const auto& prop = node->${name};
+  if (node->${name}_released_) {
+    PyErr_SetString(PyExc_RuntimeError, ERR_BACKWARD_TWICE);
+    return nullptr;
+  }
+  ${body}
+  END_HANDLE_TH_ERRORS
+}
+                            """
+                ).substitute(
+                    op=info.op,
+                    name=name,
+                    body=GETTER_BODY_VEC_SCALAR,
+                )
+            )
         else:
             # Check for indicators that you're putting a non-owning reference
             # into the saved variable field.  If this is spuriously firing,
diff --git a/tools/autograd/gen_inplace_or_view_type.py b/tools/autograd/gen_inplace_or_view_type.py
index d79212a093b5..cd3a5ca0093a 100644
--- a/tools/autograd/gen_inplace_or_view_type.py
+++ b/tools/autograd/gen_inplace_or_view_type.py
@@ -158,7 +158,8 @@
 SETUP_REPLAY_VIEW_IF_NOT_SUPPORT_AS_STRIDED_OR_VIEW_WITH_METADATA_CHANGE = CodeTemplate(
     """\
 std::function<at::Tensor(const at::Tensor&)> func=nullptr;
-if (${is_view_with_metadata_change} || !self.unsafeGetTensorImpl()->support_as_strided()) {
+if (${is_view_with_metadata_change} || !self.unsafeGetTensorImpl()->support_as_strided() ||
+    c10::AutogradState::get_tls_state().get_view_replay_enabled()) {
   ${replay_view_func}
 }
 """
diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py
index ee06a8ed1238..bb3d397402d9 100644
--- a/tools/autograd/gen_python_functions.py
+++ b/tools/autograd/gen_python_functions.py
@@ -117,14 +117,15 @@
     "_cholesky.*",
     "_triangular_solve.*",
     "_qr.*",
-    "_symeig.*",
     "_svd.*",
     "slice",
     "item",
     "_local_scalar_dense",
     "to",
     "_to_copy",
+    "_to_copy_out",
     "_reshape_copy",
+    "_reshape_copy_out",
     "copy_sparse_to_sparse_",
     "copy_",
     "numpy_T",
@@ -153,10 +154,10 @@
     "fill.Scalar",  # only used by the functionalization pass
     "lift.*",
     "normal_functional",  # only used by the functionalization pas
-    "_nested_tensor_strides",  # don't want to expose this to python
     "_nested_tensor_offsets",  # don't want to expose this to python
     "_nested_view_from_buffer",  # View only version of _nested_from_buffer. This will force users to only use the "safe" version.
     "_nested_view_from_buffer_copy",
+    "_nested_view_from_buffer_copy_out",
 ]
 
 SKIP_PYTHON_BINDINGS = list(
@@ -179,9 +180,14 @@
 
 @with_native_function
 def should_generate_py_binding(f: NativeFunction) -> bool:
-    # So far, all NativeFunctions that are entirely code-generated do not get python bindings.
-    if "generated" in f.tags:
+    # NativeFunctions that are entirely code-generated should not get python bindings
+    # because these codegen implementations are often inefficient. A handful of
+    # view_copy style ops were exposed accidentally when they were handwritten and now
+    # that we are moving them to codegen for bc reasons we need to keep them exposed in
+    # python.
+    if "generated" in f.tags and "view_copy" not in f.tags:
         return False
+
     name = cpp.name(f.func)
     for skip_regex in SKIP_PYTHON_BINDINGS:
         if skip_regex.match(name):
@@ -191,7 +197,6 @@ def should_generate_py_binding(f: NativeFunction) -> bool:
     for pattern in SKIP_PYTHON_BINDINGS_SIGNATURES:
         if pattern == signature:
             return False
-
     return True
 
 
diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py
index 45796d8ffa47..fc974b250949 100644
--- a/tools/autograd/gen_trace_type.py
+++ b/tools/autograd/gen_trace_type.py
@@ -19,33 +19,29 @@
 #   - all ops below are part of MANUAL_TRACER to skip codegen Tracer kernel registration
 # Note: we still register to dispatch key Profiler for these ops, keeping it untouched for now.
 # You can find the manual registration in torch/csrc/autograd/VariableTypeManual.cpp
-MANUAL_BACKEND = set(
-    [
-        "options",
-        "data",
-        "set_data",
-        "is_leaf",
-        "output_nr",
-        "_version",
-        "retain_grad",
-        "_backward",
-        "requires_grad_",
-    ]
-)
+MANUAL_BACKEND = {
+    "options",
+    "data",
+    "set_data",
+    "is_leaf",
+    "output_nr",
+    "_version",
+    "retain_grad",
+    "_backward",
+    "requires_grad_",
+}
 
 # For these ops we want to skip the codegen-ed registration to both Autograd and Tracer keys.
 # You can find the manual registration in torch/csrc/autograd/VariableTypeManual.cpp
-MANUAL_AUTOGRAD_AND_TRACER = set(
-    [
-        "resize_",
-        "resize_as_",
-        "detach",
-        "detach_",
-        "copy_",
-        "_fw_primal",
-        "_make_dual",
-    ]
-)
+MANUAL_AUTOGRAD_AND_TRACER = {
+    "resize_",
+    "resize_as_",
+    "detach",
+    "detach_",
+    "copy_",
+    "_fw_primal",
+    "_make_dual",
+}
 
 # Currently MANUAL_AUTOGRAD and MANUAL_TRACER share the same set of ops:
 #   union(MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER)
diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py
index 4e1ca78e633a..4c709d29068a 100644
--- a/tools/autograd/gen_variable_type.py
+++ b/tools/autograd/gen_variable_type.py
@@ -39,6 +39,8 @@
 )
 
 from torchgen.api.types import (
+    ArrayRefCType,
+    BaseCppType,
     BaseCType,
     Binding,
     DispatcherSignature,
@@ -185,6 +187,8 @@
     "fliplr",
     "flipud",
     "rot90",
+    "nanmean",
+    "nansum",
     "transpose",
     "permute",
     "squeeze",
@@ -245,6 +249,7 @@
     "log10",
     "log1p",
     "log2",
+    "logaddexp",
     "logcumsumexp",
     "reciprocal",
     "tan",
@@ -305,7 +310,6 @@
     "reflection_pad1d_backward",
     "reflection_pad2d_backward",
     "reflection_pad3d_backward",
-    "symeig",
     "_sparse_sparse_matmul",
     "replication_pad1d",
     "replication_pad2d",
@@ -377,7 +381,6 @@
     "coalesce",
     "values",
     "_sparse_coo_tensor_with_dims_and_tensors",
-    "sparse_mask_helper_cuda",
     "_sparse_addmm",
 }
 
@@ -970,10 +973,10 @@ def find_args_with_derivatives(
         """Find arguments that have derivative definitions"""
         if info is None or not info.has_derivatives:
             return differentiable_inputs
-        names = set(name for d in info.derivatives for name in d.var_names)
+        names = {name for d in info.derivatives for name in d.var_names}
         differentiable = [arg for arg in differentiable_inputs if arg.name in names]
         if len(differentiable) != len(names):
-            missing = names - set(arg.name for arg in differentiable)
+            missing = names - {arg.name for arg in differentiable}
             raise RuntimeError(
                 f"Missing arguments for derivatives: {missing} in {info.name}"
             )
@@ -1224,6 +1227,10 @@ def save_variables(
                 expr = f"std::string({expr})"
             elif type == OptionalCType(BaseCType(stringT)):
                 expr = f"{expr}.has_value() ? c10::optional<std::string>(std::string({expr}.value())) : c10::nullopt"
+            elif type == ArrayRefCType(
+                elem=BaseCType(type=BaseCppType(ns="at", name="Scalar"))
+            ):
+                expr = expr + ".vec()"
             guard = guard_for(arg)
             if guard is None:
                 if stmts_prepend:
diff --git a/tools/code_analyzer/gen_operators_yaml.py b/tools/code_analyzer/gen_operators_yaml.py
index 58b8763c142c..c9ab858f57a6 100644
--- a/tools/code_analyzer/gen_operators_yaml.py
+++ b/tools/code_analyzer/gen_operators_yaml.py
@@ -55,15 +55,15 @@
 # There are a few main inputs to this application
 # -----------------------------------------------
 #
-# 1. Inference Root Operators (--root_ops): Root operators (called directly
+# 1. Inference Root Operators (--root-ops): Root operators (called directly
 #    from TorchScript) used by inference use-cases.
 #
-# 2. Training Root Operators (--training_root_ops): Root operators used
+# 2. Training Root Operators (--training-root-ops): Root operators used
 #    by training use-cases. Currently, this list is the list of all operators
 #    used by training, and not just the root operators. All Training ops are
 #    also considered for inference, so these are merged into inference ops.
 #
-# 3. Operator Depencency Graph (--dep_graph_yaml_path): A path to the
+# 3. Operator Depencency Graph (--dep-graph-yaml-path): A path to the
 #    operator dependency graph used to determine which operators depend on
 #    which other operators for correct functioning. This is used for
 #    generating the transitive closure of all the operators used by the
@@ -71,12 +71,12 @@
 #    For tracing based selective build, we don't need to perform this
 #    transitive cloure.
 #
-# 4. Model Metadata (--model_name, --model_versions, --model_assets,
-#    --model_backends): Self-descriptive. These are used to tell this
+# 4. Model Metadata (--model-name, --model-versions, --model-assets,
+#    --model-backends): Self-descriptive. These are used to tell this
 #    script which model operator lists to fetch from the Unified Model
 #    Build Metadata YAML file.
 #
-# 5. Unified Model YAML file (--models_yaml_path): A path to the Unified
+# 5. Unified Model YAML file (--models-yaml-path): A path to the Unified
 #    model YAML operator list file. This yaml file contains (for each
 #    model/version/asset/backend) the set of used root and traced
 #    operators. This is used to extract the actual set of operators
@@ -490,45 +490,53 @@ def fill_output(output: Dict[str, object], options: object):
 
 def get_parser_options(parser: argparse.ArgumentParser) -> argparse.Namespace:
     parser.add_argument(
+        "--root-ops",
         "--root_ops",
         help="A comma separated list of root operators used by the model",
         required=False,
     )
     parser.add_argument(
+        "--training-root-ops",
         "--training_root_ops",
         help="A comma separated list of root operators used for training",
         required=False,
     )
     parser.add_argument(
+        "--output-path",
         "--output_path",
         help="The location of the output yaml file.",
         required=True,
     )
     parser.add_argument(
+        "--dep-graph-yaml-path",
         "--dep_graph_yaml_path",
         type=str,
         help="A path to the Operator Dependency Graph YAML file.",
         required=True,
     )
     parser.add_argument(
+        "--model-name",
         "--model_name",
         type=str,
         help="The name of the model that uses the specified root operators.",
         required=True,
     )
     parser.add_argument(
+        "--model-versions",
         "--model_versions",
         type=str,
         help="A comma separated list of model versions.",
         required=False,
     )
     parser.add_argument(
+        "--model-assets",
         "--model_assets",
         type=str,
         help="A comma separate list of model asset names (if absent, defaults to all assets for this model).",
         required=False,
     )
     parser.add_argument(
+        "--model-backends",
         "--model_backends",
         type=str,
         default="CPU",
@@ -536,12 +544,14 @@ def get_parser_options(parser: argparse.ArgumentParser) -> argparse.Namespace:
         required=False,
     )
     parser.add_argument(
+        "--models-yaml-path",
         "--models_yaml_path",
         type=str,
         help="The path to where the unified Mobile Model Config YAML resides.",
         required=True,
     )
     parser.add_argument(
+        "--include-all-operators",
         "--include_all_operators",
         action="store_true",
         default=False,
@@ -549,6 +559,7 @@ def get_parser_options(parser: argparse.ArgumentParser) -> argparse.Namespace:
         required=False,
     )
     parser.add_argument(
+        "--rule-name",
         "--rule_name",
         type=str,
         help="The name of pt_operator_library rule resulting in this generation",
diff --git a/tools/code_analyzer/gen_oplist.py b/tools/code_analyzer/gen_oplist.py
index 18104ab30cb6..1ce54cb62438 100644
--- a/tools/code_analyzer/gen_oplist.py
+++ b/tools/code_analyzer/gen_oplist.py
@@ -40,7 +40,7 @@ def throw_if_any_op_includes_overloads(selective_builder: SelectiveBuilder) -> N
         raise Exception(
             (
                 "Operators that include all overloads are "
-                + "not allowed since --allow_include_all_overloads "
+                + "not allowed since --allow-include-all-overloads "
                 + "was specified: {}"
             ).format(", ".join(ops))
         )
@@ -99,6 +99,7 @@ def main(argv: List[Any]) -> None:
     """
     parser = argparse.ArgumentParser(description="Generate operator lists")
     parser.add_argument(
+        "--output-dir",
         "--output_dir",
         help=(
             "The directory to store the output yaml files (selected_mobile_ops.h, "
@@ -107,6 +108,7 @@ def main(argv: List[Any]) -> None:
         required=True,
     )
     parser.add_argument(
+        "--model-file-list-path",
         "--model_file_list_path",
         help=(
             "Path to a file that contains the locations of individual "
@@ -117,6 +119,7 @@ def main(argv: List[Any]) -> None:
         required=True,
     )
     parser.add_argument(
+        "--allow-include-all-overloads",
         "--allow_include_all_overloads",
         help=(
             "Flag to allow operators that include all overloads. "
diff --git a/tools/dynamo/verify_dynamo.py b/tools/dynamo/verify_dynamo.py
index df03e6331728..dde6a72a1838 100644
--- a/tools/dynamo/verify_dynamo.py
+++ b/tools/dynamo/verify_dynamo.py
@@ -8,7 +8,8 @@
 from pkg_resources import packaging
 
 MIN_CUDA_VERSION = packaging.version.parse("11.6")
-MIN_PYTHON_VERSION = (3, 7)
+MIN_ROCM_VERSION = packaging.version.parse("5.4")
+MIN_PYTHON_VERSION = (3, 8)
 
 
 class VerifyDynamoError(BaseException):
@@ -52,6 +53,31 @@ def get_cuda_version():
     return packaging.version.parse(cuda_str_version)
 
 
+def get_rocm_version():
+    from torch.utils import cpp_extension
+
+    ROCM_HOME = cpp_extension._find_rocm_home()
+    if not ROCM_HOME:
+        raise VerifyDynamoError(
+            "ROCM was not found on the system, please set ROCM_HOME environment variable"
+        )
+
+    hipcc = os.path.join(ROCM_HOME, "bin", "hipcc")
+    hip_version_str = (
+        subprocess.check_output([hipcc, "--version"])
+        .strip()
+        .decode(*cpp_extension.SUBPROCESS_DECODE_ARGS)
+    )
+    hip_version = re.search(r"HIP version: (\d+[.]\d+)", hip_version_str)
+
+    if hip_version is None:
+        raise VerifyDynamoError("HIP version not found in `hipcc --version` output")
+
+    hip_str_version = hip_version.group(1)
+
+    return packaging.version.parse(hip_str_version)
+
+
 def check_cuda():
     import torch
 
@@ -81,7 +107,38 @@ def check_cuda():
             f"- minimum requirement: {MIN_CUDA_VERSION}"
         )
 
-    return cuda_ver
+    return cuda_ver if torch.version.hip is None else "None"
+
+
+def check_rocm():
+    import torch
+
+    if not torch.cuda.is_available() or torch.version.hip is None:
+        return None
+
+    # Extracts main ROCm version from full string
+    torch_rocm_ver = packaging.version.parse(
+        ".".join(list(torch.version.hip.split(".")[0:2]))
+    )
+
+    # check if torch rocm version matches system rocm version
+    rocm_ver = get_rocm_version()
+    if rocm_ver != torch_rocm_ver:
+        warnings.warn(
+            f"ROCm version mismatch, `torch` version: {torch_rocm_ver}, env version: {rocm_ver}"
+        )
+    if torch_rocm_ver < MIN_ROCM_VERSION:
+        warnings.warn(
+            f"(`torch`) ROCm version not supported: {torch_rocm_ver} "
+            f"- minimum requirement: {MIN_ROCM_VERSION}"
+        )
+    if rocm_ver < MIN_ROCM_VERSION:
+        warnings.warn(
+            f"(env) ROCm version not supported: {rocm_ver} "
+            f"- minimum requirement: {MIN_ROCM_VERSION}"
+        )
+
+    return rocm_ver if torch.version.hip else "None"
 
 
 def check_dynamo(backend, device, err_msg):
@@ -112,9 +169,6 @@ def fn(x):
             return x + x
 
         class Module(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
             def forward(self, x):
                 return x + x
 
@@ -153,12 +207,18 @@ def main():
     python_ver = check_python()
     torch_ver = check_torch()
     cuda_ver = check_cuda()
+    rocm_ver = check_rocm()
     print(
         f"Python version: {python_ver.major}.{python_ver.minor}.{python_ver.micro}\n"
         f"`torch` version: {torch_ver}\n"
         f"CUDA version: {cuda_ver}\n"
+        f"ROCM version: {rocm_ver}\n"
     )
     for args in _SANITY_CHECK_ARGS:
+        # TODO remove check when 3.11 is supported
+        if sys.version_info >= (3, 11):
+            warnings.warn("Dynamo not yet supported in Python 3.11. Skipping check.")
+            continue
         check_dynamo(*args)
     print("All required checks passed")
 
diff --git a/tools/extract_scripts.py b/tools/extract_scripts.py
index 7a9a29decc5a..c420c1565f9d 100755
--- a/tools/extract_scripts.py
+++ b/tools/extract_scripts.py
@@ -7,7 +7,7 @@
 from typing import Any, Dict, Optional
 
 import yaml
-from typing_extensions import TypedDict
+from typing_extensions import TypedDict  # Python 3.11+
 
 Step = Dict[str, Any]
 
diff --git a/tools/fast_nvcc/fast_nvcc.py b/tools/fast_nvcc/fast_nvcc.py
index 3b79e4f0eac4..659d91ae3c1f 100755
--- a/tools/fast_nvcc/fast_nvcc.py
+++ b/tools/fast_nvcc/fast_nvcc.py
@@ -16,7 +16,7 @@
 import time
 from typing import Awaitable, cast, DefaultDict, Dict, List, Match, Optional, Set
 
-from typing_extensions import TypedDict
+from typing_extensions import TypedDict  # Python 3.11+
 
 help_msg = """fast_nvcc [OPTION]... -- [NVCC_ARG]...
 
diff --git a/tools/gen_vulkan_spv.py b/tools/gen_vulkan_spv.py
index 603fbf7632ed..9269f39cda4c 100644
--- a/tools/gen_vulkan_spv.py
+++ b/tools/gen_vulkan_spv.py
@@ -61,7 +61,7 @@ def construct_mapping(self, node, deep=False):  # type: ignore[no-untyped-def]
         return mapping
 
 
-class VulkanShaderGenerator(object):
+class VulkanShaderGenerator:
     standard_header = """
 #version 450 core
 #define PRECISION $precision
diff --git a/tools/generate_torch_version.py b/tools/generate_torch_version.py
index 1586ff15fd20..9e9f73b031f8 100644
--- a/tools/generate_torch_version.py
+++ b/tools/generate_torch_version.py
@@ -61,12 +61,13 @@ def get_torch_version(sha: Optional[str] = None) -> str:
         description="Generate torch/version.py from build and environment metadata."
     )
     parser.add_argument(
+        "--is-debug",
         "--is_debug",
         type=distutils.util.strtobool,
         help="Whether this build is debug mode or not.",
     )
-    parser.add_argument("--cuda_version", type=str)
-    parser.add_argument("--hip_version", type=str)
+    parser.add_argument("--cuda-version", "--cuda_version", type=str)
+    parser.add_argument("--hip-version", "--hip_version", type=str)
 
     args = parser.parse_args()
 
diff --git a/tools/jit/gen_unboxing.py b/tools/jit/gen_unboxing.py
index 79c594a9afa0..6179d6afe482 100644
--- a/tools/jit/gen_unboxing.py
+++ b/tools/jit/gen_unboxing.py
@@ -4,7 +4,7 @@
 import pathlib
 import sys
 from dataclasses import dataclass
-from typing import List, Sequence, Union
+from typing import List, Literal, Sequence, Union
 
 import yaml
 
@@ -17,13 +17,12 @@
 from torchgen.model import Argument, NativeFunction, NativeFunctionsGroup, Variant
 from torchgen.selective_build.selector import SelectiveBuilder
 from torchgen.utils import FileManager, make_file_manager, mapMaybe, Target
-from typing_extensions import Literal
 
 
 # Generates UnboxingFunctions.h & UnboxingFunctions.cpp.
 @dataclass(frozen=True)
 class ComputeUnboxingFunctions:
-    target: Union[Literal[Target.DECLARATION], Literal[Target.DEFINITION]]
+    target: Literal[Target.DECLARATION, Target.DEFINITION]
     selector: SelectiveBuilder
 
     @method_with_native_function
@@ -205,7 +204,11 @@ def main(args: List[str]) -> None:
         default="aten/src/ATen",
     )
     parser.add_argument(
-        "-d", "--install_dir", help="output directory", default="build/aten/src/ATen"
+        "-d",
+        "--install-dir",
+        "--install_dir",
+        help="output directory",
+        default="build/aten/src/ATen",
     )
     parser.add_argument(
         "-o",
@@ -218,6 +221,7 @@ def main(args: List[str]) -> None:
         help="run without writing any files (still updates outputs)",
     )
     parser.add_argument(
+        "--op-selection-yaml-path",
         "--op_selection_yaml_path",
         help="Provide a path to the operator selection (for custom build) YAML "
         "that contains the information about the set of selected operators "
@@ -226,6 +230,7 @@ def main(args: List[str]) -> None:
         "The operator names also contain the namespace prefix (e.g. aten::)",
     )
     parser.add_argument(
+        "--op-registration-allowlist",
         "--op_registration_allowlist",
         nargs="*",
         help="filter op registrations by the allowlist (if set); "
@@ -233,6 +238,7 @@ def main(args: List[str]) -> None:
         "e.g.: aten::empty aten::conv2d ...",
     )
     parser.add_argument(
+        "--TEST-ONLY-op-registration-allowlist-yaml-path",
         "--TEST_ONLY_op_registration_allowlist_yaml_path",
         help="Provide a path to the operator selection (for custom build) YAML "
         "which contains a list of operators. It is to serve testing purpose and "
diff --git a/tools/jit/test/test_gen_unboxing.py b/tools/jit/test/test_gen_unboxing.py
index de016b164222..e4f228063199 100644
--- a/tools/jit/test/test_gen_unboxing.py
+++ b/tools/jit/test/test_gen_unboxing.py
@@ -17,7 +17,7 @@ def test_get_custom_build_selector_with_allowlist(
         mock_parse_native_yaml: NonCallableMock,
         mock_get_custom_build_selector: NonCallableMock,
     ) -> None:
-        args = ["--op_registration_allowlist=op1", "--op_selection_yaml_path=path2"]
+        args = ["--op-registration-allowlist=op1", "--op-selection-yaml-path=path2"]
         gen_unboxing.main(args)
         mock_get_custom_build_selector.assert_called_once_with(["op1"], "path2")
 
@@ -32,8 +32,8 @@ def test_get_custom_build_selector_with_allowlist_yaml(
         temp_file.write(b"- aten::add.Tensor")
         temp_file.seek(0)
         args = [
-            f"--TEST_ONLY_op_registration_allowlist_yaml_path={temp_file.name}",
-            "--op_selection_yaml_path=path2",
+            f"--TEST-ONLY-op-registration-allowlist-yaml-path={temp_file.name}",
+            "--op-selection-yaml-path=path2",
         ]
         gen_unboxing.main(args)
         mock_get_custom_build_selector.assert_called_once_with(
@@ -52,9 +52,9 @@ def test_get_custom_build_selector_with_both_allowlist_and_yaml(
         temp_file.write(b"- aten::add.Tensor")
         temp_file.seek(0)
         args = [
-            "--op_registration_allowlist=op1",
-            "--TEST_ONLY_op_registration_allowlist_yaml_path={temp_file.name}",
-            "--op_selection_yaml_path=path2",
+            "--op-registration-allowlist=op1",
+            "--TEST-ONLY-op-registration-allowlist-yaml-path={temp_file.name}",
+            "--op-selection-yaml-path=path2",
         ]
         gen_unboxing.main(args)
         mock_get_custom_build_selector.assert_called_once_with(["op1"], "path2")
diff --git a/tools/linter/adapters/actionlint_linter.py b/tools/linter/adapters/actionlint_linter.py
index d9131b37ec00..169451ca1cec 100644
--- a/tools/linter/adapters/actionlint_linter.py
+++ b/tools/linter/adapters/actionlint_linter.py
@@ -53,8 +53,7 @@ def run_command(
     try:
         return subprocess.run(
             args,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            capture_output=True,
         )
     finally:
         end_time = time.monotonic()
diff --git a/tools/linter/adapters/bazel_linter.py b/tools/linter/adapters/bazel_linter.py
new file mode 100644
index 000000000000..fd8eddea4841
--- /dev/null
+++ b/tools/linter/adapters/bazel_linter.py
@@ -0,0 +1,175 @@
+"""
+This linter ensures that users don't set a SHA hash checksum in Bazel for the http_archive.
+Although the security practice of setting the checksum is good, it doesn't work when the
+archive is downloaded from some sites like GitHub because it can change. Specifically,
+GitHub gives no guarantee to keep the same value forever. Check for more details at
+https://github.com/community/community/discussions/46034.
+"""
+import argparse
+import json
+import re
+import subprocess
+import xml.etree.ElementTree as ET
+from enum import Enum
+from typing import List, NamedTuple, Optional, Set
+from urllib.parse import urlparse
+
+
+LINTER_CODE = "BAZEL_LINTER"
+SHA256_REGEX = re.compile(r"\s*sha256\s*=\s*['\"](?P<sha256>[a-zA-Z0-9]{64})['\"]\s*,")
+DOMAINS_WITH_UNSTABLE_CHECKSUM = {"github.com"}
+
+
+class LintSeverity(str, Enum):
+    ERROR = "error"
+    WARNING = "warning"
+    ADVICE = "advice"
+    DISABLED = "disabled"
+
+
+class LintMessage(NamedTuple):
+    path: Optional[str]
+    line: Optional[int]
+    char: Optional[int]
+    code: str
+    severity: LintSeverity
+    name: str
+    original: Optional[str]
+    replacement: Optional[str]
+    description: Optional[str]
+
+
+def is_required_checksum(urls: List[Optional[str]]) -> bool:
+    if not urls:
+        return False
+
+    for url in urls:
+        if not url:
+            continue
+
+        parsed_url = urlparse(url)
+        if parsed_url.hostname in DOMAINS_WITH_UNSTABLE_CHECKSUM:
+            return False
+
+    return True
+
+
+def get_disallowed_checksums(
+    binary: str,
+) -> Set[str]:
+    """
+    Return the set of disallowed checksums from all http_archive rules
+    """
+    try:
+        # Use bazel to get the list of external dependencies in XML format
+        proc = subprocess.run(
+            [binary, "query", "kind(http_archive, //external:*)", "--output=xml"],
+            capture_output=True,
+        )
+    except OSError:
+        raise
+
+    stdout = str(proc.stdout, "utf-8").strip()
+    root = ET.fromstring(stdout)
+
+    disallowed_checksums = set()
+    # Parse all the http_archive rules in the XML output
+    for rule in root.findall('.//rule[@class="http_archive"]'):
+        urls_node = rule.find('.//list[@name="urls"]')
+        if urls_node is None:
+            continue
+        urls = [n.get("value") for n in urls_node.findall(".//string")]
+
+        checksum_node = rule.find('.//string[@name="sha256"]')
+        if checksum_node is None:
+            continue
+        checksum = checksum_node.get("value")
+
+        if not checksum:
+            continue
+
+        if not is_required_checksum(urls):
+            disallowed_checksums.add(checksum)
+
+    return disallowed_checksums
+
+
+def check_bazel(
+    filename: str,
+    disallowed_checksums: Set[str],
+) -> List[LintMessage]:
+    original = ""
+    replacement = ""
+
+    with open(filename) as f:
+        for line in f:
+            original += f"{line}"
+
+            m = SHA256_REGEX.match(line)
+            if m:
+                sha256 = m.group("sha256")
+
+                if sha256 in disallowed_checksums:
+                    continue
+
+            replacement += f"{line}"
+
+        if original == replacement:
+            return []
+
+        return [
+            LintMessage(
+                path=filename,
+                line=None,
+                char=None,
+                code=LINTER_CODE,
+                severity=LintSeverity.ADVICE,
+                name="format",
+                original=original,
+                replacement=replacement,
+                description="Found redundant SHA checksums. Run `lintrunner -a` to apply this patch.",
+            )
+        ]
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="A custom linter to detect redundant SHA checksums in Bazel",
+        fromfile_prefix_chars="@",
+    )
+    parser.add_argument(
+        "--binary",
+        required=True,
+        help="bazel binary path",
+    )
+    parser.add_argument(
+        "filenames",
+        nargs="+",
+        help="paths to lint",
+    )
+    args = parser.parse_args()
+
+    try:
+        disallowed_checksums = get_disallowed_checksums(args.binary)
+    except Exception as e:
+        err_msg = LintMessage(
+            path=None,
+            line=None,
+            char=None,
+            code=LINTER_CODE,
+            severity=LintSeverity.ERROR,
+            name="command-failed",
+            original=None,
+            replacement=None,
+            description=(f"Failed due to {e.__class__.__name__}:\n{e}"),
+        )
+        print(json.dumps(err_msg._asdict()), flush=True)
+        exit(0)
+
+    for filename in args.filenames:
+        for lint_message in check_bazel(filename, disallowed_checksums):
+            print(json.dumps(lint_message._asdict()), flush=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/linter/adapters/black_linter.py b/tools/linter/adapters/black_linter.py
index 8459b6a1e142..617bfb1d39cc 100644
--- a/tools/linter/adapters/black_linter.py
+++ b/tools/linter/adapters/black_linter.py
@@ -52,8 +52,7 @@ def _run_command(
         return subprocess.run(
             args,
             stdin=stdin,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            capture_output=True,
             shell=IS_WINDOWS,  # So batch scripts are found.
             timeout=timeout,
             check=True,
diff --git a/tools/linter/adapters/circleci_linter.py b/tools/linter/adapters/circleci_linter.py
index 6200b383ee35..517bfe9394e7 100644
--- a/tools/linter/adapters/circleci_linter.py
+++ b/tools/linter/adapters/circleci_linter.py
@@ -53,8 +53,7 @@ def run_command(args: List[str], cwd: str) -> "subprocess.CompletedProcess[bytes
         return subprocess.run(
             args,
             cwd=cwd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            capture_output=True,
             check=True,
         )
     finally:
diff --git a/tools/linter/adapters/clangformat_linter.py b/tools/linter/adapters/clangformat_linter.py
index 3445dee4e540..f30275684406 100644
--- a/tools/linter/adapters/clangformat_linter.py
+++ b/tools/linter/adapters/clangformat_linter.py
@@ -51,8 +51,7 @@ def _run_command(
     try:
         return subprocess.run(
             args,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            capture_output=True,
             shell=IS_WINDOWS,  # So batch scripts are found.
             timeout=timeout,
             check=True,
diff --git a/tools/linter/adapters/clangtidy_linter.py b/tools/linter/adapters/clangtidy_linter.py
index d7e19452df03..081c343ec3f1 100644
--- a/tools/linter/adapters/clangtidy_linter.py
+++ b/tools/linter/adapters/clangtidy_linter.py
@@ -77,8 +77,7 @@ def run_command(
     try:
         return subprocess.run(
             args,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            capture_output=True,
             check=False,
         )
     finally:
@@ -106,8 +105,7 @@ def clang_search_dirs() -> List[str]:
     result = subprocess.run(
         [compiler, "-E", "-x", "c++", "-", "-v"],
         stdin=subprocess.DEVNULL,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+        capture_output=True,
         check=True,
     )
     stderr = result.stderr.decode().strip().split("\n")
@@ -204,6 +202,7 @@ def main() -> None:
         help="clang-tidy binary path",
     )
     parser.add_argument(
+        "--build-dir",
         "--build_dir",
         required=True,
         help=(
@@ -253,6 +252,14 @@ def main() -> None:
 
     abs_build_dir = Path(args.build_dir).resolve()
 
+    # Get the absolute path to clang-tidy and use this instead of the relative
+    # path such as .lintbin/clang-tidy. The problem here is that os.chdir is
+    # per process, and the linter uses it to move between the current directory
+    # and the build folder. And there is no .lintbin directory in the latter.
+    # When it happens in a race condition, the linter command will fails with
+    # the following no such file or directory error: '.lintbin/clang-tidy'
+    binary_path = os.path.abspath(args.binary)
+
     with concurrent.futures.ThreadPoolExecutor(
         max_workers=os.cpu_count(),
         thread_name_prefix="Thread",
@@ -261,7 +268,7 @@ def main() -> None:
             executor.submit(
                 check_file,
                 filename,
-                args.binary,
+                binary_path,
                 abs_build_dir,
             ): filename
             for filename in args.filenames
diff --git a/tools/linter/adapters/cmake_linter.py b/tools/linter/adapters/cmake_linter.py
index 0847f5617cbc..c5de15352c27 100644
--- a/tools/linter/adapters/cmake_linter.py
+++ b/tools/linter/adapters/cmake_linter.py
@@ -53,8 +53,7 @@ def run_command(
     try:
         return subprocess.run(
             args,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            capture_output=True,
         )
     finally:
         end_time = time.monotonic()
diff --git a/tools/linter/adapters/constexpr_linter.py b/tools/linter/adapters/constexpr_linter.py
new file mode 100644
index 000000000000..16dd80c5d532
--- /dev/null
+++ b/tools/linter/adapters/constexpr_linter.py
@@ -0,0 +1,94 @@
+"""
+CONSTEXPR: Ensures users don't use vanilla constexpr since it causes issues
+"""
+
+import argparse
+import json
+import logging
+import sys
+
+from enum import Enum
+from typing import NamedTuple, Optional
+
+CONSTEXPR = "constexpr char"
+CONSTEXPR_MACRO = "CONSTEXPR_EXCEPT_WIN_CUDA char"
+
+LINTER_CODE = "CONSTEXPR"
+
+
+class LintSeverity(str, Enum):
+    ERROR = "error"
+
+
+class LintMessage(NamedTuple):
+    path: Optional[str]
+    line: Optional[int]
+    char: Optional[int]
+    code: str
+    severity: LintSeverity
+    name: str
+    original: Optional[str]
+    replacement: Optional[str]
+    description: Optional[str]
+
+
+def check_file(filename: str) -> Optional[LintMessage]:
+    logging.debug("Checking file %s", filename)
+
+    with open(filename, "r") as f:
+        lines = f.readlines()
+
+    for idx, line in enumerate(lines):
+        if CONSTEXPR in line:
+            original = "".join(lines)
+            replacement = original.replace(CONSTEXPR, CONSTEXPR_MACRO)
+            logging.debug(f"replacement: {replacement}")
+            return LintMessage(
+                path=filename,
+                line=idx,
+                char=None,
+                code=LINTER_CODE,
+                severity=LintSeverity.ERROR,
+                name="Vanilla constexpr used, prefer macros",
+                original=original,
+                replacement=replacement,
+                description="Vanilla constexpr used, prefer macros run `lintrunner --take CONSTEXPR -a` to apply changes.",
+            )
+    return None
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="CONSTEXPR linter",
+        fromfile_prefix_chars="@",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+    )
+    parser.add_argument(
+        "filenames",
+        nargs="+",
+        help="paths to lint",
+    )
+
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        format="<%(threadName)s:%(levelname)s> %(message)s",
+        level=logging.NOTSET
+        if args.verbose
+        else logging.DEBUG
+        if len(args.filenames) < 1000
+        else logging.INFO,
+        stream=sys.stderr,
+    )
+
+    lint_messages = []
+    for filename in args.filenames:
+        lint_message = check_file(filename)
+        if lint_message is not None:
+            lint_messages.append(lint_message)
+
+    for lint_message in lint_messages:
+        print(json.dumps(lint_message._asdict()), flush=True)
diff --git a/tools/linter/adapters/flake8_linter.py b/tools/linter/adapters/flake8_linter.py
index 26f8dd8eec3f..97b57d9c8704 100644
--- a/tools/linter/adapters/flake8_linter.py
+++ b/tools/linter/adapters/flake8_linter.py
@@ -148,8 +148,7 @@ def _run_command(
     try:
         return subprocess.run(
             args,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            capture_output=True,
             check=True,
             encoding="utf-8",
         )
diff --git a/tools/linter/adapters/grep_linter.py b/tools/linter/adapters/grep_linter.py
index f6bd714eb4a7..21c8a210b2b6 100644
--- a/tools/linter/adapters/grep_linter.py
+++ b/tools/linter/adapters/grep_linter.py
@@ -51,8 +51,7 @@ def run_command(
     try:
         return subprocess.run(
             args,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            capture_output=True,
         )
     finally:
         end_time = time.monotonic()
diff --git a/tools/linter/adapters/mypy_linter.py b/tools/linter/adapters/mypy_linter.py
index cd94879fa0f9..0cd0c62df3ca 100644
--- a/tools/linter/adapters/mypy_linter.py
+++ b/tools/linter/adapters/mypy_linter.py
@@ -67,8 +67,7 @@ def run_command(
     try:
         return subprocess.run(
             args,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            capture_output=True,
         )
     finally:
         end_time = time.monotonic()
diff --git a/tools/linter/adapters/s3_init_config.json b/tools/linter/adapters/s3_init_config.json
index a6362a12922b..dbb20e2ed7a0 100644
--- a/tools/linter/adapters/s3_init_config.json
+++ b/tools/linter/adapters/s3_init_config.json
@@ -39,5 +39,15 @@
             "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Linux_arm64/actionlint",
             "hash": "025ac157db121b33971ef24af72d73d71cda3cb1e3a94795bb2708ef4032ca76"
         }
+    },
+    "bazel": {
+        "Darwin": {
+            "download_url": "https://ossci-macos.s3.amazonaws.com/bazel-4.2.1-darwin-x86_64",
+            "hash": "74d93848f0c9d592e341e48341c53c87e3cb304a54a2a1ee9cff3df422f0b23c"
+        },
+        "Linux": {
+            "download_url": "https://ossci-linux.s3.amazonaws.com/bazel-4.2.1-linux-x86_64",
+            "hash": "1a4f3a3ce292307bceeb44f459883859c793436d564b95319aacb8af1f20557c"
+        }
     }
 }
diff --git a/tools/linter/adapters/shellcheck_linter.py b/tools/linter/adapters/shellcheck_linter.py
index 025595d39f29..bcf0b2a517b0 100644
--- a/tools/linter/adapters/shellcheck_linter.py
+++ b/tools/linter/adapters/shellcheck_linter.py
@@ -38,8 +38,7 @@ def run_command(
     try:
         return subprocess.run(
             args,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            capture_output=True,
         )
     finally:
         end_time = time.monotonic()
diff --git a/tools/linter/adapters/workflow_consistency_linter.py b/tools/linter/adapters/workflow_consistency_linter.py
index 6e5fb4db20ff..0359a52f1055 100644
--- a/tools/linter/adapters/workflow_consistency_linter.py
+++ b/tools/linter/adapters/workflow_consistency_linter.py
@@ -10,7 +10,13 @@
 from pathlib import Path
 from typing import Any, Dict, Iterable, NamedTuple, Optional
 
-from yaml import CSafeLoader, dump, load
+from yaml import dump, load
+
+# Safely load fast C Yaml loader/dumper if they are available
+try:
+    from yaml import CSafeLoader as Loader
+except ImportError:
+    from yaml import SafeLoader as Loader  # type: ignore[misc]
 
 
 class LintSeverity(str, Enum):
@@ -38,7 +44,7 @@ def glob_yamls(path: Path) -> Iterable[Path]:
 
 def load_yaml(path: Path) -> Any:
     with open(path) as f:
-        return load(f, CSafeLoader)
+        return load(f, Loader)
 
 
 def is_workflow(yaml: Any) -> bool:
diff --git a/tools/linter/clang_tidy/generate_build_files.py b/tools/linter/clang_tidy/generate_build_files.py
index 3986d3d28e4d..7e56ecb6d3b5 100644
--- a/tools/linter/clang_tidy/generate_build_files.py
+++ b/tools/linter/clang_tidy/generate_build_files.py
@@ -8,8 +8,7 @@ def run_cmd(cmd: List[str]) -> None:
     print(f"Running: {cmd}")
     result = subprocess.run(
         cmd,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+        capture_output=True,
     )
     stdout, stderr = (
         result.stdout.decode("utf-8").strip(),
@@ -22,10 +21,6 @@ def run_cmd(cmd: List[str]) -> None:
         exit(1)
 
 
-def run_timed_cmd(cmd: List[str]) -> None:
-    run_cmd(["time"] + cmd)
-
-
 def update_submodules() -> None:
     run_cmd(["git", "submodule", "update", "--init", "--recursive"])
 
@@ -34,11 +29,11 @@ def gen_compile_commands() -> None:
     os.environ["USE_NCCL"] = "0"
     os.environ["CC"] = "clang"
     os.environ["CXX"] = "clang++"
-    run_timed_cmd([sys.executable, "setup.py", "--cmake-only", "build"])
+    run_cmd([sys.executable, "setup.py", "--cmake-only", "build"])
 
 
 def run_autogen() -> None:
-    run_timed_cmd(
+    run_cmd(
         [
             sys.executable,
             "-m",
@@ -51,7 +46,7 @@ def run_autogen() -> None:
         ]
     )
 
-    run_timed_cmd(
+    run_cmd(
         [
             sys.executable,
             "tools/setup_helpers/generate_code.py",
@@ -59,7 +54,7 @@ def run_autogen() -> None:
             "aten/src/ATen/native/native_functions.yaml",
             "--tags-path",
             "aten/src/ATen/native/tags.yaml",
-            "--gen_lazy_ts_backend",
+            "--gen-lazy-ts-backend",
         ]
     )
 
diff --git a/tools/lite_interpreter/gen_selected_mobile_ops_header.py b/tools/lite_interpreter/gen_selected_mobile_ops_header.py
index aebb36ca156b..b260005d786b 100644
--- a/tools/lite_interpreter/gen_selected_mobile_ops_header.py
+++ b/tools/lite_interpreter/gen_selected_mobile_ops_header.py
@@ -147,6 +147,7 @@ def main() -> None:
     )
     parser.add_argument(
         "-p",
+        "--yaml-file-path",
         "--yaml_file_path",
         type=str,
         required=True,
@@ -154,6 +155,7 @@ def main() -> None:
     )
     parser.add_argument(
         "-o",
+        "--output-file-path",
         "--output_file_path",
         type=str,
         required=True,
diff --git a/tools/nightly.py b/tools/nightly.py
index 4d1c9291fd8b..3fa821ffb924 100755
--- a/tools/nightly.py
+++ b/tools/nightly.py
@@ -214,16 +214,15 @@ def check_branch(subcommand: str, branch: Optional[str]) -> Optional[str]:
     cmd = ["git", "status", "--untracked-files=no", "--porcelain"]
     p = subprocess.run(
         cmd,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+        capture_output=True,
         check=True,
-        universal_newlines=True,
+        text=True,
     )
     if p.stdout.strip():
         return "Need to have clean working tree to checkout!\n\n" + p.stdout
     # next check that the branch name doesn't already exist
     cmd = ["git", "show-ref", "--verify", "--quiet", "refs/heads/" + branch]
-    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False)  # type: ignore[assignment]
+    p = subprocess.run(cmd, capture_output=True, check=False)  # type: ignore[assignment]
     if not p.returncode:
         return f"Branch {branch!r} already exists"
     return None
@@ -314,7 +313,7 @@ def conda_solve(
     )
     cmd.extend(channel_args)
     cmd.extend(SPECS_TO_INSTALL)
-    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+    p = subprocess.run(cmd, capture_output=True, check=True)
     # parse solution
     solve = json.loads(p.stdout)
     link = solve["actions"]["LINK"]
@@ -363,7 +362,7 @@ def _site_packages(dirname: str, platform: str) -> str:
 def _ensure_commit(git_sha1: str) -> None:
     """Make sure that we actually have the commit locally"""
     cmd = ["git", "cat-file", "-e", git_sha1 + "^{commit}"]
-    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False)
+    p = subprocess.run(cmd, capture_output=True, check=False)
     if p.returncode == 0:
         # we have the commit locally
         return
@@ -390,10 +389,9 @@ def _nightly_version(spdir: str) -> str:
     cmd = ["git", "show", "--no-patch", "--format=%s", git_version]
     p = subprocess.run(
         cmd,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+        capture_output=True,
         check=True,
-        universal_newlines=True,
+        text=True,
     )
     m = SHA1_RE.search(p.stdout)
     if m is None:
@@ -544,9 +542,8 @@ def _available_envs() -> Dict[str, str]:
     p = subprocess.run(
         cmd,
         check=True,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        universal_newlines=True,
+        capture_output=True,
+        text=True,
     )
     lines = p.stdout.splitlines()
     envs = {}
diff --git a/tools/onnx/gen_diagnostics.py b/tools/onnx/gen_diagnostics.py
index 92960024e048..bade0a50ed92 100644
--- a/tools/onnx/gen_diagnostics.py
+++ b/tools/onnx/gen_diagnostics.py
@@ -40,13 +40,27 @@
 _PY_RULE_CLASS_TEMPLATE = """\
 class _{pascal_case_name}(infra.Rule):
     \"\"\"{short_description}\"\"\"
-    def format_message(self, {message_arguments}) -> str:  # type: ignore[override]
+    def format_message(  # type: ignore[override]
+        self,
+        {message_arguments}
+    ) -> str:
         \"\"\"Returns the formatted default message of this Rule.
 
         Message template: {message_template}
         \"\"\"
         return self.message_default_template.format({message_arguments_assigned})
 
+    def format(  # type: ignore[override]
+        self,
+        level: infra.Level,
+        {message_arguments}
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        \"\"\"Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: {message_template}
+        \"\"\"
+        return self, level, self.format_message({message_arguments_assigned})
+
 """
 
 _PY_RULE_COLLECTION_FIELD_TEMPLATE = """\
diff --git a/tools/onnx/sarif/gen_sarif.sh b/tools/onnx/sarif/gen_sarif.sh
index 2099b92838ea..a7e6ce0f6a3b 100755
--- a/tools/onnx/sarif/gen_sarif.sh
+++ b/tools/onnx/sarif/gen_sarif.sh
@@ -33,7 +33,7 @@ python -m jschema_to_python \
     -vv
 
 # Generate SARIF version file
-echo "from typing_extensions import Final" > "${ROOT}/${SARIF_DIR}/version.py"
+echo "from typing import Final" > "${ROOT}/${SARIF_DIR}/version.py"
 echo "SARIF_VERSION: Final = \"${SARIF_VERSION}\"" >> "${ROOT}/${SARIF_DIR}/version.py"
 echo "SARIF_SCHEMA_LINK: Final = \"${SARIF_SCHEMA_LINK}\"" >> "${ROOT}/${SARIF_DIR}/version.py"
 
diff --git a/tools/onnx/templates/rules.py.in b/tools/onnx/templates/rules.py.in
index 2137119d14c2..19b1e08d50fc 100644
--- a/tools/onnx/templates/rules.py.in
+++ b/tools/onnx/templates/rules.py.in
@@ -3,6 +3,7 @@ ${generated_comment}
 """
 
 import dataclasses
+from typing import Tuple
 
 # flake8: noqa
 from torch.onnx._internal.diagnostics import infra
diff --git a/tools/onnx/update_default_opset_version.py b/tools/onnx/update_default_opset_version.py
index 9c4b0e099be8..6dc6ffbd2890 100755
--- a/tools/onnx/update_default_opset_version.py
+++ b/tools/onnx/update_default_opset_version.py
@@ -107,6 +107,9 @@ def main(args: Any) -> None:
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--skip_build", action="store_true", help="Skip building pytorch"
+        "--skip-build",
+        "--skip_build",
+        action="store_true",
+        help="Skip building pytorch",
     )
     main(parser.parse_args())
diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py
index a362598fe8fe..cf2deecbe0aa 100644
--- a/tools/pyi/gen_pyi.py
+++ b/tools/pyi/gen_pyi.py
@@ -298,6 +298,7 @@ def gen_nn_functional(fm: FileManager) -> None:
         "softplus",
         "softshrink",
         "one_hot",
+        "scaled_dot_product_attention",
     ]
     import_code = ["from .. import {0} as {0}".format(_) for _ in imports]
     # TODO make these types more precise
diff --git a/tools/rules_cc/cuda_support.patch b/tools/rules_cc/cuda_support.patch
new file mode 100644
index 000000000000..d097eee5036a
--- /dev/null
+++ b/tools/rules_cc/cuda_support.patch
@@ -0,0 +1,80 @@
+diff --git cc/private/toolchain/unix_cc_configure.bzl cc/private/toolchain/unix_cc_configure.bzl
+index ba992fc..e4e8364 100644
+--- cc/private/toolchain/unix_cc_configure.bzl
++++ cc/private/toolchain/unix_cc_configure.bzl
+@@ -27,6 +27,7 @@ load(
+     "which",
+     "write_builtin_include_directory_paths",
+ )
++load("@rules_cuda//cuda:toolchain.bzl", "cuda_compiler_deps")
+ 
+ def _field(name, value):
+     """Returns properly indented top level crosstool field."""
+@@ -397,7 +398,7 @@ def configure_unix_toolchain(repository_ctx, cpu_value, overriden_tools):
+     cxx_opts = split_escaped(get_env_var(
+         repository_ctx,
+         "BAZEL_CXXOPTS",
+-        "-std=c++0x",
++        "-std=c++11",
+         False,
+     ), ":")
+ 
+@@ -463,7 +464,7 @@ def configure_unix_toolchain(repository_ctx, cpu_value, overriden_tools):
+             )),
+             "%{cc_compiler_deps}": get_starlark_list([":builtin_include_directory_paths"] + (
+                 [":cc_wrapper"] if darwin else []
+-            )),
++            ) + cuda_compiler_deps()),
+             "%{cc_toolchain_identifier}": cc_toolchain_identifier,
+             "%{compile_flags}": get_starlark_list(
+                 [
+diff --git cc/private/toolchain/unix_cc_toolchain_config.bzl cc/private/toolchain/unix_cc_toolchain_config.bzl
+index c3cf3ba..1744eb4 100644
+--- cc/private/toolchain/unix_cc_toolchain_config.bzl
++++ cc/private/toolchain/unix_cc_toolchain_config.bzl
+@@ -25,6 +25,7 @@ load(
+     "variable_with_value",
+     "with_feature_set",
+ )
++load("@rules_cuda//cuda:toolchain.bzl", "cuda_toolchain_config")
+ 
+ all_compile_actions = [
+     ACTION_NAMES.c_compile,
+@@ -580,7 +581,8 @@ def _impl(ctx):
+                 ],
+                 flag_groups = [
+                     flag_group(
+-                        flags = ["-iquote", "%{quote_include_paths}"],
++                        # -isystem because there is an nvcc thing where it doesn't forward -iquote to host compiler.
++                        flags = ["-isystem", "%{quote_include_paths}"],
+                         iterate_over = "quote_include_paths",
+                     ),
+                     flag_group(
+@@ -1152,10 +1154,15 @@ def _impl(ctx):
+             unfiltered_compile_flags_feature,
+         ]
+ 
++    cuda = cuda_toolchain_config(
++        cuda_toolchain_info = ctx.attr._cuda_toolchain_info,
++        compiler_path = ctx.attr.tool_paths["gcc"],
++    )
++
+     return cc_common.create_cc_toolchain_config_info(
+         ctx = ctx,
+-        features = features,
+-        action_configs = action_configs,
++        features = features + cuda.features,
++        action_configs = action_configs + cuda.action_configs,
+         cxx_builtin_include_directories = ctx.attr.cxx_builtin_include_directories,
+         toolchain_identifier = ctx.attr.toolchain_identifier,
+         host_system_name = ctx.attr.host_system_name,
+@@ -1192,6 +1199,9 @@ cc_toolchain_config = rule(
+         "tool_paths": attr.string_dict(),
+         "toolchain_identifier": attr.string(mandatory = True),
+         "unfiltered_compile_flags": attr.string_list(),
++        "_cuda_toolchain_info": attr.label(
++            default = Label("@rules_cuda//cuda:cuda_toolchain_info"),
++        ),
+     },
+     provides = [CcToolchainConfigInfo],
+ )
diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py
index 5ce3f3009b3c..22bf230865d9 100644
--- a/tools/setup_helpers/cmake.py
+++ b/tools/setup_helpers/cmake.py
@@ -144,7 +144,7 @@ def generate(
             args.append("-GNinja")
         elif IS_WINDOWS:
             generator = os.getenv("CMAKE_GENERATOR", "Visual Studio 15 2017")
-            supported = ["Visual Studio 15 2017", "Visual Studio 16 2019"]
+            supported = ["Visual Studio 16 2019", "Visual Studio 17 2022"]
             if generator not in supported:
                 print("Unsupported `CMAKE_GENERATOR`: " + generator)
                 print("Please set it to one of the following values: ")
diff --git a/tools/setup_helpers/env.py b/tools/setup_helpers/env.py
index cb0c4650e691..7c626f6be7c0 100644
--- a/tools/setup_helpers/env.py
+++ b/tools/setup_helpers/env.py
@@ -43,7 +43,7 @@ def lib_paths_from_base(base_path: str) -> List[str]:
     os.environ["CXXFLAGS"] = os.environ["CFLAGS"]
 
 
-class BuildType(object):
+class BuildType:
     """Checks build type. The build type will be given in :attr:`cmake_build_type_env`. If :attr:`cmake_build_type_env`
     is ``None``, then the build type will be inferred from ``CMakeCache.txt``. If ``CMakeCache.txt`` does not exist,
     os.environ['CMAKE_BUILD_TYPE'] will be used.
diff --git a/tools/setup_helpers/generate_code.py b/tools/setup_helpers/generate_code.py
index 8defd769539a..ceba33e97732 100644
--- a/tools/setup_helpers/generate_code.py
+++ b/tools/setup_helpers/generate_code.py
@@ -138,6 +138,7 @@ def main() -> None:
         help="Root directory where to install files. Defaults to the current working directory.",
     )
     parser.add_argument(
+        "--install-dir",
         "--install_dir",
         help=(
             "Deprecated. Use --gen-dir instead. The semantics are different, do not change "
@@ -159,21 +160,25 @@ def main() -> None:
         help="Path to the YAML file that contains the list of operators to include for custom build.",
     )
     parser.add_argument(
+        "--operators-yaml-path",
         "--operators_yaml_path",
         help="Path to the model YAML file that contains the list of operators to include for custom build.",
     )
     parser.add_argument(
+        "--force-schema-registration",
         "--force_schema_registration",
         action="store_true",
         help="force it to generate schema-only registrations for ops that are not"
         "listed on --selected-op-list",
     )
     parser.add_argument(
+        "--gen-lazy-ts-backend",
         "--gen_lazy_ts_backend",
         action="store_true",
         help="Enable generation of the torch::lazy TorchScript backend",
     )
     parser.add_argument(
+        "--per-operator-headers",
         "--per_operator_headers",
         action="store_true",
         help="Build lazy tensor ts backend with per-operator ATen headers, must match how ATen was built",
diff --git a/tools/shared/__init__.py b/tools/shared/__init__.py
index 6bcc9aa6271e..338dc66a8234 100644
--- a/tools/shared/__init__.py
+++ b/tools/shared/__init__.py
@@ -1,2 +1 @@
-from .cwrap_common import set_declaration_defaults, sort_by_number_of_args
 from .module_loader import import_module
diff --git a/tools/shared/cwrap_common.py b/tools/shared/cwrap_common.py
deleted file mode 100644
index 42548b9afa11..000000000000
--- a/tools/shared/cwrap_common.py
+++ /dev/null
@@ -1,229 +0,0 @@
-# this code should be common among cwrap and ATen preprocessing
-# for now, I have put it in one place but right now is copied out of cwrap
-
-import copy
-from typing import Any, Dict, Iterable, List, Union
-
-Arg = Dict[str, Any]
-
-
-def parse_arguments(args: List[Union[str, Arg]]) -> List[Arg]:
-    new_args = []
-    for arg in args:
-        # Simple arg declaration of form "<type> <name>"
-        if isinstance(arg, str):
-            t, _, name = arg.partition(" ")
-            new_args.append({"type": t, "name": name})
-        elif isinstance(arg, dict):
-            if "arg" in arg:
-                arg["type"], _, arg["name"] = arg["arg"].partition(" ")
-                del arg["arg"]
-            new_args.append(arg)
-        else:
-            raise AssertionError()
-    return new_args
-
-
-Declaration = Dict[str, Any]
-
-
-def set_declaration_defaults(declaration: Declaration) -> None:
-    if "schema_string" not in declaration:
-        # This happens for legacy TH bindings like
-        # _thnn_conv_depthwise2d_backward
-        declaration["schema_string"] = ""
-    declaration.setdefault("arguments", [])
-    declaration.setdefault("return", "void")
-    if "cname" not in declaration:
-        declaration["cname"] = declaration["name"]
-    if "backends" not in declaration:
-        declaration["backends"] = ["CPU", "CUDA"]
-    assert "api_name" not in declaration
-    declaration["api_name"] = declaration["name"]
-    # NB: keep this in sync with gen_autograd.py
-    if declaration.get("overload_name"):
-        declaration["type_wrapper_name"] = "{}_{}".format(
-            declaration["name"], declaration["overload_name"]
-        )
-    else:
-        declaration["type_wrapper_name"] = declaration["name"]
-    # TODO: Uggggh, parsing the schema string here, really???
-    declaration["operator_name_with_overload"] = declaration["schema_string"].split(
-        "("
-    )[0]
-    if declaration["schema_string"]:
-        declaration["unqual_schema_string"] = declaration["schema_string"].split("::")[
-            1
-        ]
-        declaration["unqual_operator_name_with_overload"] = declaration[
-            "operator_name_with_overload"
-        ].split("::")[1]
-    else:
-        declaration["unqual_schema_string"] = ""
-        declaration["unqual_operator_name_with_overload"] = ""
-    # Simulate multiple dispatch, even if it's not necessary
-    if "options" not in declaration:
-        declaration["options"] = [
-            {
-                "arguments": copy.deepcopy(declaration["arguments"]),
-                "schema_order_arguments": copy.deepcopy(
-                    declaration["schema_order_arguments"]
-                ),
-            }
-        ]
-        del declaration["arguments"]
-        del declaration["schema_order_arguments"]
-    # Parse arguments (some of them can be strings)
-    for option in declaration["options"]:
-        option["arguments"] = parse_arguments(option["arguments"])
-        option["schema_order_arguments"] = parse_arguments(
-            option["schema_order_arguments"]
-        )
-    # Propagate defaults from declaration to options
-    for option in declaration["options"]:
-        for k, v in declaration.items():
-            # TODO(zach): why does cwrap not propagate 'name'? I need it
-            # propagaged for ATen
-            if k != "options":
-                option.setdefault(k, v)
-
-
-# TODO(zach): added option to remove keyword handling for C++ which cannot
-# support it.
-
-Option = Dict[str, Any]
-
-
-def filter_unique_options(
-    options: Iterable[Option],
-    allow_kwarg: bool,
-    type_to_signature: Dict[str, str],
-    remove_self: bool,
-) -> List[Option]:
-    def exclude_arg(arg: Arg) -> bool:
-        return arg["type"] == "CONSTANT"  # type: ignore[no-any-return]
-
-    def exclude_arg_with_self_check(arg: Arg) -> bool:
-        return exclude_arg(arg) or (remove_self and arg["name"] == "self")
-
-    def signature(option: Option, num_kwarg_only: int) -> str:
-        if num_kwarg_only == 0:
-            kwarg_only_count = None
-        else:
-            kwarg_only_count = -num_kwarg_only
-        arg_signature = "#".join(
-            type_to_signature.get(arg["type"], arg["type"])
-            for arg in option["arguments"][:kwarg_only_count]
-            if not exclude_arg_with_self_check(arg)
-        )
-        if kwarg_only_count is None:
-            return arg_signature
-        kwarg_only_signature = "#".join(
-            arg["name"] + "#" + arg["type"]
-            for arg in option["arguments"][kwarg_only_count:]
-            if not exclude_arg(arg)
-        )
-        return arg_signature + "#-#" + kwarg_only_signature
-
-    seen_signatures = set()
-    unique = []
-    for option in options:
-        # if only check num_kwarg_only == 0 if allow_kwarg == False
-        limit = len(option["arguments"]) if allow_kwarg else 0
-        for num_kwarg_only in range(0, limit + 1):
-            sig = signature(option, num_kwarg_only)
-            if sig not in seen_signatures:
-                if num_kwarg_only > 0:
-                    for arg in option["arguments"][-num_kwarg_only:]:
-                        arg["kwarg_only"] = True
-                unique.append(option)
-                seen_signatures.add(sig)
-                break
-    return unique
-
-
-def sort_by_number_of_args(declaration: Declaration, reverse: bool = True) -> None:
-    def num_args(option: Option) -> int:
-        return len(option["arguments"])
-
-    declaration["options"].sort(key=num_args, reverse=reverse)
-
-
-class Function(object):
-    def __init__(self, name: str) -> None:
-        self.name = name
-        self.arguments: List["Argument"] = []
-
-    def add_argument(self, arg: "Argument") -> None:
-        assert isinstance(arg, Argument)
-        self.arguments.append(arg)
-
-    def __repr__(self) -> str:
-        return self.name + "(" + ", ".join(a.__repr__() for a in self.arguments) + ")"
-
-
-class Argument(object):
-    def __init__(self, _type: str, name: str, is_optional: bool):
-        self.type = _type
-        self.name = name
-        self.is_optional = is_optional
-
-    def __repr__(self) -> str:
-        return self.type + " " + self.name
-
-
-def parse_header(path: str) -> List[Function]:
-    with open(path, "r") as f:
-        lines: Iterable[Any] = f.read().split("\n")
-
-    # Remove empty lines and prebackend directives
-    lines = filter(lambda l: l and not l.startswith("#"), lines)
-    # Remove line comments
-    lines = (l.partition("//") for l in lines)
-    # Select line and comment part
-    lines = ((l[0].strip(), l[2].strip()) for l in lines)
-    # Remove trailing special signs
-    lines = ((l[0].rstrip(");").rstrip(","), l[1]) for l in lines)
-    # Split arguments
-    lines = ((l[0].split(","), l[1]) for l in lines)
-    # Flatten lines
-    new_lines = []
-    for l, c in lines:
-        for split in l:
-            new_lines.append((split, c))
-    lines = new_lines
-    del new_lines
-    # Remove unnecessary whitespace
-    lines = ((l[0].strip(), l[1]) for l in lines)
-    # Remove empty lines
-    lines = filter(lambda l: l[0], lines)
-    generic_functions = []
-    for l, c in lines:
-        if l.startswith("TH_API void THNN_"):
-            fn_name = l[len("TH_API void THNN_") :]
-            if fn_name[0] == "(" and fn_name[-2] == ")":
-                fn_name = fn_name[1:-2]
-            else:
-                fn_name = fn_name[:-1]
-            generic_functions.append(Function(fn_name))
-        elif l.startswith("TORCH_CUDA_CPP_API void THNN_"):
-            fn_name = l[len("TORCH_CUDA_CPP_API void THNN_") :]
-            if fn_name[0] == "(" and fn_name[-2] == ")":
-                fn_name = fn_name[1:-2]
-            else:
-                fn_name = fn_name[:-1]
-            generic_functions.append(Function(fn_name))
-        elif l.startswith("TORCH_CUDA_CU_API void THNN_"):
-            fn_name = l[len("TORCH_CUDA_CU_API void THNN_") :]
-            if fn_name[0] == "(" and fn_name[-2] == ")":
-                fn_name = fn_name[1:-2]
-            else:
-                fn_name = fn_name[:-1]
-            generic_functions.append(Function(fn_name))
-        elif l:
-            t, name = l.split()
-            if "*" in name:
-                t = t + "*"
-                name = name[1:]
-            generic_functions[-1].add_argument(Argument(t, name, "[OPTIONAL]" in c))
-    return generic_functions
diff --git a/tools/stats/check_disabled_tests.py b/tools/stats/check_disabled_tests.py
index 636af668a13d..a387733cf8d9 100644
--- a/tools/stats/check_disabled_tests.py
+++ b/tools/stats/check_disabled_tests.py
@@ -116,8 +116,7 @@ def get_test_reports(
         for path in artifact_paths:
             unzip(path)
 
-        for report in Path(".").glob("**/*.xml"):
-            yield report
+        yield from Path(".").glob("**/*.xml")
 
 
 def get_disabled_test_name(test_id: str) -> Tuple[str, str, str, str]:
diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py
index a798119010d2..d01a7997f46f 100644
--- a/tools/stats/import_test_stats.py
+++ b/tools/stats/import_test_stats.py
@@ -73,7 +73,7 @@ def is_cached_file_valid() -> bool:
 def get_slow_tests(
     dirpath: str, filename: str = SLOW_TESTS_FILE
 ) -> Optional[Dict[str, float]]:
-    url = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/slow-tests.json"
+    url = "https://ossci-metrics.s3.amazonaws.com/slow-tests.json"
     try:
         return fetch_and_cache(dirpath, filename, url, lambda x: x)
     except Exception:
@@ -119,7 +119,7 @@ def process_disabled_test(the_response: Dict[str, Any]) -> Dict[str, Any]:
         return disabled_test_from_issues
 
     try:
-        url = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/disabled-tests-condensed.json"
+        url = "https://ossci-metrics.s3.amazonaws.com/disabled-tests-condensed.json"
         return fetch_and_cache(dirpath, filename, url, process_disabled_test)
     except Exception:
         print("Couldn't download test skip set, leaving all tests enabled...")
diff --git a/tools/stats/print_test_stats.py b/tools/stats/print_test_stats.py
deleted file mode 100755
index 068b03598772..000000000000
--- a/tools/stats/print_test_stats.py
+++ /dev/null
@@ -1,1070 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-import bz2
-import datetime
-import json
-import math
-import os
-import re
-import statistics
-import subprocess
-import time
-from collections import defaultdict
-from pathlib import Path
-from typing import (
-    Any,
-    cast,
-    DefaultDict,
-    Dict,
-    Iterable,
-    Iterator,
-    List,
-    Optional,
-    Set,
-    Tuple,
-)
-from xml.dom import minidom
-
-from typing_extensions import TypedDict
-
-from tools.stats.s3_stat_parser import (
-    Commit,
-    get_S3_object_from_bucket,
-    get_test_stats_summaries_for_job,
-    HAVE_BOTO3,
-    newify_case,
-    Report,
-    ReportMetaMeta,
-    Status,
-    Version1Report,
-    Version2Case,
-    Version2Report,
-    VersionedReport,
-)
-from tools.stats.scribe import send_to_scribe
-
-
-SimplerSuite = Dict[str, Version2Case]
-SimplerFile = Dict[str, SimplerSuite]
-SimplerReport = Dict[str, SimplerFile]
-
-
-class Stat(TypedDict):
-    center: float
-    spread: Optional[float]
-
-
-class CaseDiff(TypedDict):
-    margin: str
-    name: str
-    was: Optional[Tuple[Stat, Status]]
-    now: Optional[Version2Case]
-
-
-class SuiteDiff(TypedDict):
-    margin: str
-    name: str
-    was: Optional[Stat]
-    now: Optional[float]
-    cases: List[CaseDiff]
-
-
-# TODO: consolidate this with the get_cases function from
-# tools/stats/test_history.py
-
-# Here we translate to a three-layer format (file -> suite -> case)
-# rather than a two-layer format (suite -> case) because as mentioned in
-# a comment in the body of this function, if we consolidate suites that
-# share a name, there will be test case name collisions, and once we
-# have those, there's no clean way to deal with it in the diffing logic.
-# It's not great to have to add a dummy empty string for the filename
-# for version 1 reports, but it's better than either losing cases that
-# share a name (for version 2 reports) or using a list of cases rather
-# than a dict.
-def simplify(report: Report) -> SimplerReport:
-    if "format_version" not in report:  # version 1 implicitly
-        v1report = cast(Version1Report, report)
-        return {
-            # we just don't have test filename information sadly, so we
-            # just make one fake filename that is the empty string
-            "": {
-                suite_name: {
-                    # This clobbers some cases that have duplicate names
-                    # because in version 1, we would merge together all
-                    # the suites with a given name (even if they came
-                    # from different files), so there were actually
-                    # situations in which two cases in the same suite
-                    # shared a name (because they actually originally
-                    # came from two suites that were then merged). It
-                    # would probably be better to warn about the cases
-                    # that we're silently discarding here, but since
-                    # we're only uploading in the new format (where
-                    # everything is also keyed by filename) going
-                    # forward, it shouldn't matter too much.
-                    case["name"]: newify_case(case)
-                    for case in suite["cases"]
-                }
-                for suite_name, suite in v1report["suites"].items()
-            }
-        }
-    else:
-        v_report = cast(VersionedReport, report)
-        version = v_report["format_version"]
-        if version == 2:
-            v2report = cast(Version2Report, v_report)
-            return {
-                filename: {
-                    suite_name: suite["cases"]
-                    for suite_name, suite in file_data["suites"].items()
-                }
-                for filename, file_data in v2report["files"].items()
-            }
-        else:
-            raise RuntimeError(f"Unknown format version: {version}")
-
-
-def plural(n: int) -> str:
-    return "" if n == 1 else "s"
-
-
-def get_base_commit(sha1: str) -> str:
-    default_branch = os.environ.get("GIT_DEFAULT_BRANCH")
-    # capture None and "" cases
-    if not default_branch:
-        default_branch = "master"
-
-    default_remote = f"origin/{default_branch}"
-    return subprocess.check_output(
-        ["git", "merge-base", sha1, default_remote],
-        encoding="ascii",
-    ).strip()
-
-
-def display_stat(
-    x: Stat,
-    format: Tuple[Tuple[int, int], Tuple[int, int]],
-) -> str:
-    spread_len = format[1][0] + 1 + format[1][1]
-    spread = x["spread"]
-    if spread is not None:
-        spread_str = f" ± {spread:{spread_len}.{format[1][1]}f}s"
-    else:
-        spread_str = " " * (3 + spread_len + 1)
-    mean_len = format[0][0] + 1 + format[0][1]
-    return f'{x["center"]:{mean_len}.{format[0][1]}f}s{spread_str}'
-
-
-def list_stat(l: List[float]) -> Stat:
-    return {
-        "center": statistics.mean(l),
-        "spread": statistics.stdev(l) if len(l) > 1 else None,
-    }
-
-
-def zero_stat() -> Stat:
-    return {"center": 0, "spread": None}
-
-
-def recenter(was: Stat, now: float) -> Stat:
-    return {"center": now - was["center"], "spread": was["spread"]}
-
-
-def sum_normals(stats: Iterable[Stat]) -> Stat:
-    """
-    Returns a stat corresponding to the sum of the given stats.
-
-    Assumes that the center and spread for each of the given stats are
-    mean and stdev, respectively.
-    """
-    l = list(stats)
-    spread: Optional[float]
-    if any(stat["spread"] is not None for stat in l):
-        spread = math.sqrt(sum((stat["spread"] or 0) ** 2 for stat in l))
-    else:
-        spread = None
-    return {
-        "center": sum(stat["center"] for stat in l),
-        "spread": spread,
-    }
-
-
-def format_seconds(seconds: List[float]) -> str:
-    if len(seconds) > 0:
-        x = list_stat(seconds)
-        return f"total time {display_stat(x, ((5, 2), (4, 2)))}".strip()
-    return ""
-
-
-def show_ancestors(num_commits: int) -> str:
-    return f"    | : ({num_commits} commit{plural(num_commits)})"
-
-
-def unlines(lines: List[str]) -> str:
-    return "".join(f"{line}\n" for line in lines)
-
-
-def matching_test_times(
-    *,
-    base_reports: Dict[Commit, List[SimplerReport]],
-    filename: str,
-    suite_name: str,
-    case_name: str,
-    status: Status,
-) -> List[float]:
-    times: List[float] = []
-    for reports in base_reports.values():
-        for report in reports:
-            file_data = report.get(filename)
-            if file_data:
-                suite = file_data.get(suite_name)
-                if suite:
-                    case = suite.get(case_name)
-                    if case:
-                        t = case["seconds"]
-                        s = case["status"]
-                        if s == status:
-                            times.append(t)
-    return times
-
-
-def analyze(
-    *,
-    head_report: SimplerReport,
-    base_reports: Dict[Commit, List[SimplerReport]],
-) -> List[SuiteDiff]:
-    nonempty_shas = [sha for sha, reports in base_reports.items() if reports]
-    # most recent main ancestor with at least one S3 report,
-    # or empty list if there are none (will show all tests as added)
-    base_report = base_reports[nonempty_shas[0]] if nonempty_shas else []
-
-    # find all relevant suites (those in either base or head or both)
-    all_reports = [head_report] + base_report
-    all_suites: Set[Tuple[str, str]] = {
-        (filename, suite_name)
-        for r in all_reports
-        for filename, file_data in r.items()
-        for suite_name in file_data.keys()
-    }
-
-    removed_suites: List[SuiteDiff] = []
-    modified_suites: List[SuiteDiff] = []
-    added_suites: List[SuiteDiff] = []
-
-    for filename, suite_name in sorted(all_suites):
-        case_diffs: List[CaseDiff] = []
-        head_suite = head_report.get(filename, {}).get(suite_name)
-        base_cases: Dict[str, Status] = dict(
-            sorted(
-                set.intersection(
-                    *[
-                        {
-                            (n, case["status"])
-                            for n, case in report.get(filename, {})
-                            .get(suite_name, {})
-                            .items()
-                        }
-                        for report in base_report
-                    ]
-                    or [set()]
-                )
-            )
-        )
-        case_stats: Dict[str, Stat] = {}
-        if head_suite:
-            now = sum(case["seconds"] for case in head_suite.values())
-            if any(
-                filename in report and suite_name in report[filename]
-                for report in base_report
-            ):
-                removed_cases: List[CaseDiff] = []
-                for case_name, case_status in base_cases.items():
-                    case_stats[case_name] = list_stat(
-                        matching_test_times(
-                            base_reports=base_reports,
-                            filename=filename,
-                            suite_name=suite_name,
-                            case_name=case_name,
-                            status=case_status,
-                        )
-                    )
-                    if case_name not in head_suite:
-                        removed_cases.append(
-                            {
-                                "margin": "-",
-                                "name": case_name,
-                                "was": (case_stats[case_name], case_status),
-                                "now": None,
-                            }
-                        )
-                modified_cases: List[CaseDiff] = []
-                added_cases: List[CaseDiff] = []
-                for head_case_name in sorted(head_suite):
-                    head_case = head_suite[head_case_name]
-                    if head_case_name in base_cases:
-                        stat = case_stats[head_case_name]
-                        base_status = base_cases[head_case_name]
-                        if head_case["status"] != base_status:
-                            modified_cases.append(
-                                {
-                                    "margin": "!",
-                                    "name": head_case_name,
-                                    "was": (stat, base_status),
-                                    "now": head_case,
-                                }
-                            )
-                    else:
-                        added_cases.append(
-                            {
-                                "margin": "+",
-                                "name": head_case_name,
-                                "was": None,
-                                "now": head_case,
-                            }
-                        )
-                # there might be a bug calculating this stdev, not sure
-                was = sum_normals(case_stats.values())
-                case_diffs = removed_cases + modified_cases + added_cases
-                if case_diffs:
-                    modified_suites.append(
-                        {
-                            "margin": " ",
-                            "name": suite_name,
-                            "was": was,
-                            "now": now,
-                            "cases": case_diffs,
-                        }
-                    )
-            else:
-                for head_case_name in sorted(head_suite):
-                    head_case = head_suite[head_case_name]
-                    case_diffs.append(
-                        {
-                            "margin": " ",
-                            "name": head_case_name,
-                            "was": None,
-                            "now": head_case,
-                        }
-                    )
-                added_suites.append(
-                    {
-                        "margin": "+",
-                        "name": suite_name,
-                        "was": None,
-                        "now": now,
-                        "cases": case_diffs,
-                    }
-                )
-        else:
-            for case_name, case_status in base_cases.items():
-                case_stats[case_name] = list_stat(
-                    matching_test_times(
-                        base_reports=base_reports,
-                        filename=filename,
-                        suite_name=suite_name,
-                        case_name=case_name,
-                        status=case_status,
-                    )
-                )
-                case_diffs.append(
-                    {
-                        "margin": " ",
-                        "name": case_name,
-                        "was": (case_stats[case_name], case_status),
-                        "now": None,
-                    }
-                )
-            removed_suites.append(
-                {
-                    "margin": "-",
-                    "name": suite_name,
-                    # there might be a bug calculating this stdev, not sure
-                    "was": sum_normals(case_stats.values()),
-                    "now": None,
-                    "cases": case_diffs,
-                }
-            )
-
-    return removed_suites + modified_suites + added_suites
-
-
-def case_diff_lines(diff: CaseDiff) -> List[str]:
-    lines = [f'def {diff["name"]}: ...']
-
-    case_fmt = ((3, 3), (2, 3))
-
-    was = diff["was"]
-    if was:
-        was_line = f"    # was {display_stat(was[0], case_fmt)}"
-        was_status = was[1]
-        if was_status:
-            was_line += f" ({was_status})"
-        lines.append(was_line)
-
-    now = diff["now"]
-    if now:
-        now_stat: Stat = {"center": now["seconds"], "spread": None}
-        now_line = f"    # now {display_stat(now_stat, case_fmt)}"
-        now_status = now["status"]
-        if now_status:
-            now_line += f" ({now_status})"
-        lines.append(now_line)
-
-    return [""] + [f'{diff["margin"]} {l}' for l in lines]
-
-
-def display_suite_diff(diff: SuiteDiff) -> str:
-    lines = [f'class {diff["name"]}:']
-
-    suite_fmt = ((4, 2), (3, 2))
-
-    was = diff["was"]
-    if was:
-        lines.append(f"    # was {display_stat(was, suite_fmt)}")
-
-    now = diff["now"]
-    if now is not None:
-        now_stat: Stat = {"center": now, "spread": None}
-        lines.append(f"    # now {display_stat(now_stat, suite_fmt)}")
-
-    for case_diff in diff["cases"]:
-        lines.extend([f"  {l}" for l in case_diff_lines(case_diff)])
-
-    return unlines([""] + [f'{diff["margin"]} {l}'.rstrip() for l in lines] + [""])
-
-
-def anomalies(diffs: List[SuiteDiff]) -> str:
-    return "".join(map(display_suite_diff, diffs))
-
-
-def graph(
-    *,
-    head_sha: Commit,
-    head_seconds: float,
-    base_seconds: Dict[Commit, List[float]],
-    on_master: bool,
-    ancestry_path: int = 0,
-    other_ancestors: int = 0,
-) -> str:
-    lines = [
-        "Commit graph (base is most recent master ancestor with at least one S3 report):",
-        "",
-        "    : (master)",
-        "    |",
-    ]
-
-    head_time_str = f"           {format_seconds([head_seconds])}"
-    if on_master:
-        lines.append(f"    * {head_sha[:10]} (HEAD)   {head_time_str}")
-    else:
-        lines.append(f"    | * {head_sha[:10]} (HEAD) {head_time_str}")
-
-        if ancestry_path > 0:
-            lines += [
-                "    | |",
-                show_ancestors(ancestry_path),
-            ]
-
-        if other_ancestors > 0:
-            lines += [
-                "    |/|",
-                show_ancestors(other_ancestors),
-                "    |",
-            ]
-        else:
-            lines.append("    |/")
-
-    is_first = True
-    for sha, seconds in base_seconds.items():
-        num_runs = len(seconds)
-        prefix = str(num_runs).rjust(3)
-        base = "(base)" if is_first and num_runs > 0 else "      "
-        if num_runs > 0:
-            is_first = False
-        t = format_seconds(seconds)
-        p = plural(num_runs)
-        if t:
-            p = f"{p}, ".ljust(3)
-        lines.append(f"    * {sha[:10]} {base} {prefix} report{p}{t}")
-
-    lines.extend(["    |", "    :"])
-
-    return unlines(lines)
-
-
-def case_delta(case: CaseDiff) -> Stat:
-    was = case["was"]
-    now = case["now"]
-    return recenter(
-        was[0] if was else zero_stat(),
-        now["seconds"] if now else 0,
-    )
-
-
-def display_final_stat(stat: Stat) -> str:
-    center = stat["center"]
-    spread = stat["spread"]
-    displayed = display_stat(
-        {"center": abs(center), "spread": spread},
-        ((4, 2), (3, 2)),
-    )
-    if center < 0:
-        sign = "-"
-    elif center > 0:
-        sign = "+"
-    else:
-        sign = " "
-    return f"{sign}{displayed}".rstrip()
-
-
-def summary_line(message: str, d: DefaultDict[str, List[CaseDiff]]) -> str:
-    all_cases = [c for cs in d.values() for c in cs]
-    tests = len(all_cases)
-    suites = len(d)
-    sp = f"{plural(suites)})".ljust(2)
-    tp = f"{plural(tests)},".ljust(2)
-    # there might be a bug calculating this stdev, not sure
-    stat = sum_normals(case_delta(c) for c in all_cases)
-    return "".join(
-        [
-            f"{message} (across {suites:>4} suite{sp}",
-            f"{tests:>6} test{tp}",
-            f" totaling {display_final_stat(stat)}",
-        ]
-    )
-
-
-def summary(analysis: List[SuiteDiff]) -> str:
-    removed_tests: DefaultDict[str, List[CaseDiff]] = defaultdict(list)
-    modified_tests: DefaultDict[str, List[CaseDiff]] = defaultdict(list)
-    added_tests: DefaultDict[str, List[CaseDiff]] = defaultdict(list)
-
-    for diff in analysis:
-        # the use of 'margin' here is not the most elegant
-        name = diff["name"]
-        margin = diff["margin"]
-        cases = diff["cases"]
-        if margin == "-":
-            removed_tests[name] += cases
-        elif margin == "+":
-            added_tests[name] += cases
-        else:
-            removed = list(filter(lambda c: c["margin"] == "-", cases))
-            added = list(filter(lambda c: c["margin"] == "+", cases))
-            modified = list(filter(lambda c: c["margin"] == "!", cases))
-            if removed:
-                removed_tests[name] += removed
-            if added:
-                added_tests[name] += added
-            if modified:
-                modified_tests[name] += modified
-
-    return unlines(
-        [
-            summary_line("Removed ", removed_tests),
-            summary_line("Modified", modified_tests),
-            summary_line("Added   ", added_tests),
-        ]
-    )
-
-
-def regression_info(
-    *,
-    head_sha: Commit,
-    head_report: Report,
-    base_reports: Dict[Commit, List[Report]],
-    job_name: str,
-    on_master: bool,
-    ancestry_path: int,
-    other_ancestors: int,
-) -> str:
-    """
-    Return a human-readable report describing any test time regressions.
-
-    The head_sha and head_report args give info about the current commit
-    and its test times. Since Python dicts maintain insertion order
-    (guaranteed as part of the language spec since 3.7), the
-    base_reports argument must list the head's several most recent
-    main commits, from newest to oldest (so the merge-base is
-    list(base_reports)[0]).
-    """
-    simpler_head = simplify(head_report)
-    simpler_base: Dict[Commit, List[SimplerReport]] = {}
-    for commit, reports in base_reports.items():
-        simpler_base[commit] = [simplify(r) for r in reports]
-    analysis = analyze(
-        head_report=simpler_head,
-        base_reports=simpler_base,
-    )
-
-    return "\n".join(
-        [
-            unlines(
-                [
-                    "----- Historic stats comparison result ------",
-                    "",
-                    f"    job: {job_name}",
-                    f"    commit: {head_sha}",
-                ]
-            ),
-            # don't print anomalies, because sometimes due to sharding, the
-            # output from this would be very long and obscure better signal
-            # anomalies(analysis),
-            graph(
-                head_sha=head_sha,
-                head_seconds=head_report["total_seconds"],
-                base_seconds={
-                    c: [r["total_seconds"] for r in rs]
-                    for c, rs in base_reports.items()
-                },
-                on_master=on_master,
-                ancestry_path=ancestry_path,
-                other_ancestors=other_ancestors,
-            ),
-            summary(analysis),
-        ]
-    )
-
-
-class TestCase:
-    def __init__(self, dom: Any) -> None:
-        self.class_name = str(dom.attributes["classname"].value)
-        self.name = str(dom.attributes["name"].value)
-        self.time = float(dom.attributes["time"].value)
-        error_elements = dom.getElementsByTagName("error")
-        # DISCLAIMER: unexpected successes and expected failures are currently not reported in assemble_s3_object
-        self.expected_failure = False
-        self.skipped = False
-        self.errored = False
-        self.unexpected_success = False
-        if len(error_elements) > 0:
-            # We are only expecting 1 element here
-            error_element = error_elements[0]
-            self.unexpected_success = (
-                error_element.hasAttribute("type")
-                and error_element.attributes["type"].value == "UnexpectedSuccess"
-            )
-            self.errored = not self.unexpected_success
-        skipped_elements = dom.getElementsByTagName("skipped")
-        if len(skipped_elements) > 0:
-            # We are only expecting 1 element here
-            skipped_element = skipped_elements[0]
-            self.expected_failure = (
-                skipped_element.hasAttribute("type")
-                and skipped_element.attributes["type"].value == "XFAIL"
-            )
-            self.skipped = not self.expected_failure
-        self.failed = len(dom.getElementsByTagName("failure")) > 0
-
-    def __repr__(self) -> str:
-        return self.__str__()
-
-    def __str__(self) -> str:
-        return (
-            f"[TestCase name: {self.name} | class_name: {self.class_name} | time: {self.time} | "
-            f"expected_failure: {self.expected_failure} | skipped: {self.skipped} | errored: {self.errored} | "
-            f"unexpected_success: {self.unexpected_success} | failed: {self.failed}]\n"
-        )
-
-
-class TestSuite:
-    def __init__(self, name: str) -> None:
-        self.name = name
-        self.test_cases: Dict[str, TestCase] = {}
-        self.failed_count = 0
-        self.skipped_count = 0
-        self.errored_count = 0
-        self.total_time = 0.0
-        # The below are currently not included in test reports
-        self.unexpected_success_count = 0
-        self.expected_failure_count = 0
-
-    def __repr__(self) -> str:
-        rc = (
-            f"{self.name} run_time: {self.total_time:.2f} tests: {len(self.test_cases)}"
-        )
-        if self.skipped_count > 0:
-            rc += f" skipped: {self.skipped_count}"
-        return f"TestSuite({rc})"
-
-    def append(self, test_case: TestCase) -> None:
-        self.test_cases[test_case.name] = test_case
-        self.total_time += test_case.time
-        self.failed_count += 1 if test_case.failed else 0
-        self.skipped_count += 1 if test_case.skipped else 0
-        self.errored_count += 1 if test_case.errored else 0
-        self.unexpected_success_count += 1 if test_case.unexpected_success else 0
-        self.expected_failure_count += 1 if test_case.expected_failure else 0
-
-    def update(self, test_case: TestCase) -> None:
-        name = test_case.name
-        assert (
-            name in self.test_cases
-        ), f"Error: attempting to replace nonexistent test case {name}"
-        # Note that time for unexpected successes and expected failures are reported as 0s
-        self.test_cases[name].time += test_case.time
-        self.test_cases[name].failed |= test_case.failed
-        self.test_cases[name].errored |= test_case.errored
-        self.test_cases[name].skipped |= test_case.skipped
-        self.test_cases[name].unexpected_success |= test_case.unexpected_success
-        self.test_cases[name].expected_failure |= test_case.expected_failure
-
-
-# Tests that spawn duplicates (usually only twice) intentionally
-MULTITESTS = [
-    "test_cpp_extensions_aot",
-    "distributed/test_distributed_spawn",
-    "distributed\\test_distributed_spawn",  # for windows
-    "distributed/test_c10d_gloo",
-    "distributed\\test_c10d_gloo",  # for windows
-    "cpp",  # The caffe2 cpp tests spawn duplicate test cases as well.
-]
-
-
-class TestFile:
-    def __init__(self, name: str) -> None:
-        self.name = name
-        self.total_time = 0.0
-        self.test_suites: Dict[str, TestSuite] = {}
-
-    def append(self, test_case: TestCase) -> None:
-        suite_name = test_case.class_name
-        if suite_name not in self.test_suites:
-            self.test_suites[suite_name] = TestSuite(suite_name)
-        if test_case.name in self.test_suites[suite_name].test_cases:
-            if self.name in MULTITESTS:
-                self.test_suites[suite_name].update(test_case)
-                self.total_time += test_case.time
-        else:
-            self.test_suites[suite_name].append(test_case)
-            self.total_time += test_case.time
-
-
-def parse_report(path: str) -> Iterator[TestCase]:
-    try:
-        dom = minidom.parse(path)
-    except Exception as e:
-        print(f"Error occurred when parsing {path}: {e}")
-        return
-    for test_case in dom.getElementsByTagName("testcase"):
-        yield TestCase(test_case)
-
-
-def get_recursive_files(folder: str, extension: str) -> Iterable[str]:
-    """
-    Get recursive list of files with given extension even.
-
-    Use it instead of glob(os.path.join(folder, '**', f'*{extension}'))
-    if folder/file names can start with `.`, which makes it hidden on Unix platforms
-    """
-    assert extension.startswith(".")
-    for root, _, files in os.walk(folder):
-        for fname in files:
-            if os.path.splitext(fname)[1] == extension:
-                yield os.path.join(root, fname)
-
-
-def parse_reports(folder: str) -> Dict[str, TestFile]:
-    tests_by_file = {}
-    for report in get_recursive_files(folder, ".xml"):
-        report_path = Path(report)
-        # basename of the directory of test-report is the test filename
-        test_filename = re.sub(r"\.", "/", report_path.parent.name)
-        if test_filename not in tests_by_file:
-            tests_by_file[test_filename] = TestFile(test_filename)
-        for test_case in parse_report(report):
-            tests_by_file[test_filename].append(test_case)
-    return tests_by_file
-
-
-def build_info() -> ReportMetaMeta:
-    return {
-        "build_pr": os.environ.get("PR_NUMBER", os.environ.get("CIRCLE_PR_NUMBER", "")),
-        "build_tag": os.environ.get("TAG", os.environ.get("CIRCLE_TAG", "")),
-        "build_sha1": os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", "")),
-        "build_base_commit": get_base_commit(
-            os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", "HEAD"))
-        ),
-        "build_branch": os.environ.get("BRANCH", os.environ.get("CIRCLE_BRANCH", "")),
-        "build_job": os.environ.get(
-            "BUILD_ENVIRONMENT", os.environ.get("CIRCLE_JOB", "")
-        ),
-        "build_workflow_id": os.environ.get(
-            "WORKFLOW_ID", os.environ.get("CIRCLE_WORKFLOW_ID", "")
-        ),
-        "build_start_time_epoch": str(
-            int(os.path.getmtime(os.path.realpath(__file__)))
-        ),
-    }
-
-
-def build_message(
-    test_file: TestFile,
-    test_suite: TestSuite,
-    test_case: TestCase,
-    meta_info: ReportMetaMeta,
-) -> Dict[str, Dict[str, Any]]:
-    return {
-        "normal": {
-            **meta_info,
-            "test_filename": test_file.name,
-            "test_suite_name": test_suite.name,
-            "test_case_name": test_case.name,
-        },
-        "int": {
-            "time": int(time.time()),
-            "test_total_count": 1,
-            "test_total_time": int(test_case.time * 1000),
-            "test_failed_count": 1 if test_case.failed > 0 else 0,
-            "test_skipped_count": 1 if test_case.skipped > 0 else 0,
-            "test_errored_count": 1 if test_case.errored > 0 else 0,
-        },
-    }
-
-
-def send_report_to_scribe(reports: Dict[str, TestFile]) -> None:
-    meta_info = build_info()
-    logs = json.dumps(
-        [
-            {
-                "category": "perfpipe_pytorch_test_times",
-                "message": json.dumps(
-                    build_message(test_file, test_suite, test_case, meta_info)
-                ),
-                "line_escape": False,
-            }
-            for test_file in reports.values()
-            for test_suite in test_file.test_suites.values()
-            for test_case in test_suite.test_cases.values()
-        ]
-    )
-    # no need to print send result as exceptions will be captured and print later.
-    send_to_scribe(logs)
-
-
-def assemble_s3_object(
-    reports: Dict[str, TestFile],
-    *,
-    total_seconds: float,
-) -> Version2Report:
-    return {
-        **build_info(),  # type: ignore[misc]
-        "total_seconds": total_seconds,
-        "format_version": 2,
-        "files": {
-            name: {
-                "total_seconds": test_file.total_time,
-                "suites": {
-                    name: {
-                        "total_seconds": suite.total_time,
-                        "cases": {
-                            name: {
-                                "seconds": case.time,
-                                "status": "errored"
-                                if case.errored
-                                else "failed"
-                                if case.failed
-                                else "skipped"
-                                if case.skipped
-                                else None,
-                            }
-                            for name, case in suite.test_cases.items()
-                        },
-                    }
-                    for name, suite in test_file.test_suites.items()
-                },
-            }
-            for name, test_file in reports.items()
-        },
-    }
-
-
-def send_report_to_s3(head_report: Version2Report) -> None:
-    job = os.getenv("BUILD_ENVIRONMENT", os.environ.get("CIRCLE_JOB"))
-    sha1 = os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", ""))
-    now = datetime.datetime.utcnow().isoformat()
-
-    # SHARD_NUMBER and TEST_CONFIG are specific to GHA, as these details would be included in CIRCLE_JOB already
-    shard = os.environ.get("SHARD_NUMBER", "")
-    test_config = os.environ.get("TEST_CONFIG")
-
-    job_report_dirname = (
-        f'{job}{f"-{test_config}" if test_config is not None else ""}{shard}'
-    )
-    key = f"test_time/{sha1}/{job_report_dirname}/{now}Z.json.bz2"  # Z meaning UTC
-    obj = get_S3_object_from_bucket("ossci-metrics", key)
-    # use bz2 because the results are smaller than gzip, and the
-    # compression time penalty we pay is only about half a second for
-    # input files of a few megabytes in size like these JSON files, and
-    # because for some reason zlib doesn't seem to play nice with the
-    # gunzip command whereas Python's bz2 does work with bzip2
-    obj.put(Body=bz2.compress(json.dumps(head_report).encode()))
-
-
-def print_regressions(head_report: Report, *, num_prev_commits: int) -> None:
-    sha1 = os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", "HEAD"))
-
-    base = get_base_commit(sha1)
-
-    count_spec = f"{base}..{sha1}"
-    intermediate_commits = int(
-        subprocess.check_output(
-            ["git", "rev-list", "--count", count_spec], encoding="ascii"
-        )
-    )
-    ancestry_path = int(
-        subprocess.check_output(
-            ["git", "rev-list", "--ancestry-path", "--count", count_spec],
-            encoding="ascii",
-        )
-    )
-
-    # if current commit is already on main, we need to exclude it from
-    # this history; otherwise we include the merge-base
-    commits = subprocess.check_output(
-        ["git", "rev-list", f"--max-count={num_prev_commits+1}", base],
-        encoding="ascii",
-    ).splitlines()
-    on_master = False
-    if base == sha1:
-        on_master = True
-        commits = commits[1:]
-    else:
-        commits = commits[:-1]
-
-    job = os.environ.get("BUILD_ENVIRONMENT", "")
-    objects: Dict[Commit, List[Report]] = defaultdict(list)
-
-    for commit in commits:
-        objects[commit]
-        summaries = get_test_stats_summaries_for_job(sha=commit, job_prefix=job)
-        for _, summary in summaries.items():
-            objects[commit].extend(summary)
-
-    print()
-    print(
-        regression_info(
-            head_sha=sha1,
-            head_report=head_report,
-            base_reports=objects,
-            job_name=job,
-            on_master=on_master,
-            ancestry_path=ancestry_path - 1,
-            other_ancestors=intermediate_commits - ancestry_path,
-        ),
-        end="",
-    )
-
-
-def positive_integer(value: str) -> float:
-    parsed = int(value)
-    if parsed < 1:
-        raise argparse.ArgumentTypeError(f"{value} is not a natural number")
-    return parsed
-
-
-def positive_float(value: str) -> float:
-    parsed = float(value)
-    if parsed <= 0.0:
-        raise argparse.ArgumentTypeError(f"{value} is not a positive rational number")
-    return parsed
-
-
-def reports_has_no_tests(reports: Dict[str, TestFile]) -> bool:
-    for test_file in reports.values():
-        for test_suite in test_file.test_suites.values():
-            if len(test_suite.test_cases) > 0:
-                return False
-    return True
-
-
-if __name__ == "__main__":
-    import argparse
-    import sys
-
-    parser = argparse.ArgumentParser(
-        "Print statistics from test XML output.",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "--longest-of-class",
-        type=positive_integer,
-        default=3,
-        metavar="N",
-        help="how many longest tests to show for each class",
-    )
-    parser.add_argument(
-        "--class-print-threshold",
-        type=positive_float,
-        default=1.0,
-        metavar="N",
-        help="Minimal total time to warrant class report",
-    )
-    parser.add_argument(
-        "--longest-of-run",
-        type=positive_integer,
-        default=10,
-        metavar="N",
-        help="how many longest tests to show from the entire run",
-    )
-    if HAVE_BOTO3:
-        parser.add_argument(
-            "--upload-to-s3",
-            action="store_true",
-            help="upload test time to S3 bucket",
-        )
-        parser.add_argument(
-            "--compare-with-s3",
-            action="store_true",
-            help="download test times for base commits and compare",
-        )
-    parser.add_argument(
-        "--num-prev-commits",
-        type=positive_integer,
-        default=10,
-        metavar="N",
-        help="how many previous commits to compare test times with",
-    )
-    parser.add_argument(
-        "--use-json",
-        metavar="FILE.json",
-        help="compare S3 with JSON file, instead of the test report folder",
-    )
-    parser.add_argument(
-        "folder",
-        help="test report folder",
-    )
-    args = parser.parse_args()
-
-    reports_by_file = parse_reports(args.folder)
-
-    if reports_has_no_tests(reports_by_file):
-        print(f"No tests in reports found in {args.folder}")
-        sys.exit(0)
-
-    try:
-        send_report_to_scribe(reports_by_file)
-    except Exception as e:
-        print(f"ERROR ENCOUNTERED WHEN UPLOADING TO SCRIBE: {e}")
-
-    total_time = 0.0
-    for filename, test_filename in reports_by_file.items():
-        for suite_name, test_suite in test_filename.test_suites.items():
-            total_time += test_suite.total_time
-
-    obj = assemble_s3_object(reports_by_file, total_seconds=total_time)
-
-    if args.upload_to_s3:
-        try:
-            send_report_to_s3(obj)
-        except Exception as e:
-            print(f"ERROR ENCOUNTERED WHEN UPLOADING TO S3: {e}")
-
-    if args.compare_with_s3:
-        head_json = obj
-        if args.use_json:
-            head_json = json.loads(Path(args.use_json).read_text())
-        try:
-            print_regressions(head_json, num_prev_commits=args.num_prev_commits)
-        except Exception as e:
-            print(f"ERROR ENCOUNTERED WHEN COMPARING AGAINST S3: {e}")
diff --git a/tools/stats/s3_stat_parser.py b/tools/stats/s3_stat_parser.py
deleted file mode 100644
index 2691888ecbfa..000000000000
--- a/tools/stats/s3_stat_parser.py
+++ /dev/null
@@ -1,244 +0,0 @@
-import bz2
-import json
-import logging
-import subprocess
-from collections import defaultdict
-from datetime import datetime, timedelta
-from typing import Any, cast, Dict, List, Optional, Tuple, Union
-
-from typing_extensions import Literal, TypedDict
-
-try:
-    import boto3  # type: ignore[import]
-    import botocore  # type: ignore[import]
-
-    HAVE_BOTO3 = True
-except ImportError:
-    HAVE_BOTO3 = False
-
-
-logger = logging.getLogger(__name__)
-
-
-OSSCI_METRICS_BUCKET = "ossci-metrics"
-
-Commit = str  # 40-digit SHA-1 hex string
-Status = Optional[Literal["errored", "failed", "skipped"]]
-
-
-class CaseMeta(TypedDict):
-    seconds: float
-
-
-class Version1Case(CaseMeta):
-    name: str
-    errored: bool
-    failed: bool
-    skipped: bool
-
-
-class Version1Suite(TypedDict):
-    total_seconds: float
-    cases: List[Version1Case]
-
-
-class ReportMetaMeta(TypedDict):
-    build_pr: str
-    build_tag: str
-    build_sha1: Commit
-    build_base_commit: Commit
-    build_branch: str
-    build_job: str
-    build_workflow_id: str
-    build_start_time_epoch: str
-
-
-class ReportMeta(ReportMetaMeta):
-    total_seconds: float
-
-
-class Version1Report(ReportMeta):
-    suites: Dict[str, Version1Suite]
-
-
-class Version2Case(CaseMeta):
-    status: Status
-
-
-class Version2Suite(TypedDict):
-    total_seconds: float
-    cases: Dict[str, Version2Case]
-
-
-class Version2File(TypedDict):
-    total_seconds: float
-    suites: Dict[str, Version2Suite]
-
-
-class VersionedReport(ReportMeta):
-    format_version: int
-
-
-# report: Version2Report implies report['format_version'] == 2
-class Version2Report(VersionedReport):
-    files: Dict[str, Version2File]
-
-
-Report = Union[Version1Report, VersionedReport]
-
-if HAVE_BOTO3:
-    S3_RESOURCE_READ_ONLY = boto3.resource(
-        "s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED)
-    )
-    S3_RESOURCE = boto3.resource("s3")
-
-
-def get_S3_bucket_readonly(bucket_name: str) -> Any:
-    return S3_RESOURCE_READ_ONLY.Bucket(bucket_name)
-
-
-def get_S3_object_from_bucket(bucket_name: str, object: str) -> Any:
-    return S3_RESOURCE.Object(bucket_name, object)
-
-
-def case_status(case: Version1Case) -> Status:
-    for k in {"errored", "failed", "skipped"}:
-        if case[k]:  # type: ignore[literal-required]
-            return cast(Status, k)
-    return None
-
-
-def newify_case(case: Version1Case) -> Version2Case:
-    return {
-        "seconds": case["seconds"],
-        "status": case_status(case),
-    }
-
-
-def get_cases(
-    *,
-    data: Report,
-    filename: Optional[str],
-    suite_name: Optional[str],
-    test_name: Optional[str],
-) -> List[Version2Case]:
-    cases: List[Version2Case] = []
-    if "format_version" not in data:  # version 1 implicitly
-        v1report = cast(Version1Report, data)
-        suites = v1report["suites"]
-        for sname, v1suite in suites.items():
-            if not suite_name or sname == suite_name:
-                for v1case in v1suite["cases"]:
-                    if not test_name or v1case["name"] == test_name:
-                        cases.append(newify_case(v1case))
-    else:
-        v_report = cast(VersionedReport, data)
-        version = v_report["format_version"]
-        if version == 2:
-            v2report = cast(Version2Report, v_report)
-            for fname, v2file in v2report["files"].items():
-                if fname == filename or not filename:
-                    for sname, v2suite in v2file["suites"].items():
-                        if sname == suite_name or not suite_name:
-                            for cname, v2case in v2suite["cases"].items():
-                                if not test_name or cname == test_name:
-                                    cases.append(v2case)
-        else:
-            raise RuntimeError(f"Unknown format version: {version}")
-    return cases
-
-
-def _parse_master_summaries(summaries: Any, jobs: List[str]) -> Dict[str, List[Report]]:
-    summary_dict = defaultdict(list)
-    for summary in summaries:
-        # master summary format: "test_time/{sha}/{job}/file"
-        summary_job = summary.key.split("/")[2]
-        if summary_job in jobs or len(jobs) == 0:
-            binary = summary.get()["Body"].read()
-            string = bz2.decompress(binary).decode("utf-8")
-            summary_dict[summary_job].append(json.loads(string))
-    return summary_dict
-
-
-def _parse_pr_summaries(
-    summaries: Any, job_prefix: str
-) -> Dict[str, List[Tuple[Report, str]]]:
-    summary_dict = defaultdict(list)
-    for summary in summaries:
-        # PR summary format: "pr_test_time/{pr}/{sha}/{job}/file"
-        summary_job = summary.key.split("/")[3]
-        summary_timestamp = summary.key.split("/")[4][: len("YYYY-MM-ddTHH:mm:ss")]
-        if not job_prefix or len(job_prefix) == 0 or summary_job.startswith(job_prefix):
-            binary = summary.get()["Body"].read()
-            string = bz2.decompress(binary).decode("utf-8")
-            summary_dict[summary_job].append((json.loads(string), summary_timestamp))
-    return summary_dict
-
-
-# Collect and decompress S3 test stats summaries into JSON.
-# data stored on S3 buckets are pathed by {sha}/{job} so we also allow
-# optional jobs filter
-def get_test_stats_summaries(
-    *, sha: str, jobs: Optional[List[str]] = None
-) -> Dict[str, List[Report]]:
-    bucket = get_S3_bucket_readonly(OSSCI_METRICS_BUCKET)
-    summaries = bucket.objects.filter(Prefix=f"test_time/{sha}")
-    return _parse_master_summaries(summaries, jobs=list(jobs or []))
-
-
-def get_test_stats_summaries_for_job(
-    *, sha: str, job_prefix: str
-) -> Dict[str, List[Report]]:
-    bucket = get_S3_bucket_readonly(OSSCI_METRICS_BUCKET)
-    summaries = bucket.objects.filter(Prefix=f"test_time/{sha}/{job_prefix}")
-    return _parse_master_summaries(summaries, jobs=list())
-
-
-def get_test_stats_summaries_for_pr(
-    *, pr: str, job_prefix: str
-) -> Dict[str, List[Tuple[Report, str]]]:
-    bucket = get_S3_bucket_readonly(OSSCI_METRICS_BUCKET)
-    summaries = bucket.objects.filter(Prefix=f"pr_test_time/{pr}/")
-    return _parse_pr_summaries(summaries, job_prefix=job_prefix)
-
-
-# This function returns a list of S3 test time reports. This function can run into errors if HAVE_BOTO3 = False
-# or the S3 bucket is somehow unavailable. Even though this function goes through ten commits' reports to find a
-# non-empty report, it is still conceivable (though highly unlikely) for this function to return no reports.
-def get_previous_reports_for_branch(
-    branch: str, ci_job_prefix: str = ""
-) -> List[Report]:
-    commit_date_ts = subprocess.check_output(
-        ["git", "show", "-s", "--format=%ct", "HEAD"], encoding="ascii"
-    ).strip()
-    commit_date = datetime.fromtimestamp(int(commit_date_ts))
-    # We go a day before this current commit to avoiding pulling incomplete reports
-    day_before_commit = str(commit_date - timedelta(days=1)).split(" ")[0]
-    # something like git rev-list --before="2021-03-04" --max-count=10 --remotes="*origin/nightly"
-    commits = subprocess.check_output(
-        [
-            "git",
-            "rev-list",
-            f"--before={day_before_commit}",
-            "--max-count=10",
-            f"--remotes=*{branch}",
-        ],
-        encoding="ascii",
-    ).splitlines()
-
-    reports: List[Report] = []
-    commit_index = 0
-    while len(reports) == 0 and commit_index < len(commits):
-        commit = commits[commit_index]
-        logger.info(f"Grabbing reports from commit: {commit}")
-        summaries = get_test_stats_summaries_for_job(
-            sha=commit, job_prefix=ci_job_prefix
-        )
-        for job_name, summary in summaries.items():
-            reports.append(summary[0])
-            if len(summary) > 1:
-                logger.warning(
-                    f"WARNING: Multiple summary objects found for {commit}/{job_name}"
-                )
-        commit_index += 1
-    return reports
diff --git a/tools/stats/scribe.py b/tools/stats/scribe.py
deleted file mode 100644
index 2ca2d8c6824f..000000000000
--- a/tools/stats/scribe.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import base64
-import bz2
-import json
-import os
-from typing import Any
-
-
-_lambda_client = None
-
-
-def sprint(*args: Any) -> None:
-    print("[scribe]", *args)
-
-
-def aws_lambda() -> Any:
-    global _lambda_client
-    # lazy import so that we don't need to introduce extra dependencies
-    import boto3  # type: ignore[import]
-
-    if _lambda_client is None:
-        _lambda_client = boto3.client("lambda")
-
-    return _lambda_client
-
-
-def invoke_lambda(name: str, payload: Any) -> Any:
-    res = aws_lambda().invoke(FunctionName=name, Payload=json.dumps(payload).encode())
-    payload = str(res["Payload"].read().decode())
-    if res.get("FunctionError"):
-        raise Exception(payload)
-    return payload
-
-
-def send_to_scribe(logs: str) -> str:
-    access_token = os.environ.get("SCRIBE_GRAPHQL_ACCESS_TOKEN", "")
-
-    # boto3 can be used when the runner has IAM roles setup
-    # currently it's used as a fallback when SCRIBE_GRAPHQL_ACCESS_TOKEN is empty
-    if access_token == "":
-        return _send_to_scribe_via_boto3(logs)
-
-    return _send_to_scribe_via_http(access_token, logs)
-
-
-def _send_to_scribe_via_boto3(logs: str) -> str:
-    sprint("Scribe access token not provided, sending report via boto3...")
-    event = {"base64_bz2_logs": base64.b64encode(bz2.compress(logs.encode())).decode()}
-    return str(invoke_lambda("gh-ci-scribe-proxy", event))
-
-
-def _send_to_scribe_via_http(access_token: str, logs: str) -> str:
-    # lazy import so that we don't need to introduce extra dependencies
-    import requests  # type: ignore[import]
-
-    sprint("Scribe access token provided, sending report via http...")
-    r = requests.post(
-        "https://graph.facebook.com/scribe_logs",
-        data={"access_token": access_token, "logs": logs},
-    )
-    r.raise_for_status()
-    return str(r.text)
diff --git a/tools/stats/test_history.py b/tools/stats/test_history.py
deleted file mode 100755
index c964fb487522..000000000000
--- a/tools/stats/test_history.py
+++ /dev/null
@@ -1,330 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import subprocess
-import sys
-from datetime import datetime, timezone
-from signal import SIG_DFL, signal, SIGPIPE
-from typing import Dict, Iterator, List, Optional, Set, Tuple
-
-from tools.stats.s3_stat_parser import get_cases, get_test_stats_summaries, Report
-
-
-def get_git_commit_history(*, path: str, ref: str) -> List[Tuple[str, datetime]]:
-    rc = subprocess.check_output(
-        ["git", "-C", path, "log", "--pretty=format:%H %ct", ref],
-    ).decode("latin-1")
-    return [
-        (x[0], datetime.fromtimestamp(int(x[1]), tz=timezone.utc))
-        for x in [line.split(" ") for line in rc.split("\n")]
-    ]
-
-
-def make_column(
-    *,
-    data: Optional[Report],
-    filename: Optional[str],
-    suite_name: Optional[str],
-    test_name: str,
-    digits: int,
-) -> Tuple[str, int]:
-    decimals = 3
-    num_length = digits + 1 + decimals
-    if data:
-        cases = get_cases(
-            data=data, filename=filename, suite_name=suite_name, test_name=test_name
-        )
-        if cases:
-            case = cases[0]
-            status = case["status"]
-            omitted = len(cases) - 1
-            if status:
-                return f"{status.rjust(num_length)} ", omitted
-            else:
-                return f'{case["seconds"]:{num_length}.{decimals}f}s', omitted
-        else:
-            return f'{"absent".rjust(num_length)} ', 0
-    else:
-        return " " * (num_length + 1), 0
-
-
-def make_columns(
-    *,
-    jobs: List[str],
-    jsons: Dict[str, Report],
-    omitted: Dict[str, int],
-    filename: Optional[str],
-    suite_name: Optional[str],
-    test_name: str,
-    digits: int,
-) -> str:
-    columns = []
-    total_omitted = 0
-    total_suites = 0
-    for job in jobs:
-        data = jsons.get(job)
-        column, omitted_suites = make_column(
-            data=data,
-            filename=filename,
-            suite_name=suite_name,
-            test_name=test_name,
-            digits=digits,
-        )
-        columns.append(column)
-        total_suites += omitted_suites
-        if job in omitted:
-            total_omitted += omitted[job]
-    if total_omitted > 0:
-        columns.append(f"({total_omitted} job re-runs omitted)")
-    if total_suites > 0:
-        columns.append(f"({total_suites} matching suites omitted)")
-    return " ".join(columns)
-
-
-def make_lines(
-    *,
-    jobs: Set[str],
-    jsons: Dict[str, List[Report]],
-    filename: Optional[str],
-    suite_name: Optional[str],
-    test_name: str,
-) -> List[str]:
-    lines = []
-    for job, reports in jsons.items():
-        for data in reports:
-            cases = get_cases(
-                data=data,
-                filename=filename,
-                suite_name=suite_name,
-                test_name=test_name,
-            )
-            if cases:
-                case = cases[0]
-                status = case["status"]
-                line = f'{job} {case["seconds"]}s{f" {status}" if status else ""}'
-                if len(cases) > 1:
-                    line += f" ({len(cases) - 1} matching suites omitted)"
-                lines.append(line)
-            elif job in jobs:
-                lines.append(f"{job} (test not found)")
-    if lines:
-        return lines
-    else:
-        return ["(no reports in S3)"]
-
-
-def history_lines(
-    *,
-    commits: List[Tuple[str, datetime]],
-    jobs: Optional[List[str]],
-    filename: Optional[str],
-    suite_name: Optional[str],
-    test_name: str,
-    delta: int,
-    sha_length: int,
-    mode: str,
-    digits: int,
-) -> Iterator[str]:
-    prev_time = datetime.now(tz=timezone.utc)
-    for sha, time in commits:
-        if (prev_time - time).total_seconds() < delta * 3600:
-            continue
-        prev_time = time
-        if jobs is None:
-            summaries = get_test_stats_summaries(sha=sha)
-        else:
-            summaries = get_test_stats_summaries(sha=sha, jobs=jobs)
-        if mode == "columns":
-            assert jobs is not None
-            # we assume that get_test_stats_summaries here doesn't
-            # return empty lists
-            omitted = {job: len(l) - 1 for job, l in summaries.items() if len(l) > 1}
-            lines = [
-                make_columns(
-                    jobs=jobs,
-                    jsons={job: l[0] for job, l in summaries.items()},
-                    omitted=omitted,
-                    filename=filename,
-                    suite_name=suite_name,
-                    test_name=test_name,
-                    digits=digits,
-                )
-            ]
-        else:
-            assert mode == "multiline"
-            lines = make_lines(
-                jobs=set(jobs or []),
-                jsons=summaries,
-                filename=filename,
-                suite_name=suite_name,
-                test_name=test_name,
-            )
-        for line in lines:
-            yield f"{time:%Y-%m-%d %H:%M:%S}Z {sha[:sha_length]} {line}".rstrip()
-
-
-class HelpFormatter(
-    argparse.ArgumentDefaultsHelpFormatter,
-    argparse.RawDescriptionHelpFormatter,
-):
-    pass
-
-
-def description() -> str:
-    return r"""
-Display the history of a test.
-
-Each line of (non-error) output starts with the timestamp and SHA1 hash
-of the commit it refers to, in this format:
-
-    YYYY-MM-DD hh:mm:ss 0123456789abcdef0123456789abcdef01234567
-
-In multiline mode, each line next includes the name of a CircleCI job,
-followed by the time of the specified test in that job at that commit.
-Example:
-
-    $ tools/stats/test_history.py --mode=multiline --ref=86a961af879 --sha-length=8 \
-      --test=test_composite_compliance_dot_cpu_float32 \
-      --job linux-xenial-py3.7-gcc5.4-test-default1 --job linux-xenial-py3.7-gcc7-test-default1
-    2022-02-18 15:47:37Z 86a961af linux-xenial-py3.7-gcc5.4-test-default1 0.001s
-    2022-02-18 15:47:37Z 86a961af linux-xenial-py3.7-gcc7-test-default1 0.001s
-    2022-02-18 15:12:34Z f5e201e4 linux-xenial-py3.7-gcc5.4-test-default1 0.001s
-    2022-02-18 15:12:34Z f5e201e4 linux-xenial-py3.7-gcc7-test-default1 0.001s
-    2022-02-18 13:14:56Z 1c0df265 linux-xenial-py3.7-gcc5.4-test-default1 0.001s
-    2022-02-18 13:14:56Z 1c0df265 linux-xenial-py3.7-gcc7-test-default1 0.001s
-    2022-02-18 13:14:56Z e73eaffd (no reports in S3)
-    2022-02-18 06:29:12Z 710f12f5 linux-xenial-py3.7-gcc5.4-test-default1 0.001s
-
-Another multiline example, this time with the --all flag:
-
-    $ tools/stats/test_history.py --mode=multiline --all --ref=86a961af879 --delta=12 --sha-length=8 \
-      --test=test_composite_compliance_dot_cuda_float32
-    2022-02-18 03:49:46Z 69389fb5 linux-bionic-cuda10.2-py3.9-gcc7-test-default1 0.001s skipped
-    2022-02-18 03:49:46Z 69389fb5 linux-bionic-cuda10.2-py3.9-gcc7-test-slow1 0.001s skipped
-    2022-02-18 03:49:46Z 69389fb5 linux-xenial-cuda11.3-py3.7-gcc7-test-default1 0.001s skipped
-    2022-02-18 03:49:46Z 69389fb5 periodic-linux-bionic-cuda11.5-py3.7-gcc7-test-default1 0.001s skipped
-    2022-02-18 03:49:46Z 69389fb5 periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test-default1 0.001s skipped
-    2022-02-18 03:49:46Z 69389fb5 periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test-default1 0.001s skipped
-
-In columns mode, the name of the job isn't printed, but the order of the
-columns is guaranteed to match the order of the jobs passed on the
-command line. Example:
-
-    $ tools/stats/test_history.py --mode=columns --ref=86a961af879 --sha-length=8 \
-      --test=test_composite_compliance_dot_cpu_float32 \
-      --job linux-xenial-py3.7-gcc5.4-test-default1 --job linux-xenial-py3.7-gcc7-test-default1
-    2022-02-18 15:47:37Z 86a961af    0.001s    0.001s
-    2022-02-18 15:12:34Z f5e201e4    0.001s    0.001s
-    2022-02-18 13:14:56Z 1c0df265    0.001s    0.001s
-    2022-02-18 13:14:56Z e73eaffd
-    2022-02-18 06:29:12Z 710f12f5    0.001s    0.001s
-    2022-02-18 05:20:30Z 51b04f27    0.001s    0.001s
-    2022-02-18 03:49:46Z 69389fb5    0.001s    0.001s
-    2022-02-18 00:19:12Z 056b6260    0.001s    0.001s
-    2022-02-17 23:58:32Z 39fb7714    0.001s    0.001s
-
-Minor note: in columns mode, a blank cell means that no report was found
-in S3, while the word "absent" means that a report was found but the
-indicated test was not found in that report.
-"""
-
-
-def parse_args(raw: List[str]) -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        __file__,
-        description=description(),
-        formatter_class=HelpFormatter,
-    )
-    parser.add_argument(
-        "--mode",
-        choices=["columns", "multiline"],
-        help="output format",
-        default="columns",
-    )
-    parser.add_argument(
-        "--pytorch",
-        help="path to local PyTorch clone",
-        default=".",
-    )
-    parser.add_argument(
-        "--ref",
-        help="starting point (most recent Git ref) to display history for",
-        default="master",
-    )
-    parser.add_argument(
-        "--delta",
-        type=int,
-        help="minimum number of hours between commits",
-        default=0,
-    )
-    parser.add_argument(
-        "--sha-length",
-        type=int,
-        help="length of the prefix of the SHA1 hash to show",
-        default=40,
-    )
-    parser.add_argument(
-        "--digits",
-        type=int,
-        help="(columns) number of digits to display before the decimal point",
-        default=4,
-    )
-    parser.add_argument(
-        "--all",
-        action="store_true",
-        help="(multiline) ignore listed jobs, show all jobs for each commit",
-    )
-    parser.add_argument(
-        "--file",
-        help="name of the file containing the test",
-    )
-    parser.add_argument(
-        "--suite",
-        help="name of the suite containing the test",
-    )
-    parser.add_argument("--test", help="name of the test", required=True)
-    parser.add_argument(
-        "--job",
-        help="names of jobs to display columns for, in order",
-        action="append",
-        default=[],
-    )
-    args = parser.parse_args(raw)
-
-    args.jobs = None if args.all else args.job
-    # We dont allow implicit or empty "--jobs", unless "--all" is specified.
-    if args.jobs == []:
-        parser.error("No jobs specified.")
-
-    return args
-
-
-def run(raw: List[str]) -> Iterator[str]:
-    args = parse_args(raw)
-
-    commits = get_git_commit_history(path=args.pytorch, ref=args.ref)
-
-    return history_lines(
-        commits=commits,
-        jobs=args.jobs,
-        filename=args.file,
-        suite_name=args.suite,
-        test_name=args.test,
-        delta=args.delta,
-        mode=args.mode,
-        sha_length=args.sha_length,
-        digits=args.digits,
-    )
-
-
-def main() -> None:
-    for line in run(sys.argv[1:]):
-        print(line, flush=True)
-
-
-if __name__ == "__main__":
-    signal(SIGPIPE, SIG_DFL)  # https://stackoverflow.com/a/30091579
-    try:
-        main()
-    except KeyboardInterrupt:
-        pass
diff --git a/tools/stats/upload_stats_lib.py b/tools/stats/upload_stats_lib.py
index c91075225a62..3f1a54e17825 100644
--- a/tools/stats/upload_stats_lib.py
+++ b/tools/stats/upload_stats_lib.py
@@ -108,10 +108,10 @@ def download_gha_artifacts(
 
 def upload_to_rockset(collection: str, docs: List[Any]) -> None:
     print(f"Writing {len(docs)} documents to Rockset")
-    client = rockset.Client(
-        api_server="api.rs2.usw2.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
+    client = rockset.RocksetClient(
+        host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
     )
-    client.Collection.retrieve(collection).add_docs(docs)
+    client.Documents.add_documents(collection=collection, data=docs)
     print("Done!")
 
 
diff --git a/tools/stats/upload_test_stats.py b/tools/stats/upload_test_stats.py
index 23695933c704..f29a98fb369b 100644
--- a/tools/stats/upload_test_stats.py
+++ b/tools/stats/upload_test_stats.py
@@ -33,8 +33,12 @@ def parse_xml_report(
     """Convert a test report xml file into a JSON-serializable list of test cases."""
     print(f"Parsing {tag}s for test report: {report}")
 
-    job_id = get_job_id(report)
-    print(f"Found job id: {job_id}")
+    try:
+        job_id = get_job_id(report)
+        print(f"Found job id: {job_id}")
+    except Exception:
+        job_id = None
+        print("Failed to find job id")
 
     test_cases: List[Dict[str, Any]] = []
 
diff --git a/tools/substitute.py b/tools/substitute.py
index 8c38aa8fee5b..c3b353bf7401 100644
--- a/tools/substitute.py
+++ b/tools/substitute.py
@@ -7,7 +7,7 @@
     parser = argparse.ArgumentParser()
     parser.add_argument("--input-file")
     parser.add_argument("--output-file")
-    parser.add_argument("--install_dir")
+    parser.add_argument("--install-dir", "--install_dir")
     parser.add_argument("--replace", action="append", nargs=2)
     options = parser.parse_args()
 
diff --git a/tools/test/test_executorch_custom_ops.py b/tools/test/test_executorch_custom_ops.py
index 5ca261362aa9..d5a4757a8451 100644
--- a/tools/test/test_executorch_custom_ops.py
+++ b/tools/test/test_executorch_custom_ops.py
@@ -1,9 +1,16 @@
+import tempfile
+import unittest
 from typing import Any, Dict
+from unittest.mock import ANY, Mock, patch
 
 import expecttest
 
+import torchgen
 from torchgen.executorch.api.custom_ops import ComputeNativeFunctionStub
+from torchgen.gen_executorch import gen_headers
 from torchgen.model import Location, NativeFunction
+from torchgen.selective_build.selector import SelectiveBuilder
+from torchgen.utils import FileManager
 
 SPACES = "    "
 
@@ -72,3 +79,45 @@ def test_schema_has_no_return_type_argument_throws(self) -> None:
         gen = ComputeNativeFunctionStub()
         with self.assertRaisesRegex(Exception, "Can't handle this return type"):
             gen(func)
+
+
+class TestGenCustomOpsHeader(unittest.TestCase):
+    @patch.object(torchgen.utils.FileManager, "write_with_template")
+    @patch.object(torchgen.utils.FileManager, "write")
+    def test_fm_writes_custom_ops_header_when_boolean_is_true(
+        self, unused: Mock, mock_method: Mock
+    ) -> None:
+        with tempfile.TemporaryDirectory() as tempdir:
+            fm = FileManager(tempdir, tempdir, False)
+            gen_headers(
+                native_functions=[],
+                gen_custom_ops_header=True,
+                custom_ops_native_functions=[],
+                static_dispatch_idx=[],
+                selector=SelectiveBuilder.get_nop_selector(),
+                backend_indices={},
+                cpu_fm=fm,
+                use_aten_lib=False,
+            )
+            mock_method.assert_called_once_with(
+                "CustomOpsNativeFunctions.h", "NativeFunctions.h", ANY
+            )
+
+    @patch.object(torchgen.utils.FileManager, "write_with_template")
+    @patch.object(torchgen.utils.FileManager, "write")
+    def test_fm_doesnot_writes_custom_ops_header_when_boolean_is_false(
+        self, unused: Mock, mock_method: Mock
+    ) -> None:
+        with tempfile.TemporaryDirectory() as tempdir:
+            fm = FileManager(tempdir, tempdir, False)
+            gen_headers(
+                native_functions=[],
+                gen_custom_ops_header=False,
+                custom_ops_native_functions=[],
+                static_dispatch_idx=[],
+                selector=SelectiveBuilder.get_nop_selector(),
+                backend_indices={},
+                cpu_fm=fm,
+                use_aten_lib=False,
+            )
+            mock_method.assert_not_called()
diff --git a/tools/test/test_executorch_gen.py b/tools/test/test_executorch_gen.py
index 1a4918096131..25bd01973475 100644
--- a/tools/test/test_executorch_gen.py
+++ b/tools/test/test_executorch_gen.py
@@ -43,7 +43,7 @@
     MkldnnCPU: mkldnn_add
     ZeroTensor: add_zerotensor
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
-  tags: canonical
+  tags: core
 
 - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -67,7 +67,7 @@
     MkldnnCPU: mkldnn_mul
     ZeroTensor: mul_zerotensor
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor
-  tags: canonical
+  tags: core
 
 """
 
@@ -84,7 +84,7 @@ def setUp(self) -> None:
         with open(self.tags_yaml_path, "w") as f:
             f.write(
                 """
-- tag: canonical
+- tag: core
   desc: test
             """
             )
@@ -181,8 +181,8 @@ def test_operators_with_different_namespaces_are_grouped_correctly(self) -> None
 namespace custom_1 {
 
 // custom_1::op_1() -> bool
-TORCH_API inline bool op_1() {
-    return ::at::native::kernel_1();
+TORCH_API inline bool op_1(torch::executor::RuntimeContext & context) {
+    return ::at::native::kernel_1(context);
 }
 
 } // namespace custom_1
@@ -195,8 +195,8 @@ def test_operators_with_different_namespaces_are_grouped_correctly(self) -> None
 namespace custom_2 {
 
 // custom_2::op_2() -> bool
-TORCH_API inline bool op_2() {
-    return ::at::native::kernel_2();
+TORCH_API inline bool op_2(torch::executor::RuntimeContext & context) {
+    return ::at::native::kernel_2(context);
 }
 
 } // namespace custom_2
diff --git a/tools/test/test_executorch_signatures.py b/tools/test/test_executorch_signatures.py
new file mode 100644
index 000000000000..6095fedc71fa
--- /dev/null
+++ b/tools/test/test_executorch_signatures.py
@@ -0,0 +1,58 @@
+import unittest
+
+from torchgen.executorch.api.types import ExecutorchCppSignature
+from torchgen.local import parametrize
+from torchgen.model import Location, NativeFunction
+
+DEFAULT_NATIVE_FUNCTION, _ = NativeFunction.from_yaml(
+    {"func": "foo.out(Tensor input, *, Tensor(a!) out) -> Tensor(a!)"},
+    loc=Location(__file__, 1),
+    valid_tags=set(),
+)
+
+
+class ExecutorchCppSignatureTest(unittest.TestCase):
+    def setUp(self) -> None:
+        self.sig = ExecutorchCppSignature.from_native_function(DEFAULT_NATIVE_FUNCTION)
+
+    def test_runtime_signature_contains_runtime_context(self) -> None:
+        # test if `RuntimeContext` argument exists in `RuntimeSignature`
+        with parametrize(
+            use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
+        ):
+            args = self.sig.arguments(include_context=True)
+            self.assertEquals(len(args), 3)
+            self.assertTrue(any(a.name == "context" for a in args))
+
+    def test_runtime_signature_does_not_contain_runtime_context(self) -> None:
+        # test if `RuntimeContext` argument is missing in `RuntimeSignature`
+        with parametrize(
+            use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
+        ):
+            args = self.sig.arguments(include_context=False)
+            self.assertEquals(len(args), 2)
+            self.assertFalse(any(a.name == "context" for a in args))
+
+    def test_runtime_signature_declaration_correct(self) -> None:
+        with parametrize(
+            use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
+        ):
+            decl = self.sig.decl(include_context=True)
+            self.assertEquals(
+                decl,
+                (
+                    "torch::executor::Tensor & foo_outf("
+                    "torch::executor::RuntimeContext & context, "
+                    "const torch::executor::Tensor & input, "
+                    "torch::executor::Tensor & out)"
+                ),
+            )
+            no_context_decl = self.sig.decl(include_context=False)
+            self.assertEquals(
+                no_context_decl,
+                (
+                    "torch::executor::Tensor & foo_outf("
+                    "const torch::executor::Tensor & input, "
+                    "torch::executor::Tensor & out)"
+                ),
+            )
diff --git a/tools/test/test_selective_build.py b/tools/test/test_selective_build.py
index bb90f01b0157..4b96ec98d399 100644
--- a/tools/test/test_selective_build.py
+++ b/tools/test/test_selective_build.py
@@ -1,5 +1,3 @@
-from __future__ import absolute_import, division, print_function, unicode_literals
-
 import unittest
 
 from torchgen.selective_build.operator import *
diff --git a/tools/test/test_stats.py b/tools/test/test_stats.py
deleted file mode 100644
index 2718308f66da..000000000000
--- a/tools/test/test_stats.py
+++ /dev/null
@@ -1,683 +0,0 @@
-# -*- coding: utf-8 -*-
-import unittest
-from typing import Dict, List
-
-from tools.stats import print_test_stats
-from tools.stats.s3_stat_parser import (
-    Commit,
-    Report,
-    ReportMetaMeta,
-    Status,
-    Version1Case,
-    Version1Report,
-    Version2Case,
-    Version2Report,
-)
-
-
-def fakehash(char: str) -> str:
-    return char * 40
-
-
-def dummy_meta_meta() -> ReportMetaMeta:
-    return {
-        "build_pr": "",
-        "build_tag": "",
-        "build_sha1": "",
-        "build_base_commit": "",
-        "build_branch": "",
-        "build_job": "",
-        "build_workflow_id": "",
-        "build_start_time_epoch": "",
-    }
-
-
-def makecase(
-    name: str,
-    seconds: float,
-    *,
-    errored: bool = False,
-    failed: bool = False,
-    skipped: bool = False,
-) -> Version1Case:
-    return {
-        "name": name,
-        "seconds": seconds,
-        "errored": errored,
-        "failed": failed,
-        "skipped": skipped,
-    }
-
-
-def make_report_v1(tests: Dict[str, List[Version1Case]]) -> Version1Report:
-    suites = {
-        suite_name: {
-            "total_seconds": sum(case["seconds"] for case in cases),
-            "cases": cases,
-        }
-        for suite_name, cases in tests.items()
-    }
-    return {
-        **dummy_meta_meta(),  # type: ignore[misc]
-        "total_seconds": sum(s["total_seconds"] for s in suites.values()),
-        "suites": suites,
-    }
-
-
-def make_case_v2(seconds: float, status: Status = None) -> Version2Case:
-    return {
-        "seconds": seconds,
-        "status": status,
-    }
-
-
-def make_report_v2(
-    tests: Dict[str, Dict[str, Dict[str, Version2Case]]]
-) -> Version2Report:
-    files = {}
-    for file_name, file_suites in tests.items():
-        suites = {
-            suite_name: {
-                "total_seconds": sum(case["seconds"] for case in cases.values()),
-                "cases": cases,
-            }
-            for suite_name, cases in file_suites.items()
-        }
-        files[file_name] = {
-            "suites": suites,
-            "total_seconds": sum(suite["total_seconds"] for suite in suites.values()),  # type: ignore[type-var]
-        }
-    return {
-        **dummy_meta_meta(),  # type: ignore[misc]
-        "format_version": 2,
-        "total_seconds": sum(s["total_seconds"] for s in files.values()),
-        "files": files,
-    }
-
-
-maxDiff = None
-
-
-class TestPrintTestStats(unittest.TestCase):
-    version1_report: Version1Report = make_report_v1(
-        {
-            # input ordering of the suites is ignored
-            "Grault": [
-                # not printed: status same and time similar
-                makecase("test_grault0", 4.78, failed=True),
-                # status same, but time increased a lot
-                makecase("test_grault2", 1.473, errored=True),
-            ],
-            # individual tests times changed, not overall suite
-            "Qux": [
-                # input ordering of the test cases is ignored
-                makecase("test_qux1", 0.001, skipped=True),
-                makecase("test_qux6", 0.002, skipped=True),
-                # time in bounds, but status changed
-                makecase("test_qux4", 7.158, failed=True),
-                # not printed because it's the same as before
-                makecase("test_qux7", 0.003, skipped=True),
-                makecase("test_qux5", 11.968),
-                makecase("test_qux3", 23.496),
-            ],
-            # new test suite
-            "Bar": [
-                makecase("test_bar2", 3.742, failed=True),
-                makecase("test_bar1", 50.447),
-            ],
-            # overall suite time changed but no individual tests
-            "Norf": [
-                makecase("test_norf1", 3),
-                makecase("test_norf2", 3),
-                makecase("test_norf3", 3),
-                makecase("test_norf4", 3),
-            ],
-            # suite doesn't show up if it doesn't change enough
-            "Foo": [
-                makecase("test_foo1", 42),
-                makecase("test_foo2", 56),
-            ],
-        }
-    )
-
-    version2_report: Version2Report = make_report_v2(
-        {
-            "test_a": {
-                "Grault": {
-                    "test_grault0": make_case_v2(4.78, "failed"),
-                    "test_grault2": make_case_v2(1.473, "errored"),
-                },
-                "Qux": {
-                    "test_qux1": make_case_v2(0.001, "skipped"),
-                    "test_qux6": make_case_v2(0.002, "skipped"),
-                    "test_qux4": make_case_v2(7.158, "failed"),
-                    "test_qux7": make_case_v2(0.003, "skipped"),
-                    "test_qux8": make_case_v2(11.968),
-                    "test_qux3": make_case_v2(23.496),
-                },
-            },
-            "test_b": {
-                "Bar": {
-                    "test_bar2": make_case_v2(3.742, "failed"),
-                    "test_bar1": make_case_v2(50.447),
-                },
-                # overall suite time changed but no individual tests
-                "Norf": {
-                    "test_norf1": make_case_v2(3),
-                    "test_norf2": make_case_v2(3),
-                    "test_norf3": make_case_v2(3),
-                    "test_norf4": make_case_v2(3),
-                },
-            },
-            "test_c": {
-                "Foo": {
-                    "test_foo1": make_case_v2(42),
-                    "test_foo2": make_case_v2(56),
-                },
-            },
-        }
-    )
-
-    def test_simplify(self) -> None:
-        self.assertEqual(
-            {
-                "": {
-                    "Bar": {
-                        "test_bar1": {"seconds": 50.447, "status": None},
-                        "test_bar2": {"seconds": 3.742, "status": "failed"},
-                    },
-                    "Foo": {
-                        "test_foo1": {"seconds": 42, "status": None},
-                        "test_foo2": {"seconds": 56, "status": None},
-                    },
-                    "Grault": {
-                        "test_grault0": {"seconds": 4.78, "status": "failed"},
-                        "test_grault2": {"seconds": 1.473, "status": "errored"},
-                    },
-                    "Norf": {
-                        "test_norf1": {"seconds": 3, "status": None},
-                        "test_norf3": {"seconds": 3, "status": None},
-                        "test_norf2": {"seconds": 3, "status": None},
-                        "test_norf4": {"seconds": 3, "status": None},
-                    },
-                    "Qux": {
-                        "test_qux1": {"seconds": 0.001, "status": "skipped"},
-                        "test_qux3": {"seconds": 23.496, "status": None},
-                        "test_qux4": {"seconds": 7.158, "status": "failed"},
-                        "test_qux5": {"seconds": 11.968, "status": None},
-                        "test_qux6": {"seconds": 0.002, "status": "skipped"},
-                        "test_qux7": {"seconds": 0.003, "status": "skipped"},
-                    },
-                },
-            },
-            print_test_stats.simplify(self.version1_report),
-        )
-
-        self.assertEqual(
-            {
-                "test_a": {
-                    "Grault": {
-                        "test_grault0": {"seconds": 4.78, "status": "failed"},
-                        "test_grault2": {"seconds": 1.473, "status": "errored"},
-                    },
-                    "Qux": {
-                        "test_qux1": {"seconds": 0.001, "status": "skipped"},
-                        "test_qux3": {"seconds": 23.496, "status": None},
-                        "test_qux4": {"seconds": 7.158, "status": "failed"},
-                        "test_qux6": {"seconds": 0.002, "status": "skipped"},
-                        "test_qux7": {"seconds": 0.003, "status": "skipped"},
-                        "test_qux8": {"seconds": 11.968, "status": None},
-                    },
-                },
-                "test_b": {
-                    "Bar": {
-                        "test_bar1": {"seconds": 50.447, "status": None},
-                        "test_bar2": {"seconds": 3.742, "status": "failed"},
-                    },
-                    "Norf": {
-                        "test_norf1": {"seconds": 3, "status": None},
-                        "test_norf2": {"seconds": 3, "status": None},
-                        "test_norf3": {"seconds": 3, "status": None},
-                        "test_norf4": {"seconds": 3, "status": None},
-                    },
-                },
-                "test_c": {
-                    "Foo": {
-                        "test_foo1": {"seconds": 42, "status": None},
-                        "test_foo2": {"seconds": 56, "status": None},
-                    },
-                },
-            },
-            print_test_stats.simplify(self.version2_report),
-        )
-
-    def test_analysis(self) -> None:
-        head_report = self.version1_report
-
-        base_reports: Dict[Commit, List[Report]] = {
-            # bbbb has no reports, so base is cccc instead
-            fakehash("b"): [],
-            fakehash("c"): [
-                make_report_v1(
-                    {
-                        "Baz": [
-                            makecase("test_baz2", 13.605),
-                            # no recent suites have & skip this test
-                            makecase("test_baz1", 0.004, skipped=True),
-                        ],
-                        "Foo": [
-                            makecase("test_foo1", 43),
-                            # test added since dddd
-                            makecase("test_foo2", 57),
-                        ],
-                        "Grault": [
-                            makecase("test_grault0", 4.88, failed=True),
-                            makecase("test_grault1", 11.967, failed=True),
-                            makecase("test_grault2", 0.395, errored=True),
-                            makecase("test_grault3", 30.460),
-                        ],
-                        "Norf": [
-                            makecase("test_norf1", 2),
-                            makecase("test_norf2", 2),
-                            makecase("test_norf3", 2),
-                            makecase("test_norf4", 2),
-                        ],
-                        "Qux": [
-                            makecase("test_qux3", 4.978, errored=True),
-                            makecase("test_qux7", 0.002, skipped=True),
-                            makecase("test_qux2", 5.618),
-                            makecase("test_qux4", 7.766, errored=True),
-                            makecase("test_qux6", 23.589, failed=True),
-                        ],
-                    }
-                ),
-            ],
-            fakehash("d"): [
-                make_report_v1(
-                    {
-                        "Foo": [
-                            makecase("test_foo1", 40),
-                            # removed in cccc
-                            makecase("test_foo3", 17),
-                        ],
-                        "Baz": [
-                            # not skipped, so not included in stdev
-                            makecase("test_baz1", 3.14),
-                        ],
-                        "Qux": [
-                            makecase("test_qux7", 0.004, skipped=True),
-                            makecase("test_qux2", 6.02),
-                            makecase("test_qux4", 20.932),
-                        ],
-                        "Norf": [
-                            makecase("test_norf1", 3),
-                            makecase("test_norf2", 3),
-                            makecase("test_norf3", 3),
-                            makecase("test_norf4", 3),
-                        ],
-                        "Grault": [
-                            makecase("test_grault0", 5, failed=True),
-                            makecase("test_grault1", 14.325, failed=True),
-                            makecase("test_grault2", 0.31, errored=True),
-                        ],
-                    }
-                ),
-            ],
-            fakehash("e"): [],
-            fakehash("f"): [
-                make_report_v1(
-                    {
-                        "Foo": [
-                            makecase("test_foo3", 24),
-                            makecase("test_foo1", 43),
-                        ],
-                        "Baz": [
-                            makecase("test_baz2", 16.857),
-                        ],
-                        "Qux": [
-                            makecase("test_qux2", 6.422),
-                            makecase("test_qux4", 6.382, errored=True),
-                        ],
-                        "Norf": [
-                            makecase("test_norf1", 0.9),
-                            makecase("test_norf3", 0.9),
-                            makecase("test_norf2", 0.9),
-                            makecase("test_norf4", 0.9),
-                        ],
-                        "Grault": [
-                            makecase("test_grault0", 4.7, failed=True),
-                            makecase("test_grault1", 13.146, failed=True),
-                            makecase("test_grault2", 0.48, errored=True),
-                        ],
-                    }
-                ),
-            ],
-        }
-
-        simpler_head = print_test_stats.simplify(head_report)
-        simpler_base = {}
-        for commit, reports in base_reports.items():
-            simpler_base[commit] = [print_test_stats.simplify(r) for r in reports]
-        analysis = print_test_stats.analyze(
-            head_report=simpler_head,
-            base_reports=simpler_base,
-        )
-
-        self.assertEqual(
-            """\
-
-- class Baz:
--     # was   15.23s ±   2.30s
--
--     def test_baz1: ...
--         # was   0.004s           (skipped)
--
--     def test_baz2: ...
--         # was  15.231s ±  2.300s
-
-
-  class Grault:
-      # was   48.86s ±   1.19s
-      # now    6.25s
-
-    - def test_grault1: ...
-    -     # was  13.146s ±  1.179s (failed)
-
-    - def test_grault3: ...
-    -     # was  30.460s
-
-
-  class Qux:
-      # was   41.66s ±   1.06s
-      # now   42.63s
-
-    - def test_qux2: ...
-    -     # was   6.020s ±  0.402s
-
-    ! def test_qux3: ...
-    !     # was   4.978s           (errored)
-    !     # now  23.496s
-
-    ! def test_qux4: ...
-    !     # was   7.074s ±  0.979s (errored)
-    !     # now   7.158s           (failed)
-
-    ! def test_qux6: ...
-    !     # was  23.589s           (failed)
-    !     # now   0.002s           (skipped)
-
-    + def test_qux1: ...
-    +     # now   0.001s           (skipped)
-
-    + def test_qux5: ...
-    +     # now  11.968s
-
-
-+ class Bar:
-+     # now   54.19s
-+
-+     def test_bar1: ...
-+         # now  50.447s
-+
-+     def test_bar2: ...
-+         # now   3.742s           (failed)
-
-""",
-            print_test_stats.anomalies(analysis),
-        )
-
-    def test_graph(self) -> None:
-        # HEAD is on master
-        self.assertEqual(
-            """\
-Commit graph (base is most recent master ancestor with at least one S3 report):
-
-    : (master)
-    |
-    * aaaaaaaaaa (HEAD)              total time   502.99s
-    * bbbbbbbbbb (base)   1 report,  total time    47.84s
-    * cccccccccc          1 report,  total time   332.50s
-    * dddddddddd          0 reports
-    |
-    :
-""",
-            print_test_stats.graph(
-                head_sha=fakehash("a"),
-                head_seconds=502.99,
-                base_seconds={
-                    fakehash("b"): [47.84],
-                    fakehash("c"): [332.50],
-                    fakehash("d"): [],
-                },
-                on_master=True,
-            ),
-        )
-
-        self.assertEqual(
-            """\
-Commit graph (base is most recent master ancestor with at least one S3 report):
-
-    : (master)
-    |
-    | * aaaaaaaaaa (HEAD)            total time  9988.77s
-    |/
-    * bbbbbbbbbb (base) 121 reports, total time  7654.32s ±   55.55s
-    * cccccccccc         20 reports, total time  5555.55s ±  253.19s
-    * dddddddddd          1 report,  total time  1234.56s
-    |
-    :
-""",
-            print_test_stats.graph(
-                head_sha=fakehash("a"),
-                head_seconds=9988.77,
-                base_seconds={
-                    fakehash("b"): [7598.77] * 60 + [7654.32] + [7709.87] * 60,
-                    fakehash("c"): [5308.77] * 10 + [5802.33] * 10,
-                    fakehash("d"): [1234.56],
-                },
-                on_master=False,
-            ),
-        )
-
-        self.assertEqual(
-            """\
-Commit graph (base is most recent master ancestor with at least one S3 report):
-
-    : (master)
-    |
-    | * aaaaaaaaaa (HEAD)            total time    25.52s
-    | |
-    | : (5 commits)
-    |/
-    * bbbbbbbbbb          0 reports
-    * cccccccccc          0 reports
-    * dddddddddd (base)  15 reports, total time    58.92s ±   25.82s
-    |
-    :
-""",
-            print_test_stats.graph(
-                head_sha=fakehash("a"),
-                head_seconds=25.52,
-                base_seconds={
-                    fakehash("b"): [],
-                    fakehash("c"): [],
-                    fakehash("d"): [52.25] * 14 + [152.26],
-                },
-                on_master=False,
-                ancestry_path=5,
-            ),
-        )
-
-        self.assertEqual(
-            """\
-Commit graph (base is most recent master ancestor with at least one S3 report):
-
-    : (master)
-    |
-    | * aaaaaaaaaa (HEAD)            total time     0.08s
-    |/|
-    | : (1 commit)
-    |
-    * bbbbbbbbbb          0 reports
-    * cccccccccc (base)   1 report,  total time     0.09s
-    * dddddddddd          3 reports, total time     0.10s ±    0.05s
-    |
-    :
-""",
-            print_test_stats.graph(
-                head_sha=fakehash("a"),
-                head_seconds=0.08,
-                base_seconds={
-                    fakehash("b"): [],
-                    fakehash("c"): [0.09],
-                    fakehash("d"): [0.05, 0.10, 0.15],
-                },
-                on_master=False,
-                other_ancestors=1,
-            ),
-        )
-
-        self.assertEqual(
-            """\
-Commit graph (base is most recent master ancestor with at least one S3 report):
-
-    : (master)
-    |
-    | * aaaaaaaaaa (HEAD)            total time     5.98s
-    | |
-    | : (1 commit)
-    |/|
-    | : (7 commits)
-    |
-    * bbbbbbbbbb (base)   2 reports, total time     6.02s ±    1.71s
-    * cccccccccc          0 reports
-    * dddddddddd         10 reports, total time     5.84s ±    0.92s
-    |
-    :
-""",
-            print_test_stats.graph(
-                head_sha=fakehash("a"),
-                head_seconds=5.98,
-                base_seconds={
-                    fakehash("b"): [4.81, 7.23],
-                    fakehash("c"): [],
-                    fakehash("d"): [4.97] * 5 + [6.71] * 5,
-                },
-                on_master=False,
-                ancestry_path=1,
-                other_ancestors=7,
-            ),
-        )
-
-    def test_regression_info(self) -> None:
-        self.assertEqual(
-            """\
------ Historic stats comparison result ------
-
-    job: foo_job
-    commit: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-
-Commit graph (base is most recent master ancestor with at least one S3 report):
-
-    : (master)
-    |
-    | * aaaaaaaaaa (HEAD)            total time     3.02s
-    |/
-    * bbbbbbbbbb (base)   1 report,  total time    41.00s
-    * cccccccccc          1 report,  total time    43.00s
-    |
-    :
-
-Removed  (across    1 suite)      1 test,  totaling -   1.00s
-Modified (across    1 suite)      1 test,  totaling -  41.48s ±   2.12s
-Added    (across    1 suite)      1 test,  totaling +   3.00s
-""",
-            print_test_stats.regression_info(
-                head_sha=fakehash("a"),
-                head_report=make_report_v1(
-                    {
-                        "Foo": [
-                            makecase("test_foo", 0.02, skipped=True),
-                            makecase("test_baz", 3),
-                        ]
-                    }
-                ),
-                base_reports={
-                    fakehash("b"): [
-                        make_report_v1(
-                            {
-                                "Foo": [
-                                    makecase("test_foo", 40),
-                                    makecase("test_bar", 1),
-                                ],
-                            }
-                        ),
-                    ],
-                    fakehash("c"): [
-                        make_report_v1(
-                            {
-                                "Foo": [
-                                    makecase("test_foo", 43),
-                                ],
-                            }
-                        ),
-                    ],
-                },
-                job_name="foo_job",
-                on_master=False,
-                ancestry_path=0,
-                other_ancestors=0,
-            ),
-        )
-
-    def test_regression_info_new_job(self) -> None:
-        self.assertEqual(
-            """\
------ Historic stats comparison result ------
-
-    job: foo_job
-    commit: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
-
-Commit graph (base is most recent master ancestor with at least one S3 report):
-
-    : (master)
-    |
-    | * aaaaaaaaaa (HEAD)            total time     3.02s
-    | |
-    | : (3 commits)
-    |/|
-    | : (2 commits)
-    |
-    * bbbbbbbbbb          0 reports
-    * cccccccccc          0 reports
-    |
-    :
-
-Removed  (across    0 suites)     0 tests, totaling     0.00s
-Modified (across    0 suites)     0 tests, totaling     0.00s
-Added    (across    1 suite)      2 tests, totaling +   3.02s
-""",
-            print_test_stats.regression_info(
-                head_sha=fakehash("a"),
-                head_report=make_report_v1(
-                    {
-                        "Foo": [
-                            makecase("test_foo", 0.02, skipped=True),
-                            makecase("test_baz", 3),
-                        ]
-                    }
-                ),
-                base_reports={
-                    fakehash("b"): [],
-                    fakehash("c"): [],
-                },
-                job_name="foo_job",
-                on_master=False,
-                ancestry_path=3,
-                other_ancestors=2,
-            ),
-        )
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tools/test/test_test_history.py b/tools/test/test_test_history.py
deleted file mode 100644
index 7851ca3f510f..000000000000
--- a/tools/test/test_test_history.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import itertools
-import re
-import shlex
-import unittest
-from typing import List, Optional
-
-from tools.stats import test_history
-from typing_extensions import TypedDict
-
-
-class Example(TypedDict):
-    cmd: str
-    args: List[str]
-    lines: List[str]
-
-
-def parse_block(block: List[str]) -> Optional[Example]:
-    if block:
-        match = re.match(r"^\$ ([^ ]+) (.*)$", block[0])
-        if match:
-            cmd, first = match.groups()
-            args = []
-            for i, line in enumerate([first] + block[1:]):
-                if line.endswith("\\"):
-                    args.append(line[:-1])
-                else:
-                    args.append(line)
-                    break
-            return {
-                "cmd": cmd,
-                "args": shlex.split("".join(args)),
-                "lines": block[i + 1 :],
-            }
-    return None
-
-
-def parse_description(description: str) -> List[Example]:
-    examples: List[Example] = []
-    for block in description.split("\n\n"):
-        matches = [re.match(r"^    (.*)$", line) for line in block.splitlines()]
-        if all(matches):
-            lines = []
-            for match in matches:
-                assert match
-                (line,) = match.groups()
-                lines.append(line)
-            example = parse_block(lines)
-            if example:
-                examples.append(example)
-    return examples
-
-
-@unittest.skip("Skipping as this test is fragile, issue #73083")
-class TestTestHistory(unittest.TestCase):
-    maxDiff = None
-
-    def test_help_examples(self) -> None:
-        examples = parse_description(test_history.description())
-        self.assertEqual(len(examples), 3)
-        for i, example in enumerate(examples):
-            with self.subTest(i=i):
-                self.assertTrue(test_history.__file__.endswith(example["cmd"]))
-                expected = example["lines"]
-                actual = list(
-                    itertools.islice(
-                        test_history.run(example["args"]),
-                        len(expected),
-                    )
-                )
-                self.assertEqual(actual, expected)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tools/testing/modulefinder_determinator.py b/tools/testing/modulefinder_determinator.py
index dd4e4cf5c6df..116517091b01 100644
--- a/tools/testing/modulefinder_determinator.py
+++ b/tools/testing/modulefinder_determinator.py
@@ -113,7 +113,7 @@ def test_impact_of_file(filename: str) -> str:
         CI - CI configuration files
     """
     parts = filename.split(os.sep)
-    if parts[0] in [".jenkins", ".circleci"]:
+    if parts[0] in [".jenkins", ".circleci", ".ci"]:
         return "CI"
     if parts[0] in ["docs", "scripts", "CODEOWNERS", "README.md"]:
         return "NONE"
diff --git a/tools/testing/test_selections.py b/tools/testing/test_selections.py
index d3b89c8f2f7e..bde066de7a67 100644
--- a/tools/testing/test_selections.py
+++ b/tools/testing/test_selections.py
@@ -84,7 +84,7 @@ def calculate_shards(
 def _query_changed_test_files() -> List[str]:
     default_branch = f"origin/{os.environ.get('GIT_DEFAULT_BRANCH', 'master')}"
     cmd = ["git", "diff", "--name-only", default_branch, "HEAD"]
-    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    proc = subprocess.run(cmd, capture_output=True)
 
     if proc.returncode != 0:
         raise RuntimeError("Unable to get changed files")
diff --git a/tools/vscode_settings.py b/tools/vscode_settings.py
index 5c7fa8740c4f..21fddf6caccb 100755
--- a/tools/vscode_settings.py
+++ b/tools/vscode_settings.py
@@ -1,20 +1,64 @@
 #!/usr/bin/env python3
 
-import json
 from pathlib import Path
 
+try:
+    # VS Code settings allow comments and trailing commas, which are not valid JSON.
+    import json5 as json  # type: ignore[import]
+
+    HAS_JSON5 = True
+except ImportError:
+    import json  # type: ignore[no-redef]
+
+    HAS_JSON5 = False
+
+
+ROOT_FOLDER = Path(__file__).absolute().parent.parent
+VSCODE_FOLDER = ROOT_FOLDER / ".vscode"
+RECOMMENDED_SETTINGS = VSCODE_FOLDER / "settings_recommended.json"
+SETTINGS = VSCODE_FOLDER / "settings.json"
+
+
+# settings can be nested, so we need to recursively update the settings.
+def deep_update(d: dict, u: dict) -> dict:  # type: ignore[type-arg]
+    for k, v in u.items():
+        if isinstance(v, dict):
+            d[k] = deep_update(d.get(k, {}), v)
+        elif isinstance(v, list):
+            d[k] = d.get(k, []) + v
+        else:
+            d[k] = v
+    return d
+
 
 def main() -> None:
-    folder = Path(".vscode")
-    recommended = json.loads((folder / "settings_recommended.json").read_text())
-    path = folder / "settings.json"
+    recommended_settings = json.loads(RECOMMENDED_SETTINGS.read_text())
+    try:
+        current_settings_text = SETTINGS.read_text()
+    except FileNotFoundError:
+        current_settings_text = "{}"
+
     try:
-        current = json.loads(path.read_text())
-    except Exception:
-        current = {}
-    with open(path, "w") as f:
-        json.dump({**current, **recommended}, f, indent=2)
-        f.write("\n")
+        current_settings = json.loads(current_settings_text)
+    except ValueError as ex:  # json.JSONDecodeError is a subclass of ValueError
+        if HAS_JSON5:
+            raise SystemExit("Failed to parse .vscode/settings.json.") from ex
+        raise SystemExit(
+            "Failed to parse .vscode/settings.json. "
+            "Maybe it contains comments or trailing commas. "
+            "Try `pip install json5` to install an extended JSON parser."
+        ) from ex
+
+    settings = deep_update(current_settings, recommended_settings)
+
+    SETTINGS.write_text(
+        json.dumps(
+            settings,
+            indent=4,
+        )
+        + "\n",  # add a trailing newline
+        encoding="utf-8",
+    )
 
 
 if __name__ == "__main__":
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index e5d13b57535d..fb98cda76119 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -132,6 +132,7 @@ if(USE_CUDA)
 
     list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUDA)
     if(USE_CUDNN)
+        list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::cudnn)
         list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_CUDNN)
     endif()
 
@@ -386,9 +387,9 @@ add_custom_command(
     "${PYTHON_EXECUTABLE}" -c \"from pathlib import Path\; Path('${TOOLS_PATH}/generate_torch_version.py').touch()\"
   COMMAND
     "${PYTHON_EXECUTABLE}" ${TOOLS_PATH}/generate_torch_version.py
-      --is_debug=${TORCH_VERSION_DEBUG}
-      --cuda_version=${CUDA_VERSION}
-      --hip_version=${HIP_VERSION}
+      --is-debug=${TORCH_VERSION_DEBUG}
+      --cuda-version=${CUDA_VERSION}
+      --hip-version=${HIP_VERSION}
   DEPENDS ${TOOLS_PATH}/generate_torch_version.py
   WORKING_DIRECTORY ${TORCH_ROOT}
 )
diff --git a/torch/_C/_VariableFunctions.pyi.in b/torch/_C/_VariableFunctions.pyi.in
index ffd9f5204093..8a5a63837aa6 100644
--- a/torch/_C/_VariableFunctions.pyi.in
+++ b/torch/_C/_VariableFunctions.pyi.in
@@ -1,9 +1,7 @@
 # ${generated_comment}
 
-from torch import Tensor, Generator, strided, memory_format, contiguous_format, strided
-from typing import List, Tuple, Optional, Union, Any, ContextManager, Callable, overload, Iterator, NamedTuple, Sequence, TypeVar
-from typing_extensions import Literal
-from torch._six import inf
+from torch import Tensor, Generator, strided, memory_format, contiguous_format, strided, inf
+from typing import List, Tuple, Optional, Union, Any, ContextManager, Callable, overload, Iterator, NamedTuple, Sequence, Literal, TypeVar
 
 from torch.types import _int, _float, _bool, Number, _dtype, _device, _qscheme, _size, _layout, SymInt, Device
 import torch
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 37c4d9ab7f13..b4f8510f6fc6 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -2,16 +2,14 @@
 
 import torch
 from torch.package import PackageExporter
-from torch import Tensor
+from torch import Tensor, inf
 from torch.autograd.graph import Node as _Node
 from enum import Enum
 from pathlib import Path
 from typing import (
     Any, BinaryIO, Callable, ContextManager, Dict, Iterable, Iterator, List,
     NamedTuple, Optional, overload, Sequence, Tuple, TypeVar, Type, Union,
-    Generic, Set, AnyStr)
-from typing_extensions import Literal
-from torch._six import inf
+    Literal, Generic, Set, AnyStr)
 
 from torch.types import (
     _int, _float, _bool, _dtype, _device, _qscheme, _size, _layout, Device, Number, Storage, SymInt, _dispatchkey
@@ -151,11 +149,11 @@ per_channel_symmetric: qscheme = ...
 per_channel_affine_float_qparams: qscheme = ...
 
 # Defined in torch/csrc/autograd/python_function.cpp
-class _FunctionBase(object):
+class _FunctionBase:
     ...
 
 # Defined in torch/csrc/autograd/python_legacy_variable.cpp
-class _LegacyVariableBase(object):
+class _LegacyVariableBase(Tensor):  # inherits from Tensor to appease mypy
     def __init__(
         self,
         data: Optional[Tensor]=...,
@@ -169,7 +167,7 @@ class IODescriptor: ...
 
 class JITException: ...
 
-class Future(object):
+class Future:
   def __init__(self, devices: List[device]) -> None: ...
   def done(self) -> _bool: ...
   def value(self) -> Any: ...
@@ -179,6 +177,12 @@ class Future(object):
   def set_result(self, result: Any) -> None: ...
   def _set_unwrap_func(self, callback: Callable) -> None: ...
 
+class _Await:
+  def __init__(self) -> None: ...
+  def fn(self) -> Callable: ...
+  def args(self) -> Tuple[Any, ...]: ...
+  def is_nowait(self) -> _bool: ...
+
 def _jit_set_num_profiled_runs(num: _size) -> _size: ...
 
 # Defined in torch/csrc/jit/passes/mobile_optimizer_type.h
@@ -194,6 +198,9 @@ VULKAN_AUTOMATIC_GPU_TRANSFER: _MobileOptimizerType
 
 def fork(*args: Any, **kwargs: Any) -> Future: ...
 def wait(fut: Future) -> Any: ...
+def _awaitable(*args: Any, **kwargs: Any) -> _Await: ...
+def _awaitable_wait(aw: _Await) -> Any: ...
+def _awaitable_nowait(x: Any) -> _Await: ...
 def _collect_all(futures: List[Future]) -> Future: ...
 def _set_print_stack_traces_on_fatal_signal(print: _bool) -> None: ...
 
@@ -363,8 +370,8 @@ def _replace_overloaded_method_decl(overload_decl: Decl, implementation_def: Def
 
 def _jit_pass_lower_all_tuples(graph: Graph) -> None: ...
 def _jit_pass_onnx_set_dynamic_input_shape(graph: Graph, dynamic_axes: Dict[str, Dict[_int, str]], input_names: List[str]) -> None: ...
-def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> None: ...
-def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], desc: IODescriptor, onnx_shape_inference: _bool, is_script: _bool) -> None: ...
+def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, params_dict: Dict[str, IValue], opset_version: _int) -> None: ...
+def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], desc: IODescriptor, onnx_shape_inference: _bool, is_script: _bool, opset_version: _int) -> None: ...
 def _jit_pass_onnx_remove_inplace_ops_for_onnx(graph: Graph, module: Optional[ScriptModule] = None) -> None: ...
 def _jit_pass_remove_inplace_ops(graph: Graph) -> None: ...
 def _jit_pass_canonicalize_graph_fuser_ops(graph: Graph) -> None: ...
@@ -473,7 +480,7 @@ def _import_ir_module_from_package(
 ) -> ScriptModule: ...
 
 def _assign_output_shapes(graph: Graph, inputs: List[Tensor]) -> Graph: ...
-def _check_onnx_proto(proto: str, full_check: _bool = False) -> None: ...
+def _check_onnx_proto(proto: str) -> None: ...
 def _propagate_and_assign_input_shapes(
     graph: Graph,
     inputs: Tuple[Tensor, ...],
@@ -692,7 +699,7 @@ def _test_only_add_entry_to_op_version(op_name: str, entry: _UpgraderEntry) -> N
 def _test_only_remove_entry_to_op_version(op_name: str) -> None: ...
 
 # Defined in torch/csrc/jit/python/script_init.cpp
-class ScriptModuleSerializer(object):
+class ScriptModuleSerializer:
     def __init__(self, export_writer: PyTorchFileWriter) -> None: ...
     def serialize(self, model: ScriptModule, script_module_id: _int) -> None: ...
     def write_files(self) -> None: ...
@@ -700,14 +707,14 @@ class ScriptModuleSerializer(object):
     ...
 
 # Defined in torch/csrc/jit/python/script_init.cpp
-class SerializationStorageContext(object):
+class SerializationStorageContext:
     def __init__(self) -> None: ...
     def has_storage(self, storage: Storage) -> _bool: ...
     def get_or_add_storage(self, storage: Storage) -> _int: ...
     ...
 
 # Defined in torch/csrc/jit/python/script_init.cpp
-class DeserializationStorageContext(object):
+class DeserializationStorageContext:
     def __init__(self) -> None: ...
     def get_storage(self, name: str, dtype: _dtype) -> Tensor: ...
     def has_storage(self, name: str) -> _bool: ...
@@ -895,8 +902,6 @@ def _disabled_torch_function_impl(func: Callable, types: Iterable[Type], args: T
 def _disabled_torch_dispatch_impl(func: Callable, types: Iterable[Type], args: Tuple, kwargs: Dict) -> Any: ...  # THPModule_disable_dispatch_function
 def _get_linalg_preferred_backend() -> torch._C._LinalgBackend: ...
 def _set_linalg_preferred_backend(arg: torch._C._LinalgBackend): ...
-def _is_mps_available() -> _bool: ...
-def _is_mps_on_macos_13_or_newer() -> _bool: ...
 class _LinalgBackend:
     Default: _LinalgBackend
     Cusolver: _LinalgBackend
@@ -965,7 +970,7 @@ def _pop_torch_dispatch_stack() -> Any: ...
 def _get_dispatch_stack_at(idx: _int) -> Any: ...
 def _len_torch_dispatch_stack() -> _int: ...
 
-class _InferenceMode(object):
+class _InferenceMode:
     def __init__(self, mode: _bool) -> None: ...
 
 class _DisableFuncTorch:
@@ -977,8 +982,11 @@ class _EnableTorchFunction:
 class _MultithreadingEnabled:
     def __init__(self, mode: _bool) -> None: ...
 
+class _ViewReplayEnabled:
+    def __init__(self, mode: _bool) -> None: ...
+
 # Defined in torch/csrc/jit/python/script_init.cpp
-class LoggerBase(object):
+class LoggerBase:
     ...
 
 class NoopLogger(LoggerBase):
@@ -991,7 +999,7 @@ class AggregationType(Enum):
     SUM = 0
     AVG = 1
 
-class FileCheck(object):
+class FileCheck:
     def run(self, test_string: str) -> None: ...
     def check(self, test_string: str) -> 'FileCheck': ...
     def check_not(self, test_string: str) -> 'FileCheck': ...
@@ -1003,7 +1011,7 @@ class FileCheck(object):
     ...
 
 # Defined in torch/csrc/jit/python/init.cpp
-class PyTorchFileReader(object):
+class PyTorchFileReader:
     @overload
     def __init__(self, name: str) -> None: ...
     @overload
@@ -1011,7 +1019,7 @@ class PyTorchFileReader(object):
     def get_record(self, name: str) -> bytes: ...
     ...
 
-class PyTorchFileWriter(object):
+class PyTorchFileWriter:
     @overload
     def __init__(self, name: str) -> None: ...
     @overload
@@ -1039,7 +1047,7 @@ def _get_custom_class_python_wrapper(name: str, attr: str) -> Any: ...
 def _rename_privateuse1_backend(backend: str) -> None: ...
 
 # Defined in torch/csrc/Generator.cpp
-class Generator(object):
+class Generator:
     device: _device
     def __init__(self, device: Union[_device, str, None] = None) -> None: ...
     def get_state(self) -> Tensor: ...
@@ -1118,28 +1126,28 @@ def _dispatch_get_registrations_for_dispatch_key(dispatch_key: str = "") -> List
 def _are_functorch_transforms_active() -> _bool: ...
 
 # Define in torch/csrc/autograd/init.cpp
-class _DisablePythonDispatcher(object):
+class _DisablePythonDispatcher:
     pass
 
-class _EnablePythonDispatcher(object):
+class _EnablePythonDispatcher:
     pass
 
 def _set_python_dispatcher(dispatcher: object) -> None: ...
 
 
 # Defined in torch/csrc/utils/init.cpp
-class BenchmarkConfig(object):
+class BenchmarkConfig:
     num_calling_threads: _int
     num_worker_threads: _int
     num_warmup_iters: _int
     num_iters: _int
     profiler_output_path: str
 
-class BenchmarkExecutionStats(object):
+class BenchmarkExecutionStats:
     latency_avg_ms: _float
     num_iters: _int
 
-class ThroughputBenchmark(object):
+class ThroughputBenchmark:
     def __init__(self, module: Any) -> None: ...
     def add_input(self, *args: Any, **kwargs: Any) -> None: ...
     def run_once(self, *args: Any, **kwargs: Any) -> Any: ...
@@ -1153,7 +1161,9 @@ ${legacy_class_hints}
 
 # Defined in torch/csrc/autograd/python_engine.cpp
 class _ImperativeEngine:
-    ...
+    def queue_callback(self, callback: Callable[[], None]) -> None: ...
+    def run_backward(self, *args: Any, **kwargs: Any) -> Tuple[Tensor, ...]: ...
+    def is_checkpoint_valid(self) -> _bool: ...
 
 # Defined in torch/csrc/autograd/python_variable.cpp
 class _TensorMeta(type):
@@ -1189,6 +1199,16 @@ class _TensorBase(metaclass=_TensorMeta):
 # Defined in torch/csrc/multiprocessing/init.cpp
 def _multiprocessing_init() -> None: ...
 
+# Defined in torch/csrc/mps/Module.cpp
+def _mps_synchronize() -> None: ...
+def _mps_get_default_generator() -> Generator: ...
+def _mps_emptyCache() -> None: ...
+def _mps_setMemoryFraction(fraction: _float) -> None: ...
+def _mps_currentAllocatedMemory() -> _int: ...
+def _mps_driverAllocatedMemory() -> _int: ...
+def _mps_is_available() -> _bool: ...
+def _mps_is_on_macos_13_or_newer(minor: _int) -> _bool: ...
+
 # Defined in torch/csrc/cuda/Module.cpp
 def _cuda_getCurrentStream(device: _int) -> Tuple: ...
 def _cuda_getCurrentRawStream(device: _int) -> _int: ...
@@ -1476,6 +1496,10 @@ class FutureType(JitType):
     def __init__(self, a: JitType) -> None: ...
     def getElementType(self) -> JitType: ...
 
+class AwaitType(JitType):
+    def __init__(self, a: JitType) -> None: ...
+    def getElementType(self) -> JitType: ...
+
 class RRefType(JitType):
     def __init__(self, a: JitType) -> None: ...
 
diff --git a/torch/_C/_autograd.pyi b/torch/_C/_autograd.pyi
index bdba43cb693a..391095e3b3bc 100644
--- a/torch/_C/_autograd.pyi
+++ b/torch/_C/_autograd.pyi
@@ -76,6 +76,8 @@ def _set_empty_test_observer(is_global: bool, sampling_prob: float) -> None: ...
 def _push_saved_tensors_default_hooks(pack_hook: Callable, unpack_hook: Callable) -> None: ...
 def _pop_saved_tensors_default_hooks() -> None: ...
 
+def _unsafe_set_version_counter(t: torch.Tensor, prev_version: int) -> None: ...
+
 def _enable_profiler_legacy(config: ProfilerConfig) -> None: ...
 def _disable_profiler_legacy() -> List[List[ProfilerEvent]]: ...
 def _profiler_type() -> ActiveProfilerType: ...
diff --git a/torch/_C/_profiler.pyi b/torch/_C/_profiler.pyi
index 4a1fe23cec61..83adf8bc4e51 100644
--- a/torch/_C/_profiler.pyi
+++ b/torch/_C/_profiler.pyi
@@ -1,10 +1,8 @@
 from enum import Enum
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 from torch._C import device, dtype, layout
 
-from typing_extensions import Literal
-
 # defined in torch/csrc/profiler/python/init.cpp
 
 class RecordScope(Enum):
diff --git a/torch/_C/return_types.pyi.in b/torch/_C/return_types.pyi.in
index aa540ea328b5..ca5e3f85f89e 100644
--- a/torch/_C/return_types.pyi.in
+++ b/torch/_C/return_types.pyi.in
@@ -1,9 +1,7 @@
 # ${generated_comment}
 
-from torch import Tensor, Generator, strided, memory_format, contiguous_format, strided
-from typing import List, Tuple, Optional, Union, Any, ContextManager, Callable, overload, Iterator, NamedTuple, Sequence, TypeVar
-from typing_extensions import Literal
-from torch._six import inf
+from torch import Tensor, Generator, strided, memory_format, contiguous_format, strided, inf
+from typing import List, Tuple, Optional, Union, Any, ContextManager, Callable, overload, Iterator, NamedTuple, Sequence, Literal, TypeVar
 
 from torch.types import _int, _float, _bool, Number, _dtype, _device, _qscheme, _size, _layout
 
diff --git a/torch/_VF.py b/torch/_VF.py
index b0b6c1dd85b4..c6b63c511959 100644
--- a/torch/_VF.py
+++ b/torch/_VF.py
@@ -20,7 +20,7 @@ class VFModule(types.ModuleType):
     vf: types.ModuleType
 
     def __init__(self, name):
-        super(VFModule, self).__init__(name)
+        super().__init__(name)
         self.vf = torch._C._VariableFunctions
 
     def __getattr__(self, attr):
diff --git a/torch/__init__.py b/torch/__init__.py
index df97b2c1864a..4bd47a144028 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -28,8 +28,6 @@
 else:
     from .torch_version import __version__ as __version__
 
-from ._six import string_classes as _string_classes
-
 from typing import Any, Callable, Dict, Optional, Set, Type, TYPE_CHECKING, Union
 import builtins
 
@@ -98,18 +96,10 @@
 
     kernel32.LoadLibraryW.restype = ctypes.c_void_p
     if with_load_library_flags:
-        kernel32.AddDllDirectory.restype = ctypes.c_void_p
         kernel32.LoadLibraryExW.restype = ctypes.c_void_p
 
     for dll_path in dll_paths:
-        if sys.version_info >= (3, 8):
-            os.add_dll_directory(dll_path)
-        elif with_load_library_flags:
-            res = kernel32.AddDllDirectory(dll_path)
-            if res is None:
-                err = ctypes.WinError(ctypes.get_last_error())
-                err.strerror += f' Error adding "{dll_path}" to the DLL directories.'
-                raise err
+        os.add_dll_directory(dll_path)
 
     try:
         ctypes.CDLL('vcruntime140.dll')
@@ -145,22 +135,24 @@
     kernel32.SetErrorMode(prev_error_mode)
 
 
-def _preload_cuda_deps():
-    """ Preloads cudnn/cublas deps if they could not be found otherwise """
+def _preload_cuda_deps(lib_folder, lib_name):
+    """Preloads cuda deps if they could not be found otherwise."""
     # Should only be called on Linux if default path resolution have failed
     assert platform.system() == 'Linux', 'Should only be called on Linux'
+    import glob
+    lib_path = None
     for path in sys.path:
         nvidia_path = os.path.join(path, 'nvidia')
         if not os.path.exists(nvidia_path):
             continue
-        cublas_path = os.path.join(nvidia_path, 'cublas', 'lib', 'libcublas.so.11')
-        cudnn_path = os.path.join(nvidia_path, 'cudnn', 'lib', 'libcudnn.so.8')
-        if not os.path.exists(cublas_path) or not os.path.exists(cudnn_path):
-            continue
-        break
-
-    ctypes.CDLL(cublas_path)
-    ctypes.CDLL(cudnn_path)
+        candidate_lib_paths = glob.glob(os.path.join(nvidia_path, lib_folder, 'lib', lib_name))
+        if candidate_lib_paths and not lib_path:
+            lib_path = candidate_lib_paths[0]
+        if lib_path:
+            break
+    if not lib_path:
+        raise ValueError(f"{lib_name} not found in the system path {sys.path}")
+    ctypes.CDLL(lib_path)
 
 
 # See Note [Global dependencies]
@@ -175,11 +167,26 @@ def _load_global_deps():
     try:
         ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
     except OSError as err:
-        # Can only happen of wheel with cublas as PYPI deps
-        # As PyTorch is not purelib, but nvidia-cublas-cu11 is
-        if 'libcublas.so.11' not in err.args[0]:
+        # Can only happen for wheel with cuda libs as PYPI deps
+        # As PyTorch is not purelib, but nvidia-*-cu11 is
+        cuda_libs: Dict[str, str] = {
+            'cublas': 'libcublas.so.*[0-9]',
+            'cudnn': 'libcudnn.so.*[0-9]',
+            'cuda_nvrtc': 'libnvrtc.so.*[0-9].*[0-9]',
+            'cuda_runtime': 'libcudart.so.*[0-9].*[0-9]',
+            'cuda_cupti': 'libcupti.so.*[0-9].*[0-9]',
+            'cufft': 'libcufft.so.*[0-9]',
+            'curand': 'libcurand.so.*[0-9]',
+            'cusolver': 'libcusolver.so.*[0-9]',
+            'cusparse': 'libcusparse.so.*[0-9]',
+            'nccl': 'libnccl.so.*[0-9]',
+            'nvtx': 'libnvToolsExt.so.*[0-9]',
+        }
+        is_cuda_lib_err = [lib for lib in cuda_libs.values() if(lib.split('.')[0] in err.args[0])]
+        if not is_cuda_lib_err:
             raise err
-        _preload_cuda_deps()
+        for lib_folder, lib_name in cuda_libs.items():
+            _preload_cuda_deps(lib_folder, lib_name)
         ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
 
 
@@ -244,6 +251,9 @@ def __bool__(self):
     def __int__(self):
         return self.node.int_()
 
+    def __index__(self):
+        return self.node.int_()
+
     # Magic methods installed by torch.fx.experimental.symbolic_shapes
 
     def __eq__(self, other: object) -> builtins.bool:
@@ -313,6 +323,9 @@ def __sym_max__(self, other):
     def __sym_min__(self, other):
         raise AssertionError("type stub not overridden")
 
+    def __sym_int__(self):
+        raise AssertionError("type stub not overridden")
+
     def __repr__(self):
         return self.node.str()
 
@@ -388,14 +401,6 @@ def sym_float(a):
         return a.__sym_float__()
     return py_float(a)  # type: ignore[operator]
 
-# Drop in replacement for math.floor/ceil.  Actually, math.floor/ceil
-# directly usable, but this has a more relaxed type signature for mypy
-# (mypy requires SupportFloat which is too strict)
-def _sym_floor(x):
-    return math.floor(x)  # type: ignore[type]
-
-def _sym_ceil(x):
-    return math.ceil(x)  # type: ignore[type]
 
 def sym_int(a):
     r""" SymInt-aware utility for int casting.
@@ -406,7 +411,7 @@ def sym_int(a):
     if isinstance(a, SymInt):
         return a
     elif isinstance(a, SymFloat):
-        return _sym_floor(a) if a > 0 else _sym_ceil(a)
+        return math.floor(a) if a >= 0 else math.ceil(a)  # type: ignore[arg-type]
     return py_int(a)  # type: ignore[operator]
 
 def sym_max(a, b):
@@ -414,6 +419,9 @@ def sym_max(a, b):
     if isinstance(a, (SymInt, SymFloat)):
         return a.__sym_max__(b)
     elif isinstance(b, (SymInt, SymFloat)):
+        # NB: If you actually care about preserving output type exactly
+        # if you do something like max(0, 0.0), it is NOT sound to treat
+        # min/max as commutative
         return b.__sym_max__(a)
     return builtins.max(a, b)  # type: ignore[operator]
 
@@ -434,7 +442,7 @@ def sym_min(a, b):
     import torch._C as _C_for_compiled_check
 
     # The __file__ check only works for Python 3.7 and above.
-    if sys.version_info >= (3, 7) and _C_for_compiled_check.__file__ is None:
+    if _C_for_compiled_check.__file__ is None:
         raise ImportError(textwrap.dedent('''
             Failed to load PyTorch C extensions:
                 It appears that PyTorch has loaded the `torch/_C` folder
@@ -596,7 +604,7 @@ def set_default_tensor_type(t):
         torch.float64
 
     """
-    if isinstance(t, _string_classes):
+    if isinstance(t, str):
         t = _import_dotted_name(t)
     _C._set_default_tensor_type(t)
 
@@ -1150,6 +1158,9 @@ def manager_path():
     # signatures already imported. For now these clashes are ignored; see
     # PR #43339 for details.
     from torch._C._VariableFunctions import *  # type: ignore[misc] # noqa: F403
+    # Fixup segment_reduce visibility
+    _segment_reduce = segment_reduce
+    del segment_reduce
 
 # Ops not to be exposed in `torch` namespace,
 # mostly helper ops.
@@ -1162,6 +1173,11 @@ def manager_path():
         continue
     obj = getattr(_C._VariableFunctions, name)
     obj.__module__ = 'torch'
+    # Hide some APIs that should not be public
+    if name == "segment_reduce":
+        # TODO: Once the undocumented FC window is passed, remove the line bellow
+        globals()[name] = obj
+        name = "_" + name
     globals()[name] = obj
     if not name.startswith("_"):
         __all__.append(name)
@@ -1213,6 +1229,7 @@ def _assert(condition, message):
 )
 from torch import fft as fft
 from torch import futures as futures
+from torch import _awaits as _awaits
 from torch import nested as nested
 from torch import nn as nn
 from torch.signal import windows as windows
@@ -1307,21 +1324,73 @@ def compiled_with_cxx11_abi():
     solve,
     lstsq,
 )
+from ._linalg_utils import _symeig as symeig  # type: ignore[misc]
+
 
 class _TorchCompileInductorWrapper:
     compiler_name = "inductor"
 
-    def __init__(self, mode, passes):
-        from torch._dynamo.eval_frame import lookup_backend
-        from torch._inductor.config import InductorConfigContext
-
-        self.compile_fn = lookup_backend(self.compiler_name)
-        self.cm = InductorConfigContext(mode if mode is not None else passes)
-        self._torchdynamo_orig_callable = self.compile_fn
+    def __init__(self, mode, options, dynamic):
+        self.config = dict()
+        self.dynamic = dynamic
+        self.apply_mode(mode)
+        self.apply_options(options)
+        if dynamic:
+            # cudagraphs conflicts with dynamic shapes
+            self.config["triton.cudagraphs"] = False
+            assert "triton.cudagraphs" not in (
+                options or ()
+            ), "triton.cudagraphs does not support dynamic shapes"
+
+    def __eq__(self, other):
+        return (isinstance(other, _TorchCompileInductorWrapper) and
+                self.config == other.config and
+                self.dynamic == other.dynamic)
+
+    def apply_mode(self, mode: Optional[str]):
+        if mode is None or mode == "default":
+            pass
+        elif mode == "reduce-overhead":
+            self.apply_options({
+                "triton.cudagraphs": True,
+                "size_asserts": False,
+            })
+        elif mode == "max-autotune":
+            self.apply_options({
+                "epilogue_fusion": True,
+                "max_autotune": True,
+                "triton.cudagraphs": True,
+            })
+        else:
+            raise RuntimeError(
+                f"Unrecognized mode={mode}, should be one of: default, reduce-overhead, max-autotune"
+            )
+
+    def apply_options(self, options: Optional[Dict[str, Any]]):
+        if not options:
+            return
+
+        from torch._inductor import config
+        current_config: Dict[str, Any] = config.to_dict()  # type: ignore[attr-defined]
+
+        for key, val in options.items():
+            attr_name = key.replace("-", "_")
+            if attr_name not in current_config:
+                raise RuntimeError(
+                    f"Unexpected optimization option {key}, known options are {list(current_config.keys())}"
+                )
+            if type(val) is not type(current_config[attr_name]):
+                val_type_str = type(val).__name__
+                expected_type_str = type(current_config[attr_name]).__name__
+                raise RuntimeError(
+                    f"Unexpected type of attr {key}, got {val_type_str} should be {expected_type_str}"
+                )
+            self.config[attr_name] = val
 
     def __call__(self, model_, inputs_):
-        with self.cm:
-            return self.compile_fn(model_, inputs_)
+        from torch._inductor.compile_fx import compile_fx
+
+        return compile_fx(model_, inputs_, config_patches=self.config)
 
 
 def compile(model: Optional[Callable] = None, *,
@@ -1329,10 +1398,10 @@ def compile(model: Optional[Callable] = None, *,
             dynamic: builtins.bool = False,
             backend: Union[str, Callable] = "inductor",
             mode: Union[str, None] = None,
-            passes: Optional[Dict[str, Union[str, builtins.int, builtins.bool]]] = None,
-            **kwargs) -> Callable:
+            options: Optional[Dict[str, Union[str, builtins.int, builtins.bool]]] = None,
+            disable: builtins.bool = False) -> Callable:
     """
-    Optimizes given model/function using Dynamo and specified backend
+    Optimizes given model/function using TorchDynamo and specified backend.
 
     Args:
        model (Callable): Module/function to optimize
@@ -1340,20 +1409,12 @@ def compile(model: Optional[Callable] = None, *,
        dynamic (bool): Use dynamic shape tracing
        backend (str or Callable): backend to be used
        mode (str): Can be either "default", "reduce-overhead" or "max-autotune"
-       passes (dict): A dictionary of passes to the backend. Passes currently recognized by inductor backend:
-                       - static-memory
-                       - matmul-tune
-                       - matmul-padding
-                       - triton-autotune
-                       - triton-bmm
-                       - triton-mm
-                       - triton-convolution
-                       - rematerialize-threshold
-                       - rematerialize-acc-threshold
+       options (dict): A dictionary of options to pass to the backend.
+       disable (bool): Turn torch.compile() into a no-op for testing
 
     Example::
 
-        @torch.compile(passes={"matmul-padding": True}, fullgraph=True)
+        @torch.compile(options={"matmul-padding": True}, fullgraph=True)
         def foo(x):
             return torch.sin(x) + torch.cos(x)
 
@@ -1369,18 +1430,18 @@ def fn(model: Callable):
                            dynamic=dynamic,
                            backend=backend,
                            mode=mode,
-                           passes=passes,
-                           **kwargs)
+                           options=options,
+                           disable=disable)
         return fn
 
     import torch._dynamo
-    if mode is not None and passes is not None:
-        raise RuntimeError("Either mode or passes can be specified, but both can't be specified at the same time.")
-    if mode is None and passes is None:
+    if mode is not None and options is not None:
+        raise RuntimeError("Either mode or options can be specified, but both can't be specified at the same time.")
+    if mode is None and options is None:
         mode = "default"
     if backend == "inductor":
-        backend = _TorchCompileInductorWrapper(mode, passes)
-    return torch._dynamo.optimize(backend=backend, nopython=fullgraph, dynamic=dynamic, **kwargs)(model)
+        backend = _TorchCompileInductorWrapper(mode, options, dynamic)
+    return torch._dynamo.optimize(backend=backend, nopython=fullgraph, dynamic=dynamic, disable=disable)(model)
 
 
 def _register_device_module(device_type, module):
diff --git a/torch/_awaits/__init__.py b/torch/_awaits/__init__.py
new file mode 100644
index 000000000000..c7a0065c7dfa
--- /dev/null
+++ b/torch/_awaits/__init__.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from typing import cast, Callable, Generic, Type, TypeVar
+
+import torch
+
+__all__ = ['Await']
+
+W = TypeVar("W")
+
+class _PyAwaitMeta(type(torch._C._Await), type(Generic)):  # type: ignore[misc, no-redef]
+    pass
+
+class _Await(torch._C._Await, Generic[W], metaclass=_PyAwaitMeta):
+    r"""
+    Wrapper around a ``torch._C.Await`` which encapsulates delayed execution
+    of a callable. All manipulations happen with functions ``torch.jit._awaitable``,
+    ``torch.jit._awaitable_wait``, ``torch.jit._awaitable_nowait``.
+
+    Torch scriptable manipulations:
+    ``torch.jit._awaitable(func, *args)``
+    Creates ``Await[W]`` object, where W is return type of func.
+
+    Returns:
+    ``torch.jit._awaitable_wait(Await[W])``
+    Returns the result of the function, specified at ``_awaitable``,  with specified arguments.
+
+    Returns:
+        The result of type ``W`` of the function call. The result is owned by ``Await[W]``
+        and returned on all following ``_awaitable_wait`` calls.
+
+
+    ``torch.jit._awaitable_nowait(W)``
+    Returns:
+        Trivial ``Await[W]`` with specified result.
+
+
+    Only in eager mode:
+    ``fn() -> Callable[Tuple[Any], W]``
+    Returns:
+        Specified at ``_awaitable`` python function ``func``.
+
+    ``args() -> Tuple[Any]``
+    Returns:
+        Specified at ``_awaitable`` python args.
+
+    ``is_nowait() -> _bool``
+    Returns:
+        ``True`` if this object was created via ``_awaitable_nowait`` call (trivial `Await[W]`).
+
+    In eager mode ``Await[W]`` can be used as ``W`` i.e. attributes of W can be called on ``Await[W]``,
+    ``_awaitable_wait()`` call will be transparently added.
+    """
+    pass
diff --git a/torch/_classes.py b/torch/_classes.py
index 3de7c9e1a2be..870073fea6ea 100644
--- a/torch/_classes.py
+++ b/torch/_classes.py
@@ -5,7 +5,7 @@
 
 class _ClassNamespace(types.ModuleType):
     def __init__(self, name):
-        super(_ClassNamespace, self).__init__("torch.classes" + name)
+        super().__init__("torch.classes" + name)
         self.name = name
 
     def __getattr__(self, attr):
@@ -19,7 +19,7 @@ class _Classes(types.ModuleType):
     __file__ = "_classes.py"
 
     def __init__(self):
-        super(_Classes, self).__init__("torch.classes")
+        super().__init__("torch.classes")
 
     def __getattr__(self, name):
         namespace = _ClassNamespace(name)
diff --git a/torch/_decomp/__init__.py b/torch/_decomp/__init__.py
index d50f33933da4..bb801139d918 100644
--- a/torch/_decomp/__init__.py
+++ b/torch/_decomp/__init__.py
@@ -15,6 +15,7 @@
     "meta_table",
     "register_decomposition",
     "get_decompositions",
+    "core_aten_decompositions",
 ]
 
 
@@ -167,3 +168,156 @@ def get_decompositions(
 # populate the table
 import torch._decomp.decompositions
 import torch._refs
+
+# This list was copied from torch/_inductor/decomposition.py
+# excluding decompositions that results in prim ops
+# Resulting opset of decomposition is core aten ops
+def core_aten_decompositions() -> Dict[OpOverload, Callable]:
+    aten = torch.ops.aten
+    return get_decompositions(
+        [
+            aten._adaptive_avg_pool2d_backward,
+            aten.addcdiv,
+            aten.addcdiv_,
+            aten.addcmul,
+            aten.addcmul_,
+            aten.addr,
+            aten.avg_pool2d_backward,
+            aten.binary_cross_entropy,
+            aten.binary_cross_entropy_backward,
+            aten.binary_cross_entropy_with_logits,
+            aten.bucketize,
+            aten.celu,
+            aten.col2im,
+            aten.cudnn_batch_norm,
+            aten.cudnn_batch_norm_backward,
+            aten.detach,
+            aten.diag_embed,
+            aten.diagonal,
+            aten.dot,
+            aten.elu,
+            aten.elu_backward,
+            aten._embedding_bag,
+            aten.embedding_dense_backward,
+            aten.expand_as,
+            aten.eye,
+            aten.fill,
+            aten.frac,
+            aten._fused_moving_avg_obs_fq_helper,
+            aten.gelu,
+            aten.gelu_backward,
+            aten.glu_backward,
+            aten.grid_sampler_2d,
+            aten.hardshrink,
+            aten.hardshrink_backward,
+            aten.hardsigmoid,
+            aten.hardsigmoid_backward,
+            aten.hardswish,
+            aten.hardswish_,
+            aten.hardswish_backward,
+            aten.hardtanh,
+            aten.hardtanh_,
+            aten.hardtanh_backward,
+            aten.heaviside,
+            aten.huber_loss,
+            aten.huber_loss_backward,
+            aten.im2col,
+            aten.index_add,
+            aten.index_add_,
+            aten.index_copy,
+            aten.index_copy_,
+            aten.index_fill,
+            aten.index_fill_,
+            aten.index_select,
+            aten.isneginf,
+            aten.isposinf,
+            aten.l1_loss,
+            aten.leaky_relu,
+            aten.leaky_relu_,
+            aten.leaky_relu_backward,
+            aten.lerp,
+            aten.linspace,
+            aten.logaddexp,
+            aten.logit,
+            aten.logit_backward,
+            aten.log_sigmoid_backward,
+            aten.log_sigmoid_forward,
+            aten._log_softmax,
+            aten._log_softmax_backward_data,
+            aten.logspace,
+            aten.logsumexp.default,
+            aten.masked_fill,
+            aten.masked_fill_,
+            aten.max_pool2d_with_indices_backward,
+            aten.mish,
+            aten.mse_loss,
+            aten.mse_loss_backward,
+            aten.mv,
+            aten.mvlgamma,
+            aten.nan_to_num,
+            aten.narrow,
+            aten.native_batch_norm,
+            aten.native_batch_norm_backward,
+            aten._native_batch_norm_legit,
+            aten._native_batch_norm_legit_no_training,
+            aten._native_batch_norm_legit_functional,
+            aten.native_dropout_backward,
+            aten.native_group_norm,
+            aten.native_group_norm_backward,
+            aten.native_layer_norm,
+            aten.native_layer_norm_backward,
+            aten.new_empty,
+            aten.new_full,
+            aten.new_ones,
+            aten.new_zeros,
+            aten.nll_loss_backward,
+            aten.nll_loss_forward,
+            aten.norm,
+            aten.ones,
+            aten.ones_like,
+            aten._prelu_kernel,
+            aten._prelu_kernel_backward,
+            aten._reshape_alias,
+            aten.rot90,
+            aten.rsub.Scalar,
+            aten.rsub.Tensor,
+            aten.select_backward,
+            aten.select_scatter,
+            aten.sgn,
+            aten.sigmoid_backward,
+            aten.silu,
+            aten.silu_,
+            aten.silu_backward,
+            aten.sinc,
+            aten.slice_backward,
+            aten.soft_margin_loss,
+            aten.soft_margin_loss_backward,
+            aten._softmax,
+            aten._softmax_backward_data,
+            aten.softplus,
+            aten.softplus_backward,
+            aten.softshrink,
+            aten.softshrink_backward,
+            aten.special_entr,
+            aten.special_log_ndtr,
+            aten.special_xlog1py,
+            aten.stack,
+            aten.t,
+            aten.tanh_backward,
+            aten.threshold,
+            aten.threshold_backward,
+            aten.trace,
+            aten.transpose.int,
+            aten.tril.default,
+            aten.unfold,
+            aten.unfold_backward,
+            aten.upsample_bilinear2d,
+            aten.upsample_bilinear2d.vec,
+            aten.upsample_nearest2d_backward,
+            aten.xlogy,
+            aten.zero,
+            aten.zero_,
+            aten.zeros,
+            aten.zeros_like,
+        ]
+    )
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 1ead83831e7c..54266e1bd374 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1054,7 +1054,6 @@ def embedding(
 
 
 @register_decomposition(aten.embedding_dense_backward)
-@pw_cast_for_opmath
 def embedding_dense_backward(
     grad_output: Tensor,
     indices: Tensor,
@@ -1062,20 +1061,24 @@ def embedding_dense_backward(
     padding_idx: int,
     scale_grad_by_freq: bool,
 ):
+    computation_dtype, result_dtype = utils.elementwise_dtypes(
+        grad_output, type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
+    )
+    grad_output = grad_output.to(computation_dtype)
     indices = _maybe_convert_to_dtype(indices, torch.long)  # type: ignore[assignment]
     if scale_grad_by_freq:
         counts = indices.new_zeros((num_weights,))
         ones = torch.ones_like(indices)
         counts = counts.index_put([indices], ones, accumulate=True)
         grad_weights_scale = counts[indices]
-        grad_output = grad_output / grad_weights_scale.unsqueeze(1)
+        grad_output = grad_output / grad_weights_scale.unsqueeze(-1)
 
     mask = _unsqueeze_to_dim(indices == padding_idx, grad_output.ndim)
     grad = grad_output.masked_fill(mask, 0)
     grad_weight = grad_output.new_zeros(
         (num_weights,) + grad_output.shape[indices.ndim :]
     )
-    return grad_weight.index_put([indices], grad, accumulate=True)
+    return grad_weight.index_put([indices], grad, accumulate=True).to(result_dtype)
 
 
 def prod(x: List[int]):
@@ -1134,21 +1137,11 @@ def addmm(self: Tensor, mat1: Tensor, mat2: Tensor, beta: int = 1, alpha: int =
     return out + beta * self
 
 
-# This computes the mean and variance along the specifized normalization dims,
-# then normalizes along those dims. Finally, it returns the mean and variance of
-# the normalized dims. Note that it intentionally leaves outputs upcasted.
-# Example:
-# input: [2, 3, 4, 5], norm_dims: [1, 3]
-# mean: [2, 1, 4, 1]
-def normalize(input, norm_dims, eps):
-    computation_dtype = utils.get_computation_dtype(input.dtype)
-    input_acc = input.to(dtype=computation_dtype)
-    biased_var = torch.var(input_acc, dim=norm_dims, unbiased=False, keepdim=True)
-    mean = torch.mean(input_acc, dim=norm_dims, keepdim=True)
-    rstd = torch.rsqrt(biased_var + eps)
-
-    out = (input - mean) * rstd
-    return out, mean, rstd
+@register_decomposition(aten._int_mm)
+@out_wrapper()
+@pw_cast_for_opmath
+def _int_mm(self: Tensor, mat1: Tensor, mat2: Tensor):
+    return torch._int_mm(mat1, mat2)
 
 
 @register_decomposition(aten.native_group_norm_backward)
@@ -1341,10 +1334,9 @@ def native_batch_norm_helper(
     if training:
         computation_dtype = utils.get_computation_dtype(input.dtype)
         input_acc = input.to(dtype=computation_dtype)
-        biased_var = torch.var(
-            input_acc, dim=reduction_dims, unbiased=False, keepdim=True
+        biased_var, mean = torch.var_mean(
+            input_acc, dim=reduction_dims, correction=0, keepdim=True
         )
-        mean = torch.mean(input_acc, dim=reduction_dims, keepdim=True)
         rstd = torch.rsqrt(biased_var + eps)
 
         output = (input - mean) * rstd
@@ -1457,9 +1449,15 @@ def native_batch_norm_decomposition(
             "running_var is None, but running_mean is provided. "
             "They should both be None or both be provided."
         )
-    return aten._native_batch_norm_legit(
-        input, weight, bias, running_mean, running_var, training, momentum, eps
-    )
+    if training:
+        # HACK: batch norm consolidation should clean this up so this op doesn't take in a training arg.
+        return aten._native_batch_norm_legit(
+            input, weight, bias, running_mean, running_var, training, momentum, eps
+        )
+    else:
+        return aten._native_batch_norm_legit_no_training(
+            input, weight, bias, running_mean, running_var, momentum, eps
+        )
 
 
 @aten.unsafe_chunk.default.py_impl(DispatchKey.CompositeImplicitAutograd)
@@ -1474,6 +1472,28 @@ def unsafe_chunk_py_impl(tensor, chunks, dim=0) -> List[Tensor]:
     return torch.ops.aten.unsafe_split.Tensor(tensor, split_size, dim)
 
 
+@register_decomposition(aten._native_batch_norm_legit_no_training.default)
+def _native_batch_norm_legit_no_training(
+    input: Tensor,
+    weight: Optional[Tensor],
+    bias: Optional[Tensor],
+    running_mean: Tensor,
+    running_var: Tensor,
+    momentum: float,
+    eps: float,
+) -> Tuple[Tensor, Tensor, Tensor]:
+    return aten._native_batch_norm_legit.default(
+        input,
+        weight,
+        bias,
+        running_mean,
+        running_var,
+        False,  # training
+        momentum,
+        eps,
+    )
+
+
 @register_decomposition(aten._native_batch_norm_legit.default)
 def _native_batch_norm_legit(
     input: Tensor,
@@ -1991,12 +2011,13 @@ def upsample_compute_output_size(input_size, output_size, scale_factors):
             lambda: "Must specify exactly one of output_size and scale_factors",
         )
         utils.check(len(scale_factors) == spatial_dimensions, lambda: "")
-        return [
-            # Returning output_size as float. We cannot convert it to int directly,
-            # as latter computation of scale_factor is relying output size being float
-            sym_float(input_size[i + 2] * scale_factors[i])
-            for i in range(spatial_dimensions)
-        ]
+        output_size = []
+        for i, s in enumerate(scale_factors):
+            if int(s) == s:
+                output_size.append(input_size[i + 2] * int(s))
+            else:
+                output_size.append(sym_int(input_size[i + 2] * s))
+        return output_size
     utils.check(
         False, lambda: "Must specify exactly one of output_size and scale_factors"
     )
@@ -2015,8 +2036,6 @@ def upsample_nearest1d_vec(input, output_size, scale_factors):
     osize = upsample_compute_output_size(input.size(), output_size, scale_factors)
     scale = get_scale_value(scale_factors, 0)
 
-    # NB: osize could be a list of float when scale_factors is float
-    # so we cannot redispatch to aten.upsample_nearest1d.default here
     return upsample_nearest1d(input, osize, scale)
 
 
@@ -2028,8 +2047,6 @@ def upsample_nearest2d_vec(input, output_size, scale_factors):
     scale_h = get_scale_value(scale_factors, 0)
     scale_w = get_scale_value(scale_factors, 1)
 
-    # NB: osize could be a list of float when scale_factors is float
-    # so we cannot redispatch to aten.upsample_nearest2d.default here
     return upsample_nearest2d(input, osize, scale_h, scale_w)
 
 
@@ -2042,12 +2059,10 @@ def upsample_nearest3d_vec(input, output_size, scale_factors):
     scale_h = get_scale_value(scale_factors, 1)
     scale_w = get_scale_value(scale_factors, 2)
 
-    # NB: osize could be a list of float when scale_factors is float
-    # so we cannot redispatch to aten.upsample_nearest3d.default here
     return upsample_nearest3d(input, osize, scale_d, scale_h, scale_w)
 
 
-def _compute_upsample_nearest_indices(input, output_size):
+def _compute_upsample_nearest_indices(input, output_size, scales):
     # For each dim in output_size, compute the set of input indices used
     # to produce the upsampled output.
     indices = []
@@ -2058,13 +2073,11 @@ def _compute_upsample_nearest_indices(input, output_size):
         # scale = isize / osize
         # input_index = floor(output_index * scale)
         # Same as OpenCV INTER_NEAREST
-        osize = sym_float(output_size[d])
-        output_indices = torch.arange(
-            sym_int(osize), dtype=input.dtype, device=input.device
-        )
-        isize = sym_float(input.shape[-num_spatial_dims + d])
-        scale = isize / osize
-        input_indices = torch.floor(output_indices * scale).to(torch.int64)
+        osize = output_size[d]
+        output_indices = torch.arange(osize, dtype=input.dtype, device=input.device)
+        isize = input.shape[-num_spatial_dims + d]
+        scale = isize / (isize * scales[d]) if scales[d] is not None else isize / osize
+        input_indices = (output_indices * scale).to(torch.int64)
         for _ in range(num_spatial_dims - 1 - d):
             input_indices = input_indices.unsqueeze(-1)
         indices.append(input_indices)
@@ -2076,10 +2089,10 @@ def _compute_upsample_nearest_indices(input, output_size):
 @pw_cast_for_opmath
 def upsample_nearest1d(
     input: Tensor,
-    output_size: List[Union[int, float]],
+    output_size: List[int],
     scales: Optional[float] = None,
 ) -> Tensor:
-    (l_indices,) = _compute_upsample_nearest_indices(input, output_size)
+    (l_indices,) = _compute_upsample_nearest_indices(input, output_size, (scales,))
     result = input[:, :, l_indices]
     return result
 
@@ -2089,11 +2102,13 @@ def upsample_nearest1d(
 @pw_cast_for_opmath
 def upsample_nearest2d(
     input: Tensor,
-    output_size: List[Union[int, float]],
+    output_size: List[int],
     scales_h: Optional[float] = None,
     scales_w: Optional[float] = None,
 ) -> Tensor:
-    h_indices, w_indices = _compute_upsample_nearest_indices(input, output_size)
+    h_indices, w_indices = _compute_upsample_nearest_indices(
+        input, output_size, (scales_h, scales_w)
+    )
     result = input[:, :, h_indices, w_indices]
 
     # convert output to correct memory format, if necessary
@@ -2114,19 +2129,582 @@ def upsample_nearest2d(
 @pw_cast_for_opmath
 def upsample_nearest3d(
     input: Tensor,
-    output_size: List[Union[int, float]],
+    output_size: List[int],
     scales_d: Optional[float] = None,
     scales_h: Optional[float] = None,
     scales_w: Optional[float] = None,
 ) -> Tensor:
     d_indices, h_indices, w_indices = _compute_upsample_nearest_indices(
-        input, output_size
+        input, output_size, (scales_d, scales_h, scales_w)
     )
     result = input[:, :, d_indices, h_indices, w_indices]
 
     return result
 
 
+def gather_params(params, has_biases, has_projections):
+    if has_biases and has_projections:
+        group_size = 5
+    elif has_biases:
+        group_size = 4
+    elif has_projections:
+        group_size = 3
+    else:
+        group_size = 2
+
+    assert len(params) % group_size == 0, len(params)
+    return [
+        tuple(params[i : i + group_size]) for i in range(0, len(params), group_size)
+    ]
+
+
+def params_hiddens(params, hiddens, i, bidirectional):
+    if bidirectional:
+        cur_params, cur_hidden = params[2 * i], hiddens[2 * i]
+        bidir_params, bidir_hidden = params[2 * i + 1], hiddens[2 * i + 1]
+    else:
+        cur_params, cur_hidden = params[i], hiddens[i]
+        bidir_params, bidir_hidden = None, None
+
+    return cur_params, cur_hidden, bidir_params, bidir_hidden
+
+
+def update_hidden_for_packed(cur_hidden, last_batch_size, batch_size, hiddens):
+    assert last_batch_size > batch_size
+    hiddens.append(cur_hidden.narrow(0, batch_size, last_batch_size - batch_size))
+    return cur_hidden.narrow(0, 0, batch_size)
+
+
+def update_hidden_for_packed_reverse(
+    cur_hidden, last_batch_size, batch_size, inp_hidden
+):
+    if last_batch_size == batch_size:
+        return cur_hidden
+    assert last_batch_size < batch_size
+    return torch.concat(
+        (
+            cur_hidden,
+            inp_hidden.narrow(0, last_batch_size, batch_size - last_batch_size),
+        )
+    )
+
+
+def one_layer_rnn_data(
+    inp, hidden, params, has_biases, hidden_fn, batch_sizes, reverse=False
+):
+    ih_weight = params[0]
+    hh_weight = params[1]
+    ih_bias = params[2] if has_biases else None
+    hh_bias = params[3] if has_biases else None
+
+    step_output = []
+    hiddens: List["torch.Tensor"] = []
+
+    last_batch_size = batch_sizes[-1] if reverse else batch_sizes[0]
+    cur_hidden = hidden.narrow(0, 0, last_batch_size)
+    split_inp = torch.split(inp, list(batch_sizes))
+    if reverse:
+        split_inp = split_inp[::-1]
+    for inp in split_inp:
+        i = inp.shape[0]
+
+        if last_batch_size == i:
+            pass  # don't update cur_hidden
+        # this will only happen when reverse=False, since batch sizes are sorted largest -> smallest
+        elif reverse:
+            cur_hidden = update_hidden_for_packed_reverse(
+                cur_hidden, last_batch_size, i, hidden
+            )
+        else:
+            cur_hidden = update_hidden_for_packed(
+                cur_hidden, last_batch_size, i, hiddens
+            )
+
+        cur_hidden = hidden_fn(inp, cur_hidden, ih_weight, ih_bias, hh_weight, hh_bias)
+        last_batch_size = i
+        step_output.append(cur_hidden)
+
+    if reverse:
+        step_output.reverse()
+    else:
+        hiddens.append(cur_hidden)
+        hiddens.reverse()
+
+    out = torch.cat(step_output, 0)
+    hidden_out = torch.cat(hiddens, 0) if not reverse else cur_hidden
+    return out, hidden_out
+
+
+def rnn_cell(nonlinearity):
+    def inner(i, cur_hidden, ih_weight, ih_bias, hh_weight, hh_bias):
+        return nonlinearity(F.linear(cur_hidden, hh_weight, hh_bias) + i)
+
+    return inner
+
+
+def rnn_cell_data(nonlinearity):
+    def inner(i, cur_hidden, ih_weight, ih_bias, hh_weight, hh_bias):
+        i = F.linear(i, ih_weight, ih_bias)
+        return nonlinearity(F.linear(cur_hidden, hh_weight, hh_bias) + i)
+
+    return inner
+
+
+def one_layer_rnn(inp, hidden, params, has_biases, hidden_fn, reverse=False):
+    ih_weight = params[0]
+    hh_weight = params[1]
+    ih_bias = params[2] if has_biases else None
+    hh_bias = params[3] if has_biases else None
+
+    precomputed_input = F.linear(inp, ih_weight, ih_bias)
+    precomputed_input = precomputed_input.flip(0) if reverse else precomputed_input
+    cur_hidden = hidden.unsqueeze(0)
+    step_output = []
+    for i in precomputed_input:
+        cur_hidden = hidden_fn(i, cur_hidden, ih_weight, ih_bias, hh_weight, hh_bias)
+        step_output.append(cur_hidden)
+
+    if reverse:
+        step_output.reverse()
+
+    out = torch.cat(step_output, 0)
+
+    return out, cur_hidden.squeeze(0)
+
+
+def _rnn_helper(
+    input,
+    hidden,
+    params,
+    has_biases,
+    num_layers,
+    dropout,
+    train,
+    bidirectional,
+    batch_first,
+    layer_fn,
+):
+    input = input.transpose(0, 1) if batch_first else input
+    final_hiddens = []
+
+    for i in range(num_layers):
+        cur_params, cur_hidden, bidir_params, bidir_hidden = params_hiddens(
+            params, hidden, i, bidirectional
+        )
+        dropout = dropout if (train and num_layers < i - 1) else 0.0
+        fwd_inp, fwd_hidden = layer_fn(input, cur_hidden, cur_params, has_biases)
+        final_hiddens.append(fwd_hidden)
+
+        if bidirectional:
+            bwd_inp, bwd_hidden = layer_fn(
+                input, bidir_hidden, bidir_params, has_biases, reverse=True
+            )
+            final_hiddens.append(bwd_hidden)
+
+        if bidirectional:
+            input = torch.cat([fwd_inp, bwd_inp], fwd_inp.dim() - 1)
+        else:
+            input = fwd_inp
+
+        if dropout != 0 and train and i < num_layers - 1:
+            input = torch.dropout(input, dropout, train=True)
+
+    input = input.transpose(0, 1) if batch_first else input
+    return input, final_hiddens
+
+
+@register_decomposition(aten.rnn_tanh.input)
+@aten.rnn_tanh.input.py_impl(DispatchKey.CompositeImplicitAutograd)
+@aten.rnn_tanh.input.py_impl(DispatchKey.Autograd)
+def rnn_tanh_input(
+    input,
+    hx,
+    params,
+    has_biases,
+    num_layers,
+    dropout,
+    train,
+    bidirectional,
+    batch_first,
+):
+    hidden = hx.unbind(0)
+    params = gather_params(params, has_biases, False)
+    out, final_hiddens = _rnn_helper(
+        input,
+        hidden,
+        params,
+        has_biases,
+        num_layers,
+        dropout,
+        train,
+        bidirectional,
+        batch_first,
+        partial(one_layer_rnn, hidden_fn=rnn_cell(torch.tanh)),
+    )
+    return out, torch.stack(final_hiddens, 0)
+
+
+@register_decomposition(aten.rnn_relu.input)
+@aten.rnn_relu.input.py_impl(DispatchKey.CompositeImplicitAutograd)
+@aten.rnn_relu.input.py_impl(DispatchKey.Autograd)
+def rnn_relu_input(
+    input,
+    hx,
+    params,
+    has_biases,
+    num_layers,
+    dropout,
+    train,
+    bidirectional,
+    batch_first,
+):
+    hidden = hx.unbind(0)
+    params = gather_params(params, has_biases, False)
+    out, final_hiddens = _rnn_helper(
+        input,
+        hidden,
+        params,
+        has_biases,
+        num_layers,
+        dropout,
+        train,
+        bidirectional,
+        batch_first,
+        partial(one_layer_rnn, hidden_fn=rnn_cell(torch.relu)),
+    )
+    return out, torch.stack(final_hiddens, 0)
+
+
+@register_decomposition(aten.rnn_relu.data)
+@aten.rnn_relu.data.py_impl(DispatchKey.CompositeImplicitAutograd)
+@aten.rnn_relu.data.py_impl(DispatchKey.Autograd)
+def rnn_relu_data(
+    data,
+    batch_sizes,
+    hx,
+    params,
+    has_biases,
+    num_layers,
+    dropout,
+    train,
+    bidirectional,
+):
+    hidden = hx.unbind(0)
+    params = gather_params(params, has_biases, False)
+    out, final_hiddens = _rnn_helper(
+        data,
+        hidden,
+        params,
+        has_biases,
+        num_layers,
+        dropout,
+        train,
+        bidirectional,
+        False,
+        partial(
+            one_layer_rnn_data,
+            batch_sizes=batch_sizes,
+            hidden_fn=rnn_cell_data(torch.relu),
+        ),
+    )
+    return out, torch.stack(final_hiddens, 0)
+
+
+@register_decomposition(aten.rnn_tanh.data)
+@aten.rnn_tanh.data.py_impl(DispatchKey.CompositeImplicitAutograd)
+@aten.rnn_tanh.data.py_impl(DispatchKey.Autograd)
+def rnn_tanh_data(
+    data,
+    batch_sizes,
+    hx,
+    params,
+    has_biases,
+    num_layers,
+    dropout,
+    train,
+    bidirectional,
+):
+    hidden = hx.unbind(0)
+    params = gather_params(params, has_biases, False)
+    out, final_hiddens = _rnn_helper(
+        data,
+        hidden,
+        params,
+        has_biases,
+        num_layers,
+        dropout,
+        train,
+        bidirectional,
+        False,
+        partial(
+            one_layer_rnn_data,
+            batch_sizes=batch_sizes,
+            hidden_fn=rnn_cell_data(torch.tanh),
+        ),
+    )
+    return out, torch.stack(final_hiddens, 0)
+
+
+def lstm_cell(inp, hx, cx, hh_weight, hh_bias, hr_weight, chunk_dim):
+    gates = F.linear(hx, hh_weight, hh_bias) + inp
+    chunked_gates = gates.chunk(4, chunk_dim)
+    in_gate = chunked_gates[0].sigmoid()
+    forget_gate = chunked_gates[1].sigmoid()
+    cell_gate = chunked_gates[2].tanh()
+    out_gate = chunked_gates[3].sigmoid()
+    cy = forget_gate * cx + (in_gate * cell_gate)
+    hy = out_gate * cy.tanh()
+    hy = hy if hr_weight is None else F.linear(hy, hr_weight, None)
+
+    return hy, cy
+
+
+def one_layer_lstm(inp, hidden, params, has_biases, reverse=False):
+    ih_weight = params[0]
+    hh_weight = params[1]
+    ih_bias = params[2] if has_biases else None
+    hh_bias = params[3] if has_biases else None
+    hr_weight = (
+        params[4] if len(params) == 5 else params[2] if len(params) == 3 else None
+    )
+
+    hx = hidden[0].unsqueeze(0)
+    cx = hidden[1].unsqueeze(0)
+
+    precomputed_input = F.linear(inp, ih_weight, ih_bias)
+    precomputed_input = precomputed_input.flip(0) if reverse else precomputed_input
+    step_output = []
+    for inp in precomputed_input:
+        hx, cx = lstm_cell(inp, hx, cx, hh_weight, hh_bias, hr_weight, chunk_dim=2)
+        step_output.append(hx)
+
+    if reverse:
+        step_output.reverse()
+
+    out = torch.cat(step_output, 0)
+
+    return out, (hx.squeeze(1), cx.squeeze(1))
+
+
+def one_layer_lstm_data(inp, hidden, params, has_biases, batch_sizes, reverse=False):
+    ih_weight = params[0]
+    hh_weight = params[1]
+    ih_bias = params[2] if has_biases else None
+    hh_bias = params[3] if has_biases else None
+    hr_weight = (
+        params[4] if len(params) == 5 else params[2] if len(params) == 3 else None
+    )
+
+    step_output = []
+    hiddens = []
+
+    last_batch_size = batch_sizes[-1] if reverse else batch_sizes[0]
+    split_inp = torch.split(inp, list(batch_sizes))
+    if reverse:
+        split_inp = split_inp[::-1]
+
+    orig_hx = hidden[0]
+    orig_cx = hidden[1]
+    hx, cx = orig_hx.narrow(0, 0, last_batch_size), orig_cx.narrow(
+        0, 0, last_batch_size
+    )
+
+    for inp in split_inp:
+        i = inp.shape[0]
+        inp = F.linear(inp, ih_weight, ih_bias)
+
+        # this will only happen when reverse=False, since batch sizes are sorted largest -> smallest
+        if i < last_batch_size:
+            hiddens.append(
+                (
+                    hx.narrow(0, i, last_batch_size - i),
+                    cx.narrow(0, i, last_batch_size - i),
+                )
+            )
+            hx, cx = hx.narrow(0, 0, i), cx.narrow(0, 0, i)
+
+        # this will only happen when reverse=True
+        if i > last_batch_size:
+            hx = torch.concat(
+                (hx, orig_hx.narrow(0, last_batch_size, i - last_batch_size)), 0
+            )
+            cx = torch.concat(
+                (cx, orig_cx.narrow(0, last_batch_size, i - last_batch_size)), 0
+            )
+
+        hx, cx = lstm_cell(inp, hx, cx, hh_weight, hh_bias, hr_weight, chunk_dim=1)
+        last_batch_size = i
+        step_output.append(hx)
+
+    if reverse:
+        step_output.reverse()
+        hidden_out = (hx, cx)
+    else:
+        hiddens.append((hx, cx))
+        hiddens.reverse()
+        hidden0, hidden1 = zip(*hiddens)
+        hidden_out = torch.cat(hidden0, 0), torch.cat(hidden1, 0)
+
+    out = torch.cat(step_output, 0)
+    return out, hidden_out
+
+
+@register_decomposition(aten.lstm.input)
+@aten.lstm.input.py_impl(DispatchKey.CompositeImplicitAutograd)
+@aten.lstm.input.py_impl(DispatchKey.Autograd)
+def lstm_impl(
+    input,
+    hx,
+    params,
+    has_biases,
+    num_layers,
+    dropout,
+    train,
+    bidirectional,
+    batch_first,
+):
+    assert len(hx) == 2, "lstm expects two hidden states"
+    params = gather_params(params, has_biases, hx[0].size(2) != hx[1].size(2))
+    hidden = list(zip(hx[0], hx[1]))
+    out, final_hiddens = _rnn_helper(
+        input,
+        hidden,
+        params,
+        has_biases,
+        num_layers,
+        dropout,
+        train,
+        bidirectional,
+        batch_first,
+        one_layer_lstm,
+    )
+    final_hiddens = list(zip(*final_hiddens))
+    return out, torch.stack(final_hiddens[0], 0), torch.stack(final_hiddens[1], 0)
+
+
+@register_decomposition(aten.lstm.data)
+@aten.lstm.data.py_impl(DispatchKey.CompositeImplicitAutograd)
+@aten.lstm.data.py_impl(DispatchKey.Autograd)
+def lstm_data_impl(
+    data,
+    batch_sizes,
+    hx,
+    params,
+    has_biases,
+    num_layers,
+    dropout,
+    train,
+    bidirectional,
+):
+    assert len(hx) == 2, "lstm expects two hidden states"
+    params = gather_params(params, has_biases, hx[0].size(2) != hx[1].size(2))
+    hidden = list(zip(hx[0], hx[1]))
+    out, final_hiddens = _rnn_helper(
+        data,
+        hidden,
+        params,
+        has_biases,
+        num_layers,
+        dropout,
+        train,
+        bidirectional,
+        False,
+        partial(one_layer_lstm_data, batch_sizes=batch_sizes),
+    )
+    final_hiddens = list(zip(*final_hiddens))
+    return out, torch.stack(final_hiddens[0], 0), torch.stack(final_hiddens[1], 0)
+
+
+def gru_cell(inp, cur_hidden, ih_weight, ih_bias, hh_weight, hh_bias):
+    chunked_igates = inp.chunk(3, 1)
+    chunked_hgates = F.linear(cur_hidden, hh_weight, hh_bias).chunk(3, 2)
+    reset_gate = (chunked_hgates[0] + chunked_igates[0]).sigmoid()
+    input_gate = (chunked_hgates[1] + chunked_igates[1]).sigmoid()
+    new_gate = (chunked_igates[2] + (chunked_hgates[2] * reset_gate)).tanh()
+    return (cur_hidden - new_gate) * input_gate + new_gate
+
+
+def gru_cell_data(inp, cur_hidden, ih_weight, ih_bias, hh_weight, hh_bias):
+    chunked_igates = F.linear(inp, ih_weight, ih_bias).chunk(3, 1)
+    chunked_hgates = F.linear(cur_hidden, hh_weight, hh_bias).chunk(3, 1)
+    reset_gate = (chunked_hgates[0] + chunked_igates[0]).sigmoid()
+    input_gate = (chunked_hgates[1] + chunked_igates[1]).sigmoid()
+    new_gate = (chunked_igates[2] + (chunked_hgates[2] * reset_gate)).tanh()
+    return (cur_hidden - new_gate) * input_gate + new_gate
+
+
+@register_decomposition(aten.gru.data)
+@aten.gru.data.py_impl(DispatchKey.CompositeImplicitAutograd)
+@aten.gru.data.py_impl(DispatchKey.Autograd)
+def gru_impl_data(
+    data,
+    batch_sizes,
+    hx,
+    params,
+    has_biases,
+    num_layers,
+    dropout,
+    train,
+    bidirectional,
+):
+    params = gather_params(params, has_biases, False)
+    out, final_hiddens = _rnn_helper(
+        data,
+        hx.unbind(0),
+        params,
+        has_biases,
+        num_layers,
+        dropout,
+        train,
+        bidirectional,
+        False,
+        partial(one_layer_rnn_data, batch_sizes=batch_sizes, hidden_fn=gru_cell_data),
+    )
+    return out, torch.stack(final_hiddens, 0)
+
+
+@register_decomposition(aten.gru.input)
+@aten.gru.input.py_impl(DispatchKey.CompositeImplicitAutograd)
+@aten.gru.input.py_impl(DispatchKey.Autograd)
+def gru_impl(
+    input,
+    hx,
+    params,
+    has_biases,
+    num_layers,
+    dropout,
+    train,
+    bidirectional,
+    batch_first,
+):
+    params = gather_params(params, has_biases, False)
+    out, final_hiddens = _rnn_helper(
+        input,
+        hx.unbind(0),
+        params,
+        has_biases,
+        num_layers,
+        dropout,
+        train,
+        bidirectional,
+        batch_first,
+        partial(one_layer_rnn, hidden_fn=gru_cell),
+    )
+    return out, torch.stack(final_hiddens, 0)
+
+
+@register_decomposition(aten._upsample_bilinear2d_aa.vec)
+@aten._upsample_bilinear2d_aa.vec.py_impl(DispatchKey.CompositeImplicitAutograd)
+@aten._upsample_bilinear2d_aa.vec.py_impl(DispatchKey.Autograd)
+def upsample_bilinear2d_aa_vec(input, output_size, align_corners, scale_factors):
+    osize = upsample_compute_output_size(input.size(), output_size, scale_factors)
+    scale_h = get_scale_value(scale_factors, 0)
+    scale_w = get_scale_value(scale_factors, 1)
+    return torch.ops.aten._upsample_bilinear2d_aa(
+        input, osize, align_corners, scale_h, scale_w
+    )
+
+
 @register_decomposition(aten.upsample_bilinear2d.vec)
 @aten.upsample_bilinear2d.vec.py_impl(DispatchKey.CompositeImplicitAutograd)
 @aten.upsample_bilinear2d.vec.py_impl(DispatchKey.Autograd)
@@ -2134,9 +2712,6 @@ def upsample_bilinear2d_vec(input, output_size, align_corners, scale_factors):
     osize = upsample_compute_output_size(input.size(), output_size, scale_factors)
     scale_h = get_scale_value(scale_factors, 0)
     scale_w = get_scale_value(scale_factors, 1)
-
-    # NB: osize could be a list of float when scale_factors is float
-    # so we cannot redispatch to aten.upsample_bilinear2d.default here
     return upsample_bilinear2d(input, osize, align_corners, scale_h, scale_w)
 
 
@@ -2145,7 +2720,7 @@ def upsample_bilinear2d_vec(input, output_size, align_corners, scale_factors):
 @pw_cast_for_opmath
 def upsample_bilinear2d(
     input: Tensor,
-    output_size: List[Union[int, float]],
+    output_size: List[int],
     align_corners: bool,
     scales_h: Optional[float] = None,
     scales_w: Optional[float] = None,
@@ -2153,29 +2728,33 @@ def upsample_bilinear2d(
     # get dimensions of original image
     n_batch, n_channels, in_h, in_w = input.shape
 
-    out_h = sym_float(output_size[0])
-    out_w = sym_float(output_size[1])
+    out_h = output_size[0]
+    out_w = output_size[1]
 
     # Calculate horizontal and vertical scaling factor
     # TODO: Figure out if scales_h/scales_w matters here
     if out_h > 1:
         if align_corners:
-            h_scale_factor = (in_h - 1) / (sym_int(out_h) - 1)
+            h_scale_factor = (in_h - 1) / (out_h - 1)
         else:
-            h_scale_factor = in_h / out_h
+            h_scale_factor = (
+                in_h / (in_h * scales_h) if scales_h is not None else in_h / out_h
+            )
     else:
         h_scale_factor = 0.0
 
     if out_w > 1:
         if align_corners:
-            w_scale_factor = (in_w - 1) / (sym_int(out_w) - 1)
+            w_scale_factor = (in_w - 1) / (out_w - 1)
         else:
-            w_scale_factor = in_w / out_w
+            w_scale_factor = (
+                in_w / (in_w * scales_w) if scales_w is not None else in_w / out_w
+            )
     else:
         w_scale_factor = 0.0
 
-    i = torch.arange(sym_int(out_h), dtype=input.dtype, device=input.device)
-    j = torch.arange(sym_int(out_w), dtype=input.dtype, device=input.device)
+    i = torch.arange(out_h, dtype=input.dtype, device=input.device)
+    j = torch.arange(out_w, dtype=input.dtype, device=input.device)
 
     if align_corners:
         x = h_scale_factor * i
@@ -2184,9 +2763,9 @@ def upsample_bilinear2d(
         x = (h_scale_factor * (i + 0.5) - 0.5).clamp(min=0.0)
         y = (w_scale_factor * (j + 0.5) - 0.5).clamp(min=0.0)
 
-    x_floor = torch.floor(x).to(torch.int64)
+    x_floor = x.to(torch.int64)
     x_ceil = torch.ceil(x).clamp(max=in_h - 1).to(torch.int64)
-    y_floor = torch.floor(y).to(torch.int64)
+    y_floor = y.to(torch.int64)
     y_ceil = torch.ceil(y).clamp(max=in_w - 1).to(torch.int64)
 
     x_view = x.unsqueeze(1)
@@ -2734,6 +3313,8 @@ def get_x_interp(y):
 
 
 @register_decomposition(aten.upsample_bicubic2d.vec)
+@aten.upsample_bicubic2d.vec.py_impl(DispatchKey.CompositeImplicitAutograd)
+@aten.upsample_bicubic2d.vec.py_impl(DispatchKey.Autograd)
 @out_wrapper()
 @pw_cast_for_opmath
 def upsample_bicubic2d_vec(
@@ -2750,7 +3331,10 @@ def upsample_bicubic2d_vec(
         assert scale_factors is not None
         output_size = cast(
             Tuple[int, int],
-            tuple(int(w * scale) for w, scale in zip(a.shape[2:], scale_factors)),
+            tuple(
+                sym_int(sym_float(w) * scale)
+                for w, scale in zip(a.shape[2:], scale_factors)
+            ),
         )
     scale_h, scale_w = scale_factors if scale_factors else (None, None)
     return upsample_bicubic2d_default(a, output_size, align_corners, scale_h, scale_w)
diff --git a/torch/_dynamo/__init__.py b/torch/_dynamo/__init__.py
index 94a73397d9fa..64ae116839a4 100644
--- a/torch/_dynamo/__init__.py
+++ b/torch/_dynamo/__init__.py
@@ -1,10 +1,12 @@
 from . import allowed_functions, convert_frame, eval_frame, resume_execution
+from .backends.registry import list_backends, register_backend
 from .convert_frame import replay
 from .eval_frame import (
     assume_constant_result,
     disable,
     explain,
     export,
+    is_dynamo_supported,
     optimize,
     optimize_assert,
     OptimizedModule,
@@ -19,7 +21,9 @@
     "allow_in_graph",
     "assume_constant_result",
     "disallow_in_graph",
+    "forbid_in_graph",
     "graph_break",
+    "mark_dynamic",
     "optimize",
     "optimize_assert",
     "export",
@@ -28,10 +32,11 @@
     "replay",
     "disable",
     "reset",
-    "list_backends",
     "skip",
     "OptimizedModule",
     "is_compiling",
+    "register_backend",
+    "list_backends",
 ]
 
 
@@ -51,19 +56,6 @@ def reset():
     reset_frame_count()
 
 
-def list_backends():
-    """
-    Return valid strings that can be passed to::
-
-        @torch._dynamo.optimize(<backend>)
-        def foo(...):
-           ....
-    """
-    from .optimizations import BACKENDS
-
-    return sorted(BACKENDS.keys())
-
-
 def allow_in_graph(fn):
     """
     Customize which functions TorchDynamo will include in the generated
@@ -123,3 +115,55 @@ def fn(a):
 def graph_break():
     """Force a graph break"""
     pass
+
+
+def forbid_in_graph(fn):
+    """
+    Customize which functions TorchDynamo will assert are not present while tracing.
+
+    If you want a graph break on this function instead, use disallow_in_graph.
+    TODO(voz): We now have allow_in_graph, disallow_in_graph, forbid_in_graph - some more robust
+    documentation would not be amiss.
+    """
+    if isinstance(fn, (list, tuple)):
+        return [forbid_in_graph(x) for x in fn]
+    assert callable(fn), "forbid_in_graph applies only to callables"
+    fn._dynamo_forbidden = True
+    return fn
+
+
+@forbid_in_graph
+def mark_dynamic(t, index):
+    """
+    Mark a tensor as having a dynamic dim.
+
+    [Note - on the state of mark_dynamic]
+
+    The behavior of having a dynamic dimension on a tensor is governed by a few factors:
+
+    1) torch._dynamo.config dynamic_shapes True or False.
+        a) dynamic_shapes=True - dynamic_shapes must be True for mark_dynamic to work.
+        a) dynamic_shapes=False - This config will raise an exception when used in conjunction with
+        mark_dyamic. We will eventually support this.
+
+    2) If the dimension is fully constrained - as in, it does not allow more than a single value
+    in both eager (torch.compile, torch._dynamo.optimize) mode and export mode (torch._dynamo.export),
+    we will raise an error
+
+    3) If the dimension is partially constrained - allowing at least 2 values but not the full unbounded
+    range of shapes, in eager we will pass it through, but export will raise an error.
+
+    4) Attempts to trace this function will explicitly raise. As such, all calls to mark_dynamic must be made
+    before torch.compile.
+
+    """
+    if isinstance(index, int):
+        if not hasattr(t, "_dynamo_dynamic_indices"):
+            t._dynamo_dynamic_indices = set()
+        # TODO(voz): Should we bounds check?
+        t._dynamo_dynamic_indices.add(index)
+        return
+
+    assert isinstance(index, (list, tuple))
+    for i in index:
+        mark_dynamic(t, i)
diff --git a/torch/_dynamo/allowed_functions.py b/torch/_dynamo/allowed_functions.py
index e7a3983b05bf..5440521ad1d2 100644
--- a/torch/_dynamo/allowed_functions.py
+++ b/torch/_dynamo/allowed_functions.py
@@ -107,7 +107,6 @@ def _disallowed_function_ids():
         torch.set_autocast_cpu_enabled,
         torch.set_autocast_enabled,
         torch.set_autocast_gpu_dtype,
-        torch.autograd.profiler.profile,
         warnings.warn,
         torch._C._dynamo.eval_frame.unsupported,
     ]
@@ -185,6 +184,12 @@ def _find_torch_objects(module):
     _find_torch_objects(torch)
     _find_torch_objects(math)
 
+    # torch.Tensor.{fn}
+    for name in dir(torch.Tensor):
+        method = getattr(torch.Tensor, name)
+        if isinstance(method, types.MethodDescriptorType):
+            torch_object_ids[id(method)] = f"torch.Tensor.{name}"
+
     for idx in _disallowed_function_ids():
         if idx in torch_object_ids:
             del torch_object_ids[idx]
@@ -257,7 +262,7 @@ def is_allowed(obj):
 
 
 def torch_get_name(obj, default):
-    """Convert a torch.* funcion to a string"""
+    """Convert a torch.* function to a string"""
     return _allowed_function_ids.get_name(id(obj), default)
 
 
diff --git a/torch/_dynamo/backends/__init__.py b/torch/_dynamo/backends/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/torch/_dynamo/backends/common.py b/torch/_dynamo/backends/common.py
new file mode 100644
index 000000000000..a5fbbaae5581
--- /dev/null
+++ b/torch/_dynamo/backends/common.py
@@ -0,0 +1,121 @@
+import functools
+import logging
+
+import torch
+from torch._dynamo import eval_frame
+from torch._dynamo.utils import counters
+from torch._functorch.aot_autograd import aot_module_simplified
+from torch._subclasses import FakeTensor
+from torch.utils._python_dispatch import _disable_current_modes
+
+log = logging.getLogger(__name__)
+
+
+def aot_autograd(**kwargs):
+    def compiler_fn(gm: torch.fx.GraphModule, example_inputs):
+        import functorch.compile
+
+        # Hack to get around circular import problems with aot_eager_decomp_partition
+        if callable(kwargs.get("decompositions")):
+            kwargs["decompositions"] = kwargs["decompositions"]()
+
+        # TODO: stop monkeypatching here (without even cleaning up, UGH!)
+        functorch.compile.config.use_functionalize = True
+        functorch.compile.config.use_fake_tensor = True
+
+        counters["aot_autograd"]["total"] += 1
+        use_fallback = False
+
+        if use_fallback:
+            log.debug("Unable to use AOT Autograd because graph has mutation")
+            counters["aot_autograd"]["not_ok"] += 1
+            return gm
+
+        # OK attempt to compile
+
+        def _wrapped_bw_compiler(*args, **kwargs):
+            # stop TorchDynamo from trying to compile our generated backwards pass
+            return eval_frame.disable(eval_frame.disable(bw_compiler)(*args, **kwargs))
+
+        bw_compiler = kwargs.get("bw_compiler") or kwargs["fw_compiler"]
+        kwargs["bw_compiler"] = _wrapped_bw_compiler
+
+        from torch._inductor.debug import enable_aot_logging
+
+        try:
+            # NB: NOT cloned!
+            with enable_aot_logging():
+                cg = aot_module_simplified(gm, example_inputs, **kwargs)
+                counters["aot_autograd"]["ok"] += 1
+                return eval_frame.disable(cg)
+        except Exception:
+            counters["aot_autograd"]["not_ok"] += 1
+            raise
+
+    return compiler_fn
+
+
+def mem_efficient_fusion_kwargs(use_decomps):
+    from functorch.compile import (
+        default_decompositions,
+        min_cut_rematerialization_partition,
+        ts_compile,
+    )
+
+    kwargs = {
+        # these are taken from memory_efficient_fusion()
+        "fw_compiler": ts_compile,
+        "bw_compiler": ts_compile,
+        "partition_fn": min_cut_rematerialization_partition,
+    }
+
+    if use_decomps:
+        kwargs["decompositions"] = default_decompositions
+
+    return kwargs
+
+
+def fake_tensor_unsupported(fn):
+    """
+    Decorator for backends that need real inputs.  We swap out fake
+    tensors for zero tensors.
+    """
+
+    def defake(x):
+        if not isinstance(x, FakeTensor):
+            return x
+        if x._has_symbolic_sizes_strides:
+            size = [s.node.shape_env.size_hint(s.node.expr) for s in x.size()]
+            stride = [s.node.shape_env.size_hint(s.node.expr) for s in x.stride()]
+        else:
+            size = x.size()
+            stride = x.stride()
+        y = torch.empty_strided(
+            size,
+            stride,
+            dtype=x.dtype,
+            device=x.device,
+            requires_grad=x.requires_grad,
+        )
+        y.zero_()
+        return y
+
+    @functools.wraps(fn)
+    def wrapper(model, inputs, **kwargs):
+        with _disable_current_modes():
+            inputs = list(map(defake, inputs))
+            return fn(model, inputs, **kwargs)
+
+    return wrapper
+
+
+def device_from_inputs(example_inputs) -> torch.device:
+    for x in example_inputs:
+        if hasattr(x, "device"):
+            return x.device
+
+
+def dtype_from_inputs(example_inputs) -> torch.dtype:
+    for x in example_inputs:
+        if hasattr(x, "dtype"):
+            return x.dtype
diff --git a/torch/_dynamo/backends/cudagraphs.py b/torch/_dynamo/backends/cudagraphs.py
new file mode 100644
index 000000000000..8148bc50bfe1
--- /dev/null
+++ b/torch/_dynamo/backends/cudagraphs.py
@@ -0,0 +1,180 @@
+import logging
+import operator
+from collections import defaultdict
+from typing import Set
+
+import torch
+
+from torch.fx import GraphModule
+from torch.fx.passes.backends.cudagraphs import partition_cudagraphs
+from torch.multiprocessing.reductions import StorageWeakRef
+from torch.nn import Module
+from torch.utils._pytree import tree_map
+from .common import aot_autograd
+from .registry import register_backend
+
+log = logging.getLogger(__name__)
+
+
+def cloner(t):
+    if isinstance(t, torch.Tensor):
+        return t.clone()
+    else:
+        return t
+
+
+class CudaGraphModule(Module):
+    gm: GraphModule
+    mutated_inputs: Set[int]
+
+    def __init__(self, gm, mutated_inputs):
+        super().__init__()
+        self.gm = gm
+        self.mutated_inputs = mutated_inputs
+
+    warmed_up = False
+
+    # these are all None or all filled
+    graph = None
+    static_inputs = None
+    static_outputs = None
+
+    # NB: we override __call__ as we don't need any nn.Module machinery
+    # and to reduce overhead
+    def __call__(self, *args):
+        # TODO: once we've recorded here, we'd like to replace the __call__
+        # implementation with compiled bytecode that copies into static, replays
+        # the cuda graph, then copies out.  First condition is the hotpath,
+        # needs optimizing
+        if self.graph is not None:
+            assert len(args) == len(self.static_inputs)
+            for dst, src in zip(self.static_inputs, args):
+                dst.copy_(src)
+            self.graph.replay()
+            for i in self.mutated_inputs:
+                args[i].copy_(self.static_inputs[i])
+            return tree_map(cloner, self.static_outputs)
+
+        elif self.warmed_up:
+            # record
+            self.static_inputs = [x.clone() for x in args]
+            self.graph = torch.cuda.CUDAGraph()
+            with torch.cuda.graph(self.graph):
+                self.static_outputs = self.gm(*self.static_inputs)
+            # NB: recording doesn't actually run the operations, so
+            # now we immediately replay the graph to serve up the result
+            self.graph.replay()
+            for i in self.mutated_inputs:
+                args[i].copy_(self.static_inputs[i])
+            return tree_map(cloner, self.static_outputs)
+
+        else:
+            # warmup
+            stream = torch.cuda.Stream()
+            stream.wait_stream(torch.cuda.current_stream())
+            with torch.cuda.stream(stream):
+                r = self.gm(*args)
+            torch.cuda.current_stream().wait_stream(stream)
+            self.warmed_up = True
+            return r
+
+
+# Interpreter versions of these passes can be found at
+# https://gist.github.com/ezyang/df2d746cac3b2c7d55c181e37c57ef23
+
+
+def find_input_mutations(g):
+    def meta_fk(meta):
+        return meta["val"] if "val" in meta else meta["fake_result"]
+
+    inputs = defaultdict(set)
+    input_idx = 0
+    mutated_inputs = set()
+    for n in g.nodes:
+        if n.op == "placeholder":
+            inputs[StorageWeakRef(meta_fk(n.meta)._typed_storage())].add(input_idx)
+            input_idx += 1
+        elif n.op == "call_function":
+            if n.target is operator.getitem:
+                continue
+            schema = n.target._schema
+            for i, arg in enumerate(schema.arguments):
+                if i < len(n.args):
+                    argument = n.args[i]
+                else:
+                    if arg.name not in n.kwargs:
+                        continue
+                    argument = n.kwargs[arg.name]
+                mut_arg = False
+                if arg.alias_info:
+                    if arg.alias_info.is_write:
+                        mut_arg = True
+                if mut_arg:
+                    # TODO: not correct for args that contain tensors in a struct
+                    # like list
+                    mutated_inputs |= inputs[
+                        StorageWeakRef(meta_fk(argument.meta)._typed_storage())
+                    ]
+        # TODO: error on unrecognized nodes
+    return mutated_inputs
+
+
+# Mutates input graph
+def apply_cuda_graphs(gm):
+    for n in gm.graph.nodes:
+        if n.op == "call_module":
+            assert not n.kwargs
+            submod = gm.get_submodule(n.target)
+            gm.delete_submodule(n.target)
+            mutated_inputs = find_input_mutations(submod.graph)
+            gm.add_submodule(n.target, CudaGraphModule(submod, mutated_inputs))
+    # NB: we didn't actually change the graph, no need for recompile
+
+
+def cudagraphs(model, inputs):
+    model = partition_cudagraphs(model, inputs)
+    apply_cuda_graphs(model)
+    return model
+
+
+aot_cudagraphs = aot_autograd(fw_compiler=cudagraphs, bw_compiler=cudagraphs)
+
+# aot_cudagraphs only applies CUDA graphs to the graph.  It is also helpful
+# for debugging and can serve as a perf baseline.
+# TODO(jansel): rename to just "cudagraphs"?
+register_backend(name="cudagraphs", compiler_fn=aot_cudagraphs)
+
+
+def cudagraphs_inner(model, inputs, copy_outputs=True):
+    """This isn't registered as a backend, but is used in some benchmarks"""
+    assert isinstance(inputs, (list, tuple))
+    static_inputs = [torch.zeros_like(x) for x in inputs]
+
+    # warmup
+    torch.cuda.synchronize()
+    stream = torch.cuda.Stream()
+    stream.wait_stream(torch.cuda.current_stream())
+    with torch.cuda.stream(stream):
+        model(*inputs)
+    stream.synchronize()
+    torch.cuda.current_stream().wait_stream(stream)
+    torch.cuda.synchronize()
+
+    # record
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph, stream=stream):
+        static_outputs = model(*static_inputs)
+    if not isinstance(static_outputs, (list, tuple)):
+        static_outputs = (static_outputs,)
+
+    def run(*new_inputs):
+        assert len(static_inputs) == len(new_inputs)
+        for dst, src in zip(static_inputs, new_inputs):
+            dst.copy_(src)
+        graph.replay()
+        if copy_outputs:
+            return [x.clone() for x in static_outputs]
+        else:
+            return static_outputs
+
+    return run
diff --git a/torch/_dynamo/backends/debugging.py b/torch/_dynamo/backends/debugging.py
new file mode 100644
index 000000000000..7b5a291b0dad
--- /dev/null
+++ b/torch/_dynamo/backends/debugging.py
@@ -0,0 +1,56 @@
+import functools
+from importlib import import_module
+
+from functorch.compile import min_cut_rematerialization_partition, nop
+
+import torch
+from torch._functorch.compilers import ts_compile
+from .common import aot_autograd
+from .registry import register_debug_backend as register_backend
+
+"""
+This file contains TorchDynamo backends intended for debugging uses.
+"""
+
+
+@register_backend
+def eager(gm, fake_tensor_inputs):
+    return gm
+
+
+@register_backend(name="ts")
+def torchscript(gm, fake_tensor_inputs):
+    return torch.jit.script(gm)
+
+
+# Useful for debugging purpose
+# aot_eager uses AOT Autograd backend with nop compiler. It is helpful in debugging.
+aot_eager = aot_autograd(fw_compiler=nop)
+register_backend(name="aot_eager", compiler_fn=aot_eager)
+
+
+# Uses TorchInductor AOT Autograd decomps and partitioner to isolate aot vs
+# inductor problems.
+# aot_eager_decomp_partition just replaces the inductor compiler with nop to help
+# isolate inductor vs aot_eager errors
+aot_eager_decomp_partition = aot_autograd(
+    # these are taken from memory_efficient_fusion()
+    fw_compiler=nop,
+    bw_compiler=nop,
+    # NB: lambda here is to delay import of inductor
+    decompositions=lambda: import_module(
+        "torch._inductor.compile_fx"
+    ).select_decomp_table(),
+    partition_fn=functools.partial(
+        min_cut_rematerialization_partition, compiler="inductor"
+    ),
+)
+register_backend(
+    name="aot_eager_decomp_partition", compiler_fn=aot_eager_decomp_partition
+)
+
+# AOT Autograd with torchscript backend. Default partitioner.
+# aot_ts uses torchscript backend. We can use this with both nnc and nvfuser
+# by using the relevant fuser with torch.jit.fuser(...)
+aot_ts = aot_autograd(fw_compiler=ts_compile)
+register_backend(name="aot_ts", compiler_fn=aot_ts)
diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/backends/distributed.py
similarity index 86%
rename from torch/_dynamo/optimizations/distributed.py
rename to torch/_dynamo/backends/distributed.py
index 32f5aafd1300..a9d1a45389ba 100644
--- a/torch/_dynamo/optimizations/distributed.py
+++ b/torch/_dynamo/backends/distributed.py
@@ -1,11 +1,13 @@
 import logging
+import traceback
 from dataclasses import dataclass, field
 from typing import Any, List, Optional
 
 import torch
 from torch import fx
+from torch._dynamo.output_graph import GraphCompileReason
+from torch._dynamo.utils import deepcopy_to_fake_tensor, fake_mode_from_tensors
 from torch.fx.node import Node
-from ..utils import deepcopy_to_fake_tensor, fake_mode_from_tensors
 
 log = logging.getLogger(__name__)
 
@@ -54,7 +56,7 @@ def pretty_print_buckets(buckets: List[Bucket]):
 
 
 class DDPOptimizer:
-    """
+    """Note [DDPOptimizer]
     DDPOptimizer applies when dynamo compiles models wrapped in DistributedDataParallel (DDP),
     breaking the dynamo graph into chunks to compile separately, with the breaks aligning to
     the boundaries of gradient-allreduce buckets chosen by DDP.
@@ -63,7 +65,7 @@ class DDPOptimizer:
      - DDP uses allreduce collectives to synchronize partial gradients computed on different workers
      - DDP groups gradient allreduces into 'buckets' to optimize communication efficiency of all-reduce
      - Parameters grouped into buckets are assumed to be adjacent in time, so they become ready
-       at around the same time during backward and thus can share the same allreduce efficently
+       at around the same time during backward and thus can share the same allreduce efficiently
      - Allreduces must overlap with backward compute for optimal training performance
      - DDP schedules allreduces using 'hooks' fired from the c++ autograd engine in pytorch, which
        operates when individual grads become 'ready'
@@ -164,10 +166,9 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
 
             if node.op == "call_module":
                 target = gm.get_submodule(node.target)
-                for name, p in target.named_parameters():
-                    param = target.get_parameter(name)
-                    if p.requires_grad and not self._ignore_parameter(param):
-                        buckets[0].size += p.untyped_storage().nbytes()
+                for name, param in target.named_parameters():
+                    if param.requires_grad and not self._ignore_parameter(param):
+                        buckets[0].size += param.untyped_storage().nbytes()
                         buckets[0].params.append(f"{node.target}_{name}")
                         buckets[0].param_ids.append(id(param))
             elif node.op == "get_attr":
@@ -183,6 +184,12 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
             # Ignored params still end up in buckets, we just don't count them towards the capacity
             buckets[0].nodes.append(node)
 
+        if len(buckets) > 1 and buckets[0].size == 0:
+            # we collected a small preamble graph with ops that don't include parameters, fuse it back
+            buckets[1].nodes.extend(buckets[0].nodes)
+            assert len(buckets[0].params) == 0, "Params should be empty if size is 0"
+            del buckets[0]
+
         # stash buckets for testing/debugging purposes
         self.buckets = buckets
         log.info(
@@ -253,6 +260,14 @@ def forward(self, *args):
                             sn.args = (sn.args,)
 
                 input_mod.recompile()
+                input_mod.compile_subgraph_reason = GraphCompileReason(
+                    "DDPOptimizer intentional graph-break (See Note [DDPOptimizer])."
+                    " Set `torch._dynamo.config.optimize_ddp = False` to disable.",
+                    [
+                        # it's close to useless to get a real stacktrace here, and quite verbose.
+                        traceback.FrameSummary(__file__, 0, DDPOptimizer),
+                    ],
+                )
                 wrapper = WrapperModule(
                     self.compiler(input_mod, args),
                     unwrap_singleton_tuple,
@@ -261,7 +276,7 @@ def forward(self, *args):
 
             # Note:
             #
-            # The way distributed works today around fake tensors can be somehwat confusing.
+            # The way distributed works today around fake tensors can be somewhat confusing.
             # Some of these codepaths are shared in both runtime, and compile time. The presence
             # of a fake_mode, read off of fake tensor inputs, dictates how we will operate.
             #
@@ -278,7 +293,7 @@ def forward(self, *args):
             # 4) Fake tensors should never be around at runtime.
             #
             # 5) We end up with a compilation mode that takes a real submodule and fake tensors,
-            # to match what aot_autograd exepcts. See Note: [Fake Modules and AOTAutograd]
+            # to match what aot_autograd expects. See Note: [Fake Modules and AOTAutograd]
             def run_node(self, n: Node) -> Any:
                 with self._set_current_node(n):
                     args, kwargs = self.fetch_args_kwargs_from_env(n)
@@ -296,8 +311,6 @@ def run_node(self, n: Node) -> Any:
                     assert isinstance(args, tuple)
                     assert isinstance(kwargs, dict)
 
-                    # modify the currently running FX graph
-                    # maybe this isn't sound in general, but only changing the target of a node might be ok?
                     if n.op == "call_module":
                         real_mod = self.fetch_attr(n.target)
                         if fake_mode:
@@ -308,15 +321,28 @@ def run_node(self, n: Node) -> Any:
                         log.debug(
                             f"\n---{n.target} graph---\n" + str(curr_submod.graph)
                         )
+
+                        # When calling the compiler on the submod, inputs (new_args) are expected to
+                        # be FakeTensors already since Dynamo would have made them FakeTensors in the
+                        # non-DDP flow.  However, the parameters are _not_ expected to be FakeTensors,
+                        # since this wrapping happens during compilation
                         compiled_submod_real = self.compile_submod(
                             real_mod, new_args, kwargs
                         )
+
+                        # We update the original (outer) graph with a call into the compiled module
+                        # instead of the uncompiled one.
                         self.module.delete_submodule(n.target)
                         n.target = "compiled_" + n.target
                         self.module.add_submodule(n.target, compiled_submod_real)
-                        return curr_submod(*new_args, **kwargs)
-                    # then we execute the modified node using the usual logic
-                    return getattr(self, n.op)(n.target, new_args, kwargs)
+
+                        # Finally, we have to produce inputs for use compiling the next submodule,
+                        # and these need to be FakeTensors, so we execute the module under fake_mode
+                        with fake_mode:
+                            return curr_submod(*new_args, **kwargs)
+                    else:
+                        # placeholder or output nodes don't need to get compiled, just executed
+                        return getattr(self, n.op)(n.target, new_args, kwargs)
 
         submod_compiler = SubmodCompiler(split_gm, self.backend_compile_fn)
         submod_compiler.run(*example_inputs)
diff --git a/torch/_dynamo/backends/inductor.py b/torch/_dynamo/backends/inductor.py
new file mode 100644
index 000000000000..cbc427e8eec0
--- /dev/null
+++ b/torch/_dynamo/backends/inductor.py
@@ -0,0 +1,9 @@
+from torch._dynamo import register_backend
+
+
+@register_backend
+def inductor(*args, **kwargs):
+    # do import here to avoid loading inductor into memory when it is not used
+    from torch._inductor.compile_fx import compile_fx
+
+    return compile_fx(*args, **kwargs)
diff --git a/torch/_dynamo/backends/ipex.py b/torch/_dynamo/backends/ipex.py
new file mode 100644
index 000000000000..b95bdb1d5313
--- /dev/null
+++ b/torch/_dynamo/backends/ipex.py
@@ -0,0 +1,39 @@
+import importlib
+import logging
+
+import torch
+from torch._dynamo import register_backend
+from .common import fake_tensor_unsupported
+
+log = logging.getLogger(__name__)
+
+
+@register_backend
+@fake_tensor_unsupported
+def ipex(model, inputs):
+    try:
+        import intel_extension_for_pytorch  # type: ignore[import]  # noqa: F401
+    except ImportError:
+        log.exception(
+            "Unable to import Intel Extension for PyTorch (IPEX). "
+            "Please install the right version of IPEX that matches the PyTorch version being used. "
+            "Refer to https://github.com/intel/intel-extension-for-pytorch for details."
+        )
+        raise
+
+    try:
+        with torch.no_grad():
+            traced_model = torch.jit.trace(model.eval(), inputs)
+            traced_model = torch.jit.freeze(traced_model)
+        return traced_model
+    except Exception:
+        log.warning("JIT trace failed during the 'ipex' optimize process.")
+        return model
+
+
+def has_ipex():
+    try:
+        importlib.import_module("intel_extension_for_pytorch")
+        return True
+    except ImportError:
+        return False
diff --git a/torch/_dynamo/backends/nvfuser.py b/torch/_dynamo/backends/nvfuser.py
new file mode 100644
index 000000000000..958a70bd709e
--- /dev/null
+++ b/torch/_dynamo/backends/nvfuser.py
@@ -0,0 +1,95 @@
+import logging
+from functools import partial
+
+import torch
+from ..backends.common import aot_autograd, mem_efficient_fusion_kwargs
+from .registry import register_backend, register_debug_backend
+
+log = logging.getLogger(__name__)
+
+
+def prims_executor(gm, inputs, *, executor):
+    from functorch.compile import make_boxed_func
+
+    # This function is called once per forward/backward pass of a graph in AOT
+    # Autograd. We use it to set up the nvFuser-specific FX graph and return
+    # execute function.
+    from torch._prims.context import TorchRefsNvfuserCapabilityMode
+    from torch._prims.executor import execute
+    from torch.fx.experimental.proxy_tensor import make_fx
+
+    # AOT Autograd might not use the partitioner, so we need to make sure that
+    # the graph is transformed to use nvFuser-compatible nodes.
+    if not getattr(gm, "_nvprim_transformed", False):
+        with TorchRefsNvfuserCapabilityMode():
+            gm = make_fx(gm)(*inputs)
+
+    # Then we return a callable that executes the "gm" graph
+    return make_boxed_func(partial(execute, gm, executor=executor))
+
+
+def nvprims_fw_bw_partition_fn(joint_module, joint_inputs, *, num_fwd_outputs):
+    # This function is called once per forward+backward pass of a graph in AOT
+    # Autograd. We use it to set up the nvFuser-specific FX graph that is later
+    # passed to the executor.
+    from functorch.compile import min_cut_rematerialization_partition
+
+    from torch._prims.context import TorchRefsNvfuserCapabilityMode
+    from torch.fx.experimental.proxy_tensor import make_fx
+
+    # AOT Autograd expects arguments of the traced function to be named exactly
+    # "primals, tangents"
+    def func(primals, tangents):
+        return joint_module(primals, tangents)
+
+    # First we trace the graph conditionally decomposing nodes
+    # that can be sent to the nvfuser executor
+    with TorchRefsNvfuserCapabilityMode():
+        prim_gm = make_fx(func)(*joint_inputs)
+
+    # all nvprims for now
+    recomputable_ops = {
+        getattr(torch.ops.nvprims, prim)
+        for prim in dir(torch.ops.nvprims)
+        if isinstance(getattr(torch.ops.nvprims, prim), torch._ops.OpOverloadPacket)
+        and getattr(torch.ops.nvprims, prim).is_recomputable
+    }
+
+    fw_gm, bw_gm = min_cut_rematerialization_partition(
+        prim_gm,
+        joint_inputs,
+        recomputable_ops=recomputable_ops,
+        num_fwd_outputs=num_fwd_outputs,
+    )
+    # AOT Autograd might not use the partitioner, so we need to make sure that
+    # the graph is marked as already transformed to use nvFuser-compatible nodes
+    fw_gm._nvprim_transformed = True
+    bw_gm._nvprim_transformed = True
+    return fw_gm, bw_gm
+
+
+def create_nvprims_backend(*, executor):
+    return aot_autograd(
+        fw_compiler=partial(prims_executor, executor=executor),
+        bw_compiler=partial(prims_executor, executor=executor),
+        partition_fn=nvprims_fw_bw_partition_fn,
+    )
+
+
+aot_nvprims_nvfuser = create_nvprims_backend(executor="nvfuser")
+aot_nvprims_aten = create_nvprims_backend(executor="aten")
+
+# "nvprims" is a subset of PrimTorch primitives that are guaranteed to be
+# supported by nvFuser. This is the preferred backend for nvFuser+PrimTorch.
+register_backend(name="nvprims_nvfuser", compiler_fn=aot_nvprims_nvfuser)
+# This is useful for debugging. Can be removed later.
+register_debug_backend(name="nvprims_aten", compiler_fn=aot_nvprims_aten)
+
+
+# Use min cut rematerialization and TorchScript+nvFuser with AOT Autograd
+# aot_ts_nvfuser uses the memory efficient fusion algorithm from AOT Autograd.
+# It uses min cut rematerialization algorithm, uses nvFuser as the
+# compiler backend, and TorchScript as the frontend.
+aot_mem_efficient_fusion = aot_autograd(**mem_efficient_fusion_kwargs(use_decomps=True))
+aot_mem_efficient_fusion.backend_ctx_ctor = lambda: torch.jit.fuser("fuser2")
+register_backend(name="aot_ts_nvfuser", compiler_fn=aot_mem_efficient_fusion)
diff --git a/torch/_dynamo/backends/onnxrt.py b/torch/_dynamo/backends/onnxrt.py
new file mode 100644
index 000000000000..cd10d2610538
--- /dev/null
+++ b/torch/_dynamo/backends/onnxrt.py
@@ -0,0 +1,118 @@
+import importlib
+import os
+import tempfile
+
+import torch
+from .common import device_from_inputs, fake_tensor_unsupported
+from .registry import register_backend
+
+try:
+    import numpy as np
+
+    _np_dtype = {
+        torch.float16: np.float16,
+        torch.float32: np.float32,
+        torch.float64: np.float64,
+        torch.uint8: np.uint8,
+        torch.int8: np.int8,
+        torch.int16: np.int16,
+        torch.int32: np.int32,
+        torch.int64: np.longlong,
+        torch.bool: np.bool_,
+    }
+
+except ImportError:
+    _np_dtype = None
+
+
+def default_provider(device_type):
+    if "ONNXRT_PROVIDER" in os.environ:
+        return os.environ["ONNXRT_PROVIDER"]
+    return {
+        "cpu": "CPUExecutionProvider",
+        "cuda": "CUDAExecutionProvider",
+        # "TensorrtExecutionProvider" is another option
+    }[device_type]
+
+
+def has_onnxruntime():
+    try:
+        importlib.import_module("onnxruntime")
+        return True
+    except ImportError:
+        return False
+
+
+@register_backend
+@fake_tensor_unsupported
+def onnxrt(gm, example_inputs, *, filename=None, provider=None):
+    if filename is None:
+        with tempfile.NamedTemporaryFile(suffix=".onnx") as tmp:
+            return onnxrt(gm, example_inputs, filename=tmp.name)
+
+    import onnxruntime  # type: ignore[import]
+
+    assert _np_dtype, "requires numpy"
+
+    device_type = device_from_inputs(example_inputs).type
+    example_outputs = gm(*example_inputs)
+    output_spec = [
+        (o.shape, o.dtype, o.layout, o.device, o.requires_grad) for o in example_outputs
+    ]
+    input_names = [f"i{i}" for i in range(len(example_inputs))]
+    output_names = [f"o{x}" for x in range(len(example_outputs))]
+
+    torch.onnx.export(
+        torch.jit.script(gm),
+        example_inputs,
+        filename,
+        input_names=input_names,
+        output_names=output_names,
+    )
+    del example_inputs, example_outputs
+
+    if provider is None:
+        provider = default_provider(device_type)
+    assert provider in onnxruntime.get_available_providers()
+    session = onnxruntime.InferenceSession(filename, providers=[provider])
+
+    def _call(*initial_args):
+        binding = session.io_binding()
+        args = [a.contiguous() for a in initial_args]
+        for name, value in zip(input_names, args):
+            dev = value.device
+            binding.bind_input(
+                name,
+                dev.type,
+                dev.index or 0,
+                _np_dtype[value.dtype],
+                value.size(),
+                value.data_ptr(),
+            )
+        outputs = [
+            torch.empty(
+                shape,
+                dtype=dtype,
+                layout=layout,
+                device=device,
+                requires_grad=requires_grad,
+            )
+            for shape, dtype, layout, device, requires_grad in output_spec
+        ]
+
+        for name, value in zip(output_names, outputs):
+            dev = value.device
+            binding.bind_output(
+                name,
+                dev.type,
+                dev.index or 0,
+                _np_dtype[value.dtype],
+                value.size(),
+                value.data_ptr(),
+            )
+        session.run_with_iobinding(binding)
+        if device_type == "cpu":
+            binding.copy_outputs_to_cpu()
+        return outputs
+
+    return _call
diff --git a/torch/_dynamo/backends/registry.py b/torch/_dynamo/backends/registry.py
new file mode 100644
index 000000000000..99a2c719b6de
--- /dev/null
+++ b/torch/_dynamo/backends/registry.py
@@ -0,0 +1,109 @@
+import functools
+import sys
+from typing import Callable, Dict, List, Optional, Protocol, Sequence, Tuple
+
+import torch
+from torch import fx
+
+
+class CompiledFn(Protocol):
+    def __call__(self, *args: torch.Tensor) -> Tuple[torch.Tensor, ...]:
+        ...
+
+
+CompilerFn = Callable[[fx.GraphModule, List[torch.Tensor]], CompiledFn]
+
+_BACKENDS: Dict[str, CompilerFn] = dict()
+
+
+def register_backend(
+    compiler_fn: Optional[CompilerFn] = None,
+    name: Optional[str] = None,
+    tags: Sequence[str] = (),
+):
+    """
+    Decorator to add a given compiler to the registry to allow calling
+    `torch.compile` with string shorthand.  Note: for projects not
+    imported by default, it might be easier to pass a function directly
+    as a backend and not use a string.
+
+    Args:
+        compiler_fn: Callable taking a FX graph and fake tensor inputs
+        name: Optional name, defaults to `compiler_fn.__name__`
+        tags: Optional set of string tags to categorize backend with
+    """
+    if compiler_fn is None:
+        # @register_backend(name="") syntax
+        return functools.partial(register_backend, name=name, tags=tags)
+    assert callable(compiler_fn)
+    name = name or compiler_fn.__name__
+    assert name not in _BACKENDS, f"duplicate name: {name}"
+    _BACKENDS[name] = compiler_fn
+    compiler_fn._tags = tuple(tags)
+    return compiler_fn
+
+
+register_debug_backend = functools.partial(register_backend, tags=("debug",))
+register_experimental_backend = functools.partial(
+    register_backend, tags=("experimental",)
+)
+
+
+def lookup_backend(compiler_fn):
+    """Expand backend strings to functions"""
+    if isinstance(compiler_fn, str):
+        if compiler_fn not in _BACKENDS:
+            _lazy_import()
+        if compiler_fn not in _BACKENDS:
+            _lazy_import_entry_point(compiler_fn)
+        compiler_fn = _BACKENDS[compiler_fn]
+    return compiler_fn
+
+
+def list_backends(exclude_tags=("debug", "experimental")):
+    """
+    Return valid strings that can be passed to:
+
+        torch.compile(..., backend="name")
+    """
+    _lazy_import()
+    exclude_tags = set(exclude_tags or ())
+    return sorted(
+        [
+            name
+            for name, backend in _BACKENDS.items()
+            if not exclude_tags.intersection(backend._tags)
+        ]
+    )
+
+
+@functools.lru_cache(None)
+def _lazy_import():
+    from .. import backends
+    from ..utils import import_submodule
+
+    import_submodule(backends)
+
+    from ..debug_utils import dynamo_minifier_backend
+
+    assert dynamo_minifier_backend is not None
+
+
+@functools.lru_cache(None)
+def _lazy_import_entry_point(backend_name: str):
+    from importlib.metadata import entry_points
+
+    compiler_fn = None
+    group_name = "torch_dynamo_backends"
+    if sys.version_info < (3, 10):
+        backend_eps = entry_points()
+        eps = [ep for ep in backend_eps[group_name] if ep.name == backend_name]
+        if len(eps) > 0:
+            compiler_fn = eps[0].load()
+    else:
+        backend_eps = entry_points(group=group_name)
+        if backend_name in backend_eps.names:
+            compiler_fn = backend_eps[backend_name].load()
+
+    if compiler_fn is not None and backend_name not in list_backends(tuple()):
+        register_backend(compiler_fn=compiler_fn, name=backend_name)
diff --git a/torch/_dynamo/backends/tensorrt.py b/torch/_dynamo/backends/tensorrt.py
new file mode 100644
index 000000000000..493e21a9dfc5
--- /dev/null
+++ b/torch/_dynamo/backends/tensorrt.py
@@ -0,0 +1,12 @@
+# import torch  # type: ignore[import]
+# from .common import device_from_inputs, fake_tensor_unsupported  # type: ignore[import]
+# from .registry import register_backend  # type: ignore[import]
+
+"""
+Placeholder for TensorRT backend for dynamo via torch-tensorrt
+"""
+
+# @register_backend
+# def tensorrt(gm, example_inputs):
+#    import torch_tensorrt # type: ignore[import]
+#    pass
diff --git a/torch/_dynamo/backends/torchxla.py b/torch/_dynamo/backends/torchxla.py
new file mode 100644
index 000000000000..34545c8fe23c
--- /dev/null
+++ b/torch/_dynamo/backends/torchxla.py
@@ -0,0 +1,39 @@
+import logging
+
+from ..backends.common import aot_autograd
+from ..backends.registry import register_experimental_backend as register_backend
+
+log = logging.getLogger(__name__)
+
+
+@register_backend
+def torchxla_trivial(gm, fake_tensor_inputs):
+    return gm
+
+
+@register_backend
+def torchxla_trace_once(model, fake_tensor_inputs):
+    import torch_xla.core.dynamo_bridge as bridge  # type: ignore[import]
+
+    compiled_graph = None
+
+    def fwd(*args):
+        nonlocal model
+        nonlocal compiled_graph
+        if compiled_graph is None:
+            compiled_graph = bridge.extract_compiled_graph(model, args)
+            del model
+        return compiled_graph(*args)
+
+    return fwd
+
+
+aot_torchxla_trivial = aot_autograd(
+    fw_compiler=torchxla_trivial,
+)
+register_backend(name="aot_torchxla_trivial", compiler_fn=aot_torchxla_trivial)
+
+aot_torchxla_trace_once = aot_autograd(
+    fw_compiler=torchxla_trace_once,
+)
+register_backend(name="aot_torchxla_trace_once", compiler_fn=aot_torchxla_trace_once)
diff --git a/torch/_dynamo/backends/tvm.py b/torch/_dynamo/backends/tvm.py
new file mode 100644
index 000000000000..e63a62a75905
--- /dev/null
+++ b/torch/_dynamo/backends/tvm.py
@@ -0,0 +1,157 @@
+import functools
+import importlib
+import logging
+import os
+import tempfile
+
+import torch
+from .common import device_from_inputs, fake_tensor_unsupported
+
+from .registry import register_backend
+
+log = logging.getLogger(__name__)
+
+
+@register_backend
+@fake_tensor_unsupported
+def tvm(gm, example_inputs, *, scheduler=None, trials=20000):
+    import tvm  # type: ignore[import]
+    from tvm import relay  # type: ignore[import]
+    from tvm.contrib import graph_executor  # type: ignore[import]
+
+    jit_mod = torch.jit.trace(gm, example_inputs)
+    device = device_from_inputs(example_inputs)
+    shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
+    mod, params = relay.frontend.from_pytorch(jit_mod, shape_list)
+    if device.type == "cuda":
+        dev = tvm.cuda(device.index)
+        target = tvm.target.cuda()
+    else:
+        dev = tvm.cpu(0)
+        target = tvm.target.Target(llvm_target())
+
+    if scheduler is None:
+        scheduler = os.environ.get("TVM_SCHEDULER", None)
+
+    if scheduler == "auto_scheduler":
+        from tvm import auto_scheduler
+
+        log_file = tempfile.NamedTemporaryFile()
+
+        if not os.path.exists(log_file):
+            tasks, task_weights = auto_scheduler.extract_tasks(
+                mod["main"], params, target
+            )
+            for task in tasks:
+                print(task.compute_dag)
+            else:
+                print("No tasks")
+            if len(tasks) != 0:
+                tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
+                if not os.path.exists(log_file):
+                    assert trials > 0
+                    tune_option = auto_scheduler.TuningOptions(
+                        num_measure_trials=trials,
+                        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
+                        early_stopping=2000,
+                    )
+                    try:
+                        tuner.tune(tune_option)
+                    except Exception:
+                        if os.path.exists(log_file):
+                            os.unlink(log_file)
+                        raise
+
+        with auto_scheduler.ApplyHistoryBest(log_file):
+            with tvm.transform.PassContext(
+                opt_level=3, config={"relay.backend.use_auto_scheduler": True}
+            ):
+                lib = relay.build(mod, target=target, params=params)
+    elif scheduler == "meta_schedule":
+        from tvm import meta_schedule as ms
+
+        with tempfile.TemporaryDirectory() as work_dir:
+            if device.type != "cuda":
+                # meta_schedule needs num-cores to be specified
+                # here we use the maximum core count
+                target = tvm.target.Target(
+                    f"{llvm_target()} --num-cores {ms.utils.cpu_count(logical=False)}"
+                )
+            # TODO(shingjan): This could be replaced by tvm.contrib.torch.optimize_torch
+            # once USE_PT_TVMDSOOP is updated and turned on by default in TVM.
+            database = ms.relay_integration.tune_relay(
+                mod=mod,
+                target=target,
+                work_dir=work_dir,
+                max_trials_global=20000,
+                num_trials_per_iter=64,
+                params=params,
+                strategy="evolutionary",
+            )
+            lib = ms.relay_integration.compile_relay(
+                database=database,
+                mod=mod,
+                target=target,
+                params=params,
+            )
+    elif scheduler == "default" or not scheduler:
+        # no autotuning
+        with tvm.transform.PassContext(opt_level=10):
+            lib = relay.build(mod, target=target, params=params)
+    else:
+        raise NotImplementedError(
+            "This tuning option is invalid/not implemented for torchdynamo's TVM-related backend. "
+            "There are three available options: default, auto_scheduler and meta_schedule."
+        )
+    m = graph_executor.GraphModule(lib["default"](dev))
+
+    def to_torch_tensor(nd_tensor):
+        """A helper function to transfer a NDArray to torch.tensor."""
+        if nd_tensor.dtype == "bool":
+            # DLPack does not support boolean so it can't be handled by
+            # torch.utils.dlpack.from_pack. Workaround by going through
+            # numpy, although this brings additional data copy overhead.
+            return torch.from_numpy(nd_tensor.numpy())
+        return torch.utils.dlpack.from_dlpack(nd_tensor.to_dlpack())
+
+    def to_tvm_tensor(torch_tensor):
+        """A helper function to transfer a torch.tensor to NDArray."""
+        if torch_tensor.dtype == torch.bool:
+            # same reason as above, fallback to numpy conversion which
+            # could introduce data copy overhead
+            return tvm.nd.array(torch_tensor.cpu().numpy())
+        return tvm.nd.from_dlpack(torch_tensor)
+
+    def exec_tvm(*i_args):
+        args = [a.contiguous() for a in i_args]
+        for idx, arg in enumerate(args, 0):
+            if arg.dim() != 0:
+                if arg.requires_grad:
+                    arg = arg.detach()
+                m.set_input(
+                    f"inp_{idx}",
+                    to_tvm_tensor(arg),
+                )
+        m.run()
+        return [to_torch_tensor(m.get_output(i)) for i in range(m.get_num_outputs())]
+
+    return exec_tvm
+
+
+tvm_meta_schedule = functools.partial(tvm, scheduler="meta_schedule")
+tvm_auto_scheduler = functools.partial(tvm, scheduler="auto_scheduler")
+
+
+def has_tvm():
+    try:
+        importlib.import_module("tvm")
+        return True
+    except ImportError:
+        return False
+
+
+@functools.lru_cache(None)
+def llvm_target():
+    if "avx512" in open("/proc/cpuinfo").read():
+        return "llvm -mcpu=skylake-avx512"
+    return "llvm -mcpu=core-avx2"
diff --git a/torch/_dynamo/bytecode_analysis.py b/torch/_dynamo/bytecode_analysis.py
index 165bb77fc3c1..38700c214fe7 100644
--- a/torch/_dynamo/bytecode_analysis.py
+++ b/torch/_dynamo/bytecode_analysis.py
@@ -5,28 +5,22 @@
 
 TERMINAL_OPCODES = {
     dis.opmap["RETURN_VALUE"],
-    dis.opmap["JUMP_ABSOLUTE"],
     dis.opmap["JUMP_FORWARD"],
     dis.opmap["RAISE_VARARGS"],
     # TODO(jansel): double check exception handling
 }
 if sys.version_info >= (3, 9):
     TERMINAL_OPCODES.add(dis.opmap["RERAISE"])
+if sys.version_info >= (3, 11):
+    TERMINAL_OPCODES.add(dis.opmap["JUMP_BACKWARD"])
+else:
+    TERMINAL_OPCODES.add(dis.opmap["JUMP_ABSOLUTE"])
 JUMP_OPCODES = set(dis.hasjrel + dis.hasjabs)
+JUMP_OPNAMES = {dis.opname[opcode] for opcode in JUMP_OPCODES}
 HASLOCAL = set(dis.haslocal)
 HASFREE = set(dis.hasfree)
 
-if sys.version_info < (3, 8):
-
-    def stack_effect(opcode, arg, jump=None):
-        # jump= was added in python 3.8, we just ingore it here
-        if dis.opname[opcode] in ("NOP", "EXTENDED_ARG"):
-            # for some reason NOP isn't supported in python 3.7
-            return 0
-        return dis.stack_effect(opcode, arg)
-
-else:
-    stack_effect = dis.stack_effect
+stack_effect = dis.stack_effect
 
 
 def remove_dead_code(instructions):
@@ -187,11 +181,6 @@ def stacksize_analysis(instructions):
     low = min([x.low for x in stack_sizes.values()])
     high = max([x.high for x in stack_sizes.values()])
 
-    if sys.version_info < (3, 8) and not fixed_point.value:
-        # This is a rare issue in python 3.7 that still needs debugging
-        # see test/test_nops.py::NopTests::test3
-        return low + 32
-
     assert fixed_point.value, "failed to reach fixed point"
     assert low >= 0
     return high
diff --git a/torch/_dynamo/bytecode_transformation.py b/torch/_dynamo/bytecode_transformation.py
index 5355e3f41cdf..7e14c1971b4c 100644
--- a/torch/_dynamo/bytecode_transformation.py
+++ b/torch/_dynamo/bytecode_transformation.py
@@ -3,7 +3,7 @@
 import itertools
 import sys
 import types
-from typing import Any, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from .bytecode_analysis import (
     propagate_line_nums,
@@ -18,7 +18,7 @@ class Instruction:
 
     opcode: int
     opname: str
-    arg: int
+    arg: Optional[int]
     argval: Any
     offset: Optional[int] = None
     starts_line: Optional[int] = None
@@ -57,6 +57,106 @@ def create_instruction(name, arg=None, argval=_NotProvided, target=None):
     )
 
 
+# Python 3.11 remaps
+def create_jump_absolute(target):
+    inst = "JUMP_FORWARD" if sys.version_info >= (3, 11) else "JUMP_ABSOLUTE"
+    return create_instruction(inst, target=target)
+
+
+def create_load_global(name, arg, push_null):
+    """
+    `name` is the name of the global to be loaded.
+    `arg` is the index of `name` in the global name table.
+    `push_null` specifies whether or not a NULL should be pushed to the stack
+    before the global (Python 3.11+ only).
+
+    Python 3.11 changed the LOAD_GLOBAL instruction in that the first bit of
+    the arg specifies whether a NULL should be pushed to the stack before the
+    global. The remaining bits of arg contain the name index. See
+    `create_call_function` for why this NULL is needed.
+    """
+    if sys.version_info >= (3, 11):
+        arg = (arg << 1) + push_null
+    return create_instruction("LOAD_GLOBAL", arg, name)
+
+
+def create_dup_top():
+    if sys.version_info >= (3, 11):
+        return create_instruction("COPY", 1)
+    return create_instruction("DUP_TOP")
+
+
+def create_rot_n(n):
+    """
+    Returns a "simple" sequence of instructions that rotates TOS to the n-th
+    position in the stack. For Python < 3.11, returns a single ROT_*
+    instruction. If no such instruction exists, an error is raised and the
+    caller is expected to generate an equivalent sequence of instructions.
+    For Python >= 3.11, any rotation can be expressed as a simple sequence of
+    swaps.
+    """
+    if n <= 1:
+        # don't rotate
+        return []
+
+    if sys.version_info >= (3, 11):
+        # rotate can be expressed as a sequence of swap operations
+        # e.g. rotate 3 is equivalent to swap 3, swap 2
+        return [create_instruction("SWAP", i) for i in range(n, 1, -1)]
+
+    # ensure desired rotate function exists
+    if sys.version_info < (3, 8) and n >= 4:
+        raise AttributeError(f"rotate {n} not supported for Python < 3.8")
+    if sys.version_info < (3, 10) and n >= 5:
+        raise AttributeError(f"rotate {n} not supported for Python < 3.10")
+
+    if n <= 4:
+        return [create_instruction("ROT_" + ["TWO", "THREE", "FOUR"][n - 2])]
+    return [create_instruction("ROT_N", n)]
+
+
+def create_call_function(nargs, push_null):
+    """
+    Creates a sequence of instructions that makes a function call.
+
+    `push_null` is used in Python 3.11+ only. It is used in codegen when
+    a function call is intended to be made with the NULL + fn convention,
+    and we know that the NULL has not been pushed yet. We will push a
+    NULL and rotate it to the correct position immediately before making
+    the function call.
+    push_null should default to True unless you know you are calling a function
+    that you codegen'd with a null already pushed, for example,
+
+    create_instruction("LOAD_GLOBAL", 1, "math")  # pushes a null
+    create_instruction("LOAD_ATTR", argval="sqrt")
+    create_instruction("LOAD_CONST", argval=25)
+    create_call_function(1, False)
+    """
+    if sys.version_info >= (3, 11):
+        output = []
+        if push_null:
+            output.append(create_instruction("PUSH_NULL"))
+            output.extend(create_rot_n(nargs + 2))
+        output.append(create_instruction("PRECALL", nargs))
+        output.append(create_instruction("CALL", nargs))
+        return output
+    return [create_instruction("CALL_FUNCTION", nargs)]
+
+
+def create_call_method(nargs):
+    if sys.version_info >= (3, 11):
+        return [create_instruction("PRECALL", nargs), create_instruction("CALL", nargs)]
+    return [create_instruction("CALL_METHOD", nargs)]
+
+
+def cell_and_freevars_offset(code, i):
+    if sys.version_info >= (3, 11):
+        if isinstance(code, dict):
+            return i + code["co_nlocals"]
+        return i + code.co_nlocals
+    return i
+
+
 def lnotab_writer(lineno, byteno=0):
     """
     Used to create typing.CodeType.co_lnotab
@@ -114,7 +214,7 @@ def end(total_bytes):
     return linetable, update, end
 
 
-def assemble(instructions: List[dis.Instruction], firstlineno):
+def assemble(instructions: List[Instruction], firstlineno):
     """Do the opposite of dis.get_instructions()"""
     code = []
     if sys.version_info < (3, 10):
@@ -127,6 +227,9 @@ def assemble(instructions: List[dis.Instruction], firstlineno):
             update_lineno(inst.starts_line, len(code))
         arg = inst.arg or 0
         code.extend((inst.opcode, arg & 0xFF))
+        if sys.version_info >= (3, 11):
+            for _ in range(instruction_size(inst) // 2 - 1):
+                code.extend((0, 0))
 
     if sys.version_info >= (3, 10):
         end(len(code))
@@ -146,6 +249,22 @@ def virtualize_jumps(instructions):
                     break
 
 
+_REL_JUMPS = set(dis.hasjrel)
+
+
+def flip_jump_direction(instruction):
+    if sys.version_info < (3, 11):
+        raise RuntimeError("Cannot flip jump direction in Python < 3.11")
+    if "FORWARD" in instruction.opname:
+        instruction.opname = instruction.opname.replace("FORWARD", "BACKWARD")
+    elif "BACKWARD" in instruction.opname:
+        instruction.opname = instruction.opname.replace("BACKWARD", "FORWARD")
+    else:
+        raise AttributeError("Instruction is not a forward or backward jump")
+    instruction.opcode = dis.opmap[instruction.opname]
+    assert instruction.opcode in _REL_JUMPS
+
+
 def devirtualize_jumps(instructions):
     """Fill in args for virtualized jump target after instructions may have moved"""
     indexof = {id(inst): i for i, inst, in enumerate(instructions)}
@@ -167,17 +286,29 @@ def devirtualize_jumps(instructions):
             if inst.opcode in dis.hasjabs:
                 if sys.version_info < (3, 10):
                     inst.arg = target.offset
-                else:
-                    # arg is offset of the instruction line rather than the bytecode
-                    # for all jabs/jrel since python 3.10
+                elif sys.version_info < (3, 11):
+                    # `arg` is expected to be bytecode offset, whereas `offset` is byte offset.
+                    # Divide since bytecode is 2 bytes large.
                     inst.arg = int(target.offset / 2)
-            else:  # relative jump
-                if sys.version_info < (3, 10):
-                    inst.arg = target.offset - inst.offset - instruction_size(inst)
                 else:
-                    inst.arg = int(
-                        (target.offset - inst.offset - instruction_size(inst)) / 2
-                    )
+                    raise RuntimeError("Python 3.11+ should not have absolute jumps")
+            else:  # relative jump
+                # byte offset between target and next instruction
+                inst.arg = int(target.offset - inst.offset - instruction_size(inst))
+                if inst.arg < 0:
+                    if sys.version_info < (3, 11):
+                        raise RuntimeError("Got negative jump offset for Python < 3.11")
+                    inst.arg = -inst.arg
+                    # forward jumps become backward
+                    if "FORWARD" in inst.opname:
+                        flip_jump_direction(inst)
+                elif inst.arg > 0:
+                    # backward jumps become forward
+                    if sys.version_info >= (3, 11) and "BACKWARD" in inst.opname:
+                        flip_jump_direction(inst)
+                if sys.version_info >= (3, 10):
+                    # see bytecode size comment in the absolute jump case above
+                    inst.arg //= 2
             inst.argval = target.offset
             inst.argrepr = f"to {target.offset}"
 
@@ -197,31 +328,44 @@ def remove_load_call_method(instructions: List[Instruction]):
 
 
 def explicit_super(code: types.CodeType, instructions: List[Instruction]):
-    """convert super() with no args into explict arg form"""
+    """convert super() with no args into explicit arg form"""
     cell_and_free = (code.co_cellvars or tuple()) + (code.co_freevars or tuple())
     output = []
     for idx, inst in enumerate(instructions):
         output.append(inst)
         if inst.opname == "LOAD_GLOBAL" and inst.argval == "super":
             nexti = instructions[idx + 1]
-            if nexti.opname == "CALL_FUNCTION" and nexti.arg == 0:
+            if nexti.opname in ("CALL_FUNCTION", "PRECALL") and nexti.arg == 0:
                 assert "__class__" in cell_and_free
                 output.append(
                     create_instruction(
-                        "LOAD_DEREF", cell_and_free.index("__class__"), "__class__"
+                        "LOAD_DEREF",
+                        cell_and_freevars_offset(
+                            code, cell_and_free.index("__class__")
+                        ),
+                        "__class__",
                     )
                 )
                 first_var = code.co_varnames[0]
                 if first_var in cell_and_free:
                     output.append(
                         create_instruction(
-                            "LOAD_DEREF", cell_and_free.index(first_var), first_var
+                            "LOAD_DEREF",
+                            cell_and_freevars_offset(
+                                code, cell_and_free.index(first_var)
+                            ),
+                            first_var,
                         )
                     )
                 else:
                     output.append(create_instruction("LOAD_FAST", 0, first_var))
                 nexti.arg = 2
                 nexti.argval = 2
+                if nexti.opname == "PRECALL":
+                    # also update the following CALL instruction
+                    call_inst = instructions[idx + 2]
+                    call_inst.arg = 2
+                    call_inst.argval = 2
 
     instructions[:] = output
 
@@ -259,7 +403,26 @@ def maybe_pop_n(n):
     return added
 
 
+# from https://github.com/python/cpython/blob/v3.11.1/Include/internal/pycore_opcode.h#L41
+# TODO use the actual object instead, can interface from eval_frame.c
+_PYOPCODE_CACHES = {
+    "BINARY_SUBSCR": 4,
+    "STORE_SUBSCR": 1,
+    "UNPACK_SEQUENCE": 1,
+    "STORE_ATTR": 4,
+    "LOAD_ATTR": 4,
+    "COMPARE_OP": 2,
+    "LOAD_GLOBAL": 5,
+    "BINARY_OP": 1,
+    "LOAD_METHOD": 10,
+    "PRECALL": 1,
+    "CALL": 4,
+}
+
+
 def instruction_size(inst):
+    if sys.version_info >= (3, 11):
+        return 2 * (_PYOPCODE_CACHES.get(dis.opname[inst.opcode], 0) + 1)
     return 2
 
 
@@ -303,35 +466,61 @@ def fix_vars(instructions: List[Instruction], code_options):
     varnames = {name: idx for idx, name in enumerate(code_options["co_varnames"])}
     names = {name: idx for idx, name in enumerate(code_options["co_names"])}
     for i in range(len(instructions)):
+        if sys.version_info >= (3, 11) and instructions[i].opname == "LOAD_GLOBAL":
+            # LOAD_GLOBAL is in HAS_NAME, so instructions[i].arg will be overwritten.
+            # So we must compute push_null earlier.
+            assert instructions[i].arg is not None
+            shift = 1
+            push_null = instructions[i].arg % 2
+        else:
+            shift = 0
+            push_null = 0
+
         if instructions[i].opcode in HAS_LOCAL:
             instructions[i].arg = varnames[instructions[i].argval]
         elif instructions[i].opcode in HAS_NAME:
             instructions[i].arg = names[instructions[i].argval]
 
+        if instructions[i].arg is not None:
+            instructions[i].arg = (instructions[i].arg << shift) + push_null
+
 
 def transform_code_object(code, transformations, safe=False):
-    keys = [
-        "co_argcount",
-        "co_posonlyargcount",  # python 3.8+
-        "co_kwonlyargcount",
-        "co_nlocals",
-        "co_stacksize",
-        "co_flags",
-        "co_code",
-        "co_consts",
-        "co_names",
-        "co_varnames",
-        "co_filename",
-        "co_name",
-        "co_firstlineno",
-        "co_lnotab",  # changed to "co_linetable" if python 3.10+
-        "co_freevars",
-        "co_cellvars",
-    ]
-    if sys.version_info < (3, 8):
-        keys.pop(1)
+    # Python 3.11 changes to code keys are not fully documented.
+    # See https://github.com/python/cpython/blob/3.11/Objects/clinic/codeobject.c.h#L24
+    # for new format.
+    keys = ["co_argcount"]
+    keys.append("co_posonlyargcount")
+    keys.extend(
+        [
+            "co_kwonlyargcount",
+            "co_nlocals",
+            "co_stacksize",
+            "co_flags",
+            "co_code",
+            "co_consts",
+            "co_names",
+            "co_varnames",
+            "co_filename",
+            "co_name",
+        ]
+    )
+    if sys.version_info >= (3, 11):
+        keys.append("co_qualname")
+    keys.append("co_firstlineno")
     if sys.version_info >= (3, 10):
-        keys = list(map(lambda x: x.replace("co_lnotab", "co_linetable"), keys))
+        keys.append("co_linetable")
+    else:
+        keys.append("co_lnotab")
+    if sys.version_info >= (3, 11):
+        # not documented, but introduced in https://github.com/python/cpython/issues/84403
+        keys.append("co_exceptiontable")
+    keys.extend(
+        [
+            "co_freevars",
+            "co_cellvars",
+        ]
+    )
     code_options = {k: getattr(code, k) for k in keys}
     assert len(code_options["co_varnames"]) == code_options["co_nlocals"]
 
@@ -339,7 +528,12 @@ def transform_code_object(code, transformations, safe=False):
     propagate_line_nums(instructions)
 
     transformations(instructions, code_options)
+    return clean_and_assemble_instructions(instructions, keys, code_options)[1]
+
 
+def clean_and_assemble_instructions(
+    instructions: List[Instruction], keys: List[str], code_options: Dict[str, Any]
+) -> Tuple[List[Instruction], types.CodeType]:
     fix_vars(instructions, code_options)
 
     dirty = True
@@ -362,7 +556,10 @@ def transform_code_object(code, transformations, safe=False):
     assert set(keys) - {"co_posonlyargcount"} == set(code_options.keys()) - {
         "co_posonlyargcount"
     }
-    return types.CodeType(*[code_options[k] for k in keys])
+    if sys.version_info >= (3, 11):
+        # generated code doesn't contain exceptions, so leave exception table empty
+        code_options["co_exceptiontable"] = b""
+    return instructions, types.CodeType(*[code_options[k] for k in keys])
 
 
 def cleaned_instructions(code, safe=False):
@@ -371,7 +568,8 @@ def cleaned_instructions(code, safe=False):
     virtualize_jumps(instructions)
     strip_extended_args(instructions)
     if not safe:
-        remove_load_call_method(instructions)
+        if sys.version_info < (3, 11):
+            remove_load_call_method(instructions)
         explicit_super(code, instructions)
     return instructions
 
diff --git a/torch/_dynamo/codegen.py b/torch/_dynamo/codegen.py
index dd46ba097e1f..582983709a96 100644
--- a/torch/_dynamo/codegen.py
+++ b/torch/_dynamo/codegen.py
@@ -7,14 +7,22 @@
 
 import torch.nn
 
-from .bytecode_transformation import create_instruction, Instruction
+from .bytecode_transformation import (
+    cell_and_freevars_offset,
+    create_call_function,
+    create_dup_top,
+    create_instruction,
+    create_load_global,
+    create_rot_n,
+    Instruction,
+)
 from .exc import unimplemented
 from .source import AttrSource, Source
 from .utils import is_safe_constant, istype, rot_n_helper
 from .variables.base import VariableTracker
 from .variables.nn_module import NNModuleVariable
 from .variables.tensor import (
-    DynamicShapeVariable,
+    SymNodeVariable,
     TensorVariable,
     TensorWithTFOverrideVariable,
     UnspecializedPythonVariable,
@@ -31,7 +39,7 @@ def merge(self, other: VariableTracker):
         self.variable = self.variable.add_options(other)
 
 
-class PyCodegen(object):
+class PyCodegen:
     """
     Helper class uses for constructing Python bytecode
     """
@@ -55,6 +63,9 @@ def __init__(
         self.cell_and_freevars = self.tx.cell_and_freevars
         self.new_var = self.tx.output.new_var
 
+    def cell_and_freevars_offset(self, i):
+        return cell_and_freevars_offset(self.code_options, i)
+
     def graph_output_vars(self):
         return [x.variable for x in self.graph_outputs.values()]
 
@@ -72,7 +83,7 @@ def __call__(self, value, allow_cache=True):
         graph_outputs = self.graph_outputs
 
         if self.top_of_stack is value:
-            output.append(create_instruction("DUP_TOP"))
+            output.append(create_dup_top())
             return
 
         if allow_cache:
@@ -95,7 +106,7 @@ def __call__(self, value, allow_cache=True):
             value,
             (
                 TensorVariable,
-                DynamicShapeVariable,
+                SymNodeVariable,
                 TensorWithTFOverrideVariable,
                 UnspecializedPythonVariable,
             ),
@@ -119,10 +130,7 @@ def __call__(self, value, allow_cache=True):
 
             if isinstance(value, UnspecializedPythonVariable) and value.need_unwrap:
                 output.extend(
-                    [
-                        self.create_load_attr("item"),
-                        create_instruction("CALL_FUNCTION", 0),
-                    ]
+                    [self.create_load_attr("item")] + create_call_function(0, True)
                 )
         elif isinstance(value, NNModuleVariable):
             parts = value.module_key.split(".")
@@ -141,7 +149,7 @@ def __call__(self, value, allow_cache=True):
             except NotImplementedError:
                 unimplemented(f"reconstruct: {value}")
             if allow_cache and value in self.tempvars:
-                self._output.append(create_instruction("DUP_TOP"))
+                self._output.append(create_dup_top())
                 self.add_cache(value)
 
         self.top_of_stack = value
@@ -157,7 +165,7 @@ def foreach(self, items):
         for i in items:
             self(i)
 
-    def setup_globally_cached(self, name, value):
+    def setup_globally_cached(self, name, value, push_null):
         """Store value in a new global"""
         name = re.sub(r"[^a-zA-Z0-9_]+", "_", name)
         f_globals = self.tx.f_globals
@@ -165,7 +173,7 @@ def setup_globally_cached(self, name, value):
             assert id(f_globals[name]) == id(value)
         else:
             f_globals[name] = value
-        return [self.create_load_global(name, add=True)]
+        return [self.create_load_global(name, push_null, add=True)]
 
     def clear_tos(self):
         self.top_of_stack = None
@@ -186,7 +194,9 @@ def get_instructions(self):
     def create_load(self, name):
         if name in self.cell_and_freevars():
             return create_instruction(
-                "LOAD_DEREF", self.cell_and_freevars().index(name), name
+                "LOAD_DEREF",
+                self.cell_and_freevars_offset(self.cell_and_freevars().index(name)),
+                name,
             )
         assert name in self.code_options["co_varnames"], f"{name} missing"
         return create_instruction(
@@ -196,25 +206,29 @@ def create_load(self, name):
     def create_load_closure(self, name):
         assert name in self.cell_and_freevars()
         return create_instruction(
-            "LOAD_CLOSURE", self.cell_and_freevars().index(name), name
+            "LOAD_CLOSURE",
+            self.cell_and_freevars_offset(self.cell_and_freevars().index(name)),
+            name,
         )
 
     def create_store(self, name):
         if name in self.cell_and_freevars():
             return create_instruction(
-                "STORE_DEREF", self.cell_and_freevars().index(name), name
+                "STORE_DEREF",
+                self.cell_and_freevars_offset(self.cell_and_freevars().index(name)),
+                name,
             )
         assert name in self.code_options["co_varnames"]
         return create_instruction(
             "STORE_FAST", self.code_options["co_varnames"].index(name), name
         )
 
-    def create_load_global(self, name, add=False):
+    def create_load_global(self, name, push_null, add=False):
         if add:
             self.tx.output.update_co_names(name)
         assert name in self.code_options["co_names"], f"{name} not in co_names"
-        return create_instruction(
-            "LOAD_GLOBAL", self.code_options["co_names"].index(name), name
+        return create_load_global(
+            name, self.code_options["co_names"].index(name), push_null
         )
 
     def create_load_const(self, value):
@@ -252,31 +266,45 @@ def create_load_attr(self, name):
     def create_load_attrs(self, names):
         return [self.create_load_attr(name) for name in names.split(".")]
 
-    def load_function_name(self, fn_name, num_on_stack=0):
+    def load_function_name(self, fn_name, push_null, num_on_stack=0):
         """Load the global fn_name on the stack num_on_stack down"""
-        return [self.create_load_global(fn_name, add=True)] + self.rot_n(
-            num_on_stack + 1
+        output = []
+        if push_null and sys.version_info >= (3, 11):
+            output.extend(
+                [create_instruction("PUSH_NULL")] + self.rot_n(num_on_stack + 1)
+            )
+        output.extend(
+            [self.create_load_global(fn_name, False, add=True)]
+            + self.rot_n(num_on_stack + 1)
         )
+        return output
 
     def rot_n(self, n):
-        if n == 0 or n == 1:
-            return []
-        elif n == 2:
-            return [create_instruction("ROT_TWO")]
-        elif n == 3:
-            return [create_instruction("ROT_THREE")]
-        elif n == 4 and sys.version_info >= (3, 8):
-            return [create_instruction("ROT_FOUR")]
-        elif sys.version_info >= (3, 10):
-            return [create_instruction("ROT_N", n)]
-        else:
-            return [
-                create_instruction("BUILD_TUPLE", n),
-                self._create_load_const(rot_n_helper(n)),
-                create_instruction("ROT_TWO"),
-                create_instruction("CALL_FUNCTION_EX", 0),
-                create_instruction("UNPACK_SEQUENCE", n),
-            ]
+        try:
+            return create_rot_n(n)
+        except AttributeError:
+            # desired rotate bytecode doesn't exist, generate equivalent bytecode
+            return (
+                [
+                    create_instruction("BUILD_TUPLE", n),
+                    self._create_load_const(rot_n_helper(n)),
+                ]
+                + create_rot_n(2)
+                + [
+                    create_instruction("CALL_FUNCTION_EX", 0),
+                    create_instruction("UNPACK_SEQUENCE", n),
+                ]
+            )
+
+    def pop_null(self):
+        # POP_TOP doesn't work for null, so we pop nulls by pushing in a
+        # nop function, calling it (which consumes the null), and popping the result.
+        assert sys.version_info >= (3, 11)
+        return (
+            [self._create_load_const(lambda: None)]
+            + create_call_function(0, False)
+            + [create_instruction("POP_TOP")]
+        )
 
     def make_function_with_closure(
         self, fn_name: str, code: types.CodeType, num_on_stack=0
@@ -288,52 +316,51 @@ def make_function_with_closure(
             assert var in self.cell_and_freevars()
             output.append(
                 create_instruction(
-                    "LOAD_CLOSURE", self.cell_and_freevars().index(var), var
+                    "LOAD_CLOSURE",
+                    self.cell_and_freevars_offset(self.cell_and_freevars().index(var)),
+                    var,
                 )
             )
         output.append(create_instruction("BUILD_TUPLE", len(freevars)))
         output.append(self.create_load_const(code))
-        output.append(self.create_load_const(fn_name))
+        if sys.version_info < (3, 11):
+            output.append(self.create_load_const(fn_name))
         output.append(create_instruction("MAKE_FUNCTION", 0x08))
         output.extend(self.rot_n(num_on_stack + 1))
         self.clear_tos()
 
-    def create_load_python_module(self, mod):
+    def create_load_python_module(self, mod, push_null):
         """
         Generate a LOAD_GLOBAL instruction to fetch a given python module.
         """
         root_globals = self.tx.output.root_globals
         name = re.sub(r"^.*[.]", "", mod.__name__)
         if root_globals.get(name, None) is mod:
-            return self.create_load_global(name, add=True)
+            return self.create_load_global(name, push_null, add=True)
         mangled_name = f"___module_{name}_{id(mod)}"
         if mangled_name not in root_globals:
             self.tx.output.install_global(mangled_name, mod)
-        return self.create_load_global(mangled_name, add=True)
+        return self.create_load_global(mangled_name, push_null, add=True)
 
     def make_call_generated_code(self, fn_name: str) -> List[Instruction]:
         """Call the generated code function stored in fn_name"""
-        self.extend_output(self.load_function_name(fn_name))
+        self.extend_output(self.load_function_name(fn_name, True))
 
         graphargs = self.tx.output.graphargs
         for arg in graphargs:
             if arg.is_unspecialized:
                 self.extend_output(
                     [
-                        self.create_load_python_module(torch),
+                        self.create_load_python_module(torch, True),
                         self.create_load_attr("tensor"),
                     ]
                 )
                 self.extend_output(arg.load(self))
-                self.extend_output(
-                    [
-                        create_instruction("CALL_FUNCTION", 1),
-                    ]
-                )
+                self.extend_output(create_call_function(1, False))
             else:
                 self.extend_output(arg.load(self))
 
-        self.append_output(create_instruction("CALL_FUNCTION", len(graphargs)))
+        self.extend_output(create_call_function(len(graphargs), False))
 
     def load_import_from(self, module_name, object_name):
         self.extend_output(
@@ -343,7 +370,18 @@ def load_import_from(self, module_name, object_name):
         )
 
     def create_begin_finally(self):
-        if sys.version_info < (3, 8):
-            return self.create_load_const(None)
-        else:
-            return create_instruction("BEGIN_FINALLY")
+        return create_instruction("BEGIN_FINALLY")
+
+    def create_call_function_kw(self, nargs, kw_names, push_null):
+        if sys.version_info >= (3, 11):
+            output = create_call_function(nargs, push_null)
+            assert output[-2].opname == "PRECALL"
+            kw_names_inst = create_instruction(
+                "KW_NAMES", self.get_const_index(self.code_options, kw_names)
+            )
+            output.insert(-2, kw_names_inst)
+            return output
+        return [
+            self.create_load_const(kw_names),
+            create_instruction("CALL_FUNCTION_KW", nargs),
+        ]
diff --git a/torch/_dynamo/comptime.py b/torch/_dynamo/comptime.py
index fca14000de19..e449d8f878f1 100644
--- a/torch/_dynamo/comptime.py
+++ b/torch/_dynamo/comptime.py
@@ -7,6 +7,9 @@
 
 import dis
 import traceback
+from typing import Optional, Union
+
+import torch
 
 from .exc import unimplemented
 
@@ -57,6 +60,13 @@ def as_fake(self):
         """
         return self.__variable.as_proxy().node.meta["example_value"]
 
+    def size(self, dim: Optional[int] = None) -> Union[int, torch.SymInt]:
+        """
+        Returns the size of the tensor (if dim is None) or the size
+        at the dimension dim.  The returned size may be a SymInt.
+        """
+        return self.as_fake().size(dim)
+
     def python_type(self):
         """
         Returns what type(v) would have returned for the variable
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index bd62c1e49397..ed54cc00b540 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -1,29 +1,21 @@
-import logging
 import os
 import sys
+import tempfile
 from os.path import abspath, dirname
-from types import ModuleType
 
 import torch
-
 from . import external_utils
 
-try:
-    import torch._prims
-    import torch._refs
-
-    HAS_REFS_PRIMS = True
-except ImportError:
-    HAS_REFS_PRIMS = False
-
+from .logging import get_loggers_level, set_loggers_level
 
 # log level (levels print what it says + all levels listed below it)
 # logging.DEBUG print full traces <-- lowest level + print tracing of every instruction
 # logging.INFO print the steps that dynamo is running and optionally, compiled functions + graphs
 # logging.WARN print warnings (including graph breaks)
 # logging.ERROR print exceptions (and what user code was being processed when it occurred)
-# NOTE: changing log_level will automatically update the levels of all torchdynamo loggers
-log_level = logging.WARNING
+log_level = property(
+    lambda _: get_loggers_level(), lambda _, lvl: set_loggers_level(lvl)
+)
 
 # log compiled function + graphs at level INFO
 output_code = False
@@ -68,12 +60,15 @@
 # don't specialize on shapes and strides and put shape ops in graph
 dynamic_shapes = os.environ.get("TORCHDYNAMO_DYNAMIC_SHAPES") == "1"
 
+# This is a temporarily flag, which changes the behavior of dynamic_shapes=True.
+# When assume_static_by_default is True, we only allocate symbols for shapes marked dynamic via mark_dynamic.
+# NOTE - this flag can be removed once we can run dynamic_shapes=False w/ the mark_dynamic API
+# see [Note - on the state of mark_dynamic]
+assume_static_by_default = False
+
 # Set this to False to assume nn.Modules() contents are immutable (similar assumption as freezing)
 guard_nn_modules = False
 
-# run FX normalization passes in optimizer
-normalize_ir = False
-
 # This feature doesn't really work.  We offer this flag for experimental
 # purposes / if you want to help us build out support.
 #
@@ -118,13 +113,10 @@
     torch.distributions,
     torch.testing,
     torch.ao.nn,
+    torch._refs,
+    torch._prims,
+    torch._decomp,
 }
-if HAS_REFS_PRIMS:
-    skipfiles_inline_module_allowlist |= {
-        torch._refs,
-        torch._prims,
-        torch._decomp,
-    }
 
 # If a string representing a PyTorch module is in this ignorelist,
 # the `allowed_functions.is_allowed` function will not consider it
@@ -146,14 +138,36 @@
 # Compiler compilation debug info
 # 1: Dumps the original graph out to repro.py if compilation fails
 # 2: Dumps a minifier_launcher.py if compilation fails.
-# 3: Always dumps a minifier_laucher.py. Good for segfaults.
+# 3: Always dumps a minifier_launcher.py. Good for segfaults.
 # 4: Dumps a minifier_launcher.py if the accuracy fails.
 repro_level = int(os.environ.get("TORCHDYNAMO_REPRO_LEVEL", 2))
 
+# By default, we try to detect accuracy failure by running both forward
+# and backward of a torchdynamo produced graph (if you are using repro_after
+# 'dynamo').  This setting forces us to only test the forward graph and
+# not the backward graph.  This can be helpful if you're trying to debug
+# an inference only problem, but the minifier seems to be choking on the
+# backwards step
+# TODO: Detect this situation automatically so the user doesn't need
+# to manually configure this
+repro_forward_only = os.environ.get("TORCHDYNAMO_REPRO_FORWARD_ONLY") == "1"
+
+# The tolerance we should use when testing if a compiled graph
+# has diverged so that we should treat it as an accuracy failure
+repro_tolerance = 1e-3
+
 # Not all backends support scalars. Some calls on torch.Tensor (like .item()) return a scalar type.
 # When this flag is set to False, we introduce a graph break instead of capturing.
+# This requires dynamic_shapes to be True.
 capture_scalar_outputs = False
 
+# Not all backends support operators that have dynamic output shape (e.g.,
+# nonzero, unique).  When this flag is set to False, we introduce a graph
+# break instead of capturing.  This requires dynamic_shapes to be True.
+# If you set this to True, you probably also want capture_scalar_outputs
+# (these are separated for historical reasons).
+capture_dynamic_output_shape_ops = False
+
 # Should almost always be true in prod. This relaxes the requirement that cond's true_fn and
 # false_fn produces code with identical guards.
 enforce_cond_guards_match = True
@@ -171,11 +185,8 @@
 # If True, raise when aot autograd is unsafe to use
 raise_on_unsafe_aot_autograd = False
 
-# How to import torchdynamo, either torchdynamo or torch._dynamo
-dynamo_import = __name__.replace(".config", "")
-
-# How to import torchinductor, either torchinductor or torch.inductor
-inductor_import = dynamo_import.replace("dynamo", "inductor")
+# Throw an error if backend changes without reset
+raise_on_backend_change = False
 
 # If true, error with a better message if we symbolically trace over a
 # dynamo-optimized function. If false, silently suppress dynamo.
@@ -185,36 +196,34 @@
 allow_rnn = False
 
 # root folder of the project
-if "torch." in dynamo_import:
-    base_dir = dirname(dirname(dirname(abspath(__file__))))
+base_dir = dirname(dirname(dirname(abspath(__file__))))
+
+
+def is_fbcode():
+    return not hasattr(torch.version, "git_version")
+
+
+if is_fbcode():
+    debug_dir_root = os.path.join(tempfile.gettempdir(), "torch_compile_debug")
 else:
-    base_dir = dirname(dirname(abspath(__file__)))
+    debug_dir_root = os.path.join(os.getcwd(), "torch_compile_debug")
 
-debug_dir_root = os.path.join(os.getcwd(), "torch_compile_debug")
 
 # this is to resolve a import problem in fbcode, we will be deleting
 # this very shortly
 DO_NOT_USE_legacy_non_fake_example_inputs = False
 
 
-class _AccessLimitingConfig(ModuleType):
-    def __setattr__(self, name, value):
-        if name not in _allowed_config_names:
-            raise AttributeError(f"{__name__}.{name} does not exist")
-        # automatically set logger level whenever config.log_level is modified
-        if name == "log_level":
-            from .logging import set_loggers_level
-
-            set_loggers_level(value)
-        return object.__setattr__(self, name, value)
-
+_save_config_ignore = {
+    "repro_after",
+    "repro_level",
+    # workaround: "cannot pickle PyCapsule"
+    "constant_functions",
+    # workaround: "cannot pickle module"
+    "skipfiles_inline_module_allowlist",
+}
 
-_allowed_config_names = {*globals().keys()}
-sys.modules[__name__].__class__ = _AccessLimitingConfig
 
-from .config_utils import get_config_serialization_fns
+from .config_utils import install_config_module
 
-save_config, load_config = get_config_serialization_fns(
-    sys.modules[__name__],
-    ignore_set={"repro_after", "repro_level"},
-)
+install_config_module(sys.modules[__name__])
diff --git a/torch/_dynamo/config_utils.py b/torch/_dynamo/config_utils.py
index 952fee9ba26b..7b607952516b 100644
--- a/torch/_dynamo/config_utils.py
+++ b/torch/_dynamo/config_utils.py
@@ -1,44 +1,204 @@
-import inspect
+import contextlib
+
 import pickle
+import unittest
+from types import FunctionType, ModuleType
+from typing import Any, Dict, Set
+from unittest import mock
+
+# Types saved/loaded in configs
+CONFIG_TYPES = (int, float, bool, type(None), str, list, set, tuple, dict)
+
 
+def install_config_module(module):
+    """
+    Converts a module-level config into a `ConfigModule()`
+    """
 
-# Construct functions that save/load the state of the config module `module`.
-# The config settings are expected to either be module-level globals or
-# class variables.
-# `ignore_set` is a set of names of configurations to ignore. e.g. if you
-# want to ignore config.x and config.y.z in your config module, then
-# `ignore_set` should be {"x", "y.z"}.
-def get_config_serialization_fns(module, ignore_set=None):
-    def _save(obj, name_prefix):
-        saved_state = {}
-        for key, val in obj.__dict__.items():
-            if ignore_set is not None and name_prefix + key in ignore_set:
+    class ConfigModuleInstance(ConfigModule):
+        _bypass_keys = set()
+
+    def visit(source, dest, prefix):
+        """Walk the module structure and move everything to module._config"""
+        for key, value in list(source.__dict__.items()):
+            if key.startswith("__") or isinstance(value, (ModuleType, FunctionType)):
                 continue
-            try:
-                pickle.dumps(val)
-            except Exception:
-                pass
+
+            name = f"{prefix}{key}"
+            if isinstance(value, property) and dest is module:
+                # make @property work at the module level
+                delattr(module, key)
+                setattr(ConfigModuleInstance, key, value)
+                ConfigModuleInstance._bypass_keys.add(key)
+            elif isinstance(value, CONFIG_TYPES):
+                config[name] = value
+                if dest is module:
+                    delattr(module, key)
+            elif isinstance(value, type):
+                assert value.__module__ == module.__name__
+                # a subconfig with `class Blah:` syntax
+                proxy = SubConfigProxy(module, f"{name}.")
+                visit(value, proxy, f"{name}.")
+                setattr(dest, key, proxy)
             else:
-                saved_state[key] = (
-                    _save(val, name_prefix + key + ".") if inspect.isclass(val) else val
-                )
-        return saved_state
-
-    def save_config():
-        return pickle.dumps(_save(module, ""))
-
-    def _load(obj, data):
-        for key, val in data.items():
-            attr = getattr(obj, key, None)
-            if attr is not None and inspect.isclass(attr):
-                _load(attr, val)
+                raise AssertionError(f"Unhandled config {key}={value} ({type(value)})")
+
+    config = dict()
+    visit(module, module, "")
+    module._config = config
+    module._allowed_keys = set(config.keys())
+    module.__class__ = ConfigModuleInstance
+
+
+class ConfigModule(ModuleType):
+    _config: Dict[str, Any]
+    _allowed_keys: Set[str]
+    _bypass_keys: Set[str]
+
+    def __init__(self):
+        raise NotImplementedError(
+            f"use {__name__}.install_config_module(sys.modules[__name__])"
+        )
+
+    def __setattr__(self, name, value):
+        if name in self._bypass_keys:
+            super().__setattr__(name, value)
+        elif name not in self._allowed_keys:
+            raise AttributeError(f"{self.__name__}.{name} does not exist")
+        else:
+            self._config[name] = value
+
+    def __getattr__(self, name):
+        try:
+            return self._config[name]
+        except KeyError:
+            # make hasattr() work properly
+            raise AttributeError(f"{self.__name__}.{name} does not exist")
+
+    def __delattr__(self, name):
+        # must support delete because unittest.mock.patch deletes
+        # then recreate things
+        del self._config[name]
+
+    def save_config(self):
+        """Convert config to a pickled blob"""
+        config = dict(self._config)
+        for key in config.get("_save_config_ignore", ()):
+            config.pop(key)
+        return pickle.dumps(config, protocol=2)
+
+    def load_config(self, data):
+        """Restore from a prior call to save_config()"""
+        self.to_dict().update(pickle.loads(data))
+
+    def to_dict(self):
+        return self._config
+
+    def patch(self, arg1=None, arg2=None, **kwargs):
+        """
+        Decorator and/or context manager to make temporary changes to a config.
+
+        As a decorator:
+
+            @config.patch("name", val)
+            @config.patch(name1=val1, name2=val2):
+            @config.patch({"name1": val1, "name2", val2})
+            def foo(...):
+                ...
+
+        As a context manager:
+
+            with config.patch("name", val):
+                ...
+        """
+        if arg1 is not None:
+            if arg2 is not None:
+                # patch("key", True) syntax
+                changes = {arg1: arg2}
             else:
-                try:
-                    setattr(obj, key, val)
-                except Exception:
-                    pass
+                # patch({"key": True}) syntax
+                changes = arg1
+            assert not kwargs
+        else:
+            # patch(key=True) syntax
+            changes = kwargs
+            assert arg2 is None
+        assert isinstance(changes, dict), f"expected `dict` got {type(changes)}"
+        prior = {}
+        config = self
+
+        class ConfigPatch(ContextDecorator):
+            def __enter__(self):
+                assert not prior
+                for key in changes.keys():
+                    # KeyError on invalid entry
+                    prior[key] = config._config[key]
+                config._config.update(changes)
+
+            def __exit__(self, exc_type, exc_val, exc_tb):
+                config._config.update(prior)
+                prior.clear()
+
+        return ConfigPatch()
+
+
+class ContextDecorator(contextlib.ContextDecorator):
+    """
+    Same as contextlib.ContextDecorator, but with support for
+    `unittest.TestCase`
+    """
+
+    def __call__(self, func):
+        if isinstance(func, type) and issubclass(func, unittest.TestCase):
+
+            class _TestCase(func):
+                @classmethod
+                def setUpClass(cls):
+                    self.__enter__()
+                    try:
+                        super().setUpClass()
+                    except Exception:
+                        self.__exit__(None, None, None)
+                        raise
+
+                @classmethod
+                def tearDownClass(cls):
+                    try:
+                        super().tearDownClass()
+                    finally:
+                        self.__exit__(None, None, None)
+
+            _TestCase.__name__ = func.__name__
+            return _TestCase
+
+        return super().__call__(func)
+
+
+class SubConfigProxy:
+    """
+    Shim to redirect to main config.
+    `config.triton.cudagraphs` maps to _config["triton.cudagraphs"]
+    """
+
+    def __init__(self, config, prefix):
+        # `super().__setattr__` to bypass custom `__setattr__`
+        super().__setattr__("_config", config)
+        super().__setattr__("_prefix", prefix)
+
+    def __setattr__(self, name, value):
+        return self._config.__setattr__(self._prefix + name, value)
+
+    def __getattr__(self, name):
+        return self._config.__getattr__(self._prefix + name)
+
+    def __delattr__(self, name):
+        return self._config.__delattr__(self._prefix + name)
 
-    def load_config(data):
-        _load(module, pickle.loads(data))
 
-    return save_config, load_config
+def patch_object(obj, name, value):
+    """
+    Workaround `mock.patch.object` issue with ConfigModule
+    """
+    if isinstance(obj, ConfigModule):
+        return obj.patch(name, value)
+    return mock.patch.object(obj, name, value)
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index fa4fb2d2a9fb..76ee5bb34590 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -11,6 +11,7 @@
 
 from . import config, exc
 from .allowed_functions import is_allowed
+from .backends.registry import CompilerFn
 from .bytecode_analysis import remove_dead_code, remove_pointless_jumps
 from .bytecode_transformation import is_generator, transform_code_object
 from .eval_frame import always_optimize_code_objects, skip_code, TorchPatcher
@@ -25,7 +26,7 @@
 )
 from .guards import CheckFunctionManager, GuardedCode
 from .hooks import Hooks
-from .output_graph import CompilerFn, OutputGraph
+from .output_graph import OutputGraph
 from .replay_record import ExecutionRecord
 from .symbolic_convert import InstructionTranslator
 from .utils import (
@@ -245,7 +246,7 @@ def format_guard_failures(code):
 
             assert code in guard_failures, "TODO(whc) any other recompile reasons?"
             log.warning(
-                f"{config.dynamo_import} hit config.cache_size_limit ({config.cache_size_limit})\n"
+                f"torch._dynamo hit config.cache_size_limit ({config.cache_size_limit})\n"
                 + f"   function: {format_func_info(code)}\n"
                 + f"   reasons:  {format_guard_failures(code)}\n"
                 + f"to diagnose recompilation issues, see {troubleshooting_url}."
@@ -327,9 +328,9 @@ def transform(instructions, code_options):
                 log.debug("Restarting analysis ...")
                 if attempt > 100:
                     unimplemented("100+ RestartAnalysis() calls")
-            except exc.SkipFrame:
+            except exc.SkipFrame as e:
                 log.debug(
-                    f"Skipping frame {code.co_name} \
+                    f"Skipping frame {e} {code.co_name} \
                     {code.co_filename} {code.co_firstlineno}"
                 )
                 if one_graph:
@@ -417,7 +418,7 @@ def _convert_frame(frame: types.FrameType, cache_size: int, hooks: Hooks):
 
 # TODO mlazos: add support for same args, or record them
 def replay(filename):
-    from .optimizations.backends import eager
+    from .backends.debugging import eager
 
     original_replay_val = config.replay_record_enabled
     config.replay_record_enabled = False
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index d4e0c7850793..a311178b30bc 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -16,13 +16,13 @@
 from torch._prims_common import is_float_dtype
 
 from . import config
-from .optimizations.backends import register_backend
+from .backends.registry import lookup_backend, register_debug_backend
 from .utils import clone_inputs, get_debug_dir
 
 log = logging.getLogger(__name__)
 
 
-inductor_config = import_module(f"{config.inductor_import}.config")
+inductor_config = import_module("torch._inductor.config")
 use_buck = inductor_config.is_fbcode()
 
 
@@ -220,14 +220,17 @@ def _cuda_system_info_comment():
 
 
 def generate_config_string():
+    import torch._functorch.config
     import torch._inductor.config
 
     return textwrap.dedent(
         f"""\
-import {config.dynamo_import}.config
-import {config.inductor_import}.config
-{config.dynamo_import}.config.load_config({repr(torch._dynamo.config.save_config())})
-{config.inductor_import}.config.load_config({repr(torch._inductor.config.save_config())})
+import torch._dynamo.config
+import torch._inductor.config
+import torch._functorch.config
+torch._dynamo.config.load_config({repr(torch._dynamo.config.save_config())})
+torch._inductor.config.load_config({repr(torch._inductor.config.save_config())})
+torch._functorch.config.load_config({repr(torch._functorch.config.save_config())})
         """
     )
 
@@ -241,7 +244,7 @@ def generate_compiler_repro_string(gm, args):
 import torch
 from torch import tensor, device
 import torch.fx as fx
-from {config.dynamo_import}.testing import rand_strided
+from torch._dynamo.testing import rand_strided
 from math import inf
 from torch.fx.experimental.proxy_tensor import make_fx
 
@@ -265,13 +268,17 @@ def generate_compiler_repro_string(gm, args):
     model_str += (
         "args = [rand_strided(sh, st, dt, dev) for (sh, st, dt, dev) in args]\n"
     )
-    model_str += "mod = make_fx(Repro())(*args)\n"
+    # TODO: fake may be better for performance here
+    tracing_mode = "real"
+    if config.dynamic_shapes:
+        tracing_mode = "symbolic"
+    model_str += f"mod = make_fx(Repro(), tracing_mode={repr(tracing_mode)})(*args)\n"
     return model_str
 
 
-INDUCTOR_IMPORT = f"""
-from {config.inductor_import}.compile_fx import compile_fx_inner
-from {config.dynamo_import}.debug_utils import same_two_models
+INDUCTOR_IMPORT = """
+from torch._inductor.compile_fx import compile_fx_inner
+from torch._dynamo.debug_utils import same_two_models
 """
 
 COMPILER_REPRO_OPTIONS = {
@@ -307,12 +314,12 @@ def dump_compiler_graph_state(gm, args, compiler_name):
 def save_graph_repro(fd, gm, args, compiler_name):
     sync_line = ""
     for arg in args:
-        if arg.is_cuda:
+        if isinstance(arg, torch.Tensor) and arg.is_cuda:
             sync_line = "torch.cuda.synchronize() # Ensures that segfaults are surfaced"
             break
 
     if "inductor" in compiler_name:
-        fd.write(f"import {config.inductor_import}.overrides\n")
+        fd.write("import torch._inductor.overrides\n")
     fd.write(generate_compiler_repro_string(gm, args))
     fd.write(COMPILER_REPRO_OPTIONS[compiler_name][0])
     if "_accuracy" in compiler_name:
@@ -492,19 +499,21 @@ class AccuracyError(Exception):
     pass
 
 
-def wrap_compiler_debug(compiler_fn, compiler_name: str):
+def wrap_compiler_debug(unconfigured_compiler_fn, compiler_name: str):
     """
     Minifier for Fx Graph modules after Aot Autograd has finished. We wrap both
     forward and backward call separately with the backend compiler_fn - like
     inductor or nvfuser. Intercepting after Aot Autograd presents neat
-    abstration, where all the params are lifted as graph inputs, making it easy
+    abstraction, where all the params are lifted as graph inputs, making it easy
     to save the graph as a string.
     """
 
-    @functools.wraps(compiler_fn)
+    @functools.wraps(unconfigured_compiler_fn)
     def debug_wrapper(gm, example_inputs, **kwargs):
         from torch._subclasses import FakeTensorMode
 
+        compiler_fn = functools.partial(unconfigured_compiler_fn, **kwargs)
+
         orig_graph = copy.deepcopy(gm.graph)
         assert config.repro_after in ("dynamo", "aot", None)
         inner_compiled_fn = None
@@ -513,9 +522,9 @@ def deferred_for_real_inputs(real_inputs):
             """
             Aot Autograd fw_compiler and bw_compiler can have fake tensors. So,
             example_inputs can be fake tensors. We can call compiler_fn (which is
-            inductor or nvfuser) with fake tensors but the actualy compiled_fn
+            inductor or nvfuser) with fake tensors but the actually compiled_fn
             should be called with real tensors. Therefore, the actual invocation
-            is deffered.
+            is deferred.
             """
             # Avoid re-compiling when we call the compiled function twice. This happens
             # when we run the model inference or training in a for loop like here
@@ -538,7 +547,7 @@ def deferred_for_real_inputs(real_inputs):
                         "Accuracy minification is supported for inductor only"
                     )
                 if inner_compiled_fn is None:
-                    inner_compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
+                    inner_compiled_fn = compiler_fn(gm, example_inputs)
                 if backend_aot_accuracy_fails(gm, real_inputs, compiler_fn):
                     log.warning("Accuracy failed for the AOT Autograd graph")
                     dump_compiler_graph_state(
@@ -560,7 +569,7 @@ def deferred_for_real_inputs(real_inputs):
                     # Call the compiler_fn - which is either aot_autograd or inductor
                     # with fake inputs
                     if inner_compiled_fn is None:
-                        inner_compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
+                        inner_compiled_fn = compiler_fn(gm, example_inputs)
                     # Call the compiled function with real inputs
                     return inner_compiled_fn(real_inputs)
                 except Exception as e:
@@ -583,7 +592,7 @@ def deferred_for_real_inputs(real_inputs):
             compiled_fn = deferred_for_real_inputs
             compiled_fn._boxed_call = True
         else:
-            compiled_fn = compiler_fn(gm, example_inputs, **kwargs)
+            compiled_fn = compiler_fn(gm, example_inputs)
 
         return compiled_fn
 
@@ -665,16 +674,16 @@ def same_two_models(gm, opt_gm, example_inputs, only_fwd=False):
     except Exception as e:
         # This means that the the minified graph is bad/exposes a different problem.
         # As we are checking accuracy here, lets log the exception and return True.
-        log.warning(
+        log.exception(
             (
-                "While minifying the program in accuracy minification mode,"
+                "While minifying the program in accuracy minification mode, "
                 "ran into a runtime exception which is likely an unrelated issue."
                 " Skipping this graph."
             )
         )
         return True
 
-    passing = same(ref, res, fp64_ref, tol=0.001, equal_nan=True)
+    passing = same(ref, res, fp64_ref, tol=config.repro_tolerance, equal_nan=True)
     return passing
 
 
@@ -751,10 +760,10 @@ class AccuracyError(Exception):
 import torch
 from torch import tensor, device
 import torch.fx as fx
-import {config.dynamo_import}
-from {config.dynamo_import}.testing import rand_strided
-from {config.dynamo_import}.debug_utils import run_fwd_maybe_bwd
-from {config.dynamo_import}.debug_utils import same_two_models
+import torch._dynamo
+from torch._dynamo.testing import rand_strided
+from torch._dynamo.debug_utils import run_fwd_maybe_bwd
+from torch._dynamo.debug_utils import same_two_models
 
 {generate_config_string()}
 
@@ -767,7 +776,7 @@ class AccuracyError(Exception):
 {model_str}
 
 mod = Repro()
-opt_mod = {config.dynamo_import}.optimize("{compiler_name}")(mod)
+opt_mod = torch._dynamo.optimize("{compiler_name}")(mod)
 
 {run_code}
         """
@@ -875,9 +884,9 @@ def backend_accuracy_fails(gm, example_inputs, compiler_fn, only_fwd=False):
     except Exception as e:
         # This means that the the minified graph is bad/exposes a different problem.
         # As we are checking accuracy here, lets log the exception and return False.
-        log.warning(
+        log.exception(
             (
-                "While minifying the program in accuracy minification mode,"
+                "While minifying the program in accuracy minification mode, "
                 "ran into a runtime exception which is likely an unrelated issue."
                 " Skipping this graph"
             )
@@ -948,10 +957,10 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
 from torch import tensor, device
 import torch.fx as fx
 import functools
-import {config.dynamo_import}
-from {config.dynamo_import}.debug_utils import run_fwd_maybe_bwd
-from {config.dynamo_import}.optimizations.backends import BACKENDS
-from {config.dynamo_import}.testing import rand_strided
+import torch._dynamo
+from torch._dynamo.debug_utils import run_fwd_maybe_bwd
+from torch._dynamo.backends.registry import lookup_backend
+from torch._dynamo.testing import rand_strided
 
 {generate_config_string()}
 
@@ -966,13 +975,13 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
 
 # Setup debug minifier compiler
 torch._dynamo.debug_utils.MINIFIER_SPAWNED = True
-compiler_fn = BACKENDS["{minifier_backend}"]
+compiler_fn = lookup_backend("{minifier_backend}")
 {custom_compiler_error}
 dynamo_minifier_backend = functools.partial(
     compiler_fn,
     compiler_name="{compiler_name}",
 )
-opt_mod = {config.dynamo_import}.optimize(dynamo_minifier_backend)(mod)
+opt_mod = torch._dynamo.optimize(dynamo_minifier_backend)(mod)
 
 with torch.cuda.amp.autocast(enabled={torch.is_autocast_enabled()}):
     opt_mod(*args)
@@ -981,7 +990,7 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
     helper_for_dump_minify(contents)
 
 
-def wrap_backend_debug(compiler_fn, compiler_name: str):
+def wrap_backend_debug(unconfigured_compiler_fn, compiler_name: str):
     """
     A minifier decorator that wraps the TorchDynamo produced Fx graph modules.
     As opposed to wrap_compiler_debug, this wrapper intercepts at the
@@ -991,8 +1000,9 @@ def wrap_backend_debug(compiler_fn, compiler_name: str):
     repro.tar.gz.
     """
 
-    @functools.wraps(compiler_fn)
+    @functools.wraps(unconfigured_compiler_fn)
     def debug_wrapper(gm, example_inputs, **kwargs):
+        compiler_fn = functools.partial(unconfigured_compiler_fn, **kwargs)
         assert config.repro_after in ("dynamo", "aot", None)
         if config.repro_after == "dynamo":
             if config.repro_level == 3:
@@ -1001,10 +1011,10 @@ def debug_wrapper(gm, example_inputs, **kwargs):
             # Check for either accuracy (level 4) or other type of failures.
             if config.repro_level == 4:
                 # Check Accuracy
-                compiled_gm = compiler_fn(copy.deepcopy(gm), example_inputs, **kwargs)
+                compiled_gm = compiler_fn(copy.deepcopy(gm), example_inputs)
                 if backend_accuracy_fails(gm, example_inputs, compiler_fn):
                     log.warning(
-                        "Accuracy failed for the TorchDyanmo produced graph. Creating script to minify the error."
+                        "Accuracy failed for the TorchDynamo produced graph. Creating script to minify the error."
                     )
                     dump_to_minify_after_dynamo(
                         fx.GraphModule(gm, copy.deepcopy(gm.graph)),
@@ -1018,9 +1028,7 @@ def debug_wrapper(gm, example_inputs, **kwargs):
                     raise exc
             else:
                 try:
-                    compiled_gm = compiler_fn(
-                        copy.deepcopy(gm), example_inputs, **kwargs
-                    )
+                    compiled_gm = compiler_fn(copy.deepcopy(gm), example_inputs)
                     run_fwd_maybe_bwd(compiled_gm, example_inputs)
                 except Exception as exc:
                     log.warning(
@@ -1044,21 +1052,19 @@ def debug_wrapper(gm, example_inputs, **kwargs):
                     )
                     raise
         else:
-            compiled_gm = compiler_fn(gm, example_inputs, **kwargs)
+            compiled_gm = compiler_fn(gm, example_inputs)
 
         return compiled_gm
 
-    debug_wrapper._torchdynamo_orig_callable = compiler_fn
+    debug_wrapper._torchdynamo_orig_callable = unconfigured_compiler_fn
 
     return debug_wrapper
 
 
-@register_backend
+@register_debug_backend
 def dynamo_minifier_backend(gm, example_inputs, compiler_name):
     from functorch.compile import minifier
 
-    from .eval_frame import lookup_backend
-
     compiler_fn = lookup_backend(compiler_name)
 
     try:
@@ -1088,31 +1094,27 @@ def dynamo_minifier_backend(gm, example_inputs, compiler_name):
     return gm
 
 
-@register_backend
+@register_debug_backend
 def dynamo_accuracy_minifier_backend(gm, example_inputs, compiler_name):
     from functorch.compile import minifier
 
-    from torch._dynamo.optimizations.backends import BACKENDS
-
-    if compiler_name == "inductor":
-        from torch._inductor.compile_fx import compile_fx
-
-        compiler_fn = compile_fx
-    else:
-        compiler_fn = BACKENDS[compiler_name]
+    compiler_fn = lookup_backend(compiler_name)
 
     # Set the eval mode to remove randomness.
     gm.eval()
 
     # Check Accuracy
-    if backend_accuracy_fails(gm, example_inputs, compiler_fn):
-        log.warning("Accuracy failed for the TorchDyanmo produced graph")
+    if backend_accuracy_fails(
+        gm, example_inputs, compiler_fn, only_fwd=config.repro_forward_only
+    ):
+        log.warning("Accuracy failed for the TorchDynamo produced graph")
         dump_state_fn = functools.partial(
             dump_backend_state, compiler_name=compiler_name, check_accuracy=True
         )
         fails_fn = functools.partial(
             backend_accuracy_fails,
             compiler_fn=compiler_fn,
+            only_fwd=config.repro_forward_only,
         )
         dump_state_fn(fx.GraphModule(gm, copy.deepcopy(gm.graph)), example_inputs)
         minifier(
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 332cca46bfff..ae79d61f5076 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -18,6 +18,7 @@
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.fx.graph import _PyTreeCodeGen, _PyTreeInfo
 from torch.nn.parallel.distributed import DistributedDataParallel
+from .backends.registry import CompilerFn, lookup_backend
 
 from .hooks import Hooks
 
@@ -39,7 +40,6 @@
 from . import config, convert_frame, skipfiles, utils
 from .exc import ResetRequired
 from .mutation_guard import install_generation_tagging_init
-from .output_graph import CompilerFn
 from .types import DynamoCallback
 from .utils import compile_times
 
@@ -78,7 +78,26 @@ def __getattr__(self, name):
             return self._modules["_orig_mod"]
         return getattr(self._orig_mod, name)
 
+    def __setattr__(self, name, value):
+        if name == "forward":
+            log.warning(
+                "Modifying OptimizedModule.forward may not do what you expect. "
+                "Most usage of OptimizedModule routes through __call__, which will never call OptimizedModule.forward. "
+                "Instead, OptimizedModule.__call__ will invoke a compiled version of the wrapped module's __call__. "
+                "OptimizedModule.forward is provided only as an escape hatch for invoking the compiled wrapped module "
+                "forward method without __call__ (and thus bypassing module hooks). "
+                "To alter the behavior of the wrapped module, modify its forward before compilation. "
+            )
+        super().__setattr__(name, value)
+
+    def __call__(self, *args, **kwargs):
+        return self.dynamo_ctx(self._orig_mod.__call__)(*args, **kwargs)
+
     def forward(self, *args, **kwargs):
+        log.warning(
+            "Calling OptimizedModule.forward will compile/execute wrapped model forward without running module hooks. "
+            "Usually, you should invoke OptimizedModule.__call__ instead, which follows pytorch module behavior."
+        )
         return self.dynamo_ctx(self._orig_mod.forward)(*args, **kwargs)
 
 
@@ -121,9 +140,7 @@ def enable_dynamic(enable: bool = True):
     if not enable:
         yield
         return
-    with patch("torch._dynamo.config.dynamic_shapes", True), patch(
-        "torch._functorch.config.use_dynamic_shapes", True
-    ), patch("torch._dynamo.config.specialize_int_float", False):
+    with config.patch(dynamic_shapes=True, specialize_int_float=False):
         yield
 
 
@@ -267,14 +284,21 @@ def _fn(*args, **kwargs):
 
 
 class OptimizeContext(_TorchDynamoContext):
+    @staticmethod
+    def _different_backend(old, new):
+        return not (old == new or old is None)
+
     def __init__(self, callback, backend_ctx_ctor, first_ctx=False, *, dynamic=False):
         def on_enter():
             global most_recent_backend
-            if (
-                most_recent_backend is not None
-                and most_recent_backend is not compiler_fn
-            ):
-                raise ResetRequired()
+            if OptimizeContext._different_backend(most_recent_backend, compiler_fn):
+                if config.raise_on_backend_change:
+                    raise ResetRequired()
+                else:
+                    warnings.warn(
+                        "changing options to `torch.compile()` may require "
+                        "calling `torch._dynamo.reset()` to take effect"
+                    )
             most_recent_backend = compiler_fn
             install_generation_tagging_init()
 
@@ -316,7 +340,7 @@ def catch_errors(frame, cache_size):
             ddp_module = DistributedDataParallel._get_active_ddp_module()
             if ddp_module:
                 with compile_lock:
-                    from .optimizations.distributed import DDPOptimizer
+                    from torch._dynamo.backends.distributed import DDPOptimizer
 
                     ddp_optimizer = DDPOptimizer(
                         bucket_bytes_cap=ddp_module.bucket_bytes_cap,
@@ -359,21 +383,27 @@ def get_compiler_fn(compiler_fn):
     return wrap_backend_debug(compiler_fn, compiler_str)
 
 
-def lookup_backend(compiler_fn):
-    """Expand backend strings to functions"""
-    if isinstance(compiler_fn, str):
-        from .optimizations import BACKENDS
-
-        compiler_fn = BACKENDS[compiler_fn]
-    return compiler_fn
-
-
 class _NullDecorator(contextlib.nullcontext):  # type: ignore[type-arg]
     def __call__(self, fn):
         assert callable(fn)
         return fn
 
 
+def check_if_dynamo_supported():
+    if sys.platform == "win32":
+        raise RuntimeError("Windows not yet supported for torch.compile")
+    if sys.version_info >= (3, 11):
+        raise RuntimeError("Python 3.11+ not yet supported for torch.compile")
+
+
+def is_dynamo_supported():
+    try:
+        check_if_dynamo_supported()
+        return True
+    except Exception:
+        return False
+
+
 def optimize(
     backend="inductor",
     *,
@@ -407,6 +437,7 @@ def optimize(
         def toy_example(a, b):
             ...
     """
+    check_if_dynamo_supported()
     # Note: The hooks object could be global instead of passed around, *however* that would make
     # for a confusing API usage and plumbing story wherein we nest multiple .optimize calls.
     # There is some prior art around this, w/r/t nesting backend calls are enforced to be the same
@@ -416,14 +447,6 @@ def toy_example(a, b):
     torch._C._log_api_usage_once("torch._dynamo.optimize")
     if disable or os.environ.get("TORCHDYNAMO_DISABLE", "") == "1":
         return _NullDecorator()
-    if sys.platform == "win32":
-        warnings.warn(
-            "Windows is not currently supported, torch.compile() will do nothing"
-        )
-        return _NullDecorator()
-    if sys.version_info >= (3, 11):
-        warnings.warn("Python 3.11+ not yet supported, torch.compile() will do nothing")
-        return _NullDecorator()
 
     backend = get_compiler_fn(backend)
 
@@ -525,6 +548,7 @@ def guard_export_print(guards):
 def export(
     f, *args, aten_graph=False, decomposition_table=None, tracing_mode="real", **kwargs
 ):
+    check_if_dynamo_supported()
     torch._C._log_api_usage_once("torch._dynamo.export")
     if decomposition_table is not None or tracing_mode != "real":
         assert (
@@ -590,8 +614,7 @@ def result_capturing_wrapper(*graph_inputs):
 
         return result_capturing_wrapper
 
-    # TODO(voz): Handle kwargs properly?
-    flat_args, in_spec = pytree.tree_flatten(args)
+    flat_args, in_spec = pytree.tree_flatten((args, kwargs))
 
     remove_from_cache(f)
     with patch(f"{__name__}.most_recent_backend", None):
@@ -635,6 +658,8 @@ def placeholder(self, target, args, kwargs):
             arg = next(self.old_args_gen)
             if "val" in self.current_node.meta:
                 arg.node.meta["val"] = self.current_node.meta["val"]
+            if "tensor_dict" in self.current_node.meta:
+                arg.node.meta["tensor_dict"] = self.current_node.meta["tensor_dict"]
             return arg
 
         def output(self, target, args, kwargs):
@@ -645,12 +670,15 @@ def output(self, target, args, kwargs):
 
         def run_node(self, n):
             self.current_node = n
-            return super().run_node(n)
+            r = super().run_node(n)
+            if "val" in self.current_node.meta:
+                r.node.meta["val"] = self.current_node.meta["val"]
+            return r
 
     if aten_graph:
         # Running graph with interpreter is needed for propagating the stack_trace
         def graph_with_interpreter(*args):
-            with torch.fx.traceback.override_stack_trace():
+            with torch.fx.traceback.preserve_node_meta():
                 return torch.fx.Interpreter(graph).run(*args)
 
         graph = make_fx(
@@ -665,9 +693,10 @@ def graph_with_interpreter(*args):
     ).transform()
 
     # Make dynamo graph to have same input/output spec as user code
+    input_strs = [f"orig_arg_{i}" for i in range(len(args))] + list(kwargs.keys())
     new_graph.graph._codegen = _PyTreeCodeGen(
         _PyTreeInfo(
-            [f"orig_arg_{i}" for i in range(len(args))],
+            input_strs,
             in_spec,
             out_spec_traced,
         )
diff --git a/torch/_dynamo/exc.py b/torch/_dynamo/exc.py
index 349438def9e0..56c867e7acb0 100644
--- a/torch/_dynamo/exc.py
+++ b/torch/_dynamo/exc.py
@@ -30,11 +30,11 @@ class TorchRuntimeError(TorchDynamoException):
 
 class ResetRequired(TorchDynamoException):
     def __init__(self):
-        super(ResetRequired, self).__init__(
+        super().__init__(
             textwrap.dedent(
                 """
                 Must call `torch._dynamo.reset()` before changing backends.  Detected two calls to
-                `torch._dynamo.optimize(...)` with a different backend compiler arguments.
+                `torch.compile()` with a different backend compiler arguments.
                 """
             )
         )
@@ -50,7 +50,7 @@ def __init__(self, backend_fn, inner_exception):
 
 class Unsupported(TorchDynamoException):
     def __init__(self, msg):
-        super(Unsupported, self).__init__(msg)
+        super().__init__(msg)
         self.real_stack = []
         self.msg = msg
         self.category = None
@@ -101,12 +101,10 @@ def augment_exc_message(exc, msg="\n"):
 
     if config.replay_record_enabled and hasattr(exc, "record_filename"):
         msg += f"\nLast frame execution written to {exc.record_filename}. To run only this frame while debugging, run\
- {config.dynamo_import}.replay('{exc.record_filename}').\n"
+ torch._dynamo.replay('{exc.record_filename}').\n"
 
     if not config.verbose:
-        msg += (
-            f"\nSet {config.dynamo_import}.config.verbose=True for more information\n"
-        )
+        msg += "\nSet torch._dynamo.config.verbose=True for more information\n"
 
     if hasattr(exc, "inner_exception") and hasattr(
         exc.inner_exception, "minifier_path"
@@ -143,10 +141,7 @@ def filter_stack(stack):
     for frame in stack:
         if "convert_frame" in frame.filename:
             break
-        if (
-            "eval_frame" in frame.filename
-            or f"{config.dynamo_import}.optimize(" in frame.line
-        ):
+        if "eval_frame" in frame.filename or "torch._dynamo.optimize(" in frame.line:
             continue
         user_stack.append(frame)
 
diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py
index 62515d822765..f36d5e881e18 100644
--- a/torch/_dynamo/guards.py
+++ b/torch/_dynamo/guards.py
@@ -10,8 +10,6 @@
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
 from weakref import ReferenceType
 
-import sympy
-
 import torch
 
 from torch._guards import (
@@ -22,7 +20,7 @@
     GuardSource,
     Source,
 )
-from torch.fx.experimental.symbolic_shapes import FloorDiv
+from torch.fx.experimental.symbolic_shapes import SYMPY_INTERP
 
 from . import config, convert_frame, mutation_guard
 from .eval_frame import set_guard_error_hook, set_guard_fail_hook
@@ -30,6 +28,7 @@
 from .types import GuardedCode, GuardFail, GuardFn  # noqa: F401
 from .utils import (
     dict_const_keys,
+    dict_const_keys_repr,
     dict_param_key_ids,
     guard_failures,
     HAS_NUMPY,
@@ -37,6 +36,8 @@
     np,
     orig_code_map,
     rename_implicit,
+    tensor_shape_should_be_static,
+    tensor_static_reason_to_message,
     tuple_iterator_getitem,
     tuple_iterator_len,
 )
@@ -87,7 +88,7 @@ def __init__(
         id_ref: Callable[[Type[object]], str],
         source_ref: Callable[[Source], str],
         scope: Optional[Dict[str, object]],
-        guarded_code: "CheckFunctionManager",
+        check_fn_manager: "CheckFunctionManager",
         renames=True,
     ):
         self.id_ref = id_ref
@@ -118,6 +119,7 @@ def __init__(
         # tensor match guards make sure we actually have tensors)
         self.shape_env_code: List[str] = []
 
+        # [Note - On Eager Tensor Guards]
         # Most of the time, we generate Python code in a guard to directly
         # check various properties.  However, tensors are a bit special;
         # it is too slow to check their properties one-by-one in Python.
@@ -132,9 +134,7 @@ def __init__(
         self.tensor_check_names: List[str] = []
         self.tensor_check_examples: List[torch.Tensor] = []
 
-        self.tensor_check_ids: Dict[str, int] = {}
-        # TODO: tf is this naming
-        self.guarded_code: CheckFunctionManager = guarded_code
+        self.check_fn_manager: CheckFunctionManager = check_fn_manager
 
     # Warning: use this with care!  This lets you access what the current
     # value of the value you are guarding on is.  You probably don't want
@@ -171,6 +171,22 @@ def TYPE_MATCH(self, guard: Guard):
         code = f"___check_type_id({self.arg_ref(guard)}, {obj_id})"
         self._produce_guard_code(guard, [code])
 
+    def BOOL_FALSE(self, guard: Guard):
+        # Guard on the runtime value being 'False',
+        # can be faster than seemingly equivalent checks like DICT_KEYS for empty dict
+        #
+        # WARNING: this guard is not safe to use generally.  It only works if the runtime
+        # value is of a type that supports bool(), and some types e.g. Tensor do not.
+        # Only use this guard in cases you can guarantee the runtime type will be friendly.
+        # (e.g. Specialized NNModule with mutation protection via setattr)
+        #
+        # Why not simply check the runtime type inside this guard?  It's slow enough to defeat
+        # the purpose of using this guard, which itself is supposed to be a faster alternative
+        # to DICT_KEYS.
+        ref = self.arg_ref(guard)
+        code = f"not {ref}"
+        self._produce_guard_code(guard, [code])
+
     def ID_MATCH(self, guard: Guard):
         # ___check_obj_id is same as `id(x) == y`
         m = re.match(r"^type\((.+)\)$", guard.name)
@@ -342,11 +358,12 @@ def DICT_KEYS(self, guard):
         code.append(f"___check_type_id({ref}, {self.id_ref(t)})")
         param_key_ids = set(dict_param_key_ids(value))
         const_keys = set(dict_const_keys(value))
+        const_keys_repr = dict_const_keys_repr(const_keys)
         if param_key_ids:
             code.append(f"___dict_param_key_ids({ref}) == {param_key_ids!r}")
-            code.append(f"___dict_const_keys({ref}) == {const_keys!r}")
+            code.append(f"___dict_const_keys({ref}) == {const_keys_repr}")
         else:
-            code.append(f"set({ref}.keys()) == {const_keys!r}")
+            code.append(f"set({ref}.keys()) == {const_keys_repr}")
 
         self._produce_guard_code(guard, code)
 
@@ -378,7 +395,7 @@ def ODICT_KEYS(self, guard):
         self._produce_guard_code(guard, code)
 
     def OBJECT_MUTATION(self, guard: Guard):
-        mutation_guard.watch(self.get(guard.name), self.guarded_code)
+        mutation_guard.watch(self.get(guard.name), self.check_fn_manager)
 
     def GRAD_MODE(self, guard: Guard):
         """Guard on the initial grad state"""
@@ -396,16 +413,16 @@ def SHAPE_ENV(self, guard: Guard):
         # shape variables to sources from tracked_fakes.  This must happen after
         # tensor checks.
         assert guard.name == ""
-        output_graph = self.guarded_code.output_graph
+        output_graph = self.check_fn_manager.output_graph
         # NB: self.output_graph can be None in the debug_nops tests
         fs = output_graph.tracked_fakes
-        code = output_graph.shape_env.codegen_guards(
+        guards = output_graph.shape_env.produce_guards(
             [a.fake for a in fs],
             [a.source for a in fs],
             source_ref=self.source_ref,
         )
-        if code != "True":
-            self._produce_guard_code(guard, [code], shape_env=True)
+        for shape_guard in guards:
+            self._produce_guard_code(guard, [shape_guard], shape_env=True)
 
     def TENSOR_MATCH(self, guard: Guard):
         if guard.is_nn_module():
@@ -414,23 +431,97 @@ def TENSOR_MATCH(self, guard: Guard):
             value = self.get(guard.name)
             assert isinstance(value, torch.Tensor)
             tensor_name = self.arg_ref(guard)
-            self.tensor_check_names.append(tensor_name)
-            self.tensor_check_examples.append(value)
-
-            # STOP - DO NOT USE id_ref FOR TENSORS - TENSOR INVALIDATION RULES DIFFER
-            self.tensor_check_ids[tensor_name] = id(value)
-
-            # Note: Guard code produced for tensor_match is a little different.
-            # We accumulate tensor names, then do a single install of `___check_tensors`.
-            # See _guards.cpp and TensorGuard for more information.
-            # TODO(voz): Add tensor matching code to export
-            # Note: this is a bit of a special case, and so does not use _produce_guard_code
-            guard.set_export_info(
-                "TENSOR_MATCH",
-                weakref.ref(type(value)),
-                None,
-                weakref.ref(value),
+            # [Note - On Export Tensor Guards]
+            #
+            # In eager mode, tensor guards are evaluated through C++, in guards.cpp
+            # see [Note - On Eager Tensor Guards] for more info.
+            #
+            # In export mode, we instead maintain parallel logic between C++ and python
+            # here, with an exception of checking the dispatch key - with the idea that a dispatch key
+            # is an entirely runtime notion that would make no sense to keep in an exported graph.
+            #
+            # Now, this idea is okay, but to paraphrase @ezyang, this mental model is sufficient for now, although
+            # not entirely true.
+            # For example, suppose one of the input tensors had the negative dispatch key.
+            # You should end up with a graph that is specialized for tensors that have a negative dispatch key.
+            # If you allow a Tensor that does NOT have this bit set, you will accidentally run it "as if" it were negated.
+            # Now, negative key only shows up for complex numbers, and most likely, the exported to target doesn't
+            # support this feature at all, but the point stands that :some: tensor state only shows up on dispatch key.
+            # TODO(voz): Either populate a dispatch_key check into the guards, or error on users passing in an unsupported
+            # subset of keys during export.
+            #
+            # The list of tensor fields and calls we care about can be found in `terms` below.
+            # TODO(voz): We are missing storage offset in all our tensor guards?
+            code: List[str] = list()
+            if self.check_fn_manager.output_graph.export:
+                self.TYPE_MATCH(guard)
+                terms = [
+                    "dtype",
+                    "device.type",
+                    "device.index",
+                    "requires_grad",
+                    "ndimension()",
+                ]
+                if not config.dynamic_shapes:
+                    terms.append("stride()")
+                    # We need to do this to avoid the torch.Size type in guards
+                    code.append(f"{tensor_name}.shape == {tuple(value.shape)}")
+
+                for term in terms:
+                    real_value = self.get(tensor_name + "." + term)
+                    code.append(f"{tensor_name}.{term} == {real_value}")
+            else:
+                self.tensor_check_names.append(tensor_name)
+                self.tensor_check_examples.append(value)
+
+            # A frame is valid for reuse with dynamic dimensions if the new dynamic dimensions are a
+            # strict subset of the old.
+            #
+            # The logic here is as follows:
+            #
+            # Every mark_dynamic directive is a user-knows-best command, which can incur a raise at tracing
+            # time if we find guards that run counter to the user directive.
+            # If compiling a frame with explicit dynamic dims X could cause an exception, we MUST NOT skip compiling.
+            #
+            # If the frame is compiled with any marked dynamic indices, let's call that set of indices X.
+            # When we evaluated inputs against the guards, given the same tensor with potentially new dynamic indices,
+            # let's call that set Y.
+            #
+            # When X is a strict subset of Y, the potential new raises introduced during compilation are a strict subset
+            # of the raises we
+            # could have encountered. The frame compiled under Y is safe to reuse with X.
+            # When X is not a strict subset of Y, the non-overlapping new elements of X may cause new raises, and the
+            # frame is no longer fit for reuse.
+            #
+            # This is the case because any newly introduced mark_dynamic directives have a chance of
+            # raising, failing compilation. Any existing mark_dynamic indices that we lost are safe to lose
+            # as all it means is that we have gotten rid of a user directive which could incur a raise at compile time.
+            # In the case of when there is no Y, that is, there are no dynamic indices marked at all, the frame is safe
+            # to reuse
+            # as an empty set is a safe degeneration - that is, a strictly static tensor is always valid for a frame
+            # compiled with that same
+            # tensor + more onerous user directives.
+            static, reason = tensor_shape_should_be_static(
+                value, guard.source, is_tensor=True
             )
+            if not static:
+                if hasattr(value, "_dynamo_dynamic_indices"):
+                    code.append(
+                        f"({tensor_name}._dynamo_dynamic_indices.issubset({value._dynamo_dynamic_indices})) if hasattr({tensor_name}, '_dynamo_dynamic_indices') else True"  # noqa: B950
+                    )
+                # In the case of us not having any dynamic dimension indices, we compiled the frame with no chance of
+                # raising for this specific tensor - and any inputs with more dynamic user directives specified must be recompiled.
+                else:
+                    code.append(
+                        f"hasattr({tensor_name}, '_dynamo_dynamic_indices') == False"
+                    )
+            else:
+                assert not hasattr(
+                    value, "_dynamo_dynamic_indices"
+                ), f"Illegal Unreachable state, guard accumulation for dynamic tensor that should have been static. Initial static message: {tensor_static_reason_to_message(reason)}"  # noqa: B950
+
+            if len(code) > 0:
+                self._produce_guard_code(guard, code)
 
     # A util that appends guarded code, or, in the case of export, adds data onto guards
     def _produce_guard_code(
@@ -480,7 +571,7 @@ def _produce_guard_code(
 
 
 # NB: Naively, you'd expect this to only be a function that produces
-# the callable that consistutes the guard.  However, there is some
+# the callable that constitutes the guard.  However, there is some
 # delicate handling for invalidating this check function when the
 # locals/globals get invalidated, so there's some extra state
 # we have to hold in this manager class.
@@ -543,6 +634,7 @@ def source_ref(source):
                 # TODO: we could make use of 'DefaultsSource' and offer a .guard.is_defaults() API
                 and "__defaults__" not in guard.name
                 and "__kwdefaults__" not in guard.name
+                and "hooks" not in guard.name
             ):
                 continue
             guard.create(local_builder, global_builder)
@@ -573,12 +665,12 @@ def compile_check_fn(
             local_builder.tensor_check_names + global_builder.tensor_check_names
         )
 
-        tensor_check_ids = local_builder.tensor_check_ids.copy()
-        tensor_check_ids.update(global_builder.tensor_check_ids)
-
         check_tensors_fn = None
         check_tensors_verbose_fn = None
         if tensor_check_names:
+            assert (
+                not self.output_graph.export
+            ), "Illegal to set tensor_check_names in export."
             tensor_check_examples = (
                 local_builder.tensor_check_examples
                 + global_builder.tensor_check_examples
@@ -601,18 +693,18 @@ def compile_check_fn(
         )
         for guard in aotautograd_guards:
             if isinstance(guard, DuplicateInputs):
-                pos_a = guard.input_pos_a
-                pos_b = guard.input_pos_b
-                assert pos_b < len(self.output_graph.graphargs) and pos_a < len(
-                    self.output_graph.graphargs
-                ), "Deduped args out of bounds"
+                pos_a = self.output_graph.pos_to_arg[guard.input_pos_a]
+                pos_b = self.output_graph.pos_to_arg[guard.input_pos_b]
+                assert (
+                    pos_b >= 0 and pos_a >= 0
+                ), "Deduped args out of bounds, cannot be negative"
+
                 assert self.output_graph.graphargs[
                     pos_a
                 ].is_tensor, "Deduped arg must be a tensor"
                 assert self.output_graph.graphargs[
                     pos_b
                 ].is_tensor, "Deduped arg must be a tensor"
-
                 code_part = f"{self.output_graph.graphargs[pos_a].source.name()} is {self.output_graph.graphargs[pos_b].source.name()}"  # noqa: B950
                 code_parts.append(code_part)
                 verbose_code_parts.append(code_part)
@@ -623,12 +715,6 @@ def compile_check_fn(
         verbose_code_parts.extend(local_builder.shape_env_code)
         assert not global_builder.shape_env_code
 
-        def direct_equality(a, b):
-            return a == b
-
-        def direct_negation(a, b):
-            return not direct_equality(a, b)
-
         code = " and ".join(unique(code_parts))
         closure_vars = collections.OrderedDict(
             [
@@ -636,13 +722,8 @@ def direct_negation(a, b):
                 ("___check_tensors", check_tensors_fn),
                 ("___check_tensors_verbose", check_tensors_verbose_fn),
                 ("tensor_check_names", tensor_check_names),
-                ("floor", math.floor),
-                ("ceiling", math.ceil),
-                ("Eq", direct_equality),
-                ("Ne", direct_negation),
-                ("Mod", sympy.Mod),
-                ("FloorDiv", FloorDiv),
             ]
+            + list(SYMPY_INTERP.items())
         )
         closure_vars.update(CLOSURE_VARS)
         py_code = f"""\
diff --git a/torch/_dynamo/logging.py b/torch/_dynamo/logging.py
index c25949e4581a..e5c87b6f03a7 100644
--- a/torch/_dynamo/logging.py
+++ b/torch/_dynamo/logging.py
@@ -21,10 +21,16 @@ def get_loggers():
 
 # Set the level of all loggers that torchdynamo is responsible for
 def set_loggers_level(level):
+    """Write current log level"""
     for logger in get_loggers():
         logger.setLevel(level)
 
 
+def get_loggers_level():
+    """Read current log level"""
+    return get_loggers()[0].level
+
+
 LOGGING_CONFIG = {
     "version": 1,
     "formatters": {
diff --git a/torch/_dynamo/optimizations/__init__.py b/torch/_dynamo/optimizations/__init__.py
deleted file mode 100644
index 9117517b8bf4..000000000000
--- a/torch/_dynamo/optimizations/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from .backends import BACKENDS
-from .training import create_aot_backends
-
-create_aot_backends()
-
-__all__ = ["BACKENDS"]
diff --git a/torch/_dynamo/optimizations/analysis.py b/torch/_dynamo/optimizations/analysis.py
deleted file mode 100644
index 9bebfa90d240..000000000000
--- a/torch/_dynamo/optimizations/analysis.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import itertools
-import operator
-
-import torch
-
-from torch._subclasses import FakeTensorMode  # noqa: F401
-from torch.fx.node import map_aggregate
-from torch.fx.passes.shape_prop import _extract_tensor_metadata, ShapeProp
-from torch.multiprocessing.reductions import StorageWeakRef
-
-
-class ShapeAliasingAndMutationProp(ShapeProp):
-    def __init__(self, *args, **kwargs):
-        super(ShapeAliasingAndMutationProp, self).__init__(*args, **kwargs)
-        self.input_alias_groups = set()
-        self.storage_to_alias_group = dict()
-        self.make_alias_group = itertools.count(1)
-        self.name = "ShapeAliasingAndMutation"
-
-    def tensor_alias_group(self, value: torch.Tensor):
-        """Assign a unique identifier to the storage of a given tensor"""
-        storage = StorageWeakRef(value._typed_storage())
-        alias_group = self.storage_to_alias_group.get(storage)
-        if alias_group is None:
-            alias_group = next(self.make_alias_group)
-            self.storage_to_alias_group[storage] = alias_group
-        return alias_group
-
-    def placeholder(self, target, args, kwargs):
-        value = super().placeholder(target, args, kwargs)
-        assert isinstance(value, torch.Tensor)
-        self.input_alias_groups.add(self.tensor_alias_group(value))
-        return value
-
-    def run_node(self, n: torch.fx.Node):
-        args, kwargs = self.fetch_args_kwargs_from_env(n)
-        tensor_args = self.extract_tensors((args, kwargs))
-
-        input_versions1 = [obj._version for obj in tensor_args]
-        result = getattr(self, n.op)(n.target, args, kwargs)
-        input_versions2 = [obj._version for obj in tensor_args]
-
-        n.meta["type"] = type(result)
-        n.meta["alias_groups"] = {
-            self.tensor_alias_group(obj) for obj in self.extract_tensors(result)
-        }
-
-        if (
-            not n.meta["alias_groups"]
-            and n.op == "call_function"
-            and n.target == operator.setitem
-        ):
-            n.meta["alias_groups"] = {self.tensor_alias_group(tensor_args[0])}
-
-        n.meta["mutates_alias_groups"] = {
-            self.tensor_alias_group(tensor)
-            for tensor, v1, v2 in zip(tensor_args, input_versions1, input_versions2)
-            if v1 != v2
-        }
-        # Partial mutation refers to the mutation caused by getitem that can
-        # potentially result in changing only a slice of the original tensor
-        n.meta["partial_mutation"] = False
-
-        def visit_arg(arg: torch.fx.Node):
-            if (
-                arg.op == "call_function" and arg.target == operator.getitem
-            ) or arg.meta["partial_mutation"]:
-                if bool(n.meta["mutates_alias_groups"] & arg.meta["alias_groups"]):
-                    n.meta["partial_mutation"] = True
-
-        torch.fx.map_arg((n.args, n.kwargs), visit_arg)
-        n.meta["is_input_alias"] = bool(
-            self.input_alias_groups & n.meta["alias_groups"]
-        )
-        n.meta["is_input_mutation"] = bool(
-            self.input_alias_groups & n.meta["mutates_alias_groups"]
-        )
-        n.meta["is_mutation"] = bool(n.meta["mutates_alias_groups"])
-        n.meta["tensor_metas"] = [
-            _extract_tensor_metadata(obj) for obj in self.extract_tensors(result)
-        ]
-        tensors = self.extract_tensors(result)
-        if tensors:
-            n.meta["device"] = tensors[0].device
-            n.meta["dtype"] = tensors[0].dtype
-
-        return result
-
-    @staticmethod
-    def extract_tensors(result):
-        """Return a flat list of tensors found in some nested data structure"""
-        seen = set()
-        tensors = []
-
-        def visit(obj):
-            if isinstance(obj, torch.Tensor) and id(obj) not in seen:
-                seen.add(id(obj))
-                tensors.append(obj)
-
-        map_aggregate(result, visit)
-        return tensors
-
-    def run(self, *args):
-        try:
-            super().run(*args)
-        finally:
-            # cleanup
-            self.env.clear()
diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
deleted file mode 100644
index 2d9bd9648ab8..000000000000
--- a/torch/_dynamo/optimizations/backends.py
+++ /dev/null
@@ -1,822 +0,0 @@
-import copy
-import functools
-import io
-import logging
-import os
-import subprocess
-import tempfile
-
-from typing import Dict
-
-import torch
-from ..output_graph import CompilerFn
-
-from ..utils import identity
-from .subgraph import SubGraph
-
-log = logging.getLogger(__name__)
-BACKENDS: Dict[str, CompilerFn] = dict()
-
-
-def register_backend(fn):
-    @functools.wraps(fn)
-    def inner(gm, example_inputs, **kwargs):
-        return fn(gm, example_inputs, **kwargs)
-
-    BACKENDS[fn.__name__] = inner
-    return inner
-
-
-def create_backend(fn):
-    """
-    WARNING: We do not recommend using this for new backends.  This is
-    primarily used to support legacy TorchScript-based backends.
-    """
-
-    @functools.wraps(fn)
-    def inner(model, example_inputs=None, **kwargs):
-        if model is None:
-            return None
-
-        if not isinstance(model, SubGraph):
-            with tempfile.TemporaryDirectory() as tmp:
-                return inner(SubGraph(model, example_inputs, tmp), **kwargs)
-        else:
-            assert example_inputs is None
-
-        try:
-            return fn(model, **kwargs)
-        except KeyboardInterrupt:
-            raise
-
-    BACKENDS[fn.__name__] = inner
-    return inner
-
-
-@register_backend
-def inductor(*args, **kwargs):
-    # do import here to avoid loading inductor into memory when it is not used
-    from torch._inductor.compile_fx import compile_fx
-
-    return compile_fx(*args, **kwargs)
-
-
-@create_backend
-def eager(subgraph):
-    return subgraph.model
-
-
-@create_backend
-def ts(subgraph):
-    return subgraph.scripted
-
-
-def reload_jit_model(subgraph, opt_fn=identity):
-    tmp = io.BytesIO()
-    torch.jit.save(subgraph.scripted, tmp)
-    tmp.seek(0)
-    model = torch.jit.load(tmp)
-    model = opt_fn(model)
-    # populate cache
-    for _ in range(3):
-        model(*subgraph.example_inputs)
-    return model
-
-
-def reload_jit_model_ofi(subgraph):
-    return reload_jit_model(subgraph, torch.jit.optimize_for_inference)
-
-
-@create_backend
-def nnc(subgraph):
-    with torch.jit.fuser("fuser1"):
-        return reload_jit_model(subgraph)
-
-
-@create_backend
-def nnc_ofi(subgraph):
-    with torch.jit.fuser("fuser1"):
-        return reload_jit_model_ofi(subgraph)
-
-
-@create_backend
-def ts_nvfuser(subgraph):
-    with torch.jit.fuser("fuser2"):
-        return reload_jit_model(subgraph)
-
-
-@create_backend
-def ts_nvfuser_ofi(subgraph):
-    with torch.jit.fuser("fuser2"):
-        return reload_jit_model_ofi(subgraph)
-
-
-@create_backend
-def onednn(subgraph):
-    with torch.jit.fuser("fuser3"):
-        return reload_jit_model(subgraph)
-
-
-@create_backend
-def ofi(subgraph):
-    return torch.jit.optimize_for_inference(subgraph.scripted)
-
-
-@create_backend
-def static_runtime(subgraph):
-    scripted = subgraph.scripted
-    if hasattr(scripted, "_c"):
-        static_module = torch._C._jit_to_static_module(scripted._c)
-    else:
-        static_module = torch._C._jit_to_static_module(scripted.graph)
-    return subgraph.wrap_returns(static_module)
-
-
-def onnxrt_common(subgraph, provider, onnx_filename=None):
-    import numpy as np  # type: ignore[import]
-    import onnxruntime  # type: ignore[import]
-
-    _np_dtype = {
-        torch.float16: np.float16,
-        torch.float32: np.float32,
-        torch.float64: np.float64,
-        torch.uint8: np.uint8,
-        torch.int8: np.int8,
-        torch.int16: np.int16,
-        torch.int32: np.int32,
-        torch.int64: np.longlong,
-        torch.bool: np.bool_,
-    }
-
-    assert provider in onnxruntime.get_available_providers()
-    session = onnxruntime.InferenceSession(
-        onnx_filename or subgraph.onnx_filename, providers=[provider]
-    )
-    input_names = subgraph.input_names
-    output_names = subgraph.output_names
-    create_outputs = subgraph.empty_outputs_factory()
-    is_cpu = subgraph.is_cpu
-
-    def _call(*initial_args):
-        binding = session.io_binding()
-        args = [a.contiguous() for a in initial_args]
-        for name, value in zip(input_names, args):
-            dev = value.device
-            binding.bind_input(
-                name,
-                dev.type,
-                dev.index or 0,
-                _np_dtype[value.dtype],
-                value.size(),
-                value.data_ptr(),
-            )
-        outputs = create_outputs()
-        for name, value in zip(output_names, outputs):
-            dev = value.device
-            binding.bind_output(
-                name,
-                dev.type,
-                dev.index or 0,
-                _np_dtype[value.dtype],
-                value.size(),
-                value.data_ptr(),
-            )
-        session.run_with_iobinding(binding)
-        if is_cpu:
-            binding.copy_outputs_to_cpu()
-        return outputs
-
-    return subgraph.wrap_returns(_call)
-
-
-@create_backend
-def onnxrt_cpu(subgraph):
-    return onnxrt_common(subgraph, provider="CPUExecutionProvider")
-
-
-@create_backend
-def onnxrt_cuda(subgraph):
-    return onnxrt_common(subgraph, provider="CUDAExecutionProvider")
-
-
-@create_backend
-def onnx2tensorrt(subgraph):
-    if subgraph.will_tensorrt_barf():
-        # TensorRT fails violently with an abort() on this
-        return None
-
-    return onnxrt_common(subgraph, provider="TensorrtExecutionProvider")
-
-
-@create_backend
-def onnxrt_cpu_numpy(subgraph, provider="CPUExecutionProvider"):
-    """Alternate version that integrates via numpy"""
-    import onnxruntime
-
-    assert provider in onnxruntime.get_available_providers()
-    ort_session = onnxruntime.InferenceSession(
-        subgraph.onnx_filename, providers=[provider]
-    )
-
-    def to_numpy(x):
-        try:
-            return x.numpy()
-        except RuntimeError:
-            return x.detach().numpy()
-
-    def _call(*args):
-        res = ort_session.run(
-            None, {f"i{i}": to_numpy(arg) for i, arg in enumerate(args)}
-        )
-        res = [torch.from_numpy(x) for x in res]
-        return res
-
-    return subgraph.wrap_returns(_call)
-
-
-@create_backend
-def onnxrt(subgraph):
-    if subgraph.is_cuda:
-        return onnxrt_cuda(subgraph)
-    else:
-        return onnxrt_cpu(subgraph)
-
-
-@functools.lru_cache(None)
-def _init_tensorflow():
-    import tensorflow as tf  # type: ignore[import]
-
-    # prevent tensorflow from eating all the GPU memory
-    gpus = tf.config.list_physical_devices("GPU")
-    for gpu in gpus:
-        tf.config.experimental.set_memory_growth(gpu, True)
-    return tf
-
-
-@create_backend
-def onnx2tf(subgraph):
-    import onnx  # type: ignore[import]
-    from onnx_tf.backend import prepare  # type: ignore[import]
-
-    tf = _init_tensorflow()
-    filename = subgraph.filename("tensorflow")
-    input_names = subgraph.input_names
-    output_names = subgraph.output_names
-    device = "/CPU:0" if subgraph.is_cpu else f"/GPU:{subgraph.device_index}"
-    with tf.device(device):
-        if not os.path.exists(filename):
-            prepare(onnx.load(subgraph.onnx_filename)).export_graph(filename)
-        tf_module = tf.saved_model.load(filename)
-        tf_module = tf.function(tf_module, jit_compile=True)
-
-    def run(*i_args):
-        args = [a.contiguous() for a in i_args]
-        with tf.device(device):
-            outs = tf_module(
-                **{
-                    name: tf.experimental.dlpack.from_dlpack(
-                        torch.utils.dlpack.to_dlpack(args[idx])
-                    )
-                    for idx, name in enumerate(input_names)
-                }
-            )
-            return [
-                torch.utils.dlpack.from_dlpack(
-                    tf.experimental.dlpack.to_dlpack(outs[name])
-                )
-                for name in output_names
-            ]
-
-    return subgraph.wrap_returns(run)
-
-
-@create_backend
-def taso(subgraph):
-    taso_filename = subgraph.filename("taso")
-    subprocess.check_call(
-        [
-            os.path.expanduser("~/conda/envs/taso/bin/python"),
-            "-c",
-            "import taso,onnx; onnx.save(taso.export_onnx(taso.optimize("
-            f"taso.load_onnx('{subgraph.onnx_filename}'))), '{taso_filename}')",
-        ]
-    )
-    return onnxrt_common(
-        subgraph, provider="CUDAExecutionProvider", onnx_filename=taso_filename
-    )
-
-
-@create_backend
-def ipex(subgraph, **kwargs):
-    import intel_extension_for_pytorch as ipex  # type: ignore[import]
-
-    inputs = subgraph.example_inputs
-    model = subgraph.model
-    with torch.no_grad():
-        model.eval()
-        if kwargs["datatype"] == "bf16":
-            model = ipex.optimize(model, dtype=torch.bfloat16)
-        else:
-            model = ipex.optimize(model, dtype=torch.float32)
-        try:
-            traced_model = torch.jit.trace(model, inputs).eval()
-            traced_model = torch.jit.freeze(traced_model)
-            return traced_model
-        except Exception:
-            log.warning("JIT trace failed during the 'ipex' optimize process.")
-            return model
-
-
-def _raise_timeout(signum, frame):
-    raise TimeoutError()
-
-
-@create_backend
-def fx2trt(subgraph, **kwargs):
-    if subgraph.will_tensorrt_barf():
-        # TensorRT fails violently with an abort() on this
-        return None
-
-    from torch_tensorrt.fx.fx2trt import (  # type: ignore[import]
-        InputTensorSpec,
-        TRTInterpreter,
-    )
-    from torch_tensorrt.fx.passes.lower_basic_pass import (  # type: ignore[import]
-        transform_setitem,
-    )
-    from torch_tensorrt.fx.tools.trt_splitter import (  # type: ignore[import]
-        TRTSplitter,
-        TRTSplitterSetting,
-    )
-    from torch_tensorrt.fx.tracer.acc_tracer import acc_tracer  # type: ignore[import]
-    from torch_tensorrt.fx.trt_module import TRTModule  # type: ignore[import]
-    from torch_tensorrt.fx.utils import LowerPrecision  # type: ignore[import]
-
-    from .normalize import normalize_ir
-
-    try:
-        model = subgraph.model
-        inputs = subgraph.example_inputs
-        # normalize
-        model = normalize_ir(model, inputs)
-        # pass rewrite
-        model = transform_setitem(model, inputs)
-        acc_model = acc_tracer.trace(model, inputs)
-        # Split out unsupported ops
-        splitter_setting = TRTSplitterSetting()
-        splitter_setting.use_implicit_batch_dim = False
-        splitter = TRTSplitter(acc_model, inputs, settings=splitter_setting)
-        splitter.node_support_preview()
-        split_mod = splitter()
-        num_piece = 0
-        for name, _ in split_mod.named_children():
-            print(f"graph is split into {name}")
-            num_piece += 1
-
-        # if the graph module is split into pieces larger than 8, we consider its perf
-        # is not good and fall back to non-TRT
-        if num_piece > 8:
-            print(
-                f"The graph module is split into {num_piece} which is large than the \
-                threshold=8. Fall back to non-TRT module."
-            )
-            return None
-
-        if "fp16_mode" in kwargs and kwargs["fp16_mode"]:
-            precision = LowerPrecision.FP16
-        else:
-            precision = LowerPrecision.FP32
-
-        def get_submod_inputs(mod, submod, inputs):
-            acc_inputs = None
-
-            def get_input(self, inputs):
-                nonlocal acc_inputs
-                acc_inputs = inputs
-
-            handle = submod.register_forward_pre_hook(get_input)
-            mod(*inputs)
-            handle.remove()
-            return acc_inputs
-
-        for name, _ in split_mod.named_children():
-            if "_run_on_acc" in name:
-                submod = getattr(split_mod, name)
-                # print("acc=",submod.code)
-                # Get submodule inputs for fx2trt
-                acc_inputs = get_submod_inputs(split_mod, submod, inputs)
-
-                # fx2trt replacement
-                interp = TRTInterpreter(
-                    submod,
-                    InputTensorSpec.from_tensors(acc_inputs),
-                    explicit_batch_dimension=True,
-                )
-                r = interp.run(
-                    max_workspace_size=20 << 30,
-                    lower_precision=precision,
-                    # profiling_verbosity=trt.ProfilingVerbosity.DETAILED, #For profile
-                )
-                # For profile
-                # from fx2trt_oss.fx.tools.trt_profiler_sorted import profile_trt_module
-                # profile_trt_module("", trt_mod, acc_inputs)
-                trt_mod = TRTModule(*r)
-
-                setattr(split_mod, name, trt_mod)
-            else:
-                submod = getattr(split_mod, name)
-                # print("gpu=",submod.code)
-        return subgraph.wrap_returns(split_mod)
-    except Exception:
-        log.exception("FX2TRT conversion error")
-        return None
-
-
-@create_backend
-def torch2trt(subgraph):
-    if subgraph.will_tensorrt_barf():
-        # TensorRT fails violently with an abort() on this
-        return None
-
-    from torch2trt import torch2trt  # type: ignore[import]
-
-    inputs = subgraph.example_inputs
-    trt_mod = torch2trt(
-        subgraph.model,
-        inputs,
-        max_batch_size=len(inputs[0]),
-        strict_type_constraints=True,
-    )
-    return subgraph.wrap_returns(trt_mod)
-
-
-@create_backend
-def tensorrt(subgraph):
-    if subgraph.will_tensorrt_barf():
-        # TensorRT fails violently with an abort() on this
-        return None
-
-    model = onnx2tensorrt(subgraph)
-    if model is None:
-        model = torch2trt(subgraph)
-    return model
-
-
-@create_backend
-def cudagraphs(subgraph):
-    model = subgraph.model
-    inputs = subgraph.example_inputs
-    assert subgraph.is_cuda
-    return subgraph.wrap_returns(cudagraphs_inner(model, inputs))
-
-
-@create_backend
-def cudagraphs_ts(subgraph):
-    assert subgraph.is_cuda
-    model = subgraph.scripted
-    inputs = subgraph.example_inputs
-
-    # warmup
-    for _ in range(3):
-        model(*inputs)
-
-    return subgraph.wrap_returns(cudagraphs_inner(model, inputs))
-
-
-@create_backend
-def cudagraphs_ts_ofi(subgraph):
-    assert subgraph.is_cuda
-    model = torch.jit.optimize_for_inference(torch.jit.freeze(subgraph.scripted))
-    inputs = subgraph.example_inputs
-
-    # warmup
-    for _ in range(3):
-        model(*inputs)
-
-    return subgraph.wrap_returns(cudagraphs_inner(model, inputs))
-
-
-def cudagraphs_inner(model, inputs, copy_outputs=True):
-    assert isinstance(inputs, (list, tuple))
-    static_inputs = [torch.zeros_like(x) for x in inputs]
-
-    # warmup
-    torch.cuda.synchronize()
-    stream = torch.cuda.Stream()
-    stream.wait_stream(torch.cuda.current_stream())
-    with torch.cuda.stream(stream):
-        model(*inputs)
-    stream.synchronize()
-    torch.cuda.current_stream().wait_stream(stream)
-    torch.cuda.synchronize()
-
-    # record
-    graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(graph, stream=stream):
-        static_outputs = model(*static_inputs)
-    if not isinstance(static_outputs, (list, tuple)):
-        static_outputs = (static_outputs,)
-
-    def run(*new_inputs):
-        assert len(static_inputs) == len(new_inputs)
-        for dst, src in zip(static_inputs, new_inputs):
-            dst.copy_(src)
-        graph.replay()
-        if copy_outputs:
-            return [x.clone() for x in static_outputs]
-        else:
-            return static_outputs
-
-    return run
-
-
-def tvm_compile(jit_mod, example_inputs, log_file=None, **kwargs):
-    if jit_mod is None:
-        return None
-    try:
-        return tvm_compile_inner(jit_mod, example_inputs, None, log_file, **kwargs)
-    except Exception as e:
-        if log_file and os.path.exists(log_file):
-            os.unlink(log_file)
-        if isinstance(e, KeyboardInterrupt):
-            raise
-        log.exception("tvm error")
-        return None
-
-
-@create_backend
-def tvm(subgraph):
-    return subgraph.wrap_returns(
-        tvm_compile_inner(
-            subgraph.scripted,
-            subgraph.example_inputs,
-            tuning_option=None,
-            cuda=subgraph.is_cuda,
-        )
-    )
-
-
-@create_backend
-def ansor(subgraph):
-    """
-    WARNING: this backend takes hours or days to train and
-    often produces a slower result than the default schedule.
-    """
-    return subgraph.wrap_returns(
-        tvm_compile_inner(
-            subgraph.scripted,
-            subgraph.example_inputs,
-            tuning_option="auto_scheduler",
-            log_file=subgraph.filename("ansor"),
-            cuda=subgraph.is_cuda,
-        )
-    )
-
-
-@create_backend
-def tvm_meta_schedule(subgraph):
-    return subgraph.wrap_returns(
-        tvm_compile_inner(
-            subgraph.scripted,
-            subgraph.example_inputs,
-            tuning_option="meta_schedule",
-            trials=20000,
-            cuda=subgraph.is_cuda,
-        )
-    )
-
-
-@functools.lru_cache(None)
-def llvm_target():
-    if "avx512" in open("/proc/cpuinfo").read():
-        return "llvm -mcpu=skylake-avx512"
-    return "llvm -mcpu=core-avx2"
-
-
-def tvm_compile_inner(
-    jit_mod, example_inputs, tuning_option=None, log_file=None, trials=20000, cuda=False
-):
-    try:
-        import tvm  # type: ignore[import]
-        from tvm import relay  # type: ignore[import]
-        from tvm.contrib import graph_executor  # type: ignore[import]
-
-        shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
-        mod, params = relay.frontend.from_pytorch(jit_mod, shape_list)
-        if cuda:
-            dev = tvm.cuda(0)
-            target = tvm.target.cuda()
-        else:
-            dev = tvm.cpu(0)
-            target = tvm.target.Target(llvm_target())
-
-        if tuning_option == "auto_scheduler":
-            from tvm import auto_scheduler
-
-            if log_file is None:
-                log_file = tempfile.NamedTemporaryFile()
-            if not os.path.exists(log_file):
-                tasks, task_weights = auto_scheduler.extract_tasks(
-                    mod["main"], params, target
-                )
-                for task in tasks:
-                    print(task.compute_dag)
-                else:
-                    print("No tasks")
-                if len(tasks) != 0:
-                    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
-                    if not os.path.exists(log_file):
-                        assert trials > 0
-                        tune_option = auto_scheduler.TuningOptions(
-                            num_measure_trials=trials,
-                            measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-                            early_stopping=2000,
-                        )
-                        try:
-                            tuner.tune(tune_option)
-                        except Exception:
-                            if os.path.exists(log_file):
-                                os.unlink(log_file)
-                            raise
-
-            with auto_scheduler.ApplyHistoryBest(log_file):
-                with tvm.transform.PassContext(
-                    opt_level=3, config={"relay.backend.use_auto_scheduler": True}
-                ):
-                    lib = relay.build(mod, target=target, params=params)
-        elif tuning_option == "meta_schedule":
-            from os import path as osp
-
-            from tvm import meta_schedule as ms
-
-            with tempfile.TemporaryDirectory() as work_dir:
-                if log_file is not None:
-                    assert osp.isdir(
-                        log_file
-                    ), "TVM's meta_schedule requires a directory for storing log files."
-                    work_dir = log_file
-                if not cuda:
-                    # meta_schedule needs num-cores to be specified
-                    # here we use the maximum core count
-                    target = tvm.target.Target(
-                        f"{llvm_target()} --num-cores {ms.utils.cpu_count(logical=False)}"
-                    )
-                # TODO(shingjan): This could be replaced by tvm.contrib.torch.optimize_torch
-                # once USE_PT_TVMDSOOP is updated and turned on by default in TVM.
-                database = ms.relay_integration.tune_relay(
-                    mod=mod,
-                    target=target,
-                    work_dir=work_dir,
-                    max_trials_global=20000,
-                    num_trials_per_iter=64,
-                    params=params,
-                    strategy="evolutionary",
-                )
-                lib = ms.relay_integration.compile_relay(
-                    database=database,
-                    mod=mod,
-                    target=target,
-                    params=params,
-                )
-
-        elif tuning_option is None:
-            # no autotuning (for debugging)
-            with tvm.transform.PassContext(opt_level=10):
-                lib = relay.build(mod, target=target, params=params)
-        else:
-            raise NotImplementedError(
-                "This tuning option is invalid/not implemented for torchdynamo's TVM-related backend. "
-                "There are three available options including None, auto_scheduler and meta_schedule."
-            )
-        m = graph_executor.GraphModule(lib["default"](dev))
-
-        def to_torch_tensor(nd_tensor):
-            """A helper function to transfer a NDArray to torch.tensor."""
-            if nd_tensor.dtype == "bool":
-                # DLPack does not support boolean so it can't be handled by
-                # torch.utils.dlpack.from_pack. Workaround by going through
-                # numpy, although this brings additional data copy overhead.
-                return torch.from_numpy(nd_tensor.numpy())
-            return torch.utils.dlpack.from_dlpack(nd_tensor.to_dlpack())
-
-        def to_tvm_tensor(torch_tensor):
-            """A helper function to transfer a torch.tensor to NDArray."""
-            if torch_tensor.dtype == torch.bool:
-                # same reason as above, fallback to numpy conversion which
-                # could introduce data copy overhead
-                return tvm.nd.array(torch_tensor.cpu().numpy())
-            return tvm.nd.from_dlpack(torch_tensor)
-
-        def exec_tvm(*i_args):
-            args = [a.contiguous() for a in i_args]
-            for idx, arg in enumerate(args, 0):
-                if arg.dim() != 0:
-                    if arg.requires_grad:
-                        arg = arg.detach()
-                    m.set_input(
-                        f"inp_{idx}",
-                        to_tvm_tensor(arg),
-                    )
-            m.run()
-            return [
-                to_torch_tensor(m.get_output(i)) for i in range(m.get_num_outputs())
-            ]
-
-        return exec_tvm
-    except Exception:
-        log.exception("tvm error")
-        return jit_mod  # explicit fall back to eager
-
-
-@functools.lru_cache(None)
-def _init_ltc():
-    try:
-        import torch._lazy.extract_compiled_graph
-        from torch._lazy.ts_backend import init as init_ts_backend
-
-        # hopefully changing this line to sth like _ltc_init_xla_backend in future
-        # will enable XLA
-        init_ts_backend()
-
-        return torch._lazy
-    except ModuleNotFoundError as e:
-        print(f"ltc backend fails. Can not import {e.name}")
-        raise
-
-
-def ltc_reuse_graph(gm: torch.fx.GraphModule, example_inputs):
-    ltc = _init_ltc()
-    return ltc.extract_compiled_graph.extract_compiled_graph(gm, example_inputs)
-
-
-def ltc_trivial(gm: torch.fx.GraphModule, example_inputs):
-    ltc = _init_ltc()
-    lazy_model = copy.deepcopy(gm).to(device="lazy")
-    ltc.extract_compiled_graph.force_lazy_device(lazy_model)
-
-    def ltc_model(*inputs):
-        orig_device = inputs[0].device if len(inputs) > 0 else "cuda"
-        lazy_inputs = tuple(inp.to(device="lazy") for inp in inputs)
-
-        lazy_out = lazy_model(*lazy_inputs)
-        out = tuple(out.to(device=orig_device) for out in lazy_out)
-        return out
-
-    return ltc_model
-
-
-@create_backend
-def torchxla_trivial(subgraph):
-    return subgraph.model
-
-
-@create_backend
-def torchxla_trace_once(subgraph):
-    import torch._dynamo.optimizations.torchxla_integration as integration
-
-    compiled_graph = None
-    model = subgraph.model
-
-    def fwd(*args):
-        nonlocal subgraph
-        nonlocal compiled_graph
-        if compiled_graph is None:
-            compiled_graph = integration.extract_compiled_graph(model, args)
-            del subgraph
-        return compiled_graph(*args)
-
-    return fwd
-
-
-def ipex_fp32(gm: torch.fx.GraphModule, example_inputs):
-    kwargs_ipex = {"datatype": "fp32"}
-    return BACKENDS["ipex"](gm, example_inputs, **kwargs_ipex)
-
-
-def ipex_bf16(gm: torch.fx.GraphModule, example_inputs):
-    kwargs_ipex = {"datatype": "bf16"}
-    return BACKENDS["ipex"](gm, example_inputs, **kwargs_ipex)
-
-
-def fx2trt_compiler_fp16(gm: torch.fx.GraphModule, example_inputs):
-    kwargs_fx2trt = {"fp16_mode": True}
-    trt_compiled = BACKENDS["fx2trt"](gm, example_inputs, **kwargs_fx2trt)
-    if trt_compiled is not None:
-        return trt_compiled
-    else:
-        print(
-            "FX2TRT conversion failed on the subgraph. Return GraphModule forward instead"
-        )
-        return gm.forward
-
-
-def fx2trt_compiler(gm: torch.fx.GraphModule, example_inputs):
-    kwargs_fx2trt = {"fp16_mode": False}
-    trt_compiled = BACKENDS["fx2trt"](gm, example_inputs, **kwargs_fx2trt)
-    if trt_compiled is not None:
-        return trt_compiled
-    else:
-        print(
-            "FX2TRT conversion failed on the subgraph. Return GraphModule forward instead"
-        )
-        return gm.forward
diff --git a/torch/_dynamo/optimizations/inference.py b/torch/_dynamo/optimizations/inference.py
deleted file mode 100644
index 0ecf45402549..000000000000
--- a/torch/_dynamo/optimizations/inference.py
+++ /dev/null
@@ -1,197 +0,0 @@
-import base64
-import hashlib
-import io
-import itertools
-import json
-import logging
-import os
-import time
-from collections import defaultdict
-
-import torch
-
-from .. import config
-from ..utils import (
-    check_is_cuda,
-    checkpoint_params,
-    clone_inputs,
-    count_calls,
-    counters,
-)
-from .normalize import long_name, normalize_ir
-
-log = logging.getLogger(__name__)
-
-
-def string_key(gm: torch.fx.GraphModule, example_inputs):
-    out = io.StringIO()
-    node_to_id = defaultdict(iter(itertools.count()).__next__)
-
-    def argkey(n: torch.fx.Node):
-        return f"#{node_to_id[n]}"
-
-    def tensorkey(t):
-        if isinstance(t, torch.Tensor):
-            requires_grad = t.requires_grad and torch.torch.is_grad_enabled()
-            return (
-                f"{t.__class__.__name__}({t.dtype}, {t.device}, "
-                f"{tuple(t.size())}, {tuple(t.stride())}, {requires_grad})"
-            )
-        return type(t).__name__
-
-    inputs_iter = iter(example_inputs)
-
-    for node in gm.graph.nodes:
-        key = argkey(node)
-        name = "."
-        if node.op == "placeholder":
-            name = tensorkey(next(inputs_iter))
-        elif node.op == "get_attr":
-            val = eval(f"self.{node.target}", {"self": gm})
-            name = tensorkey(val)
-        elif node.op in ("call_function", "call_method", "call_module"):
-            name = long_name(gm, node)
-        out.write(
-            f"{key} {node.op} {name} "
-            f"{torch.fx.map_arg(node.args, argkey)!r} "
-            f"{torch.fx.map_arg(node.kwargs, argkey)!r}\n"
-        )
-    return out.getvalue()
-
-
-def graph_hash(gm: torch.fx.GraphModule, example_inputs):
-    return "g" + base64.urlsafe_b64encode(
-        hashlib.sha256(string_key(gm, example_inputs).encode("utf-8")).digest()
-    )[:39].decode("utf-8")
-
-
-def folder_name(gm: torch.fx.GraphModule, example_inputs):
-    base = os.path.join(config.base_dir, "subgraphs")
-    if not os.path.exists(base):
-        os.mkdir(base)
-        open(os.path.join(base, "__init__.py"), "w").close()
-    return os.path.join(base, graph_hash(gm, example_inputs))
-
-
-def record_graph_stats(gm):
-    for node in gm.graph.nodes:
-        if node.op in ("call_function", "call_method", "call_module"):
-            counters[node.op][long_name(gm, node)] += 1
-        elif node.op in ("placeholder", "output", "get_attr"):
-            pass
-        else:
-            raise AssertionError(node.op)
-
-
-def check_requires_grad(gm, example_inputs):
-    if torch.is_grad_enabled():
-        if any(
-            getattr(x, "requires_grad", False)
-            for x in itertools.chain(example_inputs, gm.parameters(True))
-        ):
-            return True
-    return False
-
-
-def jit_trace(gm, example_inputs):
-    """Wrapper around jit.trace to handle hooks"""
-    restore_backward_hooks = []
-
-    def visit(mod):
-        if mod._backward_hooks:
-            restore_backward_hooks.append((mod, mod._backward_hooks))
-            mod._backward_hooks = []
-
-    if not check_requires_grad(gm, example_inputs):
-        # in inference mode it is safe to ignore backwards hooks to allow tracing
-        gm.apply(visit)
-
-    try:
-        return torch.jit.trace(gm.forward, example_inputs)
-    finally:
-        for mod, hooks in restore_backward_hooks:
-            mod._backward_hooks = hooks
-
-
-def same(left, right):
-    return len(left) == len(right) and all(
-        torch.allclose(a, b, atol=1e-4, rtol=1e-4) for a, b in zip(left, right)
-    )
-
-
-class TorchScriptStrategy(object):
-    """Common base for backend strategies that use TorchScript"""
-
-    @classmethod
-    def compile_fn(cls, gm: torch.fx.GraphModule, example_inputs):
-        if count_calls(gm.graph) < 2:
-            return gm.forward  # no point for tiny graphs
-        return cls(gm, example_inputs).verified_candidate()
-
-    def __init__(self, gm: torch.fx.GraphModule, example_inputs):
-        super(TorchScriptStrategy, self).__init__()
-        self.restore = checkpoint_params(gm)
-        self.original_example_inputs = example_inputs
-        self.correct = gm.forward(*self.example_inputs)
-        self.gm = normalize_ir(gm, self.original_example_inputs)
-        self.scripted = jit_trace(self.gm, self.example_inputs)
-
-    @property
-    def example_inputs(self):
-        return clone_inputs(self.original_example_inputs)
-
-    def verified_candidate(self):
-        try:
-            candidate = self.candidate()
-            if candidate is None or candidate is self.gm.forward:
-                return self.gm.forward
-
-            self.restore()
-            result = candidate(*self.example_inputs)
-
-            if same(result, self.correct):
-                return candidate
-
-            print(f"incorrect candidate {self}")
-
-            return self.gm.forward
-        except Exception:
-            log.exception("error in verified_candidate()")
-            return self.gm.forward
-        finally:
-            self.restore()
-
-    def candidate(self):
-        raise NotImplementedError()
-
-
-def save_pt(path, name, data):
-    with open(os.path.join(path, name), "wb") as fd:
-        torch.save(data, fd)
-
-
-def save_metadata(path, gm, example_inputs):
-    with open(os.path.join(path, "metadata.json"), "w") as fd:
-        json.dump(
-            {
-                "is_cuda": check_is_cuda(gm, example_inputs),
-            },
-            fd,
-        )
-
-
-def touch_timestamp(path):
-    open(os.path.join(path, "timestamp"), "w").write(str(time.time()))
-
-
-def argmin(perf):
-    best = "eager"
-    best_sec = float("inf")
-    for name, sec in perf.items():
-        if sec < best_sec:
-            best = name
-            best_sec = float(sec)
-            if name == "eager":
-                # small bias torwards using eager since it is more robust
-                best_sec *= 0.99
-    return best
diff --git a/torch/_dynamo/optimizations/log_args.py b/torch/_dynamo/optimizations/log_args.py
deleted file mode 100644
index 111da69d4a8f..000000000000
--- a/torch/_dynamo/optimizations/log_args.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import json
-import os
-
-import torch
-from torch.fx.experimental.proxy_tensor import make_fx
-
-aten = torch.ops.aten
-
-
-class ConvArgsAnalysis(torch.fx.Interpreter):
-    """
-    Log arguments like input shape (input, bias, weights shape)
-    and options(padding/stride/kernel size/dilation/etc) for
-    aten.convolution
-    """
-
-    def __init__(self, gm: torch.fx.GraphModule):
-        super().__init__(gm)
-
-        self.nodes_conv_args = {}
-        self.conv_arg_names = [
-            arg.name for arg in aten.convolution.default._schema.arguments
-        ]
-
-    def run(self, *args):
-        run_result = super().run(*args)
-        if self.nodes_conv_args:
-            filename = "tmp/conv_args.json"
-            os.makedirs(os.path.dirname(filename), exist_ok=True)
-            with open(filename, "a") as fd:
-                json.dump(self.nodes_conv_args, fd)
-                fd.write("\n")
-        return run_result
-
-    def run_node(self, n: torch.fx.Node):
-        result = super().run_node(n)
-        if n.op == "call_function":
-            if n.target == aten.convolution.default:
-                args, kwargs = self.fetch_args_kwargs_from_env(n)
-                assert len(args) == len(
-                    self.conv_arg_names
-                ), f"aten.convolution should have {len(self.conv_arg_names)} args"
-                conv_args = {}
-                # collect tensor's shape, stride (channel first or last), dtype
-                for i in range(3):
-                    arg_name = self.conv_arg_names[i]
-                    if args[i] is None:
-                        conv_args[arg_name] = {
-                            "shape": None,
-                            "stride": None,
-                            "dtype": None,
-                        }
-                    else:
-                        conv_args[arg_name] = {
-                            "shape": args[i].shape,
-                            "stride": args[i].stride(),
-                            "dtype": str(args[i].dtype),
-                        }
-                # collect stride/padding/dilation/transposed/output_padding/groups
-                for i in range(3, len(args)):
-                    arg_name = self.conv_arg_names[i]
-                    conv_args[arg_name] = args[i]
-
-                self.nodes_conv_args[n.name.replace("_default", "")] = conv_args
-        return result
-
-
-def conv_args_analysis(gm: torch.fx.GraphModule, example_inputs):
-    def conv_arg_inner(*args):
-        fx_g = make_fx(gm)(*args)
-        return ConvArgsAnalysis(fx_g).run(*args)
-
-    return conv_arg_inner
diff --git a/torch/_dynamo/optimizations/normalize.py b/torch/_dynamo/optimizations/normalize.py
deleted file mode 100644
index 47b2c5703a4d..000000000000
--- a/torch/_dynamo/optimizations/normalize.py
+++ /dev/null
@@ -1,441 +0,0 @@
-import builtins
-import dataclasses
-import functools
-import itertools
-import logging
-import math
-import operator
-
-import torch
-from torch.fx import Transformer
-from torch.fx.experimental.normalize import NormalizeOperators
-from torch.fx.operator_schemas import get_signature_for_torch_op
-
-from .. import config
-from ..allowed_functions import torch_get_name
-from ..utils import clone_inputs, counters
-from .analysis import ShapeAliasingAndMutationProp
-
-log = logging.getLogger(__name__)
-
-VIEW_OPS = {
-    # list taken from https://pytorch.org/docs/stable/tensor_view.html
-    "getitem",
-    "as_strided",
-    "detach",
-    "diagonal",
-    "expand",
-    "expand_as",
-    "movedim",
-    "narrow",
-    "permute",
-    "select",
-    "squeeze",
-    "transpose",
-    "t",
-    "T",
-    "real",
-    "imag",
-    "view_as_real",
-    "view_as_imag",
-    "unflatten",
-    "unfold",
-    "unsqueeze",
-    "view",
-    "view_as",
-    "unbind",
-    "split",
-    "split_with_sizes",
-    "swapaxes",
-    "swapdims",
-    "chunk",
-    "indices",
-    "values",
-}
-MAYBE_VIEW_OPS = {"contiguous", "reshape"}
-
-# convert x.foo(...) to torch.foo(x, ...)
-NORMALIZE_METHODS = {
-    # These ones aren't normalized:
-    # ('view', 342)
-    # ('reshape', 285)
-    # ('expand', 87)
-    # ('permute', 78)
-    # ('to', 66)
-    # ('contiguous', 62)
-    # ('reshape_as', 57)
-    # ('masked_fill', 30)
-    # ('float', 22) -- could rewrite
-    # ('expand_as', 14) -- could rewrite
-    # ('detach', 4)
-    # ('repeat', 2)
-    # TODO(jansel): debug why this causes issues in detectron2_maskrcnn
-    # "div": torch.div,
-    "add_": operator.iadd,
-    "all": torch.all,
-    "any": torch.any,
-    "ceil": torch.ceil,
-    "chunk": torch.chunk,
-    "clamp": torch.clamp,
-    "clone": torch.clone,
-    "exp": torch.exp,
-    "flatten": torch.flatten,
-    "flip": torch.flip,
-    "floor": torch.floor,
-    "index_select": torch.index_select,
-    "log2": torch.log2,
-    "log_softmax": torch.nn.functional.log_softmax,
-    "max": torch.max,
-    "mean": torch.mean,
-    "min": torch.min,
-    "mul_": operator.imul,
-    "narrow": torch.narrow,
-    "ne": torch.ne,
-    "nonzero": torch.nonzero,
-    "numel": torch.numel,
-    "pow": torch.pow,
-    "round": torch.round,
-    "rsqrt": torch.rsqrt,
-    "sigmoid": torch.sigmoid,
-    "softmax": torch.nn.functional.softmax,
-    "sort": torch.sort,
-    "split": torch.split,
-    "squeeze": torch.squeeze,
-    "std": torch.std,
-    "sum": torch.sum,
-    "topk": torch.topk,
-    "transpose": torch.transpose,
-    "tril": torch.tril,
-    "t": torch.t,
-    "unbind": torch.unbind,
-    "unsqueeze": torch.unsqueeze,
-}
-DONT_EXPAND_MODULES = {
-    # These have internal control flow
-    "ConvTranspose1d",
-    "ConvTranspose2d",
-    "Conv2d",
-    "ConvReLU2d",
-    "ConvBn2d",
-    "ConvBnReLU2d",
-    "EmbeddingBag",
-    "InstanceNorm2d",
-    "LSTM",
-}
-
-F = torch.nn.functional
-INPLACE_KEYWORD_OPS = {
-    F.mish,
-    F.silu,
-    F.hardsigmoid,
-    F.rrelu,
-    F.leaky_relu,
-    F.celu,
-    F.selu,
-    F.elu,
-    F.relu6,
-    F.hardswish,
-    F.hardtanh,
-    F.relu,
-    F.threshold,
-}
-IOPERATOR_REPLACEMENTS = {
-    "masked_fill_": "masked_fill",
-    "scatter_": "scatter",
-    "unsqueeze_": "unsqueeze",
-    torch.relu_: torch.relu,
-    torch.sigmoid_: torch.sigmoid,
-    operator.iadd: torch.add,
-    operator.iand: torch.bitwise_and,
-    operator.ifloordiv: functools.partial(torch.div, rounding_mode="floor"),
-    operator.itruediv: torch.div,
-    operator.imul: torch.mul,
-    operator.imatmul: torch.matmul,
-    operator.ior: torch.bitwise_or,
-    operator.ipow: torch.pow,
-    operator.isub: torch.sub,
-    operator.ixor: torch.bitwise_xor,
-}
-OPERATOR_REPLACEMENTS = {
-    operator.lt: torch.lt,
-    operator.le: torch.le,
-    operator.eq: torch.eq,
-    operator.ne: torch.ne,
-    operator.ge: torch.ge,
-    operator.gt: torch.gt,
-    operator.abs: torch.abs,
-    operator.add: torch.add,
-    operator.and_: torch.bitwise_and,
-    operator.floordiv: functools.partial(torch.div, rounding_mode="floor"),
-    # operator.truediv: torch.div,  # TODO(jansel): debug issue in vision_maskrcnn
-    operator.inv: torch.bitwise_not,
-    operator.invert: torch.bitwise_not,
-    operator.mod: torch.remainder,
-    operator.mul: torch.mul,
-    operator.matmul: torch.matmul,
-    operator.neg: torch.neg,
-    operator.or_: torch.bitwise_or,
-    operator.pos: torch.positive,
-    operator.pow: torch.pow,
-    operator.sub: torch.sub,
-    operator.xor: torch.bitwise_xor,
-    torch.nn.functional.sigmoid: torch.sigmoid,
-    torch.nn.functional.tanh: torch.tanh,
-    torch.nn.functional.relu: torch.relu,
-}
-
-SKIP_INPLACE = {
-    v
-    for v in itertools.chain(
-        math.__dict__.values(), builtins.__dict__.values(), operator.__dict__.values()
-    )
-    if callable(v)
-}
-
-
-def always_true(*args, **kwargs):
-    return True
-
-
-class InliningTracer(torch.fx.Tracer):
-    def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool:
-        return False
-
-
-def expand_module_call(prefix, graph: torch.fx.Graph, module, args, kwargs):
-    # this patch is needed to make BatchNorm2D FX trace
-    module.__dict__["_check_input_dim"] = always_true
-    try:
-        assert not kwargs
-        arg_index = itertools.count()
-        vars = dict()
-        for node in InliningTracer().trace(module).nodes:
-            if node.op == "placeholder":
-                vars[node] = args[next(arg_index)]
-            elif node.op == "output":
-                assert len(node.args) == 1
-                return vars[node.args[0]]
-            elif node.op == "get_attr":
-                vars[node] = graph.get_attr(f"{prefix}{node.target}")
-            else:
-                vars[node] = graph.node_copy(node, vars.__getitem__)
-        raise AssertionError("unreachable")
-    except Exception:
-        print(f"Error while expanding {module.__class__.__name__}")
-        raise
-    finally:
-        del module.__dict__["_check_input_dim"]
-
-
-@dataclasses.dataclass
-class NodeCounts:
-    usages: int = 0
-
-
-def short_name(gm, node: torch.fx.Node):
-    if node.op == "call_function":
-        return node.target.__name__
-    elif node.op == "call_method":
-        return node.target
-    elif node.op == "call_module":
-        return gm.get_submodule(node.target).__class__.__name__
-    elif node.op == "get_attr":
-        return node.target
-    elif node.op == "output":
-        return "output"
-    raise AssertionError(node.op)
-
-
-def long_name(gm, node: torch.fx.Node):
-    name = short_name(gm, node)
-    target = node.target
-    if node.op == "call_function":
-        return torch_get_name(
-            node.target, f"{getattr(target, '__module__', '')}.{name}"
-        )
-    elif node.op == "call_method":
-        return name
-    elif node.op == "call_module":
-        target = gm.get_submodule(target).__class__
-        return f"{getattr(target, '__module__', '')}.{getattr(target, '__name__', '')}"
-    elif node.op == "get_attr":
-        return name
-    elif node.op == "output":
-        return "output"
-    raise AssertionError("unreachable")
-
-
-class Inplacifier:
-    def __init__(self, gm: torch.fx.GraphModule):
-        self.gm = gm
-
-    def can_be_view(self, node):
-        name = short_name(self.gm, node)
-        return name in VIEW_OPS or name in MAYBE_VIEW_OPS
-
-    def inplacify(self):
-        counts = dict()
-
-        def record_usage(node):
-            counts[node].usages += 1
-            return node
-
-        for node in self.gm.graph.nodes:
-            if node.op in ("call_function", "call_method", "call_module"):
-                if self.can_be_view(node):
-                    # Aliasing
-                    counts[node] = counts[node.args[0]]
-                elif "out" in node.kwargs:
-                    counts[node] = counts[node.kwargs["out"]]
-                else:
-                    counts[node] = NodeCounts(0)
-            else:
-                counts[node] = NodeCounts(float("inf"))
-
-        for node in reversed(list(self.gm.graph.nodes)):
-            kwargs = dict(node.kwargs)
-            if "inplace" in kwargs:
-                kwargs.pop("inplace")
-            if node.op == "call_function" and len(node.args) + len(kwargs) == 1:
-                arg = node.args[0] if node.args else next(kwargs.values())
-                if isinstance(arg, torch.fx.Node) and counts[arg].usages == 0:
-                    if node.target in SKIP_INPLACE:
-                        continue
-                    elif node.target in INPLACE_KEYWORD_OPS:
-                        kwargs["inplace"] = True
-                        counters["optimizations"]["inplace"] += 1
-                    elif " out: torch.Tensor" in repr(
-                        get_signature_for_torch_op(node.target)
-                    ):
-                        kwargs["out"] = arg
-                        counters["optimizations"]["out"] += 1
-                    else:
-                        continue
-                    with self.gm.graph.inserting_before(node):
-                        node.replace_all_uses_with(
-                            self.gm.graph.call_function(node.target, node.args, kwargs)
-                        )
-                    self.gm.graph.erase_node(node)
-
-            torch.fx.map_arg((node.args, node.kwargs), record_usage)
-
-
-class Functionalization(Transformer):
-    """
-    Remove most cases of mutation from a given fx Graph.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super(Functionalization, self).__init__(*args, **kwargs)
-        self.tracer.tensor_attrs = dict()  # TODO(jansel): upstream this fix
-
-    def run_node(self, n: torch.fx.Node):
-
-        patches = []
-        target = n.target
-        args, kwargs = self.fetch_args_kwargs_from_env(n)
-        kwargs = dict(kwargs)
-
-        if (
-            not n.meta["is_input_mutation"]
-            and not n.meta["partial_mutation"]
-            and issubclass(n.meta["type"], torch.Tensor)
-        ):
-            if "inplace" in n.kwargs:
-                if kwargs["inplace"]:
-                    patches.append(n.args[0])
-                kwargs.pop("inplace")
-            elif "out" in n.kwargs:
-                kwargs.pop("out")
-                patches.append(n.kwargs["out"])
-            elif n.target in IOPERATOR_REPLACEMENTS:
-                target = IOPERATOR_REPLACEMENTS[n.target]
-                patches.append(n.args[0])
-            elif n.meta["is_mutation"]:
-                counters["mutation"][long_name(self.module, n)] += 1
-
-            if target in OPERATOR_REPLACEMENTS and not kwargs:
-                target = OPERATOR_REPLACEMENTS[target]
-
-        if target is builtins.getattr:
-            if args[1] == "dtype":
-                return n.args[0].meta["dtype"]
-            elif args[1] == "device":
-                return n.args[0].meta["device"]
-            else:
-                counters["getattr"][args[1]] += 1
-
-        if isinstance(target, functools.partial):
-            assert not target.args
-            kwargs.update(target.keywords)
-            target = target.func
-
-        if not issubclass(n.meta["type"], torch.Tensor):
-            counters["nontensor"][long_name(self.module, n)] += 1
-
-        with self._set_current_node(n):
-            result = getattr(self, n.op)(target, args, kwargs)
-
-            # For inplace operators, the output dtype should be equal to the
-            # dtype of tensor being inplace modified.
-            if n.target in IOPERATOR_REPLACEMENTS:
-                result = self.call_method("to", (result, n.args[0].meta["dtype"]), {})
-
-        for patch in patches:
-            assert isinstance(
-                patch, torch.fx.Node
-            ), f"{patch} {n.target} {n.args} {n.kwargs}"
-            if patch in self.env:
-                self.env[patch] = result
-
-        return result
-
-
-def swap_node(graph, old_node, new_node):
-    old_node.replace_all_uses_with(new_node)
-    graph.erase_node(old_node)
-    new_node.meta = old_node.meta
-
-
-def normalize(gm: torch.fx.GraphModule):
-    # gm.graph.print_tabular()
-    graph: torch.fx.Graph = gm.graph
-
-    for node in list(graph.nodes):
-        with graph.inserting_before(node):
-            if node.op == "call_method" and node.target in NORMALIZE_METHODS:
-                swap_node(
-                    graph,
-                    node,
-                    graph.call_function(
-                        NORMALIZE_METHODS[node.target], node.args, node.kwargs
-                    ),
-                )
-            elif node.op == "call_module":
-                submod = gm.get_submodule(node.target)
-                if submod.__class__.__name__ not in DONT_EXPAND_MODULES:
-                    swap_node(
-                        graph,
-                        node,
-                        expand_module_call(
-                            f"{node.target}.", graph, submod, node.args, node.kwargs
-                        ),
-                    )
-
-    # gm.graph.print_tabular()
-
-
-def normalize_ir(gm, example_inputs):
-    if config.normalize_ir:
-        example_inputs = clone_inputs(example_inputs)
-        normalize(gm)
-        try:
-            gm = NormalizeOperators(gm).transform()
-        except AttributeError:
-            # log.exception("NormalizeOperators() failed")
-            pass
-        ShapeAliasingAndMutationProp(gm).run(*example_inputs)
-        gm = Functionalization(gm).transform()
-    gm.recompile()
-    # record_graph_stats(gm)
-    return gm
diff --git a/torch/_dynamo/optimizations/subgraph.py b/torch/_dynamo/optimizations/subgraph.py
deleted file mode 100644
index 55b773675566..000000000000
--- a/torch/_dynamo/optimizations/subgraph.py
+++ /dev/null
@@ -1,236 +0,0 @@
-import functools
-import importlib
-import itertools
-import json
-import logging
-import math
-import operator
-import os
-
-import torch
-
-from .. import config
-from ..utils import check_is_cuda, checkpoint_params, is_jit_model, torchscript
-
-log = logging.getLogger(__name__)
-
-
-def cached(fn):
-    cached_name = f"_{fn.__name__}"
-
-    @functools.wraps(fn)
-    def inner(self):
-        if hasattr(self, cached_name):
-            return getattr(self, cached_name)
-        result = fn(self)
-        setattr(self, cached_name, result)
-        return result
-
-    return inner
-
-
-def load_module_fx(name):
-    pymod = importlib.import_module(f"subgraphs.{name}")
-    # TODO(jansel): upstream these fixes to to_folder()
-    pymod.module._operator_iadd = operator.iadd
-    pymod.module._operator_imul = operator.imul
-    pymod.module._operator_itruediv = operator.itruediv
-    pymod.module._operator_setitem = operator.setitem
-    pymod.module.math_sqrt = math.sqrt
-    pymod.module.device = torch.device
-    pymod.module.inf = float("inf")
-    return pymod.FxModule()
-
-
-def load_module_jit(name):
-    filename = os.path.join(config.base_dir, "subgraphs", name, "model.ts")
-    if not os.path.exists(filename):
-        return None
-    model = torch.jit.load(filename)
-    assert is_jit_model(model)
-    return model
-
-
-class SubGraph(object):
-    @classmethod
-    def load(cls, name):
-        model_dir = os.path.join(config.base_dir, "subgraphs", name)
-        example_inputs = torch.load(os.path.join(model_dir, "example_inputs.pt"))
-        example_outputs = torch.load(os.path.join(model_dir, "example_outputs.pt"))
-        metadata = json.loads(open(os.path.join(model_dir, "metadata.json")).read())
-        model_fx = load_module_fx(name)
-        model_jit = load_module_jit(name)
-        is_cuda = metadata["is_cuda"]
-
-        assert model_jit is not None
-
-        torch.set_rng_state(torch.load(os.path.join(model_dir, "rng_state.pt")))
-        if is_cuda:
-            model_jit = model_jit.cuda()
-        restore_jit = checkpoint_params(model_jit)
-        if model_fx is not None:
-            if is_cuda:
-                model_fx = model_fx.cuda()
-            restore_fx = checkpoint_params(model_fx)
-        else:
-            model_fx = model_jit
-            restore_fx = restore_jit
-
-        def restore():
-            restore_fx()
-            restore_jit()
-
-        subgraph = cls(model_fx, example_inputs, model_dir)
-        subgraph._scripted = model_jit
-        subgraph._example_outputs = example_outputs
-        subgraph._is_cuda = is_cuda
-        subgraph.restore = restore
-        return subgraph
-
-    def __init__(self, model, example_inputs, model_dir):
-        super(SubGraph, self).__init__()
-        self.model = model
-        self.example_inputs = example_inputs
-        self.model_dir = model_dir
-
-    def filename(self, name):
-        return os.path.join(self.model_dir, name)
-
-    @property
-    @cached
-    def scripted(self):
-        return torchscript(self.model, self.example_inputs)
-
-    @property
-    @cached
-    def example_outputs(self):
-        filename = self.filename("example_outputs.pt")
-        if os.path.exists(filename):
-            return torch.load(filename)
-        result = self.model(*self.example_inputs)
-        torch.save(result, filename)
-        return result
-
-    @property
-    def example_outputs_list(self):
-        if self.is_tensor_output:
-            return [self.example_outputs]
-        return self.example_outputs
-
-    @property
-    def input_names(self):
-        return [f"i{i}" for i in range(len(self.example_inputs))]
-
-    @property
-    def is_tensor_output(self):
-        return not isinstance(self.example_outputs, (list, tuple))
-
-    @property
-    def output_names(self):
-        return [f"o{x}" for x in range(len(self.example_outputs_list))]
-
-    @property
-    def device_index(self):
-        return 0
-
-    @property
-    @cached
-    def onnx_filename(self):
-        filename = self.filename("onnx")
-        if os.path.exists(filename):
-            return filename
-
-        try:
-            torch.onnx.export(
-                self.scripted,
-                self.example_inputs,
-                filename,
-                input_names=self.input_names,
-                output_names=self.output_names,
-                do_constant_folding=True,
-                opset_version=14,
-            )
-        except IndexError:
-            # work around bug in constant folding pass
-            torch.onnx.export(
-                self.scripted,
-                self.example_inputs,
-                filename,
-                input_names=self.input_names,
-                output_names=self.output_names,
-                do_constant_folding=False,
-                opset_version=14,
-            )
-        return filename
-
-    @property
-    def is_cpu(self):
-        return not self.is_cuda
-
-    @property
-    @cached
-    def is_cuda(self):
-        return check_is_cuda(self.model, self.example_inputs)
-
-    @property
-    def output_specs(self):
-        return [
-            (o.shape, o.dtype, o.layout, o.device, o.requires_grad)
-            for o in self.example_outputs_list
-        ]
-
-    def empty_outputs_factory(self):
-        specs = self.output_specs
-
-        def create():
-            return [
-                torch.empty(
-                    shape,
-                    dtype=dtype,
-                    layout=layout,
-                    device=device,
-                    requires_grad=requires_grad,
-                )
-                for shape, dtype, layout, device, requires_grad in specs
-            ]
-
-        return create
-
-    def wrap_returns(self, fn):
-        """Fix [Tensor()] vs Tensor() return type issues"""
-        expected = self.example_outputs
-        actual = fn(*self.example_inputs)
-        if isinstance(expected, (list, tuple)) and not isinstance(
-            actual, (list, tuple)
-        ):
-            assert len(expected) == 1
-            if isinstance(expected, tuple):
-                return lambda *args: (fn(*args),)
-            else:
-                return lambda *args: [fn(*args)]
-        elif not isinstance(expected, (list, tuple)) and isinstance(
-            actual, (list, tuple)
-        ):
-            assert len(actual) == 1
-            return lambda *args: fn(*args)[0]
-        elif isinstance(expected, (list, tuple)) and isinstance(actual, (list, tuple)):
-            assert len(actual) == len(expected)
-            return fn
-        else:
-            return fn
-
-    def has_dtype(self, dtype):
-        for x in itertools.chain(
-            self.example_inputs, self.scripted.parameters(), self.scripted.buffers()
-        ):
-            if x.dtype == dtype:
-                return True
-        return False
-
-    def will_tensorrt_barf(self):
-        return False
-        # code = torch.jit.freeze(self.scripted).code
-        # TODO(jansel): submit a bug report for this one, issue is in opacus_cifar10
-        # if "group_norm" in code or "einsum" in code:
-        #    return True
-        # return self.has_dtype(torch.int64)
diff --git a/torch/_dynamo/optimizations/torchxla_integration.py b/torch/_dynamo/optimizations/torchxla_integration.py
deleted file mode 100644
index 9db5351b70db..000000000000
--- a/torch/_dynamo/optimizations/torchxla_integration.py
+++ /dev/null
@@ -1,331 +0,0 @@
-import dataclasses
-
-import functools
-import itertools
-import os
-import time
-from typing import Any, Dict, List
-
-import torch
-
-debug = os.environ.get("TORCH_XLA_DEBUG") == "1"
-
-
-@dataclasses.dataclass
-class GraphInputMatcher:
-    """
-    The GraphInputMatcher class setup the graph inputs for future calls after lazy tracing.
-    Specifically, those graph inputs corresponding to method parameters should be replaced with the
-    arguments for the current call.
-
-    tensor_id_to_arg_idx maps the tensor id to the parameter index.
-    graph_input_tensor_ids, graph_input_xla_values list the tensor_id and ivalue for each of the
-    TS/XLA graph inputs.
-    """
-
-    tensor_id_to_arg_idx: Dict[int, int]
-    graph_input_tensor_ids: List[int]
-    # there are 2 categories of graph_input_tensors.
-    # Category 1: those whose id are not found in tensor_id_to_arg_idx. These are
-    # most likely const tensors and we can get its content from graph_input_tensors
-    # Category 2: those whose id are found in tensor_id_to_arg_idx. We should get
-    #  the tensor from method arguments
-    graph_input_xla_values: List[Any]
-
-    # get the real graph input tensors
-    def __call__(self, args):
-        real_input = []
-        for tensor_id, traced_xla_value in zip(
-            self.graph_input_tensor_ids, self.graph_input_xla_values
-        ):
-            arg_idx = self.tensor_id_to_arg_idx.get(tensor_id, None)
-            # Instead of use trace time base seed, use the runtime
-            # base seed here.
-            if tensor_id == torch_xla._XLAC._get_seed_info_id():
-                inp = torch_xla._XLAC._get_base_seed_as_tensor(
-                    str(traced_xla_value.device)
-                )
-            elif arg_idx is None:
-                inp = traced_xla_value
-            else:
-                inp = args[arg_idx]
-            real_input.append(inp)
-        return real_input
-
-
-def get_fallback_ops():
-    fallback_ops = []
-    for opname in metrics.counter_names():
-        if "aten::" not in opname:
-            continue
-        val = int(metrics.counter_value(opname))
-        if val > 0:
-            fallback_ops.append(f"{opname}={val}")
-
-    return fallback_ops
-
-
-@functools.lru_cache(None)
-def import_torchxla():
-    """
-    CI will run test_circular_dependencies in test/test_testing.py
-    which tries to import all modules found.
-    Enclosing the imports in a function so CI that does not have torch_xla
-    installed will not break.
-    """
-    global torch_xla, xm, metrics
-    import torch_xla
-    import torch_xla.core.xla_model as xm
-    import torch_xla.debug.metrics as metrics
-
-
-class Deduper:
-    def __init__(self):
-        # origlist index to dedupedlist index
-        self.permute_for_orig = None
-
-    def dedup(self, origlist):
-        self.permute_for_orig = []
-        deduped_ids = dict()
-        deduped_list = []
-        for item in origlist:
-            item_id = id(item)
-            if item_id not in deduped_ids:
-                deduped_ids[item_id] = len(deduped_ids)
-                deduped_list.append(item)
-            self.permute_for_orig.append(deduped_ids[item_id])
-
-        return deduped_list
-
-    def recover(self, deduped_list):
-        assert len(self.permute_for_orig) >= len(deduped_list)
-        return [deduped_list[i] for i in self.permute_for_orig]
-
-
-class DumbReturnHandler:
-    """
-    Define dumb return as an output that is also an input.
-    Torch xla does not return such tensors as its graph output. That breaks the
-    API contract with the caller of the graph. Also AOTAutograd
-    may generate such a graph quite often.
-
-    To avoid break the contract with the user of the GraphModule, we need
-    add those outputs manually.
-
-    Check https://github.com/pytorch/pytorch/pull/89536 for details.
-
-    AOTAutograd may also generate graph with duplicated return item.
-    E.g. https://gist.github.com/shunting314/e60df8ac21fbe2494337c10d02bd78dc
-    (this is a graph generated for a model with a single BatchNorm2d)
-    XLA will dedup those duplicate items, but we need recover the duplications to maintain
-    the contract with the caller.
-    """
-
-    def __init__(self, trace_inputs, trace_outputs, trace_inputs_inplace_update_bool):
-        self.trace_inputs = trace_inputs
-        self.trace_outputs = trace_outputs
-
-        # dedup the traced outputs first
-        self.deduper = Deduper()
-        self.deduped_trace_outputs = self.deduper.dedup(self.trace_outputs)
-
-        if debug:
-            print(
-                f"Number of duplicated outputs {len(self.trace_outputs) - len(self.deduped_trace_outputs)})"
-            )
-
-        # record the output that is also a input
-        trace_inputs_id2pos = {id(x): pos for pos, x in enumerate(self.trace_inputs)}
-        self.trace_outputs_pos_to_inputs_pos = []
-        for out_pos, out in enumerate(self.deduped_trace_outputs):
-            in_pos = trace_inputs_id2pos.get(id(out), None)
-            if in_pos is not None and not trace_inputs_inplace_update_bool[in_pos]:
-                self.trace_outputs_pos_to_inputs_pos.append((out_pos, in_pos))
-
-        if debug:
-            print(
-                f"Number trace input {len(trace_inputs)}, number trace output {len(trace_outputs)}"
-            )
-            print(
-                f"Found {len(self.trace_outputs_pos_to_inputs_pos)} dumb returns: {self.trace_outputs_pos_to_inputs_pos}"
-            )
-
-    def addDumbReturn(self, real_inputs, real_outputs):
-        for out_pos, in_pos in self.trace_outputs_pos_to_inputs_pos:
-            assert in_pos < len(real_inputs)
-            # equals is fine since we can append an item at the end
-            assert out_pos <= len(real_outputs)
-
-            real_outputs.insert(out_pos, real_inputs[in_pos])
-
-        ret = self.deduper.recover(real_outputs)
-        return ret
-
-
-class NoneRemover:
-    """
-    torchxla pybind APIs that accepts a Tensor list does not expect None value on
-    the list. But some graph (e.g. backward graph generated by aot autograd) may
-    return a None value. We need strip those None value before sending the list to
-    those torchxla APIs. We need add None value back later after running the
-    compiled graph from torchxla.
-    """
-
-    def __init__(self):
-        self.none_poslist = []
-
-    def remove_nones(self, value_list):
-        """
-        Remove none from value_list. value_list will be inplace updated.
-        The original position of None values are recorded.
-        """
-        num = len(value_list)
-
-        # work in reverse order
-        for i in reversed(range(num)):
-            if value_list[i] is None:
-                self.none_poslist.append(i)
-                del value_list[i]
-
-        self.none_poslist.reverse()
-
-    def add_nones(self, value_list):
-        """
-        Add nones to value_list according to self.none_poslist. value_list
-        is inplace updated.
-        """
-        for pos in self.none_poslist:
-            value_list.insert(pos, None)
-
-
-def is_xla_tensor(tensor: torch.Tensor) -> bool:
-    return tensor.device.type == "xla"
-
-
-def extract_compiled_graph(xla_model: torch.fx.GraphModule, xla_args):
-    import_torchxla()
-
-    assert all(
-        map(
-            is_xla_tensor,
-            filter(
-                lambda x: isinstance(x, torch.Tensor),
-                itertools.chain(xla_model.parameters(), xla_args),
-            ),
-        )
-    ), "All tensors should be on xla"
-
-    # This call is critical to make sure xla_args' tensor id show up in graph_input_tensor_ids
-    xm.mark_step()
-    args_tensor_ids = [
-        torch_xla._XLAC._xla_get_tensor_id(xla_arg) for xla_arg in xla_args
-    ]
-
-    if debug:
-        print(f"Graph module:\n{xla_model.code}")
-        print(f"args_tensor_ids {args_tensor_ids}")
-
-    tensor_id_to_arg_idx = {tensor_id: i for i, tensor_id in enumerate(args_tensor_ids)}
-
-    # get_fallback_ops below uses counters to detect torch_xla fallbacks.
-    # Clear the counters here so we ignore pre-existing fallbacks and
-    # only detect fallbacks happening when running the xla_model below.
-    metrics.clear_counters()
-    xla_out = xla_model(*xla_args)
-
-    fallback_ops = get_fallback_ops()
-    if len(fallback_ops) > 0:
-        raise RuntimeError(
-            f"Fail to extact the compiled graph because of fallback: {','.join(fallback_ops)}"
-        )
-
-    if not isinstance(xla_out, (tuple, list)):
-        xla_out = (xla_out,)
-
-    none_remover = NoneRemover()
-    none_remover.remove_nones(xla_out)
-
-    xla_out_ids = {id(x) for x in xla_out}
-
-    # If a arg is being in place updated by model, we need to include arg as part of the graph result.
-    xla_args_need_update_bool = torch_xla._XLAC._check_tensor_need_materialization(
-        xla_args
-    )
-    xla_args_need_update = []
-    arg_index_to_need_update_index = {}
-    for i, need_update in enumerate(xla_args_need_update_bool):
-        # Don't add inplace updated argument to the list if it's already
-        # being returned
-        if need_update and id(xla_args[i]) not in xla_out_ids:
-            arg_index_to_need_update_index[i] = len(xla_args_need_update)
-            xla_args_need_update.append(xla_args[i])
-
-    args_and_out = tuple(xla_args_need_update) + tuple(xla_out)
-
-    if debug:
-        print(f"#inplace update: {len(xla_args_need_update)}")
-        print(f"XLA IR Text: {torch_xla._XLAC._get_xla_tensors_text(args_and_out)}")
-
-    # calculate graph hash
-    dumb_return_handler = DumbReturnHandler(
-        xla_args, args_and_out, xla_args_need_update_bool
-    )
-    graph_hash = torch_xla._XLAC._get_graph_hash(args_and_out)
-    if debug:
-        print("graph_hash", graph_hash)
-
-    (
-        graph_input_tensor_ids,
-        graph_input_xla_values,
-    ) = torch_xla._XLAC._get_tensors_xla_device_data_node(args_and_out)
-    if debug:
-        print(f"graph_input_tensor_ids {graph_input_tensor_ids}")
-    assert len(graph_input_tensor_ids) == len(
-        graph_input_xla_values
-    ), f"{len(graph_input_tensor_ids)} v.s. {len(graph_input_xla_values)}"
-    graph_input_matcher = GraphInputMatcher(
-        tensor_id_to_arg_idx, graph_input_tensor_ids, graph_input_xla_values
-    )
-
-    # compiles+runs graph rooted at tensors in 'args_and_out'
-    torch_xla._XLAC._xla_sync_multi(args_and_out, [])
-    torch_xla._XLAC._clear_pending_irs(str(xm.xla_device()))
-
-    def optimized_mod(*args):
-        torch_xla._XLAC._xla_sync_multi(args, [])
-        enter_ts = time.time()
-        if len(args_and_out) == 0:
-            return ()
-
-        assert len(args) > 0  # can not handle no args case for now
-        graph_input = graph_input_matcher(args)
-        start_ts = time.time()
-        res = torch_xla._XLAC._run_cached_graph(graph_hash, graph_input)
-        res = dumb_return_handler.addDumbReturn(args, res)
-        if debug:
-            print(
-                f"torchxla reuse compiled graph run_cached_graph takes {time.time() - start_ts} seconds"
-            )
-
-        args_inplace_update_ts = time.time()
-        assert len(res) == len(args_and_out), f"{len(res)} v.s. {len(args_and_out)}"
-        ncopy = 0
-
-        for arg_index, res_index in arg_index_to_need_update_index.items():
-            args[arg_index].copy_(res[res_index])
-
-        if debug:
-            print(
-                f"Copy {ncopy} args takes {time.time() - args_inplace_update_ts} seconds"
-            )
-
-        # First few elements might be xla_args that needs to be in place updated
-        result = res[len(xla_args_need_update) :]
-        if debug:
-            print(f"optimized_mod takes {time.time() - enter_ts} seconds overall")
-
-        xm.mark_step()
-        none_remover.add_nones(result)
-        return result
-
-    return optimized_mod
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
deleted file mode 100644
index 4ffe431e96aa..000000000000
--- a/torch/_dynamo/optimizations/training.py
+++ /dev/null
@@ -1,380 +0,0 @@
-import functools
-import logging
-import operator
-from collections import defaultdict
-from functools import partial
-from importlib import import_module
-from typing import Set
-
-from functorch.compile import (
-    aot_module_simplified,
-    min_cut_rematerialization_partition,
-    nop,
-    ts_compile,
-)
-
-import torch
-
-from torch._functorch.compilers import debug_nop
-from torch.fx import GraphModule
-from torch.fx.passes.backends.cudagraphs import partition_cudagraphs
-from torch.multiprocessing.reductions import StorageWeakRef
-from torch.nn import Module
-from torch.utils._pytree import tree_map
-
-from .. import config, eval_frame
-from ..utils import clone_inputs, counters
-from .backends import BACKENDS
-from .normalize import normalize_ir
-
-log = logging.getLogger(__name__)
-
-
-def aot_autograd(**kwargs):
-    def compiler_fn(gm: torch.fx.GraphModule, example_inputs):
-        import functorch.compile
-
-        # Hack to get around circular import problems with aot_eager_decomp_partition
-        if callable(kwargs.get("decompositions")):
-            kwargs["decompositions"] = kwargs["decompositions"]()
-
-        # TODO: stop monkeypatching here (without even cleaning up, UGH!)
-        functorch.compile.config.use_functionalize = True
-        functorch.compile.config.use_fake_tensor = True
-
-        counters["aot_autograd"]["total"] += 1
-        use_fallback = False
-
-        if not functorch.compile.config.use_functionalize and config.normalize_ir:
-            try:
-                gm = normalize_ir(gm, clone_inputs(example_inputs))
-            except Exception:
-                log.debug("TorchDynamo unable to remove mutation")
-                use_fallback = True
-
-        if use_fallback:
-            log.debug("Unable to use AOT Autograd because graph has mutation")
-            counters["aot_autograd"]["not_ok"] += 1
-            return gm
-
-        # OK attempt to compile
-
-        def _wrapped_bw_compiler(*args, **kwargs):
-            # stop TorchDynamo from trying to compile our generated backwards pass
-            return eval_frame.disable(eval_frame.disable(bw_compiler)(*args, **kwargs))
-
-        bw_compiler = kwargs.get("bw_compiler") or kwargs["fw_compiler"]
-        kwargs["bw_compiler"] = _wrapped_bw_compiler
-
-        from torch._inductor.debug import enable_aot_logging
-
-        try:
-            # NB: NOT cloned!
-            with enable_aot_logging():
-                cg = aot_module_simplified(gm, example_inputs, **kwargs)
-                counters["aot_autograd"]["ok"] += 1
-                return eval_frame.disable(cg)
-        except Exception:
-            counters["aot_autograd"]["not_ok"] += 1
-            raise
-
-    return compiler_fn
-
-
-DEBUG = False
-
-# Useful for debugging purpose
-aot_eager = aot_autograd(fw_compiler=debug_nop if DEBUG else nop)
-
-# AOT Autograd with torchscript backend. Default partitioner.
-aot_ts = aot_autograd(fw_compiler=ts_compile)
-
-# Uses TorchInductor AOT Autograd decomps and partitioner to isolate aot vs
-# inductor problems.
-aot_eager_decomp_partition = aot_autograd(
-    # these are taken from memory_efficient_fusion()
-    fw_compiler=nop,
-    bw_compiler=nop,
-    # NB: lambda here is to delay import of inductor
-    decompositions=lambda: import_module(
-        f"{config.inductor_import}.compile_fx"
-    ).select_decomp_table(),
-    partition_fn=functools.partial(
-        min_cut_rematerialization_partition, compiler="inductor"
-    ),
-)
-
-
-def mem_efficient_fusion_kwargs(use_decomps):
-    from functorch.compile import (
-        default_decompositions,
-        min_cut_rematerialization_partition,
-        ts_compile,
-    )
-
-    kwargs = {
-        # these are taken from memory_efficient_fusion()
-        "fw_compiler": ts_compile,
-        "bw_compiler": ts_compile,
-        "partition_fn": min_cut_rematerialization_partition,
-    }
-
-    if use_decomps:
-        kwargs["decompositions"] = default_decompositions
-
-    return kwargs
-
-
-# Use min cut rematerialization and TorchScript+nvFuser with AOT Autograd
-aot_mem_efficient_fusion = aot_autograd(**mem_efficient_fusion_kwargs(use_decomps=True))
-aot_mem_efficient_fusion_no_decomp = aot_autograd(
-    **mem_efficient_fusion_kwargs(use_decomps=False)
-)
-
-# Pass TorchScript+nvFuser context to TorchDynamo
-aot_mem_efficient_fusion.backend_ctx_ctor = lambda: torch.jit.fuser("fuser2")
-aot_mem_efficient_fusion_no_decomp.backend_ctx_ctor = lambda: torch.jit.fuser("fuser2")
-
-
-def prims_executor(gm, inputs, *, executor):
-    from functorch.compile import make_boxed_func
-
-    # This function is called once per forward/backward pass of a graph in AOT
-    # Autograd. We use it to set up the nvFuser-specific FX graph and return
-    # execute function.
-    from torch._prims.context import TorchRefsNvfuserCapabilityMode
-    from torch._prims.executor import execute
-    from torch.fx.experimental.proxy_tensor import make_fx
-
-    # AOT Autograd might not use the partitioner, so we need to make sure that
-    # the graph is transformed to use nvFuser-compatible nodes.
-    if not getattr(gm, "_nvprim_transformed", False):
-        with TorchRefsNvfuserCapabilityMode():
-            gm = make_fx(gm)(*inputs)
-
-    # Then we return a callable that executes the "gm" graph
-    return make_boxed_func(partial(execute, gm, executor=executor))
-
-
-def nvprims_fw_bw_partition_fn(joint_module, joint_inputs, *, num_fwd_outputs):
-    # This function is called once per forward+backward pass of a graph in AOT
-    # Autograd. We use it to set up the nvFuser-specific FX graph that is later
-    # passed to the executor.
-    from functorch.compile import min_cut_rematerialization_partition
-
-    from torch._prims.context import TorchRefsNvfuserCapabilityMode
-    from torch.fx.experimental.proxy_tensor import make_fx
-
-    # AOT Autograd expects arguments of the traced function to be named exactly
-    # "primals, tangents"
-    def func(primals, tangents):
-        return joint_module(primals, tangents)
-
-    # First we trace the graph conditionally decomposing nodes
-    # that can be sent to the nvfuser executor
-    with TorchRefsNvfuserCapabilityMode():
-        prim_gm = make_fx(func)(*joint_inputs)
-
-    # all nvprims for now
-    recomputable_ops = {
-        getattr(torch.ops.nvprims, prim)
-        for prim in dir(torch.ops.nvprims)
-        if isinstance(getattr(torch.ops.nvprims, prim), torch._ops.OpOverloadPacket)
-        and getattr(torch.ops.nvprims, prim).is_recomputable
-    }
-
-    fw_gm, bw_gm = min_cut_rematerialization_partition(
-        prim_gm,
-        joint_inputs,
-        recomputable_ops=recomputable_ops,
-        num_fwd_outputs=num_fwd_outputs,
-    )
-    # AOT Autograd might not use the partitioner, so we need to make sure that
-    # the graph is marked as already transformed to use nvFuser-compatible nodes
-    fw_gm._nvprim_transformed = True
-    bw_gm._nvprim_transformed = True
-    return fw_gm, bw_gm
-
-
-def create_nvprims_backend(*, executor):
-    return aot_autograd(
-        fw_compiler=partial(prims_executor, executor=executor),
-        bw_compiler=partial(prims_executor, executor=executor),
-        partition_fn=nvprims_fw_bw_partition_fn,
-    )
-
-
-aot_nvprims_nvfuser = create_nvprims_backend(executor="nvfuser")
-aot_nvprims_aten = create_nvprims_backend(executor="aten")
-
-
-def cloner(t):
-    if isinstance(t, torch.Tensor):
-        return t.clone()
-    else:
-        return t
-
-
-class CudaGraphModule(Module):
-    gm: GraphModule
-    mutated_inputs: Set[int]
-
-    def __init__(self, gm, mutated_inputs):
-        super().__init__()
-        self.gm = gm
-        self.mutated_inputs = mutated_inputs
-
-    warmed_up = False
-
-    # these are all None or all filled
-    graph = None
-    static_inputs = None
-    static_outputs = None
-
-    # NB: we override __call__ as we don't need any nn.Module machinery
-    # and to reduce overhead
-    def __call__(self, *args):
-        # TODO: once we've recorded here, we'd like to replace the __call__
-        # implementation with compiled bytecode that copies into static, replays
-        # the cuda graph, then copies out.  First condition is the hotpath,
-        # needs optimizing
-        if self.graph is not None:
-            assert len(args) == len(self.static_inputs)
-            for dst, src in zip(self.static_inputs, args):
-                dst.copy_(src)
-            self.graph.replay()
-            for i in self.mutated_inputs:
-                args[i].copy_(self.static_inputs[i])
-            return tree_map(cloner, self.static_outputs)
-
-        elif self.warmed_up:
-            # record
-            self.static_inputs = [x.clone() for x in args]
-            self.graph = torch.cuda.CUDAGraph()
-            with torch.cuda.graph(self.graph):
-                self.static_outputs = self.gm(*self.static_inputs)
-            # NB: recording doesn't actually run the operations, so
-            # now we immediately replay the graph to serve up the result
-            self.graph.replay()
-            for i in self.mutated_inputs:
-                args[i].copy_(self.static_inputs[i])
-            return tree_map(cloner, self.static_outputs)
-
-        else:
-            # warmup
-            stream = torch.cuda.Stream()
-            stream.wait_stream(torch.cuda.current_stream())
-            with torch.cuda.stream(stream):
-                r = self.gm(*args)
-            torch.cuda.current_stream().wait_stream(stream)
-            self.warmed_up = True
-            return r
-
-
-# Interpreter versions of these passes can be found at
-# https://gist.github.com/ezyang/df2d746cac3b2c7d55c181e37c57ef23
-
-
-def find_input_mutations(g):
-    def meta_fk(meta):
-        return meta["val"] if "val" in meta else meta["fake_result"]
-
-    inputs = defaultdict(set)
-    input_idx = 0
-    mutated_inputs = set()
-    for n in g.nodes:
-        if n.op == "placeholder":
-            inputs[StorageWeakRef(meta_fk(n.meta)._typed_storage())].add(input_idx)
-            input_idx += 1
-        elif n.op == "call_function":
-            if n.target is operator.getitem:
-                continue
-            schema = n.target._schema
-            for i, arg in enumerate(schema.arguments):
-                if i < len(n.args):
-                    argument = n.args[i]
-                else:
-                    if arg.name not in n.kwargs:
-                        continue
-                    argument = n.kwargs[arg.name]
-                mut_arg = False
-                if arg.alias_info:
-                    if arg.alias_info.is_write:
-                        mut_arg = True
-                if mut_arg:
-                    # TODO: not correct for args that contain tensors in a struct
-                    # like list
-                    mutated_inputs |= inputs[
-                        StorageWeakRef(meta_fk(argument.meta)._typed_storage())
-                    ]
-        # TODO: error on unrecognized nodes
-    return mutated_inputs
-
-
-# Mutates input graph
-def apply_cuda_graphs(gm):
-    for n in gm.graph.nodes:
-        if n.op == "call_module":
-            assert not n.kwargs
-            submod = gm.get_submodule(n.target)
-            gm.delete_submodule(n.target)
-            mutated_inputs = find_input_mutations(submod.graph)
-            gm.add_submodule(n.target, CudaGraphModule(submod, mutated_inputs))
-    # NB: we didn't actually change the graph, no need for recompile
-
-
-def cudagraphs(model, inputs):
-    model = partition_cudagraphs(model, inputs)
-    apply_cuda_graphs(model)
-    return model
-
-
-aot_cudagraphs = aot_autograd(fw_compiler=cudagraphs, bw_compiler=cudagraphs)
-
-aot_torchxla_trivial = aot_autograd(
-    fw_compiler=BACKENDS["torchxla_trivial"],
-)
-
-aot_torchxla_trace_once = aot_autograd(
-    fw_compiler=BACKENDS["torchxla_trace_once"],
-)
-
-
-def create_aot_backends():
-    """
-    Register aliases for the AOT backends
-    """
-    # aot_eager uses AOT Autograd backend with nop compiler. It is helpful in debugging.
-    BACKENDS["aot_eager"] = aot_eager
-
-    # aot_eager_decomp_partition just replaces the inductor compiler with nop to help
-    # isolate inductor vs aot_eager errors
-    BACKENDS["aot_eager_decomp_partition"] = aot_eager_decomp_partition
-
-    # aot_ts uses torchscript backend. We can use this with both nnc and nvfuser
-    # by using the relevant fuser with torch.jit.fuser(...)
-    BACKENDS["aot_ts"] = aot_ts
-
-    # "nvprims" is a subset of PrimTorch primitives that are guaranteed to be
-    # supported by nvFuser. This is the preferred backend for nvFuser+PrimTorch.
-    BACKENDS["nvprims_nvfuser"] = aot_nvprims_nvfuser
-    # This is useful for debugging. Can be removed later.
-    BACKENDS["nvprims_aten"] = aot_nvprims_aten
-
-    # aot_ts_nvfuser uses the memory efficient fusion algorithm from AOT Autograd.
-    # It uses min cut rematerialization algorithm, uses nvFuser as the
-    # compiler backend, and TorchScript as the frontend.
-    BACKENDS["aot_ts_nvfuser"] = aot_mem_efficient_fusion
-
-    # Similar to aot_ts_nvfuser, but disables the decompositions. Decompositions
-    # can cause accuracy deviations. This setting allows us to compare accuracy
-    # without worrying about the impact of decomposisitons. More details at
-    # https://github.com/pytorch/torchdynamo/issues/611
-    BACKENDS["aot_ts_nvfuser_nodecomps"] = aot_mem_efficient_fusion_no_decomp
-
-    # aot_cudagraphs only applies CUDA graphs to the graph.  It is also helpful
-    # for debugging and can serve as a perf baseline.
-    BACKENDS["aot_cudagraphs"] = aot_cudagraphs
-
-    BACKENDS["aot_torchxla_trivial"] = aot_torchxla_trivial
-    BACKENDS["aot_torchxla_trace_once"] = aot_torchxla_trace_once
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 00b4a7ee8219..7a1dd8579166 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -7,20 +7,7 @@
 import re
 import traceback
 from dataclasses import dataclass
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    List,
-    NamedTuple,
-    Optional,
-    OrderedDict,
-    Set,
-    Tuple,
-    Union,
-)
-
-from typing_extensions import Protocol
+from typing import Any, Dict, List, NamedTuple, Optional, OrderedDict, Set, Union
 
 import torch.nn
 from torch import fx
@@ -34,13 +21,25 @@
 from torch.fx.experimental.symbolic_shapes import ShapeEnv
 
 from . import config, logging as torchdynamo_logging, variables
-from .bytecode_transformation import create_instruction, Instruction, unique_id
+from .backends.registry import CompiledFn, CompilerFn
+from .bytecode_transformation import (
+    create_call_function,
+    create_instruction,
+    Instruction,
+    unique_id,
+)
 from .codegen import PyCodegen
 from .exc import BackendCompilerFailed, unimplemented
 from .guards import GuardBuilder
 from .mutation_guard import is_dynamic_nn_module
 from .side_effects import SideEffects
-from .source import ConstantSource, is_constant_source, LocalSource, ShapeEnvSource
+from .source import (
+    ConstantSource,
+    is_constant_source,
+    LocalInputSource,
+    LocalSource,
+    ShapeEnvSource,
+)
 from .utils import (
     assert_no_fake_params_or_buffers,
     checkpoint_params,
@@ -56,7 +55,7 @@
 from .variables.builder import GraphArg, TrackedFake, VariableBuilder, wrap_fx_proxy
 from .variables.nn_module import NNModuleVariable
 from .variables.tensor import (
-    DynamicShapeVariable,
+    SymNodeVariable,
     TensorVariable,
     UnspecializedPythonVariable,
 )
@@ -64,15 +63,6 @@
 log = logging.getLogger(__name__)
 
 
-# TODO: I think this accepts int arguments too
-class CompiledFn(Protocol):
-    def __call__(self, *args: torch.Tensor) -> Tuple[torch.Tensor, ...]:
-        ...
-
-
-CompilerFn = Callable[[fx.GraphModule, List[torch.Tensor]], CompiledFn]
-
-
 class OutputGraphState(NamedTuple):
     graphargs: List[GraphArg]
     tracked_fakes: List[TrackedFake]
@@ -80,7 +70,6 @@ class OutputGraphState(NamedTuple):
     nn_modules: Optional[Dict[str, torch.nn.Module]]
     side_effects: SideEffects
     timestamp: int
-    name_to_input: OrderedDict[str, Optional[fx.Proxy]]
 
     def diff(self, other: "OutputGraphState", *, prefix: str = "") -> Optional[str]:
         for k in self._fields:
@@ -131,7 +120,7 @@ class FakeRootModule(torch.nn.Module):
     """Trick the constructor of fx.GraphModule"""
 
     def __init__(self, nn_modules: Dict[str, torch.nn.Module]):
-        super(FakeRootModule, self).__init__()
+        super().__init__()
         for k, v in nn_modules.items():
             setattr(self, k, v)
 
@@ -149,7 +138,6 @@ def example_inputs(self):
         return clone_inputs(self.original_example_inputs)
 
     def __call__(self, gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
-
         self.restore = checkpoint_params(gm)
         self.gm = gm
         copy_gm = copy.deepcopy(self.gm)
@@ -192,13 +180,23 @@ def __init__(
         code_options: Dict[str, Any],
         compiler_fn: CompilerFn,
         root_tx,
+        export: bool,
     ):
-        super(OutputGraph, self).__init__()
+        super().__init__()
         self.graph = torch.fx.Graph()
         self.graphargs: List[GraphArg] = []
+        self.export = export
+        # In export mode, we force the shape_env to strictly disallow any constraining
+        # of the user marked dynamic dims
         fake_mode = torch._subclasses.FakeTensorMode(
-            throw_on_data_dependent_ops=True,
-            shape_env=ShapeEnv() if config.dynamic_shapes else None,
+            shape_env=ShapeEnv(
+                allow_scalar_outputs=config.capture_scalar_outputs,
+                allow_dynamic_output_shape_ops=config.capture_dynamic_output_shape_ops,
+                strict_mark_dyn=export,
+                assume_static_by_default=config.assume_static_by_default,
+            )
+            if config.dynamic_shapes
+            else None,
         )
         self.tracing_context: TracingContext = TracingContext(fake_mode)
         if config.dynamic_shapes:
@@ -242,6 +240,8 @@ def __init__(
         self.random_values_var = None
         self.initial_random_state = ()
         self.unspec_variable_map: Dict[str, UnspecializedPythonVariable] = {}
+        # Maps the source arg position to the grapharg position
+        self.pos_to_arg: Dict[int, int] = {}
 
         # Enables creating unique node names by tracking
         # all current placeholder node names
@@ -286,7 +286,6 @@ def copy_graphstate(self) -> OutputGraphState:
             dict(self.nn_modules),
             self.side_effects.clone(),
             self.timestamp,
-            self.name_to_input.copy(),
         )
         self.timestamp += 1
         return state
@@ -300,7 +299,6 @@ def restore_graphstate(self, state: OutputGraphState):
             self.nn_modules,
             self.side_effects,
             self.timestamp,
-            self.name_to_input,
         ) = state
         self.tracing_context.guards_context.restore_graphstate(guards_state)
         # FX deepcopy doesn't work for a partially created graph, so just remove new nodes
@@ -316,6 +314,12 @@ def restore_graphstate(self, state: OutputGraphState):
                 removed_nodes += 1
         log.debug(f"restore_graphstate: removed {removed_nodes} nodes")
 
+    def add_grapharg(self, arg: GraphArg):
+        curr_pos = len(self.graphargs)
+        self.graphargs.append(arg)
+        if isinstance(arg.source, LocalInputSource):
+            self.pos_to_arg[arg.source.pos] = curr_pos
+
     def count_calls(self):
         return count_calls(self.graph)
 
@@ -364,6 +368,26 @@ def update_co_names(self, name):
                 name,
             )
 
+    @staticmethod
+    def module_has_hooks(mod, only_check_unsupported=False):
+        supported_hooks = [
+            "_forward_pre_hooks",
+            "_forward_hooks",
+        ]
+        unsupported_hooks = [
+            "_backward_pre_hooks",
+            "_backward_hooks",
+            "_state_dict_pre_hooks",
+            "_state_dict_hooks",
+            "_load_state_dict_pre_hooks",
+            "_load_state_dict_post_hooks",
+        ]
+        check_hooks = unsupported_hooks
+        if not only_check_unsupported:
+            check_hooks += supported_hooks
+
+        return any(len(getattr(mod, x)) > 0 for x in check_hooks if hasattr(mod, x))
+
     def register_attr_or_module(
         self,
         target: Union[torch.nn.Module, torch.Tensor, Any],
@@ -391,6 +415,10 @@ def wrap_name(module_key):
 
         elif isinstance(target, torch.nn.Module):
             assert isinstance(target, torch.nn.Module)
+            if self.module_has_hooks(target, only_check_unsupported=True):
+                log.warning(
+                    "nn.Module hooks are not fully supported, they may be ignored"
+                )
             options["guards"].add(source.make_guard(GuardBuilder.NN_MODULE))
 
             def wrap_name(module_key):
@@ -405,10 +433,10 @@ def wrap_name(module_key):
             # alas, this is like this for now
 
             def wrap_name(module_key):
-                return DynamicShapeVariable.create(
+                return SymNodeVariable.create(
                     self,
                     self.create_proxy("get_attr", module_key, tuple(), {}),
-                    dyn_shape=target,
+                    sym_num=target,
                     **options,
                 )
 
@@ -430,7 +458,7 @@ def wrap_name(module_key):
 
         # create a new unique name
         name = "_".join(map(str, names))
-        # e.g. repalce abc.xyz[123].qkv with abc.xyz_123.qkv
+        # e.g. replace abc.xyz[123].qkv with abc.xyz_123.qkv
         name = re.sub(r"\[(\d+)\]", r"_\g<1>", name)
         # e.g. replace abc.xyz_123.qkv with abc_xyz_123_qkv
         name = re.sub(r"[^a-zA-Z0-9]", "_", name)
@@ -498,18 +526,18 @@ def compile_subgraph(
             codegen = PyCodegen(tx, root)
             random_calls_instructions.extend(
                 [
-                    codegen.create_load_global("random", add=True),
+                    codegen.create_load_global("random", True, add=True),
                     codegen.create_load_attr("setstate"),
                     codegen.create_load_const(tx.output.initial_random_state),
-                    create_instruction("CALL_FUNCTION", 1),
                 ]
+                + create_call_function(1, False),
             )
-            random_calls_instructions.extend(codegen.load_function_name(rand_fn_name))
             random_calls_instructions.extend(
-                [
-                    create_instruction("CALL_FUNCTION", 0),
-                    codegen.create_store(tx.output.random_values_var),
-                ]
+                codegen.load_function_name(rand_fn_name, True)
+            )
+            random_calls_instructions.extend(create_call_function(0, False))
+            random_calls_instructions.append(
+                codegen.create_store(tx.output.random_values_var),
             )
             self.add_output_instructions(random_calls_instructions)
 
@@ -522,7 +550,6 @@ def compile_subgraph(
             and len(set(stack_values)) == len(stack_values)
             and self.side_effects.is_empty()
         ):
-
             # optimization to generate better code in a common case
             self.add_output_instructions(
                 self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root)
@@ -627,6 +654,11 @@ def compile_and_call_fx_graph(self, tx, rv, root):
 
     @dynamo_timed(phase_name="backend_compile")
     def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
+        tot = 0
+        for node in gm.graph.nodes:
+            if node.op in ("call_function", "call_method", "call_module"):
+                tot += 1
+        torch._dynamo.utils.increment_op_count(tot)
         try:
             name = (
                 self.compiler_fn.__name__
@@ -741,8 +773,6 @@ def cleanup(self) -> None:
         # There is a reference cycle between tracer and OutputGraph, causing
         # some of the tensor objects to be held alive for longer than necessary.
 
-        # Clear cache for conversion of real -> fake tensors
-        self.root_tx.fake_mode.fake_tensor_converter = None
         self.root_tx = None
 
         # Note: generated fx graph will hold a reference to the nn_module,
@@ -788,10 +818,12 @@ def create_proxy(
         while tx:
             frame_summaries.append(tx.frame_summary())
             tx = getattr(tx, "parent", None)
+        # Reverse the frame_summaries, such that the innermost frame is at the last
+        frame_summaries.reverse()
 
         # official from_list stub doesn't have new-style type
         msgs = traceback.StackSummary.from_list(frame_summaries).format()  # type: ignore[arg-type]
-        rv.node.stack_trace = " | ".join(msgs)
+        rv.node.stack_trace = "".join(msgs)
 
         return rv
 
diff --git a/torch/_dynamo/profiler.py b/torch/_dynamo/profiler.py
index b5a667070a8c..500b9f508639 100644
--- a/torch/_dynamo/profiler.py
+++ b/torch/_dynamo/profiler.py
@@ -107,7 +107,7 @@ def results(self):
 
         last_op_end_time = -1
         captured_region_end_time = -1
-        events = list(sorted(self.prof.events(), key=lambda x: x.time_range.start))
+        events = sorted(self.prof.events(), key=lambda x: x.time_range.start)
         for e in events:
             if e.name == "TORCHDYNAMO":
                 captured_region_end_time = e.time_range.end
diff --git a/torch/_dynamo/resume_execution.py b/torch/_dynamo/resume_execution.py
index c05f610d6712..a4d06b81c9f5 100644
--- a/torch/_dynamo/resume_execution.py
+++ b/torch/_dynamo/resume_execution.py
@@ -2,10 +2,12 @@
 import dataclasses
 import sys
 import types
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
 
 from .bytecode_transformation import (
+    create_call_function,
     create_instruction,
+    create_jump_absolute,
     Instruction,
     transform_code_object,
 )
@@ -28,16 +30,22 @@
 @dataclasses.dataclass(frozen=True)
 class ReenterWith:
     stack_index: int = None
+    target_values: Optional[Tuple] = None
 
     def __call__(self, code_options, cleanup):
+        load_args = []
+        if self.target_values:
+            load_args = [
+                create_instruction(
+                    "LOAD_CONST",
+                    PyCodegen.get_const_index(code_options, val),
+                    val,
+                )
+                for val in self.target_values
+            ]
         if sys.version_info < (3, 9):
             with_cleanup_start = create_instruction("WITH_CLEANUP_START")
-            if sys.version_info < (3, 8):
-                begin_finally = create_instruction(
-                    "LOAD_CONST", PyCodegen.get_const_index(code_options, None), None
-                )
-            else:
-                begin_finally = create_instruction("BEGIN_FINALLY")
+            begin_finally = create_instruction("BEGIN_FINALLY")
             cleanup[:] = [
                 create_instruction("POP_BLOCK"),
                 begin_finally,
@@ -47,12 +55,12 @@ def __call__(self, code_options, cleanup):
             ] + cleanup
 
             return [
-                create_instruction("CALL_FUNCTION", 0),
+                *load_args,
+                create_instruction("CALL_FUNCTION", len(load_args)),
                 create_instruction("SETUP_WITH", target=with_cleanup_start),
                 create_instruction("POP_TOP"),
             ]
-        else:
-
+        elif sys.version_info < (3, 11):
             with_except_start = create_instruction("WITH_EXCEPT_START")
             pop_top_after_with_except_start = create_instruction("POP_TOP")
 
@@ -82,11 +90,57 @@ def __call__(self, code_options, cleanup):
             ] + cleanup
 
             return [
-                create_instruction("CALL_FUNCTION", 0),
+                *load_args,
+                create_instruction("CALL_FUNCTION", len(load_args)),
                 create_instruction("SETUP_WITH", target=with_except_start),
                 create_instruction("POP_TOP"),
             ]
 
+        else:
+            pop_top_after_with_except_start = create_instruction("POP_TOP")
+            cleanup_complete_jump_target = create_instruction("NOP")
+
+            def create_load_none():
+                return create_instruction(
+                    "LOAD_CONST", PyCodegen.get_const_index(code_options, None), None
+                )
+
+            cleanup[:] = (
+                [
+                    create_load_none(),
+                    create_load_none(),
+                    create_load_none(),
+                ]
+                + create_call_function(2, False)
+                + [
+                    create_instruction("POP_TOP"),
+                    create_instruction(
+                        "JUMP_FORWARD", target=cleanup_complete_jump_target
+                    ),
+                    create_instruction("PUSH_EXC_INFO"),
+                    create_instruction("WITH_EXCEPT_START"),
+                    create_instruction(
+                        "POP_JUMP_FORWARD_IF_TRUE",
+                        target=pop_top_after_with_except_start,
+                    ),
+                    create_instruction("RERAISE", 2),
+                    create_instruction("COPY", 3),
+                    create_instruction("POP_EXCEPT"),
+                    create_instruction("RERAISE", 1),
+                    pop_top_after_with_except_start,
+                    create_instruction("POP_EXCEPT"),
+                    create_instruction("POP_TOP"),
+                    create_instruction("POP_TOP"),
+                    cleanup_complete_jump_target,
+                ]
+                + cleanup
+            )
+
+            return create_call_function(0, False) + [
+                create_instruction("BEFORE_WITH"),
+                create_instruction("POP_TOP"),
+            ]
+
 
 @dataclasses.dataclass
 class ResumeFunctionMetadata:
@@ -116,6 +170,7 @@ def generate(
         nstack: int,
         argnames: List[str],
         setup_fns: List[ReenterWith],
+        null_idxes: List[int],
     ):
         assert offset is not None
         assert not (
@@ -125,7 +180,7 @@ def generate(
         assert code.co_flags & CO_OPTIMIZED
         if code in ContinueExecutionCache.generated_code_metadata:
             return cls.generate_based_on_original_code_object(
-                code, lineno, offset, nstack, argnames, setup_fns
+                code, lineno, offset, nstack, argnames, setup_fns, null_idxes
             )
 
         meta = ResumeFunctionMetadata(code)
@@ -139,6 +194,10 @@ def update(instructions: List[Instruction], code_options: Dict[str, Any]):
                 code_options["co_freevars"] or []
             )
             code_options["co_name"] = f"<graph break in {code_options['co_name']}>"
+            if sys.version_info >= (3, 11):
+                code_options[
+                    "co_qualname"
+                ] = f"<graph break in {code_options['co_qualname']}>"
             code_options["co_firstlineno"] = lineno
             code_options["co_cellvars"] = tuple()
             code_options["co_freevars"] = freevars
@@ -151,6 +210,7 @@ def update(instructions: List[Instruction], code_options: Dict[str, Any]):
             code_options["co_flags"] = code_options["co_flags"] & ~(
                 CO_VARARGS | CO_VARKEYWORDS
             )
+            # TODO probably need to update co_exceptiontable for python 3.11
             (target,) = [i for i in instructions if i.offset == offset]
 
             prefix = []
@@ -162,7 +222,12 @@ def update(instructions: List[Instruction], code_options: Dict[str, Any]):
                     prefix.extend(hooks.pop(i)(code_options, cleanup))
             assert not hooks
 
-            prefix.append(create_instruction("JUMP_ABSOLUTE", target=target))
+            if sys.version_info >= (3, 11):
+                for idx in null_idxes:
+                    prefix.append(create_instruction("PUSH_NULL"))
+                    prefix.extend(create_rot_n(idx))
+
+            prefix.append(create_jump_absolute(target))
 
             # because the line number table monotonically increases from co_firstlineno
             # remove starts_line for any instructions before the graph break instruction
diff --git a/torch/_dynamo/side_effects.py b/torch/_dynamo/side_effects.py
index 00cfbb0e4a4e..e7323af74840 100644
--- a/torch/_dynamo/side_effects.py
+++ b/torch/_dynamo/side_effects.py
@@ -6,7 +6,11 @@
 import torch.nn
 
 from . import utils, variables
-from .bytecode_transformation import create_instruction
+from .bytecode_transformation import (
+    create_call_function,
+    create_call_method,
+    create_instruction,
+)
 from .codegen import PyCodegen
 from .source import LocalSource, Source
 from .utils import object_new
@@ -294,14 +298,14 @@ def codegen_save_tempvars(self, cg: PyCodegen):
                 var.mutable_local, (AttributeMutationExisting, AttributeMutationNew)
             ) and isinstance(var, variables.NewCellVariable):
                 cg.load_import_from(utils.__name__, "make_cell")
-                cg.extend_output([create_instruction("CALL_FUNCTION", 0)])
+                cg.extend_output(create_call_function(0, True))
                 cg.add_cache(var)
                 if isinstance(var.mutable_local, AttributeMutationNew):
                     var.mutable_local.source = LocalSource(cg.tempvars[var])
             elif isinstance(var.mutable_local, AttributeMutationNew):
                 cg.load_import_from(utils.__name__, "object_new")
                 cg(var.mutable_local.cls_source)
-                cg.extend_output([create_instruction("CALL_FUNCTION", 1)])
+                cg.extend_output(create_call_function(1, True))
                 cg.add_cache(var)
                 var.mutable_local.source = LocalSource(cg.tempvars[var])
             elif var in cg.tempvars:
@@ -337,10 +341,12 @@ def codegen_update_mutated(self, cg: PyCodegen):
                 cg.extend_output([create_instruction("LOAD_METHOD", "clear")])
 
                 suffixes.append(
-                    [
-                        create_instruction("CALL_METHOD", 0),  # clear
+                    create_call_method(0)  # clear
+                    + [
                         create_instruction("POP_TOP"),
-                        create_instruction("CALL_METHOD", 1),  # update
+                    ]
+                    + create_call_method(1)  # update
+                    + [
                         create_instruction("POP_TOP"),
                     ]
                 )
diff --git a/torch/_dynamo/skipfiles.py b/torch/_dynamo/skipfiles.py
index 48ef11bad40b..64e901fe1d23 100644
--- a/torch/_dynamo/skipfiles.py
+++ b/torch/_dynamo/skipfiles.py
@@ -130,13 +130,11 @@ def _module_dir(m: types.ModuleType):
 }
 
 # Include optimizer code for tracing
-FILENAME_ALLOWLIST |= set(
-    [
-        inspect.getfile(obj)
-        for obj in torch.optim.__dict__.values()
-        if inspect.isclass(obj)
-    ]
-)
+FILENAME_ALLOWLIST |= {
+    inspect.getfile(obj)
+    for obj in torch.optim.__dict__.values()
+    if inspect.isclass(obj)
+}
 FILENAME_ALLOWLIST |= {torch.optim._functional.__file__}
 
 if HAS_PRIMS_REFS:
@@ -220,7 +218,9 @@ def is_torch_inline_allowed(filename):
 
 @functools.lru_cache(None)
 def dynamo_dir():
-    return _module_dir(importlib.import_module(config.dynamo_import))
+    import torch._dynamo
+
+    return _module_dir(torch._dynamo)
 
 
 def is_torch(filename):
diff --git a/torch/_dynamo/source.py b/torch/_dynamo/source.py
index 5af4e68330ee..036da4bbe741 100644
--- a/torch/_dynamo/source.py
+++ b/torch/_dynamo/source.py
@@ -6,8 +6,8 @@
 from torch._guards import GuardSource, Source
 
 from . import utils
-from .bytecode_transformation import create_instruction
-from .utils import rename_implicit
+from .bytecode_transformation import create_call_function, create_instruction
+from .utils import enum_repr, rename_implicit
 
 _GUARD_SOURCE_NN_MODULE = {
     GuardSource.LOCAL: GuardSource.LOCAL_NN_MODULE,
@@ -59,6 +59,11 @@ def name(self):
         return rename_implicit(self.local_name)
 
 
+@dataclasses.dataclass
+class LocalInputSource(LocalSource):
+    pos: int
+
+
 @dataclasses.dataclass
 class RandomValueSource(Source):
     random_call_index: int
@@ -82,7 +87,7 @@ class GlobalSource(Source):
     global_name: str
 
     def reconstruct(self, codegen):
-        return [codegen.create_load_global(self.global_name, add=True)]
+        return [codegen.create_load_global(self.global_name, False, add=True)]
 
     def guard_source(self):
         return GuardSource.GLOBAL
@@ -97,9 +102,8 @@ class GlobalWeakRefSource(Source):
 
     def reconstruct(self, codegen):
         return [
-            codegen.create_load_global(self.global_name, add=True),
-            create_instruction("CALL_FUNCTION", 0),
-        ]
+            codegen.create_load_global(self.global_name, True, add=True),
+        ] + create_call_function(0, False)
 
     def guard_source(self):
         return GuardSource.GLOBAL
@@ -260,17 +264,23 @@ def name(self):
         if isinstance(self.index, Source):
             return f"{self.base.name()}[{self.index.name()}]"
         else:
-            return f"{self.base.name()}[{self.index!r}]"
+            if isinstance(self.index, enum.Enum):
+                return f"{self.base.name()}[{enum_repr(self.index)}]"
+            else:
+                return f"{self.base.name()}[{self.index!r}]"
 
 
 @dataclasses.dataclass
 class TupleIteratorGetItemSource(GetItemSource):
     def reconstruct(self, codegen):
         codegen.load_import_from(utils.__name__, "tuple_iterator_getitem")
-        return self.base.reconstruct(codegen) + [
-            codegen.create_load_const(self.index),
-            create_instruction("CALL_FUNCTION", 2),
-        ]
+        return (
+            self.base.reconstruct(codegen)
+            + [
+                codegen.create_load_const(self.index),
+            ]
+            + create_call_function(2, True)
+        )
 
     def name(self):
         return f"___tuple_iterator_getitem({self.base.name()}, {self.index!r})"
@@ -285,7 +295,7 @@ def __post_init__(self):
 
     def reconstruct(self, codegen):
         codegen.load_import_from("builtins", "type")
-        return self.base.reconstruct(codegen) + [create_instruction("CALL_FUNCTION", 1)]
+        return self.base.reconstruct(codegen) + create_call_function(1, True)
 
     def guard_source(self):
         return self.base.guard_source()
@@ -308,7 +318,7 @@ def reconstruct(self, codegen):
         return (
             self.type.reconstruct(codegen)
             + self.obj.reconstruct(codegen)
-            + [create_instruction("CALL_FUNCTION", 2)]
+            + create_call_function(2, True)
         )
 
     def guard_source(self):
@@ -332,8 +342,8 @@ def reconstruct(self, codegen):
             + self.base.reconstruct(codegen)
             + [
                 codegen.create_load_const(self.index),
-                create_instruction("CALL_FUNCTION", 2),
             ]
+            + create_call_function(2, True)
         )
 
     def guard_source(self):
@@ -367,7 +377,7 @@ class ConstantSource(Source):
     source_name: str
 
     def reconstruct(self, codegen):
-        return [codegen.create_load_global(self.source_name, add=False)]
+        return [codegen.create_load_global(self.source_name, False, add=False)]
 
     def guard_source(self):
         return GuardSource.CONSTANT
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index e96c235a6ca7..19f198237fd2 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -13,7 +13,7 @@
 import typing
 import weakref
 from collections.abc import Sized
-from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Type
 from unittest.mock import patch
 
 import torch
@@ -29,10 +29,12 @@
     variables,
 )
 from .allowed_functions import is_allowed, is_builtin_callable, is_builtin_constant
-from .bytecode_analysis import livevars_analysis
+from .bytecode_analysis import JUMP_OPNAMES, livevars_analysis
 from .bytecode_transformation import (
     cleaned_instructions,
+    create_call_function,
     create_instruction,
+    create_jump_absolute,
     Instruction,
     is_generator,
     unique_id,
@@ -48,18 +50,20 @@
     GetItemSource,
     GlobalSource,
     GlobalWeakRefSource,
+    LocalInputSource,
     LocalSource,
 )
 from .utils import counters, graph_break_dup_warning_checker, istype, proxy_args_kwargs
 from .variables.base import MutableLocal, typestr, VariableTracker
 from .variables.builder import VariableBuilder, wrap_fx_proxy
 from .variables.builtin import BuiltinVariable
-from .variables.constant import ConstantVariable
+from .variables.constant import ConstantVariable, EnumVariable
 from .variables.dicts import ConstDictVariable
 from .variables.functions import (
     BaseUserFunctionVariable,
     NestedUserFunctionVariable,
     UserFunctionVariable,
+    UserMethodVariable,
 )
 from .variables.lists import (
     BaseListVariable,
@@ -73,14 +77,20 @@
     ContextWrappingVariable,
     GetAttrVariable,
     GradModeVariable,
+    NullVariable,
     PythonModuleVariable,
     UnknownVariable,
     WithExitFunctionVariable,
 )
 from .variables.nn_module import NNModuleVariable
-from .variables.tensor import DynamicShapeVariable, TensorVariable
+from .variables.tensor import (
+    supported_const_comparison_ops,
+    supported_tensor_comparison_ops,
+    SymNodeVariable,
+    TensorVariable,
+)
 from .variables.torch import TorchVariable
-from .variables.user_defined import UserDefinedVariable
+from .variables.user_defined import UserDefinedObjectVariable, UserDefinedVariable
 
 log = logging.getLogger(__name__)
 
@@ -92,6 +102,7 @@ def _step_logger():
 
 @dataclasses.dataclass
 class BlockStackEntry:
+    id: int
     target: Instruction
     stack_index: Optional[int] = None
     with_context: ContextWrappingVariable = None
@@ -101,7 +112,10 @@ def can_restore(self):
 
     def resume_fn(self):
         assert self.stack_index is not None
-        return ReenterWith(self.stack_index)
+        if self.with_context and self.with_context.target_values:
+            return ReenterWith(self.stack_index, tuple(self.with_context.target_values))
+        else:
+            return ReenterWith(self.stack_index)
 
     def exit(self, tx):
         return self.with_context.exit(tx)
@@ -190,15 +204,19 @@ def _detect_and_normalize_assert_statement(
         has_error_msg = True
 
         # if it is LOAD_CONSTANT, it must be followed by CALL_FUNCTION
+        # (PRECALL for Python 3.11+)
         current_instruction_pointer += 1
         if current_instruction_pointer >= len(self.instructions):
             return False
         inst = self.instructions[current_instruction_pointer]
-        if inst.opname != "CALL_FUNCTION":
+        if inst.opname not in ("CALL_FUNCTION", "PRECALL"):
             return False
 
-        # CALL_FUNCTION should be followed by RAISE_VARARGS
+        # for Python 3.11+, PRECALL should be followed by CALL, then RAISE_VARARGS
+        # for Python < 3.11, CALL_FUNCTION should be followed by RAISE_VARARGS
         current_instruction_pointer += 1
+        if inst.opname == "PRECALL":
+            current_instruction_pointer += 1
         if current_instruction_pointer >= len(self.instructions):
             return False
         inst = self.instructions[current_instruction_pointer]
@@ -273,17 +291,41 @@ def inner(self: "InstructionTranslatorBase", inst: Instruction):
                 + if_jump
             )
         elif isinstance(value, NNModuleVariable):
-            # Equivant of "self.nn_module is not None"
+            # Equivalent of "self.nn_module is not None"
             if truth_fn(value):
                 push and self.push(value)
                 self.jump(inst)
+        elif isinstance(value, UserDefinedObjectVariable):
+            x = value.var_getattr(self, "__bool__")
+            # __bool__ is function
+            if isinstance(x, UserMethodVariable):
+                state = self.copy_graphstate()
+                result = x.call_function(self, [], {})
+                if isinstance(result, ConstantVariable) and isinstance(
+                    result.value, bool
+                ):
+                    self.output.guards.update(result.guards)
+                    if truth_fn(result.value):
+                        push and self.push(value)
+                        self.jump(inst)
+                else:
+                    # rollback to the state before the __bool__ inline
+                    self.restore_graphstate(state)
+                    unimplemented(
+                        "generic_jump on UserDefined with __bool__ returning non-constant"
+                    )
+            # __bool__ is non-function or not existed in the user defined object
+            else:
+                if truth_fn(True):
+                    push and self.push(value)
+                    self.jump(inst)
         elif not isinstance(value, TensorVariable) and value.has_unpack_var_sequence(
             self
         ):
             if truth_fn(len(value.unpack_var_sequence(self))):
                 push and self.push(value)
                 self.jump(inst)
-        elif isinstance(value, DynamicShapeVariable):
+        elif isinstance(value, SymNodeVariable):
             eval_result = value.evaluate_expr(self.output)
             if truth_fn(eval_result):
                 push and self.push(value)
@@ -335,7 +377,14 @@ def wrapper(self: "InstructionTranslatorBase", inst: Instruction):
                 reason = GraphCompileReason(excp.msg, user_stack)
             self.restore_graphstate(state)
             self.output.compile_subgraph(self, reason=reason)
-            self.popn(push - dis.stack_effect(inst.opcode, inst.arg))
+            if sys.version_info >= (3, 11) and inst.opname == "CALL":
+                # stack effect for PRECALL + CALL is split between the two instructions
+                stack_effect = dis.stack_effect(
+                    dis.opmap["PRECALL"], inst.arg
+                ) + dis.stack_effect(dis.opmap["CALL"], inst.arg)
+            else:
+                stack_effect = dis.stack_effect(inst.opcode, inst.arg)
+            self.popn(push - stack_effect)
 
             for _ in range(push):
                 self.push(UnknownVariable())
@@ -357,7 +406,23 @@ def wrapper(self: "InstructionTranslatorBase", inst: Instruction):
                 )
                 self.output.add_output_instructions(setup_finally)
 
-            self.output.add_output_instructions([inst])
+            if sys.version_info >= (3, 11) and inst.opname == "CALL":
+                kw_names = self.kw_names.value if self.kw_names is not None else ()
+                if len(kw_names) > 0:
+                    self.output.add_output_instructions(
+                        [
+                            create_instruction(
+                                "KW_NAMES",
+                                PyCodegen.get_const_index(self.code_options, kw_names),
+                            ),
+                        ]
+                    )
+                self.output.add_output_instructions(
+                    create_call_function(inst.arg, False)
+                )
+                # no need to reset self.kw_names since self should not continue to run
+            else:
+                self.output.add_output_instructions([inst])
 
             # Add the cleanup instructions from try..finally block
             self.output.add_output_instructions(cleanup)
@@ -370,6 +435,14 @@ def wrapper(self: "InstructionTranslatorBase", inst: Instruction):
     return decorator
 
 
+def is_none(x):
+    return x is None
+
+
+def is_not_none(x):
+    return x is not None
+
+
 class InstructionTranslatorBase(Checkpointable[InstructionTranslatorGraphState]):
     output: OutputGraph
     symbolic_locals: Dict[str, VariableTracker]
@@ -381,6 +454,7 @@ class InstructionTranslatorBase(Checkpointable[InstructionTranslatorGraphState])
     block_stack: List[BlockStackEntry]
     lineno: int
     mutated_closure_cell_contents: Set[str]
+    kw_names: Optional[ConstantVariable]
 
     checkpoint: Optional[Tuple[Instruction, InstructionTranslatorGraphState]]
     random_calls: List[
@@ -391,11 +465,7 @@ def has_backedge(self):
         cur_offset = self.current_instruction.offset
         assert self.instruction_pointer is not None
         for inst in self.instructions[self.instruction_pointer :]:
-            if inst.opname in (
-                "JUMP_ABSOLUTE",
-                "POP_JUMP_IF_TRUE",
-                "POP_JUMP_IF_FALSE",
-            ):
+            if inst.opname in JUMP_OPNAMES:
                 jump_offset = inst.argval
                 if jump_offset < cur_offset:
                     return True
@@ -432,6 +502,18 @@ def call_function(
             isinstance(x, VariableTracker)
             for x in itertools.chain(args, kwargs.values())
         )
+        inner_fn = None
+        if hasattr(fn, "value"):
+            inner_fn = fn.value
+        if hasattr(fn, "fn"):
+            inner_fn = fn.fn
+        if (
+            inner_fn
+            and callable(inner_fn)
+            and hasattr(inner_fn, "_dynamo_forbidden")
+            and inner_fn._dynamo_forbidden
+        ):
+            raise AssertionError(f"Attempt to trace forbidden callable {inner_fn}")
         self.push(fn.call_function(self, args, kwargs))
 
     def update_locals_and_stack(self, oldvar: VariableTracker, newvar: VariableTracker):
@@ -525,8 +607,7 @@ def step(self):
             reason=GraphCompileReason("step_unsupported", [self.frame_summary()]),
         )
         self.output.add_output_instructions(
-            [create_instruction("JUMP_ABSOLUTE", target=continue_inst)]
-            + self.instructions
+            [create_jump_absolute(continue_inst)] + self.instructions
         )
 
     def run(self):
@@ -627,6 +708,10 @@ def get_global_source(self, name):
         return source
 
     def LOAD_GLOBAL(self, inst):
+        if sys.version_info >= (3, 11):
+            if inst.arg % 2:
+                self.PUSH_NULL(inst)
+
         name = inst.argval
 
         if config.replay_record_enabled:
@@ -794,11 +879,11 @@ def jump(self, inst):
 
     def SETUP_LOOP(self, inst):
         # only exists in python<=3.7
-        self.block_stack.append(BlockStackEntry(inst.target))
+        self.block_stack.append(BlockStackEntry(0, inst.target))
 
     def SETUP_EXCEPT(self, inst):
         # only exists in python<=3.7
-        self.block_stack.append(BlockStackEntry(inst.target))
+        self.block_stack.append(BlockStackEntry(0, inst.target))
 
     def POP_BLOCK(self, inst):
         self.block_stack.pop()
@@ -810,10 +895,12 @@ def SETUP_WITH(self, inst):
         self.output.guards.update(ctx.guards)
 
         if isinstance(self, InstructionTranslator):
-            self.block_stack.append(BlockStackEntry(inst.target, len(self.stack), ctx))
+            self.block_stack.append(
+                BlockStackEntry(0, inst.target, len(self.stack), ctx)
+            )
         else:
             # can't restore this while inlining
-            self.block_stack.append(BlockStackEntry(inst.target))
+            self.block_stack.append(BlockStackEntry(0, inst.target))
         self.push(
             WithExitFunctionVariable(
                 ctx,
@@ -824,18 +911,14 @@ def SETUP_WITH(self, inst):
         self.push(ctx.enter(self))
 
     def SETUP_FINALLY(self, inst):
-        self.block_stack.append(BlockStackEntry(inst.target))
+        self.block_stack.append(BlockStackEntry(0, inst.target))
 
     def BEGIN_FINALLY(self, inst):
         self.push(None)
 
     def WITH_CLEANUP_START(self, inst):
         exit, exc = self.popn(2)
-        if sys.version_info < (3, 8):
-            assert exc.is_python_constant()
-            assert exc.as_python_constant() is None
-        else:
-            assert exc is None
+        assert exc is None
         self.push(exc)
         self.push(exit.call_function(self, [ConstantVariable(None)] * 3, {}))
 
@@ -845,13 +928,7 @@ def WITH_CLEANUP_FINISH(self, inst):
 
     def END_FINALLY(self, inst):
         tos = self.pop()
-        if sys.version_info < (3, 8):
-            # python3.7 and 3.8 can have END_FINALLY without BEGIN_FINALLY
-            assert tos is None or (
-                tos.is_python_constant() and tos.as_python_constant() is None
-            )
-        else:
-            assert tos is None
+        assert tos is None
 
     def FOR_ITER(self, inst):
         it = self.pop()
@@ -873,29 +950,18 @@ def COMPARE_OP(self, inst):
         right = right.as_specialized(self)
         options = VariableTracker.propagate([left, right])
         op = inst.argval
-        supported_is_const = {
-            "is": operator.is_,
-            "is not": operator.is_not,
-            "==": operator.eq,
-            "!=": operator.ne,
-        }
-        supported_tensors = {
-            ">": operator.gt,
-            "<": operator.lt,
-            ">=": operator.ge,
-            "<=": operator.le,
-            "==": operator.eq,
-            "!=": operator.ne,
-        }
         supported_any = dict(
-            itertools.chain(supported_tensors.items(), supported_is_const.items())
+            itertools.chain(
+                supported_tensor_comparison_ops.items(),
+                supported_const_comparison_ops.items(),
+            )
         )
         if (
             isinstance(
                 left,
                 (
                     TensorVariable,
-                    DynamicShapeVariable,
+                    SymNodeVariable,
                     NNModuleVariable,
                     BaseListVariable,
                     UserDefinedVariable,
@@ -905,12 +971,12 @@ def COMPARE_OP(self, inst):
             )
             and isinstance(right, ConstantVariable)
             and right.value is None
-            and op in supported_is_const
+            and op in supported_const_comparison_ops
         ):
             # <non-None> is None
             self.push(
                 ConstantVariable(
-                    supported_is_const[op](object(), right.value), **options
+                    supported_const_comparison_ops[op](object(), right.value), **options
                 )
             )
         elif (
@@ -927,42 +993,16 @@ def COMPARE_OP(self, inst):
                     **options,
                 )
             )
-        elif (
-            isinstance(left, TensorVariable) or isinstance(right, TensorVariable)
-        ) and op in supported_tensors:
-            self.push(
-                wrap_fx_proxy(
-                    self,
-                    supported_tensors[op](left.as_proxy(), right.as_proxy()),
-                    **options,
-                )
-            )
-        elif (
-            isinstance(left, DynamicShapeVariable)
-            or isinstance(right, DynamicShapeVariable)
-        ) and op in supported_tensors:
-            self.push(
-                DynamicShapeVariable.create(
-                    self,
-                    supported_tensors[op](left.as_proxy(), right.as_proxy()),
-                    dyn_shape=None,
-                    **options,
-                )
-            )
         elif op in ("in", "not in"):
             self.push(right.call_method(self, "__contains__", [left], {}))
             if op == "not in":
                 self.UNARY_NOT(inst)
-        elif (
-            isinstance(left, UserFunctionVariable)
-            and isinstance(right, UserFunctionVariable)
-            and op in supported_is_const
-        ):
+        else:
             self.push(
-                ConstantVariable(supported_is_const[op](left.fn, right.fn), **options)
+                BuiltinVariable(supported_any[op], **options).call_function(
+                    self, [left, right], {}
+                )
             )
-        else:
-            unimplemented(f"COMPARE_OP {typestr(left)} {op} {typestr(right)}")
 
     def GET_ITER(self, inst):
         self.call_function(BuiltinVariable(iter), [self.pop()], {})
@@ -1024,8 +1064,16 @@ def CALL_FUNCTION_KW(self, inst):
 
     def LOAD_METHOD(self, inst):
         self.LOAD_ATTR(inst)
-        self.push(self.pop())
-        self.push(None)
+        obj = self.pop()
+        if sys.version_info >= (3, 11):
+            # always follow the NULL + fn convention, since if obj
+            # is actually a method, self is already bound to it, so it
+            # doesn't need to be passed in as an arg.
+            self.PUSH_NULL(inst)
+            self.push(obj)
+        else:
+            self.push(obj)
+            self.push(None)
 
     def CALL_METHOD(self, inst):
         args = self.popn(inst.argval)
@@ -1135,7 +1183,7 @@ def BUILD_MAP(self, inst):
         options = VariableTracker.propagate(items)
         result = dict()
         for k, v in zip(items[::2], items[1::2]):
-            assert isinstance(k, ConstantVariable) or (
+            assert isinstance(k, (ConstantVariable, EnumVariable)) or (
                 isinstance(k, TensorVariable) and k.specialized_value is not None
             )
 
@@ -1163,11 +1211,7 @@ def BUILD_CONST_KEY_MAP(self, inst):
         )
 
     def MAP_ADD(self, inst):
-        if sys.version_info < (3, 8):
-            v, k = self.popn(2)
-        else:
-            k, v = self.popn(2)
-
+        k, v = self.popn(2)
         assert inst.argval > 0
         obj = self.stack[-inst.arg]
         assert isinstance(obj, ConstDictVariable)
@@ -1210,8 +1254,14 @@ def LIST_APPEND(self, inst):
     def MAKE_FUNCTION(self, inst):
         flags = inst.arg
         old_stack = list(self.stack)
-        fn_name = self.pop()
+        if sys.version_info < (3, 11):
+            fn_name = self.pop()
         code = self.pop()
+        if sys.version_info >= (3, 11):
+            # MAKE_FUNCTION behavior actually changed in 3.11, see
+            # https://github.com/python/cpython/pull/93189/
+            assert hasattr(code.value, "co_qualname")
+            fn_name = ConstantVariable(value=code.value.co_qualname)
         defaults = None
         closure = None
         annotations = None
@@ -1332,8 +1382,8 @@ def FORMAT_VALUE(self, inst):
             fmt_spec = ConstantVariable("")
 
         value = self.pop()
-        if isinstance(value, DynamicShapeVariable):
-            value = ConstantVariable(str(value.dyn_shape))
+        if isinstance(value, SymNodeVariable):
+            value = ConstantVariable(str(value.sym_num))
         if (flags & 0x03) == 0x01:
             value = BuiltinVariable(str).call_function(self, [value], {})
         elif (flags & 0x03) == 0x02:
@@ -1428,11 +1478,13 @@ def MATCH_KEYS(self, inst):
         assert isinstance(tos1, ConstDictVariable)
         match_obj = tos1.items
         if all(key in match_obj for key in keys):
-            self.push(TupleVariable(list(match_obj[key] for key in keys)))
-            self.push(ConstantVariable(True))
+            self.push(TupleVariable([match_obj[key] for key in keys]))
+            if sys.version_info < (3, 11):
+                self.push(ConstantVariable(True))
         else:
             self.push(ConstantVariable(None))
-            self.push(ConstantVariable(False))
+            if sys.version_info < (3, 11):
+                self.push(ConstantVariable(False))
 
     UNARY_POSITIVE = stack_op(operator.pos)
     UNARY_NEGATIVE = stack_op(operator.neg)
@@ -1445,6 +1497,7 @@ def MATCH_KEYS(self, inst):
     BINARY_FLOOR_DIVIDE = stack_op(operator.floordiv)
     BINARY_TRUE_DIVIDE = stack_op(operator.truediv)
     BINARY_MODULO = stack_op(operator.mod)
+    BINARY_REMAINDER = stack_op(operator.mod)
     BINARY_ADD = stack_op(operator.add)
     BINARY_SUBTRACT = stack_op(operator.sub)
     BINARY_SUBSCR = break_graph_if_unsupported(push=1)(stack_op(operator.getitem))
@@ -1460,6 +1513,7 @@ def MATCH_KEYS(self, inst):
     INPLACE_FLOOR_DIVIDE = stack_op(operator.ifloordiv)
     INPLACE_TRUE_DIVIDE = stack_op(operator.itruediv)
     INPLACE_MODULO = stack_op(operator.imod)
+    INPLACE_REMAINDER = stack_op(operator.imod)
     INPLACE_ADD = stack_op(operator.iadd)
     INPLACE_SUBTRACT = stack_op(operator.isub)
     INPLACE_LSHIFT = stack_op(operator.ilshift)
@@ -1468,6 +1522,110 @@ def MATCH_KEYS(self, inst):
     INPLACE_XOR = stack_op(operator.ixor)
     INPLACE_OR = stack_op(operator.ior)
 
+    # 3.11 opcodes
+    # note: passed opcodes are intentional
+    def RESUME(self, inst):
+        pass
+
+    def BINARY_OP(self, inst):
+        if sys.version_info >= (3, 11):
+            opname = dis._nb_ops[inst.arg][0][3:]
+            if opname.startswith("INPLACE"):
+                return getattr(self, "INPLACE_" + opname[8:])(inst)
+            return getattr(self, "BINARY_" + opname)(inst)
+        else:
+            unimplemented("BINARY_OP requires Python 3.11+")
+
+    def PRECALL(self, inst):
+        pass
+
+    def KW_NAMES(self, inst):
+        kw_names = self.code_options["co_consts"][inst.arg]
+        assert isinstance(kw_names, tuple)
+        for name in kw_names:
+            assert isinstance(name, str)
+        assert self.kw_names is None
+        self.kw_names = ConstantVariable(value=kw_names)
+
+    def PUSH_NULL(self, inst):
+        self.push(NullVariable())
+
+    @break_graph_if_unsupported(push=1)
+    def CALL(self, inst):
+        # see https://docs.python.org/3.11/library/dis.html#opcode-CALL
+        # for convention
+        contents = self.popn(inst.arg + 2)
+        if isinstance(contents[0], NullVariable):
+            fn = contents[1]
+            args = []
+        else:
+            fn = contents[0]
+            args = [contents[1]]
+        kw_names = self.kw_names.value if self.kw_names else ()
+        if kw_names:
+            args = args + contents[2 : -len(kw_names)]
+            kwargs_list = contents[-len(kw_names) :]
+            kwargs = dict(zip(kw_names, kwargs_list))
+            assert len(kwargs) == len(kw_names)
+        else:
+            args = args + contents[2:]
+            kwargs = {}
+        self.call_function(fn, args, kwargs)
+        self.kw_names = None
+        # 3.11 removed POP_BLOCK, so we manually pop the block stack here
+        if (
+            isinstance(fn, WithExitFunctionVariable)
+            and len(self.block_stack) > 0
+            and id(fn) == self.block_stack[-1].id
+        ):
+            self.block_stack.pop()
+
+    def COPY(self, inst):
+        self.push(self.stack[-inst.arg])
+
+    def SWAP(self, inst):
+        self.stack[-1], self.stack[-inst.arg] = self.stack[-inst.arg], self.stack[-1]
+
+    JUMP_BACKWARD = jump
+    JUMP_BACKWARD_NO_INTERRUPT = jump
+
+    POP_JUMP_FORWARD_IF_TRUE = generic_jump(operator.truth, False)
+    POP_JUMP_BACKWARD_IF_TRUE = generic_jump(operator.truth, False)
+    POP_JUMP_FORWARD_IF_FALSE = generic_jump(operator.not_, False)
+    POP_JUMP_BACKWARD_IF_FALSE = generic_jump(operator.not_, False)
+
+    POP_JUMP_FORWARD_IF_NOT_NONE = generic_jump(is_not_none, False)
+    POP_JUMP_BACKWARD_IF_NOT_NONE = generic_jump(is_not_none, False)
+    POP_JUMP_FORWARD_IF_NONE = generic_jump(is_none, False)
+    POP_JUMP_BACKWARD_IF_NONE = generic_jump(is_none, False)
+
+    def CACHE(self, inst):
+        pass
+
+    def BEFORE_WITH(self, inst):
+        ctx = self.pop()
+        if not isinstance(ctx, ContextWrappingVariable):
+            unimplemented(f"BEFORE_WITH {ctx}")
+        self.output.guards.update(ctx.guards)
+
+        exit = WithExitFunctionVariable(
+            ctx,
+            inst.target,
+            **VariableTracker.propagate(ctx),
+        )
+        # 3.11 no longer uses a block stack, but we still keep track of one
+        # so that we know which contexts are currently active.
+        if isinstance(self, InstructionTranslator):
+            self.block_stack.append(
+                BlockStackEntry(id(exit), inst.target, self.real_stack_len(), ctx)
+            )
+        else:
+            # can't restore this while inlining
+            self.block_stack.append(BlockStackEntry(id(exit), inst.target))
+
+        self.push(exit)
+        self.push(ctx.enter(self))
+
     def copy_graphstate(self) -> InstructionTranslatorGraphState:
         """Create a checkpoint of the current state by copying everything"""
         return InstructionTranslatorGraphState(
@@ -1566,6 +1724,7 @@ def __init__(
         self.next_instruction = None
         self.block_stack = []
         self.lineno = code_options["co_firstlineno"]
+        self.kw_names = None
 
         # Properties of the input/output code
         self.instructions: List[Instruction] = instructions
@@ -1580,8 +1739,10 @@ def __init__(
 
         # Execution record for replaying errors
         self.exec_recorder = ExecutionRecorder(code=f_code, code_options=code_options)
-        # Stack of module being parsed, current nn.module is at the end of ordered dict
-        self.nn_module_stack: Dict[str, str] = {}
+        # Stack of module being parsed, current nn.module is at the end of ordered dict.
+        # The first field of tuple is the fully qualified name of current module
+        # in original hierarchy.  The second field is the type of current nn.module
+        self.nn_module_stack: Dict[str, Tuple[str, Type[Any]]] = {}
         # Flag to indicate whether tracing is used for export.
         self.export = export
 
@@ -1618,8 +1779,8 @@ def __init__(
         export,
         mutated_closure_cell_contents: Set[str],
     ):
-        super(InstructionTranslator, self).__init__(
-            output=OutputGraph(f_globals, code_options, compiler_fn, self),
+        super().__init__(
+            output=OutputGraph(f_globals, code_options, compiler_fn, self, export),
             instructions=instructions,
             f_locals=f_locals,
             f_globals=f_globals,
@@ -1641,8 +1802,17 @@ def __init__(
 
         vars = list(code_options["co_varnames"])
         vars.extend(x for x in self.cell_and_freevars() if x not in vars)
+
         self.symbolic_locals = collections.OrderedDict(
-            (k, VariableBuilder(self, LocalSource(k))(f_locals[k]))
+            (
+                k,
+                VariableBuilder(
+                    self,
+                    LocalInputSource(k, code_options["co_varnames"].index(k))
+                    if k in code_options["co_varnames"]
+                    else LocalSource((k)),
+                )(f_locals[k]),
+            )
             for k in vars
             if k in f_locals
         )
@@ -1714,7 +1884,25 @@ def create_call_resume_at(self, inst):
             for k in self.symbolic_locals.keys()
             if k in reads and k not in self.cell_and_freevars()
         )
-        nargs = len(self.stack) + len(argnames)
+
+        cg = PyCodegen(self)
+
+        # Python does not allow null to be an arg to a function, so
+        # we remove nulls from the stack and restore them in the
+        # prologue of the resume function
+        null_idxes: List[int] = []
+        if sys.version_info >= (3, 11):
+            for i, var in enumerate(reversed(self.stack)):
+                if isinstance(var, NullVariable):
+                    for j in range(2, i + 2 - len(null_idxes)):
+                        cg.append_output(create_instruction("SWAP", j))
+                    null_idxes.append(i + 1)
+                    cg.extend_output(cg.pop_null())
+
+        # we popped all nulls from the stack at runtime,
+        # so we should not count NullVariables
+        stack_len = len(self.stack) - len(null_idxes)
+        nargs = stack_len + len(argnames)
 
         name = unique_id(f"__resume_at_{inst.offset}")
 
@@ -1722,40 +1910,37 @@ def create_call_resume_at(self, inst):
             self.f_code,
             self.lineno,
             inst.offset,
-            len(self.stack),
+            stack_len,
             argnames,
             tuple(b.resume_fn() for b in self.block_stack),
+            tuple(null_idxes),
         )
 
-        cg = PyCodegen(self)
-
         if new_code.co_freevars:
-            cg.make_function_with_closure(name, new_code, len(self.stack))
+            cg.make_function_with_closure(name, new_code, stack_len)
         else:
             self.output.install_global(
                 name, types.FunctionType(new_code, self.f_globals, name)
             )
-            cg.extend_output(cg.load_function_name(name, len(self.stack)))
+            cg.extend_output(cg.load_function_name(name, True, stack_len))
 
         cg.extend_output([cg.create_load(k) for k in argnames])
-        cg.extend_output(
-            [
-                create_instruction("CALL_FUNCTION", nargs),
-                create_instruction("RETURN_VALUE"),
-            ]
-        )
+        cg.extend_output(create_call_function(nargs, False))
+        cg.append_output(create_instruction("RETURN_VALUE"))
         return cg.get_instructions()
 
     def RETURN_VALUE(self, inst):
-        if self.output.count_calls() == 0:
-            raise exc.SkipFrame()
+        if self.output.count_calls() == 0 and not self.export:
+            raise exc.SkipFrame("because no content in function call")
         self.instruction_pointer = None
         _step_logger()(
             logging.INFO,
             f"torchdynamo done tracing {self.f_code.co_name} (RETURN_VALUE)",
         )
         log.debug("RETURN_VALUE triggered compile")
-        self.output.compile_subgraph(self)
+        self.output.compile_subgraph(
+            self, reason=GraphCompileReason("return_value", [self.frame_summary()])
+        )
         self.output.add_output_instructions([create_instruction("RETURN_VALUE")])
 
 
@@ -1791,7 +1976,7 @@ def inline_call_(parent, func, args, kwargs):
             func.get_filename()
         ) and not skipfiles.is_torch_inline_allowed(func.get_filename()):
             unimplemented(
-                f"inline in skipfiles: {func.get_name()} {func.get_filename()}"
+                f"inline in skipfiles: {func.fn.__qualname__}  | {func.get_name()} {func.get_filename()}"
             )
 
         try:
@@ -1864,7 +2049,7 @@ def __init__(
         f_builtins = f_globals["__builtins__"]
         if not isinstance(f_builtins, dict):
             f_builtins = f_builtins.__dict__
-        super(InliningInstructionTranslator, self).__init__(
+        super().__init__(
             output=parent.output,
             f_locals={},
             f_globals=f_globals,
@@ -1962,7 +2147,7 @@ class InliningGeneratorInstructionTranslator(InliningInstructionTranslator):
     generated_items: List[VariableTracker]
 
     def __init__(self, *args, **kwargs):
-        super(InliningGeneratorInstructionTranslator, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
         self.generated_items = []
 
     def YIELD_VALUE(self, inst: Instruction):
diff --git a/torch/_dynamo/test_case.py b/torch/_dynamo/test_case.py
index 39eda31646d2..e8d5e7aa60ba 100644
--- a/torch/_dynamo/test_case.py
+++ b/torch/_dynamo/test_case.py
@@ -1,7 +1,6 @@
 import contextlib
 import importlib
 import sys
-from unittest.mock import patch
 
 import torch
 import torch.testing
@@ -52,7 +51,7 @@ def setUpClass(cls):
         super().setUpClass()
         cls._exit_stack = contextlib.ExitStack()
         cls._exit_stack.enter_context(
-            patch.object(config, "raise_on_ctx_manager_usage", True)
+            config.patch(raise_on_ctx_manager_usage=True, suppress_errors=False),
         )
 
     def setUp(self):
diff --git a/torch/_dynamo/test_minifier_common.py b/torch/_dynamo/test_minifier_common.py
index 9a1e5804a443..247e73f95013 100644
--- a/torch/_dynamo/test_minifier_common.py
+++ b/torch/_dynamo/test_minifier_common.py
@@ -31,12 +31,6 @@ def tearDownClass(cls):
         cls._debug_dir_obj.cleanup()
         cls._exit_stack.close()
 
-    def setUp(self):
-        super().setUp()
-
-    def tearDown(self):
-        super().tearDown()
-
     # Search for the name of the first function defined in a code string.
     def _get_fn_name(self, code):
         fn_name_match = re.search(r"def (\w+)\(", code)
diff --git a/torch/_dynamo/testing.py b/torch/_dynamo/testing.py
index 832b6f8ce343..e1770b81eac3 100644
--- a/torch/_dynamo/testing.py
+++ b/torch/_dynamo/testing.py
@@ -32,6 +32,16 @@ def clone_me(x):
     return x.detach().clone().requires_grad_(x.requires_grad)
 
 
+def skip_if_pytest(fn):
+    @functools.wraps(fn)
+    def wrapped(*args, **kwargs):
+        if "PYTEST_CURRENT_TEST" in os.environ:
+            raise unittest.SkipTest("does not work under pytest")
+        return fn(*args, **kwargs)
+
+    return wrapped
+
+
 def named_parameters_for_optimized_module(mod):
     assert isinstance(mod, eval_frame.OptimizedModule)
     return mod._orig_mod.named_parameters
@@ -46,7 +56,7 @@ def remove_optimized_module_prefix(name):
     prefix = "_orig_mod."
     assert name.startswith(prefix)
     name = name[len(prefix) :]
-    return torch.distributed.fsdp._common_utils.clean_tensor_name(name)
+    return name
 
 
 def collect_results(model, prediction, loss, example_inputs):
@@ -175,7 +185,7 @@ def __init__(self, backend):
         self.backend = backend
 
     def __call__(self, gm: torch.fx.GraphModule, example_inputs):
-        from torch._dynamo.eval_frame import lookup_backend
+        from .backends.registry import lookup_backend
 
         self.frame_count += 1
         for node in gm.graph.nodes:
diff --git a/torch/_dynamo/types.py b/torch/_dynamo/types.py
index 4abba9014df2..4ef9af8625ea 100644
--- a/torch/_dynamo/types.py
+++ b/torch/_dynamo/types.py
@@ -1,9 +1,17 @@
 import dataclasses
 import sys
 import types
-from typing import Callable, Dict, List, NamedTuple, Optional, OrderedDict, Union
+from typing import (
+    Callable,
+    Dict,
+    List,
+    NamedTuple,
+    Optional,
+    OrderedDict,
+    Protocol,
+    Union,
+)
 
-from typing_extensions import Protocol
 
 if sys.version_info >= (3, 11):
     from torch._C._dynamo import eval_frame
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index f9acade618af..0fb798a71852 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -5,11 +5,11 @@
 import dataclasses
 import datetime
 import dis
+import enum
 import functools
 import gc
 import inspect
 import itertools
-import logging
 import logging.config
 import math
 import operator
@@ -23,7 +23,7 @@
 import weakref
 from contextlib import contextmanager
 from functools import lru_cache, wraps
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 try:
     import numpy as np
@@ -33,7 +33,10 @@
     np = None  # type: ignore[assignment]
     HAS_NUMPY = False
 
+import importlib
+
 import torch
+import torch.fx.experimental.symbolic_shapes
 from torch import fx
 from torch._dispatch.python import enable_python_dispatcher
 from torch._subclasses.fake_tensor import FakeTensor
@@ -50,7 +53,6 @@
 # profiling compilation time
 compilation_metrics = collections.OrderedDict()
 
-
 timer_counter = itertools.count()
 
 
@@ -90,6 +92,7 @@ def profile_wrapper(*args, **kwargs):
 
 curr_frame = 0
 
+
 # Note: Called for you by dynamo - you almost never ever want to invoke this yourself.
 def increment_frame():
     global curr_frame
@@ -103,6 +106,14 @@ def reset_frame_count():
     curr_frame = 0
 
 
+op_count = 0
+
+
+def increment_op_count(cnt):
+    global op_count
+    op_count += cnt
+
+
 # Print a report of time spent so far
 # Ex:
 # TIMING:
@@ -186,7 +197,6 @@ def compile_times(repr="str", aggregate=False):
     """
 
     def fmt_fn(values, item_fn=lambda x: x):
-
         if aggregate:
             return item_fn(sum(values))
         return ", ".join(map(item_fn, values))
@@ -222,7 +232,7 @@ def fmt_fn(values, item_fn=lambda x: x):
 }
 
 
-class DuplicateWarningChecker(object):
+class DuplicateWarningChecker:
     def __init__(self, maxsize=4096):
         self.maxsize = maxsize
         self.reset()
@@ -690,8 +700,9 @@ def is_safe_constant(v):
             slice,
             type(type),
             torch.device,
+            torch.dtype,
         ),
-    )
+    ) or isinstance(v, enum.Enum)
 
 
 def check_constant_args(args, kwargs):
@@ -740,12 +751,33 @@ def tuple_iterator_getitem(it, index):
     return obj[start + index]
 
 
+def enum_repr(value):
+    # Workaround repr(Enum) returning invalid global reference before python 3.11
+    # https://peps.python.org/pep-0663/
+    if sys.version_info < (3, 11):
+        return str(value)
+    else:
+        return repr(value)
+
+
 def dict_param_key_ids(value):
-    return set([id(k) for k in value.keys() if isinstance(k, torch.nn.Parameter)])
+    return {id(k) for k in value.keys() if isinstance(k, torch.nn.Parameter)}
 
 
 def dict_const_keys(value):
-    return set(k for k in value.keys() if not isinstance(k, torch.nn.Parameter))
+    return {k for k in value.keys() if not isinstance(k, torch.nn.Parameter)}
+
+
+def dict_const_keys_repr(const_keys):
+    if any(isinstance(k, enum.Enum) for k in const_keys):
+        # To workaround repr(Enum) returning invalid global reference before python 3.11
+        # by calling enum_repr and removing quotes to render enum in guard code.
+        const_keys_str = f"{ {enum_repr(k) if isinstance(k, enum.Enum) else repr(k) for k in const_keys} }".replace(
+            "'", ""
+        )
+    else:
+        const_keys_str = f"{const_keys!r}"
+    return const_keys_str
 
 
 def global_key_name(key):
@@ -901,7 +933,7 @@ def same(
                 ):
                     # In the presence of noise, noise might dominate our error
                     # metric for smaller tensors.
-                    # Similary, for 1x1 kenerls, there seems to be high noise with amp.
+                    # Similary, for 1x1 kernels, there seems to be high noise with amp.
                     multiplier = 3.0
 
                 passes_test = res_error <= (multiplier * ref_error + tol / 10.0)
@@ -922,14 +954,14 @@ def same(
     elif isinstance(ref, float):
         r = math.isclose(ref, res, rel_tol=tol, abs_tol=tol)
         if not r:
-            log.error("Accuracy failed (float): {ref} != {res} (within tol={tol})")
+            log.error(f"Accuracy failed (float): {ref} != {res} (within tol={tol})")
         return r
     elif is_numpy_int_type(ref) or is_numpy_float_type(ref):
         if relax_numpy_equality:
             ref = ref.item()
         r = (type(ref) is type(res)) and (ref == res)
         if not r:
-            log.error("Accuracy failed (numpy): {ref} != {res}")
+            log.error(f"Accuracy failed (numpy): {ref} != {res}")
         return r
     elif is_numpy_ndarray(ref):
         return (type(ref) is type(res)) and (ref == res).all()
@@ -1031,7 +1063,7 @@ def recompile_reasons(code):
             rpt += "\n"
             rpt += "The following conditions caused torchdynamo to break out of tracing and fall back to python.\n"
             rpt += (
-                f"You may gain additional insight by passing `nopython=True` to {config.dynamo_import}.optimize, "
+                "You may gain additional insight by passing `nopython=True` to torch._dynamo.optimize, "
                 "to break on the first condition.\n"
             )
             graph_breaks = counters["graph_break"]
@@ -1056,7 +1088,7 @@ def recompile_reasons(code):
             )
             rpt += "\n"
             rpt += (
-                f"Set {config.dynamo_import}.config.cache_size_limit to "
+                f"Set torch._dynamo.config.cache_size_limit to "
                 f"{max_recompiles} to avoid being cache limited.\n"
             )
         else:
@@ -1068,7 +1100,13 @@ def recompile_reasons(code):
 # return same dir unless user changes config between calls
 @functools.lru_cache(None)
 def _get_debug_dir(root_dir):
-    dir_name = "run_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
+    dir_name = (
+        "run_"
+        + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
+        # use pid to avoid conflicts among ranks
+        + "-pid_"
+        + str(os.getpid())
+    )
     return os.path.join(root_dir, dir_name)
 
 
@@ -1123,14 +1161,15 @@ def visit(n: torch.fx.Node):
         if isinstance(
             cause, torch._subclasses.fake_tensor.DataDependentOutputException
         ):
-            if config.capture_scalar_outputs and node.target == "item":
-                return torch.zeros(size=(), dtype=args[0].dtype).item()
-            else:
-                unimplemented(f"data dependent operator: {cause.func}")
+            unimplemented(f"data dependent operator: {cause.func}")
         elif isinstance(
             cause, torch._subclasses.fake_tensor.DynamicOutputShapeException
         ):
             unimplemented(f"dynamic shape operator: {cause.func}")
+        elif isinstance(
+            cause, torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode
+        ):
+            unimplemented("guard on data-dependent symbolic int/float")
         raise TorchRuntimeError() from e
 
 
@@ -1240,3 +1279,95 @@ def fake_mode_from_tensors(inputs: List[Any]):
             else:
                 assert fake_mode is flat_input.fake_mode
     return fake_mode
+
+
+def fqn(obj: Any):
+    """
+    Returns the fully qualified name of the object.
+    """
+    return f"{obj.__module__}.{obj.__qualname__}"
+
+
+def ifdyn(count1, count2):
+    if torch._dynamo.config.dynamic_shapes:
+        return count1
+    else:
+        return count2
+
+
+def import_submodule(mod: types.ModuleType):
+    """
+    Ensure all the files in a given submodule are imported
+    """
+    for filename in sorted(os.listdir(os.path.dirname(mod.__file__))):
+        if filename.endswith(".py") and filename[0] != "_":
+            importlib.import_module(f"{mod.__name__}.{filename[:-3]}")
+
+
+def object_has_getattribute(value: Any):
+    try:
+        if isinstance(
+            inspect.getattr_static(type(value), "__getattribute__"),
+            types.FunctionType,
+        ):
+            return True
+    except AttributeError:
+        pass
+    return False
+
+
+def get_custom_getattr(value: Any):
+    try:
+        getattr_fn = inspect.getattr_static(type(value), "__getattr__")
+    except AttributeError:
+        getattr_fn = None
+    if getattr_fn is torch.nn.Module.__getattr__:
+        # ignore this case of getattr
+        getattr_fn = None
+    return getattr_fn
+
+
+class TensorStaticReason(enum.Enum):
+    NO_SOURCE = 1
+    PARAMETER = 2
+    CONFIG_NOT_DYN = 3
+    NOT_TENSOR = 4
+
+
+def tensor_static_reason_to_message(reason: TensorStaticReason):
+    if reason == TensorStaticReason.NO_SOURCE:
+        return "mark_dynamic usage without a source is illegal."
+    if reason == TensorStaticReason.PARAMETER:
+        return "mark_dynamic on parameter, parameters are always static today."
+    if reason == TensorStaticReason.CONFIG_NOT_DYN:
+        return "mark_dynamic usage with dynamic_shapes=False is not yet supported"
+    if reason == TensorStaticReason.NOT_TENSOR:
+        return "mark_dynamic on a non tensor, how did this happen?"
+    raise AssertionError(f"Illegal reason {reason}")
+
+
+def tensor_shape_should_be_static(
+    tensor: Union[torch.Tensor, Any], source: Optional["Source"], is_tensor: bool
+) -> Tuple[bool, TensorStaticReason]:
+    """
+    Given a tensor, source, and is_tensor flag, determine if a shape should be static.
+
+    Args:
+    tensor - the real tensor to evaluate, parameters force a static shape.
+    source - an optional source, None forces a static shape
+    is_tensor - internal dynamo check, esentially "is_tensor": target_cls is TensorVariable,
+    tensors not in a TensorVariable for whatever reason are forced static.
+
+    Returns a tuple, where the first element is the bool of whether or not this tensor should have a static shape.
+    The second element is a TensorStaticReason, useful for passing to tensor_static_reason_to_message if needed.
+    """
+    if source is None:
+        # TODO(voz): Look into why we need this case?
+        return True, TensorStaticReason.NO_SOURCE
+    if type(tensor) is torch.nn.Parameter:
+        return True, TensorStaticReason.PARAMETER
+    if config.dynamic_shapes is False:
+        return True, TensorStaticReason.CONFIG_NOT_DYN
+    if not is_tensor:
+        return True, TensorStaticReason.NOT_TENSOR
+    return False, None
diff --git a/torch/_dynamo/variables/__init__.py b/torch/_dynamo/variables/__init__.py
index c26b93320836..ee928f1a5f44 100644
--- a/torch/_dynamo/variables/__init__.py
+++ b/torch/_dynamo/variables/__init__.py
@@ -21,6 +21,8 @@
     BlackHoleVariable,
     ClosureVariable,
     ContextWrappingVariable,
+    CUDAStreamContextVariable,
+    CUDAStreamVariable,
     GetAttrVariable,
     GradModeVariable,
     InspectSignatureVariable,
@@ -35,8 +37,8 @@
 )
 from .nn_module import NNModuleVariable, UnspecializedNNModuleVariable
 from .tensor import (
-    DynamicShapeVariable,
     FakeItemVariable,
+    SymNodeVariable,
     TensorVariable,
     UnspecializedPythonVariable,
 )
diff --git a/torch/_dynamo/variables/base.py b/torch/_dynamo/variables/base.py
index 52161a8dbdcb..224c0c9a1b62 100644
--- a/torch/_dynamo/variables/base.py
+++ b/torch/_dynamo/variables/base.py
@@ -29,7 +29,7 @@ def __call__(cls, *args, **kwargs):
         return obj
 
 
-class VariableTracker(object, metaclass=HasPostInit):
+class VariableTracker(metaclass=HasPostInit):
     """
     Base class for tracked locals and stack values
 
@@ -222,7 +222,7 @@ def num_parameters(self):
         unimplemented(f"num_parameters: {self}")
 
     def call_hasattr(self, tx, name: str) -> "VariableTracker":
-        unimplemented(f"hasattr: {self}")
+        unimplemented(f"hasattr: {repr(self)}")
 
     def call_function(
         self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]"
@@ -259,7 +259,7 @@ def __init__(
         mutable_local: MutableLocal = None,
         recursively_contains: Optional[Set] = None,
     ):
-        super(VariableTracker, self).__init__()
+        super().__init__()
         self.guards = guards or set()
         self.source = source
         self.mutable_local = mutable_local
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index 83dffdf1339d..1bee600e1dbf 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -3,12 +3,10 @@
 import enum
 import functools
 import inspect
-import math
-import numbers
 import operator
 import re
 import types
-from typing import Any, Optional, Union
+from typing import Any, NamedTuple, Optional, Union
 
 import torch
 
@@ -30,6 +28,7 @@
     GlobalSource,
     GlobalWeakRefSource,
     is_constant_source,
+    LocalInputSource,
     LocalSource,
     RandomValueSource,
     Source,
@@ -44,18 +43,19 @@
     is_namedtuple,
     is_numpy_int_type,
     is_typing,
-    istensor,
     istype,
     np,
     odict_values,
     preserve_rng_state,
+    tensor_shape_should_be_static,
+    tensor_static_reason_to_message,
     tuple_iterator,
     tuple_iterator_getitem,
     tuple_iterator_len,
     wrap_fake_exception,
 )
 
-from .base import MutableLocal, typestr
+from .base import MutableLocal, typestr, VariableTracker
 from .builtin import BuiltinVariable
 from .constant import ConstantVariable, EnumVariable
 from .dicts import (
@@ -64,7 +64,7 @@
     DefaultDictVariable,
     HFPretrainedConfigVariable,
 )
-from .functions import UserFunctionVariable
+from .functions import UserFunctionVariable, UserMethodVariable
 from .lists import (
     ListIteratorVariable,
     ListVariable,
@@ -78,6 +78,7 @@
     AutogradFunctionContextVariable,
     AutogradFunctionVariable,
     ComptimeVariable,
+    CUDAStreamVariable,
     GetAttrVariable,
     InspectSignatureVariable,
     LambdaVariable,
@@ -88,8 +89,7 @@
 )
 from .nn_module import UnspecializedNNModuleVariable
 from .tensor import (
-    DynamicShapeVariable,
-    FakeItemVariable,
+    SymNodeVariable,
     TensorVariable,
     TensorWithTFOverrideVariable,
     UnspecializedPythonVariable,
@@ -113,7 +113,6 @@ class GraphArg:
     example: Any
     is_unspecialized: bool
     fake_tensor: Optional[torch._subclasses.fake_tensor.FakeTensor]
-
     # UnspecializedPythonVariable often masquerades as a tensor.
     # We MUST NOT generate shape guard code
     # that actually tries to access tensor properties on these values.
@@ -126,6 +125,11 @@ def __post_init__(self):
             assert isinstance(
                 self.fake_tensor, torch._subclasses.fake_tensor.FakeTensor
             )
+            # Mapping for downstream systems to remap back into dynamo arg positions
+            if isinstance(self.source, LocalInputSource):
+                if "graph_arg_pos" not in self.fake_tensor.__dict__:
+                    self.fake_tensor.__dict__["graph_arg_pos"] = []
+                self.fake_tensor.__dict__["graph_arg_pos"].append(self.source.pos)
         if isinstance(self.example, torch._subclasses.fake_tensor.FakeTensor):
             raise AssertionError("Fake Tensor observed in TorchDynamo Fx graph inputs")
 
@@ -158,7 +162,7 @@ def __init__(
         source: Source,
     ):
         assert source is not None
-        super(VariableBuilder, self).__init__()
+        super().__init__()
         self.tx = tx
         self.source = source
         self.name = source.name()
@@ -226,69 +230,122 @@ def make_guards(self, *guards):
             return None
         return {source.make_guard(guard) for guard in guards}
 
-    def _wrap(self, value):
+    @classmethod
+    @functools.lru_cache(None)
+    def _type_dispatch(cls):
+        # NB: Careful not to close over self to avoid ref cycle from lru_cache
+        entries = [
+            (
+                (torch.Tensor, torch.nn.Parameter, torch._subclasses.FakeTensor),
+                cls.wrap_tensor,
+            ),
+            ((torch.SymInt, torch.SymFloat), cls.wrap_sym),
+            ((tuple, list, odict_values), cls.wrap_listlike),
+            (tuple_iterator, cls.wrap_tuple_iterator),
+            ((slice, range), cls.wrap_slice_range),
+            (
+                (
+                    int,
+                    float,
+                    bool,
+                    type(None),
+                    str,
+                    torch.Size,
+                    torch.device,
+                    torch.dtype,
+                ),
+                cls.wrap_literal,
+            ),
+        ]
+
+        result = {}
+        for ts, fn in entries:
+            for t in ts if isinstance(ts, tuple) else (ts,):
+                assert t not in result
+                result[t] = fn
+
+        return result
+
+    @classmethod
+    @functools.lru_cache(None)
+    def _id_dispatch(cls):
         from ..comptime import comptime
 
+        entries = [
+            (
+                inspect.signature,
+                lambda self, value: LambdaVariable(
+                    InspectSignatureVariable.create,
+                    source=self.source,
+                    guards=self.make_guards(GuardBuilder.FUNCTION_MATCH),
+                ),
+            ),
+            (comptime, lambda self, value: ComptimeVariable()),
+            (
+                dataclasses.fields,
+                lambda self, value: LambdaVariable(
+                    _dataclasses_fields_lambda,
+                    source=self.source,
+                    guards=self.make_guards(GuardBuilder.FUNCTION_MATCH),
+                ),
+            ),
+            (
+                tensor_dunder_fns,
+                lambda self, value: TorchVariable(
+                    value,
+                    source=self.source,
+                    guards=self.make_guards(GuardBuilder.FUNCTION_MATCH),
+                ),
+            ),
+        ]
+
+        result = {}
+        for ts, fn in entries:
+            for t in ts if isinstance(ts, (tuple, list)) else (ts,):
+                assert t not in result
+                result[id(t)] = fn
+
+        return result
+
+    def _wrap(self, value):
         make_guards = self.make_guards
-        if istype(value, (torch.SymInt, torch.SymFloat)):
-            return self.wrap_sym(value)
-        if istensor(value):
+
+        # Handle exact type() match
+        type_dispatch = self._type_dispatch().get(type(value))
+        if type_dispatch is not None:
+            return type_dispatch(self, value)
+
+        # Handle exact id() match
+        id_dispatch = self._id_dispatch().get(id(value))
+        if id_dispatch is not None:
+            return id_dispatch(self, value)
+
+        # Everything else (NB: order matters!)
+        if istype(value, config.traceable_tensor_subclasses):
             return self.wrap_tensor(value)
-        elif istype(value, (tuple, list, odict_values)) or is_namedtuple(value):
-            # One can index a tensor with a list/tuple. Therefore, we need to
-            # have a stricter match.
-            if istype(value, (tuple, list)) and all(
-                [isinstance(x, int) or is_numpy_int_type(x) or x is None for x in value]
-            ):
-                guards = self.make_guards(GuardBuilder.EQUALS_MATCH)
-            else:
-                guards = self.make_guards(GuardBuilder.LIST_LENGTH)
-            output = [
-                VariableBuilder(self.tx, GetItemSource(self.get_source(), i))(
-                    item
-                ).add_guards(guards)
-                for i, item in enumerate(value)
-            ]
-            result = self.list_type(value)(output, guards=guards)
-            if istype(value, list):
-                return self.tx.output.side_effects.track_list(
-                    self.source, value, result
-                )
-            return result
-        elif istype(value, tuple_iterator):
-            guards = self.make_guards(GuardBuilder.TUPLE_ITERATOR_LEN)
-            output = [
-                VariableBuilder(
-                    self.tx, TupleIteratorGetItemSource(self.get_source(), i)
-                )(tuple_iterator_getitem(value, i)).add_guards(guards)
-                for i in range(tuple_iterator_len(value))
-            ]
-            return ListIteratorVariable(
-                output, mutable_local=MutableLocal(), guards=guards
-            )
-        elif istype(value, (slice, range)):
-            items = [
-                VariableBuilder(self.tx, AttrSource(self.get_source(), k))(
-                    getattr(value, k)
-                )
-                for k in ("start", "stop", "step")
-            ]
-            if isinstance(value, slice):
-                return SliceVariable(items, guards=make_guards(GuardBuilder.TYPE_MATCH))
-            else:
-                return RangeVariable(
-                    items, guards=make_guards(GuardBuilder.EQUALS_MATCH)
-                )
+        elif is_namedtuple(value):
+            return self.wrap_listlike(value)
         elif istype(
             value, (dict, collections.defaultdict, collections.OrderedDict)
         ) and all(
             map(
                 lambda k: ConstantVariable.is_literal(k)
-                or self.tensor_can_be_dict_key(k),
+                or self.tensor_can_be_dict_key(k)
+                or isinstance(k, enum.Enum),
                 value.keys(),
             )
         ):
-            guards = self.make_guards(GuardBuilder.DICT_KEYS)
+            if not value and self.get_source().is_nn_module():
+                # It is faster to guard on 'false' property than to guard
+                # on actual dict keys, but we can't do this fast guard in general because
+                # it omits a crucial type check that ensures the value is actually still a dict at runtime.
+
+                # Why is this OK for (specialized) nnmodules? We set up a setattr hook
+                # to check for module property mutations, which does a reasonable,
+                # but not completely secure job ensuring a property wasn't changed.
+                guards = self.make_guards(GuardBuilder.BOOL_FALSE)
+            else:
+                guards = self.make_guards(GuardBuilder.DICT_KEYS)
 
             # store key variables in global location for reconstruction
             for key in value.keys():
@@ -301,17 +358,12 @@ def index_source(key):
                 else:
                     return key
 
-            result = dict(
-                [
-                    (
-                        k,
-                        VariableBuilder(
-                            self.tx, GetItemSource(self.get_source(), index_source(k))
-                        )(value[k]).add_guards(guards),
-                    )
-                    for k in value.keys()
-                ]
-            )
+            result = {
+                k: VariableBuilder(
+                    self.tx, GetItemSource(self.get_source(), index_source(k))
+                )(value[k]).add_guards(guards)
+                for k in value.keys()
+            }
 
             if istype(value, collections.defaultdict):
                 result = DefaultDictVariable(
@@ -322,71 +374,10 @@ def index_source(key):
 
             return self.tx.output.side_effects.track_dict(self.source, value, result)
         elif isinstance(value, torch.nn.Module):
-            if (
-                isinstance(value, (torch.nn.RNN, torch.nn.GRU, torch.nn.LSTM))
-                and not config.allow_rnn
-            ):
-                unimplemented("TorchDynamo purposely graph breaks on RNN, GRU, LSTMs")
-            if mutation_guard.is_dynamic_nn_module(value):
-                # created dynamically, don't specialize on it
-                result = UnspecializedNNModuleVariable(
-                    value, guards=make_guards(GuardBuilder.TYPE_MATCH)
-                )
-                if not SideEffects.cls_supports_mutation_side_effects(type(value)):
-                    # don't allow STORE_ATTR mutation with custom __setattr__
-                    return result
-                return self.tx.output.side_effects.track_object_existing(
-                    self.source, value, result
-                )
-            elif getattr(value, "_is_fsdp_managed_module", False) or issubclass(
-                value.__class__, torch.nn.parallel.distributed.DistributedDataParallel
-            ):
-                if getattr(value, "_is_fsdp_managed_module", False):
-                    # Note: we can't do this assert inside FSDP constructor,
-                    # since we don't know yet whether dynamo will be used
-                    assert getattr(
-                        value, "_fsdp_use_orig_params", False
-                    ), "Dynamo only supports FSDP with use_orig_params=True"
-
-                # See note [Dynamo treats FSDP wrapped modules as UnspecializedNNModule]
-                # in fully_sharded_data_parallel.py for more information
-                return UnspecializedNNModuleVariable(
-                    value, guards=make_guards(GuardBuilder.TYPE_MATCH)
-                )
-            else:
-                return self.tx.output.register_attr_or_module(
-                    value,
-                    self.name,
-                    source=self.get_source(),
-                    # Guards are added inside register_attr_or_module
-                )
-        elif ConstantVariable.is_literal(value) or istype(
-            value, (torch.Size, torch.device, torch.dtype)
-        ):
-            if type(value) in (int, float) and not config.specialize_int_float:
-                # unspecializing int/float by default, but still
-                # specialize for the following conditions
-                if (
-                    value in self._common_constants()
-                    or isinstance(self.source, GlobalSource)
-                    or isinstance(self.source, GetItemSource)
-                    or (
-                        isinstance(self.source, AttrSource)
-                        and isinstance(self.source.base, GlobalSource)
-                    )
-                ):
-                    return ConstantVariable(
-                        value=value,
-                        guards=make_guards(GuardBuilder.CONSTANT_MATCH),
-                    )
-                else:
-                    return self.wrap_unspecialized_primitive(value)
-            else:
-                return ConstantVariable(
-                    value=value,
-                    guards=make_guards(GuardBuilder.CONSTANT_MATCH),
-                )
-        elif isinstance(value, frozenset) and (
+            return self.wrap_module(value)
+        elif ConstantVariable.is_literal(value):  # non-atomic literals
+            return self.wrap_literal(value)
+        elif istype(value, frozenset) and (
             all(is_allowed(x) or ConstantVariable.is_literal(x) for x in value)
         ):
             # For frozenset, we can guard by object ID instead of value
@@ -421,20 +412,6 @@ def index_source(key):
                 source=self.source,
                 guards=make_guards(GuardBuilder.ID_MATCH),
             )
-        elif value is inspect.signature:
-            return LambdaVariable(
-                InspectSignatureVariable.create,
-                source=self.source,
-                guards=make_guards(GuardBuilder.FUNCTION_MATCH),
-            )
-        elif value is comptime:
-            return ComptimeVariable()
-        elif value is dataclasses.fields:
-            return LambdaVariable(
-                _dataclasses_fields_lambda,
-                source=self.source,
-                guards=make_guards(GuardBuilder.FUNCTION_MATCH),
-            )
         elif is_numpy(value):
             return NumpyVariable(
                 value,
@@ -445,12 +422,6 @@ def index_source(key):
                     else GuardBuilder.TYPE_MATCH
                 ),
             )
-        elif value in tensor_dunder_fns:
-            return TorchVariable(
-                value,
-                source=self.source,
-                guards=make_guards(GuardBuilder.FUNCTION_MATCH),
-            )
         elif (
             istype(value, (type, types.FunctionType))
             and skipfiles.check(getfile(value), allow_torch=True)
@@ -461,13 +432,8 @@ def index_source(key):
                 source=self.source,
                 guards=make_guards(GuardBuilder.FUNCTION_MATCH),
             )
-        elif value in tensor_dunder_fns:
-            return TorchVariable(
-                value,
-                source=self.source,
-                guards=make_guards(GuardBuilder.FUNCTION_MATCH),
-            )
-        elif istype(value, types.FunctionType):
+        # NB: These can't be put in type_dispatch, they have to run later
+        elif istype(value, (types.FunctionType, torch.jit.ScriptFunction)):
             return UserFunctionVariable(
                 value,
                 source=self.source,
@@ -479,7 +445,7 @@ def index_source(key):
                 source=self.source,
                 guards=make_guards(GuardBuilder.PYMODULE_MATCH),
             )
-        elif type(value) is torch.autograd.function.FunctionMeta:
+        elif istype(value, torch.autograd.function.FunctionMeta):
             return AutogradFunctionVariable(
                 value,
                 source=self.source,
@@ -490,8 +456,9 @@ def index_source(key):
             return AutogradFunctionContextVariable()
         elif (
             isinstance(value, types.MethodType)
-            and type(getattr(value, "__self__", None))
-            is torch.autograd.function.FunctionMeta
+            and istype(
+                getattr(value, "__self__", None), torch.autograd.function.FunctionMeta
+            )
             and getattr(value, "__name__", "") == "apply"
             and value == getattr(value.__self__, "apply", None)
         ):
@@ -504,9 +471,7 @@ def index_source(key):
                 ),
                 "apply",
             )
-        elif isinstance(value, (int, float)) or (
-            HAS_NUMPY and (isinstance(value, np.number))
-        ):
+        elif HAS_NUMPY and isinstance(value, np.number):
             return self.wrap_unspecialized_primitive(value)
         elif DataClassVariable.is_matching_object(value):
             return DataClassVariable.wrap(self, value).add_guards(
@@ -530,6 +495,13 @@ def index_source(key):
                 value,
                 guards=make_guards(GuardBuilder.FUNCTION_MATCH),
             )
+        elif isinstance(value, torch.cuda.streams.Stream):
+            return CUDAStreamVariable(
+                None,
+                value,
+                source=self.source,
+                guards=self.make_guards(GuardBuilder.ID_MATCH),
+            )
         elif issubclass(type(value), type):
             # TODO(whc) the following seems preferable but breaks some tests, debug
             # elif inspect.isclass(value):
@@ -538,6 +510,32 @@ def index_source(key):
                 source=self.source,
                 guards=make_guards(GuardBuilder.FUNCTION_MATCH),
             )
+        elif isinstance(value, types.MethodType) and isinstance(
+            value.__self__, torch.nn.Module
+        ):
+            # don't let MethodTypes fall through to UserDefinedObject,
+            # which doesn't support 'CALL_FUNCTION'
+
+            # TODO(whc): Why do we limit this to methods on NNModules?
+            # I don't have a good reason for this, but it preserves the existing behavior
+            # for MBartForConditionalGeneration, which generates many graph breaks and OOMs otherwise.
+            # I suspect we probably want to relax this check and dig deeper there.
+
+            # In order to construct a MethodVariable in Dynamo, we start with an actual method obj from python,
+            # but need to separately wrap its underlying `__func__` and its `self` argument.  We wrap `self` here
+            # and then `__func__` gets wrapped inside UserMethodVariable.
+            self_obj = VariableBuilder(
+                self.tx, source=AttrSource(self.source, "__self__")
+            )(value.__self__)
+            assert self_obj and isinstance(
+                self_obj, VariableTracker
+            ), "Failed to produce a valid self obj"
+            return UserMethodVariable(
+                value.__func__,
+                self_obj,
+                source=self.source,
+                guards=make_guards(GuardBuilder.FUNCTION_MATCH),
+            )
         else:
             result = UserDefinedObjectVariable(
                 value,
@@ -582,26 +580,135 @@ def tensor_should_specialize(self):
 
     def wrap_sym(self, value: Union[torch.SymInt, torch.SymFloat]):
         if not is_constant_source(self.get_source()):
-            self.tx.output.graphargs.append(
-                GraphArg(self.get_source(), value, False, None)
-            )
+            self.tx.output.add_grapharg(GraphArg(self.get_source(), value, False, None))
         elif is_constant_source(self.get_source()):
             return self.tx.output.register_attr_or_module(
                 value,
                 re.sub(r"[^a-zA-Z0-9]+", "_", self.name),
                 source=None,
-                dyn_shape=value
+                sym_num=value
                 # shape Guards live their own rich life via shape_env
             )
-        return DynamicShapeVariable.create(
+        return SymNodeVariable.create(
             tx=self.tx,
             proxy=self.tx.output.create_graph_input(
                 re.sub(r"[^a-zA-Z0-9]+", "_", self.name), type(value)
             ),
-            dyn_shape=value
+            sym_num=value
             # shape Guards live their own rich life via shape_env
         )
 
+    def wrap_listlike(self, value: Union[tuple, list, odict_values, NamedTuple]):
+        # One can index a tensor with a list/tuple. Therefore, we need to
+        # have a stricter match.
+        if istype(value, (tuple, list)) and all(
+            [isinstance(x, int) or is_numpy_int_type(x) or x is None for x in value]
+        ):
+            guards = self.make_guards(GuardBuilder.EQUALS_MATCH)
+        else:
+            guards = self.make_guards(GuardBuilder.LIST_LENGTH)
+        output = [
+            VariableBuilder(self.tx, GetItemSource(self.get_source(), i))(
+                item
+            ).add_guards(guards)
+            for i, item in enumerate(value)
+        ]
+        result = self.list_type(value)(output, guards=guards)
+        if istype(value, list):
+            return self.tx.output.side_effects.track_list(self.source, value, result)
+        return result
+
+    def wrap_tuple_iterator(self, value: tuple_iterator):
+        guards = self.make_guards(GuardBuilder.TUPLE_ITERATOR_LEN)
+        output = [
+            VariableBuilder(self.tx, TupleIteratorGetItemSource(self.get_source(), i))(
+                tuple_iterator_getitem(value, i)
+            ).add_guards(guards)
+            for i in range(tuple_iterator_len(value))
+        ]
+        return ListIteratorVariable(output, mutable_local=MutableLocal(), guards=guards)
+
+    def wrap_slice_range(self, value: Union[slice, range]):
+        items = [
+            VariableBuilder(self.tx, AttrSource(self.get_source(), k))(
+                getattr(value, k)
+            )
+            for k in ("start", "stop", "step")
+        ]
+        if isinstance(value, slice):
+            return SliceVariable(
+                items, guards=self.make_guards(GuardBuilder.TYPE_MATCH)
+            )
+        else:
+            return RangeVariable(
+                items, guards=self.make_guards(GuardBuilder.EQUALS_MATCH)
+            )
+
+    def wrap_module(self, value: torch.nn.Module):
+        if (
+            isinstance(value, (torch.nn.RNN, torch.nn.GRU, torch.nn.LSTM))
+            and not config.allow_rnn
+        ):
+            unimplemented("TorchDynamo purposely graph breaks on RNN, GRU, LSTMs")
+        if mutation_guard.is_dynamic_nn_module(value):
+            # created dynamically, don't specialize on it
+            result = UnspecializedNNModuleVariable(
+                value, guards=self.make_guards(GuardBuilder.TYPE_MATCH)
+            )
+            if not SideEffects.cls_supports_mutation_side_effects(type(value)):
+                # don't allow STORE_ATTR mutation with custom __setattr__
+                return result
+            return self.tx.output.side_effects.track_object_existing(
+                self.source, value, result
+            )
+        elif getattr(value, "_is_fsdp_managed_module", False) or issubclass(
+            value.__class__, torch.nn.parallel.distributed.DistributedDataParallel
+        ):
+            if getattr(value, "_is_fsdp_managed_module", False):
+                # Note: we can't do this assert inside FSDP constructor,
+                # since we don't know yet whether dynamo will be used
+                assert getattr(
+                    value, "_fsdp_use_orig_params", False
+                ), "Dynamo only supports FSDP with use_orig_params=True"
+
+            # See note [Dynamo treats FSDP wrapped modules as UnspecializedNNModule]
+            # in fully_sharded_data_parallel.py for more information
+            return UnspecializedNNModuleVariable(
+                value, guards=self.make_guards(GuardBuilder.TYPE_MATCH)
+            )
+        else:
+            return self.tx.output.register_attr_or_module(
+                value,
+                self.name,
+                source=self.get_source(),
+                # Guards are added inside register_attr_or_module
+            )
+
+    def wrap_literal(self, value):
+        if type(value) in (int, float) and not config.specialize_int_float:
+            # unspecializing int/float by default, but still
+            # specialize for the following conditions
+            if (
+                value in self._common_constants()
+                or isinstance(self.source, GlobalSource)
+                or isinstance(self.source, GetItemSource)
+                or (
+                    isinstance(self.source, AttrSource)
+                    and isinstance(self.source.base, GlobalSource)
+                )
+            ):
+                return ConstantVariable(
+                    value=value,
+                    guards=self.make_guards(GuardBuilder.CONSTANT_MATCH),
+                )
+            else:
+                return self.wrap_unspecialized_primitive(value)
+        else:
+            return ConstantVariable(
+                value=value,
+                guards=self.make_guards(GuardBuilder.CONSTANT_MATCH),
+            )
+
     def wrap_tensor(self, value: torch.Tensor):
         if self.get_source().guard_source().is_nn_module():
             return self.tx.output.register_attr_or_module(
@@ -643,17 +750,20 @@ def wrap_tensor(self, value: torch.Tensor):
             assert type(value) in (torch.Tensor, torch.nn.Parameter)
             ignore_subclass = False
 
+        tensor_proxy = self.tx.output.create_graph_input(
+            re.sub(r"[^a-zA-Z0-9]+", "_", self.name), type(value)
+        )
         tensor_variable = wrap_fx_proxy(
             tx=self.tx,
-            proxy=self.tx.output.create_graph_input(
-                re.sub(r"[^a-zA-Z0-9]+", "_", self.name), type(value)
-            ),
+            proxy=tensor_proxy,
             example_value=value,
             guards=self.make_guards(GuardBuilder.TENSOR_MATCH),
             should_specialize=self.tensor_should_specialize(),
             ignore_subclass=ignore_subclass,
             source=self.get_source(),
         )
+        assert "tensor_dict" not in tensor_proxy.node.meta
+        tensor_proxy.node.meta["tensor_dict"] = value.__dict__.copy()
 
         # TODO: I think the result is guaranteed to be fake with
         # ignore_subclass changes
@@ -662,7 +772,7 @@ def wrap_tensor(self, value: torch.Tensor):
         if isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor):
             fake_tensor_value = example_value
 
-        self.tx.output.graphargs.append(
+        self.tx.output.add_grapharg(
             GraphArg(self.get_source(), value, False, fake_tensor_value)
         )
 
@@ -692,7 +802,7 @@ def wrap_unspecialized_primitive(self, value):
             ):
                 shape_env = self.tx.output.shape_env
                 wrapped_value = shape_env.create_symintnode(
-                    shape_env.create_symbol(value, source=self.source)
+                    shape_env.create_symbol(value, source=self.source), hint=value
                 )
                 self.tx.output.tracked_fakes.append(
                     TrackedFake(wrapped_value, self.source)
@@ -727,7 +837,7 @@ def wrap_unspecialized_primitive(self, value):
                 example_value = unspec_var.proxy.node.meta["example_value"]
                 if isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor):
                     fake_tensor_value = example_value
-                self.tx.output.graphargs.append(
+                self.tx.output.add_grapharg(
                     GraphArg(
                         self.get_source(),
                         wrapped_value,
@@ -864,13 +974,13 @@ def _clone_input(value):
         return UserDefinedObjectVariable(example_value)
     elif istype(example_value, (int, bool, float)) and config.dynamic_shapes:
         proxy.node.meta["example_value"] = example_value
-        return DynamicShapeVariable.create(tx, proxy, example_value, **options)
+        return SymNodeVariable.create(tx, proxy, example_value, **options)
     elif istype(example_value, torch.Size) and config.dynamic_shapes:
         proxy.node.meta["example_value"] = example_value
         sizes = []
         for i, v in enumerate(example_value):
             proxy_i = proxy[i]
-            sizes.append(DynamicShapeVariable.create(tx, proxy_i, v, **options))
+            sizes.append(SymNodeVariable.create(tx, proxy_i, v, **options))
         return SizeVariable(sizes, proxy, **options)
     elif istype(example_value, int) and proxy.node.target in (
         torch.seed,
@@ -881,7 +991,7 @@ def _clone_input(value):
     ):
         if config.dynamic_shapes:
             proxy.node.meta["example_value"] = example_value
-            return DynamicShapeVariable.create(tx, proxy, example_value, **options)
+            return SymNodeVariable.create(tx, proxy, example_value, **options)
         else:
             return ConstantVariable(example_value, **options)
     elif istype(example_value, torch.Size) and all(
@@ -926,22 +1036,12 @@ def _clone_input(value):
     ):
         proxy.node.meta["example_value"] = example_value
         return ConstantVariable(example_value, **options)
-    elif (
-        isinstance(example_value, numbers.Number)
-        and (proxy.node.target == "item" or proxy.node.target in {math.sqrt, math.pow})
-        and config.capture_scalar_outputs
-    ):
-        # item raw value should not be accessed
-        return wrap_fx_proxy_cls(
-            FakeItemVariable,
-            tx=tx,
-            proxy=proxy,
-            example_value=torch.tensor(example_value),
-            **options,
-        )
     elif isinstance(example_value, (torch.SymInt, torch.SymFloat)):
         proxy.node.meta["example_value"] = example_value
-        return DynamicShapeVariable(proxy, example_value, **options)
+        return SymNodeVariable(proxy, example_value, **options)
+    elif proxy.node.target in [torch.cuda.streams.Stream, torch.cuda.current_stream]:
+        proxy.node.meta["example_value"] = example_value
+        return CUDAStreamVariable(proxy, example_value, **options)
     else:
         unimplemented(
             "torch.* op returned non-Tensor "
@@ -963,12 +1063,8 @@ def wrap_to_fake_tensor_and_record(
     if type(e) in (torch.Tensor, torch.nn.Parameter) or (
         ignore_subclass and isinstance(e, torch.Tensor)
     ):
-        static_shapes = (
-            source is None
-            or type(e) is torch.nn.Parameter
-            or config.dynamic_shapes is False
-            or not is_tensor
-        )
+        static_shapes, reason = tensor_shape_should_be_static(e, source, is_tensor)
+
         fake_e = wrap_fake_exception(
             lambda: tx.fake_mode.from_tensor(
                 e,
@@ -977,6 +1073,9 @@ def wrap_to_fake_tensor_and_record(
                 source=source,
             )
         )
+        if hasattr(e, "_dynamo_dynamic_indices"):
+            fake_e._dynamo_dynamic_indices = e._dynamo_dynamic_indices
+            assert not static_shapes, tensor_static_reason_to_message(reason)
         if is_tensor:
             tx.output.tracked_fakes.append(TrackedFake(fake_e, source))
         return fake_e
diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py
index 65585385701c..a302124c1ae3 100644
--- a/torch/_dynamo/variables/builtin.py
+++ b/torch/_dynamo/variables/builtin.py
@@ -23,9 +23,12 @@
     proxy_args_kwargs,
     specialize_args_kwargs,
 )
-from .base import MutableLocal, VariableTracker
+from .base import MutableLocal, typestr, VariableTracker
+from .constant import ConstantVariable
 from .dicts import ConstDictVariable
-from .tensor import DynamicShapeVariable, FakeItemVariable, UnspecializedPythonVariable
+from .lists import BaseListVariable, ListVariable, TupleVariable
+from .tensor import FakeItemVariable, SymNodeVariable, UnspecializedPythonVariable
+from .user_defined import UserDefinedVariable
 
 log = logging.getLogger(__name__)
 
@@ -136,11 +139,233 @@ def _fx_graph_functions():
         }
         return fns
 
+    @staticmethod
+    @functools.lru_cache(None)
+    def _binops():
+        # function -> ([forward name, reverse name, in-place name], in-place op)
+        fns = {
+            operator.add: (["__add__", "__radd__", "__iadd__"], operator.iadd),
+            operator.sub: (["__sub__", "__rsub__", "__isub__"], operator.isub),
+            operator.mul: (["__mul__", "__rmul__", "__imul__"], operator.imul),
+            operator.truediv: (
+                ["__truediv__", "__rtruediv__", "__itruediv__"],
+                operator.itruediv,
+            ),
+            operator.floordiv: (
+                ["__floordiv__", "__rfloordiv__", "__ifloordiv__"],
+                operator.ifloordiv,
+            ),
+            operator.mod: (["__mod__", "__rmod__", "__imod__"], operator.imod),
+            pow: (["__pow__", "__rpow__", "__ipow__"], operator.ipow),
+            operator.pow: (["__pow__", "__rpow__", "__ipow__"], operator.ipow),
+            # NB: The follow binary operators are not supported for now, since the
+            # corresponding magic methods aren't defined on SymInt / SymFloat:
+            # operator.matmul
+            # divmod
+            # operator.lshift
+            # operator.rshift
+            # operator.and_
+            # operator.or_
+            # operator.xor
+        }
+        return fns
+
+    @staticmethod
+    @functools.lru_cache(None)
+    def _binop_handlers():
+        # Multiple dispatch mechanism defining custom binop behavior for certain type
+        # combinations. Handlers are attempted in order, and will be used if the type checks
+        # match. They are expected to have the signature:
+        # fn(tx, arg0: VariableTracker, arg1: VariableTracker, options) -> VariableTracker
+
+        # Override table contains: op_fn -> [list of handlers]
+        op_handlers = {}
+        for (
+            op,
+            (magic_method_names, in_place_op),
+        ) in BuiltinVariable._binops().items():
+            op_handlers[op] = []
+            op_handlers[in_place_op] = []
+
+            forward_name, reverse_name, inplace_name = magic_method_names
+
+            # User-defined args (highest precedence)
+            def user_defined_handler(
+                tx,
+                a,
+                b,
+                options,
+                forward_name=forward_name,
+                reverse_name=reverse_name,
+            ):
+                # Manually handle reversing logic if needed (e.g. call __radd__)
+
+                # TODO: If we expand this to handle tensor args, we need to manually
+                # handle cases like this:
+                #
+                # class A(int):
+                #     def __radd__(self, other):
+                #         print("woof")
+                # torch.randn(3) + A(3)
+                #
+                # In this example, A.__radd__() is not called -> nothing is printed, because
+                # Tensor.__add__ only does a subtype test against int, ignoring the subclass.
+                # To be fully correct, we should not call A.__radd__() here, and there may be
+                # other cases to reason about and add exceptions for.
+                if isinstance(a, UserDefinedVariable):
+                    return a.call_method(tx, forward_name, [b], {})
+                else:
+                    return b.call_method(tx, reverse_name, [a], {})
+
+            op_handlers[op].append(
+                ((UserDefinedVariable, VariableTracker), user_defined_handler)
+            )
+            op_handlers[op].append(
+                ((VariableTracker, UserDefinedVariable), user_defined_handler)
+            )
+
+            def user_defined_inplace_handler(
+                tx, a, b, options, forward_name=inplace_name
+            ):
+                return a.call_method(tx, forward_name, [b], {})
+
+            op_handlers[in_place_op].append(
+                ((UserDefinedVariable, VariableTracker), user_defined_inplace_handler)
+            )
+            op_handlers[in_place_op].append(
+                ((VariableTracker, UserDefinedVariable), user_defined_inplace_handler)
+            )
+
+            # Dynamic shape args
+            def dynamic_handler(tx, a, b, options, fn=op):
+                from .builder import wrap_fx_proxy
+
+                return wrap_fx_proxy(
+                    tx,
+                    tx.output.create_proxy(
+                        "call_function", fn, *proxy_args_kwargs([a, b], {})
+                    ),
+                    **options,
+                )
+
+            op_handlers[op].append(
+                ((SymNodeVariable, VariableTracker), dynamic_handler)
+            )
+            op_handlers[op].append(
+                ((VariableTracker, SymNodeVariable), dynamic_handler)
+            )
+
+            # NB: Prefer out-of-place op when calling in-place op to generate valid graph
+            op_handlers[in_place_op].append(
+                ((SymNodeVariable, VariableTracker), dynamic_handler)
+            )
+            op_handlers[in_place_op].append(
+                ((VariableTracker, SymNodeVariable), dynamic_handler)
+            )
+
+        # Special cases - lower precedence but still prefer these over constant folding
+
+        # List-like addition (e.g. [1, 2] + [3, 4])
+        def tuple_add_handler(tx, a, b, options):
+            return TupleVariable(a.items + list(b.unpack_var_sequence(tx)), **options)
+
+        list_like_addition_handlers = [
+            # NB: Prefer the tuple-specific logic over base logic because of
+            # some SizeVariable weirdness. Specifically, the tuple-specific logic
+            # drops the subclass type (e.g. SizeVariable) and returns TupleVariables.
+            (
+                (TupleVariable, TupleVariable),
+                tuple_add_handler,
+            ),
+            (
+                (TupleVariable, ConstantVariable),
+                tuple_add_handler,
+            ),
+            (
+                (ConstantVariable, TupleVariable),
+                lambda tx, a, b, options: TupleVariable(
+                    list(a.unpack_var_sequence(tx)) + b.items, **options
+                ),
+            ),
+            (
+                (BaseListVariable, BaseListVariable),
+                lambda tx, a, b, options: type(a)(a.items + b.items, **options),
+            ),
+        ]
+        op_handlers[operator.add].extend(list_like_addition_handlers)
+
+        def list_iadd_handler(tx, a, b, options):
+            if not a.mutable_local or not b.has_unpack_var_sequence(tx):
+                # Handler doesn't apply
+                return None
+
+            return tx.replace_all(
+                a,
+                ListVariable(
+                    list(a.items) + list(b.unpack_var_sequence(tx)),
+                    regen_guards=False,
+                    **options,
+                ),
+            )
+
+        list_like_iadd_handlers = [
+            (
+                (ListVariable, VariableTracker),
+                list_iadd_handler,
+            ),
+            (
+                (TupleVariable, TupleVariable),
+                tuple_add_handler,
+            ),
+            (
+                (TupleVariable, ConstantVariable),
+                tuple_add_handler,
+            ),
+        ]
+        op_handlers[operator.iadd].extend(list_like_iadd_handlers)
+
+        # List-like expansion (e.g. [1, 2, 3] * 3)
+        def expand_list_like(tx, lst, const, options):
+            return lst.__class__(
+                items=lst.items * const.as_python_constant(),
+                mutable_local=MutableLocal(),
+                **options,
+            )
+
+        list_like_expansion_handlers = [
+            ((ListVariable, ConstantVariable), expand_list_like),
+            ((TupleVariable, ConstantVariable), expand_list_like),
+            (
+                (ConstantVariable, ListVariable),
+                lambda tx, a, b, options: expand_list_like(tx, b, a, options),
+            ),
+            (
+                (ConstantVariable, TupleVariable),
+                lambda tx, a, b, options: expand_list_like(tx, b, a, options),
+            ),
+        ]
+        op_handlers[operator.mul].extend(list_like_expansion_handlers)
+
+        return op_handlers
+
+    @staticmethod
+    def _find_binop_handler(op, a, b):
+        handlers = BuiltinVariable._binop_handlers()
+        if op not in handlers:
+            return None
+
+        # Return first handler that matches the type checks
+        for ((type1, type2), handler) in handlers[op]:
+            if isinstance(a, type1) and isinstance(b, type2):
+                return handler
+
+        return None
+
     def can_insert_in_graph(self):
         return self.fn in self._fx_graph_functions()
 
     def __init__(self, fn, **kwargs):
-        super(BuiltinVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.fn = fn
 
     def __str__(self):
@@ -161,7 +386,7 @@ def reconstruct(self, codegen):
         name = self.fn.__name__
         assert self.fn.__module__ == "builtins"
         assert name not in codegen.tx.f_globals, "shadowed global"
-        return [codegen.create_load_global(name, add=True)]
+        return [codegen.create_load_global(name, False, add=True)]
 
     def constant_args(self, *args, **kwargs):
         return check_constant_args(args, kwargs)
@@ -278,6 +503,8 @@ def call_function(
                         need_unwrap=need_unwrap,
                         **options,
                     )
+                elif all(isinstance(x, SymNodeVariable) for x in args):
+                    return SymNodeVariable.create(tx, proxy, None, **options)
                 else:
                     # Work around for vision_maskrcnn due to precision difference
                     # specialize the dividend when float divide by tensor
@@ -292,7 +519,7 @@ def call_function(
 
         # Handle cases like int(torch.seed())
         # Also handle sym_float to sym_int cases
-        if self.fn in (int, float) and isinstance(args[0], DynamicShapeVariable):
+        if self.fn in (int, float) and isinstance(args[0], SymNodeVariable):
             fn_ = sym_int if self.fn is int else sym_float
             out = wrap_fx_proxy(
                 tx=tx,
@@ -306,6 +533,18 @@ def call_function(
             )
             return out
 
+        # Handle binary ops (e.g. __add__ / __radd__, __iadd__, etc.)
+        # NB: Tensor args are handled above and not here
+        if len(kwargs) == 0 and len(args) == 2:
+            # Try to find a handler for the arg types; otherwise, fall through to constant handler
+            binop_handler = BuiltinVariable._find_binop_handler(
+                self.fn, args[0], args[1]
+            )
+            if binop_handler:
+                res = binop_handler(tx, args[0], args[1], options)
+                if res is not None:
+                    return res
+
         handler = getattr(self, f"call_{self.fn.__name__}", None)
         if handler:
             try:
@@ -340,7 +579,24 @@ def call_function(
             )
         return super().call_function(tx, args, kwargs)
 
-    def _call_min_max(self, tx, a, b):
+    def _call_min_max(self, tx, *args):
+        if len(args) == 1 and args[0].has_unpack_var_sequence(tx):
+            # expand iterable
+            items = args[0].unpack_var_sequence(tx)
+            return self._call_min_max_seq(tx, items)
+        elif len(args) == 2:
+            return self._call_min_max_binary(tx, args[0], args[1])
+        elif len(args) > 2:
+            return self._call_min_max_seq(tx, args)
+
+    def _call_min_max_seq(self, tx, items):
+        assert len(items) > 0
+        if len(items) == 1:
+            return items[0]
+
+        return functools.reduce(functools.partial(self._call_min_max_binary, tx), items)
+
+    def _call_min_max_binary(self, tx, a, b):
         if self.tensor_args(a, b):
             if not isinstance(a, variables.TensorVariable):
                 a, b = b, a
@@ -351,7 +607,7 @@ def _call_min_max(self, tx, a, b):
                 a = variables.TorchVariable(torch.tensor).call_function(tx, [a], {})
 
             # Dynamic input does not get resolved, rather, gets stored as call_function
-            if isinstance(a, DynamicShapeVariable):
+            if isinstance(a, SymNodeVariable):
                 from .builder import wrap_fx_proxy
 
                 return wrap_fx_proxy(
@@ -416,11 +672,11 @@ def _call_min_max(self, tx, a, b):
                 return variables.ConstantVariable(max(a.value, b.value))
             else:
                 return variables.ConstantVariable(min(a.value, b.value))
-        elif isinstance(a, DynamicShapeVariable) or isinstance(b, DynamicShapeVariable):
+        elif isinstance(a, SymNodeVariable) or isinstance(b, SymNodeVariable):
             proxy = tx.output.create_proxy(
                 "call_function", self.fn, *proxy_args_kwargs([a, b], {})
             )
-            return DynamicShapeVariable.create(tx, proxy, None)
+            return SymNodeVariable.create(tx, proxy, None)
         else:
 
             unimplemented(f"unsupported min / max over args {str(a)}, {str(b)}")
@@ -435,8 +691,10 @@ def call_range(self, tx, *args):
         elif self._dynamic_args(*args):
 
             def guard_if_dyn(arg):
-                if isinstance(arg, DynamicShapeVariable):
+                if isinstance(arg, SymNodeVariable):
                     return arg.evaluate_expr(tx.output)
+                elif isinstance(arg, ConstantVariable):
+                    return arg.as_python_constant()
                 return arg
 
             args = [variables.ConstantVariable(guard_if_dyn(arg)) for arg in args]
@@ -445,15 +703,14 @@ def guard_if_dyn(arg):
         return None
 
     def _dynamic_args(self, *args, **kwargs):
-        return any([isinstance(x, DynamicShapeVariable) for x in args]) or any(
-            [isinstance(x, DynamicShapeVariable) for x in kwargs.values()]
+        return any([isinstance(x, SymNodeVariable) for x in args]) or any(
+            [isinstance(x, SymNodeVariable) for x in kwargs.values()]
         )
 
     def call_slice(self, tx, *args):
         return variables.SliceVariable(args)
 
     def _dyn_proxy(self, tx, *args, **kwargs):
-        assert self._dynamic_args(*args, **kwargs)
         from .builder import wrap_fx_proxy
 
         options = VariableTracker.propagate(self, args, kwargs.values())
@@ -465,10 +722,6 @@ def _dyn_proxy(self, tx, *args, **kwargs):
             **options,
         )
 
-    def call_mod(self, tx, *args, **kwargs):
-        if self._dynamic_args(*args, **kwargs):
-            return self._dyn_proxy(tx, *args, **kwargs)
-
     def _call_iter_tuple_list(self, tx, obj=None, *args, **kwargs):
         if self._dynamic_args(*args, **kwargs):
             return self._dyn_proxy(tx, *args, **kwargs)
@@ -492,9 +745,20 @@ def _call_iter_tuple_list(self, tx, obj=None, *args, **kwargs):
     call_tuple = _call_iter_tuple_list
     call_list = _call_iter_tuple_list
 
-    def call_dict(self, tx, arg):
-        if isinstance(arg, variables.ConstDictVariable):
-            return arg.clone(mutable_local=MutableLocal())
+    @staticmethod
+    def call_dict_helper(user_cls, arg):
+        if arg is None:
+            return variables.ConstDictVariable(
+                {}, user_cls, mutable_local=MutableLocal()
+            )
+        elif isinstance(arg, variables.ConstDictVariable):
+            return arg.clone(user_cls=user_cls, mutable_local=MutableLocal())
+        else:
+            raise AssertionError("call_dict_helper with illegal arg")
+
+    def call_dict(self, tx, obj=None):
+        if obj is None or isinstance(obj, variables.ConstDictVariable):
+            return self.call_dict_helper(dict, obj)
 
     def call_zip(self, tx, *args):
         options = VariableTracker.propagate(self, args)
@@ -523,45 +787,9 @@ def call_enumerate(self, tx, *args):
             ]
             return variables.TupleVariable(items, **options)
 
-    def call_mul(self, tx, a, b):
-        if isinstance(
-            a, (variables.ListVariable, variables.TupleVariable)
-        ) and isinstance(b, variables.ConstantVariable):
-            return a.__class__(
-                items=a.items * b.as_python_constant(), mutable_local=MutableLocal()
-            ).add_options(self, a, b)
-        elif isinstance(
-            b, (variables.ListVariable, variables.TupleVariable)
-        ) and isinstance(a, variables.ConstantVariable):
-            return b.__class__(
-                items=b.items * a.as_python_constant(), mutable_local=MutableLocal()
-            ).add_options(self, a, b)
-        # TODO this doesn't generalize in other builtin operators.
-        elif isinstance(a, variables.ConstantVariable) and isinstance(
-            b, DynamicShapeVariable
-        ):
-            return b.call_method(tx, "__rmul__", [a], {})
-        else:
-            return a.call_method(tx, "__mul__", [b], {})
-
     def call_len(self, tx, *args, **kwargs):
         return args[0].call_method(tx, "__len__", args[1:], kwargs)
 
-    def call_add(self, tx, *args, **kwargs):
-        return args[0].call_method(tx, "__add__", args[1:], kwargs)
-
-    def call_sub(self, tx, *args, **kwargs):
-        return args[0].call_method(tx, "__sub__", args[1:], kwargs)
-
-    def call_truediv(self, tx, *args, **kwargs):
-        return args[0].call_method(tx, "__truediv__", args[1:], kwargs)
-
-    def call_floordiv(self, tx, *args, **kwargs):
-        return args[0].call_method(tx, "__floordiv__", args[1:], kwargs)
-
-    def call_iadd(self, tx, *args, **kwargs):
-        return args[0].call_method(tx, "__iadd__", args[1:], kwargs)
-
     def call_getitem(self, tx, *args, **kwargs):
         if self.unspec_python_args(*args, **kwargs):
             args, kwargs = specialize_args_kwargs(tx, args, kwargs)
@@ -582,6 +810,15 @@ def call_isinstance(self, tx, arg, isinstance_type):
             unimplemented(
                 f"isinstance called on UserDefinedClass {arg} {isinstance_type}"
             )
+        # handle __instancecheck__ defined in user class
+        if (
+            isinstance(arg, variables.UserDefinedObjectVariable)
+            and "__instancecheck__" in isinstance_type.__class__.__dict__
+        ):
+            return variables.ConstantVariable(
+                isinstance_type.__class__.__instancecheck__(isinstance_type, arg.value)
+            )
+
         try:
             val = issubclass(arg_type, isinstance_type)
         except TypeError:
@@ -798,6 +1035,33 @@ def call_reversed(self, tx, obj: VariableTracker):
                 items, **VariableTracker.propagate(self, obj)
             )
 
+    def call_sorted(self, tx, obj: VariableTracker, **kwargs):
+        if (
+            obj.has_unpack_var_sequence(tx)
+            and not isinstance(obj, variables.TensorVariable)
+            and all(x.is_python_constant() for x in obj.unpack_var_sequence(tx))
+        ):
+            function = kwargs.pop("key", None)
+            reverse = kwargs.pop(
+                "reverse", ConstantVariable(False)
+            ).as_python_constant()
+            assert len(kwargs) == 0
+            if function:
+                items = sorted(
+                    obj.unpack_var_sequence(tx),
+                    key=lambda x: function.call_function(
+                        tx, [x], {}
+                    ).as_python_constant(),
+                    reverse=reverse,
+                )
+            else:
+                items = sorted(
+                    obj.unpack_var_sequence(tx),
+                    key=lambda x: x.as_python_constant(),
+                    reverse=reverse,
+                )
+            return variables.ListVariable(items, **VariableTracker.propagate(self, obj))
+
     def call_chain(self, tx, *args):
         if all(obj.has_unpack_var_sequence(tx) for obj in args):
             items = []
@@ -825,3 +1089,114 @@ def call_id(self, tx, *args):
             return variables.ConstantVariable(id(mod))
         else:
             unimplemented(f"call_id with args {args}")
+
+    def _comparison(self, tx, left, right):
+        """
+        Used to implement comparison operators for different types.
+        For example, list1 < list2 is implemented differently from tensor1 < tensor2
+        """
+        from . import (
+            BaseListVariable,
+            ConstantVariable,
+            TensorVariable,
+            UserFunctionVariable,
+        )
+        from .lists import SizeVariable
+        from .tensor import (
+            supported_const_comparison_ops,
+            supported_tensor_comparison_ops,
+        )
+
+        op = self.fn
+
+        def _unimplemented():
+            unimplemented(f"comparison {typestr(left)} {op} {typestr(right)}")
+
+        if isinstance(left, UserFunctionVariable):
+            if op not in supported_const_comparison_ops.values():
+                _unimplemented()
+            if not isinstance(right, UserFunctionVariable):
+                _unimplemented()
+            return ConstantVariable(op(left.fn, right.fn))
+
+        # Note, we have a rare BaseListVariable subtype mismatch with valid comparison
+        # x = torch.randn([3, 3])
+        # x.size() == (3, 3) # True
+        # (3, 3) == x.size() # True
+        if isinstance(left, (SizeVariable, TupleVariable)) and isinstance(
+            right, (TupleVariable, SizeVariable)
+        ):
+            return BaseListVariable.list_compare(tx, op, left, right)
+
+        if isinstance(left, BaseListVariable):
+            if not type(left) == type(right):  # Mismatch in BaseListVariable subclasses
+                _unimplemented()
+            return BaseListVariable.list_compare(tx, op, left, right)
+
+        if isinstance(left, TensorVariable):
+            from .builder import wrap_fx_proxy
+
+            if op not in supported_tensor_comparison_ops.values():
+                _unimplemented()
+            return wrap_fx_proxy(
+                tx,
+                op(left.as_proxy(), right.as_proxy()),
+            )
+
+        if isinstance(left, SymNodeVariable) or isinstance(right, SymNodeVariable):
+            if op not in supported_tensor_comparison_ops.values():
+                _unimplemented()
+
+            return SymNodeVariable.create(
+                tx,
+                op(left.as_proxy(), right.as_proxy()),
+                sym_num=None,
+            )
+
+        _unimplemented()
+
+    # and_ is a constant fold function, so we only get here if constant fold is not valid
+    def call_and_(self, tx, a, b):
+        if isinstance(a, SymNodeVariable) and isinstance(b, SymNodeVariable):
+            return SymNodeVariable.create(
+                tx,
+                tx.output.create_proxy(
+                    "call_function", operator.and_, *proxy_args_kwargs([a, b], {})
+                ),
+                sym_num=None,
+            )
+        # None no-ops this handler and lets the driving function proceed
+        return None
+
+    # or_ is a constant fold function, so we only get here if constant fold is not valid
+    def call_or_(self, tx, a, b):
+        if isinstance(a, SymNodeVariable) and isinstance(b, SymNodeVariable):
+            return SymNodeVariable.create(
+                tx,
+                tx.output.create_proxy(
+                    "call_function", operator.or_, *proxy_args_kwargs([a, b], {})
+                ),
+                sym_num=None,
+            )
+        # None no-ops this handler and lets the driving function proceed
+        return None
+
+    def call_not_(self, tx, a):
+        if isinstance(a, SymNodeVariable):
+            return SymNodeVariable.create(
+                tx,
+                tx.output.create_proxy(
+                    "call_function", operator.not_, *proxy_args_kwargs([a], {})
+                ),
+                sym_num=None,
+            )
+        return None
+
+    call_eq = _comparison
+    call_gt = _comparison
+    call_lt = _comparison
+    call_ge = _comparison
+    call_le = _comparison
+    call_ne = _comparison
+    call_is_ = _comparison
+    call_is_not = _comparison
diff --git a/torch/_dynamo/variables/constant.py b/torch/_dynamo/variables/constant.py
index 9306928dc6c2..e591aba7d438 100644
--- a/torch/_dynamo/variables/constant.py
+++ b/torch/_dynamo/variables/constant.py
@@ -11,7 +11,7 @@
 
 class ConstantVariable(VariableTracker):
     def __init__(self, value, **kwargs):
-        super(ConstantVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         assert not isinstance(value, torch.Tensor)
         assert not isinstance(value, torch.SymInt)
         assert not isinstance(value, torch.SymFloat)
@@ -76,7 +76,7 @@ def call_method(
         args: "List[VariableTracker]",
         kwargs: "Dict[str, VariableTracker]",
     ) -> "VariableTracker":
-        from .tensor import DynamicShapeVariable
+        from .tensor import SymNodeVariable
 
         options = VariableTracker.propagate(self, args, kwargs.values())
 
@@ -86,25 +86,17 @@ def call_method(
                 items=self.unpack_var_sequence(tx), source=self.source, **options
             ).call_method(tx, name, args, kwargs)
 
-        if any([isinstance(x, DynamicShapeVariable) for x in args]):
-            # NOTE! DANGER! THIS ONLY WORKS FOR COMMUTATIVE OPS
-            # we are relying on add to have arg[0] be a DynamicShapeVariable
-            # because we are in ConstantVariable land
-            # This transforms
-            # constant + dynamic
-            # into
-            # dynamic + constant
-            # Which already has infra built for writing to the graph
-            if name == "__add__":
-                assert len(args) == 1
-                return args[0].call_method(tx, name, [self], {})
-            # Unfortunate constant
-            return super(ConstantVariable, self).call_method(tx, name, args, kwargs)
+        if any([isinstance(x, SymNodeVariable) for x in args]):
+            # Promote to SymNodeVariable for operations involving dynamic shapes.
+            return variables.SymNodeVariable(self.as_proxy(), self.value).call_method(
+                tx, name, args, kwargs
+            )
+
         try:
             const_args = [a.as_python_constant() for a in args]
             const_kwargs = {k: v.as_python_constant() for k, v in kwargs.items()}
         except NotImplementedError:
-            return super(ConstantVariable, self).call_method(tx, name, args, kwargs)
+            return super().call_method(tx, name, args, kwargs)
 
         def has_arith_binop(num_ty):
             return (
@@ -122,16 +114,16 @@ def has_arith_binop(num_ty):
             op = getattr(operator, name)
             add_target = const_args[0]
             if isinstance(add_target, (torch.SymInt, torch.SymFloat)):
-                from .tensor import DynamicShapeVariable
+                from .tensor import SymNodeVariable
 
                 # Addition between a non sym and sym makes a sym
-                # dyn_shape = tx.output.register_attr_or_module(
+                # sym_num = tx.output.register_attr_or_module(
                 #     add_target, f"sym_shape_{add_target}", source=None
                 # )
                 proxy = tx.output.create_proxy(
                     "call_function", op, (self.value, add_target), {}
                 )
-                return DynamicShapeVariable.create(tx, proxy, add_target, **options)
+                return SymNodeVariable.create(tx, proxy, add_target, **options)
             return ConstantVariable(op(self.value, add_target), **options)
         elif name == "__len__" and not (args or kwargs):
             return ConstantVariable(len(self.value), **options)
@@ -146,7 +138,7 @@ def has_arith_binop(num_ty):
 
 class EnumVariable(VariableTracker):
     def __init__(self, value, **kwargs):
-        super(EnumVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.value = value
 
     def as_proxy(self):
@@ -160,3 +152,9 @@ def python_type(self):
 
     def as_python_constant(self):
         return self.value
+
+    def const_getattr(self, tx, name):
+        member = getattr(self.value, name)
+        if callable(member):
+            raise NotImplementedError()
+        return member
diff --git a/torch/_dynamo/variables/dicts.py b/torch/_dynamo/variables/dicts.py
index e05eecffc7e6..ce052161f09e 100644
--- a/torch/_dynamo/variables/dicts.py
+++ b/torch/_dynamo/variables/dicts.py
@@ -5,7 +5,7 @@
 from typing import Dict, List
 
 from .. import variables
-from ..bytecode_transformation import create_instruction
+from ..bytecode_transformation import create_call_function, create_instruction
 from ..eval_frame import skip_code
 from ..exc import unimplemented
 from ..source import AttrSource, GlobalWeakRefSource
@@ -17,9 +17,7 @@
 
 class ConstDictVariable(VariableTracker):
     def __init__(self, items, user_cls, recursively_contains=None, **kwargs):
-        super(ConstDictVariable, self).__init__(
-            recursively_contains=recursively_contains, **kwargs
-        )
+        super().__init__(recursively_contains=recursively_contains, **kwargs)
 
         self.guards.update(VariableTracker.propagate(items.values())["guards"])
         self.items = items
@@ -28,23 +26,40 @@ def __init__(self, items, user_cls, recursively_contains=None, **kwargs):
     def as_proxy(self):
         return {k: v.as_proxy() for k, v in self.items.items()}
 
+    def as_python_constant(self):
+        return {k: v.as_python_constant() for k, v in self.items.items()}
+
     def python_type(self):
         return self.user_cls
 
     def reconstruct(self, codegen):
-        for key, value in self.items.items():
+        # instructions to load collections.OrderedDict if necessary
+        if self.user_cls is collections.OrderedDict:
+            codegen.extend_output(
+                [
+                    codegen.create_load_python_module(collections),
+                    create_instruction("LOAD_METHOD", "OrderedDict"),
+                ]
+            )
+        # instructions to build the dict keys and values
+        for key in self.items.keys():
             if istensor(key):
-                codegen.extend_output(
-                    [
-                        codegen.create_load_global(global_key_name(key), add=True),
-                        create_instruction("CALL_FUNCTION", 0),
-                    ]
+                codegen.append_output(
+                    codegen.create_load_global(global_key_name(key), True, add=True)
                 )
+                codegen.extend_output(create_call_function(0, False))
             else:
                 codegen.append_output(codegen.create_load_const(key))
             codegen(self.items[key])
-
-        return [create_instruction("BUILD_MAP", len(self.items))]
+        # BUILD_MAP and calling collections.OrderedDict if necessary
+        if self.user_cls is collections.OrderedDict:
+            return [
+                create_instruction("BUILD_MAP", len(self.items)),
+                create_instruction("CALL_METHOD", 1),
+            ]
+        # BUILD_MAP only if user_cls is dict
+        else:
+            return [create_instruction("BUILD_MAP", len(self.items))]
 
     def getitem_const(self, arg: VariableTracker):
         return self.items[ConstDictVariable.get_key(arg)].add_options(self, arg)
@@ -218,7 +233,7 @@ def _key_to_var(cls, tx, key, **options):
 
 class DefaultDictVariable(ConstDictVariable):
     def __init__(self, items, user_cls, default_factory=None, **kwargs):
-        super(DefaultDictVariable, self).__init__(items, user_cls, **kwargs)
+        super().__init__(items, user_cls, **kwargs)
         assert user_cls is collections.defaultdict
         self.default_factory = default_factory
 
@@ -355,7 +370,7 @@ def wrap(cls, builder, obj):
         )
 
     def __init__(self, items, user_cls, **options):
-        super(DataClassVariable, self).__init__(items, user_cls, **options)
+        super().__init__(items, user_cls, **options)
         assert self.is_matching_cls(user_cls)
 
     def as_proxy(self):
@@ -366,10 +381,7 @@ def reconstruct(self, codegen):
         keys = tuple(self.items.keys())
         for key in keys:
             codegen(self.items[key])
-        return [
-            codegen.create_load_const(keys),
-            create_instruction("CALL_FUNCTION_KW", len(keys)),
-        ]
+        return codegen.create_call_function_kw(len(keys), keys, True)
 
     def call_method(
         self,
@@ -395,7 +407,7 @@ def call_method(
             return variables.TupleVariable(list(self.items.values()), **options)
         elif name == "__setattr__":
             name = "__setitem__"
-        return super(DataClassVariable, self).call_method(tx, name, args, kwargs)
+        return super().call_method(tx, name, args, kwargs)
 
     def var_getattr(self, tx, name: str) -> "VariableTracker":
         if name in self.items:
@@ -407,7 +419,7 @@ def var_getattr(self, tx, name: str) -> "VariableTracker":
             if name in defaults:
                 assert variables.ConstantVariable.is_literal(defaults[name])
                 return variables.ConstantVariable(defaults[name]).add_options(self)
-        super(DataClassVariable, self).var_getattr(tx, name)
+        super().var_getattr(tx, name)
 
 
 class HFPretrainedConfigVariable(VariableTracker):
@@ -429,7 +441,7 @@ def is_matching_object(cls, obj):
         return cls.is_matching_cls(type(obj))
 
     def __init__(self, obj, **kwargs):
-        super(HFPretrainedConfigVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.obj = obj
         assert self.is_matching_cls(type(obj))
 
diff --git a/torch/_dynamo/variables/functions.py b/torch/_dynamo/variables/functions.py
index 6f233ae7b818..2b6767250770 100644
--- a/torch/_dynamo/variables/functions.py
+++ b/torch/_dynamo/variables/functions.py
@@ -3,14 +3,17 @@
 import functools
 import inspect
 import itertools
+import sys
 import types
 from typing import Dict, List
 
+import torch
+
 from .. import variables
 from ..bytecode_transformation import create_instruction
 from ..exc import unimplemented
 from ..source import AttrSource, ConstantSource, DefaultsSource, GetItemSource
-from ..utils import istensor, make_cell
+from ..utils import istensor, istype, make_cell
 from .base import typestr, VariableTracker
 
 
@@ -39,7 +42,9 @@ def wrap_bound_arg(tx, val, options, source=None):
             **options,
         )
 
-    if variables.ConstantVariable.is_literal(val):
+    if variables.ConstantVariable.is_literal(val) or istype(
+        val, (torch.Size, torch.device, torch.dtype)
+    ):
         return variables.ConstantVariable(val, **options)
     elif isinstance(val, types.FunctionType):
         return variables.UserFunctionVariable(val, source=source, **options)
@@ -100,7 +105,7 @@ class UserFunctionVariable(BaseUserFunctionVariable):
     """Some unsupported user-defined global function"""
 
     def __init__(self, fn, is_constant=False, **kwargs):
-        super(UserFunctionVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         if getattr(fn, "_dynamo_marked_constant", False):
             # This method should be treated as a constant for the purposes of compilation
             self.is_constant = True
@@ -108,7 +113,7 @@ def __init__(self, fn, is_constant=False, **kwargs):
             self.is_constant = False
 
         assert isinstance(
-            fn, types.FunctionType
+            fn, (types.FunctionType, torch.jit.ScriptFunction)
         ), f"expected FunctionType found {typestr(fn)} {fn}"
         # unpack @torch._dynamo.optimize()(fn) wrapped function
         fn = inspect.getattr_static(fn, "_torchdynamo_inline", fn)
@@ -252,14 +257,14 @@ def call_function(
                 tx, self.fn, self.get_name(), options, args, kwargs
             )
 
-        return super(UserFunctionVariable, self).call_function(tx, args, kwargs)
+        return super().call_function(tx, args, kwargs)
 
 
 class UserMethodVariable(UserFunctionVariable):
     """Some unsupported user-defined method"""
 
     def __init__(self, fn, obj, **kwargs):
-        super(UserMethodVariable, self).__init__(fn=fn, **kwargs)
+        super().__init__(fn=fn, **kwargs)
         self.obj = obj
 
     def __str__(self):
@@ -274,27 +279,27 @@ def python_type(self):
     def call_function(
         self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]"
     ) -> "VariableTracker":
-        if (
-            isinstance(self.obj, variables.NNModuleVariable)
-            and getattr(self.fn, "__module__", "").startswith("torch.nn.")
-            or self.is_constant
-        ):
-            return self.obj.call_method(
-                tx, self.fn.__name__, args, kwargs, constant=self.is_constant
-            ).add_options(self)
+        if isinstance(self.obj, variables.NNModuleVariable):
+            module_attr = getattr(self.fn, "__module__", "")
+            if (
+                module_attr is not None
+                and module_attr.startswith("torch.nn.")
+                or self.is_constant
+            ):
+                return self.obj.call_method(
+                    tx, self.fn.__name__, args, kwargs, constant=self.is_constant
+                ).add_options(self)
         return super().call_function(tx, args, kwargs)
 
     def num_parameters(self):
-        return super(UserMethodVariable, self).num_parameters() - 1
+        return super().num_parameters() - 1
 
 
 class WrappedUserMethodVariable(UserMethodVariable):
     def __init__(self, wrapped, context, **kwargs):
         kwargs.pop("fn", None)
         kwargs.pop("obj", None)
-        super(WrappedUserMethodVariable, self).__init__(
-            wrapped.fn, wrapped.obj, **kwargs
-        )
+        super().__init__(wrapped.fn, wrapped.obj, **kwargs)
         self.wrapped = wrapped
         self.context = context
 
@@ -311,7 +316,7 @@ class WrappedUserFunctionVariable(UserFunctionVariable):
     def __init__(self, wrapped, context, **kwargs):
         kwargs.pop("fn", None)
         kwargs.pop("obj", None)
-        super(WrappedUserFunctionVariable, self).__init__(wrapped.fn, **kwargs)
+        super().__init__(wrapped.fn, **kwargs)
         self.wrapped = wrapped
         self.context = context
 
@@ -354,7 +359,7 @@ def __init__(
         closure_scope,
         **kwargs,
     ):
-        super(NestedUserFunctionVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         assert isinstance(fn_name.as_python_constant(), str)
         assert isinstance(code.as_python_constant(), types.CodeType)
         assert isinstance(f_globals, dict)
@@ -468,5 +473,6 @@ def reconstruct(self, codegen):
             flags |= 0x08
             codegen(self.closure)
         codegen(self.code)
-        codegen(self.fn_name)
+        if sys.version_info < (3, 11):
+            codegen(self.fn_name)
         return [create_instruction("MAKE_FUNCTION", flags)]
diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py
index a2a44a27b42e..019a1f25b168 100644
--- a/torch/_dynamo/variables/lists.py
+++ b/torch/_dynamo/variables/lists.py
@@ -1,13 +1,15 @@
+import functools
+import operator
 from typing import Dict, List, Optional
 
 import torch
 import torch.fx
 
 from .. import config, variables
-from ..bytecode_transformation import create_instruction
+from ..bytecode_transformation import create_call_function, create_instruction
 from ..exc import unimplemented
 from ..source import GetItemSource
-from ..utils import namedtuple_fields, proxy_args_kwargs
+from ..utils import check_constant_args, namedtuple_fields, proxy_args_kwargs
 from .base import MutableLocal, VariableTracker
 from .constant import ConstantVariable
 
@@ -30,9 +32,7 @@ def __init__(
         regen_guards=True,
         **kwargs,
     ):
-        super(BaseListVariable, self).__init__(
-            recursively_contains=recursively_contains, **kwargs
-        )
+        super().__init__(recursively_contains=recursively_contains, **kwargs)
         assert isinstance(items, list)
         assert all(isinstance(x, VariableTracker) for x in items)
 
@@ -84,21 +84,75 @@ def call_method(
         if name == "__getitem__":
             assert not kwargs and len(args) == 1
             return self.getitem_const(args[0])
-        elif name == "__add__":
-            assert not kwargs and len(args) == 1
-            return type(self)(self.items + args[0].items, **options)
-        elif (
-            name == "__contains__"
-            and len(args) == 1
-            and args[0].is_python_constant()
-            and all(x.is_python_constant() for x in self.items)
-        ):
+        elif name == "__contains__":
+            assert len(args) == 1
             assert not kwargs
-            search = args[0].as_python_constant()
-            result = any(x.as_python_constant() == search for x in self.items)
-            return variables.ConstantVariable(result, **options)
 
-        return super(BaseListVariable, self).call_method(tx, name, args, kwargs)
+            search = args[0]
+            if check_constant_args(args, {}) and search.is_python_constant():
+                result = any(
+                    x.as_python_constant() == search.as_python_constant()
+                    for x in self.items
+                )
+                return variables.ConstantVariable(result, **options)
+
+            from .builtin import BuiltinVariable
+
+            result = None
+            for x in self.items:
+                check = BuiltinVariable(operator.eq).call_function(tx, [x, search], {})
+                if result is None:
+                    result = check
+                else:
+                    result = BuiltinVariable(operator.or_).call_function(
+                        tx, [check, result], {}
+                    )
+            return result
+
+        return super().call_method(tx, name, args, kwargs)
+
+    @staticmethod
+    def list_compare(tx, op, left, right):
+        from .builtin import BuiltinVariable
+
+        eq_result = BaseListVariable.list_eq(tx, left, right)
+        if op is operator.eq:
+            return eq_result
+        elif op is operator.ne:
+            return BuiltinVariable(operator.not_).call_function(tx, [eq_result], {})
+        else:
+            unimplemented(f"list_compare {left} {op} {right}")
+
+    @staticmethod
+    def list_eq(tx, left, right):
+        from .builtin import BuiltinVariable
+
+        options = VariableTracker.propagate(left, right)
+
+        # Most list-like variables implement comparison ops the same way,
+        # so they can re-use this helper.
+        # There are quirks though, like how `tuple([2]) == torch.Size([2])`,
+        # but `tuple([2]) != list([2])`
+        if len(left.items) != len(right.items):
+            return ConstantVariable(False, **options)
+        if len(left.items) == 0:
+            return ConstantVariable(True, **options)
+
+        # Generic list comparison works by iterating over left aka self and right the compared-to list.
+        # If we hit here, their lengths are the same and they cannot be expressed as python constants.
+        # So, we iterate over the zipped list items.
+        comps = []
+        for l, r in zip(left.items, right.items):
+            comp = BuiltinVariable(operator.eq).call_function(tx, [l, r], {})
+            if comp.is_python_constant() and not comp.as_python_constant():
+                # early exit in false case
+                return comp.add_options(options)
+            comps.append(comp)
+
+        return functools.reduce(
+            lambda a, b: BuiltinVariable(operator.and_).call_function(tx, [a, b], {}),
+            comps,
+        ).add_options(options)
 
 
 class RangeVariable(BaseListVariable):
@@ -137,9 +191,9 @@ def unpack_var_sequence(self, tx):
 
     def reconstruct(self, codegen):
         assert "range" not in codegen.tx.f_globals
-        codegen.append_output(codegen.create_load_python_module(range))
+        codegen.append_output(codegen.create_load_python_module(range, True))
         codegen.foreach(self.items)
-        return [create_instruction("CALL_FUNCTION", 3)]
+        return create_call_function(3, False)
 
     def var_getattr(self, tx, name):
         fields = ["start", "stop", "step"]
@@ -181,7 +235,7 @@ def call_method(
             )
             return ConstantVariable(None)
         elif (
-            name in ("extend", "__iadd__")
+            name == "extend"
             and self.mutable_local
             and args
             and args[0].has_unpack_var_sequence(tx)
@@ -254,23 +308,6 @@ def call_method(
         args: "List[VariableTracker]",
         kwargs: "Dict[str, VariableTracker]",
     ) -> "VariableTracker":
-        options = VariableTracker.propagate(self, args, kwargs.values())
-        if (
-            name in ("__add__", "__iadd__")
-            and len(args) == 1
-            and isinstance(args[0], TupleVariable)
-        ):
-            assert not kwargs
-            return TupleVariable(self.items + args[0].items, **options)
-        elif (
-            name in ("__add__", "__iadd__")
-            and len(args) == 1
-            and isinstance(args[0], variables.ConstantVariable)
-        ):
-            assert not kwargs
-            return TupleVariable(
-                self.items + list(args[0].unpack_var_sequence(self)), **options
-            )
         return super().call_method(tx, name, args, kwargs)
 
 
@@ -335,8 +372,7 @@ def reconstruct(self, codegen):
         codegen.foreach(self.items)
         build_torch_size = [
             create_instruction("BUILD_TUPLE", len(self.items)),
-            create_instruction("CALL_FUNCTION", 1),
-        ]
+        ] + create_call_function(1, True)
         return build_torch_size
 
     def unpack_var_sequence(self, tx):
@@ -357,10 +393,10 @@ def call_method(
             else:
                 out = self.getitem_const(args[0])
             return out
-        return super(SizeVariable, self).call_method(tx, name, args, kwargs)
+        return super().call_method(tx, name, args, kwargs)
 
     def get_item_dyn(self, tx, arg: VariableTracker):
-        from .tensor import DynamicShapeVariable
+        from .tensor import SymNodeVariable
 
         index = arg.as_python_constant()
         if isinstance(index, slice):
@@ -377,8 +413,8 @@ def _dynamo_get_item_lambda(target, index):
             items = self.items[index]
 
             def _unpack_into_example(item):
-                if isinstance(item, DynamicShapeVariable):
-                    return item.dyn_shape
+                if isinstance(item, SymNodeVariable):
+                    return item.sym_num
                 return item.as_python_constant()
 
             # Mirror the indexing into example_value for downstream correctness
@@ -417,8 +453,7 @@ def reconstruct(self, codegen):
         codegen.foreach(self.items)
         return [
             create_instruction("BUILD_TUPLE", len(self.items)),
-            create_instruction("CALL_FUNCTION", 1),
-        ]
+        ] + create_call_function(1, True)
 
     def var_getattr(self, tx, name):
         fields = namedtuple_fields(self.tuple_cls)
@@ -475,9 +510,7 @@ def var_getattr(self, tx, name):
 
 class ListIteratorVariable(VariableTracker):
     def __init__(self, items, index: int = 0, recursively_contains=None, **kwargs):
-        super(ListIteratorVariable, self).__init__(
-            recursively_contains=recursively_contains, **kwargs
-        )
+        super().__init__(recursively_contains=recursively_contains, **kwargs)
         assert isinstance(items, list)
         # Removing this check as it slows things down too much
         # https://github.com/pytorch/pytorch/pull/87533#issuecomment-1287574492
diff --git a/torch/_dynamo/variables/misc.py b/torch/_dynamo/variables/misc.py
index f4e3ed251ddf..869afbc91fbd 100644
--- a/torch/_dynamo/variables/misc.py
+++ b/torch/_dynamo/variables/misc.py
@@ -1,3 +1,4 @@
+import collections
 import inspect
 import sys
 import types
@@ -6,8 +7,8 @@
 import torch._C
 from torch._guards import Guard, GuardSource
 
-from .. import config, variables
-from ..bytecode_transformation import create_instruction
+from .. import variables
+from ..bytecode_transformation import create_call_function, create_instruction
 from ..exc import unimplemented
 from ..guards import GuardBuilder
 from ..source import AttrSource
@@ -24,7 +25,7 @@
 
 class SuperVariable(VariableTracker):
     def __init__(self, typevar, objvar=None, specialized=False, **kwargs):
-        super(SuperVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.typevar = typevar
         self.objvar = objvar
         self.specialized = specialized  # directly get attr from self.typevar if true
@@ -34,9 +35,9 @@ def reconstruct(self, codegen):
         codegen(self.typevar)
         if self.objvar is not None:
             codegen(self.objvar)
-            return [create_instruction("CALL_FUNCTION", 2)]
+            return create_call_function(2, True)
         else:
-            return [create_instruction("CALL_FUNCTION", 1)]
+            return create_call_function(1, True)
 
     def const_getattr(self, tx, name):
         assert self.objvar, "1-arg super not implemented"
@@ -44,14 +45,11 @@ def const_getattr(self, tx, name):
             return getattr(self.typevar.as_python_constant(), name)
         search_type = self.typevar.as_python_constant()
 
-        # We default to the python type of the object. However,
-        # 1. If this is a `type`, then the original object represents the user
-        # defined type.
-        # 2. If this is `torch._C._TensorMeta`, the original object is the user
-        # defined type of a custom tensor subclass.
-        # TODO(future PR): figure out how to do this in a less hacky way
+        # We default to the python type of the object. However, if this is
+        # a `type` or subclass of `type`, then the original object represents
+        # the user defined type.
         type_to_use = self.objvar.python_type()
-        if type_to_use is type or type_to_use is torch._C._TensorMeta:
+        if issubclass(type_to_use, type):
             type_to_use = self.objvar.value
 
         # TODO(jansel): there is a small chance this could trigger user code, prevent that
@@ -148,7 +146,7 @@ def call_function(
 
 class ClosureVariable(UnknownVariable):
     def __init__(self, name, **kwargs):
-        super(ClosureVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.name = name
 
     def reconstruct(self, codegen):
@@ -157,17 +155,17 @@ def reconstruct(self, codegen):
 
 class NewCellVariable(VariableTracker):
     def __init__(self, **kwargs):
-        super(NewCellVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
 
 class NewGlobalVariable(VariableTracker):
     def __init__(self, **kwargs):
-        super(NewGlobalVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
 
 class ContextWrappingVariable(VariableTracker):
     def __init__(self, target_values, initial_values=None, **kwargs):
-        super(ContextWrappingVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.target_values = target_values
         self.initial_values = initial_values
         self.recursively_contains = (
@@ -183,9 +181,6 @@ def exit(self, tx, *args):
         self._call_func(tx, self.initial_values)
         return variables.ConstantVariable(None, **VariableTracker.propagate(self))
 
-    def module_name(self):
-        return "torch"
-
     def reconstruct(self, codegen, target_inst=None):
         """
         Generate following Python Bytecode, with a `torch._C._set_grad_enable` call
@@ -257,16 +252,20 @@ def reconstruct(self, codegen, target_inst=None):
             return ([], [])
 
         def set_context_insts(values):
-            global_torch_source = codegen.tx.import_source("torch")
-            attr_source = AttrSource(global_torch_source, self._func_name())
+            attr_source = AttrSource(
+                codegen.tx.import_source(self.module_name()), self.fn_name()
+            )
             load_set_context_enabling_insts = attr_source.reconstruct(codegen)
 
-            loads = [codegen.create_load_const(val) for val in values]
+            if values:
+                loads = [codegen.create_load_const(val) for val in values]
+            else:
+                loads = []
 
             return [
                 *load_set_context_enabling_insts,
                 *loads,
-                create_instruction("CALL_FUNCTION", len(values)),
+                *create_call_function(len(loads), True),
                 create_instruction("POP_TOP"),
             ]
 
@@ -299,13 +298,18 @@ def set_context_insts(values):
     def _call_func(self, tx, initial_values):
         raise NotImplementedError("_call_func called on base")
 
-    def _func_name(self):
-        raise NotImplementedError("_func_name called on base")
+    def module_name(self):
+        raise NotImplementedError("module_name called on base")
+
+    def fn_name(self):
+        raise NotImplementedError("fn_name called on base")
 
     def call_function(
         self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]"
     ) -> "VariableTracker":
         assert len(args) == 1
+        if isinstance(args[0], NestedUserFunctionVariable):
+            args[0] = UserFunctionVariable(args[0].get_function())
         assert isinstance(args[0], UserMethodVariable) or isinstance(
             args[0], UserFunctionVariable
         )
@@ -333,7 +337,7 @@ def create(tx, target_value, **kwargs):
         return var
 
     def __init__(self, target_values, initial_values=None, **kwargs):
-        super(GradModeVariable, self).__init__(
+        super().__init__(
             target_values=target_values, initial_values=initial_values, **kwargs
         )
         self.guards = self.guards | self._guards_singleton
@@ -349,14 +353,11 @@ def _call_func(self, tx, values):
         ),
         torch._C._set_grad_enabled(value)
 
-    def _func_name(self):
-        return "_C._set_grad_enabled"
+    def module_name(self):
+        return "torch"
 
     def fn_name(self):
-        if self.target_values[0]:
-            return "enable_grad"
-        else:
-            return "no_grad"
+        return "set_grad_enabled"
 
 
 class AutocastModeVariable(ContextWrappingVariable):
@@ -372,25 +373,25 @@ def create(target_values, kwargs):
         kwargs.clear()
 
         for key in ["device_type", "dtype", "enabled", "cache_enabled"]:
-            if isinstance(bound_args.arguments[key], VariableTracker):
-                target_values.append(bound_args.arguments[key])
+            arg = bound_args.arguments[key]
+            if isinstance(arg, VariableTracker):
+                target_values.append(bound_args.arguments[key].as_python_constant())
             else:
-                target_values.append(
-                    variables.ConstantVariable(bound_args.arguments[key])
-                )
+                target_values.append(bound_args.arguments[key])
 
         var = AutocastModeVariable(target_values, initial_values=None, **kwargs)
         return var
 
     def __init__(self, target_values, initial_values=None, **kwargs):
-        super(AutocastModeVariable, self).__init__(
+        mode = kwargs.pop("mode", None)
+        super().__init__(
             target_values=target_values, initial_values=initial_values, **kwargs
         )
-        self.target_values = [val.as_python_constant() for val in target_values]
-        self.mode = None
+        self.target_values = target_values
+        self.mode = mode
 
     def exit(self, tx, *args):
-        tx.output.create_node(
+        self.mode = tx.output.create_node(
             "call_function", exit_functional_autocast, (self.mode,), {}
         )
 
@@ -399,11 +400,11 @@ def enter(self, tx):
             "call_function", enter_functional_autocast, (*self.target_values,), {}
         )
 
-    def _func_name(self):
-        return "torch.amp.autocast_mode.autocast"
+    def module_name(self):
+        return "torch.amp.autocast_mode"
 
     def fn_name(self):
-        return "torch.amp.autocast_mode.autocast"
+        return "autocast"
 
 
 def enter_functional_autocast(*vals):
@@ -424,7 +425,7 @@ class NullContextVariable(ContextWrappingVariable):
     """
 
     def __init__(self, target_values=None, **kwargs):
-        super(NullContextVariable, self).__init__(target_values=target_values, **kwargs)
+        super().__init__(target_values=target_values, **kwargs)
 
     def enter(self, tx):
         return variables.ConstantVariable(None, **VariableTracker.propagate(self))
@@ -439,9 +440,93 @@ def fn_name(self):
         return "nullcontext"
 
 
+class CUDAStreamContextVariable(ContextWrappingVariable):
+    @staticmethod
+    def create(tx, target_value, **kwargs):
+        from .builder import wrap_fx_proxy_cls
+
+        current_stream = wrap_fx_proxy_cls(
+            CUDAStreamVariable,
+            tx,
+            tx.output.create_proxy(
+                "call_function",
+                torch.cuda.current_stream,
+                (None,),
+                {},
+            ),
+        )
+        return CUDAStreamContextVariable(
+            target_values=[target_value],
+            initial_values=[current_stream],
+            **kwargs,
+        )
+
+    def __init__(self, target_values, initial_values=None, **kwargs):
+        super().__init__(
+            target_values=target_values, initial_values=initial_values, **kwargs
+        )
+
+    def enter(self, tx):
+        # CUDA stream generated inside of traced function
+        if self.target_values[0].as_proxy() is not None:
+            tx.output.create_proxy(
+                "call_function",
+                torch.cuda.set_stream,
+                (self.target_values[0].as_proxy(),),
+                {},
+            )
+        # CUDA stream passed from outside of traced function
+        else:
+            stream = self.target_values[0].value
+            tx.output.create_proxy(
+                "call_function",
+                torch._C._cuda_setStream,
+                (stream.stream_id, stream.device_index, stream.device_type),
+                {},
+            )
+        torch.cuda.set_stream(self.target_values[0].value)
+
+    def exit(self, tx, *args):
+        tx.output.create_proxy(
+            "call_function",
+            torch.cuda.set_stream,
+            (self.initial_values[0].as_proxy(),),
+            {},
+        )
+        torch.cuda.set_stream(self.initial_values[0].value)
+
+    def module_name(self):
+        return "torch.cuda"
+
+    def fn_name(self):
+        return "stream"
+
+
+class CUDAStreamVariable(VariableTracker):
+    def __init__(self, proxy, value, **kwargs):
+        if proxy is not None and "example_value" in proxy.node.meta:
+            assert proxy.node.meta["example_value"] == value
+        super().__init__(**kwargs)
+        self.proxy = proxy
+        self.value = value
+
+    def call_method(
+        self,
+        tx,
+        name,
+        args: "List[VariableTracker]",
+        kwargs: "Dict[str, VariableTracker]",
+    ) -> "VariableTracker":
+        unimplemented("cuda stream")
+
+    def as_proxy(self):
+        return self.proxy
+
+
 class WithExitFunctionVariable(VariableTracker):
-    def __init__(self, ctx: VariableTracker, target, **kwargs):
-        super(WithExitFunctionVariable, self).__init__(**kwargs)
+    def __init__(self, ctx: ContextWrappingVariable, target, **kwargs):
+        super().__init__(**kwargs)
+        assert isinstance(ctx, ContextWrappingVariable)
         self.ctx = ctx
         self.target = target
 
@@ -460,9 +545,11 @@ def reconstruct(self, codegen):
         ).reconstruct(codegen)
 
         if codegen.tx.output.partial_convert:
+            loads = [codegen.create_load_const(val) for val in self.ctx.target_values]
+            output.extend(loads)
             output.extend(
                 [
-                    create_instruction("CALL_FUNCTION", 0),
+                    *create_call_function(len(loads), True),
                     create_instruction("SETUP_WITH", target=self.target),
                     create_instruction("POP_TOP"),
                 ]
@@ -480,7 +567,7 @@ def create(callable, **kwargs):
         return InspectSignatureVariable(callable)
 
     def __init__(self, inspected, **kwargs):
-        super(InspectSignatureVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.inspected = inspected
 
 
@@ -560,7 +647,7 @@ class AutogradFunctionContextVariable(VariableTracker):
 
 class LambdaVariable(VariableTracker):
     def __init__(self, fn, **kwargs):
-        super(LambdaVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.fn = fn
 
     def call_function(
@@ -571,7 +658,7 @@ def call_function(
 
 class GetAttrVariable(VariableTracker):
     def __init__(self, obj, name, **kwargs):
-        super(GetAttrVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         assert isinstance(obj, VariableTracker)
         assert isinstance(name, str)
         self.obj = obj
@@ -580,8 +667,12 @@ def __init__(self, obj, name, **kwargs):
     def __str__(self):
         return f"{self.__class__.__name__}({self.obj}, {self.name})"
 
+    @staticmethod
+    def create_getattr_proxy(base_proxy: torch.fx.Proxy, attr):
+        return getattr(base_proxy, attr)
+
     def as_proxy(self):
-        return getattr(self.obj.as_proxy(), self.name)
+        return GetAttrVariable.create_getattr_proxy(self.obj.as_proxy(), self.name)
 
     def const_getattr(self, tx, name):
         if not isinstance(self.obj, variables.NNModuleVariable):
@@ -683,12 +774,12 @@ def call_method(
                 self.obj.inspected.num_parameters(),
                 **VariableTracker.propagate(self, self.obj, self.obj.inspected),
             )
-        return super(GetAttrVariable, self).call_method(tx, name, args, kwargs)
+        return super().call_method(tx, name, args, kwargs)
 
 
 class PythonModuleVariable(VariableTracker):
     def __init__(self, value: types.ModuleType, **kwargs):
-        super(PythonModuleVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.value = value
 
     def python_type(self):
@@ -709,16 +800,26 @@ def as_python_constant(self):
     def call_function(
         self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]"
     ) -> "VariableTracker":
+        from .builtin import BuiltinVariable
+        from .dicts import ConstDictVariable
+
         if inspect.getattr_static(self.value, "_torchdynamo_disable", False):
-            unimplemented(
-                f"call {config.dynamo_import}.disable() wrapped function {self.value}"
+            unimplemented(f"call torch._dynamo.disable() wrapped function {self.value}")
+        # Allowlist a few popular classes(e.g, collections.OrderedDict) calls in skip files.
+        elif self.value is collections.OrderedDict and (
+            len(args) == 0 or len(args) == 1 and isinstance(args[0], ConstDictVariable)
+        ):
+            return BuiltinVariable.call_dict_helper(
+                collections.OrderedDict, None if len(args) == 0 else args[0]
             )
         else:
             try:
                 path = inspect.getfile(self.value)
             except TypeError:
                 path = f"Builtin {self.value.__name__}"
-            unimplemented("call_function in skip_files " + path)
+            unimplemented(
+                f"call_function {self.value.__qualname__} in skip_files {path}"
+            )
 
 
 class TypingVariable(VariableTracker):
@@ -775,3 +876,17 @@ def python_type(self):
 
     def as_python_constant(self):
         return self.value
+
+
+# Used to keep track of NULLs pushed on the stack for Python 3.11 function calls
+class NullVariable(VariableTracker):
+    def __init__(self, **kwargs):
+        super(NullVariable, self).__init__(**kwargs)
+
+    def __str__(self):
+        return "NullVariable"
+
+    def reconstruct(self, codegen):
+        if sys.version_info < (3, 11):
+            unimplemented("cannot reconstruct NullVariable in < Python 3.11")
+        return [create_instruction("PUSH_NULL")]
diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 56898465e543..42dbaa59df68 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -14,10 +14,12 @@
 from ..mutation_guard import GenerationTracker
 from ..source import AttrSource, GetItemSource, NNModuleSource, NotNNModuleSource
 from ..utils import (
+    get_custom_getattr,
     is_lazy_module,
     is_safe_constant,
     istensor,
     istype,
+    object_has_getattribute,
     proxy_args_kwargs,
 )
 from .base import MutableLocal, typestr, VariableTracker
@@ -30,7 +32,7 @@ class NNModuleVariable(VariableTracker):
     _nonvar_fields = ["module_type", "module_key"]
 
     def __init__(self, module_type: type, module_key: str, **kwargs):
-        super(NNModuleVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.module_type = module_type
         self.module_key = module_key
         assert self.source
@@ -86,6 +88,22 @@ def convert_to_unspecialized(self, tx):
             GenerationTracker.mark_class_dynamic(type(mod))
         raise RestartAnalysis()
 
+    def _custom_getattr_fallback(self, base, tx, name, options):
+        """Check for a __getattr__ and handle it specially if it is implemented"""
+        if object_has_getattribute(base):
+            unimplemented("torch.nn.Module with a custom __getattribute__ defined")
+
+        getattr_fn = get_custom_getattr(base)
+        if getattr_fn is None:
+            return None
+
+        if not isinstance(getattr_fn, types.FunctionType):
+            unimplemented("torch.nn.Module with a non-function custom __getattr__")
+
+        return variables.UserMethodVariable(getattr_fn, self, **options).call_function(
+            tx, [variables.ConstantVariable(name)], {}
+        )
+
     def var_getattr(self, tx, name):
         from .builder import VariableBuilder
 
@@ -121,8 +139,18 @@ def var_getattr(self, tx, name):
         elif "_buffers" in base_dict and name in base_dict["_buffers"]:
             subobj = base_dict["_buffers"][name]
         else:
-            subobj = inspect.getattr_static(base, name)
-            object_member = False
+            try:
+                subobj = inspect.getattr_static(base, name)
+                object_member = False
+            except AttributeError:
+                # see if we can fallback to __getattr__, which is not checked by getattr_static
+                result = self._custom_getattr_fallback(
+                    base=base, tx=tx, name=name, options=options
+                )
+                if result is not None:
+                    return result
+                # if we can't find a __getattr__, just raise the AttributeError
+                raise
 
         if name == "__class__" and not object_member:
             return variables.UserDefinedClassVariable(base.__class__, **options)
@@ -165,8 +193,9 @@ def call_function(
 
         @contextmanager
         def record_nn_module_stack():
+            fully_qualified_name = self.source.name()
             try:
-                tx.nn_module_stack[self.module_key] = type(mod)
+                tx.nn_module_stack[self.module_key] = (fully_qualified_name, type(mod))
                 yield
             finally:
                 del tx.nn_module_stack[self.module_key]
@@ -180,13 +209,13 @@ def record_nn_module_stack():
                 # unroll Sequential()
                 assert not kwargs
                 (arg,) = args
-                for idx, submod in enumerate(mod):
+                for child_name, submod in mod.named_children():
                     tx.call_function(
                         tx.output.register_attr_or_module(
                             submod,
                             self.module_key,
-                            idx,
-                            source=NNModuleSource(GetItemSource(self.source, idx)),
+                            child_name,
+                            source=NNModuleSource(AttrSource(self.source, child_name)),
                             **options,
                         ),
                         [arg],
@@ -211,24 +240,29 @@ def record_nn_module_stack():
                 )
 
             else:
-                # for lazy modules, run the pre-hooks which will update the type
-                # TODO mlazos: we don't fully support all of the hooks that exist,
-                # so restrict using __call__ only to lazy modules for now
                 assert self.source, (
                     "Must provide a valid source in order to inline, "
                     "since inlined function may have default args which must be guarded."
                 )
-                class_source = AttrSource(self.source, "__class__")
-                if is_lazy:
-                    fn = mod.__class__.__call__
-                    fn_source = AttrSource(class_source, "__call__")
+                if isinstance(mod, torch.fx.GraphModule):
+                    # TODO: do we want to support __call__ for GM's?
+                    # If so at least some changes are needed, we don't allow inlining
+                    # the call_wrapped currently, and maybe other issues too
+                    fn = mod.forward
                 else:
-                    fn = mod.__class__.forward
-                    fn_source = AttrSource(class_source, "forward")
+                    fn = mod.__call__
+                fn_source = AttrSource(self.source, "__call__")
+                if istype(mod.__call__, types.MethodType):
+                    fn = fn.__func__
+                    fn_source = AttrSource(fn_source, "__func__")
+                    args = [self] + args
+                else:
+                    assert istype(mod.__call__, types.FunctionType)
+
                 options["source"] = fn_source
                 return tx.inline_user_function_return(
                     variables.UserFunctionVariable(fn, **options),
-                    [self] + args,
+                    args,
                     kwargs,
                 )
 
@@ -246,8 +280,33 @@ def call_method(
         key = self.module_key
         module = tx.output.get_submodule(key)
 
-        if name == "forward":
+        if name == "__call__":
+            # TODO(whc)  do we really need this special case?
             return self.call_function(tx, args, kwargs)
+        elif name == "forward":
+            # TODO(whc)
+            # This is the old special case moved to a new place.  (copy from call_function below)
+            # Old behavior: we'd route "forward" meth call to 'call_function', which inlined forward.
+            # New behavior: since call_function now hits '__call__', forward would fall through to 'wrap_proxy' below,
+            # instead of being inlined.  What should we do about this?
+            #   1) all methods get inlined now at the bottom of this call_method, instead of put into the graph as calls
+            #   2) we maintain this special case just for forward
+            assert self.source, (
+                "Must provide a valid source in order to inline, "
+                "since inlined function may have default args which must be guarded."
+            )
+            fn = module.forward.__func__
+            assert istype(fn, types.FunctionType)
+            options["source"] = AttrSource(
+                AttrSource(self.source, "forward"), "__func__"
+            )
+            args = [self] + args
+
+            return tx.inline_user_function_return(
+                variables.UserFunctionVariable(fn, **options),
+                args,
+                kwargs,
+            )
 
         if name == "_check_input_dim" and skipfiles.is_torch_inline_allowed(
             inspect.getfile(module.__class__._check_input_dim)
@@ -501,7 +560,7 @@ class UnspecializedNNModuleVariable(UserDefinedObjectVariable):
     """
 
     def __init__(self, value, **kwargs):
-        super(UnspecializedNNModuleVariable, self).__init__(value=value, **kwargs)
+        super().__init__(value=value, **kwargs)
         if self.source and self.source.is_nn_module():
             # force guard checks even when `not config.guard_nn_modules``
             self.source = NotNNModuleSource(self.source)
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index 25a3f909293e..afbffe155a28 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -1,9 +1,12 @@
+import inspect
 import itertools
 import operator
+import types
 from typing import Dict, List
 
 import torch.fx
 import torch.random
+from torch.fx.experimental.symbolic_shapes import guard_scalar
 
 from .. import config, variables
 from ..exc import unimplemented
@@ -11,10 +14,9 @@
 from ..source import AttrSource
 
 from ..utils import (
+    fqn,
     get_fake_value,
     get_real_value,
-    HAS_NUMPY,
-    np,
     product,
     proxy_args_kwargs,
     tensortype_to_dtype,
@@ -23,6 +25,21 @@
 from .constant import ConstantVariable
 from .lists import ShapeVariable, SizeVariable
 
+supported_tensor_comparison_ops = {
+    ">": operator.gt,
+    "<": operator.lt,
+    ">=": operator.ge,
+    "<=": operator.le,
+    "==": operator.eq,
+    "!=": operator.ne,
+}
+supported_const_comparison_ops = {
+    "is": operator.is_,
+    "is not": operator.is_not,
+    "==": operator.eq,
+    "!=": operator.ne,
+}
+
 
 class TensorVariable(VariableTracker):
     """A torch.Tensor input or an intermediate value in the FX graph"""
@@ -66,7 +83,7 @@ def __init__(
         specialized_value=None,
         **kwargs,
     ):
-        super(TensorVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.proxy = proxy
         self.dtype = dtype
         self.device = device
@@ -154,41 +171,6 @@ def var_getattr(self, tx, name):
             result = self.call_method(tx, "dim", [], {})
         elif name == "data":
             result = self.call_method(tx, "detach", [], {})
-        # TODO: reimplement the T/H/mT/mH by generating a function call
-        # to torch.Tensor.{T/H/mT/mH}.__get__
-        elif name in ("T", "H"):
-            out = (
-                tx.output.create_proxy(
-                    "call_method",
-                    "conj",
-                    *proxy_args_kwargs([self], {}),
-                )
-                if name == "H"
-                else self
-            )
-            args_list = [
-                variables.ConstantVariable(i) for i in range(self.ndim - 1, -1, -1)
-            ]
-            args = [variables.TupleVariable(args_list)]
-            result = out.call_method(tx, "permute", args, {})
-        elif name in ("mT", "mH"):
-            out = (
-                tx.output.create_proxy(
-                    "call_method",
-                    "conj",
-                    *proxy_args_kwargs([self], {}),
-                )
-                if name == "mH"
-                else self
-            )
-            if self.ndim > 0:
-                args = [
-                    variables.ConstantVariable(-2),
-                    variables.ConstantVariable(-1),
-                ]
-                result = out.call_method(tx, "transpose", args, {})
-            else:
-                result = out.call_method(tx, "t", [], {})
         if name == "__class__":
             return TorchVariable(self.python_type(), **options)
 
@@ -198,20 +180,59 @@ def var_getattr(self, tx, name):
         if result is not None and self.source is not None:
             result = result.add_guard(self.make_guard(GuardBuilder.TYPE_MATCH))
 
+        # For attributes (not methods) that were not caught in the special handling above,
+        # (e.g. tensor.real), we handle these generically, assuming that the output type is
+        # a tensor.
+        if result is None:
+
+            def try_generic_attr_handling():
+                from .builder import wrap_fx_proxy
+                from .misc import GetAttrVariable
+
+                try:
+                    static_attr = inspect.getattr_static(torch.Tensor, name)
+                except AttributeError:
+                    return None
+
+                # Make sure this is an attribute, not a method.
+                # type(torch.Tensor.H) should be "getset_descriptor"
+                # This is a because of CPython implementation, see THPVariableType:
+                # these attributes are implemented under tp_getset, which appear
+                # as `getset_descriptor`s, (compared to, say, methods which appear
+                # as `method_descriptor`s)
+                if type(static_attr) != types.GetSetDescriptorType:
+                    return None
+
+                return wrap_fx_proxy(
+                    tx=tx,
+                    proxy=GetAttrVariable.create_getattr_proxy(self.as_proxy(), name),
+                    **options,
+                )
+
+            result = try_generic_attr_handling()
+
         if result is None:
             raise NotImplementedError()
 
         return result
 
+    def has_unpack_var_sequence(self, tx):
+        return (self.size is not None and len(self.size) > 0) or (
+            self.size is None and config.dynamic_shapes
+        )
+
     def unpack_var_sequence(self, tx, idxes=None):
         from .builder import wrap_fx_proxy
 
+        options = VariableTracker.propagate(self)
         if idxes is None:
             if self.size:
-                idxes = range(self.size[0])
+                length = self.size[0]
             else:
-                return super(TensorVariable, self).unpack_var_sequence(tx)
-        options = VariableTracker.propagate(self)
+                dyn_length = self.call_method(tx, "size", [ConstantVariable(0)], {})
+                assert isinstance(dyn_length, SymNodeVariable)
+                length = dyn_length.evaluate_expr(tx.output)
+            idxes = range(length)
         return [wrap_fx_proxy(tx, self.as_proxy()[i], **options) for i in idxes]
 
     def call_method(
@@ -272,6 +293,25 @@ def call_method(
                 constant_result = ConstantVariable(
                     f"torch.{tensortype.__name__}", **options
                 )
+        elif (
+            name == "type"
+            and len(args) == 1
+            and fqn(type(args[0].as_python_constant())) == "torch.tensortype"
+        ):
+            # torch.FloatTensor, etc. are all of type "torch.tensortype".
+            # torch.fx's tracer fails on these types, because it doesn't support arguments of torch.tensortype type.
+            # So, we pass it in as a string (which is also supported, see above implementation for .type() with 0 args)
+            tensor_type = args[0].as_python_constant()
+            tensor_type_const = ConstantVariable(fqn(tensor_type), **options)
+            return wrap_fx_proxy(
+                tx,
+                tx.output.create_proxy(
+                    "call_method",
+                    name,
+                    *proxy_args_kwargs([self, tensor_type_const], kwargs),
+                ),
+                **options,
+            )
         elif name == "get_device" and isinstance(self.device, torch.device):
             index = self.device.index if self.device.type != "cpu" else -1
             constant_result = ConstantVariable(index, **options)
@@ -299,22 +339,16 @@ def call_method(
             unimplemented(f"Tensor.{name}")
         elif name == "nonzero" and not config.dynamic_shapes:
             unimplemented(f"Tensor.{name}")
-        elif name == "item":
-            if config.capture_scalar_outputs:
-                example_value = get_fake_value(self.proxy.node, tx)
-                return wrap_fx_proxy(
-                    tx,
-                    tx.output.create_proxy(
-                        "call_method",
-                        "item",
-                        (self.as_proxy(),),
-                        {},
-                    ),
-                    example_value=example_value,
-                    **options,
-                )
-            else:
-                unimplemented(f"Tensor.{name}")
+        elif name == "item" and not config.capture_scalar_outputs:
+            unimplemented(f"Tensor.{name}")
+        elif (
+            name == "item"
+            and config.capture_scalar_outputs
+            and not config.dynamic_shapes
+        ):
+            raise AssertionError(
+                "To capture_scalar_outputs, you must also set dynamic_shapes = True"
+            )
         elif name == "__len__":
             return self.call_method(tx, "size", [ConstantVariable(0, **options)], {})
         elif name == "__setitem__":
@@ -396,38 +430,36 @@ def call_method(
             )
 
 
-class DynamicShapeVariable(VariableTracker):
+class SymNodeVariable(VariableTracker):
     """
     Represents a symbolic size, e.g., as returned by tensor.size(0)
     """
 
     @classmethod
-    def create(cls, tx, proxy, dyn_shape, **options):
+    def create(cls, tx, proxy, sym_num, **options):
         if "example_value" in proxy.node.meta:
-            assert proxy.node.meta["example_value"] == dyn_shape
-        if dyn_shape is None:
-            dyn_shape = get_fake_value(proxy.node, tx)
-        proxy.node.meta["example_value"] = dyn_shape
-        return DynamicShapeVariable(proxy, dyn_shape, **options)
-
-    def __init__(self, proxy, dyn_shape, **kwargs):
-        super(DynamicShapeVariable, self).__init__(**kwargs)
+            assert proxy.node.meta["example_value"] == sym_num
+        if sym_num is None:
+            sym_num = get_fake_value(proxy.node, tx)
+        proxy.node.meta["example_value"] = sym_num
+        return SymNodeVariable(proxy, sym_num, **options)
+
+    def __init__(self, proxy, sym_num, **kwargs):
+        super().__init__(**kwargs)
         self.proxy = proxy
-        self.dyn_shape = dyn_shape
+        self.sym_num = sym_num
 
     def python_type(self):
-        return type(self.dyn_shape)
+        return type(self.sym_num)
 
     def unpack_var_sequence(self, tx):
-        super(DynamicShapeVariable, self).unpack_var_sequence(tx)
+        super().unpack_var_sequence(tx)
 
     def as_proxy(self):
         return self.proxy
 
     def evaluate_expr(self, output_graph):
-        if not isinstance(self.dyn_shape, torch.SymInt):
-            return self.dyn_shape
-        return output_graph.shape_env.evaluate_expr(self.dyn_shape.node.expr)
+        return guard_scalar(self.sym_num)
 
     def call_method(
         self,
@@ -464,7 +496,7 @@ def __init__(
         subclass_type,
         **kwargs,
     ):
-        super(TensorWithTFOverrideVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.tensor_variable = tensor_variable
         self.orig_tensor_variable_source = orig_tensor_variable_source
         self.subclass_torch_function__func = subclass_torch_function__func
@@ -577,10 +609,8 @@ class UnspecializedPythonVariable(TensorVariable):
 
     def __init__(self, proxy: torch.fx.Proxy, **kwargs):
         raw_value = kwargs.pop("raw_value", None)
-        if HAS_NUMPY and isinstance(raw_value, np.number):
-            raw_values = raw_value.item()
         need_unwrap = kwargs.pop("need_unwrap", True)
-        super(UnspecializedPythonVariable, self).__init__(proxy, **kwargs)
+        super().__init__(proxy, **kwargs)
         self.raw_value = raw_value
         self.need_unwrap = need_unwrap
 
@@ -611,7 +641,7 @@ class FakeItemVariable(TensorVariable):
 
     def __init__(self, proxy: torch.fx.Proxy, **kwargs):
         need_unwrap = kwargs.pop("need_unwrap", False)
-        super(FakeItemVariable, self).__init__(proxy, **kwargs)
+        super().__init__(proxy, **kwargs)
         self.need_unwrap = need_unwrap
 
     @classmethod
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 75eaec9a2cb8..d11fb95020fb 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -3,7 +3,6 @@
 import math
 import re
 import types
-from collections import OrderedDict
 from typing import Dict, List
 
 import torch._C
@@ -11,6 +10,7 @@
 import torch.nn
 import torch.onnx.operators
 from torch._dynamo.utils import get_fake_value
+from torch._dynamo.variables import SymNodeVariable
 from torch._guards import GuardsCheckpointState
 
 from .. import config, variables
@@ -121,7 +121,7 @@ class TorchVariable(VariableTracker):
     """Points to a module or method in torch.*"""
 
     def __init__(self, value, **kwargs):
-        super(TorchVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
 
         if value in tensor_dunder_fns_remap:
             value = tensor_dunder_fns_remap[value]
@@ -160,7 +160,7 @@ def unique_var_name(self):
         return "__" + re.sub(r"[^a-zA-Z0-9_]+", "_", name)
 
     def reconstruct(self, codegen):
-        return codegen.setup_globally_cached(self.unique_var_name(), self.value)
+        return codegen.setup_globally_cached(self.unique_var_name(), self.value, False)
 
     def as_proxy(self):
         return self.value
@@ -183,13 +183,15 @@ def call_function(
     ) -> "VariableTracker":
         from . import (
             ConstantVariable,
-            DynamicShapeVariable,
+            CUDAStreamContextVariable,
+            CUDAStreamVariable,
             GradModeVariable,
+            SymNodeVariable,
             TensorVariable,
             UserDefinedObjectVariable,
         )
 
-        from .builder import wrap_fx_proxy
+        from .builder import wrap_fx_proxy, wrap_fx_proxy_cls
 
         constant_args = check_constant_args(args, kwargs)
         unspec_python_args = check_unspec_python_args(args, kwargs)
@@ -269,6 +271,24 @@ def call_function(
             return ConstantVariable(torch.is_grad_enabled(), **options).add_guards(
                 GradModeVariable._guards_singleton
             )
+        elif self.value is torch.cuda.stream:
+            log.warning(
+                "torch.cuda.stream() not fully supported, streams may be ignored"
+            )
+            assert len(args) == 1
+            return CUDAStreamContextVariable.create(tx, args[0], **options)
+        elif self.value is torch.cuda.streams.Stream:
+            return wrap_fx_proxy_cls(
+                CUDAStreamVariable,
+                tx,
+                tx.output.create_proxy(
+                    "call_function",
+                    torch.cuda.streams.Stream,
+                    (),
+                    {},
+                ),
+                **options,
+            )
         elif not config.dynamic_shapes and self.is_dynamic_shapes(args, kwargs):
             unimplemented(f"dynamic shapes: {self.value.__name__}")
         elif len(args) > 0 and isinstance(args[0], TensorWithTFOverrideVariable):
@@ -421,19 +441,19 @@ def get_state_from_generator():
             )
         else:
             any_symints_or_symfloats = any(
-                [isinstance(x, DynamicShapeVariable) for x in args]
+                [isinstance(x, SymNodeVariable) for x in args]
             )
             all_ints_or_floats = all(
                 [
                     isinstance(
-                        x, (variables.ConstantVariable, variables.DynamicShapeVariable)
+                        x, (variables.ConstantVariable, variables.SymNodeVariable)
                     )
                     for x in args
                 ]
             )
-            bin_ops = set(["add", "sub", "mul", "div", "sqrt"])
+            bin_ops = {"add", "sub", "mul", "div", "sqrt"}
             if (
-                self.value.__module__ == "torch"
+                getattr(self.value, "__module__", "") == "torch"
                 and self.value.__name__ in bin_ops
                 and any_symints_or_symfloats
                 and all_ints_or_floats
@@ -458,11 +478,68 @@ def get_state_from_generator():
                     if isinstance(x.value, np.generic):
                         x.value = x.value.item()
 
+            if self.value == torch._C._nn.scaled_dot_product_attention:
+                # See:[Note] SDPA_flash's meta function returns incorrect Philox seed and offset
+                # in pytorch/torch/_meta_registrations.py
+                all_kwargs = kwargs.copy()
+                all_kwargs.update(
+                    dict(
+                        zip(
+                            (
+                                "query",
+                                "key",
+                                "value",
+                                "attn_mask",
+                                "dropout_p",
+                                "is_causal",
+                            ),
+                            args,
+                        )
+                    )
+                )
+                fake_query = all_kwargs["query"].as_proxy().node.meta["example_value"]
+                fake_key = all_kwargs["key"].as_proxy().node.meta["example_value"]
+                fake_value = all_kwargs["value"].as_proxy().node.meta["example_value"]
+                fake_mask = all_kwargs.get("attn_mask")
+                if isinstance(fake_mask, TensorVariable):
+                    fake_mask = fake_mask.as_proxy().node.meta["example_value"]
+                else:
+                    fake_mask = None
+                dropout_p = kwargs.get("dropout_p")
+                dropout_p = dropout_p.value if dropout_p is not None else 0.0
+                is_causal = kwargs.get("is_causal")
+                is_causal = is_causal.value if is_causal is not None else False
+                # We look through the stack to find a cuda autocast context
+                # If we do we will convert the fake tensors to torch.float16
+                is_cuda_autocast_context = False
+                for block in tx.block_stack:
+                    if (
+                        isinstance(block.with_context, AutocastModeVariable)
+                        and block.with_context.target_values[0] == "cuda"
+                    ):
+                        is_cuda_autocast_context = True
+                        break
+
+                if is_cuda_autocast_context and fake_query.device.type == "cuda":
+                    amp_dtype = torch.float16
+                    fake_query = fake_query.clone().to(amp_dtype)
+                    fake_key = fake_key.clone().to(amp_dtype)
+                    fake_value = fake_value.clone().to(amp_dtype)
+
+                backend_choice = torch._fused_sdp_choice(
+                    fake_query, fake_key, fake_value, fake_mask, dropout_p, is_causal
+                )
+                if backend_choice == torch.backends.cuda.SDPBackend.FLASH_ATTENTION:
+                    if dropout_p is not None and dropout_p != 0.0:
+                        unimplemented(
+                            "FlashAttention with dropout is not supported in cuda graphs"
+                        )
+
             # TODO(voz): Replace w/ dynamic shape rewrite table.
             # Ideally, we would be able to do this at ctor time, but alas we need a combination
             # of value + args to determine this.
             fn_ = self.value
-            if any([isinstance(x, DynamicShapeVariable) for x in args]):
+            if any([isinstance(x, SymNodeVariable) for x in args]):
                 if self.value == math.sqrt:
                     from torch.fx.experimental.symbolic_shapes import sym_sqrt
 
@@ -478,7 +555,10 @@ def get_state_from_generator():
                 **options,
             )
 
-            if "out" in kwargs:
+            if "out" in kwargs and not (
+                isinstance(kwargs["out"], variables.ConstantVariable)
+                and kwargs["out"].as_python_constant() is None
+            ):
                 # out variants of torch operators like torch.sort and
                 # torch.sigmoid mutate the tensors in the out field. Track such
                 # tensors and rewrite the symbolic locals.
@@ -488,13 +568,13 @@ def get_state_from_generator():
                         tx.find_symbolic_locals_name(x) for x in kwargs["out"].items
                     ]
                     for idx, name in enumerate(output_tensor_names):
-                        assert name in tx.symbolic_locals
-                        tx.symbolic_locals[name] = tensor_variable.items[idx]
+                        if name in tx.symbolic_locals:
+                            tx.symbolic_locals[name] = tensor_variable.items[idx]
                 elif isinstance(tensor_variable, TensorVariable):
                     assert isinstance(kwargs["out"], TensorVariable)
                     name = tx.find_symbolic_locals_name(kwargs["out"])
-                    assert name in tx.symbolic_locals
-                    tx.symbolic_locals[name] = tensor_variable
+                    if name in tx.symbolic_locals:
+                        tx.symbolic_locals[name] = tensor_variable
                 else:
                     unimplemented(f"out variant of {type(kwargs['out'])}")
 
@@ -659,7 +739,7 @@ def handle_ntuple(value):
 
 class TorchPyOperator(VariableTracker):
     def __init__(self, value, **kwargs):
-        super(TorchPyOperator, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.value = value
 
     def call_function(
@@ -710,9 +790,6 @@ def get_comparable_state(state):
                     # Timestamp is monotonically increasing so we don't
                     # care about divergence
                     timestamp=0,
-                    # Meh (problem is the nodes don't compare equal;
-                    # maybe nub out outputs only)
-                    name_to_input=OrderedDict(),
                     # Unused in branches
                     graphargs=[],
                 )
@@ -768,7 +845,9 @@ def speculate_subgraph(f, sub_args, graph_checkpoint, checkpoint):
             # ops - see torch/dispatch/_dispatcher.py
 
             assert len(args) == 4
-            assert type(args[0]) is TensorVariable, str(type(args[0]))  # predicate
+            assert type(args[0]) in (TensorVariable, SymNodeVariable), str(
+                type(args[0])
+            )  # predicate
             assert isinstance(
                 args[1], (UserFunctionVariable, NestedUserFunctionVariable)
             ), str(
@@ -844,7 +923,7 @@ def speculate_branch(branch):
                 args[0].as_proxy(),
                 true_node,
                 false_node,
-                list(a.as_proxy() for a in sub_args),
+                [a.as_proxy() for a in sub_args],
             )
             # TODO: assert that the true/false return values are
             # consistent
diff --git a/torch/_dynamo/variables/user_defined.py b/torch/_dynamo/variables/user_defined.py
index 6958556793c8..ce8abdb5807a 100644
--- a/torch/_dynamo/variables/user_defined.py
+++ b/torch/_dynamo/variables/user_defined.py
@@ -1,6 +1,5 @@
 import collections
 import contextlib
-import dataclasses
 import functools
 import importlib
 import inspect
@@ -14,7 +13,12 @@
 from ..exc import unimplemented
 from ..guards import GuardBuilder
 from ..source import AttrSource, ODictGetItemSource, RandomValueSource
-from ..utils import is_namedtuple_cls, namedtuple_fields
+from ..utils import (
+    get_custom_getattr,
+    is_namedtuple_cls,
+    namedtuple_fields,
+    object_has_getattribute,
+)
 from .base import MutableLocal, VariableTracker
 from .misc import NullContextVariable
 
@@ -31,6 +35,9 @@ def __init__(self, value, **kwargs):
     def as_python_constant(self):
         return self.value
 
+    def python_type(self):
+        return type(self.value)
+
     def var_getattr(self, tx, name: str) -> "VariableTracker":
         from . import ConstantVariable
         from .builder import VariableBuilder
@@ -58,7 +65,7 @@ def var_getattr(self, tx, name: str) -> "VariableTracker":
             elif ConstantVariable.is_literal(obj):
                 return ConstantVariable(obj, **options)
 
-        return super(UserDefinedClassVariable, self).var_getattr(tx, name)
+        return super().var_getattr(tx, name)
 
     def call_method(
         self,
@@ -93,10 +100,7 @@ def call_function(
 
         options = VariableTracker.propagate(self, args, kwargs.values())
 
-        if self.value in (
-            contextlib.nullcontext,
-            torch.autograd.profiler.profile,
-        ):
+        if self.value is contextlib.nullcontext:
             return NullContextVariable(**options)
         elif is_namedtuple_cls(self.value):
             fields = namedtuple_fields(self.value)
@@ -136,7 +140,7 @@ class UserDefinedObjectVariable(UserDefinedVariable):
     """
 
     def __init__(self, value, value_type=None, **kwargs):
-        super(UserDefinedObjectVariable, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.value = value
         self.value_type = value_type or type(value)
         assert type(value) is self.value_type
@@ -262,27 +266,17 @@ def call_function(
         return super().call_function(tx, args, kwargs)
 
     def _check_for_getattribute(self):
-        try:
-            if isinstance(
-                inspect.getattr_static(type(self.value), "__getattribute__"),
-                types.FunctionType,
-            ):
-                unimplemented("UserDefinedObjectVariable with custom __getattribute__")
-        except AttributeError:
-            pass
+        if object_has_getattribute(self.value):
+            unimplemented("UserDefinedObjectVariable with custom __getattribute__")
 
     def _check_for_getattr(self):
-        try:
-            getattr_fn = inspect.getattr_static(type(self.value), "__getattr__")
-        except AttributeError:
-            getattr_fn = None
-        if getattr_fn is torch.nn.Module.__getattr__:
-            # ignore this case of getattr
-            getattr_fn = None
-        return getattr_fn
+        return get_custom_getattr(self.value)
 
     def _getattr_static(self, name):
-        if isinstance(self.value, (dataclasses.Field, torch.nn.Module)):
+        if (
+            isinstance(self.value, torch.nn.Module)
+            or "__slots__" in self.value.__class__.__dict__
+        ):
             # getattr_static doesn't work on these
             subobj = getattr(self.value, name)
         else:
@@ -315,11 +309,15 @@ def var_getattr(self, tx, name):
                 subobj.fget, self, source=source, **options
             ).call_function(tx, [], {})
         elif isinstance(subobj, staticmethod):
-            return variables.UserFunctionVariable(subobj.__get__(self.value), **options)
+            return variables.UserFunctionVariable(
+                subobj.__get__(self.value), source=source, **options
+            )
         elif isinstance(subobj, classmethod):
-            return variables.UserMethodVariable(subobj.__func__, self, **options)
+            return variables.UserMethodVariable(
+                subobj.__func__, self, source=source, **options
+            )
         elif isinstance(subobj, types.FunctionType):
-            return variables.UserMethodVariable(subobj, self, **options)
+            return variables.UserMethodVariable(subobj, self, source=source, **options)
 
         if (
             name in getattr(value, "__dict__", {})
diff --git a/torch/_export/__init__.py b/torch/_export/__init__.py
new file mode 100644
index 000000000000..941eb84ec937
--- /dev/null
+++ b/torch/_export/__init__.py
@@ -0,0 +1,215 @@
+import contextlib
+import copy
+from typing import Callable, Tuple, Generator, Dict
+from unittest.mock import patch
+
+import torch
+import torch._dynamo as torchdynamo
+from torch._decomp import core_aten_decompositions
+from torch._dispatch.python import enable_python_dispatcher
+from torch.nn.utils import stateless
+from torch.utils import _pytree as pytree
+
+from torch._functorch.aot_autograd import (
+    AOTConfig,
+    create_aot_dispatcher_function,
+    default_partition,
+    run_functionalized_fw_and_collect_metadata,
+)
+
+from torch.fx.experimental.proxy_tensor import (
+    get_proxy_slot,
+    get_torch_dispatch_modes,
+    has_proxy_slot,
+    make_fx,
+    ProxyTorchDispatchMode,
+    set_proxy_slot,
+)
+
+from torch._functorch.eager_transforms import _unwrap_all_tensors_from_functional
+
+from .workflow import ExportedProgram
+
+CORE_ATEN_DECOMPOSITIONS_TABLE = core_aten_decompositions()
+
+__all__ = ["experimental_export"]
+
+
+def _aot_capture(mod, flat_args):
+    """
+    A wrapper around aot_autograd() to mix AOT Autograd + torch.export.
+    Some assumptions were made about the AOT Autograd internal:
+    1. The functionalization metadata format.
+    2. Calling convention of returned forward graph.
+    3. make_fx() internal proxy storage.
+
+    In the current context we're just experimenting the idea so it's possible things
+    could break. For the next step we should find a way to upstream something reasonable.
+    """
+    param_list = [
+        *mod.named_parameters(remove_duplicate=False),
+        *mod.named_buffers(remove_duplicate=False),
+    ]
+    params = dict(param_list)
+    params_flat, params_spec = pytree.tree_flatten(params)
+    params_len = len(params_flat)
+
+    full_args = []
+    full_args.extend(params_flat)
+    full_args.extend(flat_args)
+
+    def functional_call(*args):
+
+        with stateless._reparametrize_module(
+            mod,
+            pytree.tree_unflatten(args[:params_len], params_spec),  # type: ignore[arg-type]
+        ):
+            return torch.fx.Interpreter(mod).run(*args[params_len:])
+
+    out_spec = None
+
+    with enable_python_dispatcher():
+        fw_metadata, _ = run_functionalized_fw_and_collect_metadata(
+            lambda *args: pytree.tree_flatten(functional_call(*args))[0],
+            keep_input_mutations=False,
+        )(*copy.deepcopy(full_args))  # type: ignore[operator]
+
+    assert len(fw_metadata.input_info) == len(full_args)
+    mutated_input_indices = [
+        i
+        for i, input_info in enumerate(fw_metadata.input_info)
+        if input_info.mutates_data or input_info.mutates_metadata
+    ]
+
+    graph_module = None
+
+    def fw_compiler(gm, inputs):
+        nonlocal graph_module
+        graph_module = gm
+
+    num_fwd_returns = None
+
+    def partition_fn(joint_module, joint_inputs, *, num_fwd_outputs, **kwargs):
+        nonlocal num_fwd_returns
+        num_fwd_returns = num_fwd_outputs
+        return default_partition(
+            joint_module, joint_inputs, num_fwd_outputs=num_fwd_outputs, **kwargs
+        )
+
+    def set_state_proxies(state_args):
+        modes = get_torch_dispatch_modes()
+        proxy_tensor_modes = [m for m in modes if isinstance(m, ProxyTorchDispatchMode)]
+        if len(proxy_tensor_modes) == 0:
+            return
+        assert len(state_args) == len(params_flat)
+        for i, arg in enumerate(state_args):
+            tracer = next(
+                m.tracer for m in proxy_tensor_modes if has_proxy_slot(arg, m.tracer)
+            )
+            set_proxy_slot(arg, tracer, params_flat[i])
+
+    aot_config = AOTConfig(
+        fw_compiler=fw_compiler,
+        bw_compiler=lambda gm, inputs: None,
+        partition_fn=partition_fn,
+        decompositions=CORE_ATEN_DECOMPOSITIONS_TABLE,  # type: ignore[arg-type]
+        num_params_buffers=params_len,
+        aot_id=-1,
+        keep_inference_input_mutations=False,
+    )
+
+    @contextlib.contextmanager
+    def setup_dynamic_shape():
+        prev, torch._functorch.config.use_dynamic_shapes = (
+            torch._functorch.config.use_dynamic_shapes,
+            True,
+        )
+        try:
+            yield
+        finally:
+            torch._functorch.config.use_dynamic_shapes = prev
+
+    def exported_call(*args):
+        state_args = args[:params_len]
+        unwrapped_state_args = _unwrap_all_tensors_from_functional(
+            state_args, reapply_views=False
+        )
+        set_state_proxies(unwrapped_state_args)
+        with torch.fx.traceback.preserve_node_meta():
+            outputs = functional_call(*args)
+        nonlocal out_spec
+        outputs, out_spec = pytree.tree_flatten(outputs)
+        return outputs
+
+    with torch.enable_grad(), setup_dynamic_shape():
+        create_aot_dispatcher_function(
+            exported_call,
+            full_args,
+            aot_config,
+        )
+
+    assert graph_module is not None
+
+    for i, node in enumerate(graph_module.graph.nodes):
+        if i == len(params_flat):
+            break
+        assert node.op == "placeholder" and len(node.users) == 0
+        graph_module.graph.erase_node(node)
+
+    output_node = next(iter(reversed(graph_module.graph.nodes)))
+    assert output_node.op == "output" and len(output_node.args) == 1
+    assert num_fwd_returns is not None
+    # Turncate the output so we only output what we need.
+    output_node.args = (
+        output_node.args[0][
+            : len(mutated_input_indices) + len(fw_metadata.output_info)
+        ],
+    )
+
+    graph_module.graph.eliminate_dead_code()
+    graph_module.recompile()
+
+    def find_mutation_destinations(gm, w):
+        assert isinstance(w, torch.Tensor)
+        ret = [
+            name for name, x in [*gm.named_parameters(), *gm.named_buffers()] if x is w
+        ]
+        assert len(ret) != 0, "Cannot find mutation destination."
+        return ret
+
+    mutation = [
+        (
+            "copy_",
+            output_node.args[0][k].name,
+            find_mutation_destinations(graph_module, param_list[i][1]),
+        )
+        for k, i in enumerate(mutated_input_indices)
+    ]
+    assert out_spec is not None
+    return graph_module, mutation, out_spec
+
+
+@patch.object(torchdynamo.config, "dynamic_shapes", True)
+@patch.object(torchdynamo.config, "capture_scalar_outputs", True)
+@patch.object(torchdynamo.config, "guard_nn_modules", True)
+@patch.object(torchdynamo.config, "specialize_int_float", True)
+@patch.object(torchdynamo.config, "allow_rnn", True)
+@patch.object(torchdynamo.config, "verbose", True)
+def do_not_use_experimental_export(f: Callable, args: Tuple, training=False):
+    """
+    This prototype is under heavy development. Pls don't use it if you are
+    not part of PyTorch 2.0 Export team.
+    """
+    if training:
+        NotImplementedError("training mode is not supported yet")
+
+    flattened_args, in_spec = pytree.tree_flatten(args)
+    # Doing it twice so that if graph_module accidentally modifies the input
+    # we still get the same original input.
+    original_flat_args = tuple(flattened_args)
+    flat_args = tuple(flattened_args)
+
+    graph_module, guards = torchdynamo.export(f, *args, aten_graph=False)
+    # TODO (tmanlaibaatar) do sth with guards?
+    graph_module, _, out_spec = _aot_capture(graph_module, flat_args)
+    return ExportedProgram(fw_module=graph_module, example_inputs=original_flat_args, in_spec=in_spec, out_spec=out_spec)
diff --git a/torch/_export/logical_schema.py b/torch/_export/logical_schema.py
new file mode 100644
index 000000000000..097a732df412
--- /dev/null
+++ b/torch/_export/logical_schema.py
@@ -0,0 +1,296 @@
+# type: ignore[assignment]
+
+from dataclasses import dataclass
+from enum import auto, Enum
+from typing import List, Union, Dict
+
+################################################################################
+# Following section is the defining the permissible argument types for operators
+
+# Copied from torchgen/model.py
+class ScalarType(Enum):
+    u8 = auto()     # torch.uint8
+    i8 = auto()     # torch.int8
+    i16 = auto()    # torch.int16 or torch.short
+    i32 = auto()    # torch.int32 or torch.int
+    i64 = auto()    # torch.int64 or torch.long
+    f16 = auto()    # torch.float16 or torch.half
+    f32 = auto()    # torch.float32 or torch.float
+    f64 = auto()    # torch.float64 or torch.double
+    c32 = auto()    # torch.complex32
+    c64 = auto()    # torch.complex64 or torch.cfloat
+    c128 = auto()   # torch.complex128 or torch.cdouble
+    b8 = auto()     # torch.bool
+    bf16 = auto()   # torch.bfloat16
+
+# Copied from torch/_C/__init__.pyi.in
+class Layout(Enum):
+    # Defined in torch/csrc/utils/tensor_layouts.cpp
+    strided = auto()
+    sparse_coo = auto()
+    sparse_csr = auto()
+    sparse_csc = auto()
+    sparse_bsr = auto()
+    sparse_bsc = auto()
+    _mkldnn = auto()
+
+
+# Copied from torch/_C/__init__.pyi.in
+class MemoryFormat(Enum):
+    # Defined in torch/csrc/utils/tensor_memoryformats.cpp
+    contiguous_format = auto()
+    channels_last = auto()
+    channels_last_3d = auto()
+    preserve_format = auto()
+
+
+# Copied from torch/_C/__init__.pyi.in
+@dataclass
+class Device:
+    # Defined in torch/csrc/Device.cpp
+    type: str
+    index: int
+
+@dataclass
+class SymInt:  # Union, ONLY EXACTLY ONE of the following fields can be set
+    as_int: int = None
+    as_sym: str = None
+
+# !!! To support t.item(), we need to introduce SymFloat
+# @dataclass
+# class SymFloat:  # Union, ONLY EXACTLY ONE of the following fields can be set
+#     as_flaot: float = None
+#     as_sym: str = None
+
+
+# This is a Tensor Arugment used in the args of an node
+# We intentionally don't store the tensor's storage, nor the tensor's meta data here,
+# as the same tensor argument can be used in multiple nodes, and we want to avoid storing the same data multiple times.
+# In another word, this field is an reference to the tensor, not the tensor itself.
+@dataclass
+class TensorArgument:
+    name: str   # identifier of the tensor, which must exist in graph's tensor_values
+
+# This is a SymInt Arugment used in the args of an node
+# We intentionally don't store the SymInt's value here, as the same SymInt argument can be used in multiple nodes
+# This field is an reference to the SymInt
+@dataclass
+class SymIntArgument:
+    name: str   # identifier of the symint, which must exist in graph's symint_values
+
+#  Permissible return types for operators
+# !!! Notice: this assumes that a node can only return Tensor(s) and Symint(s), and not other int/float/bool types...
+# !!! What about .item()? Do we need to handle this?
+@dataclass
+class ReturnArgument:  # Union, ONLY EXACTLY ONE of the following fields can be set
+    as_tensor: TensorArgument = None
+
+    # !!! ATM, no operator has return type as Tensor[], might need this latter?
+    # as_tensors: List[TensorArgument] = None
+
+    as_symint: SymIntArgument = None
+
+
+# Permissible argument types for operators
+# !!! This is a Union struct, but there is no good python construct to model this
+@dataclass
+class Argument:  # Union, ONLY EXACTLY ONE of the following fields can be set
+    # A special type for representing python None in the arguments
+    # This must only be used for ops that accepts None as an argument, e.g. Tensor?, Scalar?, int?, int[]?
+    as_none: bool = None
+
+    as_tensor: TensorArgument = None
+    as_tensors: List[TensorArgument] = None   # Tensor[], used by aten.cat, and condition ops
+
+    as_symint: SymIntArgument = None         # Symint can be an argument, there are symint in native_function.yaml
+    as_symints: List[SymIntArgument] = None   # Symint[] can be an argement, there are symint[] in native_function.yaml
+
+    as_bool: bool = None
+
+    # !!! There are use of bool[3] in canonical aten ops, consider if we can simplify this
+    as_bools: List[bool] = None     # for bool[]
+
+    as_int: int = None
+    as_ints: List[int] = None      # for int[]
+    as_float: float = None
+    as_floats: List[float] = None    # for float[]
+    as_str: str = None
+    # List[str],        # !!! There is no str[] in native_function.yaml. Consider if this is needed for expressiveness
+
+    # Graph,            # !!! Consider how to handle condition op, which need to pass in a graph for the branch
+    # List[Graph],      # !!! What about list of graphs? Do we need this?
+    as_gm: "GraphModule" = None     # !!! ATM, torch.cond models branch as GraphModule
+
+    # !!! Following types doesn't have a list version in native_function.yaml
+    as_scalar_type: ScalarType = None
+    as_memory_format: MemoryFormat = None
+    as_layout: Layout = None
+    as_device: Device = None
+
+
+################################################################################
+# Following section is the defining the schema of serializing a concrete tensor
+
+# TensorMeta is a decription of a tensor, without the actual data (,effectively maps to FakeTensor)
+# TensorMeta has multliple uses
+#   1. Represent the property of a concrete tensor backed by a storage
+#     - This is used in the serialization of a concrete tensor, e.g. model weight
+#     - In this case, sizes and strides must be concrete ints, and cannot be symbolic
+#     - stride and storage_offset have to used to correctly reconstruct the tensor from the storage
+#   2. Represent the property of a virtual tensor (see TensorValue below)
+#     - In this case, sizes and strides can be either concrete ints or symbolic ints.
+#     - device/strides/storage_offset/layout/memory_format are tied to pytorch's implementation.
+#       These are faithful capture of tensor's detail in pytorch's executions during tracing
+#       However, it's up to downstream system on how to utilized these fields
+#       In another word, these feilds are suggestive, rather than mandatory.
+
+
+@dataclass
+class TensorMeta:
+    dtype: ScalarType
+    sizes: List[SymInt]
+
+    # needed for training
+    requires_grad: bool
+
+    # !!! see description above, there are subtle difference on how these fields should be interpreted
+    device: Device
+    strides: List[SymInt]
+    storage_offset: SymInt
+    layout: Layout
+
+
+@dataclass
+class Buffer:
+    # data stored in big endian
+    buffer: bytes
+
+
+# External data needs to stored in big endian
+@dataclass
+class ExternalBuffer:
+    location: str
+    offset: str     # !!! Consider using int, but int has int_max limitation
+    length: str     # !!! Consider using int, but int has int_max limitation
+    checksum: str
+
+
+@dataclass
+class Storage:
+    class DataLocation(Enum):
+        Internal = auto()
+        External = auto()
+
+    data_location: DataLocation
+    data: Union[Buffer, ExternalBuffer]
+
+
+# This is a concrete tensor backed by storage
+@dataclass
+class Tensor:
+    # storage
+    storage: Storage
+
+    # metadata
+    meta: TensorMeta
+
+
+################################################################################
+# Following section is defining the schema of 3 level construct: GraphModule, Graph, Node
+
+# TensorValue has no corresponding class in fx
+# TensorValue is the "tensor results" that are passed between nodes in the graph
+# TensorValue is a named virtual tensor, with an TensorMeta that describes the properties of the tensor
+@dataclass
+class TensorValue:
+    name: str           # unique identifier of the TensorValue, referenced in Argument.as_tensor field
+    meta: TensorMeta    # tensor meta
+
+
+@dataclass
+class SymIntValue:
+    name: str       # unique identifier of the SymIntValue, referenced in Argument.as_symint field
+    value: SymInt
+
+@dataclass
+class NodeMetadata:
+    stack_trace: str                      # source info of a node
+    nn_module_stack: str                  # stack of nn.Module that the node originates from
+    extra: Dict[str, str]                 # arbitrary string-string pairs for extra metadata
+
+
+# Maps to fx.Node
+# Node can only be 'call_function' ops
+# 'placeholder' and 'output' are serialized as inputs and outputs of the Graph
+# 'get_attr' is not needed anymore, as it's an implicit lookup from GraphModule's parameters/buffers
+# 'call_method' and 'call_module' is not supported, as it's not used in the canonical FX Graph
+@dataclass
+class Node:
+    # fully qualified name to the target, e.g. aten.add.Tensnor
+    # !!! Consider using a structured operator name instead of string
+    target: str
+
+    args: List[Argument]
+
+    # kwargs for this node
+    # !!! Not all types in Argument are used as kwargs, e.g. TensorArgument should not be used as kwargs
+    # Do we want to enforce this in the schema? i.e. only allow certain types to be used as kwargs?
+    kwargs: Dict[str, Argument]
+
+    # A list of Argument returned by this node
+    outputs: List[ReturnArgument]
+
+    metadata: NodeMetadata          # metadata fields for this node
+
+
+# Maps to fx.Graph
+@dataclass(init=False)
+class Graph:
+    # Maps to fx.graph's placeholder nodes.
+    # !!! Do we allow SymInt as graph input?
+    # !!! need to think about where to store the metadata for placeholder nodes
+    inputs: List[TensorArgument]
+
+    # Maps to fx.graph's output node.
+    # !!! Do we allow SymInt as graph output?
+    # !!! need to thinking about where to store the metadata for original output node
+    outputs: List[TensorArgument]
+
+    # maps to computations nodes in fx.graph
+    # Placeholder nodes and output node are not included in this list.
+    # Only call_function can be included in this list
+    nodes: List[Node]
+
+    # Tensor values that appear in the graph
+    # They could be graph inputs, graph outputs, or intermediate tensor values produced by nodes
+    tensor_values: List[TensorValue]
+
+    # SymInt values that appear in the graph
+    symint_values: List[SymIntValue]
+
+
+# Maps to fx.GraphModule
+# This the top level construct for the model
+@dataclass(init=False)
+class GraphModule:
+    # A readable name for the model, potentially maps to GraphModule's self.__class__.__name__
+    # This is not an identified for GraphModule
+    name: str
+
+    graph: Graph    # Only one Graph per GraphModule
+
+    # maps to GraphModule's meta, which is a Dict[str, Any], but we only support string key and string value.
+    metadata : Dict[str, str]
+
+    # Stateful fields of the graph module
+
+    # The name of the tensor will be used to bind to the TensorValues of Graph
+    # !!! Consider storing them in the Graph.
+    # There are functional difference between buffers and parameters, so they are stored separately.
+    parameters: Dict[str, Tensor]
+    buffers: Dict[str, Tensor]
+
+    # !!! model constants: constant, etc.
+
+    # !!! Might also need to store the shape_env for symints, but it's unclear how downstream system will use it.
+    # !!! Consider storing it in the GraphModule, or in the Graph.
diff --git a/torch/_export/workflow.py b/torch/_export/workflow.py
new file mode 100644
index 000000000000..27208f041a40
--- /dev/null
+++ b/torch/_export/workflow.py
@@ -0,0 +1,19 @@
+import dataclasses
+from typing import Callable, Tuple
+
+import torch
+from torch.fx.passes.pass_manager import PassManager
+from torch.utils._pytree import TreeSpec
+
+@dataclasses.dataclass
+class ExportedProgram:
+    fw_module: torch.fx.GraphModule
+    example_inputs: Tuple[torch.Tensor, ...]
+    in_spec: TreeSpec
+    out_spec: TreeSpec
+
+    def transform(self, *passes: Callable) -> "ExportedProgram":
+        res = PassManager(list(passes))(self.fw_module)
+        assert res is not None
+        transformed = dataclasses.replace(self, fw_module=res.graph_module)
+        return transformed
diff --git a/torch/_functorch/aot_autograd.py b/torch/_functorch/aot_autograd.py
index 7294a61a5ccd..a465d4aa7a09 100644
--- a/torch/_functorch/aot_autograd.py
+++ b/torch/_functorch/aot_autograd.py
@@ -26,7 +26,6 @@
 from torch.multiprocessing.reductions import StorageWeakRef
 from torch.nn.utils import stateless
 from . import config
-from .named_members_polyfill import _named_buffers, _named_parameters
 from .partitioners import default_partition
 from torch._guards import TracingContext, DuplicateInputs
 
@@ -437,10 +436,15 @@ class ViewAndMutationMeta:
     # another user output (in both cases, we won't redundantly append bases to the end of the graph)
     num_intermediate_bases: int
 
+    # For inference only: instructs us to keep data-only input mutations directly in the graph
+    keep_input_mutations: int
+
     def __post_init__(self):
-        # pre-compute the indices of the inputs that are mutated
+        # pre-compute the indices of the inputs that are mutated.
+        # When keep_input_mutations is set, we don't need to worry about our epilogue
+        # handling data-only mutations, because we keep them directly in the graph.
         mutated_inp_indices = [
-            i for i, m in enumerate(self.input_info) if m.mutates_data or m.mutates_metadata
+            i for i, m in enumerate(self.input_info) if m.mutates_metadata or (not self.keep_input_mutations and m.mutates_data)
         ]
         aliased_out_indices = [
             i
@@ -497,8 +501,17 @@ def gen_alias_from_base(aliased_base_tensor, target_meta_tensor, target_requires
         else:
             reshaped_base_tensor = aliased_base_tensor
         out = target_meta_tensor._view_func(reshaped_base_tensor)
-        if out is not None:
-            out.requires_grad_(target_requires_grad)
+        # This shape mismatch can happen due to a bug in inplace/view handling in autograd.
+        # Try putting a breakpoint here and running
+        # `test/functorch/test_aotdispatch TestAOTAutograd.test_output_all_alias_types`
+        # Also, https://github.com/pytorch/pytorch/issues/49825
+        #
+        # As a stopgap, we'll fall back to as_strided.
+        if out is not None and out.shape == target_meta_tensor.shape:
+            if aliased_base_tensor.requires_grad and not target_requires_grad:
+                out = out.detach()
+            elif not aliased_base_tensor.requires_grad and target_requires_grad:
+                out.requires_grad_(True)
             return out
     size = target_meta_tensor.size()
     stride = target_meta_tensor.stride()
@@ -520,6 +533,18 @@ def gen_alias_from_base(aliased_base_tensor, target_meta_tensor, target_requires
         aliased_out.requires_grad_(True)
     return aliased_out
 
+def to_fun(t):
+    if isinstance(t, Tensor):
+        return torch._to_functional_tensor(t, mirror_autograd_meta=True)
+    else:
+        return t
+
+def from_fun(t):
+    if not isinstance(t, Tensor) or not torch._is_functional_tensor(t):
+        return t
+    torch._sync(t)
+    return torch._from_functional_tensor(t)
+
 
 # This is a version of functionalization that is specifically designed
 # for the AOTAutograd use case.
@@ -542,6 +567,8 @@ def gen_alias_from_base(aliased_base_tensor, target_meta_tensor, target_requires
 #   in the compiled backward function.
 def run_functionalized_fw_and_collect_metadata(
     f,
+    *,
+    keep_input_mutations: bool
 ) -> Tuple[ViewAndMutationMeta, List[Any]]:
     memo = {}
 
@@ -720,223 +747,421 @@ def inner(*flat_args):
             requires_grad_info=requires_grad_info,
             output_info=output_info,
             num_intermediate_bases=len(intermediate_bases),
+            keep_input_mutations=keep_input_mutations,
         )
         return metadata, pytree.tree_map(from_fun, f_tangents)
 
     return inner
 
 
-# This creates a functionalized joint forwards-backwards function given both
-# the primals (to run forwards) and tangents (to run backwards).
-#
-# It uses the metadata that was created earlier to figure out what all of the outputs to the autograd.Function.forward are:
-# (1) Which inputs received data mutations (and need to be passed as outputs into autograd.grad())
-# (2) Which outputs are aliases of inputs (and should *not* be passed as outputs into autograd.grad())
-def create_joint_forward_backward_functionalized(
-    fn,
-    *,
-    meta: ViewAndMutationMeta,
+def unpack_synthetic_bases(
+    primals: List[Any],
     synthetic_base_info: Optional[List[Union[int, Tuple[int, torch.Tensor]]]],
-):
-    # What's happening here? For any inputs in the graph that are mutated, we need to clone them first
-    # (and similarly for metadata-only mutations, we need to view them first).
-    # The idea is that when we trace the backward, we need to pass in the *original* primals
-    # to autograd.grad(), before they were mutated.
-    #
-    # NOTE: when we have synthetic base inputs, we need to clone them *before* creating views off of them.
-    # This means that "idx" here represents the index of the (potentially) synthetic base.
-    # What we need to do is:
-    # (1) map the current (post-synthetic-base calling convention) input argument index
-    #     to int index pre-synthetic-base-calling-convention.
-    # (2) There could be multiple, if this index corresponds to a synthetic base
-    #     that has multiple input aliases.
-    # (3) If any of those corresponding inputs get metadata mutations, then we clone the base.
-    def maybe_to_fresh_input(idx, t):
-        if not isinstance(t, Tensor):
-            return t
-
-        if synthetic_base_info is None:
-            outer_aliased_indices_of_current_base_arg = [idx]
+) -> List[Any]:
+    # This is only not None if our graph mutates a graph input that aliases another graph input.
+    if synthetic_base_info is None:
+        return primals
+
+    f_args_inner = []
+    for outer_idx_or_tuple in synthetic_base_info:
+        if isinstance(outer_idx_or_tuple, int):
+            f_args_inner.append(primals[outer_idx_or_tuple])
         else:
-            outer_aliased_indices_of_current_base_arg = [
-                # For every argument index in the outer calling convention (before synthetic bases)
-                # find its index in the inner calling convention.
-                # if it matches the index of our current arg (idx), track the outer argument's index (i)
-                i
-                for i, outer_idx_or_tuple in enumerate(synthetic_base_info)
-                if (isinstance(outer_idx_or_tuple, int) and outer_idx_or_tuple == idx)
-                or (
-                    isinstance(outer_idx_or_tuple, tuple)
-                    and outer_idx_or_tuple[0] == idx
-                )
+            outer_base_idx, view_tensor = outer_idx_or_tuple
+            outer_base = primals[outer_base_idx]
+            view_arg = gen_alias_from_base(
+                outer_base, view_tensor, view_tensor.requires_grad
+            )
+            f_args_inner.append(view_arg)
+    return f_args_inner
+
+# This class contains all the metadata we care about for the current function we're compiling.
+# This data is needed both at trace time and at runtime.
+@dataclass
+class CompiledRuntimeMetadata:
+    # This type / object should be cleaned up
+    # See Note [Synthetic Base Info Metadata]
+    synthetic_base_info: Optional[List[Union[int, Tuple[int, torch.Tensor]]]]
+    fw_metadata: ViewAndMutationMeta
+
+    def __post_init__(self):
+        self.num_outputs = len(self.fw_metadata.output_info)
+        self.num_outputs_non_aliased = len(
+            [x for x in self.fw_metadata.output_info if x.output_type == OutputType.non_alias]
+        )
+        self.num_outputs_aliased_to_inputs = len(
+            [
+                x
+                for x in self.fw_metadata.output_info
+                if x.output_type in [
+                    OutputType.alias_of_input,
+                    OutputType.is_input,
+                ]
             ]
-        if any(
-            meta.input_info[i].mutates_data
-            for i in outer_aliased_indices_of_current_base_arg
-        ):
-            # Make sure the primal we pass to autograd.grad()
-            # sees the tensor before the mutation
-            return t.clone()
-        if any(
-            meta.input_info[i].mutates_metadata and not meta.input_info[i].mutates_data
-            for i in outer_aliased_indices_of_current_base_arg
-        ):
-            # Make sure the primal we pass to autograd.grad()
-            # sees the tensor before the metadata mutation
-            return t.view(t.shape)
+        )
+        self.num_outputs_aliased_to_intermediates = len(
+            [
+                x
+                for x in self.fw_metadata.output_info
+                if x.output_type in [
+                    OutputType.alias_of_intermediate,
+                    OutputType.alias_of_intermediate_save_as_output,
+                    OutputType.alias_of_intermediate_base_is_user_output,
+                ]
+            ]
+        )
+        self.num_outputs_aliased = (
+            self.num_outputs_aliased_to_inputs + self.num_outputs_aliased_to_intermediates
+        )
+        self.num_mutated_data_inputs = len(
+            [x for x in self.fw_metadata.input_info if x.mutates_data]
+        )
+        self.num_mutated_metadata_inputs = len(
+            [
+                x
+                for x in self.fw_metadata.input_info
+                if x.mutates_metadata
+            ]
+        )
+        self.num_mutated_metadata_only_inputs = len(
+            [
+                x
+                for x in self.fw_metadata.input_info
+                if not x.mutates_data and x.mutates_metadata
+            ]
+        )
+        self.num_mutated_inputs = self.num_mutated_data_inputs + self.num_mutated_metadata_only_inputs
+
+# This function takes in a tensor t, and returns one of t, t.view(), or t.clone().
+# When tracing the joint forward + backward, for any inputs in the graph that are mutated,
+# we need to clone them first (and similarly for metadata-only mutations, we need to view them first).
+# The idea is that when we trace the backward, we need to pass in the *original* primals
+# to autograd.grad(), before they were mutated.
+# Note: when we have synthetic base inputs, we need to clone them *before* creating views off of them.
+# This means that "idx" here represents the index of the (potentially) synthetic base.
+# What we need to do is:
+# (1) map the current (post-synthetic-base calling convention) input argument index
+#     to int index pre-synthetic-base-calling-convention.
+# (2) There could be multiple, if this index corresponds to a synthetic base
+#     that has multiple input aliases.
+# (3) If any of those corresponding inputs get metadata mutations, then we clone the base.
+def maybe_to_fresh_input(idx, t, meta):
+    if not isinstance(t, Tensor):
         return t
 
-    def unpack_synthetic_bases(primals: List[Any]) -> List[Any]:
-        # This is only not None if our graph mutates a graph input that aliases another graph input.
-        if synthetic_base_info is None:
-            return primals
-
-        f_args_inner = []
-        for outer_idx_or_tuple in synthetic_base_info:
-            if isinstance(outer_idx_or_tuple, int):
-                f_args_inner.append(primals[outer_idx_or_tuple])
-            else:
-                outer_base_idx, view_tensor = outer_idx_or_tuple
-                outer_base = primals[outer_base_idx]
-                view_arg = gen_alias_from_base(
-                    outer_base, view_tensor, view_tensor.requires_grad
-                )
-                f_args_inner.append(view_arg)
-        return f_args_inner
-
-    def joint_forward_backward(
-        primals: List[Any], tangents: List[Any]
-    ) -> Tuple[List[Any], List[Any]]:
-        # Call the forward pass, making sure to clone any inputs that are mutated first.
-        # We need to ensure that the inputs we pass to autograd.grad() are the *original*
-        # inputs, and not their mutated values.
-        primals_no_input_mutations = [
-            maybe_to_fresh_input(i, t) for i, t in enumerate(primals)
-        ]
-        # This is also where we handle the calling convention around synthetic bases.
-        # We need to make sure that we convert any synthetic base arguments into views
-        # *after* we do the cloning above, to preserve the view relationship.
-        primals_ = unpack_synthetic_bases(primals_no_input_mutations)
-        assert len(meta.input_info) == len(primals_)
-        outs = fn(*primals_)
-
-        intermediate_bases = []
-        for o, info in zip(outs, meta.output_info):
-            if info.output_type == OutputType.alias_of_intermediate_save_as_output:
-                intermediate_bases.append(o._base)
-
-        assert len(meta.output_info) == len(outs)
-        assert meta.num_intermediate_bases == len(intermediate_bases)
-
-        # Pass any (non-aliased) outputs in as tangents, since they'll be returned as outputs in the fw
-        # For outputs that are aliases of intermediates, we will have returned the output's _base as an output in the graph instead,
-        # which we *should* send to grad()
-        outputs_for_grad = [
-            x
-            for (i, x) in enumerate(outs)
-            if meta.output_info[i].output_type == OutputType.non_alias
+    if meta.synthetic_base_info is None:
+        outer_aliased_indices_of_current_base_arg = [idx]
+    else:
+        outer_aliased_indices_of_current_base_arg = [
+            # For every argument index in the outer calling convention (before synthetic bases)
+            # find its index in the inner calling convention.
+            # if it matches the index of our current arg (idx), track the outer argument's index (i)
+            i
+            for i, outer_idx_or_tuple in enumerate(meta.synthetic_base_info)
+            if (isinstance(outer_idx_or_tuple, int) and outer_idx_or_tuple == idx)
+            or (
+                isinstance(outer_idx_or_tuple, tuple)
+                and outer_idx_or_tuple[0] == idx
+            )
         ]
-        # Pass any (non-aliased) mutated inputs in as tangents, since they'll be returned as outputs in the fw
-        # Important: the traced joint fw/bw will return updated inputs with data mutations,
-        # but *not* with metadata mutations.
-        # Instead, we shunt the updated metadata around externally
-        # and update the input's metadata outside of the autograd.Function
-        mutated_inputs_for_grad = [
+    if any(
+        meta.fw_metadata.input_info[i].mutates_data
+        for i in outer_aliased_indices_of_current_base_arg
+    ):
+        # Make sure the primal we pass to autograd.grad()
+        # sees the tensor before the mutation
+        return t.clone()
+    if any(
+        meta.fw_metadata.input_info[i].mutates_metadata and not meta.fw_metadata.input_info[i].mutates_data
+        for i in outer_aliased_indices_of_current_base_arg
+    ):
+        # Make sure the primal we pass to autograd.grad()
+        # sees the tensor before the metadata mutation
+        return t.view(t.shape)
+    return t
+
+# This function takes in a forward fn, runs it, and (optionally) runs autograd to compute the joint.
+# When maybe_tangents is None, we only run the forward. Otherwise we run the "joint" forward + backward.
+# Preconditions:
+# - fn corresponds to the flattened user fw function, with duplicate inputs removed
+# - functionalization is turned on (and inputs are wrapped in functional tensors)
+# - Synthetic bases have been *removed* (we've taken views on them corresponding to the user argument views).
+# - primals_after_cloning are what we run our forward function on. It is identical to primals_before_cloning,
+#   except that every input we know will be mutated in the forward has been cloned.
+#   We run our forward on primals_after_cloning (potentially mutating some inputs), and then compute our gradients
+#   w.r.t. primals_before_cloning (so we properly capture the mutation in our gradient computation).
+# Importantly, due functionalization + some autograd.Function constraints, this function can return EXTRA outputs
+# compared to what the original user forward returns.
+#
+# If we are only running the forward (and not computing the joint):
+# - Our function will return (updated_inputs, fw_outs)
+#
+# If we are running the forward + backward (computing the joint):
+# - Our function will return (updated_inputs, fw_outs, intermediate_bases), (gradients)
+#
+# Finally, if keep_input_mutations is set, then we will explicitly *not* return updated inputs, for any inputs
+# that experienced data-only mutations.
+# Instead, we are relying on the logic in create_forward_or_joint_functionalized to manually perform the input mutations,
+# keeping them directly in the traced graph.
+def forward_or_joint(
+    fn: Callable,
+    primals_before_cloning: List[Any],
+    primals_after_cloning: List[Any],
+    maybe_tangents: Optional[List[Any]],
+    meta: CompiledRuntimeMetadata,
+    keep_input_mutations: bool,
+) -> Any:
+    outs = fn(*primals_after_cloning)
+    assert len(meta.fw_metadata.output_info) == len(outs)
+
+    # The compiled fw will return mutated input tensors, *including* metadata-only mutation.
+    # However, if keep_input_mutations is set, the compiled fw only needs to return metadata-mutated inputs.
+    # (because data-only input mutations are handled directly in the compiled graph)
+    if keep_input_mutations:
+        mutated_inputs_to_return = [
             x
-            for (i, x) in enumerate(primals_)
-            if meta.input_info[i].mutates_data
+            for (i, x) in enumerate(primals_after_cloning)
+            if meta.fw_metadata.input_info[i].mutates_metadata
         ]
-        # The tensors that we include in the backward graph are:
-        # - inputs that recieve *data* mutations (not metadata-only; those are recomputed later)
-        # - outputs that are not aliased (aliased outputs are recomputed later)
-        # - intermediate ._base tensors of aliased outputs (we use those later to recompute the aliased outputs)
-        fw_outs_to_grad = mutated_inputs_for_grad + outputs_for_grad + intermediate_bases
-
-        # The compiled fw will return mutated input tensors, *including* metadata-only mutation.
+    else:
         mutated_inputs_to_return = [
             x
-            for (i, x) in enumerate(primals_)
-            if meta.input_info[i].mutates_data or meta.input_info[i].mutates_metadata
-        ]
-        # the compiled forward should return (mutated_inputs, user_outs, intermediate_bases)
-        fw_outs_to_return = *mutated_inputs_to_return, *outs, *intermediate_bases
-
-        # Take care to grab and sync the updated inputs from primals_ (the inputs we actually mutate!)
-        # and not primals (the preserved inputs, pre-mutation, that we pass to grad())
-        for i, arg in enumerate(primals_):
-            if not isinstance(arg, Tensor):
-                continue
-            torch._sync(arg)
-
-        # Get the inputs that need gradients
-        grad_primals = []
-        inputs_needs_grads = []
-        # Note that we're not using primals_ here, being carefully not to pass any mutated inputs into autograd.grad()
-        for p in primals:
-            is_grad_tensor = isinstance(p, Tensor) and p.requires_grad
-            inputs_needs_grads.append(is_grad_tensor)
-            if is_grad_tensor:
-                grad_primals.append(p)
-
-        # Get the outputs that need gradients
-        assert len(tangents) == len(fw_outs_to_grad)
-        needed_outs = []
-        needed_tangents = []
-        for out, tangent in zip(fw_outs_to_grad, tangents):
-            if isinstance(out, Tensor) and out.requires_grad:
-                # A bit sketchy, but fixes e.g. test_aot_autograd_exhaustive_matmul_cpu_float32
-                # The issue is that we are sensitive to decomps that don't accurately maintain
-                # their output's _base.shape compared to eager mode, and this helps mitigate a bit.
-                needed_outs.append(
-                    out if out.shape == tangent.shape else out.view(tangent.shape)
-                )
-                needed_tangents.append(tangent.requires_grad_(True))
-
-        setup_stacktrace_preservation_hooks([out.grad_fn for out in needed_outs])
-
-        backward_out = []
-        # Call the backwards pass
-        if grad_primals:
-            with fx_traceback.override_stack_trace():
-                backward_out = torch.autograd.grad(
-                    needed_outs,
-                    grad_primals,
-                    grad_outputs=needed_tangents,
-                    allow_unused=True,
-                )
-        backward_out_iter = iter(backward_out)
-        return fw_outs_to_return, [
-            next(backward_out_iter) if i else None for i in inputs_needs_grads
+            for (i, x) in enumerate(primals_after_cloning)
+            if meta.fw_metadata.input_info[i].mutates_data or meta.fw_metadata.input_info[i].mutates_metadata
         ]
 
-    def to_fun(t):
-        if isinstance(t, Tensor):
-            return torch._to_functional_tensor(t, mirror_autograd_meta=True)
-        else:
-            return t
-
-    def from_fun(t):
-        if not isinstance(t, Tensor) or not torch._is_functional_tensor(t):
-            return t
-        torch._sync(t)
-        return torch._from_functional_tensor(t)
+    # Case 1: We are just tracing the forward; not the joint forward + backward.
+    if maybe_tangents is None:
+        return *mutated_inputs_to_return, *outs
+    else:
+        tangents = maybe_tangents
+
+    # Case 2: We are tracing the joint forward backward.
+    # This also requires us to:
+    # - update the graph to return intermediate bases
+    # - Figure out what grad_outputs to pass into the backward
+    # - (this includes intermediate bases in the forward, and forward inputs that had data mutations)
+    # - actually call autograd.grad to trace the backward.
+    intermediate_bases = []
+    for o, info in zip(outs, meta.fw_metadata.output_info):
+        if info.output_type == OutputType.alias_of_intermediate_save_as_output:
+            intermediate_bases.append(o._base)
+
+    assert meta.fw_metadata.num_intermediate_bases == len(intermediate_bases)
+
+    # Pass any (non-aliased) outputs in as tangents, since they'll be returned as outputs in the fw
+    # For outputs that are aliases of intermediates, we will have returned the output's _base as an output in the graph instead,
+    # which we *should* send to grad()
+    outputs_for_grad = [
+        x
+        for (i, x) in enumerate(outs)
+        if meta.fw_metadata.output_info[i].output_type == OutputType.non_alias
+    ]
+    # Pass any (non-aliased) mutated inputs in as tangents, since they'll be returned as outputs in the fw
+    # Important: the traced joint fw/bw will return updated inputs with data mutations,
+    # but *not* with metadata mutations.
+    # Instead, we shunt the updated metadata around externally
+    # and update the input's metadata outside of the autograd.Function
+    mutated_inputs_for_grad = [
+        x
+        for (i, x) in enumerate(primals_after_cloning)
+        if meta.fw_metadata.input_info[i].mutates_data
+    ]
+    # The tensors that we include in the backward graph are:
+    # - inputs that recieve *data* mutations (not metadata-only; those are recomputed later)
+    # - outputs that are not aliased (aliased outputs are recomputed later)
+    # - intermediate ._base tensors of aliased outputs (we use those later to recompute the aliased outputs)
+    fw_outs_to_grad = mutated_inputs_for_grad + outputs_for_grad + intermediate_bases
+    assert len(tangents) == len(fw_outs_to_grad)
+
+    # the compiled forward should return (mutated_inputs, user_outs, intermediate_bases)
+    fw_outs_to_return = *mutated_inputs_to_return, *outs, *intermediate_bases
+
+    # Take care to grab and sync the updated inputs from primals_after_cloning (the inputs we actually mutate!)
+    # and not primals_before_cloning (the preserved inputs, pre-mutation, that we pass to grad())
+    for i, arg in enumerate(primals_after_cloning):
+        if not isinstance(arg, Tensor):
+            continue
+        torch._sync(arg)
+
+    # Get the inputs that need gradients
+    grad_primals = []
+    inputs_needs_grads = []
+    # Note that we're not using primals_before_cloning here,
+    # being carefully not to pass any mutated inputs into autograd.grad()
+    for p in primals_before_cloning:
+        is_grad_tensor = isinstance(p, Tensor) and p.requires_grad
+        inputs_needs_grads.append(is_grad_tensor)
+        if is_grad_tensor:
+            grad_primals.append(p)
+
+    # Get the outputs that need gradients
+    needed_outs = []
+    needed_tangents = []
+    for out, tangent in zip(fw_outs_to_grad, tangents):
+        if isinstance(out, Tensor) and out.requires_grad:
+            # A bit sketchy, but fixes e.g. test_aot_autograd_exhaustive_matmul_cpu_float32
+            # The issue is that we are sensitive to decomps that don't accurately maintain
+            # their output's _base.shape compared to eager mode, and this helps mitigate a bit.
+            needed_outs.append(
+                out if out.shape == tangent.shape else out.view(tangent.shape)
+            )
+            needed_tangents.append(tangent.requires_grad_(True))
+
+    setup_stacktrace_preservation_hooks([out.grad_fn for out in needed_outs])
+
+    backward_out = []
+    # Call the backwards pass
+    if grad_primals:
+        with fx_traceback.preserve_node_meta():
+            backward_out = torch.autograd.grad(
+                needed_outs,
+                grad_primals,
+                grad_outputs=needed_tangents,
+                allow_unused=True,
+            )
+    backward_out_iter = iter(backward_out)
+    return fw_outs_to_return, [
+        next(backward_out_iter) if i else None for i in inputs_needs_grads
+    ]
 
-    def functionalized_joint(
-        primals: List[Any], tangents: List[Any]
-    ) -> Tuple[List[Any], List[Any]]:
+# This function expands synthetic base arguments into the original aliased inputs that the user passed in.
+# Preconditions:
+# - fn corresponds to the flattened user fw function, with duplicate inputs removed
+# - functionalization is turned on (and inputs are wrapped in functional tensors)
+# - both primals args **include** synthetic bases.
+#   "primals_after_cloning" just corresponds to "primals_before_cloning", but with some inputs (optionally) cloned.
+#   "primals_before_cloning" is unused, and is only needed so we can pass the correct leaf tensors into autograd.
+def flat_fn_with_synthetic_bases_expanded(
+    fn: Callable,
+    primals_before_cloning: List[Any],
+    primals_after_cloning: List[Any],
+    maybe_tangents: Optional[List[Any]],
+    meta: CompiledRuntimeMetadata,
+    keep_input_mutations: bool
+):
+    # This is where we handle the calling convention around synthetic bases.
+    # We need to make sure that we convert any synthetic base arguments into views
+    # *after* we clone inputs for autograd (see below), to preserve the view relationship.
+    primals = unpack_synthetic_bases(primals_after_cloning, meta.synthetic_base_info)
+    assert len(meta.fw_metadata.input_info) == len(primals)
+    outs = forward_or_joint(fn, primals_before_cloning, primals, maybe_tangents, meta, keep_input_mutations)
+    return outs
+
+# This function adds extra clone() calls on any inputs in the forward that get mutated.
+# It *only* does this if we plan on performing autograd on fn.
+# The idea here is that when computing grdients w.r.t. inputs, we need to compute our gradients
+# w.r.t. the inputs *before* they were mutated!
+# Preconditions:
+# - fn corresponds to the flattened user fw function, with duplicate inputs removed
+# - primals **includes** synthetic bases. Importantly, if a synthetic base is mutated,
+#   we need to clone it *before* taking views off of it (if we clone the views they won't be views anymore)
+# - functionalization is turned on (and inputs are wrapped in functional tensors)
+def flat_fn_no_input_mutations(
+    fn: Callable,
+    primals: List[Any],
+    maybe_tangents: Optional[List[Any]],
+    meta: CompiledRuntimeMetadata,
+    keep_input_mutations: bool
+):
+    # When tracing the joint fwd + bwd, making sure to clone any inputs that are mutated first.
+    # We need to ensure that the inputs we pass to autograd.grad() are the *original*
+    # inputs, and not their mutated values.
+    if maybe_tangents is not None:
+        primals_after_cloning = [
+            maybe_to_fresh_input(i, t, meta) for i, t in enumerate(primals)
+        ]
+    else:
+        primals_after_cloning = primals
+    outs = flat_fn_with_synthetic_bases_expanded(fn, primals, primals_after_cloning, maybe_tangents, meta, keep_input_mutations)
+    return outs
+
+# This creates the final function that we want to trace using make_fx(),
+# in both aot_dispatch_autograd and aot_dispatch_base.
+# Preconditions:
+# - fn corresponds to the user's fw function
+# - fn arguments have been flattened, duplicate arguments have been handled
+# - In the returned function, the "primals" arguments *includes* synthetic bases.
+# This function does the work of functionalizing the input function,
+# and performing copy_() calls at the end of the function if `keep_input_mutations` is set.
+# The function returned has signature that is either:
+# (1) "traced_fn(primals: List[Any])" if trace_joint is False
+# (2) "traced_fn(primals: List[Any], tangents: List[Any])" if trace_joint is True
+def create_forward_or_joint_functionalized(
+    fn,
+    *,
+    meta: CompiledRuntimeMetadata,
+    trace_joint: bool,
+    keep_input_mutations: bool
+):
 
+    def functionalized_f_helper(primals, maybe_tangents=None):
+        # Convention: this function is used to trace both the joint, and just the forward (for inference).
+        # When trace_joint is set, tangents should be passed in.
+        assert (maybe_tangents is not None) == trace_joint
         # Wrap inputs into functional wrappers
-        f_primals, f_tangents = pytree.tree_map(to_fun, (primals, tangents))
+        f_primals = pytree.tree_map(to_fun, primals)
+        f_tangents = None if maybe_tangents is None else pytree.tree_map(to_fun, maybe_tangents)
         torch._enable_functionalization(reapply_views=True)
         try:
             # Run the joint
-            f_outs = joint_forward_backward(f_primals, f_tangents)
+            f_outs = flat_fn_no_input_mutations(fn, f_primals, f_tangents, meta, keep_input_mutations)
         finally:
             torch._disable_functionalization()
 
+        if keep_input_mutations:
+            # Note: This is a bit annoying. There's a layering issue here, where:
+            # (1) functionalization needs to operate on **synthetic base** inputs, before unpacking them into the "real" inputs.
+            # (2) For keep_input_mutations, we support tracing a call to copy_() directly on mutated inputs.
+            #     However, we **only** want to support this for inputs that have data-only (and no metadata) mutations,
+            #     because inductor (and backends in generally) would prefer not to see these (e.g. as_strided_(), resize_()).
+            #     This makes it pretty difficult for this logic to operate on synthetic bases.
+            # (3) In addition, there are cases where it's significantly cheaper to perform the copy on the individual
+            #     (unpacked) input aliases, instead of the synthetic base.
+            # The result is that ideally this function shouldn't have to worry about synthetic bases
+            # (unpacking them happens underneath this function),
+            # but we actually do need to unpack the synthetic bases when performing the copy_'s to keep input mutations around.
+            # Example case where this could be important:
+            #
+            #     def f(x, y):
+            #         x.mul_(2)
+            #         y.mul_(3)
+            #         return x, y
+            #    a = torch.ones(1'000'000)
+            #    x, y = out(a[0:9], a[1:10])
+            #
+            # It would be much better to add copy_() calls into the graph for the two tiny slices, instead of materializing
+            # a giant "updated synthetic base" and copying into a's entire storage.
+            primals_unpacked = unpack_synthetic_bases(primals, meta.synthetic_base_info)
+            f_primals_unpacked = unpack_synthetic_bases(f_primals, meta.synthetic_base_info)
+            assert len(meta.fw_metadata.input_info) == len(f_primals_unpacked)
+            for i, (inpt_old, inpt_f) in enumerate(zip(primals_unpacked, f_primals_unpacked)):
+                if not isinstance(inpt_f, torch.Tensor):
+                    continue
+                torch._sync(inpt_f)
+                inpt_new = torch._from_functional_tensor(inpt_f)
+                if meta.fw_metadata.input_info[i].mutates_data and not meta.fw_metadata.input_info[i].mutates_metadata:
+                    # We found an input that had a (data-only) mutation.
+                    # Since keep_input_mutations is set, we need to faithfully apply a copy_()
+                    # so the compiler will see the input mutation in the graph.
+                    assert inpt_new is not inpt_old
+                    assert has_same_metadata(inpt_new, inpt_old)
+                    inpt_old.copy_(inpt_new)
+
         return pytree.tree_map(from_fun, f_outs)
 
-    return functionalized_joint
+    # the joint needs have args named "primals" and "tangents",
+    # which are hardcoded into the partitioning logic.
+    def traced_joint(primals, tangents):
+        return functionalized_f_helper(primals, tangents)
+
+    def traced_forward(*primals):
+        return functionalized_f_helper(primals)
+
+    if trace_joint:
+        return traced_joint
+    else:
+        return traced_forward
 
 
 def normalize_as_list(x):
@@ -1024,7 +1249,7 @@ def call_func_with_args(f, args, steal_args=False, disable_amp=False):
             # TODO: Please remove soon
             # https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670
             warnings.warn(
-                "Your compiler for AOTAutograd is returning a a function that doesn't take boxed arguments. "
+                "Your compiler for AOTAutograd is returning a function that doesn't take boxed arguments. "
                 "Please wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. "
                 "See https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale."
             )
@@ -1034,7 +1259,6 @@ def call_func_with_args(f, args, steal_args=False, disable_amp=False):
             del guard
     return out
 
-
 @dataclasses.dataclass
 class AOTConfig:
     """
@@ -1047,10 +1271,50 @@ class AOTConfig:
     decompositions: Dict[Callable, Callable]
     num_params_buffers: int
     aot_id: int
-
+    keep_inference_input_mutations: bool
 
 def aot_dispatch_base(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig):
-    fw_module = make_fx(flat_fn, aot_config.decompositions)(*flat_args)
+    with enable_python_dispatcher():
+        _fw_metadata, _out = run_functionalized_fw_and_collect_metadata(
+            flat_fn,
+            keep_input_mutations=aot_config.keep_inference_input_mutations,
+        )(
+            *flat_args
+        )
+
+    _input_info = _fw_metadata.input_info
+
+    flat_args_with_views_handled, _synthetic_base_info = merge_view_inputs(
+        flat_args, _input_info, is_inference=True
+    )
+    metadata_ = CompiledRuntimeMetadata(
+        synthetic_base_info=_synthetic_base_info,
+        fw_metadata=_fw_metadata,
+    )
+    # aot_dispatch_base requires functionalization, but doesn't need to handle as many cases as the autograd case.
+    # The cases that aot_dispatch_base doesn't need to handle include:
+    # - outputs that are aliases of graph intermediates
+    # - outputs that are aliases of graph inputs
+    # While cases that it does need to handle include:
+    # - input mutations (including when inputs are aliases of each other)
+    # - input metadata mutations
+    trace_fn = create_forward_or_joint_functionalized(
+        flat_fn,
+        meta=metadata_,
+        trace_joint=False,
+        keep_input_mutations=aot_config.keep_inference_input_mutations
+    )
+
+    with enable_python_dispatcher():
+        fw_module = make_fx(trace_fn, aot_config.decompositions)(*flat_args_with_views_handled)
+
+    if not aot_config.keep_inference_input_mutations:
+        # As long as we opted to remove input mutations, then
+        # there should be *NO* mutating ops in the graph at this point.
+        assert_functional_graph(fw_module.graph)
+        fw_module.graph.eliminate_dead_code()
+        fw_module.recompile()
+
     if config.debug_graphs:
         log.debug(f"====== Forward (only) graph {aot_config.aot_id} ======")
         log.debug(fw_module.print_readable(print_output=False))
@@ -1059,16 +1323,16 @@ def aot_dispatch_base(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig):
     context = disable_autocast_manager if disable_amp else nullcontext
 
     with context(), track_graph_compiling(aot_config, "inference"):
-        compiled_fw = aot_config.fw_compiler(fw_module, flat_args)
+        compiled_fw = aot_config.fw_compiler(fw_module, flat_args_with_views_handled)
 
-    @wraps(compiled_fw)
-    def new_fn(args):
-        fw_outs = call_func_with_args(compiled_fw, args, disable_amp=disable_amp)
-        return fw_outs
-
-    new_fn._boxed_call = True
+    compiled_fn = create_runtime_wrapper(
+        compiled_fw,
+        runtime_metadata=metadata_,
+        trace_joint=False,
+        keep_input_mutations=aot_config.keep_inference_input_mutations
+    )
 
-    return new_fn
+    return compiled_fn
 
 
 def assert_functional_graph(fx_g: torch.fx.Graph):
@@ -1175,7 +1439,10 @@ def same_dtype_views(view1, view2):
 #   c_base = torch.Tensor(c.storage())
 #   f(c_base, b_base, a, d)
 def merge_view_inputs(
-    fwd_inputs: List[Any], mutated_input_info: List[InputAliasInfo]
+    fwd_inputs: List[Any], mutated_input_info: List[InputAliasInfo],
+    *,
+    # The autograd case currently has more restrictions than the inference case.
+    is_inference: bool,
 ) -> Tuple[List[Any], Optional[List[Union[int, Tuple[int, torch.Tensor]]]]]:
     assert len(fwd_inputs) == len(mutated_input_info)
     storage_ref_to_idx: Dict[StorageWeakRef, List[int]] = collections.defaultdict(list)
@@ -1187,6 +1454,7 @@ def merge_view_inputs(
             storage_ref_to_idx[storage_ref].append(i)
         else:
             other_args.append(inpt)
+    # Note [Synthetic Base Info Metadata]
     # This list contains metadata that tells you what the i'th argument in the inner calling convention should be.
     # It's either:
     # - another int (corresponding to the index in the argument list of the element from the outer calling convention)
@@ -1213,9 +1481,10 @@ def merge_view_inputs(
             view2 = fwd_inputs[idx2]
             # The "inputs that are aliased but have different differentiable bases" case
             # is more complicated and hopefully pretty rare. Not currently handled.
-            assert are_differentiable_views(
-                view1, view2
-            ), "aot_autograd() does not yet handle non-differentiable view input mutations."
+            if not is_inference:
+                assert are_differentiable_views(
+                    view1, view2
+                ), "aot_autograd() does not yet handle non-differentiable view input mutations."
             # Regenerating views when reinterpreting complex / real tensors seems non-trivial,
             # not handling for now
             assert same_dtype_views(
@@ -1232,8 +1501,35 @@ def merge_view_inputs(
         if len(non_none_bases) == 0:
             # Case where none of the aliases have a ._base
             # we generate a synthetic base without gradients, and generate views off of it
+            # We hit this case when we have input tensors to the graph that share a storage,
+            # but do not have a ._base field.
+            # Wondering when we hit this case?
+            # The _base field simply says that autograd knows about the aliasing relationship,
+            # but sometimes we create tensors which are aliased out of the same storage but guaranteed
+            # to be disjoint. In these cases, we will skip setting up the _base relationship
+            # for performance reasons (because the fact that the tensors share the same storage
+            # is unobservable unless you (1) do naughty things with resize_/as_strided
+            # or (2) look at the storage--as we are doing here.)
+            # One particular example of this is optimizer steps on the LSTM module:
+            # LSTM parameters are packed into a contiguous storage for efficiency reasons when
+            # calling cuDNN kernels, so when these parameters get passed to the optimizer we will
+            # find they share the same storage, but do not have _base set since they are all disjoint.
+            #
+            # NOTE: There is one case where this is unsafe:
+            # torch.Tensor(storage) will ALWAYS create a 1D tensor, which is not necessarily
+            # the same shape as the "actual" base that the tensor came from.
+            # For the most part this is fine, because we always use as_strided()
+            # to generate the original aliased inputs again.
+            # If we were to use view-replay though, this could cause the aliased views
+            # to have incorrect sizes.
             example_idx = aliased_input_indices[0]
-            synthetic_base = torch.Tensor(fwd_inputs[example_idx].untyped_storage())
+            example_alias = fwd_inputs[example_idx]
+            # Note that this function is re-used at both trace time and rutnime.
+            # At trace time, we're under a FakeMode so synthetic_base becomes a FakeTensor.
+            synthetic_base = torch.empty((0,), dtype=example_alias.dtype, device=example_alias.device)
+            # We don't actually have a convenient way of going from storage -> tensor,
+            # So using set_() here (we suffer some minor overhead, but this case is rare).
+            synthetic_base.set_(example_alias.untyped_storage())
         else:
             # Case where all of the aliases require gradients, and have the same _base.
             synthetic_base = non_none_bases[0]
@@ -1375,7 +1671,12 @@ def aot_wrapper_dedupe(
     # or not
     try:
         with enable_python_dispatcher():
-            fw_metadata, _out = run_functionalized_fw_and_collect_metadata(flat_fn)(
+            fw_metadata, _out = run_functionalized_fw_and_collect_metadata(
+                flat_fn,
+                # For the purpose of checking for dupes that are mutated,
+                # we always want our metadata to correctly reflect input mutations
+                keep_input_mutations=False,
+            )(
                 *flat_args
             )
     except RuntimeError as e:
@@ -1399,7 +1700,9 @@ def aot_wrapper_dedupe(
         ok = True
 
         for i, a in enumerate(flat_args):
-            if a not in args_set:
+            if not isinstance(a, torch.Tensor):
+                leaf_flat_args.append(a)
+            elif a not in args_set:
                 args_set.add(a)
                 leaf_flat_args.append(a)
             elif not fw_metadata.input_info[i].mutates_data and not fw_metadata.input_info[i].mutates_metadata:
@@ -1482,9 +1785,17 @@ def add_dupe_args(args):
         # kept_pos:[dupe_arg_pos], however, add_dupe_map is 1:1 so we would need a new structure there,
         # which feels like needless complexity for a tiny bit of efficiency at this point.
         for dupe_arg_pos, kept_pos in add_dupe_map.items():
-            # Edge case, only happens for identity
-            if dupe_arg_pos != kept_pos:
-                tracing_context.guards_context.aotautograd_guards.append(DuplicateInputs(kept_pos, dupe_arg_pos))
+            dupe_arg_dict = flat_args[dupe_arg_pos].__dict__
+            kept_arg_dict = flat_args[kept_pos].__dict__
+            if 'graph_arg_pos' in dupe_arg_dict and 'graph_arg_pos' in kept_arg_dict:
+                d_positions = dupe_arg_dict['graph_arg_pos']
+                k_positions = kept_arg_dict['graph_arg_pos']
+                assert(d_positions == k_positions)
+                if len(d_positions) > 1:
+                    for i in range(1, len(d_positions)):
+                        pos = d_positions[i]
+                        pre_pos = d_positions[i - 1]
+                        tracing_context.guards_context.aotautograd_guards.append(DuplicateInputs(pre_pos, pos))
 
     @wraps(flat_fn)
     def wrapped_flat_fn(*args):
@@ -1546,6 +1857,176 @@ def describe_input(i, aot_config):
     else:
         return f"input {i - aot_config.num_params_buffers}"
 
+# The wrapper created by this function handles all of the runtime aliasing and mutation "epilogue" logic
+# that needs to run after the compiled function.
+#
+# This function accepts a trace_joint flag, indicating whether or not we're generating the runtime
+# epilogue for a forward-only inference graph, or for an autograd.Function.apply function.
+# This is because there are some minor differences in how we treat these cases at runtime:
+# - resize_() is currently handled in the inference case, but not fully handled in the autograd case.
+# - the autograd cases inserts TensorAlias wrapper objects for outputs that alias inputs
+def create_runtime_wrapper(
+    compiled_fn,
+    *,
+    runtime_metadata: CompiledRuntimeMetadata,
+    trace_joint: bool,
+    keep_input_mutations: bool,
+):
+    if not hasattr(compiled_fn, "_boxed_call"):
+        compiled_fn = make_boxed_func(compiled_fn)
+
+    def runtime_wrapper(*args):
+        # Step 2: remove aliased inputs that are mutated, replace with synthetic bases
+        # Only happens if our graph mutates an input that aliases another input.
+        if runtime_metadata.synthetic_base_info is not None:
+            # Given: the original args, including at least one pair of inputs that are aliased
+            # and get subsequently mutated.
+            # Generate: the updated args, including (potentially multiple) synthetic bases
+            # that replace the views. The input views are regenerated manually in the compiled function.
+            # TODO: think harder about what happens if (a view of) one of these mutated input views is ALSO returned
+            new_inputs, metadata = merge_view_inputs(
+                args, runtime_metadata.fw_metadata.input_info, is_inference=not trace_joint,
+            )
+            # We're just re-running the original-args-to-synthetic-base transformation
+            # that we ran during compilation.
+            # This returns metadata that we use during tracing to recover the input views,
+            # which we don't actually need at runtime.
+            assert metadata is not None
+            args_with_synthetic_bases = new_inputs
+        else:
+            args_with_synthetic_bases = args
+
+        with torch.autograd._force_original_view_tracking(True):
+            all_outs = call_func_with_args(
+                compiled_fn,
+                args_with_synthetic_bases,
+                disable_amp=True,
+            )
+
+        num_mutated_inps = runtime_metadata.num_mutated_inputs
+        num_metadata_mutated_inps = runtime_metadata.num_mutated_metadata_inputs
+        num_intermediate_bases = runtime_metadata.fw_metadata.num_intermediate_bases
+
+        if keep_input_mutations:
+            assert (
+                len(all_outs)
+                == num_metadata_mutated_inps + runtime_metadata.num_outputs + num_intermediate_bases
+            )
+            assert (
+                len(runtime_metadata.fw_metadata.mutated_inp_indices) == num_metadata_mutated_inps
+            )
+        else:
+            assert (
+                len(all_outs)
+                == num_mutated_inps + runtime_metadata.num_outputs + num_intermediate_bases
+            )
+            assert (
+                len(runtime_metadata.fw_metadata.mutated_inp_indices) == num_mutated_inps
+            )
+        # Step 3: After running the compiled fw, apply updates to mutated inputs
+        num_mutations_to_apply = len(runtime_metadata.fw_metadata.mutated_inp_indices)
+        if num_mutations_to_apply > 0:
+            updated_inputs = all_outs[: num_mutations_to_apply]
+            fw_outs = all_outs[num_mutations_to_apply :]
+
+            for i, inpt_idx in enumerate(
+                runtime_metadata.fw_metadata.mutated_inp_indices
+            ):
+                meta = runtime_metadata.fw_metadata.input_info[inpt_idx]
+                if not meta.mutates_data and not meta.mutates_metadata:
+                    continue
+                original_inpt = args[inpt_idx]
+                updated_inpt = updated_inputs[i]
+                # TODO: add better resize_() support for autograd case.
+                # Check for the case when an input has been resized.
+                # Note: One important thing to check for is user code that calls inpt.storage().resize_().
+                # We can't trace operations on storage into the graph, so we should get dynamo to graph break.
+                # TODO: handle resize_() on inputs to a larger size.
+                # This is actually non-trivial to detect, so we should probably just handle it
+                # (or make dynamo detect).
+                # We can't just check of original_inpt.storage_size != updated_inpt.storage_size,
+                # Because the original_inpt might be a view of some larger tensor,
+                # and updated_inpt is always densely packed.
+                if not trace_joint and original_inpt.storage().size() != updated_inpt.storage().size():
+                    original_inpt.resize_(updated_inpt.size())
+                if meta.mutates_metadata and not meta.mutates_data:
+                    if trace_joint:
+                        assert isinstance(updated_inpt, TensorAlias)
+                        updated_inpt = updated_inpt.alias
+                    # We need to grab the size/stride/storage_offset from the compiled forward,
+                    # and use that to mutate the metadata of the input
+                    original_inpt.as_strided_(
+                        updated_inpt.size(),
+                        updated_inpt.stride(),
+                        updated_inpt.storage_offset(),
+                    )
+                else:
+                    if meta.mutates_data and meta.mutates_metadata:
+                        original_inpt.as_strided_(
+                            updated_inpt.size(),
+                            updated_inpt.stride(),
+                            updated_inpt.storage_offset(),
+                        )
+                    else:
+                        assert meta.mutates_data
+                    original_inpt.copy_(updated_inpt)
+        else:
+            fw_outs = all_outs
+
+        # Step 4: Manually regenerate any outputs that are aliased to inputs, instead of
+        # compiling them.
+        if runtime_metadata.num_outputs_aliased > 0:
+            # The compiled forward also returned intermediate bases. We don't want to return them to the user.
+            if runtime_metadata.fw_metadata.num_intermediate_bases > 0:
+                fw_outs_no_intermediate_bases = fw_outs[
+                    : -runtime_metadata.fw_metadata.num_intermediate_bases
+                ]
+                intermediate_bases = fw_outs[-runtime_metadata.fw_metadata.num_intermediate_bases:]
+            else:
+                fw_outs_no_intermediate_bases = fw_outs
+                intermediate_bases = []
+            assert len(fw_outs_no_intermediate_bases) == len(runtime_metadata.fw_metadata.output_info)
+
+            fw_outs_including_aliases = []
+            for i, (o, info) in enumerate(zip(
+                fw_outs_no_intermediate_bases, runtime_metadata.fw_metadata.output_info
+            )):
+                if info.output_type == OutputType.non_alias:
+                    fw_outs_including_aliases.append(o)
+                    continue
+                if trace_joint:
+                    assert isinstance(o, TensorAlias)
+                    o_ = o.alias
+                else:
+                    o_ = o
+                o_grad = runtime_metadata.fw_metadata.requires_grad_info[runtime_metadata.num_mutated_inputs + i]
+                if info.output_type == OutputType.alias_of_input:
+                    aliased_base_tensor = args[info.base_idx]
+                    regenerated_out = gen_alias_from_base(aliased_base_tensor, o_, o_grad)
+                    fw_outs_including_aliases.append(regenerated_out)
+                    continue
+                elif info.output_type == OutputType.is_input:
+                    aliased_base_tensor = args[info.base_idx]
+                    regenerated_out = aliased_base_tensor
+                    fw_outs_including_aliases.append(regenerated_out)
+                    continue
+                elif info.output_type == OutputType.alias_of_intermediate:
+                    base_tensor_list = intermediate_bases
+                elif info.output_type == OutputType.alias_of_intermediate_save_as_output:
+                    base_tensor_list = intermediate_bases
+                else:
+                    assert info.output_type == OutputType.alias_of_intermediate_base_is_user_output
+                    base_tensor_list = fw_outs_no_intermediate_bases
+                aliased_base_tensor = base_tensor_list[info.base_idx]
+                # TODO: handle the custom autograd function case here.
+                # We need a way to check whether a tensor came from a custom autograd fn from python,
+                # AND a way to replay that custom view fn.
+                regenerated_out = gen_alias_from_base(aliased_base_tensor, o_, o_grad)
+                fw_outs_including_aliases.append(regenerated_out)
+            return fw_outs_including_aliases
+        else:
+            return fw_outs
+    return runtime_wrapper
 
 # Has the precondition that there
 # are no duplicate arguments in flat_args (e.g., the same Tensor
@@ -1554,53 +2035,15 @@ def describe_input(i, aot_config):
 def aot_dispatch_autograd(flat_fn, flat_args: List[Any], aot_config: AOTConfig):
 
     with enable_python_dispatcher():
-        _fw_metadata, out = run_functionalized_fw_and_collect_metadata(flat_fn)(
+        _fw_metadata, out = run_functionalized_fw_and_collect_metadata(
+            flat_fn,
+            # Note: in the non-inference path, we are currently not passing input mutations into the graph directly.
+            # This is mainly difficult due to the partitioner, but we are leaving (a bit of) perf on the table.
+            keep_input_mutations=False,
+        )(
             *flat_args
         )
 
-    # pre-compute, so we can bail out quickly in the hotpath
-    _num_outputs = len(_fw_metadata.output_info)
-    _num_outputs_non_aliased = len(
-        [x for x in _fw_metadata.output_info if x.output_type == OutputType.non_alias]
-    )
-    _num_outputs_aliased_to_inputs = len(
-        [
-            x
-            for x in _fw_metadata.output_info
-            if x.output_type in [
-                OutputType.alias_of_input,
-                OutputType.is_input,
-            ]
-        ]
-    )
-    _num_outputs_aliased_to_intermediates = len(
-        [
-            x
-            for x in _fw_metadata.output_info
-            if x.output_type in [
-                OutputType.alias_of_intermediate,
-                OutputType.alias_of_intermediate_save_as_output,
-                OutputType.alias_of_intermediate_base_is_user_output,
-            ]
-        ]
-    )
-    _num_outputs_aliased = (
-        _num_outputs_aliased_to_inputs + _num_outputs_aliased_to_intermediates
-    )
-
-    _num_mutated_data_inputs = len(
-        [x for x in _fw_metadata.input_info if x.mutates_data]
-    )
-    _num_mutated_metadata_only_inputs = len(
-        [
-            x
-            for x in _fw_metadata.input_info
-            if not x.mutates_data and x.mutates_metadata
-        ]
-    )
-    _num_mutated_inputs = _num_mutated_data_inputs + _num_mutated_metadata_only_inputs
-
-    assert len(_fw_metadata.requires_grad_info) == _num_mutated_inputs + _num_outputs
 
     # out here corresponds to the set of outputs in the traced forward that should get grad_outputs in the traced backward.
     # It includes outputs of the original forward, *and* any updated inputs due to input mutations.
@@ -1616,13 +2059,24 @@ def aot_dispatch_autograd(flat_fn, flat_args: List[Any], aot_config: AOTConfig):
     # When that happens, we replace the aliased inputs with a synthetic base, and in the traced forward
     # we later generate the input views
     flat_args_with_views_handled, _synthetic_base_info = merge_view_inputs(
-        flat_args, _fw_metadata.input_info
+        flat_args, _fw_metadata.input_info, is_inference=False,
     )
 
-    joint_forward_backward = create_joint_forward_backward_functionalized(
-        flat_fn,
-        meta=_fw_metadata,
+    # pre-compute, so we can bail out quickly in the hotpath
+    metadata_ = CompiledRuntimeMetadata(
         synthetic_base_info=_synthetic_base_info,
+        fw_metadata=_fw_metadata,
+    )
+
+    assert len(_fw_metadata.requires_grad_info) == metadata_.num_mutated_inputs + metadata_.num_outputs
+
+    joint_forward_backward = create_forward_or_joint_functionalized(
+        flat_fn,
+        meta=metadata_,
+        trace_joint=True,
+        # For now in the autograd case, we NEVER keep input mutations (we could eventually fix this for slightly better perf
+        # in some cases, but it's annoying to fix the partitioner)
+        keep_input_mutations=False,
     )
 
     joint_inputs = (flat_args_with_views_handled, out)
@@ -1658,7 +2112,7 @@ def aot_dispatch_autograd(flat_fn, flat_args: List[Any], aot_config: AOTConfig):
 
     with torch.no_grad():
         with track_graph_compiling(aot_config, "joint"):
-            num_inner_fwd_outputs = _num_mutated_inputs + _num_outputs + _fw_metadata.num_intermediate_bases
+            num_inner_fwd_outputs = metadata_.num_mutated_inputs + metadata_.num_outputs + _fw_metadata.num_intermediate_bases
             fw_module, bw_module = aot_config.partition_fn(
                 fx_g, joint_inputs, num_fwd_outputs=num_inner_fwd_outputs
             )
@@ -1685,16 +2139,8 @@ def aot_dispatch_autograd(flat_fn, flat_args: List[Any], aot_config: AOTConfig):
     class CompiledFunction(torch.autograd.Function):
         compiled_fw = compiled_fw_func
         compiled_bw = None
-        num_outputs = _num_outputs
-        num_outputs_aliased_to_inputs = _num_outputs_aliased_to_inputs
-        num_outputs_aliased_to_intermediates = _num_outputs_aliased_to_intermediates
-        num_outputs_aliased = _num_outputs_aliased
+        metadata = metadata_
         num_symints_saved_for_bw = _num_symints_saved_for_bw
-        num_mutated_inputs = _num_mutated_inputs
-        num_mutated_data_inputs = _num_mutated_data_inputs
-        num_mutated_metadata_only_inputs = _num_mutated_metadata_only_inputs
-        synthetic_base_info = _synthetic_base_info
-        fw_metadata = _fw_metadata
 
         @staticmethod
         def forward(ctx, *deduped_flat_tensor_args):
@@ -1710,25 +2156,25 @@ def forward(ctx, *deduped_flat_tensor_args):
                 disable_amp=disable_amp,
             )
 
-            num_outputs = CompiledFunction.num_outputs
+            num_outputs = CompiledFunction.metadata.num_outputs
             num_outputs_aliased_to_inputs = (
-                CompiledFunction.num_outputs_aliased_to_inputs
+                CompiledFunction.metadata.num_outputs_aliased_to_inputs
             )
             num_outputs_aliased_to_intermediates = (
-                CompiledFunction.num_outputs_aliased_to_intermediates
+                CompiledFunction.metadata.num_outputs_aliased_to_intermediates
             )
-            num_outputs_aliased = CompiledFunction.num_outputs_aliased
-            num_intermediate_bases = CompiledFunction.fw_metadata.num_intermediate_bases
+            num_outputs_aliased = CompiledFunction.metadata.num_outputs_aliased
+            num_intermediate_bases = CompiledFunction.metadata.fw_metadata.num_intermediate_bases
             num_symints_saved_for_bw = CompiledFunction.num_symints_saved_for_bw
-            num_mutated_inputs = CompiledFunction.num_mutated_inputs
+            num_mutated_inputs = CompiledFunction.metadata.num_mutated_inputs
             num_mutated_metadata_only_inputs = (
-                CompiledFunction.num_mutated_metadata_only_inputs
+                CompiledFunction.metadata.num_mutated_metadata_only_inputs
             )
             # Our forward() returns both (mutated_inputs, outputs, output_intermediate_bases, saved_tensors, saved_symints)
             num_forward_returns = num_mutated_inputs + num_outputs + num_intermediate_bases
 
             assert num_forward_returns == len(
-                CompiledFunction.fw_metadata.requires_grad_info
+                CompiledFunction.metadata.fw_metadata.requires_grad_info
             ) + num_intermediate_bases
 
             # Partitioners must put symint arguments at the end separate from tensor arguments
@@ -1739,7 +2185,8 @@ def forward(ctx, *deduped_flat_tensor_args):
                 assert all(
                     [isinstance(x, torch.Tensor) for x in tensors_saved_for_backwards]
                 )
-                ctx.save_for_backward(*tensors_saved_for_backwards)
+                # See Note [Detaching saved tensors in AOTAutograd]
+                ctx.save_for_backward(*map(lambda x: x.detach() if x._is_view() else x, tensors_saved_for_backwards))
                 symint_outs = fw_outs[-num_symints_saved_for_bw:]
                 assert all(
                     [
@@ -1749,7 +2196,9 @@ def forward(ctx, *deduped_flat_tensor_args):
                 )
                 ctx.symints = symint_outs
             else:
-                ctx.save_for_backward(*fw_outs[num_forward_returns:])
+                tensors_saved_for_backwards = fw_outs[num_forward_returns:]
+                # See Note [Detaching saved tensors in AOTAutograd]
+                ctx.save_for_backward(*map(lambda x: x.detach() if x._is_view() else x, tensors_saved_for_backwards))
                 ctx.symints = []
 
             raw_returns = fw_outs[0:num_forward_returns]
@@ -1758,21 +2207,23 @@ def forward(ctx, *deduped_flat_tensor_args):
             # so that autograd.Function doesn't treat them as tensors
             if num_mutated_metadata_only_inputs > 0:
                 for i, idx in enumerate(
-                    CompiledFunction.fw_metadata.mutated_inp_indices
+                    CompiledFunction.metadata.fw_metadata.mutated_inp_indices
                 ):
                     # We could make this faster by only looping over inputs with metadata-only mutations
                     # (instead of looping over inputs with either data or metadata mutations), but there shouldn't be many.
-                    info = CompiledFunction.fw_metadata.input_info[idx]
+                    info = CompiledFunction.metadata.fw_metadata.input_info[idx]
                     if info.mutates_metadata and not info.mutates_data:
                         raw_returns[i] = TensorAlias(raw_returns[i])
 
                 if config.debug_assert:
                     user_mutated_inputs_raw = raw_returns[0:num_mutated_inputs]
-                    mut_inp_infos = [x for x in CompiledFunction.fw_metadata.input_info if x.mutates_data or x.mutates_metadata]
+                    mut_inp_infos = [
+                        x for x in CompiledFunction.metadata.fw_metadata.input_info if x.mutates_data or x.mutates_metadata
+                    ]
                     assert len(user_mutated_inputs_raw) == len(mut_inp_infos)
 
             if num_outputs_aliased > 0:
-                for idx in CompiledFunction.fw_metadata.aliased_out_indices:
+                for idx in CompiledFunction.metadata.fw_metadata.aliased_out_indices:
                     raw_return_idx = num_mutated_inputs + idx
                     raw_returns[raw_return_idx] = TensorAlias(raw_returns[raw_return_idx])
 
@@ -1787,7 +2238,7 @@ def forward(ctx, *deduped_flat_tensor_args):
                 x
                 for (i, x) in enumerate(raw_returns_not_including_intermediate_bases)
                 if isinstance(x, torch.Tensor)
-                and not CompiledFunction.fw_metadata.requires_grad_info[i]
+                and not CompiledFunction.metadata.fw_metadata.requires_grad_info[i]
             ]
             ctx.mark_non_differentiable(*fw_outs_not_requiring_grad)
 
@@ -1804,27 +2255,27 @@ def backward(ctx, *flat_args):
             # - updated inputs due to metadata-only mutations.
             # We need to return them in the forward, but ensure that they all do not get gradients in the backward,
             # and we filter them out here before passing the remaining grad_outputs into the compiled backward.
-            num_mutated_inps = CompiledFunction.num_mutated_inputs
-            num_intermediate_bases = CompiledFunction.fw_metadata.num_intermediate_bases
+            num_mutated_inps = CompiledFunction.metadata.num_mutated_inputs
+            num_intermediate_bases = CompiledFunction.metadata.fw_metadata.num_intermediate_bases
             expected_grad_outs = (
-                CompiledFunction.num_outputs + num_mutated_inps + num_intermediate_bases
+                CompiledFunction.metadata.num_outputs + num_mutated_inps + num_intermediate_bases
             )
 
             assert len(flat_args) == expected_grad_outs
             if (
-                CompiledFunction.num_mutated_metadata_only_inputs > 0
-                or CompiledFunction.num_outputs_aliased > 0
+                CompiledFunction.metadata.num_mutated_metadata_only_inputs > 0
+                or CompiledFunction.metadata.num_outputs_aliased > 0
             ):
                 inp_tangents, out_tangents, intermediate_base_tangents = (
                     flat_args[0:num_mutated_inps],
-                    flat_args[num_mutated_inps:num_mutated_inps + CompiledFunction.num_outputs],
-                    flat_args[num_mutated_inps + CompiledFunction.num_outputs:],
+                    flat_args[num_mutated_inps:num_mutated_inps + CompiledFunction.metadata.num_outputs],
+                    flat_args[num_mutated_inps + CompiledFunction.metadata.num_outputs:],
                 )
                 # input_info contains info on *every* input,
                 # But in the backward(), we are only given grad outputs for every mutated input.
                 # We then need to filter out the grad outputs that correspond to metadata-only mutations.
-                mutated_inp_indices = CompiledFunction.fw_metadata.mutated_inp_indices
-                input_info = CompiledFunction.fw_metadata.input_info
+                mutated_inp_indices = CompiledFunction.metadata.fw_metadata.mutated_inp_indices
+                input_info = CompiledFunction.metadata.fw_metadata.input_info
                 assert len(inp_tangents) == len(mutated_inp_indices)
                 inp_tangents_filtered = [
                     x
@@ -1832,7 +2283,7 @@ def backward(ctx, *flat_args):
                     if input_info[info_idx].mutates_data
                 ]
                 # We also need to filter out grad outputs that correspond to outputs aliasing inputs/intermediates
-                out_info = CompiledFunction.fw_metadata.output_info
+                out_info = CompiledFunction.metadata.fw_metadata.output_info
                 out_tangents_filtered = [
                     x
                     for x, info in zip(out_tangents, out_info)
@@ -1856,178 +2307,63 @@ def backward(ctx, *flat_args):
             contiguous_args = [
                 t.contiguous() if torch.is_tensor(t) else t for t in flat_bw_args
             ]
+
             all_args = (
                 list(ctx.symints) + list(ctx.saved_tensors) + list(contiguous_args)
             )
             del contiguous_args
 
-            def call_compiled_backward(all_args):
-                all_args_list = list(all_args)
+            def call_compiled_backward():
                 if CompiledFunction.compiled_bw is None:
-                    # TODO - pass in fake tensors ?
-                    context = disable_autocast_manager if disable_amp else nullcontext
-                    with context(), track_graph_compiling(aot_config, "backward"):
-                        CompiledFunction.compiled_bw = aot_config.bw_compiler(
-                            bw_module, all_args_list
+                    if config.use_dynamic_shapes:
+                        all_args_list = list(all_args)
+                        CompiledFunction.compiled_bw = create_aot_dispatcher_function(
+                            bw_module, all_args_list, AOTConfig(
+                                aot_config.bw_compiler, None, None,
+                                aot_config.decompositions, 0, aot_config.aot_id, aot_config.keep_inference_input_mutations
+                            )
                         )
+                    else:
+                        context = disable_autocast_manager if disable_amp else nullcontext
+                        with context(), track_graph_compiling(aot_config, "backward"):
+                            CompiledFunction.compiled_bw = aot_config.bw_compiler(
+                                bw_module, all_args
+                            )
 
                 ctx.maybe_clear_saved_tensors()
                 out = call_func_with_args(
                     CompiledFunction.compiled_bw,
-                    all_args_list,
+                    all_args,
                     steal_args=True,
                     disable_amp=disable_amp,
                 )
+
                 return tuple(out)
 
             if torch.is_grad_enabled() and any(t.requires_grad for t in all_args if isinstance(t, torch.Tensor)):
-                # If backward pass was run with create_graph=True, ensure that the graph is
-                # properly connected, but errors when the user performs double backward.
+                # Ensure that the graph is connected, and error if double backward is performed.
                 # See comment for why once_differentiable is not sufficient:
                 # https://github.com/pytorch/pytorch/pull/92348/files#r1072962107
                 class CompiledFunctionBackward(torch.autograd.Function):
                     @staticmethod
-                    def forward(ctx, *all_args):
-                        return call_compiled_backward(all_args)
+                    def forward(ctx, *unused_args):
+                        return call_compiled_backward()
 
                     @staticmethod
                     def backward(ctx, *args):
                         raise RuntimeError("torch.compile with aot_autograd does not currently support double backward")
-
+                # Pass args even though they're unused, so that the graph is built
                 out = CompiledFunctionBackward.apply(*all_args)
             else:
-                out = call_compiled_backward(all_args)
+                out = call_compiled_backward()
             return out
 
-    @wraps(CompiledFunction.apply)
-    def compiled_function(*args):
-        # Step 2: remove aliased inputs that are mutated, replace with synthetic bases
-        # Only happens if our graph mutates an input that aliases another input.
-        if CompiledFunction.synthetic_base_info is not None:
-            # Given: the original args, including at least one pair of inputs that are aliased
-            # and get subsequently mutated.
-            # Generate: the updated args, including (potentially multiple) synthetic bases
-            # that replace the views. The input views are regenerated manually in the compiled function.
-            # TODO: think harder about what happens if (a view of) one of these mutated input views is ALSO returned
-            new_inputs, metadata = merge_view_inputs(
-                args, CompiledFunction.fw_metadata.input_info
-            )
-            # We're just re-running the original-args-to-synthetic-base transformation
-            # that we ran during compilation.
-            # This returns metadata that we use during tracing to recover the input views,
-            # which we don't actually need at runtime.
-            assert metadata is not None
-            args_with_synthetic_bases = new_inputs
-        else:
-            args_with_synthetic_bases = args
-
-        all_outs = CompiledFunction.apply(*args_with_synthetic_bases)
-
-        num_mutated_inps = CompiledFunction.num_mutated_inputs
-        num_intermediate_bases = CompiledFunction.fw_metadata.num_intermediate_bases
-        assert (
-            len(all_outs)
-            == num_mutated_inps + CompiledFunction.num_outputs + num_intermediate_bases
-        )
-        # Step 3: After running the compiled fw, apply updates to mutated inputs
-        if CompiledFunction.num_mutated_inputs > 0:
-            assert (
-                len(CompiledFunction.fw_metadata.mutated_inp_indices)
-                == CompiledFunction.num_mutated_inputs
-            )
-
-            updated_inputs = all_outs[: CompiledFunction.num_mutated_inputs]
-            fw_outs = all_outs[CompiledFunction.num_mutated_inputs :]
-
-            for i, inpt_idx in enumerate(
-                CompiledFunction.fw_metadata.mutated_inp_indices
-            ):
-                meta = CompiledFunction.fw_metadata.input_info[inpt_idx]
-                if not meta.mutates_data and not meta.mutates_metadata:
-                    continue
-                original_inpt = args[inpt_idx]
-                updated_inpt = updated_inputs[i]
-                if meta.mutates_metadata and not meta.mutates_data:
-                    assert isinstance(updated_inpt, TensorAlias)
-                    updated_inpt = updated_inpt.alias
-                    # We need to grab the size/stride/storage_offset from the compiled forward,
-                    # and use that to mutate the metadata of the input
-                    original_inpt.as_strided_(
-                        updated_inpt.size(),
-                        updated_inpt.stride(),
-                        updated_inpt.storage_offset(),
-                    )
-                else:
-                    # TODO: handle resize_() on inputs to a larger size.
-                    # This is actually non-trivial to detect, so we should probably just handle it
-                    # (or make dynamo detect).
-                    # We can't just check of original_inpt.storage_size != updated_inpt.storage_size,
-                    # Because the original_inpt might be a view of some larger tensor,
-                    # and updated_inpt is always densely packed.
-                    if meta.mutates_data and meta.mutates_metadata:
-                        original_inpt.as_strided_(
-                            updated_inpt.size(),
-                            updated_inpt.stride(),
-                            updated_inpt.storage_offset(),
-                        )
-                    else:
-                        assert meta.mutates_data
-                    original_inpt.copy_(updated_inpt)
-        else:
-            fw_outs = all_outs
-
-        # Step 4: Manually regenerate any outputs that are aliased to inputs, instead of
-        # compiling them.
-        if CompiledFunction.num_outputs_aliased > 0:
-            # The compiled forward also returned intermediate bases. We don't want to return them to the user.
-            if CompiledFunction.fw_metadata.num_intermediate_bases > 0:
-                fw_outs_no_intermediate_bases = fw_outs[
-                    : -CompiledFunction.fw_metadata.num_intermediate_bases
-                ]
-                intermediate_bases = fw_outs[-CompiledFunction.fw_metadata.num_intermediate_bases:]
-            else:
-                fw_outs_no_intermediate_bases = fw_outs
-                intermediate_bases = []
-            assert len(fw_outs_no_intermediate_bases) == len(CompiledFunction.fw_metadata.output_info)
-
-            fw_outs_including_aliases = []
-            for i, (o, info) in enumerate(zip(
-                fw_outs_no_intermediate_bases, CompiledFunction.fw_metadata.output_info
-            )):
-                if info.output_type == OutputType.non_alias:
-                    fw_outs_including_aliases.append(o)
-                    continue
-                assert isinstance(o, TensorAlias)
-                o_ = o.alias
-                o_grad = CompiledFunction.fw_metadata.requires_grad_info[CompiledFunction.num_mutated_inputs + i]
-                if info.output_type == OutputType.alias_of_input:
-                    aliased_base_tensor = args[info.base_idx]
-                    regenerated_out = gen_alias_from_base(aliased_base_tensor, o_, o_grad)
-                    fw_outs_including_aliases.append(regenerated_out)
-                    continue
-                elif info.output_type == OutputType.is_input:
-                    aliased_base_tensor = args[info.base_idx]
-                    regenerated_out = aliased_base_tensor
-                    fw_outs_including_aliases.append(regenerated_out)
-                    continue
-                elif info.output_type == OutputType.alias_of_intermediate:
-                    base_tensor_list = intermediate_bases
-                elif info.output_type == OutputType.alias_of_intermediate_save_as_output:
-                    base_tensor_list = intermediate_bases
-                else:
-                    assert info.output_type == OutputType.alias_of_intermediate_base_is_user_output
-                    base_tensor_list = fw_outs_no_intermediate_bases
-                aliased_base_tensor = base_tensor_list[info.base_idx]
-                # TODO: handle the custom autograd function case here.
-                # We need a way to check whether a tensor came from a custom autograd fn from python,
-                # AND a way to replay that custom view fn.
-                regenerated_out = gen_alias_from_base(
-                    aliased_base_tensor, o_, o_grad
-                )
-                fw_outs_including_aliases.append(regenerated_out)
-            return fw_outs_including_aliases
-        else:
-            return fw_outs
+    compiled_function = create_runtime_wrapper(
+        CompiledFunction.apply,
+        runtime_metadata=metadata_,
+        trace_joint=True,
+        keep_input_mutations=False,
+    )
 
     if not config.debug_assert:
         return compiled_function
@@ -2108,15 +2444,13 @@ def create_aot_dispatcher_function(
         # coordinate flags
         config.use_fake_tensor = False
 
-    if config.use_dynamic_shapes:
-        assert config.use_fake_tensor, "Dynamic shapes only works with fake tensor"
-
     # Check flat_args to see if they're already fake.  If so, use that fake
     # mode instead.
 
     for x in flat_args:
         if isinstance(x, FakeTensor):
             fake_mode = x.fake_mode
+            shape_env = fake_mode.shape_env
             break
     else:
         shape_env = ShapeEnv() if config.use_dynamic_shapes else None
@@ -2128,7 +2462,7 @@ def create_aot_dispatcher_function(
 
     cross_ref = CrossRefFakeMode() if config.debug_fake_cross_ref else nullcontext()
     python_dispatcher_mode = (
-        enable_python_dispatcher() if config.use_dynamic_shapes else nullcontext()
+        enable_python_dispatcher() if shape_env is not None else nullcontext()
     )
 
     with torch.autograd.set_multithreading_enabled(
@@ -2137,8 +2471,11 @@ def create_aot_dispatcher_function(
 
         def process_inputs(flat_args):
             if config.use_fake_tensor or isinstance(fake_mode, FakeTensorMode):
-
                 def convert(idx, x):
+                    if shape_env is not None:
+                        from torch._dynamo.source import ConstantSource
+                        if isinstance(x, int):
+                            return shape_env.create_symintnode(shape_env.create_symbol(x, ConstantSource(f"sym_{idx}")), hint=x)
                     if not isinstance(x, torch.Tensor):
                         return x
                     if isinstance(x, FakeTensor):
@@ -2215,6 +2552,7 @@ def aot_function(
     num_params_buffers: int = 0,
     hasher_type=None,  # deprecated
     static_argnums: Optional[Tuple[int]] = None,  # deprecated
+    keep_inference_input_mutations: bool = False
 ) -> Callable:
     """
     Traces the forward and backward graph of :attr:`fn` using torch dispatch
@@ -2280,6 +2618,7 @@ def aot_function(
         decompositions=decompositions,
         num_params_buffers=num_params_buffers,
         aot_id=next(AOT_COUNTER),
+        keep_inference_input_mutations=keep_inference_input_mutations
     )
     cached_res = None
 
@@ -2364,8 +2703,8 @@ def functional_call(named_params, named_buffers, *args, **kwargs):
         params_and_buffers = {**named_params, **named_buffers}
         return torch.func.functional_call(mod, params_and_buffers, args, kwargs)
 
-    named_params = dict(_named_parameters(mod, remove_duplicate=False))
-    named_buffers = dict(_named_buffers(mod, remove_duplicate=False))
+    named_params = dict(mod.named_parameters(remove_duplicate=False))
+    named_buffers = dict(mod.named_buffers(remove_duplicate=False))
     num_params_buffers = len(named_params) + len(named_buffers)
     compiled_f = aot_function(
         functional_call, num_params_buffers=num_params_buffers, *args, **kwargs
@@ -2373,7 +2712,7 @@ def functional_call(named_params, named_buffers, *args, **kwargs):
 
     class AOTModule(nn.Module):
         def __init__(self):
-            super(AOTModule, self).__init__()
+            super().__init__()
             self.orig_module = mod
 
         def forward(self, *args, **kwargs):
@@ -2396,6 +2735,7 @@ def aot_module_simplified(
     decompositions: Optional[Dict] = None,
     hasher_type=None,
     static_argnums=None,
+    keep_inference_input_mutations=False,
 ) -> nn.Module:
     """
     This is the simplified or low overhead version of aot_module. For frontends
@@ -2429,8 +2769,8 @@ def aot_module_simplified(
     torch._dynamo.utils.assert_no_fake_params_or_buffers(mod)
 
     params = {
-        **dict(_named_parameters(mod, remove_duplicate=False)),
-        **dict(_named_buffers(mod, remove_duplicate=False)),
+        **dict(mod.named_parameters(remove_duplicate=False)),
+        **dict(mod.named_buffers(remove_duplicate=False)),
     }
     params_flat, params_spec = pytree.tree_flatten(params)
     params_flat = tuple(params_flat)
@@ -2441,7 +2781,7 @@ def functional_call(*args, **kwargs):
             mod, pytree.tree_unflatten(args[:params_len], params_spec)
         ):
             if isinstance(mod, torch.fx.GraphModule):
-                with fx_traceback.override_stack_trace(), warnings.catch_warnings():
+                with fx_traceback.preserve_node_meta(), warnings.catch_warnings():
                     warnings.filterwarnings(
                         "ignore", "Anomaly Detection has been enabled."
                     )
@@ -2468,6 +2808,7 @@ def functional_call(*args, **kwargs):
         decompositions=decompositions,
         num_params_buffers=params_len,
         aot_id=next(AOT_COUNTER),
+        keep_inference_input_mutations=keep_inference_input_mutations,
     )
 
     full_args = []
diff --git a/torch/_functorch/compilers.py b/torch/_functorch/compilers.py
index 37bbcf8b03a4..735fcadb1c44 100644
--- a/torch/_functorch/compilers.py
+++ b/torch/_functorch/compilers.py
@@ -6,6 +6,7 @@
 from contextlib import contextmanager
 from functools import partial
 from typing import Callable, Optional, Tuple, Union
+import sympy
 
 import torch
 from torch import SymInt
@@ -96,7 +97,6 @@ def ts_compile(fx_g: fx.GraphModule, inps) -> Callable:
     return f
 
 
-@make_boxed_compiler
 def _draw_graph_compile(fx_g, _, name, clear_meta=True):
     print(fx_g.code)
     draw_graph(fx_g, name, clear_meta=clear_meta)
@@ -104,7 +104,9 @@ def _draw_graph_compile(fx_g, _, name, clear_meta=True):
 
 
 def draw_graph_compile(name):
-    return partial(_draw_graph_compile, name=name)
+    return make_boxed_compiler(
+        partial(_draw_graph_compile, name=name)
+    )
 
 
 @make_boxed_compiler
@@ -125,7 +127,6 @@ def run(self, *args):
         super().run(*args)
 
     def run_node(self, n):
-        import sympy
 
         def subst_symint(ni):
             if not isinstance(ni, SymInt):
diff --git a/torch/_functorch/config.py b/torch/_functorch/config.py
index 3bf964633510..40703ba653d7 100644
--- a/torch/_functorch/config.py
+++ b/torch/_functorch/config.py
@@ -8,12 +8,16 @@
 Global flags for aot autograd
 """
 import os
+import sys
 import logging
 
 use_functionalize = True
 
 use_fake_tensor = True
 
+# can be useful for debugging if we are incorrectly creating meta fake tensors
+fake_tensor_allow_meta = os.environ.get("FAKE_ALLOW_META", True)
+
 # Enables optional asserts in hotpath code to check for errors.  If
 # you are seeing weird accuracy problems, try turning this on.
 # For now, to more easily identify bugs, this is turned on by default.
@@ -40,3 +44,8 @@
 log_level = (
     logging.DEBUG if debug_partitioner or debug_graphs or debug_joint else logging.INFO
 )
+
+from .._dynamo.config_utils import install_config_module
+
+# adds patch, save_config, invalid config checks, etc
+install_config_module(sys.modules[__name__])
diff --git a/torch/_functorch/eager_transforms.py b/torch/_functorch/eager_transforms.py
index 496ea846df18..254759b2348b 100644
--- a/torch/_functorch/eager_transforms.py
+++ b/torch/_functorch/eager_transforms.py
@@ -8,7 +8,9 @@
 import torch
 from functools import partial, wraps
 import contextlib
-from torch.utils._pytree import tree_flatten, tree_unflatten, tree_map
+from torch.utils._pytree import tree_flatten, tree_unflatten, tree_map, tree_map_only
+from torch.fx.experimental import const_fold
+from torch.fx.experimental.proxy_tensor import make_fx
 from .pytree_hacks import tree_map_, treespec_pprint
 import torch.autograd.forward_ad as fwAD
 
@@ -337,6 +339,16 @@ def _safe_zero_index(x):
     assert len(x) == 1
     return x[0]
 
+# jacrev and jacfwd don't support complex functions
+# Helper function to throw appropriate error.
+def error_if_complex(func_name, args, is_input):
+    flat_args, _ = tree_flatten(args)
+    for idx, arg in enumerate(flat_args):
+        if arg.dtype.is_complex:
+            input_or_output = ("inputs" if is_input else "outputs")
+            err_msg = (f"{func_name}: Expected all {input_or_output} "
+                       f"to be real but received complex tensor at flattened input idx: {idx}")
+            raise RuntimeError(err_msg)
 
 @exposed_in("torch.func")
 def jacrev(func: Callable, argnums: Union[int, Tuple[int]] = 0, *, has_aux=False,
@@ -473,6 +485,7 @@ def jacrev(func: Callable, argnums: Union[int, Tuple[int]] = 0, *, has_aux=False
 
     @wraps(func)
     def wrapper_fn(*args):
+        error_if_complex("jacrev", args, is_input=True)
         vjp_out = _vjp_with_argnums(func, *args, argnums=argnums, has_aux=has_aux)
         if has_aux:
             output, vjp_fn, aux = vjp_out
@@ -482,6 +495,8 @@ def wrapper_fn(*args):
         # See NOTE: [Computing jacobian with vmap and vjp for multiple outputs]
         flat_output, output_spec = tree_flatten(output)
 
+        error_if_complex("jacrev", flat_output, is_input=False)
+
         # NB: vjp already checks that all outputs are tensors
         # Step 1: Construct grad_outputs by splitting the standard basis
         flat_output_numels = tuple(out.numel() for out in flat_output)
@@ -1093,6 +1108,7 @@ def jacfwd(func: Callable, argnums: argnums_t = 0, has_aux: bool = False, *, ran
     """
     @wraps(func)
     def wrapper_fn(*args):
+        error_if_complex("jacfwd", args, is_input=True)
         primals = args if argnums is None else _slice_argnums(args, argnums)
         flat_primals, primals_spec = tree_flatten(primals)
         flat_primals_numels = tuple(p.numel() for p in flat_primals)
@@ -1101,6 +1117,8 @@ def wrapper_fn(*args):
 
         def push_jvp(basis):
             output = _jvp_with_argnums(func, args, basis, argnums=argnums, has_aux=has_aux)
+            # output[0] is the output of `func(*args)`
+            error_if_complex("jacfwd", output[0], is_input=False)
             if has_aux:
                 _, jvp_out, aux = output
                 return jvp_out, aux
@@ -1600,3 +1618,112 @@ def wrapped(*args, **kwargs):
         finally:
             _func_decrement_nesting()
     return wrapped
+
+@exposed_in("torch.func")
+def linearize(func: Callable, *primals) -> Tuple[Any, Callable]:
+    '''
+    Returns the value of ``func`` at ``primals`` and linear approximation
+    at ``primals``.
+
+    Args:
+        func (Callable): A Python function that takes one or more arguments.
+        primals (Tensors): Positional arguments to ``func`` that must all be
+            Tensors. These are the values at which the function is linearly approximated.
+
+    Returns:
+        Returns a ``(output, jvp_fn)`` tuple containing the output of ``func``
+        applied to ``primals`` and a function that computes the jvp of
+        ``func`` evaluated at ``primals``.
+
+    linearize is useful if jvp is to be computed multiple times at ``primals``. However,
+    to achieve this, linearize saves intermediate computation and has higher memory requrements
+    than directly applying `jvp`. So, if all the ``tangents`` are known, it maybe more efficient
+    to compute vmap(jvp) instead of using linearize.
+
+    .. note::
+        linearize evaluates ``func`` twice. Please file an issue for an implementation
+        with a single evaluation.
+
+    Example::
+        >>> import torch
+        >>> from torch.func import linearize
+        >>> def fn(x):
+        ...     return x.sin()
+        ...
+        >>> output, jvp_fn = linearize(fn, torch.zeros(3, 3))
+        >>> jvp_fn(torch.ones(3, 3))
+        tensor([[1., 1., 1.],
+                [1., 1., 1.],
+                [1., 1., 1.]])
+        >>>
+
+    '''
+    # Note: We evaluate `fn` twice.
+    # Once for returning the output and other while
+    # tracing the graph.
+    # If this becomes a bottle-neck, we should update
+    # make_fx such that it also returns the output.
+
+    output = func(*primals)
+    _, output_spec = tree_flatten(output)
+
+    flat_primals, primals_argspec = tree_flatten(primals)
+
+    # tangents for tracing
+    flat_tangents = tuple(p.new_empty(()).expand_as(p) for p in flat_primals)
+
+    # function to trace
+    def trace_fn(flat_tangents):
+        with fwAD.dual_level():
+            flat_duals = tuple(fwAD.make_dual(p, t) for p, t in zip(flat_primals, flat_tangents))
+            duals = tree_unflatten(flat_duals, primals_argspec)
+            output = func(*duals)
+            tangents = tree_map_only(torch.Tensor, lambda t: fwAD.unpack_dual(t)[1], output)
+
+        return tangents
+
+    jvp_graph = make_fx(trace_fn)(flat_tangents)
+    const_folded_jvp_graph = const_fold.split_const_subgraphs(jvp_graph)
+
+    # Hold only the meta-data regarding the primals.
+    flat_primals_shape = tuple(p.shape for p in flat_primals)
+    flat_primals_device = tuple(p.device for p in flat_primals)
+    flat_primals_dtype = tuple(p.dtype for p in flat_primals)
+
+    def forward_ad_checks(flat_tangents):
+        for idx, t in enumerate(flat_tangents):
+            if t.shape != flat_primals_shape[idx]:
+                msg = (f"tangent:{idx} with shape {t.shape} in flattened "
+                       f"pytree doesn't match the shape {flat_primals_shape[idx]} "
+                       "of the corresponding primal.")
+                raise RuntimeError(msg)
+
+            if t.device != flat_primals_device[idx]:
+                msg = (f"tangent:{idx} with device {t.device} in flattened "
+                       f"pytree doesn't match the device {flat_primals_device[idx]} "
+                       "of the corresponding primal.")
+                raise RuntimeError(msg)
+
+            if t.dtype != flat_primals_dtype[idx]:
+                msg = (f"tangent:{idx} with dtype {t.dtype} in flattened "
+                       f"pytree doesn't match the dtype {flat_primals_dtype[idx]} "
+                       "of the corresponding primal.")
+                raise RuntimeError(msg)
+
+    # jvp_fn : callable to return
+    #   It takes care of checking the argspec of tangents,
+    #   calling the folded fx graph and unflattening fx graph output
+    def jvp_fn(*tangents):
+        flat_tangents, tangent_argspec = tree_flatten(tangents)
+        if tangent_argspec != primals_argspec:
+            raise RuntimeError(f"Expected the tangents {tangent_argspec} to have "
+                               f"the same argspec as the primals {primals_argspec}")
+
+        forward_ad_checks(flat_tangents)
+
+        flat_output = const_folded_jvp_graph(*flat_tangents)
+        # const folded graph can return flat output,
+        # so transform output.
+        return tree_unflatten(flat_output, output_spec)
+
+    return output, jvp_fn
diff --git a/torch/_functorch/functional_call.py b/torch/_functorch/functional_call.py
index de9c5879e436..0f8791d3b9ff 100644
--- a/torch/_functorch/functional_call.py
+++ b/torch/_functorch/functional_call.py
@@ -1,4 +1,5 @@
-from typing import Dict, Union, Any, Tuple, List
+from collections import Counter
+from typing import Any, Dict, List, Sequence, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -8,12 +9,13 @@
 
 @exposed_in("torch.func")
 def functional_call(
-    module: 'torch.nn.Module',
-    parameter_and_buffer_dicts: Union[Dict[str, Tensor], Tuple[Dict[str, Tensor], ...]],
+    module: "torch.nn.Module",
+    parameter_and_buffer_dicts: Union[Dict[str, Tensor], Sequence[Dict[str, Tensor]]],
     args: Union[Any, Tuple],
     kwargs: Dict[str, Any] = None,
     *,
     tie_weights: bool = True,
+    strict: bool = False,
 ):
     r"""Performs a functional call on the module by replacing the module parameters
     and buffers with the provided ones.
@@ -100,7 +102,7 @@ def compute_loss(params, x, t):
 
     Args:
         module (torch.nn.Module): the module to call
-        parameters_and_buffers (Dict[str,Tensor] or tuple of Dict[str, Tensor]): the parameters that will be used in
+        parameters_and_buffers (Dict[str, Tensor] or tuple of Dict[str, Tensor]): the parameters that will be used in
             the module call. If given a tuple of dictionaries, they must have distinct keys so that all dictionaries can
             be used together
         args (Any or tuple): arguments to be passed to the module call. If not a tuple, considered a single argument.
@@ -109,25 +111,49 @@ def compute_loss(params, x, t):
             tied in the reparamaterized version. Therefore, if True and different values are passed for the tied
             paramaters and buffers, it will error. If False, it will not respect the originally tied parameters and
             buffers unless the values passed for both weights are the same. Default: True.
+        strict (bool, optional): If True, then the parameters and buffers passed in must match the parameters and
+            buffers in the original module. Therefore, if True and there are any missing or unexpected keys, it will
+            error. Default: False.
 
     Returns:
         Any: the result of calling ``module``.
     """
-    parameters_and_buffers = parameter_and_buffer_dicts if isinstance(parameter_and_buffer_dicts, dict) else {}
-    if isinstance(parameter_and_buffer_dicts, tuple):
-        key_list = [i for dct in parameter_and_buffer_dicts for i in dct.keys()]
-        key_set = set(key_list)
-        if len(key_set) != len(key_list):
-            repeated_key = list(filter(lambda key: key_list.count(key) > 1, key_set))[0]
-            raise ValueError(f"{repeated_key} appeared in multiple dictionaries; behavior of functional call is ambiguous")
-
-        parameters_and_buffers = {k: v for d in parameter_and_buffer_dicts for k, v in d.items()}
-
-    return nn.utils.stateless._functional_call(module, parameters_and_buffers, args, kwargs, tie_weights=tie_weights)
+    if isinstance(parameter_and_buffer_dicts, dict):
+        parameters_and_buffers = parameter_and_buffer_dicts
+    elif isinstance(parameter_and_buffer_dicts, Sequence):
+        if not all(isinstance(d, dict) for d in parameter_and_buffer_dicts):
+            raise ValueError(
+                "Expected all elements of parameter_and_buffer_dicts to be dictionaries"
+            )
+        all_keys = [k for d in parameter_and_buffer_dicts for k in d.keys()]
+        repeated_keys = [key for key, n in Counter(all_keys).items() if n > 1]
+        if len(repeated_keys) > 0:
+            raise ValueError(
+                f"{repeated_keys} appeared in multiple dictionaries; behavior of functional call is ambiguous"
+            )
+        parameters_and_buffers = {
+            k: v for d in parameter_and_buffer_dicts for k, v in d.items()
+        }
+    else:
+        raise ValueError(
+            f"Expected parameter_and_buffer_dicts to be a dict, or a list/tuple of dicts, "
+            f"but got {type(parameter_and_buffer_dicts)}"
+        )
+
+    return nn.utils.stateless._functional_call(
+        module,
+        parameters_and_buffers,
+        args,
+        kwargs,
+        tie_weights=tie_weights,
+        strict=strict,
+    )
 
 
 @exposed_in("torch.func")
-def stack_module_state(models: List[nn.Module]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+def stack_module_state(
+    models: List[nn.Module],
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     """stack_module_state(models) -> params, buffers
 
     Prepares a list of torch.nn.Modules for ensembling with :func:`vmap`.
@@ -183,29 +209,39 @@ def forward(self, x):
         same mode (training vs eval).
     """
     if len(models) == 0:
-        raise RuntimeError('stack_module_state: Expected at least one model, got 0.')
+        raise RuntimeError("stack_module_state: Expected at least one model, got 0.")
     if not (all(m.training for m in models) or all(not m.training for m in models)):
-        raise RuntimeError('stack_module_state: Expected all models to '
-                           'have the same training/eval mode.')
+        raise RuntimeError(
+            "stack_module_state: Expected all models to have the same training/eval mode."
+        )
     model0_typ = type(models[0])
     if not all(type(m) == model0_typ for m in models):
-        raise RuntimeError('stack_module_state: Expected all models to '
-                           'be of the same class.')
-    all_params = [{k: v for k, v in model.named_parameters()} for model in models]
-    params = {k: construct_stacked_leaf(tuple(params[k] for params in all_params), k)
-              for k in all_params[0]}
-    all_buffers = [{k: v for k, v in model.named_buffers()} for model in models]
-    buffers = {k: construct_stacked_leaf(tuple(buffers[k] for buffers in all_buffers), k)
-               for k in all_buffers[0]}
+        raise RuntimeError(
+            "stack_module_state: Expected all models to be of the same class."
+        )
+    all_params = [dict(model.named_parameters()) for model in models]
+    params = {
+        k: construct_stacked_leaf(tuple(params[k] for params in all_params), k)
+        for k in all_params[0]
+    }
+    all_buffers = [dict(model.named_buffers()) for model in models]
+    buffers = {
+        k: construct_stacked_leaf(tuple(buffers[k] for buffers in all_buffers), k)
+        for k in all_buffers[0]
+    }
 
     return params, buffers
 
-def construct_stacked_leaf(tensors, name):
-    all_requires_grad = all([t.requires_grad for t in tensors])
-    none_requires_grad = all([not t.requires_grad for t in tensors])
+
+def construct_stacked_leaf(
+    tensors: Union[Tuple[Tensor, ...], List[Tensor]], name: str
+) -> Tensor:
+    all_requires_grad = all(t.requires_grad for t in tensors)
+    none_requires_grad = all(not t.requires_grad for t in tensors)
     if not all_requires_grad and not none_requires_grad:
         raise RuntimeError(
-            f'Expected {name} from each model to have the same .requires_grad')
+            f"Expected {name} from each model to have the same .requires_grad"
+        )
     result = torch.stack(tensors)
     if all_requires_grad:
         result = result.detach().requires_grad_()
diff --git a/torch/_functorch/make_functional.py b/torch/_functorch/make_functional.py
index cd7db8256e11..711be174d827 100644
--- a/torch/_functorch/make_functional.py
+++ b/torch/_functorch/make_functional.py
@@ -21,44 +21,13 @@
 import torch
 import torch.nn as nn
 from torch import Tensor
-from .named_members_polyfill import _named_buffers, _named_parameters
+from torch.nn.utils._named_member_accessor import NamedMemberAccessor
 
 # Utilities to make nn.Module "functional"
 # In particular the goal is to be able to provide a function that takes as input
 # the parameters and evaluate the nn.Module using fixed inputs.
 
 
-def _del_nested_attr(obj: nn.Module, names: List[str]) -> None:
-    """
-    Deletes the attribute specified by the given list of names.
-    For example, to delete the attribute obj.conv.weight,
-    use _del_nested_attr(obj, ['conv', 'weight'])
-    """
-    if len(names) == 1:
-        delattr(obj, names[0])
-    else:
-        _del_nested_attr(getattr(obj, names[0]), names[1:])
-
-
-def _set_nested_attr(obj: nn.Module, names: List[str], value: Tensor) -> None:
-    """
-    Set the attribute specified by the given list of names to value.
-    For example, to set the attribute obj.conv.weight,
-    use _del_nested_attr(obj, ['conv', 'weight'], value)
-    """
-    if len(names) == 1:
-        setattr(obj, names[0], value)
-    else:
-        _set_nested_attr(getattr(obj, names[0]), names[1:], value)
-
-
-def _get_nested_attr(obj: nn.Module, names: List[str]) -> Tensor:
-    if len(names) == 1:
-        return getattr(obj, names[0])
-    else:
-        return _get_nested_attr(getattr(obj, names[0]), names[1:])
-
-
 def raise_parameter_tying_error() -> NoReturn:
     raise RuntimeError(
         "make_functional(module): we don't yet support models that "
@@ -72,14 +41,14 @@ def raise_parameter_tying_error() -> NoReturn:
 def create_names_map(
     named_params: Union[Dict[str, Tensor], Iterable[Tuple[str, Tensor]]],
     tied_named_params: Union[Dict[str, Tensor], Iterable[Tuple[str, Tensor]]],
-) -> Dict[str, List[List[str]]]:
+) -> Dict[str, List[str]]:
     """
     named_params is a dictionary of tensors: {'A': A, 'B': B}
     tied_named_params is another dictionary of tensors {'A': A, 'B': B, 'B_tied': B}
     with potentially tied (or 'duplicated') tensors
 
     This function creates a mapping from the names in named_params to the
-    names in tied_named_params: {'A': [['A']], 'B': [['B'], ['B_tied']]}.
+    names in tied_named_params: {'A': ['A'], 'B': ['B', 'B_tied']}.
     """
     named_params = dict(named_params)
     tied_named_params = dict(tied_named_params)
@@ -88,32 +57,32 @@ def create_names_map(
     tied_tensors_dict_keys = set(tied_named_params.keys())
     assert tensors_dict_keys.issubset(tied_tensors_dict_keys)
 
-    tensor_to_mapping: Dict[Tensor, Tuple[str, List[List[str]]]] = {}
+    tensor_to_mapping: Dict[Tensor, Tuple[str, List[str]]] = {}
     for key, tensor in named_params.items():
         tensor_to_mapping[tensor] = (key, [])
     for key, tensor in tied_named_params.items():
         assert tensor in tensor_to_mapping
-        tensor_to_mapping[tensor][1].append(key.split("."))
+        tensor_to_mapping[tensor][1].append(key)
     return dict(tensor_to_mapping.values())
 
 
 def _extract_members(
     mod: nn.Module,
-    _named_members: Callable[..., Iterable[Tuple[str, Tensor]]],
     named_members: Callable[..., Iterable[Tuple[str, Tensor]]],
     subclass: Callable[[Tensor], Tensor],
-) -> Tuple[Tuple[Tensor, ...], Tuple[str, ...], Dict[str, List[List[str]]]]:
-    all_named_members = tuple(_named_members(mod, remove_duplicate=False))
-    unique_named_members = tuple(named_members())
+) -> Tuple[Tuple[Tensor, ...], Tuple[str, ...], Dict[str, List[str]]]:
+    all_named_members = tuple(named_members(remove_duplicate=False))
+    unique_named_members = tuple(named_members(remove_duplicate=True))
     names_map = create_names_map(unique_named_members, all_named_members)
 
     # Remove all the members in the model
     memo = {}
+    accessor = NamedMemberAccessor(mod)
     for name, p in all_named_members:
         if p not in memo:
             memo[p] = subclass(torch.empty_like(p, device="meta"))
         replacement = memo[p]
-        _set_nested_attr(mod, name.split("."), replacement)
+        accessor.set_tensor(name, replacement)
 
     if len(unique_named_members) == 0:
         names, params = (), ()
@@ -124,7 +93,7 @@ def _extract_members(
 
 def extract_weights(
     mod: nn.Module,
-) -> Tuple[Tuple[Tensor, ...], Tuple[str, ...], Dict[str, List[List[str]]]]:
+) -> Tuple[Tuple[Tensor, ...], Tuple[str, ...], Dict[str, List[str]]]:
     """
     This function removes all the Parameters from the model and
     return them as a tuple as well as their original attribute names.
@@ -133,13 +102,13 @@ def extract_weights(
     Note that this function modifies the model in place and after this
     call, mod.parameters() will be empty.
     """
-    return _extract_members(mod, _named_parameters, mod.named_parameters, nn.Parameter)
+    return _extract_members(mod, mod.named_parameters, nn.Parameter)
 
 
 def extract_buffers(
     mod: nn.Module,
-) -> Tuple[Tuple[Tensor, ...], Tuple[str, ...], Dict[str, List[List[str]]]]:
-    return _extract_members(mod, _named_buffers, mod.named_buffers, lambda x: x)
+) -> Tuple[Tuple[Tensor, ...], Tuple[str, ...], Dict[str, List[str]]]:
+    return _extract_members(mod, mod.named_buffers, lambda x: x)
 
 
 def load_weights(
@@ -153,23 +122,23 @@ def load_weights(
     Note that the `params` are regular Tensors (that can have history) and so are left
     as Tensors. This means that mod.parameters() will still be empty after this call.
     """
-    for name, p in zip(names, params):
-        if as_params:
-            p = nn.Parameter(p)
-        _del_nested_attr(mod, name.split("."))
-        _set_nested_attr(mod, name.split("."), p)
+    accessor = NamedMemberAccessor(mod)
+    if as_params:
+        params = [nn.Parameter(p) for p in params]
+    accessor.set_tensors(names, params)
 
 
 def _swap_state(
-    mod: nn.Module, names_map: Dict[str, List[List[str]]], elems: Iterable[Tensor]
+    mod: nn.Module, names_map: Dict[str, List[str]], elems: Iterable[Tensor]
 ) -> List[Tensor]:
     result: List[Tensor] = []
+    accessor = NamedMemberAccessor(mod)
     for (_, attr_names), elem in zip(names_map.items(), elems):
         for i, attr_name in enumerate(attr_names):
             if i == 0:
-                result.append(_get_nested_attr(mod, attr_name))
-            _del_nested_attr(mod, attr_name)
-            _set_nested_attr(mod, attr_name, elem)
+                result.append(accessor.swap_tensor(attr_name, elem))
+            else:
+                accessor.set_tensor(attr_name, elem)
     return result
 
 
@@ -179,8 +148,8 @@ def load_buffers(
     buffers: Sequence[Tensor],
     as_params: bool = False,
 ) -> None:
-    for name, p in zip(names, buffers):
-        _set_nested_attr(mod, name.split("."), p)
+    accessor = NamedMemberAccessor(mod)
+    accessor.set_tensors(names, buffers)
 
 
 def load_state(
@@ -292,10 +261,10 @@ def __init__(
         stateless_model: nn.Module,
         param_names: Tuple[str, ...],
         buffer_names: Tuple[str, ...],
-        param_names_map: Dict[str, List[List[str]]],
-        buffer_names_map: Dict[str, List[List[str]]],
+        param_names_map: Dict[str, List[str]],
+        buffer_names_map: Dict[str, List[str]],
     ) -> None:
-        super(FunctionalModuleWithBuffers, self).__init__()
+        super().__init__()
         self.stateless_model = stateless_model
         self.param_names = param_names
         self.buffer_names = buffer_names
@@ -347,9 +316,9 @@ def __init__(
         self,
         stateless_model: nn.Module,
         param_names: Tuple[str, ...],
-        names_map: Dict[str, List[List[str]]],
+        names_map: Dict[str, List[str]],
     ) -> None:
-        super(FunctionalModule, self).__init__()
+        super().__init__()
         self.stateless_model = stateless_model
         self.param_names = param_names
         self.names_map = names_map
@@ -569,8 +538,7 @@ def combine_state_for_ensemble(
     model0_typ = type(models[0])
     if not all(type(m) == model0_typ for m in models):
         raise RuntimeError(
-            "combine_state_for_ensemble: Expected all models to "
-            "be of the same class."
+            "combine_state_for_ensemble: Expected all models to be of the same class."
         )
     funcs, params, buffers = zip(
         *[make_functional_with_buffers(model) for model in models]
diff --git a/torch/_functorch/named_members_polyfill.py b/torch/_functorch/named_members_polyfill.py
deleted file mode 100644
index 80704eb551ad..000000000000
--- a/torch/_functorch/named_members_polyfill.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Polyfilled from pytorch core while we figure out the `remove_duplicate` issues.
-def _named_members(mod, get_members_fn, prefix='', recurse=True, remove_duplicate=True):
-    r"""Helper method for yielding various names + members of modules."""
-    memo = set()
-    modules = mod.named_modules(prefix=prefix, remove_duplicate=remove_duplicate) if recurse else [(prefix, mod)]
-    for module_prefix, module in modules:
-        members = get_members_fn(module)
-        for k, v in members:
-            if v is None or v in memo:
-                continue
-            if remove_duplicate:
-                memo.add(v)
-            name = module_prefix + ('.' if module_prefix else '') + k
-            yield name, v
-
-
-def _named_parameters(mod, prefix: str = '', recurse: bool = True, remove_duplicate: bool = True):
-    gen = _named_members(
-        mod,
-        lambda module: module._parameters.items(),
-        prefix=prefix, recurse=recurse, remove_duplicate=remove_duplicate)
-    for elem in gen:
-        yield elem
-
-
-def _named_buffers(mod, prefix: str = '', recurse: bool = True, remove_duplicate: bool = True):
-    gen = _named_members(
-        mod,
-        lambda module: module._buffers.items(),
-        prefix=prefix, recurse=recurse, remove_duplicate=remove_duplicate)
-    for elem in gen:
-        yield elem
diff --git a/torch/_functorch/partitioners.py b/torch/_functorch/partitioners.py
index 0abbf87b327a..03b5563e9966 100644
--- a/torch/_functorch/partitioners.py
+++ b/torch/_functorch/partitioners.py
@@ -1,4 +1,5 @@
 from torch.fx.experimental.proxy_tensor import is_sym_node, py_sym_types
+from torch.fx.experimental.symbolic_shapes import hint_int
 import torch
 import torch.fx as fx
 import operator
@@ -17,7 +18,7 @@
 
 
 
-class InvalidNodeBase(object):
+class InvalidNodeBase:
     def __repr__(self):
         return "Invalid Node"
 
@@ -221,21 +222,14 @@ def _tensor_nbytes(numel, dtype):
     return numel * sizes[dtype]
 
 def _size_of(node: fx.Node) -> int:
-    def to_size_hint(s):
-        if isinstance(s, torch.SymInt):
-            py_s = s.node
-            return py_s.shape_env.size_hint(py_s.expr)
-        assert isinstance(s, int)
-        return s
-
     if 'val' in node.meta:
         val = node.meta['val']
         if isinstance(val, py_sym_types):
             return 1
         elif isinstance(val, (list, tuple)):
-            return sum(_tensor_nbytes(to_size_hint(n.numel()), n.dtype) for n in val if isinstance(n, torch.Tensor))
+            return sum(_tensor_nbytes(hint_int(n.numel()), n.dtype) for n in val if isinstance(n, torch.Tensor))
         elif isinstance(val, torch.Tensor):
-            return _tensor_nbytes(to_size_hint(val.numel()), val.dtype)
+            return _tensor_nbytes(hint_int(val.numel()), val.dtype)
 
         raise RuntimeError(f"Unknown metadata type {type(val)}")
 
@@ -394,19 +388,19 @@ def is_tensor_node(x):
 
     fusible_ops = recomputable_ops | set(random_ops)
     if AOT_PARTITIONER_DEBUG:
-        joint_module_ops = set(
+        joint_module_ops = {
             str(node.target._overloadpacket)
             for node in joint_module.graph.nodes
             if node.op == "call_function" and hasattr(node.target, "_overloadpacket")
-        )
-        ops_ignored = joint_module_ops - set([str(i) for i in recomputable_ops])
+        }
+        ops_ignored = joint_module_ops - {str(i) for i in recomputable_ops}
         print("Ops banned from rematerialization: ", ops_ignored)
         print()
 
     AGGRESSIVE_RECOMPUTATION = False
 
     def is_materialized_backwards(node):
-        cur_nodes = set([node])
+        cur_nodes = {node}
         while len(cur_nodes) > 0:
             cur = cur_nodes.pop()
             for user in cur.users:
@@ -528,8 +522,8 @@ def get_node_weight(node) -> int:
         joint_module, saved_values, saved_sym_nodes=saved_sym_nodes, num_fwd_outputs=num_fwd_outputs)
     if AOT_PARTITIONER_DEBUG:
         print("Theoretical Activations Stored: ", sum([_size_of(i) for i in saved_values]) / 1e9)
-        fw_module_nodes = set([node.name for node in fw_module.graph.nodes if node.op == 'call_function'])
-        bw_module_nodes = set([node.name for node in bw_module.graph.nodes if node.op == 'call_function'])
+        fw_module_nodes = {node.name for node in fw_module.graph.nodes if node.op == 'call_function'}
+        bw_module_nodes = {node.name for node in bw_module.graph.nodes if node.op == 'call_function'}
         remat_nodes = fw_module_nodes & bw_module_nodes
 
         counts = defaultdict(int)
diff --git a/torch/_functorch/vmap.py b/torch/_functorch/vmap.py
index 0cae1b900eda..efb0f6ed0b81 100644
--- a/torch/_functorch/vmap.py
+++ b/torch/_functorch/vmap.py
@@ -12,7 +12,6 @@
 from .pytree_hacks import tree_map_
 from functools import partial
 import os
-import sys
 import itertools
 
 from torch._C._functorch import (
@@ -226,8 +225,7 @@ def lazy_load_decompositions():
         return
     DECOMPOSITIONS_LOADED = True
 
-    if not (os.environ.get("PYTORCH_JIT", "1" if sys.version_info < (3, 11) else "0") == "1" and
-            __debug__):
+    if not (os.environ.get("PYTORCH_JIT", "1") == "1" and __debug__):
         return
     # use an alternate way to register an operator into the decomposition table
     # _register_jit_decomposition doesn't work for some operators, e.g. addr,
diff --git a/torch/_guards.py b/torch/_guards.py
index 0591d4048d95..5e2fb89b904e 100644
--- a/torch/_guards.py
+++ b/torch/_guards.py
@@ -8,12 +8,7 @@
 
 log = logging.getLogger(__name__)
 
-# TODO(voz): Stolen pattern, not sure why this is the case,
-# but mypy complains.
-try:
-    import sympy  # type: ignore[import]
-except ImportError:
-    log.warning("No sympy found")
+import sympy
 
 """
 torch._guards is the definitional source of truth for general purpose guard structures.
@@ -356,19 +351,16 @@ class Source:
     def reconstruct(self, codegen):
         raise NotImplementedError()
 
-    def guard_source(self):
+    def guard_source(self) -> GuardSource:
         raise NotImplementedError()
 
-    def name(self):
+    def name(self) -> str:
         raise NotImplementedError()
 
-    def make_guard(self, fn, is_volatile=False):
+    def make_guard(self, fn, is_volatile=False) -> Guard:
         if self.guard_source() is GuardSource.CONSTANT:
             raise NotImplementedError()
         return Guard(self.name(), self.guard_source(), fn, is_volatile)
 
-    def is_nn_module(self):
-        return self.guard_source() in (
-            GuardSource.LOCAL_NN_MODULE,
-            GuardSource.GLOBAL_NN_MODULE,
-        )
+    def is_nn_module(self) -> bool:
+        return self.guard_source().is_nn_module()
diff --git a/torch/_inductor/__init__.py b/torch/_inductor/__init__.py
index e69de29bb2d1..ceadaac7472e 100644
--- a/torch/_inductor/__init__.py
+++ b/torch/_inductor/__init__.py
@@ -0,0 +1,27 @@
+from typing import Any, Dict, List, Optional
+
+import torch.fx
+
+__all__ = ["compile"]
+
+
+def compile(
+    gm: torch.fx.GraphModule,
+    example_inputs: List[torch.Tensor],
+    options: Optional[Dict[str, Any]] = None,
+):
+    """
+    Compile a given FX graph with TorchInductor.  This allows compiling
+    FX graphs captured without using TorchDynamo.
+
+    Args:
+        gm: The FX graph to compile.
+        example_inputs:  List of tensor inputs.
+        options:  Optional dict of config options.  See `torch._inductor.config`.
+
+    Returns:
+        Callable with same behavior as gm but faster.
+    """
+    from .compile_fx import compile_fx
+
+    return compile_fx(gm, example_inputs, config_patches=options)
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index af6eb16fc448..c2316529ffd2 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -17,15 +17,25 @@
 import types
 from concurrent.futures import Future, ProcessPoolExecutor, ThreadPoolExecutor
 from ctypes import cdll
+from functools import partial
 from threading import Thread
 from time import sleep, time
 from typing import Any, Callable, Dict, List
 
 import torch
 
+from torch._inductor import config, cuda_properties, exc
+from torch._inductor.utils import developer_warning
 from torch.hub import _Faketqdm, tqdm
 from torch.utils import cpp_extension
-from . import config, cuda_properties, exc
+
+if config.is_fbcode():
+    from torch._inductor.fb.logging import global_cache_log
+else:
+
+    def global_cache_log(*args, **kwargs):
+        pass
+
 
 LOCK_TIMEOUT = 600
 
@@ -55,40 +65,106 @@ def _compile_end():
 
 @functools.lru_cache(None)
 def cache_dir():
-    return os.environ.get(
+    cache_dir = os.environ.get(
         "TORCHINDUCTOR_CACHE_DIR",
         f"{tempfile.gettempdir()}/torchinductor_{getpass.getuser()}",
     )
+    os.makedirs(cache_dir, exist_ok=True)
+    return cache_dir
 
 
-class DiskCache:
-    @staticmethod
-    @functools.lru_cache(None)
-    def _subdir():
-        subdir = os.path.join(cache_dir(), "cached_tunings")
-        os.makedirs(subdir, exist_ok=True)
-        return subdir
+class PersistentCache:
+    def __init__(self):
+        self.local_cache_path = os.path.join(cache_dir(), "local_cache")
+        self.global_cache_path = config.global_cache_path
 
-    @staticmethod
-    @functools.lru_cache(4096)
-    def _read_file(path):
-        with open(path, "r") as fd:
-            return json.loads(fd.read())
+        if torch.cuda.is_available():
+            self.dinfo = torch.cuda.get_device_properties(
+                torch.cuda.current_device()
+            ).name
+            self.vinfo = torch.version.cuda
 
-    def __init__(self, unique_name):
-        super().__init__()
-        self.unique_name = unique_name
+    def get_local_cache(self):
+        if not os.path.isfile(self.local_cache_path):
+            return {}
+        with open(self.local_cache_path, "r") as local_cache_file:
+            local_cache = json.load(local_cache_file)
+        return local_cache
 
-    def lookup(self, key: Any, generate: Callable[[], Any]):
+    def update_local_cache(self, local_cache):
+        write_atomic(self.local_cache_path, json.dumps(local_cache, indent=4))
+
+    @functools.lru_cache(None)
+    def get_global_cache(self):
+        if self.global_cache_path is None or not os.path.isfile(self.global_cache_path):
+            return {}
+        with open(self.global_cache_path, "r") as global_cache_file:
+            global_cache = json.load(global_cache_file)
+        if self.dinfo not in global_cache:
+            global_cache[self.dinfo] = {}
+        if self.vinfo not in global_cache[self.dinfo]:
+            global_cache[self.dinfo][self.vinfo] = {}
+        return global_cache[self.dinfo][self.vinfo]
+
+    def lookup(
+        self,
+        choices,
+        name: str,
+        inputs: str,
+        benchmark: Callable[[Any], float],
+    ):
         """
-        Check if we have already generated key, if not call generate()
-        to populate the cache.
+        Check to see if we have benchmarked the given choice callers. For each
+        choice caller:
+
+            1. Check global_cache[name][inputs][choice], return benchmark if cached.
+            2. Check local_cache[name][inputs][choice], return benchmark if cached.
+            3.
+                a. `max_autotune=True`: benchmark the choice, update
+                    local_cache[name][inputs][choice], and return the benchmark.
+                b. `max_autotune=False`: don't benchmark the choice, return nothing.
         """
-        path = os.path.join(self._subdir(), code_hash(self.unique_name + repr(key)))
-        if not os.path.exists(path):
-            value = generate()
-            write_atomic(path, json.dumps(value))
-        return self._read_file(path)
+
+        gc_log = partial(global_cache_log, self.dinfo, self.vinfo, name, inputs)
+        timings = {}
+
+        def check_cache(cache, callback=None):
+            """Check if `cache` contains data for all the choices"""
+            hit = True
+            for choice in choices:
+                choice_hash = choice.hash_key()
+                if choice_hash in cache.get(name, {}).get(inputs, {}):
+                    # cache hit
+                    timings[choice] = cache[name][inputs][choice_hash]
+                    if callback:
+                        callback(choice_hash, cached=True)
+                else:
+                    # cache miss
+                    hit = False
+                    if callback:
+                        callback(choice_hash, cached=False)
+            return hit
+
+        if config.max_autotune:
+            local_cache = self.get_local_cache()
+            # check local cache first since it is data specific to the current machine
+            if not check_cache(local_cache) and not check_cache(
+                self.get_global_cache(), callback=gc_log
+            ):
+                # re-benchmark everything to try to get consistent numbers from the same machine
+                for choice in choices:
+                    timings[choice] = benchmark(choice)
+                    local_cache.setdefault(name, {})
+                    local_cache[name].setdefault(inputs, {})
+                    local_cache[name][inputs][choice.hash_key()] = timings[choice]
+
+                self.update_local_cache(local_cache)
+        else:
+            # only check global cache, not local one
+            check_cache(self.get_global_cache(), callback=gc_log)
+            # may have a partial cache hit, where not everything is benchmarked
+
+        return timings
 
 
 def get_lock_dir():
@@ -111,7 +187,7 @@ def get_code_path(source_code, ext, extra):
     basename = code_hash(source_code + extra)
     subdir = os.path.join(cache_dir(), basename[1:3])
     path = os.path.join(subdir, f"{basename}.{ext}")
-    return basename, subdir, path
+    return extra + basename, subdir, path
 
 
 def write(source_code, ext, extra=""):
@@ -196,7 +272,7 @@ def is_gcc():
     return re.search(r"(gcc|g\+\+)", cpp_compiler())
 
 
-class VecISA(object):
+class VecISA:
     _bit_width: int
     _macro: str
     _arch_flags: str
@@ -252,7 +328,7 @@ def __hash__(self) -> int:
 
     @functools.lru_cache(None)
     def __bool__(self):
-        key, input_path = write(VecISA._avx_code, "cpp", extra="")
+        key, input_path = write(VecISA._avx_code, "cpp")
         from filelock import FileLock
 
         lock_dir = get_lock_dir()
@@ -371,7 +447,14 @@ def cpp_flags():
 
 
 def optimization_flags():
-    return "-march=native -O3 -ffast-math -fno-finite-math-only -fopenmp"
+    base_flags = "-O3 -ffast-math -fno-finite-math-only"
+    if sys.platform == "darwin":
+        # Per https://mac.r-project.org/openmp/ right way to pass `openmp` flags to MacOS is via `-Xclang`
+        # Also, `-march=native` is unrecognized option on M1
+        base_flags += " -Xclang -fopenmp"
+    else:
+        base_flags += " -march=native -fopenmp"
+    return base_flags
 
 
 def use_custom_generated_macros():
@@ -402,8 +485,24 @@ def get_include_and_linking_paths(
         # This approach allows us to only pay for what we use.
         ipaths = cpp_extension.include_paths() + [sysconfig.get_path("include")]
         lpaths = []
-        libs = ["gomp"]
         macros = ""
+        if sys.platform == "darwin":
+            # GNU OpenMP generally is not available on MacOS
+            # There is either Intel OpenMP(for x86) or LLVM OpenMP (for both x86 and arm64)
+            libs = ["omp"]
+            if os.getenv("CONDA_PREFIX") is not None:
+                # On MacOS OpenMP is not available via the system install
+                # But on conda can be provided using https://anaconda.org/anaconda/llvm-openmp
+                conda_lib_path = os.path.join(os.getenv("CONDA_PREFIX"), "lib")
+                ipaths.append(os.path.join(os.getenv("CONDA_PREFIX"), "include"))
+                lpaths.append(conda_lib_path)
+                # Prefer Intel OpenMP on x86 machine
+                if os.uname().machine == "x86_64" and os.path.exists(
+                    os.path.join(conda_lib_path, "libiomp5.dylib")
+                ):
+                    libs = ["iomp5"]
+        else:
+            libs = ["gomp"]
     ipaths = " ".join(["-I" + p for p in ipaths])
     lpaths = " ".join(["-L" + p for p in lpaths])
     libs = " ".join(["-l" + p for p in libs])
@@ -464,7 +563,7 @@ def load(cls, source_code):
         key, input_path = write(
             source_code,
             "cpp",
-            extra=cpp_compile_command("i", "o", vec_isa=picked_vec_isa),
+            code_hash(repr(cpp_compile_command("i", "o", vec_isa=picked_vec_isa))),
         )
         if key not in cls.cache:
             from filelock import FileLock
@@ -493,8 +592,8 @@ class PyCodeCache:
     clear = staticmethod(cache.clear)
 
     @classmethod
-    def load(cls, source_code):
-        key, path = write(source_code, "py")
+    def load(cls, source_code, extra=""):
+        key, path = write(source_code, "py", extra)
         if key not in cls.cache:
             with open(path) as f:
                 code = compile(f.read(), path, "exec")
@@ -507,13 +606,6 @@ def load(cls, source_code):
         return cls.cache[key]
 
 
-@functools.lru_cache(None)
-def patch_triton_dir():
-    os.environ["TRITON_CACHE_DIR"] = os.environ.get(
-        "TRITON_CACHE_DIR", os.path.join(cache_dir(), "triton")
-    )
-
-
 class TritonCodeCache:
     @staticmethod
     def get_name(mod):
@@ -522,7 +614,6 @@ def get_name(mod):
 
     @classmethod
     def load(cls, source_code):
-        patch_triton_dir()
         mod = PyCodeCache.load(source_code)
         return getattr(mod, cls.get_name(mod))
 
@@ -559,10 +650,10 @@ def result(self):
         latency = time() - t0
         if latency > 50:
             name = _load_kernel_name(self.source_code)
-            log.warning(
+            developer_warning(
                 f"Detected long compilation time of {latency} seconds for kernel name {name}"
             )
-            log.warning(self.source_code)
+            developer_warning(self.source_code)
         del self.source_code, self.future
         return kernel
 
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index b53f8d6d227a..c3f35da4aa73 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -9,6 +9,8 @@
 import sympy
 from sympy.printing.printer import Printer
 
+import torch
+
 from .. import metrics
 from ..utils import (
     DeferredLineBase,
@@ -69,7 +71,32 @@ def _print_Mod(self, expr):
         return " % ".join(map(self.paren, map(self._print, expr.args)))
 
     def _print_CleanDiv(self, expr):
-        return self._print_IndexingDiv(expr)
+        return self._print_FloorDiv(expr)
+
+
+class PythonPrinter(ExprPrinter):
+    def _print_ModularIndexing(self, expr):
+        x, div, mod = expr.args
+        x = self.paren(self.doprint(x))
+        div = self.paren(self.doprint(div))
+        mod = self.paren(self.doprint(mod))
+        if div != "1":
+            x = f"({x} // {div})"
+        return f"{x} % {mod}"
+
+    def _print_FloorDiv(self, expr):
+        x, div = expr.args
+        x = self.paren(self.doprint(x))
+        div = self.paren(self.doprint(div))
+        return f"({x} // {div})"
+
+    def _print_floor(self, expr):
+        assert len(expr.args) == 1
+        return f"math.floor({self.paren(self._print(expr.args[0]))})"
+
+    def _print_ceiling(self, expr):
+        assert len(expr.args) == 1
+        return f"math.ceil({self.paren(self._print(expr.args[0]))})"
 
 
 class OpOverrides:
@@ -123,6 +150,16 @@ def bitwise_or(x, y):
     def bitwise_xor(x, y):
         return f"{ExprPrinter.paren(x)} ^ {ExprPrinter.paren(y)}"
 
+    @staticmethod
+    def bitwise_left_shift(x, y):
+        return f"{ExprPrinter.paren(x)} << {ExprPrinter.paren(y)}"
+
+    # TODO(fdrocha): this is currently not being used anywhere,
+    # pending on moving triton pin past 972b761
+    @staticmethod
+    def bitwise_right_shift(x, y):
+        return f"{ExprPrinter.paren(x)} >> {ExprPrinter.paren(y)}"
+
     @staticmethod
     def remainder(a, b):
         r = ops.mod(a, b)
@@ -150,7 +187,7 @@ def _new_line(self, line):
 
 class DeferredIndentedBuffer(IndentedBuffer):
     def __init__(self, initial_indent=0):
-        super(DeferredIndentedBuffer, self).__init__(initial_indent)
+        super().__init__(initial_indent)
 
     def writeline(self, name, line):
         if name is None:
@@ -274,9 +311,14 @@ def cpp_argdefs(self):
 
         # TODO(jansel): replace this with data from scheduler
         buffer_types = {x.get_name(): x.get_dtype() for x in V.graph.buffers}
-        buffer_types.update(
-            {name: val.get_dtype() for name, val in V.graph.graph_inputs.items()}
-        )
+        for name, val in V.graph.graph_inputs.items():
+            if isinstance(val, sympy.Expr):
+                if val.is_integer:
+                    buffer_types[name] = torch.int64
+                else:
+                    buffer_types[name] = torch.float64
+            else:
+                buffer_types[name] = val.get_dtype()
         buffer_types.update(
             {name: val.dtype for name, val in V.graph.constants.items()}
         )
@@ -340,6 +382,7 @@ def python_argdefs(self):
             arg_defs.append(inner)
             call_args.append(str(outer))
             precompile_args.append(SizeArg(inner, outer))
+
         return arg_defs, call_args, precompile_args
 
     def aliases(self):
@@ -619,7 +662,9 @@ def rename_indexing(self, index) -> sympy.Expr:
         index = V.graph.sizevars.simplify(index)
         sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name)
         replacements = {
-            x: self.args.size(x) for x in sorted_symbols if x.name.startswith("s")
+            x: self.args.size(x)
+            for x in sorted_symbols
+            if x.name.startswith("s") or x.name.startswith("ps")
         }
         return sympy_subs(index, replacements)
 
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 709d9981370a..50b5f360fda8 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -5,12 +5,13 @@
 import sys
 from copy import copy, deepcopy
 from pathlib import Path
-from typing import Dict, List
-from unittest.mock import patch
+from typing import ClassVar, Dict, List
 
+import numpy
 import sympy
 
 import torch
+import torch.fx
 from torch._prims_common import is_float_dtype
 
 from .. import codecache, config, ir, metrics
@@ -20,6 +21,7 @@
 from .common import (
     BracesBuffer,
     CppWrapperKernelArgs,
+    CSEVariable,
     DeferredIndentedBuffer,
     ExprPrinter,
     IndentedBuffer,
@@ -179,7 +181,7 @@ def _print_ModularIndexing(self, expr):
             x = f"({x} / {div})"
         return f"{x} % {mod}"
 
-    def _print_IndexingDiv(self, expr):
+    def _print_FloorDiv(self, expr):
         x, div = expr.args
         x = self.paren(self.doprint(x))
         div = self.paren(self.doprint(div))
@@ -189,6 +191,59 @@ def _print_IndexingDiv(self, expr):
 cexpr = CppPrinter().doprint
 
 
+@dataclasses.dataclass
+class OptimizationContext:
+    key: ClassVar[str] = "opt_ctx"
+
+    # Masked load
+    is_masked_load: bool = False
+    # Load value as mask
+    is_load_as_mask: bool = False
+
+    dtype: torch.dtype = torch.float
+    ops_name: str = ""
+    is_most_inner_loop_irrevelant: bool = False
+
+
+class RecordOptimizationContext:
+    def __init__(self, func_name: str = ""):
+        self.func_name = func_name
+        self.current_node: torch.fx.Node = None
+        self.opt_ctx: OptimizationContext = None
+
+    def __enter__(self):
+        assert V.interpreter
+        assert V.interpreter.current_node
+
+        self.current_node: torch.fx.Node = V.interpreter.current_node
+        if OptimizationContext.key in self.current_node.meta:
+            self.opt_ctx = self.current_node.meta[OptimizationContext.key]
+        else:
+            self.opt_ctx = OptimizationContext()
+        self.opt_ctx.ops_name = self.func_name
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        assert self.current_node
+        assert self.opt_ctx
+        self.current_node.meta[OptimizationContext.key] = self.opt_ctx
+
+    def get_opt_ctx(self):
+        return self.opt_ctx
+
+    def get_fx_node(self):
+        assert self.current_node
+        return self.current_node
+
+
+def get_current_node_opt_ctx() -> OptimizationContext:
+    assert V.interpreter.current_node
+    if OptimizationContext.key in V.interpreter.current_node.meta:
+        return V.interpreter.current_node.meta[OptimizationContext.key]
+    else:
+        return None
+
+
 class CppVecOverrides(OpOverrides):
     """Map element-wise ops to aten vectorization C++"""
 
@@ -266,6 +321,10 @@ def le(x, y):
     def ge(x, y):
         return f"{x} >= {y}"
 
+    @staticmethod
+    def and_(x, y):
+        return f"{x} & {y}"
+
     @staticmethod
     def rsqrt(x):
         return f"{x}.rsqrt()"
@@ -304,11 +363,15 @@ def lgamma(x):
 
     @staticmethod
     def logical_and(a, b):
-        return f"{a} && {b}"
+        return f"({a} != 0) & ({b} != 0)"
 
     @staticmethod
     def logical_or(a, b):
-        return f"{a} || {b}"
+        return f"({a} != 0) | ({b} != 0)"
+
+    @staticmethod
+    def tan(a):
+        return f"{a}.tan()"
 
     @staticmethod
     def tanh(a):
@@ -321,19 +384,81 @@ def tanh(a):
     def reciprocal(a):
         return f"{a}.reciprocal()"
 
+    @staticmethod
+    def atan(x):
+        return f"{x}.atan()"
+
+    @staticmethod
+    def acos(x):
+        return f"{x}.acos()"
+
+    @staticmethod
+    def asin(x):
+        return f"{x}.asin()"
+
+    @staticmethod
+    def log10(x):
+        return f"{x}.log10()"
+
+    @staticmethod
+    def erfc(x):
+        return f"{x}.erfc()"
+
+    @staticmethod
+    def nextafter(x):
+        return f"{x}.nextafter()"
+
+    @staticmethod
+    def copysign(a, b):
+        return f"{a}.copysign({b})"
+
+    @staticmethod
+    def atan2(a, b):
+        return f"{a}.atan2({b})"
+
+    @staticmethod
+    def hypot(a, b):
+        return f"{a}.hypot({b})"
+
+    @staticmethod
+    def atanh(x):
+        # For real x, atanh(x) = 1/2 * log((1+x)/(1-x))
+        vec_one = f"decltype({x})(1)"
+        vec_one_half = f"decltype({x})(0.5)"
+        return f"{vec_one_half} * (({vec_one} + {x})/({vec_one} - {x})).log()"
+
+    @staticmethod
+    def asinh(x):
+        # For real x, asinh(x) = log(x + sqrt(1 + x**2))
+        vec_one = f"decltype({x})(1)"
+        return f"({x} + ({vec_one} + {x}*{x}).sqrt()).log()"
+
+    @staticmethod
+    def acosh(x):
+        # For real x, acosh(x) = log(x + sqrt(x**2 -1))
+        vec_one = f"decltype({x})(1)"
+        return f"({x} + ({x}*{x} - {vec_one}).sqrt()).log()"
+
     @staticmethod
     def constant(val, dtype):
+        opt_ctx: OptimizationContext = get_current_node_opt_ctx()
+        assert opt_ctx
+        assert opt_ctx.dtype in [torch.int32, torch.float32]
+        proposed_dtype = opt_ctx.dtype
         if val == float("inf"):
-            quote = f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()"
+            assert proposed_dtype == torch.float
+            quote = f"std::numeric_limits<{DTYPE_TO_CPP[proposed_dtype]}>::infinity()"
         elif val == float("-inf"):
-            quote = f"-std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()"
+            assert proposed_dtype == torch.float
+            quote = f"-std::numeric_limits<{DTYPE_TO_CPP[proposed_dtype]}>::infinity()"
         elif math.isnan(val):
-            quote = f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::quiet_NaN()"
+            quote = f"std::numeric_limits<{DTYPE_TO_CPP[proposed_dtype]}>::quiet_NaN()"
         elif val is True or val is False:
-            quote = f"static_cast<{DTYPE_TO_CPP[dtype]}>({str(val).lower()})"
+            quote = f"static_cast<{DTYPE_TO_CPP[proposed_dtype]}>({str(val).lower()})"
         else:
-            quote = f"static_cast<{DTYPE_TO_CPP[dtype]}>({repr(val)})"
-        return f"at::vec::Vectorized<{DTYPE_TO_CPP[dtype]}>({quote})"
+            quote = f"static_cast<{DTYPE_TO_CPP[proposed_dtype]}>({repr(val)})"
+
+        return f"at::vec::Vectorized<{DTYPE_TO_CPP[proposed_dtype]}>({quote})"
 
     @staticmethod
     def relu(x):
@@ -404,6 +529,44 @@ def to_dtype(x, dtype):
     def log1p(x):
         return f"{x}.log1p()"
 
+    @staticmethod
+    def masked(mask, body, other):
+        opt_ctx: OptimizationContext = get_current_node_opt_ctx()
+        assert opt_ctx
+        assert opt_ctx.is_masked_load
+
+        code = BracesBuffer()
+
+        var = V.kernel.cse.newvar()
+        if other == float("-inf"):
+            code.writeline(
+                f"auto {var} = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());"
+            )
+        elif other == float("inf"):
+            code.writeline(
+                f"auto {var} = at::vec::Vectorized<float>(std::numeric_limits<float>::infinity());"
+            )
+        else:
+            code.writeline(f"auto {var} = at::vec::Vectorized<float>({other!r});")
+
+        with V.kernel.swap_buffers(code), code.indent():
+            result = body()
+            zero_val = "at::vec::Vectorized<float>(0)"
+            float_mask = f"to_float_mask({mask})"
+            blendv = f"decltype({result})::blendv({var}, {result}, {float_mask} != {zero_val})"
+            code.writeline(f"{var} = {blendv};")
+        V.kernel.compute.splice(code)
+        return var
+
+    @staticmethod
+    def index_expr(expr, dtype):
+        assert dtype == torch.int64
+        opt_ctx: OptimizationContext = get_current_node_opt_ctx()
+        assert opt_ctx
+        assert opt_ctx.dtype == torch.int32
+        assert opt_ctx.is_most_inner_loop_irrevelant
+        return f"at::vec::Vectorized<int>(static_cast<int>({cexpr(V.kernel.rename_indexing(expr))}))"
+
 
 class CppOverrides(OpOverrides):
     """Map element-wise ops to C++"""
@@ -425,6 +588,10 @@ def sin(x):
     def cos(x):
         return f"std::cos({x})"
 
+    @staticmethod
+    def neg(x):
+        return f"decltype({x})(-{x})"
+
     @staticmethod
     def exp(x):
         # return f"Sleef_expf_u10({x})"
@@ -454,6 +621,10 @@ def rsqrt(x):
     def log1p(x):
         return f"std::log1p({x})"
 
+    @staticmethod
+    def tan(x):
+        return f"std::tan({x})"
+
     @staticmethod
     def tanh(x):
         return f"std::tanh({x})"
@@ -514,6 +685,54 @@ def isnan(x):
     def lgamma(x):
         return f"std::lgamma({x})"
 
+    @staticmethod
+    def acos(x):
+        return f"std::acos({x})"
+
+    @staticmethod
+    def acosh(x):
+        return f"std::acosh({x})"
+
+    @staticmethod
+    def asin(x):
+        return f"std::asin({x})"
+
+    @staticmethod
+    def asinh(x):
+        return f"std::asinh({x})"
+
+    @staticmethod
+    def atan2(x, y):
+        return f"std::atan2({x}, {y})"
+
+    @staticmethod
+    def atan(x):
+        return f"std::atan({x})"
+
+    @staticmethod
+    def atanh(x):
+        return f"std::atanh({x})"
+
+    @staticmethod
+    def copysign(x, y):
+        return f"std::copysign({x}, {y})"
+
+    @staticmethod
+    def hypot(x, y):
+        return f"std::hypot({x}, {y})"
+
+    @staticmethod
+    def erfc(x):
+        return f"std::erfc({x})"
+
+    @staticmethod
+    def log10(x):
+        return f"std::log10({x})"
+
+    @staticmethod
+    def nextafter(x, y):
+        return f"std::nextafter({x}, {y})"
+
     @staticmethod
     def relu(x):
         return f"{x} * ({x}>0)"
@@ -558,26 +777,29 @@ def index_expr(expr, dtype):
     @staticmethod
     def masked(mask, body, other):
         code = BracesBuffer()
-        var = V.kernel.cse.newvar()
+
+        # Write masked operation into a lambda
+        body_var = V.kernel.cse.newvar()
+        code.writeline(f"auto {body_var} = [&]")
+        with V.kernel.swap_buffers(code), code.indent():
+            result = body()
+            code.writeline(f"return {result};")
+        code.writeline(";")
+        V.kernel.compute.splice(code)
+
+        # Use the lambda's return type as the type of other
+        type = f"decltype({body_var}())"
+
         if other == float("-inf"):
-            code.writeline(f"float {var} = -std::numeric_limits<float>::infinity();")
+            other_code = f"-std::numeric_limits<{type}>::infinity()"
         elif other == float("inf"):
-            code.writeline(f"float {var} = std::numeric_limits<float>::infinity();")
+            other_code = "std::numeric_limits<{type}>::infinity()"
         elif isinstance(other, bool):
-            if other:
-                code.writeline(f"auto {var} = true;")
-            else:
-                code.writeline(f"auto {var} = false;")
-        elif isinstance(other, float):
-            code.writeline(f"float {var} = {other};")
+            other_code = f"static_cast<{type}>({str(other).lower()})"
         else:
-            code.writeline(f"auto {var} = {other!r};")
-        code.writeline(f"if({mask})")
-        with V.kernel.swap_buffers(code), code.indent():
-            result = body()
-            code.writeline(f"{var} = {result};")
-        V.kernel.compute.splice(code)
-        return var
+            other_code = f"static_cast<{type}>({repr(other)})"
+
+        return f"{mask} ? {body_var}() : {other_code}"
 
     @staticmethod
     def logical_and(a, b):
@@ -597,8 +819,7 @@ def randn(seed: sympy.Expr, offset: sympy.Expr, dtype):
 
     @staticmethod
     def sigmoid(x):
-        x = ops.exp(f"-{x}")
-        return f"1 / (1 + {x})"
+        return f"decltype({x})(1) / (decltype({x})(1) + std::exp(-{x}))"
 
     @staticmethod
     def sign(x):
@@ -621,7 +842,7 @@ class CppKernel(Kernel):
     suffix = ";"
 
     def __init__(self, args, num_threads):
-        super(CppKernel, self).__init__(args)
+        super().__init__(args)
         self.call_ranges = None
         self.ranges = None
         self.itervars = None
@@ -843,7 +1064,7 @@ class CppVecKernel(CppKernel):
     overrides = CppVecOverrides
 
     def __init__(self, args, num_threads, tiling_factor=0):
-        super(CppVecKernel, self).__init__(args, num_threads)
+        super().__init__(args, num_threads)
         assert codecache.pick_vec_isa()
         if tiling_factor == 0:
             tiling_factor = codecache.pick_vec_isa().nelements()
@@ -871,22 +1092,27 @@ def load(self, name: str, index: sympy.Expr):
         expanded_index = sympy.expand(index)
         new_index = self.scale_index_with_offset(index, self.tiling_factor)
 
-        if expanded_index == new_index:
-            line = f"at::vec::Vectorized<float>({var}[{cexpr(index)}])"
-        else:
-            if V.graph.get_dtype(name) in [torch.bool, torch.uint8]:
-                nelements = codecache.pick_vec_isa().nelements()
-                if var not in self.var_vec_buf_map:
-                    self.var_vec_buf_map[var] = f"g_tmp_buffer_{var}"
-                    self.loads.writeline(
-                        f"float {self.var_vec_buf_map[var]}[{nelements}] = {{0}};"
-                    )
+        is_broadcast = expanded_index == new_index
+
+        var_expr = (
+            f"{var}[{cexpr(index)}]" if is_broadcast else f"{var} + {cexpr(new_index)}"
+        )
+
+        if V.graph.get_dtype(name) in [torch.bool, torch.uint8]:
+            nelements = codecache.pick_vec_isa().nelements()
+            if var not in self.var_vec_buf_map:
+                self.var_vec_buf_map[var] = f"g_tmp_buffer_{var}"
                 self.loads.writeline(
-                    f"flag_to_float({var} + {cexpr(new_index)}, {self.var_vec_buf_map[var]}, {nelements});"
+                    f"float {self.var_vec_buf_map[var]}[{nelements}] = {{0}};"
                 )
-                line = f"at::vec::Vectorized<float>::loadu({self.var_vec_buf_map[var]})"
-            else:
-                line = f"at::vec::Vectorized<float>::loadu({var} + {cexpr(new_index)})"
+            self.loads.writeline(
+                f"flag_to_float({var_expr}, {self.var_vec_buf_map[var]}, {nelements});"
+            )
+            line = f"at::vec::Vectorized<float>::loadu({self.var_vec_buf_map[var]})"
+        elif is_broadcast:
+            line = f"at::vec::Vectorized<float>({var_expr})"
+        else:
+            line = f"at::vec::Vectorized<float>::loadu({var_expr})"
 
         return self.cse.generate(self.loads, line)
 
@@ -1143,14 +1369,14 @@ def codegen_inner_loops(self, code):
 
 class CppVecKernelChecker(CppVecKernel):
     def __init__(self, args, num_threads, tiling_factor):
-        super(CppVecKernelChecker, self).__init__(args, num_threads, tiling_factor)
+        super().__init__(args, num_threads, tiling_factor)
 
-        # Since this kernel is only for checker but does not genreate any
+        # Since this kernel is only for checker but does not generate any
         # code, so we need to decrease the kernel count.
         metrics.generated_kernel_count -= 1
         metrics.generated_cpp_vec_kernel_count -= 1
 
-        # Used to recorde the graph wrapper code as the wrapper_code status could be
+        # Used to record the graph wrapper code as the wrapper_code status could be
         # changed during graph run.
         self._orig_wrapper_code = None
 
@@ -1161,45 +1387,114 @@ def __init__(self, args, num_threads, tiling_factor):
                 self.fast_vec_list.append(k)
         self.exit_stack = contextlib.ExitStack()
 
+        # Cache all the load result
+        self.load_results: list[CSEVariable] = []
+        self.load_supported_dtypes: list[torch.dtype] = [
+            torch.float,
+            torch.float32,
+            torch.bool,
+            torch.uint8,
+        ]
+        self.store_supported_dtypes: list[torch.dtype] = [torch.float, torch.float32]
+        # Cache the dtypes of the store operation. If the store is mixing dtypes, the
+        # vectorization would not support it as it is hard to determine the vec dtype
+        self.store_dtypes: list[torch.dtype] = []
+        # The dtype is used for vectorization
+        self.vec_dtype: torch.dtype = torch.float32
+
+    def is_indirect_indexing(self, index: sympy.Expr):
+        for _load_res in self.load_results:
+            # The index expression contains a value that loads from memory
+            if index.count(sympy_symbol(_load_res.name)) > 0:
+                return True
+        return False
+
     def could_vec(self, name: str, index: sympy.Expr):
         assert self.itervars is not None
         # Not a loop
         if len(self.itervars) == 0:
             return False
 
+        if self.is_indirect_indexing(index):
+            return False
+
         most_inner_var = self.itervars[-1]
         return self.is_invariant_under(most_inner_var, index) or self.is_stride1_at(
             most_inner_var, index
         )
 
+    def is_mask(self, name: str, users: Dict[torch.fx.Node, None]):
+        load_type = V.graph.get_dtype(name)
+        if load_type == torch.bool:
+            return all(user.target in ("where", "masked") for user in users.keys())
+        elif load_type == torch.uint8:
+            """
+            If the load value is torch.uint8, then we only support the loaded
+            value is as the mask.
+            """
+            if not all(
+                user.target == "to_dtype" and user.args[-1] == torch.bool
+                for user in users.keys()
+            ):
+                return False
+
+            for to_dtype_node in users.keys():
+                assert to_dtype_node.target == "to_dtype"
+                if not all(
+                    user.target in ("where", "masked")
+                    for user in to_dtype_node.users.keys()
+                ):
+                    return False
+            return True
+        else:
+            return False
+
     def load(self, name: str, index: sympy.Expr):
-        if not V.graph.get_dtype(name) in [
-            torch.float,
-            torch.float32,
-            torch.bool,
-            torch.uint8,
-        ]:
-            self.simd_vec = False
-            return self.simd_vec
+        with RecordOptimizationContext(__name__) as node_ctx:
+            load_dtype = V.graph.get_dtype(name)
+            opt_ctx: OptimizationContext = node_ctx.get_opt_ctx()
+            assert opt_ctx
+            opt_ctx.dtype = load_dtype
+            opt_ctx.is_load_as_mask = self.is_mask(name, node_ctx.get_fx_node().users)
 
-        index = self.rename_indexing(index)
-        self.simd_vec = self.simd_vec and self.could_vec(name, index)
-        return self.simd_vec
+            var = self.cse.newvar()
+            self.load_results.append(var)
+
+            if load_dtype in [torch.bool, torch.uint8] and not opt_ctx.is_load_as_mask:
+                self.simd_vec = False
+                return var
+
+            if load_dtype not in self.load_supported_dtypes:
+                self.simd_vec = False
+                return var
+
+            index = self.rename_indexing(index)
+            self.simd_vec = self.simd_vec and self.could_vec(name, index)
+            return var
 
     def store(self, name, index, value, mode=None):
-        if not V.graph.get_dtype(name) in [torch.float, torch.float32]:
-            self.simd_vec = False
-            return self.simd_vec
+        with RecordOptimizationContext(__name__) as node_ctx:
+            store_dtype = V.graph.get_dtype(name)
 
-        assert "buf" in name
-        index = self.rename_indexing(index)
+            opt_ctx: OptimizationContext = node_ctx.get_opt_ctx()
+            assert opt_ctx
+            opt_ctx.dtype = store_dtype
 
-        if mode:
-            self.simd_vec = False
-            return False
+            store_dtype = torch.float if store_dtype == torch.float32 else store_dtype
+            self.store_dtypes.append(store_dtype)
+            if store_dtype not in self.store_supported_dtypes:
+                self.simd_vec = False
+                return self.simd_vec
 
-        self.simd_vec = self.simd_vec and self.could_vec(name, index)
-        return self.simd_vec
+            assert "buf" in name
+            index = self.rename_indexing(index)
+
+            if mode:
+                self.simd_vec = False
+                return False
+
+            self.simd_vec = self.simd_vec and self.could_vec(name, index)
+            return self.simd_vec
 
     def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
         if (
@@ -1212,6 +1507,70 @@ def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
             self.simd_vec = False
         return self.simd_vec
 
+    def is_supported_cmp(self, node: torch.fx.Node):
+        def get_node_dtype(node):
+            if type(node) == torch.fx.Node:
+                opt_ctx: OptimizationContext = get_current_node_opt_ctx()
+                return opt_ctx.dtype if opt_ctx else None
+            else:
+                return None
+
+        def get_cmp_dtypes(node: torch.fx.Node):
+            return get_node_dtype(node.args[-2]), get_node_dtype(node.args[-1])
+
+        assert len(node.args) >= 2
+        # cmp(x, y): y is a magic value like x >= 1
+        if type(node.args[-1]) in [int, float]:
+            return True
+        # cmp(x, y): x is a magic value like 1 >= y
+        if type(node.args[-2]) in [int, float]:
+            return False
+
+        left_dtype, right_dtype = get_cmp_dtypes(node)
+        if left_dtype is None or right_dtype is None:
+            # TODO(Eikan): To record, deduce and propagate the data type of every expression.
+            return True
+        else:
+            return left_dtype == right_dtype
+
+    def is_load_only_block(self, sub_graph: torch.fx.Graph):
+        # The sub graph only contains "placeholder", "output", "get_index", "load"
+        is_load_only = False
+        load_dtype = None
+        skip_io_nodes = ["placeholder", "output"]
+        for _node in sub_graph.nodes:
+            if _node.op in skip_io_nodes:
+                continue
+
+            if _node.target not in ["load", "get_index", "constant"]:
+                # The body contains non load node
+                is_load_only = False
+                break
+
+            if _node.target == "load":
+                _, name, _ = _node.args
+                load_dtype = V.graph.get_dtype(name)
+                is_load_only = True
+
+            # Support "constant" node
+            if _node.target == "constant":
+                _, _, load_dtype = _node.args
+
+                # Create and record the context
+                opt_ctx = OptimizationContext()
+                opt_ctx.dtype = load_dtype
+                opt_ctx.ops_name = _node.target
+                _node.meta[OptimizationContext.key] = opt_ctx
+
+                # TODO: Support BF16 and FP16
+                if load_dtype in [torch.float32, torch.int32]:
+                    is_load_only = True
+                else:
+                    is_load_only = False
+                    break
+
+        return is_load_only, load_dtype
+
     def __exit__(self, exc_type, exc_val, exc_tb):
         assert self._orig_wrapper_code is not None
         # Restore the wrapper_code
@@ -1219,19 +1578,31 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         self.exit_stack.__exit__(exc_type, exc_val, exc_tb)
 
     def __enter__(self):
-        # Recorde the graph wrapper code. The wrapper_code status could be
+        # Record the graph wrapper code. The wrapper_code status could be
         # changed during graph run. Regarding this checker, we also need to
         # run the graph but we don't expect to change any status that would
-        # impact the code generation. Hence, we record the graph wapper code
-        # and replace it with a dummy warpper_code and then restore to the
+        # impact the code generation. Hence, we record the graph wrapper code
+        # and replace it with a dummy wrapper_code and then restore to the
         # original one as long as the checker is finished.
         self._orig_wrapper_code = V.graph.wrapper_code
         V.graph.wrapper_code = WrapperCodeGen()
 
         class VecCheckerProxy:
+            @staticmethod
+            def _bin_cmp_op(x, y):
+                current_node: torch.fx.Node = V.interpreter.current_node
+                if not self.is_supported_cmp(current_node):
+                    self.simd_vec = False
+                return self.simd_vec
+
             @staticmethod
             def __getattr__(name):
+                bin_cmp_ops = ["eq", "ne", "le", "ge", "lt", "gt"]
+
                 def inner(*args, **kwargs):
+                    if name in bin_cmp_ops:
+                        return VecCheckerProxy._bin_cmp_op(args, kwargs)
+
                     if not (name in self.fast_vec_list):
                         self.simd_vec = False
                     return self.simd_vec
@@ -1254,17 +1625,100 @@ def reduction(name, dtype, src_dtype, reduction_type, index, value):
 
             @staticmethod
             def constant(val, dtype):
-                supported_dtype = (torch.float32, torch.int32)
-                is_supported_dtype = dtype in (supported_dtype)
-                if not is_supported_dtype:
-                    self.simd_vec = False
-                return is_supported_dtype
+                with RecordOptimizationContext(__name__) as node_ctx:
+                    opt_ctx: OptimizationContext = node_ctx.get_opt_ctx()
+                    assert opt_ctx
+                    opt_ctx.dtype = dtype
+                    i32_iinfo = numpy.iinfo(numpy.int32)
+                    if (
+                        dtype == torch.int64
+                        and val <= i32_iinfo.max
+                        and val >= i32_iinfo.min
+                    ):
+                        opt_ctx.dtype = torch.int32
+
+                    f32_iinfo = numpy.finfo(numpy.float32)
+                    if dtype == torch.double:
+                        if (
+                            (val <= f32_iinfo.max and val >= f32_iinfo.min)
+                            or (val == numpy.inf)
+                            or (val == -numpy.inf)
+                        ):
+                            opt_ctx.dtype = torch.float32
+
+                    supported_dtype = (torch.float32, torch.int32)
+                    is_supported_dtype = opt_ctx.dtype in (supported_dtype)
+                    if not is_supported_dtype:
+                        self.simd_vec = False
+                    return is_supported_dtype
 
             @staticmethod
             def index_expr(expr, dtype):
-                self.simd_vec = False
-                tmp_var = self.cse.newvar()
-                return tmp_var
+                current_node: torch.fx.Node = V.interpreter.current_node
+
+                assert len(self.ranges) == len(self.itervars)
+                if not len(self.ranges) or not all(
+                    not isinstance(range, sympy.Expr) or sympy.simplify(range).is_number
+                    for range in self.ranges
+                ):
+                    # if the range value is sympy.Expr, we might could not deduce the accurate loop interval.
+                    self.simd_vec = False
+                    return self.cse.newvar()
+
+                def mod_indexing_rep(x, y, z):
+                    if z.is_constant():
+                        return x / y
+
+                    # never really happens, we'll bail on optimizing
+                    return (x / y) % z
+
+                def indexing_div_rep(x, y):
+                    return x / y
+
+                with RecordOptimizationContext(__name__) as node_ctx:
+                    assert len(self.ranges) == len(self.itervars)
+
+                    opt_ctx: OptimizationContext = node_ctx.get_opt_ctx()
+                    assert opt_ctx
+                    max_expr = expr.replace(
+                        ir.ModularIndexing, mod_indexing_rep
+                    ).replace(ir.FloorDiv, indexing_div_rep)
+                    min_expr = max_expr
+                    for idx in range(len(self.ranges)):
+                        max_expr = sympy.maximum(
+                            max_expr,
+                            self.itervars[idx],
+                            sympy.Interval(0, self.ranges[idx]),
+                        )
+                        min_expr = sympy.minimum(
+                            min_expr,
+                            self.itervars[idx],
+                            sympy.Interval(0, self.ranges[idx]),
+                        )
+                    i32_iinfo = numpy.iinfo(numpy.int32)
+                    if (
+                        dtype == torch.int64
+                        and max_expr.is_number
+                        and min_expr.is_number
+                        and max_expr <= i32_iinfo.max
+                        and min_expr >= i32_iinfo.min
+                    ):
+                        opt_ctx.dtype = torch.int32
+                    else:
+                        opt_ctx.dtype = dtype
+                        self.simd_vec = False
+
+                    # Pick the most inner loop variable since we always vectorize the
+                    # most inner loop
+                    most_inner_var = self.itervars[-1]
+                    most_inner_loop_irrevelant = self.is_invariant_under(
+                        most_inner_var, expr
+                    )
+                    if not most_inner_loop_irrevelant:
+                        self.simd_vec = False
+                    opt_ctx.is_most_inner_loop_irrevelant = most_inner_loop_irrevelant
+                    tmp_var = self.cse.newvar()
+                    return tmp_var
 
             @staticmethod
             def indirect_indexing(index_var):
@@ -1273,14 +1727,33 @@ def indirect_indexing(index_var):
 
             @staticmethod
             def masked(mask, body, other):
-                tmp_var = self.cse.newvar()
-                return tmp_var
+                with RecordOptimizationContext(__name__) as node_ctx:
+                    opt_ctx: OptimizationContext = node_ctx.get_opt_ctx()
+                    assert opt_ctx
+                    is_masked_load, load_dtype = self.is_load_only_block(body.graph)
+                    opt_ctx.dtype = load_dtype
+                    opt_ctx.is_masked_load = is_masked_load
+
+                    _simd_vec = is_masked_load and load_dtype in [
+                        torch.float32,
+                        torch.float,
+                    ]
+                    if not _simd_vec:
+                        self.simd_vec = False
+
+                    tmp_var = self.cse.newvar()
+                    return tmp_var
 
             @staticmethod
             def to_dtype(x, dtype):
-                if dtype != torch.bool:
-                    self.simd_vec = False
-                return x
+                with RecordOptimizationContext(__name__) as node_ctx:
+                    opt_ctx: OptimizationContext = node_ctx.get_opt_ctx()
+                    assert opt_ctx
+                    opt_ctx.dtype = dtype
+
+                    if dtype != torch.bool:
+                        self.simd_vec = False
+                    return x
 
         self.exit_stack.enter_context(V.set_ops_handler(VecCheckerProxy()))
         self.exit_stack.enter_context(V.set_kernel_handler(self))
@@ -1304,6 +1777,13 @@ def __init__(self, args, num_threads, tiling_factor):
     def check_can_tile2d(self, name: str, index: sympy.Expr):
         if not self.can_tile2d:
             return
+        # make sure the transpose_mxn(src, ld_src, dst, ld_dst) ld_src doesn't depend on most inner var.
+        if len(self.itervars) > 0 and not self.is_invariant_under(
+            self.itervars[-1], self.stride_at(self.itervars[-1], index)
+        ):
+            self.can_tile2d = False
+            return
+
         # check contiguity from any of the outer loops
         has_stride1 = False
         for loop_idx, itervar in enumerate(self.itervars[:-1]):
@@ -1317,6 +1797,7 @@ def check_can_tile2d(self, name: str, index: sympy.Expr):
                 else:
                     self.outer_tiling_idx = loop_idx
                 has_stride1 = True
+
         if not has_stride1 and not self.could_vec(name, index):
             self.can_tile2d = False
         return self.can_tile2d
@@ -1355,9 +1836,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 class CppKernelProxy(CppKernel):
     def __init__(self, kernel_group):
-        super(CppKernelProxy, self).__init__(
-            kernel_group.args, kernel_group.ws.num_threads
-        )
+        super().__init__(kernel_group.args, kernel_group.ws.num_threads)
         self.kernel_group = kernel_group
         self.loop_nest = None
         self.call_ranges = None
@@ -1414,8 +1893,7 @@ def run(kernel):
         # But the generated scalar kernel has updated these global contexts. Hence, the other kernels
         # should not do this again to avoid context conflict. By now, we only control the
         # config.inplace_buffers. In the future, we could maintain more contexts.
-        with patch.object(torch._inductor.config, "inplace_buffers", False):
-
+        with torch._inductor.config.patch(inplace_buffers=False):
             with CppVecKernelChecker(
                 deepcopy(self.kernel_group.args), parallel_num_threads(), tiling_factor
             ) as vec_checker:
@@ -1688,7 +2166,7 @@ def clone_inner():
         def do_split_with_tiling():
             sympy_factor = sympy.Integer(factor)
 
-            main_loop_range = ir.IndexingDiv(self.size, sympy_factor)
+            main_loop_range = ir.FloorDiv(self.size, sympy_factor)
             main_loop = LoopLevel(self.var, main_loop_range)
             main_loop.parallel = self.parallel
             main_loop.collapsed = False
diff --git a/torch/_inductor/codegen/cpp_prefix.h b/torch/_inductor/codegen/cpp_prefix.h
index c1c9c3bae112..08321da5ce95 100644
--- a/torch/_inductor/codegen/cpp_prefix.h
+++ b/torch/_inductor/codegen/cpp_prefix.h
@@ -69,3 +69,40 @@ void flag_to_float(const T* src, float* dst, int64_t n) {
     dst_u32[i] = *(src + i) ? 0xFFFFFFFF : 0;
   }
 }
+
+template <typename T, std::enable_if_t<std::is_same<T, bool>::value || std::is_same<T, uint8_t>::value, bool> = true>
+void flag_to_float(T src, float* dst, int64_t n) {
+#pragma unroll
+  for (int64_t i = 0; i < n; i++) {
+    uint32_t* dst_u32 = (uint32_t*)dst;
+    dst_u32[i] = src ? 0xFFFFFFFF : 0;
+  }
+}
+
+#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)
+template <typename SRC>
+inline at::vec::Vectorized<float> to_float_mask(at::vec::Vectorized<SRC>& src) {
+  assert(
+      at::vec::Vectorized<float>::size() == at::vec::Vectorized<SRC>::size());
+  at::vec::Vectorized<float> res_vec(0);
+  __at_align__ float dst_tmp[at::vec::Vectorized<float>::size()];
+  __at_align__ SRC src_tmp[at::vec::Vectorized<SRC>::size()];
+  src.store(src_tmp);
+
+#pragma unroll
+  for (int i = 0; i < at::vec::Vectorized<float>::size(); i++) {
+    dst_tmp[i] = src_tmp[i] ? 0xFFFFFFFF : 0;
+  }
+
+  return res_vec.loadu(dst_tmp);
+}
+
+template <>
+inline at::vec::Vectorized<float> to_float_mask(at::vec::Vectorized<int>& src) {
+#if defined(CPU_CAPABILITY_AVX2)
+  return at::vec::Vectorized<float>(_mm256_cvtepi32_ps(src));
+#else
+  return at::vec::Vectorized<float>(_mm512_cvtepi32_ps(src));
+#endif
+}
+#endif
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 68857c7993ab..38965930d12d 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -14,11 +14,13 @@
 
 from ..._dynamo import config as dynamo_config
 from .. import config, ir, scheduler
+from ..codecache import get_code_path
 from ..ir import ReductionHint
 from ..optimize_indexing import indexing_dtype_strength_reduction
 from ..utils import (
     get_fused_kernel_name,
     instance_descriptor,
+    next_power_of_2,
     sympy_product,
     sympy_subs,
     sympy_symbol,
@@ -28,12 +30,12 @@
 from .common import (
     CSEVariable,
     DeferredLine,
-    ExprPrinter,
     free_symbol_startswith,
     IndentedBuffer,
     index_prevent_reordering,
     Kernel,
     OpOverrides,
+    PythonPrinter,
     SizeArg,
     TensorArg,
 )
@@ -66,31 +68,22 @@ def config_of(args):
     def is_aligned(x):
         if isinstance(x, TensorArg):
             return x.buffer not in V.graph.unaligned_buffers
-        assert isinstance(x, SizeArg)
-        return V.graph.sizevars.maybe_guard_multiple_of(x.expr, ALIGNMENT)
+        if isinstance(x, SizeArg):
+            return V.graph.sizevars.maybe_guard_multiple_of(x.expr, ALIGNMENT)
+        raise NotImplementedError(f"unhandled {type(x)}: {x}")
 
     divisible_by_16 = [i for i, arg in enumerate(args) if is_aligned(arg)]
     return instance_descriptor(tuple(divisible_by_16), ())
 
 
-class TritonPrinter(ExprPrinter):
-    def _print_ModularIndexing(self, expr):
-        x, div, mod = expr.args
-        x = self.paren(self.doprint(x))
-        div = self.paren(self.doprint(div))
-        mod = self.paren(self.doprint(mod))
-        if div != "1":
-            x = f"({x} // {div})"
-        return f"{x} % {mod}"
-
-    def _print_IndexingDiv(self, expr):
-        x, div = expr.args
-        x = self.paren(self.doprint(x))
-        div = self.paren(self.doprint(div))
-        return f"({x} // {div})"
+class TritonPrinter(PythonPrinter):
+    def _print_floor(self, expr):
+        assert len(expr.args) == 1
+        return f"tl.libdevice.floor({self.paren(self._print(expr.args[0]))})"
 
 
 texpr = TritonPrinter().doprint
+pexpr = PythonPrinter().doprint
 
 
 def triton_compute_type(dtype):
@@ -138,6 +131,10 @@ class TritonOverrides(OpOverrides):
     def to_dtype(x, dtype: torch.dtype):
         if dtype == torch.bool:
             return f"({x} != 0)"
+        elif dtype == torch.uint8:
+            # to work around llvm uint conversion semantics
+            # that produces 0's for negative values
+            return f"{x}.to(tl.int8).to(tl.uint8)"
         return f"{x}.to({triton_compute_type(dtype)})"
 
     @staticmethod
@@ -227,6 +224,62 @@ def lgamma(x):
     def erf(x):
         return f"tl.libdevice.erf({x})"
 
+    @staticmethod
+    def cosh(x):
+        return f"tl.libdevice.cosh({x})"
+
+    @staticmethod
+    def sinh(x):
+        return f"tl.libdevice.sinh({x})"
+
+    @staticmethod
+    def acos(x):
+        return f"tl.libdevice.acos({x})"
+
+    @staticmethod
+    def acosh(x):
+        return f"tl.libdevice.acosh({x})"
+
+    @staticmethod
+    def asin(x):
+        return f"tl.libdevice.asin({x})"
+
+    @staticmethod
+    def asinh(x):
+        return f"tl.libdevice.asinh({x})"
+
+    @staticmethod
+    def atan2(x, y):
+        return f"tl.libdevice.atan2({x}, {y})"
+
+    @staticmethod
+    def atan(x):
+        return f"tl.libdevice.atan({x})"
+
+    @staticmethod
+    def atanh(x):
+        return f"tl.libdevice.atanh({x})"
+
+    @staticmethod
+    def copysign(x, y):
+        return f"tl.libdevice.copysign({x}, {y})"
+
+    @staticmethod
+    def erfc(x):
+        return f"tl.libdevice.erfc({x})"
+
+    @staticmethod
+    def hypot(x, y):
+        return f"tl.libdevice.hypot({x}, {y})"
+
+    @staticmethod
+    def log10(x):
+        return f"tl.libdevice.log10({x})"
+
+    @staticmethod
+    def nextafter(x, y):
+        return f"tl.libdevice.nextafter({x}, {y})"
+
     @staticmethod
     def logical_and(a, b):
         return f"{a} & {b}"
@@ -251,6 +304,10 @@ def rsqrt(x):
     def log1p(x):
         return f"tl.libdevice.log1p({x})"
 
+    @staticmethod
+    def tan(x):
+        return f"tl.libdevice.tan({x})"
+
     @staticmethod
     def tanh(x):
         return f"tl.libdevice.tanh({x})"
@@ -347,10 +404,12 @@ def __init__(
         var_ranges: Dict[sympy.Symbol, sympy.Expr],
         numel: sympy.Expr,
         prefix: str,
+        *,
+        kernel: "Kernel",
         divisor=sympy.Integer(1),
         length=sympy.Integer(1),
     ):
-        super(IterationRanges, self).__init__()
+        super().__init__()
         self.name = name
         self.var_list = var_list
         self.var_ranges = var_ranges
@@ -358,9 +417,10 @@ def __init__(
         self.prefix = prefix
         self.divisor = divisor
         self.length = length
+        self.kernel = kernel
 
     def is_loop(self):
-        return self.prefix == "r"
+        return self.prefix == "r" and not self.kernel.persistent_reduction
 
 
 class IterationRangesRoot(IterationRanges):
@@ -375,15 +435,15 @@ def __init__(
     ):
         if pid_cache is None:
             pid_cache = {}
-        super(IterationRangesRoot, self).__init__(
+        super().__init__(
             name=name,
             var_list=[],
             var_ranges={},
             numel=numel,
             prefix=prefix,
+            kernel=kernel,
         )
         self.index = index
-        self.kernel = kernel
         # Store all the nodes in one flat list
         self.nodes: Dict[sympy.Expr, IterationRangesEntry] = {}
         # This is for re-ordering program ID in triton mm template
@@ -399,7 +459,7 @@ def lookup(self, divisor, length):
         Lookup a given RangeTreeEntry, creating it if needed
         """
         if V.graph.sizevars.maybe_guard_equals(divisor * length, self.numel):
-            expr = ir.IndexingDiv(sympy_symbol(f"{self.prefix}index"), divisor)
+            expr = ir.FloorDiv(sympy_symbol(f"{self.prefix}index"), divisor)
         else:
             expr = ir.ModularIndexing(
                 sympy_symbol(f"{self.prefix}index"), divisor, length
@@ -448,12 +508,12 @@ def add(node):
         for node in nodes:
             if not V.graph.sizevars.maybe_guard_equals(node.divisor, divisor):
                 # fill in unused index var
-                add(self.lookup(divisor, ir.IndexingDiv(node.divisor, divisor)))
+                add(self.lookup(divisor, ir.FloorDiv(node.divisor, divisor)))
                 divisor = node.divisor
             add(node)
         if not V.graph.sizevars.maybe_guard_equals(self.numel, divisor):
             # fill in unused index var
-            add(self.lookup(divisor, ir.IndexingDiv(self.numel, divisor)))
+            add(self.lookup(divisor, ir.FloorDiv(self.numel, divisor)))
 
         return list(reversed(index_vars)), list(reversed(sizes))
 
@@ -470,6 +530,11 @@ def codegen_header(self, code):
         x = self.prefix
         if self.is_loop():
             code.writeline(f"{self.name} = {x}offset + {x}base")
+        elif x == "r" and self.kernel.persistent_reduction:
+            # no need to "roffset = "
+            code.writeline(
+                f"{self.name} = {self.ranges_code()}",
+            )
         else:
             pid = self.pid_cache_lookup(f"tl.program_id({self.index})")
             code.writelines(
@@ -490,7 +555,7 @@ def __init__(
         expr: sympy.Expr,
         parent: IterationRanges,
     ):
-        super(IterationRangesEntry, self).__init__(
+        super().__init__(
             name=name,
             numel=parent.numel / length,
             var_list=parent.var_list,
@@ -498,6 +563,7 @@ def __init__(
             prefix=parent.prefix,
             divisor=divisor,
             length=length,
+            kernel=parent.kernel,
         )
         self.parent = parent
         self.codegen = functools.lru_cache(None)(self._codegen)
@@ -522,6 +588,19 @@ def _codegen(self):
         self.writeline(f"{self.name} = " + texpr(V.kernel.rename_indexing(self.expr)))
         return self.name
 
+    def precomputed_args(self):
+        # for dynamic shapes, find parts of indexing expressions that have to be precomputed
+        precomputed_args = []
+        if isinstance(self.expr, sympy.Symbol):
+            return precomputed_args
+        assert isinstance(self.expr, (ir.FloorDiv, ir.ModularIndexing)), type(self.expr)
+        for arg in self.expr.args[1:]:
+            if not isinstance(arg, (sympy.Integer, sympy.Symbol)):
+                symbols = arg.free_symbols
+                if len(symbols) > 0 and all(s.name.startswith("s") for s in symbols):
+                    precomputed_args.append(arg)
+        return precomputed_args
+
     def symbol(self):
         return sympy_symbol(self.name)
 
@@ -534,7 +613,7 @@ def __eq__(self, other):
 
 class TritonKernel(Kernel):
     overrides = TritonOverrides
-    sexpr = texpr
+    sexpr = pexpr
 
     def __init__(
         self,
@@ -545,7 +624,7 @@ def __init__(
     ):
         if pid_cache is None:
             pid_cache = {}
-        super(TritonKernel, self).__init__()
+        super().__init__()
         self.numels = [V.graph.sizevars.simplify(s) for s in groups]
         self.mutations = mutations
         self.range_trees = []
@@ -557,8 +636,9 @@ def __init__(
         self.indexing_code = IndentedBuffer()
         self.suffix = IndentedBuffer()
         self.outside_loop_vars = set()
-        self.initialize_range_tree(pid_cache)
         self.reduction_hint = reduction_hint
+        self.persistent_reduction = self.should_use_persistent_reduction()
+        self.initialize_range_tree(pid_cache)
 
         # define this in a closure to make cache local to object
         @functools.lru_cache(None)
@@ -570,6 +650,23 @@ def simplify_indexing(index: sympy.Expr):
 
         self.simplify_indexing = simplify_indexing
 
+    def should_use_persistent_reduction(self):
+        """
+        Heuristic to set self.persistent_reduction and add guards
+        if needed.
+        """
+        if not (self.inside_reduction and config.triton.persistent_reductions):
+            return False
+        threshold = {
+            ReductionHint.INNER: 1024,
+        }.get(self.reduction_hint, 64)
+        hint = V.graph.sizevars.size_hint(self.numels[-1])
+        if hint > threshold:
+            return False
+        # will need to recompile if we cross a larger power of 2 boundary
+        V.graph.sizevars.guard_leq(self.numels[-1], next_power_of_2(hint))
+        return True
+
     def initialize_range_tree(self, pid_cache):
         names = ["xindex", "yindex", "zindex"][: len(self.numels) - 1] + ["rindex"]
         for i in range(len(self.numels)):
@@ -580,7 +677,7 @@ def initialize_range_tree(self, pid_cache):
             )
         for tree in self.range_trees:
             # reduction indexing goes inside a loop
-            if tree.prefix != "r":
+            if not tree.is_loop():
                 tree.codegen_header(self.body)
         if self.inside_reduction and self.range_trees[-1].is_loop():
             # workaround for this issue:
@@ -594,13 +691,15 @@ def ctx():
                 assert not self.inside_reduction
                 yield
                 return
-            # calling codegen_body() will flush all the pending buffers
-            # and write out a reduction loop
-            self.codegen_body()
+            if not self.persistent_reduction:
+                # calling codegen_body() will flush all the pending buffers
+                # and write out a reduction loop
+                self.codegen_body()
             self.inside_reduction = False
             yield
-            # flush out any code before opening the next loop
-            self.codegen_body()
+            if not self.persistent_reduction:
+                # flush out any code before opening the next loop
+                self.codegen_body()
             self.inside_reduction = True
 
         return ctx()
@@ -627,7 +726,7 @@ def add_range(i, expr):
                 raise CantSplit()
             # guard on the last item out
             sv.maybe_guard_equals(remaining[i], expr)
-            remaining[i] = ir.IndexingDiv(remaining[i], expr)
+            remaining[i] = ir.FloorDiv(remaining[i], expr)
             new_ranges[i].append(expr)
             return next(var_count)
 
@@ -658,7 +757,7 @@ def getter(flat_vars):
                     if not sv.maybe_guard_multiple_of(size, remaining[current_group]):
                         raise CantSplit()
                     size1 = remaining[current_group]
-                    size2 = ir.IndexingDiv(size, remaining[current_group])
+                    size2 = ir.FloorDiv(size, remaining[current_group])
                     return_getters.append(
                         make_combined(
                             size2,
@@ -749,6 +848,10 @@ def indexing(
         Compute the index and mask to pass to tl.load() or tl.store()
         """
         index = self.simplify_indexing(index)
+        index = sympy_subs(index, V.graph.sizevars.precomputed_replacements)
+        # if simple replacements didn't get rid of floor/ceil, try full subs
+        if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)):
+            index = index.subs(V.graph.sizevars.precomputed_replacements)
         index_vars = index.free_symbols
         index_str = texpr(self.rename_indexing(self.codegen_indexing(index)))
 
@@ -760,7 +863,7 @@ def indexing(
                 # indirect indexing
                 cse_var = self.cse.varname_map[var.name]
                 mask_vars.update(cse_var.mask_vars)
-            elif var.name.startswith("s"):
+            elif var.name.startswith(("s", "ps")):
                 pass
             else:
                 # var is one of xN, yN or rN
@@ -804,14 +907,25 @@ def indexing(
         if self._load_mask:
             mask_vars.add(self._load_mask)
 
-        if mask_vars == {"xmask"} and index == 0 and self.range_trees[0].numel == 1:
-            # This causes a triton error:
-            # https://github.com/openai/triton/issues/633
-            mask_vars = set()
+        self.filter_masks(mask_vars)
 
         mask_str = " & ".join(sorted(map(str, mask_vars))) if mask_vars else "None"
         return index_str, mask_vars, mask_str
 
+    def filter_masks(self, mask_vars):
+        for tree in self.range_trees:
+            # Masks are superfluous if we only have one element
+            if V.graph.sizevars.maybe_guard_equals(tree.numel, 1):
+                mask_vars.discard(f"{tree.prefix}mask")
+                continue
+            # Masks are superfluous if numel is a multiple of BLOCK
+            # (We use the fact that BLOCK is required by triton to be a power of 2)
+            if tree.prefix.upper() not in config.triton.max_block:
+                continue
+            max_block = config.triton.max_block[tree.prefix.upper()]
+            if V.graph.sizevars.maybe_guard_multiple_of(tree.numel, max_block):
+                mask_vars.discard(f"{tree.prefix}mask")
+
     def var_ranges(self):
         return dict(
             itertools.chain.from_iterable(
@@ -823,6 +937,15 @@ def codegen_indexing(self, expr: sympy.Expr):
         expr = V.graph.sizevars.simplify_with_ranges(expr, self.var_ranges())
         for sym in sorted(expr.free_symbols, key=str):
             if sym in self.range_tree_nodes:
+                # if indexing expression is complicated, we precompute it on the host side
+                # and send the result as a kernel argument
+                replacements = {}
+                for ps in self.range_tree_nodes[sym].precomputed_args():
+                    replacements[ps] = V.graph.sizevars.lookup_precomputed_size(ps)
+                if len(replacements) > 0:
+                    self.range_tree_nodes[sym].expr = sympy_subs(
+                        self.range_tree_nodes[sym].expr, replacements
+                    )
                 self.range_tree_nodes[sym].codegen()
         return expr
 
@@ -845,7 +968,7 @@ def load(self, name: str, index: sympy.Expr):
         original_index = index
         index, mask_vars, mask = self.indexing(index)
 
-        if "rmask" in mask:
+        if "rmask" in mask and not self.persistent_reduction:
             # This eviction policy heuristic is untested.
             # ptillet suggested we should try only doing this for
             # the first N-1 loops and not for the final loop.
@@ -856,7 +979,7 @@ def load(self, name: str, index: sympy.Expr):
         # "other" below is a workaround for https://github.com/openai/triton/issues/737
         # for bool, even though it's likely subject to the same bug, setting `other` leads
         # to LLVM errors so we are skipping it for now
-        if "tmp" in mask and V.graph.get_dtype(name) != torch.bool:
+        if ("tmp" in mask or "rmask" in mask) and V.graph.get_dtype(name) != torch.bool:
             other = ", other=0"
         else:
             other = ""
@@ -876,6 +999,7 @@ def load(self, name: str, index: sympy.Expr):
 
         if (
             self.inside_reduction
+            and not self.persistent_reduction
             and "rmask" not in mask
             and "tmp" not in mask
             and not indirect_indexing
@@ -913,7 +1037,9 @@ def store(self, name, index, value, mode=None):
     def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
         assert self.inside_reduction
         default = triton_constant(ir.Reduction.default_value(reduction_type, src_dtype))
-        masks = [f"{tree.prefix}mask" for tree in self.range_trees]
+        masks = {f"{tree.prefix}mask" for tree in self.range_trees}
+        self.filter_masks(masks)
+        masks = sorted(masks)
         if self._load_mask:
             masks.append(self._load_mask)
         sizes = [":" for _ in self.range_trees]
@@ -927,8 +1053,17 @@ def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
 
         dim = len(self.range_trees) - 1
         result_var = self.cse.newvar()
-        result_var.mask_vars = set(var for var in masks if var[0] != "r")
-        if (src_dtype, reduction_type, value) not in self.cse.reduction_cache:
+        result_var.mask_vars = {var for var in masks if var[0] != "r"}
+        if self.persistent_reduction:
+            cond = " & ".join(masks)
+            masked_value = self.cse.generate(
+                self.compute, f"tl.where({cond}, {value}, {default})"
+            )
+            result_var = self.cse.generate(
+                self.compute,
+                f"tl.{reduction_type}({masked_value}, {dim})[{', '.join(sizes)}]",
+            )
+        elif (src_dtype, reduction_type, value) not in self.cse.reduction_cache:
             self.cse.reduction_cache[(src_dtype, reduction_type, value)] = result_var
             accumulator = f"_{result_var}"
             default_value = f" + {default}" if default != 0 else ""
@@ -1014,7 +1149,7 @@ def codegen_body(self):
         ):
             return
 
-        if self.inside_reduction:
+        if self.inside_reduction and not self.persistent_reduction:
             self.body.writeline("for roffset in range(0, rnumel, RBLOCK):")
             with self.body.indent():
                 # last range tree is always reduction
@@ -1039,6 +1174,78 @@ def codegen_body(self):
         self.stores.clear()
         self.suffix.clear()
 
+    def codegen_kernel_benchmark(self):
+        result = IndentedBuffer()
+        argdefs, call_args, signature = self.args.python_argdefs()
+
+        result.writelines(["", "", "def get_args():"])
+        with result.indent():
+            for arg_name in call_args:
+                buf = V.graph.get_buffer(arg_name)
+                if buf:
+                    result.writeline(
+                        f"{arg_name} = rand_strided({tuple(buf.get_size())}, {tuple(buf.get_stride())}, device='{buf.get_device()}', dtype={buf.get_dtype()})"  # noqa: B950 line too long
+                    )
+                elif arg_name in V.graph.constants:
+                    # note that random seed is put in V.graph.constants
+                    const_tensor = V.graph.constants[arg_name]
+                    result.writeline(
+                        f"{arg_name} = rand_strided({tuple(const_tensor.size())}, {tuple(const_tensor.stride())}, device='{const_tensor.device}', dtype={const_tensor.dtype})"  # noqa: B950 line too long
+                    )
+                else:
+                    raise KeyError(
+                        f"Don't find the buffer or const tensor for {arg_name}"
+                    )
+            result.writeline(f"return {', '.join(call_args)},")
+
+        result.writelines(["\n", "\n", "def call(args):"])
+        grid = []
+        extra_args = []
+        with result.indent():
+            index = V.graph.scheduler.current_device.index
+            result.writeline(f"with torch.cuda._DeviceGuard({index}):")
+            with result.indent():
+                result.writeline(
+                    f"torch.cuda.set_device({index})"
+                )  # no-op to ensure context
+                for tree in self.range_trees:
+                    expr = pexpr(tree.numel)
+                    if tree.prefix != "r" or self.inside_reduction:
+                        extra_args.append(expr)
+                    if tree.prefix != "r":
+                        grid.append(expr)
+
+                stream_name = f"stream{index}"
+                result.writeline(f"{stream_name} = get_cuda_stream({index})")
+                extra_args_str = ", ".join(map(str, extra_args)) + ", "
+                result.writeline(
+                    f"triton_.run(*args, {extra_args_str}grid=grid({', '.join(grid)}), stream={stream_name})"
+                )
+
+        result.writelines(["\n", "\n", "if __name__ == '__main__':"])
+        with result.indent():
+            result.writeline(
+                "from torch._C import _cuda_getCurrentRawStream as get_cuda_stream"
+            )
+            result.writeline("from torch._dynamo.testing import rand_strided")
+            result.writeline("from torch._inductor.utils import get_num_bytes")
+            result.writeline("import torch")
+            result.writeline("from torch._inductor.triton_ops.autotune import grid")
+            result.writeline("from triton.testing import do_bench")
+            result.writeline("")
+
+            result.writeline("args = get_args()")
+            result.writeline(
+                "ms = do_bench(lambda: call(args), rep=40, fast_flush=True)[0]"
+            )
+            result.writeline("num_gb = get_num_bytes(*args) / 1e9")
+            result.writeline("gb_per_s = num_gb / (ms / 1e3)")
+            result.writeline(
+                'print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s")'
+            )
+
+        return result
+
     def codegen_kernel(self, name=None):
         from triton import next_power_of_2
 
@@ -1046,25 +1253,37 @@ def codegen_kernel(self, name=None):
         size_hints = [
             next_power_of_2(V.graph.sizevars.size_hint(numel)) for numel in self.numels
         ]
-        if not self.inside_reduction:
+        if self.persistent_reduction:
+            assert self.inside_reduction
+            heuristics = "persistent_reduction"
+        elif self.inside_reduction:
+            heuristics = "reduction"
+        else:
             size_hints.pop()
             heuristics = "pointwise"
-        else:
-            heuristics = "reduction"
 
         if name is None:
             code.splice(
                 f"""
                     import triton
                     import triton.language as tl
-                    from {config.inductor_import}.ir import ReductionHint
-                    from {config.inductor_import}.ir import TileHint
-                    from {config.inductor_import}.triton_ops.autotune import {heuristics}
-                    from {config.inductor_import}.utils import instance_descriptor
+                    from torch._inductor.ir import ReductionHint
+                    from torch._inductor.ir import TileHint
+                    from torch._inductor.triton_ops.autotune import {heuristics}
+                    from torch._inductor.utils import instance_descriptor
                 """
             )
 
         argdefs, _, signature = self.args.python_argdefs()
+        # maps actual expression to SizeArg if its in sizevars replacements
+        for i, arg in enumerate(signature):
+            if (
+                isinstance(arg, SizeArg)
+                and arg.expr in V.graph.sizevars.inv_precomputed_replacements
+            ):
+                signature[i] = SizeArg(
+                    arg.name, V.graph.sizevars.inv_precomputed_replacements[arg.expr]
+                )
 
         mutated_args = set()
         for mutation in self.mutations:
@@ -1104,10 +1323,12 @@ def codegen_kernel(self, name=None):
         if self.inside_reduction:
             reduction_hint = self.reduction_hint
             heuristics_line = f"""
-                @{heuristics}(size_hints={size_hints!r},
-                              reduction_hint={reduction_hint},
-                              filename=__file__,
-                              meta={triton_meta!r})
+                @{heuristics}(
+                    size_hints={size_hints!r},
+                    reduction_hint={reduction_hint},
+                    filename=__file__,
+                    meta={triton_meta!r}
+                )
                 @triton.jit
             """
         else:
@@ -1125,27 +1346,19 @@ def codegen_kernel(self, name=None):
         code.writeline(f"def {name or 'KERNEL_NAME'}({', '.join(argdefs)}):")
         self.codegen_body()
         with code.indent():
-            if not config.dynamic_shapes:
+            if not dynamo_config.dynamic_shapes:
                 self.codegen_static_numels(code)
             for old, new in self.args.aliases():
                 code.writeline(f"{old} = {new}")
             code.splice(self.body)
 
+        if config.benchmark_kernel:
+            code.splice(self.codegen_kernel_benchmark())
+
         if name is not None:
             return code.getvalue()
 
-        wrapper = IndentedBuffer()
-        wrapper.writeline("async_compile.triton('''")
-        wrapper.splice(code.getvalue(), strip=True)
-        wrapper.writeline("''')")
-        return wrapper.getvalue()
-
-    def codegen_template_wrapper(self, src_code):
-        wrapper = IndentedBuffer()
-        wrapper.writeline("async_compile.triton('''")
-        wrapper.splice(src_code, strip=True)
-        wrapper.writeline("''')")
-        return wrapper.getvalue()
+        return code.getvalue()
 
     def codegen_static_numels(self, code):
         """
@@ -1157,7 +1370,7 @@ def codegen_static_numels(self, code):
                     code.writeline(
                         f"{tree.prefix}numel = {V.graph.sizevars.size_hint(tree.numel)}"
                     )
-                elif not config.dynamic_shapes:
+                elif not dynamo_config.dynamic_shapes:
                     code.writeline(
                         f"{tree.prefix}numel = {V.graph.sizevars.size_hint(tree.numel)}  # dynamic_shapes=False"
                     )
@@ -1187,10 +1400,10 @@ def call_kernel(self, code, name: str):
         # TODO(jansel): if there are constants, we shouldn't bother passing them as args
         for tree in self.range_trees:
             if isinstance(tree.numel, (sympy.Integer, sympy.Symbol)):
-                expr = texpr(tree.numel)
+                expr = pexpr(tree.numel)
             else:
                 expr = f"{name}_{tree.prefix}numel"
-                code.writeline(f"{expr} = {texpr(tree.numel)}")
+                code.writeline(f"{expr} = {pexpr(tree.numel)}")
             if tree.prefix != "r" or self.inside_reduction:
                 call_args.append(expr)
             if tree.prefix != "r":
@@ -1282,6 +1495,7 @@ def codegen_nodes(self, nodes):
         _, (numel, rnumel) = max(nodes, key=lambda x: int(x.is_reduction())).group
         node_schedule = []
         current_loop_writes = set()
+        is_current_reductions = set()
         done = set()
 
         def fits_in_main_body(n):
@@ -1296,6 +1510,7 @@ def fits_outside_reduction(n):
 
         @contextlib.contextmanager
         def end_current_reduction_loop():
+
             if current_loop_writes:
                 # flush out any other runnable nodes to reduce number of loops
                 for other_node in nodes[index + 1 :]:
@@ -1308,6 +1523,7 @@ def end_current_reduction_loop():
                     ):
                         done.add(node)
                         current_loop_writes.add(node.get_name())
+                        is_current_reductions.add(node.is_reduction())
                         node_schedule.append(node)
 
             if node_schedule and node_schedule[-1] is EnableReduction:
@@ -1317,17 +1533,29 @@ def end_current_reduction_loop():
             yield
             node_schedule.append(EnableReduction)
             current_loop_writes.clear()
+            is_current_reductions.clear()
 
         for index, node in enumerate(nodes):
             if node in done:
                 continue
             done.add(node)
 
+            def requires_closing_previous_reduction(node, node_schedule):
+                if rnumel == 1:
+                    return False
+                if not current_loop_writes & node.recursive_predecessors:
+                    return False
+                assert node_schedule and not isinstance(
+                    node_schedule[-1], (EnableReduction, DisableReduction)
+                )
+                return True in is_current_reductions
+
             if fits_in_main_body(node):
-                if current_loop_writes & node.recursive_predecessors and rnumel != 1:
+                if requires_closing_previous_reduction(node, node_schedule):
                     with end_current_reduction_loop():
                         pass  # need to start a new reduction loop
                 current_loop_writes.add(node.get_name())
+                is_current_reductions.add(node.is_reduction())
                 node_schedule.append(node)
             elif fits_outside_reduction(node):
                 with end_current_reduction_loop():
@@ -1389,7 +1617,7 @@ def codegen_node_schedule(self, node_schedule, numel, reduction_numel):
                     stack.close()
                 else:
                     # TODO - mostly works but needs a couple fixes
-                    if not config.dynamic_shapes:
+                    if not dynamo_config.dynamic_shapes:
                         # TODO - use split ranges ?
                         indexing_dtype_strength_reduction(node._body)
                     index_vars = kernel.split_and_set_ranges(node.get_ranges())
@@ -1412,12 +1640,25 @@ def define_kernel(self, src_code, node_schedule):
             )
             kernel_name = "_".join(["triton", fused_name, wrapper.next_kernel_suffix()])
             wrapper.kernels[src_code] = kernel_name
-            subs_name = kernel_name if config.triton.ordered_kernel_names else "triton_"
+            subs_name = (
+                kernel_name
+                if config.triton.ordered_kernel_names
+                or config.triton.descriptive_kernel_names
+                else "triton_"
+            )
             src_code = src_code.replace("KERNEL_NAME", subs_name)
+
             # TODO(voz): Ostensibly, we should not need this. But there are cases where C++ codegen does
             # not use BracesBuffer, so we have no good indicator of a C++ buffer atm.
             src_code = src_code.replace("#pragma CMT", "#")
-            wrapper.define_kernel(kernel_name, src_code)
+
+            _, _, kernel_path = get_code_path(src_code, "py", extra="")
+            compile_wrapper = IndentedBuffer()
+            compile_wrapper.writeline("async_compile.triton('''")
+            compile_wrapper.splice(src_code, strip=True)
+            compile_wrapper.writeline("''')")
+
+            wrapper.define_kernel(kernel_name, compile_wrapper.getvalue(), kernel_path)
         return kernel_name
 
     def codegen_template(self, template_node, epilogue_nodes):
@@ -1434,7 +1675,7 @@ def codegen_template(self, template_node, epilogue_nodes):
             for node in epilogue_nodes:
                 node.codegen(kernel.split_and_set_ranges(node.get_ranges()))
 
-        src_code = kernel.codegen_template_wrapper(render())
+        src_code = render()
         kernel_name = self.define_kernel(src_code, [template_node, *epilogue_nodes])
         kernel.call_kernel(V.graph.wrapper_code, kernel_name)
         self.scheduler.free_buffers()
@@ -1540,7 +1781,7 @@ def select_tiling(cls, node_schedule, numel, reduction_numel=sympy.Integer(1)):
                     b0, b1 = ranked_tilings[0]
                 assert V.graph.sizevars.size_hint(a1 - b1) > 0
                 if V.graph.sizevars.maybe_guard_multiple_of(a1, b1):
-                    tiling = (a0, ir.IndexingDiv(a1, b1), b1)
+                    tiling = (a0, ir.FloorDiv(a1, b1), b1)
                     ranked_tilings = [tiling] + ranked_tilings
                     break  # only 1 choice for now
 
diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py
index 62d8dcd257e3..8a0d9c29dfb3 100644
--- a/torch/_inductor/codegen/wrapper.py
+++ b/torch/_inductor/codegen/wrapper.py
@@ -6,16 +6,17 @@
 from itertools import count
 from typing import Any, Dict, List
 
+import sympy
+
 from torch._dynamo.utils import dynamo_timed
 
 from .. import codecache, config, ir
-from ..codecache import cpp_compile_command, get_code_path
+from ..codecache import code_hash, cpp_compile_command, get_code_path
 from ..utils import cache_on_self, has_triton, sympy_dot, sympy_product
 from ..virtualized import V
-from .common import CodeGen, DeferredLine, IndentedBuffer, Kernel
-from .triton import texpr
+from .common import CodeGen, DeferredLine, IndentedBuffer, Kernel, PythonPrinter
 
-pexpr = texpr
+pexpr = PythonPrinter().doprint
 
 
 def buffer_reuse_key(node: ir.Buffer):
@@ -272,6 +273,7 @@ def __init__(self):
             f"""
                 from ctypes import c_void_p, c_long
                 import torch
+                import math
                 import random
                 from torch import empty_strided, as_strided, device
                 from {codecache.__name__} import AsyncCompile
@@ -286,20 +288,20 @@ def __init__(self):
 
         if has_triton():
             self.header.splice(
-                f"""
+                """
                 import triton
                 import triton.language as tl
-                from {config.inductor_import}.triton_ops.autotune import grid
+                from torch._inductor.triton_ops.autotune import grid, start_graph, end_graph
                 from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
                 """
             )
 
             if config.triton.convolution != "aten":
                 self.header.splice(
-                    f"""
-                    from {config.inductor_import}.triton_ops.conv_perf_model import early_config_prune
-                    from {config.inductor_import}.triton_ops.conv_perf_model import estimate_conv_time
-                    from {config.inductor_import}.triton_ops.autotune import conv_heuristics
+                    """
+                    from torch._inductor.triton_ops.conv_perf_model import early_config_prune
+                    from torch._inductor.triton_ops.conv_perf_model import estimate_conv_time
+                    from torch._inductor.triton_ops.autotune import conv_heuristics
                     """
                 )
 
@@ -312,6 +314,10 @@ def __init__(self):
 
         self.allocated = set()
         self.freed = set()
+
+        # maps from reusing buffer to reused buffer
+        self.reuses = dict()
+
         self.write_get_cuda_stream = functools.lru_cache(None)(
             self.write_get_cuda_stream
         )
@@ -345,19 +351,23 @@ def write_prefix(self):
             def call(args):
             """
         )
-        with self.wrapper_call.indent():
+        with self.prefix.indent():
             if config.triton.debug_sync_graph:
-                self.wrapper_call.writeline("torch.cuda.synchronize()")
+                self.prefix.writeline("torch.cuda.synchronize()")
             inp_len = len(V.graph.graph_inputs.keys())
             if inp_len != 0:
                 lhs = f"{', '.join(V.graph.graph_inputs.keys())}{'' if inp_len != 1 else ','}"
-                self.wrapper_call.writeline(f"{lhs} = args")
-                self.wrapper_call.writeline("args.clear()")
+                self.prefix.writeline(f"{lhs} = args")
+                self.prefix.writeline("args.clear()")
             for name in V.graph.randomness_seeds:
-                self.wrapper_call.writeline(
+                self.prefix.writeline(
                     f"torch.randint(2**31, size=(), dtype=torch.int64, out={name})"
                 )
-            V.graph.sizevars.codegen(self.wrapper_call, V.graph.graph_inputs)
+            V.graph.sizevars.codegen(self.prefix, V.graph.graph_inputs)
+
+    def append_precomputed_sizes_to_prefix(self):
+        with self.prefix.indent():
+            V.graph.sizevars.codegen_precomputed_sizes(self.prefix)
 
     def write_get_cuda_stream(self, index):
         name = f"stream{index}"
@@ -437,6 +447,14 @@ def can_reuse(self, buffer):
             return False
         return True
 
+    def did_reuse(self, buffer, reused_buffer):
+        # Check whether a given buffer was reused by a possible reuser in the wrapper codegen
+        # Can be consulted from inside ir codegen, e.g. to determine whether a copy is needed
+        return (
+            buffer.get_name() in self.reuses
+            and self.reuses[buffer.get_name()] == reused_buffer.get_name()
+        )
+
     def write_reuse_line(self, input_buffer, output_buffer):
         self.writeline(ReuseLine(input_buffer, output_buffer))
 
@@ -445,6 +463,7 @@ def codegen_inplace_reuse(self, input_buffer, output_buffer):
         self.codegen_allocation(input_buffer)
         self.freed.add(input_buffer.get_name())
         self.allocated.add(output_buffer.get_name())
+        self.reuses[output_buffer.get_name()] = input_buffer.get_name()
         self.write_reuse_line(input_buffer, output_buffer)
 
     def codegen_cuda_device_guard_enter(self, device_idx):
@@ -475,7 +494,6 @@ def generate_extern_kernel_out(
     def generate(self):
         result = IndentedBuffer()
         result.splice(self.header)
-        result.splice(self.prefix)
 
         out_names = V.graph.get_output_names()
         with contextlib.ExitStack() as stack:
@@ -488,6 +506,9 @@ def generate(self):
                     "with record_function('inductor_wrapper_call'):"
                 )
                 stack.enter_context(self.wrapper_call.indent())
+            if config.profile_bandwidth:
+                self.wrapper_call.writeline("start_graph()")
+
             while (
                 self.lines
                 and isinstance(self.lines[-1], MemoryPlanningLine)
@@ -520,8 +541,15 @@ def generate(self):
             output_refs = self.get_output_refs()
             if config.triton.debug_sync_graph:
                 self.wrapper_call.writeline("torch.cuda.synchronize()")
+
+            if config.profile_bandwidth:
+                self.wrapper_call.writeline("end_graph()")
+
             self.generate_return(output_refs)
 
+        self.append_precomputed_sizes_to_prefix()
+        result.splice(self.prefix)
+
         with result.indent():
             result.splice(self.wrapper_call)
 
@@ -546,6 +574,9 @@ def add_fake_input(name, shape, stride, device, dtype):
                 f"device='{device}', dtype={dtype})"
             )
 
+        def add_expr_input(name, val):
+            output.writeline(f"{name} = {val}")
+
         output.writelines(["", "", 'if __name__ == "__main__":'])
         with output.indent():
             output.splice(
@@ -562,18 +593,22 @@ def add_fake_input(name, shape, stride, device, dtype):
                 )
 
             for name, value in V.graph.graph_inputs.items():
-                shape = [V.graph.sizevars.size_hint(x) for x in value.get_size()]
-                stride = [V.graph.sizevars.size_hint(x) for x in value.get_stride()]
-                add_fake_input(
-                    name, shape, stride, value.get_device(), value.get_dtype()
-                )
+                if isinstance(value, sympy.Expr):  # Don't need to add symbolic
+                    add_expr_input(name, V.graph.sizevars.size_hint(value))
+                else:
+                    shape = [V.graph.sizevars.size_hint(x) for x in value.get_size()]
+                    stride = [V.graph.sizevars.size_hint(x) for x in value.get_stride()]
+                    add_fake_input(
+                        name, shape, stride, value.get_device(), value.get_dtype()
+                    )
 
             output.writeline(
                 f"print_performance(lambda: call([{', '.join(V.graph.graph_inputs.keys())}]))"
             )
 
-    def define_kernel(self, name: str, kernel: str):
-        self.header.splice(f"\n\n{name} = {kernel}")
+    def define_kernel(self, name: str, kernel: str, kernel_path: str = None):
+        kernel_path_comment = f"# kernel path: {kernel_path}\n" if kernel_path else ""
+        self.header.splice(f"\n\n{kernel_path_comment}{name} = {kernel}")
 
     def load_kernel(self, name: str = None, kernel: str = None, arg_types: List = None):
         return
@@ -633,6 +668,16 @@ def write_prefix(self):
             '''
             #include <dlfcn.h>
             #include <assert.h>
+
+            template <typename KernelFunc>
+            KernelFunc load_cpp_kernel(const char* so_filename) {
+                KernelFunc kernel_cpp;
+                auto kernel_cpp_lib = dlopen(so_filename, RTLD_NOW);
+                assert(kernel_cpp_lib != nullptr);
+                *(void **) (&kernel_cpp) = dlsym(kernel_cpp_lib, "kernel");
+                return kernel_cpp;
+            }
+
             """
         )
         with self.wrapper_call.indent():
@@ -687,7 +732,7 @@ def get_kernel_path(self, code):
 
         picked_vec_isa = pick_vec_isa()
         ext = "so"
-        extra = cpp_compile_command("i", "o", vec_isa=picked_vec_isa)
+        extra = code_hash(repr(cpp_compile_command("i", "o", vec_isa=picked_vec_isa)))
         # \n is required to match with the CodeCache behavior
         #  For reductions, the code string gotten from code.getvalue() will use backslash '\'
         # at the end of lines for readability purpose:
@@ -704,11 +749,9 @@ def get_kernel_path(self, code):
 
     def load_kernel(self, name: str = None, kernel: str = None, arg_types: List = None):
         kernel_path = self.get_kernel_path(kernel)
-
-        self.writeline(f'auto {name}_lib = dlopen("{kernel_path}", RTLD_NOW);')
-        self.writeline(f"assert({name}_lib != nullptr);")
-        self.writeline(f"void (*{name})({arg_types});")
-        self.writeline(f'*(void **) (&{name}) = dlsym({name}_lib, "kernel");')
+        self.writeline(
+            f'static auto {name} = load_cpp_kernel<void (*)({arg_types})>("{kernel_path}");'
+        )
 
     def wrap_kernel_call(self, name, call_args):
         return "{}({});".format(name, ", ".join(call_args))
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index beb027753c25..64ae64f480f9 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -4,25 +4,29 @@
 import logging
 import sys
 import warnings
-from typing import List
+from typing import Any, Callable, Dict, List, Optional
 
 import functorch
 from functorch.compile import min_cut_rematerialization_partition
 
+import torch._dynamo.config as dynamo_config
+
 import torch.fx
+import torch.utils._pytree as pytree
 
 from torch._dynamo import logging as dynamo_logging, utils as dynamo_utils
-from torch._dynamo.optimizations.normalize import normalize_ir
-from torch._dynamo.optimizations.training import aot_autograd
 from torch._dynamo.utils import fake_mode_from_tensors
 from torch._functorch.aot_autograd import make_boxed_func
+from torch._ops import OpOverload
 from torch._subclasses.fake_tensor import FakeTensor
-
-from . import config, metrics, overrides
+from .._dynamo.backends.common import aot_autograd
+from ..fx.graph import _PyTreeCodeGen
+from . import config, metrics, overrides, pattern_matcher
 from .debug import DebugContext
 from .decomposition import select_decomp_table
 from .graph import GraphLowering
-from .utils import get_dtype_size, has_incompatible_cudagraph_ops
+from .mkldnn import convert_outplace_to_inplace
+from .utils import developer_warning, get_dtype_size, has_incompatible_cudagraph_ops
 from .virtualized import V
 
 log = logging.getLogger(__name__)
@@ -87,11 +91,31 @@ def _warn_tf32_disabled():
         and torch.cuda.get_device_capability() >= (8, 0)
     ):
         warnings.warn(
-            "TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled."
+            "TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. "
             "Consider setting `torch.set_float32_matmul_precision('high')` for better performance."
         )
 
 
+def is_tf32_warning_applicable(gm: torch.fx.GraphModule):
+    aten = torch.ops.aten
+    tf32_ops = {
+        aten.mm.default,
+        aten.addmm.default,
+        aten.bmm.default,
+        aten.baddbmm.default,
+    }
+    for node in gm.graph.nodes:
+        if (
+            node.op == "call_function"
+            and node.target in tf32_ops
+            and isinstance(node.meta.get("val", None), torch.Tensor)
+            and node.meta["val"].dtype == torch.float32
+            and node.meta["val"].device.type == "cuda"
+        ):
+            return True
+    return False
+
+
 @DebugContext.wrap
 def count_bytes_inner(gm, example_inputs, num_fixed=0, **kwargs):
     shape_env = _shape_env_from_inputs(example_inputs)
@@ -115,7 +139,8 @@ def compile_fx_inner(
     is_backward=False,
     graph_id=None,
 ):
-    _warn_tf32_disabled()
+    if is_tf32_warning_applicable(gm):
+        _warn_tf32_disabled()
 
     if dynamo_utils.count_calls(gm.graph) == 0:
         return make_boxed_func(gm.forward)
@@ -130,24 +155,29 @@ def compile_fx_inner(
         f"{'BACKWARDS' if is_backward else 'FORWARDS'} "
         f"graph {graph_id}",
     )
-
     V.debug.fx_graph(gm, example_inputs)
 
     if cudagraphs is None:
         cudagraphs = config.triton.cudagraphs
 
     shape_env = _shape_env_from_inputs(example_inputs)
-    fake_mode = fake_mode_from_tensors(example_inputs)
-    graph = GraphLowering(
-        gm,
-        shape_env=shape_env,
-        num_static_inputs=num_fixed,
-        graph_id=graph_id,
-        fake_mode=fake_mode,
-    )
-    with V.set_graph_handler(graph):
-        graph.run(*example_inputs)
-        compiled_fn = graph.compile_to_fn()
+    fake_mode = fake_mode_from_tensors(
+        example_inputs
+    ) or torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True)
+
+    with V.set_fake_mode(fake_mode):
+        pattern_matcher.fx_passes(gm)
+        V.debug.fx_graph_transformed(gm, example_inputs)
+
+        graph = GraphLowering(
+            gm,
+            shape_env=shape_env,
+            num_static_inputs=num_fixed,
+            graph_id=graph_id,
+        )
+        with V.set_graph_handler(graph):
+            graph.run(*example_inputs)
+            compiled_fn = graph.compile_to_fn()
 
     if cudagraphs:
         complex_memory_overlap_inputs = any(
@@ -167,12 +197,14 @@ def compile_fx_inner(
             BoxedBool.disable(cudagraphs)
 
             if len(set(graph.device_types)) > 1:
-                log.warning("skipping cudagraphs due to multiple devices")
+                developer_warning("skipping cudagraphs due to multiple devices")
             elif set(graph.device_types) == {"cuda"}:
                 if graph.mutated_inputs:
-                    log.warning("skipping cudagraphs due to input mutation")
+                    developer_warning("skipping cudagraphs due to input mutation")
                 elif complex_memory_overlap_inputs:
-                    log.warning("skipping cudagraphs due to complex input striding")
+                    developer_warning(
+                        "skipping cudagraphs due to complex input striding"
+                    )
 
     result = align_inputs(compiled_fn, example_inputs, range(num_fixed))
     _step_logger()(
@@ -202,7 +234,8 @@ def is_aligned(storage_offset, dtype):
     check_inputs = [
         i
         for i in range(len(inputs))
-        if (
+        if isinstance(inputs[i], torch.Tensor)
+        and (
             i not in static_input_idxs
             or not is_aligned(inputs[i].storage_offset(), inputs[i].dtype)
         )
@@ -364,24 +397,68 @@ def compile_fx(
     model_: torch.fx.GraphModule,
     example_inputs_: List[torch.Tensor],
     inner_compile=compile_fx_inner,
+    config_patches: Optional[Dict[str, Any]] = None,
+    decompositions: Optional[Dict[OpOverload, Callable]] = None,
 ):
     """Main entrypoint to a compile given FX graph"""
+    if config_patches:
+        with config.patch(config_patches):
+            return compile_fx(
+                model_,
+                example_inputs_,
+                # need extra layer of patching as backwards is compiled out of scope
+                inner_compile=config.patch(config_patches)(inner_compile),
+                decompositions=decompositions,
+            )
+    recursive_compile_fx = functools.partial(
+        compile_fx,
+        inner_compile=inner_compile,
+        decompositions=decompositions,
+    )
+
+    if not graph_returns_tuple(model_):
+        return make_graph_return_tuple(
+            model_,
+            example_inputs_,
+            recursive_compile_fx,
+        )
+
+    if isinstance(model_, torch.fx.GraphModule):
+        with overrides.patch_functions():
+            model_ = overrides.replace_fx(model_)
+            model_ = overrides.fuse_fx(model_, example_inputs_)
+
+        if isinstance(model_.graph._codegen, _PyTreeCodeGen):
+            # this graph is the result of dynamo.export()
+            return handle_dynamo_export_graph(
+                model_,
+                example_inputs_,
+                recursive_compile_fx,
+            )
 
+    if any(isinstance(x, (list, tuple, dict)) for x in example_inputs_):
+        return flatten_graph_inputs(
+            model_,
+            example_inputs_,
+            recursive_compile_fx,
+        )
+
+    assert not config._raise_error_for_testing
     functorch.compile.config.use_functionalize = True
     functorch.compile.config.use_fake_tensor = True
-
-    with overrides.patch_functions():
-        model_ = normalize_ir(model_, example_inputs_)
-        model_ = overrides.replace_fx(model_)
-        model_ = overrides.fuse_fx(model_, example_inputs_)
     num_example_inputs = len(example_inputs_)
-    cudagraphs = BoxedBool(config.triton.cudagraphs and not config.dynamic_shapes)
-
+    cudagraphs = BoxedBool(
+        config.triton.cudagraphs and not dynamo_config.dynamic_shapes
+    )
     graph_id = next(_graph_counter)
 
     @dynamo_utils.dynamo_timed
     def fw_compiler(model: torch.fx.GraphModule, example_inputs):
         fixed = len(example_inputs) - num_example_inputs
+        # Why convert outplace op to inplace? Inductor can support inplace operations well and for custom
+        # inplace ops which are lowered as ExternKernel, it is beneficial to performance when the inplace
+        # implementation is used if available.
+        model = convert_outplace_to_inplace(model)
         return inner_compile(
             model,
             example_inputs,
@@ -403,17 +480,19 @@ def bw_compiler(model: torch.fx.GraphModule, example_inputs):
         )
 
     with overrides.patch_functions():
-
+        if decompositions is None:
+            decompositions = select_decomp_table()
         # TODO: can add logging before/after the call to create_aot_dispatcher_function
         # in torch._functorch/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func
         # once torchdynamo is merged into pytorch
         return aot_autograd(
             fw_compiler=fw_compiler,
             bw_compiler=bw_compiler,
-            decompositions=select_decomp_table(),
+            decompositions=decompositions,
             partition_fn=functools.partial(
                 min_cut_rematerialization_partition, compiler="inductor"
             ),
+            keep_inference_input_mutations=True,
         )(model_, example_inputs_)
 
 
@@ -431,3 +510,86 @@ def _shape_env_from_inputs(inputs):
 
     # TODO(voz): Should we always have one anyway?
     return None
+
+
+def output_node(gm: torch.fx.GraphModule):
+    """Get the output node from an FX graph"""
+    last_node = next(iter(reversed(gm.graph.nodes)))
+    assert last_node.op == "output"
+    return last_node
+
+
+def graph_returns_tuple(gm: torch.fx.GraphModule):
+    """True if a FX graph returns a tuple"""
+    if not isinstance(gm, torch.fx.GraphModule):
+        return True  # can't check this, assume true
+    (rv,) = output_node(gm).args
+    if isinstance(rv, (list, tuple)):
+        return True
+    return False
+
+
+def make_graph_return_tuple(gm: torch.fx.GraphModule, inputs, compile_gm):
+    """
+    Mutate gm so it returns a tuple.  This is only needed for graphs
+    not created by torchdynamo that return non-tuples.
+    """
+    node = output_node(gm)
+    (rv,) = node.args
+    rv, spec = pytree.tree_flatten(rv)
+    with gm.graph.inserting_before(node):
+        gm.graph.output(rv)
+    gm.graph.erase_node(node)
+    assert graph_returns_tuple(gm)
+
+    compiled_fn = compile_gm(gm, inputs)
+
+    @functools.wraps(compiled_fn)
+    def wrapper(*args, **kwargs):
+        return pytree.tree_unflatten(compiled_fn(*args, **kwargs), spec)
+
+    return wrapper
+
+
+def flatten_graph_inputs(gm: torch.fx.GraphModule, inputs, compile_gm):
+    """
+    Mutate inputs so that they are flat and wrap gm such that it
+    accepts those inputs.  This is only needed for graphs not created
+    by torchdynamo that take bumpy inputs.
+    """
+    inputs, spec = pytree.tree_flatten(inputs)
+
+    class GmWrapper(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.gm = gm
+
+        def forward(self, *args):
+            return self.gm(*pytree.tree_unflatten(args, spec))
+
+    compiled_fn = compile_gm(GmWrapper(), inputs)
+
+    @functools.wraps(compiled_fn)
+    def wrapper(*args):
+        # note this doesn't check the spec, assuming it is the same
+        return compiled_fn(*pytree.tree_flatten(args)[0])
+
+    return wrapper
+
+
+def handle_dynamo_export_graph(gm, inputs, compile_gm):
+    """
+    `torch._dynamo.export` embeds pytrees in the FX graph codgen object,
+    convert that to a normal FX graph so inductor can compile it.
+    """
+    codegen = gm.graph._codegen
+    gm.graph._codegen = torch.fx.graph.CodeGen()
+    gm.recompile()
+
+    compiled_fn = compile_gm(gm, codegen.process_inputs(*inputs))
+
+    @functools.wraps(compiled_fn)
+    def wrapper(*args):
+        return codegen.process_outputs(compiled_fn(*codegen.process_inputs(*args)))
+
+    return wrapper
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index dc1b77b8dd4a..beb0315a1618 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -1,6 +1,8 @@
 import os
 import sys
 
+import torch
+
 # add some debug printouts
 debug = False
 
@@ -16,11 +18,6 @@
 # dead code elimination
 dce = False
 
-# assume input tensors are dynamic
-dynamic_shapes = (
-    os.environ.get("TORCHDYNAMO_DYNAMIC_SHAPES") == "1"
-)  # Use dynamic shapes if torchdynamo dynamic shapes is set
-
 # assume weight tensors are fixed size
 static_weight_shapes = True
 
@@ -42,9 +39,20 @@
 # do epilogue fusions before other fusions
 epilogue_fusion_first = False
 
+# enable pattern match+replace optimizations
+pattern_matcher = True
+
+# enable reordering pass
+reordering = False
+
 # enable slow autotuning passes to select algorithms
 max_autotune = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE") == "1"
 
+# enable searching global and local cache regardless of `max_autotune`
+search_autotune_cache = (
+    os.environ.get("TORCHINDUCTOR_SEARCH_AUTOTUNE_CACHE", "1") == "1"
+)
+
 # control store vs recompute heuristic
 # For fanouts, rematearialization can lead to exponential blowup. So, have
 # smaller threshold
@@ -60,9 +68,6 @@
 # automatically create fallbacks when encountering an unhandled op
 implicit_fallbacks = True
 
-# Enables a fusion pass that groups nodes together before the scheduler
-prefuse_nodes = True
-
 # do bench to decide best layout, currently only for aten.conv
 tune_layout = False
 
@@ -77,13 +82,16 @@
 
 comment_origin = False
 
+benchmark_kernel = os.environ.get("TORCHINDUCTOR_BENCHMARK_KERNEL", "0") == "1"
 
-def is_fbcode():
-    import torch
 
+def is_fbcode():
     return not hasattr(torch.version, "git_version")
 
 
+# warnings intended for PyTorch developers, disable for point releases
+developer_warnings = is_fbcode() or "+" in torch.__version__
+
 compile_threads = (
     1
     if sys.platform == "win32" or is_fbcode()
@@ -95,13 +103,18 @@ def is_fbcode():
     )
 )
 
+# autotuning global cache path
+if is_fbcode():
+    from libfb.py import parutil
+
+    global_cache_path = parutil.get_file_path("fb/global_cache", pkg=__package__)
+else:
+    global_cache_path = None
+
 # If kernel is fused, the name is generated from the origin node op names
 # for larger kernels limit this
 kernel_name_max_ops = 10
 
-# How to import torchinductor, either torchinductor or torch.inductor
-inductor_import = __name__.replace(".config", "")
-
 # Pad input tensors of matmul/bmm/addmm to leverage Tensor Cores in NVIDIA GPUs
 shape_padding = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING", "0") == "1"
 
@@ -111,6 +124,14 @@ def is_fbcode():
 # Mark the wrapper call in PyTorch profiler
 profiler_mark_wrapper_call = False
 
+# used for debugging to make sure config is properly set
+_raise_error_for_testing = False
+
+_profile_var = os.environ.get("TORCHINDUCTOR_PROFILE", "")
+profile_bandwidth = _profile_var != ""
+profile_bandwidth_regex = "" if _profile_var == "1" else _profile_var
+
+
 # config specific to codegen/cpp.pp
 class cpp:
     # set to torch.get_num_threads()
@@ -130,18 +151,21 @@ class cpp:
         # "g++-11",
         # "g++-10",
         # "clang++",
-        "g++",
+        os.environ.get("CXX", "g++"),
         # "g++.par",
     )
     # Allow kernel performance profiling via PyTorch profiler
     enable_kernel_profile = False
 
+    # enable weight prepacking to get a better performance; may lead to large memory footprint
+    weight_prepack = True
+
 
 # config specific to codegen/triton.py
 class triton:
 
     # Use cudagraphs on output code
-    cudagraphs = True
+    cudagraphs = False
 
     # Synchronize before and after every compiled graph.
     debug_sync_graph = False
@@ -153,10 +177,6 @@ class triton:
     convolution = "aten"
 
     # Always load full blocks (rather than broadcasting inside the block)
-    # Set default as True because otherwise will encouter `map::at` error
-    # in triton if loading from 1-dim tensor using 2-dim pointer offset
-    # https://triton-lang.slack.com/archives/C01L1FLTX70/p1656023403343639
-    # could be set as False if triton fixes the bug later
     dense_indexing = False
 
     # limit tiling dimensions
@@ -169,10 +189,19 @@ class triton:
     # should we stop a fusion to allow better tiling?
     tiling_prevents_pointwise_fusion = True
     tiling_prevents_reduction_fusion = True
+
     # should we give different names to kernels
     ordered_kernel_names = False
+
     # should we put op names in kernel names
-    descriptive_kernel_names = True
+    descriptive_kernel_names = False
+
+    # use alternate codegen for smaller reductions
+    persistent_reductions = True
+
+    # theses are not enforced, but they are used by asserts in triton_ops/autotune.py
+    # NOTE: mobilevit_s in timm_models required X to be set to the higher value 2048
+    max_block = {"X": 2048, "Y": 1024, "Z": 1024}
 
 
 # create a directory containing lots of debug information
@@ -186,9 +215,12 @@ class trace:
     # Save python logger call >=logging.INFO
     info_log = False
 
-    # Save input FX graph (post decomps)
+    # Save input FX graph (post decomps, pre optimization)
     fx_graph = True
 
+    # Save FX graph after transformations
+    fx_graph_transformed = True
+
     # Save TorchInductor IR before fusion pass
     ir_pre_fusion = True
 
@@ -209,83 +241,7 @@ class trace:
     upload_tar = None
 
 
-class InductorConfigContext:
-    static_memory: bool
-    matmul_padding: bool
-    max_autotune: bool
-    triton_convolution: str
-    rematerialize_threshold: int
-    rematerialize_acc_threshold: int
-
-    def _save(self):
-        self.static_memory = triton.cudagraphs
-        self.matmul_padding = shape_padding
-        self.max_autotune = max_autotune
-        self.triton_convolution = triton.convolution
-        self.rematerialize_threshold = realize_reads_threshold
-        self.rematerialize_acc_threshold = realize_acc_reads_threshold
-
-    def _apply(self):
-        global shape_padding, realize_reads_threshold, realize_acc_reads_threshold, max_autotune
-        triton.cudagraphs = self.static_memory
-        shape_padding = self.matmul_padding
-        max_autotune = self.max_autotune
-        triton.convolution = self.triton_convolution
-        realize_reads_threshold = self.rematerialize_threshold
-        realize_acc_reads_threshold = self.rematerialize_acc_threshold
-
-    def __init__(self, arg=None):
-        self._save()
-        if arg is None:
-            return
-        # Handle mode
-        if type(arg) is str:
-
-            def default():
-                self.static_memory = False
-
-            def reduce_overhead():
-                self.static_memory = True
-
-            def max_autotune():
-                self.max_autotune = True
-
-            modes = {
-                x.__name__.replace("_", "-"): x
-                for x in [default, reduce_overhead, max_autotune]
-            }
-            if arg not in modes:
-                raise RuntimeError(
-                    f"Unrecognized mode {arg}, should be one of {', '.join(modes.keys())}"
-                )
-            modes[arg]()
-            return
-        # Handle passes
-        for (name, val) in arg.items():
-            attr_name = name.replace("-", "_")
-            if not hasattr(self, attr_name):
-                known_passes = ", ".join(
-                    [x.replace("_", "-") for x in dir(self) if not x.startswith("_")]
-                )
-                raise RuntimeError(
-                    f"Unexpected optimization pass {name}, known passes are {known_passes}"
-                )
-            if type(val) != type(getattr(self, attr_name)):
-                val_type_str = type(val).__name__
-                expected_type_str = type(getattr(self, attr_name)).__name__
-                raise RuntimeError(
-                    f"Unexpected type of attr {name}, got {val_type_str} should be {expected_type_str}"
-                )
-            setattr(self, attr_name, val)
-
-    def __enter__(self):
-        self._prev = InductorConfigContext()
-        self._apply()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self._prev._apply()
-
-
-from .._dynamo.config_utils import get_config_serialization_fns
-
-save_config, load_config = get_config_serialization_fns(sys.modules[__name__])
+from .._dynamo.config_utils import install_config_module
+
+# adds patch, save_config, etc
+install_config_module(sys.modules[__name__])
diff --git a/torch/_inductor/debug.py b/torch/_inductor/debug.py
index 111a21c23d8c..89edaabff995 100644
--- a/torch/_inductor/debug.py
+++ b/torch/_inductor/debug.py
@@ -290,7 +290,7 @@ def upload_tar(self):
             config.trace.upload_tar(tar_file)
 
     def __enter__(self):
-        log = logging.getLogger(config.inductor_import)
+        log = logging.getLogger("torch._inductor")
         if not log.handlers:
             init_logging()
 
@@ -318,7 +318,7 @@ def reset_log_level(level):
             self._prof.enable()
 
     def _setup_log_capture(self, filename, level):
-        log = logging.getLogger(config.inductor_import)
+        log = logging.getLogger("torch._inductor")
         fd = self._stack.enter_context(self.fopen(filename))
         ch = logging.StreamHandler(fd)
         ch.setLevel(level)
@@ -379,6 +379,12 @@ def fx_graph(self, gm: torch.fx.GraphModule, inputs: List[torch.Tensor]):
         with self.fopen("fx_graph_readable.py") as fd:
             fd.write(gm.print_readable(print_output=False))
 
+    def fx_graph_transformed(
+        self, gm: torch.fx.GraphModule, inputs: List[torch.Tensor]
+    ):
+        with self.fopen("fx_graph_transformed.py") as fd:
+            fd.write(gm.print_readable(print_output=False))
+
     def ir_pre_fusion(self, nodes: SchedulerNodeList):
         self._write_ir("ir_pre_fusion.txt", nodes)
 
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index b6f16b1427fe..9ede1d6dfcbd 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -6,128 +6,35 @@
 import torch
 import torch._decomp as decomp
 from torch import Tensor
-from torch._decomp import get_decompositions
-from torch._prims_common import is_boolean_dtype, is_integer_dtype
+from torch._decomp import core_aten_decompositions, get_decompositions
+from torch._decomp.decompositions import pw_cast_for_opmath
 from torch.utils._mode_utils import no_dispatch
 
 from . import config, utils
 
 log = logging.getLogger(__name__)
 aten = torch.ops.aten
-log = logging.getLogger(__name__)
 
-decompositions = get_decompositions(
+inductor_decompositions = get_decompositions(
     [
-        aten.linspace,
-        aten.logaddexp,
-        aten._adaptive_avg_pool2d_backward,
-        aten.addcmul,
-        aten.avg_pool2d_backward,
-        aten.binary_cross_entropy_with_logits,
-        aten.clamp_max,
-        aten.clamp_min,
-        aten.col2im,
-        aten.cudnn_batch_norm,
-        aten.cudnn_batch_norm_backward,
-        aten.detach,
-        aten.dot,
-        aten.elu,
-        aten.elu_backward,
-        aten._embedding_bag,
-        aten.embedding_dense_backward,
-        aten.expand_as,
-        aten.eye,
-        aten.ones_like,
-        aten.zeros_like,
-        aten.zeros,
-        aten.ones,
-        aten.fill,
+        aten.arange,
+        aten.bitwise_and_,
+        aten.bitwise_or_,
+        aten.clamp_min_,
         aten.flip,
-        aten._fused_moving_avg_obs_fq_helper,
-        aten.gelu,
-        aten.gelu_backward,
-        aten.glu_backward,
-        aten.grid_sampler_2d,
-        aten.hardsigmoid,
-        aten.hardsigmoid_backward,
-        aten.upsample_bilinear2d,
-        aten.hardswish,
-        aten.hardswish_,
-        aten.hardswish_backward,
-        aten.hardtanh,
-        aten.hardtanh_,
-        aten.hardtanh_backward,
-        aten.im2col,
-        aten.index_select,
-        aten.index_add,
-        aten.index_add_,
-        aten.index_copy,
-        aten.index_copy_,
-        aten.index_fill,
-        aten.index_fill_,
-        aten.l1_loss,
-        aten.leaky_relu,
-        aten.leaky_relu_,
-        aten.leaky_relu_backward,
+        aten.lcm,
         aten.linalg_vector_norm,
-        aten.logit,
-        aten.logit_backward,
-        aten._log_softmax,
-        aten._log_softmax_backward_data,
-        aten.logsumexp.default,
-        aten.masked_fill,
-        aten.masked_fill_,
-        aten.max_pool2d_with_indices_backward,
-        aten.mse_loss,
-        aten.mse_loss_backward,
-        aten.mv,
-        aten.narrow,
-        aten.native_batch_norm,
-        aten._native_batch_norm_legit,
-        aten._native_batch_norm_legit_functional,
-        aten.native_batch_norm_backward,
-        aten.native_dropout_backward,
-        aten.native_group_norm,
-        aten.native_group_norm_backward,
-        aten.native_layer_norm,
-        aten.native_layer_norm_backward,
-        aten.new_empty,
-        aten.new_full,
-        aten.new_zeros,
-        aten.new_ones,
-        aten.nll_loss_backward,
-        aten.nll_loss_forward,
-        aten.norm,
-        aten._reshape_alias,
-        aten.select_backward,
-        aten.select_scatter,
-        aten.sgn,
-        aten.sigmoid_backward,
-        aten.silu,
-        aten.silu_,
-        aten.silu_backward,
-        aten.slice_backward,
-        aten._softmax,
-        aten._softmax_backward_data,
-        aten.softplus,
-        aten.softplus_backward,
-        aten.stack,
-        aten.std_mean.correction,
-        aten.t,
-        aten.tanh_backward,
-        aten.threshold_backward,
+        aten.sin_,
+        aten.sqrt_,
+        aten.std,
+        aten.std_mean,
         aten._to_copy,
-        aten.transpose.int,
-        aten.tril.default,
-        aten.unfold,
-        aten.unfold_backward,
-        aten.upsample_bilinear2d.vec,
-        aten.upsample_nearest2d_backward,
-        aten.bucketize,
-        aten.zero_,
-        aten.zero,
+        aten.tril_indices,
+        aten.triu_indices,
+        aten.unsafe_split,
     ]
 )
+decompositions = {**core_aten_decompositions(), **inductor_decompositions}
 
 
 def register_decomposition(ops):
@@ -138,11 +45,12 @@ def register_decomposition(ops):
 
 
 @register_decomposition([aten.clamp])
+@pw_cast_for_opmath
 def clamp(x, min=None, max=None):
     if min is not None:
-        x = torch.maximum(x, torch.tensor(min, dtype=x.dtype, device=x.device))
+        x = x.clamp_min(min)
     if max is not None:
-        x = torch.minimum(x, torch.tensor(max, dtype=x.dtype, device=x.device))
+        x = x.clamp_max(max)
     return x
 
 
@@ -153,6 +61,18 @@ def floordiv(a, b):
     return aten.div.Tensor_mode(a, b, rounding_mode="floor")
 
 
+# Not really sure how to put this into the main library.  PrimTorch wants
+# empty_permuted to go to the prim, and typically users don't really want
+# to decompose to empty_strided (but inductor is OK with it, because we are
+# cool with strides and everything goes to empty_strided)
+@register_decomposition([aten.empty_permuted.default])
+def empty_permuted(size, physical_layout, **kwargs):
+    perm = [0] * len(size)
+    for p, l in enumerate(physical_layout):
+        perm[l] = p
+    return torch.empty([size[l] for l in physical_layout], **kwargs).permute(perm)
+
+
 def get_alignment_size(x):
     if x.dtype == torch.float16 or x.dtype == torch.half or x.dtype == torch.bfloat16:
         return 8
@@ -191,7 +111,7 @@ def addmm(input, mat1, mat2, *, beta=1, alpha=1):
         n_padded_length = get_padded_length(mat2.shape[1], get_alignment_size(mat2))
         if m_padded_length != 0 or k_padded_length != 0 or n_padded_length != 0:
             return pad_addmm(
-                input, mat1, mat2, m_padded_length, n_padded_length, k_padded_length
+                input, mat1, mat2, m_padded_length, k_padded_length, n_padded_length
             )
 
     return NotImplemented  # go directly to lowering
@@ -305,8 +225,8 @@ def should_pad_bench(mat1, mat2, op, input=None):
                 fast_flush=True,
             )[0]
 
-        # Shape padding introduces addtional memory ops. Based on microbenchmarks, 1.1x represents a reasonable
-        # tradeoff between performance improvement from shape padding and overhead from addtional memory ops
+        # Shape padding introduces additional memory ops. Based on microbenchmarks, 1.1x represents a reasonable
+        # tradeoff between performance improvement from shape padding and overhead from additional memory ops
         # TODO: Build a learned model which would be better than this heuristic
         return ori_time > pad_time * 1.1
 
@@ -417,41 +337,14 @@ def round_dec(x, decimals=0):
     return aten.round(x * ten_pow_decimals) * (1.0 / ten_pow_decimals)
 
 
-@register_decomposition([aten.rsub.Tensor, aten.rsub.Scalar])
-def rsub(a, b):
-    if isinstance(b, numbers.Number):
-        b = torch.tensor(b, dtype=a.dtype, device=a.device)
-    return b - a
-
-
-@register_decomposition([aten.nan_to_num])
-def nan_to_num(x, nan=0.0, posinf=None, neginf=None):
-    if is_boolean_dtype(x.dtype) or is_integer_dtype(x.dtype):
-        return x
-
-    if nan is None:
-        nan = 0.0
-    if posinf is None:
-        posinf = torch.finfo(x.dtype).max
-    if neginf is None:
-        neginf = torch.finfo(x.dtype).min
-    nan, posinf, neginf = (
-        torch.tensor(v, dtype=x.dtype, device=x.device) for v in (nan, posinf, neginf)
-    )
-    x = torch.where(x != x, nan, x)
-    x = torch.where(x == float("inf"), posinf, x)
-    x = torch.where(x == float("-inf"), neginf, x)
-    return x
-
-
 @register_decomposition([aten.all.default])
 def all(input):
     return torch.logical_not(torch.any(torch.logical_not(input)))
 
 
 @register_decomposition([aten.all.dim])
-def all_dim(input, dim, keeepdim=False):
-    return torch.logical_not(torch.any(torch.logical_not(input), dim, keeepdim))
+def all_dim(input, dim, keepdim=False):
+    return torch.logical_not(torch.any(torch.logical_not(input), dim, keepdim))
 
 
 # NB: this decomposition is not stride accurate, do not put it in the main
@@ -492,19 +385,27 @@ def bernoulli(self, *, generator=None):
     return torch.rand_like(self, dtype=torch.float32) < self
 
 
-@register_decomposition([aten.bernoulli.p])
-def bernoulli_p(self, p=0.5, *, generator=None):
-    assert generator is None
-    return torch.rand_like(self, dtype=torch.float32) < p
-
-
 """
 Some decomps result in differences from eager related to randomness.
 We put these decomps in a separate table `extra_random_decomps` to allow
 turning them on and off via `config.fallback_random`.
 """
 extra_random_decomps = get_decompositions(
-    [aten.native_dropout, aten.exponential, aten.exponential_, aten.uniform_]
+    [
+        aten.native_dropout,
+        aten.cauchy,
+        aten.cauchy_,
+        aten.exponential,
+        aten.exponential_,
+        aten.geometric,
+        aten.geometric_,
+        aten.normal,
+        aten.normal_,
+        aten.normal_functional,
+        aten.log_normal,
+        aten.log_normal_,
+        aten.uniform_,
+    ]
 )
 register_extra_random_decomp = functools.partial(
     decomp.register_decomposition, registry=extra_random_decomps
@@ -516,6 +417,12 @@ def bernoulli_(self, p=0.5):
     return self.copy_(torch.rand_like(self, dtype=torch.float32) < p)
 
 
+@register_extra_random_decomp([aten.bernoulli.p])
+def bernoulli_p(self, p=0.5, *, generator=None):
+    assert generator is None
+    return torch.rand_like(self, dtype=torch.float32) < p
+
+
 @functools.lru_cache(None)
 def fast_random_decomps():
     return {**decompositions, **extra_random_decomps}
diff --git a/torch/_inductor/dependencies.py b/torch/_inductor/dependencies.py
index 8d2d278b982d..4cbca047995e 100644
--- a/torch/_inductor/dependencies.py
+++ b/torch/_inductor/dependencies.py
@@ -20,7 +20,7 @@
 
 log = logging.getLogger(__name__)
 
-Dep = Union["MemoryDep", "StarDep"]
+Dep = Union["MemoryDep", "StarDep", "WeakDep"]
 
 
 class MemoryDep(typing.NamedTuple):
@@ -121,6 +121,24 @@ def is_contiguous(self) -> bool:
         return False
 
 
+# Used for tracking mutation ordering
+# if A reads a buffer and B mutates it
+# B must be ordered after A
+class WeakDep(typing.NamedTuple):
+    name: str
+
+    def rename(self, renames: Dict[str, str]) -> "WeakDep":
+        if self.name in renames:
+            return WeakDep(renames[self.name])
+        return self
+
+    def numbytes_hint(self):
+        return 1  # Purely inserted for ordering, not an actual dep
+
+    def is_contiguous(self) -> bool:
+        return False
+
+
 class IndexExprDep(typing.NamedTuple):
     index: sympy.Expr  # type: ignore[assignment]
     size: Tuple[sympy.Expr, ...]
@@ -143,10 +161,10 @@ def rename(self, renames: typing.Dict[str, str]) -> "ReadWrites":
             self.var_ranges,
         )
 
-    def with_read(self, name: str) -> "ReadWrites":
-        assert isinstance(name, str)
+    def with_read(self, dep: Dep) -> "ReadWrites":
+        assert isinstance(dep, (WeakDep, StarDep))
         return ReadWrites(
-            set.union(self.reads, {StarDep(name)}),
+            set.union(self.reads, {dep}),
             self.writes,
             self.index_exprs,
             self.range_vars,
@@ -163,6 +181,15 @@ def merge(self, other):
             index_exprs,
         )
 
+    def remove_reads(self, rem_reads):
+        return ReadWrites(
+            self.reads - rem_reads,
+            self.writes,
+            self.index_exprs,
+            self.range_vars,
+            self.var_ranges,
+        )
+
 
 class _RecordLoadStoreInner(V.MockHandler):
     def __init__(self, var_ranges: VarRanges, normalize: bool):
diff --git a/torch/_inductor/exc.py b/torch/_inductor/exc.py
index 8c6f2f262c4f..3278323aa066 100644
--- a/torch/_inductor/exc.py
+++ b/torch/_inductor/exc.py
@@ -3,8 +3,6 @@
 import textwrap
 from functools import lru_cache
 
-from . import config
-
 if os.environ.get("TORCHINDUCTOR_WRITE_MISSING_OPS") == "1":
 
     @lru_cache(None)
@@ -45,7 +43,7 @@ def __init__(self, target, args, kwargs):
 
                 There is a decomposition available for {target} in
                 torch._decomp.get_decompositions().  Please add this operator to the
-                `decompositions` list in {config.inductor_import}.decompositions
+                `decompositions` list in torch._inductor.decompositions
                 """
             )
         )
diff --git a/torch/_inductor/fx_utils.py b/torch/_inductor/fx_utils.py
index 3d228d4b4124..5daced969034 100644
--- a/torch/_inductor/fx_utils.py
+++ b/torch/_inductor/fx_utils.py
@@ -1,8 +1,7 @@
 import torch
 
-
-# Check the pattern: (nn.module, F.function) matched.
-# Works for length 2 patterns with 1 module and 1 function.
+# Check the pattern: (nn.module, F.function/torch.Tensor.method) matched.
+# Works for length 2 patterns with 1 module and 1 function/method.
 def matches_module_function_pattern(pattern, node, modules):
     if len(node.args) == 0:
         return False
@@ -19,8 +18,8 @@ def matches_module_function_pattern(pattern, node, modules):
         return False
     if type(modules[node.args[0].target]) is not pattern[0]:
         return False
-    # the second node is call_function
-    if node.op != "call_function":
+    # the second node is call_function or call_method
+    if node.op != "call_function" and node.op != "call_method":
         return False
     if node.target != pattern[1]:
         return False
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 029bfb9e740a..7ae6fee46cde 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -12,7 +12,12 @@
 import torch.fx
 from torch._decomp import get_decompositions
 from torch._dynamo.utils import dynamo_timed
-from torch.fx.experimental.symbolic_shapes import ShapeEnv
+from torch.fx.experimental.symbolic_shapes import (
+    magic_methods,
+    method_to_operator,
+    ShapeEnv,
+    SymTypes,
+)
 from torch.utils._mode_utils import no_dispatch
 
 from .._dynamo import config as dynamo_config
@@ -26,6 +31,7 @@
 )
 from .ir import Constant, FixedLayout, InputBuffer, Pointwise, Reduction, TensorBox
 from .lowering import (
+    FALLBACK_ALLOW_LIST,
     layout_constraints,
     lowerings,
     make_fallback,
@@ -59,6 +65,11 @@ def supported_dtype_of_cpp_wrapper(dtype):
     return dtype in supported_dtype
 
 
+def is_magic_method(op):
+    magic_ops = {method_to_operator(m) for m in magic_methods}
+    return op in magic_ops
+
+
 class GraphLowering(torch.fx.Interpreter):
     def symbolic_sizes_strides(self, ex: torch.Tensor):
         """
@@ -71,7 +82,19 @@ def symbolic_sizes_strides(self, ex: torch.Tensor):
                 ex.stride()
             )
         else:
-            size, stride = self._shape_env.create_symbolic_sizes_strides(ex)
+            from torch._dynamo.source import ConstantSource
+
+            # TODO: this should not be needed once #93059 lands
+            # https://github.com/pytorch/pytorch/pull/94031#discussion_r1096044816
+            # TODO: make a dedicated UnknownSource for this?
+            source = ConstantSource(
+                f"__unknown_tensor_{len(self._shape_env.var_to_val)}"
+            )
+            (
+                size,
+                stride,
+                _,
+            ) = self._shape_env.create_symbolic_sizes_strides_storage_offset(ex, source)
 
         size = [i.node.expr if isinstance(i, torch.SymInt) else i for i in size]
         stride = [i.node.expr if isinstance(i, torch.SymInt) else i for i in stride]
@@ -91,13 +114,8 @@ def __init__(
         shape_env=None,
         num_static_inputs=None,
         graph_id=None,
-        fake_mode=None,
     ):
         super().__init__(gm)
-        if fake_mode is None:
-            self.fake_mode = torch._subclasses.FakeTensorMode()
-        else:
-            self.fake_mode = fake_mode
         if shape_env is None:
             shape_env = ShapeEnv()
             self.reuse_shape_env = False
@@ -131,7 +149,18 @@ def __init__(
     def warn_fallback(self, name):
         if name not in self._warned_fallback:
             self._warned_fallback.add(name)
-            log.warning(f"Using FallbackKernel: {name}")
+            log.info(f"Using FallbackKernel: {name}")
+
+    @property
+    def fake_mode(self):
+        return V.fake_mode
+
+    def get_buffer(self, buffer_name: str):
+        if buffer_name in self.name_to_buffer:
+            return self.name_to_buffer[buffer_name]
+        if buffer_name in self.graph_inputs:
+            return self.graph_inputs[buffer_name]
+        return None
 
     def get_dtype(self, buffer_name: str):
         if buffer_name in self.constants:
@@ -257,6 +286,10 @@ def constant_name(self, name: str, device_override: torch.device):
 
     def placeholder(self, target: str, args, kwargs):
         example: torch.Tensor = super().placeholder(target, args, kwargs)
+        if isinstance(example, SymTypes):
+            expr = example.node.expr
+            self.graph_inputs[target] = expr
+            return expr
         # todo(chilli): We can remove the last check once we turn buffers into
         # static shape tensors. That's a hack to workaround Inductor believing
         # the buffer should be static but us passing in a fake tensor with
@@ -265,7 +298,7 @@ def placeholder(self, target: str, args, kwargs):
             config.static_weight_shapes
             and (
                 len(self.graph_inputs) < self.num_static_inputs
-                or not config.dynamic_shapes
+                or not dynamo_config.dynamic_shapes
             )
             and not example._has_symbolic_sizes_strides
         ):
@@ -290,14 +323,21 @@ def call_function(self, target, args, kwargs):
             if target is operator.getitem and isinstance(args[0], (list, tuple)):
                 return super().call_function(target, args, kwargs)
 
+            if hasattr(target, "_inductor_lowering_function"):
+                # passthrough lowerings from .pattern_matcher
+                return target(*args, **kwargs)
+
             if target not in lowerings:
-                if config.implicit_fallbacks:
+                base_name = target.name().split(".")[0]
+                if base_name in FALLBACK_ALLOW_LIST:
+                    make_fallback(target)
+                elif config.implicit_fallbacks:
                     error = (
                         MissingOperatorWithDecomp
                         if get_decompositions([target])
                         else MissingOperatorWithoutDecomp
                     )
-                    log.warning(
+                    log.info(
                         "Creating implicit fallback for:\n%s",
                         error.operator_str(target, args, kwargs),
                     )
@@ -356,6 +396,9 @@ def output(self, target, args, kwargs):
         ), result
         self.graph_outputs = [ir.ExternKernel.realize_input(x) for x in result]
         for name, value in self.graph_inputs.items():
+            assert isinstance(value, (TensorBox, sympy.Expr))
+            if not isinstance(value, TensorBox):
+                continue
             value.realize()
             assert isinstance(value, TensorBox)
             value = value.data
@@ -384,17 +427,29 @@ def run_node(self, n: torch.fx.Node):
                 args, kwargs = self.fetch_args_kwargs_from_env(n)
                 args, kwargs = layout_constraints[n.target](n, *args, **kwargs)
                 result = self.call_function(n.target, args, kwargs)
+            elif is_magic_method(n.target):
+                if isinstance(n.meta["val"], torch.SymInt):
+                    result = n.meta["val"].node.expr
+                else:
+                    result = super().run_node(n)
             else:
                 result = super().run_node(n)
 
             # require the same stride order for dense outputs,
-            # so that user-land view() will not throw because inductor
+            # 1. user-land view() will not throw because inductor
             # output different strides than eager
             # long term the solution is to make view() always succeed
             # with infallible strides.
-            if any(user.op == "output" for user in n.users) and isinstance(
-                n.meta["val"], torch.Tensor
-            ):
+            # 2: as_strided ops, we need make sure its input has same size/stride with
+            # eager model to align with eager behavior.
+            as_strided_ops = [
+                torch.ops.aten.as_strided.default,
+                torch.ops.aten.as_strided_.default,
+                torch.ops.aten.as_strided_scatter.default,
+            ]
+            if any(
+                user.op == "output" or user.target in as_strided_ops for user in n.users
+            ) and isinstance(n.meta["val"], torch.Tensor):
                 strides = n.meta["val"].stride()
                 dense = torch._prims_common.is_non_overlapping_and_dense(n.meta["val"])
                 # requiring a stride order for a non-dense output wouldn't
@@ -425,6 +480,7 @@ def run_node(self, n: torch.fx.Node):
                             torch.ops.aten.convolution.default,
                             torch.ops.aten.convolution_backward.default,
                             torch.ops.aten.mm.default,
+                            torch.ops.aten._int_mm.default,
                         ):
                             result = ir.ExternKernel.require_stride_order(
                                 result, ir.get_stride_order(n.meta["val"].stride())
@@ -505,17 +561,17 @@ def count_bytes(self):
         def get_read_write_buffers_sizes(node):
             if isinstance(node, NopKernelSchedulerNode):
                 return 0
-            reads = set(dep.name for dep in node.read_writes.reads)
-            writes = set(dep.name for dep in node.read_writes.writes)
+            reads = {dep.name for dep in node.read_writes.reads}
+            writes = {dep.name for dep in node.read_writes.writes}
 
             def is_materialized(buf):
-                buf_uses = set(
-                    [user.node for user in scheduler.name_to_node[buf].users]
-                )
+                buf_uses = {user.node for user in scheduler.name_to_node[buf].users}
                 return len(buf_uses - set(node.snodes)) > 0
 
             if isinstance(node, FusedSchedulerNode):
-                writes = set([dep for dep in writes if is_materialized(dep)])
+                removed_buffers = {dep for dep in writes if not is_materialized(dep)}
+                writes = writes - removed_buffers
+                reads = reads - removed_buffers
             node_bytes = 0
             for buf in reads | writes:
                 if buf in self.name_to_buffer:
@@ -550,8 +606,8 @@ def compile_to_module(self):
         for name, value in self.constants.items():
             setattr(mod, name, value)
 
-        if dynamo_config.output_code:
-            log.info("Output code: %s", mod.__file__)
+        if config.benchmark_kernel:
+            print(f"Compiled module path: {mod.__file__}", file=sys.stderr)
         V.debug.output_code(mod.__file__)
         V.debug.rename(os.path.splitext(mod.__file__)[0] + ".debug")
         return mod
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 89edef3520e6..25e2fa9c737b 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -24,6 +24,7 @@
     make_channels_last_strides_for,
     make_contiguous_strides_for,
 )
+from torch.fx.experimental.symbolic_shapes import FloorDiv
 
 from . import config, dependencies
 from .codegen.common import index_prevent_reordering
@@ -34,6 +35,7 @@
     cache_on_self,
     convert_shape_to_inductor,
     convert_shape_to_symint,
+    developer_warning,
     sympy_dot,
     sympy_product,
     sympy_subs,
@@ -45,6 +47,61 @@
 indent = functools.partial(textwrap.indent, prefix="  ")
 aten = torch.ops.aten
 
+""" [Note: Inductor IR]
+
+Inductor's IR is produced by executing 'lowering' code (see lowering.py).  Each
+lowering is registered to a particular aten operator, and expects inputs that
+correspond to the aten schema.  However, in place of torch Tensor inputs, lowerings
+expect Inductor TensorBox inputs.
+
+TensorBox IR represents torch tensors.  Tensors are sometimes single objects owning
+storage, and sometimes views of another Tensor's storage.  Mutating tensor operations
+(such as add_()) affect the underlying storage and any associated views.  Other operations
+(such as .t_()) update metadata about the current view but don't modify the underlying storage.
+
+To model this in Inductor, the IR distinguishes between TensorBox, View, StorageBox and Buffer.
+
+TensorBox is the top level IR construct that any lowering should produce and maps to a torch.Tensor
+output from an operation.  But just as torch.Tensors take different forms, TensorBox IR can
+reference View IR or directly reference StorageBox IRs.
+
+Some Inductor lowerings produce new sets of 'Box'es, while others (such as .t() or other view ops)
+may take an existing TensorBox and point it to a new underlying View IR.
+
+Tensors that directly own storage are represented as a chain of:
+TensorBox -> StorageBox -> Buffer
+where Buffer is a simple (1D) allocation, and StorageBox introduces the concept of a Layout.
+
+If you mutate the data of such a tensor, we swing the StorageBox pointer to point to a new buffer
+(leaving the old buffer unmodified and functionalizing the operation).
+
+Tensors backed by views add one more indirection to the IR.
+TensorBox -> View -> StorageBox -> Buffer
+In these cases, the underlying StorageBox/Buffer will be shared with the pre-view TensorBox.
+"""
+
+
+def validate_ir(node_or_nodes):
+    def _check_tensorbox(node):
+        # Could expand this to check deeper properties
+        # (e.g. TensorBox points to View or StorageBox)
+        assert isinstance(
+            node,
+            (
+                TensorBox,
+                RandSeedBuffer,
+                sympy.Symbol,
+                Expr,
+            ),
+        ), f"Found {type(node)}, which is not a supported top level IR node. See [Note: Inductor IR]"
+
+    # Be picky about the accepted data structure (don't use pytree here)
+    if isinstance(node_or_nodes, (List, Tuple)):
+        for node in node_or_nodes:
+            _check_tensorbox(node)
+    else:
+        _check_tensorbox(node_or_nodes)
+
 
 def inverse_reorder(order):
     inv_order = dict(zip(order, range(len(order))))
@@ -175,6 +232,7 @@ class ModularIndexing(sympy.Function):
     """
 
     nargs = (3,)
+    is_integer = True
 
     @classmethod
     def eval(cls, base, divisor, modulus):
@@ -215,48 +273,11 @@ def eval(cls, base, divisor, modulus):
             if len(new_terms) != len(base.args) and all_positive:
                 return ModularIndexing(sum(new_terms), divisor, modulus)
 
-        if isinstance(base, IndexingDiv):
+        if isinstance(base, FloorDiv):
             return ModularIndexing(base.args[0], base.args[1] * divisor, modulus)
 
 
-class IndexingDiv(sympy.Function):
-    """
-    a // b used in indexing where we need to be careful about simplification.
-    We don't use sympy.FloorDiv to bypass some simplification rules.
-    """
-
-    nargs = (2,)
-    precedence = 50  # precedence of mul  # noqa: F811
-
-    def _sympystr(self, printer):
-        base = printer.parenthesize(self.args[0], self.precedence)
-        divisor = printer.parenthesize(self.args[1], self.precedence)
-        return f"{base}//{divisor}"
-
-    @classmethod
-    def eval(cls, base, divisor):
-        if base == 0:
-            return sympy.Integer(0)
-        if divisor == 1:
-            return base
-        if isinstance(base, sympy.Integer) and isinstance(divisor, sympy.Integer):
-            return base // divisor
-        if isinstance(base, IndexingDiv):
-            return IndexingDiv(base.args[0], base.args[1] * divisor)
-
-        if isinstance(base, sympy.Add):
-            for a in base.args:
-                gcd = sympy.gcd(a, divisor)
-                if gcd == divisor:
-                    return IndexingDiv(base - a, divisor) + a / gcd
-        gcd = sympy.gcd(base, divisor)
-        if gcd != 1:
-            return IndexingDiv(
-                sympy.simplify(base / gcd), sympy.simplify(divisor / gcd)
-            )
-
-
-class CleanDiv(IndexingDiv):
+class CleanDiv(FloorDiv):
     """
     Div where we can assume no rounding.
     This is to enable future optimizations.
@@ -270,11 +291,13 @@ class CeilDiv(sympy.Function):
     Div used in indexing that rounds up.
     """
 
+    is_integer = True
+
     def __new__(cls, base, divisor):
         if sympy.gcd(base, divisor) == divisor:
             return CleanDiv(base, divisor)
         else:
-            return IndexingDiv(base + (divisor - 1), divisor)
+            return FloorDiv(base + (divisor - 1), divisor)
 
 
 def get_device_type(x):
@@ -294,7 +317,7 @@ def is_cpu(x):
 
 
 @dataclasses.dataclass
-class IRNode(object):
+class IRNode:
     _current_origins: ClassVar[Set[Any]] = set()
 
     @staticmethod
@@ -309,9 +332,11 @@ def __post_init__(self):
         self.origins = set(self._current_origins)
 
     def common_repr(self):
-        return (
-            [f"origins={self.origins}"] if hasattr(self, "origins") else ["no origins?"]
-        )
+        origins = f"origins={getattr(self, 'origins', '')}"
+        if len(origins) > 64:
+            # this can get *very* long
+            origins = f"{origins[:61]}..."
+        return [origins]
 
     def str_helper(self, lines):
         lines = lines + self.common_repr()
@@ -818,7 +843,7 @@ def const_fn(index):
             if reduction_type in ("argmin", "argmax"):
 
                 def fn(index):
-                    return 0
+                    return ops.constant(0, dst_dtype)
 
             else:
 
@@ -942,7 +967,7 @@ def create_multilayer(
             need_mask = True
 
         split = sympy.Integer(split)
-        block_size = IndexingDiv(reduction_numel + (split - 1), split)
+        block_size = FloorDiv(reduction_numel + (split - 1), split)
 
         reindex = View.dynamic_reshape_indexer(reduction_ranges, [reduction_numel])
 
@@ -1476,10 +1501,10 @@ def get_dtype(self):
         return self.layout.dtype
 
     def get_size(self):
-        return self.layout.size
+        return list(self.layout.size)
 
     def get_stride(self):
-        return self.layout.stride
+        return list(self.layout.stride)
 
     def make_loader(self):
         def loader(index):
@@ -1530,7 +1555,7 @@ def create(cls, x, dim, start, end, step=1):
             sizevars.guard_equals(end, new_size[dim])
             return x
 
-        new_size[dim] = IndexingDiv(end - start + (step - 1), step)
+        new_size[dim] = FloorDiv(end - start + (step - 1), step)
 
         if is_storage_and_layout(x):
             # Fast path
@@ -1620,7 +1645,7 @@ def __init__(
     ):
         self.device = device
         self.dtype = dtype
-        assert all(isinstance(s, Expr) or isinstance(s, int) for s in size)
+        assert all(isinstance(s, (Expr, int)) for s in size)
         self.size = size
         self._stride = stride
         self.offset = offset
@@ -1835,7 +1860,7 @@ def __init__(self, device, dtype, size, stride_order=None):
             strides = FlexibleLayout.fill_ordered(size, stride_order)
         else:
             strides = FlexibleLayout.contiguous_strides(size)
-        super(FlexibleLayout, self).__init__(device, dtype, size, strides)
+        super().__init__(device, dtype, size, strides)
 
 
 class AliasedLayout(Layout):
@@ -1938,10 +1963,10 @@ def get_dtype(self):
         return getattr(self.layout, "dtype", None)
 
     def get_size(self):
-        return self.layout.size
+        return list(self.layout.size)
 
     def get_stride(self):
-        return self.layout.stride
+        return list(self.layout.stride)
 
     def get_layout(self):
         return self.layout
@@ -2287,7 +2312,7 @@ def constant_to_device(self, device):
 
 class TemplateBuffer(Buffer):
     """
-    Represents a Triton (in the futurue other type) of template operator
+    Represents a Triton (in the future other type) of template operator
     that we can fuse an epilogue onto.
     """
 
@@ -2515,7 +2540,7 @@ def process_kernel(cls, kernel, *args, **kwargs):
                 tensor_args.append(arg)
             else:
                 if isinstance(arg, sympy.Expr):
-                    arg = V.graph.sizevars.shape_env.create_symintnode(arg)
+                    arg = V.graph.sizevars.shape_env.create_symintnode(arg, hint=None)
                 non_tensor_args.append(arg)
 
         def unflatten_args(new_tensor_args, new_non_tensor_args):
@@ -2556,7 +2581,7 @@ def convert_to_reinterpret_view(cls, x):
         """
         In order to pass this to an extern kernel we need a
         ReinterpretView not a View.  This allows us to avoid some
-        uneeded copies.
+        unneeded copies.
         """
         assert isinstance(x, BaseView)
         if isinstance(x, ReinterpretView):
@@ -2896,7 +2921,7 @@ def create(cls, x, device):
         V.graph.device_types.add(device.type)
         V.graph.device_types.add(x.get_device().type)
 
-        log.warning("DeviceCopy")
+        developer_warning("DeviceCopy in input program")
         return DeviceCopy(
             FlexibleLayout(
                 device=device,
@@ -2941,7 +2966,7 @@ def __init__(
         unflatten_args,
         kwargs=None,
     ):
-        super(FallbackKernel, self).__init__(
+        super().__init__(
             layout,
             tuple(tensor_args),
             tuple(nontensor_args),
@@ -2970,7 +2995,7 @@ def gen_kwarg(k, v):
         tensor_args = [Shim(x.codegen_reference()) for x in self.inputs]
         constant_args = [Shim(repr(x)) for x in self.constant_args]
         args, kwargs = self.unflatten_args(tensor_args, constant_args)
-        return list(map(repr, args)) + list(gen_kwarg(k, v) for k, v in kwargs.items())
+        return list(map(repr, args)) + [gen_kwarg(k, v) for k, v in kwargs.items()]
 
     @classmethod
     def create(cls, kernel, *args, **kwargs):
@@ -3026,6 +3051,8 @@ def generate_output(output, index=""):
                     packed,
                     index,
                 )
+            elif isinstance(output, int):
+                return output
             else:
                 assert output is None, "FallbackKernel output type is not supported"
                 return None
@@ -3073,10 +3100,8 @@ def __init__(
         self.preferred_stride_order = preferred_stride_order
 
     def codegen(self, wrapper):
-        if self.kernel == "triton_ops.conv":
-            wrapper.header.writeline(
-                f"import {config.inductor_import}.triton_ops.conv as {self.kernel}"
-            )
+        if self.kernel.startswith("triton_ops."):
+            wrapper.header.writeline("from torch._inductor import triton_ops")
         wrapper.writeline(
             f"{self.get_name()} = {self.kernel}({', '.join(self.codegen_args())})"
         )
@@ -3171,13 +3196,20 @@ def create(
             )
 
         # for conv2d or conv3d, prefer channels last format
+        transform_x_layout = config.triton.convolution != "aten"
         if kernel == "triton_ops.conv":
             output_layout_str = "torch.channels_last"
+        else:
+            output_layout_str = (
+                "torch.contiguous_format"
+                if output.is_contiguous()
+                else "torch.channels_last"
+            )
 
-        elif config.tune_layout and len(x.get_size()) == 4:
+        if config.tune_layout and len(x.get_size()) == 4:
             from .codegen.autotuner import tuned_conv_layout
 
-            output_layout_str = tuned_conv_layout(
+            faster_output_layout_str = tuned_conv_layout(
                 kernel,
                 x.get_size(),
                 weight.get_size(),
@@ -3190,13 +3222,9 @@ def create(
                 x.get_device(),
                 x.get_dtype(),
             )
-
-        else:
-            output_layout_str = (
-                "torch.contiguous_format"
-                if output.is_contiguous()
-                else "torch.channels_last"
-            )
+            if faster_output_layout_str != output_layout_str:
+                output_layout_str = faster_output_layout_str
+                transform_x_layout = True
 
         if output_layout_str == "torch.channels_last":
             stride_order = [0] + list(reversed(range(1, len(kernel_size) + 1)))
@@ -3208,7 +3236,7 @@ def create(
             stride_order = list(reversed(range(len(output_size))))
             strides = make_contiguous_strides_for(output_size)
 
-        if config.triton.convolution != "aten":
+        if transform_x_layout:
             x = cls.require_stride_order(x, stride_order)
 
         output_layout = FixedLayout(
@@ -3343,6 +3371,8 @@ def _prepare_convolution_fusion_create(
     stride_: List[int],
     dilation_: List[int],
     groups: int,
+    transposed: bool = False,
+    output_padding_: List[int] = None,
 ):
     """
     This function is a helper function to prepare inputs, layout and constant args
@@ -3351,28 +3381,92 @@ def _prepare_convolution_fusion_create(
     function only supports the CPU device since conv post-op fusion kernel is only
     supported on CPU right now.
     """
+
+    # Port from aten/src/ATen/native/ConvUtils.h: _conv_input_size
+    def _conv_input_size(
+        output_size, weight_size, padding, output_padding, stride, dilation, groups
+    ):
+        assert len(output_size) == len(weight_size), "Expect input dim == weight dim"
+        dim = len(output_size)
+        assert dim > 2, "Expect input dim > 2"
+
+        BATCH_DIM = 0
+        WEIGHT_INPUT_CHANNELS_DIM = 1
+        input_size = []
+        input_size.append(output_size[BATCH_DIM])
+        input_size.append(weight_size[WEIGHT_INPUT_CHANNELS_DIM] * groups)
+        for d in range(2, dim):
+            kernel = (weight_size[d] - 1) * dilation[d - 2] + 1
+            input_size_d = (
+                (output_size[d] - 1) * stride[d - 2]
+                - (padding[d - 2] * 2)
+                + kernel
+                + output_padding[d - 2]
+            )
+            input_size.append(input_size_d)
+        return list(map(int, input_size))
+
+    # The size of prepacked_weight is the prepacked weight size of deconv:
+    #   Groups > 1:  [g*o, i/g, ...]
+    #   Groups == 1: [o, i, ...]
+    # Returns original weight size in [i, o, ...]
+    def _original_deconv_weight_size(
+        prepacked_weight,
+        groups,
+    ):
+        prepacked_weight_size = prepacked_weight.size()
+        dim = len(prepacked_weight_size)
+        assert dim > 2, "Expect weight dim > 2"
+        if groups > 1:
+            weight_size = []
+            weight_size.append(prepacked_weight_size[1] * groups)
+            weight_size.append(prepacked_weight_size[0] / groups)
+            for d in range(2, dim):
+                weight_size.append(prepacked_weight_size[d])
+        else:
+            weight_size = prepacked_weight.transpose(0, 1).size()
+        return weight_size
+
     stride = tuple(stride_)
     padding = tuple(padding_)
     dilation = tuple(dilation_)
     assert isinstance(groups, int)
+    output_padding = tuple(output_padding_) if output_padding_ else (0, 0)
     with V.graph.fake_mode:
         x_fake = ir_node_to_tensor(x, guard_shape=True)
         weight_fake = ir_node_to_tensor(weight, guard_shape=True)
-        bias_fake = (
-            ir_node_to_tensor(bias, guard_shape=True) if bias is not None else bias
-        )
-        output = torch.ops.aten.convolution(
-            x_fake,
-            weight_fake,
-            bias_fake,
-            stride,
-            padding,
-            dilation,
-            False,
-            [0, 0],
-            groups,
-        )
-        output_size = output.size()
+        if transposed:
+            # When transposed, the size of the prepacked oneDNN weight is different
+            # from the PyTorch weight. We're not able to run aten conv with such
+            # size. We infer the output size from the input params here:
+            weight_size = _original_deconv_weight_size(weight_fake, groups)
+            input_size = x_fake.size()
+            output_size = _conv_input_size(
+                input_size,
+                weight_size,
+                padding,
+                output_padding,
+                stride,
+                dilation,
+                groups,
+            )
+        else:
+            bias_fake = (
+                ir_node_to_tensor(bias, guard_shape=True) if bias is not None else bias
+            )
+            output = torch.ops.aten.convolution(
+                x_fake,
+                weight_fake,
+                bias_fake,
+                stride,
+                padding,
+                dilation,
+                transposed,
+                output_padding,
+                groups,
+            )
+            output_size = output.size()
+
         req_stride_order = [0] + list(reversed(range(1, len(stride) + 1)))
         req_stride_order = [len(req_stride_order)] + req_stride_order
         output_stride = make_channels_last_strides_for(output_size)
@@ -3384,10 +3478,12 @@ def _prepare_convolution_fusion_create(
     kernel_layout = FixedLayout(
         x.get_device(),
         x.get_dtype(),
-        output.size(),
-        output_stride,
+        convert_shape_to_inductor(output_size),
+        convert_shape_to_inductor(output_stride),
     )
     constant_args = [padding, stride, dilation, groups]
+    if transposed:
+        constant_args.insert(1, output_padding)
 
     if bias is not None:
         inputs.append(bias)
@@ -3586,30 +3682,18 @@ def codegen(self, wrapper):
         )
 
     @classmethod
-    def create(cls, x, packed_w, orig_w, bias, batch_size):
+    def create(cls, x, packed_w, orig_w, batch_size):
         kernel = "torch.ops.mkl._mkl_linear"
 
-        with V.graph.fake_mode:
-            x_fake = ir_node_to_tensor(x, guard_shape=True)
-            weight_fake = ir_node_to_tensor(orig_w, guard_shape=True)
-            bias_fake = (
-                ir_node_to_tensor(bias, guard_shape=True) if bias is not None else bias
-            )
-            output = torch.ops.aten.linear(
-                x_fake,
-                weight_fake,
-                bias_fake,
-            )
-            output_size = output.size()
-            req_stride_order = list(reversed(range(len(output_size))))
-            output_stride = output.stride()
-        x = cls.require_stride_order(x, req_stride_order)
+        x = cls.require_stride1(cls.realize_input(x))
+        orig_w = cls.require_stride1(cls.realize_input(orig_w))
+        *m, _ = x.get_size()
+        oc, _ = orig_w.get_size()
+        output_size = list(m) + [oc]
+        output_stride = make_contiguous_strides_for(output_size)
         inputs = [x, packed_w, orig_w]
-        constant_args = [batch_size]
-        if bias is not None:
-            inputs.append(bias)
-        else:
-            constant_args.insert(0, bias)
+        bias = None
+        constant_args = [bias, batch_size]
 
         return MKLPackedLinear(
             layout=FixedLayout(
@@ -3722,6 +3806,62 @@ def apply_constraint(self):
         pass
 
 
+class ConvolutionTransposeUnary(ExternKernelAlloc):
+    kernel = "torch.ops.mkldnn._convolution_transpose_pointwise"
+
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+        kernel="torch.ops.mkldnn._convolution_transpose_pointwise",
+    ):
+        super().__init__(layout, inputs, constant_args)
+        self.kernel = kernel
+
+    def codegen(self, wrapper):
+        wrapper.writeline(
+            f"{self.get_name()} = {self.kernel}({', '.join(self.codegen_args())})"
+        )
+
+    @classmethod
+    def create(
+        cls,
+        x: "TensorBox",
+        weight: "TensorBox",
+        bias: "TensorBox",
+        padding_: List[int],
+        output_padding_: List[int],
+        stride_: List[int],
+        dilation_: List[int],
+        groups_: int,
+        attr,
+        scalars,
+        algorithm,
+    ):
+        kernel = "torch.ops.mkldnn._convolution_transpose_pointwise"
+        transposed = True
+        (inputs, constant_args, kernel_layout, _,) = _prepare_convolution_fusion_create(
+            cls,
+            x,
+            weight,
+            bias,
+            padding_,
+            stride_,
+            dilation_,
+            groups_,
+            transposed,
+            output_padding_,
+        )
+        constant_args = constant_args + [attr, scalars, algorithm]
+        return ConvolutionTransposeUnary(
+            layout=kernel_layout,
+            inputs=inputs,
+            constant_args=constant_args,
+            kernel=kernel,
+        )
+
+
 @dataclasses.dataclass
 class MutableBox(IRNode):
     """
@@ -3866,6 +4006,15 @@ def __init__(self, graph, submodules):
         self.env = {}
         self.fetch_attr = submodules.__getitem__
         self.name = "InterpreterShim"
+        self.current_node = None
+
+    def run_node(self, n: torch.fx.Node) -> Any:
+        self.current_node = n
+        return super().run_node(n)
+
+    def run(self, *args, **kwargs):
+        with V.set_interpreter_handler(self):
+            return super().run(*args, **kwargs)
 
 
 class LoopBody:
@@ -4053,3 +4202,164 @@ def debug_str(self, name="block"):
             "",
             code.strip().replace("def forward(", f"def {name}("),
         )
+
+
+class Wait(ExternKernel):
+    """
+    Wait should not be used by itself.  It should always be constructed in tandem
+    with a collective op that produces a work to wait on.
+    """
+
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+    ):
+        super().__init__(None, layout, inputs, constant_args)
+        self.name = V.graph.register_buffer(self)
+
+    def should_allocate(self):
+        return False
+
+    def codegen(self, wrapper):
+        (input_collective,) = [t.codegen_reference() for t in self.inputs]
+        work = f"{input_collective}_work"  # hacky way to name work objs..
+        wrapper.writeline(f"{work}.wait()")
+
+        # wait op still needs to produce a 'buffer' that represents the tensor output.
+        # this is a symbolic gesture, and it gets handled by WrapperCodegen.
+        # codegen outputs a '# reuse' line that assigns the input buffer here ('input_collective')
+        # to a new name (`self.get_name()`) and `del`s the old name.
+        wrapper.writeline(f"{self.get_name()} = {input_collective}")
+
+    @classmethod
+    def create(cls, collective_op: "TensorBox"):
+        return Wait(
+            layout=collective_op.get_layout(),
+            inputs=[collective_op],
+        )
+
+    def get_alias_names(self):
+        # Signal to codegen that our output buffer isn't safe to reuse
+        return [self.inputs[0].codegen_reference()]
+
+
+class AllReduce(ExternKernel):
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+    ):
+        super().__init__(None, layout, inputs, constant_args)
+        self.name = V.graph.register_buffer(self)
+
+    def should_allocate(self):
+        return True
+
+    @classmethod
+    def create(
+        cls, x: "TensorBox", reduce_op: str, tag: str, ranks: List[int], group_size: int
+    ):
+        x = cls.realize_input(x)
+
+        # is there a difference between literally using x.data.layout below, vs
+        # creating a new one that has the same properties?
+        new_layout = FlexibleLayout(x.get_device(), x.get_dtype(), x.get_size())
+
+        # AllReduce returns a 'work' object.  But Inductor's scheduler doesn't need to know
+        # about that, and we just pretend for scheduling purposes that the work obj is a 1-elem tensor.
+        # Nobody should consume the output of AllReduce except 'Wait', which we control here.
+        return AllReduce(
+            layout=new_layout,
+            inputs=[x],
+            constant_args=[reduce_op, tag, ranks, group_size],
+        )
+
+    def codegen(self, wrapper):
+        wrapper.add_import_once("import torch.distributed as dist")
+        wrapper.add_import_once(
+            "from torch.distributed._functional_collectives import _str_to_reduce_op"
+        )
+        wrapper.add_import_once(
+            "from torch.distributed.distributed_c10d import _find_or_create_pg_by_ranks_and_tag"
+        )
+
+        # extract references to our args in string form for codegen output
+        (input_name,) = [t.codegen_reference() for t in self.inputs]
+        output_name = self.get_name()
+        reduce_op, tag, ranks, group_size = self.constant_args
+
+        # TODO: avoid more than one ref of the same pg (even though they are cached inside the api)
+        wrapper.writeline(
+            f"{output_name}_pg = _find_or_create_pg_by_ranks_and_tag('{tag}', {ranks}, {group_size})"
+        )
+
+        # We must copy our input buffer sometimes, but the scheduler will help us find opportunities
+        # to reuse the input buffer.  (This requires no other users of the input buffer.)
+        if not wrapper.did_reuse(self, self.inputs[0]):
+            wrapper.writeline(f"{output_name}.copy_({input_name})")
+
+        # At this point, output_name points to a buffer that is either
+        # (1) the input buffer, which we're allowed to inplace modify
+        # (2) a freshly allocated buffer, which we've copied the input into above
+        wrapper.writeline(
+            f"{output_name}_work = dist.all_reduce({output_name}, async_op=True,"
+            f" group={output_name}_pg, op=_str_to_reduce_op('{str(reduce_op)}'))"
+        )
+
+
+class AllGatherIntoTensor(ExternKernel):
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+    ):
+        super().__init__(None, layout, inputs, constant_args)
+        self.name = V.graph.register_buffer(self)
+
+    def should_allocate(self):
+        return True
+
+    @classmethod
+    def create(cls, x: "TensorBox", tag: str, ranks: List[int], group_size: int):
+        x = cls.realize_input(x)
+
+        # is there a difference between literally using x.data.layout below, vs
+        # creating a new one that has the same properties?
+        new_size = x.get_size()
+        new_size[0] *= group_size
+        new_layout = FlexibleLayout(x.get_device(), x.get_dtype(), new_size)
+
+        # AllReduce returns a 'work' object.  But Inductor's scheduler doesn't need to know
+        # about that, and we just pretend for scheduling purposes that the work obj is a 1-elem tensor.
+        # Nobody should consume the output of AllReduce except 'Wait', which we control here.
+        return AllGatherIntoTensor(
+            layout=new_layout,
+            inputs=[x],
+            constant_args=[tag, ranks, group_size],
+        )
+
+    def codegen(self, wrapper):
+        wrapper.add_import_once("import torch.distributed as dist")
+        wrapper.add_import_once(
+            "from torch.distributed.distributed_c10d import _find_or_create_pg_by_ranks_and_tag"
+        )
+
+        # extract references to our args in string form for codegen output
+        (input_name,) = [t.codegen_reference() for t in self.inputs]
+        output_name = self.get_name()
+        tag, ranks, group_size = self.constant_args
+
+        # TODO: avoid more than one ref of the same pg (even though they are cached inside the api)
+        wrapper.writeline(
+            f"{output_name}_pg = _find_or_create_pg_by_ranks_and_tag('{tag}', {ranks}, {group_size})"
+        )
+
+        # At this point, output_name points to a fresh buffer
+        wrapper.writeline(
+            f"{output_name}_work = dist.all_gather_into_tensor({output_name}, {input_name}, async_op=True,"
+            f" group={output_name}_pg)"
+        )
diff --git a/torch/_inductor/kernel/bmm.py b/torch/_inductor/kernel/bmm.py
index 885b9f6e0502..255750ebf600 100644
--- a/torch/_inductor/kernel/bmm.py
+++ b/torch/_inductor/kernel/bmm.py
@@ -92,7 +92,7 @@ def tuned_bmm(mat1, mat2, *, layout=None):
     # options to tune from
     choices = [aten_bmm.bind((mat1, mat2), layout)]
     if use_triton_template(layout):
-        for config in mm_configs():
+        for config in mm_configs(m, n, k):
             choices.append(
                 bmm_template.generate(
                     (mat1, mat2),
@@ -112,7 +112,7 @@ def tuned_baddbmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
     # options to tune from
     choices = [aten_baddbmm.bind((inp, mat1, mat2), layout, alpha=alpha, beta=beta)]
     if use_triton_template(layout):
-        for config in mm_configs():
+        for config in mm_configs(m, n, k):
             choices.append(
                 bmm_template.generate(
                     (inp, mat1, mat2),
diff --git a/torch/_inductor/kernel/mm.py b/torch/_inductor/kernel/mm.py
index 5ba1b57dbbe9..acc2d78ac1e8 100644
--- a/torch/_inductor/kernel/mm.py
+++ b/torch/_inductor/kernel/mm.py
@@ -8,7 +8,14 @@
     TritonTemplate,
 )
 from ..utils import use_triton_template
-from .mm_common import addmm_epilogue, mm_args, mm_configs, mm_grid, mm_options
+from .mm_common import (
+    addmm_epilogue,
+    int8_mm_configs,
+    mm_args,
+    mm_configs,
+    mm_grid,
+    mm_options,
+)
 
 log = logging.getLogger(__name__)
 aten = torch.ops.aten
@@ -71,8 +78,26 @@
 )
 
 aten_mm = ExternKernelChoice(torch.mm, "at::mm_out")
+
+
 aten_addmm = ExternKernelChoice(torch.addmm, "at::addmm_out")
 
+aten__int_mm = ExternKernelChoice(torch._int_mm, "at::_int_mm")
+
+
+def bias_addmm(inp, mat1, mat2, *, out=None, alpha=1, beta=1):
+    """
+    Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
+    kernel under the hood.  There are a few shapes where this is slower,
+    but they are rare.
+    """
+    if inp.stride(0) == 0 or inp.size(0) == 1:
+        return torch.addmm(inp[0], mat1, mat2, out=out, alpha=alpha, beta=beta)
+    return torch.addmm(inp, mat1, mat2, out=out, alpha=alpha, beta=beta)
+
+
+aten_bias_addmm = ExternKernelChoice(bias_addmm, None)
+
 
 @register_lowering(aten.mm)
 def tuned_mm(mat1, mat2, *, layout=None):
@@ -81,7 +106,7 @@ def tuned_mm(mat1, mat2, *, layout=None):
     # options to tune from
     choices = [aten_mm.bind((mat1, mat2), layout)]
     if use_triton_template(layout):
-        for config in mm_configs():
+        for config in mm_configs(m, n, k):
             choices.append(
                 mm_template.generate(
                     (mat1, mat2),
@@ -93,29 +118,54 @@ def tuned_mm(mat1, mat2, *, layout=None):
     return autotune_select_algorithm(choices, [mat1, mat2], layout)
 
 
+@register_lowering(aten._int_mm)
+def tuned_int_mm(mat1, mat2, *, layout=None):
+    m, n, k, layout, mat1, mat2 = mm_args(
+        mat1, mat2, layout=layout, out_dtype=torch.int32
+    )
+    choices = [aten__int_mm.bind((mat1, mat2), layout)]
+    if use_triton_template(layout):
+        # TODO: Re-enable eager mode implementation once cuBLAS is fixed
+        choices = []
+        for config in int8_mm_configs(m, n, k):
+            choices.append(
+                mm_template.generate(
+                    (mat1, mat2),
+                    layout,
+                    **mm_options(config, k, layout),
+                )
+            )
+    return autotune_select_algorithm(choices, [mat1, mat2], layout)
+
+
 @register_lowering(aten.addmm)
 def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
     m, n, k, layout, mat1, mat2, inp_expanded = mm_args(mat1, mat2, inp, layout=layout)
-    # don't expand inp to make sure fused addmm from cublasLt is used
     if not use_triton_template(layout):
         choices = [aten_addmm.bind((inp, mat1, mat2), layout, alpha=alpha, beta=beta)]
         return autotune_select_algorithm(choices, [inp, mat1, mat2], layout)
 
-    # TODO this is not quite fair benchmarking because we won't use fused cublasLt addmm
-    # options to tune from
     choices = [
         aten_addmm.bind((inp_expanded, mat1, mat2), layout, alpha=alpha, beta=beta)
     ]
-    if use_triton_template(layout):
-        for config in mm_configs():
-            choices.append(
-                mm_template.generate(
-                    (inp_expanded, mat1, mat2),
-                    layout,
-                    **mm_options(config, k, layout),
-                    prefix_args=1,
-                    epilogue_fn=addmm_epilogue(layout.dtype, alpha, beta),
-                )
+    if inp_expanded.get_stride()[0] == 0 and inp_expanded.get_device().type == "cuda":
+        # unexpand inp to make sure fused addmm from cublasLt is used
+        choices.insert(
+            0,
+            aten_bias_addmm.bind(
+                (inp_expanded, mat1, mat2), layout, alpha=alpha, beta=beta
+            ),
+        )
+
+    for config in mm_configs(m, n, k):
+        choices.append(
+            mm_template.generate(
+                (inp_expanded, mat1, mat2),
+                layout,
+                **mm_options(config, k, layout),
+                prefix_args=1,
+                epilogue_fn=addmm_epilogue(layout.dtype, alpha, beta),
             )
+        )
 
     return autotune_select_algorithm(choices, [inp_expanded, mat1, mat2], layout)
diff --git a/torch/_inductor/kernel/mm_common.py b/torch/_inductor/kernel/mm_common.py
index 5b48c5165595..e7a14ea8872f 100644
--- a/torch/_inductor/kernel/mm_common.py
+++ b/torch/_inductor/kernel/mm_common.py
@@ -1,59 +1,88 @@
 import functools
 import logging
+from typing import List, Tuple
 
 import sympy
 
 import torch
 from torch._inductor.select_algorithm import realize_inputs
 from torch._inductor.virtualized import V
-from ..utils import ceildiv as cdiv
-
+from ..utils import ceildiv as cdiv, next_power_of_2
 
 log = logging.getLogger(__name__)
 
 
-@functools.lru_cache(None)
-def mm_configs():
-    import triton
-
-    return [
-        triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32}, num_stages=2, num_warps=4
-        ),
-        triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32}, num_stages=3, num_warps=4
-        ),
-        triton.Config(
-            {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32}, num_stages=3, num_warps=4
-        ),
-        triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32}, num_stages=4, num_warps=8
-        ),
-        triton.Config(
-            {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32}, num_stages=4, num_warps=8
-        ),
-        triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32}, num_stages=5, num_warps=8
-        ),
-        triton.Config(
-            {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32}, num_stages=5, num_warps=8
-        ),
-        triton.Config(
-            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32}, num_stages=2, num_warps=8
-        ),
-        triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64}, num_stages=3, num_warps=8
-        ),
-        triton.Config(
-            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 128}, num_stages=2, num_warps=8
-        ),
-        triton.Config(
-            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16}, num_stages=2, num_warps=4
-        ),
-        triton.Config(
-            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16}, num_stages=1, num_warps=2
-        ),
-    ]
+def triton_config(num_stages, num_warps, **kwargs):
+    from triton import Config
+
+    return Config(kwargs, num_stages=num_stages, num_warps=num_warps)
+
+
+def filtered_configs(
+    m: int, n: int, k: int, configs: List[Tuple[int, int, int, int, int]]
+):
+    """Heuristic to shrink configs when they are bigger than the input size"""
+    m = max(next_power_of_2(V.graph.sizevars.size_hint(m)), 16)
+    n = max(next_power_of_2(V.graph.sizevars.size_hint(n)), 16)
+    k = max(next_power_of_2(V.graph.sizevars.size_hint(k)), 16)
+    used = set()
+    for block_m, block_n, block_k, num_stages, num_warps in configs:
+        # shrink configs for small sizes
+        block_m = min(block_m, m)
+        block_n = min(block_n, n)
+        block_k = min(block_k, k)
+        # each warp computes 16x16 tile = 256
+        num_warps = min(num_warps, block_m * block_n // 256)
+        if (block_m, block_n, block_k, num_stages, num_warps) not in used:
+            used.add((block_m, block_n, block_k, num_stages, num_warps))
+            yield triton_config(
+                BLOCK_M=block_m,
+                BLOCK_N=block_n,
+                BLOCK_K=block_k,
+                num_stages=num_stages,
+                num_warps=num_warps,
+            )
+
+
+mm_configs = functools.partial(
+    filtered_configs,
+    configs=(
+        # "BLOCK_M", "BLOCK_N", "BLOCK_K", "num_stages", "num_warps"
+        (64, 64, 32, 2, 4),
+        (64, 128, 32, 3, 4),
+        (128, 64, 32, 3, 4),
+        (64, 128, 32, 4, 8),
+        (128, 64, 32, 4, 8),
+        (64, 32, 32, 5, 8),
+        (32, 64, 32, 5, 8),
+        (128, 128, 32, 2, 8),
+        (64, 64, 64, 3, 8),
+        (32, 32, 128, 2, 4),
+        (64, 64, 16, 2, 4),
+        (32, 32, 16, 1, 2),
+    ),
+)
+
+int8_mm_configs = functools.partial(
+    filtered_configs,
+    configs=(
+        # "BLOCK_M", "BLOCK_N", "BLOCK_K", "num_stages", "num_warps"
+        (64, 64, 32, 2, 4),
+        (64, 128, 32, 3, 4),
+        (128, 64, 32, 3, 4),
+        (64, 128, 32, 4, 8),
+        (128, 64, 32, 4, 8),
+        (64, 32, 32, 5, 8),
+        (32, 64, 32, 5, 8),
+        (128, 128, 32, 2, 8),
+        (64, 64, 64, 3, 8),
+        (32, 32, 128, 2, 4),
+        (64, 64, 16, 2, 4),
+        (32, 32, 16, 1, 2),
+        (128, 256, 128, 3, 8),
+        (256, 128, 128, 3, 8),
+    ),
+)
 
 
 def mm_grid(m, n, meta):
@@ -89,7 +118,7 @@ def mm_options(config, sym_k, layout):
     )
 
 
-def mm_args(mat1, mat2, *others, layout=None):
+def mm_args(mat1, mat2, *others, layout=None, out_dtype=None):
     """
     Common arg processing for mm,bmm,addmm,etc
     """
@@ -101,11 +130,15 @@ def mm_args(mat1, mat2, *others, layout=None):
     if layout is None:
         from torch._inductor.ir import FixedLayout
 
+        if out_dtype is None:
+            out_dtype = mat1.get_dtype()
         layout = FixedLayout(
             mat1.get_device(),
-            mat1.get_dtype(),
+            out_dtype,
             [*b, m, n],
         )
+    else:
+        assert out_dtype is None, "out_dtype is ignored if layout is specified."
 
     from ..lowering import expand
 
diff --git a/torch/_inductor/kernel/mm_plus_mm.py b/torch/_inductor/kernel/mm_plus_mm.py
new file mode 100644
index 000000000000..d7bd381d21a3
--- /dev/null
+++ b/torch/_inductor/kernel/mm_plus_mm.py
@@ -0,0 +1,174 @@
+import functools
+
+import torch
+from ..lowering import lowerings
+from ..select_algorithm import (
+    autotune_select_algorithm,
+    ExternKernelChoice,
+    TritonTemplate,
+)
+from ..utils import use_triton_template
+from ..virtualized import V
+from .mm_common import mm_args, mm_grid, mm_options
+
+aten = torch.ops.aten
+
+
+def ref_mm_plus_mm(a, b, c, d, out):
+    torch.mm(a, b, out=out)
+    out.addmm_(c, d)
+    return out
+
+
+aten_mm_plus_mm = ExternKernelChoice(ref_mm_plus_mm)
+
+mm_plus_mm_template = TritonTemplate(
+    name="mm_plus_mm",
+    grid=mm_grid,
+    debug=False,
+    source=r"""
+{{def_kernel("A", "B", "C", "D")}}
+    M = {{size("A", 0)}}
+    N = {{size("B", 1)}}
+    K1 = {{size("A", 1)}}
+    # K2 = {{size("C", 1)}}
+    stride_am = {{stride("A", 0)}}
+    stride_ak = {{stride("A", 1)}}
+    stride_bk = {{stride("B", 0)}}
+    stride_bn = {{stride("B", 1)}}
+    stride_cm = {{stride("C", 0)}}
+    stride_ck = {{stride("C", 1)}}
+    stride_dk = {{stride("D", 0)}}
+    stride_dn = {{stride("D", 1)}}
+
+    # based on triton.ops.matmul
+    pid = tl.program_id(0)
+    grid_m = (M + BLOCK_M - 1) // BLOCK_M
+    grid_n = (N + BLOCK_N - 1) // BLOCK_N
+
+    # re-order program ID for better L2 performance
+    width = GROUP_M * grid_n
+    group_id = pid // width
+    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
+    pid_m = group_id * GROUP_M + (pid % group_size)
+    pid_n = (pid % width) // (group_size)
+
+    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
+    rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
+    rk = tl.arange(0, BLOCK_K)
+    A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
+    B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)
+    C = C + (ram[:, None] * stride_cm + rk[None, :] * stride_ck)
+    D = D + (rk[:, None] * stride_dk + rbn[None, :] * stride_dn)
+
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+    for k1 in range(K1, 0, -BLOCK_K):
+        # First matmul with A @ B
+        if EVEN_K:
+            a = tl.load(A)
+            b = tl.load(B)
+        else:
+            a = tl.load(A, mask=rk[None, :] < k1, other=0.)
+            b = tl.load(B, mask=rk[:, None] < k1, other=0.)
+        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)
+        A += BLOCK_K * stride_ak
+        B += BLOCK_K * stride_bk
+
+        # Splitting this into two loops causes an internal triton LLVM error
+        # https://github.com/openai/triton/issues/967
+        # for k2 in range(K2, 0, -BLOCK_K):
+        k2 = k1
+
+        # Second matmul with C @ D
+        if EVEN_K:
+            c = tl.load(C)
+            d = tl.load(D)
+        else:
+            c = tl.load(C, mask=rk[None, :] < k2, other=0.)
+            d = tl.load(D, mask=rk[:, None] < k2, other=0.)
+        acc += tl.dot(c, d, allow_tf32=ALLOW_TF32)
+        C += BLOCK_K * stride_ck
+        D += BLOCK_K * stride_dk
+
+    # rematerialize rm and rn to save registers
+    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    idx_m = rm[:, None]
+    idx_n = rn[None, :]
+    mask = (idx_m < M) & (idx_n < N)
+
+    # inductor generates a suffix
+    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
+""",
+)
+
+
+@functools.lru_cache(None)
+def mm_configs():
+    import triton
+
+    # these have been tweaked to workaround register issues
+    return [
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32}, num_stages=2, num_warps=4
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32}, num_stages=3, num_warps=8
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32}, num_stages=4, num_warps=16
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32}, num_stages=4, num_warps=8
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32}, num_stages=4, num_warps=8
+        ),
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32}, num_stages=1, num_warps=8
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64}, num_stages=1, num_warps=8
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 128}, num_stages=1, num_warps=8
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16}, num_stages=2, num_warps=4
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16}, num_stages=1, num_warps=2
+        ),
+    ]
+
+
+def tuned_mm_plus_mm(mat1, mat2, mat3, mat4, *, layout=None):
+    """
+    Computes mm(mat1, mat2) + mm(mat3, mat4)
+    """
+    if not V.graph.sizevars.maybe_guard_list_equals(
+        mat1.get_size(), mat3.get_size()
+    ) or not V.graph.sizevars.maybe_guard_list_equals(mat2.get_size(), mat4.get_size()):
+        # TODO(jansel): support different K values when this is fixed:
+        # https://github.com/openai/triton/issues/967
+        return lowerings[aten.addmm](lowerings[aten.mm](mat1, mat2), mat3, mat4)
+
+    m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout)
+    m, n, k, layout, mat3, mat4 = mm_args(mat3, mat4, layout=layout)
+
+    # options to tune from
+    choices = [aten_mm_plus_mm.bind((mat1, mat2, mat3, mat4), layout)]
+    if use_triton_template(layout):
+        for config in mm_configs():
+            choices.append(
+                mm_plus_mm_template.generate(
+                    (mat1, mat2, mat3, mat4),
+                    layout,
+                    **mm_options(config, k, layout),
+                )
+            )
+
+    return autotune_select_algorithm(choices, [mat1, mat2, mat3, mat4], layout)
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 5ead8a0e99d0..c4f8ec8feb5c 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -1,8 +1,6 @@
 import functools
 import itertools
 import logging
-import math
-import operator
 from collections.abc import Iterable
 from typing import List, Optional, Tuple
 
@@ -20,7 +18,10 @@
     is_float_dtype,
     is_integer_dtype,
     Number,
+    type_to_dtype,
 )
+from torch.fx.experimental.symbolic_shapes import magic_methods, method_to_operator
+from .._dynamo.utils import import_submodule
 
 from . import config, ir, overrides, test_operators  # NOQA: F401
 from .cuda_properties import current_device
@@ -28,15 +29,15 @@
 from .ir import (
     ExpandView,
     IndexingConstant,
-    IndexingDiv,
     PermuteView,
     Pointwise,
     Reduction,
     SqueezeView,
     TensorBox,
+    validate_ir,
     View,
 )
-from .utils import ceildiv, has_torchvision_roi_align, sympy_product
+from .utils import ceildiv, developer_warning, sympy_product
 from .virtualized import ops, V
 
 log = logging.getLogger(__name__)
@@ -44,6 +45,7 @@
 layout_constraints = {}
 fallbacks = set()
 aten = torch.ops.aten
+tr_c10d = torch.ops.tr_c10d
 prims = torch.ops.prims
 needs_realized_inputs = set()
 
@@ -79,6 +81,7 @@ def add_layout_constraint(fn, constraint):
         aten.upsample_bilinear2d,
         aten.upsample_nearest2d,
         aten.upsample_bicubic2d,
+        aten._int_mm,
     ]
 )
 
@@ -222,7 +225,10 @@ def wrapped(*args, **kwargs):
                         args[i], list(args[indices[0]].get_size())
                     )
 
-        return decomp_fn(*args, **kwargs)
+        out = decomp_fn(*args, **kwargs)
+        validate_ir(out)
+
+        return out
 
     if not isinstance(aten_fn, (list, tuple)):
         aten_fn = [aten_fn]
@@ -570,16 +576,10 @@ def expand(x, sizes):
     if tuple(x.get_size()) == tuple(sizes):
         return x
 
-    x_size_product = sympy_product(x.get_size())
-    try:
-        if x_size_product > 0:
-            x.mark_reuse(
-                V.graph.sizevars.size_hint(sympy_product(sizes) / x_size_product)
-            )
-    except TypeError:
-        # Certain sympy products cannot be compared, fails with
-        # cannot determine truth value of Relational
-        pass
+    x_size_product = V.graph.sizevars.size_hint(sympy_product(x.get_size()))
+    if x_size_product > 0:
+        # maybe realize input before broadcasting it
+        x.mark_reuse(V.graph.sizevars.size_hint(sympy_product(sizes)) // x_size_product)
     return TensorBox(ExpandView.create(x.data, tuple(sizes)))
 
 
@@ -631,16 +631,12 @@ def inner_fn(index):
                     index[i] = ir.ModularIndexing(index[i], 1, old_size[i])
         return x_loader(index)
 
-    old_size_product = sympy_product(old_size)
-    try:
-        if old_size_product > 0:
-            x.mark_reuse(
-                V.graph.sizevars.size_hint(sympy_product(new_size) / old_size_product)
-            )
-    except TypeError:
-        # Certain sympy products cannot be compared, fails with
-        # cannot determine truth value of Relational
-        pass
+    old_size_product = V.graph.sizevars.size_hint(sympy_product(old_size))
+    if old_size_product > 0:
+        # maybe realize the input
+        x.mark_reuse(
+            V.graph.sizevars.size_hint(sympy_product(new_size)) // old_size_product
+        )
 
     x_loader = x.make_loader()
     return Pointwise.create(
@@ -738,9 +734,9 @@ def as_strided(x, size, stride, storage_offset=None):
         # as_strided ignores views
         x = x.data.unwrap_view()
     x.realize()
-    if not ir.is_contiguous_storage_and_layout(x):
+    if not ir.is_storage_and_layout(x):
         raise NotImplementedError(f"unrealized as_strided({x}, ...)")
-    storage, old_layout = ir.as_contiguous_storage_and_layout(x)
+    storage, old_layout = ir.as_storage_and_layout(x)
     new_layout = ir.FixedLayout(
         old_layout.device,
         old_layout.dtype,
@@ -761,7 +757,7 @@ def as_strided_(x, size, stride, storage_offset=None):
 @register_lowering(aten.cat)
 def cat(inputs, dim=0):
     if len(inputs) == 1:
-        return inputs[0]
+        return clone(inputs[0])
 
     dim = _validate_dim(inputs[0], dim, 0)
     dtype = get_promoted_dtype(
@@ -781,7 +777,9 @@ def select(x, dim, idx):
 def split(x, sizes, dim=0):
     dim = _validate_dim(x, dim, 0)
     x_size = V.graph.sizevars.guard_static_shape(x.get_size()[dim])
-    if isinstance(sizes, int):
+    if isinstance(sizes, sympy.Expr):
+        sizes = V.graph.sizevars.guard_static_shape(sizes)
+    if isinstance(sizes, (int, sympy.Integer)):
         sizes = [sizes] * ((x_size + sizes - 1) // sizes)
     result = []
     start = 0
@@ -953,6 +951,36 @@ def linear_unary(
         def linear_binary(x: TensorBox, y: TensorBox, w: TensorBox, b: TensorBox, attr):
             return TensorBox.create(ir.LinearBinary.create(x, y, w, b, attr))
 
+        @register_lowering(torch.ops.mkldnn._convolution_transpose_pointwise)
+        def convolution_transpose_unary(
+            x: TensorBox,
+            weight: TensorBox,
+            bias: TensorBox,
+            padding,
+            output_padding,
+            stride,
+            dilation,
+            groups,
+            attr,
+            scalars,
+            algorithm,
+        ):
+            return TensorBox.create(
+                ir.ConvolutionTransposeUnary.create(
+                    x,
+                    weight,
+                    bias,
+                    padding,
+                    output_padding,
+                    stride,
+                    dilation,
+                    groups,
+                    attr,
+                    scalars,
+                    algorithm,
+                )
+            )
+
         if torch._C.has_mkl:
 
             @register_lowering(torch.ops.mkl._mkl_linear)
@@ -963,9 +991,12 @@ def mkl_packed_linear(
                 b: TensorBox,
                 batch_size,
             ):
-                return TensorBox.create(
-                    ir.MKLPackedLinear.create(x, packed_w, orig_w, b, batch_size)
+                result = TensorBox.create(
+                    ir.MKLPackedLinear.create(x, packed_w, orig_w, batch_size)
                 )
+                if b is not None:
+                    result = add(result, b)
+                return result
 
     else:
         pass
@@ -985,12 +1016,12 @@ def handler(*args, **kwargs):
     return handler
 
 
-def make_fallback(kernel, layout_constraint=None):
+def make_fallback(kernel, layout_constraint=None, warn=True):
     assert (
         kernel not in decompositions
     ), f"both a fallback and a decomp for same kernel: {kernel}"
-    if get_decompositions([kernel]) and kernel is not aten.cumsum:
-        log.warning(
+    if get_decompositions([kernel]) and warn:
+        developer_warning(
             f"make_fallback({kernel}): a decomposition exists, we should switch to it"
         )
 
@@ -1024,6 +1055,14 @@ def bernoulli_(x, *args):
     return x
 
 
+@register_lowering(aten.bernoulli.p, type_promotion_kind=None)
+def bernoulli_p(x, *args):
+    assert (
+        config.fallback_random
+    ), "this should be handled in decomps unless config.fallback_random"
+    return bernoulli_(clone(x), *args)
+
+
 # This shouldn't be called in general
 @register_lowering(aten._foobar)
 def _foobar(_):
@@ -1032,7 +1071,7 @@ def _foobar(_):
 
 @functools.lru_cache(1)
 def _warn_triton_random(salt):
-    log.warning("using triton random, expect difference from eager")
+    developer_warning("using triton random, expect difference from eager")
 
 
 def warn_triton_random():
@@ -1099,17 +1138,19 @@ def inner_fn(index):
 
 @register_lowering([aten.rand, torch.rand])
 def rand(*args, **kwargs):
-    if config.fallback_random:
+    if config.fallback_random or kwargs.get("generator", None) is not None:
         return fallback_rand(*args, **kwargs)
     else:
+        kwargs.pop("generator", None)
         return fast_rand(*args, **kwargs)
 
 
 @register_lowering([aten.randn, torch.randn])
 def randn(*args, **kwargs):
-    if config.fallback_random:
+    if config.fallback_random or kwargs.get("generator", None) is not None:
         return fallback_randn(*args, **kwargs)
     else:
+        kwargs.pop("generator", None)
         return fast_randn(*args, **kwargs)
 
 
@@ -1162,10 +1203,6 @@ def require_contiguous(_, *args, **kwargs):
     return args, kwargs
 
 
-if has_torchvision_roi_align():
-    make_fallback(torch.ops.torchvision.roi_align)
-
-
 def constrain_to_fx_strides(fx_node, *args, **kwargs):
     def apply_constraint(arg, fx_arg):
         if isinstance(arg, ir.IRNode):
@@ -1180,11 +1217,14 @@ def apply_constraint(arg, fx_arg):
 
 # TODO(jansel): we should implement decomps or lowerings for these
 # https://github.com/pytorch/torchdynamo/issues/327
+FALLBACK_ALLOW_LIST = {
+    "torchvision::roi_align",
+}
 make_fallback(aten._adaptive_avg_pool2d_backward, require_dense)
 make_fallback(aten.convolution_backward, constrain_to_fx_strides)
 make_fallback(aten._cudnn_rnn, require_dense)
 make_fallback(aten._cudnn_rnn_backward, require_contiguous)
-make_fallback(aten.cumsum, require_dense)
+make_fallback(aten.cumsum, require_dense, warn=False)
 make_fallback(aten._embedding_bag, require_contiguous)
 make_fallback(aten._embedding_bag_forward_only, require_contiguous)
 make_fallback(aten._fused_moving_avg_obs_fq_helper)
@@ -1199,6 +1239,173 @@ def apply_constraint(arg, fx_arg):
 make_fallback(aten.upsample_bicubic2d_backward, require_contiguous)
 make_fallback(aten.upsample_bilinear2d_backward, require_dense)
 
+# The following were added as a result of https://github.com/pytorch/pytorch/pull/94039 to pass tests
+# It's not necessarily a priority to implement these
+make_fallback(aten.upsample_linear1d)
+make_fallback(aten.upsample_trilinear3d)
+make_fallback(aten.upsample_linear1d_backward)
+make_fallback(aten.upsample_trilinear3d_backward)
+make_fallback(aten._adaptive_avg_pool3d)
+make_fallback(aten.adaptive_max_pool2d)
+make_fallback(aten.adaptive_max_pool3d)
+make_fallback(aten.addbmm)
+make_fallback(aten.addmv)
+make_fallback(aten.aminmax)
+make_fallback(aten.avg_pool3d)
+make_fallback(aten.block_diag)
+make_fallback(aten._cdist_forward)
+make_fallback(aten.count_nonzero)
+make_fallback(aten.cummax)
+make_fallback(aten.cummin)
+make_fallback(aten.cumprod)
+make_fallback(aten.deg2rad)
+make_fallback(aten.diagonal_copy, warn=False)
+make_fallback(aten.diagonal_scatter, warn=False)
+make_fallback(aten.digamma, warn=False)
+make_fallback(aten.dist)
+make_fallback(aten._efficientzerotensor)
+make_fallback(aten._embedding_bag_per_sample_weights_backward)
+make_fallback(aten.erfc, warn=False)
+make_fallback(aten.erfinv, warn=False)
+make_fallback(aten.fmax, warn=False)
+make_fallback(aten.fmin, warn=False)
+make_fallback(aten.dist)
+make_fallback(aten._efficientzerotensor)
+make_fallback(aten._embedding_bag_per_sample_weights_backward)
+make_fallback(aten.fractional_max_pool2d)
+make_fallback(aten.fractional_max_pool3d)
+make_fallback(aten.frexp)
+make_fallback(aten.geqrf)
+make_fallback(aten.histc)
+make_fallback(aten.i0)
+make_fallback(aten.igamma, warn=False)
+make_fallback(aten.igammac, warn=False)
+make_fallback(aten.isin)
+make_fallback(aten.isneginf, warn=False)
+make_fallback(aten.isposinf, warn=False)
+make_fallback(aten.kthvalue)
+make_fallback(aten.linalg_cholesky_ex)
+make_fallback(aten.linalg_cross)
+make_fallback(aten._linalg_det)
+make_fallback(aten.linalg_householder_product)
+make_fallback(aten.linalg_inv_ex)
+make_fallback(aten.linalg_ldl_factor_ex)
+make_fallback(aten.linalg_ldl_solve)
+make_fallback(aten.linalg_lu)
+make_fallback(aten.linalg_lu_factor_ex)
+make_fallback(aten.linalg_lu_solve)
+make_fallback(aten.linalg_matrix_exp)
+make_fallback(aten.linalg_qr)
+make_fallback(aten._linalg_slogdet)
+make_fallback(aten._linalg_solve_ex)
+make_fallback(aten.linalg_solve_triangular)
+make_fallback(aten._linalg_svd)
+make_fallback(aten.logaddexp2)
+make_fallback(aten.logcumsumexp)
+make_fallback(aten.log_sigmoid_forward, warn=False)
+make_fallback(aten.logspace, warn=False)
+make_fallback(aten.lu_unpack)
+make_fallback(aten.max_pool3d_with_indices)
+make_fallback(aten.max_unpool2d)
+make_fallback(aten.max_unpool3d)
+make_fallback(aten.median)
+make_fallback(aten.mode)
+make_fallback(aten.multilabel_margin_loss_forward)
+make_fallback(aten.multi_margin_loss)
+make_fallback(aten.nanmedian)
+make_fallback(aten.nansum)
+make_fallback(aten.narrow_copy, warn=False)
+make_fallback(aten.ormqr)
+make_fallback(aten._pdist_forward)
+make_fallback(aten.pixel_shuffle)
+make_fallback(aten.pixel_unshuffle)
+make_fallback(aten.polygamma)
+make_fallback(aten.prod, warn=False)
+make_fallback(aten.put)
+make_fallback(aten.rad2deg)
+make_fallback(aten.reflection_pad1d)
+make_fallback(aten.renorm)
+make_fallback(aten.replication_pad1d)
+make_fallback(aten.resize)
+make_fallback(aten.resize_)
+make_fallback(aten.resize_as)
+make_fallback(aten.resize_as_)
+make_fallback(aten.searchsorted)
+make_fallback(aten.smooth_l1_loss)
+make_fallback(aten.special_airy_ai)
+make_fallback(aten.special_bessel_j0, warn=False)
+make_fallback(aten.special_bessel_j1, warn=False)
+make_fallback(aten.special_bessel_y0, warn=False)
+make_fallback(aten.special_bessel_y1)
+make_fallback(aten.special_chebyshev_polynomial_t)
+make_fallback(aten.special_chebyshev_polynomial_u)
+make_fallback(aten.special_erfcx, warn=False)
+make_fallback(aten.special_hermite_polynomial_h)
+make_fallback(aten.special_hermite_polynomial_he)
+make_fallback(aten.special_i0e, warn=False)
+make_fallback(aten.special_i1, warn=False)
+make_fallback(aten.special_i1e, warn=False)
+make_fallback(aten.special_laguerre_polynomial_l)
+make_fallback(aten.special_modified_bessel_i0)
+make_fallback(aten.special_modified_bessel_i1)
+make_fallback(aten.special_modified_bessel_k0)
+make_fallback(aten.special_modified_bessel_k1)
+make_fallback(aten.special_ndtri, warn=False)
+make_fallback(aten.special_scaled_modified_bessel_k0)
+make_fallback(aten.special_scaled_modified_bessel_k1)
+make_fallback(aten.special_spherical_bessel_j0, warn=False)
+make_fallback(aten.special_zeta, warn=False)
+make_fallback(aten.take)
+make_fallback(aten.threshold, warn=False)
+make_fallback(aten.trace, warn=False)
+make_fallback(aten._trilinear)
+make_fallback(aten.unfold_copy, warn=False)
+make_fallback(aten.uniform, warn=False)
+make_fallback(aten.unsafe_split, warn=False)
+make_fallback(aten.vdot)
+make_fallback(aten.view_as_complex)
+make_fallback(aten.view_copy)
+make_fallback(aten._adaptive_avg_pool3d_backward)
+make_fallback(aten.adaptive_max_pool2d_backward)
+make_fallback(aten.adaptive_max_pool3d_backward)
+make_fallback(aten.avg_pool3d_backward)
+make_fallback(aten.bitwise_or_, warn=False)
+make_fallback(aten._cdist_backward)
+make_fallback(aten.diagonal_backward, warn=False)
+make_fallback(aten._embedding_bag_dense_backward)
+make_fallback(aten.fractional_max_pool2d_backward)
+make_fallback(aten.fractional_max_pool3d_backward)
+make_fallback(aten._linalg_check_errors)
+make_fallback(aten.max_pool3d_with_indices_backward)
+make_fallback(aten.multilabel_margin_loss_backward)
+make_fallback(aten.multi_margin_loss_backward)
+make_fallback(aten._pdist_backward)
+make_fallback(aten.reflection_pad1d_backward)
+make_fallback(aten.replication_pad1d_backward)
+make_fallback(aten.smooth_l1_loss_backward)
+make_fallback(aten.soft_margin_loss_backward, warn=False)
+make_fallback(aten.softshrink_backward, warn=False)
+make_fallback(aten.squeeze_copy)
+make_fallback(aten.linalg_pinv.atol_rtol_tensor)
+make_fallback(aten.segment_reduce.default)
+make_fallback(aten._segment_reduce_backward.default)
+make_fallback(aten.angle)
+make_fallback(aten.cholesky_inverse)
+make_fallback(aten.cholesky_solve)
+make_fallback(aten._fft_r2c)
+make_fallback(aten.histogram.bin_ct)
+make_fallback(aten._histogramdd_bin_edges.default)
+make_fallback(aten._histogramdd_from_bin_cts.default)
+make_fallback(aten.index_reduce)
+make_fallback(aten.masked_scatter)
+make_fallback(aten.to_sparse)
+make_fallback(aten.triangular_solve)
+make_fallback(aten.expand_copy)
+make_fallback(aten.gcd.default, warn=False)
+make_fallback(aten._linalg_eigh)
+make_fallback(aten.zeros.names)
+
+
 add_layout_constraint(aten.convolution, constrain_to_fx_strides)
 
 
@@ -1276,55 +1483,24 @@ def clone(x, *, memory_format=0):
     register_lowering(aten.lift_fresh_copy)(clone)
 
 
-fallback_arange = fallback_handler(aten.arange)
-
-
-@register_lowering([torch.arange, aten.arange])
-def arange(
-    start,
-    end=None,
-    step=1,
+@register_lowering(prims.iota)
+def iota(
+    length,
     *,
-    dtype=None,
-    device=None,
-    layout=torch.strided,
-    pin_memory=False,
+    start,
+    step,
+    dtype,
+    device,
+    requires_grad,
 ):
-    assert layout == torch.strided
-    assert not pin_memory
-    if end is None:
-        end = start
-        start = 0
-
-    if isinstance(start, float) and int(start) == start:
-        start = int(start)
-    if isinstance(end, float) and int(end) == end:
-        end = int(end)
-    if isinstance(step, float) and int(step) == step:
-        step = int(step)
-
-    # Triton kernel doesn't support float arange yet, fallback to aten.arange
-    if not (isinstance(start, int) and isinstance(end, int) and isinstance(step, int)):
-        return fallback_arange(
-            start,
-            end,
-            step,
-            dtype=dtype,
-            device=device,
-            layout=layout,
-            pin_memory=pin_memory,
-        )
-
-    dtype = dtype or torch.int64
-    length = ceildiv((end - start), step)
-    start = sympy.Integer(start)
-    step = sympy.Integer(step)
+    def fn(index):
+        return ops.index_expr(step * index[0] + start, dtype=dtype)
 
     return Pointwise.create(
         device=decode_device(device),
         dtype=dtype,
-        inner_fn=lambda index: ops.index_expr(step * index[0] + start, dtype),
-        ranges=[sympy.Integer(length)],
+        inner_fn=fn,
+        ranges=[length],
     )
 
 
@@ -1398,7 +1574,7 @@ def slice_scatter(x, src, dim=0, start=None, end=None, step=1):
         end = dim_size
 
     src_size = list(x.get_size())
-    src_size[dim] = ir.IndexingDiv(sympy.expand(end - start), sympy.expand(step))
+    src_size[dim] = ir.FloorDiv(sympy.expand(end - start), sympy.expand(step))
     src = expand(src, src_size)
     src_loader = src.make_loader()
 
@@ -1409,7 +1585,7 @@ def inner_fn(idx):
 
         idx_dim = ops.index_expr(idx[dim], torch.int32)
         src_idx = list(idx)
-        src_idx[dim] = ir.IndexingDiv(idx[dim] - start, step)
+        src_idx[dim] = ir.FloorDiv(idx[dim] - start, step)
 
         mask = []
         if start != 0:
@@ -1539,11 +1715,16 @@ def _full(fill_value, device, dtype, size):
     if not isinstance(fill_value, (int, float)) and hasattr(value, "value"):
         value = value.value
 
-    if isinstance(value, (int, float, sympy.Expr)):
+    if isinstance(value, (int, float)):
 
         def inner_fn(index):
             return ops.constant(value, dtype)
 
+    elif isinstance(value, sympy.Expr):
+
+        def inner_fn(index):
+            return ops.index_expr(value, dtype)
+
     else:
         assert len(value.get_size()) == 0
         value_loader = value.make_loader()
@@ -1640,6 +1821,7 @@ def constant_like(fill_value):
 ones_like = create_tensor_like(tensor_constructor(1))
 if not config.fallback_random:
     rand_like = register_lowering(aten.rand_like)(create_tensor_like(rand))
+    randn_like = register_lowering(aten.randn_like)(create_tensor_like(randn))
 
 
 def new_constant(fill_value):
@@ -1712,13 +1894,24 @@ def new_empty_strided(
     )
 
 
+@register_lowering(prims.copy_strided.default)
+def copy_strided(x, stride):
+    stride = [V.graph.sizevars.size_hint(s) for s in stride]
+    stride_order = sorted(range(len(stride)), key=stride.__getitem__)
+    return ir.ExternKernel.require_stride_order(x, stride_order)
+
+
 @register_lowering([torch.full, aten.full])
 def full(size, fill_value, **kwargs):
+    dtype = kwargs.get("dtype")
+    kwargs["dtype"] = dtype if dtype is not None else type_to_dtype(type(fill_value))
     return tensor_constructor(fill_value)(size, **kwargs)
 
 
 @register_lowering(aten.gather, type_promotion_kind=None)
-def gather(x, dim, index):
+def gather(x, dim, index, sparse_grad=False):
+    # sparse_grad doesn't affect forward computation,
+    # and backward tracing is taken care of by AOT Autograd
     assert isinstance(x, TensorBox)
     assert index.get_dtype() == torch.int64
     offset = len(x.get_size()) == 0
@@ -2368,7 +2561,7 @@ def accumulate(out_x, out_y, index_range1, index_range2=None):
         # -----------------------------------------
         #   bottom-left |   bottom  |   bottom-right
         #
-        # The center area is the orignial matrix. Other areas are reflections.
+        # The center area is the original matrix. Other areas are reflections.
 
         center_x, center_y = x + top, y + left
         top_reflect_x, left_reflect_y = top - x, left - y
@@ -2504,15 +2697,18 @@ def load(index):
 
 def pooling_size(x, i, kernel_size, stride, padding, ceil_mode):
 
-    x_out = ir.IndexingDiv(
+    x_out = ir.FloorDiv(
         x + 2 * padding[i] - (kernel_size[i] - 1) + (stride[i] - 1), stride[i]
     )
 
     if ceil_mode:
-        x_alt = ir.IndexingDiv(
+        x_alt = ir.FloorDiv(
             x + 2 * padding[i] - (kernel_size[i] - 1) + 2 * (stride[i] - 1), stride[i]
         )
-
+        if V.graph.sizevars.size_hint((x_alt - 1) * stride[i] - x - padding[i]) >= 0:
+            # Sliding windows must start within the input or left padding
+            x_alt -= 1
+            V.graph.sizevars.guard_leq(0, x_alt * stride[i] - x - padding[i])
         if V.graph.sizevars.size_hint(x_out - x_alt) == 0:
             # ceil mode is actually a no-op, lets guard on that
             V.graph.sizevars.guard_equals(x_out, x_alt)
@@ -2693,13 +2889,13 @@ def fn(idx):
         h = h + padding[0]
         w = w + padding[1]
         phstart = ops.index_expr(
-            ir.IndexingDiv(h - kernel_size[0] + stride[0], stride[0]), torch.int32
+            ir.FloorDiv(h - kernel_size[0] + stride[0], stride[0]), torch.int32
         )
         pwstart = ops.index_expr(
-            ir.IndexingDiv(w - kernel_size[1] + stride[1], stride[1]), torch.int32
+            ir.FloorDiv(w - kernel_size[1] + stride[1], stride[1]), torch.int32
         )
-        phend = ops.index_expr(ir.IndexingDiv(h, stride[0]) + 1, torch.int32)
-        pwend = ops.index_expr(ir.IndexingDiv(w, stride[1]) + 1, torch.int32)
+        phend = ops.index_expr(ir.FloorDiv(h, stride[0]) + 1, torch.int32)
+        pwend = ops.index_expr(ir.FloorDiv(w, stride[1]) + 1, torch.int32)
 
         phstart = ops.maximum(phstart, ops.constant(0, torch.int32))
         pwstart = ops.maximum(pwstart, ops.constant(0, torch.int32))
@@ -2840,10 +3036,10 @@ def _adaptive_avg_pool2d(x, output_size):
     dtype = x.get_dtype()
 
     def start_index(index, out_dim, inp_dim):
-        return ir.IndexingDiv((index * inp_dim), out_dim)
+        return ir.FloorDiv((index * inp_dim), out_dim)
 
     def end_index(index, out_dim, inp_dim):
-        return ir.IndexingDiv((index + 1) * inp_dim + out_dim - 1, out_dim)
+        return ir.FloorDiv((index + 1) * inp_dim + out_dim - 1, out_dim)
 
     h_start_index = functools.partial(start_index, out_dim=h_out, inp_dim=h_in)
     h_end_index = functools.partial(end_index, out_dim=h_out, inp_dim=h_in)
@@ -3121,13 +3317,13 @@ def fn(idx):
         h = h + padding[0]
         w = w + padding[1]
         phstart = ops.index_expr(
-            ir.IndexingDiv(h - kernel_size[0] + stride[0], stride[0]), torch.int32
+            ir.FloorDiv(h - kernel_size[0] + stride[0], stride[0]), torch.int32
         )
         pwstart = ops.index_expr(
-            ir.IndexingDiv(w - kernel_size[1] + stride[1], stride[1]), torch.int32
+            ir.FloorDiv(w - kernel_size[1] + stride[1], stride[1]), torch.int32
         )
-        phend = ops.index_expr(ir.IndexingDiv(h, stride[0]) + 1, torch.int32)
-        pwend = ops.index_expr(ir.IndexingDiv(w, stride[1]) + 1, torch.int32)
+        phend = ops.index_expr(ir.FloorDiv(h, stride[0]) + 1, torch.int32)
+        pwend = ops.index_expr(ir.FloorDiv(w, stride[1]) + 1, torch.int32)
 
         phstart = ops.maximum(phstart, ops.constant(0, torch.int32))
         pwstart = ops.maximum(pwstart, ops.constant(0, torch.int32))
@@ -3289,11 +3485,17 @@ def mean(x, axis=None, keepdim=False, *, dtype=None):
     return to_dtype(div(sum_result, denom), output_dtype)
 
 
-@register_lowering([aten.var, prims.var])
-def var_(x, axis=None, correction=1, keepdim=False):
+def var_mean_(x, axis, correction, keepdim, return_mean):
+    if correction is None:
+        correction = 1
+
     size = x.get_size()
     axis = _validate_reduction_axis(x, axis)
-    diffs = square(sub(x, mean(x, axis, keepdim=True)))
+    x_mean = mean(x, axis, keepdim=True)
+    if return_mean:
+        x_mean.realize()
+
+    diffs = square(sub(x, x_mean))
     sum_result = sum_(diffs, axis, keepdim)
 
     denom = sympy_product(size[i] for i in axis)
@@ -3301,22 +3503,26 @@ def var_(x, axis=None, correction=1, keepdim=False):
         denom = denom - correction
     denom = ir.IndexingConstant(denom, x.get_dtype(), x.get_device())
     denom = ExpandView.create(denom, list(sum_result.get_size()))
-    return div(sum_result, denom)
+    x_var = div(sum_result, denom)
+    if not return_mean:
+        return x_var
 
+    x_mean = x_mean if keepdim else squeeze(x_mean, axis)
+    return x_var, x_mean
 
-@register_lowering(aten.var_mean)
-def var_mean(x, dim=None, unbiased=True, keepdim=False, correction=None):
-    if correction is None:
-        correction = int(unbiased)
-    return [
-        var_(x, dim, correction=correction, keepdim=keepdim),
-        mean(x, dim, keepdim=keepdim),
-    ]
 
+@register_lowering([aten.var, prims.var])
+def var_(x, axis=None, *, correction=None, keepdim=False):
+    return var_mean_(
+        x, axis=axis, correction=correction, keepdim=keepdim, return_mean=False
+    )
 
-@register_lowering(aten.std)
-def std(x, axis=None, correction=1, keepdim=False):
-    return sqrt(var_(x, axis, correction, keepdim=keepdim))
+
+@register_lowering(aten.var_mean)
+def var_mean(x, axis=None, *, correction=None, keepdim=False):
+    return var_mean_(
+        x, axis=axis, correction=correction, keepdim=keepdim, return_mean=True
+    )
 
 
 def pow_recursive(x, y, dtype):
@@ -3401,7 +3607,9 @@ def mutate_to(changed, val):
         ).data
         assert isinstance(val, ir.StorageBox)
 
-    if isinstance(changed_data, ir.StorageBox) and not changed_data.is_input_buffer():
+    if isinstance(changed_data, ir.StorageBox) and not (
+        changed_data.is_input_buffer() or isinstance(changed_data.data, ir.NopKernel)
+    ):
         # Fast path, just swing the data pointer
         val.realize()
         changed_data.data = val.data
@@ -3537,91 +3745,67 @@ def sum_(x, axis=None, keepdims=False, *, dtype=None):
 add = register_pointwise(
     aten.add, allow_alpha=True, override_fn_when_input_bool="logical_or"
 )
-exp = register_pointwise(
-    aten.exp,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-    use_libdevice_for_f64=True,
-)
-exp2 = register_pointwise(
-    aten.exp2,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-)
-expm1 = register_pointwise(
-    aten.expm1,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-)
+
+
+def register_pointwise_numeric(op):
+    return register_pointwise(
+        op, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
+    )
+
+
+def register_pointwise_numeric_ldf64(op):
+    return register_pointwise(
+        op,
+        type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
+        use_libdevice_for_f64=True,
+    )
+
+
+exp = register_pointwise_numeric_ldf64(aten.exp)
+exp2 = register_pointwise_numeric(aten.exp2)
+expm1 = register_pointwise_numeric(aten.expm1)
 relu = register_pointwise(aten.relu)
-sigmoid = register_pointwise(
-    aten.sigmoid,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-    use_libdevice_for_f64=True,
-)
-sqrt = register_pointwise(
-    aten.sqrt,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-    use_libdevice_for_f64=True,
-)
+sigmoid = register_pointwise_numeric_ldf64(aten.sigmoid)
+sqrt = register_pointwise_numeric_ldf64(aten.sqrt)
 square = register_pointwise(aten.square)
 sub = register_pointwise(aten.sub, allow_alpha=True)
-
-register_pointwise(
-    aten.cos,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-    use_libdevice_for_f64=True,
-)
-register_pointwise(
-    aten.sin,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-    use_libdevice_for_f64=True,
-)
+register_pointwise_numeric_ldf64(aten.cos)
+register_pointwise_numeric_ldf64(aten.sin)
 register_pointwise(aten.abs)
 register_pointwise(aten.bitwise_and)
 register_pointwise(aten.bitwise_not, override_fn_when_input_bool="logical_not")
 register_pointwise(aten.bitwise_or)
 register_pointwise(aten.bitwise_xor)
-register_pointwise(
-    aten.lgamma, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
-)
-erf = register_pointwise(
-    aten.erf, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
-)
+register_pointwise(aten.bitwise_left_shift)
+register_pointwise(aten.bitwise_right_shift)
+register_pointwise_numeric(aten.lgamma)
+erf = register_pointwise_numeric(aten.erf)
 register_lowering(
     aten.special_erf, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
 )(erf)
 
-register_pointwise(
-    aten.log1p,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-)
-
-register_pointwise(
-    aten.tanh,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-)
-
-register_pointwise(
-    aten.log,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-    use_libdevice_for_f64=True,
-)
+register_pointwise_numeric(aten.log1p)
+register_pointwise_numeric(aten.tan)
+register_pointwise_numeric(aten.tanh)
+register_pointwise_numeric_ldf64(aten.log)
 register_pointwise(aten.logical_not, convert_input_to_bool=True)
-register_pointwise(aten.maximum)
-register_pointwise(aten.minimum)
+maximum = register_pointwise(aten.maximum)
+minimum = register_pointwise(aten.minimum)
+register_lowering(aten.clamp_min)(maximum)
+register_lowering(aten.clamp_max)(minimum)
 register_pointwise(aten.neg)
-register_pointwise(
-    aten.reciprocal, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
-)
+register_pointwise_numeric(aten.reciprocal)
 register_pointwise(aten.remainder)
 register_pointwise(aten.sign, override_fn_when_input_bool="identity")
 register_pointwise(aten.ceil)
 register_pointwise(aten.signbit, override_return_dtype=torch.bool)
 
-register_pointwise(aten.le, type_promotion_kind=None, override_return_dtype=torch.bool)
-register_pointwise(aten.lt, type_promotion_kind=None, override_return_dtype=torch.bool)
-register_pointwise(aten.ge, type_promotion_kind=None, override_return_dtype=torch.bool)
-register_pointwise(aten.gt, type_promotion_kind=None, override_return_dtype=torch.bool)
-register_pointwise(aten.eq, type_promotion_kind=None, override_return_dtype=torch.bool)
-register_pointwise(aten.ne, type_promotion_kind=None, override_return_dtype=torch.bool)
+register_pointwise(aten.le, override_return_dtype=torch.bool)
+register_pointwise(aten.lt, override_return_dtype=torch.bool)
+register_pointwise(aten.ge, override_return_dtype=torch.bool)
+register_pointwise(aten.gt, override_return_dtype=torch.bool)
+register_pointwise(aten.eq, override_return_dtype=torch.bool)
+register_pointwise(aten.ne, override_return_dtype=torch.bool)
 logical_and = register_pointwise(
     aten.logical_and,
     type_promotion_kind=None,
@@ -3637,6 +3821,29 @@ def sum_(x, axis=None, keepdims=False, *, dtype=None):
         override_return_dtype=torch.bool,
     )
 )
+logical_xor = register_pointwise(
+    aten.logical_xor,
+    name="bitwise_xor",
+    type_promotion_kind=None,
+    convert_input_to_bool=True,
+    override_return_dtype=torch.bool,
+)
+register_lowering(aten.__xor__, type_promotion_kind=None)(logical_xor)
+
+register_pointwise_numeric(aten.cosh)
+register_pointwise_numeric(aten.sinh)
+register_pointwise_numeric(aten.acos)
+register_pointwise_numeric(aten.acosh)
+register_pointwise_numeric(aten.asin)
+register_pointwise_numeric(aten.asinh)
+register_pointwise_numeric(aten.atan2)
+register_pointwise_numeric(aten.atan)
+register_pointwise_numeric(aten.atanh)
+register_pointwise_numeric(aten.copysign)
+register_pointwise_numeric(aten.erfc)
+register_pointwise_numeric(aten.hypot)
+register_pointwise_numeric(aten.log10)
+register_pointwise_numeric(aten.nextafter)
 
 
 def register_inplace(aten_op, outplace_op):
@@ -3673,44 +3880,8 @@ def sym_numel(a):
     return a.get_numel()
 
 
-@register_lowering(operator.mul)
-def op_mul(a, b):
-    return a * b
-
-
-@register_lowering(operator.add)
-def op_add(a, b):
-    return a + b
-
-
-@register_lowering(operator.sub)
-def op_sub(a, b):
-    return a - b
-
-
-@register_lowering(operator.floordiv)
-def op_floordiv(a, b):
-    return IndexingDiv(a, b)
-
-
-@register_lowering(operator.truediv)
-def op_truediv(a, b):
-    return a / b
-
-
-@register_lowering(math.ceil)
-def op_ceil(a):
-    return sympy.ceiling(a)
-
-
-@register_lowering(math.floor)
-def op_floor(a):
-    return sympy.floor(a)
-
-
-@register_lowering(torch.sym_float)
-def op_sym_float(a):
-    return a
+for method, func in magic_methods.items():
+    register_lowering(method_to_operator(method))(func)
 
 
 @register_lowering(aten._foobar)
@@ -3724,18 +3895,31 @@ def _realize(x):
     return clone(x)
 
 
-def _import_kernels():
-    """
-    Need to make sure all these get registered in the lowers dict
-    """
-    import importlib
-    import os
+try:
+    import torch.distributed._functional_collectives
+
+    @register_lowering(aten.wait_tensor)
+    def wait(input):
+        return TensorBox.create(ir.Wait.create(input))
 
-    from . import kernel
+    @register_lowering(aten.all_reduce)
+    def allreduce(input, reduce_op, tag, ranks, group_size):
+        return TensorBox.create(
+            ir.AllReduce.create(input, reduce_op, tag, ranks, group_size)
+        )
+
+    @register_lowering(aten.all_gather_into_tensor)
+    def all_gather_into_tensor(shard, tag, ranks, group_size):
+        return TensorBox.create(
+            ir.AllGatherIntoTensor.create(shard, tag, ranks, group_size)
+        )
 
-    for filename in sorted(os.listdir(os.path.dirname(kernel.__file__))):
-        if filename.endswith(".py") and filename[0] != "_":
-            importlib.import_module(f"{kernel.__name__}.{filename[:-3]}")
+except ImportError:
+    log.info(
+        "Inductor support for distributed collectives depends on building torch.distributed"
+    )
 
+# populate lowerings defined in kernel/*
+from . import kernel
 
-_import_kernels()
+import_submodule(kernel)
diff --git a/torch/_inductor/mkldnn.py b/torch/_inductor/mkldnn.py
index 5328649a4f61..94eb801621f0 100644
--- a/torch/_inductor/mkldnn.py
+++ b/torch/_inductor/mkldnn.py
@@ -13,18 +13,20 @@
     matches_module_pattern,
     replace_node_module,
 )
+from torch.fx.experimental.symbolic_shapes import guard_int
 from torch.fx.passes.shape_prop import ShapeProp
 from torch.nn.modules.utils import _pair
+from . import config
 
 from .fx_utils import matches_module_function_pattern
 
 
-class UnaryAttr(object):
+class UnaryAttr:
     def __init__(self, op_name: str, scalars_attr=None, algorithm_attr=None):
         self.op_name = op_name
         self.scalars_attr = scalars_attr if scalars_attr else []
         self.algorithm_attr = algorithm_attr if algorithm_attr else ""
-        super(UnaryAttr, self).__init__()
+        super().__init__()
 
     def __call__(self, unary_module: nn.Module):
         if type(unary_module) is nn.ReLU6:
@@ -46,6 +48,12 @@ def is_bfloat16_module(m):
     return weight_is_bf16 and bias_is_bf16
 
 
+def is_group_depthwise_conv_transpose(m):
+    return (
+        type(m) in [nn.ConvTranspose2d] and m.groups > 1 and m.groups == m.in_channels
+    )
+
+
 def check_node_kind(current_node, modules, node_kind):
     if not isinstance(current_node, torch.fx.Node):
         return False
@@ -85,12 +93,6 @@ def check_binary_op_kwargs_is_default(node):
     return True
 
 
-def check_node_is_add_inplace(node):
-    return (node.op == "call_function" and node.target in [operator.iadd]) or (
-        node.op == "call_method" and node.target in ["add_"]
-    )
-
-
 class ConvUnary2d(nn.Conv2d):
     def __init__(
         self,
@@ -98,7 +100,7 @@ def __init__(
         unary: Optional[nn.Module],
         input_size: list,
     ):
-        super(ConvUnary2d, self).__init__(
+        super().__init__(
             conv.in_channels,
             conv.out_channels,
             conv.kernel_size,
@@ -129,7 +131,7 @@ def _update_module_params(self, conv, unary, input_size):
                 self.stride,
                 self.dilation,
                 self.groups,
-                input_size,
+                tuple(guard_int(x) for x in input_size),
             ),
             requires_grad=self.weight.requires_grad,
         )
@@ -174,7 +176,7 @@ def __init__(
         binary_op_name: str,
         input_size: list,
     ):
-        super(ConvBinary2d, self).__init__(
+        super().__init__(
             conv.in_channels,
             conv.out_channels,
             conv.kernel_size,
@@ -203,7 +205,7 @@ def _update_module_params(self, conv, binary_op_name, input_size):
                 self.stride,
                 self.dilation,
                 self.groups,
-                input_size,
+                tuple(guard_int(x) for x in input_size),
             ),
             requires_grad=self.weight.requires_grad,
         )
@@ -252,94 +254,9 @@ def forward(self, input, other):
         return self._conv_forward(input, other, self.weight, self.bias)
 
 
-class ConvBinaryInplace2d(nn.Conv2d):
-    def __init__(
-        self,
-        conv: nn.Module,
-        binary_op_name: str,
-        input_size: list,
-    ):
-        super(ConvBinaryInplace2d, self).__init__(
-            conv.in_channels,
-            conv.out_channels,
-            conv.kernel_size,
-            conv.stride,
-            conv.padding,
-            conv.dilation,
-            conv.groups,
-            conv.bias is not None,
-            conv.padding_mode,
-            conv.weight.device,
-            conv.weight.dtype,
-        )
-        self._update_module_params(conv, binary_op_name, input_size)
-
-    def _update_module_params(self, conv, binary_op_name, input_size):
-        self.__dict__ = copy.deepcopy(conv.__dict__)
-        self.binary_attr = binary_op_name
-        self.binary_alpha = None
-        self.unary_attr = None
-        self.unary_scalars = []
-        self.unary_algorithm = None
-        self.weight = torch.nn.Parameter(
-            torch._C._nn.mkldnn_reorder_conv2d_weight(
-                self.weight.to_mkldnn(),
-                self.padding,
-                self.stride,
-                self.dilation,
-                self.groups,
-                input_size,
-            ),
-            requires_grad=self.weight.requires_grad,
-        )
-
-    def _update_unary_params(self, unary):
-        self.unary_attr, self.unary_scalars, self.unary_algorithm = unary_modules_map[
-            unary.__class__
-        ](unary)
-
-    def _conv_forward(self, input, other, weight, bias):
-        if self.padding_mode != "zeros":
-            return torch.ops.mkldnn._convolution_pointwise_(
-                F.pad(
-                    input, self._reversed_padding_repeated_twice, mode=self.padding_mode
-                ),
-                other,
-                weight,
-                bias,
-                _pair(0),
-                self.stride,
-                self.dilation,
-                self.groups,
-                self.binary_attr,
-                self.binary_alpha,
-                self.unary_attr,
-                self.unary_scalars,
-                self.unary_algorithm,
-            )
-        return torch.ops.mkldnn._convolution_pointwise_(
-            input,
-            other,
-            weight,
-            bias,
-            self.padding,
-            self.stride,
-            self.dilation,
-            self.groups,
-            self.binary_attr,
-            self.binary_alpha,
-            self.unary_attr,
-            self.unary_scalars,
-            self.unary_algorithm,
-        )
-
-    def forward(self, input, other):
-        return self._conv_forward(input, other, self.weight, self.bias)
-
-
 class PackedLinear(nn.Linear):
     def __init__(self, linear: nn.Module, input_size: list):
-        super(PackedLinear, self).__init__(
+        super().__init__(
             linear.in_features,
             linear.out_features,
             linear.bias is not None,
@@ -350,7 +267,7 @@ def __init__(self, linear: nn.Module, input_size: list):
 
     def _update_module_params(self, linear, input_size):
         self.__dict__ = copy.deepcopy(linear.__dict__)
-        self.batch_size = int(reduce(lambda x, y: x * y, input_size) / input_size[-1])
+        self.batch_size = reduce(lambda x, y: x * y, input_size[:-1])
         self.packed_weight = torch.nn.Parameter(
             torch.ops.mkl._mkl_reorder_linear_weight(
                 self.weight.to_mkldnn(), self.batch_size
@@ -371,7 +288,7 @@ def __init__(
         linear: nn.Module,
         unary: nn.Module,
     ):
-        super(LinearUnary, self).__init__(
+        super().__init__(
             linear.in_features,
             linear.out_features,
             linear.bias is not None,
@@ -395,7 +312,7 @@ def forward(self, input):
 
 class LinearBinary(nn.Linear):
     def __init__(self, linear: nn.Module, binary_op_name: str):
-        super(LinearBinary, self).__init__(
+        super().__init__(
             linear.in_features,
             linear.out_features,
             linear.bias is not None,
@@ -416,6 +333,83 @@ def forward(self, input, other):
         return y
 
 
+class ConvTransposeUnary2d(nn.ConvTranspose2d):
+    def __init__(
+        self,
+        conv_transpose: nn.Module,
+        unary: Optional[nn.Module],
+        input_size: list,
+    ):
+        super().__init__(
+            conv_transpose.in_channels,
+            conv_transpose.out_channels,
+            conv_transpose.kernel_size,
+            conv_transpose.stride,
+            conv_transpose.padding,
+            conv_transpose.output_padding,
+            conv_transpose.groups,
+            conv_transpose.bias is not None,
+            conv_transpose.dilation,
+            conv_transpose.padding_mode,
+            conv_transpose.weight.device,
+            conv_transpose.weight.dtype,
+        )
+        self._update_module_params(conv_transpose, unary, input_size)
+
+    def _update_module_params(self, conv_transpose, unary, input_size):
+        self.__dict__ = copy.deepcopy(conv_transpose.__dict__)
+        self.attr, self.scalars, self.algorithm = (
+            unary_modules_map[unary.__class__](unary) if unary else ("none", [], "")
+        )
+        packed_weight = torch.ops.mkldnn._reorder_convolution_transpose_weight(
+            self.weight.to_mkldnn(),
+            self.padding,
+            self.output_padding,
+            self.stride,
+            self.dilation,
+            self.groups,
+            input_size,
+        )
+        self.weight = torch.nn.Parameter(
+            packed_weight,
+            requires_grad=self.weight.requires_grad,
+        )
+
+    def _conv_transpose_forward(self, input, weight, bias):
+        if self.padding_mode != "zeros":
+            return torch.ops.mkldnn._convolution_transpose_pointwise(
+                F.pad(
+                    input, self._reversed_padding_repeated_twice, mode=self.padding_mode
+                ),
+                weight,
+                bias,
+                _pair(0),
+                self.output_padding,
+                self.stride,
+                self.dilation,
+                self.groups,
+                self.attr,
+                self.scalars,
+                self.algorithm,
+            )
+        return torch.ops.mkldnn._convolution_transpose_pointwise(
+            input,
+            weight,
+            bias,
+            self.padding,
+            self.output_padding,
+            self.stride,
+            self.dilation,
+            self.groups,
+            self.attr,
+            self.scalars,
+            self.algorithm,
+        )
+
+    def forward(self, input):
+        return self._conv_transpose_forward(input, self.weight, self.bias)
+
+
 def packed_conv_eval(conv: nn.Module, input_size: list):
     assert not (conv.training), "Fusion only for eval!"
     return ConvUnary2d(
@@ -425,6 +419,15 @@ def packed_conv_eval(conv: nn.Module, input_size: list):
     )
 
 
+def packed_conv_transpose_eval(conv_transpose: nn.Module, input_size: list):
+    assert not (conv_transpose.training), "Fusion only for eval!"
+    return ConvTransposeUnary2d(
+        conv_transpose,
+        None,
+        input_size,
+    )
+
+
 def fused_conv_unary_eval(conv: nn.Module, unary: nn.Module, input_size: list):
     assert not (conv.training), "Fusion only for eval!"
     return ConvUnary2d(
@@ -443,17 +446,6 @@ def fused_conv_binary_eval(conv: nn.Module, binary_op_name: str, input_size: lis
     )
 
 
-def fused_conv_binary_inplace_eval(
-    conv: nn.Module, binary_op_name: str, input_size: list
-):
-    assert not (conv.training), "Fusion only for eval!"
-    return ConvBinaryInplace2d(
-        conv,
-        binary_op_name,
-        input_size,
-    )
-
-
 def fused_conv_binary_unary_eval(
     conv_binary: nn.Module, unary: nn.Module, input_size: list
 ):
@@ -485,9 +477,22 @@ def fused_linear_binary_eval(linear: nn.Module, attr: str, input_size: list):
     return linear_binary
 
 
+def fused_conv_transpose_unary_eval(
+    conv_transpose: nn.Module, unary: nn.Module, input_size: list
+):
+    assert not (conv_transpose.training), "Fusion only for eval!"
+    return ConvTransposeUnary2d(
+        conv_transpose,
+        unary,
+        input_size,
+    )
+
+
 def mkldnn_fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     is_cpu = all(
-        example_input.device == torch.device("cpu") for example_input in example_inputs
+        example_input.device == torch.device("cpu")
+        for example_input in example_inputs
+        if isinstance(example_input, torch.Tensor)
     )
 
     # make sure the autograd is disabled.
@@ -503,17 +508,19 @@ def mkldnn_fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     fake_mode = fake_mode_from_tensors(example_inputs)
     ShapeProp(gm, fake_mode=fake_mode).propagate(*example_inputs)
     gm = fuse_unary(gm)
-    gm = fuse_binary_inplace(gm)
     gm = fuse_binary(gm)
     # why re-run fuse_unary? we want to enable conv+binary+unary fusion,
     # such as conv+add+relu for vision model.
     gm = fuse_unary(gm)
-    gm = pack_module(gm)
+    if config.cpp.weight_prepack:
+        gm = pack_module(gm)
     return gm
 
 
 def create_unary_module(node: torch.fx.node):
-    assert node.op == "call_function", "The current node should be a function node"
+    assert (
+        node.op == "call_function" or node.op == "call_method"
+    ), "The current node should be a function/method node"
     unary_map = {
         F.relu: nn.ReLU,
         F.sigmoid: nn.Sigmoid,
@@ -524,6 +531,13 @@ def create_unary_module(node: torch.fx.node):
         F.gelu: nn.GELU,
         F.relu6: nn.ReLU6,
         F.silu: nn.SiLU,
+        F.hardsigmoid: nn.Hardsigmoid,
+        torch.relu: nn.ReLU,
+        torch.sigmoid: nn.Sigmoid,
+        torch.tanh: nn.Tanh,
+        "relu": nn.ReLU,
+        "sigmoid": nn.Sigmoid,
+        "tanh": nn.Tanh,
     }
     return unary_map[node.target](*(node.args[1:]), **(node.kwargs))
 
@@ -545,7 +559,7 @@ def fuse_unary(gm: torch.fx.GraphModule):
                 ):  # Output of computation_node is used by other nodes
                     continue
                 computation_node = modules[node.args[0].target]
-                if node.op == "call_function":
+                if node.op == "call_function" or node.op == "call_method":
                     # make sure unary function's inputs only one fx.node(others should be constant value).
                     if any(isinstance(v, torch.fx.Node) for v in node.args[1:]) or any(
                         isinstance(v, torch.fx.Node) for _, v in node.kwargs.items()
@@ -564,16 +578,18 @@ def fuse_unary(gm: torch.fx.GraphModule):
                 ):
                     continue
                 # TODO: support more conv+binary+unary fusion.
-                if type(computation_node) in [
-                    ConvBinary2d,
-                    ConvBinaryInplace2d,
-                ] and type(unary_node) not in [nn.ReLU]:
+                if type(computation_node) in [ConvBinary2d] and type(
+                    unary_node
+                ) not in [nn.ReLU]:
                     continue
                 # only fuse for linear when the dtype is bf16
                 if type(computation_node) in [nn.Linear] and not is_bfloat16_module(
                     computation_node
                 ):
                     continue
+                # TODO: remove this when group depthwise ConvTranspose is supported
+                if is_group_depthwise_conv_transpose(computation_node):
+                    continue
                 computation_node_input_size = (
                     node.args[0].args[0].meta.get("tensor_meta").shape
                 )
@@ -638,6 +654,9 @@ def fuse_binary(gm: torch.fx.GraphModule):
                         if len(node.args[index_node].users) > 1:
                             continue
                         computation_node = modules[node.args[index_node].target]
+                        if computation_node.training:
+                            continue
+
                         # TODO: support padding str input("valid", "same").
                         if type(computation_node) in [nn.Conv2d] and isinstance(
                             computation_node.padding, str
@@ -666,45 +685,29 @@ def fuse_binary(gm: torch.fx.GraphModule):
     return gm
 
 
-def fuse_binary_inplace(gm: torch.fx.GraphModule):
-    modules = dict(gm.named_modules())
+def convert_outplace_to_inplace(gm: torch.fx.GraphModule):
+    if not (torch.backends.mkldnn.enabled and torch.backends.mkldnn.is_available()):
+        return gm
+    # This function is about replace outplace with inplace for better performance(external call),
+    # which happen after AOTAutograd.
     for node in gm.graph.nodes:
-        if check_node_is_add_inplace(node) and check_binary_op_kwargs_is_default(node):
-            for (
-                node_kind,
-                fuse_func,
-            ) in computation_op_binary_op_fusion_inplace_map.items():
-                if not isinstance(node.args[0], torch.fx.Node) or not isinstance(
-                    node.args[1], torch.fx.Node
-                ):
-                    continue
-                if not binary_inputs_meta_is_same(node):
-                    continue
-                if check_node_kind(node.args[1], modules, node_kind):
-                    if len(node.args[1].users) > 1:
-                        continue
-                    # make sure the output and input are not same tensor.
-                    if node.args[1].args[0] == node.args[0]:
-                        continue
-                    computation_node = modules[node.args[1].target]
-                    # TODO: support padding str input("valid", "same").
-                    if type(computation_node) in [nn.Conv2d] and isinstance(
-                        computation_node.padding, str
-                    ):
-                        continue
-                    replace_and_fuse_for_binary(
-                        computation_node,
-                        node,
-                        fuse_func,
-                        "add",
-                        modules,
-                        1,  # conv module index
-                        0,  # binary op index
-                    )
-                    # Make sure the fused node is post node of node's inputs nodes.
-                    node.append(node.args[1])
-                    gm.graph.erase_node(node)
-                    break
+        if node.op == "call_function" and node.target in [
+            torch.ops.mkldnn._convolution_pointwise.binary
+        ]:
+            # args[0] and args[1] is _convolution_pointwise.binary's input,
+            # need to check whether args[1] can be written or not.
+            if node.args[1].op in ["placeholder", "output"]:
+                continue
+            # TODO: node.args[1].users > 1, but node.args[1] never be used after current node.
+            if len(node.args[1].users) > 1:
+                continue
+            if node.args[1] == node.args[0]:
+                continue
+            binary_attr = node.args[8]
+            unary_attr = node.args[10]
+            if binary_attr != "add" or unary_attr not in ["", "relu"]:
+                continue
+            node.target = torch.ops.mkldnn._convolution_pointwise_.binary
     gm.graph.lint()
     gm.recompile()
     return gm
@@ -717,16 +720,26 @@ def pack_module(gm: torch.fx.GraphModule):
             assert isinstance(node.target, str)
             cur_module = modules[node.target]
             if type(cur_module) in computation_op_packed_map:
+                if cur_module.training:
+                    continue
                 computation_node_input_meta = node.args[0].meta.get("tensor_meta")
                 if computation_node_input_meta.dtype != torch.float32:
                     continue
                 if type(cur_module) in [torch.nn.Linear] and not torch._C.has_mkl:
                     continue
                 computation_node_input_size = computation_node_input_meta.shape
+                if (
+                    type(cur_module) in [torch.nn.Linear]
+                    and len(computation_node_input_size) < 2
+                ):
+                    continue
                 if type(cur_module) in [nn.Conv2d] and isinstance(
                     cur_module.padding, str
                 ):
                     continue
+                # TODO: remove this when group depthwise ConvTranspose is supported
+                if is_group_depthwise_conv_transpose(cur_module):
+                    continue
                 new_module = computation_op_packed_map[type(cur_module)](
                     cur_module, computation_node_input_size
                 )
@@ -741,7 +754,7 @@ def pack_module(gm: torch.fx.GraphModule):
     nn.Conv2d: fused_conv_unary_eval,
     nn.Linear: fused_linear_unary_eval,
     ConvBinary2d: fused_conv_binary_unary_eval,
-    ConvBinaryInplace2d: fused_conv_binary_unary_eval,
+    nn.ConvTranspose2d: fused_conv_transpose_unary_eval,
 }
 
 
@@ -755,6 +768,7 @@ def pack_module(gm: torch.fx.GraphModule):
     nn.GELU: UnaryAttr("gelu", algorithm_attr="approximate"),
     nn.ReLU6: UnaryAttr("hardtanh", scalars_attr=["min_val", "max_val"]),
     nn.SiLU: UnaryAttr("swish"),
+    nn.Hardsigmoid: UnaryAttr("hardsigmoid"),
 }
 
 unary_ops = [
@@ -768,6 +782,7 @@ def pack_module(gm: torch.fx.GraphModule):
     nn.GELU,
     nn.ReLU6,
     nn.SiLU,
+    nn.Hardsigmoid,
     # functional
     F.relu,
     F.sigmoid,
@@ -778,6 +793,14 @@ def pack_module(gm: torch.fx.GraphModule):
     F.gelu,
     F.relu6,
     F.silu,
+    F.hardsigmoid,
+    torch.relu,
+    torch.sigmoid,
+    torch.tanh,
+    # methods (torch.Tensor.xxx)
+    "relu",
+    "sigmoid",
+    "tanh",
 ]
 
 
@@ -801,14 +824,10 @@ def pack_module(gm: torch.fx.GraphModule):
 }
 
 
-computation_op_binary_op_fusion_inplace_map = {
-    nn.Conv2d: fused_conv_binary_inplace_eval,
-}
-
-
 computation_op_packed_map = {
     nn.Linear: packed_linear_eval,
     nn.Conv2d: packed_conv_eval,
+    nn.ConvTranspose2d: packed_conv_transpose_eval,
 }
 
 
diff --git a/torch/_inductor/optimize_indexing.py b/torch/_inductor/optimize_indexing.py
index 027a51dd7f71..e4728275be12 100644
--- a/torch/_inductor/optimize_indexing.py
+++ b/torch/_inductor/optimize_indexing.py
@@ -1,287 +1,19 @@
-import dataclasses
 import functools
-import itertools
 import logging
 import math
-import operator
 from typing import Dict, Iterable, Union
 
 import sympy
 
 import torch
-from .ir import IndexingDiv, InterpreterShim, LoopBody, ModularIndexing
+from torch.utils._sympy.value_ranges import ValueRangeAnalysis, ValueRanges
+from .ir import FloorDiv, InterpreterShim, LoopBody, ModularIndexing
 from .utils import sympy_subs
 from .virtualized import V
 
 log = logging.getLogger(__name__)
 
 
-@dataclasses.dataclass(frozen=True)
-class ValueRanges(object):
-    lower: Union[sympy.Expr, sympy.Number, int, float, bool]
-    upper: Union[sympy.Expr, sympy.Number, int, float, bool]
-
-    def __contains__(self, x):
-        # TODO This needs to be generalised if lower/upper are sympy.Expr
-        assert not isinstance(x, sympy.Expr)
-        return self.lower <= x <= self.upper
-
-    @classmethod
-    def wrap(cls, arg):
-        if isinstance(arg, ValueRanges):
-            return arg
-        assert isinstance(arg, (int, float, bool))
-        return ValueRanges(arg, arg)
-
-    @classmethod
-    def increasing_map(cls, x, fn):
-        """map lower and upper bound with fn"""
-        x = cls.wrap(x)
-        return ValueRanges(fn(x.lower), fn(x.upper))
-
-    @classmethod
-    def decreasing_map(cls, x, fn):
-        """map lower bound to upper bound and upper bound to lower bound"""
-        x = cls.wrap(x)
-        return ValueRanges(fn(x.upper), fn(x.lower))
-
-    @classmethod
-    def monotone_map(cls, x, fn):
-        """check the max and min of computed upper and lower bound for the output"""
-        x = cls.wrap(x)
-        l = fn(x.lower)
-        u = fn(x.upper)
-        return ValueRanges(min(l, u), max(l, u))
-
-    @classmethod
-    def convex_min_zero_map(cls, x, fn):
-        """the max is at one of the ends"""
-        x = ValueRanges.wrap(x)
-        if 0 in x:
-            return ValueRanges(0, max(fn(x.lower), fn(x.upper)))
-        else:
-            return cls.monotone_map(x, fn)
-
-    @classmethod
-    def coordinatewise_increasing_map(cls, x, y, fn):
-        """map upper and lower bounds accessing corresponding values of inputs"""
-        x, y = cls.wrap(x), cls.wrap(y)
-        return ValueRanges(
-            fn(x.lower, y.lower),
-            fn(x.upper, y.upper),
-        )
-
-    @classmethod
-    def coordinatewise_monotone_map(cls, x, y, fn):
-        """compute the product of all lower and upper bounds and take min and max"""
-        x, y = cls.wrap(x), cls.wrap(y)
-        products = [
-            fn(a, b)
-            for a, b in itertools.product([x.lower, x.upper], [y.lower, y.upper])
-        ]
-        return ValueRanges(min(products), max(products))
-
-
-class ValueRangeAnalysis(object):
-    def __init__(self):
-        self.name = "ValueRangeAnalysis"
-        boolean_operators = (
-            "eq",
-            "ne",
-            "lt",
-            "gt",
-            "le",
-            "ge",
-            "and_",
-            "or_",
-            "xor",
-            "logical_and",
-            "logical_or",
-            "logical_not",
-        )
-        for op in boolean_operators:
-            setattr(self, op, self.bool_handler)
-
-    @staticmethod
-    def bool_handler(*args, **kwargs):
-        # just assuming bools can have both values
-        return ValueRanges(
-            sympy.logic.boolalg.BooleanFalse, sympy.logic.boolalg.BooleanTrue
-        )
-
-    @staticmethod
-    def default_handler(*args, **kwargs):
-        # many ops are unlikely to show up in optimizable indexing compute,
-        # so we dont have full coverage
-        return ValueRanges(-math.inf, math.inf)
-
-    def load(self, name: str, index: sympy.Expr):
-        return ValueRanges(-math.inf, math.inf)
-
-    def store(self, name, index, value, mode=None):
-        return
-
-    def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
-        return ValueRanges(-math.inf, math.inf)
-
-    def index_expr(self, index, dtype):
-        assert isinstance(index, ValueRanges)
-        return index
-
-    @staticmethod
-    def to_dtype(x, dtype: torch.dtype):
-        def is_bool(val):
-            return isinstance(val, bool) or (
-                hasattr(val, "is_Boolean") and val.is_Boolean
-            )
-
-        x = ValueRanges.wrap(x)
-        low, up = x.lower, x.upper
-        if is_bool(low):
-            assert is_bool(up)
-            if dtype.is_floating_point:
-                return ValueRanges(sympy.Float(0.0), sympy.Float(1.0))
-            else:
-                return ValueRanges(sympy.Integer(0), sympy.Integer(1))
-        return ValueRanges.wrap(x)
-
-    @staticmethod
-    def constant(value, dtype):
-        # using nan makes subsequent computation throw, and for the purposes of optimization
-        # returning -math.inf - math.inf is equivalent to giving up
-        if math.isnan(value):
-            return ValueRanges(-math.inf, math.inf)
-        if isinstance(value, int):
-            return ValueRanges(sympy.Integer(value), sympy.Integer(value))
-        else:
-            return ValueRanges(sympy.Float(value), sympy.Float(value))
-
-    @staticmethod
-    def reciprocal(x):
-        x = ValueRanges.wrap(x)
-        if 0 in x:
-            return ValueRanges(-math.inf, math.inf)
-        else:
-            return ValueRanges.decreasing_map(x, lambda y: 1 / y)
-
-    @staticmethod
-    def square(x):
-        return ValueRanges.convex_min_zero_map(x, lambda y: y * y)
-
-    @staticmethod
-    def abs(x):
-        return ValueRanges.convex_min_zero_map(x, abs)
-
-    @staticmethod
-    def neg(x):
-        return ValueRanges.decreasing_map(x, operator.neg)
-
-    @staticmethod
-    def truediv(a, b):
-        b = ValueRanges.wrap(b)
-        if 0 in b:
-            return ValueRanges(-math.inf, math.inf)
-        else:
-            return ValueRangeAnalysis.mul(a, ValueRanges(1 / b.upper, 1 / b.lower))
-
-    @staticmethod
-    def div(a, b):
-        # We think of this as floor(a / b)
-        out = ValueRangeAnalysis.truediv(a, b)
-        return ValueRangeAnalysis.floor(out)
-
-    @staticmethod
-    def add(a, b):
-        return ValueRanges.coordinatewise_increasing_map(a, b, operator.add)
-
-    @staticmethod
-    def mul(a, b):
-        return ValueRanges.coordinatewise_monotone_map(a, b, operator.mul)
-
-    @staticmethod
-    def sub(a, b):
-        b = ValueRanges.wrap(b)
-        return ValueRangeAnalysis.add(a, ValueRanges(-b.upper, -b.lower))
-
-    @staticmethod
-    def exp(x):
-        return ValueRanges.increasing_map(x, sympy.functions.elementary.exponential.exp)
-
-    @staticmethod
-    def log(x):
-        return ValueRanges.increasing_map(
-            x, lambda y: -math.inf if y <= 0 else sympy.log(y)
-        )
-
-    @staticmethod
-    def sqrt(x):
-        return ValueRanges.increasing_map(x, sympy.sqrt)
-
-    @staticmethod
-    def pow(a, b):
-        def is_integer(val):
-            return (
-                isinstance(val, int)
-                or (isinstance(val, float) and val == int(val))
-                or (hasattr(val, "is_integer") and val.is_integer)
-            )
-
-        a = ValueRanges.wrap(a)
-        b = ValueRanges.wrap(b)
-        if a.lower < 0 and not is_integer(b.lower):
-            # The function is not defined
-            return ValueRanges(-math.inf, math.inf)
-        elif 0 in a and b.lower <= 0:
-            return ValueRanges(-math.inf, math.inf)
-        return ValueRanges.coordinatewise_monotone_map(a, b, operator.pow)
-
-    @staticmethod
-    def minimum(a, b):
-        return ValueRanges.coordinatewise_increasing_map(a, b, min)
-
-    @staticmethod
-    def maximum(a, b):
-        return ValueRanges.coordinatewise_increasing_map(a, b, max)
-
-    @staticmethod
-    def where(a, b, c):
-        b = ValueRanges.wrap(b)
-        c = ValueRanges.wrap(c)
-        return ValueRanges(min(b.lower, c.lower), max(b.upper, c.upper))
-
-    @staticmethod
-    def floor(x):
-        return ValueRangeAnalysis.floor_ceil(
-            x, sympy.functions.elementary.integers.floor
-        )
-
-    @staticmethod
-    def ceil(x):
-        return ValueRangeAnalysis.floor_ceil(
-            x, sympy.functions.elementary.integers.ceiling
-        )
-
-    @staticmethod
-    def floor_ceil(x, fn_int):
-        def is_integer(val):
-            return isinstance(val, int) or (
-                hasattr(val, "is_integer") and val.is_integer
-            )
-
-        if is_integer(x):
-            fn = fn_int
-        else:
-
-            def fn(x):
-                return sympy.core.numbers.Float(fn_int(x))
-
-        return ValueRanges.increasing_map(x, fn)
-
-    def __getattr__(self, name):
-        log.warning(f"unhandled ValueRange op {name}")
-        return self.default_handler
-
-
 def dominated_nodes(
     initial_queue: Union[torch.fx.Node, Iterable[torch.fx.Node]], skip_filter=None
 ):
@@ -331,7 +63,7 @@ def range_expressable_in_32_bits(range):
     )
 
 
-class OptimizeIndexing(object):
+class OptimizeIndexing:
     """
     Performs Value Range Analysis on LoopBody's fx graph to reduce precision of
     intermediaries from int64 to int32. This is an important optimization for indexing
@@ -528,7 +260,7 @@ def indexing_div_rep(x, y):
                 return x / y
 
             return expr.replace(ModularIndexing, mod_indexing_rep).replace(
-                IndexingDiv, indexing_div_rep
+                FloorDiv, indexing_div_rep
             )
 
         symbols = expr.free_symbols
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index e129b742e4a3..8878fb0d1b82 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -4,6 +4,7 @@
 import weakref
 
 import torch
+import torch._dynamo.config as dynamo_config
 import torch.nn as nn
 from torch import _prims
 from torch._dynamo.utils import fake_mode_from_tensors
@@ -63,7 +64,9 @@ def replace_fx(gm: torch.fx.GraphModule):
 
 def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     is_cpu = all(
-        example_input.device == torch.device("cpu") for example_input in example_inputs
+        example_input.device == torch.device("cpu")
+        for example_input in example_inputs
+        if isinstance(example_input, torch.Tensor)
     )
 
     fake_mode = fake_mode_from_tensors(example_inputs)
@@ -85,7 +88,12 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     gm = remove_identity(gm)
     gm = fuse_conv_bn(gm)
     # do mkldnn fusion(conv(linear)+unary(binary)
-    gm = mkldnn_fuse_fx(gm, example_inputs)
+    # This is skipped when dynamic shapes is enabled, as the resulting
+    # mkl packing ops don't support dynamic shapes.  Once they do support,
+    # you can remove this.  A good test case is wav2vec2, see
+    # https://github.com/pytorch/pytorch/issues/91719
+    if not dynamo_config.dynamic_shapes:
+        gm = mkldnn_fuse_fx(gm, example_inputs)
     return gm
 
 
@@ -290,15 +298,11 @@ def is_pointwise_unary(node):
 
         if user and is_pointwise_unary(user):
             with g.inserting_before(node):
-                new_args = (
-                    [
-                        g.create_node(
-                            user.op, user.target, args=(arg,), kwargs=user.kwargs
-                        )
-                        for arg in node.args[0]
-                    ],
-                )
-                node.args = new_args
+                new_tensors = [
+                    g.create_node(user.op, user.target, args=(arg,), kwargs=user.kwargs)
+                    for arg in node.args[0]
+                ]
+                node.args = (new_tensors,) + node.args[1:]
                 user.replace_all_uses_with(cat_or_view)
                 g.erase_node(user)
     g.lint()
diff --git a/torch/_inductor/pattern_matcher.py b/torch/_inductor/pattern_matcher.py
new file mode 100644
index 000000000000..1a796211e0b5
--- /dev/null
+++ b/torch/_inductor/pattern_matcher.py
@@ -0,0 +1,609 @@
+import dataclasses
+import functools
+import inspect
+import itertools
+import logging
+import operator
+import os
+from collections import defaultdict
+from typing import Any, Callable, List, Union
+
+import torch
+import torch._inductor as inductor
+import torch.fx
+import torch.utils._pytree as pytree
+from torch._dynamo.utils import counters
+from torch.fx.immutable_collections import immutable_dict, immutable_list
+
+from . import config, ir
+from .lowering import lowerings as L
+from .virtualized import V
+
+log = logging.getLogger(__name__)
+aten = torch.ops.aten
+
+Constant = Any
+NodeOrConstant = Union[Constant, torch.fx.Node]
+
+
+class Match:
+    """
+    Represents a successfully matched pattern.
+    """
+
+    def __init__(self, pattern, args=None, kwargs=None):
+        super().__init__()
+        self.pattern = pattern
+        # The input nodes that must be passed in to the result
+        self.args = args or []
+        self.kwargs = kwargs or {}
+        # The nodes matched in this expression
+        self.nodes = []
+        # Mapping CallFunction to the node.target
+        self.targets = {}
+
+    def extend(self, other):
+        if self.kwargs:
+            for key in set(self.kwargs.keys()) & set(other.kwargs.keys()):
+                if self.kwargs[key] != other.kwargs[key]:
+                    raise FailedMatch(f"kwarg mismatch: {key}")
+        self.args.extend(other.args)
+        self.nodes.extend(other.nodes)
+        self.kwargs.update(other.kwargs)
+        self.targets.update(other.targets)
+
+    def bundle(self):
+        # Wrap args in an extra list
+        self.args = [tuple(self.args)]
+        return self
+
+    def __repr__(self):
+        return f"Match(..., {self.args}, {self.kwargs})"
+
+    def erase_nodes(self, graph: torch.fx.Graph):
+        for n in reversed(self.nodes):
+            graph.erase_node(n)
+
+
+class FailedMatch(RuntimeError):
+    def __bool__(self):
+        return False
+
+
+class MatchContext:
+    """
+    State needed while running PatternExpr._match().
+    """
+
+    def __init__(self, outputs: List["PatternExpr"]):
+        self.outputs = outputs
+        self.pattern_to_node = {}
+
+    def match(self, pattern, node):
+        """wrapper to check reused nodes in patterns"""
+        if pattern in self.pattern_to_node:
+            if self.pattern_to_node[pattern] == node:
+                return Match(pattern)  # already checked this node
+            else:
+                return FailedMatch("repeated pattern differs")
+        m = pattern._match(node, self)
+        assert pattern not in self.pattern_to_node
+        self.pattern_to_node[pattern] = node if m else None
+        return m
+
+
+class PatternExpr:
+    """
+    Base class for types of patterns
+    """
+
+    def _match(self, node: torch.fx.Node, outputs) -> Union[Match, FailedMatch]:
+        raise NotImplementedError()
+
+    def match(self, node: torch.fx.Node) -> Union[Match, FailedMatch]:
+        try:
+            return MatchContext([self]).match(self, node)
+        except FailedMatch as e:
+            return e
+
+    def __repr__(self):
+        return self.__class__.__name__ + "()"
+
+
+class Arg(PatternExpr):
+    """
+    Capture an arg which will become an input to the handler.  Args are
+    passed in depth first order.
+    """
+
+    def _match(self, node: NodeOrConstant, ctx: MatchContext):
+        return Match(self, args=[node])  # matches anything
+
+
+class KeywordArg(PatternExpr):
+    """
+    Capture a kwarg which will become an input to the handler.
+    """
+
+    def __init__(self, name):
+        super().__init__()
+        self.name = name
+
+    def _match(self, node: NodeOrConstant, ctx: MatchContext):
+        return Match(self, kwargs={self.name: node})  # matches anything
+
+
+class CallFunction(PatternExpr):
+    """
+    Matches a call_function node in the FX graps: `fns[i](*args, **kwargs)`
+    """
+
+    def __init__(self, fns, *args, _users=1, **kwargs):
+        super().__init__()
+        fns = [fns] if callable(fns) else list(fns)
+        for fn in list(fns):
+            if isinstance(fn, torch._ops.OpOverloadPacket):
+                fns.extend([getattr(fn, overload) for overload in fn.overloads()])
+
+        self.fns = fns
+        self.fns_set = set(fns)
+        self.args = tuple(args)
+        self.kwargs = dict(kwargs)
+        self.users = _users
+        if any(
+            isinstance(x, (dict, list, tuple))
+            for x in itertools.chain(args, kwargs.values())
+        ):
+            self.flatten = self.pytree_flatten
+        else:
+            self.flatten = self.simple_flatten
+        self.flat_args_kwargs = self.flatten(self.args, self.kwargs)
+
+    @staticmethod
+    def simple_flatten(args, kwargs):
+        return (*args, *kwargs.values()), (len(args), *kwargs.keys())
+
+    @staticmethod
+    def pytree_flatten(args, kwargs):
+        def norm_spec(s: pytree.TreeSpec):
+            if s.type is None:
+                return s
+            mapping = {immutable_list: list, tuple: list, immutable_dict: dict}
+            return pytree.TreeSpec(
+                mapping.get(s.type, s.type),
+                s.context,
+                list(map(norm_spec, s.children_specs)),
+            )
+
+        flat, spec = pytree.tree_flatten([args, kwargs])
+        spec = norm_spec(spec)
+        return flat, spec
+
+    def __repr__(self):
+        args = [
+            f"[{self.fns[0].__name__}, ...]",
+            *map(repr, self.args),
+            *[f"{k}={v}" for k, v in self.kwargs.items()],
+        ]
+        return f"{self.__class__.__name__}({', '.join(args)})"
+
+    def _match(self, node: torch.fx.Node, ctx: MatchContext):
+        if (
+            not isinstance(node, torch.fx.Node)
+            or node.op != "call_function"
+            or node.target not in self.fns_set
+            or len(node.args) != len(self.args)
+            or len(node.kwargs) != len(self.kwargs)
+        ):
+            return FailedMatch("function_mismatch")
+
+        if self not in ctx.outputs and len(node.users) != self.users:
+            return FailedMatch("multiple_users")
+
+        node_items, node_spec = self.flatten(node.args, node.kwargs)
+        self_items, self_spec = self.flat_args_kwargs
+        if node_spec != self_spec:
+            return FailedMatch(f"args_structure {node_spec} {self_spec}")
+        assert len(node_items) == len(self_items)
+
+        m = Match(self)
+        for i, pattern, child_node in zip(itertools.count(), self_items, node_items):
+            if isinstance(pattern, PatternExpr):
+                child_match = ctx.match(pattern, child_node)
+                if not child_match:
+                    return FailedMatch(f"arg[{i}]: {child_match}")
+                m.extend(child_match)
+            elif isinstance(child_node, torch.fx.Node) or child_node != pattern:
+                return FailedMatch("constant_args")
+        m.nodes.append(node)
+        m.targets[self] = node.target
+        return m
+
+
+class ListOf(PatternExpr):
+    """
+    Matches a repeated pattern
+    """
+
+    def __init__(self, pattern):
+        super().__init__()
+        assert isinstance(pattern, PatternExpr)
+        self.pattern = pattern
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.pattern})"
+
+    def _match(self, node: List[torch.fx.Node], ctx: MatchContext):
+        if not isinstance(node, (list, tuple)) or len(node) == 0:
+            return FailedMatch("non_list")
+        m = Match(self)
+        for i, child_node in enumerate(node):
+            child_match = MatchContext(ctx.outputs).match(self.pattern, child_node)
+            if not child_match:
+                return FailedMatch(f"list[{i}]: {child_match}")
+            m.extend(child_match.bundle())
+        return m.bundle()
+
+
+pass_patterns = [
+    defaultdict(list),
+    defaultdict(list),
+    defaultdict(list),
+]
+
+
+@dataclasses.dataclass
+class PatternEntry:
+    pattern: PatternExpr
+    extra_check: Callable[[Match], bool]
+
+    def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node):
+        raise NotImplementedError()
+
+    def register(self, pass_number, target):
+        if isinstance(pass_number, int):
+            pass_patterns[pass_number][target].append(self)
+        else:
+            for x in pass_number:
+                self.register(x, target)
+
+
+@dataclasses.dataclass
+class LoweringPatternEntry(PatternEntry):
+    handler: Any
+
+    def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node):
+        handler = functools.wraps(self.handler)(functools.partial(self.handler, match))
+        with graph.inserting_before(node):
+            replacement = graph.call_function(handler, tuple(match.args), match.kwargs)
+            replacement.meta.update(node.meta)
+            node.replace_all_uses_with(replacement)
+        assert match.nodes[-1] is node
+        match.erase_nodes(graph)
+
+
+@dataclasses.dataclass
+class ReplacementPatternEntry(PatternEntry):
+    replacement_graph: torch.fx.GraphModule
+    signature: inspect.Signature
+    propagate: bool = False
+
+    def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node):
+        class Replacer(torch.fx.Interpreter):
+            call_method = None
+            call_module = None
+            get_attr = None
+
+            def call_function(self, target, args, kwargs):
+                result = graph.call_function(target, args, kwargs)
+                if propagate and V.fake_mode:
+                    fargs, fkwargs = torch.fx.map_arg(
+                        (args, kwargs), lambda n: n.meta["val"]
+                    )
+                    with V.fake_mode:
+                        result.meta["val"] = target(*fargs, **fkwargs)
+                return result
+
+        propagate = self.propagate
+        norm_args = self.signature.bind(*match.args, **match.kwargs)
+        with graph.inserting_before(node):
+            replacement = Replacer(self.replacement_graph).run(
+                *norm_args.arguments.values()
+            )
+            replacement.meta.update(node.meta)
+            node.replace_all_uses_with(replacement)
+        assert match.nodes[-1] is node
+        match.erase_nodes(graph)
+
+
+def _return_true(match):
+    return True
+
+
+def register_replacement_pattern(pattern, extra_check=_return_true, pass_number=1):
+    """
+    Register an aten to aten replacement pattern
+    """
+
+    def decorator(handler):
+        signature = inspect.signature(handler)
+        replacement_graph = torch.fx.symbolic_trace(handler)
+        for target in pattern.fns:
+            ReplacementPatternEntry(
+                pattern=pattern,
+                extra_check=extra_check,
+                replacement_graph=replacement_graph,
+                signature=signature,
+            ).register(pass_number, target)
+        return handler
+
+    assert isinstance(pattern, CallFunction)
+    return decorator
+
+
+def register_lowering_pattern(pattern, extra_check=_return_true, pass_number=1):
+    """
+    Register an aten to inductor IR replacement pattern
+    """
+
+    def decorator(handler):
+        assert callable(handler)
+        for target in pattern.fns:
+            LoweringPatternEntry(
+                pattern=pattern, extra_check=extra_check, handler=handler
+            ).register(pass_number, target)
+        handler._inductor_lowering_function = True
+        return handler
+
+    assert isinstance(pattern, CallFunction)
+    return decorator
+
+
+register_pattern = register_lowering_pattern
+
+
+def replace_matched_patterns(graph: torch.fx.Graph):
+    # the actual replacement work
+    for patterns in pass_patterns:
+        if not patterns:
+            continue
+        for node in reversed(graph.nodes):
+            if node.op == "call_function" and node.target in patterns:
+                for entry in patterns[node.target]:
+                    if node._erased:
+                        break
+                    m = entry.pattern.match(node)
+                    if os.environ.get("TORCHINDUCTOR_PATTERN_MATCH_DEBUG") == node.name:
+                        log.warning(f"{node}{node.args} {m} {entry.pattern}")
+                    if m and entry.extra_check(m):
+                        entry.apply(m, graph, node)
+                        counters["inductor"]["pattern_matcher_count"] += 1
+                        counters["inductor"]["pattern_matcher_nodes"] += len(m.nodes)
+
+
+def reorder_for_locality(graph: torch.fx.Graph):
+    def visit(other_node):
+        if (
+            other_node.op == "call_function"
+            and other_node.target != operator.getitem
+            and all((n in seen_nodes) for n in other_node.users)
+        ):
+            # move node's producers right before it
+            node.prepend(other_node)
+
+    seen_nodes = set()
+    for node in reversed(graph.nodes):
+        seen_nodes.add(node)
+        torch.fx.map_arg((node.args, node.kwargs), visit)
+
+
+def fx_passes(gm: torch.fx.GraphModule):
+    if config.dce:
+        # has some issues with mutation in inference mode
+        gm.graph.eliminate_dead_code()
+
+    if config.reordering:
+        # has some issues with mutation in inference mode
+        reorder_for_locality(gm.graph)
+
+    if config.pattern_matcher:
+        replace_matched_patterns(gm.graph)
+
+    gm.graph.lint()
+
+
+################################################################################
+# Actual patterns below this point.
+# Priority of patterns is:
+#   - later output nodes first
+#   - order patterns are defined in
+################################################################################
+
+
+@register_lowering_pattern(
+    CallFunction(
+        aten.add,
+        CallFunction(aten.mm, Arg(), Arg()),
+        CallFunction(aten.mm, Arg(), Arg()),
+    )
+)
+def mm_plus_mm(match: Match, mat1, mat2, mat3, mat4):
+    return inductor.kernel.mm_plus_mm.tuned_mm_plus_mm(mat1, mat2, mat3, mat4)
+
+
+@register_lowering_pattern(
+    CallFunction(aten.cat, ListOf(CallFunction(aten.mm, Arg(), Arg())), Arg()),
+)
+def cat_mm(match, inputs, dim):
+    def shape_of(a, b):
+        m, _ = a.get_size()
+        _, n = b.get_size()
+        return [m, n]
+
+    return cat_tuned_op(match, inputs, dim, op=L[aten.mm], shape_of=shape_of)
+
+
+@register_lowering_pattern(
+    CallFunction(
+        aten.cat, ListOf(CallFunction(aten.addmm, Arg(), Arg(), Arg())), Arg()
+    ),
+)
+def cat_addmm(match, inputs, dim):
+    def shape_of(bias, a, b):
+        m, _ = a.get_size()
+        _, n = b.get_size()
+        return [m, n]
+
+    return cat_tuned_op(match, inputs, dim, op=L[aten.addmm], shape_of=shape_of)
+
+
+def cat_tuned_op(match, inputs, dim, *, op, shape_of):
+    """
+    Memory planning to remove cat.  We can't use the stock memory
+    planner since autotuning matmauls needs to know the output layout.
+    """
+    # TODO(jansel): rewrite this as a bmm?
+    if dim < 0:
+        dim += len(shape_of(*inputs[0]))
+    assert dim in (0, 1)
+    notdim = 1 - dim
+
+    new_size = None
+    offsets_start = []
+    offsets_end = []
+
+    # compute output sizes
+    for i in range(len(inputs)):
+        shape = shape_of(*inputs[i])
+        if new_size is None:
+            new_size = shape
+        else:
+            new_size[notdim] = V.graph.sizevars.guard_equals(
+                shape[notdim], new_size[notdim]
+            )
+            new_size[dim] += shape[dim]
+        offsets_start.append(new_size[dim] - shape[dim])
+        offsets_end.append(new_size[dim])
+
+    dtype = functools.reduce(
+        torch.promote_types, [x.get_dtype() for x in itertools.chain(*inputs)]
+    )
+    device = inputs[0][0].get_device()
+    kernel = ir.ConcatKernel(
+        name=None,
+        layout=ir.FixedLayout(device, dtype, new_size),
+        inputs=[],
+    )
+    kernel_tensor = ir.TensorBox.create(kernel)
+
+    for i in range(len(inputs)):
+        dst = ir.SliceView.create(kernel_tensor, dim, offsets_start[i], offsets_end[i])
+        src = op(*inputs[i], layout=dst.get_layout()).data.data
+        assert isinstance(src, (ir.ExternKernelOut, ir.TemplateBuffer))
+        src.layout = ir.AliasedLayout(dst)
+        kernel.inputs.append(src)
+
+    kernel.name = V.graph.register_buffer(kernel)
+    kernel.inputs = ir.ConcatKernel.unwrap_storage(kernel.inputs)
+    return kernel_tensor
+
+
+_cat_1 = CallFunction(aten.cat, Arg(), 1, _users=2)
+
+
+@register_lowering_pattern(
+    CallFunction(
+        aten.cat,
+        [
+            _cat_1,
+            CallFunction(
+                aten.slice,
+                CallFunction(aten.slice, _cat_1, 0, 0, 9223372036854775807),
+                1,
+                0,
+                KeywordArg("size"),
+            ),
+        ],
+        1,
+    )
+)
+def cat_slice_cat(match, cat_input, size, dim=1):
+    """
+    This is an example of a more complex pattern where cat_1 is used
+    multiple times inside the pattern.  We fold 2 calls to cat into one.
+
+    Matches:
+        cat_1: f32[1024, 4077] = torch.ops.aten.cat.default([add_26, primals_217], 1)
+        slice_1: f32[1024, 4077] = torch.ops.aten.slice.Tensor(cat_1, 0, 0, 9223372036854775807)
+        slice_2: f32[1024, 19] = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 19)
+        cat_2: f32[1024, 4096] = torch.ops.aten.cat.default([cat_1, slice_2], 1)
+
+
+    Rewrite to:
+        slice_2 = torch.ops.aten.slice.Tensor(add_26, 1, 0, 19)
+        cat_2 = torch.ops.aten.cat.default([add_26, primals_217, slice2], 1)
+    """
+    first, *rest = cat_input
+    if V.graph.sizevars.maybe_guard_leq(size, first.get_size()[dim]):
+        # fold 2 cats into 1 cat
+        return L[aten.cat](
+            [
+                first,
+                *rest,
+                L[aten.slice](first, dim, 0, size),
+            ],
+            dim,
+        )
+    else:
+        # don't expect to hit this case, just fall back
+        tmp = L[aten.cat](cat_input, dim)
+        return L[aten.cat](
+            [
+                tmp,
+                L[aten.slice](tmp, dim, 0, size),
+            ],
+            dim,
+        )
+
+
+@register_replacement_pattern(
+    CallFunction(
+        aten.add,
+        CallFunction(aten.mm, Arg(), Arg()),
+        KeywordArg("added"),
+    ),
+    pass_number=2,
+)
+@register_replacement_pattern(
+    CallFunction(
+        aten.add,
+        KeywordArg("added"),
+        CallFunction(aten.mm, Arg(), Arg()),
+    ),
+    pass_number=2,
+)
+def addmm(mat1, mat2, added):
+    return aten.addmm(added, mat1, mat2)
+
+
+# This slows things down:
+"""
+@register_replacement_pattern(
+    CallFunction(
+        aten.add,
+        CallFunction(aten.bmm, Arg(), Arg()),
+        KeywordArg("added"),
+    ),
+    pass_number=3
+)
+@register_replacement_pattern(
+    CallFunction(
+        aten.add,
+        KeywordArg("added"),
+        CallFunction(aten.bmm, Arg(), Arg()),
+    ),
+    pass_number=3
+)
+def baddbmm(mat1, mat2, added):
+    return aten.baddbmm(added, mat1, mat2)
+"""
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
index e40adecaa9b2..f94d4d39a1de 100644
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@@ -14,7 +14,7 @@
 from torch._dynamo.utils import dynamo_timed
 
 from . import config, dependencies, ir, metrics
-from .dependencies import StarDep
+from .dependencies import StarDep, WeakDep
 from .sizevars import SimplifyIndexing
 from .utils import cache_on_self, cmp, has_triton
 from .virtualized import V
@@ -96,8 +96,8 @@ def log_details(self):
     def update_mutated_names(self, renames: Dict[str, str]):
         self.set_read_writes(self.read_writes.rename(renames))
 
-    def add_mutation_dep(self, name):
-        self.set_read_writes(self.read_writes.with_read(name))
+    def add_mutation_dep(self, dep):
+        self.set_read_writes(self.read_writes.with_read(dep))
 
     def set_users(self, users: List["NodeUser"]):
         # deduplicate
@@ -138,6 +138,38 @@ def prune_deps(self):
             if dep.name not in self.scheduler.available_buffer_names
         }
 
+    def prune_redundant_deps(self, name_to_fused_node):
+        """
+        Prunes stardeps intended for mutation ordering
+        on an upstream fused node if after fusion there is another dependency
+        on the fused upstream node, making the stardep redundant
+
+        In essence this enforces an ordering on fusions. As fusions occur, prunable stardeps will
+        be incrementally removed, enabling other fusions, ensuring they are fused in order.
+        """
+        name_to_dep_count = collections.Counter()
+
+        for dep in self.unmet_dependencies:
+            if not isinstance(dep, WeakDep):
+                name_to_dep_count[name_to_fused_node[dep.name].get_name()] += 1
+
+        def should_prune(dep):
+            if isinstance(dep, WeakDep):
+                is_redundant = (
+                    name_to_dep_count[name_to_fused_node[dep.name].get_name()] > 0
+                )
+                # These can occur because fused nodes always gather deps from their snodes
+                # If B has a weakdep on A
+                # B gets fused with C, then any time BC is fused, the weakdep will reappear
+                is_self_dep = name_to_fused_node[dep.name] == self
+                return is_redundant or is_self_dep
+            else:
+                return False
+
+        deps_to_prune = {dep for dep in self.unmet_dependencies if should_prune(dep)}
+        self.unmet_dependencies = self.unmet_dependencies - deps_to_prune
+        self.set_read_writes(self.read_writes.remove_reads(deps_to_prune))
+
     def get_name(self) -> str:
         return self.node.get_name()
 
@@ -145,7 +177,7 @@ def get_first_name(self) -> str:
         return self.get_name()
 
     def get_names(self) -> Set[str]:
-        return set([self.get_name()])
+        return {self.get_name()}
 
     def get_nodes(self) -> List["BaseSchedulerNode"]:
         return [self]
@@ -166,11 +198,80 @@ def can_inplace(self, read_dep: dependencies.MemoryDep):
         return False
 
     def allocate(self):
-        if self.node.should_allocate():
-            # if self.node should allocate or
-            # if self.node is generated by TritonKernelTemplates
-            # because Triton kernel could not allocate tensor itself
+        if not self.node.should_allocate():
+            return
+
+        if isinstance(self, (SchedulerNode,)) and (
+            self.node.get_alias_names() or self.node.get_mutation_names()
+        ):
             V.graph.wrapper_code.codegen_allocation(self.node)
+            return
+
+        if (
+            (
+                isinstance(self, (SchedulerNode,))
+                # o what have i done.  lets make this an api
+                or (
+                    isinstance(self, ExternKernelSchedulerNode)
+                    and isinstance(self.node, ir.AllReduce)
+                )
+            )
+            and config.inplace_buffers
+            and (
+                not isinstance(V.kernel, torch._inductor.codegen.triton.TritonKernel)
+                or getattr(V.kernel, "mutations", None) is not None
+            )
+        ):
+            from .codegen.wrapper import buffer_reuse_key
+
+            ordered_reads = sorted(self.read_writes.reads, key=lambda x: x.name)
+
+            for read in ordered_reads:
+                input_node: BaseSchedulerNode = self.scheduler.name_to_node.get(
+                    read.name
+                )
+                if input_node and V.graph.wrapper_code.can_reuse(input_node):
+                    remaining_uses = [
+                        x
+                        for x in input_node.users
+                        if x.node.get_name()
+                        not in self.scheduler.available_buffer_names
+                    ]
+                    if (
+                        len(remaining_uses) == 1
+                        and remaining_uses[0].can_inplace
+                        and remaining_uses[0].node is self
+                        and not isinstance(
+                            input_node.node.get_layout(),
+                            (
+                                ir.MultiOutputLayout,
+                                ir.MutationLayout,
+                                ir.AliasedLayout,
+                            ),
+                        )
+                        and buffer_reuse_key(input_node.node)
+                        == buffer_reuse_key(self.node)
+                    ):
+                        V.graph.wrapper_code.codegen_inplace_reuse(
+                            input_node.node, self.node
+                        )
+                        # hacky check for if V.kernel is a real kernel or NullHandler
+                        if hasattr(V.kernel, "args"):
+                            # if there isn't a triton kernel, then we don't need to call triton-specific things.
+                            # but TODO this might be a convenient place to signal to the Collective kernels to inplace
+                            # (and, can we make "kernel" less generic of a name?)
+                            V.kernel.args.make_inplace(
+                                input_node.get_name(), self.get_name()
+                            )
+                            # mutations not tracked in cpp kernels
+                            if isinstance(
+                                V.kernel, torch._inductor.codegen.triton.TritonKernel
+                            ):
+                                V.kernel.mutations.add(input_node.get_name())
+                                V.kernel.mutations.add(self.get_name())
+                        return
+
+        V.graph.wrapper_code.codegen_allocation(self.node)
 
     def can_free(self):
         for use in self.users:
@@ -224,6 +325,25 @@ def debug_str_extra(self):
     def is_extern(self):
         return True
 
+    def can_inplace(self, read_dep: dependencies.MemoryDep):
+        if self.get_aliases() or self.is_template():
+            return False
+
+        if read_dep.name not in self.scheduler.name_to_node:
+            # don't allow reuse of an 'input' buffer, we don't own it
+            # (would this have been fixed if I tracked mutations properly above?)
+            return False
+
+        if not isinstance(self.node, torch._inductor.ir.AllReduce):
+            # TODO make this a property of the IR
+            return False
+
+        if len(self.read_writes.writes) == 1:
+            write_dep = next(iter(self.read_writes.writes))
+            return read_dep.numbytes_hint() == write_dep.numbytes_hint()
+
+        return False
+
 
 class NopKernelSchedulerNode(BaseSchedulerNode):
     pass
@@ -290,56 +410,6 @@ def is_reduction(self):
     def is_template(self):
         return isinstance(self.node, ir.TemplateBuffer)
 
-    def allocate(self):
-        if (
-            not self.node.should_allocate()
-            or self.node.get_alias_names()
-            or self.node.get_mutation_names()
-        ):
-            return super().allocate()
-
-        if config.inplace_buffers and getattr(V.kernel, "mutations", None) is not None:
-            from .codegen.wrapper import buffer_reuse_key
-
-            ordered_reads = sorted(self.read_writes.reads, key=lambda x: x.name)
-
-            for read in ordered_reads:
-                input_node: BaseSchedulerNode = self.scheduler.name_to_node.get(
-                    read.name
-                )
-                if input_node and V.graph.wrapper_code.can_reuse(input_node):
-                    remaining_uses = [
-                        x
-                        for x in input_node.users
-                        if x.node.get_name()
-                        not in self.scheduler.available_buffer_names
-                    ]
-                    if (
-                        len(remaining_uses) == 1
-                        and remaining_uses[0].can_inplace
-                        and remaining_uses[0].node is self
-                        and not isinstance(
-                            input_node.node.get_layout(),
-                            (ir.MultiOutputLayout, ir.MutationLayout, ir.AliasedLayout),
-                        )
-                        and buffer_reuse_key(input_node.node)
-                        == buffer_reuse_key(self.node)
-                    ):
-                        V.graph.wrapper_code.codegen_inplace_reuse(
-                            input_node.node, self.node
-                        )
-                        V.kernel.args.make_inplace(
-                            input_node.get_name(), self.get_name()
-                        )
-                        # mutations not tracked in cpp kernels
-                        if isinstance(
-                            V.kernel, torch._inductor.codegen.triton.TritonKernel
-                        ):
-                            V.kernel.mutations.add(input_node.get_name())
-                            V.kernel.mutations.add(self.get_name())
-                        return
-        super().allocate()
-
     def run(self, *index_vars):
         self.mark_run()
         self.codegen(index_vars)
@@ -548,7 +618,7 @@ def get_name(self):
 class Scheduler:
     @dynamo_timed
     def __init__(self, nodes):
-        super(Scheduler, self).__init__()
+        super().__init__()
         self.backends = {}
 
         self.nodes = []
@@ -671,15 +741,15 @@ def add_user(used_by_name, user_node, can_inplace=False):
                 alt_name = rename(alt_name)
                 # this node must run after the prior writer
                 add_user(alt_name, node)
-                node.add_mutation_dep(alt_name)
+                node.add_mutation_dep(StarDep(alt_name))
                 for other_node in name_to_users[alt_name]:
                     # this node must run after all prior readers
                     other_name = rename(other_node.get_name())
                     known_dep_node_names = dep_closure(node.get_name())
                     if other_name not in known_dep_node_names:
-                        # If this node alreay directly or indirectly depends on other_node,
-                        # we don't need to insert an extra StarDep.
-                        node.add_mutation_dep(other_name)
+                        # If this node already directly or indirectly depends on other_node,
+                        # we don't need to insert an extra dep.
+                        node.add_mutation_dep(WeakDep(other_name))
                         add_user(other_name, node)
 
             # add normal non-mutation dependencies
@@ -803,6 +873,11 @@ def fuse_nodes_once(self):
                 )
         self.nodes = sorted(fused_nodes, key=lambda x: x.min_order)
         self.topological_sort_schedule()
+        self.prune_redundant_deps()
+
+    def prune_redundant_deps(self):
+        for node in self.nodes:
+            node.prune_redundant_deps(self.name_to_fused_node)
 
     def get_possible_fusions(self):
         """
@@ -921,6 +996,7 @@ def can_fuse_vertical(self, node1, node2):
         """
         node1_names = node1.get_names()
         computed_deps = set()
+
         for rd in node2.unmet_dependencies:
             for cd in node1.read_writes.writes:
                 # StarDep doesn't match MemoryDep, different indices don't match
diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py
index 2024361b99af..2b0980527005 100644
--- a/torch/_inductor/select_algorithm.py
+++ b/torch/_inductor/select_algorithm.py
@@ -17,7 +17,7 @@
 from torch._dynamo.utils import counters, identity
 
 from . import config, ir
-from .codecache import code_hash, DiskCache, PyCodeCache
+from .codecache import code_hash, PersistentCache, PyCodeCache
 
 from .codegen.common import IndentedBuffer
 from .codegen.triton import config_of, signature_of, texpr, TritonKernel, TritonPrinter
@@ -134,8 +134,8 @@ def def_kernel(self, *argnames):
             [
                 "import triton.language as tl",
                 "import triton",
-                f"from {config.inductor_import}.triton_ops.autotune import template",
-                f"from {config.inductor_import}.utils import instance_descriptor",
+                "from torch._inductor.triton_ops.autotune import template",
+                "from torch._inductor.utils import instance_descriptor",
                 "",
                 self.jit_line(),
                 f"def {self.kernel_name}({', '.join(arg_defs)}):",
@@ -149,9 +149,12 @@ def size(self, name: str, index: int):
         Hook called from template code to get the size of an arg.
         Will add needed args to pass it in if it is dynamic.
         """
-        assert isinstance(name, str)
         assert isinstance(index, int)
-        val = self.named_input_nodes[name].get_size()[index]
+        if name is None:
+            val = self.output_node.get_size()[index]
+        else:
+            assert isinstance(name, str)
+            val = self.named_input_nodes[name].get_size()[index]
         return texpr(self.rename_indexing(val))
 
     def stride(self, name, index):
@@ -159,9 +162,12 @@ def stride(self, name, index):
         Hook called from template code to get the stride of an arg.
         Will add needed args to pass it in if it is dynamic.
         """
-        assert isinstance(name, str)
         assert isinstance(index, int)
-        val = self.named_input_nodes[name].get_stride()[index]
+        if name is None:
+            val = self.output_node.get_stride()[index]
+        else:
+            assert isinstance(name, str)
+            val = self.named_input_nodes[name].get_stride()[index]
         return texpr(self.rename_indexing(val))
 
     def store_output(self, indices, val, mask):
@@ -370,22 +376,42 @@ def generate(
             **kernel_options,
         ) as kernel:
             # need to do call render twice to get all the needed args right
-            self.template.render(
-                **kernel.template_env(),
-                **kwargs,
-            )
-            code = self.template.render(
-                **kernel.template_env(),
-                **kwargs,
-            )
+            try:
+                self.template.render(
+                    **kernel.template_env(),
+                    **kwargs,
+                )
+                code = self.template.render(
+                    **kernel.template_env(),
+                    **kwargs,
+                )
+            except ZeroDivisionError:
+                # TODO(nmacchioni): fix sympy division by zero
+                return None
             if self.debug:
                 print("Generated Code:\n", code)
-            mod = PyCodeCache.load(code)
+            extra = (
+                "-".join(
+                    [
+                        *[
+                            f"{kwarg}={repr(kwargs[kwarg])}"
+                            for kwarg in sorted(kwargs.keys())
+                        ],
+                        f"num_stages={num_stages}",
+                        f"num_warps={num_warps}",
+                    ]
+                )
+                + "-"
+            )
+            mod = PyCodeCache.load(code, extra)
             run = getattr(mod, kernel_name).run
             _, call_args, _ = kernel.args.python_argdefs()
 
         expected_args = [x.get_name() for x in input_nodes] + [fake_out.get_name()]
-        assert list(call_args) == expected_args, (call_args, expected_args)
+        # TODO(nmacchioni) fix bug here in CI tests
+        # assert list(call_args) == expected_args, (call_args, expected_args)
+        if list(call_args) != expected_args:
+            return None
         extra_args = V.graph.sizevars.size_hints(
             map(sympy.expand, call_args[len(expected_args) :])
         )
@@ -422,7 +448,11 @@ def make_kernel_render(out_node):
             return kernel, render
 
         return TritonTemplateCaller(
-            kernel_hash_name, input_nodes, layout, make_kernel_render
+            kernel_hash_name,
+            input_nodes,
+            layout,
+            make_kernel_render,
+            extra.strip("-").replace("-", ", "),
         )
 
     @staticmethod
@@ -438,13 +468,14 @@ def get_dtype(name):
 
 
 class ExternKernelChoice:
-    def __init__(self, kernel, cpp_kernel=None, *, name=None):
+    def __init__(self, kernel, cpp_kernel=None, *, name=None, has_out_variant=True):
         super().__init__()
         name = name or kernel.__name__
         assert callable(kernel)
         assert not hasattr(extern_kernels, name), "duplicate extern kernel"
         self.name = name
         self.cpp_kernel = cpp_kernel
+        self.has_out_variant = has_out_variant
         setattr(extern_kernels, name, kernel)
 
     def to_callable(self):
@@ -468,7 +499,9 @@ def hash_key(self):
         return code_hash("-".join(parts))
 
     def bind(self, input_nodes, layout, **kwargs):
-        return ExternKernelCaller(self, input_nodes, layout, kwargs)
+        return ExternKernelCaller(
+            self, input_nodes, layout, kwargs, has_out_variant=self.has_out_variant
+        )
 
 
 class ChoiceCaller:
@@ -478,14 +511,33 @@ def __init__(self, name, input_nodes, layout):
         self.layout = layout
         self.input_nodes = input_nodes
 
+    def benchmark(self, *args, out):
+        algo = self.to_callable()
+        return do_bench(lambda: algo(*args, out=out))
+
+    def call_name(self):
+        raise NotImplementedError()
+
+    def to_callable(self):
+        raise NotImplementedError()
+
+    def hash_key(self):
+        raise NotImplementedError()
+
+    def output_node(self):
+        raise NotImplementedError()
+
 
 class TritonTemplateCaller(ChoiceCaller):
-    def __init__(self, name, input_nodes, layout, make_kernel_render):
+    def __init__(self, name, input_nodes, layout, make_kernel_render, debug_extra):
         super().__init__(name, input_nodes, layout)
         self.make_kernel_render = make_kernel_render
+        self.debug_extra = debug_extra
 
     def __str__(self):
-        return f"TritonTemplateCaller({self.to_callable().__file__})"
+        return (
+            f"TritonTemplateCaller({self.to_callable().__file__}, {self.debug_extra})"
+        )
 
     def call_name(self):
         return f"template_kernels.{self.name}"
@@ -494,7 +546,12 @@ def to_callable(self):
         return getattr(template_kernels, self.name)
 
     def hash_key(self):
-        return self.to_callable().key
+        return "-".join(
+            [
+                self.name.rsplit("_", 1)[0],
+                self.to_callable().key,
+            ]
+        )
 
     def output_node(self):
         return ir.TensorBox.create(
@@ -507,10 +564,34 @@ def output_node(self):
 
 
 class ExternKernelCaller(ChoiceCaller):
-    def __init__(self, choice: ExternKernelChoice, input_nodes, layout, kwargs=None):
+    def __init__(
+        self,
+        choice: ExternKernelChoice,
+        input_nodes,
+        layout,
+        kwargs=None,
+        *,
+        has_out_variant=True,
+    ):
         super().__init__(choice.name, input_nodes, layout)
         self.choice = choice
         self.kwargs = kwargs or {}
+        self.has_out_variant = has_out_variant
+
+    def __str__(self):
+        return f"ExternKernelCaller({self.choice.call_name()})"
+
+    def benchmark(self, *args, out):
+        if self.has_out_variant:
+            return super().benchmark(*args, out=out)
+        else:
+            algo = self.to_callable()
+            out_new = algo(*args)
+            torch._C._dynamo.guards.assert_size_stride(
+                out_new, tuple(out.size()), tuple(out.stride())
+            )
+            out.copy_(out_new)  # for correctness checking
+            return do_bench(lambda: algo(*args))
 
     def to_callable(self):
         fn = self.choice.to_callable()
@@ -520,16 +601,24 @@ def to_callable(self):
             return fn
 
     def hash_key(self):
-        return "/".join(
+        return "-".join(
             [
+                self.choice.name,
+                *[
+                    f"{kwarg}={repr(self.kwargs[kwarg])}"
+                    for kwarg in sorted(self.kwargs.keys())
+                ],
                 self.choice.hash_key(),
-                repr(self.kwargs),
             ]
         )
 
     def output_node(self):
+        if self.has_out_variant:
+            cls = ir.ExternKernelOut
+        else:
+            cls = ir.ExternKernelAlloc
         return ir.TensorBox.create(
-            ir.ExternKernelOut(
+            cls(
                 layout=self.layout,
                 inputs=self.input_nodes,
                 kernel=self.choice.call_name(),
@@ -539,48 +628,58 @@ def output_node(self):
         )
 
 
-class AlgorithmSelectorCache(DiskCache):
+class ErrorFromChoice(RuntimeError):
+    def __init__(self, msg, choice: ChoiceCaller, inputs_str):
+        msg += f"\nFrom choice {choice}\n{inputs_str}"
+        super().__init__(msg)
+        self.choice = choice
+
+
+class AlgorithmSelectorCache(PersistentCache):
     def __call__(self, choices: List[ChoiceCaller], input_nodes, layout):
+        # TODO(nmacchioni): remove once CI tests are fixed
+        choices = [choice for choice in choices if choice is not None]
+        assert len(choices) > 0, "no choices to select"
+
         if len(choices) == 1:
             return choices[0].output_node()
 
-        def autotune():
-            benchmark_fn = self.make_benchmark_fn(choices, input_nodes, layout)
-            timings = {}
-            for choice in choices:
-                try:
-                    timings[choice] = benchmark_fn(
-                        choice.to_callable(), isinstance(choice, ExternKernelCaller)
-                    )
-                except RuntimeError as e:
-                    if "invalid argument" in str(e):
-                        msg = textwrap.dedent(
-                            f"""
-                            {e}
-
-                            From choice {choices.index(choice)}: {choice}
-
-                            This may mean this GPU is too small for max_autotune mode.
-                            """
-                        ).strip()
-                        if VERIFY:
-                            raise RuntimeError(msg)
-                        else:
-                            log.warning(msg)
-                    else:
-                        raise
-                except AssertionError as e:
-                    raise AssertionError(
-                        f"Incorrect result from choice {choices.index(choice)} {choice}\n\n{e}"
-                    )
+        @functools.lru_cache(None)
+        def make_benchmark_fn():
+            return self.make_benchmark_fn(choices, input_nodes, layout)
 
-            self.log_results(choices[0].name, input_nodes, timings)
-            best_choice = builtins.min(timings, key=timings.__getitem__)
-            return choices.index(best_choice)
+        def autotune(choice):
+            benchmark_fn = make_benchmark_fn()
+            try:
+                timing = benchmark_fn(
+                    choice,
+                )
+            except RuntimeError as e:
+                msg = str(e)
+                if "invalid argument" in msg:
+                    msg += "\n\nThis may mean this GPU is too small for max_autotune mode.\n\n"
+                    log.warning(msg)
+                    return float("inf")
+                elif "illegal memory access" in msg:
+                    msg += "\n\nEither error in template or triton bug.\n"
+                raise ErrorFromChoice(msg, choice, benchmark_fn.debug_str())
+            except AssertionError as e:
+                raise AssertionError(f"Incorrect result from choice {choice}\n\n{e}")
+            return timing
+
+        timings = self.lookup(
+            choices,
+            choices[0].name,
+            repr([self.key_of(x) for x in input_nodes]),
+            autotune,
+        )
+        if timings == {} or choices[0] not in timings:
+            return choices[0].output_node()
 
-        counters["inductor"]["select_algorithm_autotune"] += 1
-        key = [x.hash_key() for x in choices] + [self.key_of(x) for x in input_nodes]
-        return choices[self.lookup(key, autotune)].output_node()
+        if make_benchmark_fn.cache_info().currsize:
+            counters["inductor"]["select_algorithm_autotune"] += 1
+            self.log_results(choices[0].name, input_nodes, timings)
+        return builtins.min(timings, key=timings.__getitem__).output_node()
 
     @classmethod
     def make_benchmark_fn(
@@ -604,25 +703,43 @@ def make_benchmark_fn(
             out, out.size(), out.stride(), V.graph.sizevars.size_hint(layout.offset)
         )
         if VERIFY:
-            choices[0].to_callable()(*example_inputs_extern, out=out_extern)
+            choices[0].benchmark(*example_inputs_extern, out=out_extern)
             expected = out_extern.clone()
 
-        def benchmark(algo, is_extern):
+        def benchmark(choice):
             out.zero_()
-            if is_extern:
-                result = do_bench(lambda: algo(*example_inputs_extern, out=out_extern))
+            if isinstance(choice, ExternKernelCaller):
+                # aten kernels want the offset baked in for sliced tensors
+                result = choice.benchmark(*example_inputs_extern, out=out_extern)
             else:
-                result = do_bench(lambda: algo(*example_inputs, out=out))
+                # triton templates want the base pointer for sliced tensors
+                result = choice.benchmark(*example_inputs, out=out)
             if VERIFY:
                 torch.testing.assert_close(out_extern, expected, **VERIFY)
             torch.cuda.synchronize()  # shake out any CUDA errors
-            return result
+            return min(result)
 
+        def debug_str():
+            def tensor_repr(x):
+                return (
+                    f"torch.empty_strided({tuple(x.size())!r}, {tuple(x.stride())!r}, "
+                    f"dtype={x.dtype!r}, device={x.device.type!r})"
+                )
+
+            lines = [
+                "inputs = [",
+            ]
+            for x in example_inputs:
+                lines.append(f"    {tensor_repr(x)},")
+            lines += ["]", f"out = {tensor_repr(out)}", ""]
+            return "\n".join(lines)
+
+        benchmark.debug_str = debug_str
         return benchmark
 
     @staticmethod
     def log_results(name, input_nodes, timings):
-        if not PRINT_AUTOTUNE:
+        if not config.max_autotune or not PRINT_AUTOTUNE:
             return
         sizes = ", ".join(
             [
@@ -632,13 +749,11 @@ def log_results(name, input_nodes, timings):
         )
         top_k = sorted(timings, key=timings.__getitem__)[:10]
         best = top_k[0]
-        best_time = timings[best][0]
+        best_time = timings[best]
         sys.stderr.write(f"AUTOTUNE {name}({sizes})\n")
         for choice in top_k:
             result = timings[choice]
-            sys.stderr.write(
-                f"  {choice.name} {result[0]:.4f}s {best_time/result[0]:.1%}\n"
-            )
+            sys.stderr.write(f"  {choice.name} {result:.4f}s {best_time/result:.1%}\n")
 
     @staticmethod
     def benchmark_example_value(node):
@@ -672,10 +787,14 @@ def key_of(node):
         )
 
 
-autotune_select_algorithm = AlgorithmSelectorCache(__name__)
+autotune_select_algorithm = AlgorithmSelectorCache()
 
 
 def realize_inputs(*args):
     if len(args) == 1:
         return ir.ExternKernel.require_stride1(ir.ExternKernel.realize_input(args[0]))
     return [realize_inputs(x) for x in args]
+
+
+# ensure lowering is imported so that `extern_kernels.*` is populated
+from . import lowering  # noqa: F401
diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py
index 97d6ebe0fc2b..57ba7a59eb93 100644
--- a/torch/_inductor/sizevars.py
+++ b/torch/_inductor/sizevars.py
@@ -17,16 +17,6 @@
 log = logging.getLogger(__name__)
 
 
-@dataclasses.dataclass
-class ZeroGuard:
-    """
-    An expression we should check equals zero.
-    Guards are currently not checked.  Plan to add this later.
-    """
-
-    expr: Expr
-
-
 @dataclasses.dataclass
 class PositiveGuard:
     """
@@ -37,7 +27,7 @@ class PositiveGuard:
     expr: Expr
 
 
-class SizeVarAllocator(object):
+class SizeVarAllocator:
     def __init__(self, shape_env=None):
         super().__init__()
         if shape_env is None:
@@ -46,6 +36,9 @@ def __init__(self, shape_env=None):
         self.var_to_val = self.shape_env.var_to_val
         self.guards = []
         self.replacements: Dict[sympy.Symbol, Expr] = self.shape_env.replacements
+        # maps of dynamic sizes that have to be precomputed on the host to the kernel args
+        self.precomputed_replacements: Dict[Expr, sympy.Symbol] = dict()
+        self.inv_precomputed_replacements: Dict[sympy.Symbol, Expr] = dict()
         self.need_seed = False
         self.stride_vars = self.make_stride_vars_cache()
         self.simplify_with_ranges = self.make_simplify_with_ranges_cache()
@@ -116,7 +109,7 @@ def _simplify_with_ranges(self, expr: Expr, var_ranges: VarRanges):
         Simplify indexing expression with knowledge of the ranges of
         iteration variables.
         """
-        from .ir import IndexingDiv, ModularIndexing
+        from .ir import FloorDiv, ModularIndexing
 
         expr = join_dimensions(self.simplify(expr))
         original_expr = expr
@@ -137,7 +130,7 @@ def remove_zero_terms(base, divisor):
             return base
 
         def visit_indexing_div(base, divisor):
-            return IndexingDiv(remove_zero_terms(base, divisor), divisor)
+            return FloorDiv(remove_zero_terms(base, divisor), divisor)
 
         def visit_modular_indexing(base, divisor, modulus):
             base = remove_zero_terms(base, divisor)
@@ -157,7 +150,7 @@ def visit_modular_indexing(base, divisor, modulus):
             else:
                 base_s = base
             if self.maybe_guard_lt(base_s, modulus * divisor):
-                return IndexingDiv(base, divisor)
+                return FloorDiv(base, divisor)
             return ModularIndexing(base, divisor, modulus)
 
         if expr.has(ModularIndexing):
@@ -170,9 +163,9 @@ def visit_modular_indexing(base, divisor, modulus):
                 visit_modular_indexing,
             )
 
-        if expr.has(IndexingDiv):
+        if expr.has(FloorDiv):
             expr = expr.replace(
-                IndexingDiv(
+                FloorDiv(
                     sympy.Wild("base"),
                     sympy.Wild("divisor"),
                 ),
@@ -247,7 +240,7 @@ def prune(index):
         return [x for x in sizes if x is not None], reindex, prune
 
     def guard_equals(self, left: Expr, right: Expr) -> Expr:
-        self.shape_env.evaluate_expr(sympy.Eq(left, right))
+        assert self.shape_env.evaluate_expr(sympy.Eq(left, right))
         return left
 
     def maybe_guard_equals(self, left: Expr, right: Expr) -> bool:
@@ -331,6 +324,9 @@ def guard_static_shape(self, left: Expr) -> int:
         self.guard_equals(left, sympy.Integer(right))
         return int(right)
 
+    def guard_static_shapes(self, left: List[Expr]) -> List[int]:
+        return [self.guard_static_shape(x) for x in left]
+
     def __getitem__(self, val: int) -> Expr:
         return self.shape_env.duck_int(val)
 
@@ -425,6 +421,13 @@ def stride_order(self, index: Expr, vars: List[sympy.Symbol]) -> List[int]:
         order.sort(key=lambda x: (strides[x] == 0, strides[x]))
         return order
 
+    def lookup_precomputed_size(self, expr: Expr):
+        if expr not in self.precomputed_replacements:
+            sym = sympy_symbol(f"ps{len(self.precomputed_replacements)}")
+            self.precomputed_replacements[expr] = sym
+            self.inv_precomputed_replacements[sym] = expr
+        return self.precomputed_replacements[expr]
+
     def codegen(self, code: IndentedBuffer, graph_inputs: Dict[str, ir.Buffer]):
         """Assign all symbolic shapes to locals"""
         if self.need_seed:
@@ -447,7 +450,21 @@ def strideof(name):
         # Assign all symbolic shapes needed to local variables
         needed = set(self.var_to_val.keys()) - set(self.replacements.keys())
 
-        for name, value in graph_inputs.items():
+        def is_expr(x):
+            return isinstance(x[1], sympy.Expr)
+
+        graph_inputs_expr = list(filter(is_expr, graph_inputs.items()))
+        graph_inputs_tensors = list(
+            filter(lambda x: not is_expr(x), graph_inputs.items())
+        )
+
+        for name, shape in graph_inputs_expr:
+            shape = self.simplify(shape)
+            if shape in needed:
+                needed.remove(shape)
+                code.writeline(f"{self.declare}{shape} = {name}{self.ending}")
+
+        for name, value in graph_inputs_tensors:
             shapes = value.get_size()
             for dim, shape in enumerate(shapes):
                 shape = self.simplify(shape)
@@ -457,7 +474,7 @@ def strideof(name):
                         f"{self.declare}{shape} = {sizeof(name)}[{dim}]{self.ending}"
                     )
 
-        for name, value in graph_inputs.items():
+        for name, value in graph_inputs_tensors:
             shapes = value.get_stride()
             for dim, shape in enumerate(shapes):
                 shape = self.simplify(shape)
@@ -467,6 +484,12 @@ def strideof(name):
                         f"{self.declare}{shape} = {strideof(name)}[{dim}]{self.ending}"
                     )
 
+    def codegen_precomputed_sizes(self, code: IndentedBuffer):
+        from .codegen.wrapper import pexpr
+
+        for sym, expr in self.inv_precomputed_replacements.items():
+            code.writeline(f"{self.declare}{sym} = {pexpr(expr)}")
+
     def codegen_sizevar(self, x: Expr) -> str:
         from .codegen.wrapper import pexpr
 
@@ -498,13 +521,13 @@ def _join_dimensions_cached(expr: Expr) -> Expr:
     ModularIndexing(i0, 1, 32) + 32 * ModularIndexing(i0, 32, 4)
     becomes
     ModularIndexing(i0, 1, 128)
-    ModularIndexing(i0, 1, 32) + 32 * IndexingDiv(i0, 32)
+    ModularIndexing(i0, 1, 32) + 32 * FloorDiv(i0, 32)
     becomes i0
 
 
     This type of pattern can come from view operations
     """
-    from .ir import IndexingDiv, ModularIndexing
+    from .ir import FloorDiv, ModularIndexing
 
     assert isinstance(expr, sympy.Add)
 
@@ -536,14 +559,14 @@ def _join_dimensions_cached(expr: Expr) -> Expr:
         if m1:
             for term2 in expr.args:
                 m2 = term2.match(
-                    m1[scale] * m1[mod1] * IndexingDiv(m1[base], m1[divisor] * m1[mod1])
+                    m1[scale] * m1[mod1] * FloorDiv(m1[base], m1[divisor] * m1[mod1])
                 )
                 if m2 is not None:  # in case of success we get an empty dict here
                     expr = join_dimensions(
                         expr
                         - term1
                         - term2
-                        + m1[scale] * IndexingDiv(m1[base], m1[divisor])
+                        + m1[scale] * FloorDiv(m1[base], m1[divisor])
                     )
                     return expr
     return expr
@@ -571,7 +594,7 @@ def codegen_benchmark_shape_tuple(self, shape: Tuple[Expr, ...]) -> str:
 class SimplifyIndexing(V.WrapperHandler):  # type: ignore[name-defined]
     """
     A wrapper around .virtualize.ops that uses var range information to
-    simplify ir.ModularIndexing/ir.IndexingDiv.
+    simplify ir.ModularIndexing/ir.FloorDiv.
     """
 
     def __init__(self, inner, var_ranges: VarRanges):
diff --git a/torch/_inductor/triton_ops/autotune.py b/torch/_inductor/triton_ops/autotune.py
index a28a483cdf71..a38a3fabb14d 100644
--- a/torch/_inductor/triton_ops/autotune.py
+++ b/torch/_inductor/triton_ops/autotune.py
@@ -1,11 +1,13 @@
 import builtins
 import copy
-import getpass
+import functools
 import hashlib
+import inspect
 import json
 import logging
+import operator
+import os
 import os.path
-import re
 import threading
 from typing import List
 
@@ -13,8 +15,16 @@
 from torch._dynamo.utils import dynamo_timed
 
 from .. import config
+from ..codecache import cache_dir
 from ..ir import ReductionHint, TileHint
-from ..utils import conditional_product, has_triton
+from ..utils import (
+    ceildiv,
+    conditional_product,
+    do_bench,
+    get_num_bytes,
+    has_triton,
+    next_power_of_2,
+)
 from .conv_perf_model import (
     early_config_prune as conv_early_config_prune,
     estimate_conv_time,
@@ -24,19 +34,16 @@
 
 if has_triton():
     import triton
-    from triton import cdiv, Config, next_power_of_2
+    from triton import Config
     from triton.runtime.jit import get_cuda_stream, KernelInterface
 else:
-    cdiv = None
     Config = object
     get_cuda_stream = None
     KernelInterface = object
-    next_power_of_2 = None
     triton = None
 
 
 class CachingAutotuner(KernelInterface):
-
     """
     Simplified version of Triton autotuner that has no invalidation
     key and caches the best config to disk to improve cold start times.
@@ -53,11 +60,12 @@ def __init__(self, fn, meta, configs, save_cache_hook, mutated_arg_names):
         self.configs = configs
         self.launchers = []
         self.lock = threading.Lock()
-        triton_cache_dir = os.path.join(
-            "/tmp", getpass.getuser(), str(self.meta.get("device", 0)), "triton/cache"
-        )
-        os.environ["TRITON_CACHE_DIR"] = triton_cache_dir
-        log.info(f"Triton cache directory: {triton_cache_dir}")
+        if os.getenv("TRITON_CACHE_DIR") is None:
+            os.environ["TRITON_CACHE_DIR"] = os.path.join(
+                cache_dir(),
+                "triton",
+                str(self.meta.get("device", 0)),
+            )
 
     def precompile(self, warm_cache_only_with_cc=None):
         with self.lock:
@@ -143,8 +151,6 @@ def kernel_call():
                 stream=stream,
             )
 
-        from triton.testing import do_bench
-
         return do_bench(kernel_call, rep=40, fast_flush=True)
 
     @dynamo_timed
@@ -183,22 +189,74 @@ def run(self, *args, grid, stream):
             launcher.config.pre_hook(
                 {**zip(self.arg_names, args), **launcher.config.kwargs}
             )
-        try:
-            result = launcher(
-                *args,
-                grid=grid,
-                stream=stream,
-            )
-        except TypeError as e:
-            if re.match(r"function takes exactly \d+ arguments \(\d+ given\)", str(e)):
-                raise RuntimeError(
-                    """Consider updating Triton with
-`pip install -U "git+https://github.com/openai/triton@af76c989eb4799b015f8b288ccd8421558772e56#subdirectory=python"`"""
-                ) from e
-            else:
-                raise e
+        return launcher(
+            *args,
+            grid=grid,
+            stream=stream,
+        )
+
+
+def _find_names(obj):
+    import gc
+    import inspect
+
+    frame = inspect.currentframe()
+    for frame in iter(lambda: frame.f_back, None):
+        frame.f_locals
+    obj_names = []
+    for referrer in gc.get_referrers(obj):
+        if isinstance(referrer, dict):
+            for k, v in referrer.items():
+                if v is obj:
+                    obj_names.append(k)
+    return obj_names
+
+
+collected_calls = []
+
+
+def start_graph():
+    collected_calls.clear()
+
+
+def end_graph():
+    if len(collected_calls) == 0:
+        return
+    overall_time = sum(call[1] for call in collected_calls)
+    overall_gb = sum(call[2] for call in collected_calls)
+    cur_file = inspect.stack()[1].filename
+    print(f"SUMMARY ({cur_file})")
+    print(
+        f"{overall_time:.2f}ms\t {overall_gb:.2f} GB\t {overall_gb/(overall_time/1e3):.2f}GB/s"
+    )
+    print()
+
+
+class DebugAutotuner(CachingAutotuner):
+    def __init__(self, *args, regex_filter="", **kwargs):
+        self.regex_filter = regex_filter
+        super().__init__(*args, **kwargs)
+
+    def run(self, *args, grid, stream):
+        possible_names = _find_names(self)
+        kernel_name = f"{max(possible_names, key=lambda x: len(x))}"
+        if not re.match(self.regex_filter, kernel_name):
+            return
+        super().run(*args, grid=grid, stream=stream)
+        (launcher,) = self.launchers
+
+        ms = self.bench(launcher, *args, grid=grid)[0]
+        num_gb = get_num_bytes(*args) / 1e9
+        gb_per_s = num_gb / (ms / 1e3)
+
+        collected_calls.append((kernel_name, ms, num_gb, gb_per_s))
+        import colorama
 
-        return result
+        info_str = f"{kernel_name}\t {ms:.3f}ms\t{num_gb:.3f} GB \t {gb_per_s:.2f}GB/s"
+        if ms > 0.012 and gb_per_s < 650:
+            print(colorama.Fore.RED + info_str + colorama.Fore.RESET)
+        else:
+            print(info_str)
 
 
 def hash_configs(configs: List[Config]):
@@ -270,6 +328,15 @@ def save_cache_hook(cfg):
     mutated_arg_names = meta.pop("mutated_arg_names", ())
 
     def decorator(fn):
+        if config.profile_bandwidth:
+            return DebugAutotuner(
+                fn,
+                meta=meta,
+                regex_filter=config.profile_bandwidth_regex,
+                configs=configs,
+                save_cache_hook=save_cache_hook,
+                mutated_arg_names=mutated_arg_names,
+            )
         return CachingAutotuner(
             fn,
             meta=meta,
@@ -293,6 +360,24 @@ def unique_configs(configs: List[Config]):
     return pruned_configs
 
 
+def check_config(cfg, *, xnumel=None, ynumel=None, znumel=None):
+    for numel, label in zip((xnumel, ynumel, znumel), "XYZ"):
+        if numel is None:
+            continue
+        block = cfg[f"{label}BLOCK"]
+        if numel == 1:
+            assert block == 1, (
+                f"TritonKernel.indexing assumes numel == 1 => BLOCK == 1"
+                f" but {label.lower()}numel=={numel} and {label}BLOCK={block} (cfg={cfg})."
+            )
+        max_block = config.triton.max_block[label]
+        max_block_str = f'config.triton.max_block["{label}"]'
+        assert max_block % block == 0, (
+            f"TritonKernel.indexing assumes {label}BLOCK divides {max_block_str}"
+            f" but {label}BLOCK={block} and {max_block_str}={max_block} (cfg={cfg})."
+        )
+
+
 def triton_config(size_hints, x, y=None, z=None, num_stages=1) -> Config:
     """
     Construct a pointwise triton config with some adjustment heuristics
@@ -342,6 +427,10 @@ def triton_config(size_hints, x, y=None, z=None, num_stages=1) -> Config:
     if z:
         cfg["ZBLOCK"] = z
     num_warps = next_power_of_2(min(max(conditional_product(x, y, z) // 256, 1), 8))
+    xnumel = size_hints[0]
+    ynumel = size_hints[1] if y else None
+    znumel = size_hints[2] if z else None
+    check_config(cfg, xnumel=xnumel, ynumel=ynumel, znumel=znumel)
     return Config(cfg, num_warps=num_warps, num_stages=num_stages)
 
 
@@ -368,6 +457,7 @@ def triton_config_reduction(size_hints, x, r, num_stages=2) -> Config:
 
     cfg = {"XBLOCK": x, "RBLOCK": r}
     num_warps = next_power_of_2(min(max(conditional_product(x, r) // 128, 2), 8))
+    check_config(cfg, xnumel=size_hints[0])
     return Config(cfg, num_warps=num_warps, num_stages=num_stages)
 
 
@@ -397,6 +487,7 @@ def triton_config_tiled_reduction(size_hints, x, y, r, num_stages=2):
 
     cfg = {"XBLOCK": x, "YBLOCK": y, "RBLOCK": r}
     num_warps = next_power_of_2(min(max(conditional_product(x, y, r) // 256, 1), 8))
+    check_config(cfg, xnumel=size_hints[0], ynumel=size_hints[1])
     return Config(cfg, num_warps=num_warps, num_stages=num_stages)
 
 
@@ -404,18 +495,24 @@ def pointwise(size_hints, meta, tile_hint=None, filename=None):
     """
     Construct @triton.heuristics() based on size_hints.
     """
+    numel = functools.reduce(operator.mul, size_hints)
+    bs = max(256, min(numel // 128, 1024))
+
     if len(size_hints) == 1:
-        return cached_autotune([triton_config(size_hints, 1024)], meta=meta)
+        return cached_autotune([triton_config(size_hints, bs)], meta=meta)
     if len(size_hints) == 2:
-        if not config.triton.autotune_pointwise or tile_hint == TileHint.SQUARE:
+        if (
+            not config.triton.autotune_pointwise or tile_hint == TileHint.SQUARE
+        ) and not config.max_autotune:
             return cached_autotune([triton_config(size_hints, 32, 32)], meta=meta)
         return cached_autotune(
             [
                 triton_config(size_hints, 32, 32),
-                triton_config(size_hints, 8, 256),
-                triton_config(size_hints, 256, 8),
-                triton_config(size_hints, 1, 1024),
-                triton_config(size_hints, 1024, 1),
+                triton_config(size_hints, 64, 64),  # ~8% better for fp16
+                triton_config(size_hints, 256, 16),
+                triton_config(size_hints, 16, 256),
+                triton_config(size_hints, bs, 1),
+                triton_config(size_hints, 1, bs),
             ],
             meta=meta,
             filename=filename,
@@ -429,9 +526,9 @@ def pointwise(size_hints, meta, tile_hint=None, filename=None):
                 triton_config(size_hints, 64, 8, 8),
                 triton_config(size_hints, 8, 64, 8),
                 triton_config(size_hints, 8, 8, 64),
-                triton_config(size_hints, 1024, 1, 1),
-                triton_config(size_hints, 1, 1024, 1),
-                triton_config(size_hints, 1, 1, 1024),
+                triton_config(size_hints, bs, 1, 1),
+                triton_config(size_hints, 1, bs, 1),
+                triton_config(size_hints, 1, 1, bs),
             ],
             meta=meta,
             filename=filename,
@@ -449,9 +546,11 @@ def reduction(size_hints, reduction_hint=False, meta=None, filename=None):
         )
         outer_config = triton_config_reduction(size_hints, 128, 8)
         tiny_config = triton_config_reduction(
-            size_hints, 2 * (256 // rnumel) if rnumel <= 256 else 1, rnumel
+            size_hints, 2 * (256 // rnumel) if rnumel <= 256 else 1, min(rnumel, 2048)
         )
-        if reduction_hint == ReductionHint.INNER:
+        if config.max_autotune:
+            pass  # skip all these cases
+        elif reduction_hint == ReductionHint.INNER:
             return cached_autotune([contiguous_config], meta=meta)
         elif reduction_hint == ReductionHint.OUTER:
             return cached_autotune([outer_config], meta=meta)
@@ -463,14 +562,11 @@ def reduction(size_hints, reduction_hint=False, meta=None, filename=None):
             )
         return cached_autotune(
             [
-                triton_config_reduction(size_hints, 64, 64),
-                triton_config_reduction(
-                    size_hints, 128, 8
-                ),  # this one is the best for outer reduction
-                triton_config_reduction(
-                    size_hints, 8, 512
-                ),  # this and the next one seem very similar but both are needed for perf
                 contiguous_config,
+                outer_config,
+                tiny_config,
+                triton_config_reduction(size_hints, 64, 64),
+                triton_config_reduction(size_hints, 8, 512),
             ],
             meta=meta,
             filename=filename,
@@ -478,6 +574,34 @@ def reduction(size_hints, reduction_hint=False, meta=None, filename=None):
     raise NotImplementedError(f"size_hints: {size_hints}")
 
 
+def persistent_reduction(size_hints, reduction_hint=False, meta=None, filename=None):
+    xnumel, rnumel = size_hints
+
+    configs = [
+        triton_config_reduction(size_hints, xblock, rnumel)
+        for xblock in (1, 8, 32, 128)
+        if rnumel * xblock <= 4096 and xblock <= xnumel
+    ]
+
+    # TODO(jansel): we should be able to improve these heuristics
+    if reduction_hint == ReductionHint.INNER and rnumel >= 256:
+        configs = configs[:1]
+    elif reduction_hint == ReductionHint.OUTER:
+        configs = configs[-1:]
+    elif reduction_hint == ReductionHint.OUTER_TINY:
+        configs = [
+            triton_config_reduction(
+                size_hints, 2 * (256 // rnumel) if rnumel <= 256 else 1, rnumel
+            )
+        ]
+
+    return cached_autotune(
+        configs,
+        meta=meta,
+        filename=filename,
+    )
+
+
 def template(num_stages, num_warps, meta, filename=None):
     """
     Compile a triton template
@@ -573,31 +697,16 @@ def conv_heuristics():
 def grid(xnumel, ynumel=None, znumel=None):
     """Helper function to compute triton grids"""
 
-    if ynumel and znumel:
-
-        def grid_fn(meta):
-            return (
-                cdiv(xnumel, meta["XBLOCK"]),
-                cdiv(ynumel, meta["YBLOCK"]),
-                cdiv(znumel, meta["ZBLOCK"]),
-            )
-
-    elif ynumel:
-
-        def grid_fn(meta):
-            return (
-                cdiv(xnumel, meta["XBLOCK"]),
-                cdiv(ynumel, meta["YBLOCK"]),
-                1,
-            )
+    def get_grid_dim(numel, block):
+        if numel is None:
+            return 1
+        return ceildiv(numel, block)
 
-    else:
-
-        def grid_fn(meta):
-            return (
-                cdiv(xnumel, meta["XBLOCK"]),
-                1,
-                1,
-            )
+    def grid_fn(meta):
+        return (
+            get_grid_dim(xnumel, meta.get("XBLOCK", None)),
+            get_grid_dim(ynumel, meta.get("YBLOCK", None)),
+            get_grid_dim(znumel, meta.get("ZBLOCK", None)),
+        )
 
     return grid_fn
diff --git a/torch/_inductor/triton_ops/conv.py b/torch/_inductor/triton_ops/conv.py
index a2098bce1995..be9d24215629 100644
--- a/torch/_inductor/triton_ops/conv.py
+++ b/torch/_inductor/triton_ops/conv.py
@@ -61,7 +61,7 @@ def _kernel_delta_x_hwc(
         BLOCK_N: tl.constexpr,
         # reduction tiling parameter for matmul
         BLOCK_K: tl.constexpr,
-        # Super-blocking for better L2 peformance
+        # Super-blocking for better L2 performance
         GROUP_H: tl.constexpr,
     ):
         """
@@ -248,7 +248,7 @@ def _kernel_delta_x(
         BLOCK_N: tl.constexpr,
         # reduction tiling parameter for matmul
         BLOCK_K: tl.constexpr,
-        # Super-blocking for better L2 peformance
+        # Super-blocking for better L2 performance
         GROUP_H: tl.constexpr,
     ):
         """
@@ -373,7 +373,7 @@ def _kernel_delta_x(
     class _conv:
         kernel = _kernel_delta_x_hwc
 
-        # for the contigous order of w ptr, what"s the corresponding
+        # for the contiguous order of w ptr, what"s the corresponding
         # ptr changes for x in a sliding window
         @staticmethod
         def _delta_x_ptr_hwc(
@@ -465,7 +465,7 @@ def _call(
             shape_w = w.shape
             shape_bias = bias.shape if bias is not None else None
 
-            # indicies for the layout
+            # indices for the layout
             xn, xc, xh, xw = 0, 1, 2, 3
             yn, yc, yh, yw = 0, 1, 2, 3
             wn, wc, wh, ww = 0, 1, 2, 3
diff --git a/torch/_inductor/triton_ops/conv1x1.py b/torch/_inductor/triton_ops/conv1x1.py
index fca5dc3f1d32..a50993512e1f 100644
--- a/torch/_inductor/triton_ops/conv1x1.py
+++ b/torch/_inductor/triton_ops/conv1x1.py
@@ -26,7 +26,7 @@ def _call(
             shape_w = w.shape
             shape_bias = bias.shape if bias is not None else None
 
-            # indicies for the layout
+            # indices for the layout
             xn, xc, xh, xw = 0, 1, 2, 3
             yn, yc, yh, yw = 0, 1, 2, 3
             wn, wc, wh, ww = 0, 1, 2, 3
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index d45002ec8f64..e30a7db8ce84 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -1,10 +1,13 @@
 import collections
 import contextlib
 import functools
+import glob
+import itertools
 import logging
 import math
 import operator
 import os
+import shutil
 import tempfile
 import textwrap
 import time
@@ -17,7 +20,7 @@
 import torch
 from torch.fx.immutable_collections import immutable_dict, immutable_list
 
-from . import config, config as inductor_config
+from . import config
 from .cuda_properties import get_device_capability
 
 log = logging.getLogger(__name__)
@@ -79,6 +82,19 @@ def ceildiv(numer: int, denom: int):
     return -(numer // -denom)
 
 
+def next_power_of_2(n):
+    """Return the smallest power of 2 greater than or equal to n"""
+    assert n <= 2**32, "32-bit only"
+    n -= 1
+    n |= n >> 1
+    n |= n >> 2
+    n |= n >> 4
+    n |= n >> 8
+    n |= n >> 16
+    n += 1
+    return n
+
+
 def convert_shape_to_inductor(lst: List[Union[int, torch.SymInt]]) -> List[sympy.Expr]:
     """
     Gets the shape and stride of a tensor. For non-symbolic tensors, this is
@@ -99,11 +115,14 @@ def convert_shape_to_symint(
     """
     from .virtualized import V
 
-    if all(isinstance(i, int) for i in lst):
-        return lst
-    if all(isinstance(i, sympy.Integer) for i in lst):
-        return [int(i) for i in lst]
-    return [V.graph.sizevars.shape_env.create_symintnode(i) for i in lst]
+    return [
+        i
+        if isinstance(i, int)
+        else int(i)
+        if isinstance(i, sympy.Integer)
+        else V.graph.sizevars.shape_env.create_symintnode(i, hint=None)
+        for i in lst
+    ]
 
 
 def gen_gm_and_inputs(target, args, kwargs):
@@ -255,9 +274,9 @@ def sympy_str(expr: sympy.Expr):
     if isinstance(expr, sympy.Mul):
         return " * ".join(map(sympy_str, expr.args))
 
-    from .ir import CleanDiv, IndexingDiv, ModularIndexing
+    from .ir import CleanDiv, FloorDiv, ModularIndexing
 
-    if isinstance(expr, (ModularIndexing, CleanDiv, IndexingDiv)):
+    if isinstance(expr, (ModularIndexing, CleanDiv, FloorDiv)):
         return f"{expr.func.__name__}({', '.join(map(sympy_str, expr.args))})"
     return str(expr)
 
@@ -289,14 +308,12 @@ def free_symbol_startswith(index: sympy.Expr, prefix: str):
 
 
 def has_incompatible_cudagraph_ops(gm):
-    forbidden_list = set(
-        [
-            "aten._fused_moving_avg_obs_fq_helper.default",
-            "aten._fused_moving_avg_obs_fq_helper_functional.default",
-            "fbgemm.dense_to_jagged.default",
-            "fbgemm.jagged_to_padded_dense.default",
-        ]
-    )
+    forbidden_list = {
+        "aten._fused_moving_avg_obs_fq_helper.default",
+        "aten._fused_moving_avg_obs_fq_helper_functional.default",
+        "fbgemm.dense_to_jagged.default",
+        "fbgemm.jagged_to_padded_dense.default",
+    }
     for node in gm.graph.nodes:
         if str(node.target) in forbidden_list:
             return True
@@ -338,7 +355,9 @@ def fresh_inductor_cache(cache_entries=None):
 
 def argsort(seq):
     # preserve original order for equal strides
-    return list(reversed(sorted(range(len(seq)), key=seq.__getitem__, reverse=True)))
+    getter = seq.__getitem__
+    a_r = range(len(seq))
+    return list(reversed(sorted(a_r, key=getter, reverse=True)))  # noqa: C413
 
 
 @functools.lru_cache(8)
@@ -477,8 +496,77 @@ def is_big_gpu(index):
 
 def use_triton_template(layout):
     return (
-        inductor_config.max_autotune
+        (config.max_autotune or config.search_autotune_cache)
         and layout.device.type == "cuda"
-        and layout.dtype in (torch.float16, torch.bfloat16, torch.float32)
+        and layout.dtype in (torch.float16, torch.bfloat16, torch.float32, torch.int32)
         and is_big_gpu(layout.device.index or 0)
     )
+
+
+class DebugDirManager:
+    counter = itertools.count(0)
+
+    def __init__(self):
+        self.id = next(DebugDirManager.counter)
+        self.prev_debug_name = None
+
+    def __enter__(self):
+        self.prev_debug_name = torch._dynamo.config.debug_dir_root
+        self.new_name = f"{self.prev_debug_name}_tmp_{self.id}"
+        torch._dynamo.config.debug_dir_root = self.new_name
+
+    def __exit__(self, *args):
+        shutil.rmtree(self.new_name)
+        torch._dynamo.config.debug_dir_root = self.prev_debug_name
+
+
+def run_and_get_triton_code(fn, *args, **kwargs):
+    from torch._inductor.debug import DebugContext
+    from torch._inductor.virtualized import V
+
+    torch._dynamo.reset()
+
+    context = DebugContext()
+
+    with DebugDirManager(), mock.patch.object(
+        config.trace, "enabled", True
+    ), context, V.set_debug_handler(context):
+
+        dir_name = "/".join(context._path.split("/")[:-1]) + "/"
+        fil = dir_name + "*inference*"
+        existing_dirs = glob.glob(fil)
+
+        fn(*args, **kwargs)
+
+        assert context._path is not None
+
+        dir_dbg = [x for x in glob.glob(fil) if x not in existing_dirs]
+
+        assert len(dir_dbg) == 1, f"{dir_dbg}, {context._path}"
+
+        full_name = os.path.join(dir_dbg[0], "output_code.py")
+        with open(full_name, "r") as f:
+            return f.read()
+
+
+def developer_warning(msg):
+    """
+    Warnings that will be actionable for PyTorch developers, but not
+    end users.  Allows us to easily disable them in stable releases but
+    keep them on for nightly builds.
+    """
+    if config.developer_warnings:
+        log.warning(msg)
+    else:
+        log.info(msg)
+
+
+def get_num_bytes(*args):
+    """
+    Return the total number of bytes the arguments of tensor type takes.
+    """
+    return sum(
+        arg.numel() * arg.element_size()
+        for arg in args
+        if isinstance(arg, torch.Tensor)
+    )
diff --git a/torch/_inductor/virtualized.py b/torch/_inductor/virtualized.py
index 8fc9206c9ef1..4aec976561f7 100644
--- a/torch/_inductor/virtualized.py
+++ b/torch/_inductor/virtualized.py
@@ -128,8 +128,10 @@ def __getattr__(self, item):
 
 ops = Virtualized("ops", MockHandler)
 _graph = Virtualized("graph", NullHandler)
+_fake_mode = Virtualized("fake_mode", NullHandler)
 _kernel = Virtualized("kernel", NullHandler)
 _debug = Virtualized("debug", NullHandler)
+_interpreter = Virtualized("interpreter", NullHandler)
 
 
 class _V:
@@ -140,8 +142,10 @@ class _V:
     set_ops_handler = ops._set_handler
     get_ops_handler = ops._get_handler
     set_graph_handler = _graph._set_handler
+    set_fake_mode = _fake_mode._set_handler
     set_kernel_handler = _kernel._set_handler
     set_debug_handler = _debug._set_handler
+    set_interpreter_handler = _interpreter._set_handler
 
     @property
     def ops(self) -> MockHandler:
@@ -153,6 +157,11 @@ def graph(self):
         """The graph currently being generated"""
         return _graph._get_handler()
 
+    @property
+    def fake_mode(self):
+        """The graph currently being generated"""
+        return _fake_mode._get_handler()
+
     @property
     def kernel(self):
         """The kernel currently being generated"""
@@ -162,5 +171,9 @@ def kernel(self):
     def debug(self):
         return _debug._get_handler()
 
+    @property
+    def interpreter(self):
+        return _interpreter._get_handler()
+
 
 V = _V()
diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py
index 9d108651ffdd..830b740c95cd 100644
--- a/torch/_jit_internal.py
+++ b/torch/_jit_internal.py
@@ -22,6 +22,7 @@
     Any,
     Callable,
     Dict,
+    Final,
     Generic,
     List,
     Optional,
@@ -38,15 +39,11 @@
 # Otherwise, "AttributeError: module 'torch' has no attribute 'distributed'" is raised.
 import torch.distributed.rpc
 import torch.package._mangling as package_mangling
-from torch._C import Future as CFuture
+from torch._awaits import _Await
+from torch._C import _Await as CAwait, Future as CFuture
 from torch._sources import fake_range, get_source_lines_and_file, parse_def
 from torch.futures import Future
 
-if sys.version_info[:2] > (3, 7):
-    from typing import Final
-else:
-    from typing_extensions import Final
-
 LockType: Type
 try:
     import _thread
@@ -186,7 +183,7 @@ def baz():
     f_locals = frame.f_locals
     f_globals = frame.f_globals
 
-    class env(object):
+    class env:
         def __getattr__(self, key):
             if key in f_locals:
                 return f_locals[key]
@@ -263,7 +260,7 @@ def createResolutionCallbackFromClosure(fn):
     """
     closure = get_closure(fn)
 
-    class closure_lookup(object):
+    class closure_lookup:
         # This is a class since `closure` is a dict and it's easier in
         # `env_helper` if everything just works with `getattr` calls
         def __getattr__(self, key):
@@ -323,7 +320,7 @@ def get_callable_argument_names(fn) -> List[str]:
         # All four other types of arguments do not map to individual values
         # with a keyword as name.
         if not param.kind == param.POSITIONAL_OR_KEYWORD:
-            return []
+            continue
 
         argument_names.append(name)
 
@@ -345,9 +342,7 @@ def get_annotation_str(annotation):
         return f"{get_annotation_str(annotation.value)}[{get_annotation_str(subscript_slice)}]"
     elif isinstance(annotation, ast.Tuple):
         return ",".join([get_annotation_str(elt) for elt in annotation.elts])
-    elif isinstance(annotation, ast.Constant) or isinstance(
-        annotation, ast.NameConstant
-    ):
+    elif isinstance(annotation, (ast.Constant, ast.NameConstant)):
         return f"{annotation.value}"
 
     # If an AST node is not handled here, it's probably handled in ScriptTypeParser.
@@ -516,7 +511,7 @@ def fn(*args, **kwargs):
     return fn
 
 
-class FunctionModifiers(object):
+class FunctionModifiers:
     """
     Used to denote the behavior of a function in TorchScript. See export() and
     ignore() for details.
@@ -529,6 +524,7 @@ class FunctionModifiers(object):
     COPY_TO_SCRIPT_WRAPPER = (
         "if this method is not scripted, copy the python method onto the scripted model"
     )
+    _DROP = "_drop (function is fully ignored, declaration can be unscriptable)"
 
 
 def export(fn):
@@ -591,7 +587,7 @@ def unused(fn):
 
             class MyModule(nn.Module):
                 def __init__(self, use_memory_efficient):
-                    super(MyModule, self).__init__()
+                    super().__init__()
                     self.use_memory_efficient = use_memory_efficient
 
                 @torch.jit.unused
@@ -743,6 +739,11 @@ def decorator(fn):
     return decorator
 
 
+def _drop(fn):
+    fn._torchscript_modifier = FunctionModifiers._DROP
+    return fn
+
+
 def _copy_to_script_wrapper(fn):
     fn._torchscript_modifier = FunctionModifiers.COPY_TO_SCRIPT_WRAPPER
     return fn
@@ -765,12 +766,21 @@ def should_drop(fn) -> bool:
     attr = get_torchscript_modifier(fn)
     if attr is None:
         return False
-    return attr is FunctionModifiers.UNUSED
+    return attr is FunctionModifiers.UNUSED or attr is FunctionModifiers._DROP
 
 
 def is_ignored_fn(fn) -> bool:
     mod = get_torchscript_modifier(fn)
-    return mod is FunctionModifiers.UNUSED or mod is FunctionModifiers.IGNORE
+    return (
+        mod is FunctionModifiers.UNUSED
+        or mod is FunctionModifiers.IGNORE
+        or mod is FunctionModifiers._DROP
+    )
+
+
+def _is_drop_fn(fn) -> bool:
+    mod = get_torchscript_modifier(fn)
+    return mod is FunctionModifiers._DROP
 
 
 def is_static_fn(cls, fn) -> bool:
@@ -1041,6 +1051,12 @@ def is_future(ann) -> bool:
     return getattr(ann, "__origin__", None) is Future
 
 
+def is_await(ann) -> bool:
+    if ann is _Await:
+        return True
+    return getattr(ann, "__origin__", None) is _Await
+
+
 if torch.distributed.rpc.is_available():
     from torch._C._distributed_rpc import PyRRef
     from torch.distributed.rpc import RRef
@@ -1071,7 +1087,7 @@ def is_final(ann) -> bool:
 
 
 # allows BroadcastingList instance to be subscriptable
-class BroadcastingListCls(object):
+class BroadcastingListCls:
     def __getitem__(self, types):
         return
 
@@ -1216,12 +1232,7 @@ def _get_named_tuple_properties(obj):
 def _create_named_tuple(
     t, unqual_name: str, field_names: List[str], defaults: Tuple[Any, ...]
 ):
-    # mypy: namedtuple() expects a string literal as the first argument
-    if sys.version_info < (3, 7, 0):
-        TupleType = collections.namedtuple(unqual_name, field_names)  # type: ignore[no-redef, misc]
-        TupleType.__new__.__defaults__ = defaults  # type: ignore[attr-defined]
-    else:
-        TupleType = collections.namedtuple(unqual_name, field_names, defaults=defaults)  # type: ignore[call-arg, no-redef, misc]
+    TupleType = collections.namedtuple(unqual_name, field_names, defaults=defaults)  # type: ignore[call-arg, no-redef, misc]
     return TupleType(*t)
 
 
@@ -1402,6 +1413,8 @@ def persistent_id(self, obj):
         # the means to access a value.
         if isinstance(obj, CFuture) or is_rref_instance(obj):
             return ""
+        if isinstance(obj, CAwait):
+            return ""
         if isinstance(obj, torch.cuda.Event):
             return ""
         if isinstance(obj, threading.Thread):
diff --git a/torch/_linalg_utils.py b/torch/_linalg_utils.py
index bdd22f395d2d..3a81fc6c27ad 100644
--- a/torch/_linalg_utils.py
+++ b/torch/_linalg_utils.py
@@ -113,6 +113,14 @@ def lstsq(input: Tensor, A: Tensor, *, out=None) -> Tuple[Tensor, Tensor]:
     )
 
 
+def _symeig(
+    input, eigenvectors=False, upper=True, *, out=None
+) -> Tuple[Tensor, Tensor]:
+    raise RuntimeError(
+        "This function was deprecated since version 1.9 and is now removed. Please use the `torch.linalg.eigh` function instead.",
+    )
+
+
 def eig(
     self: Tensor, eigenvectors: bool = False, *, e=None, v=None
 ) -> Tuple[Tensor, Tensor]:
diff --git a/torch/_lobpcg.py b/torch/_lobpcg.py
index 5d2c7a2fff0c..d35aa8fce3a3 100644
--- a/torch/_lobpcg.py
+++ b/torch/_lobpcg.py
@@ -692,7 +692,7 @@ def _lobpcg(
     return worker.E[:k], worker.X[:, :k]
 
 
-class LOBPCG(object):
+class LOBPCG:
     """Worker class of LOBPCG methods."""
 
     def __init__(
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 3bd12be74b40..f074632f9d3f 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -13,7 +13,6 @@
     corresponding_real_dtype,
     elementwise_dtypes,
     ELEMENTWISE_TYPE_PROMOTION_KIND,
-    FloatLike,
     IntLike,
     make_contiguous_strides_for,
 )
@@ -21,7 +20,6 @@
 from torch._prims_common.wrappers import out_wrapper
 from torch._refs import _broadcast_shapes
 
-from torch._subclasses.fake_tensor import check_no_bool_index_tensors
 from torch.utils._pytree import tree_map
 
 
@@ -997,7 +995,6 @@ def vdot(self, other):
 # get shape inference through structured kernels
 @register_meta(aten.index.Tensor)
 def meta_index_Tensor(self, indices):
-    check_no_bool_index_tensors(aten.index.Tensor, self, indices)
     check(indices, lambda: "at least one index must be provided")
     # aten::index is the internal advanced indexing implementation
     # checkIndexTensorTypes and expandTensors
@@ -1154,6 +1151,29 @@ def meta_addbmm(self, batch1, batch2, *, beta=1, alpha=1):
     return self.new_empty(self.size())
 
 
+@register_meta([aten._int_mm])
+@out_wrapper()
+def meta__int_mm(a, b):
+    check(a.dim() == 2, lambda: "a must be a 2D tensor")
+    check(b.dim() == 2, lambda: "b must be a 2D tensor")
+    check(
+        a.dtype is torch.int8,
+        lambda: f"expected self to be int8, got {a.dtype}",
+    )
+    check(
+        b.dtype is torch.int8,
+        lambda: f"expected mat2 to be int8, got {b.dtype}",
+    )
+    check(
+        a.size(1) == b.size(0),
+        lambda: (
+            f"Incompatible matrix sizes for _int_mm ({a.size(0)}x{a.size(1)} "
+            f"and {b.size(0)}x{b.size(1)})"
+        ),
+    )
+    return a.new_empty((a.size(0), b.size(1)), dtype=torch.int32)
+
+
 @register_meta(aten._cdist_forward.default)
 def meta_cdist_forward(x1, x2, p, compute_mode):
     check(
@@ -1471,7 +1491,7 @@ def common_meta_baddbmm_bmm(batch1, batch2, is_bmm, self_baddbmm=None):
         check(self_baddbmm.dim() == 3, lambda: "self must be a 3D tensor")
         check(
             self_baddbmm.size() == output_size,
-            lambda: "Expected an input tensor shape with shape {output_size} but got shape: {self.size()}",
+            lambda: f"Expected an input tensor shape with shape {output_size} but got shape: {self_baddbmm.size()}",
         )
 
     return output
@@ -1668,7 +1688,7 @@ def meta_max_pool2d_with_indices_backward(
 
     check(
         self.dtype == grad_output.dtype,
-        lambda: "expected dtype {self.dtype} for `gradOutput` but got dtype {grad_output.dtype}",
+        lambda: f"Expected dtype {self.dtype} for `gradOutput` but got dtype {grad_output.dtype}",
     )
 
     nOutputPlane = nInputPlane
@@ -1788,27 +1808,6 @@ def zeros_like(
     )
 
 
-# hacky: Please remove after math.ceil works with arange
-@register_meta(aten.arange.default)
-def arange(end, **kwargs):
-    if isinstance(end, FloatLike):
-        end = math.ceil(end)  # type: ignore[arg-type]
-
-    def is_integral(x):
-        return isinstance(x, IntLike) or isinstance(x, bool)
-
-    set_to_integral_dtype = kwargs.get("dtype", None) is None and is_integral(end)
-    if set_to_integral_dtype:
-        kwargs["dtype"] = torch.int64
-
-    return aten.empty([end], **kwargs)
-
-
-@register_meta(aten.arange.start)
-def arange_start(start, end, **kwargs):
-    return aten.arange(end - start, **kwargs)
-
-
 @register_meta(aten.select.int)
 def meta_select(self, dim, index):
     ndim = self.dim()
@@ -2049,9 +2048,22 @@ def meta__scaled_dot_product_flash(
     key: Tensor,
     value: Tensor,
     dropout_p: float = 0.0,
-    return_softmax: bool = False,
     is_causal: bool = False,
+    return_debug_mask: bool = False,
 ):
+    # [Note] SDPA_flash's meta function returns incorrect Philox seed and offset:
+    # We have added logic to torch/_dynamo/variables/torch.py
+    # We need to check if scaled_dot_product_attention will run the flash attention
+    # kernel and if dropout is != 0.0. If that is the case then we want dynamo
+    # to graph break. The derivative calculation for _scaled_dot_product_flash_attention
+    # does not function correctly with cuda graphs because the full philox state is not captured
+    # the forward's return values. Another reason to graph break is that the the meta function
+    # returns the wrong outputs for philox seed and offset and these values get baked into the
+    # inductor fallback calls to the eager kernels.
+    check(
+        dropout_p == 0.0,
+        lambda: f"Can only trace _scaled_dot_product_flash_attention when dropout is set to 0 but got a dropout_p of {dropout_p}.",
+    )
     batch_size = query.size(0)
     num_heads = query.size(1)
     max_seqlen_batch_q = query.size(2)
@@ -2068,7 +2080,7 @@ def meta__scaled_dot_product_flash(
     output = torch.empty(
         (Nnz_q, num_heads, head_dim), dtype=query.dtype, device=query.device
     )
-    ouput = output.view(batch_size, max_seqlen_batch_q, num_heads, head_dim).transpose(
+    output = output.view(batch_size, max_seqlen_batch_q, num_heads, head_dim).transpose(
         1, 2
     )
     max_seqlen_q = math.ceil(max_seqlen_batch_q / 16) * 16
@@ -2077,32 +2089,86 @@ def meta__scaled_dot_product_flash(
         dtype=torch.float,
         device=query.device,
     )
-    is_sm80 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0)
-    is_sm75 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (7, 5)
-    head_size_rounded = 64 if head_dim <= 64 else 128
-    blocksize_c = (
-        128
-        if (head_size_rounded == 128 and (dropout_p != 0.0 or not is_sm80))
-        or (is_sm75 and head_size_rounded == 64 and dropout_p != 0.0)
-        else 256
-    )
-    max_seqlen_k = math.ceil(max_seqlen_batch_k / blocksize_c) * blocksize_c
-    if max_seqlen_k <= 128:
-        max_seqlen_k = 128
-    elif max_seqlen_k <= 256:
-        max_seqlen_k = 256
-
-    softmax = torch.empty(
-        (batch_size, num_heads, max_seqlen_q, max_seqlen_k),
-        dtype=query.dtype,
-        device=query.device,
+    cumulative_sequence_length_q = torch.empty(
+        batch_size + 1, dtype=torch.int32, device="meta"
     )
-    softmax = torch.empty(
-        0,
-        dtype=query.dtype,
-        device=query.device,
+    cumulative_sequence_length_k = torch.empty(
+        batch_size + 1, dtype=torch.int32, device="meta"
+    )
+
+    if return_debug_mask:
+        blocksize_c = 128 if head_dim > 64 else 256
+        max_seqlen_k = math.ceil(max_seqlen_batch_q / blocksize_c)
+        if max_seqlen_batch_k <= 128:
+            max_seqlen_k = 128
+        elif max_seqlen_batch_k <= 256:
+            max_seqlen_k = 256
+        debug_mask = torch.empty(
+            (batch_size, num_heads, max_seqlen_q, max_seqlen_k),
+            dtype=query.dtype,
+            device=query.device,
+        )
+    else:
+        debug_mask = torch.empty(0, dtype=query.dtype, device=query.device)
+
+    return (
+        output,
+        logsumexp,
+        cumulative_sequence_length_q,
+        cumulative_sequence_length_k,
+        max_seqlen_batch_q,
+        max_seqlen_batch_k,
+        1,  # Philox Seed will not be used, see note at top.
+        1,  # Philox Offset will not be used, see note at top.
+        debug_mask,
     )
-    return ouput, logsumexp, softmax
+
+
+@register_meta(
+    [
+        aten._scaled_dot_product_flash_attention_backward,
+    ]
+)
+def meta__scaled_dot_product_flash_backward(
+    grad_out: Tensor,
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    out: Tensor,
+    logsumexp: Tensor,
+    cum_seq_q: Tensor,
+    cum_seq_k: Tensor,
+    max_q: int,
+    max_k: int,
+    dropout_p: float,
+    is_causal: bool,
+    philox_seed: int,
+    philox_offset: int,
+):
+    batch_size = query.size(0)
+    num_heads = query.size(1)
+    head_dim = query.size(3)
+
+    Nnz_q = batch_size * max_q
+    Nnz_kv = batch_size * max_k
+
+    query = query.transpose(1, 2)
+    key = key.transpose(1, 2)
+    value = value.transpose(1, 2)
+
+    query_reshaped = query.reshape(Nnz_q, num_heads, head_dim)
+    key_reshaped = key.reshape(Nnz_kv, num_heads, head_dim)
+    value_reshaped = value.reshape(Nnz_kv, num_heads, head_dim)
+
+    grad_q = torch.empty_like(query_reshaped)
+    grad_k = torch.empty_like(key_reshaped)
+    grad_v = torch.empty_like(value_reshaped)
+
+    grad_q = grad_q.view(batch_size, max_q, num_heads, head_dim).transpose(1, 2)
+    grad_k = grad_k.view(batch_size, max_k, num_heads, head_dim).transpose(1, 2)
+    grad_v = grad_v.view(batch_size, max_k, num_heads, head_dim).transpose(1, 2)
+
+    return grad_q, grad_k, grad_v
 
 
 @register_meta(
@@ -2235,7 +2301,7 @@ def upsample_common_check(input_size, output_size, num_spatial_dims):
 def upsample_nearest1d(input, output_size, scales=None):
     check(
         input.numel() != 0 or multiply_integers(input.size()[1:]),
-        lambda: "Non-empty 3D data tensor expected but got a tensor with sizes {input.size()}",
+        lambda: f"Non-empty 3D data tensor expected but got a tensor with sizes {input.size()}",
     )
     full_output_size = upsample_common_check(
         input.size(), output_size, num_spatial_dims=1
@@ -2249,7 +2315,7 @@ def upsample_nearest1d(input, output_size, scales=None):
 def upsample_nearest2d(input, output_size, scales_h=None, scales_w=None):
     check(
         input.numel() != 0 or multiply_integers(input.size()[1:]),
-        lambda: "Non-empty 4D data tensor expected but got a tensor with sizes {input.size()}",
+        lambda: f"Non-empty 4D data tensor expected but got a tensor with sizes {input.size()}",
     )
     full_output_size = upsample_common_check(
         input.size(), output_size, num_spatial_dims=2
@@ -2273,7 +2339,7 @@ def upsample_nearest2d(input, output_size, scales_h=None, scales_w=None):
 def upsample_nearest3d(input, output_size, scales_d=None, scales_h=None, scales_w=None):
     check(
         input.numel() != 0 or multiply_integers(input.size()[1:]),
-        lambda: "Non-empty 5D data tensor expected but got a tensor with sizes {input.size()}",
+        lambda: f"Non-empty 5D data tensor expected but got a tensor with sizes {input.size()}",
     )
     full_output_size = upsample_common_check(
         input.size(), output_size, num_spatial_dims=3
@@ -2586,6 +2652,30 @@ def mkldnn_rnn_layer_backward(
     return diff_x, diff_w1, diff_w2, diff_b, diff_b, diff_hx, diff_cx
 
 
+@register_meta([aten.bucketize.Tensor, aten.bucketize.Tensor_out])
+@out_wrapper()
+def meta_bucketize(self, boundaries, *, out_int32=False, right=False):
+    return torch.empty_like(
+        self, dtype=torch.int32 if out_int32 else torch.int64
+    ).contiguous()
+
+
+@register_meta(aten._upsample_bilinear2d_aa.default)
+def meta_upsample_bilinear2d_aa(
+    input, output_size, align_corners, scales_h=None, scales_w=None
+):
+    full_output_size = upsample_common_check(
+        input.size(), output_size, num_spatial_dims=2
+    )
+    check(
+        input.numel() != 0 or all([size > 0 for size in input.size()[1:]]),
+        lambda: f"Non-empty 4D data tensor expected but got a tensor with sizes {input.size()}",
+    )
+    return input.new_empty(full_output_size).to(
+        memory_format=utils.suggest_memory_format(input)
+    )
+
+
 # We must also trigger meta registrations from PrimTorch ref
 # decompositions
 import torch._refs
@@ -2649,4 +2739,21 @@ def activate_meta():
                 _meta_lib_dont_use_me_use_register_meta.impl(op_overload, fn)
 
 
+@register_meta(aten.all_reduce)
+def all_reduce_meta(self, reduceOp, tag, rankset, group_size):
+    return torch.empty_like(self)
+
+
+@register_meta(aten.all_gather_into_tensor)
+def all_gather_into_tensor_meta(shard, tag, rankset, group_size):
+    out_size = list(shard.size())
+    out_size[0] *= group_size
+    return shard.new_empty(out_size)
+
+
+@register_meta(aten.wait_tensor)
+def wait_tensor_meta(self):
+    return torch.empty_like(self)
+
+
 activate_meta()
diff --git a/torch/_ops.py b/torch/_ops.py
index ac60b9aa3f2a..afba4d38d4a2 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -540,7 +540,7 @@ class _OpNamespace(types.ModuleType):
     """
 
     def __init__(self, name):
-        super(_OpNamespace, self).__init__("torch.ops." + name)
+        super().__init__("torch.ops." + name)
         self.name = name
         self._dir = []
 
@@ -584,7 +584,7 @@ def __getattr__(self, op_name):
 
 class _PyOpNamespace(_OpNamespace):
     def __init__(self):
-        super(_PyOpNamespace, self).__init__("torch.ops")
+        super().__init__("torch.ops")
         self.pyop_namespace = pyop_namespace
 
 
@@ -592,7 +592,7 @@ class _Ops(types.ModuleType):
     __file__ = "_ops.py"
 
     def __init__(self):
-        super(_Ops, self).__init__("torch.ops")
+        super().__init__("torch.ops")
         self.loaded_libraries = set()
         self.pyops = _PyOpNamespace()
         self._dir = []
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index 78116e59a8f1..575cdb9f5ce8 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -174,6 +174,7 @@
     "maximum_value",
     "minimum_value",
     "to_dtype",
+    "copy_strided",
     #
     # Inplace prims
     #
@@ -192,8 +193,9 @@
     # Tensor Creation Prims
     #
     "empty_strided",
+    "empty_permuted",
     "scalar_tensor",
-    "arange",
+    "iota",
     #
     # Linear algebra (linalg) Prims
     #
@@ -345,7 +347,7 @@ def _elementwise_meta(
     utils.check_same_device(*args_, allow_cpu_scalar_tensors=True)
     utils.check_same_shape(*args_, allow_cpu_scalar_tensors=True)
 
-    strides = utils.compute_elementwise_output_strides(*args_)
+    l2p_perm = utils.compute_elementwise_output_logical_to_physical_perm(*args_)
     shape = utils.extract_shape(*args_, allow_cpu_scalar_tensors=True)
 
     # Acquires the dtype
@@ -369,8 +371,13 @@ def _elementwise_meta(
     number = None
     for arg in args_:
         if isinstance(arg, TensorLike):
-            device = arg.device
-            break
+            if utils.is_cpu_scalar_tensor(arg):
+                if device is None:
+                    device = arg.device
+                # keep going, in case there is a cuda tensor later
+            else:
+                device = arg.device
+                break
 
         elif isinstance(arg, Number):
             if number is None:
@@ -391,7 +398,8 @@ def _elementwise_meta(
             else:
                 dtype = dtype
 
-        return TensorMeta(device=device, shape=shape, strides=strides, dtype=dtype)
+        assert shape is not None
+        return torch.empty_permuted(shape, l2p_perm, device=device, dtype=dtype)  # type: ignore[return-value]
 
     # Number case
     # TODO: fix number type promotion (bool, complex->float)
@@ -1270,11 +1278,42 @@ def _broadcast_in_dim_aten(a, shape, broadcast_dimensions):
 )
 
 
+def _validate_collapse_args(a: Tensor, start: int, end: int) -> None:
+    # Special-case for zero dimensional tensors
+    ndim = max(1, a.dim())
+    utils.validate_idx(ndim, start)
+    utils.validate_idx(ndim, end)
+
+    # Verifies end is strictly greater than start
+    # (Collapse requires a non-empty interval)
+    utils.check(
+        end >= start,
+        lambda: f"Attempting to collapse but end, {end}, is less than start, {start}!",
+        ValueError,
+    )
+
+
+def _collapsed_shape(shape: ShapeType, start: int, end: int) -> Tuple[int, ...]:
+    """
+    Returns the shape of a with dims in [start, end) merged into a single dimension.
+    """
+    # Special-case for zero dimensional tensors
+    shape = (1,) if len(shape) == 0 else tuple(shape)
+
+    dim_length = 1
+    for s in shape[start : end + 1]:
+        dim_length = dim_length * s
+
+    return shape[0:start] + (dim_length,) + shape[end + 1 :]
+
+
 def _collapse_view_helper(
     a: TensorLikeType, start: int, end: int
 ) -> Tuple[Optional[ShapeType], Optional[StrideType]]:
     assert isinstance(a, TensorLike)
 
+    _validate_collapse_args(a, start, end)
+
     # Special-case for zero dimensional tensors
     if a.ndim == 0:
         shape = (1,)
@@ -1283,23 +1322,12 @@ def _collapse_view_helper(
         shape = a.shape  # type: ignore[assignment]
         strides = a.stride()  # type: ignore[assignment]
 
-    utils.validate_idx(len(shape), start)
-    utils.validate_exclusive_idx(len(shape), end)
-
-    # Verifies end is strictly greater than start
-    # (Collapse requires a non-empty interval)
-    if end <= start:
-        msg = "Attempting to collapse but end, {0}, is less than or equal to start, {1}!".format(
-            end, start
-        )
-        raise ValueError(msg)
-
-    if a.ndim == 0 or (end - 1 == start):
+    if a.ndim == 0 or (end == start):
         return shape, strides
 
-    length = shape[end - 1]
-    stride = strides[end - 1]
-    for idx in reversed(range(start, end - 1)):
+    length = shape[end]
+    stride = strides[end]
+    for idx in range(end - 1, start - 1, -1):
         if shape[idx] == 0 or shape[idx + 1] == 0:
             length = 0
             stride = 0
@@ -1318,8 +1346,8 @@ def _collapse_view_helper(
         ):
             return None, None
 
-    new_shape = shape[:start] + (length,) + shape[end:]
-    new_strides = strides[:start] + (stride,) + strides[end:]
+    new_shape = shape[:start] + (length,) + shape[end + 1 :]
+    new_strides = strides[:start] + (stride,) + strides[end + 1 :]
 
     # NOTE: when the input has no elements it's restrided as if it were contiguous
     if a.numel() == 0:
@@ -1335,25 +1363,12 @@ def _collapse_view_meta(a: TensorLikeType, start: int, end: int) -> TensorLikeTy
         msg = "Attempting to view a collapsed tensor, but no such view exists!"
         raise ValueError(msg)
 
-    if new_strides is None:
-        return a.view(new_shape)
-    else:
-        return a.as_strided(new_shape, new_strides, a.storage_offset())
+    assert new_strides is not None
+    return a.as_strided(new_shape, new_strides, a.storage_offset())
 
 
 def _collapse_view_aten(a: Tensor, start: int, end: int) -> Tensor:
-    # Special-cases zero-dim tensors
-    if a.ndim == 0:
-        shape = (1,)
-    else:
-        shape = a.shape  # type: ignore[assignment]
-
-    dim_length = 1
-    for idx in range(start, end):
-        dim_length = dim_length * shape[idx]
-
-    new_shape = shape[0:start] + (dim_length,) + shape[end:]
-
+    new_shape = _collapsed_shape(a.shape, start, end)
     return a.view(new_shape)
 
 
@@ -1832,19 +1847,35 @@ def _as_strided_scatter_meta(
 #
 # Shape operations
 #
-def collapse(a: Tensor, start: int, end: int) -> Tensor:
-    """
-    Wrapper around reshape that collapses a span of dimensions.
 
-    See collapse_view for the corresponding view operation.
-    """
 
-    dim_length = 1
-    for idx in range(start, end):
-        dim_length = dim_length * a.shape[idx]
+def _collapse_meta(a: Tensor, start: int, end: int) -> Tensor:
+    # Special-case for zero dimensional tensors
+    _validate_collapse_args(a, start, end)
+    new_shape = _collapsed_shape(a.shape, start, end)
+    return a.new_empty(new_shape)
 
-    new_shape = a.shape[0:start] + (dim_length,) + a.shape[end:]
-    return reshape(a, new_shape)
+
+def _collapse_aten(a: Tensor, start: int, end: int) -> Tensor:
+    new_shape = _collapsed_shape(a.shape, start, end)
+    out = a.new_empty(new_shape)
+    with torch.no_grad():
+        out.view_as(a).copy_(a)
+    return out
+
+
+_collapse_doc = """
+Collapse a span of neighboring dimensions into one.
+
+See collapse_view for the corresponding view operation.
+"""
+collapse = _make_prim(
+    schema="collapse(Tensor a, int start, int end) -> Tensor",
+    meta=_collapse_meta,
+    impl_aten=_collapse_aten,
+    return_type=RETURN_TYPE.NEW,
+    doc=_collapse_doc,
+)
 
 
 # TODO: review stride logic
@@ -1926,8 +1957,7 @@ def _reshape_aten(a: Tensor, shape: ShapeType) -> Tensor:
 
 def _rev_meta(a: TensorLikeType, dims: DimsSequenceType) -> TensorLikeType:
     utils.validate_dimension_indices(a.ndim, dims)
-    out = torch.empty_like(a, memory_format=torch.preserve_format)
-    return TensorMeta(out)
+    return torch.empty_like(a, memory_format=torch.preserve_format)
 
 
 _rev_doc = """
@@ -2177,6 +2207,45 @@ def _copy_to_aten(a: Tensor, b: Tensor) -> Tensor:
 )
 
 
+def _copy_strided_meta(a: TensorLikeType, stride: ShapeType):
+    assert isinstance(a, TensorLike)
+    return torch.empty_strided(
+        a.shape,
+        stride,
+        dtype=a.dtype,
+        layout=a.layout,
+        device=a.device,
+        requires_grad=a.requires_grad,
+    )
+
+
+def _copy_strided_aten(a: Tensor, stride: ShapeType) -> Tensor:
+    out = torch.empty_strided(
+        a.size(),
+        stride=stride,
+        dtype=a.dtype,
+        layout=a.layout,
+        device=a.device,
+        requires_grad=a.requires_grad,
+    )
+    out.copy_(a)
+    return out
+
+
+_copy_strided_doc = """
+  Copies the data in a to a new tensor, the new tensor has same shape with a size, but has different stride.
+  """
+
+
+copy_strided = _make_prim(
+    schema="copy_strided(Tensor a, SymInt[] stride) -> Tensor",
+    meta=_copy_strided_meta,
+    impl_aten=_copy_strided_aten,
+    return_type=RETURN_TYPE.NEW,
+    doc=_copy_strided_doc,
+)
+
+
 def _resize_meta(a: TensorLikeType, shape: ShapeType):
     return a.resize_(shape)
 
@@ -2261,7 +2330,7 @@ def _make_reduction_prim(name: str, impl_aten, doc):
 def _make_var_reduction_prim(name: str, impl_aten, doc):
     """Creates a reduction prim."""
     return _make_prim(
-        schema=f"{name}(Tensor inp, int[]? dims, *, int correction, ScalarType? output_dtype=None) -> Tensor",
+        schema=f"{name}(Tensor inp, int[]? dims, *, float correction, ScalarType? output_dtype=None) -> Tensor",
         meta=_var_reduction_meta,
         impl_aten=impl_aten,
         return_type=RETURN_TYPE.NEW,
@@ -2316,84 +2385,56 @@ def _prod_aten(
 )
 
 
-_arange_doc = """
-    Constructs a 1-D tensor with values from the interval [start, end) taken
-    with common difference `step` beginning from `start`.
+_iota_doc = """
+    Constructs a 1-D tensor t where ``t[i] == start + i * step``.
 """
 
 
 # TODO: layout, pin_memory, memory_format
 # TODO: model requires_grad on TensorMeta
-def _arange_meta(
-    start: NumberType,
-    end: NumberType,
-    step: NumberType,
+def _iota_meta(
+    length: int,
     *,
-    dtype: Optional[torch.dtype],
-    device: Optional[torch.device],
+    start: int,
+    step: int,
+    dtype: torch.dtype,
+    device: torch.device,
     requires_grad: bool,
 ) -> TensorLikeType:
-    assert not (
-        isinstance(start, complex)
-        and isinstance(end, complex)
-        and isinstance(step, complex)
-    )
     utils.check(
-        step != 0,
-        lambda: "step must be nonzero",
+        utils.is_integer_dtype(dtype),
+        lambda: "prims.iota only supports integer dtypes",
     )
-    # SymInts can't represent inf
-    if not isinstance(start, torch.SymInt) and not isinstance(end, torch.SymInt):
-        utils.check(
-            math.isfinite(start) and math.isfinite(end),
-            lambda: f"unsupported range: {start} -> {end}",
-        )
-    utils.check(
-        (step > 0 and end >= start) or (step < 0 and end <= start),
-        lambda: "upper bound and lower bound inconsistent with step sign",
+    utils.check(step != 0, lambda: "step must be nonzero")
+    return torch.empty(
+        length,
+        dtype=dtype,
+        device=device,
+        requires_grad=requires_grad,
     )
-    if dtype is not None:
-        pass
-    elif all(isinstance(arg, IntLike) for arg in (start, end, step)):
-        dtype = torch.int64
-    else:
-        dtype = torch.get_default_dtype()
-    device = _get_default_device() if device is None else device
-    shape = (math.ceil((end - start) / step),)
-    strides = utils.make_contiguous_strides_for(shape)
-    return TensorMeta(shape=shape, strides=strides, dtype=dtype, device=device)
 
 
-def _arange_aten(
-    start: NumberType,
-    end: NumberType,
-    step: NumberType,
+def _iota_aten(
+    length: int,
     *,
-    dtype: Optional[torch.dtype],
-    device: Optional[torch.device],
+    start: int,
+    step: int,
+    dtype: torch.dtype,
+    device: torch.device,
     requires_grad: bool,
 ) -> TensorLikeType:
-    # mypy: Not all union combinations were tried because there are too many unions
-    return torch.arange(  # type: ignore[call-overload, misc]
-        start,  # type: ignore[arg-type]
-        end,  # type: ignore[arg-type]
-        step,  # type: ignore[arg-type]
-        dtype=dtype,
-        device=device,
-        layout=torch.strided,
-        pin_memory=False,
-        requires_grad=requires_grad,
+    end = start + length * step
+    return torch.arange(
+        start, end, step, dtype=dtype, device=device, requires_grad=requires_grad
     )
 
 
-# TODO: maybe prims should not have requires_grad arg
-# see: https://github.com/pytorch/pytorch/pull/77542/files#r873943255
-arange = _make_prim(
-    schema="arange(Scalar start, Scalar end, Scalar step, *, ScalarType? dtype, Device? device, bool requires_grad) -> Tensor",  # noqa: B950
+iota = _make_prim(
+    schema="iota(SymInt length, *, SymInt start, SymInt step, ScalarType dtype, Device device, bool requires_grad) -> Tensor",  # noqa: B950
     return_type=RETURN_TYPE.NEW,
-    meta=_arange_meta,
-    impl_aten=_arange_aten,
-    doc=_arange_doc,
+    meta=_iota_meta,
+    impl_aten=_iota_aten,
+    doc=_iota_doc,
 )
 
 
@@ -2450,6 +2491,61 @@ def _empty_strided_meta(
 )
 
 
+def _empty_permuted_meta(
+    shape: ShapeType,
+    physical_layout: DimsSequenceType,
+    *,
+    dtype: torch.dtype,
+    device: torch.device,
+    requires_grad: bool,
+) -> TensorLikeType:
+    p_strides = utils.make_contiguous_strides_for([shape[l] for l in physical_layout])
+    dim = len(shape)
+    utils.check(
+        len(physical_layout) == dim,
+        lambda: (
+            "Number of dimensions in the tensor input does not match the "
+            f"length of the physical layout; i.e. len(size) = {dim} "
+            f"is not equal to len(physical_layout) = {len(physical_layout)}"
+        ),
+    )
+    strides = [0] * len(shape)
+    seen_dims = set()
+    for p, l in enumerate(physical_layout):
+        utils.check(
+            0 <= l < dim,
+            lambda: (
+                f"Dimension out of range (expected to be between 0 and {dim - 1}, but got "
+                f"{l} at index {p}).  NB: negative dims "
+                "not currently supported; file an issue if you want it."
+            ),
+        )
+        utils.check(l not in seen_dims, lambda: "Duplicate dim not allowed")
+        strides[l] = p_strides[p]
+        seen_dims.add(l)
+    return TensorMeta(
+        shape=shape,
+        strides=strides,
+        dtype=dtype,
+        device=device,
+    )
+
+
+_empty_permuted_doc = """
+    Creates a tensor with uninitialized values according to some physical layout,
+    that is guaranteed to be non-overlapping and dense.
+"""
+
+# TODO: add layout, pin_memory
+empty_permuted = _make_prim(
+    schema="empty_permuted(SymInt[] shape, int[] physical_layout, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor",  # noqa: B950
+    return_type=RETURN_TYPE.NEW,
+    meta=_empty_permuted_meta,
+    impl_aten=torch.empty_permuted,
+    doc=_empty_permuted_doc,
+)
+
+
 def _full_meta(
     shape: ShapeType,
     fill_value: NumberType,
diff --git a/torch/_prims/context.py b/torch/_prims/context.py
index 22452e4daefc..7cb3d50c87ff 100644
--- a/torch/_prims/context.py
+++ b/torch/_prims/context.py
@@ -121,9 +121,7 @@ def __torch_function__(
         if torch.overrides.resolve_name(orig_func) in self.skip_ops:
             return orig_func(*args, **kwargs)
 
-        if isinstance(orig_func, torch._ops.OpOverload) or isinstance(
-            orig_func, torch._ops.OpOverloadPacket
-        ):
+        if isinstance(orig_func, (torch._ops.OpOverload, torch._ops.OpOverloadPacket)):
             namespace = str(orig_func).split(".")[0]
             name = str(orig_func).split(".")[1]
             if namespace == "prims":
@@ -333,10 +331,7 @@ def _cudnn_batch_norm_backward(
 
     def _is_var_mean(self, func):
         return "torch.var_mean" == torch.overrides.resolve_name(func) or (
-            (
-                isinstance(func, torch._ops.OpOverload)
-                or isinstance(func, torch._ops.OpOverloadPacket)
-            )
+            (isinstance(func, (torch._ops.OpOverload, torch._ops.OpOverloadPacket)))
             and "aten.var_mean" in str(func)
         )
 
diff --git a/torch/_prims/nvfuser_executor.py b/torch/_prims/nvfuser_executor.py
index b44f7653ee81..7c48bff53c7f 100644
--- a/torch/_prims/nvfuser_executor.py
+++ b/torch/_prims/nvfuser_executor.py
@@ -19,12 +19,29 @@
 from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten
 
 if torch.cuda.is_available():
-    from torch._C._nvfuser import (  # type: ignore[import]
-        DataType,
-        Fusion,
-        FusionDefinition,
-        Tensor,
-    )
+    try:
+        from nvfuser import (  # type: ignore[attr-defined, import]
+            DataType,
+            FusionDefinition,
+            Tensor,
+        )
+
+        def create_fusion_definition():
+            fd = FusionDefinition()
+            return fd, fd
+
+    except ImportError:
+        from nvfuser._C import (  # type: ignore[import]
+            DataType,
+            Fusion,
+            FusionDefinition,
+            Tensor,
+        )
+
+        def create_fusion_definition():
+            fusion = Fusion()
+            return fusion, FusionDefinition(fusion)
+
 else:
     DataType = None
 
@@ -74,7 +91,12 @@ def compute_contiguity(shape, strides):
     Contiguous dimensions are represented by True, strided dimensions
     are represented by False.
     """
-    return torch._C._nvfuser.compute_contiguity(shape, strides)
+    try:
+        from nvfuser import compute_contiguity  # type: ignore[attr-defined]
+    except ImportError:
+        from nvfuser._C import compute_contiguity
+
+    return compute_contiguity(shape, strides)
 
 
 def to_nvfuser_template_args(args):
@@ -146,8 +168,8 @@ def make_nvfuser_fusion(gm: GraphModule, *nv_args_templates):
     output_node = next(filter(lambda n: n.op == "output", gm.graph.nodes))
     orig_flat_out, _ = tree_flatten(output_node.args[0])
 
-    fusion = Fusion()
-    with FusionDefinition(fusion) as fd:
+    fusion, fd = create_fusion_definition()
+    with fd:
 
         def _to_nvfuser_constant(arg):
             if isinstance(arg, Number):
diff --git a/torch/_prims/nvfuser_prims.py b/torch/_prims/nvfuser_prims.py
index 7f3727611dd2..e0b01865164f 100644
--- a/torch/_prims/nvfuser_prims.py
+++ b/torch/_prims/nvfuser_prims.py
@@ -143,7 +143,12 @@
 
 def _assert_nvfuser_op_exists(fname: str):
     try:
-        from torch._C._nvfuser import FusionDefinition as fd  # type: ignore[import]
+        try:
+            from nvfuser import (  # type: ignore[import, attr-defined]
+                FusionDefinition as fd,
+            )
+        except ImportError:
+            from nvfuser._C import FusionDefinition as fd  # type: ignore[import]
 
         assert getattr(fd.Operators, fname)
     except ImportError:
@@ -260,7 +265,7 @@ def _transpose_nvfuser(fd, a, dims):
 
 
 def _squeeze_nvfuser(fd, a, a_shape, dimensions):
-    for idx in reversed(sorted(dimensions)):
+    for idx in sorted(dimensions, reverse=True):
         a = fd.ops.squeeze(a, a_shape, idx)
         a_shape = a_shape[:idx] + a_shape[idx + 1 :]
     return a
@@ -276,7 +281,10 @@ def _view_nvfuser(
     a_shape,
     new_shape,
 ):
-    return fd.ops.view(a, a_shape, new_shape)
+    try:
+        return fd.ops.view(a, a_shape, new_shape)
+    except AttributeError:
+        return fd.ops.reshape(a, a_shape, new_shape)
 
 
 def _sum_nvfuser(
@@ -285,7 +293,12 @@ def _sum_nvfuser(
     dims: DimsSequenceType,
 ):
     keep_dims = False
-    output_dtype = torch._C._nvfuser.DataType.Null
+    try:
+        from nvfuser import DataType  # type: ignore[import, attr-defined]
+    except ImportError:
+        from nvfuser._C import DataType  # type: ignore[import]
+
+    output_dtype = DataType.Null
     return fd.ops.sum(a, dims, keep_dims, output_dtype)
 
 
@@ -294,7 +307,7 @@ def _var_nvfuser(
     a: TensorLikeType,
     dims: DimsSequenceType,
     *,
-    correction: int,
+    correction: float,
 ):
     keep_dims = False
     return fd.ops.var(a, dims, correction, keep_dims)
@@ -307,7 +320,7 @@ def _var_mean_nvfuser(
     unbiased: Optional[bool] = None,
     keepdim: bool = False,
     *,
-    correction: int,
+    correction: float,
 ):
     # Unbiased arg shouldn't be set when this function is called
     assert unbiased is None
@@ -668,7 +681,7 @@ def register_var_mean():
 
     # This signature tries to combine several overloads of the torch.var_mean function into one overload.
     nvprim.define(
-        f"{name}(Tensor inp, int[1]? dim=None, bool? unbiased=None, bool keepdim=False, *, int? correction=None)"
+        f"{name}(Tensor inp, int[1]? dim=None, bool? unbiased=None, bool keepdim=False, *, float? correction=None)"
         + " -> (Tensor, Tensor)"
     )
 
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index eaee1f132164..b02a194b84a7 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -6,11 +6,13 @@
 import operator
 import weakref
 import torch
-from torch import sym_float, sym_int
+from torch import sym_float, sym_int, sym_max
 
-# nvFuser imports are conditional on being compiled with CUDA
-if hasattr(torch._C, "_nvfuser"):
-    from torch._C._nvfuser import DataType  # type: ignore[import]
+try:
+    try:
+        from nvfuser import DataType  # type: ignore[import, attr-defined]
+    except ImportError:
+        from nvfuser._C import DataType  # type: ignore[import]
 
     _torch_dtype_to_nvfuser_dtype_map = {
         torch.cdouble: DataType.ComplexDouble,
@@ -29,7 +31,7 @@
         int: DataType.Int,
         bool: DataType.Bool,
     }
-else:
+except ImportError:
     _torch_dtype_to_nvfuser_dtype_map = {}
 
 
@@ -75,6 +77,7 @@ def getnvFuserDtype(dtype: Union[torch.dtype, NumberTypeType]):
     torch.Tensor.device.__get__,  # type: ignore[attr-defined]
     torch.Tensor.requires_grad.__get__,  # type: ignore[attr-defined]
     torch.Tensor.layout.__get__,  # type: ignore[attr-defined]
+    torch.Tensor.is_contiguous,
     # For TorchRefsMode only
     torch.Tensor.__format__,
     torch.Tensor.__repr__,
@@ -244,14 +247,12 @@ def is_channels_last_contiguous_3d(a: Tensor) -> bool:
     return True
 
 
-_memory_formats = set(
-    (
-        torch.contiguous_format,
-        torch.preserve_format,
-        torch.channels_last,
-        torch.channels_last_3d,
-    )
-)
+_memory_formats = {
+    torch.contiguous_format,
+    torch.preserve_format,
+    torch.channels_last,
+    torch.channels_last_3d,
+}
 
 
 def validate_memory_format(memory_format: torch.memory_format):
@@ -321,7 +322,7 @@ def is_non_overlapping_and_dense(a: Tensor) -> bool:
     # Checks that there exists a permutation of the strides s.t. the tensor would be contiguous
     # Sorts (length, stride) pairs by stride
     lengths_and_strides = sorted(
-        tuple(zip(a.shape, a.stride())), key=operator.itemgetter(1)
+        zip(a.shape, a.stride()), key=operator.itemgetter(1)
     )
 
     expected_stride = 1
@@ -346,33 +347,41 @@ def is_non_overlapping_and_dense(a: Tensor) -> bool:
 # non overlapping and dense strides.
 # This is also INCORRECT because it does not model TensorIterator's
 # short-circuit, which can cause different strides.
-def compute_elementwise_output_strides(*tensors) -> Tuple[int, ...]:
-    """
-    Computes the output strides for elementwise operations.
-    """
-
-    if len(tensors) == 0:
+def compute_elementwise_output_logical_to_physical_perm(*tensors, _skip_checks=False) -> List[int]:
+    if not _skip_checks and len(tensors) == 0:
         msg = "Can't compute elementwise output strides for zero tensors!"
         raise ValueError(msg)
 
-    check_same_shape(*tensors, allow_cpu_scalar_tensors=True)
+    if not _skip_checks:
+        check_same_shape(*tensors, allow_cpu_scalar_tensors=True)
 
     # Filters the tensors to actual tensors
-    tensors = tuple(
-        a for a in tensors if isinstance(a, TensorLike) and not is_cpu_scalar_tensor(a)
-    )
+    if not _skip_checks:
+        tensors = tuple(
+            a for a in tensors if isinstance(a, TensorLike) and not is_cpu_scalar_tensor(a)
+        )
 
     # Short-circuits for CPU scalar case
     if len(tensors) == 0:
-        return ()
+        return []
 
     # Short-circuits for shapes with zero or one dimensions
     # TODO: are these necessary?
     ndim = tensors[0].ndim
     if ndim == 0:
-        return ()
+        return []
     if ndim == 1:
-        return (1,)
+        return [0]
+
+    # Short-circuits if contiguous, following the fake fast path.
+    # This reduces the number of guards we end up making
+    # TODO: do channels last too
+    is_contiguous = True
+    for t in tensors:
+        is_contiguous = is_contiguous and t.is_contiguous(memory_format=torch.contiguous_format)
+
+    if is_contiguous:
+        return list(range(ndim))
 
     shape = tensors[0].shape
 
@@ -398,6 +407,11 @@ def should_swap(idx_a, idx_b):
         # or all strides are equal and all dimensions have the same length
         return 0
 
+    # The "sort" order for the permutation is back-to-front, but
+    # the natural order for permutations is front-to-back.  Do the
+    # sorting back-to-front and then reverse it on output.
+    #
+    # also, note this returns the logical to physical shape permutation
     perm = list(reversed(range(ndim)))
 
     # insertion sort with support for ambiguous comparisons
@@ -411,18 +425,64 @@ def should_swap(idx_a, idx_b):
             elif comparison < 0:
                 break
 
-    permuted_shape = [-1] * ndim
-    for idx, x in enumerate(reversed(perm)):
-        permuted_shape[idx] = shape[x]
+    return list(reversed(perm))
+
+
+def compute_elementwise_output_strides(*tensors) -> Tuple[int, ...]:
+    """
+    Computes the output strides for elementwise operations.
+    """
+    if len(tensors) == 0:
+        msg = "Can't compute elementwise output strides for zero tensors!"
+        raise ValueError(msg)
+
+    check_same_shape(*tensors, allow_cpu_scalar_tensors=True)
+
+    # Filters the tensors to actual tensors
+    tensors = tuple(
+        a for a in tensors if isinstance(a, TensorLike) and not is_cpu_scalar_tensor(a)
+    )
+
+    # Short-circuits for CPU scalar case
+    if len(tensors) == 0:
+        return ()
+
+    ndim = tensors[0].ndim
+    shape = tensors[0].shape
+
+    if ndim == 0:
+        return ()
+    if ndim == 1:
+        return (1,)
+
+    logical_to_physical_perm = compute_elementwise_output_logical_to_physical_perm(
+        *tensors, _skip_checks=True
+    )
+    permuted_shape = apply_perm(shape, logical_to_physical_perm)  # to physical
 
     new_strides = make_contiguous_strides_for(permuted_shape)
-    permuted_strides = [-1] * ndim
-    for idx, x in enumerate(reversed(perm)):
-        permuted_strides[x] = new_strides[idx]
+    permuted_strides = apply_perm(new_strides, invert_perm(logical_to_physical_perm))  # to logical
 
     return tuple(permuted_strides)
 
 
+# Identity permutation is [0, 1, 2]
+def apply_perm(inp, perm):
+    ndim = len(inp)
+    permuted_inp = [-1] * ndim
+    for idx, x in enumerate(perm):
+        permuted_inp[idx] = inp[x]
+    return permuted_inp
+
+
+def invert_perm(perm):
+    ndim = len(perm)
+    new_perm = [-1] * ndim
+    for idx, x in enumerate(perm):
+        new_perm[x] = idx
+    return new_perm
+
+
 #
 # Common helper functions
 #
@@ -1106,6 +1166,21 @@ def check_same_dtype(*args):
 def get_computation_dtype(dtype: torch.dtype) -> torch.dtype:
     return _computation_dtype_map.get(dtype, dtype)
 
+_cpu_acc_type_map = {
+    torch.bfloat16: torch.float64,
+    torch.float16: torch.float64,
+    torch.float32: torch.float64,
+    torch.complex32: torch.complex128,
+    torch.complex64: torch.complex128,
+}
+
+def get_acc_type(dtype: torch.dtype, device: torch.device) -> torch.dtype:
+    # Equivalent to at::toAccumulateType, prefer computation_dtype where possible
+    if device.type == "cpu":
+        return _cpu_acc_type_map.get(dtype, dtype)
+    else:
+        return get_computation_dtype(dtype)
+
 
 class ELEMENTWISE_TYPE_PROMOTION_KIND(Enum):
     DEFAULT = (0,)
@@ -1374,8 +1449,7 @@ def make_contiguous_strides_for(
     strides = []
     for l in reversed(shape):
         strides.append(multiplier)
-        if l != 0:
-            multiplier *= l
+        multiplier *= sym_max(l, 1)
 
     result = tuple(reversed(strides))
 
@@ -1466,20 +1540,20 @@ def reduction_dims(shape: ShapeType, dims: Optional[Sequence]) -> Tuple[int, ...
 
 def set_correction(
     unbiased: Optional[bool] = None,
-    correction: Optional[int] = None,
-):
+    correction: Optional[NumberType] = None,
+) -> float:
     if correction is not None and unbiased is not None:
         raise RuntimeError("cannot specify both correction and unbiased arguments")
     elif correction is None and unbiased is None:
-        correction = 1
+        correction = 1.0
     elif correction is None and unbiased is not None:
-        correction = 0 if unbiased is False else 1
+        correction = 0.0 if unbiased is False else 1.0
     # NB: we don't actually support symint here, but it's harmless to accept
-    if not isinstance(correction, IntLike):
-        raise ValueError("correction argument should be integer")
+    if not isinstance(correction, (IntLike, FloatLike)):
+        raise ValueError("correction argument should be integer or float")
     if correction < 0:
         raise ValueError("correction argument should be non-negative")
-    return correction
+    return sym_float(correction)
 
 
 def compute_required_storage_length(
diff --git a/torch/_prims_common/wrappers.py b/torch/_prims_common/wrappers.py
index 1847164d26b9..ac19d4319932 100644
--- a/torch/_prims_common/wrappers.py
+++ b/torch/_prims_common/wrappers.py
@@ -34,12 +34,9 @@ def _maybe_convert_to_dtype(a: None, dtype: torch.dtype) -> None:
 
 # TODO: implement ref.cast with an option to enforce safe casting
 def _maybe_convert_to_dtype(a, dtype):
-    import torch._prims as prims
     if isinstance(a, TensorLike):
         if a.dtype != dtype:
-            # NOTE: this is incorrect on the CPU
-            # See https://github.com/pytorch/pytorch/issues/77553
-            return prims.convert_element_type(a, dtype)
+            return a.to(dtype)
         return a
     if isinstance(a, Number):
         return utils.dtype_to_type_ctor(dtype)(a)  # type: ignore[arg-type]
@@ -78,7 +75,7 @@ def _annotation_has_type(*, typ, annotation):
     return typ is annotation
 
 
-class elementwise_type_promotion_wrapper(object):
+class elementwise_type_promotion_wrapper:
     """
     Adds elementwise type promotion to a Python reference implementation.
 
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index ea1b35078e2e..f792431a57e7 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -23,6 +23,7 @@
     dtype_to_type,
     ELEMENTWISE_TYPE_PROMOTION_KIND,
     FloatLike,
+    FloatWithoutSymFloat,
     IntLike,
     is_weakly_lesser_type,
     Number,
@@ -82,6 +83,8 @@
     "index_fill_",
     "isfinite",
     "isinf",
+    "isposinf",
+    "isneginf",
     "isnan",
     "isreal",
     "i0",
@@ -193,12 +196,13 @@
     "amin",
     "any",
     "mean",
+    "std",
     "std_mean",
-    "var_mean",
     "sum",
     "sum_to_size",
     "prod",
     "var",
+    "var_mean",
     #
     # Linear algebra ops
     #
@@ -272,6 +276,7 @@
     "arange",
     "empty",
     "empty_like",
+    "empty_permuted",
     "empty_strided",
     "eye",
     "full",
@@ -584,8 +589,8 @@ def floor(a):
 
 @_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT)
 def frac(x: TensorLikeType) -> TensorLikeType:
-    trunc_x = mul(floor(abs(x)), sign(x))
-    return sub(x, trunc_x)
+    trunc_x = torch.mul(torch.floor(torch.abs(x)), torch.sign(x))
+    return torch.sub(x, trunc_x)
 
 
 # imag does not use _make_elementwise_unary_reference because it does not support out
@@ -611,8 +616,10 @@ def isfinite(a: TensorLikeType) -> TensorLikeType:
 @_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL)
 def isinf(a: TensorLikeType) -> TensorLikeType:
     if utils.is_complex_dtype(a.dtype):
-        return logical_or(isinf(real(a)), isinf(imag(a)))
-    return logical_not(logical_or(isnan(a), isfinite(a)))
+        return torch.logical_or(isinf(torch.real(a)), isinf(torch.imag(a)))
+    if utils.is_float_dtype(a.dtype):
+        return torch.abs(a) == float("inf")
+    return torch.zeros_like(a, dtype=torch.bool)
 
 
 @_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL)
@@ -622,8 +629,8 @@ def isposinf(a: TensorLikeType) -> TensorLikeType:
         lambda: f"Complex dtype is not supported for isposinf, got dtype {a.dtype}",
     )
     if utils.is_float_dtype(a.dtype):
-        return eq(a, float("inf"))
-    return zeros_like(a, dtype=torch.bool)
+        return a == float("inf")
+    return torch.zeros_like(a, dtype=torch.bool)
 
 
 @_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL)
@@ -633,8 +640,8 @@ def isneginf(a: TensorLikeType) -> TensorLikeType:
         lambda: f"Complex dtype is not supported for isneginf, got dtype {a.dtype}",
     )
     if utils.is_float_dtype(a.dtype):
-        return eq(a, float("-inf"))
-    return zeros_like(a, dtype=torch.bool)
+        return a == float("-inf")
+    return torch.zeros_like(a, dtype=torch.bool)
 
 
 @_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL)
@@ -733,7 +740,7 @@ def nan_to_num(
     assert isinstance(a, TensorLike)
 
     if utils.is_boolean_dtype(a.dtype) or utils.is_integer_dtype(a.dtype):
-        return clone(a)
+        return a.clone()
 
     if nan is None:
         nan = 0.0
@@ -744,14 +751,9 @@ def nan_to_num(
     if neginf is None:
         neginf = torch.finfo(a.dtype).min
 
-    result = where(isnan(a), nan, a)
-
-    is_neg = signbit(a)
-    is_neginf = bitwise_and(isinf(a), is_neg)
-    result = where(is_neginf, neginf, result)
-
-    is_posinf = bitwise_and(isinf(a), bitwise_not(is_neg))
-    result = where(is_posinf, posinf, result)
+    result = torch.where(torch.isnan(a), nan, a)  # type: ignore[call-overload]
+    result = torch.where(torch.isneginf(a), neginf, result)  # type: ignore[call-overload]
+    result = torch.where(torch.isposinf(a), posinf, result)  # type: ignore[call-overload]
     return result
 
 
@@ -1296,15 +1298,15 @@ def gt(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
 
 
 @_make_elementwise_binary_reference(
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH,
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
     supports_lhs_python_scalar=False,
     supports_rhs_python_scalar=False,
 )
 def heaviside(input: TensorLikeType, values: TensorLikeType) -> TensorLikeType:
-    input_eq_zero = eq(input, 0)
-    input_lt_zero = logical_or(lt(input, 0), isnan(input))
-    zeros_and_ones = where(input_lt_zero, 0, 1)
-    output = where(input_eq_zero, values, zeros_and_ones)
+    input_eq_zero = torch.eq(input, 0)
+    input_lt_zero = torch.logical_or(torch.lt(input, 0), torch.isnan(input))
+    zeros_and_ones = torch.where(input_lt_zero, 0, 1)
+    output = torch.where(input_eq_zero, values, zeros_and_ones)
     return output
 
 
@@ -1444,12 +1446,27 @@ def le(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
     supports_rhs_python_scalar=False,
 )
 def logaddexp(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
-    # Nb. this implementation does nto distribute the gradients evenly when a == b
-    mask = a >= b
+    # Nb. this implementation does not distribute the gradients evenly when a == b
+    mask = torch.real(a) >= torch.real(b)
     max_ = torch.where(mask, a, b)
     min_ = torch.where(mask, b, a)
-    inf_mask = torch.logical_and(torch.isinf(a), a == b)
-    return torch.where(inf_mask, a, max_ + torch.log1p(torch.exp(min_ - max_)))
+    inf_mask = torch.logical_and(
+        torch.logical_not(torch.isfinite(torch.real(a))), torch.real(a) == torch.real(b)
+    )
+    if utils.is_complex_dtype(a.dtype) or utils.is_complex_dtype(b.dtype):
+        # are you wondering what this bunch of codes are for? edge cases!
+        neg_min_mask = torch.real(min_) < 0
+        inf_vals = torch.where(
+            neg_min_mask, min_, torch.log(torch.exp(min_) + torch.exp(max_))
+        )
+        non_nan_vals = torch.where(
+            inf_mask, inf_vals, max_ + torch.log1p(torch.exp(min_ - max_))
+        )
+        # the type for full_like does not include tensor yet
+        nan_mask = torch.isnan(min_)
+        return torch.where(nan_mask, complex(float("nan"), float("nan")), non_nan_vals)  # type: ignore[call-overload]
+    else:
+        return torch.where(inf_mask, a, max_ + torch.log1p(torch.exp(min_ - max_)))
 
 
 # TODO: add docstring
@@ -1602,7 +1619,13 @@ def sub(
                 )
             )
             raise ValueError(msg)
-        b = prims.mul(b, alpha)
+        if isinstance(b, torch.Tensor):
+            b = prims.mul(b, alpha)
+        else:
+            # Carefully not to use prims.mul if b is a scalar / symint.
+            # prims.mul always returns a tensor,
+            # which will mess with type promotion.
+            b = b * alpha
 
     return prims.sub(a, b)
 
@@ -2221,7 +2244,7 @@ def prod(
 @register_decomposition(aten.amin)
 def amin(
     a: TensorLikeType,
-    dim: Union[Optional[int], Optional[List[int]]] = None,
+    dim: Optional[DimsType] = None,
     keepdim: bool = False,
     *,
     out: Optional[Tensor] = None,
@@ -2284,7 +2307,7 @@ def var(
     unbiased: Optional[bool] = None,
     keepdim: bool = False,
     *,
-    correction: Optional[int] = None,
+    correction: Optional[NumberType] = None,
 ) -> TensorLikeType:
     dim, unbiased = _dim_var_dispatch(dim, unbiased)
     correction = utils.set_correction(unbiased, correction)
@@ -2313,30 +2336,19 @@ def std(
     unbiased: Optional[bool] = None,
     keepdim: bool = False,
     *,
-    correction: Optional[int] = None,
+    correction: Optional[NumberType] = None,
 ) -> TensorLikeType:
     dim, unbiased = _dim_var_dispatch(dim, unbiased)
     correction = utils.set_correction(unbiased, correction)
-    # reduces over all dimensions if dim=() is passed
-    if dim == () or dim == []:
-        dim = None
 
     opmath_dtype, dtype = utils.reduction_dtypes(
         a, REDUCTION_OUTPUT_TYPE_KIND.COMPLEX_TO_FLOAT
     )
-
-    result = _reduction(
-        a,
-        partial(prims.var, correction=correction),
-        dims=dim,
-        keepdims=keepdim,
-        dtype=opmath_dtype,
-        out=None,
-        has_identity=True,
-        output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.COMPLEX_TO_FLOAT,
-    )
-    result = sqrt(result)
-    return _maybe_convert_to_dtype(result, dtype)  # type: ignore[return-value,arg-type]
+    a = _maybe_convert_to_dtype(a, opmath_dtype)
+    a_var = torch.var(a, dim, correction=correction, keepdim=keepdim)
+    a_std = torch.sqrt(a_var)
+    assert dtype is not None
+    return _maybe_convert_to_dtype(a_std, dtype)
 
 
 @register_decomposition(aten.mean)
@@ -2393,16 +2405,26 @@ def mean(
 @register_decomposition(aten.std_mean.correction)
 def std_mean(
     a: TensorLikeType,
-    dim: Union[Optional[int], Optional[List[int]]] = None,
+    dim: Optional[DimsType] = None,
     *,
     unbiased: Optional[bool] = None,
     keepdim: bool = False,
-    correction: Optional[int] = None,
+    correction: Optional[NumberType] = None,
 ):
     dim, unbiased = _dim_var_dispatch(dim, unbiased)
-    s = std(a, dim, unbiased, keepdim, correction=correction)
-    m = mean(a, dim, keepdim)
-    return s, m
+    correction = utils.set_correction(unbiased, correction)
+    opmath_dtype, dtype = utils.reduction_dtypes(
+        a, REDUCTION_OUTPUT_TYPE_KIND.COMPLEX_TO_FLOAT
+    )
+    original_dtype = a.dtype
+    a = _maybe_convert_to_dtype(a, opmath_dtype)
+    a_var, a_mean = torch.var_mean(a, dim, correction=correction, keepdim=keepdim)
+    a_std = torch.sqrt(a_var)
+    assert dtype is not None
+    return (
+        _maybe_convert_to_dtype(a_std, dtype),
+        _maybe_convert_to_dtype(a_mean, original_dtype),
+    )
 
 
 @register_decomposition(aten.var_mean)
@@ -2412,7 +2434,7 @@ def var_mean(
     unbiased: Optional[bool] = None,
     keepdim: bool = False,
     *,
-    correction: Optional[int] = None,
+    correction: Optional[NumberType] = None,
 ):
     dim, unbiased = _dim_var_dispatch(dim, unbiased)
     v = var(a, dim, unbiased, keepdim, correction=correction)
@@ -2807,7 +2829,6 @@ def chunk(a: TensorLikeType, chunks: int, dim: int = 0) -> Tuple[TensorLikeType,
     return tuple(result)
 
 
-# Note: flatten, unlike prim.collapse and prim.collapse_view has an inclusive end_dim
 # Note: flatten, unlike other shape operators, returns the input tensor on a no-op (unless
 # a 0D tensor is flattened, in which case it's returned in 1D)
 # CompositeImplicitAutograd - don't register decomp
@@ -2821,12 +2842,12 @@ def flatten(a: TensorLikeType, start_dim: int = 0, end_dim: int = -1) -> TensorL
 
     # Tries to take a view
     # TODO: we could look at directing collapse_view to skip its meta function here (unsafe_collapse_view)
-    new_shape, new_strides = prims._collapse_view_helper(a, start_dim, end_dim + 1)
+    new_shape, new_strides = prims._collapse_view_helper(a, start_dim, end_dim)
     if new_shape is not None:
-        return prims.collapse_view(a, start_dim, end_dim + 1)
+        return prims.collapse_view(a, start_dim, end_dim)
 
     # Makes a copy if it can't make a view
-    return prims.collapse(a, start_dim, end_dim + 1)
+    return prims.collapse(a, start_dim, end_dim)
 
 
 @register_decomposition(aten.flip)
@@ -2956,7 +2977,7 @@ def native_group_norm(
     out, mean, rstd = _normalize(input_reshaped, reduction_dims, eps)
     out = out.view(input.shape)
 
-    broadcast_dims = [0] + list(dim for dim in range(2, input.ndim))
+    broadcast_dims = [0] + list(range(2, input.ndim))
     unsqueeze_bias = None
     if bias is not None:
         unsqueeze_bias = _unsqueeze_multiple(bias, broadcast_dims)
@@ -3226,7 +3247,7 @@ def _reshape_view_helper(a: TensorLikeType, *shape, allow_copy: bool) -> TensorL
             # may return a view of a copy
 
             # Checks if collapse can be a view and short-circuits to copying reshape if it can't
-            new_shape, new_strides = prims._collapse_view_helper(a_, idx, end + 1)
+            new_shape, new_strides = prims._collapse_view_helper(a_, idx, end)
             if new_shape is None:
                 if allow_copy:
                     return prims.reshape(a, shape)
@@ -3282,6 +3303,11 @@ def roll(
         # Keeping this as ref for now as FakeTensor runs into some issues with complex tensors
         return clone(a)
 
+    if a.dim() == 0 and len(dims) > 0:
+        raise IndexError(
+            f"Dimension specified as {dims[0]} but tensor has no dimensions"
+        )
+
     len_shifts = len(shifts)
     len_dims = len(dims)
     if len_shifts != 1 or len_dims != 1:
@@ -3905,12 +3931,17 @@ def T(a: TensorLikeType) -> TensorLikeType:
     return a.t()
 
 
+@register_decomposition(aten.alias)
+def alias(a: TensorLikeType) -> TensorLikeType:
+    return prims.view_of(a)
+
+
 @register_decomposition(aten.transpose)
 def transpose(a: TensorLikeType, dim0: int, dim1: int) -> TensorLikeType:
     _dim0, _dim1 = utils.canonicalize_dims(a.ndim, (dim0, dim1))  # type: ignore[misc]
 
     if a.ndim <= 1 or dim0 == dim1:
-        return prims.view_of(a)
+        return aten.alias.default(a)
 
     _permutation = list(range(0, a.ndim))
     _permutation[_dim0] = _dim1
@@ -3993,7 +4024,6 @@ def ravel(a: TensorLikeType) -> TensorLikeType:
     return reshape(a, (-1,))
 
 
-@register_decomposition(aten.empty.memory_format)
 @out_wrapper()
 def empty(
     *shape,
@@ -4033,6 +4063,25 @@ def empty(
     )
 
 
+@out_wrapper()
+def empty_permuted(
+    shape,
+    physical_layout,
+    dtype: Optional[torch.dtype] = None,
+    layout: torch.layout = torch.strided,
+    device: Optional[torch.device] = None,
+    requires_grad: bool = False,
+    pin_memory: bool = False,
+) -> TensorLikeType:
+    return prims.empty_permuted(
+        shape,
+        physical_layout,
+        dtype=dtype,
+        device=device,
+        requires_grad=requires_grad,
+    )
+
+
 @register_decomposition(aten.new_empty)
 def new_empty(
     a: TensorLikeType,
@@ -4245,10 +4294,13 @@ def empty_like(
         )
 
     # memory_format == torch.preserve_format
-    strides = utils.compute_elementwise_output_strides(a)
-    return torch.empty_strided(
+    logical_to_physical_perm = (
+        utils.compute_elementwise_output_logical_to_physical_perm(a)
+    )
+    # identity perm is [2, 1, 0]
+    return torch.empty_permuted(
         a.shape,
-        strides,
+        logical_to_physical_perm,
         dtype=dtype,
         layout=layout,
         device=device,
@@ -4257,13 +4309,7 @@ def empty_like(
     )
 
 
-@register_decomposition(
-    [
-        aten.arange.default,
-        aten.arange.start,
-        aten.arange.start_step,
-    ]
-)
+@register_decomposition(aten.arange)
 @out_wrapper()
 def arange(
     start: NumberType = 0,
@@ -4278,20 +4324,78 @@ def arange(
 ) -> TensorLikeType:
     utils.check_layout(layout)
     utils.check_pin_memory(pin_memory)
+    device = torch.device(utils.device_or_default(device))
+
+    assert not isinstance(start, complex)
+    assert not isinstance(end, complex)
+    assert not isinstance(step, complex)
+
     # Case: torch.arange(5)
     if end is None:
         end = start
         start = 0
-    return prims.arange(
-        start,
-        end,
-        step,
-        dtype=dtype,
-        # layout=layout,
+    utils.check(step != 0, lambda: "step must be nonzero")
+    utils.check(
+        (step > 0 and end >= start) or (step < 0 and end <= start),
+        lambda: "upper bound and lower bound inconsistent with step sign",
+    )
+
+    def is_finite(x):
+        return not isinstance(x, FloatWithoutSymFloat) or math.isfinite(x)
+
+    utils.check(
+        is_finite(start) and is_finite(end),
+        lambda: f"unsupported range: {start} -> {end}",
+    )
+    utils.check(
+        is_finite(step),
+        lambda: f"step must be finite but got {step}",
+    )
+
+    if dtype is None:
+        args = (start, end, step)
+        integer_args = builtins.all(isinstance(arg, IntLike) for arg in args)
+        dtype = torch.int64 if integer_args else torch.get_default_dtype()
+
+    is_integer = utils.is_integer_dtype(dtype)
+    if is_integer:
+        xstart = sym_int(start)
+        xend = sym_int(end)
+        xstep = sym_int(step)
+
+    # For int64 we truncate arguments to int before calculating length, but
+    # other integral dtypes we don't. Weird... but needed to match ATen shapes.
+    if dtype == torch.int64:
+        length = math.ceil((xend - xstart) / xstep)
+    else:
+        length = math.ceil((end - start) / step)
+
+    if is_integer:
+        return prims.iota(
+            length,
+            start=xstart,
+            step=xstep,
+            dtype=dtype,
+            device=device,
+            requires_grad=requires_grad,
+        )
+
+    computation_dtype = utils.get_acc_type(dtype, device)
+    index = prims.iota(
+        length,
+        start=0,
+        step=1,
+        dtype=torch.int64,
         device=device,
-        # pin_memory=pin_memory,
-        requires_grad=requires_grad,
+        requires_grad=False,
     )
+    index = _maybe_convert_to_dtype(index, computation_dtype)
+    result = start + step * index
+    result = _maybe_convert_to_dtype(result, dtype)
+
+    if requires_grad:
+        result.requires_grad_(True)
+    return result
 
 
 @register_decomposition(aten.lerp)
@@ -4301,8 +4405,11 @@ def arange(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
 )
 def lerp(start: Tensor, end: Tensor, weight: Union[Tensor, NumberType]):
+    inputs = [start, end]
     if isinstance(weight, Number):
         weight = start.new_full((), weight)  # type: ignore[arg-type]
+    else:
+        inputs.append(weight)
     assert isinstance(weight, Tensor)  # mypy
     # We implement it this way for numerical stability. We assume (in the stability optimisation)
     # that 0 <= weight <= 1. We take the abs to deal with complex numbers
@@ -4313,7 +4420,12 @@ def lerp(start: Tensor, end: Tensor, weight: Union[Tensor, NumberType]):
     mask = weight.abs() >= 0.5
     coeff = torch.where(mask, weight - 1, weight)
     base = torch.where(mask, end, start)
-    return coeff * (end - start) + base
+    output = coeff * (end - start) + base
+    # make sure the decomposition output's stride is same as non-decomposition path.
+    stride = utils.compute_elementwise_output_strides(*_maybe_broadcast(*inputs))
+    if output.stride() != stride:
+        return prims.copy_strided(output, stride)
+    return output
 
 
 @register_decomposition(aten.linspace)
@@ -5179,6 +5291,28 @@ def bucketize(
     return start.to(dtype=out_dtype)
 
 
+@register_decomposition(aten.cauchy)
+@out_wrapper()
+@elementwise_type_promotion_wrapper(
+    type_promoting_args=("self",),
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+)
+def cauchy(self, median=0, sigma=1, generator=None):
+    assert generator is None
+    utils.check(
+        not utils.is_complex_dtype(self.dtype)
+        and not utils.is_integer_dtype(self.dtype)
+        and not utils.is_boolean_dtype(self.dtype),
+        lambda: f"Cauchy distribution is a continuous probability distribution. \
+        dtype must be a floating point but you specified {self.dtype}",
+    )
+    utils.check(
+        sigma > 0.0,
+        lambda: f"cauchy_ expects sigma > 0.0, but found sigma={sigma}",
+    )
+    return median + sigma * torch.tan(math.pi * (torch.rand_like(self) - 0.5))
+
+
 @register_decomposition(aten.exponential)
 @out_wrapper()
 @elementwise_type_promotion_wrapper(
@@ -5187,9 +5321,75 @@ def bucketize(
 )
 def exponential(self, rate=1, generator=None):
     assert generator is None
+    utils.check(
+        not utils.is_complex_dtype(self.dtype)
+        and not utils.is_integer_dtype(self.dtype)
+        and not utils.is_boolean_dtype(self.dtype),
+        lambda: f"Exponential distribution is a continuous probability distribution. \
+        dtype must be a floating point but you specified {self.dtype}",
+    )
+    utils.check(
+        rate > 0.0,
+        lambda: f"exponential_ expects lambda > 0.0, but found lambda={rate}",
+    )
     return -1 / rate * torch.log1p(-torch.rand_like(self))
 
 
+@register_decomposition(aten.geometric)
+@out_wrapper()
+@elementwise_type_promotion_wrapper(
+    type_promoting_args=("self",),
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+)
+def geometric(self, p, generator=None):
+    assert generator is None
+    # TODO: fix inductor rand_like for integer, bool dtypes
+    utils.check(
+        not utils.is_complex_dtype(self.dtype)
+        and not utils.is_boolean_dtype(self.dtype),
+        lambda: f"geometric not implemented for {self.dtype}",
+    )
+    utils.check(
+        0 < p and p < 1,
+        lambda: f"geometric_ expects p to be in (0, 1), but got p={p}",
+    )
+    return torch.floor(torch.log1p(-torch.rand_like(self)) / math.log1p(-p)) + 1
+
+
+@register_decomposition(aten.log_normal)
+@out_wrapper()
+@elementwise_type_promotion_wrapper(
+    type_promoting_args=("self",),
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+)
+def log_normal(self, mean=1, std=2, generator=None):
+    assert generator is None
+    utils.check(
+        not utils.is_complex_dtype(self.dtype)
+        and not utils.is_integer_dtype(self.dtype)
+        and not utils.is_boolean_dtype(self.dtype),
+        lambda: f"log_normal not implemented for {self.dtype}",
+    )
+    utils.check(
+        0 < std,
+        lambda: f"log_normal_ expects std > 0.0, but found std={std}",
+    )
+    return torch.exp(std * torch.randn_like(self) + mean)
+
+
+# TODO: add support for functionalization aten.normal_functional
+@register_decomposition(aten.normal)
+@out_wrapper()
+@elementwise_type_promotion_wrapper(
+    type_promoting_args=("self",),
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+)
+def normal(self, mean=0, std=1, generator=None):
+    assert generator is None
+    utils.check(std >= 0, lambda: f"normal expects std >= 0.0, but found std {std}")
+    return std * torch.randn_like(self) + mean
+
+
 # inplace
 abs_ = _make_inplace(abs)
 acos_ = _make_inplace(acos)
@@ -5278,7 +5478,11 @@ def exponential(self, rate=1, generator=None):
 true_divide_ = _make_inplace(true_divide)
 trunc_ = _make_inplace(trunc)
 xlogy_ = _make_inplace(xlogy)
+cauchy_ = _make_inplace(cauchy)
 exponential_ = _make_inplace(exponential)
+geometric_ = _make_inplace(geometric)
+normal_ = _make_inplace(normal)
+log_normal_ = _make_inplace(log_normal)
 zero_ = _make_inplace(zero)
 
 # Views
diff --git a/torch/_refs/fft.py b/torch/_refs/fft.py
index 130c2e761369..54a98c273e85 100644
--- a/torch/_refs/fft.py
+++ b/torch/_refs/fft.py
@@ -1,8 +1,6 @@
 import math
 
-from typing import Iterable, List, NamedTuple, Optional, Sequence, Tuple, Union
-
-from typing_extensions import Literal
+from typing import Iterable, List, Literal, NamedTuple, Optional, Sequence, Tuple, Union
 
 import torch
 import torch._prims as prims
diff --git a/torch/_refs/linalg/__init__.py b/torch/_refs/linalg/__init__.py
index 934220ca200e..92e9b699519e 100644
--- a/torch/_refs/linalg/__init__.py
+++ b/torch/_refs/linalg/__init__.py
@@ -72,9 +72,6 @@ def vector_norm(
 
     if isinstance(dim, Dim):
         dim = [dim]  # type: ignore[assignment]
-    elif not isinstance(dim, List) and dim is not None:
-        # refs.amin just accepts List rather than DimType (Tuple)
-        dim = list(dim)  # type: ignore[assignment]
 
     if x.numel() == 0 and (ord < 0.0 or ord == float("inf")):
         check(
@@ -101,15 +98,15 @@ def vector_norm(
 
     # Implementation
     if ord == 0.0:
-        return refs.sum(refs.ne(x, 0.0), dim=dim, keepdim=keepdim, dtype=result_dtype)
+        return torch.sum(torch.ne(x, 0.0), dim=dim, keepdim=keepdim, dtype=result_dtype)
     elif ord == float("inf"):
-        return to_result_dtype(refs.amax(torch.abs(x), dim=dim, keepdim=keepdim))  # type: ignore[return-value]
+        return to_result_dtype(torch.amax(torch.abs(x), dim=dim, keepdim=keepdim))  # type: ignore[return-value,arg-type]
     elif ord == float("-inf"):
-        return to_result_dtype(refs.amin(torch.abs(x), dim=dim, keepdim=keepdim))  # type: ignore[return-value]
+        return to_result_dtype(torch.amin(torch.abs(x), dim=dim, keepdim=keepdim))  # type: ignore[return-value,arg-type]
     else:
         # From here on the computation dtype is important as the reduction is non-trivial
         x = _maybe_convert_to_dtype(x, computation_dtype)  # type: ignore[assignment]
-        reduce_sum = partial(refs.sum, dim=dim, keepdim=keepdim)
+        reduce_sum = partial(torch.sum, dim=dim, keepdim=keepdim)
 
         if not (ord % 2.0 == 0.0 and utils.is_float_dtype(x.dtype)):
             x = torch.abs(x)
diff --git a/torch/_refs/nn/functional/__init__.py b/torch/_refs/nn/functional/__init__.py
index f6ea2e55ece4..0ae540b3a3d5 100644
--- a/torch/_refs/nn/functional/__init__.py
+++ b/torch/_refs/nn/functional/__init__.py
@@ -7,7 +7,6 @@
 import torch._prims_common as utils
 import torch._refs as refs
 from torch._decomp import register_decomposition
-from torch._decomp.decompositions import Reduction
 from torch._prims_common import (
     check,
     ELEMENTWISE_TYPE_PROMOTION_KIND,
@@ -23,8 +22,6 @@
 )
 from torch._refs import _make_inplace
 
-from torch._subclasses.fake_tensor import FakeTensor
-
 __all__ = [
     "alpha_dropout",
     "celu",
@@ -454,7 +451,7 @@ def hardshrink(a: TensorLikeType, lambd: float = 0.5):
     # hardshrink(x) = x if x > lambd
     #               = x if x < -lambd
     #               = 0 otherwise
-    return refs.where(refs.logical_and(a >= -lambd, a <= lambd), 0, a)
+    return torch.where(torch.logical_and(a >= -lambd, a <= lambd), 0, a)
 
 
 @register_decomposition(aten.softshrink)
@@ -470,14 +467,16 @@ def softshrink(a: TensorLikeType, lambd: float = 0.5):
     )
     ge_mask = a > lambd
     le_mask = a < -lambd
-    zero_mask = torch.logical_not(refs.logical_or(ge_mask, le_mask))
-    result = refs.where(ge_mask, a - lambd, a)
-    result = refs.where(le_mask, a + lambd, result)
-    return refs.where(zero_mask, 0, result)
+    zero_mask = torch.logical_not(torch.logical_or(ge_mask, le_mask))
+    result = torch.where(ge_mask, a - lambd, a)
+    result = torch.where(le_mask, a + lambd, result)
+    return torch.where(zero_mask, 0, result)
 
 
 # Losses
 def _reduction_int_to_str(reduction: int) -> str:
+    from torch._decomp.decompositions import Reduction
+
     if reduction == Reduction.NONE.value:
         return "none"
     elif reduction == Reduction.MEAN.value:
@@ -650,6 +649,7 @@ def _nll_loss_nd(
     # TODO: This check does not work with FakeTensor inputs; See Issue #85834
     # Explicit cast for class_check to bool; See Issue #78071
     """
+    from torch._subclasses.fake_tensor import FakeTensor
     num_classes = input.shape[1] if input.ndim > 1 else input.shape[0]
     valid_classes_mask = torch.logical_and(
         (flat_target >= 0), (flat_target < num_classes)
diff --git a/torch/_six.py b/torch/_six.py
deleted file mode 100644
index 7ccc12f6bc5d..000000000000
--- a/torch/_six.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright (c) 2010-2017 Benjamin Peterson
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-import math
-
-inf = math.inf
-nan = math.nan
-string_classes = (str, bytes)
-
-
-def with_metaclass(meta: type, *bases) -> type:
-    """Create a base class with a metaclass."""
-    # This requires a bit of explanation: the basic idea is to make a dummy
-    # metaclass for one level of class instantiation that replaces itself with
-    # the actual metaclass.
-    class metaclass(meta):  # type: ignore[misc, valid-type]
-        def __new__(cls, name, this_bases, d):
-            return meta(name, bases, d)
-
-        @classmethod
-        def __prepare__(cls, name, this_bases):
-            return meta.__prepare__(name, bases)
-
-    return type.__new__(metaclass, "temporary_class", (), {})
diff --git a/torch/_sources.py b/torch/_sources.py
index 23d7338114dc..3f56bd8ef247 100644
--- a/torch/_sources.py
+++ b/torch/_sources.py
@@ -93,9 +93,7 @@ def __init__(
         uses_true_division=True,
         funcname=None,
     ):
-        super(SourceContext, self).__init__(
-            source, filename, file_lineno, leading_whitespace_len
-        )
+        super().__init__(source, filename, file_lineno, leading_whitespace_len)
         self.uses_true_division = uses_true_division
         self.filename = filename
         self.funcname = funcname
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index bb85b058d947..c3d29185d677 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -1,6 +1,7 @@
 import contextlib
 import functools
 import itertools
+import logging
 import os
 import weakref
 from dataclasses import dataclass
@@ -11,7 +12,12 @@
 import torch
 from torch._guards import Source
 from torch._ops import OpOverload
-from torch._prims_common import is_float_dtype, is_integer_dtype
+from torch._prims_common import (
+    elementwise_dtypes,
+    ELEMENTWISE_TYPE_PROMOTION_KIND,
+    is_float_dtype,
+    is_integer_dtype,
+)
 from torch._subclasses.meta_utils import MetaConverter
 from torch.fx.operator_schemas import normalize_function
 from torch.multiprocessing.reductions import StorageWeakRef
@@ -20,8 +26,11 @@
 from torch.utils._python_dispatch import TorchDispatchMode
 
 from torch.utils._pytree import PyTree, tree_flatten, tree_map, tree_map_only
+from torch.utils._stats import count, count_label
 from torch.utils.weak import WeakIdRef
 
+log = logging.getLogger(__name__)
+
 pytree = torch.utils._pytree
 T = TypeVar("T")
 TensorWeakRef = Any
@@ -30,6 +39,22 @@
 
 CONSTANT_NUMEL_LIMIT = 1
 
+RECURSION_COUNT = 0
+
+
+# Small helper that increments recursion count, and
+# resets it when the object goes out of scope.  Useful
+# if you don't want to increase indentation which is
+# what a context manager would do.
+class IncrementRecursionCount:
+    def __init__(self):
+        global RECURSION_COUNT
+        RECURSION_COUNT += 1
+
+    def __del__(self):
+        global RECURSION_COUNT
+        RECURSION_COUNT -= 1
+
 
 @dataclass
 class UnsupportedFakeTensorException(RuntimeError):
@@ -141,7 +166,7 @@ def tree_flatten_only(ty: Type[T], pytree: PyTree):
 # multiple tensors into fake tensors which share the same view/storage
 # structure. Like `MetaConverter`, it uses `WeakIdRef` to
 # hold a weak reference for all memoized tensors.
-class FakeTensorConverter(object):
+class FakeTensorConverter:
     @property
     def tensor_memo(self):
         return self.meta_converter.tensor_memo
@@ -276,7 +301,6 @@ def from_meta_and_device(self, fake_mode, t, device):
     # You're allowed to pass a meta tensor to be turned into a fake
     # tensor; although an odd thing to do, this can occur if you're doing
     # cross ref testing and the inner test is already operating on meta tensors.
-    # You must have created the FakeTensorMode with allow_meta == True
     def __call__(
         self,
         fake_mode,
@@ -373,7 +397,7 @@ def _sparse_coo_tensor_with_dims_and_tensors(fake_mode, func, *args, **kwargs):
 # index.Tensor data-dependent in only some conditions
 @register_op_impl(
     lambda func: torch.Tag.dynamic_output_shape in func.tags  # type: ignore[attr-defined]
-    and func != aten.index.Tensor
+    and func not in [aten.index.Tensor, aten.nonzero.default]
 )
 def dyn_shape(fake_mode, func, *args, **kwargs):
     raise DynamicOutputShapeException(func)
@@ -381,7 +405,7 @@ def dyn_shape(fake_mode, func, *args, **kwargs):
 
 @register_op_impl(lambda func: func is torch.ops.aten._local_scalar_dense.default)
 def local_scalar_dense(fake_mode, func, arg):
-    if fake_mode.shape_env is None:
+    if fake_mode.shape_env is None or not fake_mode.shape_env.allow_scalar_outputs:
         # Without symints/symfloats, cannot handle this
         raise DataDependentOutputException(func)
     if is_float_dtype(arg.dtype):
@@ -392,14 +416,47 @@ def local_scalar_dense(fake_mode, func, arg):
         raise NotImplementedError(f"local_scalar_dense/item NYI for {arg.dtype}")
 
 
+@register_op_impl(lambda func: func is torch.ops.aten.nonzero.default)
+def nonzero(fake_mode, func, arg):
+    if (
+        fake_mode.shape_env is None
+        or not fake_mode.shape_env.allow_dynamic_output_shape_ops
+    ):
+        # Without symints/symfloats, cannot handle this
+        raise DynamicOutputShapeException(func)
+
+    if arg.nonzero_memo is None:
+        from torch.fx.experimental.symbolic_shapes import (
+            constrain_range,
+            definitely_true,
+            guard_int,
+        )
+
+        nnz = fake_mode.shape_env.create_unbacked_symint()
+
+        # This is unsound, but it works well in practice
+        # See https://docs.google.com/document/d/1lFRYAJo5nrfxRhwIzGnfi2pbLpU6T4ytSRSuLJ5qebI/edit#
+        # TODO: Add a config knob to turn off this unsound behavior
+        lower = 2
+        upper = None
+        # But don't give totally unsatisfiable bounds if we know it's too small!
+        if definitely_true(arg.numel() < 2):
+            lower = 0
+            upper = guard_int(arg.numel())
+        constrain_range(nnz, min=lower, max=upper)
+
+        arg._nonzero_memo = nnz
+        arg._nonzero_memo_vc = arg._version
+
+    return arg.new_empty((arg.nonzero_memo, arg.dim()), dtype=torch.int64)
+
+
 # NB: this must be ordered after local_scalar_dense
 @register_op_impl(
     lambda func: torch.Tag.data_dependent_output in func.tags  # type: ignore[attr-defined]
 )
 def data_dep(fake_mode, func, *args, **kwargs):
-    if fake_mode.throw_on_data_dependent_ops:
-        raise DataDependentOutputException(func)
-    return NotImplemented
+    raise DataDependentOutputException(func)
 
 
 # Bool Indices get Expanded as Masks
@@ -427,10 +484,17 @@ def run_and_return_new_tensor_of_input_device(fake_mode, func, args, kwargs):
 # index tensors with cuda self
 @register_op_impl(aten.index.Tensor)
 def index_tensor(fake_mode, func, *args, **kwargs):
-    # dynamic shape op if indices are bool/uint8
-    check_no_bool_index_tensors(func, *args, **kwargs)
+    from torch._meta_registrations import meta_index_Tensor
 
-    return run_and_return_new_tensor_of_input_device(fake_mode, func, args, kwargs)
+    _, new_kwargs = normalize_function(
+        func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True
+    )
+
+    out_device = new_kwargs["input"].device
+    # ensure nonzero call goes to fake tensor
+    with fake_mode:
+        out = meta_index_Tensor(*args, **kwargs)
+        return out.to(out_device)
 
 
 # takes in multiple-devices, dont default to default device handling
@@ -469,7 +533,15 @@ def conv(fake_mode, func, *args, **kwargs):
     with fake_mode:
         # if the input is unsqueezed is done in Convolution.cpp we get segfault
         k = kwargs["weight"].ndim
-        if k == 3 and not kwargs["input"].is_mkldnn and not kwargs["input"].is_xpu:
+        batch = kwargs["input"].shape[0]
+
+        from torch.fx.experimental.symbolic_shapes import has_hint
+
+        if not has_hint(batch):
+            # TODO: We can make this a little more faithful with best effort
+            # channels last detection (but only if it's statically obvious!)
+            mem_fmt = None
+        elif k == 3 and not kwargs["input"].is_mkldnn and not kwargs["input"].is_xpu:
             mem_fmt = None
         else:
             if func is aten.convolution.default:
@@ -511,6 +583,189 @@ def convert(t, mem_fmt):
             )
 
 
+FAST_OP_IMPLEMENTATIONS = {}
+
+
+# Unlike register_op_impl, these don't do the slow iteration for
+# run_impl_check, and these run BEFORE decompositions
+def register_fast_op_impl(func: OpOverload):
+    def impl_decorator(op_impl):
+        FAST_OP_IMPLEMENTATIONS[func] = op_impl
+        return op_impl
+
+    return impl_decorator
+
+
+# infer_size_impl in ExpandUtils
+def infer_size(a, b):
+    dimsA = len(a)
+    dimsB = len(b)
+    ndim = max(dimsA, dimsB)
+    expandedSizes = [0] * ndim
+    for i in range(ndim - 1, -1, -1):
+        offset = ndim - 1 - i
+        dimA = dimsA - 1 - offset
+        dimB = dimsB - 1 - offset
+        sizeA = a[dimA] if dimA >= 0 else 1
+        sizeB = b[dimB] if dimB >= 0 else 1
+        if not (sizeA == sizeB or sizeA == 1 or sizeB == 1):
+            raise RuntimeError(
+                f"The size of tensor a ({sizeA}) "
+                f"must match the size of tensor b ({sizeB}) "
+                f"at non-singleton dimension {i})"
+            )
+        expandedSizes[i] = sizeB if sizeA == 1 else sizeA
+    return tuple(expandedSizes)
+
+
+def make_fast_binary_impl(slow_ref):
+    def fast_binary_impl(mode, *args, **kwargs):
+        def slow(msg):
+            count_label(f"slow {msg}")
+            with mode:
+                return slow_ref(*args, **kwargs)
+
+        count_label("attempt fast")
+
+        # Fast path (based off of TensorIterator fast path).
+        # Unfortunately, there is no way to easily deduplicate
+        # this with either the TensorIterator C++ implementation
+        # (which we don't want to SymIntify, and also the algorithm
+        # here is slightly different from TensorIterator to allow
+        # for broadcasting), nor the PrimTorch implementation
+        # (which does not actually implement a fast path.)
+
+        operands = args
+
+        # compute_shape
+        has_scalars = False
+        has_tensors = False
+        final_shape = None
+        for op in operands:
+            shape = op.shape if isinstance(op, torch.Tensor) else ()
+            if len(shape) == 0:
+                has_scalars = True
+            else:
+                has_tensors = True
+            if final_shape is None:
+                final_shape = shape
+            # TODO: Minor optimization: track if the shapes
+            # were equal so you can skip the equality check
+            # below if unnecessary
+            final_shape = infer_size(final_shape, shape)
+        assert final_shape is not None
+
+        # Do some extra safety checks to see if the output
+        # stride is obvious
+        for op in operands:
+            if isinstance(op, torch.Tensor) and op.shape == final_shape:
+                break
+        else:
+            return slow("both tensors nontrivially broadcast")
+
+        # compute_types
+        cpu = torch.device("cpu")
+        common_device = cpu
+        common_dtype = None
+        output_dtype = None
+        has_different_input_dtypes = False
+        for op in operands:
+            if not isinstance(op, torch.Tensor):
+                # Use elementwise_dtypes for the tricky case
+                has_different_input_dtypes = True
+                continue
+            if common_device == cpu and not op.device.type == "cpu":
+                common_device = op.device
+            # Slightly simplified here as target_dtype cannot vary
+            if common_dtype is None:
+                common_dtype = op.dtype
+            elif common_dtype != op.dtype:
+                has_different_input_dtypes = True
+
+        if has_different_input_dtypes:
+            # compute promotion
+            # TODO: we don't need the compute type
+            _, common_dtype = elementwise_dtypes(
+                *operands, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
+            )
+
+        # check all tensors on same device
+        # cpu scalars are assumed allow
+        current_cpu_scalars_on_non_cpu = 0
+        max_cpu_scalars_on_non_cpu = 1  # hard coded atm
+        for op in operands:
+            if not isinstance(op, torch.Tensor):
+                continue
+            if common_device != cpu and op.dim() == 0 and op.device == cpu:
+                if current_cpu_scalars_on_non_cpu >= max_cpu_scalars_on_non_cpu:
+                    return slow("error")
+                current_cpu_scalars_on_non_cpu += 1
+            elif op.device != common_device:
+                return slow("error")
+
+        # compute_fast_setup_type
+        is_contiguous = True
+        is_channels_last = True
+        # TODO: is_non-overlapping_and_dense (not bound from Python
+        # no inplace, no out, everything defined
+        for op in operands:
+            if not isinstance(op, torch.Tensor):
+                continue
+            is_contiguous = is_contiguous and op.is_contiguous(
+                memory_format=torch.contiguous_format
+            )
+            is_channels_last = is_channels_last and op.is_contiguous(
+                memory_format=torch.channels_last
+            )
+        if is_contiguous:
+            # do contiguous
+            count_label("fast is_contiguous")
+            return FakeTensor(
+                mode,
+                torch.empty(
+                    final_shape,
+                    dtype=common_dtype,
+                    device="meta",
+                    memory_format=torch.contiguous_format,
+                ),
+                device=common_device,
+            )
+        if is_channels_last:
+            count_label("fast channels_last")
+            # do channels last
+            return FakeTensor(
+                mode,
+                torch.empty(
+                    final_shape,
+                    dtype=common_dtype,
+                    device="meta",
+                    memory_format=torch.channels_last,
+                ),
+                device=common_device,
+            )
+
+        return slow("no contiguity match")
+
+    return fast_binary_impl
+
+
+@functools.lru_cache(None)
+def get_fast_op_impls():
+    import torch._refs
+
+    register_fast_op_impl(torch.ops.aten.add.Tensor)(
+        make_fast_binary_impl(torch._refs.add)
+    )
+    register_fast_op_impl(torch.ops.aten.sub.Tensor)(
+        make_fast_binary_impl(torch._refs.sub)
+    )
+    register_fast_op_impl(torch.ops.aten.mul.Tensor)(make_fast_binary_impl(torch._refs.mul))  # type: ignore[has-type]
+    register_fast_op_impl(torch.ops.aten.div.Tensor)(
+        make_fast_binary_impl(torch._refs.div)
+    )
+    return FAST_OP_IMPLEMENTATIONS
+
+
 @contextlib.contextmanager
 def in_kernel_invocation_manager(fake_mode):
     # See: note [Fake Tensor Dispatch Keys]
@@ -529,6 +784,13 @@ def in_kernel_invocation_manager(fake_mode):
         del guard
 
 
+# Return if the function allows Python numbers to bind to Tensors
+def should_allow_numbers_as_tensors(func: OpOverload):
+    return torch._C._should_allow_numbers_as_tensors(
+        func.name().split("::")[-1].split(".")[0]
+    )
+
+
 class FakeTensorConfig:
     debug = os.environ.get("TORCH_FAKE_TENSOR_DEBUG", False)
 
@@ -546,6 +808,33 @@ class FakeTensor(torch.Tensor):
     fake_mode: "FakeTensorMode"
     constant: Optional[torch.Tensor]
 
+    # This memorizes the unbacked SymInt representing the number of nonzero
+    # elements in this tensor.  This is helpful if you do something like
+    # x[mask] and y[mask]; mask.nonzero() gets repeatedly called and should
+    # give a consistent unbacked SymInt.  It needs to be invalidated in the
+    # same way constant is.
+    # TODO: Generalize this as needed, e.g., into a trie of memos
+    _nonzero_memo: Optional[torch.SymInt]
+    _nonzero_memo_vc: Optional[int]
+
+    @property
+    def nonzero_memo(self):
+        if self._nonzero_memo is None:
+            return None
+        # Version counter based tracking isn't 100% sound but it's close
+        # enough
+        if self._nonzero_memo_vc != self._version:
+            self._nonzero_memo = None
+            return None
+        return self._nonzero_memo
+
+    @property
+    def device(self):
+        if self.fake_mode.in_kernel_invocation:
+            return torch.device("meta")
+        else:
+            return self.fake_device
+
     # Note: [Fake Tensor Dispatch Keys]
     # In order to model the behavior of device-specific autocast
     # and autograd logic, we update the dispatch keys of FakeTensors
@@ -560,7 +849,7 @@ class FakeTensor(torch.Tensor):
 
     @staticmethod
     def __new__(cls, fake_mode, elem, device, constant=None):
-        return torch.Tensor._make_subclass(
+        self = torch.Tensor._make_subclass(
             cls,
             elem,
             elem.requires_grad,
@@ -568,13 +857,6 @@ def __new__(cls, fake_mode, elem, device, constant=None):
             device_for_backend_keys=device,
         )
 
-    def __init__(
-        self,
-        fake_mode,
-        elem,
-        device: Union[torch.device, str],
-        constant: Optional[torch.Tensor] = None,
-    ):
         assert elem.device.type == "meta", elem.device.type
         device = device if isinstance(device, torch.device) else torch.device(device)
         # NB: it is fine, if a little confusing, for device to be meta
@@ -589,13 +871,38 @@ def __init__(
         # normalize cuda device.
         if device.type == "cuda" and device.index is None:
             device = torch.device(f"cuda:{torch.cuda.current_device()}")
-        self.fake_device = device
-        self.fake_mode = fake_mode
-        self.constant = constant
+        self.fake_device = device  # type: ignore[attr-defined]
+        self.fake_mode = fake_mode  # type: ignore[attr-defined]
+        self.constant = constant  # type: ignore[attr-defined]
+        self._nonzero_memo = None  # type: ignore[attr-defined]
+        self._nonzero_memo_vc = None  # type: ignore[attr-defined]
+
         if FakeTensorConfig.debug:
             import traceback
 
-            self._debug_trace = traceback.extract_stack()
+            self._debug_trace = traceback.extract_stack()  # type: ignore[attr-defined]
+        return self
+
+    # In some circumstances, a conventional torch.Tensor constructor
+    # will get rewritten to call into FakeTensor.  We must provide an
+    # __init__ method that can accept the Python interpreters initialization
+    # in such a situation; we must also be able to handle direct fake
+    # tensor construction via FakeTensor().
+    #
+    # In particular, the __init__ call will look funny in the following case:
+    #
+    #   with FakeTensorMode():
+    #       x = torch.Tensor([1, 2, 3])
+    #
+    # this desugars into:
+    #
+    #   with FakeTensorMode():
+    #       x = torch.Tensor.__new__([1, 2, 3])
+    #       # NB: x is a fake tensor, because of the mode!
+    #       x.__init__([1, 2, 3])  # not the normal fake tensor args!
+    #
+    def __init__(self, *args, **kwargs):
+        super().__init__()
 
     @staticmethod
     def from_tensor(t, fake_mode):
@@ -608,6 +915,7 @@ def __repr__(self):
         return f"FakeTensor({self_repr}, {self.fake_device})"
 
     @classmethod
+    @count
     def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
         # need to handle here to avoid infinite recursion
         # see [in_kernel_invocation]
@@ -692,12 +1000,7 @@ def merge_devices(t):
         # some functions that allow Python numbers to bind to Tensors
         # if we have failed to find a device, and we're running one of these operators,
         # we must have scalar only inputs
-        if (
-            torch._C._should_allow_numbers_as_tensors(
-                func.name().split("::")[-1].split(".")[0]
-            )
-            and common_device is None
-        ):
+        if should_allow_numbers_as_tensors(func) and common_device is None:
             # ops with scalar only inputs always have result on cpu
             has_scalar_only_inputs = True
             common_device = torch.device("cpu")
@@ -723,17 +1026,15 @@ def __init__(
         self,
         *,
         allow_fallback_kernels=True,
-        allow_meta=False,
-        throw_on_data_dependent_ops=True,
         allow_non_fake_inputs=False,
         shape_env=None,
     ):
         self.allow_fallback_kernels = allow_fallback_kernels
         self.fake_tensor_converter = FakeTensorConverter()
-        self.allow_meta = allow_meta
 
-        # TODO: delete arg and default to true. waiting on dynamo perf regression testing
-        self.throw_on_data_dependent_ops = throw_on_data_dependent_ops
+        import torch._functorch.config
+
+        self.allow_meta = torch._functorch.config.fake_tensor_allow_meta
 
         # A flag that controls, whether we want to invoke ops on mix of
         # real weights/global variables and fake inputs
@@ -753,7 +1054,15 @@ def __init__(
 
         self.shape_env = shape_env
 
+    @count
     def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+        try:
+            return self.dispatch(func, types, args, kwargs)
+        except TypeError:
+            log.exception("fake tensor raised TypeError")
+            raise
+
+    def dispatch(self, func, types, args=(), kwargs=None):
         kwargs = kwargs if kwargs else {}
 
         if func == torch.ops.prim.device.default:
@@ -763,6 +1072,12 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             else:
                 return args[0].fake_device
 
+        if log.getEffectiveLevel() <= logging.DEBUG:
+            log.debug(
+                f"{' ' * RECURSION_COUNT}FakeTensorMode.__torch_dispatch__: {func}"
+            )
+            incr = IncrementRecursionCount()
+
         # Some attribute queries that can be serviced directly
         # See Note [is_coalesced is dispatched]
         if func in {
@@ -783,10 +1098,16 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
 
         converter = self.fake_tensor_converter
 
-        # If this is a lift, the input tensor is guaranteed to be a
-        # constant, so we keep a copy of the original argument along so
-        # we can query it if we're asked to item() it at some later point
-        if func in self.lift_fns:
+        # To constant propagate through these functions:
+        # 1, If this is a lift, the input tensor is guaranteed to be a
+        #    constant, so we keep a copy of the original argument along so
+        #    we can query it if we're asked to item() it at some later point
+        # 2, Some functions that allow Python numbers to bind to Tensors, e.g, torch.div
+        if func in self.lift_fns or (
+            should_allow_numbers_as_tensors(func)
+            and not has_symbolic_sizes
+            and not flat_arg_fake_tensors
+        ):
             out = func(*args, **kwargs)
             if self.may_turn_const(out):
                 # NB: not in_kernel_invocation_manager because we're doing real
@@ -866,6 +1187,12 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         # is written to must be invalidated
         self.invalidate_written_to_constants(func, flat_arg_fake_tensors, args, kwargs)
 
+        # Try for fastpath
+        if has_symbolic_sizes:
+            fast_impl = get_fast_op_impls().get(func)
+            if fast_impl is not None:
+                return fast_impl(self, *args, **kwargs)
+
         # If there's a Python meta, prefer that over the decomposition
         from torch._decomp import meta_table as meta_table
 
@@ -988,7 +1315,11 @@ def gen_wrap_fn(self, func, args, kwargs):
         def wrap(e, device=None):
             nonlocal common_device
             nonlocal has_scalar_only_inputs
-            if isinstance(e, torch.Tensor) and not isinstance(e, FakeTensor):
+            if (
+                isinstance(e, torch.Tensor)
+                and not isinstance(e, FakeTensor)
+                and converter is not None
+            ):
                 if common_device is None:
                     (
                         common_device,
@@ -1013,6 +1344,7 @@ def cpp_meta_supports_symint(self, func):
         if torch.Tag.view_copy in func.tags:  # type: ignore[attr-defined]
             return True
         return func in [
+            aten.empty.memory_format,
             aten.empty_strided.default,
             aten.as_strided_scatter.default,
             aten.as_strided.default,
@@ -1034,6 +1366,7 @@ def may_turn_const(self, t):
             t.numel() <= CONSTANT_NUMEL_LIMIT
             and not t.is_sparse
             and not isinstance(t, FakeTensor)
+            and not t.device.type == "meta"
         )
 
     def invalidate_written_to_constants(
diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py
index 640826b0449e..2c298d84eaec 100644
--- a/torch/_subclasses/meta_utils.py
+++ b/torch/_subclasses/meta_utils.py
@@ -475,7 +475,7 @@ def __call__(
                     # don't work
                     t.is_neg(),
                     t.is_conj(),
-                    t.device.type in ("lazy", "meta"),
+                    t.device.type in ("lazy"),
                     # We need a way to test if a tensor is batched but there
                     # is no official APi to do it
                     # torch._C._is_batched(t),
diff --git a/torch/_tensor.py b/torch/_tensor.py
index 7a706536ea77..cabfcbf8983c 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -97,8 +97,7 @@ def __deepcopy__(self, memo):
             # Update the test in test_serialization if you remove 'meta' from here
             if (
                 self.is_sparse
-                or self.device.type
-                in ["lazy", "xla", "mps", "ort", "meta", "hpu", "ipu"]
+                or self.device.type in ["lazy", "xla", "mps", "ort", "meta", "ipu"]
                 or (
                     not torch._C._has_storage(self)
                     and self.device.type == "privateuseone"
@@ -256,7 +255,7 @@ def _reduce_ex_internal(self, proto):
         # 2. Python list is not a good fit due to performance reason.
         #    `tolist()` converts every single element in the tensor into python objects
         #    and serialize them one by one.
-        if self.device.type in ["xla", "ort", "hpu"] or (
+        if self.device.type in ["xla", "ort"] or (
             not torch._C._has_storage(self) and self.device.type == "privateuseone"
         ):
             # Convert BFloat16 tesors to Float32 before conversion to numpy, as numpy doesn't
@@ -662,6 +661,11 @@ def eig(self, eigenvectors=False):
 
         return eig(self, eigenvectors=eigenvectors)
 
+    def symeig(self, eigenvectors=False):
+        from ._linalg_utils import _symeig
+
+        return _symeig(self, eigenvectors=eigenvectors)
+
     def lu(self, pivot=True, get_infos=False):
         r"""See :func:`torch.lu`"""
         # If get_infos is True, then we don't need to check for errors and vice versa
@@ -1115,7 +1119,7 @@ def refine_names(self, *names):
         if has_torch_function_unary(self):
             return handle_torch_function(Tensor.refine_names, (self,), self, *names)
         names = resolve_ellipsis(names, self.names, "refine_names")
-        return super(Tensor, self).refine_names(names)
+        return super().refine_names(names)
 
     def align_to(self, *names):
         r"""Permutes the dimensions of the :attr:`self` tensor to match the order
@@ -1157,8 +1161,8 @@ def align_to(self, *names):
             return handle_torch_function(Tensor.align_to, (self,), self, *names)
         ellipsis_idx = single_ellipsis_index(names, "align_to")
         if ellipsis_idx is None:
-            return super(Tensor, self).align_to(names)
-        return super(Tensor, self).align_to(
+            return super().align_to(names)
+        return super().align_to(
             [name for name in names if not is_ellipsis(name)], ellipsis_idx
         )
 
@@ -1180,9 +1184,9 @@ def unflatten(self, dim, sizes):
             isinstance(sizes, (tuple, list)) and isinstance(sizes[0], (tuple, list))
         ):
             names, sizes = unzip_namedshape(sizes)
-            return super(Tensor, self).unflatten(dim, sizes, names)
+            return super().unflatten(dim, sizes, names)
         else:
-            return super(Tensor, self).unflatten(dim, sizes)
+            return super().unflatten(dim, sizes)
 
     def rename_(self, *names, **rename_map):
         """In-place version of :meth:`~Tensor.rename`."""
@@ -1262,9 +1266,9 @@ def _update_names(self, names, inplace):
 
         # See Note [rename_ / rename API]
         if inplace:
-            return super(Tensor, self).rename_(names)
+            return super().rename_(names)
         else:
-            return super(Tensor, self).rename(names)
+            return super().rename(names)
 
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py
index 427cd5b65591..a504cafd4804 100644
--- a/torch/_tensor_docs.py
+++ b/torch/_tensor_docs.py
@@ -3124,10 +3124,12 @@ def callable(a, b) -> number
 masked_scatter_(mask, source)
 
 Copies elements from :attr:`source` into :attr:`self` tensor at positions where
-the :attr:`mask` is True.
+the :attr:`mask` is True. Elements from :attr:`source` are copied into :attr:`self`
+starting at position 0 of :attr:`source` and continuing in order one-by-one for each
+occurrence of :attr:`mask` being True.
 The shape of :attr:`mask` must be :ref:`broadcastable <broadcasting-semantics>`
 with the shape of the underlying tensor. The :attr:`source` should have at least
-as many elements as the number of ones in :attr:`mask`
+as many elements as the number of ones in :attr:`mask`.
 
 Args:
     mask (BoolTensor): the boolean mask
@@ -3137,6 +3139,16 @@ def callable(a, b) -> number
 
     The :attr:`mask` operates on the :attr:`self` tensor, not on the given
     :attr:`source` tensor.
+
+Example:
+
+    >>> self = torch.tensor([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])
+    >>> mask = torch.tensor([[0, 0, 0, 1, 1], [1, 1, 0, 1, 1]])
+    >>> source = torch.tensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]])
+    >>> self.masked_scatter_(mask, source)
+    tensor([[0, 0, 0, 0, 1],
+            [2, 3, 0, 4, 5]])
+
 """,
 )
 
@@ -4274,6 +4286,11 @@ def callable(a, b) -> number
 Reducing with the addition operation is the same as using
 :meth:`~torch.Tensor.scatter_add_`.
 
+.. warning::
+    The reduce argument with Tensor ``src`` is deprecated and will be removed in
+    a future PyTorch release. Please use :meth:`~torch.Tensor.scatter_reduce_`
+    instead for more reduction options.
+
 Args:
     dim (int): the axis along which to index
     index (LongTensor): the indices of elements to scatter, can be either empty
@@ -4916,15 +4933,6 @@ def callable(a, b) -> number
 """,
 )
 
-add_docstr_all(
-    "symeig",
-    r"""
-symeig(eigenvectors=False, upper=True) -> (Tensor, Tensor)
-
-See :func:`torch.symeig`
-""",
-)
-
 add_docstr_all(
     "swapdims",
     r"""
@@ -6366,6 +6374,21 @@ def callable(a, b) -> number
 masked_scatter(mask, tensor) -> Tensor
 
 Out-of-place version of :meth:`torch.Tensor.masked_scatter_`
+
+.. note::
+
+    The inputs :attr:`self` and :attr:`mask`
+    :ref:`broadcast <broadcasting-semantics>`.
+
+Example:
+
+    >>> self = torch.tensor([0, 0, 0, 0, 0])
+    >>> mask = torch.tensor([[0, 0, 0, 1, 1], [1, 1, 0, 1, 1]])
+    >>> source = torch.tensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]])
+    >>> self.masked_scatter(mask, source)
+    tensor([[0, 0, 0, 0, 1],
+            [2, 3, 0, 4, 5]])
+
 """,
 )
 
diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py
index 3906d7d0e582..adea080f1e86 100644
--- a/torch/_tensor_str.py
+++ b/torch/_tensor_str.py
@@ -3,10 +3,10 @@
 from typing import Optional
 
 import torch
-from torch._six import inf
+from torch import inf
 
 
-class __PrinterOptions(object):
+class __PrinterOptions:
     precision: int = 4
     threshold: float = 1000
     edgeitems: int = 3
@@ -96,7 +96,7 @@ def tensor_totype(t):
     return t.to(dtype=dtype)
 
 
-class _Formatter(object):
+class _Formatter:
     def __init__(self, tensor):
         self.floating_dtype = tensor.dtype.is_floating_point
         self.int_mode = True
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 7d0a3c3f7cf8..c6fe93ef9b78 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -1230,7 +1230,7 @@ def merge_dicts(*dicts):
 :attr:`obj` can be one of:
 
 1. a tensor
-2. a NumPy array
+2. a NumPy array or a NumPy scalar
 3. a DLPack capsule
 4. an object that implements Python's buffer protocol
 5. a scalar
@@ -1245,14 +1245,18 @@ def merge_dicts(*dicts):
 is ``True`` then the returned tensor will require a gradient, and if :attr:`obj` is
 also a tensor with an autograd history then the returned tensor will have the same history.
 
-When :attr:`obj` is not a tensor, NumPy Array, or DLPack capsule but implements Python's
+When :attr:`obj` is not a tensor, NumPy array, or DLPack capsule but implements Python's
 buffer protocol then the buffer is interpreted as an array of bytes grouped according to
 the size of the datatype passed to the :attr:`dtype` keyword argument. (If no datatype is
 passed then the default floating point datatype is used, instead.) The returned tensor
 will have the specified datatype (or default floating point datatype if none is specified)
 and, by default, be on the CPU device and share memory with the buffer.
 
-When :attr:`obj` is none of the above but a scalar or sequence of scalars then the
+When :attr:`obj` is a NumPy scalar, the returned tensor will be a 0-dimensional tensor on
+the CPU and that doesn't share its memory (i.e. ``copy=True``). By default datatype will
+be the PyTorch datatype corresponding to the NumPy's scalar's datatype.
+
+When :attr:`obj` is none of the above but a scalar, or a sequence of scalars then the
 returned tensor will, by default, infer its datatype from the scalar values, be on the
 CPU device, and not share its memory.
 
@@ -1320,6 +1324,10 @@ def merge_dicts(*dicts):
     >>> t2 = torch.asarray(array, dtype=torch.float32)
     >>> array.__array_interface__['data'][0] == t1.data_ptr()
     False
+
+    >>> scalar = numpy.float64(0.5)
+    >>> torch.asarray(scalar)
+    tensor(0.5000, dtype=torch.float64)
 """,
 )
 
@@ -11086,104 +11094,6 @@ def merge_dicts(*dicts):
 """,
 )
 
-add_docstr(
-    torch.symeig,
-    r"""
-symeig(input, eigenvectors=False, upper=True, *, out=None) -> (Tensor, Tensor)
-
-This function returns eigenvalues and eigenvectors
-of a real symmetric or complex Hermitian matrix :attr:`input` or a batch thereof,
-represented by a namedtuple (eigenvalues, eigenvectors).
-
-This function calculates all eigenvalues (and vectors) of :attr:`input`
-such that :math:`\text{input} = V \text{diag}(e) V^T`.
-
-The boolean argument :attr:`eigenvectors` defines computation of
-both eigenvectors and eigenvalues or eigenvalues only.
-
-If it is ``False``, only eigenvalues are computed. If it is ``True``,
-both eigenvalues and eigenvectors are computed.
-
-Since the input matrix :attr:`input` is supposed to be symmetric or Hermitian,
-only the upper triangular portion is used by default.
-
-If :attr:`upper` is ``False``, then lower triangular portion is used.
-
-.. warning::
-
-    :func:`torch.symeig` is deprecated in favor of :func:`torch.linalg.eigh`
-    and will be removed in a future PyTorch release. The default behavior has changed
-    from using the upper triangular portion of the matrix by default to using the
-    lower triangular portion.
-
-    ``L, _ = torch.symeig(A, upper=upper)`` should be replaced with
-
-    .. code :: python
-
-        UPLO = "U" if upper else "L"
-        L = torch.linalg.eigvalsh(A, UPLO=UPLO)
-
-    ``L, V = torch.symeig(A, eigenvectors=True, upper=upper)`` should be replaced with
-
-    .. code :: python
-
-        UPLO = "U" if upper else "L"
-        L, V = torch.linalg.eigh(A, UPLO=UPLO)
-
-.. note:: The eigenvalues are returned in ascending order. If :attr:`input` is a batch of matrices,
-          then the eigenvalues of each matrix in the batch is returned in ascending order.
-
-.. note:: Irrespective of the original strides, the returned matrix `V` will
-          be transposed, i.e. with strides `V.contiguous().mT.stride()`.
-
-.. warning:: Extra care needs to be taken when backward through outputs. Such
-             operation is only stable when all eigenvalues are distinct and becomes
-             less stable the smaller :math:`\min_{i \neq j} |\lambda_i - \lambda_j|` is.
-
-Args:
-    input (Tensor): the input tensor of size :math:`(*, n, n)` where `*` is zero or more
-                    batch dimensions consisting of symmetric or Hermitian matrices.
-    eigenvectors(bool, optional): controls whether eigenvectors have to be computed
-    upper(bool, optional): controls whether to consider upper-triangular or lower-triangular region
-
-Keyword args:
-    out (tuple, optional): the output tuple of (Tensor, Tensor)
-
-Returns:
-    (Tensor, Tensor): A namedtuple (eigenvalues, eigenvectors) containing
-
-        - **eigenvalues** (*Tensor*): Shape :math:`(*, m)`. The eigenvalues in ascending order.
-        - **eigenvectors** (*Tensor*): Shape :math:`(*, m, m)`.
-          If ``eigenvectors=False``, it's an empty tensor.
-          Otherwise, this tensor contains the orthonormal eigenvectors of the ``input``.
-
-Examples::
-
-
-    >>> a = torch.randn(5, 5)
-    >>> a = a + a.t()  # To make a symmetric
-    >>> a
-    tensor([[-5.7827,  4.4559, -0.2344, -1.7123, -1.8330],
-            [ 4.4559,  1.4250, -2.8636, -3.2100, -0.1798],
-            [-0.2344, -2.8636,  1.7112, -5.5785,  7.1988],
-            [-1.7123, -3.2100, -5.5785, -2.6227,  3.1036],
-            [-1.8330, -0.1798,  7.1988,  3.1036, -5.1453]])
-    >>> e, v = torch.symeig(a, eigenvectors=True)
-    >>> e
-    tensor([-13.7012,  -7.7497,  -2.3163,   5.2477,   8.1050])
-    >>> v
-    tensor([[ 0.1643,  0.9034, -0.0291,  0.3508,  0.1817],
-            [-0.2417, -0.3071, -0.5081,  0.6534,  0.4026],
-            [-0.5176,  0.1223, -0.0220,  0.3295, -0.7798],
-            [-0.4850,  0.2695, -0.5773, -0.5840,  0.1337],
-            [ 0.6415, -0.0447, -0.6381, -0.0193, -0.4230]])
-    >>> a_big = torch.randn(5, 2, 2)
-    >>> a_big = a_big + a_big.mT  # To make a_big symmetric
-    >>> e, v = a_big.symeig(eigenvectors=True)
-    >>> torch.allclose(torch.matmul(v, torch.matmul(e.diag_embed(), v.mT)), a_big)
-    True
-""",
-)
 
 add_docstr(
     torch.t,
@@ -12141,7 +12051,7 @@ def merge_dicts(*dicts):
 
 The returned tensor shares the same underlying data with this tensor.
 
-A :attr:`dim` value within the range ``[-input.dim() - 1, input.dim() + 1)``
+A :attr:`dim` value within the range ``[-input.dim() - 1, input.dim() + 1]``
 can be used. Negative :attr:`dim` will correspond to :meth:`unsqueeze`
 applied at :attr:`dim` = ``dim + input.dim() + 1``.
 
@@ -12443,6 +12353,51 @@ def merge_dicts(*dicts):
     ),
 )
 
+add_docstr(
+    torch.empty_permuted,
+    r"""
+empty_permuted(size, physical_layout, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor
+
+Creates an uninitialized, non-overlapping and dense tensor with the
+specified :attr:`size`, with :attr:`physical_layout` specifying how the
+dimensions are physically laid out in memory (each logical dimension is listed
+from outermost to innermost).  :attr:`physical_layout` is a generalization
+of NCHW/NHWC notation: if each dimension is assigned a number according to
+what order they occur in size (N=0, C=1, H=2, W=3), then NCHW is ``(0, 1, 2, 3)``
+while NHWC is ``(0, 2, 3, 1)``.  Equivalently, the strides of the output
+tensor ``t`` are such that ``t.stride(physical_layout[i]) == contiguous_strides[i]``
+(notably, this function is *not* equivalent to ``torch.empty(size).permute(physical_layout)``).
+
+Unlike :func:`torch.empty_strided`, this is guaranteed to produce a dense
+tensor with no overlaps.  If possible, prefer using this function over
+:func:`torch.empty_strided` or manual use of :func:`torch.as_strided`.
+
+Args:
+    size (tuple of int): the shape of the output tensor
+    physical_layout (tuple of int): the ordering of dimensions physically in memory
+
+Keyword args:
+    {dtype}
+    {layout}
+    {device}
+    {requires_grad}
+    {pin_memory}
+
+Examples:
+
+    >>> torch.empty((2, 3, 5, 7)).stride()
+    (105, 35, 7, 1)
+    >>> torch.empty_permuted((2, 3, 5, 7), (0, 1, 2, 3)).stride()
+    (105, 35, 7, 1)
+    >>> torch.empty((2, 3, 5, 7), memory_format=torch.channels_last).stride()
+    (105, 1, 21, 3)
+    >>> torch.empty_permuted((2, 3, 5, 7), (0, 2, 3, 1)).stride()
+    (105, 1, 21, 3)
+""".format(
+        **factory_common_args
+    ),
+)
+
 add_docstr(
     torch.full,
     r"""
@@ -12547,6 +12502,10 @@ def merge_dicts(*dicts):
     tensor([[-0.4620,  0.3139],
             [ 0.3898, -0.7197],
             [ 0.0478, -0.1657]])
+    >>> torch.where(x > 0, 1.0, 0.0)
+    tensor([[0., 1.],
+            [1., 0.],
+            [1., 0.]])
     >>> torch.where(x > 0, x, y)
     tensor([[ 1.0000,  0.3139],
             [ 0.3898,  1.0000],
diff --git a/torch/_utils.py b/torch/_utils.py
index ff2edad4aa3a..cdfc9df18f51 100644
--- a/torch/_utils.py
+++ b/torch/_utils.py
@@ -374,6 +374,8 @@ def _get_obj_state(obj):
     # This loosely mimicks the function on the object class but since Tensor do not inherit
     # from it, we cannot call that function directly
     # https://github.com/python/cpython/blob/c83919bd635f4433f1c6ae8504996a9fe3c215e5/Objects/typeobject.c#L4891
+    # Note that starting with Python 3.11, this `__getstate__` is always defined and thus
+    # the else branch will never be taken.
     getstate_fn = getattr(obj, "__getstate__", None)
     if getstate_fn:
         state = getstate_fn()
@@ -404,8 +406,11 @@ def _set_obj_state(obj, state):
         dict_state = state
         slots_state = None
 
-    for k, v in dict_state.items():
-        setattr(obj, k, v)
+    # Starting with Python 3.11, the __dict__ attribute is lazily created
+    # and is serialized as None when not needed.
+    if dict_state:
+        for k, v in dict_state.items():
+            setattr(obj, k, v)
 
     if slots_state:
         for k, v in slots_state.items():
@@ -602,7 +607,7 @@ def __repr__(self):
         return self
 
 
-class ExceptionWrapper(object):
+class ExceptionWrapper:
     r"""Wraps an exception plus traceback to communicate across threads"""
 
     def __init__(self, exc_info=None, where="in background"):
diff --git a/torch/_weights_only_unpickler.py b/torch/_weights_only_unpickler.py
index 30e10409184f..53107327a3e4 100644
--- a/torch/_weights_only_unpickler.py
+++ b/torch/_weights_only_unpickler.py
@@ -21,6 +21,7 @@
 from pickle import (
     APPEND,
     APPENDS,
+    BINFLOAT,
     BINGET,
     BININT,
     BININT1,
@@ -226,6 +227,8 @@ def load(self):
                 self.append(self.read(1)[0])
             elif key[0] == BININT2[0]:
                 self.append(unpack("<H", read(2))[0])
+            elif key[0] == BINFLOAT[0]:
+                self.append(unpack(">d", self.read(8))[0])
             elif key[0] == BINUNICODE[0]:
                 strlen = unpack("<I", read(4))[0]
                 if strlen > maxsize:
diff --git a/torch/amp/autocast_mode.py b/torch/amp/autocast_mode.py
index e0ff5efed2a4..11ce2c7beb4b 100644
--- a/torch/amp/autocast_mode.py
+++ b/torch/amp/autocast_mode.py
@@ -15,7 +15,7 @@ def decorate_autocast(*args, **kwargs):
     decorate_autocast.__script_unsupported = '@autocast() decorator is not supported in script mode'  # type: ignore[attr-defined]
     return decorate_autocast
 
-class autocast(object):
+class autocast:
     r"""
     Instances of :class:`autocast` serve as context managers or decorators that
     allow regions of your script to run in mixed precision.
@@ -118,7 +118,7 @@ def forward(self, input):
 
         class TestModel(nn.Module):
             def __init__(self, input_size, num_classes):
-                super(TestModel, self).__init__()
+                super().__init__()
                 self.fc1 = nn.Linear(input_size, num_classes)
             def forward(self, x):
                 return self.fc1(x)
diff --git a/torch/ao/nn/intrinsic/__init__.py b/torch/ao/nn/intrinsic/__init__.py
index 7d0c0664e917..a18bae3eaa38 100644
--- a/torch/ao/nn/intrinsic/__init__.py
+++ b/torch/ao/nn/intrinsic/__init__.py
@@ -21,6 +21,8 @@
     'LinearBn1d',
     'LinearLeakyReLU',
     'LinearTanh',
+    'ConvAdd2d',
+    'ConvAddReLU2d',
 ]
 
 # We are exposing all subpackages to the end-user.
diff --git a/torch/ao/nn/intrinsic/modules/__init__.py b/torch/ao/nn/intrinsic/modules/__init__.py
index 46f9a469e407..afc6c63f5f0c 100644
--- a/torch/ao/nn/intrinsic/modules/__init__.py
+++ b/torch/ao/nn/intrinsic/modules/__init__.py
@@ -14,7 +14,8 @@
 from .fused import LinearBn1d
 from .fused import LinearLeakyReLU
 from .fused import LinearTanh
-
+from .fused import ConvAdd2d
+from .fused import ConvAddReLU2d
 
 __all__ = [
     'ConvBn1d',
@@ -32,4 +33,6 @@
     'LinearBn1d',
     'LinearLeakyReLU',
     'LinearTanh',
+    'ConvAdd2d',
+    'ConvAddReLU2d',
 ]
diff --git a/torch/ao/nn/intrinsic/modules/fused.py b/torch/ao/nn/intrinsic/modules/fused.py
index 5eaf6c50e91f..f70a5430e65c 100644
--- a/torch/ao/nn/intrinsic/modules/fused.py
+++ b/torch/ao/nn/intrinsic/modules/fused.py
@@ -4,7 +4,8 @@
 
 __all__ = ['ConvReLU1d', 'ConvReLU2d', 'ConvReLU3d', 'LinearReLU', 'ConvBn1d', 'ConvBn2d',
            'ConvBnReLU1d', 'ConvBnReLU2d', 'ConvBn3d', 'ConvBnReLU3d', 'BNReLU2d', 'BNReLU3d',
-           'LinearBn1d', 'LinearLeakyReLU', 'LinearTanh']
+           'LinearBn1d', 'LinearLeakyReLU', 'LinearTanh', 'ConvAdd2d', 'ConvAddReLU2d']
+
 # Used for identifying intrinsic modules used in quantization
 class _FusedModule(torch.nn.Sequential):
     pass
@@ -61,7 +62,7 @@ def __init__(self, conv, bn):
         assert type_before_parametrizations(conv) == Conv2d and type_before_parametrizations(bn) == BatchNorm2d, \
             'Incorrect types for input modules{}{}'.format(
                 type_before_parametrizations(conv), type_before_parametrizations(bn))
-        super(ConvBn2d, self).__init__(conv, bn)
+        super().__init__(conv, bn)
 
 class ConvBnReLU1d(_FusedModule):
     r"""This is a sequential container which calls the Conv 1d, Batch Norm 1d, and ReLU modules.
@@ -144,3 +145,24 @@ def __init__(self, linear, tanh):
             'Incorrect types for input modules{}{}'.format(
                 type(linear), type(tanh))
         super().__init__(linear, tanh)
+
+class ConvAdd2d(_FusedModule):
+    r"""This is a sequential container which calls the Conv2d modules with extra Add.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, conv, add):
+        super().__init__(conv)
+        self.add = add
+
+    def forward(self, x1, x2):
+        return self.add(self[0](x1), x2)
+
+class ConvAddReLU2d(_FusedModule):
+    r"""This is a sequential container which calls the Conv2d, add, Relu.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, conv, add, relu):
+        super().__init__(conv)
+        self.add = add
+        self.relu = relu
+
+    def forward(self, x1, x2):
+        return self.relu(self.add(self[0](x1), x2))
diff --git a/torch/ao/nn/intrinsic/qat/modules/conv_fused.py b/torch/ao/nn/intrinsic/qat/modules/conv_fused.py
index 6a6f4c14d6b4..b0af9e669876 100644
--- a/torch/ao/nn/intrinsic/qat/modules/conv_fused.py
+++ b/torch/ao/nn/intrinsic/qat/modules/conv_fused.py
@@ -83,7 +83,7 @@ def reset_bn_parameters(self):
             init.uniform_(self.bias, -bound, bound)
 
     def reset_parameters(self):
-        super(_ConvBnNd, self).reset_parameters()
+        super().reset_parameters()
 
     def update_bn_stats(self):
         self.freeze_bn = False
@@ -218,7 +218,7 @@ def _forward_slow(self, input):
 
     def extra_repr(self):
         # TODO(jerryzh): extend
-        return super(_ConvBnNd, self).extra_repr()
+        return super().extra_repr()
 
     def forward(self, input):
         return self._forward(input)
@@ -285,8 +285,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, miss
                 elif strict:
                     missing_keys.append(prefix + v2_name)
 
-        super(_ConvBnNd, self)._load_from_state_dict(
-            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, strict,
+                                      missing_keys, unexpected_keys, error_msgs)
 
     @classmethod
     def from_float(cls, mod):
@@ -476,10 +476,10 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                  padding=0, dilation=1, groups=1,
                  bias=True, padding_mode='zeros',
                  qconfig=None):
-        super(ConvReLU1d, self).__init__(in_channels, out_channels, kernel_size,
-                                         stride=stride, padding=padding, dilation=dilation,
-                                         groups=groups, bias=bias, padding_mode=padding_mode,
-                                         qconfig=qconfig)
+        super().__init__(in_channels, out_channels, kernel_size,
+                         stride=stride, padding=padding, dilation=dilation,
+                         groups=groups, bias=bias, padding_mode=padding_mode,
+                         qconfig=qconfig)
         assert qconfig, 'qconfig must be provided for QAT module'
         self.qconfig = qconfig
         self.weight_fake_quant = self.qconfig.weight()
@@ -574,11 +574,11 @@ def __init__(self,
                  # Args for this module
                  freeze_bn=False,
                  qconfig=None):
-        super(ConvBnReLU2d, self).__init__(in_channels, out_channels, kernel_size, stride,
-                                           padding, dilation, groups, bias,
-                                           padding_mode, eps, momentum,
-                                           freeze_bn,
-                                           qconfig)
+        super().__init__(in_channels, out_channels, kernel_size, stride,
+                         padding, dilation, groups, bias,
+                         padding_mode, eps, momentum,
+                         freeze_bn,
+                         qconfig)
 
     def forward(self, input):
         return F.relu(ConvBn2d._forward(self, input))
@@ -608,10 +608,10 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                  padding=0, dilation=1, groups=1,
                  bias=True, padding_mode='zeros',
                  qconfig=None):
-        super(ConvReLU2d, self).__init__(in_channels, out_channels, kernel_size,
-                                         stride=stride, padding=padding, dilation=dilation,
-                                         groups=groups, bias=bias, padding_mode=padding_mode,
-                                         qconfig=qconfig)
+        super().__init__(in_channels, out_channels, kernel_size,
+                         stride=stride, padding=padding, dilation=dilation,
+                         groups=groups, bias=bias, padding_mode=padding_mode,
+                         qconfig=qconfig)
         assert qconfig, 'qconfig must be provided for QAT module'
         self.qconfig = qconfig
         self.weight_fake_quant = self.qconfig.weight()
@@ -737,7 +737,7 @@ def __init__(
         freeze_bn=False,
         qconfig=None,
     ):
-        super(ConvBnReLU3d, self).__init__(
+        super().__init__(
             in_channels,
             out_channels,
             kernel_size,
@@ -790,7 +790,7 @@ def __init__(
         padding_mode="zeros",
         qconfig=None,
     ):
-        super(ConvReLU3d, self).__init__(
+        super().__init__(
             in_channels,
             out_channels,
             kernel_size,
@@ -816,13 +816,9 @@ def from_float(cls, mod):
         return super(ConvReLU3d, cls).from_float(mod)
 
 def update_bn_stats(mod):
-    if type(mod) in set(
-        [ConvBnReLU1d, ConvBnReLU2d, ConvBnReLU3d, ConvBn1d, ConvBn2d, ConvBn3d]
-    ):
+    if type(mod) in {ConvBnReLU1d, ConvBnReLU2d, ConvBnReLU3d, ConvBn1d, ConvBn2d, ConvBn3d}:
         mod.update_bn_stats()
 
 def freeze_bn_stats(mod):
-    if type(mod) in set(
-        [ConvBnReLU1d, ConvBnReLU2d, ConvBnReLU3d, ConvBn1d, ConvBn2d, ConvBn3d]
-    ):
+    if type(mod) in {ConvBnReLU1d, ConvBnReLU2d, ConvBnReLU3d, ConvBn1d, ConvBn2d, ConvBn3d}:
         mod.freeze_bn_stats()
diff --git a/torch/ao/nn/intrinsic/qat/modules/linear_fused.py b/torch/ao/nn/intrinsic/qat/modules/linear_fused.py
index 604350287242..3bff8e5f9f80 100644
--- a/torch/ao/nn/intrinsic/qat/modules/linear_fused.py
+++ b/torch/ao/nn/intrinsic/qat/modules/linear_fused.py
@@ -68,7 +68,7 @@ def reset_bn_parameters(self):
         init.zeros_(self.bn.bias)
 
     def reset_parameters(self):
-        super(LinearBn1d, self).reset_parameters()
+        super().reset_parameters()
 
     def update_bn_stats(self):
         self.freeze_bn = False
diff --git a/torch/ao/nn/intrinsic/qat/modules/linear_relu.py b/torch/ao/nn/intrinsic/qat/modules/linear_relu.py
index 1c779658e38e..93b195370834 100644
--- a/torch/ao/nn/intrinsic/qat/modules/linear_relu.py
+++ b/torch/ao/nn/intrinsic/qat/modules/linear_relu.py
@@ -11,7 +11,7 @@ class LinearReLU(nnqat.Linear, nni._FusedModule):
 
     We adopt the same interface as :class:`torch.nn.Linear`.
 
-    Similar to `torch.nn.intrinsic.LinearReLU`, with FakeQuantize modules initialized to
+    Similar to `torch.ao.nn.intrinsic.LinearReLU`, with FakeQuantize modules initialized to
     default.
 
     Attributes:
@@ -30,7 +30,7 @@ class LinearReLU(nnqat.Linear, nni._FusedModule):
 
     def __init__(self, in_features, out_features, bias=True,
                  qconfig=None):
-        super(LinearReLU, self).__init__(in_features, out_features, bias, qconfig)
+        super().__init__(in_features, out_features, bias, qconfig)
 
     def forward(self, input):
         return F.relu(F.linear(input, self.weight_fake_quant(self.weight), self.bias))
@@ -45,4 +45,4 @@ def to_float(self):
         if self.bias is not None:
             linear.bias = torch.nn.Parameter(self.bias.detach())
         relu = torch.nn.ReLU()
-        return torch.nn.intrinsic.LinearReLU(linear, relu)
+        return torch.ao.nn.intrinsic.LinearReLU(linear, relu)
diff --git a/torch/ao/nn/intrinsic/quantized/__init__.py b/torch/ao/nn/intrinsic/quantized/__init__.py
index 0a5c21ddd1de..78c75f0c82b5 100644
--- a/torch/ao/nn/intrinsic/quantized/__init__.py
+++ b/torch/ao/nn/intrinsic/quantized/__init__.py
@@ -9,4 +9,6 @@
     'LinearReLU',
     'LinearLeakyReLU',
     'LinearTanh',
+    'ConvAdd2d',
+    'ConvAddReLU2d',
 ]
diff --git a/torch/ao/nn/intrinsic/quantized/modules/__init__.py b/torch/ao/nn/intrinsic/quantized/modules/__init__.py
index 1d21f58acf3d..51149bff646c 100644
--- a/torch/ao/nn/intrinsic/quantized/modules/__init__.py
+++ b/torch/ao/nn/intrinsic/quantized/modules/__init__.py
@@ -1,6 +1,7 @@
 from .linear_relu import LinearReLU, LinearLeakyReLU, LinearTanh
 from .conv_relu import ConvReLU1d, ConvReLU2d, ConvReLU3d
 from .bn_relu import BNReLU2d, BNReLU3d
+from .conv_add import ConvAdd2d, ConvAddReLU2d
 
 __all__ = [
     'LinearReLU',
@@ -11,4 +12,6 @@
     'BNReLU3d',
     'LinearLeakyReLU',
     'LinearTanh',
+    'ConvAdd2d',
+    'ConvAddReLU2d',
 ]
diff --git a/torch/ao/nn/intrinsic/quantized/modules/bn_relu.py b/torch/ao/nn/intrinsic/quantized/modules/bn_relu.py
index 1927564aa6e4..5cd2ed8a757c 100644
--- a/torch/ao/nn/intrinsic/quantized/modules/bn_relu.py
+++ b/torch/ao/nn/intrinsic/quantized/modules/bn_relu.py
@@ -22,7 +22,7 @@ class BNReLU2d(nnq.BatchNorm2d):
     _FLOAT_MODULE = torch.ao.nn.intrinsic.BNReLU2d
 
     def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None):
-        super(BNReLU2d, self).__init__(num_features, eps=eps, momentum=momentum, device=device, dtype=dtype)
+        super().__init__(num_features, eps=eps, momentum=momentum, device=device, dtype=dtype)
 
     def forward(self, input):
         # Temporarily using len(shape) instead of ndim due to JIT issue
@@ -58,7 +58,7 @@ class BNReLU3d(nnq.BatchNorm3d):
     _FLOAT_MODULE = torch.ao.nn.intrinsic.BNReLU3d
 
     def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None):
-        super(BNReLU3d, self).__init__(num_features, eps=eps, momentum=momentum, device=device, dtype=dtype)
+        super().__init__(num_features, eps=eps, momentum=momentum, device=device, dtype=dtype)
 
     def forward(self, input):
         # Temporarily using len(shape) instead of ndim due to JIT issue
diff --git a/torch/ao/nn/intrinsic/quantized/modules/conv_add.py b/torch/ao/nn/intrinsic/quantized/modules/conv_add.py
new file mode 100644
index 000000000000..6e46aa8915e4
--- /dev/null
+++ b/torch/ao/nn/intrinsic/quantized/modules/conv_add.py
@@ -0,0 +1,93 @@
+import torch
+import torch.ao.nn.intrinsic
+import torch.ao.nn.intrinsic.qat
+import torch.nn.functional as F
+import torch.ao.nn.quantized as nnq
+
+_reverse_repeat_padding = nnq.modules.conv._reverse_repeat_padding
+
+class ConvAdd2d(nnq.Conv2d):
+    r"""
+    A ConvAdd2d module is a fused module of Conv2d and Add
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Conv2d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Conv2d
+
+    """
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvAdd2d  # type: ignore[assignment]
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode='zeros', device=None, dtype=None):
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride=stride,
+            padding=padding, dilation=dilation, groups=groups, bias=bias,
+            padding_mode=padding_mode, device=device, dtype=dtype)
+
+    def forward(self, input, extra_input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        if self.padding_mode != 'zeros':
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(input, _reversed_padding_repeated_twice,
+                          mode=self.padding_mode)
+        return torch.ops.quantized.conv2d_add(
+            input, extra_input, self._packed_params, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedConvAdd2d'
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
+
+class ConvAddReLU2d(nnq.Conv2d):
+    r"""
+    A ConvAddReLU2d module is a fused module of Conv2d, Add and Relu
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Conv2d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Conv2d
+
+    """
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvAddReLU2d  # type: ignore[assignment]
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode='zeros', device=None, dtype=None):
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride=stride,
+            padding=padding, dilation=dilation, groups=groups, bias=bias,
+            padding_mode=padding_mode, device=device, dtype=dtype)
+
+    def forward(self, input, extra_input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        if self.padding_mode != 'zeros':
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(input, _reversed_padding_repeated_twice,
+                          mode=self.padding_mode)
+        return torch.ops.quantized.conv2d_add_relu(
+            input, extra_input, self._packed_params, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedConvAddReLU2d'
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
diff --git a/torch/ao/nn/intrinsic/quantized/modules/conv_relu.py b/torch/ao/nn/intrinsic/quantized/modules/conv_relu.py
index bd5f8800be2c..7a88a7b8f92d 100644
--- a/torch/ao/nn/intrinsic/quantized/modules/conv_relu.py
+++ b/torch/ao/nn/intrinsic/quantized/modules/conv_relu.py
@@ -31,7 +31,7 @@ class ConvReLU1d(nnq.Conv1d):
     def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                  padding=0, dilation=1, groups=1, bias=True,
                  padding_mode='zeros', device=None, dtype=None):
-        super(ConvReLU1d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride=stride,
             padding=padding, dilation=dilation, groups=groups, bias=bias,
             padding_mode=padding_mode, device=device, dtype=dtype)
@@ -62,7 +62,7 @@ def from_float(cls, mod):
 
     @classmethod
     def from_reference(cls, ref_qconv, output_scale, output_zero_point):
-        assert type(ref_qconv) != torch.nn.intrinsic.ConvBnReLU1d, \
+        assert type(ref_qconv) != torch.ao.nn.intrinsic.ConvBnReLU1d, \
             "BatchNorm1d should be fused into Conv1d before converting to reference module"
         return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
 
@@ -81,7 +81,7 @@ class ConvReLU2d(nnq.Conv2d):
     def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                  padding=0, dilation=1, groups=1, bias=True,
                  padding_mode='zeros', device=None, dtype=None):
-        super(ConvReLU2d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride=stride,
             padding=padding, dilation=dilation, groups=groups, bias=bias,
             padding_mode=padding_mode, device=device, dtype=dtype)
@@ -111,7 +111,7 @@ def from_float(cls, mod):
 
     @classmethod
     def from_reference(cls, ref_qconv, output_scale, output_zero_point):
-        assert type(ref_qconv) != torch.nn.intrinsic.ConvBnReLU2d, \
+        assert type(ref_qconv) != torch.ao.nn.intrinsic.ConvBnReLU2d, \
             "BatchNorm2d should be fused into Conv2d before converting to reference module"
         return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
 
@@ -131,7 +131,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                  padding=0, dilation=1, groups=1, bias=True,
                  padding_mode='zeros', device=None, dtype=None):
         assert padding_mode != 'reflect', "Conv3d does not support reflection padding"
-        super(ConvReLU3d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride=stride,
             padding=padding, dilation=dilation, groups=groups, bias=bias,
             padding_mode=padding_mode, device=device, dtype=dtype)
@@ -167,6 +167,6 @@ def from_float(cls, mod):
 
     @classmethod
     def from_reference(cls, ref_qconv, output_scale, output_zero_point):
-        assert type(ref_qconv) != torch.nn.intrinsic.ConvBnReLU3d, \
+        assert type(ref_qconv) != torch.ao.nn.intrinsic.ConvBnReLU3d, \
             "BatchNorm3d should be fused into Conv3d before converting to reference module"
         return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
diff --git a/torch/ao/nn/intrinsic/quantized/modules/linear_relu.py b/torch/ao/nn/intrinsic/quantized/modules/linear_relu.py
index 1945a0447c15..9c3a7bcd3b4a 100644
--- a/torch/ao/nn/intrinsic/quantized/modules/linear_relu.py
+++ b/torch/ao/nn/intrinsic/quantized/modules/linear_relu.py
@@ -51,9 +51,9 @@ class LinearLeakyReLU(nnq.Linear):
     r"""
     For onednn backend only
     A LinearLeakyReLU module fused from Linear and LeakyReLU modules
-    We adopt the same interface as :class:`torch.nn.quantized.Linear`.
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Linear`.
     Attributes:
-        Same as torch.nn.quantized.Linear
+        Same as torch.ao.nn.quantized.Linear
         + negative_slope
     Examples::
         >>> # xdoctest: +SKIP
diff --git a/torch/ao/nn/quantizable/modules/activation.py b/torch/ao/nn/quantizable/modules/activation.py
index cf62d8882a3c..d94c18eda309 100644
--- a/torch/ao/nn/quantizable/modules/activation.py
+++ b/torch/ao/nn/quantizable/modules/activation.py
@@ -51,7 +51,7 @@ class MultiheadAttention(nn.MultiheadAttention):
 
     Examples::
 
-        >>> import torch.nn.quantizable as nnqa
+        >>> import torch.ao.nn.quantizable as nnqa
         >>> multihead_attn = nnqa.MultiheadAttention(embed_dim, num_heads)
         >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
 
@@ -66,10 +66,10 @@ def __init__(self, embed_dim: int, num_heads: int,
                  kdim: int = None, vdim: int = None, batch_first: bool = False,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(MultiheadAttention, self).__init__(embed_dim, num_heads, dropout,
-                                                 bias, add_bias_kv,
-                                                 add_zero_attn, kdim, vdim, batch_first,
-                                                 **factory_kwargs)
+        super().__init__(embed_dim, num_heads, dropout,
+                         bias, add_bias_kv,
+                         add_zero_attn, kdim, vdim, batch_first,
+                         **factory_kwargs)
         self.linear_Q = nn.Linear(self.embed_dim, self.embed_dim, bias=bias, **factory_kwargs)
         self.linear_K = nn.Linear(self.kdim, self.embed_dim, bias=bias, **factory_kwargs)
         self.linear_V = nn.Linear(self.vdim, self.embed_dim, bias=bias, **factory_kwargs)
@@ -77,8 +77,8 @@ def __init__(self, embed_dim: int, num_heads: int,
         self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=bias, **factory_kwargs)  # type: ignore[assignment]
 
         # Functionals
-        self.q_scaling_product = torch.nn.quantized.FloatFunctional()
-        # note: importing torch.nn.quantized at top creates a circular import
+        self.q_scaling_product = torch.ao.nn.quantized.FloatFunctional()
+        # note: importing torch.ao.nn.quantized at top creates a circular import
 
         # Quant/Dequant
         self.quant_attn_output = torch.ao.quantization.QuantStub()
@@ -253,9 +253,7 @@ def forward(self,
             See "Attention Is All You Need" for more details.
         key_padding_mask: if provided, specified padding elements in the key will
             be ignored by the attention. When given a binary mask and a value is True,
-            the corresponding value on the attention layer will be ignored. When given
-            a byte mask and a value is non-zero, the corresponding value on the attention
-            layer will be ignored
+            the corresponding value on the attention layer will be ignored.
         need_weights: output attn_output_weights.
         attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
             the batches while a 3D mask allows to specify a different mask for the entries of each batch.
@@ -269,14 +267,12 @@ def forward(self,
         - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
           the embedding dimension. :math:`(N, S, E)` if ``batch_first`` is ``True``.
         - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
-          If a ByteTensor is provided, the non-zero positions will be ignored while the position
-          with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the
+          If a BoolTensor is provided, the positions with the
           value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
         - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
           3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
           S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked
-          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
-          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
+          positions. If a BoolTensor is provided, positions with ``True``
           is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
           is provided, it will be added to the attention weight.
         - is_causal: If specified, applies a causal mask as attention mask. Mutually exclusive with providing attn_mask.
@@ -339,12 +335,11 @@ def _forward_impl(self,
         q = self.q_scaling_product.mul_scalar(q, scaling)
 
         if attn_mask is not None:
-            assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
-                attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
-                'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
             if attn_mask.dtype == torch.uint8:
                 warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
                 attn_mask = attn_mask.to(torch.bool)
+            assert attn_mask.is_floating_point() or attn_mask.dtype == torch.bool, \
+                'Only float and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
 
             if attn_mask.dim() == 2:
                 attn_mask = attn_mask.unsqueeze(0)
diff --git a/torch/ao/nn/quantizable/modules/rnn.py b/torch/ao/nn/quantizable/modules/rnn.py
index a262fe704f37..bb161fd80f38 100644
--- a/torch/ao/nn/quantizable/modules/rnn.py
+++ b/torch/ao/nn/quantizable/modules/rnn.py
@@ -22,7 +22,7 @@ class LSTMCell(torch.nn.Module):
 
     Examples::
 
-        >>> import torch.nn.quantizable as nnqa
+        >>> import torch.ao.nn.quantizable as nnqa
         >>> rnn = nnqa.LSTMCell(10, 20)
         >>> input = torch.randn(6, 10)
         >>> hx = torch.randn(3, 20)
@@ -272,7 +272,7 @@ class LSTM(torch.nn.Module):
 
     Examples::
 
-        >>> import torch.nn.quantizable as nnqa
+        >>> import torch.ao.nn.quantizable as nnqa
         >>> rnn = nnqa.LSTM(10, 20, 2)
         >>> input = torch.randn(5, 3, 10)
         >>> h0 = torch.randn(2, 3, 20)
diff --git a/torch/ao/nn/quantized/dynamic/modules/conv.py b/torch/ao/nn/quantized/dynamic/modules/conv.py
index c06a39cad7b8..ede4a4aa64aa 100644
--- a/torch/ao/nn/quantized/dynamic/modules/conv.py
+++ b/torch/ao/nn/quantized/dynamic/modules/conv.py
@@ -20,7 +20,7 @@ class Conv1d(nnq.Conv1d):
     r"""A dynamically quantized conv module with floating point tensors as inputs and outputs.
 
     For details on input arguments, parameters, and implementation see
-    :class:`~torch.nn.Conv1d` and :class:`~torch.nn.quantized.dynamic.Conv1d` and
+    :class:`~torch.nn.Conv1d` and :class:`~torch.ao.nn.quantized.dynamic.Conv1d` and
 
     Attributes:
         weight (Tensor):     packed tensor derived from the learnable weight
@@ -67,7 +67,7 @@ def __init__(self,
         padding = padding if isinstance(padding, str) else _single(padding)
         dilation = _single(dilation)
 
-        super(Conv1d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride, padding, dilation,
             groups, bias, padding_mode, **factory_kwargs)
 
@@ -91,7 +91,7 @@ class Conv2d(nnq.Conv2d):
     r"""A dynamically quantized conv module with floating point tensors as inputs and outputs.
 
     For details on input arguments, parameters, and implementation see
-    :class:`~torch.nn.Conv2d` and :class:`~torch.nn.quantized.dynamic.Conv2d` and
+    :class:`~torch.nn.Conv2d` and :class:`~torch.ao.nn.quantized.dynamic.Conv2d` and
 
     Attributes:
         weight (Tensor):     packed tensor derived from the learnable weight
@@ -132,7 +132,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
         padding = _pair(padding)
         dilation = _pair(dilation)
 
-        super(Conv2d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride, padding, dilation,
             groups, bias, padding_mode, **factory_kwargs)
 
@@ -156,7 +156,7 @@ class Conv3d(nnq.Conv3d):
     r"""A dynamically quantized conv module with floating point tensors as inputs and outputs.
 
     For details on input arguments, parameters, and implementation see
-    :class:`~torch.nn.Conv3d` and :class:`~torch.nn.quantized.dynamic.Conv3d` and
+    :class:`~torch.nn.Conv3d` and :class:`~torch.ao.nn.quantized.dynamic.Conv3d` and
 
     Attributes:
         weight (Tensor):     packed tensor derived from the learnable weight
@@ -197,7 +197,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
         stride = _triple(stride)
         padding = _triple(padding)
         dilation = _triple(dilation)
-        super(Conv3d, self)._init(
+        super()._init(
             in_channels, out_channels, kernel_size, stride, padding, dilation,
             False, _triple(0), groups, bias, padding_mode, **factory_kwargs)
 
@@ -223,7 +223,7 @@ class ConvTranspose1d(nnq.ConvTranspose1d):
     For details on input arguments, parameters, and implementation see
     :class:`~torch.nn.ConvTranspose1d`.
 
-    For special notes, please, see :class:`~torch.nn.quantized.dynamic.Conv1d`
+    For special notes, please, see :class:`~torch.ao.nn.quantized.dynamic.Conv1d`
 
     Attributes:
         weight (Tensor):     packed tensor derived from the learnable weight
@@ -262,7 +262,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
             )
         )
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(ConvTranspose1d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride, padding, output_padding,
             groups, bias, dilation, padding_mode, **factory_kwargs)
 
@@ -284,7 +284,7 @@ class ConvTranspose2d(nnq.ConvTranspose2d):
     For details on input arguments, parameters, and implementation see
     :class:`~torch.nn.ConvTranspose2d`.
 
-    For special notes, please, see :class:`~torch.nn.quantized.dynamic.Conv2d`
+    For special notes, please, see :class:`~torch.ao.nn.quantized.dynamic.Conv2d`
 
     Attributes:
         weight (Tensor):     packed tensor derived from the learnable weight
@@ -323,7 +323,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
             )
         )
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(ConvTranspose2d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride, padding, output_padding,
             groups, bias, dilation, padding_mode, **factory_kwargs)
 
@@ -345,7 +345,7 @@ class ConvTranspose3d(nnq.ConvTranspose3d):
     For details on input arguments, parameters, and implementation see
     :class:`~torch.nn.ConvTranspose3d`.
 
-    For special notes, please, see :class:`~torch.nn.quantized.dynamic.Conv3d`
+    For special notes, please, see :class:`~torch.ao.nn.quantized.dynamic.Conv3d`
 
     Attributes:
         weight (Tensor):     packed tensor derived from the learnable weight
@@ -384,7 +384,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
             )
         )
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(ConvTranspose3d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride, padding, output_padding,
             groups, bias, dilation, padding_mode, **factory_kwargs)
 
diff --git a/torch/ao/nn/quantized/dynamic/modules/linear.py b/torch/ao/nn/quantized/dynamic/modules/linear.py
index c82f888aee33..78e459f9bc63 100644
--- a/torch/ao/nn/quantized/dynamic/modules/linear.py
+++ b/torch/ao/nn/quantized/dynamic/modules/linear.py
@@ -37,7 +37,7 @@ class Linear(nnq.Linear):
     _version = 4
 
     def __init__(self, in_features, out_features, bias_=True, dtype=torch.qint8):
-        super(Linear, self).__init__(in_features, out_features, bias_, dtype=dtype)
+        super().__init__(in_features, out_features, bias_, dtype=dtype)
         # We don't muck around with buffers or attributes or anything here
         # to keep the module simple. *everything* is simply a Python attribute.
         # Serialization logic is explicitly handled in the below serialization and
@@ -75,8 +75,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                               missing_keys, unexpected_keys, error_msgs):
         version = local_metadata.get('version', None)
         self.version = version
-        super(Linear, self)._load_from_state_dict(state_dict, prefix, local_metadata, False,
-                                                  missing_keys, unexpected_keys, error_msgs)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
 
     @classmethod
     def from_float(cls, mod):
@@ -87,7 +87,7 @@ def from_float(cls, mod):
                           utilities or provided by the user
         """
         float_modules = [torch.nn.Linear, torch.nn.modules.linear.NonDynamicallyQuantizableLinear,
-                         torch.nn.intrinsic.modules.fused.LinearReLU, torch.ao.nn.qat.dynamic.Linear]
+                         torch.ao.nn.intrinsic.modules.fused.LinearReLU, torch.ao.nn.qat.dynamic.Linear]
 
         assert type(mod) in float_modules, \
             'nn.quantized.dynamic.Linear.from_float only works for one of' + \
diff --git a/torch/ao/nn/quantized/dynamic/modules/rnn.py b/torch/ao/nn/quantized/dynamic/modules/rnn.py
index 514cc72bafe5..d5056a6360a8 100644
--- a/torch/ao/nn/quantized/dynamic/modules/rnn.py
+++ b/torch/ao/nn/quantized/dynamic/modules/rnn.py
@@ -45,18 +45,18 @@ def pack_weight_bias(qweight, bias, dtype):
 
 class PackedParameter(torch.nn.Module):
     def __init__(self, param):
-        super(PackedParameter, self).__init__()
+        super().__init__()
         self.param = param
 
     def _save_to_state_dict(self, destination, prefix, keep_vars):
-        super(PackedParameter, self)._save_to_state_dict(destination, prefix, keep_vars)
+        super()._save_to_state_dict(destination, prefix, keep_vars)
         destination[prefix + 'param'] = self.param
 
     def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                               missing_keys, unexpected_keys, error_msgs):
         self.param = state_dict[prefix + 'param']
-        super(PackedParameter, self)._load_from_state_dict(state_dict, prefix, local_metadata, False,
-                                                           missing_keys, unexpected_keys, error_msgs)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
 
 
 class RNNBase(torch.nn.Module):
@@ -68,7 +68,7 @@ class RNNBase(torch.nn.Module):
     def __init__(self, mode, input_size, hidden_size,
                  num_layers=1, bias=True, batch_first=False,
                  dropout=0., bidirectional=False, dtype=torch.qint8):
-        super(RNNBase, self).__init__()
+        super().__init__()
 
         self.mode = mode
         self.input_size = input_size
@@ -225,8 +225,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                               missing_keys, unexpected_keys, error_msgs):
         version = local_metadata.get('version', None)
         self.version = version
-        super(RNNBase, self)._load_from_state_dict(state_dict, prefix, local_metadata, False,
-                                                   missing_keys, unexpected_keys, error_msgs)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
 
     def set_weight_bias(self, weight_bias_dict):
 
@@ -267,10 +267,8 @@ def weight_bias_name(ihhh, layer, suffix):
 
     @classmethod
     def from_float(cls, mod):
-        assert type(mod) in set(
-            [torch.nn.LSTM,
-             torch.nn.GRU]
-        ), 'nn.quantized.dynamic.RNNBase.from_float only works for nn.LSTM and nn.GRU'
+        assert type(mod) in {torch.nn.LSTM,
+                             torch.nn.GRU}, 'nn.quantized.dynamic.RNNBase.from_float only works for nn.LSTM and nn.GRU'
         assert hasattr(
             mod,
             'qconfig'
@@ -401,7 +399,7 @@ class LSTM(RNNBase):
     __overloads__ = {'forward': ['forward_packed', 'forward_tensor']}
 
     def __init__(self, *args, **kwargs):
-        super(LSTM, self).__init__('LSTM', *args, **kwargs)
+        super().__init__('LSTM', *args, **kwargs)
 
     def _get_name(self):
         return 'DynamicQuantizedLSTM'
@@ -457,11 +455,11 @@ def forward_packed(
         self, input: PackedSequence, hx: Optional[Tuple[Tensor, Tensor]] = None
     ) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]]:
         input_, batch_sizes, sorted_indices, unsorted_indices = input
-        max_batch_size = batch_sizes[0]
-        max_batch_size = int(max_batch_size)
+        max_batch_size = int(batch_sizes[0])
 
         output_, hidden = self.forward_impl(
-            input_, hx, batch_sizes, max_batch_size, sorted_indices)
+            input_, hx, batch_sizes, max_batch_size, sorted_indices
+        )
 
         output = PackedSequence(output_, batch_sizes,
                                 sorted_indices, unsorted_indices)
@@ -627,7 +625,7 @@ class GRU(RNNBase):
     __overloads__ = {'forward': ['forward_packed', 'forward_tensor']}
 
     def __init__(self, *args, **kwargs):
-        super(GRU, self).__init__('GRU', *args, **kwargs)
+        super().__init__('GRU', *args, **kwargs)
 
     def _get_name(self):
         return 'DynamicQuantizedGRU'
@@ -703,10 +701,10 @@ def forward_packed(
         self, input: PackedSequence, hx: Optional[Tensor] = None
     ) -> Tuple[PackedSequence, Tensor]:
         input_, batch_sizes, sorted_indices, unsorted_indices = input
-        max_batch_size = batch_sizes[0]
-        max_batch_size = int(max_batch_size)
+        max_batch_size = int(batch_sizes[0])
         output_, hidden = self.forward_impl(
-            input_, hx, batch_sizes, max_batch_size, sorted_indices)
+            input_, hx, batch_sizes, max_batch_size, sorted_indices
+        )
 
         output = PackedSequence(output_, batch_sizes,
                                 sorted_indices, unsorted_indices)
@@ -753,7 +751,7 @@ class RNNCellBase(torch.nn.Module):
     __constants__ = ['input_size', 'hidden_size', 'bias']
 
     def __init__(self, input_size, hidden_size, bias=True, num_chunks=4, dtype=torch.qint8):
-        super(RNNCellBase, self).__init__()
+        super().__init__()
         self.input_size = input_size
         self.hidden_size = hidden_size
         self.bias = bias
@@ -823,9 +821,9 @@ def check_forward_hidden(self, input: Tensor, hx: Tensor, hidden_label: str = ''
 
     @classmethod
     def from_float(cls, mod):
-        assert type(mod) in set([torch.nn.LSTMCell,
-                                 torch.nn.GRUCell,
-                                 torch.nn.RNNCell]), 'nn.quantized.dynamic.RNNCellBase.from_float \
+        assert type(mod) in {torch.nn.LSTMCell,
+                             torch.nn.GRUCell,
+                             torch.nn.RNNCell}, 'nn.quantized.dynamic.RNNCellBase.from_float \
                                  only works for nn.LSTMCell, nn.GRUCell and nn.RNNCell'
         assert hasattr(
             mod, 'qconfig'), 'Input float module must have qconfig defined'
@@ -935,7 +933,7 @@ def set_weight_bias(self, weight_bias_dict):
             self.weight_dtype)
 
     def _save_to_state_dict(self, destination, prefix, keep_vars):
-        super(RNNCellBase, self)._save_to_state_dict(destination, prefix, keep_vars)
+        super()._save_to_state_dict(destination, prefix, keep_vars)
         destination[prefix + '_packed_weight_ih'] = self._packed_weight_ih
         destination[prefix + '_packed_weight_hh'] = self._packed_weight_hh
 
@@ -943,8 +941,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                               missing_keys, unexpected_keys, error_msgs):
         self._packed_weight_ih = state_dict.pop(prefix + '_packed_weight_ih')
         self._packed_weight_hh = state_dict.pop(prefix + '_packed_weight_hh')
-        super(RNNCellBase, self)._load_from_state_dict(state_dict, prefix, local_metadata, False,
-                                                       missing_keys, unexpected_keys, error_msgs)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
 
 
 class RNNCell(RNNCellBase):
@@ -967,7 +965,7 @@ class RNNCell(RNNCellBase):
     __constants__ = ['input_size', 'hidden_size', 'bias', 'nonlinearity']
 
     def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh", dtype=torch.qint8):
-        super(RNNCell, self).__init__(input_size, hidden_size, bias, num_chunks=1, dtype=dtype)
+        super().__init__(input_size, hidden_size, bias, num_chunks=1, dtype=dtype)
         self.nonlinearity = nonlinearity
 
     def _get_name(self):
@@ -1020,7 +1018,7 @@ class LSTMCell(RNNCellBase):
     """
 
     def __init__(self, *args, **kwargs):
-        super(LSTMCell, self).__init__(*args, num_chunks=4, **kwargs)  # type: ignore[misc]
+        super().__init__(*args, num_chunks=4, **kwargs)  # type: ignore[misc]
 
     def _get_name(self):
         return 'DynamicQuantizedLSTMCell'
@@ -1062,7 +1060,7 @@ class GRUCell(RNNCellBase):
     """
 
     def __init__(self, input_size, hidden_size, bias=True, dtype=torch.qint8):
-        super(GRUCell, self).__init__(input_size, hidden_size, bias, num_chunks=3, dtype=dtype)
+        super().__init__(input_size, hidden_size, bias, num_chunks=3, dtype=dtype)
 
     def _get_name(self):
         return 'DynamicQuantizedGRUCell'
diff --git a/torch/ao/nn/quantized/functional.py b/torch/ao/nn/quantized/functional.py
index fac6326d2345..72218184fcfa 100644
--- a/torch/ao/nn/quantized/functional.py
+++ b/torch/ao/nn/quantized/functional.py
@@ -552,7 +552,7 @@ def upsample(input, size=None, scale_factor=None, mode='nearest', align_corners=
 
     .. warning::
         This function is deprecated in favor of
-        :func:`torch.nn.quantized.functional.interpolate`.
+        :func:`torch.ao.nn.quantized.functional.interpolate`.
         This is equivalent with ``nn.quantized.functional.interpolate(...)``.
 
     See :func:`torch.nn.functional.interpolate` for implementation details.
@@ -604,7 +604,7 @@ def upsample_bilinear(input, size=None, scale_factor=None):
 
     .. warning::
         This function is deprecated in favor of
-        :func:`torch.nn.quantized.functional.interpolate`.
+        :func:`torch.ao.nn.quantized.functional.interpolate`.
         This is equivalent with
         ``nn.quantized.functional.interpolate(..., mode='bilinear', align_corners=True)``.
 
@@ -626,7 +626,7 @@ def upsample_nearest(input, size=None, scale_factor=None):
 
     .. warning::
         This function is deprecated in favor of
-        :func:`torch.nn.quantized.functional.interpolate`.
+        :func:`torch.ao.nn.quantized.functional.interpolate`.
         This is equivalent with ``nn.quantized.functional.interpolate(..., mode='nearest')``.
 
     .. note:: The input quantization parameters propagate to the output.
diff --git a/torch/ao/nn/quantized/modules/__init__.py b/torch/ao/nn/quantized/modules/__init__.py
index 90c69ad50915..05866f6da406 100644
--- a/torch/ao/nn/quantized/modules/__init__.py
+++ b/torch/ao/nn/quantized/modules/__init__.py
@@ -86,7 +86,7 @@ class Quantize(torch.nn.Module):
 
     def __init__(self, scale, zero_point, dtype, factory_kwargs=None):
         factory_kwargs = torch.nn.factory_kwargs(factory_kwargs)
-        super(Quantize, self).__init__()
+        super().__init__()
         self.register_buffer('scale', torch.tensor([scale], **factory_kwargs))
         self.register_buffer('zero_point',
                              torch.tensor([zero_point], dtype=torch.long,
@@ -123,9 +123,6 @@ class DeQuantize(torch.nn.Module):
                 [ 1., -1.]], dtype=torch.float32)
     """
 
-    def __init__(self):
-        super(DeQuantize, self).__init__()
-
     def forward(self, Xq):
         return Xq.dequantize()
 
diff --git a/torch/ao/nn/quantized/modules/activation.py b/torch/ao/nn/quantized/modules/activation.py
index c28aa7850d00..da91af991033 100644
--- a/torch/ao/nn/quantized/modules/activation.py
+++ b/torch/ao/nn/quantized/modules/activation.py
@@ -36,7 +36,7 @@ class ReLU6(torch.nn.ReLU):
         >>> output = m(input)
     """
     def __init__(self, inplace=False):
-        super(ReLU6, self).__init__(inplace)
+        super().__init__(inplace)
         self.inplace = inplace
 
     def forward(self, input):
@@ -56,14 +56,14 @@ class Hardswish(torch.nn.Hardswish):
         scale: quantization scale of the output tensor
         zero_point: quantization zero point of the output tensor
     """
-    def __init__(self, scale, zero_point):
-        super(Hardswish, self).__init__()
-        self.scale = scale
-        self.zero_point = zero_point
+    def __init__(self, scale, zero_point, device=None, dtype=None):
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__()
+        self.register_buffer('scale', torch.tensor(scale, **factory_kwargs))
+        self.register_buffer('zero_point', torch.tensor(zero_point, **factory_kwargs))
 
     def forward(self, input):
-        return torch.ao.nn.quantized.functional.hardswish(
-            input, scale=self.scale, zero_point=self.zero_point)
+        return torch.ops.quantized.hardswish(input, self.scale, self.zero_point)
 
     def _get_name(self):
         return 'QuantizedHardswish'
@@ -86,7 +86,7 @@ class ELU(torch.nn.ELU):
         alpha: the alpha constant
     """
     def __init__(self, scale, zero_point, alpha=1.):
-        super(ELU, self).__init__(alpha)
+        super().__init__(alpha)
         self.scale = scale
         self.zero_point = zero_point
 
diff --git a/torch/ao/nn/quantized/modules/conv.py b/torch/ao/nn/quantized/modules/conv.py
index abd0e7ff96f5..cd2605875e2e 100644
--- a/torch/ao/nn/quantized/modules/conv.py
+++ b/torch/ao/nn/quantized/modules/conv.py
@@ -48,7 +48,7 @@ def _init(self, in_channels, out_channels, kernel_size, stride,
               device=None,
               dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(_ConvNd, self).__init__()
+        super().__init__()
 
         if in_channels % groups != 0:
             raise ValueError('in_channels must be divisible by groups')
@@ -120,7 +120,7 @@ def extra_repr(self):
     #   self
     #   |--- _packed_params : Conv2dPackedParamsBase or Conv3dPackedParamsBase
     def _save_to_state_dict(self, destination, prefix, keep_vars):
-        super(_ConvNd, self)._save_to_state_dict(destination, prefix, keep_vars)
+        super()._save_to_state_dict(destination, prefix, keep_vars)
         (w, b) = self._weight_bias()
         destination[prefix + 'weight'] = w
         destination[prefix + 'bias'] = b
@@ -161,7 +161,7 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
         state_dict.pop(prefix + 'scale')
         self.zero_point = int(state_dict[prefix + 'zero_point'])
         state_dict.pop(prefix + 'zero_point')
-        super(_ConvNd, self)._load_from_state_dict(
+        super()._load_from_state_dict(
             state_dict, prefix, local_metadata, False, missing_keys,
             unexpected_keys, error_msgs)
 
@@ -236,7 +236,7 @@ def from_float(cls, mod):
                 "Input float module must have qconfig defined."
             activation_post_process = None if not hasattr(
                 mod, "activation_post_process") else mod.activation_post_process
-            if type(mod) == cls._NNI_CONV_RELU_MODULE:
+            if type(mod) in [cls._NNI_CONV_RELU_MODULE, cls._NNI_CONV_ADD_MODULE, cls._NNI_CONV_ADD_RELU_MODULE]:
                 mod = mod[0]
             weight_post_process = mod.qconfig.weight()
         return cls.get_qconv(mod, activation_post_process, weight_post_process)
@@ -307,6 +307,8 @@ class Conv1d(_ConvNd):
     _FLOAT_MODULE = nn.Conv1d
     _NNIQAT_CONV_BN_MODULE = nniqat.ConvBn1d
     _NNI_CONV_RELU_MODULE = nni.ConvReLU1d
+    _NNI_CONV_ADD_MODULE = None
+    _NNI_CONV_ADD_RELU_MODULE = None
 
     def __init__(self,
                  in_channels: int,
@@ -328,7 +330,7 @@ def __init__(self,
 
         # Subclasses of _ConvNd needs to call _init rather than __init__. See
         # discussion on PR #49702
-        super(Conv1d, self)._init(
+        super()._init(
             in_channels, out_channels, kernel_size, stride, padding, dilation,
             False, _single(0), groups, bias, padding_mode, **factory_kwargs)
 
@@ -418,6 +420,8 @@ class Conv2d(_ConvNd):
     _FLOAT_MODULE = nn.Conv2d
     _NNIQAT_CONV_BN_MODULE = nniqat.ConvBn2d
     _NNI_CONV_RELU_MODULE = nni.ConvReLU2d
+    _NNI_CONV_ADD_MODULE = nni.ConvAdd2d
+    _NNI_CONV_ADD_RELU_MODULE = nni.ConvAddReLU2d
 
     def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                  padding=0, dilation=1, groups=1, bias=True,
@@ -429,7 +433,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
         dilation = _pair(dilation)
         # Subclasses of _ConvNd need to call _init rather than __init__. See
         # discussion on PR #49702
-        super(Conv2d, self)._init(
+        super()._init(
             in_channels, out_channels, kernel_size, stride, padding, dilation,
             False, _pair(0), groups, bias, padding_mode, **factory_kwargs)
 
@@ -517,6 +521,8 @@ class Conv3d(_ConvNd):
     _FLOAT_MODULE = nn.Conv3d
     _NNIQAT_CONV_BN_MODULE = nniqat.ConvBn3d
     _NNI_CONV_RELU_MODULE = nni.ConvReLU3d
+    _NNI_CONV_ADD_MODULE = None
+    _NNI_CONV_ADD_RELU_MODULE = None
 
     def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                  padding=0, dilation=1, groups=1, bias=True,
@@ -529,7 +535,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
         dilation = _triple(dilation)
         # Subclasses of _ConvNd need to call _init rather than __init__. See
         # discussion on PR #49702
-        super(Conv3d, self)._init(
+        super()._init(
             in_channels, out_channels, kernel_size, stride, padding, dilation,
             False, _triple(0), groups, bias, padding_mode, **factory_kwargs)
 
@@ -591,7 +597,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride,
         factory_kwargs = {'device': device, 'dtype': dtype}
         # Subclasses of _ConvNd need to call _init rather than __init__. See
         # discussion on PR #49702
-        super(_ConvTransposeNd, self)._init(
+        super()._init(
             in_channels, out_channels, kernel_size, stride,
             padding, dilation, transposed, output_padding,
             groups, bias, padding_mode, **factory_kwargs)
@@ -672,7 +678,7 @@ class ConvTranspose1d(_ConvTransposeNd):
     .. note:: Currently only the QNNPACK engine is implemented.
         Please, set the `torch.backends.quantized.engine = 'qnnpack'`
 
-    For special notes, please, see :class:`~torch.nn.quantized.Conv1d`
+    For special notes, please, see :class:`~torch.ao.nn.quantized.Conv1d`
 
     Attributes:
         weight (Tensor):     packed tensor derived from the learnable weight
@@ -685,7 +691,7 @@ class ConvTranspose1d(_ConvTransposeNd):
 
         >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
         >>> torch.backends.quantized.engine = 'qnnpack'
-        >>> from torch.nn import quantized as nnq
+        >>> from torch.ao.nn import quantized as nnq
         >>> # With square kernels and equal stride
         >>> m = nnq.ConvTranspose1d(16, 33, 3, stride=2)
         >>> # non-square kernels and unequal stride and with padding
@@ -719,7 +725,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
         dilation = _single(dilation)
         output_padding = _single(output_padding)
 
-        super(ConvTranspose1d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride, padding, dilation,
             True, output_padding, groups, bias, padding_mode, **factory_kwargs)
 
@@ -762,7 +768,7 @@ class ConvTranspose2d(_ConvTransposeNd):
     For details on input arguments, parameters, and implementation see
     :class:`~torch.nn.ConvTranspose2d`.
 
-    For special notes, please, see :class:`~torch.nn.quantized.Conv2d`
+    For special notes, please, see :class:`~torch.ao.nn.quantized.Conv2d`
 
     Attributes:
         weight (Tensor):     packed tensor derived from the learnable weight
@@ -777,7 +783,7 @@ class ConvTranspose2d(_ConvTransposeNd):
         >>> # QNNPACK or FBGEMM as backend
         >>> torch.backends.quantized.engine = 'qnnpack'
         >>> # With square kernels and equal stride
-        >>> import torch.nn.quantized as nnq
+        >>> import torch.ao.nn.quantized as nnq
         >>> m = nnq.ConvTranspose2d(16, 33, 3, stride=2)
         >>> # non-square kernels and unequal stride and with padding
         >>> m = nnq.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
@@ -810,7 +816,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
         dilation = _pair(dilation)
         output_padding = _pair(output_padding)
 
-        super(ConvTranspose2d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride, padding, dilation,
             True, output_padding, groups, bias, padding_mode, **factory_kwargs)
 
@@ -856,7 +862,7 @@ class ConvTranspose3d(_ConvTransposeNd):
     .. note:: Currently only the FBGEMM engine is implemented.
         Please, set the `torch.backends.quantized.engine = 'fbgemm'`
 
-    For special notes, please, see :class:`~torch.nn.quantized.Conv3d`
+    For special notes, please, see :class:`~torch.ao.nn.quantized.Conv3d`
 
     Attributes:
         weight (Tensor):     packed tensor derived from the learnable weight
@@ -869,7 +875,7 @@ class ConvTranspose3d(_ConvTransposeNd):
 
         >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
         >>> torch.backends.quantized.engine = 'fbgemm'
-        >>> from torch.nn import quantized as nnq
+        >>> from torch.ao.nn import quantized as nnq
         >>> # With cubic kernels and equal stride
         >>> m = nnq.ConvTranspose3d(16, 33, 3, stride=2)
         >>> # non-cubic kernels and unequal stride and with padding
@@ -903,7 +909,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
         dilation = _triple(dilation)
         output_padding = _triple(output_padding)
 
-        super(ConvTranspose3d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride, padding, dilation,
             True, output_padding, groups, bias, padding_mode, **factory_kwargs)
 
diff --git a/torch/ao/nn/quantized/modules/embedding_ops.py b/torch/ao/nn/quantized/modules/embedding_ops.py
index c8b90eb8afbc..c4389a60d9b0 100644
--- a/torch/ao/nn/quantized/modules/embedding_ops.py
+++ b/torch/ao/nn/quantized/modules/embedding_ops.py
@@ -12,7 +12,7 @@ class EmbeddingPackedParams(torch.nn.Module):
     _version = 1
 
     def __init__(self, num_embeddings, embedding_dim, dtype=torch.quint8):
-        super(EmbeddingPackedParams, self).__init__()
+        super().__init__()
         self.dtype = dtype
         if self.dtype in [torch.quint8, torch.quint4x2]:
             scales = torch.ones(num_embeddings, dtype=torch.float)
@@ -48,7 +48,7 @@ def forward(self, x):
     #   |--- dtype : torch.dtype
 
     def _save_to_state_dict(self, destination, prefix, keep_vars):
-        super(EmbeddingPackedParams, self)._save_to_state_dict(destination, prefix, keep_vars)
+        super()._save_to_state_dict(destination, prefix, keep_vars)
         destination[prefix + 'dtype'] = self.dtype
         destination[prefix + '_packed_weight'] = self._weight()
 
@@ -61,8 +61,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
         state_dict.pop(prefix + '_packed_weight')
         self.set_weight(weight)
 
-        super(EmbeddingPackedParams, self)._load_from_state_dict(state_dict, prefix, local_metadata, False,
-                                                                 missing_keys, unexpected_keys, error_msgs)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
 
     def __repr__(self):
         return self._weight().__repr__()
@@ -93,7 +93,7 @@ class Embedding(torch.nn.Module):
     def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None,
                  max_norm: Optional[float] = None, norm_type: float = 2., scale_grad_by_freq: bool = False,
                  sparse: bool = False, _weight: Optional[Tensor] = None, dtype=torch.quint8) -> None:
-        super(Embedding, self).__init__()
+        super().__init__()
         self.num_embeddings = num_embeddings
         self.embedding_dim = embedding_dim
         self.dtype = dtype
@@ -220,7 +220,7 @@ def __init__(self, num_embeddings: int, embedding_dim: int,
                  max_norm: Optional[float] = None, norm_type: float = 2., scale_grad_by_freq: bool = False,
                  mode: str = 'sum', sparse: bool = False, _weight: Optional[Tensor] = None,
                  include_last_offset: bool = False, dtype=torch.quint8) -> None:
-        super(EmbeddingBag, self).__init__(num_embeddings, embedding_dim, _weight=_weight, dtype=dtype)
+        super().__init__(num_embeddings, embedding_dim, _weight=_weight, dtype=dtype)
 
         self.mode = mode
         self.pruned_weights = False
diff --git a/torch/ao/nn/quantized/modules/functional_modules.py b/torch/ao/nn/quantized/modules/functional_modules.py
index 5bf7a7322652..1fb27da5ee2a 100644
--- a/torch/ao/nn/quantized/modules/functional_modules.py
+++ b/torch/ao/nn/quantized/modules/functional_modules.py
@@ -33,7 +33,7 @@ class FloatFunctional(torch.nn.Module):
         - mul_scalar
     """
     def __init__(self):
-        super(FloatFunctional, self).__init__()
+        super().__init__()
         self.activation_post_process = torch.nn.Identity()
 
     def forward(self, x):
@@ -154,13 +154,13 @@ class QFunctional(torch.nn.Module):
         - mul_scalar
     """
     def __init__(self):
-        super(QFunctional, self).__init__()
+        super().__init__()
         self.scale = 1.0
         self.zero_point = 0
         self.activation_post_process = torch.nn.Identity()
 
     def _save_to_state_dict(self, destination, prefix, keep_vars):
-        super(QFunctional, self)._save_to_state_dict(destination, prefix, keep_vars)
+        super()._save_to_state_dict(destination, prefix, keep_vars)
         destination[prefix + 'scale'] = torch.tensor(self.scale)
         destination[prefix + 'zero_point'] = torch.tensor(self.zero_point)
 
@@ -169,8 +169,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
 
         self.scale = float(state_dict.pop(prefix + 'scale'))
         self.zero_point = int(state_dict.pop(prefix + 'zero_point'))
-        super(QFunctional, self)._load_from_state_dict(state_dict, prefix, local_metadata, False,
-                                                       missing_keys, unexpected_keys, error_msgs)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
 
     def _get_name(self):
         return 'QFunctional'
diff --git a/torch/ao/nn/quantized/modules/linear.py b/torch/ao/nn/quantized/modules/linear.py
index 864012bf5f81..e592c5f9b4d0 100644
--- a/torch/ao/nn/quantized/modules/linear.py
+++ b/torch/ao/nn/quantized/modules/linear.py
@@ -65,7 +65,7 @@ def forward(self, x):
     #                         of LinearPackedParams
     #   |--- dtype : torch.dtype
     def _save_to_state_dict(self, destination, prefix, keep_vars):
-        super(LinearPackedParams, self)._save_to_state_dict(destination, prefix, keep_vars)
+        super()._save_to_state_dict(destination, prefix, keep_vars)
         destination[prefix + 'dtype'] = self.dtype
         destination[prefix + '_packed_params'] = self._weight_bias()
 
@@ -88,8 +88,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
             state_dict.pop(prefix + '_packed_params')
             self.set_weight_bias(weight, bias)
 
-        super(LinearPackedParams, self)._load_from_state_dict(state_dict, prefix, local_metadata, False,
-                                                              missing_keys, unexpected_keys, error_msgs)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
 
 
     def __repr__(self):
diff --git a/torch/ao/nn/quantized/modules/normalization.py b/torch/ao/nn/quantized/modules/normalization.py
index 3c77e1277598..f798a241e324 100644
--- a/torch/ao/nn/quantized/modules/normalization.py
+++ b/torch/ao/nn/quantized/modules/normalization.py
@@ -14,9 +14,8 @@ class LayerNorm(torch.nn.LayerNorm):
     def __init__(self, normalized_shape, weight, bias, scale, zero_point, eps=1e-5,
                  elementwise_affine=True, device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(LayerNorm, self).__init__(
-            normalized_shape, eps=eps, elementwise_affine=elementwise_affine,
-            **factory_kwargs)
+        super().__init__(normalized_shape, eps=eps, elementwise_affine=elementwise_affine,
+                         **factory_kwargs)
         self.weight = weight
         self.bias = bias
         self.register_buffer('scale', torch.tensor(scale, **factory_kwargs))
@@ -57,8 +56,7 @@ class GroupNorm(torch.nn.GroupNorm):
     def __init__(self, num_groups, num_channels, weight, bias, scale, zero_point, eps=1e-5,
                  affine=True, device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(GroupNorm, self).__init__(num_groups, num_channels, eps, affine,
-                                        **factory_kwargs)
+        super().__init__(num_groups, num_channels, eps, affine, **factory_kwargs)
         self.weight = weight
         self.bias = bias
         self.register_buffer('scale', torch.tensor(scale, **factory_kwargs))
@@ -92,8 +90,7 @@ def __init__(self, num_features, weight, bias, scale, zero_point,
                  eps=1e-5, momentum=0.1, affine=False,
                  track_running_stats=False, device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(InstanceNorm1d, self).__init__(
-            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs)
+        super().__init__(num_features, eps, momentum, affine, track_running_stats, **factory_kwargs)
         self.weight = weight
         self.bias = bias
         self.register_buffer('scale', torch.tensor(scale, **factory_kwargs))
@@ -133,8 +130,7 @@ def __init__(self, num_features, weight, bias, scale, zero_point,
                  eps=1e-5, momentum=0.1, affine=False,
                  track_running_stats=False, device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(InstanceNorm2d, self).__init__(
-            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs)
+        super().__init__(num_features, eps, momentum, affine, track_running_stats, **factory_kwargs)
         self.weight = weight
         self.bias = bias
         self.register_buffer('scale', torch.tensor(scale, **factory_kwargs))
@@ -174,8 +170,7 @@ def __init__(self, num_features, weight, bias, scale, zero_point,
                  eps=1e-5, momentum=0.1, affine=False,
                  track_running_stats=False, device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(InstanceNorm3d, self).__init__(
-            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs)
+        super().__init__(num_features, eps, momentum, affine, track_running_stats, **factory_kwargs)
         self.weight = weight
         self.bias = bias
         self.register_buffer('scale', torch.tensor(scale, **factory_kwargs))
diff --git a/torch/ao/nn/quantized/modules/rnn.py b/torch/ao/nn/quantized/modules/rnn.py
index 732b4a6a773f..25551c5b6d42 100644
--- a/torch/ao/nn/quantized/modules/rnn.py
+++ b/torch/ao/nn/quantized/modules/rnn.py
@@ -14,7 +14,7 @@ class LSTM(torch.ao.nn.quantizable.LSTM):
 
     .. note::
         To access the weights and biases, you need to access them per layer.
-        See examples in :class:`~torch.nn.quantizable.LSTM`
+        See examples in :class:`~torch.ao.nn.quantizable.LSTM`
 
     Examples::
         >>> # xdoctest: +SKIP
@@ -29,7 +29,7 @@ class LSTM(torch.ao.nn.quantizable.LSTM):
         >>> tq.prepare(model, prepare_custom_module_class=custom_module_config)
         >>> tq.convert(model, convert_custom_module_class=custom_module_config)
     """
-    _FLOAT_MODULE = torch.nn.quantizable.LSTM  # type: ignore[assignment]
+    _FLOAT_MODULE = torch.ao.nn.quantizable.LSTM  # type: ignore[assignment]
 
     def _get_name(self):
         return 'QuantizedLSTM'
diff --git a/torch/ao/nn/quantized/reference/modules/rnn.py b/torch/ao/nn/quantized/reference/modules/rnn.py
index 53b10c3cb7dc..566642832a54 100644
--- a/torch/ao/nn/quantized/reference/modules/rnn.py
+++ b/torch/ao/nn/quantized/reference/modules/rnn.py
@@ -412,8 +412,7 @@ def forward(self, input, hx=None):  # noqa: F811
         batch_sizes = None
         if isinstance(orig_input, PackedSequence):
             input, batch_sizes, sorted_indices, unsorted_indices = input
-            max_batch_size = batch_sizes[0]
-            max_batch_size = int(max_batch_size)
+            max_batch_size = int(batch_sizes[0])
         else:
             batch_sizes = None
             is_batched = input.dim() == 3
@@ -544,8 +543,7 @@ def forward(self, input, hx=None):  # noqa: F811
         # xxx: isinstance check needs to be in conditional for TorchScript to compile
         if isinstance(orig_input, PackedSequence):
             input, batch_sizes, sorted_indices, unsorted_indices = input
-            max_batch_size = batch_sizes[0]
-            max_batch_size = int(max_batch_size)
+            max_batch_size = int(batch_sizes[0])
         else:
             batch_sizes = None
             assert (input.dim() in (2, 3)), f"GRU: Expected input to be 2-D or 3-D but received {input.dim()}-D tensor"
diff --git a/torch/ao/nn/sparse/quantized/dynamic/linear.py b/torch/ao/nn/sparse/quantized/dynamic/linear.py
index 7eac81f1814d..87d174db8098 100644
--- a/torch/ao/nn/sparse/quantized/dynamic/linear.py
+++ b/torch/ao/nn/sparse/quantized/dynamic/linear.py
@@ -1,7 +1,7 @@
 from typing import Optional
 
 import torch
-import torch.nn.intrinsic as nni
+import torch.ao.nn.intrinsic as nni
 
 from torch.ao.nn.sparse.quantized import linear
 from torch.ao.nn.sparse.quantized.utils import LinearBlockSparsePattern
diff --git a/torch/ao/ns/_numeric_suite.py b/torch/ao/ns/_numeric_suite.py
index 3ddca96b1de5..3f0df31dfd2a 100644
--- a/torch/ao/ns/_numeric_suite.py
+++ b/torch/ao/ns/_numeric_suite.py
@@ -171,7 +171,7 @@ class Logger(nn.Module):
     """
 
     def __init__(self):
-        super(Logger, self).__init__()
+        super().__init__()
         self.stats = {}
         # We only insert observer if the op is quantized with static quantization,
         # which is identified by activation_observer.dtype == quint8.  This is needed
@@ -190,7 +190,7 @@ class ShadowLogger(Logger):
     """
 
     def __init__(self):
-        super(ShadowLogger, self).__init__()
+        super().__init__()
         self.stats["float"] = []
         self.stats["quantized"] = []
 
@@ -210,7 +210,7 @@ class OutputLogger(Logger):
     """
 
     def __init__(self):
-        super(OutputLogger, self).__init__()
+        super().__init__()
         self.stats["tensor_val"] = []
 
 
@@ -222,12 +222,12 @@ def forward(self, x):
 
 
 def _convert_tuple_to_list(t: Any) -> Any:
-    return list(_convert_tuple_to_list(x) for x in t) if type(t) is tuple else t
+    return [_convert_tuple_to_list(x) for x in t] if type(t) is tuple else t
 
 
 def _dequantize_tensor_list(t: Any) -> Any:
     return (
-        list(_dequantize_tensor_list(x) for x in t)
+        [_dequantize_tensor_list(x) for x in t]
         if type(t) is list
         else t.dequantize()
         if t.is_quantized
@@ -248,7 +248,7 @@ class Shadow(nn.Module):
     """
 
     def __init__(self, q_module, float_module, logger_cls):
-        super(Shadow, self).__init__()
+        super().__init__()
         self.orig_module = q_module
         self.shadow_module = float_module
         self.dequant = nnq.DeQuantize()
diff --git a/torch/ao/ns/_numeric_suite_fx.py b/torch/ao/ns/_numeric_suite_fx.py
index c437d1857e30..8b13ec55cc4d 100644
--- a/torch/ao/ns/_numeric_suite_fx.py
+++ b/torch/ao/ns/_numeric_suite_fx.py
@@ -4,11 +4,11 @@
 
     import copy
     import torch
-    import torch.quantization.quantize_fx as quantize_fx
+    import torch.ao.quantization.quantize_fx as quantize_fx
     import torch.ao.ns._numeric_suite_fx as ns
 
     m = torch.nn.Sequential(torch.nn.Conv2d(1, 1, 1)).eval()
-    mp = quantize_fx.prepare_fx(m, {'': torch.quantization.default_qconfig})
+    mp = quantize_fx.prepare_fx(m, {'': torch.ao.quantization.default_qconfig})
     # We convert a copy because we need the original prepared model
     # to be available for comparisons, and `quantize_fx.convert_fx` is inplace.
     mq = quantize_fx.convert_fx(copy.deepcopy(mp))
@@ -122,9 +122,11 @@
 from torch.ao.quantization.backend_config.utils import get_fusion_pattern_to_root_node_getter
 from torch.ao.quantization.backend_config import BackendConfig
 from torch.ao.quantization.fx.match_utils import _find_matches
+from torch.ao.quantization.fx.graph_module import _get_observed_graph_module_attr
 from torch.ao.quantization.fx.qconfig_mapping_utils import _generate_node_name_to_qconfig
 from torch.ao.quantization.fx.quantize_handler import _get_pattern_to_quantize_handlers
 from torch.ao.quantization.qconfig import QConfigAny
+from torch.ao.quantization import QConfigMapping
 from torch.ao.ns.fx.n_shadows_utils import (
     OutputProp,
     _get_dedup_subgraphs,
@@ -132,7 +134,9 @@
     group_results_by_subgraph,
     create_results_comparison,
     print_n_shadows_summary,
-    handle_subgraph,
+    create_n_transformed_and_logged_copies_of_subgraph,
+    create_add_loggers_graph,
+    extract_weight_comparison,
 )
 from torch.ao.ns.fx.qconfig_multi_mapping import QConfigMultiMapping
 
@@ -236,11 +240,13 @@ def forward(self, x):
         return x
 
     def __repr__(self):
-        return f"""OutputLogger(ref_name={self.ref_name}, model_name={self.model_name},
-prev_node_name={self.prev_node_name}, ref_node_name={self.ref_node_name},
-ref_node_target_type={self.ref_node_target_type}
-results_type={self.results_type}, index_within_arg={self.index_within_arg},
-index_of_arg={self.index_of_arg}, fqn={self.fqn})"""
+        clean_dict = {
+            k: v
+            for k, v in self.__dict__.items()
+            # skip nn.Module keys
+            if (k != 'training') and not k.startswith('_')
+        }
+        return f"OutputLogger({clean_dict})"
 
 
 class OutputComparisonLogger(OutputLogger):
@@ -272,7 +278,13 @@ def forward(self, x, x_ref):
         return x
 
     def __repr__(self):
-        return "OutputComparisonLogger"
+        clean_dict = {
+            k: v
+            for k, v in self.__dict__.items()
+            # skip nn.Module keys
+            if (k != 'training') and not k.startswith('_')
+        }
+        return f"OutputComparisonLogger({clean_dict})"
 
 
 class NSTracer(quantize_fx.QuantizationTracer):
@@ -387,11 +399,13 @@ def extract_weights(
     tracer_a = NSTracer(skipped_module_names, skipped_module_classes)
     tracer_b = NSTracer(skipped_module_names, skipped_module_classes)
     gm_a = GraphModule(model_a, tracer_a.trace(model_a))
-    if hasattr(model_a, '_node_name_to_scope'):
-        gm_a._node_name_to_scope = model_a._node_name_to_scope
+    maybe_model_a_node_name_to_scope = _get_observed_graph_module_attr(model_a, 'node_name_to_scope')
+    if maybe_model_a_node_name_to_scope is not None:
+        gm_a._node_name_to_scope = maybe_model_a_node_name_to_scope
     gm_b = GraphModule(model_b, tracer_b.trace(model_b))
-    if hasattr(model_b, '_node_name_to_scope'):
-        gm_b._node_name_to_scope = model_b._node_name_to_scope
+    maybe_model_b_node_name_to_scope = _get_observed_graph_module_attr(model_b, 'node_name_to_scope')
+    if maybe_model_b_node_name_to_scope is not None:
+        gm_b._node_name_to_scope = maybe_model_b_node_name_to_scope
     return _extract_weights_impl(
         model_name_a, gm_a, model_name_b, gm_b, base_name_to_sets_of_related_ops,
         unmatchable_types_map, op_to_type_to_weight_extraction_fn)
@@ -498,11 +512,13 @@ def add_loggers(
     tracer_a = NSTracer(skipped_module_names, skipped_module_classes)
     tracer_b = NSTracer(skipped_module_names, skipped_module_classes)
     gm_a = GraphModule(model_a, tracer_a.trace(model_a))
-    if hasattr(model_a, '_node_name_to_scope'):
-        gm_a._node_name_to_scope = model_a._node_name_to_scope
+    maybe_model_a_node_name_to_scope = _get_observed_graph_module_attr(model_a, 'node_name_to_scope')
+    if maybe_model_a_node_name_to_scope is not None:
+        gm_a._node_name_to_scope = maybe_model_a_node_name_to_scope
     gm_b = GraphModule(model_b, tracer_b.trace(model_b))
-    if hasattr(model_b, '_node_name_to_scope'):
-        gm_b._node_name_to_scope = model_b._node_name_to_scope
+    maybe_model_b_node_name_to_scope = _get_observed_graph_module_attr(model_b, 'node_name_to_scope')
+    if maybe_model_b_node_name_to_scope is not None:
+        gm_b._node_name_to_scope = maybe_model_b_node_name_to_scope
     return _add_loggers_impl(
         name_a, gm_a, name_b, gm_b, logger_cls,
         should_log_inputs=should_log_inputs,
@@ -651,11 +667,13 @@ def add_shadow_loggers(
     tracer_a = NSTracer(skipped_module_names, skipped_module_classes)
     tracer_b = NSTracer(skipped_module_names, skipped_module_classes)
     gm_a = GraphModule(model_a, tracer_a.trace(model_a))
-    if hasattr(model_a, '_node_name_to_scope'):
-        gm_a._node_name_to_scope = model_a._node_name_to_scope
+    maybe_model_a_node_name_to_scope = _get_observed_graph_module_attr(model_a, 'node_name_to_scope')
+    if maybe_model_a_node_name_to_scope is not None:
+        gm_a._node_name_to_scope = maybe_model_a_node_name_to_scope
     gm_b = GraphModule(model_b, tracer_b.trace(model_b))
-    if hasattr(model_b, '_node_name_to_scope'):
-        gm_b._node_name_to_scope = model_b._node_name_to_scope
+    maybe_model_b_node_name_to_scope = _get_observed_graph_module_attr(model_b, 'node_name_to_scope')
+    if maybe_model_b_node_name_to_scope is not None:
+        gm_b._node_name_to_scope = maybe_model_b_node_name_to_scope
     return _add_shadow_loggers_impl(
         name_a, gm_a, name_b, gm_b, logger_cls,
         should_log_inputs=should_log_inputs,
@@ -755,7 +773,7 @@ def prepare_n_shadows_model(
     custom_prepare_fn: Optional[Callable] = None,
     custom_prepare_kwargs: Optional[Dict[str, Any]] = None,
     custom_tracer: Any = None,
-) -> torch.nn.Module:
+) -> GraphModule:
     """
     Given a model with a graph with M ops such as
 
@@ -768,29 +786,26 @@ def prepare_n_shadows_model(
 
     .. code::
 
-      args_kwargs_m -> op_m -> output_m
-           |                        |
-           |---------------------------> mod_with_op_m_transformed_with_qconfig_n
+           |---------> op_m_n -> log_m_n
+           |                     /
+      args_kwargs_m ---------> op_m -> log_m_0
 
-    Where mod_with_op_m_transformed_with_qconfig_n is a submodule, and its
-    inner graph looks like
+    Where op_m_n is op_m wrapped in a submodule and transformed with
+    qconfig_n, and its inner graph looks like
 
     .. code::
 
-      args_m -------- op_m_prepared_with_qconfig_n -> output_m_n -> comparison_logger
-                  /                                                    /
-      kwargs_m ---                                                    /
-                                                                     /
-      output_m ------------------------------------------------------
+      args_m -------- op_m_prepared_with_qconfig_n -> out_m_n
+                  /
+      kwargs_m ---
 
     This is useful for testing different quantization of multiple layers in
     a single pass through the model.
 
     High level TODOs for future PRs:
-    1. add deduplication for qconfigs per subgraph
-    2. figure out a better way to name the output structure
-    3. return a results data structure instead of printing it out
-    4. add examples to docblocks
+    * figure out a better way to name the output structure
+    * return a results data structure instead of printing it out
+    * add examples to docblocks
     """
 
     if custom_tracer is None:
@@ -838,15 +853,113 @@ def prepare_n_shadows_model(
     #     4. run `prepare_fx` on the module
     for (subgraph_idx, (match_name, nodes_in_this_subgraph)) in \
             enumerate(subgraphs_dedup.items()):
-        handle_subgraph(
+        create_n_transformed_and_logged_copies_of_subgraph(
             mt, subgraph_idx, match_name, nodes_in_this_subgraph,
             qconfig_multi_mapping.qconfig_mappings_list, list_of_node_name_to_qconfig,
             custom_prepare_fn, custom_prepare_kwargs
         )
 
-    mt.recompile()
     return mt
 
+# TODO(future PR): we should rethink the names of all the PNP APIs
+def _prepare_n_shadows_add_loggers_model(
+    model: torch.nn.Module,
+    example_inputs: Any,
+    qconfig_mapping: QConfigMapping,
+    backend_config: BackendConfig,
+) -> torch.nn.Module:
+    """
+    Note: this API is not recommended for wide usage, it is only
+    provided for customers who need to migrate from the `add_loggers`
+    API.
+
+    This creates a model which provides logging for the following
+    problem: if we quantize `model` with `qconfig_mapping` and feed
+    the same input through both models, log the comparisons of
+    corresponding intermediate layers.
+
+    The problem is solved with a single model.  Specifically, we
+    partition `model` into N subgraphs, create a copy of each relevant
+    subgraph, wrap it in a module, apply the quantization API to that
+    module, and hook up loggers to measure the comparisons.
+
+    Example starting graph:
+
+      x0 -> op0 -> x1 -> op1 -> x2
+
+    Example config: quantize op0 to int8, do nothing to op1.
+    The following graph will be created:
+
+    .. code::
+
+      x0_0 -> op0_0 -> x1_0 -> log -----> op1_0 -> x2_0 -> log
+       \                        \                           \       # noqa: W605
+         ---> op0_1 -> x1_1 ----> clog -> op1_0 -> x2_1 ----> clog
+
+    Where op0_0 is op0, op0_1 is op0 wrapped in a submodule and quantized
+    to int8, op1_0 is op1 (appearing in the graph twice), log is a logger,
+    and clog is a comparison logger.
+    """
+
+    tracer = quantize_fx.QuantizationTracer([], [])
+    mt = torch.fx.GraphModule(model, tracer.trace(model))
+    # this is necessary to ensure logger FQNs get populated
+    mt._node_name_to_scope = tracer.node_name_to_scope
+
+    # run example input propagation, we need this to call prepare_fx on
+    # individual subgraphs
+    output_prop = OutputProp(mt)
+    output_prop.propagate(*example_inputs)
+
+    # Find the set of subgraphs in the original graph which we need to
+    # consider.
+    modules = dict(mt.named_modules(remove_duplicate=False))
+    patterns = _get_pattern_to_quantize_handlers(backend_config)
+    root_node_getter_mapping = \
+        get_fusion_pattern_to_root_node_getter(backend_config)
+    standalone_module_names: List[str] = []
+    standalone_module_classes: List[Type] = []
+    custom_module_classes: List[Type] = []
+    matches = _find_matches(
+        mt.graph, modules, patterns, root_node_getter_mapping,
+        standalone_module_names, standalone_module_classes, custom_module_classes)
+    subgraphs_dedup: Dict[str, List[Node]] = \
+        _get_dedup_subgraphs(matches)
+
+    # generate node to qconfig for each subgraph
+    node_name_to_qconfig = _generate_node_name_to_qconfig(
+        mt, modules, mt.graph, qconfig_mapping, tracer.node_name_to_scope)
+
+    # Now, mutate the graph to be the add_loggers graph with propagation
+    # error.
+    create_add_loggers_graph(
+        mt, subgraphs_dedup, qconfig_mapping, node_name_to_qconfig)
+
+    return mt
+
+# TODO(future PR): we should rethink the names of all the PNP APIs
+def _n_shadows_compare_weights(
+    model: torch.nn.Module,
+    example_inputs: Any,
+    qconfig_mapping: QConfigMapping,
+    backend_config: BackendConfig,
+) -> NSResultsType:
+    """
+    Note: this API is not recommended for wide usage, it is only
+    provided for customers who need to migrate from the `add_loggers`
+    API.
+    """
+    qconfig_multi_mapping = \
+        QConfigMultiMapping.from_list_qconfig_mapping([qconfig_mapping])
+    mp = prepare_n_shadows_model(
+        model, example_inputs, qconfig_multi_mapping, backend_config)
+    # passing inputs through the model is necessary to populate
+    # observers which observe weights with real values
+    mp(*example_inputs)
+    mq = convert_n_shadows_model(mp)
+    weight_comparison = extract_weight_comparison(mq)
+    return weight_comparison
+
 # TODO(future PR): consider aligning API signature with other similar quantization
 # functions (enable_fake_quant, etc)
 def loggers_set_enabled(model: torch.nn.Module, enabled: bool) -> None:
diff --git a/torch/ao/ns/fx/mappings.py b/torch/ao/ns/fx/mappings.py
index fddf24af48e7..84944e1e8658 100644
--- a/torch/ao/ns/fx/mappings.py
+++ b/torch/ao/ns/fx/mappings.py
@@ -8,7 +8,7 @@
 import torch.ao.nn.quantized as nnq
 import torch.ao.nn.quantized.dynamic as nnqd
 import torch.ao.nn.intrinsic.quantized as nniq
-import torch.nn.intrinsic.quantized.dynamic as nniqd
+import torch.ao.nn.intrinsic.quantized.dynamic as nniqd
 import torch.ao.nn.intrinsic.qat as nniqat
 import torch.ao.nn.intrinsic as nni
 import torch.ao.nn.qat as nnqat
@@ -27,303 +27,307 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]:
     # note: this set is modified below by items from backend_config
     sets_of_related_ops: List[Set[NSNodeTargetType]] = [
         # conv modules
-        set([
+        {
             nn.Conv1d,
-        ]),
-        set([
+        },
+        {
             nn.Conv2d,
-        ]),
-        set([
+        },
+        {
             nn.Conv3d,
-        ]),
+        },
         # conv functionals
-        set([
+        {
             F.conv1d,
-        ]),
-        set([
+        },
+        {
             F.conv2d,
-        ]),
-        set([
+        },
+        {
             F.conv3d,
-        ]),
+        },
         # linear modules
-        set([
+        {
             nn.Linear,
-        ]),
+        },
         # linear functionals
-        set([
+        {
             F.linear,
-        ]),
+        },
         # average pool
-        set([
+        {
             nn.AvgPool1d,
             torch.avg_pool1d,
-        ]),
-        set([
+        },
+        {
             nn.AvgPool2d,
             torch._C._nn.avg_pool2d,
-        ]),
-        set([
+        },
+        {
             nn.AvgPool3d,
             torch._C._nn.avg_pool3d,
-        ]),
+        },
         # adaptive average pool
-        set([
+        {
             nn.AdaptiveAvgPool1d,
             F.adaptive_avg_pool1d,
-        ]),
-        set([
+        },
+        {
             nn.AdaptiveAvgPool2d,
             F.adaptive_avg_pool2d,
-        ]),
-        set([
+        },
+        {
             nn.AdaptiveAvgPool3d,
             F.adaptive_avg_pool3d,
-        ]),
+        },
         # LSTM
-        set([
+        {
             nn.LSTM,
-        ]),
+        },
         # add
-        set([
+        {
             torch.add,
             operator.add,  # x + y
-        ]),
+        },
         # cat
-        set([
+        {
             torch.cat,
-        ]),
+        },
         # mul
-        set([
+        {
             torch.mul,
             operator.mul,
-        ]),
+        },
         # relu
-        set([
+        {
             F.relu,
             nn.ReLU,
             'relu',
             'relu_',
             torch.relu,
-        ]),
+        },
         # maxpool
-        set([
+        {
             nn.MaxPool1d,
             F.max_pool1d,
-        ]),
-        set([
+        },
+        {
             nn.MaxPool2d,
             F.max_pool2d,
-        ]),
-        set([
+        },
+        {
             nn.MaxPool3d,
             F.max_pool3d,
-        ]),
+        },
         # sigmoid
-        set([
+        {
             torch.sigmoid,
             'sigmoid',
             'sigmoid_',
             nn.Sigmoid,
             F.sigmoid,
-        ]),
+        },
         # BatchNorm
-        set([
+        {
             nn.BatchNorm2d,
-        ]),
-        set([
+        },
+        {
             nn.BatchNorm3d,
-        ]),
+        },
         # ConvTranspose
-        set([
+        {
             nn.ConvTranspose1d,
-        ]),
-        set([
+        },
+        {
             nn.ConvTranspose2d,
-        ]),
-        set([
+        },
+        {
             nn.ConvTranspose3d,
-        ]),
+        },
         # ELU
-        set([
+        {
             nn.ELU,
-        ]),
+        },
         # Embedding
-        set([
+        {
             nn.Embedding,
-        ]),
+        },
         # EmbeddingBag
-        set([
+        {
             nn.EmbeddingBag,
-        ]),
+        },
         # GroupNorm
-        set([
+        {
             nn.GroupNorm,
-        ]),
+        },
         # Hardswish
-        set([
+        {
             nn.Hardswish,
-        ]),
+        },
         # InstanceNorm
-        set([
+        {
             nn.InstanceNorm1d,
-        ]),
-        set([
+        },
+        {
             nn.InstanceNorm2d,
-        ]),
-        set([
+        },
+        {
             nn.InstanceNorm3d,
-        ]),
+        },
         # LayerNorm
-        set([
+        {
             nn.LayerNorm,
-        ]),
+        },
         # LeakyReLU
-        set([
+        {
             nn.LeakyReLU,
-        ]),
+        },
         # ReLU6
-        set([
+        {
             nn.ReLU6,
             F.relu6,
-        ]),
+        },
         # F.elu
-        set([
+        {
             F.elu,
-        ]),
+        },
         # F.hardswish
-        set([
+        {
             F.hardswish,
-        ]),
+        },
         # F.group_norm
-        set([
+        {
             F.group_norm,
-        ]),
+        },
         # F.instance_norm
-        set([
+        {
             F.instance_norm,
-        ]),
+        },
         # F.layer_norm
-        set([
+        {
             F.layer_norm,
-        ]),
+        },
         # F.leaky_relu
-        set([
+        {
             F.leaky_relu,
-        ]),
+        },
         # F.silu
-        set([
+        {
             nn.SiLU,
             F.silu,
-        ]),
+        },
         # F.mish
-        set([
+        {
             nn.Mish,
             F.mish,
-        ]),
+        },
         # F.tanh
-        set([
+        {
             nn.Tanh,
             F.tanh,
             torch.tanh,
             'tanh_',
             'tanh',
-        ]),
+        },
         # F.hardsigmoid
-        set([
+        {
             'hardsigmoid_',
             'hardsigmoid',
             F.hardsigmoid,
             nn.Hardsigmoid,
-        ]),
+        },
         # F.hardtanh
-        set([
+        {
             nn.Hardtanh,
             F.hardtanh,
             F.hardtanh_,
-        ]),
+        },
         # floordiv
-        set([
+        {
             operator.floordiv,
-        ]),
+        },
         # unsqueeze
-        set([
+        {
             torch.unsqueeze,
-        ]),
+        },
         # stack
-        set([
+        {
             torch.stack,
-        ]),
+        },
         # squeeze
-        set([
+        {
             torch.squeeze,
-        ]),
+        },
         # sort
-        set([
+        {
             torch.sort,
-        ]),
+        },
         # repeat_interleave
-        set([
+        {
             torch.repeat_interleave,
-        ]),
+        },
         # min
-        set([
+        {
             torch.min,
-        ]),
+        },
         # mean
-        set([
+        {
             torch.mean,
-        ]),
+        },
         # max
-        set([
+        {
             torch.max,
-        ]),
+        },
         # transpose
-        set([
+        {
             torch.transpose,
-        ]),
+        },
         # flatten
-        set([
+        {
             torch.flatten,
-        ]),
+        },
         # clamp
-        set([
+        {
             torch.clamp,
-        ]),
+        },
         # chunk
-        set([
+        {
             torch.chunk,
-        ]),
+        },
         # interpolate
-        set([
+        {
             torch.nn.functional.interpolate,
-        ]),
+        },
         # dropout
-        set([
+        {
             nn.Dropout,
-        ]),
+        },
         # F.dropout
-        set([
+        {
             F.dropout,
-        ]),
+        },
         # matmul
-        set([
+        {
             torch.matmul,
-        ]),
+        },
         # Softmax
-        set([
+        {
             nn.Softmax,
-        ]),
+        },
         # PReLU
-        set([
+        {
             nn.PReLU,
             nnq.PReLU,
-        ]),
+        },
         # F.prelu
-        set([
+        {
             F.prelu,
             toq.prelu,
-        ]),
+        },
+        # pixel shuffle
+        {
+            F.pixel_shuffle,
+        },
     ]
 
     # for each floating point op, add versions of the op added by
@@ -373,6 +377,7 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]:
 
     for source_to_double_target in (
         _lower_to_native_backend.STATIC_LOWER_FUSED_MODULE_MAP,
+        _lower_to_native_backend.STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP,
         _lower_to_native_backend.DYNAMIC_LOWER_FUSED_MODULE_MAP,
     ):
         for source, (target1, target2) in source_to_double_target.items():  # type: ignore[attr-defined]
@@ -452,12 +457,12 @@ def add_op_to_sets_of_related_ops(
         counter = 0
         while str(counter) in base_name_to_sets_of_related_ops:
             counter += 1
-        base_name_to_sets_of_related_ops[str(counter)] = set([op])
+        base_name_to_sets_of_related_ops[str(counter)] = {op}
 
 
 # TODO(future PR): clean this up
 def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
-    FUNS_IO_TYPE_FP32: Set[NSNodeTargetType] = set([
+    FUNS_IO_TYPE_FP32: Set[NSNodeTargetType] = {
         F.linear,
         F.conv1d,
         F.conv2d,
@@ -477,11 +482,11 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
         torch.mul,
         torch.sum,
         F.prelu,
-    ])
+    }
 
     FUNS_IO_TYPE_FP16: Set[NSNodeTargetType] = set()
 
-    FUNS_IO_TYPE_INT8: Set[NSNodeTargetType] = set([
+    FUNS_IO_TYPE_INT8: Set[NSNodeTargetType] = {
         toq.linear,
         toq.linear_relu,
         toq.conv1d,
@@ -502,9 +507,9 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
         # uncomment below
         # toq.add,
         # toq.mul,
-    ])
+    }
 
-    FUNS_IO_TYPE_FP32_OR_INT8: Set[NSNodeTargetType] = set([
+    FUNS_IO_TYPE_FP32_OR_INT8: Set[NSNodeTargetType] = {
         F.relu,
         F.tanh,
         torch.tanh,
@@ -523,6 +528,7 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
         F.max_pool2d,
         F.max_pool3d,
         F.relu6,
+        F.pixel_shuffle,
         torch.avg_pool1d,
         torch._C._nn.avg_pool2d,
         torch._C._nn.avg_pool3d,
@@ -540,9 +546,9 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
         torch.stack,
         torch.unsqueeze,
         operator.add,
-    ])
+    }
 
-    MODS_IO_TYPE_FP32: Set[NSNodeTargetType] = set([
+    MODS_IO_TYPE_FP32: Set[NSNodeTargetType] = {
         nn.Linear,
         nnqat.Linear,
         nnqatd.Linear,
@@ -603,9 +609,11 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
         nniqd.LinearReLU,
         nni.LinearLeakyReLU,
         nni.LinearTanh,
-    ])
+        nni.ConvAdd2d,
+        nni.ConvAddReLU2d,
+    }
 
-    MODS_IO_TYPE_INT8: Set[NSNodeTargetType] = set([
+    MODS_IO_TYPE_INT8: Set[NSNodeTargetType] = {
         nnq.Linear,
         nnq.Conv1d,
         nnq.Conv2d,
@@ -635,9 +643,11 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
         nniq.LinearReLU,
         nniq.LinearLeakyReLU,
         nniq.LinearTanh,
-    ])
+        nniq.ConvAdd2d,
+        nniq.ConvAddReLU2d,
+    }
 
-    MODS_IO_TYPE_FP32_OR_INT8: Set[NSNodeTargetType] = set([
+    MODS_IO_TYPE_FP32_OR_INT8: Set[NSNodeTargetType] = {
         nn.ReLU,
         nn.Tanh,
         nn.Sigmoid,
@@ -655,9 +665,9 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
         nn.MaxPool2d,
         nn.MaxPool3d,
         nn.ReLU6,
-    ])
+    }
 
-    METHS_IO_TYPE_FP32_OR_INT8: Set[NSNodeTargetType] = set([
+    METHS_IO_TYPE_FP32_OR_INT8: Set[NSNodeTargetType] = {
         'sigmoid_',
         'sigmoid',
         'tanh_',
@@ -666,7 +676,7 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
         'hardsigmoid',
         'relu_',
         'relu',
-    ])
+    }
 
     return {
         'funs_io_type_fp32': FUNS_IO_TYPE_FP32,
@@ -682,16 +692,16 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
 
 def get_unmatchable_types_map() -> Dict[str, Set[NSNodeTargetType]]:
 
-    FUNS_UNMATCHABLE: Set[NSNodeTargetType] = set([
+    FUNS_UNMATCHABLE: Set[NSNodeTargetType] = {
         torch.quantize_per_tensor,
         operator.getitem,
-    ])
+    }
 
-    MODS_UNMATCHABLE: Set[NSNodeTargetType] = set([
+    MODS_UNMATCHABLE: Set[NSNodeTargetType] = {
         nn.Identity,
-    ])
+    }
 
-    METHS_UNMATCHABLE: Set[NSNodeTargetType] = set([
+    METHS_UNMATCHABLE: Set[NSNodeTargetType] = {
         'to',
         'dequantize',
         'reshape',
@@ -714,7 +724,7 @@ def get_unmatchable_types_map() -> Dict[str, Set[NSNodeTargetType]]:
         'contiguous',
         'clamp',
         'chunk',
-    ])
+    }
 
     return {
         'funs_unmatchable': FUNS_UNMATCHABLE,
diff --git a/torch/ao/ns/fx/n_shadows_utils.py b/torch/ao/ns/fx/n_shadows_utils.py
index af3dbcc9fb42..a5a5921cbd99 100644
--- a/torch/ao/ns/fx/n_shadows_utils.py
+++ b/torch/ao/ns/fx/n_shadows_utils.py
@@ -17,10 +17,10 @@
 )
 from torch.ao.ns.fx.graph_passes import _maybe_get_fqn
 from torch.ao.quantization import QConfigMapping
-from torch.ao.quantization.fx.custom_config import PrepareCustomConfig
 from torch.ao.quantization.qconfig import QConfigAny
 from torch.ao.quantization.utils import getattr_from_fqn
 from torch.ao.quantization.fx.match_utils import _MatchResult
+from torch.utils._pytree import tree_map
 
 import collections
 import copy
@@ -243,65 +243,6 @@ def _get_logger_for_subgraph(
     logger_mod_orig.enabled = False
     return logger_mod_orig
 
-def _add_logger_to_subgraph_wrapper(
-    model: GraphModule,
-    subgraph_idx: int,
-    subgraph_candidate_idx: int,
-    qconfig_str: str,
-    logger_cls: Callable,
-    ref_output_node: Node,
-    fqn: Optional[str],
-) -> None:
-    """
-    Given a model which consists of a subgraph and nothing else, adds a logger
-    to the end of this model. The logger takes `ref_output_node` as the reference
-    output, and does the comparison during calibration time.
-    """
-    first_node, last_node, first_non_ph_node = None, None, None
-    for idx, node in enumerate(model.graph.nodes):  # type: ignore[union-attr, arg-type]
-        if idx == 0:
-            first_node = node
-        elif idx == len(model.graph.nodes) - 1:  # type: ignore[union-attr, arg-type]
-            # last node is the output, so we want the first
-            # arg of the output
-            last_node = node.args[0]
-        if first_non_ph_node is None and node.op != 'placeholder':
-            first_non_ph_node = node
-    assert first_node is not None and last_node is not None and \
-        first_non_ph_node is not None
-    logger_mod = _get_logger_for_subgraph(
-        model, first_non_ph_node, last_node, subgraph_idx,  # type: ignore[arg-type]
-        subgraph_candidate_idx, qconfig_str, logger_cls, fqn)
-    attr_name = _get_attr_name(subgraph_idx, subgraph_candidate_idx)
-    assert not hasattr(model, attr_name)
-    setattr(model, attr_name, logger_mod)
-
-    # add a new placeholder to the original subgraph module
-    # to represent the reference input
-    # before:
-    #
-    #   x0 -> mod -> x1
-    #
-    # after:
-    #
-    #   x0 -> mod -> x1
-    #         /
-    #   x0_ref
-
-    ph_name = 'SHADOW_PH_NAME'
-    # verify a node with this name does not exist
-    assert len([n for n in model.graph.nodes if n.name == ph_name]) == 0, \
-        'graph already contains node with name {ph_name}'
-
-    new_ph = None
-    with model.graph.inserting_before(first_node):
-        new_ph = model.graph.placeholder(ph_name)
-
-    with model.graph.inserting_after(last_node):
-        new_node = model.graph.call_module(
-            attr_name, args=(last_node, new_ph), kwargs={})
-    model.recompile()
-
 def create_submodule_from_subgraph(
     model: torch.nn.Module,
     first_node: Node,
@@ -491,7 +432,7 @@ def _add_placeholder(
     gm.recompile()
     return gm
 
-def handle_subgraph_candidate(
+def create_one_transformed_and_logged_copy_of_subgraph(
     mt: GraphModule,
     subgraph_idx: int,
     subgraph_candidate_idx: int,
@@ -500,6 +441,7 @@ def handle_subgraph_candidate(
     fqn: Optional[str],
     list_of_node_name_to_qconfig: List[Dict[str, QConfigAny]],
     example_inputs: Any,
+    last_added_shadow_node_list: List[Optional[Node]],
     custom_prepare_fn: Optional[Callable] = None,
     custom_prepare_kwargs: Dict[str, Any] = None,
 ) -> None:
@@ -531,6 +473,7 @@ def handle_subgraph_candidate(
         setattr(mt, attr_name, logger_mod_orig)
         with mt.graph.inserting_after(last_node):
             new_node = mt.graph.call_module(attr_name, args=(last_node,), kwargs={})
+            last_added_shadow_node_list[0] = new_node
 
     else:
         # idx > 0 means we have a candidate qconfig to try, so we need
@@ -556,22 +499,10 @@ def handle_subgraph_candidate(
         orig_mod_copy_wrapped = create_submodule_from_subgraph(
             mt, first_node, last_node)
 
-        # add a logger to the end of this submodule
-        # get first and last nodes of the submodule
-        _add_logger_to_subgraph_wrapper(
-            orig_mod_copy_wrapped, subgraph_idx, subgraph_candidate_idx,
-            str(qconfig), OutputComparisonLogger, last_node, fqn)
-
-        # We need to set the loggers as non traceable to have them survive
-        # prepare_fx and convert_fx calls.
-        prepare_custom_config = PrepareCustomConfig()\
-            .set_non_traceable_module_classes([OutputLogger, OutputComparisonLogger])
-
         # add a call to prepare_fx on the wrapper module
         if custom_prepare_fn is None:
             orig_mod_copy_wrapped = torch.ao.quantization.quantize_fx.prepare_fx(
-                orig_mod_copy_wrapped, qconfig_mapping, example_inputs=example_inputs,
-                prepare_custom_config=prepare_custom_config)
+                orig_mod_copy_wrapped, qconfig_mapping, example_inputs=example_inputs)
         else:
             if custom_prepare_kwargs is None:
                 custom_prepare_kwargs = {}
@@ -579,7 +510,6 @@ def handle_subgraph_candidate(
                 assert kwarg_name not in custom_prepare_kwargs, f"cannot specify {kwarg_name} in custom_prepare_kwargs"
             prepare_kwargs: Dict[str, Any] = {
                 "example_inputs": example_inputs,
-                "prepare_custom_config": prepare_custom_config,
                 "qconfig_mapping": qconfig_mapping
             }
             prepare_kwargs.update(custom_prepare_kwargs)
@@ -593,16 +523,14 @@ def handle_subgraph_candidate(
         setattr(mt, attr_name, orig_mod_copy_wrapped)
 
         # add a call to the wrapper module from the parent graph
-        with mt.graph.inserting_after(last_node):
+        insert_after_node = last_added_shadow_node_list[0]
+        with mt.graph.inserting_after(insert_after_node):
             # TODO(future PR): handle fusion patterns where non-first nodes
             # need inputs
 
             # pass in all node args and kwargs
 
-            # the first argument is always the reference output of the last
-            # node of this subgraph
-            new_args = [last_node]
-
+            new_args = []
             for arg in first_node.args:
                 if isinstance(arg, Node):
                     new_args.append(arg)
@@ -625,7 +553,21 @@ def handle_subgraph_candidate(
             new_node = mt.graph.call_module(
                 attr_name, args=new_args, kwargs=new_kwargs)
 
-def handle_subgraph(
+        # add a logger to parent graph to observe the shadow wrapper
+        logger_mod_orig = _get_logger_for_subgraph(
+            mt, first_node, last_node, subgraph_idx, subgraph_candidate_idx,
+            str(qconfig), OutputComparisonLogger, fqn)
+
+        attr_name = _get_attr_name(subgraph_idx, subgraph_candidate_idx)
+        assert not hasattr(mt, attr_name)
+        setattr(mt, attr_name, logger_mod_orig)
+        with mt.graph.inserting_after(new_node):
+            logger = mt.graph.call_module(attr_name, args=(new_node, last_node), kwargs={})
+            last_added_shadow_node_list[0] = logger
+
+    mt.recompile()
+
+def create_n_transformed_and_logged_copies_of_subgraph(
     mt: GraphModule,
     subgraph_idx: int,
     match_name: str,
@@ -705,11 +647,442 @@ def handle_subgraph(
 
     fqn = _maybe_get_fqn(first_node, mt)
 
+    # We want the results to contain the subgraphs in natural order,
+    # and the graph to also contain shadow wrappers and shadow loggers
+    # in natural order.
+    # If we just iterate in reverse, the graph will be in natural
+    # order but the eventual results will be in reverse order.
+    # So, we keep track of the last shadow logger we added and
+    # always insert after it.
+    last_added_shadow_node_list: List[Optional[Node]] = [None]
     for subgraph_candidate_idx in range(len(qconfig_mappings) + 1):
-        handle_subgraph_candidate(
+
+        create_one_transformed_and_logged_copy_of_subgraph(
             mt, subgraph_idx, subgraph_candidate_idx, first_node,
             last_node, fqn, list_of_node_name_to_qconfig,
-            example_inputs, custom_prepare_fn, custom_prepare_kwargs)
+            example_inputs, last_added_shadow_node_list, custom_prepare_fn,
+            custom_prepare_kwargs)
+
+def create_add_loggers_graph(
+    model: GraphModule,
+    subgraphs_dedup: Dict[str, List[Node]],
+    qconfig_mapping: QConfigMapping,
+    node_name_to_qconfig: Dict[str, QConfigAny],
+) -> None:
+    """
+    Given a model, a model graph partition (currently a set of matched
+    subgraphs) and instructions how to transform each subgraph
+    (currently quantizing it according to qconfig_mapping), modifies
+    the model graph to create an alternate path through the original graph,
+    with each of the subgraphs quantized.  This is useful to compare
+    propagation error of a transformation such as quantization.
+
+    For example, given layer op0 and op1, there are four cases when handling op1:
+    1. op0 and op1 quantized
+    2. op0 and op1 unquantized
+    3. op0 quantized, op1 unquantized
+    4. op0 unquantized, op1 quantized
+
+    Example input, case 1:
+
+    .. code::
+
+      x0_0 -> op0_0 -> x1_0 -> log -----> op1_0 -> x2_0 -> log
+       \                        \          \                 \       # noqa: W605
+         ---> op0_1 -> x1_1 ----> clog    op1_1 -> x2_1 ----> clog
+
+    Example output, case 1:
+
+    .. code::
+
+      x0_0 -> op0_0 -> x1_0 -> log -----> op1_0 -> x2_0 -> log
+       \                        \                           \        # noqa: W605
+         ---> op0_1 -> x1_1 ----> clog -> op1_1 -> x2_1 ----> clog
+
+    """
+    # TODO(future PR): move logger classes to utils to remove circular dependency
+    from torch.ao.ns._numeric_suite_fx import OutputLogger, OutputComparisonLogger
+
+    def _get_subgraph_containing_node(node, subgraphs_dedup):
+        for name, subgraph in subgraphs_dedup.items():
+            if node in subgraph:
+                return subgraph
+        return None
+
+    # First, we need to create shadow branches, going from
+    #
+    #   x0 -> op0 -> x1 -> ...
+    #
+    #
+    # to
+    #
+    #   x0 -> op0_0 -> x1_0 -> log -> ...
+    #    \                     \
+    #      -> op0_1 -> x1_1 -> clog
+    #
+    # Later, the outputs of each shadow will be rerouted to calculate
+    # propagation error.
+
+    # Note: we cannot iterate over matched subgraphs because some nodes
+    # may not be matched. So, we iterate over nodes in the graph, and
+    # associate them to matched subgraphs if possible.
+
+    nodes_to_skip = set()
+    # for each subgraph, save a mapping from first node of subgraph
+    # to first and last node of the shadow of this subgraph
+    orig_first_node_to_shadow_in_node = {}
+    orig_first_node_to_shadow_out_node = {}
+    # need to record original list because we will mutate the graph as we go
+    orig_nodes = list(model.graph.nodes)  # type: ignore[union-attr, arg-type]
+    cur_subgraph_idx = 0
+    for n in orig_nodes:
+        if n.op in ('placeholder', 'get_attr', 'output') or n in nodes_to_skip:
+            continue
+
+        maybe_subgraph = _get_subgraph_containing_node(n, subgraphs_dedup)
+        insert_submodule_copy = False
+        if maybe_subgraph is not None:
+            first_node, last_node = maybe_subgraph[0], maybe_subgraph[-1]
+            for node_to_skip in maybe_subgraph:
+                nodes_to_skip.add(node_to_skip)
+            qconfig = node_name_to_qconfig[first_node.name]
+            if qconfig is not None:
+                insert_submodule_copy = True
+        else:
+            first_node, last_node = n, n
+
+        if insert_submodule_copy:
+            match_name = first_node.name
+            create_n_transformed_and_logged_copies_of_subgraph(
+                model, cur_subgraph_idx, match_name, maybe_subgraph,
+                [qconfig_mapping], [node_name_to_qconfig],
+                None, None
+            )
+            # find the created shadow module and record it so we
+            # can find it easily in step 2
+            expected_shadow_target = f"shadow_wrapper_{cur_subgraph_idx}_1"
+            new_shadow_mod = None
+            for maybe_shadow_mod in model.graph.nodes:
+                if maybe_shadow_mod.op == 'call_module' and \
+                        maybe_shadow_mod.target == expected_shadow_target:
+                    new_shadow_mod = maybe_shadow_mod
+                    break
+            assert new_shadow_mod is not None
+            orig_first_node_to_shadow_in_node[first_node] = new_shadow_mod
+            orig_first_node_to_shadow_out_node[first_node] = new_shadow_mod
+
+        else:
+            # create a copy of the subgraph by only copying FX nodes
+            # but not copying any parameters, to minimize memory usage
+            subgraph_to_use = maybe_subgraph if maybe_subgraph is not None \
+                else [first_node]
+
+            # add a regular logger after last_node
+            qconfig_str = ''
+            subgraph_candidate_idx = 0
+            fqn = _maybe_get_fqn(first_node, model)
+            logger_mod_orig = _get_logger_for_subgraph(
+                model, first_node, last_node, cur_subgraph_idx, subgraph_candidate_idx,
+                qconfig_str, OutputLogger, fqn)
+            attr_name = _get_attr_name(cur_subgraph_idx, subgraph_candidate_idx)
+            assert not hasattr(model, attr_name)
+            setattr(model, attr_name, logger_mod_orig)
+            insertion_point = last_node
+            with model.graph.inserting_after(insertion_point):
+                logger = model.graph.call_module(
+                    attr_name, args=(last_node,), kwargs={})
+                insertion_point = logger
+
+            # create a copy of the subgraph
+            cur_node_orig = first_node
+            cur_node_copy = None
+            first_node_copy = None
+            while cur_node_orig in subgraph_to_use:
+                # TODO(future PR): make this support all possible args/kwargs
+                if cur_node_orig is first_node:
+                    new_args = cur_node_orig.args
+                    new_kwargs = cur_node_orig.kwargs
+                else:
+                    first_arg_for_copy = cur_node_copy
+                    new_args = tuple([first_arg_for_copy, *cur_node_orig.args[1:]])  # noqa: C409
+                    new_kwargs = cur_node_orig.kwargs
+                # make a copy of cur_node_orig
+                with model.graph.inserting_after(insertion_point):
+                    cur_node_copy = model.graph.create_node(
+                        cur_node_orig.op,
+                        cur_node_orig.target,
+                        new_args,
+                        new_kwargs,
+                        # cur_node_orig.name,  # TODO(future PR): set name explicitly
+                    )
+                    if first_node_copy is None:
+                        first_node_copy = cur_node_copy
+                # since now only linear subgraphs are supported, all nodes
+                # except the last one must have only one user
+                if cur_node_orig != last_node:
+                    assert len(cur_node_orig.users.keys()) == 1
+                cur_node_orig = list(cur_node_orig.users.keys())[0]
+                assert not cur_node_orig.name.startswith(SHADOW_NODE_NAME_PREFIX)
+                insertion_point = cur_node_copy
+
+            # add a comparison logger after last_node's copy
+            subgraph_candidate_idx = 1
+            logger_mod_orig = _get_logger_for_subgraph(
+                model, first_node, last_node, cur_subgraph_idx, subgraph_candidate_idx,
+                qconfig_str, OutputComparisonLogger, fqn)
+            attr_name = _get_attr_name(cur_subgraph_idx, subgraph_candidate_idx)
+            assert not hasattr(model, attr_name)
+            setattr(model, attr_name, logger_mod_orig)
+            with model.graph.inserting_after(insertion_point):
+                logger = model.graph.call_module(
+                    attr_name, args=(cur_node_copy, last_node), kwargs={})
+
+            # save the final node so we can use it in step 2
+            orig_first_node_to_shadow_in_node[first_node] = first_node_copy
+            orig_first_node_to_shadow_out_node[first_node] = cur_node_copy
+
+        cur_subgraph_idx += 1
+
+    model.recompile()
+
+    # Now, we go from
+    #
+    #   x0 -> op0_0 -> x1_0 -> log -> x1 -> op1_0 -> ...
+    #    \                     \       \
+    #      -> op0_1 -> x1_1 -> clog      -> op1_1 -> ...
+    #
+    # to
+    #
+    #   x0 -> op0_0 -> x1_0 -> log --> x1_0 -> op1_0 -> ...
+    #    \                     \
+    #      -> op0_1 -> x1_1 -> clog -> x1_1 -> op1_1 -> ...
+    #
+    # sample values of key internal variables for the example above:
+    #
+    #   orig_first_node_to_shadow_in_node = {op0_0: op0_1, op1_0: op1_1}
+    #   orig_first_node_to_shadow_out_node = {op0_0: op0_1, op1_0: op1_1}
+    #
+    # note: for subgraphs with more than one node, in_node will be different
+    # compared to out_node
+
+
+    nodes_to_skip = set()
+    for n in orig_nodes:
+        if n.op in ('placeholder', 'get_attr', 'output') or n in nodes_to_skip:
+            continue
+
+        maybe_subgraph = _get_subgraph_containing_node(n, subgraphs_dedup)
+        if maybe_subgraph is not None:
+            first_node, last_node = maybe_subgraph[0], maybe_subgraph[-1]
+            for node_to_skip in maybe_subgraph:
+                nodes_to_skip.add(node_to_skip)
+        else:
+            first_node, last_node = n, n
+
+        def maybe_remap_node_to_shadow(node):
+            """
+            If unshadowed `node` has a shadow version, return that. If not,
+            return `node`.
+            """
+            if not isinstance(node, Node):
+                # handle scalars
+                return node
+
+            if node.op in ('placeholder', 'get_attr'):
+                return node
+
+            # Find the shadowed version of this arg from the previous
+            # subgraph. For this, we need to:
+            # 1. navigate to the first node of the previous subgraph
+            # 2. get the output of the shadow wrapper which has (1) as an input
+
+            # For now, assume the arg is in matched subgraphs. In the
+            # future we may have to handle the case where this is not true.
+            prev_subgraph = _get_subgraph_containing_node(
+                node, subgraphs_dedup)
+            if prev_subgraph is None:
+                prev_subgraph = [node]
+            prev_first_node = prev_subgraph[0]
+            prev_shadow_output = \
+                orig_first_node_to_shadow_out_node[prev_first_node]
+            return prev_shadow_output
+
+        cur_shadow_input = \
+            orig_first_node_to_shadow_in_node[first_node]
+        assert cur_shadow_input is not None
+        cur_shadow_input.args = tree_map(
+            maybe_remap_node_to_shadow, cur_shadow_input.args)
+        cur_shadow_input.kwargs = tree_map(
+            maybe_remap_node_to_shadow, cur_shadow_input.kwargs)
+
+        model.recompile()
+
+def _get_weight_info_from_shadow_wrapper(shadow_wrapper: torch.nn.Module):
+    # input: shadow wrapper module
+    # output if shadow wrapper module has a weighted op:
+    #   (quantize_fn, (quantize_fn_args))
+    # output if shadow wrapper module doesn't have a weighted op:
+    #   None
+
+    # For now, assume that the weight is the second input
+    # to the shadow module. If that changes, we can fix it later.
+    placeholders_seen = 0
+    for shadow_n in shadow_wrapper.graph.nodes:  # type: ignore[union-attr]
+        if shadow_n.op != 'placeholder':
+            continue
+
+        placeholders_seen += 1
+        if placeholders_seen != 2:
+            continue
+
+        # the subgraph looks like
+        #
+        #   _input_scale_1 = self._input_scale_1
+        #   _input_zero_point_1 = self._input_zero_point_1
+        #   quantize_per_channel = torch.quantize_per_channel(
+        #       w2_0, _input_scale_1, _input_zero_point_1,
+        #       0, torch.qint8)
+        #
+        #  we have `w2_0`, and are navigating this subgraph
+        #  to get `_input_scale_1` and `_input_zero_point_1`
+
+        assert len(shadow_n.users) == 1
+        quant_node = list(shadow_n.users.keys())[0]
+        new_args: Any = None
+        if quant_node.target == torch.quantize_per_channel:
+            _weight, scale_node, zp_node, axis, dtype = quant_node.args
+            scale_val = getattr_from_fqn(
+                shadow_wrapper, scale_node.target)
+            zp_val = getattr_from_fqn(
+                shadow_wrapper, zp_node.target)
+            new_args = (scale_val, zp_val, axis, dtype)
+        else:
+            assert quant_node.target == torch.quantize_per_tensor
+            _weight, scale_node, zp_node, dtype = quant_node.args
+            scale_val = getattr_from_fqn(
+                shadow_wrapper, scale_node.target)
+            zp_val = getattr_from_fqn(
+                shadow_wrapper, zp_node.target)
+            new_args = (scale_val, zp_val, dtype)
+        return (quant_node.target, new_args)
+
+    return None
+
+
+def extract_weight_comparison(m: GraphModule) -> NSResultsType:
+
+    # example graph:
+    #
+    #   w1 = self.w1
+    #   b1 = self.b1
+    #   linear = torch._C._nn.linear(x, w1, b1)
+    #   shadow_0_0 = self.shadow_0_0(linear)
+    #   shadow_wrapper_0_1 = self.shadow_wrapper_0_1(x, w1, b1)
+    #   shadow_0_1 = self.shadow_0_1(shadow_wrapper_0_1, linear)
+    #
+    # algorithm:
+    # 1. for each call_function node matching our allowlist:
+    # 2.   if corresponding shadow wrapper exists, extract the weight pair
+    #
+    # Note: this is not super robust, but that's ok because this is
+    # just for legacy customers who depend on the previous two-model version
+    # of this API. TBD if we need to make this robust.
+    # Note: modules are not supported, since existing customers only
+    # use functions.
+
+    # TODO(future PR): move this to config
+    weighted_ops = {
+        torch.nn.functional.linear,
+    }
+
+    results: NSResultsType = {
+        'model': {NSSingleResultValuesType.WEIGHT.value: {}}
+    }
+
+    for n in m.graph.nodes:  # type: ignore[union-attr]
+        if not (n.op == 'call_function' and n.target in weighted_ops):
+            continue
+
+        # Check if we have a corresponding shadow wrapper
+        # TODO(future PR, if needed): support kwargs
+        # TODO(future PR, if needed): support multiple shadow users
+        first_arg = n.args[0]
+        shadow_wrapper_node = None
+        for user in first_arg.users:
+            # TODO(before land): fix string match
+            if user.op == 'call_module' and \
+                    user.target.startswith('shadow_wrapper'):
+                shadow_wrapper_node = user
+                break
+
+        if shadow_wrapper_node is None:
+            continue
+
+        shadow_wrapper = getattr_from_fqn(
+            m, shadow_wrapper_node.target)  # type: ignore[arg-type]
+        weight_info = _get_weight_info_from_shadow_wrapper(
+            shadow_wrapper)
+        if weight_info is None:
+            continue
+
+        # get weight
+        w_node = n.args[1]
+        w_obj = getattr_from_fqn(m, w_node.target).detach()
+
+        # get a quantized version of weight
+        quant_fn, quant_fn_args_except_first = weight_info
+        new_args = (w_obj, *quant_fn_args_except_first)
+        w_obj_q = quant_fn(*new_args)
+
+        # add a comparison
+        ref_node_name = n.name
+        prev_node_name = n.name
+        ref_node_type = get_target_type_str(n, m)
+        prev_node_type = ref_node_type
+        fqn = None
+        if hasattr(m, '_node_name_to_scope'):
+            fqn = m._node_name_to_scope[n.name][0]  # type: ignore[index]
+        comparison = torch.ao.ns.fx.utils.compute_sqnr(w_obj, w_obj_q)
+        result_fp32 = {
+            'res_type': NSSingleResultValuesType.WEIGHT.value,
+            'values': [w_obj],
+            'prev_node_name': prev_node_name,
+            'prev_node_target_type': prev_node_type,
+            'ref_node_name': ref_node_name,
+            'ref_node_target_type': ref_node_type,
+            'index_within_arg': 0,
+            'index_of_arg': 0,
+            'fqn': fqn,
+            'qconfig_str': '',
+            'comparisons': [comparison],
+            'comparison_fn_name': 'sqnr',
+        }
+        result_q = {
+            'res_type': NSSingleResultValuesType.WEIGHT.value,
+            'values': [w_obj_q],
+            'prev_node_name': prev_node_name,
+            'prev_node_target_type': prev_node_type,
+            'ref_node_name': ref_node_name,
+            'ref_node_target_type': ref_node_type,
+            'index_within_arg': 0,
+            'index_of_arg': 0,
+            'fqn': fqn,
+            'qconfig_str': '',
+            'comparisons': [comparison],
+            'comparison_fn_name': 'sqnr',
+        }
+
+        # go from subgraph_n_1 to subgraph_n_0
+        _1, _2, node_idx, _3 = shadow_wrapper_node.target.split('_')
+        name_fp32 = f"subgraph_{node_idx}_0"
+        name_q = f"subgraph_{node_idx}_1"
+
+        results['model'][NSSingleResultValuesType.WEIGHT.value][name_fp32] = \
+            [result_fp32]
+        results['model'][NSSingleResultValuesType.WEIGHT.value][name_q] = \
+            [result_q]
+
+    return results
 
 # TODO(future PR): redesign this to make it easier to consume outputs
 def group_results_by_subgraph(results: NSResultsType) -> Any:
@@ -771,8 +1144,11 @@ def group_results_by_subgraph(results: NSResultsType) -> Any:
     """
     subgraph_name_to_subgraph_results: Any = collections.defaultdict(dict)
 
+    # node_output or weight
+    key_to_use = list(results['model'].keys())[0]
+
     for subgraph_name_with_idx, subgraph_candidate_results in \
-            results['model']['node_output'].items():
+            results['model'][key_to_use].items():
 
         # convert from `subgraph_m_n` to `subgraph_m` and `n`
         subgraph_str, subgraph_idx, subgraph_candidate_idx = \
diff --git a/torch/ao/ns/fx/utils.py b/torch/ao/ns/fx/utils.py
index db42c38abd44..8d6f54ef9c14 100644
--- a/torch/ao/ns/fx/utils.py
+++ b/torch/ao/ns/fx/utils.py
@@ -3,7 +3,7 @@
 
 import torch
 import torch.nn as nn
-import torch.nn.intrinsic.quantized as nniq
+import torch.ao.nn.intrinsic.quantized as nniq
 import torch.ao.nn.quantized as nnq
 
 toq = torch.ops.quantized
diff --git a/torch/ao/ns/fx/weight_utils.py b/torch/ao/ns/fx/weight_utils.py
index e02d464a1fb7..870b183acc61 100644
--- a/torch/ao/ns/fx/weight_utils.py
+++ b/torch/ao/ns/fx/weight_utils.py
@@ -5,8 +5,8 @@
 import torch.ao.nn.quantized as nnq
 import torch.ao.nn.intrinsic.qat as nniqat
 import torch.ao.nn.qat as nnqat
-import torch.nn.intrinsic as nni
-import torch.nn.intrinsic.quantized as nniq
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.quantized as nniq
 toq = torch.ops.quantized
 from torch.fx import GraphModule
 from torch.fx.graph import Node
@@ -50,15 +50,11 @@ def get_qlstm_weight(mod: nn.Module) -> List[torch.Tensor]:
 
 def get_conv_mod_weight(mod: nn.Module) -> torch.Tensor:
     if (
-        isinstance(mod, nn.Conv1d) or
-        isinstance(mod, nn.Conv2d) or
-        isinstance(mod, nn.Conv3d)
+        isinstance(mod, (nn.Conv1d, nn.Conv2d, nn.Conv3d))
     ):
         return mod.weight.detach()
     elif (
-        isinstance(mod, nni.ConvReLU1d) or
-        isinstance(mod, nni.ConvReLU2d) or
-        isinstance(mod, nni.ConvReLU3d)
+        isinstance(mod, (nni.ConvReLU1d, nni.ConvReLU2d, nni.ConvReLU3d))
     ):
         return mod[0].weight.detach()
     else:
diff --git a/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py b/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py
index 7cdaf95af8c8..a61ffe694d7e 100644
--- a/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py
+++ b/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py
@@ -8,7 +8,7 @@
 __all__ = ['BaseDataScheduler']
 
 
-class BaseDataScheduler(object):
+class BaseDataScheduler:
     r"""
     The BaseDataScheduler is the abstract scheduler class specifically for the
     BaseDataSparsifier class. This class controls a specific hyperparameter of
diff --git a/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py b/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py
index e6d0b98efff2..6d6cf3fcca49 100644
--- a/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py
+++ b/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py
@@ -29,8 +29,7 @@
 
 
 class _Container(nn.Module):
-    def __init__(self):
-        super().__init__()
+    pass
 
 
 class BaseDataSparsifier(base_sparsifier.BaseSparsifier):
diff --git a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/README.md b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/README.md
index f7f83d7d6f3b..692960c09b5d 100644
--- a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/README.md
+++ b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/README.md
@@ -50,7 +50,7 @@ The benchmark codes depend on the [DLRM codebase](https://github.com/facebookres
 
 ### **Disk savings**
 ```
-python evaluate_disk_savings.py --model_path=<path_to_model_checkpoint> --sparsified_model_dump_path=<path_to_dump_sparsified_models>
+python evaluate_disk_savings.py --model-path=<path_to_model_checkpoint> --sparsified-model-dump-path=<path_to_dump_sparsified_models>
 ```
 
 Running this script should dump
@@ -62,13 +62,13 @@ Running this script should dump
 
 ### **Model Quality**
 ```
-python evaluate_model_metrics.py --raw_data_file=<path_to_raw_data_txt_file> --processed_data_file=<path_to_kaggleAdDisplayChallenge_processed.npz> --sparse_model_metadata=<path_to_sparse_model_metadata_csv>
+python evaluate_model_metrics.py --raw-data-file=<path_to_raw_data_txt_file> --processed-data-file=<path_to_kaggleAdDisplayChallenge_processed.npz> --sparse-model-metadata=<path_to_sparse_model_metadata_csv>
 ```
 Running this script should dump ```sparse_model_metrics.csv``` that contains evaluation metrics for all sparsified models.
 
 ### **Model forward time**:
 ```
-python evaluate_forward_time.py --raw_data_file=<path_to_raw_data_txt_file> --processed_data_file=<path_to_kaggleAdDisplayChallenge_processed.npz> --sparse_model_metadata=<path_to_sparse_model_metadata_csv>
+python evaluate_forward_time.py --raw-data-file=<path_to_raw_data_txt_file> --processed-data-file=<path_to_kaggleAdDisplayChallenge_processed.npz> --sparse-model-metadata=<path_to_sparse_model_metadata_csv>
 ```
 Running this script should dump ```dlrm_forward_time_info.csv``` that contains forward time for all sparsified models with and without torch.sparse in the forward pass.
 
diff --git a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py
index eb4d2a04751b..a9aed69a7966 100644
--- a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py
+++ b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_disk_savings.py
@@ -152,8 +152,8 @@ def sparsify_model(path_to_model, sparsified_model_dump_path):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model_path', type=str)
-    parser.add_argument('--sparsified_model_dump_path', type=str)
+    parser.add_argument('--model-path', '--model_path', type=str)
+    parser.add_argument('--sparsified-model-dump-path', '--sparsified_model_dump_path', type=str)
     args = parser.parse_args()
 
     sparsify_model(args.model_path, args.sparsified_model_dump_path)
diff --git a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_forward_time.py b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_forward_time.py
index 4435365c2efc..4f205312e181 100644
--- a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_forward_time.py
+++ b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_forward_time.py
@@ -85,9 +85,9 @@ def measure_forward_pass(sparse_model_metadata, device, sparse_dlrm, **batch):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--raw_data_file', type=str)
-    parser.add_argument('--processed_data_file', type=str)
-    parser.add_argument('--sparse_model_metadata', type=str)
+    parser.add_argument('--raw-data-file', '--raw_data_file', type=str)
+    parser.add_argument('--processed-data-file', '--processed_data_file', type=str)
+    parser.add_argument('--sparse-model-metadata', '--sparse_model_metadata', type=str)
 
     args = parser.parse_args()
 
diff --git a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_model_metrics.py b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_model_metrics.py
index 05246d545ba7..d26b2161dced 100644
--- a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_model_metrics.py
+++ b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/evaluate_model_metrics.py
@@ -119,9 +119,9 @@ def evaluate_metrics(test_dataloader, sparse_model_metadata):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--raw_data_file', type=str)
-    parser.add_argument('--processed_data_file', type=str)
-    parser.add_argument('--sparse_model_metadata', type=str)
+    parser.add_argument('--raw-data-file', '--raw_data_file', type=str)
+    parser.add_argument('--processed-data-file', '--processed_data_file', type=str)
+    parser.add_argument('--sparse-model-metadata', '--sparse_model_metadata', type=str)
 
     args = parser.parse_args()
 
diff --git a/torch/ao/pruning/_experimental/data_sparsifier/quantization_utils.py b/torch/ao/pruning/_experimental/data_sparsifier/quantization_utils.py
index 8e79cedbb8ea..1a2791c359b6 100644
--- a/torch/ao/pruning/_experimental/data_sparsifier/quantization_utils.py
+++ b/torch/ao/pruning/_experimental/data_sparsifier/quantization_utils.py
@@ -85,16 +85,16 @@ def post_training_sparse_quantize(model,
         for _, emb_module in embedding_modules:
             emb_module.qconfig = torch.ao.quantization.float_qparams_weight_only_qconfig
 
-        torch.quantization.prepare(model, inplace=True)
-        torch.quantization.convert(model, inplace=True)
+        torch.ao.quantization.prepare(model, inplace=True)
+        torch.ao.quantization.convert(model, inplace=True)
 
     else:
         # quantize
         for _, emb_module in embedding_modules:
             emb_module.qconfig = torch.ao.quantization.float_qparams_weight_only_qconfig
 
-        torch.quantization.prepare(model, inplace=True)
-        torch.quantization.convert(model, inplace=True)
+        torch.ao.quantization.prepare(model, inplace=True)
+        torch.ao.quantization.convert(model, inplace=True)
 
         # retrieve scale & zero_points
         quantize_params: Dict[str, Dict] = {'scales': {}, 'zero_points': {},
diff --git a/torch/ao/pruning/_experimental/pruner/README.md b/torch/ao/pruning/_experimental/pruner/README.md
index 20f1dcee1db2..572b07414df8 100644
--- a/torch/ao/pruning/_experimental/pruner/README.md
+++ b/torch/ao/pruning/_experimental/pruner/README.md
@@ -4,7 +4,7 @@
 
 **Pruning** is the technique of removing parameters from a model to reduce the computational cost. The goal of pruning is to improve the performance of the model while maintaining it's accuracy.
 
-### Unstrictured vs. Structured Pruning
+### Unstructured vs. Structured Pruning
 One way to do this is to consider each parameter individually. This gives us the greatest granularity when pruning and is called **unstructured pruning**.
 
 For example, consider a simple linear regression model that is parametrized by a weight tensor W.
@@ -47,7 +47,7 @@ By removing a row from U and a column from W, we can avoid a shape mismatch.
 ![](./images/prune_6.png)
 
 
-One benefit of **structured pruning** is that it uses the same dense kernels that the original model uses, and does not rely on custom sparse kerenel like **unstructured pruning**.
+One benefit of **structured pruning** is that it uses the same dense kernels that the original model uses, and does not rely on custom sparse kernel like **unstructured pruning**.
 However, structured pruning degrades accuracy more than unstructured pruning because of the lack of granularity, so it is not always the right choice.
 
 Generally the structured pruning process looks something like this:
@@ -56,7 +56,7 @@ Generally the structured pruning process looks something like this:
 3. Remove rows by resizing the weight matrices of each layer
 4. Stop if target sparsity level is met.
 
-The accuracy degredation of pruning can be quite large initially. Once we are satisfied with our pruned tensor, we usually retrain the model after pruning in order to restore some of this accuracy loss.
+The accuracy degradation of pruning can be quite large initially. Once we are satisfied with our pruned tensor, we usually retrain the model after pruning in order to restore some of this accuracy loss.
 
 ## Quickstart Guide
 
@@ -76,7 +76,7 @@ Structured pruning works by traversing this graph and looking for specific **pat
 
 Each pattern is tied to a pruning function, which is responsible for structured pruning the graph nodes that match the pattern.
 
-The above [example](#weight-resizing) of two linear layers would match agains a `(nn.Linear, nn.Linear)` pattern. This is how we identify the rows to remove and the columns of the subsequent layer.
+The above [example](#weight-resizing) of two linear layers would match against a `(nn.Linear, nn.Linear)` pattern. This is how we identify the rows to remove and the columns of the subsequent layer.
 
 Structured pruning also works on other patterns other than two adjacent Linear layers,
 
@@ -146,7 +146,7 @@ pruner.step()
 # The output of pruner.prune() is a model with resized weights and the masks / parametrizations removed.
 pruned_model = pruner.prune()
 ```
-Afterwards, by printinting the name and size of each parameter in our model, we can see that it has been pruned.
+Afterwards, by printing the name and size of each parameter in our model, we can see that it has been pruned.
 
 ```
 # original model
diff --git a/torch/ao/pruning/_experimental/pruner/__init__.py b/torch/ao/pruning/_experimental/pruner/__init__.py
index 3849af7c4180..d39aa394f12b 100644
--- a/torch/ao/pruning/_experimental/pruner/__init__.py
+++ b/torch/ao/pruning/_experimental/pruner/__init__.py
@@ -4,3 +4,4 @@
     BiasHook,
 )
 from .saliency_pruner import SaliencyPruner
+from .lstm_saliency_pruner import LSTMSaliencyPruner
diff --git a/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py b/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
index 3b568f1557d0..62ac9573bf5b 100644
--- a/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
+++ b/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
@@ -1,4 +1,5 @@
 from itertools import chain
+from operator import getitem
 import torch
 import torch.nn.functional as F
 from torch import nn
@@ -7,8 +8,8 @@
 from typing import Type, Set, Dict, Callable, Tuple, Optional, Union
 
 from torch.ao.pruning import BaseSparsifier
-from .parametrization import FakeStructuredSparsity, BiasHook
-from .match_utils import apply_match
+from .parametrization import FakeStructuredSparsity, BiasHook, module_contains_param
+from .match_utils import apply_match, MatchAllNode
 from .prune_functions import (
     prune_linear,
     prune_linear_linear,
@@ -19,6 +20,8 @@
     prune_conv2d_activation_pool_conv2d,
     prune_conv2d_pool_activation_conv2d,
     prune_conv2d_pool_flatten_linear,
+    prune_lstm_output_linear,
+    prune_lstm_output_layernorm_linear,
 )
 
 
@@ -26,6 +29,7 @@ def _get_supported_structured_pruning_modules():
     SUPPORTED_STRUCTURED_PRUNING_MODULES = {  # added to config if None given
         nn.Linear,
         nn.Conv2d,
+        nn.LSTM,
     }
     return SUPPORTED_STRUCTURED_PRUNING_MODULES
 
@@ -83,14 +87,14 @@ def _get_supported_activation_modules():
 
 
 def _get_default_structured_pruning_patterns() -> Dict[
-    Tuple[Union[Type[nn.Module], Callable[[torch.Tensor], torch.Tensor], str], ...],
+    Tuple[Union[Type[nn.Module], Callable, MatchAllNode, str], ...],
     Callable[..., None],
 ]:
     """
     Returns the patterns for conv2d / linear conversion for each element in the activation functions/modules defined above.
     """
     patterns: Dict[
-        Tuple[Union[Type[nn.Module], Callable[[torch.Tensor], torch.Tensor], str], ...],
+        Tuple[Union[Type[nn.Module], Callable, MatchAllNode, str], ...],
         Callable[..., None],
     ] = {
         # linear -> linear
@@ -99,6 +103,13 @@ def _get_default_structured_pruning_patterns() -> Dict[
         # conv2d -> conv2d
         (nn.Conv2d, "output"): prune_conv2d,
         (nn.Conv2d, nn.Conv2d): prune_conv2d_conv2d,
+        # TODO LSTM Structured pruning does not support returned state currently.
+        # Should find a way to explicitly match getitem(0) instead of getitem.
+        # This will also require changing the pruning function.
+        # lstm -> getitem(0) -> linear
+        (nn.LSTM, getitem, nn.Linear): prune_lstm_output_linear,
+        # lstm -> getitem(0) -> layernorm -> linear
+        (nn.LSTM, getitem, nn.LayerNorm, nn.Linear): prune_lstm_output_layernorm_linear,
     }
 
     for activation in chain(
@@ -222,8 +233,6 @@ def _prepare(self, *args, **kwargs) -> None:
         r"""This function will attach the FakeStructuredSparsity parameterizations
         and BiasHooks at the appropriate points in the model.
         """
-        self.bias_handles = []
-
         for config in self.groups:
             module = config["module"]
             tensor_name = config["tensor_name"]
@@ -238,17 +247,20 @@ def _prepare(self, *args, **kwargs) -> None:
             parametrize.register_parametrization(
                 module, tensor_name, parametrization(mask)
             )
-            prune_bias = config.get("prune_bias", True)
-            if module.bias is not None:
-                module.register_parameter("_bias", nn.Parameter(module.bias.detach()))
-                module.bias = None
-                module.prune_bias = prune_bias
 
-            self.bias_handles.append(
+            # if linear / conv, we add in bias hooks
+            if isinstance(module, (nn.Linear, nn.Conv2d)):
+                prune_bias = config.get("prune_bias", True)
+                if module.bias is not None:
+                    module.register_parameter(
+                        "_bias", nn.Parameter(module.bias.detach())
+                    )
+                    module.bias = None
+                    module.prune_bias = prune_bias
+
                 module.register_forward_hook(
                     BiasHook(module.parametrizations.weight[0], prune_bias)
                 )
-            )
 
     def prune(self) -> None:
         r"""
@@ -264,7 +276,6 @@ def prune(self) -> None:
 
         # Right now we check for matches simply by iterating across all the patterns
         # if this is slow we can store patterns in a trie-structure and modify this code for faster lookup
-
         for node in self.traced.graph.nodes:
             for pattern, convert_fn in self.patterns.items():
                 matched = apply_match(modules, pattern, node, [])
@@ -276,10 +287,7 @@ def prune(self) -> None:
                 if (
                     first_module is not None
                     and parametrize.is_parametrized(first_module)
-                    and isinstance(
-                        first_module.parametrizations["weight"][0],
-                        FakeStructuredSparsity,
-                    )
+                    and module_contains_param(first_module, FakeStructuredSparsity)
                 ):
                     convert_block = []
                     for node in matched:
@@ -289,6 +297,12 @@ def prune(self) -> None:
                             convert_block.append(node.target)
                     convert_fn(*convert_block)
 
+        for module in self.traced.modules():
+            if module_contains_param(module, FakeStructuredSparsity):
+                raise Exception(
+                    f"Error: {module} still contains FakeStructuredSparsity parametrizations!"
+                )
+
         self.traced.graph.lint()
         self.traced.recompile()
         return self.traced
diff --git a/torch/ao/pruning/_experimental/pruner/lstm_saliency_pruner.py b/torch/ao/pruning/_experimental/pruner/lstm_saliency_pruner.py
new file mode 100644
index 000000000000..8ad90927b459
--- /dev/null
+++ b/torch/ao/pruning/_experimental/pruner/lstm_saliency_pruner.py
@@ -0,0 +1,48 @@
+from typing import cast
+
+import torch
+from .base_structured_sparsifier import BaseStructuredSparsifier, FakeStructuredSparsity
+
+class LSTMSaliencyPruner(BaseStructuredSparsifier):
+    """
+    Prune packed LSTM weights based on saliency.
+    For each layer {k} inside a LSTM, we have two packed weight matrices
+    - weight_ih_l{k}
+    - weight_hh_l{k}
+
+    These tensors pack the weights for the 4 linear layers together for efficency.
+
+    [W_ii | W_if | W_ig | W_io]
+
+    Pruning this tensor directly will lead to weights being misassigned when unpacked.
+    To ensure that each packed linear layer is pruned the same amount:
+        1. We split the packed weight into the 4 constitutient linear parts
+        2. Update the mask for each individual piece using saliency individually
+
+    This applies to both weight_ih_l{k} and weight_hh_l{k}.
+    """
+
+    def update_mask(self, module, tensor_name, **kwargs):
+        weights = getattr(module, tensor_name)
+
+        for p in getattr(module.parametrizations, tensor_name):
+            if isinstance(p, FakeStructuredSparsity):
+                mask = cast(torch.Tensor, p.mask)
+
+                # select weights based on magnitude
+                if weights.dim() <= 1:
+                    raise Exception("Structured pruning can only be applied to a 2+dim weight tensor!")
+                # take norm over all but first dim
+                dims = tuple(range(1, weights.dim()))
+                saliency = weights.norm(dim=dims, p=1)
+
+                # handle weights in 4 groups
+                split_size = len(mask) // 4
+                masks = torch.split(mask, split_size)
+                saliencies = torch.split(saliency, split_size)
+
+                for keep_mask, sal in zip(masks, saliencies):
+                    # mask smallest k values to be removed
+                    k = int(len(keep_mask) * kwargs["sparsity_level"])
+                    prune = sal.topk(k, largest=False, sorted=False).indices
+                    keep_mask.data[prune] = False  # modifies underlying p.mask directly
diff --git a/torch/ao/pruning/_experimental/pruner/parametrization.py b/torch/ao/pruning/_experimental/pruner/parametrization.py
index aeddd0a84152..f169c8520156 100644
--- a/torch/ao/pruning/_experimental/pruner/parametrization.py
+++ b/torch/ao/pruning/_experimental/pruner/parametrization.py
@@ -1,7 +1,19 @@
 import torch
 from torch import nn
+from torch.nn.utils.parametrize import is_parametrized
 
 
+def module_contains_param(module, parametrization):
+    if is_parametrized(module):
+        # see if any of the module tensors have a parametriztion attached that matches the one passed in
+        return any(
+            [
+                any(isinstance(param, parametrization) for param in param_list)
+                for key, param_list in module.parametrizations.items()
+            ]
+        )
+    return False
+
 
 # Structured Pruning Parameterizations
 class FakeStructuredSparsity(nn.Module):
@@ -27,15 +39,15 @@ def state_dict(self, *args, **kwargs):
         # avoid double saving masks
         return {}
 
-class BiasHook:
 
+class BiasHook:
     def __init__(self, parametrization, prune_bias):
         self.param = parametrization
         self.prune_bias = prune_bias
 
     def __call__(self, module, input, output):
 
-        if getattr(module, '_bias', None) is not None:
+        if getattr(module, "_bias", None) is not None:
             bias = module._bias.data
             if self.prune_bias:
                 bias[~self.param.mask] = 0
diff --git a/torch/ao/pruning/_experimental/pruner/prune_functions.py b/torch/ao/pruning/_experimental/pruner/prune_functions.py
index ee8bffb7f9f3..7c03cd953714 100644
--- a/torch/ao/pruning/_experimental/pruner/prune_functions.py
+++ b/torch/ao/pruning/_experimental/pruner/prune_functions.py
@@ -10,7 +10,6 @@
 from torch.nn.utils.parametrize import ParametrizationList
 from .parametrization import FakeStructuredSparsity, BiasHook
 
-
 # BIAS PROPOGATION
 def _remove_bias_handles(module: nn.Module) -> None:
     if hasattr(module, "_forward_hooks"):
@@ -357,3 +356,120 @@ def prune_conv2d_pool_flatten_linear(
         else:
             linear.weight = nn.Parameter(linear.weight[:, flattened_mask])
             linear.in_features = linear.weight.shape[1]
+
+
+def prune_lstm_output_linear(
+    lstm: nn.LSTM, getitem: Callable, linear: nn.Linear
+) -> None:
+    prune_lstm_output_layernorm_linear(lstm, getitem, None, linear)
+
+
+def prune_lstm_output_layernorm_linear(
+    lstm: nn.LSTM,
+    getitem: Callable,
+    layernorm: Optional[nn.LayerNorm],
+    linear: nn.Linear,
+) -> None:
+    for i in range(lstm.num_layers):
+        if parametrize.is_parametrized(lstm, f"weight_ih_l{i}"):
+            parametrization_dict = cast(nn.ModuleDict, lstm.parametrizations)
+            weight_parameterizations = cast(
+                ParametrizationList, parametrization_dict[f"weight_ih_l{i}"]
+            )
+            mask = weight_parameterizations[0].mask
+
+            with torch.no_grad():
+                parametrize.remove_parametrizations(
+                    lstm, f"weight_ih_l{i}", leave_parametrized=True
+                )
+                setattr(
+                    lstm,
+                    f"weight_ih_l{i}",
+                    nn.Parameter(getattr(lstm, f"weight_ih_l{i}")[mask]),
+                )
+                setattr(
+                    lstm,
+                    f"bias_ih_l{i}",
+                    nn.Parameter(getattr(lstm, f"bias_ih_l{i}")[mask]),
+                )
+
+        if parametrize.is_parametrized(lstm, f"weight_hh_l{i}"):
+            parametrization_dict = cast(nn.ModuleDict, lstm.parametrizations)
+            weight_parameterizations = cast(
+                ParametrizationList, parametrization_dict[f"weight_hh_l{i}"]
+            )
+            mask = weight_parameterizations[0].mask
+
+            with torch.no_grad():
+                parametrize.remove_parametrizations(
+                    lstm, f"weight_hh_l{i}", leave_parametrized=True
+                )
+                # splitting out hidden-hidden masks
+                W_hi, W_hf, W_hg, W_ho = torch.split(
+                    getattr(lstm, f"weight_hh_l{i}"), lstm.hidden_size
+                )
+                M_hi, M_hf, M_hg, M_ho = torch.split(mask, lstm.hidden_size)
+
+                # resize each individual weight separately
+                W_hi = W_hi[M_hi][:, M_hi]
+                W_hf = W_hf[M_hf][:, M_hf]
+                W_hg = W_hg[M_hg][:, M_hg]
+                W_ho = W_ho[M_ho][:, M_ho]
+
+                # concat, use this as new weight
+                new_weight = torch.cat((W_hi, W_hf, W_hg, W_ho))
+                setattr(lstm, f"weight_hh_l{i}", nn.Parameter(new_weight))
+                setattr(
+                    lstm,
+                    f"bias_hh_l{i}",
+                    nn.Parameter(getattr(lstm, f"bias_hh_l{i}")[mask]),
+                )
+
+            # If this is the final layer, then we need to prune linear layer columns
+            if i + 1 == lstm.num_layers:
+                lstm.hidden_size = int(M_hi.sum())
+                with torch.no_grad():
+                    if parametrize.is_parametrized(linear):
+                        parametrization_dict = cast(
+                            nn.ModuleDict, linear.parametrizations
+                        )
+                        weight_parameterizations = cast(
+                            ParametrizationList, parametrization_dict.weight
+                        )
+
+                        weight_parameterizations.original = nn.Parameter(
+                            weight_parameterizations.original[:, M_ho]
+                        )
+                        linear.in_features = weight_parameterizations.original.shape[1]
+                    else:
+                        linear.weight = nn.Parameter(linear.weight[:, M_ho])
+                        linear.in_features = linear.weight.shape[1]
+
+                    # if layernorm module, prune weight and bias
+                    if layernorm is not None:
+                        layernorm.normalized_shape = (linear.in_features,)
+                        layernorm.weight = nn.Parameter(layernorm.weight[M_ho])
+                        layernorm.bias = nn.Parameter(layernorm.bias[M_ho])
+
+            # otherwise need to prune the columns of the input of the next LSTM layer
+            else:
+                with torch.no_grad():
+                    if parametrize.is_parametrized(lstm, f"weight_ih_l{i+1}"):
+                        parametrization_dict = cast(
+                            nn.ModuleDict, lstm.parametrizations
+                        )
+                        weight_parameterizations = cast(
+                            ParametrizationList,
+                            getattr(parametrization_dict, f"weight_ih_l{i+1}"),
+                        )
+
+                        weight_parameterizations.original = nn.Parameter(
+                            weight_parameterizations.original[:, M_ho]
+                        )
+                    else:
+                        next_layer_weight = getattr(lstm, f"weight_ih_l{i+1}")
+                        setattr(
+                            lstm,
+                            f"weight_ih_l{i+1}",
+                            nn.Parameter(next_layer_weight[:, M_ho]),
+                        )
diff --git a/torch/ao/pruning/_experimental/pruner/saliency_pruner.py b/torch/ao/pruning/_experimental/pruner/saliency_pruner.py
index d790295718b9..f965fa647de9 100644
--- a/torch/ao/pruning/_experimental/pruner/saliency_pruner.py
+++ b/torch/ao/pruning/_experimental/pruner/saliency_pruner.py
@@ -17,6 +17,8 @@ def update_mask(self, module, tensor_name, **kwargs):
         mask = getattr(module.parametrizations, tensor_name)[0].mask
 
         # use negative weights so we can use topk (we prune out the smallest)
+        if weights.dim() <= 1:
+            raise Exception("Structured pruning can only be applied to a 2+dim weight tensor!")
         saliency = -weights.norm(dim=tuple(range(1, weights.dim())), p=1)
         assert saliency.shape == mask.shape
 
diff --git a/torch/ao/pruning/scheduler/base_scheduler.py b/torch/ao/pruning/scheduler/base_scheduler.py
index 2adec4b27a67..0bd3640b0a33 100644
--- a/torch/ao/pruning/scheduler/base_scheduler.py
+++ b/torch/ao/pruning/scheduler/base_scheduler.py
@@ -7,7 +7,7 @@
 
 __all__ = ["BaseScheduler"]
 
-class BaseScheduler(object):
+class BaseScheduler:
 
     def __init__(self, sparsifier, last_epoch=-1, verbose=False):
 
diff --git a/torch/ao/pruning/scheduler/lambda_scheduler.py b/torch/ao/pruning/scheduler/lambda_scheduler.py
index 97f9072ef304..90a5a8ef6994 100644
--- a/torch/ao/pruning/scheduler/lambda_scheduler.py
+++ b/torch/ao/pruning/scheduler/lambda_scheduler.py
@@ -37,7 +37,7 @@ def __init__(self, sparsifier, sl_lambda, last_epoch=-1, verbose=False):
                 raise ValueError("Expected {} lr_lambdas, but got {}".format(
                     len(sparsifier.groups), len(sl_lambda)))
             self.sl_lambdas = list(sl_lambda)
-        super(LambdaSL, self).__init__(sparsifier, last_epoch, verbose)
+        super().__init__(sparsifier, last_epoch, verbose)
 
     def get_sl(self):
         if not self._get_sl_called_within_step:
diff --git a/torch/ao/quantization/_correct_bias.py b/torch/ao/quantization/_correct_bias.py
index 7dfc58dfe52a..d807b9811cd9 100644
--- a/torch/ao/quantization/_correct_bias.py
+++ b/torch/ao/quantization/_correct_bias.py
@@ -45,7 +45,7 @@ class MeanShadowLogger(ns.Logger):
     of the data passed to the floating point and quantized models
     """
     def __init__(self):
-        super(MeanShadowLogger, self).__init__()
+        super().__init__()
         self.stats["float"] = None
         self.stats["quantized"] = None
         self.count = 0
diff --git a/torch/ao/quantization/_equalize.py b/torch/ao/quantization/_equalize.py
index b15ffc65b7ad..519d33118086 100644
--- a/torch/ao/quantization/_equalize.py
+++ b/torch/ao/quantization/_equalize.py
@@ -16,7 +16,7 @@
 ]
 
 _supported_types = {torch.nn.Conv2d, torch.nn.Linear}
-_supported_intrinsic_types = {torch.nn.intrinsic.ConvReLU2d, torch.nn.intrinsic.LinearReLU}
+_supported_intrinsic_types = {torch.ao.nn.intrinsic.ConvReLU2d, torch.ao.nn.intrinsic.LinearReLU}
 _all_supported_types = _supported_types.union(_supported_intrinsic_types)
 
 def set_module_weight(module, weight) -> None:
diff --git a/torch/ao/quantization/_learnable_fake_quantize.py b/torch/ao/quantization/_learnable_fake_quantize.py
index 10600363d356..df86cd50a2a7 100644
--- a/torch/ao/quantization/_learnable_fake_quantize.py
+++ b/torch/ao/quantization/_learnable_fake_quantize.py
@@ -30,7 +30,7 @@ class _LearnableFakeQuantize(torch.ao.quantization.FakeQuantizeBase):
     """
     def __init__(self, observer, quant_min=0, quant_max=255, scale=1., zero_point=0., channel_len=-1,
                  use_grad_scaling=False, **observer_kwargs):
-        super(_LearnableFakeQuantize, self).__init__()
+        super().__init__()
         assert quant_min < quant_max, 'quant_min must be strictly less than quant_max.'
         self.quant_min = quant_min
         self.quant_max = quant_max
@@ -75,7 +75,7 @@ def enable_param_learning(self):
 
     @torch.jit.export
     def enable_static_estimate(self):
-        r"""Enables static observer estimates and disbales learning of
+        r"""Enables static observer estimates and disables learning of
         quantization parameters. Forward path returns fake quantized X.
         """
         self.toggle_qparam_learning(enabled=False) \
diff --git a/torch/ao/quantization/_pt2e/utils.py b/torch/ao/quantization/_pt2e/utils.py
index 686337080d80..434b9babf9ae 100644
--- a/torch/ao/quantization/_pt2e/utils.py
+++ b/torch/ao/quantization/_pt2e/utils.py
@@ -5,26 +5,8 @@
 from torch.ao.quantization.fx.prepare import (
     _is_activation_post_process_node,
 )
-from collections import OrderedDict
 import operator
 
-# TODO[qihan]: longer term, this should happen in the dynamo stack as well
-def _get_renamed_nn_module_stack(nn_module_stack):
-    # initialize with top level parent scope
-    nn_module_stack_renamed = OrderedDict([("", None)])
-    if nn_module_stack:
-        # Rename module_key, e.g. "self_layer1_1__conv1" to "self.layer1.1._conv1", for easier downstream parsing
-        prev_key = ""
-        for key, value in nn_module_stack.items():
-            if not prev_key:
-                if key.startswith("self_"):
-                    new_key = key[5:]
-                    prev_key = new_key
-            else:
-                new_key = prev_key + "." + key[len(prev_key) + 6 :]
-            nn_module_stack_renamed[new_key] = value
-            prev_key = new_key
-    return nn_module_stack_renamed
 
 def _get_tensor_constant_from_node(node, m):
     if node is None:
diff --git a/torch/ao/quantization/_quantize_pt2e.py b/torch/ao/quantization/_quantize_pt2e.py
index d750317bbdeb..f0fd04038314 100644
--- a/torch/ao/quantization/_quantize_pt2e.py
+++ b/torch/ao/quantization/_quantize_pt2e.py
@@ -5,7 +5,6 @@
 from .fx import prepare
 from .quantize_fx import _convert_to_reference_decomposed_fx
 from ._pt2e.utils import (
-    _get_renamed_nn_module_stack,
     _fuse_conv_bn_,
     _rearrange_weight_observer_for_addmm,
 )
@@ -21,8 +20,11 @@ def prepare_pt2e(
     # TODO: move this information to fx node itself
     node_name_to_scope: Dict[str, Tuple[str, type]] = {}
     for n in model.graph.nodes:
-        renamed_stack = _get_renamed_nn_module_stack(n.meta.get("nn_module_stack", None))
-        current_scope = list(renamed_stack.items())[-1]
+        nn_module_stack = n.meta.get("nn_module_stack", None)
+        current_scope = ("", type(None))
+        if nn_module_stack:
+            bt = list(nn_module_stack.values())[-1]
+            current_scope = (bt[0].split(".")[-1], bt[1])
         node_name_to_scope[n.name] = current_scope
 
     # TODO: check qconfig_mapping to make sure conv and bn are both configured
diff --git a/torch/ao/quantization/backend_config/_common_operator_config_utils.py b/torch/ao/quantization/backend_config/_common_operator_config_utils.py
index 11f1ea3cedf9..4872d418d559 100644
--- a/torch/ao/quantization/backend_config/_common_operator_config_utils.py
+++ b/torch/ao/quantization/backend_config/_common_operator_config_utils.py
@@ -3,9 +3,9 @@
 import torch
 import torch.nn.functional as F
 import torch.nn as nn
-import torch.nn.intrinsic as nni
+import torch.ao.nn.intrinsic as nni
 import torch.ao.nn.intrinsic.qat as nniqat
-import torch.nn.qat as nnqat
+import torch.ao.nn.qat as nnqat
 import torch.ao.nn.quantized.reference as nnqr
 from collections import namedtuple
 from typing import Callable, Dict, List, Union
@@ -503,6 +503,7 @@ def _get_share_qprams_op_backend_config(op):
         torch.nn.functional.max_pool1d,
         torch.nn.functional.max_pool2d,
         torch.nn.functional.max_pool3d,
+        torch.nn.functional.pixel_shuffle,
         torch.nn.functional.relu,
         torch.nn.functional.relu6,
         torch.avg_pool1d,
@@ -529,8 +530,6 @@ def _get_share_qprams_op_backend_config(op):
         "resize_",
         "relu",
         "relu_",
-        "shape",
-        "size",
         "squeeze",
         "squeeze_",
         "transpose",
@@ -603,14 +602,26 @@ def _get_embedding_op_configs(dtype_configs: List[DTypeConfig]) -> List[BackendP
                 .set_dtype_configs(dtype_configs)
                 .set_qat_module(qat_embedding_op)
                 .set_root_module(embedding_op)
-                .set_reference_quantized_module(ref_embedding_op)
-                ._set_input_output_observed(False))  # This is temporary, and will be removed soon
+                .set_reference_quantized_module(ref_embedding_op))
+
         # config for qat op
         embedding_op_configs.append(
             BackendPatternConfig(qat_embedding_op)
                 .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
                 .set_dtype_configs(dtype_configs)
                 .set_root_module(embedding_op)
-                .set_reference_quantized_module(ref_embedding_op)
-                ._set_input_output_observed(False))  # This is temporary, and will be removed soon
+                .set_reference_quantized_module(ref_embedding_op))
     return embedding_op_configs
+
+def _get_tensor_info_op_configs(dtype_configs):
+    """
+    These ops work on tensors of different dtypes but return non-tensors
+    containing information about the input tensor.
+    """
+
+    def _get_config(op):
+        return BackendPatternConfig(op) \
+            .set_observation_type(ObservationType.INPUT_OUTPUT_NOT_OBSERVED) \
+            .set_dtype_configs(dtype_configs)
+
+    return [_get_config(op) for op in ("shape", "size")]
diff --git a/torch/ao/quantization/backend_config/backend_config.py b/torch/ao/quantization/backend_config/backend_config.py
index faf2fd03ade9..ef31166b5cda 100644
--- a/torch/ao/quantization/backend_config/backend_config.py
+++ b/torch/ao/quantization/backend_config/backend_config.py
@@ -41,7 +41,6 @@
 EXTRA_INPUTS_GETTER_DICT_KEY = "extra_inputs_getter"
 NUM_TENSOR_ARGS_TO_OBSERVATION_TYPE_DICT_KEY = "num_tensor_args_to_observation_type"
 INPUT_TYPE_TO_INDEX_DICT_KEY = "input_type_to_index"
-INPUT_OUTPUT_OBSERVED_DICT_KEY = "input_output_observed"
 
 
 # TODO: maybe rename this to something that's not related to observer
@@ -63,6 +62,11 @@ class ObservationType(Enum):
     example: torch.cat, maxpool
     """
 
+    INPUT_OUTPUT_NOT_OBSERVED = 2
+    """this means the input and output are never observed
+    example: x.shape, x.size
+    """
+
 
 @dataclass
 class DTypeWithConstraints:
@@ -417,7 +421,6 @@ def __init__(self, pattern: Optional[Pattern] = None):
         self._extra_inputs_getter: Optional[Callable] = None
         self._num_tensor_args_to_observation_type: Dict[int, ObservationType] = {}
         self._input_type_to_index: Dict[str, int] = {}
-        self._input_output_observed: Optional[bool] = None
         self._pattern_complex_format: Optional[Pattern] = None
 
     def __repr__(self):
@@ -558,10 +561,6 @@ def _set_input_type_to_index(self, input_type_to_index: Dict[str, int]) -> Backe
         self._input_type_to_index = input_type_to_index
         return self
 
-    def _set_input_output_observed(self, input_output_observed: bool) -> BackendPatternConfig:
-        self._input_output_observed = input_output_observed
-        return self
-
     def _set_pattern_complex_format(self, pattern: Pattern) -> BackendPatternConfig:
         """
         Set the pattern to configure, using the reversed nested tuple format.
@@ -620,7 +619,6 @@ def _get_dtype_config(obj: Any) -> DTypeConfig:
         conf._set_num_tensor_args_to_observation_type(
             backend_pattern_config_dict.get(NUM_TENSOR_ARGS_TO_OBSERVATION_TYPE_DICT_KEY, {}))
         conf._set_input_type_to_index(backend_pattern_config_dict.get(INPUT_TYPE_TO_INDEX_DICT_KEY, {}))
-        conf._set_input_output_observed(backend_pattern_config_dict.get(INPUT_OUTPUT_OBSERVED_DICT_KEY, None))
         if PATTERN_COMPLEX_FORMAT_DICT_KEY in backend_pattern_config_dict:
             conf._set_pattern_complex_format(backend_pattern_config_dict[PATTERN_COMPLEX_FORMAT_DICT_KEY])
         return conf
@@ -654,8 +652,6 @@ def to_dict(self) -> Dict[str, Any]:
             backend_pattern_config_dict[NUM_TENSOR_ARGS_TO_OBSERVATION_TYPE_DICT_KEY] = self._num_tensor_args_to_observation_type
         if len(self._input_type_to_index) > 0:
             backend_pattern_config_dict[INPUT_TYPE_TO_INDEX_DICT_KEY] = self._input_type_to_index
-        if self._input_output_observed is not None:
-            backend_pattern_config_dict[INPUT_OUTPUT_OBSERVED_DICT_KEY] = self._input_output_observed
         if self._pattern_complex_format is not None:
             backend_pattern_config_dict[PATTERN_COMPLEX_FORMAT_DICT_KEY] = self._pattern_complex_format
         return backend_pattern_config_dict
diff --git a/torch/ao/quantization/backend_config/executorch.py b/torch/ao/quantization/backend_config/executorch.py
index 3e6e1d7aa24a..cd4df7fb0f86 100644
--- a/torch/ao/quantization/backend_config/executorch.py
+++ b/torch/ao/quantization/backend_config/executorch.py
@@ -6,12 +6,13 @@
 import torch
 import torch.nn.functional as F
 import torch.nn as nn
-import torch.nn.qat as nnqat
-import torch.nn.quantized._reference as nnqr
+import torch.ao.nn.qat as nnqat
+import torch.ao.nn.quantized.reference as nnqr
 from .backend_config import (
     BackendConfig,
     BackendPatternConfig,
     DTypeConfig,
+    DTypeWithConstraints,
     ObservationType,
 )
 from .qnnpack import (
@@ -43,7 +44,7 @@
     output_dtype=torch.quint8,
 )
 
-executorch_default_dynamic_int8_dtype_config = DTypeConfig(
+executorch_default_dynamic_quint8_dtype_config = DTypeConfig(
     input_dtype=torch.quint8,
     output_dtype=torch.float,
     weight_dtype=torch.qint8,
@@ -51,6 +52,26 @@
     is_dynamic=True,
 )
 
+executorch_act_qint8_scale_min_2_neg_12 = DTypeWithConstraints(
+    dtype=torch.qint8,
+    scale_min_lower_bound=2 ** -12,
+)
+
+executorch_weight_qint8_neg_127_to_127_scale_min_2_neg_12 = DTypeWithConstraints(
+    dtype=torch.qint8,
+    quant_min_lower_bound=-127,
+    quant_max_upper_bound=127,
+    scale_min_lower_bound=2 ** -12,
+)
+
+executorch_default_dynamic_qint8_dtype_config = DTypeConfig(
+    input_dtype=executorch_act_qint8_scale_min_2_neg_12,
+    output_dtype=torch.float,
+    weight_dtype=executorch_weight_qint8_neg_127_to_127_scale_min_2_neg_12,
+    bias_dtype=torch.float,
+    is_dynamic=True,
+)
+
 executorch_default_dynamic_float16_dtype_config = DTypeConfig(
     input_dtype=torch.float16,
     output_dtype=torch.float,
@@ -78,7 +99,8 @@ def _get_linear_configs() -> List[BackendPatternConfig]:
     dtype_configs = [
         qnnpack_weighted_op_qint8_symmetric_dtype_config,
         executorch_weighted_op_int8_dtype_config,
-        executorch_default_dynamic_int8_dtype_config,
+        executorch_default_dynamic_quint8_dtype_config,
+        executorch_default_dynamic_qint8_dtype_config,
         executorch_default_dynamic_float16_dtype_config,
     ]
     linear_configs: List[BackendPatternConfig] = []
@@ -258,16 +280,21 @@ def _get_embedding_op_configs() -> List[BackendPatternConfig]:
                 .set_dtype_configs(dtype_configs)
                 .set_qat_module(qat_embedding_op)
                 .set_root_module(embedding_op)
-                .set_reference_quantized_module(ref_embedding_op)
-                ._set_input_output_observed(False))  # This is temporary, and will be removed soon
+                .set_reference_quantized_module(ref_embedding_op))
         # config for qat op
         embedding_op_configs.append(
             BackendPatternConfig(qat_embedding_op)
                 .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
                 .set_dtype_configs(dtype_configs)
                 .set_root_module(embedding_op)
-                .set_reference_quantized_module(ref_embedding_op)
-                ._set_input_output_observed(False))  # This is temporary, and will be removed soon
+                .set_reference_quantized_module(ref_embedding_op))
+
+        # config for functional embedding
+        embedding_op_configs.append(
+            BackendPatternConfig(torch.nn.functional.embedding)
+                .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
+                .set_dtype_configs(dtype_configs)
+                ._set_input_type_to_index({"weight": 1}))
     return embedding_op_configs
 
 # =====================
diff --git a/torch/ao/quantization/backend_config/fbgemm.py b/torch/ao/quantization/backend_config/fbgemm.py
index d2bc87879c44..74759fa73580 100644
--- a/torch/ao/quantization/backend_config/fbgemm.py
+++ b/torch/ao/quantization/backend_config/fbgemm.py
@@ -10,6 +10,7 @@
     _get_linear_configs,
     _get_rnn_op_configs,
     _get_share_qparams_op_configs,
+    _get_tensor_info_op_configs,
 )
 from .backend_config import BackendConfig, DTypeConfig
 
@@ -92,6 +93,7 @@ def get_fbgemm_backend_config() -> BackendConfig:
     default_op_dtype_configs = [fbgemm_default_op_quint8_dtype_config]
     fixed_qparams_op_dtype_configs = [fbgemm_default_op_quint8_dtype_config]
     share_qparams_op_dtype_configs = [fbgemm_default_op_quint8_dtype_config]
+    tensor_info_op_dtype_configs = [fbgemm_default_op_quint8_dtype_config]
     rnn_op_dtype_configs = [
         fbgemm_default_dynamic_int8_dtype_config,
         fbgemm_default_dynamic_float16_dtype_config,
@@ -108,6 +110,7 @@ def get_fbgemm_backend_config() -> BackendConfig:
         .set_backend_pattern_configs(_get_default_op_configs(default_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_fixed_qparams_op_configs(fixed_qparams_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_share_qparams_op_configs(share_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_tensor_info_op_configs(tensor_info_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_bn_configs(default_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_rnn_op_configs(rnn_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_embedding_op_configs(embedding_op_dtype_configs))
diff --git a/torch/ao/quantization/backend_config/native.py b/torch/ao/quantization/backend_config/native.py
index ad5a12e6053b..81cfc928adb5 100644
--- a/torch/ao/quantization/backend_config/native.py
+++ b/torch/ao/quantization/backend_config/native.py
@@ -11,6 +11,7 @@
     _get_ln_configs,
     _get_rnn_op_configs,
     _get_share_qparams_op_configs,
+    _get_tensor_info_op_configs,
 )
 from .backend_config import BackendConfig, DTypeConfig
 
@@ -124,6 +125,9 @@ def get_test_only_legacy_native_backend_config() -> BackendConfig:
         default_op_quint8_dtype_config,
         default_op_fp16_dtype_config
     ]
+    tensor_info_op_dtype_configs = [
+        default_op_quint8_dtype_config,
+    ]
     rnn_op_dtype_configs = [
         default_dynamic_int8_dtype_config,
         default_dynamic_float16_dtype_config,
@@ -141,6 +145,7 @@ def get_test_only_legacy_native_backend_config() -> BackendConfig:
         .set_backend_pattern_configs(_get_default_op_configs(default_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_fixed_qparams_op_configs(fixed_qparams_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_share_qparams_op_configs(share_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_tensor_info_op_configs(tensor_info_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_bn_configs(default_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_ln_configs(layer_norm_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_rnn_op_configs(rnn_op_dtype_configs)) \
@@ -161,6 +166,7 @@ def get_native_backend_config() -> BackendConfig:
     default_op_dtype_configs = [default_op_quint8_dtype_config]
     fixed_qparams_op_dtype_configs = [default_op_quint8_dtype_config]
     share_qparams_op_dtype_configs = [default_op_quint8_dtype_config]
+    tensor_info_op_dtype_configs = [default_op_quint8_dtype_config]
     rnn_op_dtype_configs = [
         default_dynamic_int8_dtype_config,
         default_dynamic_float16_dtype_config,
@@ -178,6 +184,7 @@ def get_native_backend_config() -> BackendConfig:
         .set_backend_pattern_configs(_get_default_op_configs(default_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_fixed_qparams_op_configs(fixed_qparams_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_share_qparams_op_configs(share_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_tensor_info_op_configs(tensor_info_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_bn_configs(default_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_ln_configs(layer_norm_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_rnn_op_configs(rnn_op_dtype_configs)) \
diff --git a/torch/ao/quantization/backend_config/onednn.py b/torch/ao/quantization/backend_config/onednn.py
index a23de8f5366b..6a896608c9b5 100644
--- a/torch/ao/quantization/backend_config/onednn.py
+++ b/torch/ao/quantization/backend_config/onednn.py
@@ -2,7 +2,7 @@
 import torch.nn as nn
 import torch.ao.nn.intrinsic as nni
 import torch.nn.functional as F
-import torch.nn.quantized._reference as nnqr
+import torch.ao.nn.quantized.reference as nnqr
 from ._common_operator_config_utils import (
     _get_conv_configs,
     _get_linear_configs,
@@ -25,7 +25,9 @@
 from ..fuser_method_mappings import (
     _sequential_wrapper2,
 )
-
+import operator
+from torch.ao.quantization.utils import MatchAllNode
+import itertools
 
 # ===================
 # |  DTYPE CONFIGS  |
@@ -103,10 +105,341 @@ def _fuse_linear_bn_leaky_relu(is_qat, linear, bn, leaky_relu):
 # ======================
 # |  CONFIGS FOR CONV  |
 # ======================
+observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
 
 conv_dtype_configs = [onednn_weighted_op_int8_dtype_config]
 conv_configs = _get_conv_configs(conv_dtype_configs)
 
+# (1) Conv2d + Add
+
+# conv2d   Y
+#   \   /
+#    add
+
+# include:
+# conv2d conv2d
+#   \   /
+#    add
+
+def _fuse_conv_add_left(is_qat, add, conv, _):
+    return nni.ConvAdd2d(conv, add)
+
+def _conv_add_root_node_getter_left(pattern):
+    _, conv, _ = pattern
+    return conv
+
+def _conv_add_extra_inputs_getter_left(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    _, conv, extra_input = pattern
+    return [extra_input]
+
+# conv2d
+#  \
+#  bn   Y
+#   \   /
+#    add
+
+def _fuse_conv_bn_add_left(is_qat, add, bn_conv, _):
+    bn, conv = bn_conv
+    if is_qat:
+        raise NotImplementedError("Cannot fuse train modules: {}".format((conv, bn, add)))
+    else:
+        fused_conv = nn.utils.fusion.fuse_conv_bn_eval(conv, bn)
+        return nni.ConvAdd2d(fused_conv, add)
+
+def _conv_bn_add_root_node_getter_left(add_pattern):
+    _, bn_conv, _ = add_pattern
+    bn, conv = bn_conv
+    return conv
+
+def _conv_bn_add_extra_inputs_getter_left(add_pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    _, bn_conv, extra_input = add_pattern
+    bn, conv = bn_conv
+    return [extra_input]
+
+conv_add_left_optioins = itertools.product(
+    [True, False],  # with_bn
+    [torch.add, operator.add],  # add_op
+)
+
+for with_bn, add_op in conv_add_left_optioins:
+    if with_bn:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((add_op, (nn.BatchNorm2d, nn.Conv2d), MatchAllNode))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_bn_add_left)
+                ._set_root_node_getter(_conv_bn_add_root_node_getter_left)
+                ._set_extra_inputs_getter(_conv_bn_add_extra_inputs_getter_left)
+                .set_fused_module(nni.ConvAdd2d))
+    else:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((add_op, nn.Conv2d, MatchAllNode))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_add_left)
+                ._set_root_node_getter(_conv_add_root_node_getter_left)
+                ._set_extra_inputs_getter(_conv_add_extra_inputs_getter_left)
+                .set_fused_module(nni.ConvAdd2d))
+
+#  Y   conv2d
+#   \   /
+#    add
+
+def _fuse_conv_add_right(is_qat, add, _, conv):
+    return nni.ConvAdd2d(conv, add)
+
+def _conv_add_root_node_getter_right(pattern):
+    add, _, conv = pattern
+    return conv
+
+def _conv_add_extra_inputs_getter_right(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    _, extra_input, conv = pattern
+    return [extra_input]
+
+#      conv2d
+#        /
+#  Y    bn
+#   \   /
+#    add
+
+def _fuse_conv_bn_add_right(is_qat, add, _, bn_conv):
+    bn, conv = bn_conv
+    if is_qat:
+        raise NotImplementedError("Cannot fuse train modules: {}".format((conv, bn, add)))
+    else:
+        fused_conv = nn.utils.fusion.fuse_conv_bn_eval(conv, bn)
+        return nni.ConvAdd2d(fused_conv, add)
+
+def _conv_bn_add_root_node_getter_right(pattern):
+    add, _, bn_conv = pattern
+    bn, conv = bn_conv
+    return conv
+
+def _conv_bn_add_extra_inputs_getter_right(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    _, extra_input, bn_conv = pattern
+    bn, conv = bn_conv
+    return [extra_input]
+
+conv_add_optioins = itertools.product(
+    [True, False],  # with_bn
+    [torch.add, operator.add],  # add_op
+)
+
+for with_bn, add_op in conv_add_optioins:
+    if with_bn:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((add_op, MatchAllNode, (nn.BatchNorm2d, nn.Conv2d)))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_bn_add_right)
+                ._set_root_node_getter(_conv_bn_add_root_node_getter_right)
+                ._set_extra_inputs_getter(_conv_bn_add_extra_inputs_getter_right)
+                .set_fused_module(nni.ConvAdd2d))
+    else:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((add_op, MatchAllNode, nn.Conv2d))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_add_right)
+                ._set_root_node_getter(_conv_add_root_node_getter_right)
+                ._set_extra_inputs_getter(_conv_add_extra_inputs_getter_right)
+                .set_fused_module(nni.ConvAdd2d))
+
+conv_configs.append(
+    BackendPatternConfig(nni.ConvAdd2d)
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(conv_dtype_configs)
+        .set_root_module(nn.Conv2d)
+        .set_reference_quantized_module(nnqr.Conv2d))
+
+# (2) Conv2d + Add + Relu
+
+# conv2d Y
+#   \   /
+#    add
+#     \
+#     relu
+
+def _fuse_conv_add_relu_left(is_qat, relu, add_pattern):
+    add, conv, _ = add_pattern
+    return nni.ConvAddReLU2d(conv, add, relu)
+
+def _conv_add_relu_root_node_getter_left(pattern):
+    relu, add_pattern = pattern
+    _, conv, _ = add_pattern
+    return conv
+
+def _conv_add_relu_extra_inputs_getter_left(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    relu, add_pattern = pattern
+    _, conv, extra_input = add_pattern
+    return [extra_input]
+
+# conv2d
+#  \
+#  bn   Y
+#   \   /
+#    add
+#     \
+#     relu
+
+def _fuse_conv_bn_add_relu_left(is_qat, relu, add_pattern):
+    add, bn_conv, _ = add_pattern
+    bn, conv = bn_conv
+    if is_qat:
+        raise NotImplementedError("Cannot fuse train modules: {}".format((conv, bn, add, relu)))
+    else:
+        fused_conv = nn.utils.fusion.fuse_conv_bn_eval(conv, bn)
+        return nni.ConvAddReLU2d(fused_conv, add, relu)
+
+def _conv_bn_add_relu_root_node_getter_left(pattern):
+    relu, add_pattern = pattern
+    _, bn_conv, _ = add_pattern
+    bn, conv = bn_conv
+    return conv
+
+def _conv_bn_add_relu_extra_inputs_getter_left(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    relu, add_pattern = pattern
+    _, bn_conv, extra_input = add_pattern
+    bn, conv = bn_conv
+    return [extra_input]
+
+conv_add_relu_left_optioins = itertools.product(
+    [True, False],  # with_bn
+    [torch.add, operator.add],  # add_op
+)
+
+for with_bn, add_op in conv_add_relu_left_optioins:
+    if with_bn:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((nn.ReLU, (add_op, (nn.BatchNorm2d, nn.Conv2d), MatchAllNode)))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_bn_add_relu_left)
+                ._set_root_node_getter(_conv_bn_add_relu_root_node_getter_left)
+                ._set_extra_inputs_getter(_conv_bn_add_relu_extra_inputs_getter_left)
+                .set_fused_module(nni.ConvAddReLU2d))
+    else:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((nn.ReLU, (add_op, nn.Conv2d, MatchAllNode)))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_add_relu_left)
+                ._set_root_node_getter(_conv_add_relu_root_node_getter_left)
+                ._set_extra_inputs_getter(_conv_add_relu_extra_inputs_getter_left)
+                .set_fused_module(nni.ConvAddReLU2d))
+
+#  Y   conv2d
+#   \   /
+#    add
+#     \
+#     relu
+
+def _fuse_conv_add_relu_right(is_qat, relu, add_pattern):
+    add, _, conv = add_pattern
+    return nni.ConvAddReLU2d(conv, add, relu)
+
+def _conv_add_relu_root_node_getter_right(pattern):
+    relu, add_pattern = pattern
+    _, _, conv = add_pattern
+    return conv
+
+def _conv_add_relu_extra_inputs_getter_right(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    relu, add_pattern = pattern
+    _, extra_input, conv = add_pattern
+    return [extra_input]
+
+#      conv2d
+#        /
+#  Y    bn
+#   \   /
+#    add
+#     \
+#     relu
+
+def _fuse_conv_bn_add_relu_right(is_qat, relu, add_pattern):
+    add, _, bn_conv = add_pattern
+    bn, conv = bn_conv
+    if is_qat:
+        raise NotImplementedError("Cannot fuse train modules: {}".format((conv, bn, add, relu)))
+    else:
+        fused_conv = nn.utils.fusion.fuse_conv_bn_eval(conv, bn)
+        return nni.ConvAddReLU2d(fused_conv, add, relu)
+
+def _conv_bn_add_relu_root_node_getter_right(pattern):
+    relu, add_pattern = pattern
+    _, _, bn_conv = add_pattern
+    bn, conv = bn_conv
+    return conv
+
+def _conv_bn_add_relu_extra_inputs_getter_right(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    relu, add_pattern = pattern
+    _, extra_input, bn_conv = add_pattern
+    bn, conv = bn_conv
+    return [extra_input]
+
+conv_add_relu_optioins = itertools.product(
+    [True, False],  # with_bn
+    [torch.add, operator.add],  # add_op
+)
+
+for with_bn, add_op in conv_add_relu_optioins:
+    if with_bn:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((nn.ReLU, (add_op, MatchAllNode, (nn.BatchNorm2d, nn.Conv2d))))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_bn_add_relu_right)
+                ._set_root_node_getter(_conv_bn_add_relu_root_node_getter_right)
+                ._set_extra_inputs_getter(_conv_bn_add_relu_extra_inputs_getter_right)
+                .set_fused_module(nni.ConvAddReLU2d))
+    else:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((nn.ReLU, (add_op, MatchAllNode, nn.Conv2d)))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_add_relu_right)
+                ._set_root_node_getter(_conv_add_relu_root_node_getter_right)
+                ._set_extra_inputs_getter(_conv_add_relu_extra_inputs_getter_right)
+                .set_fused_module(nni.ConvAddReLU2d))
+
+conv_configs.append(
+    BackendPatternConfig(nni.ConvAddReLU2d)
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(conv_dtype_configs)
+        .set_root_module(nn.Conv2d)
+        .set_reference_quantized_module(nnqr.Conv2d))
 
 # ========================
 # |  CONFIGS FOR LINEAR  |
@@ -116,7 +449,6 @@ def _fuse_linear_bn_leaky_relu(is_qat, linear, bn, leaky_relu):
     onednn_weighted_op_int8_dtype_config,
     onednn_dynamic_int8_dtype_config,
 ]
-observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
 linear_configs = _get_linear_configs(linear_dtype_configs)
 
 def _add_eltwise_fusion_configs(configs, root_module, root_op, post_module, post_op,
diff --git a/torch/ao/quantization/backend_config/tensorrt.py b/torch/ao/quantization/backend_config/tensorrt.py
index a617f765adf7..1c5f761508bb 100644
--- a/torch/ao/quantization/backend_config/tensorrt.py
+++ b/torch/ao/quantization/backend_config/tensorrt.py
@@ -10,6 +10,7 @@
     _get_linear_configs,
     _get_conv_configs,
     _get_share_qparams_op_configs,
+    _get_tensor_info_op_configs,
 )
 
 __all__ = [
@@ -59,6 +60,9 @@ def get_tensorrt_backend_config() -> BackendConfig:
     share_qparams_op_dtype_configs = [
         non_weighted_op_qint8_dtype_config,
     ]
+    tensor_info_op_dtype_configs = [
+        non_weighted_op_qint8_dtype_config,
+    ]
     # there might be things not supported in fx2trt, but it will error out
     # during fx2trt conversion and can support them after that
     return BackendConfig("tensorrt") \
@@ -67,7 +71,8 @@ def get_tensorrt_backend_config() -> BackendConfig:
         .set_backend_pattern_config(cat_config) \
         .set_backend_pattern_configs(_get_linear_configs(linear_dtype_configs)) \
         .set_backend_pattern_configs(_get_binary_op_configs(binary_op_dtype_configs)) \
-        .set_backend_pattern_configs(_get_share_qparams_op_configs(share_qparams_op_dtype_configs))
+        .set_backend_pattern_configs(_get_share_qparams_op_configs(share_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_tensor_info_op_configs(tensor_info_op_dtype_configs))
 
 def get_tensorrt_backend_config_dict():
     """
diff --git a/torch/ao/quantization/backend_config/x86.py b/torch/ao/quantization/backend_config/x86.py
index 78a3f7618782..b4f165958f27 100644
--- a/torch/ao/quantization/backend_config/x86.py
+++ b/torch/ao/quantization/backend_config/x86.py
@@ -10,6 +10,7 @@
     _get_linear_configs,
     _get_rnn_op_configs,
     _get_share_qparams_op_configs,
+    _get_tensor_info_op_configs,
 )
 from .backend_config import BackendConfig, DTypeConfig
 
@@ -89,6 +90,7 @@ def get_x86_backend_config() -> BackendConfig:
     default_op_dtype_configs = [x86_default_op_quint8_dtype_config]
     fixed_qparams_op_dtype_configs = [x86_weighted_op_int8_dtype_config]
     share_qparams_op_dtype_configs = [x86_default_op_quint8_dtype_config]
+    tensor_info_op_dtype_configs = [x86_default_op_quint8_dtype_config]
     rnn_op_dtype_configs = [
         x86_default_dynamic_int8_dtype_config,
         x86_default_dynamic_float16_dtype_config,
@@ -105,6 +107,7 @@ def get_x86_backend_config() -> BackendConfig:
         .set_backend_pattern_configs(_get_default_op_configs(default_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_fixed_qparams_op_configs(fixed_qparams_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_share_qparams_op_configs(share_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_tensor_info_op_configs(tensor_info_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_bn_configs(default_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_rnn_op_configs(rnn_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_embedding_op_configs(embedding_op_dtype_configs))
diff --git a/torch/ao/quantization/experimental/linear.py b/torch/ao/quantization/experimental/linear.py
index 92cf96aa5c80..240e708bc5ec 100644
--- a/torch/ao/quantization/experimental/linear.py
+++ b/torch/ao/quantization/experimental/linear.py
@@ -1,7 +1,7 @@
 import torch
 import numpy as np
 
-from torch.nn.quantized.modules.utils import WeightedQuantizedModule
+from torch.ao.nn.quantized.modules.utils import WeightedQuantizedModule
 from torch.ao.quantization.experimental.observer import APoTObserver
 from torch.ao.quantization.experimental.quantizer import quantize_APoT
 
diff --git a/torch/ao/quantization/fake_quantize.py b/torch/ao/quantization/fake_quantize.py
index 4fb639015127..f8d3a453e98d 100644
--- a/torch/ao/quantization/fake_quantize.py
+++ b/torch/ao/quantization/fake_quantize.py
@@ -222,7 +222,7 @@ def extra_repr(self):
     def _save_to_state_dict(self, destination, prefix, keep_vars):
         # We cannot currently register scalar values as buffers, so need to manually
         # specify serialization here.
-        super(FakeQuantize, self)._save_to_state_dict(destination, prefix, keep_vars)
+        super()._save_to_state_dict(destination, prefix, keep_vars)
         destination[prefix + 'scale'] = self.scale
         destination[prefix + 'zero_point'] = self.zero_point
 
@@ -254,8 +254,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                         self.zero_point.copy_(val)
             elif strict:
                 missing_keys.append(key)
-        super(FakeQuantize, self)._load_from_state_dict(state_dict, prefix, local_metadata, strict,
-                                                        missing_keys, unexpected_keys, error_msgs)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, strict,
+                                      missing_keys, unexpected_keys, error_msgs)
 
 
 class FixedQParamsFakeQuantize(FakeQuantize):
diff --git a/torch/ao/quantization/fuser_method_mappings.py b/torch/ao/quantization/fuser_method_mappings.py
index 9d6455d7b0d4..03ee38d339fb 100644
--- a/torch/ao/quantization/fuser_method_mappings.py
+++ b/torch/ao/quantization/fuser_method_mappings.py
@@ -1,5 +1,5 @@
 import torch.nn as nn
-import torch.nn.intrinsic as nni
+import torch.ao.nn.intrinsic as nni
 
 from typing import Union, Callable, Tuple, Dict, Optional, Type
 from torch.ao.quantization.utils import Pattern, get_combined_dict, MatchAllNode
diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py
index e932c28529c8..75291053a18a 100644
--- a/torch/ao/quantization/fx/_decomposed.py
+++ b/torch/ao/quantization/fx/_decomposed.py
@@ -1,8 +1,9 @@
 import torch
 from torch.library import Library, impl
-from torch.ao.quantization import MinMaxObserver
+from torch.ao.quantization.utils import determine_qparams, validate_qmin_qmax
 from typing import Tuple
 
+
 # Note: decomposed means decomposed quantized tensor, using decomposed so that the
 # name is not too long
 quantized_decomposed_lib = Library("quantized_decomposed", "DEF")
@@ -177,13 +178,14 @@ def dequantize_per_tensor_tensor_meta(input, scale, zero_point, quant_min, quant
 
 quantized_decomposed_lib.define(
     "choose_qparams.tensor(Tensor input, int quant_min, int quant_max, "
-    "ScalarType dtype) -> (Tensor, Tensor)")
+    "float eps, ScalarType dtype) -> (Tensor, Tensor)")
 
 @impl(quantized_decomposed_lib, "choose_qparams.tensor", "CompositeExplicitAutograd")
 def choose_qparams_tensor(
         input: torch.Tensor,
-        quant_min: int,
-        quant_max: int,
+        qmin: int,
+        qmax: int,
+        eps: float,
         dtype: torch.dtype
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """ Given an input Tensor, derive the per tensor affine quantization parameter
@@ -200,28 +202,79 @@ def choose_qparams_tensor(
        zero_point (int): quantization parameter for the target quantized Tensor
     """
     assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
-    assert quant_min < quant_max, f"Expecting quant_min to be smaller than quant_max but received min: {quant_min} max: {quant_max}"
+    assert dtype == torch.int8 or dtype == torch.uint8 or dtype == torch.int32, \
+        f"Expecting target dtype to be int8 uint8 or int32, but got: {dtype}"
+    validate_qmin_qmax(qmin, qmax)
+
+    min_val, max_val = torch.aminmax(input)
+
+    return determine_qparams(
+        min_val, max_val, qmin, qmax, dtype, torch.Tensor([eps]), has_customized_qrange=False)
+
+quantized_decomposed_lib.define(
+    "choose_qparams_symmetric.tensor(Tensor input, int quant_min, int quant_max, "
+    "float eps, ScalarType dtype) -> (Tensor, Tensor)")
+
+@impl(quantized_decomposed_lib, "choose_qparams_symmetric.tensor", "CompositeExplicitAutograd")
+def choose_qparams_symmetric_tensor(
+        input: torch.Tensor,
+        qmin: int,
+        qmax: int,
+        eps: float,
+        dtype: torch.dtype
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """ Given an input Tensor, derive the per tensor affine quantization parameter
+    (scale and zero_point) for target quantized Tensor from the Tensor
+
+    Args:
+       input (torch.Tensor): floating point input Tensor
+       quant_min (int): minimum quantized value for target quantized Tensor
+       quant_max (int): maximum quantized value for target quantized Tensor
+       dtype (torch.dtype): dtype for target quantized Tensor
 
-    # Its weird to create an observer manually just to calculate qparams. I tried refactoring this functionality out of observer
-    # into a util and then use that util directly, but I kept running into jit typing errors related to torch.qscheme not
-    # being recognized as a type. TODO: properly refactor this out to avoid observer overhead
-    tensor_dtype_to_observer_dtype = {torch.uint8: torch.quint8, torch.int8: torch.qint8}
-    observer = MinMaxObserver(quant_min=quant_min, quant_max=quant_max, dtype=tensor_dtype_to_observer_dtype[dtype])
-    observer(input)
-    scale, zero_point = observer.calculate_qparams()
-    return (scale, zero_point)
+    Returns:
+       scale (float): quantization parameter for the target quantized Tensor
+       zero_point (int): quantization parameter for the target quantized Tensor
+    """
+    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    assert dtype == torch.int8 or dtype == torch.uint8 or dtype == torch.int32, \
+        f"Expecting target dtype to be int8 uint8 or int32, but got: {dtype}"
+    validate_qmin_qmax(qmin, qmax)
+
+    min_val, max_val = torch.aminmax(input)
+    return determine_qparams(
+        min_val,
+        max_val,
+        qmin,
+        qmax,
+        dtype,
+        torch.Tensor([eps]),
+        has_customized_qrange=False,
+        qscheme=torch.per_tensor_symmetric
+    )
 
 @impl(quantized_decomposed_lib, "choose_qparams.tensor", "Meta")
 def choose_qparams_tensor_meta(
         input: torch.Tensor,
         quant_min: int,
         quant_max: int,
+        eps: float,
         dtype: torch.dtype
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
-    assert quant_min < quant_max, f"Expecting quant_min to be smaller than quant_max but received min: {quant_min} max: {quant_max}"
+    assert quant_min < quant_max, f"Expecting quant_min to be smaller than quant_max but received min: \
+        {quant_min} max: {quant_max}"
     return torch.empty(1, dtype=torch.float, device=input.device), torch.empty(1, dtype=torch.int32, device=input.device)
 
+@impl(quantized_decomposed_lib, "choose_qparams_symmetric.tensor", "Meta")
+def choose_qparams_symmetric_tensor_meta(
+        input: torch.Tensor,
+        quant_min: int,
+        quant_max: int,
+        eps: float,
+        dtype: torch.dtype
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return torch.empty(1, dtype=torch.float, device=input.device), torch.empty(1, dtype=torch.int32, device=input.device)
 # Helper function used to implement per-channel quantization against any axis
 def _permute_to_axis_zero(x, axis):
     new_axis_list = list(range(x.dim()))
diff --git a/torch/ao/quantization/fx/_equalize.py b/torch/ao/quantization/fx/_equalize.py
index af0b79835d7d..8022f28cbfc5 100644
--- a/torch/ao/quantization/fx/_equalize.py
+++ b/torch/ao/quantization/fx/_equalize.py
@@ -6,9 +6,10 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import torch.nn.intrinsic as nni
+import torch.ao.nn.intrinsic as nni
 from torch.fx import GraphModule
 from torch.fx.graph import Node
+from torch.ao.quantization.fx.graph_module import _get_observed_graph_module_attr
 
 from torch.ao.quantization.backend_config import get_native_backend_config
 
@@ -57,7 +58,7 @@ class _InputEqualizationObserver(nn.Module):
 
     def __init__(self, dtype=torch.quint8, qscheme=torch.per_tensor_affine,
                  quant_min=None, quant_max=None, factory_kwargs=None) -> None:
-        super(_InputEqualizationObserver, self).__init__()
+        super().__init__()
 
         if qscheme not in {torch.per_tensor_affine, torch.per_tensor_symmetric}:
             raise TypeError("Input qscheme must be per-tensor")
@@ -141,7 +142,7 @@ class _WeightEqualizationObserver(nn.Module):
 
     def __init__(self, dtype=torch.qint8, qscheme=torch.per_tensor_affine, quant_min=None,
                  quant_max=None, factory_kwargs=None) -> None:
-        super(_WeightEqualizationObserver, self).__init__()
+        super().__init__()
 
         self.dtype = dtype
         self.qscheme = qscheme
@@ -266,8 +267,7 @@ def node_supports_equalization(node: Node, modules) -> bool:
     return False
 
 def is_equalization_observer(observer: nn.Module) -> bool:
-    return (isinstance(observer, _InputEqualizationObserver) or
-            isinstance(observer, _WeightEqualizationObserver))
+    return (isinstance(observer, (_InputEqualizationObserver, _WeightEqualizationObserver)))
 
 
 ###############################################################################
@@ -297,7 +297,9 @@ def get_op_node_and_weight_eq_obs(
     if op_node.op == 'call_module':
         # If the op_node is a nn.Linear layer, then it must have a
         # WeightEqualizationObserver configuration
-        equalization_node_name_to_qconfig: Dict[str, Any] = model._equalization_node_name_to_qconfig  # type: ignore[assignment]
+        maybe_equalization_node_name_to_config = _get_observed_graph_module_attr(model, "equalization_node_name_to_qconfig")
+        assert maybe_equalization_node_name_to_config is not None
+        equalization_node_name_to_qconfig: Dict[str, Any] = maybe_equalization_node_name_to_config  # type: ignore[assignment]
         assert(equalization_node_name_to_qconfig.get(op_node.name, None) is not None)
         weight_eq_obs = equalization_node_name_to_qconfig.get(op_node.name, None).weight()
 
diff --git a/torch/ao/quantization/fx/_lower_to_native_backend.py b/torch/ao/quantization/fx/_lower_to_native_backend.py
index fbca587cead5..e1be7d7ec2ce 100644
--- a/torch/ao/quantization/fx/_lower_to_native_backend.py
+++ b/torch/ao/quantization/fx/_lower_to_native_backend.py
@@ -5,12 +5,12 @@
 import torch.nn.functional as F
 import torch.ao.nn.intrinsic as nni
 import torch.ao.nn.intrinsic.quantized as nniq
-import torch.nn.intrinsic.quantized.dynamic as nniqd
+import torch.ao.nn.intrinsic.quantized.dynamic as nniqd
 import torch.ao.nn.quantized as nnq
 import torch.ao.nn.quantized.dynamic as nnqd
 import torch.ao.nn.quantized.reference as nnqr
 from torch.ao.nn.quantized.modules.utils import WeightedQuantizedModule
-from .graph_module import QuantizedGraphModule
+from torch.fx import GraphModule
 from .utils import (
     collect_producer_nodes,
     get_linear_prepack_op_for_dtype,
@@ -86,8 +86,8 @@ def is_default_node(node, modules):
         torch.nn.PReLU,
         torch.nn.BatchNorm2d,
         torch.nn.BatchNorm3d,
-        torch.nn.intrinsic.BNReLU2d,
-        torch.nn.intrinsic.BNReLU3d,
+        torch.ao.nn.intrinsic.BNReLU2d,
+        torch.ao.nn.intrinsic.BNReLU3d,
     ]
     return _is_node_in_list(node, modules, func_list, method_list, module_type_list)
 
@@ -145,6 +145,7 @@ def is_general_tensor_shape_node(node, modules):
         torch.squeeze,
         torch.stack,
         torch.unsqueeze,
+        torch.nn.functional.pixel_shuffle,
     ]
     method_list = [
         "contiguous",
@@ -194,6 +195,10 @@ def is_getattr_tensor_metadata_node(node):
         node.target == getattr and \
         node.args[1] in ["shape"]
 
+def is_get_tensor_info_node(node):
+    return node.op == "call_method" and \
+        node.target in ["shape", "size"]
+
 def should_skip_lowering(op: torch.fx.node.Node, qconfig_map: Dict[str, QConfigAny]):
     """
     Return True if the op is configured with a None qconfig, False otherwise.
@@ -264,6 +269,16 @@ def should_skip_lowering(op: torch.fx.node.Node, qconfig_map: Dict[str, QConfigA
     nni.ConvReLU3d: (nnqr.Conv3d, nniq.ConvReLU3d),
 }
 
+# The difference between STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP and STATIC_LOWER_FUSED_MODULE_MAP:
+# The refer node inside STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP has 2 inputs.
+# Mapping from fused module class to a 2-tuple of:
+#   1) The inner reference module class
+#   2) The replacement static quantized module class for lowering
+STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP: Dict[Type[nn.Module], Tuple[Type[nn.Module], Type[WeightedQuantizedModule]]] = {
+    nni.ConvAdd2d: (nnqr.Conv2d, nniq.ConvAdd2d),
+    nni.ConvAddReLU2d: (nnqr.Conv2d, nniq.ConvAddReLU2d),
+}
+
 # Mapping from fused module class to a 2-tuple of:
 #   1) The inner reference module class
 #   2) The replacement dynamic quantized module class for lowering
@@ -332,10 +347,29 @@ def should_skip_lowering(op: torch.fx.node.Node, qconfig_map: Dict[str, QConfigA
     torch.mul: torch.ops.quantized.mul_relu,
 }
 
+def _save_packed_weight(self, destination, prefix, keep_vars):
+    for attr_name in dir(self):
+        if "_packed_weight" in attr_name and \
+           isinstance(getattr(self, attr_name), torch._C.ScriptObject):  # type: ignore[attr-defined]
+            packed_weight = getattr(self, attr_name)
+            destination[prefix + attr_name] = packed_weight
+
+def _load_packed_weight(self, state_dict, prefix, local_metadata, strict,
+                        missing_keys, unexpected_keys, error_msgs):
+    attrs_to_pop = []
+    for attr_name in state_dict:
+        if attr_name.startswith("_packed_weight") and isinstance(state_dict[attr_name], torch._C.ScriptObject):  # type: ignore[attr-defined] # noqa: B950
+            setattr(self, attr_name, state_dict[attr_name])
+            attrs_to_pop.append(attr_name)
+
+    # pop the packed param attributesn
+    for attr_name in attrs_to_pop:
+        state_dict.pop(attr_name)
+
 def fold_weight(
-    quantized: QuantizedGraphModule,
+    quantized_model: GraphModule,
     node_name_to_scope: Dict[str, Tuple[str, type]]
-) -> QuantizedGraphModule:
+) -> GraphModule:
     """
     Trace back from the weight node util we hit getattr, reconstruct the
     graph module with the traced nodes and run the graph module to pack the
@@ -345,7 +379,7 @@ def fold_weight(
     # map from folded node name to the prepacked weight name
     folded_nodes = {}
     # get packed weights
-    for node in quantized.graph.nodes:
+    for node in quantized_model.graph.nodes:
         if node.op == 'call_function' and node.target in WEIGHT_PREPACK_OPS:
             nodes_to_fold = collect_producer_nodes(node)
             if nodes_to_fold is not None:
@@ -353,7 +387,7 @@ def fold_weight(
                     folded_nodes[node_to_fold.name] = node
 
                 prepacking_module = graph_module_from_producer_nodes(
-                    quantized, nodes_to_fold)
+                    quantized_model, nodes_to_fold)
                 packed_weight = prepacking_module()
                 packed_weights[node.name] = packed_weight
 
@@ -363,10 +397,8 @@ def fold_weight(
 
     def load_arg(a):
         return map_arg(a, lambda node: env[node.name])
-    quantized_root = quantized
-    quantized_graph = quantized.graph
 
-    for node in quantized_graph.nodes:
+    for node in quantized_model.graph.nodes:
         prepack_node = folded_nodes.get(node.name, None)
         if prepack_node is node:
             packed_weight = packed_weights[node.name]
@@ -375,8 +407,8 @@ def load_arg(a):
             module_path, _ = node_name_to_scope[op_node.name]
             get_new_packed_weight_name = \
                 get_new_attr_name_with_prefix(module_path + '_packed_weight_')
-            packed_weight_name = get_new_packed_weight_name(quantized_root)
-            setattr(quantized_root, packed_weight_name, packed_weight)
+            packed_weight_name = get_new_packed_weight_name(quantized_model)
+            setattr(quantized_model, packed_weight_name, packed_weight)
             # replace prepack node with a getattr node
             env[node.name] = folded_graph.create_node(
                 'get_attr', packed_weight_name, (), {})
@@ -386,7 +418,11 @@ def load_arg(a):
         else:
             # copy other nodes
             env[node.name] = folded_graph.node_copy(node, load_arg)
-    return QuantizedGraphModule(quantized_root, folded_graph, quantized_root.preserved_attr_names)
+
+    quantized_model = GraphModule(quantized_model, folded_graph)
+    quantized_model._register_state_dict_hook(_save_packed_weight)
+    quantized_model._register_load_state_dict_pre_hook(_load_packed_weight, with_module=True)
+    return quantized_model
 
 def _get_module(node: Node, modules: Dict[str, nn.Module]) -> Optional[nn.Module]:
     """
@@ -471,8 +507,64 @@ def _match_static_pattern(
 
     return (q_node, relu_node, ref_node)
 
+def _match_static_pattern_with_two_inputs(
+    node: Node,
+    modules: Dict[str, nn.Module],
+    qconfig_map: Dict[str, QConfigAny],
+    matching_modules_or_ops: List[Callable]
+) -> Union[Tuple[Node, Node], Tuple[None, None]]:
+    """
+                      (dequantize \
+    Match the pattern (dequantize - ref node - quantize) against the node provided.
+
+    If there is a match, return a 2-tuple of:
+      1) q_node: the quantize node,
+      2) ref_node: a reference module or functional node to replace with its quantized counterpart
+    Otherwise, if there is no match, return a 2-tuple of (None, None).
+
+    Parameters:
+      node: The `torch.fx.Node` to match against.
+      modules: A mapping from node names to modules in the model graph, used for module lookup.
+      qconfig_map: A mapping from node names to the qconfigs associated with the nodes.
+          If the corresponding qconfig for the reference node is None, then return no match.
+      matching_modules_or_ops: Either a list of functions or a list of `torch.nn.Module`s.
+          If the reference node is not in this list, then return no match.
+    """
+    SKIP_LOWERING_VALUE = (None, None)
+
+    # Match quantize node
+    if node.op != "call_function" or node.target != torch.quantize_per_tensor:
+        return SKIP_LOWERING_VALUE
+    q_node = node
+    ref_node = q_node.args[0]
+    assert(isinstance(ref_node, Node))
+
+    if should_skip_lowering(ref_node, qconfig_map):
+        return SKIP_LOWERING_VALUE
+
+    # Match reference module or functional
+    if isinstance(matching_modules_or_ops[0], type) and issubclass(matching_modules_or_ops[0], nn.Module):
+        expected_op = "call_module"
+        match_key = type(_get_module(ref_node, modules))
+    else:
+        # This pass only support op of "call_module"
+        return SKIP_LOWERING_VALUE
+
+    if ref_node.op != expected_op or match_key not in matching_modules_or_ops:
+        return SKIP_LOWERING_VALUE
+
+    # Check ref_node has 2 input nodes, both are dq node.
+    if len(ref_node.args) != 2:
+        return SKIP_LOWERING_VALUE
+    for i in range(len(ref_node.args)):
+        arg = ref_node.args[i]
+        if not is_dequantize_node(arg):
+            return SKIP_LOWERING_VALUE
+
+    return (q_node, ref_node)
+
 def _lower_static_weighted_ref_module(
-        model: QuantizedGraphModule,
+        model: GraphModule,
         qconfig_map: Dict[str, QConfigAny]):
     """
     Traverse the graph and find dequantize - ref module - quantize patterns
@@ -521,7 +613,67 @@ def _lower_static_weighted_ref_module(
         model.graph.erase_node(scale_node)
         model.graph.erase_node(zero_point_node)
 
-def _lower_dynamic_weighted_ref_module(model: QuantizedGraphModule):
+def _lower_static_weighted_ref_module_with_two_inputs(
+        model: GraphModule,
+        qconfig_map: Dict[str, QConfigAny]):
+    """
+    Traverse the graph and find patterns
+    dequantize   dequantize
+       \\         //
+        ref module
+            \\
+          quantize
+    and replace them with the quantized version of the ref module.
+    """
+    modules = dict(model.named_modules(remove_duplicate=False))
+    nodes = list(model.graph.nodes)
+    for n in model.graph.nodes:
+        #                                            (dequantize \
+        # Step 0: Find nodes that match this pattern (dequantize - ref module - quantize)
+        matching_modules = list(STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP.keys())
+        (q_node, ref_node) = _match_static_pattern_with_two_inputs(
+            n, modules, qconfig_map, matching_modules)  # type: ignore[arg-type]
+        if q_node is None:
+            continue
+        assert(ref_node is not None)
+        (_, scale_node, zero_point_node, _) = q_node.args
+        ref_module = _get_module(ref_node, modules)
+        ref_class = type(ref_module)
+        assert(isinstance(scale_node, Node))
+        assert(isinstance(zero_point_node, Node))
+        assert(issubclass(ref_class, nn.Module))
+
+        # Step 1: Change this pattern to use the corresponding quantized module
+        # For fused modules, we also check whether the inner module is a reference module
+        # If so, we replace the entire fused module with the corresponding quantized module
+        if ref_class in STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP:
+            inner_ref_class, q_class = STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP[ref_class]
+            if type(ref_module[0]) != inner_ref_class:  # type: ignore[index]
+                continue
+        else:
+            continue
+        output_scale = getattr(model, scale_node.target)
+        output_zero_point = getattr(model, zero_point_node.target)
+        q_module = q_class.from_reference(ref_module, output_scale, output_zero_point)
+        # replace reference module with quantized module
+        parent_name, module_name = _parent_name(ref_node.target)
+        setattr(modules[parent_name], module_name, q_module)
+
+        # Step 2: Reroute around dq_node, and remove q_node and its args
+        assert(len(ref_node.args) == 2)
+        for arg in ref_node.args:
+            if not is_dequantize_node(arg):
+                continue
+            dq_node = arg
+            assert(isinstance(dq_node, Node))
+            ref_node.replace_input_with(dq_node, dq_node.args[0])
+
+        q_node.replace_all_uses_with(ref_node)
+        model.graph.erase_node(q_node)
+        model.graph.erase_node(scale_node)
+        model.graph.erase_node(zero_point_node)
+
+def _lower_dynamic_weighted_ref_module(model: GraphModule):
     """
     Traverse the graph and find quantize_per_tensor_dynamic - dequantize - ref_module patterns
     and replace them with the dynamically quantized version of the ref module.
@@ -566,7 +718,7 @@ def _lower_dynamic_weighted_ref_module(model: QuantizedGraphModule):
         setattr(named_modules[parent_name], module_name, q_module)
         ref_node.replace_input_with(dq_node, input_dynamic_q_node.args[0])
 
-def _lower_weight_only_weighted_ref_module(model: QuantizedGraphModule):
+def _lower_weight_only_weighted_ref_module(model: GraphModule):
     """
     Traverse the graph and find ref_module patterns
     and replace them with the weight only quantized version of the ref module.
@@ -592,7 +744,7 @@ def _lower_weight_only_weighted_ref_module(model: QuantizedGraphModule):
         setattr(named_modules[parent_name], module_name, q_module)
 
 def _lower_static_weighted_ref_functional(
-        model: QuantizedGraphModule,
+        model: GraphModule,
         qconfig_map: Dict[str, QConfigAny]):
     """
     Traverse the graph and replace functional reference patterns with their quantized versions.
@@ -653,7 +805,7 @@ def _lower_static_weighted_ref_functional(
             model.graph.erase_node(relu_node)
 
 def _lower_dynamic_weighted_ref_functional(
-        model: QuantizedGraphModule,
+        model: GraphModule,
         qconfig_map: Dict[str, QConfigAny]):
     """
     Traverse the graph and replace functional reference patterns with their dynamically
@@ -756,7 +908,7 @@ def _lower_dynamic_weighted_ref_functional(
             model.graph.erase_node(relu_node)
 
 def _lower_quantized_binary_op(
-        model: QuantizedGraphModule,
+        model: GraphModule,
         qconfig_map: Dict[str, QConfigAny]):
     binary_ops_to_lower: List[Callable] = [operator.add, torch.add, operator.mul, torch.mul, torch.matmul]
     modules = dict(model.named_modules(remove_duplicate=False))
@@ -806,7 +958,7 @@ def _lower_quantized_binary_op(
             model.graph.erase_node(relu_node)
         model.graph.erase_node(bop_node)
 
-def special_pattern_replacement(model: QuantizedGraphModule):
+def special_pattern_replacement(model: GraphModule):
     modules = dict(model.named_modules(remove_duplicate=False))
     for n in model.graph.nodes:
         q_node = n
@@ -838,7 +990,7 @@ def special_pattern_replacement(model: QuantizedGraphModule):
             continue
         assert len(ref_node.args) > 0 or len(ref_node.kwargs) > 0
         dq_node_or_nodes = ref_node.args[0] if len(ref_node.args) > 0 else list(ref_node.kwargs.values())[0]
-        assert isinstance(dq_node_or_nodes, Node) or isinstance(dq_node_or_nodes, (tuple, list))
+        assert isinstance(dq_node_or_nodes, (Node, tuple, list))
         is_dequantize = False
         if isinstance(dq_node_or_nodes, Node):
             is_dequantize = dq_node_or_nodes.op == 'call_method' and \
@@ -914,7 +1066,7 @@ def special_pattern_replacement(model: QuantizedGraphModule):
 
     return model
 
-def _lower_getattr_tensor_metadta_op(model: QuantizedGraphModule):
+def _lower_getattr_tensor_metadta_op(model: GraphModule):
     """ Modified the graph of the model inplace, to skip extra dequantize op before
     the general tensor shape ops when possible
     """
@@ -928,22 +1080,39 @@ def _lower_getattr_tensor_metadta_op(model: QuantizedGraphModule):
             args[0] = n.args[0].args[0]
             n.args = tuple(args)
 
+def _lower_get_tensor_info_op(model: GraphModule):
+    """ Modified the graph of the model inplace, to skip extra dequantize op before
+    the general tensor shape ops when possible
+    """
+    for n in model.graph.nodes:
+        if not is_get_tensor_info_node(n):
+            continue
+        maybe_dq = n.args[0]
+        if maybe_dq.op != "call_method" or maybe_dq.target != "dequantize":
+            continue
+        # skip the dequantize node
+        args = list(n.args)
+        args[0] = n.args[0].args[0]
+        n.args = tuple(args)
+
 def _lower_to_native_backend(
-    model: QuantizedGraphModule,
+    model: GraphModule,
     qconfig_map: Dict[str, QConfigAny],
     node_name_to_scope: Dict[str, Tuple[str, type]]
-) -> QuantizedGraphModule:
+) -> GraphModule:
     """ Lower a quantized reference model (with reference quantized operator patterns)
     to the native backend in PyTorch (fbgemm/qnnpack), both backends shares the same
     operator signature so they can be lowered with the same function
     """
     _lower_static_weighted_ref_module(model, qconfig_map)
+    _lower_static_weighted_ref_module_with_two_inputs(model, qconfig_map)
     _lower_dynamic_weighted_ref_module(model)
     _lower_weight_only_weighted_ref_module(model)
     _lower_static_weighted_ref_functional(model, qconfig_map)
     _lower_dynamic_weighted_ref_functional(model, qconfig_map)
     _lower_quantized_binary_op(model, qconfig_map)
     _lower_getattr_tensor_metadta_op(model)
+    _lower_get_tensor_info_op(model)
     special_pattern_replacement(model)
     model.graph.eliminate_dead_code()
     model = fold_weight(model, node_name_to_scope)
diff --git a/torch/ao/quantization/fx/_model_report/detector.py b/torch/ao/quantization/fx/_model_report/detector.py
index b47f24ece078..bbca4609a2c6 100644
--- a/torch/ao/quantization/fx/_model_report/detector.py
+++ b/torch/ao/quantization/fx/_model_report/detector.py
@@ -2,7 +2,7 @@
 
 import torch
 import torch.nn as nn
-import torch.nn.qat as nnqat
+import torch.ao.nn.qat as nnqat
 from abc import ABC, abstractmethod
 from torch.ao.quantization.fake_quantize import FakeQuantize
 from torch.ao.quantization.fx.graph_module import GraphModule
@@ -219,10 +219,10 @@ class PerChannelDetector(DetectorBase):
 
     # Default map for representing supported per channel quantization modules for different backends
     DEFAULT_BACKEND_PER_CHANNEL_SUPPORTED_MODULES: Dict[str, Set[Any]] = {
-        "fbgemm": set([nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nnqat.Linear, nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d]),
-        "qnnpack": set([nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nnqat.Linear, nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d]),
-        "onednn": set([nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nnqat.Linear, nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d]),
-        "x86": set([nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nnqat.Linear, nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d]),
+        "fbgemm": {nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nnqat.Linear, nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d},
+        "qnnpack": {nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nnqat.Linear, nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d},
+        "onednn": {nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nnqat.Linear, nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d},
+        "x86": {nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nnqat.Linear, nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d},
     }
 
     def __init__(self, backend: str = torch.backends.quantized.engine):
@@ -230,7 +230,7 @@ def __init__(self, backend: str = torch.backends.quantized.engine):
 
         # store the backend information
         self.backend_chosen = backend
-        self.supported_modules = set([])
+        self.supported_modules = set()
         if self.backend_chosen in self.DEFAULT_BACKEND_PER_CHANNEL_SUPPORTED_MODULES:
             self.supported_modules = self.DEFAULT_BACKEND_PER_CHANNEL_SUPPORTED_MODULES[self.backend_chosen]
         else:
@@ -307,7 +307,7 @@ def _detect_per_channel_helper(self, model: nn.Module):
 
                 # this object should either be fake quant or observer
                 q_or_s_obj = module.qconfig.weight.p.func()
-                assert isinstance(q_or_s_obj, FakeQuantize) or isinstance(q_or_s_obj, ObserverBase)
+                assert isinstance(q_or_s_obj, (FakeQuantize, ObserverBase))
 
                 per_channel_used = False  # will be true if found in qconfig
 
@@ -413,17 +413,17 @@ class DynamicStaticDetector(DetectorBase):
     IS_CURRENTLY_SUPPORTED_KEY = "is_dynamic_supported"
 
     # modules that are supported both dynamic and static for this report function
-    DEFAULT_DYNAMIC_STATIC_CHECK_SUPPORTED = set([nn.Linear])
+    DEFAULT_DYNAMIC_STATIC_CHECK_SUPPORTED = {nn.Linear}
 
     # modules that will be supported soon for both
-    DEFAULT_DYNAMIC_STATIC_FUTURE_SUPPORTED = set([nn.Conv1d, nn.Conv2d, nn.Conv3d])
+    DEFAULT_DYNAMIC_STATIC_FUTURE_SUPPORTED = {nn.Conv1d, nn.Conv2d, nn.Conv3d}
 
     def __init__(self, tolerance=0.5):
         super().__init__()
 
         # set tolerance level and initialize a set to keep track of useful fqn locations
         self.tolerance = tolerance
-        self.useful_observer_fqns: Set[str] = set([])
+        self.useful_observer_fqns: Set[str] = set()
 
     def determine_observer_insert_points(self, prepared_fx_model: GraphModule) -> Dict[str, Dict[str, Any]]:
         r"""
@@ -737,9 +737,14 @@ class InputWeightEqualizationDetector(DetectorBase):
     * :attr:`DEFAULT_PRE_OBSERVER_NAME`: The name of the pre-observer to be inserted for this detector
     """
 
-    SUPPORTED_MODULES: Set[Callable] = set(
-        [nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nnqat.Linear, nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d]
-    )
+    SUPPORTED_MODULES: Set[Callable] = {nn.Linear,
+                                        nn.Conv1d,
+                                        nn.Conv2d,
+                                        nn.Conv3d,
+                                        nnqat.Linear,
+                                        nnqat.Conv1d,
+                                        nnqat.Conv2d,
+                                        nnqat.Conv3d}
 
     # names for the pre and post observers that are inserted
     DEFAULT_PRE_OBSERVER_NAME: str = "model_report_pre_observer"
diff --git a/torch/ao/quantization/fx/_model_report/model_report.py b/torch/ao/quantization/fx/_model_report/model_report.py
index ee96dd4bf5a9..8bc2aec13503 100644
--- a/torch/ao/quantization/fx/_model_report/model_report.py
+++ b/torch/ao/quantization/fx/_model_report/model_report.py
@@ -120,7 +120,7 @@ def __init__(self, model: GraphModule, desired_report_detectors: Set[DetectorBas
 
         # keep the reports private so they can't be modified
         self._desired_report_detectors = desired_report_detectors
-        self._desired_detector_names = set([detector.get_detector_name() for detector in desired_report_detectors])
+        self._desired_detector_names = {detector.get_detector_name() for detector in desired_report_detectors}
 
         # keep a mapping of desired reports to observers of interest
         # this is to get the readings, and to remove them, can create a large set
@@ -129,7 +129,7 @@ def __init__(self, model: GraphModule, desired_report_detectors: Set[DetectorBas
 
         # initialize each report to have empty set of observers of interest
         for desired_report in self._desired_detector_names:
-            self._detector_name_to_observer_fqns[desired_report] = set([])
+            self._detector_name_to_observer_fqns[desired_report] = set()
 
         # flags to ensure that we can only prepare and remove observers once
         self._prepared_flag = False
@@ -287,7 +287,7 @@ def generate_model_report(
         if remove_inserted_observers:
             self._removed_observers = True
             # get the set of all Observers inserted by this instance of ModelReport
-            all_observers_of_interest: Set[str] = set([])
+            all_observers_of_interest: Set[str] = set()
             for desired_report in self._detector_name_to_observer_fqns:
                 observers_of_interest = self._detector_name_to_observer_fqns[desired_report]
                 all_observers_of_interest.update(observers_of_interest)
diff --git a/torch/ao/quantization/fx/_model_report/model_report_visualizer.py b/torch/ao/quantization/fx/_model_report/model_report_visualizer.py
index ae450436d4f8..811dcba776eb 100644
--- a/torch/ao/quantization/fx/_model_report/model_report_visualizer.py
+++ b/torch/ao/quantization/fx/_model_report/model_report_visualizer.py
@@ -355,8 +355,8 @@ def generate_filtered_tables(self, feature_filter: str = "", module_fqn_filter:
                     tensor_features.add(feature_name)
 
         # we make them lists for iteration purposes
-        tensor_features_list: List[str] = sorted(list(tensor_features))
-        channel_features_list: List[str] = sorted(list(channel_features))
+        tensor_features_list: List[str] = sorted(tensor_features)
+        channel_features_list: List[str] = sorted(channel_features)
 
         # get the tensor info
         tensor_headers, tensor_table = self._generate_tensor_table(filtered_data, tensor_features_list)
diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index f6b0f94ee3e0..efd6dead0967 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -42,7 +42,6 @@
 )
 from torch.ao.quantization.observer import _is_activation_post_process
 from .graph_module import (
-    QuantizedGraphModule,
     _is_observed_module,
     _is_observed_standalone_module,
 )
@@ -51,6 +50,7 @@
 from .utils import (
     _get_module,
     _is_custom_module_lstm,
+    _is_custom_module_mha,
     get_custom_module_class_keys,
     create_getattr_from_value,
     collect_producer_nodes,
@@ -81,6 +81,11 @@
     "convert_weighted_module",
 ]
 
+_QSCHEME_TO_CHOOSE_QPARAMS_OP = {
+    torch.per_tensor_affine: torch.ops.quantized_decomposed.choose_qparams.tensor,
+    torch.per_tensor_symmetric: torch.ops.quantized_decomposed.choose_qparams_symmetric.tensor,
+}
+
 def _replace_observer_with_quantize_dequantize_node_decomposed(
         model: torch.nn.Module,
         graph: Graph,
@@ -212,15 +217,19 @@ def _replace_observer_with_quantize_dequantize_node_decomposed(
             "dynamic quantization right now"
         quant_min = activation_post_process.quant_min  # type: ignore[attr-defined]
         quant_max = activation_post_process.quant_max  # type: ignore[attr-defined]
+        qscheme = getattr(activation_post_process, "qscheme", torch.per_tensor_affine)  # type: ignore[attr-defined]
+        eps = getattr(activation_post_process, "eps", torch.finfo(torch.float32).eps)  # type: ignore[attr-defined]
         # note: scale and zero_point are missing for quantize_per_tensor op
         # we'll need to get this from choose_qparams op, which we'll add after
         # this step
         qparams = {
             "_quant_min_": quant_min,
             "_quant_max_": quant_max,
+            "_eps_": eps,
             "_dtype_": dtype_
         }
 
+        choose_qparams_op = _QSCHEME_TO_CHOOSE_QPARAMS_OP[qscheme]
         # 2. insert choose_qparams op and update the qparams list
         with graph.inserting_before(node):
             input_node = node.args[0]
@@ -231,7 +240,7 @@ def _replace_observer_with_quantize_dequantize_node_decomposed(
                 choose_qparams_op_inputs.append(value)
             choose_qparams_node = graph.create_node(
                 "call_function",
-                torch.ops.quantized_decomposed.choose_qparams.tensor,
+                choose_qparams_op,
                 tuple(choose_qparams_op_inputs),
                 {}
             )
@@ -445,18 +454,6 @@ def _is_conversion_supported(activation_post_process: torch.nn.Module) -> bool:
         dtype == torch.float16
     )
 
-def _restore_state(
-        observed: torch.nn.Module
-) -> Tuple[Dict[str, Tuple[str, type]],
-           PrepareCustomConfig,
-           Set[str]]:
-    assert _is_observed_module(observed), \
-        'incoming model must be produced by prepare_fx'
-    prepare_custom_config: PrepareCustomConfig = observed._prepare_custom_config  # type: ignore[assignment]
-    node_name_to_scope: Dict[str, Tuple[str, type]] = observed._node_name_to_scope  # type: ignore[assignment]
-    observed_node_names: Set[str] = observed._observed_node_names  # type: ignore[assignment]
-    return node_name_to_scope, prepare_custom_config, observed_node_names
-
 def _has_none_qconfig(node: Argument, node_name_to_qconfig: Dict[str, QConfigAny]) -> bool:
     """ Check if a node has a qconfig of None, i.e. user requested to not quantize
     the node
@@ -607,8 +604,7 @@ def convert_standalone_module(
     observed_standalone_module : GraphModule = modules[str(node.target)]  # type: ignore[assignment]
     sm_input_quantized_idxs = \
         observed_standalone_module \
-        ._standalone_module_input_quantized_idxs\
-        .tolist()  # type: ignore[operator]
+        .meta["_observed_graph_module_attrs"].standalone_module_input_quantized_idxs
     # remove the dequantize nodes for inputs
     args = list(node.args)
     for idx in range(len(args)):
@@ -622,8 +618,7 @@ def convert_standalone_module(
     # add dequantize node for output
     sm_output_quantized_idxs = \
         observed_standalone_module \
-        ._standalone_module_output_quantized_idxs \
-        .tolist()  # type: ignore[operator]
+        .meta["_observed_graph_module_attrs"].standalone_module_output_quantized_idxs
     if len(sm_output_quantized_idxs) > 0:
         assert sm_output_quantized_idxs[0] == 0, "Currently only quantized"
         "output idxs = [0] is supported"
@@ -698,7 +693,7 @@ def convert_weighted_module(
     fused_module = None
     float_module = original_module
     # extract the inidividual float_module and fused module
-    if isinstance(original_module, torch.nn.intrinsic._FusedModule):
+    if isinstance(original_module, torch.ao.nn.intrinsic._FusedModule):
         fused_module = float_module
         float_module = fused_module[0]  # type: ignore[index]
 
@@ -820,6 +815,21 @@ def convert_custom_module(
             _remove_previous_dequantize_in_custom_module(node, inputs, graph)
             _remove_previous_dequantize_in_custom_module(node, hidden0, graph)
             _remove_previous_dequantize_in_custom_module(node, hidden1, graph)
+        elif _is_custom_module_mha(node, modules):
+            # Inputs are in the form (query, key, value)
+            # TODO: This is the first step in enabling the full fx custom module
+            # quantization path for MultiheadAttention, and only covers the inputs
+            # to the module.
+            # Additional handling is yet to be implemented for the outputs, similar
+            # to LSTM custom module
+            assert len(node.args) == 3
+            query, key, value = node.args
+            assert isinstance(query, Node)
+            assert isinstance(key, Node)
+            assert isinstance(value, Node)
+            _remove_previous_dequantize_in_custom_module(node, query, graph)
+            _remove_previous_dequantize_in_custom_module(node, key, graph)
+            _remove_previous_dequantize_in_custom_module(node, value, graph)
         else:
             # remove the previous dequant node to ensure the inputs are quantized
             arg = node.args[0]
@@ -899,8 +909,13 @@ def convert(
     if backend_config is None:
         backend_config = get_native_backend_config()
 
-    node_name_to_scope, prepare_custom_config, observed_node_names = _restore_state(model)
-    node_name_to_qconfig: Dict[str, QConfigAny] = model._node_name_to_qconfig  # type: ignore[assignment]
+    assert _is_observed_module(model), \
+        'incoming model must be produced by prepare_fx'
+    observed_graph_module_attrs = model.meta["_observed_graph_module_attrs"]
+    node_name_to_scope: Dict[str, Tuple[str, type]] = observed_graph_module_attrs.node_name_to_scope
+    prepare_custom_config: PrepareCustomConfig = observed_graph_module_attrs.prepare_custom_config
+    observed_node_names: Set[str] = observed_graph_module_attrs.observed_node_names
+    node_name_to_qconfig: Dict[str, QConfigAny] = observed_graph_module_attrs.node_name_to_qconfig  # type: ignore[assignment]
 
     # mapping from fully qualified module name to module instance
     # for example,
@@ -916,11 +931,11 @@ def convert(
     # TODO refactor this code once we update the prepare logic to have additional information on
     # which graph nodes have been observed and share that with convert to decide which observers to ignore.
     if qconfig_mapping:
-        prepare_qconfig_mapping: QConfigMapping = model._qconfig_mapping  # type: ignore[assignment]
+        prepare_qconfig_mapping: QConfigMapping = observed_graph_module_attrs.qconfig_mapping  # type: ignore[assignment]
         modules_copy = copy.deepcopy(modules)
 
-        if model._is_qat:
-            _update_qconfig_for_qat(qconfig_mapping, {})
+        if observed_graph_module_attrs.is_qat:
+            _update_qconfig_for_qat(qconfig_mapping, backend_config)
         _update_qconfig_for_fusion(model, qconfig_mapping)
 
         _compare_prepare_convert_qconfig_mappings(prepare_qconfig_mapping, qconfig_mapping)  # type: ignore[arg-type]
@@ -940,7 +955,7 @@ def convert(
     custom_module_classes = get_custom_module_class_keys(convert_custom_config.observed_to_quantized_mapping)
     custom_module_class_mapping = convert_custom_config.observed_to_quantized_mapping
 
-    if model._equalization_node_name_to_qconfig is not None:
+    if observed_graph_module_attrs.equalization_node_name_to_qconfig is not None:
         # If we want to do equalization then do the following:
         # Calculate the equalization scale, update the observers with the scaled
         # inputs, and scale the weight
@@ -1037,19 +1052,19 @@ def convert(
                     node, model.graph, modules, custom_module_class_mapping,
                     statically_quantized_custom_module_nodes)
 
-    preserved_attributes = set(convert_custom_config.preserved_attributes)
-    model = QuantizedGraphModule(model, copy.deepcopy(model.graph), preserved_attributes)
-
     # remove deadcode after converting observers to quant/dequant ops
     model.graph.eliminate_dead_code()
-    model.recompile()
+    model = GraphModule(model, model.graph)
 
     # TODO: maybe move this to quantize_fx.py
     if not is_reference:
         model = lower_to_fbgemm(model, node_name_to_qconfig, node_name_to_scope)
+
     # TODO: this looks hacky, we want to check why we need this and see if we can
     # remove this
     # removes qconfig and activation_post_process modules
     if _remove_qconfig_flag:
         _remove_qconfig(model)
+    model.delete_all_unused_submodules()
+    model.meta.pop("_observed_graph_module_attrs", None)
     return model
diff --git a/torch/ao/quantization/fx/fuse.py b/torch/ao/quantization/fx/fuse.py
index 241803f35c74..91b876997d10 100644
--- a/torch/ao/quantization/fx/fuse.py
+++ b/torch/ao/quantization/fx/fuse.py
@@ -4,9 +4,6 @@
     map_arg
 )
 from torch.fx.graph import Graph
-from .graph_module import (
-    FusedGraphModule
-)
 from .match_utils import (
     _is_match,
     MatchAllNode,
@@ -67,9 +64,7 @@ def fuse(
             "in a future version. Please pass in a BackendConfig instead.")
         backend_config = BackendConfig.from_dict(backend_config)
 
-    input_root = model
-    input_graph = model.graph
-    named_modules = dict(input_root.named_modules())
+    named_modules = dict(model.named_modules())
 
     if backend_config is None:
         backend_config = get_native_backend_config()
@@ -81,7 +76,9 @@ def fuse(
 
     # find fusion
     fusion_pairs = _find_matches(
-        input_root, input_graph, fusion_pattern_to_fuse_handler_cls)
+        model, model.graph, fusion_pattern_to_fuse_handler_cls)
+    # TODO: change this to inplace changes to graph, since we no longer construct
+    # new GraphModule anymore
     fused_graph = Graph()
     env: Dict[Any, Any] = {}
 
@@ -93,7 +90,7 @@ def default_root_node_getter(node_pattern):
             node_pattern = node_pattern[-1]
         return node_pattern[-1]
 
-    for node in input_graph.nodes:
+    for node in model.graph.nodes:
         maybe_last_node, pattern, matched_node_pattern, obj, node_to_subpattern = \
             fusion_pairs.get(node.name, (None, None, None, None, None))
         # get the corresponding subpattern for the current node
@@ -118,8 +115,7 @@ def default_root_node_getter(node_pattern):
             env[node.name] = fused_graph.node_copy(node, load_arg)
         # node matched in patterns and is not root is removed here
 
-    preserved_attributes = set(fuse_custom_config.preserved_attributes)
-    model = FusedGraphModule(input_root, fused_graph, preserved_attributes)
+    model = GraphModule(model, fused_graph)
     return model
 
 def _find_matches(
diff --git a/torch/ao/quantization/fx/graph_module.py b/torch/ao/quantization/fx/graph_module.py
index c239dc55c225..cc9187285ae6 100644
--- a/torch/ao/quantization/fx/graph_module.py
+++ b/torch/ao/quantization/fx/graph_module.py
@@ -30,7 +30,7 @@ def __deepcopy__(self, memo):
 class ObservedGraphModule(GraphModule):
 
     def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, preserved_attr_names: Set[str]):
-        self.preserved_attr_names = set([
+        self.preserved_attr_names = {
             '_activation_post_process_map',
             '_activation_post_process_indexes',
             '_patterns',
@@ -40,7 +40,7 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, p
             '_node_name_to_scope',
             '_qconfig_mapping',
             '_is_qat',
-            '_observed_node_names']).union(preserved_attr_names)
+            '_observed_node_names'}.union(preserved_attr_names)
         preserved_attrs = {attr: getattr(root, attr) for attr in self.preserved_attr_names if hasattr(root, attr)}
         super().__init__(root, graph)
         for attr in preserved_attrs:
@@ -55,13 +55,18 @@ def __deepcopy__(self, memo):
         return ObservedGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names))
 
 def _is_observed_module(module: Any) -> bool:
-    return isinstance(module, ObservedGraphModule)
+    return hasattr(module, "meta") and "_observed_graph_module_attrs" in module.meta
+
+def _get_observed_graph_module_attr(model: Union[torch.nn.Module, GraphModule], attr_name: str) -> Any:
+    if hasattr(model, "meta") and "_observed_graph_module_attrs" in model.meta:  # type: ignore[operator, index]
+        return getattr(model.meta["_observed_graph_module_attrs"], attr_name)  # type: ignore[index]
+    return None
 
 class ObservedStandaloneGraphModule(ObservedGraphModule):
     def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, preserved_attr_names: Set[str]):
-        preserved_attr_names = preserved_attr_names.union(set([
+        preserved_attr_names = preserved_attr_names.union({
             "_standalone_module_input_quantized_idxs",
-            "_standalone_module_output_quantized_idxs"]))
+            "_standalone_module_output_quantized_idxs"})
         super().__init__(root, graph, preserved_attr_names)
 
     def __deepcopy__(self, memo):
@@ -70,7 +75,7 @@ def __deepcopy__(self, memo):
         return ObservedStandaloneGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names))
 
 def _is_observed_standalone_module(module: Any) -> bool:
-    return isinstance(module, ObservedStandaloneGraphModule)
+    return _is_observed_module(module) and module.meta["_observed_graph_module_attrs"].is_observed_standalone_module
 
 def _save_packed_weight(self, destination, prefix, keep_vars):
     for attr_name in dir(self):
diff --git a/torch/ao/quantization/fx/lower_to_fbgemm.py b/torch/ao/quantization/fx/lower_to_fbgemm.py
index e08efc3104c3..ef58652b1add 100644
--- a/torch/ao/quantization/fx/lower_to_fbgemm.py
+++ b/torch/ao/quantization/fx/lower_to_fbgemm.py
@@ -1,15 +1,15 @@
 from ._lower_to_native_backend import _lower_to_native_backend
-from .graph_module import QuantizedGraphModule
 from ..qconfig import QConfigAny
+from torch.fx import GraphModule
 from typing import Dict, Tuple
 
 __all__ = ['lower_to_fbgemm']
 
 def lower_to_fbgemm(
-    model: QuantizedGraphModule,
+    model: GraphModule,
     qconfig_map: Dict[str, QConfigAny],
     node_name_to_scope: Dict[str, Tuple[str, type]]
-) -> QuantizedGraphModule:
+) -> GraphModule:
     """ Lower a quantized reference model (with reference quantized operator patterns)
     to fbgemm
     """
diff --git a/torch/ao/quantization/fx/lower_to_qnnpack.py b/torch/ao/quantization/fx/lower_to_qnnpack.py
index 1ceccc66c480..a3a82179789d 100644
--- a/torch/ao/quantization/fx/lower_to_qnnpack.py
+++ b/torch/ao/quantization/fx/lower_to_qnnpack.py
@@ -1,6 +1,6 @@
 from ._lower_to_native_backend import _lower_to_native_backend
-from .graph_module import QuantizedGraphModule
 from ..qconfig import QConfigAny
+from torch.fx import GraphModule
 from typing import Dict, Tuple
 
 __all__ = [
@@ -8,10 +8,10 @@
 ]
 
 def lower_to_qnnpack(
-    model: QuantizedGraphModule,
+    model: GraphModule,
     qconfig_map: Dict[str, QConfigAny],
     node_name_to_scope: Dict[str, Tuple[str, type]]
-) -> QuantizedGraphModule:
+) -> GraphModule:
     """ Lower a quantized reference model (with reference quantized operator patterns)
     to qnnpack
     """
diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index 289836c6a2c2..6f5d242d5293 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -47,11 +47,6 @@
     node_supports_equalization,
 )
 
-from .graph_module import (
-    ObservedGraphModule,
-    ObservedStandaloneGraphModule,
-)
-
 from .pattern_utils import (
     _sorted_patterns_dict,
 )
@@ -61,7 +56,6 @@
     _find_matches,
 )
 
-from ..utils import _parent_name
 from .utils import (
     _insert_dequant_stubs_for_custom_module_lstm_output,
     _is_custom_module_lstm,
@@ -75,13 +69,18 @@
     node_arg_is_weight,
     node_arg_is_bias,
     NON_QUANTIZABLE_WEIGHT_OPS,
+    ObservedGraphModuleAttrs,
 )
 
+from torch.ao.quantization import (
+    PlaceholderObserver
+)
 from torch.ao.quantization.quantize import (
     convert
 )
 
 from ..utils import (
+    _parent_name,
     get_qconfig_dtypes,
     get_swapped_custom_module_class,
     activation_is_statically_quantized,
@@ -104,7 +103,7 @@
 
 from torch._subclasses import FakeTensor
 
-from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union, Callable
 
 
 __all__ = [
@@ -117,10 +116,33 @@
 # list of dtypes to not add observers to
 _DO_NOT_OBS_DTYPE_LIST = [int, float, torch.bool, None]
 
+# note: the following default target dtype info dicts are temporary,
+# should be moved to the new programmable API class soon
+_DEFAULT_FP32_QCONFIG_FOR_TARGET_DTYPE_INFO = {
+    "input_act_obs_or_fq_ctr": torch.ao.quantization.qconfig._default_fp32_placeholder_qconfig.activation,
+    "output_act_obs_or_fq_ctr": torch.ao.quantization.qconfig._default_fp32_placeholder_qconfig.activation
+}
+
+_DEFAULT_QUINT8_QCONFIG_FOR_TARGET_DTYPE_INFO = {
+    "input_act_obs_or_fq_ctr": torch.ao.quantization.qconfig._default_quint8_placeholder_qconfig.activation,
+    "output_act_obs_or_fq_ctr": torch.ao.quantization.qconfig._default_quint8_placeholder_qconfig.activation
+}
+
 def _is_activation_post_process_node(node: Node, named_modules: Dict[str, torch.nn.Module]) -> bool:
     return isinstance(node, torch.fx.Node) and node.op == "call_module" and \
         _is_activation_post_process(named_modules[str(node.target)])
 
+def _get_dtype_and_is_dynamic(obs_or_fq_ctr: Optional[Callable]) -> Tuple[Optional[torch.dtype], bool]:
+    """ Given a constructor for observer or fake quant module, returns
+    a Tuple of dtype and is_dynamic
+    """
+    # TODO: instead of instantiating the instance, we can use inspect to get the default args
+    if obs_or_fq_ctr is None:
+        return None, False
+    else:
+        obs_or_fq = obs_or_fq_ctr()
+        return obs_or_fq.dtype, getattr(obs_or_fq, "is_dynamic", False)
+
 def _is_input_arg_dtype_supported_by_backend(
     arg: Argument,
     node: Node,
@@ -142,11 +164,8 @@ def _is_input_arg_dtype_supported_by_backend(
     is_bias = node_arg_is_bias(node, arg, backend_config)
     is_activation = not is_weight and not is_bias
     if is_activation:
-        qconfig_info = node.meta["target_dtype_info"].get("input_activation_dtype")
-        if qconfig_info is not None:
-            qconfig_dtype, qconfig_is_dynamic = qconfig_info
-        else:
-            qconfig_dtype, qconfig_is_dynamic = None, None
+        input_act_obs_or_fq_ctr = node.meta["target_dtype_info"].get("input_act_obs_or_fq_ctr")
+        qconfig_dtype, qconfig_is_dynamic = _get_dtype_and_is_dynamic(input_act_obs_or_fq_ctr)
         # TODO(future PR): remove the cast to bool below after figuring
         # out why backend_config has is_dynamic set to None in some cases.
         return (dtype_config.input_dtype is None) or (
@@ -156,19 +175,19 @@ def _is_input_arg_dtype_supported_by_backend(
         )
     elif is_weight:
         # TODO: move dtype check into `_qconfig_satisfies_dtype_config_constraints` as well
-        weight_dtype = dtype_config.weight_dtype
-        dtype_matches = "weight_dtype" in node.meta["target_dtype_info"] and \
-            node.meta["target_dtype_info"]["weight_dtype"][0] == weight_dtype  # type: ignore[index]
+        weight_obs_or_fq_ctr = node.meta["target_dtype_info"].get("weight_obs_or_fq_ctr", None)
+        qconfig_weight_dtype, _ = _get_dtype_and_is_dynamic(weight_obs_or_fq_ctr)
+        backend_config_weight_dtype = dtype_config.weight_dtype
+        dtype_matches = qconfig_weight_dtype == backend_config_weight_dtype
         qconfig_satisfies_constraints = _qconfig_satisfies_dtype_config_constraints(
             qconfig, dtype_config.weight_dtype_with_constraints, is_activation=False)
-        return weight_dtype is None or (dtype_matches and qconfig_satisfies_constraints)
+        return backend_config_weight_dtype is None or (dtype_matches and qconfig_satisfies_constraints)
     else:  # bias
-        bias_dtype = dtype_config.bias_dtype
-        return bias_dtype is None or \
-            (
-                "bias_dtype" in node.meta["target_dtype_info"] and
-                node.meta["target_dtype_info"]["bias_dtype"][0] == bias_dtype  # type: ignore[index]
-            )
+        # TODO: move dtype check into `_qconfig_satisfies_dtype_config_constraints` as well
+        bias_obs_or_fq_ctr = node.meta["target_dtype_info"].get("bias_obs_or_fq_ctr", None)
+        qconfig_bias_dtype, _ = _get_dtype_and_is_dynamic(bias_obs_or_fq_ctr)
+        backend_config_bias_dtype = dtype_config.bias_dtype
+        return backend_config_bias_dtype is None or qconfig_bias_dtype == backend_config_bias_dtype
 
 def _is_output_dtype_supported_by_backend(
     node: Node,
@@ -178,11 +197,23 @@ def _is_output_dtype_supported_by_backend(
     """ Check if the configured qconfig for the output
     is supported by the backend or not
     """
-    output_dtype = dtype_config.output_dtype
-    dtype_matches = node.meta["target_dtype_info"]["output_activation_dtype"][0] == output_dtype  # type: ignore[index]
+    # TODO: move dtype check into `_qconfig_satisfies_dtype_config_constraints` as well
+    backend_config_output_dtype = dtype_config.output_dtype
+    # TODO: we should check is_dynamic here as well, the code from _is_input_arg_dtype_supported_by_backend
+    # from input activation check can be reused here
+    qconfig_output_dtype = None
+    output_act_obs_or_fq_ctr = node.meta["target_dtype_info"].get("output_act_obs_or_fq_ctr")
+    qconfig_output_dtype, qconfig_output_is_dynamic = _get_dtype_and_is_dynamic(output_act_obs_or_fq_ctr)
+    # TODO: this is a hack because we can only specify one activation_obs_or_fq for
+    # qconfig (qconfig.activation), and we are only supporting dynamically quantized
+    # linear op which has fp32 output dtype, this should be removed if we generalize
+    # the structure of qconfig in the future
+    if qconfig_output_is_dynamic:
+        qconfig_output_dtype = torch.float32
+    dtype_matches = qconfig_output_dtype == backend_config_output_dtype
     qconfig_satisfies_constraints = _qconfig_satisfies_dtype_config_constraints(
         qconfig, dtype_config.output_dtype_with_constraints)
-    return output_dtype is None or (dtype_matches and qconfig_satisfies_constraints)
+    return backend_config_output_dtype is None or (dtype_matches and qconfig_satisfies_constraints)
 
 def _is_observer_in_same_graph(node: Node, named_modules: Dict[str, torch.nn.Module]):
     """ Check if observer in same graph
@@ -295,14 +326,61 @@ def _insert_observer(
             'call_module', observer_name, (node,), {})
     return new_obs
 
+def _set_target_dtype_info_for_matched_node_pattern(
+    matched_node_pattern: NodePattern,
+    last_node: Node,
+    qconfig: QConfigAny,
+    backend_config: BackendConfig,
+    named_modules: Dict[str, torch.nn.Module],
+    cache_for_no_tensor_check: Dict[Node, bool],
+    processed_nodes: Set[Node],
+) -> None:
+    """ Sets the target_dtype_info for each node in matched_node_pattern
+    Note: processed_nodes is used to ensure we only process each node once
+    """
+    if isinstance(matched_node_pattern, (list, tuple)):
+        for node_pattern in matched_node_pattern:
+            _set_target_dtype_info_for_matched_node_pattern(
+                node_pattern,
+                last_node,
+                qconfig,
+                backend_config,
+                named_modules,
+                cache_for_no_tensor_check,
+                processed_nodes
+            )
+
+    # set target_dtype_info if matched_node_pattern is a Node
+    # other types of matched object, e.g. int, float literals, are ignored
+    elif isinstance(matched_node_pattern, Node):
+        # for pyre
+        assert isinstance(matched_node_pattern, Node)
+        node = matched_node_pattern
+        if node in processed_nodes:
+            return
+        processed_nodes.add(node)
+
+        if qconfig is None:
+            return
+        # TODO: refactor the following code in terms of apply a qconfig to a pattern
+        # e.g. for a pattern with op1 -> op2 -> op3, and qconfig = QConfig(input_act=obs0, output_act=obs1)
+        # we set the input_obs_or_fq_ctr for the arguments of op1 to based on qconfig.input_act,
+        # and set output_obs_or_fq_ctr based on qconfig.output_act
+        # this also requires we extend the structure of QConfig to support more fine
+        # grained configurations
+        target_dtype_info: Dict[str, Optional[Tuple[Union[torch.dtype, type], bool]]] = (
+            _get_target_activation_dtype_for_node(
+                node,
+                qconfig,
+                named_modules,
+                cache_for_no_tensor_check,
+            )
+        )
+        node.meta["target_dtype_info"] = target_dtype_info
+
 def _get_target_activation_dtype_for_node(
     node: Node,
     qconfig: QConfigAny,
-    inputs_seen_counter: int,
-    outputs_seen_counter: int,
-    input_quantized_idxs: List[int],
-    output_quantized_idxs: List[int],
-    qhandler: Optional[QuantizeHandler],
     named_modules: Dict[str, torch.nn.Module],
     cache_for_no_tensor_check: Dict[Node, bool],
 ) -> Dict[str, Optional[Tuple[Union[torch.dtype, type], bool]]]:
@@ -323,92 +401,49 @@ def _get_target_activation_dtype_for_node(
     Then this function will return
 
       {
-        'input_activation': {'dtype': torch.quint8, is_dynamic: False},
-        'output_activation': {'dtype': torch.quint8, is_dynamic: True},
+        "input_act_obs_or_fq_ctr": MinMaxObserver.with_args(dtype=torch.quint8, is_dynamic=False),
+        "output_act_obs_or_fq_ctr": MinMaxObserver.with_args(dtype=torch.quint8, is_dynamic=False),
       }
 
     TODO(future PR, if needed): explicitly spell out the non-Tensor
     dtypes.
     """
-    if node.op == 'placeholder':
-        if inputs_seen_counter in input_quantized_idxs:
-            return {
-                "input_activation_dtype": (torch.quint8, False),
-                "output_activation_dtype": (torch.quint8, False),
-            }
-        else:
-            # if dtype is fp32 (default), do nothing
-            # note: other dtypes are not supported
-            return {
-                "input_activation_dtype": (torch.float, False),
-                "output_activation_dtype": (torch.float, False),
-            }
-
-    elif node.op in ('call_module', 'call_method', 'call_function'):
-        args_have_no_tensors = \
-            all_node_args_have_no_tensors(
-                node, named_modules, cache_for_no_tensor_check)
-        if args_have_no_tensors:
-            return {
-                "input_activation_dtype": None,
-                "output_activation_dtype": None,
-            }
-
-        # get qconfig to determine the eventual dtype of this node
-        if qconfig is not None:
-            if qhandler is not None and qhandler.input_output_observed():
-                act_dtype, weight_dtype, input_act_is_dynamic = \
-                    get_qconfig_dtypes(qconfig)
-
-                # Currently `QConfig` only has one `activation` field.
-                # For static quantization, it is reused for both input
-                # and output activation. For dynamic quantization, this
-                # field is currently only used for the input activation,
-                # with the output activation being in fp32.
-                # In the future this may change as we add more fields
-                # to the `QConfig` object.
-                output_act_dtype = act_dtype \
-                    if (not input_act_is_dynamic) else torch.float
-
-                bias_dtype = torch.float16 \
-                    if (
-                        act_dtype == torch.float16
-                        and weight_dtype == torch.float16
-                        and (not input_act_is_dynamic)
-                    ) else torch.float
-                return {
-                    "input_activation_dtype": (act_dtype, input_act_is_dynamic),
-                    "weight_dtype": (weight_dtype, False),
-                    "bias_dtype": (bias_dtype, False),
-                    "output_activation_dtype": (output_act_dtype, False),
-                }
+    args_have_no_tensors = \
+        all_node_args_have_no_tensors(
+            node, named_modules, cache_for_no_tensor_check)
+    if args_have_no_tensors:
         return {
-            "input_activation_dtype": (torch.float, False),
-            "output_activation_dtype": (torch.float, False),
+            "input_act_obs_or_fq_ctr": None,
+            "output_act_obs_or_fq_ctr": None,
         }
-
-    elif node.op == 'get_attr':
+    # get qconfig to determine the eventual dtype of this node
+    if qconfig is not None:
+        act_dtype, weight_dtype, input_act_is_dynamic = \
+            get_qconfig_dtypes(qconfig)
+
+        # Currently `QConfig` only has one `activation` field.
+        # For static quantization, it is reused for both input
+        # and output activation. For dynamic quantization, this
+        # field is currently only used for the input activation,
+        # with the output activation being in fp32.
+        # In the future this may change as we add more fields
+        # to the `QConfig` object.
+        output_act_dtype = act_dtype \
+            if (not input_act_is_dynamic) else torch.float
+
+        bias_dtype = torch.float16 \
+            if (
+                act_dtype == torch.float16
+                and weight_dtype == torch.float16
+                and (not input_act_is_dynamic)
+            ) else torch.float
         return {
-            "input_activation_dtype": (torch.float, False),
-            "output_activation_dtype": (torch.float, False),
+            "input_act_obs_or_fq_ctr": qconfig.activation,
+            "weight_obs_or_fq_ctr": qconfig.weight,
+            "bias_obs_or_fq_ctr": PlaceholderObserver.with_args(dtype=bias_dtype),
+            "output_act_obs_or_fq_ctr": qconfig.activation,
         }
-
-    elif node.op == 'output':
-        if outputs_seen_counter in output_quantized_idxs:
-            return {
-                "input_activation_dtype": (torch.quint8, False),
-                "output_activation_dtype": (torch.quint8, False),
-            }
-        else:
-            # if dtype is fp32 (default), do nothing
-            # note: other dtypes are not supported
-            return {
-                "input_activation_dtype": (torch.float, False),
-                "output_activation_dtype": (torch.float, False),
-            }
-
-    else:
-        raise AssertionError(f'need to handle {node.format_node()}')
+    return copy.copy(_DEFAULT_FP32_QCONFIG_FOR_TARGET_DTYPE_INFO)
 
 def _get_arg_target_dtype_as_output(
     arg: Node,
@@ -426,19 +461,19 @@ def _get_arg_target_dtype_as_output(
     # the specific nodes we added in order to reach the original LSTM node. Otherwise, we would
     # not be able to accurately detect whether this node is a consumer of custom module LSTM.
     custom_module_lstm_node = _maybe_get_custom_module_lstm_from_node_arg(arg, named_modules)
+    output_act_obs_or_fq_ctr = None
     if custom_module_lstm_node is not None:
-        return custom_module_lstm_node.meta["target_dtype_info"]["output_activation_dtype"][0]  # type: ignore[index]
+        output_act_obs_or_fq_ctr = custom_module_lstm_node.meta["target_dtype_info"]["output_act_obs_or_fq_ctr"]
     elif _is_activation_post_process_node(arg, named_modules):
         observed_arg = arg.args[0]
         assert isinstance(observed_arg, Node), "Currently we only support observing Node"
-        return observed_arg.meta["target_dtype_info"]["output_activation_dtype"][0]  # type: ignore[index]
+        output_act_obs_or_fq_ctr = observed_arg.meta["target_dtype_info"]["output_act_obs_or_fq_ctr"]
     else:
-        output_act_dtype_info = \
-            arg.meta["target_dtype_info"]["output_activation_dtype"]
-        if output_act_dtype_info is not None:
-            return output_act_dtype_info[0]
-        else:
-            return None
+        output_act_obs_or_fq_ctr = \
+            arg.meta["target_dtype_info"]["output_act_obs_or_fq_ctr"]
+    output_act_dtype, _ = _get_dtype_and_is_dynamic(output_act_obs_or_fq_ctr)
+    # TODO: should support is_dynamic here as well
+    return output_act_dtype
 
 def _get_arg_target_dtype_as_input_to_node(
     arg: Node,
@@ -454,14 +489,20 @@ def _get_arg_target_dtype_as_input_to_node(
     is_bias = node_arg_is_bias(node, arg, backend_config)
     is_activation = not is_weight and not is_bias
     if is_activation:
-        return node.meta["target_dtype_info"]["input_activation_dtype"][0]  # type: ignore[index]
+        input_act_obs_or_fq_ctr = node.meta["target_dtype_info"].get("input_act_obs_or_fq_ctr")
+        qconfig_dtype, _ = _get_dtype_and_is_dynamic(input_act_obs_or_fq_ctr)
+        return qconfig_dtype
     elif is_weight:
         if node.target in NON_QUANTIZABLE_WEIGHT_OPS:
             return None
         else:
-            return node.meta["target_dtype_info"]["weight_dtype"][0]  # type: ignore[index]
+            weight_obs_or_fq_ctr = node.meta["target_dtype_info"].get("weight_obs_or_fq_ctr", None)
+            qconfig_weight_dtype, _ = _get_dtype_and_is_dynamic(weight_obs_or_fq_ctr)
+            return qconfig_weight_dtype
     else:
-        return node.meta["target_dtype_info"]["bias_dtype"][0]  # type: ignore[index]
+        bias_obs_or_fq_ctr = node.meta["target_dtype_info"].get("bias_obs_or_fq_ctr", None)
+        qconfig_bias_dtype, _ = _get_dtype_and_is_dynamic(bias_obs_or_fq_ctr)
+        return qconfig_bias_dtype
 
 def _get_arg_target_is_dynamic_as_input_to_node(
     arg: Node,
@@ -476,9 +517,10 @@ def _get_arg_target_is_dynamic_as_input_to_node(
     is_weight = node_arg_is_weight(node, arg, backend_config)
     is_bias = node_arg_is_bias(node, arg, backend_config)
     is_activation = not is_weight and not is_bias
-    if is_activation and \
-       "input_activation_dtype" in node.meta["target_dtype_info"]:
-        return node.meta["target_dtype_info"]["input_activation_dtype"][1]
+    if is_activation and "input_act_obs_or_fq_ctr" in node.meta["target_dtype_info"]:
+        input_act_obs_or_fq_ctr = node.meta["target_dtype_info"].get("input_act_obs_or_fq_ctr")
+        _, qconfig_is_dynamic = _get_dtype_and_is_dynamic(input_act_obs_or_fq_ctr)
+        return qconfig_is_dynamic
     else:
         return False
 
@@ -721,7 +763,7 @@ def _maybe_insert_output_observer_for_node(
     model: torch.nn.Module,
     named_modules: Dict[str, torch.nn.Module],
     graph: Graph,
-    matches: Dict[str, _MatchResultWithQConfig],
+    node_name_to_match_result_with_qconfig: Dict[str, _MatchResultWithQConfig],
     matched_pattern: Any,
     qhandler: Optional[QuantizeHandler],
     is_qat: bool,
@@ -732,7 +774,7 @@ def _maybe_insert_output_observer_for_node(
 
     If `node` does not need an output observer, returns None.
     """
-    root_node, _, pattern, qhandler, qconfig = matches.get(
+    root_node, _, pattern, qhandler, qconfig = node_name_to_match_result_with_qconfig.get(
         node.name, (None, None, None, None, None))
 
     if qhandler is None:
@@ -743,8 +785,9 @@ def _maybe_insert_output_observer_for_node(
 
     is_standalone_module = qhandler is not None and qhandler.is_standalone_module()
 
-    dtype, is_dynamic = node.meta["target_dtype_info"]["output_activation_dtype"]  # type: ignore[misc]
-    should_insert_observer = dtype not in _DO_NOT_OBS_DTYPE_LIST + [torch.float]
+    output_act_obs_or_fq_ctr = node.meta["target_dtype_info"].get("output_act_obs_or_fq_ctr")
+    qconfig_dtype, _ = _get_dtype_and_is_dynamic(output_act_obs_or_fq_ctr)
+    should_insert_observer = qconfig_dtype not in _DO_NOT_OBS_DTYPE_LIST + [torch.float]
     # TODO(future PR): move the following logic to
     # should_insert_observer_for_output
     should_insert_observer = should_insert_observer and \
@@ -860,27 +903,28 @@ def _recursive_maybe_replace_node_with_obs(
 def _maybe_propagate_dtype_for_node(
     node: Node,
     target_dtype: Union[torch.dtype, type],
-    matches: Dict[str, _MatchResultWithQConfig],
+    node_name_to_match_result_with_qconfig: Dict[str, _MatchResultWithQConfig],
 ) -> None:
     """
     Assigns `target_dtype` to `node`, setting `is_dynamic` to False. If `node`
     is a general tensor shape op, also call this function recursively on
     the first argument, to propagate the dtype to the caller.
     """
-    node.meta["target_dtype_info"]["input_activation_dtype"] = (target_dtype, False)
-    node.meta["target_dtype_info"]["output_activation_dtype"] = (target_dtype, False)
+    node.meta["target_dtype_info"]["input_act_obs_or_fq_ctr"] = None
+    node.meta["target_dtype_info"]["output_act_obs_or_fq_ctr"] = None
     # if this is a copy node, propagate to first arg
-    root_node, _, pattern, qhandler, qconfig = matches.get(
+    root_node, _, pattern, qhandler, qconfig = node_name_to_match_result_with_qconfig.get(
         node.name, (None, None, None, None, None))
+    # TODO: probably need to remove `is_general_tensor_value_op`
     if qhandler is not None and qhandler.is_general_tensor_value_op():
         prev_node = node.args[0]
         if isinstance(prev_node, Node):
             _maybe_propagate_dtype_for_node(
-                prev_node, target_dtype, matches)
+                prev_node, target_dtype, node_name_to_match_result_with_qconfig)
 
 def propagate_dtypes_for_known_nodes(
     graph: Graph,
-    matches: Dict[str, _MatchResultWithQConfig],
+    node_name_to_match_result_with_qconfig: Dict[str, _MatchResultWithQConfig],
 ) -> None:
     """
     Currently we assume that inputs to the graph are either `torch.float` or
@@ -903,7 +947,7 @@ def propagate_dtypes_for_known_nodes(
 
                 # when an argument is a tuple, it does not show up as another node so we need to go through
                 # all elements of the tuple manually
-                if isinstance(arg, tuple) or isinstance(arg, list):
+                if isinstance(arg, (tuple, list)):
                     arg_list = list(arg)
                 else:
                     arg_list = [arg]
@@ -912,7 +956,7 @@ def propagate_dtypes_for_known_nodes(
                     # hard coded arguments show up but aren't `Node` typed and do not need dtype propgated
                     if isinstance(cur_arg, torch.fx.node.Node):
                         _maybe_propagate_dtype_for_node(
-                            cur_arg, arg_type, matches)
+                            cur_arg, arg_type, node_name_to_match_result_with_qconfig)
 
 def _maybe_make_input_output_share_observers(
     node: Node,
@@ -1034,7 +1078,7 @@ def _swap_custom_module_to_observed(
 
 def insert_observers_for_model(
     model: GraphModule,
-    matches: Dict[str, _MatchResultWithQConfig],
+    node_name_to_match_result_with_qconfig: Dict[str, _MatchResultWithQConfig],
     node_name_to_qconfig: Dict[str, QConfigAny],
     prepare_custom_config: PrepareCustomConfig,
     equalization_config_map: Dict[str, Any],
@@ -1075,58 +1119,127 @@ def insert_observers_for_model(
     # node.meta["target_dtype_info"] stores the target dtype information
     # that's derived from qconfig for the Node, for example, if we have
     # a conv2d node that has a qconfig
-    # {
-    #   # information for input and bias node omitted
-    #   # for getattr node
-    #   # weight = getattr(self, 'weight')
-    #   weight.meta["target_dtype_info"] = {
-    #      'output_activation_dtype': (torch.float, False)
-    #   }
-    #   # Note: False means it's not a dynamic quantization (but a static quantization)
-    #   # for conv2d node
-    #   # conv2d = call_function[target=torch.nn.functional.conv2d](
-    #   #            args=(input, weight, bias))
-    #   conv2d.meta["target_dtype_info"] = {
-    #     'input_activation_dtype': (torch.quint8, False),
-    #     'weight_dtype': (torch.qint8, False),
-    #     'bias_dtype': (torch.float, False),
-    #     'output_activation_dtype': (torch.quint8, False),
-    #   }
+    # qconfig = QConfig(activation=..., weight=...)
+    # # information for input and bias node omitted
+    # # for getattr node
+    # # weight = getattr(self, 'weight')
+    # weight.meta["target_dtype_info"] = {
+    #    'output_act_obs_or_fq_ctr': qconfig.weight,
+    # }
+    # # for conv2d node
+    # # conv2d = call_function[target=torch.nn.functional.conv2d](
+    # #            args=(input, weight, bias))
+    # conv2d.meta["target_dtype_info"] = {
+    #   'input_act_obs_or_fq_ctr': qconfig.activation
+    #   'weight_obs_or_fq_ctr': qconfig.weight,
+    #   'bias_obs_or_fq_ctr': PlaceholderObserver.with_args(dtype=torch.float32),
+    #   'output_act_obs_or_fq_ctr': qconfig.activation,
+    # }
     #
     cache_for_no_tensor_check: Dict[Node, bool] = {}
 
-    inputs_seen_counter = 0
-    outputs_seen_counter = 0
-
     # first, populate the dtype map based only on qconfig and qhandler
     # this assumes:
     # graph inputs are fp32 by default, and int8 where overriden
     # other nodes output dtype is specified by the qconfig
     named_modules = dict(model.named_modules(remove_duplicate=False))
+
+    input_quantized_idxs: List[int] = prepare_custom_config.input_quantized_indexes
+    output_quantized_idxs: List[int] = prepare_custom_config.output_quantized_indexes
+    processed_nodes: Set[Node] = set()
+    # initalize target_dtype_info
+    for node in model.graph.nodes:
+        node.meta["target_dtype_info"] = copy.copy(_DEFAULT_FP32_QCONFIG_FOR_TARGET_DTYPE_INFO)
+
+    inputs_seen_counter = 0
+    outputs_seen_counter = 0
+    placeholder_node_to_input_index: Dict[Node, int] = {}
+    # TODO: we probably don't need this counter since each graph will only have
+    # one output node?
+    output_node_to_output_index: Dict[Node, int] = {}
     for node in model.graph.nodes:
-        root_node, _, pattern, qhandler, qconfig = matches.get(
-            node.name, (None, None, None, None, None))
-        input_quantized_idxs: List[int] = prepare_custom_config.input_quantized_indexes
-        output_quantized_idxs: List[int] = prepare_custom_config.output_quantized_indexes
-        target_dtype_info: Dict[str, Optional[Tuple[Union[torch.dtype, type], bool]]] = \
-            _get_target_activation_dtype_for_node(
-                node, qconfig, inputs_seen_counter, outputs_seen_counter,
-                input_quantized_idxs, output_quantized_idxs, qhandler,
-                named_modules, cache_for_no_tensor_check)
-        node.meta["target_dtype_info"] = target_dtype_info
         if node.op == "placeholder":
+            placeholder_node_to_input_index[node] = inputs_seen_counter
             inputs_seen_counter += 1
         if node.op == "output":
+            output_node_to_output_index[node] = outputs_seen_counter
             outputs_seen_counter += 1
 
-    # Second, for nodes with known input dtypes, propagate them throughout the
+    # Step 1, set the observer or fake quantize module constructor for each node in the
+    # matched_node_pattern
+
+    for node_name, match_res_with_qconfig in node_name_to_match_result_with_qconfig.items():
+        last_node, matched_node_pattern, pattern, qhandler, qconfig = match_res_with_qconfig
+        assert qhandler is not None
+        _set_target_dtype_info_for_matched_node_pattern(
+            matched_node_pattern,
+            last_node,
+            qconfig,
+            backend_config,
+            named_modules,
+            cache_for_no_tensor_check,
+            processed_nodes
+        )
+
+    # Step 2. Special cases for some operators, we might be able to remove them
+    # in the future if we know dtype information of each node better
+
+    # Step 2.1. some settings are not based on patterns, we need to process each node
+    # instead
+    for node in model.graph.nodes:
+        if node.op == "placeholder" and placeholder_node_to_input_index[node] in input_quantized_idxs:
+            # users are not supposed to call calculate_qparams on PlaceholderObserver, and
+            # this is OK because we are using this as a way to encode the dtypes of input
+            # tensor, we won't actually insert these observers in the graph and won't
+            # actually call calculate_qparams
+            node.meta["target_dtype_info"] = copy.copy(_DEFAULT_QUINT8_QCONFIG_FOR_TARGET_DTYPE_INFO)
+        elif node.op in ("call_module", "call_method", "call_function"):
+            args_have_no_tensors = \
+                all_node_args_have_no_tensors(
+                    node, named_modules, cache_for_no_tensor_check)
+            if args_have_no_tensors:
+                node.meta["target_dtype_info"] = {
+                    "input_act_obs_or_fq_ctr": None,
+                    "output_act_obs_or_fq_ctr": None,
+                }
+        elif node.op == "output" and output_node_to_output_index[node] in output_quantized_idxs:
+            node.meta["target_dtype_info"] = copy.copy(_DEFAULT_QUINT8_QCONFIG_FOR_TARGET_DTYPE_INFO)
+
+    # Step 2.2, for nodes with known input dtypes, propagate them throughout the
     # graph. For example, if there is a call such as
     #   x1 = x0.masked_fill(mask, 1)
     # we propagate the type of mask to be torch.bool
-    propagate_dtypes_for_known_nodes(model.graph, matches)
+    propagate_dtypes_for_known_nodes(model.graph, node_name_to_match_result_with_qconfig)
+
+    # Step 3, check if the requested target_dtype_info is supported by backend or not
+    # if not, we'll reset the target_dtye_info to use the default (float Tensor)
+
+    # reset the counters and set of processed_nodes
+    processed_nodes = set()
+    for node_name, match_res_with_qconfig in node_name_to_match_result_with_qconfig.items():
+        last_node, matched_node_pattern, pattern, qhandler, qconfig = match_res_with_qconfig
+        is_supported_by_backend = _is_pattern_dtype_config_and_qconfig_supported_by_backend(
+            pattern, matched_node_pattern, qconfig, backend_config)
+        assert qhandler is not None
+
+        # get output_act_dtype so that we don't also reset the special typed nodes
+        # TODO: we might want to handle these more uniformly with the default path
+        # this can be improved if we can use node.meta["val"]
+        output_act_dtype, _ = _get_dtype_and_is_dynamic(node.meta["target_dtype_info"]["output_act_obs_or_fq_ctr"])
+        if not is_supported_by_backend and output_act_dtype not in [None, int, float, torch.bool]:
+            # restore target_dtype_info to default if it is not supported by backend
+            _set_target_dtype_info_for_matched_node_pattern(
+                matched_node_pattern,
+                last_node,
+                torch.ao.quantization.qconfig._default_fp32_placeholder_qconfig,
+                backend_config,
+                named_modules,
+                cache_for_no_tensor_check,
+                processed_nodes
+            )
 
     # After this point, the current node and all of its arguments
-    # have a dtype assigned. Now, we insert observers for inputs
+    # have a target_dtype_info assigned. Now, we insert observers for inputs
     # of this node (if needed for this node), and the output of this node
     # (if needed for this node).
 
@@ -1137,11 +1250,13 @@ def insert_observers_for_model(
     # Avoid duplicates custom module swaps for multiple nodes with same target.
     custom_module_names_already_swapped: Set[str] = set()
 
+    # TODO: reuse placeholder_node_to_input_index and output_node_to_output_index
     # reset inputs/outputs counters
     inputs_seen_counter = 0
     outputs_seen_counter = 0
     results_node = None
 
+    # TODO: change this to insert obs/fq by pattern instead of by node
     for node in nodes_before_observation:
 
         if node.op == 'placeholder':
@@ -1152,8 +1267,9 @@ def insert_observers_for_model(
 
         elif node.op in ('call_module', 'call_method', 'call_function', 'output'):
             # check for matches
-            last_node, matched_node_pattern, pattern, qhandler, qconfig = matches.get(
-                node.name, (None, None, None, None, None))
+            last_node, matched_node_pattern, pattern, qhandler, qconfig = (
+                node_name_to_match_result_with_qconfig.get(node.name, (None, None, None, None, None))  # type: ignore[assignment]
+            )
             equalization_qconfig = equalization_config_map.get(node.name, None)
 
             this_node_dtype_info = node.meta["target_dtype_info"]
@@ -1172,6 +1288,10 @@ def insert_observers_for_model(
                 not node.op == 'output'
             )
 
+            # TODO: take a closer look to see if we can remove this check
+            # right now it is here because of `observed_node_names`, we are using
+            # it as an indicator for swapping the modules to reference modules in
+            # convert
             is_supported_by_backend = _is_pattern_dtype_config_and_qconfig_supported_by_backend(
                 pattern, matched_node_pattern, qconfig, backend_config)
 
@@ -1219,7 +1339,7 @@ def insert_observers_for_model(
                             prepare_custom_config,
                             backend_config)
 
-                        # Insert equalization input observers if needed
+                        # insert equalization input observers if needed
                         _maybe_insert_input_equalization_observers_for_node(
                             node, equalization_qconfig, model, named_modules, model.graph,
                             is_quantized_branch, backend_config)
@@ -1249,7 +1369,7 @@ def insert_observers_for_model(
                         else:
                             # this returns the new observer node if it was needed
                             maybe_output_obs_node = _maybe_insert_output_observer_for_node(
-                                node, model, named_modules, model.graph, matches,
+                                node, model, named_modules, model.graph, node_name_to_match_result_with_qconfig,
                                 pattern, qhandler, is_qat)
 
                             if maybe_output_obs_node is not None:
@@ -1315,7 +1435,7 @@ def _run_prepare_fx_on_standalone_modules(
     model: torch.nn.Module,
     is_qat: bool,
     named_modules: Dict[str, torch.nn.Module],
-    matches: Any,
+    node_name_to_match_result_with_qconfig: Any,
     prepare_custom_config: PrepareCustomConfig,
     backend_config: BackendConfig,
 ) -> None:
@@ -1327,7 +1447,7 @@ def _run_prepare_fx_on_standalone_modules(
     for (
         node_name,
         (root_node, _, pattern, qhandler, qconfig),
-    ) in matches.items():
+    ) in node_name_to_match_result_with_qconfig.items():
         if qhandler is None:
             continue
         elif not qhandler.is_standalone_module():
@@ -1348,13 +1468,8 @@ def _run_prepare_fx_on_standalone_modules(
                 example_inputs=sm_example_inputs,
                 prepare_custom_config=sm_prepare_custom_config,
                 backend_config=sm_backend_config)
-        preserved_attributes = set(sm_prepare_custom_config.preserved_attributes)
-        observed_standalone_module = ObservedStandaloneGraphModule(
-            observed_standalone_module, observed_standalone_module.graph,
-            preserved_attributes)
         parent_name, name = _parent_name(root_node.target)
-        setattr(named_modules[parent_name], name,
-                observed_standalone_module)
+        setattr(named_modules[parent_name], name, observed_standalone_module)
         named_modules[root_node.target] = observed_standalone_module
 
 def _save_state(
@@ -1367,13 +1482,17 @@ def _save_state(
     is_qat: bool,
     observed_node_names: Set[str],
 ) -> None:
-    observed._node_name_to_qconfig = node_name_to_qconfig  # type: ignore[assignment]
-    observed._prepare_custom_config = prepare_custom_config  # type: ignore[assignment]
-    observed._node_name_to_scope = node_name_to_scope  # type: ignore[assignment]
-    observed._equalization_node_name_to_qconfig = equalization_node_name_to_qconfig  # type: ignore[assignment]
-    observed._qconfig_mapping = qconfig_mapping  # type: ignore[assignment]
-    observed._is_qat = is_qat  # type: ignore[assignment]
-    observed._observed_node_names = observed_node_names  # type: ignore[assignment]
+    observed.meta["_observed_graph_module_attrs"] = (
+        ObservedGraphModuleAttrs(
+            node_name_to_qconfig=node_name_to_qconfig,
+            node_name_to_scope=node_name_to_scope,
+            prepare_custom_config=prepare_custom_config,
+            equalization_node_name_to_qconfig=equalization_node_name_to_qconfig,
+            qconfig_mapping=qconfig_mapping,
+            is_qat=is_qat,
+            observed_node_names=observed_node_names,
+        )
+    )
 
 def prepare(
         model: GraphModule,
@@ -1384,7 +1503,7 @@ def prepare(
         prepare_custom_config: Union[PrepareCustomConfig, Dict[str, Any], None] = None,
         _equalization_config: Union[QConfigMapping, Dict[str, Any], None] = None,
         backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
-        is_standalone_module: bool = False) -> ObservedGraphModule:
+        is_standalone_module: bool = False) -> GraphModule:
     """ standalone_module means it a submodule that is not inlined in
     parent module, and will be quantized separately as one unit.
 
@@ -1395,12 +1514,15 @@ def prepare(
         The scope is a tuple of fully qualified path of the module and the type of the module
     Returns:
         model(GraphModule): prepared standalone module
-        attributes:
-            _standalone_module_input_quantized_idxs(List[Int]): a list of
+        attributes related to standalone module
+        in model.meta["_observed_graph_module_attrs"]:
+            is_observed_standalone_module (bool): boolean value that shows whether the
+            current model is a observed standalone module or not
+            standalone_module_input_quantized_idxs(List[Int]): a list of
                 indexes for the graph input that is expected to be quantized,
                 same as input_quantized_idxs configuration provided
                 for the standalone module
-            _standalone_module_output_quantized_idxs(List[Int]): a list of
+            standalone_module_output_quantized_idxs(List[Int]): a list of
                 indexs for the graph output that is quantized
                 same as input_quantized_idxs configuration provided
                 for the standalone module
@@ -1468,7 +1590,7 @@ def prepare(
     if is_qat:
         module_to_qat_module = get_module_to_qat_module(backend_config)
         _qat_swap_modules(model, module_to_qat_module)
-        _update_qconfig_for_qat(qconfig_mapping, {})
+        _update_qconfig_for_qat(qconfig_mapping, backend_config)
 
     # mapping from fully qualified module name to module instance
     # for example,
@@ -1494,13 +1616,13 @@ def prepare(
         standalone_module_names, standalone_module_classes, custom_module_classes)
 
     # map qconfig instances to matches
-    matches = {}
+    node_name_to_match_result_with_qconfig = {}
     for node_name, match_without_qconfig in matches_without_qconfig.items():
         match_with_qconfig = (*match_without_qconfig, node_name_to_qconfig[node_name])
-        matches[node_name] = match_with_qconfig
+        node_name_to_match_result_with_qconfig[node_name] = match_with_qconfig
 
     _run_prepare_fx_on_standalone_modules(
-        model, is_qat, named_modules, matches, prepare_custom_config, backend_config)
+        model, is_qat, named_modules, node_name_to_match_result_with_qconfig, prepare_custom_config, backend_config)
 
     # record names for the set of observed node, so that in convert step
     # we know whether we need to convert a floating point module to reference
@@ -1509,7 +1631,7 @@ def prepare(
 
     result_node = insert_observers_for_model(
         model,
-        matches,
+        node_name_to_match_result_with_qconfig,
         node_name_to_qconfig,
         prepare_custom_config,
         equalization_node_name_to_qconfig,
@@ -1517,12 +1639,12 @@ def prepare(
         observed_node_names,
         is_qat
     )
+    model = GraphModule(model, model.graph)
 
     _save_state(model, node_name_to_qconfig, node_name_to_scope,
-                prepare_custom_config, equalization_node_name_to_qconfig, qconfig_mapping, is_qat, observed_node_names)
+                prepare_custom_config, equalization_node_name_to_qconfig,
+                qconfig_mapping, is_qat, observed_node_names)
 
-    preserved_attributes = set(prepare_custom_config.preserved_attributes)
-    model = ObservedGraphModule(model, model.graph, preserved_attributes)
     if is_standalone_module:
         assert result_node is not None
         assert isinstance(result_node.args[0], Node), \
@@ -1533,7 +1655,11 @@ def prepare(
         # Union[Tensor, Module]
         input_quantized_idxs: List[int] = prepare_custom_config.input_quantized_indexes
         output_quantized_idxs: List[int] = prepare_custom_config.output_quantized_indexes
-        model._standalone_module_input_quantized_idxs = \
-            torch.tensor(input_quantized_idxs)
-        model._standalone_module_output_quantized_idxs = torch.tensor(output_quantized_idxs)
+        observed_graph_module_attrs = model.meta["_observed_graph_module_attrs"]
+        # inplace modification
+        observed_graph_module_attrs.is_observed_standalone_module = True
+        observed_graph_module_attrs.standalone_module_input_quantized_idxs = \
+            input_quantized_idxs
+        observed_graph_module_attrs.standalone_module_output_quantized_idxs = \
+            output_quantized_idxs
     return model
diff --git a/torch/ao/quantization/fx/qconfig_mapping_utils.py b/torch/ao/quantization/fx/qconfig_mapping_utils.py
index c780ace51b14..15d2a94b8304 100644
--- a/torch/ao/quantization/fx/qconfig_mapping_utils.py
+++ b/torch/ao/quantization/fx/qconfig_mapping_utils.py
@@ -8,8 +8,12 @@
     _is_activation_post_process,
 )
 from torch.ao.quantization.backend_config import (
+    BackendConfig,
     DTypeConfig,
 )
+from torch.ao.quantization.backend_config.utils import (
+    get_module_to_qat_module,
+)
 
 from torch.fx import (
     GraphModule,
@@ -17,12 +21,11 @@
 from torch.fx.graph import (
     Graph,
 )
-from torch.nn.intrinsic import _FusedModule
+from torch.ao.nn.intrinsic import _FusedModule
 
 from ..utils import (
     _parent_name,
     get_qconfig_dtypes,
-    get_combined_dict
 )
 from ..qconfig_mapping import (
     _OBJECT_TYPE_DICT_KEY,
@@ -30,10 +33,6 @@
     _MODULE_NAME_REGEX_DICT_KEY,
     QConfigMapping,
 )
-from ..quantization_mappings import (
-    get_default_qat_module_mappings,
-)
-
 
 __all__: List[str] = []
 
@@ -331,15 +330,14 @@ def _get_flattened_qconfig_dict(qconfig_mapping: QConfigMapping) -> Dict[Union[C
 
 def _update_qconfig_for_qat(
         qconfig_mapping: QConfigMapping,
-        additional_qat_module_mapping: Dict[Callable, Callable]):
+        backend_config: BackendConfig):
     """
-    Update the qconfig_dict to account for module swaps during QAT.
+    Update the qconfig_mapping to account for module swaps during QAT.
     During QAT we perform a module swap on the nn.Module types to the corresponding nn.qat.modules types.
     """
-    all_qat_mappings = get_combined_dict(
-        get_default_qat_module_mappings(), additional_qat_module_mapping)
+    module_to_qat_module_class = get_module_to_qat_module(backend_config)
     object_type_dict = qconfig_mapping.object_type_qconfigs
     new_object_type_dict = object_type_dict.copy()
     for k, v in new_object_type_dict.items():
-        if k in all_qat_mappings:
-            object_type_dict[all_qat_mappings[k]] = v
+        if k in module_to_qat_module_class:
+            object_type_dict[module_to_qat_module_class[k]] = v
diff --git a/torch/ao/quantization/fx/quantize_handler.py b/torch/ao/quantization/fx/quantize_handler.py
index 473cc0d9f895..57e3c97411a5 100644
--- a/torch/ao/quantization/fx/quantize_handler.py
+++ b/torch/ao/quantization/fx/quantize_handler.py
@@ -77,15 +77,6 @@ def __init__(
                             arg, self.modules, cache_for_no_tensor_check)):
                     self.num_tensor_args += 1
 
-    # TODO: can remove after the is_dynamic flag is defined, so that we can
-    # move embedding op to backend_config_dict
-    def input_output_observed(self) -> bool:
-        """
-        Returns True if the pattern matched to this qhandler could be
-        be observed, and False it it should not be observed.
-        """
-        return True
-
     def is_general_tensor_value_op(self) -> bool:
         """
         Returns True if the operator works for both floating point and
@@ -112,8 +103,7 @@ def is_standalone_module(self):
 def _get_quantize_handler_cls(
         observation_type: ObservationType,
         dtype_configs: List[DTypeConfig],
-        num_tensor_args_to_observation_type: Dict[int, ObservationType],
-        input_output_observed: bool) -> Type[QuantizeHandler]:
+        num_tensor_args_to_observation_type: Dict[int, ObservationType]) -> Type[QuantizeHandler]:
     """
     Return a configurable QuantizeHandler that matches the given specifications from the backend.
     """
@@ -133,15 +123,10 @@ def __init__(
             else:
                 self.observation_type = observation_type
             self.dtype_configs = dtype_configs
-            self.input_output_observed_ = input_output_observed
 
         def is_general_tensor_value_op(self) -> bool:
             return self.observation_type == ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT
 
-        # This is temporary, and will be removed soon
-        def input_output_observed(self):
-            return self.input_output_observed_
-
     return ConfigurableQuantizeHandler
 
 def _get_pattern_to_quantize_handlers(backend_config: BackendConfig) -> Dict[Pattern, QuantizerCls]:
@@ -156,18 +141,14 @@ def _get_pattern_to_quantize_handlers(backend_config: BackendConfig) -> Dict[Pat
         observation_type = config.observation_type
         dtype_configs = config.dtype_configs
         num_tensor_args_to_observation_type = config._num_tensor_args_to_observation_type
-        input_output_observed = config._input_output_observed
-        if input_output_observed is None:
-            input_output_observed = True
         pattern_to_quantize_handlers[pattern] = \
             _get_quantize_handler_cls(
                 observation_type,
                 dtype_configs,
-                num_tensor_args_to_observation_type,
-                input_output_observed)
+                num_tensor_args_to_observation_type)
     return pattern_to_quantize_handlers
 
-# TODO: remove this class, this is still exposed in torch.quantization
+# TODO: remove this class, this is still exposed in torch.ao.quantization
 # but we should be able to break bc
 class BinaryOpQuantizeHandler(QuantizeHandler):
     pass
@@ -213,10 +194,10 @@ class CopyNodeQuantizeHandler(QuantizeHandler):
 class GeneralTensorShapeOpQuantizeHandler(QuantizeHandler):
     pass
 
-# TODO: not used, can be removed after torch.quantization namespace is deprecated
+# TODO: not used, can be removed after torch.ao.quantization namespace is deprecated
 class CustomModuleQuantizeHandler(QuantizeHandler):
     pass
 
-# TODO: not used, can be removed after torch.quantization namespace is deprecated
+# TODO: not used, can be removed after torch.ao.quantization namespace is deprecated
 class StandaloneModuleQuantizeHandler(QuantizeHandler):
     pass
diff --git a/torch/ao/quantization/fx/tracer.py b/torch/ao/quantization/fx/tracer.py
index d372c6c06c0a..47f326caf704 100644
--- a/torch/ao/quantization/fx/tracer.py
+++ b/torch/ao/quantization/fx/tracer.py
@@ -1,7 +1,7 @@
 import torch
 from torch.fx._symbolic_trace import Tracer
 from torch.fx.proxy import Scope
-from torch.nn.intrinsic import _FusedModule
+from torch.ao.nn.intrinsic import _FusedModule
 from typing import List, Callable
 
 __all__ = [
diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py
index 4ed6db5b795b..5907edc5420f 100644
--- a/torch/ao/quantization/fx/utils.py
+++ b/torch/ao/quantization/fx/utils.py
@@ -27,6 +27,7 @@
     activation_is_statically_quantized,
 )
 from torch.ao.quantization.observer import _is_activation_post_process
+from torch.ao.quantization.qconfig_mapping import QConfigMapping
 
 from torch.fx import GraphModule, map_arg
 
@@ -39,6 +40,7 @@
 from ._decomposed import quantized_decomposed_lib  # noqa: F401
 
 from typing import Callable, Optional, List, Dict, Any, Set, Tuple, Union, Type
+from dataclasses import dataclass
 from collections import namedtuple
 import operator
 import warnings
@@ -66,10 +68,24 @@
     "NON_OBSERVABLE_ARG_DICT",
     "NON_QUANTIZABLE_WEIGHT_OPS",
     "return_arg_list",
+    "ObservedGraphModuleAttrs",
 ]
 
 NON_QUANTIZABLE_WEIGHT_OPS = {torch.nn.functional.layer_norm, torch.nn.functional.group_norm, torch.nn.functional.instance_norm}
 
+@dataclass
+class ObservedGraphModuleAttrs:
+    node_name_to_qconfig: Dict[str, QConfigAny]
+    node_name_to_scope: Dict[str, Tuple[str, type]]
+    prepare_custom_config: PrepareCustomConfig
+    equalization_node_name_to_qconfig: Dict[str, Any]
+    qconfig_mapping: QConfigMapping
+    is_qat: bool
+    observed_node_names: Set[str]
+    is_observed_standalone_module: bool = False
+    standalone_module_input_quantized_idxs: Optional[List[int]] = None
+    standalone_module_output_quantized_idxs: Optional[List[int]] = None
+
 def node_arg_is_weight(node: Node, arg: Any, backend_config: BackendConfig) -> bool:
     """Returns if node arg is weight"""
     if isinstance(node, Node) and node.op == "call_function" and \
@@ -448,6 +464,25 @@ def _is_custom_module_lstm(
     else:
         return isinstance(mod, torch.ao.nn.quantizable.LSTM)
 
+def _is_custom_module_mha(
+        node: Node,
+        named_modules: Dict[str, torch.nn.Module],
+        qconfig: QConfigAny = None,
+        # QuantizeHandler, but we cannot include the type here due to circular imports
+        qhandler: Optional[Any] = None,
+) -> bool:
+    """
+    Return whether this refers to the custom module MultiheadAttention flow.
+    """
+    mod = _get_module(node, named_modules)
+    if qconfig is not None and qhandler is not None:
+        assert isinstance(qhandler, torch.ao.quantization.fx.quantize_handler.QuantizeHandler)  # type: ignore[attr-defined]
+        return isinstance(mod, torch.nn.MultiheadAttention) and \
+            activation_is_statically_quantized(qconfig) and \
+            qhandler.is_custom_module()
+    else:
+        return isinstance(mod, torch.ao.nn.quantizable.MultiheadAttention)
+
 def _get_module(node: Node, named_modules: Dict[str, torch.nn.Module]) -> Optional[torch.nn.Module]:
     """
     If `node` refers to a call_module node, return the module, else None.
diff --git a/torch/ao/quantization/observer.py b/torch/ao/quantization/observer.py
index 2134f4139c02..fc29e1813d93 100644
--- a/torch/ao/quantization/observer.py
+++ b/torch/ao/quantization/observer.py
@@ -13,7 +13,7 @@
 import torch
 import torch.nn as nn
 from torch.ao.quantization.utils import (
-    check_min_max_valid, calculate_qmin_qmax, is_per_tensor, is_per_channel)
+    check_min_max_valid, calculate_qmin_qmax, is_per_tensor, is_per_channel, validate_qmin_qmax)
 
 __all__ = [
     "default_affine_fixed_qparams_observer",
@@ -49,7 +49,7 @@
 ]
 
 
-class _PartialWrapper(object):
+class _PartialWrapper:
     def __init__(self, p):
         self.p = p
         self.callable_args = {}
@@ -136,7 +136,7 @@ class ObserverBase(ABC, nn.Module):
     """
 
     def __init__(self, dtype):
-        super(ObserverBase, self).__init__()
+        super().__init__()
         self.dtype = dtype
 
     @abstractmethod
@@ -236,7 +236,7 @@ def __init__(
         ), "Default Observer only works for qint8, quint8 and quint4x2 data type"
         self.has_customized_qrange = (quant_min is not None) and (quant_max is not None)
         if self.has_customized_qrange:
-            self._validate_qmin_qmax(quant_min, quant_max)
+            validate_qmin_qmax(quant_min, quant_max)
         self.quant_min, self.quant_max = \
             calculate_qmin_qmax(quant_min, quant_max, self.has_customized_qrange, self.dtype, self.reduce_range)
 
@@ -258,7 +258,7 @@ def _load_from_state_dict(
             eps = torch.tensor([torch.finfo(torch.float32).eps])
             state_dict[prefix + "eps"] = eps
 
-        super(ObserverBase, self)._load_from_state_dict(
+        super()._load_from_state_dict(
             state_dict,
             prefix,
             local_metadata,
@@ -307,6 +307,11 @@ def _calculate_qparams(
             scales: Scales tensor of shape (#channels,)
             zero_points: Zero points tensor of shape (#channels,)
         """
+        # Functionally equivalent to 'determine_qparams' in utils.py. Observers must be torchscriptable however and qscheme
+        # as far as I can tell is not allowed to passed as a parameter in torchscript functions. This makes refactoring observer
+        # to use this utility a massive pain and very gross. For now Im opting just to duplicate as this code
+        # seems unlikey to change (last update over 1 year ago) and when torchscript is fully deprecated we can refactor.
+        # TODO(jakeszwe, jerryzh168)
         if not check_min_max_valid(min_val, max_val):
             return torch.tensor([1.0], device=min_val.device.type), torch.tensor([0], device=min_val.device.type)
 
@@ -462,7 +467,7 @@ def __init__(
         # For more details see aten/src/ATen/native/quantized/cpu/qconv.cpp
         # This is not an optimal choice for non x86 backends as it loses a bit
         # of precision for activations.
-        super(MinMaxObserver, self).__init__(
+        super().__init__(
             dtype=dtype,
             qscheme=qscheme,
             reduce_range=reduce_range,
@@ -575,7 +580,7 @@ def __init__(
                     torch.per_tensor_symmetric and torch.per_tensor_affine."
             )
         self.averaging_constant = averaging_constant
-        super(MovingAverageMinMaxObserver, self).__init__(
+        super().__init__(
             dtype=dtype,
             qscheme=qscheme,
             reduce_range=reduce_range,
@@ -649,7 +654,7 @@ def __init__(
                 "PerChannelMinMaxObserver's qscheme only support \
                     torch.per_channel_symmetric, torch.per_channel_affine and torch.per_channel_affine_float_qparams."
             )
-        super(PerChannelMinMaxObserver, self).__init__(
+        super().__init__(
             dtype=dtype,
             qscheme=qscheme,
             reduce_range=reduce_range,
@@ -755,7 +760,7 @@ def _load_from_state_dict(
                 missing_keys.append(key)
 
         if not torch.jit.is_scripting():
-            super(PerChannelMinMaxObserver, self)._load_from_state_dict(
+            super()._load_from_state_dict(
                 state_dict,
                 prefix,
                 local_metadata,
@@ -841,7 +846,7 @@ def __init__(
                 "MovingAveragePerChannelMinMaxObserver's qscheme only support \
                     torch.per_channel_symmetric, torch.per_channel_affine and torch.per_channel_affine_float_qparams."
             )
-        super(MovingAveragePerChannelMinMaxObserver, self).__init__(
+        super().__init__(
             ch_axis=ch_axis,
             dtype=dtype,
             qscheme=qscheme,
@@ -928,7 +933,7 @@ def __init__(
                     and torch.per_tensor_affine."
             )
         # bins: The number of bins used for histogram calculation.
-        super(HistogramObserver, self).__init__(
+        super().__init__(
             dtype=dtype,
             qscheme=qscheme,
             reduce_range=reduce_range,
@@ -1216,9 +1221,7 @@ def calculate_qparams(self):
         return self._calculate_qparams(new_min, new_max)
 
     def _save_to_state_dict(self, destination, prefix, keep_vars):
-        super(HistogramObserver, self)._save_to_state_dict(
-            destination, prefix, keep_vars
-        )
+        super()._save_to_state_dict(destination, prefix, keep_vars)
         destination[prefix + "min_val"] = self.min_val
         destination[prefix + "max_val"] = self.max_val
 
@@ -1253,7 +1256,7 @@ def _load_from_state_dict(
                 setattr(self, name, val)
             elif strict:
                 missing_keys.append(key)
-        super(HistogramObserver, self)._load_from_state_dict(
+        super()._load_from_state_dict(
             state_dict,
             prefix,
             local_metadata,
@@ -1289,7 +1292,7 @@ def __init__(self,
                  qscheme=torch.per_tensor_affine,
                  quant_min=0,
                  quant_max=255):
-        super(FixedQParamsObserver, self).__init__(dtype=dtype)
+        super().__init__(dtype=dtype)
         self.quant_min = quant_min
         self.quant_max = quant_max
         self.register_buffer('scale', torch.tensor([scale], dtype=torch.float))
@@ -1330,14 +1333,22 @@ class PlaceholderObserver(ObserverBase):
 
     def __init__(
         self, dtype=torch.float32, custom_op_name="", compute_dtype=None,
-        quant_min=None, quant_max=None, is_dynamic=False,
+        quant_min=None, quant_max=None, qscheme=None, eps=None,
+        is_dynamic=False,
     ) -> None:
         super().__init__(dtype=dtype)
+        if qscheme is None:
+            qscheme = torch.per_tensor_affine
+        if eps is None:
+            eps = torch.finfo(torch.float32).eps
+
         # dtype of input of the target operator, e.g. for dynamic quantization
         # ops, the dtype will be float32
         self.dtype = dtype
+        self.qscheme = qscheme
         self.quant_min = quant_min
         self.quant_max = quant_max
+        self.eps = eps
         self.custom_op = custom_op_name
         # used for configuration of computation type for dynamic quantization
         if compute_dtype:
@@ -1371,7 +1382,7 @@ class RecordingObserver(ObserverBase):
     __annotations__ = {"tensor_val": List[Optional[torch.Tensor]]}
 
     def __init__(self, dtype=torch.quint8, **kwargs):
-        super(RecordingObserver, self).__init__(dtype=dtype, **kwargs)  # type: ignore[call-arg]
+        super().__init__(dtype=dtype, **kwargs)  # type: ignore[call-arg]
         self.tensor_val = []
 
     def forward(self, x):
@@ -1402,7 +1413,7 @@ class NoopObserver(ObserverBase):
     """
 
     def __init__(self, dtype=torch.float16, custom_op_name="") -> None:
-        super(NoopObserver, self).__init__(dtype=dtype)
+        super().__init__(dtype=dtype)
         self.dtype = dtype
         self.custom_op = custom_op_name
 
@@ -1448,9 +1459,8 @@ def _is_observer_script_module(mod, obs_type_name):
 
 def _is_activation_post_process(module):
     return (
-        isinstance(module, torch.ao.quantization.ObserverBase)
-        or isinstance(module, torch.ao.quantization.FakeQuantizeBase)
-        or _is_observer_script_module(module, "quantization.observer")
+        isinstance(module, (torch.ao.quantization.ObserverBase,
+                            torch.ao.quantization.FakeQuantizeBase)) or _is_observer_script_module(module, "quantization.observer")
     )
 
 
diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py
index 2dec48498aa5..80f2f6dd768d 100644
--- a/torch/ao/quantization/qconfig.py
+++ b/torch/ao/quantization/qconfig.py
@@ -402,6 +402,17 @@ def get_default_qat_qconfig(backend='x86', version=1):
                                                        eps=2 ** -12),
     weight=fused_per_channel_wt_fake_quant_range_neg_127_to_127)
 
+_default_fp32_placeholder_qconfig = QConfig(
+    activation=PlaceholderObserver.with_args(dtype=torch.float32),
+    weight=PlaceholderObserver.with_args(dtype=torch.float32)
+)
+
+_default_quint8_placeholder_qconfig = QConfig(
+    activation=PlaceholderObserver.with_args(dtype=torch.quint8),
+    # operators using this qconfig doesn't have weights
+    weight=None,
+)
+
 def get_default_qconfig_dict(backend='x86', version=0):
     warnings.warn(
         "torch.ao.quantization.get_default_qconfig_dict is deprecated and will be removed in "
@@ -422,17 +433,15 @@ def _assert_valid_qconfig(qconfig: Optional[QConfig],
     if qconfig is None:
         return
     is_conv_transpose_mod = (
-        isinstance(mod, torch.nn.ConvTranspose1d) or
-        isinstance(mod, torch.nn.ConvTranspose2d) or
-        isinstance(mod, torch.nn.ConvTranspose3d))
+        isinstance(mod, (torch.nn.ConvTranspose1d, torch.nn.ConvTranspose2d, torch.nn.ConvTranspose3d)))
     if is_conv_transpose_mod:
         if qconfig.weight is None:
             # for now, we assume that any qconfig for ConvTranspose without a weight is valid
             return
         example_observer = qconfig.weight()
         is_per_channel = (
-            isinstance(example_observer, torch.ao.quantization.PerChannelMinMaxObserver) or
-            isinstance(example_observer, torch.ao.quantization.MovingAveragePerChannelMinMaxObserver)
+            isinstance(example_observer, (torch.ao.quantization.PerChannelMinMaxObserver,
+                                          torch.ao.quantization.MovingAveragePerChannelMinMaxObserver))
         )
         assert not is_per_channel, \
             'Per channel weight observer is not supported yet for ConvTranspose{n}d.'
diff --git a/torch/ao/quantization/qconfig_mapping.py b/torch/ao/quantization/qconfig_mapping.py
index 959eb14aa983..1c0c0a308180 100644
--- a/torch/ao/quantization/qconfig_mapping.py
+++ b/torch/ao/quantization/qconfig_mapping.py
@@ -106,15 +106,8 @@ def _get_default_qconfig_mapping(is_qat: bool, backend: str, version: int) -> QC
             fixed_qparams_observer_to_qconfig[observer] = fixed_qparams_qconfig
         qconfig_mapping.set_object_type(fixed_qparams_op, fixed_qparams_qconfig)
 
-    # QConfig for fused ops for onednn backend
-    # Separate ops are required to have the same qconfig as fused ops
-    # TODO: we should be able to configure qconfig for patterns
-    if backend == 'onednn':
-        qconfig_mapping.set_object_type(torch.nn.Linear, qconfig) \
-                       .set_object_type(torch.nn.LeakyReLU, qconfig) \
-                       .set_object_type(torch.nn.functional.leaky_relu, qconfig) \
-                       .set_object_type(torch.nn.Tanh, qconfig) \
-                       .set_object_type(torch.nn.functional.tanh, qconfig)
+    # TODO Currently it's required that separate ops in a fused op/module have the same qconfig.
+    #      Need to be able to support fusion of ops with different qconfigs
 
     return qconfig_mapping
 
diff --git a/torch/ao/quantization/quantization_mappings.py b/torch/ao/quantization/quantization_mappings.py
index 7c2fa4cae5ef..96db52624acd 100644
--- a/torch/ao/quantization/quantization_mappings.py
+++ b/torch/ao/quantization/quantization_mappings.py
@@ -109,6 +109,8 @@
     nni.ConvReLU1d: nniq.ConvReLU1d,
     nni.ConvReLU2d: nniq.ConvReLU2d,
     nni.ConvReLU3d: nniq.ConvReLU3d,
+    nni.ConvAdd2d: nniq.ConvAdd2d,
+    nni.ConvAddReLU2d: nniq.ConvAddReLU2d,
     nni.LinearReLU: nniq.LinearReLU,
     nni.LinearLeakyReLU: nniq.LinearLeakyReLU,
     nni.LinearTanh: nniq.LinearTanh,
@@ -206,10 +208,10 @@
 
 def no_observer_set() -> Set[Any]:
     r"""These modules cannot have observers inserted by default."""
-    no_observers = set([
+    no_observers = {
         nn.quantizable.LSTM,
         nn.quantizable.MultiheadAttention
-    ])
+    }
     return no_observers
 
 def get_default_static_quant_module_mappings() -> Dict[Callable, Any]:
diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py
index d766990814b7..3b59b133ba9e 100644
--- a/torch/ao/quantization/quantize.py
+++ b/torch/ao/quantization/quantize.py
@@ -5,7 +5,7 @@
 import torch
 import torch.nn as nn
 import torch.ao.nn.quantized as nnq
-from torch.nn.intrinsic import _FusedModule
+from torch.ao.nn.intrinsic import _FusedModule
 
 from torch.ao.quantization.quantization_mappings import (
     get_default_dynamic_quant_module_mappings,
diff --git a/torch/ao/quantization/quantize_fx.py b/torch/ao/quantization/quantize_fx.py
index 7fd5a9fc6343..5a2edbeb2921 100644
--- a/torch/ao/quantization/quantize_fx.py
+++ b/torch/ao/quantization/quantize_fx.py
@@ -1,8 +1,10 @@
-from typing import Any, Dict, Optional, Set, Tuple, Union
+from typing import Any, Dict, Optional, Tuple, Union
 import warnings
 
 import torch
+import copy
 from torch.fx import GraphModule
+from torch.fx.graph_module import _USER_PRESERVED_ATTRIBUTES_KEY
 from .fx.tracer import QuantizationTracer
 from .fx.tracer import (  # noqa: F401
     Scope,
@@ -15,7 +17,7 @@
     BackendConfig,
     get_tensorrt_backend_config,
 )
-from .fx.graph_module import ObservedGraphModule
+from .fx.graph_module import ObservedGraphModule  # noqa: F401
 from .fx.custom_config import (
     ConvertCustomConfig,
     FuseCustomConfig,
@@ -25,6 +27,16 @@
 from .fx.utils import get_skipped_module_name_and_classes
 from .qconfig_mapping import QConfigMapping
 
+def attach_preserved_attrs_to_model(
+        model: Union[GraphModule, torch.nn.Module], preserved_attrs: Dict[str, Any]):
+    """ Store preserved attributes to the model.meta so that it can be preserved during deepcopy
+    """
+    model.meta[_USER_PRESERVED_ATTRIBUTES_KEY] = copy.copy(preserved_attrs)  # type: ignore[operator, index, assignment]
+    # set the preserved attributes in the model so that user can call
+    # model.attr as they do before calling fx graph mode quantization
+    for attr_name, attr in model.meta[_USER_PRESERVED_ATTRIBUTES_KEY].items():  # type: ignore[index, union-attr]
+        setattr(model, attr_name, attr)
+
 def _check_is_graph_module(model: torch.nn.Module) -> None:
     if not isinstance(model, GraphModule):
         raise ValueError(
@@ -77,7 +89,6 @@ def _fuse_fx(
     return fuse(
         model, is_qat, fuse_custom_config, backend_config)  # type: ignore[operator]
 
-
 def _prepare_fx(
     model: torch.nn.Module,
     qconfig_mapping: Union[QConfigMapping, Dict[str, Any]],
@@ -87,7 +98,7 @@ def _prepare_fx(
     _equalization_config: Optional[Union[QConfigMapping, Dict[str, Any]]] = None,
     backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
     is_standalone_module: bool = False,
-) -> ObservedGraphModule:
+) -> GraphModule:
     r""" Internal helper function for prepare_fx
     Args:
       `model`, `qconfig_mapping`, `prepare_custom_config`, `_equalization_config`:
@@ -115,14 +126,13 @@ def _prepare_fx(
 
     skipped_module_names, skipped_module_classes = \
         get_skipped_module_name_and_classes(prepare_custom_config, is_standalone_module)
-    preserved_attributes = prepare_custom_config.preserved_attributes
+    preserved_attr_names = prepare_custom_config.preserved_attributes
+    preserved_attrs = {attr: getattr(model, attr) for attr in preserved_attr_names if hasattr(model, attr)}
     # symbolically trace the model
     tracer = QuantizationTracer(skipped_module_names, skipped_module_classes)  # type: ignore[arg-type]
     graph_module = GraphModule(model, tracer.trace(model))
     _attach_meta_to_node_if_not_exist(graph_module)
 
-    for attr_name in preserved_attributes:
-        setattr(graph_module, attr_name, getattr(model, attr_name))
     fuse_custom_config = FuseCustomConfig().set_preserved_attributes(prepare_custom_config.preserved_attributes)
     graph_module = _fuse_fx(
         graph_module,
@@ -141,8 +151,7 @@ def _prepare_fx(
         is_standalone_module=is_standalone_module,
     )  # type: ignore[operator]
 
-    for attr_name in preserved_attributes:
-        setattr(prepared, attr_name, getattr(model, attr_name))
+    attach_preserved_attrs_to_model(prepared, preserved_attrs)
     return prepared
 
 
@@ -164,13 +173,14 @@ def _prepare_standalone_module_fx(
 
     Returns:
 
-        * model(GraphModule): prepared standalone module. It has these attributes:
+        * model(GraphModule): prepared standalone module. It has these attributes in
+          model.meta:
 
-            * `_standalone_module_input_quantized_idxs(List[Int])`: a list of
+            * `standalone_module_input_quantized_idxs(List[Int])`: a list of
               indexes for the graph input that is expected to be quantized,
               same as input_quantized_idxs configuration provided
               for the standalone module
-            * `_standalone_module_output_quantized_idxs(List[Int])`: a list of
+            * `standalone_module_output_quantized_idxs(List[Int])`: a list of
               indexs for the graph output that is quantized
               same as input_quantized_idxs configuration provided
               for the standalone module
@@ -193,7 +203,7 @@ def fuse_fx(
     backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
 ) -> GraphModule:
     r""" Fuse modules like conv+bn, conv+bn+relu etc, model must be in eval mode.
-    Fusion rules are defined in torch.quantization.fx.fusion_pattern.py
+    Fusion rules are defined in torch.ao.quantization.fx.fusion_pattern.py
 
     Args:
 
@@ -217,15 +227,15 @@ def fuse_fx(
         fuse_custom_config = FuseCustomConfig.from_dict(fuse_custom_config)
 
     torch._C._log_api_usage_once("quantization_api.quantize_fx.fuse_fx")
+    preserved_attr_names = fuse_custom_config.preserved_attributes
+    preserved_attrs = {attr: getattr(model, attr) for attr in preserved_attr_names if hasattr(model, attr)}
+
     graph_module = torch.fx.symbolic_trace(model)
     _attach_meta_to_node_if_not_exist(graph_module)
-    preserved_attributes: Set[str] = set()
-    if fuse_custom_config:
-        preserved_attributes = set(fuse_custom_config.preserved_attributes)
-    for attr_name in preserved_attributes:
-        setattr(graph_module, attr_name, getattr(model, attr_name))
-    return _fuse_fx(graph_module, False, fuse_custom_config, backend_config)
+    graph_module = _fuse_fx(graph_module, False, fuse_custom_config, backend_config)
 
+    attach_preserved_attrs_to_model(graph_module, preserved_attrs)
+    return graph_module
 
 def prepare_fx(
     model: torch.nn.Module,
@@ -234,7 +244,7 @@ def prepare_fx(
     prepare_custom_config: Union[PrepareCustomConfig, Dict[str, Any], None] = None,
     _equalization_config: Optional[Union[QConfigMapping, Dict[str, Any]]] = None,
     backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
-) -> ObservedGraphModule:
+) -> GraphModule:
     r""" Prepare a model for post training static quantization
 
     Args:
@@ -384,7 +394,7 @@ def prepare_qat_fx(
     example_inputs: Tuple[Any, ...],
     prepare_custom_config: Union[PrepareCustomConfig, Dict[str, Any], None] = None,
     backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
-) -> ObservedGraphModule:
+) -> GraphModule:
     r""" Prepare a model for quantization aware training
 
     Args:
@@ -506,6 +516,8 @@ def _convert_fx(
         convert_custom_config = ConvertCustomConfig.from_dict(convert_custom_config)
 
     _check_is_graph_module(graph_module)
+    preserved_attr_names = convert_custom_config.preserved_attributes
+    preserved_attrs = {attr: getattr(graph_module, attr) for attr in preserved_attr_names if hasattr(graph_module, attr)}
 
     quantized = convert(
         graph_module,
@@ -518,9 +530,7 @@ def _convert_fx(
         is_decomposed=is_decomposed,
     )
 
-    preserved_attributes = convert_custom_config.preserved_attributes
-    for attr_name in preserved_attributes:
-        setattr(quantized, attr_name, getattr(graph_module, attr_name))
+    attach_preserved_attrs_to_model(quantized, preserved_attrs)
     return quantized
 
 
diff --git a/torch/ao/quantization/stubs.py b/torch/ao/quantization/stubs.py
index 7ae526a8921e..f39a28ef7ee5 100644
--- a/torch/ao/quantization/stubs.py
+++ b/torch/ao/quantization/stubs.py
@@ -10,7 +10,7 @@ class QuantStub(nn.Module):
             if qconfig is not provided, we will get qconfig from parent modules
     """
     def __init__(self, qconfig=None):
-        super(QuantStub, self).__init__()
+        super().__init__()
         if qconfig:
             self.qconfig = qconfig
 
@@ -27,7 +27,7 @@ class DeQuantStub(nn.Module):
             if qconfig is not provided, we will get qconfig from parent modules
     """
     def __init__(self, qconfig=None):
-        super(DeQuantStub, self).__init__()
+        super().__init__()
         if qconfig:
             self.qconfig = qconfig
 
@@ -51,7 +51,7 @@ class QuantWrapper(nn.Module):
     module: nn.Module
 
     def __init__(self, module):
-        super(QuantWrapper, self).__init__()
+        super().__init__()
         qconfig = module.qconfig if hasattr(module, 'qconfig') else None
         self.add_module('quant', QuantStub(qconfig))
         self.add_module('dequant', DeQuantStub(qconfig))
diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py
index a40935bacefc..5d8ec40a6ca3 100644
--- a/torch/ao/quantization/utils.py
+++ b/torch/ao/quantization/utils.py
@@ -1,16 +1,16 @@
 """
 Utils shared by different modes of quantization (eager/graph)
 """
-import warnings
 import functools
+import warnings
+from collections import OrderedDict
+from inspect import getfullargspec, signature
+from typing import Any, Callable, Dict, Optional, Tuple, Union
+
 import torch
-from torch.fx import Node
 from torch.ao.quantization.quant_type import QuantType
-from typing import Tuple, Any, Union, Callable, Dict, Optional
+from torch.fx import Node
 from torch.nn.utils.parametrize import is_parametrized
-from collections import OrderedDict
-from inspect import signature
-from inspect import getfullargspec
 
 NodePattern = Union[Tuple[Node, Node], Tuple[Node, Tuple[Node, Node]], Any]
 NodePattern.__module__ = "torch.ao.quantization.utils"
@@ -152,12 +152,14 @@ def to_underlying_dtype(qdtype):
     return DTYPE_MAPPING[qdtype]
 
 def get_qparam_dict(observer_or_fake_quant):
+    from torch.ao.quantization.observer import PlaceholderObserver
+
     qscheme = observer_or_fake_quant.qscheme if hasattr(observer_or_fake_quant, "qscheme") else None
     dtype = observer_or_fake_quant.dtype
     qparams = {"qscheme": qscheme, "dtype": dtype}
 
-    if not qscheme:
-        return qparams
+    if not qscheme or isinstance(observer_or_fake_quant, PlaceholderObserver):
+        return {"qscheme": None, "dtype": dtype}
 
     if is_per_tensor(qscheme):
         qscheme = torch.per_tensor_affine
@@ -336,7 +338,7 @@ def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: b
         # using of refinement to decouple initial_qmin and initial_qmax from quantization range.
         # The actual values of initial_qmin and initial_qmax will be reset below.
         if dtype == torch.qint32:
-            initial_quant_min, initial_quant_max = 0, 2**31 - 1
+            initial_quant_min, initial_quant_max = 0, 2**32 - 1
         else:
             initial_quant_min, initial_quant_max = 0, 255
         # The following assignment of self.qmin and self.qmax to the local variables and the if check refine the
@@ -355,7 +357,7 @@ def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: b
             ), "quantization range should be positive and not exceed the maximum bit range (=256)."
         elif dtype == torch.qint32:
             assert (
-                0 < qrange_len <= 2**31
+                0 < qrange_len <= 2**32
             ), "quantization range should be positive and not exceed the maximum bit range (=4294967296)."
         if reduce_range:
             quant_min, quant_max = quant_min // 2, quant_max // 2
@@ -476,6 +478,105 @@ def _normalize_kwargs(func: Callable, loc: Dict[str, Any]) -> "OrderedDict[str,
             normalized_kwargs[attr] = val
     return normalized_kwargs
 
+def validate_qmin_qmax(quant_min: int, quant_max: int) -> None:
+    r"""Validates that the user-specified quantization range is properly initialized
+    and within the given bound supported by the observer dtype.
+
+    To accommodate lower-bit quantization with respect to the existing torch.qint8 and
+    torch.quint8 datatypes, the user can choose to use dynamic quantization range by passing
+    in a tuple of initial qmin and qmax values. One use case is these customized qmin and qmax
+    values are used to calculate static estimates of the scale and zero point for aggressive lower-bit
+    fake quantization. These estimates are compared against parameters learned through backpropagation.
+    The related literatures for scale and zero point via backpropagation are as follows:
+
+    Learned Step Size Quantization: https://openreview.net/pdf?id=rkgO66VKDS
+    Trained Quantization Thresholds: https://arxiv.org/pdf/1903.08066.pdf
+    """
+    # The variable names are prefixed with "initial" because their values (qmin and qmax) might be adjusted
+    # based on whether quantization range is reduced and the datatype (signed/unsigned) used by the observer.
+    assert (
+        quant_min <= 0 <= quant_max
+    ), "Used-specified quantization range must include 0."
+    assert (
+        quant_min < quant_max
+    ), "qmin must be strictly less than qmax for user-specified quantization range."
+
+
+# Functionally equivalent to '_calculate_qparams' in observer.py. Observers must be torchscriptable however and qscheme
+# as far as I can tell is not allowed to passed as a parameter in torchscript functions. This makes refactoring observer
+# to use this utility a massive pain and very gross. For now Im opting just to duplicate as this code seems unlikey to change
+# (last update over 1 year ago) and when torchscript is fully deprecated we can refactor. TODO(jakeszwe, jerryzh168)
+def determine_qparams(
+        min_val: torch.Tensor, max_val: torch.Tensor, quant_min: int, quant_max: int,
+        dtype: torch.dtype, eps: torch.Tensor, has_customized_qrange: bool,
+        qscheme: torch.qscheme = torch.per_tensor_affine) -> Tuple[torch.Tensor, torch.Tensor]:
+    r"""Calculates the quantization parameters, given min and max
+    value tensors. Works for both per tensor and per channel cases
+
+    Args:
+        min_val: Minimum values per channel
+        max_val: Maximum values per channel
+
+    Returns:
+        scales: Scales tensor of shape (#channels,)
+        zero_points: Zero points tensor of shape (#channels,)
+    """
+    if not check_min_max_valid(min_val, max_val):
+        return torch.tensor([1.0], device=min_val.device.type), torch.tensor([0], device=min_val.device.type)
+
+    min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
+    max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
+
+    device = min_val_neg.device
+    scale = torch.ones(min_val_neg.size(), dtype=torch.float32, device=device)
+    zero_point = torch.zeros(min_val_neg.size(), dtype=torch.int64, device=device)
+
+    if (
+        qscheme == torch.per_tensor_symmetric
+        or qscheme == torch.per_channel_symmetric
+    ):
+        max_val_pos = torch.max(-min_val_neg, max_val_pos)
+        scale = max_val_pos / (float(quant_max - quant_min) / 2)
+        scale = torch.max(scale, eps)
+        if dtype == torch.uint8 or dtype == torch.quint8:
+            if has_customized_qrange:
+                # When customized quantization range is used, down-rounded midpoint of the range is chosen.
+                zero_point = zero_point.new_full(
+                    zero_point.size(), (quant_min + quant_max) // 2
+                )
+            else:
+                zero_point = zero_point.new_full(zero_point.size(), 128)
+    elif qscheme == torch.per_channel_affine_float_qparams:
+        scale = (max_val - min_val) / float(quant_max - quant_min)
+        scale = torch.where(scale > eps, scale, torch.ones_like(scale))
+        # We use the quantize function
+        # xq = Round(Xf * inv_scale + zero_point),
+        # setting zero_point to (-1 * min *inv_scale) we get
+        # Xq = Round((Xf - min) * inv_scale)
+        zero_point = -1 * min_val / scale
+    else:
+        scale = (max_val_pos - min_val_neg) / float(quant_max - quant_min)
+        scale = torch.max(scale, eps)
+        zero_point = quant_min - torch.round(min_val_neg / scale).to(torch.int)
+        zero_point = torch.clamp(zero_point, quant_min, quant_max)
+
+    # For scalar values, cast them to Tensors of size 1 to keep the shape
+    # consistent with default values in FakeQuantize.
+    if len(scale.shape) == 0:
+        # TODO: switch to scale.item() after adding JIT support
+        scale = torch.tensor([float(scale)], dtype=scale.dtype, device=device)
+    if len(zero_point.shape) == 0:
+        # TODO: switch to zero_point.item() after adding JIT support
+        zero_point = torch.tensor(
+            [int(zero_point)], dtype=zero_point.dtype, device=device
+        )
+        if qscheme == torch.per_channel_affine_float_qparams:
+            zero_point = torch.tensor(
+                [float(zero_point)], dtype=zero_point.dtype, device=device
+            )
+
+    return scale, zero_point
+
 def _get_num_pos_args(f: Callable) -> int:
     """ Get number of positional args for a function
 
@@ -593,7 +694,14 @@ def make_qconfig(obs_ctr: Callable) -> torch.ao.quantization.QConfig:
         float_lstm.input_size, float_lstm.hidden_size, float_lstm.num_layers, float_lstm.bias,
         float_lstm.batch_first, float_lstm.dropout, float_lstm.bidirectional)
 
-    # Assign QConfigs with fixed qparams to all inner submodules
+    # Propagate the QConfig configured in the float module to all inner submodules first
+    # Need to import here to avoid circular dependency
+    from torch.ao.quantization.quantize import _add_observer_, propagate_qconfig_
+    observed_lstm.qconfig = float_lstm.qconfig
+    propagate_qconfig_(observed_lstm)
+
+    # For the inner submodules of interest, override the original
+    # QConfig with more specific ones that have fixed qparams
     # Module hierarchy: LSTM > _LSTMLayer > _LSTMSingleLayer (forward or backward) > LSTMCell
     for layer in observed_lstm.layers:
         inner_layers = [layer.layer_fw]
@@ -625,8 +733,6 @@ def make_qconfig(obs_ctr: Callable) -> torch.ao.quantization.QConfig:
                     cell.initial_hidden_state_qparams = (obs.scale, obs.zero_point)
                 cell.hidden_state_dtype = obs.dtype
 
-    # need to do this here to avoid circular dependency
-    from torch.ao.quantization.quantize import _add_observer_
     # Insert the observers based on the previously attached QConfigs
     # Pass in non_leaf_module_list to prevent the observers for sigmoid/tanh from being overridden
     _add_observer_(  # type: ignore[attr-defined]
@@ -662,4 +768,6 @@ def make_qconfig(obs_ctr: Callable) -> torch.ao.quantization.QConfig:
     "has_no_children_ignoring_parametrizations",
     "get_fqn_to_example_inputs",
     "to_underlying_dtype",
+    "determine_qparams",
+    "validate_qmin_qmax",
 ]
diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py
index 721a8f5376b6..c71e36cbcc65 100644
--- a/torch/autograd/__init__.py
+++ b/torch/autograd/__init__.py
@@ -15,7 +15,10 @@
 from .variable import Variable
 from .function import Function, NestedIOFunction
 from .gradcheck import gradcheck, gradgradcheck
-from .grad_mode import no_grad, enable_grad, set_grad_enabled, inference_mode, set_multithreading_enabled
+from .grad_mode import (
+    no_grad, enable_grad, set_grad_enabled, inference_mode, set_multithreading_enabled, _force_original_view_tracking,
+    _unsafe_preserve_version_counter
+)
 from .anomaly_mode import detect_anomaly, set_detect_anomaly
 from ..overrides import has_torch_function, handle_torch_function, is_tensor_like
 from . import functional
@@ -83,6 +86,10 @@ def _make_grads(outputs: Sequence[torch.Tensor], grads: Sequence[_OptionalTensor
             if out.requires_grad:
                 if out.numel() != 1:
                     raise RuntimeError("grad can be implicitly created only for scalar outputs")
+                if not out.dtype.is_floating_point:
+                    msg = ("grad can be implicitly created only for real scalar outputs"
+                           f" but got {out.dtype}")
+                    raise RuntimeError(msg)
                 new_grads.append(torch.ones_like(out, memory_format=torch.preserve_format))
             else:
                 new_grads.append(None)
diff --git a/torch/autograd/anomaly_mode.py b/torch/autograd/anomaly_mode.py
index 87cd795d7e73..fea16a69215f 100644
--- a/torch/autograd/anomaly_mode.py
+++ b/torch/autograd/anomaly_mode.py
@@ -6,7 +6,7 @@
 __all__ = ["detect_anomaly", "set_detect_anomaly"]
 
 
-class detect_anomaly(object):
+class detect_anomaly:
     r"""Context-manager that enable anomaly detection for the autograd engine.
 
     This does two things:
@@ -88,7 +88,7 @@ def __exit__(self, *args: Any) -> None:
         torch.set_anomaly_enabled(self.prev, self.prev_check_nan)
 
 
-class set_detect_anomaly(object):
+class set_detect_anomaly:
     r"""Context-manager that sets the anomaly detection for the autograd engine on or off.
 
     ``set_detect_anomaly`` will enable or disable the autograd anomaly detection
diff --git a/torch/autograd/forward_ad.py b/torch/autograd/forward_ad.py
index 5db1041b4613..440497bea35f 100644
--- a/torch/autograd/forward_ad.py
+++ b/torch/autograd/forward_ad.py
@@ -1,6 +1,5 @@
 import torch
 import os
-import sys
 from .grad_mode import _DecoratorContextManager
 from collections import namedtuple
 
@@ -87,9 +86,7 @@ def make_dual(tensor, tangent, *, level=None):
     #         buffer = z
     #     return min - torch.log1p(z), buffer
     #     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
-    # Currently broken for 3.11, see https://github.com/pytorch/pytorch/issues/85506
-    if (os.environ.get("PYTORCH_JIT", "1" if sys.version_info < (3, 11) else "0") == "1" and
-            __debug__):
+    if os.environ.get("PYTORCH_JIT", "1") == "1" and __debug__:
         from torch._decomp import decompositions_for_jvp  # noqa: F401
 
     if level is None:
@@ -179,9 +176,6 @@ class dual_level(_DecoratorContextManager):
     Please see the `forward-mode AD tutorial <https://pytorch.org/tutorials/intermediate/forward_ad_usage.html>`__
     for detailed steps on how to use this API.
     """
-    def __init__(self):
-        super().__init__()
-
     def __enter__(self):
         return enter_dual_level()
 
diff --git a/torch/autograd/function.py b/torch/autograd/function.py
index c94fdd4252dc..880ef803ea38 100644
--- a/torch/autograd/function.py
+++ b/torch/autograd/function.py
@@ -3,7 +3,6 @@
 from torch._C import _functions
 import torch._functorch as _functorch
 import torch.utils.hooks as hooks
-from torch._six import with_metaclass
 import functools
 import warnings
 from collections import OrderedDict
@@ -14,7 +13,7 @@
            "InplaceFunction", "NestedIOFunction"]
 
 # Formerly known as: _ContextMethodMixin
-class FunctionCtx(object):
+class FunctionCtx:
 
     def save_for_backward(self, *tensors: torch.Tensor):
         r"""Saves given tensors for a future call to :func:`~Function.backward`.
@@ -250,7 +249,7 @@ def set_materialize_grads(self, value: bool):
 # DO NOT USE: This is only defined to be able to load old serialized models
 _ContextMethodMixin = FunctionCtx
 
-class _HookMixin(object):
+class _HookMixin:
 
     @staticmethod
     def _register_hook(backward_hooks, hook):
@@ -294,8 +293,7 @@ def __init__(cls, name, bases, attrs):
         super(FunctionMeta, cls).__init__(name, bases, attrs)
 
 
-# mypy doesn't understand `with_metaclass` from torch._six
-class _SingleLevelFunction(with_metaclass(FunctionMeta, _C._FunctionBase, FunctionCtx, _HookMixin)):  # type: ignore[misc]
+class _SingleLevelFunction(_C._FunctionBase, FunctionCtx, _HookMixin, metaclass=FunctionMeta):
     @staticmethod
     def forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:
         r"""
@@ -505,7 +503,7 @@ def apply(cls, *args, **kwargs):
         if not torch._C._are_functorch_transforms_active():
             # See NOTE: [functorch vjp and autograd interaction]
             args = _functorch.utils.unwrap_dead_wrappers(args)
-            return super().apply(*args, **kwargs)
+            return super().apply(*args, **kwargs)  # type: ignore[misc]
 
         if cls.setup_context == _SingleLevelFunction.setup_context:
             raise RuntimeError(
@@ -578,7 +576,7 @@ def traceable(fn_cls):
 class InplaceFunction(Function):
 
     def __init__(self, inplace=False):
-        super(InplaceFunction, self).__init__()
+        super().__init__()
         self.inplace = inplace
 
 
@@ -680,14 +678,14 @@ class NestedIOFunction(Function):
     def _do_forward(self, *input):
         self._nested_input = input
         flat_input = tuple(_iter_tensors(input))
-        flat_output = super(NestedIOFunction, self)._do_forward(*flat_input)
+        flat_output = super()._do_forward(*flat_input)  # type: ignore[misc]
         nested_output = self._nested_output
         nested_tensors = _unflatten(flat_output, self._nested_output)
         return nested_tensors
 
     def _do_backward(self, gradients, retain_variables):
         self.retain_variables = retain_variables
-        result = super(NestedIOFunction, self)._do_backward(gradients, retain_variables)
+        result = super()._do_backward(gradients, retain_variables)  # type: ignore[misc]
         if not retain_variables:
             del self._nested_output
             del self._to_save_nested
@@ -713,7 +711,7 @@ def save_for_backward(self, *args: Any) -> None:
 
     @property
     def saved_tensors(self):
-        flat_tensors = super(NestedIOFunction, self).saved_tensors
+        flat_tensors = super().saved_tensors  # type: ignore[misc]
         return _unflatten(flat_tensors, self._to_save_nested)
 
     def mark_dirty(self, *args: Any, **kwargs: Any) -> None:
diff --git a/torch/autograd/grad_mode.py b/torch/autograd/grad_mode.py
index 9e0ce5b7b83e..f6ec35517957 100644
--- a/torch/autograd/grad_mode.py
+++ b/torch/autograd/grad_mode.py
@@ -1,5 +1,5 @@
 import torch
-from typing import Any
+from typing import Any, Optional
 
 from torch.utils._contextlib import _DecoratorContextManager
 
@@ -15,6 +15,9 @@ class no_grad(_DecoratorContextManager):
 
     In this mode, the result of every computation will have
     `requires_grad=False`, even when the inputs have `requires_grad=True`.
+    There is an exception! All factory functions, or functions that create
+    a new Tensor and take a requires_grad kwarg, will NOT be affected by
+    this mode.
 
     This context manager is thread local; it will not affect computation
     in other threads.
@@ -44,6 +47,11 @@ class no_grad(_DecoratorContextManager):
         >>> z = doubler(x)
         >>> z.requires_grad
         False
+        >>> # factory function exception
+        >>> with torch.no_grad():
+        ...     a = torch.nn.Parameter(torch.rand(10))
+        >>> a.requires_grad
+        True
     """
     def __init__(self) -> None:
         if not torch._jit_internal.is_scripting():
@@ -157,7 +165,7 @@ def __enter__(self) -> None:
     def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
         torch._C._set_grad_enabled(self.prev)
 
-    def clone(self):
+    def clone(self) -> "set_grad_enabled":
         return self.__class__(self.mode)
 
 
@@ -205,21 +213,21 @@ class inference_mode(_DecoratorContextManager):
         False
 
     """
-    def __init__(self, mode=True):
+    def __init__(self, mode: bool = True) -> None:
         if not torch._jit_internal.is_scripting():
             super().__init__()
         # Holds a python binding to a RAII guard that can enable or disable
         # inference mode
-        self._inference_mode_raii_guard = None
+        self._inference_mode_raii_guard: Optional[torch._C._InferenceMode] = None
         self.mode = mode
 
-    def __enter__(self):
+    def __enter__(self) -> None:
         self._inference_mode_raii_guard = torch._C._InferenceMode(self.mode)
 
     def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
         del self._inference_mode_raii_guard
 
-    def clone(self):
+    def clone(self) -> "inference_mode":
         return self.__class__(self.mode)
 
 
@@ -251,5 +259,74 @@ def __enter__(self) -> None:
     def __exit__(self, *args) -> None:
         del self.multithreadeding_enabled_guard
 
+    def clone(self) -> "set_multithreading_enabled":
+        return self.__class__(self.mode)
+
+
+class _force_original_view_tracking(_DecoratorContextManager):
+    r"""Context-manager that sets whether or not to always enable view-replay in autograd.
+
+    ``set_view_replay_enabled`` will enable or disable view-replay based on its argument :attr:`mode`.
+    It can be used as a context-manager or as a function.
+
+    This context manager is thread local; it will not affect computation
+    in other threads.
+
+    When a tensor view is mutated, the autograd engine needs to decide whether or not
+    to regenerate the "updated view" by either replaying the chain of views from the updated base,
+    or with a single call to as_strided.
+
+    If set_view_replay_enabled is set to True, then autograd will always use view replay.
+    Otherwise, it will fall back to its existing logic.
+
+    Args:
+        mode (bool): Flag whether to enable view-replay (``True``), or disable
+                     (``False``).
+
+    """
+
+    def __init__(self, mode: bool) -> None:
+        self.mode = mode
+        self._force_original_view_tracking_guard = torch._C._ViewReplayEnabled(mode)
+
+    def __enter__(self) -> None:
+        pass
+
+    def __exit__(self, *args) -> None:
+        del self._force_original_view_tracking_guard
+
     def clone(self):
         return self.__class__(self.mode)
+
+class _unsafe_preserve_version_counter(_DecoratorContextManager):
+    r"""DO NOT USE THIS UNLESS YOU KNOW EXACTLY WHAT YOU'RE DOING!
+
+    This context manager can lead to arbitrary silent-correctness issues in any other part of your code
+    (even the ones not touched directly by the context manager)!
+
+    Ordinarily, autograd will track mutations to tensors by incrementing it's `._version` attribute.
+    This is generally important for correctness, as for example, mutating a tensor that autograd has saved
+    for the backwards pass can result in incorrect gradients, and autograd uses the version counter to detect
+    and error out in this situation.
+
+    However, there are rare instances where it might be useful to hide mutations from autograd. For example:
+    if a tensor is very large, and you'd like to free its memory by storing it elsewhere, and re-populate
+    the tensor right before it is needed by autograd.
+
+    Args:
+        tensor (torch.Tensor): the tensor in question, that you would like to preserve the version counter of.
+
+    .. note::
+        This API does not apply to :ref:`forward-mode AD <forward-mode-ad>`.
+
+    """
+
+    def __init__(self, tensor: torch.Tensor) -> None:
+        self.tensor = tensor
+        self.prev_version = tensor._version
+
+    def __enter__(self) -> None:
+        pass
+
+    def __exit__(self, *args) -> None:
+        torch._C._autograd._unsafe_set_version_counter(self.tensor, self.prev_version)
diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py
index dd56ff517b61..2680f3a64fc8 100644
--- a/torch/autograd/gradcheck.py
+++ b/torch/autograd/gradcheck.py
@@ -72,6 +72,43 @@ def _iter_tensors(x: Union[torch.Tensor, Iterable[torch.Tensor]],
                 yield result
 
 
+def _densify(x):
+    # return a copy of sparse x with all unspecified elements
+    # "replaced" with zero-valued elements
+    if isinstance(x, (list, tuple)):
+        return type(x)(map(_densify, x))
+    elif not is_tensor_like(x) or x.layout in {torch.strided, torch._mkldnn}:  # type: ignore[attr-defined] # no attr _mkldnn
+        return x
+    elif x.layout is torch.sparse_coo:
+        device = x.device
+        indices_dtype = x._indices().dtype
+        tmp = torch.ones(x.shape[:x.sparse_dim()], dtype=torch.int8, device=device)
+        indices = tmp.nonzero().t().to(dtype=indices_dtype)
+        values = torch.zeros((tmp.numel(), *x.shape[x.sparse_dim():]), dtype=x.dtype, device=device)
+        x_coalesced = x.detach().coalesce()
+        if x_coalesced.numel() > 0:
+            stride = tmp.stride()
+            flat_indices = x_coalesced.indices().mul(
+                torch.tensor(stride, dtype=indices_dtype, device=device).unsqueeze(1)).sum(0)
+            values[flat_indices] = x_coalesced.values()
+        return torch.sparse_coo_tensor(indices, values, x.shape)._coalesced_(True).requires_grad_(x.requires_grad)
+    elif _is_sparse_compressed_tensor(x):
+        blocksize = x.values().shape[1:3] if x.layout in {torch.sparse_bsr, torch.sparse_bsc} else None
+        compressed_indices = x.crow_indices() if x.layout in {torch.sparse_csr, torch.sparse_bsr} else x.ccol_indices()
+        # We'll use intermediate sparse COO for simplicity
+        r = _densify(x.detach().to_sparse(layout=torch.sparse_coo)).to_sparse(layout=x.layout, blocksize=blocksize)
+        # Check that all elements are specified also after `to_sparse` op:
+        dense_numel = r.values().numel() // max(1, r.values().shape[0])
+        batch_numel = compressed_indices.numel() // compressed_indices.shape[-1]
+        sparse_numel = r.numel() // max(1, dense_numel * batch_numel)
+        if sparse_numel != r._nnz():
+            raise AssertionError(f'{x.layout} densify failed: expected nnz={sparse_numel} but got {r._nnz()}')
+        return r.requires_grad_(x.requires_grad)
+    elif _is_sparse_any_tensor(x):
+        raise NotImplementedError(x.layout)
+    return x
+
+
 def _iter_tensor(x_tensor):
     # (Only used for slow gradcheck) Returns a generator that yields the following
     # elements at each iteration:
@@ -114,8 +151,8 @@ def get_stride(size):
             x_blocksize = x_block_values.size()[1:3]
             x_indices = torch._convert_indices_from_csr_to_coo(x_tensor.crow_indices(), x_tensor.col_indices()) \
                              .repeat_interleave(x_blocksize[0] * x_blocksize[1], 1) \
-                             .mul_(torch.tensor(x_blocksize).reshape(2, 1)) \
-                             .add_(torch.stack(torch.where(torch.ones(x_blocksize))).repeat(1, x_nnz)).t()
+                             .mul_(torch.tensor(x_blocksize, device=x_tensor.device).reshape(2, 1)) \
+                             .add_(torch.stack(torch.where(torch.ones(x_blocksize, device=x_tensor.device))).repeat(1, x_nnz)).t()
             x_values = x_block_values.flatten(0, 2)
             x_nnz = x_values.size(0)
         elif x_tensor.layout is torch.sparse_bsc:
@@ -123,8 +160,8 @@ def get_stride(size):
             x_blocksize = x_block_values.size()[1:3]
             x_indices = torch._convert_indices_from_csr_to_coo(x_tensor.ccol_indices(), x_tensor.row_indices(), transpose=True) \
                              .repeat_interleave(x_blocksize[0] * x_blocksize[1], 1) \
-                             .mul_(torch.tensor(x_blocksize).reshape(2, 1)) \
-                             .add_(torch.stack(torch.where(torch.ones(x_blocksize))).repeat(1, x_nnz)).t()
+                             .mul_(torch.tensor(x_blocksize, device=x_tensor.device).reshape(2, 1)) \
+                             .add_(torch.stack(torch.where(torch.ones(x_blocksize, device=x_tensor.device))).repeat(1, x_nnz)).t()
             x_values = x_block_values.flatten(0, 2)
             x_nnz = x_values.size(0)
         else:
@@ -225,6 +262,19 @@ def fn_pack_inps(*inps):
 def _compute_numerical_gradient(fn, entry, v, norm_v, nbhd_checks_fn):
     # Performs finite differencing by perturbing `entry` in-place by `v` and
     # returns the gradient of each of the outputs wrt to x at idx.
+    if _is_sparse_compressed_tensor(entry):
+        # sparse compressed tensors don't implement sub/add/copy_
+        # yet. However, in non-masked semantics context entry and v
+        # have the same sparse indices ...
+        assert entry.layout == v.layout, (entry.layout, v.layout)
+        assert entry._nnz() == v._nnz(), (entry._nnz(), v._nnz(), entry.shape)
+        # ... the finite differencing can be performed on values only:
+        entry = entry.values()
+        v = v.values()
+        # we'll detach to avoid backward computations that sparse
+        # tensors have limited support for.
+        entry = entry.detach()
+
     orig = entry.clone()
     entry.copy_(orig - v)
     outa = fn()
@@ -677,9 +727,10 @@ def _get_analytical_vjps_wrt_specific_output(vjp_fn, sample_output, v) -> List[L
     return vjps
 
 
-def _check_inputs(tupled_inputs, check_sparse_nnz) -> bool:
-    if not check_sparse_nnz and any(_is_sparse_any_tensor(t) for t in tupled_inputs if isinstance(t, torch.Tensor)):
-        raise GradcheckError('gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False.')
+def _check_inputs(tupled_inputs, check_sparse_nnz, masked) -> bool:
+    if masked and not check_sparse_nnz and any(_is_sparse_any_tensor(t) for t in tupled_inputs if isinstance(t, torch.Tensor)):
+        raise GradcheckError('gradcheck expects all tensor inputs are dense'
+                             ' when check_sparse_nnz is set to False and masked is set to True.')
     # Make sure that gradients are saved for at least one input
     any_input_requiring_grad = False
     for idx, inp in enumerate(tupled_inputs):
@@ -917,8 +968,10 @@ def _test_backward_mul_by_grad_output(outputs, inputs, check_sparse_nnz) -> bool
                 raise GradcheckError('backward not multiplied by grad_output')
         elif not gi.eq(0).all():
             raise GradcheckError('backward not multiplied by grad_output')
-        if gi.dtype != di.dtype or gi.device != di.device or gi.is_sparse != di.is_sparse:
+        if gi.dtype != di.dtype:
             raise GradcheckError("grad is incorrect type")
+        if gi.device != di.device:
+            raise GradcheckError("grad is incorrect device")
         if gi.size() != di.size():
             raise GradcheckError('grad is incorrect size')
     return True
@@ -1141,13 +1194,15 @@ def _gradcheck_real_imag(gradcheck_fn, func, func_out, tupled_inputs, outputs, e
                 _test_undefined_forward_mode(func, outputs, tupled_inputs)
 
 def _slow_gradcheck(func, func_out, tupled_inputs, outputs, eps, rtol, atol, check_grad_dtypes,
-                    nondet_tol, *, use_forward_ad=False, complex_indices=None, test_imag=False):
+                    nondet_tol, *, use_forward_ad=False, complex_indices=None, test_imag=False, masked=False):
     func_out = _as_tuple(func_out)
     if not outputs:
         return _check_no_differentiable_outputs(func, tupled_inputs, func_out,
                                                 eps=eps, is_forward_ad=use_forward_ad)
+    tupled_inputs_numerical = tupled_inputs if masked else _densify(tupled_inputs)
 
-    numerical = _transpose(_get_numerical_jacobian(func, tupled_inputs, func_out, eps=eps, is_forward_ad=use_forward_ad))
+    numerical = _transpose(_get_numerical_jacobian(func, tupled_inputs_numerical, func_out,
+                                                   eps=eps, is_forward_ad=use_forward_ad))
     # Note: [numerical vs analytical output length]
     # The numerical path returns jacobian quantity for all outputs, even if requires_grad of that
     # output is False. This behavior is necessary for _check_no_differentiable_outputs to work.
@@ -1240,9 +1295,8 @@ def _adjusted_atol(atol, u, v):
     # matrix): v^T M u = \sum_{i} \sum_{j} u_i * v_j = (\sum_{i} u_i)(\sum_{i} v_i)
     # TODO: properly handle case when u is tuple instead of only taking first element
     u = u[0] if isinstance(u, tuple) else u
-    # TODO: replace torch.sparse.sum(u) with u.sum()
-    sum_u = torch.sparse.sum(u) if u.layout == torch.sparse_coo else u.sum()
-    sum_v = 1. if v is None else torch.sparse.sum(v) if v.layout == torch.sparse_coo else v.sum()
+    sum_u = u.sum()
+    sum_v = 1. if v is None else v.sum()
     return atol * float(sum_u) * float(sum_v)
 
 
@@ -1336,7 +1390,8 @@ def _check_analytical_numerical_equal(all_analytical, all_numerical, complex_ind
 
 
 def _fast_gradcheck(func, func_out, inputs, outputs, eps, rtol,
-                    atol, check_grad_dtypes, nondet_tol, *, use_forward_ad=False, complex_indices=None, test_imag=False):
+                    atol, check_grad_dtypes, nondet_tol, *, use_forward_ad=False, complex_indices=None, test_imag=False,
+                    masked=False):
     # See https://github.com/pytorch/pytorch/issues/53876 for details
     inp_tensors_idx, inp_tensors = _get_inp_tensors(inputs)
     # Backward mode computes v^T * J (VJP)
@@ -1348,7 +1403,10 @@ def _fast_gradcheck(func, func_out, inputs, outputs, eps, rtol,
     # we don't need v for correctness check here as asserted below
     all_v, all_u, all_u_dense = _make_vectors(inp_tensors, outputs, use_forward_ad=use_forward_ad)
 
-    numerical_vJu = _get_numerical_vJu(func, inputs, inp_tensors_idx, func_out, all_u, all_v, eps, is_forward_ad=use_forward_ad)
+    inputs_numerical, all_u_numerical, all_v_numerical = (inputs, all_u, all_v) if masked else _densify((inputs, all_u, all_v))
+
+    numerical_vJu = _get_numerical_vJu(func, inputs_numerical, inp_tensors_idx, func_out,
+                                       all_u_numerical, all_v_numerical, eps, is_forward_ad=use_forward_ad)
     # TODO: replicate https://github.com/pytorch/pytorch/pull/77743 for fast gradcheck as well
     if use_forward_ad:
         assert all_v is None
@@ -1391,6 +1449,7 @@ def gradcheck(
     check_forward_ad: bool = False,
     check_backward_ad: bool = True,
     fast_mode: bool = False,
+    masked: bool = False,
 ) -> bool:
     r"""Check gradients computed via small finite differences against analytical
     gradients w.r.t. tensors in :attr:`inputs` that are of floating point or complex type
@@ -1413,6 +1472,12 @@ def gradcheck(
         This check will likely fail if :attr:`input` is of less precision, e.g.,
         ``FloatTensor``.
 
+    .. note::
+        Gradcheck may fail when evaluated on non-differentiable points
+        because the numerically computed gradients via finite differencing may differ
+        those computed analytically (not necessarily because either is incorrect).
+        For more context, see :ref:`non-differentiable-func-grad`.
+
     .. warning::
        If any checked tensor in :attr:`input` has overlapping memory, i.e.,
        different indices pointing to the same memory address (e.g., from
@@ -1449,7 +1514,8 @@ def gradcheck(
             implemented for R to R functions. If none of the inputs and outputs are complex
             a faster implementation of gradcheck that no longer computes the entire jacobian
             is run; otherwise, we fall back to the slow implementation.
-
+        masked (bool, optional): if True, the gradients of unspecified elements of
+            sparse tensors are ignored (default, False).
     Returns:
         True if all differences satisfy allclose condition
     """
@@ -1472,15 +1538,15 @@ def gradcheck(
 
 def _gradcheck_helper(func, inputs, eps, atol, rtol, check_sparse_nnz, nondet_tol, check_undefined_grad,
                       check_grad_dtypes, check_batched_grad, check_batched_forward_grad, check_forward_ad,
-                      check_backward_ad, fast_mode):
+                      check_backward_ad, fast_mode, masked):
     tupled_inputs = _as_tuple(inputs)
-    _check_inputs(tupled_inputs, check_sparse_nnz)
+    _check_inputs(tupled_inputs, check_sparse_nnz, masked)
 
     func_out = func(*tupled_inputs)
     outputs = _differentiable_outputs(func_out)
     _check_outputs(outputs)
 
-    gradcheck_fn = _fast_gradcheck if fast_mode else _slow_gradcheck
+    gradcheck_fn = functools.partial(_fast_gradcheck if fast_mode else _slow_gradcheck, masked=masked)
     _gradcheck_real_imag(gradcheck_fn, func, func_out, tupled_inputs, outputs, eps,
                          rtol, atol, check_grad_dtypes, check_forward_ad=check_forward_ad,
                          check_backward_ad=check_backward_ad, nondet_tol=nondet_tol,
@@ -1521,6 +1587,7 @@ def gradgradcheck(
     check_fwd_over_rev: bool = False,
     check_rev_over_rev: bool = True,
     fast_mode: bool = False,
+    masked: bool = False,
 ) -> bool:
     r"""Check gradients of gradients computed via small finite differences
     against analytical gradients w.r.t. tensors in :attr:`inputs` and
@@ -1571,7 +1638,8 @@ def gradgradcheck(
             batched gradients using prototype vmap support. Defaults to False.
         fast_mode (bool, optional): if True, run a faster implementation of gradgradcheck that
             no longer computes the entire jacobian.
-
+        masked (bool, optional): if True, the gradients of unspecified elements of
+            sparse tensors are ignored (default, False).
     Returns:
         True if all differences satisfy allclose condition
     """
@@ -1609,8 +1677,8 @@ def gradgradcheck(
 
     # NB: We need to save the requires_grad information about the inputs here because gradcheck detaches inputs
     #     before running forward mode AD
-    diff_input_args_indices = set(i for i, x in enumerate(tupled_inputs) if is_tensor_like(x) and x.requires_grad)
-    diff_grad_output_indices = set(i for i, x in enumerate(tupled_grad_outputs) if x.requires_grad)
+    diff_input_args_indices = {i for i, x in enumerate(tupled_inputs) if is_tensor_like(x) and x.requires_grad}
+    diff_grad_output_indices = {i for i, x in enumerate(tupled_grad_outputs) if x.requires_grad}
 
     def new_func(*args):
         # Restore the requires_grad information
@@ -1627,4 +1695,4 @@ def new_func(*args):
         new_func, tupled_inputs + tupled_grad_outputs, eps=eps, atol=atol, rtol=rtol, raise_exception=raise_exception,
         nondet_tol=nondet_tol, check_undefined_grad=check_undefined_grad,
         check_grad_dtypes=check_grad_dtypes, check_batched_grad=check_batched_grad, fast_mode=fast_mode,
-        check_forward_ad=check_fwd_over_rev, check_backward_ad=check_rev_over_rev)
+        check_forward_ad=check_fwd_over_rev, check_backward_ad=check_rev_over_rev, masked=masked)
diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index 3ec23010e601..cff62beae83a 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -40,7 +40,7 @@
 except ImportError:
     import functools
 
-    class _ContextDecorator(object):  # type: ignore[no-redef]
+    class _ContextDecorator:  # type: ignore[no-redef]
 
         def __enter__(self):
             raise NotImplementedError
@@ -56,7 +56,7 @@ def wrapped(*args, **kwargs):
 
             return wrapped
 
-class profile(object):
+class profile:
     """Context manager that manages autograd profiler state and holds a summary of results.
     Under the hood it just records events of functions being executed in C++ and
     exposes those events to Python. You can wrap any code into it and it will
@@ -549,12 +549,12 @@ def _call_end_callbacks_on_future(self, fut: Future[Any]) -> Future[Any]:
         return profiled_future
 
 
-class emit_itt(object):
+class emit_itt:
     """Context manager that makes every autograd operation emit an ITT range.
 
     It is useful when running the program under Intel(R) VTune Profiler::
 
-        vtune <--vtune_flags> <regular command here>
+        vtune <--vtune-flags> <regular command here>
 
     The Instrumentation and Tracing Technology (ITT) API enables your application to generate and
     control the collection of trace data during its execution across different Intel tools.
@@ -616,7 +616,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         return False
 
 
-class emit_nvtx(object):
+class emit_nvtx:
     """Context manager that makes every autograd operation emit an NVTX range.
 
     It is useful when running the program under nvprof::
@@ -742,7 +742,7 @@ def load_nvprof(path):
     return EventList(parse_nvprof_trace(path))
 
 
-class EnforceUnique(object):
+class EnforceUnique:
     """Raises an error if a key is seen more than once."""
     def __init__(self):
         self.seen = set()
diff --git a/torch/autograd/profiler_legacy.py b/torch/autograd/profiler_legacy.py
index 5848e21ed15e..1f71c61d51d2 100644
--- a/torch/autograd/profiler_legacy.py
+++ b/torch/autograd/profiler_legacy.py
@@ -15,7 +15,7 @@
 
 __all__ = ["profile"]
 
-class profile(object):
+class profile:
     """DEPRECATED: use torch.profiler instead"""
     def __init__(
             self,
diff --git a/torch/autograd/profiler_util.py b/torch/autograd/profiler_util.py
index 891992aed5c6..a4585c9699b0 100644
--- a/torch/autograd/profiler_util.py
+++ b/torch/autograd/profiler_util.py
@@ -19,7 +19,7 @@ def __init__(self, *args, **kwargs):
         use_cuda = kwargs.pop('use_cuda', True)
         profile_memory = kwargs.pop('profile_memory', False)
         with_flops = kwargs.pop('with_flops', False)
-        super(EventList, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
         self._use_cuda = use_cuda
         self._profile_memory = profile_memory
         self._tree_built = False
@@ -349,7 +349,7 @@ def _attr_formatter(name):
     return property(lambda self: _format_time(getattr(self, name)))
 
 
-class FormattedTimesMixin(object):
+class FormattedTimesMixin:
     """Helpers for FunctionEvent and FunctionEventAvg.
 
     The subclass should define `*_time_total` and `count` attributes.
@@ -370,7 +370,7 @@ def cuda_time(self):
         return 0.0 if self.count == 0 else 1.0 * self.cuda_time_total / self.count  # type: ignore[attr-defined]
 
 
-class Interval(object):
+class Interval:
     def __init__(self, start, end):
         self.start = start
         self.end = end
diff --git a/torch/autograd/variable.py b/torch/autograd/variable.py
index 57b210e7fe5d..ed841d4da7d4 100644
--- a/torch/autograd/variable.py
+++ b/torch/autograd/variable.py
@@ -1,15 +1,14 @@
 import torch
-from torch._six import with_metaclass
+from torch._C import _ImperativeEngine as ImperativeEngine
+
 
 __all__ = ["VariableMeta", "Variable"]
 
+
 class VariableMeta(type):
     def __instancecheck__(cls, other):
         return isinstance(other, torch.Tensor)
 
-# mypy doesn't understand torch._six.with_metaclass
-class Variable(with_metaclass(VariableMeta, torch._C._LegacyVariableBase)):  # type: ignore[misc]
-    pass
 
-from torch._C import _ImperativeEngine as ImperativeEngine
-Variable._execution_engine = ImperativeEngine()
+class Variable(torch._C._LegacyVariableBase, metaclass=VariableMeta):  # type: ignore[misc]
+    _execution_engine = ImperativeEngine()
diff --git a/torch/backends/__init__.py b/torch/backends/__init__.py
index 9d74b8f9f0f0..5f8e5171bc2e 100644
--- a/torch/backends/__init__.py
+++ b/torch/backends/__init__.py
@@ -23,7 +23,7 @@ def __allow_nonbracketed_mutation():
     finally:
         __allow_nonbracketed_mutation_flag = old
 
-class ContextProp(object):
+class ContextProp:
     def __init__(self, getter, setter):
         self.getter = getter
         self.setter = setter
@@ -40,7 +40,7 @@ def __set__(self, obj, val):
 
 class PropModule(types.ModuleType):
     def __init__(self, m, name):
-        super(PropModule, self).__init__(name)
+        super().__init__(name)
         self.m = m
 
     def __getattr__(self, attr):
diff --git a/torch/backends/_nnapi/serializer.py b/torch/backends/_nnapi/serializer.py
index 4bbf9b5e8530..d16bfea22215 100644
--- a/torch/backends/_nnapi/serializer.py
+++ b/torch/backends/_nnapi/serializer.py
@@ -21,7 +21,7 @@
 LOG = logging.getLogger("nnapi_serialize")
 
 
-class NNAPI_OperandCode(object):
+class NNAPI_OperandCode:
     FLOAT32 = 0
     INT32 = 1
     UINT32 = 2
@@ -37,7 +37,7 @@ class NNAPI_OperandCode(object):
     TENSOR_QUANT16_ASYMM = 12
 
 
-class NNAPI_OperationCode(object):
+class NNAPI_OperationCode:
     ADD = 0
     AVERAGE_POOL_2D = 1
     CONCATENATION = 2
@@ -135,14 +135,14 @@ class NNAPI_OperationCode(object):
     RESIZE_NEAREST_NEIGHBOR = 94
 
 
-class NNAPI_FuseCode(object):
+class NNAPI_FuseCode:
     FUSED_NONE = 0
     FUSED_RELU = 1
     FUSED_RELU1 = 2
     FUSED_RELU6 = 3
 
 
-class OperandValueSourceType(object):
+class OperandValueSourceType:
     IMMEDIATE = 0
     NUMBERED_BUFFER = 2
     NUMBERED_MEMORY = 3
@@ -319,7 +319,7 @@ def flex_name(op_id, dim):
     return f"s_{op_id}_{dim}"
 
 
-class _NnapiSerializer(object):
+class _NnapiSerializer:
     def __init__(self, config, use_int16_for_qint16=False):
         self.operands = []
         self.values = []
@@ -491,7 +491,7 @@ def get_tensor_operand_by_jitval_fixed_size(self, jitval):
                 raise Exception("Flexible size is not supported for this operand.")
             if s < 0:
                 # runtime flex
-                LOG.warn(f"Operand {oper} has runtime flex shape")
+                LOG.warning(f"Operand {oper} has runtime flex shape")
         return op_id, oper
 
     def get_tensor_operand_or_constant(self, jitval, dim_order=DimOrder.PRESUMED_CONTIGUOUS):
diff --git a/torch/backends/cuda/__init__.py b/torch/backends/cuda/__init__.py
index 2fd5cf6fdce9..4f188ee8ad53 100644
--- a/torch/backends/cuda/__init__.py
+++ b/torch/backends/cuda/__init__.py
@@ -18,7 +18,7 @@ def is_built():
     return torch._C.has_cuda
 
 
-class cuFFTPlanCacheAttrContextProp(object):
+class cuFFTPlanCacheAttrContextProp:
     # Like regular ContextProp, but uses the `.device_index` attribute from the
     # calling object as the first argument to the getter and setter.
     def __init__(self, getter, setter):
@@ -34,7 +34,7 @@ def __set__(self, obj, val):
         self.setter(obj.device_index, val)
 
 
-class cuFFTPlanCache(object):
+class cuFFTPlanCache:
     r"""
     Represents a specific plan cache for a specific `device_index`. The
     attributes `size` and `max_size`, and method `clear`, can fetch and/ or
@@ -55,7 +55,7 @@ def clear(self):
         return torch._cufft_clear_plan_cache(self.device_index)
 
 
-class cuFFTPlanCacheManager(object):
+class cuFFTPlanCacheManager:
     r"""
     Represents all cuFFT plan caches. When indexed with a device object/index,
     this object returns the `cuFFTPlanCache` corresponding to that device.
@@ -88,7 +88,7 @@ def __setattr__(self, name, value):
         if self.__initialized:
             return setattr(self[torch.cuda.current_device()], name, value)
         else:
-            return super(cuFFTPlanCacheManager, self).__setattr__(name, value)
+            return super().__setattr__(name, value)
 
 
 class cuBLASModule:
@@ -172,9 +172,9 @@ def preferred_linalg_library(backend: Union[None, str, torch._C._LinalgBackend]
 class SDPBackend(IntEnum):
     r"""Enum class for the scaled dot product attention backends.
 
-    .. warning:: This flag is experimental and subject to change.'
+    .. warning:: This class is in beta and subject to change.
 
-    This class needs to stay inline with the enum defined in:
+    This class needs to stay aligned with the enum defined in:
     pytorch/aten/src/ATen/native/transformers/sdp_utils_cpp.h
     """
     ERROR = -1
@@ -185,52 +185,52 @@ class SDPBackend(IntEnum):
 
 def flash_sdp_enabled():
     r"""
-    .. warning:: This flag is experimental and subject to change.
+    .. warning:: This flag is beta and subject to change.
 
-    Returns whether flash sdp is enabled or not.
+    Returns whether flash scaled dot product attention is enabled or not.
     """
     return torch._C._get_flash_sdp_enabled()
 
 
 def enable_flash_sdp(enabled: bool):
     r"""
-    .. warning:: This flag is experimental and subject to change.
+    .. warning:: This flag is beta and subject to change.
 
-    Enables or disables flash sdp.
+    Enables or disables flash scaled dot product attention.
     """
     torch._C._set_sdp_use_flash(enabled)
 
 def mem_efficient_sdp_enabled():
     r"""
-    .. warning:: This flag is experimental and subject to change.
+    .. warning:: This flag is beta and subject to change.
 
-    Returns whether memory efficient sdp is enabled or not.
+    Returns whether memory efficient scaled dot product attention is enabled or not.
     """
     return torch._C._get_mem_efficient_sdp_enabled()
 
 
 def enable_mem_efficient_sdp(enabled: bool):
     r"""
-    .. warning:: This flag is experimental and subject to change.
+    .. warning:: This flag is beta and subject to change.
 
-    Enables or disables memory efficient sdp.
+    Enables or disables memory efficient scaled dot product attention.
     """
     torch._C._set_sdp_use_mem_efficient(enabled)
 
 def math_sdp_enabled():
     r"""
-    .. warning:: This flag is experimental and subject to change.
+    .. warning:: This flag is beta and subject to change.
 
-    Returns whether math sdp is enabled or not.
+    Returns whether math scaled dot product attention is enabled or not.
     """
     return torch._C._get_math_sdp_enabled()
 
 
 def enable_math_sdp(enabled: bool):
     r"""
-    .. warning:: This flag is experimental and subject to change.
+    .. warning:: This flag is beta and subject to change.
 
-    Enables or disables math sdp.
+    Enables or disables math scaled dot product attention.
     """
     torch._C._set_sdp_use_math(enabled)
 
@@ -238,9 +238,9 @@ def enable_math_sdp(enabled: bool):
 @contextlib.contextmanager
 def sdp_kernel(enable_flash: bool = True, enable_math: bool = True, enable_mem_efficient: bool = True):
     r"""
-    .. warning:: This flag is experimental and subject to change.
+    .. warning:: This flag is beta and subject to change.
 
-    This context manager can be used to temporarily enable or disable flash/memory efficient sdp and math sdp.
+    This context manager can be used to temporarily enable or disable any of the three backends for scaled dot product attention.
     Upon exiting the context manager, the previous state of the flags will be restored.
     """
     previous_flash: bool = flash_sdp_enabled()
diff --git a/torch/backends/cudnn/__init__.py b/torch/backends/cudnn/__init__.py
index 2b63a6379665..1875a50eb1c8 100644
--- a/torch/backends/cudnn/__init__.py
+++ b/torch/backends/cudnn/__init__.py
@@ -141,7 +141,7 @@ def flags(enabled=False, benchmark=False, benchmark_limit=10, deterministic=Fals
 
 class CudnnModule(PropModule):
     def __init__(self, m, name):
-        super(CudnnModule, self).__init__(m, name)
+        super().__init__(m, name)
 
     enabled = ContextProp(torch._C._get_cudnn_enabled, torch._C._set_cudnn_enabled)
     deterministic = ContextProp(torch._C._get_cudnn_deterministic, torch._C._set_cudnn_deterministic)
diff --git a/torch/backends/cudnn/rnn.py b/torch/backends/cudnn/rnn.py
index 3fa81b42cb11..706244e2bc3e 100644
--- a/torch/backends/cudnn/rnn.py
+++ b/torch/backends/cudnn/rnn.py
@@ -24,7 +24,7 @@ def get_cudnn_mode(mode):
 # NB: We don't actually need this class anymore (in fact, we could serialize the
 # dropout state for even better reproducibility), but it is kept for backwards
 # compatibility for old models.
-class Unserializable(object):
+class Unserializable:
 
     def __init__(self, inner):
         self.inner = inner
diff --git a/torch/backends/mkl/__init__.py b/torch/backends/mkl/__init__.py
index 25c11ea10515..22cad6db2203 100644
--- a/torch/backends/mkl/__init__.py
+++ b/torch/backends/mkl/__init__.py
@@ -6,7 +6,7 @@ def is_available():
 
 VERBOSE_OFF = 0
 VERBOSE_ON = 1
-class verbose(object):
+class verbose:
     """
     On-demand oneMKL verbosing functionality
     To make it easier to debug performance issues, oneMKL can dump verbose
diff --git a/torch/backends/mkldnn/__init__.py b/torch/backends/mkldnn/__init__.py
index 00b22cee15e0..e9b7846e840d 100644
--- a/torch/backends/mkldnn/__init__.py
+++ b/torch/backends/mkldnn/__init__.py
@@ -10,7 +10,7 @@ def is_available():
 VERBOSE_OFF = 0
 VERBOSE_ON = 1
 VERBOSE_ON_CREATION = 2
-class verbose(object):
+class verbose:
     """
     On-demand oneDNN (former MKL-DNN) verbosing functionality
     To make it easier to debug performance issues, oneDNN can dump verbose
@@ -70,7 +70,7 @@ def flags(enabled=False):
 
 class MkldnnModule(PropModule):
     def __init__(self, m, name):
-        super(MkldnnModule, self).__init__(m, name)
+        super().__init__(m, name)
 
     enabled = ContextProp(torch._C._get_mkldnn_enabled, torch._C._set_mkldnn_enabled)
 
diff --git a/torch/backends/mps/__init__.py b/torch/backends/mps/__init__.py
index 80dc735f7b43..2c6ef64665bc 100644
--- a/torch/backends/mps/__init__.py
+++ b/torch/backends/mps/__init__.py
@@ -15,13 +15,13 @@ def is_built() -> bool:
 @_lru_cache()
 def is_available() -> bool:
     r"""Returns a bool indicating if MPS is currently available."""
-    return torch._C._is_mps_available()
+    return torch._C._mps_is_available()
 
 
 @_lru_cache()
-def is_macos13_or_newer() -> bool:
+def is_macos13_or_newer(minor: int = 0) -> bool:
     r"""Returns a bool indicating whether MPS is running on MacOS 13 or newer."""
-    return torch._C._is_mps_on_macos_13_or_newer()
+    return torch._C._mps_is_on_macos_13_or_newer(minor)
 
 
 # Register prims as implementation of var_mean and group_norm
diff --git a/torch/backends/opt_einsum/__init__.py b/torch/backends/opt_einsum/__init__.py
index 966258fdd016..5a280b08b4f9 100644
--- a/torch/backends/opt_einsum/__init__.py
+++ b/torch/backends/opt_einsum/__init__.py
@@ -82,7 +82,7 @@ def flags(enabled=None, strategy=None):
 
 class OptEinsumModule(PropModule):
     def __init__(self, m, name):
-        super(OptEinsumModule, self).__init__(m, name)
+        super().__init__(m, name)
 
     global enabled
     enabled = ContextProp(_get_enabled, _set_enabled)
diff --git a/torch/backends/quantized/__init__.py b/torch/backends/quantized/__init__.py
index 2db2b672f1b4..c0d60916084f 100644
--- a/torch/backends/quantized/__init__.py
+++ b/torch/backends/quantized/__init__.py
@@ -25,14 +25,14 @@ def _get_qengine_str(qengine: int) -> str:
     all_engines = {0 : 'none', 1 : 'fbgemm', 2 : 'qnnpack', 3 : 'onednn', 4 : 'x86'}
     return all_engines.get(qengine, '*undefined')
 
-class _QEngineProp(object):
+class _QEngineProp:
     def __get__(self, obj, objtype) -> str:
         return _get_qengine_str(torch._C._get_qengine())
 
     def __set__(self, obj, val: str) -> None:
         torch._C._set_qengine(_get_qengine_id(val))
 
-class _SupportedQEnginesProp(object):
+class _SupportedQEnginesProp:
     def __get__(self, obj, objtype) -> List[str]:
         qengines = torch._C._supported_qengines()
         return [_get_qengine_str(qe) for qe in qengines]
@@ -42,7 +42,7 @@ def __set__(self, obj, val) -> None:
 
 class QuantizedEngine(types.ModuleType):
     def __init__(self, m, name):
-        super(QuantizedEngine, self).__init__(name)
+        super().__init__(name)
         self.m = m
 
     def __getattr__(self, attr):
diff --git a/torch/backends/xeon/run_cpu.py b/torch/backends/xeon/run_cpu.py
index 8774d8acc11d..0a5774ff2319 100644
--- a/torch/backends/xeon/run_cpu.py
+++ b/torch/backends/xeon/run_cpu.py
@@ -60,20 +60,20 @@
 
 ::
 
-   python -m torch.backends.xeon.run_cpu --throughput_mode script.py args
+   python -m torch.backends.xeon.run_cpu --throughput-mode script.py args
 
 2. Run single-instance inference on a single CPU node.
 
 ::
 
-   python -m torch.backends.xeon.run_cpu --node_id 1 script.py args
+   python -m torch.backends.xeon.run_cpu --node-id 1 script.py args
 
 Multi-instance inference
 ------------------------
 
 1. Multi-instance
    By default this tool runs one process per node. If you want to set the instance numbers and core per instance,
-   --ninstances and  --ncores_per_instance should be set.
+   --ninstances and  --ncores-per-instance should be set.
 
 ::
 
@@ -83,7 +83,7 @@
 
 ::
 
-   python -m torch.backends.xeon.run_cpu --ninstances 14 --ncores_per_instance 4 python_script args
+   python -m torch.backends.xeon.run_cpu --ninstances 14 --ncores-per-instance 4 python_script args
 
 2. Run single-instance inference among multiple instances.
    By default, runs all ninstances. If you want to independently run a single instance among ninstances, specify rank.
@@ -105,7 +105,7 @@
 
 ::
 
-   python -m torch.backends.xeon.run_cpu --core_list "0, 1, 2, 3" --ninstances 2 --ncores_per_instance 2
+   python -m torch.backends.xeon.run_cpu --core-list "0, 1, 2, 3" --ninstances 2 --ncores-per-instance 2
    --rank 0 python_script args
 
 3. To look up what optional arguments this module offers:
@@ -117,7 +117,7 @@
 Memory allocator
 ----------------
 
-"--enable_tcmalloc" and "--enable_jemalloc" can be used to enable different memory allcator.
+"--enable-tcmalloc" and "--enable-jemalloc" can be used to enable different memory allcator.
 
 """
 
@@ -233,8 +233,8 @@ def numa_aware_check(self, core_list):
                 numa_ids.append(numa_id)
         if len(numa_ids) > 1:
             logger.warning(f"Numa Aware: cores:{str(core_list)} on different NUMA nodes:{str(numa_ids)}. To avoid \
-this behavior, please use --ncores_per_instance knob to make sure number of cores is divisible by --ncores_per_\
-instance. Alternatively, please use --skip_cross_node_cores knob.")
+this behavior, please use --ncores-per-instance knob to make sure number of cores is divisible by --ncores-per-\
+instance. Alternatively, please use --skip-cross-node-cores knob.")
         if len(numa_ids) == 0:
             raise RuntimeError("invalid number of NUMA nodes; please make sure numa_ids >= 1")
         return numa_ids
@@ -376,7 +376,7 @@ def launch(self, args):
         if args.core_list:  # user specify what cores will be used by params
             cores = [int(x) for x in args.core_list.split(",")]
             if args.ncores_per_instance == -1:
-                raise RuntimeError("please specify the \"--ncores_per_instance\" if you have pass the --core_list params")
+                raise RuntimeError("please specify the \"--ncores-per-instance\" if you have pass the --core-list params")
             elif args.ninstances > 1 and args.ncores_per_instance * args.ninstances < len(cores):
                 logger.warning(f"only first {args.ncores_per_instance * args.ninstances} cores will be used, \
 but you specify {len(cores)} cores in core_list")
@@ -417,17 +417,17 @@ def launch(self, args):
                     if args.ncores_per_instance > ncore_per_node:
                         # too many ncores_per_instance to skip cross-node cores
                         logger.warning("there are {} core(s) per socket, but you specify {} ncores_per_instance and \
-skip_cross_node_cores. Please make sure --ncores_per_instance < core(s) per \
+skip_cross_node_cores. Please make sure --ncores-per-instance < core(s) per \
 socket".format(ncore_per_node, args.ncores_per_instance))
                         exit(-1)
                     elif num_leftover_cores == 0:
                         # aren't any cross-node cores
-                        logger.info('--skip_cross_node_cores is set, but there are no cross-node cores.')
+                        logger.info('--skip-cross-node-cores is set, but there are no cross-node cores.')
                         args.ninstances = len(cores) // args.ncores_per_instance
                     else:
                         # skip cross-node cores
                         if args.ninstances != -1:
-                            logger.warning('--skip_cross_node_cores is exclusive to --ninstances. --ninstances \
+                            logger.warning('--skip-cross-node-cores is exclusive to --ninstances. --ninstances \
 won\'t take effect even if it is set explicitly.')
 
                         i = 1
@@ -442,15 +442,15 @@ def launch(self, args):
                 if args.ninstances * args.ncores_per_instance > len(cores):
                     raise RuntimeError("Please make sure ninstances * ncores_per_instance <= total_cores")
             if args.latency_mode:
-                logger.warning("--latency_mode is exclusive to --ninstances, --ncores_per_instance, --node_id and \
---use_logical_core. They won't take effect even they are set explicitly.")
+                logger.warning("--latency-mode is exclusive to --ninstances, --ncores-per-instance, --node-id and \
+--use-logical-core. They won't take effect even they are set explicitly.")
                 args.ncores_per_instance = 4
                 cores = self.cpuinfo.get_all_physical_cores()
                 args.ninstances = len(cores) // args.ncores_per_instance
 
             if args.throughput_mode:
-                logger.warning("--throughput_mode is exclusive to --ninstances, --ncores_per_instance, --node_id and \
---use_logical_core. They won't take effect even they are set explicitly.")
+                logger.warning("--throughput-mode is exclusive to --ninstances, --ncores-per-instance, --node-id and \
+--use-logical-core. They won't take effect even they are set explicitly.")
                 args.ninstances = self.cpuinfo.node_nums
                 cores = self.cpuinfo.get_all_physical_cores()
                 args.ncores_per_instance = len(cores) // args.ninstances
@@ -531,48 +531,48 @@ def _add_memory_allocator_params(parser):
 
     group = parser.add_argument_group("Memory Allocator Parameters")
     # allocator control
-    group.add_argument("--enable_tcmalloc", action="store_true", default=False,
+    group.add_argument("--enable-tcmalloc", "--enable_tcmalloc", action="store_true", default=False,
                        help="Enable tcmalloc allocator")
-    group.add_argument("--enable_jemalloc", action="store_true", default=False,
+    group.add_argument("--enable-jemalloc", "--enable_jemalloc", action="store_true", default=False,
                        help="Enable jemalloc allocator")
-    group.add_argument("--use_default_allocator", action="store_true", default=False,
+    group.add_argument("--use-default-allocator", "--use_default_allocator", action="store_true", default=False,
                        help="Use default memory allocator")
 
 def _add_multi_instance_params(parser):
 
     group = parser.add_argument_group("Multi-instance Parameters")
     # multi-instance control
-    group.add_argument("--ncores_per_instance", metavar="\b", default=-1, type=int,
+    group.add_argument("--ncores-per-instance", "--ncores_per_instance", metavar="\b", default=-1, type=int,
                        help="Cores per instance")
     group.add_argument("--ninstances", metavar="\b", default=-1, type=int,
                        help="For multi-instance, you should give the cores number you used for per instance.")
-    group.add_argument("--skip_cross_node_cores", action='store_true', default=False,
-                       help="If specified --ncores_per_instance, skips cross-node cores.")
+    group.add_argument("--skip-cross-node-cores", "--skip_cross_node_cores", action='store_true', default=False,
+                       help="If specified --ncores-per-instance, skips cross-node cores.")
     group.add_argument("--rank", metavar="\b", default="-1", type=int,
                        help="Specify instance index to assign ncores_per_instance for rank; \
 otherwise ncores_per_instance will be assigned sequentially to ninstances. Please refer to \
 https://github.com/intel/intel-extension-for-pytorch/blob/master/docs/tutorials/performance_tuning/launch_script.md")
-    group.add_argument("--latency_mode", action="store_true", default=False,
+    group.add_argument("--latency-mode", "--latency_mode", action="store_true", default=False,
                        help="By detault 4 core per instance and use all physical cores")
-    group.add_argument("--throughput_mode", action="store_true", default=False,
+    group.add_argument("--throughput-mode", "--throughput_mode", action="store_true", default=False,
                        help="By default one instance per node and use all physical cores")
-    group.add_argument("--node_id", metavar="\b", default=-1, type=int,
+    group.add_argument("--node-id", "--node_id", metavar="\b", default=-1, type=int,
                        help="node id for multi-instance, by default all nodes will be used")
-    group.add_argument("--use_logical_core", action="store_true", default=False,
+    group.add_argument("--use-logical-core", "--use_logical_core", action="store_true", default=False,
                        help="Whether only use physical cores")
-    group.add_argument("--disable_numactl", action="store_true", default=False,
+    group.add_argument("--disable-numactl", "--disable_numactl", action="store_true", default=False,
                        help="Disable numactl")
-    group.add_argument("--core_list", metavar="\b", default=None, type=str,
+    group.add_argument("--core-list", "--core_list", metavar="\b", default=None, type=str,
                        help="Specify the core list as \"core_id, core_id, ....\", otherwise, all the cores will be used.")
-    group.add_argument("--log_path", metavar="\b", default="", type=str,
+    group.add_argument("--log-path", "--log_path", metavar="\b", default="", type=str,
                        help="The log file directory. Default path is "", which means disable logging to files.")
-    group.add_argument("--log_file_prefix", metavar="\b", default="run", type=str,
+    group.add_argument("--log-file-prefix", "--log_file_prefix", metavar="\b", default="run", type=str,
                        help="log file prefix")
 
 def _add_kmp_iomp_params(parser):
 
     group = parser.add_argument_group("IOMP Parameters")
-    group.add_argument("--disable_iomp", action="store_true", default=False,
+    group.add_argument("--disable-iomp", "--disable_iomp", action="store_true", default=False,
                        help="By default, we use Intel OpenMP and libiomp5.so will be add to LD_PRELOAD")
 
 def create_args(parser=None):
@@ -580,7 +580,7 @@ def create_args(parser=None):
     Helper function parsing the command line options
     @retval ArgumentParser
     """
-    parser.add_argument("--multi_instance", action="store_true", default=False,
+    parser.add_argument("--multi-instance", "--multi_instance", action="store_true", default=False,
                         help="Enable multi-instance, by default one instance per node")
 
     parser.add_argument("-m", "--module", default=False, action="store_true",
@@ -588,7 +588,7 @@ def create_args(parser=None):
                              "as a python module, executing with the same behavior as"
                              "\"python -m\".")
 
-    parser.add_argument("--no_python", default=False, action="store_true",
+    parser.add_argument("--no-python", "--no_python", default=False, action="store_true",
                         help="Do not prepend the --program script with \"python\" - just exec "
                              "it directly. Useful when the script is not a Python script.")
 
@@ -618,7 +618,7 @@ def main(args):
         raise RuntimeError("Either args.latency_mode or args.throughput_mode should be set")
 
     if not args.no_python and not args.program.endswith(".py"):
-        raise RuntimeError("For non Python script, you should use \"--no_python\" parameter.")
+        raise RuntimeError("For non Python script, you should use \"--no-python\" parameter.")
 
     # Verify LD_PRELOAD
     if "LD_PRELOAD" in os.environ:
@@ -653,7 +653,7 @@ def main(args):
                                         "\n   >>> python -m torch.backends.xeon.run_cpu python_script args \n"
                                         "\n2. multi-instance \n"
                                         "\n   >>> python -m torch.backends.xeon.run_cpu --ninstances xxx "
-                                        "--ncores_per_instance xx python_script args\n"
+                                        "--ncores-per-instance xx python_script args\n"
                                         "\n############################################################################# \n",
                                         formatter_class=RawTextHelpFormatter)
     create_args(parser)
diff --git a/torch/backends/xnnpack/__init__.py b/torch/backends/xnnpack/__init__.py
index 3731413575f2..17c7f15b355b 100644
--- a/torch/backends/xnnpack/__init__.py
+++ b/torch/backends/xnnpack/__init__.py
@@ -2,7 +2,7 @@
 import torch
 import types
 
-class _XNNPACKEnabled(object):
+class _XNNPACKEnabled:
     def __get__(self, obj, objtype):
         return torch._C._is_xnnpack_enabled()
 
@@ -11,7 +11,7 @@ def __set__(self, obj, val):
 
 class XNNPACKEngine(types.ModuleType):
     def __init__(self, m, name):
-        super(XNNPACKEngine, self).__init__(name)
+        super().__init__(name)
         self.m = m
 
     def __getattr__(self, attr):
diff --git a/torch/csrc/Device.h b/torch/csrc/Device.h
index 665c38bf035d..5b45e3902e83 100644
--- a/torch/csrc/Device.h
+++ b/torch/csrc/Device.h
@@ -5,7 +5,6 @@
 
 #include <ATen/Device.h>
 
-// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
 struct TORCH_API THPDevice {
   PyObject_HEAD at::Device device;
 };
diff --git a/torch/csrc/Exceptions.cpp b/torch/csrc/Exceptions.cpp
index 67ac3decd6b1..8ac76b723002 100644
--- a/torch/csrc/Exceptions.cpp
+++ b/torch/csrc/Exceptions.cpp
@@ -1,11 +1,10 @@
 #include <torch/csrc/Exceptions.h>
 #include <torch/csrc/python_headers.h>
 
+#include <array>
 #include <cstdarg>
 #include <exception>
-#include <sstream>
 #include <utility>
-#include <vector>
 
 #include <fmt/format.h>
 #include <torch/csrc/THP.h>
@@ -82,6 +81,7 @@ void processErrorMsgInplace(std::string& str) {
   // Translate Aten types to their respective pytorch ones
   constexpr std::array<std::pair<c10::string_view, c10::string_view>, 64>
       changes{{
+          // TODO: remove torch.(cuda.|)sparse.*Tensor items?
           {"Variable[SparseCUDAByteType]", "torch.cuda.sparse.ByteTensor"},
           {"Variable[SparseCUDACharType]", "torch.cuda.sparse.CharTensor"},
           {"Variable[SparseCUDADoubleType]", "torch.cuda.sparse.DoubleTensor"},
@@ -249,7 +249,7 @@ PyWarningHandler::~PyWarningHandler() noexcept(false) {
   c10::WarningUtils::set_warning_handler(prev_handler_);
   auto& warning_buffer = internal_handler_.warning_buffer_;
 
-  if (warning_buffer.size() > 0) {
+  if (!warning_buffer.empty()) {
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     PyObject *type, *value, *traceback;
     pybind11::gil_scoped_acquire gil;
@@ -281,16 +281,13 @@ PyWarningHandler::~PyWarningHandler() noexcept(false) {
       } else {
         // Lets Python set the source location and puts the C++ warning
         // location into the message.
-        fmt::memory_buffer buf;
-        fmt::format_to(
-            buf,
-            FMT_STRING("{} (Triggered internally at {}:{}.)"),
+        auto buf = fmt::format(
+            "{} (Triggered internally at {}:{}.)",
             msg,
             source_location.file,
             source_location.line);
-        buf.push_back('\0');
         result =
-            PyErr_WarnEx(map_warning_to_python_type(warning), buf.data(), 1);
+            PyErr_WarnEx(map_warning_to_python_type(warning), buf.c_str(), 1);
       }
       if (result < 0) {
         if (in_exception_) {
diff --git a/torch/csrc/Exceptions.h b/torch/csrc/Exceptions.h
index 929b5a69c2d1..b9042658ebb8 100644
--- a/torch/csrc/Exceptions.h
+++ b/torch/csrc/Exceptions.h
@@ -68,6 +68,10 @@ static inline void PyErr_SetString(PyObject* type, const std::string& message) {
     e.restore();                                                        \
     retstmnt;                                                           \
   }                                                                     \
+  catch (py::error_already_set & e) {                                   \
+    e.restore();                                                        \
+    retstmnt;                                                           \
+  }                                                                     \
   _CATCH_GENERIC_ERROR(IndexError, PyExc_IndexError, retstmnt)          \
   _CATCH_GENERIC_ERROR(ValueError, PyExc_ValueError, retstmnt)          \
   _CATCH_GENERIC_ERROR(TypeError, PyExc_TypeError, retstmnt)            \
@@ -301,7 +305,7 @@ struct IndexError : public PyTorchError {
 // Translates to Python TypeError
 struct TypeError : public PyTorchError {
   using PyTorchError::PyTorchError;
-  TORCH_API TypeError(const char* format, ...) TORCH_FORMAT_FUNC(2, 3);
+  TORCH_PYTHON_API TypeError(const char* format, ...) TORCH_FORMAT_FUNC(2, 3);
   PyObject* python_type() override {
     return PyExc_TypeError;
   }
@@ -354,9 +358,9 @@ struct PyWarningHandler {
 
  public:
   /// See NOTE [ Conversion Cpp Python Warning ] for noexcept justification
-  TORCH_API PyWarningHandler() noexcept(true);
+  TORCH_PYTHON_API PyWarningHandler() noexcept(true);
   // NOLINTNEXTLINE(bugprone-exception-escape)
-  TORCH_API ~PyWarningHandler() noexcept(false);
+  TORCH_PYTHON_API ~PyWarningHandler() noexcept(false);
 
   /** Call if an exception has been thrown
 
@@ -379,14 +383,23 @@ template <typename Func, size_t i>
 using Arg = typename invoke_traits<Func>::template arg<i>::type;
 
 template <typename Func, size_t... Is>
-auto wrap_pybind_function_impl_(Func&& f, std::index_sequence<Is...>) {
+auto wrap_pybind_function_impl_(
+    Func&& f,
+    std::index_sequence<Is...>,
+    bool release_gil) {
   using result_type = typename invoke_traits<Func>::result_type;
   namespace py = pybind11;
 
   // f=f is needed to handle function references on older compilers
-  return [f = std::forward<Func>(f)](Arg<Func, Is>... args) -> result_type {
+  return [f = std::forward<Func>(f),
+          release_gil](Arg<Func, Is>... args) -> result_type {
     HANDLE_TH_ERRORS
-    return c10::guts::invoke(f, std::forward<Arg<Func, Is>>(args)...);
+    if (release_gil) {
+      py::gil_scoped_release no_gil;
+      return c10::guts::invoke(f, std::forward<Arg<Func, Is>>(args)...);
+    } else {
+      return c10::guts::invoke(f, std::forward<Arg<Func, Is>>(args)...);
+    }
     END_HANDLE_TH_ERRORS_PYBIND
   };
 }
@@ -398,7 +411,16 @@ template <typename Func>
 auto wrap_pybind_function(Func&& f) {
   using traits = invoke_traits<Func>;
   return torch::detail::wrap_pybind_function_impl_(
-      std::forward<Func>(f), std::make_index_sequence<traits::arity>{});
+      std::forward<Func>(f), std::make_index_sequence<traits::arity>{}, false);
+}
+
+// Wrap a function with TH error, warning handling and releases the GIL.
+// Returns a function object suitable for registering with pybind11.
+template <typename Func>
+auto wrap_pybind_function_no_gil(Func&& f) {
+  using traits = invoke_traits<Func>;
+  return torch::detail::wrap_pybind_function_impl_(
+      std::forward<Func>(f), std::make_index_sequence<traits::arity>{}, true);
 }
 
 } // namespace torch
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 1f4d9ac30161..37b1ede4b09f 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -27,6 +27,7 @@
 #include <cstdlib>
 #include <unordered_map>
 
+#include <ATen/ThreadLocalPythonObjects.h>
 #include <torch/csrc/DataLoader.h>
 #include <torch/csrc/Device.h>
 #include <torch/csrc/Dtype.h>
@@ -59,6 +60,7 @@
 #include <torch/csrc/jit/serialization/pickler.h>
 #include <torch/csrc/lazy/python/init.h>
 #include <torch/csrc/monitor/python_init.h>
+#include <torch/csrc/mps/Module.h>
 #include <torch/csrc/multiprocessing/init.h>
 #include <torch/csrc/onnx/init.h>
 #include <torch/csrc/profiler/python/init.h>
@@ -86,10 +88,6 @@
 #endif
 #endif
 
-#if defined(USE_MPS)
-#include <ATen/mps/MPSDevice.h>
-#endif
-
 #if defined(USE_VALGRIND)
 #include <callgrind.h>
 #endif
@@ -1246,11 +1244,7 @@ class WeakTensorRef {
   }
 };
 
-extern "C"
-#ifdef _WIN32
-    __declspec(dllexport)
-#endif
-        TORCH_API PyObject* initModule();
+extern "C" C10_EXPORT PyObject* initModule();
 // separate decl and defn for msvc error C2491
 PyObject* initModule() {
   HANDLE_TH_ERRORS
@@ -1270,6 +1264,7 @@ PyObject* initModule() {
   THPUtils_addPyMethodDefs(methods, DataLoaderMethods);
   THPUtils_addPyMethodDefs(methods, torch::autograd::python_functions());
   THPUtils_addPyMethodDefs(methods, torch::multiprocessing::python_functions());
+  THPUtils_addPyMethodDefs(methods, torch::mps::python_functions());
 #ifdef USE_CUDA
   THPUtils_addPyMethodDefs(methods, THCPModule_methods());
 #endif
@@ -1561,6 +1556,23 @@ Call this whenever a new thread is created in order to propagate values from
     return at::globalContext().linalgPreferredBackend();
   });
 
+  py_module.def("_stash_obj_in_tls", [](std::string key, py::handle arg) {
+    at::impl::ThreadLocalPythonObjects::get_state().set(
+        key,
+        std::make_shared<c10::SafePyObject>(arg.ptr(), getPyInterpreter()));
+  });
+
+  py_module.def("_get_obj_in_tls", [](std::string key) -> py::handle {
+    auto safe_pyobject =
+        at::impl::ThreadLocalPythonObjects::get_state().get(key);
+    auto obj = safe_pyobject->ptr(getPyInterpreter());
+    return py::handle(obj);
+  });
+
+  py_module.def("_is_key_in_tls", [](std::string key) -> bool {
+    return at::impl::ThreadLocalPythonObjects::get_state().contains(key);
+  });
+
 #ifdef USE_CUDA
   PyObject* has_cuda = Py_True;
 #else
@@ -1575,15 +1587,6 @@ Call this whenever a new thread is created in order to propagate values from
 
   ASSERT_TRUE(set_module_attr("has_cuda", has_cuda));
   ASSERT_TRUE(set_module_attr("has_mps", has_mps));
-  py_module.def("_is_mps_available", []() { return at::hasMPS(); });
-  py_module.def("_is_mps_on_macos_13_or_newer", []() {
-#ifdef USE_MPS
-    return at::mps::is_macos_13_or_newer();
-#else
-    return false;
-#endif
-  });
-
   ASSERT_TRUE(
       set_module_attr("has_mkldnn", at::hasMKLDNN() ? Py_True : Py_False));
 
diff --git a/torch/csrc/PyInterpreter.cpp b/torch/csrc/PyInterpreter.cpp
new file mode 100644
index 000000000000..a01b1d39eb9d
--- /dev/null
+++ b/torch/csrc/PyInterpreter.cpp
@@ -0,0 +1,808 @@
+#include <ATen/core/PythonFallbackKernel.h>
+#include <ATen/core/PythonOpRegistrationTrampoline.h>
+#include <torch/csrc/PyInterpreter.h>
+#include <torch/csrc/THP.h>
+#include <torch/csrc/autograd/generated/VariableType.h>
+#include <torch/csrc/utils/python_arg_parser.h>
+#include <torch/csrc/utils/python_dispatch.h>
+
+#include <string>
+
+using namespace torch;
+using namespace at;
+using namespace c10;
+
+namespace {
+
+// NB: This is a macro and not a template function (like it was before)
+// because passing in constexpr char* as template argument breaks some
+// versions of MSVC that are being used internally at Meta.
+// MSVC 14.16.27023 (vs2017_15.9)
+#define CONCRETE_TRACE_CUDA(func_name, ...)                           \
+  at::impl::MaybeSetTLSOnEntryGuard guard;                            \
+  if (Py_IsInitialized()) {                                           \
+    pybind11::gil_scoped_acquire gil;                                 \
+    try {                                                             \
+      py::module mod = py::module::import("torch.utils._cuda_trace"); \
+      py::object hook = mod.attr(func_name).attr("fire_callbacks");   \
+      hook(__VA_ARGS__);                                              \
+    } catch (const std::exception& e) {                               \
+      LOG(ERROR) << "CUDA trace hook execution failed: " << e.what(); \
+    }                                                                 \
+  }
+
+struct ConcretePyInterpreterVTable final
+    : public c10::impl::PyInterpreterVTable {
+  std::string name() const override;
+
+  void decref(PyObject* pyobj, bool is_tensor) const override;
+
+  // TODO: Need to make this work for StorageImpl too. I imagine I'll want to
+  // operate upon a PyObjectSlot rather than a TensorImpl
+  c10::intrusive_ptr<c10::TensorImpl> detach(
+      const c10::TensorImpl* self) const override;
+
+  void dispatch(const c10::OperatorHandle& op, torch::jit::Stack* stack)
+      const override;
+  void python_dispatcher(
+      const c10::OperatorHandle& op,
+      c10::DispatchKeySet,
+      torch::jit::Stack* stack) const override;
+  // NB: this is defined in python_dispatch.cpp
+  void python_op_registration_trampoline(
+      const c10::OperatorHandle& op,
+      c10::DispatchKey key,
+      torch::jit::Stack* stack) const override {
+    torch::impl::dispatch::python_op_registration_trampoline_impl(
+        op, key, stack);
+  }
+
+  bool is_contiguous(const c10::TensorImpl* self, at::MemoryFormat)
+      const override;
+  bool is_strides_like(const c10::TensorImpl* self, at::MemoryFormat)
+      const override;
+  bool is_non_overlapping_and_dense(const c10::TensorImpl* self) const override;
+  c10::Device device(const c10::TensorImpl* self) const override;
+  int64_t dim(const c10::TensorImpl* self) const override;
+  c10::IntArrayRef strides(const c10::TensorImpl* self) const override;
+  c10::IntArrayRef sizes(const c10::TensorImpl* self) const override;
+  c10::SymIntArrayRef sym_sizes(const c10::TensorImpl* self) const override;
+  c10::Layout layout(const c10::TensorImpl* self) const override;
+  c10::SymInt sym_numel(const c10::TensorImpl* self) const override;
+  c10::SymIntArrayRef sym_strides(const c10::TensorImpl* self) const override;
+  c10::SymInt sym_storage_offset(const c10::TensorImpl* self) const override;
+
+  void trace_gpu_event_creation(uintptr_t event) const override {
+    CONCRETE_TRACE_CUDA("CUDAEventCreationCallbacks", event);
+  }
+  void trace_gpu_event_deletion(uintptr_t event) const override {
+    CONCRETE_TRACE_CUDA("CUDAEventDeletionCallbacks", event);
+  }
+  void trace_gpu_event_record(uintptr_t event, uintptr_t stream)
+      const override {
+    CONCRETE_TRACE_CUDA("CUDAEventRecordCallbacks", event, stream);
+  }
+  void trace_gpu_event_wait(uintptr_t event, uintptr_t stream) const override {
+    CONCRETE_TRACE_CUDA("CUDAEventWaitCallbacks", event, stream);
+  }
+  void trace_gpu_memory_allocation(uintptr_t ptr) const override {
+    CONCRETE_TRACE_CUDA("CUDAMemoryAllocationCallbacks", ptr);
+  }
+  void trace_gpu_memory_deallocation(uintptr_t ptr) const override {
+    CONCRETE_TRACE_CUDA("CUDAMemoryDeallocationCallbacks", ptr);
+  }
+  void trace_gpu_stream_creation(uintptr_t stream) const override {
+    CONCRETE_TRACE_CUDA("CUDAStreamCreationCallbacks", stream);
+  }
+  void trace_gpu_device_synchronization() const override {
+    CONCRETE_TRACE_CUDA("CUDADeviceSynchronizationCallbacks");
+  }
+  void trace_gpu_stream_synchronization(uintptr_t stream) const override {
+    CONCRETE_TRACE_CUDA("CUDAStreamSynchronizationCallbacks", stream);
+  }
+  void trace_gpu_event_synchronization(uintptr_t event) const override {
+    CONCRETE_TRACE_CUDA("CUDAEventSynchronizationCallbacks", event);
+  }
+
+  void reset_backward_hooks(const c10::TensorImpl* self) const override;
+
+  static ConcretePyInterpreterVTable* instance() {
+    static ConcretePyInterpreterVTable s;
+    return &s;
+  }
+};
+
+class PyInterpreterHolder {
+ public:
+  PyInterpreterHolder()
+      : impl_(new c10::impl::PyInterpreter(
+            ConcretePyInterpreterVTable::instance())) {
+    is_main_interpreter_ =
+        at::impl::PythonOpRegistrationTrampoline::registerInterpreter(impl_);
+  }
+  // NB: intentionally leaks the PyInterpreter, as there may still be
+  // references to it that are live, living in objects that aren't being
+  // destructed while Python is being cleaned up.
+  ~PyInterpreterHolder() {
+    impl_->disarm();
+  }
+  c10::impl::PyInterpreter* get() const noexcept {
+    return impl_;
+  }
+  bool is_main_interpreter() const noexcept {
+    return is_main_interpreter_;
+  }
+
+ private:
+  c10::impl::PyInterpreter* impl_;
+  bool is_main_interpreter_;
+};
+
+py::object torchDispatchFromTensorImpl(
+    const c10::TensorImpl* self,
+    const char* func_name,
+    PyObject* torch_api_function,
+    const char* module_name,
+    // WARNING: MUST NOT BE TENSOR ARGS
+    c10::SmallVector<py::object, 1> extra_args = {}) {
+  if (torch_api_function == nullptr) {
+    throw python_error();
+  }
+  TORCH_CHECK(
+      PyGILState_Check(),
+      "GIL must be held before you call parseIValuesToPyArgsKwargs");
+
+  std::vector<py::handle> overloaded_args;
+  // TODO: there should be a shorter way to spell this
+  // TODO: fix the constness of target
+  at::Tensor self_t = at::Tensor(
+      c10::intrusive_ptr<c10::TensorImpl, c10::UndefinedTensorImpl>::
+          unsafe_reclaim_from_nonowning(const_cast<c10::TensorImpl*>(self)));
+  auto self_p =
+      py::reinterpret_steal<py::object>(THPVariable_Wrap(std::move(self_t)));
+  // NB: this may not be a python tensor if you got here from a mode!
+  // TORCH_INTERNAL_ASSERT(isPythonTensor(self_t));
+  append_overloaded_tensor(&overloaded_args, self_p.ptr());
+  auto args =
+      py::reinterpret_steal<py::object>(PyTuple_New(1 + extra_args.size()));
+  PyTuple_SET_ITEM(args.ptr(), 0, self_p.release().ptr());
+  int64_t i = 1;
+  for (auto& a : extra_args) {
+    if (a.ptr() == nullptr)
+      throw python_error();
+    PyTuple_SET_ITEM(args.ptr(), i, std::move(a).release().ptr());
+    i++;
+  }
+
+  py::dict kwargs;
+
+  return py::reinterpret_steal<py::object>(
+      handle_torch_function_no_python_arg_parser(
+          overloaded_args,
+          args.ptr(),
+          kwargs.ptr(),
+          func_name,
+          torch_api_function,
+          module_name,
+          TorchFunctionName::TorchDispatch));
+}
+
+// NOTE [PyInterpreter::decref takes an `is_tensor` arg]
+// Before calling PyInterpreter::decref, we must statically know if the
+// pyobj is a Tensor or not.
+// - If it is a tensor, we need to be careful about PyObject resurrection
+// - If it is not a tensor, we can freely decref
+// One alternative to this is using PyObject_IsInstance
+// to get at this information. However, we don't want to risk an incorrect
+// `__instancecheck__` changing the semantics here.
+void ConcretePyInterpreterVTable::decref(PyObject* pyobj, bool is_tensor)
+    const {
+  // Leak the pyobj if not initialized.  This can happen if we are running
+  // exit handlers that are destructing tensors with residual (owned)
+  // PyObjects stored in them.
+  if (!Py_IsInitialized())
+    return;
+
+  pybind11::gil_scoped_acquire gil;
+  // Two possibilities:
+  // 1. We are decref-ing a tensor. Then we must be careful about
+  // PyObject resurrection (this only applies to Tensors, see
+  // THPVariable_clear).
+  // 2. We are decref-ing some other Python object. We don't do
+  // PyObject resurrection on non-Tensors, so we just carry on as usual
+  if (is_tensor && Py_REFCNT(pyobj) > 1) {
+    // It's still alive!  This can happen if a weak ref resurrected
+    // the PyObject without flipping ownership.  At this point it is
+    // too late to rescue the object, so just stub out the PyObject
+    // so that it fails on subsequent uses.  Don't raise an error here;
+    // you're probably in a destructor.
+    TORCH_WARN(
+        "Deallocating Tensor that still has live PyObject references.  "
+        "This probably happened because you took out a weak reference to "
+        "Tensor and didn't call _fix_weakref() after dereferencing it.  "
+        "Subsequent accesses to this tensor via the PyObject will now fail.");
+    ((THPVariable*)pyobj)->cdata = c10::MaybeOwned<torch::autograd::Variable>();
+  }
+  Py_DECREF(pyobj);
+};
+
+py::handle getTorchApiFunction(const c10::OperatorHandle& op) {
+  return op.getPythonOp(getPyInterpreter(), [&]() -> PyObject* {
+    // Parse the name into namespace and name (no overload_name)
+    // TODO: put this into the library
+    const auto& schema = op.schema();
+    const auto& qualified_name = op.operator_name().name;
+    const auto& overload_name = schema.overload_name();
+    auto pos = qualified_name.find("::");
+    TORCH_INTERNAL_ASSERT(pos != std::string::npos, qualified_name);
+    // Make me some null terminated strings
+    std::string ns_str = qualified_name.substr(0, pos);
+    const char* ns = ns_str.c_str();
+    const char* func_name = qualified_name.c_str() + pos + strlen("::");
+
+    py::handle torch_api_function =
+        py::module::import("torch").attr("ops").attr(ns).attr(func_name);
+    if (overload_name.empty()) {
+      return torch_api_function.attr("default").ptr();
+    } else {
+      return torch_api_function.attr(overload_name.c_str()).ptr();
+    }
+  });
+}
+
+bool isPythonTensor(const at::Tensor& tensor) {
+  return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Python);
+}
+
+void ConcretePyInterpreterVTable::dispatch(
+    const c10::OperatorHandle& op,
+    torch::jit::Stack* stack) const {
+  const auto& schema = op.schema();
+  const auto num_arguments = schema.arguments().size();
+  auto arguments = torch::jit::pop(*stack, num_arguments);
+
+  // The plan: convert all the arguments back into PyObjects,
+  // extracting out the tensor handles, then call
+  // handle_torch_function_no_python_arg_parser
+  // NB: at the point arguments are pushed to the stack, ALL defaults
+  // are already present
+
+  py::gil_scoped_acquire g;
+
+  std::vector<py::handle> overloaded_args;
+  py::handle torch_api_function_overload = getTorchApiFunction(op);
+
+  // Find overloaded tensors
+  for (const auto idx : c10::irange(arguments.size())) {
+    const auto& ivalue = arguments[idx];
+    if (ivalue.isTensor()) {
+      const auto& tensor = ivalue.toTensor();
+      if (isPythonTensor(tensor)) {
+        append_overloaded_tensor(&overloaded_args, py::cast(tensor).ptr());
+      }
+    } else if (ivalue.isList()) {
+      const auto& list = ivalue.toListRef();
+      for (const auto jdx : c10::irange(list.size())) {
+        const auto& nv = list[jdx];
+        if (nv.isTensor()) {
+          const auto& tensor = nv.toTensor();
+          if (isPythonTensor(tensor)) {
+            append_overloaded_tensor(&overloaded_args, py::cast(tensor).ptr());
+          }
+        }
+      }
+    }
+  }
+
+  auto args_kwargs = parseIValuesToPyArgsKwargs(op, arguments);
+  auto args = std::move(args_kwargs.first);
+  auto kwargs = std::move(args_kwargs.second);
+
+  PyObject* obj = handle_torch_function_no_python_arg_parser(
+      overloaded_args,
+      args.ptr(),
+      kwargs.ptr(),
+      nullptr,
+      torch_api_function_overload.ptr(),
+      nullptr,
+      TorchFunctionName::TorchDispatch);
+  pushPyOutToStack(
+      op, stack, py::reinterpret_steal<py::object>(obj), "__torch_dispatch__");
+}
+
+void ConcretePyInterpreterVTable::python_dispatcher(
+    const c10::OperatorHandle& op,
+    c10::DispatchKeySet ks,
+    torch::jit::Stack* stack) const {
+  py::gil_scoped_acquire g;
+  py::handle torch_api_function_overload = getTorchApiFunction(op);
+  // TODO: if necessary, can optimize to cache the cache lookup
+  // TODO: if necessary, can optimize OpOverload to have slots
+  auto cache = py::dict(torch_api_function_overload.attr("_dispatch_cache"));
+  if (cache.ptr() == nullptr) {
+    throw python_error();
+  }
+
+  c10::DispatchKey k = ks.highestPriorityTypeId();
+  // TODO: allow this to be non-owning
+  auto handler = py::reinterpret_borrow<py::object>(
+      PyDict_GetItem(cache.ptr(), py::cast(k).ptr()));
+  if (handler.ptr() == nullptr) {
+    // Slow path
+    handler = torch_api_function_overload.attr("_get_dispatch")(k);
+  }
+  if (py::isinstance<c10::DispatchKey>(handler)) {
+    // NB: not redispatch, as that will permanently remove the python
+    // dispatcher for subsequent redispatches
+    op.callBoxedForDispatchKey(py::cast<c10::DispatchKey>(handler), *stack);
+    return;
+  }
+
+  const auto& schema = op.schema();
+  const auto num_arguments = schema.arguments().size();
+  auto arguments = torch::jit::pop(*stack, num_arguments);
+
+  auto args_kwargs = parseIValuesToPyArgsKwargs(op, arguments);
+  auto args = std::move(args_kwargs.first);
+  auto kwargs = std::move(args_kwargs.second);
+
+  py::object obj = py::reinterpret_steal<py::object>(
+      PyObject_Call(handler.ptr(), args.ptr(), kwargs.ptr()));
+
+  if (obj.ptr() == nullptr) {
+    throw python_error();
+  }
+
+  pushPyOutToStack(op, stack, std::move(obj), "Python dispatcher");
+}
+
+c10::intrusive_ptr<c10::TensorImpl> ConcretePyInterpreterVTable::detach(
+    const c10::TensorImpl* self) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+
+  auto out = torchDispatchFromTensorImpl(
+      self,
+      "detach",
+      py::module::import("torch")
+          .attr("ops")
+          .attr("aten")
+          .attr("detach")
+          .attr("default")
+          .ptr(),
+      "torch.ops.aten");
+
+  TORCH_CHECK(
+      THPVariable_Check(out.ptr()),
+      "detach returned invalid type ",
+      py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())),
+      ", expected Tensor");
+  const at::Tensor& res_t = THPVariable_Unpack(out.ptr());
+  return res_t.getIntrusivePtr();
+}
+
+bool ConcretePyInterpreterVTable::is_contiguous(
+    const c10::TensorImpl* self,
+    at::MemoryFormat memory_format) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+
+  py::object out;
+  if (memory_format == at::MemoryFormat::Contiguous) {
+    // For backwards compatibility
+    out = torchDispatchFromTensorImpl(
+        self,
+        "is_contiguous",
+        py::module::import("torch")
+            .attr("ops")
+            .attr("aten")
+            .attr("is_contiguous")
+            .attr("default")
+            .ptr(),
+        "torch.ops.aten");
+  } else {
+    out = torchDispatchFromTensorImpl(
+        self,
+        "is_contiguous",
+        py::module::import("torch")
+            .attr("ops")
+            .attr("aten")
+            .attr("is_contiguous")
+            .attr("memory_format")
+            .ptr(),
+        "torch.ops.aten",
+        {py::cast(memory_format)});
+  }
+
+  if (out.is_none()) {
+    return self->is_contiguous_default(memory_format);
+  }
+
+  TORCH_CHECK(
+      PyBool_Check(out.ptr()),
+      "is_contiguous returned invalid type ",
+      py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())),
+      ", expected bool");
+
+  return PyObject_IsTrue(out.ptr());
+}
+
+bool ConcretePyInterpreterVTable::is_strides_like(
+    const c10::TensorImpl* self,
+    at::MemoryFormat memory_format) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+
+  auto out = torchDispatchFromTensorImpl(
+      self,
+      "is_strides_like",
+      py::module::import("torch")
+          .attr("ops")
+          .attr("aten")
+          // NB: intentionally suffixed with _format to avoid
+          // triggering matches against "_like" suffix
+          .attr("is_strides_like_format")
+          .attr("default")
+          .ptr(),
+      "torch.ops.aten",
+      {py::cast(memory_format)});
+
+  if (out.is_none()) {
+    return self->is_strides_like_default(memory_format);
+  }
+
+  TORCH_CHECK(
+      PyBool_Check(out.ptr()),
+      "is_strides_like_format returned invalid type ",
+      py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())),
+      ", expected bool");
+
+  return PyObject_IsTrue(out.ptr());
+}
+
+bool ConcretePyInterpreterVTable::is_non_overlapping_and_dense(
+    const c10::TensorImpl* self) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+
+  auto out = torchDispatchFromTensorImpl(
+      self,
+      "is_non_overlapping_and_dense",
+      py::module::import("torch")
+          .attr("ops")
+          .attr("aten")
+          .attr("is_non_overlapping_and_dense")
+          .attr("default")
+          .ptr(),
+      "torch.ops.aten");
+
+  if (out.is_none()) {
+    return self->is_non_overlapping_and_dense_default();
+  }
+
+  TORCH_CHECK(
+      PyBool_Check(out.ptr()),
+      "is_non_overlapping_and_dense returned invalid type ",
+      py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())),
+      ", expected bool");
+
+  return PyObject_IsTrue(out.ptr());
+}
+
+int64_t ConcretePyInterpreterVTable::dim(const c10::TensorImpl* self) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+
+  auto out = torchDispatchFromTensorImpl(
+      self,
+      "dim",
+      py::module::import("torch")
+          .attr("ops")
+          .attr("aten")
+          .attr("dim")
+          .attr("default")
+          .ptr(),
+      "torch.ops.aten");
+
+  TORCH_CHECK(
+      PyLong_Check(out.ptr()),
+      "dim returned invalid type ",
+      py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())),
+      ", expected int");
+
+  return THPUtils_unpackLong(out.ptr());
+}
+
+c10::Device ConcretePyInterpreterVTable::device(
+    const c10::TensorImpl* self) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+
+  auto out = torchDispatchFromTensorImpl(
+      self,
+      "device",
+      py::module::import("torch")
+          .attr("ops")
+          .attr("prim")
+          .attr("device")
+          .attr("default")
+          .ptr(),
+      "torch.ops.prim");
+
+  return toDevice(out.ptr());
+}
+
+c10::IntArrayRef ConcretePyInterpreterVTable::strides(
+    const c10::TensorImpl* self) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+
+  auto out = torchDispatchFromTensorImpl(
+      self,
+      "stride",
+      py::module::import("torch")
+          .attr("ops")
+          .attr("aten")
+          .attr("stride")
+          .attr("default")
+          .ptr(),
+      "torch.ops.aten");
+
+  if (out.is_none()) {
+    TORCH_CHECK(
+        !self->has_symbolic_sizes_strides(),
+        "Cannot call strides on a tensor with symbolic shapes/strides");
+    return self->strides_default();
+  }
+
+  py::object values = py::reinterpret_steal<py::object>(out.ptr());
+
+  c10::optional<PyObject*> mb_obj =
+      self->pyobj_slot()->check_pyobj(getPyInterpreter());
+  TORCH_CHECK(
+      mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value");
+  PyObject* subclass = *mb_obj;
+  Py_INCREF(subclass);
+  py::object sub = py::reinterpret_steal<py::object>(subclass);
+
+  py::object os = py::module_::import("torch").attr("overrides");
+  py::function get_buffer =
+      py::reinterpret_borrow<py::function>(os.attr("get_buffer"));
+  auto buffer = get_buffer(sub, values, "stride");
+  auto result = THPUtils_unpackLongs(buffer.ptr());
+  int64_t* start = (int64_t*)result[0];
+  int64_t len = result[1];
+
+  return c10::IntArrayRef(start, len);
+}
+
+static std::vector<int64_t> values_from_buffer(
+    const c10::TensorImpl* self,
+    py::handle values) {
+  c10::TensorImpl* ptr = const_cast<c10::TensorImpl*>(self);
+  c10::optional<PyObject*> mb_obj =
+      ptr->pyobj_slot()->check_pyobj(getPyInterpreter());
+  TORCH_CHECK(
+      mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value");
+
+  py::object os = py::module_::import("torch").attr("overrides");
+  py::function get_buffer =
+      py::reinterpret_borrow<py::function>(os.attr("get_buffer"));
+  auto buffer = get_buffer(py::handle(*mb_obj), values, "size");
+  auto result = THPUtils_unpackLongs(buffer.ptr());
+  return result;
+}
+
+c10::IntArrayRef ConcretePyInterpreterVTable::sizes(
+    const c10::TensorImpl* self) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+
+  auto out = torchDispatchFromTensorImpl(
+      self,
+      "size",
+      py::module::import("torch")
+          .attr("ops")
+          .attr("aten")
+          .attr("size")
+          .attr("default")
+          .ptr(),
+      "torch.ops.aten");
+
+  if (out.is_none()) {
+    TORCH_CHECK(
+        !self->has_symbolic_sizes_strides(),
+        "Cannot call sizes on a tensor with symbolic shapes/strides");
+    return self->sizes_default();
+  }
+
+  py::object values = py::reinterpret_steal<py::object>(out.ptr());
+  auto result = values_from_buffer(self, values);
+  int64_t* start = (int64_t*)result[0];
+  int64_t len = result[1];
+
+  return c10::IntArrayRef(start, len);
+}
+
+c10::SymIntArrayRef ConcretePyInterpreterVTable::sym_sizes(
+    const c10::TensorImpl* self) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+  HANDLE_TH_ERRORS
+  auto out = torchDispatchFromTensorImpl(
+      self,
+      "sym_size",
+      py::module::import("torch")
+          .attr("ops")
+          .attr("aten")
+          .attr("sym_size")
+          .attr("default")
+          .ptr(),
+      "torch.ops.aten");
+
+  if (out.is_none()) {
+    return self->sym_sizes_default();
+  }
+  // We need to squeeze SymIntNodes and ints into `SymInts`
+  // since it's a format `sym_sizes()` are stored in
+  TORCH_CHECK(
+      py::isinstance<py::tuple>(out) || py::isinstance<py::list>(out),
+      "Symshape must be a list or a tuple");
+  py::list symints;
+  for (auto it = out.begin(); it != out.end(); it++) {
+    auto elm = *it;
+    auto si = py::cast<c10::SymInt>(elm);
+    // TODO: the buffer will need to be made owning later
+    symints.append(si.as_int_unchecked());
+  }
+
+  auto result = values_from_buffer(self, symints);
+  c10::SymInt* start = (c10::SymInt*)result[0];
+  int64_t len = result[1];
+
+  return c10::SymIntArrayRef(start, len);
+  END_HANDLE_TH_ERRORS_PYBIND
+}
+
+c10::Layout ConcretePyInterpreterVTable::layout(
+    const c10::TensorImpl* self) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+  auto out = torchDispatchFromTensorImpl(
+      self,
+      "layout",
+      py::module::import("torch")
+          .attr("ops")
+          .attr("prim")
+          .attr("layout")
+          .attr("default")
+          .ptr(),
+      "torch.ops.prim");
+
+  TORCH_CHECK(
+      THPLayout_Check(out.ptr()),
+      "layout returned invalid type ",
+      py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())),
+      ", expected Layout");
+
+  return toLayout(out.ptr());
+}
+
+c10::SymInt ConcretePyInterpreterVTable::sym_numel(
+    const c10::TensorImpl* self) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+  auto out = torchDispatchFromTensorImpl(
+      self,
+      "sym_numel",
+      py::module::import("torch")
+          .attr("ops")
+          .attr("aten")
+          .attr("sym_numel")
+          .attr("default")
+          .ptr(),
+      "torch.ops.aten");
+
+  if (out.is_none()) {
+    TORCH_CHECK(
+        !self->has_symbolic_sizes_strides(),
+        "Cannot call numel on a tensor with symbolic shapes/strides");
+    return self->sym_numel_default();
+  }
+  return torch::is_symint(out) ? out.cast<c10::SymInt>()
+                               : c10::SymInt{py::cast<int64_t>(out)};
+}
+
+c10::SymInt ConcretePyInterpreterVTable::sym_storage_offset(
+    const c10::TensorImpl* self) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+  auto out = torchDispatchFromTensorImpl(
+      self,
+      "sym_storage_offset",
+      py::module::import("torch")
+          .attr("ops")
+          .attr("aten")
+          .attr("sym_storage_offset")
+          .attr("default")
+          .ptr(),
+      "torch.ops.aten");
+
+  if (out.is_none()) {
+    return self->sym_storage_offset_default();
+  }
+  return torch::is_symint(out) ? out.cast<c10::SymInt>()
+                               : c10::SymInt{py::cast<int64_t>(out)};
+}
+
+c10::SymIntArrayRef ConcretePyInterpreterVTable::sym_strides(
+    const c10::TensorImpl* self) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+  HANDLE_TH_ERRORS
+  auto out = torchDispatchFromTensorImpl(
+      self,
+      "sym_stride",
+      py::module::import("torch")
+          .attr("ops")
+          .attr("aten")
+          .attr("sym_stride")
+          .attr("default")
+          .ptr(),
+      "torch.ops.aten");
+
+  if (out.is_none()) {
+    return self->sym_strides_default();
+  }
+  // We need to squeeze SymIntNodes and ints into `SymInts`
+  // since it's a format `sym_strides()` are stored in
+  TORCH_CHECK(
+      py::isinstance<py::tuple>(out) || py::isinstance<py::list>(out),
+      "Symshape must be a list or a tuple");
+  py::list symints;
+  for (auto it = out.begin(); it != out.end(); it++) {
+    auto elm = *it;
+    auto si = torch::is_symint(elm) ? elm.cast<c10::SymInt>()
+                                    : c10::SymInt{py::cast<int64_t>(elm)};
+    symints.append(si.as_int_unchecked());
+  }
+
+  auto result = values_from_buffer(self, symints);
+  c10::SymInt* start = (c10::SymInt*)result[0];
+  int64_t len = result[1];
+
+  return c10::SymIntArrayRef(start, len);
+  END_HANDLE_TH_ERRORS_PYBIND
+}
+
+PyInterpreterHolder self_interpreter;
+
+void ConcretePyInterpreterVTable::reset_backward_hooks(
+    const c10::TensorImpl* self) const {
+  pybind11::gil_scoped_acquire gil;
+  at::impl::MaybeSetTLSOnEntryGuard guard;
+  HANDLE_TH_ERRORS
+  Tensor self_t = Tensor(
+      c10::intrusive_ptr<c10::TensorImpl, c10::UndefinedTensorImpl>::
+          unsafe_reclaim_from_nonowning(const_cast<c10::TensorImpl*>(self)));
+  auto self_p =
+      py::reinterpret_steal<py::object>(THPVariable_Wrap(std::move(self_t)));
+  PyObject_SetAttrString(self_p.ptr(), "_backward_hooks", Py_None);
+  END_HANDLE_TH_ERRORS_PYBIND
+}
+
+} // anonymous namespace
+
+c10::impl::PyInterpreter* getPyInterpreter() {
+  return self_interpreter.get();
+}
+
+bool isMainPyInterpreter() {
+  return self_interpreter.is_main_interpreter();
+}
+
+std::string ConcretePyInterpreterVTable::name() const {
+  std::stringstream ss;
+  ss << getPyInterpreter();
+  return ss.str();
+}
diff --git a/torch/csrc/PyInterpreter.h b/torch/csrc/PyInterpreter.h
new file mode 100644
index 000000000000..30809ff10be9
--- /dev/null
+++ b/torch/csrc/PyInterpreter.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#include <c10/core/impl/PyInterpreter.h>
+#include <torch/csrc/Export.h>
+
+TORCH_PYTHON_API c10::impl::PyInterpreter* getPyInterpreter();
+TORCH_PYTHON_API bool isMainPyInterpreter();
diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp
index 5cbf64ff474c..e998198cdf73 100644
--- a/torch/csrc/Storage.cpp
+++ b/torch/csrc/Storage.cpp
@@ -39,11 +39,18 @@ PyObject* THPStorage_New(c10::intrusive_ptr<c10::StorageImpl> ptr) {
   return obj;
 }
 
-static void THPStorage_dealloc(THPStorage* self) {
-  if (self->cdata) {
-    c10::raw::intrusive_ptr::decref(self->cdata);
+static void THPStorage_subclass_dealloc(PyObject* self) {
+  THPStorage* _self = (THPStorage*)self;
+  // Some subclass of StorageBase are GC-tracked objects even
+  // though the base class is not.
+  auto* type = Py_TYPE(self);
+  if (PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC) != 0) {
+    PyObject_GC_UnTrack(self);
   }
-  Py_TYPE(self)->tp_free((PyObject*)self);
+  if (_self->cdata) {
+    c10::raw::intrusive_ptr::decref(_self->cdata);
+  }
+  Py_TYPE(_self)->tp_free(self);
 }
 
 static PyObject* THPStorage_pynew(
@@ -51,7 +58,9 @@ static PyObject* THPStorage_pynew(
     PyObject* args,
     PyObject* kwargs) {
   HANDLE_TH_ERRORS
-
+  TORCH_CHECK(
+      type != &THPStorageType,
+      "Cannot directly construct StorageBase; subclass it and then construct that");
   static torch::PythonArgParser parser({
       THPStorageStr "(*, int64_t allocator=None, Device device=None)",
       THPStorageStr
@@ -308,14 +317,62 @@ static PyMappingMethods THPStorage_mappingmethods = {
     (binaryfunc)THPStorage_get,
     (objobjargproc)THPStorage_set};
 
+struct THPStorageMeta {
+  PyHeapTypeObject base;
+};
+
+int THPStorageMetaType_init(PyObject* cls, PyObject* args, PyObject* kwargs);
+
+PyTypeObject THPStorageMetaType = {
+    PyVarObject_HEAD_INIT(
+        DEFERRED_ADDRESS(&PyType_Type),
+        0) "torch._C._StorageMeta", /* tp_name */
+    sizeof(THPStorageMeta), /* tp_basicsize */
+    0, /* tp_itemsize */
+    nullptr, /* tp_dealloc */
+    0, /* tp_vectorcall_offset */
+    nullptr, /* tp_getattr */
+    nullptr, /* tp_setattr */
+    nullptr, /* tp_reserved */
+    nullptr, /* tp_repr */
+    nullptr, /* tp_as_number */
+    nullptr, /* tp_as_sequence */
+    nullptr, /* tp_as_mapping */
+    nullptr, /* tp_hash  */
+    nullptr, /* tp_call */
+    nullptr, /* tp_str */
+    nullptr, /* tp_getattro */
+    nullptr, /* tp_setattro */
+    nullptr, /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+    nullptr, /* tp_doc */
+    nullptr, /* tp_traverse */
+    nullptr, /* tp_clear */
+    nullptr, /* tp_richcompare */
+    0, /* tp_weaklistoffset */
+    nullptr, /* tp_iter */
+    nullptr, /* tp_iternext */
+    nullptr, /* tp_methods */
+    nullptr, /* tp_members */
+    nullptr, /* tp_getset */
+    DEFERRED_ADDRESS(&PyType_Type), /* tp_base */
+    nullptr, /* tp_dict */
+    nullptr, /* tp_descr_get */
+    nullptr, /* tp_descr_set */
+    0, /* tp_dictoffset */
+    THPStorageMetaType_init, /* tp_init */
+    nullptr, /* tp_alloc */
+    nullptr, /* tp_new */
+};
+
 // TODO: implement equality
 PyTypeObject THPStorageType = {
     PyVarObject_HEAD_INIT(
-        nullptr,
-        0) "torch._C." THPStorageBaseStr, /* tp_name */
+        &THPStorageMetaType,
+        0) "torch._C.StorageBase", /* tp_name */
     sizeof(THPStorage), /* tp_basicsize */
     0, /* tp_itemsize */
-    (destructor)THPStorage_dealloc, /* tp_dealloc */
+    nullptr, /* tp_dealloc */
     0, /* tp_vectorcall_offset */
     nullptr, /* tp_getattr */
     nullptr, /* tp_setattr */
@@ -353,6 +410,14 @@ PyTypeObject THPStorageType = {
     THPStorage_pynew, /* tp_new */
 };
 
+int THPStorageMetaType_init(PyObject* cls, PyObject* args, PyObject* kwargs) {
+  if (PyType_Type.tp_init(cls, args, kwargs) < 0) {
+    return -1;
+  }
+  ((PyTypeObject*)cls)->tp_dealloc = (destructor)THPStorage_subclass_dealloc;
+  return 0;
+}
+
 // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-non-const-global-variables)
 static struct PyMemberDef THPStorage_members[] = {
     {(char*)"_cdata",
@@ -380,13 +445,19 @@ bool THPStorage_init(PyObject* module) {
   THPUtils_addPyMethodDefs(methods, THPStorage_getMethods());
   THPUtils_addPyMethodDefs(methods, THPStorage_getSharingMethods());
 
+  THPStorageMetaType.tp_base = &PyType_Type;
+  if (PyType_Ready(&THPStorageMetaType) < 0)
+    return false;
+  Py_INCREF(&THPStorageMetaType);
+  PyModule_AddObject(module, "_StorageMeta", (PyObject*)&THPStorageMetaType);
+
   THPStorageType.tp_methods = methods.data();
   THPStorageType.tp_members = THPStorage_members;
   THPStorageType.tp_getset = THPStorage_properties;
   if (PyType_Ready(&THPStorageType) < 0)
     return false;
   Py_INCREF(&THPStorageType);
-  PyModule_AddObject(module, THPStorageBaseStr, (PyObject*)&THPStorageType);
+  PyModule_AddObject(module, "StorageBase", (PyObject*)&THPStorageType);
   return true;
 }
 
diff --git a/torch/csrc/Storage.h b/torch/csrc/Storage.h
index 827caea2a62f..645249b8bbdc 100644
--- a/torch/csrc/Storage.h
+++ b/torch/csrc/Storage.h
@@ -4,7 +4,6 @@
 #include <torch/csrc/Types.h>
 
 #define THPStorageStr "torch.UntypedStorage"
-#define THPStorageBaseStr "StorageBase"
 
 struct THPStorage {
   PyObject_HEAD c10::StorageImpl* cdata;
diff --git a/torch/csrc/StorageMethods.cpp b/torch/csrc/StorageMethods.cpp
index 29f0f67ce6ec..410b044ba283 100644
--- a/torch/csrc/StorageMethods.cpp
+++ b/torch/csrc/StorageMethods.cpp
@@ -21,6 +21,7 @@
 
 #include <ATen/ATen.h>
 #include <ATen/MapAllocator.h>
+#include <ATen/StorageUtils.h>
 #include <torch/csrc/utils/pycfunction_helpers.h>
 #include <torch/csrc/utils/python_arg_parser.h>
 #include <torch/csrc/utils/python_numbers.h>
@@ -71,7 +72,7 @@ static PyObject* THPStorage_copy_(
 
   TORCH_CHECK(self_.nbytes() == src.nbytes(), "size does not match");
 
-  storage_copy(self_, src, non_blocking);
+  at::storage_copy(self_, src, non_blocking);
 
   Py_INCREF(self);
   return self;
@@ -173,18 +174,16 @@ static PyObject* THPStorage_fromBuffer(
   PyObject* dtype_obj = nullptr;
   c10::ScalarType scalar_type = at::kByte;
   Py_buffer buffer = {};
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,clang-diagnostic-writable-strings)
-  static char* kwlist[] = {
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
+  constexpr char* kwlist[] = {
       "buffer", "byte_order", "count", "offset", "dtype", nullptr};
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  const char* argtypes;
-  argtypes = "O|snnO";
+  constexpr char* argtypes = "O|snnO";
 
   if (!PyArg_ParseTupleAndKeywords(
           args,
           keywds,
           argtypes,
-          kwlist,
+          const_cast<char**>(kwlist),
           &obj,
           &byte_order_str,
           &count,
@@ -206,14 +205,17 @@ static PyObject* THPStorage_fromBuffer(
   size_t element_size = c10::elementSize(scalar_type);
 
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  torch::utils::THPByteOrder byte_order;
+  bool do_byte_swap;
   if (scalar_type != at::kByte && scalar_type != at::kChar) {
     if (strcmp(byte_order_str, "native") == 0) {
-      byte_order = torch::utils::THP_nativeByteOrder();
+      do_byte_swap = false;
     } else if (strcmp(byte_order_str, "big") == 0) {
-      byte_order = torch::utils::THP_BIG_ENDIAN;
+      do_byte_swap =
+          (torch::utils::THP_LITTLE_ENDIAN ==
+           torch::utils::THP_nativeByteOrder());
     } else if (strcmp(byte_order_str, "little") == 0) {
-      byte_order = torch::utils::THP_LITTLE_ENDIAN;
+      do_byte_swap =
+          (torch::utils::THP_BIG_ENDIAN == torch::utils::THP_nativeByteOrder());
     } else {
       PyErr_Format(
           PyExc_ValueError,
@@ -282,34 +284,40 @@ static PyObject* THPStorage_fromBuffer(
     // we are trying to get a value which is not 0 or 1, we have to manually
     // convert original values to boolean ones.
     torch::utils::THP_decodeBoolBuffer(
-        storage->data<bool>(), src + offset, byte_order, count);
+        storage->data<bool>(), src + offset, do_byte_swap, count);
   } else if (scalar_type == at::kShort) {
     torch::utils::THP_decodeInt16Buffer(
-        storage->data<int16_t>(), src + offset, byte_order, count);
+        storage->data<int16_t>(), src + offset, do_byte_swap, count);
   } else if (scalar_type == at::kInt) {
     torch::utils::THP_decodeInt32Buffer(
-        storage->data<int32_t>(), src + offset, byte_order, count);
+        storage->data<int32_t>(), src + offset, do_byte_swap, count);
   } else if (scalar_type == at::kLong) {
     torch::utils::THP_decodeInt64Buffer(
-        storage->data<int64_t>(), src + offset, byte_order, count);
+        storage->data<int64_t>(), src + offset, do_byte_swap, count);
   } else if (scalar_type == at::kHalf) {
     torch::utils::THP_decodeHalfBuffer(
-        storage->data<c10::Half>(), src + offset, byte_order, count);
+        storage->data<c10::Half>(), src + offset, do_byte_swap, count);
   } else if (scalar_type == at::kBFloat16) {
     torch::utils::THP_decodeBFloat16Buffer(
-        storage->data<c10::BFloat16>(), src + offset, byte_order, count);
+        storage->data<c10::BFloat16>(), src + offset, do_byte_swap, count);
   } else if (scalar_type == at::kFloat) {
     torch::utils::THP_decodeFloatBuffer(
-        storage->data<float>(), src + offset, byte_order, count);
+        storage->data<float>(), src + offset, do_byte_swap, count);
   } else if (scalar_type == at::kDouble) {
     torch::utils::THP_decodeDoubleBuffer(
-        storage->data<double>(), src + offset, byte_order, count);
+        storage->data<double>(), src + offset, do_byte_swap, count);
   } else if (scalar_type == at::kComplexFloat) {
     torch::utils::THP_decodeComplexFloatBuffer(
-        storage->data<c10::complex<float>>(), src + offset, byte_order, count);
+        storage->data<c10::complex<float>>(),
+        src + offset,
+        do_byte_swap,
+        count);
   } else if (scalar_type == at::kComplexDouble) {
     torch::utils::THP_decodeComplexDoubleBuffer(
-        storage->data<c10::complex<double>>(), src + offset, byte_order, count);
+        storage->data<c10::complex<double>>(),
+        src + offset,
+        do_byte_swap,
+        count);
   } else {
     TORCH_CHECK(false, "Unknown type: ", scalar_type);
   }
@@ -328,10 +336,16 @@ static PyObject* THPStorage_fromFile(
   const char* filename;
   Py_ssize_t nbytes = 0;
   int shared = 0;
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,clang-diagnostic-writable-strings)
-  static char* kwlist[] = {"filename", "shared", "nbytes", nullptr};
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
+  constexpr char* kwlist[] = {"filename", "shared", "nbytes", nullptr};
   if (!PyArg_ParseTupleAndKeywords(
-          args, keywds, "s|in", kwlist, &filename, &shared, &nbytes)) {
+          args,
+          keywds,
+          "s|in",
+          const_cast<char**>(kwlist),
+          &filename,
+          &shared,
+          &nbytes)) {
     return nullptr;
   }
   if (shared)
diff --git a/torch/csrc/StorageSharing.cpp b/torch/csrc/StorageSharing.cpp
index 3ab36b672e19..bb66bfa3af5e 100644
--- a/torch/csrc/StorageSharing.cpp
+++ b/torch/csrc/StorageSharing.cpp
@@ -26,6 +26,7 @@
 #endif
 
 #include <ATen/MapAllocator.h>
+#include <ATen/StorageUtils.h>
 #include <torch/csrc/utils/python_numbers.h>
 #include <atomic>
 #include <string>
@@ -91,10 +92,10 @@ static PyObject* THPStorage_shareFilename(PyObject* _self, PyObject* noargs) {
       "_share_filename_: only available on CPU");
   auto self = (THPStorage*)_self;
   c10::StorageImpl* storage = self->cdata;
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  THManagedMapAllocator* ctx;
+  THManagedMapAllocator* ctx =
+      THManagedMapAllocator::fromDataPtr(storage->data_ptr());
   // Storage is already in shared memory, just return a handle
-  if ((ctx = THManagedMapAllocator::fromDataPtr(storage->data_ptr()))) {
+  if (ctx) {
     // done
   } else {
     // TODO: retry on collision
@@ -113,7 +114,7 @@ static PyObject* THPStorage_shareFilename(PyObject* _self, PyObject* noargs) {
     {
       // Copying into shared memory can be slow, so release the GIL
       pybind11::gil_scoped_release no_gil;
-      storage_copy(new_storage, _self_aten);
+      at::storage_copy(new_storage, _self_aten);
     }
 
     std::swap(*storage, *new_storage.unsafeGetStorageImpl());
@@ -173,21 +174,6 @@ static PyObject* THPStorage_newSharedFilename(
   END_HANDLE_TH_ERRORS
 }
 
-static c10::intrusive_ptr<c10::StorageImpl> THPStorage_newFdStorage(
-    ptrdiff_t size) {
-  int flags = at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_EXCLUSIVE |
-      at::ALLOCATOR_MAPPED_KEEPFD | at::ALLOCATOR_MAPPED_UNLINK;
-  std::string handle = at::NewProcessWideShmHandle();
-  auto sptr = at::MapAllocator::makeDataPtr(
-      handle.c_str(), flags, size * sizeof(uint8_t), nullptr);
-  return c10::make_intrusive<at::StorageImpl>(
-      c10::StorageImpl::use_byte_size_t(),
-      size,
-      std::move(sptr),
-      /*allocator=*/nullptr,
-      /*resizable=*/false);
-}
-
 static PyObject* THPStorage_pyNewFdStorage(PyObject* _unused, PyObject* args) {
   HANDLE_TH_ERRORS
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -195,7 +181,7 @@ static PyObject* THPStorage_pyNewFdStorage(PyObject* _unused, PyObject* args) {
   if (!PyArg_ParseTuple(args, "L", &size)) {
     return nullptr;
   }
-  return THPStorage_New(THPStorage_newFdStorage(size));
+  return THPStorage_New(at::new_shm_fd_storage(size));
   END_HANDLE_TH_ERRORS
 }
 
@@ -212,12 +198,12 @@ static PyObject* THPStorage_shareFd(PyObject* _self, PyObject* noargs) {
   if ((ctx = at::MapAllocator::fromDataPtr(storage->data_ptr()))) {
     // done
   } else {
-    at::Storage new_storage(THPStorage_newFdStorage(storage->nbytes()));
+    at::Storage new_storage(at::new_shm_fd_storage(storage->nbytes()));
     at::Storage _self_aten = torch::createStorage(_self);
     {
       // Copying into shared memory can be slow, so release the GIL
       pybind11::gil_scoped_release no_gil;
-      storage_copy(new_storage, _self_aten);
+      at::storage_copy(new_storage, _self_aten);
     }
 
     std::swap(*storage, *new_storage.unsafeGetStorageImpl());
diff --git a/torch/csrc/Stream.cpp b/torch/csrc/Stream.cpp
index a52a0b77d87c..398e7b34af78 100644
--- a/torch/csrc/Stream.cpp
+++ b/torch/csrc/Stream.cpp
@@ -16,13 +16,14 @@ static PyObject* THPStream_pynew(
   int64_t stream_id = 0;
   int64_t device_index = 0;
   int64_t device_type = 0;
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,clang-diagnostic-writable-strings)
-  static char* kwlist[] = {"stream_id", "device_index", "device_type", nullptr};
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
+  constexpr char* kwlist[] = {
+      "stream_id", "device_index", "device_type", nullptr};
   if (!PyArg_ParseTupleAndKeywords(
           args,
           kwargs,
-          "|KKK",
-          kwlist,
+          "|LLL",
+          const_cast<char**>(kwlist),
           &stream_id,
           &device_index,
           &device_type)) {
@@ -48,10 +49,11 @@ static void THPStream_dealloc(THPStream* self) {
 
 static PyObject* THPStream_get_device(THPStream* self, void* unused) {
   HANDLE_TH_ERRORS
-  return THPDevice_New(
-      c10::Stream::unpack3(
-          self->stream_id, self->device_index, self->device_type)
-          .device());
+  return THPDevice_New(c10::Stream::unpack3(
+                           self->stream_id,
+                           self->device_index,
+                           static_cast<c10::DeviceType>(self->device_type))
+                           .device());
   END_HANDLE_TH_ERRORS
 }
 
diff --git a/torch/csrc/api/include/torch/enum.h b/torch/csrc/api/include/torch/enum.h
index 0e52d22b21c9..af900f69bb73 100644
--- a/torch/csrc/api/include/torch/enum.h
+++ b/torch/csrc/api/include/torch/enum.h
@@ -86,14 +86,14 @@
 // `SomeOptions options = {}` can work.
 #define TORCH_OPTIONS_CTOR_VARIANT_ARG3(                                       \
     OPTIONS_NAME, ARG_NAME, TYPE1, TYPE2, TYPE3)                               \
-  OPTIONS_NAME() {}                                                            \
+  OPTIONS_NAME() = default;                                                    \
   OPTIONS_NAME(torch::enumtype::TYPE1 ARG_NAME) : ARG_NAME##_(torch::TYPE1) {} \
   OPTIONS_NAME(torch::enumtype::TYPE2 ARG_NAME) : ARG_NAME##_(torch::TYPE2) {} \
   OPTIONS_NAME(torch::enumtype::TYPE3 ARG_NAME) : ARG_NAME##_(torch::TYPE3) {}
 
 #define TORCH_OPTIONS_CTOR_VARIANT_ARG4(                                       \
     OPTIONS_NAME, ARG_NAME, TYPE1, TYPE2, TYPE3, TYPE4)                        \
-  OPTIONS_NAME() {}                                                            \
+  OPTIONS_NAME() = default;                                                    \
   OPTIONS_NAME(torch::enumtype::TYPE1 ARG_NAME) : ARG_NAME##_(torch::TYPE1) {} \
   OPTIONS_NAME(torch::enumtype::TYPE2 ARG_NAME) : ARG_NAME##_(torch::TYPE2) {} \
   OPTIONS_NAME(torch::enumtype::TYPE3 ARG_NAME) : ARG_NAME##_(torch::TYPE3) {} \
diff --git a/torch/csrc/api/include/torch/nn/functional/embedding.h b/torch/csrc/api/include/torch/nn/functional/embedding.h
index dc5452b39907..8a729813785d 100644
--- a/torch/csrc/api/include/torch/nn/functional/embedding.h
+++ b/torch/csrc/api/include/torch/nn/functional/embedding.h
@@ -126,17 +126,6 @@ inline Tensor embedding_bag(
     TORCH_CHECK(
         offsets_.defined(), "offsets has to be a 1D Tensor but got null");
     TORCH_CHECK(offsets_.dim() == 1, "offsets has to be a 1D Tensor");
-    TORCH_CHECK(
-        offsets_[0].item<int64_t>() == 0,
-        "offsets[0] has to be 0, i.e., the first sequence in the mini-batch has to start from position 0. However, got ",
-        offsets_[0].item<int64_t>());
-    TORCH_CHECK(
-        offsets_[-1].item<int64_t>() <= input_.size(0),
-        "offsets[-1] can not be greater than input's length({",
-        input_.size(0),
-        "}), but got offsets[-1] of {",
-        offsets_[-1].item<int64_t>(),
-        "}");
   } else {
     TORCH_CHECK(
         false,
diff --git a/torch/csrc/api/include/torch/nn/module.h b/torch/csrc/api/include/torch/nn/module.h
index ff0348eb841b..20d1024ad410 100644
--- a/torch/csrc/api/include/torch/nn/module.h
+++ b/torch/csrc/api/include/torch/nn/module.h
@@ -302,7 +302,7 @@ class TORCH_API Module : public std::enable_shared_from_this<Module> {
   virtual void to(torch::Device device, bool non_blocking = false);
 
   /// Recursively zeros out the `grad` value of each registered parameter.
-  virtual void zero_grad(bool set_to_none = false);
+  virtual void zero_grad(bool set_to_none = true);
 
   /// Attempts to cast this `Module` to the given `ModuleType`.
   ///
diff --git a/torch/csrc/api/include/torch/nn/modules/activation.h b/torch/csrc/api/include/torch/nn/modules/activation.h
index b52c2b12d8f6..68056ec458eb 100644
--- a/torch/csrc/api/include/torch/nn/modules/activation.h
+++ b/torch/csrc/api/include/torch/nn/modules/activation.h
@@ -24,7 +24,6 @@ namespace nn {
 /// ```
 /// ELU model(ELUOptions().alpha(42.42).inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ELUImpl : public torch::nn::Cloneable<ELUImpl> {
  public:
   explicit ELUImpl(const ELUOptions& options_ = {});
@@ -60,7 +59,6 @@ TORCH_MODULE(ELU);
 /// ```
 /// SELU model(SELUOptions().inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API SELUImpl : public torch::nn::Cloneable<SELUImpl> {
  public:
   explicit SELUImpl(const SELUOptions& options_ = {});
@@ -96,7 +94,6 @@ TORCH_MODULE(SELU);
 /// ```
 /// Hardshrink model(HardshrinkOptions().lambda(42.42));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API HardshrinkImpl : public torch::nn::Cloneable<HardshrinkImpl> {
  public:
   explicit HardshrinkImpl(const HardshrinkOptions& options_ = {});
@@ -133,7 +130,6 @@ TORCH_MODULE(Hardshrink);
 /// Hardtanh
 /// model(HardtanhOptions().min_val(-42.42).max_val(0.42).inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API HardtanhImpl : public torch::nn::Cloneable<HardtanhImpl> {
  public:
   explicit HardtanhImpl(const HardtanhOptions& options_ = {});
@@ -169,7 +165,6 @@ TORCH_MODULE(Hardtanh);
 /// ```
 /// LeakyReLU model(LeakyReLUOptions().negative_slope(0.42).inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API LeakyReLUImpl : public torch::nn::Cloneable<LeakyReLUImpl> {
  public:
   explicit LeakyReLUImpl(const LeakyReLUOptions& options_ = {});
@@ -197,7 +192,6 @@ TORCH_MODULE(LeakyReLU);
 /// Applies the LogSigmoid function element-wise.
 /// See https://pytorch.org/docs/master/nn.html#torch.nn.LogSigmoid to learn
 /// about the exact behavior of this module.
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API LogSigmoidImpl : public torch::nn::Cloneable<LogSigmoidImpl> {
  public:
   Tensor forward(const Tensor& input);
@@ -227,7 +221,6 @@ TORCH_MODULE(LogSigmoid);
 /// ```
 /// Softmax model(SoftmaxOptions(1));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API SoftmaxImpl : public torch::nn::Cloneable<SoftmaxImpl> {
  public:
   explicit SoftmaxImpl(int64_t dim) : SoftmaxImpl(SoftmaxOptions(dim)) {}
@@ -263,7 +256,6 @@ TORCH_MODULE(Softmax);
 /// ```
 /// Softmin model(SoftminOptions(1));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API SoftminImpl : public torch::nn::Cloneable<SoftminImpl> {
  public:
   explicit SoftminImpl(int64_t dim) : SoftminImpl(SoftminOptions(dim)) {}
@@ -299,7 +291,6 @@ TORCH_MODULE(Softmin);
 /// ```
 /// LogSoftmax model(LogSoftmaxOptions(1));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API LogSoftmaxImpl : public torch::nn::Cloneable<LogSoftmaxImpl> {
  public:
   explicit LogSoftmaxImpl(int64_t dim)
@@ -328,7 +319,6 @@ TORCH_MODULE(LogSoftmax);
 /// Applies the Softmax2d function element-wise.
 /// See https://pytorch.org/docs/master/nn.html#torch.nn.Softmax2d to learn
 /// about the exact behavior of this module.
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API Softmax2dImpl : public torch::nn::Cloneable<Softmax2dImpl> {
  public:
   Tensor forward(const Tensor& input);
@@ -358,7 +348,6 @@ TORCH_MODULE(Softmax2d);
 /// ```
 /// PReLU model(PReLUOptions().num_parameters(42));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API PReLUImpl : public torch::nn::Cloneable<PReLUImpl> {
  public:
   explicit PReLUImpl(const PReLUOptions& options_ = {});
@@ -397,7 +386,6 @@ TORCH_MODULE(PReLU);
 /// ```
 /// ReLU model(ReLUOptions().inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ReLUImpl : public torch::nn::Cloneable<ReLUImpl> {
  public:
   explicit ReLUImpl(const ReLUOptions& options_ = {});
@@ -433,7 +421,6 @@ TORCH_MODULE(ReLU);
 /// ```
 /// ReLU6 model(ReLU6Options().inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ReLU6Impl : public torch::nn::Cloneable<ReLU6Impl> {
  public:
   explicit ReLU6Impl(const ReLU6Options& options_ = {});
@@ -469,7 +456,6 @@ TORCH_MODULE(ReLU6);
 /// ```
 /// RReLU model(RReLUOptions().lower(0.24).upper(0.42).inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API RReLUImpl : public torch::nn::Cloneable<RReLUImpl> {
  public:
   explicit RReLUImpl(const RReLUOptions& options_ = {});
@@ -505,7 +491,6 @@ TORCH_MODULE(RReLU);
 /// ```
 /// CELU model(CELUOptions().alpha(42.42).inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API CELUImpl : public torch::nn::Cloneable<CELUImpl> {
  public:
   explicit CELUImpl(const CELUOptions& options_ = {});
@@ -541,7 +526,6 @@ TORCH_MODULE(CELU);
 /// ```
 /// GLU model(GLUOptions(1));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API GLUImpl : public torch::nn::Cloneable<GLUImpl> {
  public:
   explicit GLUImpl(const GLUOptions& options_ = {});
@@ -569,7 +553,6 @@ TORCH_MODULE(GLU);
 /// Applies gelu over a given input.
 /// See https://pytorch.org/docs/master/nn.html#torch.nn.GELU to learn
 /// about the exact behavior of this module.
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API GELUImpl : public torch::nn::Cloneable<GELUImpl> {
  public:
   explicit GELUImpl(GELUOptions options_ = {});
@@ -596,7 +579,6 @@ TORCH_MODULE(GELU);
 /// Applies silu over a given input.
 /// See https://pytorch.org/docs/master/nn.html#torch.nn.SiLU to learn
 /// about the exact behavior of this module.
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API SiLUImpl : public torch::nn::Cloneable<SiLUImpl> {
  public:
   Tensor forward(const Tensor& input);
@@ -618,7 +600,6 @@ TORCH_MODULE(SiLU);
 /// Applies mish over a given input.
 /// See https://pytorch.org/docs/master/nn.html#torch.nn.Mish to learn
 /// about the exact behavior of this module.
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API MishImpl : public torch::nn::Cloneable<MishImpl> {
  public:
   Tensor forward(const Tensor& input);
@@ -640,7 +621,6 @@ TORCH_MODULE(Mish);
 /// Applies sigmoid over a given input.
 /// See https://pytorch.org/docs/master/nn.html#torch.nn.Sigmoid to learn
 /// about the exact behavior of this module.
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API SigmoidImpl : public torch::nn::Cloneable<SigmoidImpl> {
  public:
   Tensor forward(const Tensor& input);
@@ -670,7 +650,6 @@ TORCH_MODULE(Sigmoid);
 /// ```
 /// Softplus model(SoftplusOptions().beta(0.24).threshold(42.42));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API SoftplusImpl : public torch::nn::Cloneable<SoftplusImpl> {
  public:
   explicit SoftplusImpl(const SoftplusOptions& options_ = {});
@@ -706,7 +685,6 @@ TORCH_MODULE(Softplus);
 /// ```
 /// Softshrink model(SoftshrinkOptions(42.42));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API SoftshrinkImpl : public torch::nn::Cloneable<SoftshrinkImpl> {
  public:
   explicit SoftshrinkImpl(const SoftshrinkOptions& options_ = {});
@@ -734,7 +712,6 @@ TORCH_MODULE(Softshrink);
 /// Applies Softsign over a given input.
 /// See https://pytorch.org/docs/master/nn.html#torch.nn.Softsign to learn
 /// about the exact behavior of this module.
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API SoftsignImpl : public torch::nn::Cloneable<SoftsignImpl> {
  public:
   Tensor forward(const Tensor& input);
@@ -756,7 +733,6 @@ TORCH_MODULE(Softsign);
 /// Applies Tanh over a given input.
 /// See https://pytorch.org/docs/master/nn.html#torch.nn.Tanh to learn
 /// about the exact behavior of this module.
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API TanhImpl : public torch::nn::Cloneable<TanhImpl> {
  public:
   Tensor forward(const Tensor& input);
@@ -778,7 +754,6 @@ TORCH_MODULE(Tanh);
 /// Applies Tanhshrink over a given input.
 /// See https://pytorch.org/docs/master/nn.html#torch.nn.Tanhshrink to learn
 /// about the exact behavior of this module.
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API TanhshrinkImpl : public torch::nn::Cloneable<TanhshrinkImpl> {
  public:
   Tensor forward(const Tensor& input);
@@ -808,7 +783,6 @@ TORCH_MODULE(Tanhshrink);
 /// ```
 /// Threshold model(ThresholdOptions(42.42, 24.24).inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ThresholdImpl : public torch::nn::Cloneable<ThresholdImpl> {
  public:
   ThresholdImpl(double threshold, double value)
@@ -846,7 +820,6 @@ TORCH_MODULE(Threshold);
 /// ```
 /// MultiheadAttention model(MultiheadAttentionOptions(20, 10).bias(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API MultiheadAttentionImpl
     : public torch::nn::Cloneable<MultiheadAttentionImpl> {
  public:
diff --git a/torch/csrc/api/include/torch/nn/modules/adaptive.h b/torch/csrc/api/include/torch/nn/modules/adaptive.h
index b8b5170d177a..939d57dd5d51 100644
--- a/torch/csrc/api/include/torch/nn/modules/adaptive.h
+++ b/torch/csrc/api/include/torch/nn/modules/adaptive.h
@@ -41,7 +41,6 @@ struct TORCH_API ASMoutput {
 /// AdaptiveLogSoftmaxWithLoss model(AdaptiveLogSoftmaxWithLossOptions(8, 10,
 /// {4, 8}).div_value(2.).head_bias(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AdaptiveLogSoftmaxWithLossImpl
     : public Cloneable<AdaptiveLogSoftmaxWithLossImpl> {
  public:
diff --git a/torch/csrc/api/include/torch/nn/modules/batchnorm.h b/torch/csrc/api/include/torch/nn/modules/batchnorm.h
index 66dc747654d1..943e80bf01b1 100644
--- a/torch/csrc/api/include/torch/nn/modules/batchnorm.h
+++ b/torch/csrc/api/include/torch/nn/modules/batchnorm.h
@@ -15,7 +15,6 @@ namespace nn {
 /// Base class for all (dimension-specialized) batchnorm and instancenorm
 /// modules.
 template <size_t D, typename Derived, typename DerivedOptions>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class NormImplBase : public torch::nn::Cloneable<Derived> {
  protected:
   virtual void _check_input_dim(const Tensor& input) = 0;
@@ -99,7 +98,6 @@ class NormImplBase : public torch::nn::Cloneable<Derived> {
 
 /// Base class for all (dimension-specialized) batchnorm modules.
 template <size_t D, typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class BatchNormImplBase : public NormImplBase<D, Derived, BatchNormOptions> {
  public:
   using NormImplBase<D, Derived, BatchNormOptions>::NormImplBase;
@@ -157,7 +155,6 @@ class BatchNormImplBase : public NormImplBase<D, Derived, BatchNormOptions> {
 /// BatchNorm1d
 /// model(BatchNorm1dOptions(4).eps(0.5).momentum(0.1).affine(false).track_running_stats(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API BatchNorm1dImpl : public BatchNormImplBase<1, BatchNorm1dImpl> {
  protected:
   void _check_input_dim(const Tensor& input) override;
@@ -188,7 +185,6 @@ TORCH_MODULE(BatchNorm1d);
 /// BatchNorm2d
 /// model(BatchNorm2dOptions(4).eps(0.5).momentum(0.1).affine(false).track_running_stats(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API BatchNorm2dImpl : public BatchNormImplBase<2, BatchNorm2dImpl> {
  protected:
   void _check_input_dim(const Tensor& input) override;
@@ -219,7 +215,6 @@ TORCH_MODULE(BatchNorm2d);
 /// BatchNorm3d
 /// model(BatchNorm3dOptions(4).eps(0.5).momentum(0.1).affine(false).track_running_stats(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API BatchNorm3dImpl : public BatchNormImplBase<3, BatchNorm3dImpl> {
  protected:
   void _check_input_dim(const Tensor& input) override;
diff --git a/torch/csrc/api/include/torch/nn/modules/container/functional.h b/torch/csrc/api/include/torch/nn/modules/container/functional.h
index d1af0e0fd504..2f87be9df568 100644
--- a/torch/csrc/api/include/torch/nn/modules/container/functional.h
+++ b/torch/csrc/api/include/torch/nn/modules/container/functional.h
@@ -55,7 +55,6 @@ namespace nn {
 ///
 /// Note that `Functional` overloads the call operator (`operator()`) such that
 /// you can invoke it with `my_func(...)`.
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API FunctionalImpl : public torch::nn::Cloneable<FunctionalImpl> {
  public:
   using Function = std::function<Tensor(Tensor)>;
diff --git a/torch/csrc/api/include/torch/nn/modules/container/moduledict.h b/torch/csrc/api/include/torch/nn/modules/container/moduledict.h
index fe0264333851..1f7fffa5919f 100644
--- a/torch/csrc/api/include/torch/nn/modules/container/moduledict.h
+++ b/torch/csrc/api/include/torch/nn/modules/container/moduledict.h
@@ -64,7 +64,6 @@ namespace nn {
 /// iteration over submodules, positional access, adding new modules from a
 /// vector of key-module pairs or an `OrderedDict` or another `ModuleDict` after
 /// construction via `update`.
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class ModuleDictImpl : public Cloneable<ModuleDictImpl> {
  public:
   using Iterator =
@@ -179,7 +178,14 @@ class ModuleDictImpl : public Cloneable<ModuleDictImpl> {
     static_assert(
         torch::detail::is_module<T>::value,
         "Can only call ModuleList::at with an nn::Module type");
-    return *modules_[key]->as<T>();
+    auto module = modules_[key]->as<T>();
+    TORCH_CHECK(
+        module,
+        "Unable to cast module[",
+        key,
+        "] to ",
+        c10::demangle(typeid(T).name()));
+    return *module;
   }
 
   /// Attempts to return the module at the given key as the requested type.
@@ -190,7 +196,14 @@ class ModuleDictImpl : public Cloneable<ModuleDictImpl> {
     static_assert(
         torch::detail::is_module<T>::value,
         "Can only call ModuleList::at with an nn::Module type");
-    return *modules_[key]->as<T>();
+    const auto module = modules_[key]->as<T>();
+    TORCH_CHECK(
+        module,
+        "Unable to cast module[",
+        key,
+        "] to ",
+        c10::demangle(typeid(T).name()));
+    return *module;
   }
 
   /// Removes and returns the `Module` associated with the given `key`.
diff --git a/torch/csrc/api/include/torch/nn/modules/container/modulelist.h b/torch/csrc/api/include/torch/nn/modules/container/modulelist.h
index 8cf7850a825d..72a76163ac03 100644
--- a/torch/csrc/api/include/torch/nn/modules/container/modulelist.h
+++ b/torch/csrc/api/include/torch/nn/modules/container/modulelist.h
@@ -54,7 +54,6 @@ namespace nn {
 /// iteration over submodules, positional access, adding a new module after
 /// construction via `push_back`, as well as joining two `ModuleList`s via
 /// `extend`.
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class ModuleListImpl : public Cloneable<ModuleListImpl> {
  public:
   using Iterator = std::vector<std::shared_ptr<Module>>::iterator;
@@ -148,7 +147,14 @@ class ModuleListImpl : public Cloneable<ModuleListImpl> {
         torch::detail::is_module<T>::value,
         "Can only call ModuleList::at with an nn::Module type");
     TORCH_CHECK(index < size(), "Index out of range");
-    return *modules_[index]->as<T>();
+    auto module = modules_[index]->as<T>();
+    TORCH_CHECK(
+        module,
+        "Unable to cast module[",
+        index,
+        "] to ",
+        c10::demangle(typeid(T).name()));
+    return *module;
   }
 
   /// Attempts to return the module at the given index as the requested type.
@@ -160,7 +166,14 @@ class ModuleListImpl : public Cloneable<ModuleListImpl> {
         torch::detail::is_module<T>::value,
         "Can only call ModuleList::at with an nn::Module type");
     TORCH_CHECK(index < size(), "Index out of range");
-    return *modules_[index]->as<T>();
+    const auto module = modules_[index]->as<T>();
+    TORCH_CHECK(
+        module,
+        "Unable to cast module[",
+        index,
+        "] to ",
+        c10::demangle(typeid(T).name()));
+    return *module;
   }
 
   /// Attempts to return a `std::shared_ptr` whose dynamic type is that of the
diff --git a/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h b/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h
index 1e00c32cdc76..f201825deb5b 100644
--- a/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h
+++ b/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h
@@ -9,7 +9,6 @@
 namespace torch {
 namespace nn {
 
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class ParameterDictImpl : public Cloneable<ParameterDictImpl> {
  public:
   using Iterator = OrderedDict<std::string, Tensor>::Iterator;
diff --git a/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h b/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h
index 34e215bffcbe..30b7eb89e48b 100644
--- a/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h
+++ b/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h
@@ -7,7 +7,6 @@
 
 namespace torch {
 namespace nn {
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class ParameterListImpl : public Cloneable<ParameterListImpl> {
  public:
   using Iterator = typename std::vector<
diff --git a/torch/csrc/api/include/torch/nn/modules/container/sequential.h b/torch/csrc/api/include/torch/nn/modules/container/sequential.h
index ead0f8294492..0d826c6b7fea 100644
--- a/torch/csrc/api/include/torch/nn/modules/container/sequential.h
+++ b/torch/csrc/api/include/torch/nn/modules/container/sequential.h
@@ -89,7 +89,6 @@ namespace nn {
 ///   must accept a single argument. If your modules need to take multiple
 ///   arguments, you should define them to take and return tuples.
 /// \endrst
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class SequentialImpl : public Cloneable<SequentialImpl> {
  public:
   using Iterator = std::vector<AnyModule>::iterator;
diff --git a/torch/csrc/api/include/torch/nn/modules/conv.h b/torch/csrc/api/include/torch/nn/modules/conv.h
index a53edf702d8e..bb47116bb365 100644
--- a/torch/csrc/api/include/torch/nn/modules/conv.h
+++ b/torch/csrc/api/include/torch/nn/modules/conv.h
@@ -22,7 +22,6 @@ namespace nn {
 
 /// Base class for all (dimension-specialized) convolution modules.
 template <size_t D, typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class ConvNdImpl : public torch::nn::Cloneable<Derived> {
  public:
   explicit ConvNdImpl(detail::ConvNdOptions<D> options_)
@@ -177,7 +176,6 @@ class ConvNdImpl : public torch::nn::Cloneable<Derived> {
 /// ```
 /// Conv1d model(Conv1dOptions(3, 2, 3).stride(1).bias(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API Conv1dImpl : public ConvNdImpl<1, Conv1dImpl> {
  public:
   Conv1dImpl(
@@ -210,7 +208,6 @@ TORCH_MODULE(Conv1d);
 /// ```
 /// Conv2d model(Conv2dOptions(3, 2, 3).stride(1).bias(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API Conv2dImpl : public ConvNdImpl<2, Conv2dImpl> {
  public:
   Conv2dImpl(
@@ -246,7 +243,6 @@ TORCH_MODULE(Conv2d);
 /// ```
 /// Conv3d model(Conv3dOptions(3, 2, 3).stride(1).bias(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API Conv3dImpl : public ConvNdImpl<3, Conv3dImpl> {
  public:
   Conv3dImpl(
@@ -270,7 +266,6 @@ TORCH_MODULE(Conv3d);
 
 /// Base class for all (dimension-specialized) convolution transpose modules.
 template <size_t D, typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class ConvTransposeNdImpl : public ConvNdImpl<D, Derived> {
  public:
   using torch::nn::ConvNdImpl<D, Derived>::ConvNdImpl;
@@ -339,7 +334,6 @@ class ConvTransposeNdImpl : public ConvNdImpl<D, Derived> {
 /// ConvTranspose1d model(ConvTranspose1dOptions(3, 2,
 /// 3).stride(1).bias(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ConvTranspose1dImpl
     : public ConvTransposeNdImpl<1, ConvTranspose1dImpl> {
  public:
@@ -382,7 +376,6 @@ TORCH_MODULE(ConvTranspose1d);
 /// ConvTranspose2d model(ConvTranspose2dOptions(3, 2,
 /// 3).stride(1).bias(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ConvTranspose2dImpl
     : public ConvTransposeNdImpl<2, ConvTranspose2dImpl> {
  public:
@@ -425,7 +418,6 @@ TORCH_MODULE(ConvTranspose2d);
 /// ConvTranspose3d model(ConvTranspose3dOptions(2, 2,
 /// 2).stride(1).bias(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ConvTranspose3dImpl
     : public ConvTransposeNdImpl<3, ConvTranspose3dImpl> {
  public:
diff --git a/torch/csrc/api/include/torch/nn/modules/distance.h b/torch/csrc/api/include/torch/nn/modules/distance.h
index 6cf0b044eb39..93a872476436 100644
--- a/torch/csrc/api/include/torch/nn/modules/distance.h
+++ b/torch/csrc/api/include/torch/nn/modules/distance.h
@@ -23,7 +23,6 @@ namespace nn {
 /// ```
 /// CosineSimilarity model(CosineSimilarityOptions().dim(0).eps(0.5));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API CosineSimilarityImpl : public Cloneable<CosineSimilarityImpl> {
  public:
   explicit CosineSimilarityImpl(const CosineSimilarityOptions& options_ = {});
@@ -61,7 +60,6 @@ TORCH_MODULE(CosineSimilarity);
 /// PairwiseDistance
 /// model(PairwiseDistanceOptions().p(3).eps(0.5).keepdim(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API PairwiseDistanceImpl : public Cloneable<PairwiseDistanceImpl> {
  public:
   explicit PairwiseDistanceImpl(const PairwiseDistanceOptions& options_ = {});
diff --git a/torch/csrc/api/include/torch/nn/modules/dropout.h b/torch/csrc/api/include/torch/nn/modules/dropout.h
index af49b1e98791..7cc7dfb80fbd 100644
--- a/torch/csrc/api/include/torch/nn/modules/dropout.h
+++ b/torch/csrc/api/include/torch/nn/modules/dropout.h
@@ -16,7 +16,6 @@ namespace nn {
 namespace detail {
 
 template <typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class _DropoutNd : public torch::nn::Cloneable<Derived> {
  public:
   _DropoutNd(double p) : _DropoutNd(DropoutOptions().p(p)){};
@@ -52,7 +51,6 @@ class _DropoutNd : public torch::nn::Cloneable<Derived> {
 /// ```
 /// Dropout model(DropoutOptions().p(0.42).inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API DropoutImpl : public detail::_DropoutNd<DropoutImpl> {
  public:
   using detail::_DropoutNd<DropoutImpl>::_DropoutNd;
@@ -83,7 +81,6 @@ TORCH_MODULE(Dropout);
 /// ```
 /// Dropout2d model(Dropout2dOptions().p(0.42).inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API Dropout2dImpl : public detail::_DropoutNd<Dropout2dImpl> {
  public:
   using detail::_DropoutNd<Dropout2dImpl>::_DropoutNd;
@@ -114,7 +111,6 @@ TORCH_MODULE(Dropout2d);
 /// ```
 /// Dropout3d model(Dropout3dOptions().p(0.42).inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API Dropout3dImpl : public detail::_DropoutNd<Dropout3dImpl> {
  public:
   using detail::_DropoutNd<Dropout3dImpl>::_DropoutNd;
@@ -145,7 +141,6 @@ TORCH_MODULE(Dropout3d);
 /// ```
 /// AlphaDropout model(AlphaDropoutOptions(0.2).inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AlphaDropoutImpl : public detail::_DropoutNd<AlphaDropoutImpl> {
  public:
   using detail::_DropoutNd<AlphaDropoutImpl>::_DropoutNd;
@@ -173,7 +168,6 @@ TORCH_MODULE(AlphaDropout);
 /// ```
 /// FeatureAlphaDropout model(FeatureAlphaDropoutOptions(0.2).inplace(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API FeatureAlphaDropoutImpl
     : public detail::_DropoutNd<FeatureAlphaDropoutImpl> {
  public:
diff --git a/torch/csrc/api/include/torch/nn/modules/embedding.h b/torch/csrc/api/include/torch/nn/modules/embedding.h
index 3bf305c4cbd8..fcaddd46e83b 100644
--- a/torch/csrc/api/include/torch/nn/modules/embedding.h
+++ b/torch/csrc/api/include/torch/nn/modules/embedding.h
@@ -27,12 +27,11 @@ namespace nn {
 /// Embedding model(EmbeddingOptions(10,
 /// 2).padding_idx(3).max_norm(2).norm_type(2.5).scale_grad_by_freq(true).sparse(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API EmbeddingImpl : public torch::nn::Cloneable<EmbeddingImpl> {
  public:
   EmbeddingImpl(int64_t num_embeddings, int64_t embedding_dim)
       : EmbeddingImpl(EmbeddingOptions(num_embeddings, embedding_dim)) {}
-  explicit EmbeddingImpl(const EmbeddingOptions& options_);
+  explicit EmbeddingImpl(EmbeddingOptions options_);
 
   void reset() override;
 
@@ -104,13 +103,12 @@ class Embedding : public torch::nn::ModuleHolder<EmbeddingImpl> {
 /// EmbeddingBag model(EmbeddingBagOptions(10,
 /// 2).max_norm(2).norm_type(2.5).scale_grad_by_freq(true).sparse(true).mode(torch::kSum).padding_idx(1));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API EmbeddingBagImpl
     : public torch::nn::Cloneable<EmbeddingBagImpl> {
  public:
   EmbeddingBagImpl(int64_t num_embeddings, int64_t embedding_dim)
       : EmbeddingBagImpl(EmbeddingBagOptions(num_embeddings, embedding_dim)) {}
-  explicit EmbeddingBagImpl(const EmbeddingBagOptions& options_);
+  explicit EmbeddingBagImpl(EmbeddingBagOptions options_);
 
   void reset() override;
 
diff --git a/torch/csrc/api/include/torch/nn/modules/fold.h b/torch/csrc/api/include/torch/nn/modules/fold.h
index ff2d9e331d0f..da16381058a8 100644
--- a/torch/csrc/api/include/torch/nn/modules/fold.h
+++ b/torch/csrc/api/include/torch/nn/modules/fold.h
@@ -22,7 +22,6 @@ namespace nn {
 /// Fold model(FoldOptions({8, 8}, {3, 3}).dilation(2).padding({2,
 /// 1}).stride(2));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API FoldImpl : public torch::nn::Cloneable<FoldImpl> {
  public:
   FoldImpl(ExpandingArray<2> output_size, ExpandingArray<2> kernel_size)
@@ -60,7 +59,6 @@ TORCH_MODULE(Fold);
 /// ```
 /// Unfold model(UnfoldOptions({2, 4}).dilation(2).padding({2, 1}).stride(2));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API UnfoldImpl : public Cloneable<UnfoldImpl> {
  public:
   UnfoldImpl(ExpandingArray<2> kernel_size)
diff --git a/torch/csrc/api/include/torch/nn/modules/instancenorm.h b/torch/csrc/api/include/torch/nn/modules/instancenorm.h
index 83b0ea1fbfbe..3b22e6ee011b 100644
--- a/torch/csrc/api/include/torch/nn/modules/instancenorm.h
+++ b/torch/csrc/api/include/torch/nn/modules/instancenorm.h
@@ -8,7 +8,6 @@ namespace nn {
 
 /// Base class for all (dimension-specialized) instance norm modules
 template <size_t D, typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class InstanceNormImpl
     : public torch::nn::NormImplBase<D, Derived, InstanceNormOptions> {
  private:
@@ -64,7 +63,6 @@ class InstanceNormImpl
 /// InstanceNorm1d
 /// model(InstanceNorm1dOptions(4).eps(0.5).momentum(0.1).affine(false).track_running_stats(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API InstanceNorm1dImpl
     : public InstanceNormImpl<1, InstanceNorm1dImpl> {
  protected:
@@ -96,7 +94,6 @@ TORCH_MODULE(InstanceNorm1d);
 /// InstanceNorm2d
 /// model(InstanceNorm2dOptions(4).eps(0.5).momentum(0.1).affine(false).track_running_stats(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API InstanceNorm2dImpl
     : public InstanceNormImpl<2, InstanceNorm2dImpl> {
  protected:
@@ -128,7 +125,6 @@ TORCH_MODULE(InstanceNorm2d);
 /// InstanceNorm3d
 /// model(InstanceNorm3dOptions(4).eps(0.5).momentum(0.1).affine(false).track_running_stats(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API InstanceNorm3dImpl
     : public InstanceNormImpl<3, InstanceNorm3dImpl> {
  protected:
diff --git a/torch/csrc/api/include/torch/nn/modules/linear.h b/torch/csrc/api/include/torch/nn/modules/linear.h
index 6ba9e35eef65..a58fdb36b43d 100644
--- a/torch/csrc/api/include/torch/nn/modules/linear.h
+++ b/torch/csrc/api/include/torch/nn/modules/linear.h
@@ -18,7 +18,6 @@ namespace nn {
 /// A placeholder identity operator that is argument-insensitive.
 /// See https://pytorch.org/docs/master/generated/torch.nn.Identity.html to
 /// learn about the exact behavior of this module.
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API IdentityImpl : public Cloneable<IdentityImpl> {
  public:
   void reset() override;
@@ -48,7 +47,6 @@ TORCH_MODULE(Identity);
 /// ```
 /// Linear model(LinearOptions(5, 2).bias(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API LinearImpl : public Cloneable<LinearImpl> {
  public:
   LinearImpl(int64_t in_features, int64_t out_features)
@@ -97,7 +95,6 @@ TORCH_MODULE(Linear);
 /// ```
 /// Flatten model(FlattenOptions().start_dim(2).end_dim(4));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API FlattenImpl : public Cloneable<FlattenImpl> {
  public:
   explicit FlattenImpl(const FlattenOptions& options_ = {});
@@ -136,7 +133,6 @@ TORCH_MODULE(Flatten);
 /// Unflatten model(UnflattenOptions(0, {2, 2}));
 /// Unflatten model(UnflattenOptions("B", {{"B1", 2}, {"B2", 2}}));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API UnflattenImpl : public Cloneable<UnflattenImpl> {
  public:
   UnflattenImpl(int64_t dim, std::vector<int64_t> sizes)
@@ -177,7 +173,6 @@ TORCH_MODULE(Unflatten);
 /// ```
 /// Bilinear model(BilinearOptions(3, 2, 4).bias(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API BilinearImpl : public Cloneable<BilinearImpl> {
  public:
   BilinearImpl(int64_t in1_features, int64_t in2_features, int64_t out_features)
diff --git a/torch/csrc/api/include/torch/nn/modules/loss.h b/torch/csrc/api/include/torch/nn/modules/loss.h
index cabc1a0ed811..f34cfbf59334 100644
--- a/torch/csrc/api/include/torch/nn/modules/loss.h
+++ b/torch/csrc/api/include/torch/nn/modules/loss.h
@@ -29,9 +29,8 @@ namespace nn {
 /// ```
 /// L1Loss model(L1LossOptions(torch::kNone));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API L1LossImpl : Cloneable<L1LossImpl> {
-  explicit L1LossImpl(const L1LossOptions& options_ = {});
+  explicit L1LossImpl(L1LossOptions options_ = {});
 
   void reset() override;
 
@@ -65,9 +64,8 @@ TORCH_MODULE(L1Loss);
 /// ```
 /// KLDivLoss model(KLDivLossOptions().reduction(torch::kNone));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API KLDivLossImpl : Cloneable<KLDivLossImpl> {
-  explicit KLDivLossImpl(const KLDivLossOptions& options_ = {});
+  explicit KLDivLossImpl(KLDivLossOptions options_ = {});
 
   void reset() override;
 
@@ -101,9 +99,8 @@ TORCH_MODULE(KLDivLoss);
 /// ```
 /// MSELoss model(MSELossOptions(torch::kNone));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API MSELossImpl : Cloneable<MSELossImpl> {
-  explicit MSELossImpl(const MSELossOptions& options_ = {});
+  explicit MSELossImpl(MSELossOptions options_ = {});
 
   void reset() override;
 
@@ -137,9 +134,8 @@ TORCH_MODULE(MSELoss);
 /// ```
 /// BCELoss model(BCELossOptions().reduction(torch::kNone).weight(weight));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API BCELossImpl : Cloneable<BCELossImpl> {
-  explicit BCELossImpl(const BCELossOptions& options_ = {});
+  explicit BCELossImpl(BCELossOptions options_ = {});
 
   void reset() override;
 
@@ -175,10 +171,8 @@ TORCH_MODULE(BCELoss);
 /// HingeEmbeddingLoss
 /// model(HingeEmbeddingLossOptions().margin(4).reduction(torch::kNone));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API HingeEmbeddingLossImpl : Cloneable<HingeEmbeddingLossImpl> {
-  explicit HingeEmbeddingLossImpl(
-      const HingeEmbeddingLossOptions& options_ = {});
+  explicit HingeEmbeddingLossImpl(HingeEmbeddingLossOptions options_ = {});
 
   void reset() override;
 
@@ -215,9 +209,8 @@ TORCH_MODULE(HingeEmbeddingLoss);
 /// ```
 /// MultiMarginLoss model(MultiMarginLossOptions().margin(2).weight(weight));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API MultiMarginLossImpl : public Cloneable<MultiMarginLossImpl> {
-  explicit MultiMarginLossImpl(const MultiMarginLossOptions& options_ = {});
+  explicit MultiMarginLossImpl(MultiMarginLossOptions options_ = {});
 
   void reset() override;
 
@@ -255,11 +248,9 @@ TORCH_MODULE(MultiMarginLoss);
 /// ```
 /// CosineEmbeddingLoss model(CosineEmbeddingLossOptions().margin(0.5));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API CosineEmbeddingLossImpl
     : public Cloneable<CosineEmbeddingLossImpl> {
-  explicit CosineEmbeddingLossImpl(
-      const CosineEmbeddingLossOptions& options_ = {});
+  explicit CosineEmbeddingLossImpl(CosineEmbeddingLossOptions options_ = {});
 
   void reset() override;
 
@@ -299,9 +290,8 @@ TORCH_MODULE(CosineEmbeddingLoss);
 /// ```
 /// SmoothL1Loss model(SmoothL1LossOptions().reduction(torch::kNone).beta(0.5));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API SmoothL1LossImpl : public Cloneable<SmoothL1LossImpl> {
-  explicit SmoothL1LossImpl(const SmoothL1LossOptions& options_ = {});
+  explicit SmoothL1LossImpl(SmoothL1LossOptions options = {});
 
   void reset() override;
 
@@ -336,9 +326,8 @@ TORCH_MODULE(SmoothL1Loss);
 /// ```
 /// HuberLoss model(HuberLossOptions().reduction(torch::kNone).delta(0.5));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API HuberLossImpl : public Cloneable<HuberLossImpl> {
-  explicit HuberLossImpl(const HuberLossOptions& options_ = {});
+  explicit HuberLossImpl(HuberLossOptions options_ = {});
 
   void reset() override;
 
@@ -375,11 +364,9 @@ TORCH_MODULE(HuberLoss);
 /// ```
 /// MultiLabelMarginLoss model(MultiLabelMarginLossOptions(torch::kNone));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API MultiLabelMarginLossImpl
     : public Cloneable<MultiLabelMarginLossImpl> {
-  explicit MultiLabelMarginLossImpl(
-      const MultiLabelMarginLossOptions& options_ = {});
+  explicit MultiLabelMarginLossImpl(MultiLabelMarginLossOptions options_ = {});
 
   void reset() override;
 
@@ -415,9 +402,8 @@ TORCH_MODULE(MultiLabelMarginLoss);
 /// ```
 /// SoftMarginLoss model(SoftMarginLossOptions(torch::kNone));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API SoftMarginLossImpl : public Cloneable<SoftMarginLossImpl> {
-  explicit SoftMarginLossImpl(const SoftMarginLossOptions& options_ = {});
+  explicit SoftMarginLossImpl(SoftMarginLossOptions options_ = {});
 
   /// Pretty prints the `SoftMarginLoss` module into the given `stream`.
   void pretty_print(std::ostream& stream) const override;
@@ -454,11 +440,10 @@ TORCH_MODULE(SoftMarginLoss);
 /// MultiLabelSoftMarginLoss
 /// model(MultiLabelSoftMarginLossOptions().reduction(torch::kNone).weight(weight));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API MultiLabelSoftMarginLossImpl
     : public Cloneable<MultiLabelSoftMarginLossImpl> {
   explicit MultiLabelSoftMarginLossImpl(
-      const MultiLabelSoftMarginLossOptions& options_ = {});
+      MultiLabelSoftMarginLossOptions options_ = {});
 
   /// Pretty prints the `MultiLabelSoftMarginLoss` module into the given
   /// `stream`.
@@ -499,10 +484,9 @@ TORCH_MODULE(MultiLabelSoftMarginLoss);
 /// TripletMarginLoss
 /// model(TripletMarginLossOptions().margin(3).p(2).eps(1e-06).swap(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API TripletMarginLossImpl
     : public Cloneable<TripletMarginLossImpl> {
-  explicit TripletMarginLossImpl(const TripletMarginLossOptions& options_ = {});
+  explicit TripletMarginLossImpl(TripletMarginLossOptions options_ = {});
 
   void reset() override;
 
@@ -547,7 +531,6 @@ TORCH_MODULE(TripletMarginLoss);
 /// TripletMarginWithDistanceLoss
 /// model(TripletMarginWithDistanceLossOptions().margin(3).swap(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API TripletMarginWithDistanceLossImpl
     : public Cloneable<TripletMarginWithDistanceLossImpl> {
   explicit TripletMarginWithDistanceLossImpl(
@@ -591,9 +574,8 @@ TORCH_MODULE(TripletMarginWithDistanceLoss);
 /// CTCLoss
 /// model(CTCLossOptions().blank(42).zero_infinity(false).reduction(torch::kSum));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API CTCLossImpl : public Cloneable<CTCLossImpl> {
-  explicit CTCLossImpl(const CTCLossOptions& options_ = {});
+  explicit CTCLossImpl(CTCLossOptions options_ = {});
 
   void reset() override;
 
@@ -632,9 +614,8 @@ TORCH_MODULE(CTCLoss);
 /// PoissonNLLLoss
 /// model(PoissonNLLLossOptions().log_input(false).full(true).eps(0.42).reduction(torch::kSum));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API PoissonNLLLossImpl : public Cloneable<PoissonNLLLossImpl> {
-  explicit PoissonNLLLossImpl(const PoissonNLLLossOptions& options_ = {});
+  explicit PoissonNLLLossImpl(PoissonNLLLossOptions options_ = {});
 
   void reset() override;
 
@@ -671,10 +652,9 @@ TORCH_MODULE(PoissonNLLLoss);
 /// MarginRankingLoss
 /// model(MarginRankingLossOptions().margin(0.5).reduction(torch::kSum));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API MarginRankingLossImpl
     : public Cloneable<MarginRankingLossImpl> {
-  explicit MarginRankingLossImpl(const MarginRankingLossOptions& options_ = {});
+  explicit MarginRankingLossImpl(MarginRankingLossOptions options_ = {});
 
   void reset() override;
 
@@ -711,9 +691,8 @@ TORCH_MODULE(MarginRankingLoss);
 /// ```
 /// NLLLoss model(NLLLossOptions().ignore_index(-100).reduction(torch::kMean));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API NLLLossImpl : public Cloneable<NLLLossImpl> {
-  explicit NLLLossImpl(const NLLLossOptions& options_ = {});
+  explicit NLLLossImpl(NLLLossOptions options_ = {});
 
   /// Pretty prints the `NLLLoss` module into the given `stream`.
   void pretty_print(std::ostream& stream) const override;
@@ -752,9 +731,8 @@ TORCH_MODULE(NLLLoss);
 /// CrossEntropyLoss
 /// model(CrossEntropyLossOptions().ignore_index(-100).reduction(torch::kMean));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API CrossEntropyLossImpl : public Cloneable<CrossEntropyLossImpl> {
-  explicit CrossEntropyLossImpl(const CrossEntropyLossOptions& options_ = {});
+  explicit CrossEntropyLossImpl(CrossEntropyLossOptions options_ = {});
 
   void reset() override;
 
@@ -795,10 +773,9 @@ TORCH_MODULE(CrossEntropyLoss);
 /// BCEWithLogitsLoss
 /// model(BCEWithLogitsLossOptions().reduction(torch::kNone).weight(weight));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API BCEWithLogitsLossImpl
     : public Cloneable<BCEWithLogitsLossImpl> {
-  explicit BCEWithLogitsLossImpl(const BCEWithLogitsLossOptions& options_ = {});
+  explicit BCEWithLogitsLossImpl(BCEWithLogitsLossOptions options_ = {});
 
   void reset() override;
 
diff --git a/torch/csrc/api/include/torch/nn/modules/normalization.h b/torch/csrc/api/include/torch/nn/modules/normalization.h
index d57c26c94103..2f748ef79d0b 100644
--- a/torch/csrc/api/include/torch/nn/modules/normalization.h
+++ b/torch/csrc/api/include/torch/nn/modules/normalization.h
@@ -28,12 +28,11 @@ namespace nn {
 /// LayerNorm model(LayerNormOptions({2,
 /// 2}).elementwise_affine(false).eps(2e-5));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API LayerNormImpl : public torch::nn::Cloneable<LayerNormImpl> {
  public:
   LayerNormImpl(std::vector<int64_t> normalized_shape)
       : LayerNormImpl(LayerNormOptions(normalized_shape)) {}
-  explicit LayerNormImpl(const LayerNormOptions& options_);
+  explicit LayerNormImpl(LayerNormOptions options_);
 
   void reset() override;
 
@@ -90,7 +89,6 @@ TORCH_MODULE(LayerNorm);
 /// LocalResponseNorm
 /// model(LocalResponseNormOptions(2).alpha(0.0002).beta(0.85).k(2.));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API LocalResponseNormImpl
     : public Cloneable<LocalResponseNormImpl> {
  public:
@@ -125,7 +123,6 @@ TORCH_MODULE(LocalResponseNorm);
 /// ```
 /// CrossMapLRN2d model(CrossMapLRN2dOptions(3).alpha(1e-5).beta(0.1).k(10));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API CrossMapLRN2dImpl
     : public torch::nn::Cloneable<CrossMapLRN2dImpl> {
  public:
@@ -165,7 +162,6 @@ TORCH_MODULE(CrossMapLRN2d);
 /// ```
 /// GroupNorm model(GroupNormOptions(2, 2).eps(2e-5).affine(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API GroupNormImpl : public torch::nn::Cloneable<GroupNormImpl> {
  public:
   GroupNormImpl(int64_t num_groups, int64_t num_channels)
diff --git a/torch/csrc/api/include/torch/nn/modules/padding.h b/torch/csrc/api/include/torch/nn/modules/padding.h
index 3efa41af8fb8..95af62f376fb 100644
--- a/torch/csrc/api/include/torch/nn/modules/padding.h
+++ b/torch/csrc/api/include/torch/nn/modules/padding.h
@@ -11,7 +11,6 @@ namespace nn {
 
 /// Base class for all (dimension-specialized) ReflectionPad modules.
 template <size_t D, typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ReflectionPadImpl : public torch::nn::Cloneable<Derived> {
  public:
   ReflectionPadImpl(ExpandingArray<D * 2> padding)
@@ -43,7 +42,6 @@ class TORCH_API ReflectionPadImpl : public torch::nn::Cloneable<Derived> {
 /// ```
 /// ReflectionPad1d model(ReflectionPad1dOptions({3, 1}));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ReflectionPad1dImpl
     : public ReflectionPadImpl<1, ReflectionPad1dImpl> {
  public:
@@ -71,7 +69,6 @@ TORCH_MODULE(ReflectionPad1d);
 /// ```
 /// ReflectionPad2d model(ReflectionPad2dOptions({1, 1, 2, 0}));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ReflectionPad2dImpl
     : public ReflectionPadImpl<2, ReflectionPad2dImpl> {
  public:
@@ -100,7 +97,6 @@ TORCH_MODULE(ReflectionPad2d);
 /// ReflectionPad3d model(ReflectionPad3dOptions(1));
 /// ReflectionPad3d model(ReflectionPad3dOptions({1, 1, 2, 0, 1, 2}));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ReflectionPad3dImpl
     : public ReflectionPadImpl<3, ReflectionPad3dImpl> {
  public:
@@ -118,7 +114,6 @@ TORCH_MODULE(ReflectionPad3d);
 
 /// Base class for all (dimension-specialized) ReplicationPad modules.
 template <size_t D, typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ReplicationPadImpl : public torch::nn::Cloneable<Derived> {
  public:
   ReplicationPadImpl(ExpandingArray<D * 2> padding)
@@ -150,7 +145,6 @@ class TORCH_API ReplicationPadImpl : public torch::nn::Cloneable<Derived> {
 /// ```
 /// ReplicationPad1d model(ReplicationPad1dOptions({3, 1}));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ReplicationPad1dImpl
     : public ReplicationPadImpl<1, ReplicationPad1dImpl> {
  public:
@@ -178,7 +172,6 @@ TORCH_MODULE(ReplicationPad1d);
 /// ```
 /// ReplicationPad2d model(ReplicationPad2dOptions({1, 1, 2, 0}));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ReplicationPad2dImpl
     : public ReplicationPadImpl<2, ReplicationPad2dImpl> {
  public:
@@ -206,7 +199,6 @@ TORCH_MODULE(ReplicationPad2d);
 /// ```
 /// ReplicationPad3d model(ReplicationPad3dOptions({1, 2, 1, 2, 1, 2}));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ReplicationPad3dImpl
     : public ReplicationPadImpl<3, ReplicationPad3dImpl> {
  public:
@@ -233,7 +225,6 @@ TORCH_MODULE(ReplicationPad3d);
 /// ```
 /// ZeroPad2d model(ZeroPad2dOptions({1, 1, 2, 0}));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ZeroPad2dImpl : public Cloneable<ZeroPad2dImpl> {
  public:
   ZeroPad2dImpl(ExpandingArray<4> padding)
@@ -262,7 +253,6 @@ TORCH_MODULE(ZeroPad2d);
 
 /// Base class for all (dimension-specialized) ConstantPad modules.
 template <size_t D, typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ConstantPadImpl : public torch::nn::Cloneable<Derived> {
  public:
   ConstantPadImpl(ExpandingArray<D * 2> padding, double value)
@@ -293,7 +283,6 @@ class TORCH_API ConstantPadImpl : public torch::nn::Cloneable<Derived> {
 /// ```
 /// ConstantPad1d model(ConstantPad1dOptions({3, 1}, 3.5));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ConstantPad1dImpl
     : public ConstantPadImpl<1, ConstantPad1dImpl> {
  public:
@@ -320,7 +309,6 @@ TORCH_MODULE(ConstantPad1d);
 /// ```
 /// ConstantPad2d model(ConstantPad2dOptions({3, 0, 2, 1}, 3.5));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ConstantPad2dImpl
     : public ConstantPadImpl<2, ConstantPad2dImpl> {
  public:
@@ -347,7 +335,6 @@ TORCH_MODULE(ConstantPad2d);
 /// ```
 /// ConstantPad3d model(ConstantPad3dOptions({1, 2, 1, 2, 1, 2}, 3.5));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API ConstantPad3dImpl
     : public ConstantPadImpl<3, ConstantPad3dImpl> {
  public:
diff --git a/torch/csrc/api/include/torch/nn/modules/pixelshuffle.h b/torch/csrc/api/include/torch/nn/modules/pixelshuffle.h
index 3fb456f618a5..e47e68519105 100644
--- a/torch/csrc/api/include/torch/nn/modules/pixelshuffle.h
+++ b/torch/csrc/api/include/torch/nn/modules/pixelshuffle.h
@@ -25,7 +25,6 @@ namespace nn {
 /// ```
 /// PixelShuffle model(PixelShuffleOptions(5));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API PixelShuffleImpl
     : public torch::nn::Cloneable<PixelShuffleImpl> {
   explicit PixelShuffleImpl(const PixelShuffleOptions& options_);
@@ -63,7 +62,6 @@ TORCH_MODULE(PixelShuffle);
 /// ```
 /// PixelUnshuffle model(PixelUnshuffleOptions(5));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 struct TORCH_API PixelUnshuffleImpl
     : public torch::nn::Cloneable<PixelUnshuffleImpl> {
   explicit PixelUnshuffleImpl(const PixelUnshuffleOptions& options_);
diff --git a/torch/csrc/api/include/torch/nn/modules/pooling.h b/torch/csrc/api/include/torch/nn/modules/pooling.h
index 198ef0f2650b..522dc18fc5d1 100644
--- a/torch/csrc/api/include/torch/nn/modules/pooling.h
+++ b/torch/csrc/api/include/torch/nn/modules/pooling.h
@@ -13,7 +13,6 @@ namespace nn {
 
 /// Base class for all (dimension-specialized) avgpool modules.
 template <size_t D, typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AvgPoolImpl : public torch::nn::Cloneable<Derived> {
  public:
   AvgPoolImpl(ExpandingArray<D> kernel_size)
@@ -42,7 +41,6 @@ class TORCH_API AvgPoolImpl : public torch::nn::Cloneable<Derived> {
 /// ```
 /// AvgPool1d model(AvgPool1dOptions(3).stride(2));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AvgPool1dImpl : public AvgPoolImpl<1, AvgPool1dImpl> {
  public:
   using AvgPoolImpl<1, AvgPool1dImpl>::AvgPoolImpl;
@@ -69,7 +67,6 @@ TORCH_MODULE(AvgPool1d);
 /// ```
 /// AvgPool2d model(AvgPool2dOptions({3, 2}).stride({2, 2}));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AvgPool2dImpl : public AvgPoolImpl<2, AvgPool2dImpl> {
  public:
   using AvgPoolImpl<2, AvgPool2dImpl>::AvgPoolImpl;
@@ -96,7 +93,6 @@ TORCH_MODULE(AvgPool2d);
 /// ```
 /// AvgPool3d model(AvgPool3dOptions(5).stride(2));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AvgPool3dImpl : public AvgPoolImpl<3, AvgPool3dImpl> {
  public:
   using AvgPoolImpl<3, AvgPool3dImpl>::AvgPoolImpl;
@@ -114,7 +110,6 @@ TORCH_MODULE(AvgPool3d);
 
 /// Base class for all (dimension-specialized) maxpool modules.
 template <size_t D, typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API MaxPoolImpl : public torch::nn::Cloneable<Derived> {
  public:
   MaxPoolImpl(ExpandingArray<D> kernel_size)
@@ -143,7 +138,6 @@ class TORCH_API MaxPoolImpl : public torch::nn::Cloneable<Derived> {
 /// ```
 /// MaxPool1d model(MaxPool1dOptions(3).stride(2));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API MaxPool1dImpl : public MaxPoolImpl<1, MaxPool1dImpl> {
  public:
   using MaxPoolImpl<1, MaxPool1dImpl>::MaxPoolImpl;
@@ -174,7 +168,6 @@ TORCH_MODULE(MaxPool1d);
 /// ```
 /// MaxPool2d model(MaxPool2dOptions({3, 2}).stride({2, 2}));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API MaxPool2dImpl : public MaxPoolImpl<2, MaxPool2dImpl> {
  public:
   using MaxPoolImpl<2, MaxPool2dImpl>::MaxPoolImpl;
@@ -205,7 +198,6 @@ TORCH_MODULE(MaxPool2d);
 /// ```
 /// MaxPool3d model(MaxPool3dOptions(3).stride(2));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API MaxPool3dImpl : public MaxPoolImpl<3, MaxPool3dImpl> {
  public:
   using MaxPoolImpl<3, MaxPool3dImpl>::MaxPoolImpl;
@@ -227,7 +219,6 @@ TORCH_MODULE(MaxPool3d);
 
 /// Base class for all (dimension-specialized) adaptive maxpool modules.
 template <size_t D, typename output_size_t, typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AdaptiveMaxPoolImpl : public torch::nn::Cloneable<Derived> {
  public:
   AdaptiveMaxPoolImpl(output_size_t output_size)
@@ -263,7 +254,6 @@ class TORCH_API AdaptiveMaxPoolImpl : public torch::nn::Cloneable<Derived> {
 /// ```
 /// AdaptiveMaxPool1d model(AdaptiveMaxPool1dOptions(3));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AdaptiveMaxPool1dImpl
     : public AdaptiveMaxPoolImpl<1, ExpandingArray<1>, AdaptiveMaxPool1dImpl> {
  public:
@@ -297,7 +287,6 @@ TORCH_MODULE(AdaptiveMaxPool1d);
 /// ```
 /// AdaptiveMaxPool2d model(AdaptiveMaxPool2dOptions({3, 2}));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AdaptiveMaxPool2dImpl : public AdaptiveMaxPoolImpl<
                                             2,
                                             ExpandingArrayWithOptionalElem<2>,
@@ -335,7 +324,6 @@ TORCH_MODULE(AdaptiveMaxPool2d);
 /// ```
 /// AdaptiveMaxPool3d model(AdaptiveMaxPool3dOptions(3));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AdaptiveMaxPool3dImpl : public AdaptiveMaxPoolImpl<
                                             3,
                                             ExpandingArrayWithOptionalElem<3>,
@@ -364,7 +352,6 @@ TORCH_MODULE(AdaptiveMaxPool3d);
 
 /// Base class for all (dimension-specialized) adaptive avgpool modules.
 template <size_t D, typename output_size_t, typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AdaptiveAvgPoolImpl : public torch::nn::Cloneable<Derived> {
  public:
   AdaptiveAvgPoolImpl(output_size_t output_size)
@@ -400,7 +387,6 @@ class TORCH_API AdaptiveAvgPoolImpl : public torch::nn::Cloneable<Derived> {
 /// ```
 /// AdaptiveAvgPool1d model(AdaptiveAvgPool1dOptions(5));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AdaptiveAvgPool1dImpl
     : public AdaptiveAvgPoolImpl<1, ExpandingArray<1>, AdaptiveAvgPool1dImpl> {
  public:
@@ -430,7 +416,6 @@ TORCH_MODULE(AdaptiveAvgPool1d);
 /// ```
 /// AdaptiveAvgPool2d model(AdaptiveAvgPool2dOptions({3, 2}));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AdaptiveAvgPool2dImpl : public AdaptiveAvgPoolImpl<
                                             2,
                                             ExpandingArrayWithOptionalElem<2>,
@@ -464,7 +449,6 @@ TORCH_MODULE(AdaptiveAvgPool2d);
 /// ```
 /// AdaptiveAvgPool3d model(AdaptiveAvgPool3dOptions(3));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API AdaptiveAvgPool3dImpl : public AdaptiveAvgPoolImpl<
                                             3,
                                             ExpandingArrayWithOptionalElem<3>,
@@ -489,7 +473,6 @@ TORCH_MODULE(AdaptiveAvgPool3d);
 
 /// Base class for all (dimension-specialized) maxunpool modules.
 template <size_t D, typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API MaxUnpoolImpl : public torch::nn::Cloneable<Derived> {
  public:
   MaxUnpoolImpl(ExpandingArray<D> kernel_size)
@@ -518,7 +501,6 @@ class TORCH_API MaxUnpoolImpl : public torch::nn::Cloneable<Derived> {
 /// ```
 /// MaxUnpool1d model(MaxUnpool1dOptions(3).stride(2).padding(1));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API MaxUnpool1dImpl : public MaxUnpoolImpl<1, MaxUnpool1dImpl> {
  public:
   using MaxUnpoolImpl<1, MaxUnpool1dImpl>::MaxUnpoolImpl;
@@ -551,7 +533,6 @@ TORCH_MODULE(MaxUnpool1d);
 /// ```
 /// MaxUnpool2d model(MaxUnpool2dOptions(3).stride(2).padding(1));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API MaxUnpool2dImpl : public MaxUnpoolImpl<2, MaxUnpool2dImpl> {
  public:
   using MaxUnpoolImpl<2, MaxUnpool2dImpl>::MaxUnpoolImpl;
@@ -584,7 +565,6 @@ TORCH_MODULE(MaxUnpool2d);
 /// ```
 /// MaxUnpool3d model(MaxUnpool3dOptions(3).stride(2).padding(1));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API MaxUnpool3dImpl : public MaxUnpoolImpl<3, MaxUnpool3dImpl> {
  public:
   using MaxUnpoolImpl<3, MaxUnpool3dImpl>::MaxUnpoolImpl;
@@ -618,13 +598,12 @@ TORCH_MODULE(MaxUnpool3d);
 /// ```
 /// FractionalMaxPool2d model(FractionalMaxPool2dOptions(5).output_size(1));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API FractionalMaxPool2dImpl
     : public torch::nn::Cloneable<FractionalMaxPool2dImpl> {
  public:
   FractionalMaxPool2dImpl(ExpandingArray<2> kernel_size)
       : FractionalMaxPool2dImpl(FractionalMaxPool2dOptions(kernel_size)) {}
-  explicit FractionalMaxPool2dImpl(const FractionalMaxPool2dOptions& options_);
+  explicit FractionalMaxPool2dImpl(FractionalMaxPool2dOptions options_);
 
   void reset() override;
 
@@ -664,13 +643,12 @@ TORCH_MODULE(FractionalMaxPool2d);
 /// ```
 /// FractionalMaxPool3d model(FractionalMaxPool3dOptions(5).output_size(1));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API FractionalMaxPool3dImpl
     : public torch::nn::Cloneable<FractionalMaxPool3dImpl> {
  public:
   FractionalMaxPool3dImpl(ExpandingArray<3> kernel_size)
       : FractionalMaxPool3dImpl(FractionalMaxPool3dOptions(kernel_size)) {}
-  explicit FractionalMaxPool3dImpl(const FractionalMaxPool3dOptions& options_);
+  explicit FractionalMaxPool3dImpl(FractionalMaxPool3dOptions options_);
 
   void reset() override;
 
@@ -700,7 +678,6 @@ TORCH_MODULE(FractionalMaxPool3d);
 
 /// Base class for all (dimension-specialized) lppool modules.
 template <size_t D, typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API LPPoolImpl : public torch::nn::Cloneable<Derived> {
  public:
   LPPoolImpl(double norm_type, ExpandingArray<D> kernel_size)
@@ -728,7 +705,6 @@ class TORCH_API LPPoolImpl : public torch::nn::Cloneable<Derived> {
 /// ```
 /// LPPool1d model(LPPool1dOptions(1, 2).stride(5).ceil_mode(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API LPPool1dImpl : public LPPoolImpl<1, LPPool1dImpl> {
  public:
   using LPPoolImpl<1, LPPool1dImpl>::LPPoolImpl;
@@ -757,7 +733,6 @@ TORCH_MODULE(LPPool1d);
 /// LPPool2d model(LPPool2dOptions(1, std::vector<int64_t>({3, 4})).stride({5,
 /// 6}).ceil_mode(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API LPPool2dImpl : public LPPoolImpl<2, LPPool2dImpl> {
  public:
   using LPPoolImpl<2, LPPool2dImpl>::LPPoolImpl;
diff --git a/torch/csrc/api/include/torch/nn/modules/rnn.h b/torch/csrc/api/include/torch/nn/modules/rnn.h
index c3d892dc0f5f..2d15c807c2d4 100644
--- a/torch/csrc/api/include/torch/nn/modules/rnn.h
+++ b/torch/csrc/api/include/torch/nn/modules/rnn.h
@@ -22,7 +22,6 @@ namespace nn {
 namespace detail {
 /// Base class for all RNN implementations (intended for code sharing).
 template <typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API RNNImplBase : public torch::nn::Cloneable<Derived> {
  public:
   explicit RNNImplBase(const RNNOptionsBase& options_);
@@ -103,7 +102,6 @@ class TORCH_API RNNImplBase : public torch::nn::Cloneable<Derived> {
 /// RNN model(RNNOptions(128,
 /// 64).num_layers(3).dropout(0.2).nonlinearity(torch::kTanh));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API RNNImpl : public detail::RNNImplBase<RNNImpl> {
  public:
   RNNImpl(int64_t input_size, int64_t hidden_size)
@@ -153,7 +151,6 @@ TORCH_MODULE(RNN);
 /// LSTM model(LSTMOptions(2,
 /// 4).num_layers(3).batch_first(false).bidirectional(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API LSTMImpl : public detail::RNNImplBase<LSTMImpl> {
  public:
   LSTMImpl(int64_t input_size, int64_t hidden_size)
@@ -219,7 +216,6 @@ TORCH_MODULE(LSTM);
 /// GRU model(GRUOptions(2,
 /// 4).num_layers(3).batch_first(false).bidirectional(true));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API GRUImpl : public detail::RNNImplBase<GRUImpl> {
  public:
   GRUImpl(int64_t input_size, int64_t hidden_size)
@@ -261,7 +257,6 @@ TORCH_MODULE(GRU);
 namespace detail {
 /// Base class for all RNNCell implementations (intended for code sharing).
 template <typename Derived>
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API RNNCellImplBase : public torch::nn::Cloneable<Derived> {
  public:
   explicit RNNCellImplBase(const RNNCellOptionsBase& options_);
@@ -306,7 +301,6 @@ class TORCH_API RNNCellImplBase : public torch::nn::Cloneable<Derived> {
 /// RNNCell model(RNNCellOptions(20,
 /// 10).bias(false).nonlinearity(torch::kReLU));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API RNNCellImpl : public detail::RNNCellImplBase<RNNCellImpl> {
  public:
   RNNCellImpl(int64_t input_size, int64_t hidden_size)
@@ -346,7 +340,6 @@ TORCH_MODULE(RNNCell);
 /// ```
 /// LSTMCell model(LSTMCellOptions(20, 10).bias(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API LSTMCellImpl : public detail::RNNCellImplBase<LSTMCellImpl> {
  public:
   LSTMCellImpl(int64_t input_size, int64_t hidden_size)
@@ -386,7 +379,6 @@ TORCH_MODULE(LSTMCell);
 /// ```
 /// GRUCell model(GRUCellOptions(20, 10).bias(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API GRUCellImpl : public detail::RNNCellImplBase<GRUCellImpl> {
  public:
   GRUCellImpl(int64_t input_size, int64_t hidden_size)
diff --git a/torch/csrc/api/include/torch/nn/modules/transformer.h b/torch/csrc/api/include/torch/nn/modules/transformer.h
index d4e6264e1b3a..c8c417c7564b 100644
--- a/torch/csrc/api/include/torch/nn/modules/transformer.h
+++ b/torch/csrc/api/include/torch/nn/modules/transformer.h
@@ -31,7 +31,6 @@ namespace nn {
 /// ```
 /// Transformer trans(TransformerOptions(512, 8));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API TransformerImpl : public Cloneable<TransformerImpl> {
  public:
   explicit TransformerImpl(TransformerOptions options_);
diff --git a/torch/csrc/api/include/torch/nn/modules/transformercoder.h b/torch/csrc/api/include/torch/nn/modules/transformercoder.h
index 38d432e86a03..fd1998449abd 100644
--- a/torch/csrc/api/include/torch/nn/modules/transformercoder.h
+++ b/torch/csrc/api/include/torch/nn/modules/transformercoder.h
@@ -33,7 +33,6 @@ namespace nn {
 /// encoder(TransformerEncoderOptions(encoderLayer,
 /// 6).norm(LayerNorm(LayerNormOptions({2}))));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API TransformerEncoderImpl
     : public Cloneable<TransformerEncoderImpl> {
  public:
@@ -95,7 +94,6 @@ TORCH_MODULE(TransformerEncoder);
 /// torch::rand({10, 32, 512}); const auto tgt = torch::rand({20, 32, 512});
 /// auto out = transformer_decoder(tgt, memory);
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API TransformerDecoderImpl
     : public Cloneable<TransformerDecoderImpl> {
  public:
diff --git a/torch/csrc/api/include/torch/nn/modules/transformerlayer.h b/torch/csrc/api/include/torch/nn/modules/transformerlayer.h
index 1c8ffc98ad2d..0378226b1563 100644
--- a/torch/csrc/api/include/torch/nn/modules/transformerlayer.h
+++ b/torch/csrc/api/include/torch/nn/modules/transformerlayer.h
@@ -33,15 +33,13 @@ namespace nn {
 /// TransformerEncoderLayer encoderLayer(TransformerEncoderLayerOptions(512,
 /// 8).dropout(0.1));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API TransformerEncoderLayerImpl
     : public Cloneable<TransformerEncoderLayerImpl> {
  public:
   TransformerEncoderLayerImpl(int64_t d_model, int64_t nhead)
       : TransformerEncoderLayerImpl(
             TransformerEncoderLayerOptions(d_model, nhead)) {}
-  explicit TransformerEncoderLayerImpl(
-      const TransformerEncoderLayerOptions& options_);
+  explicit TransformerEncoderLayerImpl(TransformerEncoderLayerOptions options_);
 
   Tensor forward(
       const Tensor& src,
@@ -110,15 +108,13 @@ TORCH_MODULE(TransformerEncoderLayer);
 /// TransformerDecoderLayer model(TransformerDecoderLayerOptions(512,
 /// 8).dropout(0.2));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API TransformerDecoderLayerImpl
     : public Cloneable<TransformerDecoderLayerImpl> {
  public:
   TransformerDecoderLayerImpl(int64_t d_model, int64_t nhead)
       : TransformerDecoderLayerImpl(
             TransformerDecoderLayerOptions(d_model, nhead)) {}
-  explicit TransformerDecoderLayerImpl(
-      const TransformerDecoderLayerOptions& options_);
+  explicit TransformerDecoderLayerImpl(TransformerDecoderLayerOptions options_);
 
   void reset() override;
 
diff --git a/torch/csrc/api/include/torch/nn/modules/upsampling.h b/torch/csrc/api/include/torch/nn/modules/upsampling.h
index 5340b9337501..6db8b04d574a 100644
--- a/torch/csrc/api/include/torch/nn/modules/upsampling.h
+++ b/torch/csrc/api/include/torch/nn/modules/upsampling.h
@@ -29,7 +29,6 @@ namespace nn {
 /// Upsample
 /// model(UpsampleOptions().scale_factor({3}).mode(torch::kLinear).align_corners(false));
 /// ```
-// NOLINTNEXTLINE(bugprone-exception-escape)
 class TORCH_API UpsampleImpl : public Cloneable<UpsampleImpl> {
  public:
   explicit UpsampleImpl(const UpsampleOptions& options_ = {});
diff --git a/torch/csrc/api/include/torch/nn/utils/clip_grad.h b/torch/csrc/api/include/torch/nn/utils/clip_grad.h
index 1a55da9590b3..e1023bd1eb5c 100644
--- a/torch/csrc/api/include/torch/nn/utils/clip_grad.h
+++ b/torch/csrc/api/include/torch/nn/utils/clip_grad.h
@@ -19,7 +19,7 @@ namespace utils {
 // sense!) in order to return a CPU-side `double`. This C++ version therefore
 // cannot be run fully asynchronously w.r.t. the device of the gradients.
 inline double clip_grad_norm_(
-    std::vector<Tensor> parameters,
+    const std::vector<Tensor>& parameters,
     double max_norm,
     double norm_type = 2.0,
     bool error_if_nonfinite = false) {
@@ -118,7 +118,7 @@ inline double clip_grad_norm_(
 // See https://pytorch.org/docs/stable/nn.html#clip-grad-value
 // for more details about this module.
 inline void clip_grad_value_(
-    std::vector<Tensor> parameters,
+    const std::vector<Tensor>& parameters,
     double clip_value) {
   for (const auto& param : parameters) {
     if (param.grad().defined()) {
diff --git a/torch/csrc/api/include/torch/nn/utils/convert_parameters.h b/torch/csrc/api/include/torch/nn/utils/convert_parameters.h
index e08bb6228389..2ac1d317c992 100644
--- a/torch/csrc/api/include/torch/nn/utils/convert_parameters.h
+++ b/torch/csrc/api/include/torch/nn/utils/convert_parameters.h
@@ -56,20 +56,18 @@ inline torch::Tensor parameters_to_vector(
 // Convert one vector to the parameters
 inline void vector_to_parameters(
     const torch::Tensor& vec,
-    std::vector<torch::Tensor> parameters) {
+    const std::vector<torch::Tensor>& parameters) {
   // Flag for the device where the parameter is located
   c10::optional<int64_t> param_device;
 
   // Pointer for slicing the vector for each parameter
   int64_t pointer = 0;
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  int64_t num_param;
-  for (torch::Tensor& param : parameters) {
+  for (const torch::Tensor& param : parameters) {
     // Ensure the parameters are located in the same device
     param_device = _check_param_device(param, param_device);
 
     // The length of the parameter
-    num_param = param.numel();
+    auto num_param = param.numel();
     // Slice the vector, reshape it, and replace the old data of the parameter
     param.set_data(
         vec.slice(0, pointer, pointer + num_param).view_as(param).data());
diff --git a/torch/csrc/api/include/torch/optim/lbfgs.h b/torch/csrc/api/include/torch/optim/lbfgs.h
index 7b360ca86eb9..7d7204caf3ee 100644
--- a/torch/csrc/api/include/torch/optim/lbfgs.h
+++ b/torch/csrc/api/include/torch/optim/lbfgs.h
@@ -34,13 +34,12 @@ struct TORCH_API LBFGSOptions : public OptimizerCloneableOptions<LBFGSOptions> {
   void set_lr(const double lr) override;
 };
 
-// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
 struct TORCH_API LBFGSParamState
     : public OptimizerCloneableParamState<LBFGSParamState> {
   TORCH_ARG(int64_t, func_evals) = 0;
   TORCH_ARG(int64_t, n_iter) = 0;
-  TORCH_ARG(double, t);
-  TORCH_ARG(double, prev_loss);
+  TORCH_ARG(double, t) = 0;
+  TORCH_ARG(double, prev_loss) = 0;
   TORCH_ARG(Tensor, d) = {};
   TORCH_ARG(Tensor, H_diag) = {};
   TORCH_ARG(Tensor, prev_flat_grad) = {};
diff --git a/torch/csrc/api/include/torch/optim/optimizer.h b/torch/csrc/api/include/torch/optim/optimizer.h
index 9029ee5ccbb5..c75639837a3a 100644
--- a/torch/csrc/api/include/torch/optim/optimizer.h
+++ b/torch/csrc/api/include/torch/optim/optimizer.h
@@ -91,9 +91,7 @@ class TORCH_API OptimizerParamGroup {
   const std::vector<Tensor>& params() const;
 
  protected:
-  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
   std::vector<Tensor> params_;
-  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
   std::unique_ptr<OptimizerOptions> options_;
 };
 
@@ -134,7 +132,7 @@ class TORCH_API Optimizer {
   void add_parameters(const std::vector<Tensor>& parameters);
 
   /// Zeros out the gradients of all parameters.
-  void zero_grad();
+  void zero_grad(bool set_to_none = true);
 
   /// Provides a const reference to the parameters in the first param_group this
   /// optimizer holds.
@@ -172,11 +170,8 @@ class TORCH_API Optimizer {
   virtual void load(serialize::InputArchive& archive);
 
  protected:
-  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
   std::vector<OptimizerParamGroup> param_groups_;
-  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
   ska::flat_hash_map<std::string, std::unique_ptr<OptimizerParamState>> state_;
-  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
   std::unique_ptr<OptimizerOptions> defaults_;
 };
 
diff --git a/torch/csrc/api/include/torch/optim/schedulers/lr_scheduler.h b/torch/csrc/api/include/torch/optim/schedulers/lr_scheduler.h
index 4a24c10de252..26d324fbecce 100644
--- a/torch/csrc/api/include/torch/optim/schedulers/lr_scheduler.h
+++ b/torch/csrc/api/include/torch/optim/schedulers/lr_scheduler.h
@@ -28,8 +28,7 @@ class TORCH_API LRScheduler {
   // Get current learning rates from the optimizer
   std::vector<double> get_current_lrs() const;
 
-  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
-  unsigned step_count_;
+  unsigned step_count_{};
 
  private:
   void set_optimizer_lrs(const std::vector<double>& learning_rates);
diff --git a/torch/csrc/api/src/nn/modules/batchnorm.cpp b/torch/csrc/api/src/nn/modules/batchnorm.cpp
index 8032001857ec..105bd16f9d68 100644
--- a/torch/csrc/api/src/nn/modules/batchnorm.cpp
+++ b/torch/csrc/api/src/nn/modules/batchnorm.cpp
@@ -11,8 +11,6 @@
 #include <utility>
 #include <vector>
 
-namespace F = torch::nn::functional;
-
 namespace torch {
 namespace nn {
 
diff --git a/torch/csrc/api/src/nn/modules/embedding.cpp b/torch/csrc/api/src/nn/modules/embedding.cpp
index 5354cef48625..95982482601d 100644
--- a/torch/csrc/api/src/nn/modules/embedding.cpp
+++ b/torch/csrc/api/src/nn/modules/embedding.cpp
@@ -13,8 +13,8 @@ namespace F = torch::nn::functional;
 
 namespace torch {
 namespace nn {
-EmbeddingImpl::EmbeddingImpl(const EmbeddingOptions& options_)
-    : options(options_) { // NOLINT(modernize-pass-by-value)
+EmbeddingImpl::EmbeddingImpl(EmbeddingOptions options_)
+    : options(std::move(options_)) {
   // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
   reset();
 }
@@ -89,8 +89,8 @@ torch::Tensor EmbeddingImpl::forward(const Tensor& input) {
       options.sparse());
 }
 
-EmbeddingBagImpl::EmbeddingBagImpl(const EmbeddingBagOptions& options_)
-    : options(options_) { // NOLINT(modernize-pass-by-value)
+EmbeddingBagImpl::EmbeddingBagImpl(EmbeddingBagOptions options_)
+    : options(std::move(options_)) {
   // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
   reset();
 }
diff --git a/torch/csrc/api/src/nn/modules/instancenorm.cpp b/torch/csrc/api/src/nn/modules/instancenorm.cpp
index a7eb31882e7d..99ab1d7d6708 100644
--- a/torch/csrc/api/src/nn/modules/instancenorm.cpp
+++ b/torch/csrc/api/src/nn/modules/instancenorm.cpp
@@ -1,8 +1,6 @@
 #include <torch/nn/functional/instancenorm.h>
 #include <torch/nn/modules/instancenorm.h>
 
-namespace F = torch::nn::functional;
-
 namespace torch {
 namespace nn {
 
diff --git a/torch/csrc/api/src/nn/modules/loss.cpp b/torch/csrc/api/src/nn/modules/loss.cpp
index 3e4ecca31d84..0b7ec33b53ad 100644
--- a/torch/csrc/api/src/nn/modules/loss.cpp
+++ b/torch/csrc/api/src/nn/modules/loss.cpp
@@ -5,7 +5,7 @@ namespace F = torch::nn::functional;
 namespace torch {
 namespace nn {
 
-L1LossImpl::L1LossImpl(const L1LossOptions& options_) : options(options_) {}
+L1LossImpl::L1LossImpl(L1LossOptions options_) : options(std::move(options_)) {}
 
 void L1LossImpl::reset() {}
 
@@ -19,8 +19,8 @@ Tensor L1LossImpl::forward(const Tensor& input, const Tensor& target) {
 
 // ============================================================================
 
-KLDivLossImpl::KLDivLossImpl(const KLDivLossOptions& options_)
-    : options(options_) {}
+KLDivLossImpl::KLDivLossImpl(KLDivLossOptions options_)
+    : options(std::move(options_)) {}
 
 void KLDivLossImpl::reset() {}
 
@@ -35,7 +35,8 @@ Tensor KLDivLossImpl::forward(const Tensor& input, const Tensor& target) {
 
 // ============================================================================
 
-MSELossImpl::MSELossImpl(const MSELossOptions& options_) : options(options_) {}
+MSELossImpl::MSELossImpl(MSELossOptions options_)
+    : options(std::move(options_)) {}
 
 void MSELossImpl::reset() {}
 
@@ -49,8 +50,8 @@ Tensor MSELossImpl::forward(const Tensor& input, const Tensor& target) {
 
 // ============================================================================
 
-BCELossImpl::BCELossImpl(const BCELossOptions& options_)
-    : options(options_) { // NOLINT(modernize-pass-by-value)
+BCELossImpl::BCELossImpl(BCELossOptions options_)
+    : options(std::move(options_)) {
   // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
   reset();
 }
@@ -71,8 +72,8 @@ Tensor BCELossImpl::forward(const Tensor& input, const Tensor& target) {
 // ============================================================================
 
 HingeEmbeddingLossImpl::HingeEmbeddingLossImpl(
-    const HingeEmbeddingLossOptions& options_)
-    : options(options_) {}
+    HingeEmbeddingLossOptions options_)
+    : options(std::move(options_)) {}
 
 void HingeEmbeddingLossImpl::reset() {}
 
@@ -89,9 +90,8 @@ Tensor HingeEmbeddingLossImpl::forward(
 
 // ============================================================================
 
-MultiMarginLossImpl::MultiMarginLossImpl(
-    const MultiMarginLossOptions& options_) // NOLINT(modernize-pass-by-value)
-    : options(options_) {
+MultiMarginLossImpl::MultiMarginLossImpl(MultiMarginLossOptions options_)
+    : options(std::move(options_)) {
   // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
   reset();
 }
@@ -125,8 +125,8 @@ Tensor MultiMarginLossImpl::forward(const Tensor& input, const Tensor& target) {
 // ============================================================================
 
 CosineEmbeddingLossImpl::CosineEmbeddingLossImpl(
-    const CosineEmbeddingLossOptions& options_)
-    : options(options_) {}
+    CosineEmbeddingLossOptions options_)
+    : options(std::move(options_)) {}
 
 void CosineEmbeddingLossImpl::reset() {}
 
@@ -144,9 +144,8 @@ Tensor CosineEmbeddingLossImpl::forward(
 // ============================================================================
 
 MultiLabelSoftMarginLossImpl::MultiLabelSoftMarginLossImpl(
-    const torch::nn::MultiLabelSoftMarginLossOptions&
-        options_) // NOLINT(modernize-pass-by-value)
-    : options(options_) {
+    torch::nn::MultiLabelSoftMarginLossOptions options_)
+    : options(std::move(options_)) {
   // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
   reset();
 }
@@ -168,9 +167,8 @@ Tensor MultiLabelSoftMarginLossImpl::forward(
 
 // ============================================================================
 
-TripletMarginLossImpl::TripletMarginLossImpl(
-    const TripletMarginLossOptions& options_)
-    : options(options_) {}
+TripletMarginLossImpl::TripletMarginLossImpl(TripletMarginLossOptions options_)
+    : options(std::move(options_)) {}
 
 void TripletMarginLossImpl::reset() {}
 
@@ -227,8 +225,8 @@ Tensor TripletMarginWithDistanceLossImpl::forward(
 // ============================================================================
 
 MultiLabelMarginLossImpl::MultiLabelMarginLossImpl(
-    const torch::nn::MultiLabelMarginLossOptions& options_)
-    : options(options_) {}
+    torch::nn::MultiLabelMarginLossOptions options_)
+    : options(std::move(options_)) {}
 
 void MultiLabelMarginLossImpl::reset() {}
 
@@ -245,8 +243,8 @@ Tensor MultiLabelMarginLossImpl::forward(
 // ============================================================================
 
 SoftMarginLossImpl::SoftMarginLossImpl(
-    const torch::nn::SoftMarginLossOptions& options_)
-    : options(options_) {}
+    torch::nn::SoftMarginLossOptions options_)
+    : options(std::move(options_)) {}
 
 void SoftMarginLossImpl::reset() {}
 
@@ -260,9 +258,8 @@ Tensor SoftMarginLossImpl::forward(const Tensor& input, const Tensor& target) {
 
 // ============================================================================
 
-SmoothL1LossImpl::SmoothL1LossImpl(
-    const torch::nn::SmoothL1LossOptions& options_)
-    : options(options_) {}
+SmoothL1LossImpl::SmoothL1LossImpl(torch::nn::SmoothL1LossOptions options_)
+    : options(std::move(options_)) {}
 
 void SmoothL1LossImpl::reset() {}
 
@@ -277,8 +274,8 @@ Tensor SmoothL1LossImpl::forward(const Tensor& input, const Tensor& target) {
 
 // ============================================================================
 
-HuberLossImpl::HuberLossImpl(const torch::nn::HuberLossOptions& options_)
-    : options(options_) {}
+HuberLossImpl::HuberLossImpl(torch::nn::HuberLossOptions options_)
+    : options(std::move(options_)) {}
 
 void HuberLossImpl::reset() {}
 
@@ -293,7 +290,8 @@ Tensor HuberLossImpl::forward(const Tensor& input, const Tensor& target) {
 
 // ============================================================================
 
-CTCLossImpl::CTCLossImpl(const CTCLossOptions& options_) : options(options_) {}
+CTCLossImpl::CTCLossImpl(CTCLossOptions options_)
+    : options(std::move(options_)) {}
 
 void CTCLossImpl::reset() {}
 
@@ -318,8 +316,8 @@ Tensor CTCLossImpl::forward(
 
 // ============================================================================
 
-PoissonNLLLossImpl::PoissonNLLLossImpl(const PoissonNLLLossOptions& options_)
-    : options(options_) {}
+PoissonNLLLossImpl::PoissonNLLLossImpl(PoissonNLLLossOptions options_)
+    : options(std::move(options_)) {}
 
 void PoissonNLLLossImpl::reset() {}
 
@@ -341,9 +339,8 @@ Tensor PoissonNLLLossImpl::forward(
 
 // ============================================================================
 
-MarginRankingLossImpl::MarginRankingLossImpl(
-    const MarginRankingLossOptions& options_)
-    : options(options_) {}
+MarginRankingLossImpl::MarginRankingLossImpl(MarginRankingLossOptions options_)
+    : options(std::move(options_)) {}
 
 void MarginRankingLossImpl::reset() {}
 
@@ -361,9 +358,8 @@ Tensor MarginRankingLossImpl::forward(
 
 // ============================================================================
 
-NLLLossImpl::NLLLossImpl(
-    const NLLLossOptions& options_) // NOLINT(modernize-pass-by-value)
-    : options(options_) {
+NLLLossImpl::NLLLossImpl(NLLLossOptions options_)
+    : options(std::move(options_)) {
   // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
   reset();
 }
@@ -383,9 +379,8 @@ Tensor NLLLossImpl::forward(const Tensor& input, const Tensor& target) {
 
 // ============================================================================
 
-CrossEntropyLossImpl::CrossEntropyLossImpl(
-    const CrossEntropyLossOptions& options_) // NOLINT(modernize-pass-by-value)
-    : options(options_) {
+CrossEntropyLossImpl::CrossEntropyLossImpl(CrossEntropyLossOptions options_)
+    : options(std::move(options_)) {
   // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
   reset();
 }
@@ -412,10 +407,8 @@ Tensor CrossEntropyLossImpl::forward(
 
 // ============================================================================
 
-BCEWithLogitsLossImpl::BCEWithLogitsLossImpl(
-    // NOLINTNEXTLINE(modernize-pass-by-value)
-    const BCEWithLogitsLossOptions& options_)
-    : options(options_) {
+BCEWithLogitsLossImpl::BCEWithLogitsLossImpl(BCEWithLogitsLossOptions options_)
+    : options(std::move(options_)) {
   // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
   reset();
 }
diff --git a/torch/csrc/api/src/nn/modules/normalization.cpp b/torch/csrc/api/src/nn/modules/normalization.cpp
index e64433b5a665..8170ecb8ae7a 100644
--- a/torch/csrc/api/src/nn/modules/normalization.cpp
+++ b/torch/csrc/api/src/nn/modules/normalization.cpp
@@ -12,8 +12,8 @@ namespace F = torch::nn::functional;
 namespace torch {
 namespace nn {
 
-LayerNormImpl::LayerNormImpl(const LayerNormOptions& options_)
-    : options(options_) { // NOLINT(modernize-pass-by-value)
+LayerNormImpl::LayerNormImpl(LayerNormOptions options_)
+    : options(std::move(options_)) {
   // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
   reset();
 }
diff --git a/torch/csrc/api/src/nn/modules/pooling.cpp b/torch/csrc/api/src/nn/modules/pooling.cpp
index 8fef6353685a..c465a9acf404 100644
--- a/torch/csrc/api/src/nn/modules/pooling.cpp
+++ b/torch/csrc/api/src/nn/modules/pooling.cpp
@@ -272,9 +272,8 @@ template class MaxUnpoolImpl<3, MaxUnpool3dImpl>;
 // ============================================================================
 
 FractionalMaxPool2dImpl::FractionalMaxPool2dImpl(
-    const FractionalMaxPool2dOptions&
-        options_) // NOLINT(modernize-pass-by-value)
-    : options(options_) {
+    FractionalMaxPool2dOptions options_)
+    : options(std::move(options_)) {
   // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
   reset();
 }
@@ -332,9 +331,8 @@ void FractionalMaxPool2dImpl::pretty_print(std::ostream& stream) const {
 }
 
 FractionalMaxPool3dImpl::FractionalMaxPool3dImpl(
-    const FractionalMaxPool3dOptions&
-        options_) // NOLINT(modernize-pass-by-value)
-    : options(options_) {
+    FractionalMaxPool3dOptions options_)
+    : options(std::move(options_)) {
   // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
   reset();
 }
diff --git a/torch/csrc/api/src/nn/modules/transformer.cpp b/torch/csrc/api/src/nn/modules/transformer.cpp
index df08c629da56..7f007460a714 100644
--- a/torch/csrc/api/src/nn/modules/transformer.cpp
+++ b/torch/csrc/api/src/nn/modules/transformer.cpp
@@ -13,8 +13,8 @@ namespace nn {
 
 // ========================TransformerEncoderLayerImpl=========================
 TransformerEncoderLayerImpl::TransformerEncoderLayerImpl(
-    const TransformerEncoderLayerOptions& options_)
-    : options(options_) {
+    TransformerEncoderLayerOptions options_)
+    : options(std::move(options_)) {
   // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
   reset();
 }
@@ -91,8 +91,8 @@ Tensor TransformerEncoderLayerImpl::forward(
 
 // ========================TransformerDecoderLayerImpl=========================
 TransformerDecoderLayerImpl::TransformerDecoderLayerImpl(
-    const TransformerDecoderLayerOptions& options_)
-    : options(options_) {
+    TransformerDecoderLayerOptions options_)
+    : options(std::move(options_)) {
   // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall)
   reset();
 }
diff --git a/torch/csrc/api/src/optim/optimizer.cpp b/torch/csrc/api/src/optim/optimizer.cpp
index f73e54d2835f..d7aa7012611b 100644
--- a/torch/csrc/api/src/optim/optimizer.cpp
+++ b/torch/csrc/api/src/optim/optimizer.cpp
@@ -121,12 +121,15 @@ void Optimizer::add_parameters(const std::vector<Tensor>& parameters) {
   parameters_.insert(parameters_.end(), parameters.begin(), parameters.end());
 }
 
-void Optimizer::zero_grad() {
+void Optimizer::zero_grad(bool set_to_none) {
   for (auto& group : param_groups_) {
     for (auto& p : group.params()) {
-      if (p.grad().defined()) {
-        p.grad().detach_();
-        p.grad().zero_();
+      if (p.mutable_grad().defined()) {
+        p.mutable_grad().detach_();
+        if (set_to_none)
+          p.mutable_grad().reset();
+        else
+          p.mutable_grad().zero_();
       }
     }
   }
diff --git a/torch/csrc/api/src/optim/schedulers/lr_scheduler.cpp b/torch/csrc/api/src/optim/schedulers/lr_scheduler.cpp
index 00b30a74fb88..1c2aa1b91eef 100644
--- a/torch/csrc/api/src/optim/schedulers/lr_scheduler.cpp
+++ b/torch/csrc/api/src/optim/schedulers/lr_scheduler.cpp
@@ -5,7 +5,7 @@ namespace torch {
 namespace optim {
 
 LRScheduler::LRScheduler(torch::optim::Optimizer& optimizer)
-    : step_count_(0), optimizer_(optimizer) {}
+    : optimizer_(optimizer) {}
 
 void LRScheduler::step() {
   std::vector<double> learning_rates = get_lrs();
@@ -31,7 +31,7 @@ void LRScheduler::set_optimizer_lrs(const std::vector<double>& learning_rates) {
 
 std::vector<double> LRScheduler::get_current_lrs() const {
   std::vector<double> learnings_rates(optimizer_.param_groups().size());
-  if (learnings_rates.size() > 0) {
+  if (!learnings_rates.empty()) {
     for (const auto i : c10::irange(optimizer_.param_groups().size())) {
       learnings_rates[i] = optimizer_.param_groups()[i].options().get_lr();
     }
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 57fa3686e71b..a30b7f519e77 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -30,6 +30,7 @@
 #include <ciso646>
 #include <functional>
 #include <numeric>
+#include <utility>
 
 // Helper functions for autogenerated code
 // These used to be inlined into the codegened Functions.cpp
@@ -103,7 +104,7 @@ template <typename T>
 T not_implemented_base(const char* name, const char* reason) {
   std::string msg =
       c10::str("the derivative for '", name, "' is not implemented.");
-  if (strlen(reason) > 0) {
+  if (reason[0] != '\0') {
     msg = c10::str(msg, " ", reason);
   };
   TORCH_CHECK_NOT_IMPLEMENTED(false, msg);
@@ -134,7 +135,7 @@ Tensor maybe_multiply(const Tensor& t, const Scalar& s) {
 
 int64_t _safe_size(IntArrayRef sizes, IntArrayRef dim) {
   int64_t size = 1;
-  if (sizes.size() == 0) {
+  if (sizes.empty()) {
     return 1;
   }
   for (auto d : dim) {
@@ -146,7 +147,7 @@ int64_t _safe_size(IntArrayRef sizes, IntArrayRef dim) {
 
 c10::SymInt _safe_size(c10::SymIntArrayRef sizes, c10::IntArrayRef dim) {
   c10::SymInt size = 1;
-  if (sizes.size() == 0) {
+  if (sizes.empty()) {
     return 1;
   }
   for (auto d : dim) {
@@ -361,7 +362,7 @@ Tensor norm_jvp(
     const Tensor& self_t,
     const optional<Scalar>& p_,
     Tensor norm) {
-  return norm_jvp(self_p, self_t, p_, norm, {}, true);
+  return norm_jvp(self_p, self_t, p_, std::move(norm), {}, true);
 }
 
 Tensor _nested_from_padded_backward(
@@ -389,7 +390,7 @@ Tensor linalg_vector_norm_jvp(
   // No need to handle the dtype arg as it's handled via broadcasting in the
   // function
   auto dim = opt_dim.value_or(IntArrayRef({}));
-  return norm_jvp(self_p, self_t, scalar_ord, norm, dim, keepdim);
+  return norm_jvp(self_p, self_t, scalar_ord, std::move(norm), dim, keepdim);
 }
 
 Tensor linalg_vector_norm_backward(
@@ -402,7 +403,8 @@ Tensor linalg_vector_norm_backward(
   // No need to handle the dtype arg as it's handled via broadcasting in the
   // function
   auto dim = opt_dim.value_or(IntArrayRef({}));
-  return norm_backward(grad, self, scalar_ord, norm, dim, keepdim);
+  return norm_backward(
+      std::move(grad), self, scalar_ord, std::move(norm), dim, keepdim);
 }
 
 Tensor pow_backward(Tensor grad, const Tensor& self, const Scalar& exponent) {
@@ -415,7 +417,7 @@ Tensor pow_backward(Tensor grad, const Tensor& self, const Scalar& exponent) {
     Tensor out = (exponent.isComplex())
         ? grad_lambda(exponent.toComplexDouble())
         : grad_lambda(exponent.toDouble());
-    return handle_r_to_c(self, out);
+    return handle_r_to_c(self, std::move(out));
   }
 }
 
@@ -427,7 +429,7 @@ Tensor pow_backward_self(
       exponent == 0.0,
       at::zeros({}, grad.options()),
       grad * (exponent * self.pow(exponent - 1)).conj());
-  return handle_r_to_c(self, out);
+  return handle_r_to_c(self, std::move(out));
 }
 
 // Caveats:
@@ -451,11 +453,15 @@ Tensor pow_backward_exponent(
   } else {
     cond = at::logical_and(self == 0, exponent >= 0);
   }
+  auto promoted_dtype = at::result_type(self, exponent);
+  // `.to()` is no-op if dtype is same.
+  auto self_ = self.to(promoted_dtype);
+
   auto out =
       grad *
       at::where(
-          cond, at::zeros({}, grad.options()), (result * self.log()).conj());
-  return handle_r_to_c(exponent, out);
+          cond, at::zeros({}, grad.options()), (result * self_.log()).conj());
+  return handle_r_to_c(exponent, std::move(out));
 }
 
 Tensor pow_backward_exponent(
@@ -464,6 +470,9 @@ Tensor pow_backward_exponent(
     const Tensor& exponent,
     Tensor result) {
   auto grad_lambda = [](Tensor a, Scalar b) { return (a * b.log()).conj(); };
+  auto base_ = exponent.is_complex() && !base.isComplex()
+      ? base.toComplexDouble()
+      : base;
   if (base.equal(0.0)) {
     auto cond = [](auto exp) {
       if (exp.is_complex()) {
@@ -475,11 +484,11 @@ Tensor pow_backward_exponent(
     auto out = grad *
         at::where(cond(exponent),
                   at::zeros({}, grad.options()),
-                  grad_lambda(result, base));
-    return handle_r_to_c(exponent, out);
+                  grad_lambda(std::move(result), base_));
+    return handle_r_to_c(exponent, std::move(out));
   } else {
-    auto out = grad * grad_lambda(result, base);
-    return handle_r_to_c(exponent, out);
+    auto out = grad * grad_lambda(std::move(result), base_);
+    return handle_r_to_c(exponent, std::move(out));
   }
 }
 
@@ -519,14 +528,18 @@ Tensor masked_fill_backward(const Tensor& grad, const Tensor& mask) {
       : grad.masked_select(mask).sum();
 }
 
-Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st) {
+template <typename T>
+Tensor mul_tensor_backward(Tensor grad, T other, ScalarType self_st) {
   auto out = grad * other.conj();
-  return handle_r_to_c(self_st, out);
+  return handle_r_to_c(self_st, std::move(out));
 }
+template Tensor mul_tensor_backward(Tensor, Tensor, ScalarType);
+template Tensor mul_tensor_backward(Tensor, Scalar, ScalarType);
 
+template <typename T>
 Tensor div_tensor_self_backward(
     Tensor grad,
-    Tensor other,
+    T other,
     ScalarType self_st,
     const c10::optional<c10::string_view>& rounding_mode) {
   if (rounding_mode.has_value()) {
@@ -534,12 +547,26 @@ Tensor div_tensor_self_backward(
   }
 
   auto result = grad / other.conj();
-  return handle_r_to_c(self_st, result);
-}
+  return handle_r_to_c(self_st, std::move(result));
+}
+template Tensor div_tensor_self_backward(
+    Tensor,
+    Tensor,
+    ScalarType,
+    const c10::optional<c10::string_view>&);
+template Tensor div_tensor_self_backward(
+    Tensor,
+    Scalar,
+    ScalarType,
+    const c10::optional<c10::string_view>&);
 
-Tensor div_tensor_self_backward(Tensor grad, Tensor other, ScalarType self_st) {
-  return div_tensor_self_backward(grad, other, self_st, c10::nullopt);
+template <typename T>
+Tensor div_tensor_self_backward(Tensor grad, T other, ScalarType self_st) {
+  return div_tensor_self_backward(
+      std::move(grad), std::move(other), self_st, c10::nullopt);
 }
+template Tensor div_tensor_self_backward(Tensor, Tensor, ScalarType);
+template Tensor div_tensor_self_backward(Tensor, Scalar, ScalarType);
 
 Tensor div_tensor_other_backward(
     Tensor grad,
@@ -551,11 +578,12 @@ Tensor div_tensor_other_backward(
   }
 
   auto result = -grad * ((self / other) / other).conj();
-  return handle_r_to_c(other, result);
+  return handle_r_to_c(std::move(other), std::move(result));
 }
 
 Tensor div_tensor_other_backward(Tensor grad, Tensor self, Tensor other) {
-  return div_tensor_other_backward(grad, self, other, c10::nullopt);
+  return div_tensor_other_backward(
+      std::move(grad), std::move(self), std::move(other), c10::nullopt);
 }
 
 Tensor permute_backwards(const Tensor& grad, IntArrayRef fwd_dims) {
@@ -609,8 +637,8 @@ Tensor sum_backward(
     c10::SymIntArrayRef sizes,
     OptionalIntArrayRef opt_dims,
     bool keepdim) {
-  if (!keepdim && sizes.size() > 0) {
-    if (opt_dims.has_value() && opt_dims.value().size() > 0) {
+  if (!keepdim && !sizes.empty()) {
+    if (opt_dims.has_value() && !opt_dims.value().empty()) {
       return unsqueeze_multiple(grad, opt_dims, sizes.size())
           .expand_symint(sizes);
     }
@@ -623,7 +651,7 @@ Tensor sum_backward(
     c10::SymIntArrayRef sizes,
     c10::IntArrayRef dims,
     bool keepdim) {
-  if (!keepdim && sizes.size() > 0 && dims.size() > 0) {
+  if (!keepdim && !sizes.empty() && !dims.empty()) {
     // we are only using `keepdim=true` path for SymInts for now
     TORCH_CHECK_NOT_IMPLEMENTED(
         false,
@@ -648,9 +676,10 @@ Tensor mean_backward(
     OptionalIntArrayRef opt_dim,
     c10::SymInt numel,
     bool keepdim) {
-  bool is_all_reduce = !opt_dim.has_value() || opt_dim.value().size() == 0;
-  auto n = is_all_reduce ? numel : _safe_size(shape, opt_dim.value());
-  return sum_backward(grad, shape, opt_dim, keepdim) / n;
+  bool is_all_reduce = !opt_dim.has_value() || opt_dim.value().empty();
+  auto n =
+      is_all_reduce ? std::move(numel) : _safe_size(shape, opt_dim.value());
+  return sum_backward(grad, shape, opt_dim, keepdim) / std::move(n);
 }
 
 std::vector<int64_t> reverse_list(const IntArrayRef list) {
@@ -672,7 +701,7 @@ Tensor prod_safe_zeros_backward(
     const Tensor& grad,
     const Tensor& inp,
     int64_t dim) {
-  if (inp.numel() == 0) {
+  if (inp.sym_numel() == 0) {
     // When input has a zero sized dimension (empty tensor),
     // we don't need to actually compute the grads.
     // So we just reshape `grad` as `input`.
@@ -692,7 +721,8 @@ Tensor prod_safe_zeros_backward(
 
   Tensor narrow_reverse =
       reverse_dim(inp.narrow(dim, 1, inp.size(dim) - 1), dim);
-  Tensor exclusive_reverse_nocp = at::cat({ones, narrow_reverse}, dim);
+  Tensor exclusive_reverse_nocp =
+      at::cat({std::move(ones), std::move(narrow_reverse)}, dim);
   Tensor exclusive_reverse =
       reverse_dim(exclusive_reverse_nocp.cumprod(dim), dim);
 
@@ -719,7 +749,7 @@ Tensor prod_backward(
         .view_as(input);
   }
   Tensor zero_idx = (input == 0).nonzero();
-  if (zero_idx.numel() == 0) {
+  if (zero_idx.sym_numel() == 0) {
     return grad * (result / input).conj();
   } else if (zero_idx.size(0) > 1) {
     return at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
@@ -778,7 +808,7 @@ static Tensor generic_solve_jvp(
 
 Tensor cumsum_backward(const Tensor& grad, int64_t dim) {
   // Trivial case
-  if (grad.numel() <= 1 || grad.size(dim) == 1) {
+  if (grad.sym_numel() <= 1 || grad.sym_size(dim) == 1) {
     return grad;
   }
   return grad.flip(dim).cumsum(dim).flip(dim);
@@ -802,7 +832,7 @@ Tensor logcumsumexp_backward(
     const Tensor& self,
     Tensor result,
     int64_t dim) {
-  if (grad.dim() == 0 || grad.numel() == 0) {
+  if (grad.dim() == 0 || grad.sym_numel() == 0) {
     return grad;
   }
 
@@ -992,7 +1022,7 @@ std::vector<Tensor> block_diag_backward(
                      .slice(1, cur_dim1, cur_dim1 + dim1);
     if (shape.size() == 1) {
       slice = slice.squeeze(-1);
-    } else if (shape.size() == 0) {
+    } else if (shape.empty()) {
       slice = slice.squeeze(-1).squeeze(-1);
     }
     grad_inputs[i] = slice;
@@ -1323,27 +1353,6 @@ Tensor mm_mat1_sparse_backward(
       mat2.layout());
 }
 
-// This function return a new SparseTensor with values from Tensor `input`
-// filtered by indices of `mask` and values are ignored. `input` and `mask` are
-// sparse matrices, a sparse tensor with sparse_dim=2 and  dense_dim=2, and they
-// must have the same shape. Note that the `output` must have the same `indices`
-// as the `mask` so we are using just a clone. However, to get `values` we have
-// to use specific helper function for CPU/CUDA and use the `mask` data to
-// filter `values` That's why we created this `_sparse_mask_helper` function.
-Tensor _sparse_matrix_mask(const Tensor& input, const Tensor& mask) {
-  Tensor output = at::empty_like(mask);
-  Tensor mask_indices = mask._indices().clone();
-  Tensor r_values;
-  if (mask._nnz() == 0) {
-    r_values = at::zeros_like(mask._values());
-  } else {
-    r_values = _sparse_mask_helper(input, mask_indices.contiguous());
-  }
-  at::sparse::get_sparse_impl(output)->set_indices_and_values_unsafe(
-      mask_indices, r_values);
-  return output;
-}
-
 Tensor sparse_sparse_matmul_backward(
     const Tensor& grad,
     const Tensor& a,
@@ -1368,12 +1377,19 @@ Tensor sparse_sparse_matmul_backward(
   TORCH_CHECK(
       grad_order == 0 || grad_order == 1,
       ": grad_order not in [0, 1] at sparse_sparse_matmul_backward function");
+  const auto mask_ones_like = [](const Tensor& t) -> Tensor {
+    return at::sparse_coo_tensor(
+        t._indices(),
+        at::ones({1}, t._values().options()).expand_as(t._values()),
+        t.sizes());
+  };
+
   if (grad_order == 0) {
     auto a_grad = _sparse_sparse_matmul(grad, b.conj().t());
-    return _sparse_matrix_mask(a_grad.coalesce(), a.coalesce());
+    return a_grad.mul(mask_ones_like(a.coalesce()));
   }
   auto b_grad = _sparse_sparse_matmul(a.conj().t(), grad);
-  return _sparse_matrix_mask(b_grad.coalesce(), b.coalesce());
+  return b_grad.mul(mask_ones_like(b.coalesce()));
 }
 
 Tensor renorm_backward(
@@ -1401,8 +1417,8 @@ Tensor renorm_backward(
   }
   grad_output =
       grad_output.sum(reduce_dims, /*keepdim=*/true, /*dtype=*/real_acc_type);
-  auto nb =
-      norm_backward(grad_output, self, p, norm, reduce_dims, /*keepdim=*/true);
+  auto nb = norm_backward(
+      std::move(grad_output), self, p, norm, reduce_dims, /*keepdim=*/true);
 
   auto invnorm = (norm + 1e-7).reciprocal();
   auto grad_norm = maxnorm * invnorm * (grad - invnorm * nb);
@@ -1553,27 +1569,32 @@ Tensor var_backward(
     Tensor grad,
     const Tensor& self,
     at::OptionalIntArrayRef dim_opt,
-    c10::optional<int64_t> correction_opt,
+    const c10::optional<at::Scalar>& correction_opt,
     bool keepdim) {
-  auto correction = correction_opt.value_or(1);
+  const auto correction = correction_opt.value_or(1).toSymFloat();
   if (self.dim() == 0 || !dim_opt.has_value()) {
-    // To apease ASAN
-    auto n = self.numel();
-    if (n == correction) {
-      return INFINITY * grad;
+    const auto dof = c10::SymFloat(self.sym_numel()) - correction;
+    if (dof <= 0) {
+      // when n == correction, 2 / (n - correction) is infinity
+      // when self == self.mean(), we return NaN because infinity * 0 = NaN
+      // otherwise, we return infinity because infinity * c = infinity, for all
+      // c > 0
+      return grad *
+          at::where(
+                 self == self.mean(),
+                 std::numeric_limits<double>::quiet_NaN(),
+                 std::numeric_limits<double>::infinity());
     } else {
-      return (c10::SymFloat(2.0) /
-              c10::SymFloat(self.sym_numel() - correction)) *
-          grad * (self - self.mean());
+      return (c10::SymFloat(2.0) / dof) * grad * (self - self.mean());
     }
   }
   auto dim = dim_opt.value();
   if (!keepdim && self.dim() > 1) {
     grad = unsqueeze_multiple(grad, dim, self.sym_sizes().size());
   }
-  const c10::SymInt dof = _safe_size(self.sym_sizes(), dim) - correction;
+  const c10::SymFloat rnumel(_safe_size(self.sym_sizes(), dim));
   // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-narrowing-conversions)
-  return (c10::SymFloat(2.0) / c10::SymFloat(dof)) * grad *
+  return (c10::SymFloat(2.0) / (rnumel - correction)) * grad *
       (self - self.mean(dim, /*keepdim=*/true));
 }
 
@@ -1582,10 +1603,10 @@ Tensor std_backward(
     const Tensor& grad,
     const Tensor& self,
     at::OptionalIntArrayRef dim,
-    c10::optional<int64_t> correction,
+    const c10::optional<c10::Scalar>& correction_opt,
     bool keepdim) {
   auto grad_var = (grad / (result * 2)).masked_fill_(result == 0, 0);
-  return var_backward(grad_var, self, dim, correction, keepdim);
+  return var_backward(std::move(grad_var), self, dim, correction_opt, keepdim);
 }
 
 Tensor var_mean_backward(
@@ -1593,12 +1614,11 @@ Tensor var_mean_backward(
     const Tensor& gmean,
     const Tensor& self,
     at::OptionalIntArrayRef dim_opt,
-    c10::optional<int64_t> correction_opt,
+    const c10::optional<c10::Scalar>& correction_opt,
     bool keepdim) {
-  auto correction = correction_opt.value_or(1);
   Tensor gself;
   if (gvar.defined()) {
-    gself = var_backward(gvar, self, dim_opt, correction, keepdim);
+    gself = var_backward(gvar, self, dim_opt, correction_opt, keepdim);
   }
   if (gmean.defined()) {
     auto aux = mean_backward(
@@ -1607,7 +1627,7 @@ Tensor var_mean_backward(
         dim_opt.value_or(IntArrayRef({})),
         self.sym_numel(),
         keepdim);
-    gself = gself.defined() ? gself + aux : aux;
+    gself = gself.defined() ? gself + aux : std::move(aux);
   }
   return gself;
 }
@@ -1618,12 +1638,11 @@ Tensor std_mean_backward(
     const Tensor& self,
     const Tensor& std,
     at::OptionalIntArrayRef dim_opt,
-    c10::optional<int64_t> correction_opt,
+    const c10::optional<c10::Scalar>& correction_opt,
     bool keepdim) {
-  auto correction = correction_opt.value_or(1);
   Tensor gself;
   if (gstd.defined()) {
-    gself = std_backward(std, gstd, self, dim_opt, correction, keepdim);
+    gself = std_backward(std, gstd, self, dim_opt, correction_opt, keepdim);
   }
   if (gmean.defined()) {
     auto aux = mean_backward(
@@ -1632,7 +1651,7 @@ Tensor std_mean_backward(
         dim_opt.value_or(IntArrayRef({})),
         self.sym_numel(),
         keepdim);
-    gself = gself.defined() ? gself + aux : aux;
+    gself = gself.defined() ? gself + aux : std::move(aux);
   }
   return gself;
 }
@@ -1651,8 +1670,9 @@ Tensor masked_scatter_backward(
     // because mask_selected returns a 1-d tensor with size of masked elements
     // that are 1, we need to fill out the rest with zeros then reshape back to
     // tensor2's size.
-    auto zeros_fillin = at::zeros_symint({diff_nelem}, grad.options());
-    mask_selected = at::cat({mask_selected, zeros_fillin}, 0);
+    auto zeros_fillin =
+        at::zeros_symint({std::move(diff_nelem)}, grad.options());
+    mask_selected = at::cat({mask_selected, std::move(zeros_fillin)}, 0);
   }
   return mask_selected.view_symint(sizes);
 }
@@ -1675,7 +1695,7 @@ Tensor cholesky_jvp(const Tensor& dA, const Tensor& L, bool upper) {
   dL = at::linalg_solve_triangular(L_.mH(), dL, /*upper=*/true, /*left=*/false);
   dL = dL.tril() - dL.diagonal(0, -2, -1).mul(0.5).diag_embed();
   dL = L_.matmul(dL);
-  return upper ? dL.mH() : dL;
+  return upper ? dL.mH() : std::move(dL);
 }
 
 Tensor cholesky_backward(const Tensor& gL, bool upper, const Tensor& L) {
@@ -1874,7 +1894,7 @@ Tensor max_pool_double_backward(
     int dim) {
   AT_ASSERT(indices.dim() >= dim);
   // handle non-empty inputs
-  if (indices.numel()) {
+  if (indices.sym_numel() != 0) {
     auto size = indices.sizes().slice(0, indices.dim() - dim).vec();
     size.push_back(-1);
     auto indices_view = indices.view(size);
@@ -1913,7 +1933,7 @@ Tensor glu_double_backward(
   auto gI_second_half =
       ggI_second_half_times_first_half * gO * second_order_sh +
       ggI_first_half * gO * sig_one_sub_sig;
-  return at::cat({gI_first_half, gI_second_half}, dim);
+  return at::cat({std::move(gI_first_half), std::move(gI_second_half)}, dim);
 }
 
 Tensor glu_double_backward_grad_output(
@@ -1972,12 +1992,8 @@ Tensor binary_cross_entropy_target_backward(
     const Tensor& target,
     const c10::optional<Tensor>& weight,
     int64_t reduction) {
-  auto grad_target = [&] {
-    if (self.is_mps()) {
-      return self.neg().log1p_().sub_(self.log());
-    }
-    return at::logit(self).neg_();
-  }();
+  auto grad_target = at::logit(self).neg_();
+
   if (!areAnyTensorSubclassLike({grad})) {
     grad_target.mul_(grad);
   } else {
@@ -1993,7 +2009,7 @@ Tensor binary_cross_entropy_target_backward(
   }
 
   if (reduction == at::Reduction::Mean) {
-    grad_target.div_(target.numel());
+    grad_target.div_(target.sym_numel());
   }
 
   return grad_target;
@@ -2027,7 +2043,7 @@ Tensor binary_cross_entropy_double_backward_target(
   res = isTensorSubclassLike(denom) ? res.div(denom) : res.div_(denom);
 
   if (reduction == at::Reduction::Mean) {
-    res.div_(target.numel());
+    res.div_(target.sym_numel());
   }
 
   return res;
@@ -2078,7 +2094,7 @@ Tensor binary_cross_entropy_with_logits_backward(
   }
 
   if (reduction == at::Reduction::Mean) {
-    grad_input.div_(input.numel());
+    grad_input.div_(input.sym_numel());
   }
 
   return grad_input;
@@ -2119,7 +2135,7 @@ Tensor binary_cross_entropy_with_logits_target_backward(
   }
 
   if (reduction == at::Reduction::Mean) {
-    grad_target.div_(target.numel());
+    grad_target.div_(target.sym_numel());
   }
 
   return grad_target;
@@ -2196,7 +2212,7 @@ Tensor binary_cross_entropy_double_backward(
     }
   }
   if (reduction == at::Reduction::Mean) {
-    return gI / input.numel();
+    return gI / input.sym_numel();
   }
 
   return gI;
@@ -2225,7 +2241,7 @@ Tensor binary_cross_entropy_double_backward_grad_output(
     }
   }
   if (reduction == at::Reduction::Mean) {
-    return ggO / input.numel();
+    return ggO / input.sym_numel();
   }
   return ggO;
 }
@@ -2243,7 +2259,7 @@ Tensor smooth_l1_loss_double_backward(
   auto d = (input - target).abs();
   auto grad_input = grad * (d < beta).type_as(grad) / beta;
   if (reduction == at::Reduction::Mean) {
-    grad_input /= input.numel();
+    grad_input /= input.sym_numel();
   }
   return grad_input;
 }
@@ -2257,7 +2273,7 @@ Tensor huber_loss_double_backward(
   auto d = (input - target).abs();
   auto grad_input = grad * (d < delta);
   if (reduction == at::Reduction::Mean) {
-    grad_input /= input.numel();
+    grad_input /= input.sym_numel();
   }
   return grad_input;
 }
@@ -2283,7 +2299,7 @@ Tensor mse_loss_double_backward(
     int64_t reduction) {
   auto grad_input = 2 * grad;
   if (reduction == at::Reduction::Mean) {
-    grad_input /= input.numel();
+    grad_input /= input.sym_numel();
   }
   return grad_input;
 }
@@ -2297,7 +2313,7 @@ Tensor soft_margin_loss_double_backward(
   auto zplus1 = z + 1;
   auto grad_input = grad * (target * target) * z / (zplus1 * zplus1);
   if (reduction == at::Reduction::Mean) {
-    grad_input /= input.numel();
+    grad_input /= input.sym_numel();
   }
   return grad_input;
 }
@@ -2744,7 +2760,7 @@ Tensor softplus_double_backward(
 static inline bool _maybe_overlapping_memory(
     c10::SymIntArrayRef sizes,
     c10::SymIntArrayRef strides) {
-  if (sizes.size() > 0) {
+  if (!sizes.empty()) {
     std::vector<std::size_t> argsort(sizes.size());
     std::iota(argsort.begin(), argsort.end(), 0);
     std::sort(
@@ -2786,7 +2802,7 @@ static inline c10::SymInt _min_storage_size(
 // explanation
 Tensor as_strided_backward(
     Tensor grad,
-    TensorGeometry input_geometry,
+    const TensorGeometry& input_geometry,
     c10::SymIntArrayRef sym_sizes,
     c10::SymIntArrayRef sym_strides,
     optional<c10::SymInt> sym_storage_offset_) {
@@ -2915,7 +2931,7 @@ Tensor as_strided_backward(
 
 Tensor as_strided_scatter_backward(
     Tensor grad,
-    TensorGeometry input_geometry,
+    const TensorGeometry& input_geometry,
     TensorGeometry src_geometry,
     c10::SymIntArrayRef sizes,
     c10::SymIntArrayRef strides,
@@ -2933,7 +2949,8 @@ Tensor as_strided_scatter_backward(
       grad_.new_zeros_symint(input_geometry.sym_sizes())
           .as_strided_symint(
               input_geometry.sym_sizes(), input_geometry.sym_strides());
-  auto result_slice = result.as_strided_symint(sizes, strides, storage_offset);
+  auto result_slice =
+      result.as_strided_symint(sizes, strides, std::move(storage_offset));
   result_slice.copy_(grad_slice);
   return result;
 }
@@ -3028,7 +3045,12 @@ Tensor slice_backward_wrapper(
   auto end_val = end.has_value() ? end.value() : INT64_MAX;
 
   return slice_backward_symint(
-      grad, input_sizes, dim, start_val, end_val, step);
+      grad,
+      input_sizes,
+      dim,
+      std::move(start_val),
+      std::move(end_val),
+      std::move(step));
 }
 
 std::tuple<Tensor, Tensor, Tensor> linalg_svd_jvp(
@@ -3480,7 +3502,7 @@ std::tuple<Tensor, Tensor> linalg_eig_jvp(
   auto dL = is_hermitian && dA.is_complex() ? at::real(dP.diagonal(0, -2, -1))
                                             : dP.diagonal(0, -2, -1);
   auto dV = [&dP, &V, &L, is_hermitian] {
-    const auto dX = [&] {
+    auto dX = [&] {
       auto ret = dP / (L.unsqueeze(-2) - L.unsqueeze(-1));
       ret.diagonal(0, -2, -1).zero_();
       ret = at::matmul(V, ret);
@@ -3775,7 +3797,9 @@ Tensor differential_analytic_matrix_function(
   // eg. if both are BatchedTensor at different level.
   if (areAnyTensorSubclassLike({A, grad})) {
     meta_grad = at::cat(
-        {at::cat({A, grad}, -1), at::cat({at::zeros_like(A), A}, -1)}, -2);
+        {at::cat({A, grad}, -1),
+         at::cat({at::zeros_like(A), std::move(A)}, -1)},
+        -2);
   } else {
     meta_grad = at::zeros(meta_grad_sizes, grad.options());
     meta_grad.narrow(-2, 0, n).narrow(-1, 0, n).copy_(A);
@@ -3812,10 +3836,10 @@ Tensor masked_fmap(
   // for example det_backward
 
   // Precondition for the n == 0 case to make sense
-  TORCH_INTERNAL_ASSERT(t.numel() != 0);
+  TORCH_INTERNAL_ASSERT(t.sym_numel() != 0);
   auto t_masked = t.index({mask});
-  auto n = t_masked.numel();
-  if (n == t.numel()) {
+  auto n = t_masked.sym_numel();
+  if (n == t.sym_numel()) {
     return f1(t, ts...);
   } else if (n == 0) {
     return f2(t, ts...);
@@ -3858,7 +3882,7 @@ Tensor linalg_det_backward(
     const Tensor& pivots) {
   at::NoTF32Guard disable_tf32;
   // A.numel() == 0 necessary for the singular case
-  if (!grad.defined() || A.numel() == 0) {
+  if (!grad.defined() || A.sym_numel() == 0) {
     return {};
   }
 
@@ -4422,7 +4446,7 @@ std::tuple<Tensor, Tensor, Tensor> batchnorm_double_backward(
     ggO = ggO.defined() ? ggO.add_(ggO_G_term) : ggO_G_term;
   }
   if (ggB.defined()) {
-    auto ggO_B_term = ggB_expanded;
+    auto ggO_B_term = std::move(ggB_expanded);
     ggO = ggO.defined() ? ggO.add_(ggO_B_term) : ggO_B_term;
   }
 
@@ -4561,7 +4585,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_double_backward(
     ggO = ggO.defined() ? ggO.add_(ggO_G_term) : ggO_G_term;
   }
   if (ggB.defined()) {
-    auto ggO_B_term = ggB_expanded;
+    auto ggO_B_term = std::move(ggB_expanded);
     ggO = ggO.defined() ? ggO.add_(ggO_B_term) : ggO_B_term;
   }
   if (ggO.defined()) {
@@ -4603,7 +4627,7 @@ infinitely_differentiable_native_group_norm_backward(
   Tensor ds;
   Tensor db;
   if (dY.defined()) {
-    dY_tensor = dY.reshape_symint({N, G, D, HxW});
+    dY_tensor = dY.reshape_symint({N, G, D, std::move(HxW)});
     ds = (dY_tensor * X_tensor).sum(3).unsqueeze_(-1);
     db = dY_tensor.sum(3).unsqueeze_(-1);
   }
@@ -4627,12 +4651,12 @@ infinitely_differentiable_native_group_norm_backward(
       Tensor c = (isDefined(gamma) ? (db * gamma_tensor).sum(2) : db.sum(2))
                      .unsqueeze_(-2);
       b = (c * mean_tensor - b) * rstd_cube * s;
-      c = -b * mean_tensor - c * rstd_tensor * s;
+      c = -b * mean_tensor - c * rstd_tensor * std::move(s);
       dX = a * dY_tensor + b * X_tensor + c;
       if (dmean.defined() && drstd.defined()) {
         dX += var_mean_backward(
             dvar,
-            dmean.view_symint({N, G, 1, 1}),
+            dmean.view_symint({std::move(N), G, 1, 1}),
             X_tensor,
             IntArrayRef{2, 3},
             0,
@@ -4642,7 +4666,7 @@ infinitely_differentiable_native_group_norm_backward(
     } else if (dmean.defined() && drstd.defined()) {
       dX = var_mean_backward(
                dvar,
-               dmean.view_symint({N, G, 1, 1}),
+               dmean.view_symint({std::move(N), G, 1, 1}),
                X_tensor,
                IntArrayRef{2, 3},
                0,
@@ -4724,12 +4748,6 @@ Tensor sinc_backward(const Tensor& grad, const Tensor& self) {
   return at::where(self_squared_pi == 0.0, at::zeros({}, grad.options()), out);
 }
 
-Tensor sparse_constructor_values_backward(
-    const Tensor& sparse_grad_out,
-    const Tensor& indices) {
-  return _sparse_mask_helper(sparse_grad_out.coalesce(), indices.contiguous());
-}
-
 // Because the backward of pad(input, pads) is just pad(grad_output, [-p for p
 // in pads])
 Tensor constant_pad_nd_backward(const Tensor& grad, c10::SymIntArrayRef pad) {
@@ -4906,7 +4924,7 @@ std::tuple<Tensor, Tensor> householder_product_backward(
   // range(k) to range(k - 1, -1, -1) in the main loop, and left/right
   // Householder projection applications get flipped.
   // The comments below about the algorithmic details assume flip_order = false.
-  if (!grad.defined() || !input_.numel() || !tau.numel()) {
+  if (!grad.defined() || input_.sym_numel() == 0 || tau.sym_numel() == 0) {
     return std::tuple<Tensor, Tensor>(Tensor(), Tensor());
   }
   auto m = input_.size(-2);
@@ -5477,7 +5495,7 @@ Tensor linalg_lu_solve_jvp(
             /*unitriangular*/ true)
             .matmul(P.mT());
     // dX = op_2(R^H) + S
-    return (left ? R.mH() : R) + S;
+    return (left ? R.mH() : std::move(R)) + S;
   }
 }
 
@@ -5560,7 +5578,7 @@ std::tuple<Tensor, Tensor> linalg_solve_backward(
     gA_ = left ? -gB_.matmul(X_.mH()) : -X_.mH().matmul(gB_);
   }
   return std::make_tuple(
-      A_requires_grad ? gA_ : Tensor{},
+      A_requires_grad ? std::move(gA_) : Tensor{},
       B_requires_grad ? matrix_to_vector(gB_) : Tensor{});
 }
 
@@ -6130,7 +6148,7 @@ Tensor linalg_lu_backward(
         /*left=*/true,
         /*unitriangular=*/true);
 
-    return pivot ? P.matmul(std::move(A_grad)) : A_grad;
+    return pivot ? P.matmul(std::move(A_grad)) : std::move(A_grad);
   } else if (m < n) {
     // Wide case
     // A1_grad = P L^{-H} [U1_grad + (L^H L_grad o 1_L - U_grad U^H o 1_U)
@@ -6289,7 +6307,8 @@ std::tuple<Tensor, Tensor> linalg_lu_jvp(
         at::linalg_solve_triangular(
             L1, PdA2, /*upper=*/false, /*left=*/true, /*unitriangular*/ true) -
         dK.tril(-1).matmul(U2);
-    return std::make_tuple(std::move(dL1), at::cat({dU1, dU2}, /*dim=*/-1));
+    return std::make_tuple(
+        std::move(dL1), at::cat({std::move(dU1), std::move(dU2)}, /*dim=*/-1));
   } else {
     // we only need to update dL2 defined as
     // dL2 := PdA2 U^{-1} - L2 dK.triu()
@@ -6298,7 +6317,8 @@ std::tuple<Tensor, Tensor> linalg_lu_jvp(
     auto dL2 =
         at::linalg_solve_triangular(U1, PdA2, /*upper=*/true, /*left=*/false) -
         L2.matmul(dK.triu());
-    return std::make_tuple(at::cat({dL1, dL2}, /*dim=*/-2), std::move(dU1));
+    return std::make_tuple(
+        at::cat({std::move(dL1), std::move(dL2)}, /*dim=*/-2), std::move(dU1));
   }
 }
 
@@ -6334,7 +6354,7 @@ Tensor logsumexp_jvp(
   // NB: for simplicitly, we recompute some values that can be reused from
   // forward
   auto self_p_exp = [&self_p, &dim]() {
-    if (self_p.numel() > 0) {
+    if (self_p.sym_numel() > 0) {
       return (self_p - at::amax(self_p, dim, true))
           .exp(); // Use the exp-normalize trick
     } else {
@@ -6485,7 +6505,7 @@ std::tuple<Tensor, Tensor> scatter_reduce_backward(
       auto node = std::make_shared<DelayedError>(
           "scatter_reduce(): Double backward is unsupported for src when >1 zeros in src are scattered to the same position in self",
           /* num inputs */ 1);
-      auto result = node->apply({grad_src1});
+      auto result = node->apply({std::move(grad_src1)});
       grad_src = result[0];
     } else {
       grad_src = grad_src1;
@@ -6579,7 +6599,7 @@ std::tuple<Tensor, Tensor> index_reduce_backward(
       auto node = std::make_shared<DelayedError>(
           "index_reduce(): Double backward is unsupported for source when >1 zeros in source are scattered to the same position in self",
           /* num inputs */ 1);
-      auto result = node->apply({grad_src1});
+      auto result = node->apply({std::move(grad_src1)});
       grad_src = result[0];
     } else {
       grad_src = grad_src1;
diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h
index 1279f0af161d..61508725c41a 100644
--- a/torch/csrc/autograd/FunctionsManual.h
+++ b/torch/csrc/autograd/FunctionsManual.h
@@ -125,15 +125,15 @@ at::Tensor pow_backward_exponent(
     const at::Tensor& exponent,
     at::Tensor result);
 at::Tensor angle_backward(at::Tensor grad, const at::Tensor& self);
-at::Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st);
-at::Tensor div_tensor_self_backward(
-    Tensor grad,
-    Tensor other,
-    ScalarType self_st);
+template <typename T>
+at::Tensor mul_tensor_backward(Tensor grad, T other, ScalarType self_st);
+template <typename T>
+at::Tensor div_tensor_self_backward(Tensor grad, T other, ScalarType self_st);
 at::Tensor div_tensor_other_backward(Tensor grad, Tensor self, Tensor other);
+template <typename T>
 at::Tensor div_tensor_self_backward(
     Tensor grad,
-    Tensor other,
+    T other,
     ScalarType self_st,
     const c10::optional<c10::string_view>& rounding_mode);
 at::Tensor div_tensor_other_backward(
@@ -320,21 +320,21 @@ at::Tensor var_backward(
     at::Tensor grad,
     const at::Tensor& self,
     at::OptionalIntArrayRef dim,
-    c10::optional<int64_t> correction,
+    const c10::optional<c10::Scalar>& correction,
     bool keepdim);
 at::Tensor var_jvp(
     const at::Tensor& self_t,
     const at::Tensor& self_p,
     const at::Tensor& result,
     at::OptionalIntArrayRef dim_opt,
-    c10::optional<int64_t> correction_opt,
+    const c10::optional<c10::Scalar>& correction,
     bool keepdim);
 at::Tensor std_backward(
     const at::Tensor& result,
     const at::Tensor& grad,
     const at::Tensor& self,
     at::OptionalIntArrayRef dim,
-    c10::optional<int64_t> correction,
+    const c10::optional<c10::Scalar>& correction,
     bool keepdim);
 Tensor mean_backward(
     const Tensor& grad,
@@ -347,7 +347,7 @@ Tensor var_mean_backward(
     const Tensor& gmean,
     const Tensor& self,
     at::OptionalIntArrayRef dim_opt,
-    c10::optional<int64_t> correction_opt,
+    const c10::optional<c10::Scalar>& correction,
     bool keepdim);
 Tensor std_mean_backward(
     const Tensor& gstd,
@@ -355,7 +355,7 @@ Tensor std_mean_backward(
     const Tensor& self,
     const Tensor& std,
     at::OptionalIntArrayRef dim_opt,
-    c10::optional<int64_t> correction_opt,
+    const c10::optional<c10::Scalar>& correction,
     bool keepdim);
 at::Tensor masked_scatter_backward(
     const at::Tensor& grad,
@@ -715,13 +715,13 @@ Tensor gelu_double_backward(
     c10::string_view approximate);
 Tensor as_strided_backward(
     Tensor grad,
-    TensorGeometry input_geometry,
+    const TensorGeometry& input_geometry,
     c10::SymIntArrayRef sizes,
     c10::SymIntArrayRef strides,
     optional<c10::SymInt> storage_offset_);
 Tensor as_strided_scatter_backward(
     Tensor grad,
-    TensorGeometry input_geometry,
+    const TensorGeometry& input_geometry,
     TensorGeometry src_geometry,
     c10::SymIntArrayRef sizes,
     c10::SymIntArrayRef strides,
diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp
index 101d8d9b2195..2998e65d9750 100644
--- a/torch/csrc/autograd/VariableTypeManual.cpp
+++ b/torch/csrc/autograd/VariableTypeManual.cpp
@@ -11,6 +11,8 @@
 #include <torch/csrc/utils/memory.h>
 #include <torch/library.h>
 
+#include <utility>
+
 using namespace at;
 using namespace torch::autograd::generated;
 using torch::autograd::as_view;
@@ -397,7 +399,7 @@ Tensor detach(c10::DispatchKeySet ks, const Tensor& self) {
       /* output */ out,
       /* is_bw_differentiable */ false,
       /* is_fw_differentiable */ false,
-      /* view_func */ func,
+      /* view_func */ std::move(func),
       /* creation_meta */ CreationMeta::DEFAULT,
       /*allow_tensor_metadata_change=*/false);
 
@@ -421,7 +423,7 @@ Tensor _fw_primal(c10::DispatchKeySet ks, const Tensor& self, int64_t level) {
       /* output */ tmp,
       /* is_bw_differentiable */ true,
       /* is_fw_differentiable */ false,
-      /* view_func */ func,
+      /* view_func */ std::move(func),
       /* creation_meta */ CREATION_META_DEFINITION);
 
   return result;
@@ -449,7 +451,7 @@ Tensor _make_dual(
       /* output */ tmp,
       /* is_bw_differentiable */ true,
       /* is_fw_differentiable */ false,
-      /* view_func */ func,
+      /* view_func */ std::move(func),
       /* creation_meta */ CREATION_META_DEFINITION);
 
   return result;
diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h
index a96d588a9d46..34eda5378721 100644
--- a/torch/csrc/autograd/VariableTypeUtils.h
+++ b/torch/csrc/autograd/VariableTypeUtils.h
@@ -187,7 +187,8 @@ inline at::Tensor as_view(
           diff_view_meta->get_creation_meta(), creation_meta);
       return make_variable_differentiable_view(
           tensor,
-          diff_view_meta->get_backward_view().chain(base, tensor, view_func),
+          diff_view_meta->get_backward_view().chain(
+              base, tensor, std::move(view_func)),
           c10::nullopt,
           /*shared_view_info*/ true,
           creation_meta,
@@ -195,7 +196,7 @@ inline at::Tensor as_view(
     } else {
       return make_variable_differentiable_view(
           tensor,
-          ViewInfo(base, view_func),
+          ViewInfo(base, std::move(view_func)),
           c10::nullopt,
           /*shared_view_info*/ true,
           creation_meta,
@@ -224,9 +225,9 @@ inline at::Tensor as_view(
     // Check if base is a forward differentiable view
     if (diff_view_meta && diff_view_meta->has_fw_view()) {
       const auto& base_fw_info = diff_view_meta->get_forward_view();
-      new_fw_info = base_fw_info.chain(base, tensor, view_func);
+      new_fw_info = base_fw_info.chain(base, tensor, std::move(view_func));
     } else {
-      new_fw_info = ViewInfo(base, view_func);
+      new_fw_info = ViewInfo(base, std::move(view_func));
     }
   }
 
diff --git a/torch/csrc/autograd/autograd.cpp b/torch/csrc/autograd/autograd.cpp
index 83810321dde9..b81e5bee5e09 100644
--- a/torch/csrc/autograd/autograd.cpp
+++ b/torch/csrc/autograd/autograd.cpp
@@ -37,6 +37,10 @@ variable_list _make_grads(
         TORCH_CHECK(
             output.numel() == 1,
             "grad can be implicitly created only for scalar outputs");
+        TORCH_CHECK(
+            c10::isFloatingType(output.scalar_type()),
+            "grad can be computed only for real scalar outputs but got ",
+            output.scalar_type());
         new_grads.emplace_back(
             at::ones_like(output, LEGACY_CONTIGUOUS_MEMORY_FORMAT));
       }
@@ -57,6 +61,10 @@ variable_list _make_grads(
           TORCH_CHECK(
               output.numel() == 1,
               "grad can be implicitly created only for scalar outputs");
+          TORCH_CHECK(
+              c10::isFloatingType(output.scalar_type()),
+              "grad can be computed only for real scalar outputs but got ",
+              output.scalar_type());
           new_grads.emplace_back(
               at::ones_like(output, LEGACY_CONTIGUOUS_MEMORY_FORMAT));
         }
diff --git a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp
index b29a05349975..890a7fa3e6e9 100644
--- a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp
+++ b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp
@@ -13,6 +13,7 @@
 #include <torch/csrc/autograd/functions/basic_ops.h>
 #include <torch/csrc/autograd/functions/utils.h>
 
+#include <utility>
 #include <vector>
 
 namespace torch {
@@ -114,7 +115,7 @@ void autogradNotImplementedFallbackImpl(
       stack_start,
       num_arguments);
 
-  const bool any_requires_grad = tensors_requiring_grad_on_stack.size() > 0;
+  const bool any_requires_grad = !tensors_requiring_grad_on_stack.empty();
 
   _foreach_tensor(
       [&](size_t _, size_t i, const at::Tensor& t) {
@@ -184,7 +185,15 @@ void autogradNotImplementedFallbackImpl(
         if (!is_inplace_output[idx_ret])
           TORCH_INTERNAL_ASSERT(
               t.use_count() <= 1, op_name); // Okay to return undefined tensor
-        if (!is_aliased_output[idx_ret] && t.has_storage())
+        // note(crcrpar): `_foreach_norm` returns a list of scalar Tensors and
+        // each Tensor shares a storage of a hidden, intermediate 1D Tensor
+        // created inside the CUDA implemenetation. This is because the
+        // reference implementation of nvidia/apex repo returns this 1D Tensor
+        // where each element represents the norm of corresponding input Tensor,
+        // here I want to return the same number of Tensors as the input
+        // TensorList, see https://github.com/pytorch/pytorch/issues/93940
+        if (!is_aliased_output[idx_ret] && t.has_storage() &&
+            op_name != "aten::_foreach_norm")
           TORCH_INTERNAL_ASSERT(t.storage().use_count() == 1);
       },
       stack,
@@ -377,7 +386,8 @@ void autogradNotImplementedInplaceOrViewFallbackImpl(
               ? CreationMeta::INFERENCE_MODE
               : (at::GradMode::is_enabled() ? CreationMeta::DEFAULT
                                             : CreationMeta::NO_GRAD_MODE));
-      stack->at(stack->size() - num_returns + aliased_output_idx) = result;
+      stack->at(stack->size() - num_returns + aliased_output_idx) =
+          std::move(result);
     }
   }
 }
diff --git a/torch/csrc/autograd/cpp_hook.cpp b/torch/csrc/autograd/cpp_hook.cpp
index 2075c0f5979d..d6948554d82c 100644
--- a/torch/csrc/autograd/cpp_hook.cpp
+++ b/torch/csrc/autograd/cpp_hook.cpp
@@ -3,6 +3,8 @@
 #include <torch/csrc/autograd/custom_function.h>
 #include <torch/csrc/autograd/variable.h>
 
+#include <utility>
+
 namespace {
 using torch::autograd::Variable;
 void check_single_result(
@@ -13,7 +15,7 @@ void check_single_result(
     throw std::runtime_error(
         "can't replace a empty gradient with a non-empty value");
   }
-  torch::autograd::check_variable_result(value, result, hook_name);
+  torch::autograd::check_variable_result(value, result, std::move(hook_name));
 }
 } // namespace
 
@@ -48,5 +50,21 @@ variable_list CppFunctionTensorPreHook::operator()(
   return results;
 }
 
+CppFunctionSingleTensorPreHook::CppFunctionSingleTensorPreHook(
+    std::function<at::TensorBase(const at::TensorBase&)> hook,
+    int value_idx)
+    : hook_(std::move(hook)), value_idx_(value_idx) {}
+
+variable_list CppFunctionSingleTensorPreHook::operator()(
+    const variable_list& values) {
+  const auto& value = values[value_idx_];
+  auto res = hook_(value);
+  TORCH_INTERNAL_ASSERT(
+      !res.defined(),
+      "CppFunctionSingleTensorPreHook currently only supports hooks that don't return");
+  variable_list results(values);
+  return results;
+}
+
 } // namespace autograd
 } // namespace torch
diff --git a/torch/csrc/autograd/cpp_hook.h b/torch/csrc/autograd/cpp_hook.h
index bd8eadf71324..44f0ffb8b776 100644
--- a/torch/csrc/autograd/cpp_hook.h
+++ b/torch/csrc/autograd/cpp_hook.h
@@ -19,5 +19,15 @@ struct CppFunctionTensorPreHook : public FunctionPreHook {
   int value_idx_;
 };
 
+struct CppFunctionSingleTensorPreHook : public FunctionPreHook {
+  CppFunctionSingleTensorPreHook(
+      std::function<at::TensorBase(const at::TensorBase&)> hook,
+      int value_idx);
+  variable_list operator()(const variable_list& values) override;
+
+  std::function<at::TensorBase(const at::TensorBase&)> hook_;
+  int value_idx_;
+};
+
 } // namespace autograd
 } // namespace torch
diff --git a/torch/csrc/autograd/custom_function.cpp b/torch/csrc/autograd/custom_function.cpp
index 4cd49120fbca..05b3642c1572 100644
--- a/torch/csrc/autograd/custom_function.cpp
+++ b/torch/csrc/autograd/custom_function.cpp
@@ -3,6 +3,8 @@
 #include <torch/csrc/autograd/custom_function.h>
 #include <torch/csrc/autograd/functions/accumulate_grad.h>
 
+#include <utility>
+
 namespace torch {
 namespace autograd {
 
@@ -10,7 +12,7 @@ VariableInfo::VariableInfo(const Variable& var)
     : layout(var.layout()),
       device(var.device()),
       scalar_type(var.scalar_type()),
-      size(var.sizes().vec()),
+      size(var.sym_sizes().vec()),
       requires_grad(var.requires_grad()),
       is_empty(false) {}
 
@@ -21,7 +23,7 @@ Variable VariableInfo::zeros(at::OptionalDeviceGuard& device_guard) const {
     // Return undefined tensor.
     return at::Tensor();
   } else {
-    return at::zeros(
+    return at::zeros_symint(
         size, at::TensorOptions(scalar_type).device(device).layout(layout));
   }
 }
@@ -113,7 +115,7 @@ void _process_forward_mode_AD(
   torch::autograd::variable_list forward_grads;
   {
     at::AutoFwGradMode fw_grad_mode(false);
-    forward_grads = jvp_user_function(inputs, input_grads);
+    forward_grads = jvp_user_function(inputs, std::move(input_grads));
   }
 
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
@@ -330,8 +332,8 @@ optional_variable_list _process_backward_mode_ad(
       var.mutable_grad().reset();
       impl::clear_hooks(var);
       if (auto grad_acc_fn = impl::try_get_grad_accumulator(var)) {
-        auto grad_acc = dynamic_cast<AccumulateGrad*>(grad_acc_fn.get());
-        grad_acc->variable.reset();
+        auto& grad_acc = dynamic_cast<AccumulateGrad&>(*grad_acc_fn);
+        grad_acc.variable.reset();
       }
       if (cdata) {
         impl::rebase_history(var, {cdata, output_nr});
@@ -439,12 +441,12 @@ optional_variable_list _wrap_outputs(
   // computations happening here to track backward mode gradients.
   _process_forward_mode_AD(
       input_vars,
-      inputs_mapping,
+      std::move(inputs_mapping),
       raw_outputs,
       outputs,
       non_differentiable,
       dirty_inputs,
-      jvp_user_function);
+      std::move(jvp_user_function));
 
   return outputs;
 }
diff --git a/torch/csrc/autograd/custom_function.h b/torch/csrc/autograd/custom_function.h
index 3ba18ae75c1e..eb2b95305be9 100644
--- a/torch/csrc/autograd/custom_function.h
+++ b/torch/csrc/autograd/custom_function.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <ATen/core/ivalue.h>
+#include <c10/core/SymInt.h>
 #include <c10/util/flat_hash_map.h>
 #include <c10/util/irange.h>
 #include <torch/csrc/autograd/function.h>
@@ -100,8 +101,7 @@ struct TORCH_API Function {
 /// `backward` in custom autograd operations (see `torch::autograd::Function`
 /// for details).
 struct TORCH_API AutogradContext {
-  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-  AutogradContext() : materialize_grads_(true) {}
+  AutogradContext() = default;
   AutogradContext(const AutogradContext& other) = delete;
   AutogradContext& operator=(const AutogradContext& other) = delete;
 
@@ -141,13 +141,13 @@ struct TORCH_API AutogradContext {
   std::unordered_set<at::TensorImpl*> dirty_inputs_;
   std::vector<torch::autograd::SavedVariable> saved_variables_;
   variable_list to_save_;
-  bool materialize_grads_;
+  bool materialize_grads_{true};
 
   // The CppNode in the autograd graph that owns this AutogradContext. We need a
   // weak_ptr to avoid a refcycle. Since grad_fn_ owns this AutogradContext, it
   // will always be alive when we want to use it.
   std::weak_ptr<Node> grad_fn_;
-  bool has_freed_buffers_;
+  bool has_freed_buffers_{false};
 
   void save_variables();
 
@@ -164,7 +164,7 @@ struct TORCH_API VariableInfo {
   at::Layout layout = at::Layout::Strided;
   at::Device device = at::kCPU;
   at::ScalarType scalar_type = at::kFloat;
-  std::vector<int64_t> size;
+  std::vector<c10::SymInt> size;
   bool requires_grad;
   bool is_empty;
 };
diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp
index 51d7a782c0f5..965c2dc109ae 100644
--- a/torch/csrc/autograd/engine.cpp
+++ b/torch/csrc/autograd/engine.cpp
@@ -44,6 +44,7 @@
 #include <thread>
 #include <typeinfo>
 #include <unordered_set>
+#include <utility>
 
 namespace torch {
 namespace autograd {
@@ -675,7 +676,7 @@ void GraphTask::exec_post_processing() {
   // See Note [Streaming backwards].
   // Syncs caller_current_stream with leaf streams, so final_callbacks may use
   // any grad on its device's current stream.
-  if (leaf_streams.size() > 0) {
+  if (!leaf_streams.empty()) {
     for (const auto& leaf_stream : leaf_streams) {
       // stash_current_streams() stashed streams for all device IDs that already
       // had a CUDA context before the GraphTask executed. For inactive devices,
@@ -757,8 +758,8 @@ static variable_list call_tensor_pre_hooks(Node& fn, variable_list inputs) {
   for (const auto& hook : fn.tensor_pre_hooks()) {
     inputs = (*hook)(inputs);
   }
-  for (const auto& hook : fn.retains_grad_hooks()) {
-    inputs = (*hook)(inputs);
+  for (const auto& pair : fn.retains_grad_hooks()) {
+    inputs = (*pair.second)(inputs);
   }
   return inputs;
 }
@@ -1008,7 +1009,7 @@ void Engine::evaluate_function(
     for (const auto i : c10::irange(num_outputs)) {
       auto& output = outputs[i];
       at::OptionalDeviceGuard guard(device_of(output));
-      if (output.defined() && isnan(output).any().item<uint8_t>()) {
+      if (output.defined() && isnan(output)._is_any_true().item<bool>()) {
         std::stringstream ss;
         ss << "Function '" << fn.name() << "' returned nan values in its " << i
            << "th output.";
@@ -1211,10 +1212,11 @@ auto Engine::execute(
         input_stream,
         opt_next_stream);
 
-    execute_with_graph_task(graph_task, graph_root, std::move(input_buffer));
+    execute_with_graph_task(
+        graph_task, std::move(graph_root), std::move(input_buffer));
   } else {
     execute_with_graph_task(
-        graph_task, graph_root, InputBuffer(variable_list()));
+        graph_task, std::move(graph_root), InputBuffer(variable_list()));
   }
   // Avoid a refcount bump for the Future, since we check for refcount in
   // DistEngine (see TORCH_INTERNAL_ASSERT(futureGrads.use_count() == 1)
@@ -1543,9 +1545,9 @@ void GraphTask::init_to_execute(
   captured_vars_.resize(output_idx);
 
   struct Frame {
-    Frame(Node* fn) : fn_(fn), next_next_fn_(0) {}
-    Node* fn_;
-    size_t next_next_fn_;
+    Frame(Node* fn) : fn_(fn) {}
+    Node* fn_{};
+    size_t next_next_fn_{};
 
     Node* get_next_fn() {
       const auto& next = fn_->next_edges();
diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h
index 5ceef8e2dfc0..3fca057e3093 100644
--- a/torch/csrc/autograd/engine.h
+++ b/torch/csrc/autograd/engine.h
@@ -244,7 +244,7 @@ struct TORCH_API Engine {
     // Data structures used by the threads for executing reentrant backwards
     // tasks. See Note [Reentrant backwards]
     // Number of available threads for processing new GraphTasks.
-    unsigned int num_workers_;
+    unsigned int num_workers_{0};
     // The threads will wait on work_ to be notified of GraphTasks
     std::condition_variable work_;
     // To protect reads and writes to graphtask_queue_ and num_workers_
@@ -254,7 +254,7 @@ struct TORCH_API Engine {
     // allocated inside Engine::execute and lives for the duration of execute
     std::queue<std::weak_ptr<GraphTask>> graphtasks_queue_;
 
-    ThreadPoolShared() : num_workers_(0) {}
+    ThreadPoolShared() = default;
   };
 
   // Temporary workaround until shutting down threads is done
diff --git a/torch/csrc/autograd/forward_grad.cpp b/torch/csrc/autograd/forward_grad.cpp
index f9e6945f2133..e07baac591da 100644
--- a/torch/csrc/autograd/forward_grad.cpp
+++ b/torch/csrc/autograd/forward_grad.cpp
@@ -29,7 +29,7 @@ void ForwardADLevel::release_idx(uint64_t idx) {
       "Exiting a forward AD level that is not the "
       "last that was created is not support. Ensure they are released in the reverse "
       "order they were created.");
-  TORCH_INTERNAL_ASSERT(all_forward_levels_.size() > 0);
+  TORCH_INTERNAL_ASSERT(!all_forward_levels_.empty());
   // Keep the level alive until we have released the lock
   auto lvl = all_forward_levels_.back();
   all_forward_levels_.pop_back();
diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h
index 9132d9d6ca7b..05ba3edecf07 100644
--- a/torch/csrc/autograd/function.h
+++ b/torch/csrc/autograd/function.h
@@ -276,7 +276,7 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> {
 
   void add_next_edge(Edge edge) {
     update_topological_nr(edge);
-    next_edges_.push_back(std::move(edge));
+    next_edges_.emplace_back(std::move(edge));
   }
 
   void set_next_edges(edge_list&& next_edges) {
@@ -456,7 +456,7 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> {
   //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
   uintptr_t add_post_hook(std::unique_ptr<FunctionPostHook>&& post_hook) {
-    post_hooks_.push_back(std::move(post_hook));
+    post_hooks_.emplace_back(std::move(post_hook));
     // Use the raw pointer as the unique key to identify this hook. This key
     // can then be used in del_post_hook(key) to remove this hook.
     return reinterpret_cast<std::uintptr_t>(post_hooks_.back().get());
@@ -483,15 +483,23 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> {
   }
 
   void add_pre_hook(std::unique_ptr<FunctionPreHook>&& pre_hook) {
-    pre_hooks_.push_back(std::move(pre_hook));
+    pre_hooks_.emplace_back(std::move(pre_hook));
   }
 
   void add_tensor_pre_hook(std::unique_ptr<FunctionPreHook>&& pre_hook) {
-    tensor_pre_hooks_.push_back(std::move(pre_hook));
+    tensor_pre_hooks_.emplace_back(std::move(pre_hook));
   }
 
-  void add_retains_grad_hook(std::unique_ptr<FunctionPreHook>&& pre_hook) {
-    retains_grad_hooks_.push_back(std::move(pre_hook));
+  void add_retains_grad_hook(
+      std::unique_ptr<FunctionPreHook>&& pre_hook,
+      int output_idx) {
+    retains_grad_hooks_[output_idx] = std::move(pre_hook);
+  }
+
+  std::unique_ptr<FunctionPreHook> pop_retains_grad_hook(int output_idx) {
+    auto ret = std::move(retains_grad_hooks_[output_idx]);
+    retains_grad_hooks_.erase(output_idx);
+    return ret;
   }
 
   const std::vector<std::unique_ptr<FunctionPreHook>>& pre_hooks()
@@ -508,7 +516,8 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> {
     return tensor_pre_hooks_;
   }
 
-  std::vector<std::unique_ptr<FunctionPreHook>>& retains_grad_hooks() noexcept {
+  std::unordered_map<int, std::unique_ptr<FunctionPreHook>>&
+  retains_grad_hooks() noexcept {
     return retains_grad_hooks_;
   }
 
@@ -636,7 +645,7 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> {
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
   std::vector<std::unique_ptr<FunctionPreHook>> tensor_pre_hooks_;
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
-  std::vector<std::unique_ptr<FunctionPreHook>> retains_grad_hooks_;
+  std::unordered_map<int, std::unique_ptr<FunctionPreHook>> retains_grad_hooks_;
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
   std::vector<std::unique_ptr<FunctionPostHook>> post_hooks_;
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
@@ -663,7 +672,7 @@ struct MakeNextFunctionList : IterArgs<MakeNextFunctionList> {
   void operator()(const Variable& variable) {
     // NOLINTNEXTLINE(bugprone-branch-clone)
     if (variable.defined()) {
-      next_edges.push_back(impl::gradient_edge(variable));
+      next_edges.emplace_back(impl::gradient_edge(variable));
     } else {
       next_edges.emplace_back();
     }
@@ -671,7 +680,7 @@ struct MakeNextFunctionList : IterArgs<MakeNextFunctionList> {
   void operator()(const Variable* variable) {
     // NOLINTNEXTLINE(bugprone-branch-clone)
     if (variable->defined()) {
-      next_edges.push_back(impl::gradient_edge(*variable));
+      next_edges.emplace_back(impl::gradient_edge(*variable));
     } else {
       next_edges.emplace_back();
     }
@@ -679,7 +688,7 @@ struct MakeNextFunctionList : IterArgs<MakeNextFunctionList> {
   void operator()(const c10::optional<Variable>& variable) {
     // NOLINTNEXTLINE(bugprone-branch-clone)
     if (variable.has_value() && variable->defined()) {
-      next_edges.push_back(impl::gradient_edge(*variable));
+      next_edges.emplace_back(impl::gradient_edge(*variable));
     } else {
       next_edges.emplace_back();
     }
diff --git a/torch/csrc/autograd/functions/accumulate_grad.cpp b/torch/csrc/autograd/functions/accumulate_grad.cpp
index ec0dbf06f381..e25ee1025c93 100644
--- a/torch/csrc/autograd/functions/accumulate_grad.cpp
+++ b/torch/csrc/autograd/functions/accumulate_grad.cpp
@@ -10,8 +10,6 @@
 #include <stdexcept>
 #include <utility>
 
-using at::Tensor;
-
 namespace torch {
 namespace autograd {
 
diff --git a/torch/csrc/autograd/functions/init.cpp b/torch/csrc/autograd/functions/init.cpp
index b05c2f571e39..3a00570e2d51 100644
--- a/torch/csrc/autograd/functions/init.cpp
+++ b/torch/csrc/autograd/functions/init.cpp
@@ -15,6 +15,8 @@
 #include <torch/csrc/utils/python_numbers.h>
 #include <torch/csrc/utils/python_strings.h>
 
+#include <utility>
+
 using namespace torch::autograd;
 
 struct DelayedErrorCtor {
@@ -30,7 +32,7 @@ struct DelayedErrorCtor {
     TORCH_CHECK(
         THPUtils_checkLong(arg2), "argument 'num_inputs' must be an int");
     int num_inputs = THPUtils_unpackLong(arg2);
-    return new DelayedError(msg, num_inputs);
+    return new DelayedError(std::move(msg), num_inputs);
   }
 };
 
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 68f4c6982466..cfdf291b66ba 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -32,6 +32,7 @@
 
 #include <set>
 #include <unordered_set>
+#include <utility>
 
 namespace {
 
@@ -54,6 +55,17 @@ struct MultithreadingEnabled {
   bool old_;
 };
 
+struct ViewReplayEnabled {
+  ViewReplayEnabled(bool enabled)
+      : old_(c10::AutogradState::get_tls_state().get_view_replay_enabled()) {
+    c10::AutogradState::get_tls_state().set_view_replay_enabled(enabled);
+  }
+  ~ViewReplayEnabled() {
+    c10::AutogradState::get_tls_state().set_view_replay_enabled(old_);
+  }
+  bool old_;
+};
+
 struct DisableAutocast {
   c10::impl::ExcludeDispatchKeyGuard guard_{c10::autocast_dispatch_keyset};
 };
@@ -289,6 +301,11 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
     return activities;
   });
 
+  m.def("_unsafe_set_version_counter", [](at::Tensor t, int64_t i) {
+    auto vc = torch::autograd::impl::version_counter(t);
+    vc.set_version(i);
+  });
+
   m.def("_enable_profiler_legacy", enableProfilerLegacy);
   py::class_<ProfilerDisableOptions>(m, "_ProfilerDisableOptions")
       .def(py::init<bool, bool>());
@@ -357,8 +374,11 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
   py::class_<DisableFuncTorch>(_C_m, "_DisableFuncTorch").def(py::init<>());
   py::class_<MultithreadingEnabled>(_C_m, "_MultithreadingEnabled")
       .def(py::init<bool>());
-  py::class_<DisableAutocast>(_C_m, "_DisableAutocast").def(py::init<>());
-  py::class_<torch::autograd::SavedVariable>(m, "SavedTensor")
+  py::class_<DisableAutocast>(std::move(_C_m), "_DisableAutocast")
+      .def(py::init<>());
+  py::class_<ViewReplayEnabled>(_C_m, "_ViewReplayEnabled")
+      .def(py::init<bool>());
+  py::class_<torch::autograd::SavedVariable>(std::move(m), "SavedTensor")
       .def(py::init([]() -> torch::autograd::SavedVariable {
         TORCH_CHECK(
             false,
diff --git a/torch/csrc/autograd/input_buffer.cpp b/torch/csrc/autograd/input_buffer.cpp
index 50d4c0ce0aa6..a8d8b9880faa 100644
--- a/torch/csrc/autograd/input_buffer.cpp
+++ b/torch/csrc/autograd/input_buffer.cpp
@@ -4,6 +4,7 @@
 #include <ATen/SparseCsrTensorUtils.h>
 #include <ATen/SparseTensorUtils.h>
 #include <ATen/TensorOperators.h>
+#include <ATen/TensorSubclassLikeUtils.h>
 
 #include <c10/core/DeviceGuard.h>
 #include <c10/core/Event.h>
@@ -66,6 +67,18 @@ void record_stream_any_impl(Variable& var, c10::Stream& stream) {
     }
   }
 }
+
+bool can_accumulate_inplace(const Variable& v) {
+  return (
+      // `v` is a "vanilla" Tensor
+      !(at::isTensorSubclassLike(v) || v._is_zerotensor() || v.is_nested()) &&
+
+      // with a favorable memory layout
+      v.is_non_overlapping_and_dense() &&
+
+      // and we hold the last reference
+      v.use_count() == 1 && v.has_storage() && v.storage().use_count() == 1);
+}
 } // anonymous namespace
 
 static void accumulate(
@@ -74,25 +87,38 @@ static void accumulate(
     Variable&& var) {
   TORCH_INTERNAL_ASSERT(pos < buffer.size());
   auto& old_var = buffer[pos];
-  // ATen doesn't route sparse additions correctly...
-  // do dense + sparse in-place if possible
-  if (old_var.is_sparse()) {
-    // It is safe to change the Tensor inplace if the Tensor is only used in
-    // this buffer (this could be the gradient passed by the user) and that no
-    // other Tensor is using the same storage.
-    if (!var.is_sparse() && var.is_contiguous() && var.use_count() == 1 &&
-        var.storage().use_count() == 1) {
+  // If we hold the last reference to `old_var` AND its storage we will try to
+  // repurpose it to store the output. (Or, if `old_var` is sparse then `var`
+  // becomes the candidate output Tensor.) We only do this if:
+  //  1) GradMode is disabled since Autograd has special handling for inplace
+  //     mutation which we don't want to trigger.
+  //
+  //  2) We hold the last reference.
+  //     (Both `.use_count` and `.storage().use_count()` are one)
+  //
+  //  3) The candidate tensor is a contiguous, non-overlapping, dense, and
+  //     otherwise stock standard Tensor.
+  //
+  //  4) The candidate is mutable. Currently only ZeroTensors are immutable.
+  //
+  //  5) The other Tensor is not a Tensor subclass (except sparse), since
+  //     it's hard to predict the semantics of arbitrary subclass behavior.
+
+  if (at::GradMode::is_enabled()) {
+    buffer[pos] = old_var + var;
+  } else if (
+      // ATen doesn't route sparse additions correctly...
+      old_var.is_sparse() || old_var.is_sparse_csr()) {
+    if (can_accumulate_inplace(var)) {
       buffer[pos] = var.add_(old_var);
     } else {
       buffer[pos] = var + old_var;
     }
+  } else if (
+      can_accumulate_inplace(old_var) && !at::isTensorSubclassLike(var)) {
+    buffer[pos] = old_var.add_(var);
   } else {
-    if (var.is_sparse() && !old_var.is_sparse() && old_var.is_contiguous() &&
-        old_var.use_count() == 1 && old_var.storage().use_count() == 1) {
-      buffer[pos] = old_var.add_(var);
-    } else {
-      buffer[pos] = old_var + var;
-    }
+    buffer[pos] = old_var + var;
   }
 }
 
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index ab29a32869ea..ef98d8f8b4db 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -27,6 +27,7 @@
 #include <limits>
 #include <sstream>
 #include <stdexcept>
+#include <utility>
 
 #ifdef USE_KINETO
 #include <libkineto.h>
@@ -217,24 +218,16 @@ struct AddGenericMetadata : public MetadataBase {
     addMetadata("Device Id", std::to_string(alloc.device_index_));
     addMetadata("Addr", std::to_string(reinterpret_cast<intptr_t>(alloc.ptr_)));
     addMetadata("Bytes", std::to_string(alloc.alloc_size_));
-    if (alloc.total_allocated_ >= 0) {
-      addMetadata("Total Allocated", std::to_string(alloc.total_allocated_));
-    }
-    if (alloc.total_reserved_ >= 0) {
-      addMetadata("Total Reserved", std::to_string(alloc.total_reserved_));
-    }
+    addMetadata("Total Allocated", std::to_string(alloc.total_allocated_));
+    addMetadata("Total Reserved", std::to_string(alloc.total_reserved_));
   }
 
   void operator()(const ExtraFields<EventType::OutOfMemory>& alloc) {
     addMetadata("Device Type", std::to_string((int8_t)alloc.device_type_));
     addMetadata("Device Id", std::to_string(alloc.device_index_));
     addMetadata("Bytes", std::to_string(alloc.alloc_size_));
-    if (alloc.total_allocated_ >= 0) {
-      addMetadata("Total Allocated", std::to_string(alloc.total_allocated_));
-    }
-    if (alloc.total_reserved_ >= 0) {
-      addMetadata("Total Reserved", std::to_string(alloc.total_reserved_));
-    }
+    addMetadata("Total Allocated", std::to_string(alloc.total_allocated_));
+    addMetadata("Total Reserved", std::to_string(alloc.total_reserved_));
   }
 
   template <typename T>
@@ -257,7 +250,7 @@ struct KinetoThreadLocalState : public ProfilerStateBase {
       std::set<torch::profiler::impl::ActivityType> activities)
       : ProfilerStateBase(config),
         start_time_(getTimeUs()),
-        record_queue_(config, activities) {}
+        record_queue_(config, std::move(activities)) {}
   ~KinetoThreadLocalState() override = default;
 
   static KinetoThreadLocalState* get(bool global) {
@@ -282,8 +275,8 @@ struct KinetoThreadLocalState : public ProfilerStateBase {
   void reportMemoryUsage(
       void* ptr,
       int64_t alloc_size,
-      int64_t total_allocated,
-      int64_t total_reserved,
+      size_t total_allocated,
+      size_t total_reserved,
       c10::Device device) override {
     if (config_.profile_memory && !config_.disabled()) {
       record_queue_.getSubqueue()->emplace_allocation_event(
@@ -299,8 +292,8 @@ struct KinetoThreadLocalState : public ProfilerStateBase {
 
   void reportOutOfMemory(
       int64_t alloc_size,
-      int64_t total_allocated,
-      int64_t total_reserved,
+      size_t total_allocated,
+      size_t total_reserved,
       c10::Device device) override {
     if (config_.profile_memory && !config_.disabled()) {
       record_queue_.getSubqueue()->emplace_ooms_event(
@@ -329,7 +322,7 @@ struct KinetoThreadLocalState : public ProfilerStateBase {
     std::lock_guard<std::mutex> guard(state_mutex_);
     auto converter = clock_converter_.makeConverter();
     auto records_and_trace =
-        record_queue_.getRecords(converter, start_time_, end_time);
+        record_queue_.getRecords(std::move(converter), start_time_, end_time);
 
     materializeOpEvents(records_and_trace.first);
 
@@ -576,7 +569,7 @@ void prepareProfiler(
   torch::profiler::impl::kineto::prepareTrace(
       /*cpuOnly=*/!at::hasCUDA(), activities, config.experimental_config);
 
-  if (config.experimental_config.performance_events.size()) {
+  if (!config.experimental_config.performance_events.empty()) {
     /* For now only CPU activity is supported */
     TORCH_CHECK(
         activities.count(torch::autograd::profiler::ActivityType::CPU),
diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp
index f77b4f5928b3..35b8fac7e876 100644
--- a/torch/csrc/autograd/profiler_legacy.cpp
+++ b/torch/csrc/autograd/profiler_legacy.cpp
@@ -153,8 +153,8 @@ struct ProfilerLegacyThreadLocalState : public ProfilerStateBase {
   void reportMemoryUsage(
       void* /* unused */,
       int64_t alloc_size,
-      int64_t /* total_allocated, unused for legacy */,
-      int64_t /* total_reserved, unused for legacy */,
+      size_t /* total_allocated, unused for legacy */,
+      size_t /* total_reserved, unused for legacy */,
       c10::Device device) override;
 
   ActiveProfilerType profilerType() override {
@@ -300,8 +300,8 @@ void ProfilerLegacyThreadLocalState::popRange(
 void ProfilerLegacyThreadLocalState::reportMemoryUsage(
     void* /* unused */,
     int64_t alloc_size,
-    int64_t /* total_allocated, unused for legacy */,
-    int64_t /* total_reserved, unused for legacy */,
+    size_t /* total_allocated, unused for legacy */,
+    size_t /* total_reserved, unused for legacy */,
     c10::Device device) {
   if (config_.profile_memory && !config_.disabled()) {
     uint64_t thread_id = at::RecordFunction::currentThreadId();
diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp
index 8be0c1475b1e..6216281ecb6c 100644
--- a/torch/csrc/autograd/profiler_python.cpp
+++ b/torch/csrc/autograd/profiler_python.cpp
@@ -41,6 +41,7 @@ namespace {
 enum CallType { PyCall = 0, PyModuleCall, PyCCall, PyOptimizerCall };
 static constexpr size_t CallTypeSize = 4;
 using no_ephemeral_t = std::tuple<>;
+static constexpr uint64_t NoTID = std::numeric_limits<uint64_t>::max();
 
 // ============================================================================
 // == Miscellaneous structs and utils =========================================
@@ -403,7 +404,7 @@ void ValueCache::store<CallType::PyModuleCall>(
              recordIfTensor(py::getattr(it.second, "grad", py::none()))});
       }
     }
-    cache.cls_and_parameters_[key] = {cls, params_};
+    cache.cls_and_parameters_[key] = {cls, std::move(params_)};
   }
 }
 
@@ -450,7 +451,7 @@ void ValueCache::store<CallType::PyOptimizerCall>(
       }
     }
 
-    cache.cls_and_parameters_[key] = {cls, params};
+    cache.cls_and_parameters_[key] = {cls, std::move(params)};
   }
 }
 
@@ -600,6 +601,29 @@ static PyTypeObject TraceContextType = {
     nullptr /* tp_free */
 };
 
+class gil_and_restore_thread {
+ public:
+  gil_and_restore_thread()
+      : gil_(), initial_thread_state_{PyThreadState_Get()} {}
+  ~gil_and_restore_thread() {
+    PyThreadState_Swap(initial_thread_state_);
+
+    // `gil_scoped_acquire` is a bit fragile in on-demand mode:
+    // https://github.com/pytorch/pytorch/pull/91684#issuecomment-1413154458
+    if (!Py_IsInitialized()) {
+      gil_.disarm();
+    }
+  }
+
+  PyThreadState* initial_thread_state() const {
+    return initial_thread_state_;
+  }
+
+ private:
+  pybind11::gil_scoped_acquire gil_;
+  PyThreadState* initial_thread_state_;
+};
+
 // ============================================================================
 // == Thread local cache ======================================================
 // ============================================================================
@@ -666,26 +690,53 @@ class PythonTracer final : public python_tracer::PythonTracerBase {
       std::vector<python_tracer::CompressedEvent>& enters,
       time_t end_time_ns) override;
 
+  struct StartFrame {
+    TraceKey trace_key_;
+    approx_time_t start_time;
+  };
+
  private:
-  void recordPyCall(ThreadLocalResults& tls, PyFrameObject* frame);
+  void recordPyCall(
+      ThreadLocalResults& tls,
+      PyFrameObject* frame,
+      bool is_startup_frame);
+
   void recordCCall(
       ThreadLocalResults& tls,
       PyFrameObject* frame,
       PyObject* arg);
 
+  const std::vector<PyThreadState*> interpreterThreads() const;
+
   std::atomic<bool> active_lock_{false};
   bool active_{false};
 
   torch::profiler::impl::RecordQueue* queue_;
+  PyInterpreterState* interpreter_;
   PyCodeObject* module_call_code_;
   PyCodeObject* optimizer_hook_;
 
+  std::vector<StartFrame> start_frames_;
   std::deque<ThreadLocalResults> thread_local_results_;
   ValueCache value_cache_;
 };
 
+const std::vector<PyThreadState*> PythonTracer::interpreterThreads() const {
+  pybind11::gil_scoped_acquire gil;
+  std::vector<PyThreadState*> out;
+  if (SOFT_ASSERT(interpreter_)) {
+    auto* thread_state = PyInterpreterState_ThreadHead(interpreter_);
+    while (thread_state != nullptr) {
+      out.push_back(thread_state);
+      thread_state = PyThreadState_Next(thread_state);
+    }
+  }
+  return out;
+}
+
 PythonTracer::PythonTracer(torch::profiler::impl::RecordQueue* queue)
     : queue_(queue),
+      interpreter_(nullptr),
       module_call_code_(getCode<CallType::PyModuleCall>()),
       optimizer_hook_(getCode<CallType::PyOptimizerCall>()) {
   TORCH_CHECK(queue_ != nullptr);
@@ -699,29 +750,16 @@ PythonTracer::PythonTracer(torch::profiler::impl::RecordQueue* queue)
     return;
   }
 
-  pybind11::gil_scoped_acquire gil;
+  gil_and_restore_thread gil;
+  interpreter_ = PyInterpreterState_Get();
 
-  // Loop over all threads within the current interpreter. We will need to
-  // register a trace function with each thread. We set the current thread to
-  // position zero to ensure that it is traced, and so we can restore the
-  // thread state after registration. The profiler cannot post process multiple
-  // python threads yet, so this section is temporarily disabled.
-  std::vector<PyThreadState*> thread_states{PyThreadState_Get()};
-  /*
-  if (all_threads) {
-    auto thread_state = thread_states[0];
-    while (thread_state != nullptr) {
-      if (thread_state != thread_states[0]) {
-        thread_states.push_back(thread_state);
-      }
-      thread_state = PyThreadState_Next(thread_state);
-    }
+  if (!gil.initial_thread_state()) {
+    TORCH_WARN("PyThreadState_Get returned NULL");
+    return;
   }
-  */
 
   // Register the tracer in each thread.
-  for (const auto i : c10::irange(thread_states.size())) {
-    PyThreadState* thread_state = thread_states[i];
+  for (const auto thread_state : interpreterThreads()) {
     PyThreadState_Swap(thread_state);
 
     thread_local_results_.emplace_back(thread_state, &value_cache_, this);
@@ -730,18 +768,29 @@ PythonTracer::PythonTracer(torch::profiler::impl::RecordQueue* queue)
     // When we begin profiling there are already frames on the Python
     // interpreter stack. To ensure a complete trace, we must push calls
     // to all the prior frames onto our event stack. (We stop at depth=128)
-    std::vector<PyFrameObject*> current_stack;
+
+    std::vector<THPFrameObjectPtr> current_stack;
     auto frame = PyEval_GetFrame();
+    Py_XINCREF(frame);
+
     size_t depth = 0; // Make sure we can't infinite loop.
-    while (frame != nullptr && depth <= 128) {
-      Py_INCREF(frame);
-      current_stack.push_back(frame);
+    while (frame != nullptr) {
+      current_stack.emplace_back(frame);
+      if (++depth == 128) {
+        break;
+      }
+
+      // NB: `PyFrame_GetBack` returns a strong reference.
       frame = PyFrame_GetBack(frame);
-      depth++;
     }
+
     for (auto it = current_stack.rbegin(); it != current_stack.rend(); it++) {
-      recordPyCall(thread_local_results_.back(), *it);
-      Py_DECREF(*it);
+      recordPyCall(thread_local_results_.back(), it->get(), true);
+      auto frame_refcount = Py_REFCNT(it->get());
+
+      // We hold one reference in `current_stack`, and the interpreter holds
+      // another.
+      TORCH_INTERNAL_ASSERT(frame_refcount >= 2, frame_refcount);
     }
 
     // Note:
@@ -749,20 +798,17 @@ PythonTracer::PythonTracer(torch::profiler::impl::RecordQueue* queue)
     //   cannot be round tripped via `sys.settrace(sys.gettrace())`
     PyEval_SetProfile(PythonTracer::pyProfileFn, (PyObject*)ctx);
   }
-
-  // Restore the thread state to its initial value.
-  PyThreadState_Swap(thread_states[0]);
 };
 
 void PythonTracer::stop() {
-  pybind11::gil_scoped_acquire gil;
+  gil_and_restore_thread gil;
   if (active_) {
-    PyThreadState* initial_thread_state = PyThreadState_Get();
-    for (const auto& i : thread_local_results_) {
-      PyThreadState_Swap(i.thread_state_);
-      PyEval_SetProfile(nullptr, nullptr);
+    for (const auto thread_state : interpreterThreads()) {
+      if (thread_state->c_profilefunc == &PythonTracer::pyProfileFn) {
+        PyThreadState_Swap(thread_state);
+        PyEval_SetProfile(nullptr, nullptr);
+      }
     }
-    PyThreadState_Swap(initial_thread_state);
 
     auto lock_returned = active_lock_.compare_exchange_strong(active_, false);
     active_ = false;
@@ -777,9 +823,12 @@ PythonTracer::~PythonTracer() {
   }
 }
 
-void PythonTracer::recordPyCall(ThreadLocalResults& tls, PyFrameObject* frame) {
+void PythonTracer::recordPyCall(
+    ThreadLocalResults& tls,
+    PyFrameObject* frame,
+    bool is_startup_frame) {
   static constexpr auto E = EventType::PyCall;
-  auto get_key = [&]() -> TraceKey {
+  const auto key = [&]() -> TraceKey {
     auto code = THPCodeObjectPtr(PyFrame_GetCode(frame));
     if (code.get() == module_call_code_) {
       // By default, CPython stores locals in a "fast" format, with an array
@@ -811,8 +860,10 @@ void PythonTracer::recordPyCall(ThreadLocalResults& tls, PyFrameObject* frame) {
       auto f_back = (back.get() != nullptr) ? back.get() : frame;
       return tls.intern<CallType::PyCall, E>(no_ephemeral_t(), frame, f_back);
     }
-  };
-  queue_->getSubqueue()->emplace_py_call(get_key(), getApproximateTime());
+  }();
+  const auto time = getApproximateTime();
+  is_startup_frame ? start_frames_.push_back({key, time})
+                   : queue_->getSubqueue()->emplace_py_call(key, time);
 }
 
 void PythonTracer::recordCCall(
@@ -858,6 +909,18 @@ class PostProcess {
     }
   }
 
+  void set_start_frames(
+      const std::vector<PythonTracer::StartFrame>& start_frames,
+      std::vector<python_tracer::CompressedEvent>& enters) {
+    for (const auto& frame : start_frames) {
+      enters.push_back(
+          {frame.trace_key_,
+           NoTID, // Allows us to detect unhandled start frames
+           {},
+           time_converter_(frame.start_time)});
+    }
+  }
+
   template <CallType C>
   void operator()(
       const TraceKeyCacheState<C>& trace_cache,
@@ -895,6 +958,7 @@ class PostProcess {
       std::vector<python_tracer::CompressedEvent>& enters,
       std::vector<std::shared_ptr<Result>>& out) {
     using stack_t = std::vector<std::shared_ptr<Result>>;
+    const auto initial_size = out.size();
     auto pop = [](stack_t& stack, time_t t) {
       TORCH_INTERNAL_ASSERT(stack.size(), "Python replay stack is empty.");
       c10::get<ExtraFields<E>>(stack.back()->extra_fields_).end_time_ns_ = t;
@@ -928,6 +992,25 @@ class PostProcess {
         pop(i.second, end_time_);
       }
     }
+
+    // Assign system TIDs to start events based on the system TID of the next
+    // observed event with the same Python TID.
+    ska::flat_hash_map<size_t, std::pair<size_t, kineto::DeviceAndResource>>
+        tid_map;
+    auto it = out.rbegin();
+    for (C10_UNUSED auto _ : c10::irange(initial_size, out.size())) {
+      const auto python_tid =
+          c10::get<ExtraFields<E>>((*it)->extra_fields_).python_tid_;
+      if ((*it)->start_tid_ == NoTID && SOFT_ASSERT(E == EventType::PyCall)) {
+        const auto& tid_info =
+            tid_map.insert({python_tid, {NoTID, kineto::DeviceAndResource()}})
+                .first->second;
+        (*it)->start_tid_ = tid_info.first;
+        (*it)->kineto_info_ = tid_info.second;
+      }
+      tid_map[python_tid] = {(*it)->start_tid_, (*it)->kineto_info_};
+      ++it;
+    }
   }
 
   template <EventType E>
@@ -974,7 +1057,11 @@ std::vector<std::shared_ptr<Result>> PythonTracer::getEvents(
     time_t end_time_ns) {
   value_cache_.trimPrefixes();
   PostProcess post_process(
-      time_converter, thread_local_results_, value_cache_, end_time_ns);
+      std::move(time_converter),
+      thread_local_results_,
+      value_cache_,
+      end_time_ns);
+  post_process.set_start_frames(start_frames_, enters);
   auto out = post_process.run(enters);
 
   std::stable_sort(out.begin(), out.end(), [](const auto& a, const auto& b) {
@@ -1001,7 +1088,7 @@ int PythonTracer::pyProfileFn(
       *reinterpret_cast<TraceContext*>(obj)->thread_local_results_;
   switch (what) {
     case PyTrace_CALL:
-      local_results.active_tracer_->recordPyCall(local_results, frame);
+      local_results.active_tracer_->recordPyCall(local_results, frame, false);
       break;
 
     case PyTrace_C_CALL:
diff --git a/torch/csrc/autograd/python_anomaly_mode.h b/torch/csrc/autograd/python_anomaly_mode.h
index 6032940bfbaf..307040f28fac 100644
--- a/torch/csrc/autograd/python_anomaly_mode.h
+++ b/torch/csrc/autograd/python_anomaly_mode.h
@@ -10,9 +10,7 @@ namespace torch {
 namespace autograd {
 
 struct PyAnomalyMetadata : public AnomalyMetadata {
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,clang-diagnostic-writable-strings)
   static constexpr const char* ANOMALY_TRACE_KEY = "traceback_";
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,clang-diagnostic-writable-strings)
   static constexpr const char* ANOMALY_PARENT_KEY = "parent_";
 
   PyAnomalyMetadata() {
diff --git a/torch/csrc/autograd/python_cpp_function.cpp b/torch/csrc/autograd/python_cpp_function.cpp
index 29a12e5c6d32..7c9cf35e59fa 100644
--- a/torch/csrc/autograd/python_cpp_function.cpp
+++ b/torch/csrc/autograd/python_cpp_function.cpp
@@ -86,8 +86,9 @@ int THPCppFunction_traverse(PyObject* self, visitproc visit, void* arg) {
   // In theory this shouldn't be necessary, because retains_grad_hooks should
   // not contain any PyFunctionTensorPreHooks. The alternative is to have a
   // check that actually guarantees this.
-  for (const auto& hook : fn.retains_grad_hooks()) {
-    if (auto pyhook = dynamic_cast<PyFunctionTensorPreHook*>(hook.get())) {
+  for (const auto& pair : fn.retains_grad_hooks()) {
+    if (auto pyhook =
+            dynamic_cast<PyFunctionTensorPreHook*>(pair.second.get())) {
       Py_VISIT(pyhook->dict);
     }
   }
@@ -147,10 +148,10 @@ PyObject* THPCppFunction_next_functions(THPCppFunction* self, PyObject* hook) {
 }
 
 PyObject* THPCppFunction_metadata(THPCppFunction* self, void* _unused) {
-  auto metadata =
+  auto* metadata =
       static_cast<PyAnomalyMetadata*>(self->cdata->metadata())->dict();
 
-  Py_INCREF(metadata);
+  Py_XINCREF(metadata);
   return metadata;
 }
 
diff --git a/torch/csrc/autograd/python_engine.cpp b/torch/csrc/autograd/python_engine.cpp
index dc365c170008..04aaa85c6c46 100644
--- a/torch/csrc/autograd/python_engine.cpp
+++ b/torch/csrc/autograd/python_engine.cpp
@@ -22,6 +22,7 @@
 
 #include <memory> // for unique_ptr
 #include <unordered_set>
+#include <utility>
 
 using namespace torch::autograd;
 
@@ -108,7 +109,7 @@ void PythonEngine::thread_on_exception(
   if (python_err) {
     python_err->persist();
   }
-  Engine::thread_on_exception(graph_task, fn, e);
+  Engine::thread_on_exception(std::move(graph_task), fn, e);
 }
 
 std::unique_ptr<AnomalyMetadata> PythonEngine::make_anomaly_metadata() {
@@ -148,7 +149,7 @@ c10::intrusive_ptr<at::ivalue::Future> PythonEngine::execute_with_graph_task(
     InputBuffer&& input_buffer) {
   try {
     return Engine::execute_with_graph_task(
-        graph_task, graph_root, std::move(input_buffer));
+        graph_task, std::move(graph_root), std::move(input_buffer));
   } catch (python_error& e) {
     pybind11::gil_scoped_acquire gil;
     if (!PyErr_Occurred()) {
@@ -178,20 +179,20 @@ PyObject* THPEngine_run_backward(
   unsigned char allow_unreachable = 0;
   unsigned char accumulate_grad =
       0; // Indicate whether to accumulate grad into leaf Tensors or capture
-  const char* accepted_kwargs[] = {// NOLINT
-                                   "tensors",
-                                   "grad_tensors",
-                                   "keep_graph",
-                                   "create_graph",
-                                   "inputs",
-                                   "allow_unreachable",
-                                   "accumulate_grad",
-                                   nullptr};
+  constexpr char* accepted_kwargs[] = {// NOLINT
+                                       "tensors",
+                                       "grad_tensors",
+                                       "keep_graph",
+                                       "create_graph",
+                                       "inputs",
+                                       "allow_unreachable",
+                                       "accumulate_grad",
+                                       nullptr};
   if (!PyArg_ParseTupleAndKeywords(
           args,
           kwargs,
           "OObb|Obb",
-          (char**)accepted_kwargs,
+          const_cast<char**>(accepted_kwargs),
           &tensors,
           &grad_tensors,
           &keep_graph,
diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp
index 0cb46ee001a5..ba6331ed5ff9 100644
--- a/torch/csrc/autograd/python_function.cpp
+++ b/torch/csrc/autograd/python_function.cpp
@@ -219,8 +219,9 @@ static int THPFunction_traverse(THPFunction* self, visitproc visit, void* arg) {
       }
     }
     // See NOTE [retains_grad_hook PyObject traversal]
-    for (const auto& hook : cdata->retains_grad_hooks()) {
-      if (auto pyhook = dynamic_cast<PyFunctionTensorPreHook*>(hook.get())) {
+    for (const auto& pair : cdata->retains_grad_hooks()) {
+      if (auto pyhook =
+              dynamic_cast<PyFunctionTensorPreHook*>(pair.second.get())) {
         Py_VISIT(pyhook->dict);
       }
     }
@@ -461,7 +462,7 @@ static void _wrap_outputs(
       dirty_inputs,
       raw_output_vars,
       cdata_if_executable,
-      jvp_user_function);
+      std::move(jvp_user_function));
 
   for (const auto i : c10::irange(num_outputs)) {
     PyObject* obj = PyTuple_GetItem(raw_output, i);
@@ -709,7 +710,7 @@ static void _trace_post_record(
     auto tuple_type = at::TupleType::create(std::move(tuple_values));
     // Original type is tuple of tensors "without" element type and shape.
     // The missed parts will be added below.
-    node->output()->setType(tuple_type);
+    node->output()->setType(std::move(tuple_type));
     auto unpacked = graph->createTupleUnpack(node->output())->insertAfter(node);
     node = unpacked;
   }
@@ -730,7 +731,7 @@ static void _trace_post_record(
   py::bool_ is_in_onnx_export =
       py::module::import("torch.onnx.__init__").attr("is_in_onnx_export");
   if (py::cast<bool>(is_in_onnx_export)) {
-    _append_subgraph(old_node, graph, trace_outputs, unpack_output);
+    _append_subgraph(old_node, graph, std::move(trace_outputs), unpack_output);
   }
 
   // If TupleUnpack operator is created, we copy its output type back
@@ -744,7 +745,7 @@ static void _trace_post_record(
     auto tuple_type = at::TupleType::create(std::move(new_tuple_values));
     // The i-th tuple element receives a new tensor type with element type and
     // shape.
-    old_node->output()->setType(tuple_type);
+    old_node->output()->setType(std::move(tuple_type));
   }
 }
 
diff --git a/torch/csrc/autograd/python_legacy_variable.cpp b/torch/csrc/autograd/python_legacy_variable.cpp
index 4da5333546ba..2ae6d646be68 100644
--- a/torch/csrc/autograd/python_legacy_variable.cpp
+++ b/torch/csrc/autograd/python_legacy_variable.cpp
@@ -26,13 +26,13 @@ static PyObject* THPVariable_pynew(
   const char* name = nullptr;
 
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
-  const char* accepted_args[] = {
+  constexpr char* accepted_args[] = {
       "data", "requires_grad", "volatile", "_grad_fn", "name", nullptr};
   if (!PyArg_ParseTupleAndKeywords(
           args,
           kwds,
           "|ObbOz",
-          (char**)accepted_args,
+          const_cast<char**>(accepted_args),
           &data,
           &requires_grad,
           &is_volatile,
diff --git a/torch/csrc/autograd/python_saved_variable_hooks.cpp b/torch/csrc/autograd/python_saved_variable_hooks.cpp
index 8f8027f663ba..30ffd9b55c52 100644
--- a/torch/csrc/autograd/python_saved_variable_hooks.cpp
+++ b/torch/csrc/autograd/python_saved_variable_hooks.cpp
@@ -14,32 +14,34 @@ PySavedVariableHooks::PySavedVariableHooks(
       pack_hook_(pack_hook.release().ptr()),
       unpack_hook_(unpack_hook.release().ptr()) {}
 
+// We don't use pybind for call_pack_hook and call_unpack_hook to avoid
+// https://github.com/pytorch/pytorch/issues/34172
 void PySavedVariableHooks::call_pack_hook(const at::Tensor& tensor) {
   py::gil_scoped_acquire acquire;
-  auto pack_hook = py::reinterpret_borrow<py::function>(pack_hook_);
-  auto wrapped = THPVariable_Wrap(tensor);
-  py::object obj = py::reinterpret_steal<py::object>(wrapped);
-  py::object packed = pack_hook(obj);
-  data_ = packed.release().ptr();
-  // pack_hook, obj are decrefed on exit
-  // wrapped and packed had their references stolen
+  THPObjectPtr obj(THPVariable_Wrap(tensor));
+  THPObjectPtr packed(
+      PyObject_CallFunctionObjArgs(pack_hook_, obj.get(), nullptr));
+  if (!packed) {
+    throw python_error();
+  }
+  data_ = packed.release();
+  // obj is decrefed on exit, packed has their references stolen
   // pack_hook_ and data_ will be manually decrefed when the saved variable is
   // released
 }
 
 at::Tensor PySavedVariableHooks::call_unpack_hook() {
   py::gil_scoped_acquire acquire;
-  auto unpack_hook = py::reinterpret_borrow<py::function>(unpack_hook_);
-  py::object obj = py::cast<py::object>(data_);
-  py::object res = unpack_hook(obj);
-  PyObject* ptr = res.ptr();
+  THPObjectPtr res(PyObject_CallFunctionObjArgs(unpack_hook_, data_, nullptr));
+  if (!res) {
+    throw python_error();
+  }
   TORCH_CHECK_TYPE(
-      THPVariable_Check(ptr),
+      THPVariable_Check(res),
       "Output of saved tensor unpack_hook expected to be a Tensor but got result of type ",
-      THPUtils_typename(ptr));
-  return THPVariable_Unpack(ptr);
-  // unpack_hook, obj and res are decrefed on exit
-  // ptr is only alive as long as res is
+      THPUtils_typename(res));
+  return THPVariable_Unpack(res);
+  // res is decrefed on exit
   // unpack_hook_ will be manually decrefed when the saved variable is released
 }
 
diff --git a/torch/csrc/autograd/python_torch_functions_manual.cpp b/torch/csrc/autograd/python_torch_functions_manual.cpp
index 537b77e8e523..a49d0db5d0d7 100644
--- a/torch/csrc/autograd/python_torch_functions_manual.cpp
+++ b/torch/csrc/autograd/python_torch_functions_manual.cpp
@@ -25,20 +25,14 @@
 #include <Python.h>
 #include <fmt/format.h>
 #include <pybind11/pybind11.h>
+#include <utility>
 #include <vector>
 
-using at::ArrayRef;
-using at::Backend;
-using at::Device;
 using at::DeviceGuard;
-using at::Dimname;
 using at::DimnameList;
-using at::Generator;
 using at::IntArrayRef;
-using at::Layout;
 using at::OptionalDeviceGuard;
 using at::Scalar;
-using at::ScalarType;
 using at::Tensor;
 using at::TensorList;
 using at::TensorOptions;
@@ -387,7 +381,7 @@ static PyObject* THPVariable__to_functional_tensor(
       }
     }
   }
-  return wrap(wrapped);
+  return wrap(std::move(wrapped));
   END_HANDLE_TH_ERRORS
 }
 
@@ -403,7 +397,7 @@ static PyObject* THPVariable__from_functional_tensor(
   auto r = parser.parse(args, kwargs, parsed_args);
   auto self_ = r.tensor(0);
   auto unwrapped = at::functionalization::impl::from_functional_tensor(self_);
-  return wrap(unwrapped);
+  return wrap(std::move(unwrapped));
   END_HANDLE_TH_ERRORS
 }
 
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index a25147cd5b77..4a267a7956db 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -1,25 +1,20 @@
 #include <ATen/NamedTensorUtils.h>
-#include <ATen/core/PythonFallbackKernel.h>
-#include <ATen/core/PythonOpRegistrationTrampoline.h>
 #include <c10/core/DeviceType.h>
-#include <c10/core/SafePyObject.h>
 #include <c10/core/impl/GPUTrace.h>
 #include <c10/core/impl/HermeticPyObjectTLS.h>
 #include <c10/core/impl/PythonDispatcherTLS.h>
-#include <c10/util/DeadlockDetection.h>
 #include <c10/util/irange.h>
 #include <pybind11/pytypes.h>
 #include <torch/csrc/Device.h>
 #include <torch/csrc/DynamicTypes.h>
 #include <torch/csrc/Exceptions.h>
+#include <torch/csrc/PyInterpreter.h>
 #include <torch/csrc/Size.h>
 #include <torch/csrc/THP.h>
 #include <torch/csrc/Types.h>
 #include <torch/csrc/autograd/autograd.h>
 #include <torch/csrc/autograd/edge.h>
 #include <torch/csrc/autograd/function.h>
-#include <torch/csrc/autograd/functions/accumulate_grad.h>
-#include <torch/csrc/autograd/generated/VariableType.h>
 #include <torch/csrc/autograd/python_cpp_function.h>
 #include <torch/csrc/autograd/python_hook.h>
 #include <torch/csrc/autograd/python_variable_indexing.h>
@@ -29,20 +24,15 @@
 #include <torch/csrc/jit/frontend/tracer.h>
 #include <torch/csrc/jit/python/pybind_utils.h>
 #include <torch/csrc/tensor/python_tensor.h>
-#include <torch/csrc/utils/cuda_lazy_init.h>
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/pycfunction_helpers.h>
 #include <torch/csrc/utils/python_arg_parser.h>
 #include <torch/csrc/utils/python_dispatch.h>
-#include <torch/csrc/utils/python_numbers.h>
 #include <torch/csrc/utils/python_strings.h>
-#include <torch/csrc/utils/tensor_memoryformats.h>
 #include <torch/csrc/utils/tensor_new.h>
 #include <torch/csrc/utils/tensor_numpy.h>
 
-#include <torch/csrc/jit/python/pybind_utils.h>
 #include <torch/csrc/utils/torch_dispatch_mode.h>
-#include <torch/library.h>
 
 #include <ATen/ATen.h>
 
@@ -179,178 +169,20 @@ void pushPyOutToStack(
         " to return None but it returned something else instead.");
   } else if (num_returns == 1) {
     torch::jit::push(
-        stack, torch::jit::toIValue(out.ptr(), schema_returns[0].type()));
+        stack, torch::jit::toIValue(out.ptr(), schema_returns[0].real_type()));
   } else {
     auto outs = py::cast<py::sequence>(out);
     for (const auto idx : c10::irange(outs.size())) {
       torch::jit::push(
           stack,
-          torch::jit::toIValue(outs[idx].ptr(), schema_returns[idx].type()));
+          torch::jit::toIValue(
+              outs[idx].ptr(), schema_returns[idx].real_type()));
     }
   }
 }
 
 namespace {
 
-// NB: This is a macro and not a template function (like it was before)
-// because passing in constexpr char* as template argument breaks some
-// versions of MSVC that are being used internally at Meta.
-// MSVC 14.16.27023 (vs2017_15.9)
-#define CONCRETE_TRACE_CUDA(func_name, ...)                           \
-  at::impl::MaybeSetTLSOnEntryGuard guard;                            \
-  if (Py_IsInitialized()) {                                           \
-    pybind11::gil_scoped_acquire gil;                                 \
-    try {                                                             \
-      py::module mod = py::module::import("torch.utils._cuda_trace"); \
-      py::object hook = mod.attr(func_name).attr("fire_callbacks");   \
-      hook(__VA_ARGS__);                                              \
-    } catch (const std::exception& e) {                               \
-      LOG(ERROR) << "CUDA trace hook execution failed: " << e.what(); \
-    }                                                                 \
-  }
-
-struct ConcretePyInterpreterVTable final
-    : public c10::impl::PyInterpreterVTable {
-  std::string name() const override;
-
-  void decref(PyObject* pyobj, bool is_tensor) const override;
-
-  c10::intrusive_ptr<TensorImpl> detach(const TensorImpl* self) const override;
-
-  void dispatch(const c10::OperatorHandle& op, torch::jit::Stack* stack)
-      const override;
-  void python_dispatcher(
-      const c10::OperatorHandle& op,
-      c10::DispatchKeySet,
-      torch::jit::Stack* stack) const override;
-  // NB: this is defined in python_dispatch.cpp
-  void python_op_registration_trampoline(
-      const c10::OperatorHandle& op,
-      c10::DispatchKey key,
-      torch::jit::Stack* stack) const override {
-    torch::impl::dispatch::python_op_registration_trampoline_impl(
-        op, key, stack);
-  }
-
-  bool is_contiguous(const TensorImpl* self, at::MemoryFormat) const override;
-  bool is_strides_like(const TensorImpl* self, at::MemoryFormat) const override;
-  bool is_non_overlapping_and_dense(const TensorImpl* self) const override;
-  c10::Device device(const TensorImpl* self) const override;
-  int64_t dim(const TensorImpl* self) const override;
-  c10::IntArrayRef strides(const TensorImpl* self) const override;
-  c10::IntArrayRef sizes(const TensorImpl* self) const override;
-  c10::SymIntArrayRef sym_sizes(const TensorImpl* self) const override;
-  c10::Layout layout(const TensorImpl* self) const override;
-  c10::SymInt sym_numel(const TensorImpl* self) const override;
-  c10::SymIntArrayRef sym_strides(const TensorImpl* self) const override;
-  c10::SymInt sym_storage_offset(const TensorImpl* self) const override;
-
-  void trace_gpu_event_creation(uintptr_t event) const override {
-    CONCRETE_TRACE_CUDA("CUDAEventCreationCallbacks", event);
-  }
-  void trace_gpu_event_deletion(uintptr_t event) const override {
-    CONCRETE_TRACE_CUDA("CUDAEventDeletionCallbacks", event);
-  }
-  void trace_gpu_event_record(uintptr_t event, uintptr_t stream)
-      const override {
-    CONCRETE_TRACE_CUDA("CUDAEventRecordCallbacks", event, stream);
-  }
-  void trace_gpu_event_wait(uintptr_t event, uintptr_t stream) const override {
-    CONCRETE_TRACE_CUDA("CUDAEventWaitCallbacks", event, stream);
-  }
-  void trace_gpu_memory_allocation(uintptr_t ptr) const override {
-    CONCRETE_TRACE_CUDA("CUDAMemoryAllocationCallbacks", ptr);
-  }
-  void trace_gpu_memory_deallocation(uintptr_t ptr) const override {
-    CONCRETE_TRACE_CUDA("CUDAMemoryDeallocationCallbacks", ptr);
-  }
-  void trace_gpu_stream_creation(uintptr_t stream) const override {
-    CONCRETE_TRACE_CUDA("CUDAStreamCreationCallbacks", stream);
-  }
-  void trace_gpu_device_synchronization() const override {
-    CONCRETE_TRACE_CUDA("CUDADeviceSynchronizationCallbacks");
-  }
-  void trace_gpu_stream_synchronization(uintptr_t stream) const override {
-    CONCRETE_TRACE_CUDA("CUDAStreamSynchronizationCallbacks", stream);
-  }
-  void trace_gpu_event_synchronization(uintptr_t event) const override {
-    CONCRETE_TRACE_CUDA("CUDAEventSynchronizationCallbacks", event);
-  }
-
-  void reset_backward_hooks(const TensorImpl* self) const override;
-
-  static ConcretePyInterpreterVTable* instance() {
-    static ConcretePyInterpreterVTable s;
-    return &s;
-  }
-};
-
-// NOTE [PyInterpreter::decref takes an `is_tensor` arg]
-// Before calling PyInterpreter::decref, we must statically know if the
-// pyobj is a Tensor or not.
-// - If it is a tensor, we need to be careful about PyObject resurrection
-// - If it is not a tensor, we can freely decref
-// One alternative to this is using PyObject_IsInstance
-// to get at this information. However, we don't want to risk an incorrect
-// `__instancecheck__` changing the semantics here.
-void ConcretePyInterpreterVTable::decref(PyObject* pyobj, bool is_tensor)
-    const {
-  // Leak the pyobj if not initialized.  This can happen if we are running
-  // exit handlers that are destructing tensors with residual (owned)
-  // PyObjects stored in them.
-  if (!Py_IsInitialized())
-    return;
-
-  pybind11::gil_scoped_acquire gil;
-  // Two possibilities:
-  // 1. We are decref-ing a tensor. Then we must be careful about
-  // PyObject resurrection (this only applies to Tensors, see
-  // THPVariable_clear).
-  // 2. We are decref-ing some other Python object. We don't do
-  // PyObject resurrection on non-Tensors, so we just carry on as usual
-  if (is_tensor && Py_REFCNT(pyobj) > 1) {
-    // It's still alive!  This can happen if a weak ref resurrected
-    // the PyObject without flipping ownership.  At this point it is
-    // too late to rescue the object, so just stub out the PyObject
-    // so that it fails on subsequent uses.  Don't raise an error here;
-    // you're probably in a destructor.
-    TORCH_WARN(
-        "Deallocating Tensor that still has live PyObject references.  "
-        "This probably happened because you took out a weak reference to "
-        "Tensor and didn't call _fix_weakref() after dereferencing it.  "
-        "Subsequent accesses to this tensor via the PyObject will now fail.");
-    ((THPVariable*)pyobj)->cdata = MaybeOwned<Variable>();
-  }
-  Py_DECREF(pyobj);
-};
-
-class PyInterpreterHolder {
- public:
-  PyInterpreterHolder()
-      : impl_(new c10::impl::PyInterpreter(
-            ConcretePyInterpreterVTable::instance())) {
-    is_main_interpreter_ =
-        at::impl::PythonOpRegistrationTrampoline::registerInterpreter(impl_);
-  }
-  // NB: intentionally leaks the PyInterpreter, as there may still be
-  // references to it that are live, living in objects that aren't being
-  // destructed while Python is being cleaned up.
-  ~PyInterpreterHolder() {
-    impl_->disarm();
-  }
-  c10::impl::PyInterpreter* get() const noexcept {
-    return impl_;
-  }
-  bool is_main_interpreter() const noexcept {
-    return is_main_interpreter_;
-  }
-
- private:
-  c10::impl::PyInterpreter* impl_;
-  bool is_main_interpreter_;
-};
-PyInterpreterHolder self_interpreter;
-
 c10::TensorImpl::SizesStridesPolicy parseSizesStridesPolicyArgument(
     c10::string_view arg) {
   if (arg == "strides") {
@@ -369,20 +201,6 @@ c10::TensorImpl::SizesStridesPolicy parseSizesStridesPolicyArgument(
 }
 } // anonymous namespace
 
-c10::impl::PyInterpreter* getPyInterpreter() {
-  return self_interpreter.get();
-}
-
-bool isMainPyInterpreter() {
-  return self_interpreter.is_main_interpreter();
-}
-
-std::string ConcretePyInterpreterVTable::name() const {
-  std::stringstream ss;
-  ss << getPyInterpreter();
-  return ss.str();
-}
-
 PyObject* THPVariableClass = nullptr;
 
 PyObject* ParameterClass = nullptr;
@@ -390,7 +208,8 @@ PyObject* ParameterClass = nullptr;
 static PyObject* THPVariable_NewWithVar(
     PyTypeObject* type,
     Variable _var,
-    c10::impl::PyInterpreterStatus status);
+    c10::impl::PyInterpreterStatus status,
+    bool allow_preexisting_pyobj = false);
 
 // clang-tidy gets confused by static const
 static const char* VOLATILE_WARNING =
@@ -432,7 +251,7 @@ static PyObject* getPythonTensorClass(c10::Device d) {
 }
 
 void activateCUDATrace() {
-  c10::impl::GPUTrace::set_trace(self_interpreter.get());
+  c10::impl::GPUTrace::set_trace(getPyInterpreter());
 }
 
 // TODO: Make this take Variable by const reference
@@ -449,8 +268,7 @@ PyObject* THPVariable_Wrap(at::TensorBase var) {
   }
 
   c10::optional<PyObject*> mb_obj =
-      var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-          self_interpreter.get());
+      var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(getPyInterpreter());
   c10::impl::PyInterpreterStatus status;
   if (mb_obj.has_value()) {
     auto obj = *mb_obj;
@@ -524,7 +342,7 @@ bool isResurrectable(THPVariable* self) {
   auto const& tensor = THPVariable_Unpack(self);
   // Check if this is hermetic. If it is, no resurrection.
   if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-          self_interpreter.get()) != c10::make_optional((PyObject*)self)) {
+          getPyInterpreter()) != c10::make_optional((PyObject*)self)) {
     return false;
   }
   if (!tensor.defined() || tensor.use_count() <= 1) {
@@ -624,7 +442,7 @@ static int THPVariable_clear(THPVariable* self) {
 
     if (!self->cdata.unsafeIsBorrowed() &&
         tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-            self_interpreter.get()) == c10::make_optional((PyObject*)self)) {
+            getPyInterpreter()) == c10::make_optional((PyObject*)self)) {
       // TODO: empirically, on OS X this assert appears to be untrue
       // In test_py_tensors_multi_async_call - ProcessGroupRpcTestWithSpawn
       // distributed/rpc/test_process_group_agent.py
@@ -718,7 +536,7 @@ static PyObject* THPVariable_view_func(PyObject* self_, PyObject* arg) {
       }
     }
   }
-  return THPVariable_Wrap(out);
+  return THPVariable_Wrap(std::move(out));
   END_HANDLE_TH_ERRORS
 }
 
@@ -1283,7 +1101,7 @@ PyObject* THPVariable_get_name(THPVariable* self, void* unused) {
     END_HANDLE_TH_ERRORS
   }
   const auto& tensor = THPVariable_Unpack(self);
-  if (tensor.name() == "")
+  if (tensor.name().empty())
     Py_RETURN_NONE;
   return THPUtils_packString(tensor.name().c_str());
 }
@@ -1686,15 +1504,6 @@ static PyMethodDef extra_methods[] = {
     {"_view_func", THPVariable_view_func, METH_O, nullptr},
     {nullptr}};
 
-/* From https://github.com/python/cpython/blob/v3.7.0/Modules/xxsubtype.c
-   If compiled as a shared library instead, some compilers don't allow addresses
-   of Python objects defined in other libraries to be used in static
-   initializers here.  The DEFERRED_ADDRESS macro is used to tag the slots where
-   such addresses appear; the module init function must fill in the tagged slots
-   at runtime.  The argument is for documentation -- the macro ignores it.
-*/
-#define DEFERRED_ADDRESS(ADDR) nullptr
-
 struct THPVariableMeta {
   PyHeapTypeObject base;
 };
@@ -1804,10 +1613,14 @@ PyObject* THPVariable_pynew(
   auto tensor = torch::utils::base_tensor_ctor(args, kwargs);
   // WARNING: tensor is NOT guaranteed to be a fresh tensor; e.g., if it was
   // given a raw pointer that will refcount bump
+  // NB: base_tensor_ctor can call into dispatched ATen functions (e.g.,
+  // alias(), lift_fresh()) which can return Tensor subclasses.  We allow
+  // these to be passed on directly.
   return THPVariable_NewWithVar(
       type,
       std::move(tensor),
-      c10::impl::PyInterpreterStatus::MAYBE_UNINITIALIZED);
+      c10::impl::PyInterpreterStatus::MAYBE_UNINITIALIZED,
+      /*allow_preexisting_pyobj=*/true);
   END_HANDLE_TH_ERRORS
 }
 
@@ -1940,25 +1753,78 @@ void THPVariable_subclass_dealloc(PyObject* self) {
 static PyObject* THPVariable_NewWithVar(
     PyTypeObject* type,
     Variable _var,
-    c10::impl::PyInterpreterStatus status) {
-  // This function overwrite the Tensor's pyobj field without extra checks
-  // Make sure it is not set otherwise we would leak memory
-  auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-      self_interpreter.get());
-  TORCH_CHECK(
-      !mb_obj.has_value() || !mb_obj.value(),
-      "Creating a new Tensor subclass ",
-      type->tp_name,
-      " but the raw Tensor object is already associated to a python object ",
-      "of type ",
-      mb_obj.value()->ob_type->tp_name);
-
+    c10::impl::PyInterpreterStatus status,
+    bool allow_preexisting_pyobj) {
   // Make sure that the reinterpret into a THPVariable* will be valid
   TORCH_CHECK(
       PyType_IsSubtype(type, &THPVariableType),
       "Creating a Tensor subclass from a class ",
       "that does not inherit from Tensor is not possible. Make sure your class inherits from Tensor.");
 
+  // This function overwrite the Tensor's pyobj field without extra checks
+  // Make sure it is not set otherwise we would leak memory
+  auto mb_obj =
+      _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(getPyInterpreter());
+
+  // Under some circumstances, we may attempt to create a new Python
+  // object for a variable that already has a Python object.  The most common
+  // situation this can occur is if you have a TorchDispatchMode active that
+  // is returning a subclass from lift_fresh (which is invoked to
+  // appropriately "wrap" a constant tensor into whatever ambient modes are
+  // active.)
+  //
+  // In general, it is impossible to handle this case compositionally.
+  // Suppose you have a user call ATensor([1, 2, 3]) when a mode is active
+  // that is transforming all ops (including the internal lift_fresh call that
+  // transforms [1, 2, 3] into a torch.tensor([1., 2., 3.])) to output
+  // BTensor, where ATensor and BTensor are completely unrelated subclasses
+  // and there is no way to compose them.  There is no way to satisfy the user
+  // request here: in particular, you can't just try to re-invoke the ATensor
+  // constructor on the returned BTensor, because (1) this could cause an
+  // infinite loop--we are already in ATensor.__new__ and (2) there isn't any
+  // guarantee that ATensor.__new__ supports a single element constructor
+  // anyway.
+  //
+  // However, a more common case is a user just called torch.Tensor([1, 2, 3]),
+  // and a fake tensor mode is active.  Really, all you want is to get back
+  // a FakeTensor, in the same way torch.tensor([1, 2, 3]) or torch.arange(3)
+  // would have returned a fake tensor (concretely, the way this happens
+  // is we create a *real* tensor torch.tensor([1., 2., 3.]), and then it
+  // turns into a FakeTensor when we call lift_fresh on this real tensor).
+  // This case is compositional because FakeTensor is a subclass of Tensor, so
+  // it's valid for us to return it in place of a Tensor.  So this is what we
+  // do.
+
+  if (mb_obj.has_value() && mb_obj.value()) {
+    TORCH_CHECK(
+        allow_preexisting_pyobj,
+        "Creating a new Tensor subclass ",
+        type->tp_name,
+        " but the raw Tensor object is already associated to a python object ",
+        "of type ",
+        mb_obj.value()->ob_type->tp_name);
+    // Even if we allow pre-existing PyObject, we don't allow completely
+    // ignoring the requested type.  Check that we fulfilled a subtype
+    // relation here.  In the common case the requested type is Tensor and
+    // this always succeeds.
+    PyObject* obj = *mb_obj;
+    // Check if it's OK to just directly return the Python object without
+    // allocating a new variable.  We just check that the existing Python
+    // object is a subclass of the requested type.
+    PyTypeObject* obj_type = Py_TYPE(obj);
+    TORCH_CHECK(
+        obj_type == type || PyType_IsSubtype(obj_type, type),
+        "Creating a new Tensor subclass ",
+        type->tp_name,
+        " but the raw Tensor object is already associated to a python object ",
+        "of type ",
+        mb_obj.value()->ob_type->tp_name,
+        " which is not a subclass of the "
+        "requested type");
+    // We may (in fact, we typically will) need to resurrect this
+    return THPVariable_Wrap(std::move(_var));
+  }
+
   PyObject* obj = type->tp_alloc(type, 0);
   if (obj) {
     auto v = (THPVariable*)obj;
@@ -1981,7 +1847,7 @@ static PyObject* THPVariable_NewWithVar(
       v->cdata = MaybeOwned<Variable>::owned(std::move(_var));
       const auto& var = THPVariable_Unpack(v);
       var.unsafeGetTensorImpl()->pyobj_slot()->init_pyobj(
-          self_interpreter.get(), obj, status);
+          getPyInterpreter(), obj, status);
       if (check_has_torch_dispatch(obj)) {
         var.unsafeGetTensorImpl()->set_python_dispatch(true);
       }
@@ -2217,617 +2083,3 @@ bool THPVariable_initModule(PyObject* module) {
   torch::utils::validate_numpy_for_dlpack_deleter_bug();
   return true;
 }
-
-namespace {
-
-bool isPythonTensor(const Tensor& tensor) {
-  return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Python);
-}
-
-py::object torchDispatchFromTensorImpl(
-    const c10::TensorImpl* self,
-    const char* func_name,
-    PyObject* torch_api_function,
-    const char* module_name,
-    // WARNING: MUST NOT BE TENSOR ARGS
-    c10::SmallVector<py::object, 1> extra_args = {}) {
-  if (torch_api_function == nullptr) {
-    throw python_error();
-  }
-  TORCH_CHECK(
-      PyGILState_Check(),
-      "GIL must be held before you call parseIValuesToPyArgsKwargs");
-
-  std::vector<py::handle> overloaded_args;
-  // TODO: there should be a shorter way to spell this
-  // TODO: fix the constness of target
-  Tensor self_t = Tensor(
-      c10::intrusive_ptr<c10::TensorImpl, c10::UndefinedTensorImpl>::
-          unsafe_reclaim_from_nonowning(const_cast<c10::TensorImpl*>(self)));
-  auto self_p = py::reinterpret_steal<py::object>(THPVariable_Wrap(self_t));
-  // NB: this may not be a python tensor if you got here from a mode!
-  // TORCH_INTERNAL_ASSERT(isPythonTensor(self_t));
-  append_overloaded_tensor(&overloaded_args, self_p.ptr());
-  auto args =
-      py::reinterpret_steal<py::object>(PyTuple_New(1 + extra_args.size()));
-  PyTuple_SET_ITEM(args.ptr(), 0, self_p.release().ptr());
-  int64_t i = 1;
-  for (auto& a : extra_args) {
-    if (a.ptr() == nullptr)
-      throw python_error();
-    PyTuple_SET_ITEM(args.ptr(), i, std::move(a).release().ptr());
-    i++;
-  }
-
-  py::dict kwargs;
-
-  return py::reinterpret_steal<py::object>(
-      handle_torch_function_no_python_arg_parser(
-          overloaded_args,
-          args.ptr(),
-          kwargs.ptr(),
-          func_name,
-          torch_api_function,
-          module_name,
-          TorchFunctionName::TorchDispatch));
-}
-
-py::handle getTorchApiFunction(const c10::OperatorHandle& op) {
-  return op.getPythonOp(getPyInterpreter(), [&]() -> PyObject* {
-    // Parse the name into namespace and name (no overload_name)
-    // TODO: put this into the library
-    const auto& schema = op.schema();
-    const auto& qualified_name = op.operator_name().name;
-    const auto& overload_name = schema.overload_name();
-    auto pos = qualified_name.find("::");
-    TORCH_INTERNAL_ASSERT(pos != std::string::npos, qualified_name);
-    // Make me some null terminated strings
-    std::string ns_str = qualified_name.substr(0, pos);
-    const char* ns = ns_str.c_str();
-    const char* func_name = qualified_name.c_str() + pos + strlen("::");
-
-    py::handle torch_api_function =
-        py::module::import("torch").attr("ops").attr(ns).attr(func_name);
-    if (overload_name == "") {
-      return torch_api_function.attr("default").ptr();
-    } else {
-      return torch_api_function.attr(overload_name.c_str()).ptr();
-    }
-  });
-}
-
-void ConcretePyInterpreterVTable::dispatch(
-    const c10::OperatorHandle& op,
-    torch::jit::Stack* stack) const {
-  const auto& schema = op.schema();
-  const auto num_arguments = schema.arguments().size();
-  auto arguments = torch::jit::pop(*stack, num_arguments);
-
-  // The plan: convert all the arguments back into PyObjects,
-  // extracting out the tensor handles, then call
-  // handle_torch_function_no_python_arg_parser
-  // NB: at the point arguments are pushed to the stack, ALL defaults
-  // are already present
-
-  py::gil_scoped_acquire g;
-
-  std::vector<py::handle> overloaded_args;
-  py::handle torch_api_function_overload = getTorchApiFunction(op);
-
-  // Find overloaded tensors
-  for (const auto idx : c10::irange(arguments.size())) {
-    const auto& ivalue = arguments[idx];
-    if (ivalue.isTensor()) {
-      const auto& tensor = ivalue.toTensor();
-      if (isPythonTensor(tensor)) {
-        append_overloaded_tensor(&overloaded_args, py::cast(tensor).ptr());
-      }
-    } else if (ivalue.isList()) {
-      const auto& list = ivalue.toListRef();
-      for (const auto jdx : c10::irange(list.size())) {
-        const auto& nv = list[jdx];
-        if (nv.isTensor()) {
-          const auto& tensor = nv.toTensor();
-          if (isPythonTensor(tensor)) {
-            append_overloaded_tensor(&overloaded_args, py::cast(tensor).ptr());
-          }
-        }
-      }
-    }
-  }
-
-  auto args_kwargs = parseIValuesToPyArgsKwargs(op, arguments);
-  auto args = std::move(args_kwargs.first);
-  auto kwargs = std::move(args_kwargs.second);
-
-  PyObject* obj = handle_torch_function_no_python_arg_parser(
-      overloaded_args,
-      args.ptr(),
-      kwargs.ptr(),
-      nullptr,
-      torch_api_function_overload.ptr(),
-      nullptr,
-      TorchFunctionName::TorchDispatch);
-  pushPyOutToStack(
-      op, stack, py::reinterpret_steal<py::object>(obj), "__torch_dispatch__");
-}
-
-void ConcretePyInterpreterVTable::python_dispatcher(
-    const c10::OperatorHandle& op,
-    c10::DispatchKeySet ks,
-    torch::jit::Stack* stack) const {
-  py::gil_scoped_acquire g;
-  py::handle torch_api_function_overload = getTorchApiFunction(op);
-  // TODO: if necessary, can optimize to cache the cache lookup
-  // TODO: if necessary, can optimize OpOverload to have slots
-  auto cache = py::dict(torch_api_function_overload.attr("_dispatch_cache"));
-  if (cache.ptr() == nullptr) {
-    throw python_error();
-  }
-
-  c10::DispatchKey k = ks.highestPriorityTypeId();
-  // TODO: allow this to be non-owning
-  auto handler = py::reinterpret_borrow<py::object>(
-      PyDict_GetItem(cache.ptr(), py::cast(k).ptr()));
-  if (handler.ptr() == nullptr) {
-    // Slow path
-    handler = torch_api_function_overload.attr("_get_dispatch")(k);
-  }
-  if (py::isinstance<c10::DispatchKey>(handler)) {
-    // NB: not redispatch, as that will permanently remove the python
-    // dispatcher for subsequent redispatches
-    op.callBoxedForDispatchKey(py::cast<c10::DispatchKey>(handler), *stack);
-    return;
-  }
-
-  const auto& schema = op.schema();
-  const auto num_arguments = schema.arguments().size();
-  auto arguments = torch::jit::pop(*stack, num_arguments);
-
-  auto args_kwargs = parseIValuesToPyArgsKwargs(op, arguments);
-  auto args = std::move(args_kwargs.first);
-  auto kwargs = std::move(args_kwargs.second);
-
-  py::object obj = py::reinterpret_steal<py::object>(
-      PyObject_Call(handler.ptr(), args.ptr(), kwargs.ptr()));
-
-  if (obj.ptr() == nullptr) {
-    throw python_error();
-  }
-
-  pushPyOutToStack(op, stack, std::move(obj), "Python dispatcher");
-}
-
-c10::intrusive_ptr<TensorImpl> ConcretePyInterpreterVTable::detach(
-    const c10::TensorImpl* self) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-
-  auto out = torchDispatchFromTensorImpl(
-      self,
-      "detach",
-      py::module::import("torch")
-          .attr("ops")
-          .attr("aten")
-          .attr("detach")
-          .attr("default")
-          .ptr(),
-      "torch.ops.aten");
-
-  TORCH_CHECK(
-      THPVariable_Check(out.ptr()),
-      "detach returned invalid type ",
-      py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())),
-      ", expected Tensor");
-  const Tensor& res_t = THPVariable_Unpack(out.ptr());
-  return res_t.getIntrusivePtr();
-}
-
-bool ConcretePyInterpreterVTable::is_contiguous(
-    const c10::TensorImpl* self,
-    at::MemoryFormat memory_format) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-
-  py::object out;
-  if (memory_format == at::MemoryFormat::Contiguous) {
-    // For backwards compatibility
-    out = torchDispatchFromTensorImpl(
-        self,
-        "is_contiguous",
-        py::module::import("torch")
-            .attr("ops")
-            .attr("aten")
-            .attr("is_contiguous")
-            .attr("default")
-            .ptr(),
-        "torch.ops.aten");
-  } else {
-    out = torchDispatchFromTensorImpl(
-        self,
-        "is_contiguous",
-        py::module::import("torch")
-            .attr("ops")
-            .attr("aten")
-            .attr("is_contiguous")
-            .attr("memory_format")
-            .ptr(),
-        "torch.ops.aten",
-        {py::cast(memory_format)});
-  }
-
-  if (out.is_none()) {
-    return self->is_contiguous_default(memory_format);
-  }
-
-  TORCH_CHECK(
-      PyBool_Check(out.ptr()),
-      "is_contiguous returned invalid type ",
-      py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())),
-      ", expected bool");
-
-  return PyObject_IsTrue(out.ptr());
-}
-
-bool ConcretePyInterpreterVTable::is_strides_like(
-    const c10::TensorImpl* self,
-    at::MemoryFormat memory_format) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-
-  auto out = torchDispatchFromTensorImpl(
-      self,
-      "is_strides_like",
-      py::module::import("torch")
-          .attr("ops")
-          .attr("aten")
-          // NB: intentionally suffixed with _format to avoid
-          // triggering matches against "_like" suffix
-          .attr("is_strides_like_format")
-          .attr("default")
-          .ptr(),
-      "torch.ops.aten",
-      {py::cast(memory_format)});
-
-  if (out.is_none()) {
-    return self->is_strides_like_default(memory_format);
-  }
-
-  TORCH_CHECK(
-      PyBool_Check(out.ptr()),
-      "is_strides_like_format returned invalid type ",
-      py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())),
-      ", expected bool");
-
-  return PyObject_IsTrue(out.ptr());
-}
-
-bool ConcretePyInterpreterVTable::is_non_overlapping_and_dense(
-    const c10::TensorImpl* self) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-
-  auto out = torchDispatchFromTensorImpl(
-      self,
-      "is_non_overlapping_and_dense",
-      py::module::import("torch")
-          .attr("ops")
-          .attr("aten")
-          .attr("is_non_overlapping_and_dense")
-          .attr("default")
-          .ptr(),
-      "torch.ops.aten");
-
-  if (out.is_none()) {
-    return self->is_non_overlapping_and_dense_default();
-  }
-
-  TORCH_CHECK(
-      PyBool_Check(out.ptr()),
-      "is_non_overlapping_and_dense returned invalid type ",
-      py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())),
-      ", expected bool");
-
-  return PyObject_IsTrue(out.ptr());
-}
-
-int64_t ConcretePyInterpreterVTable::dim(const c10::TensorImpl* self) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-
-  auto out = torchDispatchFromTensorImpl(
-      self,
-      "dim",
-      py::module::import("torch")
-          .attr("ops")
-          .attr("aten")
-          .attr("dim")
-          .attr("default")
-          .ptr(),
-      "torch.ops.aten");
-
-  TORCH_CHECK(
-      PyLong_Check(out.ptr()),
-      "dim returned invalid type ",
-      py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())),
-      ", expected int");
-
-  return THPUtils_unpackLong(out.ptr());
-}
-
-c10::Device ConcretePyInterpreterVTable::device(
-    const c10::TensorImpl* self) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-
-  auto out = torchDispatchFromTensorImpl(
-      self,
-      "device",
-      py::module::import("torch")
-          .attr("ops")
-          .attr("prim")
-          .attr("device")
-          .attr("default")
-          .ptr(),
-      "torch.ops.prim");
-
-  return toDevice(out.ptr());
-}
-
-c10::IntArrayRef ConcretePyInterpreterVTable::strides(
-    const c10::TensorImpl* self) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-
-  auto out = torchDispatchFromTensorImpl(
-      self,
-      "stride",
-      py::module::import("torch")
-          .attr("ops")
-          .attr("aten")
-          .attr("stride")
-          .attr("default")
-          .ptr(),
-      "torch.ops.aten");
-
-  if (out.is_none()) {
-    TORCH_CHECK(
-        !self->has_symbolic_sizes_strides(),
-        "Cannot call strides on a tensor with symbolic shapes/strides");
-    return self->strides_default();
-  }
-
-  py::object values = py::reinterpret_steal<py::object>(out.ptr());
-
-  c10::optional<PyObject*> mb_obj =
-      self->pyobj_slot()->check_pyobj(getPyInterpreter());
-  TORCH_CHECK(
-      mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value");
-  PyObject* subclass = *mb_obj;
-  Py_INCREF(subclass);
-  py::object sub = py::reinterpret_steal<py::object>(subclass);
-
-  py::object os = py::module_::import("torch").attr("overrides");
-  py::function get_buffer =
-      py::reinterpret_borrow<py::function>(os.attr("get_buffer"));
-  auto buffer = get_buffer(sub, values, "stride");
-  auto result = THPUtils_unpackLongs(buffer.ptr());
-  int64_t* start = (int64_t*)result[0];
-  int64_t len = result[1];
-
-  return c10::IntArrayRef(start, len);
-}
-
-static std::vector<int64_t> values_from_buffer(
-    const c10::TensorImpl* self,
-    py::handle values) {
-  c10::TensorImpl* ptr = const_cast<c10::TensorImpl*>(self);
-  c10::optional<PyObject*> mb_obj =
-      ptr->pyobj_slot()->check_pyobj(getPyInterpreter());
-  TORCH_CHECK(
-      mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value");
-
-  py::object os = py::module_::import("torch").attr("overrides");
-  py::function get_buffer =
-      py::reinterpret_borrow<py::function>(os.attr("get_buffer"));
-  auto buffer = get_buffer(py::handle(*mb_obj), values, "size");
-  auto result = THPUtils_unpackLongs(buffer.ptr());
-  return result;
-}
-
-c10::IntArrayRef ConcretePyInterpreterVTable::sizes(
-    const c10::TensorImpl* self) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-
-  auto out = torchDispatchFromTensorImpl(
-      self,
-      "size",
-      py::module::import("torch")
-          .attr("ops")
-          .attr("aten")
-          .attr("size")
-          .attr("default")
-          .ptr(),
-      "torch.ops.aten");
-
-  if (out.is_none()) {
-    TORCH_CHECK(
-        !self->has_symbolic_sizes_strides(),
-        "Cannot call sizes on a tensor with symbolic shapes/strides");
-    return self->sizes_default();
-  }
-
-  py::object values = py::reinterpret_steal<py::object>(out.ptr());
-  auto result = values_from_buffer(self, values);
-  int64_t* start = (int64_t*)result[0];
-  int64_t len = result[1];
-
-  return c10::IntArrayRef(start, len);
-}
-
-c10::SymIntArrayRef ConcretePyInterpreterVTable::sym_sizes(
-    const c10::TensorImpl* self) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-  HANDLE_TH_ERRORS
-  auto out = torchDispatchFromTensorImpl(
-      self,
-      "sym_size",
-      py::module::import("torch")
-          .attr("ops")
-          .attr("aten")
-          .attr("sym_size")
-          .attr("default")
-          .ptr(),
-      "torch.ops.aten");
-
-  if (out.is_none()) {
-    return self->sym_sizes_default();
-  }
-  // We need to squeeze SymIntNodes and ints into `SymInts`
-  // since it's a format `sym_sizes()` are stored in
-  TORCH_CHECK(
-      py::isinstance<py::tuple>(out) || py::isinstance<py::list>(out),
-      "Symshape must be a list or a tuple");
-  py::list symints;
-  for (auto it = out.begin(); it != out.end(); it++) {
-    auto elm = *it;
-    auto si = py::cast<c10::SymInt>(elm);
-    // TODO: the buffer will need to be made owning later
-    symints.append(si.as_int_unchecked());
-  }
-
-  auto result = values_from_buffer(self, symints);
-  c10::SymInt* start = (c10::SymInt*)result[0];
-  int64_t len = result[1];
-
-  return c10::SymIntArrayRef(start, len);
-  END_HANDLE_TH_ERRORS_PYBIND
-}
-
-c10::Layout ConcretePyInterpreterVTable::layout(
-    const c10::TensorImpl* self) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-  auto out = torchDispatchFromTensorImpl(
-      self,
-      "layout",
-      py::module::import("torch")
-          .attr("ops")
-          .attr("prim")
-          .attr("layout")
-          .attr("default")
-          .ptr(),
-      "torch.ops.prim");
-
-  TORCH_CHECK(
-      THPLayout_Check(out.ptr()),
-      "layout returned invalid type ",
-      py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())),
-      ", expected Layout");
-
-  return toLayout(out.ptr());
-}
-
-c10::SymInt ConcretePyInterpreterVTable::sym_numel(
-    const c10::TensorImpl* self) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-  auto out = torchDispatchFromTensorImpl(
-      self,
-      "sym_numel",
-      py::module::import("torch")
-          .attr("ops")
-          .attr("aten")
-          .attr("sym_numel")
-          .attr("default")
-          .ptr(),
-      "torch.ops.aten");
-
-  if (out.is_none()) {
-    TORCH_CHECK(
-        !self->has_symbolic_sizes_strides(),
-        "Cannot call numel on a tensor with symbolic shapes/strides");
-    return self->sym_numel_default();
-  }
-  return torch::is_symint(out) ? out.cast<c10::SymInt>()
-                               : c10::SymInt{py::cast<int64_t>(out)};
-}
-
-c10::SymInt ConcretePyInterpreterVTable::sym_storage_offset(
-    const c10::TensorImpl* self) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-  auto out = torchDispatchFromTensorImpl(
-      self,
-      "sym_storage_offset",
-      py::module::import("torch")
-          .attr("ops")
-          .attr("aten")
-          .attr("sym_storage_offset")
-          .attr("default")
-          .ptr(),
-      "torch.ops.aten");
-
-  if (out.is_none()) {
-    return self->sym_storage_offset_default();
-  }
-  return torch::is_symint(out) ? out.cast<c10::SymInt>()
-                               : c10::SymInt{py::cast<int64_t>(out)};
-}
-
-c10::SymIntArrayRef ConcretePyInterpreterVTable::sym_strides(
-    const c10::TensorImpl* self) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-  HANDLE_TH_ERRORS
-  auto out = torchDispatchFromTensorImpl(
-      self,
-      "sym_stride",
-      py::module::import("torch")
-          .attr("ops")
-          .attr("aten")
-          .attr("sym_stride")
-          .attr("default")
-          .ptr(),
-      "torch.ops.aten");
-
-  if (out.is_none()) {
-    return self->sym_strides_default();
-  }
-  // We need to squeeze SymIntNodes and ints into `SymInts`
-  // since it's a format `sym_strides()` are stored in
-  TORCH_CHECK(
-      py::isinstance<py::tuple>(out) || py::isinstance<py::list>(out),
-      "Symshape must be a list or a tuple");
-  py::list symints;
-  for (auto it = out.begin(); it != out.end(); it++) {
-    auto elm = *it;
-    auto si = torch::is_symint(elm) ? elm.cast<c10::SymInt>()
-                                    : c10::SymInt{py::cast<int64_t>(elm)};
-    symints.append(si.as_int_unchecked());
-  }
-
-  auto result = values_from_buffer(self, symints);
-  c10::SymInt* start = (c10::SymInt*)result[0];
-  int64_t len = result[1];
-
-  return c10::SymIntArrayRef(start, len);
-  END_HANDLE_TH_ERRORS_PYBIND
-}
-
-void ConcretePyInterpreterVTable::reset_backward_hooks(
-    const c10::TensorImpl* self) const {
-  pybind11::gil_scoped_acquire gil;
-  at::impl::MaybeSetTLSOnEntryGuard guard;
-  HANDLE_TH_ERRORS
-  Tensor self_t = Tensor(
-      c10::intrusive_ptr<c10::TensorImpl, c10::UndefinedTensorImpl>::
-          unsafe_reclaim_from_nonowning(const_cast<c10::TensorImpl*>(self)));
-  auto self_p = py::reinterpret_steal<py::object>(THPVariable_Wrap(self_t));
-  PyObject_SetAttrString(self_p.ptr(), "_backward_hooks", Py_None);
-  END_HANDLE_TH_ERRORS_PYBIND
-}
-
-} // anonymous namespace
diff --git a/torch/csrc/autograd/python_variable.h b/torch/csrc/autograd/python_variable.h
index 932072c2e88b..f87d0166a912 100644
--- a/torch/csrc/autograd/python_variable.h
+++ b/torch/csrc/autograd/python_variable.h
@@ -23,11 +23,11 @@ struct THPVariable {
   PyObject* backward_hooks = nullptr;
 };
 
-TORCH_API void registerPythonTensorClass(
+TORCH_PYTHON_API void registerPythonTensorClass(
     const std::string& device,
     PyObject* python_tensor_class);
 
-TORCH_API void activateCUDATrace();
+TORCH_PYTHON_API void activateCUDATrace();
 
 TORCH_PYTHON_API extern PyObject* THPVariableClass;
 TORCH_PYTHON_API extern PyObject* ParameterClass;
@@ -67,9 +67,6 @@ inline const at::Tensor& THPVariable_Unpack(PyObject* obj) {
   return THPVariable_Unpack(reinterpret_cast<THPVariable*>(obj));
 }
 
-TORCH_PYTHON_API c10::impl::PyInterpreter* getPyInterpreter();
-TORCH_PYTHON_API bool isMainPyInterpreter();
-
 std::pair<py::object, py::dict> parseIValuesToPyArgsKwargs(
     const c10::OperatorHandle& op,
     const std::vector<c10::IValue>& arguments);
diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp
index 18a1e0f85d37..f6fcb1083d6e 100644
--- a/torch/csrc/autograd/variable.cpp
+++ b/torch/csrc/autograd/variable.cpp
@@ -24,6 +24,7 @@
 #include <stdexcept>
 #include <string>
 #include <typeinfo>
+#include <utility>
 #include <vector>
 
 namespace torch {
@@ -120,7 +121,7 @@ ViewInfo ViewInfo::chain(
     };
   }
 
-  return ViewInfo(base_, view_func);
+  return ViewInfo(base_, std::move(view_func));
 }
 
 namespace {
@@ -158,42 +159,41 @@ AutogradMeta* materialize_autograd_meta(const at::TensorBase& self) {
 
 void update_tensor_hooks_on_new_gradfn(
     const at::TensorBase& self,
+    const std::shared_ptr<torch::autograd::Node>& old_fn,
     const std::shared_ptr<torch::autograd::Node>& new_fn) {
   // This function is called whenever the grad_fn of the tensor is
   // changed. We assume here that new_fn does not yet have hooks of
-  // its own
+  // its own.
   //
   // This function does two things:
-  const auto& meta = impl::get_autograd_meta(self);
-  TORCH_INTERNAL_ASSERT(meta);
-  TORCH_INTERNAL_ASSERT(new_fn);
   // (1) reset the list when grad_fn is updated, so new hooks don't
   //     get erroneously registered to the old grad_fn.
   //     Note that the old cpp_hooks_list_ is still kept alive by the
   //     old grad_fn so hooks registered to the older version of the tensor
   //     will continue to be active.
+  // (2) If there is a retains_grad hook registered, move that from the
+  //     old cpp_hooks_list_ to the new one
+  const auto& meta = impl::get_autograd_meta(self);
+  TORCH_INTERNAL_ASSERT(meta);
+  TORCH_INTERNAL_ASSERT(new_fn);
   meta->cpp_hooks_list_ = nullptr;
   const c10::impl::PyInterpreter* interp =
       self.unsafeGetTensorImpl()->pyobj_slot()->pyobj_interpreter();
   if (interp) {
     (*interp)->reset_backward_hooks(self.unsafeGetTensorImpl());
   }
-  // (2) If there is a retains_grad hook registered, move that from the
-  //     old cpp_hooks_list_ to the new one
   if (self.retains_grad()) {
-    auto new_list = std::make_shared<hooks_list>();
-    new_list->push_back(std::move((*meta->retains_grad_hooks_list_)[0]));
-    (*meta->retains_grad_hooks_list_)[0] = nullptr;
-    meta->retains_grad_hooks_list_ = new_list;
-    std::unique_ptr<FunctionPreHook> hook_ptr =
-        std::make_unique<CppFunctionTensorPreHook>(
-            meta->retains_grad_hooks_list_, self.output_nr());
-    new_fn->add_retains_grad_hook(std::move(hook_ptr));
+    TORCH_INTERNAL_ASSERT(old_fn);
+    auto out = old_fn->pop_retains_grad_hook(self.output_nr());
+    TORCH_INTERNAL_ASSERT(out != nullptr);
+    new_fn->add_retains_grad_hook(std::move(out), self.output_nr());
   }
 }
 
 void rebase_history(const Variable& self, Edge gradient_edge) {
   TORCH_INTERNAL_ASSERT(gradient_edge.function != nullptr);
+  const auto& meta = impl::get_autograd_meta(self);
+  auto old_fn = meta != nullptr ? meta->grad_fn_ : nullptr;
   auto diff_view_meta = get_view_autograd_meta(self);
   if (diff_view_meta && diff_view_meta->has_bw_view()) {
     // See NOTE [ View + Inplace detection ]
@@ -221,35 +221,24 @@ void rebase_history(const Variable& self, Edge gradient_edge) {
   set_gradient_edge(self, std::move(gradient_edge));
   // Pass both self and its grad_fn to avoid calling into grad_fn reentrantly
   torch::autograd::impl::update_tensor_hooks_on_new_gradfn(
-      self, self.grad_fn());
+      self, old_fn, self.grad_fn());
 }
 
 void create_cpp_hook(const at::TensorBase& self, bool is_retains_grad_hook) {
   const auto& fn = self.grad_fn();
-  if (is_retains_grad_hook) {
-    std::shared_ptr<hooks_list>& list =
-        materialize_autograd_meta(self)->retains_grad_hooks_list_;
-    // NOLINTNEXTLINE(modernize-make-shared)
-    list.reset(new hooks_list());
-    std::unique_ptr<FunctionPreHook> hook_ptr{
-        new CppFunctionTensorPreHook(list, self.output_nr())};
-    TORCH_INTERNAL_ASSERT(fn, "Expect grad_fn to be defined for retains_grad");
-    fn->add_retains_grad_hook(std::move(hook_ptr));
-  } else {
-    std::shared_ptr<hooks_list>& list =
-        materialize_autograd_meta(self)->cpp_hooks_list_;
-    // NOLINTNEXTLINE(modernize-make-shared)
-    list.reset(new hooks_list());
-    std::unique_ptr<FunctionPreHook> hook_ptr{
-        new CppFunctionTensorPreHook(list, self.output_nr())};
-    // NB: we could potentially only update hooks_ if !fn, but it shouldn't
-    // matter
-    //     and this was the way before, so we keep it like this for now.
-    clear_hooks(self);
-    add_hook(self, std::make_unique<CppFunctionTensorPreHook>(list, 0));
-    if (fn) {
-      fn->add_tensor_pre_hook(std::move(hook_ptr));
-    }
+  std::shared_ptr<hooks_list>& list =
+      materialize_autograd_meta(self)->cpp_hooks_list_;
+  // NOLINTNEXTLINE(modernize-make-shared)
+  list.reset(new hooks_list());
+  std::unique_ptr<FunctionPreHook> hook_ptr{
+      new CppFunctionTensorPreHook(list, self.output_nr())};
+  // NB: we could potentially only update hooks_ if !fn, but it shouldn't
+  // matter
+  //     and this was the way before, so we keep it like this for now.
+  clear_hooks(self);
+  add_hook(self, std::make_unique<CppFunctionTensorPreHook>(list, 0));
+  if (fn) {
+    fn->add_tensor_pre_hook(std::move(hook_ptr));
   }
 }
 
@@ -362,7 +351,7 @@ void add_hook(
     const at::TensorBase& self,
     std::unique_ptr<FunctionPreHook> hook) {
   AutogradMeta* meta = materialize_autograd_meta(self);
-  TORCH_INTERNAL_ASSERT(meta->hooks_.size() == 0);
+  TORCH_INTERNAL_ASSERT(meta->hooks_.empty());
   meta->hooks_.push_back(std::move(hook));
 }
 
@@ -529,24 +518,6 @@ int64_t VariableHooks::_version(const at::TensorBase& self) const {
   return self.unsafeGetTensorImpl()->version_counter().current_version();
 }
 
-unsigned register_retains_grad_hook(
-    const at::TensorBase& self,
-    std::function<at::TensorBase(const at::TensorBase&)> hook) {
-  TORCH_CHECK(
-      self.requires_grad(),
-      "cannot retain grad on a variable that "
-      "doesn't require gradient");
-  // NB: materialize_autograd_meta unnecessary due to requires grad check
-  auto& list =
-      torch::autograd::impl::get_autograd_meta(self)->retains_grad_hooks_list_;
-  if (!list) {
-    torch::autograd::impl::create_cpp_hook(self, /*is_retains_grad_hook=*/true);
-  }
-  unsigned idx = list->size();
-  list->push_back(hook);
-  return idx;
-}
-
 void VariableHooks::retain_grad(const at::TensorBase& self) const {
   TORCH_CHECK(
       self.requires_grad(),
@@ -583,7 +554,10 @@ void VariableHooks::retain_grad(const at::TensorBase& self) const {
     return at::TensorBase{};
   };
 
-  register_retains_grad_hook(self, retain_grad_hook);
+  const auto& fn = self.grad_fn();
+  std::unique_ptr<FunctionPreHook> hook_ptr{new CppFunctionSingleTensorPreHook(
+      std::move(retain_grad_hook), self.output_nr())};
+  fn->add_retains_grad_hook(std::move(hook_ptr), self.output_nr());
   impl::get_autograd_meta(self)->retains_grad_ = true;
 }
 
@@ -608,7 +582,7 @@ void VariableHooks::_backward(
   std::vector<torch::autograd::Variable> input_vars(
       inputs.begin(), inputs.end());
   torch::autograd::backward(
-      {self}, {_gradient}, keep_graph, create_graph, input_vars);
+      {self}, {std::move(_gradient)}, keep_graph, create_graph, input_vars);
 }
 
 void VariableHooks::requires_grad_(
@@ -674,6 +648,7 @@ const std::shared_ptr<torch::autograd::Node>& VariableHooks::grad_fn(
       return diff_view_meta->grad_fn_;
     }
     auto current_version = self._version();
+    auto old_fn = diff_view_meta->grad_fn_;
     if (diff_view_meta->get_attr_version() != current_version) {
       // This is an indirect rebase_history due to another view or the base
       // being modified inplace
@@ -735,7 +710,7 @@ const std::shared_ptr<torch::autograd::Node>& VariableHooks::grad_fn(
       diff_view_meta->set_attr_version(current_version);
 
       torch::autograd::impl::update_tensor_hooks_on_new_gradfn(
-          self, diff_view_meta->grad_fn_);
+          self, old_fn, diff_view_meta->grad_fn_);
     }
     return diff_view_meta->grad_fn_;
   }
diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h
index 4cf78cb4f7ed..027ced02e52e 100644
--- a/torch/csrc/autograd/variable.h
+++ b/torch/csrc/autograd/variable.h
@@ -229,15 +229,14 @@ struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface {
   // each other, so using both is not defined behavior.
   std::vector<std::unique_ptr<FunctionPreHook>> hooks_;
   std::shared_ptr<hooks_list> cpp_hooks_list_;
-  std::shared_ptr<hooks_list> retains_grad_hooks_list_;
 
   // Only meaningful on leaf variables (must be false otherwise)
-  bool requires_grad_;
+  bool requires_grad_{false};
 
   // Only meaningful on non-leaf variables (must be false otherwise)
-  bool retains_grad_;
+  bool retains_grad_{false};
 
-  bool is_view_;
+  bool is_view_{false};
 
   // The "output number" of this variable; e.g., if this variable
   // was the second output of a function, then output_nr == 1.
@@ -291,9 +290,7 @@ struct TORCH_API AutogradMeta : public c10::AutogradMetaInterface {
       bool requires_grad = false,
       Edge gradient_edge = Edge())
       : grad_fn_(std::move(gradient_edge.function)),
-        requires_grad_(false),
-        retains_grad_(false),
-        is_view_(false),
+
         output_nr_(gradient_edge.input_nr) {
     // set_requires_grad also checks error conditions.
     if (requires_grad) {
diff --git a/torch/csrc/cuda/Event.cpp b/torch/csrc/cuda/Event.cpp
index 72b740cecfe1..426064c9e823 100644
--- a/torch/csrc/cuda/Event.cpp
+++ b/torch/csrc/cuda/Event.cpp
@@ -25,13 +25,13 @@ static PyObject* THCPEvent_pynew(
   unsigned char interprocess = 0;
 
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
-  static char* kwlist[] = {
+  constexpr char* kwlist[] = {
       "enable_timing", "blocking", "interprocess", nullptr};
   if (!PyArg_ParseTupleAndKeywords(
           args,
           kwargs,
           "|bbb",
-          kwlist,
+          const_cast<char**>(kwlist),
           &enable_timing,
           &blocking,
           &interprocess)) {
@@ -43,9 +43,7 @@ static PyObject* THCPEvent_pynew(
     return nullptr;
   }
 
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   THCPEvent* self = (THCPEvent*)ptr.get();
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   unsigned int flags = (blocking ? cudaEventBlockingSync : cudaEventDefault) |
       (enable_timing ? cudaEventDefault : cudaEventDisableTiming) |
       (interprocess ? cudaEventInterprocess : cudaEventDefault);
@@ -88,7 +86,6 @@ static PyObject* THCPEvent_from_ipc_handle(
   if (!ptr) {
     return nullptr;
   }
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   THCPEvent* self = (THCPEvent*)ptr.get();
 
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
diff --git a/torch/csrc/cuda/Graph.cpp b/torch/csrc/cuda/Graph.cpp
index f43a7debb5e4..f0781f9b0ca0 100644
--- a/torch/csrc/cuda/Graph.cpp
+++ b/torch/csrc/cuda/Graph.cpp
@@ -30,37 +30,32 @@ void THCPGraph_init(PyObject* module) {
       // docs aren't clear. But it works.
       .def(
           "capture_begin",
-          torch::wrap_pybind_function(&at::cuda::CUDAGraph::capture_begin),
-          py::call_guard<py::gil_scoped_release>(),
+          torch::wrap_pybind_function_no_gil(
+              &at::cuda::CUDAGraph::capture_begin),
           py::arg("pool") = c10::cuda::MempoolId_t{0, 0})
       .def(
           "capture_end",
-          torch::wrap_pybind_function(&at::cuda::CUDAGraph::capture_end),
-          py::call_guard<py::gil_scoped_release>())
+          torch::wrap_pybind_function_no_gil(&at::cuda::CUDAGraph::capture_end))
       .def(
           "replay",
-          torch::wrap_pybind_function(&at::cuda::CUDAGraph::replay),
-          py::call_guard<py::gil_scoped_release>())
+          torch::wrap_pybind_function_no_gil(&at::cuda::CUDAGraph::replay))
       .def(
           "reset",
-          torch::wrap_pybind_function(&at::cuda::CUDAGraph::reset),
-          py::call_guard<py::gil_scoped_release>())
+          torch::wrap_pybind_function_no_gil(&at::cuda::CUDAGraph::reset))
       .def(
           "pool",
-          torch::wrap_pybind_function(&at::cuda::CUDAGraph::pool),
-          py::call_guard<py::gil_scoped_release>())
+          torch::wrap_pybind_function_no_gil(&at::cuda::CUDAGraph::pool))
       .def(
           "debug_dump",
-          torch::wrap_pybind_function(&::at::cuda::CUDAGraph::debug_dump),
-          py::call_guard<py::gil_scoped_release>())
+          torch::wrap_pybind_function_no_gil(
+              &::at::cuda::CUDAGraph::debug_dump))
       .def(
           "enable_debug_mode",
-          torch::wrap_pybind_function(
-              &::at::cuda::CUDAGraph::enable_debug_mode),
-          py::call_guard<py::gil_scoped_release>())
+          torch::wrap_pybind_function_no_gil(
+              &::at::cuda::CUDAGraph::enable_debug_mode))
       .def(
           "debug_dump",
-          torch::wrap_pybind_function(&::at::cuda::CUDAGraph::debug_dump),
-          py::call_guard<py::gil_scoped_release>(),
+          torch::wrap_pybind_function_no_gil(
+              &::at::cuda::CUDAGraph::debug_dump),
           py::arg("debug_path"));
 }
diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp
index 331b6add4434..6b1c44091d39 100644
--- a/torch/csrc/cuda/Module.cpp
+++ b/torch/csrc/cuda/Module.cpp
@@ -230,19 +230,20 @@ PyObject* THCPModule_setStream_wrap(
   int64_t device_type = 0;
 
   // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
-  static char* kwlist[] = {"stream_id", "device_index", "device_type", nullptr};
+  constexpr char* kwlist[] = {
+      "stream_id", "device_index", "device_type", nullptr};
   if (!PyArg_ParseTupleAndKeywords(
           args,
           kwargs,
-          "|KKK",
-          kwlist,
+          "|LLL",
+          const_cast<char**>(kwlist),
           &stream_id,
           &device_index,
           &device_type)) {
   }
 
-  auto stream =
-      at::cuda::CUDAStream::unpack3(stream_id, device_index, device_type);
+  auto stream = at::cuda::CUDAStream::unpack3(
+      stream_id, device_index, static_cast<c10::DeviceType>(device_type));
 
   // NOLINTNEXTLINE(bugprone-signed-char-misuse)
   auto device = static_cast<int>(c10::cuda::current_device());
@@ -564,6 +565,7 @@ PyObject* THCPModule_memoryStats(PyObject* _unused, PyObject* arg) {
   result["reserved_bytes"] = statArrayToDict(stats.reserved_bytes);
   result["active_bytes"] = statArrayToDict(stats.active_bytes);
   result["inactive_split_bytes"] = statArrayToDict(stats.inactive_split_bytes);
+  result["requested_bytes"] = statArrayToDict(stats.requested_bytes);
   result["oversize_allocations"] = statToDict(stats.oversize_allocations);
   result["oversize_segments"] = statToDict(stats.oversize_segments);
 
@@ -599,18 +601,39 @@ struct Frame {
   int lasti;
 };
 
+static std::mutex to_free_frames_mutex;
+static std::vector<Frame> to_free_frames;
+
 struct StackContext : public c10::cuda::CUDACachingAllocator::Context {
+  // Locking:
+  // We need to free PyCodeObjects when ~StackContext runs, but
+  // CUDACachingAllocator may hold its device lock when ~StackContext runs.
+
+  // Because the thread calling the allocator _may_ hold the GIL,
+  // attempting to lock the GIL in ~StackContext can deadlock:
+  // T0: GIL Lock -> Call Allocator    ->| Waiting Device Lock
+  // T1: Call Allocator -> Device Lock ->| Waiting GIL Lock
+  // Instead the destructor defers freeing stack frames by putting them in
+  // to_free_frames. We still need a lock to manage this vector, but
+  // we can ensure an overall lock ordering of GIL -> device_lock ->
+  // to_free_frames_mutex because ::gather is called outside of the device lock.
   std::vector<Frame> frames;
   // Empty if cpp traces weren't enabled
   std::string cpp_frames;
+
   ~StackContext() {
-    py::gil_scoped_acquire acquire;
-    for (auto& f : frames) {
-      Py_XDECREF((PyObject*)f.code);
-    }
+    std::lock_guard lock(to_free_frames_mutex);
+    to_free_frames.insert(to_free_frames.end(), frames.begin(), frames.end());
   }
   static std::shared_ptr<StackContext> _gather() {
     py::gil_scoped_acquire acquire;
+    {
+      std::lock_guard lock(to_free_frames_mutex);
+      for (Frame f : to_free_frames) {
+        Py_XDECREF(f.code);
+      }
+      to_free_frames.clear();
+    }
     auto r = std::make_shared<StackContext>();
     PyFrameObject* f = PyEval_GetFrame();
     Py_XINCREF(f);
@@ -645,6 +668,7 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
   py::str total_size_s = "total_size";
   py::str allocated_size_s = "allocated_size";
   py::str active_size_s = "active_size";
+  py::str requested_size_s = "requested_size";
   py::str stream_s = "stream";
   py::str segment_type_s = "segment_type";
   py::str large_s = "large";
@@ -690,6 +714,7 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
     segmentDict[total_size_s] = segmentInfo.total_size;
     segmentDict[allocated_size_s] = segmentInfo.allocated_size;
     segmentDict[active_size_s] = segmentInfo.active_size;
+    segmentDict[requested_size_s] = segmentInfo.requested_size;
     // we want the python objects to pickle easily so use an int to
     // represent the stream rather than a torch.cuda.stream object
     segmentDict[stream_s] = int64_t(segmentInfo.stream);
@@ -699,6 +724,7 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
     for (const auto& blockInfo : segmentInfo.blocks) {
       py::dict blockDict;
       blockDict[size_s] = blockInfo.size;
+      blockDict[requested_size_s] = blockInfo.requested_size;
       blockDict[state_s] =
           (blockInfo.allocated
                ? active_allocated_s
@@ -1072,9 +1098,8 @@ static PyObject* THCPModule_initExtension(PyObject* self, PyObject* noargs) {
   auto num_gpus = c10::cuda::device_count();
   auto default_cuda_generators = PyTuple_New(static_cast<Py_ssize_t>(num_gpus));
   for (const auto i : c10::irange(num_gpus)) {
-    // NOLINTNEXTLINE(performance-unnecessary-copy-initialization)
-    auto gen = at::cuda::detail::getDefaultCUDAGenerator(i);
-    auto cast_gen = (THPGenerator*)THPGenerator_initDefaultGenerator(gen);
+    auto cast_gen = (THPGenerator*)THPGenerator_initDefaultGenerator(
+        at::cuda::detail::getDefaultCUDAGenerator(i));
     // This reference is meant to be given away, so no need to incref here.
     PyTuple_SetItem(default_cuda_generators, i, (PyObject*)cast_gen);
   }
diff --git a/torch/csrc/cuda/Stream.cpp b/torch/csrc/cuda/Stream.cpp
index bb7be99ef0c3..936af674c24d 100644
--- a/torch/csrc/cuda/Stream.cpp
+++ b/torch/csrc/cuda/Stream.cpp
@@ -28,7 +28,7 @@ static PyObject* THCPStream_pynew(
   uint64_t stream_ptr = 0;
 
   // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
-  static char* kwlist[] = {
+  constexpr char* kwlist[] = {
       "priority",
       "stream_id",
       "device_index",
@@ -38,8 +38,8 @@ static PyObject* THCPStream_pynew(
   if (!PyArg_ParseTupleAndKeywords(
           args,
           kwargs,
-          "|iKKKK",
-          kwlist,
+          "|iLLLK",
+          const_cast<char**>(kwlist),
           &priority,
           &stream_id,
           &device_index,
@@ -59,14 +59,14 @@ static PyObject* THCPStream_pynew(
   }
 
   at::cuda::CUDAStream stream = (stream_id || device_index || device_type)
-      ? at::cuda::CUDAStream::unpack3(stream_id, device_index, device_type)
+      ? at::cuda::CUDAStream::unpack3(
+            stream_id, device_index, static_cast<c10::DeviceType>(device_type))
       : stream_ptr
       ? at::cuda::getStreamFromExternal(
             reinterpret_cast<cudaStream_t>(stream_ptr), current_device)
       : at::cuda::getStreamFromPool(
             /* isHighPriority */ priority < 0 ? true : false);
 
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   THCPStream* self = (THCPStream*)ptr.get();
   self->stream_id = static_cast<int64_t>(stream.id());
   self->device_index = static_cast<int64_t>(stream.device_index());
@@ -104,9 +104,7 @@ static PyObject* THCPStream_priority_range(
     PyObject* _unused,
     PyObject* noargs) {
   HANDLE_TH_ERRORS
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  int least_priority, greatest_priority;
-  std::tie(least_priority, greatest_priority) =
+  auto [least_priority, greatest_priority] =
       at::cuda::CUDAStream::priority_range();
   return Py_BuildValue("(ii)", least_priority, greatest_priority);
   END_HANDLE_TH_ERRORS
diff --git a/torch/csrc/cuda/Stream.h b/torch/csrc/cuda/Stream.h
index 9b7197d74390..6175ac2ea032 100644
--- a/torch/csrc/cuda/Stream.h
+++ b/torch/csrc/cuda/Stream.h
@@ -5,7 +5,6 @@
 #include <torch/csrc/Stream.h>
 #include <torch/csrc/python_headers.h>
 
-// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
 struct THCPStream : THPStream {
   at::cuda::CUDAStream cuda_stream;
 };
diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp
index e215ce0e3ed6..30f0d873ef88 100644
--- a/torch/csrc/cuda/comm.cpp
+++ b/torch/csrc/cuda/comm.cpp
@@ -54,9 +54,9 @@ static inline std::vector<Tensor>& _broadcast_out_impl(
 #ifdef USE_NCCL
   std::vector<Tensor> nccl_list;
   nccl_list.reserve(out_tensors.size() + 1);
-  nccl_list.push_back(tensor);
+  nccl_list.emplace_back(tensor);
   for (auto& out_tensor : out_tensors) {
-    nccl_list.push_back(out_tensor);
+    nccl_list.emplace_back(out_tensor);
   }
   if (nccl::is_available(nccl_list)) {
     nccl::broadcast(nccl_list);
@@ -102,7 +102,7 @@ std::vector<Tensor> broadcast(const Tensor& tensor, IntArrayRef devices) {
     TORCH_CHECK(
         device >= 0, "Expected non-negative device index, but got ", device);
     if (device != tensor.get_device()) {
-      diff_device_dst_tensors.push_back(at::empty(
+      diff_device_dst_tensors.emplace_back(at::empty(
           tensor.sizes(),
           tensor.options().device(
               at::Device(DeviceType::CUDA, device)))); // preserve memory format
@@ -116,9 +116,9 @@ std::vector<Tensor> broadcast(const Tensor& tensor, IntArrayRef devices) {
   for (auto device : devices) {
     // NOLINTNEXTLINE(bugprone-branch-clone)
     if (device != tensor.get_device()) {
-      dst_tensors.push_back(*it++);
+      dst_tensors.emplace_back(*it++);
     } else {
-      dst_tensors.push_back(tensor);
+      dst_tensors.emplace_back(tensor);
     }
   }
   TORCH_INTERNAL_ASSERT(it == diff_device_dst_tensors.end());
@@ -197,7 +197,7 @@ tensor_list2d broadcast_coalesced(
         for (const auto& var : torch::utils::unflatten_sparse_tensors(
                  inds, vals, chunk.tensors)) {
           // See NOTE [ Version Counter in comm.*_coalesced ]
-          device_outputs.push_back(make_variable(var.tensor_data(), false));
+          device_outputs.emplace_back(make_variable(var.tensor_data(), false));
         }
       }
     } else {
@@ -209,7 +209,7 @@ tensor_list2d broadcast_coalesced(
         for (auto& var :
              torch::utils::unflatten_dense_tensors(results[i], chunk.tensors)) {
           // See NOTE [ Version Counter in comm.*_coalesced ]
-          device_outputs.push_back(make_variable(var.tensor_data(), false));
+          device_outputs.emplace_back(make_variable(var.tensor_data(), false));
         }
       }
     }
@@ -255,7 +255,7 @@ std::vector<at::Tensor>& scatter_out(
     bool same_ndim = out_sizes.size() == tensor.dim();
     if (same_ndim) {
       total_size += out_sizes[dim];
-      chunk_sizes.push_back(out_sizes[dim]);
+      chunk_sizes.emplace_back(out_sizes[dim]);
       out_sizes[dim] = tensor.size(dim);
     }
     TORCH_CHECK(
@@ -379,7 +379,7 @@ static inline at::Tensor& _gather_out_impl(
   std::vector<int64_t> chunk_sizes;
   chunk_sizes.reserve(tensors.size());
   for (auto& tensor : tensors) {
-    chunk_sizes.push_back(tensor.size(dim));
+    chunk_sizes.emplace_back(tensor.size(dim));
   }
   auto chunks =
       out_tensor.split_with_sizes(/*split_sizes=*/chunk_sizes, /*dim=*/dim);
diff --git a/torch/csrc/cuda/memory_snapshot.cpp b/torch/csrc/cuda/memory_snapshot.cpp
index 13db7cd81010..bbde2d1ff420 100644
--- a/torch/csrc/cuda/memory_snapshot.cpp
+++ b/torch/csrc/cuda/memory_snapshot.cpp
@@ -44,6 +44,7 @@ std::string _memory_snapshot_pickled() {
   IValue total_size_s = "total_size";
   IValue allocated_size_s = "allocated_size";
   IValue active_size_s = "active_size";
+  IValue requested_size_s = "requested_size";
   IValue stream_s = "stream";
   IValue segment_type_s = "segment_type";
   IValue large_s = "large";
@@ -71,6 +72,7 @@ std::string _memory_snapshot_pickled() {
     segmentDict.insert(total_size_s, segmentInfo.total_size);
     segmentDict.insert(allocated_size_s, segmentInfo.allocated_size);
     segmentDict.insert(active_size_s, segmentInfo.active_size);
+    segmentDict.insert(requested_size_s, segmentInfo.requested_size);
     segmentDict.insert(stream_s, int64_t(segmentInfo.stream));
     segmentDict.insert(
         segment_type_s, (segmentInfo.is_large ? large_s : small_s));
@@ -79,6 +81,7 @@ std::string _memory_snapshot_pickled() {
     for (const auto& blockInfo : segmentInfo.blocks) {
       auto blockDict = new_dict();
       blockDict.insert(size_s, blockInfo.size);
+      blockDict.insert(requested_size_s, blockInfo.requested_size);
       blockDict.insert(
           state_s,
           (blockInfo.allocated
diff --git a/torch/csrc/cuda/utils.cpp b/torch/csrc/cuda/utils.cpp
index 1011e4683279..e62e176473f2 100644
--- a/torch/csrc/cuda/utils.cpp
+++ b/torch/csrc/cuda/utils.cpp
@@ -28,7 +28,8 @@ THPUtils_PySequence_to_CUDAStreamList(PyObject* obj) {
       streams.emplace_back(at::cuda::CUDAStream::unpack3(
           (reinterpret_cast<THCPStream*>(stream))->stream_id,
           (reinterpret_cast<THCPStream*>(stream))->device_index,
-          (reinterpret_cast<THCPStream*>(stream))->device_type));
+          static_cast<c10::DeviceType>(
+              (reinterpret_cast<THCPStream*>(stream))->device_type)));
     } else if (stream == Py_None) {
       streams.emplace_back();
     } else {
diff --git a/torch/csrc/distributed/autograd/utils.cpp b/torch/csrc/distributed/autograd/utils.cpp
index 4167d3b81154..3de6e1e4acd7 100644
--- a/torch/csrc/distributed/autograd/utils.cpp
+++ b/torch/csrc/distributed/autograd/utils.cpp
@@ -20,7 +20,6 @@ using torch::distributed::rpc::JitFuture;
 using torch::distributed::rpc::Message;
 using torch::distributed::rpc::MessageType;
 using torch::distributed::rpc::RpcAgent;
-using torch::distributed::rpc::RpcCommandBase;
 using torch::distributed::rpc::WorkerInfo;
 
 void addSendRpcBackward(
diff --git a/torch/csrc/distributed/c10d/Backend.hpp b/torch/csrc/distributed/c10d/Backend.hpp
index b43881d06a42..70452b32287c 100644
--- a/torch/csrc/distributed/c10d/Backend.hpp
+++ b/torch/csrc/distributed/c10d/Backend.hpp
@@ -34,7 +34,7 @@ class TORCH_API Backend : public torch::CustomClassHolder {
         std::string backend,
         std::chrono::milliseconds timeout = kBackendDefaultTimeout)
         : timeout(timeout), backend(std::move(backend)) {}
-    virtual ~Options() = default;
+    ~Options() override = default;
 
     std::chrono::milliseconds timeout;
 
@@ -43,7 +43,7 @@ class TORCH_API Backend : public torch::CustomClassHolder {
   };
 
   explicit Backend(int rank, int size);
-  virtual ~Backend() = 0;
+  ~Backend() override = 0;
 
   int getRank() const {
     return rank_;
diff --git a/torch/csrc/distributed/c10d/FileStore.cpp b/torch/csrc/distributed/c10d/FileStore.cpp
index 8e364e0e4207..f3043ee73b89 100644
--- a/torch/csrc/distributed/c10d/FileStore.cpp
+++ b/torch/csrc/distributed/c10d/FileStore.cpp
@@ -278,7 +278,7 @@ off_t refresh(
 FileStore::FileStore(std::string path, int numWorkers)
     : Store(),
       path_(std::move(path)),
-      pos_(0),
+
       numWorkers_(numWorkers),
       cleanupKey_("cleanup/"),
       refCountKey_("refcount/"),
@@ -313,7 +313,7 @@ FileStore::~FileStore() {
   // Clean up the file if number of references is 0.
   if (refCount == 0 && numWorkers_ >= 0 && numFinishedWorker >= numWorkers_) {
     // Best effort removal without checking the return
-    std::remove(path_.c_str());
+    ::remove(path_.c_str());
   }
 }
 
diff --git a/torch/csrc/distributed/c10d/FileStore.hpp b/torch/csrc/distributed/c10d/FileStore.hpp
index 826c94f302f1..bb810c0b338c 100644
--- a/torch/csrc/distributed/c10d/FileStore.hpp
+++ b/torch/csrc/distributed/c10d/FileStore.hpp
@@ -13,7 +13,7 @@ class TORCH_API FileStore : public Store {
  public:
   explicit FileStore(std::string  path, int numWorkers);
 
-  virtual ~FileStore();
+  ~FileStore() override;
 
   void set(const std::string& key, const std::vector<uint8_t>& value) override;
 
@@ -47,7 +47,7 @@ class TORCH_API FileStore : public Store {
   int64_t addHelper(const std::string& key, int64_t i);
 
   std::string path_;
-  off_t pos_;
+  off_t pos_{0};
 
   int numWorkers_;
   const std::string cleanupKey_;
diff --git a/torch/csrc/distributed/c10d/GlooDeviceFactory.hpp b/torch/csrc/distributed/c10d/GlooDeviceFactory.hpp
index dd37b261062f..1221e9d033f2 100644
--- a/torch/csrc/distributed/c10d/GlooDeviceFactory.hpp
+++ b/torch/csrc/distributed/c10d/GlooDeviceFactory.hpp
@@ -21,7 +21,7 @@ class TORCH_API GlooDeviceFactory {
       const std::string& hostname);
 };
 
-C10_DECLARE_SHARED_REGISTRY(
+TORCH_DECLARE_SHARED_REGISTRY(
     GlooDeviceRegistry,
     ::gloo::transport::Device,
     const std::string&, /* interface */
diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp
index fb5d91d2e11c..9f45ec61e09b 100644
--- a/torch/csrc/distributed/c10d/NCCLUtils.hpp
+++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp
@@ -215,7 +215,7 @@ class NCCLComm {
 
 // Helper that automatically cleans up premul sums.
 struct ncclRedOpRAII {
-  ncclRedOpRAII() {}
+  ncclRedOpRAII() = default;
   ncclRedOpRAII(ncclRedOp_t op) : op_(op) {}
   ncclRedOpRAII(ncclRedOp_t op, ncclComm_t comm) :
     op_(op), comm_(comm), premul_sum_(true) {}
diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp
index 19c0cf5efdc4..b2bd7fe0d42f 100644
--- a/torch/csrc/distributed/c10d/Ops.cpp
+++ b/torch/csrc/distributed/c10d/Ops.cpp
@@ -1,6 +1,5 @@
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <c10/util/intrusive_ptr.h>
-#include <torch/csrc/distributed/c10d/Ops.hpp>
 #include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
 #include <torch/csrc/distributed/c10d/Types.hpp>
 #include <torch/library.h>
@@ -58,387 +57,6 @@ TORCH_LIBRARY(c10d, m) {
 
 namespace ops {
 
-c10::intrusive_ptr<Work> broadcast(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    const BroadcastOptions& opts) {
-  // TODO: handles the case of using a PythonProcessGroup which is used in
-  // Reducer.cpp This can be removed once
-  // https://github.com/pytorch/pytorch/issues/90659 is resolved
-  if (!process_group->hasBackends()) {
-    auto tensor_vec = tensors.vec();
-    return process_group->broadcast(tensor_vec, opts);
-  }
-
-  static auto op =
-      c10::Dispatcher::singleton()
-          .findSchemaOrThrow("c10d::broadcast_", "")
-          .typed<std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
-              at::TensorList,
-              const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-              int64_t,
-              int64_t,
-              int64_t)>();
-  // It's awakward to unbox the opts here and box them again in the custom C++
-  // op. But it's also complicated to make opts as a CustomClassHolder. Leave it
-  // as it is now.
-  return std::get<1>(op.call(
-      tensors,
-      process_group,
-      opts.rootRank,
-      opts.rootTensor,
-      opts.timeout.count()));
-}
-
-c10::intrusive_ptr<Work> allreduce(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    const AllreduceOptions& opts) {
-  // TODO: handles the case of using a PythonProcessGroup which is used in
-  // Reducer.cpp This can be removed once
-  // https://github.com/pytorch/pytorch/issues/90659 is resolved
-  if (!process_group->hasBackends()) {
-    auto tensor_vec = tensors.vec();
-    return process_group->allreduce(tensor_vec, opts);
-  }
-
-  static auto op =
-      c10::Dispatcher::singleton()
-          .findSchemaOrThrow("c10d::allreduce_", "")
-          .typed<std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
-              at::TensorList,
-              const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-              const c10::intrusive_ptr<::c10d::ReduceOp>&,
-              int64_t)>();
-
-  return std::get<1>(op.call(
-      tensors,
-      process_group,
-      c10::make_intrusive<ReduceOp>(opts.reduceOp),
-      opts.timeout.count()));
-}
-
-c10::intrusive_ptr<Work> allreduce_coalesced(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    const AllreduceCoalescedOptions& opts) {
-  static auto op = c10::Dispatcher::singleton()
-                       .findSchemaOrThrow("c10d::allreduce_coalesced_", "")
-                       .typed<c10::intrusive_ptr<::c10d::Work>(
-                           at::TensorList,
-                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-                           const c10::intrusive_ptr<::c10d::ReduceOp>&,
-                           int64_t)>();
-
-  return op.call(
-      tensors,
-      process_group,
-      c10::make_intrusive<ReduceOp>(opts.reduceOp),
-      opts.timeout.count());
-}
-
-c10::intrusive_ptr<Work> allgather(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const std::vector<std::vector<at::Tensor>>& output_tensors,
-    at::TensorList input_tensors,
-    const AllgatherOptions& opts) {
-  // TODO: handles the case of using a PythonProcessGroup which is used in
-  // Reducer.cpp This can be removed once
-  // https://github.com/pytorch/pytorch/issues/90659 is resolved
-  if (!process_group->hasBackends()) {
-    auto input_tensors_vec = input_tensors.vec();
-    return process_group->allgather(
-        const_cast<std::vector<std::vector<at::Tensor>>&>(output_tensors),
-        input_tensors_vec,
-        opts);
-  }
-
-  static auto op = c10::Dispatcher::singleton()
-                       .findSchemaOrThrow("c10d::allgather_", "")
-                       .typed<std::tuple<
-                           std::vector<std::vector<at::Tensor>>,
-                           c10::intrusive_ptr<Work>>(
-                           const std::vector<std::vector<at::Tensor>>&,
-                           at::TensorList,
-                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-                           int64_t)>();
-
-  return std::get<1>(op.call(
-      output_tensors, input_tensors, process_group, opts.timeout.count()));
-}
-
-c10::intrusive_ptr<Work> _allgather_base(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::Tensor& output_tensor,
-    at::Tensor& input_tensor,
-    const AllgatherOptions& opts) {
-  static auto op = c10::Dispatcher::singleton()
-                       .findSchemaOrThrow("c10d::_allgather_base_", "")
-                       .typed<std::tuple<at::Tensor, c10::intrusive_ptr<Work>>(
-                           at::Tensor&,
-                           at::Tensor&,
-                           const c10::intrusive_ptr<::c10d::ProcessGroup>&)>();
-
-  return std::get<1>(op.call(output_tensor, input_tensor, process_group));
-}
-
-c10::intrusive_ptr<Work> allgather_coalesced(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const std::vector<std::vector<at::Tensor>>& output_lists,
-    const at::TensorList& input_list,
-    const AllgatherOptions& opts) {
-  static auto op = c10::Dispatcher::singleton()
-                       .findSchemaOrThrow("c10d::allgather_coalesced_", "")
-                       .typed<c10::intrusive_ptr<Work>(
-                           const std::vector<std::vector<at::Tensor>>&,
-                           const at::TensorList&,
-                           const c10::intrusive_ptr<::c10d::ProcessGroup>&)>();
-
-  return op.call(output_lists, input_list, process_group);
-}
-
-c10::intrusive_ptr<Work> reduce_scatter(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const at::TensorList& output_tensors,
-    const std::vector<std::vector<at::Tensor>>& input_tensors,
-    const ReduceScatterOptions& opts) {
-  static auto op =
-      c10::Dispatcher::singleton()
-          .findSchemaOrThrow("c10d::reduce_scatter_", "")
-          .typed<std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
-              const at::TensorList&,
-              const std::vector<std::vector<at::Tensor>>&,
-              const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-              const c10::intrusive_ptr<::c10d::ReduceOp>&,
-              int64_t)>();
-  return std::get<1>(op.call(
-      output_tensors,
-      input_tensors,
-      process_group,
-      c10::make_intrusive<::c10d::ReduceOp>(opts.reduceOp),
-      opts.timeout.count()));
-}
-
-c10::intrusive_ptr<Work> _reduce_scatter_base(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::Tensor& output_tensor,
-    at::Tensor& input_tensor,
-    const ReduceScatterOptions& opts) {
-  static auto op = c10::Dispatcher::singleton()
-                       .findSchemaOrThrow("c10d::_reduce_scatter_base_", "")
-                       .typed<std::tuple<at::Tensor, c10::intrusive_ptr<Work>>(
-                           at::Tensor&,
-                           at::Tensor&,
-                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-                           const c10::intrusive_ptr<::c10d::ReduceOp>&,
-                           int64_t)>();
-  return std::get<1>(op.call(
-      output_tensor,
-      input_tensor,
-      process_group,
-      c10::make_intrusive<::c10d::ReduceOp>(opts.reduceOp),
-      opts.timeout.count()));
-}
-
-c10::intrusive_ptr<Work> reduce(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    const ReduceOptions& opts) {
-  static auto op = c10::Dispatcher::singleton()
-                       .findSchemaOrThrow("c10d::reduce_", "")
-                       .typed<c10::intrusive_ptr<::c10d::Work>(
-                           at::TensorList,
-                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-                           const c10::intrusive_ptr<::c10d::ReduceOp>&,
-                           int64_t,
-                           int64_t,
-                           int64_t)>();
-  return op.call(
-      tensors,
-      process_group,
-      c10::make_intrusive<ReduceOp>(opts.reduceOp),
-      opts.rootRank,
-      opts.rootTensor,
-      opts.timeout.count());
-}
-
-c10::intrusive_ptr<Work> gather(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const std::vector<std::vector<at::Tensor>>& output_tensors,
-    const at::TensorList& input_tensors,
-    const GatherOptions& opts) {
-  static auto op = c10::Dispatcher::singleton()
-                       .findSchemaOrThrow("c10d::gather_", "")
-                       .typed<c10::intrusive_ptr<::c10d::Work>(
-                           const std::vector<std::vector<at::Tensor>>&,
-                           const at::TensorList&,
-                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-                           int64_t,
-                           int64_t)>();
-  return op.call(
-      output_tensors,
-      input_tensors,
-      process_group,
-      opts.rootRank,
-      opts.timeout.count());
-}
-
-c10::intrusive_ptr<Work> scatter(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const at::TensorList& output_tensors,
-    const std::vector<std::vector<at::Tensor>>& input_tensors,
-    const ScatterOptions& opts) {
-  static auto op =
-      c10::Dispatcher::singleton()
-          .findSchemaOrThrow("c10d::scatter_", "")
-          .typed<std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
-              const at::TensorList&,
-              const std::vector<std::vector<at::Tensor>>&,
-              const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-              int64_t,
-              int64_t)>();
-  return std::get<1>(op.call(
-      output_tensors,
-      input_tensors,
-      process_group,
-      opts.rootRank,
-      opts.timeout.count()));
-}
-
-c10::intrusive_ptr<Work> alltoall(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const at::TensorList& output_tensors,
-    const at::TensorList& input_tensors,
-    const AllToAllOptions& opts) {
-  static auto op =
-      c10::Dispatcher::singleton()
-          .findSchemaOrThrow("c10d::alltoall_", "")
-          .typed<std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
-              const at::TensorList&,
-              const at::TensorList&,
-              const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-              int64_t)>();
-  return std::get<1>(op.call(
-      output_tensors, input_tensors, process_group, opts.timeout.count()));
-}
-
-c10::intrusive_ptr<Work> alltoall_base(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::Tensor& output,
-    at::Tensor& input,
-    std::vector<int64_t> output_split_sizes,
-    std::vector<int64_t> input_split_sizes,
-    const AllToAllOptions& opts) {
-  static auto op = c10::Dispatcher::singleton()
-                       .findSchemaOrThrow("c10d::alltoall_base_", "")
-                       .typed<c10::intrusive_ptr<::c10d::Work>(
-                           at::Tensor&,
-                           at::Tensor&,
-                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-                           std::vector<int64_t>,
-                           std::vector<int64_t>,
-                           int64_t)>();
-  return op.call(
-      output,
-      input,
-      process_group,
-      output_split_sizes,
-      input_split_sizes,
-      opts.timeout.count());
-}
-
-void monitored_barrier(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const BarrierOptions& opts,
-    bool wait_all_ranks) {
-  static auto op = c10::Dispatcher::singleton()
-                       .findSchemaOrThrow("c10d::monitored_barrier_", "")
-                       .typed<void(
-                           at::Tensor,
-                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-                           const std::vector<int64_t>&,
-                           int64_t,
-                           bool)>();
-  // Default to using cpu implementation, monitored barrier is only for GLOO
-  at::Tensor tensor = at::empty({0}, at::TensorOptions().device(at::kCPU));
-  op.call(
-      tensor,
-      process_group,
-      opts.device_ids,
-      opts.timeout.count(),
-      wait_all_ranks);
-}
-
-c10::intrusive_ptr<Work> barrier(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const BarrierOptions& opts) {
-  static at::Tensor tensor;
-  // TODO: if nccl was specified then use it
-  if (process_group->getBackendType() ==
-      c10d::ProcessGroup::BackendType::NCCL) {
-    // set cuda tensor
-    tensor = at::empty(
-        {1}, at::TensorOptions().device(at::DeviceType::CUDA).dtype(at::kByte));
-  } else {
-    // Default to using cpu implementation
-    tensor = at::empty(
-        {1}, at::TensorOptions().device(at::DeviceType::CPU).dtype(at::kByte));
-  }
-
-  static auto op = c10::Dispatcher::singleton()
-                       .findSchemaOrThrow("c10d::barrier", "")
-                       .typed<c10::intrusive_ptr<::c10d::Work>(
-                           at::Tensor,
-                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-                           const std::vector<int64_t>&,
-                           int64_t)>();
-
-  return op.call(tensor, process_group, opts.device_ids, opts.timeout.count());
-}
-
-c10::intrusive_ptr<Work> send(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    int64_t dstRank,
-    int64_t tag) {
-  static auto op = c10::Dispatcher::singleton()
-                       .findSchemaOrThrow("c10d::send", "")
-                       .typed<c10::intrusive_ptr<::c10d::Work>(
-                           at::TensorList,
-                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-                           int64_t,
-                           int64_t)>();
-  return op.call(tensors, process_group, dstRank, tag);
-}
-
-c10::intrusive_ptr<Work> recv(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    int64_t srcRank,
-    int64_t tag) {
-  static auto op = c10::Dispatcher::singleton()
-                       .findSchemaOrThrow("c10d::recv_", "")
-                       .typed<c10::intrusive_ptr<::c10d::Work>(
-                           at::TensorList,
-                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-                           int64_t,
-                           int64_t)>();
-  return op.call(tensors, process_group, srcRank, tag);
-}
-
-c10::intrusive_ptr<Work> recv_any_source(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    int64_t tag) {
-  static auto op = c10::Dispatcher::singleton()
-                       .findSchemaOrThrow("c10d::recv_any_source_", "")
-                       .typed<c10::intrusive_ptr<::c10d::Work>(
-                           at::TensorList,
-                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
-                           int64_t)>();
-  return op.call(tensors, process_group, tag);
-}
-
 // Below are ProcessGroup's corresponding ops for each backend. Ops are but
 // routed through the dispatcher to be dispatched to the appropriate backend.
 // Currently a no-op as the process group does not have a list of backends.
diff --git a/torch/csrc/distributed/c10d/Ops.hpp b/torch/csrc/distributed/c10d/Ops.hpp
deleted file mode 100644
index e414640cccac..000000000000
--- a/torch/csrc/distributed/c10d/Ops.hpp
+++ /dev/null
@@ -1,117 +0,0 @@
-#pragma once
-
-#include <c10/util/intrusive_ptr.h>
-#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
-
-namespace c10d {
-namespace ops {
-
-// Below are essentially ProcessGroup's corresponding ops but routed to the
-// dispatcher. To be noted, it's a convention to use at::TensorList to represent
-// const std::vector<at::Tensor>&. However, const std::vector<at::Tensor>& is
-// used whenever the API accepts std::vector<std::vector<at::Tensor>>& to keep
-// consistency.
-TORCH_API c10::intrusive_ptr<Work> broadcast(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    const BroadcastOptions& opts = {});
-
-TORCH_API c10::intrusive_ptr<Work> allreduce(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    const AllreduceOptions& opts = {});
-
-TORCH_API c10::intrusive_ptr<Work> allreduce_coalesced(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    const AllreduceCoalescedOptions& opts = {});
-
-TORCH_API c10::intrusive_ptr<Work> allgather(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const std::vector<std::vector<at::Tensor>>& output_tensors,
-    at::TensorList input_tensors,
-    const AllgatherOptions& opts = {});
-
-TORCH_API c10::intrusive_ptr<Work> _allgather_base(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::Tensor& outputTensor,
-    at::Tensor& inputTensor,
-    const AllgatherOptions& opts = {});
-
-TORCH_API c10::intrusive_ptr<Work> allgather_coalesced(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const std::vector<std::vector<at::Tensor>>& output_lists,
-    const at::TensorList& input_list,
-    const AllgatherOptions& opts = {});
-
-TORCH_API c10::intrusive_ptr<Work> reduce_scatter(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const at::TensorList& output_tensors,
-    const std::vector<std::vector<at::Tensor>>& input_tensors,
-    const ReduceScatterOptions& opts = {});
-
-TORCH_API c10::intrusive_ptr<Work> _reduce_scatter_base(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-     at::Tensor& output_tensor,
-     at::Tensor& input_tensor,
-    const ReduceScatterOptions& opts = {});
-
-TORCH_API c10::intrusive_ptr<Work> reduce(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    const ReduceOptions& opts = {});
-
-TORCH_API c10::intrusive_ptr<Work> gather(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const std::vector<std::vector<at::Tensor>>& output_tensors,
-    const at::TensorList& input_tensors,
-    const GatherOptions& opts = {});
-
-TORCH_API c10::intrusive_ptr<Work> scatter(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const at::TensorList& output_tensors,
-    const std::vector<std::vector<at::Tensor>>& input_tensors,
-    const ScatterOptions& opts = {});
-
-TORCH_API c10::intrusive_ptr<Work> alltoall_base(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::Tensor& output,
-    at::Tensor& input,
-    const std::vector<int64_t> outputSplitSizes,
-    const std::vector<int64_t> inputSplitSizes,
-    const AllToAllOptions& opts = {});
-
-TORCH_API c10::intrusive_ptr<Work> alltoall(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const at::TensorList& output_tensors,
-    const at::TensorList& input_tensors,
-    const AllToAllOptions& opts = {});
-
-TORCH_API c10::intrusive_ptr<Work> barrier(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const BarrierOptions& opts = {});
-
-TORCH_API void monitored_barrier(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    const BarrierOptions& opts,
-    bool waitAllRanks);
-
-TORCH_API c10::intrusive_ptr<Work> send(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    int64_t dstRank,
-    int64_t tag);
-
-TORCH_API c10::intrusive_ptr<Work> recv(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    int64_t srcRank,
-    int64_t tag);
-
-TORCH_API c10::intrusive_ptr<Work> recv_any_source(
-    const c10::intrusive_ptr<ProcessGroup>& process_group,
-    at::TensorList tensors,
-    int64_t tag);
-
-} // namespace ops
-} // namespace c10d
diff --git a/torch/csrc/distributed/c10d/PrefixStore.hpp b/torch/csrc/distributed/c10d/PrefixStore.hpp
index 42447b3c8bb8..57ada0c84544 100644
--- a/torch/csrc/distributed/c10d/PrefixStore.hpp
+++ b/torch/csrc/distributed/c10d/PrefixStore.hpp
@@ -11,7 +11,7 @@ class TORCH_API PrefixStore : public Store {
       std::string  prefix,
       c10::intrusive_ptr<Store> store);
 
-  virtual ~PrefixStore()= default;
+  ~PrefixStore() override = default;
 
   using Store::set;
   void set(const std::string& key, const std::vector<uint8_t>& value) override;
diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp
index f3a743a5da0a..6966e640aa91 100644
--- a/torch/csrc/distributed/c10d/ProcessGroup.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp
@@ -57,7 +57,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
         std::string backend,
         std::chrono::milliseconds timeout = kProcessGroupDefaultTimeout)
         : timeout(timeout), backend(std::move(backend)) {}
-    virtual ~Options() = default;
+    ~Options() override = default;
 
     std::chrono::milliseconds timeout;
 
@@ -83,7 +83,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
       int rank,
       int size,
       c10::intrusive_ptr<Options> options);
-  virtual ~ProcessGroup();
+  ~ProcessGroup() override;
 
   int getRank() const {
     return rank_;
@@ -236,15 +236,15 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
     static auto op =
         c10::Dispatcher::singleton()
             .findSchemaOrThrow("c10d::_allgather_base_", "")
-            .typed<c10::intrusive_ptr<Work>(
+            .typed<std::tuple<at::Tensor, c10::intrusive_ptr<Work>>(
                 at::Tensor&,
                 at::Tensor&,
                 const c10::intrusive_ptr<::c10d::ProcessGroup>&)>();
 
-    return op.call(
+    return std::get<1>(op.call(
         outputBuffer,
         inputBuffer,
-        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this));
+        c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this)));
   }
 
   // This function is deprecated and will be moved out of ProcessGroup to comms:
@@ -339,18 +339,18 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
       const ReduceScatterOptions& opts = ReduceScatterOptions()) {
     static auto op = c10::Dispatcher::singleton()
                          .findSchemaOrThrow("c10d::_reduce_scatter_base_", "")
-                         .typed<c10::intrusive_ptr<Work>(
+                         .typed<std::tuple<at::Tensor, c10::intrusive_ptr<Work>>(
                              at::Tensor&,
                              at::Tensor&,
                              const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                              const c10::intrusive_ptr<::c10d::ReduceOp>&,
                              int64_t)>();
-    return op.call(
+    return std::get<1>(op.call(
         outputBuffer,
         inputBuffer,
         c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
         c10::make_intrusive<::c10d::ReduceOp>(opts.reduceOp),
-        opts.timeout.count());
+        opts.timeout.count()));
   }
 
   virtual c10::intrusive_ptr<Work> alltoall_base(
@@ -383,16 +383,16 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
       const AllToAllOptions& opts = AllToAllOptions()) {
     static auto op = c10::Dispatcher::singleton()
                          .findSchemaOrThrow("c10d::alltoall_", "")
-                         .typed<c10::intrusive_ptr<::c10d::Work>(
-                             at::TensorList,
-                             at::TensorList,
+                         .typed<std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
+                             const at::TensorList&,
+                             const at::TensorList&,
                              const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                              int64_t)>();
-    return op.call(
+    return std::get<1>(op.call(
         outputTensors,
         inputTensors,
         c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
-        opts.timeout.count());
+        opts.timeout.count()));
   }
 
   virtual void monitoredBarrier(
diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
index 1d68523204c7..72f6734ac1ef 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
@@ -154,7 +154,7 @@ void checkRemainingTime(
       " ms.");
   if (remainingTime.count() < 0) {
     std::string rankInfo;
-    if (processedRanks.size() > 0) {
+    if (!processedRanks.empty()) {
       rankInfo = c10::str(
           "Successfully processed ranks: ", c10::Join(", ", processedRanks));
     } else {
@@ -446,8 +446,8 @@ std::vector<at::Tensor> ProcessGroupGloo::AsyncWork::result() {
   TORCH_CHECK(
       outputTensors_.size() <= 1,
       "work result does not support list of lists, use .getFuture() and value()");
-  return outputTensors_.size() == 0 ? std::vector<at::Tensor>()
-                                    : outputTensors_.at(0);
+  return outputTensors_.empty() ? std::vector<at::Tensor>()
+                                : outputTensors_.at(0);
 }
 
 c10::intrusive_ptr<c10::ivalue::Future> ProcessGroupGloo::AsyncWork::
@@ -469,7 +469,7 @@ c10::intrusive_ptr<c10::ivalue::Future> createFutureAsOutput(
 void returnFutureWithOutput(
     c10::intrusive_ptr<c10::ivalue::Future>& future,
     const std::vector<std::vector<at::Tensor>>& outputTensors) {
-  if (outputTensors.size() == 0) {
+  if (outputTensors.empty()) {
     future->markCompleted(c10::IValue(std::vector<at::Tensor>()));
     return;
   }
@@ -519,7 +519,7 @@ ProcessGroupGloo::AsyncWork::AsyncWork(
     // correct timestamps for work that is asynchronously executed.
     : Work(-1, OpType::UNKNOWN, nullptr, inputTensors),
       outputTensors_(std::move(outputTensors)),
-      future_(createFutureAsOutput(outputTensors)) {
+      future_(createFutureAsOutput(outputTensors_)) {
   if (profilingTitle != nullptr) {
     recordAsyncWorkProfilingInfo(profilingTitle, inputTensors);
   }
@@ -1847,7 +1847,7 @@ c10::intrusive_ptr<Work> ProcessGroupGloo::allgather(
     TORCH_CHECK(false, "ProcessGroupGloo::allgather: " + msg);
   };
 
-  if (inputs.size() == 0) {
+  if (inputs.empty()) {
     invalidArgument("requires non-empty input tensor list");
   }
 
@@ -2199,7 +2199,7 @@ c10::intrusive_ptr<Work> ProcessGroupGloo::gather(
     const auto& sizes = inputs[0].sizes();
     assertTypeAndSizesMatch(invalidArgument, outputs[0], options, sizes);
   } else {
-    if (outputs.size() != 0) {
+    if (!outputs.empty()) {
       invalidArgument("requires empty output on non-root");
     }
   }
@@ -2245,9 +2245,8 @@ class AsyncScatterWork : public ProcessGroupGloo::AsyncWork {
       : ProcessGroupGloo::AsyncWork(
             {outputs},
             "gloo:scatter",
-            inputs.size() > 0
-                ? c10::optional<std::vector<at::Tensor>>(inputs[0])
-                : c10::nullopt),
+            !inputs.empty() ? c10::optional<std::vector<at::Tensor>>(inputs[0])
+                            : c10::nullopt),
         context(context),
         outputs(outputs),
         inputs(inputs),
@@ -2383,7 +2382,7 @@ c10::intrusive_ptr<Work> ProcessGroupGloo::scatter(
     const auto& sizes = outputs[0].sizes();
     assertTypeAndSizesMatch(invalidArgument, inputs[0], options, sizes);
   } else {
-    if (inputs.size() != 0) {
+    if (!inputs.empty()) {
       invalidArgument("requires empty input on non-root");
     }
   }
@@ -2454,7 +2453,7 @@ class AsyncAlltoallWork : public ProcessGroupGloo::AsyncWork {
 
   void alltoall(at::Tensor& outputTensor, at::Tensor& inputTensor) {
     const auto scalarType = outputTensor.scalar_type();
-    if (outputCounts.size() == 0 && inputCounts.size() == 0) {
+    if (outputCounts.empty() && inputCounts.empty()) {
       // Gloo alltoall
       gloo::AlltoallOptions opts(context);
       opts.setTag(tag);
diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
index b966c984971f..a64bc37c4de5 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp
@@ -227,7 +227,7 @@ class TORCH_API ProcessGroupGloo : public Backend {
       int size,
       c10::intrusive_ptr<Options> options = Options::create());
 
-  virtual ~ProcessGroupGloo();
+  ~ProcessGroupGloo() override;
 
   c10::intrusive_ptr<Options> getOptions() {
     return options_;
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 35f409fc0368..6a11bacb376a 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -356,7 +356,7 @@ ProcessGroupNCCL::WorkNCCL::WorkNCCL(const WorkNCCL& w)
   exception_ = w.exception_;
 }
 
-ProcessGroupNCCL::WorkNCCL::~WorkNCCL() {}
+ProcessGroupNCCL::WorkNCCL::~WorkNCCL() = default;
 
 bool ProcessGroupNCCL::WorkNCCL::isCompleted() {
   checkAndSetException();
@@ -475,6 +475,12 @@ void ProcessGroupNCCL::WorkNCCL::synchronizeStreams() {
     // Block the current stream on the NCCL stream
     (*ncclEndEvents_)[i].block(currentStream);
   }
+
+  if (avoidRecordStreams_) {
+    // TORCH_INTERNAL_ASSERT(outputs_->size() > 0);
+    // TORCH_INTERNAL_ASSERT(stashed_for_allocator_safety_->size() > 0);
+    stashed_for_allocator_safety_->clear();
+  }
 }
 
 // Waiting on the work's corresponding CUDA events
@@ -623,6 +629,7 @@ ProcessGroupNCCL::ProcessGroupNCCL(
       parseEnvVarIntDefault(NCCL_ASYNC_ERROR_HANDLING, 0));
   desyncDebug_ = parseEnvVarFlag(NCCL_DESYNC_DEBUG) ||
       (dist_debug_level_ >= DebugLevel::Detail);
+  avoidRecordStreams_ = parseEnvVarFlag(NCCL_AVOID_RECORD_STREAMS);
 
   if (blockingWait_) {
     if (asyncErrorHandling_ != NoHandling || desyncDebug_) {
@@ -1124,7 +1131,10 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID(
           "', but store->get('",
           storeKey,
           "') got error: ");
-      TORCH_CHECK(false, exceptionMsg + e.what());
+      TORCH_CHECK(
+          false,
+          exceptionMsg + e.what() +
+              ". This may indicate a possible application crash on rank 0 or a network set up issue.");
     } catch (...) {
       TORCH_CHECK(
           false,
@@ -1134,7 +1144,8 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID(
               "] is setting up NCCL communicator and "
               "retrieving ncclUniqueId from [0] via c10d key-value store by key '",
               storeKey,
-              "'"));
+              "'",
+              ". This may indicate a possible application crash on rank 0 or a network set up issue."));
     }
   }
 }
@@ -1483,7 +1494,7 @@ void ProcessGroupNCCL::workEnqueue(
     // View tensors' destruction invokes autograd_meta, which
     // needs to be destructed in user thread. Otherwise will
     // get deadlock. Here we enqueue work without outputs_.
-    workMetaList_.emplace_back(WorkNCCL(*work));
+    workMetaList_.emplace_back(*work);
   }
 }
 
@@ -1573,6 +1584,11 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
   // Store references to outputs to be used by WorkNCCL::result and operator<<.
   work->outputs_ = std::make_shared<std::vector<at::Tensor>>(outputs);
 
+  if (avoidRecordStreams_) {
+    work->stashed_for_allocator_safety_ =
+        std::make_shared<std::vector<at::Tensor>>(inputs);
+  }
+
   at::cuda::OptionalCUDAGuard gpuGuard;
 
   // Start event should only be recorded before the ncclGroupStart()
@@ -1583,7 +1599,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
     }
   }
 
-  pre(ncclStreams);
+  pre(ncclStreams, work);
 
   {
     torch::cuda::nccl::AutoNcclGroup nccl_group_guard;
@@ -1602,8 +1618,10 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
       // operations where `inputs' and `outputs' are not the same.
       //
       // See [Sync Streams].
-      c10::cuda::CUDACachingAllocator::recordStream(
-          inputs[i].storage().data_ptr(), ncclStream);
+      if (!avoidRecordStreams_) {
+        c10::cuda::CUDACachingAllocator::recordStream(
+            inputs[i].storage().data_ptr(), ncclStream);
+      }
       C10D_NCCL_CHECK(
           fn(inputs[i], outputs[i], ncclComm->getNcclComm(), ncclStream),
           ncclComm->getNcclCommFailureReason());
@@ -1639,6 +1657,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
 
   // Set appropriate work parameters.
   work->blockingWait_ = blockingWait_;
+  work->avoidRecordStreams_ = avoidRecordStreams_;
   work->opTimeout_ = options_->timeout;
   work->store_ = store_;
 
@@ -1658,6 +1677,18 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
     PreProcess pre,
     PostProcess post,
     const char* profilingTitle) {
+  // avoidRecordStreams_ note:
+  // send, recv, and irecv should be ok with avoidRecordStreams,
+  // However, for isend, I don't think the API requires the user
+  // to wait() on the returned handle, so ProcessGroupNCCL can't know
+  // when it's safe to release the input back to the allocator,
+  // and the present call has no way to know it's not an isend.
+  // Therefore, we warn and fall back to the typical recordStream logic:
+  TORCH_WARN_ONCE(
+      !avoidRecordStreams_,
+      "NCCL_AVOID_RECORD_STREAMS=1 has no effect for point-to-point "
+      "collectives.");
+
   const auto devices = getDeviceList(tensors);
   std::string key;
   int p2pRank = 0, p2pTargetRank = 0;
@@ -1713,7 +1744,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
     }
   }
 
-  pre(ncclStreams_[key]);
+  pre(ncclStreams_[key], work);
 
   for (const auto i : c10::irange(tensors.size())) {
     gpuGuard.set_index(devices[i].index());
@@ -1789,7 +1820,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
       inputs,
       outputs,
       fn,
-      [](std::vector<at::cuda::CUDAStream>&) {},
+      [](std::vector<at::cuda::CUDAStream>&,
+         c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL>& work) {},
       [](std::vector<at::cuda::CUDAStream>&) {},
       opType,
       profilingTitle);
@@ -1807,7 +1839,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
       fn,
       peer,
       opType,
-      [](std::vector<at::cuda::CUDAStream>&) {},
+      [](std::vector<at::cuda::CUDAStream>&,
+         c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL>& work) {},
       [](std::vector<at::cuda::CUDAStream>&) {},
       profilingTitle);
 }
@@ -1860,6 +1893,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce(
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>()); // outSplitSizes
 
+  // avoidRecordStreams_ note: collective() will stash tensors.
   return allreduce_impl(tensors, opts);
 }
 
@@ -1884,6 +1918,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_coalesced(
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>()); // outSplitSizes
 
+  // avoidRecordStreams_ note: collective() will stash tensors.
   return allreduce_impl(tensors, opts);
 }
 
@@ -1909,6 +1944,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::broadcast(
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>()); // outSplitSizes
 
+  // avoidRecordStreams_ note: collective() will stash tensors.
   return collective(
       tensors,
       tensors,
@@ -2009,6 +2045,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce(
       std::vector<int64_t>()); // outSplitSizes
 
   int dev_in_group = 0;
+  // avoidRecordStreams_ note: collective() will stash tensors.
   return collective(
       tensors,
       tensors,
@@ -2134,8 +2171,10 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
             at::Tensor& output,
             ncclComm_t comm,
             at::cuda::CUDAStream& stream) {
-          c10::cuda::CUDACachingAllocator::recordStream(
-              output.storage().data_ptr(), stream);
+          if (!avoidRecordStreams_) {
+            c10::cuda::CUDACachingAllocator::recordStream(
+                output.storage().data_ptr(), stream);
+          }
           return ncclAllGather(
               input.data_ptr(),
               output.data_ptr(),
@@ -2144,16 +2183,29 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
               comm,
               stream.stream());
         },
-        [&](std::vector<at::cuda::CUDAStream>& ncclStreams) {},
+        [](std::vector<at::cuda::CUDAStream>& ncclStreams,
+           c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL>& work) {
+          // avoidRecordStreams_ note: We actually don't need to stash anything
+          // here.
+          //  - inputTensors is stashed onto work->stashed_for_allocator_safety_
+          //    in collective().
+          //  - outputFlattened is stashed onto work->outputs_ in collective().
+          //  - User-facing outputTensors should be held by the user until after
+          //    waiting on work_, or the call makes no sense.
+          // So all participating tensors are accounted for, and won't be
+          // released back to their allocation streams until after work_ is
+          // waited on.
+        },
         [&](std::vector<at::cuda::CUDAStream>& ncclStreams) {
           // Copy the flattened output tensors to the outputs.
           for (const auto i : c10::irange(outputTensors.size())) {
             at::cuda::CUDAStreamGuard guard(ncclStreams[i]);
             for (const auto j : c10::irange(outputTensors[0].size())) {
               // See [Sync Streams].
-              c10::cuda::CUDACachingAllocator::recordStream(
-                  outputTensors[i][j].storage().data_ptr(), ncclStreams[i]);
-
+              if (!avoidRecordStreams_) {
+                c10::cuda::CUDACachingAllocator::recordStream(
+                    outputTensors[i][j].storage().data_ptr(), ncclStreams[i]);
+              }
               outputTensors[i][j].copy_(outputFlattened[i][j], true);
             }
           }
@@ -2235,8 +2287,10 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
             at::Tensor& output,
             ncclComm_t comm,
             at::cuda::CUDAStream& stream) {
-          c10::cuda::CUDACachingAllocator::recordStream(
-              output.storage().data_ptr(), stream);
+          if (!avoidRecordStreams_) {
+            c10::cuda::CUDACachingAllocator::recordStream(
+                output.storage().data_ptr(), stream);
+          }
           const auto ncclDataType = getNcclDataType(input.scalar_type());
           const auto ncclReduceOp = getNcclReduceOp(
               opts.reduceOp, input, ncclDataType, comm, dev_in_group++);
@@ -2249,15 +2303,33 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
               comm,
               stream.stream());
         },
-        [&](std::vector<at::cuda::CUDAStream>& ncclStreams) {
+        [&](std::vector<at::cuda::CUDAStream>& ncclStreams,
+            c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL>& work) {
+          if (avoidRecordStreams_) {
+            // We only need to stash inputTensors.
+            //  - inputFlattened is stashed onto
+            //  work->stashed_for_allocator_safety_
+            //    in collective().
+            //  - User-facing outputTensors is stashed onto work->outputs_ in
+            //  collective(),
+            //    and should also be held by the user until after waiting on
+            //    work_.
+            auto& v = work->stashed_for_allocator_safety_;
+            for (const auto i : c10::irange(inputTensors.size())) {
+              v->insert(
+                  v->end(), inputTensors[i].begin(), inputTensors[i].end());
+            }
+          }
+
           // Copy the input tensors to the flattened inputs.
           for (const auto i : c10::irange(inputTensors.size())) {
             at::cuda::CUDAStreamGuard guard(ncclStreams[i]);
             for (const auto j : c10::irange(inputTensors[0].size())) {
               // See [Sync Streams].
-              c10::cuda::CUDACachingAllocator::recordStream(
-                  inputTensors[i][j].storage().data_ptr(), ncclStreams[i]);
-
+              if (!avoidRecordStreams_) {
+                c10::cuda::CUDACachingAllocator::recordStream(
+                    inputTensors[i][j].storage().data_ptr(), ncclStreams[i]);
+              }
               inputFlattened[i][j].copy_(inputTensors[i][j], true);
             }
           }
@@ -2331,6 +2403,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_reduce_scatter_base(
   auto outputs = std::vector<at::Tensor>{outputTensor};
 
   int dev_in_group = 0;
+  // avoidRecordStreams_ note: collective() will stash inputs and outputs.
   return collective(
       inputs,
       outputs,
@@ -2338,8 +2411,10 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_reduce_scatter_base(
           at::Tensor& output,
           ncclComm_t comm,
           at::cuda::CUDAStream& stream) {
-        c10::cuda::CUDACachingAllocator::recordStream(
-            output.storage().data_ptr(), stream);
+        if (!avoidRecordStreams_) {
+          c10::cuda::CUDACachingAllocator::recordStream(
+              output.storage().data_ptr(), stream);
+        }
         auto ncclDataType = getNcclDataType(input.scalar_type());
         auto ncclReduceOp = getNcclReduceOp(
             opts.reduceOp, input, ncclDataType, comm, dev_in_group++);
@@ -2352,8 +2427,9 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_reduce_scatter_base(
             comm,
             stream.stream());
       },
-      [&](std::vector<at::cuda::CUDAStream>&) {},
-      [&](std::vector<at::cuda::CUDAStream>&) {},
+      [](std::vector<at::cuda::CUDAStream>&,
+         c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL>& work) {},
+      [](std::vector<at::cuda::CUDAStream>&) {},
       OpType::_REDUCE_SCATTER_BASE,
       "nccl:_reduce_scatter_base");
 }
@@ -2451,6 +2527,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::alltoall_base(
         std::vector<int64_t>(), // inSplitSizes
         std::vector<int64_t>()); // outSplitSizes
 
+    // avoidRecordStreams_ note: collective() will stash inputTensors and
+    // outputTensors.
     return collective(
         inputTensors,
         outputTensors,
@@ -2459,8 +2537,10 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::alltoall_base(
             ncclComm_t comm,
             at::cuda::CUDAStream& stream) {
           // See [Sync Streams].
-          c10::cuda::CUDACachingAllocator::recordStream(
-              output.storage().data_ptr(), stream);
+          if (!avoidRecordStreams_) {
+            c10::cuda::CUDACachingAllocator::recordStream(
+                output.storage().data_ptr(), stream);
+          }
           torch::cuda::nccl::all2all_single_equal_split(
               input, output, this->getSize(), comm, stream);
           return ncclSuccess;
@@ -2488,6 +2568,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::alltoall_base(
         inputSplitSizes, // inSplitSizes
         outputSplitSizes); // outSplitSizes
 
+    // avoidRecordStreams_ note: collective() will stash inputTensors and
+    // outputTensors.
     return collective(
         inputTensors,
         outputTensors,
@@ -2504,8 +2586,10 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::alltoall_base(
           c10d::computeLengthsAndOffsets(
               outputSplitSizes, output, &recv_lengths, &recv_offsets);
           // See [Sync Streams].
-          c10::cuda::CUDACachingAllocator::recordStream(
-              output.storage().data_ptr(), stream);
+          if (!avoidRecordStreams_) {
+            c10::cuda::CUDACachingAllocator::recordStream(
+                output.storage().data_ptr(), stream);
+          }
           torch::cuda::nccl::all2all_single_unequal_split(
               input.data_ptr(),
               send_lengths.data(),
@@ -2549,6 +2633,17 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::alltoall(
         torch::cuda::nccl::all2all(outputTensors, inputTensors, comm, stream);
         return ncclSuccess;
       },
+      [&](std::vector<at::cuda::CUDAStream>&,
+          c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL>& work) {
+        if (avoidRecordStreams_) {
+          // inputTensor0 and outputTensor0 are stashed redundantly by
+          // collective(), but that's ok.
+          auto& v = work->stashed_for_allocator_safety_;
+          v->insert(v->end(), inputTensors.begin(), inputTensors.end());
+          v->insert(v->end(), outputTensors.begin(), outputTensors.end());
+        }
+      },
+      [](std::vector<at::cuda::CUDAStream>&) {},
       OpType::ALLTOALL);
 }
 
@@ -2705,6 +2800,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::gather(
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>()); // outSplitSize
 
+  // avoidRecordStreams_ note: collective() will stash inputTensors and
+  // outputs, which == outputTensors[0] on the root rank where it matters.
   return collective(
       inputTensors,
       outputs,
@@ -2714,9 +2811,11 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::gather(
           at::cuda::CUDAStream& stream) {
         const auto root = opts.rootRank;
         if (getRank() == root) {
-          for (auto output : outputs) {
-            c10::cuda::CUDACachingAllocator::recordStream(
-                output.storage().data_ptr(), stream);
+          if (!avoidRecordStreams_) {
+            for (auto output : outputs) {
+              c10::cuda::CUDACachingAllocator::recordStream(
+                  output.storage().data_ptr(), stream);
+            }
           }
         }
         torch::cuda::nccl::gather(inputTensors[0], outputs, comm, stream, root);
@@ -2787,6 +2886,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::scatter(
       std::vector<int64_t>(), // inSplitSizes
       std::vector<int64_t>()); // outSplitSize
 
+  // avoidRecordStreams_ note: collective() will stash outputTensors and
+  // inputs, which == inputTensors[0] on the root rank where it matters.
   return collective(
       outputTensors,
       inputs,
@@ -2796,9 +2897,11 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::scatter(
           at::cuda::CUDAStream& stream) {
         const auto root = opts.rootRank;
         if (getRank() == root) {
-          for (auto input : inputs) {
-            c10::cuda::CUDACachingAllocator::recordStream(
-                input.storage().data_ptr(), stream);
+          if (!avoidRecordStreams_) {
+            for (auto input : inputs) {
+              c10::cuda::CUDACachingAllocator::recordStream(
+                  input.storage().data_ptr(), stream);
+            }
           }
         }
         torch::cuda::nccl::scatter(
@@ -2836,6 +2939,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_allgather_base(
   auto inputs = std::vector<at::Tensor>{input_tensor};
   auto outputs = std::vector<at::Tensor>{output_tensor};
 
+  // avoidRecordStreams_ note: collective() will stash inputs and outputs.
   return collective(
       inputs,
       outputs,
@@ -2843,8 +2947,10 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_allgather_base(
           at::Tensor& output,
           ncclComm_t comm,
           at::cuda::CUDAStream& stream) {
-        c10::cuda::CUDACachingAllocator::recordStream(
-            output.storage().data_ptr(), stream);
+        if (!avoidRecordStreams_) {
+          c10::cuda::CUDACachingAllocator::recordStream(
+              output.storage().data_ptr(), stream);
+        }
         return ncclAllGather(
             input.data_ptr(),
             output.data_ptr(),
@@ -2853,7 +2959,8 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_allgather_base(
             comm,
             stream.stream());
       },
-      [&](std::vector<at::cuda::CUDAStream>&) {},
+      [&](std::vector<at::cuda::CUDAStream>&,
+          c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL>& work) {},
       [&](std::vector<at::cuda::CUDAStream>&) {},
       OpType::_ALLGATHER_BASE,
       "nccl:_all_gather_base");
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
index f4068d81c0f1..e9a0e5585832 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -48,6 +48,14 @@ constexpr const char* NCCL_BACKEND_NAME = "nccl";
 // Soft mode: just clean up collectives and abort communicators without tearing down process
 enum ErrorHandlingMode { NoHandling = 0, TearDown = 1, CleanUpOnly = 2 };
 
+// If set, ProcessGroupNCCL doesn't use recordStream calls to ensure
+// caching allocator safety for tensors used on both user-facing and
+// internal comm streams.
+// Instead, it stashes live references to those tensors until after
+// user-facing streams are synced with comm streams.
+// See stashed_for_allocator_safety_ below.
+constexpr const char* NCCL_AVOID_RECORD_STREAMS = "NCCL_AVOID_RECORD_STREAMS";
+
 // ProcessGroupNCCL implements NCCL bindings for c10d.
 //
 // All functions of the class are expected to be called in the same order
@@ -102,7 +110,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
     // destructs outputs_ tensors who are view tensors in autograd graph.
     WorkNCCL(const WorkNCCL& w);
 
-    virtual ~WorkNCCL();
+    ~WorkNCCL() override;
 
     // Checks if the NCCL kernel has started to execute.
     bool isStarted();
@@ -169,6 +177,9 @@ class TORCH_API ProcessGroupNCCL : public Backend {
     // Clone of blockingWait_ from ProcessGroupNCCL.
     bool blockingWait_ = false;
 
+    // Clone of avoidRecordStreams_ from ProcessGroupNCCL.
+    bool avoidRecordStreams_ = false;
+
     // Clone of opTimeout_ from ProcessGroupNCCL.
     std::chrono::milliseconds opTimeout_;
 
@@ -216,6 +227,18 @@ class TORCH_API ProcessGroupNCCL : public Backend {
     // give a more descriptive message when representing the Work as a string.
     std::shared_ptr<std::vector<at::Tensor>> outputs_;
 
+    // NCCL_AVOID_RECORD_STREAMS implementation helper.
+    // Stores references to participating non-output tensors (ie inputs,
+    // flattened intermediates).
+    // We'll clear this list in synchronizeStreams, just after user-facing
+    // stream(s) are synced with the nccl work stream(s).
+    // By keeping these refs (as well as outputs_) alive until after the
+    // collective's work rejoins the user-facing streams, we achieve
+    // caching allocator safety without any recordStream calls.
+    // For in-place collectives, some refs stashed here may alias outputs_,
+    // but that doesn't do any harm.
+    std::shared_ptr<std::vector<at::Tensor>> stashed_for_allocator_safety_;
+
     // The future returned by getFuture.
     c10::intrusive_ptr<at::ivalue::Future> future_;
 
@@ -291,7 +314,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
       c10::intrusive_ptr<Options> options = Options::create())
       : ProcessGroupNCCL(store, rank, size, options) {}
 
-  virtual ~ProcessGroupNCCL();
+  ~ProcessGroupNCCL() override;
 
   c10::intrusive_ptr<Options> getOptions() {
     return options_;
@@ -668,6 +691,9 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   // Whether or not to enable timeout root cause analysis.
   bool desyncDebug_;
 
+  // Whether or not NCCL_AVOID_RECORD_STREAMS was set
+  bool avoidRecordStreams_ = false;
+
   // Set of communicators that this process group has aborted and their
   // ncclUniqueId has been written to the store. We don't need a lock
   // for this map since only the watchdog thread accesses this set. The
diff --git a/torch/csrc/distributed/c10d/ProcessGroupRoundRobin.cpp b/torch/csrc/distributed/c10d/ProcessGroupRoundRobin.cpp
index 801d97bb1ddc..5bf2fba1a380 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupRoundRobin.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupRoundRobin.cpp
@@ -10,7 +10,7 @@ ProcessGroupRoundRobin::ProcessGroupRoundRobin(
   TORCH_WARN(
       "ProcessGroupRoundRobin is deprecated and scheduled to be removed after this current release (1.13). ",
       "Please file an issue on https://github.com/pytorch/pytorch/issues if there are any concerns or issues with this deprecation.");
-  TORCH_CHECK(processGroups_.size() >= 1);
+  TORCH_CHECK(!processGroups_.empty());
   for (const auto& processGroup : processGroups_) {
     TORCH_CHECK(processGroup->getRank() == rank_);
     TORCH_CHECK(processGroup->getSize() == size_);
diff --git a/torch/csrc/distributed/c10d/Store.hpp b/torch/csrc/distributed/c10d/Store.hpp
index 1914fae8a0fb..f0303d0837d9 100644
--- a/torch/csrc/distributed/c10d/Store.hpp
+++ b/torch/csrc/distributed/c10d/Store.hpp
@@ -28,7 +28,7 @@ class TORCH_API Store : public torch::CustomClassHolder {
   explicit Store(const std::chrono::milliseconds& timeout)
       : timeout_(timeout) {}
 
-  virtual ~Store();
+  ~Store() override;
 
   void set(const std::string& key, const std::string& value);
 
diff --git a/torch/csrc/distributed/c10d/TCPStore.cpp b/torch/csrc/distributed/c10d/TCPStore.cpp
index ff16f0710cdd..b925c0a8455f 100644
--- a/torch/csrc/distributed/c10d/TCPStore.cpp
+++ b/torch/csrc/distributed/c10d/TCPStore.cpp
@@ -242,7 +242,7 @@ void TCPStoreMasterDaemon::queryFds(std::vector<struct pollfd>& fds) {
             ++vecIt;
           }
         }
-        if (it->second.size() == 0) {
+        if (it->second.empty()) {
           it = waitingSockets_.erase(it);
         } else {
           ++it;
diff --git a/torch/csrc/distributed/c10d/TCPStore.hpp b/torch/csrc/distributed/c10d/TCPStore.hpp
index 425b7b7c4139..664fac84ca4a 100644
--- a/torch/csrc/distributed/c10d/TCPStore.hpp
+++ b/torch/csrc/distributed/c10d/TCPStore.hpp
@@ -48,7 +48,7 @@ class TORCH_API TCPStore : public Store {
       const std::chrono::milliseconds& timeout = kDefaultTimeout,
       bool waitWorkers = true);
 
-  virtual ~TCPStore();
+  ~TCPStore() override;
 
   void set(const std::string& key, const std::vector<uint8_t>& value) override;
 
diff --git a/torch/csrc/distributed/c10d/Types.hpp b/torch/csrc/distributed/c10d/Types.hpp
index 54b269e68f4a..21da7f2fc4b7 100644
--- a/torch/csrc/distributed/c10d/Types.hpp
+++ b/torch/csrc/distributed/c10d/Types.hpp
@@ -15,7 +15,7 @@ namespace c10d {
 
 // Base class for supplementary data potentially needed by ReduceOps
 struct TORCH_API _SupplementBase : torch::CustomClassHolder {
-  virtual ~_SupplementBase() = default;
+  ~_SupplementBase() override = default;
 };
 
 // Supplementary data specific to NCCL PREMUL_SUM
diff --git a/torch/csrc/distributed/c10d/Utils.hpp b/torch/csrc/distributed/c10d/Utils.hpp
index 636f07649845..0e025b418ca0 100644
--- a/torch/csrc/distributed/c10d/Utils.hpp
+++ b/torch/csrc/distributed/c10d/Utils.hpp
@@ -132,7 +132,7 @@ inline void assertSameSizes(
 
 inline void assertSameSizeAndType(const std::vector<at::Tensor>& tensors) {
   // Ensure we have at least one tensor
-  if (tensors.size() == 0) {
+  if (tensors.empty()) {
     throw std::invalid_argument("argument is empty");
   }
 
@@ -214,7 +214,7 @@ inline void assertLayoutMatch(
 inline void assertNonEmpty(
     std::function<void(const std::string&)> fn,
     const at::ArrayRef<at::Tensor> tensors) {
-  if (tensors.size() == 0) {
+  if (tensors.empty()) {
     fn("requires non-empty tensor list");
   }
 }
@@ -349,7 +349,7 @@ inline at::Tensor flattenDenseTensors(at::TensorList tensors) {
 inline at::Tensor newLikeFlat(
     std::vector<std::vector<at::Tensor>>& tensors,
     size_t deviceIdx) {
-  if (tensors.size() == 0 || tensors[0].size() == 0) {
+  if (tensors.empty() || tensors[0].empty()) {
     TORCH_CHECK(false, "Received an empty list");
   }
   if (deviceIdx >= tensors.size()) {
@@ -372,7 +372,7 @@ inline at::Tensor newLikeFlat(
 }
 
 inline at::Tensor newLikeFlat(std::vector<at::Tensor>& tensors) {
-  if (tensors.size() == 0) {
+  if (tensors.empty()) {
     TORCH_CHECK(false, "Received an empty list");
   }
   auto& t = tensors[0];
@@ -426,7 +426,7 @@ inline void checkSplitSizes(
     const std::vector<int64_t>& split_sizes,
     const at::Tensor& tensor,
     int group_size) {
-  if (split_sizes.size() == 0) {
+  if (split_sizes.empty()) {
     TORCH_CHECK(
         tensor.size(0) % group_size == 0,
         "Tensor's dim 0 does not divide equally across group size");
@@ -454,7 +454,7 @@ size_t computeLengthsAndOffsets(
   size_t split_size = 0;
   size_t offset = 0;
 
-  if (split_sizes.size() == 0) {
+  if (split_sizes.empty()) {
     equal_splits = true;
     split_size = tensor.size(0) / group_size;
   }
diff --git a/torch/csrc/distributed/c10d/Work.hpp b/torch/csrc/distributed/c10d/Work.hpp
index 252fc4205a02..212ed3041457 100644
--- a/torch/csrc/distributed/c10d/Work.hpp
+++ b/torch/csrc/distributed/c10d/Work.hpp
@@ -50,7 +50,7 @@ class TORCH_API Work : public torch::CustomClassHolder {
       const c10::optional<std::vector<at::Tensor>>& inputTensors =
           c10::nullopt);
 
-  virtual ~Work();
+  ~Work() override;
 
   // Checks if request has completed. Non-blocking operation.
   virtual bool isCompleted();
diff --git a/torch/csrc/distributed/c10d/comm.cpp b/torch/csrc/distributed/c10d/comm.cpp
index d011e5543a5d..1d55af715043 100644
--- a/torch/csrc/distributed/c10d/comm.cpp
+++ b/torch/csrc/distributed/c10d/comm.cpp
@@ -4,7 +4,6 @@
 
 #include <ATen/core/functional.h>
 #include <c10/util/irange.h>
-#include <torch/csrc/distributed/c10d/Ops.hpp>
 #include <torch/csrc/distributed/c10d/reducer.hpp>
 #include <torch/csrc/utils/tensor_flatten.h>
 
@@ -21,7 +20,7 @@ class BroadcastWork {
         flat_tensor_({torch::utils::flatten_dense_tensors(bucket_tensors_)}) {
     BroadcastOptions broadcastOptions;
     broadcastOptions.rootRank = root_rank;
-    work_ = ops::broadcast(process_group, flat_tensor_, broadcastOptions);
+    work_ = process_group->broadcast(flat_tensor_, broadcastOptions);
   }
 
   void finish() {
diff --git a/torch/csrc/distributed/c10d/default_comm_hooks.cpp b/torch/csrc/distributed/c10d/default_comm_hooks.cpp
index 9a8b2a5d9532..cd3eec9b23d8 100644
--- a/torch/csrc/distributed/c10d/default_comm_hooks.cpp
+++ b/torch/csrc/distributed/c10d/default_comm_hooks.cpp
@@ -2,7 +2,6 @@
 #include <c10/util/Exception.h>
 #include <torch/csrc/distributed/c10d/default_comm_hooks.hpp>
 
-#include <torch/csrc/distributed/c10d/Ops.hpp>
 #include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
 #include <torch/csrc/distributed/c10d/comm.hpp>
 #include <torch/torch.h>
@@ -14,7 +13,7 @@ c10::intrusive_ptr<c10::ivalue::Future> AllReduceCommHook::runHook(
   std::vector<at::Tensor> tensors = {bucket.getBufferRef()};
   // Apply the division first to avoid overflow, especially for FP16.
   tensors[0] /= state_->getSize();
-  return ops::allreduce(state_, tensors)->getFuture();
+  return state_->allreduce(tensors)->getFuture();
 }
 
 c10::intrusive_ptr<c10::ivalue::Future> FP16CompressCommHook::runHook(
@@ -24,7 +23,7 @@ c10::intrusive_ptr<c10::ivalue::Future> FP16CompressCommHook::runHook(
   compressed_tensor /= state_->getSize();
   std::vector<at::Tensor> tensors = {compressed_tensor};
 
-  auto allreduce_fut = ops::allreduce(state_, tensors)->getFuture();
+  auto allreduce_fut = state_->allreduce(tensors)->getFuture();
   auto decompressed_tensor = bucket.getBufferRef();
   auto decompress = [decompressed_tensor](c10::ivalue::Future& allreduce_fut) {
     auto result = allreduce_fut.value();
@@ -47,7 +46,7 @@ c10::intrusive_ptr<c10::ivalue::Future> FP16CompressCommHook::runHook(
 c10::intrusive_ptr<c10::ivalue::Future> _AllReduceBySumCommHook::runHook(
     GradBucket& bucket) {
   std::vector<at::Tensor> tensors = {bucket.getBufferRef()};
-  return ops::allreduce(state_, tensors)->getFuture();
+  return state_->allreduce(tensors)->getFuture();
 }
 
 } // namespace c10d
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index e90b15b1b079..abc4359e7dda 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -40,7 +40,6 @@
 #include <torch/csrc/distributed/c10d/reducer.hpp>
 
 #include <torch/csrc/Exceptions.h>
-#include <torch/csrc/distributed/c10d/Ops.hpp>
 #include <torch/csrc/distributed/c10d/python_comm_hook.h>
 #include <torch/csrc/jit/python/pybind_utils.h>
 #include <torch/csrc/utils/object_ptr.h>
@@ -1186,15 +1185,10 @@ that adds a prefix to each key inserted to the store.
           .def_property_readonly("options", &::c10d::ProcessGroup::getOptions)
           .def(
               "broadcast",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const std::vector<at::Tensor>& tensors,
-                 const ::c10d::BroadcastOptions& opts) {
-                return ::c10d::ops::broadcast(self, tensors, opts);
-              },
+              &::c10d::ProcessGroup::broadcast,
               py::arg("tensors"),
               py::arg("opts") = ::c10d::BroadcastOptions(),
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "broadcast",
               [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
@@ -1202,23 +1196,18 @@ that adds a prefix to each key inserted to the store.
                  int rootRank) {
                 ::c10d::BroadcastOptions opts;
                 opts.rootRank = rootRank;
-                return ::c10d::ops::broadcast(self, {x}, opts);
+                std::vector<at::Tensor> tensors = {x};
+                return self->broadcast(tensors, opts);
               },
               py::arg("tensor"),
               py::arg("root"),
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "allreduce",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const std::vector<at::Tensor>& tensors,
-                 const ::c10d::AllreduceOptions& opts) {
-                return ::c10d::ops::allreduce(self, tensors, opts);
-              },
+              &::c10d::ProcessGroup::allreduce,
               py::arg("tensors"),
               py::arg("opts") = ::c10d::AllreduceOptions(),
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "allreduce",
               [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
@@ -1226,7 +1215,7 @@ that adds a prefix to each key inserted to the store.
                  ::c10d::ReduceOp op) {
                 ::c10d::AllreduceOptions opts;
                 opts.reduceOp = op;
-                return ::c10d::ops::allreduce(self, xs, opts);
+                return self->allreduce(xs, opts);
               },
               py::arg("tensors"),
               py::arg("op") = ::c10d::ReduceOp::SUM,
@@ -1240,30 +1229,21 @@ that adds a prefix to each key inserted to the store.
                 ::c10d::AllreduceOptions opts;
                 opts.reduceOp = op;
                 std::vector<at::Tensor> xs = {x};
-                return ::c10d::ops::allreduce(self, xs, opts);
+                return self->allreduce(xs, opts);
               },
               py::arg("tensor"),
               py::arg("op") = ::c10d::ReduceOp::SUM,
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "allreduce_coalesced",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const std::vector<at::Tensor>& xs,
-                 ::c10d::AllreduceCoalescedOptions opts) {
-                return ::c10d::ops::allreduce_coalesced(self, xs, opts);
-              },
+              &::c10d::ProcessGroup::allreduce_coalesced,
               py::arg("tensors"),
               py::arg("opts") = ::c10d::AllreduceCoalescedOptions(),
               py::call_guard<py::gil_scoped_release>())
 
           .def(
               "reduce",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const std::vector<at::Tensor>& tensors,
-                 const ::c10d::ReduceOptions& opts) {
-                return ::c10d::ops::reduce(self, tensors, opts);
-              },
+              &::c10d::ProcessGroup::reduce,
               py::arg("tensors"),
               py::arg("opts") = ::c10d::ReduceOptions(),
               py::call_guard<py::gil_scoped_release>())
@@ -1278,41 +1258,19 @@ that adds a prefix to each key inserted to the store.
                 opts.reduceOp = op;
                 opts.rootRank = rootRank;
                 std::vector<at::Tensor> xs = {x};
-                return ::c10d::ops::reduce(self, xs, opts);
+                return self->reduce(xs, opts);
               },
               py::arg("tensor"),
               py::arg("root"),
               py::arg("op") = ::c10d::ReduceOp::SUM,
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "allgather",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const std::vector<std::vector<at::Tensor>>& output_tensors,
-                 const std::vector<at::Tensor>& input_tensor,
-                 const ::c10d::AllgatherOptions& opts) {
-                return ::c10d::ops::allgather(
-                    self, output_tensors, input_tensor, opts);
-              },
+              &::c10d::ProcessGroup::allgather,
               py::arg("output_tensors"),
               py::arg("input_tensors"),
               py::arg("opts") = ::c10d::AllgatherOptions(),
               py::call_guard<py::gil_scoped_release>())
-
-          .def(
-              "_allgather_base",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 at::Tensor& output_tensor,
-                 at::Tensor& input_tensor,
-                 const ::c10d::AllgatherOptions& opts) {
-                return ::c10d::ops::_allgather_base(
-                    self, output_tensor, input_tensor, opts);
-              },
-              py::arg("output"),
-              py::arg("input"),
-              py::arg("opts") = ::c10d::AllgatherOptions(),
-              py::call_guard<py::gil_scoped_release>())
-
           .def(
               "allgather",
               [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
@@ -1320,36 +1278,29 @@ that adds a prefix to each key inserted to the store.
                  at::Tensor& input) {
                 std::vector<std::vector<at::Tensor>> outputs = {output};
                 std::vector<at::Tensor> inputs = {input};
-                return ::c10d::ops::allgather(
-                    self, outputs, inputs, ::c10d::AllgatherOptions());
+                return self->allgather(
+                    outputs, inputs, ::c10d::AllgatherOptions());
               },
               py::arg("output_tensors"),
               py::arg("input_tensor"),
               py::call_guard<py::gil_scoped_release>())
-
+          .def(
+              "_allgather_base",
+              &::c10d::ProcessGroup::_allgather_base,
+              py::arg("output"),
+              py::arg("input"),
+              py::arg("opts") = ::c10d::AllgatherOptions(),
+              py::call_guard<py::gil_scoped_release>())
           .def(
               "allgather_coalesced",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const std::vector<std::vector<at::Tensor>>& output_lists,
-                 const std::vector<at::Tensor>& input_list,
-                 const ::c10d::AllgatherOptions& opts) {
-                return ::c10d::ops::allgather_coalesced(
-                    self, output_lists, input_list, opts);
-              },
+              &::c10d::ProcessGroup::allgather_coalesced,
               py::arg("output_lists"),
               py::arg("input_list"),
               py::arg("opts") = ::c10d::AllgatherOptions(),
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "gather",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const std::vector<std::vector<at::Tensor>>& output_tensors,
-                 const std::vector<at::Tensor>& input_tensors,
-                 const ::c10d::GatherOptions& opts) {
-                return ::c10d::ops::gather(
-                    self, output_tensors, input_tensors, opts);
-              },
+              &::c10d::ProcessGroup::gather,
               py::arg("output_tensors"),
               py::arg("input_tensors"),
               py::arg("opts") = ::c10d::GatherOptions(),
@@ -1365,27 +1316,19 @@ that adds a prefix to each key inserted to the store.
                 opts.rootRank = rootRank;
                 std::vector<std::vector<at::Tensor>> outputs = {output};
                 std::vector<at::Tensor> inputs = {input};
-                return ::c10d::ops::gather(self, outputs, inputs, opts);
+                return self->gather(outputs, inputs, opts);
               },
               py::arg("output_tensors"),
               py::arg("input_tensor"),
               py::arg("root"),
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "scatter",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const std::vector<at::Tensor>& output_tensors,
-                 const std::vector<std::vector<at::Tensor>>& input_tensors,
-                 const ::c10d::ScatterOptions& opts) {
-                return ::c10d::ops::scatter(
-                    self, output_tensors, input_tensors, opts);
-              },
+              &::c10d::ProcessGroup::scatter,
               py::arg("output_tensors"),
               py::arg("input_tensors"),
               py::arg("opts") = ::c10d::ScatterOptions(),
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "scatter",
               [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
@@ -1396,27 +1339,19 @@ that adds a prefix to each key inserted to the store.
                 opts.rootRank = rootRank;
                 std::vector<std::vector<at::Tensor>> inputs = {input};
                 std::vector<at::Tensor> outputs = {output};
-                return ::c10d::ops::scatter(self, outputs, inputs, opts);
+                return self->scatter(outputs, inputs, opts);
               },
               py::arg("output_tensor"),
               py::arg("input_tensors"),
               py::arg("root"),
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "reduce_scatter",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 std::vector<at::Tensor>& output_tensors,
-                 const std::vector<std::vector<at::Tensor>>& input_tensors,
-                 const ::c10d::ReduceScatterOptions& opts) {
-                return ::c10d::ops::reduce_scatter(
-                    self, output_tensors, input_tensors, opts);
-              },
+              &::c10d::ProcessGroup::reduce_scatter,
               py::arg("output_tensors"),
               py::arg("input_tensors"),
               py::arg("opts") = ::c10d::ReduceScatterOptions(),
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "reduce_scatter",
               [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
@@ -1427,43 +1362,22 @@ that adds a prefix to each key inserted to the store.
                 std::vector<std::vector<at::Tensor>> inputs = {input};
                 ::c10d::ReduceScatterOptions opts;
                 opts.reduceOp = op;
-                return ::c10d::ops::reduce_scatter(self, outputs, inputs, opts);
+                return self->reduce_scatter(outputs, inputs, opts);
               },
               py::arg("output"),
               py::arg("input"),
               py::arg("op") = ::c10d::ReduceOp::SUM,
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "_reduce_scatter_base",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 at::Tensor& output_tensor,
-                 at::Tensor& input_tensor,
-                 const ::c10d::ReduceScatterOptions& opts) {
-                return ::c10d::ops::_reduce_scatter_base(
-                    self, output_tensor, input_tensor, opts);
-              },
+              &::c10d::ProcessGroup::_reduce_scatter_base,
               py::arg("outputTensor"),
               py::arg("inputTensor"),
               py::arg("opts") = ::c10d::ReduceScatterOptions(),
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "alltoall_base",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 at::Tensor& output,
-                 at::Tensor& input,
-                 std::vector<int64_t> outputSplitSizes,
-                 std::vector<int64_t> inputSplitSizes,
-                 const ::c10d::AllToAllOptions& opts) {
-                return ::c10d::ops::alltoall_base(
-                    self,
-                    output,
-                    input,
-                    outputSplitSizes,
-                    inputSplitSizes,
-                    opts);
-              },
+              &::c10d::ProcessGroup::alltoall_base,
               py::arg("output"),
               py::arg("input"),
               py::arg("output_split_sizes"),
@@ -1472,74 +1386,32 @@ that adds a prefix to each key inserted to the store.
               py::call_guard<py::gil_scoped_release>())
           .def(
               "alltoall",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const std::vector<at::Tensor>& output_tensors,
-                 const std::vector<at::Tensor>& input_tensors,
-                 const ::c10d::AllToAllOptions& opts) {
-                return ::c10d::ops::alltoall(
-                    self, output_tensors, input_tensors, opts);
-              },
+              &::c10d::ProcessGroup::alltoall,
               py::arg("output_tensors"),
               py::arg("input_tensors"),
               py::arg("opts") = ::c10d::AllToAllOptions(),
               py::call_guard<py::gil_scoped_release>())
-
-          .def(
-              "alltoall",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const std::vector<at::Tensor>& output_tensors,
-                 const std::vector<at::Tensor>& input_tensors) {
-                return ::c10d::ops::alltoall(
-                    self,
-                    output_tensors,
-                    input_tensors,
-                    ::c10d::AllToAllOptions());
-              },
-              py::arg("output_tensors"),
-              py::arg("input_tensors"),
-              py::call_guard<py::gil_scoped_release>())
-
           .def(
               "send",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const std::vector<at::Tensor>& tensors,
-                 int64_t dstRank,
-                 int64_t tag) {
-                return ::c10d::ops::send(self, tensors, dstRank, tag);
-              },
+              &::c10d::ProcessGroup::send,
               py::arg("tensors"),
               py::arg("dstRank"),
               py::arg("tag"),
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "recv",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const std::vector<at::Tensor>& tensors,
-                 int64_t srcRank,
-                 int64_t tag) {
-                return ::c10d::ops::recv(self, tensors, srcRank, tag);
-              },
+              &::c10d::ProcessGroup::recv,
               py::arg("tensors"),
               py::arg("srcRank"),
               py::arg("tag"),
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "recv_anysource",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const std::vector<at::Tensor>& tensors,
-                 int64_t tag) {
-                return ::c10d::ops::recv_any_source(self, tensors, tag);
-              },
+              &::c10d::ProcessGroup::recvAnysource,
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "barrier",
-              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
-                 const ::c10d::BarrierOptions& opts) {
-                return ::c10d::ops::barrier(self, opts);
-              },
+              &::c10d::ProcessGroup::barrier,
               py::arg("opts") = ::c10d::BarrierOptions(),
               py::call_guard<py::gil_scoped_release>())
           .def(
@@ -1557,7 +1429,7 @@ that adds a prefix to each key inserted to the store.
                  bool waitAllRanks) {
                 ::c10d::BarrierOptions opts;
                 opts.timeout = timeout;
-                return ::c10d::ops::monitored_barrier(self, opts, waitAllRanks);
+                return self->monitoredBarrier(opts, waitAllRanks);
               },
               py::arg("timeout") = ::c10d::kUnsetTimeout,
               py::arg("wait_all_ranks") = false,
@@ -1642,7 +1514,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
       "_round_robin_process_groups",
       [](std::vector<c10::intrusive_ptr<::c10d::ProcessGroup>> processGroups)
           -> c10::intrusive_ptr<::c10d::ProcessGroup> {
-        if (processGroups.size() == 0) {
+        if (processGroups.empty()) {
           throw std::invalid_argument("Specify at least 1 process group");
         }
         const auto& first = processGroups.front();
diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp
index ca3919eb034b..29850fb22397 100644
--- a/torch/csrc/distributed/c10d/logger.cpp
+++ b/torch/csrc/distributed/c10d/logger.cpp
@@ -37,7 +37,7 @@ std::ostream& operator<<(std::ostream& output, const Logger& logger) {
       ddp_logging_data.ints_map["avg_backward_comm_time"],
       ddp_logging_data.ints_map["avg_backward_compute_comm_overlap_time"]);
 
-  if (ddp_logging_data.strs_map["comm_hook"] != "") {
+  if (!ddp_logging_data.strs_map["comm_hook"].empty()) {
     loggerInfo += fmt::format(
         "\n Gradient comm. hook: {}", ddp_logging_data.strs_map["comm_hook"]);
   }
@@ -274,7 +274,7 @@ void Logger::set_runtime_stats_and_log() {
   // If unused_parameters_ is not empty, calculate its sizes.
   // unused_parameters_ is calculated in forward call of
   // each iteration.
-  if (reducer_->unused_parameters_.size() == 0 &&
+  if (reducer_->unused_parameters_.empty() &&
       reducer_->find_unused_parameters_) {
     // No unused params in this iteration
     ddp_logging_data_->ints_map["unused_parameter_size"] = 0;
@@ -320,7 +320,9 @@ void Logger::set_runtime_stats_and_log() {
         "Cuda time stats are not collected for multi-device modules.");
     return;
   }
-  if (!reducer_->params_[0].is_cuda() && !reducer_->params_[0].is_cpu()) {
+
+  if (!reducer_->timer_ &&
+      (!reducer_->params_[0].is_cuda() && !reducer_->params_[0].is_cpu())) {
     TORCH_WARN_ONCE(
         "Time stats are currently only collected for CPU and CUDA devices. "
         "Please refer to CpuTimer or CudaTimer for how to register timer "
diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp
index 762005c62dde..df11c6444f3c 100644
--- a/torch/csrc/distributed/c10d/reducer.cpp
+++ b/torch/csrc/distributed/c10d/reducer.cpp
@@ -17,7 +17,6 @@
 #include <torch/csrc/autograd/profiler.h>
 #include <torch/csrc/autograd/utils/grad_layout_contract.h>
 #include <torch/csrc/autograd/utils/lambda_post_hook.h>
-#include <torch/csrc/distributed/c10d/Ops.hpp>
 #include <torch/csrc/distributed/c10d/comm.hpp>
 #include <torch/csrc/distributed/c10d/logger.hpp>
 #include <torch/csrc/utils/memory.h>
@@ -119,8 +118,7 @@ Reducer::Reducer(
       param_names_(std::move(param_names)),
       first_bucket_bytes_cap_(first_bucket_bytes_cap) {
   C10_LOG_API_USAGE_ONCE("torch.distributed.ddp.reducer");
-  TORCH_INTERNAL_ASSERT(
-      params_.size() >= 1, "Expected at least one parameter.");
+  TORCH_INTERNAL_ASSERT(!params_.empty(), "Expected at least one parameter.");
 
   if (ddp_debug_level_ != c10d::DebugLevel::Off) {
     LOG(INFO) << "Reducer initialized with bucket_bytes_cap: "
@@ -515,7 +513,7 @@ void Reducer::set_divide_factor() {
       auto results = extractTensors(workHandle->getFuture()->value());
 
       // Guard against the results being empty
-      TORCH_INTERNAL_ASSERT(results.size() > 0);
+      TORCH_INTERNAL_ASSERT(!results.empty());
       at::Tensor& res = results.front();
       div_factor_ = res.item().to<int>();
     }
@@ -574,7 +572,7 @@ void Reducer::delay_all_reduce() {
     }
 
     // Each rank prints out all the unused parameters detected
-    if (unused_parameters_.size() > 0) {
+    if (!unused_parameters_.empty()) {
       LOG(INFO) << "[Rank " << process_group_->getRank() << "]: "
                 << "Parameter(s) (in the format of {param_name, index}): "
                 << unused_params_stream.str()
@@ -728,8 +726,7 @@ void Reducer::all_reduce_local_used_map() {
     local_used_map_dev_.copy_(local_used_map_, true);
   }
   std::vector<at::Tensor> temp_local_used_map_dev_vec_ = {local_used_map_dev_};
-  local_used_work_ =
-      ops::allreduce(process_group_, temp_local_used_map_dev_vec_);
+  local_used_work_ = process_group_->allreduce(temp_local_used_map_dev_vec_);
 }
 
 at::Tensor& Reducer::get_param_from_index(size_t index) {
@@ -1016,7 +1013,7 @@ void Reducer::initialize_buckets(
     // TODO(@pietern): Validate indices.
     // Must be non-empty, unique, and unique across buckets.
     REDUCER_CHECK(
-        bucket_indices[bucket_index].size() > 0,
+        !bucket_indices[bucket_index].empty(),
         logger_,
         "Empty bucket specified.");
 
@@ -1637,7 +1634,7 @@ void Reducer::sync_bucket_indices(
   auto indices_tensor_device = at::empty({total_size + 1}, options);
   indices_tensor_device.copy_(indices_tensor, /*non_blocking=*/true);
   std::vector<at::Tensor> indices_tensor_list = {indices_tensor_device};
-  ops::broadcast(process_group_, indices_tensor_list)->wait();
+  process_group_->broadcast(indices_tensor_list)->wait();
   indices_tensor.copy_(indices_tensor_list.front(), /*non_blocking=*/false);
 
   // Update num_buckets after receiving it from rank 0
@@ -1656,7 +1653,7 @@ void Reducer::sync_bucket_indices(
   bucket_sizes_tensor_device.copy_(bucket_sizes_tensor, /*non_blocking=*/true);
   std::vector<at::Tensor> bucket_sizes_tensor_list = {
       bucket_sizes_tensor_device};
-  ops::broadcast(process_group_, bucket_sizes_tensor_list)->wait();
+  process_group_->broadcast(bucket_sizes_tensor_list)->wait();
   bucket_sizes_tensor.copy_(
       bucket_sizes_tensor_list.front(), /*non_blocking=*/false);
 
@@ -1804,9 +1801,7 @@ void Reducer::ensure_prior_reduction_finished() {
     auto unmarked_param_indices = getUnmarkedParamIndicesForIteration();
     // We should have some unmarked parameter indices, otherwise we would not
     // have run into this error branch.
-    TORCH_INTERNAL_ASSERT(unmarked_param_indices.size() > 0);
-    const std::string unmarkedParamIndices =
-        c10::Join(", ", unmarked_param_indices);
+    TORCH_INTERNAL_ASSERT(!unmarked_param_indices.empty());
 
     std::string kBaseErrorMsg =
         "Expected to have finished reduction in the prior iteration before "
@@ -1872,7 +1867,7 @@ void Reducer::ensure_prior_reduction_finished() {
     } else {
       // Retrieve set of parameter names that did not receive gradient.
       auto unmarkedParams = getUnmarkedParamsForIteration();
-      TORCH_INTERNAL_ASSERT(unmarkedParams.size() > 0);
+      TORCH_INTERNAL_ASSERT(!unmarkedParams.empty());
       for (const auto& s : unmarkedParams) {
         LOG(INFO) << "[Rank " << process_group_->getRank() << "] "
                   << "Parameter: " << s
@@ -1988,7 +1983,7 @@ compute_bucket_assignment_by_size(
   TORCH_INTERNAL_ASSERT(
       expect_sparse_gradient.empty() ||
       (tensors.size() == expect_sparse_gradient.size()));
-  TORCH_INTERNAL_ASSERT(tensors.size() > 0);
+  TORCH_INTERNAL_ASSERT(!tensors.empty());
   // Store bucket indices and their sizes together, because we later sort the
   // resulting indices by minimum tensor index and want to keep sizes
   // consistent.
@@ -2130,8 +2125,7 @@ void verify_params_across_processes(
   }
 
   std::vector<at::Tensor> param_size_vec{param_size_tensor};
-  ops::allgather(process_group, param_size_output_tensors, param_size_vec)
-      ->wait();
+  process_group->allgather(param_size_output_tensors, param_size_vec)->wait();
   auto result_size_tensors = param_size_output_tensors.front();
   for (size_t i = 0; i < world_size; ++i) {
     auto param_size_for_rank = result_size_tensors[i][0].item<int>();
@@ -2173,7 +2167,7 @@ void verify_params_across_processes(
 
   auto metadata_dev = metadata.clone().to(params[0].device());
   std::vector<at::Tensor> vec{metadata_dev};
-  ops::broadcast(process_group, vec)->wait();
+  process_group->broadcast(vec)->wait();
 
   // Technically, process 0 doesn't need to double-check metadata, because it
   // was the source.  But no harm keeping work aligned.
diff --git a/torch/csrc/distributed/c10d/reducer_timer.hpp b/torch/csrc/distributed/c10d/reducer_timer.hpp
index ba696383b88e..fe7e77edd88d 100644
--- a/torch/csrc/distributed/c10d/reducer_timer.hpp
+++ b/torch/csrc/distributed/c10d/reducer_timer.hpp
@@ -71,5 +71,5 @@ class TORCH_API Timer {
   }
 };
 
-C10_DECLARE_TYPED_REGISTRY(TimerRegistry, c10::DeviceType, Timer, std::unique_ptr, c10::Device);
+TORCH_DECLARE_TYPED_REGISTRY(TimerRegistry, c10::DeviceType, Timer, std::unique_ptr, c10::Device);
 } // namespace c10d
diff --git a/torch/csrc/distributed/rpc/agent_utils.cpp b/torch/csrc/distributed/rpc/agent_utils.cpp
index dae9c162fe9d..72eaebce5e43 100644
--- a/torch/csrc/distributed/rpc/agent_utils.cpp
+++ b/torch/csrc/distributed/rpc/agent_utils.cpp
@@ -176,7 +176,7 @@ int syncCallCount(
   std::tie(processCountKey, activeCallCountKey, readyKey) = getNextKeyIds();
 
   // Add to keys which will record the number of processes and active calls
-  int totalCallCount = store.add(activeCallCountKey, activeCalls);
+  store.add(activeCallCountKey, activeCalls);
   int totalProcessCount = store.add(processCountKey, 1);
 
   // The last worker will need to set the ready key
@@ -189,7 +189,7 @@ int syncCallCount(
 
   // Read count of active calls which may have changed
   auto activeCallCountData = store.get(activeCallCountKey);
-  totalCallCount = std::stoi(
+  int totalCallCount = std::stoi(
       std::string(activeCallCountData.begin(), activeCallCountData.end()));
   return totalCallCount;
 }
diff --git a/torch/csrc/distributed/rpc/python_call.cpp b/torch/csrc/distributed/rpc/python_call.cpp
index 21a06e34364a..d7e4b25242bb 100644
--- a/torch/csrc/distributed/rpc/python_call.cpp
+++ b/torch/csrc/distributed/rpc/python_call.cpp
@@ -27,7 +27,7 @@ c10::intrusive_ptr<Message> PythonCall::toMessageImpl() && {
 
 std::unique_ptr<PythonCall> PythonCall::fromMessage(const Message& message) {
   TORCH_INTERNAL_ASSERT(
-      message.payload().size() >= 1,
+      !message.payload().empty(),
       "Failed to convert an RPC message to PythonCall, the payload should at "
       "least contain one byte indicating whether this is an async function, "
       "but got payload of size ",
diff --git a/torch/csrc/distributed/rpc/rref_context.cpp b/torch/csrc/distributed/rpc/rref_context.cpp
index d620fe6b9465..73b66f954541 100644
--- a/torch/csrc/distributed/rpc/rref_context.cpp
+++ b/torch/csrc/distributed/rpc/rref_context.cpp
@@ -20,8 +20,8 @@ void confirmPendingUser(
     auto msgPtr = jitFuture.constValue().toCustomClass<Message>();
     auto msgType = msgPtr->type();
     auto rpc = deserializeResponse(*msgPtr, msgType);
-    auto rr = dynamic_cast<RemoteRet*>(rpc.get());
-    TORCH_INTERNAL_ASSERT(rr->forkId() == expectedForkId);
+    auto& rr = dynamic_cast<RemoteRet&>(*rpc);
+    TORCH_INTERNAL_ASSERT(rr.forkId() == expectedForkId);
   } else {
     // Handle errors, such as timeouts, by invoking the error handler on the
     // rref.
@@ -62,12 +62,12 @@ c10::intrusive_ptr<RRef> finishCreatingOwnerRRef(
     auto msgPtr = jitFuture.constValue().toCustomClass<Message>();
     auto msgType = msgPtr->type();
     auto rpc = deserializeResponse(*msgPtr, msgType);
-    auto rr = dynamic_cast<RemoteRet*>(rpc.get());
+    auto& rr = dynamic_cast<RemoteRet&>(*rpc);
     TORCH_INTERNAL_ASSERT(
-        rr->rrefId() == rr->forkId(),
+        rr.rrefId() == rr.forkId(),
         "Expecting an OwnerRRef as RemoteRet but got a fork.");
     auto& ctx = RRefContext::getInstance();
-    auto deletedRRef = ctx.delForkOfOwner(rr->rrefId(), rr->rrefId());
+    auto deletedRRef = ctx.delForkOfOwner(rr.rrefId(), rr.rrefId());
     return deletedRRef;
   }
 }
@@ -123,7 +123,7 @@ void RRefContext::handleExceptionSilent(const JitFuture& jitFuture) {
 }
 
 RRefContext::RRefContext(std::shared_ptr<RpcAgent> agent)
-    : agent_(std::move(agent)), destroyed_(false) {}
+    : agent_(std::move(agent)) {}
 
 RRefContext::~RRefContext() {
   if (!owners_.empty()) {
@@ -247,7 +247,7 @@ void RRefContext::delAllUsersAndUnforkedOwners(
   {
     std::unique_lock<std::mutex> lock(mutex_);
     bool noPending = deleteAllUsersCV_.wait_for(lock, timeoutMillis, [this]() {
-      return pendingUsers_.size() == 0 && pendingChildren_.size() == 0;
+      return pendingUsers_.empty() && pendingChildren_.empty();
     });
     if (!noPending) {
       LOG(ERROR)
@@ -297,7 +297,7 @@ void RRefContext::delAllUsersAndUnforkedOwners(
   {
     std::unique_lock<std::mutex> lock(mutex_);
     bool noOwner = deleteAllUsersCV_.wait_for(
-        lock, timeoutMillis, [this]() { return owners_.size() == 0; });
+        lock, timeoutMillis, [this]() { return owners_.empty(); });
     if (!noOwner) {
       LOG(ERROR) << "Timed out waiting for pending OwnerRRefs to be deleted.";
     }
diff --git a/torch/csrc/distributed/rpc/rref_context.h b/torch/csrc/distributed/rpc/rref_context.h
index 78f1b3afb731..70a2b31f6897 100644
--- a/torch/csrc/distributed/rpc/rref_context.h
+++ b/torch/csrc/distributed/rpc/rref_context.h
@@ -303,7 +303,7 @@ class TORCH_API RRefContext {
   std::atomic<int64_t> numPendingFutures_{0};
 
   std::mutex destroyedMutex_;
-  bool destroyed_;
+  bool destroyed_{false};
 
   // Thread local states to keep UserRRefs deserialized from user function
   // arguments.
diff --git a/torch/csrc/distributed/rpc/script_resp.cpp b/torch/csrc/distributed/rpc/script_resp.cpp
index dcc253f81689..28ede36ea7bb 100644
--- a/torch/csrc/distributed/rpc/script_resp.cpp
+++ b/torch/csrc/distributed/rpc/script_resp.cpp
@@ -9,13 +9,6 @@ namespace torch {
 namespace distributed {
 namespace rpc {
 
-namespace {
-
-using torch::jit::Pickler;
-using torch::jit::Unpickler;
-
-} // namespace
-
 ScriptResp::ScriptResp(at::IValue&& value) : value_(value) {}
 
 const at::IValue& ScriptResp::value() {
diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.h b/torch/csrc/distributed/rpc/tensorpipe_agent.h
index 92a632802c6e..4b709c7351f1 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_agent.h
+++ b/torch/csrc/distributed/rpc/tensorpipe_agent.h
@@ -97,8 +97,8 @@ struct TORCH_API TensorPipeRpcBackendOptions : public RpcBackendOptions {
         "num_worker_threads must be positive, got ",
         numWorkerThreads);
 
-    if (transports.has_value()) {
-      for (const std::string& transportName : transports.value()) {
+    if (this->transports.has_value()) {
+      for (const std::string& transportName : this->transports.value()) {
         TORCH_CHECK(
             TensorPipeTransportRegistry()->Has(transportName),
             "Unknown transport: ",
@@ -106,8 +106,8 @@ struct TORCH_API TensorPipeRpcBackendOptions : public RpcBackendOptions {
       }
     }
 
-    if (channels.has_value()) {
-      for (const std::string& channelName : channels.value()) {
+    if (this->channels.has_value()) {
+      for (const std::string& channelName : this->channels.value()) {
         TORCH_CHECK(
             TensorPipeChannelRegistry()->Has(channelName),
             "Unknown channel: ",
diff --git a/torch/csrc/distributed/rpc/utils.cpp b/torch/csrc/distributed/rpc/utils.cpp
index 0b76e64e1392..418c7bb5d17a 100644
--- a/torch/csrc/distributed/rpc/utils.cpp
+++ b/torch/csrc/distributed/rpc/utils.cpp
@@ -92,7 +92,7 @@ std::string makeRPCError(
   return fmt::format(
       "{}:{}:{}",
       torch::distributed::rpc::kRPCErrorPrefix,
-      errorType,
+      static_cast<int>(errorType),
       rpcErrorStr);
 }
 
diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c
index 0e39eca6c0fc..2db60ed59c6e 100644
--- a/torch/csrc/dynamo/eval_frame.c
+++ b/torch/csrc/dynamo/eval_frame.c
@@ -587,7 +587,7 @@ inline static PyObject* eval_custom_code(
   }
 
   PyObject* result = eval_frame_default(tstate, shadow, throw_flag);
-  Py_DECREF(shadow);
+  Py_DECREF(shadow_obj);
   return result;
 }
 
diff --git a/torch/csrc/dynamo/guards.cpp b/torch/csrc/dynamo/guards.cpp
index 5ff74bb5ab76..2820b0c2119d 100644
--- a/torch/csrc/dynamo/guards.cpp
+++ b/torch/csrc/dynamo/guards.cpp
@@ -44,6 +44,8 @@ class TensorCheck {
     }
   }
 
+  // See note in guards.py [Note - On Export Tensor Guards]
+  // Logic parallel to here must be maintained in python
   bool check(const LocalState& state, const at::Tensor& v) {
     if (dispatch_key_ != state.apply(v.key_set()).raw_repr() ||
         dtype_ != v.dtype().toScalarType() ||
@@ -319,8 +321,8 @@ static PyTypeObject TensorGuardsType = {
 static PyObject* check_type_id(PyObject* dummy, PyObject* args) {
   // faster `lambda obj, expected: id(type(obj)) == expected`
   PyObject* obj;
-  unsigned long expected;
-  if (!PyArg_ParseTuple(args, "Ok", &obj, &expected)) {
+  unsigned long long expected;
+  if (!PyArg_ParseTuple(args, "OK", &obj, &expected)) {
     return NULL;
   }
   if (Py_TYPE(obj) == (void*)expected) {
@@ -333,8 +335,8 @@ static PyObject* check_type_id(PyObject* dummy, PyObject* args) {
 static PyObject* check_obj_id(PyObject* dummy, PyObject* args) {
   // faster `lambda obj, expected: id(obj) == expected`
   PyObject* obj;
-  unsigned long expected;
-  if (!PyArg_ParseTuple(args, "Ok", &obj, &expected)) {
+  unsigned long long expected;
+  if (!PyArg_ParseTuple(args, "OK", &obj, &expected)) {
     return NULL;
   }
   if (obj == (void*)expected) {
diff --git a/torch/csrc/functorch/init.cpp b/torch/csrc/functorch/init.cpp
index 232b403f6689..a07bf265fac0 100644
--- a/torch/csrc/functorch/init.cpp
+++ b/torch/csrc/functorch/init.cpp
@@ -476,7 +476,7 @@ void initFuncTorchBindings(PyObject* module) {
   });
   m.def("peek_interpreter_stack", []() -> c10::optional<Interpreter> {
     const auto& stack = getDynamicLayerStack();
-    if (stack.size() == 0) {
+    if (stack.empty()) {
       return c10::nullopt;
     }
     auto result = stack.back().interpreter();
diff --git a/torch/csrc/init_flatbuffer_module.cpp b/torch/csrc/init_flatbuffer_module.cpp
index 99e89d2588a3..96e69ea754cc 100644
--- a/torch/csrc/init_flatbuffer_module.cpp
+++ b/torch/csrc/init_flatbuffer_module.cpp
@@ -117,8 +117,8 @@ extern "C"
       "_get_module_info_from_flatbuffer", [](std::string flatbuffer_content) {
         py::gil_scoped_acquire acquire;
         py::dict result;
-        mobile::ModuleInfo minfo = torch::jit::get_module_info_from_flatbuffer(
-            flatbuffer_content.data());
+        mobile::ModuleInfo minfo =
+            torch::jit::get_module_info_from_flatbuffer(&flatbuffer_content[0]);
         result["bytecode_version"] = minfo.bytecode_version;
         result["operator_version"] = minfo.operator_version;
         result["function_names"] = minfo.function_names;
diff --git a/torch/csrc/jit/api/module.h b/torch/csrc/jit/api/module.h
index a6aa49278cbe..1e5c408602a5 100644
--- a/torch/csrc/jit/api/module.h
+++ b/torch/csrc/jit/api/module.h
@@ -271,6 +271,28 @@ struct TORCH_API Module : public Object {
     mem_to_delete_ = delete_mem;
   }
 
+  // A set of functions to maintain input shapes through torch.jit.save and
+  // torch.jit.load. It only works on tensors and lists/dicts of tensors
+  // because tracing is only supported by these types.
+  void store_traced_inputs(std::string func_name, std::vector<IValue> inputs) {
+    if (inputs.size() == 0) {
+      return;
+    }
+    auto c10_inputs = c10::impl::GenericList(AnyType::get());
+    for (const IValue& value : inputs) {
+      // Not checking whether this is traceable type as that is already checked
+      // higher up in the stack and changing that would require a larger
+      // restructuring.
+      c10_inputs.push_back(value);
+    }
+    traced_inputs_.insert_or_assign(func_name, c10_inputs);
+  }
+
+  c10::Dict<std::string, c10::impl::GenericList> retrieve_traced_inputs()
+      const {
+    return traced_inputs_;
+  }
+
  private:
   Module clone_impl(
       std::unordered_map<TypePtr, TypePtr>& type_remap,
@@ -295,6 +317,9 @@ struct TORCH_API Module : public Object {
 
   // Extra handle for the module to delete when itself is deleted
   std::shared_ptr<char> mem_to_delete_;
+
+  // Map of function names to the traced inputs that they have been traced with
+  c10::Dict<std::string, c10::impl::GenericList> traced_inputs_;
 };
 
 // C++ equivalent api of `torch.jit.freeze`. See documentation there for
diff --git a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm
index 9db3509dc1d2..099999fc5ad0 100644
--- a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm
+++ b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm
@@ -5,6 +5,7 @@
 #import <torch/csrc/jit/backends/coreml/objc/PTMCoreMLModelWrapper.h>
 #import <torch/csrc/jit/backends/coreml/objc/PTMCoreMLTensorSpec.h>
 #import <torch/script.h>
+#import <fmt/format.h>
 
 #import <CoreML/CoreML.h>
 
@@ -17,7 +18,7 @@
 // This is a utility macro that can be used to throw an exception when a CoreML
 // API function produces a NSError. The exception will contain a message with
 // useful info extracted from the NSError.
-#define COREML_THROW_IF_ERROR(error, preamble)                                   \
+#define COREML_THROW_IF_ERROR(error, preamble, inputShapesStr)                   \
   do {                                                                           \
     if C10_LIKELY(error) {                                                       \
       throw c10::Error(                                                          \
@@ -28,7 +29,8 @@
               " Localized_description: ", error.localizedDescription.UTF8String, \
               " Domain: ", error.domain.UTF8String,                              \
               " Code: ", error.code,                                             \
-              " User Info: ", error.userInfo.description.UTF8String));           \
+              " User Info: ", error.userInfo.description.UTF8String,             \
+              " Input Shapes: ", inputShapesStr));                               \
     }                                                                            \
   } while (false)
 
@@ -46,6 +48,26 @@
   bool allow_low_precision = true;
 };
 
+std::string tensorListToShapesStr(GenericList tensors) {
+  std::string str("[");
+  for (const auto featureIdx : c10::irange(tensors.size())) {
+    if (featureIdx > 0) {
+      str = fmt::format("{}, ", str);
+    }
+    str = fmt::format("{}[", str);
+    auto shape = tensors.get(featureIdx).toTensor().sizes();
+    for (const auto shapeIdx : c10::irange(shape.size())) {
+      if (shapeIdx > 0) {
+        str = fmt::format("{}, ", str);
+      }
+      str = fmt::format("{}{}", str, shape[shapeIdx]);
+    }
+    str = fmt::format("{}]", str);
+  }
+  str = fmt::format("{}]", str);
+  return str;
+}
+
 bool type_validity(const std::vector<TensorSpec>& specs) {
   for (const TensorSpec& spec : specs) {
     if (spec.dtype != c10::ScalarType::Float) {
@@ -161,18 +183,20 @@ GenericDict compile(IValue processed, GenericDict method_compile_spec) override
   }
 
   GenericList execute(IValue handle, GenericList inputs) override {
-    const auto model_wrapper = c10::static_intrusive_pointer_cast<MLModelWrapper>(handle.toCapsule());
+    @autoreleasepool {
+      const auto model_wrapper = c10::static_intrusive_pointer_cast<MLModelWrapper>(handle.toCapsule());
 
-    PTMCoreMLExecutor *executor = model_wrapper->executor;
-    [executor setInputs:inputs];
+      PTMCoreMLExecutor *executor = model_wrapper->executor;
+      [executor setInputs:inputs];
 
-    NSError *error;
-    id<MLFeatureProvider> outputsProvider = [executor forward:&error];
-    if (!outputsProvider) {
-      COREML_THROW_IF_ERROR(error, "Error running CoreML inference");
-    }
+      NSError *error;
+      id<MLFeatureProvider> outputsProvider = [executor forward:&error];
+      if (!outputsProvider) {
+        COREML_THROW_IF_ERROR(error, "Error running CoreML inference", tensorListToShapesStr(inputs));
+      }
 
-    return pack_outputs(model_wrapper->outputs, outputsProvider);
+      return pack_outputs(model_wrapper->outputs, outputsProvider);
+    }
   }
 
   bool is_available() override {
diff --git a/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp b/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp
index 448d448f1057..f0792acd9627 100644
--- a/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp
+++ b/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp
@@ -60,7 +60,7 @@ c10::IValue preprocess(
       }
     }
   }
-  if (error.size() != 0) {
+  if (!error.empty()) {
     throw std::runtime_error(
         error +
         "\nmethod_compile_spec should contain a Tensor or Tensor List which bundles input parameters:"
diff --git a/torch/csrc/jit/codegen/cuda/interface.cpp b/torch/csrc/jit/codegen/cuda/interface.cpp
index 12126726aa4d..6d4cdc0560d6 100644
--- a/torch/csrc/jit/codegen/cuda/interface.cpp
+++ b/torch/csrc/jit/codegen/cuda/interface.cpp
@@ -1,11 +1,11 @@
 #include <torch/csrc/jit/codegen/cuda/interface.h>
 
+#include <ATen/DynamicLibrary.h>
 #include <ATen/core/dispatch/OperatorOptions.h>
 #include <ATen/native/NonSymbolicBC.h>
 #include <ATen/native/TensorShape.h>
 #include <c10/util/CallOnce.h>
 #include <c10/util/irange.h>
-#include <torch/csrc/jit/codegen/cuda/transform_view.h>
 #include <torch/csrc/jit/runtime/custom_operator.h>
 #include <torch/csrc/jit/runtime/register_ops_utils.h>
 
@@ -26,6 +26,36 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
+class LoadingNvfuserLibrary {
+ public:
+#ifdef USE_CUDA
+  LoadingNvfuserLibrary() {
+    std::string library_name;
+    if (const char* path = std::getenv("TORCH_NVFUSER_LIBRARY_PATH")) {
+      library_name = path;
+    }
+#if defined(_WIN32)
+    library_name += "nvfuser_codegen.dll";
+#elif defined(__APPLE__)
+    library_name += "libnvfuser_codegen.dylib";
+#else
+    library_name += "libnvfuser_codegen.so";
+#endif
+    try {
+      nvfuserLib_ = std::make_shared<at::DynamicLibrary>(library_name.c_str());
+    } catch (const c10::DynamicLibraryError& e) {
+#if defined(BUILD_NVFUSER) || !defined(NODEBUG)
+      TORCH_WARN("Loading nvfuser library failed with: ", e.msg());
+#endif
+    }
+  }
+
+#endif // USE_CUDA
+  std::shared_ptr<at::DynamicLibrary> nvfuserLib_;
+};
+
+static LoadingNvfuserLibrary loading_nvfuser_library_;
+
 static std::atomic<bool> cuda_fusion_guard_mode{true};
 
 // There are 3 sources of information on whether to enable nvfuser:
@@ -42,16 +72,16 @@ class NVFuserEnabler {
   std::mutex mutex_;
 
  public:
-  static bool nvfuserCanBeEnabled() {
+  bool nvfuserCanBeEnabled() {
 #if defined(USE_ROCM) || defined(FBCODE_CAFFE2)
     return false;
 #endif
-    return at::globalContext().hasCUDA() &&
-        NVFuserPassManager::isRegistered() && getExecutorMode();
+    return at::globalContext().hasCUDA() && getExecutorMode() &&
+        loading_nvfuser_library_.nvfuserLib_ != nullptr;
   }
 
  private:
-  static void assertFuserCanBeEnabled(bool is_enabled) {
+  void assertFuserCanBeEnabled(bool is_enabled) {
     if (!is_enabled) {
       return;
     }
@@ -228,705 +258,7 @@ bool skipNode(const std::string& symbol_str, bool flip) {
       getFuserInterface()->fn_skip_n(symbol_str, flip);
 }
 
-AnalyzeViewConstraint getViewConstraint(
-    const std::vector<int64_t>& original_sizes,
-    const std::vector<int64_t>& new_sizes) {
-  if (getFuserInterface()->fn_analyze_view != nullptr) {
-    return getFuserInterface()->fn_analyze_view(original_sizes, new_sizes);
-  }
-  TORCH_INTERNAL_ASSERT(false, "Requires nvFuser which requires CUDA build.");
-}
-
-//! [ Note -- type guard logic in CudaFusionGuard ]
-//!
-//! CudaFusionGuard is used to Guard input tensor to `CudaFusionGroup` so that
-//! we would not feed inputs that violates the graph defined in `GraphCache`.
-//!
-//! see [ Note -- 2 level cache implementation ] for definition of unique
-//! computational graph.
-//! see [ Note -- CudaFusionGuard implementation] for details on how guard works
-//! in profiling executor
-//!
-//! Type guard logic is used to query whether a runtime input `tensor` compiles
-//! with profiled `guard_tensor_type`. `guard_tensor_type` is the observed
-//! tensor type during profiling runs.
-//!
-//! At this moment, we only do single profiling run, so `guard_tensor_type` has
-//! static shape / stride / scalarType. *This might be a little confusing as our
-//! implementation is actually more relaxed.
-//!
-//! Things that we check:
-//!   a. identical rank & scalar type
-//!   b. stride check:
-//!        b.1. identical stride order
-//!        b.2. identical contiguity
-//!             note that contiguity here is used for tensor collapsing. So
-//!             extra attention should be paid to contiguity across size-1
-//!             dimensions.
-//!   c. size check:
-//!        c.1 broadcast check:
-//!        making sure that broadcast semantics are identical. So we want to
-//!        make sure a given dimension either are both size-1 for `tensor` &
-//!        `guard_tensor_type`, or are both non-size-1.
-//!        This is due to the fact that we specialize size-1 dimension as
-//!        broadcasted dimension while translating PyTorch tensor to Fusion IR.
-//!        c.1 size-0 check:
-//!        we don't specialize this on codegen, but we do specialize fusion
-//!        logic for size-0 on reductoins, hence the check
-//!
-bool complyWith(
-    const at::Tensor& tensor,
-    const c10::TensorTypePtr& guard_tensor_type) {
-  // guard broadcast semantics, contiguity & stride order;
-  TORCH_INTERNAL_ASSERT(
-      guard_tensor_type && guard_tensor_type->dim().has_value());
-
-  // check a. if num_dimension check fails or scalar type check fails
-  if (*guard_tensor_type->dim() != static_cast<size_t>(tensor.ndimension()) ||
-      (guard_tensor_type->scalarType().has_value() &&
-       (guard_tensor_type->scalarType().value() != tensor.scalar_type())) ||
-      (guard_tensor_type->device().has_value() &&
-       (guard_tensor_type->device().value() != tensor.device())) ||
-      (guard_tensor_type->requiresGrad().has_value() &&
-       guard_tensor_type->requiresGrad().value() !=
-           (tensor.requires_grad() && at::GradMode::is_enabled()))) {
-    return false;
-  }
-
-  // TODO: should we get symbolic_size instead and check for size
-  // consistency across tensors as well?
-  const auto& sizes = guard_tensor_type->sizes();
-  // see [ Note -- stirde_properties in tensor type ]
-  const auto& stride_properties = guard_tensor_type->stride_properties();
-
-  const auto& t_sizes = tensor.sizes();
-  const auto& t_strides = tensor.strides();
-  int inner_dim = -1;
-  for (const auto j : c10::irange(*guard_tensor_type->dim())) {
-    // check b. for stride check, we go along dimensions from fastest stride to
-    // slowest stride
-    int sorted_index = stride_properties[j]->stride_index_
-        ? static_cast<int>(*stride_properties[j]->stride_index_)
-        : -1;
-
-    // only apply stride check when we have stride_properties
-    if (sorted_index != -1) {
-      // check b.1. stride order [current dimension has stride larger
-      // than its inner dimension(s)], check only applies when both:
-      //     i. already encountered an inner dimension
-      //    ii. not at the fastest dimension
-      if (j != 0 && inner_dim != -1) {
-        // we are not looking at dim-j, but dim-sorted_index, which
-        // is the j-th fastest dim;
-        // Note: we ignore 0-stride dimension, since eager logic on stride
-        // indices is ambiguous
-        if (t_strides[sorted_index] != 0 && t_strides[inner_dim] != 0 &&
-            t_strides[sorted_index] < t_strides[inner_dim]) {
-          return false;
-        }
-      }
-
-      // check b.2. contiguity, we only check when it's marked as
-      // contiguous.
-      if (stride_properties[j]->contiguous_ &&
-          *stride_properties[j]->contiguous_) {
-        if (j != 0) {
-          // we use contiguity to collapse dimension, if size == 1, it is
-          // always collapsible
-          // computeStrideProps also default to contiguous when stride == 1
-          if (t_sizes[sorted_index] != 1 && t_strides[sorted_index] != 1) {
-            TORCH_INTERNAL_ASSERT(
-                stride_properties[j - 1]->stride_index_.has_value(),
-                "Counknown index is meaningless");
-            // TODO: merge this check up
-            if (t_strides[sorted_index] !=
-                t_strides[inner_dim] * t_sizes[inner_dim]) {
-              return false;
-            }
-          }
-        } else {
-          // TODO: merge this check up
-          if (t_strides[sorted_index] != 1) {
-            return false;
-          }
-        }
-      }
-
-      // update inner_dim to be current dim. Note that we try to skip update
-      // when current `t_size[sorted_index] == 1`, because:
-      //   1. stride comparison on a size-1 dimension is meaningless
-      //      [check b.1]
-      //   2. contiguity on a size-1 dimension is misleading. For collapsing,
-      //      we should actually look at the next non-size-1 dimension
-      //      [check b.2]
-      if (inner_dim == -1 || t_sizes[sorted_index] != 1) {
-        inner_dim = sorted_index;
-      }
-    }
-
-    // check c.1, we go along semantic ordered dimensions
-    // check broadcast / size-1:
-    bool guard_bcast = sizes[j].has_value() && sizes[j].value() == 1;
-    if (guard_bcast != (t_sizes[j] == 1)) {
-      return false;
-    }
-
-    // check c.2, check for size-0
-    bool guard_size_0 = sizes[j].has_value() && sizes[j].value() == 0;
-    if (guard_size_0 != (t_sizes[j] == 0)) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
 } // namespace cuda
 } // namespace fuser
-
-namespace {
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators size_eq_guard({
-    Operator(
-        //"prim::CudaFusionSizeEq(int[] size, int[] ref) -> bool",
-        "prim::CudaFusionSizeEq(...) -> bool",
-        // prim::CudaFusionGuard returns a fresh Boolean type without aliasing.
-        // if we would ever return refined tensor, which would change aliasing
-        // analysis, we should update aliasdb pass.
-        [](const Node* node) -> Operation {
-          return [](Stack& stack) {
-            at::ArrayRef<IValue> inputs = last(stack, 2);
-            drop(stack, 2);
-
-            if (!fuser::cuda::getCudaFusionGuardMode()) {
-              push(stack, IValue(true));
-              return;
-            }
-
-            // auto inp = inputs[0].toIntList();
-            TORCH_INTERNAL_ASSERT(
-                inputs[1].isIntList(), "reference needs to be of int list");
-            auto ref = inputs[1].toIntList();
-
-            auto ret = true;
-            if (ref.empty()) {
-              ret = inputs[0].isNone();
-            } else {
-              if (inputs[0].isIntList()) {
-                auto inp = inputs[0].toIntList();
-                if (inp.size() != ref.size()) {
-                  push(stack, IValue(false));
-                  return;
-                }
-
-                for (const auto i : c10::irange(inp.size())) {
-                  if (((inp[i] == 1) != (ref[i] == 1))) {
-                    ret = false;
-                    break;
-                  }
-                }
-              } else {
-                ret = false;
-              }
-            }
-
-            push(stack, IValue(ret));
-            return;
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_fusion({
-    Operator(
-        prim::CudaFusionGroup,
-        [](const Node* node) -> Operation {
-          return [node](Stack& stack) {
-            fuser::cuda::runFusionGroup(node, stack);
-          };
-        },
-        aliasAnalysisSpecialCase()),
-});
-
-RegisterOperators reg_guard({
-    Operator(
-        "prim::CudaFusionGuard(...) -> bool",
-        // prim::CudaFusionGuard returns a fresh Boolean type without aliasing.
-        // if we would ever return refined tensor, which would change aliasing
-        // analysis, we should update aliasdb pass.
-        [](const Node* node) -> Operation {
-          return [node](Stack& stack) {
-            // TODO: check latency here!!!!
-            std::vector<TypePtr> types = node->tys(attr::types);
-            const auto num_inputs = types.size();
-            at::ArrayRef<IValue> inputs = last(stack, num_inputs);
-            drop(stack, num_inputs);
-
-            if (!fuser::cuda::getCudaFusionGuardMode()) {
-              push(stack, IValue(true));
-              return;
-            }
-
-            for (const auto i : c10::irange(num_inputs)) {
-              const c10::TensorTypePtr& guard_tensor_type =
-                  types[i]->cast<TensorType>();
-
-              // TODO: maybe we should just push false and fallback
-              TORCH_INTERNAL_ASSERT(inputs[i].isTensor());
-              const at::Tensor& tensor = inputs[i].toTensor();
-
-              if (!fuser::cuda::complyWith(tensor, guard_tensor_type)) {
-                push(stack, IValue(false));
-                return;
-              }
-            }
-
-            // TODO: check type and return the right flag
-            // naively return true;
-            push(stack, IValue(true));
-            return;
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// Infer dynamic axis (-1) in view_sizes given tensor_sizes
-bool inferViewShape(
-    c10::List<int64_t> tensor_sizes,
-    c10::List<int64_t> view_sizes) {
-  int64_t dynamic_index = -1;
-  size_t view_size_num_elements = 1;
-  for (size_t idx = 0; idx < view_sizes.size(); ++idx) {
-    if (view_sizes[idx] == -1) {
-      TORCH_INTERNAL_ASSERT(
-          dynamic_index == -1, "Only one dimension can by inferred.")
-      dynamic_index = idx;
-    } else {
-      TORCH_INTERNAL_ASSERT(view_sizes[idx] > 0);
-      view_size_num_elements *= view_sizes[idx];
-    }
-  }
-  const size_t kNumElements = std::accumulate(
-      tensor_sizes.begin(), tensor_sizes.end(), 1, std::multiplies<>());
-
-  if (kNumElements % view_size_num_elements != 0) {
-    return false;
-  }
-
-  if (dynamic_index != -1) {
-    view_sizes[dynamic_index] = kNumElements / view_size_num_elements;
-  }
-
-  return true;
-}
-
-//!
-//! CudaFusionViewGuard Example Graph:
-//!
-//! graph(%self : __torch__.BiasViewRelu,
-//!       %inputs.1 : Tensor):
-//!   %2 : int = prim::Constant[value=-1]() # dynamic_bvg.py:50:40
-//!   %3 : int = prim::Constant[value=1]() # dynamic_bvg.py:50:25
-//!   %4 : NoneType = prim::Constant()
-//!   %5 : int[] = prim::Constant[value=[2, 3]]()
-//!   %6 : int[] = aten::size(%inputs.1) # dynamic_bvg.py:50:25
-//!   %7 : int[] = aten::slice(%6, %4, %2, %3) # dynamic_bvg.py:50:25
-//!   %view_shape.1 : int[] = aten::add(%7, %5) # dynamic_bvg.py:50:25
-//!   %bias : Tensor = prim::GetAttr[name="bias"](%self)
-//!   %10 : int[] = aten::size(%bias)
-//!   %11 : int[] = prim::BroadcastSizes(%6, %10)
-//!   %12 : bool = prim::CudaFusionGuard[types=[...]](%inputs.1, %bias)
-//!   %13 : int[] = prim::Constant[value=[-1, -1, -1, 6]]()
-//!   %14 : int[] = prim::Constant[value=[-1, -1, -1, 2, 3]]()
-//!   %15 : bool = prim::CudaFusionViewGuard(%11, %view_shape.1, %13, %14)
-//!   %16 : bool[] = prim::ListConstruct(%15, %12)
-//!   %17 : bool = aten::all(%16)
-//!   %18 : Tensor = prim::If(%17)
-//!     block0():
-//!       %19 : Tensor = prim::CudaFusionGroup_0[cache_id=0](%inputs.1, %bias)
-//!       -> (%19)
-//!     block1():
-//!       %20 : Function = prim::Constant[name="fallback_fn", fallback=1]()
-//!       %21 : (...) = prim::CallFunction(%20, %inputs.1, %bias, %view_shape.1)
-//!       %22 : Float(...) = prim::TupleUnpack(%21)
-//!       -> (%22)
-//!   return (%18)
-//! with prim::CudaFusionGroup_0 = graph(%0 : Float(...),
-//!       %1 : Float(...)):
-//!   %2 : int[] = prim::Constant[value=[2, 3, 4, 2, 3]]()
-//!   %3 : int = prim::Constant[value=1]() # dynamic_bvg.py:50:25
-//!   %o.1 : Float(...) = aten::add(%0, %1, %3) # dynamic_bvg.py:51:16
-//!   %5 : Float(...) = prim::view_copy(%o.1, %2)
-//!   %6 : Float(...) = aten::relu(%5) # dynamic_bvg.py:53:19
-//!   return (%6)
-//!
-RegisterOperators view_guard({
-    Operator(
-        "prim::CudaFusionViewGuard(...) -> bool",
-        // prim::CudaFusionViewGuard returns a fresh Boolean type without
-        // aliasing. if we would ever return refined tensor, which would change
-        // aliasing analysis, we should update aliasdb pass.
-        [](const Node* node) -> Operation {
-          return [](Stack& stack) {
-            // view_sizes_constraint - Constant List[Int]
-            at::ArrayRef<IValue> inputs = last(stack, 3);
-
-            // tensor_sizes is the runtime size for the self tensor
-            // tensor_sizes - dynamic size List[Int]
-            TORCH_INTERNAL_ASSERT(
-                inputs[0].isIntList(), "tensor_sizes needs to be Int List");
-            auto tensor_sizes = inputs[0].toIntList();
-
-            // profiled_view_sizes is the runtime view size
-            // profiled_view_sizes - profile_ivalue List[Int]
-            TORCH_INTERNAL_ASSERT(
-                inputs[1].isIntList(),
-                "profiled_view_sizes needs to be Int list");
-            auto profiled_view_sizes = inputs[1].toIntList();
-
-            // tensor_constraints is a constant List[Int]
-            // used to guard tensor_sizes
-            TORCH_INTERNAL_ASSERT(
-                inputs[2].isIntList(),
-                "tensor constraint needs to be Int List");
-            auto tensor_constraints = inputs[2].toIntList();
-
-            // Drop after gather all input arguments
-            // If an argument is moved, it is destroyed when dropped from stack
-            drop(stack, 3);
-
-            auto status = inferViewShape(tensor_sizes, profiled_view_sizes);
-            if (!status) {
-              push(stack, IValue(false));
-              return;
-            }
-
-            if (!fuser::cuda::getCudaFusionGuardMode()) {
-              push(stack, IValue(true));
-              return;
-            }
-            std::vector<int64_t> tensor_sizes_int_vec = tensor_sizes.vec();
-            std::vector<int64_t> view_sizes_int_vec = tensor_sizes.vec();
-            std::vector<int64_t> previous_constraints =
-                tensor_constraints.vec();
-            auto new_constraints = fuser::cuda::getViewConstraint(
-                tensor_sizes_int_vec, view_sizes_int_vec);
-            bool guard_status =
-                (new_constraints.conglomerateString() == previous_constraints);
-            push(stack, IValue(guard_status));
-            return;
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-RegisterOperators ivalue_guard({
-    Operator(
-        "prim::CudaFusionIvalGuard(...) -> bool",
-        [](const Node* node) -> Operation {
-          return [](Stack& stack) {
-            at::ArrayRef<IValue> inputs = last(stack, 2);
-            drop(stack, 2);
-            if (!fuser::cuda::getCudaFusionGuardMode()) {
-              push(stack, IValue(true));
-              return;
-            }
-            push(stack, inputs[0].equals(inputs[1]));
-            return;
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_add_optional({
-    Operator(
-        "prim::add_optional(Tensor(a) input, Tensor? bias) -> Tensor(a)",
-        [](const Node* node) -> Operation {
-          return [](Stack& stack) {
-            IValue input, bias;
-            pop(stack, input, bias);
-            if (bias.isNone()) {
-              push(stack, std::move(input));
-            } else {
-              push(stack, at::add(input.toTensor(), bias.toTensor(), 1.0));
-            }
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_permute_copy({
-    Operator(
-        "prim::permute_copy(Tensor(a) self, int[] dims) -> Tensor",
-        [](const Node* node) -> Operation {
-          return [node](Stack& stack) {
-            TORCH_CHECK(
-                node->s(attr::name) == "CudaFusionGroup",
-                "permute_copy is only used by nvfuser to identify non-mutating ",
-                "alias ops, should be restored after fusion pass!");
-            IValue self, dims;
-            pop(stack, self, dims);
-            push(stack, at::native::view(self.toTensor(), dims.toIntVector()));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_transpose_copy({
-    Operator(
-        "prim::transpose_copy.int(Tensor(a) self, int dim0, int dim1) -> Tensor",
-        [](const Node* node) -> Operation {
-          return [node](Stack& stack) {
-            TORCH_CHECK(
-                node->s(attr::name) == "CudaFusionGroup",
-                "transpose_copy is only used by nvfuser to identify non-mutating ",
-                "alias ops, should be restored after fusion pass!");
-            IValue self, dim0, dim1;
-            pop(stack, self, dim0, dim1);
-            push(
-                stack,
-                at::transpose(self.toTensor(), dim0.toInt(), dim1.toInt()));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_t_copy({
-    Operator(
-        "prim::t_copy(Tensor(a) self) -> Tensor",
-        [](const Node* node) -> Operation {
-          return [node](Stack& stack) {
-            TORCH_CHECK(
-                node->s(attr::name) == "CudaFusionGroup",
-                "t_copy is only used by nvfuser to identify non-mutating ",
-                "alias ops, should be restored after fusion pass!");
-            IValue self;
-            pop(stack, self);
-            push(stack, at::t(self.toTensor()));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_view_copy({
-    Operator(
-        "prim::view_copy(Tensor self, int[] size) -> Tensor",
-        [](const Node* node) -> Operation {
-          return [node](Stack& stack) {
-            TORCH_CHECK(
-                node->s(attr::name) == "CudaFusionGroup",
-                "view_copy is only used by nvfuser to identify non-mutating ",
-                "alias ops, should be restored after fusion pass!");
-            IValue self, size;
-            pop(stack, self, size);
-            push(stack, at::native::view(self.toTensor(), size.toIntVector()));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_flatten_copy({
-    Operator(
-        "prim::flatten_copy(Tensor self, int start_dim, int end_dim) -> Tensor",
-        [](const Node* node) -> Operation {
-          return [node](Stack& stack) {
-            TORCH_CHECK(
-                node->s(attr::name) == "CudaFusionGroup",
-                "flatten_copy is only used by nvfuser to identify non-mutating ",
-                "alias ops, should be restored after fusion pass!");
-            IValue self, start_dim, end_dim;
-            pop(stack, self, start_dim, end_dim);
-            push(
-                stack,
-                at::native::flatten(
-                    self.toTensor(), start_dim.toInt(), end_dim.toInt()));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_reshape_copy({
-    Operator(
-        "prim::reshape_copy(Tensor self, int[] shape) -> Tensor",
-        [](const Node* node) -> Operation {
-          return [node](Stack& stack) {
-            TORCH_CHECK(
-                node->s(attr::name) == "CudaFusionGroup",
-                "reshape_copy is only used by nvfuser to identify non-mutating ",
-                "alias ops, should be restored after fusion pass!");
-            IValue self, shape;
-            pop(stack, self, shape);
-            push(
-                stack,
-                at::native::reshape(self.toTensor(), shape.toIntVector()));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_squeeze_copy({
-    Operator(
-        "prim::squeeze_copy(Tensor self) -> Tensor",
-        [](const Node* node) -> Operation {
-          return [node](Stack& stack) {
-            TORCH_CHECK(
-                node->s(attr::name) == "CudaFusionGroup",
-                "squeeze_copy is only used by nvfuser to identify non-mutating ",
-                "alias ops, should be restored after fusion pass!");
-            IValue self;
-            pop(stack, self);
-            push(stack, at::squeeze(self.toTensor()));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_squeeze_dim_copy({
-    Operator(
-        "prim::squeeze_copy.dim(Tensor self, int dim) -> Tensor",
-        [](const Node* node) -> Operation {
-          return [node](Stack& stack) {
-            TORCH_CHECK(
-                node->s(attr::name) == "CudaFusionGroup",
-                "squeeze_dim_copy is only used by nvfuser to identify non-mutating ",
-                "alias ops, should be restored after fusion pass!");
-            IValue self, dim;
-            pop(stack, self, dim);
-            push(stack, at::squeeze(self.toTensor(), dim.toInt()));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_unsqueeze_copy({
-    Operator(
-        "prim::unsqueeze_copy(Tensor self, int dim) -> Tensor",
-        [](const Node* node) -> Operation {
-          return [node](Stack& stack) {
-            TORCH_CHECK(
-                node->s(attr::name) == "CudaFusionGroup",
-                "unsqueeze_copy is only used by nvfuser to identify non-mutating ",
-                "alias ops, should be restored after fusion pass!");
-            IValue self, dim;
-            pop(stack, self, dim);
-            push(stack, at::unsqueeze(self.toTensor(), dim.toInt()));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_infer_unsqueeze_size({
-    Operator(
-        "prim::infer_unsqueeze_size(int[] a, int dim) -> int[]",
-        [](const Node* node) -> Operation {
-          return [](Stack& stack) {
-            auto dim = pop(stack).toInt();
-            auto size = pop(stack).toIntVector();
-            if (dim < 0) {
-              dim = dim + 1 + size.size();
-            }
-            auto it = size.begin() + dim;
-            size.insert(it, 1);
-            push(stack, IValue(size));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_infer_squeeze_dim_size({
-    Operator(
-        "prim::infer_squeeze_size.dim(int[] a, int dim) -> int[]",
-        [](const Node* node) -> Operation {
-          return [](Stack& stack) {
-            auto dim = pop(stack).toInt();
-            auto size = pop(stack).toIntVector();
-            if (dim < 0) {
-              dim = dim + size.size();
-            }
-            auto it = size.begin() + dim;
-            if (*it == 1) {
-              size.erase(it);
-            }
-            push(stack, IValue(size));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_infer_squeeze_size({
-    Operator(
-        "prim::infer_squeeze_size(int[] a) -> int[]",
-        [](const Node* node) -> Operation {
-          return [](Stack& stack) {
-            auto size = pop(stack).toIntVector();
-
-            for (auto it = size.begin(); it != size.end(); it++) {
-              if (*it == 1) {
-                auto pre = it - 1;
-                size.erase(it);
-                it = pre;
-              }
-            }
-            push(stack, IValue(size));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_expand_copy({
-    Operator(
-        "prim::expand_copy(Tensor self, int[] size, *, bool implicit=False) -> Tensor",
-        [](const Node* node) -> Operation {
-          return [node](Stack& stack) {
-            TORCH_CHECK(
-                node->s(attr::name) == "CudaFusionGroup",
-                "expand_copy is only used by nvfuser to identify non-mutating ",
-                "alias ops, should be restored after fusion pass!");
-            IValue self, size, implicit;
-            pop(stack, self, size, implicit);
-            push(stack, self.toTensor().expand(size.toIntVector()));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
-RegisterOperators reg_expand_as_copy({
-    Operator(
-        "prim::expand_as_copy(Tensor self, Tensor other) -> Tensor",
-        [](const Node* node) -> Operation {
-          return [node](Stack& stack) {
-            TORCH_CHECK(
-                node->s(attr::name) == "CudaFusionGroup",
-                "expand_as_copy is only used by nvfuser to identify non-mutating ",
-                "alias ops, should be restored after fusion pass!");
-            IValue self, other;
-            pop(stack, self, other);
-            push(
-                stack,
-                at::native::expand_as(self.toTensor(), other.toTensor()));
-          };
-        },
-        aliasAnalysisFromSchema()),
-});
-
-} // namespace
-
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/interface.h b/torch/csrc/jit/codegen/cuda/interface.h
index 01ea2e934035..0ccdfe2c9ebd 100644
--- a/torch/csrc/jit/codegen/cuda/interface.h
+++ b/torch/csrc/jit/codegen/cuda/interface.h
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <c10/macros/Export.h>
-#include <torch/csrc/jit/codegen/cuda/transform_view.h>
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/passes/pass_manager.h>
 #include <torch/csrc/jit/runtime/profiling_record.h>
@@ -35,9 +34,6 @@ struct CudaFuserInterface {
   void (*fn_insert_profile_inodes)(ProfilingRecord* pr) = nullptr;
   bool (*fn_profile_n)(const Node*) = nullptr;
   bool (*fn_skip_n)(const std::string&, bool flip) = nullptr;
-  AnalyzeViewConstraint (*fn_analyze_view)(
-      const std::vector<int64_t>& original_sizes,
-      const std::vector<int64_t>& new_sizes) = nullptr;
 };
 
 // Get interface, this is used by registration and user facing API internally
@@ -52,34 +48,10 @@ TORCH_API bool profileNode(const Node* node);
 
 TORCH_API bool skipNode(const std::string& symbol_str, bool flip = true);
 
-TORCH_API AnalyzeViewConstraint getViewConstraint(
-    const std::vector<int64_t>& original_sizes,
-    const std::vector<int64_t>& new_sizes);
-
-TORCH_API bool complyWith(
-    const at::Tensor& tensor,
-    const c10::TensorTypePtr& guard_tensor_type);
-
 TORCH_API bool isEnabled();
 TORCH_API bool setEnabled(bool is_enabled);
 TORCH_API bool canBeEnabled();
 
-struct TORCH_API NVFuserPassManager : public PassManager<NVFuserPassManager> {
-  static bool registerPass(bool enabled) {
-    bool old_value = PassManager::isRegistered();
-    if (enabled) {
-      PassManager::registerPass(fuseGraph);
-    } else {
-      PassManager::clearPass();
-    }
-    return old_value;
-  }
-
-  static bool isRegistered() {
-    return PassManager::isRegistered();
-  }
-};
-
 } // namespace cuda
 } // namespace fuser
 } // namespace jit
diff --git a/torch/csrc/jit/codegen/cuda/ir_all_nodes.h b/torch/csrc/jit/codegen/cuda/ir_all_nodes.h
deleted file mode 100644
index b86c2bb074ec..000000000000
--- a/torch/csrc/jit/codegen/cuda/ir_all_nodes.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#pragma once
-
-#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_internal_nodes.h>
-
-// TODO: remove this once the Kernel IR split is complete
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
diff --git a/torch/csrc/jit/codegen/cuda/nvfuser.cmake b/torch/csrc/jit/codegen/cuda/nvfuser.cmake
deleted file mode 100644
index 147003054766..000000000000
--- a/torch/csrc/jit/codegen/cuda/nvfuser.cmake
+++ /dev/null
@@ -1,69 +0,0 @@
-if(USE_CUDA)
-  set(TORCHLIB_FLAVOR torch_cuda)
-elseif(USE_ROCM)
-  set(TORCHLIB_FLAVOR torch_hip)
-endif()
-
-# The list of NVFUSER runtime files
-list(APPEND NVFUSER_RUNTIME_FILES
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/array.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_reduction.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_default.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/broadcast.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fp16_support.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fused_reduction.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fused_welford_helper.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fused_welford_impl.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/bf16_support.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_reduction.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_sync.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/helpers.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/index_utils.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/random_numbers.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tensor.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tuple.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/type_traits.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/welford.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/warp.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tensorcore.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/memory.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/swizzle.cu
-  ${TORCH_ROOT}/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh
-  ${TORCH_ROOT}/aten/src/ATen/cuda/detail/UnpackRaw.cuh
-)
-
-if(USE_ROCM)
-list(APPEND NVFUSER_RUNTIME_FILES
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/array_rocm.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/bf16_support_rocm.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_default_rocm.cu
-  ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/warp_rocm.cu
-)
-endif()
-
-file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/include/nvfuser_resources")
-
-# "stringify" NVFUSER runtime sources
-# (generate C++ header files embedding the original input as a string literal)
-set(NVFUSER_STRINGIFY_TOOL "${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/tools/stringify_file.py")
-foreach(src ${NVFUSER_RUNTIME_FILES})
-  get_filename_component(filename ${src} NAME_WE)
-  set(dst "${CMAKE_BINARY_DIR}/include/nvfuser_resources/${filename}.h")
-  add_custom_command(
-    COMMENT "Stringify NVFUSER runtime source file"
-    OUTPUT ${dst}
-    DEPENDS ${src} "${NVFUSER_STRINGIFY_TOOL}"
-    COMMAND ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst}
-  )
-  add_custom_target(nvfuser_rt_${filename} DEPENDS ${dst})
-  add_dependencies(${TORCHLIB_FLAVOR} nvfuser_rt_${filename})
-
-  # also generate the resource headers during the configuration step
-  # (so tools like clang-tidy can run w/o requiring a real build)
-  execute_process(COMMAND
-    ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst})
-endforeach()
-
-target_include_directories(${TORCHLIB_FLAVOR} PRIVATE "${CMAKE_BINARY_DIR}/include")
diff --git a/torch/csrc/jit/codegen/cuda/ops/all_ops.h b/torch/csrc/jit/codegen/cuda/ops/all_ops.h
deleted file mode 100644
index 07d3eb944e89..000000000000
--- a/torch/csrc/jit/codegen/cuda/ops/all_ops.h
+++ /dev/null
@@ -1,5 +0,0 @@
-#pragma once
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/ops/alias.h>
-#include <torch/csrc/jit/codegen/cuda/ops/composite.h>
-#include <torch/csrc/jit/codegen/cuda/ops/normalization.h>
diff --git a/torch/csrc/jit/codegen/cuda/register_interface.cpp b/torch/csrc/jit/codegen/cuda/register_interface.cpp
deleted file mode 100644
index ba50c1352e43..000000000000
--- a/torch/csrc/jit/codegen/cuda/register_interface.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <torch/csrc/jit/codegen/cuda/interface.h>
-#include <torch/csrc/jit/codegen/cuda/manager.h>
-#include <torch/csrc/jit/codegen/cuda/parser.h>
-#include <torch/csrc/jit/codegen/cuda/partition.h>
-
-#include <torch/csrc/jit/runtime/profiling_record.h>
-
-/*
- * Registers function pointers in interface.h
- */
-
-namespace torch {
-namespace jit {
-namespace fuser {
-namespace cuda {
-
-namespace {
-class RegisterInterface {
- public:
-  RegisterInterface() {
-    auto ptr = getFuserInterface();
-    ptr->fn_compile_n = &compileCudaFusionGroup;
-    ptr->fn_run_n_s = &runCudaFusionGroup;
-    ptr->fn_fuse_graph = &CudaFuseGraph;
-    ptr->fn_can_fuse_n = &isFusibleCudaFusionGroup;
-    ptr->fn_insert_profile_inodes = &InsertProfileNodes;
-    ptr->fn_profile_n = &shouldProfileNode;
-    ptr->fn_skip_n = &skipNodeKind;
-    ptr->fn_analyze_view = &analyzeViewConstraint;
-  }
-};
-
-static RegisterInterface register_interface_;
-
-class RegisterNVFuserPass {
- public:
-  RegisterNVFuserPass() {
-    NVFuserPassManager::registerPass(true);
-  }
-};
-
-static RegisterNVFuserPass register_nvfuser_pass_;
-
-} // namespace
-
-} // namespace cuda
-} // namespace fuser
-} // namespace jit
-} // namespace torch
diff --git a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
index 72a011febe76..b1b05c4f60cd 100644
--- a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
+++ b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
@@ -1,12 +1,12 @@
 #include <torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h>
 
-#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
 #include <torch/csrc/jit/codegen/fuser/compiler.h>
 
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
+#include <ATen/native/cuda/jit_utils.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/csrc/jit/resource_guard.h>
 
@@ -105,7 +105,7 @@ FusedKernelCUDA::FusedKernelCUDA(
           has_random),
       device_(device) {
   // Initializes driver's API context (if necessary)
-  executor_utils::initializeCudaContext();
+  at::cuda::jit::initializeCudaContext();
 
   // Note: hacked at::DeviceGuard since at::DeviceGuard was failing to work
   // properly in some scenarios
diff --git a/torch/csrc/jit/codegen/fuser/tensor_desc.h b/torch/csrc/jit/codegen/fuser/tensor_desc.h
index 992dd5f551cc..65f456e27ad5 100644
--- a/torch/csrc/jit/codegen/fuser/tensor_desc.h
+++ b/torch/csrc/jit/codegen/fuser/tensor_desc.h
@@ -26,7 +26,7 @@ struct TORCH_API TensorDesc {
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
   TensorDesc(const at::ScalarType& type, const std::vector<bool>& contiguity)
       : scalar_type{type}, contiguity{contiguity} {
-    if (contiguity.size() == 0) {
+    if (contiguity.empty()) {
       nDim_ = 0;
     } else {
       nDim_ = std::count(contiguity.begin(), contiguity.end(), false) +
@@ -59,7 +59,7 @@ struct TORCH_API TensorDesc {
 
   // True iff innermost stride is 1
   bool lastIsContiguous() const {
-    return (contiguity.size() == 0 || contiguity.back());
+    return (contiguity.empty() || contiguity.back());
   }
 
   static std::vector<bool> findContiguous(
diff --git a/torch/csrc/jit/codegen/onednn/graph_helper.cpp b/torch/csrc/jit/codegen/onednn/graph_helper.cpp
index a14dce108dd1..c04cb46a9216 100644
--- a/torch/csrc/jit/codegen/onednn/graph_helper.cpp
+++ b/torch/csrc/jit/codegen/onednn/graph_helper.cpp
@@ -40,7 +40,7 @@ Operator makeWildcardOp(Node* node) {
   auto o = Operator(node, opkind::Wildcard);
   // wildcard op contains only topology info
   for (size_t i = 0; i < node->inputs().size(); i++) {
-    o.setInput(static_cast<size_t>(NULL), i);
+    o.setInput(0, i);
   }
   for (size_t i = 0; i < node->outputs().size(); i++) {
     o.setOutput(i);
diff --git a/torch/csrc/jit/frontend/error_report.cpp b/torch/csrc/jit/frontend/error_report.cpp
index 1eb0b0a5acbf..275f2ad8e07f 100644
--- a/torch/csrc/jit/frontend/error_report.cpp
+++ b/torch/csrc/jit/frontend/error_report.cpp
@@ -1,7 +1,9 @@
 #include <torch/csrc/jit/frontend/error_report.h>
 
+#include <c10/util/Logging.h>
 #include <c10/util/Optional.h>
 #include <torch/csrc/jit/frontend/tree.h>
+#include <torch/csrc/utils/cpp_stacktraces.h>
 #include <torch/csrc/utils/memory.h>
 
 namespace torch::jit {
@@ -9,17 +11,42 @@ namespace torch::jit {
 // Avoid storing objects with destructor in thread_local for mobile build.
 #ifndef C10_MOBILE
 thread_local std::vector<Call> calls;
+
+namespace {
+std::string unwrap_backtrace(const c10::optional<std::string>& backtrace) {
+  if (backtrace.has_value()) {
+    return backtrace.value();
+  }
+  return c10::get_backtrace(/*frames_to_skip=*/1);
+}
+} // namespace
+#else // defined c10_MOBILE
+
+namespace {
+std::string unwrap_backtrace(const c10::optional<std::string>& backtrace) {
+  if (backtrace.has_value()) {
+    return backtrace.value();
+  }
+  return std::string("");
+}
+} // namespace
+
 #endif // C10_MOBILE
 
 ErrorReport::ErrorReport(const ErrorReport& e)
     : ss(e.ss.str()),
       context(e.context),
       the_message(e.the_message),
-      error_stack(e.error_stack.begin(), e.error_stack.end()) {}
+      error_stack(e.error_stack.begin(), e.error_stack.end()),
+      backtrace_(e.backtrace_) {}
 
 #ifndef C10_MOBILE
-ErrorReport::ErrorReport(SourceRange r)
-    : context(std::move(r)), error_stack(calls.begin(), calls.end()) {}
+ErrorReport::ErrorReport(
+    SourceRange r,
+    const c10::optional<std::string>& backtrace)
+    : context(std::move(r)),
+      error_stack(calls.begin(), calls.end()),
+      backtrace_(unwrap_backtrace(backtrace)) {}
 
 void ErrorReport::CallStack::update_pending_range(const SourceRange& range) {
   calls.back().caller_range = range;
@@ -35,7 +62,10 @@ ErrorReport::CallStack::~CallStack() {
   calls.pop_back();
 }
 #else // defined C10_MOBILE
-ErrorReport::ErrorReport(SourceRange r) : context(std::move(r)) {}
+ErrorReport::ErrorReport(
+    SourceRange r,
+    const c10::optional<std::string>& backtrace)
+    : context(std::move(r)), backtrace_(unwrap_backtrace(backtrace)) {}
 
 void ErrorReport::CallStack::update_pending_range(const SourceRange& range) {}
 
@@ -48,7 +78,7 @@ ErrorReport::CallStack::~CallStack() {}
 
 std::string get_stacked_errors(const std::vector<Call>& error_stack) {
   std::stringstream msg;
-  if (error_stack.size() > 0) {
+  if (!error_stack.empty()) {
     for (auto it = error_stack.rbegin(); it != error_stack.rend() - 1; ++it) {
       auto callee = it + 1;
 
@@ -77,6 +107,10 @@ const char* ErrorReport::what() const noexcept {
 
   msg << get_stacked_errors(error_stack);
 
+  if (get_cpp_stacktraces_enabled()) {
+    msg << "\n" << backtrace_;
+  }
+
   the_message = msg.str();
   return the_message.c_str();
 }
diff --git a/torch/csrc/jit/frontend/error_report.h b/torch/csrc/jit/frontend/error_report.h
index f3a77c76abcd..5fed498a4108 100644
--- a/torch/csrc/jit/frontend/error_report.h
+++ b/torch/csrc/jit/frontend/error_report.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <c10/util/Backtrace.h>
 #include <c10/util/Optional.h>
 #include <torch/csrc/jit/frontend/tree.h>
 
@@ -14,9 +15,17 @@ struct Call {
 struct TORCH_API ErrorReport : public std::exception {
   ErrorReport(const ErrorReport& e);
 
-  explicit ErrorReport(SourceRange r);
-  explicit ErrorReport(const TreeRef& tree) : ErrorReport(tree->range()) {}
-  explicit ErrorReport(const Token& tok) : ErrorReport(tok.range) {}
+  explicit ErrorReport(
+      SourceRange r,
+      const c10::optional<std::string>& backtrace = c10::nullopt);
+  explicit ErrorReport(
+      const TreeRef& tree,
+      const c10::optional<std::string>& backtrace = c10::nullopt)
+      : ErrorReport(tree->range(), backtrace) {}
+  explicit ErrorReport(
+      const Token& tok,
+      const c10::optional<std::string>& backtrace = c10::nullopt)
+      : ErrorReport(tok.range, backtrace) {}
 
   const char* what() const noexcept override;
 
@@ -42,6 +51,7 @@ struct TORCH_API ErrorReport : public std::exception {
   OwnedSourceRange context;
   mutable std::string the_message;
   std::vector<Call> error_stack;
+  std::string backtrace_;
 };
 
 template <typename T>
diff --git a/torch/csrc/jit/frontend/exit_transforms.cpp b/torch/csrc/jit/frontend/exit_transforms.cpp
index 4dcbc8ec7f4a..e0e5ec42ed0d 100644
--- a/torch/csrc/jit/frontend/exit_transforms.cpp
+++ b/torch/csrc/jit/frontend/exit_transforms.cpp
@@ -125,7 +125,7 @@ struct ExitTransformer {
   }
 
   static void removeOutputs(Block* b) {
-    while (b->outputs().size() > 0) {
+    while (!b->outputs().empty()) {
       b->eraseOutput(0);
     }
   }
@@ -347,7 +347,7 @@ struct ExitTransformer {
       new_if->addOutput()->setType(block->outputs().at(i)->type());
     }
 
-    while (block->outputs().size() > 0) {
+    while (!block->outputs().empty()) {
       block->eraseOutput(0);
     }
     for (auto out : new_if->outputs()) {
@@ -368,7 +368,7 @@ struct ExitTransformer {
   // never be used, it is safe to replace them with unitialized value
   void destroyNodeAfterExit(Node* n) {
     for (auto output : n->outputs()) {
-      if (output->uses().size() > 0) {
+      if (!output->uses().empty()) {
         output->replaceAllUsesWith(getUnitValue(output->type()));
       }
     }
diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp
index 435b613a382c..1c384995f98b 100644
--- a/torch/csrc/jit/frontend/ir_emitter.cpp
+++ b/torch/csrc/jit/frontend/ir_emitter.cpp
@@ -655,7 +655,7 @@ struct to_ir {
     // Type annotations exclude explicitly typing the "self" parameter, so in
     // the case that this is a method with self we expect one fewer parameter
     // annotation than the number of parameters this Def takes.
-    if (self && def.decl().params().size() == 0) {
+    if (self && def.decl().params().empty()) {
       throw ErrorReport(def.decl().params().range())
           << "methods must have a self argument";
     }
@@ -2776,7 +2776,7 @@ struct to_ir {
 
       const auto slicedArg = NamedValue(stmt.lhs().range(), "self", sliced);
       const auto rhs = NamedValue(stmt.rhs().range(), emitExpr(stmt.rhs()));
-      if (tensorIndices.size() == 0) {
+      if (tensorIndices.empty()) {
         // Common case: we only tried to index with int and slices. Emit the
         // correct augmented assignment op to the sliced value
         emitBuiltinCall(
@@ -2869,7 +2869,7 @@ struct to_ir {
       // rhs must be a tensor, implicitly convert int/float/complex/bool
       const auto convertedRhs = emitValueToTensor(rhs, slicedArg);
 
-      if (tensorIndices.size() == 0) {
+      if (tensorIndices.empty()) {
         // Common case: we only tried to index with int and slices. Copy the
         // RHS into the resulting tensor.
         graph->insert(aten::copy_, {slicedArg, convertedRhs}, {}, stmtRange);
@@ -3284,7 +3284,7 @@ struct to_ir {
           << expected_inputs << " arguments but found "
           << apply.inputs().size();
     }
-    if (apply.attributes().size() > 0) {
+    if (!apply.attributes().empty()) {
       throw ErrorReport(loc)
           << Var(apply.callee()).name().name() << " takes no keyword arguments";
     }
@@ -3304,7 +3304,7 @@ struct to_ir {
           << min_expected_inputs << " and " << max_expected_inputs
           << " but found " << position_arg_size;
     }
-    if (apply.attributes().size() > 0) {
+    if (!apply.attributes().empty()) {
       throw ErrorReport(loc)
           << Var(apply.callee()).name().name() << " takes no keyword arguments";
     }
@@ -3337,7 +3337,7 @@ struct to_ir {
     switch (form) {
       case prim::fork: {
         auto& trees = apply.inputs().tree()->trees();
-        if (trees.size() < 1) {
+        if (trees.empty()) {
           throw ErrorReport(apply)
               << "Expected at least one argument to fork()";
         }
@@ -3347,6 +3347,19 @@ struct to_ir {
         auto kwargs = emitAttributes(apply.attributes());
         return emitForkExpr(apply.range(), forked, args, kwargs);
       }
+      case prim::awaitable: {
+        auto tree = apply.inputs().tree();
+        if (!tree || tree->trees().size() < 1) {
+          throw ErrorReport(apply)
+              << "Expected at least one argument to awaitable()";
+        }
+        auto& trees = tree->trees();
+        auto awaited = emitSugaredExpr(Expr(trees[0]), 1);
+        TreeList sliced_trees(trees.begin() + 1, trees.end());
+        auto args = getNamedValues(sliced_trees, true);
+        auto kwargs = emitAttributes(apply.attributes());
+        return emitAwaitableExpr(apply.range(), awaited, args, kwargs);
+      }
       case prim::annotate: {
         checkApplyNumInputs(apply, 2);
         TypePtr type = typeParser_.parseTypeFromExpr(apply.inputs()[0]);
@@ -3474,7 +3487,7 @@ struct to_ir {
         bool all_ints = std::all_of(args.begin(), args.end(), [](Value* v) {
           return v->type()->cast<IntType>();
         });
-        if (args.size() == 0) {
+        if (args.empty()) {
           // empty inputs == torch.tensor([], dtype=....)
           auto inp_list =
               graph->insertNode(graph->createList(IntType::get(), {}))
@@ -3619,7 +3632,7 @@ struct to_ir {
         // zip(x, y) can be rewrite as subtrees:
         // IterableTree(IterableTree(x), IterableTree(y))
         auto inputs = apply.inputs();
-        if (inputs.size() == 0) {
+        if (inputs.empty()) {
           throw ErrorReport(apply)
               << "zip expected at least 1 arguments, got 0";
         }
@@ -3663,7 +3676,7 @@ struct to_ir {
   std::shared_ptr<SugaredValue> emitApplySpecialFormForList(
       Apply& apply,
       const TypePtr& type_hint = nullptr) {
-    if (apply.inputs().size() == 0) {
+    if (apply.inputs().empty()) {
       TypePtr type = type_hint ? type_hint : ListType::ofTensors();
       if (!type->cast<ListType>()) {
         throw ErrorReport(apply.range())
@@ -4121,6 +4134,43 @@ struct to_ir {
     return std::make_shared<SimpleValue>(node_output);
   }
 
+  std::shared_ptr<SugaredValue> emitAwaitableExpr(
+      SourceRange loc,
+      const std::shared_ptr<SugaredValue>& awaited,
+      at::ArrayRef<NamedValue> args,
+      at::ArrayRef<NamedValue> kwargs) {
+    auto g = method.graph();
+    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+    TypePtr out_type;
+
+    auto await_node =
+        g->insertNode(method.graph()->create(prim::awaitableClosure, 1))
+            ->setSourceRange(loc);
+
+    {
+      WithInsertPoint insert(await_node);
+      if (auto sv = dynamic_cast<ClosureValue*>(awaited.get())) {
+        Value* closure_output = sv->asValue(loc, method);
+        Block* closure_block = closure_output->node()->blocks().at(0);
+        TORCH_INTERNAL_ASSERT(closure_block->outputs().size() == 1);
+        out_type = closure_block->outputs().at(0)->type();
+        await_node->addInput(closure_output);
+      } else {
+        auto emit_closure_body = [&](Block* closure_block) {
+          auto fn_sugared_output = awaited->call(loc, method, args, kwargs, 1);
+          auto fn_simple_output = fn_sugared_output->asValue(loc, method);
+          closure_block->registerOutput(fn_simple_output);
+          out_type = fn_simple_output->type();
+        };
+        auto closure_value = emitClosure(emit_closure_body);
+        await_node->addInput(closure_value->asValue(loc, method));
+      }
+    }
+    Value* node_output =
+        await_node->output()->setType(AwaitType::create(out_type));
+    return std::make_shared<SimpleValue>(node_output);
+  }
+
   std::shared_ptr<SugaredValue> emitRpcExpr(const Apply& apply, Symbol rpc_op) {
     // TODO: This is a temporary apporoach to enable calling user fucntion
     // through RPC in TorchScript,
@@ -4140,7 +4190,7 @@ struct to_ir {
           << op_name << "(dst_worker_name, user_callable)\n"
           << "Now the number of arguments is " << apply.inputs().size();
     }
-    if (apply.attributes().size() != 0) {
+    if (!apply.attributes().empty()) {
       throw ErrorReport(apply)
           << op_name << "(dst_worker_name, user_callable, args, kwargs)"
           << "does not support kwargs yet";
@@ -4187,7 +4237,7 @@ struct to_ir {
     std::vector<NamedValue> kwargs;
     // Get args and kwargs as `NamedValue`s.
     // Similar to getNamedValues(..) and emitAttributes(..).
-    if (args_kwargs_timeout_trees.size() >= 1) {
+    if (!args_kwargs_timeout_trees.empty()) {
       // Unroll args from a Var that is known to be a Tuple.
       auto& args_tree = args_kwargs_timeout_trees[0];
       auto entry_sugared_values = emitSugaredExpr(Expr(args_tree), 1)
@@ -4298,7 +4348,7 @@ struct to_ir {
     // This is also the same behavior that C++ allows with {}
     // (cannot assign to a variable typed as auto)
     // These nodes will be removed in a later pass after initial compilation
-    if (values.size() == 0 && type_hint == nullptr) {
+    if (values.empty() && type_hint == nullptr) {
       auto node = graph->insertNode(graph->create(prim::EmptyListLiteral));
       node->output()->setType(ListType::ofTensors());
       return node->output();
@@ -5055,7 +5105,7 @@ struct to_ir {
     }
     auto idx = toIValue(idx_val);
     if (!idx) {
-      if (elems.size() == 0 ||
+      if (elems.empty() ||
           !convertibleToList(tuple_typ, ListType::create(elems[0]))) {
         throw ErrorReport(loc)
             << "Cannot index into a " << tuple_typ->repr_str()
@@ -5615,7 +5665,7 @@ void runCleanupPasses(std::shared_ptr<Graph>& to_clean) {
 // and do not record it as a unique name. This allows python printing to
 // be able to export and import more consistently named graphs
 bool meaningfulName(const std::string& name) {
-  if (name.size() == 0)
+  if (name.empty())
     return false;
   if (name[0] == '$')
     return false;
diff --git a/torch/csrc/jit/frontend/lexer.h b/torch/csrc/jit/frontend/lexer.h
index 8a1d4fba0437..80a3bad10c29 100644
--- a/torch/csrc/jit/frontend/lexer.h
+++ b/torch/csrc/jit/frontend/lexer.h
@@ -428,11 +428,11 @@ struct Lexer {
   }
   // Return the current token, and then move to the next one
   Token next() {
-    if (next_tokens.size() == 0)
+    if (next_tokens.empty())
       reportError("Lexer invariant violated: empty token queue");
     Token r = std::move(next_tokens.front());
     next_tokens.erase(next_tokens.begin());
-    if (next_tokens.size() == 0) {
+    if (next_tokens.empty()) {
       lex();
     }
     return r;
@@ -517,7 +517,7 @@ struct Lexer {
           while (indent_stack.back() != depth) {
             indent_stack.pop_back();
             next_tokens.emplace_back(TK_DEDENT, r.range);
-            if (indent_stack.size() == 0) {
+            if (indent_stack.empty()) {
               reportError(
                   "invalid indent level " + c10::guts::to_string(depth), r);
             }
diff --git a/torch/csrc/jit/frontend/schema_matching.cpp b/torch/csrc/jit/frontend/schema_matching.cpp
index bce550edfced..21c853ecf83b 100644
--- a/torch/csrc/jit/frontend/schema_matching.cpp
+++ b/torch/csrc/jit/frontend/schema_matching.cpp
@@ -578,7 +578,7 @@ std::pair<size_t, MatchedSchema> matchSchemas(
     at::ArrayRef<NamedValue> kwargs,
     const c10::optional<NamedValue>& self,
     bool render_errors) {
-  TORCH_INTERNAL_ASSERT(schemas.size() > 0);
+  TORCH_INTERNAL_ASSERT(!schemas.empty());
   // if there is only one schema, we do not need to try without conversions
   // first. this is faster and puts less dead code in the graph.
   if (schemas.size() == 1) {
@@ -667,7 +667,7 @@ static Value* emitBuiltinNode(
 }
 
 std::string getFullSchemaName(const ::c10::FunctionSchema& schema) {
-  if (schema.overload_name() != "") {
+  if (!schema.overload_name().empty()) {
     return schema.operator_name().name + "." + schema.overload_name();
   }
   return schema.operator_name().name;
@@ -743,12 +743,12 @@ Value* emitBuiltinCall(
   }
 
   // no operators found with the same name, print out similarly named operators
-  if (schemas.size() == 0) {
+  if (schemas.empty()) {
     const auto close_symbols = findSimilarOperators(name);
     auto error = ErrorReport(loc);
     const auto& user_function_name = name.toQualString();
     error << "Unknown builtin op: " << user_function_name << ".\n";
-    if (close_symbols.size() == 0) {
+    if (close_symbols.empty()) {
       error
           << "Could not find any similar ops to " << user_function_name
           << ". This op may not exist or may not be currently supported in TorchScript.\n";
diff --git a/torch/csrc/jit/frontend/schema_type_parser.cpp b/torch/csrc/jit/frontend/schema_type_parser.cpp
index a7244a57150e..f702286a3899 100644
--- a/torch/csrc/jit/frontend/schema_type_parser.cpp
+++ b/torch/csrc/jit/frontend/schema_type_parser.cpp
@@ -11,6 +11,7 @@
 #include <string>
 
 using c10::AliasInfo;
+using c10::AwaitType;
 using c10::BoolType;
 using c10::CapsuleType;
 using c10::ComplexType;
@@ -25,7 +26,6 @@ using c10::ListType;
 using c10::MemoryFormatType;
 using c10::NoneType;
 using c10::NumberType;
-using c10::OptionalType;
 using c10::QSchemeType;
 using c10::QuantizerType;
 using c10::RRefType;
@@ -81,7 +81,7 @@ TypePtr SchemaTypeParser::parseBaseType() {
 
   auto it = type_map.find(text);
   if (it == type_map.end()) {
-    if (text.size() > 0 && islower(text[0])) {
+    if (!text.empty() && islower(text[0])) {
       // lower case identifiers that are not otherwise valid types
       // are treated as type variables
       return c10::TypeFactory::createNamed<VarType>(text);
@@ -175,7 +175,14 @@ c10::optional<c10::Device> SchemaTypeParser::tryToParseDeviceType() {
       const std::string& num = L.expect(TK_NUMBER).text();
       // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
       std::string::size_type num_len;
-      device_idx = c10::stoi(num, &num_len);
+      try {
+        device_idx = c10::stoi(num, &num_len);
+      } catch (const std::invalid_argument& e) {
+        throw ErrorReport(L.cur())
+            << "Device index cannot be converted to integer";
+      } catch (const std::out_of_range& e) {
+        throw ErrorReport(L.cur()) << "Device index is too long";
+      }
     }
     if (dev == "cuda") {
       return c10::Device(at::kCUDA, device_idx);
@@ -192,7 +199,15 @@ c10::optional<bool> SchemaTypeParser::tryToParseRequiresGrad() {
   const std::string& num = L.expect(TK_NUMBER).text();
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   std::string::size_type num_len;
-  return (bool)c10::stoi(num, &num_len);
+
+  try {
+    return (bool)c10::stoi(num, &num_len);
+  } catch (const std::invalid_argument& e) {
+    throw ErrorReport(L.cur())
+        << "Field requires_grad cannot be converted to integer";
+  } catch (const std::out_of_range& e) {
+    throw ErrorReport(L.cur()) << "Field requires_grad is too long";
+  }
 }
 
 TypePtr SchemaTypeParser::parseRefinedTensor() {
@@ -245,8 +260,15 @@ TypePtr SchemaTypeParser::parseRefinedTensor() {
           const std::string& num = L.expect(TK_NUMBER).text();
           // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
           std::string::size_type num_len;
-          auto stride = c10::stoll(num, &num_len);
-          strides.push_back(stride);
+          try {
+            auto stride = c10::stoll(num, &num_len);
+            strides.push_back(stride);
+          } catch (const std::invalid_argument& e) {
+            throw ErrorReport(L.cur())
+                << "The stride value cannot be converted to int";
+          } catch (const std::out_of_range& e) {
+            throw ErrorReport(L.cur()) << "The stride is too big";
+          }
         });
         return;
       }
@@ -277,7 +299,14 @@ TypePtr SchemaTypeParser::parseRefinedTensor() {
     const std::string& num = L.expect(TK_NUMBER).text();
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     std::string::size_type num_len;
-    int64_t dim = c10::stoll(num, &num_len);
+    int64_t dim = 0;
+    try {
+      dim = c10::stoll(num, &num_len);
+    } catch (const std::invalid_argument& e) {
+      throw ErrorReport(L.cur()) << "The number can't be converted to int";
+    } catch (const std::out_of_range& e) {
+      throw ErrorReport(L.cur()) << "Number is too big";
+    }
     if (shape_symbol) {
       L.expect(')');
       dim = -dim;
@@ -339,6 +368,14 @@ SchemaTypeParser::parseFakeAndRealType() {
     auto subalias = std::move(p.second);
     L.expect(')');
     fake_value = real_value = c10::TypeFactory::create<FutureType>(subtype);
+  } else if (L.cur().kind == TK_IDENT && L.cur().text() == "Await") {
+    L.next(); // Await
+    L.expect('(');
+    auto p = parseType();
+    auto subtype = std::move(p.first);
+    auto subalias = std::move(p.second);
+    L.expect(')');
+    fake_value = real_value = c10::TypeFactory::create<AwaitType>(subtype);
   } else if (L.cur().kind == TK_IDENT && L.cur().text() == "RRef") {
     L.next(); // RRef
     L.expect('(');
diff --git a/torch/csrc/jit/frontend/script_type_parser.cpp b/torch/csrc/jit/frontend/script_type_parser.cpp
index b254618ab4f7..301fd3cf8e84 100644
--- a/torch/csrc/jit/frontend/script_type_parser.cpp
+++ b/torch/csrc/jit/frontend/script_type_parser.cpp
@@ -37,7 +37,7 @@ TypePtr ScriptTypeParser::subscriptToType(
       // i.e. `typing.Tuple[()]`. Allow for parsing an empty tuple literal
       // here. See https://docs.python.org/3/library/typing.html#typing.Tuple
       auto tup_literal = TupleLiteral(subscript.subscript_exprs()[0]);
-      if (tup_literal.inputs().size() > 0) {
+      if (!tup_literal.inputs().empty()) {
         throw ErrorReport(tup_literal.range())
             << "Tuple literal in Tuple type annotation must not "
             << "have any elements!";
@@ -85,6 +85,15 @@ TypePtr ScriptTypeParser::subscriptToType(
     auto elem_type =
         parseTypeFromExprImpl(*subscript.subscript_exprs().begin());
     return FutureType::create(elem_type);
+  } else if (typeName == "Await" || typeName == "torch.jit._Await") {
+    if (subscript.subscript_exprs().size() != 1) {
+      throw ErrorReport(subscript)
+          << " expected exactly one element type but found "
+          << subscript.subscript_exprs().size();
+    }
+    auto elem_type =
+        parseTypeFromExprImpl(*subscript.subscript_exprs().begin());
+    return AwaitType::create(elem_type);
   } else if (typeName == "RRef") {
     if (subscript.subscript_exprs().size() != 1) {
       throw ErrorReport(subscript)
@@ -457,6 +466,10 @@ c10::IValue ScriptTypeParser::parseClassConstant(const Assign& assign) {
     throw ErrorReport(assign.range())
         << "Expected to a variable for class constant";
   }
+  if (!assign.type().present()) {
+    throw ErrorReport(assign.range())
+        << "Expected a type to present for class constant";
+  }
   const auto final_type = assign.type().get();
   auto expr = assign.rhs().get();
   if (final_type.kind() != TK_SUBSCRIPT) {
diff --git a/torch/csrc/jit/frontend/source_range.cpp b/torch/csrc/jit/frontend/source_range.cpp
index 0b1f4936a8c2..4693e66f63fa 100644
--- a/torch/csrc/jit/frontend/source_range.cpp
+++ b/torch/csrc/jit/frontend/source_range.cpp
@@ -18,7 +18,7 @@ StringCordView::StringCordView(
   accumulated_sizes_.push_back(0);
   size_t running_sum = 0;
   for (auto& s : pieces_) {
-    if (s.size() > 0) {
+    if (!s.empty()) {
       running_sum += s.size();
       accumulated_sizes_.push_back(running_sum);
     }
@@ -26,7 +26,7 @@ StringCordView::StringCordView(
 }
 
 size_t StringCordView::find(const std::string& tok, size_t start) const {
-  if (tok.size() == 0) {
+  if (tok.empty()) {
     return 0;
   }
 
@@ -257,7 +257,7 @@ void SourceRange::print_with_context(
     size_t line, col;
     std::tie(filename, line, col) = *flc;
     out << "  File \"" << filename << "\", line " << line;
-    if (funcname != "") {
+    if (!funcname.empty()) {
       out << ", in " << funcname;
     }
     out << "\n";
diff --git a/torch/csrc/jit/frontend/sugared_value.cpp b/torch/csrc/jit/frontend/sugared_value.cpp
index beeadf4a0a50..7eb01d3286e3 100644
--- a/torch/csrc/jit/frontend/sugared_value.cpp
+++ b/torch/csrc/jit/frontend/sugared_value.cpp
@@ -168,6 +168,12 @@ std::shared_ptr<SugaredValue> SimpleValue::attr(
         }
       }
     }
+  } else if (auto awaitType = value_->type()->cast<AwaitType>()) {
+    auto elType = awaitType->getElementType();
+    auto& g = *m.graph();
+    auto v = g.insert(prim::awaitable_wait, {value_}, {}, loc);
+    auto sv = std::make_shared<SimpleValue>(v);
+    return sv->attr(loc, m, field);
   } else if (auto classType = value_->type()->cast<ClassType>()) {
     // This is a class, emit the proper attribute lookup
     if (classType->findMethod(field)) {
@@ -503,7 +509,7 @@ RangeValue::RangeValue(
   }
 
   Graph& g = *m.graph();
-  if (inputs.size() == 0) {
+  if (inputs.empty()) {
     throw ErrorReport(loc) << "range expected at least 1 arguments, got 0";
   } else if (inputs.size() == 1) {
     end_ = inputs[0];
@@ -613,7 +619,7 @@ void IterableTree::addChild(
     GraphFunction& m,
     const SugaredValuePtr& iter_value) {
   c10::optional<int64_t> child_len = iter_value->staticLen();
-  if (children_.size() == 0) {
+  if (children_.empty()) {
     unroll_length_ = child_len;
   } else {
     if ((unroll_length_ && !child_len) || (child_len && !unroll_length_)) {
@@ -637,7 +643,7 @@ std::shared_ptr<SugaredValue> MagicMethod::call(
     at::ArrayRef<NamedValue> args,
     at::ArrayRef<NamedValue> kwargs,
     size_t n_binders) {
-  if (args.size() > 0) {
+  if (!args.empty()) {
     Value* self = args[0].value(*m.graph());
     if (auto class_ptr = self->type()->cast<ClassType>()) {
       return SimpleValue(self)
diff --git a/torch/csrc/jit/frontend/sugared_value.h b/torch/csrc/jit/frontend/sugared_value.h
index f507513d0e82..7c024447401f 100644
--- a/torch/csrc/jit/frontend/sugared_value.h
+++ b/torch/csrc/jit/frontend/sugared_value.h
@@ -512,7 +512,7 @@ struct TORCH_API CastValue : public BuiltinFunction {
       at::ArrayRef<NamedValue> args,
       at::ArrayRef<NamedValue> kwargs,
       size_t n_binders) override {
-    if (args.size() == 1 && kwargs.size() == 0) {
+    if (args.size() == 1 && kwargs.empty()) {
       auto len_op = std::make_shared<BuiltinFunction>(aten::len, at::nullopt);
       auto gt_op = std::make_shared<BuiltinFunction>(aten::gt, at::nullopt);
       auto zero = m.graph()->insertConstant(0);
@@ -550,7 +550,7 @@ struct TORCH_API TensorCastValue : public SugaredValue {
       at::ArrayRef<NamedValue> args,
       at::ArrayRef<NamedValue> kwargs,
       size_t n_binders) override {
-    TORCH_INTERNAL_ASSERT(args.size() == 0 && kwargs.size() == 0);
+    TORCH_INTERNAL_ASSERT(args.empty() && kwargs.empty());
     Value* dtype_const = m.graph()->insertConstant(dtype_, loc);
     std::vector<NamedValue> kwargs_{
         self_, NamedValue(loc, "dtype", dtype_const)};
@@ -658,15 +658,15 @@ struct TORCH_API RangeValue : SugaredValue {
   }
 
  private:
-  Value* start_;
-  Value* end_;
-  Value* step_;
+  Value* start_{};
+  Value* end_{};
+  Value* step_{};
   // a flag to determine if it's a simple range() call with only end_ from
   // arguments If true, we will not insert length calculation and index
   // derivation nodes to simplify the graph and enable more possible
   // optimizations
-  bool has_only_end_;
-  c10::optional<int64_t> static_len_ = c10::nullopt;
+  bool has_only_end_{};
+  c10::optional<int64_t> static_len_;
 };
 
 // Specialized Tree structure to matched against for special handling
diff --git a/torch/csrc/jit/frontend/tracer.cpp b/torch/csrc/jit/frontend/tracer.cpp
index 9f71a36492cf..682f2bba0c37 100644
--- a/torch/csrc/jit/frontend/tracer.cpp
+++ b/torch/csrc/jit/frontend/tracer.cpp
@@ -552,6 +552,15 @@ void TracingState::setValue(const IValue& v, Value* value) {
     auto& var = v.toTensor();
     AT_ASSERT(var.defined());
     env_stack.back()[v] = value;
+
+    // If the value comes from a CallFunction or CallMethod, it may not have
+    // shape information attached. For debuggability, we enhance the type
+    // information by assigning the concrete value's tupe to the jit::Value.
+    if (auto tensor_type = value->type()->cast<TensorType>()) {
+      if (!tensor_type->isComplete()) {
+        value->inferTypeFrom(var);
+      }
+    }
   } else if (v.isTensorList()) {
     auto outputs = v.toTensorList();
     Node* unpack_node =
diff --git a/torch/csrc/jit/frontend/tracer.h b/torch/csrc/jit/frontend/tracer.h
index 9d19259078ad..7c355857e5b1 100644
--- a/torch/csrc/jit/frontend/tracer.h
+++ b/torch/csrc/jit/frontend/tracer.h
@@ -179,9 +179,7 @@ inline void warn(const char* _reason, const char* _kind = nullptr) {
 TORCH_API void setWarn(warn_fn_type fn);
 
 struct TORCH_API NoWarn {
-  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
   NoWarn() : state(getTracingState()) {
-    // NOLINTNEXTLINE(*.cplusplus.UninitializedObject)
     if (state) {
       prev = state->warn;
       state->warn = false;
@@ -193,7 +191,7 @@ struct TORCH_API NoWarn {
     }
   }
   std::shared_ptr<TracingState> state;
-  bool prev;
+  bool prev{false};
 };
 
 struct WithNestedTracingFrame {
diff --git a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp
index b6f1937808e5..87031ec5867f 100644
--- a/torch/csrc/jit/ir/alias_analysis.cpp
+++ b/torch/csrc/jit/ir/alias_analysis.cpp
@@ -104,7 +104,7 @@ class MutableTypePtrHelper {
                 (*maybe_inner_types).end());
           }
         }
-        if (mutable_types.size() == 0) {
+        if (mutable_types.empty()) {
           return c10::nullopt;
         }
         return mutable_types;
@@ -123,6 +123,14 @@ class MutableTypePtrHelper {
         }
         return c10::nullopt;
       }
+      case TypeKind::AwaitType: {
+        if (auto maybe_mut_types = mapTypeToAliasTypeSet(
+                type->castRaw<AwaitType>()->getElementType())) {
+          return {
+              AliasTypeSet{AwaitType::create(*toSingleType(*maybe_mut_types))}};
+        }
+        return c10::nullopt;
+      }
       case TypeKind::TupleType: {
         std::vector<TypePtr> mutable_types;
         for (const TypePtr& inner : type->expectRef<TupleType>().elements()) {
@@ -133,7 +141,7 @@ class MutableTypePtrHelper {
                 (*maybe_inner_types).end());
           }
         }
-        if (mutable_types.size() == 0) {
+        if (mutable_types.empty()) {
           return c10::nullopt;
         }
         return {AliasTypeSet{TupleType::create(mutable_types)}};
@@ -631,6 +639,11 @@ void AliasDb::analyzeImpl(Node* node) {
       return analyzeFork(node);
     case aten::wait:
       return analyzeWait(node);
+    case prim::awaitable:
+    case prim::awaitable_nowait:
+      return analyzeAwaitable(node);
+    case prim::awaitable_wait:
+      return analyzeAwaitableWait(node);
     case prim::rpc_async:
     case prim::rpc_sync:
     case prim::rpc_remote:
@@ -736,7 +749,7 @@ void AliasDb::analyzeImpl(Node* node) {
       // run into lifetime issues with the graph
       std::vector<std::shared_ptr<Graph>>& graphs =
           function_call_copies_[graph.get()];
-      if (graphs.size() == 0) {
+      if (graphs.empty()) {
         graphs.push_back(graph);
         analyzeSubgraph(node, graph);
       } else {
@@ -914,7 +927,7 @@ void AliasDb::analyzeImpl(Node* node) {
     // Otherwise it is the form of a|fresh, which we can ignore, taking the
     // conservative assumption that the output must alias `a`, e.g
     //   aten::cuda(Tensor(a) self) -> Tensor(a|fresh)
-    if (!inputs_has_alias && formal->beforeSets().size()) {
+    if (!inputs_has_alias && !formal->beforeSets().empty()) {
       giveFreshAlias(actual);
     }
 
@@ -1051,6 +1064,27 @@ void AliasDb::analyzeWait(Node* node) {
   writeRegistry_->registerWriteToAllWildcards(node);
 }
 
+void AliasDb::analyzeAwaitable(Node* node) {
+  for (const auto input : node->inputs()) {
+    setWildcard(input);
+  }
+
+  for (const auto output : node->outputs()) {
+    giveFreshAlias(output);
+  }
+}
+
+void AliasDb::analyzeAwaitableWait(Node* node) {
+  TORCH_INTERNAL_ASSERT(node->kind() == prim::awaitable_wait);
+  for (const auto output : node->outputs()) {
+    setWildcard(output);
+  }
+  // the awaitable subgraph that `wait` is waiting on may write to any of its
+  // inputs. We don't have a reliable way of recovering the awaitable inputs, so
+  // for safety we just register a write to every wildcard.
+  writeRegistry_->registerWriteToAllWildcards(node);
+}
+
 void AliasDb::analyzeRpcAsync(Node* node) {
   for (const auto input : node->inputs()) {
     setWildcard(input);
@@ -1385,9 +1419,8 @@ bool AliasDb::mayContainAlias(
     const at::ArrayRef<Value*> a,
     const at::ArrayRef<Value*> b) const {
   auto a_elems = getElements(a);
-  return a_elems.size() == 0
-      ? false
-      : memoryDAG_->mayContainAlias(a_elems, getElements(b));
+  return a_elems.empty() ? false
+                         : memoryDAG_->mayContainAlias(a_elems, getElements(b));
 }
 
 bool AliasDb::mayContainAlias(Value* a, const at::ArrayRef<Value*> b) const {
@@ -1395,7 +1428,7 @@ bool AliasDb::mayContainAlias(Value* a, const at::ArrayRef<Value*> b) const {
     return false;
   }
   auto b_elems = getElements(b);
-  return b_elems.size() == 0
+  return b_elems.empty()
       ? false
       : memoryDAG_->mayContainAlias(elementMap_.at(a), b_elems);
 }
diff --git a/torch/csrc/jit/ir/alias_analysis.h b/torch/csrc/jit/ir/alias_analysis.h
index c365cd969189..380943635ea3 100644
--- a/torch/csrc/jit/ir/alias_analysis.h
+++ b/torch/csrc/jit/ir/alias_analysis.h
@@ -225,6 +225,8 @@ class AliasDb {
   void analyzeBroadcastingChunk(Node* node);
   void analyzeFork(Node* node);
   void analyzeWait(Node* node);
+  void analyzeAwaitable(Node* node);
+  void analyzeAwaitableWait(Node* node);
   void analyzeRpcAsync(Node* node);
   void analyzeBatchNorm(Node* node);
   void analyzeInstanceNorm(Node* node);
diff --git a/torch/csrc/jit/ir/graph_utils.cpp b/torch/csrc/jit/ir/graph_utils.cpp
new file mode 100644
index 000000000000..35186b7d833b
--- /dev/null
+++ b/torch/csrc/jit/ir/graph_utils.cpp
@@ -0,0 +1,93 @@
+#include <torch/csrc/jit/ir/graph_utils.h>
+
+namespace torch {
+namespace jit {
+
+TypePtr getTensorType(const at::Tensor& t, bool complete) {
+  auto r = TensorType::create(t);
+  if (!complete) {
+    r = r->dimensionedOnly();
+  }
+  return r;
+}
+
+TypePtr inferShapeAndTypeForInput(
+    TypePtr input_type,
+    Stack::const_iterator& s_iter,
+    const Stack::const_iterator& s_iter_end,
+    bool complete) {
+  if (auto tuple_type = input_type->cast<TupleType>()) {
+    std::vector<TypePtr> types;
+    for (const auto& sub_type : tuple_type->containedTypes()) {
+      TORCH_INTERNAL_ASSERT(s_iter != s_iter_end);
+      types.emplace_back(
+          inferShapeAndTypeForInput(sub_type, s_iter, s_iter_end, complete));
+    }
+    return TupleType::create(types);
+  } else if (auto list_type = input_type->cast<ListType>()) {
+    const TypePtr& sub_type = list_type->getElementType();
+    auto elem_type =
+        inferShapeAndTypeForInput(sub_type, s_iter, s_iter_end, complete);
+    return ListType::create(elem_type);
+  } else if (auto tensor_type = input_type->cast<TensorType>()) {
+    auto type = getTensorType(s_iter->toTensor(), complete);
+    s_iter++;
+    return type;
+  } else if (auto optional_type = input_type->cast<OptionalType>()) {
+    const TypePtr& sub_type = optional_type->getElementType();
+    auto elem_type =
+        inferShapeAndTypeForInput(sub_type, s_iter, s_iter_end, complete);
+    return OptionalType::create(elem_type);
+  } else {
+    // Primitive type, keep as is.
+    s_iter++;
+    return input_type;
+  }
+}
+
+void setInputTensorTypes(
+    Graph& g,
+    const Stack& stack,
+    bool complete,
+    const std::vector<int>& param_count_list) {
+  at::ArrayRef<Value*> input_values = g.inputs();
+  auto s_iter = stack.begin();
+  size_t list_idx = 0;
+  if (!param_count_list.empty()) {
+    TORCH_INTERNAL_ASSERT(
+        input_values.size() == param_count_list.size(),
+        " input_values:",
+        input_values.size(),
+        " vs param_count_list:",
+        param_count_list.size());
+  }
+  for (auto v : input_values) {
+    // Leave packed param types alone. This is needed for downstream passes
+    // (like alias analysis) to work properly. This will be unpacked later
+    // in unpackQuantizedWeights.
+    if (auto named_type = v->type()->cast<c10::NamedType>()) {
+      if (auto qualname = named_type->name()) {
+        if (getCustomClass(qualname->qualifiedName())) {
+          if (param_count_list.empty()) {
+            AT_ASSERT(s_iter != stack.end());
+            s_iter++;
+          } else {
+            if (param_count_list[list_idx] > 0) {
+              AT_ASSERT(s_iter != stack.end());
+            }
+            s_iter += param_count_list[list_idx];
+          }
+          list_idx++;
+          continue;
+        }
+      }
+    }
+    auto type =
+        inferShapeAndTypeForInput(v->type(), s_iter, stack.end(), complete);
+    v->setType(type);
+    list_idx++;
+  }
+}
+
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/ir/graph_utils.h b/torch/csrc/jit/ir/graph_utils.h
new file mode 100644
index 000000000000..6d4f296fb132
--- /dev/null
+++ b/torch/csrc/jit/ir/graph_utils.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <torch/csrc/jit/ir/ir.h>
+
+#include <vector>
+
+namespace torch {
+namespace jit {
+
+TORCH_API TypePtr getTensorType(const at::Tensor& t, bool complete);
+
+TORCH_API TypePtr inferShapeAndTypeForInput(
+    TypePtr input_type,
+    Stack::const_iterator& s_iter,
+    const Stack::const_iterator& s_iter_end,
+    bool complete);
+
+TORCH_API void setInputTensorTypes(
+    Graph& g,
+    const Stack& stack,
+    bool complete,
+    const std::vector<int>& param_count_list = {});
+
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp
index a67bec800cbf..947fd13d0a59 100644
--- a/torch/csrc/jit/ir/ir.cpp
+++ b/torch/csrc/jit/ir/ir.cpp
@@ -473,28 +473,26 @@ void Node::lint() const {
   }
 
   for (auto o : outputs()) {
-    size_t i = 0;
     for (auto use : o->uses()) {
       // Use invariants
       // - Use is consistent with inputs
       // - Every user node is live (checked in Graph)
       AT_ASSERT(use.user->inputs_[use.offset] == o);
-      i++;
     }
   }
 
   // Node subclass invariants
   switch (kind()) {
     case prim::Constant:
-      AT_ASSERT(inputs_.size() == 0);
+      AT_ASSERT(inputs_.empty());
       break;
     case prim::Return:
       // Return uses is zero
-      AT_ASSERT(outputs().size() == 0);
+      AT_ASSERT(outputs().empty());
       break;
     case prim::Param:
       // Param inputs is zero
-      AT_ASSERT(inputs_.size() == 0);
+      AT_ASSERT(inputs_.empty());
       break;
     case prim::PythonOp: {
       // Python operator cconv is correct
@@ -837,7 +835,7 @@ std::string Value::debugNameBase() const {
 
 bool Value::isValidName(const std::string& name) {
   // Empty strings are legal
-  if (!name.size()) {
+  if (name.empty()) {
     return true;
   }
 
@@ -863,7 +861,7 @@ Value* Value::setDebugName(const std::string& name) {
   }
 
   // allow "" to clear the uniquename
-  if (name == "") {
+  if (name.empty()) {
     return this;
   }
 
@@ -1126,7 +1124,7 @@ const Operator& Node::getOperator() const {
     er << *inputs()[i]->type();
   }
   const auto& candidates = getAllOperatorsFor(kind());
-  if (candidates.size() > 0) {
+  if (!candidates.empty()) {
     er << "\ncandidates were:\n";
     for (auto& candidate : candidates) {
       er << "  " << candidate->schema() << "\n";
@@ -2111,7 +2109,7 @@ std::vector<Value*> inlineCallTo(
       module_instance_info = c10::make_optional(ModuleInstanceInfo(
           class_type_ptr, to_replace->input(0)->node()->s(attr::name)));
     } else if (
-        to_replace->owningGraph()->inputs().size() > 0 &&
+        !to_replace->owningGraph()->inputs().empty() &&
         to_replace->input(0) == to_replace->owningGraph()->inputs()[0]) {
       // This CallMethod must correspond to method of the same object
       // to which this graph belongs.
diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h
index c070f9fa2cdc..80287e5c437c 100644
--- a/torch/csrc/jit/ir/ir.h
+++ b/torch/csrc/jit/ir/ir.h
@@ -850,7 +850,7 @@ struct TORCH_API Node {
     return removeAttribute(Symbol::attr(name));
   }
   bool hasAttributes() const {
-    return values_.size() > 0;
+    return !values_.empty();
   }
   size_t numAttributes() const {
     return values_.size();
diff --git a/torch/csrc/jit/ir/ir_views.h b/torch/csrc/jit/ir/ir_views.h
index 549997906627..ff380c5d146a 100644
--- a/torch/csrc/jit/ir/ir_views.h
+++ b/torch/csrc/jit/ir/ir_views.h
@@ -126,8 +126,9 @@ struct LoopView {
         trip_count->toInt() !=
             std::numeric_limits<int64_t>::max() || // it is a constant but not
                                                    // the default one
-        currentTripCount()->uses().size() >
-            0; // it is actually being used in the body.
+        !currentTripCount()
+             ->uses()
+             .empty(); // it is actually being used in the body.
 
     if (condition_is_always_true) {
       // if the trip count was not specified this was a user-written while True:
diff --git a/torch/csrc/jit/ir/irparser.cpp b/torch/csrc/jit/ir/irparser.cpp
index 8a132a29fd9b..25c04a00e7ff 100644
--- a/torch/csrc/jit/ir/irparser.cpp
+++ b/torch/csrc/jit/ir/irparser.cpp
@@ -189,16 +189,40 @@ ParsedLiteral IRParser::parseScalarLiteral(Node* n) {
       str += L.cur().text();
       if (str.find('j') != std::string::npos) {
         r.k = AttributeKind::c;
-        auto imag = c10::stod(str.substr(0, str.size() - 1));
+        double imag = 0.0f;
+        try {
+          imag = c10::stod(str.substr(0, str.size() - 1));
+        } catch (const std::invalid_argument& e) {
+          throw ErrorReport(token.range)
+              << "Number cannot be converted to double";
+        } catch (const std::out_of_range& e) {
+          throw ErrorReport(token.range)
+              << "Number is too long to be represented in type double";
+        }
         r.c = c10::complex<double>(0, imag);
       } else if (
           str.find('.') != std::string::npos ||
           str.find('e') != std::string::npos) {
         r.k = AttributeKind::f;
-        r.f = c10::stod(str);
+        try {
+          r.f = c10::stod(str);
+        } catch (const std::invalid_argument& e) {
+          throw ErrorReport(token.range)
+              << "Number cannot be converted to double";
+        } catch (const std::out_of_range& e) {
+          throw ErrorReport(token.range)
+              << "Number is too long to be represented in type double";
+        }
       } else {
         r.k = AttributeKind::i;
-        r.i = c10::stoll(str);
+        try {
+          r.i = c10::stoll(str);
+        } catch (const std::invalid_argument& e) {
+          throw ErrorReport(token.range)
+              << "Number cannot be converted to integer";
+        } catch (const std::out_of_range& e) {
+          throw ErrorReport(token.range) << "Number is too big";
+        }
       }
       L.next();
       return r;
diff --git a/torch/csrc/jit/jit_log.cpp b/torch/csrc/jit/jit_log.cpp
index 37e0c5f00a1f..d520ee2fa7ec 100644
--- a/torch/csrc/jit/jit_log.cpp
+++ b/torch/csrc/jit/jit_log.cpp
@@ -93,7 +93,7 @@ void JitLoggingConfig::parse() {
   files_to_levels.clear();
   std::string line;
   while (std::getline(in_ss, line, ':')) {
-    if (line.size() == 0) {
+    if (line.empty()) {
       continue;
     }
 
diff --git a/torch/csrc/jit/jit_opt_limit.cpp b/torch/csrc/jit/jit_opt_limit.cpp
index 749f12197a0f..8e11c4db7e6f 100644
--- a/torch/csrc/jit/jit_opt_limit.cpp
+++ b/torch/csrc/jit/jit_opt_limit.cpp
@@ -37,7 +37,7 @@ static std::unordered_map<std::string, int64_t> parseJITOptLimitOption(
   std::unordered_map<std::string, int64_t> passes_to_opt_limits;
   std::string line;
   while (std::getline(in_ss, line, ':')) {
-    if (line.size() == 0) {
+    if (line.empty()) {
       continue;
     }
     auto index_at = line.find_last_of('=');
diff --git a/torch/csrc/jit/mobile/compatibility/backport.cpp b/torch/csrc/jit/mobile/compatibility/backport.cpp
index 3cf184667f1e..f4058501a031 100644
--- a/torch/csrc/jit/mobile/compatibility/backport.cpp
+++ b/torch/csrc/jit/mobile/compatibility/backport.cpp
@@ -10,11 +10,8 @@
 namespace torch {
 namespace jit {
 
-using caffe2::serialize::FileAdapter;
 using caffe2::serialize::IStreamAdapter;
-using caffe2::serialize::PyTorchStreamReader;
 using caffe2::serialize::PyTorchStreamWriter;
-using caffe2::serialize::ReadAdapterInterface;
 
 const static BackportManager backportManager;
 
diff --git a/torch/csrc/jit/mobile/compatibility/backport_manager.cpp b/torch/csrc/jit/mobile/compatibility/backport_manager.cpp
index 2bad08c0765a..884ad1a973a4 100644
--- a/torch/csrc/jit/mobile/compatibility/backport_manager.cpp
+++ b/torch/csrc/jit/mobile/compatibility/backport_manager.cpp
@@ -15,11 +15,9 @@
 namespace torch {
 namespace jit {
 
-using caffe2::serialize::FileAdapter;
 using caffe2::serialize::IStreamAdapter;
 using caffe2::serialize::PyTorchStreamReader;
 using caffe2::serialize::PyTorchStreamWriter;
-using caffe2::serialize::ReadAdapterInterface;
 
 // Current support bytecode version
 namespace {
diff --git a/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp b/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp
index 9ce71eba9ce7..a8ca880ecf4b 100644
--- a/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp
+++ b/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp
@@ -235,7 +235,7 @@ std::unordered_map<std::string, OperatorInfo> _get_model_ops_and_info(
       // grab name
       std::string op_name = op.at(0).toStringRef();
       std::string op_overload_name = op.at(1).toStringRef();
-      if (op_overload_name != "") {
+      if (!op_overload_name.empty()) {
         op_name.append(".");
         op_name.append(op_overload_name);
       }
diff --git a/torch/csrc/jit/mobile/function.cpp b/torch/csrc/jit/mobile/function.cpp
index f2f1b368e034..f35ac0733581 100644
--- a/torch/csrc/jit/mobile/function.cpp
+++ b/torch/csrc/jit/mobile/function.cpp
@@ -77,7 +77,6 @@ bool Function::initialize_operators(bool should_check_operators) {
     if (!func.has_value()) {
       unsupported_op_names.insert(operator_str(opname));
       all_ops_supported = false;
-      break;
     } else {
       code_.operators_[i] = *func;
     }
diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp
index 5acd5cab3985..bd28710fbef3 100644
--- a/torch/csrc/jit/mobile/import.cpp
+++ b/torch/csrc/jit/mobile/import.cpp
@@ -310,7 +310,7 @@ void BytecodeDeserializer::parseMethods(
     c10::ivalue::TupleElements&& vals,
     c10::optional<c10::ivalue::TupleElements>&& debug_handles,
     mobile::CompilationUnit& mcu) {
-  TORCH_CHECK(vals.size() > 0, "Bytecode has no elements. ");
+  TORCH_CHECK(!vals.empty(), "Bytecode has no elements. ");
   // Initialized with the version number when kProducedBytecodeVersion was
   // introduced. The old models (some of them already in production) without
   // version number are seen as version 3 (deprecated).
diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp
index 09f5c061c7f1..d19ee838f4ca 100644
--- a/torch/csrc/jit/mobile/interpreter.cpp
+++ b/torch/csrc/jit/mobile/interpreter.cpp
@@ -232,7 +232,7 @@ bool InterpreterState::run(Stack& stack) {
         } break;
         case RET:
           leaveFrame();
-          if (frames_.size() > 0) {
+          if (!frames_.empty()) {
             continue;
           }
           return false;
diff --git a/torch/csrc/jit/mobile/module.cpp b/torch/csrc/jit/mobile/module.cpp
index 8f61cc2402e1..9cb5b0374142 100644
--- a/torch/csrc/jit/mobile/module.cpp
+++ b/torch/csrc/jit/mobile/module.cpp
@@ -141,8 +141,7 @@ void slot_named_params_recurse(
   size_t nslots = slots.size();
   for (const auto i : c10::irange(nslots)) {
     auto slot = slots[i];
-    std::string name =
-        parent_name.size() == 0 ? parent_name : parent_name + ".";
+    std::string name = parent_name.empty() ? parent_name : parent_name + ".";
     name += obj->type()->getAttributeName(i);
     // TODO: Fix this filter. Requires_grad is not the appropriate
     // filter of a parameter, but is a temporary hack to help probable
diff --git a/torch/csrc/jit/mobile/nnc/registry.h b/torch/csrc/jit/mobile/nnc/registry.h
index 14c6939d4c4f..c68a4f7a19c6 100644
--- a/torch/csrc/jit/mobile/nnc/registry.h
+++ b/torch/csrc/jit/mobile/nnc/registry.h
@@ -15,7 +15,7 @@ struct TORCH_API NNCKernel {
   virtual int execute(void** /* args */) = 0;
 };
 
-C10_DECLARE_REGISTRY(NNCKernelRegistry, NNCKernel);
+TORCH_DECLARE_REGISTRY(NNCKernelRegistry, NNCKernel);
 
 #define REGISTER_NNC_KERNEL(id, kernel, ...)     \
   extern "C" {                                   \
diff --git a/torch/csrc/jit/mobile/profiler_edge.cpp b/torch/csrc/jit/mobile/profiler_edge.cpp
index 6d0342424170..5a90bae54f91 100644
--- a/torch/csrc/jit/mobile/profiler_edge.cpp
+++ b/torch/csrc/jit/mobile/profiler_edge.cpp
@@ -82,8 +82,8 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler(
 void KinetoEdgeCPUProfiler::recordBackendMemoryEvent(
     void* ptr,
     int64_t alloc_size,
-    int64_t total_allocated,
-    int64_t total_reserved,
+    size_t total_allocated,
+    size_t total_reserved,
     c10::Device device) {
   c10::reportMemoryUsageToProfiler(
       ptr, alloc_size, total_allocated, total_reserved, device);
diff --git a/torch/csrc/jit/mobile/profiler_edge.h b/torch/csrc/jit/mobile/profiler_edge.h
index 8eea4ff32b53..6ac74b053c36 100644
--- a/torch/csrc/jit/mobile/profiler_edge.h
+++ b/torch/csrc/jit/mobile/profiler_edge.h
@@ -75,8 +75,8 @@ class TORCH_API KinetoEdgeCPUProfiler {
   void recordBackendMemoryEvent(
       void* ptr,
       int64_t alloc_size,
-      int64_t total_allocated,
-      int64_t total_reserved,
+      size_t total_allocated,
+      size_t total_reserved,
       c10::Device device);
 
   ~KinetoEdgeCPUProfiler();
diff --git a/torch/csrc/jit/mobile/type_parser.cpp b/torch/csrc/jit/mobile/type_parser.cpp
index 671584e142a4..8e8f4795ada5 100644
--- a/torch/csrc/jit/mobile/type_parser.cpp
+++ b/torch/csrc/jit/mobile/type_parser.cpp
@@ -132,7 +132,7 @@ TypePtr TypeParser::parse() {
   const auto& baseTypes = DynamicTypeFactory::basePythonTypes();
   auto simpleTypeIt = baseTypes.find(token);
   if (simpleTypeIt != baseTypes.end()) {
-    if (cur() != "]" && cur() != "," && cur() != "") {
+    if (cur() != "]" && cur() != "," && !cur().empty()) {
       TORCH_CHECK(
           false, "Simple type ", token, " is followed by ", "invalid chars.");
     }
diff --git a/torch/csrc/jit/mobile/upgrader_mobile.cpp b/torch/csrc/jit/mobile/upgrader_mobile.cpp
index 0e52829255d0..f22050857695 100644
--- a/torch/csrc/jit/mobile/upgrader_mobile.cpp
+++ b/torch/csrc/jit/mobile/upgrader_mobile.cpp
@@ -2,7 +2,7 @@
  * @generated
  * This is an auto-generated file. Please do not modify it by hand.
  * To re-generate, please run:
- * cd ~/pytorch && python torch/csrc/jit/mobile/upgrader_mobile.cpp
+ * cd ~/pytorch && python torchgen/operator_versions/gen_mobile_upgraders.py
  */
 
 #include <caffe2/serialize/versions.h>
@@ -27,45 +27,65 @@ getOperatorVersionMapForMobile() {
                     std::vector<Upgrader>({
                         Upgrader({0, 3, "div_Scalar_0_3", 0})
                     })},
+                {std::string("aten::div.Scalar_mode"),
+                    std::vector<Upgrader>({
+                        Upgrader({0, 3, "div_Scalar_mode_0_3", 1})
+                    })},
                 {std::string("aten::div.Tensor"),
                     std::vector<Upgrader>({
-                        Upgrader({0, 3, "div_Tensor_0_3", 1})
+                        Upgrader({0, 3, "div_Tensor_0_3", 2})
+                    })},
+                {std::string("aten::div.Tensor_mode"),
+                    std::vector<Upgrader>({
+                        Upgrader({0, 3, "div_Tensor_mode_0_3", 3})
                     })},
                 {std::string("aten::div.out"),
                     std::vector<Upgrader>({
-                        Upgrader({0, 3, "div_out_0_3", 4})
+                        Upgrader({0, 3, "div_out_0_3", 8})
+                    })},
+                {std::string("aten::div.out_mode"),
+                    std::vector<Upgrader>({
+                        Upgrader({0, 3, "div_out_mode_0_3", 9})
                     })},
                 {std::string("aten::div_.Scalar"),
                     std::vector<Upgrader>({
-                        Upgrader({0, 3, "div__Scalar_0_3", 2})
+                        Upgrader({0, 3, "div__Scalar_0_3", 4})
+                    })},
+                {std::string("aten::div_.Scalar_mode"),
+                    std::vector<Upgrader>({
+                        Upgrader({0, 3, "div__Scalar_mode_0_3", 5})
                     })},
                 {std::string("aten::div_.Tensor"),
                     std::vector<Upgrader>({
-                        Upgrader({0, 3, "div__Tensor_0_3", 3})
+                        Upgrader({0, 3, "div__Tensor_0_3", 6})
+                    })},
+                {std::string("aten::div_.Tensor_mode"),
+                    std::vector<Upgrader>({
+                        Upgrader({0, 3, "div__Tensor_mode_0_3", 7})
                     })},
                 {std::string("aten::gelu"),
                     std::vector<Upgrader>({
-                        Upgrader({0, 9, "gelu_0_9", 5})
+                        Upgrader({0, 9, "gelu_0_9", 11})
                     })},
                 {std::string("aten::gelu.out"),
                     std::vector<Upgrader>({
-                        Upgrader({0, 9, "gelu_out_0_9", 6})
+                        Upgrader({0, 9, "gelu_out_0_9", 12})
                     })},
                 {std::string("aten::linspace"),
                     std::vector<Upgrader>({
-                        Upgrader({0, 7, "linspace_0_7", 7})
+                        Upgrader({0, 7, "linspace_0_7", 13})
                     })},
                 {std::string("aten::linspace.out"),
                     std::vector<Upgrader>({
-                        Upgrader({0, 7, "linspace_out_0_7", 8})
+                        Upgrader({0, 7, "linspace_out_0_7", 14})
                     })},
                 {std::string("aten::logspace"),
                     std::vector<Upgrader>({
-                        Upgrader({0, 8, "logspace_0_8", 9})
+                        Upgrader({0, 8, "logspace_0_8", 15})
                     })},
                 {std::string("aten::logspace.out"),
                     std::vector<Upgrader>({
-                        Upgrader({0, 8, "logspace_out_0_8", 10})
+                        Upgrader({0, 8, "logspace_out_0_8", 16})
                     })},
       });
   return operatorVersionMapForMobile;
@@ -120,6 +140,25 @@ const std::vector<ByteCodeFunctionWithOperator>& getUpgraderBytecodeList() {
                                    OperatorString({"aten::div", "Scalar_mode", 3}),
                            }), // operators list
                    }),
+                   ByteCodeFunctionWithOperator({
+                           mobile::Function::registerFunc(
+                               "div_Scalar_mode_0_3",
+                               std::vector<Instruction>({
+                                           Instruction{OpCode::STOREN, 1, 3},
+                                           Instruction{OpCode::MOVE, 1, 0},
+                                           Instruction{OpCode::MOVE, 2, 0},
+                                           Instruction{OpCode::MOVE, 3, 0},
+                                           Instruction{OpCode::OP, 0, 0},
+                                           Instruction{OpCode::RET, 0, 0},
+                                   }), // instructions list,
+                               std::vector<c10::IValue>(), // constants list,
+                               std::vector<c10::TypePtr>(), // types list,
+                               3
+                           ),
+                           std::vector<OperatorString>({
+                                   OperatorString({"aten::div", "Scalar_mode", 3}),
+                           }), // operators list
+                   }),
                    ByteCodeFunctionWithOperator({
                            mobile::Function::registerFunc(
                                "div_Tensor_0_3",
@@ -162,6 +201,25 @@ const std::vector<ByteCodeFunctionWithOperator>& getUpgraderBytecodeList() {
                                    OperatorString({"aten::div", "Tensor_mode", 3}),
                            }), // operators list
                    }),
+                   ByteCodeFunctionWithOperator({
+                           mobile::Function::registerFunc(
+                               "div_Tensor_mode_0_3",
+                               std::vector<Instruction>({
+                                           Instruction{OpCode::STOREN, 1, 3},
+                                           Instruction{OpCode::MOVE, 1, 0},
+                                           Instruction{OpCode::MOVE, 2, 0},
+                                           Instruction{OpCode::MOVE, 3, 0},
+                                           Instruction{OpCode::OP, 0, 0},
+                                           Instruction{OpCode::RET, 0, 0},
+                                   }), // instructions list,
+                               std::vector<c10::IValue>(), // constants list,
+                               std::vector<c10::TypePtr>(), // types list,
+                               3
+                           ),
+                           std::vector<OperatorString>({
+                                   OperatorString({"aten::div", "Tensor_mode", 3}),
+                           }), // operators list
+                   }),
                    ByteCodeFunctionWithOperator({
                            mobile::Function::registerFunc(
                                "div__Scalar_0_3",
@@ -208,6 +266,25 @@ const std::vector<ByteCodeFunctionWithOperator>& getUpgraderBytecodeList() {
                                    OperatorString({"aten::div_", "Scalar_mode", 3}),
                            }), // operators list
                    }),
+                   ByteCodeFunctionWithOperator({
+                           mobile::Function::registerFunc(
+                               "div__Scalar_mode_0_3",
+                               std::vector<Instruction>({
+                                           Instruction{OpCode::STOREN, 1, 3},
+                                           Instruction{OpCode::MOVE, 1, 0},
+                                           Instruction{OpCode::MOVE, 2, 0},
+                                           Instruction{OpCode::MOVE, 3, 0},
+                                           Instruction{OpCode::OP, 0, 0},
+                                           Instruction{OpCode::RET, 0, 0},
+                                   }), // instructions list,
+                               std::vector<c10::IValue>(), // constants list,
+                               std::vector<c10::TypePtr>(), // types list,
+                               3
+                           ),
+                           std::vector<OperatorString>({
+                                   OperatorString({"aten::div_", "Scalar_mode", 3}),
+                           }), // operators list
+                   }),
                    ByteCodeFunctionWithOperator({
                            mobile::Function::registerFunc(
                                "div__Tensor_0_3",
@@ -250,6 +327,25 @@ const std::vector<ByteCodeFunctionWithOperator>& getUpgraderBytecodeList() {
                                    OperatorString({"aten::div_", "Tensor_mode", 3}),
                            }), // operators list
                    }),
+                   ByteCodeFunctionWithOperator({
+                           mobile::Function::registerFunc(
+                               "div__Tensor_mode_0_3",
+                               std::vector<Instruction>({
+                                           Instruction{OpCode::STOREN, 1, 3},
+                                           Instruction{OpCode::MOVE, 1, 0},
+                                           Instruction{OpCode::MOVE, 2, 0},
+                                           Instruction{OpCode::MOVE, 3, 0},
+                                           Instruction{OpCode::OP, 0, 0},
+                                           Instruction{OpCode::RET, 0, 0},
+                                   }), // instructions list,
+                               std::vector<c10::IValue>(), // constants list,
+                               std::vector<c10::TypePtr>(), // types list,
+                               3
+                           ),
+                           std::vector<OperatorString>({
+                                   OperatorString({"aten::div_", "Tensor_mode", 3}),
+                           }), // operators list
+                   }),
                    ByteCodeFunctionWithOperator({
                            mobile::Function::registerFunc(
                                "div_out_0_3",
@@ -300,6 +396,49 @@ const std::vector<ByteCodeFunctionWithOperator>& getUpgraderBytecodeList() {
                                    OperatorString({"aten::div", "out_mode", 4}),
                            }), // operators list
                    }),
+                   ByteCodeFunctionWithOperator({
+                           mobile::Function::registerFunc(
+                               "div_out_mode_0_3",
+                               std::vector<Instruction>({
+                                           Instruction{OpCode::STOREN, 1, 4},
+                                           Instruction{OpCode::MOVE, 1, 0},
+                                           Instruction{OpCode::MOVE, 2, 0},
+                                           Instruction{OpCode::MOVE, 3, 0},
+                                           Instruction{OpCode::MOVE, 4, 0},
+                                           Instruction{OpCode::OP, 0, 0},
+                                           Instruction{OpCode::RET, 0, 0},
+                                   }), // instructions list,
+                               std::vector<c10::IValue>(), // constants list,
+                               std::vector<c10::TypePtr>(), // types list,
+                               4
+                           ),
+                           std::vector<OperatorString>({
+                                   OperatorString({"aten::div", "out_mode", 4}),
+                           }), // operators list
+                   }),
+                   ByteCodeFunctionWithOperator({
+                           mobile::Function::registerFunc(
+                               "full_names_0_4",
+                               std::vector<Instruction>({
+                                           Instruction{OpCode::STOREN, 1, 7},
+                                           Instruction{OpCode::MOVE, 1, 0},
+                                           Instruction{OpCode::MOVE, 2, 0},
+                                           Instruction{OpCode::MOVE, 3, 0},
+                                           Instruction{OpCode::MOVE, 4, 0},
+                                           Instruction{OpCode::MOVE, 5, 0},
+                                           Instruction{OpCode::MOVE, 6, 0},
+                                           Instruction{OpCode::MOVE, 7, 0},
+                                           Instruction{OpCode::OP, 0, 0},
+                                           Instruction{OpCode::RET, 0, 0},
+                                   }), // instructions list,
+                               std::vector<c10::IValue>(), // constants list,
+                               std::vector<c10::TypePtr>(), // types list,
+                               7
+                           ),
+                           std::vector<OperatorString>({
+                                   OperatorString({"aten::full", "names", 7}),
+                           }), // operators list
+                   }),
                    ByteCodeFunctionWithOperator({
                            mobile::Function::registerFunc(
                                "gelu_0_9",
diff --git a/torch/csrc/jit/operator_upgraders/README.md b/torch/csrc/jit/operator_upgraders/README.md
index 75639006e503..616799720730 100644
--- a/torch/csrc/jit/operator_upgraders/README.md
+++ b/torch/csrc/jit/operator_upgraders/README.md
@@ -38,7 +38,7 @@ When making changes to the operators, the first thing to identify is if it's BC/
   ```
   class TestVersionedLinspaceV7(torch.nn.Module):
       def __init__(self):
-          super(TestVersionedLinspaceV7, self).__init__()
+          super().__init__()
 
       def forward(self, a: Union[int, float, complex], b: Union[int, float, complex]):
           c = torch.linspace(a, b, steps=5)
@@ -164,7 +164,7 @@ When making changes to the operators, the first thing to identify is if it's BC/
             # Step 2. Write down how current module should look like
             class MyModuleFloat(torch.nn.Module):
                 def __init__(self):
-                    super(MyModuleFloat, self).__init__()
+                    super().__init__()
 
                 def forward(self, a, b: float):
                     return a / b
diff --git a/torch/csrc/jit/passes/batch_mm.cpp b/torch/csrc/jit/passes/batch_mm.cpp
index 8a109d7ffdb6..0770098b1e7e 100644
--- a/torch/csrc/jit/passes/batch_mm.cpp
+++ b/torch/csrc/jit/passes/batch_mm.cpp
@@ -120,7 +120,7 @@ RegisterOperators mm_tree_reduction_reg({Operator(
       }
       drop(stack, num_inputs);
 
-      AT_ASSERT(inputs.size() > 0);
+      AT_ASSERT(!inputs.empty());
       AT_ASSERT(inputs.size() % 2 == 0);
       size_t side_num_elems = inputs.size() / 2;
       auto lhs_inputs = at::TensorList(inputs).slice(0, side_num_elems);
@@ -372,7 +372,7 @@ std::pair<std::vector<Node*>, std::vector<Node*>> gatherIndependentMMUses(
     Value* value,
     AliasDb& alias_db) {
   const auto postprocess = [&](std::vector<Node*> mms) {
-    if (mms.size() == 0) {
+    if (mms.empty()) {
       return mms;
     }
     std::sort(mms.begin(), mms.end(), [](Node* n, Node* m) {
diff --git a/torch/csrc/jit/passes/canonicalize.cpp b/torch/csrc/jit/passes/canonicalize.cpp
index d8cad4e04435..0dfc9f6dd915 100644
--- a/torch/csrc/jit/passes/canonicalize.cpp
+++ b/torch/csrc/jit/passes/canonicalize.cpp
@@ -143,7 +143,7 @@ bool isBeforeOrAfter(const Use& a, const Use& b, bool checking_before) {
 }
 
 c10::optional<const Use> firstOrLastUse(Value* v, bool find_first) {
-  if (v->uses().size() == 0) {
+  if (v->uses().empty()) {
     return c10::nullopt;
   }
   Use extreme_use = v->uses()[0];
diff --git a/torch/csrc/jit/passes/check_strict_fusion.cpp b/torch/csrc/jit/passes/check_strict_fusion.cpp
index 866dba99bcd2..16841051066f 100644
--- a/torch/csrc/jit/passes/check_strict_fusion.cpp
+++ b/torch/csrc/jit/passes/check_strict_fusion.cpp
@@ -98,7 +98,7 @@ void checkForUnfusedOps(Node* enter_node) {
       unfused_nodes_not_used_in_guard.push_back(unfused);
     }
   }
-  if (unfused_nodes_not_used_in_guard.size()) {
+  if (!unfused_nodes_not_used_in_guard.empty()) {
     std::stringstream ss;
     ss << "Found unfused operators: \n";
     for (Node* unfused : unfused_nodes_not_used_in_guard) {
diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp
index baadc821dd8c..4bd656788d77 100644
--- a/torch/csrc/jit/passes/constant_propagation.cpp
+++ b/torch/csrc/jit/passes/constant_propagation.cpp
@@ -144,6 +144,7 @@ std::unordered_set<Symbol> skip_list = {
     prim::profile,
     prim::profile_ivalue,
     prim::unchecked_unwrap_optional, // TODO remove
+    prim::awaitable,
     aten::dequantize,
     // TODO (zach): we should consider skipping tensor factories in the cases
     // where the constant tensor would be large but cheap to create.
@@ -358,7 +359,7 @@ struct ConstantPropagator {
     }
     return no_mutation && !n->kind().is_onnx() &&
         skip_list.count(n->kind()) == 0 && !n->isNondeterministic() &&
-        !n->hasSideEffects() && n->blocks().size() == 0;
+        !n->hasSideEffects() && n->blocks().empty();
   }
 
   void ConstantPropagation(at::ArrayRef<Block*> blocks) {
diff --git a/torch/csrc/jit/passes/create_functional_graphs.cpp b/torch/csrc/jit/passes/create_functional_graphs.cpp
index d5d85f6f5b2a..c929e311b376 100644
--- a/torch/csrc/jit/passes/create_functional_graphs.cpp
+++ b/torch/csrc/jit/passes/create_functional_graphs.cpp
@@ -37,7 +37,7 @@ struct FunctionalGraphSlicer {
  private:
   bool isEmptyFunctionalGraph(Node* n) {
     auto g = n->g(attr::Subgraph);
-    return g->inputs().size() == 0 && g->outputs().size() == 0;
+    return g->inputs().empty() && g->outputs().empty();
   }
 
   void nonConstNodes(Block* block, size_t* num) {
diff --git a/torch/csrc/jit/passes/dead_code_elimination.cpp b/torch/csrc/jit/passes/dead_code_elimination.cpp
index d8504c212ed0..2f6a6de86dbf 100644
--- a/torch/csrc/jit/passes/dead_code_elimination.cpp
+++ b/torch/csrc/jit/passes/dead_code_elimination.cpp
@@ -286,8 +286,8 @@ class DeadCodeEliminator {
             "Node ",
             it->kind().toQualString(),
             " which outputs ",
-            (node->outputs().size() > 0 ? node->outputs().at(0)->debugName()
-                                        : "n/a"),
+            (!node->outputs().empty() ? node->outputs().at(0)->debugName()
+                                      : "n/a"),
             " will be removed");
         it.destroyCurrent();
       }
diff --git a/torch/csrc/jit/passes/dtype_analysis.cpp b/torch/csrc/jit/passes/dtype_analysis.cpp
index eb01fca895b5..c5fe1599c43b 100644
--- a/torch/csrc/jit/passes/dtype_analysis.cpp
+++ b/torch/csrc/jit/passes/dtype_analysis.cpp
@@ -162,7 +162,7 @@ using DtypePropRule = std::function<bool(Node*)>;
 bool setIfAllDtypeMatch(Node* n) {
   // Sets all tensor outputs to the dtype of the first input
   // only if all inputs are the same dtype, otherwise do nothing
-  TORCH_INTERNAL_ASSERT(n->inputs().size() >= 1);
+  TORCH_INTERNAL_ASSERT(!n->inputs().empty());
   auto first_arg = n->inputs().at(0);
   auto tensor_type = first_arg->type()->cast<TensorType>();
   TORCH_INTERNAL_ASSERT(tensor_type, "Expecting a tensor type");
@@ -278,7 +278,7 @@ struct DtypePropagationPass {
       const at::ArrayRef<Value*>& list2) {
     // This is currently a placeholder for MobileNet
     // After Month1: implement the merge function
-    TORCH_INTERNAL_ASSERT(list1.size() == 0, "Not implemented yet");
+    TORCH_INTERNAL_ASSERT(list1.empty(), "Not implemented yet");
     return false;
   }
 
diff --git a/torch/csrc/jit/passes/fixup_trace_scope_blocks.cpp b/torch/csrc/jit/passes/fixup_trace_scope_blocks.cpp
index f0d18497f46e..c6ce13437840 100644
--- a/torch/csrc/jit/passes/fixup_trace_scope_blocks.cpp
+++ b/torch/csrc/jit/passes/fixup_trace_scope_blocks.cpp
@@ -155,7 +155,7 @@ struct ConvertTracedAttrReferences {
         for (Value* v : sub_unresolved) {
           n->addInput(v);
         }
-      } else if (n->blocks().size()) {
+      } else if (!n->blocks().empty()) {
         for (Block* sub_block : n->blocks()) {
           auto sub_unresolved =
               convertAttrReferencesToLocalGetAttrs(sub_block, prefix, self);
@@ -326,7 +326,7 @@ void convertReturnsToTuples(Block* b) {
           WithInsertPoint guard(sub_block->return_node());
           Node* return_tup =
               g->insertNode(g->createTuple(sub_block->outputs()));
-          while (sub_block->outputs().size()) {
+          while (!sub_block->outputs().empty()) {
             sub_block->eraseOutput(0);
           }
           sub_block->registerOutput(return_tup->output());
@@ -344,7 +344,7 @@ void convertReturnsToTuples(Block* b) {
           n->output(rev_idx)->replaceAllUsesWith(tup_unpack->output(rev_idx));
           n->eraseOutput(rev_idx);
         }
-      } else if (sub_block->outputs().size() == 0) {
+      } else if (sub_block->outputs().empty()) {
         WithInsertPoint guard(sub_block->return_node());
         sub_block->registerOutput(g->insertNode(g->createNone())->output());
         n->addOutput()->setType(NoneType::get());
diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp
index a653c05927ff..6000fba43c21 100644
--- a/torch/csrc/jit/passes/graph_fuser.cpp
+++ b/torch/csrc/jit/passes/graph_fuser.cpp
@@ -199,7 +199,7 @@ struct GraphFuser {
   bool isFusableDefault(Node* node, bool strict_fuser_check) {
     bool fusableDevice = true;
     for (const auto& output : node->outputs()) {
-      if (output->uses().size() > 0) {
+      if (!output->uses().empty()) {
         fusableDevice &= isFusableDevice(output, strict_fuser_check);
       }
     }
@@ -307,7 +307,7 @@ struct GraphFuser {
       auto outputs = node->outputs();
       for (const auto i : c10::irange(outputs.size())) {
         auto output = outputs[i];
-        if (output->uses().size() == 0)
+        if (output->uses().empty())
           continue;
         consumer_subgraph->registerOutput(merged->outputs()[i]);
         auto new_output = consumer_group->addOutput();
@@ -455,7 +455,7 @@ struct GraphFuser {
     // fusion in cases where uses remain after the consumer
     // if these exist, re-route them to the version of producer
     // created in FusionGroup
-    if (producer->uses().size() != 0) {
+    if (!producer->uses().empty()) {
       getSubgraph(group).registerOutput(merged->output());
       Value* new_producer = group->addOutput();
       new_producer->copyMetadata(producer);
@@ -586,7 +586,7 @@ struct GraphFuser {
   }
 
   at::ArrayRef<Value*> broadcast_tensors(value_list inputs) {
-    AT_ASSERT(inputs.size() > 0);
+    AT_ASSERT(!inputs.empty());
     auto* g = inputs[0]->owningGraph();
     auto* input_list =
         g->insertNode(g->createList(TensorType::get(), inputs))->output();
diff --git a/torch/csrc/jit/passes/guard_elimination.cpp b/torch/csrc/jit/passes/guard_elimination.cpp
index abc7c25738bb..e3b85a460a7a 100644
--- a/torch/csrc/jit/passes/guard_elimination.cpp
+++ b/torch/csrc/jit/passes/guard_elimination.cpp
@@ -131,7 +131,7 @@ struct GuardElimination {
 
         // find all uses of the input that the guard node dominates
         std::vector<Use> uses = input->uses();
-        while (uses.size() > 0) {
+        while (!uses.empty()) {
           auto use = uses.at(uses.size() - 1);
           uses.pop_back();
 
diff --git a/torch/csrc/jit/passes/inline_forked_closures.cpp b/torch/csrc/jit/passes/inline_forked_closures.cpp
index 771050030c97..fe854b9017e8 100644
--- a/torch/csrc/jit/passes/inline_forked_closures.cpp
+++ b/torch/csrc/jit/passes/inline_forked_closures.cpp
@@ -16,7 +16,7 @@ namespace jit {
 // subgraph, replace the context unpacking value with the new graph input.
 // fork(foo) ->
 // def foo(a, b):
-void inlineForkedClosure(Node* fork_closure) {
+void inlineForkedClosure(Node* fork_closure, NodeKind genKind) {
   Node* function_context_node = fork_closure->input()->node();
 
   if (function_context_node->inputs().size() != 2 ||
@@ -30,7 +30,7 @@ void inlineForkedClosure(Node* fork_closure) {
   Node* context = function_context_node->inputs().at(1)->node();
   auto fork_graph = function->g(attr::Subgraph)->copy();
   auto g = fork_closure->owningGraph();
-  Node* fork_node = g->create(prim::fork, 1)
+  Node* fork_node = g->create(genKind, 1)
                         ->insertAfter(fork_closure)
                         ->setSourceRange(fork_closure->sourceRange());
 
@@ -64,7 +64,10 @@ void inlineForkedClosures(Block* block) {
     it++;
     switch (n->kind()) {
       case prim::forkClosure: {
-        inlineForkedClosure(n);
+        inlineForkedClosure(n, prim::fork);
+      } break;
+      case prim::awaitableClosure: {
+        inlineForkedClosure(n, prim::awaitable);
       } break;
       default: {
         for (Block* b : n->blocks()) {
diff --git a/torch/csrc/jit/passes/inliner.cpp b/torch/csrc/jit/passes/inliner.cpp
index 3b012a1e3f1e..4fd808b7dc23 100644
--- a/torch/csrc/jit/passes/inliner.cpp
+++ b/torch/csrc/jit/passes/inliner.cpp
@@ -57,7 +57,7 @@ void inlineCalls(Block* block) {
           if (fallback && graphFunction->get_executor().isOptimized()) {
             auto exec_plans =
                 graphFunction->get_executor().getDebugState().execution_plans;
-            if (exec_plans.size() != 0) {
+            if (!exec_plans.empty()) {
               g = exec_plans.begin()->second.graph;
               // optimized_graph() calls Inline, so we only need to explicitly
               // invoke inlining on the jit optimized graph with recursive
diff --git a/torch/csrc/jit/passes/liveness.cpp b/torch/csrc/jit/passes/liveness.cpp
index 3b2cf54461f8..2aed7cbe3aab 100644
--- a/torch/csrc/jit/passes/liveness.cpp
+++ b/torch/csrc/jit/passes/liveness.cpp
@@ -68,7 +68,7 @@ struct LivenessAnalyzer {
       const std::unordered_map<Node*, std::vector<Value*>>& liveness_sets) {
     std::cout << "Liveness info:\n";
     for (auto e : liveness_sets) {
-      if (e.first->outputs().size() > 0) {
+      if (!e.first->outputs().empty()) {
         std::cout << e.first->outputs()[0]->debugName();
       }
 
diff --git a/torch/csrc/jit/passes/loop_unrolling.cpp b/torch/csrc/jit/passes/loop_unrolling.cpp
index 4d0ca10a2bc2..84dfb465f42f 100644
--- a/torch/csrc/jit/passes/loop_unrolling.cpp
+++ b/torch/csrc/jit/passes/loop_unrolling.cpp
@@ -172,7 +172,7 @@ void unroll(Node* loop) {
   // default one, because this will allow us to share it between the unrolled
   // loop and its epilogue. This is necessary only if the loop counter is
   // actually used in the body.
-  if (body->inputs()[0]->uses().size() > 0)
+  if (!body->inputs()[0]->uses().empty())
     replaceLoopCounter(loop);
 
   // Some optimization for constant-length loops. If we know they won't run too
diff --git a/torch/csrc/jit/passes/lower_graph.cpp b/torch/csrc/jit/passes/lower_graph.cpp
index b4da8216b5af..459c9edb58e6 100644
--- a/torch/csrc/jit/passes/lower_graph.cpp
+++ b/torch/csrc/jit/passes/lower_graph.cpp
@@ -63,7 +63,7 @@ std::pair<std::shared_ptr<Graph>, std::vector<Slot>> lower_graph(
   for (Use use : self_value->uses()) {
     to_scan.emplace_back(ToScan{self, use.user, use.offset});
   }
-  while (to_scan.size() > 0) {
+  while (!to_scan.empty()) {
     auto e = to_scan.back();
     to_scan.pop_back();
 
@@ -104,7 +104,7 @@ std::pair<std::shared_ptr<Graph>, std::vector<Slot>> lower_graph(
     e.n->destroy();
   }
 
-  while (to_clean.size() > 0) {
+  while (!to_clean.empty()) {
     Node* n = to_clean.back();
     AT_ASSERT(!n->hasUses());
     n->destroy();
diff --git a/torch/csrc/jit/passes/onnx/eval_peephole.cpp b/torch/csrc/jit/passes/onnx/eval_peephole.cpp
index 4b39e5288632..b7ff1b1c9ac3 100644
--- a/torch/csrc/jit/passes/onnx/eval_peephole.cpp
+++ b/torch/csrc/jit/passes/onnx/eval_peephole.cpp
@@ -63,7 +63,7 @@ static void fuseConvBatchNorm(Block* b, ValueToParamPairMap& valsToParamsMap) {
 
       auto epsilon = bnNode->f(attr::epsilon);
       auto convInputVals = getValues(oldConv, valsToParamsMap);
-      if (convInputVals.size() < 1 ||
+      if (convInputVals.empty() ||
           (oldConv->inputs().size() == 3 && convInputVals.size() != 2)) {
         continue;
       }
diff --git a/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp b/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp
index f25160260ea7..6e171e66bcf9 100644
--- a/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp
+++ b/torch/csrc/jit/passes/onnx/fixup_onnx_controlflow.cpp
@@ -74,7 +74,7 @@ bool IsErasableSequence(const Node* loop_node, size_t i) {
   const auto init_seq_node_kind = init_seq_node->kind();
   if ((init_seq_node_kind != ::c10::onnx::SequenceEmpty) &&
       (init_seq_node_kind != ::c10::prim::ListConstruct ||
-       init_seq_node->inputs().size() != 0)) {
+       !init_seq_node->inputs().empty())) {
     // Initial sequence must be empty.
     return false;
   }
diff --git a/torch/csrc/jit/passes/onnx/function_extraction.cpp b/torch/csrc/jit/passes/onnx/function_extraction.cpp
index b012825c371a..7d856852fedd 100644
--- a/torch/csrc/jit/passes/onnx/function_extraction.cpp
+++ b/torch/csrc/jit/passes/onnx/function_extraction.cpp
@@ -128,7 +128,7 @@ FunctionExtractor::FunctionContext::FunctionContext(
   GRAPH_UPDATE(
       "Process function context for scope ",
       scope_key_->name().toDisplayString());
-  TORCH_INTERNAL_ASSERT(scopes.size() > 0);
+  TORCH_INTERNAL_ASSERT(!scopes.empty());
   const auto& ref_ctx = scope_ctxs[scope_key_];
   // NOTE: Function scopes must have same number and order of nodes.
   GRAPH_DEBUG(
@@ -332,7 +332,7 @@ c10::optional<ScopePtr> FunctionExtractor::FindCommonAncestor(
 
 c10::optional<ScopePtr> FunctionExtractor::FindCommonAncestor(
     const scope_list& scopes) {
-  if (scopes.size() == 0) {
+  if (scopes.empty()) {
     return c10::nullopt;
   }
 
@@ -372,7 +372,7 @@ c10::optional<ScopePtr> FunctionExtractor::InferScope(Node* n) {
       output_scopes.emplace_back(use.user->scope());
     }
   }
-  if (output_scopes.size() > 0 &&
+  if (!output_scopes.empty() &&
       std::all_of(
           output_scopes.begin(),
           output_scopes.end(),
@@ -381,7 +381,7 @@ c10::optional<ScopePtr> FunctionExtractor::InferScope(Node* n) {
           })) {
     return output_scopes.at(0);
   } else if (
-      input_scopes.size() > 0 &&
+      !input_scopes.empty() &&
       std::all_of(
           input_scopes.begin(),
           input_scopes.end(),
@@ -401,7 +401,7 @@ c10::optional<ScopePtr> FunctionExtractor::InferScope(Node* n) {
         output_scopes.end(),
         std::back_inserter(scopes),
         IsValidScope);
-    if (scopes.size() > 0) {
+    if (!scopes.empty()) {
       auto common_ancestor = FindCommonAncestor(scopes);
       if (common_ancestor.has_value() &&
           IsValidScope(common_ancestor.value())) {
@@ -829,7 +829,7 @@ void FunctionExtractor::HandleNoScopeNodes(
         "ONNX function extraction cannot determine the scope for node: ", *n);
   }
   TORCH_INTERNAL_ASSERT(
-      no_scope_nlist.size() == 0,
+      no_scope_nlist.empty(),
       "ONNX function extraction cannot determine the scope for the above nodes.");
 }
 
diff --git a/torch/csrc/jit/passes/onnx/function_substitution.cpp b/torch/csrc/jit/passes/onnx/function_substitution.cpp
index a5dd1d879370..a6e2f89e106e 100644
--- a/torch/csrc/jit/passes/onnx/function_substitution.cpp
+++ b/torch/csrc/jit/passes/onnx/function_substitution.cpp
@@ -40,7 +40,7 @@ std::string GetCallNodeVariableName(const Node* call_node) {
     return "";
   }
   std::string module_name = module_node->s(attr::name);
-  if (module_node->inputs().size() == 0) {
+  if (module_node->inputs().empty()) {
     return module_name;
   }
   // If module is from container, attr::name in module node only carries
@@ -53,7 +53,7 @@ std::string GetCallNodeVariableName(const Node* call_node) {
             "__torch__.torch.nn.modules.container.ModuleList") {
       auto parent_module_node = parent_module_value->node();
       module_name = parent_module_node->s(attr::name) + "." + module_name;
-      parent_module_value = parent_module_node->inputs().size() > 0
+      parent_module_value = !parent_module_node->inputs().empty()
           ? parent_module_node->input(0)
           : nullptr;
     } else {
@@ -167,7 +167,7 @@ void functionCallSubstitution(Block* block) {
 }
 
 ScopePtr ONNXGraphTopLevelScope(Graph& graph) {
-  if (graph.inputs().size() == 0) {
+  if (graph.inputs().empty()) {
     return graph.current_scope();
   }
   if (auto top_module_type = graph.inputs().at(0)->type()->cast<ClassType>()) {
diff --git a/torch/csrc/jit/passes/onnx/helper.cpp b/torch/csrc/jit/passes/onnx/helper.cpp
index a1ea88ae6572..e2a67363ba36 100644
--- a/torch/csrc/jit/passes/onnx/helper.cpp
+++ b/torch/csrc/jit/passes/onnx/helper.cpp
@@ -245,7 +245,7 @@ void ONNXLintGraph(
       GRAPH_DEBUG("Node does not set sourceRange:", *n);
       n_miss_source_range.emplace_back(n->kind());
     }
-    if (n->scopeName() == "") {
+    if (n->scopeName().empty()) {
       GRAPH_DEBUG("Node does not set scope:", *n);
       n_miss_scope.emplace_back(n->kind());
     }
diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp
index d93e34f87c6e..25b97ef908ec 100644
--- a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp
+++ b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp
@@ -298,7 +298,7 @@ std::vector<Value*> ConvertIndexPutToONNX(
   // select operator(0).
   std::vector<Node*> slice_and_select_nodes =
       IndexingPatternFinder::FetchSliceAndSelect(index_put_node);
-  Node* last_node = slice_and_select_nodes.size() > 0
+  Node* last_node = !slice_and_select_nodes.empty()
       ? slice_and_select_nodes.back()
       : index_put_node;
   // Update inner block input originates from outside.
diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp
index 8ac466e26511..41e3ac9ecc4e 100644
--- a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp
+++ b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp
@@ -33,7 +33,7 @@ Node* EncapsulateInplaceIndexPutForONNX(Node* index_put_node) {
   // select operator(0).
   std::vector<Node*> slice_and_select_nodes =
       IndexingPatternFinder::FetchSliceAndSelect(index_put_node);
-  Node* last_node = slice_and_select_nodes.size() > 0
+  Node* last_node = !slice_and_select_nodes.empty()
       ? slice_and_select_nodes.back()
       : index_put_node;
   Value* orig_data = last_node->input(0);
diff --git a/torch/csrc/jit/passes/onnx/peephole.cpp b/torch/csrc/jit/passes/onnx/peephole.cpp
index 8fa08c110b6c..4814723621cc 100644
--- a/torch/csrc/jit/passes/onnx/peephole.cpp
+++ b/torch/csrc/jit/passes/onnx/peephole.cpp
@@ -194,7 +194,7 @@ void fuseConsecutiveTransposes(Block* b) {
           composeTransposes(
               origInput->node()->is(attr::perm), n->is(attr::perm)));
       n->replaceInput(0, origInput->node()->input());
-      if (origInput->uses().size() == 0) {
+      if (origInput->uses().empty()) {
         origInput->node()->destroy();
       }
       continue;
@@ -233,7 +233,7 @@ void fuseTransposeIntoGemm(Block* b) {
             inp->node()->is(attr::perm) == simpleTransPerm) {
           n->replaceInput(i, inp->node()->input());
           n->i_(trans, n->hasAttribute(trans) ? !n->i(trans) : 1);
-          if (inp->uses().size() == 0) {
+          if (inp->uses().empty()) {
             inp->node()->destroy();
           }
         }
@@ -307,7 +307,7 @@ void pushPackingPastRnn(Block* b) {
     n->outputs().at(0)->replaceAllUsesWith(n->inputs().at(0));
 
     Value* batch_sizes = n->outputs().at(1);
-    while (batch_sizes->uses().size()) {
+    while (!batch_sizes->uses().empty()) {
       Use use_0 = batch_sizes->uses().at(0);
       Node* user = use_0.user;
       // Make calculation of max_batch_size not depend on batch_sizes.
@@ -332,8 +332,13 @@ void pushPackingPastRnn(Block* b) {
         shape->addInput(rnn_input);
         shape->copyMetadata(n);
         batch_sizes->replaceFirstUseWith(shape->output());
-        user->inputs().at(1)->node()->t_(
-            attr::value, at::native::ones_like(const_val_t));
+        // New Constant node is needed, as it might be shared
+        // with a Constant node 0 from others.
+        Node* gather_indices = b->owningGraph()->create(onnx::Constant, 1);
+        gather_indices->t_(attr::value, at::native::ones_like(const_val_t));
+        gather_indices->copyMetadata(n);
+        gather_indices->insertBefore(user);
+        user->replaceInput(1, gather_indices->output());
       }
       // Make RNN not depend on batch_sizes.
       else if (user == rnn) {
@@ -526,7 +531,7 @@ void fixDefaultRNNState(
   fixed_init_state->addInput(concated_dims->outputs()[0]);
   n->replaceInput(input_index, fixed_init_state->outputs()[0]);
 
-  if (initial_state->uses().size() == 0) {
+  if (initial_state->uses().empty()) {
     initial_state->node()->destroy();
   }
 }
@@ -658,7 +663,7 @@ static void eraseListConstruct(Node* n, int opset_version) {
             i, std::vector<Value*>({concat_node->output()}));
       } else {
         if (opset_version >= OPSET_VERSION_11) {
-          c10::Symbol seq_node_kind = lc_node->inputs().size() > 0
+          c10::Symbol seq_node_kind = !lc_node->inputs().empty()
               ? onnx::SequenceConstruct
               : onnx::SequenceEmpty;
           Node* seq_node = block->owningGraph()->create(
@@ -855,7 +860,7 @@ static void fuseLogSoftmaxNllLoss(Block* b) {
         // (%10)
         origLogSoftmaxNode = prev->input(0)->node();
         auto transpose = origLogSoftmaxNode->input(0)->node();
-        if (transpose->inputs().size() > 0) {
+        if (!transpose->inputs().empty()) {
           origLogSoftmaxNode->replaceInput(0, transpose->inputs().at(0));
         }
       } else if (
diff --git a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp
index efb7686fae3f..08f415bb815a 100644
--- a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp
+++ b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp
@@ -294,7 +294,7 @@ static std::pair<Value*, Value*> PrepareListDeleteForONNX(Node* n) {
 
 static std::pair<Value*, Value*> PrepareListAppendAndInsertForONNX(Node* n) {
   TORCH_INTERNAL_ASSERT(n->kind() == aten::insert || n->kind() == aten::append);
-  if (n->outputs().size() == 0) {
+  if (n->outputs().empty()) {
     n->addOutput();
     n->output()->setType(n->inputs().at(0)->type());
   }
@@ -306,7 +306,7 @@ static std::pair<Value*, Value*> PrepareSetItemForONNX(Node* n) {
   // It seems the JIT does not always produce an output for _set_item.
   // In particular it seems to for list but not for dict.
   // So we add one if needed.
-  if (n->outputs().size() == 0) {
+  if (n->outputs().empty()) {
     n->addOutput();
     n->output()->setType(n->inputs().at(0)->type());
   }
diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
index 9f45f302b2eb..bd55886b1261 100644
--- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
+++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
@@ -198,7 +198,7 @@ bool IsValidONNXControlflowNode(const Node* n) {
   // nodes later, when the subgraph has already completed shape inferencing.
   auto node_kind = n->kind();
   if (node_kind == ::c10::onnx::Loop || node_kind == ::c10::onnx::If) {
-    if (n->blocks().size() == 0) {
+    if (n->blocks().empty()) {
       return false;
     }
   }
@@ -413,7 +413,7 @@ void ConvertGraphToONNXProto(
 }
 
 c10::optional<at::Tensor> ComputeConstantFolding(Node* n, int opset_version) {
-  if (n->inputs().size() == 0) {
+  if (n->inputs().empty()) {
     return c10::nullopt;
   }
   std::vector<at::Tensor> inputTensorValues;
@@ -958,7 +958,7 @@ void ProcessReshapeNode(Node* n, int opset_version) {
     auto static_shape_value =
         ConstantValueMap::GetValueInto1DInt64Vector(shape_name);
     auto symbolic_input_shape = ConstantValueMap::GetShape(input_name);
-    if (symbolic_input_shape && static_shape_value.size() > 0) {
+    if (symbolic_input_shape && !static_shape_value.empty()) {
       auto final_shape = ComputeShapeFromReshape(
           n,
           symbolic_input_shape.value(),
@@ -2061,10 +2061,8 @@ void ONNXShapeTypeInference(
         const char shape_err[] = "ShapeInferenceError";
         // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
         const char type_err[] = "TypeInferenceError";
-        // NOLINTNEXTLINE(modernize-use-nullptr)
-        if ((strstr(ex.what(), shape_err) == NULL) &&
-            // NOLINTNEXTLINE(modernize-use-nullptr)
-            (strstr(ex.what(), type_err) == NULL)) {
+        if ((strstr(ex.what(), shape_err) == nullptr) &&
+            (strstr(ex.what(), type_err) == nullptr)) {
           throw;
         }
       }
@@ -2212,7 +2210,8 @@ size_t ONNXAssignOutputShape(
     size_t outputs_index,
     PyObject* output_obj,
     bool onnx_shape_inference,
-    bool is_script) {
+    bool is_script,
+    int opset_version) {
   auto index_check = [&]() {
     TORCH_INTERNAL_ASSERT(
         outputs_index <= graph->outputs().size(),
@@ -2234,7 +2233,8 @@ size_t ONNXAssignOutputShape(
           outputs_index,
           PyTuple_GET_ITEM(output_obj, i),
           onnx_shape_inference,
-          is_script);
+          is_script,
+          opset_version);
     }
   } else if (PyList_Check(output_obj)) {
     const auto list_len = PyList_GET_SIZE(output_obj);
@@ -2282,7 +2282,8 @@ size_t ONNXAssignOutputShape(
             outputs_index,
             PyList_GET_ITEM(output_obj, i),
             onnx_shape_inference,
-            is_script);
+            is_script,
+            opset_version);
       }
     }
   } else if (PyDict_Check(output_obj)) {
@@ -2298,7 +2299,8 @@ size_t ONNXAssignOutputShape(
           outputs_index,
           PyList_GET_ITEM(unrolled_dict.ptr(), i),
           onnx_shape_inference,
-          is_script);
+          is_script,
+          opset_version);
     }
   } else if (THPUtils_checkString(output_obj)) {
     // Ignore string, since they are not supported as output in ONNX.
@@ -2320,7 +2322,12 @@ size_t ONNXAssignOutputShape(
     // contain None objects. Ideally we'd remove this difference.
     if (is_script && outputs_index < graph->outputs().size()) {
       if (graph->outputs().at(outputs_index)->node()->mustBeNone()) {
-        graph->eraseOutput(outputs_index);
+        if (opset_version >= 15) {
+          ReplaceGraphOutputNoneWithOptional(graph, outputs_index);
+          outputs_index++;
+        } else {
+          graph->eraseOutput(outputs_index);
+        }
       } else {
         outputs_index++;
       }
@@ -2338,18 +2345,47 @@ size_t ONNXAssignOutputShape(
   return outputs_index;
 }
 
+Node* ONNXOptionalNodeForNone(std::shared_ptr<Graph>& graph) {
+  TypePtr elem_type = TensorType::get()->withScalarType(at::ScalarType::Float);
+  Node* opt_node = graph->create(::c10::onnx::Optional, 1);
+  opt_node->ty_(Symbol::attr("type"), elem_type);
+  opt_node->output()->setType(OptionalType::create(elem_type));
+  return opt_node;
+}
+
+void ReplaceGraphOutputNoneWithOptional(
+    std::shared_ptr<Graph>& graph,
+    size_t outputs_index) {
+  Node* opt_node = ONNXOptionalNodeForNone(graph);
+  opt_node->insertBefore(graph->return_node());
+  Value* graph_output = graph->outputs().at(outputs_index);
+  // replace only the last value as Optional type only affects
+  // the value right before output
+  graph_output->replaceAllUsesAfterNodeWith(opt_node, opt_node->output());
+  if (!graph_output->type()->cast<NoneType>()) {
+    opt_node->addInput(graph_output);
+    opt_node->copyMetadata(graph_output->node());
+  }
+}
+
 void ONNXAssignOutputShape(
     std::shared_ptr<Graph>& graph,
     at::ArrayRef<at::Tensor> outputs,
     const python::IODescriptor& desc,
     bool onnx_shape_inference,
-    bool is_script) {
+    bool is_script,
+    int opset_version) {
   size_t outputs_index = 0;
   PyObject* py_obj = unflatten(outputs, desc);
   TORCH_INTERNAL_ASSERT(PyTuple_Check(py_obj));
 
   outputs_index = ONNXAssignOutputShape(
-      graph, outputs_index, py_obj, onnx_shape_inference, is_script);
+      graph,
+      outputs_index,
+      py_obj,
+      onnx_shape_inference,
+      is_script,
+      opset_version);
 
   TORCH_INTERNAL_ASSERT(
       outputs_index == graph->outputs().size(),
diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.h b/torch/csrc/jit/passes/onnx/shape_type_inference.h
index 39350ed273d4..03e927a01bff 100644
--- a/torch/csrc/jit/passes/onnx/shape_type_inference.h
+++ b/torch/csrc/jit/passes/onnx/shape_type_inference.h
@@ -56,7 +56,17 @@ TORCH_API void ONNXAssignOutputShape(
     at::ArrayRef<at::Tensor> outputs,
     const python::IODescriptor& desc,
     bool onnx_shape_inference,
-    bool is_script);
+    bool is_script,
+    int opset_version);
+
+// Replace None in output with Optional node (opset > 15) if it's
+// script model. This helps align the output format in ONNX internal tests
+// when comparing pytorch results with ONNX results, as they have different
+// process for None in output.
+void ReplaceGraphOutputNoneWithOptional(
+    std::shared_ptr<Graph>& graph,
+    size_t outputs_index);
+Node* ONNXOptionalNodeForNone(std::shared_ptr<Graph>& graph);
 
 // Utilize ONNX Shape Inference for node.
 // The node must have ONNX namespace, and is valid ONNX node according to spec.
diff --git a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp
index 300e3452a8d1..d46517c0187d 100644
--- a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp
+++ b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp
@@ -16,7 +16,6 @@
 #include <stack>
 
 using ::c10::Dispatcher;
-using ::c10::DispatchKey;
 namespace torch {
 namespace jit {
 namespace onnx {
@@ -211,7 +210,7 @@ std::vector<Node*> CreateQuantizedWeights(
   zero_point_node->t_(Symbol::attr("value"), zero_point_value.clone());
 
   Node* axis_node = graph->create(prim::Constant);
-  if (axis_data.size() > 0) {
+  if (!axis_data.empty()) {
     auto axis_value =
         at::from_blob(
             axis_data.data(), c10::IntArrayRef(axis_data.size()), at::kLong)
diff --git a/torch/csrc/jit/passes/pass_manager.h b/torch/csrc/jit/passes/pass_manager.h
index 111cb116dd41..8585c6ecdb3d 100644
--- a/torch/csrc/jit/passes/pass_manager.h
+++ b/torch/csrc/jit/passes/pass_manager.h
@@ -68,7 +68,7 @@ using RegisterPass = RegisterPostPass;
  * types.
  */
 template <typename DerivedType>
-struct TORCH_API PassManager {
+struct C10_EXPORT PassManager {
  private:
   // We want this class to be abstract because it's
   virtual void abstract() = 0;
diff --git a/torch/csrc/jit/passes/peephole_alias_sensitive.cpp b/torch/csrc/jit/passes/peephole_alias_sensitive.cpp
index 4c656eee4402..153975fddb50 100644
--- a/torch/csrc/jit/passes/peephole_alias_sensitive.cpp
+++ b/torch/csrc/jit/passes/peephole_alias_sensitive.cpp
@@ -52,7 +52,7 @@ struct PeepholeOptimizeAliasSensitiveImpl {
         auto dim_uses = c10::filter(node->output()->uses(), [](const Use& use) {
           return use.user->kind() == aten::dim;
         });
-        if (dim_uses.size() == 0) {
+        if (dim_uses.empty()) {
           continue;
         }
         auto kind = node->kind();
diff --git a/torch/csrc/jit/passes/peephole_dict_idioms.cpp b/torch/csrc/jit/passes/peephole_dict_idioms.cpp
index b3b4ed3d4044..4e2a56a9d06b 100644
--- a/torch/csrc/jit/passes/peephole_dict_idioms.cpp
+++ b/torch/csrc/jit/passes/peephole_dict_idioms.cpp
@@ -235,7 +235,7 @@ class PeepholeOptimizeDictIdiomsImpl {
       }
 
       // only optimizing dict ops
-      if (node->inputs().size() == 0 || !isDict(node->input(0))) {
+      if (node->inputs().empty() || !isDict(node->input(0))) {
         continue;
       }
 
diff --git a/torch/csrc/jit/passes/peephole_list_idioms.cpp b/torch/csrc/jit/passes/peephole_list_idioms.cpp
index 7a06b33409a7..2201347526f6 100644
--- a/torch/csrc/jit/passes/peephole_list_idioms.cpp
+++ b/torch/csrc/jit/passes/peephole_list_idioms.cpp
@@ -37,7 +37,7 @@ struct ListLenRefiner {
   bool run() {
     std::unordered_set<Value*> li_with_len_use;
     collectListsToRefine(graph_->block(), li_with_len_use);
-    if (lists_to_refine_.size() == 0) {
+    if (lists_to_refine_.empty()) {
       return false;
     }
     ListRefinement refinements;
@@ -239,7 +239,7 @@ struct PeepholeOptimizeListIdiomsImpl {
       }
 
       // only optimizing list ops
-      if (node->inputs().size() == 0 ||
+      if (node->inputs().empty() ||
           !node->input(0)->type()->castRaw<ListType>()) {
         continue;
       }
diff --git a/torch/csrc/jit/passes/quantization/dedup_module_uses.cpp b/torch/csrc/jit/passes/quantization/dedup_module_uses.cpp
index ede7b3cae982..65e900d3888a 100644
--- a/torch/csrc/jit/passes/quantization/dedup_module_uses.cpp
+++ b/torch/csrc/jit/passes/quantization/dedup_module_uses.cpp
@@ -48,7 +48,7 @@ class ModuleUseDeduper {
 
         // path.size() == 0 means we're calling a method
         // on self, we don't need to dedup uses of self
-        if (path.size() == 0) {
+        if (path.empty()) {
           continue;
         }
         value_to_path_map_[instance] = path;
@@ -88,7 +88,7 @@ class ModuleUseDeduper {
       const Module& child_module,
       const std::vector<std::string>& path) {
     TORCH_INTERNAL_ASSERT(
-        path.size() > 0, "path must have at least one element.");
+        !path.empty(), "path must have at least one element.");
     // Parent module of the leaf child module corresponding to
     // the path
     auto parent_of_leaf = findChildModule(
diff --git a/torch/csrc/jit/passes/quantization/insert_observers.cpp b/torch/csrc/jit/passes/quantization/insert_observers.cpp
index 526aa8069569..70946f516460 100644
--- a/torch/csrc/jit/passes/quantization/insert_observers.cpp
+++ b/torch/csrc/jit/passes/quantization/insert_observers.cpp
@@ -63,7 +63,7 @@ void fillQConfigMap(
 
   for (const NameModule& s : module.named_children()) {
     std::string child_key;
-    if (key == "") {
+    if (key.empty()) {
       child_key = s.name;
     } else {
       child_key = key + "." + s.name;
@@ -1562,7 +1562,7 @@ InsertObserversHelper::insertObserversFor(
             subblock_output_observe_state.push_back(
                 isObserved(output, block_observed_values));
           }
-          if (aggregated_output_observe_state.size() > 0) {
+          if (!aggregated_output_observe_state.empty()) {
             TORCH_CHECK(
                 aggregated_output_observe_state ==
                     subblock_output_observe_state,
diff --git a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
index 1974dda885bd..88647f315d80 100644
--- a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
+++ b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
@@ -504,7 +504,7 @@ void ReplicateChooseQParamsQuantDequant(std::shared_ptr<Graph>& graph) {
   const Graph& dynamic_quant_graph = *dynamic_quant_pattern.pattern_graph;
 
   const auto& matches = findPatternMatches(dynamic_quant_graph, *graph);
-  if (matches.size() == 0) {
+  if (matches.empty()) {
     return;
   }
 
@@ -1250,7 +1250,7 @@ void removeDequantizeFromInputs(const std::unordered_set<Value*>& inputs) {
 // output
 c10::optional<std::vector<Value*>> getDequantizedInputs(Value* output) {
   auto inputs = getPassThroughInputs(output);
-  if (inputs.size() > 0) {
+  if (!inputs.empty()) {
     // note that we don't need to recursively check for prim::If
     // here because if all inputs of a prim::If is dequantized
     // the dequantize will be factored out before we get to this
@@ -1278,7 +1278,7 @@ void InsertQuantDeQuantHelper::propagateQuantizationOps(Block* block) {
       for (Block* subblock : n->blocks()) {
         propagateQuantizationOps(subblock);
       }
-      if (n->outputs().size() == 0) {
+      if (n->outputs().empty()) {
         continue;
       }
       if (n->outputs().size() > 1) {
@@ -1430,7 +1430,7 @@ void InsertQuantDeQuantHelper::run(
       auto qparam_map = std::get<1>(tp);
       // We check the size here because for some observers (like
       // PlaceholderObserver) the qparams might be empty.
-      if (qparam_map.size() > 0) {
+      if (!qparam_map.empty()) {
         TORCH_INTERNAL_ASSERT(
             qparam_name_map_for_node_.count(n),
             "Expected to have a qparam_name_map for node:",
diff --git a/torch/csrc/jit/passes/remove_mutation.cpp b/torch/csrc/jit/passes/remove_mutation.cpp
index 3898aabedceb..d610540e2cbc 100644
--- a/torch/csrc/jit/passes/remove_mutation.cpp
+++ b/torch/csrc/jit/passes/remove_mutation.cpp
@@ -15,7 +15,7 @@ bool MutationRemover::removeTensorMutation() {
 bool MutationRemover::hasSideEffectOrAlias(Value* v, AliasDb* aliasDb) {
   // bail on nodes with side effects, blocks, or graph / graph inputs
   Node* n = v->node();
-  bool unhandled_node = n->blocks().size() != 0 ||
+  bool unhandled_node = !n->blocks().empty() ||
       n->hasAttribute(attr::Subgraph) || n->hasSideEffects() ||
       (v->node()->kind() == prim::Param);
 
@@ -210,7 +210,7 @@ bool MutationRemover::RemoveListMutation(Block* block) {
     }
 
     // process use-chain and aliasing of node output
-    bool has_output = (node->outputs().size() > 0);
+    bool has_output = (!node->outputs().empty());
     if (has_output) {
       node->output()->replaceAllUsesWith(mutated_value);
       getOrCreateAliasDb()->writeIndex_->erase(node);
@@ -339,7 +339,7 @@ bool MutationRemover::inplaceOpVariant(Node* n) {
   // all inplace ops at time of writing have a single input that is mutated
   // and returned. check that this is true, anything else could have strange
   // semantics,
-  if (n->outputs().size() != 1 || n->inputs().size() == 0) {
+  if (n->outputs().size() != 1 || n->inputs().empty()) {
     return false;
   }
   auto inputs = n->inputs();
@@ -350,7 +350,7 @@ bool MutationRemover::inplaceOpVariant(Node* n) {
   }
 
   auto new_schema = name.substr(0, name.size() - 1);
-  return getAllOperatorsFor(Symbol::fromQualString(new_schema)).size() != 0;
+  return !getAllOperatorsFor(Symbol::fromQualString(new_schema)).empty();
 }
 
 bool RemoveListMutation(const std::shared_ptr<Graph>& graph) {
diff --git a/torch/csrc/jit/passes/specialize_autogradzero.cpp b/torch/csrc/jit/passes/specialize_autogradzero.cpp
index 2f72257f064f..5cc0bfe0ce0d 100644
--- a/torch/csrc/jit/passes/specialize_autogradzero.cpp
+++ b/torch/csrc/jit/passes/specialize_autogradzero.cpp
@@ -240,7 +240,7 @@ struct AutogradZeroSpecializer {
         continue;
       }
 
-      if (inp->uses().size() == 0 || !inp->type()->cast<TensorType>()) {
+      if (inp->uses().empty() || !inp->type()->cast<TensorType>()) {
         continue;
       }
 
@@ -265,7 +265,7 @@ struct AutogradZeroSpecializer {
     }
     GRAPH_DUMP("After for loop", graph_);
     // unable to specialize any of the inputs
-    if (nonzero_values.size() == 0 && zero_values.size() == 0) {
+    if (nonzero_values.empty() && zero_values.empty()) {
       GRAPH_DUMP("Unable to add any specialization guards", graph_);
       versioning_if->destroy();
       // the checks we inserted will be cleaned up
@@ -367,7 +367,7 @@ struct AutogradZeroSpecializer {
           // if we decided to specialize this graph
           // its input may have undefinedness info
           // otherwise it should be Unknown
-          if (n->inputs().size() > 0) {
+          if (!n->inputs().empty()) {
             state_[n->output()] = !state_.count(n->input())
                 ? State::Unknown
                 : state_[n->output()] = state_[n->input()];
diff --git a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp
index a45f8d32b170..6d84ef43ba16 100644
--- a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp
+++ b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp
@@ -317,7 +317,7 @@ struct SymbolicShapeOpAnalyzer {
 
   // We handle non-constant values in the shape propagation step
   void substituteConstantInputs() {
-    if (shape_compute_graph_->inputs().size() == 0) {
+    if (shape_compute_graph_->inputs().empty()) {
       return;
     }
 
diff --git a/torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp b/torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp
index 7a1d2caedb18..4e63376850ef 100644
--- a/torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp
+++ b/torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp
@@ -322,7 +322,7 @@ void inlineFallbackGraphAndAddSRCopyOutOp(std::shared_ptr<Graph> graph) {
   auto false_block = if_v.elseBlock();
   std::vector<Value*> false_block_outputs(
       if_v.elseOutputs().begin(), if_v.elseOutputs().end());
-  TORCH_INTERNAL_ASSERT(false_block_outputs.size() != 0);
+  TORCH_INTERNAL_ASSERT(!false_block_outputs.empty());
 
   for (auto out : false_block_outputs) {
     TORCH_INTERNAL_ASSERT(out->type()->cast<TensorType>());
@@ -500,7 +500,7 @@ Operation StaticRuntimeCopyOuts(const Node* node) {
   return [num_ten_inputs](Stack& stack) {
     std::vector<IValue> inputs = pop(stack, num_ten_inputs);
     // uncommon case - first run
-    if (stack.size() == 0) {
+    if (stack.empty()) {
       for (IValue elem : inputs) {
         push(stack, std::move(elem));
       }
@@ -550,7 +550,7 @@ RegisterOperators reg_guard({
 
           // Map from symbolic dimension value to its set's index
           std::map<int64_t, size_t> sym_dim_flat_index;
-          TORCH_INTERNAL_ASSERT(types.size() >= 1);
+          TORCH_INTERNAL_ASSERT(!types.empty());
 
           // we should just be fusing fusion groups with a single device
           // and with tensors not requiring grad
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
index f0608a8aeba5..ba9c5380681b 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
@@ -325,7 +325,7 @@ void insertTypeGuard(
     guard_types.emplace_back(
         type_converter(input->type()->expect<TensorType>()));
   }
-  if (!inputs_to_check.size()) {
+  if (inputs_to_check.empty()) {
     return;
   }
 
@@ -693,7 +693,7 @@ class TensorExprFuser {
     }
 
     Node* prev_fusion_group =
-        initial_fusion_groups.size() ? initial_fusion_groups[0] : nullptr;
+        !initial_fusion_groups.empty() ? initial_fusion_groups[0] : nullptr;
 
     for (const auto i : c10::irange(1, initial_fusion_groups.size())) {
       // Try merging the just created fusion group into the previous one.
@@ -1315,7 +1315,7 @@ class TensorExprFuser {
 
     std::string line;
     while (std::getline(in_ss, line, ':')) {
-      if (line.size() == 0) {
+      if (line.empty()) {
         continue;
       }
       operators_not_to_fuse.insert(c10::Symbol::aten(line));
diff --git a/torch/csrc/jit/passes/utils/memory_dag.cpp b/torch/csrc/jit/passes/utils/memory_dag.cpp
index d8eef5af852c..9d57f49bfe3d 100644
--- a/torch/csrc/jit/passes/utils/memory_dag.cpp
+++ b/torch/csrc/jit/passes/utils/memory_dag.cpp
@@ -102,7 +102,7 @@ void MemoryDAG::collectAllContainedMemoryLocationsImpl(
 bool MemoryDAG::mayContainAlias(
     const Element* a,
     const at::ArrayRef<Element*> b) const {
-  if (b.size() == 0) {
+  if (b.empty()) {
     return false;
   }
 
@@ -115,7 +115,7 @@ bool MemoryDAG::mayContainAlias(
 bool MemoryDAG::mayContainAlias(
     const at::ArrayRef<Element*> a,
     const at::ArrayRef<Element*> b) const {
-  if (a.size() == 0 || b.size() == 0) {
+  if (a.empty() || b.empty()) {
     return false;
   }
 
diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.cpp b/torch/csrc/jit/passes/utils/subgraph_utils.cpp
index fc2140495d8f..adf63bb6244e 100644
--- a/torch/csrc/jit/passes/utils/subgraph_utils.cpp
+++ b/torch/csrc/jit/passes/utils/subgraph_utils.cpp
@@ -263,7 +263,7 @@ void collectNestedUses(
       collectNestedUses(
           closed_over_values, new_values, externalValuesMap, node);
     }
-  } else if (input_node->blocks().size() != 0) {
+  } else if (!input_node->blocks().empty()) {
     TORCH_INTERNAL_ASSERT(false, input_node, " kind not handled yet");
   }
   for (Value* output : input_node->outputs()) {
diff --git a/torch/csrc/jit/passes/vulkan_rewrite.cpp b/torch/csrc/jit/passes/vulkan_rewrite.cpp
index 0c37d5b50347..7ac76b032caa 100644
--- a/torch/csrc/jit/passes/vulkan_rewrite.cpp
+++ b/torch/csrc/jit/passes/vulkan_rewrite.cpp
@@ -105,7 +105,8 @@ void transferInputOutputBackends(std::shared_ptr<Graph>& graph) {
   // Move inputs to Vulkan backend
   for (Value* input : graph->inputs()) {
     NamedValue named_input = NamedValue("", input);
-    if (named_input.type()->kind() == TypeKind::TensorType) {
+    if (named_input.type()->kind() == TypeKind::TensorType &&
+        !input->uses().empty()) {
       // find the insertion point
       WithInsertPoint ip(input->uses()[0].user->prev());
       Value* replaced_input = graph->insert(
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index fed555c8cd7e..1ec6a444e8c0 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -7,7 +7,7 @@
 #include <torch/csrc/jit/api/module.h>
 #include <torch/csrc/jit/backends/backend_init.h>
 #include <torch/csrc/jit/codegen/cuda/interface.h>
-#include <torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.h>
+// #include <torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.h>
 #include <torch/csrc/jit/codegen/fuser/interface.h>
 #include <torch/csrc/jit/codegen/fuser/kernel_cache.h>
 #if (!defined(FBCODE_CAFFE2) && defined(BUILD_ONEDNN_GRAPH))
@@ -1151,6 +1151,14 @@ void initJITBindings(PyObject* module) {
 #define SYMNODE_UNARY(n) .def(#n, [](c10::SymNode a) { return a->n(); })
 #define SYMNODE_BINARY(n) \
   .def(#n, [](c10::SymNode a, c10::SymNode b) { return a->n(b); })
+#define SYMNODE_SIZES_STRIDES(n)                \
+  .def(                                         \
+      #n,                                       \
+      [](c10::SymNode a,                        \
+         c10::ArrayRef<c10::SymNode> sizes,     \
+         c10::ArrayRef<c10::SymNode> strides) { \
+        return a->n(sizes, strides);            \
+      })
   auto symnode_class =
       py::class_<c10::SymNodeImpl, c10::SymNode>(m, "_SymNode")
       // clang-format off
@@ -1184,12 +1192,14 @@ void initJITBindings(PyObject* module) {
       SYMNODE_UNARY(ceil)
       SYMNODE_UNARY(floor)
       SYMNODE_UNARY(neg)
+      SYMNODE_SIZES_STRIDES(is_contiguous)
+      SYMNODE_SIZES_STRIDES(is_channels_last_contiguous_2d)
+      SYMNODE_SIZES_STRIDES(is_channels_last_contiguous_3d)
+      SYMNODE_SIZES_STRIDES(is_channels_last_strides_2d)
+      SYMNODE_SIZES_STRIDES(is_channels_last_strides_3d)
+      SYMNODE_SIZES_STRIDES(is_non_overlapping_and_dense)
       // Intentionally don't set file line, as the
       // Python backtrace matters more here
-      .def("is_non_overlapping_and_dense",
-          [](c10::SymNode a, c10::ArrayRef<c10::SymNode> sizes, c10::ArrayRef<c10::SymNode> strides) {
-            return a->is_non_overlapping_and_dense(sizes, strides);
-          })
       .def(
           "guard_int",
           [](c10::SymNode a) {
@@ -1205,6 +1215,11 @@ void initJITBindings(PyObject* module) {
           [](c10::SymNode a) {
             return a->guard_float(nullptr, 0);
           })
+      .def(
+          "has_hint",
+          [](c10::SymNode a) {
+            return a->has_hint();
+          })
       .def(
           "wrap_int",
           [](c10::SymNode a, int64_t b) {
@@ -1865,6 +1880,44 @@ void initJITBindings(PyObject* module) {
                 return nullptr;
               }),
           py::call_guard<py::gil_scoped_release>());
+
+  py::class_<PythonAwaitWrapper, std::shared_ptr<PythonAwaitWrapper>>(
+      m, "_Await")
+      .def(
+          "wait",
+          &PythonAwaitWrapper::wait,
+          py::call_guard<py::gil_scoped_release>())
+      .def("fn", &PythonAwaitWrapper::fn)
+      .def("args", &PythonAwaitWrapper::args)
+      .def("type", &PythonAwaitWrapper::type)
+      .def("is_nowait", &PythonAwaitWrapper::is_nowait)
+      .def(
+          "__getattr__",
+          [](PythonAwaitWrapper& self, const std::string& name) -> py::object {
+            // In eager mode allow Await[W] to be used as W, redirecting getattr
+            // to the result of delayed function.
+            return py::getattr(self.wait(), name.c_str(), py::none());
+          })
+      .def(
+          py::pickle(
+              /* __getstate__ */
+              [](const PythonAwaitWrapper& /* unused */) {
+                TORCH_CHECK(false, "Can not pickle torch.jit._Await");
+                // Note that this return has no meaning since we always
+                // throw, it's only here to satisfy Pybind API's
+                // requirement.
+                return py::make_tuple();
+              },
+              /* __setstate__ */
+              [](const py::tuple& /* unused */) { // NOLINT
+                TORCH_CHECK(false, "Can not unpickle torch.jit._Await");
+                // Note that this return has no meaning since we always
+                // throw, it's only here to satisfy PyBind's API
+                // requirement.
+                return nullptr;
+              }),
+          py::call_guard<py::gil_scoped_release>());
+
   m.def("_is_alias_of", [](const py::object& self, const py::object& other) {
     c10::optional<IValue> self_value = toTypeInferredIValueOptional(self);
     c10::optional<IValue> other_value = toTypeInferredIValueOptional(other);
@@ -1885,8 +1938,25 @@ void initJITBindings(PyObject* module) {
     }
     return self_value->overlaps(*other_value);
   });
-  m.def("fork", [](const py::args& args, const py::kwargs& kwargs) {
+  m.def("_awaitable", [](const py::args& args, const py::kwargs& kwargs) {
     AT_ASSERT(args.size() >= 1);
+    py::tuple args_tup(args.size() - 1);
+    for (const auto i : c10::irange(1, args.size())) {
+      args_tup[i - 1] = args[i];
+    }
+    return std::make_shared<PythonAwaitWrapper>(
+        py::cast<py::function>(args[0]), std::move(args_tup));
+  });
+  m.def("_awaitable_nowait", [](py::handle input) {
+    return std::make_shared<PythonAwaitWrapper>(std::move(input));
+  });
+  m.def(
+      "_awaitable_wait", [](const std::shared_ptr<PythonAwaitWrapper>& py_aw) {
+        TORCH_CHECK(py_aw, "Await can't be None");
+        return py_aw->wait();
+      });
+  m.def("fork", [](const py::args& args, const py::kwargs& kwargs) {
+    AT_ASSERT(!args.empty());
 
     py::function f = py::cast<py::function>(args[0]);
     py::tuple args_tup(args.size() - 1);
@@ -1997,7 +2067,7 @@ void initJITBindings(PyObject* module) {
   initJitBackendBindings(module);
   initStaticModuleBindings(module);
   initTensorExprBindings(module);
-  initNvFuserPythonBindings(module);
+  // initNvFuserPythonBindings(module);
 
   setPrintHandler([](const std::string& str) {
     py::gil_scoped_acquire acquire;
diff --git a/torch/csrc/jit/python/pybind_utils.cpp b/torch/csrc/jit/python/pybind_utils.cpp
index ef88d700f113..221753ddc3f8 100644
--- a/torch/csrc/jit/python/pybind_utils.cpp
+++ b/torch/csrc/jit/python/pybind_utils.cpp
@@ -1,3 +1,4 @@
+#include <torch/csrc/jit/ir/graph_utils.h>
 #include <torch/csrc/jit/python/module_python.h>
 #include <torch/csrc/jit/python/pybind_utils.h>
 #include <torch/csrc/jit/python/python_dict.h>
@@ -225,7 +226,7 @@ IValue toIValue(py::handle obj, const TypePtr& type, c10::optional<int32_t> N) {
       auto stream = c10::Stream::unpack3(
           thp_stream->stream_id,
           thp_stream->device_index,
-          thp_stream->device_type);
+          static_cast<c10::DeviceType>(thp_stream->device_type));
       return stream;
     }
     case TypeKind::ListType: {
@@ -470,6 +471,9 @@ IValue toIValue(py::handle obj, const TypePtr& type, c10::optional<int32_t> N) {
     case TypeKind::FutureType: {
       return obj.cast<std::shared_ptr<PythonFutureWrapper>>()->fut;
     }
+    case TypeKind::AwaitType: {
+      return obj.cast<std::shared_ptr<PythonAwaitWrapper>>()->aw_;
+    }
     case TypeKind::AnyType:
       return toTypeInferredIValue(obj);
     case TypeKind::QSchemeType: {
@@ -570,7 +574,7 @@ py::object toPyObject(IValue ivalue) {
 
     // If we have a NamedTuple
     if (tuple->type() && tuple->type()->schema() &&
-        tuple->type()->schema()->name() != "") {
+        !tuple->type()->schema()->name().empty()) {
       auto unqualName = tuple->type()->name()->name();
 
       const std::vector<Argument>& tuple_args =
@@ -646,6 +650,8 @@ py::object toPyObject(IValue ivalue) {
     return py::cast(c10::Capsule(ivalue.toCapsule()));
   } else if (ivalue.isFuture()) {
     return py::cast(std::make_shared<PythonFutureWrapper>(ivalue.toFuture()));
+  } else if (ivalue.isAwait()) {
+    return py::cast(std::make_shared<PythonAwaitWrapper>(ivalue.toAwait()));
   } else if (ivalue.isEnum()) {
     auto enum_holder = ivalue.toEnumHolder();
     auto py_class = getScriptedClassOrError(enum_holder->type());
@@ -758,7 +764,7 @@ py::object _get_operation_for_overload_or_packet(
         total_arg_num,
         false /* throw_error */);
   }
-  if (overloaded_args.size() > 0 || at::impl::torch_function_mode_enabled()) {
+  if (!overloaded_args.empty() || at::impl::torch_function_mode_enabled()) {
     py::object ret;
     std::string ns = symbol.ns().toUnqualString();
     std::string method_name = symbol.toUnqualString();
@@ -768,7 +774,7 @@ py::object _get_operation_for_overload_or_packet(
                          .attr(method_name.c_str());
     if (is_overload) {
       auto overload_name = operations[0]->schema().overload_name();
-      if (overload_name == "") {
+      if (overload_name.empty()) {
         self_func = self_func.attr("default");
       } else {
         self_func = self_func.attr(overload_name.c_str());
diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h
index f1872994d859..715536952990 100644
--- a/torch/csrc/jit/python/pybind_utils.h
+++ b/torch/csrc/jit/python/pybind_utils.h
@@ -59,12 +59,12 @@ namespace jit {
 
 void clear_registered_instances(void* ptr);
 
-TORCH_API IValue toIValue(
+TORCH_PYTHON_API IValue toIValue(
     py::handle obj,
     const TypePtr& type,
     c10::optional<int32_t> N = c10::nullopt);
 
-TORCH_API py::object toPyObject(IValue ivalue);
+TORCH_PYTHON_API py::object toPyObject(IValue ivalue);
 
 // Hack to overload the behavior of toIValue to accept Python
 // numbers in places where a Tensor is expected
@@ -240,6 +240,79 @@ struct VISIBILITY_HIDDEN PythonFutureWrapper
   }
 };
 
+// The PythonAwaitWrapper for ivalue::Await
+//
+// Expresses delayed function execution with Lazy semantic.
+// i.e. Await[W] in eager mode can be used as W.
+// When the attribute of W type is requested, Await[W] will return the
+// attribute of W, transparently calling wait() beforehand.
+// No Lazy semantic for script, explicit wait(Await[W]) -> W must be called to
+// convert to type W.
+//
+// The Await object takes shared ownership of specified function and the
+// arguments. After first call for wait() it owns the result. Deliberately no
+// type inference for eager mode.
+struct VISIBILITY_HIDDEN PythonAwaitWrapper
+    : std::enable_shared_from_this<PythonAwaitWrapper> {
+  explicit PythonAwaitWrapper(c10::intrusive_ptr<c10::ivalue::Await> aw)
+      : aw_(std::move(aw)) {}
+  explicit PythonAwaitWrapper(py::handle input) {
+    args_ = py::tuple(1u);
+    args_[0] = input;
+    auto type = PyObjectType::get();
+    aw_ = c10::make_intrusive<c10::ivalue::Await>(type);
+    aw_->markCompleted(toIValue(input, type));
+  }
+
+  explicit PythonAwaitWrapper(py::function pf, py::tuple args) {
+    pyfg_ = std::make_shared<torch::jit::PythonFunctionGuard>(std::move(pf));
+    args_ = std::move(args);
+    std::function<IValue()> f = [fg(pyfg_), &args(args_)]() {
+      pybind11::gil_scoped_acquire ag;
+      return toIValue(fg->func_(*args), PyObjectType::get());
+    };
+    aw_ = c10::make_intrusive<c10::ivalue::Await>(
+        PyObjectType::get(), std::move(f));
+  }
+
+  explicit PythonAwaitWrapper(const PythonAwaitWrapper&) = delete;
+  PythonAwaitWrapper& operator=(const PythonAwaitWrapper&) = delete;
+
+  py::object wait() {
+    py::gil_scoped_acquire acquire;
+    return toPyObject(aw_->wait());
+  }
+
+  // Nowait semantic means trivial case when Await is constructed from the
+  // result
+  bool is_nowait() {
+    return pyfg_ == nullptr;
+  }
+
+  const py::function fn() {
+    TORCH_CHECK(
+        pyfg_, "Await constructed as awaitable_nowait does not have fn");
+    return pyfg_->func_;
+  }
+
+  const py::tuple args() {
+    return args_;
+  }
+
+  TypePtr type() {
+    return aw_->type();
+  }
+
+  c10::intrusive_ptr<c10::ivalue::Await> aw_;
+  std::shared_ptr<torch::jit::PythonFunctionGuard> pyfg_;
+  py::tuple args_;
+
+ private:
+  std::shared_ptr<PythonAwaitWrapper> getPtr() {
+    return shared_from_this();
+  }
+};
+
 // error reporting: when reporting user-caused errors, these functions should
 // not use AT_ERROR macros, since these macros add stack trace information
 // that is confusing to display to the end user since it always reports
@@ -403,6 +476,13 @@ inline InferredType tryToInferType(py::handle input) {
 #endif
   }
 
+  auto await_type = py::module::import("torch._awaits").attr("_Await");
+  py::bool_ is_await = py::isinstance(input, await_type);
+  if (py::cast<bool>(is_await)) {
+    auto awptr = input.cast<std::shared_ptr<PythonAwaitWrapper>>();
+    return InferredType(AwaitType::create(awptr->aw_->elementType()));
+  }
+
   if (as_module(py::cast<py::object>(input))) {
     return InferredType("Cannot infer type of ScriptModule");
   }
@@ -621,10 +701,6 @@ inline void guardAgainstNamedTensor(const T& var) {
       "workaround please drop names via `tensor = tensor.rename(None)`.");
 }
 
-// Defined in pybind_utils.cpp to break a circular dependency with
-// python_ivalue.h
-IValue toIValue(py::handle obj, const TypePtr& type, c10::optional<int32_t> N);
-
 // Extract custom class registered with torchbind
 template <typename T>
 c10::intrusive_ptr<T> toCustomClass(py::handle obj) {
@@ -922,7 +998,7 @@ inline py::object runAndInsertCall(
   }
 
   TORCH_CHECK(
-      stack.size() > 0,
+      !stack.empty(),
       "Expected values in the stack after execution but found none");
   return toPyObject(std::move(stack.back()));
 }
@@ -963,7 +1039,7 @@ inline c10::optional<py::object> maybeTorchFunctionDispatch(
         total_arg_num,
         false /* throw_error */);
   }
-  if (overloaded_args.size() > 0) {
+  if (!overloaded_args.empty()) {
     return pybind11::reinterpret_steal<py::object>(
         handle_torch_function_no_python_arg_parser(
             /*overloaded_args=*/overloaded_args,
@@ -1015,18 +1091,18 @@ inline py::object invokeScriptMethodFromPython(
       });
 }
 
-TORCH_API std::pair<std::shared_ptr<Operator>, Stack> getOpWithStack(
+TORCH_PYTHON_API std::pair<std::shared_ptr<Operator>, Stack> getOpWithStack(
     const std::vector<std::shared_ptr<Operator>>& operations,
     py::args args,
     const py::kwargs& kwargs);
 
-TORCH_API py::object invokeOperatorFromPython(
+TORCH_PYTHON_API py::object invokeOperatorFromPython(
     const std::vector<std::shared_ptr<Operator>>& operations,
     py::args args,
     const py::kwargs& kwargs,
     c10::optional<c10::DispatchKey> dk = c10::nullopt);
 
-TORCH_API py::object _get_operation_for_overload_or_packet(
+TORCH_PYTHON_API py::object _get_operation_for_overload_or_packet(
     const std::vector<std::shared_ptr<Operator>>& operations,
     Symbol symbol,
     py::args args,
diff --git a/torch/csrc/jit/python/python_dict.cpp b/torch/csrc/jit/python/python_dict.cpp
index 2c7716068e0d..ea64f5a985de 100644
--- a/torch/csrc/jit/python/python_dict.cpp
+++ b/torch/csrc/jit/python/python_dict.cpp
@@ -64,7 +64,7 @@ void initScriptDictBindings(PyObject* module) {
       .def(py::init([](py::dict dict) {
         TypePtr type = nullptr;
 
-        if (dict.size() > 0) {
+        if (!dict.empty()) {
           // If the source dictionary is nonempty, try to infer its type.
           auto inferred_type = tryToInferType(dict);
 
diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp
index 1666b3d550be..42be519bcf17 100644
--- a/torch/csrc/jit/python/python_ir.cpp
+++ b/torch/csrc/jit/python/python_ir.cpp
@@ -1059,6 +1059,10 @@ void initPythonIRBindings(PyObject* module_) {
       .def(py::init([](TypePtr a) { return FutureType::create(std::move(a)); }))
       .def("getElementType", &FutureType::getElementType);
 
+  py::class_<AwaitType, Type, AwaitTypePtr>(m, "AwaitType")
+      .def(py::init([](TypePtr a) { return AwaitType::create(std::move(a)); }))
+      .def("getElementType", &AwaitType::getElementType);
+
   py::class_<ClassType, Type, ClassTypePtr>(m, "ClassType")
       .def(py::init([](const std::string& qualified_name) {
         return get_python_cu()->get_class(c10::QualifiedName(qualified_name));
diff --git a/torch/csrc/jit/python/python_list.cpp b/torch/csrc/jit/python/python_list.cpp
index a0e30f78ee8d..ee2e7a7612ed 100644
--- a/torch/csrc/jit/python/python_list.cpp
+++ b/torch/csrc/jit/python/python_list.cpp
@@ -63,7 +63,7 @@ void initScriptListBindings(PyObject* module) {
       .def(py::init([](py::list list) {
         TypePtr type = nullptr;
 
-        if (list.size() > 0) {
+        if (!list.empty()) {
           // If the source list is nonempty, try to infer its type.
           auto inferred_type = tryToInferType(list);
 
@@ -289,7 +289,7 @@ void initScriptListBindings(PyObject* module) {
           [](py::list list) { // __setstate__
             TypePtr type = nullptr;
 
-            if (list.size() > 0) {
+            if (!list.empty()) {
               // If the source list is nonempty, try to infer its type.
               auto inferred_type = tryToInferType(list);
 
diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp
index 2050790f56e8..83721909678c 100644
--- a/torch/csrc/jit/python/python_sugared_value.cpp
+++ b/torch/csrc/jit/python/python_sugared_value.cpp
@@ -50,7 +50,7 @@ FunctionSchema PythonValue::getSchema(
   auto param_names = py::cast<std::vector<std::string>>(py_param_names);
   auto names_it = param_names.begin();
   if (moduleSelf_) {
-    if (param_names.size() == 0) {
+    if (param_names.empty()) {
       throw ErrorReport(loc)
           << "Non-static method does not have a self argument";
     }
@@ -418,7 +418,7 @@ void recurseThroughNestedModules(
     auto keys_value = keys_iter->tup_.at(i);
     auto key_string = toIValue(keys_value->asValue(loc, m))->toStringRef();
     std::string submodule_prefix = prefix;
-    if (prefix != "") {
+    if (!prefix.empty()) {
       submodule_prefix = prefix + ".";
     }
     submodule_prefix += key_string;
@@ -746,9 +746,8 @@ std::shared_ptr<SugaredValue> ModuleValue::call(
     at::ArrayRef<NamedValue> kwargs,
     size_t n_binders) {
   c10::ClassTypePtr class_type = concreteType_->getJitType()->cast<ClassType>();
-  bool have_pre_hooks =
-      class_type && class_type->getForwardPreHooks().size() != 0;
-  bool have_hooks = class_type && class_type->getForwardHooks().size() != 0;
+  bool have_pre_hooks = class_type && !class_type->getForwardPreHooks().empty();
+  bool have_hooks = class_type && !class_type->getForwardHooks().empty();
 
   std::vector<Value*> arg_values;
   std::vector<NamedValue> pre_hook_result;
@@ -797,7 +796,7 @@ std::shared_ptr<SugaredValue> ModuleValue::call(
     for (auto& output_node : output_nodes) {
       pre_hook_result.emplace_back(output_node);
     }
-    if (args.size() != 0) { // only replace input if it existed
+    if (!args.empty()) { // only replace input if it existed
       args = pre_hook_result;
     }
   }
@@ -971,7 +970,7 @@ std::shared_ptr<SugaredValue> PythonExceptionValue::call(
     at::ArrayRef<NamedValue> kwargs,
     size_t /*n_binders*/) {
   Value* error_message = nullptr;
-  if (args.size() == 0) {
+  if (args.empty()) {
     error_message = insertConstant(*caller.graph(), "", loc);
   } else if (args.size() == 1) {
     error_message = args.at(0).value(*caller.graph());
@@ -1207,6 +1206,9 @@ std::shared_ptr<SugaredValue> toSugaredValue(
       obj.ptr() == py::module::import("torch.jit").attr("_fork").ptr() ||
       obj.ptr() == py::module::import("torch.jit").attr("fork").ptr()) {
     return SpecialFormValue::create(prim::fork);
+  } else if (
+      obj.ptr() == py::module::import("torch.jit").attr("_awaitable").ptr()) {
+    return SpecialFormValue::create(prim::awaitable);
   } else if (
       obj.ptr() == py::module::import("torch.jit").attr("annotate").ptr()) {
     return SpecialFormValue::create(prim::annotate);
diff --git a/torch/csrc/jit/python/python_sugared_value.h b/torch/csrc/jit/python/python_sugared_value.h
index be6a4bf887d6..1cf6e8d81b84 100644
--- a/torch/csrc/jit/python/python_sugared_value.h
+++ b/torch/csrc/jit/python/python_sugared_value.h
@@ -139,7 +139,7 @@ struct VISIBILITY_HIDDEN ModuleDictMethod : public SugaredValue {
       at::ArrayRef<NamedValue> args,
       at::ArrayRef<NamedValue> kwargs,
       size_t n_binders) override {
-    if (args.size() || kwargs.size()) {
+    if (!args.empty() || !kwargs.empty()) {
       throw ErrorReport(loc)
           << name_ << " method does not accept any arguments";
     }
diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp
index dc64b781f847..13e5a5c27568 100644
--- a/torch/csrc/jit/python/script_init.cpp
+++ b/torch/csrc/jit/python/script_init.cpp
@@ -33,6 +33,7 @@
 #include <torch/csrc/jit/frontend/parser.h>
 #include <torch/csrc/jit/frontend/tracer.h>
 #include <torch/csrc/jit/ir/constants.h>
+#include <torch/csrc/jit/ir/graph_utils.h>
 #include <torch/csrc/jit/ir/irparser.h>
 #include <torch/csrc/jit/passes/inliner.h>
 #include <torch/csrc/jit/passes/shape_analysis.h>
@@ -436,91 +437,6 @@ struct VISIBILITY_HIDDEN ModuleSelf : public Self {
   std::shared_ptr<ConcreteModuleType> concreteType_;
 };
 
-static TypePtr getTensorType(const at::Tensor& t, bool complete) {
-  auto r = TensorType::create(t);
-  if (!complete) {
-    r = r->dimensionedOnly();
-  }
-  return r;
-}
-
-static TypePtr inferShapeAndTypeForInput(
-    TypePtr input_type,
-    Stack::const_iterator& s_iter,
-    const Stack::const_iterator& s_iter_end,
-    bool complete) {
-  if (auto tuple_type = input_type->cast<TupleType>()) {
-    std::vector<TypePtr> types;
-    for (const auto& sub_type : tuple_type->containedTypes()) {
-      TORCH_INTERNAL_ASSERT(s_iter != s_iter_end);
-      types.emplace_back(
-          inferShapeAndTypeForInput(sub_type, s_iter, s_iter_end, complete));
-    }
-    return TupleType::create(types);
-  } else if (auto list_type = input_type->cast<ListType>()) {
-    const TypePtr& sub_type = list_type->getElementType();
-    auto elem_type =
-        inferShapeAndTypeForInput(sub_type, s_iter, s_iter_end, complete);
-    return ListType::create(elem_type);
-  } else if (auto tensor_type = input_type->cast<TensorType>()) {
-    auto type = getTensorType(s_iter->toTensor(), complete);
-    s_iter++;
-    return type;
-  } else if (auto optional_type = input_type->cast<OptionalType>()) {
-    const TypePtr& sub_type = optional_type->getElementType();
-    auto elem_type =
-        inferShapeAndTypeForInput(sub_type, s_iter, s_iter_end, complete);
-    return OptionalType::create(elem_type);
-  } else {
-    // Primitive type, keep as is.
-    s_iter++;
-    return input_type;
-  }
-}
-
-static void setInputTensorTypes(
-    Graph& g,
-    const Stack& stack,
-    bool complete,
-    const std::vector<int>& param_count_list = {}) {
-  at::ArrayRef<Value*> input_values = g.inputs();
-  auto s_iter = stack.begin();
-  size_t list_idx = 0;
-  if (!param_count_list.empty()) {
-    TORCH_INTERNAL_ASSERT(
-        input_values.size() == param_count_list.size(),
-        " input_values:",
-        input_values.size(),
-        " vs param_count_list:",
-        param_count_list.size());
-  }
-  for (auto v : input_values) {
-    // Leave packed param types alone. This is needed for downstream passes
-    // (like alias analysis) to work properly. This will be unpacked later
-    // in unpackQuantizedWeights.
-    if (auto named_type = v->type()->cast<c10::NamedType>()) {
-      if (auto qualname = named_type->name()) {
-        if (getCustomClass(qualname->qualifiedName())) {
-          if (param_count_list.empty()) {
-            AT_ASSERT(s_iter != stack.end());
-            s_iter++;
-          } else {
-            if (param_count_list[list_idx] > 0) {
-              AT_ASSERT(s_iter != stack.end());
-            }
-            s_iter += param_count_list[list_idx];
-          }
-          list_idx++;
-          continue;
-        }
-      }
-    }
-    v->setType(
-        inferShapeAndTypeForInput(v->type(), s_iter, stack.end(), complete));
-    list_idx++;
-  }
-}
-
 static std::shared_ptr<Graph> _propagate_shapes(
     Graph& graph,
     std::vector<at::Tensor> inputs,
@@ -1190,7 +1106,8 @@ void initJitScriptBindings(PyObject* module) {
              const py::function& var_name_lookup_fn,
              bool strict,
              bool force_outplace,
-             const std::vector<std::string>& argument_names) {
+             const std::vector<std::string>& argument_names,
+             bool store_inputs) {
             // prereq: Module's buffers and parameters are unique
             // this was ensured in python before calling this function
             auto typed_inputs = toTraceableStack(input_tuple);
@@ -1208,6 +1125,9 @@ void initJitScriptBindings(PyObject* module) {
             auto fn = self._ivalue()->compilation_unit()->create_function(
                 method_name, graph);
             self.type()->addMethod(fn);
+            if (store_inputs) {
+              self.store_traced_inputs(name, typed_inputs);
+            }
             didFinishEmitModule(self);
           },
           py::arg("name"),
@@ -1216,7 +1136,8 @@ void initJitScriptBindings(PyObject* module) {
           py::arg("var_name_lookup_fn"),
           py::arg("strict"),
           py::arg("force_outplace"),
-          py::arg("argument_names") = std::vector<std::string>())
+          py::arg("argument_names") = std::vector<std::string>(),
+          py::arg("store_inputs"))
       .def(
           "_create_method_from_trace_with_dict",
           [](Module& self,
@@ -1226,7 +1147,8 @@ void initJitScriptBindings(PyObject* module) {
              const py::function& var_name_lookup_fn,
              bool strict,
              bool force_outplace,
-             const std::vector<std::string>& argument_names) {
+             const std::vector<std::string>& argument_names,
+             bool store_inputs) {
             // prereq: Module's buffers and parameters are unique
             // this was ensured in python before calling this function
             auto typed_inputs = toTraceableStack(input_dict);
@@ -1244,6 +1166,9 @@ void initJitScriptBindings(PyObject* module) {
             const auto method_name = QualifiedName(*self.type()->name(), name);
             auto fn = self._ivalue()->compilation_unit()->create_function(
                 method_name, graph);
+            if (store_inputs) {
+              self.store_traced_inputs(name, typed_inputs);
+            }
             self.type()->addMethod(fn);
             didFinishEmitModule(self);
           },
@@ -1253,7 +1178,8 @@ void initJitScriptBindings(PyObject* module) {
           py::arg("var_name_lookup_fn"),
           py::arg("strict"),
           py::arg("force_outplace"),
-          py::arg("argument_names") = std::vector<std::string>())
+          py::arg("argument_names") = std::vector<std::string>(),
+          py::arg("store_inputs"))
       .def(
           "_get_forward_hooks",
           [](const Module& m) {
@@ -1272,6 +1198,11 @@ void initJitScriptBindings(PyObject* module) {
             }
             return funcs;
           })
+      .def(
+          "_retrieve_traced_inputs",
+          [](const Module& m) {
+            return ScriptDict(m.retrieve_traced_inputs());
+          })
       .def_property_readonly(
           "code",
           [](Module& self) {
@@ -1374,7 +1305,7 @@ void initJitScriptBindings(PyObject* module) {
       .def(
           py::init([](const std::string& lang, const uint32_t _frames_up) {
             auto cu = std::make_shared<CompilationUnit>();
-            if (lang.size() > 0) {
+            if (!lang.empty()) {
               pyCompilationUnitDefine(*cu, lang, nullptr, _frames_up);
             }
             return cu;
@@ -1864,7 +1795,8 @@ void initJitScriptBindings(PyObject* module) {
       [](std::shared_ptr<CompilationUnit> cu,
          const std::string& filename,
          py::object map_location,
-         const py::dict& extra_files) {
+         const py::dict& extra_files,
+         bool restore_shapes = false) {
         c10::optional<at::Device> optional_device;
         if (!map_location.is_none()) {
           AT_ASSERT(THPDevice_Check(map_location.ptr()));
@@ -1873,7 +1805,12 @@ void initJitScriptBindings(PyObject* module) {
         }
         ExtraFilesMap extra_files_map = extra_files_from_python(extra_files);
         auto ret = import_ir_module(
-            std::move(cu), filename, optional_device, extra_files_map);
+            std::move(cu),
+            filename,
+            optional_device,
+            extra_files_map,
+            /*load_debug_files*/ true,
+            restore_shapes);
         extra_files_to_python(extra_files_map, extra_files);
         return ret;
       });
@@ -1903,7 +1840,8 @@ void initJitScriptBindings(PyObject* module) {
       [](std::shared_ptr<CompilationUnit> cu,
          const std::string& buffer,
          py::object map_location,
-         const py::dict& extra_files) {
+         const py::dict& extra_files,
+         bool restore_shapes = false) {
         std::istringstream in(buffer);
         c10::optional<at::Device> optional_device;
         if (!map_location.is_none()) {
@@ -1913,7 +1851,12 @@ void initJitScriptBindings(PyObject* module) {
         }
         ExtraFilesMap extra_files_map = extra_files_from_python(extra_files);
         auto ret = import_ir_module(
-            std::move(cu), in, optional_device, extra_files_map);
+            std::move(cu),
+            in,
+            optional_device,
+            extra_files_map,
+            /*load_debug_files*/ true,
+            restore_shapes);
         extra_files_to_python(extra_files_map, extra_files);
         return ret;
       });
diff --git a/torch/csrc/jit/runtime/argument_spec.h b/torch/csrc/jit/runtime/argument_spec.h
index d09918522a81..b73b136bca5c 100644
--- a/torch/csrc/jit/runtime/argument_spec.h
+++ b/torch/csrc/jit/runtime/argument_spec.h
@@ -101,7 +101,8 @@ struct ArgumentSpec {
     // https://github.com/zdevito/pytorch/commit/21e7200a0a0fc456bea2f10e95b1781f83933d10
     // show overhead in extra refcounting along this path
     const at::Tensor* t = reinterpret_cast<const at::Tensor*>(&input);
-    if ((arg.defined_ = t->defined())) {
+    arg.defined_ = t->defined();
+    if (arg.defined_) {
       arg.requires_grad_ = with_grad && autograd::Variable(*t).requires_grad();
       arg.dim_ = t->dim();
       // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
@@ -134,7 +135,7 @@ struct ArgumentSpec {
       return false;
     // NB: we need to break out early when there are no elements, because
     // passing a nullptr to memcmp is UB.
-    if (tensor_args.size() == 0)
+    if (tensor_args.empty())
       return true;
     return std::memcmp(
                tensor_args.data(),
diff --git a/torch/csrc/jit/runtime/calculate_necessary_args.h b/torch/csrc/jit/runtime/calculate_necessary_args.h
index d9df369727dc..aa2352d4fe0a 100644
--- a/torch/csrc/jit/runtime/calculate_necessary_args.h
+++ b/torch/csrc/jit/runtime/calculate_necessary_args.h
@@ -14,7 +14,7 @@ inline std::pair<int64_t, int64_t> CalculateNecessaryArgs(
     const std::vector<Argument>& schema_args,
     at::ArrayRef<Value*> actual_inputs,
     bool allow_trailing_out_args) {
-  if (schema_args.size() == 0) {
+  if (schema_args.empty()) {
     return std::make_pair(0, 0);
   }
 
diff --git a/torch/csrc/jit/runtime/decomposition_registry.cpp b/torch/csrc/jit/runtime/decomposition_registry.cpp
index 05e5c9b6b196..8301e22d7107 100644
--- a/torch/csrc/jit/runtime/decomposition_registry.cpp
+++ b/torch/csrc/jit/runtime/decomposition_registry.cpp
@@ -51,7 +51,7 @@ void loadModule(const CompilationUnit& module) {
 
 void loadDecompositionFunctions() {
   std::lock_guard<std::mutex> guard(lock);
-  if (schema_to_decomposition.size() != 0) {
+  if (!schema_to_decomposition.empty()) {
     return;
   }
 
diff --git a/torch/csrc/jit/runtime/decomposition_registry_util.cpp b/torch/csrc/jit/runtime/decomposition_registry_util.cpp
index da972bfce4f8..55fe55e975be 100644
--- a/torch/csrc/jit/runtime/decomposition_registry_util.cpp
+++ b/torch/csrc/jit/runtime/decomposition_registry_util.cpp
@@ -16,8 +16,9 @@ namespace jit {
 const std::string decomp_funcs =
     R"(def var_decomposition(input: Tensor,
     dim: Optional[List[int]]=None,
-    correction: Optional[int]=None,
+    correction: Union[float, int, NoneType, bool]=None,
     keepdim: bool=False) -> Tensor:
+  _0 = uninitialized(float)
   if torch.__is__(dim, None):
     dim0 = annotate(List[int], [])
   else:
@@ -26,8 +27,8 @@ const std::string decomp_funcs =
     n = torch.numel(input)
   else:
     n0 = 1
-    for _0 in range(torch.len(dim0)):
-      dim_i = dim0[_0]
+    for _1 in range(torch.len(dim0)):
+      dim_i = dim0[_1]
       n1 = torch.mul(n0, (torch.size(input))[dim_i])
       n0 = n1
     n = n0
@@ -35,12 +36,28 @@ const std::string decomp_funcs =
   sub = torch.sub(input, mean)
   sq = torch.mul(sub, sub)
   sum = torch.sum(sq, dim0, keepdim)
-  if torch.__isnot__(correction, None):
-    correction0 = unchecked_cast(int, correction)
-    n2 = torch.sub(n, correction0)
+  if torch.__is__(correction, None):
+    denom = float(torch.sub(n, 1))
   else:
-    n2 = n
-  return torch.div(sum, n2)
+    correction0 = unchecked_cast(Union[float, int, bool], correction)
+    _2 = isinstance(correction0, int)
+    if _2:
+      correction1 = unchecked_cast(int, correction0)
+      denom0 = float(torch.sub(n, correction1))
+    else:
+      correction2 = unchecked_cast(Union[float, bool], correction0)
+      _3 = isinstance(correction2, float)
+      if _3:
+        correction3 = unchecked_cast(float, correction2)
+        denom2 = torch.sub(float(n), correction3)
+        denom1 = denom2
+      else:
+        ops.prim.RaiseException("correction must be int or float", "builtins.RuntimeError")
+        denom1 = _0
+      denom0 = denom1
+    denom = denom0
+  _4 = torch.div(sum, ops.prim.max(0, denom))
+  return _4
 
 def var(input: Tensor,
     unbiased: bool=True) -> Tensor:
@@ -48,13 +65,27 @@ def var(input: Tensor,
     _0 = 1
   else:
     _0 = 0
+  _1 = uninitialized(float)
   n = torch.numel(input)
   mean = torch.mean(input, annotate(List[int], []), True)
   sub = torch.sub(input, mean)
   sq = torch.mul(sub, sub)
   sum = torch.sum(sq, annotate(List[int], []))
-  n0 = torch.sub(n, _0)
-  return torch.div(sum, n0)
+  _2 = isinstance(_0, int)
+  if _2:
+    denom = float(torch.sub(n, _0))
+  else:
+    correction = unchecked_cast(Union[float, bool], _0)
+    _3 = isinstance(correction, float)
+    if _3:
+      correction0 = unchecked_cast(float, correction)
+      denom0 = torch.sub(float(n), correction0)
+    else:
+      ops.prim.RaiseException("correction must be int or float", "builtins.RuntimeError")
+      denom0 = _1
+    denom = denom0
+  _4 = torch.div(sum, ops.prim.max(0, denom))
+  return _4
 
 )";
 
@@ -65,8 +96,8 @@ const std::string& GetSerializedDecompositions() {
 const OperatorMap<std::string>& GetDecompositionMapping() {
   // clang-format off
  static const OperatorMap<std::string> decomposition_mapping {
-    {"aten::var.correction(Tensor self, int[1]? dim, *, int? correction, bool keepdim=False) -> (Tensor)", "var_decomposition"},
-    {"aten::var(Tensor self, bool unbiased=True) -> (Tensor)", "var"},
+    {"aten::var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor", "var_decomposition"},
+    {"aten::var(Tensor self, bool unbiased=True) -> Tensor", "var"},
   };
   // clang-format on
 
diff --git a/torch/csrc/jit/runtime/instruction.h b/torch/csrc/jit/runtime/instruction.h
index 4bde105816a8..1b574de6fdd7 100644
--- a/torch/csrc/jit/runtime/instruction.h
+++ b/torch/csrc/jit/runtime/instruction.h
@@ -73,7 +73,8 @@ namespace jit {
   _(FORK, "CN") /* launch a thread to run code entry x with N inputs  */       \
   _(WARN, "I") /* emit a warning with line information */                      \
   _(ENTER, "EN") /* enter scope of a contextmanager */                         \
-  _(EXIT, "EX") /* exit the last entered contextmanager */
+  _(EXIT, "EX") /* exit the last entered contextmanager */                     \
+  _(AWAITABLE, "CN") /* initialize await for code entry x with N inputs  */
 
 enum OpCode : uint8_t {
 #define DEFINE_OP(op, _) op,
diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp
index ac5df63b472e..e94d9a6f054a 100644
--- a/torch/csrc/jit/runtime/interpreter.cpp
+++ b/torch/csrc/jit/runtime/interpreter.cpp
@@ -324,6 +324,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target {
             INST_NEXT;
           case INST(STOREN): {
             INST_GUARD;
+            TORCH_INTERNAL_ASSERT_DEBUG_ONLY(stack.size() >= inst.N);
             for (size_t i = inst.N; i > 0; --i) {
               reg(inst.X + i - 1) = pop(stack);
             }
@@ -678,11 +679,13 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target {
             INST_NEXT;
           case INST(DTYPE): {
             INST_GUARD;
+            TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!stack.empty());
             dtype(stack);
           }
             INST_NEXT;
           case INST(DIM): {
             INST_GUARD;
+            TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!stack.empty());
             dim(stack);
           }
             INST_NEXT;
@@ -727,6 +730,46 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target {
             taskLauncher_(std::move(continuation));
           }
             INST_NEXT;
+          case INST(AWAITABLE): {
+            INST_GUARD;
+            auto fn_ptr = frame.function->function_table_[inst.X];
+            auto& fn = toGraphFunction(*fn_ptr);
+            auto num_outputs = fn.graph()->outputs().size();
+            TypePtr out_type;
+            if (num_outputs == 1) {
+              out_type = fn.graph()->outputs()[0]->type();
+            } else {
+              std::vector<TypePtr> out_types;
+              for (const auto& o : fn.graph()->outputs()) {
+                out_types.push_back(o->type());
+              }
+              out_type = TupleType::create(out_types);
+            }
+            auto args = std::vector<IValue>(stack.end() - inst.N, stack.end());
+            auto aw = c10::make_intrusive<c10::ivalue::Await>(out_type);
+            aw->setArgs(std::move(args));
+            aw->setFn(
+                [&args = aw->args(),
+                 fn_ptr,
+                 taskLauncher = taskLauncher_]() -> IValue {
+                  auto& fn = toGraphFunction(*fn_ptr);
+                  auto n_out = fn.graph()->outputs().size();
+                  torch::jit::Stack s;
+                  for (const auto& arg : args) {
+                    s.push_back(arg);
+                  }
+                  InterpreterState await_interpreter(
+                      fn.get_executor().getPlanFor(s).code, taskLauncher);
+                  await_interpreter.run(s);
+                  if (n_out == 1) {
+                    return s.back();
+                  }
+                  return c10::ivalue::Tuple::create(jit::last(s, n_out));
+                });
+            drop(stack, inst.N);
+            push(stack, std::move(aw));
+          }
+            INST_NEXT;
           case INST(WARN): {
             INST_GUARD;
             // Keeps track of which WARN instruction has been executed before,
@@ -895,7 +938,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target {
       // module hierarchy.
       const auto& g = frame.function->graph_;
       std::string g_self_type;
-      if (g && g->inputs().size() > 0) {
+      if (g && !g->inputs().empty()) {
         const auto& g_self_type_ptr =
             g->inputs()[0]->type()->cast<c10::ClassType>();
         if (g_self_type_ptr) {
@@ -945,7 +988,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target {
         if (node->input(0)->node()->kind() == prim::GetAttr) {
           class_instance_name = node->input(0)->node()->s(attr::name);
         } else if (
-            node->owningGraph()->inputs().size() > 0 &&
+            !node->owningGraph()->inputs().empty() &&
             node->input(0) == node->owningGraph()->inputs()[0]) {
           class_instance_name = "SELF";
         } else {
diff --git a/torch/csrc/jit/runtime/interpreter/code_impl.h b/torch/csrc/jit/runtime/interpreter/code_impl.h
index a64261a9ef04..abbbb1799aae 100644
--- a/torch/csrc/jit/runtime/interpreter/code_impl.h
+++ b/torch/csrc/jit/runtime/interpreter/code_impl.h
@@ -82,6 +82,7 @@ struct CodeImpl {
       operator_table_inv_;
   std::vector<Function*> function_table_;
   std::vector<std::unique_ptr<GraphFunction>> forked_functions_;
+  std::vector<std::unique_ptr<GraphFunction>> awaited_functions_;
   std::vector<TypePtr> type_table_;
   std::vector<std::function<void(std::vector<IValue>&)>>
       profile_function_table_;
@@ -611,6 +612,16 @@ struct CodeImpl {
     insertInstruction(FORK, function_table_.size() - 1, node->inputs().size());
   }
 
+  void emitAwaitable(Node* node) {
+    emitLoadInputs(node->inputs());
+    std::unique_ptr<GraphFunction> await_fn(new GraphFunction(
+        "<awaitable function>", node->g(attr::Subgraph), nullptr));
+    awaited_functions_.emplace_back(std::move(await_fn));
+    function_table_.emplace_back(awaited_functions_.back().get());
+    insertInstruction(
+        AWAITABLE, function_table_.size() - 1, node->inputs().size());
+  }
+
   void emitWarn(Node* node) {
     if (FLAGS_torch_jit_disable_warning_prints) {
       return;
@@ -716,6 +727,9 @@ struct CodeImpl {
       case prim::fork:
         emitFork(node);
         break;
+      case prim::awaitable:
+        emitAwaitable(node);
+        break;
       case aten::warn:
         emitWarn(node);
         break;
@@ -903,7 +917,7 @@ struct MobileCodeImpl : CodeImpl {
 
           size_t numInclude = specifiedArgs.first +
               (support_default_args_before_out_ ? specifiedArgs.second : 0);
-          auto unique_name = op_schema.overload_name() != ""
+          auto unique_name = !op_schema.overload_name().empty()
               ? op_schema.name() + "." + op_schema.overload_name()
               : op_schema.name();
           auto it = op_to_num_specified_args_.insert(
diff --git a/torch/csrc/jit/runtime/interpreter/preprocess_graph.cpp b/torch/csrc/jit/runtime/interpreter/preprocess_graph.cpp
index 7b27ce6f017e..9eca9f45cf79 100644
--- a/torch/csrc/jit/runtime/interpreter/preprocess_graph.cpp
+++ b/torch/csrc/jit/runtime/interpreter/preprocess_graph.cpp
@@ -67,11 +67,11 @@ void dropUnused(Block* b) {
   auto createDropIfUnused = [&](ArrayRef<Value*> values) -> Node* {
     std::vector<Value*> to_drop;
     for (auto v : values) {
-      if (v->uses().size() == 0 && v->node()->kind() != prim::Constant) {
+      if (v->uses().empty() && v->node()->kind() != prim::Constant) {
         to_drop.push_back(v);
       }
     }
-    if (to_drop.size() == 0) {
+    if (to_drop.empty()) {
       return nullptr;
     }
     return b->owningGraph()->create(prim::Drop, to_drop, 0);
diff --git a/torch/csrc/jit/runtime/operator.cpp b/torch/csrc/jit/runtime/operator.cpp
index d005d1b100bd..13b71f59c76e 100644
--- a/torch/csrc/jit/runtime/operator.cpp
+++ b/torch/csrc/jit/runtime/operator.cpp
@@ -209,14 +209,14 @@ bool printerHasSpecialCaseFor(Symbol sym) {
   // schema to editing this list here. These cases should only be things
   // that require special handling because they do not fit normal schema
   const static std::unordered_set<Symbol> handled = {
-      prim::Constant,      prim::Uninitialized, prim::fork,
-      prim::ListConstruct, prim::DictConstruct, prim::ListUnpack,
-      prim::Print,         prim::PythonOp,      prim::TupleConstruct,
-      prim::TupleIndex,    prim::TupleSlice,    prim::TupleUnpack,
-      prim::CreateObject,  prim::GetAttr,       prim::SetAttr,
-      prim::CallFunction,  prim::isinstance,    prim::unchecked_cast,
-      prim::tolist,        prim::rpc_async,     prim::rpc_sync,
-      prim::rpc_remote};
+      prim::Constant,       prim::Uninitialized, prim::fork,
+      prim::awaitable,      prim::ListConstruct, prim::DictConstruct,
+      prim::ListUnpack,     prim::Print,         prim::PythonOp,
+      prim::TupleConstruct, prim::TupleIndex,    prim::TupleSlice,
+      prim::TupleUnpack,    prim::CreateObject,  prim::GetAttr,
+      prim::SetAttr,        prim::CallFunction,  prim::isinstance,
+      prim::unchecked_cast, prim::tolist,        prim::rpc_async,
+      prim::rpc_sync,       prim::rpc_remote};
 
   // WARNING: by adding a value to this set, you are asserting that your
   // primitive is only ever added during optimization and does not need
@@ -314,6 +314,9 @@ bool aliasAnalysisHasSpecialCaseFor(Symbol symbol) {
       prim::ConstantMKLDNNTensor,
       prim::BroadcastMKLDNNTensors,
       prim::fork,
+      prim::awaitable,
+      prim::awaitable_nowait,
+      prim::awaitable_wait,
       prim::CreateObject,
       prim::AutogradAdd,
       prim::GetAttr,
diff --git a/torch/csrc/jit/runtime/profiling_record.cpp b/torch/csrc/jit/runtime/profiling_record.cpp
index 31ed3ff4068c..9380fc6633f5 100644
--- a/torch/csrc/jit/runtime/profiling_record.cpp
+++ b/torch/csrc/jit/runtime/profiling_record.cpp
@@ -266,7 +266,7 @@ bool needsProfiledOutput(Node* n) {
 void ProfilingRecord::removeProfileCounter(Block* b) {
   for (auto it = b->nodes().rbegin(); it != b->nodes().rend();) {
     auto n = *it;
-    if (n->kind() == prim::profile && n->inputs().size() == 0) {
+    if (n->kind() == prim::profile && n->inputs().empty()) {
       it.destroyCurrent();
       // there is only one counter node
       return;
diff --git a/torch/csrc/jit/runtime/register_cuda_ops.cpp b/torch/csrc/jit/runtime/register_cuda_ops.cpp
index 5c64975bd679..6d805005eb61 100644
--- a/torch/csrc/jit/runtime/register_cuda_ops.cpp
+++ b/torch/csrc/jit/runtime/register_cuda_ops.cpp
@@ -50,10 +50,9 @@ RegisterOperators const reg({
     Operator(
         "cuda::current_stream.int(int? val) -> __torch__.torch.classes.cuda.Stream",
         [](Stack& stack) {
-          auto idx = pop(stack).toOptional<int64_t>();
-          c10::DeviceIndex device_index = idx.has_value()
-              ? static_cast<c10::DeviceIndex>(idx.value())
-              : c10::cuda::current_device();
+          auto idx = pop(stack).toOptional<c10::DeviceIndex>();
+          c10::DeviceIndex device_index =
+              idx.has_value() ? idx.value() : c10::cuda::current_device();
           auto s = c10::cuda::getCurrentCUDAStream(device_index);
           auto st = make_custom_class<torch::jit::CUDAStream>(s);
           push(stack, IValue(st));
@@ -74,10 +73,9 @@ RegisterOperators const reg({
     Operator(
         "cuda::default_stream.int(int? val) -> __torch__.torch.classes.cuda.Stream",
         [](Stack& stack) {
-          auto idx = pop(stack).toOptional<int64_t>();
-          c10::DeviceIndex device_index = idx.has_value()
-              ? static_cast<c10::DeviceIndex>(idx.value())
-              : c10::cuda::current_device();
+          auto idx = pop(stack).toOptional<c10::DeviceIndex>();
+          c10::DeviceIndex device_index =
+              idx.has_value() ? idx.value() : c10::cuda::current_device();
           auto s = c10::cuda::getDefaultCUDAStream(device_index);
           auto st = make_custom_class<torch::jit::CUDAStream>(s);
           push(stack, IValue(st));
@@ -129,15 +127,12 @@ RegisterOperators const reg({
         [](Stack& stack) {
           auto v = pop(stack);
           auto s = v.toCustomClass<torch::jit::CUDAStream>();
-          auto stream_device_idx = static_cast<int64_t>(s->device_index());
-          auto cur_device_idx =
-              // NOLINTNEXTLINE(bugprone-signed-char-misuse)
-              static_cast<int64_t>(c10::cuda::current_device());
+          auto stream_device_idx = s->device_index();
+          auto cur_device_idx = c10::cuda::current_device();
           // If the stream is not on the current device, change the
           // device to the device of the stream.
           if (cur_device_idx != stream_device_idx) {
-            c10::cuda::set_device(
-                static_cast<c10::DeviceIndex>(stream_device_idx));
+            c10::cuda::set_device(stream_device_idx);
           }
           // To set the current CUDA stream using
           // c10::cuda::setCurrentCUDAStream, the jit::CUDAStream object needs
@@ -148,9 +143,7 @@ RegisterOperators const reg({
           // unpacking it inside this operator. The unpacked stream is then used
           // to set the current CUDA stream.
           auto unpacked = c10::cuda::CUDAStream::unpack3(
-              s->id(),
-              stream_device_idx,
-              static_cast<int64_t>(c10::DeviceType::CUDA));
+              s->id(), stream_device_idx, c10::DeviceType::CUDA);
           c10::cuda::setCurrentCUDAStream(unpacked);
         },
         aliasAnalysisFromSchema()),
@@ -171,10 +164,9 @@ RegisterOperators const reg({
     Operator(
         "cuda::synchronize.int(int? val) -> ()",
         [](Stack& stack) {
-          auto idx = pop(stack).toOptional<int64_t>();
-          c10::DeviceIndex device_index = idx.has_value()
-              ? static_cast<c10::DeviceIndex>(idx.value())
-              : c10::cuda::current_device();
+          auto idx = pop(stack).toOptional<c10::DeviceIndex>();
+          c10::DeviceIndex device_index =
+              idx.has_value() ? idx.value() : c10::cuda::current_device();
           _device_synchronize(device_index);
         },
         aliasAnalysisFromSchema()),
diff --git a/torch/csrc/jit/runtime/register_distributed_ops.cpp b/torch/csrc/jit/runtime/register_distributed_ops.cpp
index a8baa6f7f406..2420952561c0 100644
--- a/torch/csrc/jit/runtime/register_distributed_ops.cpp
+++ b/torch/csrc/jit/runtime/register_distributed_ops.cpp
@@ -13,8 +13,6 @@
 #include <fmt/format.h>
 #include <stdexcept>
 
-using at::Scalar;
-using at::Tensor;
 namespace dist_autograd = torch::distributed::autograd;
 namespace dist_rpc = torch::distributed::rpc;
 
diff --git a/torch/csrc/jit/runtime/register_ops_utils.cpp b/torch/csrc/jit/runtime/register_ops_utils.cpp
index b75e224c3ada..4dd4cb46e9a0 100644
--- a/torch/csrc/jit/runtime/register_ops_utils.cpp
+++ b/torch/csrc/jit/runtime/register_ops_utils.cpp
@@ -108,7 +108,7 @@ void checkImplicitTensorToNum(const at::Tensor& t, bool toInt) {
     throw std::runtime_error(
         "Cannot input a tensor that requires grad as a scalar argument");
   }
-  if (t.sizes().size() != 0) {
+  if (!t.sizes().empty()) {
     throw std::runtime_error(
         "Cannot input a tensor of dimension other than 0 as a scalar argument");
   }
diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp
index e36885e46df0..5bbdd365d794 100644
--- a/torch/csrc/jit/runtime/register_prim_ops.cpp
+++ b/torch/csrc/jit/runtime/register_prim_ops.cpp
@@ -1379,7 +1379,7 @@ void dictDelete(Stack& stack) {
 
 void dictPopItem(Stack& stack) {
   auto dict = pop(stack).toGenericDict();
-  if (dict.size() == 0) {
+  if (dict.empty()) {
     AT_ERROR("popitem(): dictionary is empty");
   }
   auto head_item = dict.begin();
@@ -1993,7 +1993,7 @@ static const std::vector<OperatorGeneratorArgs> stringOpGenArgs{
           std::string string = pop(stack).toStringRef();
           LOG(WARNING)
               << "The isidentifier() implementation being used is from Python 2\n";
-          if (string.size() < 1) {
+          if (string.empty()) {
             push(stack, false);
             return;
           }
@@ -2416,7 +2416,7 @@ static const std::vector<OperatorGeneratorArgs> opGenArgs1{
         [](Stack& stack) {
           at::Tensor a;
           pop(stack, a);
-          if (a.name() == "") {
+          if (a.name().empty()) {
             push(stack, IValue());
           } else {
             push(stack, a.name());
diff --git a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp
index 7749e3902ea9..0ad99d250a4c 100644
--- a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp
+++ b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp
@@ -29,293 +29,311 @@ namespace jit {
 
 namespace {
 
-RegisterOperators reg(
-    {Operator(
-         prim::profile,
-         [](const Node* node) -> Operation {
-           return [](Stack& stack) {
-             AT_ERROR(
-                 "Must be lowered to Interpreter's PROFILE instruction"); // NOLINT
-           };
-         },
-         aliasAnalysisSpecialCase()),
-     Operator(
-         prim::profile_ivalue,
-         [](const Node* node) -> Operation {
-           return [](Stack& stack) {
-             AT_ERROR(
-                 "Must be lowered to Interpreter's PROFILE instruction"); // NOLINT
-           };
-         },
-         aliasAnalysisSpecialCase()),
-     Operator(
-         prim::FusionGroup,
-         [](const Node* node) -> Operation {
-           const auto key = registerFusion(node);
-           return [key](Stack& stack) {
-             RECORD_FUNCTION("FusionGroup", std::vector<c10::IValue>());
-             runFusion(key, stack);
-           };
-         },
-         aliasAnalysisSpecialCase()),
-     Operator(
-         prim::RequiresGradCheck /* (...)  -> (..., bool) */,
-         [](const Node* node) -> Operation {
-           std::vector<bool> rg_props =
-               fmap(node->tys(attr::types), [](const TypePtr& t) {
-                 // if an rg property changes we assume a tensor does require
-                 // gradients which is set in `guardDifferentiableGraph`
-                 TORCH_INTERNAL_ASSERT(
-                     t->castRaw<TensorType>()->requiresGrad().has_value());
-                 return *t->castRaw<TensorType>()->requiresGrad();
-               });
-           return [rg_props](Stack& stack) {
-             auto num_inputs = rg_props.size();
-             // Check every input's shape against profiled (expected) shape.
-             for (const auto i : c10::irange(num_inputs)) {
-               auto& input = peek(stack, i, num_inputs);
-               const auto& t = input.toTensor();
-               if (rg_props[i] != t.requires_grad()) {
-                 push(stack, false);
-                 return;
-               }
-             }
-
-             push(stack, true);
-           };
-         },
-         aliasAnalysisSpecialCase()),
-     Operator(
-         prim::ConstantChunk,
-         [](const Node* node) -> Operation {
-           int64_t chunks = node->i(attr::chunks);
-           int64_t dim = node->i(attr::dim);
-           auto outputs_used = fmap(node->outputs(), [](const Value* v) {
-             return v->uses().size() > 0;
-           });
-           return [=](Stack& stack) {
-             RECORD_FUNCTION("chunk", last(stack, 1));
-
-             at::Tensor t;
-             pop(stack, t);
-             auto result = at::chunk(t, chunks, dim);
-             stack.insert(
-                 stack.end(),
-                 std::make_move_iterator(result.begin()),
-                 std::make_move_iterator(result.end()));
-             // NB: Chunk can sometimes return a smaller number of outputs.
-             int64_t num_results = result.size();
-             if (num_results != chunks) {
-               if (num_results > chunks) {
-                 TORCH_CHECK(
-                     num_results == chunks,
-                     "Expected chunk to return ",
-                     chunks,
-                     " outputs, but got ",
-                     num_results);
-               }
-               for (const auto i : c10::irange(num_results, chunks)) {
-                 TORCH_CHECK(
-                     !outputs_used[i],
-                     "Expected chunk to return at least ",
-                     chunks,
-                     " outputs, but got only ",
-                     num_results);
-                 // We know that the output is unused, so it's ok to push
-                 // anything on the stack.
-                 stack.emplace_back();
-               }
-             }
-           };
-         },
-         aliasAnalysisSpecialCase()),
-     Operator(
-         prim::ChunkSizes,
-         [](const Node* node) -> Operation {
-           int64_t raw_dim = node->i(attr::dim);
-           int64_t chunks = node->i(attr::chunks);
-           return [raw_dim, chunks](Stack& stack) {
-             c10::List<int64_t> shape = pop(stack).toIntList();
-             c10::List<int64_t> regular_shape = shape.copy();
-             c10::List<int64_t> last_shape = shape.copy();
-             int64_t dim = at::maybe_wrap_dim(raw_dim, shape.size());
-             TORCH_CHECK(
-                 dim < (int64_t)regular_shape.size(),
-                 "Dimension out of range for chunk");
-             int64_t split_size = (regular_shape[dim] + chunks - 1) / chunks;
-             regular_shape[dim] = split_size;
-             if (shape[dim] % chunks == 0) {
-               last_shape[dim] = split_size;
-             } else {
-               int64_t num_splits = std::max<int64_t>(
-                   (shape[dim] + split_size - 1) / split_size, 1);
-               last_shape[dim] =
-                   split_size - (split_size * num_splits - shape[dim]);
-               AT_ASSERT(last_shape[dim] >= 0);
-             }
-             push(stack, std::move(regular_shape));
-             push(stack, std::move(last_shape));
-           };
-         },
-         aliasAnalysisSpecialCase()),
-     Operator(
-         "aten::_grad_sum_to_size(Tensor(a) self, int[]? size) -> Tensor(a)",
-         [](Stack& stack) {
-           RECORD_FUNCTION("_grad_sum_to_size", std::vector<c10::IValue>());
-           IValue self, size;
-           pop(stack, self, size);
-           if (size.isNone()) {
-             push(stack, std::move(self));
-           } else {
-             push(stack, at::sum_to(self.toTensor(), size.toDimVector()));
-           }
-         },
-         aliasAnalysisFromSchema()),
-     // This operator is generated inside the compiler for indexing into
-     // ModuleDict without a statically determinable key. Accordingly,
-     // self must be a ModuleType and the output must be an InterfaceType.
-     OperatorGenerator(
-         TORCH_SELECTIVE_SCHEMA(
-             "prim::ModuleContainerIndex.dict(Any self, str ind) -> Any"),
-         [](Stack& stack) {
-           IValue ind = pop(stack);
-           IValue module_dict = pop(stack);
-           push(stack, module_dict.toModule().attr(ind.toStringRef()));
-         },
-         aliasAnalysisFromSchema()),
-     Operator(
-         prim::TypeCheck /* (...)  -> (..., bool) */,
-         [](const Node* /* node */) -> Operation {
-           return [](Stack& /* stack */) {
-             AT_ERROR("prim::TypeCheck not yet implemented"); // NOLINT
-           };
-         },
-         aliasAnalysisSpecialCase()),
-     Operator(
-         prim::FallbackGraph,
-         [](const Node* node) -> Operation {
-           return [](Stack& stack) {
-             AT_ERROR(
-                 "Must be converted to prim::FunctionCall by replaceFallbackGraphWithFallbackFunction"); // NOLINT
-           };
-         },
-         aliasAnalysisSpecialCase()),
-     Operator(
-         "prim::Guard(Tensor(a) t) -> Tensor(a)",
-         [](Stack& stack) { AT_ERROR("Should be replaced by prim::BailOut"); },
-         aliasAnalysisFromSchema()),
-     Operator(
-         "prim::BailOut(...) -> Tensor(a)",
-         [](Stack& /* stack */) {
-           AT_ERROR("prim::BailOut not yet implemented"); // NOLINT
-         },
-         aliasAnalysisFromSchema()),
-     Operator(
-         "prim::BailoutTemplate() -> int",
-         [](Stack& stack) {
-           // TODO: today, we put a single bailout template at the front to
-           // carry the un-optimized graph for bailout nodes to use. Ideally
-           // this should never run, but we haven't written the code to remove
-           // it yet.
-           // TORCH_INTERNAL_ASSERT(false);
-
-           // Returns an int so that we have an easy way to do graph traversal
-           push(stack, 1);
-         },
-         aliasAnalysisFromSchema()),
-     Operator(
-         "aten::grad(Tensor[] outputs, Tensor[] inputs, Tensor?[]? grad_outputs=None, bool? retain_graph=None, bool create_graph=False, bool allow_unused=False) -> Tensor?[]",
-         [](Stack& stack) {
-           bool allow_unused = pop(stack).toBool();
-           bool create_graph = pop(stack).toBool();
-           auto retain_graph = pop(stack).toOptional<bool>();
-           auto grad_outputs = pop(stack);
-           auto inputs = pop(stack).toTensorList();
-           auto outputs = pop(stack).toTensorList();
-           std::vector<torch::autograd::Variable> input_vars(
-               inputs.begin(), inputs.end());
-           std::vector<torch::autograd::Variable> output_vars(
-               outputs.begin(), outputs.end());
-           std::vector<torch::autograd::Variable> gradients;
-
-           if (!grad_outputs.isNone()) {
-             for (const IValue& v : grad_outputs.toListRef()) {
-               gradients.emplace_back(v.isNone() ? at::Tensor() : v.toTensor());
-             }
-           }
-
-           auto res = torch::autograd::grad(
-               output_vars,
-               input_vars,
-               gradients,
-               retain_graph,
-               create_graph,
-               allow_unused);
-
-           c10::impl::GenericList res_list{OptionalType::ofTensor()};
-           for (const at::Tensor& t : res) {
-             res_list.emplace_back(t.defined() ? t : IValue());
-           }
-           push(stack, res_list);
-         },
-         aliasAnalysisFromSchema()),
-     // NB: backward op might write to every input tensors in the graph and it's
-     // much more expensive to analayze the leaves and sometimes it might retain
-     // the whole gradients in every tensor of the Autograd graph with
-     // create_graph=True so we use aliasAnalysisConservative for these two OPs
-     Operator(
-         "aten::backward.TensorList(Tensor[] tensors, Tensor?[]? grad_tensors=None, bool? retain_graph=None, bool create_graph=False) -> ()",
-         [](Stack& stack) {
-           bool create_graph = pop(stack).toBool();
-           auto retain_graph = pop(stack).toOptional<bool>();
-           auto grad_tensors = pop(stack);
-           auto outputs = pop(stack).toTensorList();
-           std::vector<torch::autograd::Variable> output_vars(
-               outputs.begin(), outputs.end());
-           std::vector<torch::autograd::Variable> gradients;
-
-           if (!grad_tensors.isNone()) {
-             for (const IValue& v : grad_tensors.toListRef()) {
-               gradients.emplace_back(v.isNone() ? at::Tensor() : v.toTensor());
-             }
-           }
-
-           torch::autograd::backward(
-               output_vars, gradients, retain_graph, create_graph);
-         },
-         aliasAnalysisConservative()),
-     Operator(
-         "aten::save(t item, str filename) -> ()",
-         [](Stack& stack) {
-           auto filename = pop(stack).toStringRef();
-           auto ivalue = pop(stack);
-
-           // Pickle the tensor
-           auto data = jit::pickle_save(ivalue);
-
-           // Write file
-           std::fstream output(filename, std::ios::out | std::ios::binary);
-           output.write(data.data(), data.size());
-         },
-         aliasAnalysisFromSchema()),
-     Operator(
-         "prim::IgnoredPythonOp(...) -> None",
-         [](Stack& stack) {
-           throw JITException(
-               "This Python function is annotated to be ignored"
-               " and cannot be and has not been included in the exported"
-               " binary, meaning that it cannot be executed now."
-               " Make sure that ignored operations are never executed after"
-               " import");
-         },
-         aliasAnalysisFromSchema()),
-     Operator(
-         "aten::wait(Future(t) self) -> t",
-         [](Stack& stack) {
-           TORCH_CHECK(
-               false, "wait is implemented directly in the interpreter");
-         },
-         aliasAnalysisSpecialCase())});
+RegisterOperators reg({
+    Operator(
+        prim::profile,
+        [](const Node* node) -> Operation {
+          return [](Stack& stack) {
+            AT_ERROR(
+                "Must be lowered to Interpreter's PROFILE instruction"); // NOLINT
+          };
+        },
+        aliasAnalysisSpecialCase()),
+    Operator(
+        prim::profile_ivalue,
+        [](const Node* node) -> Operation {
+          return [](Stack& stack) {
+            AT_ERROR(
+                "Must be lowered to Interpreter's PROFILE instruction"); // NOLINT
+          };
+        },
+        aliasAnalysisSpecialCase()),
+    Operator(
+        prim::FusionGroup,
+        [](const Node* node) -> Operation {
+          const auto key = registerFusion(node);
+          return [key](Stack& stack) {
+            RECORD_FUNCTION("FusionGroup", std::vector<c10::IValue>());
+            runFusion(key, stack);
+          };
+        },
+        aliasAnalysisSpecialCase()),
+    Operator(
+        prim::RequiresGradCheck /* (...)  -> (..., bool) */,
+        [](const Node* node) -> Operation {
+          std::vector<bool> rg_props =
+              fmap(node->tys(attr::types), [](const TypePtr& t) {
+                // if an rg property changes we assume a tensor does require
+                // gradients which is set in `guardDifferentiableGraph`
+                TORCH_INTERNAL_ASSERT(
+                    t->castRaw<TensorType>()->requiresGrad().has_value());
+                return *t->castRaw<TensorType>()->requiresGrad();
+              });
+          return [rg_props](Stack& stack) {
+            auto num_inputs = rg_props.size();
+            // Check every input's shape against profiled (expected) shape.
+            for (const auto i : c10::irange(num_inputs)) {
+              auto& input = peek(stack, i, num_inputs);
+              const auto& t = input.toTensor();
+              if (rg_props[i] != t.requires_grad()) {
+                push(stack, false);
+                return;
+              }
+            }
+
+            push(stack, true);
+          };
+        },
+        aliasAnalysisSpecialCase()),
+    Operator(
+        prim::ConstantChunk,
+        [](const Node* node) -> Operation {
+          int64_t chunks = node->i(attr::chunks);
+          int64_t dim = node->i(attr::dim);
+          auto outputs_used = fmap(node->outputs(), [](const Value* v) {
+            return !v->uses().empty();
+          });
+          return [=](Stack& stack) {
+            RECORD_FUNCTION("chunk", last(stack, 1));
+
+            at::Tensor t;
+            pop(stack, t);
+            auto result = at::chunk(t, chunks, dim);
+            stack.insert(
+                stack.end(),
+                std::make_move_iterator(result.begin()),
+                std::make_move_iterator(result.end()));
+            // NB: Chunk can sometimes return a smaller number of outputs.
+            int64_t num_results = result.size();
+            if (num_results != chunks) {
+              if (num_results > chunks) {
+                TORCH_CHECK(
+                    num_results == chunks,
+                    "Expected chunk to return ",
+                    chunks,
+                    " outputs, but got ",
+                    num_results);
+              }
+              for (const auto i : c10::irange(num_results, chunks)) {
+                TORCH_CHECK(
+                    !outputs_used[i],
+                    "Expected chunk to return at least ",
+                    chunks,
+                    " outputs, but got only ",
+                    num_results);
+                // We know that the output is unused, so it's ok to push
+                // anything on the stack.
+                stack.emplace_back();
+              }
+            }
+          };
+        },
+        aliasAnalysisSpecialCase()),
+    Operator(
+        prim::ChunkSizes,
+        [](const Node* node) -> Operation {
+          int64_t raw_dim = node->i(attr::dim);
+          int64_t chunks = node->i(attr::chunks);
+          return [raw_dim, chunks](Stack& stack) {
+            c10::List<int64_t> shape = pop(stack).toIntList();
+            c10::List<int64_t> regular_shape = shape.copy();
+            c10::List<int64_t> last_shape = shape.copy();
+            int64_t dim = at::maybe_wrap_dim(raw_dim, shape.size());
+            TORCH_CHECK(
+                dim < (int64_t)regular_shape.size(),
+                "Dimension out of range for chunk");
+            int64_t split_size = (regular_shape[dim] + chunks - 1) / chunks;
+            regular_shape[dim] = split_size;
+            if (shape[dim] % chunks == 0) {
+              last_shape[dim] = split_size;
+            } else {
+              int64_t num_splits = std::max<int64_t>(
+                  (shape[dim] + split_size - 1) / split_size, 1);
+              last_shape[dim] =
+                  split_size - (split_size * num_splits - shape[dim]);
+              AT_ASSERT(last_shape[dim] >= 0);
+            }
+            push(stack, std::move(regular_shape));
+            push(stack, std::move(last_shape));
+          };
+        },
+        aliasAnalysisSpecialCase()),
+    Operator(
+        "aten::_grad_sum_to_size(Tensor(a) self, int[]? size) -> Tensor(a)",
+        [](Stack& stack) {
+          RECORD_FUNCTION("_grad_sum_to_size", std::vector<c10::IValue>());
+          IValue self, size;
+          pop(stack, self, size);
+          if (size.isNone()) {
+            push(stack, std::move(self));
+          } else {
+            push(stack, at::sum_to(self.toTensor(), size.toDimVector()));
+          }
+        },
+        aliasAnalysisFromSchema()),
+    // This operator is generated inside the compiler for indexing into
+    // ModuleDict without a statically determinable key. Accordingly,
+    // self must be a ModuleType and the output must be an InterfaceType.
+    OperatorGenerator(
+        TORCH_SELECTIVE_SCHEMA(
+            "prim::ModuleContainerIndex.dict(Any self, str ind) -> Any"),
+        [](Stack& stack) {
+          IValue ind = pop(stack);
+          IValue module_dict = pop(stack);
+          push(stack, module_dict.toModule().attr(ind.toStringRef()));
+        },
+        aliasAnalysisFromSchema()),
+    Operator(
+        prim::TypeCheck /* (...)  -> (..., bool) */,
+        [](const Node* /* node */) -> Operation {
+          return [](Stack& /* stack */) {
+            AT_ERROR("prim::TypeCheck not yet implemented"); // NOLINT
+          };
+        },
+        aliasAnalysisSpecialCase()),
+    Operator(
+        prim::FallbackGraph,
+        [](const Node* node) -> Operation {
+          return [](Stack& stack) {
+            AT_ERROR(
+                "Must be converted to prim::FunctionCall by replaceFallbackGraphWithFallbackFunction"); // NOLINT
+          };
+        },
+        aliasAnalysisSpecialCase()),
+    Operator(
+        "prim::Guard(Tensor(a) t) -> Tensor(a)",
+        [](Stack& stack) { AT_ERROR("Should be replaced by prim::BailOut"); },
+        aliasAnalysisFromSchema()),
+    Operator(
+        "prim::BailOut(...) -> Tensor(a)",
+        [](Stack& /* stack */) {
+          AT_ERROR("prim::BailOut not yet implemented"); // NOLINT
+        },
+        aliasAnalysisFromSchema()),
+    Operator(
+        "prim::BailoutTemplate() -> int",
+        [](Stack& stack) {
+          // TODO: today, we put a single bailout template at the front to
+          // carry the un-optimized graph for bailout nodes to use. Ideally
+          // this should never run, but we haven't written the code to remove
+          // it yet.
+          // TORCH_INTERNAL_ASSERT(false);
+
+          // Returns an int so that we have an easy way to do graph traversal
+          push(stack, 1);
+        },
+        aliasAnalysisFromSchema()),
+    Operator(
+        "aten::grad(Tensor[] outputs, Tensor[] inputs, Tensor?[]? grad_outputs=None, bool? retain_graph=None, bool create_graph=False, bool allow_unused=False) -> Tensor?[]",
+        [](Stack& stack) {
+          bool allow_unused = pop(stack).toBool();
+          bool create_graph = pop(stack).toBool();
+          auto retain_graph = pop(stack).toOptional<bool>();
+          auto grad_outputs = pop(stack);
+          auto inputs = pop(stack).toTensorList();
+          auto outputs = pop(stack).toTensorList();
+          std::vector<torch::autograd::Variable> input_vars(
+              inputs.begin(), inputs.end());
+          std::vector<torch::autograd::Variable> output_vars(
+              outputs.begin(), outputs.end());
+          std::vector<torch::autograd::Variable> gradients;
+
+          if (!grad_outputs.isNone()) {
+            for (const IValue& v : grad_outputs.toListRef()) {
+              gradients.emplace_back(v.isNone() ? at::Tensor() : v.toTensor());
+            }
+          }
+
+          auto res = torch::autograd::grad(
+              output_vars,
+              input_vars,
+              gradients,
+              retain_graph,
+              create_graph,
+              allow_unused);
+
+          c10::impl::GenericList res_list{OptionalType::ofTensor()};
+          for (const at::Tensor& t : res) {
+            res_list.emplace_back(t.defined() ? t : IValue());
+          }
+          push(stack, res_list);
+        },
+        aliasAnalysisFromSchema()),
+    // NB: backward op might write to every input tensors in the graph and it's
+    // much more expensive to analayze the leaves and sometimes it might retain
+    // the whole gradients in every tensor of the Autograd graph with
+    // create_graph=True so we use aliasAnalysisConservative for these two OPs
+    Operator(
+        "aten::backward.TensorList(Tensor[] tensors, Tensor?[]? grad_tensors=None, bool? retain_graph=None, bool create_graph=False) -> ()",
+        [](Stack& stack) {
+          bool create_graph = pop(stack).toBool();
+          auto retain_graph = pop(stack).toOptional<bool>();
+          auto grad_tensors = pop(stack);
+          auto outputs = pop(stack).toTensorList();
+          std::vector<torch::autograd::Variable> output_vars(
+              outputs.begin(), outputs.end());
+          std::vector<torch::autograd::Variable> gradients;
+
+          if (!grad_tensors.isNone()) {
+            for (const IValue& v : grad_tensors.toListRef()) {
+              gradients.emplace_back(v.isNone() ? at::Tensor() : v.toTensor());
+            }
+          }
+
+          torch::autograd::backward(
+              output_vars, gradients, retain_graph, create_graph);
+        },
+        aliasAnalysisConservative()),
+    Operator(
+        "aten::save(t item, str filename) -> ()",
+        [](Stack& stack) {
+          auto filename = pop(stack).toStringRef();
+          auto ivalue = pop(stack);
+
+          // Pickle the tensor
+          auto data = jit::pickle_save(ivalue);
+
+          // Write file
+          std::fstream output(filename, std::ios::out | std::ios::binary);
+          output.write(data.data(), data.size());
+        },
+        aliasAnalysisFromSchema()),
+    Operator(
+        "prim::IgnoredPythonOp(...) -> None",
+        [](Stack& stack) {
+          throw JITException(
+              "This Python function is annotated to be ignored"
+              " and cannot be and has not been included in the exported"
+              " binary, meaning that it cannot be executed now."
+              " Make sure that ignored operations are never executed after"
+              " import");
+        },
+        aliasAnalysisFromSchema()),
+    Operator(
+        "aten::wait(Future(t) self) -> t",
+        [](Stack& stack) {
+          TORCH_CHECK(false, "wait is implemented directly in the interpreter");
+        },
+        aliasAnalysisSpecialCase()),
+    Operator(
+        "prim::awaitable_wait(Await(t) self) -> t",
+        [](Stack& stack) {
+          auto aw = stack.back().toAwait();
+          aw->wait();
+          stack.pop_back();
+          stack.emplace_back(aw->value());
+        },
+        aliasAnalysisSpecialCase()),
+    Operator(
+        "prim::awaitable_nowait(t self) -> Await(t)",
+        [](Stack& stack) {
+          auto aw =
+              c10::make_intrusive<c10::ivalue::Await>(stack.back().type());
+          aw->markCompleted(pop(stack));
+          push(stack, std::move(aw));
+        },
+        aliasAnalysisSpecialCase()),
+});
 
 RegisterOperators logging_operators(
     {Operator(
diff --git a/torch/csrc/jit/runtime/register_special_ops.cpp b/torch/csrc/jit/runtime/register_special_ops.cpp
index 939370b19693..b25ea60abd1c 100644
--- a/torch/csrc/jit/runtime/register_special_ops.cpp
+++ b/torch/csrc/jit/runtime/register_special_ops.cpp
@@ -68,7 +68,7 @@ std::vector<int64_t> compute_sizes(const IValue& seq) {
   auto seq_recur = seq.toList();
   while (true) {
     sizes.push_back(seq_recur.size());
-    if (seq_recur.size() == 0 || !seq_recur.get(0).isList()) {
+    if (seq_recur.empty() || !seq_recur.get(0).isList()) {
       break;
     }
     seq_recur = seq_recur.get(0).toList();
diff --git a/torch/csrc/jit/runtime/static/fusion.cpp b/torch/csrc/jit/runtime/static/fusion.cpp
index b2a52641458c..4f5468b243dd 100644
--- a/torch/csrc/jit/runtime/static/fusion.cpp
+++ b/torch/csrc/jit/runtime/static/fusion.cpp
@@ -297,7 +297,7 @@ void createFusionGroups(Block* block, AliasDb* aliasDb, size_t min_size) {
   }
 
   Node* prev_fusion_group =
-      initial_fusion_groups.size() ? initial_fusion_groups[0] : nullptr;
+      !initial_fusion_groups.empty() ? initial_fusion_groups[0] : nullptr;
 
   for (const auto i : c10::irange(1, initial_fusion_groups.size())) {
     // Try merging the just created fusion group into the previous one.
diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index 3f87df14f555..c371953cda76 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -955,7 +955,7 @@ void BlockRunner::set_inputs(
 
   const auto& schema_args = schema->arguments();
   size_t consumed_kwargs = 0;
-  DCHECK(schema_args.size() > 0);
+  DCHECK(!schema_args.empty());
   TORCH_CHECK(
       args.size() < schema_args.size(),
       "Static runtime got too many arguments");
@@ -1375,8 +1375,7 @@ void BlockRunner::benchmark(
     const int main_runs,
     bool print_per_node_time,
     bool generate_ai_pep_output) {
-  TORCH_CHECK(
-      kwargs_list.size() == 0 || args_list.size() == kwargs_list.size());
+  TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size());
   std::cout << "Input size: " << args_list.size() << std::endl;
   float time_per_iter =
       benchmark_model(args_list, kwargs_list, warmup_runs, main_runs);
@@ -1397,7 +1396,7 @@ void BlockRunner::benchmark(
 
   std::vector<std::pair<std::string, double>> time_per_node_type_vec{
       results.time_per_node_type.begin(), results.time_per_node_type.end()};
-  if (args_list.size() == 0) {
+  if (args_list.empty()) {
     std::sort(
         time_per_node_type_vec.begin(),
         time_per_node_type_vec.end(),
@@ -1497,10 +1496,9 @@ float BlockRunner::benchmark_model(
     const int warmup_runs,
     const int main_runs) {
   TORCH_CHECK(warmup_runs >= 0 && main_runs >= 1);
-  TORCH_CHECK(
-      kwargs_list.size() == 0 || args_list.size() == kwargs_list.size());
+  TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size());
 
-  const bool is_kwargs_empty = kwargs_list.size() == 0;
+  const bool is_kwargs_empty = kwargs_list.empty();
   const KeywordArgs empty_kwargs;
   for (const auto i : c10::irange(warmup_runs)) {
     (void)i; // Suppress unused variable warning
@@ -1599,13 +1597,12 @@ BlockRunner::IndividualMetrics BlockRunner::benchmark_individual_ops(
     const std::vector<KeywordArgs>& kwargs_list,
     const int warmup_runs,
     const int main_runs) {
-  TORCH_CHECK(
-      kwargs_list.size() == 0 || args_list.size() == kwargs_list.size());
+  TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size());
   TORCH_CHECK(warmup_runs >= 1 && main_runs >= 1);
 
   IndividualMetrics results;
   results.time_per_node.resize(nodes_.size(), 0);
-  if (args_list.size() == 0) {
+  if (args_list.empty()) {
     // When the given input is empty, compute the op statistics from the given
     // graph without executing it.
     for (const auto i : c10::irange(nodes_.size())) {
@@ -1634,7 +1631,7 @@ BlockRunner::IndividualMetrics BlockRunner::benchmark_individual_ops(
     return results;
   }
 
-  const bool is_kwargs_empty = kwargs_list.size() == 0;
+  const bool is_kwargs_empty = kwargs_list.empty();
   const KeywordArgs empty_kwargs;
   bool manage_output_tensors = static_module_.opts().manage_output_tensors;
   // See comment on above use of InferenceMode for
diff --git a/torch/csrc/jit/runtime/static/memory_planner.cpp b/torch/csrc/jit/runtime/static/memory_planner.cpp
index 3b3e69d97022..e8b0fb6a3840 100644
--- a/torch/csrc/jit/runtime/static/memory_planner.cpp
+++ b/torch/csrc/jit/runtime/static/memory_planner.cpp
@@ -1,3 +1,4 @@
+#include <c10/core/alignment.h>
 #include <torch/csrc/jit/runtime/static/memory_planner.h>
 
 #include <ATen/Tensor.h>
diff --git a/torch/csrc/jit/runtime/static/native_ops.cpp b/torch/csrc/jit/runtime/static/native_ops.cpp
index 1c8fb0791389..d82ad3c2c0ec 100644
--- a/torch/csrc/jit/runtime/static/native_ops.cpp
+++ b/torch/csrc/jit/runtime/static/native_ops.cpp
@@ -1293,7 +1293,7 @@ REGISTER_NATIVE_OPERATOR_FUNCTOR(
       if (!sr_schema_check(n, "aten::format(str self, ...) -> str")) {
         return nullptr;
       }
-      TORCH_CHECK(n->inputs().size() > 0);
+      TORCH_CHECK(!n->inputs().empty());
       return [](ProcessedNode* pnode) {
         const auto num_inputs = pnode->num_inputs();
         auto stack = boxInputs(*pnode);
@@ -1485,7 +1485,7 @@ REGISTER_NATIVE_OPERATOR_FUNCTOR(
         const auto& tensor = pnode->Input(0).toTensor();
         // JIT does a check for requires_grad, but we skip it here since SR is
         // inference only
-        if (tensor.sizes().size() != 0) {
+        if (!tensor.sizes().empty()) {
           throw std::runtime_error(
               "Cannot convert a tensor of dimension > 0 to scalar");
         }
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index e2a154ad069e..679b28a822bc 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -885,7 +885,7 @@ SROperator aten_stack(Node* n) {
   }
   return [](ProcessedNode* p_node) {
     const auto inputs = p_node->Input(0).toTensorVector();
-    TORCH_CHECK(inputs.size() > 0, "stack expects non-empty tensor list");
+    TORCH_CHECK(!inputs.empty(), "stack expects non-empty tensor list");
     const auto dim = p_node->Input(1).toInt();
     if (p_node->Output(0).isNone()) {
       p_node->Output(0) = at::native::_stack_cpu(inputs, dim);
@@ -2617,7 +2617,7 @@ REGISTER_OPERATOR_FUNCTOR(aten::cat, aten_cat, [](Node* n) -> SROperator {
   }
   return [](ProcessedNode* p_node) {
     const auto inputs = p_node->Input(0).toTensorVector();
-    TORCH_CHECK(inputs.size() > 0, "concat expects non-empty tensor list");
+    TORCH_CHECK(!inputs.empty(), "concat expects non-empty tensor list");
     const auto dim = p_node->Input(1).toInt();
     if (p_node->Output(0).isNone()) {
       p_node->Output(0) = at::cpu::cat(inputs, dim);
diff --git a/torch/csrc/jit/runtime/static/ops.h b/torch/csrc/jit/runtime/static/ops.h
index 9de4e45ddef3..8b993e87fb35 100644
--- a/torch/csrc/jit/runtime/static/ops.h
+++ b/torch/csrc/jit/runtime/static/ops.h
@@ -32,7 +32,7 @@ struct SROperatorFunctor {
   virtual ~SROperatorFunctor() = default;
 };
 
-C10_DECLARE_REGISTRY(SROperatorRegistry, SROperatorFunctor);
+TORCH_DECLARE_REGISTRY(SROperatorRegistry, SROperatorFunctor);
 
 #define REGISTER_OPERATOR_FUNCTOR(name, id, ...)             \
   struct SROperatorFunctor_##id : public SROperatorFunctor { \
@@ -43,7 +43,7 @@ C10_DECLARE_REGISTRY(SROperatorRegistry, SROperatorFunctor);
   };                                                         \
   C10_REGISTER_CLASS(SROperatorRegistry, name, SROperatorFunctor_##id);
 
-C10_DECLARE_REGISTRY(SRNativeOperatorRegistry, SROperatorFunctor);
+TORCH_DECLARE_REGISTRY(SRNativeOperatorRegistry, SROperatorFunctor);
 #define REGISTER_NATIVE_OPERATOR_FUNCTOR(name, id, ...)            \
   struct SRNativeOperatorFunctor_##id : public SROperatorFunctor { \
     const SROpFunctor fn = __VA_ARGS__;                            \
diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp
index 9c44266f6ed1..a3875e09650f 100644
--- a/torch/csrc/jit/runtime/static/passes.cpp
+++ b/torch/csrc/jit/runtime/static/passes.cpp
@@ -505,7 +505,7 @@ std::vector<TupleUnpackBlock> CollectVariadicTupleUnpackFusionCandidates(
 }
 
 void FuseTupleUnpackBlock(const TupleUnpackBlock& nodes) {
-  TORCH_CHECK(nodes.size() > 0);
+  TORCH_CHECK(!nodes.empty());
   auto graph = nodes[0]->owningGraph();
   auto var_unpack = graph->create(
       fromQualString("static_runtime::VarTupleUnpack"),
@@ -987,7 +987,7 @@ void RemoveImmutableInputDictLookups(
     }
     iter->second.push_back(getitem_node);
   }
-  if (keys.size() == 0) {
+  if (keys.empty()) {
     return;
   }
   // Move all keys to the beginning of the graph and insert new dict_unpack
@@ -996,7 +996,7 @@ void RemoveImmutableInputDictLookups(
   graph->prependNode(marker);
   graph->setInsertPoint(marker);
   for (Node* key : keys) {
-    DCHECK(key->inputs().size() == 0);
+    DCHECK(key->inputs().empty());
     key->moveBefore(marker);
   }
   const c10::Symbol static_runtime_dict_unpack_symbol =
@@ -1004,7 +1004,7 @@ void RemoveImmutableInputDictLookups(
   for (auto& it : dict_to_getitems) {
     Value* dict = it.first;
     std::vector<Node*>& getitems = it.second;
-    DCHECK(getitems.size() > 0);
+    DCHECK(!getitems.empty());
     auto* dict_unpack =
         graph->create(static_runtime_dict_unpack_symbol, getitems.size());
     graph->insertNode(dict_unpack);
diff --git a/torch/csrc/jit/runtime/symbolic_script.cpp b/torch/csrc/jit/runtime/symbolic_script.cpp
index 73753157795c..f4c0a44e0fae 100644
--- a/torch/csrc/jit/runtime/symbolic_script.cpp
+++ b/torch/csrc/jit/runtime/symbolic_script.cpp
@@ -90,7 +90,7 @@ const std::vector<std::string> functions = {
                 i = 0
             return i
 
-        def AD_var_backward_0(grad, self, correction: int):
+        def AD_var_backward_0(grad, self, correction: number):
             # FIXME: torchscript: div(float, float)
             return  grad * (self - self.mean()) * 2.0 / (self.numel() - correction)
 
@@ -115,7 +115,7 @@ const std::vector<std::string> functions = {
         def AD_var_backward_1(grad,
                               self,
                               dim: List[int],
-                              correction: int,
+                              correction: number,
                               keepdim: bool):
             if self.dim() == 0:
                 return AD_var_backward_0(grad, self, correction)
@@ -129,7 +129,7 @@ const std::vector<std::string> functions = {
         def AD_var_backward_2(grad,
                               self,
                               dim: Optional[List[int]],
-                              correction: Optional[int],
+                              correction: Optional[number],
                               keepdim: bool):
             if correction is None:
                 correction = 1
@@ -163,7 +163,7 @@ const std::vector<std::string> functions = {
         def std_2(self,
                   dim: Optional[List[int]],
                   *,
-                  correction: Optional[int],
+                  correction: Optional[number],
                   keepdim: bool):
             std_out = torch.std(self, dim, correction=correction, keepdim=keepdim)
             def backward(grad_output):
@@ -195,7 +195,7 @@ const std::vector<std::string> functions = {
         def var_2(self,
                   dim: Optional[List[int]],
                   *,
-                  correction: Optional[int],
+                  correction: Optional[number],
                   keepdim: bool):
             def backward(grad_output):
                 grad_self = AD_var_backward_2(grad_output, self, dim, correction, keepdim)
@@ -1617,7 +1617,7 @@ void loadFunctions() {
 c10::optional<GradientPair> gradientInfoForSchema(
     const FunctionSchema& schema) {
   std::lock_guard<std::mutex> guard(lock);
-  if (schema_to_graphs.size() == 0) {
+  if (schema_to_graphs.empty()) {
     loadFunctions();
   }
   auto cache_it = cached_gradient_pairs.find(&schema);
diff --git a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp
index 52dcb2ff391a..c17e6557afe9 100644
--- a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp
+++ b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp
@@ -381,7 +381,7 @@ void loadFunctions() {
 c10::optional<std::shared_ptr<Graph>> shapeComputeGraphForSchema(
     const FunctionSchema& schema) {
   std::lock_guard<std::mutex> guard(lock);
-  if (cached_schema_to_graph.size() == 0) {
+  if (cached_schema_to_graph.empty()) {
     loadFunctions();
   }
 
@@ -398,7 +398,7 @@ c10::optional<std::shared_ptr<Graph>> shapeComputeGraphForSchema(
 TORCH_API c10::optional<BoundedShapeGraphs> boundedGraphsForSchema(
     const FunctionSchema& schema) {
   std::lock_guard<std::mutex> guard(lock);
-  if (cached_bounded_schema_to_graph.size() == 0) {
+  if (cached_bounded_schema_to_graph.empty()) {
     loadFunctions();
   }
   GRAPH_DEBUG("Trying to find schema in bounded graphs: ", schema);
@@ -414,7 +414,7 @@ void RegisterShapeComputeGraphForSchema(
     const FunctionSchema& schema,
     std::shared_ptr<Graph> g) {
   std::lock_guard<std::mutex> guard(lock);
-  if (cached_schema_to_graph.size() == 0) {
+  if (cached_schema_to_graph.empty()) {
     loadFunctions();
   }
   transformShapeFunction(&schema, g);
@@ -425,7 +425,7 @@ void RegisterShapeComputeGraphForSchema(
 
 std::vector<const FunctionSchema*> RegisteredShapeComputeSchemas() {
   std::lock_guard<std::mutex> guard(lock);
-  if (cached_schema_to_graph.size() == 0) {
+  if (cached_schema_to_graph.empty()) {
     loadFunctions();
   }
 
diff --git a/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp b/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp
index 4d37da535481..7b10e0428a8f 100644
--- a/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp
+++ b/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp
@@ -115,7 +115,7 @@ std::vector<char> CallStackDebugInfoPickler::pickle(
   std::vector<at::Tensor> table;
   c10::IValue ivalue = c10::ivalue::Tuple::create(std::move(ivalues));
   auto result = jit::pickle(ivalue, &table);
-  TORCH_CHECK(table.size() == 0, "Expected 0 tensors to be written");
+  TORCH_CHECK(table.empty(), "Expected 0 tensors to be written");
   return result;
 }
 
diff --git a/torch/csrc/jit/serialization/export.cpp b/torch/csrc/jit/serialization/export.cpp
index 5eaa4cd26ad9..fe240c51d086 100644
--- a/torch/csrc/jit/serialization/export.cpp
+++ b/torch/csrc/jit/serialization/export.cpp
@@ -59,7 +59,7 @@ namespace onnx_torch = ::torch::onnx;
 namespace onnx = ::ONNX_NAMESPACE;
 
 const static int kInvalidOpsetVersion = -1;
-const static int kMainOpsetVersion = 17;
+const static int kMainOpsetVersion = 18;
 // Based on OP_SET_ID_VERSION_MAP in
 // https://github.com/onnx/onnx/blob/master/onnx/helper.py.
 constexpr static std::array<int64_t, kMainOpsetVersion + 1>
@@ -82,6 +82,7 @@ constexpr static std::array<int64_t, kMainOpsetVersion + 1>
         8, // opset 15
         8, // opset 16
         8, // opset 17
+        8, // opset 18
 };
 
 std::string getNodeStackTraceString(const Node* n) {
@@ -1380,16 +1381,36 @@ std::string serialize_model_proto_to_string(
   return model_proto->SerializeAsString();
 }
 
-void check_onnx_proto(const std::string& proto_string, bool full_check) {
+void check_onnx_proto(const std::string& proto_string) {
   onnx::ModelProto model;
   if (!ParseProtoFromBytes(&model, proto_string.c_str(), proto_string.size())) {
     throw std::runtime_error("Invalid ONNX proto string.");
     return;
   }
+  // 1. baseline check
+  // These two checks prevent broken graph being generated
+  // And errors out exporting if that happens.
   onnx::checker::check_model(model);
-
-  if (full_check) {
-    onnx::shape_inference::InferShapes(model);
+  onnx::shape_inference::InferShapes(model);
+  // 2. full check
+  // apply strict mode shape type inference check which examines
+  // whether it's a valid ONNX graph or not. As for some users, they
+  // don't need a fully valid ONNX graph to run their model, we simply
+  // add this information as warning message if it fails.
+  try {
+    auto* schema_registry = onnx::OpSchemaRegistry::Instance();
+    onnx::ShapeInferenceOptions options{
+        /*check_type=*/true,
+        /*error_mode=*/true};
+    onnx::shape_inference::InferShapes(model, schema_registry, options);
+  } catch (const onnx::InferenceError& ex) {
+    TORCH_WARN(
+        "The exported ONNX model failed ONNX shape inference."
+        "The model will not be executable by the ONNX Runtime."
+        "If this is unintended and you believe there is a bug,"
+        "please report an issue at https://github.com/pytorch/pytorch/issues."
+        "Error reported by strict ONNX shape inference: ",
+        ex.what());
   }
 }
 
diff --git a/torch/csrc/jit/serialization/export.h b/torch/csrc/jit/serialization/export.h
index da5d5e6a7095..3a56cfc7788f 100644
--- a/torch/csrc/jit/serialization/export.h
+++ b/torch/csrc/jit/serialization/export.h
@@ -64,9 +64,7 @@ export_onnx(
 TORCH_API std::string serialize_model_proto_to_string(
     const std::shared_ptr<::ONNX_NAMESPACE::ModelProto>& model_proto);
 
-TORCH_API void check_onnx_proto(
-    const std::string& proto_string,
-    bool full_check = false);
+TORCH_API void check_onnx_proto(const std::string& proto_string);
 
 // Serializer for both oldsyle and unified format TorchScript serialization
 class TORCH_API ScriptModuleSerializer {
@@ -96,7 +94,8 @@ class TORCH_API ScriptModuleSerializer {
       const std::string& archive_name,
       const std::string& archive_dir,
       const std::string& tensor_dir,
-      bool use_storage_context = false);
+      bool use_storage_context = false,
+      bool skip_tensor_data = false);
   void updateSourceRangeTags(const SourceRangeRecords& ranges);
 
   caffe2::serialize::PyTorchStreamWriter& writer_;
diff --git a/torch/csrc/jit/serialization/export_module.cpp b/torch/csrc/jit/serialization/export_module.cpp
index 182803aa91e8..79ecda76d0e2 100644
--- a/torch/csrc/jit/serialization/export_module.cpp
+++ b/torch/csrc/jit/serialization/export_module.cpp
@@ -481,6 +481,15 @@ void ScriptModuleSerializer::serialize(
         /*archive_dir=*/"",
         /*tensor_dir=*/"constants/");
   }
+  if (module.retrieve_traced_inputs().size() > 0) {
+    writeArchive(
+        module.retrieve_traced_inputs(),
+        /*archive_name=*/"traced_inputs",
+        /*archive_dir=*/"",
+        /*tensor_dir=*/"traced_inputs/",
+        /*use_storage_context*/ false,
+        /*skip_tensor_data*/ true);
+  }
   // Acquires and sets minimum (dynamic) version
   for (auto& item : file_streams_) {
     writer_.setMinVersion(item.value().minVersion());
@@ -492,7 +501,8 @@ void ScriptModuleSerializer::writeArchive(
     const std::string& archive_name,
     const std::string& archive_dir,
     const std::string& tensor_dir,
-    bool use_storage_context) {
+    bool use_storage_context,
+    bool skip_tensor_data) {
   std::vector<char> data;
   // Vector to capture the run-time class types during pickling the IValues
   std::vector<c10::ClassTypePtr> memoizedClassTypes;
@@ -539,7 +549,7 @@ void ScriptModuleSerializer::writeArchive(
 
   for (const auto& td : data_pickle.tensorData()) {
     std::string tensor_name = tensor_names[i++];
-    if (td.is_meta()) {
+    if (td.is_meta() || skip_tensor_data) {
       writer_.writeRecord(tensor_dir + tensor_name, nullptr, 0);
       continue;
     }
diff --git a/torch/csrc/jit/serialization/import.cpp b/torch/csrc/jit/serialization/import.cpp
index b9884192eeaa..9f2404120893 100644
--- a/torch/csrc/jit/serialization/import.cpp
+++ b/torch/csrc/jit/serialization/import.cpp
@@ -22,10 +22,12 @@
 #include <torch/csrc/jit/serialization/import_legacy.h>
 #endif
 #include <torch/csrc/jit/frontend/script_type_parser.h>
+#include <torch/csrc/jit/ir/graph_utils.h>
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/mobile/file_format.h>
 #include <torch/csrc/jit/mobile/flatbuffer_loader.h>
 #include <torch/csrc/jit/operator_upgraders/upgraders_entry.h>
+#include <torch/csrc/jit/passes/shape_analysis.h>
 #include <torch/csrc/jit/passes/subgraph_rewrite.h>
 #include <torch/csrc/jit/serialization/import_read.h>
 #include <torch/csrc/jit/serialization/import_source.h>
@@ -122,7 +124,8 @@ class ScriptModuleDeserializer final {
 
   Module deserialize(
       c10::optional<at::Device> device,
-      ExtraFilesMap& extra_files);
+      ExtraFilesMap& extra_files,
+      bool restore_shapes = false);
 
  private:
   IValue readArchive(const std::string& archive_name);
@@ -251,7 +254,8 @@ graph(%x, %packed_params, %stride, %padding, %dilation, %groups, %r_scale, %r_ze
 
 Module ScriptModuleDeserializer::deserialize(
     c10::optional<at::Device> device,
-    ExtraFilesMap& extra_files) {
+    ExtraFilesMap& extra_files,
+    bool restore_shapes) {
   // we populate the upgraders map before any load starts
   populate_upgraders_graph_map();
 
@@ -280,8 +284,31 @@ Module ScriptModuleDeserializer::deserialize(
   for (auto constant : tuple->elements()) {
     constants_table_.push_back(constant.toIValue());
   }
-  auto m = Module(readArchive("data").toObject());
+  auto m_ivalue = readArchive("data");
+  auto m = Module(m_ivalue.toObject());
   rewriteQuantizedConvForBC(m);
+  // Checking for and loading saved traced inputs
+  if (restore_shapes && reader_->hasRecord("traced_inputs.pkl")) {
+    auto dict = readArchive("traced_inputs").toGenericDict();
+    for (const auto& entry : dict) {
+      auto inputs = entry.value().toList().vec();
+      auto g =
+          toGraphFunction(m.get_method(entry.key().toStringRef()).function())
+              .graph();
+      Stack stack(inputs.begin(), inputs.end());
+      // Added the module as the first input if we are missing
+      // an input as traced modules refer to self as an additional input
+      if (g->inputs().size() == stack.size() + 1) {
+        stack.insert(stack.begin(), m_ivalue);
+      }
+      setInputTensorTypes(*g, stack, /*complete=*/true);
+      PropagateInputShapes(g);
+    }
+  } else {
+    if (restore_shapes) {
+      TORCH_WARN("Cannot restore shapes as no traced inputs were stored");
+    }
+  }
   return m;
 }
 } // namespace
@@ -301,7 +328,8 @@ static Module _load_jit_module_from_bytes(
     size_t size,
     std::shared_ptr<CompilationUnit> cu,
     c10::optional<c10::Device> device,
-    ExtraFilesMap& extra_files);
+    ExtraFilesMap& extra_files,
+    bool restore_shapes);
 
 Module parse_and_initialize_jit_module(
     std::shared_ptr<char> data,
@@ -346,7 +374,8 @@ Module import_ir_module(
     std::istream& in,
     c10::optional<at::Device> device,
     ExtraFilesMap& extra_files,
-    bool load_debug_files) {
+    bool load_debug_files,
+    bool restore_shapes) {
   in.seekg(0, in.beg);
   // NOTE: Zipformat can be large files. So using stream version directly
   // instead of reading the file all at once.
@@ -354,12 +383,13 @@ Module import_ir_module(
     auto reader = torch::make_unique<PyTorchStreamReader>(&in);
     reader->setShouldLoadDebugSymbol(load_debug_files);
     ScriptModuleDeserializer deserializer(std::move(cu), std::move(reader));
-    return deserializer.deserialize(device, extra_files);
+    return deserializer.deserialize(device, extra_files, restore_shapes);
   }
   std::shared_ptr<char> data;
   size_t size = 0;
   std::tie(data, size) = get_stream_content(in);
-  return _load_jit_module_from_bytes(data, size, cu, device, extra_files);
+  return _load_jit_module_from_bytes(
+      data, size, cu, device, extra_files, restore_shapes);
 }
 
 // For reading unified serialization format from torch.Package.
@@ -394,19 +424,21 @@ Module import_ir_module(
     const std::string& filename,
     c10::optional<at::Device> device,
     ExtraFilesMap& extra_files,
-    bool load_debug_files) {
+    bool load_debug_files,
+    bool restore_shapes) {
   // NOTE: Zipformat can be large files. So using stream version directly
   // instead of reading the file all at once.
   if (getFileFormat(filename) != FileFormat::FlatbufferFileFormat) {
     auto reader = torch::make_unique<PyTorchStreamReader>(filename);
     reader->setShouldLoadDebugSymbol(load_debug_files);
     ScriptModuleDeserializer deserializer(std::move(cu), std::move(reader));
-    return deserializer.deserialize(device, extra_files);
+    return deserializer.deserialize(device, extra_files, restore_shapes);
   }
   std::shared_ptr<char> data;
   size_t size = 0;
   std::tie(data, size) = get_file_content(filename.c_str());
-  return _load_jit_module_from_bytes(data, size, cu, device, extra_files);
+  return _load_jit_module_from_bytes(
+      data, size, cu, device, extra_files, restore_shapes);
 }
 
 Module import_ir_module(
@@ -503,7 +535,8 @@ Module _load_jit_module_from_bytes(
     size_t size,
     std::shared_ptr<CompilationUnit> cu,
     c10::optional<c10::Device> device,
-    ExtraFilesMap& extra_files) {
+    ExtraFilesMap& extra_files,
+    bool restore_shapes) {
   TORCH_CHECK(size >= kFileFormatHeaderSize, "Unrecognized data format");
   auto format = getFileFormat(data.get());
   switch (format) {
@@ -514,7 +547,7 @@ Module _load_jit_module_from_bytes(
       auto rai = std::make_unique<MemoryReadAdapter>(data.get(), size);
       auto reader = torch::make_unique<PyTorchStreamReader>(std::move(rai));
       ScriptModuleDeserializer deserializer(std::move(cu), std::move(reader));
-      return deserializer.deserialize(device, extra_files);
+      return deserializer.deserialize(device, extra_files, restore_shapes);
     }
 
     default:
diff --git a/torch/csrc/jit/serialization/import.h b/torch/csrc/jit/serialization/import.h
index 0de47d95a4e6..61b96222f6f7 100644
--- a/torch/csrc/jit/serialization/import.h
+++ b/torch/csrc/jit/serialization/import.h
@@ -40,7 +40,8 @@ TORCH_API Module import_ir_module(
     const std::string& filename,
     c10::optional<c10::Device> device,
     ExtraFilesMap& extra_files,
-    bool load_debug_files = true);
+    bool load_debug_files = true,
+    bool restore_shapes = false);
 
 // For reading unified serialization format from torch.Package
 TORCH_API Module import_ir_module(
@@ -55,7 +56,8 @@ TORCH_API Module import_ir_module(
     std::istream& in,
     c10::optional<c10::Device> device,
     ExtraFilesMap& extra_files,
-    bool load_debug_files = true);
+    bool load_debug_files = true,
+    bool restore_shapes = false);
 
 TORCH_API Module import_ir_module(
     std::shared_ptr<CompilationUnit> cu,
diff --git a/torch/csrc/jit/serialization/import_read.cpp b/torch/csrc/jit/serialization/import_read.cpp
index 7c85096962d4..533fed491773 100644
--- a/torch/csrc/jit/serialization/import_read.cpp
+++ b/torch/csrc/jit/serialization/import_read.cpp
@@ -33,7 +33,7 @@ IValue readArchiveAndTensors(
   };
 
   std::string tensor_dir_path =
-      (tensor_prefix != "") ? tensor_prefix : archive_name + "/";
+      (!tensor_prefix.empty()) ? tensor_prefix : archive_name + "/";
 
   auto read_record = [&](const std::string& name) {
     std::string ss = tensor_dir_path + name;
diff --git a/torch/csrc/jit/serialization/import_source.cpp b/torch/csrc/jit/serialization/import_source.cpp
index 723fd3752053..4c78e09040f3 100644
--- a/torch/csrc/jit/serialization/import_source.cpp
+++ b/torch/csrc/jit/serialization/import_source.cpp
@@ -121,6 +121,7 @@ SourceImporterImpl::SourceImporterImpl(
       // actual value
       {"CONSTANTS", std::make_shared<ConstantTableValue>(constant_table)},
       {"fork", SpecialFormValue::create(prim::fork)},
+      {"awaitable", SpecialFormValue::create(prim::awaitable)},
       {"annotate", SpecialFormValue::create(prim::annotate)},
       {"unchecked_cast", SpecialFormValue::create(prim::unchecked_cast)},
       {"uninitialized", SpecialFormValue::create(prim::Uninitialized)},
@@ -154,7 +155,7 @@ Function* SourceImporterImpl::findFunction(const QualifiedName& name) {
 
 void SourceImporterImpl::parseSourceIfNeeded(const std::string& qualifier) {
   // qualifier may be blank, for instance checking if __torch__ is a class.
-  if (qualifier == "" || loaded_sources_.count(qualifier)) {
+  if (qualifier.empty() || loaded_sources_.count(qualifier)) {
     return;
   }
   loaded_sources_.insert(qualifier);
diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp
index 90a0a271a9d2..1ecdaf2a7d77 100644
--- a/torch/csrc/jit/serialization/pickler.cpp
+++ b/torch/csrc/jit/serialization/pickler.cpp
@@ -125,7 +125,7 @@ void Pickler::pushIValueImpl(const IValue& ivalue) {
   } else if (ivalue.isCapsule()) {
     std::stringstream err;
     err << "Cannot serialize custom bound C++ class";
-    if (memoized_class_types_ && memoized_class_types_->size()) {
+    if (memoized_class_types_ && !memoized_class_types_->empty()) {
       if (auto qualname = memoized_class_types_->back()->name()) {
         err << " " << qualname->qualifiedName();
       }
@@ -425,7 +425,6 @@ void Pickler::pushLiteralTensor(const IValue& ivalue) {
       "torch._utils", quantized ? "_rebuild_qtensor" : "_rebuild_tensor_v2");
 
   push<PickleOpCode>(PickleOpCode::MARK);
-
   pushStorageOfTensor(tensor);
 
   // storage offset
diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp
index 6042379180d4..12a67d0a9e38 100644
--- a/torch/csrc/jit/serialization/python_print.cpp
+++ b/torch/csrc/jit/serialization/python_print.cpp
@@ -29,7 +29,7 @@ static bool isValidIdentifierChar(char c, size_t pos) {
 }
 
 static bool isValidIdentifier(const std::string& name) {
-  if (name.size() == 0)
+  if (name.empty())
     return false;
   for (const auto i : c10::irange(name.size())) {
     if (!isValidIdentifierChar(name[i], i))
@@ -146,11 +146,11 @@ struct PythonPrintImpl {
       // This prevents having redundant entries at the same offset,
       // which can happen for example in printValueList when begin
       // and end are the empty string.
-      if (s.size() == 0) {
+      if (s.empty()) {
         return *this;
       }
 
-      if (!ranges_.size() || ranges_.back().range != srs_->back()) {
+      if (ranges_.empty() || ranges_.back().range != srs_->back()) {
         ranges_.emplace_back((size_t)oss_.tellp(), srs_->back());
       }
       oss_ << s;
@@ -159,7 +159,7 @@ struct PythonPrintImpl {
 
     TaggedStringStream& operator<<(const TaggedStringStream& rhs) {
       for (const auto& range : rhs.ranges_) {
-        if (!ranges_.size() || ranges_.back().range != range.range) {
+        if (ranges_.empty() || ranges_.back().range != range.range) {
           ranges_.emplace_back((size_t)oss_.tellp() + range.bytes, range.range);
         }
       }
@@ -178,7 +178,7 @@ struct PythonPrintImpl {
 
     template <typename T>
     TaggedStringStream& operator<<(const T& t) {
-      if (!ranges_.size() || ranges_.back().range != srs_->back()) {
+      if (ranges_.empty() || ranges_.back().range != srs_->back()) {
         ranges_.emplace_back((size_t)oss_.tellp(), srs_->back());
       }
       oss_ << t;
@@ -236,7 +236,7 @@ struct PythonPrintImpl {
     if (v->hasDebugName() && use.user->kind() != prim::Return)
       return false;
     // don't try to inline control blocks
-    if (n->blocks().size() != 0)
+    if (!n->blocks().empty())
       return false;
     // if it is a loop-carried input, we need a variable
     // otherwise the condition or trip count may be emitted in the wrong order
@@ -375,7 +375,7 @@ struct PythonPrintImpl {
   // force them to be by rewriting them
   static std::string makeValidIdentifier(const std::string& candidate) {
     std::stringstream ss;
-    if (candidate.size() == 0 || isdigit(candidate[0]))
+    if (candidate.empty() || isdigit(candidate[0]))
       ss << "_";
     for (char c : candidate) {
       if (isupper(c) || islower(c) || isdigit(c) || c == '_')
@@ -510,7 +510,7 @@ struct PythonPrintImpl {
   }
 
   void printAssignment(at::ArrayRef<Value*> lhs, at::ArrayRef<Value*> rhs) {
-    if (lhs.size() == 0) {
+    if (lhs.empty()) {
       return;
     }
     indent();
@@ -561,13 +561,13 @@ struct PythonPrintImpl {
     {
       auto guard = WithIndented();
       // Print node contents
-      printBlock(stmt.thenBlock(), stmt.outputs().size() > 0);
+      printBlock(stmt.thenBlock(), !stmt.outputs().empty());
       printAssignment(stmt.outputs(), stmt.thenOutputs());
     }
     indent() << "else:\n";
     {
       auto guard = WithIndented();
-      printBlock(stmt.elseBlock(), stmt.outputs().size() > 0);
+      printBlock(stmt.elseBlock(), !stmt.outputs().empty());
       printAssignment(stmt.outputs(), stmt.elseOutputs());
     }
   }
@@ -622,7 +622,7 @@ struct PythonPrintImpl {
       auto body_block = stmt.bodyBlock();
       ArrayRef<Value*> loop_carried_block_inputs =
           body_block->inputs().slice(offset);
-      printBlock(body_block, loop_carried_block_inputs.size() > 0);
+      printBlock(body_block, !loop_carried_block_inputs.empty());
       printAssignment(
           loop_carried_block_inputs, body_block->outputs().slice(offset));
     }
@@ -694,7 +694,7 @@ struct PythonPrintImpl {
     assignValuesToTheirUniqueNames(node->outputs());
     indent();
     // Print outputs
-    if (node->outputs().size() > 0) {
+    if (!node->outputs().empty()) {
       printValueList(body_, node->outputs());
       body_ << " = ";
     }
@@ -782,7 +782,7 @@ struct PythonPrintImpl {
               << "Exportable methods must have a single return value. "
               << "Normal use of ScriptMethods should enforce this";
         }
-        if (node->inputs().size() > 0) {
+        if (!node->inputs().empty()) {
           indent();
           body_ << "return ";
           printValueList(body_, node->inputs());
@@ -803,7 +803,7 @@ struct PythonPrintImpl {
         // the unpack to be inserted when parsed back in:
         // a, b, = unpacked
         // a, = unpacked # trailing comma forces an unpack to happen
-        if (node->outputs().size() > 0) {
+        if (!node->outputs().empty()) {
           printValueList(body_, node->outputs(), "", ", = ");
         }
         body_ << useOf(node->input()) << "\n";
@@ -831,12 +831,26 @@ struct PythonPrintImpl {
         ss << "fork(" << name << ")";
         printOutputDefinition(node, ss.str());
       } break;
+      case prim::awaitable: {
+        // the subgraph gets emitted as another function
+        auto name = genName("__awaitable_function");
+        auto graph = node->g(attr::Subgraph);
+        indent();
+        body_ << "def " << name << "():\n";
+        for (size_t i = 0; i < node->inputs().size(); ++i) {
+          assignValue(graph->inputs().at(i), node->inputs().at(i));
+        }
+        printBody(graph->block());
+        std::stringstream ss;
+        ss << "awaitable(" << name << ")";
+        printOutputDefinition(node, ss.str());
+      } break;
       case prim::Enter: {
         const auto in = node->inputs().at(0);
         const auto out = node->outputs().at(0);
         indent();
         body_ << "with " << useOf(in);
-        if (out->uses().size() > 0) {
+        if (!out->uses().empty()) {
           assignValue(out, genUniqueNameFor(out));
           body_ << " as " << useOf(out);
         }
@@ -1054,7 +1068,7 @@ struct PythonPrintImpl {
         TypePtr elem_type = list_type->getElementType();
         // Empty lists must be annotated with their type so the compiler knows
         // what type is supposed to be inside them
-        if (node->inputs().size() == 0) {
+        if (node->inputs().empty()) {
           stmt << "annotate("
                << node->output()->type()->annotation_str(type_printer_)
                << ", [])";
@@ -1078,7 +1092,7 @@ struct PythonPrintImpl {
         //   - the dict is empty
         //   - the dict has potentially ambiguous element types
         //       (e.g. Tensor vs. Optional[Tensor])
-        if (node->inputs().size() == 0 ||
+        if (node->inputs().empty() ||
             !elementTypeCanBeInferredFromMembers(dict_type->getKeyType()) ||
             !elementTypeCanBeInferredFromMembers(dict_type->getValueType())) {
           stmt << "annotate("
@@ -1320,7 +1334,7 @@ struct PythonPrintImpl {
         printNode(n, /*print_const=*/true);
       }
       // Print body
-      printBlock(body, body->return_node()->inputs().size() > 0);
+      printBlock(body, !body->return_node()->inputs().empty());
       printNode(body->return_node(), /*print_const=*/false);
     }
   }
@@ -1432,7 +1446,7 @@ struct PythonPrintImpl {
         }
         body_ << "]\n";
         auto forwardPreHooks = classType->getForwardPreHooks();
-        if (forwardPreHooks.size() > 0) {
+        if (!forwardPreHooks.empty()) {
           indent();
           body_ << "__forward_pre_hooks__ = [";
           for (const auto& pre_hook : forwardPreHooks) {
@@ -1442,7 +1456,7 @@ struct PythonPrintImpl {
         }
 
         auto forwardHooks = classType->getForwardHooks();
-        if (forwardHooks.size() > 0) {
+        if (!forwardHooks.empty()) {
           indent();
           body_ << "__forward_hooks__ = [";
           for (const auto& hook : forwardHooks) {
@@ -1543,7 +1557,7 @@ struct PythonPrintImpl {
           indent();
           body_ << "def " << method.name() << "(self";
           TORCH_INTERNAL_ASSERT(
-              method.arguments().size() > 0 &&
+              !method.arguments().empty() &&
               method.arguments().at(0).name() == "self");
           for (const Argument& arg :
                at::ArrayRef<Argument>(method.arguments()).slice(1)) {
diff --git a/torch/csrc/jit/serialization/source_range_serialization.cpp b/torch/csrc/jit/serialization/source_range_serialization.cpp
index 9208c1889d43..1a6bf3fab9d2 100644
--- a/torch/csrc/jit/serialization/source_range_serialization.cpp
+++ b/torch/csrc/jit/serialization/source_range_serialization.cpp
@@ -179,7 +179,7 @@ std::vector<char> SourceRangePickler::pickle(
   } else {
     result = jit::pickle(ivalue, &table);
   }
-  TORCH_CHECK(table.size() == 0, "Expected 0 tensors to be written");
+  TORCH_CHECK(table.empty(), "Expected 0 tensors to be written");
   return result;
 }
 
diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp
index fc47c8b6016c..d1f537980f25 100644
--- a/torch/csrc/jit/serialization/unpickler.cpp
+++ b/torch/csrc/jit/serialization/unpickler.cpp
@@ -113,6 +113,13 @@ void restoreAccurateTypeTags(const IValue& root, const TypePtr& type_tag) {
           to_process.emplace_back(std::move(elem));
         }
       } break;
+      case AwaitType::Kind: {
+        auto aw = w.value.toAwait();
+        if (aw->completed()) {
+          Work elem = {w.type->containedType(0), aw->wait()};
+          to_process.emplace_back(std::move(elem));
+        }
+      } break;
       case OptionalType::Kind: {
         if (!w.value.isNone()) {
           Work elem = {w.type->containedType(0), w.value};
@@ -374,15 +381,30 @@ PickleOpCode Unpickler::readInstruction() {
       }
     } break;
     case PickleOpCode::TUPLE1: {
+      TORCH_CHECK(
+          stack_.size() > 0,
+          "Parsing error: stack_ contains ",
+          stack_.size(),
+          " elements, at least 1 expected");
       stack_.emplace_back(c10::ivalue::Tuple::create(pop(stack_)));
     } break;
     case PickleOpCode::TUPLE2: {
+      TORCH_CHECK(
+          stack_.size() > 1,
+          "Parsing error: stack_ contains ",
+          stack_.size(),
+          " elements, at least 2 expected");
       auto e2 = pop(stack_);
       auto e1 = pop(stack_);
       stack_.emplace_back(
           c10::ivalue::Tuple::create(std::move(e1), std::move(e2)));
     } break;
     case PickleOpCode::TUPLE3: {
+      TORCH_CHECK(
+          stack_.size() > 2,
+          "Parsing error: stack_ contains ",
+          stack_.size(),
+          " elements, at least 3 expected");
       auto e3 = pop(stack_);
       auto e2 = pop(stack_);
       auto e1 = pop(stack_);
@@ -432,7 +454,14 @@ PickleOpCode Unpickler::readInstruction() {
       stack_.erase(stack_.begin() + start, stack_.end());
     } break;
     case PickleOpCode::BINGET: {
-      stack_.push_back(memo_table_.at(read<uint8_t>()));
+      auto pos = read<uint8_t>();
+      TORCH_CHECK(
+          memo_table_.size() > pos,
+          "Parsing error: out of bounds access at ",
+          (size_t)pos,
+          " to memo_table_ which is of size ",
+          memo_table_.size());
+      stack_.push_back(memo_table_.at(pos));
     } break;
     case PickleOpCode::LONG_BINGET: {
       auto pos = read<uint32_t>();
@@ -463,6 +492,11 @@ PickleOpCode Unpickler::readInstruction() {
     case PickleOpCode::REDUCE: {
       // stack is: <functor_idx> <functor_arg>
       // extract <functor_idx> and remove from the stack:
+      TORCH_CHECK(
+          stack_.size() > 1,
+          "Parsing error: stack_ contains ",
+          stack_.size(),
+          " elements, at least 2 expected");
       std::swap(*(stack_.end() - 2), *(stack_.end() - 1));
       size_t idx = stack_.back().toInt();
       stack_.pop_back();
@@ -915,7 +949,7 @@ void Unpickler::rebuildTensorFromTypeV2() {
     const auto args_elems = args->elements();
     auto base_tensor_args = args_elems.at(tup_idx + 2).toTuple();
     auto py_state = args_elems.at(tup_idx + 3).toGenericDict();
-    if (py_state.size() > 0) {
+    if (!py_state.empty()) {
       TORCH_WARN(
           "Loading Tensor with Python attributes will return at::Tensor with Python attributes being discarded");
     }
@@ -1005,7 +1039,7 @@ std::string Unpickler::readBytes(size_t length) {
     // If the string is smallish, do a full buffer read,
     // and read out of that buffer.
     data.resize(length);
-    readSlowWithBuffer(data.data(), length);
+    readSlowWithBuffer(&data[0], length);
   } else {
     // Otherwise, for larger strings, read what we can from
     // the buffer, and then read directly to the destination.
diff --git a/torch/csrc/jit/tensorexpr/bounds_inference.cpp b/torch/csrc/jit/tensorexpr/bounds_inference.cpp
index 71c359de1e09..290d2dcd1bba 100644
--- a/torch/csrc/jit/tensorexpr/bounds_inference.cpp
+++ b/torch/csrc/jit/tensorexpr/bounds_inference.cpp
@@ -228,9 +228,6 @@ HazardKind getPotentialHazards(
   BoundsInfo aBounds = getInferredBounds(analyzer, A, true);
   BoundsInfo bBounds = getInferredBounds(analyzer, B, true);
 
-  BoundSet aWrites;
-  BoundSet aReads;
-
   for (auto& pair : bBounds) {
     BufPtr buf = pair.first;
     if (aBounds.find(buf) == aBounds.end()) {
diff --git a/torch/csrc/jit/tensorexpr/codegen.cpp b/torch/csrc/jit/tensorexpr/codegen.cpp
index 2822fa46f998..cd5a957a6431 100644
--- a/torch/csrc/jit/tensorexpr/codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/codegen.cpp
@@ -309,7 +309,7 @@ void CodeGen::allocIntermediateBufs() {
       interm_bufs, interm_buf_ranges, bufs_external_allocs);
 
   // Insert memory allocation/mapping nodes.
-  if (buf_allocs.size() > 0) {
+  if (!buf_allocs.empty()) {
     auto stmt_new = insertAllocFree(buf_allocs, bufs_external_allocs, stmt_);
     set_stmt(stmt_new);
   }
diff --git a/torch/csrc/jit/tensorexpr/codegen_external.py b/torch/csrc/jit/tensorexpr/codegen_external.py
index bdfe318a5fad..120520b139cd 100644
--- a/torch/csrc/jit/tensorexpr/codegen_external.py
+++ b/torch/csrc/jit/tensorexpr/codegen_external.py
@@ -80,13 +80,15 @@ def gen_external(native_functions_path, tags_path, external_path):
 def main() -> None:
     parser = argparse.ArgumentParser(
         description='Generate annotated_fn_args script')
-    parser.add_argument('--native_functions',
+    parser.add_argument('--native-functions',
+                        '--native_functions',
                         help='path to native_functions.yaml',
                         default='../../../../aten/src/ATen/native/native_functions.yaml')
     parser.add_argument('--tags',
                         help='path to tags.yaml',
                         default='../../../../aten/src/ATen/native/tags.yaml')
-    parser.add_argument('--template_path',
+    parser.add_argument('--template-path',
+                        '--template_path',
                         help='path to external_functions_codegen_template.cpp',
                         default='../../../../tools/jit/templates/external_functions_codegen_template.cpp')
     args = parser.parse_args()
diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
index 958e5e90d56e..625e74b494f0 100644
--- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
@@ -1,10 +1,11 @@
 #include <torch/csrc/jit/tensorexpr/cuda_codegen.h>
 #include <torch/csrc/jit/tensorexpr/half_support.h>
 
+#include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAGeneratorImpl.h>
+#include <ATen/native/cuda/jit_utils.h>
 #include <c10/cuda/CUDAFunctions.h>
 #include <c10/util/irange.h>
-#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
 #include <torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h>
 #include <torch/csrc/jit/codegen/fuser/cuda/resource_strings.h>
 #include <torch/csrc/jit/jit_log.h>
@@ -1115,7 +1116,7 @@ void CudaCodeGen::call_with_numel(void** args, int64_t numel) {
   }
 
   auto stream = at::cuda::getCurrentCUDAStream();
-  fuser::cuda::executor_utils::initializeCudaContext();
+  at::cuda::jit::initializeCudaContext();
   AT_CUDA_DRIVER_CHECK(nvrtc().cuLaunchKernel(
       function_,
       gpu_block_extents,
@@ -1239,7 +1240,7 @@ void CudaCodeGen::call_raw(const std::vector<void*>& raw_args) {
   }
   // Launch the kernels
   auto stream = at::cuda::getCurrentCUDAStream();
-  fuser::cuda::executor_utils::initializeCudaContext();
+  at::cuda::jit::initializeCudaContext();
   AT_CUDA_DRIVER_CHECK(nvrtc().cuLaunchKernel(
       function_,
       gpu_block_extents_v[0],
@@ -1289,7 +1290,7 @@ at::Tensor CudaCodeGen::empty_strided(
 void CudaCodeGen::CompileToNVRTC(
     const std::string& code,
     const std::string& func_name) {
-  fuser::cuda::executor_utils::initializeCudaContext();
+  at::cuda::jit::initializeCudaContext();
   // Note: hacked at::DeviceGuard since at::DeviceGuard was failing to work
   // properly in some scenarios
   auto prior_device = at::cuda::current_device();
diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp
index 7c4cd91866c7..315ed837fbcd 100644
--- a/torch/csrc/jit/tensorexpr/eval.cpp
+++ b/torch/csrc/jit/tensorexpr/eval.cpp
@@ -689,7 +689,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
               "Number of dimensions did not match number of strides", buf);
         }
         size_t buf_size = 1;
-        if (dims.size() > 0) {
+        if (!dims.empty()) {
           ExprHandle buf_size_expr = ExprHandle(immLike(dims[0], 1));
           ExprHandle negative_one = ExprHandle(immLike(dims[0], -1));
           for (const auto& i : c10::irange(dims.size())) {
@@ -984,7 +984,7 @@ class SimpleIREvaluatorImpl : public IRVisitor {
       values[i] = this->value();
     }
     std::vector<TInput> v1;
-    if (values.size() >= 1ULL) {
+    if (!values.empty()) {
       v1 = values[0].as_vec<TInput>();
     }
     std::vector<TInput> v2;
diff --git a/torch/csrc/jit/tensorexpr/expr.cpp b/torch/csrc/jit/tensorexpr/expr.cpp
index 420282d14686..53b7763682fd 100644
--- a/torch/csrc/jit/tensorexpr/expr.cpp
+++ b/torch/csrc/jit/tensorexpr/expr.cpp
@@ -365,7 +365,7 @@ std::vector<ExprPtr> make_contiguous_strides(
     const std::vector<ExprHandle>& dims) {
   std::vector<ExprPtr> strides;
 
-  if (dims.size() > 0) {
+  if (!dims.empty()) {
     strides.resize(dims.size());
     auto si = immLike(dims[0], 1);
     // NOLINTNEXTLINE
diff --git a/torch/csrc/jit/tensorexpr/graph_opt.cpp b/torch/csrc/jit/tensorexpr/graph_opt.cpp
index e5589c50c67e..de8e06152ef9 100644
--- a/torch/csrc/jit/tensorexpr/graph_opt.cpp
+++ b/torch/csrc/jit/tensorexpr/graph_opt.cpp
@@ -195,12 +195,11 @@ void annotateInputShapes(
 
 std::shared_ptr<Graph> removeUnusedSelfArgument(
     const std::shared_ptr<Graph>& graph) {
-  if (graph->inputs().size() == 0) {
+  if (graph->inputs().empty()) {
     return graph;
   }
   jit::Value* self_argument = graph->inputs().at(0);
-  if (self_argument->uses().size() != 0 ||
-      !self_argument->type()->is_module()) {
+  if (!self_argument->uses().empty() || !self_argument->type()->is_module()) {
     return graph;
   }
   graph->eraseInput(0);
diff --git a/torch/csrc/jit/tensorexpr/ir.cpp b/torch/csrc/jit/tensorexpr/ir.cpp
index 28de7a0f86e9..7ad67d9474bc 100644
--- a/torch/csrc/jit/tensorexpr/ir.cpp
+++ b/torch/csrc/jit/tensorexpr/ir.cpp
@@ -13,7 +13,7 @@ static Dtype ChooseDtype(const Dtype& buffer_dtype, const Dtype& index_dtype) {
 }
 
 static Dtype dtypeOfIndices(const std::vector<ExprPtr>& indices) {
-  if (!indices.size()) {
+  if (indices.empty()) {
     // Return something so we can handle scalar buffers.
     return kInt;
   }
@@ -127,7 +127,7 @@ Dtype Intrinsics::IntrinsicsDtype(
     const std::vector<ExprPtr>& params) {
   // TODO: check the op_type and make a real decision
   // Doesnt this fail with kRand?
-  if (params.size() == 0) {
+  if (params.empty()) {
     throw malformed_input("invalid params in Intrinsics");
   } else if (params.size() == 1) {
     return IntrinsicsDtype(op_type, params[0]->dtype());
diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp
index 8d2a4e1faf24..5ed247b6881a 100644
--- a/torch/csrc/jit/tensorexpr/ir_printer.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp
@@ -286,7 +286,7 @@ void IRPrinter::visit(RampPtr v) {
 
 void IRPrinter::visit(LoadPtr v) {
   // TODO: support the mask case
-  if (v->indices().size() == 0) {
+  if (v->indices().empty()) {
     os() << *v->base_handle();
   } else {
     os() << *v->base_handle() << "[";
@@ -414,7 +414,7 @@ void IRPrinter::visit(ReduceOpPtr v) {
 
 void IRPrinter::visit(StorePtr v) {
   // TODO: handle the mask
-  if (v->indices().size() == 0) {
+  if (v->indices().empty()) {
     os() << *v->base_handle() << " = " << *v->value() << ";";
     return;
   }
diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
index a2145efbe66b..7a50c9c93cf5 100644
--- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp
@@ -2336,7 +2336,7 @@ ExprPtr TermExpander::mutate(RoundOffPtr v) {
 
 ExprPtr buf_flat_size(BufPtr v) {
   std::vector<ExprPtr> dims = v->dims();
-  if (dims.size() == 0) {
+  if (dims.empty()) {
     return alloc<LongImm>(1);
   }
   ExprPtr flattened = immLike(dims[0], 1);
diff --git a/torch/csrc/jit/tensorexpr/ir_verifier.cpp b/torch/csrc/jit/tensorexpr/ir_verifier.cpp
index 35180d5fa328..cc7569492770 100644
--- a/torch/csrc/jit/tensorexpr/ir_verifier.cpp
+++ b/torch/csrc/jit/tensorexpr/ir_verifier.cpp
@@ -79,12 +79,12 @@ void IRVerifier::visit(RampPtr v) {
 
 void IRVerifier::visit(LoadPtr v) {
   auto indices = v->indices();
-  if (indices.size() > 0 && v->buf()->base_handle()->dtype() != kHandle) {
+  if (!indices.empty() && v->buf()->base_handle()->dtype() != kHandle) {
     throw malformed_ir(
         "Load base handle dtype must be Handle", v->buf()->base_handle());
   }
 
-  Dtype index_dtype = indices.size() ? indices.at(0)->dtype() : kInt;
+  Dtype index_dtype = !indices.empty() ? indices.at(0)->dtype() : kInt;
   if (indices.size() > 1) {
     for (size_t i = 1; i < indices.size(); ++i) {
       if (indices.at(i)->dtype() != index_dtype) {
@@ -135,12 +135,12 @@ void IRVerifier::visit(IntrinsicsPtr v) {
 
 void IRVerifier::visit(StorePtr v) {
   auto indices = v->indices();
-  if (indices.size() > 0 && v->buf()->base_handle()->dtype() != kHandle) {
+  if (!indices.empty() && v->buf()->base_handle()->dtype() != kHandle) {
     throw malformed_ir(
         "Store base handle dtype must be Handle", v->buf()->base_handle());
   }
 
-  Dtype index_dtype = indices.size() ? indices.at(0)->dtype() : kInt;
+  Dtype index_dtype = !indices.empty() ? indices.at(0)->dtype() : kInt;
   if (indices.size() > 1) {
     for (size_t i = 1; i < indices.size(); ++i) {
       if (indices.at(i)->dtype() != index_dtype) {
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index f1c28a93bb44..c11bb2d7142b 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -432,11 +432,11 @@ ArgValue TensorExprKernel::toArg(const torch::jit::Value* v) const {
     for (auto el : v->node()->inputs()) {
       vec.push_back(toArg(el));
     }
-    if (vec.size() == 0) {
+    if (vec.empty()) {
       return BufList(); // Return arbitrarily typed vector
-    } else if (c10::get_if<BufHandle>(vec.data())) {
+    } else if (c10::get_if<BufHandle>(&vec[0])) {
       return convertVecArgValue<BufHandle>(vec);
-    } else if (c10::get_if<int64_t>(vec.data())) {
+    } else if (c10::get_if<int64_t>(&vec[0])) {
       return convertVecArgValue<int64_t>(vec);
     }
     throw unsupported_dtype();
@@ -543,7 +543,7 @@ bool constZeroDimTensorAsScalarArg(
   }
 
   const auto t = toIValue(v)->toTensor();
-  if (t.sizes().size() != 0) {
+  if (!t.sizes().empty()) {
     return false;
   }
 
@@ -675,7 +675,7 @@ void fuseAllLoops(StmtPtr st) {
   std::vector<ForPtr> outer_loops;
   for (const auto& stmt : *block) {
     auto loop = to<For>(stmt);
-    auto hasReduction = NodeFinder<ReduceOp>::find(stmt).size() != 0;
+    auto hasReduction = !NodeFinder<ReduceOp>::find(stmt).empty();
     if (!loop || hasReduction) {
       all_outer_loops.push_back(outer_loops);
       outer_loops.clear();
@@ -797,7 +797,7 @@ StmtPtr TensorExprKernel::transformLoops(BackendType backendType, StmtPtr st) {
         "After random transform:\n", std::to_string(l.root_stmt()), "\n");
   }
 
-  bool hasReduction = NodeFinder<ReduceOp>::find(l.root_stmt()).size() != 0;
+  bool hasReduction = !NodeFinder<ReduceOp>::find(l.root_stmt()).empty();
 
   // For Block codegen we create a map of tensor dims before
   // inlining. Like GPU codegen we need to inline. But the order
@@ -1460,7 +1460,6 @@ void TensorExprKernel::bindConstant(const torch::jit::Value* v) {
 std::vector<BufPtr> TensorExprKernel::preAllocIntermediateBufs(
     const std::vector<BufPtr>& interm_bufs) {
   std::vector<BufPtr> remaining_interm_bufs;
-  std::vector<std::pair<BufPtr, void*>> allocated_bufs;
   for (const auto& buf : interm_bufs) {
     // Check if buf shape is static and compute its size if static.
     bool is_static = true;
@@ -1580,7 +1579,7 @@ void TensorExprKernel::deduceMemoryLayoutPolicy() {
   auto _prefer_symbolic_mem =
       [](const torch::jit::Value* val,
          const std::vector<torch::jit::StrideInput>& stride_desc_vec) {
-        TORCH_INTERNAL_ASSERT(stride_desc_vec.size() > 0);
+        TORCH_INTERNAL_ASSERT(!stride_desc_vec.empty());
         // Has symbolic stride information
         auto cur_stride_desc = stride_desc_vec[0];
         return (cur_stride_desc ==
@@ -1621,7 +1620,7 @@ void TensorExprKernel::deduceMemoryLayoutPolicy() {
   // std::all_of returns true if the range is empty. But we prefer to keep
   // the original memory layout propagation policy for this case. So we
   // check whether the range is empty.
-  auto prefer_channels_last = (graph_io_tensors.size() > 0);
+  auto prefer_channels_last = (!graph_io_tensors.empty());
   for (auto el : graph_io_tensors) {
     auto is_complete = el->isCompleteTensor();
     auto is_symbolic = symbolic_strides_.count(el);
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index 4284d7a4edeb..152d05509ce2 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -1354,7 +1354,7 @@ bool LoopNest::optimizeConditionals() {
       continue;
     }
     TORCH_INTERNAL_ASSERT(
-        comp_values.size() >= 1,
+        !comp_values.empty(),
         buildErrorMessage(
             "Expected at least one expression in optimizeConditional in the fuser."));
     comp_values.insert(comp_values.begin(), immLike(comp_values[0], 0));
@@ -1434,7 +1434,7 @@ void LoopNest::vectorizeInnerLoops() {
     worklist.push_back(rootF);
   } else if (BlockPtr body = to<Block>(root_stmt_)) {
     std::vector<BlockPtr> blocks = {body};
-    while (blocks.size()) {
+    while (!blocks.empty()) {
       BlockPtr b = blocks.back();
       blocks.pop_back();
 
@@ -1450,7 +1450,7 @@ void LoopNest::vectorizeInnerLoops() {
 
   // Traverse the For loop nest find inner-most loops, which are
   // vectorization candidates.
-  while (worklist.size()) {
+  while (!worklist.empty()) {
     ForPtr f = worklist.back();
     worklist.pop_back();
 
diff --git a/torch/csrc/jit/tensorexpr/loopnest_randomization.cpp b/torch/csrc/jit/tensorexpr/loopnest_randomization.cpp
index 1a9535b957af..87f0f7094192 100644
--- a/torch/csrc/jit/tensorexpr/loopnest_randomization.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest_randomization.cpp
@@ -36,7 +36,7 @@ std::vector<std::vector<ForPtr>> GetAllPerfectlyNestedLoopNests(
   // Find the first set of loops that can be reordered
   std::vector<std::vector<ForPtr>> all_nested_loops;
   std::vector<ForPtr> nested_loops;
-  if (loops.size() == 0) {
+  if (loops.empty()) {
     return all_nested_loops;
   }
   nested_loops.push_back(loops[0]);
@@ -218,7 +218,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
         case COMPUTE_INLINE: {
           if (can_inline) {
             auto bufs = NodeFinder<Buf>::find(l.root_stmt());
-            if (bufs.size() > 0) {
+            if (!bufs.empty()) {
               int buf_number = std::rand() % (int)bufs.size();
               message =
                   "computeInline(" + bufs[buf_number]->name_hint() + ");\n";
@@ -247,7 +247,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
         }
         case SPLIT_TAIL: {
           auto loops = NodeFinder<For>::find(l.root_stmt());
-          if (loops.size() == 0) {
+          if (loops.empty()) {
             break;
           }
           int loop_n = std::rand() % (int)loops.size();
@@ -261,7 +261,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
         }
         case SPLIT_MASK: {
           auto loops = NodeFinder<For>::find(l.root_stmt());
-          if (loops.size() == 0) {
+          if (loops.empty()) {
             break;
           }
           int loop_n = std::rand() % (int)loops.size();
@@ -275,14 +275,14 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
         }
         case DIST1: {
           auto loops = NodeFinder<For>::find(l.root_stmt());
-          if (loops.size() == 0) {
+          if (loops.empty()) {
             break;
           }
           int loop_n = std::rand() % (int)loops.size();
           auto loop = loops[loop_n];
           std::vector<StmtPtr> stmts(
               loop->body()->begin(), loop->body()->end());
-          if (stmts.size() == 0) {
+          if (stmts.empty()) {
             break;
           }
           int n_pivots = (std::rand() % (int)stmts.size()) + 1;
@@ -302,7 +302,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
         case DIST2: {
           auto loops = NodeFinder<For>::find(l.root_stmt());
 
-          if (loops.size() == 0) {
+          if (loops.empty()) {
             break;
           }
           int loop_n = std::rand() % (int)loops.size();
@@ -316,7 +316,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
         case DIST3: {
           auto loops = NodeFinder<For>::find(l.root_stmt());
 
-          if (loops.size() == 0) {
+          if (loops.empty()) {
             break;
           }
           int loop_n = std::rand() % (int)loops.size();
@@ -331,7 +331,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
         case DIST4: {
           auto loops = NodeFinder<For>::find(l.root_stmt());
 
-          if (loops.size() == 0) {
+          if (loops.empty()) {
             break;
           }
           int loop_n = std::rand() % (int)loops.size();
@@ -346,7 +346,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
         case DIST5: {
           auto loops = NodeFinder<For>::find(l.root_stmt());
 
-          if (loops.size() == 0) {
+          if (loops.empty()) {
             break;
           }
           int loop_n = std::rand() % (int)loops.size();
@@ -402,7 +402,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
           }
 
           // Choose a pair randomly
-          if (valid_pairs.size() == 0) {
+          if (valid_pairs.empty()) {
             break;
           }
           int valid_pair_n = std::rand() % (int)valid_pairs.size();
@@ -434,7 +434,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
           // Find all perfectly nested loop nests
           auto all_nested_loops =
               randomization_helper::GetAllPerfectlyNestedLoopNests(loops);
-          if (all_nested_loops.size() == 0) {
+          if (all_nested_loops.empty()) {
             break;
           }
 
@@ -475,7 +475,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
           // Find all perfectly nested loop nests
           auto all_nested_loops =
               randomization_helper::GetAllPerfectlyNestedLoopNests(loops);
-          if (all_nested_loops.size() == 0) {
+          if (all_nested_loops.empty()) {
             break;
           }
 
@@ -512,7 +512,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
 
         case FULL_UNROLL: {
           auto loops = NodeFinder<For>::find(l.root_stmt());
-          if (loops.size() == 0) {
+          if (loops.empty()) {
             break;
           }
           int loop_n = std::rand() % (int)loops.size();
@@ -526,7 +526,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
 
         case NORMALIZE: {
           auto loops = NodeFinder<For>::find(l.root_stmt());
-          if (loops.size() == 0) {
+          if (loops.empty()) {
             break;
           }
           int loop_n = std::rand() % (int)loops.size();
@@ -548,7 +548,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
           // Find all perfectly nested loop nests
           auto all_nested_loops =
               randomization_helper::GetAllPerfectlyNestedLoopNests(loops);
-          if (all_nested_loops.size() == 0) {
+          if (all_nested_loops.empty()) {
             break;
           }
 
@@ -583,8 +583,6 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
         }
 
         case COMPRESS_ALL_BUFFERS: {
-          auto buffers = BufFinder::find(l.root_stmt());
-
           message = "compressAllBuffers(l.root_stmt());\n";
           randomization_helper::printHistory(n_transform, message);
           l.compressAllBuffers(l.root_stmt());
@@ -594,7 +592,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
         case SLICE_HEAD: {
           // Get all the loops
           auto loops = NodeFinder<For>::find(l.root_stmt());
-          if (loops.size() == 0) {
+          if (loops.empty()) {
             break;
           }
           int loop_n = std::rand() % (int)loops.size();
@@ -613,7 +611,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
         case SLICE_TAIL: {
           // Get all the loops
           auto loops = NodeFinder<For>::find(l.root_stmt());
-          if (loops.size() == 0) {
+          if (loops.empty()) {
             break;
           }
           int loop_n = std::rand() % (int)loops.size();
@@ -661,7 +659,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
             }
           }
 
-          if (producer_consumer_pairs.size() == 0) {
+          if (producer_consumer_pairs.empty()) {
             break;
           }
 
@@ -702,7 +700,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) {
             }
           }
 
-          if (innermost_loops.size() == 0) {
+          if (innermost_loops.empty()) {
             break;
           }
           int loop_n = std::rand() % (int)innermost_loops.size();
diff --git a/torch/csrc/jit/tensorexpr/lowerings.cpp b/torch/csrc/jit/tensorexpr/lowerings.cpp
index 9727bf199a26..39e40f405ede 100644
--- a/torch/csrc/jit/tensorexpr/lowerings.cpp
+++ b/torch/csrc/jit/tensorexpr/lowerings.cpp
@@ -1732,7 +1732,7 @@ int nnc_lowerings_lazy_registration() {
             [&](const std::vector<VarHandle>& axes) {
               int64_t dim = c10::get<int64_t>(inputs[1]);
               if (dim < 0) {
-                if (axes.size() == 0) {
+                if (axes.empty()) {
                   throw malformed_input("axes are zero handling unsqueeze");
                 }
                 dim += axes.size();
diff --git a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp
index 9a7478dcbe3c..87269eccb78f 100644
--- a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp
+++ b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp
@@ -150,7 +150,7 @@ bool AccessInfo::isWrite() const {
 
 void AccessInfo::print() const {
   std::cout << id_ << ". " << AccessToString(type_) << ": " << *var_ << "[";
-  if (bounds_.size() > 0) {
+  if (!bounds_.empty()) {
     for (size_t i = 0; i < bounds_.size() - 1; ++i) {
       bounds_[i].print();
       std::cout << ", ";
@@ -183,7 +183,7 @@ void AccessInfo::dumpDOT(std::ostream& os) const {
       type_ == AccessType::Alloc) {
     os << "n" << id_ << " [\n";
     os << "label = \"" << AccessToString(type_) << "\\n " << *var_ << "[";
-    if (bounds_.size() > 0) {
+    if (!bounds_.empty()) {
       for (size_t i = 0; i < bounds_.size() - 1; ++i) {
         os << *IRSimplifier::simplify(
                   alloc<Add>(bounds_[i].end, immLike(bounds_[i].end, 1)))
@@ -204,8 +204,8 @@ void AccessInfo::dumpDOT(std::ostream& os) const {
     os << "n" << id_ << " [\n";
     os << "label = \"" << AccessToString(type_) << " (#" << id_ << ")\\n";
     os << "buf : " << *var_ << "\\n";
-    os << "bounds : \[";
-    if (bounds_.size() > 0) {
+    os << "bounds : [";
+    if (!bounds_.empty()) {
       for (size_t i = 0; i < bounds_.size() - 1; ++i) {
         os << "(" << *bounds_[i].start << ", " << *bounds_[i].end << "), ";
       }
diff --git a/torch/csrc/jit/tensorexpr/operators/misc.cpp b/torch/csrc/jit/tensorexpr/operators/misc.cpp
index c935727efafb..c9006cc3be8d 100644
--- a/torch/csrc/jit/tensorexpr/operators/misc.cpp
+++ b/torch/csrc/jit/tensorexpr/operators/misc.cpp
@@ -479,7 +479,7 @@ Tensor computeFlatten(
 
 static std::pair<ScalarType, std::vector<BufHandle>> processCatList(
     const std::vector<BufHandle>& bufList) {
-  if (bufList.size() == 0) {
+  if (bufList.empty()) {
     throw std::runtime_error("Empty input list is passed to aten::cat");
   }
   std::vector<BufHandle> bufInputs;
@@ -487,7 +487,7 @@ static std::pair<ScalarType, std::vector<BufHandle>> processCatList(
   for (auto buf : bufList) {
     bufInputs.push_back(buf);
     TORCH_INTERNAL_ASSERT(
-        buf.node()->dims().size() > 0, buildErrorMessage("Invalid buf rank"));
+        !buf.node()->dims().empty(), buildErrorMessage("Invalid buf rank"));
     // Ignore buffers that are 0-sized on any dimension.
     bool hasEmptyDims = false;
     for (const auto& dim : buf.dims()) {
@@ -542,7 +542,7 @@ Tensor computeCatWoConditionals(
       ToDtype(high_type),
       nullptr,
       output_strides_expr);
-  if (non_empty_inputs.size() == 0) {
+  if (non_empty_inputs.empty()) {
     return Tensor(
         output_buf, alloc<tensorexpr::Block>(std::vector<StmtPtr>({})));
   }
@@ -638,7 +638,7 @@ Tensor computeCat(
       outputShape,
       outputStrides,
       [&](const std::vector<VarHandle>& axes) {
-        if (nonEmptyInputs.size() == 0) {
+        if (nonEmptyInputs.empty()) {
           return ExprHandle(0);
         }
 
diff --git a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
index 4f36d843012d..d6081887d7cd 100644
--- a/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
+++ b/torch/csrc/jit/tensorexpr/tensorexpr_init.cpp
@@ -38,7 +38,7 @@ ArgValue convertPyToArgValue(py::handle inp) {
     return ArgNone();
   } else if (py::isinstance<py::list>(inp)) {
     auto l = py::cast<py::list>(inp);
-    if (l.size() == 0) {
+    if (l.empty()) {
       return std::vector<BufHandle>();
     } else if (py::isinstance<py::int_>(l[0])) {
       return py::cast<IntList>(inp);
diff --git a/torch/csrc/jit/tensorexpr/types.h b/torch/csrc/jit/tensorexpr/types.h
index ab77b1d8bdfa..e4c95424f67e 100644
--- a/torch/csrc/jit/tensorexpr/types.h
+++ b/torch/csrc/jit/tensorexpr/types.h
@@ -76,7 +76,9 @@ class TORCH_API Dtype {
   }
 
  private:
-  friend std::ostream& operator<<(std::ostream& stream, const Dtype& dtype);
+  friend TORCH_API std::ostream& operator<<(
+      std::ostream& stream,
+      const Dtype& dtype);
   ScalarType scalar_type_;
   int lanes_; // the width of the element for a vector time
 };
diff --git a/torch/csrc/jit/tensorexpr/unique_name_manager.cpp b/torch/csrc/jit/tensorexpr/unique_name_manager.cpp
index 3916686d304c..01065f5eff5b 100644
--- a/torch/csrc/jit/tensorexpr/unique_name_manager.cpp
+++ b/torch/csrc/jit/tensorexpr/unique_name_manager.cpp
@@ -16,7 +16,7 @@ const std::string& UniqueNameManager::get_unique_name(VarPtr v) {
   // First use the name_hint as a prefix to check if there is another name
   // with the same prefix.
   std::string name_hint = v->name_hint();
-  if (name_hint == "") {
+  if (name_hint.empty()) {
     name_hint = "v";
   } else if (std::isdigit(name_hint[0])) {
     name_hint = "v" + name_hint;
diff --git a/torch/csrc/jit/testing/file_check.cpp b/torch/csrc/jit/testing/file_check.cpp
index c656ddfae7e9..a53b98b07d4d 100644
--- a/torch/csrc/jit/testing/file_check.cpp
+++ b/torch/csrc/jit/testing/file_check.cpp
@@ -153,7 +153,7 @@ struct FileCheckImpl {
   TORCH_API void run(const std::string& test_file) {
     has_run = true;
 
-    if (groups.size() == 0 || groups[0].size() == 0) {
+    if (groups.empty() || groups[0].empty()) {
       throw std::runtime_error(
           "No checks have been added to this instance of"
           "Filecheck! Check for bad input.");
@@ -172,7 +172,7 @@ struct FileCheckImpl {
 
   TORCH_API void addCheck(const Check& check) {
     // consecutive CHECK_DAGs & CHECK_NOTs need to be evaluated as a group
-    if (groups.size() == 0 ||
+    if (groups.empty() ||
         (check.type_ != CHECK_NOT && check.type_ != CHECK_DAG)) {
       groups.push_back({check});
     } else {
@@ -391,7 +391,7 @@ struct FileCheckImpl {
     size_t group_beg = std::string::npos;
     size_t group_end = 0;
 
-    AT_ASSERT(groups.size() != 0);
+    AT_ASSERT(!groups.empty());
     for (const auto& check : group) {
       AT_ASSERT(check.type_ == group[0].type_);
       auto pos = assertFind(source, check.search_str_, prev.end(), check);
@@ -406,7 +406,7 @@ struct FileCheckImpl {
       const std::vector<Check>& group,
       const std::shared_ptr<Source>& source,
       const SourceRange& prev) {
-    AT_ASSERT(group.size() != 0);
+    AT_ASSERT(!group.empty());
     CheckType type = group[0].type_;
 
     if (type == CHECK_DAG) {
diff --git a/torch/csrc/lazy/backend/backend_interface.cpp b/torch/csrc/lazy/backend/backend_interface.cpp
index 0fb3257c90a9..cb5f6694193f 100644
--- a/torch/csrc/lazy/backend/backend_interface.cpp
+++ b/torch/csrc/lazy/backend/backend_interface.cpp
@@ -41,13 +41,13 @@ std::unique_ptr<LoweringContext> LoweringContext::Create(
     c10::ArrayRef<const Node*> post_order,
     Util::EmissionMap emit_status) {
   return getBackend()->CreateLoweringContext(
-      name, device, post_order, emit_status);
+      name, std::move(device), post_order, emit_status);
 }
 
 std::unique_ptr<LoweringContext> LoweringContext::Create(
     const std::string& name,
     BackendDevice device) {
-  return getBackend()->CreateLoweringContext(name, device);
+  return getBackend()->CreateLoweringContext(name, std::move(device));
 }
 
 } // namespace lazy
diff --git a/torch/csrc/lazy/core/cache.h b/torch/csrc/lazy/core/cache.h
index 2ff45b4d1de7..4248cd923865 100644
--- a/torch/csrc/lazy/core/cache.h
+++ b/torch/csrc/lazy/core/cache.h
@@ -65,7 +65,7 @@ class Cache {
 
   TypePtr GetLatest() {
     std::lock_guard<std::mutex> g(lock_);
-    TORCH_CHECK(element_list_.size() > 0);
+    TORCH_CHECK(!element_list_.empty());
     return element_list_.front().second;
   }
 
diff --git a/torch/csrc/lazy/core/ir_metadata.cpp b/torch/csrc/lazy/core/ir_metadata.cpp
index 49201db0c4da..1f1616366f82 100644
--- a/torch/csrc/lazy/core/ir_metadata.cpp
+++ b/torch/csrc/lazy/core/ir_metadata.cpp
@@ -73,7 +73,7 @@ void PopScope() {
 }
 
 void ResetScopeContext() {
-  if (g_scope_context.scopes.size() != 0) {
+  if (!g_scope_context.scopes.empty()) {
     TORCH_CHECK(
         false, "Expecting scope to be empty but it is " + GetCurrentScope());
   }
diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp
index c41c892153ad..71effa7cbf65 100644
--- a/torch/csrc/lazy/core/lazy_graph_executor.cpp
+++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp
@@ -1037,8 +1037,7 @@ void LazyGraphExecutor::TensorCollectionBarrier(SyncTensorCollection* coll) {
   if (coll) {
     static const std::string invalid_device(
         "Unknown0"); /* Temp solution to idetify unassigned devices */
-    if (coll->device.toString() == invalid_device ||
-        coll->unlocker.size() > 0) {
+    if (coll->device.toString() == invalid_device || !coll->unlocker.empty()) {
       return;
     }
     VLOG(4) << "Waiting on device barrier for device " << coll->device
diff --git a/torch/csrc/lazy/core/shape_inference.cpp b/torch/csrc/lazy/core/shape_inference.cpp
index df82fd45fe29..8384456bcaaa 100644
--- a/torch/csrc/lazy/core/shape_inference.cpp
+++ b/torch/csrc/lazy/core/shape_inference.cpp
@@ -400,7 +400,7 @@ std::vector<Shape> compute_shape_std(
 std::vector<Shape> compute_shape_std(
     const at::Tensor& self,
     at::OptionalIntArrayRef dim,
-    c10::optional<int64_t> correction,
+    const c10::optional<at::Scalar>& correction,
     bool keepdim) {
   if (dim.has_value()) {
     auto shape = at::native::shape_from_dim_mask(
@@ -488,7 +488,7 @@ std::vector<Shape> compute_shape_index_select(
 
   auto self_sizes = self.sizes();
   std::vector<int64_t> output_sizes(self_sizes.begin(), self_sizes.end());
-  TORCH_CHECK(output_sizes.size() > 0, "Empty output_sizes is not supported.");
+  TORCH_CHECK(!output_sizes.empty(), "Empty output_sizes is not supported.");
   output_sizes[dim] = index_size;
 
   return {Shape(self.scalar_type(), output_sizes)};
@@ -512,7 +512,7 @@ std::vector<Shape> compute_shape_cat(at::TensorList tensors, int64_t dim) {
   for (auto& tensor : tensors) {
     extended_dim_shape += tensor.sizes()[dim];
   }
-  TORCH_CHECK(out_shape.size() > 0, "Scalar tensors are not supported in cat.");
+  TORCH_CHECK(!out_shape.empty(), "Scalar tensors are not supported in cat.");
   TORCH_CHECK(
       extended_dim_shape <= std::numeric_limits<int64_t>::max(),
       "Size overflow");
@@ -1113,7 +1113,7 @@ TORCH_API std::vector<Shape> compute_shape_clone(
 }
 
 std::vector<Shape> compute_shape_stack(at::TensorList tensors, int64_t dim) {
-  TORCH_CHECK(tensors.size() > 0, "stack expects a non-empty TensorList");
+  TORCH_CHECK(!tensors.empty(), "stack expects a non-empty TensorList");
   auto wrapped_dim = at::maybe_wrap_dim(dim, tensors[0].ndimension() + 1);
 
   // Copied from 'check_stack_inputs' in TensorShape.cpp
diff --git a/torch/csrc/lazy/core/shape_inference.h b/torch/csrc/lazy/core/shape_inference.h
index 9ceb45d6b23d..e243798cfc77 100644
--- a/torch/csrc/lazy/core/shape_inference.h
+++ b/torch/csrc/lazy/core/shape_inference.h
@@ -81,7 +81,7 @@ TORCH_API std::vector<torch::lazy::Shape> compute_shape_sort(const at::Tensor &
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_stack(at::TensorList tensors, int64_t dim);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_std(const at::Tensor & self, bool unbiased);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_std(const at::Tensor & self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim);
-TORCH_API std::vector<torch::lazy::Shape> compute_shape_std(const at::Tensor & self, at::OptionalIntArrayRef dim, c10::optional<int64_t> correction, bool keepdim);
+TORCH_API std::vector<torch::lazy::Shape> compute_shape_std(const at::Tensor & self, at::OptionalIntArrayRef dim, const c10::optional<at::Scalar> & correction, bool keepdim);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_sum(const at::Tensor & self, c10::optional<at::ScalarType> dtype);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape__to_copy(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, bool non_blocking, c10::optional<at::MemoryFormat> memory_format);
 TORCH_API std::vector<torch::lazy::Shape> compute_shape_take(const at::Tensor & self, const at::Tensor & index);
diff --git a/torch/csrc/lazy/core/tensor.cpp b/torch/csrc/lazy/core/tensor.cpp
index a7890fc3e063..3a388d7a71f2 100644
--- a/torch/csrc/lazy/core/tensor.cpp
+++ b/torch/csrc/lazy/core/tensor.cpp
@@ -367,7 +367,7 @@ std::vector<LazyTensorPtr> GetLtcTensors(c10::ArrayRef<at::Tensor> tensors) {
   std::vector<LazyTensorPtr> ltc_tensors;
   ltc_tensors.reserve(tensors.size());
   for (const auto& tensor : tensors) {
-    ltc_tensors.push_back(TryGetLtcTensor(tensor));
+    ltc_tensors.emplace_back(TryGetLtcTensor(tensor));
   }
   return ltc_tensors;
 }
diff --git a/torch/csrc/lazy/core/tensor.h b/torch/csrc/lazy/core/tensor.h
index 2fb4cc3c0d05..2506b096c4c6 100644
--- a/torch/csrc/lazy/core/tensor.h
+++ b/torch/csrc/lazy/core/tensor.h
@@ -68,7 +68,7 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
   // LazyTensorPtr instead.
   LazyTensor() = delete;
 
-  virtual ~LazyTensor() = default;
+  ~LazyTensor() override = default;
 
   size_t generation() const {
     return data()->generation;
diff --git a/torch/csrc/lazy/python/init.cpp b/torch/csrc/lazy/python/init.cpp
index fe74d29d87ac..af4afb78a4fd 100644
--- a/torch/csrc/lazy/python/init.cpp
+++ b/torch/csrc/lazy/python/init.cpp
@@ -166,7 +166,7 @@ void initLazyBindings(PyObject* module) {
     std::vector<LazyTensorPtr> xtensors;
     xtensors.reserve(tensors.size());
     for (auto& tensor : tensors) {
-      xtensors.push_back(TryGetLtcTensor(tensor));
+      xtensors.emplace_back(TryGetLtcTensor(tensor));
     }
     auto hash = LazyGraphExecutor::Get()->GetGraphHash(xtensors);
     std::string bin((const char*)&hash, sizeof(hash));
diff --git a/torch/csrc/lazy/python/init.h b/torch/csrc/lazy/python/init.h
index e9c584ead8ce..5bdc5a972290 100644
--- a/torch/csrc/lazy/python/init.h
+++ b/torch/csrc/lazy/python/init.h
@@ -1,12 +1,12 @@
 #pragma once
-#include <c10/macros/Export.h>
 #include <pybind11/pybind11.h>
+#include <torch/csrc/Export.h>
 #include <torch/csrc/utils/pybind.h>
 
 namespace torch {
 namespace lazy {
 
-TORCH_API void initLazyBindings(PyObject* module);
+TORCH_PYTHON_API void initLazyBindings(PyObject* module);
 
 } // namespace lazy
 } // namespace torch
diff --git a/torch/csrc/lazy/python/python_util.h b/torch/csrc/lazy/python/python_util.h
index 23df3d192fe9..8040a023de51 100644
--- a/torch/csrc/lazy/python/python_util.h
+++ b/torch/csrc/lazy/python/python_util.h
@@ -1,15 +1,15 @@
 #pragma once
-#include <c10/macros/Export.h>
 #include <c10/util/Optional.h>
+#include <torch/csrc/Export.h>
 #include <torch/csrc/lazy/core/ir_metadata.h>
 #include <vector>
 
 namespace torch {
 namespace lazy {
 
-c10::optional<SourceLocation> TORCH_API GetPythonFrameTop();
+c10::optional<SourceLocation> TORCH_PYTHON_API GetPythonFrameTop();
 
-std::vector<SourceLocation> TORCH_API GetPythonFrames();
+std::vector<SourceLocation> TORCH_PYTHON_API GetPythonFrames();
 
 } // namespace lazy
 } // namespace torch
diff --git a/torch/csrc/lazy/test_mnist.py b/torch/csrc/lazy/test_mnist.py
index 16a023df5edd..e5c0ecb12c77 100644
--- a/torch/csrc/lazy/test_mnist.py
+++ b/torch/csrc/lazy/test_mnist.py
@@ -13,7 +13,7 @@
 
 class Net(nn.Module):
     def __init__(self):
-        super(Net, self).__init__()
+        super().__init__()
         self.conv1 = nn.Conv2d(1, 32, 3, 1)
         self.conv2 = nn.Conv2d(32, 64, 3, 1)
         self.dropout1 = nn.Dropout(0.25)
diff --git a/torch/csrc/lazy/ts_backend/dynamic_ir.h b/torch/csrc/lazy/ts_backend/dynamic_ir.h
index 40132aa57404..aa0ed1eb9932 100644
--- a/torch/csrc/lazy/ts_backend/dynamic_ir.h
+++ b/torch/csrc/lazy/ts_backend/dynamic_ir.h
@@ -52,7 +52,7 @@ class TORCH_API SizeNode : public TsNode, public DimensionNode {
   bool isSymbolic() const override;
   std::string ToString() const override;
   size_t dim_ = 0;
-  virtual torch::lazy::TSOpVector Lower(
+  torch::lazy::TSOpVector Lower(
       std::shared_ptr<torch::jit::GraphFunction> function,
       TSLoweringContext* loctx) const override;
 };
diff --git a/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp b/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp
index 3cca52b71545..1cfcc2dfc56f 100644
--- a/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp
+++ b/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp
@@ -160,7 +160,7 @@ class TSBackendImpl : public torch::lazy::BackendImplInterface {
     return default_device_ordinal_;
   }
 
-  virtual void SetDefaultDeviceOrdinal(int64_t ordinal) override {
+  void SetDefaultDeviceOrdinal(int64_t ordinal) override {
     default_device_ordinal_ = ordinal;
   }
 
diff --git a/torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp b/torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp
index f5352f2d5ba8..767c86dde47c 100644
--- a/torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp
+++ b/torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp
@@ -119,7 +119,7 @@ c10::optional<c10::Device> compute_target_device(
   // Decide what device to move the output tensor(s) to.
   // The current convention is that we use the first tensor arg to pick the
   // device Barring that, we take the first tensor from a TensorList arg.
-  if (t_args.size() > 0) {
+  if (!t_args.empty()) {
     return t_args[0].device();
   } else {
     // We need to loop through all of the (potentially multiple) TensorList
diff --git a/torch/csrc/lazy/ts_backend/ts_lowering_context.h b/torch/csrc/lazy/ts_backend/ts_lowering_context.h
index 0ad2b669c0e6..a898dfea654a 100644
--- a/torch/csrc/lazy/ts_backend/ts_lowering_context.h
+++ b/torch/csrc/lazy/ts_backend/ts_lowering_context.h
@@ -132,7 +132,7 @@ class TORCH_API TSLoweringContext : public LoweringContext {
 
  private:
   struct Parameter {
-    torch::jit::Value* param;
+    torch::jit::Value* param{nullptr};
     size_t index = 0;
   };
 
diff --git a/torch/csrc/lazy/ts_backend/ts_node_lowering.cpp b/torch/csrc/lazy/ts_backend/ts_node_lowering.cpp
index 12341b69e654..d389aae63095 100644
--- a/torch/csrc/lazy/ts_backend/ts_node_lowering.cpp
+++ b/torch/csrc/lazy/ts_backend/ts_node_lowering.cpp
@@ -40,10 +40,9 @@ TSOpVector LowerTSBuiltin(
       std::make_shared<torch::jit::BuiltinFunction>(sym, at::nullopt);
   auto magic_method = std::make_shared<torch::jit::MagicMethod>("", builtin);
   auto ret = magic_method->call({}, *function, arguments, kwarguments, 0);
-  auto sv = dynamic_cast<torch::jit::SimpleValue*>(ret.get());
-  CHECK(sv);
-  if (sv->getValue()->type()->kind() == c10::TypeKind::TupleType) {
-    const auto tuple_call_result = sv->asTuple({}, *function);
+  auto& sv = dynamic_cast<torch::jit::SimpleValue&>(*ret);
+  if (sv.getValue()->type()->kind() == c10::TypeKind::TupleType) {
+    const auto tuple_call_result = sv.asTuple({}, *function);
     TSOpVector tuple_result;
     for (const auto& tuple_component : tuple_call_result) {
       auto tuple_component_sv =
@@ -52,7 +51,7 @@ TSOpVector LowerTSBuiltin(
     }
     return tuple_result;
   }
-  return {sv->getValue()};
+  return {sv.getValue()};
 }
 
 torch::jit::Value* GenerateClone(
diff --git a/torch/csrc/lazy/tutorial.md b/torch/csrc/lazy/tutorial.md
index e26c55d2c520..155e8adfdd85 100644
--- a/torch/csrc/lazy/tutorial.md
+++ b/torch/csrc/lazy/tutorial.md
@@ -136,7 +136,7 @@ Here's our model definition:
 ```python
 class Net(nn.Module):
     def __init__(self):
-        super(Net, self).__init__()
+        super().__init__()
         self.conv1 = nn.Conv2d(1, 32, 3, 1)
         self.conv2 = nn.Conv2d(32, 64, 3, 1)
         self.dropout1 = nn.Dropout(0.25)
diff --git a/torch/csrc/mps/Module.cpp b/torch/csrc/mps/Module.cpp
new file mode 100644
index 000000000000..0a1c45c0838d
--- /dev/null
+++ b/torch/csrc/mps/Module.cpp
@@ -0,0 +1,151 @@
+#include <ATen/ATen.h>
+#include <c10/util/CallOnce.h>
+#include <torch/csrc/Generator.h>
+#include <torch/csrc/THP.h>
+#include <torch/csrc/python_headers.h>
+#include <torch/csrc/utils/python_numbers.h>
+
+// pthread.h is included for tracking bad forks
+#ifndef WIN32
+#include <pthread.h>
+#endif
+
+namespace torch {
+namespace mps {
+
+namespace {
+// True for children forked after mps init
+static bool in_bad_fork = false;
+
+// Called in the forked child if mps has already been initialized
+static void forked_mps_child() {
+  in_bad_fork = true;
+}
+
+// Should be called before the first mps call.
+static void track_bad_mps_fork() {
+#ifndef WIN32
+  static c10::once_flag flag;
+  c10::call_once(
+      flag, [] { pthread_atfork(nullptr, nullptr, forked_mps_child); });
+#endif
+}
+} // namespace
+
+static PyObject* MPSModule_isInBadFork(PyObject* self, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  return PyBool_FromLong(in_bad_fork);
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject* MPSModule_getDefaultMPSGenerator(
+    PyObject* _unused,
+    PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  track_bad_mps_fork();
+  return THPGenerator_initDefaultGenerator(
+      at::detail::getMPSHooks().getDefaultMPSGenerator());
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject* MPSModule_isAvailable(PyObject* _unused, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  track_bad_mps_fork();
+  if (at::detail::getMPSHooks().hasMPS()) {
+    Py_RETURN_TRUE;
+  } else {
+    Py_RETURN_FALSE;
+  }
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject* MPSModule_isMacOS13orNewer(PyObject* _unused, PyObject* args) {
+  HANDLE_TH_ERRORS
+  THPUtils_assert(
+      THPUtils_checkLong(args), "invalid argument to isOnMacOS13orNewer()");
+  auto minor = THPUtils_unpackUInt32(args);
+  if (at::detail::getMPSHooks().isOnMacOS13orNewer(minor)) {
+    Py_RETURN_TRUE;
+  } else {
+    Py_RETURN_FALSE;
+  }
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject* MPSModule_synchronize(PyObject* _unused, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  at::detail::getMPSHooks().deviceSynchronize();
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject* MPSModule_emptyCache(PyObject* _unused, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  at::detail::getMPSHooks().emptyCache();
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject* MPSModule_setMemoryFraction(
+    PyObject* _unused,
+    PyObject* args) {
+  HANDLE_TH_ERRORS
+  THPUtils_assert(
+      THPUtils_checkDouble(args), "invalid argument to setMemoryFraction()");
+  double fraction = THPUtils_unpackDouble(args);
+  at::detail::getMPSHooks().setMemoryFraction(fraction);
+  END_HANDLE_TH_ERRORS
+  Py_RETURN_NONE;
+}
+
+static PyObject* MPSModule_currentAllocatedMemory(
+    PyObject* _unused,
+    PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  return PyLong_FromUnsignedLongLong(
+      at::detail::getMPSHooks().getCurrentAllocatedMemory());
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject* MPSModule_driverAllocatedMemory(
+    PyObject* _unused,
+    PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  return PyLong_FromUnsignedLongLong(
+      at::detail::getMPSHooks().getDriverAllocatedMemory());
+  END_HANDLE_TH_ERRORS
+}
+
+// NOLINTNEXTLINE(modernize-avoid-c-arrays,
+// cppcoreguidelines-avoid-non-const-global-variables,
+// cppcoreguidelines-avoid-c-arrays)
+static struct PyMethodDef _MPSModule_methods[] = {
+    {"_mps_synchronize", MPSModule_synchronize, METH_NOARGS, nullptr},
+    {"_mps_is_in_bad_fork", MPSModule_isInBadFork, METH_NOARGS, nullptr},
+    {"_mps_is_available", MPSModule_isAvailable, METH_NOARGS, nullptr},
+    {"_mps_is_on_macos_13_or_newer",
+     MPSModule_isMacOS13orNewer,
+     METH_O,
+     nullptr},
+    {"_mps_get_default_generator",
+     MPSModule_getDefaultMPSGenerator,
+     METH_NOARGS,
+     nullptr},
+    {"_mps_emptyCache", MPSModule_emptyCache, METH_NOARGS, nullptr},
+    {"_mps_setMemoryFraction", MPSModule_setMemoryFraction, METH_O, nullptr},
+    {"_mps_currentAllocatedMemory",
+     MPSModule_currentAllocatedMemory,
+     METH_NOARGS,
+     nullptr},
+    {"_mps_driverAllocatedMemory",
+     MPSModule_driverAllocatedMemory,
+     METH_NOARGS,
+     nullptr},
+    {nullptr}};
+
+PyMethodDef* python_functions() {
+  return _MPSModule_methods;
+}
+
+} // namespace mps
+} // namespace torch
diff --git a/torch/csrc/mps/Module.h b/torch/csrc/mps/Module.h
new file mode 100644
index 000000000000..3759d36d738b
--- /dev/null
+++ b/torch/csrc/mps/Module.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <torch/csrc/python_headers.h>
+
+namespace torch {
+namespace mps {
+
+PyMethodDef* python_functions();
+
+} // namespace mps
+} // namespace torch
diff --git a/torch/csrc/onnx/diagnostics/diagnostics.h b/torch/csrc/onnx/diagnostics/diagnostics.h
index 65ca626b843b..7c78e3065b60 100644
--- a/torch/csrc/onnx/diagnostics/diagnostics.h
+++ b/torch/csrc/onnx/diagnostics/diagnostics.h
@@ -55,7 +55,10 @@ inline void Diagnose(
   py::object py_message =
       py_rule.attr("format_message")(**py::cast(messageArgs));
 
-  _PyDiagnostics().attr("diagnose")(py_rule, py_level, py_message);
+  // to use the `_a` literal for arguments
+  using namespace pybind11::literals;
+  _PyDiagnostics().attr("diagnose")(
+      py_rule, py_level, py_message, "cpp_stack"_a = true);
 }
 
 } // namespace diagnostics
diff --git a/torch/csrc/onnx/diagnostics/generated/rules.h b/torch/csrc/onnx/diagnostics/generated/rules.h
index 405456336422..0b77afd7b4b8 100644
--- a/torch/csrc/onnx/diagnostics/generated/rules.h
+++ b/torch/csrc/onnx/diagnostics/generated/rules.h
@@ -34,6 +34,60 @@ enum class Rule : uint32_t {
    * @brief Operator is supported in newer opset version.
    */
   kOperatorSupportedInNewerOpsetVersion,
+
+  /**
+   * @brief FX Tracer succeeded.
+   */
+  kFxTracerSuccess,
+
+  /**
+   * @brief FX Tracer failed.
+   */
+  kFxTracerFailure,
+
+  /**
+   * @brief FX Tracer succeeded.
+   */
+  kFxFrontendAotautograd,
+
+  /**
+   * @brief FX pass converting torch.neg to torch.sigmoid.
+   */
+  kFxPassConvertNegToSigmoid,
+
+  /**
+   * @brief ToDo, experimenting diagnostics, placeholder text.
+   */
+  kFxIrAddNode,
+
+  /**
+   * @brief Op level tracking. ToDo, experimenting diagnostics, placeholder
+   * text.
+   */
+  kAtenlibSymbolicFunction,
+
+  /**
+   * @brief Graph level tracking. Each op is a step. ToDo, experimenting
+   * diagnostics, placeholder text.
+   */
+  kAtenlibFxToOnnx,
+
+  /**
+   * @brief Node level tracking. ToDo, experimenting diagnostics, placeholder
+   * text.
+   */
+  kFxNodeToOnnx,
+
+  /**
+   * @brief The make_fx + decomposition pass on fx graph produced from Dynamo,
+   * before ONNX export.
+   */
+  kFxFrontendDynamoMakeFx,
+
+  /**
+   * @brief The formatted str for argument to display is too verbose.
+   */
+  kArgFormatTooVerbose,
 };
 
 static constexpr const char* const kPyRuleNames[] = {
@@ -41,6 +95,16 @@ static constexpr const char* const kPyRuleNames[] = {
     "missing_custom_symbolic_function",
     "missing_standard_symbolic_function",
     "operator_supported_in_newer_opset_version",
+    "fx_tracer_success",
+    "fx_tracer_failure",
+    "fx_frontend_aotautograd",
+    "fx_pass_convert_neg_to_sigmoid",
+    "fx_ir_add_node",
+    "atenlib_symbolic_function",
+    "atenlib_fx_to_onnx",
+    "fx_node_to_onnx",
+    "fx_frontend_dynamo_make_fx",
+    "arg_format_too_verbose",
 };
 
 } // namespace diagnostics
diff --git a/torch/csrc/onnx/init.cpp b/torch/csrc/onnx/init.cpp
index 9222273d45e2..44406eaa184d 100644
--- a/torch/csrc/onnx/init.cpp
+++ b/torch/csrc/onnx/init.cpp
@@ -47,9 +47,15 @@ void initONNXBindings(PyObject* module) {
                  const std::vector<at::Tensor>& tensors,
                  const python::IODescriptor& desc,
                  bool onnx_shape_inference,
-                 bool is_script) {
+                 bool is_script,
+                 int opset_version) {
                 ONNXAssignOutputShape(
-                    graph, tensors, desc, onnx_shape_inference, is_script);
+                    graph,
+                    tensors,
+                    desc,
+                    onnx_shape_inference,
+                    is_script,
+                    opset_version);
               }))
       .def(
           "_jit_pass_onnx_function_substitution",
@@ -132,7 +138,10 @@ void initONNXBindings(PyObject* module) {
                  std::map<std::string, IValue>& params_dict,
                  int opset_version) {
                 ONNXShapeTypeInference(graph, params_dict, opset_version);
-              }))
+              }),
+          py::arg("graph"),
+          py::arg("params_dict"),
+          py::arg("opset_version"))
       .def(
           "_jit_pass_onnx_set_dynamic_input_shape",
           ::torch::wrap_pybind_function(ONNXSetDynamicInputShape))
@@ -240,11 +249,10 @@ void initONNXBindings(PyObject* module) {
 
   m.def(
       "_check_onnx_proto",
-      [](const std::string& proto_string, bool full_check) {
-        check_onnx_proto(proto_string, full_check);
-      },
-      py::arg("proto_string"),
-      py::arg("full_check") = false);
+      ::torch::wrap_pybind_function([](const std::string& proto_string) {
+        check_onnx_proto(proto_string);
+      }),
+      py::arg("proto_string"));
 
   auto onnx = m.def_submodule("_onnx");
   py::enum_<::ONNX_NAMESPACE::TensorProto_DataType>(onnx, "TensorProtoDataType")
diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp
index ccf4cf96d793..0a0fcfc11beb 100644
--- a/torch/csrc/profiler/collection.cpp
+++ b/torch/csrc/profiler/collection.cpp
@@ -52,13 +52,13 @@ RawTensorMetadata::RawTensorMetadata(const at::Tensor& t)
 
 TensorMetadata::TensorMetadata(
     const RawTensorMetadata& r,
-    const std::vector<int64_t>& sizes,
-    const std::vector<int64_t>& strides)
+    std::vector<int64_t> sizes,
+    std::vector<int64_t> strides)
     : RawTensorMetadataBase(r),
       weak_self_{r.weak_self_.value_or(WeakTensor(at::Tensor()))},
       device_{r.device_type_, r.device_index_},
-      sizes_{sizes},
-      strides_{strides} {
+      sizes_{std::move(sizes)},
+      strides_{std::move(strides)} {
   SOFT_ASSERT(r.weak_self_.has_value());
 }
 
@@ -519,7 +519,7 @@ ThreadLocalSubqueue::ThreadLocalSubqueue(
     const ProfilerConfig& config)
     : tid_{tid}, config_{config}, kineto_info_{kineto::kineto_ids()} {
   torch::profiler::impl::kineto::recordThreadInfo();
-  if (config_.experimental_config.performance_events.size()) {
+  if (!config_.experimental_config.performance_events.empty()) {
     perf_profiler_ =
         std::make_unique<torch::profiler::impl::linux_perf::PerfProfiler>();
     perf_profiler_->Configure(config_.experimental_config.performance_events);
@@ -1129,12 +1129,16 @@ RecordQueue::getRecords(
     auto& queue = *subqueue_it.second;
     auto materialize = [&](auto& events) {
       for (auto& i : events) {
+        time_t start_time_ns;
+        if constexpr (std::is_same<
+                          std::remove_reference_t<decltype(i)>,
+                          ExtraFields<EventType::Backend>>::value) {
+          start_time_ns = i.start_time_us_ * 1000;
+        } else {
+          start_time_ns = converter(i.start_time_);
+        }
         out.emplace_back(Result::create(
-            /*start_time_ns_=*/c10::guts::if_constexpr<std::is_same<
-                typename std::remove_reference<decltype(i)>::type,
-                ExtraFields<EventType::Backend>>::value>(
-                [&](auto _) { return _(i).start_time_us_ * 1000; },
-                [&](auto _) { return converter(_(i).start_time_); }),
+            /*start_time_ns_=*/start_time_ns,
             /*start_tid_=*/queue.tid(),
             /*kineto_info_=*/queue.kineto_info(),
             /*extra_fields_=*/std::move(i)));
diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h
index 73268995e923..dc87ab3df5d7 100644
--- a/torch/csrc/profiler/collection.h
+++ b/torch/csrc/profiler/collection.h
@@ -68,8 +68,8 @@ struct TORCH_API RawTensorMetadata : RawTensorMetadataBase {
 struct TORCH_API TensorMetadata : public RawTensorMetadataBase {
   TensorMetadata(
       const RawTensorMetadata& r,
-      const std::vector<int64_t>& sizes,
-      const std::vector<int64_t>& strides);
+      std::vector<int64_t> sizes,
+      std::vector<int64_t> strides);
 
   TensorImplAddress impl() const {
     return weak_self_.get();
@@ -100,11 +100,11 @@ struct ExtraFields;
 struct Result;
 
 struct TorchOpBasicFields {
-  int64_t sequence_number_;
-  uint64_t forward_tid_;
-  at::RecordScope scope_;
-  bool is_async_;
-  int64_t debug_handle_;
+  int64_t sequence_number_{0};
+  uint64_t forward_tid_{0};
+  at::RecordScope scope_{};
+  bool is_async_{false};
+  int64_t debug_handle_{0};
   std::string name_;
 
   // Set in the exit callback.
@@ -180,8 +180,8 @@ struct RawAllocation {
   torch::profiler::impl::approx_time_t start_time_;
   void* ptr_;
   int64_t alloc_size_;
-  int64_t total_allocated_;
-  int64_t total_reserved_;
+  size_t total_allocated_;
+  size_t total_reserved_;
   c10::DeviceType device_type_;
   c10::DeviceIndex device_index_;
 };
@@ -205,8 +205,8 @@ template <>
 struct ExtraFields<EventType::OutOfMemory> {
   torch::profiler::impl::approx_time_t start_time_;
   int64_t alloc_size_;
-  int64_t total_allocated_;
-  int64_t total_reserved_;
+  size_t total_allocated_;
+  size_t total_reserved_;
   c10::DeviceType device_type_;
   c10::DeviceIndex device_index_;
 };
@@ -327,8 +327,8 @@ struct ExtraFields<EventType::Kineto> {
   };
 
   std::string name_;
-  int64_t duration_us_;
-  uint64_t correlation_id_;
+  int64_t duration_us_{0};
+  uint64_t correlation_id_{0};
   libkineto::ActivityType activity_type_;
   Flow flow;
   std::weak_ptr<Result> linked_activity_{};
diff --git a/torch/csrc/profiler/kineto_shim.cpp b/torch/csrc/profiler/kineto_shim.cpp
index ba3582f0d6d9..e8cb031fc302 100644
--- a/torch/csrc/profiler/kineto_shim.cpp
+++ b/torch/csrc/profiler/kineto_shim.cpp
@@ -151,7 +151,7 @@ class ExperimentalConfigWrapper {
     // Kineto supports reading performance events per kernel/iteration
     // using CUPTI Range based profiler API. In this mode however we
     // do not trace CPU or GPU events.
-    bool cupti_range_profiler = config_.profiler_metrics.size() > 0;
+    bool cupti_range_profiler = !config_.profiler_metrics.empty();
     if (cupti_range_profiler &&
         activities.count(torch::autograd::profiler::ActivityType::CPU)) {
       LOG(WARNING)
@@ -276,6 +276,19 @@ void recordThreadInfo() {
 #endif // USE_KINETO
 }
 
+void logInvariantViolation(
+    const std::string& assertion,
+    const std::string& error,
+    const std::string& profile_id,
+    const std::string& group_profile_id) {
+#ifdef USE_KINETO
+  if (libkineto::api().isProfilerInitialized()) {
+    libkineto::api().activityProfiler().logInvariantViolation(
+        profile_id, assertion, error, group_profile_id);
+  }
+#endif // USE_KINETO
+}
+
 } // namespace kineto
 } // namespace impl
 } // namespace profiler
diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h
index fa02e979275b..2a410719a1f7 100644
--- a/torch/csrc/profiler/kineto_shim.h
+++ b/torch/csrc/profiler/kineto_shim.h
@@ -126,6 +126,12 @@ void popCorrelationId();
 void popUserCorrelationId();
 void recordThreadInfo();
 
+void logInvariantViolation(
+    const std::string& assertion,
+    const std::string& error,
+    const std::string& profile_id,
+    const std::string& group_profile_id);
+
 } // namespace kineto
 } // namespace impl
 } // namespace profiler
diff --git a/torch/csrc/profiler/orchestration/python_tracer.cpp b/torch/csrc/profiler/orchestration/python_tracer.cpp
index 8f63163089b3..64db126b25ef 100644
--- a/torch/csrc/profiler/orchestration/python_tracer.cpp
+++ b/torch/csrc/profiler/orchestration/python_tracer.cpp
@@ -9,7 +9,7 @@ MakeFn make_fn;
 
 struct NoOpPythonTracer : public PythonTracerBase {
   NoOpPythonTracer() = default;
-  ~NoOpPythonTracer() = default;
+  ~NoOpPythonTracer() override = default;
 
   void stop() override {}
   std::vector<std::shared_ptr<Result>> getEvents(
diff --git a/torch/csrc/profiler/perf.cpp b/torch/csrc/profiler/perf.cpp
index c5b2125fe4c9..2c80fd603a91 100644
--- a/torch/csrc/profiler/perf.cpp
+++ b/torch/csrc/profiler/perf.cpp
@@ -158,7 +158,7 @@ void PerfProfiler::Configure(std::vector<std::string>& event_names) {
 }
 
 void PerfProfiler::Enable() {
-  if (start_values_.size()) {
+  if (!start_values_.empty()) {
     StopCounting();
   }
 
@@ -177,8 +177,7 @@ void PerfProfiler::Disable(perf_counters_t& vals) {
       vals.size() == events_.size(),
       "Can not fit all perf counters in the supplied container");
   TORCH_CHECK(
-      start_values_.size() > 0,
-      "PerfProfiler must be enabled before disabling");
+      !start_values_.empty(), "PerfProfiler must be enabled before disabling");
 
   /* Always connecting this disable event to the last enable event i.e. using
    * whatever is on the top of the start counter value stack. */
@@ -189,7 +188,7 @@ void PerfProfiler::Disable(perf_counters_t& vals) {
   start_values_.pop();
 
   // Restore it for a parent
-  if (start_values_.size()) {
+  if (!start_values_.empty()) {
     StartCounting();
   }
 }
diff --git a/torch/csrc/profiler/perf.h b/torch/csrc/profiler/perf.h
index 88432a946f77..9d5d00cc67d1 100644
--- a/torch/csrc/profiler/perf.h
+++ b/torch/csrc/profiler/perf.h
@@ -38,7 +38,7 @@ struct PerfCounter {
  */
 class PerfEvent {
  public:
-  explicit PerfEvent(std::string& name) : name_(name), fd_(-1) {}
+  explicit PerfEvent(std::string& name) : name_(name) {}
 
   PerfEvent& operator=(PerfEvent&& other) noexcept {
     if (this != &other) {
diff --git a/torch/csrc/profiler/standalone/itt_observer.cpp b/torch/csrc/profiler/standalone/itt_observer.cpp
index 3378c8b52840..d3452ece752b 100644
--- a/torch/csrc/profiler/standalone/itt_observer.cpp
+++ b/torch/csrc/profiler/standalone/itt_observer.cpp
@@ -22,8 +22,8 @@ struct ITTThreadLocalState : ProfilerStateBase {
     return ActiveProfilerType::ITT;
   }
 
-  void reportMemoryUsage(void*, int64_t, int64_t, int64_t, c10::Device)
-      override {}
+  void reportMemoryUsage(void*, int64_t, size_t, size_t, c10::Device) override {
+  }
 
   static ITTThreadLocalState* getTLS() {
     auto tls = ProfilerStateBase::get(/*global=*/false);
diff --git a/torch/csrc/profiler/standalone/nvtx_observer.cpp b/torch/csrc/profiler/standalone/nvtx_observer.cpp
index 1db70a543bc4..7e41bb2eaca9 100644
--- a/torch/csrc/profiler/standalone/nvtx_observer.cpp
+++ b/torch/csrc/profiler/standalone/nvtx_observer.cpp
@@ -22,8 +22,8 @@ struct NVTXThreadLocalState : ProfilerStateBase {
     return ActiveProfilerType::NVTX;
   }
 
-  void reportMemoryUsage(void*, int64_t, int64_t, int64_t, c10::Device)
-      override {}
+  void reportMemoryUsage(void*, int64_t, size_t, size_t, c10::Device) override {
+  }
 
   static NVTXThreadLocalState* getTLS() {
     auto tls = ProfilerStateBase::get(/*global=*/false);
diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp
index 6833e8abef70..082a2e8aaab5 100644
--- a/torch/csrc/profiler/util.cpp
+++ b/torch/csrc/profiler/util.cpp
@@ -97,6 +97,46 @@ bool softAssertRaises() {
   return soft_assert_raises_.value_or(false);
 }
 
+void logSoftAssert(
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* cond,
+    const char* args) {
+#ifdef USE_KINETO
+  std::string error;
+  error = fmt::format(
+      "{} SOFT ASSERT FAILED at {}:{}, func: {}, args: {}",
+      cond,
+      file,
+      line,
+      func,
+      args);
+  // TODO: Implement profile_id and group_profile_id as 3rd/4th arguments.
+  kineto::logInvariantViolation(cond, error, "", "");
+#endif
+}
+
+void logSoftAssert(
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* cond,
+    const std::string& args) {
+#ifdef USE_KINETO
+  std::string error;
+  error = fmt::format(
+      "{} SOFT ASSERT FAILED at {}:{}, func: {}, args: {}",
+      cond,
+      file,
+      line,
+      func,
+      args);
+  // TODO: Implement profile_id and group_profile_id as 3rd/4th arguments.
+  kineto::logInvariantViolation(cond, error, "", "");
+#endif
+}
+
 // ----------------------------------------------------------------------------
 // -- NVTX --------------------------------------------------------------------
 // ----------------------------------------------------------------------------
@@ -106,7 +146,7 @@ std::string getNvtxStr(
     const std::vector<std::vector<int64_t>>& shapes,
     at::RecordFunctionHandle op_id,
     const std::list<std::pair<at::RecordFunctionHandle, int>>& input_op_ids) {
-  if (sequence_nr >= -1 || shapes.size() > 0) {
+  if (sequence_nr >= -1 || !shapes.empty()) {
     std::string str;
     if (sequence_nr >= 0) {
       str = fmt::format("{}, seq = {}", name, sequence_nr);
@@ -121,12 +161,12 @@ std::string getNvtxStr(
     if (op_id > 0) {
       str = fmt::format("{}, op_id = {}", str, op_id);
     }
-    if (shapes.size() > 0) {
+    if (!shapes.empty()) {
       str = fmt::format("{}, sizes = {}", str, shapesToStr(shapes));
     }
     // Include the op ids of the input edges so
     // you can build the network graph
-    if (input_op_ids.size() > 0) {
+    if (!input_op_ids.empty()) {
       str = fmt::format(
           "{}, input_op_ids = {}", str, inputOpIdsToStr(input_op_ids));
     }
@@ -557,7 +597,7 @@ uint64_t computeFlops(
 
     const auto mat1_size = mat1_sizes_ref.toDimVector();
     const auto mat2_size = mat2_sizes_ref.toDimVector();
-    if (mat1_size.size() == 0) {
+    if (mat1_size.empty()) {
       return 0;
     }
 
@@ -598,7 +638,7 @@ uint64_t computeFlops(
 
     const auto mat1_size = mat1_sizes_ref.toDimVector();
     const auto mat2_size = mat2_sizes_ref.toDimVector();
-    if (mat1_size.size() == 0) {
+    if (mat1_size.empty()) {
       return 0;
     }
 
diff --git a/torch/csrc/profiler/util.h b/torch/csrc/profiler/util.h
index ab0550e79caa..f82b804aa582 100644
--- a/torch/csrc/profiler/util.h
+++ b/torch/csrc/profiler/util.h
@@ -41,6 +41,12 @@
 #define SOFT_ASSERT(cond, ...)                         \
   [&]() -> bool {                                      \
     if (C10_UNLIKELY(!(cond))) {                       \
+      torch::profiler::impl::logSoftAssert(            \
+          __func__,                                    \
+          __FILE__,                                    \
+          static_cast<uint32_t>(__LINE__),             \
+          #cond,                                       \
+          ::c10::str(__VA_ARGS__));                    \
       if (torch::profiler::impl::softAssertRaises()) { \
         TORCH_INTERNAL_ASSERT(cond, __VA_ARGS__);      \
       } else {                                         \
@@ -56,6 +62,26 @@ namespace profiler {
 namespace impl {
 TORCH_API bool softAssertRaises();
 TORCH_API void setSoftAssertRaises(c10::optional<bool> value);
+TORCH_API void logSoftAssert(
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* cond,
+    const char* args);
+TORCH_API inline void logSoftAssert(
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* cond,
+    ::c10::detail::CompileTimeEmptyString args) {
+  logSoftAssert(func, file, line, cond, (const char*)args);
+}
+TORCH_API void logSoftAssert(
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* cond,
+    const std::string& args);
 
 using time_t = int64_t;
 using steady_clock_t = std::conditional<
diff --git a/torch/csrc/serialization.cpp b/torch/csrc/serialization.cpp
index d30bfff3249c..e090d7793788 100644
--- a/torch/csrc/serialization.cpp
+++ b/torch/csrc/serialization.cpp
@@ -226,9 +226,7 @@ void THPStorage_writeFileRaw(
     bool save_size,
     uint64_t element_size) {
   c10::DeviceGuard guard(self->device());
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  uint8_t* data;
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+  uint8_t* data{};
   at::Tensor cpu_tensor;
   int64_t size_bytes = self->nbytes();
   int64_t numel = size_bytes / element_size;
@@ -251,8 +249,7 @@ void THPStorage_writeFileRaw(
         torch::utils::THPByteOrder::THP_LITTLE_ENDIAN)
       doWrite(fd, &numel, sizeof(int64_t));
     else {
-      // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-      int64_t nsize; // convert big endian cpu to little endian storage
+      int64_t nsize{}; // convert big endian cpu to little endian storage
       torch::utils::THP_encodeInt64Buffer(
           (uint8_t*)&nsize,
           (const int64_t*)&numel,
@@ -269,7 +266,6 @@ void THPStorage_writeFileRaw(
   } else {
     int64_t buffer_size = std::min(numel, (int64_t)5000);
     // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     std::unique_ptr<uint8_t[]> le_buffer(
         new uint8_t[buffer_size * element_size]);
     for (int64_t i = 0; i < numel; i += buffer_size) {
@@ -319,16 +315,11 @@ c10::intrusive_ptr<c10::StorageImpl> THPStorage_readFileRaw(
   if (storage.defined()) {
     guard.reset_device(storage->device());
   }
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  uint8_t* data;
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  int64_t size;
+  int64_t size{};
   doRead(file, &size, sizeof(int64_t));
   if (torch::utils::THP_nativeByteOrder() ==
       torch::utils::THPByteOrder::THP_BIG_ENDIAN) {
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-    int64_t tsize; // convert little endian storage to big endian cpu
-    tsize = size;
+    int64_t tsize = size; // convert little endian storage to big endian cpu
     torch::utils::THP_decodeInt64Buffer(
         &size, (const uint8_t*)&tsize, torch::utils::THP_nativeByteOrder(), 1);
   }
@@ -348,9 +339,9 @@ c10::intrusive_ptr<c10::StorageImpl> THPStorage_readFileRaw(
         _storage_nbytes);
   }
 
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   std::unique_ptr<char[]> cpu_data;
 
+  uint8_t* data{};
   if (storage->device_type() == at::kCPU) {
     data = storage->data<uint8_t>();
   } else {
@@ -366,7 +357,6 @@ c10::intrusive_ptr<c10::StorageImpl> THPStorage_readFileRaw(
   } else {
     int64_t buffer_size = std::min(size, (int64_t)5000);
     // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     std::unique_ptr<uint8_t[]> le_buffer(
         new uint8_t[buffer_size * element_size]);
 
diff --git a/torch/csrc/utils.cpp b/torch/csrc/utils.cpp
index b42e389723b5..ec2762de53e7 100644
--- a/torch/csrc/utils.cpp
+++ b/torch/csrc/utils.cpp
@@ -24,6 +24,26 @@ int THPUtils_getCallable(PyObject* arg, PyObject** result) {
   return 1;
 }
 
+bool THPUtils_checkIndex(PyObject* obj) {
+  if (PyBool_Check(obj)) {
+    return false;
+  }
+  if (THPUtils_checkLong(obj)) {
+    return true;
+  }
+  // Avoid poking __index__ early as that will immediately cause a guard
+  if (torch::is_symint(py::handle(obj))) {
+    return true;
+  }
+  torch::jit::tracer::NoWarn no_warn_guard;
+  auto index = THPObjectPtr(PyNumber_Index(obj));
+  if (!index) {
+    PyErr_Clear();
+    return false;
+  }
+  return true;
+}
+
 std::vector<int64_t> THPUtils_unpackLongs(PyObject* arg) {
   bool tuple = PyTuple_Check(arg);
   bool list = PyList_Check(arg);
@@ -195,15 +215,6 @@ void THPPointer<THPStorage>::free() {
     Py_DECREF(ptr);
 }
 
-void storage_copy(at::Storage dst, at::Storage src, bool non_blocking) {
-  auto dst_options = c10::TensorOptions().device(dst.device()).dtype(at::kByte);
-  auto dst_t = at::empty({0}, {}, dst_options).set_(dst);
-
-  auto src_options = c10::TensorOptions().device(src.device()).dtype(at::kByte);
-  auto src_t = at::empty({0}, {}, src_options).set_(src);
-  dst_t.copy_(src_t, non_blocking);
-}
-
 void storage_fill(at::Storage self, uint8_t value) {
   auto options = c10::TensorOptions().device(self.device()).dtype(at::kByte);
   auto self_t = at::empty({0}, {}, options).set_(self);
diff --git a/torch/csrc/utils.h b/torch/csrc/utils.h
index fe8c83407758..56e23487d99d 100644
--- a/torch/csrc/utils.h
+++ b/torch/csrc/utils.h
@@ -140,6 +140,17 @@
 #define THPQUInt2x4Utils_unpackReal(object) (int)THPUtils_unpackReal_INT(object)
 #define THPQUInt2x4Utils_newReal(value) THPUtils_newReal_INT(value)
 
+/*
+   From https://github.com/python/cpython/blob/v3.7.0/Modules/xxsubtype.c
+   If compiled as a shared library, some compilers don't allow addresses of
+   Python objects defined in other libraries to be used in static PyTypeObject
+   initializers. The DEFERRED_ADDRESS macro is used to tag the slots where such
+   addresses appear; the module init function that adds the PyTypeObject to the
+   module must fill in the tagged slots at runtime. The argument is for
+   documentation -- the macro ignores it.
+*/
+#define DEFERRED_ADDRESS(ADDR) nullptr
+
 #define THPUtils_assert(cond, ...) \
   THPUtils_assertRet(nullptr, cond, __VA_ARGS__)
 #define THPUtils_assertRet(value, cond, ...) \
@@ -208,7 +219,6 @@ std::vector<c10::optional<at::cuda::CUDAStream>>
 THPUtils_PySequence_to_CUDAStreamList(PyObject* obj);
 #endif
 
-void storage_copy(at::Storage dst, at::Storage src, bool non_blocking = false);
 void storage_fill(at::Storage self, uint8_t value);
 void storage_set(at::Storage self, ptrdiff_t idx, uint8_t value);
 uint8_t storage_get(at::Storage self, ptrdiff_t idx);
diff --git a/torch/csrc/utils/byte_order.cpp b/torch/csrc/utils/byte_order.cpp
index c4039de57993..9aeef9a92858 100644
--- a/torch/csrc/utils/byte_order.cpp
+++ b/torch/csrc/utils/byte_order.cpp
@@ -121,11 +121,11 @@ THPByteOrder THP_nativeByteOrder() {
 void THP_decodeInt16Buffer(
     int16_t* dst,
     const uint8_t* src,
-    THPByteOrder order,
+    bool do_byte_swap,
     size_t len) {
   for (const auto i : c10::irange(len)) {
     dst[i] =
-        (int16_t)(order == THP_BIG_ENDIAN ? decodeUInt16BE(src) : decodeUInt16LE(src));
+        (int16_t)(do_byte_swap ? decodeUInt16BE(src) : decodeUInt16LE(src));
     src += sizeof(int16_t);
   }
 }
@@ -133,11 +133,11 @@ void THP_decodeInt16Buffer(
 void THP_decodeInt32Buffer(
     int32_t* dst,
     const uint8_t* src,
-    THPByteOrder order,
+    bool do_byte_swap,
     size_t len) {
   for (const auto i : c10::irange(len)) {
     dst[i] =
-        (int32_t)(order == THP_BIG_ENDIAN ? decodeUInt32BE(src) : decodeUInt32LE(src));
+        (int32_t)(do_byte_swap ? decodeUInt32BE(src) : decodeUInt32LE(src));
     src += sizeof(int32_t);
   }
 }
@@ -145,11 +145,11 @@ void THP_decodeInt32Buffer(
 void THP_decodeInt64Buffer(
     int64_t* dst,
     const uint8_t* src,
-    THPByteOrder order,
+    bool do_byte_swap,
     size_t len) {
   for (const auto i : c10::irange(len)) {
     dst[i] =
-        (int64_t)(order == THP_BIG_ENDIAN ? decodeUInt64BE(src) : decodeUInt64LE(src));
+        (int64_t)(do_byte_swap ? decodeUInt64BE(src) : decodeUInt64LE(src));
     src += sizeof(int64_t);
   }
 }
@@ -157,7 +157,7 @@ void THP_decodeInt64Buffer(
 void THP_decodeHalfBuffer(
     c10::Half* dst,
     const uint8_t* src,
-    THPByteOrder order,
+    bool do_byte_swap,
     size_t len) {
   for (const auto i : c10::irange(len)) {
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
@@ -165,7 +165,7 @@ void THP_decodeHalfBuffer(
       uint16_t x;
       c10::Half f;
     };
-    x = (order == THP_BIG_ENDIAN ? decodeUInt16BE(src) : decodeUInt16LE(src));
+    x = (do_byte_swap ? decodeUInt16BE(src) : decodeUInt16LE(src));
     dst[i] = f;
     src += sizeof(uint16_t);
   }
@@ -174,11 +174,10 @@ void THP_decodeHalfBuffer(
 void THP_decodeBFloat16Buffer(
     at::BFloat16* dst,
     const uint8_t* src,
-    THPByteOrder order,
+    bool do_byte_swap,
     size_t len) {
   for (const auto i : c10::irange(len)) {
-    uint16_t x =
-        (order == THP_BIG_ENDIAN ? decodeUInt16BE(src) : decodeUInt16LE(src));
+    uint16_t x = (do_byte_swap ? decodeUInt16BE(src) : decodeUInt16LE(src));
     std::memcpy(&dst[i], &x, sizeof(dst[i]));
     src += sizeof(uint16_t);
   }
@@ -187,7 +186,7 @@ void THP_decodeBFloat16Buffer(
 void THP_decodeBoolBuffer(
     bool* dst,
     const uint8_t* src,
-    THPByteOrder order,
+    bool do_byte_swap,
     size_t len) {
   for (const auto i : c10::irange(len)) {
     dst[i] = (int)src[i] != 0 ? true : false;
@@ -197,7 +196,7 @@ void THP_decodeBoolBuffer(
 void THP_decodeFloatBuffer(
     float* dst,
     const uint8_t* src,
-    THPByteOrder order,
+    bool do_byte_swap,
     size_t len) {
   for (const auto i : c10::irange(len)) {
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
@@ -205,7 +204,7 @@ void THP_decodeFloatBuffer(
       uint32_t x;
       float f;
     };
-    x = (order == THP_BIG_ENDIAN ? decodeUInt32BE(src) : decodeUInt32LE(src));
+    x = (do_byte_swap ? decodeUInt32BE(src) : decodeUInt32LE(src));
     dst[i] = f;
     src += sizeof(float);
   }
@@ -214,7 +213,7 @@ void THP_decodeFloatBuffer(
 void THP_decodeDoubleBuffer(
     double* dst,
     const uint8_t* src,
-    THPByteOrder order,
+    bool do_byte_swap,
     size_t len) {
   for (const auto i : c10::irange(len)) {
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
@@ -222,7 +221,7 @@ void THP_decodeDoubleBuffer(
       uint64_t x;
       double d;
     };
-    x = (order == THP_BIG_ENDIAN ? decodeUInt64BE(src) : decodeUInt64LE(src));
+    x = (do_byte_swap ? decodeUInt64BE(src) : decodeUInt64LE(src));
     dst[i] = d;
     src += sizeof(double);
   }
@@ -231,7 +230,7 @@ void THP_decodeDoubleBuffer(
 void THP_decodeComplexFloatBuffer(
     c10::complex<float>* dst,
     const uint8_t* src,
-    THPByteOrder order,
+    bool do_byte_swap,
     size_t len) {
   for (const auto i : c10::irange(len)) {
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
@@ -245,9 +244,9 @@ void THP_decodeComplexFloatBuffer(
       float im;
     };
 
-    x = (order == THP_BIG_ENDIAN ? decodeUInt32BE(src) : decodeUInt32LE(src));
+    x = (do_byte_swap ? decodeUInt32BE(src) : decodeUInt32LE(src));
     src += sizeof(float);
-    y = (order == THP_BIG_ENDIAN ? decodeUInt32BE(src) : decodeUInt32LE(src));
+    y = (do_byte_swap ? decodeUInt32BE(src) : decodeUInt32LE(src));
     src += sizeof(float);
 
     dst[i] = c10::complex<float>(re, im);
@@ -257,7 +256,7 @@ void THP_decodeComplexFloatBuffer(
 void THP_decodeComplexDoubleBuffer(
     c10::complex<double>* dst,
     const uint8_t* src,
-    THPByteOrder order,
+    bool do_byte_swap,
     size_t len) {
   for (const auto i : c10::irange(len)) {
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
@@ -271,15 +270,95 @@ void THP_decodeComplexDoubleBuffer(
       double im;
     };
 
-    x = (order == THP_BIG_ENDIAN ? decodeUInt64BE(src) : decodeUInt64LE(src));
+    x = (do_byte_swap ? decodeUInt64BE(src) : decodeUInt64LE(src));
     src += sizeof(double);
-    y = (order == THP_BIG_ENDIAN ? decodeUInt64BE(src) : decodeUInt64LE(src));
+    y = (do_byte_swap ? decodeUInt64BE(src) : decodeUInt64LE(src));
     src += sizeof(double);
 
     dst[i] = c10::complex<double>(re, im);
   }
 }
 
+void THP_decodeInt16Buffer(
+    int16_t* dst,
+    const uint8_t* src,
+    THPByteOrder order,
+    size_t len) {
+  THP_decodeInt16Buffer(dst, src, (order == THP_BIG_ENDIAN), len);
+}
+
+void THP_decodeInt32Buffer(
+    int32_t* dst,
+    const uint8_t* src,
+    THPByteOrder order,
+    size_t len) {
+  THP_decodeInt32Buffer(dst, src, (order == THP_BIG_ENDIAN), len);
+}
+
+void THP_decodeInt64Buffer(
+    int64_t* dst,
+    const uint8_t* src,
+    THPByteOrder order,
+    size_t len) {
+  THP_decodeInt64Buffer(dst, src, (order == THP_BIG_ENDIAN), len);
+}
+
+void THP_decodeHalfBuffer(
+    c10::Half* dst,
+    const uint8_t* src,
+    THPByteOrder order,
+    size_t len) {
+  THP_decodeHalfBuffer(dst, src, (order == THP_BIG_ENDIAN), len);
+}
+
+void THP_decodeBFloat16Buffer(
+    at::BFloat16* dst,
+    const uint8_t* src,
+    THPByteOrder order,
+    size_t len) {
+  THP_decodeBFloat16Buffer(dst, src, (order == THP_BIG_ENDIAN), len);
+}
+
+void THP_decodeBoolBuffer(
+    bool* dst,
+    const uint8_t* src,
+    THPByteOrder order,
+    size_t len) {
+  THP_decodeBoolBuffer(dst, src, (order == THP_BIG_ENDIAN), len);
+}
+
+void THP_decodeFloatBuffer(
+    float* dst,
+    const uint8_t* src,
+    THPByteOrder order,
+    size_t len) {
+  THP_decodeFloatBuffer(dst, src, (order == THP_BIG_ENDIAN), len);
+}
+
+void THP_decodeDoubleBuffer(
+    double* dst,
+    const uint8_t* src,
+    THPByteOrder order,
+    size_t len) {
+  THP_decodeDoubleBuffer(dst, src, (order == THP_BIG_ENDIAN), len);
+}
+
+void THP_decodeComplexFloatBuffer(
+    c10::complex<float>* dst,
+    const uint8_t* src,
+    THPByteOrder order,
+    size_t len) {
+  THP_decodeComplexFloatBuffer(dst, src, (order == THP_BIG_ENDIAN), len);
+}
+
+void THP_decodeComplexDoubleBuffer(
+    c10::complex<double>* dst,
+    const uint8_t* src,
+    THPByteOrder order,
+    size_t len) {
+  THP_decodeComplexDoubleBuffer(dst, src, (order == THP_BIG_ENDIAN), len);
+}
+
 void THP_encodeInt16Buffer(
     uint8_t* dst,
     const int16_t* src,
diff --git a/torch/csrc/utils/byte_order.h b/torch/csrc/utils/byte_order.h
index b4c3c32eccc3..60aa8fc39e6e 100644
--- a/torch/csrc/utils/byte_order.h
+++ b/torch/csrc/utils/byte_order.h
@@ -13,6 +13,57 @@ enum THPByteOrder { THP_LITTLE_ENDIAN = 0, THP_BIG_ENDIAN = 1 };
 
 TORCH_API THPByteOrder THP_nativeByteOrder();
 
+TORCH_API void THP_decodeInt16Buffer(
+    int16_t* dst,
+    const uint8_t* src,
+    bool do_byte_swap,
+    size_t len);
+TORCH_API void THP_decodeInt32Buffer(
+    int32_t* dst,
+    const uint8_t* src,
+    bool do_byte_swap,
+    size_t len);
+TORCH_API void THP_decodeInt64Buffer(
+    int64_t* dst,
+    const uint8_t* src,
+    bool do_byte_swap,
+    size_t len);
+TORCH_API void THP_decodeHalfBuffer(
+    c10::Half* dst,
+    const uint8_t* src,
+    bool do_byte_swap,
+    size_t len);
+TORCH_API void THP_decodeFloatBuffer(
+    float* dst,
+    const uint8_t* src,
+    bool do_byte_swap,
+    size_t len);
+TORCH_API void THP_decodeDoubleBuffer(
+    double* dst,
+    const uint8_t* src,
+    bool do_byte_swap,
+    size_t len);
+TORCH_API void THP_decodeBoolBuffer(
+    bool* dst,
+    const uint8_t* src,
+    bool do_byte_swap,
+    size_t len);
+TORCH_API void THP_decodeBFloat16Buffer(
+    at::BFloat16* dst,
+    const uint8_t* src,
+    bool do_byte_swap,
+    size_t len);
+TORCH_API void THP_decodeComplexFloatBuffer(
+    c10::complex<float>* dst,
+    const uint8_t* src,
+    bool do_byte_swap,
+    size_t len);
+TORCH_API void THP_decodeComplexDoubleBuffer(
+    c10::complex<double>* dst,
+    const uint8_t* src,
+    bool do_byte_swap,
+    size_t len);
+
 TORCH_API void THP_decodeInt16Buffer(
     int16_t* dst,
     const uint8_t* src,
diff --git a/torch/csrc/utils/disable_torch_function.cpp b/torch/csrc/utils/disable_torch_function.cpp
index 589b069250a3..c612136c4664 100644
--- a/torch/csrc/utils/disable_torch_function.cpp
+++ b/torch/csrc/utils/disable_torch_function.cpp
@@ -297,7 +297,6 @@ static bool is_basic_python_type(PyTypeObject* tp) {
 }
 
 inline bool has_torch_function_attr(PyObject* obj) {
-  // NOLINTNEXTLINE(clang-diagnostic-writable-strings)
   auto attr = PyObject_FastGetAttrString(obj, "__torch_function__");
   return (
       attr.ptr() != nullptr && attr.ptr() != torch::disabled_torch_function);
diff --git a/torch/csrc/utils/invalid_arguments.cpp b/torch/csrc/utils/invalid_arguments.cpp
index 04d79841f79a..6eefe2f03f2c 100644
--- a/torch/csrc/utils/invalid_arguments.cpp
+++ b/torch/csrc/utils/invalid_arguments.cpp
@@ -308,7 +308,7 @@ std::string _formattedArgDesc(
       result += reset_red;
     result += ", ";
   }
-  if (arguments.size() > 0)
+  if (!arguments.empty())
     result.erase(result.length() - 2);
   result += ")";
   return result;
@@ -322,7 +322,7 @@ std::string _argDesc(
     result += std::string(py_typename(arg)) + ", ";
   for (auto& kwarg : kwargs)
     result += kwarg.first + "=" + py_typename(kwarg.second) + ", ";
-  if (arguments.size() > 0)
+  if (!arguments.empty())
     result.erase(result.length() - 2);
   result += ")";
   return result;
@@ -390,7 +390,7 @@ std::string format_invalid_args(
     std::vector<std::string> unmatched_kwargs;
     if (has_kwargs)
       unmatched_kwargs = _tryMatchKwargs(option, kwargs);
-    if (unmatched_kwargs.size()) {
+    if (!unmatched_kwargs.empty()) {
       error_msg += "got unrecognized keyword arguments: ";
       for (auto& kwarg : unmatched_kwargs)
         error_msg += kwarg + ", ";
@@ -420,7 +420,7 @@ std::string format_invalid_args(
         std::vector<std::string> unmatched_kwargs;
         if (has_kwargs)
           unmatched_kwargs = _tryMatchKwargs(option, kwargs);
-        if (unmatched_kwargs.size() > 0) {
+        if (!unmatched_kwargs.empty()) {
           error_msg +=
               "      didn't match because some of the keywords were incorrect: ";
           for (auto& kwarg : unmatched_kwargs)
diff --git a/torch/csrc/utils/nested.cpp b/torch/csrc/utils/nested.cpp
index d0619bd1f655..16a93412765b 100644
--- a/torch/csrc/utils/nested.cpp
+++ b/torch/csrc/utils/nested.cpp
@@ -74,11 +74,11 @@ at::Tensor nested_tensor_ctor(
   }
 
   at::ScalarType final_dtype = dtype_val;
-  if (r.isNone(1) && new_list.size() > 0) {
+  if (r.isNone(1) && !new_list.empty()) {
     final_dtype = c10::typeMetaToScalarType(new_list[0].dtype());
   }
   at::Device final_device = tensor_options.device();
-  if (r.isNone(2) && new_list.size() > 0) {
+  if (r.isNone(2) && !new_list.empty()) {
     final_device = new_list[0].device();
   }
   auto out = at::_nested_tensor_from_tensor_list(
diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp
index 81ace59e715f..aa5dd5851bbd 100644
--- a/torch/csrc/utils/python_arg_parser.cpp
+++ b/torch/csrc/utils/python_arg_parser.cpp
@@ -350,7 +350,6 @@ auto handle_torch_function_no_python_arg_parser(
   }
   if (ret.ptr() == nullptr || ret.ptr() == Py_NotImplemented) {
     for (auto& arg : overloaded_args) {
-      // NOLINTNEXTLINE(clang-diagnostic-writable-strings)
       py::object torch_function =
           PyObject_FastGetAttrString(arg.ptr(), torch_function_name_str);
       if (!torch_function) {
@@ -549,7 +548,7 @@ static void append_overloaded_arg(
     }
   }
   if (class_not_seen_yet) {
-    int arg_index = overloaded_args->size();
+    auto arg_index = overloaded_args->size();
     for (const auto j : c10::irange(arg_index)) {
       if (PyObject_IsSubclass(
               obj_type,
@@ -565,7 +564,8 @@ static void append_overloaded_arg(
     // add object to overloaded_args. If it's a subclass of another class
     // we've already seen it will be inserted before the superclass,
     // otherwise it will be inserted at the end of the array
-    overloaded_args->insert(overloaded_args->begin() + arg_index, obj);
+    overloaded_args->insert(
+        overloaded_args->begin() + static_cast<long>(arg_index), obj);
   }
 }
 
@@ -702,7 +702,7 @@ static bool is_int_list(
     // in an intlist argument. Even float or complex scalar tensors.
     bool r =
         (jit::tracer::isTracing() && THPVariable_Check(item.ptr()) &&
-         THPVariable_Unpack(item.ptr()).sizes() == c10::IntArrayRef{});
+         THPVariable_Unpack(item.ptr()).sizes().empty());
     if (!r && failed_idx != nullptr) {
       *failed_idx = 0;
     }
@@ -738,7 +738,7 @@ static bool is_int_or_symint_list(
     // in an intlist argument. Even float or complex scalar tensors.
     bool r =
         (jit::tracer::isTracing() && THPVariable_Check(item.ptr()) &&
-         THPVariable_Unpack(item.ptr()).sizes() == c10::IntArrayRef{});
+         THPVariable_Unpack(item.ptr()).sizes().empty());
     if (!r && failed_idx != nullptr) {
       *failed_idx = 0;
     }
@@ -783,6 +783,10 @@ auto FunctionParameter::check(
         const auto& var = THPVariable_Unpack(obj);
         return !var.requires_grad() && var.dim() == 0;
       }
+      if (torch::is_symfloat(py::handle(obj))) {
+        // This will induce a guard
+        return true;
+      }
       return false;
     }
     case ParameterType::INT64: {
@@ -794,6 +798,10 @@ auto FunctionParameter::check(
         return at::isIntegralType(var.scalar_type(), /*includeBool=*/false) &&
             !var.requires_grad() && var.dim() == 0;
       }
+      if (torch::is_symint(py::handle(obj))) {
+        // This will induce a guard
+        return true;
+      }
       return false;
     }
     case ParameterType::DIMNAME:
@@ -1126,12 +1134,11 @@ FunctionSignature::FunctionSignature(const std::string& fmt, int index)
   bool allow_numbers_as_tensors = should_allow_numbers_as_tensors(name);
 
   auto last_offset = open_paren + 1;
-  // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
-  auto next_offset = last_offset;
   bool keyword_only = false;
   bool done = false;
   while (!done) {
     auto offset = fmt.find(", ", last_offset);
+    auto next_offset = offset + 2;
     if (offset == std::string::npos) {
       offset = fmt.find(')', last_offset);
       done = true;
@@ -1141,8 +1148,6 @@ FunctionSignature::FunctionSignature(const std::string& fmt, int index)
         last_offset = next_offset;
         break;
       }
-    } else {
-      next_offset = offset + 2;
     }
     if (offset == std::string::npos) {
       throw std::runtime_error("missing closing parenthesis: " + fmt);
@@ -1207,19 +1212,19 @@ std::string FunctionSignature::toString() const {
 [[noreturn]] static void extra_args(
     const FunctionSignature& signature,
     Py_ssize_t nargs) {
-  const long max_pos_args = signature.max_pos_args;
-  const long min_args = signature.min_args;
+  const auto max_pos_args = signature.max_pos_args;
+  const auto min_args = signature.min_args;
   const long nargs_ = nargs;
   if (min_args != max_pos_args) {
     throw TypeError(
-        "%s() takes from %ld to %ld positional arguments but %ld were given",
+        "%s() takes from %zu to %zu positional arguments but %ld were given",
         signature.name.c_str(),
         min_args,
         max_pos_args,
         nargs_);
   }
   throw TypeError(
-      "%s() takes %ld positional argument%s but %ld %s given",
+      "%s() takes %zu positional argument%s but %ld %s given",
       signature.name.c_str(),
       max_pos_args,
       max_pos_args == 1 ? "" : "s",
@@ -1305,7 +1310,7 @@ bool FunctionSignature::parse(
     PyObject* kwargs,
     PyObject* dst[], // NOLINT
     bool raise_exception) {
-  size_t nargs = args ? PyTuple_GET_SIZE(args) : 0;
+  Py_ssize_t nargs = args ? PyTuple_GET_SIZE(args) : 0;
   auto remaining_kwargs = kwargs ? PyDict_Size(kwargs) : 0;
   size_t arg_pos = 0;
   bool allow_varargs_intlist = false;
@@ -1323,7 +1328,7 @@ bool FunctionSignature::parse(
     }
   }
 
-  if (nargs > max_pos_args && !allow_varargs_intlist) {
+  if (static_cast<size_t>(nargs) > max_pos_args && !allow_varargs_intlist) {
     if (raise_exception) {
       // foo() takes takes 2 positional arguments but 3 were given
       extra_args(*this, nargs);
@@ -1342,7 +1347,7 @@ bool FunctionSignature::parse(
   for (auto& param : params) {
     PyObject* obj = nullptr;
     bool is_kwd = false;
-    if (arg_pos < nargs) {
+    if (arg_pos < static_cast<size_t>(nargs)) {
       // extra positional args given after single positional IntArrayRef arg
       if (param.keyword_only) {
         if (raise_exception) {
@@ -1457,7 +1462,7 @@ PythonArgParser::PythonArgParser(std::vector<std::string> fmts, bool traceable)
       max_args = signature.max_args;
     }
   }
-  if (signatures_.size() > 0) {
+  if (!signatures_.empty()) {
     function_name = signatures_[0].name;
   }
 
diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h
index 5bf1a47e068a..24c870f16486 100644
--- a/torch/csrc/utils/python_arg_parser.h
+++ b/torch/csrc/utils/python_arg_parser.h
@@ -46,6 +46,7 @@
 #include <torch/csrc/Dtype.h>
 #include <torch/csrc/DynamicTypes.h>
 #include <torch/csrc/Exceptions.h>
+#include <torch/csrc/Export.h>
 #include <torch/csrc/Generator.h>
 #include <torch/csrc/Layout.h>
 #include <torch/csrc/MemoryFormat.h>
@@ -893,6 +894,10 @@ inline int64_t PythonArgs::toInt64(int i) {
     jit::tracer::ArgumentStash::stashValue(
         signature.params[i].name, idx, var, c10::IntType::get());
   }
+  if (torch::is_symint(py::handle(args[i]))) {
+    return py::cast<c10::SymInt>(py::handle(args[i]))
+        .guard_int(__FILE__, __LINE__);
+  }
   return THPUtils_unpackLong(args[i]);
 }
 
@@ -944,6 +949,10 @@ inline c10::optional<double> PythonArgs::toDoubleOptional(int i) {
 inline double PythonArgs::toDouble(int i) {
   if (!args[i])
     return signature.params[i].default_double;
+  if (torch::is_symfloat(py::handle(args[i]))) {
+    return py::cast<c10::SymFloat>(py::handle(args[i]))
+        .guard_float(__FILE__, __LINE__);
+  }
   return THPUtils_unpackDouble(args[i]);
 }
 
@@ -1026,7 +1035,7 @@ inline c10::Stream PythonArgs::stream(int i) {
   return c10::Stream::unpack3(
       ((THPStream*)args[i])->stream_id,
       ((THPStream*)args[i])->device_index,
-      ((THPStream*)args[i])->device_type);
+      static_cast<DeviceType>(((THPStream*)args[i])->device_type));
 }
 
 inline PyObject* PythonArgs::pyobject(int i) {
@@ -1119,7 +1128,7 @@ auto handle_torch_function(
 // PythonArgParser to get overloaded_args.
 enum class TorchFunctionName { TorchFunction, TorchDispatch };
 
-auto TORCH_API handle_torch_function_no_python_arg_parser(
+auto TORCH_PYTHON_API handle_torch_function_no_python_arg_parser(
     at::ArrayRef<py::handle> overloaded_args,
     PyObject* args,
     PyObject* kwargs,
diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp
index a316803d5ca4..3d611db549bb 100644
--- a/torch/csrc/utils/python_dispatch.cpp
+++ b/torch/csrc/utils/python_dispatch.cpp
@@ -10,6 +10,7 @@
 #include <torch/library.h>
 
 #include <c10/core/SafePyObject.h>
+#include <torch/csrc/PyInterpreter.h>
 #include <torch/csrc/autograd/python_variable.h>
 #include <torch/csrc/jit/python/pybind_utils.h>
 
@@ -58,7 +59,7 @@ c10::AliasAnalysisKind parseAliasAnalysisKind(const std::string& k) {
 
 template <typename Func>
 inline torch::CppFunction dispatch_str(const char* key, Func&& raw_f) {
-  auto mb_key = std::string(key) == ""
+  auto mb_key = std::string(key).empty()
       ? c10::nullopt
       : c10::make_optional(c10::parseDispatchKey(key));
   if (mb_key) {
@@ -345,7 +346,7 @@ void initDispatchBindings(PyObject* module) {
         return std::make_unique<torch::Library>(
             parseKind(kind),
             std::move(name),
-            std::string(dispatch) == ""
+            std::string(dispatch).empty()
                 ? c10::nullopt
                 : c10::make_optional(c10::parseDispatchKey(dispatch)),
             "/dev/null", // temporary workaround
@@ -436,7 +437,7 @@ void initDispatchBindings(PyObject* module) {
     std::vector<std::string> states;
     states.reserve(danglingImpls.size());
     for (auto& danglingImpl : danglingImpls) {
-      states.push_back(danglingImpl.dumpState());
+      states.emplace_back(danglingImpl.dumpState());
     }
 
     return states;
@@ -453,7 +454,7 @@ void initDispatchBindings(PyObject* module) {
       if (!op.overload_name.empty()) {
         ss << "." << op.overload_name;
       }
-      names.push_back(ss.str());
+      names.emplace_back(ss.str());
     }
 
     return names;
@@ -590,7 +591,7 @@ void initDispatchBindings(PyObject* module) {
   m.def(
       "_dispatch_print_registrations_for_dispatch_key",
       [](const char* dispatch_key = "") {
-        auto k = std::string(dispatch_key) == ""
+        auto k = std::string(dispatch_key).empty()
             ? c10::nullopt
             : c10::make_optional(c10::parseDispatchKey(dispatch_key));
         auto op_names =
@@ -604,7 +605,7 @@ void initDispatchBindings(PyObject* module) {
   m.def(
       "_dispatch_get_registrations_for_dispatch_key",
       [](const char* dispatch_key = "") {
-        auto k = std::string(dispatch_key) == ""
+        auto k = std::string(dispatch_key).empty()
             ? c10::nullopt
             : c10::make_optional(c10::parseDispatchKey(dispatch_key));
         auto op_names =
@@ -612,8 +613,9 @@ void initDispatchBindings(PyObject* module) {
         std::vector<std::string> names;
         names.reserve(op_names.size());
         for (auto& op : op_names) {
-          names.push_back(
-              op.name + (op.overload_name == "" ? "" : "." + op.overload_name));
+          names.emplace_back(
+              op.name +
+              (op.overload_name.empty() ? "" : "." + op.overload_name));
         }
         return names;
       },
diff --git a/torch/csrc/utils/python_numbers.h b/torch/csrc/utils/python_numbers.h
index a81e72f764aa..da6025a1bab1 100644
--- a/torch/csrc/utils/python_numbers.h
+++ b/torch/csrc/utils/python_numbers.h
@@ -91,21 +91,7 @@ inline uint64_t THPUtils_unpackUInt64(PyObject* obj) {
   return (uint64_t)value;
 }
 
-inline bool THPUtils_checkIndex(PyObject* obj) {
-  if (PyBool_Check(obj)) {
-    return false;
-  }
-  if (THPUtils_checkLong(obj)) {
-    return true;
-  }
-  torch::jit::tracer::NoWarn no_warn_guard;
-  auto index = THPObjectPtr(PyNumber_Index(obj));
-  if (!index) {
-    PyErr_Clear();
-    return false;
-  }
-  return true;
-}
+bool THPUtils_checkIndex(PyObject* obj);
 
 inline int64_t THPUtils_unpackIndex(PyObject* obj) {
   if (!THPUtils_checkLong(obj)) {
diff --git a/torch/csrc/utils/python_symnode.h b/torch/csrc/utils/python_symnode.h
index ec0c49c64dc5..53adbcdfb247 100644
--- a/torch/csrc/utils/python_symnode.h
+++ b/torch/csrc/utils/python_symnode.h
@@ -3,6 +3,7 @@
 #include <c10/core/SafePyObject.h>
 #include <c10/core/SymNodeImpl.h>
 
+#include <torch/csrc/PyInterpreter.h>
 #include <torch/csrc/autograd/python_variable.h>
 #include <torch/csrc/utils/pybind.h>
 
@@ -50,14 +51,26 @@ class PythonSymNodeImpl : public c10::SymNodeImpl {
     return c10::make_intrusive<PythonSymNodeImpl>(std::move(r));
   }
 
-  c10::SymNode is_non_overlapping_and_dense(
-      c10::ArrayRef<c10::SymNode> sizes,
-      c10::ArrayRef<c10::SymNode> strides) override {
-    py::gil_scoped_acquire acquire;
-    auto r = getPyObj().attr("is_non_overlapping_and_dense")(sizes, strides);
-    return c10::make_intrusive<PythonSymNodeImpl>(std::move(r));
+#define TORCH_SYMNODE_SIZES_STRIDES(n)                                        \
+  c10::SymNode n(                                                             \
+      c10::ArrayRef<c10::SymNode> sizes, c10::ArrayRef<c10::SymNode> strides) \
+      override {                                                              \
+    py::gil_scoped_acquire acquire;                                           \
+    auto r = getPyObj().attr(#n)(sizes, strides);                             \
+    return c10::make_intrusive<PythonSymNodeImpl>(std::move(r));              \
   }
 
+  // clang-format off
+    TORCH_SYMNODE_SIZES_STRIDES(is_contiguous)
+    TORCH_SYMNODE_SIZES_STRIDES(is_channels_last_contiguous_2d)
+    TORCH_SYMNODE_SIZES_STRIDES(is_channels_last_contiguous_3d)
+    TORCH_SYMNODE_SIZES_STRIDES(is_channels_last_strides_2d)
+    TORCH_SYMNODE_SIZES_STRIDES(is_channels_last_strides_3d)
+    TORCH_SYMNODE_SIZES_STRIDES(is_non_overlapping_and_dense)
+  // clang-format on
+
+#undef TORCH_SYMNODE_SIZES_STRIDES
+
   bool bool_() override {
     py::gil_scoped_acquire acquire;
     return getPyObj().attr("bool_")().is(py::handle(Py_True));
@@ -78,6 +91,11 @@ class PythonSymNodeImpl : public c10::SymNodeImpl {
     return getPyObj().attr("is_bool")().is(py::handle(Py_True));
   }
 
+  bool has_hint() override {
+    py::gil_scoped_acquire acquire;
+    return getPyObj().attr("has_hint")().is(py::handle(Py_True));
+  }
+
   int64_t guard_int(const char* file, int64_t line) override {
     py::gil_scoped_acquire acquire;
     return getPyObj().attr("guard_int")(file, line).cast<int64_t>();
diff --git a/torch/csrc/utils/schema_info.cpp b/torch/csrc/utils/schema_info.cpp
index fafd1c121180..0d09b3dba6b2 100644
--- a/torch/csrc/utils/schema_info.cpp
+++ b/torch/csrc/utils/schema_info.cpp
@@ -261,7 +261,7 @@ std::vector<c10::FunctionSchema> SchemaInfo::getNonDeterministicOps() {
   std::vector<c10::FunctionSchema> nondeterministic_ops;
   nondeterministic_ops.reserve(nondeterministic_op_strings.size());
   for (const std::string& signature : nondeterministic_op_strings) {
-    nondeterministic_ops.push_back(torch::jit::parseSchema(signature));
+    nondeterministic_ops.emplace_back(torch::jit::parseSchema(signature));
   }
 
   return nondeterministic_ops;
@@ -341,7 +341,7 @@ void SchemaInfo::initSchemaInfo() {
       c10::optional<c10::AliasTypeSet> contained_types =
           schema_.getAliasTypeSetContainedTypes(
               schema_.mapTypeToAliasTypeSet(argument.type()));
-      if (contained_types && contained_types->size() > 0) {
+      if (contained_types && !contained_types->empty()) {
         container_set_.insert({type, i});
       }
     }
diff --git a/torch/csrc/utils/schema_info.h b/torch/csrc/utils/schema_info.h
index ae1a6f766ede..461f5a6f0427 100644
--- a/torch/csrc/utils/schema_info.h
+++ b/torch/csrc/utils/schema_info.h
@@ -17,7 +17,7 @@ using SchemaSpecialCasePair =
 
 struct TORCH_API SchemaInfo {
  public:
-  explicit SchemaInfo(const c10::FunctionSchema& schema)
+  explicit SchemaInfo(c10::FunctionSchema schema)
       : schema_(std::move(schema)),
         alias_maps_current_(false),
         has_init_(false) {}
diff --git a/torch/csrc/utils/tensor_apply.cpp b/torch/csrc/utils/tensor_apply.cpp
index 7632c6511ea4..7d7012661fe9 100644
--- a/torch/csrc/utils/tensor_apply.cpp
+++ b/torch/csrc/utils/tensor_apply.cpp
@@ -35,7 +35,7 @@ static void recursive_apply(
     int64_t dim,
     PyObject* fn,
     std::array<StridedData, N> strided_data) {
-  int64_t ndim = sizes.size();
+  int64_t ndim = static_cast<int64_t>(sizes.size());
   if (dim == ndim) {
     auto args = THPObjectPtr(PyTuple_New(N));
     if (!args)
diff --git a/torch/csrc/utils/tensor_dtypes.cpp b/torch/csrc/utils/tensor_dtypes.cpp
index 3e0e3acf38c2..fd9a6b26a4b2 100644
--- a/torch/csrc/utils/tensor_dtypes.cpp
+++ b/torch/csrc/utils/tensor_dtypes.cpp
@@ -78,7 +78,7 @@ void initializeDtypes() {
         0) {
       throw python_error();
     }
-    if (legacy_name != "") {
+    if (!legacy_name.empty()) {
       Py_INCREF(dtype);
       if (PyModule_AddObject(torch_module.get(), legacy_name.c_str(), dtype) !=
           0) {
diff --git a/torch/csrc/utils/tensor_flatten.cpp b/torch/csrc/utils/tensor_flatten.cpp
index 6b0f6388b276..396a6e8a3a8e 100644
--- a/torch/csrc/utils/tensor_flatten.cpp
+++ b/torch/csrc/utils/tensor_flatten.cpp
@@ -29,7 +29,7 @@ std::vector<TensorGroup> take_tensors(
       tensor_size = tensor.numel() * tensor.element_size();
     }
 
-    auto& type_group = groups[type_id(tensor)];
+    auto& type_group = groups[static_cast<int64_t>(type_id(tensor))];
     type_group.tensors.push_back(tensor);
 
     if (fine_grained) {
@@ -104,7 +104,7 @@ std::vector<at::Tensor> unflatten_sparse_tensors(
     const at::Tensor& flat_indices,
     const at::Tensor& flat_values,
     at::TensorList tensors) {
-  if (tensors.size() == 0)
+  if (tensors.empty())
     return {};
 
   auto indices =
diff --git a/torch/csrc/utils/tensor_list.cpp b/torch/csrc/utils/tensor_list.cpp
index 76d587f0166c..df7ca9be2943 100644
--- a/torch/csrc/utils/tensor_list.cpp
+++ b/torch/csrc/utils/tensor_list.cpp
@@ -17,8 +17,8 @@ static PyObject* recursive_to_list(
     IntArrayRef strides,
     int64_t dim,
     ScalarType scalarType,
-    int64_t elementSize) {
-  int64_t ndim = sizes.size();
+    size_t elementSize) {
+  int64_t ndim = static_cast<int64_t>(sizes.size());
   if (dim == ndim) {
     return torch::utils::load_scalar(data, scalarType);
   }
diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp
index 6121c4c43eed..bec1649fb05e 100644
--- a/torch/csrc/utils/tensor_new.cpp
+++ b/torch/csrc/utils/tensor_new.cpp
@@ -33,19 +33,14 @@
 #include <stdexcept>
 #include <vector>
 
-using at::Backend;
 using at::Device;
 using at::IntArrayRef;
-using at::kCPU;
-using at::kCUDA;
 using at::kInt;
 using at::kLong;
-using at::Scalar;
 using at::ScalarType;
 using at::Storage;
 using at::Tensor;
 using at::TensorOptions;
-using at::Type;
 using c10::optional;
 
 namespace torch {
@@ -64,7 +59,7 @@ TensorOptions build_options(
   return options;
 }
 
-void maybe_initialize_cuda(const Device device) {
+void maybe_initialize_cuda(const Device& device) {
   if (device.is_cuda()) {
     torch::utils::cuda_lazy_init();
   }
@@ -103,7 +98,7 @@ std::vector<int64_t> compute_sizes(PyObject* seq, ScalarType scalar_type) {
     if (length < 0)
       throw python_error();
     if (is_storage) {
-      length /= elementSize(scalar_type);
+      length /= static_cast<int64_t>(elementSize(scalar_type));
     }
     sizes.push_back(length);
     if (sizes.size() > MAX_DIMS) {
@@ -205,11 +200,11 @@ void recursive_store(
     IntArrayRef strides,
     int64_t dim,
     ScalarType scalarType,
-    int elementSize,
+    size_t elementSize,
     PyObject* obj) {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(data != nullptr);
 
-  int64_t ndim = sizes.size();
+  int64_t ndim = static_cast<int64_t>(sizes.size());
   bool is_symfloat = torch::is_symfloat(obj);
   bool is_symint = torch::is_symint(obj);
   if (dim == ndim) {
@@ -374,7 +369,7 @@ Tensor internal_new_from_data(
       at::tracer::impl::NoTracerDispatchMode tracer_guard;
 
       if (isStorage(data)) {
-        ScalarType storage_scalar_type;
+        ScalarType storage_scalar_type{ScalarType::Undefined};
         bool is_typed_storage = false;
         Storage storage =
             createStorageGetType(data, storage_scalar_type, is_typed_storage);
@@ -492,6 +487,7 @@ void check_base_legacy_new(
         c10::DispatchKey::HPU,
         c10::DispatchKey::MPS,
         c10::DispatchKey::Meta,
+        c10::DispatchKey::PrivateUse1,
     });
     TORCH_CHECK(
         expected_key_set.has(dispatch_key),
@@ -558,13 +554,29 @@ Tensor legacy_sparse_tensor_generic_ctor_new(
   ParsedArgs<4> parsed_args;
   auto r = parser.parse(args, kwargs, parsed_args);
   if (r.idx == 0) {
+    if (ctor_or_new == CtorOrNew::CTOR) {
+      TORCH_WARN_ONCE(
+          "torch.sparse.SparseTensor() is deprecated."
+          "  Please use torch.sparse_coo_tensor((0,), dtype=).");
+    }
     auto deviceOptional = r.deviceOptional(0);
     check_legacy_ctor_device(dispatch_key, deviceOptional);
     return at::empty({0}, build_options(options, scalar_type, deviceOptional));
   } else if (r.idx == 1) {
+    if (ctor_or_new == CtorOrNew::CTOR) {
+      TORCH_WARN_ONCE(
+          "torch.sparse.SparseTensor(cdata=x._cdata) is deprecated."
+          "  Please use torch.sparse_coo_tensor(x._indices(), x._values(), x.shape).");
+    }
+    // NOLINTNEXTLINE(performance-no-int-to-ptr)
     auto cdata = reinterpret_cast<void*>(r.toInt64(0));
     return at::unsafeTensorFromTH(cdata, true);
   } else if (r.idx == 2) {
+    if (ctor_or_new == CtorOrNew::CTOR) {
+      TORCH_WARN_ONCE(
+          "torch.sparse.SparseTensor(indices, values, *, device=) is deprecated."
+          "  Please use torch.sparse_coo_tensor(indices, values, dtype=, device=).");
+    }
     // Note: this signature doesn't have a dtype, even though it has a device;
     // it probably shouldn't have a device (we should infer it).
     auto deviceOptional = r.deviceOptional(2);
@@ -572,6 +584,11 @@ Tensor legacy_sparse_tensor_generic_ctor_new(
     at::OptionalDeviceGuard device_guard(deviceOptional);
     return at::sparse_coo_tensor(r.tensor(0), r.tensor(1));
   } else if (r.idx == 3) {
+    if (ctor_or_new == CtorOrNew::CTOR) {
+      TORCH_WARN_ONCE(
+          "torch.sparse.SparseTensor(indices, values, shape, *, device=) is deprecated."
+          "  Please use torch.sparse_coo_tensor(indices, values, shape, dtype=, device=).");
+    }
     // Note: this signature doesn't have a dtype, even though it has a device;
     // it probably shouldn't have a device (we should infer it).
     auto deviceOptional = r.deviceOptional(3);
@@ -588,7 +605,7 @@ Tensor legacy_sparse_tensor_generic_ctor_new(
       // unless the sequences is a torch.Size
       if (ctor_or_new == CtorOrNew::CTOR) {
         throw TypeError(
-            "torch.SparseTensor(sequence) only accepts sizes.  Please use torch.sparse_coo_tensor() "
+            "torch.sparse.SparseTensor(sequence) only accepts sizes.  Please use torch.sparse_coo_tensor() "
             "or construct a strided tensor and convert it to sparse via to_sparse.");
       } else {
         throw TypeError(
@@ -596,6 +613,11 @@ Tensor legacy_sparse_tensor_generic_ctor_new(
             "or construct a strided tensor and convert it to sparse via to_sparse.");
       }
     }
+    if (ctor_or_new == CtorOrNew::CTOR) {
+      TORCH_WARN_ONCE(
+          "torch.sparse.SparseTensor(shape, *, device=) is deprecated."
+          "  Please use torch.sparse_coo_tensor(shape, dtype=, device=).");
+    }
     return new_with_sizes(
         options, scalar_type, r.deviceOptional(1), r.symintlist(0));
   }
@@ -608,9 +630,9 @@ c10::TensorOptions typeIdWithDefault(
     int64_t device_idx,
     c10::DispatchKey dispatch_key) {
   auto options = dispatchKeyToTensorOptions(dispatch_key);
-  if (!r.isNone(device_idx)) {
+  if (!r.isNone(static_cast<int>(device_idx))) {
     // TODO: This line doesn't seem to be exercised at all in tests
-    options = options.device(r.device(device_idx).type());
+    options = options.device(r.device(static_cast<int>(device_idx)).type());
   }
   return options;
 }
@@ -655,7 +677,7 @@ Tensor legacy_tensor_generic_ctor_new(
     at::OptionalDeviceGuard device_guard(deviceOptional);
     return at::empty({0}, build_options(options, scalar_type));
   } else if (r.idx == 1) {
-    at::ScalarType storage_scalar_type;
+    at::ScalarType storage_scalar_type{at::ScalarType::Undefined};
     bool is_typed_storage = false;
     at::Storage storage = r.storage(0, storage_scalar_type, is_typed_storage);
     if (storage_scalar_type != at::ScalarType::Undefined && is_typed_storage) {
@@ -669,6 +691,7 @@ Tensor legacy_tensor_generic_ctor_new(
     }
     return new_with_storage(options, scalar_type, storage);
   } else if (r.idx == 2) {
+    // NOLINTNEXTLINE(performance-no-int-to-ptr)
     auto cdata = reinterpret_cast<void*>(r.toInt64(0));
     return at::unsafeTensorFromTH(cdata, true);
   } else if (r.idx == 3) {
@@ -786,9 +809,8 @@ Tensor indexing_tensor_from_data(
 
 class CheckSparseTensorInvariantsContext {
  public:
-  CheckSparseTensorInvariantsContext() {
-    state = at::globalContext().checkSparseTensorInvariants();
-  }
+  CheckSparseTensorInvariantsContext()
+      : state{at::globalContext().checkSparseTensorInvariants()} {}
   ~CheckSparseTensorInvariantsContext() {
     at::globalContext().setCheckSparseTensorInvariants(state);
   }
@@ -1603,10 +1625,39 @@ Tensor asarray(
   }
 
 #ifdef USE_NUMPY
-  // Check whether 'obj' is a NumPy Array
-  if (is_numpy_available() && PyArray_Check(obj)) {
-    tensor = tensor_from_numpy(obj, /*warn_if_not_writeable=*/false);
-    should_warn_numpy_not_writable = !PyArray_ISWRITEABLE((PyArrayObject*)obj);
+  if (is_numpy_available()) {
+    // Check whether 'obj' is a NumPy Array or Scalar.
+    bool is_numpy_array = PyArray_Check(obj);
+    bool is_numpy_scalar = PyArray_CheckScalar(obj);
+
+    if (is_numpy_array || is_numpy_scalar) {
+      THPObjectPtr ptr;
+      auto arr = obj;
+
+      if (is_numpy_scalar) {
+        TORCH_CHECK(
+            !force_alias,
+            "can't alias NumPy scalars. ",
+            "Either remove copy=False or transform it in a ndarray. ")
+
+        ptr = PyArray_FromScalar(obj, nullptr);
+        arr = ptr.get();
+      }
+
+      tensor = tensor_from_numpy(arr, /*warn_if_not_writeable=*/false);
+      should_warn_numpy_not_writable =
+          !PyArray_ISWRITEABLE((PyArrayObject*)arr);
+
+      if (is_numpy_scalar) {
+        // Uses a newly cloned storage, instead of the shared one.
+        // The THPObjectPtr will delete the previous storage in the
+        // end of the previous scope.
+        tensor = tensor.clone();
+
+        // No need to clone again, later.
+        force_copy = false;
+      }
+    }
   }
 #endif
 
diff --git a/torch/csrc/utils/tensor_numpy.cpp b/torch/csrc/utils/tensor_numpy.cpp
index 0ba584eac7bd..62ca17464152 100644
--- a/torch/csrc/utils/tensor_numpy.cpp
+++ b/torch/csrc/utils/tensor_numpy.cpp
@@ -175,7 +175,7 @@ PyObject* tensor_to_numpy(const at::Tensor& tensor, bool force /*=false*/) {
 
   auto array = THPObjectPtr(PyArray_New(
       &PyArray_Type,
-      prepared_tensor.dim(),
+      static_cast<int>(prepared_tensor.dim()),
       sizes.data(),
       dtype,
       strides.data(),
@@ -382,6 +382,7 @@ at::Tensor tensor_from_cuda_array_interface(PyObject* obj) {
   }
 
   // Extract the `obj.__cuda_array_interface__['typestr']` attribute
+  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   ScalarType dtype;
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   int dtype_size_in_bytes;
diff --git a/torch/csrc/utils/tensor_types.cpp b/torch/csrc/utils/tensor_types.cpp
index f81ed6461a66..decf407b982e 100644
--- a/torch/csrc/utils/tensor_types.cpp
+++ b/torch/csrc/utils/tensor_types.cpp
@@ -47,6 +47,8 @@ static const char* backend_to_string(const at::Backend& backend) {
       return "torch.lazy";
     case at::Backend::XLA:
       return "torch.xla";
+    case at::Backend::Meta:
+      return "torch.meta";
     default:
       AT_ERROR("Unimplemented backend ", backend);
   }
diff --git a/torch/csrc/utils/torch_dispatch_mode.h b/torch/csrc/utils/torch_dispatch_mode.h
index 2c97a7d96c32..1b36569bdf66 100644
--- a/torch/csrc/utils/torch_dispatch_mode.h
+++ b/torch/csrc/utils/torch_dispatch_mode.h
@@ -26,8 +26,8 @@ struct StashTorchDispatchModeGuard {
 struct StashTorchDispatchStackGuard {
  public:
   StashTorchDispatchStackGuard() {
-    const auto old = c10::impl::TorchDispatchModeTLS::get_state();
-    c10::impl::TorchDispatchModeTLS::set_state(saved_state_);
+    auto old = c10::impl::TorchDispatchModeTLS::get_state();
+    c10::impl::TorchDispatchModeTLS::set_state(std::move(saved_state_));
     saved_state_ = std::move(old);
   }
 
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index eec9bc2d1986..a471294ef960 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -16,7 +16,7 @@
 import warnings
 import threading
 from functools import lru_cache
-from typing import Any, List, Optional, Set, Tuple, Union
+from typing import Any, List, Optional, Tuple, Union, cast
 from ._utils import _get_device_index, _dummy_type
 from .._utils import classproperty
 from .graphs import CUDAGraph, graph_pool_handle, graph, \
@@ -37,6 +37,15 @@
 _is_in_bad_fork = getattr(torch._C, "_cuda_isInBadFork", lambda: False)
 _device_t = Union[_device, str, int, None]
 
+_HAS_PYNVML = False
+_PYNVML_ERR = None
+try:
+    import pynvml  # type: ignore[import]
+    _HAS_PYNVML = True
+except ImportError as err:
+    _PYNVML_ERR = err  # sometimes a lib is installed but the import fails for some other reason, so we log the error for later
+
+
 
 class _LazySeedTracker:
     # Since seeding is memory-less, only track the latest seed.
@@ -272,14 +281,14 @@ def cudart():
     return _cudart
 
 
-class cudaStatus(object):
+class cudaStatus:
     SUCCESS: int = 0
     ERROR_NOT_READY: int = 34
 
 class CudaError(RuntimeError):
     def __init__(self, code: int) -> None:
         msg = _cudart.cudaGetErrorString(_cudart.cudaError(code))
-        super(CudaError, self).__init__('{0} ({1})'.format(msg, code))
+        super().__init__('{0} ({1})'.format(msg, code))
 
 
 def check_error(res: int) -> None:
@@ -300,7 +309,7 @@ def __exit__(self, type: Any, value: Any, traceback: Any):
         return False
 
 
-class device(object):
+class device:
     r"""Context-manager that changes the selected device.
 
     Args:
@@ -313,17 +322,10 @@ def __init__(self, device: Any):
         self.prev_idx = -1
 
     def __enter__(self):
-        if self.idx == -1:
-            return
-        self.prev_idx = torch.cuda.current_device()
-        if self.prev_idx != self.idx:
-            torch.cuda.set_device(self.idx)
-        if not torch.jit.is_scripting():
-            _lazy_init()
+        self.prev_idx = torch.cuda._exchange_device(self.idx)
 
     def __exit__(self, type: Any, value: Any, traceback: Any):
-        if self.prev_idx != self.idx:
-            torch.cuda.set_device(self.prev_idx)
+        torch.cuda._exchange_device(self.prev_idx)
         return False
 
 
@@ -339,7 +341,7 @@ class device_of(device):
 
     def __init__(self, obj):
         idx = obj.get_device() if obj.is_cuda else -1
-        super(device_of, self).__init__(idx)
+        super().__init__(idx)
 
 
 def set_device(device: _device_t) -> None:
@@ -418,7 +420,7 @@ def can_device_access_peer(device: _device_t, peer_device: _device_t) -> bool:
     return torch._C._cuda_canDeviceAccessPeer(device, peer_device)
 
 
-class StreamContext(object):
+class StreamContext:
     r"""Context-manager that selects a given stream.
 
     All CUDA kernels queued within its context will be enqueued on a selected
@@ -494,46 +496,131 @@ def set_stream(stream: Stream):
         return
     torch._C._cuda_setStream(stream_id=stream.stream_id, device_index=stream.device_index, device_type=stream.device_type)
 
-def _parse_visible_devices() -> Set[int]:
+
+def _parse_visible_devices() -> Union[List[int], List[str]]:
     """Parse CUDA_VISIBLE_DEVICES environment variable."""
     var = os.getenv("CUDA_VISIBLE_DEVICES")
     if var is None:
-        return set(x for x in range(64))
+        return list(range(64))
 
     def _strtoul(s: str) -> int:
         """Return -1 or positive integer sequence string starts with,"""
         if not s:
             return -1
         for idx, c in enumerate(s):
-            if not c.isdigit():
+            if not (c.isdigit() or (idx == 0 and c in '+-')):
                 break
             if idx + 1 == len(s):
                 idx += 1
         return int(s[:idx]) if idx > 0 else -1
 
+    def parse_list_with_prefix(lst: str, prefix: str) -> List[str]:
+        rcs: List[str] = []
+        for elem in lst.split(","):
+            # Repeated id results in empty set
+            if elem in rcs:
+                return cast(List[str], [])
+            # Anything other but prefix is ignored
+            if not elem.startswith(prefix):
+                break
+            rcs.append(elem)
+        return rcs
+
+    if var.startswith("GPU-"):
+        return parse_list_with_prefix(var, "GPU-")
+    if var.startswith("MIG-"):
+        return parse_list_with_prefix(var, "MIG-")
     # CUDA_VISIBLE_DEVICES uses something like strtoul
     # which makes `1gpu2,2ampere` is equivalent to `1,2`
-    rc: Set[int] = set()
+    rc: List[int] = []
     for elem in var.split(","):
-        rc.add(_strtoul(elem.strip()))
+        x = _strtoul(elem.strip())
+        # Repeated ordinal results in empty set
+        if x in rc:
+            return cast(List[int], [])
+        # Negative value aborts the sequence
+        if x < 0:
+            break
+        rc.append(x)
     return rc
 
+
 def _raw_device_count_nvml() -> int:
     """Return number of devices as reported by NVML
     or negative value if NVML discovery/initialization failed."""
-    from ctypes import CDLL, c_int
+    from ctypes import CDLL, c_int, byref
     nvml_h = CDLL("libnvidia-ml.so.1")
     rc = nvml_h.nvmlInit()
     if rc != 0:
         warnings.warn("Can't initialize NVML")
         return -1
-    dev_arr = (c_int * 1)(-1)
-    rc = nvml_h.nvmlDeviceGetCount_v2(dev_arr)
+    dev_count = c_int(-1)
+    rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
     if rc != 0:
         warnings.warn("Can't get nvml device count")
         return -1
     del nvml_h
-    return dev_arr[0]
+    return dev_count.value
+
+
+def _raw_device_uuid_nvml() -> Optional[List[str]]:
+    """Return list of device UUID as reported by NVML
+    or None if NVM discovery/initialization failed."""
+    from ctypes import CDLL, c_int, c_void_p, create_string_buffer, byref
+    nvml_h = CDLL("libnvidia-ml.so.1")
+    rc = nvml_h.nvmlInit()
+    if rc != 0:
+        warnings.warn("Can't initialize NVML")
+        return None
+    dev_count = c_int(-1)
+    rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
+    if rc != 0:
+        warnings.warn("Can't get nvml device count")
+        return None
+    uuids: List[str] = []
+    for idx in range(dev_count.value):
+        dev_id = c_void_p()
+        rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
+        if rc != 0:
+            warnings.warn("Can't get device handle")
+            return None
+        buf_len = 96
+        buf = create_string_buffer(buf_len)
+        rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
+        if rc != 0:
+            warnings.warn("Can't get device UUID")
+            return None
+        uuids.append(buf.raw.decode("ascii").strip('\0'))
+    del nvml_h
+    return uuids
+
+
+def _transform_uuid_to_ordinals(candidates: List[str], uuids: List[str]) -> List[int]:
+    """Given the set of partial uuids and list of known uuids builds
+    a set of ordinals excluding ambiguous partials IDs"""
+    def uuid_to_orinal(candidate: str, uuids: List[str]) -> int:
+        best_match = -1
+        for idx, uuid in enumerate(uuids):
+            if not uuid.startswith(candidate):
+                continue
+            # Ambigous candidate
+            if best_match != -1:
+                return -1
+            best_match = idx
+        return best_match
+
+    rc: List[int] = []
+    for candidate in candidates:
+        idx = uuid_to_orinal(candidate, uuids)
+        # First invalid ordinal stops parsing
+        if idx < 0:
+            break
+        # Duplicates result in empty set
+        if idx in rc:
+            return cast(List[int], [])
+        rc.append(idx)
+    return rc
+
 
 def _device_count_nvml() -> int:
     """Return number of devices as reported by NVML taking CUDA_VISIBLE_DEVICES into account.
@@ -542,14 +629,41 @@ def _device_count_nvml() -> int:
     if not visible_devices:
         return 0
     try:
-        raw_cnt = _raw_device_count_nvml()
+        if type(visible_devices[0]) is str:
+            # Skip MIG parsing
+            if visible_devices[0].startswith("MIG-"):
+                return -1
+            uuids = _raw_device_uuid_nvml()
+            if uuids is None:
+                return -1
+            visible_devices = _transform_uuid_to_ordinals(cast(List[str], visible_devices), uuids)
+        else:
+            raw_cnt = _raw_device_count_nvml()
+            if raw_cnt <= 0:
+                return raw_cnt
+            # Trim the list up to a maximum available device
+            for idx, val in enumerate(visible_devices):
+                if cast(int, val) >= raw_cnt:
+                    return idx
     except OSError:
         return -1
     except AttributeError:
         return -1
-    if raw_cnt <= 0:
-        return raw_cnt
-    return len(set(range(raw_cnt)).intersection(visible_devices))
+    return len(visible_devices)
+
+def _get_nvml_device_index(device: Optional[Union[int, Device]]) -> int:
+    r"""Returns the NVML index of the device, taking CUDA_VISIBLE_DEVICES into account."""
+    idx = _get_device_index(device, optional=True)
+    visible_devices = _parse_visible_devices()
+    if type(visible_devices[0]) is str:
+        uuids = _raw_device_uuid_nvml()
+        if uuids is None:
+            raise RuntimeError("Can't get device UUIDs")
+        visible_devices = _transform_uuid_to_ordinals(cast(List[str], visible_devices), uuids)
+    idx_map = {idx: real_idx for idx, real_idx in enumerate(cast(List[int], visible_devices))}
+    if idx not in idx_map:
+        raise RuntimeError(f"device {idx} is not visible (CUDA_VISIBLE_DEVICES={visible_devices})")
+    return idx_map[idx]
 
 @lru_cache(maxsize=1)
 def device_count() -> int:
@@ -677,6 +791,19 @@ def get_sync_debug_mode() -> int:
     return torch._C._cuda_get_sync_debug_mode()
 
 
+def _get_pynvml_handler(device: Optional[Union[Device, int]] = None):
+    if not _HAS_PYNVML:
+        raise ModuleNotFoundError("pynvml does not seem to be installed or it can't be imported.") from _PYNVML_ERR
+    from pynvml import NVMLError_DriverNotLoaded
+    try:
+        pynvml.nvmlInit()
+    except NVMLError_DriverNotLoaded as e:
+        raise RuntimeError("cuda driver can't be loaded, is cuda enabled?") from e
+
+    device = _get_nvml_device_index(device)
+    handle = pynvml.nvmlDeviceGetHandleByIndex(device)
+    return handle
+
 def memory_usage(device: Optional[Union[Device, int]] = None) -> int:
     r"""Returns the percent of time over the past sample period during which global (device)
     memory was being read or written. as given by `nvidia-smi`.
@@ -689,16 +816,9 @@ def memory_usage(device: Optional[Union[Device, int]] = None) -> int:
     Warning: Each sample period may be between 1 second and 1/6 second,
     depending on the product being queried.
     """
-    try:
-        import pynvml  # type: ignore[import]
-    except ModuleNotFoundError as e:
-        raise ModuleNotFoundError("pynvml module not found, please install pynvml") from e
-    from pynvml import NVMLError_DriverNotLoaded
-    try:
-        pynvml.nvmlInit()
-    except NVMLError_DriverNotLoaded as e:
-        raise RuntimeError("cuda driver can't be loaded, is cuda enabled?") from e
-    device = _get_device_index(device, optional=True)
+    handle = _get_pynvml_handler()
+
+    device = _get_nvml_device_index(device)
     handle = pynvml.nvmlDeviceGetHandleByIndex(device)
     return pynvml.nvmlDeviceGetUtilizationRates(handle).memory
 
@@ -715,19 +835,59 @@ def utilization(device: Optional[Union[Device, int]] = None) -> int:
     Warning: Each sample period may be between 1 second and 1/6 second,
     depending on the product being queried.
     """
-    try:
-        import pynvml  # type: ignore[import]
-    except ModuleNotFoundError as e:
-        raise ModuleNotFoundError("pynvml module not found, please install pynvml") from e
-    from pynvml import NVMLError_DriverNotLoaded
-    try:
-        pynvml.nvmlInit()
-    except NVMLError_DriverNotLoaded as e:
-        raise RuntimeError("cuda driver can't be loaded, is cuda enabled?") from e
-    device = _get_device_index(device, optional=True)
+
+    handle = _get_pynvml_handler(device)
+    device = _get_nvml_device_index(device)
     handle = pynvml.nvmlDeviceGetHandleByIndex(device)
     return pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
 
+def temperature(device: Optional[Union[Device, int]] = None) -> int:
+    r"""Returns the average temperature of the GPU sensor in Degrees C (Centigrades)
+        over the past sample period as given by `nvidia-smi`.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    Warning: Each sample period may be between 1 second and 1/6 second,
+    depending on the product being queried.
+    """
+    handle = _get_pynvml_handler(device)
+    # 0 refers to the temperature sensor for the GPU die.
+    return pynvml.nvmlDeviceGetTemperature(handle, 0)
+
+def power_draw(device: Optional[Union[Device, int]] = None) -> int:
+    r"""Returns the average power draw of the GPU sensor in mW (MilliWatts)
+        over the past sample period as given by `nvidia-smi` for Fermi or newer fully supported devices.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    Warning: Each sample period may be between 1 second and 1/6 second,
+    depending on the product being queried.
+    """
+    handle = _get_pynvml_handler(device)
+    return pynvml.nvmlDeviceGetPowerUsage(handle)
+
+def clock_rate(device: Optional[Union[Device, int]] = None) -> int:
+    r"""Returns the clock speed of the GPU SM in Hz Hertz over the past sample period as given by `nvidia-smi`.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    Warning: Each sample period may be between 1 second and 1/6 second,
+    depending on the product being queried.
+    """
+    handle = _get_pynvml_handler(device)
+    return pynvml.nvmlDeviceGetClockInfo(handle, 1)
+
+
+
 
 from .memory import *  # noqa: F403
 
@@ -746,7 +906,7 @@ def _lazy_new(cls, *args, **kwargs):
     return super(_CudaBase, cls).__new__(cls, *args, **kwargs)
 
 
-class _CudaBase(object):
+class _CudaBase:
     is_cuda = True
     is_sparse = False
 
@@ -755,7 +915,7 @@ def type(self, *args, **kwargs):
         # but it is only available in the typing module on Python >= 3.8
         # or on typing_extensions module on Python >= 3.6
         with device(self.get_device()):  # type: ignore[attr-defined]
-            return super(_CudaBase, self).type(*args, **kwargs)  # type: ignore[misc]
+            return super().type(*args, **kwargs)  # type: ignore[misc]
 
     __new__ = _lazy_new
 
@@ -940,7 +1100,8 @@ def _dtype(self):
     'is_current_stream_capturing', 'is_initialized', 'jiterator', 'list_gpu_processes', 'make_graphed_callables',
     'manual_seed', 'manual_seed_all', 'max_memory_allocated', 'max_memory_cached', 'max_memory_reserved',
     'mem_get_info', 'memory', 'memory_allocated', 'memory_cached', 'memory_reserved', 'memory_snapshot',
-    'memory_stats', 'memory_stats_as_nested_dict', 'memory_summary', 'memory_usage', 'nccl', 'nvtx', 'profiler',
-    'random', 'reset_accumulated_memory_stats', 'reset_max_memory_allocated', 'reset_max_memory_cached',
-    'reset_peak_memory_stats', 'seed', 'seed_all', 'set_device', 'set_per_process_memory_fraction', 'set_rng_state',
-    'set_rng_state_all', 'set_stream', 'set_sync_debug_mode', 'sparse', 'stream', 'streams', 'synchronize', 'utilization']
+    'memory_stats', 'memory_stats_as_nested_dict', 'memory_summary', 'memory_usage', 'temperature', 'power_draw',
+    'clock_rate', 'nccl', 'nvtx', 'profiler', 'random', 'reset_accumulated_memory_stats', 'reset_max_memory_allocated',
+    'reset_max_memory_cached', 'reset_peak_memory_stats', 'seed', 'seed_all', 'set_device', 'set_per_process_memory_fraction',
+    'set_rng_state', 'set_rng_state_all', 'set_stream', 'set_sync_debug_mode', 'sparse', 'stream', 'streams',
+    'synchronize', 'utilization']
diff --git a/torch/cuda/_memory_viz.py b/torch/cuda/_memory_viz.py
index f034639cceba..dc7ebc67d8a8 100644
--- a/torch/cuda/_memory_viz.py
+++ b/torch/cuda/_memory_viz.py
@@ -85,11 +85,11 @@ def _seg_info(seg):
 
     f = io.StringIO()
 
-    before_segs = set(_seg_key(seg) for seg in before)
-    after_segs = set(_seg_key(seg) for seg in after)
+    before_segs = {_seg_key(seg) for seg in before}
+    after_segs = {_seg_key(seg) for seg in after}
 
-    print(f'only_before = {list(a for a,_ in (before_segs - after_segs))}')
-    print(f'only_after = {list(a for a,_ in (after_segs - before_segs))}')
+    print(f'only_before = {[a for a,_ in (before_segs - after_segs)]}')
+    print(f'only_after = {[a for a,_ in (after_segs - before_segs)]}')
 
     for seg in before:
         if _seg_key(seg) not in after_segs:
diff --git a/torch/cuda/amp/autocast_mode.py b/torch/cuda/amp/autocast_mode.py
index 83bc6beb5e79..d9347ecf842c 100644
--- a/torch/cuda/amp/autocast_mode.py
+++ b/torch/cuda/amp/autocast_mode.py
@@ -6,7 +6,6 @@
     HAS_NUMPY = True
 except ModuleNotFoundError:
     np = None  # type: ignore[assignment]
-from torch._six import string_classes
 from typing import Any
 
 __all__ = ["autocast", "custom_fwd", "custom_bwd"]
@@ -48,7 +47,7 @@ def _cast(value, dtype):
     if isinstance(value, torch.Tensor):
         is_eligible = (value.is_floating_point() and value.is_cuda and (value.dtype is not torch.float64))
         return value.to(dtype) if is_eligible else value
-    elif isinstance(value, string_classes):
+    elif isinstance(value, str):
         return value
     elif HAS_NUMPY and isinstance(value, np.ndarray):
         return value
@@ -56,7 +55,7 @@ def _cast(value, dtype):
         return {_cast(k, dtype): _cast(v, dtype) for k, v in value.items()}
     elif isinstance(value, collections.abc.Iterable):
         iterable = map(lambda v: _cast(v, dtype), value)
-        if isinstance(value, list) or isinstance(value, tuple):
+        if isinstance(value, (list, tuple)):
             return type(value)(iterable)
         else:
             return iterable
diff --git a/torch/cuda/amp/grad_scaler.py b/torch/cuda/amp/grad_scaler.py
index b26438327ca4..1e826f676d2a 100644
--- a/torch/cuda/amp/grad_scaler.py
+++ b/torch/cuda/amp/grad_scaler.py
@@ -1,14 +1,16 @@
-import torch
 from collections import defaultdict, abc
-import warnings
 from enum import Enum
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, cast
+import inspect
+import warnings
+
+import torch
 from .common import amp_definitely_not_available
 
 
 __all__ = ["OptState", "GradScaler"]
 
-class _MultiDeviceReplicator(object):
+class _MultiDeviceReplicator:
     """
     Lazily serves copies of a tensor to requested devices.  Copies are cached per-device.
     """
@@ -40,7 +42,7 @@ def _refresh_per_optimizer_state():
     return {"stage": OptState.READY, "found_inf_per_device": {}}
 
 
-class GradScaler(object):
+class GradScaler:
     _scale: Optional[torch.Tensor]
     _grows_tracker: Optional[torch.Tensor]
     _per_optimizer_states: Dict[int, Dict[str, Any]]
@@ -181,7 +183,7 @@ def apply_scale(val):
                 return val * stash[0].get(val.device)
             elif isinstance(val, abc.Iterable):
                 iterable = map(apply_scale, val)
-                if isinstance(val, list) or isinstance(val, tuple):
+                if isinstance(val, (list, tuple)):
                     return type(val)(iterable)
                 else:
                     return iterable
@@ -329,8 +331,35 @@ def step(self, optimizer, *args, **kwargs):
             # The contract with custom optimizers is that their step() should accept an additional,
             # optional grad_scaler kwarg.  We append self to the kwargs so the custom optimizer has full information:
             # it can query its own state, invoke unscale_ on itself, etc
-            retval = optimizer.step(*args, **dict(kwargs, grad_scaler=self))
+            # The contract above is being deprecated to avoid introducing `grad_scaler: GradScaler` argument
+            # to `Optimizer.step`. The new behavior is going to add two Tensor attributes of `grad_scale`
+            # and `found_inf` to the passed optimizer so that the optimizer can utilize those
+            # to skip the parameter updates or unscale gradients before updating parameters in
+            # the fused kernel, e.g. `FusedAdamMathFunctor`.
+            kwargs_ = kwargs
+            has_grad_scaler_kwarg = "grad_scaler" in inspect.signature(optimizer.step).parameters
+            if has_grad_scaler_kwarg:
+                warnings.warn(
+                    "GradScaler is going to stop passing itself as a keyword argument to the passed "
+                    "optimizer. In the near future GradScaler registers `grad_scale: Tensor` and "
+                    "`found_inf: Tensor` to the passed optimizer and let the optimizer use them directly.",
+                    FutureWarning)
+                kwargs_.update({"grad_scaler": self})
+            else:
+                scaler = self._get_scale_async()
+                found_inf = cast(
+                    torch.Tensor,
+                    sum([
+                        t.to(scaler.device, non_blocking=True) for t in self._check_inf_per_device(optimizer).values()
+                    ])
+                )
+                optimizer.grad_scale = None if optimizer_state["stage"] == OptState.UNSCALED else scaler
+                optimizer.found_inf = found_inf
+            retval = optimizer.step(*args, **kwargs_)
             optimizer_state["stage"] = OptState.STEPPED
+            if not has_grad_scaler_kwarg:
+                del optimizer.grad_scale
+                del optimizer.found_inf
             return retval
 
         if optimizer_state["stage"] is OptState.READY:
diff --git a/torch/cuda/graphs.py b/torch/cuda/graphs.py
index 62ead4b7083a..1ce4b4754b8c 100644
--- a/torch/cuda/graphs.py
+++ b/torch/cuda/graphs.py
@@ -47,9 +47,6 @@ class CUDAGraph(torch._C._CUDAGraph):
     def __new__(cls):
         return super(CUDAGraph, cls).__new__(cls)
 
-    def __init__(self):
-        super(CUDAGraph, self).__init__()
-
     def capture_begin(self, pool=None):
         r"""
         Begins capturing CUDA work on the current stream.
@@ -66,9 +63,9 @@ def capture_begin(self, pool=None):
         # I'm not sure if pybind11 converts a None arg to the default defined on the C++ side,
         # so I'm not taking any chances.
         if pool is None:
-            super(CUDAGraph, self).capture_begin()
+            super().capture_begin()
         else:
-            super(CUDAGraph, self).capture_begin(pool)
+            super().capture_begin(pool)
 
     def capture_end(self):
         r"""
@@ -79,19 +76,19 @@ def capture_end(self):
         Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
         which call ``capture_end`` internally.
         """
-        super(CUDAGraph, self).capture_end()
+        super().capture_end()
 
     def replay(self):
         r"""
         Replays the CUDA work captured by this graph.
         """
-        super(CUDAGraph, self).replay()
+        super().replay()
 
     def reset(self):
         r"""
         Deletes the graph currently held by this instance.
         """
-        super(CUDAGraph, self).reset()
+        super().reset()
 
     def pool(self):
         r"""
@@ -99,13 +96,13 @@ def pool(self):
         This id can optionally be passed to another graph's ``capture_begin``,
         which hints the other graph may share the same memory pool.
         """
-        return super(CUDAGraph, self).pool()
+        return super().pool()
 
     def enable_debug_mode(self):
         r"""
         Enables debugging mode for CUDAGraph.debug_dump.
         """
-        return super(CUDAGraph, self).enable_debug_mode()
+        return super().enable_debug_mode()
 
     def debug_dump(self, debug_path):
         r"""
@@ -115,10 +112,10 @@ def debug_dump(self, debug_path):
         Calls a debugging function to dump the graph if the debugging is
         enabled via CUDAGraph.enable_debug_mode()
         """
-        return super(CUDAGraph, self).debug_dump(debug_path)
+        return super().debug_dump(debug_path)
 
 
-class graph(object):
+class graph:
     r"""
     Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph`
     object for later replay.
diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py
index 9a3c13991c98..6e63ab2bf4d8 100644
--- a/torch/cuda/memory.py
+++ b/torch/cuda/memory.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict, Union, Tuple
 
 import torch
-from . import is_initialized, _get_device_index, _lazy_init
+from . import is_initialized, _get_device_index, _lazy_init, _get_nvml_device_index
 from ._utils import _dummy_type
 
 from ._memory_viz import segments as _segments, memory as _memory
@@ -194,6 +194,15 @@ def memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]:
     - ``"oversize_segments.{current,peak,allocated,freed}"``:
       number of over-size reserved segments from ``cudaMalloc()``.
 
+    The caching allocator can be configured via ENV to round memory allocations in order
+    to reduce fragmentation. Sometimes the overhead from rounding can be higher than
+    the fragmentation it helps reduce. The following stat can be used to check if
+    rounding adds too much overhed:
+
+    - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      memory requested by client code, compare this with allocated_bytes to check if
+      allocation rounding adds too much overhead.
+
     Args:
         device (torch.device or int, optional): selected device. Returns
             statistics for the current device, given by :func:`~torch.cuda.current_device`,
@@ -477,6 +486,7 @@ def _format_count(cnt, pref_cnt):
     metrics_to_display = [
         ("allocated_bytes", "Allocated memory", _format_size),
         ("active_bytes", "Active memory", _format_size),
+        ("requested_bytes", "Requested memory", _format_size),
         ("reserved_bytes", "GPU reserved memory", _format_size),
         ("inactive_split_bytes", "Non-releasable memory", _format_size),
         ("allocation", "Allocations", _format_count),
@@ -577,7 +587,7 @@ def list_gpu_processes(device: Union[Device, int] = None) -> str:
         pynvml.nvmlInit()
     except NVMLError_DriverNotLoaded:
         return ("cuda driver can't be loaded, is cuda enabled?")
-    device = _get_device_index(device, optional=True)
+    device = _get_nvml_device_index(device)
     handle = pynvml.nvmlDeviceGetHandleByIndex(device)
     procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
     lines = []
diff --git a/torch/cuda/nvtx.py b/torch/cuda/nvtx.py
index 7e2e8715a605..16fa078dff80 100644
--- a/torch/cuda/nvtx.py
+++ b/torch/cuda/nvtx.py
@@ -3,7 +3,7 @@
 try:
     from torch._C import _nvtx
 except ImportError:
-    class _NVTXStub(object):
+    class _NVTXStub:
         @staticmethod
         def _fail(*args, **kwargs):
             raise RuntimeError("NVTX functions not installed. Are you sure you have a CUDA build?")
diff --git a/torch/cuda/profiler.py b/torch/cuda/profiler.py
index eb7c813b122a..6ea7c65d34cc 100644
--- a/torch/cuda/profiler.py
+++ b/torch/cuda/profiler.py
@@ -1,4 +1,5 @@
 import tempfile
+import torch
 import contextlib
 from . import cudart, check_error
 
@@ -19,6 +20,10 @@ def init(output_file, flags=None, output_mode='key_value'):
     rt = cudart()
     if not hasattr(rt, 'cudaOutputMode'):
         raise AssertionError("HIP does not support profiler initialization!")
+    if hasattr(torch.version, "cuda") and torch.version.cuda is not None and int(torch.version.cuda.split(".")[0]) >= 12:
+        # Check https://github.com/pytorch/pytorch/pull/91118
+        # cudaProfilerInitialize is no longer needed after CUDA 12
+        raise AssertionError("CUDA12+ does not need profiler initialization!")
     flags = DEFAULT_FLAGS if flags is None else flags
     if output_mode == 'key_value':
         output_mode_enum = rt.cudaOutputMode.KeyValuePair
diff --git a/torch/cuda/streams.py b/torch/cuda/streams.py
index 1b87f5b2dee5..0c125daf120e 100644
--- a/torch/cuda/streams.py
+++ b/torch/cuda/streams.py
@@ -87,7 +87,7 @@ def query(self):
 
         Returns:
             A boolean indicating if all kernels in this stream are completed."""
-        return super(Stream, self).query()
+        return super().query()
 
     def synchronize(self):
         r"""Wait for all the kernels in this stream to complete.
@@ -95,7 +95,7 @@ def synchronize(self):
         .. note:: This is a wrapper around ``cudaStreamSynchronize()``: see
            `CUDA Stream documentation`_ for more info.
         """
-        super(Stream, self).synchronize()
+        super().synchronize()
 
     @property
     def _as_parameter_(self):
@@ -103,7 +103,7 @@ def _as_parameter_(self):
 
     def __eq__(self, o):
         if isinstance(o, Stream):
-            return super(Stream, self).__eq__(o)
+            return super().__eq__(o)
         return False
 
     def __hash__(self):
@@ -177,7 +177,7 @@ def record(self, stream=None):
         stream's device must match the event's device."""
         if stream is None:
             stream = torch.cuda.current_stream()
-        super(Event, self).record(stream)
+        super().record(stream)
 
     def wait(self, stream=None):
         r"""Makes all future work submitted to the given stream wait for this
@@ -190,7 +190,7 @@ def wait(self, stream=None):
         """
         if stream is None:
             stream = torch.cuda.current_stream()
-        super(Event, self).wait(stream)
+        super().wait(stream)
 
     def query(self):
         r"""Checks if all work currently captured by event has completed.
@@ -199,13 +199,13 @@ def query(self):
             A boolean indicating if all work currently captured by event has
             completed.
         """
-        return super(Event, self).query()
+        return super().query()
 
     def elapsed_time(self, end_event):
         r"""Returns the time elapsed in milliseconds after the event was
         recorded and before the end_event was recorded.
         """
-        return super(Event, self).elapsed_time(end_event)
+        return super().elapsed_time(end_event)
 
     def synchronize(self):
         r"""Waits for the event to complete.
@@ -216,12 +216,12 @@ def synchronize(self):
          .. note:: This is a wrapper around ``cudaEventSynchronize()``: see
             `CUDA Event documentation`_ for more info.
         """
-        super(Event, self).synchronize()
+        super().synchronize()
 
     def ipc_handle(self):
         r"""Returns an IPC handle of this event. If not recorded yet, the event
         will use the current device. """
-        return super(Event, self).ipc_handle()
+        return super().ipc_handle()
 
     @property
     def _as_parameter_(self):
diff --git a/torch/custom_class.h b/torch/custom_class.h
index 214c6f5ed060..1277ca61a94c 100644
--- a/torch/custom_class.h
+++ b/torch/custom_class.h
@@ -197,8 +197,8 @@ class class_ : public ::torch::detail::class_base {
       GetterFunc getter_func,
       SetterFunc setter_func,
       std::string doc_string = "") {
-    torch::jit::Function* getter;
-    torch::jit::Function* setter;
+    torch::jit::Function* getter{};
+    torch::jit::Function* setter{};
 
     auto wrapped_getter =
         detail::wrap_func<CurClass, GetterFunc>(std::move(getter_func));
@@ -218,7 +218,7 @@ class class_ : public ::torch::detail::class_base {
       const std::string& name,
       GetterFunc getter_func,
       std::string doc_string = "") {
-    torch::jit::Function* getter;
+    torch::jit::Function* getter{};
 
     auto wrapped_getter =
         detail::wrap_func<CurClass, GetterFunc>(std::move(getter_func));
@@ -321,7 +321,7 @@ class class_ : public ::torch::detail::class_base {
         c10::guts::infer_function_traits_t<std::decay_t<SetStateFn>>;
     using SetStateArg = typename c10::guts::typelist::head_t<
         typename SetStateTraits::parameter_types>;
-    auto setstate_wrapper = [set_state = std::move(set_state)](
+    auto setstate_wrapper = [set_state = std::forward<SetStateFn>(set_state)](
                                 c10::tagged_capsule<CurClass> self,
                                 SetStateArg&& arg) {
       c10::intrusive_ptr<CurClass> classObj =
diff --git a/torch/custom_class_detail.h b/torch/custom_class_detail.h
index b501053831a2..736d5aacdaa3 100644
--- a/torch/custom_class_detail.h
+++ b/torch/custom_class_detail.h
@@ -175,7 +175,7 @@ struct BoxedProxy<void, Func> {
     constexpr size_t num_ivalue_args =
         c10::guts::infer_function_traits_t<Func>::number_of_parameters;
     torch::jit::drop(stack, num_ivalue_args);
-    stack.emplace_back(c10::IValue());
+    stack.emplace_back();
   }
 };
 
diff --git a/torch/distributed/_composable/_ddp.py b/torch/distributed/_composable/_ddp.py
index 6a17b974a15b..a2a4cb3f001d 100644
--- a/torch/distributed/_composable/_ddp.py
+++ b/torch/distributed/_composable/_ddp.py
@@ -81,7 +81,7 @@ def backward(ctx, *grad_outputs):
         # Enqueue delay allreduce for static graph training on the first
         # iteration.
         if state_dict["static_graph"] and state_dict["num_iterations"] == 1:
-            Variable._execution_engine.queue_callback(ctx.reducer._delay_all_reduce)
+            Variable._execution_engine.queue_callback(ctx.reducer._delay_all_reduce)  # type: ignore[call-arg,misc]
 
         return (None, None, *grad_outputs)
 
@@ -104,7 +104,7 @@ def __init__(
         static_graph=False,
     ):
 
-        super(DistributedDataParallel, self).__init__()
+        super().__init__()
         self.logger: Optional[dist.Logger] = None
         if not any((p.requires_grad for p in module.parameters())):
             self._log_and_throw(
@@ -337,7 +337,7 @@ def __getstate__(self):
     def __setstate__(self, state):
         # If serializable, then the process group should be the default one
         self.process_group = _get_default_group()
-        super(DistributedDataParallel, self).__setstate__(state)
+        super().__setstate__(state)
         self.__dict__.setdefault("require_forward_param_sync", True)
         self.__dict__.setdefault("require_backward_grad_sync", True)
         parameters, expect_sparse_gradient = self._build_params_for_reducer()
@@ -383,21 +383,19 @@ def _build_params_for_reducer(self):
         ]
 
         # Build list of parameters.
-        parameters = list(parameter for _, parameter in modules_and_parameters)
+        parameters = [parameter for _, parameter in modules_and_parameters]
 
         # Checks if a module will produce a sparse gradient.
         def produces_sparse_gradient(module):
-            if isinstance(module, torch.nn.Embedding) or isinstance(
-                module, torch.nn.EmbeddingBag
-            ):
+            if isinstance(module, (torch.nn.Embedding, torch.nn.EmbeddingBag)):
                 return module.sparse
             return False
 
         # Build list of booleans indicating whether or not to expect sparse
         # gradients for the corresponding parameters.
-        expect_sparse_gradient = list(
+        expect_sparse_gradient = [
             produces_sparse_gradient(module) for module, _ in modules_and_parameters
-        )
+        ]
 
         self._assign_modules_buffers()
 
@@ -472,8 +470,7 @@ def model_parameters(m):
                 if hasattr(m, "_former_parameters")
                 else m.parameters(recurse=False)
             )
-            for p in ps:
-                yield p
+            yield from ps
 
         for m in m.modules() if recurse else [m]:
             for p in model_parameters(m):
@@ -658,7 +655,7 @@ def gather(self, outputs, output_device):
         return gather(outputs, output_device, dim=self.dim)
 
     def train(self, mode=True):
-        super(DistributedDataParallel, self).train(mode)
+        super().train(mode)
         return self
 
     # When running in join mode, schedules an allreduce to notify joined ranks
diff --git a/torch/distributed/_composable/checkpoint_activation.py b/torch/distributed/_composable/checkpoint_activation.py
index 64669df4fc71..9e7609426c6a 100644
--- a/torch/distributed/_composable/checkpoint_activation.py
+++ b/torch/distributed/_composable/checkpoint_activation.py
@@ -212,6 +212,7 @@ def checkpoint(module: nn.Module, *, use_reentrant: bool = True) -> nn.Module:
         >>> model(torch.zeros(2, 10)).sum().backward()
 
     """
+    torch._C._log_api_usage_once("torch.distributed.checkpoint")
 
     def forward_pre_hook(module: nn.Module, inputs: Tuple[Any, ...]) -> None:
         if checkpoint.state(module).enable_hook:
diff --git a/torch/distributed/_composable/fully_shard.py b/torch/distributed/_composable/fully_shard.py
index 5065761e0f7b..f9c9a5d43e4d 100644
--- a/torch/distributed/_composable/fully_shard.py
+++ b/torch/distributed/_composable/fully_shard.py
@@ -49,6 +49,7 @@ def fully_shard(
     """
     Applies ``FullyShardedDataParallel` (FSDP) semantics to ``module``.
     """
+    torch._C._log_api_usage_once("torch.distributed.fully_shard")
     # Enforce the new auto wrap policy
     if policy is not None and not isinstance(policy, _FSDPPolicy):
         raise ValueError(f"Expects an `_FSDPPolicy` but got {policy}")
diff --git a/torch/distributed/_composable/replicate.py b/torch/distributed/_composable/replicate.py
index 30111da685d1..ec4e4e7e8819 100644
--- a/torch/distributed/_composable/replicate.py
+++ b/torch/distributed/_composable/replicate.py
@@ -22,6 +22,7 @@ def replicate(
         >>> module = nn.Linear(3, 3)
         >>> replicate(module)
     """
+    torch._C._log_api_usage_once("torch.distributed.replicate")
     _ReplicateState().mark_modules(module, **kwargs)
     return module
 
diff --git a/torch/distributed/_functional_collectives.py b/torch/distributed/_functional_collectives.py
new file mode 100644
index 000000000000..41a1409d6793
--- /dev/null
+++ b/torch/distributed/_functional_collectives.py
@@ -0,0 +1,260 @@
+from typing import Any, Tuple, Union, List, cast
+
+import weakref
+import warnings
+
+import sys
+import torch
+import torch.distributed as dist
+
+from torch._C import _disabled_torch_function_impl
+from torch.utils._pytree import tree_map
+
+import torch.distributed.distributed_c10d as c10d
+"""
+New traceable, functional collectives.
+RFC: https://github.com/pytorch/pytorch/issues/93173
+
+  compiler: trace these ops with plain-old-data schemas, then choose how to lower them.
+  eager: execute these 'functional' ops which in eager return AsyncCollectiveTensor subclasses,
+         automatically calling .wait() on underlying/hidden async 'work' obj only when fed to
+         a downstream op.
+
+Issues:
+* Where should these ops live? Couldn't `import torch` if putting these ops in existing torch.distributed files
+* Proper support for eager requires inplace ops. We should explore having it as an option for the API.
+"""
+
+"""
+Functional collectives are asynchronous only and we perform implicit stream synchronization
+on behalf of the user.
+
+We use AsyncCollectiveTensor to wrap the result tensor of a collective and it lets us witness
+first usage of the tensor and insert cross stream sync at the right place.
+
+The above are the easy bits, the hard one is how we match the Work object returned by
+c10d and the tensor AsyncCollectiveTensor wraps. We alloc the tensor inside the collective
+op implementation (see ``clone()`` call in ``_all_reduce``) and then it's handled by the
+dispatcher which might call other implementations that are allowed to change the returned
+tensor - even return a tensor with a different shape (see ``torch.vmap``).
+
+This means the caller of our ops receives a Tensor that is not guaranteed to be the same
+allocated by our implementations and that makes pairing The AsyncTensor to the original
+tensor a lot harder. This pairing is needed so we can lookup the Work object to use.
+
+Originally, we tried WeakKeyDictionary to map from Tensor to Work, but because Tensor's
+identity is not stable across dispatch, the op caller would end up with a different Tensor
+instance that would not match any in the dictionary.
+
+With Tensor identity out of the question, we decided use the tensor data pointer, which
+should be stable across all the Tensor changes done during dispatch.
+
+We have a dictionary of tensor::data_ptr -> Work that we insert right after we call into c10d.
+
+We use this dictionary when AsyncCollectiveTensor is used to invoke Work::wait()
+
+Finally, we setup a finalizer against the tensor wrapper to observe it getting collected so we
+can clean up stale entries in the dictionary.
+
+To eliminate the possiblity of races we have a global version counter that is used by the finalizer.
+
+As a wise man said once: Don't cross the streams (https://www.youtube.com/watch?v=wyKQe_i9yyo)
+
+"""
+data_ptr_to_work = dict()
+work_version = 0
+
+def _register_tensor_work(tensor, work):
+    global data_ptr_to_work
+    global work_version
+    data_ptr_to_work[tensor.data_ptr()] = (work_version, work)
+    work_version += 1
+
+def _clear_tensor(data_ptr, version):
+    global data_ptr_to_work
+    version_and_work = data_ptr_to_work.get(data_ptr)
+
+    if version_and_work is not None and version_and_work[0] == version:
+        del data_ptr_to_work[data_ptr]
+
+def _register_wrapper_tensor(tensor_wrapper, tensor):
+    global data_ptr_to_work
+    version, _ = data_ptr_to_work.get(tensor.data_ptr(), (None, None))
+    if version is None:
+        warnings.warn("Trying to register finalizers to AsyncCollectiveTensor but the inner tensor is already gone")
+    else:
+        weakref.finalize(tensor_wrapper, _clear_tensor, tensor.data_ptr(), version)
+
+def _wait_tensor(tensor: torch.Tensor) -> torch.Tensor:
+    global data_ptr_to_work
+    data_ptr = tensor.data_ptr()
+    version_and_work = data_ptr_to_work.get(data_ptr)
+    if version_and_work is not None:
+        version_and_work[1].wait()
+        _clear_tensor(data_ptr, version_and_work[0])
+    return tensor
+
+
+class AsyncCollectiveTensor(torch.Tensor):
+    r"""
+    A Tensor subclass that is only used in eager mode, to hold a 'work' object
+    and then wait on it before invoking a real op.
+
+    Usage, from inside functional collective:
+    def functional_collective(input):
+        input = input.clone()
+        mutated_input, work = c10d.{inplace_collective}(input)
+        return AsyncCollectiveTensor(mutated_input, work)
+    """
+    _tensor: torch.Tensor
+
+    __torch_function__ = _disabled_torch_function_impl
+
+    @staticmethod
+    def __new__(cls, tensor: torch.Tensor):
+        t = tensor
+        r = torch.Tensor._make_subclass(cls, t, require_grad=t.requires_grad)
+        r._tensor = tensor  # type: ignore[attr-defined]
+        return r
+
+    def __repr__(self):
+        return f"AsyncCollectiveTensor({self._tensor})"
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        def unwrap(e: Any):
+            if isinstance(e, AsyncCollectiveTensor):
+                return wait_tensor(e._tensor)
+            return e
+
+        unwrapped_args = tree_map(unwrap, args)
+        unwrapped_kwargs = tree_map(unwrap, kwargs)
+
+        out = func(*unwrapped_args, **unwrapped_kwargs)
+        return out
+
+def _str_to_reduce_op(reduceOp: str) -> dist.ReduceOp:
+    reduceOp = reduceOp.upper()
+    op = dist.ReduceOp.RedOpType.__members__.get(reduceOp)
+    if op is None:
+        raise ValueError(f"Invalid reduce operation {reduceOp}")
+    return cast(dist.ReduceOp, op)
+
+# TODO assert if ranks has duplicated entries
+def _all_reduce(self, reduceOp, tag, ranks, group_size):
+    op = _str_to_reduce_op(reduceOp)
+    group = c10d._find_or_create_pg_by_ranks_and_tag(tag, ranks, group_size)
+    assert group is not None
+
+    inplace_tensor = self.clone(memory_format=torch.contiguous_format)
+    work = dist.all_reduce(inplace_tensor, op=op, group=group, async_op=True)
+    _register_tensor_work(inplace_tensor, work)
+
+    return inplace_tensor
+
+def _all_gather_into_tensor(shard, tag, ranks, group_size):
+    # TODO add dim support?
+    group = c10d._find_or_create_pg_by_ranks_and_tag(tag, ranks, group_size)
+    assert group is not None
+    out_size = list(shard.size())
+    out_size[0] *= group_size
+    out_tensor = shard.new_empty(out_size)
+    assert out_tensor.is_contiguous()
+    work = dist.all_gather_into_tensor(out_tensor, shard, group=group, async_op=True)
+    _register_tensor_work(out_tensor, work)
+
+    return out_tensor
+
+RANK_TYPES = Union[List[int], List[List[int]], dist.ProcessGroup, "dist._tensor.DeviceMesh", Tuple["dist._tensor.DeviceMesh", int]]
+
+def _expand_group(group: RANK_TYPES, tag: str = "") -> Tuple[str, List[int], int]:
+    # Cannot import on the top level to avoid circular imports
+    import torch.distributed._tensor as dt
+    rankset: List[int]
+    if isinstance(group, list):
+        if isinstance(group[0], list):
+            nested_list = cast(List[List[int]], group)
+            rankset = []
+            group_size = -1
+            for rs in nested_list:
+                rankset.extend(rs)
+                if group_size != -1 and group_size != len(rs):
+                    raise ValueError(f"group sizes must be identical found {group_size} and {len(rs)}")
+                group_size = len(rs)
+        else:
+            rankset = cast(List[int], group)
+            group_size = len(rankset)
+    elif isinstance(group, dist.ProcessGroup):
+        rankset = dist.get_process_group_ranks(group)
+        group_size = len(rankset)
+        tag = tag or c10d._get_group_tag(group)
+    elif isinstance(group, dt.DeviceMesh):
+        rankset = group.mesh.flatten().tolist()
+        group_size = group.mesh.size(0)
+        rankset = group.mesh.swapdims(-1, 0).reshape(-1, group_size).flatten().tolist()
+        tag = tag or c10d._get_group_tag(group.get_dim_groups()[0])
+    elif isinstance(group, tuple):
+        if len(group) == 2 and isinstance(group[0], dt.DeviceMesh) and isinstance(group[1], int):
+            dmesh = group[0]
+            dim = group[1]
+            group_size = dmesh.mesh.size(dim)
+            rankset = dmesh.mesh.swapdims(-1, dim).reshape(-1, group_size).flatten().tolist()
+            tag = tag or c10d._get_group_tag(dmesh.get_dim_groups()[dim])
+        else:
+            raise ValueError("Invalid tuple for group must be (DeviceMesh, int)")
+    else:
+        raise ValueError("Invalid type for group, must be one of List, Processgroup, DeviceMesh or (DeviceMesh, int).")
+
+    return (tag, rankset, group_size)
+
+
+def wait_tensor(tensor):
+    """
+    Wait on a tensor returned by the collectives ops.
+
+    Waiting follows device semantics, which means blocking on CPU and synchronizing streams on CUDA.
+    """
+    return torch._C._nn.wait_tensor(tensor)  # type: ignore[attr-defined]
+
+
+def all_reduce(self: torch.Tensor, reduceOp: str, group: RANK_TYPES, tag: str = ""):
+    """
+    Reduces the tensor data across all machines in such a way that all get
+    the final result.
+
+    The input tensor is left unmodified.
+
+    Group can be one of:
+        List[int]: ranks participating in the collective.
+        List[List[int]]: 2D mesh of ranks taking part of this collective in MPMD.
+        ProcessGroup: Will perform a collective using the ranks and tag of the PG.
+        DeviceMesh: Do a SPMD collective over all ranks of the mesh
+        (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh
+
+    :: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
+    that information and perform collective algebraic optimization. Use other forms of input for that.
+    """
+    tag, rankset, group_size = _expand_group(group, tag)
+    tensor = torch._C._nn.all_reduce(self, reduceOp, tag, rankset, group_size)  # type: ignore[attr-defined]
+    res = AsyncCollectiveTensor(tensor)
+    _register_wrapper_tensor(res, tensor)
+    return res
+
+
+c10_lib_cpu = torch.library.Library("aten", "IMPL", "CPU")
+c10_lib_cuda = torch.library.Library("aten", "IMPL", "CUDA")
+
+def _register_ops():
+    c10_lib_cpu.impl("all_reduce", _all_reduce)
+    c10_lib_cuda.impl("all_reduce", _all_reduce)
+
+    c10_lib_cpu.impl("wait_tensor", _wait_tensor)
+    c10_lib_cuda.impl("wait_tensor", _wait_tensor)
+
+    c10_lib_cpu.impl("all_gather_into_tensor", _all_gather_into_tensor)
+    c10_lib_cuda.impl("all_gather_into_tensor", _all_gather_into_tensor)
+
+if sys.executable != 'torch_deploy':
+    _register_ops()
+else:
+    warnings.warn("PyTorch Distributed functional collectives do not work with torch::deploy.")
diff --git a/torch/distributed/_shard/__init__.py b/torch/distributed/_shard/__init__.py
index 2dfad636b07f..34539d633f8f 100644
--- a/torch/distributed/_shard/__init__.py
+++ b/torch/distributed/_shard/__init__.py
@@ -1,5 +1,4 @@
 from .api import (
-    _replicate_tensor,
     _shard_tensor,
     load_with_process_group,
     shard_module,
diff --git a/torch/distributed/_shard/_utils.py b/torch/distributed/_shard/_utils.py
index 7e347fefa27c..26305b99cce3 100644
--- a/torch/distributed/_shard/_utils.py
+++ b/torch/distributed/_shard/_utils.py
@@ -2,6 +2,8 @@
 from torch.distributed._shard.metadata import ShardMetadata
 from typing import Sequence
 
+DEPRECATE_MSG = "Please use DTensor instead and we are deprecating ShardedTensor."
+
 def narrow_tensor_by_index(tensor: torch.Tensor, offsets: Sequence[int], sizes: Sequence[int]) -> torch.Tensor:
     """
     Narrow the tensor according to ``offsets`` and ``sizes``.
diff --git a/torch/distributed/_shard/api.py b/torch/distributed/_shard/api.py
index 05b4ac3cbe40..cd318103550f 100644
--- a/torch/distributed/_shard/api.py
+++ b/torch/distributed/_shard/api.py
@@ -7,7 +7,6 @@
     ShardedTensor,
     _PartialTensor
 )
-from .replicated_tensor import ReplicatedTensor
 from .sharding_spec import (
     ShardingSpec,
     ChunkShardingSpec
@@ -121,22 +120,6 @@ def shard_parameter(
     # Replace param with ShardedTensor.
     module.register_parameter(param_name, nn.Parameter(st))
 
-def _replicate_tensor(tensor: torch.Tensor, process_group=None) -> ReplicatedTensor:
-    """
-    Given a :class:`torch.Tensor`, mark it as a ReplicatedTensor where all
-    ranks have the same value.
-
-    Args:
-        tensor (:class:`torch.Tensor`): the tensor to be marked as replicated.
-    Keyword args:
-        process_group (ProcessGroup, optional): The process group to replicate on.
-            If None, the default process group will be used.
-    Returns:
-        A :class:`ReplicatedTensor` from the given tensor.
-
-    """
-    return ReplicatedTensor(tensor, process_group=process_group)
-
 # Tracks the current process group in the load context manager.
 _CURRENT_PROCESS_GROUP = None
 
@@ -183,7 +166,7 @@ def _reshard_output(
         A :class:`torch.nn.Module` object with reshard API hooked.
     """
     def hook_func(_module, _input, output):
-        if isinstance(output, ShardedTensor) or isinstance(output, _PartialTensor):
+        if isinstance(output, (ShardedTensor, _PartialTensor)):
             return output.reshard(resharding_spec)
         return output
     module.register_forward_hook(hook_func)
diff --git a/torch/distributed/_shard/common_op_utils.py b/torch/distributed/_shard/common_op_utils.py
index 42d65923a536..7ef88965eecb 100644
--- a/torch/distributed/_shard/common_op_utils.py
+++ b/torch/distributed/_shard/common_op_utils.py
@@ -7,7 +7,6 @@ def _basic_validation(op, args=(), kwargs=None):
     Common validation across all ops go in here.
     """
     from torch.distributed._shard.partial_tensor import _PartialTensor
-    from torch.distributed._shard.replicated_tensor import ReplicatedTensor
     from torch.distributed._shard.sharded_tensor import ShardedTensor
 
     if len(args) == 0 and (kwargs is None or len(kwargs) == 0):
@@ -18,7 +17,7 @@ def _basic_validation(op, args=(), kwargs=None):
 
     def is_distributed_tensor(e):
         nonlocal has_distributed_tensor
-        if isinstance(e, ReplicatedTensor) or isinstance(e, _PartialTensor) or isinstance(e, ShardedTensor):
+        if isinstance(e, (_PartialTensor, ShardedTensor)):
             has_distributed_tensor = True
 
     tree_map(is_distributed_tensor, args)
@@ -35,7 +34,7 @@ def is_distributed_tensor(e):
 
     def validate_pg(e):
         nonlocal cur_pg
-        if isinstance(e, ReplicatedTensor) or isinstance(e, _PartialTensor) or isinstance(e, ShardedTensor):
+        if isinstance(e, (_PartialTensor, ShardedTensor)):
             if cur_pg is not None and e._process_group is not cur_pg:
                 raise RuntimeError(
                     'All distributed tensors should use the '
diff --git a/torch/distributed/_shard/metadata.py b/torch/distributed/_shard/metadata.py
index bc6ae8bb53cd..b7bae9e6664a 100644
--- a/torch/distributed/_shard/metadata.py
+++ b/torch/distributed/_shard/metadata.py
@@ -5,7 +5,7 @@
 from torch.distributed.remote_device import _remote_device
 
 @dataclass
-class ShardMetadata(object):
+class ShardMetadata:
     """
     Represents a shard of the overall Tensor including its
     offsets, lengths and device placement.
diff --git a/torch/distributed/_shard/op_registry_utils.py b/torch/distributed/_shard/op_registry_utils.py
index fbb98dbffe6b..4febe841186a 100644
--- a/torch/distributed/_shard/op_registry_utils.py
+++ b/torch/distributed/_shard/op_registry_utils.py
@@ -3,7 +3,7 @@
 from .common_op_utils import _basic_validation
 
 """
-Common utilities to register ops on ShardedTensor, ReplicatedTensor
+Common utilities to register ops on ShardedTensor
 and PartialTensor.
 """
 
diff --git a/torch/distributed/_shard/partial_tensor.py b/torch/distributed/_shard/partial_tensor.py
index 6a48163082c5..9c1aefbf2d3f 100644
--- a/torch/distributed/_shard/partial_tensor.py
+++ b/torch/distributed/_shard/partial_tensor.py
@@ -1,9 +1,13 @@
 import functools
+import warnings
 from typing import Callable, Dict, TYPE_CHECKING
 
 import torch
 import torch.distributed as dist
 import torch.distributed._shard.sharding_spec as shard_spec
+from torch.distributed._shard._utils import (
+    DEPRECATE_MSG,
+)
 from torch.distributed import distributed_c10d
 from torch.distributed.nn.functional import (
     reduce_scatter,
@@ -117,6 +121,7 @@ class _PartialTensor(torch.Tensor):
     __slots__ = ["_process_group", "_local_shard", "_reduce_op"]
 
     def __new__(cls, local_shard, process_group=None, reduce_op=distributed_c10d.ReduceOp.SUM):
+        warnings.warn(DEPRECATE_MSG)
         r = torch.Tensor._make_wrapper_subclass(  # type: ignore[attr-defined]
             cls,
             local_shard.size(),
@@ -158,6 +163,7 @@ def reshard(self, resharding_spec: shard_spec.ShardingSpec) -> "ShardedTensor":
         """
         from torch.distributed._shard.sharded_tensor.api import ShardedTensor
 
+        warnings.warn(DEPRECATE_MSG)
         if not isinstance(resharding_spec, shard_spec.ChunkShardingSpec):
             raise NotImplementedError("Only ChunkShardingSpec supported for reshard.")
         if self._local_shard.is_complex():
@@ -219,6 +225,7 @@ def reshard(self, resharding_spec: shard_spec.ShardingSpec) -> "ShardedTensor":
 
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
+        warnings.warn(DEPRECATE_MSG)
         # Find process_group
         process_group = None
 
@@ -252,7 +259,7 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
         )
 
     def __repr__(self):
-        return f"PartialTensor({super(_PartialTensor, self).__repr__()})"
+        return f"PartialTensor({super().__repr__()})"
 
 def _transpose_impl(types, args=(), kwargs=None, process_group=None):
     partial_tensor = args[0]
diff --git a/torch/distributed/_shard/replicated_tensor.py b/torch/distributed/_shard/replicated_tensor.py
index e3db6b0fac66..6a4217940d82 100644
--- a/torch/distributed/_shard/replicated_tensor.py
+++ b/torch/distributed/_shard/replicated_tensor.py
@@ -1,7 +1,11 @@
+import warnings
 import torch
 import torch.distributed as dist
 
 from torch.distributed._shard.sharded_tensor.api import ShardedTensor
+from torch.distributed._shard._utils import (
+    DEPRECATE_MSG,
+)
 from torch.distributed import distributed_c10d
 from torch.overrides import get_default_nowrap_functions
 
@@ -13,6 +17,8 @@
     torch.Tensor.__getitem__,
 ]
 
+warnings.warn(DEPRECATE_MSG)
+
 class ReplicatedTensor(torch.Tensor):
     """
     ReplicatedTensor represents a tensor which is replicated across the `world_size` and
@@ -57,7 +63,7 @@ def __deepcopy__(self, memo):
             return result
 
     def __repr__(self):
-        return f"ReplicatedTensor({super(ReplicatedTensor, self).__repr__()})"
+        return f"ReplicatedTensor({super().__repr__()})"
 
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
diff --git a/torch/distributed/_shard/sharded_optim/api.py b/torch/distributed/_shard/sharded_optim/api.py
index ec4f9e6ae749..54d8a94ad3fe 100644
--- a/torch/distributed/_shard/sharded_optim/api.py
+++ b/torch/distributed/_shard/sharded_optim/api.py
@@ -40,8 +40,8 @@ def __init__(
         self.param_groups = self._optim.param_groups
         self.state = self._optim.state
 
-    def zero_grad(self, set_to_none: bool = False):  # type: ignore[override]
-        r"""Sets the gradients of all optimized :class:`torch.Tensor` s to zero.
+    def zero_grad(self, set_to_none: bool = True):  # type: ignore[override]
+        r"""Resets the gradients of all optimized :class:`torch.Tensor` s.
 
         Args:
             set_to_none (bool): instead of setting to zero, set the grads to None.
diff --git a/torch/distributed/_shard/sharded_tensor/_ops/math_ops.py b/torch/distributed/_shard/sharded_tensor/_ops/math_ops.py
index fe41cc79a858..2b0ad3d5dca4 100644
--- a/torch/distributed/_shard/sharded_tensor/_ops/math_ops.py
+++ b/torch/distributed/_shard/sharded_tensor/_ops/math_ops.py
@@ -1,57 +1,20 @@
 import torch
 from torch import Tensor
 from torch.distributed._shard.sharded_tensor import ShardedTensor, _sharded_op_impl
-from torch.distributed._shard.replicated_tensor import ReplicatedTensor
-from torch.distributed._shard._utils import narrow_tensor
 
 
 def binary_math_op_impl(op, types, args=(), kwargs=None, pg=None):
     """
     Handles ``__torch_function__`` dispatch for the binary math ops
     such as `torch.add`, `torch.mul`, `torch.div`, etc.
-    This method computes on ShardedTensor, or ShardedTensor op ReplicatedTensor
+    This method computes on ShardedTensor, or ShardedTensor op
     """
     if len(args) != 2:
         raise ValueError("Only support binary math op on ShardedTensor for now!")
     lhs = args[0]
     rhs = args[1]
     # Validate types
-    if isinstance(lhs, ReplicatedTensor):
-        assert isinstance(rhs, ShardedTensor)
-        st_size = rhs.size()
-        st_meta = rhs.local_shards()[0].metadata
-        if st_size != lhs.size():
-            # try to broadcast replicated tensor
-            lhs = lhs.expand(st_size)
-
-        replica_part = narrow_tensor(lhs, st_meta)
-        res = op(replica_part, rhs.local_tensor())
-
-        return ShardedTensor._init_from_local_tensor(
-            res,
-            rhs.sharding_spec(),
-            rhs.size(),  # type: ignore[arg-type]
-            process_group=pg,
-        )
-
-    elif isinstance(rhs, ReplicatedTensor):
-        assert isinstance(lhs, ShardedTensor)
-        st_size = lhs.size()
-        st_meta = lhs.local_shards()[0].metadata
-        if st_size != rhs.size():
-            # try to broadcast replicated tensor
-            rhs = rhs.expand(st_size)
-
-        replica_part = narrow_tensor(rhs, st_meta)
-        res = op(lhs.local_tensor(), replica_part)
-        return ShardedTensor._init_from_local_tensor(
-            res,
-            lhs.sharding_spec(),
-            lhs.size(),  # type: ignore[arg-type]
-            process_group=pg,
-        )
-
-    elif isinstance(lhs, (int, float)):
+    if isinstance(lhs, (int, float)):
         assert isinstance(rhs, ShardedTensor)
         res = op(lhs, rhs.local_tensor())
         return ShardedTensor._init_from_local_tensor(
diff --git a/torch/distributed/_shard/sharded_tensor/api.py b/torch/distributed/_shard/sharded_tensor/api.py
index 3b939fdcd374..af587f800f70 100644
--- a/torch/distributed/_shard/sharded_tensor/api.py
+++ b/torch/distributed/_shard/sharded_tensor/api.py
@@ -10,6 +10,7 @@
     cast,
 )
 import copy
+import warnings
 from functools import reduce
 import weakref
 
@@ -28,6 +29,9 @@
     check_tensor,
     validate_non_overlapping_shards_metadata,
 )
+from torch.distributed._shard._utils import (
+    DEPRECATE_MSG,
+)
 
 from .metadata import TensorProperties, ShardedTensorMetadata
 from .shard import Shard
@@ -840,6 +844,8 @@ def _init_from_local_tensor(
                  We fully rely on the user to ensure local tensor is sharded based on the
                  sharding spec.
         """
+        warnings.warn(DEPRECATE_MSG)
+
         if not local_tensor.is_contiguous():
             raise ValueError('local_tensor is not a contiguous Tensor.')
 
@@ -1006,6 +1012,8 @@ def reshard(self, resharding_spec: shard_spec.ShardingSpec) -> ShardedTensor:
             tensor([[3], [3], [5], [5], [7], [7], [9], [9]]) # Rank 2
             tensor([[4], [4], [6], [6], [8], [8], [10], [10]]) # Rank 3
         """
+        warnings.warn(DEPRECATE_MSG)
+
         if (
             not isinstance(resharding_spec, shard_spec.ChunkShardingSpec) or
             not isinstance(self._sharding_spec, shard_spec.ChunkShardingSpec)
@@ -1074,6 +1082,7 @@ def dispatch(st: ShardedTensor, func: Callable):
                 f"torch function '{func.__name__}', with args: {args} and "
                 f"kwargs: {kwargs} not supported for ShardedTensor!")
 
+        warnings.warn(DEPRECATE_MSG)
         # Find ShardedTensor instance to get process_group and sharding_spec.
         st_instance = None
 
diff --git a/torch/distributed/_shard/sharded_tensor/metadata.py b/torch/distributed/_shard/sharded_tensor/metadata.py
index 2fce1d28470d..cb112da5686b 100644
--- a/torch/distributed/_shard/sharded_tensor/metadata.py
+++ b/torch/distributed/_shard/sharded_tensor/metadata.py
@@ -11,7 +11,7 @@ class MEM_FORMAT_ENCODING(Enum):
     TORCH_PRESERVE_FORMAT = 2
 
 @dataclass
-class TensorProperties(object):
+class TensorProperties:
     """ Properties used to create :class:`Tensor` """
 
     # Regular tensor fields
@@ -68,7 +68,7 @@ def create_from_tensor(tensor: torch.Tensor) -> "TensorProperties":
             pin_memory=tensor.is_pinned()
         )
 @dataclass
-class ShardedTensorMetadata(object):
+class ShardedTensorMetadata:
     """
     Represents metadata for :class:`ShardedTensor`
     """
diff --git a/torch/distributed/_shard/sharded_tensor/shard.py b/torch/distributed/_shard/sharded_tensor/shard.py
index 66c688b3c90e..d448cc6321b1 100644
--- a/torch/distributed/_shard/sharded_tensor/shard.py
+++ b/torch/distributed/_shard/sharded_tensor/shard.py
@@ -7,7 +7,7 @@
 
 
 @dataclass
-class Shard(object):
+class Shard:
     """
     Container which holds the data for a shard as a Tensor and also
     the associated metadata for that shard.
diff --git a/torch/distributed/_shard/sharding_plan/api.py b/torch/distributed/_shard/sharding_plan/api.py
index 89bc6c717a73..40a967104acf 100644
--- a/torch/distributed/_shard/sharding_plan/api.py
+++ b/torch/distributed/_shard/sharding_plan/api.py
@@ -8,7 +8,7 @@
 from torch.distributed._shard.sharding_spec import ShardingSpec
 
 @dataclass
-class ShardingPlan(object):
+class ShardingPlan:
     """
     Representation of a sharding plan, describes how to shard a module
     across hosts. `plan` is used to shard module parameters according to the spec provided,
diff --git a/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding.py b/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding.py
index 2f65e097301f..4939d2c11e81 100644
--- a/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding.py
+++ b/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding.py
@@ -2,11 +2,10 @@
 
 import torch
 import torch.distributed as dist
-from torch.distributed._shard.replicated_tensor import ReplicatedTensor
 from torch.distributed._shard.sharded_tensor import ShardedTensor
 from torch.distributed._shard.sharding_spec import ChunkShardingSpec
 from torch.distributed._shard.sharding_spec.api import custom_sharding_spec_op
-from torch.distributed.nn.functional import all_gather, all_reduce, reduce_scatter
+from torch.distributed.nn.functional import all_gather, reduce_scatter
 
 from ._common import (
     _all_gather_base_input,
@@ -209,11 +208,8 @@ def _handle_col_wise_sharding(
 
     Returns: final result of lookup.
     """
-    if not isinstance(input, ReplicatedTensor):
-        # allgather the inputs first for non Replicated Tensor.
-        gathered_inputs = all_gather(input, group=pg)
-    else:
-        gathered_inputs = input
+    # allgather the inputs first for non Replicated Tensor.
+    gathered_inputs = all_gather(input, group=pg)
 
     if max_norm is not None:
         # max_norm changes the weight in-place
@@ -261,11 +257,8 @@ def _handle_row_wise_sharding(
 
     Returns: final result of lookup.
     """
-    if not isinstance(input, ReplicatedTensor):
-        # allgather the inputs first for non Replicated Tensor.
-        gather_inp = _all_gather_base_input(input, pg)
-    else:
-        gather_inp = input
+    # allgather the inputs first for non Replicated Tensor.
+    gather_inp = _all_gather_base_input(input, pg)
 
     # Mask the input according to sharding spec.
     lookup_input, padding_idx, padding_row = _handle_row_wise_mask(
@@ -293,12 +286,9 @@ def _handle_row_wise_sharding(
     )
 
     # TODO: Make the result a PartialTensor.
-    if isinstance(input, ReplicatedTensor):
-        return all_reduce(local_input_embeddings, group=pg)
-    else:
-        local_shards = local_input_embeddings.chunk(pg.size())
-        return reduce_scatter(
-            torch.empty_like(local_shards[0]),
-            list(local_shards),
-            group=pg,
-        )
+    local_shards = local_input_embeddings.chunk(pg.size())
+    return reduce_scatter(
+        torch.empty_like(local_shards[0]),
+        list(local_shards),
+        group=pg,
+    )
diff --git a/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding_bag.py b/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding_bag.py
index 7716ad390ddf..5f4d4ee3381f 100644
--- a/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding_bag.py
+++ b/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding_bag.py
@@ -5,11 +5,10 @@
 import torch
 import torch.distributed as dist
 from torch._C._distributed_c10d import ReduceOp
-from torch.distributed._shard.replicated_tensor import ReplicatedTensor
 from torch.distributed._shard.sharded_tensor import ShardedTensor
 from torch.distributed._shard.sharding_spec import ChunkShardingSpec
 from torch.distributed._shard.sharding_spec.api import custom_sharding_spec_op
-from torch.distributed.nn.functional import all_gather, all_reduce, reduce_scatter
+from torch.distributed.nn.functional import all_gather, reduce_scatter
 
 from ._common import (
     _all_gather_base_input,
@@ -353,28 +352,25 @@ def _handle_row_wise_sharding(
     Returns:
         gathered_output: final result of lookup and aggregation.
     """
-    if not isinstance(input, ReplicatedTensor):
-        if input.dim() > 1 and per_sample_weights is None:
-            # allgather the inputs first for non Replicated Tensor.
-            gather_inp = _all_gather_base_input(input, pg)
-        else:
-            (
-                gathered_inputs,
-                gathered_per_sample_weights,
-                gathered_offsets,
-            ) = _all_gather_embedding_bag_input(input, per_sample_weights, offsets, pg)
-            cat_dim = 0 if input.dim() != 1 else -1
-            gather_inp = torch.cat(gathered_inputs, dim=cat_dim)
-            if per_sample_weights is not None:
-                per_sample_weights = torch.cat(gathered_per_sample_weights, dim=cat_dim)
-            offset_add = 0 if input.dim() > 1 else input.size(0)
-            if offsets is not None:
-                offsets_list = torch.cat(
-                    [gathered_offsets[i] + (offset_add * i) for i in range(pg.size())],
-                    dim=cat_dim,
-                )
+    if input.dim() > 1 and per_sample_weights is None:
+        # allgather the inputs first for non Replicated Tensor.
+        gather_inp = _all_gather_base_input(input, pg)
     else:
-        gather_inp = input
+        (
+            gathered_inputs,
+            gathered_per_sample_weights,
+            gathered_offsets,
+        ) = _all_gather_embedding_bag_input(input, per_sample_weights, offsets, pg)
+        cat_dim = 0 if input.dim() != 1 else -1
+        gather_inp = torch.cat(gathered_inputs, dim=cat_dim)
+        if per_sample_weights is not None:
+            per_sample_weights = torch.cat(gathered_per_sample_weights, dim=cat_dim)
+        offset_add = 0 if input.dim() > 1 else input.size(0)
+        if offsets is not None:
+            offsets_list = torch.cat(
+                [gathered_offsets[i] + (offset_add * i) for i in range(pg.size())],
+                dim=cat_dim,
+            )
 
     # Mask the input according to sharding spec.
     lookup_input, padding_local, padding_row = _handle_row_wise_mask(
@@ -410,16 +406,13 @@ def _handle_row_wise_sharding(
 
     op = ReduceOp.SUM if mode != "max" else ReduceOp.MAX
     # TODO: Make the result a PartialTensor and move the the logic below there.
-    if isinstance(input, ReplicatedTensor):
-        result = all_reduce(result, op=op, group=pg)
-    else:
-        local_shards = result.chunk(pg.size())
-        result = reduce_scatter(
-            torch.empty_like(local_shards[0]),
-            list(local_shards),
-            op=op,
-            group=pg,
-        )
+    local_shards = result.chunk(pg.size())
+    result = reduce_scatter(
+        torch.empty_like(local_shards[0]),
+        list(local_shards),
+        op=op,
+        group=pg,
+    )
 
     # For Mean, we cannot do the division until very end because the sum of means
     # not equal to the mean of sum. (Divisor is different)
diff --git a/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/linear.py b/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/linear.py
index b6125e69b16e..e38f1dc15e7c 100644
--- a/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/linear.py
+++ b/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/linear.py
@@ -281,7 +281,7 @@ def _handle_row_wise_sharding_tensor(
             indices[placement.rank()] = list(
                 range(offset_start_idx, offset_start_idx + split_size)
             )
-        indices_flatten = list(idx for indice in indices for idx in indice)
+        indices_flatten = [idx for indice in indices for idx in indice]
 
         input_t = input_t.index_select(
             0, torch.tensor(indices_flatten, device=input_t.device)
diff --git a/torch/distributed/_spmd/aot_function_patch.py b/torch/distributed/_spmd/aot_function_patch.py
new file mode 100644
index 000000000000..32bf871d9df6
--- /dev/null
+++ b/torch/distributed/_spmd/aot_function_patch.py
@@ -0,0 +1,181 @@
+from functools import wraps
+from typing import Callable, Dict, Optional, Tuple
+
+import torch.utils._pytree as pytree
+from torch._functorch.aot_autograd import (
+    AOT_COUNTER,
+    KNOWN_TYPES,
+    AOTConfig,
+    PytreeThunk,
+    create_aot_dispatcher_function,
+    default_partition,
+)
+
+
+def patched_aot_function(
+    fn: Callable[..., object],
+    fw_compiler: Callable[..., object],
+    bw_compiler: Optional[Callable[..., object]] = None,
+    partition_fn: Callable[..., object] = default_partition,
+    decompositions: Optional[Dict[object, object]] = None,
+    num_params_buffers: int = 0,
+    hasher_type: object = None,  # deprecated
+    static_argnums: Optional[Tuple[int]] = None,  # deprecated
+    keep_inference_input_mutations: bool = False,
+    pre_compile_fn: Optional[Callable[..., object]] = None,
+) -> Callable[..., object]:
+    """
+    NOTE: rationale for patch.
+        We want to do the following
+            trace single device graph  --> parallelize (SPMD) ---> run graph on a shard
+
+        But::
+           - "single device graph" expects fully-sized shapes (e.g. logical shapes)
+           - "parallelized graph" expects sharded shapes (e.g. physical local shapes)
+
+        This means that we need to pass in "logical tensors" as input to the capturing step,
+        but then we need to pass "physical local_shard tensors" as input to the parallelized
+        graph afterwards.
+
+        This patch allows to transform the inputs of the graph before compilation, so that
+        we can capture the graph with logical shapes, and then finally after compilation,
+        call into the compiled (and transformed) graph with the original sharded tensors.
+
+        Beyond that:
+
+            The compilation for the backwards pass doesn't follow the same pattern.
+            For the backwards pass, since the compilation happens at first usage, we won't
+            be able to intercept the compilation call from here. But that's fine, because
+            the graph was already captured before with logical-shapes.
+
+
+    Traces the forward and backward graph of :attr:`fn` using torch dispatch
+    mechanism, and then compiles the generated forward and backward graphs
+    through :attr:`fw_compiler` and :attr:`bw_compiler`.
+
+    :func:`aot_function` traces the forward and backward graph ahead of time,
+    and generates a joint forward and backward graph.  :attr:`partition_fn` is
+    then used to separate out forward and backward graphs. The partitioner
+    function can be used to perform optimizations such as recomputation. One can
+    set `decompositions` dictionary to decompose the operators into a sequence
+    of core or simpler operators supported by the backend compilers.
+
+    :func:`aot_function` uses a compilation cache, based on input tensor
+    properties, to detect when there is a need of recompilation.
+
+    .. warning::
+        This API is experimental and likely to change.
+
+    Args:
+        fn (Callable): A Python function that takes one ore more arguments. Must
+            return one or more Tensors.
+        fw_compiler (Callable): A Python function that accepts an Fx graph with
+            Aten ops and input args, and returns a Callable that semantically is
+            equivalent to the input Fx graph.
+        bw_compiler (Optional[Callable]): A Python function that accepts an
+            Fx graph with Aten ops and input args, and returns a Callable that
+            semantically is equivalent to the input Fx graph.  Default: None
+            (when None, it defaults to the :attr:`fw_compiler`)
+        partition_fn (Callable): A Python function that takes a joint forward
+            and backward graph, and partitions it into separate forward and
+            backward graphs.
+        decompositions (Dict): A dictionary to define the decomposition of
+            larger Aten ops into simpler or core Aten ops.
+
+    Returns:
+        Returns a ``Callable`` that retains the eager behavior of the original
+        :attr:`fn`, but with forward and backward graph compiled via
+        :attr:`fw_compile` and :attr:`bw_compile`.
+
+    A simple example usage of :func:`aot_function` is as follows. This example
+    will print the forward and backward graphs of the function ``fn``
+
+        >>> fn = lambda x : x.sin().cos()
+        >>> def print_compile_fn(fx_module, args):
+        >>>     print(fx_module)
+        >>>     return fx_module
+        >>> aot_fn = patched_aot_function(fn, print_compile_fn)
+        >>> x = torch.randn(4, 5, requires_grad=True)
+        >>> aot_fn(x)
+    """
+    if static_argnums is not None:
+        raise RuntimeError(
+            "static_argnums has been deprecated - manually wrap your function or use torchdynamo."
+        )
+
+    if bw_compiler is None:
+        bw_compiler = fw_compiler
+
+    aot_config = AOTConfig(
+        fw_compiler=fw_compiler,
+        bw_compiler=bw_compiler,
+        partition_fn=partition_fn,
+        # pyre-fixme
+        decompositions=decompositions,  # type:ignore[arg-type]
+        num_params_buffers=num_params_buffers,
+        aot_id=next(AOT_COUNTER),
+        keep_inference_input_mutations=keep_inference_input_mutations,
+    )
+    cached_res = None
+
+    @wraps(fn)
+    # pyre-fixme
+    def returned_function(*args, **kwargs):
+        nonlocal cached_res
+        # Now flatten the tensor args
+        flat_args, _ = pytree.tree_flatten((args, kwargs))
+
+        # Compile the function and save it in the cache
+        if cached_res is None:
+            # Save the args_spec for flat_tensor_args to unflatten while tracing
+            _, tensor_args_spec = pytree.tree_flatten((args, kwargs))
+            out_spec = PytreeThunk()
+
+            # pyre-fixme
+            def flat_fn(*flat_args):
+                # The input are flattened tensor args. Prepare the args in the
+                # order that original function expects. Add static args as well.
+                # They will appear as tensor constants in the traced graph.
+                nonlocal out_spec
+                args, kwargs = pytree.tree_unflatten(
+                    list(flat_args),
+                    tensor_args_spec,
+                )
+                tree_out = fn(*args, **kwargs)
+                flat_out, spec = pytree.tree_flatten(tree_out)
+                for i in flat_out:
+                    is_known_type = False
+                    for j in KNOWN_TYPES:
+                        if isinstance(i, j):
+                            is_known_type = True
+                            break
+                    if not is_known_type:
+                        raise RuntimeError(
+                            f"Found {type(i)} in output, which is not a known type. "
+                            "If this type holds tensors, you need to register a pytree for it. "
+                            "See https://github.com/pytorch/functorch/issues/475 for a brief "
+                            "explanation why. If you don't need to register a pytree, please "
+                            "leave a comment explaining your use case and we'll make this more "
+                            "ergonomic to deal with"
+                        )
+                out_spec.set(spec)
+                return flat_out
+
+            compile_flat_args = (
+                pre_compile_fn(flat_args)
+                if pre_compile_fn is not None
+                else flat_args
+            )
+
+            compiled_fn = create_aot_dispatcher_function(
+                flat_fn,
+                compile_flat_args,
+                aot_config,
+            )
+            cached_res = (compiled_fn, out_spec)
+
+        cached_fn, out_spec = cached_res
+        out = cached_fn(flat_args)
+        return out_spec.unflatten(out)
+
+    return returned_function
diff --git a/torch/distributed/_spmd/api.py b/torch/distributed/_spmd/api.py
new file mode 100644
index 000000000000..5e3b52067b1e
--- /dev/null
+++ b/torch/distributed/_spmd/api.py
@@ -0,0 +1,53 @@
+from typing import Dict, Optional, Sequence, Tuple
+
+import torch.distributed as dist
+import torch.nn as nn
+from torch.distributed._spmd.distribute import distribute, Schema
+from torch.distributed._spmd.distributed_graph import DistributedGraph
+from torch.distributed._tensor import Placement, Replicate
+
+
+class SPMD(nn.Module):
+    def __init__(
+        self,
+        module: nn.Module,
+        schema: Schema,
+        input_schemas: Sequence[Placement] = tuple(),
+    ) -> None:
+        """
+        Given a non-distributed nn.Module, distribute the module and apply
+        optimizations over the distributed module (fx.GraphModule).
+
+        Args:
+            module (nn.Module): The target module.
+            schema (Schema): The distributed schema.
+            input_schemas (Sequence[Placement]): The schemas of the inputs.
+        """
+        super().__init__()
+        assert schema.placements == [
+            Replicate()
+        ], "SPMD only support Replicate() parameters for now"
+
+        # TODO: Fix model initialization with coalescing.
+        # This needs to happen post model transformation.
+        # Consider an explicit model init API.
+        for p in module.parameters():
+            dist.broadcast(p, src=0)
+
+        self._param_schema = schema
+        self._input_schemas = input_schemas
+        self._compiled_m: Optional[nn.Module] = None
+        self._dist_graph = DistributedGraph(orig_module=module)
+
+    def forward(self, *args: Tuple[object], **kwargs: Dict[str, object]) -> object:
+        if self._compiled_m is None:
+            self._compiled_m = distribute(
+                self._dist_graph,
+                self._param_schema,
+                self._input_schemas,
+                *args,
+                **kwargs,
+            )
+
+        assert self._compiled_m is not None
+        return self._compiled_m(*args, **kwargs)
diff --git a/torch/distributed/_spmd/config.py b/torch/distributed/_spmd/config.py
new file mode 100644
index 000000000000..54f0cc4dc5c8
--- /dev/null
+++ b/torch/distributed/_spmd/config.py
@@ -0,0 +1,27 @@
+import logging
+import sys
+from types import ModuleType
+from typing import Set
+
+# log level (levels print what it says + all levels listed below it)
+# DEBUG print full traces <-- lowest level + print tracing of every instruction
+# INFO print compiler functions + distributed graphs
+# WARN print warnings
+# ERROR print exceptions
+log_level: int = logging.DEBUG
+# Verbose will print full stack traces on warnings and errors
+verbose = False
+
+# the name of a file to write the logs to
+log_file_name: None = None
+
+
+class _AccessLimitingConfig(ModuleType):
+    def __setattr__(self, name, value) -> None:
+        if name not in _allowed_config_names:
+            raise AttributeError(f"{__name__}.{name} does not exist")
+        return object.__setattr__(self, name, value)
+
+
+_allowed_config_names: Set[str] = {*globals().keys()}
+sys.modules[__name__].__class__ = _AccessLimitingConfig
diff --git a/torch/distributed/_spmd/distribute.py b/torch/distributed/_spmd/distribute.py
new file mode 100644
index 000000000000..78ee67d737f0
--- /dev/null
+++ b/torch/distributed/_spmd/distribute.py
@@ -0,0 +1,631 @@
+from dataclasses import dataclass
+from enum import Enum, auto
+from functools import partial
+from typing import Dict, List, Optional, Sequence, Set, Tuple, cast
+import logging
+
+import torch
+import torch.fx as fx
+import torch.nn as nn
+from torch._functorch.aot_autograd import aot_module, make_boxed_func
+from torch._subclasses.fake_tensor import FakeTensorMode
+from torch.distributed._spmd.comm_tensor import _get_tracer
+from torch.distributed._spmd.log_utils import get_logger
+from torch.distributed._spmd.aot_function_patch import patched_aot_function
+from torch.distributed._spmd.distributed_graph import DistributedGraph
+from torch.distributed._spmd.graph_utils import OP
+from torch.distributed._spmd.experimental_ops import *  # noqa: F401, F403
+from torch.distributed._tensor import (
+    DeviceMesh,
+    DTensor,
+    Replicate,
+    Shard,
+)
+from torch.distributed._tensor.dispatch import (
+    _CURRENT_DECOMPOSITION_TABLE,
+    operator_dispatch
+)
+from torch.distributed._tensor.redistribute import (
+    _redistribute_with_local_tensor,
+)
+from torch.distributed._tensor.placement_types import _Partial, Placement
+from torch.fx.experimental.proxy_tensor import (
+    make_fx,
+    maybe_disable_fake_tensor_mode,
+    proxy_slot,
+)
+from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten
+
+# patch aot_function so that we can pass the full (non-sharded) input to capture the graph
+# pyre-fixme
+torch._functorch.aot_autograd.aot_function = patched_aot_function  # type: ignore[assignment]
+
+logger: Optional[logging.Logger] = None
+
+
+class TrainingPhase(Enum):
+    FORWARD = auto()
+    BACKWARD = auto()
+
+
+@dataclass
+class Schema:
+    mesh: DeviceMesh
+    placements: List[Placement]
+
+
+def _is_partial_dtensor(obj: object) -> bool:
+    """check if object is 1) DTensor and  2) with any placement of _Partial"""
+    if not isinstance(obj, DTensor):
+        return False
+
+    is_partial = False
+    for placement in obj.placements:
+        if isinstance(placement, _Partial):
+            is_partial = True
+            break
+
+    return is_partial
+
+
+def _dispatch_with_local_tensors(
+    op: torch._ops.OpOverload,
+    local_args: Tuple[object, ...],
+    kwargs: Optional[Dict[str, object]] = None,
+    specs: Optional[Dict[
+        torch.Tensor,
+        Tuple[torch.Size, DeviceMesh, Sequence[Placement], Sequence[Placement]],
+    ]] = None,
+) -> object:
+    if kwargs is None:
+        kwargs = {}
+    if specs is None:
+        specs = {}
+
+    def redistribute(arg: object) -> object:
+        return (
+            _redistribute_with_local_tensor(arg, *specs[arg])  # type: ignore[index]
+            if isinstance(arg, torch.Tensor) and arg in specs  # type: ignore[operator]
+            else arg
+        )
+
+    # TODO: this is broken because it won't redistributed potential tensors on the kwargs
+    return op(*tree_map(redistribute, local_args), **kwargs)
+
+
+# Figure out how to specify a type spec for the return specs value
+# without the entire structure.
+# pyre-fixme
+def _update_specs_for_redistribute(args, target_schema, redistribute):
+    # Code adapted from pack_args_kwargs_with_local_tensor
+    flatten_args, args_tree_spec = tree_flatten(args)
+    flatten_args_schema, _ = tree_flatten(target_schema.args_schema)
+
+    specs: Dict[
+        torch.Tensor,
+        Tuple[
+            torch.Size,
+            DeviceMesh,
+            Sequence[Placement],
+            Sequence[Placement],
+        ],
+    ] = {}
+    for i, arg in enumerate(flatten_args):
+        if isinstance(arg, DTensor):
+            if redistribute:
+                specs[arg._local_tensor] = (
+                    arg.size(),
+                    flatten_args_schema[i].mesh,
+                    arg.placements,
+                    flatten_args_schema[i].placements,
+                )
+            flatten_args_schema[i] = arg._local_tensor
+
+    unflattened_args = tree_unflatten(flatten_args_schema, args_tree_spec)
+    return specs, unflattened_args
+
+
+def _get_dtensor_dispatch_graph(
+    node: fx.Node,
+    node_to_obj: Dict[fx.Node, object],
+) -> fx.GraphModule:
+    def _remap_arg(arg: object) -> object:
+        if isinstance(arg, torch.fx.Node):
+            obj = node_to_obj[arg]
+            if _get_tracer():
+                # This is a shared arg, already has a tracer from previous
+                # tracing. Delete the tracer.
+                del cast(Dict[object, object], obj.__dict__)[proxy_slot]
+            return obj
+        else:
+            return arg
+
+    # Args should be a list of objects post remapping.
+    args = tree_map(_remap_arg, node.args)
+    # kwargs in this set of tests are all constants
+    kwargs = cast(Dict[str, object], node.kwargs)
+
+    op_overload = cast(torch._ops.OpOverload, node.target)
+
+    # run dispatch once to get the real DTensor output.
+    with torch.no_grad():
+        out = operator_dispatch(
+            op_overload,
+            args,
+            kwargs,  # kwargs in this set of tests are all constants
+            DTensor._propagator,
+            DTensor._custom_dispatch_ops,
+        )
+        node_to_obj[node] = out
+
+    op_schema = DTensor._propagator.prepare_op_schema(op_overload, args, kwargs)
+    # get DTensor specs for inputs and outputs
+    output_sharding = DTensor._propagator.propagate_op_sharding(
+        op_overload,
+        op_schema,
+    )
+
+    assert output_sharding.schema_suggestions is not None
+    target_schema = output_sharding.schema_suggestions[0]
+    redistribute = target_schema is not op_schema
+
+    # TODO: this is broken when kwargs contains tensors
+    # or if a non-tensor kwarg was modified by the sharding propagation
+    # (in order to fix, need to port over pack_args_kwargs_with_local_tensor for kwargs as well)
+    updated_args_spec, unflattened_args = _update_specs_for_redistribute(
+        args, target_schema, redistribute
+    )
+
+    dispatch = partial(
+        _dispatch_with_local_tensors,
+        op_overload,
+        kwargs=kwargs,
+        specs=updated_args_spec,
+    )
+
+    return make_fx(dispatch)(unflattened_args)
+
+
+def _build_dummy_add_graph(
+    dt: DTensor, node_to_obj: Dict[fx.Node, object]
+) -> Tuple[fx.GraphModule, object]:
+    """
+    Creates a graph for a dummy add function from a partial DTensor.
+    This dummy add is used for triggering all_reduce on a Partial DTensor
+    during the DTensor expansion of the traced graph.
+    Also returns the actual DTensor after resharding.
+    """
+
+    def dummy_add(grad: torch.Tensor, zero: torch.Tensor) -> torch.Tensor:
+        return grad + zero
+
+    grad: torch.Tensor = dt._local_tensor
+    zero: torch.Tensor = torch.zeros_like(dt._local_tensor)
+
+    traced_add = make_fx(dummy_add)(grad, zero)
+
+    placeholders = [n for n in traced_add.graph.nodes if n.op == OP.PLACEHOLDER]
+    call_functions = [
+        n for n in traced_add.graph.nodes if n.op == OP.CALL_FUNCTION
+    ]
+    assert len(placeholders) == 2
+    assert len(call_functions) == 1
+    node_to_obj[placeholders[0]] = dt
+    node_to_obj[placeholders[1]] = DTensor.from_local(
+        zero, dt.device_mesh, [Replicate()], run_check=False
+    )
+
+    traced_dispatch = _get_dtensor_dispatch_graph(
+        call_functions[0], node_to_obj
+    )
+
+    traced_dispatch.graph.lint()
+
+    # TODO(anj): This depends on the call function node -> actual DTensor output
+    # mapping that we want to avoid for SPMD expansion
+    return traced_dispatch, node_to_obj[call_functions[0]]
+
+
+def _convert_output(
+    gm: fx.GraphModule,
+    node: fx.Node,
+    node_to_obj: Dict[fx.Node, object],
+) -> fx.Node:
+    new_args = []
+    has_partial = False
+    for argument in node.args[0]:  # type: ignore[union-attr]
+        if not isinstance(argument, fx.Node):
+            new_args.append(argument)
+            continue
+
+        obj = node_to_obj[argument]
+
+        if not _is_partial_dtensor(obj):
+            new_args.append(argument)
+            continue
+
+        has_partial = True
+
+        # we know it's a dtensor from is partial DT check...
+        dt = cast(DTensor, obj)
+
+        traced_dispatch, result_obj = _build_dummy_add_graph(dt, node_to_obj)
+
+        wait = [n for n in traced_dispatch.graph.nodes if n.name == "wait_comm"]
+        add = [n for n in traced_dispatch.graph.nodes if n.name == "add"]
+        assert len(wait) == 1 and len(add) == 1
+
+        # remove add node and replace it with wait node
+        add[0].replace_all_uses_with(wait[0])
+        traced_dispatch.graph.lint()
+        traced_dispatch.graph.eliminate_dead_code()
+        # also update the actual DTensor corresponding to the node
+        # TODO(anj): We require mapping of the final DTensor output to the wait
+        # comm node.
+        node_to_obj[wait[0]] = result_obj
+
+        value_remap: Dict[fx.Node, fx.Node] = {}
+        for dtn in traced_dispatch.graph.nodes:
+            if dtn.op == OP.PLACEHOLDER:
+                # do nothing, ignore placeholders, as it has
+                # already been prepared in value_remap
+                value_remap[dtn] = argument
+            elif dtn.op == OP.OUTPUT:
+                assert (
+                    len(dtn.args) == 1 and len(dtn.args[0]) == 1
+                ), f"Expecting single output, but got {dtn.args} {len(dtn.args)}"
+                new_args.append(value_remap[dtn.args[0][0]])
+                # the concrete DTensor value of output was added when creating the
+                # inner graph (in _build_dummy_add_graph). Just add it to the final
+                # output node so that we can report the final output specs correctly.
+                # TODO(anj): We are depending on the concrete DTensor output of the dummy add.
+                node_to_obj[value_remap[dtn.args[0][0]]] = node_to_obj[
+                    dtn.args[0][0]
+                ]
+
+            else:
+                if dtn.op == OP.GET_ATTR:
+                    setattr(
+                        gm,
+                        dtn.target,
+                        getattr(traced_dispatch, dtn.target),
+                    )
+                with gm.graph.inserting_before(node):
+                    value_remap[dtn] = gm.graph.node_copy(
+                        dtn, lambda n: value_remap[n]
+                    )
+    if has_partial:
+        gm.graph.erase_node(node)
+        return gm.graph.output(new_args)
+    else:
+        return node
+
+
+def _rebuild_graph(
+    gm: fx.GraphModule,
+    node_replacements: Dict[torch.fx.Node, torch.fx.GraphModule],
+) -> None:
+
+    # replace nodes in local traced graph with DTensor's dispatch graph
+    for node in gm.graph.nodes:
+        if node not in node_replacements:
+            continue
+
+        traced_dispatch = node_replacements[node]
+        # Map DT's dispatch graph input placeholder nodes to the ones in
+        # local traced graph. It uses index-based accessing, which is
+        # brittle, just for testing purpose.
+        flatten_args, _ = tree_flatten(node.args)
+        i, value_remap = 0, {}
+        for dtn in traced_dispatch.graph.nodes:
+            if dtn.op == OP.PLACEHOLDER:
+                value_remap[dtn] = flatten_args[i]
+                i += 1
+
+        # insert DT's dispatch graph to traced local graph.
+        with gm.graph.inserting_before(node):
+            for dtn in traced_dispatch.graph.nodes:
+
+                if dtn.op == OP.PLACEHOLDER:
+                    # do nothing, ignore placeholders, as it has already
+                    # been prepared in value_remap
+                    pass
+                elif dtn.op == OP.OUTPUT:
+                    assert (
+                        len(dtn.args) == 1
+                    ), f"Expecting single output, but got {dtn.args} {len(dtn.args[0])}"
+                    outputs = dtn.args[0]
+                    # we currently support two very specific types of output
+                    # 1. single output
+                    # 2. multiple outputs resulting from getitem of all elements of tuple
+                    if len(outputs) == 1:
+                        # for single output, we replace the node with the single node
+                        output = outputs[0]
+                    else:
+                        # for multiple outputs, we check that these outputs correspond
+                        # to all elements of a tuple. In that case, we replace
+                        # uses of the output directly with the original tuple
+                        source = None
+                        for i, out in enumerate(outputs):
+                            # we allow None outputs for certain items in the tuple
+                            if out is None:
+                                continue
+                            assert out.op == "call_function"
+                            assert out.target.__module__ == "_operator"
+                            assert out.target.__name__ == "getitem"
+                            assert source is None or source == out.args[0]
+                            source = out.args[0]
+                            assert out.args[1] == i
+                        assert source is not None
+                        output = source
+
+                    new_node = value_remap[output]
+                    node.replace_all_uses_with(new_node)
+                else:
+                    value_remap[dtn] = gm.graph.node_copy(
+                        dtn, lambda n: value_remap[n]
+                    )
+
+    gm.graph.lint()
+    gm.graph.eliminate_dead_code()
+    gm.recompile()
+
+
+def _get_last_consumer_to_nodes(
+    graph: fx.Graph,
+) -> Dict[fx.Node, List[fx.Node]]:
+    # Run through reverse nodes and record the first instance of a use
+    # of a given node. This represents the *last* use of the node in the
+    # execution order of the program, which we will use to free unused
+    # values
+    node_to_last_consumer: Dict[fx.Node, fx.Node] = {}
+    last_consumer_to_nodes: Dict[fx.Node, List[fx.Node]] = {}
+
+    def _register_final_consumer(arg_node: fx.Node, consumer: fx.Node) -> None:
+        if arg_node not in node_to_last_consumer:
+            node_to_last_consumer[arg_node] = consumer
+            last_consumer_to_nodes.setdefault(consumer, []).append(arg_node)
+
+    for node in reversed(graph.nodes):
+        fx.node.map_arg(
+            node.args, lambda arg_node: _register_final_consumer(arg_node, node)
+        )
+        fx.node.map_arg(
+            node.kwargs,
+            lambda kwarg_node: _register_final_consumer(kwarg_node, node),
+        )
+
+    return last_consumer_to_nodes
+
+
+def _convert_to_distributed(
+    gm: fx.GraphModule,
+    inps: List[torch.Tensor],
+    schemas: List[Schema],
+    _allow_partial: bool = False,
+) -> Tuple[fx.GraphModule, Dict[str, Schema]]:
+    """
+    Returns:
+        - transformed graph module
+        - map from output name to DTensorSpec
+    """
+    global logger
+    logger = get_logger("spmd_exp")
+    node_to_obj: Dict[fx.Node, object] = {}
+    # map local op node in traced_f to its corresponding subgraph of
+    # DTensor ops.
+    node_replacements: Dict[torch.fx.Node, torch.fx.GraphModule] = {}
+
+    last_consumer_to_nodes = _get_last_consumer_to_nodes(gm.graph)
+
+    output_schemas: Dict[str, Schema] = {}
+    for i, node in enumerate(gm.graph.nodes):
+        assert logger is not None
+        logger.info(f"node{i}: op={node.op} target={node.target}")
+        if node.op == OP.PLACEHOLDER:
+            assert i < len(
+                inps
+            ), f"got more placeholer nodes ({i + 1}) than inputs ({len(inps)})"
+
+            # our example inputs are local shards. Create DTensors from them.
+            node_to_obj[node] = DTensor.from_local(
+                inps[i],
+                schemas[i].mesh,
+                schemas[i].placements,
+                # prevent running this collective in backwards pass
+                run_check=False,
+            )
+
+        elif isinstance(node.target, torch._ops.OpOverload):
+            node_replacements[node] = _get_dtensor_dispatch_graph(
+                node, node_to_obj
+            )
+        elif node.op == OP.OUTPUT:
+            if not _allow_partial:
+                # Returns an expanded dummy add node that ensures
+                # that the partial output tensor has been converted
+                # to a replicated tensor.
+                node = _convert_output(gm, node, node_to_obj)
+
+            # Save output sharding for the inputs to backward pass.
+            # TODO(anj): Pipe the output schema for the BW pass
+            # instead of requiring the full output DTensor to be
+            # materialized.
+            for inp_arg in node.args[0]:
+                if isinstance(inp_arg, fx.Node):
+                    obj = node_to_obj[inp_arg]
+                    if isinstance(obj, DTensor):
+                        output_schemas[inp_arg.name] = Schema(
+                            obj.device_mesh, obj.placements  # type: ignore[arg-type]
+                        )
+
+        elif node.op == OP.CALL_FUNCTION:
+
+            def _remap_arg(arg: object) -> object:
+                if isinstance(arg, torch.fx.Node):
+                    obj = node_to_obj[arg]
+                    if _get_tracer():
+                        # This is a shared arg, already has a tracer from previous
+                        # tracing. Delete the tracer.
+                        del cast(Dict[object, object], obj.__dict__)[proxy_slot]
+                    return obj
+                else:
+                    return arg
+
+            args = tree_map(_remap_arg, node.args)
+            assert (
+                len(args) >= 2
+            ), f"Expected number of args for call function to be at least 2, found {len(args)}"
+            # TODO(anj): Why do we assume this is only 2?
+            node_to_obj[node] = node.target(args[0], args[1])
+        else:
+            raise ValueError(f"Unrecognized node.op type {node.op}")
+
+        if node in last_consumer_to_nodes:
+            # Save memory by deleting objs that wont be used anymore.
+            for arg_node in last_consumer_to_nodes[node]:
+                del node_to_obj[arg_node]
+
+    _rebuild_graph(gm, node_replacements)
+
+    return gm, output_schemas
+
+
+class _SPMD:
+    def __init__(
+        self,
+        dist_graph: DistributedGraph,
+        param_schema: Schema,
+        input_schemas: Sequence[Placement],
+    ) -> None:
+        self._dist_graph = dist_graph
+        self._param_schema = param_schema
+        # Override the default sharding of input to the model.
+        self._input_schemas = input_schemas
+        # used to propagate sharding from the output of the forward pass to
+        # the input of backward pass
+        self._known_specs_by_node_name: Dict[str, Schema] = {}
+
+    def _is_param(self, t: torch.Tensor) -> bool:
+        # N.B.: id(t) and id(param) does not match
+        orig_module = cast(nn.Module, self._dist_graph.orig_module)
+        return t.data_ptr() in (p.data_ptr() for p in orig_module.parameters())
+
+    def _compile_wrapper(
+        self,
+        training_phase: TrainingPhase,
+        original_inputs: List[List[torch.Tensor]],
+        gm: fx.GraphModule,
+        inps: List[torch.Tensor],
+    ) -> fx.GraphModule:
+
+        with maybe_disable_fake_tensor_mode():
+            return self._compile(training_phase, gm, original_inputs[0])
+
+    def _compile(
+        self,
+        training_phase: TrainingPhase,
+        gm: fx.GraphModule,
+        inps: List[torch.Tensor],
+    ) -> fx.GraphModule:
+        shard_schema: Schema = Schema(
+            mesh=self._param_schema.mesh, placements=[Shard(0)]
+        )
+        schemas: List[Schema] = []
+        inp_schema_count = 0
+        nparams = 0
+
+        # iterate through inputs (and initial nodes of the graph that should
+        # correspond 1:1 to those inputs)
+        for inp, placeholder_node in zip(inps, gm.graph.nodes):
+            # This is a no-op but we want the order of schemas
+            # to match the order of inputs when we iterate through
+            # the graph. Usually the non-tensor inputs are at the
+            # end of the list so we could drop the schemas for it.
+
+            assert placeholder_node.op == "placeholder", (
+                "Expected initial nodes of the GraphModule to be input placeholders. "
+                "Got {placeholder_node.op}"
+            )
+
+            known_schema = self._known_specs_by_node_name.get(
+                placeholder_node.name
+            )
+
+            if known_schema is not None:
+                schemas.append(known_schema)
+            elif not isinstance(inp, torch.Tensor):
+                schemas.append(
+                    Schema(
+                        mesh=self._param_schema.mesh, placements=[Replicate()]
+                    )
+                )
+            else:
+                if self._is_param(inp):
+                    schemas.append(self._param_schema)
+                    nparams += 1
+                elif self._input_schemas:
+                    schemas.append(self._input_schemas[inp_schema_count])  # type: ignore[arg-type]
+                    inp_schema_count += 1
+                else:
+                    schemas.append(shard_schema)
+
+        parallelized_gm, output_specs = _convert_to_distributed(
+            gm,
+            inps,
+            schemas,
+            _allow_partial=False,
+        )
+        self._known_specs_by_node_name.update(output_specs)
+
+        if training_phase == TrainingPhase.FORWARD:
+            self._dist_graph.fwd_graph_modules.append(parallelized_gm)
+        elif training_phase == TrainingPhase.BACKWARD:
+            self._dist_graph.bwd_graph_modules.append(parallelized_gm)
+        return make_boxed_func(parallelized_gm)
+
+
+def distribute(
+    dist_graph: DistributedGraph,
+    param_schema: Schema,
+    input_schemas: Sequence[Placement],
+    *args: Tuple[object],
+    **kwargs: Dict[str, object],
+) -> nn.Module:
+
+    flat_args, _ = tree_flatten(args)
+    flat_kwargs, _ = tree_flatten(kwargs)
+    input_set: Set[object] = set(flat_args + flat_kwargs)
+
+    fake_mode: FakeTensorMode = FakeTensorMode()
+
+    # will update this to the original forward inputs
+    original_inputs: List[Optional[Sequence[object]]] = [None]
+
+    def input_to_fake(input: object) -> object:
+        if not isinstance(input, torch.Tensor):
+            return input
+        y = fake_mode.from_tensor(input)
+        if input in input_set:
+            # "unshard" our fake tensor
+            # (considers that inputs are sharded)
+            y = y.repeat(param_schema.mesh.size(0), *((1,) * (y.ndim - 1)))
+        # TODO assume non-inputs (params, etc) are replicated for now.
+        return y
+
+    def gather_inputs_for_compilation(
+        inps: Tuple[object, ...],
+    ) -> Tuple[object, ...]:
+        original_inputs[0] = inps
+        return tuple(input_to_fake(x) for x in inps)
+
+    spmd = _SPMD(dist_graph, param_schema, input_schemas)
+    compiled_m = aot_module(
+        cast(nn.Module, dist_graph.orig_module),
+        partial(spmd._compile_wrapper, TrainingPhase.FORWARD, original_inputs),
+        partial(spmd._compile, TrainingPhase.BACKWARD),
+        pre_compile_fn=gather_inputs_for_compilation,
+        decompositions=_CURRENT_DECOMPOSITION_TABLE,
+    )
+
+    return compiled_m
diff --git a/torch/distributed/_spmd/distributed_graph.py b/torch/distributed/_spmd/distributed_graph.py
new file mode 100644
index 000000000000..bc838d04d9b0
--- /dev/null
+++ b/torch/distributed/_spmd/distributed_graph.py
@@ -0,0 +1,30 @@
+from typing import List, Optional
+
+import torch.nn as nn
+from torch import fx
+
+
+class DistributedGraph:
+    def __init__(
+        self,
+        orig_module: Optional[nn.Module] = None,
+    ) -> None:
+        self.orig_module: Optional[nn.Module] = orig_module
+        self.fwd_graph_modules: List[fx.GraphModule] = []
+        self.bwd_graph_modules: List[fx.GraphModule] = []
+
+        # Indicate `update()` must be called before applying any optimization.
+        self._dirty = True
+
+    def validate(self) -> None:
+        return
+
+    def update(self) -> "DistributedGraph":
+        """
+        Utility to put graph module into a node map for easier adjustments.
+        """
+        if not self._dirty:
+            return self
+
+        self.validate()
+        return self
diff --git a/torch/distributed/_spmd/experimental_ops.py b/torch/distributed/_spmd/experimental_ops.py
new file mode 100644
index 000000000000..be8c2e9d7507
--- /dev/null
+++ b/torch/distributed/_spmd/experimental_ops.py
@@ -0,0 +1,191 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import Optional, Sequence
+
+import torch
+
+from torch.distributed._tensor.placement_types import (
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+    _Partial,
+)
+from torch.distributed._tensor.op_schema import OpSchema, OutputSharding
+from torch.distributed._tensor.ops.utils import register_prop_rule
+from torch.distributed._tensor.ops.common_rules import pointwise_rule
+
+aten = torch.ops.aten  # pyre-ignore
+
+
+@register_prop_rule(aten.native_layer_norm.default)  # pyre-ignore
+def _prop_native_layer_norm(op_schema: OpSchema) -> OutputSharding:
+    input, normalized_shape, weight, bias, eps = op_schema.args_schema
+    assert isinstance(input, DTensorSpec)
+    assert isinstance(weight, DTensorSpec)
+    assert isinstance(bias, DTensorSpec)
+    assert isinstance(normalized_shape, (tuple, list))
+    assert all(isinstance(p, Replicate) for p in weight.placements)
+    assert all(isinstance(p, Replicate) for p in bias.placements)
+    # only the left-most (non-normalized) dimensions of the input can be sharded
+    batch_ndim = len(input.shape) - len(normalized_shape)
+    assert all(
+        isinstance(p, Replicate)
+        or (isinstance(p, Shard) and p.dim < batch_ndim,)
+        for p in input.placements
+    )
+    stats_spec = DTensorSpec(
+        mesh=weight.mesh,
+        placements=input.placements,
+    )
+    return OutputSharding(output_spec=(input, stats_spec, stats_spec))
+
+
+@register_prop_rule(aten.native_layer_norm_backward.default)  # pyre-ignore
+def _prop_native_layer_norm_backward(op_schema: OpSchema) -> OutputSharding:
+    (
+        grad,
+        input,
+        normalized_shape,
+        result1,
+        result2,
+        weight,
+        bias,
+        grad_input_mask,
+    ) = op_schema.args_schema
+    assert isinstance(grad, DTensorSpec)
+    assert isinstance(weight, DTensorSpec)
+    assert isinstance(bias, DTensorSpec)
+    assert isinstance(grad_input_mask, (list, tuple))
+    assert all(isinstance(s, Replicate) for s in weight.placements)
+    assert all(isinstance(s, Replicate) for s in bias.placements)
+    # ensure sharding on dim 0, which will trigger the "Partial" output on weight and bias grads
+    assert any(
+        isinstance(s, Shard) and s.dim == 0 for s in grad.placements
+    ), f"Got {grad.placements}"
+    weight_grad = DTensorSpec(
+        mesh=weight.mesh,
+        placements=[_Partial()] * weight.mesh.ndim,
+    )
+    bias_grad = DTensorSpec(
+        mesh=bias.mesh,
+        placements=[_Partial()] * bias.mesh.ndim,
+    )
+    return OutputSharding(
+        # NOTE: type errors below are legit. This is because DTensor currently
+        # doesn't support Optional return values. Need to be fixed in DTensor repo.
+        output_spec=(
+            grad if grad_input_mask[0] else None,
+            weight_grad if grad_input_mask[1] else None,
+            bias_grad if grad_input_mask[2] else None,
+        ),
+    )
+
+
+def _refine_sharding(
+    op_schema: OpSchema, active_dim: Optional[int]
+) -> Sequence[Placement]:
+    """
+    Considers 2 first inputs of op_schema as having same shape,
+    and returns suggested placement for a pointwise operation.
+    """
+    # consider the operating dimension as a singleton to prevent sharding on it
+    # however, if active_dim is None, this means the input and output shapes are equal and
+    # we'll apply exactly the pointwise rule.
+    args_schema = [
+        DTensorSpec(
+            mesh=s.mesh,  # type: ignore[attr-defined]
+            placements=s.placements,  # type: ignore[attr-defined]
+            tensor_meta=s.tensor_meta,  # type: ignore[attr-defined]
+        )
+        for s in op_schema.args_schema[:2]
+    ]
+
+    op_schema = OpSchema(
+        func_schema=op_schema.func_schema,
+        args_schema=args_schema,  # type: ignore[arg-type]
+        kwargs_schema={},
+        is_inplace=op_schema.is_inplace,
+        is_out_variant=op_schema.is_out_variant,
+    )
+    output_sharding = pointwise_rule(op_schema, linearity=False)
+    if output_sharding.output_spec:
+        assert isinstance(output_sharding.output_spec, DTensorSpec)
+        return output_sharding.output_spec.placements
+    else:
+        assert output_sharding.schema_suggestions is not None
+        out_schema = output_sharding.schema_suggestions[0].args_schema[0]
+        assert isinstance(out_schema, DTensorSpec)
+        return tuple(out_schema.placements)
+
+
+@register_prop_rule(aten.slice_scatter.default)  # pyre-ignore
+def prop_slice_scatter(op_schema: OpSchema) -> OutputSharding:
+    # 1. number of dimensions in input and src need to match.
+    # 2. number of elements on all non-dim need to match between input and src.
+    # 3. numer of elements in src in dim need to match the slice size.
+    # Given the above:
+    # - We suggest for src to follow the sharding of input, except on the scatter dimension,
+    #   where our best bet for now is to make them replicated as a fall-back.
+    #   TODO: Ideally we'd like to make sure the output is re-sharded afterwards to keep input sharding.
+
+    defaults = (None, None, 0, None, None, 1)
+    input, src, dim, start, end, step = (
+        op_schema.args_schema + defaults[len(op_schema.args_schema) :]
+    )
+    assert isinstance(input, DTensorSpec)
+    assert isinstance(src, DTensorSpec)
+    assert isinstance(dim, int)
+
+    if dim < 0:
+        dim += input.ndim
+
+    # if the input shape and the output shape are the same on the operating dimension,
+    # this is effectively a no-op, so we just propagate sharding as we would do for
+    # pointwise, no exceptions.
+    if input.shape[dim] == src.shape[dim]:
+        assert start == 0
+        assert end >= src.shape[dim]  # type: ignore[operator]
+        dim = None
+
+    # apply sharding refinement as implemented in pointwise_rule
+    input_suggestion = list(_refine_sharding(op_schema, dim))
+    # apply the exception -- disallow sharding on the operating dimension.
+    for i, p in enumerate(input_suggestion):
+        if isinstance(p, Shard) and p.dim == dim:
+            input_suggestion[i] = Replicate()
+    input_suggestion = tuple(input_suggestion)  # type: ignore[assignment]
+
+    if input_suggestion == tuple(input.placements) and src.placements == tuple(
+        input.placements
+    ):
+        # if our sharding is correct, the output sharding will be the same as the input.
+        return OutputSharding(
+            output_spec=DTensorSpec(
+                mesh=input.mesh,
+                placements=input.placements,
+            )
+        )
+    else:
+        # otherwise, return the suggestion.
+        return OutputSharding(
+            output_spec=None,
+            schema_suggestions=[
+                OpSchema(
+                    func_schema=op_schema.func_schema,
+                    args_schema=(
+                        DTensorSpec(
+                            mesh=input.mesh,
+                            placements=input_suggestion,
+                            tensor_meta=input.tensor_meta,
+                        ),
+                        DTensorSpec(
+                            mesh=src.mesh,
+                            placements=input_suggestion,
+                            tensor_meta=src.tensor_meta,
+                        ),
+                    )
+                    + op_schema.args_schema[2:],
+                    kwargs_schema=op_schema.kwargs_schema,
+                )
+            ],
+        )
diff --git a/torch/distributed/_spmd/graph_utils.py b/torch/distributed/_spmd/graph_utils.py
new file mode 100644
index 000000000000..f7af160614f4
--- /dev/null
+++ b/torch/distributed/_spmd/graph_utils.py
@@ -0,0 +1,113 @@
+from enum import Enum
+from typing import List, Optional, Set, Tuple, Union
+
+import torch.fx as fx
+from torch.fx.passes.shape_prop import TensorMetadata
+
+
+class OP(str, Enum):
+    CALL_FUNCTION = "call_function"
+    CALL_MODULE = "call_module"
+    CALL_METHOD = "call_method"
+    GET_ATTR = "get_attr"
+    OUTPUT = "output"
+    PLACEHOLDER = "placeholder"
+
+
+class CommType(str, Enum):
+    ALLREDUCE = "allreduce_"
+    ALLGATHER = "allgather_"
+    BROADCAST = "broadcast_"
+    REDUCESCATTER = "reduce_scatter_"
+    SCATTER = "scatter_"
+
+
+comm_block_op_sequence: Tuple[Union[str, Set[CommType]], ...] = (
+    "clone",
+    "_tensor_constant",
+    "_tensor_constant",
+    # The supported communication type.
+    {CommType.ALLREDUCE},
+    "comm_result",
+    "getitem",
+    "getitem",
+    "wait_comm",
+)
+
+
+def get_comm_block_nodes(
+    wait_node: fx.Node, comm_type: CommType
+) -> Tuple[int, List[fx.Node]]:
+    """
+    Given a wait_comm node, find out all the nodes belong to this communcation.
+
+    Args:
+        wait_node(fx.Node): The target wait_comm node.
+        comm_type(CommType): The communication type of this communication block.
+            Currently, only allreduce is supported. An exception will be raised
+            if other values are passed.
+    Returns:
+        comm_idx(int): The index to the communication node in the return list.
+        node_list(List[fx.Node]): The list that contain the nodes in the order
+           of inserting to the graph.
+    """
+    if not wait_node.name.startswith("wait_comm"):
+        raise ValueError(
+            "Passing a wait_node that name does not start with ``wait_comm``. "
+            f"Name is {wait_node.name}, OP is {wait_node.op}."
+        )
+    node = wait_node
+    node_list = []
+    for i, prefix in enumerate(reversed(comm_block_op_sequence)):
+        node_list.append(node)
+        if isinstance(prefix, set):
+            if comm_type not in prefix:
+                raise ValueError(f"Not supported CommType {comm_type}")
+            prefix = comm_type
+            comm_idx = i
+        assert node.name.startswith(
+            prefix
+        ), f"Comm block op sequence mismatches, {node.op} {node.name} {i} {prefix}."
+        node = node.prev
+
+    comm_idx = len(node_list) - comm_idx - 1
+    node_list.reverse()
+
+    return comm_idx, node_list
+
+
+def get_node_tensor_metadata(node: fx.Node, is_required: bool = True) -> TensorMetadata:
+    metadata = node.meta.get("tensor_meta", None)
+    if is_required and metadata is None:
+        raise RuntimeError(
+            f"Callsite expects that ``tensor_meta`` exists in ``{node.name}``, "
+            f"but got None instead. Node: {node.op} {node.name} {node.target}"
+        )
+    return metadata
+
+
+def get_output_node(gm: fx.GraphModule) -> Optional[fx.Node]:
+    """
+    Take a graphmodule and returns the graph output node. We traverse in reverse
+    to expedite it, with the idea that last node should be output
+    """
+    if gm.graph is None:
+        raise ValueError("Missing graph from graph module.")
+
+    for node in reversed(gm.graph.nodes):
+        if node.op == OP.OUTPUT:
+            return node
+    return None
+
+
+def rebuild_graph(gm: fx.GraphModule, remove_dead_code: bool = True) -> None:
+    """
+    Runs the required steps to ensure production-ready graph.
+    note - per the fx docs, eliminate dead code is not very precise.
+    Hence, the flag to make this step optional.
+    """
+
+    gm.graph.lint()
+    if remove_dead_code:
+        gm.graph.eliminate_dead_code()
+    gm.recompile()
diff --git a/torch/distributed/_spmd/log_utils.py b/torch/distributed/_spmd/log_utils.py
new file mode 100644
index 000000000000..1a8a9f0400ea
--- /dev/null
+++ b/torch/distributed/_spmd/log_utils.py
@@ -0,0 +1,78 @@
+import logging
+import logging.config
+import os
+from typing import Optional
+
+import torch.distributed as dist
+
+
+LOGGING_CONFIG = {
+    "version": 1,
+    "formatters": {
+        "spmd_format": {"format": "%(name)s: [%(levelname)s] %(message)s"},
+        "graph_opt_format": {"format": "%(name)s: [%(levelname)s] %(message)s"},
+    },
+    "handlers": {
+        "spmd_console": {
+            "class": "logging.StreamHandler",
+            "level": "DEBUG",
+            "formatter": "spmd_format",
+            "stream": "ext://sys.stdout",
+        },
+        "graph_opt_console": {
+            "class": "logging.StreamHandler",
+            "level": "DEBUG",
+            "formatter": "graph_opt_format",
+            "stream": "ext://sys.stdout",
+        },
+        "null_console": {
+            "class": "logging.NullHandler",
+        },
+    },
+    "loggers": {
+        "spmd_exp": {
+            "level": "DEBUG",
+            "handlers": ["spmd_console"],
+            "propagate": False,
+        },
+        "graph_opt": {
+            "level": "DEBUG",
+            "handlers": ["graph_opt_console"],
+            "propagate": False,
+        },
+        "null_logger": {
+            "handlers": ["null_console"],
+            "propagate": False,
+        },
+        # TODO(anj): Add loggers for MPMD
+    },
+    "disable_existing_loggers": False,
+}
+
+
+def get_logger(log_type: str) -> Optional[logging.Logger]:
+    from torch.distributed._spmd import config
+
+    if "PYTEST_CURRENT_TEST" not in os.environ:
+        logging.config.dictConfig(LOGGING_CONFIG)
+        avail_loggers = list(LOGGING_CONFIG["loggers"].keys())  # type: ignore[attr-defined]
+        assert (
+            log_type in avail_loggers
+        ), f"Unable to find {log_type} in the available list of loggers {avail_loggers}"
+
+        if not dist.is_initialized():
+            return logging.getLogger(log_type)
+
+        if dist.get_rank() == 0:
+            logger = logging.getLogger(log_type)
+            logger.setLevel(config.log_level)
+            if config.log_file_name is not None:
+                log_file = logging.FileHandler(config.log_file_name)
+                log_file.setLevel(config.log_level)
+                logger.addHandler(log_file)
+        else:
+            logger = logging.getLogger("null_logger")
+
+        return logger
+
+    return logging.getLogger("null_logger")
diff --git a/torch/distributed/_tensor/README.md b/torch/distributed/_tensor/README.md
index ba7ef77cbb5c..d61f47f7fceb 100644
--- a/torch/distributed/_tensor/README.md
+++ b/torch/distributed/_tensor/README.md
@@ -1,6 +1,6 @@
-# PyTorch DistributedTensor (DTensor)
+# PyTorch DTensor (Prototype Release)
 
-This folder contains the DistributedTensor (a.k.a DTensor) implementation in PyTorch.
+This folder contains the DTensor (a.k.a DistributedTensor) implementation in PyTorch.
 
 ## Introduction
 We propose distributed tensor primitives to allow easier distributed computation authoring in SPMD(Single Program Multiple Devices) paradigm. The primitives are simple but powerful when used to express tensor distributions with both sharding and replication parallelism strategies. This could empower native Tensor parallelism among other advanced parallelism explorations. For example, to shard a big tensor across devices with 3 lines of code:
@@ -9,7 +9,10 @@ We propose distributed tensor primitives to allow easier distributed computation
 import torch
 from torch.distributed._tensor import DeviceMesh, Shard, distribute_tensor
 
-# Create a mesh topology with the available devices.
+# Create a mesh topology with the available devices:
+# 1. We can directly create the mesh using elastic launcher,
+# 2. If using mp.spawn, we need to initialize the world process_group first.
+#   i.e. torch.distributed.init_process_group(backend="nccl", world_size=world_size)
 mesh = DeviceMesh("cuda", list(range(world_size)))
 big_tensor = torch.randn(100000, 88)
 # Shard this tensor over the mesh by sharding `big_tensor`'s 0th dimension over the 0th dimension of `mesh`.
@@ -22,58 +25,59 @@ Today there are mainly three ways to scale up distributed training: Data Paralle
 
 An ideal scenario is that users could build their distributed program just like authoring in a single node/device, without worrying about how to do distributed training in a cluster, and our solutions could help them run distributed training in an efficient manner. For example, researchers just need to build the big transformer model, and PyTorch Distributed automatically figures out how to split the model and run pipeline parallel across different nodes, how to run data parallel and tensor parallel within each node. In order to achieve this, we need some common abstractions to distribute tensor values and distributed computations accordingly.
 
-There're many recent works that working on tensor level parallelism to provide common abstractions, see the `Related Works` in the last section for more details. Inspired by [GSPMD](https://arxiv.org/pdf/2105.04663.pdf), [Oneflow](https://arxiv.org/pdf/2110.15032.pdf) and [TF’s DTensor](https://www.tensorflow.org/guide/dtensor_overview), we introduce DistributedTensor as the next generation of ShardedTensor to provide basic abstractions for distributing storage and computation. It serves as one of the basic building blocks for distributed program translations and describes the layout of a distributed training program. With the DistributedTensor abstraction, we can seamlessly build parallelism strategies such as tensor parallelism, DDP and FSDP.
+There're many recent works that working on tensor level parallelism to provide common abstractions, see the `Related Works` in the last section for more details. Inspired by [GSPMD](https://arxiv.org/pdf/2105.04663.pdf), [Oneflow](https://arxiv.org/pdf/2110.15032.pdf) and [TF’s DTensor](https://www.tensorflow.org/guide/dtensor_overview), we introduce PyTorch DTensor as the next generation of ShardedTensor to provide basic abstractions for distributing storage and computation. It serves as one of the basic building blocks for distributed program translations and describes the layout of a distributed training program. With the DTensor abstraction, we can seamlessly build parallelism strategies such as tensor parallelism, DDP and FSDP.
 
-## Value Propsition
+## Value Proposition
 
-DistributedTensor primarily:
+PyTorch DTensor primarily:
 -   Offers a uniform way to save/load `state_dict` during checkpointing, even when there’re complex tensor storage distribution strategies such as combining tensor parallelism with parameter sharding in FSDP.
 -   Enables Tensor Parallelism in eager mode. Compared to ShardedTensor, DistributedTensor allows additional flexibility to mix sharding and replication.
 -   Serves as the entry point of an SPMD programming model and the foundational building block for compiler-based distributed training.
 
-## PyTorch DistributedTensor
+## PyTorch DTensor
 
-### DistributedTensor API
+### DTensor API
 
 We offer both a lower level DistributedTensor API and a module level API to create a `nn.Module` with “distributed” parameters.
 
-#### Basic DistributedTensor API Examples
+#### Basic DTensor API Examples
 
-Here are some basic DistributedTensor API examples that showcase:
-1. How to construct a DistributedTensor directly, to represent different types of sharding, replication, sharding + replication strategies.
-2. How to create DistributedTensor from a local `torch.Tensor`.
-3. How to “reshard” an existing DistributedTensor to a different DistributedTensor with modified placement strategy or world size.
+Here are some basic DTensor API examples that showcase:
+1. How to construct a DTensor directly, to represent different types of sharding, replication, sharding + replication strategies.
+2. How to create DTensor from a local `torch.Tensor`.
+3. How to “reshard” an existing DTensor to a different DTensor with modified placement strategy or world size.
 
 ```python
 import torch
-import torch.distributed as distributed
-from torch.distributed._tensor import DTensor, DeviceMesh, Shard, Replicate, distribute_module
+from torch.distributed._tensor import DTensor, DeviceMesh, Shard, Replicate, distribute_tensor, distribute_module
 
 # construct a device mesh with available devices (multi-host or single host)
-device_mesh = DeviceMesh(device_type="cuda", [0, 1, 2, 3])
+device_mesh = DeviceMesh("cuda", [0, 1, 2, 3])
 # if we want to do row-wise sharding
 rowwise_placement=[Shard(0)]
 # if we want to do col-wise sharding
 colwise_placement=[Shard(1)]
+
+big_tensor = torch.randn(888, 12)
 # distributed tensor returned will be sharded across the dimension specified in placements
-distributed.empty((8, 12), device_mesh=device_mesh, placements=rowwise_placement)
+rowwise_tensor = distribute_tensor(big_tensor, device_mesh=device_mesh, placements=rowwise_placement)
 
 # if we want to do replication across a certain device list
 replica_placement = [Replicate()]
 # distributed tensor will be replicated to all four GPUs.
-distributed.empty((8, 12), device_mesh=device_mesh, placements=replica_placement)
+replica_tensor = distribute_tensor(big_tensor, device_mesh=device_mesh, placements=replica_placement)
 
 # if we want to distributed a tensor with both replication and sharding
-device_mesh = DeviceMesh(device_type="cuda", [[0, 1], [2, 3]])
+device_mesh = DeviceMesh("cuda", [[0, 1], [2, 3]])
 # replicate across the first dimension of device mesh, then sharding on the second dimension of device mesh
 spec=[Replicate(), Shard(0)]
-distributed.empty((8, 8), device_mesh=device_mesh, placements=spec)
+partial_replica = distribute_tensor(big_tensor, device_mesh=device_mesh, placements=spec)
 
 # create a DistributedTensor that shards on dim 0, from a local torch.Tensor
 local_tensor = torch.randn((8, 8), requires_grad=True)
 rowwise_tensor = DTensor.from_local(local_tensor, device_mesh, rowwise_placement)
 
-# reshard the current rowise tensor to a colwise tensor or replicate tensor
+# reshard the current row-wise tensor to a colwise tensor or replicate tensor
 colwise_tensor = rowwise_tensor.redistribute(device_mesh, colwise_placement)
 replica_tensor = colwise_tensor.redistribute(device_mesh, replica_placement)
 
@@ -81,7 +85,7 @@ replica_tensor = colwise_tensor.redistribute(device_mesh, replica_placement)
 
 #### High level User Facing APIs
 
-Users can use DistributedTensor tensor constructors directly to create a distributed tensor (i.e. `distributed.ones/empty`), but for existing modules like `nn.Linear` that are already having `torch.Tensor` as parameters, how to make them distributed parameters? We offer a way to directly distribute a `torch.Tensor` and a module level APIs to directly distribute the module parameters. Below is the high level API we introduce:
+Users can use DTensor tensor constructors directly to create a distributed tensor (i.e. `distributed.ones/empty`), but for existing modules like `nn.Linear` that are already having `torch.Tensor` as parameters, how to make them distributed parameters? We offer a way to directly distribute a `torch.Tensor` and a module level APIs to directly distribute the module parameters. Below is the high level API we introduce:
 
 ```python
 def distribute_tensor(tensor: torch.Tensor, device_mesh: DeviceMesh=None, placements: List[Placement]=None):
@@ -132,11 +136,11 @@ def shard_fc(mod_name, mod, mesh):
 sharded_module = distribute_module(model, device_mesh, partition_fn=shard_fc)
 ```
 
-## Compiler and DistributedTensor
+## Compiler and PyTorch DTensor
 
-DistributedTensor provides efficient solutions for cases like Tensor Parallelism. But when using the DTensor's replication in a data parallel fashion, it might become observably slower compared to our existing solutions like DDP/FSDP. This is mainly because mainly because DDP/FSDP have a global view of the entire model architecture, thus could optimize for data parallel specifically, i.e. collective fusion and computation overlap, etc. In contract, DistributedTensor as a Tensor-like object can only optimize within individual tensor operations.
+DTensor provides efficient solutions for cases like Tensor Parallelism. But when using the DTensor's replication in a data parallel fashion, it might become observably slower compared to our existing solutions like DDP/FSDP. This is mainly because mainly because DDP/FSDP have a global view of the entire model architecture, thus could optimize for data parallel specifically, i.e. collective fusion and computation overlap, etc. In contract, DistributedTensor as a Tensor-like object can only optimize within individual tensor operations.
 
-To improve efficiency of DistributedTensor-based data parallel training, we are exploring a compiler-based solution on top of DistributedTensor, which can extract graph information from user programs to expose more performance optimization opportunities.
+To improve efficiency of DTensor-based data parallel training, we are exploring a compiler-based solution on top of DTensor, which can extract graph information from user programs to expose more performance optimization opportunities.
 
 ## Related Works
 
@@ -164,4 +168,4 @@ There are also several cutting edge research fields that embeds tensor sharding
 
 RFC: https://github.com/pytorch/pytorch/issues/88838
 
-We are gathering early feedbacks about this proposal. We have also posted this [RFC](https://dev-discuss.pytorch.org/t/rfc-pytorch-distributedtensor/740) to the dev-discuss forum, please feel free to comment directly in the above issue or in the forum post. To see a complete design doc with additional details about DTesnor, please refer to this [doc](https://docs.google.com/document/d/1nFeJ8NSFNhNlCkNgWK31ZGRqm1L9rd0i_XN_RprphaI/edit#heading=h.6sovjqv9jiqn)
+We are gathering early feedbacks about this proposal. We have also posted this [RFC](https://dev-discuss.pytorch.org/t/rfc-pytorch-distributedtensor/740) to the dev-discuss forum, please feel free to comment directly in the above issue or in the forum post. To see a complete design doc with additional details about DTensor, please refer to this [doc](https://docs.google.com/document/d/1nFeJ8NSFNhNlCkNgWK31ZGRqm1L9rd0i_XN_RprphaI/edit#heading=h.6sovjqv9jiqn)
diff --git a/torch/distributed/_tensor/__init__.py b/torch/distributed/_tensor/__init__.py
index 476357364a02..667723d525dd 100644
--- a/torch/distributed/_tensor/__init__.py
+++ b/torch/distributed/_tensor/__init__.py
@@ -1,176 +1,13 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 from typing import Callable, cast, Optional, Sequence
 
-import torch
-
 # Import all builtin dist tensor ops
 import torch.distributed._tensor.ops
-import torch.nn as nn
-from torch.distributed._tensor.api import DTensor
+from torch.distributed._tensor.api import DTensor, distribute_tensor, distribute_module
 from torch.distributed._tensor.device_mesh import DeviceMesh, get_global_device_mesh
 from torch.distributed._tensor.placement_types import Placement, Replicate, Shard
 
 
-def distribute_tensor(
-    tensor: torch.Tensor,
-    device_mesh: Optional[DeviceMesh] = None,
-    placements: Optional[Sequence[Placement]] = None,
-) -> DTensor:
-    """
-    Distribute a torch.Tensor to the `device_mesh` according to the `placements`
-    specified. The rank of `device_mesh` and `placements` must be the same.
-
-    Args:
-        tensor (torch.Tensor): torch.Tensor to be distributed. Note that if you
-            want to shard a tensor on a dimension that is not evenly divisible by
-            the number of devices in that mesh dimension, we use `torch.tensor_split`
-            semantic to shard the tensor and scatter the shards.
-        device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to distribute the
-            tensor, if not specified, must be called under a DeviceMesh context
-            manager, default: None
-        placements (List[:class:`Placement`], optional): the placements that
-            describes how to place the tensor on DeviceMesh, must have the same
-            number of elements as `device_mesh.ndim`. If not specified, we will
-            by default replicate the tensor across the `device_mesh` from the
-            first rank of each dimension of the `device_mesh`.
-
-    Returns:
-        A :class:`DTensor` object
-    """
-    # get default device mesh if there's nothing specified
-    device_mesh = get_global_device_mesh() if device_mesh is None else device_mesh
-    # convert tensor to the correponding device type if it's not in that device type
-    tensor = tensor.to(device_mesh.device_type)
-    # set default placements to replicated if not specified
-    if placements is None:
-        placements = [Replicate() for _ in range(device_mesh.ndim)]
-
-    if len(placements) != device_mesh.ndim:
-        raise ValueError(
-            f"`placements` must have the same length as `device_mesh.ndim`! "
-            f"Found placements length: {len(placements)}, and device_mesh.ndim: {device_mesh.ndim}."
-        )
-
-    if isinstance(tensor, DTensor):
-        # if the tensor is already a DTensor, we just need to check if the
-        # device mesh and placements are the same
-        if tensor.device_mesh != device_mesh:
-            raise ValueError(
-                f"Cannot distribute a DTensor with device mesh {tensor.device_mesh} "
-                f"to a different device mesh {device_mesh}."
-            )
-        if tensor.placements != placements:
-            raise ValueError(
-                f"Cannot distribute a DTensor with placements {tensor.placements} "
-                f"to a different placements {placements}. do you want to call "
-                f"`redistribute` instead?"
-            )
-        return tensor
-
-    local_tensor = tensor
-
-    # distribute the tensor according to the placements.
-    for idx, placement in enumerate(placements):
-        if placement.is_shard():
-            placement = cast(Shard, placement)
-            output = placement._shard_tensor(local_tensor, device_mesh, idx)
-            # scatter call could not return a tensor with correct requires_grad
-            # field, as ProcessGroupNCCL refuse to take a tensor with requires_grad
-            # to do inplace update! So we manually set it here
-            output.requires_grad_(tensor.requires_grad)
-            local_tensor = output
-        elif placement.is_replicate():
-            local_tensor = local_tensor.contiguous()
-            device_mesh.broadcast(local_tensor, mesh_dim=idx)
-        else:
-            raise RuntimeError(
-                f"Trying to distribute tensor with unsupported placements {placement} on device mesh dimension {idx}!"
-            )
-
-    assert local_tensor is not None, "distributing a tensor should not be None"
-    return DTensor(
-        local_tensor,
-        device_mesh,
-        placements,
-        size=tensor.size(),
-        requires_grad=tensor.requires_grad,
-    )
-
-
-def distribute_module(
-    module: nn.Module,
-    device_mesh: Optional[DeviceMesh] = None,
-    partition_fn: Optional[Callable[[str, nn.Module, DeviceMesh], None]] = None,
-    input_fn: Optional[Callable[..., None]] = None,
-    output_fn: Optional[Callable[..., None]] = None,
-) -> nn.Module:
-    """
-    This function converts all module parameters to :class:`DTensor` parameters
-    according to the `partition_fn` specified. It could also control the input or
-    output of the module by specifying the `input_fn` and `output_fn`. (i.e. convert
-    the input to :class:`DTensor`, convert the output back to torch.Tensor)
-    Args:
-        module (:class:`nn.Module`): user module to be partitioned.
-        device_mesh (:class:`DeviceMesh`): the device mesh to place the module.
-        partition_fn (Callable): the function to partition parameters (i.e. shard certain
-            parameters across the `device_mesh`). If `partition_fn` is not specified,
-            by default we replicate all module parameters of `module` across the mesh.
-        input_fn (Callable): specify the input distribution, i.e. could control how the
-            input of the module is sharded. `input_fn` will be installed as a module
-            `forward_pre_hook` (pre forward hook).
-        output_fn (Callable): specify the output distribution, i.e. could control how the
-            output is sharded, or convert it back to torch.Tensor. output_fn will be
-            installed as a module `forward_hook` (post forward hook).
-
-    Returns:
-        A module that contains parameters/buffers that are all `DTensor`s.
-    """
-
-    if device_mesh is None:
-        device_mesh = get_global_device_mesh()
-
-    def replicate_module_params_buffers(m: nn.Module, mesh: DeviceMesh) -> None:
-        # This function loop over the immediate module parameters and
-        # buffers, replicate all non DTensor params/buffers to DTensor
-        # parameters/buffers, if they have not been partitioned in the
-        # partition_fn, we can't easily use `module._apply` here
-        # because we don't know what happened inside partition_fn as
-        # user could do anything, i.e. install hooks, and we want to
-        # preserve those.
-        full_replicate = [Replicate()] * mesh.ndim
-        for key, param in m._parameters.items():
-            if param is not None and not isinstance(param, DTensor):
-                m.register_parameter(
-                    key,
-                    nn.Parameter(distribute_tensor(param.data, mesh, full_replicate)),
-                )
-        for key, buffer in m._buffers.items():
-            if buffer is not None and not isinstance(buffer, DTensor):
-                m._buffers[key] = distribute_tensor(buffer, mesh, full_replicate)
-
-    if partition_fn is None:
-        # if partition_fn not specified, we by default replicate
-        # all module params/buffers
-        for name, submod in module.named_modules():
-            replicate_module_params_buffers(submod, device_mesh)
-    else:
-        # apply partition_fun to submodules
-        for name, submod in module.named_modules():
-            partition_fn(name, submod, device_mesh)
-            replicate_module_params_buffers(submod, device_mesh)
-
-    # register input_fn as module forward pre hook
-    if input_fn is not None:
-        module.register_forward_pre_hook(lambda _, inputs: input_fn(inputs, device_mesh))  # type: ignore[misc]
-    # register input_fn as module forward hook
-    if output_fn is not None:
-        module.register_forward_hook(
-            lambda mod, inputs, outputs: output_fn(outputs, device_mesh)  # type: ignore[misc]
-        )
-
-    return module
-
-
 # All public APIs from dtensor package
 __all__ = [
     "DTensor",
diff --git a/torch/distributed/_tensor/api.py b/torch/distributed/_tensor/api.py
index dd94113ffda1..0baa1b49038b 100644
--- a/torch/distributed/_tensor/api.py
+++ b/torch/distributed/_tensor/api.py
@@ -1,11 +1,13 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 import copy
 import warnings
-from typing import Callable, cast, Dict, Optional, Sequence
+from typing import Callable, cast, Dict, Optional, Sequence, Tuple
 
 import torch
+import torch.nn as nn
 
 import torch.distributed._tensor.dispatch as op_dispatch
+from torch.fx.passes.shape_prop import TensorMetadata
 from torch.distributed._tensor.device_mesh import DeviceMesh, get_global_device_mesh
 from torch.distributed._tensor.placement_types import (
     _Partial,
@@ -14,9 +16,13 @@
     Replicate,
     Shard,
 )
+from torch.distributed._tensor.sharding_prop import ShardingPropagator
 from torch.distributed._tensor.redistribute import Redistribute
 from torch.utils._pytree import tree_flatten
 
+
+__all__ = ["DTensor", "distribute_tensor", "distribute_module"]
+
 # NOTE [Autograd interaction between torch.Tensor]
 #
 # The autograd functions defined below are being used by the public
@@ -41,29 +47,28 @@
 # `from_local`, and conversion from DTensor output to output, which
 # is `to_local`, thus these two functions must be Autograd functions.
 #
-class ToTorchTensor(torch.autograd.Function):
+class _ToTorchTensor(torch.autograd.Function):
     @staticmethod
     def forward(ctx, input: "DTensor"):  # type: ignore[override]
-        ctx.dtensor_device_mesh = input.device_mesh
-        ctx.dtensor_placements = input.placements
-        ctx.dtensor_shape = input.shape
-        ctx.dtensor_requires_grad = input.requires_grad
+        ctx.dtensor_spec = input._spec
         return input._local_tensor.detach()
 
     @staticmethod
     def backward(ctx, grad_output: torch.Tensor):  # type: ignore[override]
-        device_mesh = ctx.dtensor_device_mesh
-        placements = ctx.dtensor_placements
+        dtensor_spec = ctx.dtensor_spec
+        dtensor_meta = dtensor_spec.tensor_meta
         return DTensor(
             grad_output,
-            device_mesh,
-            placements,
-            size=ctx.dtensor_shape,
+            dtensor_spec.mesh,
+            dtensor_spec.placements,
+            shape=dtensor_meta.shape,
+            dtype=dtensor_meta.dtype,
             requires_grad=grad_output.requires_grad,
+            stride=dtensor_meta.stride
         )
 
 
-class FromTorchTensor(torch.autograd.Function):
+class _FromTorchTensor(torch.autograd.Function):
     @staticmethod
     def forward(  # type: ignore[override]
         ctx,  # pyre-ignore[2]: Parameter must be annotated.
@@ -75,35 +80,51 @@ def forward(  # type: ignore[override]
         ctx.previous_placement = placements
         ctx.previous_device_mesh = device_mesh
 
-        if run_check:
-            # TODO: by default check tensor metas across rank
-            # TODO: See if we need to make this run_check logic
-            # have a corresponding backward.
-            for idx, placement in enumerate(placements):
-                if placement.is_replicate():
-                    # broadcast rank 0 tensor to all ranks
-                    # only broadcast if run_check is True
-                    input = input.contiguous()
-                    device_mesh.broadcast(input, mesh_dim=idx)
-
         # if it's not by default run_check, we assume user is certain that each
         # rank has the same tensor shape, and we just use that to calculate the
         # global shape
         tensor_shape = list(input.size())
+        tensor_stride = list(input.stride())
         for idx, placement in enumerate(placements):
             if placement.is_shard():
                 shard_dim = cast(Shard, placement).dim
                 local_dim_size = tensor_shape[shard_dim]
                 tensor_shape[shard_dim] = local_dim_size * device_mesh.size(idx)
 
+                # recover tensor stride by modifying the stride that larger than
+                # the current stride on the shard_dim
+                for i in range(len(tensor_stride)):
+                    if i != shard_dim and tensor_stride[i] >= tensor_stride[shard_dim]:
+                        # rescale the stride by the shard size
+                        tensor_stride[i] = tensor_stride[i] * device_mesh.size(idx)
+            elif not isinstance(placement, (Replicate, _Partial)):
+                raise RuntimeError(f"placement type {type(placement)} not supported!")
+
+        if device_mesh.get_coordinate() is None:
+            # if the global rank is not participating in the device mesh, we
+            # simply set the local tensor to an empty tensor
+            input = input.new_empty(0, requires_grad=input.requires_grad)
+        elif run_check:
+            # TODO: by default check tensor metas across rank
+            # TODO: See if we need to make this run_check logic
+            # have a corresponding backward.
+            for idx, placement in enumerate(placements):
+                if placement.is_replicate():
+                    # broadcast rank 0 tensor to all ranks
+                    # only broadcast if run_check is True
+                    input = input.contiguous()
+                    device_mesh.broadcast(input, mesh_dim=idx)
+
         dist_tensor = DTensor(
             input,
             device_mesh,
             placements,
-            size=torch.Size(tensor_shape),
+            shape=torch.Size(tensor_shape),
+            dtype=input.dtype,
             # requires_grad of the dist tensor depends on if input
             # requires_grad or not
             requires_grad=input.requires_grad,
+            stride=tuple(tensor_stride),
         )
         return dist_tensor
 
@@ -133,9 +154,7 @@ class DTensor(torch.Tensor):  # pyre-ignore[13]: pyre is bad at __new__
 
     # class attribute that handles operator placements propagation
     # rules, keyed by aten op name, value is propagation func
-    _op_to_rules: Dict[
-        str, Callable[["op_dispatch.OpSchema"], "op_dispatch.OutputSharding"]
-    ] = {}
+    _propagator: ShardingPropagator = ShardingPropagator()
 
     # class attribute that handles custom registered ops, all handled
     # custom ops should appear in this table, and overriding the default
@@ -151,8 +170,10 @@ def __new__(
         device_mesh: DeviceMesh,
         placements: Sequence[Placement],
         *,
-        size: torch.Size,
-        requires_grad: bool = False,
+        shape: torch.Size,
+        dtype: torch.dtype,
+        requires_grad: bool,
+        stride: Tuple[int, ...],
     ) -> "DTensor":
         """
         Construct a DTensor from a local tensor, device mesh, and placement and
@@ -164,25 +185,6 @@ def __new__(
             already have tensor initialized and want to shard this tensor),
             consider using `distribute_tensor`.
         """
-        # recover tensor strides from local tensor strides and global size info
-        # in the case of sharding
-        # TODO: we should try to use meta tensor for shape and stride calculation
-        tensor_stride = list(local_tensor.stride())
-        local_size = list(local_tensor.size())
-        for placement in placements:
-            if isinstance(placement, Shard):
-                shard_dim = placement.dim
-                # recover tensor stride by modifying the stride that larger than
-                # the current stride on the shard_dim
-                for i in range(len(tensor_stride)):
-                    if i != shard_dim and tensor_stride[i] >= tensor_stride[shard_dim]:
-                        # rescale the stride by the shard size
-                        tensor_stride[i] = (
-                            tensor_stride[i] // local_size[shard_dim]
-                        ) * size[shard_dim]
-            elif not isinstance(placement, (Replicate, _Partial)):
-                raise RuntimeError(f"placement type {type(placement)} not supported!")
-
         if requires_grad != local_tensor.requires_grad:
             warnings.warn(
                 "To construct DTensor from torch.Tensor, it's recommended to "
@@ -193,15 +195,26 @@ def __new__(
         # placement spec, it does not do actual distribution
         r = torch.Tensor._make_wrapper_subclass(  # type: ignore[attr-defined]
             cls,
-            size,
-            strides=tensor_stride,
-            dtype=local_tensor.dtype,
+            shape,
+            strides=stride,
+            dtype=dtype,
             device=local_tensor.device,
             layout=local_tensor.layout,
             requires_grad=requires_grad,
         )
+
+        # TODO: populate all tensor meta fields properly
+        tensor_meta = TensorMetadata(
+            shape,
+            dtype,
+            requires_grad,
+            stride,
+            torch.contiguous_format,
+            False,
+            {}
+        )
         # deepcopy and set spec
-        r._spec = DTensorSpec(device_mesh, copy.deepcopy(placements), shape=r.size())
+        r._spec = DTensorSpec(device_mesh, copy.deepcopy(placements), tensor_meta=tensor_meta)
         # detach local tensor from autograd graph as we initialize the
         # distributed tensor and autograd will be working on top of
         # the wrapper tensor directly instead of local torch.Tensor
@@ -233,7 +246,7 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
             func,
             args,
             kwargs,
-            DTensor._op_to_rules,
+            DTensor._propagator,
             DTensor._custom_dispatch_ops,
         )
 
@@ -276,8 +289,10 @@ def from_local(
         # strategy, where we broadcast the replication from the first rank
         # in the mesh dimension
         device_mesh = get_global_device_mesh() if device_mesh is None else device_mesh
+
         # convert the local tensor to desired device base on device mesh's device_type
-        local_tensor = local_tensor.to(device_mesh.device_type)
+        if not local_tensor.is_meta:
+            local_tensor = local_tensor.to(device_mesh.device_type)
 
         # set default placements to replicated if not specified
         if placements is None:
@@ -286,7 +301,7 @@ def from_local(
         # `from_local` is differentiable, and the gradient of the dist tensor this function
         # created should flow back the gradients to the local_tensor, so we call an autograd
         # function to construct the dist tensor instead.
-        return FromTorchTensor.apply(  # pyre-ignore[16]: autograd func
+        return _FromTorchTensor.apply(  # pyre-ignore[16]: autograd func
             local_tensor, device_mesh, placements, run_check
         )
 
@@ -302,7 +317,7 @@ def to_local(self) -> torch.Tensor:
         .. note:: `to_local` is differentiable, the `requires_grad` of the local tensor returned
             will depend on if the `DTensor` requires_grad or not.
         """
-        return ToTorchTensor.apply(self)  # pyre-ignore[16]: autograd func
+        return _ToTorchTensor.apply(self)  # pyre-ignore[16]: autograd func
 
     def redistribute(
         self,
@@ -366,3 +381,166 @@ def placements(self) -> Sequence[Placement]:
         .. note:: placements is a read-only property, it can not be set.
         """
         return self._spec.placements
+
+
+def distribute_tensor(
+    tensor: torch.Tensor,
+    device_mesh: Optional[DeviceMesh] = None,
+    placements: Optional[Sequence[Placement]] = None,
+) -> DTensor:
+    """
+    Distribute a torch.Tensor to the `device_mesh` according to the `placements`
+    specified. The rank of `device_mesh` and `placements` must be the same.
+
+    Args:
+        tensor (torch.Tensor): torch.Tensor to be distributed. Note that if you
+            want to shard a tensor on a dimension that is not evenly divisible by
+            the number of devices in that mesh dimension, we use `torch.tensor_split`
+            semantic to shard the tensor and scatter the shards.
+        device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to distribute the
+            tensor, if not specified, must be called under a DeviceMesh context
+            manager, default: None
+        placements (List[:class:`Placement`], optional): the placements that
+            describes how to place the tensor on DeviceMesh, must have the same
+            number of elements as `device_mesh.ndim`. If not specified, we will
+            by default replicate the tensor across the `device_mesh` from the
+            first rank of each dimension of the `device_mesh`.
+
+    Returns:
+        A :class:`DTensor` object
+    """
+    # get default device mesh if there's nothing specified
+    device_mesh = get_global_device_mesh() if device_mesh is None else device_mesh
+    # convert tensor to the correponding device type if it's not in that device type
+    if not tensor.is_meta:
+        tensor = tensor.to(device_mesh.device_type)
+    # set default placements to replicated if not specified
+    if placements is None:
+        placements = [Replicate() for _ in range(device_mesh.ndim)]
+
+    if len(placements) != device_mesh.ndim:
+        raise ValueError(
+            f"`placements` must have the same length as `device_mesh.ndim`! "
+            f"Found placements length: {len(placements)}, and device_mesh.ndim: {device_mesh.ndim}."
+        )
+
+    if isinstance(tensor, DTensor):
+        # if the tensor is already a DTensor, we just need to check if the
+        # device mesh and placements are the same
+        if tensor.device_mesh != device_mesh:
+            raise ValueError(
+                f"Cannot distribute a DTensor with device mesh {tensor.device_mesh} "
+                f"to a different device mesh {device_mesh}."
+            )
+        if tensor.placements != placements:
+            raise ValueError(
+                f"Cannot distribute a DTensor with placements {tensor.placements} "
+                f"to a different placements {placements}. do you want to call "
+                f"`redistribute` instead?"
+            )
+        return tensor
+
+    local_tensor = tensor
+
+    # distribute the tensor according to the placements.
+    for idx, placement in enumerate(placements):
+        if placement.is_shard():
+            placement = cast(Shard, placement)
+            output = placement._shard_tensor(local_tensor, device_mesh, idx)
+            # scatter call could not return a tensor with correct requires_grad
+            # field, as ProcessGroupNCCL refuse to take a tensor with requires_grad
+            # to do inplace update! So we manually set it here
+            output.requires_grad_(tensor.requires_grad)
+            local_tensor = output
+        elif placement.is_replicate():
+            placement = cast(Replicate, placement)
+            local_tensor = placement._replicate_tensor(local_tensor, device_mesh, idx)
+        else:
+            raise RuntimeError(
+                f"Trying to distribute tensor with unsupported placements {placement} on device mesh dimension {idx}!"
+            )
+
+    assert local_tensor is not None, "distributing a tensor should not be None"
+    return DTensor(
+        local_tensor,
+        device_mesh,
+        placements,
+        shape=tensor.size(),
+        dtype=tensor.dtype,
+        requires_grad=tensor.requires_grad,
+        stride=tensor.stride(),
+    )
+
+
+def distribute_module(
+    module: nn.Module,
+    device_mesh: Optional[DeviceMesh] = None,
+    partition_fn: Optional[Callable[[str, nn.Module, DeviceMesh], None]] = None,
+    input_fn: Optional[Callable[..., None]] = None,
+    output_fn: Optional[Callable[..., None]] = None,
+) -> nn.Module:
+    """
+    This function converts all module parameters to :class:`DTensor` parameters
+    according to the `partition_fn` specified. It could also control the input or
+    output of the module by specifying the `input_fn` and `output_fn`. (i.e. convert
+    the input to :class:`DTensor`, convert the output back to torch.Tensor)
+    Args:
+        module (:class:`nn.Module`): user module to be partitioned.
+        device_mesh (:class:`DeviceMesh`): the device mesh to place the module.
+        partition_fn (Callable): the function to partition parameters (i.e. shard certain
+            parameters across the `device_mesh`). If `partition_fn` is not specified,
+            by default we replicate all module parameters of `module` across the mesh.
+        input_fn (Callable): specify the input distribution, i.e. could control how the
+            input of the module is sharded. `input_fn` will be installed as a module
+            `forward_pre_hook` (pre forward hook).
+        output_fn (Callable): specify the output distribution, i.e. could control how the
+            output is sharded, or convert it back to torch.Tensor. output_fn will be
+            installed as a module `forward_hook` (post forward hook).
+
+    Returns:
+        A module that contains parameters/buffers that are all `DTensor`s.
+    """
+
+    if device_mesh is None:
+        device_mesh = get_global_device_mesh()
+
+    def replicate_module_params_buffers(m: nn.Module, mesh: DeviceMesh) -> None:
+        # This function loop over the immediate module parameters and
+        # buffers, replicate all non DTensor params/buffers to DTensor
+        # parameters/buffers, if they have not been partitioned in the
+        # partition_fn, we can't easily use `module._apply` here
+        # because we don't know what happened inside partition_fn as
+        # user could do anything, i.e. install hooks, and we want to
+        # preserve those.
+        full_replicate = [Replicate()] * mesh.ndim
+        for key, param in m._parameters.items():
+            if param is not None and not isinstance(param, DTensor):
+                m.register_parameter(
+                    key,
+                    nn.Parameter(distribute_tensor(param.data, mesh, full_replicate)),
+                )
+        for key, buffer in m._buffers.items():
+            if buffer is not None and not isinstance(buffer, DTensor):
+                m._buffers[key] = distribute_tensor(buffer, mesh, full_replicate)
+
+    if partition_fn is None:
+        # if partition_fn not specified, we by default replicate
+        # all module params/buffers
+        for name, submod in module.named_modules():
+            replicate_module_params_buffers(submod, device_mesh)
+    else:
+        # apply partition_fun to submodules
+        for name, submod in module.named_modules():
+            partition_fn(name, submod, device_mesh)
+            replicate_module_params_buffers(submod, device_mesh)
+
+    # register input_fn as module forward pre hook
+    if input_fn is not None:
+        module.register_forward_pre_hook(lambda _, inputs: input_fn(inputs, device_mesh))  # type: ignore[misc]
+    # register input_fn as module forward hook
+    if output_fn is not None:
+        module.register_forward_hook(
+            lambda mod, inputs, outputs: output_fn(outputs, device_mesh)  # type: ignore[misc]
+        )
+
+    return module
diff --git a/torch/distributed/_tensor/device_mesh.py b/torch/distributed/_tensor/device_mesh.py
index 709c5e140ed3..c4817ebc41d9 100644
--- a/torch/distributed/_tensor/device_mesh.py
+++ b/torch/distributed/_tensor/device_mesh.py
@@ -51,7 +51,7 @@ def set_global_device_mesh(mesh: Optional["DeviceMesh"]) -> None:
 ]
 
 
-class DeviceMesh(object):
+class DeviceMesh:
     """
     DeviceMesh represents a mesh of devices, where layout of devices could be
     represented as a n-d dimension array, and each value of the n-d dimensional
@@ -291,12 +291,12 @@ def backend(self) -> str:
     def get_rank(self) -> int:
         return get_rank()
 
-    def get_coordinate_on_dim(self, dim: int) -> Optional[int]:
+    def get_coordinate(self) -> Optional[List[int]]:
         """
         Return the relative index of this rank relative to a given
         dimension of the mesh. If this rank is not part of the mesh, return None.
         """
-        return self._coordinate_on_dim[dim] if self._coordinate_on_dim else None
+        return self._coordinate_on_dim if self._coordinate_on_dim else None
 
     def scatter(
         self,
@@ -322,6 +322,12 @@ def scatter(
         Returns:
             A :class:`Work` object
         """
+        # TODO: Ideally we should use the meta tensor way
+        # (to register a meta kernel for the collective op)
+        # so that it would avoid the communication. Need to
+        # remove the check below once that is done.
+        if output.is_meta:
+            return None
         dim_group = self._dim_groups[mesh_dim]
         # src need to be global rank
         src_for_dim = 0
@@ -369,6 +375,12 @@ def broadcast(
         Returns:
             A :class:`Work` object
         """
+        # TODO: Ideally we should use the meta tensor way
+        # (to register a meta kernel for the collective op)
+        # so that it would avoid the communication. Need to
+        # remove the check below once that is done.
+        if tensor.is_meta:
+            return None
         dim_group = self._dim_groups[mesh_dim]
         # src need to be global rank
         src_for_dim = 0
@@ -461,7 +473,7 @@ def reduce_scatter(
             warnings.warn(
                 "ProcessGroupGloo does not support reduce_scatter, falling back with all reduce!"
             )
-            my_coordinate = self.get_coordinate_on_dim(mesh_dim)
+            my_coordinate = self.get_coordinate()
             # TODO: what should happen if rank is not in the mesh?
             # see issue https://github.com/pytorch/tau/pull/492
             assert (
@@ -485,7 +497,7 @@ def reduce_scatter(
                 flat_tensor, op=op, mesh_dim=mesh_dim, async_op=async_op
             )
             # scatter the tensor
-            output_offset = offset_list[my_coordinate]
+            output_offset = offset_list[my_coordinate[mesh_dim]]
             output.copy_(
                 flat_tensor[output_offset : output_offset + output.numel()].view(
                     output.shape
diff --git a/torch/distributed/_tensor/dispatch.py b/torch/distributed/_tensor/dispatch.py
index dde78d8158cf..84b2eef33ff3 100644
--- a/torch/distributed/_tensor/dispatch.py
+++ b/torch/distributed/_tensor/dispatch.py
@@ -1,5 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
-from typing import Callable, cast, Dict, Optional, Tuple, Union
+from typing import Callable, cast, Dict, Tuple, Union, Optional
 
 import torch
 
@@ -7,14 +7,12 @@
 from torch.distributed._tensor.op_schema import (
     ArgsType,
     KwargsType,
-    OpSchema,
-    OutputSharding,
     OutputSpecType,
 )
 from torch.distributed._tensor.placement_types import DTensorSpec
+from torch.distributed._tensor.sharding_prop import ShardingPropagator
 from torch.distributed._tensor.redistribute import redistribute_dtensor
-from torch.distributed._tensor.utils import unwrap_local_tensor
-from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten
+from torch.utils._pytree import tree_flatten, tree_unflatten
 
 
 """
@@ -24,49 +22,46 @@
 _ENABLE_FALLBACK = False
 
 
-"""
-Print information on ops input shape and sharding for debugging purposes.
-"""
-_DEBUG_VERBOSE = False
-
-def unwrap_schema(e: object) -> object:
-    return e._spec if isinstance(e, dtensor.DTensor) else e
-
-
 def wrap(res: object, spec: OutputSpecType) -> object:
     if isinstance(res, torch.Tensor):
         assert spec is not None and isinstance(
             spec, DTensorSpec
         ), f"output spec does not match with output! Expected DTensorSpec, got {spec}."
+        assert spec.tensor_meta is not None
         return dtensor.DTensor(
             res,
             spec.mesh,
             spec.placements,
-            size=spec.shape,
+            shape=spec.tensor_meta.shape,
+            dtype=spec.tensor_meta.dtype,
             requires_grad=res.requires_grad,
+            stride=spec.tensor_meta.stride,
         )
-    elif isinstance(res, list):
-        assert spec is not None and isinstance(
-            spec, list
-        ), f"output spec does not match with output! Expected list, got {spec}."
-        return list(
-            dtensor.DTensor(e, s.mesh, s.placements, size=s.shape)
-            for e, s in zip(res, spec)
-        )
-    elif isinstance(res, tuple):
+    elif isinstance(res, (list, tuple)):
         assert spec is not None and isinstance(
-            spec, tuple
-        ), f"output spec does not match with output! Expected tuple, got {spec}"
-
-        # NOTE: local results might return Optional Tensor from ATen op, so we need to
-        # handle that case and make sure we don't wrap None with DTensor.
-        # (i.e. native_layer_norm.backward)
-        return tuple(
-            dtensor.DTensor(e, s.mesh, s.placements, size=s.shape)
-            if e is not None and s is not None
-            else None
-            for e, s in zip(res, spec)
-        )
+            spec, (list, tuple)
+        ), f"output spec does not match with output! Expected list/tuple, got {spec}."
+        res_list = []
+        for e, s in zip(res, spec):
+            # NOTE: local results might return Optional Tensor from ATen op, so we need
+            # to handle that case and make sure we don't wrap None with DTensor.
+            # (i.e. native_layer_norm.backward)
+            if e is not None and s is not None:
+                assert s.tensor_meta is not None
+                res_dt = dtensor.DTensor(
+                    e,
+                    s.mesh,
+                    s.placements,
+                    shape=s.tensor_meta.shape,
+                    dtype=s.tensor_meta.dtype,
+                    requires_grad=s.tensor_meta.requires_grad,
+                    stride=s.tensor_meta.stride
+                )
+            else:
+                res_dt = None
+
+            res_list.append(res_dt)
+        return tuple(res_list) if isinstance(res, tuple) else res_list
     else:
         # if the res contains only non tensor values, we simply return it without rewrapping
         return res
@@ -105,133 +100,87 @@ def _reshape_alias(
 }
 
 
-def propagate_input_sharding(
+def operator_dispatch(
     op_call: torch._ops.OpOverload,
     args: Tuple[object, ...],
     kwargs: Dict[str, object],
-    op_to_rules: Dict[str, Callable[[OpSchema], OutputSharding]],
-) -> Tuple[OpSchema, bool, Optional[OutputSharding]]:
-    # unwrap the args/kwargs schema
-    args_schema = tree_map(unwrap_schema, args)
-    kwargs_schema = tree_map(unwrap_schema, kwargs)
-
-    op_schema = OpSchema(op_call._schema, args_schema, kwargs_schema)
-
-    if _DEBUG_VERBOSE and torch.distributed.get_rank() == 0:
-        print(f"{op_call}({op_schema})")
-        local_shapes = tree_map(
-            lambda t: t.to_local().shape if isinstance(t, dtensor.DTensor) else None,
-            args,
-        )
-        print(f"    local shapes: {local_shapes}")
-
-    op_key = str(op_call)
-    sharding_prop_func = op_to_rules.get(op_key, None)
-
-    if sharding_prop_func is None:
-        # step 1. If there's not even one sharding rule
-        # implemented for the operator, we fall back to
-        # local tensor compute, this is wront currently
-        # we will change the behavior to reshard to full
-        # replicate and do the computatation
-        if not _ENABLE_FALLBACK:
-            raise NotImplementedError(
-                f"Operator {op_key} does not have a DistributedTensor rule registered."
-            )
-        else:
-            return op_schema, False, None
-
-    # step 2. there's sharding propagation rule, run
-    # sharding propagation to get output sharding
-    try:
-        output_sharding = sharding_prop_func(op_schema)
-    except Exception as e:
-        raise RuntimeError(
-            f"Sharding propagation failed on op {op_key}.\n"
-            f"Input schema: {op_schema}.\n"
-            f"Error: {e}"
-        ) from e
-
-    # step 3. if can't get output_spec from sharding
-    # propagation (i.e. no rules apply for input
-    # placements), we do auto redistribute on inputs
-    # to get an eligble input, which we will pick a
-    # target schema base on the redistribute cost
-    # TODO: implement full auto distribute with a
-    # simple cost estimation model
-    if output_sharding.output_spec is None:
-        # do auto distributed/boxing here
-        if output_sharding.schema_suggestions is not None:
-            # pick the first suggestion for now,
-            target_schema = output_sharding.schema_suggestions[0]
-            # run sharding propagation again with target schema
-            output_sharding = sharding_prop_func(target_schema)
-
-            return target_schema, True, output_sharding
-
-        else:
+    sharding_propagator: ShardingPropagator,
+    custom_dispatch_ops: Optional[Dict[str, Callable[..., object]]] = None,
+) -> object:
+    # check that we are not getting mixed vanilla and Distributed tensors
+    arg_list, _ = tree_flatten(args)
+    mesh = None
+    for arg in arg_list:
+        if isinstance(arg, torch.Tensor) and not isinstance(arg, dtensor.DTensor):
             raise RuntimeError(
-                f"Sharding propagation failed on op {op_key}!"
-                f"Input schema: {op_schema}."
-                f"Failed reason: {output_sharding.failed_reason}"
+                f"{op_call}: got mixed torch.Tensor and DTensor, need to convert all"
+                " torch.Tensor to DTensor before calling distributed operators!"
             )
-    else:
-        return op_schema, False, output_sharding
 
+        if isinstance(arg, dtensor.DTensor):
+            if mesh is not None:
+                if mesh != arg.device_mesh:
+                    raise NotImplementedError(
+                        f"{op_call}: DTensor does not support cross-mesh operation yet!"
+                    )
+            else:
+                mesh = arg.device_mesh
 
-def operator_dispatch(
-    op_call: torch._ops.OpOverload,
-    args: Tuple[object, ...],
-    kwargs: Dict[str, object],
-    op_to_rules: Dict[str, Callable[[OpSchema], OutputSharding]],
-    custom_dispatch_ops: Dict[str, Callable[..., object]],
-) -> object:
     # first we need to lift some private aten aliases to public calls
     if op_call in _CURRENT_DECOMPOSITION_TABLE:
         return _CURRENT_DECOMPOSITION_TABLE[op_call](*args, **kwargs)
 
-    # STEP 0. See if threre're user defined custom aten operator
+    # STEP 0. See if there's a user defined custom aten operator
     # implementations. Custom operators take the highest priority
-    if str(op_call) in custom_dispatch_ops:
+    if custom_dispatch_ops is not None and str(op_call) in custom_dispatch_ops:
         # dispatch to user defined custom distributed tensor ops
         return custom_dispatch_ops[str(op_call)](*args, **kwargs)
 
-    target_schema, redistribute, output_sharding = propagate_input_sharding(
-        op_call, args, kwargs, op_to_rules
-    )
-
-    if output_sharding is None:
-        # default to local tensor ops, this is wrong
-        # but we use it now to enable more tensor point-wise ops
-        # TODO: delete this and use replicate (all_gather) as
-        # the default fallback.
-        tensor_args = tree_map(unwrap_local_tensor, args)
-        tensor_kwargs = tree_map(unwrap_local_tensor, kwargs)
-        local_results = op_call(*tensor_args, **tensor_kwargs)
-        return wrap(local_results, target_schema.args_spec[0])
-
-    local_tensor_args = pack_args_kwargs_with_local_tensor(
-        args,
-        target_schema.args_schema,
-        redistribute_with_schema=redistribute,
-    )
-    local_tensor_kwargs = pack_args_kwargs_with_local_tensor(
-        kwargs,
-        target_schema.kwargs_schema,
-        redistribute_with_schema=redistribute,
-    )
-
-    # run local op computation with potentially modified args/kwargs
-    local_tensor_args = cast(Tuple[object, ...], local_tensor_args)
-    local_tensor_kwargs = cast(Dict[str, object], local_tensor_kwargs)
-    local_results = op_call(*local_tensor_args, **local_tensor_kwargs)
-
-    if target_schema.is_inplace:
+    # unwrap the args/kwargs schema
+    op_schema = sharding_propagator.prepare_op_schema(op_call, args, kwargs)
+
+    output_sharding = sharding_propagator.propagate_op_sharding(op_call, op_schema)
+
+    # if the schema suggestion from sharding prop is not the same instance as the
+    # input op_schema, it indicates a reshard, we need to redistribute the input
+    # tensors before calling the local op
+    assert output_sharding.schema_suggestions is not None
+    suggested_input_schema = output_sharding.schema_suggestions[0]
+    needs_redistribute = suggested_input_schema is not op_schema
+
+    if mesh is not None and mesh.get_coordinate() is None:
+        # if we are on a non-participating device, we simply return
+        # an empty tensor for now.
+        # TODO: what if the op returns a non-tensor value, what if
+        # the op returns a list of tensors, we need to figure out
+        # a consistent way to handle that, and also need to figure
+        # out if we should communicate the result to non-participating
+        # ranks (i.e. a.sum() -> scalar, maybe we should set to 0)
+        local_results = torch.tensor([])
+    else:
+        # compute locally with redistribute first if needed
+        local_tensor_args = pack_args_kwargs_with_local_tensor(
+            args,
+            suggested_input_schema.args_schema,
+            redistribute_with_schema=needs_redistribute,
+        )
+        local_tensor_kwargs = pack_args_kwargs_with_local_tensor(
+            kwargs,
+            suggested_input_schema.kwargs_schema,
+            redistribute_with_schema=needs_redistribute,
+        )
+
+        # run local op computation with potentially modified args/kwargs
+        local_tensor_args = cast(Tuple[object, ...], local_tensor_args)
+        local_tensor_kwargs = cast(Dict[str, object], local_tensor_kwargs)
+        local_results = op_call(*local_tensor_args, **local_tensor_kwargs)
+
+    if suggested_input_schema.is_inplace:
         # inplace op should return self instead of re-wrapping
         self = cast(dtensor.DTensor, args[0])
         self._spec = cast(DTensorSpec, output_sharding.output_spec)
         return self
-    elif target_schema.is_out_variant:
+    elif suggested_input_schema.is_out_variant:
         # out variant could possibly have multiple out args (i.e. lu_unpack.out)
         output_specs = (
             (output_sharding.output_spec,)
@@ -240,7 +189,7 @@ def operator_dispatch(
         )
         out_dts = []
         spec_idx = 0
-        for arg in target_schema.func_schema.arguments:
+        for arg in suggested_input_schema.func_schema.arguments:
             if arg.is_out:
                 out_dt = cast(dtensor.DTensor, kwargs[arg.name])
                 out_dt._spec = cast(DTensorSpec, output_specs[spec_idx])
diff --git a/torch/distributed/_tensor/examples/checkpoint_example.py b/torch/distributed/_tensor/examples/checkpoint_example.py
new file mode 100644
index 000000000000..78e183e60800
--- /dev/null
+++ b/torch/distributed/_tensor/examples/checkpoint_example.py
@@ -0,0 +1,180 @@
+"""
+The following example contains a simple MLP model that uses
+different DTensor layouts, and use the checkpointing API to
+checkpoint save/load the model.
+"""
+import os
+
+from typing import cast, List
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.distributed._tensor import (
+    DeviceMesh,
+    distribute_module,
+    distribute_tensor,
+    DTensor,
+    Replicate,
+    Shard,
+)
+from torch.distributed._tensor.placement_types import Placement
+from torch.distributed.tensor.parallel import PairwiseParallel, parallelize_module
+
+
+class SimpleMLP(torch.nn.Module):
+    def __init__(self):
+        super(SimpleMLP, self).__init__()
+        self.net1 = torch.nn.Linear(5, 128)
+        self.relu = torch.nn.ReLU()
+        self.net2 = torch.nn.Linear(128, 12)
+
+    def forward(self, x):
+        return self.net2(F.relu(self.net1(x)))
+
+
+def gen_tensor_parallel_model(model: nn.Module, mesh: DeviceMesh) -> nn.Module:
+    """
+    generates a nn.Module where parameters are sharded in the tensor-parallel
+    fashion.
+    """
+    # shard the model
+    return parallelize_module(
+        model,
+        mesh,
+        PairwiseParallel(),
+    )
+
+
+def gen_partial_replicate_2d(model: nn.Module, mesh: DeviceMesh) -> nn.Module:
+    """
+    generates a nn.Module where parameters are replicated in the first mesh
+    dimension, and sharded in the second mesh dimension.
+    """
+
+    def parallel_fn(name, module, device_mesh):
+        assert device_mesh.ndim == 2
+        if isinstance(module, torch.nn.Linear) and name == "net1":
+            for name, param in module.named_parameters():
+                dist_param = torch.nn.Parameter(
+                    distribute_tensor(param, device_mesh, [Replicate(), Shard(0)])
+                )
+                module.register_parameter(name, dist_param)
+        elif isinstance(module, torch.nn.Linear) and name == "net2":
+            for name, param in module.named_parameters():
+                dist_spec = (
+                    [Replicate(), Shard(1)]
+                    if name == "weight"
+                    else [Replicate(), Replicate()]
+                )
+                dist_param = torch.nn.Parameter(
+                    distribute_tensor(param, device_mesh, dist_spec)
+                )
+                module.register_parameter(name, dist_param)
+
+    # mark input replicating on mesh
+    def input_fn(inputs, device_mesh):
+        return DTensor.from_local(inputs[0], device_mesh, [Replicate(), Replicate()])
+
+    def output_fn(outputs, device_mesh):
+        assert isinstance(outputs, DTensor)
+        return outputs.to_local()
+
+    return distribute_module(
+        model,
+        mesh,
+        partition_fn=parallel_fn,
+        input_fn=input_fn,
+        output_fn=output_fn,
+    )
+
+
+def gen_model_param_in_submesh(model: nn.Module, sub_mesh: DeviceMesh) -> nn.Module:
+    """
+    generates a nn.Module where parameters are sharded/replicated only on a
+    sub-mesh (i.e. mesh(0, 2) in a world size of 4)
+    """
+
+    def parallel_fn(name, module, device_mesh):
+        assert device_mesh.ndim == 1
+        if isinstance(module, torch.nn.Linear) and name == "net1":
+            for name, param in module.named_parameters():
+                dist_param = torch.nn.Parameter(
+                    distribute_tensor(param, device_mesh, [Shard(0)])
+                )
+                module.register_parameter(name, dist_param)
+        elif isinstance(module, torch.nn.Linear) and name == "net2":
+            for name, param in module.named_parameters():
+                dist_spec = cast(List[Placement], [Shard(1)] if name == "weight" else [Replicate()])
+                dist_param = torch.nn.Parameter(
+                    distribute_tensor(param, device_mesh, dist_spec)
+                )
+                module.register_parameter(name, dist_param)
+
+    # mark input replicating on mesh
+    def input_fn(inputs, device_mesh):
+        return DTensor.from_local(inputs[0], device_mesh, [Replicate()])
+
+    def output_fn(outputs, device_mesh):
+        assert isinstance(outputs, DTensor)
+        return outputs.to_local()
+
+    return distribute_module(
+        model,
+        sub_mesh,
+        partition_fn=parallel_fn,
+        input_fn=input_fn,
+        output_fn=output_fn,
+    )
+
+
+def checkpoint(model: nn.Module, mesh: DeviceMesh) -> nn.Module:
+    """
+    checkpoint save/load models with DTensor parameters
+    """
+    # TODO: implement this checkpoint save/load example
+    pass
+
+
+def run_checkpoint_example(rank, world_size):
+    # set up world pg
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "12355"
+
+    # initialize the process group
+    dist.init_process_group("gloo", rank=rank, world_size=world_size)
+    # create a device mesh
+    mesh = DeviceMesh("cpu", torch.arange(world_size))
+
+    # create and shard the model in tensor parallel fashion
+    model_tp = gen_tensor_parallel_model(SimpleMLP(), mesh)
+    model_tp(torch.rand(5, 5))
+    # print(f"tensor parallel model state_dict: {model_tp.state_dict()}")
+
+    # create a 2-D device mesh for partial replication
+    mesh_2d = DeviceMesh("cpu", torch.arange(world_size).reshape(2, 2))
+    # replicate the parameters on the first mesh dimension,
+    # and shard the parameters on the second mesh dimension
+    model_2d = gen_partial_replicate_2d(SimpleMLP(), mesh_2d)
+    model_2d(torch.rand(5, 5))
+
+    # create a sub-mesh and shard/replicate params only on submesh
+    submesh = DeviceMesh("cpu", [0, 2])
+    model_submesh = gen_model_param_in_submesh(SimpleMLP(), submesh)
+    model_submesh(torch.rand(5, 5))
+    print(f"partial replicate model state_dict: {model_submesh.state_dict()}")
+
+    # checkpoint the model
+    # TODO: fully implement checkpoint save/load example
+    model = checkpoint(model_2d, mesh)
+
+    # shutting down world pg
+    dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    world_size = 4
+    mp.spawn(run_checkpoint_example, args=(world_size,), nprocs=world_size, join=True)
diff --git a/torch/distributed/_tensor/op_schema.py b/torch/distributed/_tensor/op_schema.py
index 5e3fbebe621b..1b374b3b34e6 100644
--- a/torch/distributed/_tensor/op_schema.py
+++ b/torch/distributed/_tensor/op_schema.py
@@ -2,6 +2,7 @@
 from typing import Dict, List, Optional, Sequence, Tuple, Union
 
 import torch
+from torch.utils._pytree import tree_map_only
 from torch.distributed._tensor.placement_types import DTensorSpec
 
 
@@ -13,8 +14,20 @@
 OutputSpecType = Optional[Union[DTensorSpec, Sequence[Optional[DTensorSpec]]]]
 
 
+def _rebuild_tensor_from_dtensor_meta(arg) -> object:
+    """"
+    This is used to propagate tensor metadata, must be under fake mode
+    """
+    assert arg.tensor_meta is not None, "DTensorSpec does not contain tensor_meta."
+    return torch.empty_strided(
+        arg.tensor_meta.shape,
+        arg.tensor_meta.stride,
+        dtype=arg.tensor_meta.dtype,
+        requires_grad=arg.tensor_meta.requires_grad
+    )
+
 @dataclass
-class OpSchema(object):
+class OpSchema:
     """
     OpSchema is a data class that describes an operator input schemas, it
     includes DTensor DTensorSpecs and non-tensor args/kwargs (positional order
@@ -81,6 +94,36 @@ def __repr__(self) -> str:
             f" kwargs_schema={self.kwargs_schema})"
         )
 
+    def __hash__(self) -> int:
+        # NOTE: we turn kwargs_schema into a frozenset to hash as it would not be nested dict
+        frozen_set_kwargs_schema = frozenset(self.kwargs_schema.items())
+        return hash((self.func_schema, self.args_spec, frozen_set_kwargs_schema))
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, OpSchema):
+            return False
+        return (
+            self.func_schema == other.func_schema
+            and self.args_schema == other.args_schema
+            and self.kwargs_schema == other.kwargs_schema
+        )
+
+    def gen_fake_args(self) -> ArgsType:
+        """
+        gen_fake_args: generate fake args for the operator, this is mainly used
+            by sharding propagation rules to generate fake args for the operator
+            to run the local tensor operator and get the output spec.
+        """
+        return tree_map_only(DTensorSpec, _rebuild_tensor_from_dtensor_meta, self.args_schema)
+
+    def gen_fake_kwargs(self) -> KwargsType:
+        """
+        gen_fake_kwargs: generate fake kwargs for the operator, this is mainly used
+            by sharding propagation rules to generate fake kwargs for the operator
+            to run the local tensor operator and get the output spec.
+        """
+        return tree_map_only(DTensorSpec, _rebuild_tensor_from_dtensor_meta, self.kwargs_schema)
+
 
 @dataclass
 class OutputSharding:
diff --git a/torch/distributed/_tensor/ops/__init__.py b/torch/distributed/_tensor/ops/__init__.py
index 5550b2ffae08..ace4293c0c78 100644
--- a/torch/distributed/_tensor/ops/__init__.py
+++ b/torch/distributed/_tensor/ops/__init__.py
@@ -2,6 +2,5 @@
 from .matrix_ops import *  # noqa: F403
 from .math_ops import *  # noqa: F403
 from .tensor_ops import *  # noqa: F403
-from .tp_sharding_ops import *  # noqa: F403
 from .pointwise_ops import *  # noqa: F403
 from .view_ops import *  # noqa: F403
diff --git a/torch/distributed/_tensor/ops/common_rules.py b/torch/distributed/_tensor/ops/common_rules.py
index 81c76ab84204..caf96dcf9320 100644
--- a/torch/distributed/_tensor/ops/common_rules.py
+++ b/torch/distributed/_tensor/ops/common_rules.py
@@ -2,7 +2,8 @@
 from typing import cast, Dict, List, Optional, Sequence, Tuple
 
 import torch
-from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
+from torch.fx.passes.shape_prop import TensorMetadata
+from torch.distributed._tensor.op_schema import OpSchema, OutputSharding
 from torch.distributed._tensor.ops.utils import prod
 from torch.distributed._tensor.placement_types import DTensorSpec
 
@@ -42,7 +43,7 @@ def _gen_reshard_suggestions(
                 mesh=input_spec.mesh,
                 dim_map=dim_map,
                 sums=pending_sum,
-                shape=input_spec.shape,
+                tensor_meta=input_spec.tensor_meta,
             )
         )
     suggested_schema = OpSchema(op_schema.func_schema, tuple(suggested_arg_specs), {})
@@ -215,12 +216,25 @@ def merge_sharding(dim: str, a: int, b: int) -> int:
             output_dim_map.append(dim_to_sharding[dim])
             output_shape.append(dim_to_size[dim])
 
+    # XXX: since we still need to have intermediate shape calculation, we need
+    # to pass in the shape here. We should remove this once sharding decomp works
+    # for ops like addmm
+    assert input_specs[0].tensor_meta is not None
+    tensor_meta = TensorMetadata(
+        torch.Size(output_shape),
+        input_specs[0].tensor_meta.dtype,
+        input_specs[0].tensor_meta.requires_grad,
+        input_specs[0].tensor_meta.stride,
+        input_specs[0].tensor_meta.memory_format,
+        input_specs[0].tensor_meta.is_quantized,
+        input_specs[0].tensor_meta.qparams,
+    )
     return OutputSharding(
         DTensorSpec.from_dim_map(
             input_specs[0].mesh,
             output_dim_map,
             pending_sums,
-            shape=torch.Size(output_shape),
+            tensor_meta=tensor_meta,
         )
     )
 
@@ -329,7 +343,7 @@ def reduction_rule(
 
         if needs_reshard:
             no_partial_spec = DTensorSpec.from_dim_map(
-                input_spec.mesh, reshard_dim_map, [], input_spec.shape
+                input_spec.mesh, reshard_dim_map, [], tensor_meta=input_spec.tensor_meta
             )
             schema_suggestion = OpSchema(op_schema.func_schema, (no_partial_spec,), {})
             _inplace_rewrap_schema_suggestion(schema_suggestion, op_schema)
diff --git a/torch/distributed/_tensor/ops/math_ops.py b/torch/distributed/_tensor/ops/math_ops.py
index 2480e7ced573..eb31e981cfdf 100644
--- a/torch/distributed/_tensor/ops/math_ops.py
+++ b/torch/distributed/_tensor/ops/math_ops.py
@@ -1,8 +1,9 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 from typing import cast, Optional, Sequence
 
-from torch.distributed._tensor.api import DTensor
-from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
+import torch
+
+from torch.distributed._tensor.op_schema import OpSchema, OutputSharding
 from torch.distributed._tensor.ops.common_rules import pointwise_rule, reduction_rule
 from torch.distributed._tensor.ops.utils import (
     as_list,
@@ -12,6 +13,9 @@
 from torch.distributed._tensor.placement_types import DTensorSpec
 
 
+aten = torch.ops.aten
+
+
 def _infer_reduction_dims(dims_arg: object, ndim: int) -> Optional[Sequence[int]]:
     if dims_arg is None:
         return None
@@ -23,11 +27,17 @@ def _infer_reduction_dims(dims_arg: object, ndim: int) -> Optional[Sequence[int]
     return dims
 
 
-@register_prop_rule("aten.all.default")
+@register_prop_rule(aten.all.default)
 def default_reduction_rule(op_schema: OpSchema) -> OutputSharding:
     return reduction_rule(op_schema, reduction_linear=True)
 
 
+@register_prop_rule(
+    [
+        aten.sum.default,
+        aten.sum.dim_IntList,
+    ]
+)
 def sum_rule(op_schema: OpSchema) -> OutputSharding:
     args_schema = op_schema.args_schema
     input_spec = cast(DTensorSpec, args_schema[0])
@@ -41,15 +51,7 @@ def sum_rule(op_schema: OpSchema) -> OutputSharding:
     )
 
 
-sum_ops = [
-    "aten.sum.default",
-    "aten.sum.dim_IntList",
-]
-for sum_op in sum_ops:
-    DTensor._op_to_rules[sum_op] = sum_rule
-
-
-@register_prop_rule("aten._softmax.default")
+@register_prop_rule(aten._softmax.default)
 def softmax_rule(op_schema: OpSchema) -> OutputSharding:
     input_spec, softmax_dim, _ = op_schema.args_schema
     input_spec = cast(DTensorSpec, input_spec)
@@ -60,7 +62,7 @@ def softmax_rule(op_schema: OpSchema) -> OutputSharding:
     return OutputSharding(input_spec)
 
 
-@register_prop_rule("aten._softmax_backward_data.default")
+@register_prop_rule(aten._softmax_backward_data.default)
 def softmax_bwd_rule(op_schema: OpSchema) -> OutputSharding:
     grad_out_spec, out_spec, softmax_dim, _ = op_schema.args_schema
     grad_out_spec = cast(DTensorSpec, grad_out_spec)
@@ -75,6 +77,7 @@ def softmax_bwd_rule(op_schema: OpSchema) -> OutputSharding:
     return pointwise_rule(op_schema)
 
 
+@register_prop_rule([aten.mean.default, aten.mean.dim, aten.mean.out])
 def mean_rule(op_schema: OpSchema) -> OutputSharding:
     args_schema = op_schema.args_schema
     input_spec = cast(DTensorSpec, args_schema[0])
@@ -89,16 +92,13 @@ def mean_rule(op_schema: OpSchema) -> OutputSharding:
     )
 
 
-mean_ops = [
-    "aten.mean.default",
-    "aten.mean.dim",
-    "aten.mean.out",
-]
-
-for mean_op in mean_ops:
-    DTensor._op_to_rules[mean_op] = mean_rule
-
-
+@register_prop_rule(
+    [
+        aten.var.default,
+        aten.var.dim,
+        aten.var.out,
+    ]
+)
 def var_rule(op_schema: OpSchema) -> OutputSharding:
     args_schema = op_schema.args_schema
     input_spec = cast(DTensorSpec, args_schema[0])
@@ -115,18 +115,7 @@ def var_rule(op_schema: OpSchema) -> OutputSharding:
     )
 
 
-var_ops = [
-    "aten.var.default",
-    "aten.var.dim",
-    "aten.var.out",
-]
-
-for var_op in var_ops:
-    DTensor._op_to_rules[var_op] = var_rule
-
-
-@register_prop_rule("aten.var.correction")
-@register_prop_rule("aten.var.correction_out")
+@register_prop_rule([aten.var.correction, aten.var.correction_out])
 def var_correction_rule(op_schema: OpSchema) -> OutputSharding:
     args_schema = op_schema.args_schema
     input_spec = cast(DTensorSpec, args_schema[0])
diff --git a/torch/distributed/_tensor/ops/matrix_ops.py b/torch/distributed/_tensor/ops/matrix_ops.py
index 6d884843ea81..08c10f8005c0 100644
--- a/torch/distributed/_tensor/ops/matrix_ops.py
+++ b/torch/distributed/_tensor/ops/matrix_ops.py
@@ -1,9 +1,14 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 # implement matrix related ops for distributed tensor
-from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
+
+import torch
+
+from torch.distributed._tensor.op_schema import OpSchema, OutputSharding
 from torch.distributed._tensor.ops.common_rules import einop_rule, pointwise_rule
 from torch.distributed._tensor.ops.utils import register_prop_rule
 
+aten = torch.ops.aten
+
 
 def _update_schema_suggestion_for_addmm(
     output_sharding: OutputSharding,
@@ -41,12 +46,12 @@ def _update_schema_suggestion_for_addmm(
     return output_sharding
 
 
-@register_prop_rule("aten.mm.default")
+@register_prop_rule(aten.mm.default)
 def mm_rules(op_schema: OpSchema) -> OutputSharding:
     return einop_rule("mk,kn->mn", op_schema, linearity=False)
 
 
-@register_prop_rule("aten.addmm.default")
+@register_prop_rule(aten.addmm.default)
 def addmm_rules(op_schema: OpSchema) -> OutputSharding:
     input_spec, mat1_spec, mat2_spec = op_schema.args_spec
     mm_out_sharding = mm_rules(
@@ -80,17 +85,17 @@ def addmm_rules(op_schema: OpSchema) -> OutputSharding:
     return output_sharding
 
 
-@register_prop_rule("aten.t.default")
+@register_prop_rule(aten.t.default)
 def transpose_rule(op_schema: OpSchema) -> OutputSharding:
     return einop_rule("ij->ji", op_schema, linearity=True)
 
 
-@register_prop_rule("aten.bmm.default")
+@register_prop_rule(aten.bmm.default)
 def bmm_rules(op_schema: OpSchema) -> OutputSharding:
     return einop_rule("bmk,bkn->bmn", op_schema, linearity=False)
 
 
-@register_prop_rule("aten.baddbmm.default")
+@register_prop_rule(aten.baddbmm.default)
 def baddbmm_rules(op_schema: OpSchema) -> OutputSharding:
     input_spec, mat1_spec, mat2_spec = op_schema.args_spec
     bmm_output_sharding = bmm_rules(
diff --git a/torch/distributed/_tensor/ops/pointwise_ops.py b/torch/distributed/_tensor/ops/pointwise_ops.py
index 0c7516866fe8..622d8048add2 100644
--- a/torch/distributed/_tensor/ops/pointwise_ops.py
+++ b/torch/distributed/_tensor/ops/pointwise_ops.py
@@ -1,8 +1,9 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 from typing import cast
 
-from torch.distributed._tensor.api import DTensor
-from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
+import torch
+
+from torch.distributed._tensor.op_schema import OpSchema, OutputSharding
 from torch.distributed._tensor.ops.common_rules import (
     linear_pointwise_rule,
     pointwise_rule,
@@ -10,6 +11,8 @@
 from torch.distributed._tensor.ops.utils import register_prop_rule
 from torch.distributed._tensor.placement_types import _Partial, DTensorSpec, Replicate
 
+
+aten = torch.ops.aten
 # leave the remaining pointwise_ops list here for convenience,
 # Below ops are some pointwise ops that are yet to be supported,
 # they might not be a complete list.
@@ -28,353 +31,351 @@
 
 
 linear_pointwise_ops = [
-    "aten.div.Scalar",  # this op is linear on the first argument, and the second argument is scalar, so it fits as a linear op.
-    "aten.to.dtype",
+    aten.div.Scalar,  # this op is linear on the first argument, and the second argument is scalar, so it fits as a linear op.
+    aten.to.dtype,
 ]
 
 
 pointwise_ops = [
     # please keep the entries below alphabetically sorted
-    "aten.abs.default",
-    "aten.acos.default",
-    "aten.acos.out",
-    "aten.acos_.default",
-    "aten.acosh.default",
-    "aten.acosh.out",
-    "aten.acosh_.default",
-    "aten.add.Scalar",
-    "aten.add.Tensor",
-    "aten.add.out",
-    "aten.add_.Scalar",
-    "aten.add_.Tensor",
-    "aten.addcdiv.default",
-    "aten.addcdiv.out",
-    "aten.addcdiv_.default",
-    "aten.addcmul.default",
-    "aten.addcmul.out",
-    "aten.addcmul_.default",
-    "aten.angle.default",
-    "aten.angle.out",
-    "aten.asin.default",
-    "aten.asin.out",
-    "aten.asin_.default",
-    "aten.asinh.default",
-    "aten.asinh.out",
-    "aten.asinh_.default",
-    "aten.atan.default",
-    "aten.atan.out",
-    "aten.atan2.default",
-    "aten.atan2.out",
-    "aten.atan2_.default",
-    "aten.atan_.default",
-    "aten.atanh.default",
-    "aten.atanh.out",
-    "aten.atanh_.default",
-    "aten.bitwise_and.Scalar",
-    "aten.bitwise_and.Scalar_Tensor",
-    "aten.bitwise_and.Scalar_out",
-    "aten.bitwise_and.Tensor",
-    "aten.bitwise_and.Tensor_out",
-    "aten.bitwise_and_.Scalar",
-    "aten.bitwise_and_.Tensor",
-    "aten.bitwise_left_shift.Scalar_Tensor",
-    "aten.bitwise_left_shift.Tensor",
-    "aten.bitwise_left_shift.Tensor_Scalar",
-    "aten.bitwise_left_shift.Tensor_Scalar_out",
-    "aten.bitwise_left_shift.Tensor_out",
-    "aten.bitwise_left_shift_.Tensor",
-    "aten.bitwise_left_shift_.Tensor_Scalar",
-    "aten.bitwise_not.default",
-    "aten.bitwise_not.out",
-    "aten.bitwise_not_.default",
-    "aten.bitwise_or.Scalar",
-    "aten.bitwise_or.Scalar_Tensor",
-    "aten.bitwise_or.Scalar_out",
-    "aten.bitwise_or.Tensor",
-    "aten.bitwise_or.Tensor_out",
-    "aten.bitwise_or_.Scalar",
-    "aten.bitwise_or_.Tensor",
-    "aten.bitwise_right_shift.Scalar_Tensor",
-    "aten.bitwise_right_shift.Tensor",
-    "aten.bitwise_right_shift.Tensor_Scalar",
-    "aten.bitwise_right_shift.Tensor_Scalar_out",
-    "aten.bitwise_right_shift.Tensor_out",
-    "aten.bitwise_right_shift_.Tensor",
-    "aten.bitwise_right_shift_.Tensor_Scalar",
-    "aten.bitwise_xor.Scalar",
-    "aten.bitwise_xor.Scalar_Tensor",
-    "aten.bitwise_xor.Scalar_out",
-    "aten.bitwise_xor.Tensor",
-    "aten.bitwise_xor.Tensor_out",
-    "aten.bitwise_xor_.Scalar",
-    "aten.bitwise_xor_.Tensor",
-    "aten.ceil.default",
-    "aten.ceil.out",
-    "aten.ceil_.default",
-    "aten.clamp.default",
-    "aten.clamp.out",
-    "aten.clamp_.default",
-    "aten.clip.default",
-    "aten.clip.out",
-    "aten.clip_.default",
-    "aten.conj_physical.default",
-    "aten.conj_physical.out",
-    "aten.conj_physical_.default",
-    "aten.constant_.default",
-    "aten.copy_sign.Scalar",
-    "aten.copy_sign.Scalar_out",
-    "aten.copy_sign.Tensor",
-    "aten.copy_sign.out",
-    "aten.copy_sign_.Scalar",
-    "aten.copy_sign_.Tensor",
-    "aten.cos.default",
-    "aten.cos.out",
-    "aten.cos_.default",
-    "aten.cosh.default",
-    "aten.cosh.out",
-    "aten.cosh_.default",
-    "aten.deg2rad.default",
-    "aten.deg2rad.out",
-    "aten.deg2rad_.default",
-    "aten.digamma.default",
-    "aten.digamma.out",
-    "aten.digamma_.default",
-    "aten.div.Tensor",
-    "aten.div.Tensor_mode",
-    "aten.div.out",
-    "aten.div.out_mode",
-    "aten.div_.Tensor",
-    "aten.div_.Tensor_mode",
-    "aten.eq.Tensor",
-    "aten.eq.Tensor_out",
-    "aten.eq.Scalar",
-    "aten.eq.Scalar_out",
-    "aten.equal.default",
-    "aten.erf.default",
-    "aten.erf.out",
-    "aten.erf_.default",
-    "aten.erfc.default",
-    "aten.erfc.out",
-    "aten.erfc_.default",
-    "aten.erfinv.default",
-    "aten.erfinv.out",
-    "aten.erfinv_.default",
-    "aten.exp.default",
-    "aten.exp.out",
-    "aten.exp2.default",
-    "aten.exp2.out",
-    "aten.exp2_.default",
-    "aten.exp_.default",
-    "aten.expm1.default",
-    "aten.expm1.out",
-    "aten.expm1_.default",
-    "aten.float_power.Scalar",
-    "aten.float_power.Scalar_out",
-    "aten.float_power.Tensor_Scalar",
-    "aten.float_power.Tensor_Scalar_out",
-    "aten.float_power.Tensor_Tensor",
-    "aten.float_power.Tensor_Tensor_out",
-    "aten.float_power_.Scalar",
-    "aten.float_power_.Tensor",
-    "aten.floor.default",
-    "aten.floor.out",
-    "aten.floor_.default",
-    "aten.fmod.Scalar",
-    "aten.fmod.Scalar_out",
-    "aten.fmod.Tensor",
-    "aten.fmod.Tensor_out",
-    "aten.fmod_.Scalar",
-    "aten.fmod_.Tensor",
-    "aten.frac.default",
-    "aten.frac.out",
-    "aten.frac_.default",
-    "aten.ge.Scalar",
-    "aten.ge.Tensor",
-    "aten.gelu.default",
-    "aten.gt.Scalar",
-    "aten.gt.Tensor",
-    "aten.hypot.default",
-    "aten.hypot.out",
-    "aten.hypot_.default",
-    "aten.i0.default",
-    "aten.i0.out",
-    "aten.i0_.default",
-    "aten.igamma.default",
-    "aten.igamma.out",
-    "aten.igamma_.default",
-    "aten.igammac.default",
-    "aten.igammac.out",
-    "aten.igammac_.default",
-    "aten.isnan.default",
-    "aten.ldexp.default",
-    "aten.ldexp.out",
-    "aten.ldexp_.default",
-    "aten.le.Scalar",
-    "aten.le.Tensor",
-    "aten.lerp.Scalar",
-    "aten.lerp.Scalar_out",
-    "aten.lerp.Tensor",
-    "aten.lerp.Tensor_out",
-    "aten.lerp_.Scalar",
-    "aten.lerp_.Tensor",
-    "aten.lgamma.default",
-    "aten.lgamma.out",
-    "aten.lgamma_.default",
-    "aten.log.default",
-    "aten.log.out",
-    "aten.log10.default",
-    "aten.log10.out",
-    "aten.log10_.default",
-    "aten.log1p.default",
-    "aten.log1p.out",
-    "aten.log1p_.default",
-    "aten.log2.default",
-    "aten.log2.out",
-    "aten.log2_.default",
-    "aten.log_.default",
-    "aten.logaddexp.default",
-    "aten.logaddexp.out",
-    "aten.logaddexp2.default",
-    "aten.logaddexp2.out",
-    "aten.logical_and.default",
-    "aten.logical_and.out",
-    "aten.logical_and_.default",
-    "aten.logical_not.default",
-    "aten.logical_not.out",
-    "aten.logical_not_.default",
-    "aten.logical_or.default",
-    "aten.logical_or.out",
-    "aten.logical_or_.default",
-    "aten.logical_xor.default",
-    "aten.logical_xor.out",
-    "aten.logical_xor_.default",
-    "aten.logit.default",
-    "aten.logit.out",
-    "aten.logit_.default",
-    "aten.masked_fill.Scalar",
-    "aten.mul.Scalar",
-    "aten.mul.Tensor",
-    "aten.mul.out",
-    "aten.mul_.Scalar",
-    "aten.mul_.Tensor",
-    "aten.mvlgamma.default",
-    "aten.mvlgamma.out",
-    "aten.mvlgamma_.default",
-    "aten.native_dropout_backward.default",
-    "aten.native_dropout_backward.out",
-    "aten.nan_to_num.default",
-    "aten.nan_to_num.out",
-    "aten.nan_to_num_.default",
-    "aten.ne.Scalar",
-    "aten.neg.default",
-    "aten.neg.out",
-    "aten.neg_.default",
-    "aten.nextafter.default",
-    "aten.nextafter.out",
-    "aten.nextafter_.default",
-    "aten.polygamma.default",
-    "aten.polygamma.out",
-    "aten.polygamma_.default",
-    "aten.positive.default",
-    "aten.pow.Scalar",
-    "aten.pow.Scalar_out",
-    "aten.pow.Tensor_Scalar",
-    "aten.pow.Tensor_Scalar_out",
-    "aten.pow.Tensor_Tensor",
-    "aten.pow.Tensor_Tensor_out",
-    "aten.pow_.Scalar",
-    "aten.pow_.Tensor",
-    "aten.reciprocal.default",
-    "aten.reciprocal.out",
-    "aten.reciprocal_.default",
-    "aten.red2deg.default",
-    "aten.red2deg.out",
-    "aten.red2deg_.default",
-    "aten.relu.default",
-    "aten.relu_.default",
-    "aten.remainder.Scalar",
-    "aten.remainder.Scalar_Tensor",
-    "aten.remainder.Scalar_out",
-    "aten.remainder.Tensor",
-    "aten.remainder.Tensor_out",
-    "aten.remainder_.Scalar",
-    "aten.remainder_.Tensor",
-    "aten.round.decimals",
-    "aten.round.decimals_out",
-    "aten.round.default",
-    "aten.round.out",
-    "aten.round_.decimals",
-    "aten.round_.default",
-    "aten.rsqrt.default",
-    "aten.rsqrt.out",
-    "aten.rsqrt_.default",
-    "aten.rsub.Scalar",
-    "aten.sgn.default",
-    "aten.sgn.out",
-    "aten.sgn_.default",
-    "aten.sigmoid.default",
-    "aten.sigmoid.out",
-    "aten.sigmoid_.default",
-    "aten.sign.default",
-    "aten.sign.out",
-    "aten.sign_.default",
-    "aten.signbit.default",
-    "aten.signbit.out",
-    "aten.sin.default",
-    "aten.sin.out",
-    "aten.sin_.default",
-    "aten.sinc.default",
-    "aten.sinc.out",
-    "aten.sinc_.default",
-    "aten.sinh.default",
-    "aten.sinh.out",
-    "aten.sinh_.default",
-    "aten.sqrt.default",
-    "aten.sqrt.out",
-    "aten.sqrt_.default",
-    "aten.square.default",
-    "aten.square.out",
-    "aten.square_.default",
-    "aten.sub.Scalar",
-    "aten.sub.Tensor",
-    "aten.sub.out",
-    "aten.sub_.Scalar",
-    "aten.sub_.Tensor",
-    "aten.tan.default",
-    "aten.tan.out",
-    "aten.tan_.default",
-    "aten.tanh.default",
-    "aten.tanh.out",
-    "aten.tanh_.default",
-    "aten.true_divide.Tensor",
-    "aten.trunc.default",
-    "aten.trunc.out",
-    "aten.trunc_.default",
-    "aten.where.self",
-    "aten.xlogy.OutScalar_Self",
-    "aten.xlogy.OutTensor",
-    "aten.xlogy.Scalar_other",
-    "aten.xlogy.Scalar_self",
-    "aten.xlogy.Tensor",
-    "aten.xlogy_.OutScalar_Other",
-    "aten.xlogy_.Scalar_other",
-    "aten.xlogy_.Tensor",
-    "prims.convert_element_type.default",
+    aten.abs.default,
+    aten.acos.default,
+    aten.acos.out,
+    aten.acos_.default,
+    aten.acosh.default,
+    aten.acosh.out,
+    aten.acosh_.default,
+    aten.add.Scalar,
+    aten.add.Tensor,
+    aten.add.out,
+    aten.add_.Scalar,
+    aten.add_.Tensor,
+    aten.addcdiv.default,
+    aten.addcdiv.out,
+    aten.addcdiv_.default,
+    aten.addcmul.default,
+    aten.addcmul.out,
+    aten.addcmul_.default,
+    aten.angle.default,
+    aten.angle.out,
+    aten.asin.default,
+    aten.asin.out,
+    aten.asin_.default,
+    aten.asinh.default,
+    aten.asinh.out,
+    aten.asinh_.default,
+    aten.atan.default,
+    aten.atan.out,
+    aten.atan2.default,
+    aten.atan2.out,
+    aten.atan2_.default,
+    aten.atan_.default,
+    aten.atanh.default,
+    aten.atanh.out,
+    aten.atanh_.default,
+    aten.bitwise_and.Scalar,
+    aten.bitwise_and.Scalar_Tensor,
+    aten.bitwise_and.Scalar_out,
+    aten.bitwise_and.Tensor,
+    aten.bitwise_and.Tensor_out,
+    aten.bitwise_and_.Scalar,
+    aten.bitwise_and_.Tensor,
+    aten.bitwise_left_shift.Scalar_Tensor,
+    aten.bitwise_left_shift.Tensor,
+    aten.bitwise_left_shift.Tensor_Scalar,
+    aten.bitwise_left_shift.Tensor_Scalar_out,
+    aten.bitwise_left_shift.Tensor_out,
+    aten.bitwise_left_shift_.Tensor,
+    aten.bitwise_left_shift_.Tensor_Scalar,
+    aten.bitwise_not.default,
+    aten.bitwise_not.out,
+    aten.bitwise_not_.default,
+    aten.bitwise_or.Scalar,
+    aten.bitwise_or.Scalar_Tensor,
+    aten.bitwise_or.Scalar_out,
+    aten.bitwise_or.Tensor,
+    aten.bitwise_or.Tensor_out,
+    aten.bitwise_or_.Scalar,
+    aten.bitwise_or_.Tensor,
+    aten.bitwise_right_shift.Scalar_Tensor,
+    aten.bitwise_right_shift.Tensor,
+    aten.bitwise_right_shift.Tensor_Scalar,
+    aten.bitwise_right_shift.Tensor_Scalar_out,
+    aten.bitwise_right_shift.Tensor_out,
+    aten.bitwise_right_shift_.Tensor,
+    aten.bitwise_right_shift_.Tensor_Scalar,
+    aten.bitwise_xor.Scalar,
+    aten.bitwise_xor.Scalar_Tensor,
+    aten.bitwise_xor.Scalar_out,
+    aten.bitwise_xor.Tensor,
+    aten.bitwise_xor.Tensor_out,
+    aten.bitwise_xor_.Scalar,
+    aten.bitwise_xor_.Tensor,
+    aten.ceil.default,
+    aten.ceil.out,
+    aten.ceil_.default,
+    aten.clamp.default,
+    aten.clamp.out,
+    aten.clamp_.default,
+    aten.clip.default,
+    aten.clip.out,
+    aten.clip_.default,
+    aten.conj_physical.default,
+    aten.conj_physical.out,
+    aten.conj_physical_.default,
+    aten.copysign.Scalar,
+    aten.copysign.Scalar_out,
+    aten.copysign.Tensor,
+    aten.copysign.out,
+    aten.copysign_.Scalar,
+    aten.copysign_.Tensor,
+    aten.cos.default,
+    aten.cos.out,
+    aten.cos_.default,
+    aten.cosh.default,
+    aten.cosh.out,
+    aten.cosh_.default,
+    aten.deg2rad.default,
+    aten.deg2rad.out,
+    aten.deg2rad_.default,
+    aten.digamma.default,
+    aten.digamma.out,
+    aten.digamma_.default,
+    aten.div.Tensor,
+    aten.div.Tensor_mode,
+    aten.div.out,
+    aten.div.out_mode,
+    aten.div_.Tensor,
+    aten.div_.Tensor_mode,
+    aten.eq.Tensor,
+    aten.eq.Tensor_out,
+    aten.eq.Scalar,
+    aten.eq.Scalar_out,
+    aten.equal.default,
+    aten.erf.default,
+    aten.erf.out,
+    aten.erf_.default,
+    aten.erfc.default,
+    aten.erfc.out,
+    aten.erfc_.default,
+    aten.erfinv.default,
+    aten.erfinv.out,
+    aten.erfinv_.default,
+    aten.exp.default,
+    aten.exp.out,
+    aten.exp2.default,
+    aten.exp2.out,
+    aten.exp2_.default,
+    aten.exp_.default,
+    aten.expm1.default,
+    aten.expm1.out,
+    aten.expm1_.default,
+    aten.float_power.Scalar,
+    aten.float_power.Scalar_out,
+    aten.float_power.Tensor_Scalar,
+    aten.float_power.Tensor_Scalar_out,
+    aten.float_power.Tensor_Tensor,
+    aten.float_power.Tensor_Tensor_out,
+    aten.float_power_.Scalar,
+    aten.float_power_.Tensor,
+    aten.floor.default,
+    aten.floor.out,
+    aten.floor_.default,
+    aten.fmod.Scalar,
+    aten.fmod.Scalar_out,
+    aten.fmod.Tensor,
+    aten.fmod.Tensor_out,
+    aten.fmod_.Scalar,
+    aten.fmod_.Tensor,
+    aten.frac.default,
+    aten.frac.out,
+    aten.frac_.default,
+    aten.ge.Scalar,
+    aten.ge.Tensor,
+    aten.gelu.default,
+    aten.gt.Scalar,
+    aten.gt.Tensor,
+    aten.hypot.default,
+    aten.hypot.out,
+    aten.hypot_.default,
+    aten.i0.default,
+    aten.i0.out,
+    aten.i0_.default,
+    aten.igamma.default,
+    aten.igamma.out,
+    aten.igamma_.default,
+    aten.igammac.default,
+    aten.igammac.out,
+    aten.igammac_.default,
+    aten.isnan.default,
+    aten.ldexp.default,
+    aten.ldexp.out,
+    aten.ldexp_.default,
+    aten.le.Scalar,
+    aten.le.Tensor,
+    aten.lerp.Scalar,
+    aten.lerp.Scalar_out,
+    aten.lerp.Tensor,
+    aten.lerp.Tensor_out,
+    aten.lerp_.Scalar,
+    aten.lerp_.Tensor,
+    aten.lgamma.default,
+    aten.lgamma.out,
+    aten.lgamma_.default,
+    aten.log.default,
+    aten.log.out,
+    aten.log10.default,
+    aten.log10.out,
+    aten.log10_.default,
+    aten.log1p.default,
+    aten.log1p.out,
+    aten.log1p_.default,
+    aten.log2.default,
+    aten.log2.out,
+    aten.log2_.default,
+    aten.log_.default,
+    aten.logaddexp.default,
+    aten.logaddexp.out,
+    aten.logaddexp2.default,
+    aten.logaddexp2.out,
+    aten.logical_and.default,
+    aten.logical_and.out,
+    aten.logical_and_.default,
+    aten.logical_not.default,
+    aten.logical_not.out,
+    aten.logical_not_.default,
+    aten.logical_or.default,
+    aten.logical_or.out,
+    aten.logical_or_.default,
+    aten.logical_xor.default,
+    aten.logical_xor.out,
+    aten.logical_xor_.default,
+    aten.logit.default,
+    aten.logit.out,
+    aten.logit_.default,
+    aten.masked_fill.Scalar,
+    aten.mul.Scalar,
+    aten.mul.Tensor,
+    aten.mul.out,
+    aten.mul_.Scalar,
+    aten.mul_.Tensor,
+    aten.mvlgamma.default,
+    aten.mvlgamma.out,
+    aten.mvlgamma_.default,
+    aten.native_dropout_backward.default,
+    aten.native_dropout_backward.out,
+    aten.nan_to_num.default,
+    aten.nan_to_num.out,
+    aten.nan_to_num_.default,
+    aten.ne.Scalar,
+    aten.neg.default,
+    aten.neg.out,
+    aten.neg_.default,
+    aten.nextafter.default,
+    aten.nextafter.out,
+    aten.nextafter_.default,
+    aten.polygamma.default,
+    aten.polygamma.out,
+    aten.polygamma_.default,
+    aten.positive.default,
+    aten.pow.Scalar,
+    aten.pow.Scalar_out,
+    aten.pow.Tensor_Scalar,
+    aten.pow.Tensor_Scalar_out,
+    aten.pow.Tensor_Tensor,
+    aten.pow.Tensor_Tensor_out,
+    aten.pow_.Scalar,
+    aten.pow_.Tensor,
+    aten.reciprocal.default,
+    aten.reciprocal.out,
+    aten.reciprocal_.default,
+    aten.rad2deg.default,
+    aten.rad2deg.out,
+    aten.rad2deg_.default,
+    aten.relu.default,
+    aten.relu_.default,
+    aten.remainder.Scalar,
+    aten.remainder.Scalar_Tensor,
+    aten.remainder.Scalar_out,
+    aten.remainder.Tensor,
+    aten.remainder.Tensor_out,
+    aten.remainder_.Scalar,
+    aten.remainder_.Tensor,
+    aten.round.decimals,
+    aten.round.decimals_out,
+    aten.round.default,
+    aten.round.out,
+    aten.round_.decimals,
+    aten.round_.default,
+    aten.rsqrt.default,
+    aten.rsqrt.out,
+    aten.rsqrt_.default,
+    aten.rsub.Scalar,
+    aten.sgn.default,
+    aten.sgn.out,
+    aten.sgn_.default,
+    aten.sigmoid.default,
+    aten.sigmoid.out,
+    aten.sigmoid_.default,
+    aten.sign.default,
+    aten.sign.out,
+    aten.sign_.default,
+    aten.signbit.default,
+    aten.signbit.out,
+    aten.sin.default,
+    aten.sin.out,
+    aten.sin_.default,
+    aten.sinc.default,
+    aten.sinc.out,
+    aten.sinc_.default,
+    aten.sinh.default,
+    aten.sinh.out,
+    aten.sinh_.default,
+    aten.sqrt.default,
+    aten.sqrt.out,
+    aten.sqrt_.default,
+    aten.square.default,
+    aten.square.out,
+    aten.square_.default,
+    aten.sub.Scalar,
+    aten.sub.Tensor,
+    aten.sub.out,
+    aten.sub_.Scalar,
+    aten.sub_.Tensor,
+    aten.tan.default,
+    aten.tan.out,
+    aten.tan_.default,
+    aten.tanh.default,
+    aten.tanh.out,
+    aten.tanh_.default,
+    aten.true_divide.Tensor,
+    aten.trunc.default,
+    aten.trunc.out,
+    aten.trunc_.default,
+    aten.where.self,
+    aten.xlogy.OutScalar_Self,
+    aten.xlogy.OutScalar_Other,
+    aten.xlogy.OutTensor,
+    aten.xlogy.Scalar_Other,
+    aten.xlogy.Scalar_Self,
+    aten.xlogy.Tensor,
+    aten.xlogy_.Scalar_Other,
+    aten.xlogy_.Tensor,
     # backward point-wise ops
     # please keep the entries below alphabetically sorted
-    "aten.gelu_backward.default",
-    "aten.sigmoid_backward.default",
-    "aten.tanh_backward.default",
-    "aten.threshold_backward.default",
+    aten.gelu_backward.default,
+    aten.sigmoid_backward.default,
+    aten.tanh_backward.default,
+    aten.threshold_backward.default,
 ]
 
 
 for op in linear_pointwise_ops:
-    DTensor._op_to_rules[op] = linear_pointwise_rule
+    register_prop_rule(op)(linear_pointwise_rule)
 
 
 for op in pointwise_ops:
-    DTensor._op_to_rules[op] = pointwise_rule
+    register_prop_rule(op)(pointwise_rule)
 
 
 def _register_non_deterministic_op(op):
@@ -399,7 +400,6 @@ def non_deterministic_rule(op_schema: OpSchema) -> OutputSharding:
             return OutputSharding(self_spec)
 
 
-_register_non_deterministic_op("aten.native_dropout.default")
-_register_non_deterministic_op("aten.uniform_.default")
-_register_non_deterministic_op("aten.normal_.default")
-_register_non_deterministic_op("aten.kaiming_uniform_.default")
+_register_non_deterministic_op(aten.native_dropout.default)
+_register_non_deterministic_op(aten.uniform_.default)
+_register_non_deterministic_op(aten.normal_.default)
diff --git a/torch/distributed/_tensor/ops/tensor_ops.py b/torch/distributed/_tensor/ops/tensor_ops.py
index 4ab57bbe2699..de7a79ad1a45 100644
--- a/torch/distributed/_tensor/ops/tensor_ops.py
+++ b/torch/distributed/_tensor/ops/tensor_ops.py
@@ -2,18 +2,20 @@
 from typing import cast, List, Optional, Sequence, Tuple
 
 import torch
+
 from torch.distributed._tensor.api import (
     _Partial,
-    DTensor,
     DTensorSpec,
     Placement,
     Replicate,
     Shard,
 )
-from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
-from torch.distributed._tensor.ops.common_rules import pointwise_rule
-from torch.distributed._tensor.ops.utils import register_prop_rule
+from torch.distributed._tensor.op_schema import OpSchema, OutputSharding
+from torch.distributed._tensor.ops.common_rules import einop_rule, pointwise_rule
+from torch.distributed._tensor.ops.utils import register_prop_rule, normalize_dim
+
 
+aten = torch.ops.aten
 
 # NOTE: the default propagation rule should apply for
 # any operator that does not return a DTensor, i.e.
@@ -37,15 +39,14 @@ def prop_create_like(op_schema: OpSchema) -> OutputSharding:
         placements=tuple(
             Replicate() if isinstance(p, _Partial) else p for p in input_spec.placements
         ),
-        ndim=input_spec.ndim,
-        shape=input_spec.shape,
     )
     return OutputSharding(output_spec=output_spec)
 
 
-# some tensor ops should not support shard, i.e. local_scalar_dense
-# shouldn't work for shard as it requires numel == 1
+@register_prop_rule(aten._local_scalar_dense.default)
 def no_shard_prop_rule(op_schema: OpSchema) -> OutputSharding:
+    # some tensor ops should not support shard, i.e. local_scalar_dense
+    # shouldn't work for shard as it requires numel == 1
     # by default prop the first arg spec
     tensor_spec = op_schema.args_spec[0]
     for placement in tensor_spec.placements:
@@ -56,68 +57,69 @@ def no_shard_prop_rule(op_schema: OpSchema) -> OutputSharding:
                 f"with `Shard`, but found placements: "
                 f"{tensor_spec.placements}",
             )
-    # otherwise default prop the first arg spec
-    return OutputSharding(tensor_spec)
+    # otherwise default prop as None as it would not return
+    # a DTensor
+    return OutputSharding(None)
 
 
 def new_factory_rule(op_schema: OpSchema) -> OutputSharding:
     # this op would benefit from backward sharding propagation!
     # Since we cannot do that yet, just return replicated
     input = op_schema.args_schema[0]
-    size = torch.Size(cast(Sequence[int], op_schema.args_schema[1]))
     assert isinstance(input, DTensorSpec)
 
     return OutputSharding(
         output_spec=DTensorSpec(
             mesh=input.mesh,
             placements=[Replicate()] * input.mesh.ndim,
-            shape=size,
-            ndim=len(size),
+            tensor_meta=input.tensor_meta
         )
     )
 
 
+@register_prop_rule(aten.is_same_size.default)
+def non_tensor_prop_rule(op_schema: OpSchema) -> OutputSharding:
+    # simply return None as it does not return DTensor
+    return OutputSharding(
+        output_spec=None
+    )
+
+
 default_prop_ops = [
-    "aten._to_copy.default",
-    "aten.clone.default",
-    "aten.contiguous.default",
-    "aten.copy_.default",
-    "aten.detach.default",
-    "aten.is_same_size.default",
-    "aten.new_empty_strided.default",
+    aten._to_copy.default,
+    aten.clone.default,
+    aten.contiguous.default,
+    aten.copy_.default,
+    aten.detach.default,
+    aten.new_empty_strided.default,
 ]
 
 create_like_ops = [
-    "aten.empty_like.default",
-    "aten.fill_.Scalar",
-    "aten.full_like.default",
-    "aten.ones_like.default",
-    "aten.zero_.default",
-    "aten.zeros_like.default",
+    aten.empty_like.default,
+    aten.fill_.Scalar,
+    aten.full_like.default,
+    aten.ones_like.default,
+    aten.zero_.default,
+    aten.zeros_like.default,
 ]
 
 new_factory_ops = [
-    "aten.new_full.default",
-    "aten.new_ones.default",
-    "aten.new_zeros.default",
+    aten.new_full.default,
+    aten.new_ones.default,
+    aten.new_zeros.default,
 ]
 
-no_shard_prop_ops = ["aten._local_scalar_dense.default"]
-
 for op in default_prop_ops:
-    DTensor._op_to_rules[op] = default_prop_rule
+    register_prop_rule(op)(default_prop_rule)
 
 for op in create_like_ops:
-    DTensor._op_to_rules[op] = prop_create_like
-
-for op in no_shard_prop_ops:
-    DTensor._op_to_rules[op] = no_shard_prop_rule
+    register_prop_rule(op)(prop_create_like)
 
 for op in new_factory_ops:
-    DTensor._op_to_rules[op] = new_factory_rule
+    register_prop_rule(op)(new_factory_rule)
 
 
-@register_prop_rule("aten.bucketize.Tensor")
+@register_prop_rule(aten.bucketize.Tensor)
 def prop_bucketize(op_schema: OpSchema) -> OutputSharding:
     """
     Point-wise on the first input (just propagate input sharding).
@@ -140,8 +142,7 @@ def prop_bucketize(op_schema: OpSchema) -> OutputSharding:
                         DTensorSpec(
                             mesh=boundaries.mesh,
                             placements=[Replicate()] * len(boundaries.placements),
-                            ndim=boundaries.ndim,
-                            shape=boundaries.shape,
+                            tensor_meta=boundaries.tensor_meta,
                         ),
                     ),
                     kwargs_schema=op_schema.kwargs_schema,
@@ -160,9 +161,14 @@ def unshard_tensor_dim(
     )
 
 
-def _prop_all_but_dim(
-    op_schema: OpSchema, dim: int, out_shape: torch.Size
-) -> OutputSharding:
+def is_tensor_dim_sharded(
+    spec: DTensorSpec, dim: int
+) -> bool:
+    """Return True if tensor dim is sharded"""
+    return (dim < spec.ndim) and spec.dim_map[dim] >= 0
+
+
+def _prop_all_but_dim(op_schema: OpSchema, dim: int) -> OutputSharding:
     """
     Considering an op that takes its input as first argument, forwards all shardings
     except for the given dimension.
@@ -174,8 +180,6 @@ def _prop_all_but_dim(
     output_spec = DTensorSpec(
         mesh=input_spec.mesh,
         placements=output_placements,
-        shape=out_shape,
-        ndim=input_spec.ndim,
     )
 
     if input_spec.placements == output_placements:
@@ -184,8 +188,7 @@ def _prop_all_but_dim(
         suggested_input_spec = DTensorSpec(
             mesh=input_spec.mesh,
             placements=output_placements,
-            ndim=input_spec.ndim,
-            shape=input_spec.shape,
+            tensor_meta=input_spec.tensor_meta
         )
         out = OutputSharding(
             output_spec=None,
@@ -200,7 +203,7 @@ def _prop_all_but_dim(
     return out
 
 
-@register_prop_rule("aten.slice.Tensor")
+@register_prop_rule(aten.slice.Tensor)
 def prop_slice(op_schema: OpSchema) -> OutputSharding:
     """NOTE: can be further optimized (right now it replicates before slicing on a sharded dimension)"""
     defaults = (None, 0, None, None, 1)
@@ -230,18 +233,10 @@ def prop_slice(op_schema: OpSchema) -> OutputSharding:
     if start == 0 and end == input_spec.shape[dim] and step == 1:
         return OutputSharding(output_spec=input_spec)
 
-    # shape propagation
-    slice_len = (end - start + step - 1) // step
-    out_shape = torch.Size(
-        tuple(input_spec.shape[0:dim])
-        + (slice_len,)
-        + tuple(input_spec.shape[dim + 1 :])
-    )
-
-    return _prop_all_but_dim(op_schema, dim=dim, out_shape=out_shape)
+    return _prop_all_but_dim(op_schema, dim=dim)
 
 
-@register_prop_rule("aten.slice_scatter.default")
+@register_prop_rule(aten.slice_scatter.default)
 def prop_slice_scatter(op_schema: OpSchema) -> OutputSharding:
     # 1. number of dimensions in input and src need to match.
     # 2. number of elements on all non-dim need to match between input and src.
@@ -279,8 +274,6 @@ def prop_slice_scatter(op_schema: OpSchema) -> OutputSharding:
             output_spec=DTensorSpec(
                 mesh=input.mesh,
                 placements=input.placements,
-                shape=input.shape,
-                ndim=input.ndim,
             )
         )
     else:
@@ -294,14 +287,12 @@ def prop_slice_scatter(op_schema: OpSchema) -> OutputSharding:
                         DTensorSpec(
                             mesh=input.mesh,
                             placements=input_suggestion,
-                            shape=input.shape,
-                            ndim=input.ndim,
+                            tensor_meta=input.tensor_meta,
                         ),
                         DTensorSpec(
                             mesh=src.mesh,
                             placements=input_suggestion,
-                            shape=src.shape,
-                            ndim=src.ndim,
+                            tensor_meta=src.tensor_meta,
                         ),
                     )
                     + op_schema.args_schema[2:],
@@ -311,7 +302,7 @@ def prop_slice_scatter(op_schema: OpSchema) -> OutputSharding:
         )
 
 
-@register_prop_rule("aten.index_select.default")
+@register_prop_rule(aten.index_select.default)
 def prop_index_select(op_schema: OpSchema) -> OutputSharding:
     values_spec, dim, indices_spec = op_schema.args_schema
 
@@ -342,7 +333,7 @@ def prop_index_select(op_schema: OpSchema) -> OutputSharding:
     return result
 
 
-@register_prop_rule("aten.index.Tensor")
+@register_prop_rule(aten.index.Tensor)
 def prop_index(op_schema: OpSchema) -> OutputSharding:
     """
     Expect replicated on the first input; _mostly_ pointwise on the second input.
@@ -392,7 +383,7 @@ def prop_index(op_schema: OpSchema) -> OutputSharding:
         assert isinstance(indices_output_spec, DTensorSpec)
         indices_spec = indices_output_spec
 
-    lookup_dims = set(v[0] for v in valid_indices_spec)
+    lookup_dims = {v[0] for v in valid_indices_spec}
 
     need_reshard_on_values = tuple(
         (isinstance(vp, Shard) and (vp.dim in lookup_dims or isinstance(ip, Shard)))
@@ -400,9 +391,7 @@ def prop_index(op_schema: OpSchema) -> OutputSharding:
     )
 
     if not need_reshard_on_indices and not any(need_reshard_on_values):
-
         value_placements = values_spec.placements
-        value_shape = values_spec.shape
 
         all_dims_consecutive = all(
             b[0] - a[0] == 1
@@ -434,18 +423,10 @@ def place(vp: Placement, ip: Placement) -> Placement:
             place(vp, ip)
             for vp, ip in zip(values_spec.placements, indices_spec.placements)
         )
-        value_shape = torch.Size(
-            tuple(value_shape[:insert_dim])
-            + tuple(indices_spec.shape)
-            + tuple(value_shape[insert_dim + len(valid_indices_spec) :])
-        )
-
         result = OutputSharding(
             output_spec=DTensorSpec(
                 mesh=values_spec.mesh,
                 placements=value_placements,
-                shape=value_shape,
-                ndim=len(value_shape),
             )
         )
         return result
@@ -462,8 +443,7 @@ def place(vp: Placement, ip: Placement) -> Placement:
                                 Replicate() if need_reshard_on_values[i] else v
                                 for i, v in enumerate(values_spec.placements)
                             ],
-                            ndim=values_spec.ndim,
-                            shape=values_spec.shape,
+                            tensor_meta=values_spec.tensor_meta,
                         ),
                         multi_indices_spec,
                     ),
@@ -472,3 +452,179 @@ def place(vp: Placement, ip: Placement) -> Placement:
             ],
         )
         return result
+
+
+@register_prop_rule(aten.cat.default)
+def cat_rule(op_schema: OpSchema) -> OutputSharding:
+    # the first arg is a list of input tensors' specs
+    tensor_list_specs = cast(List[DTensorSpec], op_schema.args_schema[0])
+    # ndim will also be the result's ndim
+    ndim = 1
+    for spec in tensor_list_specs:
+        ndim = max(ndim, spec.ndim)
+
+    dim = 0  # default dim = 0
+    if (len(op_schema.args_schema) > 1):
+        dim = cast(int, op_schema.args_schema[1])
+    dim = normalize_dim(dim, ndim)
+
+    # Unshard all input tensors on cat dim before running einop rule
+    # to avoid _Partial in result.
+    need_reshard = False
+    tensor_list_specs_after = []
+    for spec in tensor_list_specs:
+        if is_tensor_dim_sharded(spec, dim=dim):
+            need_reshard = True
+            tensor_list_specs_after.append(
+                DTensorSpec(
+                    mesh=spec.mesh,
+                    placements=unshard_tensor_dim(spec.placements, dim=dim),
+                    tensor_meta=spec.tensor_meta,
+                )
+            )
+        else:
+            tensor_list_specs_after.append(spec)
+    tensor_list_specs = tensor_list_specs_after
+
+    # TODO: currently einop rule requires every character
+    # in result notation must have appeared in inputs
+    # so we temporarily design cat notation as
+    # "aij,bij->aij". Once we modify this requirement,
+    # we can switch to the more logically reasonable notation
+    # "aij,bij->cij"
+    alphabet = "abcdefghijklmnopqrstuvwxyz"
+    einop_notation_list = []
+
+    l = len(tensor_list_specs)
+    free_dim = alphabet[l:l + ndim - 1]
+    for i, spec in enumerate(tensor_list_specs):
+        if spec.ndim == ndim:
+            # rewrite concat dim
+            dim_word = free_dim[:dim] + alphabet[i] + free_dim[dim:]
+            einop_notation_list.append(dim_word)
+        else:
+            einop_notation_list.append(alphabet[i])
+
+    cat_dim_char = alphabet[0]
+    dim_word = free_dim[:dim] + cat_dim_char + free_dim[dim:]
+    einop_equation = f"{','.join(einop_notation_list)}->{dim_word}"
+    output_sharding = einop_rule(
+        einop_equation,
+        OpSchema(
+            func_schema=op_schema.func_schema,
+            args_schema=tuple(tensor_list_specs),
+            kwargs_schema={},
+        ),
+        linearity=False
+    )
+
+    if (
+        (output_sharding.output_spec is not None) and
+        need_reshard
+    ):
+        output_sharding.output_spec = None
+        output_sharding.schema_suggestions = [
+            OpSchema(
+                func_schema=op_schema.func_schema,
+                args_schema=tuple(tensor_list_specs),
+                kwargs_schema={},
+            ),
+        ]
+
+    if output_sharding.output_spec is None:
+        if output_sharding.schema_suggestions is not None:
+            # Convert args_schema from a tuple of DTensorSpec into a list
+            return _update_schema_suggestion_for_cat(
+                output_sharding,
+                op_schema,
+            )
+        else:
+            return output_sharding
+
+    return output_sharding
+
+
+def _update_schema_suggestion_for_cat(
+    output_sharding: OutputSharding,
+    op_schema: OpSchema,
+) -> OutputSharding:
+    assert output_sharding.schema_suggestions is not None
+    suggestion_specs = output_sharding.schema_suggestions[0].args_spec
+
+    args_schema = (suggestion_specs,) + op_schema.args_schema[1:]
+
+    output_sharding.schema_suggestions = [
+        OpSchema(
+            func_schema=op_schema.func_schema,
+            args_schema=args_schema,
+            kwargs_schema=op_schema.kwargs_schema,
+        )
+    ]
+    return output_sharding
+
+
+@register_prop_rule([aten.split.Tensor, aten.split_with_sizes.default])
+def split_rule(op_schema: OpSchema) -> OutputSharding:
+    output_spec_list: List[DTensorSpec] = []
+    input_spec = cast(DTensorSpec, op_schema.args_schema[0])
+    ndim = input_spec.ndim
+    split_size_or_sections = op_schema.args_schema[1]
+    dim = (
+        cast(int, op_schema.args_schema[2])
+        if len(op_schema.args_schema) > 2
+        else 0
+    )
+    dim = normalize_dim(dim, ndim)
+
+    # TODO: tensor to split cannot have _Partial
+    # in its placements for now. Will need to
+    # support in future.
+    if input_spec.sums:
+        raise NotImplementedError(
+            f"splitting distributed tensor with "
+            f"_Partial placement is not implemented!\n"
+            f"DTensorSpec={input_spec}"
+        )
+
+    # TODO: just like slice op, split replicates before
+    # splitting on a sharded dimension
+    need_reshard = False
+    if is_tensor_dim_sharded(input_spec, dim=dim):
+        need_reshard = True
+        input_spec = DTensorSpec(
+            mesh=input_spec.mesh,
+            placements=unshard_tensor_dim(input_spec.placements, dim=dim),
+            tensor_meta=input_spec.tensor_meta,
+        )
+
+    if need_reshard:
+        return OutputSharding(
+            None,
+            schema_suggestions=[
+                OpSchema(
+                    func_schema=op_schema.func_schema,
+                    args_schema=(input_spec,) + op_schema.args_schema[1:],
+                    kwargs_schema=op_schema.kwargs_schema,
+                ),
+            ]
+        )
+
+    def size_split(N, i):
+        # Last chunk will be smaller if the tensor size N
+        # along the given dimension dim is not divisible by i.
+        assert i > 0
+        return [i] * (N // i) + ([N % i] if N % i != 0 else [])
+
+    output_size_list = (
+        size_split(input_spec.shape[dim], split_size_or_sections)
+        if isinstance(split_size_or_sections, int)
+        else split_size_or_sections
+    )
+    output_spec_list = [
+        DTensorSpec(
+            mesh=input_spec.mesh,
+            placements=input_spec.placements,
+        )
+        for _ in range(len(output_size_list))
+    ]
+    return OutputSharding(output_spec_list)
diff --git a/torch/distributed/_tensor/ops/tp_sharding_ops.py b/torch/distributed/_tensor/ops/tp_sharding_ops.py
deleted file mode 100644
index 59964751ed2c..000000000000
--- a/torch/distributed/_tensor/ops/tp_sharding_ops.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates
-# implement matrix related ops for distributed tensor
-from typing import List
-
-import torch
-import torch.utils._pytree as pytree
-from torch.distributed._tensor.api import DTensor
-from torch.distributed._tensor.ops.utils import register_impl, unwrap_single_placement
-from torch.distributed._tensor.utils import unwrap_local_tensor
-
-"""
-The ops below were quickly hacked and needed to be polished down the road.
-Although they come with unit tests already, the logic is directly borrowed
-from ShardedTensor. We need to also make it work for all placement types
-of DTensor and all corner cases for sharded distributed tensor.
-"""
-
-
-@register_impl("aten.cat.default")
-def dist_cat(tensor_list: List[DTensor], dim: int = 0) -> DTensor:
-    local_inputs = pytree.tree_map(unwrap_local_tensor, tensor_list)
-    local_tensor = torch.ops.aten.concat(local_inputs, dim=dim)
-    return DTensor.from_local(
-        local_tensor,
-        tensor_list[0].device_mesh,
-        tensor_list[0].placements,
-        run_check=False,
-    )
-
-
-@register_impl("aten.split.Tensor")
-# pyre-fixme[2]: Parameter must be annotated.
-def dist_split(self: DTensor, split_size_or_sections, dim=0) -> List[DTensor]:
-    local_mat = pytree.tree_map(unwrap_local_tensor, self)
-    mat_placement = pytree.tree_map(unwrap_single_placement, self)
-    sharding_dim = mat_placement.dim
-    world_size = self.device_mesh.size(dim=0)
-    if dim < 0:
-        dim = self.dim() + dim
-    if sharding_dim < 0:
-        sharding_dim = self.dim() + sharding_dim
-    if dim == sharding_dim:
-        if type(split_size_or_sections) is list:
-            split_size_or_sections[sharding_dim] //= world_size
-        else:
-            split_size_or_sections //= world_size
-    tensor_list = local_mat.split(split_size_or_sections, dim=dim)
-    return [
-        DTensor.from_local(
-            tensor,
-            self.device_mesh,
-            [mat_placement],
-            run_check=False,
-        )
-        for tensor in tensor_list
-    ]
diff --git a/torch/distributed/_tensor/ops/utils.py b/torch/distributed/_tensor/ops/utils.py
index 107fdc912d6d..e7e06ade4c22 100644
--- a/torch/distributed/_tensor/ops/utils.py
+++ b/torch/distributed/_tensor/ops/utils.py
@@ -33,12 +33,14 @@ def wrapper(impl):
 # convenient wrapper to register sharding propagation rules
 # pyre-fixme[3]: Return type must be annotated.
 # pyre-fixme[2]: Parameter must be annotated.
-def register_prop_rule(func):
+def register_prop_rule(op):
     # pyre-fixme[53]: Captured variable `func` is not annotated.
     # pyre-fixme[3]: Return type must be annotated.
     # pyre-fixme[2]: Parameter must be annotated.
     def wrapper(impl):
-        DTensor._op_to_rules[func] = impl
+        overloads = op if isinstance(op, list) else [op]
+        for overload in overloads:
+            DTensor._propagator.register_sharding_prop_rule(overload, impl)
         return impl
 
     return wrapper
diff --git a/torch/distributed/_tensor/ops/view_ops.py b/torch/distributed/_tensor/ops/view_ops.py
index 5ec84b6e8b82..ea04dfdef4c5 100644
--- a/torch/distributed/_tensor/ops/view_ops.py
+++ b/torch/distributed/_tensor/ops/view_ops.py
@@ -3,9 +3,10 @@
 from typing import Callable, cast, Dict, Iterable, Optional, Sequence, Set, Tuple, Union
 
 import torch
+
 from torch import Tensor
 from torch.distributed._tensor.api import Shard
-from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
+from torch.distributed._tensor.op_schema import OpSchema, OutputSharding
 from torch.distributed._tensor.ops.utils import (
     normalize_dim,
     normalize_dims,
@@ -15,6 +16,7 @@
 
 from torch.distributed._tensor.placement_types import DTensorSpec, Placement, Replicate
 
+aten = torch.ops.aten
 
 Shape = Tuple[int, ...]
 
@@ -368,7 +370,7 @@ def dim_transpose(ndim: int, dim1: int, dim2: int) -> DimMap:
     dim2 = normalize_dim(dim2, ndim)
     assert dim1 < ndim
     assert dim2 < ndim
-    dimmap = list(InputDim(i) for i in range(ndim))
+    dimmap = [InputDim(i) for i in range(ndim)]
     swapdim = dimmap[dim1]
     dimmap[dim1] = dimmap[dim2]
     dimmap[dim2] = swapdim
@@ -478,7 +480,7 @@ def propagate_shape_and_sharding(
       if the leftmost split size is divisible by the mesh dimension
     """
     assert len(in_shard) == len(mesh_sizes)
-    sharded_in_dims: Set[int] = set(s.dim for s in in_shard if isinstance(s, Shard))
+    sharded_in_dims: Set[int] = {s.dim for s in in_shard if isinstance(s, Shard)}
     # for each input dim, for each mesh dim, provides a list of possible shardable dimensions
     shardable_dims: torch.Tensor = torch.ones(
         (len(local_in_shape), len(mesh_sizes)), dtype=torch.bool
@@ -585,11 +587,11 @@ def get_dim_size(cmd: DimSpec) -> Tuple[int, Optional[InputDim]]:
 
 
 def register_prop_rule_map(
-    aten_op_name: str, local_op_name: Callable[..., torch.Tensor]
+    aten_op_overload: torch._ops.OpOverload, local_op_name: Callable[..., torch.Tensor]
 ) -> None:
     spec: Op = ops[local_op_name]
 
-    @register_prop_rule(aten_op_name)
+    @register_prop_rule(aten_op_overload)
     def reshape_prop(op_schema: OpSchema) -> OutputSharding:
         rules = spec.dim_map(*op_schema.args_schema, **op_schema.kwargs_schema)
         input_dtensor_spec = op_schema.args_schema[0]
@@ -612,12 +614,10 @@ def reshape_prop(op_schema: OpSchema) -> OutputSharding:
             output_dtensor_spec = DTensorSpec(
                 mesh=input_dtensor_spec.mesh,
                 placements=shard_out,
-                shape=torch.Size(global_out_shape),
-                ndim=len(global_out_shape),
             )
-            local_out_shape = output_dtensor_spec.local_shape
+            local_out_shape = output_dtensor_spec._local_shape_from_global_shape(list(global_out_shape))
 
-            # We only need the local shape to lower he call into the local op
+            # We only need the local shape to lower the call into the local op
             args = op_schema.args_schema
             shape_argnum = spec.shape_argnum
             if shape_argnum is not None:
@@ -649,8 +649,7 @@ def reshape_prop(op_schema: OpSchema) -> OutputSharding:
                             DTensorSpec(
                                 placements=suggested_placements,
                                 mesh=input_dtensor_spec.mesh,
-                                ndim=input_dtensor_spec.ndim,
-                                shape=input_dtensor_spec.shape,
+                                tensor_meta=input_dtensor_spec.tensor_meta,
                             ),
                         )
                         + op_schema.args_schema[1:],
@@ -660,13 +659,12 @@ def reshape_prop(op_schema: OpSchema) -> OutputSharding:
             )
 
 
-register_prop_rule_map("aten.squeeze.default", torch.squeeze)
-register_prop_rule_map("aten.squeeze.dim", torch.squeeze)
-register_prop_rule_map("aten.view.default", Tensor.view)
-register_prop_rule_map("aten.view.SymInt", Tensor.view)
-register_prop_rule_map("aten._unsafe_view.default", Tensor.view)
-register_prop_rule_map("aten.unsqueeze.default", torch.unsqueeze)
-register_prop_rule_map("aten.expand.default", Tensor.expand)
-register_prop_rule_map("aten.permute.default", torch.permute)
-register_prop_rule_map("aten.repeat.default", Tensor.repeat)
-register_prop_rule_map("aten.transpose.int", torch.transpose)
+register_prop_rule_map(aten.squeeze.default, torch.squeeze)
+register_prop_rule_map(aten.squeeze.dim, torch.squeeze)
+register_prop_rule_map(aten.view.default, Tensor.view)
+register_prop_rule_map(aten._unsafe_view.default, Tensor.view)
+register_prop_rule_map(aten.unsqueeze.default, torch.unsqueeze)
+register_prop_rule_map(aten.expand.default, Tensor.expand)
+register_prop_rule_map(aten.permute.default, torch.permute)
+register_prop_rule_map(aten.repeat.default, Tensor.repeat)
+register_prop_rule_map(aten.transpose.int, torch.transpose)
diff --git a/torch/distributed/_tensor/placement_types.py b/torch/distributed/_tensor/placement_types.py
index d420e8736656..b996658c4656 100644
--- a/torch/distributed/_tensor/placement_types.py
+++ b/torch/distributed/_tensor/placement_types.py
@@ -8,9 +8,10 @@
 from torch.distributed._spmd.comm_tensor import CommTensor
 
 from torch.distributed._tensor.device_mesh import DeviceMesh
+from torch.fx.passes.shape_prop import TensorMetadata
 
 
-class Placement(object):
+class Placement:
     # base class Placement type
 
     # convenient utils to check for placement types
@@ -110,20 +111,19 @@ def _shard_tensor(
         shard and scatter a tensor on a mesh dimension (use coordinate
         0 on the mesh dimension as source of truth)
         """
-        my_coordinate = mesh.get_coordinate_on_dim(mesh_dim)
+        my_coordinate = mesh.get_coordinate()
         num_chunks = mesh.size(dim=mesh_dim)
-        # TODO: what should happen if rank is not in the mesh?
-        # see issue https://github.com/pytorch/tau/pull/492
-        assert (
-            my_coordinate is not None
-        ), "Rank if not part of mesh"  # TODO: figure out behavior here
+        if my_coordinate is None:
+            # if rank is not part of mesh, we simply return an empty tensor
+            return tensor.new_empty(0, requires_grad=tensor.requires_grad)
+
         scatter_list, pad_idx = self._split_tensor(
             tensor, num_chunks, with_padding=True, contiguous=True
         )
-        output = torch.empty_like(scatter_list[my_coordinate])
+        output = torch.empty_like(scatter_list[my_coordinate[mesh_dim]])
         mesh.scatter(output, scatter_list, mesh_dim=mesh_dim)
 
-        if pad_idx != 0 and my_coordinate >= pad_idx:
+        if pad_idx != 0 and my_coordinate[mesh_dim] >= pad_idx:
             output = self._unpad_tensor(output)
         return output
 
@@ -137,7 +137,7 @@ def _reduce_shard_tensor(
         """
         reduce and scatter a tensor on a mesh dimension
         """
-        my_coordinate = mesh.get_coordinate_on_dim(mesh_dim)
+        my_coordinate = mesh.get_coordinate()
         num_chunks = mesh.size(dim=mesh_dim)
         # TODO: what should happen if rank is not in the mesh?
         # see issue https://github.com/pytorch/tau/pull/492
@@ -149,14 +149,14 @@ def _reduce_shard_tensor(
         )
         # wrap with comm tensor
         scattered_list = [CommTensor(t) for t in scattered_list]
-        output = torch.empty_like(scattered_list[my_coordinate])
+        output = torch.empty_like(scattered_list[my_coordinate[mesh_dim]])
         mesh.reduce_scatter(
             CommTensor(output),
             scattered_list,  # pyre-ignore[6]
             op=reduce_op,
             mesh_dim=mesh_dim,
         )
-        if pad_idx != 0 and my_coordinate >= pad_idx:
+        if pad_idx != 0 and my_coordinate[mesh_dim] >= pad_idx:
             output = self._unpad_tensor(output)
         return output
 
@@ -171,7 +171,7 @@ def _to_replicate_tensor(
         This function all_gather all shards and return a tensor that
         is replicated on the previously sharded mesh dimension
         """
-        my_coordinate = mesh.get_coordinate_on_dim(mesh_dim)
+        my_coordinate = mesh.get_coordinate()
         num_chunks = mesh.size(dim=mesh_dim)
         # TODO: what should happen if rank is not in the mesh?
         # see issue https://github.com/pytorch/tau/pull/492
@@ -180,7 +180,7 @@ def _to_replicate_tensor(
         ), "Rank if not part of mesh"  # TODO: figure out behavior here
         # check if it needs to pad input tensor before all_gather
         pad_idx = size[self.dim] % num_chunks
-        if pad_idx != 0 and my_coordinate >= pad_idx:
+        if pad_idx != 0 and my_coordinate[mesh_dim] >= pad_idx:
             local_tensor = self._pad_tensor(local_tensor).contiguous()
 
         gathered_list = []
@@ -234,6 +234,25 @@ def __hash__(self) -> int:
     def __repr__(self) -> str:
         return "Replicate()"
 
+    def _replicate_tensor(
+        self,
+        tensor: torch.Tensor,
+        mesh: DeviceMesh,
+        mesh_dim: int
+    ) -> torch.Tensor:
+        """
+        Replicate (broadcast) a torch.Tensor on a mesh dimension (use
+        the first coordinate on the mesh dimension as source of truth)
+        """
+        my_coordinate = mesh.get_coordinate()
+        if my_coordinate is None:
+            # if rank is not part of mesh, we simply return an empty tensor
+            return tensor.new_empty(0, requires_grad=tensor.requires_grad)
+
+        tensor = tensor.contiguous()
+        mesh.broadcast(tensor, mesh_dim=mesh_dim)
+        return tensor
+
 
 class _Partial(Placement):
     # This is a default partial placement with element-wise reduce op
@@ -285,34 +304,40 @@ def __repr__(self) -> str:
 
 # used internally to propagate the placements
 @dataclass
-class DTensorSpec(object):
+class DTensorSpec:
     mesh: DeviceMesh
     placements: Sequence[Placement]
-    # shape of the current dist tensor, this will be set upon
-    # construction of the DTensor, prop rule could read it, and
-    # would need to set in output spec when calculate the output
-    # sharding
-    shape: torch.Size
-    # ndim of the current dist tensor, if passed in, this would be
-    # validated with shape, if not passed in, will be generated from
-    # the shape
-    ndim: int = -1
-
-    def __post_init__(self) -> None:
-        if self.ndim == -1:
-            self.ndim = len(self.shape)
+
+    tensor_meta: Optional[TensorMetadata] = None
 
     def __hash__(self) -> int:
-        return hash((self.mesh, tuple(self.placements), self.shape))
+        # TODO: tensor meta should all be part of the hash function, but we only
+        # use shape for now, need to fix this later
+        if self.tensor_meta is not None:
+            return hash((self.mesh, tuple(self.placements), self.tensor_meta.shape))
+        else:
+            return hash((self.mesh, tuple(self.placements)))
 
     def __eq__(self, __o: object) -> bool:
         return (
             isinstance(__o, DTensorSpec)
             and self.mesh == __o.mesh
             and self.placements == __o.placements
-            and self.shape == __o.shape
+            and self.tensor_meta == __o.tensor_meta
         )
 
+    @property
+    def shape(self) -> torch.Size:
+        if self.tensor_meta is None:
+            raise ValueError("tensor_meta is not set")
+        return self.tensor_meta.shape
+
+    @property
+    def ndim(self) -> int:
+        if self.tensor_meta is None:
+            raise ValueError("tensor_meta is not set")
+        return len(self.tensor_meta.shape)
+
     @property
     def dim_map(self) -> List[int]:
         """
@@ -326,7 +351,7 @@ def dim_map(self) -> List[int]:
         For example, we have a dist tensor that have the shape of
         [18, 20, 30], and device_mesh([0, 1, 2, 3]), placements:
         [Shard(1)], the dim_map of this placement would be:
-        [-1, 1, -1]. This representation is pretty helpful during
+        [-1, 0, -1]. This representation is pretty helpful during
         sharding propagation where we could know exactly each
         tensor dimension is sharded or not.
 
@@ -363,30 +388,37 @@ def sums(self) -> List[int]:
             if placement.is_partial()
         ]
 
-    @property
-    def local_shape(self) -> Tuple[int, ...]:
-        """
-        Compute the shape of a local shard of the given DTensor on its current
-        coordinate of the mesh.
-        """
-        assert self.shape is not None, "DTensorSpec does not contain global shape."
-        local_shape = list(self.shape)  # start with global shape
+    def _local_shape_from_global_shape(
+        self, global_shape: List[int]
+    ) -> Tuple[int, ...]:
+        local_shape = global_shape  # start with global shape
+        ndim = len(global_shape)
         for idx, placement in enumerate(self.placements):
             mesh_dim_size = self.mesh.size(idx)
-            my_coordinate = self.mesh.get_coordinate_on_dim(idx)
+            my_coordinate = self.mesh.get_coordinate()
             assert my_coordinate is not None, "Rank not part of mesh!"
             if isinstance(placement, Shard):
                 shard_dim = placement.dim
                 assert (
-                    shard_dim < self.ndim
-                ), f"Sharding dim {shard_dim} greater than tensor ndim {self.ndim}"
+                    shard_dim < ndim
+                ), f"Sharding dim {shard_dim} greater than tensor ndim {ndim}"
                 local_shard_size, _ = placement._local_shard_size_on_dim(
-                    local_shape[shard_dim], mesh_dim_size, my_coordinate
+                    local_shape[shard_dim], mesh_dim_size, my_coordinate[idx]
                 )
                 assert isinstance(local_shard_size, int)
                 local_shape[shard_dim] = local_shard_size
+
         return tuple(local_shape)
 
+    @property
+    def local_shape(self) -> Tuple[int, ...]:
+        """
+        Compute the shape of a local shard of the given DTensor on its current
+        coordinate of the mesh.
+        """
+        assert self.tensor_meta is not None, "DTensorSpec does not contain tensor meta."
+        return self._local_shape_from_global_shape(list(self.tensor_meta.shape))
+
     @property
     def local_offsets(self) -> Tuple[int, ...]:
         """
@@ -394,23 +426,23 @@ def local_offsets(self) -> Tuple[int, ...]:
         global rank. This is mostly used by distributed checkpointing to know the
         exact offsets of the local shard.
         """
-        assert self.shape is not None, "DTensorSpec does not contain global shape."
-        local_offsets = [0] * self.ndim
-        local_shape = list(self.shape)
+        assert self.tensor_meta is not None, "DTensorSpec does not contain tensor meta."
+        local_offsets = [0] * len(self.tensor_meta.shape)
+        local_shape = list(self.tensor_meta.shape)
 
         for idx, placement in enumerate(self.placements):
             mesh_dim_size = self.mesh.size(idx)
-            my_coordinate = self.mesh.get_coordinate_on_dim(idx)
+            my_coordinate = self.mesh.get_coordinate()
             assert my_coordinate is not None, "Rank not part of mesh!"
             if isinstance(placement, Shard):
                 shard_dim = placement.dim
                 assert (
-                    shard_dim < self.ndim
-                ), f"Sharding dim {shard_dim} greater than tensor ndim {self.ndim}"
+                    shard_dim < len(local_shape)
+                ), f"Sharding dim {shard_dim} greater than tensor ndim {len(local_shape)}"
                 shard_size, shard_offset = placement._local_shard_size_on_dim(
                     local_shape[shard_dim],
                     mesh_dim_size,
-                    my_coordinate,
+                    my_coordinate[idx],
                     return_offset=True,
                 )
                 local_shape[shard_dim] = shard_size
@@ -423,7 +455,7 @@ def from_dim_map(
         mesh: DeviceMesh,
         dim_map: List[int],
         sums: List[int],
-        shape: torch.Size,
+        tensor_meta: Optional[TensorMetadata] = None,
     ) -> "DTensorSpec":
         """
         Construct a DTensorSpec from dim_map list and pending sum.
@@ -434,7 +466,7 @@ def from_dim_map(
                 tensor dimension, see `dim_map` property doc for details
             sums (List[int]): a list of integer that represents the dist tensor have
                 pending sum on which device mesh dimension.
-            shape (torch.Size): shape of the DTensor associated with this spec.
+            tensor meta (TensorMetadata): DTensor metadata
 
         Return:
             a class:`DTensorSpec` object
@@ -460,4 +492,4 @@ def from_dim_map(
                     )
                 placements[m] = Shard(i)
 
-        return cls(mesh, placements, shape=shape, ndim=len(dim_map))
+        return cls(mesh, placements, tensor_meta=tensor_meta)
diff --git a/torch/distributed/_tensor/redistribute.py b/torch/distributed/_tensor/redistribute.py
index b3ffa1b9ab74..92e6702bf4e1 100644
--- a/torch/distributed/_tensor/redistribute.py
+++ b/torch/distributed/_tensor/redistribute.py
@@ -86,7 +86,7 @@ def _redistribute_with_local_tensor(
     sorted_placements.sort(key=_replicate_then_shard)
 
     for i, (current, target) in sorted_placements:
-        my_coordinate = device_mesh.get_coordinate_on_dim(i)
+        my_coordinate = device_mesh.get_coordinate()
         num_chunks = device_mesh.size(dim=i)
         # TODO: what should happen if rank is not in the mesh?
         # see issue https://github.com/pytorch/tau/pull/492
@@ -131,7 +131,7 @@ def _redistribute_with_local_tensor(
                     with_padding=False,
                     contiguous=False,
                 )
-                new_local_tensor = shards[my_coordinate].clone()
+                new_local_tensor = shards[my_coordinate[i]].clone()
             else:
                 # NOTE: this case shouldn't hit _decompose_sharding, decompose sharding should
                 # decompose Shard(0) -> Shard(1) into Shard(0) -> Replicate -> Shard(1)
@@ -149,7 +149,7 @@ def _redistribute_with_local_tensor(
             if current.is_replicate():
                 # For replicate -> partial, we zero out all other ranks of the current mesh dim
                 # and leave only 1 rank have the data, to perform a "zero cost" reshard.
-                if my_coordinate is not None and my_coordinate != 0:
+                if my_coordinate[i] != 0:
                     new_local_tensor = local_tensor.zero_()
                 else:
                     new_local_tensor = local_tensor
@@ -188,8 +188,10 @@ def redistribute_dtensor(
         new_local_tensor,
         device_mesh,
         placements,
-        size=input.size(),
+        shape=input.size(),
+        dtype=input.dtype,
         requires_grad=local_tensor.requires_grad,
+        stride=input.stride()
     )
 
 
@@ -223,9 +225,9 @@ def backward(ctx, grad_output: "dtensor.DTensor"):  # type: ignore[override]
         # TODO: see if this make sense for all cases.
         target_placements: List[Placement] = []
         for current, target in zip(grad_output.placements, previous_placement):
-            if current.is_replicate() and target.is_partial():
+            if not current.is_partial() and target.is_partial():
                 # keep target placement to replicate instead of partial in this case
-                target_placements.append(current)
+                target_placements.append(Replicate())
             else:
                 target_placements.append(target)
 
diff --git a/torch/distributed/_tensor/sharding_prop.py b/torch/distributed/_tensor/sharding_prop.py
new file mode 100644
index 000000000000..02635b097482
--- /dev/null
+++ b/torch/distributed/_tensor/sharding_prop.py
@@ -0,0 +1,202 @@
+from typing import Callable, Dict, Tuple, Optional
+
+import torch
+import torch.distributed._tensor.api as dtensor
+from torch._subclasses import FakeTensorMode
+from torch.fx.experimental.proxy_tensor import get_isolated_graphmodule
+from torch._ops import OpOverload
+from torch.distributed._tensor.op_schema import OpSchema, OutputSharding, DTensorSpec
+from torch.utils._pytree import tree_map
+
+"""
+Print information on ops input shape and sharding for debugging purposes.
+"""
+_DEBUG_VERBOSE = False
+
+
+def unwrap_schema(e: object) -> object:
+    return e._spec if isinstance(e, dtensor.DTensor) else e
+
+
+class ShardingPropagator:
+    def __init__(self) -> None:
+        self.op_to_rules: Dict[OpOverload, Callable[[OpSchema], OutputSharding]] = {}
+
+    def register_sharding_prop_rule(
+        self, op_overload: OpOverload, rule_func: Callable[[OpSchema], OutputSharding]
+    ):
+        """
+        Register a sharding propagation rule for an operator.
+        """
+        self.op_to_rules[op_overload] = rule_func
+
+    def prepare_op_schema(
+        self,
+        op_call: OpOverload,
+        args: Tuple[object, ...],
+        kwargs: Dict[str, object]
+    ) -> OpSchema:
+        """
+        This unwrap the args/kwargs DTensor to DTensorSpec and pack them
+        into an OpSchema for sharding propagation usage.
+        """
+        args_schema = tree_map(unwrap_schema, args)
+        kwargs_schema = tree_map(unwrap_schema, kwargs)
+
+        op_schema = OpSchema(op_call._schema, args_schema, kwargs_schema)
+
+        if _DEBUG_VERBOSE and torch.distributed.get_rank() == 0:
+            print(f"OpSchema({op_schema})")
+            local_shapes = tree_map(
+                lambda t: t.to_local().shape if isinstance(t, dtensor.DTensor) else None,
+                args,
+            )
+            print(f"    local shapes: {local_shapes}")
+
+        return op_schema
+
+    def propagate_op_sharding(
+        self, op_overload: OpOverload, op_schema: OpSchema
+    ) -> OutputSharding:
+        """
+        Propagate the sharding for an operator given the op_schema.
+        """
+        # first we propagate the tensor metadata
+        output_node = self._propagate_tensor_meta(op_overload, op_schema)
+
+        # then we propagate the sharding
+        sharding_prop_func = self.op_to_rules.get(op_overload, None)
+
+        if sharding_prop_func is None:
+            # step 1. If there's not even one sharding rule
+            # implemented for the operator, we error out.
+            raise NotImplementedError(
+                f"Operator {op_overload} does not have a DistributedTensor rule registered."
+            )
+
+        # step 2. there's sharding propagation rule, run
+        # sharding propagation to get the output sharding
+        try:
+            output_sharding = sharding_prop_func(op_schema)
+        except Exception as e:
+            raise RuntimeError(
+                f"Sharding propagation failed on op {op_overload}.\n"
+                f"Input schema: {op_schema}.\n"
+                f"Error: {e}"
+            ) from e
+
+
+        # step 3. if can't get output_spec from sharding
+        # propagation (i.e. no rules apply for input
+        # placements), we return the output sharding
+        # with schema suggestions, which can be used to
+        # decide how to do redistribute on inputs
+        if output_sharding.output_spec is None:
+            if output_sharding.schema_suggestions is None:
+                if output_sharding.failed_reason is not None:
+                    raise RuntimeError(
+                        f"Sharding propagation failed on op {op_overload}!"
+                        f"Input schema: {op_schema}."
+                        f"Failed reason: {output_sharding.failed_reason}"
+                    )
+                else:
+                    # if both output spec and schema suggestions are None, it
+                    # means the operator return a non-tensor (scalar) value,
+                    # in this case we just return the suggestion with the original
+                    # input schema
+                    output_sharding.schema_suggestions = [op_schema]
+            else:
+                # we do auto redistribute on inputs if necessary
+                # to get an eligble input, which we will pick a
+                # schema suggestion base on the redistribute cost.
+                # For now we simply pick the first suggestion.
+                # TODO: implement full auto distribute with a
+                # simple cost estimation model
+                suggested_input_schema = output_sharding.schema_suggestions[0]
+                # run sharding propagation again with suggested schema
+                propagation_res = sharding_prop_func(suggested_input_schema)
+                # we set the output sharding with the new propagation result
+                # so that dispatching know both output_spec and schema_suggestions
+                # exist, which indicates a reshard is needed
+                output_sharding.output_spec = propagation_res.output_spec
+        else:
+            # if sharding propagation succeed, we set the schema suggestion to
+            # the default op_schema, which indicates no reshard is needed
+            output_sharding.schema_suggestions = [op_schema]
+
+        # associate the output sharding with the output metadata
+        if output_node is not None:
+            output_nodes = output_node.args[0]
+            output_spec = output_sharding.output_spec
+            if output_spec is not None:
+                assert isinstance(output_nodes, (tuple, list))
+                if isinstance(output_spec, DTensorSpec):
+                    output_spec.tensor_meta = output_nodes[0].meta['tensor_meta']
+                elif isinstance(output_spec, (tuple, list)):
+                    for i, spec in enumerate(output_spec):
+                        if isinstance(spec, DTensorSpec):
+                            spec.tensor_meta = output_nodes[i].meta['tensor_meta']
+
+        return output_sharding
+
+    def _propagate_tensor_meta(
+        self,
+        op_overload: OpOverload,
+        op_schema: OpSchema,
+    ) -> Optional[torch.fx.Node]:
+        # right now we only use the graph for metadata prop, but next we will use
+        # the graph to do sharding prop together
+
+        # special case op list, we don't need to propagate for local
+        # scalar. TODO: figure out a better way to handle this
+        skip_prop_list = [
+            torch.ops.aten._local_scalar_dense.default,
+            torch.ops.aten.equal.default
+        ]
+        if op_overload in skip_prop_list:
+            return None
+
+        # NOTE: We must call the tracing in fake tensor mode so that it
+        # avoids materializing memory
+        with FakeTensorMode():
+            fake_args = op_schema.gen_fake_args()
+            fake_kwargs = op_schema.gen_fake_kwargs()
+            g = get_isolated_graphmodule(op_overload, fake_args, fake_kwargs)
+
+        output = None
+        for node in g.graph.nodes:
+            if node.op == 'output':
+                output = node
+        return output
+
+
+class _CachingPropagator(ShardingPropagator):
+    """
+    A sharding propagator that caches the propagation results.
+    This is currently experimental for Tensor Parallel usage.
+    """
+
+    def __init__(self, op_to_rules=None) -> None:
+        super().__init__()
+        if op_to_rules is not None:
+            self.op_to_rules = op_to_rules
+
+        # cache table for sharding propagation results, we might need to
+        # limit the size of the cache table in the future
+        self.cached_prop_results: Dict[OpSchema, OutputSharding] = {}
+
+    def propagate_op_sharding(
+        self, op_overload: OpOverload, op_schema: OpSchema
+    ) -> OutputSharding:
+        """
+        Propagate the sharding for an operator given the op_schema.
+        Cache the propagation results to avoid running propagation again.
+        """
+        if op_schema in self.cached_prop_results:
+            return self.cached_prop_results[op_schema]
+        else:
+            # call DTensor's propagate_op_sharding to get the prop result
+            output_sharding = super().propagate_op_sharding(op_overload, op_schema)
+            # update cached table
+            self.cached_prop_results[op_schema] = output_sharding
+            return output_sharding
diff --git a/torch/distributed/_tensor/utils.py b/torch/distributed/_tensor/utils.py
deleted file mode 100644
index 7afd97753b9e..000000000000
--- a/torch/distributed/_tensor/utils.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates
-
-import torch
-
-import torch.distributed._tensor.api as dtensor
-
-def unwrap_local_tensor(e: "dtensor.DTensor") -> torch.Tensor:
-    return e._local_tensor if isinstance(e, dtensor.DTensor) else e
diff --git a/torch/distributed/_tools/memory_tracker.py b/torch/distributed/_tools/memory_tracker.py
index 5e7005a85063..165cc964d243 100644
--- a/torch/distributed/_tools/memory_tracker.py
+++ b/torch/distributed/_tools/memory_tracker.py
@@ -1,5 +1,7 @@
 from collections import defaultdict
 
+from itertools import chain
+
 import pickle
 
 from typing import (
@@ -16,6 +18,7 @@
 from torch.utils.hooks import RemovableHandle
 from torch.utils._python_dispatch import TorchDispatchMode
 
+
 BYTES_PER_MB = 1024 * 1024.0
 
 
@@ -83,6 +86,7 @@ def __init__(self) -> None:
         self._markers: Dict[str, int] = defaultdict(int)
         self._cur_module_name: str = ""
         self._op_index: int = 0
+        self._num_cuda_retries: int = 0
 
     @no_type_check
     def start_monitor(self, root_module: nn.Module) -> None:
@@ -116,7 +120,11 @@ def stop(self) -> None:
         """
         Remove module hooks and exit ``MemoryProfileDispatchMode`` to stop
         tracking memory stats at operator level.
+        Get some aggregated stats when the memory_tracker() is enabled, like
+        cuda ``num_alloc_retries``.
         """
+        self._num_cuda_retries = torch.cuda.memory_stats().get("num_alloc_retries", 0)
+
         for h in self._hooks:
             h.remove()
         self._hooks.clear()
@@ -138,6 +146,7 @@ def summary(self, top: int = 20) -> None:
             previous_allocated_memory = current_allocated_memory
 
         print("------------------------------------------------")
+        print(f"The number of cuda retries are: {self._num_cuda_retries}")
         print(f"Top {top} ops that generates memory are:")
         for k, v in sorted(op_diff.items(), key=lambda item: item[1], reverse=True)[
             :top
@@ -147,8 +156,6 @@ def summary(self, top: int = 20) -> None:
 
     @no_type_check
     def show_traces(self, path: str = "") -> None:
-        from itertools import chain
-
         import matplotlib.pyplot as plt
 
         def _plot_figure(x, y_values, labels):
@@ -206,6 +213,7 @@ def save_stats(self, path: str) -> None:
             "memories_active": self.memories_active,
             "memories_reserved": self.memories_reserved,
             "markers": self._markers,
+            "num_alloc_retries": self._num_cuda_retries,
         }
 
         with open(path, "wb") as f:
@@ -223,6 +231,7 @@ def load(self, path: str) -> None:
         self.memories_active = stats["memories_active"]
         self.memories_reserved = stats["memories_reserved"]
         self._markers = stats["markers"]
+        self._num_cuda_retries = stats["num_alloc_retries"]
 
     def _create_pre_forward_hook(self, name: str) -> Callable:
         """
@@ -305,3 +314,4 @@ def _clear_state(self) -> None:
         self._markers.clear()
         self._cur_module_name = ""
         self._op_index = 0
+        self._num_cuda_retries = 0
diff --git a/torch/distributed/algorithms/_comm_hooks/default_hooks.py b/torch/distributed/algorithms/_comm_hooks/default_hooks.py
index 5f3498ddc888..52acea85e9d6 100644
--- a/torch/distributed/algorithms/_comm_hooks/default_hooks.py
+++ b/torch/distributed/algorithms/_comm_hooks/default_hooks.py
@@ -3,7 +3,7 @@
 import torch.distributed as dist
 
 
-class DefaultState(object):
+class DefaultState:
     r"""
     Stores state needed to perform the default communication algorithm
     within a communication hook.
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/optimizer_overlap_hooks.py b/torch/distributed/algorithms/ddp_comm_hooks/optimizer_overlap_hooks.py
index e852b34c1b4c..ffa155fce552 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/optimizer_overlap_hooks.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/optimizer_overlap_hooks.py
@@ -10,7 +10,7 @@
 
 _FUNCTIONAL_OPTIM_STEP_METHOD_NAME = "step_param"
 
-class _OptimizerHookState(object):
+class _OptimizerHookState:
     """
     Holds state for running optimizer in-line after DDP communication hook.
     Currently contains only optimizer class which must have a method `step_param`.
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py
index 9cbeb80d59a1..36eeb85c5996 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py
@@ -8,7 +8,7 @@
 logger = logging.getLogger(__name__)
 
 
-class PostLocalSGDState(object):
+class PostLocalSGDState:
     r"""
     Stores the state for all-reducing gradients globally using ``process_group`` until step ``start_localSGD_iter``,
     and all-reducing gradients locally using ``subgroup`` afterwards.
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
index 42bfe6607c9e..7dc263b34789 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
@@ -112,7 +112,7 @@ def _report_compression_stats(bucket, state):
         state.next_stats_report = state.iter + state.compression_stats_logging_frequency
 
 
-class PowerSGDState(object):
+class PowerSGDState:
     r"""
     Stores both the algorithm's hyperparameters and the internal state for all the gradients during the training.
     Particularly, ``matrix_approximation_rank`` and ``start_powerSGD_iter`` are the main hyperparameters that should be tuned by the user.
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/quantization_hooks.py b/torch/distributed/algorithms/ddp_comm_hooks/quantization_hooks.py
index aaa0b9455ee8..e6afe5b831a3 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/quantization_hooks.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/quantization_hooks.py
@@ -68,7 +68,7 @@ def quantization_pertensor_hook(
 
     tensor = bucket.buffer()
 
-    myObserver = torch.quantization.MinMaxObserver().cuda(tensor.device)
+    myObserver = torch.ao.quantization.MinMaxObserver().cuda(tensor.device)
     myObserver(tensor)
 
     s, z = myObserver.calculate_qparams()
@@ -159,7 +159,7 @@ def quantization_perchannel_hook(
         .cuda(tensor.device)
     )
 
-    myPerChannelObserver = torch.quantization.PerChannelMinMaxObserver().cuda(
+    myPerChannelObserver = torch.ao.quantization.PerChannelMinMaxObserver().cuda(
         tensor.device
     )
     myPerChannelObserver(tensor_in_channels)
diff --git a/torch/distributed/algorithms/join.py b/torch/distributed/algorithms/join.py
index 0132e586e204..1c00b2ca2ea4 100644
--- a/torch/distributed/algorithms/join.py
+++ b/torch/distributed/algorithms/join.py
@@ -50,7 +50,7 @@ class Joinable(ABC):
     """
     @abstractmethod
     def __init__(self):
-        super(Joinable, self).__init__()
+        super().__init__()
         self._join_config = _JoinConfig.construct_disabled_join_config()
 
     @abstractmethod
diff --git a/torch/distributed/autograd/__init__.py b/torch/distributed/autograd/__init__.py
index c78d8c990187..e94ab1bb9d63 100644
--- a/torch/distributed/autograd/__init__.py
+++ b/torch/distributed/autograd/__init__.py
@@ -26,7 +26,7 @@ def is_available():
     )
 
 
-class context(object):
+class context:
     '''
     Context object to wrap forward and backward passes when using
     distributed autograd. The ``context_id`` generated in the ``with``
diff --git a/torch/distributed/benchmarks/benchmark_ddp_rpc.py b/torch/distributed/benchmarks/benchmark_ddp_rpc.py
index 6614d3969bfc..d8f5737d2d43 100644
--- a/torch/distributed/benchmarks/benchmark_ddp_rpc.py
+++ b/torch/distributed/benchmarks/benchmark_ddp_rpc.py
@@ -42,7 +42,7 @@ class HybridModel(torch.nn.Module):
    """
 
     def __init__(self, emb_rref_list, device):
-        super(HybridModel, self).__init__()
+        super().__init__()
         self.emb_rref_list = emb_rref_list
         fc1 = torch.nn.Linear(512, 256)
         fc2 = torch.nn.Linear(256, 128)
diff --git a/torch/distributed/checkpoint/__init__.py b/torch/distributed/checkpoint/__init__.py
index 0083d926f63f..c7e0bda81eff 100644
--- a/torch/distributed/checkpoint/__init__.py
+++ b/torch/distributed/checkpoint/__init__.py
@@ -19,3 +19,4 @@
     WriteItem,
 )
 from .default_planner import DefaultSavePlanner, DefaultLoadPlanner
+from .optimizer import load_sharded_optimizer_state_dict
diff --git a/torch/distributed/checkpoint/_nested_tensor.py b/torch/distributed/checkpoint/_sharded_tensor_utils.py
similarity index 87%
rename from torch/distributed/checkpoint/_nested_tensor.py
rename to torch/distributed/checkpoint/_sharded_tensor_utils.py
index 94ceaf5d4a52..8d39be25221a 100644
--- a/torch/distributed/checkpoint/_nested_tensor.py
+++ b/torch/distributed/checkpoint/_sharded_tensor_utils.py
@@ -29,12 +29,14 @@
 from .utils import _element_wise_add
 
 
-# TODO: update docstring for nested_tensor.py
-def flatten_sharded_tensors(state_dict: STATE_DICT_TYPE) -> STATE_DICT_TYPE:
-    """
-    Transform ``state_dict`` by flattening all nested ShardedTensor instances found.
+# TODO: We need to refactor this code.
+def _flatten_sharded_tensors(state_dict: STATE_DICT_TYPE) -> STATE_DICT_TYPE:
+    r"""
+    Transforms ``state_dict`` by flattening all nested ShardedTensor instances found.
+
     The resulting ShardedTensor instances are only correct regarding the local shard and
-    MUST not be used for any other purpose but checkpointing, no operator will work with them.
+    MUST not be used for any other purpose but checkpointing, as no operator will work with them.
+
     This function should be used in conjunction with a state_dict produced by FSDP's
     StateDictType.SHARDED_STATE_DICT methods.
     """
@@ -45,12 +47,13 @@ def rewrite_dict(path: OBJ_PATH, value: STATE_DICT_ITEM) -> None:
             set_element(new_state_dict, path, value)
             return
         shards = value.local_shards()
+
         if len(shards) == 0:
             return
         if len(shards) != 1:
-            raise ValueError(
-                f"Cannot handle outer tensor with more than 1 shard {path} -- {len(shards)}"
-            )
+            set_element(new_state_dict, path, value)
+            return
+
         outer_shard = shards[0]
 
         inner_st = outer_shard.tensor
@@ -80,13 +83,14 @@ def rewrite_dict(path: OBJ_PATH, value: STATE_DICT_ITEM) -> None:
 
         st_meta: ShardedTensorMetadata = copy.deepcopy(value.metadata())
         other_rank = 0 if dist.get_rank() > 0 else 1
+
         # Remove the outer ST shard the inner ST covers
         for i, shard_md in enumerate(st_meta.shards_metadata):
             if shard_md.shard_offsets == outer_shard.metadata.shard_offsets:
                 st_meta.shards_metadata.pop(i)
                 break
 
-        # blame other rank for the other shards
+        # Attribute other rank for the other shards
         for shard_md in st_meta.shards_metadata:
             shard_md.placement = _remote_device(f"rank:{other_rank}/cuda:0")
 
diff --git a/torch/distributed/checkpoint/default_planner.py b/torch/distributed/checkpoint/default_planner.py
index 0bb44fd05759..9b80e9b5e290 100644
--- a/torch/distributed/checkpoint/default_planner.py
+++ b/torch/distributed/checkpoint/default_planner.py
@@ -4,6 +4,7 @@
 import io
 import logging
 import operator
+from collections import ChainMap
 from functools import reduce
 from typing import List, Tuple, Dict, Any, Union, cast
 
@@ -11,6 +12,7 @@
 
 from torch.distributed._shard._utils import narrow_tensor_by_index
 from torch.distributed._shard.sharded_tensor import ShardedTensor
+from torch.distributed._tensor import DTensor
 
 
 from torch.distributed.checkpoint.planner import (
@@ -43,13 +45,12 @@
     FLATTEN_MAPPING,
     flatten_state_dict,
 )
-from torch.distributed.checkpoint._nested_tensor import flatten_sharded_tensors
-from torch.distributed.checkpoint._dedup_tensors import dedup_tensors
-from torch.distributed.checkpoint.utils import (
-    find_state_dict_object,
-    find_tensor_shard,
+from torch.distributed.checkpoint._sharded_tensor_utils import (
+    _flatten_sharded_tensors,
 )
-from torch.distributed.checkpoint._traverse import set_element, get_element
+from torch.distributed.checkpoint._dedup_tensors import dedup_tensors
+from torch.distributed.checkpoint.utils import find_state_dict_object
+from torch.distributed.checkpoint._traverse import set_element
 
 logger: logging.Logger = logging.getLogger(__file__)
 
@@ -65,27 +66,27 @@
 
 
 # TODO: Update docstrings for default_planner.py
-
-
 class DefaultSavePlanner(SavePlanner):
     mappings: FLATTEN_MAPPING
 
     def __init__(
         self,
-        flatten_state_dict: bool = False,
-        flatten_sharded_tensors: bool = False,
-        dedup_replicated_tensors: bool = False,
+        flatten_state_dict: bool = True,
+        flatten_sharded_tensors: bool = True,
+        dedup_replicated_tensors: bool = True,
     ) -> None:
         self.flatten_state_dict = flatten_state_dict
         self.flatten_sharded_tensors = flatten_sharded_tensors
         self.dedup_replicated_tensors = dedup_replicated_tensors
         self.mappings = {}
 
-    def init(self, state_dict: STATE_DICT_TYPE, is_coordinator: bool) -> None:
+    def set_up_planner(
+        self, state_dict: STATE_DICT_TYPE, is_coordinator: bool
+    ) -> None:
         if self.flatten_state_dict:
             state_dict, self.mappings = flatten_state_dict(state_dict)
         if self.flatten_sharded_tensors:
-            state_dict = flatten_sharded_tensors(state_dict)
+            state_dict = _flatten_sharded_tensors(state_dict)
         self.state_dict = state_dict
         self.is_coordinator = is_coordinator
 
@@ -108,9 +109,12 @@ def create_global_plan(
         global_plan, metadata = create_default_global_save_plan(all_plans)
 
         if self.flatten_state_dict:
-            merged_mappings = reduce(
-                lambda x, y: x | y, (p.planner_data for p in global_plan)
-            )
+            # | does not work for Python 3.8 or older version.
+            # merged_mappings = reduce(
+            #     lambda x, y: x | y, (p.planner_data for p in global_plan)
+            # )
+            planner_data_dict = [p.planner_data for p in global_plan]
+            merged_mappings = dict(ChainMap(*planner_data_dict))
             metadata = dataclasses.replace(
                 metadata, planner_data=merged_mappings
             )
@@ -165,25 +169,25 @@ class DefaultLoadPlanner(LoadPlanner):
 
     def __init__(
         self,
-        flatten_state_dict: bool = False,
-        flatten_sharded_tensors: bool = False,
+        flatten_state_dict: bool = True,
+        flatten_sharded_tensors: bool = True,
     ) -> None:
         self.flatten_state_dict = flatten_state_dict
         self.flatten_sharded_tensors = flatten_sharded_tensors
         self.original_state_dict = {}
         self.mappings = {}
 
-    def init(
+    def set_up_planner(
         self,
         state_dict: STATE_DICT_TYPE,
         metadata: Metadata,
         is_coordinator: bool,
     ) -> None:
-        if self.flatten_sharded_tensors:
-            state_dict = flatten_sharded_tensors(state_dict)
-
         self.original_state_dict = state_dict
 
+        if self.flatten_sharded_tensors:
+            state_dict = _flatten_sharded_tensors(state_dict)
+
         if self.flatten_state_dict:
             state_dict, self.mappings = flatten_state_dict(state_dict)
 
@@ -221,14 +225,7 @@ def lookup_tensor(self, index: MetadataIndex) -> torch.Tensor:
         """
         This is an extension from the planner interface to make it easy to extend the default planner
         """
-        if self.flatten_state_dict:
-            obj = get_element(
-                self.original_state_dict, self.mappings[index.fqn]
-            )
-            assert isinstance(obj, torch.Tensor)
-            return find_tensor_shard(obj, index)
-        else:
-            return find_state_dict_object(self.state_dict, index)
+        return find_state_dict_object(self.state_dict, index)
 
     def transform_tensor(self, read_item: ReadItem, tensor: torch.Tensor):
         """
@@ -285,7 +282,7 @@ def create_default_local_save_plan(
     """
     requests = []
     for fqn, obj in state_dict.items():
-        if isinstance(obj, ShardedTensor) or is_coordinator:
+        if isinstance(obj, (ShardedTensor, DTensor)) or is_coordinator:
             requests += _create_write_items(fqn, obj)
     return SavePlan(requests)
 
@@ -396,6 +393,7 @@ def _validate_global_plan(
             continue
         chunks_volume = 0
         for chunk_idx, chunk0 in enumerate(value.chunks):
+            # Compute the volume
             if not _check_box_bounds(value.size, chunk0):
                 logger.warning(
                     f"""
@@ -406,6 +404,7 @@ def _validate_global_plan(
                 all_good = False
             chunks_volume += reduce(operator.mul, chunk0.sizes, 1)
 
+            # Check for overlap
             for chunk1 in value.chunks[chunk_idx + 1 :]:
                 if _check_box_overlap(chunk0, chunk1):
                     logger.warning(
@@ -413,6 +412,7 @@ def _validate_global_plan(
                     )
                     all_good = False
 
+        # Check whether combined chunk cover the whole tensor
         tensor_volume = reduce(operator.mul, value.size, 1)
         if chunks_volume != tensor_volume:
             logger.warning(
diff --git a/torch/distributed/checkpoint/examples/fsdp_checkpoint_example.py b/torch/distributed/checkpoint/examples/fsdp_checkpoint_example.py
new file mode 100644
index 000000000000..7f3f54f2ff84
--- /dev/null
+++ b/torch/distributed/checkpoint/examples/fsdp_checkpoint_example.py
@@ -0,0 +1,131 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
+"""
+The following example demonstrates how to use Pytorch Distributed Checkpoint
+to save a FSDP model. This is the current recommended way to checkpoint FSDP.
+torch.save() and torch.load() is not recommended when checkpointing sharded models.
+"""
+
+import os
+import shutil
+
+import torch
+import torch.distributed as dist
+import torch.distributed.checkpoint as dist_cp
+import torch.multiprocessing as mp
+
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
+from torch.distributed.checkpoint.optimizer import (
+    load_sharded_optimizer_state_dict,
+)
+
+CHECKPOINT_DIR = f"/scratch/{os.environ['LOGNAME']}/checkpoint"
+
+
+def opt_at(opt, idx):
+    return list((opt.state.values()))[idx]
+
+
+def init_model():
+    model = FSDP(torch.nn.Linear(4, 4).cuda(dist.get_rank()))
+    optim = torch.optim.Adam(model.parameters(), lr=0.1)
+    model(torch.rand(4, 4)).sum().backward()
+    optim.step()
+
+    return model, optim
+
+
+def print_params(stage, model_1, model_2, optim_1, optim_2):
+    with FSDP.summon_full_params(model_1):
+        with FSDP.summon_full_params(model_2):
+            print(
+                f"{stage} --- rank: {dist.get_rank()}\n"
+                f"model.weight: {model_1.weight}\n"
+                f"model_2.weight:{model_2.weight}\n"
+                f"model.bias: {model_1.bias}\n"
+                f"model_2.bias: {model_2.bias}\n"
+            )
+
+    print(
+        f"{stage} --- rank: {dist.get_rank()}\n"
+        f"optim exp_avg:{opt_at(optim_1, 0)['exp_avg']}\n"
+        f"optim_2 exp_avg:{opt_at(optim_2, 0)['exp_avg']}\n"
+        f"optim exp_avg_sq:{opt_at(optim_1, 0)['exp_avg_sq']}\n"
+        f"optim_2 exp_avg_sq:{opt_at(optim_2, 0)['exp_avg_sq']}\n"
+    )
+
+
+def run_fsdp_checkpoint_example(rank, world_size):
+    # Set up world pg
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "12355"
+
+    # Initialize the process group
+    dist.init_process_group("nccl", rank=rank, world_size=world_size)
+    torch.cuda.set_device(rank)
+
+    # Create a model
+    model_1, optim_1 = init_model()
+
+    # Save the model to CHECKPOINT_DIR
+    with FSDP.state_dict_type(model_1, StateDictType.SHARDED_STATE_DICT):
+        state_dict = {
+            "model": model_1.state_dict(),
+            "optim": FSDP.optim_state_dict(model_1, optim_1),
+        }
+
+        dist_cp.save_state_dict(
+            state_dict=state_dict,
+            storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
+        )
+
+    # Create a second model
+    model_2, optim_2 = init_model()
+
+    # Print the model parameters for both models.
+    # Before loading, the parameters should be different.
+    print_params("Before loading", model_1, model_2, optim_1, optim_2)
+
+    # Load model_2 with parameters saved in CHECKPOINT_DIR
+    with FSDP.state_dict_type(model_2, StateDictType.SHARDED_STATE_DICT):
+        state_dict = {
+            "model": model_2.state_dict(),
+            # cannot load the optimizer state_dict together with the model state_dict
+        }
+
+        dist_cp.load_state_dict(
+            state_dict=state_dict,
+            storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
+        )
+        model_2.load_state_dict(state_dict["model"])
+
+        optim_state = load_sharded_optimizer_state_dict(
+            model_state_dict=state_dict["model"],
+            optimizer_key="optim",
+            storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
+        )
+
+        flattened_osd = FSDP.optim_state_dict_to_load(
+            optim_state["optim"], model_2, optim_2
+        )
+        optim_2.load_state_dict(flattened_osd)
+
+    # Print the model parameters for both models.
+    # After loading, the parameters should be the same.
+    print_params("After loading", model_1, model_2, optim_1, optim_2)
+
+    # Shut down world pg
+    dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    world_size = torch.cuda.device_count()
+    print(f"Running fsdp checkpoint example on {world_size} devices.")
+    shutil.rmtree(CHECKPOINT_DIR, ignore_errors=True)
+    mp.spawn(
+        run_fsdp_checkpoint_example,
+        args=(world_size,),
+        nprocs=world_size,
+        join=True,
+    )
diff --git a/torch/distributed/checkpoint/filesystem.py b/torch/distributed/checkpoint/filesystem.py
index 3d5ca4c8c2cf..2355f2d6f5bb 100644
--- a/torch/distributed/checkpoint/filesystem.py
+++ b/torch/distributed/checkpoint/filesystem.py
@@ -189,11 +189,9 @@ def values(self):
         while not self._done:
             drained = self._drain()
             self._refill()
-            for obj in drained:
-                yield obj
+            yield from drained
 
-        for val in self._finish():
-            yield val
+        yield from self._finish()
 
 
 def _item_size(item: WriteItem) -> int:
@@ -321,7 +319,7 @@ class FileSystemWriter(StorageWriter):
     def __init__(
         self,
         path: Union[str, os.PathLike],
-        single_file_per_rank: bool = False,
+        single_file_per_rank: bool = True,
         sync_files: bool = True,
         thread_count: int = 1,
         per_thread_copy_ahead: int = 10_000_000,
@@ -345,7 +343,7 @@ def __init__(
         self.thread_count = thread_count
         self.per_thread_copy_ahead = per_thread_copy_ahead
 
-    def init(self, is_coordinator: bool) -> None:
+    def set_up_storage_writer(self, is_coordinator: bool) -> None:
         pass
 
     def prepare_local_plan(self, plan: SavePlan) -> SavePlan:
@@ -513,7 +511,7 @@ def read_metadata(self) -> Metadata:
         with (self.path / ".metadata").open("rb") as metadata_file:
             return pickle.load(metadata_file)
 
-    def init(self, metadata: Metadata, is_coordinator: bool) -> None:
+    def set_up_storage_reader(self, metadata: Metadata, is_coordinator: bool) -> None:
         self.storage_data = metadata.storage_data
         assert self.storage_data is not None
 
diff --git a/torch/distributed/checkpoint/optimizer.py b/torch/distributed/checkpoint/optimizer.py
index a0ee4fc4a3fc..4210726318d4 100644
--- a/torch/distributed/checkpoint/optimizer.py
+++ b/torch/distributed/checkpoint/optimizer.py
@@ -177,17 +177,17 @@ def create_local_plan(self) -> LoadPlan:
             reqs = _create_sharded_read_items(
                 fqn, cast(TensorStorageMetadata, md), local_shards
             )
-            # TODO: The WriteItems will have a displaced MetadataIndex, fix it.
+            # TODO: The ReadItems will have a displaced MetadataIndex, fix it.
             # TODO: we should change _create_sharded_read_items to have more ergonomic API
-            for wi in reqs:
-                assert wi.dest_index.offset is not None
+            for ri in reqs:
+                assert ri.dest_index.offset is not None
                 original_offset = _element_wise_sub(
-                    wi.dest_index.offset, offset
+                    ri.dest_index.offset, offset
                 )
                 original_index = dataclasses.replace(
-                    wi.dest_index, offset=torch.Size(original_offset)
+                    ri.dest_index, offset=torch.Size(original_offset)
                 )
-                self.translation[wi.dest_index] = original_index
+                self.translation[ri.dest_index] = original_index
 
             requests += reqs
         return LoadPlan(requests)
@@ -202,25 +202,24 @@ def load_sharded_optimizer_state_dict(
     storage_reader: dist_cp.StorageReader,
 ) -> STATE_DICT_TYPE:
     """
-    Loads a state_dict to be used in conjuntion with FSDP sharded optimizer state.
-    This is the current recommended way to checkpoint is FSDP
+    Loads a state_dict in conjuntion with FSDP sharded optimizer state.
+    This is the current recommended way to checkpoint FSDP.
     >>> # xdoctest: +SKIP
     >>> import torch.distributed.checkpoint as dist_cp
-    >>> import spmd.checkpoint as sp_cp
     >>> # Save
     >>> model: torch.nn.Model
     >>> optim_params = model.parameters()
     >>> optim = torch.optim.SGD(optim_params, lr=0.01)
-    >>>
+    >>> # Save
     >>> with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
     >>>     state_dict = {
-    >>>         "optimizer": FSDP.sharded_optim_state_dict(model, optim, optim_params),
+    >>>         "optimizer": FSDP.optim_state_dict(model, optim),
     >>>         "model": model.state_dict()
     >>>     }
     >>>     dist_cp.save_state_dict(
     >>>         state_dict=optim_state,
     >>>         storage_writer=dist_cp.FileSystemWriter("checkpoint"),
-    >>>         planner=sp_cp.AdvLoadPlanner()
+    >>>         planner=dist_cp.DefaultSavePlanner(),
     >>>     )
     >>>
     >>> # Load
@@ -232,17 +231,17 @@ def load_sharded_optimizer_state_dict(
     >>>     dist_cp.load_state_dict(
     >>>         state_dict=checkpoint,
     >>>         storage_reader=dist_cp.FileSystemReader(checkpoint_file),
-    >>>         planner=sp_cp.AdvLoadPlanner()
+    >>>         planner=dist_cp.DefaultLoadPlanner(),
     >>>     )
     >>>     model.load_state_dict(checkpoint["model_state"])
     >>>
-    >>>     optim_state = sp_cp.load_sharded_optimizer_state_dict(
+    >>>     optim_state = dist_cp.load_sharded_optimizer_state_dict(
     >>>         model_state_dict,
     >>>         optimizer_key="optimizer",
     >>>         storage_reader=dist_cp.FileSystemReader("checkpoint"),
     >>>     )
     >>>
-    >>>     flattened_osd = FSDP.flatten_sharded_optim_state_dict(
+    >>>     flattened_osd = FSDP.optim_state_dict_to_load(
     >>>        optim_state["optimizer"], model, optim
     >>>     )
     >>>
diff --git a/torch/distributed/checkpoint/planner.py b/torch/distributed/checkpoint/planner.py
index cb94a40df732..57fd38aafa41 100644
--- a/torch/distributed/checkpoint/planner.py
+++ b/torch/distributed/checkpoint/planner.py
@@ -100,7 +100,7 @@ class SavePlanner(abc.ABC):
 
     A planner subclass can expect the following sequence of calls during save_state_dict:
 
-    1) init - called on all ranks.
+    1) set_up_planner - called on all ranks.
         Signals the start of a checkpoint save.
 
     2) create_local_plan - called on all ranks.
@@ -125,9 +125,9 @@ class SavePlanner(abc.ABC):
 
     >>> # xdoctest: +SKIP("undefined vars")
     >>> class RenamePlanner(DefaultSavePlanner):
-    >>>     def init(self, state_dict, is_coordinator):
+    >>>     def set_up_planner(self, state_dict, is_coordinator):
     >>>         # prefix all keys with `foo_``
-    >>>         super().init(self, {"foo_" + k: v for k, v in state_dict.items()}, is_coordinator)
+    >>>         super().set_up_planner(self, {"foo_" + k: v for k, v in state_dict.items()}, is_coordinator)
 
     Modifying local plan and lookup in tandem. This is useful when fine control of how data is persisted
 
@@ -179,7 +179,7 @@ class SavePlanner(abc.ABC):
     """
 
     @abc.abstractmethod
-    def init(self, state_dict: STATE_DICT_TYPE, is_coordinator: bool) -> None:
+    def set_up_planner(self, state_dict: STATE_DICT_TYPE, is_coordinator: bool) -> None:
         """
         Intialize this planner to save ``state_dict``.
 
@@ -225,7 +225,7 @@ def resolve_data(
         self, write_item: WriteItem
     ) -> Union[torch.Tensor, io.BytesIO]:
         """
-        Lookup the object associated with ``write_item``in `state_dict` and apply any
+        Lookup the object associated with ``write_item`` in ``state_dict`` and apply any
         transformation (such as serialization) prior to the storage layer consuming it.
 
         Called on each rank multiple times, at least once per WriteItem in the final SavePlan.
@@ -237,7 +237,7 @@ def resolve_data(
         is called in order to reduce peak memory required by checkpointing.
 
         When returning tensors, they can be on any device or format, they can be views too.
-        It's the storage layer responsiblity to figure out how to save them.
+        It's the storage layer responsibility to figure out how to save them.
         """
         pass
 
@@ -253,7 +253,7 @@ class LoadPlanner:
 
     A planner subclass can expect the following sequence of calls during load_state_dict:
 
-    1) init - called on all ranks.
+    1) set_up_planner - called on all ranks.
         Signals the start of loading a checkpoint.
 
     2) create_local_plan - called on all ranks.
@@ -280,9 +280,9 @@ class LoadPlanner:
 
     >>> # xdoctest: +SKIP("undefined vars")
     >>> class RenamePlanner(DefaultLoadPlanner):
-    >>>     def init(self, state_dict, metadata, is_coordinator):
+    >>>     def set_up_planner(self, state_dict, metadata, is_coordinator):
     >>>         self.original_state_dict = state_dict
-    >>>         super().init(self, {"foo_" + k: v for k, v in state_dict.items()}, is_coordinator)
+    >>>         super().set_up_planner(self, {"foo_" + k: v for k, v in state_dict.items()}, is_coordinator)
     >>>
     >>>     def load_bytes(self, read_item, value):
     >>>         # Remove the "foo_" prefix
@@ -302,7 +302,7 @@ class LoadPlanner:
     """
 
     @abc.abstractmethod
-    def init(
+    def set_up_planner(
         self,
         state_dict: STATE_DICT_TYPE,
         metadata: Metadata,
@@ -318,7 +318,7 @@ def init(
     @abc.abstractmethod
     def create_local_plan(self) -> LoadPlan:
         """
-        Create a LoadPlan based on state_dict and metadata provided by init.
+        Create a LoadPlan based on state_dict and metadata provided by set_up_planner.
 
         . N.B. This is called on every rank.
         """
diff --git a/torch/distributed/checkpoint/planner_helpers.py b/torch/distributed/checkpoint/planner_helpers.py
index 23fbcd0d7e78..d154bd1f5877 100644
--- a/torch/distributed/checkpoint/planner_helpers.py
+++ b/torch/distributed/checkpoint/planner_helpers.py
@@ -1,11 +1,13 @@
-from typing import List, Any
+from typing import Any, List
 
 import torch
 
+import torch.distributed as dist
 from torch.distributed._shard.metadata import ShardMetadata
 from torch.distributed._shard.sharded_tensor import ShardedTensor
 from torch.distributed._shard.sharded_tensor.metadata import TensorProperties
 from torch.distributed._shard.sharded_tensor.shard import Shard
+from torch.distributed._tensor import DTensor
 
 from torch.distributed._shard.sharding_spec._internals import (
     _check_shard_metadata_pair_overlap,
@@ -62,6 +64,30 @@ def _sharded_tensor_metadata(
     )
 
 
+def _create_write_items_for_dtensor(fqn: str, tensor: DTensor) -> WriteItem:
+    device_mesh = tensor.device_mesh
+    assert (
+        device_mesh.ndim == 1
+    ), "Only 1D DeviceMeshes can currently be handled."
+
+    sizes = torch.Size(tensor._spec.local_shape)
+    offsets = torch.Size(tensor._spec.local_offsets)
+
+    return WriteItem(
+        index=MetadataIndex(fqn, offsets),
+        type=WriteItemType.SHARD,
+        tensor_data=TensorWriteData(
+            chunk=ChunkStorageMetadata(
+                offsets=offsets,
+                sizes=sizes,
+            ),
+            # TODO:update this to not use TensorProperties from ST.
+            properties=TensorProperties.create_from_tensor(tensor.to_local()),
+            size=tensor.size(),
+        ),
+    )
+
+
 def _create_write_item_for_shard(
     fqn: str, sharded_tensor: ShardedTensor, shard_md: ShardMetadata
 ) -> WriteItem:
@@ -173,7 +199,9 @@ def _create_sharded_read_items(
 def _create_default_metadata_only_plan(state_dict: STATE_DICT_TYPE) -> SavePlan:
     requests = []
     for fqn, obj in state_dict.items():
-        if isinstance(obj, ShardedTensor):
+        if isinstance(obj, DTensor):
+            requests.append(_create_write_items_for_dtensor(fqn, obj))
+        elif isinstance(obj, ShardedTensor):
             for shard_md in obj.metadata().shards_metadata:
                 requests.append(
                     _create_write_item_for_shard(fqn, obj, shard_md)
@@ -186,7 +214,9 @@ def _create_default_metadata_only_plan(state_dict: STATE_DICT_TYPE) -> SavePlan:
 
 
 def _create_write_items(fqn: str, object: Any) -> List[WriteItem]:
-    if isinstance(object, ShardedTensor):
+    if isinstance(object, DTensor):
+        return [_create_write_items_for_dtensor(fqn, object)]
+    elif isinstance(object, ShardedTensor):
         return [
             _create_write_item_for_shard(fqn, object, shard.metadata)
             for shard in object.local_shards()
@@ -197,8 +227,39 @@ def _create_write_items(fqn: str, object: Any) -> List[WriteItem]:
         return [_create_write_item_for_bytesio(fqn, object)]
 
 
+def _create_shard_from_dtensor(tensor: DTensor) -> Shard:
+    device_mesh = tensor.device_mesh
+    assert (
+        device_mesh.ndim == 1
+    ), "Only 1D DeviceMeshes can currently be handled."
+
+    sizes = tensor._spec.local_shape
+    offsets = tensor._spec.local_offsets
+    return Shard(
+        tensor=tensor.to_local(),
+        metadata=ShardMetadata(
+            shard_offsets=list(offsets),
+            shard_sizes=list(sizes),
+            placement=f"rank:{dist.get_rank()}/{tensor.to_local().device}",
+        ),
+    )
+
+
 def _create_read_items(fqn: str, md: STORAGE_TYPES, obj: Any) -> List[ReadItem]:
-    if isinstance(md, BytesStorageMetadata):
+    if not isinstance(md, BytesStorageMetadata):
+        if isinstance(obj, DTensor):
+            local_shards = [_create_shard_from_dtensor(obj)]
+        elif isinstance(obj, ShardedTensor):
+            local_shards = obj.local_shards()
+        elif isinstance(obj, torch.Tensor):
+            local_shards = [_create_shard_from_tensor(obj)]
+        else:
+            raise ValueError(
+                f"Invalid checkpoint metadata for {fqn}, "
+                + f"expected BytesStorageMetadata but found {type(md)}"
+            )
+        return _create_sharded_read_items(fqn, md, local_shards)
+    else:
         return [
             _create_read_item_for_byteio(
                 dest_index=MetadataIndex(fqn),
@@ -208,14 +269,3 @@ def _create_read_items(fqn: str, md: STORAGE_TYPES, obj: Any) -> List[ReadItem]:
                 length=0,
             )
         ]
-    elif isinstance(obj, ShardedTensor):
-        local_shards = obj.local_shards()
-    elif isinstance(obj, torch.Tensor):
-        local_shards = [_create_shard_from_tensor(obj)]
-    else:
-        raise ValueError(
-            f"Invalid checkpoint metadata for {fqn}, "
-            + f"expected BytesStorageMetadata but found {type(md)}"
-        )
-
-    return _create_sharded_read_items(fqn, md, local_shards)
diff --git a/torch/distributed/checkpoint/state_dict_loader.py b/torch/distributed/checkpoint/state_dict_loader.py
index be622eba51e7..11b8e360c976 100644
--- a/torch/distributed/checkpoint/state_dict_loader.py
+++ b/torch/distributed/checkpoint/state_dict_loader.py
@@ -29,20 +29,20 @@ def load_state_dict(
     instances, each rank only reads data for their local shards.
 
     .. warning::
-    All tensors in ``state_dict`` must be allocated on their
-    destination device *prior to* calling this function.
+        All tensors in ``state_dict`` must be allocated on their
+        destination device *prior to* calling this function.
 
-    All non-tensor data is loaded using `torch.load()` and modified in place
-    on state_dict.
+        All non-tensor data is loaded using `torch.load()` and modified in place
+        on state_dict.
 
     .. warning::
-    Users must call `load_state_dict` on the root module to ensure load
-    pos-processing and non-tensor data properly propagates.
+        Users must call `load_state_dict` on the root module to ensure load
+        pos-processing and non-tensor data properly propagates.
 
     .. note:
-    This function can be used for local inference and load a checkpoint
-    produced by ``save_state_dict`` without having a process group initialized
-    by passing ``no_dist=True`` and by using Tensors instead of ShardedTensors.
+        This function can be used for local inference and load a checkpoint
+        produced by ``save_state_dict`` without having a process group initialized
+        by passing ``no_dist=True`` and by using Tensors instead of ShardedTensors.
 
     Args:
         state_dict (Dict[str, Any]) : The state_dict to load. Note that this
@@ -91,8 +91,8 @@ def load_state_dict(
     def local_step():
         assert planner is not None
         metadata = storage_reader.read_metadata()
-        planner.init(state_dict, metadata, distW.is_coordinator)
-        storage_reader.init(metadata, distW.is_coordinator)
+        planner.set_up_planner(state_dict, metadata, distW.is_coordinator)
+        storage_reader.set_up_storage_reader(metadata, distW.is_coordinator)
 
         local_plan = planner.create_local_plan()
         local_plan = storage_reader.prepare_local_plan(local_plan)
diff --git a/torch/distributed/checkpoint/state_dict_saver.py b/torch/distributed/checkpoint/state_dict_saver.py
index 6a81595f4239..0ace087f5d4b 100644
--- a/torch/distributed/checkpoint/state_dict_saver.py
+++ b/torch/distributed/checkpoint/state_dict_saver.py
@@ -30,17 +30,17 @@ def save_state_dict(
     ``ShardedTensor`` by having each rank only save their local shards.
 
     .. warning::
-    There is no guarantees of Backwards Compatibility across PyTorch versions
-    for saved state_dicts.
+        There is no guarantees of Backwards Compatibility across PyTorch versions
+        for saved state_dicts.
 
     .. warning::
-    If using the `process_group` argument, make sure that only its ranks
-    call `save_state_dict` and that all data in state_dict belong to it.
+        If using the `process_group` argument, make sure that only its ranks
+        call `save_state_dict` and that all data in state_dict belong to it.
 
-    .. note:
-    This function can be used to save a state_dict with an intialized process
-    group by passing ``no_dist=True``. This can be used to produce a checkpoint
-    that can consumed by load_state_dict is a SPMD fashion.
+    .. note::
+        This function can be used to save a state_dict with an intialized process
+        group by passing ``no_dist=True``. This can be used to produce a checkpoint
+        that can consumed by load_state_dict is a SPMD fashion.
 
     Args:
         state_dict (Dict[str, Any]): A state_dict
@@ -85,8 +85,8 @@ def save_state_dict(
 
     def local_step():
         assert planner is not None
-        planner.init(state_dict, distW.is_coordinator)
-        storage_writer.init(distW.is_coordinator)
+        planner.set_up_planner(state_dict, distW.is_coordinator)
+        storage_writer.set_up_storage_writer(distW.is_coordinator)
         local_plan = planner.create_local_plan()
         local_plan = storage_writer.prepare_local_plan(local_plan)
         return local_plan
diff --git a/torch/distributed/checkpoint/storage.py b/torch/distributed/checkpoint/storage.py
index dbc8fda59eac..73cd5ffa93e1 100644
--- a/torch/distributed/checkpoint/storage.py
+++ b/torch/distributed/checkpoint/storage.py
@@ -37,7 +37,7 @@ class StorageWriter(abc.ABC):
 
     A subclass should expect the following sequence of calls.
 
-    1) (all ranks) init()
+    1) (all ranks) set_up_storage_writer()
     2) (all ranks) prepare_local_plan()
     3) (coordinator) prepare_global_plan()
     4) (all ranks) write_data()
@@ -45,7 +45,7 @@ class StorageWriter(abc.ABC):
     """
 
     @abc.abstractmethod
-    def init(self, is_coordinator: bool) -> None:
+    def set_up_storage_writer(self, is_coordinator: bool) -> None:
         """
         Initialize this instance.
 
@@ -146,10 +146,10 @@ class StorageReader(abc.ABC):
     A subclass should expected the following sequence of calls by ``load_state_dict``:
 
     1) (all ranks) read_metadata()
-    2) (all ranks) init
-    3) (all ranks) prepare_local_plan
-    4) (coordinator) prepare_global_plan
-    5) (all ranks) read_data
+    2) (all ranks) set_up_storage_reader()
+    3) (all ranks) prepare_local_plan()
+    4) (coordinator) prepare_global_plan()
+    5) (all ranks) read_data()
     """
 
     @abc.abstractmethod
@@ -164,7 +164,7 @@ def read_metadata(self) -> Metadata:
         pass
 
     @abc.abstractmethod
-    def init(self, metadata: Metadata, is_coordinator: bool) -> None:
+    def set_up_storage_reader(self, metadata: Metadata, is_coordinator: bool) -> None:
         """
         Initialize this instance.
 
diff --git a/torch/distributed/checkpoint/utils.py b/torch/distributed/checkpoint/utils.py
index 7a3c259474b5..5ffc8b8ece04 100644
--- a/torch/distributed/checkpoint/utils.py
+++ b/torch/distributed/checkpoint/utils.py
@@ -22,8 +22,8 @@
 from torch.distributed._shard.sharded_tensor import (
     ShardedTensor,
 )
-
 from torch.distributed._shard.sharded_tensor.shard import Shard
+from torch.distributed._tensor import DTensor
 
 from .metadata import (
     STATE_DICT_TYPE,
@@ -316,6 +316,8 @@ def _find_shard(tensor: ShardedTensor, index: MetadataIndex) -> Shard:
 def find_tensor_shard(
     tensor: torch.Tensor, index: MetadataIndex
 ) -> torch.Tensor:
+    if isinstance(tensor, DTensor):
+        return tensor.to_local()
     if isinstance(tensor, ShardedTensor):
         return _find_shard(tensor, index).tensor
     if index.offset is not None:
@@ -334,6 +336,7 @@ def find_state_dict_object(
     if index.fqn not in state_dict:
         raise ValueError(f"Could not find FQN: '{index.fqn}'")
     obj = state_dict[index.fqn]
+
     if isinstance(obj, torch.Tensor):
         return find_tensor_shard(obj, index)
     elif index.offset is not None:
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 81e610ffa7fd..f16277713179 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -10,7 +10,7 @@
 import warnings
 from collections import namedtuple
 from datetime import timedelta
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Tuple, Union, List
 
 import torch
 from torch._C._distributed_c10d import (
@@ -32,7 +32,6 @@
     get_debug_level,
     Work
 )
-from torch._six import string_classes
 from torch.autograd.profiler import record_function
 from .constants import default_pg_timeout
 from .c10d_error_logger import _get_or_create_logger
@@ -147,7 +146,7 @@ def supports_complex(reduceOp: ReduceOp) -> bool:
     return reduceOp not in denyList
 
 
-class Backend(object):
+class Backend:
     """
     An enum-like class of available backends: GLOO, NCCL, UCC, MPI, and other registered
     backends.
@@ -178,7 +177,7 @@ class Backend(object):
     backend_list = [UNDEFINED, GLOO, NCCL, UCC, MPI]
 
     def __new__(cls, name: str):
-        if not isinstance(name, string_classes):
+        if not isinstance(name, str):
             raise ValueError("Backend name must be a string, but got: {}".format(name))
         value = getattr(Backend, name.upper(), Backend.UNDEFINED)
 
@@ -223,7 +222,7 @@ def register_backend(cls, name, func, extended_api=False):
         Backend.backend_list.append(name.lower())
         Backend._plugins[name.upper()] = Backend._BackendPlugin(func, extended_api)
 
-class BackendConfig(object):
+class BackendConfig:
 
     def __init__(self, backend: Union[str, Backend]):
         self.device_backend_map: Dict[torch.device, Backend] = {}
@@ -266,7 +265,7 @@ def get_device_backend_map(self):
 dist_backend = Backend
 
 
-class _reduce_op(object):
+class _reduce_op:
     r"""
     Deprecated enum-like class for reduction operations: ``SUM``, ``PRODUCT``,
     ``MIN``, and ``MAX``.
@@ -297,8 +296,10 @@ def __getattribute__(self, key):
 _pg_names: Dict[ProcessGroup, str] = {}
 _pg_group_ranks: Dict[ProcessGroup, Dict[int, int]] = {}
 # For a pg, it is a map from ProcessGroup to BackendConfig
-_pg_backend_map: Dict[ProcessGroup, str] = {}
+_pg_backend_config: Dict[ProcessGroup, str] = {}
 _group_count = 0
+_tags_to_pg: Dict[str, List[ProcessGroup]] = {}
+_pg_to_tag: Dict[ProcessGroup, str] = {}
 
 class _World:
     """
@@ -354,6 +355,15 @@ def pg_group_ranks(self) -> Dict[ProcessGroup, Dict[int, int]]:
         global _pg_group_ranks
         return _pg_group_ranks
 
+    @property
+    def pg_backend_config(self) -> Dict[ProcessGroup, str]:
+        """
+        Process group's backend config
+        TODO don't expose the map, expose fine grained ops
+        """
+        global _pg_backend_config
+        return _pg_backend_config
+
     @property
     def group_count(self) -> int:
         """
@@ -372,6 +382,15 @@ def group_count(self, value):
         global _group_count
         _group_count = value
 
+    @property
+    def tags_to_pg(self) -> Dict[str, List[ProcessGroup]]:
+        global _tags_to_pg
+        return _tags_to_pg
+
+    @property
+    def pg_to_tag(self) -> Dict[ProcessGroup, str]:
+        global _pg_to_tag
+        return _pg_to_tag
 
 _world = _World()
 """Holds the singleton instance of ``_World`` used by c10. Experimental extension point to override it"""
@@ -390,10 +409,10 @@ def WORLD(cls) -> Optional[ProcessGroup]:
     def WORLD(cls, pg: Optional[ProcessGroup]):
         _world.default_pg = pg
 
-class group(object, metaclass=_WorldMeta):
+class group(metaclass=_WorldMeta):
     pass
 
-class GroupMember(object, metaclass=_WorldMeta):
+class GroupMember(metaclass=_WorldMeta):
     NON_GROUP_MEMBER = object()
 
 
@@ -717,7 +736,7 @@ def get_backend_config(group: Optional[ProcessGroup] = None) -> str:
         pg = group
     if _rank_not_in_group(pg):
         raise RuntimeError("Invalid process group specified")
-    backend_config = _pg_backend_map.get(pg, None)
+    backend_config = _world.pg_backend_config.get(pg)
     assert backend_config is not None
     return str(backend_config)
 
@@ -769,10 +788,12 @@ def init_process_group(
 
 
     Args:
-        backend (str or Backend): The backend to use. Depending on
+        backend (str or Backend, optional): The backend to use. Depending on
             build-time configurations, valid values include ``mpi``, ``gloo``,
-            ``nccl``, and ``ucc``. This field should be given as a lowercase
-            string (e.g., ``"gloo"``), which can also be accessed via
+            ``nccl``, and ``ucc``. If the backend is not provied, then both a ``gloo``
+            and ``nccl`` backend will be created, see notes below for how multiple
+            backends are managed. This field can be given as a lowercase string
+            (e.g., ``"gloo"``), which can also be accessed via
             :class:`Backend` attributes (e.g., ``Backend.GLOO``). If using
             multiple processes per machine with ``nccl`` backend, each process
             must have exclusive access to every GPU it uses, as sharing GPUs
@@ -823,6 +844,11 @@ def init_process_group(
     .. note:: To enable ``backend == Backend.MPI``, PyTorch needs to be built from source
         on a system that supports MPI.
 
+    .. note:: Support for multiple backends is experimental. Currently when no backend is
+        specified, both ``gloo`` and ``nccl`` backends will be created. The ``gloo`` backend
+        will be used for collectives with CPU tensors and the ``nccl`` backend will be used
+        for collectives with CUDA tensors.
+
     """
     global _world
 
@@ -885,7 +911,7 @@ def init_process_group(
             store,
             pg_options=pg_options,
             group_name=group_name,
-            timeout=timeout,
+            timeout=timeout
         )
         _update_default_pg(default_pg)
 
@@ -914,6 +940,7 @@ def _new_process_group_helper(
     pg_options=None,
     group_name=None,
     timeout=default_pg_timeout,
+    pg_tag=None
 ):
     """
     Create a new distributed process group.
@@ -941,6 +968,12 @@ def _new_process_group_helper(
             "Expected timeout argument to be of type" "datetime.timedelta"
         )
 
+    if pg_tag not in [None, ""]:
+        # creating with the same tag and rank set results in the same underlying PG
+        existing_group = _find_pg_by_ranks_and_tag(pg_tag, global_ranks_in_group)
+        if existing_group:
+            return existing_group
+
     # The list of group ranks is empty if we're creating the default group.
     is_default_group = len(global_ranks_in_group) == 0
 
@@ -972,8 +1005,7 @@ def _new_process_group_helper(
             backend_type = ProcessGroup.BackendType.MPI
             if not backend_class:
                 return GroupMember.NON_GROUP_MEMBER
-
-        if backend_str == Backend.GLOO:
+        elif backend_str == Backend.GLOO:
             # TODO: remove this check after lazy initialization is supported
             # if pg_options is not None:
             #     raise RuntimeError("GLOO options not supported")
@@ -1009,6 +1041,7 @@ def _new_process_group_helper(
             backend_plugin = Backend._plugins[backend_str.upper()]
             creator_fn = backend_plugin.creator_fn
             extended_api = backend_plugin.extended_api
+            backend_type = ProcessGroup.BackendType.CUSTOM
 
             if not extended_api:
                 backend_class = creator_fn(backend_prefix_store, group_rank, group_size, timeout)
@@ -1068,9 +1101,17 @@ def _new_process_group_helper(
     # update global state
     _world.pg_map[pg] = (backend, prefix_store)
     _world.pg_names[pg] = group_name
-    _pg_backend_map[pg] = str(backend_config)
-    return pg
+    _world.pg_backend_config[pg] = str(backend_config)
+    # "" is the default tag for user PGs
+    if pg_tag in [None, ""]:
+        pg_tag = f"ptd:{group_name}"
+        _world.tags_to_pg.setdefault("", []).append(pg)
+    else:
+        pg_tag = f"user:{pg_tag}"
 
+    _world.tags_to_pg.setdefault(pg_tag, []).append(pg)
+    _world.pg_to_tag[pg] = pg_tag
+    return pg
 
 def destroy_process_group(group: Optional[ProcessGroup] = None):
     """
@@ -1083,7 +1124,6 @@ def destroy_process_group(group: Optional[ProcessGroup] = None):
                                         be destroyed.
     """
     global _world
-    global _pg_backend_map
 
     if group == GroupMember.NON_GROUP_MEMBER:
         return
@@ -1102,7 +1142,9 @@ def destroy_process_group(group: Optional[ProcessGroup] = None):
         _world.pg_map.clear()
         _world.pg_names.clear()
         _world.pg_group_ranks.clear()
-        _pg_backend_map.clear()
+        _world.pg_backend_config.clear()
+        _world.pg_to_tag.clear()
+        _world.tags_to_pg.clear()
 
         # when process group doesn't have an explicit name (only WORLD (default)
         # process group can have an explicit name), we use global _world.group_count
@@ -1117,7 +1159,17 @@ def destroy_process_group(group: Optional[ProcessGroup] = None):
         del _world.pg_map[pg]
         del _world.pg_names[pg]
         del _world.pg_group_ranks[pg]
-        del _pg_backend_map[pg]
+        del _world.pg_backend_config[pg]
+
+        tag = _world.pg_to_tag.get(pg)
+        del _world.pg_to_tag[pg]
+        if tag is not None:
+            try:
+                _world.tags_to_pg[tag].remove(pg)
+                if tag.startswith("ptd:"):
+                    _world.tags_to_pg[""].remove(pg)
+            except Exception:
+                pass
 
 
 def get_rank(group: Optional[ProcessGroup] = None) -> int:
@@ -1175,6 +1227,9 @@ def isend(tensor: torch.Tensor, dst: int, group: Optional[ProcessGroup] = None,
         Modifying ``tensor`` before the request completes causes undefined
         behavior.
 
+    .. warning::
+        ``tag`` is not supported with the NCCL backend.
+
     Args:
         tensor (Tensor): Tensor to send.
         dst (int): Destination rank.
@@ -1204,6 +1259,9 @@ def irecv(tensor: torch.Tensor, src: Optional[int] = None, group: Optional[Proce
     """
     Receives a tensor asynchronously.
 
+    .. warning::
+        ``tag`` is not supported with the NCCL backend.
+
     Args:
         tensor (Tensor): Tensor to fill with received data.
         src (int, optional): Source rank. Will receive from any
@@ -1313,7 +1371,7 @@ def recv(tensor: torch.Tensor, src: Optional[int] = None, group: Optional[Proces
         return src
 
 
-class P2POp(object):
+class P2POp:
     """
     A class to build point-to-point operations for ``batch_isend_irecv``.
 
@@ -1425,8 +1483,8 @@ def wrapper(*args, **kwargs):
                 error_msg_dict = {
                     "func_name": f"{func.__name__}",
                     "args": f"{args}, {kwargs}",
-                    "backend": f"{get_backend()}",
-                    "world_size": f"{get_world_size()}",
+                    "backend": f"{get_backend(kwargs.get('group'))}",
+                    "world_size": f"{get_world_size(kwargs.get('group'))}",
                     "global_rank": f"{get_rank()}",
                     "local_rank": f"{get_rank(kwargs.get('group'))}",
                     "error": f"{error}",
@@ -3446,7 +3504,15 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None, pg_options=N
     Returns:
         A handle of distributed group that can be given to collective calls.
     """
+    return _new_group_with_tag(ranks, timeout, backend, pg_options)
 
+def _new_group_with_tag(ranks=None, timeout=default_pg_timeout, backend=None, pg_options=None, pg_tag=None):
+    """
+    This is a variant of ``new_group`` that exposes tag creation.
+
+    :: N.B. The mechanism is experimental and tied to the functional collectives effort, see
+    ``torch.distributed._functional_collectives`` for reference on how to use it.
+    """
     global _world
 
     default_pg = _get_default_group()
@@ -3496,6 +3562,7 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None, pg_options=N
             default_store,
             pg_options=pg_options,
             timeout=timeout,
+            pg_tag=pg_tag
         )
 
     # Create the global rank to group rank mapping
@@ -3753,3 +3820,53 @@ def new_subgroups_by_enumeration(
                 logger.info("Rank {} is assigned to subgroup {}".format(rank, ranks))
 
     return cur_subgroup, subgroups
+
+
+def _find_pg_by_ranks_and_tag(tag: str, ranks: List[int]) -> ProcessGroup:
+    if len(tag) > 0 and not tag.startswith("ptd:") and not tag.startswith("user:"):
+        tag = f"user:{tag}"
+
+    for group in _world.tags_to_pg.get(tag, []):
+        if group.size() != len(ranks):
+            continue
+
+        group_ranks = get_process_group_ranks(group)
+        good = all(r in group_ranks for r in ranks)
+        if good:
+            return group
+    return None
+
+def _find_or_create_pg_by_ranks_and_tag(tag: str, ranks: List[int], stride: int) -> ProcessGroup:
+    assert len(ranks) % stride == 0, f"Ranks length ({len(ranks)}) must be divisible by stride ({stride})"
+
+    my_rank = get_rank()
+    my_ranks = None
+
+    if stride == len(ranks):
+        my_ranks = ranks.copy()
+        assert my_rank in my_ranks, "rankset doesn't include the current node"
+    else:
+        for i in range(0, len(ranks), stride):
+            rank_set = ranks[i : i + stride]
+            if my_rank in rank_set:
+                my_ranks = rank_set
+        assert my_ranks is not None, "rankset doesn't include the current node"
+
+    my_ranks.sort()
+
+    pg = _find_pg_by_ranks_and_tag(tag, my_ranks)
+    if pg is not None:
+        return pg
+    if tag == "":
+        raise ValueError("Cannot automatically create PG with empty tag")
+    # TODO copy settings and timeout from default PG
+    return _new_group_with_tag(my_ranks, pg_tag=tag)
+
+def _get_group_tag(pg: ProcessGroup) -> str:
+    """
+    Returns the tag associated with ``pg``.
+    """
+    tag = _world.pg_to_tag[pg]
+    if tag.startswith("user:"):
+        tag = tag[5:]
+    return tag
diff --git a/torch/distributed/elastic/agent/server/api.py b/torch/distributed/elastic/agent/server/api.py
index b670c096d9bc..a9907663bb58 100644
--- a/torch/distributed/elastic/agent/server/api.py
+++ b/torch/distributed/elastic/agent/server/api.py
@@ -928,7 +928,7 @@ def _exit_barrier(self):
                 f"Done waiting for other agents. Elapsed: {time.time() - start} seconds"
             )
         except SignalException as e:
-            log.warn(f"Got termination signal: {e.sigval}")
+            log.warning(f"Got termination signal: {e.sigval}")
             raise
         except Exception:
             log.exception(
diff --git a/torch/distributed/elastic/agent/server/local_elastic_agent.py b/torch/distributed/elastic/agent/server/local_elastic_agent.py
index ec1269d34eee..6f14eb07ff32 100644
--- a/torch/distributed/elastic/agent/server/local_elastic_agent.py
+++ b/torch/distributed/elastic/agent/server/local_elastic_agent.py
@@ -113,7 +113,7 @@ def main():
                         role="trainer",
                         local_world_size=nproc_per_process,
                         entrypoint="/usr/local/bin/trainer",
-                        args=("--trainer_args", "foobar"),
+                        args=("--trainer-args", "foobar"),
                         ...<OTHER_PARAMS...>)
             agent = LocalElasticAgent(spec)
             results = agent.run()
diff --git a/torch/distributed/elastic/multiprocessing/api.py b/torch/distributed/elastic/multiprocessing/api.py
index 727566fc6039..fde50a686964 100644
--- a/torch/distributed/elastic/multiprocessing/api.py
+++ b/torch/distributed/elastic/multiprocessing/api.py
@@ -117,11 +117,10 @@ def from_str(cls, vm: str) -> Union["Std", Dict[int, "Std"]]:
         Any other input raises an exception
         """
 
-        def to_std(v):
-            v = int(v)
-            for s in Std:
-                if s == v:
-                    return s
+        def to_std(v: str) -> Std:  # type: ignore[return]
+            s = Std(int(v))
+            if s in Std:
+                return s
             # return None -> should NEVER reach here since we regex check input
 
         if re.match(_VALUE_REGEX, vm):  # vm is a number (e.g. 0)
diff --git a/torch/distributed/elastic/rendezvous/etcd_rendezvous.py b/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
index 8a711bdb2fe3..cc5a096c4df3 100644
--- a/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
+++ b/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
@@ -194,7 +194,7 @@ def shutdown(self) -> bool:
 # but is verbose to add everywhere. Consider wrapping the client calls
 # into auto-retry for these errors?
 #
-class EtcdRendezvous(object):
+class EtcdRendezvous:
     """
     A rendezvous implementation that uses `etcd <https://etcd.io/>`__ as
     the backend store.
diff --git a/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py b/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py
index 9030c84a7837..547d526c0194 100644
--- a/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py
+++ b/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py
@@ -83,18 +83,18 @@ def create_rdzv_handler(params: RendezvousParameters) -> RendezvousHandler:
     if "rank" not in params.config:
         raise ValueError(
             "rank is absent in RendezvousParameters."
-            "Try add --node_rank to the cmd request"
+            "Try add --node-rank to the cmd request"
         )
     endpoint = params.endpoint.strip()
     if not endpoint:
         raise ValueError(
             "endpoint is absent in RendezvousParameters"
-            "Try add --master_port and --master_addr to the cmd request"
+            "Try add --master-port and --master-addr to the cmd request"
         )
     master_addr, master_port = parse_rendezvous_endpoint(endpoint, -1)
     if master_port == -1:
         raise ValueError(
-            f"Port is absent in endpoint: {endpoint}. Try launching with --master_port"
+            f"Port is absent in endpoint: {endpoint}. Try launching with --master-port"
         )
     world_size = params.max_nodes
     rank = cast(int, params.config.get("rank"))
diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index fc032b356d18..2f7769a34f65 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -3,6 +3,7 @@
 """
 
 import traceback
+import warnings
 from enum import auto, Enum
 from typing import (
     Callable,
@@ -24,7 +25,14 @@
     _CHECKPOINT_PREFIX,
 )
 
-from .api import FullStateDictConfig, ShardingStrategy, StateDictConfig, StateDictType
+from .api import (
+    FullOptimStateDictConfig,
+    FullStateDictConfig,
+    OptimStateDictConfig,
+    ShardingStrategy,
+    StateDictConfig,
+    StateDictType,
+)
 
 FSDP_WRAPPED_MODULE = "_fsdp_wrapped_module"
 FSDP_PREFIX = FSDP_WRAPPED_MODULE + "."
@@ -35,22 +43,28 @@ class _FSDPState(_State):
     def __init__(self) -> None:
         # TODO: Move all the attributes to this class to enable typing for
         # FSDP/fully_shard.
+        self._ignored_modules: Set[nn.Module] = set()
+        self._ignored_params: Set[nn.Parameter] = set()
+        self.process_group: Optional[dist.ProcessGroup] = None
+        self.rank: int = -1
+        self.world_size: int = -1
+        self.sharding_strategy = ShardingStrategy.FULL_SHARD
         self._use_orig_params: bool = False
+        self.training_state = TrainingState.IDLE
         self._unshard_params_ctx: Dict[nn.Module, Generator] = {}
         self._state_dict_type: StateDictType = StateDictType.FULL_STATE_DICT
         self._state_dict_config: StateDictConfig = FullStateDictConfig()
+        self._optim_state_dict_config: OptimStateDictConfig = FullOptimStateDictConfig()
         self._is_root: Optional[bool] = None
         self._handles: List[flat_param_file.FlatParamHandle] = []
-        self._ignored_modules: Set[nn.Module] = set()
         self._fully_sharded_module_to_handles: Dict[
             nn.Module, flat_param_file.FlatParamHandle
         ] = {}
-        self.rank: int = -1
-        self.world_size: int = -1
-        self.sharding_strategy = ShardingStrategy.FULL_SHARD
         self.compute_device = torch.device("cuda", torch.cuda.current_device())
-        self.process_group: Optional[dist.ProcessGroup] = None
-        self._ignored_params: Set[nn.Parameter] = set()
+        # All following attributes should only be used for root states:
+        # Save these static lists to avoid the repeated tree traversals
+        self._all_fsdp_states: List[_FSDPState] = []
+        self._all_handles: List[flat_param_file.FlatParamHandle] = []
 
 
 def _get_module_fsdp_state(module: nn.Module) -> Optional[_FSDPState]:
@@ -60,7 +74,9 @@ def _get_module_fsdp_state(module: nn.Module) -> Optional[_FSDPState]:
     return state
 
 
-def _get_module_fsdp_state_if_comm_module(module: nn.Module) -> Optional[_FSDPState]:
+def _get_module_fsdp_state_if_fully_sharded_module(
+    module: nn.Module,
+) -> Optional[_FSDPState]:
     state = _get_module_fsdp_state(module)
     if state is None:
         return None
@@ -201,8 +217,27 @@ def module_fn(module, prefix, param_to_fqns):
             is_shared_param = param in param_to_fqns
             if not is_shared_param:
                 param_to_fqns[param] = global_fqns
-            elif not dedup_shared_params:
-                param_to_fqns[param].extend(global_fqns)
+            else:
+                if type(param) is flat_param_file.FlatParameter:
+                    # DMP overwrites `named_parameters` and skip (advance to
+                    # the next child module) the wrapped_module (e.g.,
+                    # _dmp_wrapped_module and _fsdp_wrapped_module). When a user
+                    # calls `named_child` to traverse the module recursively and
+                    # calls `named_parameters` with `recurse=False`, parameters
+                    # will be traversed more than once.
+                    # This hack is specificed designed for DMP + FSDP. We
+                    # overwite the flat_parameters traversal result to only obtain
+                    # the last one, which happens to be the correct one.
+                    #
+                    # TODO: Remove this hack once DMP + FSDP is not supported.
+                    warnings.warn(
+                        "FlatParameter is being traversed more than once. "
+                        "This case should only happen when using "
+                        "DistributedModelParallel with FullyShardedDataParallel."
+                    )
+                    param_to_fqns[param] = global_fqns
+                elif not dedup_shared_params:
+                    param_to_fqns[param].extend(global_fqns)
 
     def return_fn(param_to_fqns):
         return param_to_fqns
@@ -212,6 +247,7 @@ def return_fn(param_to_fqns):
         model,
         module_fn,
         return_fn,
+        [key for key, _ in model.named_parameters()],
         param_to_unflat_param_names,
     )
 
@@ -220,6 +256,7 @@ def _apply_to_modules(
     root_module: torch.nn.Module,
     module_fn: Callable,
     return_fn: Callable,
+    filter_fqns: Optional[List[str]] = None,
     *args,
     **kwargs,
 ):
@@ -229,15 +266,40 @@ def _apply_to_modules(
     returning a value using ``return_fn``. The traversal constructs the full
     module prefix name (e.g. "module.submodule." just like in model state dict)
     and makes that available to ``module_fn``.
+
+    ``filter_fqns`` is used because some module may have its own prefix similar
+    to ``FullyShardedDataParallel`` and the ``named_parameters()`` is overwritten
+    to remove the prefix.
     """
 
     def f(module: torch.nn.Module, prefix: str, *args, **kwargs):
         # Call the module function before recursing over children (pre-order)
         module_fn(module, prefix, *args, **kwargs)
         for submodule_name, submodule in module.named_children():
-            if submodule is not None:
-                new_prefix = prefix + submodule_name + "."
-                f(submodule, new_prefix, *args, **kwargs)
+            if submodule is None:
+                continue
+            new_prefix = prefix + submodule_name + "."
+            if filter_fqns is not None:
+                for fqn in filter_fqns:
+                    if fqn.startswith(new_prefix):
+                        break
+                else:
+                    # DMP's named_parameter() will mess up the traversal with
+                    # ``named_children`` + `named_parameter(recurse=False)``.
+                    # This hack is a must to make the travsersal work.
+                    # TODO: Remove this hack once DMP + FSDP is not supported.
+                    if (
+                        submodule_name == "_fsdp_wrapped_module"
+                        or submodule_name == "_dmp_wrapped_module"
+                    ):
+                        warnings.warn(
+                            "An unexpected prefix is detected. This case "
+                            " should only happen when using DMP with FSDP. "
+                            f"prefix = {prefix}, "
+                            f"submodule_name = {submodule_name}"
+                        )
+                        new_prefix = prefix
+            f(submodule, new_prefix, *args, **kwargs)
 
     f(root_module, "", *args, **kwargs)
     return return_fn(*args, **kwargs)
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index f552b70e4dbe..b92df41648bb 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -36,6 +36,7 @@
 from torch.distributed.fsdp.api import (
     BackwardPrefetch,
     CPUOffload,
+    FullOptimStateDictConfig,
     FullStateDictConfig,
     MixedPrecision,
     ShardingStrategy,
@@ -374,6 +375,7 @@ def _init_prefetching_state(
 def _init_state_dict_state(state: _FSDPState) -> _FSDPState:
     state._state_dict_type = StateDictType.FULL_STATE_DICT
     state_dict_config: StateDictConfig = FullStateDictConfig()
+    state._optim_state_dict_config = FullOptimStateDictConfig()
     state._state_dict_config = state_dict_config
     unshard_params_ctx: Dict[nn.Module, Generator] = {}
     state._unshard_params_ctx = unshard_params_ctx
@@ -395,13 +397,19 @@ def _init_param_handle_from_module(
     """
     _check_single_device_module(fully_sharded_module, state._ignored_params)
     device_from_device_id = _get_device_from_device_id(device_id, state.rank)
-    _materialize_module(
-        fully_sharded_module,
-        param_init_fn,
-        state._ignored_params,
-        device_from_device_id,
-        lambda k: not isinstance(k, module_wrapper_cls),
+    is_meta_module, is_torchdistX_deferred_init = _need_to_materialize_module(
+        fully_sharded_module, state._ignored_params
     )
+    # Materialize the module if needed
+    if (is_meta_module or is_torchdistX_deferred_init) and param_init_fn is not None:
+        _materialize_with_param_init_fn(fully_sharded_module, param_init_fn)
+    elif is_meta_module:
+        _materialize_meta_module(fully_sharded_module, device_id)
+    elif is_torchdistX_deferred_init:
+        deferred_init.materialize_module(
+            fully_sharded_module,
+            check_fn=lambda k: not isinstance(k, module_wrapper_cls),
+        )
     # TODO: Investigate refactoring `_move_module_to_device()` to
     # `_move_states_to_device()` to avoid the `device_id` + CPU offload hack
     _move_module_to_device(
@@ -453,19 +461,33 @@ def _init_param_handles_from_module(
     # using auto wrapping, which also represents a valid reverse toplogical
     # sort order, but the difference does not matter.
     materialized_module = False
-    for fully_sharded_module, (params, buffers, param_names, buffer_names) in reversed(
+    for fully_sharded_module, (params, buffers) in reversed(
         fully_sharded_module_to_states.items()
     ):
-        materialized_module |= _materialize_module(
-            fully_sharded_module,
-            param_init_fn,
-            state._ignored_params,
-            device_from_device_id,
-            lambda _: True,
+        # Materialize the module if needed
+        is_meta_module, is_torchdistX_deferred_init = _need_to_materialize_module(
+            fully_sharded_module, state._ignored_params
         )
+        if is_meta_module or is_torchdistX_deferred_init:
+            materialized_module = True
+            # Save the parameter and buffer names to reacquire references after
+            # after materialization since their variables may change
+            param_names, buffer_names = _get_state_names_for_states(
+                fully_sharded_module, params, buffers
+            )
+        if (
+            is_meta_module or is_torchdistX_deferred_init
+        ) and param_init_fn is not None:
+            _materialize_with_param_init_fn(fully_sharded_module, param_init_fn)
+        elif is_meta_module:
+            _materialize_meta_module(fully_sharded_module, device_id)
+        elif is_torchdistX_deferred_init:
+            deferred_init.materialize_module(
+                root_module,
+                check_fn=lambda _: True,
+            )
         if materialized_module:
-            # Materializing from meta device can change the parameter/buffer
-            # variables, so reacquire references
+            # Reacquire references using the pre-computed state names
             params = [
                 fully_sharded_module.get_parameter(param_name)
                 for param_name in param_names
@@ -530,6 +552,37 @@ def _init_param_handle_from_params(
         handle.flat_param_to(cpu_device)
 
 
+def _get_state_names_for_states(
+    module: nn.Module,
+    params: List[nn.Parameter],
+    buffers: List[torch.Tensor],
+) -> Tuple[List[str], List[str]]:
+    """
+    Returns the parameter and buffer names of the given ``params`` and
+    ``buffers``, where the names are prefixed starting from ``module``. This
+    function assumes that the parameters and buffers are in the module tree.
+    """
+    param_names: List[str] = []
+    buffer_names: List[str] = []
+    param_to_param_name = {
+        param: param_name for param_name, param in module.named_parameters()
+    }
+    buffer_to_buffer_name = {
+        buffer: buffer_name for buffer_name, buffer in module.named_buffers()
+    }
+    for param in params:
+        assert (
+            param in param_to_param_name
+        ), f"Parameter not in the module tree:\n{module}\n{param}"
+        param_names.append(param_to_param_name[param])
+    for buffer in buffers:
+        assert (
+            buffer in buffer_to_buffer_name
+        ), f"Buffer not in the module tree:\n{module}\n{buffer}"
+        buffer_names.append(buffer_to_buffer_name[buffer])
+    return param_names, buffer_names
+
+
 def _get_ignored_modules(
     root_module: nn.Module,
     _ignored_modules: Optional[Iterable[torch.nn.Module]],
@@ -565,12 +618,12 @@ def _get_ignored_modules(
     # that this FSDP instance can get any ignored modules from its children.
 
     # Include child modules and exclude nested FSDP modules themselves
-    ignored_modules = set(
+    ignored_modules = {
         child
         for module in ignored_root_modules
         for child in module.modules()
         if not isinstance(child, fsdp_file.FullyShardedDataParallel)
-    )
+    }
     if root_module in ignored_modules:
         warnings.warn(
             "Trying to ignore the top-level module passed into the FSDP "
@@ -597,16 +650,16 @@ def _get_ignored_params(
     """
     all_ignored_params: Set[torch.nn.Parameter] = set()
 
-    params_in_ignored_modules = set(
+    params_in_ignored_modules = {
         p for m in ignored_modules for p in m.parameters() if not _is_fsdp_flattened(p)
-    )
+    }
 
     all_ignored_params.update(params_in_ignored_modules)
 
     if ignored_parameters is not None:
-        params_in_ignored_parameters = set(
+        params_in_ignored_parameters = {
             p for p in ignored_parameters if not _is_fsdp_flattened(p)
-        )
+        }
         all_ignored_params.update(params_in_ignored_parameters)
 
         # Include nested FSDP modules' ignored parameters
@@ -624,9 +677,9 @@ def _get_buffer_names(root_module: nn.Module) -> Set[str]:
     Returns the fully prefixed names of all buffers in the module hierarchy
     rooted at ``root_module`` as a class:`set`.
     """
-    return set(
+    return {
         clean_tensor_name(buffer_name) for buffer_name, _ in root_module.named_buffers()
-    )
+    }
 
 
 def _check_single_device_module(
@@ -638,7 +691,7 @@ def _check_single_device_module(
     ignoring the parameters in ``ignored_params``. Thus, after this method, the
     module must be either fully on the CPU or fully on a non-CPU device.
     """
-    devices = set(param.device for param in _get_orig_params(module, ignored_params))
+    devices = {param.device for param in _get_orig_params(module, ignored_params)}
     if len(devices) > 1:
         raise RuntimeError(
             f"FSDP only supports single device modules but got params on {devices}"
@@ -671,28 +724,15 @@ def _get_device_from_device_id(
     return device
 
 
-def _materialize_module(
+def _need_to_materialize_module(
     module: nn.Module,
-    param_init_fn: Optional[Callable[[nn.Module], None]],
     ignored_params: Set[nn.Parameter],
-    device_from_device_id: Optional[torch.device],
-    deferred_init_check_fn: Callable,
-) -> bool:
+) -> Tuple[bool, bool]:
     """
-    Materializes the wrapped module ``module`` in place if needed: either
-    if the module has parameters that use meta device or are torchdistX
-    fake tensors.
-
-    This method uses ``param_init_fn`` to materialize the module if the
-    function is not ``None`` and falls back to default behavior otherwise.
-    For meta device, this moves the module to ``device_from_device_id`` if
-    it is not ``None`` or the current device otherwise and calls
-    ``reset_parameters()``, and for torchdistX fake tensors, this calls
-    ``deferred_init.materialize_module()``.
-
-    Returns:
-        bool: ``True`` if ``module`` was materialized and ``False`` if this was
-        a no-op.
+    Returns if ``module`` has parameters on meta device and if ``module`` is
+    using torchdistX deferred initialization. At most of the returned bools can
+    be ``True``. If either is ``True``, then ``module`` needs to be
+    materialized.
     """
     managed_params = _get_orig_params(module, ignored_params)
     is_meta_module = any(param.is_meta for param in managed_params)
@@ -701,35 +741,39 @@ def _materialize_module(
         and _TORCHDISTX_AVAIL
         and any(fake.is_fake(param) for param in managed_params)
     )
-    if (is_meta_module or is_torchdistX_deferred_init) and param_init_fn is not None:
-        if not callable(param_init_fn):
-            raise ValueError(
-                f"Expected {param_init_fn} to be callable but got {type(param_init_fn)}"
-            )
-        param_init_fn(module)
-        return True
-    elif is_meta_module:
-        # Run default meta device initialization
-        materialization_device = device_from_device_id or torch.device(
-            torch.cuda.current_device()
+    return is_meta_module, is_torchdistX_deferred_init
+
+
+def _materialize_with_param_init_fn(
+    module: nn.Module,
+    param_init_fn,
+) -> None:
+    if not callable(param_init_fn):
+        raise ValueError(
+            f"Expected {param_init_fn} to be callable but got {type(param_init_fn)}"
         )
-        module.to_empty(device=materialization_device)
-        try:
-            with torch.no_grad():
-                module.reset_parameters()  # type: ignore[operator]
-        except BaseException as e:
-            warnings.warn(
-                "Unable to call `reset_parameters()` for module on meta "
-                f"device with error {str(e)}. Please ensure your "
-                "module implements a `reset_parameters()` method."
-            )
-            raise e
-        return True
-    elif is_torchdistX_deferred_init:
-        # Run default torchdistX initialization
-        deferred_init.materialize_module(module, check_fn=deferred_init_check_fn)
-        return True
-    return False
+    param_init_fn(module)
+
+
+def _materialize_meta_module(
+    module: nn.Module,
+    device_from_device_id: Optional[torch.device],
+):
+    # Run default meta device initialization
+    materialization_device = device_from_device_id or torch.device(
+        torch.cuda.current_device()
+    )
+    module.to_empty(device=materialization_device)
+    try:
+        with torch.no_grad():
+            module.reset_parameters()  # type: ignore[operator]
+    except BaseException as e:
+        warnings.warn(
+            "Unable to call `reset_parameters()` for module on meta "
+            f"device with error {str(e)}. Please ensure your "
+            "module implements a `reset_parameters()` method."
+        )
+        raise e
 
 
 def _move_module_to_device(
diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py
index 3755bd12f136..6cb4055cf30a 100644
--- a/torch/distributed/fsdp/_optim_utils.py
+++ b/torch/distributed/fsdp/_optim_utils.py
@@ -1,5 +1,6 @@
 import copy
 import functools
+import warnings
 from dataclasses import dataclass
 from typing import (
     Any,
@@ -11,6 +12,7 @@
     NamedTuple,
     Optional,
     Sequence,
+    Set,
     Tuple,
     Union,
 )
@@ -19,12 +21,11 @@
 import torch.distributed as dist
 import torch.distributed.fsdp._traversal_utils as traversal_utils
 import torch.nn as nn
-import torch.nn.functional as F
 from torch.distributed._shard.sharded_tensor import ShardedTensor
 from torch.distributed.fsdp._common_utils import (
     _apply_to_modules,
     _FSDPState,
-    _get_module_fsdp_state_if_comm_module,
+    _get_module_fsdp_state_if_fully_sharded_module,
     _get_param_to_fqns,
     _module_handles,
     clean_tensor_name,
@@ -293,6 +294,7 @@ def _flatten_optim_state_dict(
     model: nn.Module,
     shard_state: bool,
     use_orig_params: bool = False,
+    optim: Optional[torch.optim.Optimizer] = None,
 ) -> Dict[str, Any]:
     """
     Flattens the full optimizer state dict, still keying by unflattened
@@ -300,6 +302,23 @@ def _flatten_optim_state_dict(
     ``FlatParameter`` 's optimizer states are sharded, and otherwise, they are
     kept unsharded.
 
+    If ``use_orig_params`` is True, each rank will have all FSDP-managed
+    parameters but some of these parameters may be empty due to the sharding.
+    For a regular optim.Optimizer, states for those empty parameters will
+    not be initialized. So, when aggregating the FQNs across ranks, no assert
+    will be raised on a rank even if it does not have all the states -- it is
+    valid and FSDP know how to aggregate them. However, FSDP has to ignore
+    handling those parameters that are not managed by FSDP and do not exist on
+    the local rank -- it is managed by other parallelism and FSDP does not
+    know ho to handle/aggregate them.
+
+    Note that ``_flatten_tensor_optim_state`` does not need ``optim`` to
+    flatten/shard the state. However, NamedOptimizer and KeyedOptimizer require
+    all the states even if the corresponding parameters are empty. To this end,
+    ``optim`` will be used to to get the initial state of the empty parameters.
+    ``optim`` should only be non-None if the ``optim` is KeyedOptimizer or
+    NamedOptimizer.
+
     Returns:
         Dict[str, Any]: The flattened optimizer state dict.
     """
@@ -317,6 +336,16 @@ def _flatten_optim_state_dict(
     unflat_osd_state = unflat_osd["state"]
     all_state_keys = set(unflat_osd_state.keys())
 
+    # local_state_dict is used to construct states of empty parameters.
+    # This should only be used if is_named_optimizer=True.
+    local_state_dict: Dict[str, Any] = {}
+    local_state_clean_fqns: Dict[str, str] = {}
+    if optim is not None:
+        local_state_dict = optim.state_dict()["state"]
+        for fqn in local_state_dict.keys():
+            clean_fqn = clean_tensor_name(fqn)
+            local_state_clean_fqns[clean_fqn] = fqn
+
     for param, unflat_param_names in param_to_fqns.items():
         fqn = unflat_param_names[0]
         if fqn not in unflat_osd_state:
@@ -341,10 +370,18 @@ def _flatten_optim_state_dict(
                     shard_state,
                 )
             key = _OptimStateKey(tuple(unflat_param_names), True)
+            # Only include non-empty states since as expected by
+            # `torch.optim.Optimizer` s unless the optimizer is KeyedOptimizer
+            # or NamedOptimizer.
             if flat_state:
-                # Only include non-empty states since as expected by
-                # `torch.optim.Optimizer` s
                 flat_osd_state[key] = flat_state
+            elif optim is not None:  # NamedOptimizer or KeyedOptimizer case.
+                assert len(unflat_param_names) == 1
+                local_wrapped_fqn = local_state_clean_fqns.get(fqn, "")
+                if local_wrapped_fqn:
+                    flat_osd_state[key] = copy.deepcopy(
+                        local_state_dict[local_wrapped_fqn]
+                    )
         else:  # do not flatten non-FSDP parameters' states
             assert len(unflat_param_names) == 1
             key = _OptimStateKey(tuple(unflat_param_names), False)
@@ -448,7 +485,7 @@ def _flatten_optim_state(
             are_pos_dim_tensors &= torch.is_tensor(v) and v.dim() > 0
             are_zero_dim_tensors &= _is_zero_dim_tensor(v)
             are_non_tensors &= not torch.is_tensor(v)
-        types = set(type(v) for v in non_none_state_values)
+        types = {type(v) for v in non_none_state_values}
         if len(types) != 1 or not (
             are_pos_dim_tensors or are_zero_dim_tensors or are_non_tensors
         ):
@@ -533,7 +570,7 @@ def _flatten_tensor_optim_state(
     """
     non_none_tensors = [t for t in pos_dim_tensors if t is not None]
     # Check that all are tensors with the same dtype
-    dtypes = set(t.dtype for t in non_none_tensors)
+    dtypes = {t.dtype for t in non_none_tensors}
     if len(dtypes) != 1:
         raise ValueError(
             "All unflattened parameters comprising a single flattened "
@@ -611,8 +648,8 @@ def _flatten_zero_dim_tensor_optim_state(
     """
     non_none_tensors = [t for t in zero_dim_tensors if t is not None]
     # Enforce that all have the same value and dtype
-    values_set = set(t.item() if t is not None else None for t in zero_dim_tensors)
-    dtypes = set(t.dtype if t is not None else None for t in zero_dim_tensors)
+    values_set = {t.item() if t is not None else None for t in zero_dim_tensors}
+    dtypes = {t.dtype if t is not None else None for t in zero_dim_tensors}
     if (
         len(non_none_tensors) != len(zero_dim_tensors)
         or len(values_set) != 1
@@ -901,21 +938,6 @@ def _broadcast_unsharded_pos_dim_tensor_state(
     param_state[state_name] = unsharded_tensor
 
 
-def _rekey_named_optim_state_dict(optim_state_dict: Dict[str, Any]) -> Dict[str, Any]:
-    """
-    Rekeys the optimizer state dict from _OptimStateKey to FQN. This API is only
-    used when the optimizer is a NamedOptimizer which expects FQN as the keys.
-    """
-    osd = {"state": {}, "param_groups": optim_state_dict["param_groups"]}
-    for k, state in optim_state_dict["state"].items():
-        assert len(k.unflat_param_names) == 1, (
-            "For NamedOptimzer, each _OptimStateKey should have one name "
-            f"in `unflat_param_names` but got {k.unflat_param_names}."
-        )
-        osd["state"][k.unflat_param_names[0]] = state
-    return osd
-
-
 def _rekey_sharded_optim_state_dict(
     sharded_osd: Dict[str, Any],
     model: nn.Module,
@@ -973,17 +995,19 @@ def _rekey_sharded_optim_state_dict(
         if isinstance(key, str):
             rekeyed_osd_state[key] = param_state
             continue
-        flat_param_key = unflat_param_names_to_flat_param_key[key.unflat_param_names]
+        flat_param_key = unflat_param_names_to_flat_param_key.get(
+            key.unflat_param_names, key.unflat_param_names
+        )
         rekeyed_osd_state[flat_param_key] = param_state
 
     rekeyed_osd_param_groups: List[Dict[str, Any]] = []
     for unflat_param_group in sharded_osd["param_groups"]:
         flat_param_group = copy.deepcopy(unflat_param_group)
         flat_param_keys = sorted(
-            set(
+            {
                 unflat_param_name_to_flat_param_key[unflat_param_name]
                 for unflat_param_name in unflat_param_group["params"]
-            )
+            }
         )
         flat_param_group["params"] = flat_param_keys
         rekeyed_osd_param_groups.append(flat_param_group)
@@ -1083,6 +1107,7 @@ def return_fn(flat_param_to_fqn):
         model,
         module_fn,
         return_fn,
+        [fqn for fqn, _ in model.named_parameters()],
         flat_param_to_fqn_ret,
     )
 
@@ -1232,7 +1257,10 @@ def _map_param_key_to_optim_keys(
         fqns = param_to_fqns[param]
         is_fsdp_managed = isinstance(param, FlatParameter)
         if is_fsdp_managed:
-            assert fqns[0] in fqn_to_fsdp_param_info
+            assert fqns[0] in fqn_to_fsdp_param_info, (
+                fqns[0],
+                list(fqn_to_fsdp_param_info.keys()),
+            )
         is_fsdp_managed = fqns[0] in fqn_to_fsdp_param_info
         optim_state_key = _OptimStateKey(
             unflat_param_names=tuple(fqns),
@@ -1250,7 +1278,7 @@ def _map_param_key_to_optim_keys(
         merge_all_optim_state_keys = [
             key for local_keys in all_keys for key in local_keys
         ]
-        all_optim_state_keys = sorted(list(set(merge_all_optim_state_keys)))
+        all_optim_state_keys = sorted(set(merge_all_optim_state_keys))
     else:
         key_obj_list: List[Optional[List[_OptimStateKey]]] = (
             [all_optim_state_keys] if rank == 0 else [None]
@@ -1293,8 +1321,13 @@ def _unflatten_param_groups(
 
 
 def _is_named_optimizer(optim_state_dict: Dict[str, Any]) -> bool:
+    state = optim_state_dict.get("state", None)
+    if not state:
+        # If we cannot find a state, assume it is not NamedOptimizer as
+        # NamedOptimizer has eagerly initialization.
+        return False
     try:
-        key = next(iter(optim_state_dict["state"].keys()))
+        key = next(iter(state.keys()))
     except Exception as e:
         raise Exception(optim_state_dict) from e
     return isinstance(key, str)
@@ -1335,6 +1368,16 @@ def _optim_state_dict(
     states. This API finds the mapping from FQNs to parameters if the optimizer
     is a ``NamedOptimizer``.
 
+    If ``use_orig_params`` is True, each rank will have all FSDP-managed
+    parameters but some of these parameters may be empty due to the sharding.
+    For a regular optim.Optimizer, states for those empty parameters will
+    not be initialized. So, when aggregating the FQNs across ranks, no assert
+    will be raised on a rank even if it does not have all the states -- it is
+    valid and FSDP know how to aggregate them. However, FSDP has to ignore
+    handling those parameters that are not managed by FSDP and do not exist on
+    the local rank -- it is managed by other parallelism and FSDP does not
+    know ho to handle/aggregate them.
+
     Args:
         model (nn.Module): Root module (which may or may not be a
             :class:`FullyShardedDataParallel` instance) whose parameters
@@ -1389,16 +1432,15 @@ def _optim_state_dict(
         param_key: Union[str, int, None] = optim_state_key_to_param_key.get(
             optim_state_key, None
         )
-        assert param_key is not None or (
-            optim_state_key.is_fsdp_managed and use_orig_params
-        ), (
-            "If use_orig_params is False, we must be able to find the "
-            "corresponding param id. If use_orig_params is True, some FSDP "
-            "managedparameters may not exist in the local shard, so the lookup "
-            "can return -1. Both assert conditions failed, some unexpected "
-            "corner case happens."
-            f"{param_key}  {optim_state_key.is_fsdp_managed} {use_orig_params}"
-        )
+
+        if param_key is None:
+            assert use_orig_params, (
+                "If use_orig_params is False, we must be able to find the "
+                f"corresponding param id. {optim_state_key} {param_key}"
+            )
+            if not optim_state_key.is_fsdp_managed:
+                continue
+
         if optim_state_key.is_fsdp_managed:
             # If there are multiple unflat_param_names (not use_orig_params),
             # they share the same FSDPParamInfo. So the first unflat_param_name
@@ -1450,8 +1492,18 @@ def _optim_state_dict(
                 continue
             if key in param_key_to_param:
                 continue
-            # This key is not a parameter state. It is a user-defined state.
-            fsdp_osd_state[key] = copy.copy(value)
+            # This key is not recognized by FSDP. It may be a user-defined state
+            # or some parameters state that FSDP is unable to map from
+            # ``optim.param_groups``.
+            warnings.warn(
+                f"Found a optim state, {key}, that FSDP cannot process. FSDP "
+                "will directly copy everything to the returned state_dict. In "
+                "most cases, this is a user-defined state that is not "
+                "associated with any particular parameter. Another possible "
+                "case is this state is managed by TorchRec. Otherwise, there may "
+                " be a mismatched assumption of optim_state_dict of this mode."
+            )
+            fsdp_osd_state[key] = value
 
         fsdp_osd["param_groups"] = _unflatten_param_groups(
             optim_state_dict, param_key_to_param, param_to_fqns
@@ -1469,7 +1521,7 @@ def _get_fqn_to_fsdp_param_info(model: nn.Module) -> Dict[str, FSDPParamInfo]:
     """
 
     def module_fn(module, prefix, fqn_to_param_info):
-        fsdp_state = _get_module_fsdp_state_if_comm_module(module)
+        fsdp_state = _get_module_fsdp_state_if_fully_sharded_module(module)
         if fsdp_state is None:
             return
         _lazy_init(fsdp_state, module)
@@ -1496,6 +1548,7 @@ def return_fn(fqn_to_param_info):
         model,
         module_fn,
         return_fn,
+        [fqn for fqn, _ in model.named_parameters()],
         fqn_to_param_info,
     )
 
@@ -1515,7 +1568,7 @@ class AllGatherInfo:
 
 
 def _all_gather_optim_state(
-    fsdp_state: _FSDPState, optim_state: Dict[str, Any], param_numel: int
+    fsdp_state: _FSDPState, optim_state: Dict[str, Any]
 ) -> Dict[str, Any]:
     """
     All-gathering state from all the ranks. This API is slow as it uses
@@ -1528,7 +1581,8 @@ def _all_gather_optim_state(
     for state_name, value in sorted_items(optim_state):
         if torch.is_tensor(value):
             if value.dim() == 0:
-                processed_state.scalar_tensors[state_name] = value
+                # Ensure that `step` is on CPU.
+                processed_state.scalar_tensors[state_name] = value.cpu()
             else:
                 processed_state.tensors[state_name] = _PosDimTensorInfo(
                     value.shape, value.dtype
@@ -1544,38 +1598,42 @@ def _all_gather_optim_state(
     gathered_state: Dict[str, Any] = {}
 
     all_tensor_states = sorted(
-        list(set([n for state in object_list for n in state.tensors.keys()]))
+        {n for state in object_list for n in state.tensors.keys()}
     )
+    empty_ranks: Set[int] = set()
     for name in all_tensor_states:
         numels = []
         dtype = torch.float
-        max_numel = 0
-        for object_state in object_list:
+        _empty_ranks: Set[int] = set()
+        for rank, object_state in enumerate(object_list):
             numels.append(0)
             info = object_state.tensors.get(name, None)
             if info is not None:
                 numels[-1] = info.shape.numel()
                 dtype = info.dtype
-                max_numel = max(max_numel, numels[-1])
-        local_state = (
-            optim_state[name]
-            if name in optim_state
-            else torch.empty(max_numel, dtype=dtype, device=fsdp_state.compute_device)
+            if numels[-1] == 0:
+                _empty_ranks.add(rank)
+
+        empty_func = functools.partial(
+            torch.empty, dtype=dtype, device=fsdp_state.compute_device
         )
-        if max_numel > local_state.numel():
-            local_state = F.pad(local_state, [0, max_numel - local_state.numel()])
+        if empty_ranks:
+            assert empty_ranks == _empty_ranks
+        empty_ranks = _empty_ranks
+        local_state = optim_state.get(name, empty_func(0))
+        local_state = local_state.to(fsdp_state.compute_device)
         tensors = [
-            torch.empty(max_numel, dtype=dtype, device=fsdp_state.compute_device)
-            if rank != fsdp_state.rank
-            else local_state
-            for rank in range(len(object_list))
+            empty_func(numel) if rank != fsdp_state.rank else local_state
+            for rank, numel in enumerate(numels)
         ]
         work = dist.all_gather(
             tensors, local_state, group=fsdp_state.process_group, async_op=True
         )
         gathered_state[name] = AllGatherInfo(tensors, numels, work)
 
-    for object_state in object_list:
+    for rank, object_state in enumerate(object_list):
+        if rank in empty_ranks:
+            continue
         for name, non_tensor_value in object_state.non_tensors.items():
             curr_non_tensor_value = gathered_state.get(name, None)
             assert (
@@ -1629,9 +1687,7 @@ def _gather_orig_param_state(
     ):
         return optim_state
 
-    gathered_state = _all_gather_optim_state(
-        fsdp_state, optim_state, flat_param._numels[param_idx]
-    )
+    gathered_state = _all_gather_optim_state(fsdp_state, optim_state)
 
     # Unflatten state values.
     for state_name, value in list(gathered_state.items()):
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 4ba9367ca04a..66afeca8b89d 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -26,11 +26,7 @@
     TrainingState,
 )
 from torch.distributed.fsdp._init_utils import HYBRID_SHARDING_STRATEGIES
-from torch.distributed.fsdp._utils import (
-    _apply_to_tensors,
-    _no_dispatch_record_stream,
-    p_assert,
-)
+from torch.distributed.fsdp._utils import _no_dispatch_record_stream
 from torch.distributed.fsdp.api import BackwardPrefetch
 from torch.distributed.fsdp.flat_param import (
     _HandlesKey,
@@ -39,7 +35,7 @@
     HandleShardingStrategy,
     HandleTrainingState,
 )
-from torch.distributed.utils import _to_kwargs
+from torch.distributed.utils import _apply_to_tensors, _p_assert, _to_kwargs
 
 RESHARD_AFTER_FORWARD_STRATEGIES = {
     HandleShardingStrategy.FULL_SHARD,
@@ -53,16 +49,23 @@
 )
 
 
-def _get_fsdp_root_states(module: nn.Module) -> List[_FSDPState]:
+def _get_fsdp_root_states_with_modules(
+    module: nn.Module,
+) -> Tuple[List[_FSDPState], List[nn.Module]]:
     """
-    Returns all root ``_FSDPState`` instances in the module tree rooted at
-    ``module``.
+    Returns a tuple containing:
+    1. A list of the root ``_FSDPState`` instances in the module tree rooted at
+    ``module`` without any duplicates and following the ``module.modules()``
+    traversal order (which is assumed to be depth-first).
+    2. A corresponding list of the root modules owning the states in the first
+    list.
 
-    This is similar to :func:`_get_fsdp_states` except we must call
-    :func:`_is_fsdp_root` to force a lazy initialization to determine the FSDP
-    root in case lazy initialization has not yet happened.
+    This is similar to :func:`_get_fsdp_states_with_modules` except that we
+    must call :func:`_is_fsdp_root` to force a lazy initialization to determine
+    the FSDP root in case lazy initialization has not yet happened.
     """
     fsdp_root_states: List[_FSDPState] = []
+    fsdp_root_modules: List[nn.Module] = []
     visited_fsdp_states: Set[_FSDPState] = set()
     # NOTE: This function assumes that `module.modules()` proceeds top-down.
     for submodule in module.modules():
@@ -74,6 +77,13 @@ def _get_fsdp_root_states(module: nn.Module) -> List[_FSDPState]:
         ):
             visited_fsdp_states.add(optional_state)
             fsdp_root_states.append(optional_state)
+            fsdp_root_modules.append(submodule)
+    return fsdp_root_states, fsdp_root_modules
+
+
+def _get_fsdp_root_states(module: nn.Module) -> List[_FSDPState]:
+    """See :func:`_get_fsdp_root_states_with_modules`."""
+    fsdp_root_states, _ = _get_fsdp_root_states_with_modules(module)
     return fsdp_root_states
 
 
@@ -205,9 +215,11 @@ def _share_state_and_init_handle_attrs(
     attr_name_to_values: Dict[str, Set[Any]] = {}
     for attr_name in HOMOGENEOUS_ATTR_NAMES:
         attr_name_to_values[attr_name] = set()
-    for fsdp_state in traversal_utils._get_fsdp_states(root_module):
+    root_state._all_fsdp_states = traversal_utils._get_fsdp_states(root_module)
+    root_state._all_handles = root_state._exec_order_data.all_handles  # share reference
+    for fsdp_state in root_state._all_fsdp_states:
         for attr_name in HOMOGENEOUS_ATTR_NAMES:
-            p_assert(
+            _p_assert(
                 hasattr(fsdp_state, attr_name),
                 f"FSDP state missing attribute {attr_name}",
             )
@@ -232,7 +244,7 @@ def _share_state_and_init_handle_attrs(
         # Relax the assert for non-root FSDP instances in case the nested
         # initialized module is wrapped again in FSDP later (e.g. after
         # training to run inference)
-        p_assert(
+        _p_assert(
             fsdp_state._is_root is None or not fsdp_state._is_root,
             "Non-root FSDP instance's `_is_root` should not have been "
             "set yet or should have been set to `False`",
@@ -330,7 +342,7 @@ def _reshard(
     """
     if not handles:
         return
-    p_assert(
+    _p_assert(
         len(handles) == len(free_unsharded_flat_params),
         "Expects both lists to have equal length but got "
         f"{len(handles)} and {len(free_unsharded_flat_params)}",
@@ -504,18 +516,16 @@ def _root_pre_forward(
             may not be the root. If not, then this method does not do anything.
     """
     _lazy_init(state, module)
-    p_assert(state._is_root is not None, "Expects a root FSDP to have been set")
+    _p_assert(state._is_root is not None, "Expects a root FSDP to have been set")
     if not state._is_root:
         return args, kwargs
     if state.forward_prefetch:
         handles_keys = []
-        if _is_composable(state):
-            # TODO: This assumes singleton handles keys.
-            handles_keys = [tuple(handle) for handle in state._handles]
-        else:
-            for fsdp_module in traversal_utils._get_fsdp_states(state):
-                handles_key = tuple(fsdp_module._handles)
-                handles_keys.append(handles_key)
+        for fsdp_state in state._all_fsdp_states:
+            # TODO: Forward prefetch assumes singleton handles key. For the
+            # composable path, `_handles` may have more than one handle,
+            # whereas for the wrapper path, it has at most one handle.
+            handles_keys.extend((handle,) for handle in fsdp_state._handles)
         for handles_key in handles_keys:
             state._needs_pre_forward_unshard[handles_key] = True
     _wait_for_computation_stream(
@@ -523,7 +533,7 @@ def _root_pre_forward(
         state._streams["unshard"],
         state._streams["pre_unshard"],
     )
-    _clear_grads_if_needed(traversal_utils._get_fsdp_handles(module))
+    _clear_grads_if_needed(state._all_handles)
 
     # Prepares the forward inputs by moving them to ``compute_device``
     # TODO: Do not use the side stream for tensor copies for now; investigate
@@ -601,7 +611,7 @@ def _pre_backward_hook(
         # after all backward calls complete
         if state._is_root and not state._post_backward_callback_queued:
             _register_post_backward_final_callback(state, module)
-            _clear_grads_if_needed(traversal_utils._get_fsdp_handles(module))
+            _clear_grads_if_needed(state._all_handles)
         elif _handles_key:
             allowed_states = [TrainingState.IDLE]
             if _is_composable(state):
@@ -661,7 +671,7 @@ def _post_backward_hook(
         # the same `FlatParameter`, the post-backward hook may run multiple
         # times in one backward, in which case we permit the state to already
         # be in `BACKWARD_POST`.
-        p_assert(
+        _p_assert(
             handle._training_state
             in (HandleTrainingState.BACKWARD_PRE, HandleTrainingState.BACKWARD_POST),
             f"Expects `BACKWARD_PRE` or `BACKWARD_POST` state but got {handle._training_state}",
@@ -841,8 +851,8 @@ def _check_comm_hook(
     comm_hook: Any,
     comm_hook_state: Any,
 ) -> None:
-    p_assert(comm_hook is not None, "Communication hook should not be `None`")
-    p_assert(
+    _p_assert(comm_hook is not None, "Communication hook should not be `None`")
+    _p_assert(
         comm_hook_state is not None, "Communication hook state should not be `None`"
     )
 
@@ -851,13 +861,13 @@ def _check_grad_to_accumulate(
     new_sharded_grad: torch.Tensor,
     accumulated_grad: torch.Tensor,
 ) -> None:
-    p_assert(
+    _p_assert(
         accumulated_grad.shape == new_sharded_grad.shape,
         "Shape mismatch when accumulating gradients: "
         f"existing gradient shape={accumulated_grad.shape} "
         f"new gradient shape={new_sharded_grad.shape}",
     )
-    p_assert(
+    _p_assert(
         accumulated_grad.device == new_sharded_grad.device,
         "Device mismatch when accumulating gradients: "
         f"existing gradient device={accumulated_grad.device} "
@@ -881,7 +891,7 @@ def _post_backward_final_callback(
     This runs at the end of the entire backward pass and should only be called
     on the root FSDP instance.
     """
-    p_assert(
+    _p_assert(
         state._is_root,
         "The post-backward callback should only be called on the root FSDP instance",
     )
@@ -896,7 +906,7 @@ def _post_backward_final_callback(
             torch.cuda.current_stream().synchronize()
     root_state._exec_order_data.next_iter()
 
-    for fsdp_state in traversal_utils._get_fsdp_states(module):
+    for fsdp_state in state._all_fsdp_states:
         _catch_all_reshard(fsdp_state)
         _finalize_params(fsdp_state)
         fsdp_state._ran_pre_backward_hook.clear()
@@ -938,7 +948,7 @@ def _catch_all_reshard(
         if handles_to_reshard:
             _reshard(state, handles_to_reshard, free_unsharded_flat_params)
     except Exception as e:
-        p_assert(
+        _p_assert(
             False,
             f"Got exception in the catch-all reshard for {state}: {str(e)}",
             raise_assertion_error=False,
@@ -955,7 +965,7 @@ def _finalize_params(
         flat_param = handle.flat_param
         if flat_param.requires_grad:
             if hasattr(flat_param, "_post_backward_hook_state"):
-                p_assert(
+                _p_assert(
                     len(flat_param._post_backward_hook_state) == 2,
                     f"Invalid: ``_post_backward_hook_state``: {flat_param._post_backward_hook_state}",
                 )
@@ -968,7 +978,7 @@ def _finalize_params(
                 # sharded gradient from the last synchronized iteration
                 continue
             handle.prepare_gradient_for_optim()
-            p_assert(
+            _p_assert(
                 hasattr(flat_param, "_post_backward_called"),
                 "Expects `_post_backward_called` to be set on the `FlatParameter`",
             )
@@ -1015,7 +1025,7 @@ def _get_handles_to_prefetch(
         HandleTrainingState.BACKWARD_POST,
         HandleTrainingState.FORWARD,
     )
-    p_assert(
+    _p_assert(
         training_state in valid_training_states,
         f"Prefetching is only supported in {valid_training_states} but "
         f"currently in {training_state}",
@@ -1053,9 +1063,9 @@ def _get_training_state(
     handles_key: _HandlesKey,
 ) -> HandleTrainingState:
     """Returns the training state of the handles in ``handles_key``."""
-    p_assert(len(handles_key) > 0, "Expects a non-empty handles key")
-    training_states = set(handle._training_state for handle in handles_key)
-    p_assert(
+    _p_assert(len(handles_key) > 0, "Expects a non-empty handles key")
+    training_states = {handle._training_state for handle in handles_key}
+    _p_assert(
         len(training_states) == 1,
         f"Expects uniform training state but got {training_states}",
     )
@@ -1207,6 +1217,12 @@ def _register_post_backward_hooks(
     We register the post-backward hook only once in the *first* forward that a
     ``FlatParameter`` participates in. This relies on the ``AccumulateGrad``
     object being preserved through multiple forwards.
+
+    NOTE: We follow this heuristic to prefer the *first* forward to target the
+    parameter mixed precision case, where there are *separate*
+    ``AccumulateGrad`` objects across the different forwards. (Without
+    parameter mixed precision, the ``AccumulateGrad`` objects are the same.) If
+    we instead prefer the *last* forward, then the hook runs early.
     """
     # If there is no gradient computation, then there is no need for
     # post-backward logic
@@ -1219,7 +1235,7 @@ def _register_post_backward_hooks(
             continue
         # Get the `AccumulateGrad` object
         temp_flat_param = flat_param.expand_as(flat_param)
-        p_assert(
+        _p_assert(
             temp_flat_param.grad_fn is not None,
             "The `grad_fn` is needed to access the `AccumulateGrad` and "
             "register the post-backward hook",
@@ -1241,7 +1257,7 @@ def _register_post_backward_final_callback(
     backward pass. This should be called from the root FSDP instance at the
     beginning of the pre-backward.
     """
-    p_assert(
+    _p_assert(
         state._is_root,
         "Only the root FSDP instance should register the post-backward callback",
     )
@@ -1295,7 +1311,7 @@ def _get_buffers_and_dtypes_for_computation(
     is either ``None`` if buffer mixed precision is not enabled or the buffer
     low precision dtype otherwise.
     """
-    p_assert(state._is_root, "Expects the root to cast buffers")
+    _p_assert(state._is_root, "Expects the root to cast buffers")
     buffers: List[torch.Tensor] = []
     buffer_dtypes: List[Optional[torch.dtype]] = []
     if _is_composable(state):
@@ -1330,7 +1346,7 @@ def _get_buffer_dtypes(
     """
     buffer_dtypes: List[torch.dtype] = []
     for buffer_name in buffer_names:
-        p_assert(
+        _p_assert(
             buffer_name in state._buffer_name_to_orig_dtype,
             f"{buffer_name} is missing from pre-computed dict on rank "
             f"{state.rank}, which only has keys "
@@ -1350,7 +1366,7 @@ def _cast_buffers_to_dtype_and_device(
     to ``device``. If an element in ``buffer_dtypes`` is ``None``, then the
     corresponding buffer is only moved to ``device``.
     """
-    p_assert(
+    _p_assert(
         buffer_dtypes is None or len(buffers) == len(buffer_dtypes),
         f"Expects `buffers` and `buffer_dtypes` to have the same length if "
         f"`buffer_dtypes` is specified but got {len(buffers)} and "
diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py
index 542432f08f60..54ed901dfaca 100644
--- a/torch/distributed/fsdp/_state_dict_utils.py
+++ b/torch/distributed/fsdp/_state_dict_utils.py
@@ -6,9 +6,7 @@
 import torch
 import torch.distributed as dist
 import torch.distributed.algorithms._checkpoint.checkpoint_wrapper as checkpoint_wrapper
-import torch.distributed.fsdp._traversal_utils as traversal_utils
 
-# Import the entire FSDP file to avoid circular imports
 import torch.nn as nn
 import torch.nn.functional as F
 
@@ -43,7 +41,7 @@
 from ._unshard_param_utils import (
     _deregister_orig_params,
     _register_orig_params,
-    _unshard_params,
+    _unshard_fsdp_state_params,
     FLAT_PARAM,
 )
 from .flat_param import FlatParamHandle
@@ -54,8 +52,7 @@ def _convert_to_wrapped_module_name(module_name: str) -> str:
     module_name = module_name.replace(f"{FSDP_WRAPPED_MODULE}", "")
     if module_name:
         module_name = f"{module_name}."
-    # Activation checkpoint adds a prefix that has to be
-    # removed as well.
+    # `CheckpointWrapper` adds a prefix that has to be removed as well.
     module_name = module_name.replace(checkpoint_wrapper._CHECKPOINT_PREFIX, "")
     return module_name
 
@@ -86,7 +83,6 @@ def _shared_param_fqns(module: nn.Module, fsdp_state) -> Iterator[Tuple[str, str
 def _enter_unshard_params_ctx(
     module: nn.Module,
     fsdp_state: _FSDPState,
-    recurse: bool = False,
     writeback: bool = False,
     rank0_only: bool = False,
     offload_to_cpu: bool = False,
@@ -95,13 +91,13 @@ def _enter_unshard_params_ctx(
     """
     state_dict hooks cannot use the pure context call as the checkpoint flow
     requires to enter the context in the pre-hook but leave the context in the
-    post-hook. This API enters the context of ``_unshard_params``.
+    post-hook. This API enters the context of ``_unshard_fsdp_state_params``.
     """
     assert module not in fsdp_state._unshard_params_ctx, (
-        "Entering the ``_unshard_params`` context but _unshard_params_ctx[module] "
+        "Entering the ``_unshard_fsdp_state_params`` context but _unshard_params_ctx[module] "
         "is not None."
     )
-    fsdp_state._unshard_params_ctx[module] = _unshard_params(
+    fsdp_state._unshard_params_ctx[module] = _unshard_fsdp_state_params(
         module,
         fsdp_state,
         writeback=writeback,
@@ -114,7 +110,7 @@ def _enter_unshard_params_ctx(
 
 @no_type_check
 def _exit_unshard_params_ctx(module: nn.Module, fsdp_state: _FSDPState) -> None:
-    """A helper function to exit ``_unshard_params`` context."""
+    """A helper function to exit ``_unshard_fsdp_state_params`` context."""
     fsdp_state._unshard_params_ctx[module].__exit__(None, None, None)
     fsdp_state._unshard_params_ctx.pop(module)
 
@@ -130,7 +126,7 @@ def _common_pre_state_dict_hook(
     _lazy_init(fsdp_state, module)
     # TODO: change to this call after pre_state_dict_hook is in `nn.Module`.
     if fsdp_state._is_root:
-        _clear_grads_if_needed(traversal_utils._get_fsdp_handles(module))
+        _clear_grads_if_needed(fsdp_state._all_handles)
 
 
 def _common_unshard_pre_state_dict_hook(
@@ -141,12 +137,11 @@ def _common_unshard_pre_state_dict_hook(
 ) -> None:
     """
     Performs the pre-state_dict tasks shared by all state_dict types that require
-    ``_unshard_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this hook.
+    ``_unshard_fsdp_state_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this hook.
     """
     _enter_unshard_params_ctx(
         module,
         fsdp_state,
-        recurse=False,
         writeback=False,
         offload_to_cpu=offload_to_cpu,
         rank0_only=rank0_only,
@@ -164,7 +159,7 @@ def _common_unshard_post_state_dict_hook(
 ) -> Dict[str, Any]:
     """
     The post-state_dict flow that shared by all state_dict types that require
-    ``_unshard_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this
+    ``_unshard_fsdp_state_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this
     hook.
     """
     _replace_by_prefix(state_dict, prefix + f"{FSDP_PREFIX}", prefix)
@@ -248,7 +243,7 @@ def _common_unshard_post_state_dict_hook(
             _cast_buffers_to_dtype_and_device(
                 buffers, buffer_dtypes, fsdp_state.compute_device
             )
-            for buffers, clean_fqn in zip(buffers, buffer_clean_fqns):
+            for buffer, clean_fqn in zip(buffers, buffer_clean_fqns):
                 fqn = f"{prefix}{clean_fqn}"
                 state_dict[fqn] = buffer.clone()
     return state_dict
@@ -290,7 +285,7 @@ def _full_post_state_dict_hook(
     """
     Hook that runs after model.state_dict() is called before returning result to
     user. For FSDP, we may have to clone the tensors in state_dict as params go
-    back to sharded version after _unshard_params ends, and also remove
+    back to sharded version after _unshard_fsdp_state_params ends, and also remove
     the ``FSDP_WRAPPED_MODULE`` prefix.
     """
 
@@ -307,7 +302,7 @@ def param_hook(
         if clean_key.startswith(clean_prefix):
             clean_key = clean_key[len(clean_prefix) :]
 
-        # Clone parameters before exiting the `_unshard_params()` context.
+        # Clone parameters before exiting the `_unshard_fsdp_state_params()` context.
         if not getattr(state_dict[fqn], "_has_been_cloned", False):
             try:
                 state_dict[fqn] = state_dict[fqn].clone().detach()
@@ -333,7 +328,7 @@ def _full_pre_load_state_dict_hook(
     prefix: str,
 ) -> None:
     _lazy_init(fsdp_state, module)
-    _enter_unshard_params_ctx(module, fsdp_state, recurse=False, writeback=True)
+    _enter_unshard_params_ctx(module, fsdp_state, writeback=True)
     # Add FSDP_PREFIX only for wrapper-based FSDP.
     if not _is_composable(fsdp_state):
         _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
@@ -390,15 +385,23 @@ def _local_post_state_dict_hook(
     # to get flat_param to get the metadata.
     assert _module_handles(fsdp_state, module), "Should have returned early"
     flat_param = _module_handles(fsdp_state, module)[0].flat_param
-    # Construct a ShardedTensor from the flat_param.
+    # Constructs a ShardedTensor from the flat_param "without" padding.
+    # Removing the padding allows users to change the number of ranks
+    # when loading the local_state_dict.
     full_numel = flat_param._unpadded_unsharded_size.numel()  # type: ignore[attr-defined]
     shard_offset = flat_param.numel() * fsdp_state.rank
     valid_data_size = flat_param.numel() - flat_param._shard_numel_padded
-    if valid_data_size > 0 and flat_param._shard_numel_padded > 0:
-        flat_param = flat_param.narrow(0, 0, valid_data_size)
-    local_shards = [
-        Shard.from_tensor_and_offsets(flat_param, [shard_offset], fsdp_state.rank)
-    ]
+    if valid_data_size > 0:
+        # If FlatParameter is returned, FlatParameter._local_shard cause a
+        # pickling issue (can be torch.save but not torch.load). Since there
+        # is no benefit for state_dict to return the actual FlatParameter class,
+        # a view (which is a tensor) of the FlatParameter will be returned.
+        flat_param = flat_param[:valid_data_size].view(valid_data_size)
+        local_shards = [
+            Shard.from_tensor_and_offsets(flat_param, [shard_offset], fsdp_state.rank)
+        ]
+    else:
+        local_shards = []
     sharded_tensor = init_from_local_shards(
         local_shards, full_numel, process_group=fsdp_state.process_group
     )  # type: ignore[assignment]
@@ -440,20 +443,24 @@ def _local_pre_load_state_dict_hook(
     ), "Tensors in local_state_dict should be ShardedTensor."
 
     # Convert the ShardedTensor to a Tensor.
-    shards = load_tensor.local_shards()
-    assert len(shards), "load_local_state_dict assume one shard per ShardedTensor."
-    load_tensor = shards[0].tensor
-
-    # Get the metadata of the flat_param to decide whether to pad the loaded
-    # tensor.
     flat_param = _module_handles(fsdp_state, module)[0].flat_param
     assert flat_param is not None
-    if flat_param._shard_numel_padded not in (0, flat_param.numel()):
-        assert load_tensor.numel() < flat_param.numel(), (
-            f"Local shard size = {flat_param.numel()} and the tensor in "
-            f"the state_dict is {load_tensor.numel()}."
-        )
-        load_tensor = F.pad(load_tensor, [0, flat_param._shard_numel_padded])
+    valid_data_size = flat_param.numel() - flat_param._shard_numel_padded
+    shards = load_tensor.local_shards()
+    if valid_data_size > 0:
+        assert len(shards), "load_local_state_dict assume one shard per ShardedTensor."
+        load_tensor = shards[0].tensor
+
+        # Get the metadata of the flat_param to decide whether to pad the loaded
+        # tensor.
+        if flat_param._shard_numel_padded > 0:
+            assert load_tensor.numel() < flat_param.numel(), (
+                f"Local shard size = {flat_param.numel()} and the tensor in "
+                f"the state_dict is {load_tensor.numel()}."
+            )
+            load_tensor = F.pad(load_tensor, [0, flat_param._shard_numel_padded])
+    else:
+        load_tensor = flat_param
     state_dict[fqn] = load_tensor
 
 
diff --git a/torch/distributed/fsdp/_traversal_utils.py b/torch/distributed/fsdp/_traversal_utils.py
index f4756371530b..b0238ca5f49a 100644
--- a/torch/distributed/fsdp/_traversal_utils.py
+++ b/torch/distributed/fsdp/_traversal_utils.py
@@ -6,7 +6,7 @@
 """
 
 import collections
-from typing import Deque, List, Set
+from typing import Deque, List, Set, Tuple
 
 import torch.nn as nn
 from torch.distributed._composable.contract import _get_registry
@@ -40,22 +40,30 @@ def _composable(module: nn.Module) -> bool:
     return "replicate" not in _get_registry(module)
 
 
-def _get_fsdp_states(module: nn.Module) -> List[_FSDPState]:
+# TODO (awgu): We may be able to remove this function if we retired the
+# `use_orig_params=False` code path since so far we only need the module for
+# `FlatParameter` registration, which is not needed for `use_orig_params=True`.
+def _get_fsdp_states_with_modules(
+    module: nn.Module,
+) -> Tuple[List[_FSDPState], List[nn.Module]]:
     """
-    Returns all ``_FSDPState`` instances in the module tree rooted at
+    Returns a tuple containing:
+    1. A list of the ``_FSDPState`` instances in the module tree rooted at
     ``module`` without any duplicates and following the ``module.modules()``
-    traversal order (which is assumed to remain as depth-first). However, the
-    traversal does not proceed into any module annotated by an incompatible
-    API (e.g. ``replicate``).
+    traversal order (which is assumed to be depth-first).
+    2. A corresponding list of the modules owning the states in the first list.
 
-    For the wrapper code path, this returns all ``FullyShardedDataParallel``
-    instances. For the non-wrapper code path, this returns composable state
-    instances.
+    For the wrapper code path, both returned lists are the same, each
+    containing all ``FullyShardedDataParallel`` instances. For the composable
+    code path, this returns a list of all composable state instances and a list
+    of the corresponding fully sharded modules. See [Note: Fully Sharded
+    Module].
 
-    NOTE: For now, we must pass an ``nn.Module`` as the argument because
-    ``_FSDPState`` does not support graph traversal.
+    NOTE: The traversal does not proceed into any module annotated by an
+    incompatible API (e.g. ``replicate``).
     """
     fsdp_states: List[_FSDPState] = []
+    fsdp_modules: List[nn.Module] = []
     # Track the visited FSDP states since multiple modules may share the same
     # one and we want to return a de-duplicated list
     visited_fsdp_states: Set[_FSDPState] = set()
@@ -80,6 +88,13 @@ def _get_fsdp_states(module: nn.Module) -> List[_FSDPState]:
         if optional_state is not None and optional_state not in visited_fsdp_states:
             visited_fsdp_states.add(optional_state)
             fsdp_states.append(optional_state)
+            fsdp_modules.append(submodule)
+    return fsdp_states, fsdp_modules
+
+
+def _get_fsdp_states(module: nn.Module) -> List[_FSDPState]:
+    """See :func:`_get_fsdp_states_with_modules`."""
+    fsdp_states, _ = _get_fsdp_states_with_modules(module)
     return fsdp_states
 
 
diff --git a/torch/distributed/fsdp/_unshard_param_utils.py b/torch/distributed/fsdp/_unshard_param_utils.py
index 950841850b62..af75cea11ba7 100644
--- a/torch/distributed/fsdp/_unshard_param_utils.py
+++ b/torch/distributed/fsdp/_unshard_param_utils.py
@@ -3,21 +3,25 @@
 from typing import cast, Generator, List
 
 import torch
+import torch.distributed.fsdp._traversal_utils as traversal_utils
 import torch.nn as nn
 from torch.distributed.fsdp._common_utils import (
     _FSDPState,
     _has_fsdp_params,
     _module_handles,
     HandleTrainingState,
+    TrainingState,
 )
 from torch.distributed.fsdp._runtime_utils import (
     _clear_grads_if_needed,
+    _get_fsdp_root_states_with_modules,
+    _lazy_init,
     _reshard,
     _reshard_grads,
     _unshard,
     _unshard_grads,
 )
-from ._utils import p_assert
+from torch.distributed.utils import _p_assert
 from .flat_param import FlatParamHandle
 
 FLAT_PARAM = "_flat_param"
@@ -38,33 +42,29 @@ def _writeback_to_local_shard(
     padded unsharded flattened parameter.
     """
     for handle in handles:
-        # For `NO_SHARD`, `_local_shard` is the unsharded flattened
-        # parameter and `grad` is the unsharded gradient, so there is no
-        # need to writeback for either
-        if not handle.uses_sharded_strategy:
-            continue
-        assert (
-            handle.flat_param.ndim == 1
-        ), f"Expects `flat_param` to be flattened but got {handle.flat_param.shape}"
-
-        # Get the unpadded shard instead of the padded shard to persist
-        # user changes to the padding (though FSDP does not explicitly
-        # support this)
-        param_shard, _ = FlatParamHandle._get_unpadded_shard(
-            handle.flat_param,
-            handle.rank,
-            handle.world_size,
-        )
+
+        def _get_shard(flat_param_or_grad: torch.Tensor) -> torch.Tensor:
+            if handle.uses_sharded_strategy:
+                # For sharded strategies, get the *unpadded* shard instead of
+                # the *padded* shard to persist user changes to the padding
+                # (though FSDP does not explicitly support this)
+                shard, _ = FlatParamHandle._get_unpadded_shard(
+                    flat_param_or_grad,
+                    handle.rank,
+                    handle.world_size,
+                )
+                return shard
+            # For `NO_SHARD`, the `flat_param` or its gradient may be modified,
+            # so we write it back directly
+            return flat_param_or_grad
+
+        param_shard = _get_shard(handle.flat_param)
         handle.flat_param._local_shard[: param_shard.numel()].copy_(param_shard)  # type: ignore[attr-defined]
         if writeback_grad:
             existing_grad = handle.sharded_grad
             if existing_grad is not None:
                 assert handle.flat_param.grad is not None
-                grad_shard, _ = FlatParamHandle._get_unpadded_shard(
-                    handle.flat_param.grad,
-                    handle.rank,
-                    handle.world_size,
-                )
+                grad_shard = _get_shard(handle.flat_param.grad)
                 existing_grad[: grad_shard.numel()].copy_(grad_shard)
 
 
@@ -120,38 +120,57 @@ def _unflatten_as_params(state: _FSDPState, module: nn.Module) -> Generator:
                 _register_flat_param(state, module)
 
 
-@contextlib.contextmanager
-def _unshard_params(
-    module: nn.Module,
+def _validate_unshard_params_args(
     state: _FSDPState,
-    writeback: bool = True,
-    rank0_only: bool = False,
-    offload_to_cpu: bool = False,
-    with_grads: bool = False,
-):
+    writeback: bool,
+    rank0_only: bool,
+    offload_to_cpu: bool,
+    with_grads: bool,
+) -> None:
     if with_grads and (offload_to_cpu or not state._use_orig_params):
         raise NotImplementedError(
-            f"with_grads={with_grads} "
-            f"use_orig_params={state._use_orig_params} "
+            f"with_grads={with_grads}, "
+            f"use_orig_params={state._use_orig_params}, "
             f"offload_to_cpu={offload_to_cpu} "
             f"is not supported yet"
         )
+    if offload_to_cpu and any(
+        not handle.uses_sharded_strategy for handle in state._handles
+    ):
+        raise NotImplementedError(
+            "offload_to_cpu=True and NO_SHARD is not supported yet"
+        )
     if writeback and rank0_only:
-        raise ValueError(
-            "writeback=True and rank0_only=True is not supported, as model "
-            "parameter shapes will be different across ranks, and writing "
-            "to them can lead to inconsistencies across ranks when the "
-            "context is exited."
+        # TODO: Rank 0 can broadcast the `FlatParameter` to allow all ranks to
+        # persist the changes.
+        raise NotImplementedError(
+            "writeback=True and rank0_only=True is not supported yet"
         )
     if offload_to_cpu and not rank0_only:
         warnings.warn(
-            "offload_to_cpu and rank0_only=False will result in "
-            "full parameters being redundantly copied to CPU memory for "
-            "GPUs that reside on the same machine, which may incur the risk of "
-            "CPU OOM. It is recommended to use ``offload_to_cpu`` with "
-            "rank0_only=True."
+            "offload_to_cpu=True and rank0_only=False may result in the"
+            "unsharded parameters being redundantly copied to CPU memory for "
+            "GPUs sharing the same CPU memory, which risks CPU OOM. We "
+            "recommend using offload_to_cpu=True with rank0_only=True."
         )
 
+
+@contextlib.contextmanager
+def _unshard_fsdp_state_params(
+    module: nn.Module,
+    state: _FSDPState,
+    writeback: bool,
+    rank0_only: bool,
+    offload_to_cpu: bool,
+    with_grads: bool,
+):
+    """
+    This unshards the parameters for a single FSDP state ``state`` that
+    corresponds to ``module``.
+    """
+    _validate_unshard_params_args(
+        state, writeback, rank0_only, offload_to_cpu, with_grads
+    )
     torch.cuda.synchronize()
     # If handles are shared by other module(s), the handle may be already unsharded.
     handles = [
@@ -164,8 +183,9 @@ def _unshard_params(
         return
 
     for handle in handles:
-        if handle._training_state != HandleTrainingState.IDLE:
-            raise ValueError(f"Current handle state is {handle._training_state}")
+        assert (
+            handle._training_state == HandleTrainingState.IDLE
+        ), f"Expects the handle training to be IDLE but got {handle._training_state}"
 
     for handle in handles:
         handle._training_state = HandleTrainingState.SUMMON_FULL_PARAMS
@@ -196,11 +216,10 @@ def _unshard_params(
             for handle in handles:
                 if offload_to_cpu and handle.uses_sharded_strategy:
                     stack.enter_context(handle.to_cpu())
-                    # TODO (awgu): Since PyTorch enforces that a parameter
-                    # and its gradients need to match metadata (e.g.
-                    # device), we must move gradients to CPU *after* we
-                    # move parameters.
-            # TODO (awgu): This FPW call assumes 1 `FlatParameter`
+                    # NOTE: Since PyTorch enforces that a parameter and its
+                    # gradients need to match metadata (e.g. device), we must
+                    # move gradients to CPU *after* we move parameters.
+            # NOTE: This assumes 1 `FlatParameter`
             if not state._use_orig_params:
                 stack.enter_context(_unflatten_as_params(state, module))
             try:
@@ -216,12 +235,108 @@ def _unshard_params(
                     handle._training_state = HandleTrainingState.IDLE
 
 
+@contextlib.contextmanager
+def _unshard_params_recurse(
+    module: nn.Module,
+    state: _FSDPState,
+    recurse: bool,
+    writeback: bool,
+    rank0_only: bool,
+    offload_to_cpu: bool,
+    with_grads: bool,
+):
+    """
+    This is a helper for :func:`_unshard_params` that recursively calls
+    :func:`_unshard_fsdp_state_params` on FSDP states if ``recurse=True``.
+    NOTE: This runs lazy initialization.
+    """
+    _validate_unshard_params_args(
+        state, writeback, rank0_only, offload_to_cpu, with_grads
+    )
+    if recurse:
+        with contextlib.ExitStack() as stack:
+            # TODO (awgu): The traversal function does not traverse through
+            # incompatible composable APIs. Verify if this is the desired
+            # behavior for this function.
+            for state, fsdp_module in zip(
+                *traversal_utils._get_fsdp_states_with_modules(module)
+            ):
+                stack.enter_context(
+                    _unshard_params_recurse(
+                        module=fsdp_module,
+                        state=state,
+                        recurse=False,
+                        writeback=writeback,
+                        rank0_only=rank0_only,
+                        offload_to_cpu=offload_to_cpu,
+                        with_grads=with_grads,
+                    )
+                )
+            yield
+        return
+    _lazy_init(state, module)
+    if state.training_state == TrainingState.FORWARD_BACKWARD:
+        raise AssertionError(
+            "Cannot manually unshard parameters during forward/backward"
+        )
+    elif state.training_state == TrainingState.SUMMON_FULL_PARAMS:
+        raise AssertionError(
+            "Cannot manually unshard parameters when already unsharding parameters"
+        )
+    with _unshard_fsdp_state_params(
+        module=module,
+        state=state,
+        writeback=writeback,
+        rank0_only=rank0_only,
+        offload_to_cpu=offload_to_cpu,
+        with_grads=with_grads,
+    ):
+        try:
+            state.training_state = TrainingState.SUMMON_FULL_PARAMS
+            yield
+        finally:
+            state.training_state = TrainingState.IDLE
+
+
+@contextlib.contextmanager
+def _unshard_params(
+    module: nn.Module,
+    recurse: bool,
+    writeback: bool,
+    rank0_only: bool,
+    offload_to_cpu: bool,
+    with_grads: bool,
+):
+    """
+    This unshards FSDP-managed parameters for all modules with FSDP applied in
+    the module tree rooted at ``module``.
+    """
+    root_fsdp_states, root_fsdp_modules = _get_fsdp_root_states_with_modules(module)
+    with contextlib.ExitStack() as stack:
+        for root_fsdp_state, root_fsdp_module in zip(
+            root_fsdp_states, root_fsdp_modules
+        ):
+            stack.enter_context(
+                _unshard_params_recurse(
+                    module=root_fsdp_module,
+                    state=root_fsdp_state,
+                    recurse=recurse,
+                    writeback=writeback,
+                    rank0_only=rank0_only,
+                    offload_to_cpu=offload_to_cpu,
+                    with_grads=with_grads,
+                )
+            )
+        yield
+    return
+
+
 def _deregister_orig_params(state: _FSDPState, module: nn.Module) -> None:
     """
     Deregisters the original parameters; registers the ``FlatParameter``.
     """
     handles = _module_handles(state, module)
-    p_assert(
+    _p_assert(
         len(handles) <= 1,
         "Expects <=1 handle per FSDP instance; needs to be refactored "
         "for >1 handle (e.g. non-recursive wrapping)",
@@ -229,7 +344,7 @@ def _deregister_orig_params(state: _FSDPState, module: nn.Module) -> None:
     if not handles:
         return
     handle = handles[0]
-    p_assert(
+    _p_assert(
         handle._use_orig_params,
         f"Inconsistent `_use_orig_params` -- FSDP: {state._use_orig_params} "
         f"handle: {handle._use_orig_params}",
diff --git a/torch/distributed/fsdp/_utils.py b/torch/distributed/fsdp/_utils.py
index 5efb376e6645..45c8c455422b 100644
--- a/torch/distributed/fsdp/_utils.py
+++ b/torch/distributed/fsdp/_utils.py
@@ -1,14 +1,7 @@
-import dataclasses
-import traceback
-from collections import OrderedDict
-from typing import Any, Callable, cast, Dict, List, Set, Tuple, Union
+from typing import cast
 
 import torch
 from torch.nn.modules.batchnorm import _BatchNorm
-from torch.nn.parallel.scatter_gather import (  # type: ignore[attr-defined]
-    _is_namedtuple,
-)
-from torch.nn.utils.rnn import PackedSequence
 from torch.utils._mode_utils import no_dispatch
 
 
@@ -22,102 +15,12 @@ def _override_batchnorm_mixed_precision(module):
             mod._wrap_overrides = {"mixed_precision": None}  # type: ignore[assignment]
 
 
-def _apply_to_tensors(
-    fn: Callable,
-    container: Union[torch.Tensor, Dict, List, Tuple, Set, OrderedDict, PackedSequence],
-) -> Any:
-    """Recursively apply to all tensor in different kinds of container types."""
-
-    def apply(
-        x: Union[torch.Tensor, Dict, List, Tuple, Set, OrderedDict, PackedSequence]
-    ) -> Any:
-        if torch.is_tensor(x):
-            return fn(x)
-        elif hasattr(x, "__dataclass_fields__"):
-            dc = dataclasses.replace(x)
-            for f in dataclasses.fields(dc):
-                name = f.name
-                setattr(dc, name, apply(getattr(dc, name)))
-            return dc
-        elif isinstance(x, OrderedDict):
-            od = x.__class__()
-            for key, value in x.items():
-                od[key] = apply(value)
-            return od
-        elif isinstance(x, PackedSequence):
-            apply(x.data)
-            return x
-        elif isinstance(x, dict):
-            return {key: apply(value) for key, value in x.items()}
-        elif _is_namedtuple(x):
-            res = (apply(el) for el in x)
-            return type(x)(*res)
-        elif isinstance(x, (list, tuple, set)):
-            return type(x)(apply(el) for el in x)
-        else:
-            return x
-
-    return apply(container)
-
-
-@torch.no_grad()
-def _alloc_storage(tensor: torch.Tensor, size: torch.Size) -> bool:
-    """
-    Allocate storage for ``tensor`` with the given size.
-
-    Returns:
-        bool: ``True`` if this method allocated storage and ``False`` if the
-        storage was already allocated.
-    """
-    already_allocated = tensor._typed_storage()._size() == size.numel()
-    if not already_allocated:
-        tensor_storage_size = tensor._typed_storage()._size()
-        p_assert(
-            tensor_storage_size == 0,
-            f"Tensor storage should have been resized to be 0 but got {tensor_storage_size}",
-        )
-        tensor._typed_storage()._resize_(size.numel())
-    return not already_allocated
-
-
-@torch.no_grad()
-def _free_storage(tensor: torch.Tensor) -> bool:
-    """
-    Frees the underlying storage of ``tensor``.
-
-    Returns:
-        bool: ``True`` if the method freed the storage and ``False`` if the
-        storage was already freed.
-    """
-    already_freed = tensor._typed_storage()._size() == 0
-    if not already_freed:
-        p_assert(
-            tensor.storage_offset() == 0,
-            "Freeing a tensor's storage is unsafe when it is not the sole occupant\n"
-            f"storage offset: {tensor.storage_offset()}\n"
-            f"storage size: {tensor._typed_storage()._size()}\n"
-            f"tensor shape: {tensor.shape}",
-        )
-        tensor._typed_storage()._resize_(0)
-    return not already_freed
-
-
 def _same_storage(x: torch.Tensor, y: torch.Tensor) -> bool:
     """Returns if ``x`` and ``y`` share the same storage."""
     # NOTE: CPU and GPU tensors are ensured to have different data pointers.
     return x._typed_storage()._data_ptr() == y._typed_storage()._data_ptr()
 
 
-def p_assert(cond: Any, s: str, raise_assertion_error: bool = True) -> None:
-    """This is used as an alternate to ``assert`` when in the backward context
-    to print the error message ``s`` since otherwise, it is swallowed."""
-    if not cond:
-        print(s)
-        traceback.print_stack()
-        if raise_assertion_error:
-            raise AssertionError(s)
-
-
 def _no_dispatch_record_stream(tensor: torch.Tensor, stream: torch.cuda.Stream) -> None:
     with no_dispatch():
         tensor.record_stream(cast(torch._C.Stream, stream))
diff --git a/torch/distributed/fsdp/_wrap_utils.py b/torch/distributed/fsdp/_wrap_utils.py
index f9b5f8975486..b60b4aae991b 100644
--- a/torch/distributed/fsdp/_wrap_utils.py
+++ b/torch/distributed/fsdp/_wrap_utils.py
@@ -26,10 +26,6 @@ class FullyShardedModuleState(NamedTuple):
 
     params: List[nn.Parameter]
     buffers: List[torch.Tensor]
-    # Parameter and buffer names are prefixed starting from the submodule,
-    # which is not necessarily the root module
-    param_names: List[str]
-    buffer_names: List[str]
 
 
 def _auto_wrap(
@@ -137,9 +133,7 @@ def _get_fully_sharded_module_to_states(
         deque: Deque[Tuple[nn.Module, str]] = collections.deque()
         deque.append((submodule, ""))
         params: List[nn.Parameter] = []
-        param_names: List[str] = []
         buffers: List[torch.Tensor] = []
-        buffer_names: List[str] = []
         while len(deque) > 0:
             module, prefix = deque.popleft()
             # Reverse `named_children()`, use `appendleft()`, and add to the
@@ -149,18 +143,16 @@ def _get_fully_sharded_module_to_states(
             ):
                 if child_module not in wrapped_modules_set:
                     deque.appendleft((child_module, prefix + child_module_name + "."))
-            for param_name, param in module.named_parameters(recurse=False):
+            for param in module.parameters(recurse=False):
                 if param not in visited_params and not _is_fsdp_flattened(param):
                     params.append(param)
                     visited_params.add(param)
-                    param_names.append(prefix + param_name)
-            for buffer_name, buffer in module.named_buffers(recurse=False):
+            for buffer in module.buffers(recurse=False):
                 if buffer not in visited_buffers:
                     buffers.append(buffer)
                     visited_buffers.add(buffer)
-                    buffer_names.append(prefix + buffer_name)
         fully_sharded_module_to_states[submodule] = FullyShardedModuleState(
-            params, buffers, param_names, buffer_names
+            params, buffers
         )
     return fully_sharded_module_to_states
 
diff --git a/torch/distributed/fsdp/api.py b/torch/distributed/fsdp/api.py
index a8bd7db6e4c3..6e222cd42b52 100644
--- a/torch/distributed/fsdp/api.py
+++ b/torch/distributed/fsdp/api.py
@@ -20,6 +20,11 @@
     "FullStateDictConfig",
     "LocalStateDictConfig",
     "ShardedStateDictConfig",
+    "OptimStateDictConfig",
+    "FullOptimStateDictConfig",
+    "LocalOptimStateDictConfig",
+    "ShardedOptimStateDictConfig",
+    "StateDictSettings",
 ]
 
 
@@ -301,3 +306,38 @@ class LocalStateDictConfig(StateDictConfig):
 @dataclass
 class ShardedStateDictConfig(StateDictConfig):
     pass
+
+
+@dataclass
+class OptimStateDictConfig:
+    """
+    ``OptimStateDictConfig`` is the base class for all optimizer state_dict
+    configuration classes.  Users should instantiate a child version
+    (i.e. ``FullOptimStateDictConfig``) in order to configure settings for the
+    particular type of ``optim_state_dict`` implementation FSDP will use.
+    """
+
+    # TODO: actually use this flag in the _optim_utils.py
+    offload_to_cpu: bool = True
+
+
+@dataclass
+class FullOptimStateDictConfig(OptimStateDictConfig):
+    rank0_only: bool = False
+
+
+@dataclass
+class LocalOptimStateDictConfig(OptimStateDictConfig):
+    offload_to_cpu: bool = False
+
+
+@dataclass
+class ShardedOptimStateDictConfig(OptimStateDictConfig):
+    pass
+
+
+@dataclass
+class StateDictSettings:
+    state_dict_type: StateDictType
+    state_dict_config: StateDictConfig
+    optim_state_dict_config: OptimStateDictConfig
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index f58e2eecb1fd..1bfc2090a7cf 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -27,15 +27,10 @@
     _set_fsdp_flattened,
     HandleTrainingState,
 )
+from torch.distributed.utils import _alloc_storage, _free_storage, _p_assert
 
 from ._fsdp_extensions import _ext_post_unflatten_transform, _ext_pre_flatten_transform
-from ._utils import (
-    _alloc_storage,
-    _free_storage,
-    _no_dispatch_record_stream,
-    _same_storage,
-    p_assert,
-)
+from ._utils import _no_dispatch_record_stream, _same_storage
 
 __all__ = [
     "FlatParameter",
@@ -274,8 +269,8 @@ def _init_metadata(
         self._fqns = tuple(fqns)
         self._shared_param_infos = tuple(shared_param_infos)
         self._param_extensions = tuple(param_extensions)
-        self._modules = set(pi.module for pi in self._param_infos).union(
-            set(spi.module for spi in self._shared_param_infos)
+        self._modules = {pi.module for pi in self._param_infos}.union(
+            {spi.module for spi in self._shared_param_infos}
         )
         assert (params is None) == (shared_params is None)
         if params is not None:
@@ -558,7 +553,7 @@ def shard(self):
         if not self.uses_sharded_strategy:
             self._init_shard_metadata(0, 0, flat_param.numel() - 1)
         else:
-            p_assert(
+            _p_assert(
                 flat_param.storage_offset() == 0,
                 "The `FlatParameter` is not the sole occupant of its storage",
             )
@@ -600,8 +595,8 @@ def _init_shard_metadata(
         """
         self.flat_param._sharded_size = self.flat_param.size()  # type: ignore[attr-defined]
         sharded_flat_param_numel = self.flat_param.numel()  # includes `numel_padded`
-        p_assert(start >= 0 and start <= end, f"start: {start} end: {end}")
-        p_assert(
+        _p_assert(start >= 0 and start <= end, f"start: {start} end: {end}")
+        _p_assert(
             numel_padded <= sharded_flat_param_numel,
             f"numel_padded: {numel_padded} "
             f"sharded_flat_param_numel: {sharded_flat_param_numel}",
@@ -792,7 +787,7 @@ def init_flat_param_attributes(self) -> None:
             self._orig_param_dtype = flat_param.dtype
         cpu_device = torch.device("cpu")
         if self._offload_params:
-            p_assert(
+            _p_assert(
                 flat_param.device == cpu_device,
                 f"Expects the `FlatParameter` to be on CPU when parameter CPU "
                 f"offloading is enabled, not {flat_param.device}",
@@ -957,7 +952,7 @@ def _get_padded_unsharded_flat_param(self) -> torch.Tensor:
             # tensor as the all-gather destination to preserve the invariant
             # that  `_full_param_padded` is in the low precision
             unsharded_flat_param = flat_param._full_prec_full_param_padded  # type: ignore[attr-defined]
-            p_assert(
+            _p_assert(
                 unsharded_flat_param.dtype != self._fwd_bwd_param_dtype,
                 f"Expects full precision but got {self._fwd_bwd_param_dtype}",
             )
@@ -974,13 +969,13 @@ def _all_gather_flat_param(
         ``padded_unsharded_flat_param``, and switches to using the all-gathered
         tensor.
         """
-        p_assert(
+        _p_assert(
             hasattr(self, "process_group") and hasattr(self, "world_size"),
             "Expects a process group and world size to have been set via `shard()`",
         )
         sharded_flat_param = self.flat_param.data
         expected_numel = sharded_flat_param.numel() * self.world_size
-        p_assert(
+        _p_assert(
             padded_unsharded_flat_param.numel() == expected_numel,
             f"Expects {expected_numel} numel but got {padded_unsharded_flat_param.numel()}",
         )
@@ -1111,7 +1106,7 @@ def prepare_gradient_for_backward(self):
         clearing any existing sharded gradient in ``.grad`` to enable computing
         a new unsharded gradient.
         """
-        p_assert(
+        _p_assert(
             self._training_state
             in (HandleTrainingState.BACKWARD_PRE, HandleTrainingState.IDLE),
             "Expects to be in `BACKWARD_PRE` or `IDLE` (if prefetching)",
@@ -1123,7 +1118,7 @@ def prepare_gradient_for_backward(self):
         ):
             self._check_on_compute_device(self.flat_param)
             grad_offloaded = flat_param.grad.device != self.device
-            p_assert(
+            _p_assert(
                 not grad_offloaded or self._offload_params,
                 f"Expects the sharded gradient to be on {self.device} "
                 f"but got {flat_param.grad.device}",
@@ -1142,7 +1137,7 @@ def prepare_gradient_for_backward(self):
                     flat_param._saved_grad_shard = flat_param.grad.data  # type: ignore[attr-defined]
                     sharded_grad = flat_param._saved_grad_shard  # type: ignore[attr-defined]
                 else:
-                    p_assert(
+                    _p_assert(
                         hasattr(flat_param, "_cpu_grad"),
                         "`_cpu_grad` should be defined if the gradient is on CPU",
                     )
@@ -1162,7 +1157,7 @@ def prepare_gradient_for_backward(self):
                     sharded_grad.data = sharded_grad.to(local_shard_dtype)
             else:
                 padded_unsharded_size = flat_param._padded_unsharded_size  # type: ignore[attr-defined]
-                p_assert(
+                _p_assert(
                     flat_param.grad.size() == padded_unsharded_size,
                     "Expects `.grad` to be the unsharded gradient in "
                     f"`no_sync()` with size {padded_unsharded_size} "
@@ -1203,7 +1198,7 @@ def cast_grad_to_param_dtype_if_needed(flat_param):
                 flat_param.grad = flat_param._saved_grad_shard  # type: ignore[attr-defined]
                 cast_grad_to_param_dtype_if_needed(flat_param)
         else:
-            p_assert(
+            _p_assert(
                 not self.uses_sharded_strategy
                 or not flat_param._post_backward_called,  # type: ignore[attr-defined]
                 "All sharded parameters that received a gradient in the "
@@ -1229,7 +1224,7 @@ def to_cpu(self):
         Postcondition: Same as the precondition.
         """
         self._check_sharded_strategy()
-        p_assert(
+        _p_assert(
             self.flat_param.size() == self.flat_param._unpadded_unsharded_size,
             f"Expects size {self.flat_param._unpadded_unsharded_size} but got {self.flat_param.size()}",
         )
@@ -1242,7 +1237,7 @@ def to_cpu(self):
         padded_storage_ptr = (
             self._get_padded_unsharded_flat_param()._typed_storage()._data_ptr()
         )
-        p_assert(
+        _p_assert(
             unpadded_storage_ptr == padded_storage_ptr,
             "Expects the unpadded parameter to be a view into the padded parameter",
         )
@@ -1251,7 +1246,7 @@ def to_cpu(self):
         try:
             yield
         finally:
-            p_assert(
+            _p_assert(
                 self.flat_param.size() == self.flat_param._unpadded_unsharded_size,
                 f"Expects size {self.flat_param._unpadded_unsharded_size} but got {self.flat_param.size()}",
             )
@@ -1268,9 +1263,13 @@ def reshard(self, free_unsharded_flat_param: bool):
         parameter if ``free_unsharded_flat_param`` and switching to using the
         sharded flattened parameter.
         """
+        # Switch to the sharded `FlatParameter` before freeing to prevent
+        # "use-after-free"-type bugs with external profiling tools, where for
+        # `use_orig_params=True`, the `param` does not point to valid memory
+        # when setting `param.data = ...` in `_use_sharded_views()`.
+        self._use_sharded_flat_param()
         if free_unsharded_flat_param:
             self._free_unsharded_flat_param()
-        self._use_sharded_flat_param()
 
     def post_reshard(self):
         """
@@ -1310,7 +1309,7 @@ def _use_sharded_flat_param(self) -> None:
         flat_param = self.flat_param
         if self._offload_params:
             device = flat_param._local_shard.device  # type: ignore[attr-defined]
-            p_assert(
+            _p_assert(
                 device == torch.device("cpu"),
                 f"Expects the local shard to be on CPU but got {device}",
             )
@@ -1353,7 +1352,7 @@ def _get_unflat_views(
         """
         if tensor is None:
             tensor = flat_param
-        p_assert(
+        _p_assert(
             tensor.numel() == flat_param._unpadded_unsharded_size.numel(),
             f"Expects {flat_param._unpadded_unsharded_size.numel()} numel but got "
             f"{tensor.numel()} numel",
@@ -1412,7 +1411,7 @@ def _use_unsharded_views(self, as_params: bool) -> None:
                         # hook fires (e.g. for reentrant AC)
                         assert self.flat_param._tensors is not None  # mypy
                         tensor = self.flat_param._tensors[i]
-                        p_assert(
+                        _p_assert(
                             tensor is not None,
                             "Expects `Tensor` to have been saved in forward",
                         )
@@ -1435,14 +1434,14 @@ def _use_unsharded_views(self, as_params: bool) -> None:
         ) in enumerate(self.flat_param._shared_param_infos):
             if hasattr(module, param_name):
                 delattr(module, param_name)
-            p_assert(
+            _p_assert(
                 hasattr(prim_module, prim_param_name),
                 f"Module {prim_module_name} is missing parameter {prim_param_name}",
             )
             prim_param: Union[Tensor, nn.Parameter] = getattr(
                 prim_module, prim_param_name
             )
-            p_assert(
+            _p_assert(
                 not as_params or isinstance(prim_param, nn.Parameter),
                 f"as_params={as_params} type(prim_param)={type(prim_param)}",
             )
@@ -1481,14 +1480,12 @@ def _use_unsharded_grad_views(self) -> None:
         for i, (view, (param_name, module, _)) in enumerate(
             zip(views, self.flat_param._param_infos)
         ):
-            p_assert(
+            _p_assert(
                 hasattr(module, param_name),
                 f"{self.flat_param._fqns[i]} is missing",
             )
             param = getattr(module, param_name)
-            if param.shape != view.shape or (
-                param.dtype != view.dtype and not self.uses_sharded_strategy
-            ):
+            if param.shape != view.shape or param.dtype != view.dtype:
                 # NOTE: This is a hack using `.data` to side step the
                 # check that parameter/gradient sizes and dtypes match. Here,
                 # `param` can have the sharded size, and `grad` can have the
@@ -1509,7 +1506,7 @@ def _use_unsharded_grad_views(self) -> None:
             prim_module,
             _,
         ) in enumerate(self.flat_param._shared_param_infos):
-            p_assert(
+            _p_assert(
                 hasattr(module, param_name),
                 f"{module_name + '.' + param_name if module_name else param_name} is missing",
             )  # did not save FQN info in `_shared_param_infos`
@@ -1791,7 +1788,7 @@ def _writeback_tensor(
             RuntimeError: If the ``src_tensor`` does not have the expected
             shape.
         """
-        p_assert(
+        _p_assert(
             len(expected_shape) == 1,
             f"Expects a 1D expected shape but got {expected_shape}",
         )
@@ -1859,8 +1856,8 @@ def flat_param_to(self, *args, **kwargs):
     def _get_modules(self) -> Set[nn.Module]:
         """Returns a :class:`set` of the modules whose parameters are included
         in this handle's flattened parameter."""
-        return set(pi.module for pi in self.flat_param._param_infos).union(
-            set(spi.module for spi in self.flat_param._shared_param_infos)
+        return {pi.module for pi in self.flat_param._param_infos}.union(
+            {spi.module for spi in self.flat_param._shared_param_infos}
         )
 
     def is_sharded(self, tensor: Tensor) -> bool:
@@ -1933,7 +1930,7 @@ def sharded_grad(self) -> Optional[Tensor]:
         else:
             # If in the forward, then there may be an accumulated gradient,
             # which will be in `.grad`
-            p_assert(
+            _p_assert(
                 flat_param.grad is None
                 or not self.uses_sharded_strategy
                 or self._training_state == HandleTrainingState.FORWARD,
@@ -1952,7 +1949,7 @@ def _reset_is_grad_none(self) -> None:
         """
         if not self._use_orig_params:
             return
-        p_assert(
+        _p_assert(
             self._training_state == HandleTrainingState.BACKWARD_POST,
             "Expects to only be called in the post-backward after gradient computation",
         )
@@ -1969,16 +1966,16 @@ def _reset_is_grad_none(self) -> None:
     # CHECKS & INVARIANTS #
     #######################
     def _check_sharded_strategy(self):
-        p_assert(self.uses_sharded_strategy, "Expects sharded strategy")
+        _p_assert(self.uses_sharded_strategy, "Expects sharded strategy")
 
     def _check_on_compute_device(self, tensor: Tensor):
-        p_assert(
+        _p_assert(
             tensor.device == self.device,
             f"Expects tensor to be on the compute device {self.device}",
         )
 
     def _check_on_cpu(self, tensor: Tensor):
-        p_assert(
+        _p_assert(
             tensor.device == torch.device("cpu"),
             f"Expects tensor to be on CPU but got {tensor.device}",
         )
@@ -1986,7 +1983,7 @@ def _check_on_cpu(self, tensor: Tensor):
     @staticmethod
     def _check_storage_freed(tensor: Tensor):
         storage_size: int = tensor._typed_storage()._size()
-        p_assert(
+        _p_assert(
             storage_size == 0,
             f"Expects storage to be freed but got storage with size {storage_size}",
         )
@@ -1994,37 +1991,37 @@ def _check_storage_freed(tensor: Tensor):
     @staticmethod
     def _check_storage_allocated(tensor: Tensor):
         storage_size: int = tensor._typed_storage()._size()
-        p_assert(storage_size > 0, "Expects storage to be allocated")
+        _p_assert(storage_size > 0, "Expects storage to be allocated")
 
     def _check_low_precision_shard(self):
-        p_assert(
+        _p_assert(
             self._uses_param_mixed_precision,
             "Not using low precision for parameters",
         )
-        p_assert(
+        _p_assert(
             getattr(self.flat_param, "_mp_shard", None) is not None,
             "Expects `_mp_shard` to exist",
         )
         device = self.flat_param._mp_shard.device  # type: ignore[attr-defined]
-        p_assert(
+        _p_assert(
             device == self.device,
             f"Expects the low precision shard to be on {self.device} but got {device}",
         )
 
     def _check_unsharded(self, tensor: Tensor):
         msg_prefix = "Expects tensor to be unsharded "
-        p_assert(tensor is not None, msg_prefix + "but got `None`")
+        _p_assert(tensor is not None, msg_prefix + "but got `None`")
         unsharded_size = self.flat_param._unpadded_unsharded_size
-        p_assert(
+        _p_assert(
             tensor.size() == unsharded_size,
             msg_prefix + f"with size {unsharded_size} but got {tensor.size()}",
         )
 
     def _check_sharded(self, tensor: Tensor):
         msg_prefix = "Expects tensor to be sharded "
-        p_assert(tensor is not None, msg_prefix + "but got `None`")
+        _p_assert(tensor is not None, msg_prefix + "but got `None`")
         sharded_size = self.flat_param._sharded_size  # type: ignore[attr-defined]
-        p_assert(
+        _p_assert(
             tensor.size() == sharded_size,
             msg_prefix + f"with size {sharded_size} but got {tensor.size()}",
         )
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 137c74d59cda..40f731c3e74b 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -64,14 +64,20 @@
 from torch.distributed.fsdp.api import (
     BackwardPrefetch,
     CPUOffload,
+    FullOptimStateDictConfig,
     FullStateDictConfig,
+    LocalOptimStateDictConfig,
     LocalStateDictConfig,
     MixedPrecision,
+    OptimStateDictConfig,
+    ShardedOptimStateDictConfig,
     ShardedStateDictConfig,
     ShardingStrategy,
     StateDictConfig,
+    StateDictSettings,
     StateDictType,
 )
+from torch.distributed.utils import _p_assert
 
 from ._optim_utils import (
     _broadcast_pos_dim_tensor_states,
@@ -91,8 +97,8 @@
     _register_flat_param,
     _register_orig_params,
     _unshard_params,
+    _unshard_params_recurse,
 )
-from ._utils import p_assert
 from .flat_param import FlatParameter
 from .wrap import _FSDPPolicy
 
@@ -493,11 +499,24 @@ def apply(self, fn: Callable[[nn.Module], None]) -> "FullyShardedDataParallel":
         """
         uninitialized = self._is_root is None
         self._assert_state(TrainingState.IDLE)
-        with self._summon_full_params(recurse=False, writeback=True):
+        # Use `_unshard_params_recurse()` with `recurse=False` instead of
+        # `_unshard_fsdp_state_params()` directly to perform lazy
+        # initialization, which is needed to initialize `FlatParameter`
+        # parameter attributes as required by the unshard logic
+        with _unshard_params_recurse(
+            self,
+            self,
+            recurse=False,
+            writeback=True,
+            rank0_only=False,
+            offload_to_cpu=False,
+            with_grads=False,
+        ):
             ret = super().apply(fn)
 
-        # Reset lazy init that might be called by _summon_full_params, since
-        # it could have set is_root incorrectly for non-root FSDP instances.
+        # Reset lazy init called in `_unshard_params_recurse()` since `apply()`
+        # may have been called on FSDP instance that is not truly a root, in
+        # which case it will be incorrectly marked as one.
         if uninitialized and self._is_root:
             for module in traversal_utils._get_fsdp_states(self):
                 module._reset_lazy_init()
@@ -534,7 +553,8 @@ def set_state_dict_type(
         module: nn.Module,
         state_dict_type: StateDictType,
         state_dict_config: Optional[StateDictConfig] = None,
-    ) -> Tuple[StateDictType, StateDictConfig]:
+        optim_state_dict_config: Optional[OptimStateDictConfig] = None,
+    ) -> StateDictSettings:
         """
         Set the ``state_dict_type`` and the corresponding (optional)
         configurations of all the descendant FSDP modules of the target module.
@@ -558,53 +578,104 @@ def set_state_dict_type(
             >>> FSDP.set_state_dict_type(
             >>>     model,
             >>>     StateDictType.SHARDED_STATE_DICT,
-            >>>     ShardedStateDictConfig(offload_to_cpu=True),
+            >>>     state_dict_config = ShardedStateDictConfig(offload_to_cpu=True),
+            >>>     optim_state_dict_config = OptimStateDictConfig(offload_to_cpu=True),
             >>> )
-            >>> checkpoint = model.state_dict()
+            >>> param_state_dict = model.state_dict()
+            >>> optim_state_dict = FSDP.optim_state_dict(model, optim)
 
         Args:
             module (torch.nn.Module): Root module.
             state_dict_type (StateDictType): the desired ``state_dict_type`` to set.
             state_dict_config (Optional[StateDictConfig]): the configuration for the
                 target ``state_dict_type``.
+        Returns:
+            A StateDictSettings that include the previous state_dict type and
+            configuration for the module.
         """
         _state_dict_type_to_config = {
             StateDictType.FULL_STATE_DICT: FullStateDictConfig,
             StateDictType.LOCAL_STATE_DICT: LocalStateDictConfig,
             StateDictType.SHARDED_STATE_DICT: ShardedStateDictConfig,
         }
+        _optim_state_dict_type_to_config = {
+            StateDictType.FULL_STATE_DICT: FullOptimStateDictConfig,
+            StateDictType.LOCAL_STATE_DICT: LocalOptimStateDictConfig,
+            StateDictType.SHARDED_STATE_DICT: ShardedOptimStateDictConfig,
+        }
 
-        prev_state_dict_type = None
-        prev_state_dict_config = None
         # Use the default config if a state_dict config is not set.
+        state_dict_config_type = _state_dict_type_to_config[state_dict_type]
+        optim_state_dict_config_type = _optim_state_dict_type_to_config[state_dict_type]
         if state_dict_config is None:
-            state_dict_config = _state_dict_type_to_config[state_dict_type]()
+            state_dict_config = state_dict_config_type()
+        if optim_state_dict_config is None:
+            optim_state_dict_config = optim_state_dict_config_type()
+        if state_dict_config_type != type(state_dict_config):
+            raise RuntimeError(
+                f"Expected state_dict_config of type {state_dict_config_type} "
+                f"but got {type(state_dict_config)}"
+            )
+        if optim_state_dict_config_type != type(optim_state_dict_config):
+            raise RuntimeError(
+                f"Expected optim_state_dict_config of type {optim_state_dict_config_type} "
+                f"but got {type(optim_state_dict_config)}"
+            )
+
+        # Set the state_dict type and configurations.
+        prev_state_dict_type = None
+        prev_state_dict_config = None
+        prev_optim_state_dict_config = None
         for submodule in traversal_utils._get_fsdp_states(module):
             if prev_state_dict_type is None:
                 prev_state_dict_type = submodule._state_dict_type
+            else:
+                assert (
+                    prev_state_dict_type == submodule._state_dict_type
+                ), "All FSDP modules should have the same state_dict_type."
             if prev_state_dict_config is None:
                 prev_state_dict_config = submodule._state_dict_config
-            if prev_state_dict_type != submodule._state_dict_type:
-                raise RuntimeError("All FSDP module should the same state_dict_type.")
-            if not isinstance(
-                submodule._state_dict_config, type(prev_state_dict_config)
-            ):
-                raise RuntimeError(
-                    "All FSDP modules should have the same type of state_dict_config."
-                )
+            else:
+                assert isinstance(
+                    submodule._state_dict_config, type(prev_state_dict_config)
+                ), "All FSDP modules must have the same type of state_dict_config."
+            if prev_optim_state_dict_config is None:
+                prev_optim_state_dict_config = submodule._optim_state_dict_config
+            else:
+                assert isinstance(
+                    submodule._optim_state_dict_config,
+                    type(prev_optim_state_dict_config),
+                ), "All FSDP modules must have the same type of optim_state_dict_config."
 
-            expected_state_dict_config_type = _state_dict_type_to_config[
-                state_dict_type
-            ]
-            if expected_state_dict_config_type != type(state_dict_config):
-                raise RuntimeError(
-                    f"Expected state_dict_config of type {expected_state_dict_config_type} "
-                    f"but got {type(state_dict_config)}"
-                )
             submodule._state_dict_type = state_dict_type
             submodule._state_dict_config = state_dict_config
+            submodule._optimstate_dict_config = optim_state_dict_config
+
+        return StateDictSettings(
+            prev_state_dict_type, prev_state_dict_config, prev_optim_state_dict_config
+        )
 
-        return prev_state_dict_type, prev_state_dict_config
+    @staticmethod
+    def get_state_dict_type(module: nn.Module) -> StateDictSettings:
+        state_dict_settings: Optional[StateDictSettings] = None
+        for submodule in FullyShardedDataParallel.fsdp_modules(module):
+            if state_dict_settings is None:
+                state_dict_settings = StateDictSettings(
+                    state_dict_type=submodule._state_dict_type,
+                    state_dict_config=submodule._state_dict_config,
+                    optim_state_dict_config=submodule._optim_state_dict_config,
+                )
+            else:
+                submodule_settings = StateDictSettings(
+                    submodule._state_dict_type,
+                    submodule._state_dict_config,
+                    submodule._optim_state_dict_config,
+                )
+                assert state_dict_settings == submodule_settings, (
+                    "All FSDP modules must have the same state dict settings."
+                    f"Got {submodule_settings} and {state_dict_settings}."
+                )
+        return state_dict_settings
 
     @staticmethod
     @contextlib.contextmanager
@@ -612,6 +683,7 @@ def state_dict_type(
         module: nn.Module,
         state_dict_type: StateDictType,
         state_dict_config: Optional[StateDictConfig] = None,
+        optim_state_dict_config: Optional[OptimStateDictConfig] = None,
     ) -> Generator:
         """
         A context manager to set the ``state_dict_type`` of all the descendant
@@ -635,26 +707,22 @@ def state_dict_type(
             state_dict_config (Optional[StateDictConfig]): the configuration for the
                 target ``state_dict_type``.
         """
-        prev_state_dict_type = None
-        prev_state_dict_config = None
         try:
-            (
-                prev_state_dict_type,
-                prev_state_dict_config,
-            ) = FullyShardedDataParallel.set_state_dict_type(
-                module, state_dict_type, state_dict_config
+            prev_state_dict_settings = FullyShardedDataParallel.set_state_dict_type(
+                module,
+                state_dict_type,
+                state_dict_config,
+                optim_state_dict_config,
             )
             yield
         except Exception as e:
             raise e
-        else:
-            assert prev_state_dict_type is not None
-            assert prev_state_dict_config is not None
-        finally:
-            if prev_state_dict_type is not None and prev_state_dict_config is not None:
-                FullyShardedDataParallel.set_state_dict_type(
-                    module, prev_state_dict_type, prev_state_dict_config
-                )
+        FullyShardedDataParallel.set_state_dict_type(
+            module,
+            prev_state_dict_settings.state_dict_type,
+            prev_state_dict_settings.state_dict_config,
+            prev_state_dict_settings.optim_state_dict_config,
+        )
 
     def forward(self, *args: Any, **kwargs: Any) -> Any:
         """
@@ -672,7 +740,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
                 self, self._handles, unshard_fn, self._fsdp_wrapped_module, args, kwargs
             )
             for handle in self._handles:
-                p_assert(
+                _p_assert(
                     handle.flat_param.device == self.compute_device,
                     "Expected `FlatParameter` to be on the compute device "
                     f"{self.compute_device} but got {handle.flat_param.device}",
@@ -683,7 +751,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
     @staticmethod
     @contextlib.contextmanager
     def summon_full_params(
-        module,
+        module: nn.Module,
         recurse: bool = True,
         writeback: bool = True,
         rank0_only: bool = False,
@@ -749,87 +817,10 @@ def summon_full_params(
                 constructor and ``offload_to_cpu=False`` to this method.
                 (Default: ``False``)
         """
-        # Note that we specify root_only as FSDP roots will handle summoning
-        # child FSDP instances based on recurse argument.
-        root_fsdp_modules = _get_fsdp_root_states(module)
-        # Summon all params for all FSDP instances
-        with contextlib.ExitStack() as stack:
-            for module in root_fsdp_modules:
-                stack.enter_context(
-                    module._summon_full_params(
-                        recurse=recurse,
-                        writeback=writeback,
-                        rank0_only=rank0_only,
-                        offload_to_cpu=offload_to_cpu,
-                        with_grads=with_grads,
-                    )
-                )
-            # Yield to the caller, with full params in all FSDP instances.
-            yield
-        # Exiting from the ExitStack will reshard all params.
-        return
-
-    @contextlib.contextmanager
-    def _summon_full_params(
-        self,
-        recurse: bool = True,
-        writeback: bool = True,
-        rank0_only: bool = False,
-        offload_to_cpu: bool = False,
-        with_grads: bool = False,
-    ):
-        if with_grads and (offload_to_cpu or not self._use_orig_params):
-            raise NotImplementedError(
-                f"with_grads={with_grads} "
-                f"use_orig_params={self._use_orig_params} "
-                f"offload_to_cpu={offload_to_cpu} "
-                f"is not supported yet"
-            )
-        if writeback and rank0_only:
-            raise ValueError(
-                "writeback=True and rank0_only=True is not supported, as model "
-                "parameter shapes will be different across ranks, and writing "
-                "to them can lead to inconsistencies across ranks when the "
-                "context is exited."
-            )
-        if offload_to_cpu and not rank0_only:
-            warnings.warn(
-                "offload_to_cpu and rank0_only=False will result in "
-                "full parameters being redundantly copied to CPU memory for "
-                "GPUs that reside on the same machine, which may incur the risk of "
-                "CPU OOM. It is recommended to use ``offload_to_cpu`` with "
-                "rank0_only=True."
-            )
-
-        if recurse:
-            with contextlib.ExitStack() as stack:
-                for module in traversal_utils._get_fsdp_states(self):
-                    stack.enter_context(
-                        module._summon_full_params(
-                            recurse=False,
-                            writeback=writeback,
-                            rank0_only=rank0_only,
-                            offload_to_cpu=offload_to_cpu,
-                            with_grads=with_grads,
-                        )
-                    )
-                yield
-            return
-
-        _lazy_init(self, self)
         with _unshard_params(
-            module=self,
-            state=self,
-            writeback=writeback,
-            rank0_only=rank0_only,
-            offload_to_cpu=offload_to_cpu,
-            with_grads=with_grads,
+            module, recurse, writeback, rank0_only, offload_to_cpu, with_grads
         ):
-            try:
-                self.training_state = TrainingState.SUMMON_FULL_PARAMS
-                yield
-            finally:
-                self.training_state = TrainingState.IDLE
+            yield
 
     @contextlib.contextmanager
     def _deregister_orig_params_ctx(self):
@@ -839,7 +830,7 @@ def _deregister_orig_params_ctx(self):
         this refreshes the sharded views before exiting. This method shouuld
         only be called when using the original parameters.
         """
-        p_assert(
+        _p_assert(
             self._use_orig_params,
             "`_deregister_orig_params_ctx()` should only be called when "
             "`_use_orig_params=True`",
@@ -882,10 +873,7 @@ def named_buffers(
         remove all occurrences of the FSDP-specific flattened buffer prefix
         when inside the :meth:`summon_full_params` context manager.
         """
-        should_clean_name = (
-            self.training_state == TrainingState.SUMMON_FULL_PARAMS
-            or self._use_orig_params
-        )
+        should_clean_name = self.training_state == TrainingState.SUMMON_FULL_PARAMS
         for buffer_name, buffer in super().named_buffers(*args, **kwargs):
             if should_clean_name:
                 # Remove any instances of the FSDP-specific prefix; there can
@@ -903,10 +891,7 @@ def named_parameters(
         remove all occurrences of the FSDP-specific flattened parameter prefix
         when inside the :meth:`summon_full_params` context manager.
         """
-        should_clean_name = (
-            self.training_state == TrainingState.SUMMON_FULL_PARAMS
-            or self._use_orig_params
-        )
+        should_clean_name = self.training_state == TrainingState.SUMMON_FULL_PARAMS
         for param_name, param in super().named_parameters(*args, **kwargs):
             if should_clean_name:
                 # Remove any instances of the FSDP-specific prefix; there can
@@ -1018,8 +1003,7 @@ def clip_grad_norm_(
         # If every FSDP instance uses `NO_SHARD`, then we can directly use
         # the normal `nn.utils` one targeting local gradients
         all_no_shard = all(
-            not handle.uses_sharded_strategy
-            for handle in traversal_utils._get_fsdp_handles(self)
+            not handle.uses_sharded_strategy for handle in self._all_handles
         )
         if all_no_shard:
             return torch.nn.utils.clip_grad_norm_(
@@ -1032,7 +1016,7 @@ def clip_grad_norm_(
         sharded_params = set()
         nonsharded_params = set()  # `NO_SHARD` or not FSDP-managed
         grads: List[torch.Tensor] = []
-        for handle in traversal_utils._get_fsdp_handles(self):
+        for handle in self._all_handles:
             target_set = (
                 sharded_params if handle.uses_sharded_strategy else nonsharded_params
             )
@@ -1085,6 +1069,16 @@ def clip_grad_norm_(
             grad.detach().mul_(clip_coef_clamped.to(grad.device, grad.dtype))
         # Use the "largest" dtype by type promotion semantics to use the same
         # dtype as if we did not force local norm computation to be in FP32
+        if len(grads) == 0:
+            # If this rank has no gradients, then we must default to FP32
+            # unless we use additional communication, which we prefer to avoid
+            # since `clip_grad_norm_()` is called in the training loop
+            warnings.warn(
+                f"Called FSDP.clip_grad_norm_() on rank {self.rank} with no "
+                "gradients -- returning the total norm in the default dtype "
+                f"{total_norm.dtype}"
+            )  # warn since this is generally unexpected
+            return total_norm
         total_norm_dtype = functools.reduce(
             lambda dtype1, dtype2: torch.promote_types(dtype1, dtype2),
             [grad.dtype for grad in grads],
@@ -1111,18 +1105,12 @@ def _is_using_optim_input(optim_input, optim) -> bool:
         return False
 
     @staticmethod
-    def _raise_on_use_orig_params_optim_checkpoint(
-        model: nn.Module, full_optim: bool, rank0_only: bool
-    ):
-        if full_optim and not rank0_only:
-            return
-        if any(
-            fsdp_module._use_orig_params
-            for fsdp_module in traversal_utils._get_fsdp_states(model)
-        ):
-            raise NotImplementedError(
-                "Optimizer state checkpointing is not supported yet for `use_orig_params=True`"
-            )
+    def _warn_legacy_optim_state_dict(curr: str, new: str):
+        warnings.warn(
+            f"``FullyShardedDataParallel.{curr}``is being deprecated and is "
+            f"replaced by ``FullyShardedDataParallel.{new}``. "
+            f"``FullyShardedDataParallel.{curr}`` may be removed after PyTorch 2.2."
+        )
 
     @staticmethod
     def _optim_state_dict_impl(
@@ -1136,8 +1124,8 @@ def _optim_state_dict_impl(
             ]
         ] = None,
         rank0_only: bool = True,
-        group: Optional[dist.ProcessGroup] = None,
         full_state_dict: bool = True,
+        group: Optional[dist.ProcessGroup] = None,
     ) -> Dict[str, Any]:
         """
         The internal API that is used by all the optim_state_dict implementations.
@@ -1145,9 +1133,6 @@ def _optim_state_dict_impl(
         FSDP internal information and internal sharding from the optim_state_dict.
         """
         if full_state_dict:
-            FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(
-                model, True, rank0_only
-            )
             FullyShardedDataParallel._warn_optim_input(optim_input)
             using_optim_input = FullyShardedDataParallel._is_using_optim_input(
                 optim_input,
@@ -1156,9 +1141,6 @@ def _optim_state_dict_impl(
         else:
             using_optim_input = False
             assert optim_input is None and not rank0_only
-            FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(
-                model, False, False
-            )
 
         use_orig_params = FullyShardedDataParallel.fsdp_modules(model)[
             0
@@ -1192,7 +1174,9 @@ def _optim_state_dict_to_load_impl(
         ] = None,
         optim: Optional[torch.optim.Optimizer] = None,
         full_state_dict: bool = True,
+        rank0_only: bool = False,
         is_named_optimizer: bool = False,
+        group: Optional[dist.ProcessGroup] = None,
     ) -> Dict[str, Any]:
         """
         The internal API that is used by all the load optim_state_dict
@@ -1201,9 +1185,6 @@ def _optim_state_dict_to_load_impl(
         Given model, optim, the saved optim_state_dict, this API adds the
         FSDP internal information and internal sharding to the optim_state_dict.
         """
-        FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(
-            model, full_state_dict, False
-        )
         FullyShardedDataParallel._warn_optim_input(optim_input)
         using_optim_input = FullyShardedDataParallel._is_using_optim_input(
             optim_input,
@@ -1218,20 +1199,70 @@ def _optim_state_dict_to_load_impl(
             for m in FullyShardedDataParallel.fsdp_modules(model)
         ), "Not all FSDP modules have the same _use_orig_params value"
 
-        sharded_osd = _flatten_optim_state_dict(
-            optim_state_dict,
-            model,
-            True,
-            use_orig_params,
-        )
-        return _rekey_sharded_optim_state_dict(
-            sharded_osd,
-            model,
-            optim,
-            optim_input,
-            using_optim_input,
-            is_named_optimizer,
-        )
+        if rank0_only:
+            rank = dist.get_rank(group)
+            world_size = dist.get_world_size(group)
+            # Flatten the optimizer state dict and construct a copy with the
+            # positive-dimension tensors' shapes in place of the tensors themselves
+            # since those tensors will be broadcast separately to avoid copying
+            if rank == 0:
+                flat_osd = _flatten_optim_state_dict(
+                    optim_state_dict,
+                    model=model,
+                    shard_state=False,
+                    use_orig_params=use_orig_params,
+                    optim=(optim if is_named_optimizer else None),
+                )
+                processed_osd = _process_pos_dim_tensor_state(flat_osd, world_size)
+                # Broadcast the optim state dict without positive-dimension tensor
+                # state and the FSDP parameter IDs from rank 0 to all ranks
+            processed_osd = _broadcast_processed_optim_state_dict(
+                processed_osd if rank == 0 else None,
+                rank,
+                group,
+            )
+            # Broadcast positive-dimension tensor state (both sharded tensors for
+            # FSDP parameters and unsharded tensors for non-FSDP parameters)
+            broadcast_device = (
+                torch.device("cuda")
+                if torch.cuda.is_available()
+                else torch.device("cpu")
+            )
+            sharded_osd = _broadcast_pos_dim_tensor_states(
+                processed_osd,
+                flat_osd if rank == 0 else None,
+                rank,
+                world_size,
+                group,
+                broadcast_device,
+            )
+            # Rekey the optimizer state dict to use parameter IDs according to this
+            # rank's `optim`
+            ret_state_dict = _rekey_sharded_optim_state_dict(
+                sharded_osd,
+                model=model,
+                optim=optim,
+                optim_input=optim_input,
+                using_optim_input=using_optim_input,
+                is_named_optimizer=is_named_optimizer,
+            )
+        else:
+            sharded_osd = _flatten_optim_state_dict(
+                optim_state_dict,
+                model=model,
+                shard_state=True,
+                use_orig_params=use_orig_params,
+                optim=(optim if is_named_optimizer else None),
+            )
+            ret_state_dict = _rekey_sharded_optim_state_dict(
+                sharded_osd,
+                model=model,
+                optim=optim,
+                optim_input=optim_input,
+                using_optim_input=using_optim_input,
+                is_named_optimizer=is_named_optimizer,
+            )
+        return ret_state_dict
 
     @staticmethod
     def full_optim_state_dict(
@@ -1292,6 +1323,9 @@ def full_optim_state_dict(
             :meth:`torch.optim.Optimizer.state_dict`. If ``rank0_only=True``,
             then nonzero ranks return an empty :class:`dict`.
         """
+        FullyShardedDataParallel._warn_legacy_optim_state_dict(
+            "full_optim_state_dict", "optim_state_dict"
+        )
         return FullyShardedDataParallel._optim_state_dict_impl(
             model=model,
             optim=optim,
@@ -1319,14 +1353,17 @@ def sharded_optim_state_dict(
         .. warning:: The returned state dict contains ``ShardedTensor`` and
             cannot be directly used by the regular ``optim.load_state_dict``.
         """
+        FullyShardedDataParallel._warn_legacy_optim_state_dict(
+            "sharded_optim_state_dict", "optim_state_dict"
+        )
         return FullyShardedDataParallel._optim_state_dict_impl(
             model=model,
             optim=optim,
             optim_state_dict=optim.state_dict(),
             optim_input=None,
             rank0_only=False,
-            group=group,
             full_state_dict=False,
+            group=group,
         )
 
     @staticmethod
@@ -1395,6 +1432,9 @@ def shard_full_optim_state_dict(
             flattened parameters instead of unflattened parameters and
             restricted to only include this rank's part of the optimizer state.
         """
+        FullyShardedDataParallel._warn_legacy_optim_state_dict(
+            "shard_full_optim_state_dict", "optim_state_dict_to_load"
+        )
         return FullyShardedDataParallel._optim_state_dict_to_load_impl(
             optim_state_dict=full_optim_state_dict,
             model=model,
@@ -1428,6 +1468,9 @@ def flatten_sharded_optim_state_dict(
         Returns:
             Refer to :meth:`shard_full_optim_state_dict`.
         """
+        FullyShardedDataParallel._warn_legacy_optim_state_dict(
+            "flatten_sharded_optim_state_dict", "optim_state_dict_to_load"
+        )
         return FullyShardedDataParallel._optim_state_dict_to_load_impl(
             optim_state_dict=sharded_optim_state_dict,
             model=model,
@@ -1505,66 +1548,19 @@ def scatter_full_optim_state_dict(
             flattened parameters instead of unflattened parameters and
             restricted to only include this rank's part of the optimizer state.
         """
-        FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(
-            model, True, True
-        )
-        FullyShardedDataParallel._warn_optim_input(optim_input)
-        using_optim_input = FullyShardedDataParallel._is_using_optim_input(
-            optim_input,
-            optim,
-        )
-        # Try to use the passed-in process group, the model's process group,
-        # or the default process group (i.e. `None`) in that priority order
-        if group is None and hasattr(model, "process_group"):
-            group = model.process_group
-        rank = dist.get_rank(group)
-        world_size = dist.get_world_size(group)
-        # Check for a valid broadcast device, preferring GPU when available
-        using_nccl = dist.distributed_c10d._check_for_nccl_backend(group)
-        broadcast_device = (
-            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-        )
-        if using_nccl and not torch.cuda.is_available():
-            raise RuntimeError("NCCL requires a GPU for collectives")
-        # Flatten the optimizer state dict and construct a copy with the
-        # positive-dimension tensors' shapes in place of the tensors themselves
-        # since those tensors will be broadcast separately to avoid copying
-        if rank == 0:
-            if full_optim_state_dict is None:
-                raise ValueError("Rank 0 must pass in the full optimizer state dict")
-            flat_osd = _flatten_optim_state_dict(
-                full_optim_state_dict,
-                model=model,
-                shard_state=False,
-            )
-            processed_osd = _process_pos_dim_tensor_state(flat_osd, world_size)
-        # Broadcast the optim state dict without positive-dimension tensor
-        # state and the FSDP parameter IDs from rank 0 to all ranks
-        processed_osd = _broadcast_processed_optim_state_dict(
-            processed_osd if rank == 0 else None,
-            rank,
-            group,
-        )
-        # Broadcast positive-dimension tensor state (both sharded tensors for
-        # FSDP parameters and unsharded tensors for non-FSDP parameters)
-        sharded_osd = _broadcast_pos_dim_tensor_states(
-            processed_osd,
-            flat_osd if rank == 0 else None,
-            rank,
-            world_size,
-            group,
-            broadcast_device,
+        FullyShardedDataParallel._warn_legacy_optim_state_dict(
+            "scatter_full_optim_state_dict", "optim_state_dict_to_load"
         )
-        # Rekey the optimizer state dict to use parameter IDs according to this
-        # rank's `optim`
-        sharded_osd = _rekey_sharded_optim_state_dict(
-            sharded_osd,
-            model,
-            optim,
-            optim_input,
-            using_optim_input,
+        return FullyShardedDataParallel._optim_state_dict_to_load_impl(
+            optim_state_dict=full_optim_state_dict,
+            model=model,
+            optim_input=optim_input,
+            optim=optim,
+            full_state_dict=True,
+            rank0_only=True,
+            is_named_optimizer=False,
+            group=group,
         )
-        return sharded_osd
 
     @staticmethod
     def rekey_optim_state_dict(
@@ -1693,35 +1689,88 @@ def rekey_optim_state_dict(
         return new_osd  # should never reach here
 
     @staticmethod
-    def _optim_state_dict_post_hook(
+    def optim_state_dict(
         model: torch.nn.Module,
         optim: torch.optim.Optimizer,
-        optim_state_dict: Dict[str, Any],
+        group: Optional[dist.ProcessGroup] = None,
     ) -> Dict[str, Any]:
+        """
+        Returns the state dict of ``optim`` for the ``model`` that is (partially)
+        sharded by FSDP. The state may be sharded, consolidated, or consolidated
+        on rank 0 only depending on the ``state_dict_type`` set by
+        :meth:`set_state_dict_type` or :meth:`state_dict_type`.
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+            >>> from torch.distributed.fsdp import StateDictType
+            >>> from torch.distributed.fsdp import FullStateDictConfig
+            >>> from torch.distributed.fsdp import FullOptimStateDictConfig
+            >>> # Save a checkpoint
+            >>> model, optim = ...
+            >>> FSDP.set_state_dict_type(
+            >>>     model,
+            >>>     StateDictType.FULL_STATE_DICT,
+            >>>     FullStateDictConfig(rank0_only=False),
+            >>>     FullOptimStateDictConfig(rank0_only=False),
+            >>> )
+            >>> state_dict = model.state_dict()
+            >>> optim_state_dict = FSDP.optim_state_dict(model, optim)
+            >>> save_a_checkpoint(state_dict, optim_state_dict)
+            >>> # Load a checkpoint
+            >>> model, optim = ...
+            >>> state_dict, optim_state_dict = load_a_checkponit()
+            >>> FSDP.set_state_dict_type(
+            >>>     model,
+            >>>     StateDictType.FULL_STATE_DICT,
+            >>>     FullStateDictConfig(rank0_only=False),
+            >>>     FullOptimStateDictConfig(rank0_only=False),
+            >>> )
+            >>> model.load_state_dict(state_dict)
+            >>> optim_state_dict = FSDP.optim_state_dict_to_load(
+            >>>     optim_state_dict, model, optim
+            >>> )
+            >>> optim.load_state_dict(optim_state_dict)
+
+        Args:
+            model (torch.nn.Module): Root module (which may or may not be a
+                :class:`FullyShardedDataParallel` instance) whose parameters
+                were passed into the optimizer ``optim``.
+            optim (torch.optim.Optimizer): Optimizer for ``model`` 's
+                parameters.
+            group (dist.ProcessGroup): Model's process group across which parameters
+                are sharded or ``None`` if using the default process group. (
+                Default: ``None``)
+
+        Returns:
+            Dict[str, Any]: A :class:`dict` containing the optimizer state for
+            ``model``. The sharding of the optimizer state is based on
+            ``state_dict_type``.
+        """
+        state_dict_settings = FullyShardedDataParallel.get_state_dict_type(model)
         return FullyShardedDataParallel._optim_state_dict_impl(
             model=model,
             optim=optim,
-            optim_state_dict=optim_state_dict,
+            optim_state_dict=optim.state_dict(),
             optim_input=None,
-            rank0_only=False,
-            group=None,
-            full_state_dict=True,
+            rank0_only=getattr(state_dict_settings, "rank0_only", False),
+            full_state_dict=state_dict_settings.state_dict_type
+            == StateDictType.FULL_STATE_DICT,
+            group=group,
         )
 
     @staticmethod
-    def _optim_state_dict(
+    def optim_state_dict_post_hook(
         model: torch.nn.Module,
         optim: torch.optim.Optimizer,
+        optim_state_dict: Dict[str, Any],
         group: Optional[dist.ProcessGroup] = None,
     ) -> Dict[str, Any]:
         """
-        This API is still being developed, hence the `_` prefix. The comment
-        below is also not fully implemented yet. Do not use this API unless
-        you know why this API exists and how this API works.
-
-        Returns the optimizer state. The state will be sharded or consolidated
-        based on ``state_dict_type`` set by :meth:`set_state_dict_type` or
-        :meth:`state_dict_type`.
+        This hook is intended be used by ``torch.distributed.NamedOptimizer``.
+        The functionaility is identical to ``:meth:optim_state_dict`` except
+        for the different arguments.
 
         Args:
             model (torch.nn.Module): Root module (which may or may not be a
@@ -1729,6 +1778,8 @@ def _optim_state_dict(
                 were passed into the optimizer ``optim``.
             optim (torch.optim.Optimizer): Optimizer for ``model`` 's
                 parameters.
+            optim (Dict[str, Any]: the optim_state_dict to be coverted. The value
+               is typically returned by ``NamedOptimizer.state_dict()``.
             group (dist.ProcessGroup): Model's process group across which parameters
                 are sharded or ``None`` if using the default process group. (
                 Default: ``None``)
@@ -1738,59 +1789,123 @@ def _optim_state_dict(
             ``model``. The sharding of the optimizer state is based on
             ``state_dict_type``.
         """
-        return FullyShardedDataParallel.full_optim_state_dict(
-            model, optim, rank0_only=False, group=group
+        state_dict_settings = FullyShardedDataParallel.get_state_dict_type(model)
+        return FullyShardedDataParallel._optim_state_dict_impl(
+            model=model,
+            optim=optim,
+            optim_state_dict=optim_state_dict,
+            optim_input=None,
+            rank0_only=getattr(state_dict_settings, "rank0_only", False),
+            full_state_dict=state_dict_settings.state_dict_type
+            == StateDictType.FULL_STATE_DICT,
+            group=None,
         )
 
     @staticmethod
-    def _load_optim_state_dict_pre_hook(
+    def optim_state_dict_to_load(
+        optim_state_dict: Dict[str, Any],
         model: torch.nn.Module,
         optim: torch.optim.Optimizer,
-        optim_state_dict: Dict[str, Any],
+        is_named_optimizer: bool = False,
+        group: Optional[dist.ProcessGroup] = None,
     ) -> Dict[str, Any]:
+        """
+        Given a saved ``optim_state_dict``, converts it to the optimizer state_dict
+        that can be loaded to ``optim`` which is the optimizer for ``model``.
+        ``model`` is (partially) sharded by FullyShardedDataParallel.
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+            >>> from torch.distributed.fsdp import StateDictType
+            >>> from torch.distributed.fsdp import FullStateDictConfig
+            >>> from torch.distributed.fsdp import FullOptimStateDictConfig
+            >>> # Save a checkpoint
+            >>> model, optim = ...
+            >>> FSDP.set_state_dict_type(
+            >>>     model,
+            >>>     StateDictType.FULL_STATE_DICT,
+            >>>     FullStateDictConfig(rank0_only=False),
+            >>>     FullOptimStateDictConfig(rank0_only=False),
+            >>> )
+            >>> state_dict = model.state_dict()
+            >>> optim_state_dict = FSDP.optim_state_dict(model, optim)
+            >>> save_a_checkpoint(state_dict, optim_state_dict)
+            >>> # Load a checkpoint
+            >>> model, optim = ...
+            >>> state_dict, optim_state_dict = load_a_checkponit()
+            >>> FSDP.set_state_dict_type(
+            >>>     model,
+            >>>     StateDictType.FULL_STATE_DICT,
+            >>>     FullStateDictConfig(rank0_only=False),
+            >>>     FullOptimStateDictConfig(rank0_only=False),
+            >>> )
+            >>> model.load_state_dict(state_dict)
+            >>> optim_state_dict = FSDP.optim_state_dict_to_load(
+            >>>     optim_state_dict, model, optim
+            >>> )
+            >>> optim.load_state_dict(optim_state_dict)
+
+        Args:
+            optim_state_dict (Dict[str, Any]): The optimizer states to be loaded.
+            model (torch.nn.Module): Root module (which may or may not be a
+                :class:`FullyShardedDataParallel` instance) whose parameters
+                were passed into the optimizer ``optim``.
+            optim (torch.optim.Optimizer): Optimizer for ``model`` 's
+                parameters.
+            is_named_optimizer (bool): Is this optimizer a NamedOptimizer or
+                KeyedOptimizer. Only set to True if ``optim`` is TorchRec's
+                KeyedOptimizer or torch.distributed's NamedOptimizer.
+            group (dist.ProcessGroup): Model's process group across which parameters
+                are sharded or ``None`` if using the default process group. (
+                Default: ``None``)
+        """
+        state_dict_settings = FullyShardedDataParallel.get_state_dict_type(model)
         return FullyShardedDataParallel._optim_state_dict_to_load_impl(
             optim_state_dict=optim_state_dict,
             model=model,
             optim_input=None,
             optim=optim,
-            full_state_dict=True,
-            is_named_optimizer=True,
+            full_state_dict=(
+                state_dict_settings.state_dict_type == StateDictType.FULL_STATE_DICT
+            ),
+            rank0_only=getattr(state_dict_settings, "rank0_only", False),
+            is_named_optimizer=is_named_optimizer,
+            group=group,
         )
 
     @staticmethod
-    def _optim_state_dict_to_load(
-        optim_state_dict: Dict[str, Any],
+    def load_optim_state_dict_pre_hook(
         model: torch.nn.Module,
         optim: torch.optim.Optimizer,
+        optim_state_dict: Dict[str, Any],
         group: Optional[dist.ProcessGroup] = None,
     ) -> Dict[str, Any]:
         """
-        This API is still being developed, hence the `_` prefix. The comment
-        below is also not fully implemented yet. Do not use this API unless
-        you know why this API exists and how this API works.
-
-        Load the optimizer state, ``state_dict``, to the optimizer ``optim``.
-        ``state_dict_type`` set by :meth:``set_state_dict_type`` decides how
-        to load the state_dict.
+        This hook is intended be used by ``torch.distributed.NamedOptimizer``.
+        The functionaility is identical to ``:meth:optim_state_dict_to_load``
+        except for the different arguments.
 
         Args:
-            optim_state_dict (Dict[str, Any]): The optimizer states to be loaded.
             model (torch.nn.Module): Root module (which may or may not be a
                 :class:`FullyShardedDataParallel` instance) whose parameters
                 were passed into the optimizer ``optim``.
             optim (torch.optim.Optimizer): Optimizer for ``model`` 's
                 parameters.
+            optim_state_dict (Dict[str, Any]): The optimizer states to be loaded.
             group (dist.ProcessGroup): Model's process group across which parameters
                 are sharded or ``None`` if using the default process group. (
                 Default: ``None``)
         """
+        state_dict_settings = FullyShardedDataParallel.get_state_dict_type(model)
         return FullyShardedDataParallel._optim_state_dict_to_load_impl(
             optim_state_dict=optim_state_dict,
             model=model,
             optim_input=None,
             optim=optim,
-            full_state_dict=True,
-            is_named_optimizer=False,
+            full_state_dict=state_dict_settings.state_dict_type
+            == StateDictType.FULL_STATE_DICT,
+            is_named_optimizer=True,
+            group=group,
         )
 
     def register_comm_hook(self, state: object, hook: callable):
@@ -1862,7 +1977,7 @@ def _get_grad_norm(
     if len(params_with_grad) == 0:
         return torch.tensor(0.0)
     grads = [param.grad for param in params_with_grad]
-    grad_dtypes = set(grad.dtype for grad in grads)
+    grad_dtypes = {grad.dtype for grad in grads}
     if len(grad_dtypes) != 1:
         raise ValueError(
             f"Requires uniform dtype across all gradients but got {grad_dtypes}"
diff --git a/torch/distributed/launch.py b/torch/distributed/launch.py
index 5cf12225fae2..ded0ceed7bfb 100644
--- a/torch/distributed/launch.py
+++ b/torch/distributed/launch.py
@@ -19,7 +19,7 @@
 
 In both cases of single-node distributed training or multi-node distributed
 training, this utility will launch the given number of processes per node
-(``--nproc_per_node``). If used for GPU training, this number needs to be less
+(``--nproc-per-node``). If used for GPU training, this number needs to be less
 or equal to the number of GPUs on the current system (``nproc_per_node``),
 and each process will be operating on a single GPU from *GPU 0 to
 GPU (nproc_per_node - 1)*.
@@ -30,7 +30,7 @@
 
 ::
 
-    python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
+    python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE
                YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
                arguments of your training script)
 
@@ -41,18 +41,18 @@
 
 ::
 
-    python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
-               --nnodes=2 --node_rank=0 --master_addr="192.168.1.1"
-               --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
+    python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE
+               --nnodes=2 --node-rank=0 --master-addr="192.168.1.1"
+               --master-port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
                and all other arguments of your training script)
 
 Node 2:
 
 ::
 
-    python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
-               --nnodes=2 --node_rank=1 --master_addr="192.168.1.1"
-               --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
+    python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE
+               --nnodes=2 --node-rank=1 --master-addr="192.168.1.1"
+               --master-port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
                and all other arguments of your training script)
 
 3. To look up what optional arguments this module offers:
@@ -70,7 +70,7 @@
 use for GPU training.
 
 2. In your training program, you must parse the command-line argument:
-``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by this module.
+``--local-rank=LOCAL_PROCESS_RANK``, which will be provided by this module.
 If your training program uses GPUs, you should ensure that your code only
 runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by:
 
@@ -81,7 +81,7 @@
     >>> # xdoctest: +SKIP
     >>> import argparse
     >>> parser = argparse.ArgumentParser()
-    >>> parser.add_argument("--local_rank", type=int)
+    >>> parser.add_argument("--local-rank", type=int)
     >>> args = parser.parse_args()
 
 Set your device to local rank using either
@@ -128,9 +128,9 @@
 
 5. Another way to pass ``local_rank`` to the subprocesses via environment variable
 ``LOCAL_RANK``. This behavior is enabled when you launch the script with
-``--use_env=True``. You must adjust the subprocess example above to replace
+``--use-env=True``. You must adjust the subprocess example above to replace
 ``args.local_rank`` with ``os.environ['LOCAL_RANK']``; the launcher
-will not pass ``--local_rank`` when you specify this flag.
+will not pass ``--local-rank`` when you specify this flag.
 
 .. warning::
 
@@ -156,13 +156,14 @@
 def parse_args(args):
     parser = get_args_parser()
     parser.add_argument(
+        "--use-env",
         "--use_env",
         default=False,
         action="store_true",
         help="Use environment variable to pass "
         "'local rank'. For legacy reasons, the default value is False. "
         "If set to True, the script will not pass "
-        "--local_rank as argument, and will instead set LOCAL_RANK.",
+        "--local-rank as argument, and will instead set LOCAL_RANK.",
     )
     return parser.parse_args(args)
 
@@ -170,8 +171,8 @@ def parse_args(args):
 def launch(args):
     if args.no_python and not args.use_env:
         raise ValueError(
-            "When using the '--no_python' flag,"
-            " you must also set the '--use_env' flag."
+            "When using the '--no-python' flag,"
+            " you must also set the '--use-env' flag."
         )
     run(args)
 
@@ -180,8 +181,8 @@ def main(args=None):
     warnings.warn(
         "The module torch.distributed.launch is deprecated\n"
         "and will be removed in future. Use torchrun.\n"
-        "Note that --use_env is set by default in torchrun.\n"
-        "If your script expects `--local_rank` argument to be set, please\n"
+        "Note that --use-env is set by default in torchrun.\n"
+        "If your script expects `--local-rank` argument to be set, please\n"
         "change it to read from `os.environ['LOCAL_RANK']` instead. See \n"
         "https://pytorch.org/docs/stable/distributed.html#launch-utility for \n"
         "further instructions\n",
diff --git a/torch/distributed/launcher/api.py b/torch/distributed/launcher/api.py
index b32b208965f7..a699e7f98239 100644
--- a/torch/distributed/launcher/api.py
+++ b/torch/distributed/launcher/api.py
@@ -165,12 +165,12 @@ def _get_addr_and_port(
     endpoint = endpoint.strip()
     if not endpoint:
         raise ValueError(
-            "Endpoint is missing in endpoint. Try to add --master_addr and --master_port"
+            "Endpoint is missing in endpoint. Try to add --master-addr and --master-port"
         )
     master_addr, master_port = parse_rendezvous_endpoint(endpoint, default_port=-1)
     if master_port == -1:
         raise ValueError(
-            f"port is missing in endpoint: {endpoint}. Try to specify --master_port"
+            f"port is missing in endpoint: {endpoint}. Try to specify --master-port"
         )
     return (master_addr, master_port)
 
diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py
index d6230140a63f..6e0216d72f4c 100644
--- a/torch/distributed/nn/api/remote_module.py
+++ b/torch/distributed/nn/api/remote_module.py
@@ -447,7 +447,7 @@ def eval(self: T) -> T:
     def requires_grad_(self: T, requires_grad: bool = True) -> T:  # type: ignore[return]
         _raise_not_supported(self.requires_grad_.__name__)
 
-    def zero_grad(self, set_to_none: bool = False) -> None:
+    def zero_grad(self, set_to_none: bool = True) -> None:
         _raise_not_supported(self.zero_grad.__name__)
 
     def share_memory(self: T) -> T:  # type: ignore[return]
diff --git a/torch/distributed/optim/functional_adadelta.py b/torch/distributed/optim/functional_adadelta.py
index 0aaa8906709f..af421cd9bb0c 100644
--- a/torch/distributed/optim/functional_adadelta.py
+++ b/torch/distributed/optim/functional_adadelta.py
@@ -17,7 +17,7 @@
 # NOTE: This should be only used by distributed optimizer internals
 # and not meant to expose to the user.
 @torch.jit.script
-class _FunctionalAdadelta(object):
+class _FunctionalAdadelta:
     def __init__(
         self,
         params: List[Tensor],
diff --git a/torch/distributed/optim/functional_adagrad.py b/torch/distributed/optim/functional_adagrad.py
index a644aa5a378c..909893efa034 100644
--- a/torch/distributed/optim/functional_adagrad.py
+++ b/torch/distributed/optim/functional_adagrad.py
@@ -17,7 +17,7 @@
 # NOTE: This should be only used by distributed optimizer internals
 # and not meant to expose to the user.
 @torch.jit.script
-class _FunctionalAdagrad(object):
+class _FunctionalAdagrad:
     def __init__(
         self,
         params: List[Tensor],
diff --git a/torch/distributed/optim/functional_adam.py b/torch/distributed/optim/functional_adam.py
index 1b7dc1a76fc4..7ef64f674fb5 100644
--- a/torch/distributed/optim/functional_adam.py
+++ b/torch/distributed/optim/functional_adam.py
@@ -17,7 +17,7 @@
 # NOTE: This should be only used by distributed optimizer internals
 # and not meant to expose to the user.
 @torch.jit.script
-class _FunctionalAdam(object):
+class _FunctionalAdam:
     def __init__(
         self,
         params: List[Tensor],
diff --git a/torch/distributed/optim/functional_adamax.py b/torch/distributed/optim/functional_adamax.py
index e5c236728d08..0b0ac03b6744 100644
--- a/torch/distributed/optim/functional_adamax.py
+++ b/torch/distributed/optim/functional_adamax.py
@@ -17,7 +17,7 @@
 # NOTE: This should be only used by distributed optimizer internals
 # and not meant to expose to the user.
 @torch.jit.script
-class _FunctionalAdamax(object):
+class _FunctionalAdamax:
     def __init__(
         self,
         params: List[Tensor],
diff --git a/torch/distributed/optim/functional_adamw.py b/torch/distributed/optim/functional_adamw.py
index 48d70843d368..d0b65eba3299 100644
--- a/torch/distributed/optim/functional_adamw.py
+++ b/torch/distributed/optim/functional_adamw.py
@@ -17,7 +17,7 @@
 # NOTE: This should be only used by distributed optimizer internals
 # and not meant to expose to the user.
 @torch.jit.script
-class _FunctionalAdamW(object):
+class _FunctionalAdamW:
     def __init__(
         self,
         params: List[Tensor],
@@ -28,6 +28,7 @@ def __init__(
         amsgrad: bool = False,
         maximize: bool = False,
         foreach: bool = False,
+        fused: bool = False,
         _allow_empty_param_list: bool = False,
     ):
         if not 0.0 <= lr:
@@ -51,6 +52,7 @@ def __init__(
         self.amsgrad = amsgrad
         self.maximize = maximize
         self.foreach = foreach
+        self.fused = fused
         self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {})
 
         if len(params) == 0 and not _allow_empty_param_list:
@@ -114,6 +116,9 @@ def step_param(self, param: Tensor, grad: Optional[Tensor]):
                 weight_decay=self.defaults["weight_decay"],
                 eps=self.defaults["eps"],
                 foreach=self.foreach,
+                fused=self.fused,
+                grad_scale=None,
+                found_inf=None,
             )
 
     def step(self, gradients: List[Optional[Tensor]]):
@@ -181,4 +186,7 @@ def step(self, gradients: List[Optional[Tensor]]):
                 weight_decay=self.defaults["weight_decay"],
                 eps=self.defaults["eps"],
                 foreach=self.foreach,
+                fused=self.fused,
+                grad_scale=None,
+                found_inf=None,
             )
diff --git a/torch/distributed/optim/functional_rmsprop.py b/torch/distributed/optim/functional_rmsprop.py
index 079f35c7b774..1f2d92b433f0 100644
--- a/torch/distributed/optim/functional_rmsprop.py
+++ b/torch/distributed/optim/functional_rmsprop.py
@@ -17,7 +17,7 @@
 # NOTE: This should be only used by distributed optimizer internals
 # and not meant to expose to the user.
 @torch.jit.script
-class _FunctionalRMSprop(object):
+class _FunctionalRMSprop:
     def __init__(
         self,
         params: List[Tensor],
diff --git a/torch/distributed/optim/functional_rprop.py b/torch/distributed/optim/functional_rprop.py
index cd109cfa9661..402262c4dc62 100644
--- a/torch/distributed/optim/functional_rprop.py
+++ b/torch/distributed/optim/functional_rprop.py
@@ -17,7 +17,7 @@
 # NOTE: This should be only used by distributed optimizer internals
 # and not meant to expose to the user.
 @torch.jit.script
-class _FunctionalRprop(object):
+class _FunctionalRprop:
     def __init__(
         self,
         params: List[Tensor],
diff --git a/torch/distributed/optim/functional_sgd.py b/torch/distributed/optim/functional_sgd.py
index 1d529cd50189..ff6ce757735b 100644
--- a/torch/distributed/optim/functional_sgd.py
+++ b/torch/distributed/optim/functional_sgd.py
@@ -17,7 +17,7 @@
 # NOTE: This should be only used by distributed optimizer internals
 # and not meant to expose to the user.
 @torch.jit.script
-class _FunctionalSGD(object):
+class _FunctionalSGD:
     def __init__(
         self,
         params: List[Tensor],
diff --git a/torch/distributed/optim/named_optimizer.py b/torch/distributed/optim/named_optimizer.py
index 67913b48b0cd..fed73886dd5d 100644
--- a/torch/distributed/optim/named_optimizer.py
+++ b/torch/distributed/optim/named_optimizer.py
@@ -144,14 +144,14 @@ def state_dict(self) -> Dict[str, Any]:
 
         return self._post_state_dict({"state": ret_state, "param_groups": ret_groups})
 
-    def step(self):
+    def step(self, closure: Any = None) -> None:
         """
         Performs a single optimization step.
 
         This will call :meth:`torch.optim.Optimizer.step` on the wrapped
         optimizer.
         """
-        self._optimizer.step()
+        self._optimizer.step(closure=closure)
 
     def load_state_dict(self, state_dict: Mapping[str, Any]) -> None:
         """
@@ -284,16 +284,30 @@ def add_param_group(self, param_group: Mapping[str, Any]) -> None:
         # Update param_groups from optimizer.
         self.param_groups = self._optimizer.param_groups
 
+    def init_state(self) -> None:
+        """
+        Runs a dummy optimizer step, which allows to initialize optimizer state
+        because we do lazy init for most optimizers.
+
+        This allows doing in-place loading of optimizer state from a checkpoint.
+        """
+        for _, param in self.named_parameters.items():
+            if param.requires_grad:
+                t = torch.zeros_like(param)
+                param.grad = torch.autograd.Variable(t)
+        # Calling ``step`` will load the initial state for optimizer states.
+        self.step(closure=None)
+
     def _pre_load_state_dict(self, state_dict) -> Dict[str, Any]:
         if isinstance(self.module, FSDP):
-            return FSDP._load_optim_state_dict_pre_hook(
+            return FSDP.load_optim_state_dict_pre_hook(
                 self.module, self._optimizer, state_dict
             )
         return state_dict
 
     def _post_state_dict(self, state_dict) -> Dict[str, Any]:
         if isinstance(self.module, FSDP):
-            FSDP._optim_state_dict_post_hook(self.module, self._optimizer, state_dict)
+            FSDP.optim_state_dict_post_hook(self.module, self._optimizer, state_dict)
         return state_dict
 
 
diff --git a/torch/distributed/optim/optimizer.py b/torch/distributed/optim/optimizer.py
index c8b26fba0463..acea8e0445ad 100644
--- a/torch/distributed/optim/optimizer.py
+++ b/torch/distributed/optim/optimizer.py
@@ -29,7 +29,7 @@
 # TODO (wanchaol): remove this once we added TorchScript
 # class reference semantics
 @jit.interface
-class _ScriptLocalOptimizerInterface(object):
+class _ScriptLocalOptimizerInterface:
     def step(self, autograd_ctx_id: int) -> None:
         pass
 
@@ -59,7 +59,7 @@ def step(self, autograd_ctx_id: int):
 
 # TODO (wanchaol): remove/merge this with ScriptLocalOptimizer once
 # we have converted all to functional optimizer in distributed.optim
-class _LocalOptimizer(object):
+class _LocalOptimizer:
     # Ideally we would only need to share a lock for instances of
     # _LocalOptimizer that deal with the same parameters. We are
     # making a simplifying assumption here that if there is more
@@ -198,7 +198,7 @@ def __init__(self, optimizer_class, params_rref, *args, **kwargs):
         if self.is_functional_optim:
             optimizer_new_func = _new_script_local_optimizer
         else:
-            logger.warn(
+            logger.warning(
                 f"Creating the optimizer {optimizer_class} without TorchScript support, "
                 "this might result in slow computation time in multithreading environment"
                 "(i.e. Distributed Model Parallel training on CPU) due to the Python's "
diff --git a/torch/distributed/optim/post_localSGD_optimizer.py b/torch/distributed/optim/post_localSGD_optimizer.py
index 4c603996f0cc..f1717685966a 100644
--- a/torch/distributed/optim/post_localSGD_optimizer.py
+++ b/torch/distributed/optim/post_localSGD_optimizer.py
@@ -102,7 +102,7 @@ def step(self):
         self.optim.step()
         self.averager.average_parameters(params=self.param_groups)
 
-    def zero_grad(self, set_to_none: bool = False):  # type: ignore[override]
+    def zero_grad(self, set_to_none: bool = True):  # type: ignore[override]
         self.optim.zero_grad(set_to_none=set_to_none)
 
     def add_param_group(self, param_group):
diff --git a/torch/distributed/pipeline/sync/checkpoint.py b/torch/distributed/pipeline/sync/checkpoint.py
index a944b7b6de19..26d561cc3c15 100644
--- a/torch/distributed/pipeline/sync/checkpoint.py
+++ b/torch/distributed/pipeline/sync/checkpoint.py
@@ -28,12 +28,12 @@
 from contextlib import contextmanager
 import threading
 from typing import (
-    TYPE_CHECKING,
     Any,
     Deque,
     Generator,
     List,
     Optional,
+    Protocol,
     Union,
     Sequence,
     Tuple
@@ -60,12 +60,6 @@
 RNGStates = Tuple[Tensor, Optional[Tensor]]  # (cpu_rng_state, gpu_rng_state)
 
 
-if TYPE_CHECKING:
-    from typing_extensions import Protocol
-else:
-    Protocol = object
-
-
 # Protocol with __call__ instead of Callable can be used as an attribute type.
 # See: https://github.com/python/mypy/issues/708#issuecomment-561735949
 class Function(Protocol):
diff --git a/torch/distributed/pipeline/sync/microbatch.py b/torch/distributed/pipeline/sync/microbatch.py
index 10dbbf38cfd2..021644e4c0bd 100644
--- a/torch/distributed/pipeline/sync/microbatch.py
+++ b/torch/distributed/pipeline/sync/microbatch.py
@@ -20,7 +20,7 @@
 Function = Callable[[TensorOrTensors], Union[List[Any], Tensor]]
 
 
-class NoChunk(object):
+class NoChunk:
     """
     Wrapper for a Tensor in :meth:`Pipe.forward` indicating that the tensor
     should not be chunked on the batch dimension and instead be replicated
diff --git a/torch/distributed/pipeline/sync/pipe.py b/torch/distributed/pipeline/sync/pipe.py
index ba4fda1fcf83..e577279f1925 100644
--- a/torch/distributed/pipeline/sync/pipe.py
+++ b/torch/distributed/pipeline/sync/pipe.py
@@ -162,7 +162,7 @@ class WithDevice(nn.Module):
         >>> model = Pipe(model, chunks=8)
     """
     def __init__(self, module: nn.Module, device: torch.device):
-        super(WithDevice, self).__init__()
+        super().__init__()
         self._module = module
         self._device = torch.device(device)
 
diff --git a/torch/distributed/remote_device.py b/torch/distributed/remote_device.py
index b49ea174dd05..cc896cee9288 100644
--- a/torch/distributed/remote_device.py
+++ b/torch/distributed/remote_device.py
@@ -3,7 +3,7 @@
 import torch
 
 
-class _remote_device(object):
+class _remote_device:
     """
     Represents a device on a remote worker.
 
diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py
index 3e3607b3f390..4a6d1320c189 100644
--- a/torch/distributed/rendezvous.py
+++ b/torch/distributed/rendezvous.py
@@ -11,7 +11,6 @@
 from datetime import timedelta
 from typing import Dict, Optional
 
-import torch._six as six
 from torch.distributed import FileStore, PrefixStore, Store, TCPStore
 
 from .constants import default_pg_timeout
@@ -54,7 +53,7 @@ def register_rendezvous_handler(scheme, handler):
 # Query will have format "rank=0&world_size=1" and is
 # converted into {"rank": 0, "world_size": 1}
 def _query_to_dict(query: str) -> Dict[str, str]:
-    return dict((pair[0], pair[1]) for pair in (pair.split("=") for pair in filter(None, query.split("&"))))
+    return {pair[0]: pair[1] for pair in (pair.split("=") for pair in filter(None, query.split("&")))}
 
 
 def _rendezvous_helper(url: str, rank: int, world_size_opt: Optional[int], **kwargs):
@@ -91,7 +90,7 @@ def _rendezvous_helper(url: str, rank: int, world_size_opt: Optional[int], **kwa
 
 
 def rendezvous(url: str, rank: int = -1, world_size: int = -1, **kwargs):
-    if not isinstance(url, six.string_classes):
+    if not isinstance(url, str):
         raise RuntimeError("`url` must be a string. {}: {}".format(type(url), url))
 
     if not isinstance(rank, numbers.Integral):
diff --git a/torch/distributed/rpc/api.py b/torch/distributed/rpc/api.py
index f5e544806822..c23201d21b44 100644
--- a/torch/distributed/rpc/api.py
+++ b/torch/distributed/rpc/api.py
@@ -84,7 +84,7 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
-class AllGatherStates(object):
+class AllGatherStates:
     def __init__(self):
         # Each `gathered_objects` is an empty dict at beginning.
         # The leader worker is elected as the first worker in a sorted worker
@@ -425,7 +425,7 @@ def get_worker_info(worker_name=None):
 def _to_worker_info(to):
     if isinstance(to, WorkerInfo):
         return to
-    elif isinstance(to, str) or isinstance(to, int):
+    elif isinstance(to, (str, int)):
         return get_worker_info(to)
     else:
         raise ValueError("Cannot get WorkerInfo from name {}".format(to))
diff --git a/torch/distributed/run.py b/torch/distributed/run.py
index 9937189c9f49..0d0ce01e9988 100644
--- a/torch/distributed/run.py
+++ b/torch/distributed/run.py
@@ -30,11 +30,11 @@
 
 
 ``torchrun`` supports the same arguments as ``torch.distributed.launch`` **except**
-for ``--use_env`` which is now deprecated. To migrate from ``torch.distributed.launch``
+for ``--use-env`` which is now deprecated. To migrate from ``torch.distributed.launch``
 to ``torchrun`` follow these steps:
 
 1.  If your training script is already reading ``local_rank`` from the ``LOCAL_RANK`` environment variable.
-    Then you need simply omit the ``--use_env`` flag, e.g.:
+    Then you need simply omit the ``--use-env`` flag, e.g.:
 
     +--------------------------------------------------------------------+--------------------------------------------+
     |         ``torch.distributed.launch``                               |                ``torchrun``                |
@@ -42,11 +42,11 @@
     |                                                                    |                                            |
     | .. code-block:: shell-session                                      | .. code-block:: shell-session              |
     |                                                                    |                                            |
-    |    $ python -m torch.distributed.launch --use_env train_script.py  |    $ torchrun train_script.py              |
+    |    $ python -m torch.distributed.launch --use-env train_script.py  |    $ torchrun train_script.py              |
     |                                                                    |                                            |
     +--------------------------------------------------------------------+--------------------------------------------+
 
-2.  If your training script reads local rank from a ``--local_rank`` cmd argument.
+2.  If your training script reads local rank from a ``--local-rank`` cmd argument.
     Change your training script to read from the ``LOCAL_RANK`` environment variable as
     demonstrated by the following code snippet:
 
@@ -59,7 +59,7 @@
     |                                                       |                                                    |
     |    import argparse                                    |     import os                                      |
     |    parser = argparse.ArgumentParser()                 |     local_rank = int(os.environ["LOCAL_RANK"])     |
-    |    parser.add_argument("--local_rank", type=int)      |                                                    |
+    |    parser.add_argument("--local-rank", type=int)      |                                                    |
     |    args = parser.parse_args()                         |                                                    |
     |                                                       |                                                    |
     |    local_rank = args.local_rank                       |                                                    |
@@ -85,7 +85,7 @@
     torchrun
         --standalone
         --nnodes=1
-        --nproc_per_node=$NUM_TRAINERS
+        --nproc-per-node=$NUM_TRAINERS
         YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 
 Stacked single-node multi-worker
@@ -94,18 +94,18 @@
 To run multiple instances (separate jobs) of single-node, multi-worker on the
 same host, we need to make sure that each instance (job) is
 setup on different ports to avoid port conflicts (or worse, two jobs being merged
-as a single job). To do this you have to run with ``--rdzv_backend=c10d``
-and specify a different port by setting ``--rdzv_endpoint=localhost:$PORT_k``.
+as a single job). To do this you have to run with ``--rdzv-backend=c10d``
+and specify a different port by setting ``--rdzv-endpoint=localhost:$PORT_k``.
 For ``--nodes=1``, its often convenient to let ``torchrun`` pick a free random
 port automatically instead of manually assgining different ports for each run.
 
 ::
 
     torchrun
-        --rdzv_backend=c10d
-        --rdzv_endpoint=localhost:0
+        --rdzv-backend=c10d
+        --rdzv-endpoint=localhost:0
         --nnodes=1
-        --nproc_per_node=$NUM_TRAINERS
+        --nproc-per-node=$NUM_TRAINERS
         YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 
 
@@ -116,11 +116,11 @@
 
     torchrun
         --nnodes=$NUM_NODES
-        --nproc_per_node=$NUM_TRAINERS
-        --max_restarts=3
-        --rdzv_id=$JOB_ID
-        --rdzv_backend=c10d
-        --rdzv_endpoint=$HOST_NODE_ADDR
+        --nproc-per-node=$NUM_TRAINERS
+        --max-restarts=3
+        --rdzv-id=$JOB_ID
+        --rdzv-backend=c10d
+        --rdzv-endpoint=$HOST_NODE_ADDR
         YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 
 ``HOST_NODE_ADDR``, in form <host>[:<port>] (e.g. node1.example.com:29400), specifies the node and
@@ -137,11 +137,11 @@
 
     torchrun
         --nnodes=1:4
-        --nproc_per_node=$NUM_TRAINERS
-        --max_restarts=3
-        --rdzv_id=$JOB_ID
-        --rdzv_backend=c10d
-        --rdzv_endpoint=$HOST_NODE_ADDR
+        --nproc-per-node=$NUM_TRAINERS
+        --max-restarts=3
+        --rdzv-id=$JOB_ID
+        --rdzv-backend=c10d
+        --rdzv-endpoint=$HOST_NODE_ADDR
         YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
 
 ``HOST_NODE_ADDR``, in form <host>[:<port>] (e.g. node1.example.com:29400), specifies the node and
@@ -156,10 +156,10 @@
 
 For multi-node training you need to specify:
 
-1. ``--rdzv_id``: A unique job id (shared by all nodes participating in the job)
-2. ``--rdzv_backend``: An implementation of
+1. ``--rdzv-id``: A unique job id (shared by all nodes participating in the job)
+2. ``--rdzv-backend``: An implementation of
    :py:class:`torch.distributed.elastic.rendezvous.RendezvousHandler`
-3. ``--rdzv_endpoint``: The endpoint where the rendezvous backend is running; usually in form
+3. ``--rdzv-endpoint``: The endpoint where the rendezvous backend is running; usually in form
    ``host:port``.
 
 Currently ``c10d`` (recommended), ``etcd-v2``, and ``etcd`` (legacy)  rendezvous backends are
@@ -221,7 +221,7 @@
    of the worker is specified in the ``WorkerSpec``.
 
 5. ``LOCAL_WORLD_SIZE`` - The local world size (e.g. number of workers running locally); equals to
-   ``--nproc_per_node`` specified on ``torchrun``.
+   ``--nproc-per-node`` specified on ``torchrun``.
 
 6. ``WORLD_SIZE`` - The world size (total number of workers in the job).
 
@@ -246,7 +246,7 @@
 ------------
 
 1. (Not needed for the C10d backend) Start the rendezvous backend server and get the endpoint (to be
-   passed as ``--rdzv_endpoint`` to the launcher script)
+   passed as ``--rdzv-endpoint`` to the launcher script)
 
 2. Single-node multi-worker: Start the launcher on the host to start the agent process which
    creates and monitors a local worker group.
@@ -406,6 +406,7 @@ def get_args_parser() -> ArgumentParser:
         help="Number of nodes, or the range of nodes in form <minimum_nodes>:<maximum_nodes>.",
     )
     parser.add_argument(
+        "--nproc-per-node",
         "--nproc_per_node",
         action=env,
         type=str,
@@ -418,6 +419,7 @@ def get_args_parser() -> ArgumentParser:
     #
 
     parser.add_argument(
+        "--rdzv-backend",
         "--rdzv_backend",
         action=env,
         type=str,
@@ -425,6 +427,7 @@ def get_args_parser() -> ArgumentParser:
         help="Rendezvous backend.",
     )
     parser.add_argument(
+        "--rdzv-endpoint",
         "--rdzv_endpoint",
         action=env,
         type=str,
@@ -432,6 +435,7 @@ def get_args_parser() -> ArgumentParser:
         help="Rendezvous backend endpoint; usually in form <host>:<port>.",
     )
     parser.add_argument(
+        "--rdzv-id",
         "--rdzv_id",
         action=env,
         type=str,
@@ -439,6 +443,7 @@ def get_args_parser() -> ArgumentParser:
         help="User-defined group id.",
     )
     parser.add_argument(
+        "--rdzv-conf",
         "--rdzv_conf",
         action=env,
         type=str,
@@ -450,7 +455,7 @@ def get_args_parser() -> ArgumentParser:
         action=check_env,
         help="Start a local standalone rendezvous backend that is represented by a C10d TCP store "
         "on port 29400. Useful when launching single-node, multi-worker job. If specified "
-        "--rdzv_backend, --rdzv_endpoint, --rdzv_id are auto-assigned; any explicitly set values "
+        "--rdzv-backend, --rdzv-endpoint, --rdzv-id are auto-assigned; any explicitly set values "
         "are ignored.",
     )
 
@@ -459,6 +464,7 @@ def get_args_parser() -> ArgumentParser:
     #
 
     parser.add_argument(
+        "--max-restarts",
         "--max_restarts",
         action=env,
         type=int,
@@ -466,6 +472,7 @@ def get_args_parser() -> ArgumentParser:
         help="Maximum number of worker group restarts before failing.",
     )
     parser.add_argument(
+        "--monitor-interval",
         "--monitor_interval",
         action=env,
         type=float,
@@ -473,6 +480,7 @@ def get_args_parser() -> ArgumentParser:
         help="Interval, in seconds, to monitor the state of workers.",
     )
     parser.add_argument(
+        "--start-method",
         "--start_method",
         action=env,
         type=str,
@@ -495,6 +503,7 @@ def get_args_parser() -> ArgumentParser:
         "with the same behavior as 'python -m'.",
     )
     parser.add_argument(
+        "--no-python",
         "--no_python",
         action=check_env,
         help="Skip prepending the training script with 'python' - just execute it directly. Useful "
@@ -502,13 +511,15 @@ def get_args_parser() -> ArgumentParser:
     )
 
     parser.add_argument(
+        "--run-path",
         "--run_path",
         action=check_env,
         help="Run the training script with runpy.run_path in the same interpreter."
         " Script must be provided as an abs path (e.g. /abs/path/script.py)."
-        " Takes precedence over --no_python.",
+        " Takes precedence over --no-python.",
     )
     parser.add_argument(
+        "--log-dir",
         "--log_dir",
         action=env,
         type=str,
@@ -541,6 +552,7 @@ def get_args_parser() -> ArgumentParser:
     #
 
     parser.add_argument(
+        "--node-rank",
         "--node_rank",
         type=int,
         action=env,
@@ -548,16 +560,18 @@ def get_args_parser() -> ArgumentParser:
         help="Rank of the node for multi-node distributed training.",
     )
     parser.add_argument(
+        "--master-addr",
         "--master_addr",
         default="127.0.0.1",
         type=str,
         action=env,
         help="Address of the master node (rank 0) that only used for static rendezvous. It should "
         "be either the IP address or the hostname of rank 0. For single node multi-proc training "
-        "the --master_addr can simply be 127.0.0.1; IPv6 should have the pattern "
+        "the --master-addr can simply be 127.0.0.1; IPv6 should have the pattern "
         "`[0:0:0:0:0:0:0:1]`.",
     )
     parser.add_argument(
+        "--master-port",
         "--master_port",
         default=29500,
         type=int,
@@ -566,6 +580,7 @@ def get_args_parser() -> ArgumentParser:
         "training. It is only used for static rendezvous.",
     )
     parser.add_argument(
+        "--local-addr",
         "--local_addr",
         default=None,
         type=str,
@@ -652,7 +667,7 @@ def get_use_env(args) -> bool:
     """
     Retrieves ``use_env`` from the args.
     ``use_env`` is a legacy argument, if ``use_env`` is False, the
-    ``--node_rank`` argument will be transferred to all worker processes.
+    ``--node-rank`` argument will be transferred to all worker processes.
     ``use_env`` is only used by the ``torch.distributed.launch`` and will
     be deprecated in future releases.
     """
@@ -729,12 +744,12 @@ def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str
         else:
             if args.module:
                 raise ValueError(
-                    "Don't use both the '--no_python' flag"
+                    "Don't use both the '--no-python' flag"
                     " and the '--module' flag at the same time."
                 )
             cmd = args.training_script
     if not use_env:
-        cmd_args.append(f"--local_rank={macros.local_rank}")
+        cmd_args.append(f"--local-rank={macros.local_rank}")
     cmd_args.extend(args.training_script_args)
 
     return config, cmd, cmd_args
@@ -760,9 +775,9 @@ def run(args):
         log.info(
             f"\n**************************************\n"
             f"Rendezvous info:\n"
-            f"--rdzv_backend={args.rdzv_backend} "
-            f"--rdzv_endpoint={args.rdzv_endpoint} "
-            f"--rdzv_id={args.rdzv_id}\n"
+            f"--rdzv-backend={args.rdzv_backend} "
+            f"--rdzv-endpoint={args.rdzv_endpoint} "
+            f"--rdzv-id={args.rdzv_id}\n"
             f"**************************************\n"
         )
 
diff --git a/torch/distributed/tensor/parallel/__init__.py b/torch/distributed/tensor/parallel/__init__.py
index 760d4b24cb7c..85289c82e6bb 100644
--- a/torch/distributed/tensor/parallel/__init__.py
+++ b/torch/distributed/tensor/parallel/__init__.py
@@ -7,12 +7,15 @@
 from torch.distributed.tensor.parallel.style import (
     ColwiseParallel,
     make_input_replicate_1d,
+    make_input_reshard_replicate,
     make_input_shard_1d,
-    make_input_shard_1d_dim_last,
+    make_input_shard_1d_last_dim,
     make_output_replicate_1d,
+    make_output_reshard_tensor,
     make_output_shard_1d,
     make_output_tensor,
     PairwiseParallel,
+    PairwiseSequenceParallel,
     ParallelStyle,
     RowwiseParallel,
 )
@@ -20,13 +23,16 @@
 __all__ = [
     "ColwiseParallel",
     "PairwiseParallel",
+    "PairwiseSequenceParallel",
     "ParallelStyle",
     "RowwiseParallel",
     "TensorParallelMultiheadAttention",
     "make_input_replicate_1d",
+    "make_input_reshard_replicate",
     "make_input_shard_1d",
-    "make_input_shard_1d_dim_last",
+    "make_input_shard_1d_last_dim",
     "make_output_replicate_1d",
+    "make_output_reshard_tensor",
     "make_output_tensor",
     "make_output_shard_1d",
     "parallelize_module",
diff --git a/torch/distributed/tensor/parallel/_utils.py b/torch/distributed/tensor/parallel/_utils.py
index 5e9fc07b8b98..bb37623de97f 100644
--- a/torch/distributed/tensor/parallel/_utils.py
+++ b/torch/distributed/tensor/parallel/_utils.py
@@ -45,7 +45,7 @@ def _prepare_input_validate(
     def wrapper(*args, **kwargs):  # pyre-ignore[2, 3]
         assert len(args) >= 1, "_prepare_input needs at least one arg."
         input = args[0]
-        if isinstance(input, list) or isinstance(input, tuple):
+        if isinstance(input, (list, tuple)):
             input = input[0]
             args = (input, *args[1:])
         device_mesh = None if len(args) < 2 else args[1]
diff --git a/torch/distributed/tensor/parallel/_view_with_dim_change.py b/torch/distributed/tensor/parallel/_view_with_dim_change.py
index e2e1cc547178..2bdd1741181d 100644
--- a/torch/distributed/tensor/parallel/_view_with_dim_change.py
+++ b/torch/distributed/tensor/parallel/_view_with_dim_change.py
@@ -1,10 +1,16 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
-from typing import Tuple, Union
+from typing import Tuple, Union, Sequence, cast
 
 import torch
+from torch.distributed._tensor import DeviceMesh
 from torch.distributed._tensor import DTensor as DT
 from torch.distributed._tensor.ops.utils import prod
-from torch.distributed._tensor.placement_types import Shard
+from torch.distributed._tensor.placement_types import (
+    _Partial,
+    Placement,
+    Replicate,
+    Shard,
+)
 
 
 def _view_with_sharding_dim_change(
@@ -24,6 +30,28 @@ def _view_with_sharding_dim_change(
     else:
         return tensor.view(shape)
 
+def _infer_dtensor_stride(
+    local_tensor: torch.Tensor, mesh: DeviceMesh, placements: Sequence[Placement]
+) -> Tuple[int, ...]:
+    """
+    infer the dtensor stride from a local tensor
+    """
+    tensor_stride = list(local_tensor.stride())
+    for idx, placement in enumerate(placements):
+        if placement.is_shard():
+            shard_dim = cast(Shard, placement).dim
+            # recover tensor stride by modifying the stride that larger than
+            # the current stride on the shard_dim
+            for i in range(len(tensor_stride)):
+                if i != shard_dim and tensor_stride[i] >= tensor_stride[shard_dim]:
+                    # rescale the stride by the shard size
+                    tensor_stride[i] = tensor_stride[i] * mesh.size(idx)
+
+        elif not isinstance(placement, (Replicate, _Partial)):
+            raise RuntimeError(f"placement type {type(placement)} not supported!")
+
+    return tuple(tensor_stride)
+
 
 class _ViewAndRedistribute(torch.autograd.Function):
     @staticmethod
@@ -85,8 +113,10 @@ def forward(  # type: ignore[override]
                 new_local_tensor,
                 device_mesh,
                 new_sharding_placement,
-                size=torch.Size(shape),
+                shape=torch.Size(shape),
+                dtype=new_local_tensor.dtype,
                 requires_grad=new_local_tensor.requires_grad,
+                stride=_infer_dtensor_stride(new_local_tensor, device_mesh, new_sharding_placement),
             )
 
     @staticmethod
@@ -95,13 +125,17 @@ def backward(ctx, grad_output: DT) -> Tuple[DT, None, None]:  # type: ignore[ove
         previous_device_mesh = ctx.previous_device_mesh
         previous_local_tensor_size = ctx.previous_local_shape
         previous_global_shape = ctx.previous_global_shape
+
+        new_local_tensor = grad_output.to_local().view(*previous_local_tensor_size)
         return (
             DT(
-                grad_output.to_local().view(*previous_local_tensor_size),
+                new_local_tensor,
                 previous_device_mesh,
                 previous_placement,
-                size=previous_global_shape,
+                shape=previous_global_shape,
+                dtype=grad_output.dtype,
                 requires_grad=grad_output.requires_grad,
+                stride=_infer_dtensor_stride(new_local_tensor, previous_device_mesh, previous_placement),
             ),
             None,
             None,
diff --git a/torch/distributed/tensor/parallel/api.py b/torch/distributed/tensor/parallel/api.py
index 43cd1ec9f850..40e9479cd237 100644
--- a/torch/distributed/tensor/parallel/api.py
+++ b/torch/distributed/tensor/parallel/api.py
@@ -5,11 +5,13 @@
 import torch.nn as nn
 from torch.distributed._tensor import (
     DeviceMesh,
+    DTensor,
     distribute_module,
     distribute_tensor,
     Replicate,
     Shard,
 )
+from torch.distributed._tensor.sharding_prop import _CachingPropagator
 from torch.distributed.tensor.parallel._utils import _create_1d_device_mesh
 from torch.distributed.tensor.parallel.multihead_attention_tp import (
     TensorParallelMultiheadAttention,
@@ -26,6 +28,9 @@
     "parallelize_module",
 ]
 
+# switch the DTensor propagator to use the caching propagator to speed up
+# the TP eager execution time.
+DTensor._propagator = _CachingPropagator(DTensor._propagator.op_to_rules)
 
 def parallelize_module(  # type: ignore[return]
     module: nn.Module,
@@ -35,9 +40,11 @@ def parallelize_module(  # type: ignore[return]
 ) -> nn.Module:
     """
     The API to apply Tensor Parallelism (TP) in PyTorch. We parallelize module
-    or sub_modules based on a parallelize_plan which contains the parallel_style
-    which indicates how user want the module or sub_module to be parallelized.
-    User can also specify different parallel_style per module fully qualifed name (FQN).
+    or sub_modules based on a parallelize_plan. The parallelize_plan contains
+    :class:`ParallelStyle`, which indicates how user wants the module or sub_module
+    to be parallelized.
+
+    User can also specify different parallel style per module fully qualifed name (FQN).
     The API supports 2D parallelism natively by accepting an n-dimension device_mesh
     and users just need to specify the dimension where we perform tensor parallelism on.
 
@@ -61,7 +68,7 @@ def parallelize_module(  # type: ignore[return]
 
     Example::
         >>> # xdoctest: +SKIP("distributed")
-        >>> from torch.distributed._tensor.parallel import parallelize_module, PairwiseParallel
+        >>> from torch.distributed.tensor.parallel import parallelize_module, PairwiseParallel
         >>>
         >>> # Define the module.
         >>> m = Model(...)
@@ -78,15 +85,13 @@ def parallelize_module(  # type: ignore[return]
 
     if isinstance(parallelize_plan, ParallelStyle):
         # RowwiseParallel or ColwiseParallel
-        if isinstance(parallelize_plan, ColwiseParallel) or isinstance(
-            parallelize_plan, RowwiseParallel
-        ):
+        if isinstance(parallelize_plan, (ColwiseParallel, RowwiseParallel)):
             return _parallelize_linear(module, device_mesh, parallelize_plan)
         # PairwiseParallel
         if _is_mha_for_pairwise_parallel(module):
             return _parallelize_multihead_attn(module, device_mesh)
         elif _is_mlp_for_pairwise_parallel(module):
-            return _parallelize_mlp(module, device_mesh)
+            return _parallelize_mlp(module, device_mesh, parallelize_plan)
         else:
             for n, m in module.named_children():
                 module.register_module(
@@ -96,12 +101,18 @@ def parallelize_module(  # type: ignore[return]
     elif isinstance(parallelize_plan, dict):
         for module_path, parallelize_style in parallelize_plan.items():
             sub_module = module.get_submodule(module_path)
-            module.register_module(  # type: ignore[call-arg] # pyre-ignore[20]
+            parent_module = module
+            if "." in module_path:
+                parent_module_path = ".".join(module_path.split(".")[:-1])
+                parent_module = module.get_submodule(parent_module_path)
+                module_path = module_path.split(".")[-1]
+            parent_module.register_module(  # type: ignore[call-arg] # pyre-ignore[20]
+                module_path,
                 parallelize_module(  # type: ignore[arg-type]
-                    module_path, sub_module, device_mesh, parallelize_style  # type: ignore[arg-type] # pyre-ignore[6]
-                )
+                    sub_module, device_mesh, parallelize_style  # type: ignore[arg-type] # pyre-ignore[6]
+                ),
             )
-            return module
+        return module
     else:
         raise RuntimeError(  # pyre-ignore[7]
             "Expect Union[ParallelStyle, Dict[str, ParallelStyle]] for"
@@ -120,9 +131,7 @@ def _is_mha_for_pairwise_parallel(module: nn.Module) -> bool:
     Return:
         A boolean object which specifies whether the module is MHA supported by Pairwise parallel or not.
     """
-    return isinstance(module, TensorParallelMultiheadAttention) or isinstance(
-        module, nn.MultiheadAttention
-    )
+    return isinstance(module, (TensorParallelMultiheadAttention, nn.MultiheadAttention))
 
 
 def _is_mlp_for_pairwise_parallel(module: nn.Module) -> bool:
diff --git a/torch/distributed/tensor/parallel/fsdp.py b/torch/distributed/tensor/parallel/fsdp.py
index 3ab3a32dc04b..2339b3f2a7fd 100644
--- a/torch/distributed/tensor/parallel/fsdp.py
+++ b/torch/distributed/tensor/parallel/fsdp.py
@@ -8,6 +8,7 @@
 import torch.distributed._shard.sharding_spec as shard_spec
 import torch.distributed.distributed_c10d as c10d
 
+from torch.distributed.fsdp._common_utils import _set_fsdp_flattened
 from torch.distributed._shard.sharded_tensor import (
     Shard,
     ShardedTensor,
@@ -29,7 +30,63 @@
 
 from torch.distributed.remote_device import _remote_device
 
-__all__ = ["is_available"]
+__all__ = ["enable_2d_with_fsdp"]
+
+
+def enable_2d_with_fsdp() -> bool:
+    """
+    The API registers the extension which is needed for Tensor Parallelism (TP)
+    to work with FullyShardedDataParallel (FSDP). We first parallelize parameters
+    within one module or sub_modules based on a parallelize_plan and will let FSDP
+    reshard the local tensor of distributed parameter which is essentially a DTensor.
+
+    Return:
+        A `bool` indicated whether extension registration succeeds or not.
+    """
+    try:
+        from torch.distributed.fsdp._fsdp_extensions import (
+            _set_fsdp_extensions,
+            FSDPExtensions,
+        )
+
+        class DTensorExtensions(FSDPExtensions):
+            def pre_flatten_transform(
+                self,
+                tensor: torch.Tensor,
+            ) -> Tuple[torch.Tensor, Optional[_STShardingInfo]]:
+                return _flatten_tensor(tensor)
+
+            def post_unflatten_transform(
+                self, tensor: torch.Tensor, param_extension: _STShardingInfo
+            ) -> torch.Tensor:
+                return _unflatten_tensor(tensor, param_extension)
+
+            def chunk_tensor(
+                self,
+                tensor: torch.Tensor,
+                rank: int,
+                world_size: int,
+                num_devices_per_node: int,
+                pg: dist.ProcessGroup,
+            ) -> torch.Tensor:
+                return _chunk_tensor(tensor, rank, world_size, num_devices_per_node, pg)
+
+            def pre_load_state_dict_transform(
+                self,
+                tensor: torch.Tensor,
+            ) -> Tuple[torch.Tensor, List[Shard]]:
+                return _pre_load_state_dict(tensor)
+
+        _set_fsdp_extensions(DTensorExtensions())
+        return True
+
+    except BaseException as e:
+        warnings.warn(
+            "PyTorch doesn't have TensorFlattener extension point available"
+            "2D parallelism won't work with FSDP"
+            f"exception: {e}"
+        )
+        return False
 
 
 class _STShardingInfo(NamedTuple):
@@ -65,9 +122,9 @@ def _get_box_for(tensor: DistributedTensor, idx: int) -> Tuple[torch.Size, torch
 
 def _get_local_box(tensor: DistributedTensor) -> Tuple[torch.Size, torch.Size]:
     device_mesh = tensor.device_mesh
-    dim_0_coord = device_mesh.get_coordinate_on_dim(0)
-    assert dim_0_coord is not None
-    return _get_box_for(tensor, dim_0_coord)
+    coord = device_mesh.get_coordinate()
+    assert coord is not None
+    return _get_box_for(tensor, coord[0])
 
 
 def _create_shard_md_from_dt(dt: DistributedTensor, current_rank: int) -> ShardMetadata:
@@ -292,54 +349,3 @@ def _pre_load_state_dict(
         tensor = inner_tensor
 
     return (tensor, shards if len(shards) > 0 else [])
-
-
-try:
-    from torch.distributed.fsdp._common_utils import _set_fsdp_flattened
-    from torch.distributed.fsdp._fsdp_extensions import (
-        _set_fsdp_extensions,
-        FSDPExtensions,
-    )
-
-    class DTensorExtensions(FSDPExtensions):
-        def pre_flatten_transform(
-            self,
-            tensor: torch.Tensor,
-        ) -> Tuple[torch.Tensor, Optional[_STShardingInfo]]:
-            return _flatten_tensor(tensor)
-
-        def post_unflatten_transform(
-            self, tensor: torch.Tensor, param_extension: _STShardingInfo
-        ) -> torch.Tensor:
-            return _unflatten_tensor(tensor, param_extension)
-
-        def chunk_tensor(
-            self,
-            tensor: torch.Tensor,
-            rank: int,
-            world_size: int,
-            num_devices_per_node: int,
-            pg: dist.ProcessGroup,
-        ) -> torch.Tensor:
-            return _chunk_tensor(tensor, rank, world_size, num_devices_per_node, pg)
-
-        def pre_load_state_dict_transform(
-            self,
-            tensor: torch.Tensor,
-        ) -> Tuple[torch.Tensor, List[Shard]]:
-            return _pre_load_state_dict(tensor)
-
-    _set_fsdp_extensions(DTensorExtensions())
-
-    def is_available() -> bool:
-        return True
-
-except BaseException as e:
-    warnings.warn(
-        "PyTorch doesn't have TensorFlattener extension point available"
-        "2D parallelism won't work with FSDP"
-        f"exception: {e}"
-    )
-
-    def is_available() -> bool:
-        return False
diff --git a/torch/distributed/tensor/parallel/multihead_attention_tp.py b/torch/distributed/tensor/parallel/multihead_attention_tp.py
index 3c408e75e9d1..26b266602bf7 100644
--- a/torch/distributed/tensor/parallel/multihead_attention_tp.py
+++ b/torch/distributed/tensor/parallel/multihead_attention_tp.py
@@ -64,7 +64,7 @@ def __init__(
         tp_size: int = 1,
         self_attention: bool = True,
     ) -> None:
-        super(TensorParallelMultiheadAttention, self).__init__()
+        super().__init__()
         self.device: torch.device = (
             torch.device("cuda" if torch.cuda.is_available() else "cpu")
             if device is None
diff --git a/torch/distributed/tensor/parallel/style.py b/torch/distributed/tensor/parallel/style.py
index 34a160ab14ad..f5587b2e36cf 100644
--- a/torch/distributed/tensor/parallel/style.py
+++ b/torch/distributed/tensor/parallel/style.py
@@ -16,10 +16,13 @@
     "RowwiseParallel",
     "ColwiseParallel",
     "PairwiseParallel",
+    "PairwiseSequenceParallel",
     "make_input_replicate_1d",
+    "make_input_reshard_replicate",
     "make_input_shard_1d",
-    "make_input_shard_1d_dim_last",
+    "make_input_shard_1d_last_dim",
     "make_output_replicate_1d",
+    "make_output_reshard_tensor",
     "make_output_tensor",
     "make_output_shard_1d",
 ]
@@ -44,15 +47,37 @@ class PairwiseParallel(ParallelStyle):
     """
     PairwiseParallel concatenate colwise and rowwise styles as a fixed
     pair like what Megatron-LM(https://arxiv.org/abs/1909.08053) is doing.
-    We assume both input and output needs to a replicate DTensor.
+    We assume both input and output need to be replicate DTensors.
 
     .. warning::
         PairwiseParallel only supports ``nn.Multihead Attention``,
         ``nn.Transformer`` or even-number-layer MLP for now.
     """
 
+    def __init__(self, _prepare_input=None, _prepare_output=None) -> None:
+        _prepare_input = (
+            make_input_replicate_1d if _prepare_input is None else _prepare_input
+        )
+        _prepare_output = (
+            make_output_tensor if _prepare_output is None else _prepare_output
+        )
+        super().__init__(_prepare_input, _prepare_output)
+
+
+class PairwiseSequenceParallel(PairwiseParallel):
+    """
+    PairwiseSequenceParallel concatenate colwise and rowwise styles as a fixed
+    pair together with sequence parallel like what Megatron-LM Sequence parallel
+    (https://arxiv.org/pdf/2205.05198.pdf) is doing.
+    We assume both input and output need to be sharded DTensors.
+
+    .. warning::
+        PairwiseSequenceParallel only supports ``nn.Multihead Attention``,
+        ``nn.Transformer`` or even-number-layer MLP for now.
+    """
+
     def __init__(self) -> None:
-        super().__init__(make_input_replicate_1d, make_output_tensor)
+        super().__init__(make_input_reshard_replicate, make_output_reshard_tensor)
 
 
 class RowwiseParallel(ParallelStyle):
@@ -62,7 +87,7 @@ class RowwiseParallel(ParallelStyle):
     """
 
     def __init__(self) -> None:
-        super().__init__(make_input_shard_1d_dim_last, make_output_replicate_1d)
+        super().__init__(make_input_shard_1d_last_dim, make_output_replicate_1d)
 
 
 class ColwiseParallel(ParallelStyle):
@@ -112,7 +137,8 @@ def make_input_shard_1d(
         )
 
 
-def make_input_shard_1d_dim_last(
+@_prepare_input_validate  # type: ignore[arg-type] # pyre-ignore[56]
+def make_input_shard_1d_last_dim(
     input: Union[torch.Tensor, DTensor],
     device_mesh: Optional[DeviceMesh] = None,
 ) -> DTensor:
@@ -121,7 +147,7 @@ def make_input_shard_1d_dim_last(
 
     Args:
         input (Union[:class:`torch.Tensor`, :class:`DTensor`]):
-            This single tensor will be sharded on dimension ``dim``
+            This single tensor will be sharded on the last dimension
             over the 1-D :class:`DeviceMesh`.
         device_mesh (:class:`DeviceMesh`, optional):
             The 1-D device mesh where ``input`` will be sharded.
@@ -131,11 +157,39 @@ def make_input_shard_1d_dim_last(
             Default: ``None``
 
     Returns:
-        A :class:`DTensor` sharded on dimension ``dim`` over ``device_mesh``.
+        A :class:`DTensor` sharded on the last dimension over ``device_mesh``.
     """
     return make_input_shard_1d(input, device_mesh, dim=-1)  # type: ignore[call-arg]
 
 
+@_prepare_input_validate  # type: ignore[arg-type] # pyre-ignore[56]
+def make_input_reshard_replicate(
+    input: torch.Tensor,
+    device_mesh: DeviceMesh,
+) -> DTensor:
+    """
+    To construct a Sharded DTensor from a tensor on different ranks
+    and then convert to a replicate DTensor.
+
+    Args:
+        input (:class:`torch.Tensor`):
+            The input tensor on each rank which consists of a global DTensor
+            sharded on dimension ``0`` over the 1-D :class:`DeviceMesh`
+            and then the sharded DTensor is converted to a replicate DTensor.
+        device_mesh (:class:`DeviceMesh`, optional):
+            The 1-D device mesh where ``input`` will be sharded.
+            If :class:`DeviceMesh` is not 1-D, an exception will be thrown.
+            Default: ``None``
+
+    Returns:
+        A :class:`DTensor` sharded on dimension ``0`` over ``device_mesh``
+            and then converted to replicate.
+    """
+    return make_input_replicate_1d(  # type: ignore[call-arg]
+        make_input_shard_1d(input, device_mesh, dim=0), device_mesh  # type: ignore[call-arg]
+    )
+
+
 @_prepare_input_validate  # type: ignore[arg-type] # pyre-ignore[56]
 def make_input_replicate_1d(
     input: Union[torch.Tensor, DTensor],
@@ -240,3 +294,27 @@ def make_output_tensor(
     return make_output_replicate_1d(  # type: ignore[attr-defined]
         output, device_mesh
     ).to_local()  # type: ignore[call-arg]
+
+
+@_prepare_output_validate  # type: ignore[arg-type] # pyre-ignore[56]
+def make_output_reshard_tensor(
+    output: DTensor,
+    device_mesh: Optional[DeviceMesh] = None,
+) -> torch.Tensor:
+    """
+    Convert Output DTensor to a sharded DTensor and return the local tensor.
+
+    Args:
+        output (:class:`DTensor`):
+            Output of module to be converted.
+        device_mesh (:class:`DeviceMesh`, optional):
+            Object needed to shard the output and it needs to be a 1D ``device_mesh``
+            and we will throw exceptions if a non-1D ``device_mesh`` is passed in.
+            If no ``device_mesh`` is passed in, we will reuse the one from output.
+            Default: ``None``
+
+    Return:
+        A :class:`torch.Tensor` object converted from output DTensor.
+    """
+
+    return make_output_shard_1d(output, device_mesh).to_local()  # type: ignore[call-arg, attr-defined]
diff --git a/torch/distributed/utils.py b/torch/distributed/utils.py
index bfb6b8c6243e..5848c0ecab0e 100644
--- a/torch/distributed/utils.py
+++ b/torch/distributed/utils.py
@@ -1,4 +1,6 @@
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, Callable, Union, Set, OrderedDict
+import dataclasses
+import traceback
 
 import torch
 import torch.distributed as dist
@@ -53,7 +55,7 @@ def _recursive_to(inputs, target_gpu, use_side_stream_for_tensor_copies):
     """
 
     def to_map(obj):
-        if isinstance(obj, torch.Tensor) or isinstance(obj, PackedSequence):
+        if isinstance(obj, (torch.Tensor, PackedSequence)):
             device = obj.data.device if isinstance(obj, PackedSequence) else obj.device
             if device == torch.device("cuda", target_gpu):
                 return (obj,)
@@ -94,6 +96,92 @@ def to_map(obj):
         to_map = None  # type: ignore[assignment]
     return res
 
+def _p_assert(cond: Any, s: str, raise_assertion_error: bool = True) -> None:
+    """This is used as an alternate to ``assert`` when in the backward context
+    to print the error message ``s`` since otherwise, it is swallowed."""
+    if not cond:
+        print(s)
+        traceback.print_stack()
+        if raise_assertion_error:
+            raise AssertionError(s)
+
+def _alloc_storage(tensor: torch.Tensor, size: torch.Size) -> bool:
+    """
+    Allocate storage for ``tensor`` with the given size.
+
+    Returns:
+        bool: ``True`` if this method allocated storage and ``False`` if the
+        storage was already allocated.
+    """
+    with torch.no_grad():
+        already_allocated = tensor._typed_storage()._size() == size.numel()
+        if not already_allocated:
+            tensor_storage_size = tensor._typed_storage()._size()
+            _p_assert(
+                tensor_storage_size == 0,
+                f"Tensor storage should have been resized to be 0 but got {tensor_storage_size}",
+            )
+            tensor._typed_storage()._resize_(size.numel())
+        return not already_allocated
+
+
+def _free_storage(tensor: torch.Tensor) -> bool:
+    """
+    Frees the underlying storage of ``tensor``.
+
+    Returns:
+        bool: ``True`` if the method freed the storage and ``False`` if the
+        storage was already freed.
+    """
+    with torch.no_grad():
+        already_freed = tensor._typed_storage()._size() == 0
+        if not already_freed:
+            _p_assert(
+                tensor.storage_offset() == 0,
+                "Freeing a tensor's storage is unsafe when it is not the sole occupant\n"
+                f"storage offset: {tensor.storage_offset()}\n"
+                f"storage size: {tensor._typed_storage()._size()}\n"
+                f"tensor shape: {tensor.shape}",
+            )
+            tensor._typed_storage()._resize_(0)
+        return not already_freed
+
+def _apply_to_tensors(
+    fn: Callable,
+    container: Union[torch.Tensor, Dict, List, Tuple, Set, OrderedDict, PackedSequence],
+) -> Any:
+    """Recursively apply to all tensor in different kinds of container types."""
+
+    def apply(
+        x: Union[torch.Tensor, Dict, List, Tuple, Set, OrderedDict, PackedSequence]
+    ) -> Any:
+        if torch.is_tensor(x):
+            return fn(x)
+        elif hasattr(x, "__dataclass_fields__"):
+            dc = dataclasses.replace(x)
+            for f in dataclasses.fields(dc):
+                name = f.name
+                setattr(dc, name, apply(getattr(dc, name)))
+            return dc
+        elif isinstance(x, OrderedDict):
+            od = x.__class__()
+            for key, value in x.items():
+                od[key] = apply(value)
+            return od
+        elif isinstance(x, PackedSequence):
+            apply(x.data)
+            return x
+        elif isinstance(x, dict):
+            return {key: apply(value) for key, value in x.items()}
+        elif _is_namedtuple(x):
+            res = (apply(el) for el in x)
+            return type(x)(*res)
+        elif isinstance(x, (list, tuple, set)):
+            return type(x)(apply(el) for el in x)
+        else:
+            return x
+
+    return apply(container)
 
 def _to_kwargs(inputs, kwargs, device_id, use_side_stream_for_tensor_copies):
     inputs = (
diff --git a/torch/distributions/bernoulli.py b/torch/distributions/bernoulli.py
index deaf98b16b34..9d9b0fd7b8c9 100644
--- a/torch/distributions/bernoulli.py
+++ b/torch/distributions/bernoulli.py
@@ -1,7 +1,7 @@
 from numbers import Number
 
 import torch
-from torch._six import nan
+from torch import nan
 from torch.distributions import constraints
 from torch.distributions.exp_family import ExponentialFamily
 from torch.distributions.utils import broadcast_all, probs_to_logits, logits_to_probs, lazy_property
@@ -48,7 +48,7 @@ def __init__(self, probs=None, logits=None, validate_args=None):
             batch_shape = torch.Size()
         else:
             batch_shape = self._param.size()
-        super(Bernoulli, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Bernoulli, _instance)
diff --git a/torch/distributions/beta.py b/torch/distributions/beta.py
index 51316e7f56eb..dd6ed437c1e5 100644
--- a/torch/distributions/beta.py
+++ b/torch/distributions/beta.py
@@ -36,7 +36,7 @@ def __init__(self, concentration1, concentration0, validate_args=None):
             concentration1, concentration0 = broadcast_all(concentration1, concentration0)
             concentration1_concentration0 = torch.stack([concentration1, concentration0], -1)
         self._dirichlet = Dirichlet(concentration1_concentration0, validate_args=validate_args)
-        super(Beta, self).__init__(self._dirichlet._batch_shape, validate_args=validate_args)
+        super().__init__(self._dirichlet._batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Beta, _instance)
diff --git a/torch/distributions/binomial.py b/torch/distributions/binomial.py
index 5b2d31213ad4..c4d33ca8a4c4 100644
--- a/torch/distributions/binomial.py
+++ b/torch/distributions/binomial.py
@@ -50,7 +50,7 @@ def __init__(self, total_count=1, probs=None, logits=None, validate_args=None):
 
         self._param = self.probs if probs is not None else self.logits
         batch_shape = self._param.size()
-        super(Binomial, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Binomial, _instance)
diff --git a/torch/distributions/categorical.py b/torch/distributions/categorical.py
index ae39a1ad520f..7cff0e4ee35a 100644
--- a/torch/distributions/categorical.py
+++ b/torch/distributions/categorical.py
@@ -1,5 +1,5 @@
 import torch
-from torch._six import nan
+from torch import nan
 from torch.distributions import constraints
 from torch.distributions.distribution import Distribution
 from torch.distributions.utils import probs_to_logits, logits_to_probs, lazy_property
@@ -63,7 +63,7 @@ def __init__(self, probs=None, logits=None, validate_args=None):
         self._param = self.probs if probs is not None else self.logits
         self._num_events = self._param.size()[-1]
         batch_shape = self._param.size()[:-1] if self._param.ndimension() > 1 else torch.Size()
-        super(Categorical, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Categorical, _instance)
diff --git a/torch/distributions/cauchy.py b/torch/distributions/cauchy.py
index 3787406bec45..2ef0fb95aa82 100644
--- a/torch/distributions/cauchy.py
+++ b/torch/distributions/cauchy.py
@@ -1,5 +1,5 @@
 import math
-from torch._six import inf, nan
+from torch import inf, nan
 from numbers import Number
 
 import torch
@@ -36,7 +36,7 @@ def __init__(self, loc, scale, validate_args=None):
             batch_shape = torch.Size()
         else:
             batch_shape = self.loc.size()
-        super(Cauchy, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Cauchy, _instance)
diff --git a/torch/distributions/chi2.py b/torch/distributions/chi2.py
index 5ecbd854e49b..4394a078832f 100644
--- a/torch/distributions/chi2.py
+++ b/torch/distributions/chi2.py
@@ -21,11 +21,11 @@ class Chi2(Gamma):
     arg_constraints = {'df': constraints.positive}
 
     def __init__(self, df, validate_args=None):
-        super(Chi2, self).__init__(0.5 * df, 0.5, validate_args=validate_args)
+        super().__init__(0.5 * df, 0.5, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Chi2, _instance)
-        return super(Chi2, self).expand(batch_shape, new)
+        return super().expand(batch_shape, new)
 
     @property
     def df(self):
diff --git a/torch/distributions/constraint_registry.py b/torch/distributions/constraint_registry.py
index a9709ba4bfc0..0207f88c9b19 100644
--- a/torch/distributions/constraint_registry.py
+++ b/torch/distributions/constraint_registry.py
@@ -76,13 +76,13 @@ def my_factory(constraint):
 ]
 
 
-class ConstraintRegistry(object):
+class ConstraintRegistry:
     """
     Registry to link constraints to transforms.
     """
     def __init__(self):
         self._registry = {}
-        super(ConstraintRegistry, self).__init__()
+        super().__init__()
 
     def register(self, constraint, factory=None):
         """
diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py
index a21759572579..a4e3c08461cd 100644
--- a/torch/distributions/constraints.py
+++ b/torch/distributions/constraints.py
@@ -65,7 +65,7 @@
 ]
 
 
-class Constraint(object):
+class Constraint:
     """
     Abstract base class for constraints.
 
diff --git a/torch/distributions/continuous_bernoulli.py b/torch/distributions/continuous_bernoulli.py
index acd3e6430b0c..415d952f1678 100644
--- a/torch/distributions/continuous_bernoulli.py
+++ b/torch/distributions/continuous_bernoulli.py
@@ -62,7 +62,7 @@ def __init__(self, probs=None, logits=None, lims=(0.499, 0.501), validate_args=N
         else:
             batch_shape = self._param.size()
         self._lims = lims
-        super(ContinuousBernoulli, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(ContinuousBernoulli, _instance)
diff --git a/torch/distributions/dirichlet.py b/torch/distributions/dirichlet.py
index 9c7d43d04289..1612e37f42ed 100644
--- a/torch/distributions/dirichlet.py
+++ b/torch/distributions/dirichlet.py
@@ -51,7 +51,7 @@ def __init__(self, concentration, validate_args=None):
             raise ValueError("`concentration` parameter must be at least one-dimensional.")
         self.concentration = concentration
         batch_shape, event_shape = concentration.shape[:-1], concentration.shape[-1:]
-        super(Dirichlet, self).__init__(batch_shape, event_shape, validate_args=validate_args)
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Dirichlet, _instance)
diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py
index 507e80bbfac7..bc6910e98c47 100644
--- a/torch/distributions/distribution.py
+++ b/torch/distributions/distribution.py
@@ -7,7 +7,7 @@
 
 __all__ = ['Distribution']
 
-class Distribution(object):
+class Distribution:
     r"""
     Distribution is the abstract base class for probability distributions.
     """
@@ -66,7 +66,7 @@ def __init__(
                         f"to satisfy the constraint {repr(constraint)}, "
                         f"but found invalid values:\n{value}"
                     )
-        super(Distribution, self).__init__()
+        super().__init__()
 
     def expand(self, batch_shape: torch.Size, _instance=None):
         """
diff --git a/torch/distributions/exponential.py b/torch/distributions/exponential.py
index dac54a313ea5..f333bfc18b75 100644
--- a/torch/distributions/exponential.py
+++ b/torch/distributions/exponential.py
@@ -45,7 +45,7 @@ def variance(self):
     def __init__(self, rate, validate_args=None):
         self.rate, = broadcast_all(rate)
         batch_shape = torch.Size() if isinstance(rate, Number) else self.rate.size()
-        super(Exponential, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Exponential, _instance)
diff --git a/torch/distributions/fishersnedecor.py b/torch/distributions/fishersnedecor.py
index 5fbdf6b690fd..26511ab4b894 100644
--- a/torch/distributions/fishersnedecor.py
+++ b/torch/distributions/fishersnedecor.py
@@ -1,6 +1,6 @@
 from numbers import Number
 import torch
-from torch._six import nan
+from torch import nan
 from torch.distributions import constraints
 from torch.distributions.distribution import Distribution
 from torch.distributions.gamma import Gamma
@@ -36,7 +36,7 @@ def __init__(self, df1, df2, validate_args=None):
             batch_shape = torch.Size()
         else:
             batch_shape = self.df1.size()
-        super(FisherSnedecor, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(FisherSnedecor, _instance)
diff --git a/torch/distributions/gamma.py b/torch/distributions/gamma.py
index d6522b202d23..2601109dcb4f 100644
--- a/torch/distributions/gamma.py
+++ b/torch/distributions/gamma.py
@@ -51,7 +51,7 @@ def __init__(self, concentration, rate, validate_args=None):
             batch_shape = torch.Size()
         else:
             batch_shape = self.concentration.size()
-        super(Gamma, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Gamma, _instance)
diff --git a/torch/distributions/geometric.py b/torch/distributions/geometric.py
index 5f61427488e7..0cac28f6e9ef 100644
--- a/torch/distributions/geometric.py
+++ b/torch/distributions/geometric.py
@@ -44,7 +44,7 @@ def __init__(self, probs=None, logits=None, validate_args=None):
             batch_shape = torch.Size()
         else:
             batch_shape = probs_or_logits.size()
-        super(Geometric, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
         if self._validate_args and probs is not None:
             # Add an extra check beyond unit_interval
             value = self.probs
diff --git a/torch/distributions/gumbel.py b/torch/distributions/gumbel.py
index 07c3ea9f8dd8..ae272c54159d 100644
--- a/torch/distributions/gumbel.py
+++ b/torch/distributions/gumbel.py
@@ -37,13 +37,13 @@ def __init__(self, loc, scale, validate_args=None):
                                 torch.full_like(self.loc, 1 - finfo.eps))
         transforms = [ExpTransform().inv, AffineTransform(loc=0, scale=-torch.ones_like(self.scale)),
                       ExpTransform().inv, AffineTransform(loc=loc, scale=-self.scale)]
-        super(Gumbel, self).__init__(base_dist, transforms, validate_args=validate_args)
+        super().__init__(base_dist, transforms, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Gumbel, _instance)
         new.loc = self.loc.expand(batch_shape)
         new.scale = self.scale.expand(batch_shape)
-        return super(Gumbel, self).expand(batch_shape, _instance=new)
+        return super().expand(batch_shape, _instance=new)
 
     # Explicitly defining the log probability function for Gumbel due to precision issues
     def log_prob(self, value):
diff --git a/torch/distributions/half_cauchy.py b/torch/distributions/half_cauchy.py
index e8f4bcae3811..c50107654342 100644
--- a/torch/distributions/half_cauchy.py
+++ b/torch/distributions/half_cauchy.py
@@ -1,7 +1,7 @@
 import math
 
 import torch
-from torch._six import inf
+from torch import inf
 from torch.distributions import constraints
 from torch.distributions.transforms import AbsTransform
 from torch.distributions.cauchy import Cauchy
@@ -32,12 +32,11 @@ class HalfCauchy(TransformedDistribution):
 
     def __init__(self, scale, validate_args=None):
         base_dist = Cauchy(0, scale, validate_args=False)
-        super(HalfCauchy, self).__init__(base_dist, AbsTransform(),
-                                         validate_args=validate_args)
+        super().__init__(base_dist, AbsTransform(), validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(HalfCauchy, _instance)
-        return super(HalfCauchy, self).expand(batch_shape, _instance=new)
+        return super().expand(batch_shape, _instance=new)
 
     @property
     def scale(self):
diff --git a/torch/distributions/half_normal.py b/torch/distributions/half_normal.py
index d5b133707ad9..184d6f16c3c3 100644
--- a/torch/distributions/half_normal.py
+++ b/torch/distributions/half_normal.py
@@ -1,7 +1,7 @@
 import math
 
 import torch
-from torch._six import inf
+from torch import inf
 from torch.distributions import constraints
 from torch.distributions.transforms import AbsTransform
 from torch.distributions.normal import Normal
@@ -32,12 +32,11 @@ class HalfNormal(TransformedDistribution):
 
     def __init__(self, scale, validate_args=None):
         base_dist = Normal(0, scale, validate_args=False)
-        super(HalfNormal, self).__init__(base_dist, AbsTransform(),
-                                         validate_args=validate_args)
+        super().__init__(base_dist, AbsTransform(), validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(HalfNormal, _instance)
-        return super(HalfNormal, self).expand(batch_shape, _instance=new)
+        return super().expand(batch_shape, _instance=new)
 
     @property
     def scale(self):
diff --git a/torch/distributions/independent.py b/torch/distributions/independent.py
index 67c7fdc4d2d2..48442650ddcb 100644
--- a/torch/distributions/independent.py
+++ b/torch/distributions/independent.py
@@ -48,7 +48,7 @@ def __init__(self, base_distribution, reinterpreted_batch_ndims, validate_args=N
         event_shape = shape[len(shape) - event_dim:]
         self.base_dist = base_distribution
         self.reinterpreted_batch_ndims = reinterpreted_batch_ndims
-        super(Independent, self).__init__(batch_shape, event_shape, validate_args=validate_args)
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Independent, _instance)
diff --git a/torch/distributions/kl.py b/torch/distributions/kl.py
index a4b30289ced3..26d7b47d2f51 100644
--- a/torch/distributions/kl.py
+++ b/torch/distributions/kl.py
@@ -4,7 +4,7 @@
 from typing import Type, Dict, Callable, Tuple
 
 import torch
-from torch._six import inf
+from torch import inf
 
 from .bernoulli import Bernoulli
 from .beta import Beta
@@ -78,7 +78,7 @@ def decorator(fun):
 
 
 @total_ordering
-class _Match(object):
+class _Match:
     __slots__ = ['types']
 
     def __init__(self, *types):
diff --git a/torch/distributions/kumaraswamy.py b/torch/distributions/kumaraswamy.py
index 4802adf0a133..249cdf07b14c 100644
--- a/torch/distributions/kumaraswamy.py
+++ b/torch/distributions/kumaraswamy.py
@@ -1,5 +1,5 @@
 import torch
-from torch._six import nan
+from torch import nan
 from torch.distributions import constraints
 from torch.distributions.uniform import Uniform
 from torch.distributions.transformed_distribution import TransformedDistribution
@@ -47,13 +47,13 @@ def __init__(self, concentration1, concentration0, validate_args=None):
         transforms = [PowerTransform(exponent=self.concentration0.reciprocal()),
                       AffineTransform(loc=1., scale=-1.),
                       PowerTransform(exponent=self.concentration1.reciprocal())]
-        super(Kumaraswamy, self).__init__(base_dist, transforms, validate_args=validate_args)
+        super().__init__(base_dist, transforms, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Kumaraswamy, _instance)
         new.concentration1 = self.concentration1.expand(batch_shape)
         new.concentration0 = self.concentration0.expand(batch_shape)
-        return super(Kumaraswamy, self).expand(batch_shape, _instance=new)
+        return super().expand(batch_shape, _instance=new)
 
     @property
     def mean(self):
diff --git a/torch/distributions/laplace.py b/torch/distributions/laplace.py
index e1dca36aa76a..3dfe968eda35 100644
--- a/torch/distributions/laplace.py
+++ b/torch/distributions/laplace.py
@@ -47,7 +47,7 @@ def __init__(self, loc, scale, validate_args=None):
             batch_shape = torch.Size()
         else:
             batch_shape = self.loc.size()
-        super(Laplace, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Laplace, _instance)
diff --git a/torch/distributions/lkj_cholesky.py b/torch/distributions/lkj_cholesky.py
index d9d7fc3ef067..dbc094adc2b8 100644
--- a/torch/distributions/lkj_cholesky.py
+++ b/torch/distributions/lkj_cholesky.py
@@ -71,7 +71,7 @@ def __init__(self, dim, concentration=1., validate_args=None):
         beta_conc1 = offset + 0.5
         beta_conc0 = marginal_conc.unsqueeze(-1) - 0.5 * offset
         self._beta = Beta(beta_conc1, beta_conc0)
-        super(LKJCholesky, self).__init__(batch_shape, event_shape, validate_args)
+        super().__init__(batch_shape, event_shape, validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(LKJCholesky, _instance)
diff --git a/torch/distributions/log_normal.py b/torch/distributions/log_normal.py
index 278d7d400331..1621b5cc2bd5 100644
--- a/torch/distributions/log_normal.py
+++ b/torch/distributions/log_normal.py
@@ -30,11 +30,11 @@ class LogNormal(TransformedDistribution):
 
     def __init__(self, loc, scale, validate_args=None):
         base_dist = Normal(loc, scale, validate_args=validate_args)
-        super(LogNormal, self).__init__(base_dist, ExpTransform(), validate_args=validate_args)
+        super().__init__(base_dist, ExpTransform(), validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(LogNormal, _instance)
-        return super(LogNormal, self).expand(batch_shape, _instance=new)
+        return super().expand(batch_shape, _instance=new)
 
     @property
     def loc(self):
diff --git a/torch/distributions/logistic_normal.py b/torch/distributions/logistic_normal.py
index 7d8a70649c30..d424f1b14004 100644
--- a/torch/distributions/logistic_normal.py
+++ b/torch/distributions/logistic_normal.py
@@ -36,13 +36,11 @@ def __init__(self, loc, scale, validate_args=None):
         base_dist = Normal(loc, scale, validate_args=validate_args)
         if not base_dist.batch_shape:
             base_dist = base_dist.expand([1])
-        super(LogisticNormal, self).__init__(base_dist,
-                                             StickBreakingTransform(),
-                                             validate_args=validate_args)
+        super().__init__(base_dist, StickBreakingTransform(), validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(LogisticNormal, _instance)
-        return super(LogisticNormal, self).expand(batch_shape, _instance=new)
+        return super().expand(batch_shape, _instance=new)
 
     @property
     def loc(self):
diff --git a/torch/distributions/lowrank_multivariate_normal.py b/torch/distributions/lowrank_multivariate_normal.py
index 921477ac99a4..9d2954baf644 100644
--- a/torch/distributions/lowrank_multivariate_normal.py
+++ b/torch/distributions/lowrank_multivariate_normal.py
@@ -109,8 +109,7 @@ def __init__(self, loc, cov_factor, cov_diag, validate_args=None):
         self._unbroadcasted_cov_factor = cov_factor
         self._unbroadcasted_cov_diag = cov_diag
         self._capacitance_tril = _batch_capacitance_tril(cov_factor, cov_diag)
-        super(LowRankMultivariateNormal, self).__init__(batch_shape, event_shape,
-                                                        validate_args=validate_args)
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(LowRankMultivariateNormal, _instance)
diff --git a/torch/distributions/mixture_same_family.py b/torch/distributions/mixture_same_family.py
index dd0beace1917..d37e706ef004 100644
--- a/torch/distributions/mixture_same_family.py
+++ b/torch/distributions/mixture_same_family.py
@@ -86,9 +86,7 @@ def __init__(self,
 
         event_shape = self._component_distribution.event_shape
         self._event_ndims = len(event_shape)
-        super(MixtureSameFamily, self).__init__(batch_shape=cdbs,
-                                                event_shape=event_shape,
-                                                validate_args=validate_args)
+        super().__init__(batch_shape=cdbs, event_shape=event_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         batch_shape = torch.Size(batch_shape)
diff --git a/torch/distributions/multinomial.py b/torch/distributions/multinomial.py
index 1fc532b2157d..579febb819a5 100644
--- a/torch/distributions/multinomial.py
+++ b/torch/distributions/multinomial.py
@@ -1,5 +1,5 @@
 import torch
-from torch._six import inf
+from torch import inf
 from torch.distributions.binomial import Binomial
 from torch.distributions.distribution import Distribution
 from torch.distributions import Categorical
@@ -65,7 +65,7 @@ def __init__(self, total_count=1, probs=None, logits=None, validate_args=None):
         self._binomial = Binomial(total_count=total_count, probs=self.probs)
         batch_shape = self._categorical.batch_shape
         event_shape = self._categorical.param_shape[-1:]
-        super(Multinomial, self).__init__(batch_shape, event_shape, validate_args=validate_args)
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Multinomial, _instance)
diff --git a/torch/distributions/multivariate_normal.py b/torch/distributions/multivariate_normal.py
index e8c15c32d985..e7cbb740b7f1 100644
--- a/torch/distributions/multivariate_normal.py
+++ b/torch/distributions/multivariate_normal.py
@@ -147,7 +147,7 @@ def __init__(self, loc, covariance_matrix=None, precision_matrix=None, scale_tri
         self.loc = loc.expand(batch_shape + (-1,))
 
         event_shape = self.loc.shape[-1:]
-        super(MultivariateNormal, self).__init__(batch_shape, event_shape, validate_args=validate_args)
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
 
         if scale_tril is not None:
             self._unbroadcasted_scale_tril = scale_tril
diff --git a/torch/distributions/negative_binomial.py b/torch/distributions/negative_binomial.py
index 20d802654e11..36ea72da3749 100644
--- a/torch/distributions/negative_binomial.py
+++ b/torch/distributions/negative_binomial.py
@@ -37,7 +37,7 @@ def __init__(self, total_count, probs=None, logits=None, validate_args=None):
 
         self._param = self.probs if probs is not None else self.logits
         batch_shape = self._param.size()
-        super(NegativeBinomial, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(NegativeBinomial, _instance)
diff --git a/torch/distributions/normal.py b/torch/distributions/normal.py
index 8864816b74fb..39e41d729eeb 100644
--- a/torch/distributions/normal.py
+++ b/torch/distributions/normal.py
@@ -53,7 +53,7 @@ def __init__(self, loc, scale, validate_args=None):
             batch_shape = torch.Size()
         else:
             batch_shape = self.loc.size()
-        super(Normal, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Normal, _instance)
diff --git a/torch/distributions/one_hot_categorical.py b/torch/distributions/one_hot_categorical.py
index ea574079039f..128010c4ce45 100644
--- a/torch/distributions/one_hot_categorical.py
+++ b/torch/distributions/one_hot_categorical.py
@@ -43,7 +43,7 @@ def __init__(self, probs=None, logits=None, validate_args=None):
         self._categorical = Categorical(probs, logits)
         batch_shape = self._categorical.batch_shape
         event_shape = self._categorical.param_shape[-1:]
-        super(OneHotCategorical, self).__init__(batch_shape, event_shape, validate_args=validate_args)
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(OneHotCategorical, _instance)
diff --git a/torch/distributions/pareto.py b/torch/distributions/pareto.py
index 0d28048bb439..f57ccd559c63 100644
--- a/torch/distributions/pareto.py
+++ b/torch/distributions/pareto.py
@@ -27,13 +27,13 @@ def __init__(self, scale, alpha, validate_args=None):
         self.scale, self.alpha = broadcast_all(scale, alpha)
         base_dist = Exponential(self.alpha, validate_args=validate_args)
         transforms = [ExpTransform(), AffineTransform(loc=0, scale=self.scale)]
-        super(Pareto, self).__init__(base_dist, transforms, validate_args=validate_args)
+        super().__init__(base_dist, transforms, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Pareto, _instance)
         new.scale = self.scale.expand(batch_shape)
         new.alpha = self.alpha.expand(batch_shape)
-        return super(Pareto, self).expand(batch_shape, _instance=new)
+        return super().expand(batch_shape, _instance=new)
 
     @property
     def mean(self):
diff --git a/torch/distributions/poisson.py b/torch/distributions/poisson.py
index 63aaa08e5f15..bad1d0548705 100644
--- a/torch/distributions/poisson.py
+++ b/torch/distributions/poisson.py
@@ -47,7 +47,7 @@ def __init__(self, rate, validate_args=None):
             batch_shape = torch.Size()
         else:
             batch_shape = self.rate.size()
-        super(Poisson, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Poisson, _instance)
diff --git a/torch/distributions/relaxed_bernoulli.py b/torch/distributions/relaxed_bernoulli.py
index 500e82991bfb..634c0131ca04 100644
--- a/torch/distributions/relaxed_bernoulli.py
+++ b/torch/distributions/relaxed_bernoulli.py
@@ -46,7 +46,7 @@ def __init__(self, temperature, probs=None, logits=None, validate_args=None):
             batch_shape = torch.Size()
         else:
             batch_shape = self._param.size()
-        super(LogitRelaxedBernoulli, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(LogitRelaxedBernoulli, _instance)
@@ -118,13 +118,11 @@ class RelaxedBernoulli(TransformedDistribution):
 
     def __init__(self, temperature, probs=None, logits=None, validate_args=None):
         base_dist = LogitRelaxedBernoulli(temperature, probs, logits)
-        super(RelaxedBernoulli, self).__init__(base_dist,
-                                               SigmoidTransform(),
-                                               validate_args=validate_args)
+        super().__init__(base_dist, SigmoidTransform(), validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(RelaxedBernoulli, _instance)
-        return super(RelaxedBernoulli, self).expand(batch_shape, _instance=new)
+        return super().expand(batch_shape, _instance=new)
 
     @property
     def temperature(self):
diff --git a/torch/distributions/relaxed_categorical.py b/torch/distributions/relaxed_categorical.py
index 3ea069aad1c5..859078284b33 100644
--- a/torch/distributions/relaxed_categorical.py
+++ b/torch/distributions/relaxed_categorical.py
@@ -40,7 +40,7 @@ def __init__(self, temperature, probs=None, logits=None, validate_args=None):
         self.temperature = temperature
         batch_shape = self._categorical.batch_shape
         event_shape = self._categorical.param_shape[-1:]
-        super(ExpRelaxedCategorical, self).__init__(batch_shape, event_shape, validate_args=validate_args)
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(ExpRelaxedCategorical, _instance)
@@ -112,13 +112,11 @@ class RelaxedOneHotCategorical(TransformedDistribution):
 
     def __init__(self, temperature, probs=None, logits=None, validate_args=None):
         base_dist = ExpRelaxedCategorical(temperature, probs, logits, validate_args=validate_args)
-        super(RelaxedOneHotCategorical, self).__init__(base_dist,
-                                                       ExpTransform(),
-                                                       validate_args=validate_args)
+        super().__init__(base_dist, ExpTransform(), validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(RelaxedOneHotCategorical, _instance)
-        return super(RelaxedOneHotCategorical, self).expand(batch_shape, _instance=new)
+        return super().expand(batch_shape, _instance=new)
 
     @property
     def temperature(self):
diff --git a/torch/distributions/studentT.py b/torch/distributions/studentT.py
index 2699f89b48b8..83b06c668a2f 100644
--- a/torch/distributions/studentT.py
+++ b/torch/distributions/studentT.py
@@ -1,7 +1,7 @@
 import math
 
 import torch
-from torch._six import inf, nan
+from torch import inf, nan
 from torch.distributions import Chi2, constraints
 from torch.distributions.distribution import Distribution
 from torch.distributions.utils import _standard_normal, broadcast_all
@@ -51,7 +51,7 @@ def __init__(self, df, loc=0., scale=1., validate_args=None):
         self.df, self.loc, self.scale = broadcast_all(df, loc, scale)
         self._chi2 = Chi2(self.df)
         batch_shape = self.df.size()
-        super(StudentT, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(StudentT, _instance)
diff --git a/torch/distributions/transformed_distribution.py b/torch/distributions/transformed_distribution.py
index a3bab3e836a3..d31064210d4b 100644
--- a/torch/distributions/transformed_distribution.py
+++ b/torch/distributions/transformed_distribution.py
@@ -80,7 +80,7 @@ def __init__(self, base_distribution, transforms, validate_args=None):
         cut = len(forward_shape) - event_dim
         batch_shape = forward_shape[:cut]
         event_shape = forward_shape[cut:]
-        super(TransformedDistribution, self).__init__(batch_shape, event_shape, validate_args=validate_args)
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(TransformedDistribution, _instance)
diff --git a/torch/distributions/transforms.py b/torch/distributions/transforms.py
index 648d051a525f..06d21548384e 100644
--- a/torch/distributions/transforms.py
+++ b/torch/distributions/transforms.py
@@ -38,7 +38,7 @@
 ]
 
 
-class Transform(object):
+class Transform:
     """
     Abstract class for invertable transformations with computable log
     det jacobians. They are primarily used in
@@ -95,7 +95,7 @@ def __init__(self, cache_size=0):
             self._cached_x_y = None, None
         else:
             raise ValueError('cache_size must be 0 or 1')
-        super(Transform, self).__init__()
+        super().__init__()
 
     def __getstate__(self):
         state = self.__dict__.copy()
@@ -212,7 +212,7 @@ class _InverseTransform(Transform):
     This class is private; please instead use the ``Transform.inv`` property.
     """
     def __init__(self, transform: Transform):
-        super(_InverseTransform, self).__init__(cache_size=transform._cache_size)
+        super().__init__(cache_size=transform._cache_size)
         self._inv: Transform = transform
 
     @constraints.dependent_property(is_discrete=False)
@@ -280,7 +280,7 @@ class ComposeTransform(Transform):
     def __init__(self, parts: List[Transform], cache_size=0):
         if cache_size:
             parts = [part.with_cache(cache_size) for part in parts]
-        super(ComposeTransform, self).__init__(cache_size=cache_size)
+        super().__init__(cache_size=cache_size)
         self.parts = parts
 
     def __eq__(self, other):
@@ -550,7 +550,7 @@ class PowerTransform(Transform):
     sign = +1
 
     def __init__(self, exponent, cache_size=0):
-        super(PowerTransform, self).__init__(cache_size=cache_size)
+        super().__init__(cache_size=cache_size)
         self.exponent, = broadcast_all(exponent)
 
     def with_cache(self, cache_size=1):
@@ -698,7 +698,7 @@ class AffineTransform(Transform):
     bijective = True
 
     def __init__(self, loc, scale, event_dim=0, cache_size=0):
-        super(AffineTransform, self).__init__(cache_size=cache_size)
+        super().__init__(cache_size=cache_size)
         self.loc = loc
         self.scale = scale
         self._event_dim = event_dim
@@ -1012,7 +1012,7 @@ def __init__(self, tseq, dim=0, lengths=None, cache_size=0):
         assert all(isinstance(t, Transform) for t in tseq)
         if cache_size:
             tseq = [t.with_cache(cache_size) for t in tseq]
-        super(CatTransform, self).__init__(cache_size=cache_size)
+        super().__init__(cache_size=cache_size)
         self.transforms = list(tseq)
         if lengths is None:
             lengths = [1] * len(self.transforms)
@@ -1113,7 +1113,7 @@ def __init__(self, tseq, dim=0, cache_size=0):
         assert all(isinstance(t, Transform) for t in tseq)
         if cache_size:
             tseq = [t.with_cache(cache_size) for t in tseq]
-        super(StackTransform, self).__init__(cache_size=cache_size)
+        super().__init__(cache_size=cache_size)
         self.transforms = list(tseq)
         self.dim = dim
 
@@ -1189,7 +1189,7 @@ class CumulativeDistributionTransform(Transform):
     sign = +1
 
     def __init__(self, distribution, cache_size=0):
-        super(CumulativeDistributionTransform, self).__init__(cache_size=cache_size)
+        super().__init__(cache_size=cache_size)
         self.distribution = distribution
 
     @property
diff --git a/torch/distributions/uniform.py b/torch/distributions/uniform.py
index cd29f2aa8d91..cbbd8d1ed28d 100644
--- a/torch/distributions/uniform.py
+++ b/torch/distributions/uniform.py
@@ -1,7 +1,7 @@
 from numbers import Number
 
 import torch
-from torch._six import nan
+from torch import nan
 from torch.distributions import constraints
 from torch.distributions.distribution import Distribution
 from torch.distributions.utils import broadcast_all
@@ -52,7 +52,7 @@ def __init__(self, low, high, validate_args=None):
             batch_shape = torch.Size()
         else:
             batch_shape = self.low.size()
-        super(Uniform, self).__init__(batch_shape, validate_args=validate_args)
+        super().__init__(batch_shape, validate_args=validate_args)
 
         if self._validate_args and not torch.lt(self.low, self.high).all():
             raise ValueError("Uniform is not defined when low>= high")
diff --git a/torch/distributions/von_mises.py b/torch/distributions/von_mises.py
index b10beec5eed7..30457d7de715 100644
--- a/torch/distributions/von_mises.py
+++ b/torch/distributions/von_mises.py
@@ -98,7 +98,7 @@ def __init__(self, loc, concentration, validate_args=None):
         rho = (tau - (2 * tau).sqrt()) / (2 * self.concentration)
         self._proposal_r = (1 + rho ** 2) / (2 * rho)
 
-        super(VonMises, self).__init__(batch_shape, event_shape, validate_args)
+        super().__init__(batch_shape, event_shape, validate_args)
 
     def log_prob(self, value):
         if self._validate_args:
@@ -120,7 +120,7 @@ def sample(self, sample_shape=torch.Size()):
 
     def expand(self, batch_shape):
         try:
-            return super(VonMises, self).expand(batch_shape)
+            return super().expand(batch_shape)
         except NotImplementedError:
             validate_args = self.__dict__.get('_validate_args')
             loc = self.loc.expand(batch_shape)
diff --git a/torch/distributions/weibull.py b/torch/distributions/weibull.py
index 7f0b18037736..6d8b16c448f7 100644
--- a/torch/distributions/weibull.py
+++ b/torch/distributions/weibull.py
@@ -32,9 +32,7 @@ def __init__(self, scale, concentration, validate_args=None):
         base_dist = Exponential(torch.ones_like(self.scale), validate_args=validate_args)
         transforms = [PowerTransform(exponent=self.concentration_reciprocal),
                       AffineTransform(loc=0, scale=self.scale)]
-        super(Weibull, self).__init__(base_dist,
-                                      transforms,
-                                      validate_args=validate_args)
+        super().__init__(base_dist, transforms, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):
         new = self._get_checked_instance(Weibull, _instance)
diff --git a/torch/distributions/wishart.py b/torch/distributions/wishart.py
index 6d31375afac4..0c9c541ad1a6 100644
--- a/torch/distributions/wishart.py
+++ b/torch/distributions/wishart.py
@@ -4,7 +4,7 @@
 from typing import Union
 
 import torch
-from torch._six import nan
+from torch import nan
 from torch.distributions import constraints
 from torch.distributions.exp_family import ExponentialFamily
 from torch.distributions.utils import lazy_property
@@ -106,7 +106,7 @@ def __init__(self,
         if self.df.lt(event_shape[-1]).any():
             warnings.warn("Low df values detected. Singular samples are highly likely to occur for ndim - 1 < df < ndim.")
 
-        super(Wishart, self).__init__(batch_shape, event_shape, validate_args=validate_args)
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
         self._batch_dims = [-(x + 1) for x in range(len(self._batch_shape))]
 
         if scale_tril is not None:
diff --git a/torch/func/__init__.py b/torch/func/__init__.py
index 3ac046356db6..0cfb8008345c 100644
--- a/torch/func/__init__.py
+++ b/torch/func/__init__.py
@@ -7,6 +7,7 @@
     jacfwd,
     hessian,
     functionalize,
+    linearize
 )
 from torch._functorch.functional_call import functional_call, stack_module_state
 from torch._functorch.batch_norm_replacement import replace_all_batch_norm_modules_
diff --git a/torch/functional.py b/torch/functional.py
index c5f0843ac9d7..556a5f77df1d 100644
--- a/torch/functional.py
+++ b/torch/functional.py
@@ -107,7 +107,7 @@ def broadcast_shapes(*shapes):
             if isinstance(shape, int):
                 if max_len < 1:
                     max_len = 1
-            elif isinstance(shape, tuple) or isinstance(shape, list):
+            elif isinstance(shape, (tuple, list)):
                 s = len(shape)
                 if max_len < s:
                     max_len = s
@@ -115,7 +115,7 @@ def broadcast_shapes(*shapes):
         for shape in shapes:
             if isinstance(shape, int):
                 shape = (shape,)
-            if isinstance(shape, tuple) or isinstance(shape, list):
+            if isinstance(shape, (tuple, list)):
                 for i in range(-1, -1 - len(shape), -1):
                     if shape[i] < 0:
                         raise RuntimeError("Trying to create tensor with negative dimension ({}): ({})"
diff --git a/torch/fx/_pytree.py b/torch/fx/_pytree.py
index 9d9102cc7044..faff3961a686 100644
--- a/torch/fx/_pytree.py
+++ b/torch/fx/_pytree.py
@@ -25,7 +25,7 @@ def tree_flatten_spec(pytree: PyTree, spec: TreeSpec) -> List[Any]:
     return result
 
 def _dict_flatten_spec(d: Dict[Any, Any], spec: TreeSpec) -> List[Any]:
-    return list([d[k] for k in spec.context])
+    return [d[k] for k in spec.context]
 
 def _list_flatten_spec(d: List[Any], spec: TreeSpec) -> List[Any]:
     return [d[i] for i in range(len(spec.children_specs))]
diff --git a/torch/fx/_symbolic_trace.py b/torch/fx/_symbolic_trace.py
index 1d30ce332dba..a88dc3e90adc 100644
--- a/torch/fx/_symbolic_trace.py
+++ b/torch/fx/_symbolic_trace.py
@@ -119,7 +119,29 @@ def _patch_function(fn: FunctionType, nargs: int) -> FunctionType:
     co = fn.__code__
     co_flags = co.co_flags & ~HAS_VARSTUFF
     co_args: tuple
-    if hasattr(co, "co_posonlyargcount"):
+    if hasattr(co, "co_qualname"):
+        # Python-3.11+ code signature
+        co_args = (
+            nargs,
+            0,
+            0,
+            co.co_nlocals,
+            co.co_stacksize,
+            co_flags,
+            co.co_code,
+            co.co_consts,
+            co.co_names,
+            co.co_varnames,
+            co.co_filename,
+            co.co_name,
+            co.co_qualname,  # type: ignore[attr-defined]
+            co.co_firstlineno,
+            co.co_lnotab,
+            co.co_exceptiontable,  # type: ignore[attr-defined]
+            co.co_freevars,
+            co.co_cellvars,
+        )
+    elif hasattr(co, "co_posonlyargcount"):
         co_args = (
             nargs,
             0,
@@ -167,7 +189,7 @@ def _patch_function(fn: FunctionType, nargs: int) -> FunctionType:
 
 
 @compatibility(is_backward_compatible=False)
-class PHBase(object):
+class PHBase:
     """
     Object representing an input placeholder to `concrete_args`
     """
@@ -242,7 +264,7 @@ def __init__(
             for name, value in chain(*[m.__dict__.items() for m in autowrap_modules])
             if not name.startswith("_") and callable(value)
         }
-        self._autowrap_function_ids.update(set([id(f) for f in autowrap_functions]))
+        self._autowrap_function_ids.update({id(f) for f in autowrap_functions})
 
         # Python modules to apply autowrap to at the start, in addition to
         # modules we see while tracing
@@ -875,9 +897,9 @@ def revert(self):
         setattr(self.frame_dict, self.fn_name, self.orig_fn)
 
 
-class _Patcher(object):
+class _Patcher:
     def __init__(self):
-        super(_Patcher, self).__init__()
+        super().__init__()
         self.patches_made: List[_PatchedFn] = []
         self.visited: Set[int] = set()
 
@@ -1054,7 +1076,7 @@ def f(a, b):
 
     FX can typically not trace through this due to the presence of control
     flow. However, we can use `concrete_args` to specialize on the value of
-    `b` to trace through this.
+    `b` to trace through this::
 
         f = fx.symbolic_trace(f, concrete_args={'b': False})
         assert f(3, False)  == 6
diff --git a/torch/fx/experimental/accelerator_partitioner.py b/torch/fx/experimental/accelerator_partitioner.py
index cd2267c701fb..3b5d5afe0f20 100644
--- a/torch/fx/experimental/accelerator_partitioner.py
+++ b/torch/fx/experimental/accelerator_partitioner.py
@@ -275,7 +275,7 @@ def check_dependency(partition):
     """Given a partition,check if there is a circular dependency on
     this partition using bfs
     """
-    visited: Set[Partition] = set([partition])
+    visited: Set[Partition] = {partition}
     queue: Deque[Partition] = deque([partition])
     while queue:
         p = queue.popleft()
diff --git a/torch/fx/experimental/const_fold.py b/torch/fx/experimental/const_fold.py
index a96980302978..8d95ffc5655d 100644
--- a/torch/fx/experimental/const_fold.py
+++ b/torch/fx/experimental/const_fold.py
@@ -6,6 +6,8 @@
 from torch.fx.passes.split_module import split_module
 
 
+__all__ = ['FoldedGraphModule', 'get_unique_attr_name_in_module', 'split_const_subgraphs']
+
 class FoldedGraphModule(torch.fx.GraphModule):
     """
     FoldedGraphModule is a GraphModule which also contains another
diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py
index 7ffabc9c6996..f1c7428ce609 100644
--- a/torch/fx/experimental/graph_gradual_typechecker.py
+++ b/torch/fx/experimental/graph_gradual_typechecker.py
@@ -11,12 +11,7 @@
 
 from torch.fx.experimental.unification import Var  # type: ignore[attr-defined]
 
-
-try:
-    import sympy  # type: ignore[import]
-    HAS_SYMPY = True
-except ImportError:
-    HAS_SYMPY = False
+import sympy
 
 _INFERENCE_RULES: Dict[Target, Callable] = {}
 _REFINEMENT_RULES: Dict[Target, Callable] = {}
@@ -305,7 +300,7 @@ def calculate_out_dimension(d_in, module_instance, index):
     dilation = (module_instance.dilation, module_instance.dilation) \
         if isinstance(module_instance.dilation, int) else module_instance.dilation
 
-    DIMENSION_TYPES = (int, sympy.Symbol) if HAS_SYMPY else (int,)
+    DIMENSION_TYPES = (int, sympy.Symbol)
 
     if d_in == Dyn:
         return Dyn
@@ -814,18 +809,15 @@ def convert_to_sympy_symbols(self, typ):
         """
         Replace all unknown types with fresh type variables.
         """
-        if HAS_SYMPY:
-            if isinstance(typ, Var):
-                return sympy.symbols(str(typ))
-            elif isinstance(typ, TensorType):
-                new_args = [self.convert_to_sympy_symbols(a) for a in typ.__args__]
-                return TensorType(tuple(new_args))
-            elif isinstance(typ, list):
-                return [self.convert_to_sympy_symbols(t) for t in typ]
-            elif isinstance(typ, tuple):
-                return (self.convert_to_sympy_symbols(t) for t in typ)
-            else:
-                return typ
+        if isinstance(typ, Var):
+            return sympy.symbols(str(typ))
+        elif isinstance(typ, TensorType):
+            new_args = [self.convert_to_sympy_symbols(a) for a in typ.__args__]
+            return TensorType(tuple(new_args))
+        elif isinstance(typ, list):
+            return [self.convert_to_sympy_symbols(t) for t in typ]
+        elif isinstance(typ, tuple):
+            return (self.convert_to_sympy_symbols(t) for t in typ)
         else:
             return typ
 
@@ -865,29 +857,26 @@ def get_node_type(a):
             pass
 
     def infer_symbolic_relations(self, n: Node):
-        if HAS_SYMPY:
-            n.type = self.convert_to_sympy_symbols(n.type)
-            if n.op == 'call_function':
-                if n.target in _RULES:
-                    return _RULES[n.target](n)
-                else:
-                    pass
-
-            if n.op == 'call_module':
-                module_instance = self.traced.get_submodule(n.target)
-                if type(module_instance) in _RULES:
-                    return _RULES[type(module_instance)](n, module_instance)
-                else:
-                    pass
-
-            if n.op == 'output':
-                def get_node_type(a):
-                    return a.type
-                n.type = torch.fx.node.map_arg(n.args[0], get_node_type)
-                return n.type
+        n.type = self.convert_to_sympy_symbols(n.type)
+        if n.op == 'call_function':
+            if n.target in _RULES:
+                return _RULES[n.target](n)
+            else:
+                pass
 
+        if n.op == 'call_module':
+            module_instance = self.traced.get_submodule(n.target)
+            if type(module_instance) in _RULES:
+                return _RULES[type(module_instance)](n, module_instance)
             else:
                 pass
+
+        if n.op == 'output':
+            def get_node_type(a):
+                return a.type
+            n.type = torch.fx.node.map_arg(n.args[0], get_node_type)
+            return n.type
+
         else:
             pass
 
diff --git a/torch/fx/experimental/migrate_gradual_types/constraint.py b/torch/fx/experimental/migrate_gradual_types/constraint.py
index b96c1b96636d..bb5c6e8c7fc5 100644
--- a/torch/fx/experimental/migrate_gradual_types/constraint.py
+++ b/torch/fx/experimental/migrate_gradual_types/constraint.py
@@ -115,8 +115,8 @@ class BinConstraintT(BinaryConstraint):
     Binary constraints about tensors
     """
     def __init__(self, lhs, rhs, op):
-        assert (isinstance(lhs, TVar) or isinstance(lhs, TensorType) or isinstance(lhs, int) or lhs == Dyn) and \
-               (isinstance(rhs, TVar) or isinstance(rhs, TensorType) or isinstance(rhs, int) or rhs == Dyn)
+        assert (isinstance(lhs, (TVar, TensorType, int)) or lhs == Dyn) and \
+               (isinstance(rhs, (TVar, TensorType, int)) or rhs == Dyn)
         super().__init__(lhs, rhs, op)
 
     def __eq__(self, other):
@@ -552,7 +552,7 @@ def is_bool_expr(constraint):
     if isinstance(constraint, BinConstraintD):
         return constraint.op in [op_gt, op_lt, op_neq, op_eq]
     else:
-        return isinstance(constraint, BVar) or isinstance(constraint, Conj) or isinstance(constraint, Disj)
+        return isinstance(constraint, (BVar, Conj, Disj))
 
 def is_dim(d):
-    return isinstance(d, DVar) or isinstance(d, int) or d == Dyn
+    return isinstance(d, (DVar, int)) or d == Dyn
diff --git a/torch/fx/experimental/migrate_gradual_types/constraint_generator.py b/torch/fx/experimental/migrate_gradual_types/constraint_generator.py
index 10004cab4515..1dc274bfc620 100644
--- a/torch/fx/experimental/migrate_gradual_types/constraint_generator.py
+++ b/torch/fx/experimental/migrate_gradual_types/constraint_generator.py
@@ -151,7 +151,7 @@ def expand_inference_rule(n: Node, symbols, constraints, counter):
 
     e2_nat_constraints = []
     for arg in n.args[1:]:
-        assert isinstance(arg, Node) or isinstance(arg, int)
+        assert isinstance(arg, (Node, int))
         if isinstance(arg, Node):
             assert isinstance(symbols[arg], DVar)
             e2_nat_constraints.append(BinConstraintD(0, symbols[arg], op_leq))
diff --git a/torch/fx/experimental/migrate_gradual_types/constraint_transformation.py b/torch/fx/experimental/migrate_gradual_types/constraint_transformation.py
index 120541d27bae..1d5224b6b1c9 100644
--- a/torch/fx/experimental/migrate_gradual_types/constraint_transformation.py
+++ b/torch/fx/experimental/migrate_gradual_types/constraint_transformation.py
@@ -604,7 +604,7 @@ def calc_last_two_dims(constraint, d: List[DVar]):
 
     """
 
-    assert isinstance(constraint, CalcConv) or isinstance(constraint, CalcMaxPool)
+    assert isinstance(constraint, (CalcConv, CalcMaxPool))
 
     b3 = constraint.matching_constraint[2]
     b4 = constraint.matching_constraint[3]
diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py
index e3f6903b3ecd..48696c1a086a 100644
--- a/torch/fx/experimental/proxy_tensor.py
+++ b/torch/fx/experimental/proxy_tensor.py
@@ -18,6 +18,7 @@
 from dataclasses import dataclass
 import weakref
 import operator
+from torch.utils._stats import count
 
 from torch.utils._python_dispatch import TorchDispatchMode, _pop_mode_temporarily, _get_current_dispatch_mode
 from torch._subclasses import FakeTensor
@@ -121,7 +122,7 @@ def set_meta(proxy, val):
         proxy.node.meta['tensor_meta'] = _extract_tensor_metadata(val)
     elif isinstance(val, py_sym_types):
         proxy.node.meta['val'] = val
-    elif isinstance(val, list) or isinstance(val, tuple):
+    elif isinstance(val, (list, tuple)):
         if all(isinstance(x, FakeTensor) for x in val):
             proxy.node.meta['val'] = [snapshot_fake(x) for x in val]
     elif isinstance(val, torch.Tensor):
@@ -190,7 +191,7 @@ def get_constant(idx):
     # Unfortunately, tree_map cannot directly be used here. As the resulting
     # object may be a proxy that represents a tuple, we may need to
     # explicitly unwrap the proxy by simulating the flattening operations.
-    if isinstance(inner_res, tuple) or isinstance(inner_res, list):
+    if isinstance(inner_res, (tuple, list)):
         if isinstance(proxy_res, fx.Proxy):
             set_meta(proxy_res, inner_res)
         for idx, e in enumerate(inner_res):
@@ -274,12 +275,15 @@ def can_handle_tensor(x):
             )
             with maybe_disable_fake_tensor_mode():
                 return func(*const_args, **const_kwargs)
-        # For symbolic tracing, we return a SymInt/SymFloat and try to
-        # get further in the trace
-        if proxy_mode.tracing_mode != "symbolic":
+        # If any of the Tensor inputs are "real" (not FakeTensor), we may
+        # incorrectly burn in constants by allowing this access.  Raise
+        # an error in this case
+        if pytree.tree_all_only(torch.Tensor, lambda t: not isinstance(t, FakeTensor), (args, kwargs)):
             raise RuntimeError(
                 f"It appears that you're trying to get value out of a tracing tensor with {func} - erroring out! "
-                "It's likely that this is caused by data-dependent control flow or similar."
+                "It's likely that this is caused by data-dependent control flow or similar.  "
+                "It may be possible to trace this with dynamic shapes; try setting tracing_mode='symbolic' "
+                "in your make_fx call."
             )
     proxy_args, proxy_kwargs = pytree.tree_map_only(
         (SymInt, SymFloat, SymBool),
@@ -477,6 +481,7 @@ def __init__(self, tracer, tracing_mode):
         self.trace_state = {}
         self._managers = []
 
+    @count
     def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         with self.sym_mode.enable(False):
             return self.inner_torch_dispatch(func, types, args, kwargs)
@@ -556,11 +561,16 @@ def __sym_dispatch__(self, func, types, args, kwargs):
         # We also assume there are no keyword arguments.
         assert not kwargs
         out = func(*args, **kwargs)
-        assert isinstance(out, py_sym_types), f"{func}(*{args}, **{kwargs}) = {out}"
 
-        # Delays tracing out the proxies on this op until we actually need it
-        p_out_thunk = thunkify(self._compute_proxy, func=func, args=args, out=out)
-        set_proxy_slot(out.node, self.tracer, p_out_thunk)
+        # If func returned a constant, we don't need to trace; we have
+        # determined that the result is constant (no matter if the inputs
+        # were symbolic) and it is no longer necessary to trace the
+        # computation.  This could occur if func triggered some guards.
+        if isinstance(out, py_sym_types):
+            # Delays tracing out the proxies on this op until we actually need it
+            p_out_thunk = thunkify(self._compute_proxy, func=func, args=args, out=out)
+            set_proxy_slot(out.node, self.tracer, p_out_thunk)
+
         return out
 
 
@@ -695,8 +705,12 @@ def wrap_fake(x):
 
         # We disable the autocast cache as the autocast cache causes type conversions on parameters to
         # check a cache, which introduces untracked tensors into the graph
+        #
+        # We also disable tracing by any other tensor proxy-based tracers except the current. The
+        # purpose of `make_fx` is to produce graphmodules as a side effect; its internal execution is
+        # thus irrelevant to any external functional trace.
         with decompose(decomposition_table), fake_tensor_mode, python_dispatcher_mode, \
-             sym_mode, proxy_mode, disable_autocast_cache():  # type: ignore[attr-defined]
+             sym_mode, proxy_mode, disable_autocast_cache(), disable_proxy_modes_tracing(enable_current=True):
             t = dispatch_trace(wrap_key(func, args, fx_tracer), tracer=fx_tracer, concrete_args=tuple(phs))
 
         # TODO: kind of a bad way to do it, should maybe figure out a better way
@@ -719,18 +733,21 @@ def get_innermost_proxy_mode():
 
 
 @contextlib.contextmanager
-def disable_proxy_modes_tracing():
-    # TODO: This probably doesn't correctly also disable ProxySymDispatchMode
+def disable_proxy_modes_tracing(enable_current=False):
     modes = get_torch_dispatch_modes()
     proxy_tensor_modes = [m for m in modes if isinstance(m, ProxyTorchDispatchMode)]
-    olds = [m.enable_tracing for m in proxy_tensor_modes]
+    if enable_current:
+        proxy_tensor_modes = proxy_tensor_modes[:-1]
+    olds = [(m.enable_tracing, m.sym_mode.enable_tracing) for m in proxy_tensor_modes]
     for proxy_mode in proxy_tensor_modes:
         proxy_mode.enable_tracing = False
+        proxy_mode.sym_mode.enable_tracing = False
     try:
         yield
     finally:
-        for proxy_mode, old in zip(proxy_tensor_modes, olds):
+        for proxy_mode, (old, old_sym) in zip(proxy_tensor_modes, olds):
             proxy_mode.enable_tracing = old
+            proxy_mode.sym_mode.enable_tracing = old_sym
 
 
 def get_isolated_graphmodule(func, args, kwargs, tracing_mode="real"):
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index f82ffdb84563..f9ad531dbfbf 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -1,6 +1,7 @@
 import torch
 from typing import Set, Dict, List, Type, Optional, cast, Union
 import sys
+import builtins
 import itertools
 import operator
 import math
@@ -14,26 +15,28 @@
 import logging
 
 # NB: The sym_* functions are used via getattr() and must be imported here.
-from torch import SymInt, SymFloat, SymBool, sym_not, sym_float, sym_int, sym_max, sym_min  # noqa: F401
+from torch import SymInt, SymFloat, SymBool, sym_not, sym_float, sym_max, sym_min  # noqa: F401
 from torch._guards import ShapeGuard, Source
+from torch.utils._sympy.value_ranges import ValueRanges, ValueRangeAnalysis
+from torch.utils._sympy.interp import sympy_interp
 
 SymTypes = (SymInt, SymFloat, SymBool)
 
 log = logging.getLogger(__name__)
 
-try:
-    import sympy  # type: ignore[import]
-    from sympy.printing.precedence import precedence  # type: ignore[import] # noqa: F401
-    from sympy.printing.str import StrPrinter  # type: ignore[import]
-    HAS_SYMPY = True
-except ImportError:
-    HAS_SYMPY = False
+class GuardOnDataDependentSymNode(RuntimeError):
+    pass
+
+import sympy
+from sympy.printing.str import StrPrinter
+from sympy.core.logic import fuzzy_and, fuzzy_or
 
 aten = torch._ops.ops.aten  # type: ignore[has-type]
 
 __all__ = [
     "has_symbolic_sizes_strides", "create_contiguous", "ShapeEnv",
-    "SymDispatchMode", "FloorDiv", "guard_int", "wrap_node",
+    "SymDispatchMode", "FloorDiv", "guard_int", "guard_float", "guard_scalar", "wrap_node",
+    "method_to_operator", "hint_int", "SYMPY_INTERP",
 ]
 
 SYM_FUNCTION_MODE = None
@@ -99,10 +102,161 @@ def _handle_sym_dispatch(func, args, kwargs):
     finally:
         SYM_FUNCTION_MODE = mode
 
+def hint_int(a):
+    if isinstance(a, torch.SymInt):
+        return a.node.require_hint()
+    assert type(a) is int, a
+    return a
+
+def has_hint(a):
+    if isinstance(a, SymTypes):
+        return a.node.has_hint()
+    return True
+
+# Returns True if every size dim on the tensor has a hint
+# TODO: Should this include strides too?  For now it doesn't matter,
+# that's quite an obscure case
+def tensor_has_hints(t):
+    return all(has_hint(s) for s in t.size())
+
+def definitely_true(a):
+    """
+    Returns True only if we can tell that a is True, possibly introducing
+    a guard in the process.  If a depends on some unbacked SymInt, we may
+    return False even though there may exist a possible value of the SymInt
+    that would cause the expression to return True.
+
+    When is it appropriate to use definitely_true?  First, if you can use
+    a higher level combinator like parallel_or/parallel_and, prefer using
+    those instead, they are definitely safe (modulo short-circuiting).
+    Second, it can be used if the program would behave equivalently if
+    definitely_true always returned False (parallel_or/parallel_and are
+    examples of this pattern, modulo short-circuiting).  Finally, it even
+    be OK if the program wouldn't behave equivalently, so long as the
+    change is semantics preserving.  It can be semantics preserving if
+    the program errors in more cases than it did previously (but otherwise
+    behaves identically), or if it changes some quantity in a way that
+    doesn't matter (e.g., strides often fall in this bucket.)
+    """
+    if isinstance(a, SymBool):
+        if a.node.has_hint():
+            return guard_bool(a)
+        else:
+            return False
+    return bool(a)
+
+def definitely_false(a):
+    """
+    Returns True only if we can tell that a is False, possibly introducing
+    a guard in the process.  If a depends on some unbacked SymInt, we may
+    return False even though there may exist a possible value of the SymInt
+    that would cause the expression a to be False.  See definitely_true
+    for more usage guidance.
+    """
+    if isinstance(a, SymBool):
+        if a.node.has_hint():
+            return not guard_bool(a)
+        else:
+            return False
+    return not bool(a)
+
+# TODO: could improve parallel_or/parallel_and by avoiding guards
+# if there exists a quantity that can be handled un-guardedly.  However,
+# for backed SymInts, avoiding guards doesn't really matter in practice,
+# so I chose not to do it.
+
+def parallel_or(*args):
+    """
+    Evaluate the logical OR of several arguments, avoiding guarding on
+    unbacked SymInts if another argument is definitely True.
+    """
+    if any(definitely_true(args) for a in args):
+        return True
+    return any(args)
+
+def parallel_and(*args):
+    """
+    Evaluate the logical FALSE of several arguments, avoiding guarding on
+    unbacked SymInts if another argument is definitely False.
+    """
+    if any(definitely_false(args) for a in args):
+        return False
+    return all(args)
+
+def guard_scalar(a):
+    if isinstance(a, (SymBool, bool)):
+        return guard_bool(a)
+    elif isinstance(a, (SymInt, int)):
+        return guard_int(a)
+    elif isinstance(a, (SymFloat, float)):
+        return guard_float(a)
+    else:
+        raise AssertionError(f"unrecognized scalar {a}")
+
+# inclusive both ways
+def constrain_range(a, *, min: Optional[int], max: Optional[int] = None):
+    if min is None:
+        min = -sympy.oo
+    if max is None:
+        max = sympy.oo
+    if not isinstance(a, SymInt):
+        assert min <= a <= max
+        return
+    if isinstance(a.node.expr, sympy.Integer):
+        assert min <= int(a.node.expr) <= max
+        return
+    # TODO: Turn this into a runtime assert too
+    assert isinstance(a.node.expr, sympy.Symbol), "constraining non-Symbols NYI"
+    r = a.node.shape_env.var_to_range[a.node.expr]
+    a.node.shape_env.var_to_range[a.node.expr] = ValueRanges(
+        builtins.max(r.lower, min), builtins.min(r.upper, max)
+    )
+
+
+def constrain_unify(a, b):
+    """
+    Given two SymInts, constrain them so that they must be equal.  NB:
+    this will not work with SymInts that represent nontrivial expressions
+    (yet!)
+    """
+    # TODO: Maybe dedupe this with _maybe_guard_eq?
+    if not isinstance(a, SymInt):
+        if not isinstance(b, SymInt):
+            assert a == b
+        else:
+            assert isinstance(b.node.expr, sympy.Symbol), "constraining non-Symbols NYI"
+            shape_env = b.node.shape_env
+            shape_env.replacements[b.node.expr] = sympy.Integer(a)
+    else:
+        # TODO: Actually, we can support this as long as one of them is a symbol.
+        # NB: We can't actually do "unification" as our operators are not
+        # injective
+        assert isinstance(a.node.expr, sympy.Symbol), "constraining non-Symbols NYI"
+        shape_env = a.node.shape_env
+        if not isinstance(b, SymInt):
+            shape_env.replacements[a.node.expr] = sympy.Integer(b)
+        else:
+            assert a.node.shape_env is b.node.shape_env
+            assert isinstance(b.node.expr, sympy.Symbol), "constraining non-Symbols NYI"
+            new_var = shape_env._find(a.node.expr)
+            shape_env.replacements[b.node.expr] = new_var
+
+def guard_bool(a):
+    if isinstance(a, SymBool):
+        return a.node.guard_bool("", 0)  # NB: uses Python backtrace
+    assert type(a) is bool, a
+    return a
+
 def guard_int(a):
     if isinstance(a, SymInt):
         return a.node.guard_int("", 0)  # NB: uses Python backtrace
-    assert type(a) is int
+    assert type(a) is int, a
+    return a
+
+def guard_float(a):
+    if isinstance(a, SymFloat):
+        return a.node.guard_float("", 0)  # NB: uses Python backtrace
+    assert isinstance(a, float), a
     return a
 
 # Drop in replacement for math.sqrt
@@ -129,6 +283,9 @@ def to_node(self, num):
 def fx_placeholder_vals(gm):
     return [n.meta['val'] for n in gm.graph.nodes if n.op == "placeholder"]
 
+def fx_placeholder_targets(gm):
+    return [n.target for n in gm.graph.nodes if n.op == "placeholder"]
+
 # Given a GraphModule and arguments to run it with, evaluate that the guards
 # for its associated ShapeEnv are satisfied by the passed arguments.  This
 # WILL check for duck sizing.
@@ -146,17 +303,76 @@ class SymNode:
     This is a type erased SymInt/SymFloat which we use to do actual operations.
     End users don't touch this.  Magic methods are NOT defined on this object.
     """
-    def __init__(self, expr, shape_env, pytype, constant=None):
+    def __init__(self, expr, shape_env, pytype, hint: Optional[Union[int, float]], constant=None):
         self._expr = expr
         self.shape_env = shape_env
         self.pytype = pytype
-        self.constant = constant
+        # What's the difference between hint and constant?
+        #
+        # - A constant is known to be invariant across invocations of the model;
+        #   it will always be this value.  We only really know this when we
+        #   encounter an honest-to-goodness literal (when wrapping it into
+        #   a SymNode, we set constant.)  Most of the time, constant is None
+        #
+        # - A hint is a *particular* value from the particular run we are
+        #   tracing, but it may vary the next time around.  It's useful to
+        #   keep this around, as if we need a concrete value from a SymNode,
+        #   we will return the hint and guard on the expression that produced
+        #   it giving the same hint next time around.  The hint is not
+        #   guaranteed to be set either: if you have an unbacked SymNode,
+        #   there won't be any hint; it was the result of some tensor-dependent
+        #   computation, but we don't know what it actually is because we
+        #   haven't actually run the tensor computation.
+        #
+        # hint_expr is only set if we don't have a hint.  When it is set, it
+        # contains the expression which contains the unbacked symnodes that,
+        # if constrained, would allow this expression to be hinted again.
+        if hint is None:
+            self._hint_expr = self.expr.xreplace(shape_env.var_to_val)
+            self._hint = None
+            self._update_hint()  # check if the replacement actually was enough
+        else:
+            self._hint_expr = None
+            self._hint = hint
+        self.constant: Optional[Union[int, float, bool]] = constant
 
     @property
     def expr(self):
         self._update_expr()
         return self._expr
 
+    # Check if we have replacements hint_expr that would allow us to
+    # simplify it into a hint
+    def _update_hint(self):
+        if self._hint_expr.free_symbols <= self.shape_env.replacements.keys():
+            new_hint = self.shape_env.replace(self._hint_expr)
+            # NB: unification constraints could result in a replacement that
+            # doesn't actually solve the hint!  Check for this.
+            if new_hint.free_symbols:
+                self._hint_expr = new_hint
+                return
+            self._hint = self.pytype(new_hint)
+            self._hint_expr = None
+
+    @property
+    def hint(self):
+        if self._hint is None:
+            self._update_hint()
+        return self._hint
+
+    def has_hint(self):
+        return self._hint is not None
+
+    def require_hint(self):
+        if self._hint is None:
+            self._update_hint()
+            if self._hint is None:
+                raise self.shape_env._make_data_dependent_error(self._hint_expr, self.expr)
+            else:
+                return self._hint
+        else:
+            return self._hint
+
     def _update_expr(self):
         self._expr = self.shape_env.replace(self._expr)
 
@@ -171,15 +387,15 @@ def is_bool(self):
 
     def wrap_int(self, num):
         assert type(num) is int
-        return SymNode(sympy.Integer(num), self.shape_env, int, constant=num)
+        return SymNode(sympy.Integer(num), self.shape_env, int, num, constant=num)
 
     def wrap_float(self, num):
         assert type(num) is float
-        return SymNode(sympy.Float(num), self.shape_env, float, constant=num)
+        return SymNode(sympy.Float(num), self.shape_env, float, num, constant=num)
 
     def wrap_bool(self, num):
         assert type(num) is bool
-        return SymNode(sympy.true if num else sympy.false, self.shape_env, bool, constant=num)
+        return SymNode(sympy.true if num else sympy.false, self.shape_env, bool, num, constant=num)
 
     def clone(self):
         return self
@@ -193,55 +409,169 @@ def __str__(self):
     def __repr__(self):
         return self.str()
 
-    # These methods are metaprogrammed in below
-    def sym_int(self) -> "SymNode":  # noqa: F811
-        raise AssertionError("should have been overridden")
+    # These methods call the metaprogrammed methods, they're hand written
+    # here so we get good stack traces
+    def add(self, other) -> "SymNode":  # noqa: F811
+        return self._add(other)  # type: ignore[attr-defined]
 
-    def sym_float(self) -> "SymNode":  # noqa: F811
-        raise AssertionError("should have been overridden")
+    def sub(self, other) -> "SymNode":  # noqa: F811
+        return self._sub(other)  # type: ignore[attr-defined]
 
-    def or_(self, other) -> "SymNode":  # noqa: F811
-        raise AssertionError("should have been overridden")
+    def mul(self, other) -> "SymNode":  # noqa: F811
+        return self._mul(other)  # type: ignore[attr-defined]
+
+    def mod(self, other) -> "SymNode":  # noqa: F811
+        return self._mod(other)  # type: ignore[attr-defined]
+
+    def pow(self, other) -> "SymNode":  # noqa: F811
+        return self._pow(other)  # type: ignore[attr-defined]
 
     def and_(self, other) -> "SymNode":  # noqa: F811
-        raise AssertionError("should have been overridden")
+        return self._and_(other)  # type: ignore[attr-defined]
+
+    def or_(self, other) -> "SymNode":  # noqa: F811
+        return self._or_(other)  # type: ignore[attr-defined]
+
+    def truediv(self, other) -> "SymNode":  # noqa: F811
+        return self._truediv(other)  # type: ignore[attr-defined]
+
+    def floordiv(self, other) -> "SymNode":  # noqa: F811
+        return self._floordiv(other)  # type: ignore[attr-defined]
+
+    def sym_not(self) -> "SymNode":  # noqa: F811
+        return self._sym_not()  # type: ignore[attr-defined]
+
+    def eq(self, other) -> "SymNode":  # noqa: F811
+        return self._eq(other)  # type: ignore[attr-defined]
+
+    def ne(self, other) -> "SymNode":  # noqa: F811
+        return self._ne(other)  # type: ignore[attr-defined]
+
+    def gt(self, other) -> "SymNode":  # noqa: F811
+        return self._gt(other)  # type: ignore[attr-defined]
+
+    def lt(self, other) -> "SymNode":  # noqa: F811
+        return self._lt(other)  # type: ignore[attr-defined]
+
+    def le(self, other) -> "SymNode":  # noqa: F811
+        return self._le(other)  # type: ignore[attr-defined]
+
+    def ge(self, other) -> "SymNode":  # noqa: F811
+        return self._ge(other)  # type: ignore[attr-defined]
+
+    def floor(self) -> "SymNode":  # noqa: F811
+        return self._floor()  # type: ignore[attr-defined]
+
+    def sym_float(self) -> "SymNode":  # noqa: F811
+        return self._sym_float()  # type: ignore[attr-defined]
+
+    def sym_int(self) -> "SymNode":  # noqa: F811
+        return self._sym_int()  # type: ignore[attr-defined]
+
+    def ceil(self) -> "SymNode":  # noqa: F811
+        return self._ceil()  # type: ignore[attr-defined]
+
+    def neg(self) -> "SymNode":  # noqa: F811
+        return self._neg()  # type: ignore[attr-defined]
+
+    def sym_min(self, other) -> "SymNode":  # noqa: F811
+        return self._sym_min(other)  # type: ignore[attr-defined]
+
+    def sym_max(self, other) -> "SymNode":  # noqa: F811
+        return self._sym_max(other)  # type: ignore[attr-defined]
+
+    def sym_sqrt(self) -> "SymNode":  # noqa: F811
+        return self._sym_sqrt()  # type: ignore[attr-defined]
+
+    def is_contiguous(self, sizes, strides) -> "SymNode":  # noqa: F811
+        return self._is_contiguous(sizes, strides)  # type: ignore[attr-defined]
+
+    def is_channels_last_contiguous_2d(self, sizes, strides) -> "SymNode":  # noqa: F811
+        return self._is_channels_last_contiguous_2d(sizes, strides)  # type: ignore[attr-defined]
+
+    def is_channels_last_contiguous_3d(self, sizes, strides) -> "SymNode":  # noqa: F811
+        return self._is_channels_last_contiguous_3d(sizes, strides)  # type: ignore[attr-defined]
+
+    def is_channels_last_strides_2d(self, sizes, strides) -> "SymNode":  # noqa: F811
+        return self._is_channels_last_strides_2d(sizes, strides)  # type: ignore[attr-defined]
+
+    def is_channels_last_strides_3d(self, sizes, strides) -> "SymNode":  # noqa: F811
+        return self._is_channels_last_strides_3d(sizes, strides)  # type: ignore[attr-defined]
+
+    def is_non_overlapping_and_dense_indicator(self, sizes, strides) -> "SymNode":  # noqa: F811
+        return self._is_non_overlapping_and_dense_indicator(sizes, strides)  # type: ignore[attr-defined]
 
     # Make C++ happy
-    def sym_or(self, other):
+    def sym_or(self, other):  # noqa: F811
         return self.or_(other)
 
-    def sym_and(self, other):
+    def sym_and(self, other):  # noqa: F811
         return self.and_(other)
 
-    # Today we error on calling int on a symbolic shape, as this is a very accessible footgun.
+    def is_non_overlapping_and_dense(self, sizes, strides):
+        return self.is_non_overlapping_and_dense_indicator(sizes, strides).eq(to_node(self, 1))  # type: ignore[attr-defined]
+
     def int_(self):
-        if len(self.expr.free_symbols) == 0:
-            return int(self.expr)
-        raise RuntimeError("Trying to extract a concrete int out of a symbolic int")
+        return self.guard_int("", 0)  # NB: uses Python backtrace
 
     # You can manually trigger a guard with this function
     def guard_int(self, file, line):
         # TODO: use the file/line for some useful diagnostic on why a
         # guard occurred
-        return int(self.shape_env.evaluate_expr(self.expr))
+        r = self.shape_env.evaluate_expr(self.expr, self.hint)
+        try:
+            return int(r)
+        except Exception:
+            log.warning(f"Failed to convert to int: {r}")
+            raise
 
     def guard_float(self, file, line):
         # TODO: use the file/line for some useful diagnostic on why a
         # guard occurred
-        return float(self.shape_env.evaluate_expr(self.expr))
+        r = self.shape_env.evaluate_expr(self.expr, self.hint)
+        try:
+            return float(r)
+        except Exception:
+            log.warning(f"Failed to convert to float: {r}")
+            raise
 
     def guard_bool(self, file, line):
         # TODO: use the file/line for some useful diagnostic on why a
         # guard occurred
-        # TODO: why is the replace needed here?
-        return bool(self.shape_env.evaluate_expr(self.shape_env.replace(self.expr)))
+        r = self.shape_env.evaluate_expr(self.expr, self.hint)
+        try:
+            return bool(r)
+        except Exception:
+            log.warning(f"Failed to convert to bool: {r}")
+            raise
 
     def bool_(self):
-        # TODO: why is the replace needed here?
-        return bool(self.shape_env.evaluate_expr(self.shape_env.replace(self.expr)))
+        return self.guard_bool("", 0)
+
+
+if True:  # TODO: unindent
+    # Overloaded to be compatible with regular Python.
+    # https://github.com/pytorch/pytorch/issues/90900
+    class Pow(sympy.Function):
+        @classmethod
+        def eval(cls, base, exp):
+            if exp.is_zero:
+                return sympy.Integer(1)
+            elif base.is_zero and exp < 0:
+                raise ZeroDivisionError(f"{base} cannot be raised to a negative power")
+            else:
+                return base ** exp
 
+    # Overloaded to be compatible with regular Python.
+    # https://github.com/pytorch/pytorch/issues/90900
+    class TrueDiv(sympy.Function):
+        @classmethod
+        def eval(cls, base, divisor):
+            if divisor.is_zero:
+                raise ZeroDivisionError("division by zero")
+            else:
+                return base / divisor
 
-if HAS_SYMPY:
     class FloorDiv(sympy.Function):
         """
         We maintain this so that:
@@ -251,49 +581,111 @@ class FloorDiv(sympy.Function):
         nargs = (2,)
         precedence = 50  # precedence of mul  # noqa: F811
 
+        # Default return type for SymPy assumptions.
+        # https://docs.sympy.org/latest/guides/assumptions.html#implementing-assumptions-handlers
+        is_real = True
+
+        @property
+        def base(self):
+            return self.args[0]
+
+        @property
+        def divisor(self):
+            return self.args[1]
+
         def _sympystr(self, printer):
-            lhs = self.args[0]
-            rhs = self.args[1]
-            lhs_str = printer.parenthesize(lhs, self.precedence)
-            rhs_str = printer.parenthesize(rhs, self.precedence)
-            return f"{lhs_str}//{rhs_str}"
+            base = printer.parenthesize(self.base, self.precedence)
+            divisor = printer.parenthesize(self.divisor, self.precedence)
+            return f"{base}//{divisor}"
+
+        # SymPy assumptions based on argument types.
+        def _eval_is_real(self):
+            return fuzzy_or([self.base.is_real, self.divisor.is_real])
 
+        def _eval_is_integer(self):
+            return fuzzy_and([self.base.is_integer, self.divisor.is_integer])
+
+        # Automatic evaluation.
+        # https://docs.sympy.org/latest/guides/custom-functions.html#best-practices-for-eval
         @classmethod
         def eval(cls, base, divisor):
-            if base == 0:
-                return sympy.Integer(0)
-            if divisor == 1:
+            def check_supported_type(x):
+                if (x.is_integer is False and x.is_real is False and x.is_complex) or x.is_Boolean:
+                    raise TypeError(
+                        f"unsupported operand type(s) for //: "
+                        f"'{type(base).__name__}' and '{type(divisor).__name__}'"
+                        f", expected integer or real")
+
+            check_supported_type(base)
+            check_supported_type(divisor)
+
+            # We don't provide the same error message as in Python because SymPy
+            # makes it difficult to check the types.
+            if divisor.is_zero:
+                raise ZeroDivisionError("division by zero")
+
+            if base.is_zero:
+                return sympy.S.Zero
+            if base.is_integer and divisor == 1:
                 return base
+            if base.is_real and divisor == 1:
+                return sympy.floor(base)
             if isinstance(base, sympy.Integer) and isinstance(divisor, sympy.Integer):
                 return base // divisor
+            if isinstance(base, (sympy.Integer, sympy.Float)) and isinstance(divisor, (sympy.Integer, sympy.Float)):
+                return sympy.floor(base / divisor)
             if isinstance(base, FloorDiv):
                 return FloorDiv(base.args[0], base.args[1] * divisor)
 
+            if isinstance(base, sympy.Add):
+                for a in base.args:
+                    gcd = sympy.gcd(a, divisor)
+                    if gcd == divisor:
+                        return FloorDiv(base - a, divisor) + a / gcd
+
             gcd = sympy.gcd(base, divisor)
             if gcd != 1:
                 return FloorDiv(
                     sympy.simplify(base / gcd), sympy.simplify(divisor / gcd)
                 )
 
+    # TODO: As an indicator, this != 0 implies == 1 (and vice versa).
+    # Because we do not have the ability to guard on the stride permutation
+    # at the moment, it is hard to make further inferences when this is true,
+    # as although we know the tensor is contiguous in *some* layout, we don't
+    # know which one (however, you could, for example, make the inference that
+    # reshaping this to a 1D tensor can be guard-free.)
     class IsNonOverlappingAndDenseIndicator(sympy.Function):
         is_integer = True
 
         @classmethod
         def eval(cls, *args):
             assert len(args) % 2 == 0
+            dim = len(args) // 2
+            # TODO: it is possible to make progress evaluating this guard
+            # even if not all of the inputs are known.  For example, a 2D
+            # tensor with non-0/1 sizes but strides (0, 1) is definitely
+            # false, because we know its numel > 1 but it's broadcasted
+            # in dim 0.
             if all(isinstance(a, sympy.Integer) for a in args):
-                dim = len(args) // 2
-                sizes = args[0:dim]
-                strides = args[dim:]
-                return int(eval_is_non_overlapping_and_dense(
-                    [int(s) for s in sizes],
-                    [int(s) for s in strides]
-                ))
+                size_args = args[0:dim]
+                stride_args = args[dim:]
+                return eval_is_non_overlapping_and_dense(
+                    [int(a) for a in size_args],
+                    [int(a) for a in stride_args]
+                )
             return None
 
+    IndicatorTypes = (IsNonOverlappingAndDenseIndicator,)
+
+@lru_cache(256)
 def safe_expand(r):
     if hasattr(r, 'expand'):
-        return sympy.expand(r)
+        try:
+            return sympy.expand(r)
+        except RecursionError:
+            log.warning(f"RecursionError in sympy.expand({r})")
+            return r
     else:
         return r
 
@@ -303,13 +695,35 @@ def safe_expand(r):
     'sub': lambda a, b: a - b,
     'mul': lambda a, b: a * b,
     'mod': lambda a, b: a % b,
-    'pow': lambda a, b: a ** b,
+    'pow': lambda a, b: Pow(a, b),
     'and': lambda a, b: a & b,
     'or': lambda a, b: a | b,
-    'truediv': lambda a, b: a / b,
+    'truediv': lambda a, b: TrueDiv(a, b),
     'floordiv': lambda a, b: FloorDiv(a, b),
 }
 
+
+def error():
+    raise AssertionError("shouldn't be hit")
+
+def floor_ceil_helper(a, fn):
+    if isinstance(a, sympy.Mul):
+        aa = a.args
+        if len(aa) == 2 and isinstance(aa[0], sympy.Float) and aa[1].is_integer:
+            coef = sympy.Integer(aa[0])
+            if aa[0] == coef:  # structural equality test
+                return coef * aa[1]
+    if isinstance(a, sympy.Float) and a == sympy.Integer(a) or isinstance(a, sympy.Integer):
+        return sympy.Integer(a)
+    return fn(a)
+
+def floor_impl(a):
+    return floor_ceil_helper(a, sympy.floor)
+
+def ceil_impl(a):
+    return floor_ceil_helper(a, sympy.ceiling)
+
+
 magic_methods = {
     **reflectable_magic_methods,
     'sym_not': lambda a: ~a,
@@ -319,9 +733,9 @@ def safe_expand(r):
     'lt': lambda a, b: sympy.Lt(a, b),
     'le': lambda a, b: sympy.Le(a, b),
     'ge': lambda a, b: sympy.Ge(a, b),
-    'floor': lambda a: sympy.floor(a),
+    'floor': floor_impl,
     'sym_float': lambda a: a,  # Cannot use sympy.Float(a) here, coz it expects python literals
-    'ceil': lambda a: sympy.ceiling(a),
+    'ceil': ceil_impl,
     'neg': lambda a: -a,
     'sym_min': lambda a, b: sympy.Min(a, b),
     'sym_max': lambda a, b: sympy.Max(a, b),
@@ -329,11 +743,97 @@ def safe_expand(r):
 }
 
 sizes_strides_methods = {
-    'is_non_overlapping_and_dense': lambda *args: IsNonOverlappingAndDenseIndicator(*args),
+    # TODO: These could also be done with indicators, maybe it is better
+    # for reasoning to do it that way
+    'is_contiguous': lambda sizes, strides: sympy_is_contiguous(sizes, strides),
+    'is_channels_last_contiguous_2d': lambda sizes, strides: sympy_is_channels_last_contiguous_2d(sizes, strides),
+    'is_channels_last_contiguous_3d': lambda sizes, strides: sympy_is_channels_last_contiguous_3d(sizes, strides),
+    'is_channels_last_strides_2d': lambda sizes, strides: sympy_is_channels_last_strides_2d(sizes, strides),
+    'is_channels_last_strides_3d': lambda sizes, strides: sympy_is_channels_last_strides_3d(sizes, strides),
+    'is_non_overlapping_and_dense_indicator': lambda sizes, strides: IsNonOverlappingAndDenseIndicator(*sizes, *strides),
+}
+
+alternate_impl_if_hinted_methods = {
+    "sym_min": builtins.min,
+    "sym_max": builtins.max,
 }
 
+def sympy_is_contiguous_generic(sizes, strides, dim_order):
+    dim = len(sizes)
+
+    if len(dim_order) != dim:
+        return sympy.false
+
+    is_contiguous = sympy.true
+    z = sympy.Integer(1)
+    # Contiguous if the strides make sense (or the dim is size 1)
+    for d in dim_order:
+        is_contiguous &= sympy.Eq(sizes[d], sympy.Integer(1)) | sympy.Eq(strides[d], z)
+        z *= sizes[d]
+    # OR if any size is zero
+    for d in range(dim):
+        is_contiguous |= sympy.Eq(sizes[d], sympy.Integer(0))
+    return is_contiguous
+
+def sympy_is_contiguous(sizes, strides):
+    dim = len(sizes)
+    return sympy_is_contiguous_generic(sizes, strides, list(range(dim - 1, -1, -1)))
+
+# NB: There is a TODO in C++ to allow omitting the batch dim.  If that
+# happens you will need to refactor this
+
+def sympy_is_channels_last_contiguous_2d(sizes, strides):
+    return sympy_is_contiguous_generic(sizes, strides, [1, 3, 2, 0])
+
+def sympy_is_channels_last_contiguous_3d(sizes, strides):
+    return sympy_is_contiguous_generic(sizes, strides, [1, 4, 3, 2, 0])
+
+def sympy_is_channels_last_strides_generic(sizes, strides, dim_order):
+    dim = len(sizes)
+
+    if dim != len(dim_order):
+        return sympy.false
+
+    m = sympy.Integer(0)
+    r = sympy.true
+
+    # special case for trivial C dimension. default to NCHW
+    r &= sympy.Ne(strides[1], 0)
+
+    for d in dim_order:
+        r &= sympy.Ne(sizes[d], 0) & (strides[d] >= m)
+        # Fallback to NCHW as default layout for ambiguous cases
+        # This is the flaw of implicit memory_format from strides.
+        # N111 tensor with identical strides for size 1 dimension;
+        # Two cases could lead us here:
+        # a. N111 contiguous Tensor ([N,1,1,1]@[1,1,1,1])
+        # b. N11W contiguous Tensor sliced on the W-dimension.
+        # ([N,1,1,1]@[W,W,W,W])
+        if d == 0:
+            r &= sympy.Ne(m, strides[1])
+        # This is necessary to:
+        # 1. distinguish the memory_format of N1H1;
+        #     [H, 1, 1, 1] channels_last stride
+        #     [H, H, 1, 1] contiguous stride
+        # 2. permutation of 1C1W:
+        #     [1, C, 1, H]@[HC, H, H, 1] transpose(1, 3)
+        #     [1, H, 1, C]@[HC, 1, H, H] shouldn't be identified as
+        #     channels_last
+        m = strides[d] * sympy.Max(sizes[d], 1)
+
+    return r
+
+def sympy_is_channels_last_strides_2d(sizes, strides):
+    return sympy_is_channels_last_strides_generic(sizes, strides, [1, 3, 2, 0])
+
+def sympy_is_channels_last_strides_3d(sizes, strides):
+    return sympy_is_channels_last_strides_generic(sizes, strides, [1, 4, 3, 2, 0])
+
 # TODO: Deduplicate this with torch/_prims_common/__init__.py
 def eval_is_non_overlapping_and_dense(sizes, strides):
+    return int(guard_bool(_eval_is_non_overlapping_and_dense(sizes, strides)))
+
+def _eval_is_non_overlapping_and_dense(sizes, strides):
     dim = len(sizes)
 
     # Short-circuits for tensors of rank one, which are
@@ -345,7 +845,7 @@ def eval_is_non_overlapping_and_dense(sizes, strides):
     # Checks that there exists a permutation of the strides s.t. the tensor would be contiguous
     # Sorts (length, stride) pairs by stride
     lengths_and_strides = sorted(
-        tuple(zip(sizes, strides)), key=operator.itemgetter(1)
+        zip(sizes, strides), key=operator.itemgetter(1)
     )
 
     # Unlike the C++ code, we don't move the 0/1 size dimensions to the
@@ -363,19 +863,6 @@ def eval_is_non_overlapping_and_dense(sizes, strides):
 
     return True
 
-def is_non_overlapping_and_dense(sizes, strides):
-    base = None
-    for s in itertools.chain(sizes, strides):
-        if isinstance(s, SymInt):
-            base = s
-            break
-
-    assert base is not None
-    return wrap_node(base.node.is_non_overlapping_and_dense(
-        [to_node(base.node, s) for s in sizes],
-        [to_node(base.node, s) for s in strides],
-    ))
-
 unary_magic_methods = {
     'sym_float',
     'ceil',
@@ -391,7 +878,37 @@ def is_non_overlapping_and_dense(sizes, strides):
 magic_methods_on_submodule = {"sym_float", "sym_sqrt", "sym_min", "sym_max", "sym_not"}
 magic_methods_on_operator_with_trailing_underscore = {"and", "or"}
 
-always_float_magic_methods = {"truediv", "sym_float", "sym_sqrt"}
+def method_to_operator(method):
+    if method in magic_methods_on_operator_with_trailing_underscore:
+        method_attr = f"{method}_"
+    else:
+        method_attr = method
+    if method in magic_methods_on_submodule:
+        op = getattr(torch.fx.experimental.symbolic_shapes, method_attr)
+    elif method in magic_methods_on_math:
+        op = getattr(math, method_attr)
+    else:
+        op = getattr(operator, method_attr)
+    return op
+
+SYMPY_INTERP = {
+    'Eq': operator.eq,
+    'Ne': operator.ne,
+    'Gt': operator.gt,
+    'Lt': operator.lt,
+    'Le': operator.le,
+    'Ge': operator.ge,
+    'Min': min,
+    'Max': max,
+    'Mod': operator.mod,
+    'FloorDiv': operator.floordiv,
+    'TrueDiv': operator.truediv,
+    'IsNonOverlappingAndDenseIndicator': eval_is_non_overlapping_and_dense,
+    'floor': math.floor,
+    'ceiling': math.ceil,
+}
+
+always_float_magic_methods = {"truediv", "sym_float", "sym_sqrt", "pow"}
 always_int_magic_methods = {"ceil", "floor"}
 always_bool_magic_methods = {"eq", "ne", "gt", "lt", "le", "ge", "and", "or", "sym_not", "is_non_overlapping_and_dense"}
 
@@ -417,15 +934,18 @@ def _make_node_magic(method, func):
         method_attr = method
 
     def binary_magic_impl(self, other):
-        if method in magic_methods_on_submodule:
-            op = getattr(sys.modules[__name__], method_attr)
-        else:
-            assert method not in magic_methods_on_math
-            op = getattr(operator, method_attr)
+        op = method_to_operator(method)
+
+        out_hint = None
+        if self.hint is not None and other.hint is not None:
+            out_hint = op(self.hint, other.hint)
+
+        alternate_impl = alternate_impl_if_hinted_methods.get(method)
+        if alternate_impl and out_hint is not None:
+            return to_node(self, alternate_impl(wrap_node(self), wrap_node(other)))
+
         if SYM_FUNCTION_MODE:
-            r = _handle_sym_dispatch(op, (wrap_node(self), wrap_node(other)), {})
-            assert isinstance(r, SymTypes), type(r)
-            return r.node
+            return to_node(self, _handle_sym_dispatch(op, (wrap_node(self), wrap_node(other)), {}))
         assert isinstance(other, SymNode)
         other_expr = other.expr
         # TODO: consider constant prop here
@@ -438,33 +958,42 @@ def binary_magic_impl(self, other):
             raise
         out = safe_expand(out)
         pytype: Type
+        # This is not strictly correct. In Python, a**b may return complex when
+        # a < 0 and b is a float: (-1)**2.1. Same for sympy.sqrt(-3.14). This
+        # returns a float while both arguments are ints: 2**(-1). Also, max and
+        # min do not type promote. To avoid having data-dependent control flow
+        # here, we just set the type to float if one of the args is a float. In
+        # case of a type mismatch, we assume that it will be detected during
+        # evaluation.
         if method in always_float_magic_methods:
             pytype = float
         elif method in always_bool_magic_methods:
             pytype = bool
+        elif self.pytype is float or other.pytype is float:
+            pytype = float
         else:
             pytype = self.pytype
 
-        return SymNode(out, self.shape_env, pytype)
+        return SymNode(out, self.shape_env, pytype, out_hint)
 
     def unary_magic_impl(self):
+        op = method_to_operator(method)
         if SYM_FUNCTION_MODE:
-            if method in magic_methods_on_math:
-                op = getattr(math, method_attr)
-            elif method in magic_methods_on_submodule:
-                op = getattr(sys.modules[__name__], method_attr)
-            else:
-                op = getattr(operator, method_attr)
-            r = _handle_sym_dispatch(op, (wrap_node(self),), {})
-            assert isinstance(r, SymTypes), type(r)
-            return r.node
+            return to_node(self, _handle_sym_dispatch(op, (wrap_node(self),), {}))
         # TODO: consider constant prop here
         expr = self.shape_env.replace(self.expr)
+        if method == "floor" or method == "ceiling":
+            expr = self.shape_env._simplify_floor_div(expr)
+
         try:
             out = func(expr)
         except Exception:
             log.warning(f"failed to eval {method}({expr})")
             raise
+
+        out_hint = None
+        if self.hint is not None:
+            out_hint = op(self.hint)
         out = safe_expand(out)
         pytype: Type
         if method in always_int_magic_methods:
@@ -474,12 +1003,12 @@ def unary_magic_impl(self):
         else:
             pytype = self.pytype
 
-        return SymNode(out, self.shape_env, pytype)
+        return SymNode(out, self.shape_env, pytype, out_hint)
 
     if method in unary_magic_methods:
-        setattr(SymNode, method_attr, unary_magic_impl)
+        setattr(SymNode, f"_{method_attr}", unary_magic_impl)
     else:
-        setattr(SymNode, method_attr, binary_magic_impl)
+        setattr(SymNode, f"_{method_attr}", binary_magic_impl)
 
 def _make_node_sizes_strides(method, func):
     # NB: don't LRU cache, lots of arguments
@@ -487,20 +1016,70 @@ def _make_node_sizes_strides(method, func):
     def sizes_strides_impl(self, sizes, strides):
         op = getattr(sys.modules[__name__], method)
         if SYM_FUNCTION_MODE:
-            r = _handle_sym_dispatch(op, ([wrap_node(s) for s in sizes], [wrap_node(s) for s in strides]), {})
-            assert isinstance(r, SymBool), type(r)
-            return r.node
+            return to_node(
+                self,
+                _handle_sym_dispatch(
+                    op,
+                    ([wrap_node(s) for s in sizes], [wrap_node(s) for s in strides]),
+                    {}
+                )
+            )
         size_exprs = [s.expr for s in sizes]
         stride_exprs = [s.expr for s in strides]
         try:
-            out = func(*size_exprs, *stride_exprs)
+            out = func(size_exprs, stride_exprs)
         except Exception:
-            log.warning(f"failed to eval {method}(*{size_exprs}, *{stride_exprs})")
+            log.warning(f"failed to eval {method}({size_exprs}, {stride_exprs})")
             raise
         # bool is never expandable
-        return SymNode(sympy.Eq(out, 1), self.shape_env, bool)
 
-    setattr(SymNode, method, sizes_strides_impl)
+        size_hints = []
+        out_hint = None
+        for s in sizes:
+            if s.hint is None:
+                break
+            size_hints.append(s.hint)
+        else:
+            stride_hints = []
+            for s in strides:
+                if s.hint is None:
+                    break
+                stride_hints.append(s.hint)
+            else:
+                out_hint = op(size_hints, stride_hints)
+
+        # NB: This is the indicator function, not the actual bool!
+        pytype: Type
+        if method.endswith("_indicator"):
+            pytype = int
+        else:
+            pytype = bool
+        return SymNode(out, self.shape_env, pytype, out_hint)
+
+    setattr(SymNode, f"_{method}", sizes_strides_impl)
+
+    # TODO: This is technically hotpath, but in the ideal end state
+    # guards on this will resolve at a higher level so you never
+    # spend time in this code
+    def sizes_strides_user(sizes, strides):
+        for a in itertools.chain(sizes, strides):
+            if isinstance(a, SymInt):
+                return wrap_node(getattr(a.node, method)(
+                    [to_node(a.node, b) for b in sizes],
+                    [to_node(a.node, b) for b in strides],
+                ))
+        if method == "is_non_overlapping_and_dense_indicator":
+            return eval_is_non_overlapping_and_dense(sizes, strides)
+        else:
+            # TODO: this is an awful implementation
+            return bool(func(
+                [sympy.sympify(a) for a in sizes],
+                [sympy.sympify(a) for a in strides],
+            ))
+
+    # Skip for is_non_overlapping_and_dense_indicator
+    if not hasattr(sys.modules[__name__], method):
+        setattr(sys.modules[__name__], method, sizes_strides_user)
 
 for method, func in magic_methods.items():
     _make_node_magic(method, func)
@@ -574,49 +1153,65 @@ def wrapper(self, *args, **kwargs):
     return wrapper
 
 
-if HAS_SYMPY:
-    # This stub exists so we can easily add metadata to sympy symbols
-    # NB: This inherits from Dummy, not Symbol, because Symbols with the same
-    # name get interned.  This is bad for us as we want the metadata
-    # to vary across different invocations and not leak.
-    class Symbol(sympy.Dummy):
-        __slots__: List[str] = ['sources', 'stack']
-        sources: List[Source]
-        stack: Optional[str]
-
-        def __new__(cls, *args, **kwargs):
-            self = super().__new__(cls, *args, **kwargs)
-            self.sources = []
-            self.stack = None
-            return self
-
-
+if True:  # TODO: unindent
     class ShapeGuardPrinter(StrPrinter):
         def __init__(
             self,
             symbol_to_source,
             source_ref,
+            var_to_sources,
         ):
             super().__init__()
             self.symbol_to_source = symbol_to_source
             self.source_ref = source_ref
+            self.var_to_sources = var_to_sources
 
         def _print_Symbol(self, expr) -> str:
-            assert isinstance(expr, Symbol), str(type(expr))
+            assert isinstance(expr, sympy.Symbol), str(type(expr))
             assert expr in self.symbol_to_source, (
-                f"{expr} (could be from {[s.name() for s in expr.sources]}) "
+                f"{expr} (could be from {[s.name() for s in self.var_to_sources[expr]]}) "
                 f"not in {self.symbol_to_source}"
             )
             return self.source_ref(self.symbol_to_source[expr][0])
 
 
+TLS = threading.local()
 
-class ShapeEnv(object):
-    def __init__(self):
+
+class ShapeEnv:
+    def __init__(
+        self, *,
+        allow_scalar_outputs=True,
+        allow_dynamic_output_shape_ops=True,
+        strict_mark_dyn=False,
+        assume_static_by_default=False,
+        # The following options affect decisions we make about eager
+        # specialization.  Disabling them will increase trace time (as we do
+        # more symbolic reasoning) and can also harm the quality of generated
+        # code (because inductor may not be able to specialize for bounds
+        # being equal--although if we later respecialize because of a guard,
+        # your code may be just as good as it was before.)
+        #
+        # When True, eagerly specialize input sizes which have 0/1.
+        specialize_zero_one=True,
+        # When True, assume input sizes which have the same size are
+        # symbolically equal.
+        duck_shape=True,
+    ):
+        # Not directly used by ShapeEnv; indirectly used by FakeTensor
+        self.allow_scalar_outputs = allow_scalar_outputs
+        self.allow_dynamic_output_shape_ops = allow_dynamic_output_shape_ops
         self.guards: List[ShapeGuard] = []
         # Maps symbolic ints to their original concrete values
         # Currently populated from tensors
         self.var_to_val: Dict["sympy.Symbol", "sympy.Integer"] = {}
+        # Maps symbolic ints to their min/max range.  These ranges
+        # are conservative: the int MUST fall in the range, but the
+        # range may contain ints which may not actually appear in
+        # practice
+        self.var_to_range: Dict["sympy.Symbol", ValueRanges] = {}
+        self.var_to_sources: Dict["sympy.Symbol", List[Source]] = {}
+        self.var_to_stack: Dict["sympy.Symbol", str] = {}
         # Maps from sympy ints to expressions representing them
         # Populated from equality guards (i.e. a.shape[0] == b.shape[0])
         self.replacements: Dict["sympy.Symbol", "sympy.Expr"] = {}  #
@@ -624,21 +1219,26 @@ def __init__(self):
         self.divisible: Set["sympy.Expr"] = set()
         # Duck-shaping says that if two input tensors have the same size,
         # they get assigned the same symbolic variable
-        self.val_to_var: Dict[int, "sympy.Expr"] = {0: sympy.Integer(0), 1: sympy.Integer(1)}
-        self.tls = threading.local()
+        self.val_to_var: Dict[int, "sympy.Expr"] = {}
+        if specialize_zero_one:
+            self.val_to_var = {0: sympy.Integer(0), 1: sympy.Integer(1)}
         self.unbacked_symfloat_counter = itertools.count()
         self.unbacked_symint_counter = itertools.count()
+        self.strict_mark_dyn = strict_mark_dyn
+        self.assume_static_by_default = assume_static_by_default
+        self.specialize_zero_one = specialize_zero_one
+        self.duck_shape = duck_shape
 
     def _suppress_guards_tls(self):
-        return getattr(self.tls, "suppress_guards", False)
+        return getattr(TLS, "suppress_guards", False)
 
     @contextmanager
     def suppress_guards(self):
-        self.tls.suppress_guards = True
+        TLS.suppress_guards = True
         try:
             yield
         finally:
-            self.tls.suppress_guards = False
+            TLS.suppress_guards = False
 
     def _get_key(self):
         """
@@ -647,6 +1247,19 @@ def _get_key(self):
         """
         return (len(self.replacements), len(self.divisible))
 
+    def _produce_dyn_sizes(self, ex: torch.Tensor, source: Source) -> List[sympy.Expr]:
+        from torch._dynamo.source import TensorPropertySource, TensorProperty
+        size = []
+        for i, val in enumerate(ex.size()):
+            is_dynamic = _is_dim_dynamic(ex, i)
+            if _should_allocate(is_dynamic, self.assume_static_by_default):
+                size.append(self.create_symbol(
+                    val, TensorPropertySource(source, TensorProperty.SIZE, i), is_dynamic
+                ))
+            else:
+                size.append(sympy.Integer(val))
+        return size
+
     def create_symbolic_sizes_strides_storage_offset(self, ex: torch.Tensor, source: Source):
         """
         Returns a list of symbolic sizes and strides for the given tensor.
@@ -654,12 +1267,7 @@ def create_symbolic_sizes_strides_storage_offset(self, ex: torch.Tensor, source:
         introduce new symbolic variables.
         """
         from torch._dynamo.source import TensorPropertySource, TensorProperty
-
-        size = [
-            self.create_symbol(
-                val, TensorPropertySource(source, TensorProperty.SIZE, i)
-            ) for i, val in enumerate(ex.size())
-        ]
+        size: List[sympy.Expr] = self._produce_dyn_sizes(ex, source)
         stride: List[Optional[sympy.Expr]] = [None] * len(size)
         for i, val in enumerate(ex.stride()):
             if val in (0, 1):
@@ -692,60 +1300,72 @@ def create_symbolic_sizes_strides_storage_offset(self, ex: torch.Tensor, source:
                     TensorPropertySource(source, TensorProperty.STRIDE, i)
                 )
         assert all(x is not None for x in stride)
-        sym_size = [self.create_symintnode(i) for i in size]
+        sym_size = [self.create_symintnode(i, hint=hint) for i, hint in zip(size, ex.size())]
         sym_stride = []
         for i, stride_expr in enumerate(stride):
             # NB: Don't duck size the stride; instead use the expression
             # we computed
             assert stride_expr is not None
-            sym_stride.append(self.create_symintnode(stride_expr))
+            sym_stride.append(self.create_symintnode(stride_expr, hint=ex.stride(i)))
         sym_storage_offset = self.create_symintnode(self.create_symbol(
             ex.storage_offset(),
             TensorPropertySource(source, TensorProperty.STORAGE_OFFSET)
-        ))
+        ), hint=ex.storage_offset())
         return sym_size, sym_stride, sym_storage_offset
 
-    def create_symintnode(self, sym: "sympy.Expr"):
-        return SymInt(SymNode(sym, self, int))
+    # If you know what the current hint value of the SymInt to be created
+    # is, pass it into hint.  Otherwise, pass None and we will make our best
+    # guess
+    def create_symintnode(self, sym: "sympy.Expr", *, hint: Optional[int]):
+        return SymInt(SymNode(sym, self, int, hint))
 
     def create_unbacked_symfloat(self):
-        symbol = Symbol(f"f{next(self.unbacked_symfloat_counter)}")
-        symbol.stack = ''.join(traceback.format_list(traceback.extract_stack()[:-1]))
-        return SymFloat(SymNode(symbol, self, float))
+        symbol = sympy.Symbol(f"f{next(self.unbacked_symfloat_counter)}")
+        self.var_to_stack[symbol] = ''.join(traceback.format_list(traceback.extract_stack()[:-1]))
+        self.var_to_range[symbol] = ValueRanges.unknown()
+        return SymFloat(SymNode(symbol, self, float, None))
 
     def create_unbacked_symint(self):
-        symbol = Symbol(f"i{next(self.unbacked_symint_counter)}", integer=True)
-        symbol.stack = ''.join(traceback.format_list(traceback.extract_stack()[:-1]))
-        return SymInt(SymNode(symbol, self, int))
+        symbol = sympy.Symbol(f"i{next(self.unbacked_symint_counter)}", integer=True)
+        self.var_to_stack[symbol] = ''.join(traceback.format_list(traceback.extract_stack()[:-1]))
+        self.var_to_range[symbol] = ValueRanges.unknown()
+        return SymInt(SymNode(symbol, self, int, None))
 
     # This is guaranteed to return a symbol or its negation is a sympy.Symbol,
     # but there may be a replacement that allows it to be immediately
     # simplified
-    def create_symbol(self, val: int, source: Source) -> "sympy.Expr":
+    def create_symbol(self, val: int, source: Source, dyn=False) -> "sympy.Expr":
         assert isinstance(source, Source), f"{type(source)} {source}"
 
-        if not HAS_SYMPY:
-            raise RuntimeError("Need sympy installed to create symbolic shapes")
-
         if val < 0:
             from torch._dynamo.source import NegateSource
-            return -self.create_symbol(-val, NegateSource(source))
-
-        # Now attempt to duck size this value
-        # TODO: Use site has to duck size
-        # TODO: Do this duck sizing lazily later
+            return -self.create_symbol(-val, NegateSource(source), dyn)
 
-        # Create a duck sized int if necessary
-        if val not in self.val_to_var:
-            sympy_expr = Symbol(f"s{len(self.var_to_val)}", positive=True, integer=True)
+        if dyn or val not in self.val_to_var or not self.duck_shape:
+            # If a value is never before seen, or dynamic, we want to create an expression
+            sympy_expr = sympy.Symbol(f"s{len(self.var_to_val)}", positive=True, integer=True)
+            # We always associate vars to vals
             self.var_to_val[sympy_expr] = sympy.Integer(val)
-            self.val_to_var[val] = sympy_expr
+            # Do the appending later, because we always want to populate this
+            self.var_to_sources[sympy_expr] = []
 
-        # This implements duck-shaping: input sizes that match are assigned
-        # the same symint
-        r = self.duck_int(val)
-        if isinstance(r, Symbol):
-            r.sources.append(source)
+            if not dyn:
+                # Non explicitly marked dynamic dims register to val_to_var to get duck shaped
+                self.val_to_var[val] = sympy_expr
+
+            # We also infer that it must be not 0/1
+            lower = 2 if self.specialize_zero_one else 0
+            self.var_to_range[sympy_expr] = ValueRanges(lower, sympy.oo)
+
+        if not dyn and self.duck_shape:
+            # This implements duck-shaping: input sizes that match are assigned
+            # the same symint
+            r = self.duck_int(val)
+        else:
+            r = sympy_expr
+
+        if isinstance(r, sympy.Symbol):
+            self.var_to_sources[r].append(source)
         return r
 
     # Given a concrete integer value, return the duck sized symbol associated
@@ -754,6 +1374,7 @@ def create_symbol(self, val: int, source: Source) -> "sympy.Expr":
     # This has some pretty tricky preconditions associated with it, so if
     # you are in a binding context, you probably wanted create_symbol instead.
     def duck_int(self, val):
+        assert self.duck_shape
         assert val in self.val_to_var, (
             "Direct call to duck_int MUST only duck size an integer values "
             "that have already produced by inputs (allocated "
@@ -764,13 +1385,23 @@ def duck_int(self, val):
         )
         return self.val_to_var[val]
 
-    # Generates a Python string which, when evaluated in a context that
+    # Generates a list of guards strings which, when evaluated in a context that
     # defines tensors for all the sources, returns True or False depending
-    # on if the guards evaluated to True or not.  Primarily used by Dynamo,
+    # on if the guards in the list evaluated to True or not.  Primarily used by Dynamo,
     # but this is also helpful for manual testing of guards (see
     # evaluate_guards_for_args)
-    def codegen_guards(self, placeholders, sources,
-                       source_ref=lambda n: n.name()):
+    #
+    # For convenience in testing, a source is allowed to be a str,
+    # in which case we will assume it is a LocalSource
+    #
+    # simplified lets you omit duck sizing, equality and 0/1 guards.
+    # This is useful for testing when you don't care about the boilerplate
+    # guards, and it may be helpful for user output too (be careful though;
+    # some equality guards are nontrivial!  It would be nice to get simplified
+    # output to print them too).  It's private because it's not
+    # intended for normal use
+    def produce_guards(self, placeholders, sources,
+                       source_ref=lambda n: n.name(), *, _simplified=False) -> List[str]:
         # It took a lot of sweat to figure out the algorithm here.  Let's
         # explain how it works.
         #
@@ -837,6 +1468,7 @@ def codegen_guards(self, placeholders, sources,
         input_guards = []
 
         symbol_to_source = collections.defaultdict(list)
+        dynamic_sources = []
 
         # How do we know what the value of s0 is?  Fresh variables can only be
         # bound by inputs, so there MUST be some other input which binds the
@@ -857,12 +1489,28 @@ def track_symint(source, val):
                     symbol_to_source[s].append(source)
                 elif isinstance(-s, sympy.Symbol):
                     symbol_to_source[-s].append(NegateSource(source))
-
                 input_guards.append((source, s))
             else:
                 input_guards.append((source, sympy.Integer(val)))
 
+        def _verify(expr, potential_expr):
+            # An expression of > 1 symbols is a relationship,
+            # and relationships can be ignored due to the nature of the
+            # constraint api explicitly not supporting relationships.
+            #
+            # In a future where we want to extend the constraint API to include
+            # user directives about relationships, we can remove this check from
+            # verification.
+            if len(expr.free_symbols) == 1:
+                srcs = symbol_to_source[expr.free_symbols.pop()]
+                for src in srcs:
+                    if src in dynamic_sources:
+                        raise RuntimeError(f"Attempting to introduce a guard {potential_expr} that violates user's mark_dynamic")
+
         for t, source in zip(placeholders, sources):
+            if isinstance(source, str):
+                from torch._dynamo.source import LocalSource
+                source = LocalSource(source)
             assert isinstance(source, Source)
             if t is None:
                 continue
@@ -870,10 +1518,17 @@ def track_symint(source, val):
                 track_symint(source, t)
                 continue
             assert isinstance(t, torch.Tensor)
-            for i, s in enumerate(t.size()):
-                track_symint(TensorPropertySource(source, TensorProperty.SIZE, i), s)
-            for i, s in enumerate(t.stride()):
-                track_symint(TensorPropertySource(source, TensorProperty.STRIDE, i), s)
+            for i, ss in enumerate(t.size()):
+                property_source = TensorPropertySource(source, TensorProperty.SIZE, i)
+                track_symint(property_source, ss)
+                if _is_dim_dynamic(t, i):
+                    # If this dim is marked dynamic, we need to do a test on it, to ensure that it has not bee
+                    # constrained to an integer.
+                    if _is_int(ss):
+                        raise RuntimeError(f"Attempting to constrain dim {i} for {source}, which violates user's mark_dynamic")
+                    dynamic_sources.append(property_source)
+            for i, ss in enumerate(t.stride()):
+                track_symint(TensorPropertySource(source, TensorProperty.STRIDE, i), ss)
             track_symint(TensorPropertySource(source, TensorProperty.STORAGE_OFFSET), t.storage_offset())
 
         # 1. Every input must equal the final simplified symbolic expression
@@ -881,16 +1536,17 @@ def track_symint(source, val):
         #    if we have an input (2, 3), we must show s0*2 == 2 and s1 == 3.
         #    This does a lot of work: it covers duck sizing and equality guards.
         exprs = []
-        for source, expr in input_guards:
-            # Small optimization
-            if (
-                isinstance(expr, Symbol) and
-                expr in symbol_to_source and
-                source == symbol_to_source[expr][0]
-            ):
-                continue
-            sexpr = ShapeGuardPrinter(symbol_to_source, source_ref).doprint(expr)
-            exprs.append(f"{source_ref(source)} == {sexpr}")
+        if not _simplified:
+            for source, expr in input_guards:
+                # Small optimization
+                if (
+                    isinstance(expr, sympy.Symbol) and
+                    expr in symbol_to_source and
+                    source == symbol_to_source[expr][0]
+                ):
+                    continue
+                sexpr = ShapeGuardPrinter(symbol_to_source, source_ref, self.var_to_sources).doprint(expr)
+                exprs.append(f"{source_ref(source)} == {sexpr}")
 
         # 2. Every guard must evaluate to True (but remember many guards
         #    like s0 == s1*2 because trivial due to simplification)
@@ -899,28 +1555,42 @@ def track_symint(source, val):
                 continue
             g = self.simplify(g)
             try:
-                exprs.append(ShapeGuardPrinter(symbol_to_source, source_ref).doprint(g))
+                guard_expr = ShapeGuardPrinter(symbol_to_source, source_ref, self.var_to_sources).doprint(g)
+                exprs.append(guard_expr)
+                if self.strict_mark_dyn:
+                    _verify(g, guard_expr)
             except Exception:
                 log.warning(f"Failing guard allocated at: \n{tb}")
                 raise
 
-        # 3. Every symbol must not be equal to 0/1
-        for sources in symbol_to_source.values():
-            assert sources
-            # We must assert that each symbol is not zero or one, as we make
-            # negative inferences on shape variables
-            exprs.append(f"{source_ref(sources[0])} != 0 and {source_ref(sources[0])} != 1")
-
-        if exprs:
-            return " and ".join(exprs)
-        else:
-            return "True"
+        # 3. Every symbol must be within its value range (this handles 0/1
+        # specialization too).  NB: because we never update value ranges
+        # except in case of explicit user annotation, these are not included
+        # in simplified.  However, when we start updating value ranges
+        # these should probably get reported in tests too
+        if not _simplified:
+            for symbol, sources in symbol_to_source.items():
+                assert sources
+                r = self.var_to_range[symbol]
+                bounds = []
+                if r.lower != -sympy.oo:
+                    bounds.append(str(r.lower))
+                bounds.append(source_ref(sources[0]))
+                if r.upper != sympy.oo:
+                    bounds.append(str(r.upper))
+                if len(bounds) > 1:
+                    exprs.append(" <= ".join(bounds))
+
+        return exprs
 
     def evaluate_guards_for_args(self, placeholders, args):
         from torch._dynamo.source import GlobalSource
         arg_names = [f"t{i}" for i in range(len(args))]
-        code = self.codegen_guards(placeholders, [GlobalSource(a) for a in arg_names])
-        return eval(code, {}, dict(zip(arg_names, args)))
+        guards = self.produce_guards(placeholders, [GlobalSource(a) for a in arg_names])
+        if guards:
+            code = " and ".join(guards)
+            return eval(code, SYMPY_INTERP, dict(zip(arg_names, args)))
+        return True
 
     def bind_symbols(self, placeholders, args):
         # Given a paired list of placeholders (fake tensors with
@@ -987,27 +1657,52 @@ def get_shape_groups(self):
         return shape_groups
 
     @_lru_cache
-    def _maybe_evaluate_static(self, expr: "sympy.Expr") -> "Optional[sympy.Expr]":
+    def _maybe_evaluate_static(self, expr: "sympy.Expr", *, unbacked_only: bool = False) -> "Optional[sympy.Expr]":
         """
         Tries to evaluate expr without introducing guards
         """
         expr = self.simplify(expr)
-        # Simplifies assuming that shape vars > 1 (since we cache on 0/1 shape values)
+
+        # Simplify making use of value range lower bound
         symbols = list(expr.free_symbols)
-        new_shape_env = {
-            k: sympy.Symbol(f"shape_{idx}", positive=True, integer=True) + 1
-            for idx, k in enumerate(symbols)
-            # Do not assume unbacked symints are > 1
-            if k in self.var_to_val
-        }
+        new_shape_env = {}
+        new_range_env = {}
+        for idx, k in enumerate(symbols):
+            vr = self.var_to_range[k]
+            # Don't do anything if we don't have a nontrivial lower bound
+            # Also don't do anything if we asked only to simplify unbacked
+            # SymInt
+            if vr.lower == -sympy.oo or (unbacked_only and k in self.var_to_val):
+                new_range_env[k] = vr
+                continue
+            # Positive means >= 1
+            # Positive - 1 means >= 0
+            # Positive + lower - 1 means >= lower
+            # The new symbol 's' is "too low", so when we substitute it in
+            # we have to increase it by offset (and conversely, the new
+            # variables have to have their value range bounds adjusted as
+            # well)
+            s = sympy.Symbol(f"shape_{idx}", positive=True, integer=True)
+            offset = vr.lower - 1
+            new_shape_env[k] = s + offset
+            new_range_env[s] = ValueRangeAnalysis.sub(vr, offset)
+
         new_expr = expr.xreplace(new_shape_env)
         floor_div_replace = {}
         for atom in new_expr.atoms(FloorDiv):
             floor_div_replace[atom] = sympy.floor(atom.args[0] / atom.args[1])
         new_expr = safe_expand(new_expr.xreplace(floor_div_replace))
+        # TODO: when unbacked_only, can sometimes early return even when there
+        # are still free symbols
         if len(list(new_expr.free_symbols)) == 0:
             return new_expr
-        return None
+
+        # Check if the range can solve it statically
+        out = sympy_interp(ValueRangeAnalysis, new_range_env, new_expr)
+        if out.is_singleton():
+            return out.lower
+
+        return new_expr if unbacked_only else None
 
     @_lru_cache
     def replace(self, expr: "sympy.Expr") -> "sympy.Expr":
@@ -1027,15 +1722,39 @@ def _update_divisible(self):
     @_lru_cache
     def simplify(self, expr: "sympy.Expr") -> "sympy.Expr":
         expr = self.replace(expr)
+        # TODO it would seem that this pass is not necessary given the
+        # below replacement of // with /, but for nested FloorDivs
+        # the non-recursive replacement doesn't work, and
+        # recursive makes it hard to look up divisibility,
+        # because existing divisibility info has FloorDiv in it, not /
+        # for now just do a separate pass to catch common nested case
         if expr.has(FloorDiv):
             self._update_divisible()
             div_replacements = {}
             for atom in expr.atoms(FloorDiv):
                 base, divisor = atom.args
-                if self.replace(base % divisor) in self.divisible:
-                    div_replacements[atom] = base / divisor
+                if isinstance(divisor, FloorDiv):
+                    base1, divisor1 = divisor.args
+                    if self.replace(base % divisor) in self.divisible and \
+                            base == base1 and self.replace(base1 % divisor1) in self.divisible:
+                        div_replacements[atom] = divisor1
             expr = expr.xreplace(div_replacements)
             expr = safe_expand(expr)
+        if expr.has(FloorDiv):
+            div_replacements = {}
+            pows = expr.atoms(sympy.Pow)
+            rationals = expr.atoms(sympy.Rational).difference(expr.atoms(sympy.Integer))
+            for fd in expr.atoms(FloorDiv):
+                base, divisor = fd.args
+                if self.replace(base % divisor) in self.divisible:
+                    div_replacements[fd] = base / divisor
+            new_expr = expr.xreplace(div_replacements)
+            new_expr = safe_expand(new_expr)
+            new_pows = new_expr.atoms(sympy.Pow)
+            new_rationals = new_expr.atoms(sympy.Rational).difference(new_expr.atoms(sympy.Integer))
+            # divisions simplified away
+            if new_pows.issubset(pows) and new_rationals.issubset(rationals):
+                expr = new_expr
         return expr
 
     @lru_cache(256)
@@ -1047,21 +1766,25 @@ def size_hint(self, expr: "sympy.Expr"):
         """
         result_expr = safe_expand(expr).xreplace(self.var_to_val)
         if len(result_expr.free_symbols) != 0:
-            raise self._make_data_dependent_error(result_expr)
+            r = self._maybe_evaluate_static(result_expr)
+            if r is not None:
+                return r
+            raise self._make_data_dependent_error(result_expr, expr)
         return result_expr
 
-    def _make_data_dependent_error(self, expr):
+    def _make_data_dependent_error(self, expr, unhinted_expr):
         # TODO: in a Dynamo context, having user code, and having the
         # name of the local, will be much better
         accesses = '\n\n'.join(
-            f"Data dependent variable '{s}' allocated at:\n{s.stack}"
+            f"Data dependent variable '{s}' allocated at:\n{self.var_to_stack[s]}"
             for s in expr.free_symbols
         )
-        return RuntimeError(
+        return GuardOnDataDependentSymNode(
             f"\n\n{accesses}\n"
-            "RuntimeError: It appears that you're trying to get a value out of symbolic int/float "
+            "GuardOnDataDependentSymNode: It appears that you're trying to get "
+            "a value out of symbolic int/float "
             "whose value is data-dependent (and thus we do not know the true value.)  "
-            f"The expression we were trying to evaluate is {expr}.  "
+            f"The expression we were trying to evaluate is {expr} (unhinted: {unhinted_expr}).  "
             "Scroll up to see where each of these data-dependent accesses originally occurred."
             # TODO: Help text about how to use our runtime tests to fix this
             # problem
@@ -1084,12 +1807,12 @@ def _find(self, a: "sympy.Symbol") -> "sympy.Expr":
         return self.replacements[a]
 
     @lru_cache(256)
-    def _maybe_guard_eq(self, expr: Union["sympy.Eq", "sympy.Ne"]) -> None:
+    def _maybe_guard_eq(self, expr: Union["sympy.Eq", "sympy.Ne"], concrete_bool: bool) -> None:
         """
         Evaluates the result of an eq call. If true, uses information to
         simplify shapes (i.e. a == b or a % 5 == 0)
         """
-        concrete_bool = bool(self.size_hint(expr))
+        assert type(concrete_bool) is bool
         if isinstance(expr, sympy.Eq):
             if not concrete_bool:
                 return
@@ -1111,6 +1834,9 @@ def _maybe_guard_eq(self, expr: Union["sympy.Eq", "sympy.Ne"]) -> None:
         rhs = expr.rhs
         if not expr.has(sympy.Mod):
             try:
+                floor_div_atoms = lhs.atoms(FloorDiv).union(rhs.atoms(FloorDiv))
+                if len(floor_div_atoms) > 0 and any([a.divisor != 1 for a in floor_div_atoms]):
+                    raise NotImplementedError
                 solutions = sympy.solve(lhs - rhs, free[0], dict=True)
                 if len(solutions) != 1:
                     return
@@ -1132,25 +1858,52 @@ def _maybe_guard_eq(self, expr: Union["sympy.Eq", "sympy.Ne"]) -> None:
                 pass
         return
 
+    @_lru_cache
+    def _simplify_floor_div(self, expr):
+        floor_divs = tuple(expr.atoms(FloorDiv))
+        # we expect floor_divs to be exact,
+        # and thus add the guards for the exact floordivs,
+        # even if tracing doesn't require them otherwise
+        for fd in reversed(floor_divs):
+            base, divisor = fd.args
+            mod_expr = sympy.Mod(base, divisor)
+            eq_expr = sympy.Eq(mod_expr, 0)
+            # add necessary mod guards
+            self.evaluate_expr(eq_expr)
+        return self.simplify(expr)
+
     @lru_cache(256)
-    def evaluate_expr(self, expr: "sympy.Expr"):
+    def evaluate_expr(self, expr: "sympy.Expr", hint=None):
         """
         Given an expression, evaluates it, adding guards if necessary
         """
         if len(expr.free_symbols) == 0:
             return expr
         expr = self.simplify(expr)
+
         static_expr = self._maybe_evaluate_static(expr)
         if static_expr is not None:
             return static_expr
 
+        if not (expr.free_symbols <= self.var_to_val.keys()):
+            # TODO: dedupe this with _maybe_evaluate_static
+            # Attempt to eliminate the unbacked SymInt
+            new_expr = self._maybe_evaluate_static(expr, unbacked_only=True)
+            if not (new_expr.free_symbols <= self.var_to_val.keys()):
+                raise self._make_data_dependent_error(expr.xreplace(self.var_to_val), expr)
+            expr = new_expr
+
+        if hint is None:
+            concrete_val = self.size_hint(expr)
+        else:
+            concrete_val = sympy.sympify(hint)
+
         if isinstance(expr, (sympy.Eq, sympy.Ne)):
-            self._maybe_guard_eq(expr)
+            self._maybe_guard_eq(expr, bool(concrete_val))
             # TODO: If we successfully eliminate a symbol via equality, it
             # is not actually necessary to save a guard for the equality,
             # as we will implicitly generate a guard when we match that
             # input against the symbol
-        concrete_val = self.size_hint(expr)
 
         # TODO: optimize this; avoid formatting traces until we need them
         # NB: drop two frames; evaluate_expr and the Sym* function that
@@ -1165,3 +1918,24 @@ def evaluate_expr(self, expr: "sympy.Expr"):
                 self.guards.append(
                     ShapeGuard(sympy.Eq(expr, concrete_val), stack))  # type: ignore[arg-type]
         return concrete_val
+
+def _should_allocate(user_marked_dynamic, assume_static_by_default):
+    """
+    Mainly here for readability, repurposes the flag name for the context
+    of shape_env, which cares about allocation.
+    """
+    if user_marked_dynamic:
+        return True
+    # If we got here, the user did *NOT* mark this dim as dynamic,
+    # but BC behavior is to allocate a symbol anyway.
+    return not assume_static_by_default
+
+def _is_dim_dynamic(t, d):
+    return hasattr(t, "_dynamo_dynamic_indices") and d in t._dynamo_dynamic_indices
+
+def _is_int(expr):
+    if not isinstance(expr, SymInt):
+        return False
+    if len(expr.node.expr.free_symbols) > 0:
+        return False
+    return True
diff --git a/torch/fx/experimental/unification/core.py b/torch/fx/experimental/unification/core.py
index 32116f93c30f..3a0e572c09eb 100644
--- a/torch/fx/experimental/unification/core.py
+++ b/torch/fx/experimental/unification/core.py
@@ -30,7 +30,7 @@ def _reify(t, s):
 
 @dispatch(dict, dict)  # type: ignore[no-redef]
 def _reify(d, s):
-    return dict((k, reify(v, s)) for k, v in d.items())
+    return {k: reify(v, s) for k, v in d.items()}
 _reify
 
 @dispatch(object, dict)  # type: ignore[no-redef]
diff --git a/torch/fx/experimental/unification/match.py b/torch/fx/experimental/unification/match.py
index a6c5fc3b48e7..c4fd64c64acf 100644
--- a/torch/fx/experimental/unification/match.py
+++ b/torch/fx/experimental/unification/match.py
@@ -4,7 +4,7 @@
 from .unification_tools import groupby, first  # type: ignore[import]
 
 
-class Dispatcher(object):
+class Dispatcher:
     def __init__(self, name):
         self.name = name
         self.funcs = {}
@@ -55,7 +55,7 @@ class VarDispatcher(Dispatcher):
     """
     def __call__(self, *args, **kwargs):
         func, s = self.resolve(args)
-        d = dict((k.token, v) for k, v in s.items())
+        d = {k.token: v for k, v in s.items()}
         return func(**d)
 
 
@@ -86,7 +86,7 @@ def supercedes(a, b):
     s = unify(a, b)
     if s is False:
         return False
-    s = dict((k, v) for k, v in s.items() if not isvar(k) or not isvar(v))
+    s = {k: v for k, v in s.items() if not isvar(k) or not isvar(v)}
     if reify(a, s) == a:
         return True
     if reify(b, s) == b:
@@ -117,5 +117,5 @@ def ordering(signatures):
     for s in signatures:
         if s not in edges:
             edges[s] = []
-    edges = dict((k, [b for a, b in v]) for k, v in edges.items())  # type: ignore[attr-defined, assignment]
+    edges = {k: [b for a, b in v] for k, v in edges.items()}  # type: ignore[attr-defined, assignment]
     return _toposort(edges)
diff --git a/torch/fx/experimental/unification/multipledispatch/conflict.py b/torch/fx/experimental/unification/multipledispatch/conflict.py
index 5aa0c0ed19ed..2eaf6141b18b 100644
--- a/torch/fx/experimental/unification/multipledispatch/conflict.py
+++ b/torch/fx/experimental/unification/multipledispatch/conflict.py
@@ -80,11 +80,11 @@ def ambiguous(a, b):
 def ambiguities(signatures):
     """ All signature pairs such that A is ambiguous with B """
     signatures = list(map(tuple, signatures))
-    return set((a, b) for a in signatures for b in signatures
-               if hash(a) < hash(b)
-               and ambiguous(a, b)
-               and not any(supercedes(c, a) and supercedes(c, b)
-                           for c in signatures))
+    return {(a, b) for a in signatures for b in signatures
+            if hash(a) < hash(b)
+            and ambiguous(a, b)
+            and not any(supercedes(c, a) and supercedes(c, b)
+            for c in signatures)}
 
 
 def super_signature(signatures):
@@ -92,7 +92,7 @@ def super_signature(signatures):
     n = len(signatures[0])
     assert all(len(s) == n for s in signatures)
 
-    return [max([type.mro(sig[i]) for sig in signatures], key=len)[0]
+    return [max((type.mro(sig[i]) for sig in signatures), key=len)[0]
             for i in range(n)]
 
 
@@ -115,5 +115,5 @@ def ordering(signatures):
     for s in signatures:
         if s not in edges:
             edges[s] = []
-    edges = dict((k, [b for a, b in v]) for k, v in edges.items())  # type: ignore[assignment, attr-defined]
+    edges = {k: [b for a, b in v] for k, v in edges.items()}  # type: ignore[assignment, attr-defined]
     return _toposort(edges)
diff --git a/torch/fx/experimental/unification/multipledispatch/dispatcher.py b/torch/fx/experimental/unification/multipledispatch/dispatcher.py
index eb1cbfc94f85..36155260ed33 100644
--- a/torch/fx/experimental/unification/multipledispatch/dispatcher.py
+++ b/torch/fx/experimental/unification/multipledispatch/dispatcher.py
@@ -92,7 +92,7 @@ def variadic_signature_matches(types, full_signature):
     return all(variadic_signature_matches_iter(types, full_signature))
 
 
-class Dispatcher(object):
+class Dispatcher:
     """ Dispatch methods based on type signature
     Use ``dispatch`` to add implementations
     Examples
diff --git a/torch/fx/experimental/unification/multipledispatch/variadic.py b/torch/fx/experimental/unification/multipledispatch/variadic.py
index d9280e93c12c..6d50ff6a65e8 100644
--- a/torch/fx/experimental/unification/multipledispatch/variadic.py
+++ b/torch/fx/experimental/unification/multipledispatch/variadic.py
@@ -1,5 +1,3 @@
-import six
-
 from .utils import typename
 
 __all__ = ["VariadicSignatureType", "isvariadic", "VariadicSignatureMeta", "Variadic"]
@@ -72,7 +70,7 @@ def __getitem__(cls, variadic_type):
         )
 
 
-class Variadic(six.with_metaclass(VariadicSignatureMeta)):
+class Variadic(metaclass=VariadicSignatureMeta):
     """A class whose getitem method can be used to generate a new type
     representing a specific variadic signature.
     Examples
diff --git a/torch/fx/experimental/unification/utils.py b/torch/fx/experimental/unification/utils.py
index 2eda80f4ee86..d74799a714c5 100644
--- a/torch/fx/experimental/unification/utils.py
+++ b/torch/fx/experimental/unification/utils.py
@@ -45,8 +45,8 @@ def _toposort(edges):
     [2] http://en.wikipedia.org/wiki/Toposort#Algorithms
     """
     incoming_edges = reverse_dict(edges)
-    incoming_edges = dict((k, set(val)) for k, val in incoming_edges.items())
-    S = set((v for v in edges if v not in incoming_edges))
+    incoming_edges = {k: set(val) for k, val in incoming_edges.items()}
+    S = ({v for v in edges if v not in incoming_edges})
     L = []
 
     while S:
diff --git a/torch/fx/experimental/unification/variable.py b/torch/fx/experimental/unification/variable.py
index 7da400311b02..d918ec3b6ab4 100644
--- a/torch/fx/experimental/unification/variable.py
+++ b/torch/fx/experimental/unification/variable.py
@@ -6,7 +6,7 @@
 _glv = _global_logic_variables
 
 
-class Var(object):
+class Var:
     """ Logic Variable """
 
     _id = 1
diff --git a/torch/fx/graph.py b/torch/fx/graph.py
index 15a8f607f2cf..51d0f744ce6a 100644
--- a/torch/fx/graph.py
+++ b/torch/fx/graph.py
@@ -273,7 +273,7 @@ class _PyTreeInfo(NamedTuple):
     out_spec: Optional[pytree.TreeSpec]
 
 @compatibility(is_backward_compatible=False)
-class CodeGen(object):
+class CodeGen:
     def __init__(self):
         self._body_transformer: Optional[TransformCodeFunc] = None
 
@@ -451,26 +451,20 @@ def append_stacktrace_summary(node : Node):
                         prev_stacktrace = node.stack_trace
 
                         lines = node.stack_trace.strip().split('\n')
-                        idx = 0
-                        while idx < len(lines):
+                        # stacktrace should have innermost frame last, so we
+                        # iterate backwards to find the first line that starts
+                        # with 'File '
+                        summary_str = ""
+                        for idx in range(len(lines) - 2, -1, -1):
                             line = lines[idx].strip()
-                            if line.startswith('File '):
-                                break
-                            idx += 1
-
-                        summary_lines = []
-                        if idx + 1 < len(lines):
-                            matches = pattern.match(lines[idx].strip())
+                            matches = pattern.match(line)
                             if matches:
                                 file = matches.group(1)
                                 lineno = matches.group(2)
-                                lineage = f'File: {file}:{lineno}'
-                                summary_lines.append(lineage)
-
-                            code = f"code: {lines[idx + 1].strip()}"
-                            summary_lines.append(code)
-
-                        summary_str = ', '.join(summary_lines)
+                                # next line should be the code
+                                code = lines[idx + 1].strip()
+                                summary_str = f'File: {file}:{lineno}, code: {code}'
+                                break
                         body.append(f'\n# {summary_str}\n')
                 elif prev_stacktrace != "":
                     prev_stacktrace = ""
@@ -514,7 +508,7 @@ def emit_node(node : Node):
             elif node.op == 'call_function':
                 assert callable(node.target)
                 # pretty print operators
-                if node.target.__module__ == '_operator' and node.target.__name__ in magic_methods:
+                if getattr(node.target, "__module__", "") == '_operator' and node.target.__name__ in magic_methods:
                     assert isinstance(node.args, tuple)
                     body.append(f'{repr(node)}{maybe_type_annotation} = '
                                 f'{magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}')
@@ -522,7 +516,7 @@ def emit_node(node : Node):
 
                 # pretty print inplace operators; required for jit.script to work properly
                 # not currently supported in normal FX graphs, but generated by torchdynamo
-                if node.target.__module__ == '_operator' and node.target.__name__ in inplace_methods:
+                if getattr(node.target, "__module__", "") == '_operator' and node.target.__name__ in inplace_methods:
                     body.append(f'{inplace_methods[node.target.__name__].format(*(repr(a) for a in node.args))};  '
                                 f'{repr(node)}{maybe_type_annotation} = {repr(node.args[0])}')
                     return
@@ -622,17 +616,49 @@ def process_outputs(self, out: Any) -> Any:
         return pytree.tree_unflatten(out, self.pytree_info.out_spec)
 
     def gen_fn_def(self, free_vars, maybe_return_annotation):
+        # Given a user function/model:
+        #   myargs = [myargs0, myargs1]
+        #   mykwargs = {'mykwargs0': ..., 'mykwargs1': ...}
+        #   def forward(self, mypos, *myargs, mykey=None, **mykwargs):
+        #
+        # The generated code flattens all keywords into positional arguments for `forward()`
+        #   e.g forward(self, mypos, myargs0, myargs1, mykey, mykwargs0, mykwargs1):
+        #
+        # Within `forward`, `tree_flatten_spec``still parses args and kwargs separately
+        #   e.g. tree_flatten_spec(([mypos, myargs0, myargs1],
+        #                           {'mykey':mykey, 'mykwargs0':mykwargs0, 'mykwargs1':mykwargs1}),
+        #                          self._in_spec)
+        #
+        # If the user function/model does not have keywords, the dict is suppressed from tree_flatten_spec
+        #   e.g. tree_flatten_spec([mypos, myargs0, myargs1]), self._in_spec)
         if self.pytree_info is None:
             return super().gen_fn_def(free_vars, maybe_return_annotation)
-        function_args = self.pytree_info.orig_args
-        has_orig_self = (function_args[0] == 'self') if len(function_args) > 0 else False
+
+        fn_args = self.pytree_info.orig_args
+        has_orig_self = (fn_args[0] == 'self') if len(fn_args) > 0 else False
         if has_orig_self:
             free_vars.insert(0, 'self')
-        function_definition = super().gen_fn_def(function_args[:], maybe_return_annotation)
+        fn_definition = super().gen_fn_def(fn_args[:], maybe_return_annotation)
+
         if len(free_vars) > 0:  # pytree has placeholders in it
-            function_definition += f"""
-    {', '.join(free_vars)}, = fx_pytree.tree_flatten_spec([{', '.join(function_args)}], self._in_spec)"""
-        return function_definition
+            # when kwargs is present, in_spec is tuple(args, kwargs)
+            has_args_kwargs_tuple = self.pytree_info.in_spec.type == tuple and \
+                len(self.pytree_info.in_spec.children_specs) == 2 and \
+                self.pytree_info.in_spec.children_specs[0].type == tuple and \
+                self.pytree_info.in_spec.children_specs[1].type == dict
+            fn_kwargs = '{}'
+            fn_signature = f"[{', '.join(fn_args)}], self._in_spec"
+            if has_args_kwargs_tuple:
+                count_args = len(self.pytree_info.in_spec.children_specs[0].children_specs)
+                fn_args = self.pytree_info.orig_args[:count_args]
+                fn_kwargs = '{' + ', '.join(f"'{k}':{v}" for k, v in zip(
+                                  self.pytree_info.in_spec.children_specs[1].context,
+                                  self.pytree_info.orig_args[count_args:])) + '}'
+                fn_signature = f"([{', '.join(fn_args)}], {fn_kwargs}), self._in_spec"
+
+            fn_definition += f"""
+    {', '.join(free_vars)}, = fx_pytree.tree_flatten_spec({fn_signature})"""
+        return fn_definition
 
     def generate_output(self, output_args):
         if self.pytree_info:
@@ -764,8 +790,9 @@ def __deepcopy__(self, memo=None) -> 'Graph':
         output_vals = g.graph_copy(self, val_map=memo, return_output_node=True)
         g._codegen = copy.deepcopy(self._codegen)
         assert isinstance(output_vals, tuple)
-        output_val, old_output_val = output_vals
-        g.output(output_val, type_expr=getattr(old_output_val, 'type', None))
+        output_val, old_output_node = output_vals
+        new_output_node = g.output(output_val, type_expr=getattr(old_output_node, 'type', None))
+        new_output_node.meta = copy.copy(old_output_node.meta)
         return g
 
     @compatibility(is_backward_compatible=True)
diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py
index 644f36b07b74..7e4ff606c3d0 100644
--- a/torch/fx/graph_module.py
+++ b/torch/fx/graph_module.py
@@ -18,11 +18,13 @@
 
 __all__ = ["reduce_graph_module", "reduce_package_graph_module", "reduce_deploy_graph_module", "GraphModule"]
 
+_USER_PRESERVED_ATTRIBUTES_KEY = "_user_preserved_attributes"
+
 # Normal exec loses the source code, however we can work with
 # the linecache module to recover it.
 # Using _exec_with_source will add it to our local cache
 # and then tools like TorchScript will be able to get source info.
-class _EvalCacheLoader(object):
+class _EvalCacheLoader:
     def __init__(self):
         self.eval_cache = {}
         self.next_id = 0
@@ -704,10 +706,27 @@ def __reduce__(self):
     # we need to define deepcopy otherwise it will call __reduce__
     # and cause symbolic tracing to occur every time we try to copy the object
     def __deepcopy__(self, memo):
+        res = type(self).__new__(type(self))
+        memo[id(self)] = res
         fake_mod = torch.nn.Module()
-        fake_mod.__dict__ = copy.deepcopy(self.__dict__)
-        res = GraphModule(fake_mod, fake_mod.__dict__['_graph'])
-        res.meta = copy.deepcopy(getattr(self, 'meta', {}))
+        fake_mod.__dict__ = copy.deepcopy(self.__dict__, memo)
+        GraphModule.__init__(res, fake_mod, fake_mod.__dict__['_graph'])
+        # hooks are lost during `GraphModule.__init__`, so we need to copy over
+        # them explicitly, note right now we are only copying state_dict related
+        # hooks, to reduce bc-related issues, we can copy forward/backward related
+        # hooks in the future as well if needed
+        extra_preserved_attrs = [
+            "_state_dict_hooks",
+            "_load_state_dict_pre_hooks",
+            "_load_state_dict_post_hooks"
+        ]
+        for attr in extra_preserved_attrs:
+            if attr in self.__dict__:
+                setattr(res, attr, copy.deepcopy(self.__dict__[attr], memo))
+        res.meta = copy.deepcopy(getattr(self, 'meta', {}), memo)
+        if _USER_PRESERVED_ATTRIBUTES_KEY in res.meta:
+            for attr_name, attr in res.meta[_USER_PRESERVED_ATTRIBUTES_KEY].items():
+                setattr(res, attr_name, attr)
         return res
 
     def __copy__(self):
diff --git a/torch/fx/interpreter.py b/torch/fx/interpreter.py
index 9001129fdc52..586dd3bf75a5 100644
--- a/torch/fx/interpreter.py
+++ b/torch/fx/interpreter.py
@@ -153,7 +153,7 @@ def run(self, *args, initial_env : Optional[Dict[Node, Any]] = None, enable_io_p
 
     @contextmanager
     def _set_current_node(self, node):
-        with fx_traceback.append_stack_trace(node.stack_trace), fx_traceback.set_current_meta(node.meta):
+        with fx_traceback.set_current_meta(node.meta):
             yield
 
     @compatibility(is_backward_compatible=True)
@@ -457,7 +457,7 @@ def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict
             kwargs (Dict): Dict of keyword arguments for this invocation
         """
         assert isinstance(target, str)
-        return Proxy(self.new_graph.get_attr(target), self.tracer)
+        return self.tracer.create_proxy("get_attr", target, args, kwargs)
 
     @compatibility(is_backward_compatible=True)
     def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
@@ -477,7 +477,7 @@ def transform(self) -> GraphModule:
         Transform ``self.module`` and return the transformed
         ``GraphModule``.
         """
-        with fx_traceback.override_stack_trace():
+        with fx_traceback.preserve_node_meta():
             result = super().run(enable_io_processing=False)
         if result is not None:
             def strip_proxy(a : Union[Argument, Proxy]) -> Any:
diff --git a/torch/fx/node.py b/torch/fx/node.py
index f1bc9b3e0011..6745667a73d6 100644
--- a/torch/fx/node.py
+++ b/torch/fx/node.py
@@ -70,9 +70,15 @@ def _get_qualified_name(func: Callable[..., Any]) -> str:
     # things like getattr just appear in builtins
     if getattr(builtins, func.__name__, None) is func:
         return func.__name__
+    # torch.Tensor.{fn}
+    if isinstance(func, types.MethodDescriptorType) and func is getattr(torch.Tensor, func.__name__, None):
+        return f"torch.Tensor.{func.__name__}"
     name = func.__name__
     module = _find_module_of_method(func)
     module = module.replace('torch._ops', 'torch.ops')  # WAR for bug in how torch.ops assigns module
+    # Fixup segment_reduce mismatch
+    if module == "torch" and name == "segment_reduce":
+        name = "_" + name
     return f'{module}.{name}'
 
 def _format_arg(arg, max_list_len=float('inf')) -> str:
@@ -357,9 +363,13 @@ def update_kwarg(self, key : str, arg : Argument) -> None:
     def stack_trace(self) -> Optional[str]:
         """
         Return the Python stack trace that was recorded during tracing, if any.
-        This property is usually populated by `Tracer.create_proxy`. To record
-        stack traces during tracing for debug purposes, set
-        `record_stack_traces = True` on the `Tracer` instance.
+        When traced with fx.Tracer, this property is usually populated by
+        `Tracer.create_proxy`. To record stack traces during tracing for debug purposes,
+        set `record_stack_traces = True` on the `Tracer` instance.
+        When traced with dynamo, this property will be populated by default by
+        `OutputGraph.create_proxy`.
+
+        stack_trace would have the innermost frame at the end of the string.
         """
         return self.meta.get("stack_trace", None)
 
diff --git a/torch/fx/operator_schemas.py b/torch/fx/operator_schemas.py
index 2129d91a3dfe..e9cee88d01b5 100644
--- a/torch/fx/operator_schemas.py
+++ b/torch/fx/operator_schemas.py
@@ -64,18 +64,29 @@ def _torchscript_type_to_python_type(ts_type : 'torch._C.JitType') -> Any:
     return eval(ts_type.annotation_str, _type_eval_globals)
 
 def _torchscript_schema_to_signature(ts_schema : torch._C.FunctionSchema) -> inspect.Signature:
-    parameters : List[inspect.Parameter] = []
+    from inspect import Parameter
+    parameters : List[Parameter] = []
     for arg in ts_schema.arguments:
         arg_type = _torchscript_type_to_python_type(arg.type)
-        default = arg.default_value if arg.has_default_value() else inspect.Parameter.empty
+        default = arg.default_value if arg.has_default_value() else Parameter.empty
         # TODO: Figure out if this is safe. It seems like when generating the type signatures for
         # PythonArgParser, we emit signatures with `input` instead of `self` as the first tensor
         # argument name. Downstream, if someone converts that positional argument to a keyword
         # argument, the name mismatch will break things, so here we're going to normalize the
         # name to "input"
         name = arg.name if arg.name != 'self' else 'input'
-        kind = inspect.Parameter.KEYWORD_ONLY if arg.kwarg_only else inspect.Parameter.POSITIONAL_OR_KEYWORD
-        parameters.append(inspect.Parameter(name=name, kind=kind, default=default, annotation=arg_type))
+        kind = Parameter.KEYWORD_ONLY if arg.kwarg_only else Parameter.POSITIONAL_OR_KEYWORD
+        # "from" is a keyword therefore it must be a POSITIONAL_ONLY argument
+        if name == "from":
+            assert kind == Parameter.POSITIONAL_OR_KEYWORD
+            # ParameterKind type is internal implementation detail to inspec package
+            # which makes it hard to do type annoation
+            kind = Parameter.POSITIONAL_ONLY  # type: ignore[assignment]
+            # This renders all previous arguments to positional only
+            for idx, p in enumerate(parameters):
+                assert p.kind == Parameter.POSITIONAL_OR_KEYWORD
+                parameters[idx] = Parameter(name=p.name, kind=Parameter.POSITIONAL_ONLY, default=p.default, annotation=p.annotation)
+        parameters.append(Parameter(name=name, kind=kind, default=default, annotation=arg_type))
     return_types = [_torchscript_type_to_python_type(ret.type) for ret in ts_schema.returns]
     if len(return_types) == 0:
         return_type = None
@@ -159,7 +170,7 @@ def get_signature_for_torch_op(op : Callable, return_schemas : bool = False):
 @compatibility(is_backward_compatible=False)
 def create_type_hint(x):
     try:
-        if isinstance(x, list) or isinstance(x, tuple):
+        if isinstance(x, (list, tuple)):
             # todo(chilli): Figure out the right way for mypy to handle this
             if isinstance(x, list):
                 def ret_type(x):
@@ -263,7 +274,7 @@ def normalize_function(
         kwargs = {}
     new_args_and_kwargs = None
     if not isinstance(target, types.BuiltinFunctionType) and not (
-        isinstance(target, OpOverloadPacket) or isinstance(target, OpOverload)
+        isinstance(target, (OpOverloadPacket, OpOverload))
     ):
         target_for_analysis = target
         if target in boolean_dispatched:
@@ -395,7 +406,12 @@ def _args_kwargs_to_normalized_args_kwargs(sig : inspect.Signature, args : Tuple
     supported_parameter_types = {
         inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY}
     if any(p.kind not in supported_parameter_types for p in sig.parameters.values()):
-        return None
+        # Add an exception for one signature, which is common for random/uniform, i.e.:
+        # Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None
+        # `from` is Python keyword and as such functions with that signature should have
+        # positional-only args, but at the same time they could be dispatched as kwargs
+        if list(sig.parameters.keys()) != ['input', 'from', 'to', 'generator']:
+            return None
 
     bound_args = sig.bind(*args, **kwargs)
     bound_args.apply_defaults()
diff --git a/torch/fx/passes/dialect/common/cse_pass.py b/torch/fx/passes/dialect/common/cse_pass.py
index fdfdc791569b..bfbefcae8619 100644
--- a/torch/fx/passes/dialect/common/cse_pass.py
+++ b/torch/fx/passes/dialect/common/cse_pass.py
@@ -11,9 +11,9 @@
 
 
 # stateful ops are banned from CSE
-rand_ops = set([aten.dropout, aten._fused_dropout, aten._standard_gamma, aten.bernoulli, aten.multinomial, aten.native_dropout, aten.normal, aten.poisson, aten.binomial, aten.rrelu, aten.rand_like, aten.rand, aten.randint, aten.randn, aten.randperm])  # noqa: E501
+rand_ops = {aten.dropout, aten._fused_dropout, aten._standard_gamma, aten.bernoulli, aten.multinomial, aten.native_dropout, aten.normal, aten.poisson, aten.binomial, aten.rrelu, aten.rand_like, aten.rand, aten.randint, aten.randn, aten.randperm}  # noqa: E501
 
-inplace_ops = set([aten.add_, aten.sub_, aten.mul_, aten.div_, aten.pow_, aten.lerp_, aten.relu_, aten.sigmoid_, aten.tanh_])  # noqa: E501
+inplace_ops = {aten.add_, aten.sub_, aten.mul_, aten.div_, aten.pow_, aten.lerp_, aten.relu_, aten.sigmoid_, aten.tanh_}  # noqa: E501
 
 
 @torch.fx._compatibility.compatibility(is_backward_compatible=False)
diff --git a/torch/fx/passes/fake_tensor_prop.py b/torch/fx/passes/fake_tensor_prop.py
index 403db5b9a009..9b780d92e933 100644
--- a/torch/fx/passes/fake_tensor_prop.py
+++ b/torch/fx/passes/fake_tensor_prop.py
@@ -34,5 +34,5 @@ def run_node(self, n: Node):
 
     def propagate(self, *args):
         with self._mode:
-            fake_args = [self._mode.from_tensor(a) for a in args]
+            fake_args = [self._mode.from_tensor(a) if isinstance(a, torch.Tensor) else a for a in args]
             return super().run(*fake_args)
diff --git a/torch/fx/passes/graph_drawer.py b/torch/fx/passes/graph_drawer.py
index 3754739c30a6..cbce8f24cd04 100644
--- a/torch/fx/passes/graph_drawer.py
+++ b/torch/fx/passes/graph_drawer.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, print_function, unicode_literals
 
 import hashlib
 import torch
@@ -57,8 +56,7 @@ class FxGraphDrawer:
         Visualize a torch.fx.Graph with graphviz
         Basic usage:
             g = FxGraphDrawer(symbolic_traced, "resnet18")
-            with open("a.svg", "w") as f:
-                f.write(g.get_dot_graph().create_svg())
+            g.get_dot_graph().write_svg("a.svg")
         """
 
         def __init__(
diff --git a/torch/fx/passes/pass_manager.py b/torch/fx/passes/pass_manager.py
index cf002b3611bf..8242ab4c8e65 100644
--- a/torch/fx/passes/pass_manager.py
+++ b/torch/fx/passes/pass_manager.py
@@ -5,6 +5,14 @@
 
 logger = logging.getLogger(__name__)
 
+__all__ = [
+    "PassManager",
+    "inplace_wrapper",
+    "log_hook",
+    "loop_pass",
+    "this_before_that_pass_constraint",
+    "these_before_those_pass_constraint",
+]
 
 # for callables which modify object inplace and return something other than
 # the object on which they act
diff --git a/torch/fx/passes/reinplace.py b/torch/fx/passes/reinplace.py
index 86986a85acc8..bb5839f98cb4 100644
--- a/torch/fx/passes/reinplace.py
+++ b/torch/fx/passes/reinplace.py
@@ -111,7 +111,7 @@ def propagate(self, *args):
         self.multi_output_view_nodes = {}
         self.node_counter = -1
 
-        with FakeTensorMode(allow_meta=True) as mode:
+        with FakeTensorMode() as mode:
             fake_args = [mode.from_tensor(a) for a in args]
             return super().run(*fake_args)
 
@@ -468,10 +468,10 @@ def f(x):
     # so we know not to re-inplace them.
     # NOTE: later, we'll need to add an optimization for fully recovering performance
     # on programs that mutate inputs.
-    input_storages = set(
+    input_storages = {
         StorageWeakRef(
             node.meta['fake_result']._typed_storage()
-        ) for node in gm.graph.nodes if node.op == 'placeholder')
+        ) for node in gm.graph.nodes if node.op == 'placeholder'}
 
 
     # We also need to know for a given node, what are all of its aliasing nodes.
@@ -627,14 +627,14 @@ def replace_arg(a):
                     old_flattened_res, _ = tree_flatten(old.meta['fake_result'])
                     node_flattened_res, _ = tree_flatten(node_to_update.meta['fake_result'])
 
-                    old_res_storage = set(
+                    old_res_storage = {
                         StorageWeakRef(
                             x._typed_storage()
-                        ) for x in old_flattened_res if isinstance(x, FakeTensor))
-                    node_res_storage = set(
+                        ) for x in old_flattened_res if isinstance(x, FakeTensor)}
+                    node_res_storage = {
                         StorageWeakRef(
                             x._typed_storage()
-                        ) for x in node_flattened_res if isinstance(x, FakeTensor))
+                        ) for x in node_flattened_res if isinstance(x, FakeTensor)}
 
                     # This will happen if we're updating a view op, e.g.
                     # e.g. replacing
@@ -648,10 +648,10 @@ def replace_arg(a):
                     # We can't just check equality because we might encounter FX nodes that return zero tensor outputs.
                     if len(old_res_storage) == 1 and len(node_res_storage) == 1 and old_res_storage == node_res_storage:
                         new_flattened_res, _ = tree_flatten(new.meta['fake_result'])
-                        new_res_storage = set(
+                        new_res_storage = {
                             StorageWeakRef(
                                 x._typed_storage()
-                            ) for x in new_flattened_res if isinstance(x, FakeTensor))
+                            ) for x in new_flattened_res if isinstance(x, FakeTensor)}
                         assert len(new_res_storage) == 1
                         (old_ref,) = old_res_storage
                         (new_ref,) = new_res_storage
diff --git a/torch/fx/passes/shape_prop.py b/torch/fx/passes/shape_prop.py
index 2cc11dbd4cd8..d1e9afd5e01f 100644
--- a/torch/fx/passes/shape_prop.py
+++ b/torch/fx/passes/shape_prop.py
@@ -80,7 +80,7 @@ class ShapeProp(torch.fx.Interpreter):
 
         class TwoLayerNet(torch.nn.Module):
             def __init__(self, D_in, H, D_out):
-                super(TwoLayerNet, self).__init__()
+                super().__init__()
                 self.linear1 = torch.nn.Linear(D_in, H)
                 self.linear2 = torch.nn.Linear(H, D_out)
             def forward(self, x):
diff --git a/torch/fx/passes/split_module.py b/torch/fx/passes/split_module.py
index f3b1dd2d0603..d9024816870f 100644
--- a/torch/fx/passes/split_module.py
+++ b/torch/fx/passes/split_module.py
@@ -136,6 +136,31 @@ def forward(self, x, y):
             > self.assertEqual(orig_out, submodules_out)
             True
     """
+
+    def construct_graph(
+        node: torch.fx.node.Node,
+        base_mod_env: Dict[str, torch.fx.node.Node],
+        base_mod_attrs: Dict[str, torch.fx.graph_module.GraphModule],
+    ):
+        if node.op == "placeholder":
+            default_value = (
+                node.args[0] if len(node.args) > 0 else inspect.Signature.empty
+            )
+            base_mod_env[node.name] = base_mod_graph.placeholder(
+                node.target, type_expr=node.type, default_value=default_value
+            )
+            base_mod_env[node.name].meta = node.meta.copy()
+        elif node.op == "get_attr":
+            base_mod_env[node.name] = base_mod_graph.get_attr(node.target)
+            base_mod_env[node.name].meta = node.meta.copy()
+            attr_val = m
+            for atom in node.target.split("."):  # type: ignore[union-attr]
+                if not hasattr(attr_val, atom):
+                    raise AttributeError(f"Node target {node.target} not found!")
+                attr_val = getattr(attr_val, atom)
+            base_mod_attrs[node.target] = attr_val  # type: ignore[index]
+        return base_mod_env, base_mod_attrs
+
     partitions: Dict[str, Partition] = {}
     orig_nodes: Dict[str, torch.fx.node.Node] = {}
 
@@ -236,7 +261,7 @@ def record_cross_partition_use(
                 target_attr = m
                 for atom in target_atoms:
                     if not hasattr(target_attr, atom):
-                        raise RuntimeError(f"Operator target {node.target} not found!")
+                        raise AttributeError(f"Operator target {node.target} not found!")
                     target_attr = getattr(target_attr, atom)
                 # target = target_atoms[-1]
                 target = "_".join(target_atoms)
@@ -260,39 +285,34 @@ def record_cross_partition_use(
             new_node.meta = node.meta.copy()
             partition.environment[node] = new_node
 
+    # original module environment dict mapping node names to nodes
+    org_mod_env: Dict[str, torch.fx.node.Node] = {}
     # Set up values to construct base module
     base_mod_env: Dict[str, torch.fx.node.Node] = {}
     base_mod_graph: torch.fx.graph.Graph = torch.fx.graph.Graph()
     base_mod_attrs: Dict[str, torch.fx.graph_module.GraphModule] = {}
-    for node in m.graph.nodes:
-        if node.op == "placeholder":
-            default_value = (
-                node.args[0] if len(node.args) > 0 else inspect.Signature.empty
-            )
-            base_mod_env[node.name] = base_mod_graph.placeholder(
-                node.target, type_expr=node.type, default_value=default_value
+    if not keep_original_order:
+        for node in m.graph.nodes:
+            base_mod_env, base_mod_attrs = construct_graph(
+                node, base_mod_env, base_mod_attrs
             )
-            base_mod_env[node.name].meta = node.meta.copy()
-        elif node.op == "get_attr":
-            base_mod_env[node.name] = base_mod_graph.get_attr(node.target)
-            base_mod_env[node.name].meta = node.meta.copy()
-            attr_val = m
-            for atom in node.target.split("."):
-                if not hasattr(attr_val, atom):
-                    raise RuntimeError(f"Node target {node.target} not found!")
-                attr_val = getattr(attr_val, atom)
-            base_mod_attrs[node.target] = attr_val
+
+    else:
+        # Go through the graph to construct the mapping dict
+        for node in m.graph.nodes:
+            org_mod_env[node.name] = node
 
     # Do some things iterating over the partitions in topological order again:
     # 1) Finish off submodule Graphs by setting corresponding outputs
     # 2) Construct GraphModules for each submodule
     # 3) Construct the base graph by emitting calls to those submodules in
-    #    topological order
+    #    topological order or original order specified by keep_original_order
 
     construct_order_partitions = (
         sorted_partitions if not keep_original_order else original_partition_order
     )
 
+    already_constructed_attr_nodes = set()
     for partition_name in construct_order_partitions:
         partition = partitions[partition_name]
 
@@ -303,7 +323,20 @@ def record_cross_partition_use(
         output_vals = output_vals[0] if len(output_vals) == 1 else output_vals  # type: ignore[assignment]
         partition.graph.output(output_vals)
 
-        # Construct GraphModule for this partition
+        if keep_original_order:
+            # first get the attr nodes required by this partition
+            org_mod_attr_nodes: List[torch.fx.node.Node] = [
+                org_mod_env[key] for key in partition.inputs
+            ]
+            # Construct GraphModule for this partition
+            for node in org_mod_attr_nodes:  # type: ignore[attr-defined]
+                if node in already_constructed_attr_nodes:
+                    continue
+                base_mod_env, base_mod_attrs = construct_graph(
+                    node, base_mod_env, base_mod_attrs
+                )
+                already_constructed_attr_nodes.add(node)
+
         base_mod_attrs[partition.submod_name] = torch.fx.graph_module.GraphModule(
             partition.targets, partition.graph
         )  # noqa: B950
diff --git a/torch/fx/passes/splitter_base.py b/torch/fx/passes/splitter_base.py
index 0f357c38dcb7..f2c45ab5acd5 100644
--- a/torch/fx/passes/splitter_base.py
+++ b/torch/fx/passes/splitter_base.py
@@ -43,12 +43,14 @@ def __init__(
     ):
         parser = argparse.ArgumentParser()
         parser.add_argument(
+            "--min-acc-module-size",
             "--min_acc_module_size",
             required=False,
             type=int,
             help="Minimum size limit of an accelerator subgraph.",
         )
         parser.add_argument(
+            "--skip-fusion",
             "--skip_fusion",
             default=False,
             action="store_true",
@@ -58,6 +60,7 @@ def __init__(
             "can reduce overhead.",
         )
         parser.add_argument(
+            "--allow-non-tensor",
             "--allow_non_tensor",
             default=False,
             action="store_true",
@@ -226,7 +229,7 @@ def generate_inputs_for_submodules(
 
     handles = []
     results = {}
-    submodule_to_names = dict((mod, name) for name, mod in model.named_modules())
+    submodule_to_names = {mod: name for name, mod in model.named_modules()}
 
     def pre_forward(module, module_inputs):
         results[submodule_to_names[module]] = copy.deepcopy(module_inputs) if deepcopy else module_inputs
diff --git a/torch/fx/passes/utils/fuser_utils.py b/torch/fx/passes/utils/fuser_utils.py
index 9eddc2befd04..e6b6cd770065 100644
--- a/torch/fx/passes/utils/fuser_utils.py
+++ b/torch/fx/passes/utils/fuser_utils.py
@@ -50,24 +50,37 @@ def validate_partition(partition: NodeList) -> bool:
                 # external user node, need to expose as an output
                 outputs.append(user_node)
 
-    # perform DFS on the parition outputs
-    # if it reaches a node within the partition, then it found a cycle
-    visited: NodeSet = set()
-
-    def dfs_find_cycle(node):
-        if node in partition_set:
-            return True  # found cycle, return
-
-        visited.add(node)
-        for user_node in node.users:
-            if user_node not in visited:
-                if dfs_find_cycle(user_node):
-                    return True
+    # Perform BFS on the partition outputs.
+    # If it reaches a node within the partition, then it found a cycle.
+    # This function takes the ownership of `root_nodes` and may modify it.
+    def bfs_find_cycle(root_nodes: NodeList) -> bool:
+        # Set used to exclude nodes that have already been visited.
+        # If a node has been visited, that node and all its children have
+        # been checked for cycles.
+        visited: NodeSet = set()
+
+        # Start with `root_nodes` and traverse through (toward child nodes)
+        # their connected sub-graph. Nodes in `visited` won't be added
+        # to `queue` again.
+        queue: NodeList = root_nodes
+        while queue:
+            current = queue.pop()
+            visited.add(current)
+            if current in partition_set:
+                # Started from partition's `output` nodes, and reached
+                # another node in partition. Cycle!
+                return True
+            for user_node in current.users:
+                if user_node in visited:
+                    continue
+                queue.append(user_node)
+        # `root_nodes` don't cause cycle.
         return False
 
-    for output_node in outputs:
-        if dfs_find_cycle(output_node):
-            return False
+    # Use all output nodes as roots to traverse
+    # the graph to check cycles.
+    if bfs_find_cycle(outputs):
+        return False
 
     return True
 
diff --git a/torch/fx/passes/utils/matcher_utils.py b/torch/fx/passes/utils/matcher_utils.py
index 5bcb1bad0050..abf439824df5 100644
--- a/torch/fx/passes/utils/matcher_utils.py
+++ b/torch/fx/passes/utils/matcher_utils.py
@@ -5,7 +5,7 @@
 from torch.fx.graph import Graph
 from torch.fx.node import Node
 from torch.fx._compatibility import compatibility
-from typing import Dict, List, Set, Any
+from typing import Dict, List, Set, Any, Union, Tuple
 import logging
 import os
 
@@ -158,7 +158,7 @@ def _remove_overlapping_matches(self, matches: List[InternalMatch]) -> List[Inte
                         nodes_matched.add(gn)
         return non_overlapping_matches
 
-    def _match_args(self, pn: Any, gn: Any, match: InternalMatch) -> bool:
+    def _match_literals(self, pn: Any, gn: Any, match: InternalMatch) -> bool:
         assert not (isinstance(pn, Node) and isinstance(gn, Node)), "pn and gn cannot both be Node"
 
         if isinstance(pn, Node) and not isinstance(gn, Node):
@@ -198,6 +198,8 @@ def _match_nodes(self, pn: Node, gn: Node, match: InternalMatch) -> bool:
         saved_match = copy.copy(match)
         match.nodes_map[pn] = gn
 
+        # Placeholder is a wildcard and can be matched with any python object
+        # (including list/tuple)
         if pn.op == "placeholder":
             return True
 
@@ -205,40 +207,34 @@ def _match_nodes(self, pn: Node, gn: Node, match: InternalMatch) -> bool:
         # match for `gn`
         match_found = True
 
-        def flatten_args(args) -> List[Any]:
-            # Recursively flatten args
-            result : List[Any] = []
-            for arg in args:
-                # flatten the list, if only it's a list/tuple of nodes
-                if isinstance(arg, (list, tuple)) and len(arg) > 0 and isinstance(arg[0], Node):
-                    result.extend(flatten_args(arg))
+        def _match_args(args1: Union[List, Tuple], args2: Union[List, Tuple]) -> bool:
+            if len(args1) != len(args2):
+                return False
+
+            for a1, a2 in zip(args1, args2):
+                if isinstance(a1, Node) and isinstance(a2, Node):
+                    matched = self._match_nodes(a1, a2, match)
+                elif isinstance(a1, (list, tuple)) and isinstance(a2, (list, tuple)):
+                    matched = _match_args(a1, a2)
                 else:
-                    result.append(arg)
+                    matched = self._match_literals(a1, a2, match)
+
+                if not matched:
+                    return False
 
-            return result
+            return True
 
-        pn_flatten_args = flatten_args(pn.args)
-        gn_flatten_args = flatten_args(gn.args)
+        match_found = match_found and _match_args(pn.args, gn.args)
 
+        pn_kwargs, gn_kwargs = [], []
         if pn.kwargs.keys() == gn.kwargs.keys():
             for key in pn.kwargs.keys():
-                pn_flatten_args.append(pn.kwargs[key])
-                gn_flatten_args.append(gn.kwargs[key])
+                pn_kwargs.append(pn.kwargs[key])
+                gn_kwargs.append(gn.kwargs[key])
         else:
             match_found = False
 
-        if match_found and len(pn_flatten_args) == len(gn_flatten_args):
-            for pn_, gn_ in zip(pn_flatten_args, gn_flatten_args):
-                if isinstance(gn_, Node) and isinstance(pn_, Node):
-                    matched = self._match_nodes(pn_, gn_, match)
-                else:
-                    matched = self._match_args(pn_, gn_, match)
-
-                if not matched:
-                    match_found = False
-                    break
-        else:
-            match_found = False
+        match_found = match_found and _match_args(pn_kwargs, gn_kwargs)
 
         if not match_found:
             # revert to saved_match before matching with current node
diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py
index cb538392ed41..4c933a15a326 100644
--- a/torch/fx/proxy.py
+++ b/torch/fx/proxy.py
@@ -1,5 +1,6 @@
 import dis
 import copy
+import sys
 import torch
 import inspect
 import operator
@@ -19,7 +20,7 @@
 
 
 @compatibility(is_backward_compatible=False)
-class Scope(object):
+class Scope:
     """ Scope object that records the module path and the module type
     of a module. Scope is used to track the information of the module
     that contains a Node in a Graph of GraphModule. For example::
@@ -50,7 +51,7 @@ def __init__(self, module_path: str, module_type: Any):
 
 
 @compatibility(is_backward_compatible=False)
-class ScopeContextManager(object):
+class ScopeContextManager:
     """ A context manager to track the Scope of Node during symbolic tracing.
     When entering a forward function of a Module, we'll update the scope information of
     the current module, and when we exit, we'll restore the previous scope information.
@@ -125,7 +126,24 @@ def create_node(self, kind : str, target : Target,
             self.scope.module_path,
             self.scope.module_type,
         )
-        if self.module_stack:
+        # Optionally set stack trace on the created Node for debugging purposes
+        if fx_traceback.has_preserved_node_meta():
+            current_meta: Dict[str, Any] = fx_traceback.get_current_meta()
+
+            # Explicitly set the stack_trace, nn_module_stack and source_fn on the node.meta
+            # If other meta fields are needed, they can be added here
+            stack_trace = current_meta.get("stack_trace")
+            if stack_trace:
+                node.stack_trace = stack_trace
+
+            nn_module_stack = current_meta.get("nn_module_stack")
+            if nn_module_stack:
+                node.meta["nn_module_stack"] = nn_module_stack
+
+            source_fn = current_meta.get("source_fn")
+            if source_fn:
+                node.meta["source_fn"] = source_fn
+        elif self.module_stack:
             node.meta['nn_module_stack'] = copy.copy(self.module_stack)
         return node
 
@@ -159,17 +177,12 @@ def create_proxy(self, kind: str, target: Target, args: Tuple[Any, ...], kwargs:
         else:
             proxy = proxy_factory_fn(node)
 
-        # Optionally set stack trace on the created Node for debugging purposes
-        if fx_traceback.is_stack_trace_overridden():
-            proxy.node.meta = fx_traceback.get_current_meta()
-            stacks = fx_traceback.format_stack()
-            proxy.node.stack_trace = '\n'.join(reversed(stacks))
-        elif self.record_stack_traces:
+        if self.record_stack_traces and not proxy.node.stack_trace:
             user_frame = self._find_user_frame()
             if user_frame:
-                walk_stack_gen = traceback.walk_stack(user_frame)
-                summary = traceback.StackSummary.extract(walk_stack_gen)  # type: ignore[arg-type]
+                summary = traceback.extract_stack(user_frame)
                 tb_lines = summary.format()
+                # stack_trace would have innermost frame at the bottom
                 proxy.node.stack_trace = ''.join(tb_lines)
 
         return proxy
@@ -358,7 +371,13 @@ def __iter__(self) -> Iterable['Proxy']:
         assert frame is not None
         calling_frame = frame.f_back
         assert calling_frame is not None
-        inst = list(dis.get_instructions(calling_frame.f_code))[calling_frame.f_lasti // 2]
+        inst_list = list(dis.get_instructions(calling_frame.f_code))
+        if sys.version_info >= (3, 11):
+            from bisect import bisect_left
+            inst_idx = bisect_left(inst_list, calling_frame.f_lasti, key=lambda x: x.offset)
+        else:
+            inst_idx = calling_frame.f_lasti // 2
+        inst = inst_list[inst_idx]
         if inst.opname == 'UNPACK_SEQUENCE':
             return (self[i] for i in range(inst.argval))  # type: ignore[index]
 
@@ -373,7 +392,11 @@ def __bool__(self) -> bool:
             calling_frame = frame.f_back
             assert calling_frame is not None
             insts = list(dis.get_instructions(calling_frame.f_code))
-            cur = calling_frame.f_lasti // 2
+            if sys.version_info >= (3, 11):
+                from bisect import bisect_left
+                cur = bisect_left(insts, calling_frame.f_lasti, key=lambda x: x.offset)
+            else:
+                cur = calling_frame.f_lasti // 2
             inst = insts[cur]
 
             if inst.opname == 'POP_JUMP_IF_TRUE':
diff --git a/torch/fx/traceback.py b/torch/fx/traceback.py
index 54a2c46c237d..2610b24909ad 100644
--- a/torch/fx/traceback.py
+++ b/torch/fx/traceback.py
@@ -1,66 +1,49 @@
 import traceback
 from contextlib import contextmanager
-from typing import Optional, List, Any, Dict
+from typing import List, Any, Dict
 from ._compatibility import compatibility
 
-__all__ = ['override_stack_trace', 'set_stack_trace', 'append_stack_trace', 'format_stack',
-           'is_stack_trace_overridden', 'get_current_meta', 'set_current_meta']
+__all__ = ['preserve_node_meta', 'has_preserved_node_meta',
+           'set_stack_trace', 'format_stack',
+           'set_current_meta', 'get_current_meta']
 
-
-current_stack: List[str] = []
 current_meta: Dict[str, Any] = {}
-is_overridden = False
+should_preserve_node_meta = False
 
 
 @compatibility(is_backward_compatible=False)
 @contextmanager
-def override_stack_trace():
-    global is_overridden
+def preserve_node_meta():
+    global should_preserve_node_meta
 
-    saved_is_overridden = is_overridden
+    saved_should_preserve_node_meta = should_preserve_node_meta
     try:
-        is_overridden = True
+        should_preserve_node_meta = True
         yield
     finally:
-        is_overridden = saved_is_overridden
-
-@compatibility(is_backward_compatible=False)
-def set_stack_trace(stack : List[str]):
-    global current_stack
+        should_preserve_node_meta = saved_should_preserve_node_meta
 
-    if is_overridden and stack:
-        current_stack = stack
 
 @compatibility(is_backward_compatible=False)
-@contextmanager
-def append_stack_trace(stack : Optional[str]):
-    """
-    The content of stack here is an entire stacktraces as a string
-    """
-    global current_stack
+def set_stack_trace(stack : List[str]):
+    global current_meta
 
-    if is_overridden and stack:
-        try:
-            current_stack.append(stack)
-            yield
-        finally:
-            current_stack.pop()
-    else:
-        yield
+    if should_preserve_node_meta and stack:
+        current_meta["stack_trace"] = "".join(stack)
 
 
 @compatibility(is_backward_compatible=False)
 def format_stack() -> List[str]:
-    if is_overridden:
-        return current_stack.copy()
+    if should_preserve_node_meta:
+        return [current_meta.get("stack_trace", "")]
     else:
         # fallback to traceback.format_stack()
         return traceback.format_list(traceback.extract_stack()[:-1])
 
 
 @compatibility(is_backward_compatible=False)
-def is_stack_trace_overridden() -> bool:
-    return is_overridden
+def has_preserved_node_meta() -> bool:
+    return should_preserve_node_meta
 
 
 @compatibility(is_backward_compatible=False)
@@ -68,13 +51,13 @@ def is_stack_trace_overridden() -> bool:
 def set_current_meta(meta : Dict[str, Any]):
     global current_meta
 
-    old_meta = current_meta
-    if is_overridden and meta:
+    if should_preserve_node_meta and meta:
+        saved_meta = current_meta
         try:
             current_meta = meta
             yield
         finally:
-            current_meta = old_meta
+            current_meta = saved_meta
     else:
         yield
 
diff --git a/torch/hub.py b/torch/hub.py
index 19df8b0f33c6..36be9129728a 100644
--- a/torch/hub.py
+++ b/torch/hub.py
@@ -17,7 +17,7 @@
 from urllib.parse import urlparse  # noqa: F401
 from torch.serialization import MAP_LOCATION
 
-class _Faketqdm(object):  # type: ignore[no-redef]
+class _Faketqdm:  # type: ignore[no-redef]
 
     def __init__(self, total=None, disable=False,
                  unit=None, *args, **kwargs):
diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py
index 50877b122137..e9c6bc3e2db5 100644
--- a/torch/jit/__init__.py
+++ b/torch/jit/__init__.py
@@ -10,6 +10,8 @@
 from torch._jit_internal import (
     Final,
     Future,
+    _Await,
+    _drop,
     _IgnoreContextManager,
     _overload,
     _overload_method,
@@ -48,6 +50,7 @@
     _get_trace_graph,
 )
 from torch.jit._async import fork, wait
+from torch.jit._await import _awaitable, _awaitable_wait, _awaitable_nowait
 from torch.jit._decomposition_utils import _register_decomposition
 from torch.jit._serialization import (
     save,
@@ -175,7 +178,7 @@ def isinstance(obj, target_type):
 
         class MyModule(torch.nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
 
             def forward(self, input: Any): # note the Any type
                 if torch.jit.isinstance(input, List[torch.Tensor]):
@@ -193,7 +196,7 @@ def forward(self, input: Any): # note the Any type
     """
     return _isinstance(obj, target_type)
 
-class strict_fusion(object):
+class strict_fusion:
     """
     This class errors if not all nodes have been fused in
     inference, or symbolically differentiated in training.
diff --git a/torch/jit/_await.py b/torch/jit/_await.py
new file mode 100644
index 000000000000..d0df60d72405
--- /dev/null
+++ b/torch/jit/_await.py
@@ -0,0 +1,31 @@
+import torch
+
+from torch.utils import set_module
+from torch.jit._builtins import _register_builtin
+from torch._jit_internal import _Await
+
+set_module(_Await, "torch.jit")
+
+def _awaitable(func, *args, **kwargs):
+    r"""
+    Creates Await object that will call specified functioni with specified args,
+    when it is requested for the result.
+    """
+    return torch._C._awaitable(func, *args, **kwargs)
+
+def _awaitable_wait(aw):
+    r"""
+    Requests await the result of execution, if Await is not completed yet,
+    the func will be called immediately.
+    """
+    return torch._C._awaitable_wait(aw)
+
+def _awaitable_nowait(o):
+    r"""
+    Creates completed Await with specified result.
+    """
+    return torch._C._awaitable_nowait(o)
+
+
+_register_builtin(_awaitable_wait, "prim::awaitable_wait")
+_register_builtin(_awaitable_nowait, "prim::awaitable_nowait")
diff --git a/torch/jit/_builtins.py b/torch/jit/_builtins.py
index 509957371e7d..777a531d077d 100644
--- a/torch/jit/_builtins.py
+++ b/torch/jit/_builtins.py
@@ -117,7 +117,7 @@ def _gen_torch_functional_registered_ops():
     # some functions directly map to their aten:: implementations.
     # TODO: add support for more ops
     ops = ["stft", "istft", "lu", "cdist", "norm", "unique", "unique_consecutive", "tensordot"]
-    return set(getattr(torch.functional, name) for name in ops)
+    return {getattr(torch.functional, name) for name in ops}
 
 _functional_registered_ops = _gen_torch_functional_registered_ops()
 
@@ -135,6 +135,9 @@ def register_all(mod):
         for name in dir(mod):
             v = getattr(mod, name)
             if callable(v) and not _is_special_functional_bound_op(v) and v is not torch.no_grad and v is not torch.autocast:
+                # Fixup inconsistency in segment_reduce
+                if name == "_segment_reduce":
+                    name = name[1:]
                 _builtin_ops.append((v, "aten::" + name))
     for mod in _modules_containing_builtins:
         register_all(mod)
diff --git a/torch/jit/_check.py b/torch/jit/_check.py
index 492de8a8a09b..9d8557d9d2c5 100644
--- a/torch/jit/_check.py
+++ b/torch/jit/_check.py
@@ -1,7 +1,6 @@
 
 import ast
 import inspect
-import sys
 import textwrap
 import torch
 import warnings
@@ -59,9 +58,6 @@ def forward(self, x: List[int]):
     """
 
     def check(self, nn_module: torch.nn.Module) -> None:
-        # Check if we have a Python version <3.8
-        self.using_deprecated_ast: bool = sys.version_info < (3, 8)
-
         source_lines = inspect.getsource(nn_module.__class__.__init__)
 
         # Ignore comments no matter the indentation
@@ -99,12 +95,7 @@ def _is_empty_container(self, node: ast.AST, ann_type: str) -> bool:
         elif ann_type == "Optional":
             # Assigning `None` to an `Optional` type gives you a
             # Node where value=Constant(value=None, kind=None)
-            # or, in Python <3.8, value=NameConstant(value=None)
-            if (not self.using_deprecated_ast
-                    and not isinstance(node, ast.Constant)):
-                return False
-            if (self.using_deprecated_ast
-                    and not isinstance(node, ast.NameConstant)):
+            if not isinstance(node, ast.Constant):
                 return False
             if node.value:  # type: ignore[attr-defined]
                 return False
diff --git a/torch/jit/_dataclass_impls.py b/torch/jit/_dataclass_impls.py
index 4daf347db2b3..6adfa4f70100 100644
--- a/torch/jit/_dataclass_impls.py
+++ b/torch/jit/_dataclass_impls.py
@@ -7,7 +7,6 @@
 import ast
 import dataclasses
 import inspect
-import sys
 
 def _get_fake_filename(cls, method_name):
     return os.path.join(FAKE_FILENAME_PREFIX, cls.__name__, method_name)
@@ -56,19 +55,18 @@ def synthesize__init__(cls) -> ParsedDef:
     # Handle InitVars if needed (only works on Python 3.8+, when a `type` attribute was added to InitVar);
     # see CPython commit here https://github.com/python/cpython/commit/01ee12ba35a333e8a6a25c4153c4a21838e9585c
     init_vars: List[str] = []
-    if sys.version_info >= (3, 8):
-        params = []
-        for name, param in signature.parameters.items():
-            ann = param.annotation
-
-            if isinstance(ann, dataclasses.InitVar):
-                # The TorchScript interpreter can't handle InitVar annotations, so we unwrap the underlying type here
-                init_vars.append(name)
-                params.append(param.replace(annotation=ann.type))   # type: ignore[attr-defined]
-            else:
-                params.append(param)
-
-        signature = signature.replace(parameters=params)
+    params = []
+    for name, param in signature.parameters.items():
+        ann = param.annotation
+
+        if isinstance(ann, dataclasses.InitVar):
+            # The TorchScript interpreter can't handle InitVar annotations, so we unwrap the underlying type here
+            init_vars.append(name)
+            params.append(param.replace(annotation=ann.type))   # type: ignore[attr-defined]
+        else:
+            params.append(param)
+
+    signature = signature.replace(parameters=params)
 
     body = [
         # Assign all attributes to self
diff --git a/torch/jit/_decompositions.py b/torch/jit/_decompositions.py
index b939584f36d3..6d50d534c957 100644
--- a/torch/jit/_decompositions.py
+++ b/torch/jit/_decompositions.py
@@ -5,8 +5,8 @@
 aten = torch.ops.aten
 from typing import Optional, List, Dict, Set
 import inspect
-from torch.fx.operator_schemas import get_signature_for_torch_op
 import warnings
+from torch.types import Number
 
 decomposition_table: Dict[str, torch.jit.ScriptFunction] = {}
 function_name_set: Set[str] = set()
@@ -58,18 +58,7 @@ def decomposition_decorator(f):
         if registry is None:
             registry = decomposition_table
 
-        check_decomposition_has_type_annotations(f)
-
-        torch_op_sigs, torch_op_schemas = get_signature_for_torch_op(aten_op, return_schemas=True)
-        decomposition_sig = inspect.signature(f)
-
-        found_index = None
-        for i, torch_op_sig in enumerate(torch_op_sigs):
-            if signatures_match(decomposition_sig, torch_op_sig):
-                found_index = i
-                break
-
-        assert found_index is not None, "Could not find matching signature: " + str(f)
+        assert isinstance(aten_op, torch._ops.OpOverload)
 
         # Need unique name for jit function serialization
         assert f.__name__ not in function_name_set, "Duplicated function name {}".format(f.__name__)
@@ -82,15 +71,16 @@ def decomposition_decorator(f):
             torch._C._jit_pass_peephole(scripted_func.graph)
             torch._C._jit_pass_constant_propagation(scripted_func.graph)
 
-        registry[str(torch_op_schemas[found_index])] = scripted_func
+        registry[str(aten_op._schema)] = scripted_func
         return f
 
     return decomposition_decorator
 
 # TODO: replace torch.sigmoid -> aten.sigmoid
 
-@register_decomposition(aten.var)
-def var_decomposition(input: Tensor, dim: Optional[List[int]] = None, correction: Optional[int] = None,
+@register_decomposition(aten.var.correction)
+def var_decomposition(input: Tensor, dim: Optional[List[int]] = None,
+                      correction: Optional[Number] = None,
                       keepdim: bool = False) -> Tensor:
     if dim is None:
         dim_i: List[int] = []
@@ -108,11 +98,18 @@ def var_decomposition(input: Tensor, dim: Optional[List[int]] = None, correction
     sq = sub * sub
     sum = aten.sum(sq, dim, keepdim)
 
-    if correction is not None:
-        n = n - correction
+    if correction is None:
+        denom = float(n - 1)
+    else:
+        if isinstance(correction, int):
+            denom = float(n - correction)
+        elif isinstance(correction, float):
+            denom = float(n) - correction
+        else:
+            raise RuntimeError("correction must be int or float")
 
-    return sum / n
+    return sum / max(0, denom)
 
-@register_decomposition(aten.var)
+@register_decomposition(aten.var.default)
 def var(input: Tensor, unbiased: bool = True) -> Tensor:
     return var_decomposition(input, correction=(1 if unbiased else 0))
diff --git a/torch/jit/_freeze.py b/torch/jit/_freeze.py
index af0a132ee0e7..0db888f6411d 100644
--- a/torch/jit/_freeze.py
+++ b/torch/jit/_freeze.py
@@ -40,7 +40,7 @@ def freeze(mod, preserved_attrs: Optional[List[str]] = None, optimize_numerics:
         import torch
         class MyModule(torch.nn.Module):
             def __init__(self, N, M):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.weight = torch.nn.Parameter(torch.rand(N, M))
                 self.linear = torch.nn.Linear(N, M)
 
@@ -62,7 +62,7 @@ def forward(self, input):
         import torch
         class MyModule2(torch.nn.Module):
             def __init__(self):
-                super(MyModule2, self).__init__()
+                super().__init__()
                 self.modified_tensor = torch.tensor(10.)
                 self.version = 1
 
diff --git a/torch/jit/_ir_utils.py b/torch/jit/_ir_utils.py
index dd2d72880431..9e4596de7758 100644
--- a/torch/jit/_ir_utils.py
+++ b/torch/jit/_ir_utils.py
@@ -1,7 +1,7 @@
 import torch
 from typing import Union
 
-class _InsertPoint(object):
+class _InsertPoint:
     def __init__(self, insert_point_graph: torch._C.Graph, insert_point: Union[torch._C.Node, torch._C.Block]):
         self.insert_point = insert_point
         self.g = insert_point_graph
diff --git a/torch/jit/_recursive.py b/torch/jit/_recursive.py
index 02516d7a2ac7..5d3a1c5c5d0c 100644
--- a/torch/jit/_recursive.py
+++ b/torch/jit/_recursive.py
@@ -89,7 +89,7 @@ def jit_ignored_properties(module):
     user_annotated_ignored_attributes = getattr(module, "__jit_ignored_attributes__", list())
 
     def get_properties_names(module):
-        return set(k for k, v in vars(module).items() if isinstance(v, property))
+        return {k for k, v in vars(module).items() if isinstance(v, property)}
 
     properties = get_properties_names(type(module))
     user_annoted_ignored_properties = set()
@@ -108,7 +108,7 @@ def get_properties_names(module):
 def _get_valid_constant(attr, v, owner_type):
     if isinstance(v, _constant_types):
         return v
-    elif isinstance(v, tuple) or isinstance(v, list):
+    elif isinstance(v, (tuple, list)):
         return tuple(_get_valid_constant(attr, x, owner_type) for x in v)
     constants = ", ".join(torch.typename(typ) for typ in _constant_types)
     raise TypeError(textwrap.dedent("""
@@ -122,7 +122,7 @@ def _get_valid_constant(attr, v, owner_type):
 
 class SourceContext(torch._C._jit_tree_views.SourceRangeFactory):
     def __init__(self, source, filename, file_lineno, leading_whitespace_len):
-        super(SourceContext, self).__init__(source, filename, file_lineno, leading_whitespace_len)
+        super().__init__(source, filename, file_lineno, leading_whitespace_len)
 
 
 def infer_concrete_type_builder(nn_module, share_types=True):
@@ -351,7 +351,7 @@ def infer_type(name, item):
 
     return concrete_type_builder
 
-class ConcreteTypeStore(object):
+class ConcreteTypeStore:
     type_store: Dict[Type[Module], List[torch._C.ConcreteModuleType]]
     methods_compiled: Set[torch._C.ConcreteModuleType]
 
diff --git a/torch/jit/_script.py b/torch/jit/_script.py
index acafd9997483..fd0fa1f22a05 100644
--- a/torch/jit/_script.py
+++ b/torch/jit/_script.py
@@ -23,7 +23,6 @@
 from torch.nn import Module
 from torch.jit._state import _enabled
 from torch.jit._builtins import _register_builtin
-from torch._six import with_metaclass
 from torch.jit.frontend import get_jit_def, get_default_args, get_jit_class_def
 from torch._jit_internal import _qualified_name
 from torch.jit._fuser import _graph_for, _script_method_graph_for
@@ -97,7 +96,7 @@ def Attribute(value, type):  # type: ignore[no-redef]
 
         class AttributeModule(torch.jit.ScriptModule):
             def __init__(self):
-                super(AttributeModule, self).__init__()
+                super().__init__()
                 self.foo = torch.jit.Attribute(0.1, float)
 
                 # we should be able to use self.foo as a float here
@@ -128,7 +127,7 @@ class AttributeModule(torch.nn.Module):
             names: Dict[str, int]
 
             def __init__(self):
-                super(AttributeModule, self).__init__()
+                super().__init__()
                 self.names = {}
 
         m = AttributeModule()
@@ -177,7 +176,7 @@ def _is_new_style_class(cls):
 #  len(view)
 
 
-class OrderedDictWrapper(object):
+class OrderedDictWrapper:
     def __init__(self, _c):
         self._c = _c
 
@@ -215,7 +214,7 @@ def __getitem__(self, k):
 
 class OrderedModuleDict(OrderedDictWrapper):
     def __init__(self, module, python_dict):
-        super(OrderedModuleDict, self).__init__(torch._C.ModuleDict(module))
+        super().__init__(torch._C.ModuleDict(module))
         # contains _both_ script modules and non-script python-only modules
 
         # because script modules are subclassed in python and the
@@ -321,7 +320,7 @@ def make_stubs(module):
         super(ScriptMeta, cls).__init__(name, bases, attrs)
 
 
-class _CachedForward(object):
+class _CachedForward:
     def __get__(self, obj, cls):
         return self.__getattr__("forward")  # type: ignore[attr-defined]
 
@@ -411,7 +410,7 @@ def unpackage_script_module(importer: PackageImporter, script_module_id: str) ->
         "__exit__",
     ]
 
-    class RecursiveScriptClass(object):
+    class RecursiveScriptClass:
         """
         An analogue of RecursiveScriptModule for regular objects that are not modules.
         This class is a wrapper around a torch._C.ScriptObject that represents an instance
@@ -424,7 +423,7 @@ class RecursiveScriptClass(object):
                 exposed on this wrppaer.
         """
         def __init__(self, cpp_class):
-            super(RecursiveScriptClass, self).__init__()
+            super().__init__()
             self.__dict__["_initializing"] = True
             self._c = cpp_class
 
@@ -435,7 +434,7 @@ def __init__(self, cpp_class):
 
         def __getattr__(self, attr):
             if "_initializing" in self.__dict__ and self.__dict__["_initializing"]:
-                return super(RecursiveScriptClass, self).__getattr__(attr)  # type: ignore[misc]
+                return super().__getattr__(attr)  # type: ignore[misc]
 
             if attr in self._props:
                 return self._props[attr].fget()  # type: ignore[call-arg, misc]
@@ -444,7 +443,7 @@ def __getattr__(self, attr):
 
         def __setattr__(self, attr, value):
             if "_initializing" in self.__dict__ and self.__dict__["_initializing"]:
-                return super(RecursiveScriptClass, self).__setattr__(attr, value)
+                return super().__setattr__(attr, value)
 
             if attr in self._props:
                 return self._props[attr].fset(value)  # type: ignore[call-arg, misc]
@@ -484,7 +483,7 @@ def method_template(self, *args, **kwargs):
     # did nothing, __getattr__ would not be called. Instead we'd get nn.Module.forward
     # which always throws an exception.
 
-    class ScriptModule(with_metaclass(ScriptMeta, Module)):  # type: ignore[misc]
+    class ScriptModule(Module, metaclass=ScriptMeta):
         r"""
         A wrapper around C++ ``torch::jit::Module``. ``ScriptModule``\s
         contain methods, attributes, parameters, and
@@ -493,13 +492,13 @@ class ScriptModule(with_metaclass(ScriptMeta, Module)):  # type: ignore[misc]
         __jit_unused_properties__ = ['code', 'code_with_constants', 'graph', 'inlined_graph', 'original_name']
 
         def __init__(self):
-            super(ScriptModule, self).__init__()
+            super().__init__()
 
-        forward = _CachedForward()
+        forward: Callable[..., Any] = _CachedForward()  # type: ignore[assignment]
 
         def __getattr__(self, attr):
             if "_actual_script_module" not in self.__dict__:
-                return super(ScriptModule, self).__getattr__(attr)
+                return super().__getattr__(attr)
             return getattr(self._actual_script_module, attr)
 
         def __setattr__(self, attr, value):
@@ -518,7 +517,7 @@ def __setattr__(self, attr, value):
                         self.__class__.__annotations__ = {}
                     self.__annotations__[attr] = value.type
                     value = value.value
-                return super(ScriptModule, self).__setattr__(attr, value)
+                return super().__setattr__(attr, value)
 
             setattr(self._actual_script_module, attr, value)
 
@@ -591,7 +590,7 @@ class RecursiveScriptModule(ScriptModule):
         def __init__(self, cpp_module):
             self.__dict__["_initializing"] = True
             self._c = cpp_module
-            super(RecursiveScriptModule, self).__init__()
+            super().__init__()
             # Delete the 'training' attribute set up by `Module.__init__`. It
             # will get set on the underlying cpp module, so we delete it here
             # to avoid this version shadowing the cpp module version.
@@ -650,11 +649,11 @@ def _reconstruct(self, cpp_module):
             modules = {}
             for name, cpp_module in torch._C.ModuleDict(self._c).items():
                 modules[name] = wrap_cpp_module(cpp_module)
-            self._modules = OrderedModuleDict(self._c, modules)
+            self._modules = OrderedModuleDict(self._c, modules)  # type: ignore[assignment]
 
             # Copy parameters and buffers.
-            self._parameters = OrderedDictWrapper(torch._C.ParameterDict(self._c))
-            self._buffers = OrderedDictWrapper(torch._C.BufferDict(self._c))
+            self._parameters = OrderedDictWrapper(torch._C.ParameterDict(self._c))  # type: ignore[assignment]
+            self._buffers = OrderedDictWrapper(torch._C.BufferDict(self._c))  # type: ignore[assignment]
 
             # Get rid of the functions from the old C++ module.
             self.__dict__ = {
@@ -679,7 +678,7 @@ def inlined_graph(self):
             ``forward`` method. This graph will be preprocessed to inline all function and method calls.
             See :ref:`interpreting-graphs` for details.
             """
-            return self.forward.inlined_graph
+            return self.forward.inlined_graph  # type: ignore[attr-defined]
 
         @property
         def code(self):
@@ -688,7 +687,7 @@ def code(self):
             the internal graph for the ``forward`` method. See
             :ref:`inspecting-code` for details.
             """
-            return self.forward.code
+            return self.forward.code  # type: ignore[attr-defined]
 
         @property
         def code_with_constants(self):
@@ -702,7 +701,7 @@ def code_with_constants(self):
 
             See :ref:`inspecting-code` for details.
             """
-            r = self.forward.code_with_constants
+            r = self.forward.code_with_constants  # type: ignore[attr-defined]
             return (r[0], ConstMap(r[1]))
 
         def save(self, f, **kwargs):
@@ -740,7 +739,7 @@ def extra_repr(self):
             return "original_name={}".format(self.original_name)
 
         def graph_for(self, *args, **kwargs):
-            return self.forward.graph_for(self, *args, **kwargs)
+            return self.forward.graph_for(self, *args, **kwargs)  # type: ignore[attr-defined]
 
         @property
         def original_name(self):
@@ -767,7 +766,7 @@ def __getattr__(self, attr):
                 )
 
             if self._initializing:
-                return super(RecursiveScriptModule, self).__getattr__(attr)
+                return super().__getattr__(attr)
 
             # _modules check is before hasattr since modules are included as attributes in _c,
             # but we want to get the python wrapper from _modules instead of the raw _c object.
@@ -782,11 +781,11 @@ def __getattr__(self, attr):
                 self.__dict__[attr] = script_method
                 return script_method
 
-            return super(RecursiveScriptModule, self).__getattr__(attr)
+            return super().__getattr__(attr)
 
         def __setattr__(self, attr, value):
             if self._initializing:
-                return super(RecursiveScriptModule, self).__setattr__(attr, value)
+                return super().__setattr__(attr, value)
 
             if attr in self._modules:
                 self._modules[attr] = value
@@ -811,7 +810,7 @@ def __setattr__(self, attr, value):
                 #   s.python_attr = ...
                 #   s.save()   <--- this doesn't have `python_attr`
                 # It's fairly trivial to save enough info to warn in this case.
-                return super(RecursiveScriptModule, self).__setattr__(attr, value)
+                return super().__setattr__(attr, value)
 
         def __copy__(self):
             return torch.jit._recursive.wrap_cpp_module(copy.copy(self._c))
@@ -850,7 +849,7 @@ def __dir__(self):
             if self_method.__func__ == _get_function_from_type(  # type: ignore[attr-defined]
                 RecursiveScriptModule, "__dir__"
             ):
-                return super(RecursiveScriptModule, self).__dir__()
+                return super().__dir__()
             return self_method()
 
         # to resolve bool(value), Python looks if __bool__ is defined then __iter__
@@ -877,7 +876,7 @@ def init_fn(script_module):
 
     # Need to copy all RecursiveScriptModule methods to ScriptModule.
     #
-    # This is because `super(MyScriptModule, self).foo()` does not use
+    # This is because `super().foo()` does not use
     # `__getattr__` to look up `foo`. So we need to make each method available on
     # the ScriptModule manually.
     for name, item in RecursiveScriptModule.__dict__.items():
@@ -956,9 +955,8 @@ def fail(self, *args, **kwargs):
 
 else:
     # TODO MAKE SURE THAT DISABLING WORKS
-    class RecursiveScriptClass(object):  # type: ignore[no-redef]
-        def __init__(self):
-            super().__init__()
+    class RecursiveScriptClass:  # type: ignore[no-redef]
+        pass
 
     class ScriptModule(torch.nn.Module):  # type: ignore[no-redef]
         def __init__(self, arg=None):
@@ -1141,7 +1139,7 @@ def test_sum(a, b):
 
             class MyModule(torch.nn.Module):
                 def __init__(self, N, M):
-                    super(MyModule, self).__init__()
+                    super().__init__()
                     # This parameter will be copied to the new ScriptModule
                     self.weight = torch.nn.Parameter(torch.rand(N, M))
 
@@ -1168,7 +1166,7 @@ def forward(self, input):
 
             class MyModule(nn.Module):
                 def __init__(self):
-                    super(MyModule, self).__init__()
+                    super().__init__()
                     # torch.jit.trace produces a ScriptModule's conv1 and conv2
                     self.conv1 = torch.jit.trace(nn.Conv2d(1, 20, 5), torch.rand(1, 1, 16, 16))
                     self.conv2 = torch.jit.trace(nn.Conv2d(20, 20, 5), torch.rand(1, 20, 16, 16))
@@ -1191,7 +1189,7 @@ def forward(self, input):
 
             class MyModule(nn.Module):
                 def __init__(self):
-                    super(MyModule, self).__init__()
+                    super().__init__()
 
                 @torch.jit.export
                 def some_entry_point(self, input):
@@ -1345,6 +1343,8 @@ def forward(self, a) -> MyModule:
         )
         # Forward docstrings
         fn.__doc__ = obj.__doc__
+        # Allow torch.compile() to inline
+        fn._torchdynamo_inline = obj  # type: ignore[attr-defined]
         _set_jit_function_cache(obj, fn)
         return fn
     else:
diff --git a/torch/jit/_serialization.py b/torch/jit/_serialization.py
index 01a136ad7a02..c8c2975b1a5b 100644
--- a/torch/jit/_serialization.py
+++ b/torch/jit/_serialization.py
@@ -11,7 +11,6 @@
 import pathlib
 
 import torch
-from torch._six import string_classes
 from torch.jit._recursive import wrap_cpp_module
 from torch.serialization import validate_cuda_device
 
@@ -77,14 +76,14 @@ def forward(self, x):
     """
     if _extra_files is None:
         _extra_files = {}
-    if isinstance(f, str) or isinstance(f, pathlib.Path):
+    if isinstance(f, (str, pathlib.Path)):
         m.save(f, _extra_files=_extra_files)
     else:
         ret = m.save_to_buffer(_extra_files=_extra_files)
         f.write(ret)
 
 
-def load(f, map_location=None, _extra_files=None):
+def load(f, map_location=None, _extra_files=None, _restore_shapes=False):
     r"""
     Load a :class:`ScriptModule` or :class:`ScriptFunction` previously
     saved with :func:`torch.jit.save <torch.jit.save>`
@@ -103,6 +102,7 @@ def load(f, map_location=None, _extra_files=None):
         _extra_files (dictionary of filename to content): The extra
             filenames given in the map would be loaded and their content
             would be stored in the provided map.
+        _restore_shapes (bool): Whether or not to retrace the module on load using stored inputs
 
     Returns:
         A :class:`ScriptModule` object.
@@ -147,7 +147,7 @@ def load(f, map_location=None, _extra_files=None):
         os.remove("scriptmodule.pt")
     """
 
-    if isinstance(f, string_classes):
+    if isinstance(f, str):
         if not os.path.exists(f):  # type: ignore[type-var]
             raise ValueError("The provided filename {} does not exist".format(f))  # type: ignore[str-bytes-safe]
         if os.path.isdir(f):
@@ -158,12 +158,12 @@ def load(f, map_location=None, _extra_files=None):
         _extra_files = {}
 
     cu = torch._C.CompilationUnit()
-    if isinstance(f, str) or isinstance(f, pathlib.Path):
-        cpp_module = torch._C.import_ir_module(cu, str(f), map_location, _extra_files)
+    if isinstance(f, (str, pathlib.Path)):
+        cpp_module = torch._C.import_ir_module(cu, str(f), map_location, _extra_files, _restore_shapes)  # type: ignore[call-arg]
     else:
         cpp_module = torch._C.import_ir_module_from_buffer(
-            cu, f.read(), map_location, _extra_files
-        )
+            cu, f.read(), map_location, _extra_files, _restore_shapes
+        )  # type: ignore[call-arg]
 
     # TODO: Pretty sure this approach loses ConstSequential status and such
     return wrap_cpp_module(cpp_module)
@@ -196,13 +196,13 @@ def get_ff_module():
 
 def jit_module_from_flatbuffer(f):
     ff = get_ff_module()
-    if isinstance(f, string_classes):
+    if isinstance(f, str):
         if not os.path.exists(f):  # type: ignore[type-var]
             raise ValueError("The provided filename {} does not exist".format(f))  # type: ignore[str-bytes-safe]
         if os.path.isdir(f):
             raise ValueError("The provided filename {} is a directory".format(f))  # type: ignore[str-bytes-safe]
 
-    if isinstance(f, str) or isinstance(f, pathlib.Path):
+    if isinstance(f, (str, pathlib.Path)):
         f = str(f)
         return wrap_cpp_module(ff._load_jit_module_from_file(f))
     else:
@@ -253,7 +253,7 @@ def forward(self, x):
         extra_files = {}
 
     ff = get_ff_module()
-    if isinstance(f, str) or isinstance(f, pathlib.Path):
+    if isinstance(f, (str, pathlib.Path)):
         f = str(f)
         ff._save_jit_module(m._c, f, extra_files)
     else:
@@ -283,7 +283,7 @@ def get_flatbuffer_module_info(path_or_file):
         }
     """
     ff = get_ff_module()
-    if isinstance(path_or_file, str) or isinstance(path_or_file, pathlib.Path):
+    if isinstance(path_or_file, (str, pathlib.Path)):
         with open(path_or_file, "rb") as f:
             all_bytes = f.read()
     else:
diff --git a/torch/jit/_trace.py b/torch/jit/_trace.py
index 5fa570893146..4afe73496900 100644
--- a/torch/jit/_trace.py
+++ b/torch/jit/_trace.py
@@ -16,7 +16,7 @@
 import warnings
 import inspect
 import re
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, Callable, Dict, List, Optional, Set
 
 from torch.jit._state import _python_cu, _enabled
 from torch.jit._script import ScriptModule, _CachedForward, script
@@ -81,7 +81,7 @@ def __init__(
         return_inputs=False,
         return_inputs_states=False,
     ):
-        super(ONNXTracedModule, self).__init__()
+        super().__init__()
         # inner may be a Module, or it may be an arbitrary callable
         # If it's a Module, we get its parameters automatically, which lets
         # us avoid a special casing functions versus modules.
@@ -302,7 +302,7 @@ def __init__(self, graph_diff_error, tensor_compare_error, extra_msg=None):
                 " encountered untraceable code.\n"
             )
             self.message += indent(tensor_compare_error) + "\n"
-        super(TracingCheckError, self).__init__(self.message)
+        super().__init__(self.message)
 
 
 # Check the traced module against a set of user-provided validation inputs
@@ -337,6 +337,7 @@ def _check_trace(
                 _module_class=_module_class,
                 _compilation_unit=torch._C.CompilationUnit(),
                 example_inputs_is_kwarg=example_inputs_is_kwarg,
+                _store_inputs=False
             )
             check_mod_func = check_mod._c._get_method(traced_func.name)
             inputs = inputs[traced_func.name]
@@ -351,6 +352,7 @@ def _check_trace(
                     _force_outplace=force_outplace,
                     _module_class=_module_class,
                     example_kwarg_inputs=_clone_inputs(inputs),
+                    _store_inputs=False
                 )
             else:
                 check_mod = torch.jit.trace(
@@ -360,9 +362,8 @@ def _check_trace(
                     strict=strict,
                     _force_outplace=force_outplace,
                     _module_class=_module_class,
+                    _store_inputs=False
                 )
-
-
             check_mod_func = check_mod
 
         def graph_diagnostic_info():
@@ -621,7 +622,8 @@ def trace(
     _force_outplace=False,
     _module_class=None,
     _compilation_unit=_python_cu,
-    example_kwarg_inputs=None
+    example_kwarg_inputs=None,
+    _store_inputs=True
 ):
     """
     Trace a function and return an executable  or :class:`ScriptFunction`
@@ -748,7 +750,7 @@ def foo(x, y):
 
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(1, 1, 3)
 
             def forward(self, x):
@@ -800,8 +802,8 @@ def forward(self, x):
             _force_outplace,
             _module_class,
             example_inputs_is_kwarg=isinstance(example_kwarg_inputs, dict),
+            _store_inputs=_store_inputs
         )
-
     if (
         hasattr(func, "__self__")
         and isinstance(func.__self__, torch.nn.Module)
@@ -823,6 +825,7 @@ def forward(self, x):
             _force_outplace,
             _module_class,
             example_inputs_is_kwarg=isinstance(example_kwarg_inputs, dict),
+            _store_inputs=_store_inputs
         )
 
     # Special case for common case of passing a single Tensor
@@ -890,6 +893,8 @@ def forward(self, x):
                 example_inputs_is_kwarg=isinstance(example_kwarg_inputs, dict),
             )
 
+    # Allow torch.compile() to inline
+    traced._torchdynamo_inline = func  # type: ignore[attr-defined]
     return traced
 
 
@@ -908,6 +913,7 @@ def trace_module(
     _module_class=None,
     _compilation_unit=_python_cu,
     example_inputs_is_kwarg=False,
+    _store_inputs=True,
 ):
     """
     Trace a module and return an executable :class:`ScriptModule` that will be optimized
@@ -957,7 +963,7 @@ def trace_module(
 
         class Net(nn.Module):
             def __init__(self):
-                super(Net, self).__init__()
+                super().__init__()
                 self.conv = nn.Conv2d(1, 1, 3)
 
             def forward(self, x):
@@ -1043,6 +1049,7 @@ def register_submods(mod, prefix):
                     strict,
                     _force_outplace,
                     argument_names,
+                    _store_inputs
                 )
             else:
                 example_inputs = make_tuple(example_inputs)
@@ -1054,6 +1061,7 @@ def register_submods(mod, prefix):
                     strict,
                     _force_outplace,
                     argument_names,
+                    _store_inputs
                 )
 
             check_trace_method = module._c._get_method(method_name)
@@ -1105,7 +1113,7 @@ class TracedModule(ScriptModule):
 
     def __init__(self, orig, id_set=None, _compilation_unit=None):
         # XXX: orig can be a nn.Module or a function!
-        super(TracedModule, self).__init__()
+        super().__init__()
         assert isinstance(orig, torch.nn.Module)
 
         # Copy a subset of `orig` to a temporary nn.Module.
@@ -1176,12 +1184,12 @@ def forward(self, *args, **kwargs):
 
     def __getattr__(self, attr):
         if "_actual_script_module" not in self.__dict__:
-            return super(TracedModule, self).__getattr__(attr)
+            return super().__getattr__(attr)
         return getattr(self._actual_script_module, attr)
 
     def __setattr__(self, attr, value):
         if "_actual_script_module" not in self.__dict__:
-            return super(TracedModule, self).__setattr__(attr, value)
+            return super().__setattr__(attr, value)
         setattr(self._actual_script_module, attr, value)
 
     def _get_name(self):
@@ -1192,7 +1200,7 @@ def extra_repr(self):
 
 
 class TopLevelTracedModule(TracedModule):
-    forward = _CachedForward()
+    forward: Callable[..., Any] = _CachedForward()  # type: ignore[assignment]
 
     def _reconstruct(self, cpp_module):
         """
diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py
index a6ff2d04d207..0295c20ec964 100644
--- a/torch/jit/annotations.py
+++ b/torch/jit/annotations.py
@@ -7,13 +7,13 @@
 import torch
 import warnings
 from .._jit_internal import List, Tuple, is_tuple, is_list, Dict, is_dict, Optional, \
-    is_optional, _qualified_name, Any, Future, is_future, is_ignored_fn, Union, is_union
+    is_optional, _qualified_name, Any, Future, is_future, _Await, is_await, is_ignored_fn, Union, is_union
 from .._jit_internal import BroadcastingList1, BroadcastingList2, BroadcastingList3  # type: ignore[attr-defined]
 from ._state import _get_script_class
 
 from torch._C import TensorType, TupleType, FloatType, IntType, ComplexType, \
     ListType, StringType, DictType, BoolType, OptionalType, InterfaceType, AnyType, \
-    NoneType, DeviceObjType, StreamObjType, FutureType, EnumType, UnionType, NumberType
+    NoneType, DeviceObjType, StreamObjType, FutureType, AwaitType, EnumType, UnionType, NumberType
 
 
 from textwrap import dedent
@@ -26,7 +26,7 @@
 
 from torch._ops import OpOverloadPacket
 
-class Module(object):
+class Module:
     def __init__(self, name, members):
         self.name = name
         self.members = members
@@ -38,7 +38,7 @@ def __getattr__(self, name):
             raise RuntimeError(f"Module {self.name} has no member called {name}") from None
 
 
-class EvalEnv(object):
+class EvalEnv:
     env = {
         'torch': Module('torch', {'Tensor': torch.Tensor}),
         'Tensor': torch.Tensor,
@@ -48,7 +48,8 @@ class EvalEnv(object):
         'Dict': Dict,
         'Optional': Optional,
         'Union': Union,
-        'Future': Future
+        'Future': Future,
+        'Await': _Await
     }
 
     def __init__(self, rcb):
@@ -351,7 +352,7 @@ def try_ann_to_type(ann, loc):
         return OptionalType(valid_type)
     if is_union(ann):
         # TODO: this is hack to recognize NumberType
-        if set(ann.__args__) == set([int, float, complex]):
+        if set(ann.__args__) == {int, float, complex}:
             return NumberType.get()
         inner: List = []
         # We need these extra checks because both `None` and invalid
@@ -369,6 +370,9 @@ def try_ann_to_type(ann, loc):
         return RRefType(try_ann_to_type(ann.__args__[0], loc))
     if is_future(ann):
         return FutureType(try_ann_to_type(ann.__args__[0], loc))
+    if is_await(ann):
+        elementType = try_ann_to_type(ann.__args__[0], loc) if hasattr(ann, "__args__") else AnyType.get()
+        return AwaitType(elementType)
     if ann is float:
         return FloatType.get()
     if ann is complex:
diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py
index 44a8628f77d5..c3d5ce10aa25 100644
--- a/torch/jit/frontend.py
+++ b/torch/jit/frontend.py
@@ -23,7 +23,7 @@
 from torch._sources import ParsedDef as _ParsedDef
 from torch.jit._dataclass_impls import DATACLASS_MAGIC_METHODS
 from torch.jit._monkeytype_config import monkeytype_trace, get_qualified_name
-from torch._jit_internal import should_drop, is_static_fn, FunctionModifiers  # noqa: F401
+from torch._jit_internal import should_drop, _is_drop_fn, is_static_fn, FunctionModifiers  # noqa: F401
 from torch import _jit_internal
 import torch.jit.annotations
 
@@ -92,11 +92,10 @@ def is_reserved_name(name):
     ast.Nonlocal: "nonlocal",
 })
 
-if sys.version_info >= (3, 6):
-    pretty_node_names.update({
-        ast.AnnAssign: "annotated assignments",
-    })
-    # NB: no specific token for AnnAssign
+pretty_node_names.update({
+    ast.AnnAssign: "annotated assignments",
+})
+# NB: no specific token for AnnAssign
 
 
 class FrontendError(Exception):
@@ -126,7 +125,7 @@ def __init__(self, ctx, offending_node, reason=''):
                                       offending_node.col_offset + range_len)
         feature_name = pretty_node_names.get(node_type, node_type.__name__)
         msg = "{} {}aren't supported".format(feature_name, reason + ' ' if reason else '')
-        super(UnsupportedNodeError, self).__init__(source_range, msg)
+        super().__init__(source_range, msg)
 
 
 class FrontendTypeError(FrontendError):
@@ -195,6 +194,7 @@ def get_jit_class_def(cls, self_name):
         predicate=lambda m: (inspect.ismethod(m) or inspect.isfunction(m))
         and not is_static_fn(cls, m.__name__)
         and m.__name__ in cls.__dict__
+        and not _is_drop_fn(m)
     )
 
     def is_classmethod(fn):
@@ -281,6 +281,10 @@ def _forward(self):
         for arg in fn_def.args.args + fn_def.args.kwonlyargs:
             # Replace potentially unsupported type annotations by "Any"
             arg.annotation = unused_def.args.args[0].annotation
+        if _is_drop_fn(fn):
+            # Dropping potentially unsupported return type annotation for jit._drop
+            fn_def.returns = None
+            fn_def.type_comment = None
 
     # If MonkeyType is installed, get all the consolidated type traces
     # for the arguments from type_trace_db
@@ -308,7 +312,7 @@ def is_torch_jit_ignore_context_manager(stmt):
                         return True
     return False
 
-class Builder(object):
+class Builder:
     def __call__(self, ctx, node):
         method = getattr(self, 'build_' + node.__class__.__name__, None)
         if method is None:
@@ -404,11 +408,7 @@ def process_ins_outs(args):
         outputs = []
         for arg in args:
             var_name = arg.arg
-            if sys.version_info < (3, 8):
-                # Starting python3.8 ast.Str is deprecated
-                var_ann = arg.value.s
-            else:
-                var_ann = arg.value.value
+            var_ann = arg.value.value
             var_decl_type, var_ann = var_ann.split(":")
             if var_decl_type == "inp":
                 inputs.append(InputType(var_name, var_ann))
diff --git a/torch/jit/mobile/__init__.py b/torch/jit/mobile/__init__.py
index 1749dae0099e..01a7495e9922 100644
--- a/torch/jit/mobile/__init__.py
+++ b/torch/jit/mobile/__init__.py
@@ -44,17 +44,17 @@ def _load_for_lite_interpreter(f, map_location=None):
 
     map_location = validate_map_location(map_location)
 
-    if isinstance(f, str) or isinstance(f, pathlib.Path):
+    if isinstance(f, (str, pathlib.Path)):
         cpp_module = torch._C._load_for_lite_interpreter(f, map_location)
     else:
         cpp_module = torch._C._load_for_lite_interpreter_from_buffer(f.read(), map_location)
 
     return LiteScriptModule(cpp_module)
 
-class LiteScriptModule(object):
+class LiteScriptModule:
     def __init__(self, cpp_module):
         self._c = cpp_module
-        super(LiteScriptModule, self).__init__()
+        super().__init__()
 
     def __call__(self, *input):
         return self._c.forward(input)
@@ -101,7 +101,7 @@ def _get_model_bytecode_version(f_input) -> int:
         if os.path.isdir(f_input):
             raise ValueError(f"The provided filename {f_input} is a directory")
 
-    if (isinstance(f_input, str) or isinstance(f_input, pathlib.Path)):
+    if (isinstance(f_input, (str, pathlib.Path))):
         return torch._C._get_model_bytecode_version(str(f_input))
     else:
         return torch._C._get_model_bytecode_version_from_buffer(f_input.read())
@@ -131,7 +131,7 @@ def _get_mobile_model_contained_types(f_input) -> int:
         if os.path.isdir(f_input):
             raise ValueError(f"The provided filename {f_input} is a directory")
 
-    if (isinstance(f_input, str) or isinstance(f_input, pathlib.Path)):
+    if (isinstance(f_input, (str, pathlib.Path))):
         return torch._C._get_mobile_model_contained_types(str(f_input))
     else:
         return torch._C._get_mobile_model_contained_types_from_buffer(f_input.read())
@@ -152,8 +152,8 @@ def _backport_for_mobile(f_input, f_output, to_version):
         if os.path.isdir(f_input):
             raise ValueError(f"The provided filename {f_input} is a directory")
 
-    if ((isinstance(f_input, str) or isinstance(f_input, pathlib.Path)) and (
-            isinstance(f_output, str) or isinstance(f_output, pathlib.Path))):
+    if ((isinstance(f_input, (str, pathlib.Path))) and (
+            isinstance(f_output, (str, pathlib.Path)))):
         return torch._C._backport_for_mobile(str(f_input), str(f_output), to_version)
     else:
         return torch._C._backport_for_mobile_from_buffer(f_input.read(), str(f_output), to_version)
@@ -171,7 +171,7 @@ def _backport_for_mobile_to_buffer(f_input, to_version):
         if os.path.isdir(f_input):
             raise ValueError(f"The provided filename {f_input} is a directory")
 
-    if (isinstance(f_input, str) or isinstance(f_input, pathlib.Path)):
+    if (isinstance(f_input, (str, pathlib.Path))):
         return torch._C._backport_for_mobile_to_buffer(str(f_input), to_version)
     else:
         return torch._C._backport_for_mobile_from_buffer_to_buffer(f_input.read(), to_version)
@@ -211,7 +211,7 @@ def _get_model_ops_and_info(f_input):
         if os.path.isdir(f_input):
             raise ValueError(f"The provided filename {f_input} is a directory")
 
-    if (isinstance(f_input, str) or isinstance(f_input, pathlib.Path)):
+    if (isinstance(f_input, (str, pathlib.Path))):
         return torch._C._get_model_ops_and_info(str(f_input))
     else:
         return torch._C._get_model_ops_and_info(f_input.read())
diff --git a/torch/jit/quantized.py b/torch/jit/quantized.py
index df0cfe1cc1f4..cb4c5f04df2d 100644
--- a/torch/jit/quantized.py
+++ b/torch/jit/quantized.py
@@ -11,7 +11,7 @@ class QuantizedLinear(torch.jit.ScriptModule):
     __constants__ = ['scale', 'zero_point']
 
     def __init__(self, other):
-        super(QuantizedLinear, self).__init__()
+        super().__init__()
         warnings.warn(
             "torch.jit.QuantizedLinear is deprecated and will be removed in an upcoming "
             "PyTorch release. Please use the torch.ao.nn.quantized.dynamic.Linear instead.")
@@ -56,7 +56,7 @@ def extra_repr(self):
 class QuantizedLinearFP16(torch.jit.ScriptModule):
 
     def __init__(self, other):
-        super(QuantizedLinearFP16, self).__init__()
+        super().__init__()
         warnings.warn(
             "torch.jit.QuantizedLinearFP16 is deprecated and will be removed in an upcoming "
             "PyTorch release. Please use the torch.ao.nn.quantized.dynamic.Linear instead.")
@@ -96,7 +96,7 @@ class QuantizedRNNCellBase(torch.jit.ScriptModule):
                      'zero_point_ih', 'zero_point_hh']
 
     def __init__(self, other):
-        super(QuantizedRNNCellBase, self).__init__()
+        super().__init__()
         warnings.warn(
             "torch.jit.QuantizedRNNCellBase is deprecated and will be removed in an upcoming "
             "PyTorch release. Please use the torch.ao.nn.quantized.dynamic.RNNCell instead.")
@@ -174,7 +174,7 @@ class QuantizedRNNCell(QuantizedRNNCellBase):
                      'zero_point_ih', 'zero_point_hh', 'nonlinearity']
 
     def __init__(self, other):
-        super(QuantizedRNNCell, self).__init__(other)
+        super().__init__(other)
         warnings.warn(
             "torch.jit.QuantizedRNNCell is deprecated and will be removed in an upcoming "
             "PyTorch release. Please use the torch.ao.nn.quantized.dynamic.RNNCell instead.")
@@ -209,7 +209,7 @@ def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor:
 
 class QuantizedLSTMCell(QuantizedRNNCellBase):
     def __init__(self, other):
-        super(QuantizedLSTMCell, self).__init__(other)
+        super().__init__(other)
         warnings.warn(
             "torch.jit.QuantizedLSTMCell is deprecated and will be removed in an upcoming "
             "PyTorch release. Please use the torch.ao.nn.quantized.dynamic.LSTMCell instead.")
@@ -232,7 +232,7 @@ def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) ->
 
 class QuantizedGRUCell(QuantizedRNNCellBase):
     def __init__(self, other):
-        super(QuantizedGRUCell, self).__init__(other)
+        super().__init__(other)
         warnings.warn(
             "torch.jit.QuantizedGRUCell is deprecated and will be removed in an upcoming "
             "PyTorch release. Please use the torch.ao.nn.quantized.dynamic.GRUCell instead.")
@@ -260,7 +260,7 @@ class QuantizedRNNBase(torch.jit.ScriptModule):
                      'batch_first', 'dropout', 'bidirectional', 'dtype']
 
     def __init__(self, other, dtype=torch.int8):
-        super(QuantizedRNNBase, self).__init__()
+        super().__init__()
         warnings.warn(
             "torch.jit.QuantizedRNNBase is deprecated and will be removed in an upcoming "
             "PyTorch release. Please use the torch.ao.nn.quantized.dynamic instead.")
@@ -365,7 +365,7 @@ class QuantizedLSTM(QuantizedRNNBase):
     __overloads__ = {'forward': ['forward_packed', 'forward_tensor']}
 
     def __init__(self, other, dtype):
-        super(QuantizedLSTM, self).__init__(other, dtype)
+        super().__init__(other, dtype)
         warnings.warn(
             "torch.jit.QuantizedLSTM is deprecated and will be removed in an upcoming "
             "PyTorch release. Please use the torch.ao.nn.quantized.dynamic.LSTM instead.")
@@ -406,13 +406,15 @@ def forward_tensor(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = No
         return output, self.permute_hidden(hidden, unsorted_indices)
 
     @torch.jit.script_method
-    def forward_packed(self, input: PackedSequence, hx: Optional[Tuple[Tensor, Tensor]] = None
-                       ) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]]:
-        input, batch_sizes, sorted_indices, unsorted_indices = input
-        max_batch_size = batch_sizes[0]
-        max_batch_size = int(max_batch_size)
-
-        output, hidden = self.forward_impl(input, hx, batch_sizes, max_batch_size, sorted_indices)
+    def forward_packed(
+        self, input: PackedSequence, hx: Optional[Tuple[Tensor, Tensor]] = None
+    ) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]]:
+        input_, batch_sizes, sorted_indices, unsorted_indices = input
+        max_batch_size = int(batch_sizes[0])
+
+        output, hidden = self.forward_impl(
+            input_, hx, batch_sizes, max_batch_size, sorted_indices
+        )
 
         output = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices)
         return output, self.permute_hidden(hidden, unsorted_indices)
@@ -490,11 +492,12 @@ def forward_tensor(self, input: Tensor, hx: Optional[Tensor] = None) -> Tuple[Te
 
     @torch.jit.script_method
     def forward_packed(self, input: PackedSequence, hx: Optional[Tensor] = None) -> Tuple[PackedSequence, Tensor]:
-        input, batch_sizes, sorted_indices, unsorted_indices = input
-        max_batch_size = batch_sizes[0]
-        max_batch_size = int(max_batch_size)
+        input_, batch_sizes, sorted_indices, unsorted_indices = input
+        max_batch_size = int(batch_sizes[0])
 
-        output, hidden = self.forward_impl(input, hx, batch_sizes, max_batch_size, sorted_indices)
+        output, hidden = self.forward_impl(
+            input_, hx, batch_sizes, max_batch_size, sorted_indices
+        )
 
         output = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices)
         return output, self.permute_hidden(hidden, unsorted_indices)
diff --git a/torch/jit/unsupported_tensor_ops.py b/torch/jit/unsupported_tensor_ops.py
index 5babb405280f..29d910051cfd 100644
--- a/torch/jit/unsupported_tensor_ops.py
+++ b/torch/jit/unsupported_tensor_ops.py
@@ -14,12 +14,12 @@ def func(x):
         return x.{op}()
     ''')
 
-    deprecated_apis = set(["volatile", "resize", "reinforce", "new", "name", "map2_", "has_names", "grad_fn", "resize_as"])
+    deprecated_apis = {"volatile", "resize", "reinforce", "new", "name", "map2_", "has_names", "grad_fn", "resize_as"}
     tensor_attrs = tensor_attrs - deprecated_apis
 
     properties = []
     methods = []
-    sorted_tensor_attrs = sorted(list(tensor_attrs), key=lambda x: x.lower())
+    sorted_tensor_attrs = sorted(tensor_attrs, key=lambda x: x.lower())
     for attr in sorted_tensor_attrs:
         funcs_str = funcs_template.format(op=attr)
         scope: Dict[str, Any] = {}
diff --git a/torch/lib/libshm/CMakeLists.txt b/torch/lib/libshm/CMakeLists.txt
index 2c2eec4bcf33..20158a9a2553 100644
--- a/torch/lib/libshm/CMakeLists.txt
+++ b/torch/lib/libshm/CMakeLists.txt
@@ -2,7 +2,6 @@ project(libshm C CXX)
 cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 
 set(TORCH_ROOT ${CMAKE_CURRENT_LIST_DIR}/../../../)
-include(${TORCH_ROOT}/cmake/public/threads.cmake)
 
 if(NOT LIBSHM_INSTALL_LIB_SUBDIR)
   set(LIBSHM_INSTALL_LIB_SUBDIR "lib" CACHE PATH "libshm install library directory")
@@ -34,6 +33,7 @@ target_link_libraries(shm PUBLIC torch)
 
 if(UNIX AND NOT APPLE)
   include(CheckLibraryExists)
+  find_package(Threads REQUIRED)
   # https://github.com/libgit2/libgit2/issues/2128#issuecomment-35649830
   check_library_exists(rt clock_gettime "time.h" NEED_LIBRT)
   if(NEED_LIBRT)
@@ -56,12 +56,12 @@ if(UNIX AND NOT APPLE)
     # site above though in case there was a reason we were testing
     # against clock_gettime. In principle, the choice of symbol you
     # test for shouldn't matter.
-    set(CMAKE_REQUIRED_LIBRARIES caffe2::Threads)
+    set(CMAKE_REQUIRED_LIBRARIES Threads::Threads)
     check_library_exists(rt shm_open "sys/mman.h" NEED_RT_AND_PTHREAD)
     unset(CMAKE_REQUIRED_LIBRARIES)
     if(NEED_RT_AND_PTHREAD)
       message(STATUS "Needs it, linking against pthread and rt")
-      target_link_libraries(shm PUBLIC rt caffe2::Threads)
+      target_link_libraries(shm PUBLIC rt Threads::Threads)
     endif()
   endif()
 endif()
diff --git a/torch/lib/libshm/core.cpp b/torch/lib/libshm/core.cpp
index d03380698aa6..3f971763ffc6 100644
--- a/torch/lib/libshm/core.cpp
+++ b/torch/lib/libshm/core.cpp
@@ -102,7 +102,7 @@ THManagedMapAllocatorInit::THManagedMapAllocatorInit(
     if (!manager_handle_.empty()) {
       socket = &get_manager_socket(manager_handle_);
     } else {
-      if (managers.size() == 0) {
+      if (managers.empty()) {
         start_manager();
       }
       const auto& manager = managers.begin();
diff --git a/torch/lib/libshm/libshm.h b/torch/lib/libshm/libshm.h
index b289f9a886e8..39e8e04853e8 100644
--- a/torch/lib/libshm/libshm.h
+++ b/torch/lib/libshm/libshm.h
@@ -27,7 +27,7 @@ class THManagedMapAllocator : private THManagedMapAllocatorInit,
 
   void close() override;
 
-  ~THManagedMapAllocator() {
+  ~THManagedMapAllocator() override {
     close();
   }
 
diff --git a/torch/lib/libshm/manager.cpp b/torch/lib/libshm/manager.cpp
index 3be979cb4779..54dd24dcda74 100644
--- a/torch/lib/libshm/manager.cpp
+++ b/torch/lib/libshm/manager.cpp
@@ -113,12 +113,12 @@ int main(int argc, char* argv[]) {
   for (;;) {
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     int nevents;
-    if (client_sessions.size() == 0)
+    if (client_sessions.empty())
       timeout = SHUTDOWN_TIMEOUT;
     SYSCHECK_ERR_RETURN_NEG1(
         nevents = poll(pollfds.data(), pollfds.size(), timeout));
     timeout = -1;
-    if (nevents == 0 && client_sessions.size() == 0)
+    if (nevents == 0 && client_sessions.empty())
       break;
 
     for (auto& pfd : pollfds) {
diff --git a/torch/masked/_ops.py b/torch/masked/_ops.py
index 9b706cfa60e6..9839330b260a 100644
--- a/torch/masked/_ops.py
+++ b/torch/masked/_ops.py
@@ -9,6 +9,8 @@
 from torch import Tensor
 from torch.masked import as_masked_tensor, is_masked_tensor, MaskedTensor
 from . import _docs
+from torch._prims_common import corresponding_real_dtype
+from torch import sym_float
 
 if TYPE_CHECKING:
     from torch.types import _dtype as DType
@@ -378,11 +380,11 @@ def _generate_docstring(func):
     )
 
     # Apply function name info to docstring templates:
-    templates = dict(
-        (k, v.format_map(template_data))
+    templates = {
+        k: v.format_map(template_data)
         for k, v in docstring_templates.items()
         if k.startswith(op_kind)
-    )
+    }
     templates.update(
         (k, v.format_map(template_data) if isinstance(v, str) else v)
         for k, v in template_data.items()
@@ -783,7 +785,7 @@ def _sparse_csr_segment_reduction_helper(
             )
             new_nnz = new_crow_indices[-1]
             new_col_indices = col_indices.new_zeros(new_nnz)
-            new_values = torch.segment_reduce(values, reduce, offsets=crow_indices)
+            new_values = torch._segment_reduce(values, reduce, offsets=crow_indices)  # type: ignore[attr-defined]
             new_shape = [mask_input.size(0), 1]
     else:
         assert len(dims) == 2
@@ -1538,18 +1540,18 @@ def _std_var(
     dim: DimOrDims,
     unbiased: Optional[bool],
     *,
-    correction: Optional[int],
+    correction_opt: Optional[Union[int, float]],
     keepdim: Optional[bool],
     dtype: Optional[DType],
     mask: Optional[Tensor],
     take_sqrt: Optional[bool],
 ) -> Tensor:
-    assert (unbiased is None or correction is None), "Only one of unbiased and correction may be given"
-    correction_int = 1
+    assert (unbiased is None or correction_opt is None), "Only one of unbiased and correction may be given"
+    correction = 1.0
     if unbiased is not None:
-        correction_int = 1 if unbiased else 0
-    if correction is not None:
-        correction_int = correction
+        correction = 1.0 if unbiased else 0.0
+    if correction_opt is not None:
+        correction = sym_float(correction_opt)
 
     if dtype is None:
         dtype = input.dtype
@@ -1589,8 +1591,11 @@ def _std_var(
             )
         if not keepdim:
             count = count.reshape(total.shape)
-        if correction_int != 0:
-            count = torch.subtract(count, correction_int)
+        if correction != 0:
+            real_dtype = (corresponding_real_dtype(compute_dtype)
+                          if compute_dtype.is_complex else compute_dtype)
+            count = count.to(real_dtype)
+            count = torch.subtract(count, correction)
             count = torch.maximum(count, count.new_zeros([]))
         output = torch.divide(total, count).to(dtype=dtype)
         if take_sqrt:
@@ -1608,7 +1613,7 @@ def var(
     dim: DimOrDims = None,
     unbiased: Optional[bool] = None,
     *,
-    correction: Optional[int] = None,
+    correction: Optional[Union[int, float]] = None,
     keepdim: Optional[bool] = False,
     dtype: Optional[DType] = None,
     mask: Optional[Tensor] = None,
@@ -1625,7 +1630,7 @@ def var(
         input=input,
         dim=dim,
         unbiased=unbiased,
-        correction=correction,
+        correction_opt=correction,
         keepdim=keepdim,
         dtype=dtype,
         mask=mask,
@@ -1656,7 +1661,7 @@ def std(
         input=input,
         dim=dim,
         unbiased=unbiased,
-        correction=correction,
+        correction_opt=correction,
         keepdim=keepdim,
         dtype=dtype,
         mask=mask,
diff --git a/torch/masked/maskedtensor/core.py b/torch/masked/maskedtensor/core.py
index 0459f24587bd..ae1c46d2bf82 100644
--- a/torch/masked/maskedtensor/core.py
+++ b/torch/masked/maskedtensor/core.py
@@ -90,7 +90,7 @@ def _helper(a, map_fn):
 
 def _wrap_result(result_data, result_mask):
     if isinstance(result_data, list):
-        return list(_wrap_result(r, m) for (r, m) in zip(result_data, result_mask))
+        return [_wrap_result(r, m) for (r, m) in zip(result_data, result_mask)]
     if isinstance(result_data, tuple):
         return tuple(_wrap_result(r, m) for (r, m) in zip(result_data, result_mask))
     if torch.is_tensor(result_data):
diff --git a/torch/mps/__init__.py b/torch/mps/__init__.py
new file mode 100644
index 000000000000..2ab95557714d
--- /dev/null
+++ b/torch/mps/__init__.py
@@ -0,0 +1,104 @@
+r"""
+This package enables an interface for accessing MPS backend in python
+"""
+import torch
+from .. import Tensor
+
+_is_in_bad_fork = getattr(torch._C, "_mps_is_in_bad_fork", lambda: False)
+_default_mps_generator: torch._C.Generator = None  # type: ignore[assignment]
+
+# local helper function (not public or exported)
+def _get_default_mps_generator() -> torch._C.Generator:
+    global _default_mps_generator
+    if _default_mps_generator is None:
+        _default_mps_generator = torch._C._mps_get_default_generator()
+    return _default_mps_generator
+
+def synchronize() -> None:
+    r"""Waits for all kernels in all streams on a MPS device to complete."""
+    return torch._C._mps_synchronize()
+
+def get_rng_state() -> Tensor:
+    r"""Returns the random number generator state as a ByteTensor."""
+    return _get_default_mps_generator().get_state()
+
+def set_rng_state(new_state: Tensor) -> None:
+    r"""Sets the random number generator state.
+
+    Args:
+        new_state (torch.ByteTensor): The desired state
+    """
+    new_state_copy = new_state.clone(memory_format=torch.contiguous_format)
+    _get_default_mps_generator().set_state(new_state_copy)
+
+def manual_seed(seed: int) -> None:
+    r"""Sets the seed for generating random numbers.
+
+    Args:
+        seed (int): The desired seed.
+    """
+    # the torch.mps.manual_seed() can be called from the global
+    # torch.manual_seed() in torch/random.py. So we need to make
+    # sure mps is available (otherwise we just return without
+    # erroring out)
+    if not torch.has_mps:
+        return
+    seed = int(seed)
+    _get_default_mps_generator().manual_seed(seed)
+
+def seed() -> None:
+    r"""Sets the seed for generating random numbers to a random number."""
+    _get_default_mps_generator().seed()
+
+def empty_cache() -> None:
+    r"""Releases all unoccupied cached memory currently held by the caching
+    allocator so that those can be used in other GPU applications.
+    """
+    torch._C._mps_emptyCache()
+
+def set_per_process_memory_fraction(fraction) -> None:
+    r"""Set memory fraction for limiting process's memory allocation on MPS device.
+    The allowed value equals the fraction multiplied by recommended maximum device memory
+    (obtained from Metal API device.recommendedMaxWorkingSetSize).
+    If trying to allocate more than the allowed value in a process, it will raise an out of
+    memory error in allocator.
+
+    Args:
+        fraction(float): Range: 0~2. Allowed memory equals total_memory * fraction.
+
+    .. note::
+       Passing 0 to fraction means unlimited allocations
+       (may cause system failure if out of memory).
+       Passing fraction greater than 1.0 allows limits beyond the value
+       returned from device.recommendedMaxWorkingSetSize.
+    """
+
+    if not isinstance(fraction, float):
+        raise TypeError('Invalid type for fraction argument, must be `float`')
+    if fraction < 0 or fraction > 2:
+        raise ValueError('Invalid fraction value: {}. Allowed range: 0~2'.format(fraction))
+
+    torch._C._mps_setMemoryFraction(fraction)
+
+def current_allocated_memory() -> int:
+    r"""Returns the current GPU memory occupied by tensors in bytes.
+
+     .. note::
+        The returned size does not include cached allocations in
+        memory pools of MPSAllocator.
+    """
+    return torch._C._mps_currentAllocatedMemory()
+
+def driver_allocated_memory() -> int:
+    r"""Returns total GPU memory allocated by Metal driver for the process in bytes.
+
+     .. note::
+        The returned size includes cached allocations in MPSAllocator pools
+        as well as allocations from MPS/MPSGraph frameworks.
+    """
+    return torch._C._mps_driverAllocatedMemory()
+
+__all__ = [
+    'get_rng_state', 'manual_seed', 'seed', 'set_rng_state', 'synchronize',
+    'empty_cache', 'set_per_process_memory_fraction', 'current_allocated_memory',
+    'driver_allocated_memory']
diff --git a/torch/multiprocessing/_atfork.py b/torch/multiprocessing/_atfork.py
index b9d59bc30604..74b4ec9fff16 100644
--- a/torch/multiprocessing/_atfork.py
+++ b/torch/multiprocessing/_atfork.py
@@ -2,7 +2,7 @@
 
 __all__ = ['register_after_fork']
 
-if sys.platform == 'win32' or sys.version_info < (3, 7):
+if sys.platform == 'win32':
     import multiprocessing.util as _util
 
     def _register(func):
diff --git a/torch/multiprocessing/queue.py b/torch/multiprocessing/queue.py
index 9622cd8d3fb1..3128fc9e16e7 100644
--- a/torch/multiprocessing/queue.py
+++ b/torch/multiprocessing/queue.py
@@ -4,7 +4,7 @@
 import pickle
 
 
-class ConnectionWrapper(object):
+class ConnectionWrapper:
     """Proxy class for _multiprocessing.Connection which uses ForkingPickler to
     serialize objects"""
 
@@ -30,7 +30,7 @@ def __getattr__(self, name):
 class Queue(multiprocessing.queues.Queue):
 
     def __init__(self, *args, **kwargs):
-        super(Queue, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
         self._reader: ConnectionWrapper = ConnectionWrapper(self._reader)
         self._writer: ConnectionWrapper = ConnectionWrapper(self._writer)
         self._send = self._writer.send
@@ -43,4 +43,4 @@ def _make_methods(self):
         if not isinstance(self._reader, ConnectionWrapper):
             self._reader: ConnectionWrapper = ConnectionWrapper(self._reader)
             self._writer: ConnectionWrapper = ConnectionWrapper(self._writer)
-        super(SimpleQueue, self)._make_methods()  # type: ignore[misc]
+        super()._make_methods()  # type: ignore[misc]
diff --git a/torch/multiprocessing/reductions.py b/torch/multiprocessing/reductions.py
index 4fcccb47685c..6389fc99830d 100644
--- a/torch/multiprocessing/reductions.py
+++ b/torch/multiprocessing/reductions.py
@@ -19,7 +19,7 @@
     pass
 
 
-class StorageWeakRef(object):
+class StorageWeakRef:
     r"""A weak reference to a Storage.
 
     The cdata member is a Python number containing the integer representation of
diff --git a/torch/multiprocessing/spawn.py b/torch/multiprocessing/spawn.py
index 5b838efc75ea..e802c3d14a44 100644
--- a/torch/multiprocessing/spawn.py
+++ b/torch/multiprocessing/spawn.py
@@ -163,8 +163,7 @@ def join(self, timeout=None):
 class SpawnContext(ProcessContext):
     def __init__(self, processes, error_queues):
         warnings.warn('SpawnContext is renamed to ProcessContext since 1.4 release.')
-        super(SpawnContext, self).__init__(processes, error_queues)
-    pass
+        super().__init__(processes, error_queues)
 
 
 # Note: [start_processes]
diff --git a/torch/nn/cpp.py b/torch/nn/cpp.py
index 25a5bcc446aa..2e4e2aafb4e0 100644
--- a/torch/nn/cpp.py
+++ b/torch/nn/cpp.py
@@ -3,7 +3,7 @@
 from torch import nn
 
 
-class OrderedDictWrapper(object):
+class OrderedDictWrapper:
     """
     A wrapper around a C++ OrderedDict that dynamically evaluates the
     OrderedDict getter on a bound C++ module, such that new changes on the C++
@@ -56,7 +56,7 @@ def __init__(self, cpp_module):
         # Assign before the super class constructor so ``self.training`` can be
         # assigned to in the super class constructor.
         self.cpp_module = cpp_module
-        super(ModuleWrapper, self).__init__()
+        super().__init__()
         self._parameters = OrderedDictWrapper(cpp_module, "_parameters")  # type: ignore[assignment]
         self._buffers: OrderedDictWrapper = OrderedDictWrapper(cpp_module, "_buffers")  # type: ignore[assignment]
         self._modules: OrderedDictWrapper = OrderedDictWrapper(cpp_module, "_modules")  # type: ignore[assignment]
diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index 68d0ff2b4fc0..d7b31fd54d80 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -5,7 +5,7 @@
 
 import torch
 from torch import _VF
-from torch import sym_float as _sym_float, sym_int as _sym_int
+from torch import sym_int as _sym_int
 from torch._C import _infer_size, _add_docstr
 from torch._torch_docs import reproducibility_notes, tf32_notes, sparse_support_notes
 # A workaround to support both TorchScript and MyPy:
@@ -2335,6 +2335,11 @@ def embedding_bag(
             "then it must have the same shape as the input ({})".format(per_sample_weights.shape, input.shape)
         )
 
+    if not weight.dim() == 2:
+        raise ValueError(
+            f"weight has to be a 2D Tensor, but got Tensor of dimension {weight.dim()}"
+        )
+
     if input.dim() == 2:
         if offsets is not None:
             type_str = "<unknown>"
@@ -2358,7 +2363,7 @@ def embedding_bag(
         if offsets.dim() != 1:
             raise ValueError("offsets has to be a 1D Tensor")
     else:
-        raise ValueError("input has to be 1D or 2D Tensor," " but got Tensor of dimension {}".format(input.dim()))
+        raise ValueError(f"input has to be 1D or 2D Tensor, but got Tensor of dimension {input.dim()}")
     if mode == "sum":
         mode_enum = 0
     elif mode == "mean":
@@ -2551,8 +2556,9 @@ def local_response_norm(input: Tensor, size: int, alpha: float = 1e-4, beta: flo
     if input.numel() == 0:
         return input
 
-    div = input.mul(input).unsqueeze(1)
+    div = input.mul(input)
     if dim == 3:
+        div = div.unsqueeze(1)
         div = pad(div, (0, 0, size // 2, (size - 1) // 2))
         div = avg_pool2d(div, (size, 1), stride=1).squeeze(1)
     else:
@@ -3916,7 +3922,7 @@ def interpolate(input: Tensor, size: Optional[int] = None, scale_factor: Optiona
                            for i in range(dim)]
         else:
             output_size = [
-                _sym_int(math.floor(_sym_float(input.size(i + 2)) * scale_factors[i]))
+                _sym_int(input.size(i + 2) * scale_factors[i])
                 for i in range(dim)
             ]
         scale_factors = None
@@ -4761,7 +4767,10 @@ def _in_projection_packed(
     if k is v:
         if q is k:
             # self-attention
-            return linear(q, w, b).chunk(3, dim=-1)
+            proj = linear(q, w, b)
+            # reshape to 3, E and not E, 3 is deliberate for better memory coalescing and keeping same order as chunk()
+            proj = proj.unflatten(-1, (3, E)).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
+            return proj[0], proj[1], proj[2]
         else:
             # encoder-decoder attention
             w_q, w_kv = w.split([E, E * 2])
@@ -4769,7 +4778,11 @@ def _in_projection_packed(
                 b_q = b_kv = None
             else:
                 b_q, b_kv = b.split([E, E * 2])
-            return (linear(q, w_q, b_q),) + linear(k, w_kv, b_kv).chunk(2, dim=-1)
+            q_proj = linear(q, w_q, b_q)
+            kv_proj = linear(k, w_kv, b_kv)
+            # reshape to 2, E and not E, 2 is deliberate for better memory coalescing and keeping same order as chunk()
+            kv_proj = kv_proj.unflatten(-1, (2, E)).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
+            return (q_proj, kv_proj[0], kv_proj[1])
     else:
         w_q, w_k, w_v = w.chunk(3)
         if b is None:
@@ -4831,37 +4844,98 @@ def _in_projection(
     assert b_v is None or b_v.shape == (Eq,), f"expecting value bias shape of {(Eq,)}, but got {b_v.shape}"
     return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
 
+scaled_dot_product_attention = _add_docstr(
+    torch._C._nn.scaled_dot_product_attention, r"""
+scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False) -> Tensor:
 
-_scaled_dot_product_attention = _add_docstr(
-    torch._C._nn._scaled_dot_product_attention, r"""
 Computes scaled dot product attention on query, key and value tensors, using
 an optional attention mask if passed, and applying dropout if a probability
 greater than 0.0 is specified.
 
+.. code-block:: python
+
+    # Efficient implementation equivalent to the following:
+    attn_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0) if is_causal else attn_mask
+    attn_mask = attn_mask.masked_fill(not attn_mask, -float('inf')) if attn_mask.dtype==torch.bool else attn_mask
+    attn_weight = torch.softmax((Q @ K.transpose(-2, -1) / math.sqrt(Q.size(-1))) + attn_mask, dim=-1)
+    attn_weight = torch.dropout(attn_weight, dropout_p)
+    return attn_weight @ V
+
+.. warning:: This function is beta and subject to change.
+
+Note:
+
+    There are currently three supported implementations of scaled dot product attention:
+
+        - `FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`_
+        - `Memory-Efficient Attention`_
+        - A PyTorch implementation defined in C++ matching the above formulation
+
+    The function may call optimized kernels for improved performance when using the CUDA backend.
+    For all other backends, the PyTorch implementation will be used.
+
+    All implementations are enabled by default. Scaled dot product attention attempts to automatically select the
+    most optimal implementation based on the inputs. In order to provide more fine-grained control over what implementation
+    is used, the following functions are provided for enabling and disabling implementations.
+    The context manager is the preferred mechanism:
+
+        - :func:`torch.backends.cuda.sdp_kernel`: A context manager used to enable/disable any of the implementations.
+        - :func:`torch.backends.cuda.enable_flash_sdp`: Enables or Disables FlashAttention.
+        - :func:`torch.backends.cuda.enable_mem_efficient_sdp`: Enables or Disables Memory-Efficient Attention.
+        - :func:`torch.backends.cuda.enable_math_sdp`: Enables or Disables the PyTorch C++ implementation.
+
+    Each of the fused kernels has specific input limitations. If the user requires the use of a specific fused implementation,
+    disable the PyTorch C++ implementation using :func:`torch.backends.cuda.sdp_kernel`.
+    In the event that a fused implementation is not available, an error will be raised with the
+    reasons why the fused implementation cannot run.
+
+    Due to the nature of fusing floating point operations, the output of this function may be different
+    depending on what backend kernel is chosen.
+    The c++ implementation supports torch.float64 and can be used when higher precision is required.
+    For more information please see :doc:`/notes/numerical_accuracy`
+
+Note:
+    {cudnn_reproducibility_note}
+""".format(**reproducibility_notes)
+    + r"""
+
 Args:
-     query (Tensor): Query tensor; shape (N, ..., L, E)
-     key (Tensor): Key tensor; shape (N, ..., S, E)
-     value (Tensor): Value tensor; shape (N, ..., S, E)
-     attn_mask (optional Tensor): Attention mask; shape (N, ..., L, S) or (L, S). Currently, only a boolean mask
-         is supported, where a value of True indicates that the element *should* take part in attention.
-     dropout_p (float): Dropout probability; if greater than 0.0, dropout is applied
-     need_attn_weights (bool): If true, the second return value will contain the attention weights used;
-         otherwise, the second return value is unspecified
-     is_causal (bool): If true, assumes causal attention masking and ignores attn_mask.
+    query (Tensor): Query tensor; shape :math:`(N, ..., L, E)`.
+    key (Tensor): Key tensor; shape :math:`(N, ..., S, E)`.
+    value (Tensor): Value tensor; shape :math:`(N, ..., S, Ev)`.
+    attn_mask (optional Tensor): Attention mask; shape :math:`(N, ..., L, S)`. Two types of masks are supported.
+        A boolean mask where a value of True indicates that the element *should* take part in attention.
+        A float mask of the same type as query, key, value that is added to the attention score.
+    dropout_p (float): Dropout probability; if greater than 0.0, dropout is applied
+    is_causal (bool): If true, assumes causal attention masking and errors if both attn_mask and is_causal
+        are set.
 
 
-Returns a tuple containing:
-    output (Tensor): Attention output; shape (N, ..., L, E)
-    attn_weights (Tensor): Attention weighting; shape (N, ..., L, S)
+Returns:
+    output (Tensor): Attention output; shape :math:`(N, ..., L, Ev)`.
 
 Shape legend:
-    N: Batch size
-    ...: Any number of other batch dimensions (optional)
-    S: Source sequence length
-    L: Target sequence lengthE: Embedding dimension
+    - :math:`N: \text{Batch size} ... : \text{Any number of other batch dimensions (optional)}`
+    - :math:`S: \text{Source sequence length}`
+    - :math:`L: \text{Target sequence length}`
+    - :math:`E: \text{Embedding dimension of the query and key}`
+    - :math:`Ev: \text{Embedding dimension of the value}`
 
-""")
+Examples::
 
+    >>> # Optionally use the context manager to ensure one of the fused kerenels is run
+    >>> query = torch.rand(32, 8, 128, 64, dtype=torch.float16, device="cuda")
+    >>> key = torch.rand(32, 8, 128, 64, dtype=torch.float16, device="cuda")
+    >>> value = torch.rand(32, 8, 128, 64, dtype=torch.float16, device="cuda")
+    >>> with torch.backends.cuda.sdp_kernel(enable_math=False):
+    >>>     F.scaled_dot_product_attention(query,key,value)
+
+.. _FlashAttention\: Fast and Memory-Efficient Exact Attention with IO-Awareness:
+    https://arxiv.org/abs/2205.14135
+.. _Memory-Efficient Attention:
+    https://github.com/facebookresearch/xformers
+
+""")
 
 def _mha_shape_check(query: Tensor, key: Tensor, value: Tensor,
                      key_padding_mask: Optional[Tensor], attn_mask: Optional[Tensor], num_heads: int):
@@ -4910,6 +4984,41 @@ def _mha_shape_check(query: Tensor, key: Tensor, value: Tensor,
 
     return is_batched
 
+def _canonical_mask(
+        mask: Optional[Tensor],
+        mask_name: str,
+        other_type: Optional[DType],
+        other_name: str,
+        target_type: DType,
+        check_other: bool = True,
+) -> Optional[Tensor]:
+
+    if mask is not None:
+        _mask_dtype = mask.dtype
+        _mask_is_float = torch.is_floating_point(mask)
+        if _mask_dtype != torch.bool and not _mask_is_float:
+            raise AssertionError(
+                f"only bool and floating types of {mask_name} are supported")
+        if check_other and other_type is not None:
+            if _mask_dtype != other_type:
+                warnings.warn(
+                    f"Support for mismatched {mask_name} and {other_name} "
+                    "is deprecated. Use same type for both instead."
+                )
+        if not _mask_is_float:
+            mask = (
+                torch.zeros_like(mask, dtype=target_type)
+                .masked_fill_(mask, float("-inf"))
+            )
+    return mask
+
+def _none_or_dtype(input: Optional[Tensor]) -> Optional[DType]:
+    if input is None:
+        return None
+    elif isinstance(input, torch.Tensor):
+        return input.dtype
+    raise RuntimeError("input to _none_or_dtype() must be None or torch.Tensor")
+
 def multi_head_attention_forward(
     query: Tensor,
     key: Tensor,
@@ -4984,8 +5093,7 @@ def multi_head_attention_forward(
         - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
           3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
           S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
-          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
-          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
+          positions. If a BoolTensor is provided, positions with ``True``
           are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
           is provided, it will be added to the attention weight.
         - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
@@ -5036,9 +5144,6 @@ def multi_head_attention_forward(
 
     is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads)
 
-    if is_causal:
-        attn_mask = None
-
     # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input
     # is batched, run the computation and before returning squeeze the
     # batch dimension so that the output doesn't carry this temporary batch dimension.
@@ -5053,11 +5158,18 @@ def multi_head_attention_forward(
     # set up shape vars
     tgt_len, bsz, embed_dim = query.shape
     src_len, _, _ = key.shape
-    if key_padding_mask is not None:
-        _kpm_dtype = key_padding_mask.dtype
-        if _kpm_dtype != torch.bool and not torch.is_floating_point(key_padding_mask):
-            raise AssertionError(
-                "only bool and floating types of key_padding_mask are supported")
+
+    key_padding_mask = _canonical_mask(
+        mask=key_padding_mask,
+        mask_name="key_padding_mask",
+        other_type=_none_or_dtype(attn_mask),
+        other_name="attn_mask",
+        target_type=query.dtype
+    )
+
+    if is_causal:
+        attn_mask = None
+
     assert embed_dim == embed_dim_to_check, \
         f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
     if isinstance(embed_dim, torch.Tensor):
@@ -5090,13 +5202,17 @@ def multi_head_attention_forward(
         q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)
 
     # prep attention mask
+
+    attn_mask = _canonical_mask(
+        mask=attn_mask,
+        mask_name="attn_mask",
+        other_type=_none_or_dtype(key_padding_mask),
+        other_name="key_padding_mask",
+        target_type=q.dtype,
+        check_other=False,
+    )
+
     if attn_mask is not None:
-        if attn_mask.dtype == torch.uint8:
-            warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
-            attn_mask = attn_mask.to(torch.bool)
-        else:
-            assert attn_mask.is_floating_point() or attn_mask.dtype == torch.bool, \
-                f"Only float, byte, and bool types are supported for attn_mask, not {attn_mask.dtype}"
         # ensure attn_mask's dim is 3
         if attn_mask.dim() == 2:
             correct_2d_size = (tgt_len, src_len)
@@ -5127,9 +5243,9 @@ def multi_head_attention_forward(
     #
     # reshape q, k, v for multihead attention and make em batch first
     #
-    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
+    q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
     if static_k is None:
-        k = k.contiguous().view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
+        k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
     else:
         # TODO finish disentangling control flow so we don't do in-projections when statics are passed
         assert static_k.size(0) == bsz * num_heads, \
@@ -5138,7 +5254,7 @@ def multi_head_attention_forward(
             f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
         k = static_k
     if static_v is None:
-        v = v.contiguous().view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
+        v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
     else:
         # TODO finish disentangling control flow so we don't do in-projections when statics are passed
         assert static_v.size(0) == bsz * num_heads, \
@@ -5168,16 +5284,8 @@ def multi_head_attention_forward(
             expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len)
         if attn_mask is None:
             attn_mask = key_padding_mask
-        elif attn_mask.dtype == torch.bool:
-            attn_mask = attn_mask.logical_or(key_padding_mask)
         else:
-            attn_mask = attn_mask.masked_fill(key_padding_mask, float("-inf"))
-
-    # convert mask to float
-    if attn_mask is not None and attn_mask.dtype == torch.bool:
-        new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype)
-        new_attn_mask.masked_fill_(attn_mask, float("-inf"))
-        attn_mask = new_attn_mask
+            attn_mask = attn_mask + key_padding_mask
 
     # adjust dropout probability
     if not training:
@@ -5187,28 +5295,27 @@ def multi_head_attention_forward(
     # (deep breath) calculate attention and out projection
     #
 
-    if attn_mask is not None:
-        if attn_mask.size(0) == 1:
-            attn_mask = attn_mask.unsqueeze(0)
+    if need_weights:
+        B, Nt, E = q.shape
+        q_scaled = q / math.sqrt(E)
+        if attn_mask is not None:
+            attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1))
         else:
-            attn_mask = attn_mask.view(bsz, num_heads, -1, src_len)
+            attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1))
+        attn_output_weights = softmax(attn_output_weights, dim=-1)
+        if dropout_p > 0.0:
+            attn_output_weights = dropout(attn_output_weights, p=dropout_p)
 
-    q = q.view(bsz, num_heads, tgt_len, head_dim)
-    k = k.view(bsz, num_heads, src_len, head_dim)
-    v = v.view(bsz, num_heads, src_len, head_dim)
+        attn_output = torch.bmm(attn_output_weights, v)
 
-    attn_output, attn_output_weights = _scaled_dot_product_attention(
-        q, k, v, attn_mask, dropout_p, need_weights, is_causal)
-    attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
+        attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
+        attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
+        attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
 
-    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
-    attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
-
-    if need_weights:
         # optionally average attention weights over heads
         attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
         if average_attn_weights:
-            attn_output_weights = attn_output_weights.sum(dim=1) / num_heads
+            attn_output_weights = attn_output_weights.mean(dim=1)
 
         if not is_batched:
             # squeeze the output if input was unbatched
@@ -5216,6 +5323,24 @@ def multi_head_attention_forward(
             attn_output_weights = attn_output_weights.squeeze(0)
         return attn_output, attn_output_weights
     else:
+        # attn_mask can be either (L,S) or (N*num_heads, L, S)
+        # if attn_mask's shape is (1, L, S) we need to unsqueeze to (1, 1, L, S)
+        # in order to match the input for SDPA of (N, num_heads, L, S)
+        if attn_mask is not None:
+            if attn_mask.size(0) == 1 and attn_mask.dim() == 3:
+                attn_mask = attn_mask.unsqueeze(0)
+            else:
+                attn_mask = attn_mask.view(bsz, num_heads, -1, src_len)
+
+        q = q.view(bsz, num_heads, tgt_len, head_dim)
+        k = k.view(bsz, num_heads, src_len, head_dim)
+        v = v.view(bsz, num_heads, src_len, head_dim)
+
+        attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
+        attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
+
+        attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
+        attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
         if not is_batched:
             # squeeze the output if input was unbatched
             attn_output = attn_output.squeeze(1)
diff --git a/torch/nn/functional.pyi.in b/torch/nn/functional.pyi.in
index ac40c4a57cf7..f3be7d4a989e 100644
--- a/torch/nn/functional.pyi.in
+++ b/torch/nn/functional.pyi.in
@@ -356,6 +356,17 @@ def fold(input: Tensor, output_size: _size_any_t, kernel_size: _size_any_t, dila
          stride: _size_any_t = ...) -> Tensor: ...
 
 
+def _canonical_mask(
+        mask: Optional[Tensor],
+        mask_name: str,
+        other_type: Optional[_dtype],
+        other_name: str,
+        target_type: _dtype,
+        check_other: bool = True,
+) -> Optional[Tensor]: ...
+
+def _none_or_dtype(input: Optional[Tensor]) -> Optional[_dtype]: ...
+
 def multi_head_attention_forward(query: Tensor,
                                  key: Tensor,
                                  value: Tensor,
diff --git a/torch/nn/intrinsic/quantized/__init__.py b/torch/nn/intrinsic/quantized/__init__.py
index a3c5788d574d..b949303a4083 100644
--- a/torch/nn/intrinsic/quantized/__init__.py
+++ b/torch/nn/intrinsic/quantized/__init__.py
@@ -1,4 +1,7 @@
 from .modules import *  # noqa: F403
+# to ensure customers can use the module below
+# without importing it directly
+import torch.nn.intrinsic.quantized.dynamic
 
 __all__ = [
     'BNReLU2d',
diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py
index a0d6d505d7a5..1e92dc0852e2 100644
--- a/torch/nn/modules/activation.py
+++ b/torch/nn/modules/activation.py
@@ -49,7 +49,7 @@ class Threshold(Module):
     inplace: bool
 
     def __init__(self, threshold: float, value: float, inplace: bool = False) -> None:
-        super(Threshold, self).__init__()
+        super().__init__()
         self.threshold = threshold
         self.value = value
         self.inplace = inplace
@@ -96,7 +96,7 @@ class ReLU(Module):
     inplace: bool
 
     def __init__(self, inplace: bool = False):
-        super(ReLU, self).__init__()
+        super().__init__()
         self.inplace = inplace
 
     def forward(self, input: Tensor) -> Tensor:
@@ -159,7 +159,7 @@ def __init__(
         upper: float = 1. / 3,
         inplace: bool = False
     ):
-        super(RReLU, self).__init__()
+        super().__init__()
         self.lower = lower
         self.upper = upper
         self.inplace = inplace
@@ -218,7 +218,7 @@ def __init__(
         min_value: Optional[float] = None,
         max_value: Optional[float] = None
     ) -> None:
-        super(Hardtanh, self).__init__()
+        super().__init__()
         if min_value is not None:
             warnings.warn("keyword argument min_value is deprecated and rename to min_val")
             min_val = min_value
@@ -264,7 +264,7 @@ class ReLU6(Hardtanh):
     """
 
     def __init__(self, inplace: bool = False):
-        super(ReLU6, self).__init__(0., 6., inplace)
+        super().__init__(0., 6., inplace)
 
     def extra_repr(self) -> str:
         inplace_str = 'inplace=True' if self.inplace else ''
@@ -327,7 +327,7 @@ class Hardsigmoid(Module):
     inplace: bool
 
     def __init__(self, inplace : bool = False) -> None:
-        super(Hardsigmoid, self).__init__()
+        super().__init__()
         self.inplace = inplace
 
     def forward(self, input: Tensor) -> Tensor:
@@ -389,7 +389,7 @@ class SiLU(Module):
     inplace: bool
 
     def __init__(self, inplace: bool = False):
-        super(SiLU, self).__init__()
+        super().__init__()
         self.inplace = inplace
 
     def forward(self, input: Tensor) -> Tensor:
@@ -425,7 +425,7 @@ class Mish(Module):
     inplace: bool
 
     def __init__(self, inplace: bool = False):
-        super(Mish, self).__init__()
+        super().__init__()
         self.inplace = inplace
 
     def forward(self, input: Tensor) -> Tensor:
@@ -468,7 +468,7 @@ class Hardswish(Module):
     inplace: bool
 
     def __init__(self, inplace : bool = False) -> None:
-        super(Hardswish, self).__init__()
+        super().__init__()
         self.inplace = inplace
 
     def forward(self, input: Tensor) -> Tensor:
@@ -509,7 +509,7 @@ class ELU(Module):
     inplace: bool
 
     def __init__(self, alpha: float = 1., inplace: bool = False) -> None:
-        super(ELU, self).__init__()
+        super().__init__()
         self.alpha = alpha
         self.inplace = inplace
 
@@ -553,7 +553,7 @@ class CELU(Module):
     inplace: bool
 
     def __init__(self, alpha: float = 1., inplace: bool = False) -> None:
-        super(CELU, self).__init__()
+        super().__init__()
         self.alpha = alpha
         self.inplace = inplace
 
@@ -603,7 +603,7 @@ class SELU(Module):
     inplace: bool
 
     def __init__(self, inplace: bool = False) -> None:
-        super(SELU, self).__init__()
+        super().__init__()
         self.inplace = inplace
 
     def forward(self, input: Tensor) -> Tensor:
@@ -637,7 +637,7 @@ class GLU(Module):
     dim: int
 
     def __init__(self, dim: int = -1) -> None:
-        super(GLU, self).__init__()
+        super().__init__()
         self.dim = dim
 
     def forward(self, input: Tensor) -> Tensor:
@@ -678,7 +678,7 @@ class GELU(Module):
     approximate: str
 
     def __init__(self, approximate: str = 'none') -> None:
-        super(GELU, self).__init__()
+        super().__init__()
         self.approximate = approximate
 
     def forward(self, input: Tensor) -> Tensor:
@@ -720,7 +720,7 @@ class Hardshrink(Module):
     lambd: float
 
     def __init__(self, lambd: float = 0.5) -> None:
-        super(Hardshrink, self).__init__()
+        super().__init__()
         self.lambd = lambd
 
     def forward(self, input: Tensor) -> Tensor:
@@ -747,7 +747,8 @@ class LeakyReLU(Module):
         \end{cases}
 
     Args:
-        negative_slope: Controls the angle of the negative slope. Default: 1e-2
+        negative_slope: Controls the angle of the negative slope (which is used for
+          negative input values). Default: 1e-2
         inplace: can optionally do the operation in-place. Default: ``False``
 
     Shape:
@@ -768,7 +769,7 @@ class LeakyReLU(Module):
     negative_slope: float
 
     def __init__(self, negative_slope: float = 1e-2, inplace: bool = False) -> None:
-        super(LeakyReLU, self).__init__()
+        super().__init__()
         self.negative_slope = negative_slope
         self.inplace = inplace
 
@@ -834,7 +835,7 @@ class Softplus(Module):
     threshold: int
 
     def __init__(self, beta: int = 1, threshold: int = 20) -> None:
-        super(Softplus, self).__init__()
+        super().__init__()
         self.beta = beta
         self.threshold = threshold
 
@@ -875,7 +876,7 @@ class Softshrink(Module):
     lambd: float
 
     def __init__(self, lambd: float = 0.5) -> None:
-        super(Softshrink, self).__init__()
+        super().__init__()
         self.lambd = lambd
 
     def forward(self, input: Tensor) -> Tensor:
@@ -885,6 +886,24 @@ def extra_repr(self) -> str:
         return str(self.lambd)
 
 
+def _arg_cuda_or_cpu(x: Optional[torch.Tensor]) -> bool:
+    if x is None:
+        return True
+    else:
+        return x.is_cuda or 'cpu' in str(x.device)
+
+    return False
+
+
+def _arg_requires_grad(x: Optional[torch.Tensor]) -> bool:
+    if x is None:
+        return False
+    else:
+        return x.requires_grad
+
+    return True
+
+
 class MultiheadAttention(Module):
     r"""Allows the model to jointly attend to information
     from different representation subspaces as described in the paper:
@@ -897,7 +916,8 @@ class MultiheadAttention(Module):
 
     where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
 
-    ``forward()`` will use a special optimized implementation if all of the following
+    ``forward()`` will use the optimized implementation described in
+    `FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`_ if all of the following
     conditions are met:
 
     - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor. This
@@ -940,6 +960,9 @@ class MultiheadAttention(Module):
         >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
         >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
 
+    .. _`FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`:
+         https://arxiv.org/abs/2205.14135
+
     """
     __constants__ = ['batch_first']
     bias_k: Optional[torch.Tensor]
@@ -948,7 +971,7 @@ class MultiheadAttention(Module):
     def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False,
                  kdim=None, vdim=None, batch_first=False, device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(MultiheadAttention, self).__init__()
+        super().__init__()
         self.embed_dim = embed_dim
         self.kdim = kdim if kdim is not None else embed_dim
         self.vdim = vdim if vdim is not None else embed_dim
@@ -1008,7 +1031,7 @@ def __setstate__(self, state):
         if '_qkv_same_embed_dim' not in state:
             state['_qkv_same_embed_dim'] = True
 
-        super(MultiheadAttention, self).__setstate__(state)
+        super().__setstate__(state)
 
     def forward(
             self,
@@ -1037,7 +1060,7 @@ def forward(
             See "Attention Is All You Need" for more details.
         key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key``
             to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`.
-            Binary and byte masks are supported.
+            Binary and float masks are supported.
             For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for
             the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value.
         need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``.
@@ -1046,10 +1069,10 @@ def forward(
             :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size,
             :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be
             broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch.
-            Binary, byte, and float masks are supported. For a binary mask, a ``True`` value indicates that the
-            corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the
+            Binary and float masks are supported. For a binary mask, a ``True`` value indicates that the
             corresponding position is not allowed to attend. For a float mask, the mask values will be added to
             the attention weight.
+            If both attn_mask and key_padding_mask are supplied, their types should match.
         is_causal: If specified, applies a causal mask as attention mask. Mutually exclusive with providing attn_mask.
             Default: ``False``.
         average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across
@@ -1074,11 +1097,15 @@ def forward(
             raise AssertionError("Only allow causal mask or attn_mask")
 
         is_batched = query.dim() == 3
-        if key_padding_mask is not None:
-            _kpm_dtype = key_padding_mask.dtype
-            if _kpm_dtype != torch.bool and not torch.is_floating_point(key_padding_mask):
-                raise AssertionError(
-                    "only bool and floating types of key_padding_mask are supported")
+
+        key_padding_mask = F._canonical_mask(
+            mask=key_padding_mask,
+            mask_name="key_padding_mask",
+            other_type=F._none_or_dtype(attn_mask),
+            other_name="attn_mask",
+            target_type=query.dtype
+        )
+
         why_not_fast_path = ''
         if not is_batched:
             why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}"
@@ -1089,7 +1116,9 @@ def forward(
             why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)"
         elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype:
             why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match"
-        elif self.in_proj_weight is not None and query.dtype != self.in_proj_weight.dtype:
+        elif self.in_proj_weight is None:
+            why_not_fast_path = "in_proj_weight was None"
+        elif query.dtype != self.in_proj_weight.dtype:
             # this case will fail anyway, but at least they'll get a useful error message.
             why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match"
         elif self.training:
@@ -1124,28 +1153,29 @@ def forward(
             # generator expressions.
             if torch.overrides.has_torch_function(tensor_args):
                 why_not_fast_path = "some Tensor argument has_torch_function"
-            elif not all([(x is None or x.is_cuda or 'cpu' in str(x.device)) for x in tensor_args]):
+            elif not all([_arg_cuda_or_cpu(x) for x in tensor_args]):
                 why_not_fast_path = "some Tensor argument is neither CUDA nor CPU"
-            elif torch.is_grad_enabled() and any([x is not None and x.requires_grad for x in tensor_args]):
+            elif torch.is_grad_enabled() and any([_arg_requires_grad(x) for x in tensor_args]):
                 why_not_fast_path = ("grad is enabled and at least one of query or the "
                                      "input/output projection weights or biases requires_grad")
             if not why_not_fast_path:
                 merged_mask, mask_type = self.merge_masks(attn_mask, key_padding_mask, query)
 
-                return torch._native_multi_head_attention(
-                    query,
-                    key,
-                    value,
-                    self.embed_dim,
-                    self.num_heads,
-                    self.in_proj_weight,
-                    self.in_proj_bias,
-                    self.out_proj.weight,
-                    self.out_proj.bias,
-                    merged_mask,
-                    need_weights,
-                    average_attn_weights,
-                    mask_type)
+                if self.in_proj_bias is not None and self.in_proj_weight is not None:
+                    return torch._native_multi_head_attention(
+                        query,
+                        key,
+                        value,
+                        self.embed_dim,
+                        self.num_heads,
+                        self.in_proj_weight,
+                        self.in_proj_bias,
+                        self.out_proj.weight,
+                        self.out_proj.bias,
+                        merged_mask,
+                        need_weights,
+                        average_attn_weights,
+                        mask_type)
 
         any_nested = query.is_nested or key.is_nested or value.is_nested
         assert not any_nested, ("MultiheadAttention does not support NestedTensor outside of its fast path. " +
@@ -1210,6 +1240,16 @@ def merge_masks(self, attn_mask: Optional[Tensor], key_padding_mask: Optional[Te
         """
         mask_type: Optional[int] = None
         merged_mask: Optional[Tensor] = None
+
+        attn_mask = F._canonical_mask(
+            mask=attn_mask,
+            mask_name="attn_mask",
+            other_type=F._none_or_dtype(key_padding_mask),
+            other_name="key_padding_mask",
+            target_type=query.dtype,
+            check_other=False,
+        )
+
         if attn_mask is not None:
             mask_type = 0
             merged_mask = attn_mask
@@ -1223,7 +1263,7 @@ def merge_masks(self, attn_mask: Optional[Tensor], key_padding_mask: Optional[Te
             key_padding_mask_expanded = key_padding_mask.view(batch_size, 1, 1, seq_len) \
                                                         .expand(-1, self.num_heads, -1, -1)
             attn_mask_expanded = attn_mask.view(1, 1, seq_len, seq_len).expand(batch_size, self.num_heads, -1, -1)
-            merged_mask = attn_mask_expanded.logical_or(key_padding_mask_expanded)
+            merged_mask = attn_mask_expanded + key_padding_mask_expanded
         return merged_mask, mask_type
 
 
@@ -1283,7 +1323,7 @@ def __init__(self, num_parameters: int = 1, init: float = 0.25,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
         self.num_parameters = num_parameters
-        super(PReLU, self).__init__()
+        super().__init__()
         self.weight = Parameter(torch.empty(num_parameters, **factory_kwargs).fill_(init))
 
     def forward(self, input: Tensor) -> Tensor:
@@ -1372,7 +1412,7 @@ class Softmin(Module):
     dim: Optional[int]
 
     def __init__(self, dim: Optional[int] = None) -> None:
-        super(Softmin, self).__init__()
+        super().__init__()
         self.dim = dim
 
     def __setstate__(self, state):
@@ -1428,7 +1468,7 @@ class Softmax(Module):
     dim: Optional[int]
 
     def __init__(self, dim: Optional[int] = None) -> None:
-        super(Softmax, self).__init__()
+        super().__init__()
         self.dim = dim
 
     def __setstate__(self, state):
@@ -1499,7 +1539,7 @@ class LogSoftmax(Module):
     dim: Optional[int]
 
     def __init__(self, dim: Optional[int] = None) -> None:
-        super(LogSoftmax, self).__init__()
+        super().__init__()
         self.dim = dim
 
     def __setstate__(self, state):
diff --git a/torch/nn/modules/adaptive.py b/torch/nn/modules/adaptive.py
index 5f6fb08c82fe..f728102bc632 100644
--- a/torch/nn/modules/adaptive.py
+++ b/torch/nn/modules/adaptive.py
@@ -121,7 +121,7 @@ def __init__(
         dtype=None
     ) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(AdaptiveLogSoftmaxWithLoss, self).__init__()
+        super().__init__()
 
         cutoffs = list(cutoffs)
 
diff --git a/torch/nn/modules/batchnorm.py b/torch/nn/modules/batchnorm.py
index 66af541fa9ea..01a706ef0c8e 100644
--- a/torch/nn/modules/batchnorm.py
+++ b/torch/nn/modules/batchnorm.py
@@ -38,7 +38,7 @@ def __init__(
         dtype=None
     ) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(_NormBase, self).__init__()
+        super().__init__()
         self.num_features = num_features
         self.eps = eps
         self.momentum = momentum
@@ -107,7 +107,7 @@ def _load_from_state_dict(
             if num_batches_tracked_key not in state_dict:
                 state_dict[num_batches_tracked_key] = torch.tensor(0, dtype=torch.long)
 
-        super(_NormBase, self)._load_from_state_dict(
+        super()._load_from_state_dict(
             state_dict,
             prefix,
             local_metadata,
@@ -130,7 +130,7 @@ def __init__(
         dtype=None
     ) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(_BatchNorm, self).__init__(
+        super().__init__(
             num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
         )
 
@@ -191,7 +191,7 @@ class _LazyNormBase(LazyModuleMixin, _NormBase):
     def __init__(self, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(_LazyNormBase, self).__init__(
+        super().__init__(
             # affine and track_running_stats are hardcoded to False to
             # avoid creating tensors that will soon be overwritten.
             0,
@@ -663,7 +663,7 @@ def __init__(
         dtype=None
     ) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(SyncBatchNorm, self).__init__(
+        super().__init__(
             num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
         )
         self.process_group = process_group
diff --git a/torch/nn/modules/channelshuffle.py b/torch/nn/modules/channelshuffle.py
index 3faee2c75fc2..ffb235713c71 100644
--- a/torch/nn/modules/channelshuffle.py
+++ b/torch/nn/modules/channelshuffle.py
@@ -44,7 +44,7 @@ class ChannelShuffle(Module):
     groups: int
 
     def __init__(self, groups: int) -> None:
-        super(ChannelShuffle, self).__init__()
+        super().__init__()
         self.groups = groups
 
     def forward(self, input: Tensor) -> Tensor:
diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py
index 079a8780efb6..9ca99a023549 100644
--- a/torch/nn/modules/container.py
+++ b/torch/nn/modules/container.py
@@ -31,7 +31,7 @@ def _addindent(s_, numSpaces):
 class Container(Module):
 
     def __init__(self, **kwargs: Any) -> None:
-        super(Container, self).__init__()
+        super().__init__()
         # DeprecationWarning is ignored by default <sigh>
         warnings.warn("nn.Container is deprecated. All of it's functionality "
                       "is now implemented in nn.Module. Subclass that instead.")
@@ -95,7 +95,7 @@ def __init__(self, arg: 'OrderedDict[str, Module]') -> None:
         ...
 
     def __init__(self, *args):
-        super(Sequential, self).__init__()
+        super().__init__()
         if len(args) == 1 and isinstance(args[0], OrderedDict):
             for key, module in args[0].items():
                 self.add_module(key, module)
@@ -200,7 +200,7 @@ def __imul__(self, other: int) -> 'Sequential':
 
     @_copy_to_script_wrapper
     def __dir__(self):
-        keys = super(Sequential, self).__dir__()
+        keys = super().__dir__()
         keys = [key for key in keys if not key.isdigit()]
         return keys
 
@@ -261,7 +261,7 @@ class ModuleList(Module):
 
         class MyModule(nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)])
 
             def forward(self, x):
@@ -274,7 +274,7 @@ def forward(self, x):
     _modules: Dict[str, Module]  # type: ignore[assignment]
 
     def __init__(self, modules: Optional[Iterable[Module]] = None) -> None:
-        super(ModuleList, self).__init__()
+        super().__init__()
         if modules is not None:
             self += modules
 
@@ -359,7 +359,7 @@ def __repr__(self):
 
     @_copy_to_script_wrapper
     def __dir__(self):
-        keys = super(ModuleList, self).__dir__()
+        keys = super().__dir__()
         keys = [key for key in keys if not key.isdigit()]
         return keys
 
@@ -433,7 +433,7 @@ class ModuleDict(Module):
 
         class MyModule(nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.choices = nn.ModuleDict({
                         'conv': nn.Conv2d(10, 10, 3),
                         'pool': nn.MaxPool2d(3)
@@ -452,7 +452,7 @@ def forward(self, x, choice, act):
     _modules: Dict[str, Module]  # type: ignore[assignment]
 
     def __init__(self, modules: Optional[Mapping[str, Module]] = None) -> None:
-        super(ModuleDict, self).__init__()
+        super().__init__()
         if modules is not None:
             self.update(modules)
 
@@ -567,7 +567,7 @@ class ParameterList(Module):
 
         class MyModule(nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.params = nn.ParameterList([nn.Parameter(torch.randn(10, 10)) for i in range(10)])
 
             def forward(self, x):
@@ -578,7 +578,7 @@ def forward(self, x):
     """
 
     def __init__(self, values: Optional[Iterable[Any]] = None) -> None:
-        super(ParameterList, self).__init__()
+        super().__init__()
         self._size = 0
         if values is not None:
             self += values
@@ -632,7 +632,7 @@ def __iadd__(self, parameters: Iterable[Any]) -> 'ParameterList':
         return self.extend(parameters)
 
     def __dir__(self):
-        keys = super(ParameterList, self).__dir__()
+        keys = super().__dir__()
         keys = [key for key in keys if not key.isdigit()]
         return keys
 
@@ -707,7 +707,7 @@ class ParameterDict(Module):
 
         class MyModule(nn.Module):
             def __init__(self):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.params = nn.ParameterDict({
                         'left': nn.Parameter(torch.randn(5, 10)),
                         'right': nn.Parameter(torch.randn(5, 10))
@@ -719,7 +719,7 @@ def forward(self, x, choice):
     """
 
     def __init__(self, parameters: Any = None) -> None:
-        super(ParameterDict, self).__init__()
+        super().__init__()
         self._keys: Dict[str, None] = {}
         if parameters is not None:
             self.update(parameters)
diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py
index 5c081e64ecca..bace244553e0 100644
--- a/torch/nn/modules/conv.py
+++ b/torch/nn/modules/conv.py
@@ -83,7 +83,7 @@ def __init__(self,
                  device=None,
                  dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(_ConvNd, self).__init__()
+        super().__init__()
         if groups <= 0:
             raise ValueError('groups must be a positive integer')
         if in_channels % groups != 0:
@@ -172,7 +172,7 @@ def extra_repr(self):
         return s.format(**self.__dict__)
 
     def __setstate__(self, state):
-        super(_ConvNd, self).__setstate__(state)
+        super().__setstate__(state)
         if not hasattr(self, 'padding_mode'):
             self.padding_mode = 'zeros'
 
@@ -297,7 +297,7 @@ def __init__(
         stride_ = _single(stride)
         padding_ = padding if isinstance(padding, str) else _single(padding)
         dilation_ = _single(dilation)
-        super(Conv1d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size_, stride_, padding_, dilation_,
             False, _single(0), groups, bias, padding_mode, **factory_kwargs)
 
@@ -447,7 +447,7 @@ def __init__(
         stride_ = _pair(stride)
         padding_ = padding if isinstance(padding, str) else _pair(padding)
         dilation_ = _pair(dilation)
-        super(Conv2d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size_, stride_, padding_, dilation_,
             False, _pair(0), groups, bias, padding_mode, **factory_kwargs)
 
@@ -588,7 +588,7 @@ def __init__(
         stride_ = _triple(stride)
         padding_ = padding if isinstance(padding, str) else _triple(padding)
         dilation_ = _triple(dilation)
-        super(Conv3d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size_, stride_, padding_, dilation_,
             False, _triple(0), groups, bias, padding_mode, **factory_kwargs)
 
@@ -622,7 +622,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride,
             raise ValueError('Only "zeros" padding mode is supported for {}'.format(self.__class__.__name__))
 
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(_ConvTransposeNd, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride,
             padding, dilation, transposed, output_padding,
             groups, bias, padding_mode, **factory_kwargs)
@@ -783,7 +783,7 @@ def __init__(
         padding = _single(padding)
         dilation = _single(dilation)
         output_padding = _single(output_padding)
-        super(ConvTranspose1d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride, padding, dilation,
             True, output_padding, groups, bias, padding_mode, **factory_kwargs)
 
@@ -937,7 +937,7 @@ def __init__(
         padding = _pair(padding)
         dilation = _pair(dilation)
         output_padding = _pair(output_padding)
-        super(ConvTranspose2d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride, padding, dilation,
             True, output_padding, groups, bias, padding_mode, **factory_kwargs)
 
@@ -1089,7 +1089,7 @@ def __init__(
         padding = _triple(padding)
         dilation = _triple(dilation)
         output_padding = _triple(output_padding)
-        super(ConvTranspose3d, self).__init__(
+        super().__init__(
             in_channels, out_channels, kernel_size, stride, padding, dilation,
             True, output_padding, groups, bias, padding_mode, **factory_kwargs)
 
@@ -1130,7 +1130,7 @@ def __init__(self, *args, **kwargs):
         warnings.warn(
             "_ConvTransposeMixin is a deprecated internal class. "
             "Please consider using public APIs.")
-        super(_ConvTransposeMixin, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
 
 
 # TODO: Conv2dLocal
diff --git a/torch/nn/modules/distance.py b/torch/nn/modules/distance.py
index 73ba31b8868a..83478a294c69 100644
--- a/torch/nn/modules/distance.py
+++ b/torch/nn/modules/distance.py
@@ -44,7 +44,7 @@ class PairwiseDistance(Module):
     keepdim: bool
 
     def __init__(self, p: float = 2., eps: float = 1e-6, keepdim: bool = False) -> None:
-        super(PairwiseDistance, self).__init__()
+        super().__init__()
         self.norm = p
         self.eps = eps
         self.keepdim = keepdim
@@ -79,7 +79,7 @@ class CosineSimilarity(Module):
     eps: float
 
     def __init__(self, dim: int = 1, eps: float = 1e-8) -> None:
-        super(CosineSimilarity, self).__init__()
+        super().__init__()
         self.dim = dim
         self.eps = eps
 
diff --git a/torch/nn/modules/dropout.py b/torch/nn/modules/dropout.py
index 0b35bd546e23..a92a58c0f882 100644
--- a/torch/nn/modules/dropout.py
+++ b/torch/nn/modules/dropout.py
@@ -11,7 +11,7 @@ class _DropoutNd(Module):
     inplace: bool
 
     def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
-        super(_DropoutNd, self).__init__()
+        super().__init__()
         if p < 0 or p > 1:
             raise ValueError("dropout probability has to be between 0 and 1, "
                              "but got {}".format(p))
diff --git a/torch/nn/modules/flatten.py b/torch/nn/modules/flatten.py
index 616b6bc690e3..ab9868f9e72e 100644
--- a/torch/nn/modules/flatten.py
+++ b/torch/nn/modules/flatten.py
@@ -38,7 +38,7 @@ class Flatten(Module):
     end_dim: int
 
     def __init__(self, start_dim: int = 1, end_dim: int = -1) -> None:
-        super(Flatten, self).__init__()
+        super().__init__()
         self.start_dim = start_dim
         self.end_dim = end_dim
 
@@ -104,7 +104,7 @@ class Unflatten(Module):
     unflattened_size: Union[_size, NamedShape]
 
     def __init__(self, dim: Union[int, str], unflattened_size: Union[_size, NamedShape]) -> None:
-        super(Unflatten, self).__init__()
+        super().__init__()
 
         if isinstance(dim, int):
             self._require_tuple_int(unflattened_size)
diff --git a/torch/nn/modules/fold.py b/torch/nn/modules/fold.py
index a7b1f758dd5a..770ba429bd76 100644
--- a/torch/nn/modules/fold.py
+++ b/torch/nn/modules/fold.py
@@ -135,7 +135,7 @@ def __init__(
         padding: _size_any_t = 0,
         stride: _size_any_t = 1
     ) -> None:
-        super(Fold, self).__init__()
+        super().__init__()
         self.output_size = output_size
         self.kernel_size = kernel_size
         self.dilation = dilation
@@ -288,7 +288,7 @@ def __init__(
         padding: _size_any_t = 0,
         stride: _size_any_t = 1
     ) -> None:
-        super(Unfold, self).__init__()
+        super().__init__()
         self.kernel_size = kernel_size
         self.dilation = dilation
         self.padding = padding
diff --git a/torch/nn/modules/instancenorm.py b/torch/nn/modules/instancenorm.py
index 6d384ebb427b..ceb34f310a24 100644
--- a/torch/nn/modules/instancenorm.py
+++ b/torch/nn/modules/instancenorm.py
@@ -18,7 +18,7 @@ def __init__(
         dtype=None
     ) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(_InstanceNorm, self).__init__(
+        super().__init__(
             num_features, eps, momentum, affine, track_running_stats, **factory_kwargs)
 
     def _check_input_dim(self, input):
@@ -61,7 +61,7 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                 for key in running_stats_keys:
                     state_dict.pop(key)
 
-        super(_InstanceNorm, self)._load_from_state_dict(
+        super()._load_from_state_dict(
             state_dict, prefix, local_metadata, strict,
             missing_keys, unexpected_keys, error_msgs)
 
diff --git a/torch/nn/modules/lazy.py b/torch/nn/modules/lazy.py
index d214f6e5eb5d..0c77c3550d15 100644
--- a/torch/nn/modules/lazy.py
+++ b/torch/nn/modules/lazy.py
@@ -1,6 +1,6 @@
 import itertools
-from typing_extensions import Protocol
 import warnings
+from typing import Protocol
 
 import torch
 from ..parameter import is_lazy
diff --git a/torch/nn/modules/linear.py b/torch/nn/modules/linear.py
index 18bf25f71023..07d429bb13b0 100644
--- a/torch/nn/modules/linear.py
+++ b/torch/nn/modules/linear.py
@@ -39,7 +39,7 @@ class Identity(Module):
 
     """
     def __init__(self, *args: Any, **kwargs: Any) -> None:
-        super(Identity, self).__init__()
+        super().__init__()
 
     def forward(self, input: Tensor) -> Tensor:
         return input
@@ -90,7 +90,7 @@ class Linear(Module):
     def __init__(self, in_features: int, out_features: int, bias: bool = True,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(Linear, self).__init__()
+        super().__init__()
         self.in_features = in_features
         self.out_features = out_features
         self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs))
@@ -178,7 +178,7 @@ class Bilinear(Module):
     def __init__(self, in1_features: int, in2_features: int, out_features: int, bias: bool = True,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(Bilinear, self).__init__()
+        super().__init__()
         self.in1_features = in1_features
         self.in2_features = in2_features
         self.out_features = out_features
diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 2271d75f332a..e31ecdf57969 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -18,7 +18,7 @@ class _Loss(Module):
     reduction: str
 
     def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
-        super(_Loss, self).__init__()
+        super().__init__()
         if size_average is not None or reduce is not None:
             self.reduction: str = _Reduction.legacy_get_string(size_average, reduce)
         else:
@@ -27,7 +27,7 @@ def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> N
 
 class _WeightedLoss(_Loss):
     def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean') -> None:
-        super(_WeightedLoss, self).__init__(size_average, reduce, reduction)
+        super().__init__(size_average, reduce, reduction)
         self.register_buffer('weight', weight)
         self.weight: Optional[Tensor]
 
@@ -95,7 +95,7 @@ class L1Loss(_Loss):
     __constants__ = ['reduction']
 
     def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
-        super(L1Loss, self).__init__(size_average, reduce, reduction)
+        super().__init__(size_average, reduce, reduction)
 
     def forward(self, input: Tensor, target: Tensor) -> Tensor:
         return F.l1_loss(input, target, reduction=self.reduction)
@@ -209,7 +209,7 @@ class NLLLoss(_WeightedLoss):
 
     def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100,
                  reduce=None, reduction: str = 'mean') -> None:
-        super(NLLLoss, self).__init__(weight, size_average, reduce, reduction)
+        super().__init__(weight, size_average, reduce, reduction)
         self.ignore_index = ignore_index
 
     def forward(self, input: Tensor, target: Tensor) -> Tensor:
@@ -222,7 +222,7 @@ def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_in
         warnings.warn("NLLLoss2d has been deprecated. "
                       "Please use NLLLoss instead as a drop-in replacement and see "
                       "https://pytorch.org/docs/master/nn.html#torch.nn.NLLLoss for more details.")
-        super(NLLLoss2d, self).__init__(weight, size_average, ignore_index, reduce, reduction)
+        super().__init__(weight, size_average, ignore_index, reduce, reduction)
 
 
 class PoissonNLLLoss(_Loss):
@@ -288,7 +288,7 @@ class PoissonNLLLoss(_Loss):
 
     def __init__(self, log_input: bool = True, full: bool = False, size_average=None,
                  eps: float = 1e-8, reduce=None, reduction: str = 'mean') -> None:
-        super(PoissonNLLLoss, self).__init__(size_average, reduce, reduction)
+        super().__init__(size_average, reduce, reduction)
         self.log_input = log_input
         self.full = full
         self.eps = eps
@@ -369,7 +369,7 @@ class GaussianNLLLoss(_Loss):
     eps: float
 
     def __init__(self, *, full: bool = False, eps: float = 1e-6, reduction: str = 'mean') -> None:
-        super(GaussianNLLLoss, self).__init__(None, None, reduction)
+        super().__init__(None, None, reduction)
         self.full = full
         self.eps = eps
 
@@ -464,7 +464,7 @@ class KLDivLoss(_Loss):
     __constants__ = ['reduction']
 
     def __init__(self, size_average=None, reduce=None, reduction: str = 'mean', log_target: bool = False) -> None:
-        super(KLDivLoss, self).__init__(size_average, reduce, reduction)
+        super().__init__(size_average, reduce, reduction)
         self.log_target = log_target
 
     def forward(self, input: Tensor, target: Tensor) -> Tensor:
@@ -530,7 +530,7 @@ class MSELoss(_Loss):
     __constants__ = ['reduction']
 
     def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
-        super(MSELoss, self).__init__(size_average, reduce, reduction)
+        super().__init__(size_average, reduce, reduction)
 
     def forward(self, input: Tensor, target: Tensor) -> Tensor:
         return F.mse_loss(input, target, reduction=self.reduction)
@@ -613,7 +613,7 @@ class BCELoss(_WeightedLoss):
     __constants__ = ['reduction']
 
     def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean') -> None:
-        super(BCELoss, self).__init__(weight, size_average, reduce, reduction)
+        super().__init__(weight, size_average, reduce, reduction)
 
     def forward(self, input: Tensor, target: Tensor) -> Tensor:
         return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
@@ -710,7 +710,7 @@ class BCEWithLogitsLoss(_Loss):
     """
     def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean',
                  pos_weight: Optional[Tensor] = None) -> None:
-        super(BCEWithLogitsLoss, self).__init__(size_average, reduce, reduction)
+        super().__init__(size_average, reduce, reduction)
         self.register_buffer('weight', weight)
         self.register_buffer('pos_weight', pos_weight)
         self.weight: Optional[Tensor]
@@ -776,7 +776,7 @@ class HingeEmbeddingLoss(_Loss):
     margin: float
 
     def __init__(self, margin: float = 1.0, size_average=None, reduce=None, reduction: str = 'mean') -> None:
-        super(HingeEmbeddingLoss, self).__init__(size_average, reduce, reduction)
+        super().__init__(size_average, reduce, reduction)
         self.margin = margin
 
     def forward(self, input: Tensor, target: Tensor) -> Tensor:
@@ -841,7 +841,7 @@ class MultiLabelMarginLoss(_Loss):
     __constants__ = ['reduction']
 
     def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
-        super(MultiLabelMarginLoss, self).__init__(size_average, reduce, reduction)
+        super().__init__(size_average, reduce, reduction)
 
     def forward(self, input: Tensor, target: Tensor) -> Tensor:
         return F.multilabel_margin_loss(input, target, reduction=self.reduction)
@@ -921,7 +921,7 @@ class SmoothL1Loss(_Loss):
     __constants__ = ['reduction']
 
     def __init__(self, size_average=None, reduce=None, reduction: str = 'mean', beta: float = 1.0) -> None:
-        super(SmoothL1Loss, self).__init__(size_average, reduce, reduction)
+        super().__init__(size_average, reduce, reduction)
         self.beta = beta
 
     def forward(self, input: Tensor, target: Tensor) -> Tensor:
@@ -1023,7 +1023,7 @@ class SoftMarginLoss(_Loss):
     __constants__ = ['reduction']
 
     def __init__(self, size_average=None, reduce=None, reduction: str = 'mean') -> None:
-        super(SoftMarginLoss, self).__init__(size_average, reduce, reduction)
+        super().__init__(size_average, reduce, reduction)
 
     def forward(self, input: Tensor, target: Tensor) -> Tensor:
         return F.soft_margin_loss(input, target, reduction=self.reduction)
@@ -1166,7 +1166,7 @@ class probabilities only when a single class label per minibatch item is too res
 
     def __init__(self, weight: Optional[Tensor] = None, size_average=None, ignore_index: int = -100,
                  reduce=None, reduction: str = 'mean', label_smoothing: float = 0.0) -> None:
-        super(CrossEntropyLoss, self).__init__(weight, size_average, reduce, reduction)
+        super().__init__(weight, size_average, reduce, reduction)
         self.ignore_index = ignore_index
         self.label_smoothing = label_smoothing
 
@@ -1217,7 +1217,7 @@ class MultiLabelSoftMarginLoss(_WeightedLoss):
     __constants__ = ['reduction']
 
     def __init__(self, weight: Optional[Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean') -> None:
-        super(MultiLabelSoftMarginLoss, self).__init__(weight, size_average, reduce, reduction)
+        super().__init__(weight, size_average, reduce, reduction)
 
     def forward(self, input: Tensor, target: Tensor) -> Tensor:
         return F.multilabel_soft_margin_loss(input, target, weight=self.weight, reduction=self.reduction)
@@ -1269,7 +1269,7 @@ class CosineEmbeddingLoss(_Loss):
     margin: float
 
     def __init__(self, margin: float = 0., size_average=None, reduce=None, reduction: str = 'mean') -> None:
-        super(CosineEmbeddingLoss, self).__init__(size_average, reduce, reduction)
+        super().__init__(size_average, reduce, reduction)
         self.margin = margin
 
     def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor:
@@ -1326,7 +1326,7 @@ class MarginRankingLoss(_Loss):
     margin: float
 
     def __init__(self, margin: float = 0., size_average=None, reduce=None, reduction: str = 'mean') -> None:
-        super(MarginRankingLoss, self).__init__(size_average, reduce, reduction)
+        super().__init__(size_average, reduce, reduction)
         self.margin = margin
 
     def forward(self, input1: Tensor, input2: Tensor, target: Tensor) -> Tensor:
@@ -1399,7 +1399,7 @@ class MultiMarginLoss(_WeightedLoss):
 
     def __init__(self, p: int = 1, margin: float = 1., weight: Optional[Tensor] = None, size_average=None,
                  reduce=None, reduction: str = 'mean') -> None:
-        super(MultiMarginLoss, self).__init__(weight, size_average, reduce, reduction)
+        super().__init__(weight, size_average, reduce, reduction)
         if p != 1 and p != 2:
             raise ValueError("only p == 1 and p == 2 supported")
         assert weight is None or weight.dim() == 1
@@ -1484,7 +1484,7 @@ class TripletMarginLoss(_Loss):
 
     def __init__(self, margin: float = 1.0, p: float = 2., eps: float = 1e-6, swap: bool = False, size_average=None,
                  reduce=None, reduction: str = 'mean'):
-        super(TripletMarginLoss, self).__init__(size_average, reduce, reduction)
+        super().__init__(size_average, reduce, reduction)
         self.margin = margin
         self.p = p
         self.eps = eps
@@ -1599,7 +1599,7 @@ class TripletMarginWithDistanceLoss(_Loss):
 
     def __init__(self, *, distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = None,
                  margin: float = 1.0, swap: bool = False, reduction: str = 'mean'):
-        super(TripletMarginWithDistanceLoss, self).__init__(size_average=None, reduce=None, reduction=reduction)
+        super().__init__(size_average=None, reduce=None, reduction=reduction)
         self.distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = \
             distance_function if distance_function is not None else PairwiseDistance()
         self.margin = margin
@@ -1748,7 +1748,7 @@ class CTCLoss(_Loss):
     zero_infinity: bool
 
     def __init__(self, blank: int = 0, reduction: str = 'mean', zero_infinity: bool = False):
-        super(CTCLoss, self).__init__(reduction=reduction)
+        super().__init__(reduction=reduction)
         self.blank = blank
         self.zero_infinity = zero_infinity
 
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index b1d5671c6be8..5f82dc65d383 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -28,7 +28,7 @@ class _IncompatibleKeys(namedtuple('IncompatibleKeys', ['missing_keys', 'unexpec
     def __repr__(self):
         if not self.missing_keys and not self.unexpected_keys:
             return '<All keys matched successfully>'
-        return super(_IncompatibleKeys, self).__repr__()
+        return super().__repr__()
 
     __str__ = __repr__
 
@@ -432,13 +432,23 @@ def forward(self, x):
     _state_dict_pre_hooks: Dict[int, Callable]
     _load_state_dict_post_hooks: Dict[int, Callable]
     _modules: Dict[str, Optional['Module']]
+    call_super_init: bool = False
 
-    def __init__(self) -> None:
+    def __init__(self, *args, **kwargs) -> None:
         """
         Initializes internal Module state, shared by both nn.Module and ScriptModule.
         """
         torch._C._log_api_usage_once("python.nn_module")
 
+        # Backward compatibility: no args used to be allowed when call_super_init=False
+        if self.call_super_init is False and bool(kwargs):
+            raise TypeError("{}.__init__() got an unexpected keyword argument '{}'"
+                            "".format(type(self).__name__, next(iter(kwargs))))
+
+        if self.call_super_init is False and bool(args):
+            raise TypeError("{}.__init__() takes 1 positional argument but {} were"
+                            " given".format(type(self).__name__, len(args) + 1))
+
         """
         Calls super().__setattr__('a', a) instead of the typical self.a = a
         to avoid Module.__setattr__ overhead. Module's __setattr__ has special
@@ -462,6 +472,9 @@ def __init__(self) -> None:
         super().__setattr__('_load_state_dict_post_hooks', OrderedDict())
         super().__setattr__('_modules', OrderedDict())
 
+        if self.call_super_init:
+            super().__init__(*args, **kwargs)
+
     forward: Callable[..., Any] = _forward_unimplemented
 
     def register_buffer(self, name: str, tensor: Optional[Tensor], persistent: bool = True) -> None:
@@ -499,7 +512,7 @@ def register_buffer(self, name: str, tensor: Optional[Tensor], persistent: bool
         if '_buffers' not in self.__dict__:
             raise AttributeError(
                 "cannot assign buffer before Module.__init__() call")
-        elif not isinstance(name, torch._six.string_classes):
+        elif not isinstance(name, str):
             raise TypeError("buffer name should be a string. "
                             "Got {}".format(torch.typename(name)))
         elif '.' in name:
@@ -540,7 +553,7 @@ def register_parameter(self, name: str, param: Optional[Parameter]) -> None:
             raise AttributeError(
                 "cannot assign parameter before Module.__init__() call")
 
-        elif not isinstance(name, torch._six.string_classes):
+        elif not isinstance(name, str):
             raise TypeError("parameter name should be a string. "
                             "Got {}".format(torch.typename(name)))
         elif '.' in name:
@@ -582,7 +595,7 @@ def add_module(self, name: str, module: Optional['Module']) -> None:
         if not isinstance(module, Module) and module is not None:
             raise TypeError("{} is not a Module subclass".format(
                 torch.typename(module)))
-        elif not isinstance(name, torch._six.string_classes):
+        elif not isinstance(name, str):
             raise TypeError("module name should be a string. Got {}".format(
                 torch.typename(name)))
         elif hasattr(self, name) and name not in self._modules:
@@ -2099,8 +2112,7 @@ def named_parameters(
         gen = self._named_members(
             lambda module: module._parameters.items(),
             prefix=prefix, recurse=recurse, remove_duplicate=remove_duplicate)
-        for elem in gen:
-            yield elem
+        yield from gen
 
     def buffers(self, recurse: bool = True) -> Iterator[Tensor]:
         r"""Returns an iterator over module buffers.
@@ -2150,8 +2162,7 @@ def named_buffers(self, prefix: str = '', recurse: bool = True, remove_duplicate
         gen = self._named_members(
             lambda module: module._buffers.items(),
             prefix=prefix, recurse=recurse, remove_duplicate=remove_duplicate)
-        for elem in gen:
-            yield elem
+        yield from gen
 
     def children(self) -> Iterator['Module']:
         r"""Returns an iterator over immediate children modules.
@@ -2319,8 +2330,8 @@ def requires_grad_(self: T, requires_grad: bool = True) -> T:
             p.requires_grad_(requires_grad)
         return self
 
-    def zero_grad(self, set_to_none: bool = False) -> None:
-        r"""Sets gradients of all model parameters to zero. See similar function
+    def zero_grad(self, set_to_none: bool = True) -> None:
+        r"""Resets gradients of all model parameters. See similar function
         under :class:`torch.optim.Optimizer` for more context.
 
         Args:
diff --git a/torch/nn/modules/normalization.py b/torch/nn/modules/normalization.py
index ce2b83253a07..82ab69b7dbea 100644
--- a/torch/nn/modules/normalization.py
+++ b/torch/nn/modules/normalization.py
@@ -46,7 +46,7 @@ class LocalResponseNorm(Module):
     k: float
 
     def __init__(self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1.) -> None:
-        super(LocalResponseNorm, self).__init__()
+        super().__init__()
         self.size = size
         self.alpha = alpha
         self.beta = beta
@@ -67,7 +67,7 @@ class CrossMapLRN2d(Module):
     k: float
 
     def __init__(self, size: int, alpha: float = 1e-4, beta: float = 0.75, k: float = 1) -> None:
-        super(CrossMapLRN2d, self).__init__()
+        super().__init__()
         self.size = size
         self.alpha = alpha
         self.beta = beta
@@ -165,7 +165,7 @@ class LayerNorm(Module):
     def __init__(self, normalized_shape: _shape_t, eps: float = 1e-5, elementwise_affine: bool = True,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(LayerNorm, self).__init__()
+        super().__init__()
         if isinstance(normalized_shape, numbers.Integral):
             # mypy error: incompatible types in assignment
             normalized_shape = (normalized_shape,)  # type: ignore[assignment]
@@ -247,7 +247,7 @@ class GroupNorm(Module):
     def __init__(self, num_groups: int, num_channels: int, eps: float = 1e-5, affine: bool = True,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(GroupNorm, self).__init__()
+        super().__init__()
         if num_channels % num_groups != 0:
             raise ValueError('num_channels must be divisible by num_groups')
 
diff --git a/torch/nn/modules/padding.py b/torch/nn/modules/padding.py
index df8d78837961..9ead68337271 100644
--- a/torch/nn/modules/padding.py
+++ b/torch/nn/modules/padding.py
@@ -18,7 +18,7 @@ class _ConstantPadNd(Module):
     padding: Sequence[int]
 
     def __init__(self, value: float) -> None:
-        super(_ConstantPadNd, self).__init__()
+        super().__init__()
         self.value = value
 
     def forward(self, input: Tensor) -> Tensor:
@@ -75,7 +75,7 @@ class ConstantPad1d(_ConstantPadNd):
     padding: Tuple[int, int]
 
     def __init__(self, padding: _size_2_t, value: float):
-        super(ConstantPad1d, self).__init__(value)
+        super().__init__(value)
         self.padding = _pair(padding)
 
 
@@ -126,7 +126,7 @@ class ConstantPad2d(_ConstantPadNd):
     padding: Tuple[int, int, int, int]
 
     def __init__(self, padding: _size_4_t, value: float) -> None:
-        super(ConstantPad2d, self).__init__(value)
+        super().__init__(value)
         self.padding = _quadruple(padding)
 
 
@@ -166,7 +166,7 @@ class ConstantPad3d(_ConstantPadNd):
     padding: Tuple[int, int, int, int, int, int]
 
     def __init__(self, padding: _size_6_t, value: float) -> None:
-        super(ConstantPad3d, self).__init__(value)
+        super().__init__(value)
         self.padding = _ntuple(6)(padding)
 
 
@@ -218,7 +218,7 @@ class ReflectionPad1d(_ReflectionPadNd):
     padding: Tuple[int, int]
 
     def __init__(self, padding: _size_2_t) -> None:
-        super(ReflectionPad1d, self).__init__()
+        super().__init__()
         self.padding = _pair(padding)
 
 
@@ -270,7 +270,7 @@ class ReflectionPad2d(_ReflectionPadNd):
     padding: Tuple[int, int, int, int]
 
     def __init__(self, padding: _size_4_t) -> None:
-        super(ReflectionPad2d, self).__init__()
+        super().__init__()
         self.padding = _quadruple(padding)
 
 
@@ -323,7 +323,7 @@ class ReflectionPad3d(_ReflectionPadNd):
     padding: Tuple[int, int, int, int, int, int]
 
     def __init__(self, padding: _size_6_t) -> None:
-        super(ReflectionPad3d, self).__init__()
+        super().__init__()
         self.padding = _ntuple(6)(padding)
 
 
@@ -375,7 +375,7 @@ class ReplicationPad1d(_ReplicationPadNd):
     padding: Tuple[int, int]
 
     def __init__(self, padding: _size_2_t) -> None:
-        super(ReplicationPad1d, self).__init__()
+        super().__init__()
         self.padding = _pair(padding)
 
 
@@ -427,7 +427,7 @@ class ReplicationPad2d(_ReplicationPadNd):
     padding: Tuple[int, int, int, int]
 
     def __init__(self, padding: _size_4_t) -> None:
-        super(ReplicationPad2d, self).__init__()
+        super().__init__()
         self.padding = _quadruple(padding)
 
 
@@ -468,7 +468,7 @@ class ReplicationPad3d(_ReplicationPadNd):
     padding: Tuple[int, int, int, int, int, int]
 
     def __init__(self, padding: _size_6_t) -> None:
-        super(ReplicationPad3d, self).__init__()
+        super().__init__()
         self.padding = _ntuple(6)(padding)
 
 
@@ -520,7 +520,7 @@ class ZeroPad2d(ConstantPad2d):
     padding: Tuple[int, int, int, int]
 
     def __init__(self, padding: _size_4_t) -> None:
-        super(ZeroPad2d, self).__init__(padding, 0.)
+        super().__init__(padding, 0.)
 
     def extra_repr(self) -> str:
         return '{}'.format(self.padding)
diff --git a/torch/nn/modules/pixelshuffle.py b/torch/nn/modules/pixelshuffle.py
index eb5e48dd4b0e..5120a21eed10 100644
--- a/torch/nn/modules/pixelshuffle.py
+++ b/torch/nn/modules/pixelshuffle.py
@@ -47,7 +47,7 @@ class PixelShuffle(Module):
     upscale_factor: int
 
     def __init__(self, upscale_factor: int) -> None:
-        super(PixelShuffle, self).__init__()
+        super().__init__()
         self.upscale_factor = upscale_factor
 
     def forward(self, input: Tensor) -> Tensor:
@@ -97,7 +97,7 @@ class PixelUnshuffle(Module):
     downscale_factor: int
 
     def __init__(self, downscale_factor: int) -> None:
-        super(PixelUnshuffle, self).__init__()
+        super().__init__()
         self.downscale_factor = downscale_factor
 
     def forward(self, input: Tensor) -> Tensor:
diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py
index 3d65bb22e146..677ce43d9c2d 100644
--- a/torch/nn/modules/pooling.py
+++ b/torch/nn/modules/pooling.py
@@ -22,7 +22,7 @@ class _MaxPoolNd(Module):
     def __init__(self, kernel_size: _size_any_t, stride: Optional[_size_any_t] = None,
                  padding: _size_any_t = 0, dilation: _size_any_t = 1,
                  return_indices: bool = False, ceil_mode: bool = False) -> None:
-        super(_MaxPoolNd, self).__init__()
+        super().__init__()
         self.kernel_size = kernel_size
         self.stride = stride if (stride is not None) else kernel_size
         self.padding = padding
@@ -263,6 +263,10 @@ class MaxUnpool1d(_MaxUnpoolNd):
     including the indices of the maximal values and computes a partial inverse
     in which all non-maximal values are set to zero.
 
+    Note:
+        This operation may behave nondeterministically when the input indices has repeat values.
+        See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information.
+
     .. note:: :class:`MaxPool1d` can map several input sizes to the same output
               sizes. Hence, the inversion process can get ambiguous.
               To accommodate this, you can provide the needed output size
@@ -314,7 +318,7 @@ class MaxUnpool1d(_MaxUnpoolNd):
     padding: _size_1_t
 
     def __init__(self, kernel_size: _size_1_t, stride: Optional[_size_1_t] = None, padding: _size_1_t = 0) -> None:
-        super(MaxUnpool1d, self).__init__()
+        super().__init__()
         self.kernel_size = _single(kernel_size)
         self.stride = _single(stride if (stride is not None) else kernel_size)
         self.padding = _single(padding)
@@ -333,6 +337,10 @@ class MaxUnpool2d(_MaxUnpoolNd):
     including the indices of the maximal values and computes a partial inverse
     in which all non-maximal values are set to zero.
 
+    Note:
+        This operation may behave nondeterministically when the input indices has repeat values.
+        See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information.
+
     .. note:: :class:`MaxPool2d` can map several input sizes to the same output
               sizes. Hence, the inversion process can get ambiguous.
               To accommodate this, you can provide the needed output size
@@ -397,7 +405,7 @@ class MaxUnpool2d(_MaxUnpoolNd):
     padding: _size_2_t
 
     def __init__(self, kernel_size: _size_2_t, stride: Optional[_size_2_t] = None, padding: _size_2_t = 0) -> None:
-        super(MaxUnpool2d, self).__init__()
+        super().__init__()
         self.kernel_size = _pair(kernel_size)
         self.stride = _pair(stride if (stride is not None) else kernel_size)
         self.padding = _pair(padding)
@@ -415,6 +423,10 @@ class MaxUnpool3d(_MaxUnpoolNd):
     including the indices of the maximal values and computes a partial inverse
     in which all non-maximal values are set to zero.
 
+    Note:
+        This operation may behave nondeterministically when the input indices has repeat values.
+        See https://github.com/pytorch/pytorch/issues/80827 and :doc:`/notes/randomness` for more information.
+
     .. note:: :class:`MaxPool3d` can map several input sizes to the same output
               sizes. Hence, the inversion process can get ambiguous.
               To accommodate this, you can provide the needed output size
@@ -463,7 +475,7 @@ class MaxUnpool3d(_MaxUnpoolNd):
     padding: _size_3_t
 
     def __init__(self, kernel_size: _size_3_t, stride: Optional[_size_3_t] = None, padding: _size_3_t = 0) -> None:
-        super(MaxUnpool3d, self).__init__()
+        super().__init__()
         self.kernel_size = _triple(kernel_size)
         self.stride = _triple(stride if (stride is not None) else kernel_size)
         self.padding = _triple(padding)
@@ -536,7 +548,7 @@ class AvgPool1d(_AvgPoolNd):
 
     def __init__(self, kernel_size: _size_1_t, stride: _size_1_t = None, padding: _size_1_t = 0, ceil_mode: bool = False,
                  count_include_pad: bool = True) -> None:
-        super(AvgPool1d, self).__init__()
+        super().__init__()
         self.kernel_size = _single(kernel_size)
         self.stride = _single(stride if stride is not None else kernel_size)
         self.padding = _single(padding)
@@ -615,7 +627,7 @@ class AvgPool2d(_AvgPoolNd):
 
     def __init__(self, kernel_size: _size_2_t, stride: Optional[_size_2_t] = None, padding: _size_2_t = 0,
                  ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> None:
-        super(AvgPool2d, self).__init__()
+        super().__init__()
         self.kernel_size = kernel_size
         self.stride = stride if (stride is not None) else kernel_size
         self.padding = padding
@@ -701,7 +713,7 @@ class AvgPool3d(_AvgPoolNd):
 
     def __init__(self, kernel_size: _size_3_t, stride: Optional[_size_3_t] = None, padding: _size_3_t = 0,
                  ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> None:
-        super(AvgPool3d, self).__init__()
+        super().__init__()
         self.kernel_size = kernel_size
         self.stride = stride if (stride is not None) else kernel_size
         self.padding = padding
@@ -714,7 +726,7 @@ def forward(self, input: Tensor) -> Tensor:
                             self.padding, self.ceil_mode, self.count_include_pad, self.divisor_override)
 
     def __setstate__(self, d):
-        super(AvgPool3d, self).__setstate__(d)
+        super().__setstate__(d)
         self.__dict__.setdefault('padding', 0)
         self.__dict__.setdefault('ceil_mode', False)
         self.__dict__.setdefault('count_include_pad', True)
@@ -767,7 +779,7 @@ class FractionalMaxPool2d(Module):
     def __init__(self, kernel_size: _size_2_t, output_size: Optional[_size_2_t] = None,
                  output_ratio: Optional[_ratio_2_t] = None,
                  return_indices: bool = False, _random_samples=None) -> None:
-        super(FractionalMaxPool2d, self).__init__()
+        super().__init__()
         self.kernel_size = _pair(kernel_size)
         self.return_indices = return_indices
         self.register_buffer('_random_samples', _random_samples)
@@ -836,7 +848,7 @@ class FractionalMaxPool3d(Module):
     def __init__(self, kernel_size: _size_3_t, output_size: Optional[_size_3_t] = None,
                  output_ratio: Optional[_ratio_3_t] = None,
                  return_indices: bool = False, _random_samples=None) -> None:
-        super(FractionalMaxPool3d, self).__init__()
+        super().__init__()
         self.kernel_size = _triple(kernel_size)
         self.return_indices = return_indices
         self.register_buffer('_random_samples', _random_samples)
@@ -867,7 +879,7 @@ class _LPPoolNd(Module):
 
     def __init__(self, norm_type: float, kernel_size: _size_any_t, stride: Optional[_size_any_t] = None,
                  ceil_mode: bool = False) -> None:
-        super(_LPPoolNd, self).__init__()
+        super().__init__()
         self.norm_type = norm_type
         self.kernel_size = kernel_size
         self.stride = stride
@@ -980,7 +992,7 @@ class _AdaptiveMaxPoolNd(Module):
     return_indices: bool
 
     def __init__(self, output_size: _size_any_opt_t, return_indices: bool = False) -> None:
-        super(_AdaptiveMaxPoolNd, self).__init__()
+        super().__init__()
         self.output_size = output_size
         self.return_indices = return_indices
 
@@ -1110,7 +1122,7 @@ class _AdaptiveAvgPoolNd(Module):
     __constants__ = ['output_size']
 
     def __init__(self, output_size: _size_any_opt_t) -> None:
-        super(_AdaptiveAvgPoolNd, self).__init__()
+        super().__init__()
         self.output_size = output_size
 
     def extra_repr(self) -> str:
diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py
index 6d1e138d6895..bbd3ec1b20e6 100644
--- a/torch/nn/modules/rnn.py
+++ b/torch/nn/modules/rnn.py
@@ -49,7 +49,7 @@ def __init__(self, mode: str, input_size: int, hidden_size: int,
                  dropout: float = 0., bidirectional: bool = False, proj_size: int = 0,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(RNNBase, self).__init__()
+        super().__init__()
         self.mode = mode
         self.input_size = input_size
         self.hidden_size = hidden_size
@@ -143,7 +143,7 @@ def __setattr__(self, attr, value):
             # keep self._flat_weights up to date if you do self.weight = ...
             idx = self._flat_weights_names.index(attr)
             self._flat_weights[idx] = value
-        super(RNNBase, self).__setattr__(attr, value)
+        super().__setattr__(attr, value)
 
     def flatten_parameters(self) -> None:
         """Resets parameter data pointer so that they can use faster code paths.
@@ -173,7 +173,7 @@ def flatten_parameters(self) -> None:
         # a sufficient check, because overlapping parameter buffers that don't completely
         # alias would break the assumptions of the uniqueness check in
         # Module.named_parameters().
-        unique_data_ptrs = set(p.data_ptr() for p in self._flat_weights)
+        unique_data_ptrs = {p.data_ptr() for p in self._flat_weights}
         if len(unique_data_ptrs) != len(self._flat_weights):
             return
 
@@ -194,7 +194,7 @@ def flatten_parameters(self) -> None:
                         self.batch_first, bool(self.bidirectional))
 
     def _apply(self, fn):
-        ret = super(RNNBase, self)._apply(fn)
+        ret = super()._apply(fn)
 
         # Resets _flat_weights
         # Note: be v. careful before removing this, as 3rd party device types
@@ -284,7 +284,7 @@ def __getstate__(self):
         return state
 
     def __setstate__(self, d):
-        super(RNNBase, self).__setstate__(d)
+        super().__setstate__(d)
         if 'all_weights' in d:
             self._all_weights = d['all_weights']
         # In PyTorch 1.8 we added a proj_size member variable to LSTM.
@@ -329,7 +329,7 @@ def all_weights(self) -> List[List[Parameter]]:
         return [[getattr(self, weight) for weight in weights] for weights in self._all_weights]
 
     def _replicate_for_data_parallel(self):
-        replica = super(RNNBase, self)._replicate_for_data_parallel()
+        replica = super()._replicate_for_data_parallel()
         # Need to copy these caches, otherwise the replica will share the same
         # flat weights list.
         replica._flat_weights = replica._flat_weights[:]
@@ -450,7 +450,7 @@ def __init__(self, *args, **kwargs):
             mode = 'RNN_RELU'
         else:
             raise ValueError("Unknown nonlinearity '{}'".format(self.nonlinearity))
-        super(RNN, self).__init__(mode, *args, **kwargs)
+        super().__init__(mode, *args, **kwargs)
 
     @overload
     @torch._jit_internal._overload_method  # noqa: F811
@@ -708,7 +708,7 @@ class LSTM(RNNBase):
     """
 
     def __init__(self, *args, **kwargs):
-        super(LSTM, self).__init__('LSTM', *args, **kwargs)
+        super().__init__('LSTM', *args, **kwargs)
 
     def get_expected_cell_size(self, input: Tensor, batch_sizes: Optional[Tensor]) -> Tuple[int, int, int]:
         if batch_sizes is not None:
@@ -766,8 +766,7 @@ def forward(self, input, hx=None):  # noqa: F811
         batch_sizes = None
         if isinstance(orig_input, PackedSequence):
             input, batch_sizes, sorted_indices, unsorted_indices = input
-            max_batch_size = batch_sizes[0]
-            max_batch_size = int(max_batch_size)
+            max_batch_size = int(batch_sizes[0])
         else:
             batch_sizes = None
             assert (input.dim() in (2, 3)), f"LSTM: Expected input to be 2-D or 3-D but received {input.dim()}-D tensor"
@@ -940,7 +939,7 @@ class GRU(RNNBase):
     def __init__(self, *args, **kwargs):
         if 'proj_size' in kwargs:
             raise ValueError("proj_size argument is only supported for LSTM, not RNN or GRU")
-        super(GRU, self).__init__('GRU', *args, **kwargs)
+        super().__init__('GRU', *args, **kwargs)
 
     @overload  # type: ignore[override]
     @torch._jit_internal._overload_method  # noqa: F811
@@ -961,8 +960,7 @@ def forward(self, input, hx=None):  # noqa: F811
         # xxx: isinstance check needs to be in conditional for TorchScript to compile
         if isinstance(orig_input, PackedSequence):
             input, batch_sizes, sorted_indices, unsorted_indices = input
-            max_batch_size = batch_sizes[0]
-            max_batch_size = int(max_batch_size)
+            max_batch_size = int(batch_sizes[0])
         else:
             batch_sizes = None
             assert (input.dim() in (2, 3)), f"GRU: Expected input to be 2-D or 3-D but received {input.dim()}-D tensor"
@@ -1029,7 +1027,7 @@ class RNNCellBase(Module):
     def __init__(self, input_size: int, hidden_size: int, bias: bool, num_chunks: int,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(RNNCellBase, self).__init__()
+        super().__init__()
         self.input_size = input_size
         self.hidden_size = hidden_size
         self.bias = bias
@@ -1118,7 +1116,7 @@ class RNNCell(RNNCellBase):
     def __init__(self, input_size: int, hidden_size: int, bias: bool = True, nonlinearity: str = "tanh",
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(RNNCell, self).__init__(input_size, hidden_size, bias, num_chunks=1, **factory_kwargs)
+        super().__init__(input_size, hidden_size, bias, num_chunks=1, **factory_kwargs)
         self.nonlinearity = nonlinearity
 
     def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor:
@@ -1219,7 +1217,7 @@ class LSTMCell(RNNCellBase):
     def __init__(self, input_size: int, hidden_size: int, bias: bool = True,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(LSTMCell, self).__init__(input_size, hidden_size, bias, num_chunks=4, **factory_kwargs)
+        super().__init__(input_size, hidden_size, bias, num_chunks=4, **factory_kwargs)
 
     def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]:
         assert input.dim() in (1, 2), \
@@ -1310,7 +1308,7 @@ class GRUCell(RNNCellBase):
     def __init__(self, input_size: int, hidden_size: int, bias: bool = True,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(GRUCell, self).__init__(input_size, hidden_size, bias, num_chunks=3, **factory_kwargs)
+        super().__init__(input_size, hidden_size, bias, num_chunks=3, **factory_kwargs)
 
     def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor:
         assert input.dim() in (1, 2), \
diff --git a/torch/nn/modules/sparse.py b/torch/nn/modules/sparse.py
index 8fef8040a6c4..8f7378c4e95e 100644
--- a/torch/nn/modules/sparse.py
+++ b/torch/nn/modules/sparse.py
@@ -125,7 +125,7 @@ def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: Optiona
                  sparse: bool = False, _weight: Optional[Tensor] = None, _freeze: bool = False,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(Embedding, self).__init__()
+        super().__init__()
         self.num_embeddings = num_embeddings
         self.embedding_dim = embedding_dim
         if padding_idx is not None:
@@ -322,7 +322,7 @@ def __init__(self, num_embeddings: int, embedding_dim: int,
                  include_last_offset: bool = False, padding_idx: Optional[int] = None,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(EmbeddingBag, self).__init__()
+        super().__init__()
         self.num_embeddings = num_embeddings
         self.embedding_dim = embedding_dim
         self.max_norm = max_norm
@@ -403,7 +403,7 @@ def extra_repr(self) -> str:
         s += ', mode={mode}'
         if self.padding_idx is not None:
             s += ', padding_idx={padding_idx}'
-        return s.format(**self.__dict__)
+        return s.format(**{k: repr(v) for k, v in self.__dict__.items()})
 
     @classmethod
     def from_pretrained(cls, embeddings: Tensor, freeze: bool = True, max_norm: Optional[float] = None,
diff --git a/torch/nn/modules/transformer.py b/torch/nn/modules/transformer.py
index 9ce8580adf9a..560028ad53c7 100644
--- a/torch/nn/modules/transformer.py
+++ b/torch/nn/modules/transformer.py
@@ -55,7 +55,7 @@ def __init__(self, d_model: int = 512, nhead: int = 8, num_encoder_layers: int =
                  layer_norm_eps: float = 1e-5, batch_first: bool = False, norm_first: bool = False,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(Transformer, self).__init__()
+        super().__init__()
         torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
 
         if custom_encoder is not None:
@@ -94,9 +94,9 @@ def forward(self, src: Tensor, tgt: Tensor, src_mask: Optional[Tensor] = None, t
             src_mask: the additive mask for the src sequence (optional).
             tgt_mask: the additive mask for the tgt sequence (optional).
             memory_mask: the additive mask for the encoder output (optional).
-            src_key_padding_mask: the ByteTensor mask for src keys per batch (optional).
-            tgt_key_padding_mask: the ByteTensor mask for tgt keys per batch (optional).
-            memory_key_padding_mask: the ByteTensor mask for memory keys per batch (optional).
+            src_key_padding_mask: the Tensor mask for src keys per batch (optional).
+            tgt_key_padding_mask: the Tensor mask for tgt keys per batch (optional).
+            memory_key_padding_mask: the Tensor mask for memory keys per batch (optional).
 
         Shape:
             - src: :math:`(S, E)` for unbatched input, :math:`(S, N, E)` if `batch_first=False` or
@@ -111,13 +111,11 @@ def forward(self, src: Tensor, tgt: Tensor, src_mask: Optional[Tensor] = None, t
             - memory_key_padding_mask: :math:`(S)` for unbatched input otherwise :math:`(N, S)`.
 
             Note: [src/tgt/memory]_mask ensures that position i is allowed to attend the unmasked
-            positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
-            while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
+            positions. If a BoolTensor is provided, positions with ``True``
             are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
             is provided, it will be added to the attention weight.
             [src/tgt/memory]_key_padding_mask provides specified elements in the key to be ignored by
-            the attention. If a ByteTensor is provided, the non-zero positions will be ignored while the zero
-            positions will be unchanged. If a BoolTensor is provided, the positions with the
+            the attention. If a BoolTensor is provided, the positions with the
             value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
 
             - output: :math:`(T, E)` for unbatched input, :math:`(T, N, E)` if `batch_first=False` or
@@ -186,7 +184,7 @@ class TransformerEncoder(Module):
     __constants__ = ['norm']
 
     def __init__(self, encoder_layer, num_layers, norm=None, enable_nested_tensor=True, mask_check=True):
-        super(TransformerEncoder, self).__init__()
+        super().__init__()
         torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
         self.layers = _get_clones(encoder_layer, num_layers)
         self.num_layers = num_layers
@@ -213,11 +211,14 @@ def forward(
         Shape:
             see the docs in Transformer class.
         """
-        if src_key_padding_mask is not None:
-            _skpm_dtype = src_key_padding_mask.dtype
-            if _skpm_dtype != torch.bool and not torch.is_floating_point(src_key_padding_mask):
-                raise AssertionError(
-                    "only bool and floating types of key_padding_mask are supported")
+        src_key_padding_mask = F._canonical_mask(
+            mask=src_key_padding_mask,
+            mask_name="src_key_padding_mask",
+            other_type=F._none_or_dtype(mask),
+            other_name="mask",
+            target_type=src.dtype
+        )
+
         output = src
         convert_to_nested = False
         first_layer = self.layers[0]
@@ -331,7 +332,7 @@ class TransformerDecoder(Module):
     __constants__ = ['norm']
 
     def __init__(self, decoder_layer, num_layers, norm=None):
-        super(TransformerDecoder, self).__init__()
+        super().__init__()
         torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
         self.layers = _get_clones(decoder_layer, num_layers)
         self.num_layers = num_layers
@@ -398,7 +399,8 @@ class TransformerEncoderLayer(Module):
         >>> out = encoder_layer(src)
 
     Fast path:
-        forward() will use a special optimized implementation if all of the following
+        forward() will use a special optimized implementation described in
+        `FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`_ if all of the following
         conditions are met:
 
         - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor
@@ -418,6 +420,10 @@ class TransformerEncoderLayer(Module):
         mask. In this case, a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ will be
         returned, and an additional speedup proportional to the fraction of the input that
         is padding can be expected.
+
+        .. _`FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness`:
+         https://arxiv.org/abs/2205.14135
+
     """
     __constants__ = ['batch_first', 'norm_first']
 
@@ -426,7 +432,7 @@ def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropou
                  layer_norm_eps: float = 1e-5, batch_first: bool = False, norm_first: bool = False,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(TransformerEncoderLayer, self).__init__()
+        super().__init__()
         self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
                                             **factory_kwargs)
         # Implementation of Feedforward model
@@ -455,7 +461,7 @@ def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropou
         self.activation = activation
 
     def __setstate__(self, state):
-        super(TransformerEncoderLayer, self).__setstate__(state)
+        super().__setstate__(state)
         if not hasattr(self, 'activation'):
             self.activation = F.relu
 
@@ -471,19 +477,21 @@ def forward(
         Args:
             src: the sequence to the encoder layer (required).
             src_mask: the mask for the src sequence (optional).
-            is_causal: If specified, applies a causal mask as src_mask. Mutually exclusive with providing src_mask.
+            is_causal: If specified, applies a causal mask as src_mask.
               Default: ``False``.
             src_key_padding_mask: the mask for the src keys per batch (optional).
 
         Shape:
             see the docs in Transformer class.
         """
+        src_key_padding_mask = F._canonical_mask(
+            mask=src_key_padding_mask,
+            mask_name="src_key_padding_mask",
+            other_type=F._none_or_dtype(src_mask),
+            other_name="src_mask",
+            target_type=src.dtype
+        )
 
-        if src_key_padding_mask is not None:
-            _skpm_dtype = src_key_padding_mask.dtype
-            if _skpm_dtype != torch.bool and not torch.is_floating_point(src_key_padding_mask):
-                raise AssertionError(
-                    "only bool and floating types of key_padding_mask are supported")
         # see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf
         why_not_sparsity_fast_path = ''
         if not src.dim() == 3:
@@ -623,7 +631,7 @@ def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropou
                  layer_norm_eps: float = 1e-5, batch_first: bool = False, norm_first: bool = False,
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
-        super(TransformerDecoderLayer, self).__init__()
+        super().__init__()
         self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
                                             **factory_kwargs)
         self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first,
@@ -650,7 +658,7 @@ def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropou
     def __setstate__(self, state):
         if 'activation' not in state:
             state['activation'] = F.relu
-        super(TransformerDecoderLayer, self).__setstate__(state)
+        super().__setstate__(state)
 
     def forward(
         self,
@@ -720,6 +728,7 @@ def _ff_block(self, x: Tensor) -> Tensor:
 
 
 def _get_clones(module, N):
+    # FIXME: copy.deepcopy() is not defined on nn.module
     return ModuleList([copy.deepcopy(module) for i in range(N)])
 
 
diff --git a/torch/nn/modules/upsampling.py b/torch/nn/modules/upsampling.py
index 4f13c84c2e90..c0793936fae3 100644
--- a/torch/nn/modules/upsampling.py
+++ b/torch/nn/modules/upsampling.py
@@ -141,7 +141,7 @@ class Upsample(Module):
     def __init__(self, size: Optional[_size_any_t] = None, scale_factor: Optional[_ratio_any_t] = None,
                  mode: str = 'nearest', align_corners: Optional[bool] = None,
                  recompute_scale_factor: Optional[bool] = None) -> None:
-        super(Upsample, self).__init__()
+        super().__init__()
         self.name = type(self).__name__
         self.size = size
         if isinstance(scale_factor, tuple):
@@ -158,10 +158,10 @@ def forward(self, input: Tensor) -> Tensor:
 
     def extra_repr(self) -> str:
         if self.scale_factor is not None:
-            info = 'scale_factor=' + str(self.scale_factor)
+            info = 'scale_factor=' + repr(self.scale_factor)
         else:
-            info = 'size=' + str(self.size)
-        info += ', mode=' + self.mode
+            info = 'size=' + repr(self.size)
+        info += ', mode=' + repr(self.mode)
         return info
 
 
@@ -207,7 +207,7 @@ class UpsamplingNearest2d(Upsample):
                   [3., 3., 4., 4.]]]])
     """
     def __init__(self, size: Optional[_size_2_t] = None, scale_factor: Optional[_ratio_2_t] = None) -> None:
-        super(UpsamplingNearest2d, self).__init__(size, scale_factor, mode='nearest')
+        super().__init__(size, scale_factor, mode='nearest')
 
 
 class UpsamplingBilinear2d(Upsample):
@@ -254,4 +254,4 @@ class UpsamplingBilinear2d(Upsample):
                   [3.0000, 3.3333, 3.6667, 4.0000]]]])
     """
     def __init__(self, size: Optional[_size_2_t] = None, scale_factor: Optional[_ratio_2_t] = None) -> None:
-        super(UpsamplingBilinear2d, self).__init__(size, scale_factor, mode='bilinear', align_corners=True)
+        super().__init__(size, scale_factor, mode='bilinear', align_corners=True)
diff --git a/torch/nn/parallel/_replicated_tensor_ddp_interop.py b/torch/nn/parallel/_replicated_tensor_ddp_interop.py
deleted file mode 100644
index c66d1c4b46ab..000000000000
--- a/torch/nn/parallel/_replicated_tensor_ddp_interop.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import torch
-from torch.distributed._shard.replicated_tensor import ReplicatedTensor
-
-class ReplicatedTensorFunction(torch.autograd.Function):
-    """
-    Autograd function to ensure gradients are replicated between the
-    replicated tensor and the original one.
-    """
-    @staticmethod
-    def forward(ctx, inp, process_group=None):
-        # set_materialize_grads(False) will ensure that None gradients stay as
-        # None and are not filled with zeros.
-        ctx.set_materialize_grads(False)
-        return ReplicatedTensor(inp, process_group)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return grad_output, None
-
-def _make_replicated_tensor(tensor, process_group):
-    replicated_tensor = ReplicatedTensorFunction.apply(tensor, process_group)
-    replicated_tensor.grad = tensor.grad
-    return replicated_tensor
-
-def _replicate_module_recurse(module, process_group):
-    replica = module._replicate_for_data_parallel()
-    for param_name, param in module._parameters.items():
-        if param is not None:
-            setattr(replica, param_name, _make_replicated_tensor(param, process_group))
-        else:
-            setattr(replica, param_name, param)
-
-    for buffer_name, buffer in module._buffers.items():
-        setattr(replica, buffer_name, buffer)
-
-    for module_name, child in module._modules.items():
-        setattr(replica, module_name, _replicate_module_recurse(child, process_group))
-    return replica
-
-def _replicate_module(network, process_group):
-    from torch.nn.parallel.replicate import _replicatable_module  # type: ignore[attr-defined]
-    if not _replicatable_module(network):
-        raise RuntimeError("Cannot replicate network where python modules are "
-                           "childrens of ScriptModule")
-
-    return _replicate_module_recurse(network, process_group)
diff --git a/torch/nn/parallel/_replicated_tensor_ddp_utils.py b/torch/nn/parallel/_replicated_tensor_ddp_utils.py
deleted file mode 100644
index 9ef00af4a163..000000000000
--- a/torch/nn/parallel/_replicated_tensor_ddp_utils.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from contextlib import contextmanager
-
-_DDP_WITH_REPLICATED_TENSOR = False
-
-@contextmanager
-def _ddp_replicated_tensor(val):
-    """
-    A context manager to tag tensors in the forward pass of DDP to be
-    ``ReplicatedTensor``. This can be used by ReplicatedTensor inter-op
-    during the forward pass to perform appropriate optimizations.
-
-    This context manager needs to wrap DDP creation and modifying the underlying
-    module passed into DDP after leaving this context manager would cause
-    inconsitencies and the changes will not be picked up during the forward
-    pass.
-    """
-    global _DDP_WITH_REPLICATED_TENSOR
-    old_val = _DDP_WITH_REPLICATED_TENSOR
-    _DDP_WITH_REPLICATED_TENSOR = val
-    try:
-        yield
-    finally:
-        _DDP_WITH_REPLICATED_TENSOR = old_val
-
-def _ddp_with_replicated_tensor_enabled():
-    global _DDP_WITH_REPLICATED_TENSOR
-    return _DDP_WITH_REPLICATED_TENSOR
-
-def _set_ddp_with_replicated_tensor(value):
-    global _DDP_WITH_REPLICATED_TENSOR
-    _DDP_WITH_REPLICATED_TENSOR = value
diff --git a/torch/nn/parallel/data_parallel.py b/torch/nn/parallel/data_parallel.py
index 44e571e72892..6bdc3ef67e15 100644
--- a/torch/nn/parallel/data_parallel.py
+++ b/torch/nn/parallel/data_parallel.py
@@ -122,7 +122,7 @@ class DataParallel(Module):
     # TODO: update notes/cuda.rst when this class handles 8+ GPUs well
 
     def __init__(self, module, device_ids=None, output_device=None, dim=0):
-        super(DataParallel, self).__init__()
+        super().__init__()
         torch._C._log_api_usage_once("torch.nn.parallel.DataParallel")
         device_type = _get_available_device_type()
         if device_type is None:
diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index 3af49ae464e2..743edb11be51 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -37,7 +37,6 @@
 from torch._utils import _get_device_index
 
 from ..modules import Module
-from ._replicated_tensor_ddp_utils import _ddp_with_replicated_tensor_enabled
 from .scatter_gather import gather, scatter_kwargs  # noqa: F401
 
 __all__ = ["DistributedDataParallel"]
@@ -177,14 +176,13 @@ def forward(ctx, reducer, state_dict, *inputs):
 
     @staticmethod
     def backward(ctx, *grad_outputs):
-        state_dict = ctx.state_dict
         # Enqueue delay allreduce for static graph training on the first
         # iteration.
         if (
             ctx.state_dict["static_graph"]
             and ctx.state_dict["num_iterations"] == 1
         ):
-            Variable._execution_engine.queue_callback(
+            Variable._execution_engine.queue_callback(  # type: ignore[call-arg,misc]
                 ctx.reducer._delay_all_reduce
             )
 
@@ -559,7 +557,7 @@ def __init__(
         gradient_as_bucket_view=False,
         static_graph=False,
     ):
-        super(DistributedDataParallel, self).__init__()
+        super().__init__()
         Joinable.__init__(self)
         self.logger = None
         if hasattr(module, "_ddp_params_and_buffers_to_ignore"):
@@ -636,11 +634,6 @@ def __init__(
         self.require_forward_param_sync = True
         self.gradient_as_bucket_view = gradient_as_bucket_view
 
-        self._use_replicated_tensor_module = (
-            _ddp_with_replicated_tensor_enabled()
-        )
-        self._build_replicated_tensor_module()
-
         if check_reduction:
             # This argument is no longer used since the reducer
             # will ensure reduction completes even if some parameters
@@ -696,6 +689,9 @@ def __init__(
         if static_graph:
             self._set_static_graph()
 
+        self._setup_in_backward_optimizers()
+
+    def _setup_in_backward_optimizers(self):
         # Check if user has used apply_optim_in_backward to overlap optimizer
         # step + DDP backward. Current constraints:
         # 1. Only allreduce is supported at the moment, no custom communication.
@@ -706,7 +702,6 @@ def __init__(
         # If your use case requires some DDP managed parameters to run with
         # an in-backward optimizer and some with a traditional optimizer, please
         # ping https://github.com/pytorch/pytorch/issues/90052.
-
         # NOTE: we use self._module_parameters instead of .parameters() since
         # the former excludes ignored (non-DDP managed) parameters.
         if any(
@@ -750,17 +745,6 @@ def __init__(
                 )
                 self.reducer._set_grads_to_none()  # type: ignore[attr-defined]
 
-    def _build_replicated_tensor_module(self):
-        if self._use_replicated_tensor_module:
-            # Create a module with ReplicatedTensor without copying tensors. Avoid
-            # registering '_replicated_tensor_module' as a submodule by directly
-            # adding to self.__dict__.
-            from ._replicated_tensor_ddp_interop import _replicate_module
-
-            self.__dict__["_replicated_tensor_module"] = _replicate_module(
-                self.module, self.process_group
-            )
-
     def _log_and_throw(self, err_type, err_msg):
         if self.logger is not None:
             self.logger.set_error_and_log(f"{str(err_type)}: {err_msg}")
@@ -870,15 +854,12 @@ def __getstate__(self):
         del attrs["process_group"]
         del attrs["reducer"]
         del attrs["logger"]
-        if self._use_replicated_tensor_module:
-            del attrs["_replicated_tensor_module"]
         return attrs
 
     def __setstate__(self, state):
         # If serializable, then the process group should be the default one
         self.process_group = _get_default_group()
-        super(DistributedDataParallel, self).__setstate__(state)
-        self._build_replicated_tensor_module()
+        super().__setstate__(state)
         self.__dict__.setdefault("require_forward_param_sync", True)
         self.__dict__.setdefault("require_backward_grad_sync", True)
         parameters, expect_sparse_gradient = self._build_params_for_reducer()
@@ -927,22 +908,20 @@ def _build_params_for_reducer(self):
         ]
 
         # Build list of parameters.
-        parameters = list(parameter for _, parameter in modules_and_parameters)
+        parameters = [parameter for _, parameter in modules_and_parameters]
 
         # Checks if a module will produce a sparse gradient.
         def produces_sparse_gradient(module):
-            if isinstance(module, torch.nn.Embedding) or isinstance(
-                module, torch.nn.EmbeddingBag
-            ):
+            if isinstance(module, (torch.nn.Embedding, torch.nn.EmbeddingBag)):
                 return module.sparse
             return False
 
         # Build list of booleans indicating whether or not to expect sparse
         # gradients for the corresponding parameters.
-        expect_sparse_gradient = list(
+        expect_sparse_gradient = [
             produces_sparse_gradient(module)
             for module, _ in modules_and_parameters
-        )
+        ]
 
         self._assign_modules_buffers()
 
@@ -1020,8 +999,7 @@ def model_parameters(m):
                 if hasattr(m, "_former_parameters")
                 else m.parameters(recurse=False)
             )
-            for p in ps:
-                yield p
+            yield from ps
 
         for m in m.modules() if recurse else [m]:
             for p in model_parameters(m):
@@ -1094,12 +1072,6 @@ def _inside_ddp_forward(self):
             DistributedDataParallel._active_ddp_module = None
 
     def _run_ddp_forward(self, *inputs, **kwargs):
-        module_to_run = (
-            self._replicated_tensor_module
-            if self._use_replicated_tensor_module
-            else self.module
-        )
-
         if self.device_ids:
             inputs, kwargs = _to_kwargs(
                 inputs,
@@ -1108,10 +1080,10 @@ def _run_ddp_forward(self, *inputs, **kwargs):
                 self.use_side_stream_for_tensor_copies,
             )
             with self._inside_ddp_forward():
-                return module_to_run(*inputs[0], **kwargs[0])  # type: ignore[index]
+                return self.module(*inputs[0], **kwargs[0])  # type: ignore[index]
         else:
             with self._inside_ddp_forward():
-                return module_to_run(*inputs, **kwargs)
+                return self.module(*inputs, **kwargs)
 
     def forward(self, *inputs, **kwargs):
         with torch.autograd.profiler.record_function(
@@ -1145,7 +1117,6 @@ def forward(self, *inputs, **kwargs):
 
             # sync params according to location (before/after forward) user
             # specified as part of hook, if hook was specified.
-            buffer_hook_registered = hasattr(self, "buffer_hook")
             if self._check_sync_bufs_pre_fwd():
                 self._sync_buffers()
 
@@ -1234,9 +1205,7 @@ def gather(self, outputs, output_device):
         return gather(outputs, output_device, dim=self.dim)
 
     def train(self, mode=True):
-        super(DistributedDataParallel, self).train(mode)
-        if self._use_replicated_tensor_module:
-            self._replicated_tensor_module.train(mode)  # type: ignore[union-attr]
+        super().train(mode)
         return self
 
     # When running in join mode, schedules an allreduce to notify joined ranks
diff --git a/torch/nn/parallel/parallel_apply.py b/torch/nn/parallel/parallel_apply.py
index 80553fee046a..a114dfd8dc10 100644
--- a/torch/nn/parallel/parallel_apply.py
+++ b/torch/nn/parallel/parallel_apply.py
@@ -9,7 +9,7 @@ def get_a_var(obj):
     if isinstance(obj, torch.Tensor):
         return obj
 
-    if isinstance(obj, list) or isinstance(obj, tuple):
+    if isinstance(obj, (list, tuple)):
         for result in map(get_a_var, obj):
             if isinstance(result, torch.Tensor):
                 return result
diff --git a/torch/nn/parameter.py b/torch/nn/parameter.py
index e2100d782c6a..c15ad0c863c9 100644
--- a/torch/nn/parameter.py
+++ b/torch/nn/parameter.py
@@ -24,8 +24,11 @@ class Parameter(torch.Tensor, metaclass=_ParameterMeta):
 
     Args:
         data (Tensor): parameter tensor.
-        requires_grad (bool, optional): if the parameter requires gradient. See
-            :ref:`locally-disable-grad-doc` for more details. Default: `True`
+        requires_grad (bool, optional): if the parameter requires gradient. Note that
+            the torch.no_grad() context does NOT affect the default behavior of
+            Parameter creation--the Parameter will still have `requires_grad=True` in
+            :class:`~no_grad` mode. See :ref:`locally-disable-grad-doc` for more
+            details. Default: `True`
     """
     def __new__(cls, data=None, requires_grad=True):
         if data is None:
@@ -57,7 +60,7 @@ def __deepcopy__(self, memo):
             return result
 
     def __repr__(self):
-        return 'Parameter containing:\n' + super(Parameter, self).__repr__()
+        return 'Parameter containing:\n' + super().__repr__()
 
     def __reduce_ex__(self, proto):
         state = torch._utils._get_obj_state(self)
diff --git a/torch/nn/utils/_expanded_weights/expanded_weights_impl.py b/torch/nn/utils/_expanded_weights/expanded_weights_impl.py
index 1bfb91c3360c..a39c2bda09e3 100644
--- a/torch/nn/utils/_expanded_weights/expanded_weights_impl.py
+++ b/torch/nn/utils/_expanded_weights/expanded_weights_impl.py
@@ -1,11 +1,67 @@
+from contextlib import contextmanager
+
 from torch._C import _TensorBase
 import torch
 import functools
+from torch._decomp import decomposition_table
 
 from typing import Callable, Dict, cast
 
+from torch.utils._pytree import tree_map_only
+
 HANDLED_FUNCTIONS: Dict[Callable, torch.autograd.Function] = {}
 
+aten = torch._ops.ops.aten
+# __torch_function__ runs before the pydispatcher so we need to manually use the same
+# decompositions indexed by their torch equivalent
+expanded_weights_rnn_decomps = {
+    # func: (input_decomp, data_decomp)
+    torch.rnn_relu: (decomposition_table[aten.rnn_relu.input], decomposition_table[aten.rnn_relu.data]),
+    torch.rnn_tanh: (decomposition_table[aten.rnn_tanh.input], decomposition_table[aten.rnn_tanh.data]),
+    torch.lstm: (decomposition_table[aten.lstm.input], decomposition_table[aten.lstm.data]),
+    torch.gru: (decomposition_table[aten.gru.input], decomposition_table[aten.gru.data]),
+}
+
+# all of the RNN decomps run linear with the batch dimension second, even if batch_first was set
+@contextmanager
+def batch_second(args, kwargs):
+    def set_batch_second(ew):
+        ew.set_batch_first(False)
+
+    def reset_batch_first(ew):
+        ew.set_batch_first(True)
+
+    tree_map_only(ExpandedWeight, set_batch_second, args)
+    tree_map_only(ExpandedWeight, set_batch_second, kwargs)
+    try:
+        yield
+    finally:
+        tree_map_only(ExpandedWeight, reset_batch_first, args)
+        tree_map_only(ExpandedWeight, reset_batch_first, kwargs)
+
+# to support packed sequences, we need to allow for smaller batches. Expanded weights represents the largest batch
+@contextmanager
+def allow_smaller_batches(args, kwargs):
+    def allow(ew):
+        ew.set_allow_smaller_batches(True)
+
+    def reset(ew):
+        ew.set_allow_smaller_batches(False)
+
+    tree_map_only(ExpandedWeight, allow, args)
+    tree_map_only(ExpandedWeight, allow, kwargs)
+    try:
+        yield
+    finally:
+        tree_map_only(ExpandedWeight, reset, args)
+        tree_map_only(ExpandedWeight, reset, kwargs)
+
+@contextmanager
+def setup_rnn(use_input_variant, args, kwargs):
+    with batch_second(args, kwargs) if use_input_variant else allow_smaller_batches(args, kwargs):
+        yield
+
+
 def implements_per_sample_grads(torch_function):
     @functools.wraps(torch_function)
     def decorator(autograd_func):
@@ -28,6 +84,8 @@ def decorator(autograd_func):
 class ExpandedWeight(torch.Tensor):
     def __init__(self, orig_weight, batch_size, loss_reduction):
         self.batch_size = batch_size
+        self.batch_first = True
+        self.allow_smaller_batches = False
         self.orig_weight = orig_weight
         self.loss_reduction = loss_reduction
 
@@ -45,6 +103,18 @@ def __new__(cls, orig_weight, batch_size, loss_reduction):
     def __torch_function__(cls, func, _, args=(), kwargs=None):
         if kwargs is None:
             kwargs = {}
+        if func in expanded_weights_rnn_decomps:
+            # in aten, choosing the input or data variants is done by parsing logic. This mimics some of that
+            decomp_opts = expanded_weights_rnn_decomps[func]
+            use_input_variant = isinstance(args[2], list)  # data variant uses a list here
+            decomp = decomp_opts[0] if use_input_variant else decomp_opts[1]
+
+            if decomp is not None:
+                with setup_rnn(use_input_variant, args, kwargs):
+                    return decomp(*args, **kwargs)
+        if func == torch._cudnn_rnn_flatten_weight:
+            # since we aren't using the fused cuda kernels for RNNs, don't do this
+            return
         if func in cls.handled_functions:
             return cls.handled_functions[func].apply(tuple(kwargs.keys()), func, *(args + tuple(kwargs.values())))
         # We cannot use a fallback here because we do not know the batch dimension for any regular tensor inputs,
@@ -55,6 +125,30 @@ def __torch_function__(cls, func, _, args=(), kwargs=None):
     def dtype(self):
         return self.orig_weight.dtype
 
+    @property
+    def data(self):
+        return self.orig_weight.data
+
     @property
     def shape(self):
         return self.orig_weight.shape
+
+    @property
+    def device(self):
+        return self.orig_weight.device
+
+    @property
+    def is_cuda(self):
+        return self.orig_weight.is_cuda
+
+    def data_ptr(self):
+        return self.orig_weight.data_ptr()
+
+    def get_device(self):
+        return self.orig_weight.get_device()
+
+    def set_allow_smaller_batches(self, is_allow_smaller_batches):
+        self.allow_smaller_batches = is_allow_smaller_batches
+
+    def set_batch_first(self, is_batch_first=True):
+        self.batch_first = is_batch_first
diff --git a/torch/nn/utils/_expanded_weights/expanded_weights_utils.py b/torch/nn/utils/_expanded_weights/expanded_weights_utils.py
index 9b2fe0dbfaa7..b3c91481c18c 100644
--- a/torch/nn/utils/_expanded_weights/expanded_weights_utils.py
+++ b/torch/nn/utils/_expanded_weights/expanded_weights_utils.py
@@ -3,6 +3,18 @@
 import torch
 from .expanded_weights_impl import ExpandedWeight
 
+def is_batch_first(expanded_args_and_kwargs):
+    batch_first = None
+    for arg in expanded_args_and_kwargs:
+        if not isinstance(arg, ExpandedWeight):
+            continue
+
+        if not batch_first:
+            batch_first = arg.batch_first
+        elif arg.batch_first != batch_first:
+            raise RuntimeError("Got conflicting batch_first arguments in the same layer")
+    return batch_first
+
 def standard_kwargs(kwarg_names, expanded_args):
     r'''Most `__torch_function__`s standardize the kwargs that they give, so this will separate
     the args and kwargs they pass. Functions that don't are linear and convND
@@ -46,9 +58,12 @@ def _check_and_unexpand_args(func, expanded_args, expanded_kwargs):
     if input.shape[0] == 0:
         raise RuntimeError("0 is not a valid batch size for Expanded Weights but got input tensor of "
                            f"{input} in function {func.__name__}")
-    batch_size = input.shape[0]
     for arg in expanded_args + tuple(expanded_kwargs.values()):
-        if isinstance(arg, ExpandedWeight) and arg.batch_size != batch_size:
+        if not isinstance(arg, ExpandedWeight):
+            continue
+        batch_size = input.shape[0] if arg.batch_first else input.shape[1]
+        if (arg.allow_smaller_batches and batch_size > arg.batch_size) or \
+                (not arg.allow_smaller_batches and arg.batch_size != batch_size):
             raise RuntimeError("Expected ExpandedWeights to have batch size matching input but got "
                                f"input batch size of {batch_size} with ExpandedWeight of batch size {arg.batch_size}")
 
@@ -76,6 +91,15 @@ def set_grad_sample_if_exists(maybe_expanded_weight, per_sample_grad_fn):
     unpacked = unpack_expanded_weight_or_tensor(maybe_expanded_weight)
     if isinstance(maybe_expanded_weight, ExpandedWeight):
         grad_sample_contribution = maybe_scale_by_batch_size(per_sample_grad_fn(unpacked), maybe_expanded_weight)
+
+        if maybe_expanded_weight.batch_size > grad_sample_contribution.shape[0]:
+            # this only passes the other checks if the arg allows smaller batch sizes
+            intermediate = torch.zeros(maybe_expanded_weight.batch_size, *grad_sample_contribution.shape[1:],
+                                       dtype=grad_sample_contribution.dtype,
+                                       device=grad_sample_contribution.device)
+            intermediate[:grad_sample_contribution.shape[0]] = grad_sample_contribution
+            grad_sample_contribution = intermediate
+
         if hasattr(unpacked, "grad_sample") and unpacked.grad_sample is not None:
             unpacked.grad_sample = unpacked.grad_sample + grad_sample_contribution
         else:
diff --git a/torch/nn/utils/_expanded_weights/linear_expanded_weights.py b/torch/nn/utils/_expanded_weights/linear_expanded_weights.py
index 70db268b8fe7..c2cbae63f336 100644
--- a/torch/nn/utils/_expanded_weights/linear_expanded_weights.py
+++ b/torch/nn/utils/_expanded_weights/linear_expanded_weights.py
@@ -2,7 +2,7 @@
 import torch.nn.functional as F
 from .expanded_weights_impl import implements_per_sample_grads
 from .expanded_weights_utils import \
-    forward_helper, set_grad_sample_if_exists, unpack_expanded_weight_or_tensor
+    forward_helper, set_grad_sample_if_exists, unpack_expanded_weight_or_tensor, is_batch_first
 from typing import List, Optional
 
 @implements_per_sample_grads(F.linear)
@@ -14,6 +14,7 @@ def forward(ctx, _, __, *expanded_args_and_kwargs):
                                f"of at least rank 2, got of rank {len(expanded_args_and_kwargs[0].shape)}")
         expanded_kwargs = {'bias': expanded_args_and_kwargs[2] if len(expanded_args_and_kwargs) == 3 else None}
         expanded_args = expanded_args_and_kwargs[:2]
+        ctx.batch_first = is_batch_first(expanded_args_and_kwargs)
         output = forward_helper(F.linear, expanded_args, expanded_kwargs)
         ctx.args = expanded_args
         ctx.kwargs = expanded_kwargs
@@ -33,6 +34,10 @@ def backward(ctx, grad_output):
             results.append(None)
         results.extend([None] * 2)  # weight and bias don't compute batched gradients
 
+        if not ctx.batch_first:
+            grad_output = grad_output.transpose(0, 1)
+            input = input.transpose(0, 1)
+
         # weight and bias get their grad_sample fields set directly if they exist
         set_grad_sample_if_exists(weight, lambda _: torch.einsum("n...i,n...j->nij", grad_output, input))
         set_grad_sample_if_exists(bias, lambda _: torch.einsum("n...k->nk", grad_output))
diff --git a/torch/nn/utils/_named_member_accessor.py b/torch/nn/utils/_named_member_accessor.py
new file mode 100644
index 000000000000..1c65dbaf9b52
--- /dev/null
+++ b/torch/nn/utils/_named_member_accessor.py
@@ -0,0 +1,341 @@
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, Iterable, List, Tuple
+
+import torch
+
+
+_MISSING: torch.Tensor = object()  # type: ignore[assignment]
+
+
+def set_tensor(module: "torch.nn.Module", name: str, tensor: torch.Tensor) -> None:
+    if not isinstance(module, torch.nn.Module):
+        raise TypeError(f"{module} is not an instance of torch.nn.Module")
+    if not isinstance(tensor, torch.Tensor) and tensor is not None:
+        raise TypeError(f"{tensor} is not an instance of torch.Tensor")
+    if "." in name:
+        raise KeyError('tensor name can\'t contain "."')
+    if name == "":
+        raise KeyError('tensor name can\'t be empty string ""')
+    if name in module._parameters:
+        module._parameters[name] = tensor  # type: ignore[assignment]
+    elif name in module._buffers:
+        module._buffers[name] = tensor
+    else:
+        setattr(module, name, tensor)
+
+
+def swap_tensor(
+    module: "torch.nn.Module",
+    name: str,
+    tensor: torch.Tensor,
+    allow_missing: bool = False,
+) -> torch.Tensor:
+    if not isinstance(module, torch.nn.Module):
+        raise TypeError(f"{module} is not an instance of torch.nn.Module")
+    if (
+        tensor is not _MISSING
+        and not isinstance(tensor, torch.Tensor)
+        and tensor is not None
+    ):
+        raise TypeError(f"{tensor} is not an instance of torch.Tensor")
+    if "." in name:
+        raise KeyError('tensor name can\'t contain "."')
+    if name == "":
+        raise KeyError('tensor name can\'t be empty string ""')
+
+    orig_tensor: torch.Tensor
+    if name in module._parameters:
+        orig_tensor = module._parameters[name]  # type: ignore[assignment]
+        if tensor is not _MISSING:
+            module._parameters[name] = tensor  # type: ignore[assignment]
+        else:
+            del module._parameters[name]
+    elif name in module._buffers:
+        orig_tensor = module._buffers[name]  # type: ignore[assignment]
+        if tensor is not _MISSING:
+            module._buffers[name] = tensor
+        else:
+            del module._buffers[name]
+    else:
+        try:
+            orig_tensor = getattr(module, name)
+        except AttributeError as ex:
+            if not allow_missing:
+                raise AttributeError(
+                    f"{module._get_name()} has no attribute `{name}`"
+                ) from ex
+            orig_tensor = _MISSING
+        if (
+            orig_tensor is not _MISSING
+            and not isinstance(orig_tensor, torch.Tensor)
+            and orig_tensor is not None
+        ):
+            raise TypeError(
+                f"attribute `{name}`: {orig_tensor} is not an instance of torch.Tensor"
+            )
+        if tensor is not _MISSING:
+            setattr(module, name, tensor)
+        elif hasattr(module, name):
+            delattr(module, name)
+    return orig_tensor
+
+
+class NamedMemberAccessor:
+    """
+    A class that provides a way to access the submodules and parameters/buffers
+    of a module. It provides caching mechanism to speed up submodule lookups.
+    This is useful for functional programming to manipulate the module state.
+    """
+
+    def __init__(self, module: "torch.nn.Module") -> None:
+        self.module = module
+        self.memo: Dict[str, torch.nn.Module] = {}
+
+    # Nested attribute access
+
+    def get_submodule(self, name: str) -> "torch.nn.Module":
+        """
+        Return the submodule specified by the given path.
+        For example, to get the submodule mod.layer1.conv1,
+        use accessor.get_submodule("layer1.conv1")
+
+        Compare to mod.get_submodule("layer1.conv1"), this method will cache the
+        intermediate submodule access to speed up future lookups.
+        """
+        if not name:
+            return self.module
+
+        try:
+            return self.memo[name]
+        except KeyError:
+            prefix, dot, attr = name.rpartition(".")
+            if dot:
+                module = self.get_submodule(prefix)
+            else:
+                module = self.module
+            try:
+                submodule = getattr(module, attr)
+            except AttributeError as ex:
+                raise AttributeError(
+                    f"{module._get_name()} has no attribute `{attr}`"
+                ) from ex
+            if not isinstance(submodule, torch.nn.Module):
+                raise TypeError(
+                    f"submodule `{name}`: {submodule} is not an instance of torch.nn.Module"
+                )
+            self.memo[name] = submodule
+            return submodule
+
+    def get_tensor(self, name: str) -> torch.Tensor:
+        """
+        Get the tensor specified by the given path to value.
+        For example, to get the attribute mod.layer1.conv1.weight,
+        use accessor.get_tensor('layer1.conv1.weight')
+
+        Compare to mod.get_parameter("layer1.conv1.weight"), this method will
+        cache the intermediate submodule access to speed up future lookups.
+        """
+        prefix, _, attr = name.rpartition(".")
+        submodule = self.get_submodule(prefix)
+        try:
+            tensor = getattr(submodule, attr)
+        except AttributeError as ex:
+            raise AttributeError(
+                f"{submodule._get_name()} has no attribute `{name}`"
+            ) from ex
+        if not isinstance(tensor, torch.Tensor) and tensor is not None:
+            raise TypeError(f"{tensor} is not an instance of torch.Tensor")
+        return tensor  # type: ignore[return-value]
+
+    def set_tensor(self, name: str, value: torch.Tensor) -> None:
+        """
+        Set the attribute specified by the given path to value.
+        For example, to set the attribute mod.layer1.conv1.weight,
+        use accessor.set_tensor("layer1.conv1.weight", value)
+        """
+        prefix, _, attr = name.rpartition(".")
+        set_tensor(self.get_submodule(prefix), attr, value)
+
+    def del_tensor(self, name: str) -> None:
+        """
+        Delete the attribute specified by the given path.
+        For example, to delete the attribute mod.layer1.conv1.weight,
+        use accessor.del_tensor("layer1.conv1.weight")
+        """
+        prefix, _, attr = name.rpartition(".")
+        submodule = self.get_submodule(prefix)
+        try:
+            delattr(submodule, attr)
+        except AttributeError as ex:
+            raise AttributeError(
+                f"{submodule._get_name()} has no attribute `{name}`"
+            ) from ex
+
+    def swap_tensor(
+        self, name: str, value: torch.Tensor, allow_missing: bool = False
+    ) -> torch.Tensor:
+        """
+        Swap the attribute specified by the given path to value.
+        For example, to swap the attribute mod.layer1.conv1.weight,
+        use accessor.swap_tensor("layer1.conv1.weight", value)
+        """
+        prefix, _, attr = name.rpartition(".")
+        return swap_tensor(
+            self.get_submodule(prefix), attr, value, allow_missing=allow_missing
+        )
+
+    # Batched operations
+
+    def get_tensors(self, names: Iterable[str]) -> List[torch.Tensor]:
+        """
+        Get the tensors specified by the given paths.
+        For example, to get the attributes mod.layer1.conv1.weight and
+        mod.layer1.conv1.bias, use accessor.get_tensors(["layer1.conv1.weight",
+        "layer1.conv1.bias"])
+        """
+        return [self.get_tensor(name) for name in names]
+
+    def set_tensors(self, names: Iterable[str], values: Iterable[torch.Tensor]) -> None:
+        """
+        Set the attributes specified by the given paths to values.
+        For example, to set the attributes mod.layer1.conv1.weight and
+        mod.layer1.conv1.bias, use accessor.set_tensors(["layer1.conv1.weight",
+        "layer1.conv1.bias"], [weight, bias])
+        """
+        if not isinstance(names, (list, tuple)):
+            names = list(names)
+        if not isinstance(values, (list, tuple)):
+            values = list(values)
+        assert len(names) == len(values), "names and values must have the same length"
+
+        for name, value in zip(names, values):
+            self.set_tensor(name, value)
+
+    def set_tensors_dict(self, named_tensors: Dict[str, torch.Tensor]) -> None:
+        """
+        Set the attributes specified by the given paths to values.
+        For example, to set the attributes mod.layer1.conv1.weight and
+        mod.layer1.conv1.bias, use accessor.set_tensors_dict({
+            "layer1.conv1.weight": weight,
+            "layer1.conv1.bias": bias,
+        })
+        """
+        for name, value in named_tensors.items():
+            self.set_tensor(name, value)
+
+    def del_tensors(self, names: Iterable[str]) -> None:
+        """
+        Delete the attributes specified by the given paths.
+        For example, to delete the attributes mod.layer1.conv1.weight and
+        mod.layer1.conv1.bias, use accessor.del_tensors(["layer1.conv1.weight",
+        "layer1.conv1.bias"])
+        """
+        for name in names:
+            self.del_tensor(name)
+
+    def swap_tensors(
+        self,
+        names: Iterable[str],
+        values: Iterable[torch.Tensor],
+        allow_missing: bool = False,
+    ) -> List[torch.Tensor]:
+        """
+        Swap the attributes specified by the given paths to values.
+        For example, to swap the attributes mod.layer1.conv1.weight and
+        mod.layer1.conv1.bias, use accessor.swap_tensors(["layer1.conv1.weight",
+        "layer1.conv1.bias"], [weight, bias])
+        """
+        if not isinstance(names, (list, tuple)):
+            names = list(names)
+        if not isinstance(values, (list, tuple)):
+            values = list(values)
+        assert len(names) == len(values), "names and values must have the same length"
+
+        return [
+            self.swap_tensor(name, value, allow_missing=allow_missing)
+            for name, value in zip(names, values)
+        ]
+
+    def swap_tensors_dict(
+        self, named_tensors: Dict[str, torch.Tensor], allow_missing: bool = False
+    ) -> Tuple[Dict[str, torch.Tensor], List[str]]:
+        """
+        Swap the attributes specified by the given paths to values.
+        For example, to swap the attributes mod.layer1.conv1.weight and
+        mod.layer1.conv1.bias, use accessor.swap_tensors_dict({
+            "layer1.conv1.weight": weight,
+            "layer1.conv1.bias": bias,
+        })
+        """
+        orig_named_tensors = {}
+        missing_keys = []
+        try:
+            for name, tensor in named_tensors.items():
+                orig_tensor = self.swap_tensor(name, tensor, allow_missing=True)
+                if orig_tensor is _MISSING:
+                    missing_keys.append(name)
+                orig_named_tensors[name] = orig_tensor
+        except Exception:
+            # Swap back if any exception occurs
+            for name, orig_tensor in orig_named_tensors.items():
+                self.swap_tensor(name, orig_tensor, allow_missing=True)
+            raise
+        if missing_keys and not allow_missing:
+            # Swap back if any key is missing when allow_missing is False
+            for name, orig_tensor in orig_named_tensors.items():
+                self.swap_tensor(name, orig_tensor, allow_missing=True)
+            raise RuntimeError(
+                "Missing key(s): {}.".format(", ".join(map(repr, missing_keys)))
+            )
+        return orig_named_tensors, missing_keys
+
+    def check_keys(self, keys: Iterable[str]) -> Tuple[List[str], List[str]]:
+        """
+        Check that the given keys are valid.
+        """
+        keys = set(keys)
+        valid_keys = {name for name, _ in self.named_tensors(remove_duplicate=False)}
+        missing_keys = valid_keys - keys
+        unexpected_keys = keys - valid_keys
+        return sorted(missing_keys), sorted(unexpected_keys)
+
+    # Shortcut methods
+
+    def named_parameters(
+        self,
+        remove_duplicate: bool = True,
+    ) -> Iterable[Tuple[str, torch.Tensor]]:
+        """
+        Iterate over all the parameters in the module.
+        """
+        yield from self.module.named_parameters(remove_duplicate=remove_duplicate)
+
+    def named_buffers(
+        self,
+        remove_duplicate: bool = True,
+    ) -> Iterable[Tuple[str, torch.Tensor]]:
+        """
+        Iterate over all the buffers in the module.
+        """
+        yield from self.module.named_buffers(remove_duplicate=remove_duplicate)
+
+    def named_tensors(
+        self,
+        remove_duplicate: bool = True,
+    ) -> Iterable[Tuple[str, torch.Tensor]]:
+        """
+        Iterate over all the tensors in the module.
+        """
+        yield from self.module.named_parameters(remove_duplicate=remove_duplicate)
+        yield from self.module.named_buffers(remove_duplicate=remove_duplicate)
+
+    def named_modules(
+        self,
+        remove_duplicate: bool = True,
+    ) -> Iterable[Tuple[str, "torch.nn.Module"]]:
+        """
+        Iterate over all the modules in the module.
+        """
+        yield from self.module.named_modules(remove_duplicate=remove_duplicate)
diff --git a/torch/nn/utils/_per_sample_grad.py b/torch/nn/utils/_per_sample_grad.py
index cd4b043f9e43..566b1684ebd4 100644
--- a/torch/nn/utils/_per_sample_grad.py
+++ b/torch/nn/utils/_per_sample_grad.py
@@ -6,9 +6,11 @@
 from torch.utils._pytree import tree_flatten
 
 
-def call_for_per_sample_grads(module, *, batch_size=None, loss_reduction="sum"):
+# dependency on `functional_call` means that this can't be exposed in utils
+# without creating circular dependency
+def call_for_per_sample_grads(module, *, batch_size=None, loss_reduction="sum", batch_first=True):
     r"""
-    call_for_per_sample_grads(module, batch_size=None, loss_reduction="sum")
+    call_for_per_sample_grads(module, batch_size=None, loss_reduction="sum", batch_first=True)
     ``call_for_per_sample_grads`` returns a function that is invoked like the forward
     function of ``module`` and will produce the same result. Then, when backward is invoked,
     the parameters of ``module`` will have a ``grad_sample`` field populated with the per sample
@@ -24,6 +26,8 @@ def call_for_per_sample_grads(module, *, batch_size=None, loss_reduction="sum"):
         loss_reduction: Indicates if the loss reduction (for aggregating the gradients) is a sum or a mean operation. If
           "mean", per sample gradients will be scaled by the batch size to offset the crossbatch interaction from
           running mean across a batch. Must be "mean" or "sum". Default: "sum"
+        batch_first: Indicates if the batch dimension is the first dimension. If True, the batch dimension is the first
+          dimension. If False, it's the second dimension. Default: True.
 
     Examples::
         >>> # xdoctest: +SKIP
@@ -64,7 +68,7 @@ def compute_batch_size(*args, **kwargs):
             if not isinstance(arg, torch.Tensor):
                 continue
 
-            arg_batch_size = arg.shape[0]  # we assume batch size is the first dim
+            arg_batch_size = arg.shape[0] if batch_first else arg.shape[1]
             if batch_size is not None and batch_size != arg_batch_size:
                 raise RuntimeError("When computing batch size, found at least one input with batch size "
                                    f"{batch_size} and one with batch size {arg_batch_size}. Please specify it "
diff --git a/torch/nn/utils/_stateless.py b/torch/nn/utils/_stateless.py
deleted file mode 100644
index 48b4556f5634..000000000000
--- a/torch/nn/utils/_stateless.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# This file is never automatically imported within PyTorch so it is ok to
-# always warn here
-import warnings
-
-warnings.warn("The `torch.nn.utils._stateless` code is deprecated now that "
-              "it is publicly available. Please use `torch.nn.utils.stateless "
-              "instead.", DeprecationWarning)
-
-# Import * wouldn't work as most things are private and thus wouldn't be imported
-# here.
-from torch.nn.utils.stateless import functional_call  # noqa: F401
-from torch.nn.utils.stateless import _apply_func_submodules, _change_class  # noqa: F401
-# This one used to look public but should actually be private. This was fixed when making the module
-# public and is kept here for BC
-from torch.nn.utils.stateless import _reparametrize_module as reparametrize_module  # noqa: F401
diff --git a/torch/nn/utils/clip_grad.py b/torch/nn/utils/clip_grad.py
index 0e49bc28c8ab..900d042abefd 100644
--- a/torch/nn/utils/clip_grad.py
+++ b/torch/nn/utils/clip_grad.py
@@ -2,9 +2,8 @@
 from typing import Union, Iterable, List, Dict, Tuple, Optional
 
 import torch
-from torch import Tensor
-from torch._six import inf
-from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
+from torch import Tensor, inf
+from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype, _has_foreach_support
 
 _tensor_or_tensors = Union[torch.Tensor, Iterable[torch.Tensor]]
 
@@ -28,8 +27,8 @@ def clip_grad_norm_(
             norm of the gradients from :attr:`parameters` is ``nan``,
             ``inf``, or ``-inf``. Default: False (will switch to True in the future)
         foreach (bool): use the faster foreach-based implementation.
-            If ``None``, use the foreach implementation for CUDA and CPU tensors and silently fall back to the slow
-            implementation for other device types.
+            If ``None``, use the foreach implementation for CUDA and CPU native tensors and silently
+            fall back to the slow implementation for other device types.
             Default: ``None``
 
     Returns:
@@ -52,7 +51,7 @@ def clip_grad_norm_(
     else:
         norms = []
         for ((device, _), [grads]) in grouped_grads.items():
-            if (foreach is None or foreach) and device.type in {'cpu', 'cuda'}:
+            if (foreach is None or foreach) and _has_foreach_support(grads, device=device):
                 norms.extend(torch._foreach_norm(grads, norm_type))
             elif foreach:
                 raise RuntimeError(f'foreach=True was passed, but can\'t use the foreach API on {device.type} tensors')
@@ -73,7 +72,7 @@ def clip_grad_norm_(
     # when the gradients do not reside in CPU memory.
     clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
     for ((device, _), [grads]) in grouped_grads.items():
-        if (foreach is None or foreach) and device.type in ('cpu', 'cuda'):
+        if (foreach is None or foreach) and _has_foreach_support(grads, device=device):
             torch._foreach_mul_(grads, clip_coef_clamped.to(device))  # type: ignore[call-overload]
         elif foreach:
             raise RuntimeError(f'foreach=True was passed, but can\'t use the foreach API on {device.type} tensors')
@@ -111,8 +110,8 @@ def clip_grad_value_(parameters: _tensor_or_tensors, clip_value: float, foreach:
             The gradients are clipped in the range
             :math:`\left[\text{-clip\_value}, \text{clip\_value}\right]`
         foreach (bool): use the faster foreach-based implementation
-            If ``None``, use the foreach implementation for CUDA and CPU tensors and silently fall back to the slow
-            implementation for other device types.
+            If ``None``, use the foreach implementation for CUDA and CPU native tensors and
+            silently fall back to the slow implementation for other device types.
             Default: ``None``
     """
     if isinstance(parameters, torch.Tensor):
@@ -124,7 +123,7 @@ def clip_grad_value_(parameters: _tensor_or_tensors, clip_value: float, foreach:
         = _group_tensors_by_device_and_dtype([grads])  # type: ignore[assignment]
 
     for ((device, _), [grads]) in grouped_grads.items():
-        if (foreach is None or foreach) and device.type in {'cpu', 'cuda'}:
+        if (foreach is None or foreach) and _has_foreach_support(grads, device=device):
             torch._foreach_clamp_min_(grads, -clip_value)
             torch._foreach_clamp_max_(grads, clip_value)
         elif foreach:
diff --git a/torch/nn/utils/memory_format.py b/torch/nn/utils/memory_format.py
index 00e0e089ae87..e0c762af4cdb 100644
--- a/torch/nn/utils/memory_format.py
+++ b/torch/nn/utils/memory_format.py
@@ -64,7 +64,7 @@ def convert_conv2d_weight_memory_format(module, memory_format):
 
     # TODO: expand this to `_ConvNd` when channels_last support is extended
     # beyond only 4d tensors.
-    if isinstance(module, torch.nn.Conv2d) or isinstance(module, torch.nn.ConvTranspose2d):
+    if isinstance(module, (torch.nn.Conv2d, torch.nn.ConvTranspose2d)):
         weight_data = module.weight.detach().clone().contiguous(memory_format=memory_format)
         module.weight.data = weight_data.resize_(weight_data.size(), memory_format=memory_format)
     for child in module.children():
diff --git a/torch/nn/utils/rnn.py b/torch/nn/utils/rnn.py
index b9db6a5f1a9c..1a322b2167ca 100644
--- a/torch/nn/utils/rnn.py
+++ b/torch/nn/utils/rnn.py
@@ -1,24 +1,21 @@
-from collections import namedtuple
 import warnings
+from typing import Iterable, List, NamedTuple, Tuple, Union
 
 import torch
 from torch import Tensor
 from ... import _VF
 from ..._jit_internal import Optional
 
-from typing import List, Tuple, Union, Iterable
-
 
 __all__ = ['PackedSequence', 'invert_permutation', 'pack_padded_sequence', 'pad_packed_sequence', 'pad_sequence',
            'unpad_sequence', 'pack_sequence', 'unpack_sequence']
 
-PackedSequence_ = namedtuple('PackedSequence_',
-                             ['data', 'batch_sizes', 'sorted_indices', 'unsorted_indices'])
 
-# type annotation for PackedSequence_ to make it compatible with TorchScript
-PackedSequence_.__annotations__ = {'data': torch.Tensor, 'batch_sizes': torch.Tensor,
-                                   'sorted_indices': Optional[torch.Tensor],
-                                   'unsorted_indices': Optional[torch.Tensor]}
+class PackedSequence_(NamedTuple):
+    data: torch.Tensor
+    batch_sizes: torch.Tensor
+    sorted_indices: Optional[torch.Tensor]
+    unsorted_indices: Optional[torch.Tensor]
 
 
 def bind(optional, fn):
diff --git a/torch/nn/utils/rnn.pyi b/torch/nn/utils/rnn.pyi
index 2c1c6c97e4a5..d337caa7af36 100644
--- a/torch/nn/utils/rnn.pyi
+++ b/torch/nn/utils/rnn.pyi
@@ -1,10 +1,23 @@
-from collections import namedtuple
-from typing import Any, List, Optional, overload, Union, TypeVar, Tuple, Sequence
-from torch import Tensor
-from torch.types import _dtype, _device
+from typing import (
+    Any,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    TypeVar,
+    Union,
+    NamedTuple,
+    overload,
+)
 
-PackedSequence_ = namedtuple('PackedSequence_', ['data', 'batch_sizes', 'sorted_indices', 'unsorted_indices'])
+from torch import Tensor
+from torch.types import _device, _dtype
 
+class PackedSequence_(NamedTuple):
+    data: Tensor
+    batch_sizes: Tensor
+    sorted_indices: Optional[Tensor]
+    unsorted_indices: Optional[Tensor]
 
 def bind(optional: Any, fn: Any): ...
 
diff --git a/torch/nn/utils/stateless.py b/torch/nn/utils/stateless.py
index 570500b2d489..e35ddd739bb2 100644
--- a/torch/nn/utils/stateless.py
+++ b/torch/nn/utils/stateless.py
@@ -1,180 +1,157 @@
 import contextlib
-from typing import Any, Callable, Dict, Iterator, List, Tuple, Union, Set, Optional
 import warnings
+from collections import defaultdict
+from typing import Any, Dict, Iterator, Set, Tuple, Union
 
 import torch
 from torch import Tensor
+from torch.nn.utils._named_member_accessor import NamedMemberAccessor
 
 __all__ = ["functional_call"]
 
-# We avoid typing module here because module attributes are declared as Union[Parameter, Tensor] by default
-# and using other types causes mypy errors
-def _change_class(module, params_and_buffers) -> None:
-    cls = module.__class__
-    attr_to_path : Dict[str, str] = module._attr_to_path
-
-    def _getattribute(self, name: str) -> Any:
-        if name in attr_to_path:
-            return params_and_buffers[attr_to_path[name]]
-        return cls.__getattribute__(self, name)
-
-    def _setattr(self, name: str, value: Any) -> None:
-        if name in attr_to_path:
-            params_and_buffers[attr_to_path[name]] = value
-        else:
-            return cls.__setattr__(self, name, value)
-
-    param_cls = type(
-        f"StatelessReplacer{cls.__name__}",
-        (cls,),
-        {
-            "__getattribute__": _getattribute,
-            "__setattr__": _setattr,
-        },
-    )
-
-    module.__class__ = param_cls
-    module._orig_class = cls
-
 
-def _create_tied_weights_map(module: 'torch.nn.Module', params_and_buffers: Dict[str, Tensor]) -> Dict[str, str]:
+def _untie_named_tensors_map(
+    module: "torch.nn.Module",
+    parameters_and_buffers: Dict[str, Tensor],
+) -> Dict[str, Tensor]:
     """
-    _create_tied_weights_map(module: Module, params_and_buffers: Dict[str, Tensor]) -> Dict[str, str]
+    Unties all tied tensors in the module to parameters_and_buffers.
 
-    Creates a weight map of {tied_name: name_given_by_user} for all weights where one of their tied weights is passed
+    This function returns a new untied_parameters_and_buffers dictionary and leave the original
+    untied_parameters_and_buffers dictionary unchanged. It adds new (missing) keys for tied tensors
+    in the module to untied_parameters_and_buffers. The value of the new key is the user-given value
+    in the original parameters_and_buffers dictionary.
 
-    ex: Foo() has self.foo and self.tied_foo, which are tied. If a user passed {'foo': ...} as the reparamaterization,
-        this would return {'tied_foo': 'foo'}. Similarly if a user passed {'tied_foo': ...}, this returns
-        {'tied_foo': 'foo'}.
+    If there are more than one user-given values for the same tied tensor, it will raise an error.
 
-    ex: If there aren't any tied weights and the user passed values for every parameter and buffer, this will return a
-        map where every name maps to an empty set: {'l1.weight': set(), 'l1.bias': set(), ...}
+    For example, if the module has two tied weights self.foo and self.tied_foo and the user passes
+    {'foo': foo_value, ...}, this will return {'foo': foo_value, 'tied_foo': foo_value, ...}. If the
+    user passes {'foo': foo_value, 'tied_foo': tied_foo_value, ...}, it will raise an error. If the
+    user passes {'foo': foo_value, 'tied_foo': foo_value, ...}, it will not raise an error.
 
-    ex: The map only contains values that a user is reparamaterizing. For example, if module = nn.Linear(...) and the
-        user only passed a new value for 'bias', this looks returns: {'bias': set()}
+    Args:
+        module (torch.nn.Module): the module to determine which tensors are tied.
+        parameters_and_buffers (Dict[str, Tensor]): a map of {name: tensor} for reparamaterizing the module.
 
-    This is useful because we will start by reparamaterizing all the keys of params_and_buffers, then all the key from
-    this returned dictionary.
+    Returns:
+        A new untied version of the parameters_and_buffers dictionary.
+
+    Raises:
+        ValueError: if there are more than one user-given values for the same tied tensor.
     """
+    # A map of {name: tensor} for all tensors (including tied ones) in the module.
+    all_named_tensors: Dict[str, Tensor] = {}
+    all_named_tensors.update(module.named_parameters(remove_duplicate=False))
+    all_named_tensors.update(module.named_buffers(remove_duplicate=False))
+
+    # A map of {tensor: set(all_tied_names)} for all tensor names in the module.
+    tensor_to_tied_names_map: Dict[Tensor, Set[str]] = defaultdict(set)
+    for name, tensor in all_named_tensors.items():
+        tensor_to_tied_names_map[tensor].add(name)
+
+    # A map of {tied_name: set(all_tied_names)} for all tensor names in the module.
+    # If a name is not tied, it will not be in this map.
+    tied_names_map: Dict[str, Set[str]] = {}
+    for tied_names in tensor_to_tied_names_map.values():
+        if len(tied_names) > 1:
+            for tied_name in tied_names:
+                tied_names_map[tied_name] = tied_names
+
+    # Make sure the user didn't pass multiple values for the same tied tensor.
+    given_names = set(parameters_and_buffers.keys())
+    given_names_for_tied_tensors = given_names.intersection(tied_names_map.keys())
+    for given_name in given_names_for_tied_tensors:
+        tied_names = tied_names_map[given_name]
+        if (
+            # Detect if there are multiple keys present for the same tied tensor.
+            len(tied_names.intersection(given_names_for_tied_tensors)) > 1
+            # Only raise an error if the user passed multiple values for the same tied tensor.
+            # If all given values are the same, don't raise.
+            and len({parameters_and_buffers[tied_name] for tied_name in tied_names})
+            != 1
+        ):
+            raise ValueError(
+                f"functional_call got multiple values for keys {sorted(tied_names)}, "
+                f"which are tied. Consider using tie_weights=False"
+            )
 
-    # The basic algorithm looks like:
-    #   - index all weights by their original tensor value to find tied weights
-    #     - when we encounter a weight not used by the user, we save it in a set (second element in the tuple)
-    #     - when we run into a weight used by the user, we save that separate from the set as the first element in the tuple
-    #     - ending map looks like {tensor: (name_given_by_user, set(all_tied_names)}
-    #   - then loop through the values of this map (name_given_by_user and set(all_tied_names))
-    #     - for each element of all_tied_names, add {tied_name: name_given_by_user} to a new map
-
-    names = params_and_buffers.keys()
-    weight_to_name_and_tied_names: Dict[torch.Tensor, Tuple[Optional[str], Set[str]]] = {}
-
-    # create a map keyed by tensor value so that tied weights get mapped to the same key. The value is the interesting
-    # part at the end it's (used_name, (tied_names)).
-    # For example, in the first example where there's tied weights self.foo and self.tied_foo and the user passes a
-    # value for self.foo, this will return {torch.Tensor(...): ('foo', set('tied_foo'))}
-    def add_to_name_map(n: str, t: torch.Tensor):
-        # if the tensor hasn't been seen before, add it to the map
-        if t not in weight_to_name_and_tied_names:
-            weight_to_name_and_tied_names[t] = (n, set()) if n in names else (None, {n})
-            return
-
-        # if the name is not used by the user, we add it to the tied set
-        if n not in names:
-            weight_to_name_and_tied_names[t][1].add(n)
-            return
-
-        # check that the user didn't pass two different tensors for the same tied weight
-        first_seen_name = weight_to_name_and_tied_names[t][0]
-
-        # if they didn't pass multiple names for tied weights or used the same tensor, we set the used name
-        if first_seen_name is None or params_and_buffers[n] is params_and_buffers[first_seen_name]:
-            weight_to_name_and_tied_names[t] = (n, weight_to_name_and_tied_names[t][1])
-            return
-
-        raise ValueError(f"functional_call got values for both {n} and {first_seen_name}, which are tied. " +
-                         "Consider using tie_weights=False")
-
-    tensor: Tensor
-    for name, tensor in module.named_parameters(remove_duplicate=False):
-        add_to_name_map(name, tensor)
-
-    for name, tensor in module.named_buffers(remove_duplicate=False):
-        add_to_name_map(name, tensor)
-
-    # make {tied_name: name_given_by_user} from pairs of (name_given_by_user, set(all_tied_names))
-    tied_weights_to_given_name = {}
-    for name_given_by_user, tied_names in weight_to_name_and_tied_names.values():
-        if name_given_by_user is None:  # no mapping was passed for this tensor, use original tensor
-            continue
-        for tied_name in tied_names:
-            tied_weights_to_given_name[tied_name] = name_given_by_user
-    return tied_weights_to_given_name
-
-
-def _create_swap_params(params_and_buffers):
-    def _swap_parameters(module, tensor_name: str, full_path: str, tensor: Optional[Tensor]) -> None:
-        # Changes the module class to get a new __getattr__ dunder method
-        # that looks for the reparametrized tensor
-        if hasattr(module, "_attr_to_path"):
-            module._attr_to_path[tensor_name] = full_path
-        else:
-            module._attr_to_path = {}
-            module._attr_to_path[tensor_name] = full_path
-            _change_class(module, params_and_buffers)
-    return _swap_parameters
-
-
-def _remove_swap(module, name: str, full_path: str) -> None:
-    if hasattr(module, "_orig_class"):
-        module.__class__ = module._orig_class
-        delattr(module, "_orig_class")
-        delattr(module, "_attr_to_path")
+    # Untie the given named tensor map
+    # Make a copy for not modifying the original dict
+    untied_parameters_and_buffers = parameters_and_buffers.copy()
+    for given_name in given_names_for_tied_tensors:
+        for tied_name in tied_names_map[given_name]:
+            untied_parameters_and_buffers[tied_name] = parameters_and_buffers[
+                given_name
+            ]
+    return untied_parameters_and_buffers
 
 
 @contextlib.contextmanager
 def _reparametrize_module(
-    module: 'torch.nn.Module',
+    module: "torch.nn.Module",
     parameters_and_buffers: Dict[str, Tensor],
+    *,
     tie_weights: bool = False,
+    strict: bool = False,
 ) -> Iterator[None]:
-    tied_weights_map = _create_tied_weights_map(module, parameters_and_buffers) if tie_weights else {}
-    for name, tensor in parameters_and_buffers.items():
-        _apply_func_submodules(
-            _create_swap_params(parameters_and_buffers),
-            module, name.split("."), name, (tensor,))
-    for tied_name, user_given_name in tied_weights_map.items():
-        _apply_func_submodules(
-            _create_swap_params(parameters_and_buffers),
-            module, tied_name.split("."), user_given_name, (None,))
+    if tie_weights:
+        untied_parameters_and_buffers = _untie_named_tensors_map(
+            module, parameters_and_buffers
+        )
+    else:
+        untied_parameters_and_buffers = parameters_and_buffers
+
+    accessor = NamedMemberAccessor(module)
+    if strict:
+        missing_keys, unexpected_keys = accessor.check_keys(
+            untied_parameters_and_buffers
+        )
+        error_msgs = []
+        if len(unexpected_keys) > 0:
+            error_msgs.append(
+                "Unexpected key(s): {}.".format(", ".join(map(repr, unexpected_keys)))
+            )
+        if len(missing_keys) > 0:
+            error_msgs.append(
+                "Missing key(s): {}.".format(", ".join(map(repr, missing_keys)))
+            )
+        if len(error_msgs) > 0:
+            raise RuntimeError(
+                "Error(s) in reparametrizing for {}:\n\t{}".format(
+                    module._get_name(), "\n\t".join(error_msgs)
+                )
+            )
+
+    orig_parameters_and_buffers: Dict[str, Tensor] = {}
     try:
+        orig_parameters_and_buffers, _ = accessor.swap_tensors_dict(
+            untied_parameters_and_buffers, allow_missing=True
+        )
         yield
     finally:
-        for name in parameters_and_buffers:
-            _apply_func_submodules(
-                _remove_swap,
-                module, name.split("."), name, ())
-
-
-def _apply_func_submodules(
-    func: Callable[..., None],
-    module: 'torch.nn.Module',
-    path: List[str],
-    full_path: str,
-    args: Tuple,
-):
-    if len(path) == 1:
-        func(module, path[0], full_path, *args)
-    else:
-        _apply_func_submodules(func, getattr(module, path[0]), path[1:], full_path, args)
+        new_parameters_and_buffers, _ = accessor.swap_tensors_dict(
+            orig_parameters_and_buffers, allow_missing=True
+        )
+        # Sometimes the module is not completely stateless and has some in-place modifications on
+        # the _parameters and _buffers dictionaries.
+        # Write the changed parameters and buffers back to the original dict.
+        parameters_and_buffers.update(
+            {
+                k: new_parameters_and_buffers[k]
+                for k in parameters_and_buffers
+                if k in new_parameters_and_buffers
+            }
+        )
 
 
 def functional_call(
-    module: 'torch.nn.Module',
+    module: "torch.nn.Module",
     parameters_and_buffers: Dict[str, Tensor],
     args: Union[Any, Tuple],
     kwargs: Dict[str, Any] = None,
     *,
     tie_weights: bool = True,
+    strict: bool = False,
 ):
     r"""Performs a functional call on the module by replacing the module parameters
     and buffers with the provided ones.
@@ -229,6 +206,9 @@ def functional_call(
             tied in the reparamaterized version. Therefore, if True and different values are passed for the tied
             paramaters and buffers, it will error. If False, it will not respect the originally tied parameters and
             buffers unless the values passed for both weights are the same. Default: True.
+        strict (bool, optional): If True, then the parameters and buffers passed in must match the parameters and
+            buffers in the original module. Therefore, if True and there are any missing or unexpected keys, it will
+            error. Default: False.
 
     Returns:
         Any: the result of calling ``module``.
@@ -236,35 +216,47 @@ def functional_call(
     warnings.warn(
         "This API is deprecated as of PyTorch 2.0 and will be removed in a future "
         "version of PyTorch. Please use torch.func.functional_call instead "
-        "which is a drop-in replacement for this API.")
+        "which is a drop-in replacement for this API."
+    )
+
+    return _functional_call(
+        module,
+        parameters_and_buffers,
+        args,
+        kwargs,
+        tie_weights=tie_weights,
+        strict=strict,
+    )
 
-    return _functional_call(module, parameters_and_buffers, args, kwargs,
-                            tie_weights=tie_weights)
 
 def _functional_call(
-    module: 'torch.nn.Module',
+    module: "torch.nn.Module",
     parameters_and_buffers: Dict[str, Tensor],
     args: Union[Any, Tuple],
     kwargs: Dict[str, Any] = None,
     *,
     tie_weights: bool = True,
+    strict: bool = False,
 ):
     # TODO allow kwargs such as unsafe and others for parametrization
     if (
-            torch.jit.is_tracing()
-            or torch.jit.is_scripting()
-            or isinstance(module, (
+        torch.jit.is_tracing()
+        or torch.jit.is_scripting()
+        or isinstance(
+            module,
+            (
                 torch.jit.RecursiveScriptModule,
                 torch.jit.ScriptModule,
-                torch.jit.ScriptFunction)
-            )
+                torch.jit.ScriptFunction,
+            ),
+        )
     ):
         raise RuntimeError("The stateless API can't be used with Jitted modules")
     if kwargs is None:
         kwargs = {}
-    with _reparametrize_module(module, parameters_and_buffers, tie_weights):
-        if isinstance(args, tuple):
-            out = module(*args, **kwargs)
-        else:
-            out = module(args, **kwargs)
-    return out
+    if not isinstance(args, tuple):
+        args = (args,)
+    with _reparametrize_module(
+        module, parameters_and_buffers, tie_weights=tie_weights, strict=strict
+    ):
+        return module(*args, **kwargs)
diff --git a/torch/nn/utils/weight_norm.py b/torch/nn/utils/weight_norm.py
index ab206a35be46..a56a5b150186 100644
--- a/torch/nn/utils/weight_norm.py
+++ b/torch/nn/utils/weight_norm.py
@@ -8,7 +8,7 @@
 
 __all__ = ['WeightNorm', 'weight_norm', 'remove_weight_norm']
 
-class WeightNorm(object):
+class WeightNorm:
     name: str
     dim: int
 
diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py
index da868112d0c3..3c6b90b6a90b 100644
--- a/torch/onnx/__init__.py
+++ b/torch/onnx/__init__.py
@@ -25,6 +25,7 @@
     symbolic_opset15,
     symbolic_opset16,
     symbolic_opset17,
+    symbolic_opset18,
     utils,
 )
 
@@ -62,6 +63,7 @@
     "symbolic_opset15",
     "symbolic_opset16",
     "symbolic_opset17",
+    "symbolic_opset18",
     # Enums
     "ExportTypes",
     "OperatorExportTypes",
diff --git a/torch/onnx/_constants.py b/torch/onnx/_constants.py
index ed27f94a9e14..2d218e65f162 100644
--- a/torch/onnx/_constants.py
+++ b/torch/onnx/_constants.py
@@ -4,7 +4,7 @@
 
 ONNX_BASE_OPSET = 9
 ONNX_MIN_OPSET = 7
-ONNX_MAX_OPSET = 17
+ONNX_MAX_OPSET = 18
 # ONNX_DEFAULT_OPSET generated by tools/onnx/update_default_opset_version.py
 ONNX_DEFAULT_OPSET = 14
 ONNX_CONSTANT_FOLDING_MIN_OPSET = 9
@@ -12,3 +12,4 @@
 PYTORCH_GITHUB_ISSUES_URL = "https://github.com/pytorch/pytorch/issues"
 
 INT64_MAX = 9223372036854775807
+INT32_MAX = 2147483647
diff --git a/torch/onnx/_internal/diagnostics/__init__.py b/torch/onnx/_internal/diagnostics/__init__.py
index 304978dbe22d..73c6db4f4e50 100644
--- a/torch/onnx/_internal/diagnostics/__init__.py
+++ b/torch/onnx/_internal/diagnostics/__init__.py
@@ -1,8 +1,8 @@
 from ._diagnostic import (
-    context,
     create_export_diagnostic_context,
     diagnose,
     engine,
+    export_context,
     ExportDiagnostic,
 )
 from ._rules import rules
@@ -13,7 +13,7 @@
     "rules",
     "levels",
     "engine",
-    "context",
+    "export_context",
     "create_export_diagnostic_context",
     "diagnose",
 ]
diff --git a/torch/onnx/_internal/diagnostics/_diagnostic.py b/torch/onnx/_internal/diagnostics/_diagnostic.py
index efe5c0e34911..fb0c7e0fe2ab 100644
--- a/torch/onnx/_internal/diagnostics/_diagnostic.py
+++ b/torch/onnx/_internal/diagnostics/_diagnostic.py
@@ -1,18 +1,17 @@
 """Diagnostic components for PyTorch ONNX export."""
+from __future__ import annotations
 
 import contextlib
-from typing import Optional, TypeVar
+from collections.abc import Generator
+from typing import Optional
 
 import torch
+
 from torch.onnx._internal.diagnostics import infra
-from torch.onnx._internal.diagnostics.infra import utils as infra_utils
 from torch.utils import cpp_backtrace
 
-# This is a workaround for mypy not supporting Self from typing_extensions.
-_ExportDiagnostic = TypeVar("_ExportDiagnostic", bound="ExportDiagnostic")
-
 
-def _cpp_call_stack(frames_to_skip: int = 0, frames_to_log: int = 32):
+def _cpp_call_stack(frames_to_skip: int = 0, frames_to_log: int = 32) -> infra.Stack:
     """Returns the current C++ call stack.
 
     This function utilizes `torch.utils.cpp_backtrace` to get the current C++ call stack.
@@ -21,6 +20,7 @@ def _cpp_call_stack(frames_to_skip: int = 0, frames_to_log: int = 32):
     r"frame #[0-9]+: (?P<frame_info>.*)". More info at `c10/util/Backtrace.cpp`.
 
     """
+    # NOTE: Cannot use `@_beartype.beartype`. It somehow erases the cpp stack frame info.
     frames = cpp_backtrace.get_cpp_backtrace(frames_to_skip, frames_to_log).split("\n")
     frame_messages = []
     for frame in frames:
@@ -51,28 +51,31 @@ class ExportDiagnostic(infra.Diagnostic):
     def __init__(
         self,
         *args,
+        frames_to_skip: int = 1,
+        cpp_stack: bool = False,
         **kwargs,
     ) -> None:
         super().__init__(*args, **kwargs)
-        self.record_python_call_stack(frames_to_skip=1)
-        self.record_cpp_call_stack(frames_to_skip=1)
-
-    def record_python_call_stack(self, frames_to_skip) -> None:
-        """Records the current Python call stack in the diagnostic."""
-        frames_to_skip += 1  # Skip this function.
-        stack = infra_utils.python_call_stack(frames_to_skip=frames_to_skip)
-        stack.message = "Python call stack"
-        self.with_stack(stack)
-        self.python_call_stack = stack
+        self.python_call_stack = self.record_python_call_stack(
+            frames_to_skip=frames_to_skip
+        )
+        if cpp_stack:
+            self.cpp_call_stack = self.record_cpp_call_stack(
+                frames_to_skip=frames_to_skip
+            )
 
-    def record_cpp_call_stack(self, frames_to_skip) -> None:
+    def record_cpp_call_stack(self, frames_to_skip: int) -> infra.Stack:
         """Records the current C++ call stack in the diagnostic."""
+        # NOTE: Cannot use `@_beartype.beartype`. It somehow erases the cpp stack frame info.
         # No need to skip this function because python frame is not recorded
         # in cpp call stack.
         stack = _cpp_call_stack(frames_to_skip=frames_to_skip)
         stack.message = "C++ call stack"
         self.with_stack(stack)
-        self.cpp_call_stack = stack
+        return stack
+
+    def record_fx_graphmodule(self, gm: torch.fx.GraphModule) -> None:
+        self.with_graph(infra.Graph(gm.print_readable(False), gm.__class__.__name__))
 
 
 class ExportDiagnosticEngine(infra.DiagnosticEngine):
@@ -116,38 +119,51 @@ def sarif_log(self):
 
 
 engine = ExportDiagnosticEngine()
-context = engine.background_context
+_context = engine.background_context
 
 
 @contextlib.contextmanager
-def create_export_diagnostic_context():
+def create_export_diagnostic_context() -> Generator[
+    infra.DiagnosticContext, None, None
+]:
     """Create a diagnostic context for export.
 
     This is a workaround for code robustness since diagnostic context is accessed by
     export internals via global variable. See `ExportDiagnosticEngine` for more details.
     """
-    global context
-    context = engine.create_diagnostic_context(
+    global _context
+    assert (
+        _context == engine.background_context
+    ), "Export context is already set. Nested export is not supported."
+    _context = engine.create_diagnostic_context(
         "torch.onnx.export", torch.__version__, diagnostic_type=ExportDiagnostic
     )
     try:
-        yield context
+        yield _context
     finally:
-        context.pretty_print(context.options.log_verbose, context.options.log_level)
-        context = engine.background_context
+        _context.pretty_print(_context.options.log_verbose, _context.options.log_level)
+        _context = engine.background_context
 
 
 def diagnose(
     rule: infra.Rule,
     level: infra.Level,
     message: Optional[str] = None,
+    frames_to_skip: int = 2,
     **kwargs,
 ) -> ExportDiagnostic:
     """Creates a diagnostic and record it in the global diagnostic context.
 
     This is a wrapper around `context.record` that uses the global diagnostic context.
     """
-    global context
-    diagnostic = ExportDiagnostic(rule, level, message, **kwargs)
-    context.add_diagnostic(diagnostic)
+    # NOTE: Cannot use `@_beartype.beartype`. It somehow erases the cpp stack frame info.
+    diagnostic = ExportDiagnostic(
+        rule, level, message, frames_to_skip=frames_to_skip, **kwargs
+    )
+    export_context().add_diagnostic(diagnostic)
     return diagnostic
+
+
+def export_context() -> infra.DiagnosticContext:
+    global _context
+    return _context
diff --git a/torch/onnx/_internal/diagnostics/_rules.py b/torch/onnx/_internal/diagnostics/_rules.py
index f9948388d5da..de2a110afdcf 100644
--- a/torch/onnx/_internal/diagnostics/_rules.py
+++ b/torch/onnx/_internal/diagnostics/_rules.py
@@ -7,6 +7,7 @@
 """
 
 import dataclasses
+from typing import Tuple
 
 # flake8: noqa
 from torch.onnx._internal.diagnostics import infra
@@ -28,6 +29,15 @@ def format_message(self, op_name) -> str:  # type: ignore[override]
         """
         return self.message_default_template.format(op_name=op_name)
 
+    def format(  # type: ignore[override]
+        self, level: infra.Level, op_name
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: 'The shape inference of {op_name} type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function.'
+        """
+        return self, level, self.format_message(op_name=op_name)
+
 
 class _MissingCustomSymbolicFunction(infra.Rule):
     """Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX."""
@@ -39,11 +49,22 @@ def format_message(self, op_name) -> str:  # type: ignore[override]
         """
         return self.message_default_template.format(op_name=op_name)
 
+    def format(  # type: ignore[override]
+        self, level: infra.Level, op_name
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: 'ONNX export failed on an operator with unrecognized namespace {op_name}. If you are trying to export a custom operator, make sure you registered it with the right domain and version.'
+        """
+        return self, level, self.format_message(op_name=op_name)
+
 
 class _MissingStandardSymbolicFunction(infra.Rule):
     """Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX."""
 
-    def format_message(self, op_name, opset_version, issue_url) -> str:  # type: ignore[override]
+    def format_message(  # type: ignore[override]
+        self, op_name, opset_version, issue_url
+    ) -> str:
         """Returns the formatted default message of this Rule.
 
         Message template: "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Please feel free to request support or submit a pull request on PyTorch GitHub: {issue_url}."
@@ -52,11 +73,28 @@ def format_message(self, op_name, opset_version, issue_url) -> str:  # type: ign
             op_name=op_name, opset_version=opset_version, issue_url=issue_url
         )
 
+    def format(  # type: ignore[override]
+        self, level: infra.Level, op_name, opset_version, issue_url
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Please feel free to request support or submit a pull request on PyTorch GitHub: {issue_url}."
+        """
+        return (
+            self,
+            level,
+            self.format_message(
+                op_name=op_name, opset_version=opset_version, issue_url=issue_url
+            ),
+        )
+
 
 class _OperatorSupportedInNewerOpsetVersion(infra.Rule):
     """Operator is supported in newer opset version."""
 
-    def format_message(self, op_name, opset_version, supported_opset_version) -> str:  # type: ignore[override]
+    def format_message(  # type: ignore[override]
+        self, op_name, opset_version, supported_opset_version
+    ) -> str:
         """Returns the formatted default message of this Rule.
 
         Message template: "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Support for this operator was added in version {supported_opset_version}, try exporting with this version."
@@ -67,6 +105,279 @@ def format_message(self, op_name, opset_version, supported_opset_version) -> str
             supported_opset_version=supported_opset_version,
         )
 
+    def format(  # type: ignore[override]
+        self, level: infra.Level, op_name, opset_version, supported_opset_version
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Support for this operator was added in version {supported_opset_version}, try exporting with this version."
+        """
+        return (
+            self,
+            level,
+            self.format_message(
+                op_name=op_name,
+                opset_version=opset_version,
+                supported_opset_version=supported_opset_version,
+            ),
+        )
+
+
+class _FxTracerSuccess(infra.Rule):
+    """FX Tracer succeeded."""
+
+    def format_message(self, fn_name, tracer_name) -> str:  # type: ignore[override]
+        """Returns the formatted default message of this Rule.
+
+        Message template: "The callable '{fn_name}' is successfully traced as a 'torch.fx.GraphModule' by '{tracer_name}'."
+        """
+        return self.message_default_template.format(
+            fn_name=fn_name, tracer_name=tracer_name
+        )
+
+    def format(  # type: ignore[override]
+        self, level: infra.Level, fn_name, tracer_name
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: "The callable '{fn_name}' is successfully traced as a 'torch.fx.GraphModule' by '{tracer_name}'."
+        """
+        return (
+            self,
+            level,
+            self.format_message(fn_name=fn_name, tracer_name=tracer_name),
+        )
+
+
+class _FxTracerFailure(infra.Rule):
+    """FX Tracer failed."""
+
+    def format_message(  # type: ignore[override]
+        self, fn_name, tracer_name, explanation
+    ) -> str:
+        """Returns the formatted default message of this Rule.
+
+        Message template: "The callable '{fn_name}' is not successfully traced as a 'torch.fx.GraphModule' by '{tracer_name}'.\n{explanation}"
+        """
+        return self.message_default_template.format(
+            fn_name=fn_name, tracer_name=tracer_name, explanation=explanation
+        )
+
+    def format(  # type: ignore[override]
+        self, level: infra.Level, fn_name, tracer_name, explanation
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: "The callable '{fn_name}' is not successfully traced as a 'torch.fx.GraphModule' by '{tracer_name}'.\n{explanation}"
+        """
+        return (
+            self,
+            level,
+            self.format_message(
+                fn_name=fn_name, tracer_name=tracer_name, explanation=explanation
+            ),
+        )
+
+
+class _FxFrontendAotautograd(infra.Rule):
+    """FX Tracer succeeded."""
+
+    def format_message(self, fn_name, tracer_name) -> str:  # type: ignore[override]
+        """Returns the formatted default message of this Rule.
+
+        Message template: "The callable '{fn_name}' is successfully traced as a 'torch.fx.GraphModule' by '{tracer_name}'."
+        """
+        return self.message_default_template.format(
+            fn_name=fn_name, tracer_name=tracer_name
+        )
+
+    def format(  # type: ignore[override]
+        self, level: infra.Level, fn_name, tracer_name
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: "The callable '{fn_name}' is successfully traced as a 'torch.fx.GraphModule' by '{tracer_name}'."
+        """
+        return (
+            self,
+            level,
+            self.format_message(fn_name=fn_name, tracer_name=tracer_name),
+        )
+
+
+class _FxPassConvertNegToSigmoid(infra.Rule):
+    """FX pass converting torch.neg to torch.sigmoid."""
+
+    def format_message(  # type: ignore[override]
+        self,
+    ) -> str:
+        """Returns the formatted default message of this Rule.
+
+        Message template: "Running 'convert-neg-to-sigmoid' pass on 'torch.fx.GraphModule'."
+        """
+        return self.message_default_template.format()
+
+    def format(  # type: ignore[override]
+        self,
+        level: infra.Level,
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: "Running 'convert-neg-to-sigmoid' pass on 'torch.fx.GraphModule'."
+        """
+        return self, level, self.format_message()
+
+
+class _FxIrAddNode(infra.Rule):
+    """ToDo, experimenting diagnostics, placeholder text."""
+
+    def format_message(  # type: ignore[override]
+        self,
+    ) -> str:
+        """Returns the formatted default message of this Rule.
+
+        Message template: 'ToDo, experimenting diagnostics, placeholder text.'
+        """
+        return self.message_default_template.format()
+
+    def format(  # type: ignore[override]
+        self,
+        level: infra.Level,
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: 'ToDo, experimenting diagnostics, placeholder text.'
+        """
+        return self, level, self.format_message()
+
+
+class _AtenlibSymbolicFunction(infra.Rule):
+    """Op level tracking. ToDo, experimenting diagnostics, placeholder text."""
+
+    def format_message(  # type: ignore[override]
+        self,
+    ) -> str:
+        """Returns the formatted default message of this Rule.
+
+        Message template: 'ToDo, experimenting diagnostics, placeholder text.'
+        """
+        return self.message_default_template.format()
+
+    def format(  # type: ignore[override]
+        self,
+        level: infra.Level,
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: 'ToDo, experimenting diagnostics, placeholder text.'
+        """
+        return self, level, self.format_message()
+
+
+class _AtenlibFxToOnnx(infra.Rule):
+    """Graph level tracking. Each op is a step. ToDo, experimenting diagnostics, placeholder text."""
+
+    def format_message(  # type: ignore[override]
+        self,
+    ) -> str:
+        """Returns the formatted default message of this Rule.
+
+        Message template: 'ToDo, experimenting diagnostics, placeholder text.'
+        """
+        return self.message_default_template.format()
+
+    def format(  # type: ignore[override]
+        self,
+        level: infra.Level,
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: 'ToDo, experimenting diagnostics, placeholder text.'
+        """
+        return self, level, self.format_message()
+
+
+class _FxNodeToOnnx(infra.Rule):
+    """Node level tracking. ToDo, experimenting diagnostics, placeholder text."""
+
+    def format_message(  # type: ignore[override]
+        self,
+    ) -> str:
+        """Returns the formatted default message of this Rule.
+
+        Message template: 'ToDo, experimenting diagnostics, placeholder text.'
+        """
+        return self.message_default_template.format()
+
+    def format(  # type: ignore[override]
+        self,
+        level: infra.Level,
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: 'ToDo, experimenting diagnostics, placeholder text.'
+        """
+        return self, level, self.format_message()
+
+
+class _FxFrontendDynamoMakeFx(infra.Rule):
+    """The make_fx + decomposition pass on fx graph produced from Dynamo, before ONNX export."""
+
+    def format_message(  # type: ignore[override]
+        self,
+    ) -> str:
+        """Returns the formatted default message of this Rule.
+
+        Message template: 'ToDo, experimenting diagnostics, placeholder text.'
+        """
+        return self.message_default_template.format()
+
+    def format(  # type: ignore[override]
+        self,
+        level: infra.Level,
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: 'ToDo, experimenting diagnostics, placeholder text.'
+        """
+        return self, level, self.format_message()
+
+
+class _ArgFormatTooVerbose(infra.Rule):
+    """The formatted str for argument to display is too verbose."""
+
+    def format_message(  # type: ignore[override]
+        self, length, length_limit, argument_type, formatter_type
+    ) -> str:
+        """Returns the formatted default message of this Rule.
+
+        Message template: 'Too verbose ({length} > {length_limit}). Argument type {argument_type} for formatter {formatter_type}.'
+        """
+        return self.message_default_template.format(
+            length=length,
+            length_limit=length_limit,
+            argument_type=argument_type,
+            formatter_type=formatter_type,
+        )
+
+    def format(  # type: ignore[override]
+        self, level: infra.Level, length, length_limit, argument_type, formatter_type
+    ) -> Tuple[infra.Rule, infra.Level, str]:
+        """Returns a tuple of (Rule, Level, message) for this Rule.
+
+        Message template: 'Too verbose ({length} > {length_limit}). Argument type {argument_type} for formatter {formatter_type}.'
+        """
+        return (
+            self,
+            level,
+            self.format_message(
+                length=length,
+                length_limit=length_limit,
+                argument_type=argument_type,
+                formatter_type=formatter_type,
+            ),
+        )
+
 
 @dataclasses.dataclass
 class _POERules(infra.RuleCollection):
@@ -168,5 +479,249 @@ class _POERules(infra.RuleCollection):
     )
     """Operator is supported in newer opset version."""
 
+    fx_tracer_success: _FxTracerSuccess = dataclasses.field(
+        default=_FxTracerSuccess.from_sarif(
+            **{
+                "id": "FXE0001",
+                "name": "fx-tracer-success",
+                "short_description": {"text": "FX Tracer succeeded."},
+                "full_description": {
+                    "text": "FX Tracer succeeded. The callable is successfully traced as a 'torch.fx.GraphModule' by one of the fx tracers.",
+                    "markdown": "FX Tracer succeeded.\nThe callable is successfully traced as a 'torch.fx.GraphModule' by one of the fx tracers.\n",
+                },
+                "message_strings": {
+                    "default": {
+                        "text": "The callable '{fn_name}' is successfully traced as a 'torch.fx.GraphModule' by '{tracer_name}'."
+                    }
+                },
+                "help_uri": None,
+                "properties": {"deprecated": False, "tags": []},
+            }
+        ),
+        init=False,
+    )
+    """FX Tracer succeeded."""
+
+    fx_tracer_failure: _FxTracerFailure = dataclasses.field(
+        default=_FxTracerFailure.from_sarif(
+            **{
+                "id": "FXE0002",
+                "name": "fx-tracer-failure",
+                "short_description": {"text": "FX Tracer failed."},
+                "full_description": {
+                    "text": "FX Tracer failed. The callable is not successfully traced as a 'torch.fx.GraphModule'.",
+                    "markdown": "FX Tracer failed.\nThe callable is not successfully traced as a 'torch.fx.GraphModule'.\n",
+                },
+                "message_strings": {
+                    "default": {
+                        "text": "The callable '{fn_name}' is not successfully traced as a 'torch.fx.GraphModule' by '{tracer_name}'.\n{explanation}"
+                    }
+                },
+                "help_uri": None,
+                "properties": {"deprecated": False, "tags": []},
+            }
+        ),
+        init=False,
+    )
+    """FX Tracer failed."""
+
+    fx_frontend_aotautograd: _FxFrontendAotautograd = dataclasses.field(
+        default=_FxFrontendAotautograd.from_sarif(
+            **{
+                "id": "FXE0003",
+                "name": "fx-frontend-aotautograd",
+                "short_description": {"text": "FX Tracer succeeded."},
+                "full_description": {
+                    "text": "FX Tracer succeeded. The callable is successfully traced as a 'torch.fx.GraphModule' by one of the fx tracers.",
+                    "markdown": "FX Tracer succeeded.\nThe callable is successfully traced as a 'torch.fx.GraphModule' by one of the fx tracers.\n",
+                },
+                "message_strings": {
+                    "default": {
+                        "text": "The callable '{fn_name}' is successfully traced as a 'torch.fx.GraphModule' by '{tracer_name}'."
+                    }
+                },
+                "help_uri": None,
+                "properties": {"deprecated": False, "tags": []},
+            }
+        ),
+        init=False,
+    )
+    """FX Tracer succeeded."""
+
+    fx_pass_convert_neg_to_sigmoid: _FxPassConvertNegToSigmoid = dataclasses.field(
+        default=_FxPassConvertNegToSigmoid.from_sarif(
+            **{
+                "id": "FXE0004",
+                "name": "fx-pass-convert-neg-to-sigmoid",
+                "short_description": {
+                    "text": "FX pass converting torch.neg to torch.sigmoid."
+                },
+                "full_description": {
+                    "text": "A 'fx.Interpreter' based pass to convert all 'torch.neg' calls to 'torch.sigmoid' for a given 'torch.fx.GraphModule' object.",
+                    "markdown": "A 'fx.Interpreter' based pass to convert all 'torch.neg' calls to 'torch.sigmoid' for\na given 'torch.fx.GraphModule' object.\n",
+                },
+                "message_strings": {
+                    "default": {
+                        "text": "Running 'convert-neg-to-sigmoid' pass on 'torch.fx.GraphModule'."
+                    }
+                },
+                "help_uri": None,
+                "properties": {"deprecated": False, "tags": []},
+            }
+        ),
+        init=False,
+    )
+    """FX pass converting torch.neg to torch.sigmoid."""
+
+    fx_ir_add_node: _FxIrAddNode = dataclasses.field(
+        default=_FxIrAddNode.from_sarif(
+            **{
+                "id": "FXE0005",
+                "name": "fx-ir-add-node",
+                "short_description": {
+                    "text": "ToDo, experimenting diagnostics, placeholder text."
+                },
+                "full_description": {
+                    "text": "ToDo, experimenting diagnostics, placeholder text.",
+                    "markdown": "ToDo, experimenting diagnostics, placeholder text.\n",
+                },
+                "message_strings": {
+                    "default": {
+                        "text": "ToDo, experimenting diagnostics, placeholder text."
+                    }
+                },
+                "help_uri": None,
+                "properties": {"deprecated": False, "tags": []},
+            }
+        ),
+        init=False,
+    )
+    """ToDo, experimenting diagnostics, placeholder text."""
+
+    atenlib_symbolic_function: _AtenlibSymbolicFunction = dataclasses.field(
+        default=_AtenlibSymbolicFunction.from_sarif(
+            **{
+                "id": "FXE0006",
+                "name": "atenlib-symbolic-function",
+                "short_description": {
+                    "text": "Op level tracking. ToDo, experimenting diagnostics, placeholder text."
+                },
+                "full_description": {
+                    "text": "ToDo, experimenting diagnostics, placeholder text.",
+                    "markdown": "ToDo, experimenting diagnostics, placeholder text.\n",
+                },
+                "message_strings": {
+                    "default": {
+                        "text": "ToDo, experimenting diagnostics, placeholder text."
+                    }
+                },
+                "help_uri": None,
+                "properties": {"deprecated": False, "tags": []},
+            }
+        ),
+        init=False,
+    )
+    """Op level tracking. ToDo, experimenting diagnostics, placeholder text."""
+
+    atenlib_fx_to_onnx: _AtenlibFxToOnnx = dataclasses.field(
+        default=_AtenlibFxToOnnx.from_sarif(
+            **{
+                "id": "FXE0007",
+                "name": "atenlib-fx-to-onnx",
+                "short_description": {
+                    "text": "Graph level tracking. Each op is a step. ToDo, experimenting diagnostics, placeholder text."
+                },
+                "full_description": {
+                    "text": "ToDo, experimenting diagnostics, placeholder text.",
+                    "markdown": "ToDo, experimenting diagnostics, placeholder text.\n",
+                },
+                "message_strings": {
+                    "default": {
+                        "text": "ToDo, experimenting diagnostics, placeholder text."
+                    }
+                },
+                "help_uri": None,
+                "properties": {"deprecated": False, "tags": []},
+            }
+        ),
+        init=False,
+    )
+    """Graph level tracking. Each op is a step. ToDo, experimenting diagnostics, placeholder text."""
+
+    fx_node_to_onnx: _FxNodeToOnnx = dataclasses.field(
+        default=_FxNodeToOnnx.from_sarif(
+            **{
+                "id": "FXE0008",
+                "name": "fx-node-to-onnx",
+                "short_description": {
+                    "text": "Node level tracking. ToDo, experimenting diagnostics, placeholder text."
+                },
+                "full_description": {
+                    "text": "ToDo, experimenting diagnostics, placeholder text.",
+                    "markdown": "ToDo, experimenting diagnostics, placeholder text.\n",
+                },
+                "message_strings": {
+                    "default": {
+                        "text": "ToDo, experimenting diagnostics, placeholder text."
+                    }
+                },
+                "help_uri": None,
+                "properties": {"deprecated": False, "tags": []},
+            }
+        ),
+        init=False,
+    )
+    """Node level tracking. ToDo, experimenting diagnostics, placeholder text."""
+
+    fx_frontend_dynamo_make_fx: _FxFrontendDynamoMakeFx = dataclasses.field(
+        default=_FxFrontendDynamoMakeFx.from_sarif(
+            **{
+                "id": "FXE0009",
+                "name": "fx-frontend-dynamo-make-fx",
+                "short_description": {
+                    "text": "The make_fx + decomposition pass on fx graph produced from Dynamo, before ONNX export."
+                },
+                "full_description": {
+                    "text": "ToDo, experimenting diagnostics, placeholder text.",
+                    "markdown": "ToDo, experimenting diagnostics, placeholder text.\n",
+                },
+                "message_strings": {
+                    "default": {
+                        "text": "ToDo, experimenting diagnostics, placeholder text."
+                    }
+                },
+                "help_uri": None,
+                "properties": {"deprecated": False, "tags": []},
+            }
+        ),
+        init=False,
+    )
+    """The make_fx + decomposition pass on fx graph produced from Dynamo, before ONNX export."""
+
+    arg_format_too_verbose: _ArgFormatTooVerbose = dataclasses.field(
+        default=_ArgFormatTooVerbose.from_sarif(
+            **{
+                "id": "DIAGSYS0001",
+                "name": "arg-format-too-verbose",
+                "short_description": {
+                    "text": "The formatted str for argument to display is too verbose."
+                },
+                "full_description": {
+                    "text": "ToDo, experimenting diagnostics, placeholder text.",
+                    "markdown": "ToDo, experimenting diagnostics, placeholder text.\n",
+                },
+                "message_strings": {
+                    "default": {
+                        "text": "Too verbose ({length} > {length_limit}). Argument type {argument_type} for formatter {formatter_type}."
+                    }
+                },
+                "help_uri": None,
+                "properties": {"deprecated": False, "tags": []},
+            }
+        ),
+        init=False,
+    )
+    """The formatted str for argument to display is too verbose."""
+
 
 rules = _POERules()
diff --git a/torch/onnx/_internal/diagnostics/infra/__init__.py b/torch/onnx/_internal/diagnostics/infra/__init__.py
index 4f9dd9e5fa0b..1250a03e2735 100644
--- a/torch/onnx/_internal/diagnostics/infra/__init__.py
+++ b/torch/onnx/_internal/diagnostics/infra/__init__.py
@@ -1,7 +1,7 @@
 from ._infra import (
-    Diagnostic,
-    DiagnosticContext,
     DiagnosticOptions,
+    Graph,
+    Invocation,
     Level,
     levels,
     Location,
@@ -9,14 +9,18 @@
     RuleCollection,
     Stack,
     StackFrame,
+    Tag,
+    ThreadFlowLocation,
 )
-from .engine import DiagnosticEngine
+from .engine import Diagnostic, DiagnosticContext, DiagnosticEngine
 
 __all__ = [
     "Diagnostic",
     "DiagnosticContext",
     "DiagnosticEngine",
     "DiagnosticOptions",
+    "Graph",
+    "Invocation",
     "Level",
     "levels",
     "Location",
@@ -24,4 +28,6 @@
     "RuleCollection",
     "Stack",
     "StackFrame",
+    "Tag",
+    "ThreadFlowLocation",
 ]
diff --git a/torch/onnx/_internal/diagnostics/infra/_infra.py b/torch/onnx/_internal/diagnostics/infra/_infra.py
index 48ebf989084f..322e3dc8e5f6 100644
--- a/torch/onnx/_internal/diagnostics/infra/_infra.py
+++ b/torch/onnx/_internal/diagnostics/infra/_infra.py
@@ -4,7 +4,8 @@
 
 import dataclasses
 import enum
-from typing import FrozenSet, List, Optional, Sequence, Tuple, Type, TypeVar
+import pprint
+from typing import FrozenSet, List, Mapping, Optional, Sequence, Tuple
 
 from torch.onnx._internal.diagnostics.infra import formatter, sarif
 
@@ -14,7 +15,13 @@ class Level(enum.Enum):
 
     This class is used to represent the level of a diagnostic. The levels are defined
     by the SARIF specification, and are not modifiable. For alternative categories,
-    please use infra.Tag instead.
+    please use infra.Tag instead. When selecting a level, please consider the following
+    guidelines:
+
+    - NONE: Informational result that does not indicate the presence of a problem.
+    - NOTE: An opportunity for improvement was found.
+    - WARNING: A potential problem was found.
+    - ERROR: A serious problem was found.
     """
 
     NONE = enum.auto()
@@ -29,8 +36,6 @@ class Level(enum.Enum):
 class Tag(enum.Enum):
     """The tag of a diagnostic. This class can be inherited to define custom tags."""
 
-    pass
-
 
 class PatchedPropertyBag(sarif.PropertyBag):
     """Key/value pairs that provide additional information about the object.
@@ -98,6 +103,16 @@ def sarif(self) -> sarif.ReportingDescriptor:
             help_uri=self.help_uri,
         )
 
+    def format(self, level: Level, *args, **kwargs) -> Tuple[Rule, Level, str]:
+        """Returns a tuple of (rule, level, message) for a diagnostic.
+
+        This method is used to format the message of a diagnostic. The message is
+        formatted using the default template of this rule, and the arguments passed in
+        as `*args` and `**kwargs`. The level is used to override the default level of
+        this rule.
+        """
+        return (self, level, self.format_message(*args, **kwargs))
+
     def format_message(self, *args, **kwargs) -> str:
         """Returns the formatted default message of this Rule.
 
@@ -119,6 +134,7 @@ class Location:
     start_column: Optional[int] = None
     end_column: Optional[int] = None
     snippet: Optional[str] = None
+    function: Optional[str] = None
 
     def sarif(self) -> sarif.Location:
         """Returns the SARIF representation of this location."""
@@ -138,23 +154,14 @@ def sarif(self) -> sarif.Location:
         )
 
     def pretty_print(self):
-        """Prints the location in a human-readable format."""
-        location_strs = ["frame:"]
-        if self.snippet is not None:
-            location_strs.append(self.snippet)
-        if self.uri is not None:
-            line_strs = [self.uri]
-            line_strs.append(str(self.line)) if self.line is not None else "-1"
-            line_strs.append(
-                str(self.start_column)
-            ) if self.start_column is not None else "-1"
-            line_strs.append(
-                str(self.end_column)
-            ) if self.end_column is not None else "-1"
-            location_strs.append(":".join(line_strs))
-        if self.message is not None:
-            location_strs.append(f"({self.message})")
-        print(" ".join(location_strs))
+        """Prints the location in a traceback style format."""
+        unknown = "<unknown>"
+        snippet = self.snippet or unknown
+        uri = self.uri or unknown
+        function = self.function or unknown
+        lineno = self.line if self.line is not None else unknown
+        message = f"  # {self.message}" if self.message is not None else ""
+        print(f'  File "{uri}", line {lineno}, in {function}\n    {snippet}{message}')
 
 
 @dataclasses.dataclass
@@ -172,6 +179,8 @@ def pretty_print(self):
 
 @dataclasses.dataclass
 class Stack:
+    """Records a stack trace. The top of the stack is the first element in the list."""
+
     frames: List[StackFrame] = dataclasses.field(default_factory=list)
     message: Optional[str] = None
 
@@ -187,12 +196,35 @@ def sarif(self) -> sarif.Stack:
     def pretty_print(self):
         """Prints the stack in a human-readable format."""
         formatter.pretty_print_title(f"Stack: {self.message}", fill_char="-")
-        for frame in self.frames:
+        for frame in reversed(self.frames):
             frame.pretty_print()
 
 
-# This is a workaround for mypy not supporting Self from typing_extensions.
-_Diagnostic = TypeVar("_Diagnostic", bound="Diagnostic")
+@dataclasses.dataclass
+class ThreadFlowLocation:
+    """Records code location and the initial state."""
+
+    location: Location
+    state: Mapping[str, str]
+    index: int
+    stack: Optional[Stack] = None
+
+    def sarif(self) -> sarif.ThreadFlowLocation:
+        """Returns the SARIF representation of this thread flow location."""
+        return sarif.ThreadFlowLocation(
+            location=self.location.sarif(),
+            state=self.state,
+            stack=self.stack.sarif() if self.stack is not None else None,
+        )
+
+    def pretty_print(self, verbose: bool = False):
+        """Prints the thread flow location in a human-readable format."""
+        formatter.pretty_print_title(f"Step {self.index}", fill_char="-")
+        self.location.pretty_print()
+        if verbose:
+            print(f"State: {pprint.pformat(self.state)}")
+            if self.stack is not None:
+                self.stack.pretty_print()
 
 
 @dataclasses.dataclass
@@ -203,97 +235,32 @@ class Graph:
     The `nodes` and `edges` fields are unused in the current implementation.
     """
 
-    graph_str: str
+    graph: str
     name: str
     description: Optional[str] = None
 
     def sarif(self) -> sarif.Graph:
         """Returns the SARIF representation of this graph."""
         return sarif.Graph(
-            description=sarif.Message(text=self.graph_str),
+            description=sarif.Message(text=self.graph),
             properties=PatchedPropertyBag(name=self.name, description=self.description),
         )
 
-    def pretty_print(self):
-        pass
-
-
-@dataclasses.dataclass
-class Diagnostic:
-    rule: Rule
-    level: Level
-    message: Optional[str] = None
-    locations: List[Location] = dataclasses.field(default_factory=list)
-    stacks: List[Stack] = dataclasses.field(default_factory=list)
-    graphs: List[Graph] = dataclasses.field(default_factory=list)
-    additional_message: Optional[str] = None
-    tags: List[Tag] = dataclasses.field(default_factory=list)
-
-    def sarif(self) -> sarif.Result:
-        """Returns the SARIF Result representation of this diagnostic."""
-        message = self.message or self.rule.message_default_template
-        if self.additional_message is not None:
-            message = f"{message}\n{self.additional_message}"
-        sarif_result = sarif.Result(
-            message=sarif.Message(text=message),
-            level=self.level.name.lower(),  # type: ignore[arg-type]
-            rule_id=self.rule.id,
-        )
-        sarif_result.locations = [location.sarif() for location in self.locations]
-        sarif_result.stacks = [stack.sarif() for stack in self.stacks]
-        sarif_result.graphs = [graph.sarif() for graph in self.graphs]
-        sarif_result.properties = sarif.PropertyBag(
-            tags=[tag.value for tag in self.tags]
-        )
-        return sarif_result
-
-    def with_location(self: _Diagnostic, location: Location) -> _Diagnostic:
-        """Adds a location to the diagnostic."""
-        self.locations.append(location)
-        return self
-
-    def with_stack(self: _Diagnostic, stack: Stack) -> _Diagnostic:
-        """Adds a stack to the diagnostic."""
-        self.stacks.append(stack)
-        return self
-
-    def with_graph(self: _Diagnostic, graph: Graph) -> _Diagnostic:
-        """Adds a graph to the diagnostic."""
-        self.graphs.append(graph)
-        return self
-
-    def with_additional_message(self: _Diagnostic, message: str) -> _Diagnostic:
-        """Adds an additional message to the diagnostic."""
-        if self.additional_message is None:
-            self.additional_message = message
-        else:
-            self.additional_message = f"{self.additional_message}\n{message}"
-        return self
-
-    def pretty_print(self, verbose: bool = False, log_level: Level = Level.ERROR):
+    def pretty_print(
+        self,
+        verbose: bool = False,
+    ):
         """Prints the diagnostics in a human-readable format.
 
         Args:
-            verbose: If True, prints all information. E.g. stack frames, graphs, etc.
-                Otherwise, only prints compact information. E.g., rule name and display message.
+            verbose: If True, prints all information. Otherwise, only prints compact
+                information. E.g., graph name and description.
             log_level: The minimum level of diagnostics to print.
         """
-        if self.level.value < log_level.value:
-            return
-        formatter.pretty_print_item_title(f"{self.level.name}: {self.rule.name}")
-        print(self.message)
-
-        if not verbose:
-            print("<Set verbose=True to see more details>\n")
-            return
-
-        for location in self.locations:
-            location.pretty_print()
-        for stack in self.stacks:
-            stack.pretty_print()
-        for graph in self.graphs:
-            graph.pretty_print()
-        print()
+        formatter.pretty_print_title(f"Graph: {self.name}", fill_char="-")
+        print(self.description)
+        if verbose:
+            print(self.graph)
 
 
 @dataclasses.dataclass
@@ -334,6 +301,7 @@ def custom_collection_from_list(
 
 class Invocation:
     # TODO: Implement this.
+    # Tracks top level call arguments and diagnostic options.
     def __init__(self) -> None:
         raise NotImplementedError()
 
@@ -346,105 +314,3 @@ class DiagnosticOptions:
 
     log_verbose: bool = dataclasses.field(default=False)
     log_level: Level = dataclasses.field(default=Level.ERROR)
-
-
-@dataclasses.dataclass
-class DiagnosticContext:
-    name: str
-    version: str
-    options: DiagnosticOptions = dataclasses.field(default_factory=DiagnosticOptions)
-    diagnostic_type: Type[Diagnostic] = dataclasses.field(default=Diagnostic)
-    diagnostics: List[Diagnostic] = dataclasses.field(init=False, default_factory=list)
-    _invocation: Invocation = dataclasses.field(init=False)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        return True
-
-    def sarif(self) -> sarif.Run:
-        """Returns the SARIF Run object."""
-        return sarif.Run(
-            tool=sarif.Tool(
-                driver=sarif.ToolComponent(
-                    name=self.name,
-                    version=self.version,
-                    rules=[diagnostic.rule.sarif() for diagnostic in self.diagnostics],
-                )
-            ),
-            results=[diagnostic.sarif() for diagnostic in self.diagnostics],
-        )
-
-    def add_diagnostic(self, diagnostic: Diagnostic) -> None:
-        """Adds a diagnostic to the context.
-
-        Use this method to add diagnostics that are not created by the context.
-        Args:
-            diagnostic: The diagnostic to add.
-        """
-        if not isinstance(diagnostic, self.diagnostic_type):
-            raise TypeError(
-                f"Expected diagnostic of type {self.diagnostic_type}, got {type(diagnostic)}"
-            )
-        self.diagnostics.append(diagnostic)
-
-    def diagnose(
-        self,
-        rule: Rule,
-        level: Level,
-        message: Optional[str] = None,
-        **kwargs,
-    ) -> Diagnostic:
-        """Creates a diagnostic for the given arguments.
-
-        Args:
-            rule: The rule that triggered the diagnostic.
-            level: The level of the diagnostic.
-            message: The message of the diagnostic.
-            **kwargs: Additional arguments to pass to the Diagnostic constructor.
-
-        Returns:
-            The created diagnostic.
-
-        Raises:
-            ValueError: If the rule is not supported by the tool.
-        """
-        diagnostic = self.diagnostic_type(rule, level, message, **kwargs)
-        self.add_diagnostic(diagnostic)
-        return diagnostic
-
-    def pretty_print(
-        self, verbose: bool = False, log_level: Level = Level.ERROR
-    ) -> None:
-        """Prints the diagnostics in a human-readable format.
-
-        Args:
-            verbose: Whether to print the diagnostics in verbose mode. See Diagnostic.pretty_print.
-            log_level: The minimum level of diagnostics to print.
-        """
-        formatter.pretty_print_title(
-            f"Diagnostic Run {self.name} version {self.version}"
-        )
-        print(f"verbose: {verbose}, log level: {log_level}")
-        diagnostic_stats = {level: 0 for level in Level}
-        for diagnostic in self.diagnostics:
-            diagnostic_stats[diagnostic.level] += 1
-        formatter.pretty_print_title(
-            " ".join(f"{diagnostic_stats[level]} {level.name}" for level in Level)
-        )
-
-        for diagnostic in self.diagnostics:
-            diagnostic.pretty_print(verbose, log_level)
-
-        unprinted_diagnostic_stats = [
-            (level, count)
-            for level, count in diagnostic_stats.items()
-            if count > 0 and level.value < log_level.value
-        ]
-        if unprinted_diagnostic_stats:
-            print(
-                f"{' '.join(f'{count} {level.name}' for level, count in unprinted_diagnostic_stats)} "
-                "were not printed due to the log level."
-            )
-        print()
diff --git a/torch/onnx/_internal/diagnostics/infra/decorator.py b/torch/onnx/_internal/diagnostics/infra/decorator.py
new file mode 100644
index 000000000000..8fd244ec0e82
--- /dev/null
+++ b/torch/onnx/_internal/diagnostics/infra/decorator.py
@@ -0,0 +1,203 @@
+import functools
+import traceback
+from typing import Any, Callable, Dict, Optional, Tuple, Type
+
+from torch.onnx._internal import _beartype
+from torch.onnx._internal.diagnostics import infra
+from torch.onnx._internal.diagnostics.infra import formatter, utils
+
+
+MessageFormatterType = Callable[[Callable, Tuple[Any, ...], Dict[str, Any]], str]
+
+
+@_beartype.beartype
+def format_message_in_text(
+    fn: Callable, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+) -> str:
+    return f"{formatter.display_name(fn)}"
+
+
+@_beartype.beartype
+def format_exception_in_markdown(exception: Exception) -> str:
+    msg_list = ["### Exception log", "```"]
+    msg_list.extend(
+        traceback.format_exception(type(exception), exception, exception.__traceback__)
+    )
+    msg_list.append("```")
+    return "\n".join(msg_list)
+
+
+@_beartype.beartype
+def format_function_signature_in_markdown(
+    fn: Callable,
+    args: Tuple[Any, ...],
+    kwargs: Dict[str, Any],
+    format_argument: Callable[[Any], str] = formatter.format_argument,
+) -> str:
+    msg_list = [f"### Function Signature {formatter.display_name(fn)}"]
+
+    state = utils.function_state(fn, args, kwargs)
+
+    for k, v in state.items():
+        msg_list.append(f"- {k}: {format_argument(v)}")
+
+    return "\n".join(msg_list)
+
+
+@_beartype.beartype
+def format_return_values_in_markdown(
+    return_values: Any,
+    format_argument: Callable[[Any], str] = formatter.format_argument,
+) -> str:
+    return f"- Return value: {format_argument(return_values)}"
+
+
+ModifierCallableType = Callable[
+    [infra.Diagnostic, Callable, Tuple[Any, ...], Dict[str, Any], Any], None
+]
+
+
+@_beartype.beartype
+def modify_diagnostic(
+    diag: infra.Diagnostic,
+    fn: Callable,
+    args: Tuple[Any, ...],
+    kwargs: Dict[str, Any],
+    return_values: Any,
+) -> None:
+    return
+
+
+@_beartype.beartype
+def diagnose_call(
+    get_context: Callable[[], Optional[infra.DiagnosticContext]],
+    rule: infra.Rule,
+    level: infra.Level = infra.Level.NONE,
+    exception_report_level: infra.Level = infra.Level.WARNING,
+    diagnostic_type: Type[infra.Diagnostic] = infra.Diagnostic,
+    format_argument: Callable[[Any], str] = formatter.format_argument,
+    diagnostic_message_formatter: MessageFormatterType = format_message_in_text,
+    diagnostic_modifier: ModifierCallableType = modify_diagnostic,
+    report_criterion: Callable[
+        [Callable, Tuple[Any, ...], Dict[str, Any], Any], bool
+    ] = lambda _1, _2, _3, _4: True,
+) -> Callable:
+    def decorator(fn):
+        @functools.wraps(fn)
+        def wrapper(*args, **kwargs):
+            # TODO(bowbao): add switch to disable diagnostics.
+            ctx = get_context()
+            if ctx is None:
+                return fn(*args, **kwargs)
+
+            diag = diagnostic_type(
+                rule,
+                level,
+                diagnostic_message_formatter(fn, args, kwargs),
+            )
+
+            # pop the decorator frame
+            # TODO(bowbao): by default diagnostic doesn't have stack.
+            # So need to check before doing this. Make the code cleaner.
+            stack: Optional[infra.Stack] = None
+            if len(diag.stacks) > 0:
+                stack = diag.stacks[0]
+                stack.frames.pop(0)
+
+            # set function location
+            fn_location = utils.function_location(fn)
+            diag.locations.insert(0, fn_location)
+            # Add function location to the top of the stack.
+            if stack is not None:
+                stack.frames.insert(0, infra.StackFrame(location=fn_location))
+
+            additional_messages = [
+                format_function_signature_in_markdown(
+                    fn, args, kwargs, format_argument
+                ),
+            ]
+
+            return_values: Any = None
+            report_diagnostic: bool = True
+            with ctx.add_inflight_diagnostic(diag) as diag:
+                try:
+                    return_values = fn(*args, **kwargs)
+                    additional_messages.append(
+                        format_return_values_in_markdown(return_values, format_argument)
+                    )
+                    report_diagnostic = report_criterion(
+                        fn, args, kwargs, return_values
+                    )
+                    return return_values
+                except Exception as e:
+                    # Record exception.
+                    report_diagnostic = True
+                    diag.level = exception_report_level
+                    additional_messages.append(format_exception_in_markdown(e))
+                    raise
+                finally:
+                    if report_diagnostic:
+                        diag.with_additional_message(
+                            "\n".join(additional_messages).strip()
+                        )
+                        diagnostic_modifier(diag, fn, args, kwargs, return_values)
+                        ctx.add_diagnostic(diag)
+
+        return wrapper
+
+    return decorator
+
+
+@_beartype.beartype
+def diagnose_step(
+    get_context: Callable[[], Optional[infra.DiagnosticContext]],
+    rule: Optional[infra.Rule] = None,
+    message_formatter: MessageFormatterType = format_message_in_text,
+    format_argument: Callable[[Any], str] = formatter.format_argument,
+) -> Callable:
+    """Decorator to log a step in the inflight diagnostic.
+
+    Args:
+        get_context: A function that returns the diagnostic context where inflight
+            diagnostic is retrieved and modified by the decorator.
+        rule: The decorator logs this step to the top inflight diagnostic that matches
+            the rule. If None, the top inflight diagnostic in the stack will be picked,
+            regardless of its rule.
+
+    Returns:
+        A decorator that logs a step in the inflight diagnostic.
+    """
+
+    def decorator(fn):
+        @functools.wraps(fn)
+        def wrapper(*args, **kwargs):
+            ctx = get_context()
+            if ctx is None:
+                return fn(*args, **kwargs)
+
+            try:
+                diag = ctx.inflight_diagnostic(rule=rule)
+            except infra.engine.DiagnosticError:
+                # TODO(bowbao): this should trigger a built-in diagnostic.
+                traceback.print_exc()
+                return fn(*args, **kwargs)
+
+            state = utils.function_state(fn, args, kwargs)
+            state = {k: format_argument(v) for k, v in state.items()}
+            diag.record_python_call(
+                fn,
+                state,
+                message=message_formatter(fn, args, kwargs),
+                frames_to_skip=1,
+            )
+
+            return_values = fn(*args, **kwargs)
+            state["return_values"] = format_argument(return_values)
+            return return_values
+
+        return wrapper
+
+    return decorator
+
+
+# TODO(bowbao): decorator to report only when failed.
diff --git a/torch/onnx/_internal/diagnostics/infra/engine.py b/torch/onnx/_internal/diagnostics/infra/engine.py
index 9504ca84245b..001d52b4a73d 100644
--- a/torch/onnx/_internal/diagnostics/infra/engine.py
+++ b/torch/onnx/_internal/diagnostics/infra/engine.py
@@ -2,13 +2,343 @@
 
 from __future__ import annotations
 
-from typing import List, Optional, Type
+import contextlib
+
+import dataclasses
+
+import gzip
+
+from typing import Callable, Generator, List, Mapping, Optional, Type, TypeVar
+
+from typing_extensions import Literal
 
 from torch.onnx._internal.diagnostics import infra
-from torch.onnx._internal.diagnostics.infra import formatter, sarif
+from torch.onnx._internal.diagnostics.infra import formatter, sarif, utils
 from torch.onnx._internal.diagnostics.infra.sarif import version as sarif_version
 
 
+class DiagnosticError(RuntimeError):
+    pass
+
+
+# This is a workaround for mypy not supporting Self from typing_extensions.
+_Diagnostic = TypeVar("_Diagnostic", bound="Diagnostic")
+
+
+@dataclasses.dataclass
+class Diagnostic:
+    rule: infra.Rule
+    level: infra.Level
+    message: Optional[str] = None
+    locations: List[infra.Location] = dataclasses.field(default_factory=list)
+    stacks: List[infra.Stack] = dataclasses.field(default_factory=list)
+    graphs: List[infra.Graph] = dataclasses.field(default_factory=list)
+    thread_flow_locations: List[infra.ThreadFlowLocation] = dataclasses.field(
+        default_factory=list
+    )
+    additional_message: Optional[str] = None
+    tags: List[infra.Tag] = dataclasses.field(default_factory=list)
+
+    def sarif(self) -> sarif.Result:
+        """Returns the SARIF Result representation of this diagnostic."""
+        message = self.message or self.rule.message_default_template
+        if self.additional_message:
+            message_markdown = (
+                f"{message}\n\n## Additional Message:\n\n{self.additional_message}"
+            )
+        else:
+            message_markdown = message
+
+        kind: Literal["informational", "fail"] = (
+            "informational" if self.level == infra.Level.NONE else "fail"
+        )
+
+        sarif_result = sarif.Result(
+            message=sarif.Message(text=message, markdown=message_markdown),
+            level=self.level.name.lower(),  # type: ignore[arg-type]
+            rule_id=self.rule.id,
+            kind=kind,
+        )
+        sarif_result.locations = [location.sarif() for location in self.locations]
+        sarif_result.stacks = [stack.sarif() for stack in self.stacks]
+        sarif_result.graphs = [graph.sarif() for graph in self.graphs]
+        sarif_result.code_flows = [
+            sarif.CodeFlow(
+                thread_flows=[
+                    sarif.ThreadFlow(
+                        locations=[loc.sarif() for loc in self.thread_flow_locations]
+                    )
+                ]
+            )
+        ]
+        sarif_result.properties = sarif.PropertyBag(
+            tags=[tag.value for tag in self.tags]
+        )
+        return sarif_result
+
+    def with_location(self: _Diagnostic, location: infra.Location) -> _Diagnostic:
+        """Adds a location to the diagnostic."""
+        self.locations.append(location)
+        return self
+
+    def with_thread_flow_location(
+        self: _Diagnostic, location: infra.ThreadFlowLocation
+    ) -> _Diagnostic:
+        """Adds a thread flow location to the diagnostic."""
+        self.thread_flow_locations.append(location)
+        return self
+
+    def with_stack(self: _Diagnostic, stack: infra.Stack) -> _Diagnostic:
+        """Adds a stack to the diagnostic."""
+        self.stacks.append(stack)
+        return self
+
+    def with_graph(self: _Diagnostic, graph: infra.Graph) -> _Diagnostic:
+        """Adds a graph to the diagnostic."""
+        self.graphs.append(graph)
+        return self
+
+    def with_additional_message(self: _Diagnostic, message: str) -> _Diagnostic:
+        """Adds an additional message to the diagnostic."""
+        if self.additional_message is None:
+            self.additional_message = message
+        else:
+            self.additional_message = f"{self.additional_message}\n{message}"
+        return self
+
+    def record_python_call_stack(self, frames_to_skip: int) -> infra.Stack:
+        """Records the current Python call stack."""
+        frames_to_skip += 1  # Skip this function.
+        stack = utils.python_call_stack(frames_to_skip=frames_to_skip)
+        self.with_stack(stack)
+        if len(stack.frames) > 0:
+            self.with_location(stack.frames[0].location)
+        return stack
+
+    def record_python_call(
+        self,
+        fn: Callable,
+        state: Mapping[str, str],
+        message: Optional[str] = None,
+        frames_to_skip: int = 0,
+    ) -> infra.ThreadFlowLocation:
+        """Records a python call as one thread flow step."""
+        frames_to_skip += 1  # Skip this function.
+        stack = utils.python_call_stack(frames_to_skip=frames_to_skip, frames_to_log=5)
+        location = utils.function_location(fn)
+        location.message = message
+        # Add function location to the top of the stack.
+        stack.frames.insert(0, infra.StackFrame(location=location))
+        thread_flow_location = infra.ThreadFlowLocation(
+            location=location,
+            state=state,
+            index=len(self.thread_flow_locations),
+            stack=stack,
+        )
+        self.with_thread_flow_location(thread_flow_location)
+        return thread_flow_location
+
+    def pretty_print(
+        self, verbose: bool = False, log_level: infra.Level = infra.Level.ERROR
+    ):
+        """Prints the diagnostics in a human-readable format.
+
+        Args:
+            verbose: If True, prints all information. E.g. stack frames, graphs, etc.
+                Otherwise, only prints compact information. E.g., rule name and display message.
+            log_level: The minimum level of diagnostics to print.
+        """
+        if self.level.value < log_level.value:
+            return
+        formatter.pretty_print_item_title(f"{self.level.name}: {self.rule.name}")
+        print(self.message)
+        print(self.additional_message)
+
+        if not verbose:
+            print("<Set verbose=True to see more details>\n")
+            return
+
+        formatter.pretty_print_title("Locations", fill_char="-")
+        for location in self.locations:
+            location.pretty_print()
+        for stack in self.stacks:
+            stack.pretty_print()
+        formatter.pretty_print_title("Thread Flow Locations", fill_char="-")
+        for thread_flow_location in self.thread_flow_locations:
+            thread_flow_location.pretty_print(verbose=verbose)
+        for graph in self.graphs:
+            graph.pretty_print(verbose=verbose)
+
+        print()
+
+        # TODO: print help url to rule at the end.
+
+
+@dataclasses.dataclass
+class DiagnosticContext:
+    name: str
+    version: str
+    options: infra.DiagnosticOptions = dataclasses.field(
+        default_factory=infra.DiagnosticOptions
+    )
+    diagnostic_type: Type[Diagnostic] = dataclasses.field(default=Diagnostic)
+    diagnostics: List[Diagnostic] = dataclasses.field(init=False, default_factory=list)
+    # TODO(bowbao): Implement this.
+    # _invocation: infra.Invocation = dataclasses.field(init=False)
+    _inflight_diagnostics: List[Diagnostic] = dataclasses.field(
+        init=False, default_factory=list
+    )
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        return True
+
+    def sarif(self) -> sarif.Run:
+        """Returns the SARIF Run object."""
+        unique_rules = {diagnostic.rule for diagnostic in self.diagnostics}
+        return sarif.Run(
+            tool=sarif.Tool(
+                driver=sarif.ToolComponent(
+                    name=self.name,
+                    version=self.version,
+                    rules=[rule.sarif() for rule in unique_rules],
+                )
+            ),
+            results=[diagnostic.sarif() for diagnostic in self.diagnostics],
+        )
+
+    def add_diagnostic(self, diagnostic: Diagnostic) -> None:
+        """Adds a diagnostic to the context.
+
+        Use this method to add diagnostics that are not created by the context.
+        Args:
+            diagnostic: The diagnostic to add.
+        """
+        if not isinstance(diagnostic, Diagnostic):
+            raise TypeError(
+                f"Expected diagnostic of type {Diagnostic}, got {type(diagnostic)}"
+            )
+        self.diagnostics.append(diagnostic)
+
+    @contextlib.contextmanager
+    def add_inflight_diagnostic(
+        self, diagnostic: Diagnostic
+    ) -> Generator[Diagnostic, None, None]:
+        """Adds a diagnostic to the context.
+
+        Use this method to add diagnostics that are not created by the context.
+        Args:
+            diagnostic: The diagnostic to add.
+        """
+        self._inflight_diagnostics.append(diagnostic)
+        try:
+            yield diagnostic
+        finally:
+            self._inflight_diagnostics.pop()
+
+    def diagnose(
+        self,
+        rule: infra.Rule,
+        level: infra.Level,
+        message: Optional[str] = None,
+        **kwargs,
+    ) -> Diagnostic:
+        """Creates a diagnostic for the given arguments.
+
+        Args:
+            rule: The rule that triggered the diagnostic.
+            level: The level of the diagnostic.
+            message: The message of the diagnostic.
+            **kwargs: Additional arguments to pass to the Diagnostic constructor.
+
+        Returns:
+            The created diagnostic.
+
+        Raises:
+            ValueError: If the rule is not supported by the tool.
+        """
+        diagnostic = self.diagnostic_type(rule, level, message, **kwargs)
+        self.add_diagnostic(diagnostic)
+        return diagnostic
+
+    def push_inflight_diagnostic(self, diagnostic: Diagnostic) -> None:
+        """Pushes a diagnostic to the inflight diagnostics stack.
+
+        Args:
+            diagnostic: The diagnostic to push.
+
+        Raises:
+            ValueError: If the rule is not supported by the tool.
+        """
+        self._inflight_diagnostics.append(diagnostic)
+
+    def pop_inflight_diagnostic(self) -> Diagnostic:
+        """Pops the last diagnostic from the inflight diagnostics stack.
+
+        Returns:
+            The popped diagnostic.
+        """
+        return self._inflight_diagnostics.pop()
+
+    def inflight_diagnostic(self, rule: Optional[infra.Rule] = None) -> Diagnostic:
+        if rule is None:
+            # TODO(bowbao): Create builtin-rules and create diagnostic using that.
+            if len(self._inflight_diagnostics) <= 0:
+                raise DiagnosticError("No inflight diagnostics")
+
+            return self._inflight_diagnostics[-1]
+        else:
+            # TODO(bowbao): Improve efficiency with Mapping[Rule, List[Diagnostic]]
+            for diagnostic in reversed(self._inflight_diagnostics):
+                if diagnostic.rule == rule:
+                    return diagnostic
+            raise DiagnosticError(f"No inflight diagnostic for rule {rule.name}")
+
+    def pretty_print(
+        self, verbose: Optional[bool] = None, log_level: Optional[infra.Level] = None
+    ) -> None:
+        """Prints the diagnostics in a human-readable format.
+
+        Args:
+            verbose: Whether to print the diagnostics in verbose mode. See Diagnostic.pretty_print.
+                If not specified, uses the value of 'self.options.log_verbose'.
+            log_level: The minimum level of diagnostics to print.
+                If not specified, uses the value of 'self.options.log_level'.
+        """
+        if verbose is None:
+            verbose = self.options.log_verbose
+        if log_level is None:
+            log_level = self.options.log_level
+
+        formatter.pretty_print_title(
+            f"Diagnostic Run {self.name} version {self.version}"
+        )
+        print(f"verbose: {verbose}, log level: {log_level}")
+        diagnostic_stats = {level: 0 for level in infra.Level}
+        for diagnostic in self.diagnostics:
+            diagnostic_stats[diagnostic.level] += 1
+        formatter.pretty_print_title(
+            " ".join(f"{diagnostic_stats[level]} {level.name}" for level in infra.Level)
+        )
+
+        for diagnostic in self.diagnostics:
+            diagnostic.pretty_print(verbose, log_level)
+
+        unprinted_diagnostic_stats = [
+            (level, count)
+            for level, count in diagnostic_stats.items()
+            if count > 0 and level.value < log_level.value
+        ]
+        if unprinted_diagnostic_stats:
+            print(
+                f"{' '.join(f'{count} {level.name}' for level, count in unprinted_diagnostic_stats)} "
+                "were not printed due to the log level."
+            )
+        print()
+
+
 class DiagnosticEngine:
     """A generic diagnostic engine based on SARIF.
 
@@ -44,7 +374,7 @@ class DiagnosticEngine:
         >>> sarif_log = engine.sarif_log()
     """
 
-    contexts: List[infra.DiagnosticContext]
+    contexts: List[DiagnosticContext]
 
     def __init__(self) -> None:
         self.contexts = []
@@ -66,6 +396,15 @@ def __repr__(self) -> str:
     def to_json(self) -> str:
         return formatter.sarif_to_json(self.sarif_log())
 
+    def dump(self, file_path: str, compress: bool = False) -> None:
+        """Dumps the SARIF log to a file."""
+        if compress:
+            with gzip.open(file_path, "wt") as f:
+                f.write(self.to_json())
+        else:
+            with open(file_path, "w") as f:
+                f.write(self.to_json())
+
     def clear(self) -> None:
         """Clears all diagnostic contexts."""
         self.contexts.clear()
@@ -75,8 +414,8 @@ def create_diagnostic_context(
         name: str,
         version: str,
         options: Optional[infra.DiagnosticOptions] = None,
-        diagnostic_type: Type[infra.Diagnostic] = infra.Diagnostic,
-    ) -> infra.DiagnosticContext:
+        diagnostic_type: Type[Diagnostic] = Diagnostic,
+    ) -> DiagnosticContext:
         """Creates a new diagnostic context.
 
         Args:
@@ -89,7 +428,7 @@ def create_diagnostic_context(
         """
         if options is None:
             options = infra.DiagnosticOptions()
-        context = infra.DiagnosticContext(
+        context = DiagnosticContext(
             name, version, options, diagnostic_type=diagnostic_type
         )
         self.contexts.append(context)
diff --git a/torch/onnx/_internal/diagnostics/infra/formatter.py b/torch/onnx/_internal/diagnostics/infra/formatter.py
index 292a2b6a47a5..a92112fcfefb 100644
--- a/torch/onnx/_internal/diagnostics/infra/formatter.py
+++ b/torch/onnx/_internal/diagnostics/infra/formatter.py
@@ -1,10 +1,12 @@
 import dataclasses
 import json
 import re
-from typing import Any, Callable, Dict, List, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 
+from torch.onnx._internal import _beartype
 from torch.onnx._internal.diagnostics.infra import sarif
 
+
 # A list of types in the SARIF module to support pretty printing.
 # This is solely for type annotation for the functions below.
 _SarifClass = Union[
@@ -15,14 +17,25 @@
 ]
 
 
-def _camel_case_to_snake_case(s: str) -> str:
+@_beartype.beartype
+def snake_case_to_camel_case(s: str) -> str:
+    splits = s.split("_")
+    if len(splits) <= 1:
+        return s
+    return "".join([splits[0], *map(str.capitalize, splits[1:])])
+
+
+@_beartype.beartype
+def camel_case_to_snake_case(s: str) -> str:
     return re.sub(r"([A-Z])", r"_\1", s).lower()
 
 
+@_beartype.beartype
 def kebab_case_to_snake_case(s: str) -> str:
     return s.replace("-", "_")
 
 
+@_beartype.beartype
 def _convert_key(
     object: Union[Dict[str, Any], Any], convert: Callable[[str], str]
 ) -> Union[Dict[str, Any], Any]:
@@ -49,29 +62,68 @@ def _convert_key(
             new_v = [_convert_key(elem, convert) for elem in v]
         else:
             new_v = v
+        if new_v is None:
+            # Otherwise unnesseraily bloated sarif log with "null"s.
+            continue
+        if new_v == -1:
+            # WAR: -1 as default value shouldn't be logged into sarif.
+            continue
+
         new_dict[new_k] = new_v
+
     return new_dict
 
 
-def sarif_to_json(attr_cls_obj: _SarifClass) -> str:
+@_beartype.beartype
+def sarif_to_json(attr_cls_obj: _SarifClass, indent: Optional[str] = " ") -> str:
     dict = dataclasses.asdict(attr_cls_obj)
-    dict = _convert_key(dict, _camel_case_to_snake_case)
-    return json.dumps(dict, indent=4)
+    dict = _convert_key(dict, snake_case_to_camel_case)
+    return json.dumps(dict, indent=indent, separators=(",", ":"))
 
 
-def pretty_print_title(title: str, width: int = 80, fill_char: str = "=") -> None:
+@_beartype.beartype
+def pretty_print_title(
+    title: str, width: int = 80, fill_char: str = "=", print_output: bool = True
+) -> str:
     """Pretty prints title in below format:
 
     ==================== title ====================
     """
-    print(f" {title} ".center(width, fill_char))
+    msg = f" {title} ".center(width, fill_char)
+    if print_output:
+        print(msg)
+    return msg
 
 
-def pretty_print_item_title(title: str, fill_char: str = "=") -> None:
+@_beartype.beartype
+def pretty_print_item_title(
+    title: str, fill_char: str = "=", print_output: bool = True
+) -> str:
     """Pretty prints title in below format:
 
     title
     =====
     """
-    print(title)
-    print(fill_char * len(title))
+    msg_list = []
+    msg_list.append(title)
+    msg_list.append(fill_char * len(title))
+
+    msg = "\n".join(msg_list)
+    if print_output:
+        print(msg)
+    return msg
+
+
+@_beartype.beartype
+def format_argument(obj: Any) -> str:
+    return f"{str(obj)}: {type(obj)}"
+
+
+@_beartype.beartype
+def display_name(fn: Callable) -> str:
+    if hasattr(fn, "__qualname__"):
+        return fn.__qualname__
+    elif hasattr(fn, "__name__"):
+        return fn.__name__
+    else:
+        return str(fn)
diff --git a/torch/onnx/_internal/diagnostics/infra/sarif/_artifact.py b/torch/onnx/_internal/diagnostics/infra/sarif/_artifact.py
index 20aa233a995f..2f6616777248 100644
--- a/torch/onnx/_internal/diagnostics/infra/sarif/_artifact.py
+++ b/torch/onnx/_internal/diagnostics/infra/sarif/_artifact.py
@@ -4,9 +4,7 @@
 from __future__ import annotations
 
 import dataclasses
-from typing import Any, List, Optional
-
-from typing_extensions import Literal
+from typing import Any, List, Literal, Optional
 
 from torch.onnx._internal.diagnostics.infra.sarif import (
     _artifact_content,
diff --git a/torch/onnx/_internal/diagnostics/infra/sarif/_external_properties.py b/torch/onnx/_internal/diagnostics/infra/sarif/_external_properties.py
index 718b9e811668..ae5a530a090f 100644
--- a/torch/onnx/_internal/diagnostics/infra/sarif/_external_properties.py
+++ b/torch/onnx/_internal/diagnostics/infra/sarif/_external_properties.py
@@ -4,9 +4,7 @@
 from __future__ import annotations
 
 import dataclasses
-from typing import List, Optional
-
-from typing_extensions import Literal
+from typing import List, Literal, Optional
 
 from torch.onnx._internal.diagnostics.infra.sarif import (
     _address,
diff --git a/torch/onnx/_internal/diagnostics/infra/sarif/_notification.py b/torch/onnx/_internal/diagnostics/infra/sarif/_notification.py
index daf925418fd2..9ffb40b4d19b 100644
--- a/torch/onnx/_internal/diagnostics/infra/sarif/_notification.py
+++ b/torch/onnx/_internal/diagnostics/infra/sarif/_notification.py
@@ -4,9 +4,7 @@
 from __future__ import annotations
 
 import dataclasses
-from typing import List, Optional
-
-from typing_extensions import Literal
+from typing import List, Literal, Optional
 
 from torch.onnx._internal.diagnostics.infra.sarif import (
     _exception,
diff --git a/torch/onnx/_internal/diagnostics/infra/sarif/_reporting_configuration.py b/torch/onnx/_internal/diagnostics/infra/sarif/_reporting_configuration.py
index c9967d777d75..fbc74a9fb35b 100644
--- a/torch/onnx/_internal/diagnostics/infra/sarif/_reporting_configuration.py
+++ b/torch/onnx/_internal/diagnostics/infra/sarif/_reporting_configuration.py
@@ -4,9 +4,7 @@
 from __future__ import annotations
 
 import dataclasses
-from typing import Optional
-
-from typing_extensions import Literal
+from typing import Literal, Optional
 
 from torch.onnx._internal.diagnostics.infra.sarif import _property_bag
 
diff --git a/torch/onnx/_internal/diagnostics/infra/sarif/_result.py b/torch/onnx/_internal/diagnostics/infra/sarif/_result.py
index 7eed416e1eb8..829cd3cdf5dc 100644
--- a/torch/onnx/_internal/diagnostics/infra/sarif/_result.py
+++ b/torch/onnx/_internal/diagnostics/infra/sarif/_result.py
@@ -4,9 +4,7 @@
 from __future__ import annotations
 
 import dataclasses
-from typing import Any, List, Optional
-
-from typing_extensions import Literal
+from typing import Any, List, Literal, Optional
 
 from torch.onnx._internal.diagnostics.infra.sarif import (
     _artifact_location,
diff --git a/torch/onnx/_internal/diagnostics/infra/sarif/_run.py b/torch/onnx/_internal/diagnostics/infra/sarif/_run.py
index c85d764a980a..e2aca9ba5e32 100644
--- a/torch/onnx/_internal/diagnostics/infra/sarif/_run.py
+++ b/torch/onnx/_internal/diagnostics/infra/sarif/_run.py
@@ -4,9 +4,7 @@
 from __future__ import annotations
 
 import dataclasses
-from typing import Any, List, Optional
-
-from typing_extensions import Literal
+from typing import Any, List, Literal, Optional
 
 from torch.onnx._internal.diagnostics.infra.sarif import (
     _address,
diff --git a/torch/onnx/_internal/diagnostics/infra/sarif/_sarif_log.py b/torch/onnx/_internal/diagnostics/infra/sarif/_sarif_log.py
index f614bb55a412..c738222981e5 100644
--- a/torch/onnx/_internal/diagnostics/infra/sarif/_sarif_log.py
+++ b/torch/onnx/_internal/diagnostics/infra/sarif/_sarif_log.py
@@ -4,9 +4,7 @@
 from __future__ import annotations
 
 import dataclasses
-from typing import List, Optional
-
-from typing_extensions import Literal
+from typing import List, Literal, Optional
 
 from torch.onnx._internal.diagnostics.infra.sarif import (
     _external_properties,
diff --git a/torch/onnx/_internal/diagnostics/infra/sarif/_suppression.py b/torch/onnx/_internal/diagnostics/infra/sarif/_suppression.py
index aeaa3bd035d2..c1dcb014809d 100644
--- a/torch/onnx/_internal/diagnostics/infra/sarif/_suppression.py
+++ b/torch/onnx/_internal/diagnostics/infra/sarif/_suppression.py
@@ -4,9 +4,7 @@
 from __future__ import annotations
 
 import dataclasses
-from typing import Optional
-
-from typing_extensions import Literal
+from typing import Literal, Optional
 
 from torch.onnx._internal.diagnostics.infra.sarif import _location, _property_bag
 
diff --git a/torch/onnx/_internal/diagnostics/infra/sarif/_thread_flow_location.py b/torch/onnx/_internal/diagnostics/infra/sarif/_thread_flow_location.py
index 53cc984ecd0b..43c67cf62ccf 100644
--- a/torch/onnx/_internal/diagnostics/infra/sarif/_thread_flow_location.py
+++ b/torch/onnx/_internal/diagnostics/infra/sarif/_thread_flow_location.py
@@ -4,9 +4,7 @@
 from __future__ import annotations
 
 import dataclasses
-from typing import Any, List, Optional
-
-from typing_extensions import Literal
+from typing import Any, List, Literal, Optional
 
 from torch.onnx._internal.diagnostics.infra.sarif import (
     _location,
diff --git a/torch/onnx/_internal/diagnostics/infra/sarif/_tool_component.py b/torch/onnx/_internal/diagnostics/infra/sarif/_tool_component.py
index 4f47fbb417f8..2421393b8ac3 100644
--- a/torch/onnx/_internal/diagnostics/infra/sarif/_tool_component.py
+++ b/torch/onnx/_internal/diagnostics/infra/sarif/_tool_component.py
@@ -4,9 +4,7 @@
 from __future__ import annotations
 
 import dataclasses
-from typing import Any, List, Optional
-
-from typing_extensions import Literal
+from typing import Any, List, Literal, Optional
 
 from torch.onnx._internal.diagnostics.infra.sarif import (
     _artifact_location,
diff --git a/torch/onnx/_internal/diagnostics/infra/sarif/version.py b/torch/onnx/_internal/diagnostics/infra/sarif/version.py
index 46c122b98084..2beddcb3f042 100644
--- a/torch/onnx/_internal/diagnostics/infra/sarif/version.py
+++ b/torch/onnx/_internal/diagnostics/infra/sarif/version.py
@@ -1,4 +1,4 @@
-from typing_extensions import Final
+from typing import Final
 
 SARIF_VERSION: Final = "2.1.0"
 SARIF_SCHEMA_LINK: Final = "https://docs.oasis-open.org/sarif/sarif/v2.1.0/cs01/schemas/sarif-schema-2.1.0.json"
diff --git a/torch/onnx/_internal/diagnostics/infra/utils.py b/torch/onnx/_internal/diagnostics/infra/utils.py
index 6a85df910463..48c44c8f9344 100644
--- a/torch/onnx/_internal/diagnostics/infra/utils.py
+++ b/torch/onnx/_internal/diagnostics/infra/utils.py
@@ -1,8 +1,11 @@
 import inspect
+from typing import Any, Callable, Dict, Mapping, Tuple
 
-from torch.onnx._internal.diagnostics.infra import _infra
+from torch.onnx._internal import _beartype
+from torch.onnx._internal.diagnostics.infra import _infra, formatter
 
 
+@_beartype.beartype
 def python_frame(frame: inspect.FrameInfo) -> _infra.StackFrame:
     """Returns a StackFrame for the given inspect.FrameInfo."""
     snippet = (
@@ -16,20 +19,48 @@ def python_frame(frame: inspect.FrameInfo) -> _infra.StackFrame:
             uri=frame.filename,
             line=frame.lineno,
             snippet=snippet,
+            function=frame.function,
+            message=snippet,
         )
     )
 
 
-def python_call_stack(frames_to_skip: int = 0, frames_to_log: int = 32) -> _infra.Stack:
+@_beartype.beartype
+def python_call_stack(frames_to_skip: int = 0, frames_to_log: int = 16) -> _infra.Stack:
     """Returns the current Python call stack."""
     if frames_to_skip < 0:
         raise ValueError("frames_to_skip must be non-negative")
     if frames_to_log < 0:
         raise ValueError("frames_to_log must be non-negative")
-    frames_to_skip += 1  # Skip this function.
+    frames_to_skip += 2  # Skip this function and beartype.
     stack = _infra.Stack()
     stack.frames = [
         python_frame(frame)
+        # TODO(bowbao): Rewrite with 'traceback' to speedup performance.
+        # Reference code: `torch/fx/proxy.py`.
+        # `inspect.stack(0)` will speedup the call greatly, but loses line snippet.
         for frame in inspect.stack()[frames_to_skip : frames_to_skip + frames_to_log]
     ]
+    stack.message = "Python call stack"
     return stack
+
+
+@_beartype.beartype
+def function_location(fn: Callable) -> _infra.Location:
+    """Returns a Location for the given function."""
+    source_lines, lineno = inspect.getsourcelines(fn)
+    snippet = source_lines[0].strip() if len(source_lines) > 0 else "<unknown>"
+    return _infra.Location(
+        uri=inspect.getsourcefile(fn),
+        line=lineno,
+        snippet=snippet,
+        message=formatter.display_name(fn),
+    )
+
+
+@_beartype.beartype
+def function_state(
+    fn: Callable, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+) -> Mapping[str, Any]:
+    bind = inspect.signature(fn).bind(*args, **kwargs)
+    return bind.arguments
diff --git a/torch/onnx/_internal/diagnostics/rules.yaml b/torch/onnx/_internal/diagnostics/rules.yaml
index 9d527bccf1e2..2d4df0de04e2 100644
--- a/torch/onnx/_internal/diagnostics/rules.yaml
+++ b/torch/onnx/_internal/diagnostics/rules.yaml
@@ -82,3 +82,181 @@
   properties:
     deprecated: false
     tags: []
+
+
+
+- id: FXE0001
+  name: fx-tracer-success
+  short_description:
+    text: FX Tracer succeeded.
+  full_description:
+    text: "FX Tracer succeeded.
+      The callable is successfully traced as a 'torch.fx.GraphModule' by one of the fx tracers."
+    markdown: |
+      FX Tracer succeeded.
+      The callable is successfully traced as a 'torch.fx.GraphModule' by one of the fx tracers.
+  message_strings:
+    default:
+      text: "The callable '{fn_name}' is successfully traced as a 'torch.fx.GraphModule' by '{tracer_name}'."
+  help_uri:
+  properties:
+    deprecated: false
+    tags: []
+
+- id: FXE0002
+  name: fx-tracer-failure
+  short_description:
+    text: FX Tracer failed.
+  full_description:
+    text: "FX Tracer failed.
+      The callable is not successfully traced as a 'torch.fx.GraphModule'."
+    markdown: |
+      FX Tracer failed.
+      The callable is not successfully traced as a 'torch.fx.GraphModule'.
+  message_strings:
+    default:
+      text: "The callable '{fn_name}' is not successfully traced as a 'torch.fx.GraphModule' by '{tracer_name}'.
+
+      {explanation}"
+  help_uri:
+  properties:
+    deprecated: false
+    tags: []
+
+
+- id: FXE0003
+  name: fx-frontend-aotautograd
+  short_description:
+    text: FX Tracer succeeded.
+  full_description:
+    text: "FX Tracer succeeded.
+      The callable is successfully traced as a 'torch.fx.GraphModule' by one of the fx tracers."
+    markdown: |
+      FX Tracer succeeded.
+      The callable is successfully traced as a 'torch.fx.GraphModule' by one of the fx tracers.
+  message_strings:
+    default:
+      text: "The callable '{fn_name}' is successfully traced as a 'torch.fx.GraphModule' by '{tracer_name}'."
+  help_uri:
+  properties:
+    deprecated: false
+    tags: []
+
+
+- id: FXE0004
+  name: fx-pass-convert-neg-to-sigmoid
+  short_description:
+    text: FX pass converting torch.neg to torch.sigmoid.
+  full_description:
+    text: "A 'fx.Interpreter' based pass to convert all 'torch.neg' calls to 'torch.sigmoid' for
+      a given 'torch.fx.GraphModule' object."
+    markdown: |
+      A 'fx.Interpreter' based pass to convert all 'torch.neg' calls to 'torch.sigmoid' for
+      a given 'torch.fx.GraphModule' object.
+  message_strings:
+    default:
+      text: "Running 'convert-neg-to-sigmoid' pass on 'torch.fx.GraphModule'."
+  help_uri:
+  properties:
+    deprecated: false
+    tags: []
+
+
+- id: FXE0005
+  name: fx-ir-add-node
+  short_description:
+    text: ToDo, experimenting diagnostics, placeholder text.
+  full_description:
+    text: "ToDo, experimenting diagnostics, placeholder text."
+    markdown: |
+      ToDo, experimenting diagnostics, placeholder text.
+  message_strings:
+    default:
+      text: "ToDo, experimenting diagnostics, placeholder text."
+  help_uri:
+  properties:
+    deprecated: false
+    tags: []
+
+
+- id: FXE0006
+  name: atenlib-symbolic-function
+  short_description:
+    text: Op level tracking. ToDo, experimenting diagnostics, placeholder text.
+  full_description:
+    text: "ToDo, experimenting diagnostics, placeholder text."
+    markdown: |
+      ToDo, experimenting diagnostics, placeholder text.
+  message_strings:
+    default:
+      text: "ToDo, experimenting diagnostics, placeholder text."
+  help_uri:
+  properties:
+    deprecated: false
+    tags: []
+
+
+- id: FXE0007
+  name: atenlib-fx-to-onnx
+  short_description:
+    text: Graph level tracking. Each op is a step. ToDo, experimenting diagnostics, placeholder text.
+  full_description:
+    text: "ToDo, experimenting diagnostics, placeholder text."
+    markdown: |
+      ToDo, experimenting diagnostics, placeholder text.
+  message_strings:
+    default:
+      text: "ToDo, experimenting diagnostics, placeholder text."
+  help_uri:
+  properties:
+    deprecated: false
+    tags: []
+
+- id: FXE0008
+  name: fx-node-to-onnx
+  short_description:
+    text: Node level tracking. ToDo, experimenting diagnostics, placeholder text.
+  full_description:
+    text: "ToDo, experimenting diagnostics, placeholder text."
+    markdown: |
+      ToDo, experimenting diagnostics, placeholder text.
+  message_strings:
+    default:
+      text: "ToDo, experimenting diagnostics, placeholder text."
+  help_uri:
+  properties:
+    deprecated: false
+    tags: []
+
+- id: FXE0009
+  name: fx-frontend-dynamo-make-fx
+  short_description:
+    text: The make_fx + decomposition pass on fx graph produced from Dynamo, before ONNX export.
+  full_description:
+    text: "ToDo, experimenting diagnostics, placeholder text."
+    markdown: |
+      ToDo, experimenting diagnostics, placeholder text.
+  message_strings:
+    default:
+      text: "ToDo, experimenting diagnostics, placeholder text."
+  help_uri:
+  properties:
+    deprecated: false
+    tags: []
+
+
+- id: DIAGSYS0001
+  name: arg-format-too-verbose
+  short_description:
+    text: The formatted str for argument to display is too verbose.
+  full_description:
+    text: "ToDo, experimenting diagnostics, placeholder text."
+    markdown: |
+      ToDo, experimenting diagnostics, placeholder text.
+  message_strings:
+    default:
+      text: "Too verbose ({length} > {length_limit}). Argument type {argument_type} for formatter {formatter_type}."
+  help_uri:
+  properties:
+    deprecated: false
+    tags: []
diff --git a/torch/onnx/_internal/fx/__init__.py b/torch/onnx/_internal/fx/__init__.py
new file mode 100644
index 000000000000..57fbf56c5284
--- /dev/null
+++ b/torch/onnx/_internal/fx/__init__.py
@@ -0,0 +1,16 @@
+from .context import FxToOnnxContext
+from .exporter import (
+    export,
+    export_after_normalizing_args_and_kwargs,
+    export_without_parameters_and_buffers,
+    save_model_with_external_data,
+)
+
+
+__all__ = [
+    "export",
+    "export_after_normalizing_args_and_kwargs",
+    "export_without_parameters_and_buffers",
+    "save_model_with_external_data",
+    "FxToOnnxContext",
+]
diff --git a/torch/onnx/_internal/fx/context.py b/torch/onnx/_internal/fx/context.py
new file mode 100644
index 000000000000..97fb5c0297f3
--- /dev/null
+++ b/torch/onnx/_internal/fx/context.py
@@ -0,0 +1,99 @@
+import copy
+from typing import List
+
+import torch
+
+
+class FxToOnnxContext:
+    """Context manager to make PyTorch friendly to FX-to-ONNX exporter.
+    This class means to collect all "patches" required by FX-to-ONNX
+    exporter. If PyTorch needs to be patched, please use this class to
+    manage the patch.
+
+    This context overrides several torch functions to support symbolic
+    export of large scale models.
+
+    torch.load:
+        This function is patched to record the files PyTorch stores model
+        parameters and buffers. Downstream FX-to-ONNX exporter can create
+        initializers from these files.
+    torch._util._rebuild_tensor:
+        This function is patched to avoid creating real tensors during
+        model loading. FakeTensor's are created instead. Real tensors
+        cannot be fitted into single machine's memory for the targeted
+        model scale.
+    torch.fx._symbolic_trace._wrapped_methods_to_patch:
+        This list is extended with (torch.Tensor, "__getitem__") so that
+        weight[x, :, y] becomes exportable with torch.fx.symbolic_trace.
+
+    Search for FxToOnnxContext in test_fx_to_onnx_with_onnxruntime.py for
+    example usage.
+    """
+
+    def __init__(self):
+        # List of file paths processed by torch.load.
+        self.paths: List[str] = []
+
+        def torch_load_wrapper(f, *args, **kwargs):
+            # Record path.
+            self.paths.append(f)
+            # Then, call the original torch.load.
+            return self.torch_load(f, *args, **kwargs)
+
+        def torch__util__rebuild_tensor_wrapper(storage, storage_offset, size, stride):
+            from torch._subclasses.fake_tensor import FakeTensorMode
+            from torch.utils._mode_utils import no_dispatch
+            from torch.utils._python_dispatch import _get_current_dispatch_mode
+
+            def _rebuild_real_tensor(storage, storage_offset, size, stride):
+                t = torch.tensor(
+                    [], dtype=storage.dtype, device=storage._untyped_storage.device
+                )
+                return t.set_(storage._untyped_storage, storage_offset, size, stride)
+
+            mode = _get_current_dispatch_mode()
+            if isinstance(mode, FakeTensorMode):
+                # Create a real tensor and then convert it to FakeTensor.
+                # We cannot directly create a FakeTensor because it tensor.set_(...)
+                # is not supported in FakeTensorMode dispatcher.
+
+                with no_dispatch():
+                    t = _rebuild_real_tensor(storage, storage_offset, size, stride)
+                return mode.from_tensor(t)
+
+            return _rebuild_real_tensor(storage, storage_offset, size, stride)
+
+        # Original version of torch.load.
+        self.torch_load = torch.load
+        self.torch__util_rebuild_tensor = torch._utils._rebuild_tensor
+
+        # Wrapper or modified version of torch functions.
+        self.torch_load_wrapper = torch_load_wrapper
+        self.torch__util_rebuild_tensor_wrapper = torch__util__rebuild_tensor_wrapper
+
+    def __enter__(self):
+        torch.load = self.torch_load_wrapper
+        torch._utils._rebuild_tensor = self.torch__util_rebuild_tensor_wrapper
+
+        self.torch_fx__symbolic_trace__wrapped_methods_to_patch = (
+            torch.fx._symbolic_trace._wrapped_methods_to_patch
+        )
+        desired_wrapped_methods = copy.deepcopy(
+            torch.fx._symbolic_trace._wrapped_methods_to_patch
+        )
+        if (torch.Tensor, "__getitem__") not in desired_wrapped_methods:
+            # Adding `__getitem__` to the patching list will make tensor indexing traceable via
+            # torch.fx.symbolic_trace. Otherwise, `tensor[x, :, y]` cannot be traced.
+            # This happens because `__getitem__` is neither under torch domain nor an aten operator,
+            # so the patching (or similar Proxy-generating mechanism) doesn't happen automatically.
+            # Note that torch.fx.symbolic_trace defines FX_PATCH_GETITEM environment variable for
+            # enabling the line below for patching.
+            desired_wrapped_methods.append((torch.Tensor, "__getitem__"))
+        torch.fx._symbolic_trace._wrapped_methods_to_patch = desired_wrapped_methods
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.load = self.torch_load
+        torch._utils._rebuild_tensor = self.torch__util_rebuild_tensor
+        torch.fx._symbolic_trace._wrapped_methods_to_patch = (
+            self.torch_fx__symbolic_trace__wrapped_methods_to_patch
+        )
diff --git a/torch/onnx/_internal/fx/diagnostics.py b/torch/onnx/_internal/fx/diagnostics.py
new file mode 100644
index 000000000000..400ed0cb72d4
--- /dev/null
+++ b/torch/onnx/_internal/fx/diagnostics.py
@@ -0,0 +1,93 @@
+import functools
+from typing import Any
+
+import onnxscript  # type: ignore[import]
+from onnxscript.function_libs.torch_aten import graph_building  # type: ignore[import]
+
+import torch
+from torch.onnx._internal import diagnostics
+from torch.onnx._internal.diagnostics import infra
+from torch.onnx._internal.diagnostics.infra import decorator, formatter, utils
+
+_LENGTH_LIMIT: int = 80
+
+# NOTE(bowbao): This is a shim over `torch.onnx._internal.diagnostics`, which is
+# used in `torch.onnx`, and loaded with `torch`. Hence anything related to `onnxscript`
+# cannot be put there.
+
+
+@functools.singledispatch
+def _format_argument(obj: Any) -> str:
+    return formatter.format_argument(obj)
+
+
+def format_argument(obj: Any) -> str:
+    formatter = _format_argument.dispatch(type(obj))
+    result_str = formatter(obj)
+
+    if len(result_str) > _LENGTH_LIMIT:
+        # TODO(bowbao): group diagnostics.
+        #   Related fields of sarif.Result: occurance_count, fingerprints.
+        #   Do a final process to group results before outputing sarif log.
+        diag = infra.Diagnostic(
+            *diagnostics.rules.arg_format_too_verbose.format(
+                level=infra.levels.WARNING,
+                length=len(result_str),
+                length_limit=_LENGTH_LIMIT,
+                argument_type=type(obj),
+                formatter_type=type(format_argument),
+            )
+        )
+        diag.with_location(utils.function_location(formatter))
+        diagnostics.export_context().add_diagnostic(diag)
+
+    return result_str
+
+
+@_format_argument.register
+def _torch_nn_module(obj: torch.nn.Module) -> str:
+    return f"{obj.__class__.__name__}"
+
+
+@_format_argument.register
+def _torch_fx_graph_module(obj: torch.fx.GraphModule) -> str:
+    return f"{obj.print_readable(print_output=False)}"
+
+
+@_format_argument.register
+def _torch_tensor(obj: torch.Tensor) -> str:
+    return f"Tensor(shape={obj.shape}, dtype={obj.dtype})"
+
+
+@_format_argument.register
+def _torch_nn_parameter(obj: torch.nn.Parameter) -> str:
+    return f"Parameter({format_argument(obj.data)})"
+
+
+@_format_argument.register
+def _onnxscript_torch_script_tensor(obj: graph_building.TorchScriptTensor) -> str:
+    # TODO(bowbao) obj.dtype throws error.
+    return f"`TorchScriptTensor({obj.name}, {obj.onnx_dtype}, {obj.shape}, {obj.symbolic_value()})`"
+
+
+@_format_argument.register
+def _onnxscript_onnx_function(obj: onnxscript.values.OnnxFunction) -> str:
+    return f"`OnnxFunction({obj.name})`"
+
+
+diagnose_call = functools.partial(
+    decorator.diagnose_call,
+    diagnostics.export_context,
+    diagnostic_type=diagnostics.ExportDiagnostic,
+    format_argument=format_argument,
+)
+
+diagnose_step = functools.partial(
+    decorator.diagnose_step,
+    diagnostics.export_context,
+    format_argument=format_argument,
+)
+
+rules = diagnostics.rules
+export_context = diagnostics.export_context
+levels = diagnostics.levels
diff --git a/torch/onnx/_internal/fx/exporter.py b/torch/onnx/_internal/fx/exporter.py
new file mode 100644
index 000000000000..1d18cb8ab07b
--- /dev/null
+++ b/torch/onnx/_internal/fx/exporter.py
@@ -0,0 +1,1162 @@
+from __future__ import annotations
+
+import copy
+import functools
+import inspect
+import itertools
+import operator
+import os
+import re
+import warnings
+from types import FunctionType
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import onnx
+import onnxscript  # type: ignore[import]
+from onnxscript import evaluator  # type: ignore[import]
+from onnxscript.function_libs.torch_aten import graph_building  # type: ignore[import]
+
+import torch
+import torch._C
+import torch._decomp
+import torch._dynamo
+import torch._ops
+import torch.fx
+from torch._subclasses import fake_tensor
+from torch.fx.experimental import proxy_tensor
+from torch.fx.passes import fake_tensor_prop
+from torch.nn.utils import stateless
+from torch.onnx import _constants, _type_utils
+
+from torch.onnx._internal import _beartype, onnx_proto_utils
+from torch.onnx._internal.fx import diagnostics, function_dispatcher, options
+from torch.utils import _pytree
+
+# TODO: Separate into individual components.
+# TODO: make_fx lose stack info https://github.com/pytorch/pytorch/issues/90276
+
+
+def _onnx_function_diagnose_call_message_formatter(
+    fn: Callable, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+) -> str:
+    if len(args) > 0 and isinstance(args[0], onnxscript.OnnxFunction):
+        onnx_function: onnxscript.OnnxFunction = args[0]  # self
+        return f"{onnx_function.name}: {onnxscript.OnnxFunction}"
+    return f"{fn.__name__}: {fn}"
+
+
+def _onnx_function_diagnose_call_append_symbolic_source_location(
+    diagnostic: diagnostics.infra.Diagnostic,
+    fn: Callable,
+    args: Tuple[Any, ...],
+    kwargs: Dict[str, Any],
+    return_values: Any,
+) -> None:
+    # TODO(bowbao): Record source location of symbolic.
+    # Need this separate step because normally only the source location of
+    # class `onnxscript.OnnxFunction.__call__` is recorded.
+    pass
+
+
+# TODO(bowbao): Delete this once diagnostics is introduced in onnxscript.
+_diagnose_onnx_function = diagnostics.diagnose_call(
+    rule=diagnostics.rules.atenlib_symbolic_function,
+    diagnostic_message_formatter=_onnx_function_diagnose_call_message_formatter,
+    diagnostic_modifier=_onnx_function_diagnose_call_append_symbolic_source_location,
+)
+for key, onnx_function in function_dispatcher._ATENLIB_FUNCTIONS.items():
+    if isinstance(onnx_function, FunctionType):
+        function_dispatcher._ATENLIB_FUNCTIONS[key] = _diagnose_onnx_function(
+            onnx_function
+        )
+onnxscript.OnnxFunction.__call__ = _diagnose_onnx_function(
+    onnxscript.OnnxFunction.__call__
+)
+
+
+class ModuleExpansionTracer(torch.fx._symbolic_trace.Tracer):
+    """Tracer to create ONNX-exporting friendly FX graph.
+
+    This tracer traces models into operators. That is,
+    the traced graph mostly contains call_function nodes and
+    has no call_module nodes. The call_module nodes
+    are problematic to the use of make_fx(...) in ONNX
+    exporter.
+    """
+
+    @_beartype.beartype
+    def is_leaf_module(
+        self, module: torch.nn.Module, module_qualified_name: str
+    ) -> bool:
+        # This returns False so that all sub-modules are considered as not leaves
+        # and therefore expanded into operators in
+        # torch.fx._symbolic_trace.Tracer.call_module.
+        return False
+
+    @_beartype.beartype
+    def to_bool(self, obj: "torch.fx.Proxy") -> bool:
+        # This is a hack to tracing through if-else Python blocks.
+        # It may generate incorrect ONNX graphs if the if-else block
+        return False
+
+
+# Functions directly wrapped to produce torch.fx.Proxy so that symbolic
+# data can flow through those functions. Python functions (e.g., `torch.arange`)
+# not defined by pybind11 in C++ do not go though Python dispatcher, so
+# they are not automatically patched by FX's Python dispatcher.
+# The list below means `torch.arange`, `torch.tensor`, and so on will be
+# patched.
+_TORCH_METHODS_TO_PATCH: Tuple[str, ...] = (
+    "arange",
+    "tensor",
+    "finfo",
+    "full",
+    "empty",
+)
+
+
+def _wrap_for_symbolic_trace(target: Callable) -> Tuple[Callable, Callable]:
+    """This function wraps ```target`` for symbolic tracing.
+
+    This function wraps ```target``` so that its wrapper produces
+    torch.fx.Proxy in symbolic computation. The returned values are
+    the wrapper and then the original function. Per `_TORCH_METHODS_TO_PATCH`,
+    this function shall receive `torch.arange`, `torch.tensor`, etc. as inputs.
+    """
+
+    @functools.wraps(target)
+    def wrapper(*args, **kwargs):
+        proxy = None
+
+        def check_has_proxy(v):
+            if isinstance(v, torch.fx.Proxy):
+                nonlocal proxy
+                proxy = v
+
+        torch.fx.node.map_aggregate(args, check_has_proxy)
+        torch.fx.node.map_aggregate(kwargs, check_has_proxy)
+
+        if proxy is not None:
+            return proxy.tracer.create_proxy("call_function", target, args, kwargs)
+        else:
+            return target(*args, **kwargs)
+
+    return wrapper, target
+
+
+@_beartype.beartype
+def _module_expansion_symbolic_trace(
+    root: Union[torch.nn.Module, Callable[..., Any]],
+    concrete_args: Optional[Dict[str, Any]] = None,
+) -> "torch.fx.GraphModule":
+    """Trace a callable into FX graph.
+
+    When "root" is torch.nn.Module, calls to its submodule (type: torch.nn.Module) will be
+    expanded into operators (e.g., torch.matmul, torch.add, +, and -) to simplify graph
+    structure.
+    """
+    # For functions doesn't support symbolic tracing, create wrappers
+    # which produce symbolic results during tracing.
+    patched_torch_methods = {
+        target_name: _wrap_for_symbolic_trace(getattr(torch, target_name))
+        for target_name in _TORCH_METHODS_TO_PATCH
+    }
+
+    # Set the symbolic-tracing friendly functions so that `tracer.trace` below
+    # can work.
+    for name, (wrapper, _) in patched_torch_methods.items():
+        setattr(torch, name, wrapper)
+
+    try:
+        # Set up a tracer.
+        tracer = ModuleExpansionTracer()
+        # Trace the model.
+        graph = tracer.trace(root, concrete_args)
+        name = (
+            root.__class__.__name__
+            if isinstance(root, torch.nn.Module)
+            else root.__name__
+        )
+        return torch.fx.GraphModule(tracer.root, graph, name)
+    finally:
+        # Revert the patches for symbolic tracing.
+        for name, (_, wrapped) in patched_torch_methods.items():
+            # wrapped is the original version of `torch.name`.
+            setattr(torch, name, wrapped)
+
+
+def _retrieve_or_adapt_input_to_graph_set(fx_node_arg, fx_name_to_onnxscipt_value):
+    """Map FX value to TorchScript value.
+
+    When creating TorchScript graph from FX graph, we need a mapping from FX variable
+    to TorchScript variable. This function maps FX variable, fx_node_arg, to torch.jit.Value.
+    """
+
+    onnx_tensor = fx_node_arg
+    if isinstance(onnx_tensor, torch.fx.Node):
+        # 1. fx_node_arg is a torch.fx.Node, which means
+        #    fx_node_arg stands for the output of that torch.fx.Node.
+        # 2. fx_node_arg (variable in torch.fx.Graph) is be mapped to
+        #    torch.jit.Value, fx_name_to_onnxscipt_value[fx_node_arg.name],
+        #    in TorchScript graph.
+        onnx_tensor = fx_name_to_onnxscipt_value[onnx_tensor.name]
+    elif isinstance(onnx_tensor, torch.dtype):
+        onnx_tensor = int(_type_utils.JitScalarType.from_dtype(onnx_tensor).onnx_type())
+
+    return onnx_tensor
+
+
+def _filter_incompatible_and_dtype_convert_kwargs(kwargs):
+    """Filter out kwargs that are not supported by onnxscript."""
+    filtered = {}
+    for key, value in kwargs.items():
+        if key in {
+            "layout",
+            "device",
+            "requires_grad",
+            "pin_memory",
+            "memory_format",
+            "implicit",
+        }:
+            continue
+        if key == "dtype":
+            if value is None:
+                filtered["dtype"] = -1
+            else:
+                filtered["dtype"] = int(
+                    _type_utils.JitScalarType.from_dtype(value).onnx_type()
+                )
+            continue
+        filtered[key] = value
+    return filtered
+
+
+def _wrap_fx_args_as_onnxscript_args(
+    node: torch.fx.Node,
+    fx_name_to_onnxscipt_value: Dict[
+        str, Union[torch._C.Value, Tuple[torch._C.Value, ...]]
+    ],
+) -> Tuple[tuple, dict, tuple, dict]:
+    """Map all FX arguments of a node to arguments in TorchScript graph."""
+
+    # This function assumes the order of arguments in FX op is the
+    # same as the order of arguments in TorchScript op.
+    # (1) Complete the arguments with default values.
+    complete_args: List[Any] = []
+    complete_kwargs: Dict[str, Any] = {}
+    if inspect.isbuiltin(node.target):
+        complete_args = list(node.args)
+    else:
+        for i, expected_arg in enumerate(node.target._schema.arguments):  # type: ignore[union-attr]
+            if i < len(node.args):
+                complete_args.append(node.args[i])
+            else:
+                if expected_arg.name in node.kwargs:
+                    complete_kwargs[expected_arg.name] = node.kwargs[expected_arg.name]
+                else:
+                    # Get default from schema.
+                    complete_kwargs[expected_arg.name] = expected_arg.default_value
+
+    onnxscript_args = tuple(
+        _retrieve_or_adapt_input_to_graph_set(arg, fx_name_to_onnxscipt_value)
+        for arg in complete_args
+    )
+    onnxscript_kwargs = _filter_incompatible_and_dtype_convert_kwargs(complete_kwargs)
+
+    # prepare torch format args and kwargs for op-level validation
+    # Use fake tensor to create real tensor to feed in ops
+    torch_args = []
+    for arg in complete_args:
+        if isinstance(arg, torch.fx.Node):
+            # Create a concreate test tensor based on the fake tensor
+            with torch.utils._mode_utils.no_dispatch():
+                # TODO(titaiwang): The assumption of torch.float might not be true, eg: aten_where needs BOOL in input_args
+                # fx_name_to_onnxscipt_value could help?
+                if isinstance(arg.meta["val"], list):
+                    for meta_value in arg.meta["val"]:
+                        torch_args.append(
+                            torch.randn_like(meta_value, dtype=torch.float)
+                        )
+                else:
+                    torch_args.append(
+                        torch.randn_like(arg.meta["val"], dtype=torch.float)
+                    )
+        else:
+            torch_args.append(arg)
+    torch_kwargs = complete_kwargs
+    return (onnxscript_args, onnxscript_kwargs, tuple(torch_args), torch_kwargs)
+
+
+def _fill_tensor_meta(
+    onnxscript_values,
+    name: str,
+    expected_values: Union[torch.Tensor, Tuple[torch.Tensor, ...]],
+):
+    """Fill the meta information of onnxscript_values with that from the fx FakeTensor."""
+    flat_onnxscript_values, _ = _pytree.tree_flatten(onnxscript_values)
+    flat_expected_values, _ = _pytree.tree_flatten(expected_values)
+    for i, (onnxscript_value, expected_value) in enumerate(
+        zip(flat_onnxscript_values, flat_expected_values)
+    ):
+        # Only set shape for now as we don't need type information.
+        onnxscript_value.shape = tuple(expected_value.size())
+        if i > 0:
+            onnxscript_value.name = f"{name}_{i}"
+        else:
+            onnxscript_value.name = name
+
+
+def _location_from_fx_stack_trace(
+    node_stack_trace: str,
+) -> Optional[diagnostics.infra.Location]:
+    """Extract location from FX node stack trace.
+
+    Args:
+        node_stack_trace: The stack trace of the FX node. Example:
+
+            File "path/file.py", line 311, in <function>
+                <code>
+            |   File "path/file2.py", line 389, in <function>
+                <code>
+
+    Returns:
+        location: The location of the FX node.
+    """
+    if "File" not in node_stack_trace:
+        return None
+
+    lines = node_stack_trace.strip().split("\n")
+    idx = 0
+    while idx < len(lines) and "File" not in lines[idx]:
+        idx += 1
+    if idx + 1 >= len(lines):
+        return None
+
+    pattern = re.compile(r"^File \"(.+)\", line (\d+), in (.+)$")
+    matches = pattern.match(lines[idx].strip())
+    if matches:
+        uri = matches.group(1)
+        line_number = int(matches.group(2))
+        snippet = lines[idx + 1].strip()
+        return diagnostics.infra.Location(uri=uri, line=line_number, snippet=snippet)
+    return None
+
+
+@_beartype.beartype
+def _fx_node_to_onnx_message_formatter(
+    fn: Callable, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+) -> str:
+    assert len(args) > 0
+    node = args[0]
+    assert isinstance(node, torch.fx.Node)
+    return f"FX Node: {node.op}:{node.target}[name={node.name}]"
+
+
+@_beartype.beartype
+@diagnostics.diagnose_call(
+    rule=diagnostics.rules.fx_node_to_onnx,
+    exception_report_level=diagnostics.levels.ERROR,
+    diagnostic_message_formatter=_fx_node_to_onnx_message_formatter,
+)
+def _export_fx_node_to_onnxscript(
+    node: torch.fx.Node,
+    onnxscript_graph: graph_building.TorchScriptGraph,
+    fx_name_to_onnxscipt_value: Dict[
+        str, Union[torch._C.Value, Tuple[torch._C.Value, ...]]
+    ],
+    onnxscript_value_name_to_real_tensor: Dict[
+        str, Union[torch.Tensor, Tuple[torch._C.Value, ...]]
+    ],
+    tracer: graph_building.TorchScriptTracingEvaluator,
+    fx_module_with_metadata: torch.fx.GraphModule,
+    options: options.ExportOptions,
+):
+    # Record stack trace of node in diagnostic.
+    node_stack_trace = node.stack_trace
+    if node_stack_trace:
+        diagnostic = diagnostics.export_context().inflight_diagnostic(
+            rule=diagnostics.rules.fx_node_to_onnx
+        )
+        diagnostic.with_additional_message(
+            f"### PyTorch source information\n```\n{node_stack_trace}\n```"
+        )
+        location = _location_from_fx_stack_trace(node_stack_trace)
+        if location is not None:
+            diagnostic.with_location(location)
+
+    if node.op == "placeholder":
+        # Input of graph.
+        output = onnxscript_graph.add_input(
+            input_name=node.name,
+            # The node.meta["val"] is generated by FakeTensorProp.
+            input_value=node.meta["val"],
+        )
+        assert (
+            output is not None
+        ), f"Node creates None with target={node.target} and name={node.name}"
+        assert isinstance(output, graph_building.TorchScriptTensor)
+        assert isinstance(output, onnxscript.tensor.Tensor)
+
+        fx_name_to_onnxscipt_value[node.name] = output
+    elif node.op == "call_function":
+        # aten ops and other stateless functions.
+        if node.target == operator.getitem and isinstance(
+            fx_name_to_onnxscipt_value[node.args[0].name], tuple  # type: ignore[union-attr]
+        ):
+            onnx_tensor_tuple = fx_name_to_onnxscipt_value[node.args[0].name]  # type: ignore[union-attr]
+            index = node.args[1]
+            output = onnx_tensor_tuple[index]  # type: ignore[index]
+            assert (
+                output is not None
+            ), f"Node creates None with target={node.target} and name={node.name}"
+            assert isinstance(output, (graph_building.TorchScriptTensor, tuple)), type(
+                output
+            )
+
+            fx_name_to_onnxscipt_value[node.name] = output
+            return
+
+        if node.target == operator.getitem:
+            # __getitem__ on Tensor or Sequence of tensors. Not tuple.
+            exporter_key = "getitem"
+        elif (
+            isinstance(node.target, torch._ops.OpOverload)
+            and node.target in function_dispatcher._OP_OVERLOAD_TO_EXPORTER_KEY_TABLE
+        ):
+            exporter_key = function_dispatcher._OP_OVERLOAD_TO_EXPORTER_KEY_TABLE[
+                node.target
+            ]
+        else:
+            raise RuntimeError(f"Unknown call_function target: {node.target}")
+        # Only the latest opset version is only supported in atenlib for now
+        symbolic_fn = function_dispatcher._ATENLIB_FUNCTIONS.get(exporter_key)
+        if symbolic_fn is None:
+            raise RuntimeError(f"Cannot find function for {exporter_key}")
+        # Map FX inputs to ONNX inputs and fill optional inputs with default values.
+        # torch_args and torch_kwargs are for op-level validation
+        (
+            onnx_args,
+            onnx_kwargs,
+            torch_args,
+            torch_kwargs,
+        ) = _wrap_fx_args_as_onnxscript_args(node, fx_name_to_onnxscipt_value)
+        with evaluator.default_as(tracer):
+            output: Union[  # type: ignore[no-redef]
+                graph_building.TorchScriptTensor,
+                Tuple[graph_building.TorchScriptTensor],
+            ] = symbolic_fn(*onnx_args, **onnx_kwargs)
+        assert (
+            output is not None
+        ), f"Node creates None with target={node.target}, name={node.name}, args={onnx_args}, kwargs={onnx_kwargs}"
+        # TODO(justinchuby): Add diagnostic information.
+        # Assign type and shape obtained from FakeTensorProp.
+        _fill_tensor_meta(output, node.name, node.meta["val"])
+        # One fx node could produce multiple outputs (e.g., tuple of tensors); in
+        # that case, v is a tuple of TorchScriptTensors.
+        assert isinstance(output, (graph_building.TorchScriptTensor, tuple)), type(
+            output
+        )
+        if options.op_level_debug:
+            _validate_op_between_ort_torch(node, symbolic_fn, torch_args, torch_kwargs)
+        fx_name_to_onnxscipt_value[node.name] = output
+    elif node.op == "output":
+
+        if isinstance(node.args[0], torch.fx.Node):
+            onnx_tensor_or_tensor_tuple = fx_name_to_onnxscipt_value[node.args[0].name]
+            onnxscript_graph.register_outputs(onnx_tensor_or_tensor_tuple)
+        else:
+            # ONNX can't represent collection types (e.g., dictionary, tuple of tuple of
+            # tensor, etc), we flatten the collection and register each element as output.
+            flat_args, _ = _pytree.tree_flatten(node.args[0])
+            for arg in flat_args:
+                assert isinstance(
+                    arg, torch.fx.Node
+                ), f"arg must be a torch.fx.Node, not {type(arg)}"
+                onnx_tensor_or_tensor_tuple = fx_name_to_onnxscipt_value[arg.name]
+                onnxscript_graph.register_outputs(onnx_tensor_or_tensor_tuple)
+    elif node.op == "call_method":
+        # TODO(wechi): Support call_method.
+        raise RuntimeError("call_method is not supported yet.")
+    elif node.op == "call_module":
+        # TODO(wechi): Support call_module.
+        raise RuntimeError("call_module is not supported yet.")
+    elif node.op == "get_attr":
+        current_attr = fx_module_with_metadata
+        sub_attr_names = node.target.split(".")  # type: ignore[union-attr]
+        # If node.targe is "conv.weight", the following loop first
+        # assigns fx_module_with_metadata.conv to current_attr, and then
+        # fx_module_with_metadata.conv.weight to current_attr.
+        while sub_attr_names:
+            sub_attr_name = sub_attr_names.pop(0)
+            if not hasattr(current_attr, sub_attr_name):
+                raise AttributeError(
+                    f"Attribute {sub_attr_name} is not found in {current_attr}."
+                )
+            current_attr = getattr(current_attr, sub_attr_name)
+
+        input_ = onnxscript_graph.add_input(
+            input_name=node.name, input_value=current_attr
+        )
+        assert isinstance(input_, graph_building.TorchScriptTensor)
+        assert isinstance(input_, onnxscript.tensor.Tensor)
+        fx_name_to_onnxscipt_value[node.name] = input_
+        onnxscript_value_name_to_real_tensor[input_.name] = current_attr  # type: ignore[assignment]
+    else:
+        # TODO(wechi): Support get_attr, call_module, call_method.
+        raise RuntimeError(f"Found node type not defined in torch.fx: {node.op}")
+
+
+@diagnostics.diagnose_call(diagnostics.rules.atenlib_fx_to_onnx)
+def _export_fx_to_onnxscript(
+    fx_module_with_metadata: torch.fx.GraphModule, options: options.ExportOptions
+):
+
+    # Initialize the ONNX graph
+    onnxscript_graph = graph_building.TorchScriptGraph()
+    tracer = graph_building.TorchScriptTracingEvaluator(onnxscript_graph)
+
+    # In the following loop, a TorchScript graph is created to
+    # represent the input FX graph with ONNX symbols (e.g., onnx::add).
+    # To connect the values to nodes in the TorchScript graph, we maintian
+    # fx_name_to_onnxscipt_value. Basically, we want to translate
+    #   fx_tensor_x (type: torch.fx.Node) -> fx_node_1 -> fx_tensor_y (type: torch.fx.Node)
+    # to
+    #   fx_name_to_onnxscipt_value[fx_tensor_x.name] -> onnx_node_1 -> fx_name_to_onnxscipt_value[fx_tensor_y.name]
+    fx_name_to_onnxscipt_value: Dict[
+        str, Union[torch._C.Value, Tuple[torch._C.Value, ...]]
+    ] = {}
+    # Similar to fx_name_to_onnxscipt_value, we need a mapping fo real tensors (usually tensor parameters
+    # in nn.Module). Note that TorchScript's cannot store real tensors; TorchScript values are all
+    # symbolic. This is passed into ONNX ModelProto as the initializers.
+    onnxscript_value_name_to_real_tensor: Dict[
+        str, Union[torch.Tensor, Tuple[torch._C.Value, ...]]
+    ] = {}
+    for node in fx_module_with_metadata.graph.nodes:
+        _export_fx_node_to_onnxscript(
+            node,
+            onnxscript_graph,
+            fx_name_to_onnxscipt_value,
+            onnxscript_value_name_to_real_tensor,
+            tracer,
+            fx_module_with_metadata,
+            options,
+        )
+
+    # Apply TorchScript's type promotion code.
+    # Ideally, we should implement our type promotion but
+    # to save time, we just reuse.
+    onnxscript_graph.apply(
+        torch._C._jit_pass_onnx_scalar_type_analysis,
+        lowprecision_cast=True,
+        opset_version=options.opset_version,
+    )
+
+    return onnxscript_graph, onnxscript_value_name_to_real_tensor
+
+
+@_beartype.beartype
+def _shape_inference_with_fake_tensor(decomposed_module: "torch.fx.GraphModule", *args):
+    # Use this FakeTensorMode to
+    # 1. convert nn.Parameter's in nn.Module to FakeTensor
+    # 2. run FakeTensorProp
+    # If (1) and (2) are done with difference FakeTensorMode's, undefined behavior may
+    # happen.
+    fake_tensor_mode = fake_tensor.FakeTensorMode()
+
+    def to_fake_tensor(x):
+        if isinstance(x, torch.Tensor) and not isinstance(x, fake_tensor.FakeTensor):
+            return fake_tensor_mode.from_tensor(x)
+        return x
+
+    # "args" are FakeTensor in FakeTensorProp so the parameters and buffers
+    # in model must be converted to FakeTensor as well.
+    fake_parameters_and_buffers = {
+        k: to_fake_tensor(v)
+        for k, v in itertools.chain(
+            decomposed_module.named_parameters(), decomposed_module.named_buffers()
+        )
+    }
+
+    # Shape inference via FakeTensorProp
+    with stateless._reparametrize_module(
+        decomposed_module, fake_parameters_and_buffers
+    ):
+        # Assign output types and shapes to each node.
+        # TODO(wechi): It's possible to get symbolic types (and shapes)
+        # for each node's output. Consider to set "tracing_mode=symbolic"
+        # when calling make_fx and then remove FakeTensorProp below.
+        fake_tensor_prop.FakeTensorProp(decomposed_module, fake_tensor_mode).propagate(
+            *args
+        )
+
+    return decomposed_module
+
+
+@_beartype.beartype
+def _rename_placeholder_targets(
+    module: "torch.fx.GraphModule", reference_module: "torch.fx.GraphModule"
+):
+    """Align the argument names in module with those in reference_module.
+    After calling this function, the two forward(...) in module and reference_module should have
+    the same signature.
+    """
+    placeholders = [node for node in module.graph.nodes if node.op == "placeholder"]
+    reference_placeholders = [
+        node for node in reference_module.graph.nodes if node.op == "placeholder"
+    ]
+
+    for placeholder, reference_placeholder in zip(placeholders, reference_placeholders):
+        placeholder.target = reference_placeholder.target
+        placeholder.name = reference_placeholder.name
+
+    module.recompile()
+
+
+@_beartype.beartype
+def _export(
+    module: torch.fx.GraphModule,
+    args,
+    **kwargs,
+) -> Union["onnx.ModelProto", bytes]:
+
+    export_options = options.ExportOptions()
+    export_options.update(**kwargs)
+    # Apply decomposition table to the input graph.
+    # Make sure the feed-in "module" is stateless.
+    decomposed_module = proxy_tensor.make_fx(
+        module,
+        decomposition_table=export_options.decomposition_table,
+        tracing_mode="fake",
+        _allow_non_fake_inputs=True,
+    )(*args)
+    # Rename placeholder targets to match the original module's signature since
+    # We don't want to map forward(x, y, z) to forward(arg0, arg1, arg2).
+    _rename_placeholder_targets(decomposed_module, module)
+    # Run FakeTensorProp on decomposed_module.
+    # Symbolic output of the i-th node can be accessed via
+    # decomposed_module.graph.nodes[i].meta["val"]
+    decomposed_module = _shape_inference_with_fake_tensor(decomposed_module, *args)
+
+    # We want to pass list of ints and floats to TorchScript graph correctly
+    # in _export_fx_to_ts, so we must disable FakeTensorMode. Otherwise, graph may
+    # receive FakeTensor and results runtime error. In addition, TorchScript-based
+    # ONNX exporter used in _ts_graph_to_onnx_model_in_protobuf is not compatible
+    # with FakeTensorMode.
+    with torch.utils._mode_utils.no_dispatch():
+        onnxscript_graph, initializers = _export_fx_to_onnxscript(
+            decomposed_module, export_options
+        )
+    # Export TorchScript graph to ONNX ModelProto.
+    onnx_model = onnxscript_graph.to_model_proto(
+        initializers, export_options.opset_version
+    )
+
+    if export_options.use_binary_format:
+        # Return ModelProto in binary format.
+        return onnx_model.SerializeToString()
+    # Return ModelProto
+    return onnx_model
+
+
+@_beartype.beartype
+def export(
+    fn: Union[torch.nn.Module, Callable],
+    *args,
+    use_binary_format: bool = True,
+    opset_version: int = _constants.ONNX_DEFAULT_OPSET,
+    op_level_debug: bool = False,
+) -> Union["onnx.ModelProto", bytes]:
+    # args will be converted to symbolic tensor. Let's copy to avoid side effects.
+    args = copy.deepcopy(args)
+    # Translate callable to FX graph.
+    #
+    # TODO(wechi): There are several symbolic tracing mechanisms to convert
+    # nn.Module to FX graph. We should choose the right one after they are
+    # matured.
+    graph_module, graph_guard = torch._dynamo.export(fn, *args, aten_graph=True)
+    del graph_guard  # Unused
+    # Export FX graph to ONNX ModelProto.
+    #
+    # Note that ALL kwargs are folded into constants in graph_module, so we don't pass kwargs
+    # to _export.
+    return _export(
+        graph_module,
+        args,
+        opset_version=opset_version,
+        decomposition_table=function_dispatcher._ONNX_FRIENDLY_DECOMPOSITION_TABLE,
+        use_binary_format=use_binary_format,
+        op_level_debug=op_level_debug,
+    )
+
+
+@_beartype.beartype
+def export_after_normalizing_args_and_kwargs(
+    fn: Union[torch.nn.Module, Callable],
+    *args,
+    use_binary_format: bool = True,
+    opset_version: int = _constants.ONNX_DEFAULT_OPSET,
+    op_level_debug: bool = False,
+    **kwargs,
+) -> Union["onnx.ModelProto", bytes]:
+    """Export an nn.Module or a callable to ONNX.
+
+    This traces the given nn.Module or a callable into FX graph and then
+    and exports it to ONNX by calling `_export`. Notice that ONNX does
+    not represent keyword arguments, so `args` and `kwargs` are normalized by
+    calling `inspect.Signature.bind` and `inspect.BoundArgument.apply_defaults`
+    in the beginning.
+
+    Args:
+        fn: nn.Module or a callable to be exported to ONNX.
+        opset_version: the opset version to export the model to. E.g., 14.
+        args: the positional arguments to pass to `fn`.
+        use_binary_format: whether to return the ONNX model in binary format.
+            If False, `onnx.ModelProto` will be returned. If False, the byte array
+            generated by `onnx.ModelProto.SerializeToString` is returned.
+        kwargs: the keyword arguments to pass to `fn`.
+
+    Returns:
+        ONNX model in binary format or `onnx.ModelProto`. To select return type,
+        use `use_binary_format` argument.
+    """
+
+    if isinstance(fn, torch.nn.Module):
+        signature = inspect.signature(fn.forward)
+    else:
+        signature = inspect.signature(fn)
+
+    # We hope the input kwargs will be mapped to bound.args after binding.
+    # If not, we will raise an error.
+    bound = signature.bind(*args, **kwargs)
+    bound.apply_defaults()
+    # keyword-only arguments are not handled.
+    # bound.kwargs only contains keyword-only arguments after calling
+    # bind & apply_defaults, so we throw if it's not empty.
+    assert not bound.kwargs
+
+    class Wrapper(torch.nn.Module):
+        def __init__(self, fn):
+            super().__init__()
+            self.fn = fn
+
+        def forward(self, *args):
+            result, _ = _pytree.tree_flatten(self.fn(*args))
+            return result
+
+    # args will be converted to symbolic tensor. Let's copy to avoid side effects.
+    bound_args = copy.deepcopy(bound.args)
+    # Translate callable to FX graph.
+    #
+    # TODO(wechi): There are several symbolic tracing mechanisms to convert
+    # nn.Module to FX graph. We should choose the right one after they are
+    # matured.
+
+    class GraphCaptureCompiler:
+        def __init__(self):
+            self.captured_graph: Optional["torch.fx.GraphModule"] = None
+            self.captured_graph_count = 0
+
+        def compile(self, graph_module: "torch.fx.GraphModule", _):
+            assert self.captured_graph_count == 0
+            self.captured_graph = graph_module
+            self.captured_graph_count += 1
+            return graph_module
+
+    compiler = GraphCaptureCompiler()
+    torch._dynamo.reset()
+    torch._dynamo.optimize(compiler.compile, nopython=True)(Wrapper(fn))(*bound_args)
+    torch._dynamo.reset()
+    assert compiler.captured_graph
+    # Export FX graph to ONNX ModelProto.
+    return _export(
+        compiler.captured_graph,
+        # Function optimized by _dynamo doesn't have None in args.
+        tuple(arg for arg in bound_args if arg is not None),
+        opset_version=opset_version,
+        decomposition_table=function_dispatcher._ONNX_FRIENDLY_DECOMPOSITION_TABLE,
+        use_binary_format=use_binary_format,
+        op_level_debug=op_level_debug,
+    )
+
+
+@_beartype.beartype
+def _move_placeholder_to_front(graph_module: "torch.fx.GraphModule") -> None:
+    """
+    This function move all placeholder nodes to the front of the graph node list.
+    In torch.fx.Graph, placeholder is a special assignment node. If it's not
+    executed in the beginning, it could overwrite values computed by upstream
+    nodes.
+    """
+
+    graph = graph_module.graph
+    placeholders = []
+    first_not_placeholder = None
+    for node in graph.nodes:
+        if node.op == "placeholder":
+            placeholders.append(node)
+        if first_not_placeholder is None and node.op != "placeholder":
+            first_not_placeholder = node
+    if first_not_placeholder is None:
+        return
+    for placeholder in placeholders:
+        first_not_placeholder.prepend(placeholder)
+
+
+@_beartype.beartype
+def _replace_get_attr_with_placeholder(
+    graph_module: "torch.fx.GraphModule",
+) -> Tuple[torch.Tensor, ...]:
+    """
+    Replace get_attr with placeholder.
+    The parameters and buffers accessed by the original get_attr are returned;
+    they are useful when creating random inputs for the modified graph_module.
+    """
+    graph = graph_module.graph
+    replaced_attrs: List[torch.Tensor] = []
+    for node in graph.nodes:
+        if node.op == "get_attr":
+            replaced_attr: Optional[torch.Tensor] = None
+            # get_attr could retrieve either parameter or buffer, so
+            # we need to try both.
+            try:
+                replaced_attr = graph_module.get_parameter(node.target)
+            except AttributeError:
+                # It's possible that model author use buffer instead of
+                # parameter to store trainable weights. In this case,
+                # 1. get_parameter will throw something like
+                #    AttributeError: `bias` is not an nn.Parameter.
+                # 2. get_buffer should work.
+                replaced_attr = graph_module.get_buffer(node.target)
+
+            # Reassign op type so that get_attr node becomes placeholder node.
+            node.op = "placeholder"
+            # The target name in placeholder must be a valid Python identifier.
+            # Thus, we replace, e.g., "module.submodule.weight" with
+            # "module_submodule_weight".
+            node.target = node.target.replace(".", "_")
+            # Default value is None. This is needed as long as the "graph_module"
+            # has optional inputs. Assume the original forward signature is
+            #  def forward(self, x, y=None)
+            # and the replaced get_attr node has target "z". Then, the modified
+            # signature should be
+            #  def forward(self, x, y=None, z=None)
+            # Without the following line, the signature will be
+            #  def forward(self, x, y=None, z)
+            # , which is not valid Python code.
+            node.args = (None,)
+
+            replaced_attrs.append(replaced_attr)
+
+    return tuple(replaced_attrs)
+
+
+@_beartype.beartype
+def _trace_into_fx_graph_via_fx_symbolic_trace(
+    module: torch.nn.Module,
+    *args,
+    # kwargs are the keyword arguments to call "module"; that is,
+    # module(*args, **kwargs) must run.
+    **kwargs,
+) -> Tuple["torch.fx.GraphModule", Tuple[Any, ...]]:
+    signature = inspect.signature(module.forward)
+
+    # We hope the input kwargs will be mapped to bound.args after binding.
+    # If not, we will raise an error.
+    bound = signature.bind(*args, **kwargs)
+    bound.apply_defaults()
+    # After apply_defaults, all non keyword-only arguments are in bound.args.
+    # Because below code do not support keyword-word arguments, bound.kwargs
+    # must be empty.
+    assert len(bound.kwargs) == 0, bound.kwargs
+
+    # Create inputs to call symbolic trace (torch.fx.symbolic_trace)
+    # Example content of concrete_args:
+    #  concrete_args["x"] = torch.fx._symbolic_trace.PH
+    #  concrete_args["b"] = 1
+    # where "x" and "b" are argument names in "signature".
+    concrete_args = {}
+    for param_name, param_value in bound.arguments.items():
+        if isinstance(param_value, torch.Tensor):
+            # param_value can be, e.g., a real tensor or a fake tensor.
+            # param_value is treated as substitutable tensor symbol (aka placeholder).
+            concrete_args[param_name] = torch.fx._symbolic_trace.PH
+        else:
+            concrete_args[param_name] = param_value
+
+    return (
+        _module_expansion_symbolic_trace(module, concrete_args=concrete_args),
+        bound.args,
+    )
+
+
+@_beartype.beartype
+def export_without_parameters_and_buffers(
+    module: torch.nn.Module,
+    *args,
+    decomposition_table: Optional[Dict[torch._ops.OpOverload, Callable]] = None,
+    use_binary_format: bool = True,
+    opset_version: int = _constants.ONNX_DEFAULT_OPSET,
+    op_level_debug: bool = False,
+    # kwargs are the keyword arguments to call "module"; that is,
+    # module(*args, **kwargs) must run.
+    **kwargs,
+) -> Tuple[
+    Union["onnx.ModelProto", bytes],
+    "torch.fx.GraphModule",
+    Tuple[Any, ...],
+    Tuple[Any, ...],
+]:
+
+    graph_module, bound_args = _trace_into_fx_graph_via_fx_symbolic_trace(
+        module, *args, **kwargs
+    )
+
+    # Make sure all placeholder nodes are executed before get_attr nodes.
+    # Otherwise, inputs can interleave with initializers in the final ModeoProto.graph.input.
+    # Basically, we want
+    #  ModeoProto.graph.input =
+    #   [input_0, input_1, ..., input_n, weight_0, weight_1, ..., weight_m]
+    # and we don't want
+    #  ModeoProto.graph.input =
+    #   [input_0, weight_0, input_1, weight_1, ..., input_n, weight_0, weight_1, ..., weight_m]
+    _move_placeholder_to_front(graph_module)
+    # To save memory, move get_attr to input so that the generated model doesn't
+    # have weigh tensors. "replaced_attrs" are the list of replaced weight tensors.
+    replaced_attrs = _replace_get_attr_with_placeholder(graph_module)
+    # Move all newly created placeholder nodes to the front of the graph.
+    _move_placeholder_to_front(graph_module)
+    # Finalize the graph editing.
+    graph_module.recompile()
+    return (
+        _export(
+            graph_module,
+            (*bound_args, *replaced_attrs),
+            opset_version=opset_version,
+            decomposition_table=decomposition_table,
+            use_binary_format=use_binary_format,
+            op_level_debug=op_level_debug,
+        ),
+        graph_module,
+        bound_args,
+        replaced_attrs,
+    )
+
+
+@_beartype.beartype
+def _create_tensor_proto_with_external_data(
+    tensor: torch.Tensor, name: str, location: str, basepath: str
+) -> "onnx.TensorProto":
+    """Create a TensorProto with external data from a PyTorch tensor.
+    The external data is saved to os.path.join(basepath, location).
+
+    Args:
+        tensor: Tensor to be saved.
+        name: Name of the tensor (i.e., initializer name in ONNX graph).
+        location: Relative location of the external data file
+            (e.g., "/tmp/initializers/weight_0" when model is "/tmp/model_name.onnx").
+        basepath: Base path of the external data file (e.g., "/tmp/external_data" while model must be in "/tmp").
+
+
+    Reference for ONNX's external data format:
+        How to load?
+        https://github.com/onnx/onnx/blob/5dac81ac0707bdf88f56c35c0a5e8855d3534673/onnx/external_data_helper.py#L187
+        How to save?
+        https://github.com/onnx/onnx/blob/5dac81ac0707bdf88f56c35c0a5e8855d3534673/onnx/external_data_helper.py#L43
+        How to set ONNX fields?
+        https://github.com/onnx/onnx/blob/5dac81ac0707bdf88f56c35c0a5e8855d3534673/onnx/external_data_helper.py#L88
+    """
+    tensor_proto = onnx.TensorProto()
+    tensor_proto.name = name
+    tensor_proto.data_type = torch.onnx._type_utils._SCALAR_TYPE_TO_ONNX[  # type: ignore[assignment]
+        torch.onnx._type_utils._DTYPE_TO_SCALAR_TYPE[tensor.dtype]
+    ]
+    tensor_proto.dims.extend(tensor.shape)
+    tensor_proto.data_location = onnx.TensorProto.EXTERNAL
+
+    # Settings for saving one tensor per file.
+    # Offset is zero because there is no other tensor in the same file.
+    key_value_pairs = {
+        "location": location,
+        "offset": 0,
+        "length": tensor.untyped_storage().nbytes(),
+    }
+    for k, v in key_value_pairs.items():
+        entry = tensor_proto.external_data.add()
+        entry.key = k
+        entry.value = str(v)
+
+    # Actual path to write content of tensor.
+    external_data_file_path = os.path.join(basepath, location)
+    if os.path.exists(external_data_file_path):
+        os.remove(external_data_file_path)
+
+    # Create external data's folder if not exists.
+    external_data_dir_path = os.path.dirname(external_data_file_path)
+    if not os.path.exists(external_data_dir_path):
+        # if the demo_folder directory is not present
+        # then create it.
+        os.makedirs(external_data_dir_path)
+
+    # Create a fresh file.
+    with open(external_data_file_path, "xb") as data_file:
+        # No need to call "seek" because offset is 0.
+        # data_file.seek(0)
+        # Write tensor content to the file.
+        data_file.write(tensor.numpy().tobytes())
+
+    return tensor_proto
+
+
+@_beartype.beartype
+def save_model_with_external_data(
+    basepath: str,
+    model_location: str,
+    initializer_location: str,
+    torch_load_paths: Tuple[str, ...],
+    onnx_model: "onnx.ModelProto",
+) -> None:
+    """Load PyTorch tensors from files and add to "onnx_model" as external initializers.
+
+    Output files:
+        ONNX model file path:
+        ONNX initializer folder: os.path.join(basepath, initializer_location)
+
+    After running this function, you can do
+        ort_sess = onnxruntime.InferenceSession(os.path.join(basepath, model_location))
+    to execute the model.
+
+    Arguments:
+        basepath: Base path of the external data file (e.g., "/tmp/large-onnx-model").
+        model_location: Relative location of the ONNX model file.
+            E.g., "model.onnx" so that the model file is saved to
+            "/tmp/large-onnx-model/model.onnx".
+        initializer_location: Relative location of the ONNX initializer folder.
+            E.g., "initializers" so that the initializers are saved to
+            "/tmp/large-onnx-model/initializers".
+        torch_load_paths: Files which containing serialized PyTorch tensors to be saved
+            as ONNX initializers. They are loaded by torch.load.
+        onnx_model: ONNX model to be saved with external initializers.
+            If an input name matches a tensor loaded from "torch_load_paths",
+            the tensor will be saved as that input's external initializer.
+    """
+    onnx_model_with_initializers = onnx.ModelProto()
+    onnx_model_with_initializers.CopyFrom(onnx_model)
+    onnx_input_names = [input.name for input in onnx_model.graph.input]
+
+    for path in torch_load_paths:
+        state_ditc = torch.load(path)
+        for name, tensor in state_ditc.items():
+            # Basically, "transformer.attention.self.query.weight" is mapped
+            # to "transformer_attention_self_query_weight" for mimicking the
+            # name-modifying code in FX-to-ONNX exporter.
+            # See function _replace_get_attr_with_placeholder for details.
+            refined_name = name.replace(".", "_")
+
+            # For each refined PyTorch tensor name loaded by torch.load,
+            #  1.  Search its best match in ONNX model. E.g., the match of
+            #       "transformer_attention_weight" could be "attention_weight".
+            #  2.  Set "tensor" as the initializer of the matched ONNX input.
+            #      E.g., "tensor" is stored as the initializer of "attention_weight".
+            # Step 1 is required because sometimes, tensor names are stored with prefix the dictionary
+            # loaded by torch.load.
+            for onnx_input_name in onnx_input_names:
+                if onnx_input_name.endswith(refined_name) or refined_name.endswith(
+                    onnx_input_name
+                ):
+                    # Find a match. Change refined_name to the matched ONNX input name, so that we
+                    # create initializer with the right ONNX name.
+                    refined_name = onnx_input_name
+                    break
+
+            relative_tensor_file_path = os.path.join(initializer_location, refined_name)
+            # Create one file per tensor.
+            # tensor_proto.raw_data is stored to external file at
+            # os.path.join(basepath, relative_tensor_file_path).
+            tensor_proto = _create_tensor_proto_with_external_data(
+                tensor, refined_name, relative_tensor_file_path, basepath
+            )
+            # Add the tensor_proto to the ONNX model as an initializer with external data.
+            onnx_model_with_initializers.graph.initializer.append(tensor_proto)
+
+    # model_location should be a pure file name such as "file_name.onnx", not "folder/file_name.onnx".
+    onnx.save(onnx_model_with_initializers, os.path.join(basepath, model_location))
+
+
+@_beartype.beartype
+def _validate_op_between_ort_torch(
+    node: torch.fx.Node,
+    symbolic_fn: Union[onnxscript.OnnxFunction, Callable],
+    torch_args: tuple,
+    torch_kwargs: dict,
+):
+    """Validate the op between ONNX Runtime and PyTorch."""
+    # op-level validation
+    # Symbolic_fn should have the same output as node.target (torch ops)
+    # trace_only function is regular python function
+    function_name = (
+        symbolic_fn.name
+        if isinstance(symbolic_fn, onnxscript.OnnxFunction)
+        else symbolic_fn.__name__
+    )
+    try:
+        with evaluator.default_as(evaluator.ort_evaluator):
+            expected_outputs = node.target(*torch_args, **torch_kwargs)  # type: ignore[operator]
+            # TODO(titaiwang): Expose _convert_tensor_to_numpy and _convert_kwargs_for_onnx?
+            input_onnx = [
+                onnx_proto_utils._convert_tensor_to_numpy(x) for x in torch_args
+            ]
+            kwargs_onnx = _filter_incompatible_and_dtype_convert_kwargs(torch_kwargs)
+            ort_outputs = symbolic_fn(*input_onnx, **kwargs_onnx)
+
+            # TODO: add pytree structure comparison.
+            flattened_torch_outputs, _ = _pytree.tree_flatten(expected_outputs)
+            flattened_function_outputs, _ = _pytree.tree_flatten(ort_outputs)
+
+            assert flattened_torch_outputs
+            assert len(flattened_torch_outputs) == len(flattened_function_outputs)
+
+            for torch_output, function_output in zip(
+                flattened_torch_outputs, flattened_function_outputs
+            ):
+                try:
+                    if not isinstance(function_output, np.ndarray):
+                        # An onnxscript tensor
+                        function_output = function_output.value
+
+                    # Use torch.testing as opposed to np.testing to ensure dtypes and shapes match
+                    torch.testing.assert_close(
+                        torch.tensor(function_output).cpu(),
+                        torch_output.cpu()
+                        if isinstance(torch_output, torch.Tensor)
+                        else torch.tensor(torch_output).cpu(),
+                        rtol=1e-4,
+                        atol=1e-3,
+                    )
+
+                except AssertionError as e:
+                    warnings.warn(
+                        f"\nSuppressed AssertionError:\n{e}.\n"
+                        f"Op {node.target} has mismatch outputs. "
+                        f"Please check the implementation of {function_name}.\n"
+                    )
+                    diagnostic = diagnostics.export_context().inflight_diagnostic()
+                    diagnostic.with_additional_message(
+                        f"### Validation failed\n"
+                        f"{diagnostics.decorator.format_exception_in_markdown(e)}"
+                    )
+                    diagnostic.level = diagnostics.levels.ERROR
+    except Exception as e:
+        warnings.warn(
+            f"\nORT fails to run on Op {node.target} with error: \n{e}.\n"
+            f"Please check the implementation of {function_name}.\n"
+        )
+        diagnostic = diagnostics.export_context().inflight_diagnostic()
+        diagnostic.with_additional_message(
+            f"### Validation failed\n"
+            f"{diagnostics.decorator.format_exception_in_markdown(e)}"
+        )
+        diagnostic.level = diagnostics.levels.WARNING
+
+
+# Register a few argument formatter
diff --git a/torch/onnx/_internal/fx/function_dispatcher.py b/torch/onnx/_internal/fx/function_dispatcher.py
new file mode 100644
index 000000000000..9c584adfd878
--- /dev/null
+++ b/torch/onnx/_internal/fx/function_dispatcher.py
@@ -0,0 +1,211 @@
+"""Dispatcher for AtenLib functions from onnx-script."""
+
+from __future__ import annotations
+
+from typing import Callable, Dict, Union
+
+import onnxscript  # type: ignore[import]
+from onnxscript import opset18  # type: ignore[import]
+from onnxscript.function_libs.torch_aten import ops  # type: ignore[import]
+
+import torch
+from torch.onnx._internal import _beartype
+
+
+TORCH_ONNX_OPSET = onnxscript.values.Opset(domain="torch.onnx", version=1)
+
+
+@onnxscript.script(opset=TORCH_ONNX_OPSET)
+def prims_convert_element_type(tensor, dtype: int):
+    return opset18.Cast(tensor, to=dtype)
+
+
+@onnxscript.script(opset=TORCH_ONNX_OPSET)
+def aten_getitem(self, i):
+    # TODO(justinchuby): Support
+    # i = opset18.Unsqueeze(i, opset18.Constant(value_ints=[0]))
+    # return opset18.Gather(self, i, axis=0)
+    return opset18.SequenceAt(self, i)
+
+
+# A simple lookup table for atenlib functions
+_ATENLIB_FUNCTIONS = {
+    "aten::abs": ops.core.aten_abs,
+    "aten::acos": ops.core.aten_acos,
+    "aten::acosh": ops.core.aten_acosh,
+    "aten::adaptive_avg_pool1d": ops.nn.aten_adaptive_avg_pool1d,
+    "aten::adaptive_avg_pool2d": ops.nn.aten_adaptive_avg_pool2d,
+    "aten::adaptive_avg_pool3d": ops.nn.aten_adaptive_avg_pool3d,
+    "aten::add": ops.core.aten_add,
+    "aten::addmm": ops.core.aten_addmm,
+    "aten::alias": ops.core.aten_alias,
+    "aten::amax": ops.core.aten_amax,
+    "aten::amin": ops.core.aten_amin,
+    "aten::arange": ops.core.aten_arange_start,
+    "aten::argmax": ops.core.aten_argmax,
+    "aten::argmin": ops.core.aten_argmin,
+    "aten::asin": ops.core.aten_asin,
+    "aten::asinh": ops.core.aten_asinh,
+    "aten::atan": ops.core.aten_atan,
+    "aten::atanh": ops.core.aten_atanh,
+    "aten::baddbmm": ops.core.aten_baddbmm,
+    "aten::bitwise_not": ops.core.aten_bitwise_not_bool,
+    "aten::bmm": ops.core.aten_bmm,
+    "aten::ceil": ops.core.aten_ceil,
+    "aten::celu": ops.nn.aten_celu,
+    "aten::clamp_max": ops.core.aten_clamp_max,
+    "aten::clamp_min": ops.core.aten_clamp_min,
+    "aten::clamp": ops.core.aten_clamp,
+    "aten::clone": ops.core.aten_clone,
+    "aten::convolution": ops.core.aten_convolution,
+    "aten::cos": ops.core.aten_cos,
+    "aten::cosh": ops.core.aten_cosh,
+    "aten::cumsum": ops.core.aten_cumsum,
+    "aten::detach": ops.core.aten_detach,
+    "aten::div": ops.core.aten_div,
+    "aten::dot": ops.core.aten_dot,
+    "aten::elu": ops.nn.aten_elu,
+    "aten::embedding": ops.core.aten_embedding,
+    "aten::empty_like": ops.core.aten_empty_like,
+    "aten::empty": ops.core.aten_empty,
+    "aten::eq": ops.core.aten_eq,
+    "aten::equal": ops.core.aten_equal,
+    "aten::erf": ops.core.aten_erf,
+    "aten::exp": ops.core.aten_exp,
+    "aten::exp2": ops.core.aten_exp2,
+    "aten::expand": ops.core.aten_expand,
+    "aten::fmod": ops.core.aten_fmod,
+    "aten::full_like": ops.core.aten_full_like,
+    "aten::full": ops.core.aten_full,
+    "aten::ge": ops.core.aten_ge,
+    "aten::gelu": ops.nn.aten_gelu,
+    "aten::gt": ops.core.aten_gt,
+    "aten::isinf": ops.core.aten_isinf,
+    "aten::le": ops.core.aten_le,
+    "aten::leaky_relu": ops.nn.aten_leaky_relu,
+    "aten::linear": ops.nn.aten_linear,
+    "aten::log_softmax": ops.special.aten_special_log_softmax,
+    "aten::log": ops.core.aten_log,
+    "aten::log10": ops.core.aten_log10,
+    "aten::log1p": ops.core.aten_log1p,
+    "aten::log2": ops.core.aten_log2,
+    "aten::logaddexp": ops.core.aten_logaddexp,
+    "aten::logaddexp2": ops.core.aten_logaddexp2,
+    "aten::logcumsumexp": ops.core.aten_logcumsumexp,
+    "aten::logdet": ops.core.aten_logdet,
+    "aten::logsigmoid": ops.nn.aten_log_sigmoid,
+    "aten::logsumexp": ops.core.aten_logsumexp,
+    "aten::lt": ops.core.aten_lt,
+    "aten::masked_fill": ops.core.aten_masked_fill,
+    "aten::matmul": ops.core.aten_matmul,
+    "aten::maximum": ops.core.aten_maximum,
+    "aten::minimum": ops.core.aten_minimum,
+    "aten::mm": ops.core.aten_mm,
+    "aten::mul": ops.core.aten_mul,
+    "aten::native_layer_norm": ops.core.aten_native_layer_norm,
+    "aten::ne": ops.core.aten_ne,
+    "aten::neg": ops.core.aten_neg,
+    "aten::new_full": ops.core.aten_new_full,
+    "aten::nonzero": ops.core.aten_nonzero,
+    "aten::ones_like": ops.core.aten_ones_like,
+    "aten::ones": ops.core.aten_ones,
+    "aten::permute": ops.core.aten_permute,
+    "aten::pow": ops.core.aten_pow,
+    "aten::reciprocal": ops.core.aten_reciprocal,
+    "aten::relu": ops.nn.aten_relu,
+    "aten::relu6": ops.nn.aten_relu6,
+    "aten::remainder": ops.core.aten_remainder,
+    "aten::repeat": ops.core.aten_repeat,
+    "aten::reshape": ops.core.aten_reshape,
+    "aten::round": ops.core.aten_round,
+    "aten::rsqrt": ops.core.aten_rsqrt,
+    "aten::rsub": ops.core.aten_rsub,
+    "aten::select": ops.core.aten_select,
+    "aten::selu": ops.core.aten_selu,
+    "aten::sigmoid": ops.core.aten_sigmoid,
+    "aten::sign": ops.core.aten_sign,
+    "aten::sin": ops.core.aten_sin,
+    "aten::sinh": ops.core.aten_sinh,
+    "aten::slice": ops.core.aten_slice,
+    "aten::softmax": ops.special.aten_special_softmax,
+    "aten::split": ops.core.aten_split,
+    "aten::sqrt": ops.core.aten_sqrt,
+    "aten::sub": ops.core.aten_sub,
+    "aten::sum": ops.core.aten_sum_dim_IntList,
+    "aten::t": ops.core.aten_t,
+    "aten::tan": ops.core.aten_tan,
+    "aten::tanh": ops.core.aten_tanh,
+    "aten::topk": ops.core.aten_topk,
+    "aten::transpose": ops.core.aten_transpose,
+    "aten::unsqueeze": ops.core.aten_unsqueeze,
+    "aten::upsample_nearest2d": ops.nn.aten_upsample_nearest2d,
+    "aten::view": ops.core.aten_view,
+    "aten::where": ops.core.aten_where,
+    "aten::xlogy": ops.special.aten_special_xlogy,
+    "aten::zeros_like": ops.core.aten_zeros_like,
+    "aten::zeros": ops.core.aten_zeros,
+    "getitem": aten_getitem,
+    "prims::convert_element_type": prims_convert_element_type,
+}
+
+
+def _create_op_overload_to_exporter_key_table() -> Dict[
+    Union[torch._ops.OpOverload, Callable], str
+]:
+    # TODO(justinchuby): Improve how the table is constructed.
+    table: Dict[Union[torch._ops.OpOverload, Callable], str] = {}
+
+    for op_namespace in (torch.ops.aten, torch.ops.prims):
+        for attr_name in dir(op_namespace):
+            op_overload_packet = getattr(op_namespace, attr_name)
+
+            if not isinstance(op_overload_packet, torch._ops.OpOverloadPacket):
+                continue
+
+            exporter_look_up_key = op_overload_packet._qualified_op_name
+            if _ATENLIB_FUNCTIONS.get(exporter_look_up_key) is None:
+                # This aten op doesn't have ONNX exporter.
+                continue
+
+            for overload_name in op_overload_packet.overloads():
+                op_overload = getattr(op_overload_packet, overload_name)
+                # This line maps torch.ops.aten.add.Tensor, torch.ops.aten.add.Scalar, torch.ops.aten.add.out, etc
+                # to "aten::add". This means the exporter for "aten::add" is used for all overloads of "aten::add".
+                # This is applied to all ops under torch.ops.aten.
+                #
+                # TODO(wechi): in the future, we might want to write individual exporter for each overload, if,
+                # for example, they have different type promotion rules. If so, just map different overloads to
+                # different exporter keys.
+
+                table[op_overload] = op_overload_packet._qualified_op_name
+    # TODO(justinchuby): is baddbmm different?
+    table[torch.ops.aten.baddbmm.default] = "aten::baddbmm"
+    return table
+
+
+# Dictionary that maps torch.ops.aten.* to exporter look up key; e.g.,
+# _OP_OVERLOAD_TO_EXPORTER_KEY_TABLE[torch.add.Tensor] is "aten::add".
+_OP_OVERLOAD_TO_EXPORTER_KEY_TABLE = _create_op_overload_to_exporter_key_table()
+
+
+@_beartype.beartype
+def _create_onnx_friendly_decomposition_table() -> Dict[
+    torch._ops.OpOverload, Callable
+]:
+    decomposition_table: Dict[torch._ops.OpOverload, Callable] = {}
+    for op_overload, decomp_fn in torch._decomp.decomposition_table.items():
+        # Skip decomposition into "prim::*" ops, because they are not generally supported by ONNX.
+        # Skip decomposition for op_overload as long as that op_overload has a corresponding ONNX exporter.
+        if (
+            "torch._refs" in decomp_fn.__module__
+            or op_overload in _OP_OVERLOAD_TO_EXPORTER_KEY_TABLE
+        ):
+            continue
+        decomposition_table[op_overload] = decomp_fn
+    return decomposition_table
+
+
+# This is a subset of PyTorch's built-in aten-to-aten decomposition. If an aten
+# op (e.g., torch.ops.aten.add.Tensor) has exporter, we exclude the op's decomposition
+# function in the _ONNX_FRIENDLY_DECOMPOSITION_TABLE.
+_ONNX_FRIENDLY_DECOMPOSITION_TABLE = _create_onnx_friendly_decomposition_table()
diff --git a/torch/onnx/_internal/fx/options.py b/torch/onnx/_internal/fx/options.py
new file mode 100644
index 000000000000..b550181099c3
--- /dev/null
+++ b/torch/onnx/_internal/fx/options.py
@@ -0,0 +1,35 @@
+"""Options for FX exporter."""
+from __future__ import annotations
+
+import dataclasses
+from typing import Callable, Dict
+
+import torch
+from torch.onnx import _constants
+from torch.onnx._internal.fx import function_dispatcher
+
+
+@dataclasses.dataclass
+class ExportOptions:
+    """Options for FX-ONNX export.
+    Attributes:
+        opset_version: The export ONNX version.
+        use_binary_format: Whether to Return ModelProto in binary format.
+        decomposition_table: The decomposition table for graph ops. Default is for torch ops, including aten and prim.
+        op_level_debug: Whether to export the model with op level debug information with onnxruntime evaluator.
+    """
+
+    opset_version: int = _constants.ONNX_DEFAULT_OPSET
+    use_binary_format: bool = True
+    op_level_debug: bool = False
+    decomposition_table: Dict[torch._ops.OpOverload, Callable] = dataclasses.field(
+        default_factory=lambda: function_dispatcher._ONNX_FRIENDLY_DECOMPOSITION_TABLE
+    )
+
+    def update(self, **kwargs):
+        for key, value in kwargs.items():
+            if hasattr(self, key):
+                if value is not None:
+                    setattr(self, key, value)
+            else:
+                raise KeyError(f"ExportOptions has no attribute {key}")
diff --git a/torch/onnx/_internal/jit_utils.py b/torch/onnx/_internal/jit_utils.py
index 9212d484e2a4..90326a316379 100644
--- a/torch/onnx/_internal/jit_utils.py
+++ b/torch/onnx/_internal/jit_utils.py
@@ -99,6 +99,10 @@ def aten_op(self, operator: str, *args, overload_name: str = "", **kwargs):
             **kwargs,
         )
 
+    # NOTE: For backward compatibility with the old symbolic functions.
+    # We are probably going to remove this only after the fx exporter is established.
+    at = aten_op
+
     @_beartype.beartype
     def onnxscript_op(
         self,
@@ -306,7 +310,7 @@ def _create_node(
 @_beartype.beartype
 def _is_onnx_list(value):
     return (
-        not isinstance(value, torch._six.string_classes)
+        not isinstance(value, str)
         and not isinstance(value, torch.Tensor)
         and isinstance(value, Iterable)
     )
diff --git a/torch/onnx/_internal/onnx_proto_utils.py b/torch/onnx/_internal/onnx_proto_utils.py
index e8d85d80a0af..9290df2d9e8d 100644
--- a/torch/onnx/_internal/onnx_proto_utils.py
+++ b/torch/onnx/_internal/onnx_proto_utils.py
@@ -1,5 +1,7 @@
 """Utilities for manipulating the onnx and onnx-script dependencies and ONNX proto."""
 
+from __future__ import annotations
+
 import glob
 import io
 import os
@@ -10,7 +12,7 @@
 import torch
 import torch.jit._trace
 import torch.serialization
-from torch.onnx import _constants, _exporter_states, errors
+from torch.onnx import _constants, _exporter_states, _type_utils, errors
 from torch.onnx._internal import _beartype, jit_utils, registration
 
 
@@ -287,3 +289,31 @@ def _find_onnxscript_op(
                 else None,
             )
     return onnx_function_list, included_node_func
+
+
+def _convert_tensor_to_numpy(input: Any) -> Any:
+
+    try:
+        import numpy as np
+    except ImportError:
+        raise ImportError(f"{__name__} needs numpy, but it's not installed.")
+
+    if isinstance(input, torch.Tensor):
+        return input.detach().cpu().numpy()
+    if isinstance(input, torch.dtype):
+        return int(_type_utils.JitScalarType.from_dtype(input).onnx_type())
+    if isinstance(input, (tuple, list)):
+        if len(input) == 0:
+            return np.array((), dtype=np.int64)
+        if isinstance(input[0], torch.Tensor):
+            return [_convert_tensor_to_numpy(x) for x in input]
+        if isinstance(input[0], bool):
+            return np.array(input, dtype=np.bool_)
+
+        # Just a sequence of numbers
+        if isinstance(input[0], int):
+            return np.array(input, dtype=np.int64)
+        if isinstance(input[0], float):
+            return np.array(input)
+
+    return input
diff --git a/torch/onnx/_patch_torch.py b/torch/onnx/_patch_torch.py
deleted file mode 100644
index 24e3416164b2..000000000000
--- a/torch/onnx/_patch_torch.py
+++ /dev/null
@@ -1,297 +0,0 @@
-"""Importing this patches torch._C classes to add ONNX conveniences."""
-import numbers
-import re
-from typing import Any, Iterable, Tuple, Union
-
-import torch
-from torch import _C
-from torch._C import _onnx as _C_onnx
-
-# Import utils to get _params_dict because it is a global that is accessed by c++ code
-from torch.onnx import _deprecation, utils
-from torch.onnx._globals import GLOBALS
-from torch.onnx._internal import _beartype, jit_utils
-
-_ATTR_PATTERN = re.compile("^(.+)_(([ifstgz])|(ty))$")
-
-
-# TODO(#78694): Remove this file after PyTorch 2.0.
-# All functions in this file are deprecated and should not be used
-
-
-@_deprecation.deprecated(
-    "1.13",
-    "2.0",
-    "note 'g.op()' is to be removed from torch.Graph. Please open a"
-    " GitHub issue if you need this functionality.",
-)
-@_beartype.beartype
-def _graph_op(
-    g: _C.Graph,
-    opname: str,
-    *raw_args: Union[torch.Tensor, _C.Value],
-    outputs: int = 1,
-    **kwargs,
-) -> Union[_C.Value, Tuple[_C.Value, ...]]:
-    r"""Creates an ONNX operator "opname", taking "args" as inputs and attributes "kwargs".
-
-    The set of operators and the inputs/attributes they take
-    is documented at https://github.com/onnx/onnx/blob/master/docs/Operators.md
-
-    This function is monkey-patched onto Graph.
-
-    Args:
-        g: The Torch graph.
-        opname: The ONNX operator name, e.g., `Abs` or `Add`, or an operator qualified
-            with a namespace, e.g., `aten::add`.
-        raw_args: The inputs to the operator; usually provided
-            as arguments to the `symbolic` definition.
-        outputs: The number of outputs this operator returns.
-            By default an operator is assumed to return a single output.
-            If `outputs` is greater than one, this functions returns a tuple
-            of output `Node`, representing each output of the ONNX operator
-            in positional.
-        kwargs: The attributes of the ONNX operator, whose keys are named
-            according to the following convention: `alpha_f` indicates
-            the `alpha` attribute with type `f`.  The valid type specifiers are
-            `f` (float), `i` (int), `s` (string) or `t` (Tensor).  An attribute
-            specified with type float accepts either a single float, or a
-            list of floats (e.g., you would say `dims_i` for a `dims` attribute
-            that takes a list of integers).
-
-    Returns:
-        The node representing the single output of this operator (see the `outputs`
-        keyword argument for multi-return nodes).
-    """
-    # Filter out None attributes, this can be convenient client side because
-    # now they can pass through None attributes, and have them not show up
-    kwargs = {k: v for k, v in kwargs.items() if v is not None}
-
-    args = [_const_if_tensor(g, arg) for arg in raw_args]
-
-    if "::" in opname:
-        namespace, op = jit_utils.parse_node_kind(opname)
-    else:
-        namespace = "onnx"
-        op = opname
-
-    n = g.insertNode(_new_node(g, namespace, op, outputs, *args, **kwargs))
-
-    if GLOBALS.onnx_shape_inference:
-        _C._jit_pass_onnx_node_shape_type_inference(
-            n, utils._params_dict, GLOBALS.export_onnx_opset_version
-        )
-
-    if outputs == 1:
-        return n.output()
-    return tuple(n.outputs())
-
-
-@_beartype.beartype
-def _const_if_tensor(g: _C.Graph, arg):
-    if arg is None:
-        return arg
-    if isinstance(arg, _C.Value):
-        return arg
-    return _graph_op(g, "Constant", value_z=arg)
-
-
-@_deprecation.deprecated(
-    "1.13",
-    "2.0",
-    "note 'g.at()' is to be removed from torch.Graph. Please open a"
-    " GitHub issue if you need this functionality.",
-)
-# Generate an ONNX ATen op node.
-@_beartype.beartype
-def _aten_op(g: _C.Graph, operator: str, *args, overload_name: str = "", **kwargs):
-    return _graph_op(
-        g,
-        "aten::ATen",
-        *args,
-        operator_s=operator,
-        overload_name_s=overload_name,
-        **kwargs,
-    )
-
-
-@_deprecation.deprecated(
-    "1.13",
-    "2.0",
-    "note 'b.op()' is to be removed from torch.Block. Please open a"
-    " GitHub issue if you need this functionality.",
-)
-@_beartype.beartype
-def _block_op(block: _C.Block, opname: str, *args: _C.Value, **kwargs):
-    if "::" in opname:
-        namespace, op = jit_utils.parse_node_kind(opname)
-    else:
-        namespace = "onnx"
-        op = opname
-
-    n = block.addNode(f"{namespace}::{op}", args)
-    aten = namespace == "aten"
-    skip_attrs = {"inplace", "aten"}
-    for k, v in sorted(kwargs.items()):
-        if k in skip_attrs:
-            continue
-        _add_attribute(n, k, v, aten=aten)
-    outputs = tuple(n.outputs())
-    if len(outputs) == 1:
-        return n.output()
-    return outputs
-
-
-@_beartype.beartype
-def _new_node(
-    g: _C.Graph, namespace: str, op: str, outputs: int, *args: _C.Value, **kwargs
-) -> _C.Node:
-    """Creates a new node in the graph.
-
-    Args:
-        g: The graph to create the operator on.
-        namespace: The namespace of the operator. E.g., "aten", "onnx".
-        op: The name of the operator to create.
-        outputs: The number of the outputs of the node.
-
-    Returns:
-        The new node.
-    """
-    aten = namespace == "aten"
-    node = g.create(f"{namespace}::{op}", args, outputs)
-    skip_attrs = {"inplace", "aten"}
-    for k, v in sorted(kwargs.items()):
-        if k in skip_attrs:
-            continue
-        _add_attribute(node, k, v, aten=aten)
-    return node
-
-
-@_beartype.beartype
-def _is_onnx_list(value):
-    return (
-        not isinstance(value, torch._six.string_classes)
-        and not isinstance(value, torch.Tensor)
-        and isinstance(value, Iterable)
-    )
-
-
-@_beartype.beartype
-def _scalar(x: torch.Tensor):
-    """Convert a scalar tensor into a Python value."""
-    assert x.numel() == 1
-    return x[0]
-
-
-@_beartype.beartype
-def _is_caffe2_aten_fallback() -> bool:
-    return (
-        GLOBALS.operator_export_type == _C_onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK
-        and _C_onnx._CAFFE2_ATEN_FALLBACK
-    )
-
-
-@_beartype.beartype
-def _add_attribute(node: _C.Node, key: str, value: Any, aten: bool):
-    r"""Initializes the right attribute based on type of value."""
-    m = _ATTR_PATTERN.match(key)
-    if m is None:
-        raise ValueError(
-            f"Invalid attribute specifier '{key}' names "
-            "must be suffixed with type, e.g. 'dim_i' or 'dims_i'"
-        )
-    name, kind = m.group(1), m.group(2)
-    if _is_onnx_list(value):
-        kind += "s"
-
-    if aten and _is_caffe2_aten_fallback():
-        if isinstance(value, torch.Tensor):
-            # Caffe2 proto does not support tensor attribute.
-            if value.numel() > 1:
-                raise ValueError("Should not pass tensor attribute")
-            value = _scalar(value)
-            if isinstance(value, float):
-                kind = "f"
-            else:
-                kind = "i"
-    return getattr(node, f"{kind}_")(name, value)
-
-
-# TODO(#76254): Remove the deprecated function.
-@_deprecation.deprecated(
-    "1.13", "2.0", "Use 'g.op()' to create a constant node instead."
-)
-@_beartype.beartype
-def _graph_constant(
-    g,
-    value,
-    dims,
-    type_: str,
-    *args,
-    **kwargs,
-):
-    """This helper function can create either constant tensor or constant scalar.
-
-    If dims is None or 0 or [0], generate a 0-d tensor (scalar).
-    """
-    assert isinstance(value, numbers.Number)
-    assert type_ is not None
-    isscalar = False
-    if dims is None or dims == 0 or set(dims) == {0}:
-        dims = [1]
-        isscalar = True
-    type_ = type_.lower()
-    tensor: Union[
-        torch.CharTensor,
-        torch.ShortTensor,
-        torch.IntTensor,
-        torch.LongTensor,
-        torch.HalfTensor,
-        torch.FloatTensor,
-        torch.DoubleTensor,
-    ]
-    if type_ == "char":
-        tensor = torch.CharTensor(*dims)
-    elif type_ == "short":
-        tensor = torch.ShortTensor(*dims)
-    elif type_ == "int":
-        tensor = torch.IntTensor(*dims)
-    elif type_ == "long":
-        tensor = torch.LongTensor(*dims)
-    elif type_ == "half":
-        tensor = torch.HalfTensor(*dims)
-    elif type_ == "float":
-        tensor = torch.FloatTensor(*dims)
-    elif type_ == "double":
-        tensor = torch.DoubleTensor(*dims)
-    else:
-        raise ValueError(
-            "Unknown type, type should be one of the following strings: "
-            "char, short, int, long, half, float, double"
-        )
-    tensor.fill_(value)  # type: ignore[call-overload]
-    if isscalar:
-        return g.op("Constant", *args, value_z=tensor, **kwargs)
-    return g.op("Constant", *args, value_t=tensor, **kwargs)
-
-
-# TODO(#76254): Remove the deprecated function.
-@_deprecation.deprecated(
-    "1.13",
-    "2.0",
-    "Internally use '_node_get' in symbolic_helper instead.",
-)
-def _node_getitem(self, k):
-    """Gets attributes of a node which is polymorphic over return type.
-
-    This is monkey-patched onto Node.
-    """
-    sel = self.kindOf(k)
-    return getattr(self, sel)(k)
-
-
-torch._C.Graph.op = _graph_op  # type: ignore[attr-defined]
-torch._C.Graph.at = _aten_op  # type: ignore[attr-defined]
-torch._C.Block.op = _block_op  # type: ignore[attr-defined]
-torch._C.Graph.constant = _graph_constant  # type: ignore[attr-defined]
-torch._C.Node.__getitem__ = _node_getitem  # type: ignore[attr-defined, misc, assignment]
diff --git a/torch/onnx/_type_utils.py b/torch/onnx/_type_utils.py
index e7ed0e411005..a395127de234 100644
--- a/torch/onnx/_type_utils.py
+++ b/torch/onnx/_type_utils.py
@@ -3,9 +3,7 @@
 
 import enum
 import typing
-from typing import Dict, Optional, Union
-
-from typing_extensions import Literal
+from typing import Dict, Literal, Optional, Union
 
 import torch
 from torch._C import _onnx as _C_onnx
diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py
index 8818e69fad92..61e249216619 100644
--- a/torch/onnx/symbolic_helper.py
+++ b/torch/onnx/symbolic_helper.py
@@ -5,22 +5,25 @@
 import sys
 import typing
 import warnings
-from typing import Any, Callable, List, NoReturn, Optional, Sequence, Set, Tuple, Union
-
-from typing_extensions import Literal
+from typing import (
+    Any,
+    Callable,
+    List,
+    Literal,
+    NoReturn,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+)
 
 import torch
 import torch._C._onnx as _C_onnx
 from torch import _C
 
 # Monkey-patch graph manipulation methods on Graph, used for the ONNX symbolics
-from torch.onnx import (  # noqa: F401
-    _constants,
-    _deprecation,
-    _patch_torch,
-    _type_utils,
-    errors,
-)
+from torch.onnx import _constants, _deprecation, _type_utils, errors
 from torch.onnx._globals import GLOBALS
 from torch.onnx._internal import _beartype, jit_utils
 from torch.types import Number
@@ -224,7 +227,7 @@ def _unpack_quantized_tensor(tuple_value: _C.Value) -> Tuple[_C.Value, ...]:
 # Check if list_value is output from prim::ListConstruct
 # This is usually called before _unpack_list to ensure the list can be unpacked.
 @_beartype.beartype
-def _is_packed_list(list_value: _C.Value) -> bool:
+def _is_packed_list(list_value: Any) -> bool:
     return _is_value(list_value) and list_value.node().kind() == "prim::ListConstruct"
 
 
@@ -372,17 +375,26 @@ def wrapper(g, *args, **kwargs):
             )
             descriptor_args = tuple(zip(arg_q_descriptors_extended, args))
 
+            def _is_arg_quantized(descriptor, arg):
+                return descriptor and _is_value(arg) and _is_tuple_construct(arg)
+
             # Run regular symbolic function if none of the argument is QTensor.
-            if not any(
-                (descriptor and _is_value(arg) and _is_tuple_construct(arg))
-                for descriptor, arg in descriptor_args
-            ):
+            is_quantized = list()
+            for descriptor, arg in descriptor_args:
+                # ListConstruct
+                if _is_packed_list(arg):
+                    for arg_input in arg.node().inputs():
+                        is_quantized.append(_is_arg_quantized(descriptor, arg_input))
+                else:
+                    is_quantized.append(_is_arg_quantized(descriptor, arg))
+
+            if not any(is_quantized):
                 return fn(g, *args, **kwargs)
 
             # Dequantize arguments that are quantized
             non_quantized_args = []
             for descriptor, arg in descriptor_args:
-                if descriptor and _is_value(arg) and _is_tuple_construct(arg):
+                if _is_arg_quantized(descriptor, arg):
                     # Quantized arg is a tuple of (value, scale, zero_point)
                     dequantized_arg, arg_scale, arg_zero_point, _ = dequantize_helper(
                         g, arg
@@ -393,6 +405,24 @@ def wrapper(g, *args, **kwargs):
                         _scale = arg_scale
                     if _zero_point is None:
                         _zero_point = arg_zero_point
+                # ListConstruct
+                elif _is_packed_list(arg):
+                    for arg_input in arg.node().inputs():
+                        if _is_arg_quantized(descriptor, arg_input):
+                            # Quantized arg is a tuple of (value, scale, zero_point)
+                            (
+                                dequantized_arg,
+                                arg_scale,
+                                arg_zero_point,
+                                _,
+                            ) = dequantize_helper(g, arg_input)
+                            # Set scale and zero_point to the first quantized input if not already set
+                            if _scale is None:
+                                _scale = arg_scale
+                            if _zero_point is None:
+                                _zero_point = arg_zero_point
+                            arg_input.replaceAllUsesWith(dequantized_arg)
+                    non_quantized_args.append(arg)
                 else:
                     # Non-quantized arg
                     non_quantized_args.append(arg)
diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py
index a02009a74f69..b14f12cbce05 100644
--- a/torch/onnx/symbolic_opset10.py
+++ b/torch/onnx/symbolic_opset10.py
@@ -9,9 +9,8 @@
 from torch import _C
 
 # Monkey-patch graph manipulation methods on Graph, used for the ONNX symbolics
-from torch.onnx import (  # noqa: F401
+from torch.onnx import (
     _constants,
-    _patch_torch,
     _type_utils,
     errors,
     symbolic_helper,
@@ -350,6 +349,8 @@ def _slice(
             and (steps is None or (len(steps) == 1 and steps[0] == 1))
         ):
             return input
+        if ends[0] > _constants.INT64_MAX:
+            ends[0] = _constants.INT64_MAX
         axes = g.op("Constant", value_t=torch.tensor(axes))
         starts = g.op("Constant", value_t=torch.tensor(starts))
         ends = g.op("Constant", value_t=torch.tensor(ends))
diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py
index 3706c5336dfc..f9475c46fc2d 100644
--- a/torch/onnx/symbolic_opset11.py
+++ b/torch/onnx/symbolic_opset11.py
@@ -1,4 +1,5 @@
 """This file exports ONNX ops for opset 11."""
+from __future__ import annotations
 
 import functools
 import sys
@@ -281,6 +282,17 @@ def index_put(
             rank = symbolic_helper._get_tensor_rank(values)
             if rank is not None and rank == 0:
                 return opset9.masked_fill(g, self, bool_inp, values)
+            mask_rank = symbolic_helper._get_tensor_rank(bool_inp)
+            self_rank = symbolic_helper._get_tensor_rank(self)
+            if (
+                mask_rank is not None
+                and self_rank is not None
+                and self_rank > mask_rank
+            ):
+                # Unsqueeze 'bool_inp' to be broadcastable to shape of 'self'.
+                bool_inp = symbolic_helper._unsqueeze_helper(
+                    g, bool_inp, list(range(mask_rank, self_rank))
+                )
             return masked_scatter(g, self, bool_inp, values)
         broadcast_index_shape = g.op("Shape", index)
         index = symbolic_helper._unsqueeze_helper(g, index, [-1])
@@ -532,6 +544,7 @@ def Delete(g: jit_utils.GraphContext, tensor_list, dim):
 
 
 @_onnx_symbolic("aten::cat")
+@symbolic_helper.quantized_args(True)
 @_beartype.beartype
 def cat(g: jit_utils.GraphContext, tensor_list, dim):
     if symbolic_helper._is_packed_list(tensor_list):
@@ -872,7 +885,26 @@ def _get_arange_dtype(dtype):
         dtype = symbolic_helper._maybe_get_const(dtype, "i")
         return dtype
 
-    if len(args) == 2 or len(args) == 5:
+    if len(args) == 2 and all(map(lambda val: isinstance(val, int), args)):
+        # aten::arange(Scalar start, Scalar end)
+        dtype = torch.int64
+        # Start index.
+        start = g.op(
+            "Constant",
+            value_t=torch.tensor(args[0], dtype=dtype),
+        )
+        # End (exclusive) index.
+        end = g.op(
+            "Constant",
+            value_t=torch.tensor(args[1], dtype=dtype),
+        )
+        # Step size from start to end indexes.
+        delta_default = g.op(
+            "Constant",
+            value_t=torch.tensor(1, dtype=dtype),
+        )
+        return g.op("Range", start, end, delta_default)
+    elif len(args) == 2 or len(args) == 5:
         if len(args) == 2:
             # aten::arange(Scalar end, Tensor out)
             dtype = None
diff --git a/torch/onnx/symbolic_opset18.py b/torch/onnx/symbolic_opset18.py
new file mode 100644
index 000000000000..dee33785d0b2
--- /dev/null
+++ b/torch/onnx/symbolic_opset18.py
@@ -0,0 +1,70 @@
+"""This file exports ONNX ops for opset 18.
+
+Note [ONNX Operators that are added/updated in opset 18]
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+https://github.com/onnx/onnx/blob/main/docs/Changelog.md#version-18-of-the-default-onnx-operator-set
+New operators:
+    CenterCropPad
+    Col2Im
+    Mish
+    OptionalGetElement
+    OptionalHasElement
+    Pad
+    Resize
+    ScatterElements
+    ScatterND
+"""
+
+import functools
+from typing import Sequence
+
+from torch import _C
+from torch.onnx import symbolic_helper
+from torch.onnx._internal import _beartype, registration
+
+# EDITING THIS FILE? READ THIS FIRST!
+# see Note [Edit Symbolic Files] in symbolic_helper.py
+
+__all__ = ["col2im"]
+
+_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=18)
+
+
+@_onnx_symbolic("aten::col2im")
+@symbolic_helper.parse_args("v", "v", "v", "is", "is", "is")
+@_beartype.beartype
+def col2im(
+    g,
+    input: _C.Value,
+    output_size: _C.Value,
+    kernel_size: _C.Value,
+    dilation: Sequence[int],
+    padding: Sequence[int],
+    stride: Sequence[int],
+):
+    # convert [i0, i1, ..., in] into [i0, i0, i1, i1, ..., in, in]
+    adjusted_padding = []
+    for pad in padding:
+        for _ in range(2):
+            adjusted_padding.append(pad)
+
+    num_dimensional_axis = symbolic_helper._get_tensor_sizes(output_size)[0]
+    if not adjusted_padding:
+        adjusted_padding = [0, 0] * num_dimensional_axis
+
+    if not dilation:
+        dilation = [1] * num_dimensional_axis
+
+    if not stride:
+        stride = [1] * num_dimensional_axis
+
+    return g.op(
+        "Col2Im",
+        input,
+        output_size,
+        kernel_size,
+        dilations_i=dilation,
+        pads_i=adjusted_padding,
+        strides_i=stride,
+    )
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index 18f11771805b..2b62021833c1 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -3,6 +3,7 @@
 Opset 9 is supported by ONNX release 1.4.1
 release on 01/23/19
 """
+from __future__ import annotations
 
 import builtins
 import functools
@@ -18,14 +19,7 @@
 from torch import _C
 
 # Monkey-patch graph manipulation methods on Graph, used for the ONNX symbolics
-from torch.onnx import (  # noqa: F401
-    _constants,
-    _deprecation,
-    _patch_torch,
-    _type_utils,
-    errors,
-    symbolic_helper,
-)
+from torch.onnx import _constants, _deprecation, _type_utils, errors, symbolic_helper
 from torch.onnx._globals import GLOBALS
 from torch.onnx._internal import _beartype, jit_utils, registration
 from torch.types import Number
@@ -54,6 +48,7 @@
     "batch_norm",
     "bernoulli",
     "bitwise_not",
+    "bitwise_or",
     "bmm",
     "broadcast_tensors",
     "bucketize",
@@ -73,6 +68,7 @@
     "conv1d",
     "conv2d",
     "conv3d",
+    "convert_element_type",
     "convolution",
     "cos",
     "cosine_similarity",
@@ -151,6 +147,7 @@
     "lstm",
     "lt",
     "masked_fill",
+    "masked_fill_",
     "matmul",
     "max_pool1d_with_indices",
     "max_pool2d_with_indices",
@@ -543,6 +540,7 @@ def cat(g: jit_utils.GraphContext, tensor_list, dim):
     assert all(
         [
             symbolic_helper._get_tensor_rank(nonempty_tensors[0]) is None
+            or symbolic_helper._get_tensor_rank(t) is None
             or symbolic_helper._get_tensor_rank(t)
             == symbolic_helper._get_tensor_rank(nonempty_tensors[0])
             for t in nonempty_tensors
@@ -1323,12 +1321,15 @@ def _op_with_optional_float_cast(g: jit_utils.GraphContext, op_name, *args, **kw
 
     if require_cast:
         for input in inputs:
-            input_scalar_type = _type_utils.JitScalarType.from_value(input)
-            if input.isCompleteTensor() and input_scalar_type != dtype_0:
-                raise errors.SymbolicValueError(
-                    f"Inputs of {op_name} must have same dtype. Got {dtype_0.scalar_name()} and {input_scalar_type.scalar_name()}",
-                    input,
-                )
+
+            if input.isCompleteTensor():
+                input_scalar_type = _type_utils.JitScalarType.from_value(input)
+                if input_scalar_type != dtype_0:
+                    raise errors.SymbolicValueError(
+                        f"Inputs of {op_name} must have same dtype."
+                        f"Got {dtype_0.scalar_name()} and {input_scalar_type.scalar_name()}",
+                        input,
+                    )
         for i, input in enumerate(inputs):
             if input.isCompleteTensor() and not symbolic_helper._is_fp(input):
                 inputs[i] = g.op(
@@ -2085,6 +2086,24 @@ def bitwise_not(g: jit_utils.GraphContext, input):
     return g.op("Not", input)
 
 
+@_onnx_symbolic("aten::bitwise_or")
+@_beartype.beartype
+def bitwise_or(g, self, other):
+    if not symbolic_helper._is_bool(self):
+        raise errors.SymbolicValueError(
+            "ONNX export does NOT support exporting bitwise OR "
+            "for non-boolean input values. self: ",
+            self,
+        )
+    if not symbolic_helper._is_bool(other):
+        raise errors.SymbolicValueError(
+            "ONNX export does NOT support exporting bitwise OR "
+            "for non-boolean input values. other: ",
+            other,
+        )
+    return g.op("Or", self, other)
+
+
 @_beartype.beartype
 def wrap_logical_op_with_cast_to(to_type):
     def decorator(fn):
@@ -3387,9 +3406,9 @@ def feature_dropout(g, input, p, train):
 
 
 @_onnx_symbolic("aten::norm")
-@symbolic_helper.parse_args("v", "t", "is", "i")
+@symbolic_helper.parse_args("v", "t", "is", "i", "v")
 @_beartype.beartype
-def norm(g: jit_utils.GraphContext, self, p, dim, keepdim):
+def norm(g: jit_utils.GraphContext, self, p, dim, keepdim, dtype=None):
     if p == 1:
         f = _reduce_op_symbolic("ReduceL1")
     elif p == 2:
@@ -3398,7 +3417,11 @@ def norm(g: jit_utils.GraphContext, self, p, dim, keepdim):
         raise errors.SymbolicValueError(
             "ONNX export only p-norms with p of 1 or 2", self
         )
-    return f(g, self, dim=dim, keepdim=keepdim)
+    result = f(g, self, dim=dim, keepdim=keepdim)
+    if dtype is not None:
+        dtype = symbolic_helper._get_const(dtype, "i", "dtype")
+        result = g.op("Cast", result, to_i=_type_utils.JitScalarType(dtype).onnx_type())
+    return result
 
 
 @_onnx_symbolic("aten::conv_tbc")
@@ -3617,7 +3640,7 @@ def tensor(
         for t in symbolic_helper._unpack_list(data):
             shape_reference = g.op("Constant", value_t=torch.LongTensor([1]))
             t = symbolic_helper._reshape_helper(g, t, shape_reference)
-            t = g.op("Cast", t, to_i=dtype.onnx_type())
+            t = g.op("Cast", t, to_i=_type_utils.JitScalarType(dtype).onnx_type())
             input_list.append(t)
         return g.op("Concat", *input_list, axis_i=0)
     else:
@@ -4097,6 +4120,13 @@ def topk(g: jit_utils.GraphContext, self, k, dim, largest, sorted, out=None):
     return g.op("TopK", self, k_i=k, axis_i=dim, outputs=2)
 
 
+@_onnx_symbolic("prim::convert_element_type")
+@_beartype.beartype
+def convert_element_type(g: jit_utils.GraphContext, self, *args):
+    dtype = symbolic_helper._get_const(args[0], "i", "dtype")
+    return g.op("Cast", self, to_i=_type_utils.JitScalarType(dtype).onnx_type())
+
+
 @_onnx_symbolic("aten::to")
 @_beartype.beartype
 def to(g: jit_utils.GraphContext, self, *args):
@@ -5476,6 +5506,12 @@ def masked_fill(g: jit_utils.GraphContext, self, mask, value):
     return g.op("Where", mask, symbolic_helper._if_scalar_type_as(value, self), self)
 
 
+@_onnx_symbolic("aten::masked_fill_")
+@_beartype.beartype
+def masked_fill_(g: jit_utils.GraphContext, self, mask, value):
+    return masked_fill(g, self, mask, value)
+
+
 @_onnx_symbolic("aten::index")
 @_beartype.beartype
 def index(g: jit_utils.GraphContext, self, index):
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py
index fd0edef773a6..5f2a460f9084 100644
--- a/torch/onnx/utils.py
+++ b/torch/onnx/utils.py
@@ -959,7 +959,7 @@ def _create_jit_graph(
 
         if isinstance(model, torch.jit.ScriptModule):
             try:
-                graph = model.forward.graph
+                graph = model.forward.graph  # type: ignore[attr-defined]
             except AttributeError as e:
                 raise RuntimeError("'forward' method must be a script method") from e
             _C._jit_pass_onnx_function_substitution(graph)
@@ -1062,7 +1062,7 @@ def _pre_trace_quant_model(model, args):
     This is due to https://github.com/pytorch/pytorch/issues/75761.
     """
     if any(
-        hasattr(m, "_packed_params") for m in getattr(model, "modules", lambda: [])()
+        hasattr(m, "_packed_params") for m in getattr(model, "modules", list)()
     ) or any(getattr(arg, "is_quantized", False) for arg in args):
         return torch.jit.trace(model, args)
     return model
@@ -1136,7 +1136,12 @@ def _model_to_graph(
             example_outputs_final += unpack_quantized_tensor(example_output)
         out_vars, desc = torch.jit._flatten(example_outputs_final)
         _C._jit_pass_onnx_assign_output_shape(
-            graph, out_vars, desc, GLOBALS.onnx_shape_inference, is_script
+            graph,
+            out_vars,
+            desc,
+            GLOBALS.onnx_shape_inference,
+            is_script,
+            GLOBALS.export_onnx_opset_version,
         )
 
     # NB: ONNX requires complete information about output types, which might be
@@ -1158,6 +1163,7 @@ def _model_to_graph(
                 out_desc,
                 GLOBALS.onnx_shape_inference,
                 is_script,
+                GLOBALS.export_onnx_opset_version,
             )
 
     _set_input_and_output_names(graph, input_names, output_names)
@@ -1464,6 +1470,15 @@ def _export(
     if export_type is None:
         export_type = _exporter_states.ExportTypes.PROTOBUF_FILE
 
+    # Discussed deprecation with Nikita Shulga and Sergii Dymchenko from Meta
+    if _C_onnx._CAFFE2_ATEN_FALLBACK:
+        warnings.warn(
+            "Caffe2 ONNX exporter is deprecated in version 2.0 and will be "
+            "removed in 2.2. Please use PyTorch 2.1 or older for this capability.",
+            category=FutureWarning,
+            stacklevel=2,
+        )
+
     if isinstance(model, torch.nn.DataParallel):
         raise ValueError(
             "torch.nn.DataParallel is not supported by ONNX "
@@ -1618,7 +1633,7 @@ def _export(
                 not val_use_external_data_format
             ):
                 try:
-                    _C._check_onnx_proto(proto, full_check=True)
+                    _C._check_onnx_proto(proto)
                 except RuntimeError as e:
                     raise errors.CheckerError(e) from e
     finally:
@@ -1682,7 +1697,15 @@ def _run_symbolic_method(g, op_name, symbolic_fn, args):
     call from C++.
     """
     try:
-        return symbolic_fn(g, *args)
+        graph_context = jit_utils.GraphContext(
+            graph=g,
+            block=g.block(),
+            opset=GLOBALS.export_onnx_opset_version,
+            original_node=None,  # type: ignore[arg-type]
+            params_dict=_params_dict,
+            env={},
+        )
+        return symbolic_fn(graph_context, *args)
     except TypeError as e:
         # Handle the specific case where we didn't successfully dispatch
         # to symbolic_fn.  Otherwise, the backtrace will have the clues
@@ -1837,7 +1860,7 @@ def _run_symbolic_function(
         }
         outputs = node.outputsSize()
         attrs["outputs"] = outputs
-        return graph_context.at(
+        return graph_context.aten_op(
             op_name,
             *inputs,
             overload_name=_get_aten_op_overload_name(node),
@@ -1895,7 +1918,7 @@ def _run_symbolic_function(
                 k + "_" + node.kindOf(k)[0]: symbolic_helper._node_get(node, k)
                 for k in node.attributeNames()
             }
-            return graph_context.at(
+            return graph_context.aten_op(
                 op_name,
                 *inputs,
                 overload_name=_get_aten_op_overload_name(node),
diff --git a/torch/onnx/verification.py b/torch/onnx/verification.py
index 5d4925b0b067..84ac973bc8ce 100644
--- a/torch/onnx/verification.py
+++ b/torch/onnx/verification.py
@@ -914,7 +914,7 @@ def verify_aten_graph(
     graph = graph.copy()
 
     # Execute aten graph and get reference torch jit outputs.
-    graph_inputs = list(v for v in graph.inputs())
+    graph_inputs = list(graph.inputs())
     jit_inputs = tuple([arg for arg in input_args if arg is not None])
     weights = [params_dict[v.debugName()] for v in graph_inputs[len(jit_inputs) :]]
     assert all([w is not None for w in weights])
@@ -940,7 +940,7 @@ def verify_aten_graph(
     # NOTE: Verification is unstable. Try catch to emit information for debugging.
     try:
         # NOTE: Input might be dce'ed, so we need to remove those from the input args.
-        new_input_names = set(v.debugName() for v in graph.inputs())
+        new_input_names = {v.debugName() for v in graph.inputs()}
         new_input_args = []
         for v, arg in zip(original_jit_graph.inputs(), input_args):
             if v.debugName() in new_input_names:
@@ -956,6 +956,7 @@ def verify_aten_graph(
 
         onnx_session = _onnx_backend_session(model_f, verification_options.backend)
         onnx_outs = _run_onnx(onnx_session, onnx_inputs)
+        del onnx_session  # To free device memory
 
         try:
             _compare_onnx_pytorch_outputs(
diff --git a/torch/optim/adadelta.py b/torch/optim/adadelta.py
index 17bf6a1b451f..667a272f45d5 100644
--- a/torch/optim/adadelta.py
+++ b/torch/optim/adadelta.py
@@ -1,7 +1,7 @@
 import torch
 from torch import Tensor
 
-from .optimizer import (Optimizer, _use_grad_for_differentiable, _default_to_foreach,
+from .optimizer import (Optimizer, _use_grad_for_differentiable, _default_to_fused_or_foreach,
                         _differentiable_doc, _foreach_doc, _maximize_doc)
 from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
 from typing import List, Optional
@@ -40,7 +40,7 @@ def __init__(
             foreach=foreach,
             differentiable=differentiable,
         )
-        super(Adadelta, self).__init__(params, defaults)
+        super().__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
@@ -193,7 +193,8 @@ def adadelta(
 
     # We still respect when the user inputs False for foreach.
     if foreach is None:
-        foreach = _default_to_foreach([params, grads, square_avgs, acc_deltas], differentiable=differentiable)
+        _, foreach = _default_to_fused_or_foreach([params, grads, square_avgs, acc_deltas],
+                                                  differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")
diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py
index 4fe38c1b2d02..f5c575324020 100644
--- a/torch/optim/adagrad.py
+++ b/torch/optim/adagrad.py
@@ -2,7 +2,7 @@
 from torch import Tensor
 
 from .optimizer import (Optimizer, _use_grad_for_differentiable, _get_value,
-                        _default_to_foreach, _differentiable_doc, _foreach_doc, _maximize_doc)
+                        _default_to_fused_or_foreach, _differentiable_doc, _foreach_doc, _maximize_doc)
 from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
 from typing import List, Optional
 
@@ -48,7 +48,7 @@ def __init__(
             maximize=maximize,
             differentiable=differentiable,
         )
-        super(Adagrad, self).__init__(params, defaults)
+        super().__init__(params, defaults)
 
         for group in self.param_groups:
             for p in group["params"]:
@@ -210,7 +210,8 @@ def adagrad(
         )
 
     if foreach is None:
-        foreach = _default_to_foreach([params, grads, state_sums, state_steps], differentiable=differentiable)
+        _, foreach = _default_to_fused_or_foreach([params, grads, state_sums, state_steps],
+                                                  differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")
diff --git a/torch/optim/adam.py b/torch/optim/adam.py
index b5f9e072d8f4..25b999ef6047 100644
--- a/torch/optim/adam.py
+++ b/torch/optim/adam.py
@@ -1,52 +1,15 @@
-from typing import cast, List, Optional, Dict
+from typing import List, Optional
 
 import torch
 from torch import Tensor
 from .optimizer import (Optimizer, _use_grad_for_differentiable, _get_value, _stack_if_compiling,
-                        _dispatch_sqrt, _capturable_doc, _differentiable_doc, _maximize_doc)
+                        _dispatch_sqrt, _default_to_fused_or_foreach, _capturable_doc,
+                        _differentiable_doc, _foreach_doc, _fused_doc, _maximize_doc)
 from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
 
 __all__ = ['Adam', 'adam']
 
 
-# TODO(crcrpar): Move this to soemwhere (e.g. torch/optim/_utils?) else when adding another fused optimizer.
-# NOTE(crcrpar): Almost the same as `_MultiDeviceReplicator` defined in
-# torch/cuda/amp/grad_scaler.py except for the key being str only for torch script.
-class _MultiDeviceReplicator:
-    main_tensor: Tensor
-    _per_device_tensors: Dict[str, Tensor]
-
-    def __init__(self, main_tensor: Tensor) -> None:
-        self.main_tensor = main_tensor
-        self._per_device_tensors = {str(main_tensor.device): main_tensor}
-
-    def get(self, device: str):
-        if device in self._per_device_tensors:
-            return self._per_device_tensors[device]
-        tensor = self.main_tensor.to(device=device, non_blocking=True, copy=True)
-        self._per_device_tensors[device] = tensor
-        return tensor
-
-
-# todo(crcrpar): Move this to another place when adding another fused optimizer.
-def _get_fp16AMP_params(
-    *,
-    optimizer: Optimizer,
-    grad_scaler: Optional[torch.cuda.amp.GradScaler] = None,
-    device: torch.device,
-) -> Optional[_MultiDeviceReplicator]:
-    if grad_scaler is None:
-        return None
-    found_inf_dict = grad_scaler._check_inf_per_device(optimizer)
-    # Combines found_inf tensors from all devices. As in GradScaler.update(),
-    # tensors are combined on the scale's device, which is an arbitrary but
-    # reasonable choice that avoids new context creation.
-    found_infs = [f.to(device, non_blocking=True) for f in found_inf_dict.values()]
-    assert len(found_infs) > 0, "No inf checks were recorded in _check_inf_per_device."
-    with torch.no_grad():
-        found_inf_combined = cast(torch.Tensor, sum(found_infs))
-    return _MultiDeviceReplicator(found_inf_combined)
-
 class Adam(Optimizer):
     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                  weight_decay=0, amsgrad=False, *, foreach: Optional[bool] = None,
@@ -67,11 +30,11 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                         weight_decay=weight_decay, amsgrad=amsgrad,
                         maximize=maximize, foreach=foreach, capturable=capturable,
                         differentiable=differentiable, fused=fused)
-        super(Adam, self).__init__(params, defaults)
+        super().__init__(params, defaults)
 
         if fused:
             if differentiable:
-                raise RuntimeError("`fused` cannot be `differentiable`")
+                raise RuntimeError("`fused` does not support `differentiable`")
             self._step_supports_amp_scaling = True
             # TODO(crcrpar): [low prec params & their higher prec copy]
             # Suppor AMP with FP16/BF16 model params which would need
@@ -81,7 +44,9 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 p.is_cuda and torch.is_floating_point(p)
                 for pg in self.param_groups for p in pg['params']
             ):
-                raise RuntimeError("FusedAdam requires all the params to be CUDA, floating point")
+                raise RuntimeError("`fused=True` requires all the params to be CUDA, floating point Tensor")
+            if foreach:
+                raise RuntimeError("`fused` and `foreach` cannot be `True` together.")
 
     def __setstate__(self, state):
         super().__setstate__(state)
@@ -91,7 +56,7 @@ def __setstate__(self, state):
             group.setdefault('foreach', None)
             group.setdefault('capturable', False)
             group.setdefault('differentiable', False)
-            group.setdefault('fused', False)
+            group.setdefault('fused', None)
         state_values = list(self.state.values())
         step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step'])
         if not step_is_tensor:
@@ -101,7 +66,6 @@ def __setstate__(self, state):
     def _init_group(
         self,
         group,
-        grad_scaler,
         params_with_grad,
         grads,
         exp_avgs,
@@ -109,15 +73,6 @@ def _init_group(
         max_exp_avg_sqs,
         state_steps
     ):
-
-        grad_scale = None
-        found_inf = None
-        if group['fused'] and grad_scaler is not None:
-            grad_scale = grad_scaler._get_scale_async()
-            device = grad_scale.device
-            grad_scale = _MultiDeviceReplicator(grad_scale)
-            found_inf = _get_fp16AMP_params(optimizer=self, grad_scaler=grad_scaler, device=device)
-
         for p in group['params']:
             if p.grad is not None:
                 params_with_grad.append(p)
@@ -130,7 +85,7 @@ def _init_group(
                 if len(state) == 0:
                     state['step'] = (
                         torch.zeros((1,), dtype=torch.float, device=p.device)
-                        if self.defaults['capturable'] or self.defaults['fused']
+                        if group['capturable'] or group['fused']
                         else torch.tensor(0.)
                     )
                     # Exponential moving average of gradient values
@@ -150,17 +105,13 @@ def _init_group(
                     raise RuntimeError('`requires_grad` is not supported for `step` in differentiable mode')
                 state_steps.append(state['step'])
 
-        return grad_scale, found_inf
-
     @_use_grad_for_differentiable
-    def step(self, closure=None, *, grad_scaler=None):
+    def step(self, closure=None):
         """Performs a single optimization step.
 
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
-            grad_scaler (:class:`torch.cuda.amp.GradScaler`, optional): A GradScaler which is
-                supplied from ``grad_scaler.step(optimizer)``.
         """
         self._cuda_graph_capture_health_check()
 
@@ -178,9 +129,8 @@ def step(self, closure=None, *, grad_scaler=None):
             state_steps = []
             beta1, beta2 = group['betas']
 
-            grad_scale, found_inf = self._init_group(
+            self._init_group(
                 group,
-                grad_scaler,
                 params_with_grad,
                 grads,
                 exp_avgs,
@@ -188,25 +138,27 @@ def step(self, closure=None, *, grad_scaler=None):
                 max_exp_avg_sqs,
                 state_steps)
 
-            adam(params_with_grad,
-                 grads,
-                 exp_avgs,
-                 exp_avg_sqs,
-                 max_exp_avg_sqs,
-                 state_steps,
-                 amsgrad=group['amsgrad'],
-                 beta1=beta1,
-                 beta2=beta2,
-                 lr=group['lr'],
-                 weight_decay=group['weight_decay'],
-                 eps=group['eps'],
-                 maximize=group['maximize'],
-                 foreach=group['foreach'],
-                 capturable=group['capturable'],
-                 differentiable=group['differentiable'],
-                 fused=group['fused'],
-                 grad_scale=grad_scale,
-                 found_inf=found_inf)
+            adam(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_avg_sqs,
+                max_exp_avg_sqs,
+                state_steps,
+                amsgrad=group['amsgrad'],
+                beta1=beta1,
+                beta2=beta2,
+                lr=group['lr'],
+                weight_decay=group['weight_decay'],
+                eps=group['eps'],
+                maximize=group['maximize'],
+                foreach=group['foreach'],
+                capturable=group['capturable'],
+                differentiable=group['differentiable'],
+                fused=group['fused'],
+                grad_scale=getattr(self, "grad_scale", None),
+                found_inf=getattr(self, "found_inf", None),
+            )
 
         return loss
 
@@ -262,24 +214,18 @@ def step(self, closure=None, *, grad_scaler=None):
         amsgrad (bool, optional): whether to use the AMSGrad variant of this
             algorithm from the paper `On the Convergence of Adam and Beyond`_
             (default: False)
-        foreach (bool, optional): whether foreach implementation of optimizer
-            is used (default: None)
+        {foreach}
         {maximize}
         {capturable}
         {differentiable}
-        fused (bool, optional): whether the fused implementation (CUDA only) is used.
-            Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16`
-            are supported. Since the fused implementation is usually significantly faster than
-            the for-loop implementation, we try to use it whenever possible (all parameters
-            are on CUDA and are of a supported type). Else, we continue with the for-loop
-            implementation. (default: None)
-
+        {fused}
     .. _Adam\: A Method for Stochastic Optimization:
         https://arxiv.org/abs/1412.6980
     .. _On the Convergence of Adam and Beyond:
         https://openreview.net/forum?id=ryQu7f-RZ
 
-    """.format(maximize=_maximize_doc, capturable=_capturable_doc, differentiable=_differentiable_doc)
+    """.format(foreach=_foreach_doc, maximize=_maximize_doc, capturable=_capturable_doc,
+               differentiable=_differentiable_doc, fused=_fused_doc)
 
 
 def adam(params: List[Tensor],
@@ -294,8 +240,8 @@ def adam(params: List[Tensor],
          capturable: bool = False,
          differentiable: bool = False,
          fused: Optional[bool] = None,
-         grad_scale: Optional[_MultiDeviceReplicator] = None,
-         found_inf: Optional[_MultiDeviceReplicator] = None,
+         grad_scale: Optional[Tensor] = None,
+         found_inf: Optional[Tensor] = None,
          *,
          amsgrad: bool,
          beta1: float,
@@ -308,36 +254,29 @@ def adam(params: List[Tensor],
     See :class:`~torch.optim.Adam` for details.
     """
 
-    # We try to use the fused implementation whenever we can since it is fastest.
-    # It's only available when the tensors are floats on the same CUDA device
-    # and when differentiable=False.
-    # We still respect when the user inputs False for fused.
+    # Respect when the user inputs False/True for foreach or fused. We only want to change
+    # the default when neither have been user-specified. Note that we default to foreach
+    # and pass False to use_fused. This is not a mistake--we want to give the fused impl
+    # bake-in time before making it the default, even if it is typically faster.
+    if fused is None and foreach is None:
+        _, foreach = _default_to_fused_or_foreach(
+            [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps],
+            differentiable, use_fused=False)
     if fused is None:
-        all_tensors = []
-        all_tensors.extend(params)
-        all_tensors.extend(grads)
-        all_tensors.extend(exp_avgs)
-        all_tensors.extend(exp_avg_sqs)
-        all_tensors.extend(max_exp_avg_sqs)
-        all_tensors.extend(state_steps)
-        fused = not torch.jit.is_scripting() and not differentiable and all(
-            p.is_cuda and torch.is_floating_point(p) for p in all_tensors
-        )
+        fused = False
+    if foreach is None:
+        foreach = False
 
     if not all(isinstance(t, torch.Tensor) for t in state_steps):
         raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors")
 
-    if foreach is None:
-        # Placeholder for more complex foreach logic to be added when value is not set
-        foreach = False
-
     if foreach and torch.jit.is_scripting():
         raise RuntimeError('torch.jit.script not supported with foreach optimizers')
 
-    if foreach and not torch.jit.is_scripting():
-        func = _multi_tensor_adam
-    elif fused and not torch.jit.is_scripting():
+    if fused and not torch.jit.is_scripting():
         func = _fused_adam
+    elif foreach and not torch.jit.is_scripting():
+        func = _multi_tensor_adam
     else:
         func = _single_tensor_adam
 
@@ -366,8 +305,8 @@ def _single_tensor_adam(params: List[Tensor],
                         exp_avg_sqs: List[Tensor],
                         max_exp_avg_sqs: List[Tensor],
                         state_steps: List[Tensor],
-                        grad_scale: Optional[_MultiDeviceReplicator],
-                        found_inf: Optional[_MultiDeviceReplicator],
+                        grad_scale: Optional[Tensor],
+                        found_inf: Optional[Tensor],
                         *,
                         amsgrad: bool,
                         beta1: float,
@@ -456,15 +395,14 @@ def _single_tensor_adam(params: List[Tensor],
             param.addcdiv_(exp_avg, denom, value=-step_size)
 
 
-
 def _multi_tensor_adam(params: List[Tensor],
                        grads: List[Tensor],
                        exp_avgs: List[Tensor],
                        exp_avg_sqs: List[Tensor],
                        max_exp_avg_sqs: List[Tensor],
                        state_steps: List[Tensor],
-                       grad_scale: Optional[_MultiDeviceReplicator],
-                       found_inf: Optional[_MultiDeviceReplicator],
+                       grad_scale: Optional[Tensor],
+                       found_inf: Optional[Tensor],
                        *,
                        amsgrad: bool,
                        beta1: float,
@@ -513,9 +451,8 @@ def _multi_tensor_adam(params: List[Tensor],
         torch._foreach_addcmul_(device_exp_avg_sqs, device_grads, device_grads, 1 - beta2)
 
         if capturable:
-            # TODO: use foreach_pow if/when foreach_pow is added
-            bias_correction1 = [torch.pow(beta1, step) for step in device_state_steps]
-            bias_correction2 = [torch.pow(beta2, step) for step in device_state_steps]
+            bias_correction1 = torch._foreach_pow(beta1, device_state_steps)
+            bias_correction2 = torch._foreach_pow(beta2, device_state_steps)
             # foreach_sub doesn't allow a scalar as the first arg
             torch._foreach_sub_(bias_correction1, 1)
             torch._foreach_sub_(bias_correction2, 1)
@@ -580,8 +517,8 @@ def _fused_adam(
     exp_avg_sqs: List[Tensor],
     max_exp_avg_sqs: List[Tensor],
     state_steps: List[Tensor],
-    grad_scale: Optional[_MultiDeviceReplicator],
-    found_inf: Optional[_MultiDeviceReplicator],
+    grad_scale: Optional[Tensor],
+    found_inf: Optional[Tensor],
     *,
     amsgrad: bool,
     beta1: float,
@@ -593,6 +530,9 @@ def _fused_adam(
     capturable: bool,  # Needed for consistency.
     differentiable: bool,
 ) -> None:
+    grouped_tensors = _group_tensors_by_device_and_dtype([params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps])
+    grad_scale_dict = {grad_scale.device: grad_scale} if grad_scale is not None else None
+    found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None
     grouped_tensors = _group_tensors_by_device_and_dtype([params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps])
     for (device, dtype) in grouped_tensors:
         (
@@ -604,8 +544,12 @@ def _fused_adam(
             device_state_steps,
         ) = grouped_tensors[(device, dtype)]
         if grad_scale is not None and found_inf is not None:
-            device_grad_scale = grad_scale.get(str(device))
-            device_found_inf = found_inf.get(str(device))
+            if device not in grad_scale_dict:
+                grad_scale_dict[device] = grad_scale.to(device, non_blocking=True)
+            if found_inf not in found_inf_dict:
+                found_inf_dict[device] = found_inf.to(device, non_blocking=True)
+            device_grad_scale = grad_scale_dict[device]
+            device_found_inf = found_inf_dict[device]
         else:
             device_grad_scale = None
             device_found_inf = None
diff --git a/torch/optim/adam.pyi b/torch/optim/adam.pyi
index 161c29e7fde0..6fde30275a3a 100644
--- a/torch/optim/adam.pyi
+++ b/torch/optim/adam.pyi
@@ -1,5 +1,5 @@
-from typing import Tuple
+from typing import Tuple, Optional
 from .optimizer import _params_t, Optimizer
 
 class Adam(Optimizer):
-    def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=..., amsgrad: bool = ...) -> None: ...
+    def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=..., amsgrad: bool = ..., *, foreach: Optional[bool] = ..., maximize: bool = ..., capturable: bool = ..., differentiable: bool = ..., fused: bool = ...) -> None: ...
diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py
index 2e9088e29f86..f94a5790f00d 100644
--- a/torch/optim/adamax.py
+++ b/torch/optim/adamax.py
@@ -2,7 +2,7 @@
 from torch import Tensor
 
 from .optimizer import (Optimizer, _use_grad_for_differentiable, _get_value, _stack_if_compiling,
-                        _default_to_foreach, _differentiable_doc, _maximize_doc, _foreach_doc)
+                        _default_to_fused_or_foreach, _differentiable_doc, _maximize_doc, _foreach_doc)
 from typing import List, Optional
 from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
 
@@ -42,7 +42,7 @@ def __init__(
             maximize=maximize,
             differentiable=differentiable,
         )
-        super(Adamax, self).__init__(params, defaults)
+        super().__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
@@ -206,8 +206,8 @@ def adamax(
         )
 
     if foreach is None:
-        foreach = _default_to_foreach([params, grads, exp_avgs, exp_infs, state_steps],
-                                      differentiable=differentiable)
+        _, foreach = _default_to_fused_or_foreach([params, grads, exp_avgs, exp_infs, state_steps],
+                                                  differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")
diff --git a/torch/optim/adamw.py b/torch/optim/adamw.py
index d0de6d150643..d0af45372e3d 100644
--- a/torch/optim/adamw.py
+++ b/torch/optim/adamw.py
@@ -1,7 +1,8 @@
 import torch
 from torch import Tensor
-from .optimizer import (Optimizer, _use_grad_for_differentiable, _get_value, _dispatch_sqrt, _stack_if_compiling,
-                        _capturable_doc, _differentiable_doc, _foreach_doc, _maximize_doc, _default_to_foreach)
+from .optimizer import (Optimizer, _use_grad_for_differentiable, _get_value, _dispatch_sqrt,
+                        _stack_if_compiling, _capturable_doc, _differentiable_doc, _foreach_doc,
+                        _fused_doc, _maximize_doc, _default_to_fused_or_foreach)
 from typing import List, Optional
 from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
 
@@ -22,6 +23,7 @@ def __init__(
         foreach: Optional[bool] = None,
         capturable: bool = False,
         differentiable: bool = False,
+        fused: Optional[bool] = None,
     ):
         if not 0.0 <= lr:
             raise ValueError("Invalid learning rate: {}".format(lr))
@@ -43,8 +45,25 @@ def __init__(
             maximize=maximize,
             capturable=capturable,
             differentiable=differentiable,
+            fused=fused,
         )
-        super(AdamW, self).__init__(params, defaults)
+        super().__init__(params, defaults)
+
+        if fused:
+            if differentiable:
+                raise RuntimeError("`fused` does not support `differentiable`")
+            self._step_supports_amp_scaling = True
+            # TODO(crcrpar): [low prec params & their higher prec copy]
+            # Suppor AMP with FP16/BF16 model params which would need
+            # higher prec copy of params to do update math in higher prec to
+            # alleviate the loss of information.
+            if not all(
+                p.is_cuda and torch.is_floating_point(p)
+                for pg in self.param_groups for p in pg['params']
+            ):
+                raise RuntimeError("`fused=True` requires all the params to be CUDA, floating point Tensor")
+            if foreach:
+                raise RuntimeError("`fused` and `foreach` cannot be `True` together.")
 
     def __setstate__(self, state):
         super().__setstate__(state)
@@ -54,6 +73,7 @@ def __setstate__(self, state):
             group.setdefault("foreach", None)
             group.setdefault("capturable", False)
             group.setdefault("differentiable", False)
+            group.setdefault("fused", None)
         state_values = list(self.state.values())
         step_is_tensor = (len(state_values) != 0) and torch.is_tensor(
             state_values[0]["step"]
@@ -62,7 +82,17 @@ def __setstate__(self, state):
             for s in state_values:
                 s["step"] = torch.tensor(float(s["step"]))
 
-    def _init_group(self, group, params_with_grad, grads, amsgrad, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps):
+    def _init_group(
+        self,
+        group,
+        params_with_grad,
+        grads,
+        amsgrad,
+        exp_avgs,
+        exp_avg_sqs,
+        max_exp_avg_sqs,
+        state_steps,
+    ):
         for p in group["params"]:
             if p.grad is None:
                 continue
@@ -77,7 +107,7 @@ def _init_group(self, group, params_with_grad, grads, amsgrad, exp_avgs, exp_avg
             if len(state) == 0:
                 state["step"] = (
                     torch.zeros((1,), dtype=torch.float, device=p.device)
-                    if self.defaults["capturable"]
+                    if group["capturable"] or group["fused"]
                     else torch.tensor(0.0)
                 )
                 # Exponential moving average of gradient values
@@ -127,7 +157,16 @@ def step(self, closure=None):
             amsgrad = group["amsgrad"]
             beta1, beta2 = group["betas"]
 
-            self._init_group(group, params_with_grad, grads, amsgrad, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps)
+            self._init_group(
+                group,
+                params_with_grad,
+                grads,
+                amsgrad,
+                exp_avgs,
+                exp_avg_sqs,
+                max_exp_avg_sqs,
+                state_steps,
+            )
 
             adamw(
                 params_with_grad,
@@ -146,6 +185,9 @@ def step(self, closure=None):
                 foreach=group["foreach"],
                 capturable=group["capturable"],
                 differentiable=group["differentiable"],
+                fused=group["fused"],
+                grad_scale=getattr(self, "grad_scale", None),
+                found_inf=getattr(self, "found_inf", None),
             )
 
         return loss
@@ -206,7 +248,7 @@ def step(self, closure=None):
         {foreach}
         {capturable}
         {differentiable}
-
+        {fused}
     .. _Decoupled Weight Decay Regularization:
         https://arxiv.org/abs/1711.05101
     .. _On the Convergence of Adam and Beyond:
@@ -214,6 +256,7 @@ def step(self, closure=None):
 
     """.format(maximize=_maximize_doc,
                foreach=_foreach_doc,
+               fused=_fused_doc,
                capturable=_capturable_doc,
                differentiable=_differentiable_doc)
 
@@ -230,6 +273,9 @@ def adamw(
     foreach: Optional[bool] = None,
     capturable: bool = False,
     differentiable: bool = False,
+    fused: Optional[bool] = None,
+    grad_scale: Optional[Tensor] = None,
+    found_inf: Optional[Tensor] = None,
     *,
     amsgrad: bool,
     beta1: float,
@@ -249,16 +295,27 @@ def adamw(
             "API has changed, `state_steps` argument must contain a list of singleton tensors"
         )
 
-    # Respect when the user inputs False/True for foreach.
-    if foreach is None:
-        foreach = _default_to_foreach(
+    # Respect when the user inputs False/True for foreach or fused. We only want to change
+    # the default when neither have been user-specified. Note that we default to foreach
+    # and pass False to use_fused. This is not a mistake--we want to give the fused impl
+    # bake-in time before making it the default, even if it is typically faster.
+    if fused is None and foreach is None:
+        _, foreach = _default_to_fused_or_foreach(
             [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps],
-            differentiable=differentiable)
+            differentiable, use_fused=False)
+    if fused is None:
+        fused = False
+    if foreach is None:
+        foreach = False
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")
+    if fused and torch.jit.is_scripting():
+        raise RuntimeError("torch.jit.script not supported with fused optimizers")
 
-    if foreach and not torch.jit.is_scripting():
+    if fused and not torch.jit.is_scripting():
+        func = _fused_adamw
+    elif foreach and not torch.jit.is_scripting():
         func = _multi_tensor_adamw
     else:
         func = _single_tensor_adamw
@@ -279,6 +336,8 @@ def adamw(
         maximize=maximize,
         capturable=capturable,
         differentiable=differentiable,
+        grad_scale=grad_scale,
+        found_inf=found_inf,
     )
 
 
@@ -289,6 +348,8 @@ def _single_tensor_adamw(
     exp_avg_sqs: List[Tensor],
     max_exp_avg_sqs: List[Tensor],
     state_steps: List[Tensor],
+    grad_scale: Optional[Tensor],
+    found_inf: Optional[Tensor],
     *,
     amsgrad: bool,
     beta1: float,
@@ -301,6 +362,8 @@ def _single_tensor_adamw(
     differentiable: bool,
 ):
 
+    assert grad_scale is None and found_inf is None
+
     for i, param in enumerate(params):
         grad = grads[i] if not maximize else -grads[i]
         exp_avg = exp_avgs[i]
@@ -388,6 +451,8 @@ def _multi_tensor_adamw(
     exp_avg_sqs: List[Tensor],
     max_exp_avg_sqs: List[Tensor],
     state_steps: List[Tensor],
+    grad_scale: Optional[Tensor],
+    found_inf: Optional[Tensor],
     *,
     amsgrad: bool,
     beta1: float,
@@ -409,6 +474,8 @@ def _multi_tensor_adamw(
 
     assert not differentiable, "_foreach ops don't support autograd"
 
+    assert grad_scale is None and found_inf is None
+
     grouped_tensors = _group_tensors_by_device_and_dtype([
         params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps])
     for (device_params, device_grads, device_exp_avgs, device_exp_avg_sqs,
@@ -438,9 +505,8 @@ def _multi_tensor_adamw(
         torch._foreach_addcmul_(device_exp_avg_sqs, device_grads, device_grads, 1 - beta2)
 
         if capturable:
-            # TODO: use foreach_pow if/when foreach_pow is added
-            bias_correction1 = [torch.pow(beta1, step) for step in device_state_steps]
-            bias_correction2 = [torch.pow(beta2, step) for step in device_state_steps]
+            bias_correction1 = torch._foreach_pow(beta1, device_state_steps)
+            bias_correction2 = torch._foreach_pow(beta2, device_state_steps)
             # foreach_sub doesn't allow a scalar as the first arg
             torch._foreach_sub_(bias_correction1, 1)
             torch._foreach_sub_(bias_correction2, 1)
@@ -501,3 +567,69 @@ def _multi_tensor_adamw(
                 denom = torch._foreach_add(exp_avg_sq_sqrt, eps)
 
             torch._foreach_addcdiv_(device_params, device_exp_avgs, denom, step_size)
+
+
+def _fused_adamw(
+    params: List[Tensor],
+    grads: List[Tensor],
+    exp_avgs: List[Tensor],
+    exp_avg_sqs: List[Tensor],
+    max_exp_avg_sqs: List[Tensor],
+    state_steps: List[Tensor],
+    grad_scale: Optional[Tensor],
+    found_inf: Optional[Tensor],
+    *,
+    amsgrad: bool,
+    beta1: float,
+    beta2: float,
+    lr: float,
+    weight_decay: float,
+    eps: float,
+    maximize: bool,
+    capturable: bool,  # Needed for consistency.
+    differentiable: bool,
+) -> None:
+    if differentiable:
+        raise RuntimeError("_fused_adamw is not differentiable")
+    grad_scale_dict = {grad_scale.device: grad_scale} if grad_scale is not None else None
+    found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None
+    grouped_tensors = _group_tensors_by_device_and_dtype([params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps])
+    for (device, dtype) in grouped_tensors:
+        (
+            device_params,
+            device_grads,
+            device_exp_avgs,
+            device_exp_avg_sqs,
+            device_max_exp_avg_sqs,
+            device_state_steps,
+        ) = grouped_tensors[(device, dtype)]
+        if grad_scale is not None and found_inf is not None:
+            if device not in grad_scale_dict:
+                grad_scale_dict[device] = grad_scale.to(device, non_blocking=True)
+            if found_inf not in found_inf_dict:
+                found_inf_dict[device] = found_inf.to(device, non_blocking=True)
+            device_grad_scale = grad_scale_dict[device]
+            device_found_inf = found_inf_dict[device]
+        else:
+            device_grad_scale = None
+            device_found_inf = None
+        torch._foreach_add_(device_state_steps, 1)
+        torch._fused_adamw_(
+            device_params,
+            device_grads,
+            device_exp_avgs,
+            device_exp_avg_sqs,
+            device_max_exp_avg_sqs,
+            device_state_steps,
+            amsgrad=amsgrad,
+            lr=lr,
+            beta1=beta1,
+            beta2=beta2,
+            weight_decay=weight_decay,
+            eps=eps,
+            maximize=maximize,
+            grad_scale=device_grad_scale,
+            found_inf=device_found_inf,
+        )
+        if device_found_inf is not None:
+            torch._foreach_sub_(device_state_steps, [device_found_inf] * len(device_state_steps))
diff --git a/torch/optim/adamw.pyi b/torch/optim/adamw.pyi
index 8f6618fdcb95..5c8843568886 100644
--- a/torch/optim/adamw.pyi
+++ b/torch/optim/adamw.pyi
@@ -1,5 +1,5 @@
-from typing import Tuple
+from typing import Tuple, Optional
 from .optimizer import _params_t, Optimizer
 
 class AdamW(Optimizer):
-    def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=..., amsgrad: bool = ...) -> None: ...
+    def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=..., amsgrad: bool = ..., *, foreach: Optional[bool] = ..., maximize: bool = ..., capturable: bool = ..., differentiable: bool = ..., fused: bool = ...) -> None: ...
diff --git a/torch/optim/asgd.py b/torch/optim/asgd.py
index 61c3147e57b6..918a75f390e3 100644
--- a/torch/optim/asgd.py
+++ b/torch/optim/asgd.py
@@ -1,8 +1,8 @@
 import torch
 from torch import Tensor
 
-from .optimizer import (Optimizer, _use_grad_for_differentiable, _get_value,
-                        _default_to_foreach, _differentiable_doc, _foreach_doc, _maximize_doc)
+from .optimizer import (Optimizer, _use_grad_for_differentiable, _get_value, _default_to_fused_or_foreach,
+                        _differentiable_doc, _foreach_doc, _maximize_doc)
 from torch._utils import is_compiling
 from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
 from typing import List, Optional
@@ -43,7 +43,7 @@ def __init__(
             maximize=maximize,
             differentiable=differentiable,
         )
-        super(ASGD, self).__init__(params, defaults)
+        super().__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
@@ -185,8 +185,8 @@ def asgd(
     """
 
     if foreach is None:
-        foreach = _default_to_foreach([params, grads, axs, mus, etas, state_steps],
-                                      differentiable=differentiable)
+        _, foreach = _default_to_fused_or_foreach([params, grads, axs, mus, etas, state_steps],
+                                                  differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")
diff --git a/torch/optim/lbfgs.py b/torch/optim/lbfgs.py
index 9f9336128699..377236fc05ee 100644
--- a/torch/optim/lbfgs.py
+++ b/torch/optim/lbfgs.py
@@ -231,7 +231,7 @@ def __init__(self,
             tolerance_change=tolerance_change,
             history_size=history_size,
             line_search_fn=line_search_fn)
-        super(LBFGS, self).__init__(params, defaults)
+        super().__init__(params, defaults)
 
         if len(self.param_groups) != 1:
             raise ValueError("LBFGS doesn't support per-parameter options "
diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py
index bded76c62802..89a377e1205c 100644
--- a/torch/optim/lr_scheduler.py
+++ b/torch/optim/lr_scheduler.py
@@ -1,6 +1,6 @@
 import types
 import math
-from torch._six import inf
+from torch import inf
 from functools import wraps
 import warnings
 import weakref
@@ -23,7 +23,7 @@
 )
 
 
-class LRScheduler(object):
+class LRScheduler:
 
     def __init__(self, optimizer, last_epoch=-1, verbose=False):
 
@@ -218,7 +218,7 @@ def __init__(self, optimizer, lr_lambda, last_epoch=-1, verbose=False):
                 raise ValueError("Expected {} lr_lambdas, but got {}".format(
                     len(optimizer.param_groups), len(lr_lambda)))
             self.lr_lambdas = list(lr_lambda)
-        super(LambdaLR, self).__init__(optimizer, last_epoch, verbose)
+        super().__init__(optimizer, last_epoch, verbose)
 
     def state_dict(self):
         """Returns the state of the scheduler as a :class:`dict`.
@@ -302,7 +302,7 @@ def __init__(self, optimizer, lr_lambda, last_epoch=-1, verbose=False):
                 raise ValueError("Expected {} lr_lambdas, but got {}".format(
                     len(optimizer.param_groups), len(lr_lambda)))
             self.lr_lambdas = list(lr_lambda)
-        super(MultiplicativeLR, self).__init__(optimizer, last_epoch, verbose)
+        super().__init__(optimizer, last_epoch, verbose)
 
     def state_dict(self):
         """Returns the state of the scheduler as a :class:`dict`.
@@ -382,7 +382,7 @@ class StepLR(LRScheduler):
     def __init__(self, optimizer, step_size, gamma=0.1, last_epoch=-1, verbose=False):
         self.step_size = step_size
         self.gamma = gamma
-        super(StepLR, self).__init__(optimizer, last_epoch, verbose)
+        super().__init__(optimizer, last_epoch, verbose)
 
     def get_lr(self):
         if not self._get_lr_called_within_step:
@@ -430,7 +430,7 @@ class MultiStepLR(LRScheduler):
     def __init__(self, optimizer, milestones, gamma=0.1, last_epoch=-1, verbose=False):
         self.milestones = Counter(milestones)
         self.gamma = gamma
-        super(MultiStepLR, self).__init__(optimizer, last_epoch, verbose)
+        super().__init__(optimizer, last_epoch, verbose)
 
     def get_lr(self):
         if not self._get_lr_called_within_step:
@@ -443,7 +443,7 @@ def get_lr(self):
                 for group in self.optimizer.param_groups]
 
     def _get_closed_form_lr(self):
-        milestones = list(sorted(self.milestones.elements()))
+        milestones = sorted(self.milestones.elements())
         return [base_lr * self.gamma ** bisect_right(milestones, self.last_epoch)
                 for base_lr in self.base_lrs]
 
@@ -484,7 +484,7 @@ def __init__(self, optimizer, factor=1.0 / 3, total_iters=5, last_epoch=-1, verb
 
         self.factor = factor
         self.total_iters = total_iters
-        super(ConstantLR, self).__init__(optimizer, last_epoch, verbose)
+        super().__init__(optimizer, last_epoch, verbose)
 
     def get_lr(self):
         if not self._get_lr_called_within_step:
@@ -551,7 +551,7 @@ def __init__(self, optimizer, start_factor=1.0 / 3, end_factor=1.0, total_iters=
         self.start_factor = start_factor
         self.end_factor = end_factor
         self.total_iters = total_iters
-        super(LinearLR, self).__init__(optimizer, last_epoch, verbose)
+        super().__init__(optimizer, last_epoch, verbose)
 
     def get_lr(self):
         if not self._get_lr_called_within_step:
@@ -588,7 +588,7 @@ class ExponentialLR(LRScheduler):
 
     def __init__(self, optimizer, gamma, last_epoch=-1, verbose=False):
         self.gamma = gamma
-        super(ExponentialLR, self).__init__(optimizer, last_epoch, verbose)
+        super().__init__(optimizer, last_epoch, verbose)
 
     def get_lr(self):
         if not self._get_lr_called_within_step:
@@ -807,7 +807,7 @@ class CosineAnnealingLR(LRScheduler):
     def __init__(self, optimizer, T_max, eta_min=0, last_epoch=-1, verbose=False):
         self.T_max = T_max
         self.eta_min = eta_min
-        super(CosineAnnealingLR, self).__init__(optimizer, last_epoch, verbose)
+        super().__init__(optimizer, last_epoch, verbose)
 
     def get_lr(self):
         if not self._get_lr_called_within_step:
@@ -910,7 +910,7 @@ def load_state_dict(self, state_dict):
             self._schedulers[idx].load_state_dict(s)
 
 
-class ReduceLROnPlateau(object):
+class ReduceLROnPlateau:
     """Reduce learning rate when a metric has stopped improving.
     Models often benefit from reducing the learning rate by a factor
     of 2-10 once learning stagnates. This scheduler reads a metrics
@@ -974,7 +974,7 @@ def __init__(self, optimizer, mode='min', factor=0.1, patience=10,
                 type(optimizer).__name__))
         self.optimizer = optimizer
 
-        if isinstance(min_lr, list) or isinstance(min_lr, tuple):
+        if isinstance(min_lr, (list, tuple)):
             if len(min_lr) != len(optimizer.param_groups):
                 raise ValueError("expected {} min_lrs, got {}".format(
                     len(optimizer.param_groups), len(min_lr)))
@@ -1237,7 +1237,7 @@ def __init__(self,
             self.base_momentums = [group['momentum'] for group in optimizer.param_groups]
             self.max_momentums = self._format_param('max_momentum', optimizer, max_momentum)
 
-        super(CyclicLR, self).__init__(optimizer, last_epoch, verbose)
+        super().__init__(optimizer, last_epoch, verbose)
         self.base_lrs = base_lrs
 
     def _init_scale_fn(self):
@@ -1372,7 +1372,7 @@ def __init__(self, optimizer, T_0, T_mult=1, eta_min=0, last_epoch=-1, verbose=F
         self.T_mult = T_mult
         self.eta_min = eta_min
         self.T_cur = last_epoch
-        super(CosineAnnealingWarmRestarts, self).__init__(optimizer, last_epoch, verbose)
+        super().__init__(optimizer, last_epoch, verbose)
 
     def get_lr(self):
         if not self._get_lr_called_within_step:
@@ -1551,6 +1551,7 @@ class OneCycleLR(LRScheduler):
         >>> for epoch in range(10):
         >>>     for batch in data_loader:
         >>>         train_batch(...)
+        >>>         optimizer.step()
         >>>         scheduler.step()
 
 
@@ -1673,7 +1674,7 @@ def __init__(self,
                     group['max_momentum'] = m_momentum
                     group['base_momentum'] = b_momentum
 
-        super(OneCycleLR, self).__init__(optimizer, last_epoch, verbose)
+        super().__init__(optimizer, last_epoch, verbose)
 
     def _format_param(self, name, optimizer, param):
         """Return correctly formatted lr/momentum for each param group."""
diff --git a/torch/optim/nadam.py b/torch/optim/nadam.py
index df3ae1f13bec..17d2d986c56f 100644
--- a/torch/optim/nadam.py
+++ b/torch/optim/nadam.py
@@ -1,7 +1,7 @@
 import torch
 from torch import Tensor
 from .optimizer import (Optimizer, _use_grad_for_differentiable, _get_value, _dispatch_sqrt, _stack_if_compiling,
-                        _differentiable_doc, _foreach_doc, _default_to_foreach)
+                        _differentiable_doc, _foreach_doc, _default_to_fused_or_foreach)
 from typing import List, Optional
 from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
 
@@ -26,7 +26,7 @@ def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8,
         defaults = dict(lr=lr, betas=betas, eps=eps,
                         weight_decay=weight_decay, momentum_decay=momentum_decay,
                         foreach=foreach, differentiable=differentiable)
-        super(NAdam, self).__init__(params, defaults)
+        super().__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
@@ -187,8 +187,8 @@ def nadam(params: List[Tensor],
         raise RuntimeError("API has changed, `mu_products` argument must contain a list of singleton tensors")
 
     if foreach is None:
-        foreach = _default_to_foreach([params, grads, exp_avgs, exp_avg_sqs,
-                                       mu_products, state_steps], differentiable=differentiable)
+        _, foreach = _default_to_fused_or_foreach([params, grads, exp_avgs, exp_avg_sqs, mu_products, state_steps],
+                                                  differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError('torch.jit.script not supported with foreach optimizers')
diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py
index aadb0ff37d24..718447e9bba3 100644
--- a/torch/optim/optimizer.py
+++ b/torch/optim/optimizer.py
@@ -6,7 +6,7 @@
 import functools
 import math
 
-from typing import Callable, Dict, List
+from typing import Callable, Dict, List, Tuple
 
 import torch.utils.hooks as hooks
 from torch.utils.hooks import RemovableHandle
@@ -16,7 +16,7 @@
 _global_optimizer_pre_hooks: Dict[int, Callable] = OrderedDict()
 _global_optimizer_post_hooks: Dict[int, Callable] = OrderedDict()
 
-class _RequiredParameter(object):
+class _RequiredParameter:
     """Singleton class representing a required parameter for an Optimizer."""
     def __repr__(self):
         return "<required parameter>"
@@ -54,17 +54,27 @@ def _dispatch_sqrt(x: float):  # float annotation is needed because of torchscri
     else:
         return math.sqrt(x)
 
-
-# We try to use the foreach implementation on CUDA whenever possible since
-# it is faster than the for-loop implementation. However, the foreach
-# implementation is not differentiable, so we must check differentiable=False.
-def _default_to_foreach(tensorlists: List[List[torch.Tensor]], differentiable: bool = False) -> bool:
+# For any optimizer with a faster implementation, we attempt to default to the
+# fastest + stablest whenever possible. For foreach, the requirements are to have
+# native tensors all on CUDA. For fused, there's currently the additional requirement
+# that the tensors' dtypes must be floating point. Neither alternative supports
+# torch.jit.script nor differentiable, so we fall back to the single tensor
+# implementation in those cases.
+def _default_to_fused_or_foreach(tensorlists: List[List[torch.Tensor]],
+                                 differentiable: bool,
+                                 use_fused: bool = False) -> Tuple[bool, bool]:
     if torch.jit.is_scripting() or differentiable:
-        return False
+        return False, False
     all_tensors = []
     for tensorlist in tensorlists:
         all_tensors.extend(tensorlist)
-    return all(p.is_cuda for p in all_tensors)
+    fused = use_fused and all(
+        p is None or (type(p) == torch.Tensor and p.is_cuda and torch.is_floating_point(p)) for p in all_tensors
+    )
+    foreach = not fused and all(
+        p is None or (type(p) == torch.Tensor and p.is_cuda) for p in all_tensors
+    )
+    return fused, foreach
 
 
 # Common doc strings among optimizers
@@ -73,6 +83,23 @@ def _default_to_foreach(tensorlists: List[List[torch.Tensor]], differentiable: b
             foreach over the for-loop implementation on CUDA, since it is usually
             significantly more performant. (default: None)"""
 
+_fused_doc = r"""fused (bool, optional): whether the fused implementation (CUDA only) is used.
+            Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16`
+            are supported. (default: None)
+
+    .. note:: The foreach and fused implementations are typically faster than the for-loop,
+              single-tensor implementation. Thus, if the user has not specified BOTH flags
+              (i.e., when foreach = fused = None), we will attempt defaulting to the foreach
+              implementation when the tensors are all on CUDA. For example, if the user specifies
+              True for fused but nothing for foreach, we will run the fused implementation. If
+              the user specifies False for foreach but nothing for fused (or False for fused but
+              nothing for foreach), we will run the for-loop implementation. If the user specifies
+              True for both foreach and fused, we will prioritize fused over foreach, as it is
+              typically faster. We attempt to use the fastest, so the hierarchy goes fused ->
+              foreach -> for-loop. HOWEVER, since the fused implementation is relatively new,
+              we want to give it sufficient bake-in time, so we default to foreach and NOT
+              fused when the user has not specified either flag."""
+
 _capturable_doc = r"""capturable (bool, optional): whether this instance is safe to
             capture in a CUDA graph. Passing True can impair ungraphed performance,
             so if you don't intend to graph capture this instance, leave it False
@@ -126,7 +153,7 @@ def register_optimizer_step_post_hook(hook: Callable[..., None]) -> RemovableHan
     return handle
 
 
-class Optimizer(object):
+class Optimizer:
     r"""Base class for all optimizers.
 
     .. warning::
@@ -204,19 +231,21 @@ def _cuda_graph_capture_health_check(self):
         if torch.has_cuda and torch.cuda.is_available():
             capturing = torch.cuda.is_current_stream_capturing()
 
-            if capturing and not self.defaults['capturable']:
+            if capturing and not all(group['capturable'] for group in self.param_groups):
                 raise RuntimeError("Attempting CUDA graph capture of step() for an instance of " +
                                    self.__class__.__name__ +
-                                   " but this instance was constructed with capturable=False.")
+                                   " but param_groups' capturable is False.")
 
             if (
                 (not getattr(self, "_warned_capturable_if_run_uncaptured", False))
-                and self.defaults["capturable"]
+                and all(group['capturable'] for group in self.param_groups)
                 and (not capturing)
             ):
-                print("Warning: This instance was constructed with capturable=True, but step() " +
-                      "is running without CUDA graph capture. If you never intend to graph-capture this " +
-                      "instance, capturable=True can impair performance, and you should set capturable=False.")
+                warnings.warn(
+                    "This instance was constructed with capturable=True or some of all the param_groups came with capturable=True, "
+                    "but step() is running without CUDA graph capture. If you never intend to graph-capture this "
+                    "instance, capturable=True can impair performance, and you should set capturable=False."
+                )
                 self._warned_capturable_if_run_uncaptured = True
 
     def _optimizer_step_code(self):
@@ -405,8 +434,8 @@ def update_group(group, new_group):
             update_group(g, ng) for g, ng in zip(groups, saved_groups)]
         self.__setstate__({'state': state, 'param_groups': param_groups})
 
-    def zero_grad(self, set_to_none: bool = False):
-        r"""Sets the gradients of all optimized :class:`torch.Tensor` s to zero.
+    def zero_grad(self, set_to_none: bool = True):
+        r"""Resets the gradients of all optimized :class:`torch.Tensor` s.
 
         Args:
             set_to_none (bool): instead of setting to zero, set the grads to None.
diff --git a/torch/optim/radam.py b/torch/optim/radam.py
index 6c209317fb49..3cbd9d5923c7 100644
--- a/torch/optim/radam.py
+++ b/torch/optim/radam.py
@@ -3,7 +3,7 @@
 from torch import Tensor
 
 from .optimizer import (Optimizer, _use_grad_for_differentiable, _get_value, _dispatch_sqrt, _stack_if_compiling,
-                        _default_to_foreach, _differentiable_doc, _foreach_doc)
+                        _default_to_fused_or_foreach, _differentiable_doc, _foreach_doc)
 from typing import List, Optional
 from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
 
@@ -40,7 +40,7 @@ def __init__(
             foreach=foreach,
             differentiable=differentiable,
         )
-        super(RAdam, self).__init__(params, defaults)
+        super().__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
@@ -209,8 +209,8 @@ def radam(
         )
 
     if foreach is None:
-        foreach = _default_to_foreach([params, grads, exp_avgs, exp_avg_sqs, state_steps],
-                                      differentiable=differentiable)
+        _, foreach = _default_to_fused_or_foreach([params, grads, exp_avgs, exp_avg_sqs, state_steps],
+                                                  differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")
diff --git a/torch/optim/rmsprop.py b/torch/optim/rmsprop.py
index 2c3eb5c553d8..29a4275aaf0d 100644
--- a/torch/optim/rmsprop.py
+++ b/torch/optim/rmsprop.py
@@ -1,6 +1,6 @@
 import torch
 from torch import Tensor
-from .optimizer import (Optimizer, _default_to_foreach, _use_grad_for_differentiable,
+from .optimizer import (Optimizer, _default_to_fused_or_foreach, _use_grad_for_differentiable,
                         _differentiable_doc, _foreach_doc, _maximize_doc)
 from typing import List, Optional
 from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
@@ -44,7 +44,7 @@ def __init__(
             maximize=maximize,
             differentiable=differentiable,
         )
-        super(RMSprop, self).__init__(params, defaults)
+        super().__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
@@ -220,8 +220,8 @@ def rmsprop(
     """
 
     if foreach is None:
-        foreach = _default_to_foreach([params, grads, square_avgs, grad_avgs, momentum_buffer_list],
-                                      differentiable=differentiable)
+        _, foreach = _default_to_fused_or_foreach([params, grads, square_avgs, grad_avgs, momentum_buffer_list],
+                                                  differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")
diff --git a/torch/optim/rprop.py b/torch/optim/rprop.py
index 7d8872d73cec..8bee98932c70 100644
--- a/torch/optim/rprop.py
+++ b/torch/optim/rprop.py
@@ -1,6 +1,6 @@
 import torch
 from torch import Tensor
-from .optimizer import (Optimizer, _use_grad_for_differentiable, _default_to_foreach,
+from .optimizer import (Optimizer, _use_grad_for_differentiable, _default_to_fused_or_foreach,
                         _differentiable_doc, _foreach_doc, _maximize_doc)
 from typing import List, Optional
 from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
@@ -33,7 +33,7 @@ def __init__(
             maximize=maximize,
             differentiable=differentiable,
         )
-        super(Rprop, self).__init__(params, defaults)
+        super().__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
@@ -192,7 +192,8 @@ def rprop(
     """
 
     if foreach is None:
-        foreach = _default_to_foreach([params, grads, prevs, step_sizes], differentiable=differentiable)
+        _, foreach = _default_to_fused_or_foreach([params, grads, prevs, step_sizes],
+                                                  differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")
diff --git a/torch/optim/sgd.py b/torch/optim/sgd.py
index 200d0b3f6aa9..e82cf5fdcce8 100644
--- a/torch/optim/sgd.py
+++ b/torch/optim/sgd.py
@@ -1,6 +1,7 @@
 import torch
 from torch import Tensor
-from .optimizer import Optimizer, required, _use_grad_for_differentiable, _differentiable_doc, _maximize_doc
+from .optimizer import (Optimizer, required, _use_grad_for_differentiable, _default_to_fused_or_foreach,
+                        _differentiable_doc, _foreach_doc, _maximize_doc)
 from typing import List, Optional
 from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
 
@@ -23,7 +24,7 @@ def __init__(self, params, lr=required, momentum=0, dampening=0,
                         differentiable=differentiable)
         if nesterov and (momentum <= 0 or dampening != 0):
             raise ValueError("Nesterov momentum requires a momentum and zero dampening")
-        super(SGD, self).__init__(params, defaults)
+        super().__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
@@ -137,10 +138,9 @@ def step(self, closure=None):
         dampening (float, optional): dampening for momentum (default: 0)
         nesterov (bool, optional): enables Nesterov momentum (default: False)
         {maximize}
-        foreach (bool, optional): whether foreach implementation of optimizer
-            is used (default: None)
+        {foreach}
         {differentiable}
-    """.format(maximize=_maximize_doc, differentiable=_differentiable_doc) + r"""
+    """.format(maximize=_maximize_doc, foreach=_foreach_doc, differentiable=_differentiable_doc) + r"""
 
     Example:
         >>> # xdoctest: +SKIP
@@ -190,7 +190,7 @@ def sgd(params: List[Tensor],
         # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
         # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
         has_sparse_grad: bool = None,
-        foreach: bool = None,
+        foreach: Optional[bool] = None,
         *,
         weight_decay: float,
         momentum: float,
@@ -204,8 +204,13 @@ def sgd(params: List[Tensor],
     """
 
     if foreach is None:
-        # Placeholder for more complex foreach logic to be added when value is not set
-        foreach = False
+        # why must we be explicit about an if statement for torch.jit.is_scripting here?
+        # because JIT can't handle Optionals nor fancy conditionals when scripting
+        if not torch.jit.is_scripting():
+            _, foreach = _default_to_fused_or_foreach([params, d_p_list, momentum_buffer_list],
+                                                      differentiable=False, use_fused=False)
+        else:
+            foreach = False
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError('torch.jit.script not supported with foreach optimizers')
diff --git a/torch/optim/sparse_adam.py b/torch/optim/sparse_adam.py
index 1761d814960c..75b4d00a2173 100644
--- a/torch/optim/sparse_adam.py
+++ b/torch/optim/sparse_adam.py
@@ -33,7 +33,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, maximize: bool
             )
 
         defaults = dict(lr=lr, betas=betas, eps=eps, maximize=maximize)
-        super(SparseAdam, self).__init__(params, defaults)
+        super().__init__(params, defaults)
 
     @torch.no_grad()
     def step(self, closure=None):
diff --git a/torch/optim/swa_utils.py b/torch/optim/swa_utils.py
index 52d4182e3689..dda4b8ad504d 100644
--- a/torch/optim/swa_utils.py
+++ b/torch/optim/swa_utils.py
@@ -100,7 +100,7 @@ class AveragedModel(Module):
         https://arxiv.org/abs/2001.02312
     """
     def __init__(self, model, device=None, avg_fn=None, use_buffers=False):
-        super(AveragedModel, self).__init__()
+        super().__init__()
         self.module = deepcopy(model)
         if device is not None:
             self.module = self.module.to(device)
@@ -254,7 +254,7 @@ def __init__(self, optimizer, swa_lr, anneal_epochs=10, anneal_strategy='cos', l
         if not isinstance(anneal_epochs, int) or anneal_epochs < 0:
             raise ValueError(f"anneal_epochs must be equal or greater than 0, got {anneal_epochs}")
         self.anneal_epochs = anneal_epochs
-        super(SWALR, self).__init__(optimizer, last_epoch)
+        super().__init__(optimizer, last_epoch)
 
     @staticmethod
     def _format_param(optimizer, swa_lrs):
diff --git a/torch/overrides.py b/torch/overrides.py
index 60a69cea281e..663704597090 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -144,6 +144,7 @@ def get_ignored_functions() -> Set[Callable]:
         torch.cudnn_grid_sampler,
         torch.cudnn_is_acceptable,
         torch.empty,
+        torch.empty_permuted,
         torch.empty_strided,
         torch.empty_quantized,
         torch.eye,
@@ -200,6 +201,8 @@ def get_ignored_functions() -> Set[Callable]:
         torch.nn.functional.sigmoid,
         torch.nn.functional.hardsigmoid,
         torch.nn.functional.tanh,
+        torch.nn.functional._canonical_mask,
+        torch.nn.functional._none_or_dtype,
         # Doesn't actually take or return tensor arguments
         torch.nn.init.calculate_gain,
         # These are deprecated; don't test them
@@ -275,6 +278,7 @@ def get_ignored_functions() -> Set[Callable]:
         Tensor.new_full,
         Tensor._make_subclass,
         Tensor.solve,
+        Tensor.symeig,
         Tensor.stride,
         Tensor.unflatten,
         Tensor.to_sparse_coo,
@@ -859,6 +863,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
         torch.nn.functional.selu: lambda input, inplace=False: -1,
         torch.nn.functional.silu: lambda input, inplace=False: -1,
         torch.nn.functional.mish: lambda input, inplace=False: -1,
+        torch.nn.functional.scaled_dot_product_attention: lambda query, key, value, attn_mask=None, dropout_p=0.0: -1,
         torch.nn.functional.smooth_l1_loss: lambda input, target, size_average=None, reduce=None, reduction='mean', beta=1.: -1,
         torch.nn.functional.huber_loss: lambda input, target, reduction='mean', delta=1.: -1,
         torch.nn.functional.soft_margin_loss: lambda input, target, size_average=None, reduce=None, reduction='mean': -1,
@@ -967,7 +972,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
         torch.scatter_add: lambda input, dim, index, src: -1,
         torch.scatter_reduce: lambda input, dim, index, src, reduce, include_self=True: -1,
         torch.searchsorted: lambda sorted_sequence, input, out_int32=False, right=False, out=None: -1,
-        torch.segment_reduce: lambda data, reduce="max", lengths=None, indices=None, offsets=None, axis=0, unsafe=False: -1,
+        torch._segment_reduce: lambda data, reduce="max", lengths=None, indices=None, offsets=None, axis=0, unsafe=False: -1,
         torch.select: lambda input, dim, index: -1,
         torch.select_scatter: lambda input, src, dim, index: -1,
         torch.slice_scatter: lambda input, src, dim=0, start=None, end=None, step=1: -1,
@@ -1006,7 +1011,6 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
         torch.svd_lowrank: lambda input, q=6, niter=2, M=None: -1,
         torch.linalg.svd: lambda input, full_matrices=True, out=None: -1,
         torch.linalg.svdvals: lambda input, out=None: -1,
-        torch.symeig: lambda input, eigenvectors=False, upper=True, out=None: -1,
         torch.swapaxes: lambda input, dim0, dim1: -1,
         torch.swapdims: lambda input, axis0, axis1: -1,
         torch.special.airy_ai: lambda input: -1,
@@ -1293,6 +1297,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
         Tensor.ndimension: lambda self: -1,
         Tensor.nelement: lambda self: -1,
         Tensor._nested_tensor_size: lambda self: -1,
+        Tensor._nested_tensor_strides: lambda self: -1,
         Tensor.normal_: lambda self: -1,
         Tensor.numpy: lambda self: -1,
         Tensor.permute: lambda self, dim: -1,
@@ -1611,7 +1616,7 @@ def _get_overridable_functions() -> Tuple[Dict[Any, List[Callable]], Dict[Callab
     overridable_funcs = collections.defaultdict(list)
     index = {}
     tested_namespaces = [
-        ("torch", torch, torch.__all__ + dir(torch._C._VariableFunctions)),
+        ("torch", torch, torch.__all__),
         ("torch.functional", torch.functional, torch.functional.__all__),
         ("torch.nn.functional", torch.nn.functional, dir(torch.nn.functional)),
         ("torch.nn.init", torch.nn.init, dir(torch.nn.init)),
@@ -1708,7 +1713,7 @@ def resolve_name(f):
         Name of the function; if eval'ed it should give back the input
         function.
     """
-    if isinstance(f, torch._ops.OpOverload) or isinstance(f, torch._ops.OpOverloadPacket):
+    if isinstance(f, (torch._ops.OpOverload, torch._ops.OpOverloadPacket)):
         return str(f)
     return _get_overridable_functions()[1].get(f)
 
diff --git a/torch/package/_directory_reader.py b/torch/package/_directory_reader.py
index 30833493c4fb..35a57cb1c015 100644
--- a/torch/package/_directory_reader.py
+++ b/torch/package/_directory_reader.py
@@ -6,7 +6,7 @@
 from torch.types import Storage
 
 # because get_storage_from_record returns a tensor!?
-class _HasStorage(object):
+class _HasStorage:
     def __init__(self, storage):
         self._storage = storage
 
@@ -14,7 +14,7 @@ def storage(self):
         return self._storage
 
 
-class DirectoryReader(object):
+class DirectoryReader:
     """
     Class to allow PackageImporter to operate on unzipped packages. Methods
     copy the behavior of the internal PyTorchFileReader class (which is used for
diff --git a/torch/package/_importlib.py b/torch/package/_importlib.py
index 63c9d7024bfb..62cabd7293a4 100644
--- a/torch/package/_importlib.py
+++ b/torch/package/_importlib.py
@@ -61,7 +61,7 @@ def _calc___package__(globals):
     spec = globals.get("__spec__")
     if package is not None:
         if spec is not None and package != spec.parent:
-            _warnings.warn(
+            _warnings.warn(  # noqa: G010
                 "__package__ != __spec__.parent " f"({package!r} != {spec.parent!r})",
                 ImportWarning,
                 stacklevel=3,
@@ -70,7 +70,7 @@ def _calc___package__(globals):
     elif spec is not None:
         return spec.parent
     else:
-        _warnings.warn(
+        _warnings.warn(  # noqa: G010
             "can't resolve package from __spec__ or __package__, "
             "falling back on __name__ and __path__",
             ImportWarning,
diff --git a/torch/package/_stdlib.py b/torch/package/_stdlib.py
index bddde3a60aae..a810d50661cb 100644
--- a/torch/package/_stdlib.py
+++ b/torch/package/_stdlib.py
@@ -17,10 +17,6 @@ def is_stdlib_module(module: str) -> bool:
 
 def _get_stdlib_modules():
     if sys.version_info.major == 3:
-        if sys.version_info.minor == 6:
-            return stdlib3_6
-        if sys.version_info.minor == 7:
-            return stdlib3_7
         if sys.version_info.minor == 8:
             return stdlib3_8
         if sys.version_info.minor == 9:
@@ -33,441 +29,6 @@ def _get_stdlib_modules():
     raise RuntimeError(f"Unsupported Python version: {sys.version_info}")
 
 
-stdlib3_6 = {
-    "_dummy_thread",
-    "_thread",
-    "abc",
-    "aifc",
-    "argparse",
-    "array",
-    "ast",
-    "asynchat",
-    "asyncio",
-    "asyncore",
-    "atexit",
-    "audioop",
-    "base64",
-    "bdb",
-    "binascii",
-    "binhex",
-    "bisect",
-    "builtins",
-    "bz2",
-    "cProfile",
-    "calendar",
-    "cgi",
-    "cgitb",
-    "chunk",
-    "cmath",
-    "cmd",
-    "code",
-    "codecs",
-    "codeop",
-    "collections",
-    "colorsys",
-    "compileall",
-    "concurrent",
-    "configparser",
-    "contextlib",
-    "copy",
-    "copyreg",
-    "crypt",
-    "csv",
-    "ctypes",
-    "curses",
-    "datetime",
-    "dbm",
-    "decimal",
-    "difflib",
-    "dis",
-    "distutils",
-    "doctest",
-    "dummy_threading",
-    "email",
-    "encodings",
-    "ensurepip",
-    "enum",
-    "errno",
-    "faulthandler",
-    "fcntl",
-    "filecmp",
-    "fileinput",
-    "fnmatch",
-    "formatter",
-    "fpectl",
-    "fractions",
-    "ftplib",
-    "functools",
-    "gc",
-    "getopt",
-    "getpass",
-    "gettext",
-    "glob",
-    "grp",
-    "gzip",
-    "hashlib",
-    "heapq",
-    "hmac",
-    "html",
-    "http",
-    "imaplib",
-    "imghdr",
-    "imp",
-    "importlib",
-    "inspect",
-    "io",
-    "ipaddress",
-    "itertools",
-    "json",
-    "keyword",
-    "lib2to3",
-    "linecache",
-    "locale",
-    "logging",
-    "lzma",
-    "macpath",
-    "mailbox",
-    "mailcap",
-    "marshal",
-    "math",
-    "mimetypes",
-    "mmap",
-    "modulefinder",
-    "msilib",
-    "msvcrt",
-    "multiprocessing",
-    "netrc",
-    "nis",
-    "nntplib",
-    "ntpath",
-    "numbers",
-    "operator",
-    "optparse",
-    "os",
-    "ossaudiodev",
-    "parser",
-    "pathlib",
-    "pdb",
-    "pickle",
-    "pickletools",
-    "pipes",
-    "pkgutil",
-    "platform",
-    "plistlib",
-    "poplib",
-    "posix",
-    "posixpath",
-    "pprint",
-    "profile",
-    "pstats",
-    "pty",
-    "pwd",
-    "py_compile",
-    "pyclbr",
-    "pydoc",
-    "queue",
-    "quopri",
-    "random",
-    "re",
-    "readline",
-    "reprlib",
-    "resource",
-    "rlcompleter",
-    "runpy",
-    "sched",
-    "secrets",
-    "select",
-    "selectors",
-    "shelve",
-    "shlex",
-    "shutil",
-    "signal",
-    "site",
-    "smtpd",
-    "smtplib",
-    "sndhdr",
-    "socket",
-    "socketserver",
-    "spwd",
-    "sqlite3",
-    "sre",
-    "sre_compile",
-    "sre_constants",
-    "sre_parse",
-    "ssl",
-    "stat",
-    "statistics",
-    "string",
-    "stringprep",
-    "struct",
-    "subprocess",
-    "sunau",
-    "symbol",
-    "symtable",
-    "sys",
-    "sysconfig",
-    "syslog",
-    "tabnanny",
-    "tarfile",
-    "telnetlib",
-    "tempfile",
-    "termios",
-    "test",
-    "textwrap",
-    "threading",
-    "time",
-    "timeit",
-    "tkinter",
-    "token",
-    "tokenize",
-    "trace",
-    "traceback",
-    "tracemalloc",
-    "tty",
-    "turtle",
-    "turtledemo",
-    "types",
-    "typing",
-    "unicodedata",
-    "unittest",
-    "urllib",
-    "uu",
-    "uuid",
-    "venv",
-    "warnings",
-    "wave",
-    "weakref",
-    "webbrowser",
-    "winreg",
-    "winsound",
-    "wsgiref",
-    "xdrlib",
-    "xml",
-    "xmlrpc",
-    "zipapp",
-    "zipfile",
-    "zipimport",
-    "zlib",
-}
-
-stdlib3_7 = {
-    "_dummy_thread",
-    "_thread",
-    "abc",
-    "aifc",
-    "argparse",
-    "array",
-    "ast",
-    "asynchat",
-    "asyncio",
-    "asyncore",
-    "atexit",
-    "audioop",
-    "base64",
-    "bdb",
-    "binascii",
-    "binhex",
-    "bisect",
-    "builtins",
-    "bz2",
-    "cProfile",
-    "calendar",
-    "cgi",
-    "cgitb",
-    "chunk",
-    "cmath",
-    "cmd",
-    "code",
-    "codecs",
-    "codeop",
-    "collections",
-    "colorsys",
-    "compileall",
-    "concurrent",
-    "configparser",
-    "contextlib",
-    "contextvars",
-    "copy",
-    "copyreg",
-    "crypt",
-    "csv",
-    "ctypes",
-    "curses",
-    "dataclasses",
-    "datetime",
-    "dbm",
-    "decimal",
-    "difflib",
-    "dis",
-    "distutils",
-    "doctest",
-    "dummy_threading",
-    "email",
-    "encodings",
-    "ensurepip",
-    "enum",
-    "errno",
-    "faulthandler",
-    "fcntl",
-    "filecmp",
-    "fileinput",
-    "fnmatch",
-    "formatter",
-    "fractions",
-    "ftplib",
-    "functools",
-    "gc",
-    "getopt",
-    "getpass",
-    "gettext",
-    "glob",
-    "grp",
-    "gzip",
-    "hashlib",
-    "heapq",
-    "hmac",
-    "html",
-    "http",
-    "imaplib",
-    "imghdr",
-    "imp",
-    "importlib",
-    "inspect",
-    "io",
-    "ipaddress",
-    "itertools",
-    "json",
-    "keyword",
-    "lib2to3",
-    "linecache",
-    "locale",
-    "logging",
-    "lzma",
-    "macpath",
-    "mailbox",
-    "mailcap",
-    "marshal",
-    "math",
-    "mimetypes",
-    "mmap",
-    "modulefinder",
-    "msilib",
-    "msvcrt",
-    "multiprocessing",
-    "netrc",
-    "nis",
-    "nntplib",
-    "ntpath",
-    "numbers",
-    "operator",
-    "optparse",
-    "os",
-    "ossaudiodev",
-    "parser",
-    "pathlib",
-    "pdb",
-    "pickle",
-    "pickletools",
-    "pipes",
-    "pkgutil",
-    "platform",
-    "plistlib",
-    "poplib",
-    "posix",
-    "posixpath",
-    "pprint",
-    "profile",
-    "pstats",
-    "pty",
-    "pwd",
-    "py_compile",
-    "pyclbr",
-    "pydoc",
-    "queue",
-    "quopri",
-    "random",
-    "re",
-    "readline",
-    "reprlib",
-    "resource",
-    "rlcompleter",
-    "runpy",
-    "sched",
-    "secrets",
-    "select",
-    "selectors",
-    "shelve",
-    "shlex",
-    "shutil",
-    "signal",
-    "site",
-    "smtpd",
-    "smtplib",
-    "sndhdr",
-    "socket",
-    "socketserver",
-    "spwd",
-    "sqlite3",
-    "sre",
-    "sre_compile",
-    "sre_constants",
-    "sre_parse",
-    "ssl",
-    "stat",
-    "statistics",
-    "string",
-    "stringprep",
-    "struct",
-    "subprocess",
-    "sunau",
-    "symbol",
-    "symtable",
-    "sys",
-    "sysconfig",
-    "syslog",
-    "tabnanny",
-    "tarfile",
-    "telnetlib",
-    "tempfile",
-    "termios",
-    "test",
-    "textwrap",
-    "threading",
-    "time",
-    "timeit",
-    "tkinter",
-    "token",
-    "tokenize",
-    "trace",
-    "traceback",
-    "tracemalloc",
-    "tty",
-    "turtle",
-    "turtledemo",
-    "types",
-    "typing",
-    "unicodedata",
-    "unittest",
-    "urllib",
-    "uu",
-    "uuid",
-    "venv",
-    "warnings",
-    "wave",
-    "weakref",
-    "webbrowser",
-    "winreg",
-    "winsound",
-    "wsgiref",
-    "xdrlib",
-    "xml",
-    "xmlrpc",
-    "zipapp",
-    "zipfile",
-    "zipimport",
-    "zlib",
-}
-
 stdlib3_8 = {
     "_dummy_thread",
     "_thread",
diff --git a/torch/package/find_file_dependencies.py b/torch/package/find_file_dependencies.py
index cc16c339ea34..af8cd9fec84d 100644
--- a/torch/package/find_file_dependencies.py
+++ b/torch/package/find_file_dependencies.py
@@ -1,5 +1,4 @@
 import ast
-import sys
 from typing import List, Optional, Tuple
 
 from ._importlib import _resolve_name
@@ -43,16 +42,10 @@ def visit_ImportFrom(self, node):
                 self.references[(name, None)] = True
 
     def _grab_node_int(self, node):
-        if sys.version_info[:2] < (3, 8):
-            return node.n
-        else:
-            return node.value
+        return node.value
 
     def _grab_node_str(self, node):
-        if sys.version_info[:2] < (3, 8):
-            return node.s
-        else:
-            return node.value
+        return node.value
 
     def visit_Call(self, node):
         # __import__ calls aren't routed to the visit_Import/From nodes
diff --git a/torch/package/package_exporter.py b/torch/package/package_exporter.py
index 347641e46431..f83a79efced6 100644
--- a/torch/package/package_exporter.py
+++ b/torch/package/package_exporter.py
@@ -131,7 +131,7 @@ class PackagingError(Exception):
     them to you at once.
     """
 
-    def __init__(self, dependency_graph: DiGraph):
+    def __init__(self, dependency_graph: DiGraph, debug=False):
         # Group errors by reason.
         broken: Dict[PackagingErrorReason, List[str]] = defaultdict(list)
         for module_name, attrs in dependency_graph.nodes.items():
@@ -154,7 +154,30 @@ def __init__(self, dependency_graph: DiGraph):
                 error_context = dependency_graph.nodes[module_name].get("error_context")
                 if error_context is not None:
                     message.write(f"      Context: {error_context}\n")
-
+                if module_name in _DISALLOWED_MODULES:
+                    message.write(
+                        (
+                            "      Note: While we usually use modules in the python standard library "
+                            f"from the local environment, `{module_name}` has a lot of system "
+                            "level access and therefore can pose a security risk. We heavily "
+                            f"recommend removing `{module_name}` from your packaged code. However, if that "
+                            "is not possible, add it to the extern list by calling "
+                            f'PackageExporter.extern("`{module_name}`")\n'
+                        )
+                    )
+                if debug:
+                    module_path = dependency_graph.first_path(module_name)
+                    message.write(
+                        f"      A path to {module_name}: {' -> '.join(module_path)}"
+                    )
+        if not debug:
+            message.write("\n")
+            message.write(
+                (
+                    "Set debug=True when invoking PackageExporter for a visualization of where "
+                    "broken modules are coming from!\n"
+                )
+            )
         # Save the dependency graph so that tooling can get at it.
         self.dependency_graph = dependency_graph
         super().__init__(message.getvalue())
@@ -195,6 +218,7 @@ def __init__(
         self,
         f: Union[str, Path, BinaryIO],
         importer: Union[Importer, Sequence[Importer]] = sys_importer,
+        debug: bool = False,
     ):
         """
         Create an exporter.
@@ -204,9 +228,10 @@ def __init__(
                 or a binary I/O object.
             importer: If a single Importer is passed, use that to search for modules.
                 If a sequence of importers are passed, an ``OrderedImporter`` will be constructed out of them.
+            debug: If set to True, add path of broken modules to PackagingErrors.
         """
         torch._C._log_api_usage_once("torch.package.PackageExporter")
-
+        self.debug = debug
         if isinstance(f, (Path, str)):
             f = str(f)
             self.buffer: Optional[BinaryIO] = None
@@ -979,7 +1004,7 @@ def _validate_dependency_graph(self):
         # 1. Check the graph for any errors inserted during dependency analysis.
         for module_name, attrs in self.dependency_graph.nodes.items():
             if "error" in attrs:
-                raise PackagingError(self.dependency_graph)
+                raise PackagingError(self.dependency_graph, debug=self.debug)
 
         # 2. Check that all patterns for which allow_empty=False have been matched at least once.
         for pattern, pattern_info in self.patterns.items():
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
index a52d8ed228ae..8cdfb55f749d 100644
--- a/torch/profiler/_memory_profiler.py
+++ b/torch/profiler/_memory_profiler.py
@@ -349,7 +349,7 @@ def __init__(self, op_tree: OpTree) -> None:
                     # the core PyTorch codebase.
                     if prior_size != new_size:
                         delta = f"{prior_size} vs. {new_size}"
-                        log.warn(f"Mismatch between allocation and free: {delta}")
+                        log.warning(f"Mismatch between allocation and free: {delta}")
 
         self._values.update(allocations)
 
diff --git a/torch/profiler/_pattern_matcher.py b/torch/profiler/_pattern_matcher.py
index 8003872bcdaf..1f161bc9ef36 100644
--- a/torch/profiler/_pattern_matcher.py
+++ b/torch/profiler/_pattern_matcher.py
@@ -413,6 +413,14 @@ def is_dataloader_function(name: str, function_name: str):
                 os.path.join("torch", "utils", "data",
                              "dataloader.py")) and name.endswith(function_name)
 
+        # TODO: fixme! Due to lifetime issues of the function name, this field might
+        # actually point to an already freed string when the even is a PyCall.
+        # Just silently skip this to unblock testing.
+        try:
+            event.name
+        except UnicodeDecodeError:
+            return False
+
         if not is_dataloader_function(event.name, "__iter__"):
             return False
         if not event.children:
diff --git a/torch/profiler/itt.py b/torch/profiler/itt.py
index f1c799d16c70..22f4dcf828c3 100644
--- a/torch/profiler/itt.py
+++ b/torch/profiler/itt.py
@@ -3,7 +3,7 @@
 try:
     from torch._C import _itt
 except ImportError:
-    class _ITTStub(object):
+    class _ITTStub:
         @staticmethod
         def _fail(*args, **kwargs):
             raise RuntimeError("ITT functions not installed. Are you sure you have a ITT build?")
diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py
index 9ebbc3ddf9ef..c50c0e62beb9 100644
--- a/torch/profiler/profiler.py
+++ b/torch/profiler/profiler.py
@@ -44,7 +44,7 @@ def supported_activities():
     return torch.autograd._supported_activities()
 
 
-class _KinetoProfile(object):
+class _KinetoProfile:
     """Low-level profiler wrap the autograd profile
 
     Args:
@@ -391,7 +391,7 @@ def trace_handler(prof):
                 torch.profiler.ProfilerActivity.CUDA,
             ],
 
-            # In this example with wait=1, warmup=1, active=2,
+            # In this example with wait=1, warmup=1, active=2, repeat=1,
             # profiler will skip the first step/iteration,
             # start warming up on the second, record
             # the third and the forth iterations,
@@ -402,7 +402,8 @@ def trace_handler(prof):
             schedule=torch.profiler.schedule(
                 wait=1,
                 warmup=1,
-                active=2),
+                active=2,
+                repeat=1),
             on_trace_ready=trace_handler
             # on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')
             # used when outputting for tensorboard
@@ -594,6 +595,13 @@ def unregister_callback(self):
             _remove_execution_graph_observer()
             self._registered = False
 
+    @property
+    def is_registered(self):
+        """
+        Return if the execution graph observer is registered.
+        """
+        return self._registered
+
     def start(self):
         """
         Starts to capture.
@@ -614,4 +622,10 @@ def get_output_file_path(self) -> str:
         """
         Returns the output file name.
         """
-        return self._output_file_path
+        if self.is_registered:
+            return self._output_file_path
+        else:
+            raise RuntimeError(
+                "A callback to the EG profiler needs to be registered "
+                "first before getting the output file path"
+            )
diff --git a/torch/quantization/fuse_modules.py b/torch/quantization/fuse_modules.py
index 896f3571aaa7..55bd8363524b 100644
--- a/torch/quantization/fuse_modules.py
+++ b/torch/quantization/fuse_modules.py
@@ -12,8 +12,8 @@
 from torch.ao.quantization.fuse_modules import get_fuser_method
 
 # for backward compatiblity
-from torch.quantization.fuser_method_mappings import fuse_conv_bn
-from torch.quantization.fuser_method_mappings import fuse_conv_bn_relu
+from torch.ao.quantization.fuser_method_mappings import fuse_conv_bn
+from torch.ao.quantization.fuser_method_mappings import fuse_conv_bn_relu
 
 # TODO: These functions are not used outside the `fuse_modules.py`
 #       Keeping here for now, need to remove them later.
diff --git a/torch/quantization/fx/pattern_utils.py b/torch/quantization/fx/pattern_utils.py
index 8dec26d45a19..d528f42a4937 100644
--- a/torch/quantization/fx/pattern_utils.py
+++ b/torch/quantization/fx/pattern_utils.py
@@ -16,11 +16,11 @@
 )
 
 # QuantizeHandler.__module__ = _NAMESPACE
-_register_fusion_pattern.__module__ = "torch.quantization.fx.pattern_utils"
-get_default_fusion_patterns.__module__ = "torch.quantization.fx.pattern_utils"
-_register_quant_pattern.__module__ = "torch.quantization.fx.pattern_utils"
-get_default_quant_patterns.__module__ = "torch.quantization.fx.pattern_utils"
-get_default_output_activation_post_process_map.__module__ = "torch.quantization.fx.pattern_utils"
+_register_fusion_pattern.__module__ = "torch.ao.quantization.fx.pattern_utils"
+get_default_fusion_patterns.__module__ = "torch.ao.quantization.fx.pattern_utils"
+_register_quant_pattern.__module__ = "torch.ao.quantization.fx.pattern_utils"
+get_default_quant_patterns.__module__ = "torch.ao.quantization.fx.pattern_utils"
+get_default_output_activation_post_process_map.__module__ = "torch.ao.quantization.fx.pattern_utils"
 
 # __all__ = [
 #     "QuantizeHandler",
diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py
index 6177e9bd04b8..50bfa0bfbe8e 100644
--- a/torch/quantization/fx/quantization_patterns.py
+++ b/torch/quantization/fx/quantization_patterns.py
@@ -23,17 +23,17 @@
     StandaloneModuleQuantizeHandler
 )
 
-QuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
-BinaryOpQuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
-CatQuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
-ConvReluQuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
-LinearReLUQuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
-BatchNormQuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
-EmbeddingQuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
-RNNDynamicQuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
-DefaultNodeQuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
-FixedQParamsOpQuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
-CopyNodeQuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
-CustomModuleQuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
-GeneralTensorShapeOpQuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
-StandaloneModuleQuantizeHandler.__module__ = "torch.quantization.fx.quantization_patterns"
+QuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
+BinaryOpQuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
+CatQuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
+ConvReluQuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
+LinearReLUQuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
+BatchNormQuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
+EmbeddingQuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
+RNNDynamicQuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
+DefaultNodeQuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
+FixedQParamsOpQuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
+CopyNodeQuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
+CustomModuleQuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
+GeneralTensorShapeOpQuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
+StandaloneModuleQuantizeHandler.__module__ = "torch.ao.quantization.fx.quantization_patterns"
diff --git a/torch/quasirandom.py b/torch/quasirandom.py
index b85a9bd2842d..c5086da283a4 100644
--- a/torch/quasirandom.py
+++ b/torch/quasirandom.py
@@ -2,7 +2,7 @@
 from typing import Optional
 
 
-class SobolEngine(object):
+class SobolEngine:
     r"""
     The :class:`torch.quasirandom.SobolEngine` is an engine for generating
     (scrambled) Sobol sequences. Sobol sequences are an example of low
diff --git a/torch/random.py b/torch/random.py
index f5156bf48730..e4795907a3a5 100644
--- a/torch/random.py
+++ b/torch/random.py
@@ -39,6 +39,10 @@ def manual_seed(seed) -> torch._C.Generator:
     if not torch.cuda._is_in_bad_fork():
         torch.cuda.manual_seed_all(seed)
 
+    import torch.mps
+    if not torch.mps._is_in_bad_fork():
+        torch.mps.manual_seed(seed)
+
     return default_generator.manual_seed(seed)
 
 
@@ -52,6 +56,10 @@ def seed() -> int:
     if not torch.cuda._is_in_bad_fork():
         torch.cuda.manual_seed_all(seed)
 
+    import torch.mps
+    if not torch.mps._is_in_bad_fork():
+        torch.mps.manual_seed(seed)
+
     return seed
 
 
diff --git a/torch/serialization.py b/torch/serialization.py
index 2a4a99e82d2a..83f6fa275bbb 100644
--- a/torch/serialization.py
+++ b/torch/serialization.py
@@ -10,12 +10,11 @@
 import warnings
 from contextlib import closing, contextmanager
 from ._utils import _import_dotted_name
-from ._six import string_classes as _string_classes
 from torch._sources import get_source_lines_and_file
 from torch.types import Storage
 from torch.storage import _get_dtype_from_pickle_storage_type
 from typing import Any, BinaryIO, Callable, cast, Dict, Optional, Type, Tuple, Union, IO
-from typing_extensions import TypeAlias
+from typing_extensions import TypeAlias  # Python 3.10+
 import copyreg
 import pickle
 import pathlib
@@ -234,11 +233,10 @@ def storage_to_tensor_type(storage):
 
 
 def _is_path(name_or_buffer):
-    return isinstance(name_or_buffer, str) or \
-        isinstance(name_or_buffer, pathlib.Path)
+    return isinstance(name_or_buffer, (str, pathlib.Path))
 
 
-class _opener(object):
+class _opener:
     def __init__(self, file_like):
         self.file_like = file_like
 
@@ -1080,7 +1078,7 @@ def _get_restore_location(map_location):
         def restore_location(storage, location):
             location = map_location.get(location, location)
             return default_restore_location(storage, location)
-    elif isinstance(map_location, _string_classes):
+    elif isinstance(map_location, str):
         def restore_location(storage, location):
             return default_restore_location(storage, map_location)
     elif isinstance(map_location, torch.device):
diff --git a/torch/sparse/__init__.py b/torch/sparse/__init__.py
index 3ceaf56fc203..6f05dfbb2209 100644
--- a/torch/sparse/__init__.py
+++ b/torch/sparse/__init__.py
@@ -61,41 +61,65 @@
 .. note::
     This function doesn't support computing derivaties with respect to CSR matrices.
 
-    Args:
-        mat1 (Tensor): the first sparse matrix to be multiplied
-        mat2 (Tensor): the second matrix to be multiplied, which could be sparse or dense
+    This function also additionally accepts an optional :attr:`reduce` argument that allows
+    specification of an optional reduction operation, mathematically performs the following operation:
 
-    Shape:
-        The format of the output tensor of this function follows:
-        - sparse x sparse -> sparse
-        - sparse x dense -> dense
+.. math::
 
-    Example::
+    z_{ij} = \bigoplus_{k = 0}^{K - 1} x_{ik} y_{kj}
 
-        >>> a = torch.randn(2, 3).to_sparse().requires_grad_(True)
-        >>> a
-        tensor(indices=tensor([[0, 0, 0, 1, 1, 1],
-                               [0, 1, 2, 0, 1, 2]]),
-               values=tensor([ 1.5901,  0.0183, -0.6146,  1.8061, -0.0112,  0.6302]),
-               size=(2, 3), nnz=6, layout=torch.sparse_coo, requires_grad=True)
-
-        >>> b = torch.randn(3, 2, requires_grad=True)
-        >>> b
-        tensor([[-0.6479,  0.7874],
-                [-1.2056,  0.5641],
-                [-1.1716, -0.9923]], requires_grad=True)
-
-        >>> y = torch.sparse.mm(a, b)
-        >>> y
-        tensor([[-0.3323,  1.8723],
-                [-1.8951,  0.7904]], grad_fn=<SparseAddmmBackward>)
-        >>> y.sum().backward()
-        >>> a.grad
-        tensor(indices=tensor([[0, 0, 0, 1, 1, 1],
-                               [0, 1, 2, 0, 1, 2]]),
-               values=tensor([ 0.1394, -0.6415, -2.1639,  0.1394, -0.6415, -2.1639]),
-               size=(2, 3), nnz=6, layout=torch.sparse_coo)
-    """)
+where :math:`\bigoplus` defines the reduce operator. :attr:`reduce` is implemented only for
+CSR storage format on CPU device.
+
+Args:
+    mat1 (Tensor): the first sparse matrix to be multiplied
+    mat2 (Tensor): the second matrix to be multiplied, which could be sparse or dense
+    reduce (str, optional): the reduction operation to apply for non-unique indices
+        (:obj:`"sum"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`). Default :obj:`"sum"`.
+
+Shape:
+    The format of the output tensor of this function follows:
+    - sparse x sparse -> sparse
+    - sparse x dense -> dense
+
+Example::
+
+    >>> a = torch.tensor([[1., 0, 2], [0, 3, 0]]).to_sparse().requires_grad_()
+    >>> a
+    tensor(indices=tensor([[0, 0, 1],
+                           [0, 2, 1]]),
+           values=tensor([1., 2., 3.]),
+           size=(2, 3), nnz=3, layout=torch.sparse_coo, requires_grad=True)
+    >>> b = torch.tensor([[0, 1.], [2, 0], [0, 0]], requires_grad=True)
+    >>> b
+    tensor([[0., 1.],
+            [2., 0.],
+            [0., 0.]], requires_grad=True)
+    >>> y = torch.sparse.mm(a, b)
+    >>> y
+    tensor([[0., 1.],
+            [6., 0.]], grad_fn=<SparseAddmmBackward0>)
+    >>> y.sum().backward()
+    >>> a.grad
+    tensor(indices=tensor([[0, 0, 1],
+                           [0, 2, 1]]),
+           values=tensor([1., 0., 2.]),
+           size=(2, 3), nnz=3, layout=torch.sparse_coo)
+    >>> c = a.detach().to_sparse_csr()
+    >>> c
+    tensor(crow_indices=tensor([0, 2, 3]),
+           col_indices=tensor([0, 2, 1]),
+           values=tensor([1., 2., 3.]), size=(2, 3), nnz=3,
+           layout=torch.sparse_csr)
+    >>> y1 = torch.sparse.mm(c, b, 'sum')
+    >>> y1
+    tensor([[0., 1.],
+            [6., 0.]], grad_fn=<SparseMmReduceImplBackward0>)
+    >>> y2 = torch.sparse.mm(c, b, 'max')
+    >>> y2
+    tensor([[0., 1.],
+            [6., 0.]], grad_fn=<SparseMmReduceImplBackward0>)
+""")
 
 
 sampled_addmm = _add_docstr(_sparse.sparse_sampled_addmm, r"""
@@ -149,7 +173,6 @@
         size=(3, 3), nnz=3, layout=torch.sparse_csr)
 """)
 
-
 def sum(input: Tensor, dim: DimOrDims = None,
         dtype: Optional[DType] = None) -> Tensor:
     r"""
@@ -359,7 +382,7 @@ def sum(input: Tensor, dim: DimOrDims = None,
 """)
 
 
-class check_sparse_tensor_invariants(object):
+class check_sparse_tensor_invariants:
     """A tool to control checking sparse tensor invariants.
 
 The following options exists to manage sparsr tensor invariants
@@ -446,13 +469,19 @@ def disable():
     # context manager support
     def __init__(self, enable=True):
         self.state = enable
-        self.saved_state = self.is_enabled()
+        self.saved_state : Optional[bool] = None
 
     def __enter__(self):
+        if self.saved_state is not None:
+            raise RuntimeError('This context manager instance is already activated.'
+                               ' Use a different context manager instance for context nesting.')
+        self.saved_state = self.is_enabled()
         torch._C._set_check_sparse_tensor_invariants(self.state)
 
     def __exit__(self, type, value, traceback):
+        assert self.saved_state is not None
         torch._C._set_check_sparse_tensor_invariants(self.saved_state)
+        self.saved_state = None
 
     # decorator support
     def __call__(self, mth):
diff --git a/torch/storage.py b/torch/storage.py
index 775260926bb4..ddf48a3f3b70 100644
--- a/torch/storage.py
+++ b/torch/storage.py
@@ -15,7 +15,7 @@
     np = None  # type: ignore[assignment]
 
 T = TypeVar('T', bound='Union[_StorageBase, TypedStorage]')
-class _StorageBase(object):
+class _StorageBase:
     _cdata: Any
     is_sparse: bool = False
     is_sparse_csr: bool = False
@@ -105,7 +105,7 @@ def __reduce__(self):
         return (_load_from_bytes, (b.getvalue(),))
 
     def __sizeof__(self):
-        return super(_StorageBase, self).__sizeof__() + self.size()
+        return super().__sizeof__() + self.size()
 
     def clone(self):
         """Returns a copy of this storage"""
@@ -662,7 +662,7 @@ def _deepcopy(self, memo):
 
     def __sizeof__(self):
         _warn_typed_storage_removal()
-        return super(TypedStorage, self).__sizeof__() + self.nbytes()
+        return super().__sizeof__() + self.nbytes()
 
     def clone(self):
         """Returns a copy of this storage"""
diff --git a/torch/testing/_comparison.py b/torch/testing/_comparison.py
index f241a5591991..617fd71e00bd 100644
--- a/torch/testing/_comparison.py
+++ b/torch/testing/_comparison.py
@@ -28,7 +28,7 @@
 
 
 class ErrorMeta(Exception):
-    """Internal testing exception that makes that carries error meta data."""
+    """Internal testing exception that makes that carries error metadata."""
 
     def __init__(
         self, type: Type[Exception], msg: str, *, id: Tuple[Any, ...] = ()
@@ -333,25 +333,31 @@ def __init__(
         self.id = id
         self._unknown_parameters = unknown_parameters
 
+    @staticmethod
+    def _inputs_not_supported() -> NoReturn:
+        raise UnsupportedInputs()
+
     @staticmethod
     def _check_inputs_isinstance(*inputs: Any, cls: Union[Type, Tuple[Type, ...]]):
         """Checks if all inputs are instances of a given class and raise :class:`UnsupportedInputs` otherwise."""
         if not all(isinstance(input, cls) for input in inputs):
-            raise UnsupportedInputs()
+            Pair._inputs_not_supported()
 
-    def _make_error_meta(self, type: Type[Exception], msg: str) -> ErrorMeta:
-        """Makes an :class:`ErrorMeta` from a given exception type and message and the stored id.
+    def _fail(
+        self, type: Type[Exception], msg: str, *, id: Tuple[Any, ...] = ()
+    ) -> NoReturn:
+        """Raises an :class:`ErrorMeta` from a given exception type and message and the stored id.
 
         .. warning::
 
-            Since this method uses instance attributes of :class:`Pair`, it should not be used before the
-            ``super().__init__(...)`` call in the constructor.
+            If you use this before the ``super().__init__(...)`` call in the constructor, you have to pass the ``id``
+            explicitly.
         """
-        return ErrorMeta(type, msg, id=self.id)
+        raise ErrorMeta(type, msg, id=self.id if not id and hasattr(self, "id") else id)
 
     @abc.abstractmethod
     def compare(self) -> None:
-        """Compares the inputs and returns an :class`ErrorMeta` in case they mismatch."""
+        """Compares the inputs and raises an :class`ErrorMeta` in case they mismatch."""
 
     def extra_repr(self) -> Sequence[Union[str, Tuple[str, Any]]]:
         """Returns extra information that will be included in the representation.
@@ -394,14 +400,15 @@ def compare(self) -> None:
         try:
             equal = self.actual == self.expected
         except Exception as error:
-            raise self._make_error_meta(
-                ValueError, f"{self.actual} == {self.expected} failed with:\n{error}."
+            # We are not using `self._raise_error_meta` here since we need the exception chaining
+            raise ErrorMeta(
+                ValueError,
+                f"{self.actual} == {self.expected} failed with:\n{error}.",
+                id=self.id,
             ) from error
 
         if not equal:
-            raise self._make_error_meta(
-                AssertionError, f"{self.actual} != {self.expected}"
-            )
+            self._fail(AssertionError, f"{self.actual} != {self.expected}")
 
 
 class NonePair(Pair):
@@ -409,13 +416,13 @@ class NonePair(Pair):
 
     def __init__(self, actual: Any, expected: Any, **other_parameters: Any) -> None:
         if not (actual is None or expected is None):
-            raise UnsupportedInputs()
+            self._inputs_not_supported()
 
         super().__init__(actual, expected, **other_parameters)
 
     def compare(self) -> None:
         if not (self.actual is None and self.expected is None):
-            raise self._make_error_meta(
+            self._fail(
                 AssertionError, f"None mismatch: {self.actual} is not {self.expected}"
             )
 
@@ -468,7 +475,7 @@ def _to_bool(self, bool_like: Any, *, id: Tuple[Any, ...]) -> bool:
 
     def compare(self) -> None:
         if self.actual is not self.expected:
-            raise self._make_error_meta(
+            self._fail(
                 AssertionError,
                 f"Booleans mismatch: {self.actual} is not {self.expected}",
             )
@@ -564,7 +571,7 @@ def _to_number(
 
     def compare(self) -> None:
         if self.check_dtype and type(self.actual) is not type(self.expected):
-            raise self._make_error_meta(
+            self._fail(
                 AssertionError,
                 f"The (d)types do not match: {type(self.actual)} != {type(self.expected)}.",
             )
@@ -581,7 +588,7 @@ def compare(self) -> None:
         if cmath.isfinite(abs_diff) and abs_diff <= tolerance:
             return
 
-        raise self._make_error_meta(
+        self._fail(
             AssertionError,
             make_scalar_mismatch_msg(
                 self.actual, self.expected, rtol=self.rtol, atol=self.atol
@@ -617,9 +624,6 @@ class TensorLikePair(Pair):
             check is disabled, tensors with different ``layout``'s are converted to strided tensors before being
             compared.
         check_stride (bool): If ``True`` and corresponding tensors are strided, asserts that they have the same stride.
-        check_is_coalesced (bool): If ``True`` (default) and corresponding tensors are sparse COO, checks that both
-            ``actual`` and ``expected`` are either coalesced or uncoalesced. If this check is disabled, tensors are
-            :meth:`~torch.Tensor.coalesce`'ed before being compared.
     """
 
     def __init__(
@@ -636,7 +640,6 @@ def __init__(
         check_dtype: bool = True,
         check_layout: bool = True,
         check_stride: bool = False,
-        check_is_coalesced: bool = True,
         **other_parameters: Any,
     ):
         actual, expected = self._process_inputs(
@@ -652,7 +655,6 @@ def __init__(
         self.check_dtype = check_dtype
         self.check_layout = check_layout
         self.check_stride = check_stride
-        self.check_is_coalesced = check_is_coalesced
 
     def _process_inputs(
         self, actual: Any, expected: Any, *, id: Tuple[Any, ...], allow_subclasses: bool
@@ -661,10 +663,10 @@ def _process_inputs(
             expected, type(actual)
         )
         if not directly_related:
-            raise UnsupportedInputs()
+            self._inputs_not_supported()
 
         if not allow_subclasses and type(actual) is not type(expected):
-            raise UnsupportedInputs()
+            self._inputs_not_supported()
 
         actual, expected = [self._to_tensor(input) for input in (actual, expected)]
         for tensor in (actual, expected):
@@ -677,8 +679,8 @@ def _to_tensor(self, tensor_like: Any) -> torch.Tensor:
 
         try:
             return torch.as_tensor(tensor_like)
-        except Exception as e:
-            raise UnsupportedInputs() from e
+        except Exception:
+            self._inputs_not_supported()
 
     def _check_supported(self, tensor: torch.Tensor, *, id: Tuple[Any, ...]) -> None:
         if tensor.layout not in {
@@ -729,7 +731,7 @@ def _compare_attributes(
         def raise_mismatch_error(
             attribute_name: str, actual_value: Any, expected_value: Any
         ) -> NoReturn:
-            raise self._make_error_meta(
+            self._fail(
                 AssertionError,
                 f"The values for attribute '{attribute_name}' do not match: {actual_value} != {expected_value}.",
             )
@@ -866,7 +868,7 @@ def _compare_sparse_coo_values(
         - the values for closeness.
         """
         if actual.sparse_dim() != expected.sparse_dim():
-            raise self._make_error_meta(
+            self._fail(
                 AssertionError,
                 (
                     f"The number of sparse dimensions in sparse COO tensors does not match: "
@@ -875,7 +877,7 @@ def _compare_sparse_coo_values(
             )
 
         if actual._nnz() != expected._nnz():
-            raise self._make_error_meta(
+            self._fail(
                 AssertionError,
                 (
                     f"The number of specified values in sparse COO tensors does not match: "
@@ -937,7 +939,7 @@ def _compare_sparse_compressed_values(
         }[actual.layout]
 
         if actual._nnz() != expected._nnz():
-            raise self._make_error_meta(
+            self._fail(
                 AssertionError,
                 (
                     f"The number of specified values in sparse {format_name} tensors does not match: "
@@ -1007,7 +1009,7 @@ def _compare_regular_values_close(
             msg = make_tensor_mismatch_msg(
                 actual, expected, ~matches, rtol=rtol, atol=atol, identifier=identifier
             )
-        raise self._make_error_meta(AssertionError, msg)
+        self._fail(AssertionError, msg)
 
     def _promote_for_comparison(
         self, actual: torch.Tensor, expected: torch.Tensor
@@ -1036,7 +1038,6 @@ def extra_repr(self) -> Sequence[str]:
             "check_dtype",
             "check_layout",
             "check_stride",
-            "check_is_coalesced",
         )
 
 
@@ -1178,16 +1179,15 @@ def originate_pairs(
             )
 
 
-def assert_equal(
+def not_close_error_metas(
     actual: Any,
     expected: Any,
     *,
     pair_types: Sequence[Type[Pair]] = (ObjectPair,),
     sequence_types: Tuple[Type, ...] = (collections.abc.Sequence,),
     mapping_types: Tuple[Type, ...] = (collections.abc.Mapping,),
-    msg: Optional[Union[str, Callable[[str], str]]] = None,
     **options: Any,
-) -> None:
+) -> List[ErrorMeta]:
     """Asserts that inputs are equal.
 
     ``actual`` and ``expected`` can be possibly nested :class:`~collections.abc.Sequence`'s or
@@ -1237,11 +1237,7 @@ def assert_equal(
                 "please except the previous error and raise an expressive `ErrorMeta` instead."
             ) from error
 
-    if not error_metas:
-        return
-
-    # TODO: compose all metas into one AssertionError
-    raise error_metas[0].to_error(msg)
+    return error_metas
 
 
 def assert_close(
@@ -1490,7 +1486,7 @@ def assert_close(
     # Hide this function from `pytest`'s traceback
     __tracebackhide__ = True
 
-    assert_equal(
+    error_metas = not_close_error_metas(
         actual,
         expected,
         pair_types=(
@@ -1510,6 +1506,10 @@ def assert_close(
         msg=msg,
     )
 
+    if error_metas:
+        # TODO: compose all metas into one AssertionError
+        raise error_metas[0].to_error(msg)
+
 
 def assert_allclose(
     actual: Any,
diff --git a/torch/testing/_internal/autocast_test_lists.py b/torch/testing/_internal/autocast_test_lists.py
index 9a88ab054340..b04ae3491b4f 100644
--- a/torch/testing/_internal/autocast_test_lists.py
+++ b/torch/testing/_internal/autocast_test_lists.py
@@ -2,7 +2,7 @@
 from torch.testing._internal.common_utils import TEST_WITH_ROCM
 
 
-class AutocastTestLists(object):
+class AutocastTestLists:
     def _rnn_cell_args(self, n, num_chunks, is_lstm, dev, dtype):
         input = (torch.randn((n, n), device=dev, dtype=torch.float32),)
 
@@ -230,7 +230,7 @@ def __init__(self, dev):
                                       torch.rand((n, n), device=dev, dtype=torch.float32)), torch._C._nn),
         ]
 
-class AutocastCPUTestLists(object):
+class AutocastCPUTestLists:
     # Supplies ops and arguments for test_autocast_* in test/test_cpu.py
     def __init__(self, dev):
         super().__init__()
@@ -314,11 +314,12 @@ def __init__(self, dev):
                           torch.randn((5, 3, 5), device=dev, dtype=torch.float32),
                           torch.randn(5, device=dev, dtype=torch.float32),
                           0)),
+            ("conv_transpose1d", conv_args_fp32[0]),
+            ("conv_transpose2d", conv_args_fp32[1]),
+            ("conv_transpose3d", conv_args_fp32[2]),
+            ("prelu", pointwise0_fp32 + element0_fp32),
         ]
         self.torch_fp32 = [
-            ("conv_transpose1d", conv_args_bf16[0]),
-            ("conv_transpose2d", conv_args_bf16[1]),
-            ("conv_transpose3d", conv_args_bf16[2]),
             ("poisson_nll_loss", mat0_bf16 + mat1_bf16 + (True, False, 1.e-8, torch.nn._reduction.get_enum('mean'))),
             ("cosine_embedding_loss", (torch.tensor([[1, 2, 3]], device=dev, dtype=torch.bfloat16),
                                        torch.tensor([[1, 3, 4]], device=dev, dtype=torch.bfloat16),
diff --git a/torch/testing/_internal/codegen/random_topo_test.py b/torch/testing/_internal/codegen/random_topo_test.py
index e92720be6b80..09c7d6f30d82 100644
--- a/torch/testing/_internal/codegen/random_topo_test.py
+++ b/torch/testing/_internal/codegen/random_topo_test.py
@@ -250,17 +250,17 @@ def prepareInputTensorsToRandomTopoTest(seed,
 def reproString(current_seed, args):
     repro_str = "python {0}".format(__file__)
     if args.cuda_fuser:
-        repro_str += " --cuda_fuser"
+        repro_str += " --cuda-fuser"
     if args.legacy_fuser:
-        repro_str += " --legacy_fuser"
+        repro_str += " --legacy-fuser"
     if args.profiling_executor:
-        repro_str += " --profiling_executor"
+        repro_str += " --profiling-executor"
     if args.fp16:
         repro_str += " --fp16"
     if args.cpu:
         repro_str += " --cpu"
-    repro_str += " --max_num_tensor {0} --max_tensor_dim {1} --max_tensor_size {2}"\
-        " --depth_factor {3} --seed {4} --repro_run".format(
+    repro_str += " --max-num-tensor {0} --max-tensor-dim {1} --max-tensor-size {2}"\
+        " --depth-factor {3} --seed {4} --repro-run".format(
             args.max_num_tensor, args.max_tensor_dim, args.max_tensor_size,
             args.depth_factor, current_seed)
     return repro_str
@@ -337,21 +337,21 @@ def runTest(seed, args):
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--cuda_fuser", action='store_true', default=True)
-    parser.add_argument("--legacy_fuser", action='store_true', default=False)
-    parser.add_argument("--profiling_executor", action='store_true', default=False)
+    parser.add_argument("--cuda-fuser", "--cuda_fuser", action='store_true', default=True)
+    parser.add_argument("--legacy-fuser", "--legacy_fuser", action='store_true', default=False)
+    parser.add_argument("--profiling-executor", "--profiling_executor", action='store_true', default=False)
     parser.add_argument("--fp16", action='store_true', default=False)
     parser.add_argument("--cpu", action='store_true', default=False)
-    parser.add_argument("--debug_print", action='store_true', default=False)
-    parser.add_argument("--debug_tensor", action='store_true', default=False)
-    parser.add_argument("--max_num_tensor", default=MAX_TENSOR, type=int)
-    parser.add_argument("--max_tensor_dim", default=MAX_TENSOR_DIM, type=int)
-    parser.add_argument("--max_tensor_size", default=MAX_TENSOR_SIZE, type=int)
-    parser.add_argument("--depth_factor", default=GRAPH_FACTOR, type=int)
+    parser.add_argument("--debug-print", "--debug_print", action='store_true', default=False)
+    parser.add_argument("--debug-tensor", "--debug_tensor", action='store_true', default=False)
+    parser.add_argument("--max-num-tensor", "--max_num_tensor", default=MAX_TENSOR, type=int)
+    parser.add_argument("--max-tensor-dim", "--max_tensor_dim", default=MAX_TENSOR_DIM, type=int)
+    parser.add_argument("--max-tensor-size", "--max_tensor_size", default=MAX_TENSOR_SIZE, type=int)
+    parser.add_argument("--depth-factor", "--depth-factor", default=GRAPH_FACTOR, type=int)
     parser.add_argument("--seed", default=45589, type=int)
     group = parser.add_mutually_exclusive_group()
     group.add_argument("--iterations", default=4, type=int)
-    group.add_argument("--repro_run", action='store_true', default=False)
+    group.add_argument("--repro-run", "--repro_run", action='store_true', default=False)
     return parser.parse_args()
 
 
diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py
index b226c7af58e5..d92c5e04f2f7 100644
--- a/torch/testing/_internal/common_cuda.py
+++ b/torch/testing/_internal/common_cuda.py
@@ -6,7 +6,6 @@
 from torch.testing._internal.common_utils import TEST_NUMBA, IS_WINDOWS, TEST_WITH_ROCM
 import inspect
 import contextlib
-from distutils.version import LooseVersion
 
 
 TEST_CUDA = torch.cuda.is_available()
@@ -16,12 +15,12 @@
 TEST_CUDNN = TEST_CUDA and torch.backends.cudnn.is_acceptable(torch.tensor(1., device=CUDA_DEVICE))
 TEST_CUDNN_VERSION = torch.backends.cudnn.version() if TEST_CUDNN else 0
 
-CUDA11OrLater = torch.version.cuda and LooseVersion(torch.version.cuda) >= "11.0"
-CUDA9 = torch.version.cuda and torch.version.cuda.startswith('9.')
 SM53OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (5, 3)
 SM60OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (6, 0)
 SM80OrLater = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0)
 
+PLATFORM_SUPPORTS_FUSED_SDPA: bool = TEST_CUDA and not TEST_WITH_ROCM
+
 TEST_MAGMA = TEST_CUDA
 if TEST_CUDA:
     torch.ones(1).cuda()  # has_magma shows up after cuda is initialized
diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py
index 0585ee0820e7..8e34ec10a835 100644
--- a/torch/testing/_internal/common_device_type.py
+++ b/torch/testing/_internal/common_device_type.py
@@ -15,7 +15,7 @@
     skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \
     IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, IS_WINDOWS, \
     _TestParametrizer, compose_parametrize_fns, dtype_name, \
-    TEST_WITH_MIOPEN_SUGGEST_NHWC, NATIVE_DEVICES, skipIfTorchDynamo
+    NATIVE_DEVICES, skipIfTorchDynamo
 from torch.testing._internal.common_cuda import _get_torch_cuda_version, \
     TEST_CUSPARSE_GENERIC, TEST_HIPSPARSE_GENERIC
 from torch.testing._internal.common_dtype import get_all_dtypes
@@ -239,7 +239,7 @@
 #         # Intention is to override
 #         def assertEqual(self, x, y):
 #             # This DOESN'T WORK!
-#             super(TestFooDeviceType, self).assertEqual(x, y)
+#             super().assertEqual(x, y)
 #
 # If you try to run this code, you'll get an error saying that TestFooDeviceType
 # is not in scope.  This is because after instantiating our classes, we delete
@@ -259,7 +259,7 @@
 
 def _dtype_test_suffix(dtypes):
     """ Returns the test suffix for a dtype, sequence of dtypes, or None. """
-    if isinstance(dtypes, list) or isinstance(dtypes, tuple):
+    if isinstance(dtypes, (list, tuple)):
         if len(dtypes) == 0:
             return ''
         return '_' + '_'.join((dtype_name(d) for d in dtypes))
@@ -280,7 +280,7 @@ def _update_param_kwargs(param_kwargs, name, value):
     if plural_name in param_kwargs:
         del param_kwargs[plural_name]
 
-    if isinstance(value, list) or isinstance(value, tuple):
+    if isinstance(value, (list, tuple)):
         param_kwargs[plural_name] = value
     elif value is not None:
         param_kwargs[name] = value
@@ -870,7 +870,7 @@ def test_wrapper(*args, **kwargs):
 #       for the test to run. If you want to use a string argument you should
 #       probably define a new decorator instead (see below).
 #   (3) Prefer the existing decorators to defining the 'device_type' kwarg.
-class skipIf(object):
+class skipIf:
 
     def __init__(self, dep, reason, device_type=None):
         self.dep = dep
@@ -973,7 +973,7 @@ def dep_fn(self, *args, **kwargs):
     return inner
 
 
-class expectedFailure(object):
+class expectedFailure:
 
     def __init__(self, device_type):
         self.device_type = device_type
@@ -994,7 +994,7 @@ def efail_fn(slf, *args, **kwargs):
         return efail_fn
 
 
-class onlyOn(object):
+class onlyOn:
 
     def __init__(self, device_type):
         self.device_type = device_type
@@ -1016,7 +1016,7 @@ def only_fn(slf, *args, **kwargs):
 # as a list of strings instead of providing a single device string.
 # Skips the test if the number of available devices of the variant's device
 # type is less than the 'num_required_devices' arg.
-class deviceCountAtLeast(object):
+class deviceCountAtLeast:
 
     def __init__(self, num_required_devices):
         self.num_required_devices = num_required_devices
@@ -1064,7 +1064,7 @@ def only_fn(self, *args, **kwargs):
 # precisions (or are working with multiple dtypes) they should be specified
 # explicitly and computed using self.precision (e.g.
 # self.precision *2, max(1, self.precision)).
-class precisionOverride(object):
+class precisionOverride:
 
     def __init__(self, d):
         assert isinstance(d, dict), "precisionOverride not given a dtype : precision dict!"
@@ -1096,7 +1096,7 @@ def __call__(self, fn):
 # atol = 1e-4 and rtol = 0 for torch.double.
 tol = namedtuple('tol', ['atol', 'rtol'])
 
-class toleranceOverride(object):
+class toleranceOverride:
     def __init__(self, d):
         assert isinstance(d, dict), "toleranceOverride not given a dtype : tol dict!"
         for dtype, prec in d.items():
@@ -1119,7 +1119,7 @@ def __call__(self, fn):
 # Examples:
 # @dtypes(torch.float32, torch.float64)
 # @dtypes((torch.long, torch.float32), (torch.int, torch.float64))
-class dtypes(object):
+class dtypes:
 
     def __init__(self, *args, device_type="all"):
         if len(args) > 0 and isinstance(args[0], (list, tuple)):
@@ -1281,10 +1281,6 @@ def wrap_fn(self, *args, **kwargs):
         return wrap_fn
     return dec_fn
 
-# Skips a test on CUDA when using ROCm.
-def skipCUDAIfNotMiopenSuggestNHWC(fn):
-    return skipCUDAIf(not TEST_WITH_MIOPEN_SUGGEST_NHWC, "test doesn't currently work without MIOpen NHWC activation")(fn)
-
 # Skips a test for specified CUDA versions, given in the form of a list of [major, minor]s.
 def skipCUDAVersionIn(versions : List[Tuple[int, int]] = None):
     def dec_fn(fn):
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index 795dd7488c28..84ad5dc8ed80 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -18,8 +18,10 @@
 from functools import partial, reduce, wraps
 from io import StringIO
 from typing import Dict, NamedTuple, Optional, Union
+from unittest.mock import patch
 
 import torch
+import torch._dynamo.test_case
 import torch.cuda.nccl
 import torch.distributed as c10d
 import torch.nn as nn
@@ -628,15 +630,7 @@ def _event_listener(parent_pipe, signal_pipe, rank: int):
 
     @classmethod
     def _run(cls, rank: int, test_name: str, file_name: str, parent_pipe) -> None:
-        # Enable DDP + ReplicatedTensor
-        from torch.nn.parallel._replicated_tensor_ddp_utils import (
-            _set_ddp_with_replicated_tensor,
-        )
-
-        _set_ddp_with_replicated_tensor(True)
-
         self = cls(test_name)
-
         self.rank = rank
         self.file_name = file_name
         self.run_test(test_name, parent_pipe)
@@ -1023,8 +1017,10 @@ def _run(cls, test_name, rank, world_size):
         # every thread have the same value. This would be relevant when we use op db tests, where it
         # needs those states to be set i.e. using instantiate_device_type_tests()
         # TODO: figure out a better way to do this
-        self._tls.precision = TestCase._precision
-        self._tls.rel_tol = TestCase._rel_tol
+        if hasattr(self, "_tls"):
+            self._tls = threading.local()
+            self._tls.precision = TestCase._precision
+            self._tls.rel_tol = TestCase._rel_tol
 
         self.run_test_with_threaded_pg(test_name, rank, world_size)
 
@@ -1181,3 +1177,86 @@ def __init__(
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         self.forward_inputs[self] = x
         return self.c2(self.c1(x))
+
+@contextmanager
+def _dynamo_dist_per_rank_init(rank, world_size, init_pg=True):
+    # To avoid multiple inheritance from _dynamo.test_case.TestCase and MultiProcessTestCase,
+    # Just manually implement the most important part of the dynamo behavior to reset/clear.
+    torch.cuda.set_device(rank)
+    os.environ['MASTER_ADDR'] = 'localhost'
+    os.environ['MASTER_PORT'] = '6789'
+    if init_pg:
+        c10d.init_process_group("nccl", rank=rank, world_size=world_size)
+    torch._dynamo.reset()
+    torch._dynamo.utils.counters.clear()
+    yield
+    torch._dynamo.reset()
+    torch._dynamo.utils.counters.clear()
+    if init_pg:
+        c10d.destroy_process_group()
+
+
+class DynamoDistributedSingleProcTestCase(torch._dynamo.test_case.TestCase):
+    """
+    Test harness for single-process dynamo distributed tests,
+    initializes dist process group.
+
+    Prefer this for simple tests, as it's easier to debug.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        # _exit_stack is set up in TestCase
+        cls._exit_stack.enter_context(
+            patch.dict(
+                os.environ,
+                {
+                    "MASTER_ADDR": "localhost",
+                    "MASTER_PORT": "12355",
+                },
+            )
+        )
+        cls.rank = 0
+        cls.device = f"cuda:{cls.rank}"
+        cls.device_ids = None if "cuda" in cls.device else [cls.rank]
+        c10d.init_process_group("nccl", rank=cls.rank, world_size=1)
+
+    @classmethod
+    def tearDownClass(cls):
+        c10d.destroy_process_group()
+        super().tearDownClass()
+
+
+class DynamoDistributedMultiProcTestCase(MultiProcessTestCase):
+    """
+    Use this for tests that actually run on multiple GPUs.
+
+    Decorate tests with @skip_if_lt_x_gpu(ngpu)
+
+    Note: MultiProcTestCase spawns processes per test and is slow.
+    Prefer MultiThreadedTestCase for most tests. Perhaps use this one
+    sparingly for integration tests.
+    """
+    def setUp(self):
+        super().setUp()
+        self._spawn_processes()
+
+    def tearDown(self):
+        super().tearDown()
+        try:
+            os.remove(self.file_name)
+        except OSError:
+            pass
+
+    @property
+    def world_size(self) -> int:
+        return torch.cuda.device_count()
+
+    @classmethod
+    def _run(cls, rank: int, test_name: str, file_name: str, parent_pipe) -> None:
+        # The rest is copypasta from MultiProcessTestCase._run
+        self = cls(test_name)
+        self.rank = rank
+        self.file_name = file_name
+        self.run_test(test_name, parent_pipe)
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index f86add830311..02725f2eede4 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -202,21 +202,6 @@ def get_future():
         return dist_wait
 
 
-class DeterministicModel(torch.nn.Module):
-    def __init__(self, wrap_fsdp, cpu_offload=CPUOffload(offload_params=False)):
-        super().__init__()
-        # keep everything deterministic for model initialization
-        torch.manual_seed(0)
-        self.inner: Union[torch.nn.Linear, FSDP] = torch.nn.Linear(2, 2).cuda()
-        if wrap_fsdp:
-            self.inner = FSDP(self.inner, cpu_offload=cpu_offload)
-        self.outer = torch.nn.Linear(2, 2).cuda()
-
-    def forward(self, x):
-        y = self.inner(x)
-        return self.outer(y)
-
-
 class TransformerWithSharedParams(FSDPTestModel):
     def __init__(
         self,
@@ -730,7 +715,7 @@ def init(
 
 class FSDPTest(MultiProcessTestCase):
     def setUp(self):
-        super(FSDPTest, self).setUp()
+        super().setUp()
         self._spawn_processes()
 
     @property
diff --git a/torch/testing/_internal/common_jit.py b/torch/testing/_internal/common_jit.py
index 30e320743ad2..25b7bd8be051 100644
--- a/torch/testing/_internal/common_jit.py
+++ b/torch/testing/_internal/common_jit.py
@@ -51,7 +51,7 @@ def check_against_reference(self, func, reference_func, output_func, args, kwarg
     def allSum(vs):
         if isinstance(vs, torch.Tensor):
             vs = (vs,)
-        return sum((i + 1) * v.sum()
+        return sum((i + 1) * v.sum().abs() if v.dtype.is_complex else (i + 1) * v.sum()
                    for i, v in enumerate(vs)
                    if v is not None and v.dtype in floating_and_complex_types_and(torch.half, torch.bfloat16))
 
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 45661db9f230..f36ae38a4a46 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -11,7 +11,7 @@
 
 import torch
 import numpy as np
-from torch._six import inf, nan
+from torch import inf, nan
 
 from typing import Any, Dict, List, Tuple, Union, Sequence
 from torch.testing import make_tensor
@@ -21,13 +21,15 @@
     all_types, empty_types, complex_types_and, integral_types
 )
 from torch.testing._internal.common_device_type import \
-    (onlyCUDA, onlyNativeDeviceTypes, disablecuDNN, skipCUDAIfNoMagma, skipCUDAIfNoMagmaAndNoCusolver,
+    (onlyCPU, onlyCUDA, onlyNativeDeviceTypes, disablecuDNN, skipCUDAIfNoMagma, skipCUDAIfNoMagmaAndNoCusolver,
      skipCUDAIfNoCusolver, skipCPUIfNoLapack, skipCPUIfNoFFT, skipCUDAIf, precisionOverride,
      skipCPUIfNoMklSparse,
      toleranceOverride, tol)
 from torch.testing._internal.common_cuda import (
-    CUDA11OrLater, SM53OrLater, SM60OrLater, with_tf32_off, TEST_CUDNN,
-    _get_torch_cuda_version, _get_torch_rocm_version)
+    SM53OrLater, SM60OrLater, with_tf32_off, TEST_CUDNN,
+    _get_torch_cuda_version, _get_torch_rocm_version, PLATFORM_SUPPORTS_FUSED_SDPA,
+    SM80OrLater
+)
 from torch.testing._internal.common_utils import (
     make_fullrank_matrices_with_distinct_singular_values,
     TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, TEST_SCIPY,
@@ -710,7 +712,7 @@ def sample_inputs_add_sub(op, device, dtype, requires_grad, **kwargs):
         yield SampleInput(lhs, args=(rhs,), kwargs={'alpha': 2})
     else:
         yield SampleInput(lhs, args=(rhs,), kwargs={'alpha': True})
-    neg_alpha = -3.14 if (dtype.is_floating_point or dtype.is_complex) else -3
+    neg_alpha = -3.125 if (dtype.is_floating_point or dtype.is_complex) else -3
     lhs = make_arg((S, S), **op.lhs_make_tensor_kwargs)
     rhs = make_arg((S, S), **op.rhs_make_tensor_kwargs)
     if dtype is not torch.bool:
@@ -794,6 +796,111 @@ def sample_inputs_randn(op, device, dtype, requires_grad, **kwargs):
     for shape in shapes:
         yield SampleInput(input=shape, kwargs=dict(dtype=dtype, device=device, requires_grad=requires_grad))
 
+def sample_inputs_normal(op, device, dtype, requires_grad, **kwargs):
+
+    make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=False)
+    samples = (
+        ((S, S), 0, 5),
+        ((S, S, S), -2, 0.5),
+    )
+    for shape, mean, std in samples:
+        yield SampleInput(make_arg(shape), args=(mean, std))
+
+def error_inputs_normal(op, device, **kwargs):
+    t = torch.zeros([10], device=device)
+    invalid_std = -1
+    yield ErrorInput(
+        SampleInput(t, args=(0, invalid_std)),
+        error_type=RuntimeError,
+        error_regex=r"normal expects std >= 0.0, but found std {}".format(invalid_std),
+    )
+
+def sample_inputs_cauchy(op, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=False)
+    samples = (
+        ((M,), 0, 0.5),
+        ((S, S), 0, 1),
+        ((S, S, S), -2, 1),
+    )
+    for shape, median, gamma in samples:
+        yield SampleInput(make_arg(shape), args=(median, gamma))
+
+
+def error_inputs_cauchy(op, device, **kwargs):
+    t = torch.zeros([10], device=device)
+    invalid_scale = 0
+    yield ErrorInput(
+        SampleInput(t, args=(0, invalid_scale,)),
+        error_type=RuntimeError,
+        error_regex=r"cauchy_ expects sigma > 0.0, but found sigma={}".format(invalid_scale),
+    )
+
+
+def sample_inputs_exponential(op, device, dtype, requires_grad, **kwargs):
+
+    make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=False)
+    samples = (
+        ((M,), 0.5),
+        ((S, S), 1),
+        ((S, S, S), 1.5),
+    )
+    for shape, rate in samples:
+        yield SampleInput(make_arg(shape), args=(rate,))
+
+
+def error_inputs_exponential(op, device, **kwargs):
+    t = torch.zeros([10], device=device)
+    invalid_rate = 0
+    yield ErrorInput(
+        SampleInput(t, args=(invalid_rate,)),
+        error_type=RuntimeError,
+        error_regex=r"exponential_ expects lambda > 0.0, but found lambda={}".format(invalid_rate),
+    )
+
+
+def sample_inputs_geometric(op, device, dtype, requires_grad, **kwargs):
+
+    make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=False)
+    samples = (
+        ((M,), 0.2),
+        ((S, S), 0.5),
+        ((S, S, S), 0.8),
+    )
+    for shape, rate in samples:
+        yield SampleInput(make_arg(shape), args=(rate,))
+
+
+def error_inputs_geometric(op, device, **kwargs):
+    t = torch.zeros([10], device=device)
+    neg_prob = -1
+    yield ErrorInput(
+        SampleInput(t, args=(neg_prob,)),
+        error_type=RuntimeError,
+        error_regex=r"geometric_ expects p to be in \(0, 1\), but got p={}".format(neg_prob),
+    )
+
+
+def sample_inputs_log_normal(op, device, dtype, requires_grad, **kwargs):
+
+    make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=False)
+    samples = (
+        ((M,), 0, 0.25),
+        ((S, S), 0.5, 1),
+        ((S, S, S), 0, 0.5),
+    )
+    for shape, mean, std in samples:
+        yield SampleInput(make_arg(shape), args=(mean, std))
+
+
+def error_inputs_log_normal(op, device, **kwargs):
+    t = torch.zeros([10], device=device)
+    invalid_std = 0
+    yield ErrorInput(
+        SampleInput(t, args=(0, invalid_std)),
+        error_type=RuntimeError,
+        error_regex=r"log_normal_ expects std > 0.0, but found std={}".format(invalid_std),
+    )
+
 
 def sample_inputs_uniform(op, device, dtype, requires_grad, **kwargs):
 
@@ -983,6 +1090,21 @@ def sample_inputs_sparse_sampled_addmm(op_info, device, dtype, requires_grad, **
             beta=beta,
         )
 
+def sample_inputs_sparse_mm_reduce(op_info, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+
+    reductions = ["sum", "mean", "amax", "amin"]
+    for m, k, reduce in product([5, 7], [3, 11], reductions):
+        yield SampleInput(
+            torch.eye(m, m)
+            .to(device=device, dtype=dtype)
+            .to_sparse_csr()
+            .requires_grad_(requires_grad),
+            make_arg((m, k)),
+            reduce,
+        )
+
+
 def sample_inputs_mv(self, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, low=None, high=None, requires_grad=requires_grad)
     yield SampleInput(make_arg(S, M), make_arg(M))
@@ -1445,6 +1567,33 @@ def sample_inputs_empty(op, device, dtype, requires_grad, **kwargs):
     for case in cases:
         yield SampleInput(case, device=device, dtype=dtype, requires_grad=requires_grad)
 
+def sample_inputs_empty_permuted(op, device, dtype, requires_grad, **kwargs):
+    # shape
+    cases = (
+        (), (0,), (1,), (1, 3, 5), (5, 3, 1), (1, 0, 5, 1),
+    )
+
+    for case in cases:
+        for layout in itertools.permutations(range(len(case))):
+            yield SampleInput(case, layout, device=device, dtype=dtype, requires_grad=requires_grad)
+
+def error_inputs_empty_permuted(op_info, device, **kwargs):
+    yield ErrorInput(
+        SampleInput((2,), args=((0, 1),)),
+        error_type=RuntimeError,
+        error_regex="Number of dimensions in size does not match the length of the physical_layout"
+    )
+    yield ErrorInput(
+        SampleInput((2,), args=((3,),)),
+        error_type=RuntimeError,
+        error_regex="Dimension out of range"
+    )
+    yield ErrorInput(
+        SampleInput((2, 3), args=((0, 0),)),
+        error_type=RuntimeError,
+        error_regex="Duplicate dim not allowed"
+    )
+
 def sample_inputs_scalar_tensor(op, device, dtype, requires_grad, **kwargs):
     # Not including a scalar tensor in vals because meta tests start failing due to
     # lack of meta support for _local_scalar_dense
@@ -1842,18 +1991,6 @@ def sample_inputs_cdist(op_info, device, dtype, requires_grad, **kwargs):
                 # The args should never be non-contiguous as this is not supported in the backward
                 yield SampleInput(make_arg(t1_size), make_arg(t2_size), p, cm)
 
-
-def sample_inputs_fill_(op_info, device, dtype, requires_grad, **kwargs):
-    make_arg = partial(make_tensor, device=device, dtype=dtype,
-                       low=None, high=None, requires_grad=requires_grad)
-
-    cases = (((S, S, S), (1,)),
-             ((), (1,)),
-             ((S, S, S), (make_arg(()),)))
-
-    for shape, args in cases:
-        yield SampleInput(make_arg(shape), args=args)
-
 def _fill_np(a, value):
     a = a.copy()
     a.fill(value)
@@ -2530,6 +2667,13 @@ def sample_inputs_aminmax(op_info, device, dtype, requires_grad, **kwargs):
             make_tensor(shape, dtype=dtype, device=device, requires_grad=requires_grad),
             **kwargs)
 
+def error_inputs_diff(op_info, device, **kwargs):
+    t = torch.rand((1, 3), device=device)
+    n = -1
+    yield ErrorInput(SampleInput(t, args=(n, ), kwargs=kwargs),
+                     error_type=RuntimeError,
+                     error_regex=f'order must be non-negative but got {n}')
+
 def sample_inputs_diff(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
 
@@ -3014,7 +3158,47 @@ def error_inputs_adaptive_max_pool3d(opinfo, device, **kwargs):
                      error_regex="Trying to create tensor with negative dimension")
 
 
-class _TestParamsMaxPoolBase(object):
+def sample_inputs_reduction_sparse(op_info, device, dtype, requires_grad, layout, blocksize=None, **kwargs):
+    layout_name = str(layout).split('.', 1)[-1].rsplit('_coo', 1)[0]
+    op_supports_layout = getattr(op_info, 'supports_' + layout_name)
+    if not op_supports_layout:
+        return
+
+    for sample_input in sample_inputs_reduction(op_info, device, dtype, requires_grad, **kwargs):
+        if sample_input.input.ndim == 0:
+            # scalar sparse tensors are not supported
+            continue
+
+        yield SampleInput(
+            sample_input.input.detach().to_sparse(layout=layout,
+                                                  blocksize=blocksize).requires_grad_(requires_grad),
+            args=sample_input.args,
+            kwargs=sample_input.kwargs)
+
+        if layout is torch.sparse_coo and (dtype.is_floating_point or dtype.is_complex):
+            # uncoalesced samples
+            inp = sample_input.input.detach().to_sparse(layout=layout)
+            inp = torch.sparse_coo_tensor(inp.indices().repeat(1, 2),
+                                          inp.values().repeat(2),
+                                          inp.shape,
+                                          dtype=inp.dtype,
+                                          device=inp.device)
+            assert not inp.is_coalesced()
+            yield SampleInput(inp.requires_grad_(requires_grad),
+                              args=sample_input.args,
+                              kwargs=sample_input.kwargs)
+
+        if sample_input.input.ndim > 2:
+            # hybrid samples
+            yield SampleInput(
+                sample_input.input.detach().to_sparse(layout=layout,
+                                                      blocksize=blocksize,
+                                                      dense_dim=sample_input.input.ndim - 2).requires_grad_(requires_grad),
+                args=sample_input.args,
+                kwargs=sample_input.kwargs)
+
+
+class _TestParamsMaxPoolBase:
 
     def __init__(self):
         self.kwargs = {
@@ -3914,6 +4098,23 @@ def shape(size, rank, with_batch_channel=True):
         yield SampleInput(make_arg(shape(D, rank)), scale_factor=0.6)
 
 
+def sample_inputs_upsample_aten(mode, self, device, dtype, requires_grad, **kwargs):
+    N = 6
+    C = 3
+    H = 10
+    W = 20
+    S = 3
+    L = 5
+
+    input_tensor = make_tensor(torch.Size([N, C, H, W]), device=device, dtype=dtype,
+                               requires_grad=requires_grad, low=-1, high=1)
+
+    yield SampleInput(input_tensor, output_size=torch.Size([S, S]), align_corners=False, scale_factors=None)
+    yield SampleInput(input_tensor, output_size=torch.Size([L, L]), align_corners=False, scale_factors=None)
+    yield SampleInput(input_tensor, output_size=None, align_corners=False, scale_factors=[1.7, 0.9])
+    yield SampleInput(input_tensor, output_size=None, align_corners=True, scale_factors=[0.8, 1.0])
+
+
 def sample_inputs_gelu(self, device, dtype, requires_grad, **kwargs):
     N = 5
     for _ in range(1, N):
@@ -4751,10 +4952,10 @@ def sample_unsqueeze(op_info, device, dtype, requires_grad, **kwargs):
 
 
 def sample_inputs_nn_unfold(op_info, device, dtype, requires_grad, **kwargs):
-    shapes = ((0, 1, 5, 5), (1, 1, 5, 5), (2, 3, 5, 5))
-    kernel_sizes = (2, (2, 2), (3, 3), (2, 3))
+    shapes = ((0, 1, 5, 5), (2, 3, 5, 5))
+    kernel_sizes = (2, (2, 2), (2, 3))
     dilations = (1, 2, (1, 2))
-    paddings = (0, 1, (1, 1), (1, 2))
+    paddings = (0, 1, (1, 2))
     strides = (1, 2, (1, 2))
 
     cases = product(shapes, kernel_sizes, dilations, paddings, strides)
@@ -4993,16 +5194,6 @@ def sample_inputs_ormqr(op_info, device, dtype, requires_grad, **kwargs):
         other = make_input((*batch, *other_matrix_shape), requires_grad=requires_grad)
         yield SampleInput(reflectors, tau, other, left=left, transpose=transpose)
 
-def sample_inputs_symeig(op_info, device, dtype, requires_grad=False, **kwargs):
-    out = sample_inputs_linalg_invertible(op_info, device, dtype, requires_grad)
-
-    for o in out:
-        o.kwargs = {"upper": bool(np.random.choice([True, False])),
-                    "eigenvectors": True}
-        # A gauge-invariant function
-        o.output_process_fn_grad = lambda output: (output[0], abs(output[1]))
-        yield o
-
 
 def sample_inputs_cholesky_solve(op_info, device, dtype, requires_grad=False, **kwargs):
     cholesky_inverse_samples = sample_inputs_linalg_cholesky_inverse(
@@ -5047,6 +5238,8 @@ def sample_inputs_roll(op_info, device, dtype, requires_grad=False, **kwargs):
         yield SampleInput(make_arg((0, 0, 0)), args=arg)
         yield SampleInput(make_arg((S, S, S)), args=arg)
 
+    # Scalar tensor
+    yield SampleInput(make_arg(()), args=(10, ))
 
 def error_inputs_roll(op_info, device, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=torch.float32)
@@ -5062,6 +5255,10 @@ def error_inputs_roll(op_info, device, **kwargs):
     s3 = SampleInput(make_arg((S, )), 0, 2)
     yield ErrorInput(s3, error_regex=err_msg3, error_type=IndexError)
 
+    err_msg4 = ("Dimension specified as 0")
+    s4 = SampleInput(make_arg(()), 0, 0)
+    yield ErrorInput(s4, error_regex=err_msg4, error_type=IndexError)
+
 def sample_inputs_rot90(op_info, device, dtype, requires_grad=False, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
 
@@ -5099,6 +5296,7 @@ def sample_inputs_std_var(op_info, device, dtype, requires_grad, **kwargs):
     yield SampleInput(tensor_1d(), dim=0, unbiased=True, keepdim=True)
     yield SampleInput(tensor_1d(), dim=0, unbiased=False, keepdim=False)
 
+    yield SampleInput(tensor_nd(), dim=(1,), correction=1.3)
     yield SampleInput(tensor_nd(), dim=(1,), correction=S // 2)
     yield SampleInput(tensor_nd(), dim=None, correction=0, keepdim=True)
     yield SampleInput(tensor_nd(), dim=None, correction=None)
@@ -5712,7 +5910,7 @@ def sample_inputs_cross_entropy(op_info, device, dtype, requires_grad, **kwargs)
 
             if "ignore_index" in kwargs and torch.all(target == kwargs["ignore_index"]):
                 # make sure at least one item in target is not ignored
-                target[0] = random.sample(set(range(num_classes)) - {kwargs["ignore_index"]}, 1)[0]
+                target[0] = random.sample(sorted(set(range(num_classes)) - {kwargs["ignore_index"]}), 1)[0]
 
         yield SampleInput(input, target, **kwargs)
 
@@ -6220,7 +6418,7 @@ def _tensor(shape, dtype=dtype, low=None, high=None):
         ((S, S, S), 1, [[2, 0, 3, 0], [0, 1, 2, 2], [3, 0, 2, 0], [1, 1, 1, 2], [0, 1, 2, 2]], False),
     )
 
-    reductions = ["amax", "mean", "amin", "sum", "prod"]
+    reductions = ["max", "mean", "min", "sum", "prod"]
     for args, reduce, initial in product(test_cases, reductions, [1, 2]):
         inp_shape, dim, lengths, unsafe = args
         lengths_t = torch.tensor(lengths, dtype=torch.long, device=device)
@@ -7317,8 +7515,7 @@ def sample_inputs_argwhere(op_info, device, dtype, requires_grad, **kwargs):
 def _generate_sample_shape_reduction():
     shapes = ((S,), (S, S), (S, S, S))
     reductions = ('none', 'mean', 'sum')
-    for s, r in product(shapes, reductions):
-        yield s, r
+    yield from product(shapes, reductions)
 
 def sample_inputs_gaussian_nll_loss(op_info, device, dtype, requires_grad, **kwargs):
     _make_tensor = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -7596,18 +7793,32 @@ def sample_inputs_scaled_dot_product_attention(op_info, device, dtype, requires_
     dim_4_q_shape = (batch, num_heads, seq_q, head_dim)
     dim_4_kv_shape = (batch, num_heads, seq_kv, head_dim)
 
-    qkv_shapes = [(dim_3_q_shape, dim_3_kv_shape), (dim_4_q_shape, dim_4_kv_shape)]
-    for qkv_shapes, is_causal, need_attn_weights, dropout_p in product(
-            qkv_shapes, [True, False], [True, False], [0.0, 0.5]):
+    broadcast_tuple = ((num_heads, seq_q, head_dim), (batch, num_heads, seq_kv, head_dim))
+
+    qkv_shapes = [(dim_3_q_shape, dim_3_kv_shape), (dim_4_q_shape, dim_4_kv_shape), broadcast_tuple]
+    samples = []
+    for qkv_shapes, is_causal, dropout_p in product(
+            qkv_shapes, [True, False], [0.0, 0.5]):
         shape_q, shape_kv = qkv_shapes
-        yield SampleInput(
+        samples.append(SampleInput(
             make(shape_q),
             make(shape_kv),
             make(shape_kv),
             is_causal=is_causal,
-            need_attn_weights=need_attn_weights,
             dropout_p=dropout_p
-        )
+        ))
+
+    # Add non standard shapes
+    diff_v_head_dim = SampleInput(
+        make((batch, num_heads, seq_q, head_dim)),
+        make((batch, num_heads, seq_kv, head_dim)),
+        make((batch, num_heads, seq_kv, head_dim + 8)),
+        is_causal=is_causal,
+        dropout_p=dropout_p
+    )
+    samples.append(diff_v_head_dim)
+
+    yield from samples
 
 def sample_inputs_pairwise_distance(op_info, device, dtype, requires_grad, **kwargs):
     make = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -7794,9 +8005,7 @@ def sample_inputs_max_unpool(op_info, device, dtype, requires_grad, **kwargs):
         'nn.functional.max_unpool3d': 3
     }
 
-    unpool_to_pool_name_dict = dict((
-        (k, f'nn.functional.{v.__name__}') for k, v in unpool_name_to_pool_method_dict.items()
-    ))
+    unpool_to_pool_name_dict = {k: f'nn.functional.{v.__name__}' for k, v in unpool_name_to_pool_method_dict.items()}
 
     pool_dim = unpool_name_to_dim[op_info.name]
     pool_method = unpool_name_to_pool_method_dict[op_info.name]
@@ -7864,12 +8073,6 @@ class ForeachRightmostArgType(enum.Enum):
     TensorList = 1
     ScalarList = 2
     Scalar = 3
-foreach_scalars = (
-    random.randint(1, 10),
-    1.0 - random.random(),
-    True,
-    complex(1.0 - random.random(), 1.0 - random.random()),
-)
 _foreach_inputs_default_kwargs = {"noncontiguous": False, "same_size": False, "low": None, "high": None}
 # TODO(crcrpar): Update to return `n_expected_cudaLaunchKernels` as well
 class foreach_inputs_sample_func:
@@ -7894,25 +8097,42 @@ def _set_rightmost_arg_types(
             if rightmost_supports_scalarlist:
                 self._rightmost_arg_types.append(ForeachRightmostArgType.ScalarList)
 
-    def _sample_rightmost_arg(self, rightmost_arg_type, device, dtype, num_tensors, **_foreach_inputs_kwargs):
+    def _sample_rightmost_arg(self, opinfo, rightmost_arg_type, device, dtype, num_tensors, **_foreach_inputs_kwargs):
         if rightmost_arg_type == ForeachRightmostArgType.TensorList:
             return [sample_inputs_foreach(None, device, dtype, num_tensors, **_foreach_inputs_kwargs)]
+        should_use_simpler_scalars = opinfo.name == "_foreach_pow" and dtype in (torch.float16, torch.bfloat16)
+
+        def sample_float():
+            s = random.random()
+            if should_use_simpler_scalars:
+                return 1.0 if s > 0.5 else 2.0
+            else:
+                return 1.0 - s
+
+        high = 2 if should_use_simpler_scalars else 9
         if rightmost_arg_type == ForeachRightmostArgType.ScalarList:
             return [
-                [random.randint(0, 9) + 1 for _ in range(num_tensors)],
-                [1.0 - random.random() for _ in range(num_tensors)],
-                [complex(1.0 - random.random(), 1.0 - random.random()) for _ in range(num_tensors)],
+                [random.randint(0, high) + 1 for _ in range(num_tensors)],
+                [sample_float() for _ in range(num_tensors)],
+                [complex(sample_float(), sample_float()) for _ in range(num_tensors)],
                 [True for _ in range(num_tensors)],
                 [1, 2.0, 3.0 + 4.5j] + [3.0 for _ in range(num_tensors - 3)],
                 [True, 1, 2.0, 3.0 + 4.5j] + [3.0 for _ in range(num_tensors - 4)],
             ]
         if rightmost_arg_type == ForeachRightmostArgType.Scalar:
-            return foreach_scalars
+            return (
+                random.randint(1, high + 1),
+                sample_float(),
+                True,
+                complex(sample_float(), sample_float()),
+            )
         raise AssertionError(f"Invalid rightmost_arg_type of {rightmost_arg_type}")
 
     def _should_disable_fastpath(self, opinfo, rightmost_arg, rightmost_arg_type, dtype):
         if self.arity < 2:
             return None
+        if "foreach_pow" in opinfo.name and dtype in integral_types():
+            return True
         if rightmost_arg_type == ForeachRightmostArgType.TensorList:
             disable_fastpath = "foreach_div" in opinfo.name and dtype in integral_types_and(torch.bool)
             if "foreach_add" in opinfo.name and dtype == torch.bool:
@@ -7983,7 +8203,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs):
                         for _ in range(self.arity - 2)
                     ]
                     rightmost_arg_list = self._sample_rightmost_arg(
-                        rightmost_arg_type, device, dtype, num_tensors, **_foreach_inputs_kwargs)
+                        opinfo, rightmost_arg_type, device, dtype, num_tensors, **_foreach_inputs_kwargs)
                     for rightmost_arg in rightmost_arg_list:
                         args.append(rightmost_arg)
                         kwargs = self._sample_kwargs(opinfo, rightmost_arg, rightmost_arg_type, dtype)
@@ -8012,7 +8232,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs):
 
 
 class foreach_lerp_sample_func(foreach_inputs_sample_func):
-    def _sample_rightmost_arg(self, rightmost_arg_type, device, dtype, num_tensors, **_foreach_inputs_kwargs):
+    def _sample_rightmost_arg(self, opinfo, rightmost_arg_type, device, dtype, num_tensors, **_foreach_inputs_kwargs):
         if rightmost_arg_type == ForeachRightmostArgType.TensorList:
             return [sample_inputs_foreach(None, device, dtype, num_tensors, **_foreach_inputs_kwargs)]
         if rightmost_arg_type == ForeachRightmostArgType.ScalarList:
@@ -8056,7 +8276,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs):
                     for _ in range(2 - int(rightmost_arg_type == ForeachRightmostArgType.TensorList))
                 ]
                 rightmost_arg_list = self._sample_rightmost_arg(
-                    rightmost_arg_type, device, dtype, num_tensors, **_foreach_inputs_kwargs)
+                    opinfo, rightmost_arg_type, device, dtype, num_tensors, **_foreach_inputs_kwargs)
                 for rightmost_arg in rightmost_arg_list:
                     kwargs = {}
                     if rightmost_arg_type == ForeachRightmostArgType.TensorList:
@@ -8210,6 +8430,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs):
         dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
         supports_alpha_param=True,
         sample_inputs_func=foreach_inputs_sample_func(2, True, True),
+        supports_autograd=True,
     ),
     ForeachFuncInfo(
         "sub",
@@ -8217,18 +8438,21 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs):
         dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
         supports_alpha_param=True,
         sample_inputs_func=foreach_inputs_sample_func(2, True, True),
+        supports_autograd=True,
     ),
     ForeachFuncInfo(
         "mul",
         dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
         dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
         sample_inputs_func=foreach_inputs_sample_func(2, True, True),
+        supports_autograd=True,
     ),
     ForeachFuncInfo(
         "div",
         dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
         dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
         sample_inputs_func=foreach_inputs_sample_func(2, True, True),
+        supports_autograd=True,
     ),
     ForeachFuncInfo(
         "clamp_min",
@@ -8236,6 +8460,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs):
         dtypesIfCUDA=all_types_and(torch.bfloat16, torch.float16),
         supports_alpha_param=False,
         sample_inputs_func=foreach_inputs_sample_func(2, True, True),
+        supports_autograd=True,
     ),
     ForeachFuncInfo(
         "clamp_max",
@@ -8243,6 +8468,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs):
         dtypesIfCUDA=all_types_and(torch.bfloat16, torch.float16),
         supports_alpha_param=False,
         sample_inputs_func=foreach_inputs_sample_func(2, True, True),
+        supports_autograd=True,
     ),
     ForeachFuncInfo(
         "minimum",
@@ -8250,6 +8476,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs):
         dtypesIfCUDA=all_types_and(torch.bfloat16, torch.float16),
         supports_alpha_param=False,
         sample_inputs_func=foreach_inputs_sample_func(2, True, True),
+        supports_autograd=True,
     ),
     ForeachFuncInfo(
         "maximum",
@@ -8257,6 +8484,21 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs):
         dtypesIfCUDA=all_types_and(torch.bfloat16, torch.float16),
         supports_alpha_param=False,
         sample_inputs_func=foreach_inputs_sample_func(2, True, True),
+        supports_autograd=True,
+    ),
+    ForeachFuncInfo(
+        "pow",
+        dtypes=all_types_and(torch.bfloat16),
+        dtypesIfCUDA=all_types_and(torch.bfloat16, torch.float16),
+        supports_alpha_param=False,
+        supports_scalar_self_arg=True,
+        sample_inputs_func=foreach_inputs_sample_func(2, True, True),
+        supports_autograd=True,
+        skips=(
+            # TODO: Memory leak https://github.com/pytorch/pytorch/issues/95237
+            DecorateInfo(unittest.skip("Memory leak https://github.com/pytorch/pytorch/issues/95237"),
+                         "TestForeach", "test_binary_op"),
+        ),
     ),
 ]
 
@@ -8266,12 +8508,14 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs):
         dtypes=all_types_and_complex(),
         dtypesIfCUDA=all_types_and_complex_and(torch.half, torch.bfloat16),
         sample_inputs_func=foreach_pointwise_sample_func(3, False, False),
+        supports_autograd=True,
     ),
     ForeachFuncInfo(
         "addcdiv",
         dtypes=all_types_and_complex(),
         dtypesIfCUDA=all_types_and_complex_and(torch.half, torch.bfloat16),
         sample_inputs_func=foreach_pointwise_sample_func(3, False, False),
+        supports_autograd=True,
     ),
 ]
 
@@ -8281,6 +8525,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs):
         dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16),
         dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
         sample_inputs_func=foreach_norm_sample_func(1, False, False),
+        supports_autograd=True,
     ),
 ]
 
@@ -8290,6 +8535,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs):
         dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16),
         dtypesIfROCM=floating_and_complex_types_and(torch.half, torch.bfloat16),
         sample_inputs_func=foreach_lerp_sample_func(3, True, False),
+        supports_autograd=True,
     ),
 ]
 
@@ -8806,6 +9052,153 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # UserWarning not triggered : Resized a non-empty tensor but did not warn about it.
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
            )),
+    OpInfo('cauchy',
+           op=lambda inp, *args, **kwargs: wrapper_set_seed(torch.Tensor.cauchy_, inp, *args, **kwargs),
+           inplace_variant=torch.Tensor.cauchy_,
+           dtypes=floating_types_and(torch.float16, torch.bfloat16),
+           supports_out=False,
+           supports_autograd=False,
+           sample_inputs_func=sample_inputs_cauchy,
+           error_inputs_func=error_inputs_cauchy,
+           skips=(
+               # Tests that assume input tensor has a meaningful effect on output tensor
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
+               DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
+
+               # AssertionError: JIT Test does not execute any logic
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+
+               # AssertionError: Tensor-likes are not close!
+               DecorateInfo(unittest.expectedFailure, 'TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive_inplace'),
+               DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+
+               # FX failed to normalize op - add the op to the op_skip list.
+               DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
+
+               # vmap: calling random operator not supported
+               DecorateInfo(unittest.skip("Test expects tensor input"), "TestVmapOperatorsOpInfo", "test_vmap_exhaustive"),
+               DecorateInfo(unittest.skip("Test expects tensor input"), "TestVmapOperatorsOpInfo", "test_op_has_batch_rule"),
+
+               DecorateInfo(unittest.skip("make_traced() doesn't set seed properly!"), 'TestCommon', 'test_python_ref_executor'),
+
+               DecorateInfo(unittest.expectedFailure, 'TestDecomp', 'test_quick'),
+           )),
+    OpInfo('exponential',
+           op=lambda inp, *args, **kwargs: wrapper_set_seed(torch.Tensor.exponential_, inp, *args, **kwargs),
+           inplace_variant=torch.Tensor.exponential_,
+           dtypes=floating_types_and(torch.float16, torch.bfloat16),
+           supports_out=False,
+           supports_autograd=False,
+           sample_inputs_func=sample_inputs_exponential,
+           error_inputs_func=error_inputs_exponential,
+           skips=(
+               # Tests that assume input tensor has a meaningful effect on output tensor
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
+               DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
+
+               # AssertionError: JIT Test does not execute any logic
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+
+               # AssertionError: Tensor-likes are not close!
+               DecorateInfo(unittest.expectedFailure, 'TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive_inplace'),
+               DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+
+               # FX failed to normalize op - add the op to the op_skip list.
+               DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
+
+               # vmap: calling random operator not supported
+               DecorateInfo(unittest.expectedFailure, "TestVmapOperatorsOpInfo", "test_vmap_exhaustive"),
+               DecorateInfo(unittest.expectedFailure, "TestVmapOperatorsOpInfo", "test_op_has_batch_rule"),
+
+               DecorateInfo(unittest.expectedFailure, 'TestDecomp', 'test_quick'),
+               DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+           )),
+    OpInfo('geometric',
+           op=lambda inp, *args, **kwargs: wrapper_set_seed(torch.Tensor.geometric_, inp, *args, **kwargs),
+           inplace_variant=torch.Tensor.geometric_,
+           dtypes=floating_types_and(torch.float16, torch.bfloat16, torch.int8, torch.int16, torch.int32, torch.int64, torch.uint8),
+           supports_out=False,
+           supports_autograd=False,
+           sample_inputs_func=sample_inputs_geometric,
+           error_inputs_func=error_inputs_geometric,
+           skips=(
+               # Tests that assume input tensor has a meaningful effect on output tensor
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
+               DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
+
+               # AssertionError: JIT Test does not execute any logic
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+
+               # AssertionError: Tensor-likes are not close!
+               DecorateInfo(unittest.expectedFailure, 'TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive_inplace'),
+               DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+
+               # FX failed to normalize op - add the op to the op_skip list.
+               DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
+
+               # vmap: calling random operator not supported
+               DecorateInfo(unittest.skip("Test expects tensor input"), "TestVmapOperatorsOpInfo", "test_vmap_exhaustive"),
+               DecorateInfo(unittest.skip("Test expects tensor input"), "TestVmapOperatorsOpInfo", "test_op_has_batch_rule"),
+
+               DecorateInfo(unittest.expectedFailure, 'TestDecomp', 'test_quick'),
+           )),
+    OpInfo('log_normal',
+           op=lambda inp, *args, **kwargs: wrapper_set_seed(torch.Tensor.log_normal_, inp, *args, **kwargs),
+           inplace_variant=torch.Tensor.log_normal_,
+           dtypes=floating_types_and(torch.float16, torch.bfloat16),
+           supports_out=False,
+           supports_autograd=False,
+           sample_inputs_func=sample_inputs_log_normal,
+           error_inputs_func=error_inputs_log_normal,
+           skips=(
+               # Tests that assume input tensor has a meaningful effect on output tensor
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
+               DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
+
+               # AssertionError: JIT Test does not execute any logic
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+
+               # AssertionError: Tensor-likes are not close!
+               DecorateInfo(unittest.expectedFailure, 'TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive_inplace'),
+               DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+               # FX failed to normalize op - add the op to the op_skip list.
+               DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
+
+               # vmap: calling random operator not supported
+               DecorateInfo(unittest.skip("Test expects tensor input"), "TestVmapOperatorsOpInfo", "test_vmap_exhaustive"),
+               DecorateInfo(unittest.skip("Test expects tensor input"), "TestVmapOperatorsOpInfo", "test_op_has_batch_rule"),
+               DecorateInfo(unittest.expectedFailure, 'TestDecomp', 'test_quick'),
+           )),
+    OpInfo('normal',
+           variant_test_name='in_place',
+           op=lambda inp, *args, **kwargs: wrapper_set_seed(torch.Tensor.normal_, inp, *args, **kwargs),
+           inplace_variant=torch.Tensor.normal_,
+           dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16),
+           supports_out=False,
+           supports_autograd=False,
+           sample_inputs_func=sample_inputs_normal,
+           error_inputs_func=error_inputs_normal,
+           skips=(
+               # Tests that assume input is a tensor or sequence of tensors
+               DecorateInfo(unittest.skip("Test expects tensor input"), "TestCommon", "test_noncontiguous_samples"),
+
+               # Tests that assume input tensor has a meaningful effect on output tensor
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
+               DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
+               DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_conj_view'),
+               DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_conj_view'),
+               DecorateInfo(unittest.expectedFailure, 'TestDecomp', 'test_quick'),
+               # AssertionError: JIT Test does not execute any logic
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+               # AssertionError: Tensor-likes are not close!
+               DecorateInfo(unittest.expectedFailure, 'TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive_inplace'),
+               DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+               # FX failed to normalize op - add the op to the op_skip list.
+               DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
+               # vmap: calling random operator not supported
+               DecorateInfo(unittest.skip("Test expects tensor input"), "TestVmapOperatorsOpInfo", "test_vmap_exhaustive"),
+               DecorateInfo(unittest.skip("Test expects tensor input"), "TestVmapOperatorsOpInfo", "test_op_has_batch_rule"),
+           )),
     OpInfo('uniform',
            op=lambda inp, *args, **kwargs: wrapper_set_seed(torch.Tensor.uniform_, inp, *args, **kwargs),
            method_variant=None,
@@ -8915,7 +9308,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            # trigger addmm being decomposed by a jit pass.
            dtypes=all_types_and_complex_and(torch.bfloat16),
            dtypesIfROCM=floating_and_complex_types_and(torch.float16, torch.bfloat16),
-           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if CUDA11OrLater else []),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            assert_autodiffed=True,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -8933,8 +9326,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            # When alpha=beta=1 as compile-time constants, JIT will decompose addmm into mm and add.
            variant_test_name='decomposed',
            dtypes=all_types_and_complex_and(torch.bfloat16),
-           dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
-                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            assert_autodiffed=True,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -8956,7 +9348,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     OpInfo('addmv',
            dtypes=all_types_and_complex_and(torch.bfloat16),
            dtypesIfCUDA=floating_types_and(torch.float16, torch.complex64, torch.complex128,
-                                           *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+                                           torch.bfloat16),
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
            sample_inputs_func=sample_inputs_addmv),
@@ -8967,7 +9359,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            dtypes=all_types_and_complex_and(torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
                                                        *[torch.bfloat16]
-                                                       if (SM53OrLater and CUDA11OrLater) or TEST_WITH_ROCM else []),
+                                                       if SM53OrLater or TEST_WITH_ROCM else []),
            # Runs very slowly on slow gradcheck - alternatively reduce input sizes
            gradcheck_fast_mode=True,
            supports_forward_ad=True,
@@ -9003,7 +9395,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     OpInfo('baddbmm',
            dtypes=all_types_and_complex_and(torch.bfloat16),
            dtypesIfCUDA=floating_types_and(torch.float16, torch.complex64, torch.complex128,
-                                           *[torch.bfloat16] if CUDA11OrLater or TEST_WITH_ROCM else []),
+                                           torch.bfloat16),
            backward_dtypesIfCUDA=floating_types_and(torch.float16,
                                                     *[torch.bfloat16] if SM53OrLater or TEST_WITH_ROCM else [],
                                                     torch.complex64, torch.complex128),
@@ -9029,8 +9421,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            )),
     OpInfo('dot',
            dtypes=all_types_and_complex_and(torch.bfloat16),
-           dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
-                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            assert_autodiffed=True,
            sample_inputs_func=sample_inputs_dot_vdot,
            supports_forward_ad=True,
@@ -9045,8 +9436,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            )),
     OpInfo('vdot',
            dtypes=all_types_and_complex_and(torch.bfloat16),
-           dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
-                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            sample_inputs_func=sample_inputs_dot_vdot,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -9062,7 +9452,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            dtypes=all_types_and_complex_and(torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
                                                        *[torch.bfloat16]
-                                                       if (SM53OrLater and CUDA11OrLater) or TEST_WITH_ROCM else []),
+                                                       if SM53OrLater or TEST_WITH_ROCM else []),
            assert_autodiffed=True,
            assert_jit_shape_analysis=True,
            supports_forward_ad=True,
@@ -9076,8 +9466,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            sample_inputs_func=sample_inputs_bmm),
     OpInfo('mv',
            dtypes=all_types_and_complex_and(torch.bfloat16),
-           dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
-                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            assert_autodiffed=True,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -9085,8 +9474,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     OpInfo('addr',
            dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
            backward_dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16),
-           backward_dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, *[torch.bfloat16]
-                                                           if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           backward_dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
            # Reference: https://github.com/pytorch/pytorch/issues/50747
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -9512,21 +9900,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
                DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float,)),
            )),
-    OpInfo('symeig',
-           dtypes=floating_and_complex_types(),
-           check_batched_grad=False,
-           check_batched_gradgrad=False,
-           sample_inputs_func=sample_inputs_symeig,
-           gradcheck_wrapper=gradcheck_wrapper_hermitian_input,
-           skips=(
-               DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_out',
-                            device_type='mps', dtypes=[torch.float32]),
-               DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_variant_consistency_eager',
-                            device_type='mps', dtypes=[torch.float32]),
-               DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit',
-                            device_type='mps', dtypes=[torch.float32]),
-           ),
-           decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack, with_tf32_off]),
     OpInfo('clamp',
            aliases=('clip',),
            ref=_clamp_numpy,
@@ -9647,8 +10020,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                     supports_fwgrad_bwgrad=True),
     OpInfo('corrcoef',
            dtypes=all_types_and_complex_and(torch.bfloat16),
-           dtypesIfCUDA=all_types_and_complex_and(torch.half,
-                                                  *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           dtypesIfCUDA=all_types_and_complex_and(torch.half, torch.bfloat16),
            sample_inputs_func=sample_inputs_corrcoef,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -9722,10 +10094,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    )),
     OpInfo('cov',
            dtypes=all_types_and_complex_and(torch.bfloat16),
-           dtypesIfCUDA=all_types_and_complex_and(torch.half,
-                                                  *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
-           backward_dtypesIfCUDA=all_types_and_complex_and(torch.half, *[torch.bfloat16]
-                                                           if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           dtypesIfCUDA=all_types_and_complex_and(torch.half, torch.bfloat16),
+           backward_dtypesIfCUDA=all_types_and_complex_and(torch.half, torch.bfloat16),
            sample_inputs_func=sample_inputs_cov,
            error_inputs_func=error_inputs_cov,
            supports_out=False,
@@ -9831,6 +10201,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
            sample_inputs_func=sample_inputs_diff,
+           error_inputs_func=error_inputs_diff,
            # See https://github.com/pytorch/pytorch/pull/78358
            check_batched_forward_grad=False,
            skips=(
@@ -10218,6 +10589,47 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # GradcheckError: gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False
                DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD'),
            )),
+    OpInfo('sparse.mm',
+           dtypes=floating_types_and(torch.bfloat16),
+           variant_test_name='reduce',
+           supports_autograd=True,
+           supports_out=False,
+           supports_gradgrad=False,
+           supports_forward_ad=False,
+           sample_inputs_func=sample_inputs_sparse_mm_reduce,
+           decorators=[onlyCPU],
+           skips=(
+               # NotImplementedError: Tensors of type SparseCsrTensorImpl do not have is_contiguous
+               DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_noncontiguous_samples'),
+               # RuntimeError: Sparse CSR tensors do not have strides.
+               DecorateInfo(unittest.skip("Skipped!"), 'TestTags', 'test_tags'),
+               # RuntimeError: Sparse CSR tensors do not have strides
+               DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_variant_consistency_eager'),
+               # RuntimeError: Sparse CSR tensors do not have strides
+               DecorateInfo(unittest.skip("Skipped!"), 'TestCompositeCompliance', 'test_operator'),
+               # RuntimeError: Sparse CSR tensors do not have strides
+               DecorateInfo(unittest.skip("Skipped!"), 'TestCompositeCompliance', 'test_backward'),
+               # RuntimeError: Sparse CSR tensors do not have strides
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_conj_view'),
+               # RuntimeError: Sparse CSR tensors do not have strides
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_conj_view'),
+               # RuntimeError: Sparse CSR tensors do not have strides
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_view'),
+               # RuntimeError: Sparse CSR tensors do not have strides
+               DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'),
+               # RuntimeError: unsupported memory format option Preserve
+               DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'),
+               # GradcheckError: gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_fn_fwgrad_bwgrad'),
+               # GradcheckError: gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False
+               DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_grad'),
+               # GradcheckError: gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False
+               DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_gradgrad'),
+               # GradcheckError: gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD'),
+               # GradcheckError: gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False
+               DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_fail_gradgrad'),
+           )),
     UnaryUfuncInfo('i0',
                    ref=np_unary_ufunc_integer_promotion_wrapper(
                        scipy.special.i0) if TEST_SCIPY else None,
@@ -10515,7 +10927,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                         ),
                     ], ),
     BinaryUfuncInfo('logaddexp',
-                    dtypes=floating_types_and(torch.bfloat16),
+                    dtypes=floating_and_complex_types_and(torch.bfloat16),
                     dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.float16),
                     dtypesIfROCM=floating_types_and(torch.bfloat16, torch.float16),
                     supports_forward_ad=True,
@@ -10644,8 +11056,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            )),
     OpInfo('matrix_exp',
            dtypes=floating_and_complex_types_and(torch.bfloat16),
-           dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
-                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            aliases=('linalg.matrix_exp',),
            sample_inputs_func=sample_inputs_matrix_exp,
            # Needs to construct a 2nx2n matrix by copy_ ing into it
@@ -10666,7 +11077,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            dtypes=all_types_and_complex_and(torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
                                                        *[torch.bfloat16]
-                                                       if (SM53OrLater and CUDA11OrLater) or TEST_WITH_ROCM else []),
+                                                       if SM53OrLater or TEST_WITH_ROCM else []),
            assert_autodiffed=True,
            assert_jit_shape_analysis=True,
            # Runs very slowly on slow gradcheck - alternatively reduce input sizes
@@ -11513,9 +11924,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            ref=partial(conv_transpose_ref, fn=torch.nn.functional.conv_transpose1d),
            aten_name='conv_transpose1d',
            aliases=('conv_transpose1d',),
-           dtypes=floating_and_complex_types_and(torch.int64),
+           dtypes=floating_and_complex_types_and(torch.int64, torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.chalf,
-                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+                                                       torch.bfloat16),
            sample_inputs_func=sample_inputs_conv_transpose1d,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -11557,9 +11968,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            # `ref` for this function is backward of
            # corresponding `conv*d`
            ref=partial(conv_transpose_ref, fn=torch.nn.functional.conv_transpose2d),
-           dtypes=floating_and_complex_types_and(torch.int64),
+           dtypes=floating_and_complex_types_and(torch.int64, torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.chalf,
-                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+                                                       torch.bfloat16),
            sample_inputs_func=sample_inputs_conv_transpose2d,
            # Runs very slowly on slow-gradcheck for complex.
            gradcheck_fast_mode=True,
@@ -11605,9 +12016,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            # `ref` for this function is backward of
            # corresponding `conv*d`
            ref=partial(conv_transpose_ref, fn=torch.nn.functional.conv_transpose3d),
-           dtypes=floating_and_complex_types_and(torch.int64),
+           dtypes=floating_and_complex_types_and(torch.int64, torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(
-               torch.float16, torch.chalf, *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+               torch.float16, torch.chalf, torch.bfloat16),
            sample_inputs_func=sample_inputs_conv_transpose3d,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -11663,7 +12074,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            aten_name='conv1d',
            dtypes=floating_and_complex_types_and(torch.int64, torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.chalf,
-                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+                                                       torch.bfloat16),
            sample_inputs_func=sample_inputs_conv1d,
            error_inputs_func=error_inputs_conv1d,
            supports_forward_ad=True,
@@ -11704,7 +12115,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            aten_name='conv2d',
            dtypes=floating_and_complex_types_and(torch.int64, torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.chalf,
-                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+                                                       torch.bfloat16),
            sample_inputs_func=partial(sample_inputs_conv2d),
            error_inputs_func=error_inputs_conv2d,
            gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
@@ -11914,7 +12325,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_fwgrad_bwgrad=True,
            supports_forward_ad=True,
            dtypes=floating_types_and(torch.uint8, torch.bfloat16),
-           dtypesIfCUDA=floating_types_and(torch.half, torch.uint8),
+           dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16, torch.uint8),
            sample_inputs_func=partial(sample_inputs_interpolate, 'nearest'),
            skips=(
                # RuntimeError: false
@@ -11930,7 +12341,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_fwgrad_bwgrad=True,
            supports_forward_ad=True,
            dtypes=floating_types_and(torch.bfloat16),
-           dtypesIfCUDA=floating_types_and(torch.half),
+           dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16),
            sample_inputs_func=partial(sample_inputs_interpolate, 'linear'),
            skips=(
                # RuntimeError: false
@@ -11945,8 +12356,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_fwgrad_bwgrad=True,
            supports_autograd=True,
            supports_forward_ad=True,
-           dtypes=floating_types_and(torch.bfloat16),
-           dtypesIfCUDA=floating_types_and(torch.half),
+           dtypes=floating_types_and(torch.uint8, torch.bfloat16),
+           dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16),
            gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
            sample_inputs_func=partial(sample_inputs_interpolate, 'bilinear'),
            skips=(
@@ -11963,7 +12374,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
            dtypes=floating_types_and(torch.bfloat16),
-           dtypesIfCUDA=floating_types_and(torch.half),
+           dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16),
            sample_inputs_func=partial(sample_inputs_interpolate, 'bicubic'),
            gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
            skips=(
@@ -11980,7 +12391,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
            dtypes=floating_types_and(torch.bfloat16),
-           dtypesIfCUDA=floating_types_and(torch.half),
+           dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16),
            gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
            sample_inputs_func=partial(sample_inputs_interpolate, 'trilinear'),
            skips=(
@@ -12011,8 +12422,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_autograd=True,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
-           dtypes=floating_types_and(torch.bfloat16),
-           dtypesIfCUDA=floating_types_and(torch.half),
+           dtypes=floating_types_and(torch.uint8, torch.bfloat16),
+           dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16),
            gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
            sample_inputs_func=partial(sample_inputs_upsample, 'bilinear'),
            skips=(
@@ -12022,6 +12433,25 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
            ),
            supports_out=False),
+    OpInfo('_upsample_bilinear2d_aa',
+           op=torch.ops.aten._upsample_bilinear2d_aa,
+           aten_name='_upsample_bilinear2d_aa',
+           supports_autograd=True,
+           supports_forward_ad=True,
+           supports_fwgrad_bwgrad=True,
+           dtypes=floating_types_and(torch.uint8),
+           dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16),
+           gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
+           sample_inputs_func=partial(sample_inputs_upsample_aten, 'bilinear'),
+           supports_out=False,
+           skips=(
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+               DecorateInfo(unittest.expectedFailure, 'TestDTensorOps', 'test_dtensor_op_db'),
+               DecorateInfo(unittest.expectedFailure, 'TestEagerFusionOpInfo', 'test_aot_autograd_symbolic_exhaustive'),
+               DecorateInfo(unittest.expectedFailure, 'TestInductorOpInfo', 'test_comprehensive'),
+               DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
+               DecorateInfo(unittest.expectedFailure, 'TestOperators', 'test_vmapjvpall_has_batch_rule'),
+           )),
     OpInfo(
         "nn.functional.soft_margin_loss",
         dtypes=floating_types_and(torch.bfloat16),
@@ -12037,7 +12467,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
            dtypes=floating_types_and(torch.uint8, torch.bfloat16),
-           dtypesIfCUDA=floating_types_and(torch.half, torch.uint8),
+           dtypesIfCUDA=floating_types_and(torch.half, torch.uint8, torch.bfloat16),
            gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
            sample_inputs_func=partial(sample_inputs_upsample, 'nearest'),
            skips=(
@@ -12370,10 +12800,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            sample_inputs_func=sample_inputs_linear,
            dtypes=all_types_and_complex_and(torch.bfloat16),
            dtypesIfROCM=floating_and_complex_types_and(torch.float16, torch.bfloat16),
-           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16]
-                                                       if (CUDA11OrLater or TEST_WITH_ROCM) else []),
-           backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16]
-                                                                if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
+           backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            # linear calls mm under the hood which is nondeterministic on CUDA
            # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html#torch.use_deterministic_algorithms
            gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
@@ -12394,7 +12822,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            sample_inputs_func=sample_inputs_bilinear,
            dtypes=all_types_and(torch.bfloat16),
            dtypesIfCUDA=floating_types_and(torch.float16,
-                                           *[torch.bfloat16] if (SM53OrLater and CUDA11OrLater) or TEST_WITH_ROCM else []),
+                                           *[torch.bfloat16] if SM53OrLater or TEST_WITH_ROCM else []),
            skips=(
                # NVIDIA only assures that bfloat16 is supported by bmm if SM >= 5.3
                DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', device_type='cuda', active_if=not SM53OrLater),
@@ -12560,11 +12988,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             ), ],
     ),
     OpInfo(
-        'nn.functional._scaled_dot_product_attention',
+        'nn.functional.scaled_dot_product_attention',
         op=lambda *args, **kwargs:
-               wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, *args, **kwargs)
-               if kwargs['need_attn_weights'] else
-               wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, *args, **kwargs)[0],
+               wrapper_set_seed(torch.nn.functional.scaled_dot_product_attention, *args, **kwargs),
         sample_inputs_func=sample_inputs_scaled_dot_product_attention,
         dtypes=floating_types_and(torch.bfloat16),
         dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
@@ -12586,9 +13012,20 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD'),
             # OpInfo was implemented with a lambda
             DecorateInfo(unittest.skip("Skipped!"), 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
-            # No meta function
+            # See [Note] SDPA_flash's meta function returns incorrect Philox seed and offset
+            DecorateInfo(unittest.expectedFailure, 'TestFakeTensor', 'test_fake_crossref_backward_amp',
+                         device_type='cuda', dtypes=(torch.float32,), active_if=PLATFORM_SUPPORTS_FUSED_SDPA and SM80OrLater),
+            DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_meta_outplace',
+                         device_type='cuda', dtypes=(torch.float16, torch.bfloat16),
+                         active_if=PLATFORM_SUPPORTS_FUSED_SDPA and SM80OrLater),
+            DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_symbolic_meta_outplace',
+                         device_type='cuda', dtypes=(torch.float16, torch.bfloat16),
+                         active_if=PLATFORM_SUPPORTS_FUSED_SDPA and SM80OrLater),
+            # TODO Need to understand what this is testing and why it doesn't work
             DecorateInfo(unittest.skip("Skipped"), 'TestDecomp', 'test_comprehensive'),
-            DecorateInfo(unittest.skip('output is non-deterministic (when dropout_p > 0)'), 'TestCommon', 'test_compare_cpu'),),
+            DecorateInfo(unittest.skip('output is non-deterministic (when dropout_p > 0)'), 'TestCommon', 'test_compare_cpu'),
+            # TODO skip this for now since we can't skip on runtime arch support
+            DecorateInfo(unittest.skip('This is '), 'TestInductorOpInfo', 'test_comprehensive'),),
     ),
     UnaryUfuncInfo(
         'nn.functional.silu',
@@ -12748,7 +13185,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                          dtypes=(torch.int, torch.int8)),
             # pytorch computes (0+nanj), numpy computes (-5e-18-1j) for input (-501.-1.0000e+20j)
             DecorateInfo(unittest.expectedFailure, 'TestUnaryUfuncs',
-                         "test_reference_numerics_large", dtypes=(torch.complex64,)),),
+                         "test_reference_numerics_large", dtypes=(torch.complex64,), device_type='cpu',
+                         active_if=not IS_MACOS and not IS_WINDOWS),),
     ),
     UnaryUfuncInfo(
         'nn.functional.tanhshrink',
@@ -13115,8 +13553,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    autodiff_nonfusible_nodes=["aten::relu6"]),
     OpInfo('mm',
            dtypes=all_types_and_complex_and(torch.bfloat16),
-           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16]
-                                                       if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            assert_autodiffed=True,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -13219,12 +13656,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_fwgrad_bwgrad=True,
            supports_autograd=True,
            sample_inputs_func=sample_inputs_view_reshape,
-           error_inputs_func=error_inputs_view_reshape,
-           skips=(
-               # https://github.com/pytorch/pytorch/issues/89068
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'),
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
-           )),
+           error_inputs_func=error_inputs_view_reshape),
     UnaryUfuncInfo('neg',
                    aliases=('negative', ),
                    ref=np.negative,
@@ -13753,7 +14185,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            dtypes=all_types_and_complex_and(torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
                                                        *[torch.bfloat16]
-                                                       if (SM53OrLater and CUDA11OrLater) or TEST_WITH_ROCM else []),
+                                                       if SM53OrLater or TEST_WITH_ROCM else []),
            assert_autodiffed=True,
            sample_inputs_func=partial(sample_inputs_matmul, is_rmatmul=True),
            # Runs very slowly on slow gradcheck - alternatively reduce input sizes
@@ -14264,11 +14696,10 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            # TODO(@heitorschueroff) update SampleInput to handle such cases
            op=lambda tensors, equation: torch.einsum(equation, tensors),
            dtypes=all_types_and_complex_and(torch.bfloat16),
-           dtypesIfCUDA=floating_and_complex_types_and(torch.half,
-                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16),
            backward_dtypesIfCUDA=floating_and_complex_types_and(torch.half, *[torch.bfloat16]
-                                                                if ((SM60OrLater and CUDA11OrLater)
-                                                                or TEST_WITH_ROCM) else []),
+                                                                if (SM60OrLater or
+                                                                    TEST_WITH_ROCM) else []),
            supports_out=False,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -14656,8 +15087,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_fwgrad_bwgrad=True,
            # https://github.com/pytorch/pytorch/issues/66357
            check_batched_forward_grad=False,
-           skips=(
-           ),
            sample_inputs_func=sample_inputs_index,
            reference_inputs_func=partial(sample_inputs_index, reference=True),
            gradcheck_nondet_tol=GRADCHECK_NONDET_TOL),
@@ -14737,7 +15166,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            skips=(
            )),
     OpInfo('unique',
-           dtypes=all_types_and(torch.bool, torch.bfloat16),
+           dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16),
            dtypesIfCUDA=all_types_and(torch.bool, torch.float16),
            sample_inputs_func=sample_inputs_unique,
            supports_out=False,
@@ -14752,7 +15181,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.skip('Output order is undefined when sorted=False'), 'TestCommon', 'test_compare_cpu'),
            )),
     OpInfo('unique_consecutive',
-           dtypes=all_types_and(torch.bool, torch.bfloat16),
+           dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16),
            dtypesIfCUDA=all_types_and(torch.bool, torch.float16),
            sample_inputs_func=sample_inputs_unique_consecutive,
            supports_out=False,
@@ -15362,6 +15791,48 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # UserWarning not triggered : Resized a non-empty tensor but did not warn about it.
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
            )),
+    OpInfo('empty_permuted',
+           dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf),
+           sample_inputs_func=sample_inputs_empty_permuted,
+           error_inputs_func=error_inputs_empty_permuted,
+           supports_out=False,
+           supports_autograd=False,
+           skips=(
+               DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
+               # Empty tensor data is garbage so it's hard to make comparisons with it.
+               DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'),
+               # Empty tensor data is garbage so it's hard to make comparisons with it.
+               DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_variant_consistency_eager'),
+               # Empty tensor data is garbage so it's hard to make comparisons with it.
+               DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_noncontiguous_samples'),
+               # Empty tensor data is garbage so it's hard to make comparisons with it.
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_conj_view'),
+               # Empty tensor data is garbage so it's hard to make comparisons with it.
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_view'),
+               # Empty tensor data is garbage so it's hard to make comparisons with it.
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_conj_view'),
+               # Empty tensor data is garbage so it's hard to make comparisons with it.
+               DecorateInfo(unittest.skip("Skipped!"), 'TestNNCOpInfo', 'test_nnc_correctness'),
+               # Empty tensor data is garbage so it's hard to make comparisons with it.
+               DecorateInfo(unittest.skip("Skipped!"), 'TestCudaFuserOpInfo'),
+               # Empty tensor data is garbage so it's hard to make comparisons with it.
+               DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_non_standard_bool_values'),
+               DecorateInfo(unittest.skip("Expected: empty_permuted is not comparable"), 'TestCompositeCompliance',
+                            'test_operator'),
+               # requires_grad doesn't exist in the jit schema
+               DecorateInfo(unittest.expectedFailure, 'TestOperatorSignatures', 'test_get_torch_func_signature_exhaustive'),
+               DecorateInfo(unittest.skip("Expected: empty_permuted is not comparable"),
+                            'TestCommon',
+                            'test_out'),
+               DecorateInfo(unittest.skip("Expected: empty_permuted is not comparable"),
+                            'TestCommon',
+                            'test_out_warning'),
+               DecorateInfo(unittest.skip("Expected: empty_permuted is not comparable"),
+                            'TestLazyOpInfo'),
+               DecorateInfo(unittest.skip("Expected: empty_permuted is not comparable"),
+                            'TestCommon', 'test_complex_half_reference_testing'),
+               DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+           )),
     OpInfo('scalar_tensor',
            dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf),
            sample_inputs_func=sample_inputs_scalar_tensor,
@@ -15430,7 +15901,13 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # Computed gradient is incorrect -- would be an exfail but gradgrad somehow passes
                DecorateInfo(unittest.skip("Gradients are incorrect!"), 'TestFwdGradients'),
                DecorateInfo(unittest.skip("Gradients are incorrect!"), 'TestBwdGradients'),
-               DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'))),
+               DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+               # RuntimeError: Difference from {dtype} is larger with decomposition
+               DecorateInfo(unittest.skip("Skipped!"), 'TestDecomp', 'test_comprehensive'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestDecomp', 'test_quick'),
+               # The inplace variant (Tensor.normal_) is different from torch.normal
+               # inplace varaint Tensor.normal_ is decomposed using randn_like()
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMeta', 'test_dispatch_symbolic_meta_outplace_all_strides'))),
     OpInfo('normal',
            # This has its own variant b/c OpInfos assume the first arg is a Tensor but it is not here
            variant_test_name='number_mean',
@@ -15451,7 +15928,21 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # Computed gradient is incorrect -- would be an exfail but gradgrad somehow passes
                DecorateInfo(unittest.skip("Gradients are incorrect!"), 'TestFwdGradients'),
                DecorateInfo(unittest.skip("Gradients are incorrect!"), 'TestBwdGradients'),
-               DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'))),
+               DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+               # The inplace variant (Tensor.normal_) is different from torch.normal
+               # inplace varaint Tensor.normal_ is decomposed using randn_like()
+               # TypeError: randn_like(): argument 'input' (position 1) must be Tensor, not float
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensor', 'test_fake_autocast'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensor', 'test_fake'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMeta', 'test_dispatch_symbolic_meta_outplace'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMeta', 'test_dispatch_symbolic_meta_outplace_all_strides'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMeta', 'test_dispatch_meta_outplace'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMeta', 'test_meta_outplace'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestDecomp', 'test_comprehensive'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestDecomp', 'test_quick'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestProxyTensorOpInfo', 'test_make_fx_fake_exhaustive'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensor', 'test_fake_crossref_backward_amp'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensor', 'test_fake_crossref_backward_no_amp'))),
     OpInfo('bernoulli',
            op=lambda inp, *args, **kwargs:
                wrapper_set_seed(torch.bernoulli, inp, *args, **kwargs),
@@ -15659,17 +16150,12 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            backward_dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            # Runs very slowly on slow gradcheck - alternatively reduce input sizes
            gradcheck_fast_mode=True,
-           supports_out=False,
+           supports_out=True,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
            check_batched_gradgrad=False,
            # See https://github.com/pytorch/pytorch/issues/66357
            check_batched_forward_grad=False,
-           skips=(
-               # *_copy functions do not seem to treat out as expected
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'),
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
-           ),
            sample_inputs_func=sample_inputs_unfold),
     OpInfo('msort',
            dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16),
@@ -16012,8 +16498,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            sample_inputs_func=sample_inputs_kron),
     OpInfo('inner',
            dtypes=all_types_and_complex_and(torch.bfloat16),
-           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16]
-                                                       if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            dtypesIfROCM=floating_and_complex_types_and(torch.half, torch.bfloat16),
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -16023,8 +16508,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            ),
     OpInfo('tensordot',
            dtypes=all_types_and_complex_and(torch.bfloat16),
-           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, *[torch.bfloat16]
-                                                       if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            dtypesIfROCM=floating_and_complex_types_and(torch.half, torch.bfloat16),
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -16746,7 +17230,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         # This is because currently only the `input` field of SampleInput
         # is tested in gradient tests.
         op=lambda weight, idx, **kwargs: torch.nn.functional.embedding_bag(idx, weight, **kwargs),
-        dtypes=floating_types_and(torch.float16),
+        dtypes=floating_types_and(torch.bfloat16, torch.float16),
         dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.float16),
         # backward is not supported for mode `max` and dtype `bfloat16`
         backward_dtypesIfCUDA=floating_types_and(torch.float16),
@@ -16983,6 +17467,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         check_batched_forward_grad=False,
         supports_fwgrad_bwgrad=True,
         dtypes=floating_types_and(torch.float16, torch.bfloat16),
+        dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16, torch.chalf),
         sample_inputs_func=sample_inputs_nan_reduction(supports_multiple_dims=True),
         ref=reference_reduction_numpy(np.nanmean),
         skips=(
@@ -17138,10 +17623,16 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         supports_out=False,
         supports_forward_ad=True,
         supports_fwgrad_bwgrad=True,
+        supports_sparse=True,
         promotes_int_to_int64=True,
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
         dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
         ref=reference_reduction_numpy(np.sum),
+        sample_inputs_sparse_coo_func=partial(sample_inputs_reduction_sparse, layout=torch.sparse_coo),
+        sample_inputs_sparse_csr_func=partial(sample_inputs_reduction_sparse, layout=torch.sparse_csr),
+        sample_inputs_sparse_csc_func=partial(sample_inputs_reduction_sparse, layout=torch.sparse_csc),
+        sample_inputs_sparse_bsr_func=partial(sample_inputs_reduction_sparse, layout=torch.sparse_bsr),
+        sample_inputs_sparse_bsc_func=partial(sample_inputs_reduction_sparse, layout=torch.sparse_bsc),
         skips=(
             # FIXME: sum does not support passing keepdim without passing dim
             DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_dim_default_keepdim'),
@@ -17167,6 +17658,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         check_batched_forward_grad=False,
         supports_fwgrad_bwgrad=True,
         dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16),
+        dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
         sample_inputs_func=sample_inputs_nan_reduction(supports_multiple_dims=True),
         ref=reference_reduction_numpy(np.nansum),
         skips=(
@@ -17486,7 +17978,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         sample_inputs_func=sample_inputs_scatter_reduce,
     ),
     OpInfo(
-        'segment_reduce',
+        '_segment_reduce',
+        aten_name='segment_reduce',
         variant_test_name='lengths',
         dtypes=floating_types_and(torch.float16, torch.bfloat16),
         supports_out=False,
@@ -17505,7 +17998,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         ),
     ),
     OpInfo(
-        'segment_reduce',
+        '_segment_reduce',
+        aten_name='segment_reduce',
         variant_test_name='offsets',
         dtypes=floating_types_and(torch.float16, torch.bfloat16),
         supports_out=False,
@@ -17591,6 +18085,149 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         ),
         supports_nvfuser=False,
     ),
+    PythonRefInfo(
+        "_refs.cauchy",
+        torch_opinfo_name="cauchy",
+        decorators=(
+            # TODO: RuntimeError: no _refs support for torch.rand_like
+            DecorateInfo(unittest.skip("TODO: RuntimeError: no _refs support for torch.rand_like"),
+                         'TestCommon',
+                         'test_python_ref'),
+            # AssertionError: Tensor-likes are not close!
+            DecorateInfo(unittest.skip("Expected: cauchy is not comparable"),
+                         'TestCommon',
+                         'test_out'),
+            DecorateInfo(unittest.skip("Expected: cauchy is not comparable"),
+                         'TestCommon',
+                         'test_out_warning'),
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_executor'),
+            DecorateInfo(unittest.skip("Expected: cauchy is not comparable"),
+                         'TestCommon',
+                         'test_python_ref_torch_fallback'),
+            DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+            DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
+        )
+    ),
+    PythonRefInfo(
+        "_refs.exponential",
+        torch_opinfo_name="exponential",
+        supports_out=True,
+        decorators=(
+            # dtypes that do not support check_uniform_bounds of rand_like
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_meta',
+                         dtypes=(torch.int8, torch.uint8, torch.int16, torch.int32, torch.int64)),
+            DecorateInfo(unittest.skip('Skipped!'), 'TestCommon', 'test_dtypes'),
+
+            # TODO: RuntimeError: no _refs support for torch.rand_like
+            DecorateInfo(unittest.skip("TODO: RuntimeError: no _refs support for torch.rand_like"),
+                         'TestCommon',
+                         'test_python_ref'),
+
+            # AssertionError: Tensor-likes are not close!
+            DecorateInfo(unittest.skip("Expected: exponential is not comparable"),
+                         'TestCommon',
+                         'test_out'),
+            DecorateInfo(unittest.skip("Expected: exponential is not comparable"),
+                         'TestCommon',
+                         'test_out_warning'),
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_executor'),
+            DecorateInfo(unittest.skip("Expected: exponential is not comparable"),
+                         'TestCommon',
+                         'test_python_ref_torch_fallback'),
+            DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+            DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
+        )
+    ),
+    PythonRefInfo(
+        "_refs.geometric",
+        torch_opinfo_name="geometric",
+        supports_out=True,
+        decorators=(
+            # dtypes that do not support check_uniform_bounds of rand_like
+            DecorateInfo(unittest.skip('Skipped!'), 'TestCommon', 'test_dtypes'),
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_meta',
+                         dtypes=(torch.int8, torch.uint8, torch.int16, torch.int32, torch.int64)),
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_torch_fallback',
+                         dtypes=(torch.int8, torch.uint8, torch.int16, torch.int32, torch.int64)),
+
+            # TODO: RuntimeError: no _refs support for torch.rand_like
+            DecorateInfo(unittest.skip("TODO: RuntimeError: no _refs support for torch.rand_like"),
+                         'TestCommon',
+                         'test_python_ref'),
+            DecorateInfo(unittest.skip("Expected: geometric is not comparable"),
+                         'TestCommon',
+                         'test_python_ref_executor', device_type='cuda'),
+
+            # AssertionError: Tensor-likes are not close!
+            DecorateInfo(unittest.skip("Expected: geometric is not comparable"),
+                         'TestCommon',
+                         'test_out'),
+            DecorateInfo(unittest.skip("Expected: geometric is not comparable"),
+                         'TestCommon',
+                         'test_out_warning'),
+            DecorateInfo(unittest.skip("Expected: geometric is not comparable"),
+                         'TestCommon',
+                         'test_python_ref_torch_fallback'),
+            DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+            DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
+        )
+    ),
+    PythonRefInfo(
+        "_refs.log_normal",
+        torch_opinfo_name="log_normal",
+        supports_out=True,
+        decorators=(
+            # TODO: RuntimeError: no _refs support for torch.rand_like
+            DecorateInfo(unittest.skip("TODO: RuntimeError: no _refs support for torch.rand_like"),
+                         'TestCommon',
+                         'test_python_ref'),
+            DecorateInfo(unittest.skip("Expected: log_normal is not comparable"),
+                         'TestCommon',
+                         'test_python_ref_executor', device_type='cuda'),
+
+            # AssertionError: Tensor-likes are not close!
+            DecorateInfo(unittest.skip("Expected: log_normal is not comparable"),
+                         'TestCommon',
+                         'test_out'),
+            DecorateInfo(unittest.skip("Expected: log_normal is not comparable"),
+                         'TestCommon',
+                         'test_out_warning'),
+            DecorateInfo(unittest.skip("Expected: log_normal is not comparable"),
+                         'TestCommon',
+                         'test_python_ref_torch_fallback'),
+            DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+            DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
+        )
+    ),
+    PythonRefInfo(
+        "_refs.normal",
+        torch_opinfo_name="normal",
+        torch_opinfo_variant_name="in_place",
+        supports_out=True,
+        decorators=(
+            # TODO: RuntimeError: no _refs support for torch.rand_like
+            DecorateInfo(unittest.skip("TODO: RuntimeError: no _refs support for torch.rand_like"),
+                         'TestCommon',
+                         'test_python_ref'),
+
+            # AssertionError: Tensor-likes are not close!
+            DecorateInfo(unittest.skip("Expected: normal is not comparable"),
+                         'TestCommon',
+                         'test_out'),
+            DecorateInfo(unittest.skip("Expected: normal is not comparable"),
+                         'TestCommon',
+                         'test_out_warning'),
+            DecorateInfo(unittest.skip("Expected: normal is not comparable"),
+                         'TestCommon',
+                         'test_python_ref_torch_fallback'),
+            DecorateInfo(unittest.skip("Expected: normal is not comparable"), 'TestDecomp', 'test_comprehensive'),
+            DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'),
+            DecorateInfo(unittest.skip("make_traced() doesn't set seed properly!"), 'TestCommon', 'test_python_ref_executor'),
+            DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
+            DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_conj_view'),
+            DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_conj_view'),
+        )
+    ),
     PythonRefInfo(
         "_refs.arange",
         torch_opinfo_name="arange",
@@ -17599,10 +18236,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
             DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_conj_view'),
             DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_conj_view'),
-
-            # Prims arange does not follow aten
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_meta',
-                         dtypes=(torch.int64,)),
         ),
         supports_nvfuser=False,
     ),
@@ -18372,6 +19005,13 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.logaddexp",
         torch_opinfo_name="logaddexp",
         supports_nvfuser=False,
+        skips=(
+            # failure due to mismatch in edge cases, which boils down to what torch.exp(inf + infj) should be
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref', device_type='cpu',
+                         dtypes=(torch.complex64, torch.complex128)),
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_torch_fallback', device_type='cpu',
+                         dtypes=(torch.complex64, torch.complex128)),
+        ),
     ),
     ElementwiseBinaryPythonRefInfo(
         "_refs.floor_divide",
@@ -19589,3 +20229,38 @@ def mask_not_all_zeros(shape):
         result = torch.randn(shape).gt(0)
         if result.sum() > 0:
             return result
+
+# Copied from functorch
+def xfail(op_name, variant_name='', *, device_type=None, dtypes=None):
+    return (op_name, variant_name, device_type, dtypes, True)
+
+
+def skip(op_name, variant_name='', *, device_type=None, dtypes=None):
+    return (op_name, variant_name, device_type, dtypes, False)
+
+
+def skipOps(test_case_name, base_test_name, to_skip):
+    all_opinfos = op_db
+    for xfail in to_skip:
+        op_name, variant_name, device_type, dtypes, expected_failure = xfail
+        matching_opinfos = [o for o in all_opinfos
+                            if o.name == op_name and o.variant_test_name == variant_name]
+        assert len(matching_opinfos) >= 1, f"Couldn't find OpInfo for {xfail}"
+        for op in matching_opinfos:
+            decorators = list(op.decorators)
+            if expected_failure:
+                decorator = DecorateInfo(unittest.expectedFailure,
+                                         test_case_name, base_test_name,
+                                         device_type=device_type, dtypes=dtypes)
+                decorators.append(decorator)
+            else:
+                decorator = DecorateInfo(unittest.skip("Skipped!"),
+                                         test_case_name, base_test_name,
+                                         device_type=device_type, dtypes=dtypes)
+                decorators.append(decorator)
+            op.decorators = tuple(decorators)
+
+    # This decorator doesn't modify fn in any way
+    def wrapped(fn):
+        return fn
+    return wrapped
diff --git a/torch/testing/_internal/common_modules.py b/torch/testing/_internal/common_modules.py
index 4031ea54a5ca..0a8b49960ec5 100644
--- a/torch/testing/_internal/common_modules.py
+++ b/torch/testing/_internal/common_modules.py
@@ -6,16 +6,17 @@
 from itertools import chain, product
 import itertools
 import torch.nn.functional as F
+from torch.nn.utils.rnn import pack_padded_sequence
 from torch.testing import make_tensor
 from torch.testing._internal.common_cuda import TEST_CUDNN
 from torch.testing._internal.common_dtype import floating_types, floating_and_complex_types_and
 from torch.testing._internal.common_device_type import (
     _TestParametrizer, _update_param_kwargs, toleranceOverride, tol,
-    skipCUDAIfCudnnVersionLessThan, skipCUDAIfRocm, precisionOverride, skipMeta)
+    skipCUDAIfCudnnVersionLessThan, skipCUDAIfRocm, precisionOverride, skipMeta, skipCUDAVersionIn)
 from torch.testing._internal.common_methods_invocations import DecorateInfo
 from torch.testing._internal.common_nn import nllloss_reference, get_reduction
 from torch.testing._internal.common_utils import (
-    freeze_rng_state, set_single_threaded_if_parallel_tbb, skipIfMps, GRADCHECK_NONDET_TOL, TEST_WITH_ROCM)
+    freeze_rng_state, set_single_threaded_if_parallel_tbb, skipIfMps, GRADCHECK_NONDET_TOL, TEST_WITH_ROCM, IS_WINDOWS)
 from types import ModuleType
 from typing import List, Tuple, Type, Set, Dict
 
@@ -23,8 +24,8 @@
 MODULE_NAMESPACES: List[ModuleType] = [
     torch.nn.modules,
     torch.ao.nn.qat.modules,
-    torch.nn.quantizable.modules,
-    torch.nn.quantized.modules,
+    torch.ao.nn.quantizable.modules,
+    torch.ao.nn.quantized.modules,
     torch.ao.nn.quantized.modules,
 ]
 
@@ -33,7 +34,7 @@
     torch.nn.Module,  # abstract base class
     torch.nn.Container,  # deprecated
     torch.nn.NLLLoss2d,  # deprecated
-    torch.nn.quantized.MaxPool2d,  # aliases to nn.MaxPool2d
+    torch.ao.nn.quantized.MaxPool2d,  # aliases to nn.MaxPool2d
     torch.ao.nn.quantized.MaxPool2d,  # aliases to nn.MaxPool2d
 }
 
@@ -63,8 +64,8 @@
 class modules(_TestParametrizer):
     """ PROTOTYPE: Decorator for specifying a list of modules over which to run a test. """
 
-    def __init__(self, module_info_list, allowed_dtypes=None, train_eval_mode=TrainEvalMode.train_and_eval):
-        self.module_info_list = module_info_list
+    def __init__(self, module_info_iterable, allowed_dtypes=None, train_eval_mode=TrainEvalMode.train_and_eval):
+        self.module_info_list = list(module_info_iterable)
         self.allowed_dtypes = set(allowed_dtypes) if allowed_dtypes is not None else None
         self.train_eval_mode = train_eval_mode
 
@@ -132,7 +133,7 @@ def get_module_common_name(module_cls):
         return module_cls.__name__
 
 
-class FunctionInput(object):
+class FunctionInput:
     """ Contains args and kwargs to pass as input to a function. """
     __slots__ = ['args', 'kwargs']
 
@@ -141,7 +142,7 @@ def __init__(self, *args, **kwargs):
         self.kwargs = kwargs
 
 
-class ModuleInput(object):
+class ModuleInput:
     """ Contains args / kwargs for module instantiation + forward pass. """
     __slots__ = ['constructor_input', 'forward_input', 'desc', 'reference_fn']
 
@@ -164,7 +165,7 @@ def copy_reference_fn(m, *args, **kwargs):
             self.reference_fn = copy_reference_fn
 
 
-class ModuleInfo(object):
+class ModuleInfo:
     """ Module information to be used in testing. """
 
     def __init__(self,
@@ -947,8 +948,15 @@ def module_inputs_torch_nn_LSTMCell(module_info, device, dtype, requires_grad, t
 
     return samples
 
+def make_packed_sequence(inp, batch_sizes):
+    required_grad = inp.requires_grad
+    inp.requires_grad_(False)  # user won't have access to inp so won't be able to get its grads
+    seq = pack_padded_sequence(inp, batch_sizes)
+    seq.data.requires_grad_(required_grad)
+    return seq
+
 
-def module_inputs_torch_nn_RNN_GRU(module_info, device, dtype, requires_grad, training, **kwargs):
+def module_inputs_torch_nn_RNN_GRU(module_info, device, dtype, requires_grad, training, with_packed_sequence=False, **kwargs):
     # Currently all samples below are for validating the no-batch-dim support.
     make_input = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
     is_rnn = kwargs['is_rnn']
@@ -980,7 +988,7 @@ def module_inputs_torch_nn_RNN_GRU(module_info, device, dtype, requires_grad, tr
         samples.append(
             ModuleInput(
                 constructor_input=FunctionInput(**cons_args),
-                forward_input=FunctionInput(make_input((2, 2))),
+                forward_input=FunctionInput(make_input((3, 2))),
                 reference_fn=partial(no_batch_dim_reference_rnn_gru, batch_first=b_f),
             )
         )
@@ -991,6 +999,21 @@ def module_inputs_torch_nn_RNN_GRU(module_info, device, dtype, requires_grad, tr
                 reference_fn=partial(no_batch_dim_reference_rnn_gru, batch_first=b_f),
             )
         )
+        if with_packed_sequence:
+            samples.append(
+                ModuleInput(
+                    constructor_input=FunctionInput(**cons_args),
+                    forward_input=FunctionInput(make_packed_sequence(make_input((5, 2, 2)), torch.tensor([5, 3]))),
+                    reference_fn=partial(no_batch_dim_reference_rnn_gru, batch_first=b_f),
+                )
+            )
+            samples.append(
+                ModuleInput(
+                    constructor_input=FunctionInput(**cons_args),
+                    forward_input=FunctionInput(make_packed_sequence(make_input((5, 5, 2)), torch.tensor([5, 3, 3, 2, 2]))),
+                    reference_fn=partial(no_batch_dim_reference_rnn_gru, batch_first=b_f),
+                )
+            )
 
     return samples
 
@@ -1032,6 +1055,7 @@ def module_inputs_torch_nn_LSTM(module_info, device, dtype, requires_grad, train
             )
         )
 
+
     return samples
 
 
@@ -1059,6 +1083,14 @@ def module_inputs_torch_nn_LSTM(module_info, device, dtype, requires_grad, train
         unittest.expectedFailure, "TestModule", "test_non_contiguous_tensors",
         active_if=(TEST_CUDNN and TEST_WITH_ROCM), dtypes=(torch.float,), device_type='cuda'
     ),
+    DecorateInfo(
+        skipCUDAVersionIn([(11, 7)]), "TestExpandedWeightModule", "test_module",
+        device_type='cuda'
+    ),
+    DecorateInfo(
+        skipCUDAVersionIn([(11, 7)]), "TestDecomp", "test_rnn_decomp_module",
+        device_type='cuda'
+    )
 )
 
 # Database of ModuleInfo entries in alphabetical order.
@@ -1180,12 +1212,11 @@ def module_inputs_torch_nn_LSTM(module_info, device, dtype, requires_grad, train
                    DecorateInfo(skipIfMps, 'TestModule', dtypes=[torch.float64]),
                    # This was wrongly being skipped before and needs investigation.
                    # See https://github.com/pytorch/pytorch/issues/80247
-                   DecorateInfo(unittest.expectedFailure, "TestModule", "test_memory_format", device_type='cpu'),
                    DecorateInfo(unittest.expectedFailure, "TestModule", "test_memory_format", device_type='cuda',
                                 dtypes=[torch.float64, torch.complex128]),
                    # These fail only on ROCm
                    DecorateInfo(unittest.expectedFailure, "TestModule", "test_memory_format", device_type='cuda',
-                                dtypes=[torch.complex32, torch.complex64], active_if=TEST_WITH_ROCM),
+                                dtypes=[torch.complex32], active_if=TEST_WITH_ROCM),
                    # Not implmented for chalf on CPU
                    DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_forward',
                                 dtypes=(torch.chalf,), device_type='cpu'),
@@ -1344,7 +1375,6 @@ def module_inputs_torch_nn_LSTM(module_info, device, dtype, requires_grad, train
                    DecorateInfo(skipIfMps, 'TestModule', dtypes=[torch.float64]),
                    # This was wrongly being skipped before and needs investigation.
                    # See https://github.com/pytorch/pytorch/issues/80247
-                   DecorateInfo(unittest.expectedFailure, "TestModule", "test_memory_format", device_type='cpu'),
                    DecorateInfo(unittest.expectedFailure, "TestModule", "test_memory_format", device_type='cuda',
                                 dtypes=[torch.float64]),
                ),
@@ -1440,6 +1470,11 @@ def module_inputs_torch_nn_LSTM(module_info, device, dtype, requires_grad, train
     ModuleInfo(torch.nn.TransformerEncoderLayer,
                train_and_eval_differ=True,
                module_inputs_func=module_inputs_torch_nn_TransformerEncoderLayer,
+               decorators=[
+                   DecorateInfo(toleranceOverride({torch.float32: tol(atol=1e-4, rtol=1e-4)}),
+                                'TestModule', 'test_non_contiguous_tensors',
+                                device_type='cpu', active_if=IS_WINDOWS),
+               ],
                skips=(
                    # No channels_last support for TransformerEncoderLayer currently.
                    DecorateInfo(unittest.skip("Skipped!"), 'TestModule', 'test_memory_format'),
diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py
index 5faa8ce099e5..c60bd4e57b95 100644
--- a/torch/testing/_internal/common_nn.py
+++ b/torch/testing/_internal/common_nn.py
@@ -5848,7 +5848,7 @@ def check_jacobian(self, module, input: _TensorOrTensors, jacobian_input=True):
             self.assertLessEqual(max(differences), PRECISION)  # type: ignore[type-var]
 
 
-class TestBase(object):
+class TestBase:
 
     _required_arg_names = {'constructor_args', 'input', 'extra_args'}
 
@@ -6033,6 +6033,9 @@ def test_cuda(self, test_case):
         cpu_input = self._get_input()
         type_map = {torch.double: torch.float}
         cpu_input_tuple = cpu_input if isinstance(cpu_input, tuple) else (cpu_input,)
+
+        is_any_input_complex = any(map(lambda t: isinstance(t, torch.Tensor) and t.dtype.is_complex, cpu_input_tuple))
+
         gpu_input_tuple = to_gpu(cpu_input_tuple, type_map=type_map)
 
         cpu_module = self.constructor(*self.constructor_args)
@@ -6093,12 +6096,19 @@ def test_cuda(self, test_case):
             # torch.autograd.grad doesn't complain that some inputs
             # are unreachable (which can happen if you differentiate
             # only on the gradient.
+            if is_any_input_complex:
+                outputs_cpu = cpu_output.sum().abs() + sum(x.sum().abs() for x in cpu_gradInputs)
+                outputs_gpu = gpu_output.sum().abs() + sum(x.sum().abs() for x in gpu_gradInputs)
+            else:
+                outputs_cpu = cpu_output.sum() + sum(x.sum() for x in cpu_gradInputs)
+                outputs_gpu = gpu_output.sum() + sum(x.sum() for x in gpu_gradInputs)
+
             cpu_gg = torch.autograd.grad(
-                cpu_output.sum() + sum(x.sum() for x in cpu_gradInputs),
+                outputs_cpu,
                 cpu_input_tuple + (cpu_gradOutput,) + tuple(cpu_module.parameters()),
                 retain_graph=True)
             gpu_gg = torch.autograd.grad(
-                gpu_output.sum() + sum(x.sum() for x in gpu_gradInputs),
+                outputs_gpu,
                 gpu_input_tuple + (gpu_gradOutput,) + tuple(gpu_module.parameters()),
                 retain_graph=True)
             test_case.assertEqual(cpu_gradInput, gpu_gradInput, atol=self.precision, rtol=0, exact_dtype=False)
@@ -6108,7 +6118,7 @@ def test_cuda(self, test_case):
         self.test_noncontig(test_case, gpu_module, gpu_input_tuple)
 
 
-class InputVariableMixin(object):
+class InputVariableMixin:
     def _get_input(self):
         input = TestBase._get_input(self, False)  # type: ignore[arg-type]
 
@@ -6477,13 +6487,13 @@ def _test_module_empty_input(test_case, module, inp, check_size=True, inference=
 def _create_basic_net():
     class Layer(nn.Module):
         def __init__(self):
-            super(Layer, self).__init__()
+            super().__init__()
             self.layer_dummy_param = nn.Parameter(torch.empty(3, 5))
             self.register_buffer('layer_dummy_buf', torch.zeros(1, 3, 3, 7))
 
     class Net(nn.Module):
         def __init__(self):
-            super(Net, self).__init__()
+            super().__init__()
             self.l1 = Layer()
             self.dummy_param = nn.Parameter(torch.empty(3, 5))
             self.register_buffer('dummy_buf', torch.zeros(7, 3, 3, 1))
diff --git a/torch/testing/_internal/common_pruning.py b/torch/testing/_internal/common_pruning.py
index 8fc08ee2a41b..d652316f66f1 100644
--- a/torch/testing/_internal/common_pruning.py
+++ b/torch/testing/_internal/common_pruning.py
@@ -7,6 +7,22 @@
 from torch import nn
 
 
+def rows_are_subset(subset_tensor, superset_tensor) -> bool:
+    """
+    Checks to see if all rows in subset tensor are present in the superset tensor
+    """
+    i = 0
+    for row in subset_tensor:
+        while i < len(superset_tensor):
+            if not torch.equal(row, superset_tensor[i]):
+                i += 1
+            else:
+                break
+        else:
+            return False
+    return True
+
+
 class SimpleLinear(nn.Module):
     r"""Model with only Linear layers without biases, some wrapped in a Sequential,
     some following the Sequential. Used to test basic pruned Linear-Linear fusion."""
@@ -309,3 +325,37 @@ def forward(self, x):
         x = self.flatten(x)
         x = self.fc(x)
         return x
+
+
+class LSTMLinearModel(nn.Module):
+    """Container module with an encoder, a recurrent module, and a linear."""
+
+    def __init__(
+        self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int
+    ):
+        super().__init__()
+        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers)
+        self.linear = nn.Linear(hidden_dim, output_dim)
+
+    def forward(self, input):
+        output, hidden = self.lstm(input)
+        decoded = self.linear(output)
+        return decoded, output
+
+
+class LSTMLayerNormLinearModel(nn.Module):
+    """Container module with an LSTM, a LayerNorm, and a linear."""
+
+    def __init__(
+        self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int
+    ):
+        super().__init__()
+        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers)
+        self.norm = nn.LayerNorm(hidden_dim)
+        self.linear = nn.Linear(hidden_dim, output_dim)
+
+    def forward(self, x):
+        x, state = self.lstm(x)
+        x = self.norm(x)
+        x = self.linear(x)
+        return x, state
diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py
index 6d23d68e929a..86d587680174 100644
--- a/torch/testing/_internal/common_quantization.py
+++ b/torch/testing/_internal/common_quantization.py
@@ -5,10 +5,10 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import torch.nn.intrinsic.quantized.dynamic as nniqd
+import torch.ao.nn.intrinsic.quantized.dynamic as nniqd
 import torch.ao.nn.quantized as nnq
 import torch.ao.nn.quantized.dynamic as nnqd
-from torch.nn.intrinsic import _FusedModule
+from torch.ao.nn.intrinsic import _FusedModule
 import torch.distributed as dist
 from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM
 
@@ -18,11 +18,11 @@
     default_embedding_qat_qconfig,
     default_symmetric_qnnpack_qat_qconfig,
 )
-from torch.quantization import QuantWrapper, QuantStub, DeQuantStub, \
+from torch.ao.quantization import QuantWrapper, QuantStub, DeQuantStub, \
     default_qconfig, default_dynamic_qconfig, default_per_channel_qconfig, QConfig, default_observer, default_weight_observer, \
     propagate_qconfig_, convert, get_default_qconfig, quantize_dynamic_jit, quantize_jit, float_qparams_weight_only_qconfig, \
     get_default_qat_qconfig, PerChannelMinMaxObserver, default_dynamic_quant_observer, quantize
-from torch.quantization.quantization_mappings import (
+from torch.ao.quantization.quantization_mappings import (
     get_default_dynamic_quant_module_mappings,
     get_default_qconfig_propagation_list,
     get_default_qat_module_mappings,
@@ -129,7 +129,7 @@ def test_only_train_fn(model, train_data, loss_fn=_default_loss_fn):
             correct += (predicted == target).sum().item()
     return train_loss, correct, total
 
-class AverageMeter(object):
+class AverageMeter:
     """Computes and stores the average and current value"""
     def __init__(self, name, fmt=':f'):
         self.name = name
@@ -267,6 +267,18 @@ def _make_conv_test_input(
 
     return (X, X_q, W, W_q, b if use_bias else None)
 
+def _make_conv_add_extra_input_tensor(scale, zero_point, sizes):
+    (X_value_min, X_value_max) = (0, 4)
+    X_init = torch.randint(
+        X_value_min,
+        X_value_max,
+        sizes  # Infer the size of tensor to do the add
+    )
+    X = scale * (X_init - zero_point).float()
+    X_q = torch.quantize_per_tensor(
+        X, scale=scale, zero_point=zero_point, dtype=torch.quint8)
+    return X, X_q
+
 def skipIfNoFBGEMM(fn):
     reason = 'Quantized operations require FBGEMM. FBGEMM is only optimized for CPUs with instruction set support AVX2 or newer.'
     if isinstance(fn, type):
@@ -307,6 +319,25 @@ def wrapper(*args, **kwargs):
             fn(*args, **kwargs)
     return wrapper
 
+def withQNNPACKBackend(fn):
+    # TODO(future PR): consider combining with skipIfNoQNNPACK,
+    # will require testing of existing callsites
+    reason = 'Quantized operations require QNNPACK.'
+    if isinstance(fn, type):
+        if 'qnnpack' not in torch.backends.quantized.supported_engines:
+            fn.__unittest_skip__ = True
+            fn.__unittest_skip_why__ = reason
+        return fn
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        if 'qnnpack' not in torch.backends.quantized.supported_engines:
+            raise unittest.SkipTest(reason)
+        with override_quantized_engine('qnnpack'):
+            fn(*args, **kwargs)
+
+    return wrapper
+
 def skipIfNoONEDNN(fn):
     reason = 'Quantized operations require ONEDNN.'
     if isinstance(fn, type):
@@ -422,7 +453,7 @@ def is_leaf_module(module):
            ((is_leaf_module(module) and not isinstance(module, torch.nn.Sequential)
             and type(module) in propagate_qconfig_list) or
            type(module) in float_to_observed_module_class_mapping.keys()) and \
-           not isinstance(module, torch.quantization.DeQuantStub):
+           not isinstance(module, torch.ao.quantization.DeQuantStub):
             self.assertTrue(hasattr(module, 'activation_post_process'),
                             'module: ' + str(type(module)) + ' do not have observer')
         # we don't need to check observers for child modules of the
@@ -990,15 +1021,11 @@ def checkEmbeddingSerialization(self, qemb, num_embeddings, embedding_dim, indic
         self.assertTrue(expected_name in str(q_embeddingbag))
 
 class QuantizationLiteTestCase(QuantizationTestCase):
-
-    def setUp(self):
-        super().setUp()
-
     def _create_quantized_model(self, model_class: Type[torch.nn.Module], **kwargs):
         # Creates quantized model for testing mobile script modules
         qengine = "qnnpack"
         with override_quantized_engine(qengine):
-            qconfig = torch.quantization.get_default_qconfig(qengine)
+            qconfig = torch.ao.quantization.get_default_qconfig(qengine)
             model = model_class(**kwargs)
             model = quantize(model, test_only_eval_fn, [self.calib_data])
 
@@ -1054,7 +1081,7 @@ def get_example_inputs(self) -> Tuple[Any, ...]:
 class AnnotatedSingleLayerLinearModel(torch.nn.Module):
     def __init__(self, qengine='fbgemm'):
         super().__init__()
-        self.qconfig = torch.quantization.get_default_qconfig(qengine)
+        self.qconfig = torch.ao.quantization.get_default_qconfig(qengine)
         self.fc1 = QuantWrapper(torch.nn.Linear(5, 5).to(dtype=torch.float))
 
     def forward(self, x):
@@ -1067,7 +1094,7 @@ def get_example_inputs(self) -> Tuple[Any, ...]:
 class SingleLayerLinearDynamicModel(torch.nn.Module):
     def __init__(self, qengine='fbgemm'):
         super().__init__()
-        self.qconfig = torch.quantization.get_default_qconfig(qengine)
+        self.qconfig = torch.ao.quantization.get_default_qconfig(qengine)
         self.fc1 = torch.nn.Linear(5, 5).to(dtype=torch.float)
 
     def forward(self, x):
@@ -1125,7 +1152,7 @@ def forward(self, x):
 class LSTMwithHiddenDynamicModel(torch.nn.Module):
     def __init__(self, qengine='fbgemm'):
         super().__init__()
-        self.qconfig = torch.quantization.get_default_qconfig(qengine)
+        self.qconfig = torch.ao.quantization.get_default_qconfig(qengine)
         self.lstm = torch.nn.LSTM(2, 2).to(dtype=torch.float)
 
     def forward(self, x, hid):
@@ -1159,7 +1186,7 @@ def get_example_inputs(self) -> Tuple[Any, ...]:
 class AnnotatedConvModel(torch.nn.Module):
     def __init__(self, qengine):
         super().__init__()
-        self.qconfig = torch.quantization.get_default_qconfig(qengine)
+        self.qconfig = torch.ao.quantization.get_default_qconfig(qengine)
         self.conv = torch.nn.Conv2d(3, 5, 3, bias=False).to(dtype=torch.float)
         self.quant = QuantStub()
         self.dequant = DeQuantStub()
@@ -1176,7 +1203,7 @@ def get_example_inputs(self) -> Tuple[Any, ...]:
 class AnnotatedConvTransposeModel(torch.nn.Module):
     def __init__(self, qengine):
         super().__init__()
-        self.qconfig = torch.quantization.get_default_qconfig(qengine)
+        self.qconfig = torch.ao.quantization.get_default_qconfig(qengine)
         self.conv = torch.nn.ConvTranspose2d(3, 5, 3, bias=False).to(dtype=torch.float)
         self.quant = QuantStub()
         self.dequant = DeQuantStub()
@@ -1241,8 +1268,8 @@ def get_example_inputs(self) -> Tuple[Any, ...]:
 
 class AnnotatedConvBnReLUModel(torch.nn.Module):
     def __init__(self, qengine='fbgemm'):
-        super(AnnotatedConvBnReLUModel, self).__init__()
-        self.qconfig = torch.quantization.get_default_qconfig(qengine)
+        super().__init__()
+        self.qconfig = torch.ao.quantization.get_default_qconfig(qengine)
         self.conv = torch.nn.Conv2d(3, 5, 3, bias=False).to(dtype=torch.float)
         self.bn = torch.nn.BatchNorm2d(5).to(dtype=torch.float)
         self.relu = nn.ReLU(inplace=True)
@@ -1260,9 +1287,9 @@ def forward(self, x):
     def fuse_model(self):
         # TODO: remove this check and define two fuse_modules function on this module
         if self.training:
-            torch.quantization.fuse_modules_qat(self, [['conv', 'bn', 'relu']], inplace=True)
+            torch.ao.quantization.fuse_modules_qat(self, [['conv', 'bn', 'relu']], inplace=True)
         else:
-            torch.quantization.fuse_modules(self, [['conv', 'bn', 'relu']], inplace=True)
+            torch.ao.quantization.fuse_modules(self, [['conv', 'bn', 'relu']], inplace=True)
 
     def get_example_inputs(self) -> Tuple[Any, ...]:
         return (torch.rand(1, 3, 5, 5),)
@@ -1297,7 +1324,7 @@ def get_example_inputs(self) -> Tuple[Any, ...]:
 
 class LinearModelWithSubmodule(nn.Module):
     def __init__(self):
-        super(LinearModelWithSubmodule, self).__init__()
+        super().__init__()
         self.subm = TwoLayerLinearModel()
         self.fc = nn.Linear(5, 5)
 
@@ -1314,7 +1341,7 @@ def __init__(self):
         super().__init__()
         self.fc1 = torch.nn.Linear(5, 8).to(dtype=torch.float)
         self.fc2 = QuantWrapper(torch.nn.Linear(8, 5).to(dtype=torch.float))
-        self.fc2.qconfig = torch.quantization.get_default_qconfig("fbgemm")
+        self.fc2.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
 
     def forward(self, x):
         x = self.fc1(x)
@@ -1327,11 +1354,11 @@ def get_example_inputs(self) -> Tuple[Any, ...]:
 class ActivationsTestModel(torch.nn.Module):
     def __init__(self):
         super().__init__()
-        self.qconfig = torch.quantization.get_default_qconfig("fbgemm")
-        self.quant = torch.quantization.QuantStub()
+        self.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+        self.quant = torch.ao.quantization.QuantStub()
         self.hardswish = torch.nn.Hardswish().to(dtype=torch.float)
         self.elu = torch.nn.ELU().to(dtype=torch.float)
-        self.dequant = torch.quantization.DeQuantStub()
+        self.dequant = torch.ao.quantization.DeQuantStub()
 
     def forward(self, x):
         x = self.quant(x)
@@ -1420,6 +1447,66 @@ def forward(self, x):
     def get_example_inputs(self) -> Tuple[Any, ...]:
         return (torch.rand(1, 5),)
 
+class ConvBnAddReluModel(torch.nn.Module):
+    def __init__(self,
+                 with_bn=True,
+                 with_relu=True,
+                 left_conv=True,
+                 two_conv=True,
+                 use_torch_add=True):
+        super().__init__()
+        self.conv = nn.Conv2d(5, 5, (2, 2))
+        self.conv2 = nn.Conv2d(5, 5, (2, 2))
+        self.bn = nn.BatchNorm2d(5)
+        self.relu = nn.ReLU()
+        self.with_bn = with_bn
+        self.with_relu = with_relu
+        self.two_conv = two_conv
+        self.left_conv = left_conv
+        self.use_torch_add = use_torch_add
+
+    def forward(self, x1, x2):
+        if self.two_conv:
+            if self.use_torch_add:
+                if self.with_bn:
+                    x = torch.add(self.bn(self.conv(x1)), self.conv2(x1))
+                else:
+                    x = torch.add(self.conv(x1), self.conv2(x1))
+            else:
+                if self.with_bn:
+                    x = self.bn(self.conv(x1)) + self.conv2(x1)
+                else:
+                    x = self.conv(x1) + self.conv2(x1)
+        else:
+            if self.use_torch_add:
+                if self.left_conv:
+                    if self.with_bn:
+                        x = torch.add(self.bn(self.conv(x1)), x2)
+                    else:
+                        x = torch.add(self.conv(x1), x2)
+                else:
+                    if self.with_bn:
+                        x = torch.add(x2, self.bn(self.conv(x1)))
+                    else:
+                        x = torch.add(x2, self.conv(x1))
+            else:
+                if self.left_conv:
+                    if self.with_bn:
+                        x = self.bn(self.conv(x1)) + x2
+                    else:
+                        x = self.conv(x1) + x2
+                else:
+                    if self.with_bn:
+                        x = x2 + self.bn(self.conv(x1))
+                    else:
+                        x = x2 + self.conv(x1)
+        if self.with_relu:
+            x = self.relu(x)
+        return x
+
+    def get_example_inputs(self) -> Tuple[Any, ...]:
+        return (torch.rand(1, 5, 3, 3), torch.rand(1, 5, 2, 2))
+
 # TODO: self.fc should be self.conv
 class ConvReluModel(torch.nn.Module):
     def __init__(self):
@@ -1473,7 +1560,7 @@ def get_example_inputs(self) -> Tuple[Any, ...]:
 class NormalizationTestModel(torch.nn.Module):
     def __init__(self):
         super().__init__()
-        self.quant = torch.quantization.QuantStub()
+        self.quant = torch.ao.quantization.QuantStub()
         self.fc1 = torch.nn.Linear(5, 8).to(dtype=torch.float)
         self.layer_norm = torch.nn.LayerNorm((8))
         self.group_norm = torch.nn.GroupNorm(2, 8)
@@ -1780,7 +1867,7 @@ class AnnotatedSkipQuantModel(torch.nn.Module):
     """
     def __init__(self, qengine):
         super().__init__()
-        self.qconfig = torch.quantization.get_default_qconfig(qengine)
+        self.qconfig = torch.ao.quantization.get_default_qconfig(qengine)
         self.sub = QuantWrapper(InnerModule())
         self.fc = torch.nn.Linear(5, 5).to(dtype=torch.float)
         # don't quantize this fc
@@ -1797,7 +1884,7 @@ class QuantStubModel(torch.nn.Module):
     """
     def __init__(self):
         super().__init__()
-        self.qconfig = torch.quantization.get_default_qconfig("qnnpack")
+        self.qconfig = torch.ao.quantization.get_default_qconfig("qnnpack")
         self.quant = QuantStub()
         self.dequant = DeQuantStub()
         self.fc = torch.nn.Linear(5, 5).to(dtype=torch.float)
@@ -1812,7 +1899,7 @@ class ManualLinearQATModel(torch.nn.Module):
     """
     def __init__(self, qengine):
         super().__init__()
-        self.qconfig = torch.quantization.get_default_qat_qconfig(qengine)
+        self.qconfig = torch.ao.quantization.get_default_qat_qconfig(qengine)
         self.quant = QuantStub()
         self.dequant = DeQuantStub()
         self.fc1 = torch.nn.Linear(5, 1).to(dtype=torch.float)
@@ -1829,7 +1916,7 @@ class ManualDropoutQATModel(torch.nn.Module):
     """
     def __init__(self, qengine):
         super().__init__()
-        self.qconfig = torch.quantization.get_default_qat_qconfig(qengine)
+        self.qconfig = torch.ao.quantization.get_default_qat_qconfig(qengine)
         self.quant = QuantStub()
         self.dequant = DeQuantStub()
         self.fc1 = torch.nn.Linear(5, 1).to(dtype=torch.float)
@@ -1861,7 +1948,7 @@ class ManualConvLinearQATModel(torch.nn.Module):
     """
     def __init__(self, qconfig=None):
         super().__init__()
-        self.qconfig = qconfig if qconfig else torch.quantization.get_default_qat_qconfig("qnnpack")
+        self.qconfig = qconfig if qconfig else torch.ao.quantization.get_default_qat_qconfig("qnnpack")
         self.quant = QuantStub()
         self.dequant = DeQuantStub()
         self.conv = torch.nn.Conv2d(3, 1, kernel_size=3).to(dtype=torch.float)
@@ -1885,7 +1972,7 @@ def __init__(self):
 
 class ManualEmbeddingBagLinear(nn.Module):
     def __init__(self):
-        super(ManualEmbeddingBagLinear, self).__init__()
+        super().__init__()
         self.emb = nn.EmbeddingBag(num_embeddings=10, embedding_dim=12, mode='sum')
         self.emb.qconfig = default_embedding_qat_qconfig
         self.quant = QuantStub()
@@ -2244,7 +2331,7 @@ def forward(self, indices, offsets, linear_in):
 class DenseTopMLP(nn.Module):
 
     def __init__(self, dense_dim, dense_out, embedding_dim, top_out_in, top_out_out) -> None:
-        super(DenseTopMLP, self).__init__()
+        super().__init__()
 
         self.dense_mlp = nn.Sequential(
             nn.Linear(dense_dim, dense_out),
@@ -2285,7 +2372,7 @@ class SparseNNModel(nn.Module):
     _TOP_MLP_DIM = 1
 
     def __init__(self) -> None:
-        super(SparseNNModel, self).__init__()
+        super().__init__()
 
         self.model_sparse = EmbBagWrapper(self._NUM_EMBEDDINGS, self._EMBEDDING_DIM)
         self.dense_top = DenseTopMLP(
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index f6161990ce13..7164e9616307 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -65,7 +65,6 @@
 import torch.cuda
 from torch import Tensor
 from torch._C import ScriptDict, ScriptList  # type: ignore[attr-defined]
-from torch._six import string_classes
 from torch._utils_internal import get_writable_path
 from torch.nn import (
     ModuleDict,
@@ -81,14 +80,12 @@
 from torch.testing import make_tensor
 from torch.testing._comparison import (
     BooleanPair,
-    ErrorMeta,
     NonePair,
     NumberPair,
     Pair,
     TensorLikePair,
-    UnsupportedInputs,
 )
-from torch.testing._comparison import assert_equal as assert_equal
+from torch.testing._comparison import not_close_error_metas
 from torch.testing._internal.common_dtype import get_all_dtypes
 import torch.utils._pytree as pytree
 
@@ -127,8 +124,20 @@
 
 NATIVE_DEVICES = ('cpu', 'cuda', 'meta')
 
+check_names = ['orin', 'concord', 'galen', 'xavier', 'nano', 'jetson', 'tegra']
+IS_JETSON = any(name in platform.platform() for name in check_names)
 
-class _TestParametrizer(object):
+def gcIfJetson(fn):
+    # Irregular Jetson host/device memory setup requires cleanup to avoid tests being killed
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        if IS_JETSON:
+            gc.collect()
+            torch.cuda.empty_cache()
+        fn(*args, **kwargs)
+    return wrapper
+
+class _TestParametrizer:
     """
     Decorator class for parametrizing a test function, yielding a set of new tests spawned
     from the original generic test, each specialized for a specific set of test inputs. For
@@ -266,7 +275,7 @@ def instantiated_test(self, param_kwargs=param_kwargs):
     return generic_cls
 
 
-class subtest(object):
+class subtest:
     """
     Explicit subtest case for use with test parametrization.
     Allows for explicit naming of individual subtest cases as well as applying
@@ -503,9 +512,9 @@ def _get_test_report_path():
                     help='whether to run each test in a subprocess')
 parser.add_argument('--seed', type=int, default=1234)
 parser.add_argument('--accept', action='store_true')
-parser.add_argument('--jit_executor', type=str)
+parser.add_argument('--jit-executor', '--jit_executor', type=str)
 parser.add_argument('--repeat', type=int, default=1)
-parser.add_argument('--test_bailouts', action='store_true')
+parser.add_argument('--test-bailouts', '--test_bailouts', action='store_true')
 parser.add_argument('--use-pytest', action='store_true')
 parser.add_argument('--save-xml', nargs='?', type=str,
                     const=_get_test_report_path(),
@@ -591,7 +600,7 @@ def shell(command, cwd=None, env=None, stdout=None, stderr=None):
     #      `p.wait()` in a `final` block for the code to be portable.
     #
     # https://github.com/python/cpython/blob/71b6c1af727fbe13525fb734568057d78cea33f3/Lib/subprocess.py#L309-L323
-    assert not isinstance(command, torch._six.string_classes), "Command to shell should be a list or tuple of tokens"
+    assert not isinstance(command, str), "Command to shell should be a list or tuple of tokens"
     p = subprocess.Popen(command, universal_newlines=True, cwd=cwd, env=env, stdout=stdout, stderr=stderr)
     return wait_for_process(p)
 
@@ -739,14 +748,16 @@ def run_tests(argv=UNITTEST_ARGS):
             failed |= wait_for_process(p) != 0
         assert not failed, "Some test shards have failed"
     elif USE_PYTEST:
+        pytest_args = argv
         if TEST_SAVE_XML:
             test_report_path = get_report_path(pytest=True)
             print(f'Test results will be stored in {test_report_path}')
+            pytest_args = pytest_args + [f'--junit-xml-reruns={test_report_path}']
 
         import pytest
         os.environ["NO_COLOR"] = "1"
         os.environ["USING_PYTEST"] = "1"
-        exit_code = pytest.main(args=argv + [f'--junit-xml-reruns={test_report_path}'] if TEST_SAVE_XML else [])
+        exit_code = pytest.main(args=pytest_args)
         del os.environ["USING_PYTEST"]
         if TEST_SAVE_XML:
             sanitize_pytest_xml(test_report_path)
@@ -898,9 +909,6 @@ def _check_module_exists(name: str) -> bool:
 TEST_WITH_UBSAN = os.getenv('PYTORCH_TEST_WITH_UBSAN', '0') == '1'
 TEST_WITH_ROCM = os.getenv('PYTORCH_TEST_WITH_ROCM', '0') == '1'
 
-# TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
-# See #64427
-TEST_WITH_MIOPEN_SUGGEST_NHWC = os.getenv('PYTORCH_MIOPEN_SUGGEST_NHWC', '0') == '1'
 # Enables tests that are slow to run (disabled by default)
 TEST_WITH_SLOW = os.getenv('PYTORCH_TEST_WITH_SLOW', '0') == '1'
 
@@ -1092,6 +1100,13 @@ def wrap_fn(self, *args, **kwargs):
         return wrap_fn
     return dec_fn
 
+# Temporary function to simplify adding support to 3.11
+def xfailIfPython311(fn):
+    if sys.version_info < (3, 11):
+        return fn
+    else:
+        return unittest.expectedFailure(fn)
+
 def skipIfNotMiopenSuggestNHWC(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
@@ -1740,7 +1755,7 @@ def check_if_enable(test: unittest.TestCase):
 
 # `TestCase.assertEqual` is very permissive and coerced the inputs into a format that could be compared. This is very
 # convenient when writing tests, but not so much while reviewing them. By default, the comparison `Pair` framework of
-# `torch.testing._comparison.assert_equal`, used for example by the public testing function
+# `torch.testing._comparison.are_equal`, used for example by the public testing function
 # `torch.testing.assert_close`, is more strict. In order to use the same framework and thus reduce the divergence
 # between internal and external comparison logic as much as possible, we define some "relaxed" pairs here. They only
 # change the supported inputs, but the comparison logic is the same.
@@ -1763,7 +1778,7 @@ def _process_inputs(self, actual, expected, *, id):
             (isinstance(actual, self._supported_types) and isinstance(expected, other_supported_types))
             or (isinstance(expected, self._supported_types) and isinstance(actual, other_supported_types))
         ):
-            raise UnsupportedInputs()
+            self._inputs_not_supported()
 
         return [self._to_bool(input, id=id) for input in (actual, expected)]
 
@@ -1775,11 +1790,11 @@ def _to_bool(self, bool_like, *, id):
         elif isinstance(bool_like, (torch.Tensor, np.ndarray)):
             numel = bool_like.numel() if isinstance(bool_like, torch.Tensor) else bool_like.size
             if numel > 1:
-                raise ErrorMeta(
+                self._fail(
                     ValueError,
                     f"Only single element tensor-likes can be compared against a boolean. "
                     f"Got {numel} elements instead.",
-                    id=id,
+                    id=id
                 )
 
             return bool(bool_like.item())
@@ -1820,7 +1835,7 @@ def _process_inputs(self, actual, expected, *, id):
                 (isinstance(actual, self._supported_types) and isinstance(expected, other_supported_types))
                 or (isinstance(expected, self._supported_types) and isinstance(actual, other_supported_types))
         ):
-            raise UnsupportedInputs()
+            self._inputs_not_supported()
 
         return [self._to_number(input, id=id) for input in (actual, expected)]
 
@@ -1828,11 +1843,11 @@ def _to_number(self, number_like, *, id):
         if isinstance(number_like, (torch.Tensor, np.ndarray)):
             numel = number_like.numel() if isinstance(number_like, torch.Tensor) else number_like.size
             if numel > 1:
-                raise ErrorMeta(
+                self._fail(
                     ValueError,
                     f"Only single element tensor-likes can be compared against a number. "
                     f"Got {numel} elements instead.",
-                    id=id,
+                    id=id
                 )
             number = number_like.item()
             if isinstance(number, bool):
@@ -1896,7 +1911,7 @@ class UnittestPair(Pair):
     """Fallback ABC pair that handles non-numeric inputs.
 
     To avoid recreating the mismatch messages of :meth:`unittest.TestCase.assertEqual`, this pair simply wraps it in
-    order to use it with the :class:`Pair` "framework" from :func:`assert_equal`.
+    order to use it with the :class:`Pair` "framework" from :func:`are_equal`.
 
     Define the :attr:`UnittestPair.CLS` in a subclass to indicate which class(es) of the inputs the pair should support.
     """
@@ -1916,11 +1931,11 @@ def compare(self):
             msg = str(error)
 
         type_name = self.TYPE_NAME or (self.CLS if isinstance(self.CLS, type) else self.CLS[0]).__name__
-        raise self._make_error_meta(AssertionError, f"{type_name.title()} comparison failed: {msg}")
+        self._fail(AssertionError, f"{type_name.title()} comparison failed: {msg}")
 
 
 class StringPair(UnittestPair):
-    CLS = string_classes
+    CLS = str
     TYPE_NAME = "string"
 
 
@@ -2127,18 +2142,15 @@ def _run_with_retry(self, result=None, num_runs_left=0, report_only=True, num_re
             errors_before = 0 if result is None else len(result.errors)
             skipped_before = 0 if result is None else len(result.skipped)
 
-        if TEST_WITH_TORCHDYNAMO:
+        super_run = super().run
+        # TODO remove version check once dynamo supports 3.11
+        if TEST_WITH_TORCHINDUCTOR and sys.version_info < (3, 11):
+            super_run = torch._dynamo.optimize("inductor")(super_run)
+        elif TEST_WITH_TORCHDYNAMO and sys.version_info < (3, 11):
             # TorchDynamo optimize annotation
-            if TEST_WITH_TORCHINDUCTOR:
-                super_run = torch._dynamo.optimize("inductor")(super().run)
-            else:
-                super_run = torch._dynamo.optimize("eager")(super().run)
-            super_run(result=result)
+            super_run = torch._dynamo.optimize("eager")(super_run)
 
-            # TODO - Reset for each test slows down testing significantly.
-            # torch._dynamo.reset()
-        else:
-            super().run(result=result)
+        super_run(result=result)
 
         # Early terminate test if necessary.
         if self._should_stop_test_suite():
@@ -2357,15 +2369,7 @@ def sawteeth(n, m):
             q, r = divmod(nnz - n * n_cols - m * (n_rows - n),
                           (n_cols - m) * (n_cols - m + 1) // 2)
             p = 1 + q * (n_cols - m + 1)
-            if sys.version_info >= (3, 8):
-                k = math.isqrt(2 * r)
-            else:
-                # math.isqrt(x) is available starting from Python 3.8.
-                # Here we use int(math.sqrt(x)) as an approximation
-                # that appers to give exaxt result for all x values
-                # less than 2**35, at least, the upper limit of x is
-                # TBD.
-                k = int(math.sqrt(2 * r))
+            k = math.isqrt(2 * r)
             if k * (k + 1) > 2 * r:
                 k -= 1
             corr = r - k * (k + 1) // 2
@@ -2917,7 +2921,7 @@ def to_list(input):
             x = to_list(x)
             y = to_list(y)
         # When comparing a sequence of numbers to a tensor, we need to convert the sequence to a tensor here.
-        # Otherwise, the pair origination of `assert_equal` will fail, because the sequence is recognized as container
+        # Otherwise, the pair origination of `are_equal` will fail, because the sequence is recognized as container
         # that should be checked elementwise while the tensor is not.
         elif isinstance(x, torch.Tensor) and isinstance(y, Sequence):
             y = torch.as_tensor(y, dtype=x.dtype, device=x.device)
@@ -2931,7 +2935,7 @@ def to_list(input):
         if isinstance(y, torch.Tensor) and y.is_nested:
             y = y.unbind()
 
-        assert_equal(
+        error_metas = not_close_error_metas(
             x,
             y,
             pair_types=(
@@ -2964,12 +2968,17 @@ def to_list(input):
             check_layout=exact_layout,
             check_stride=exact_stride,
             check_is_coalesced=exact_is_coalesced,
-            # This emulates unittest.TestCase's behavior if a custom message passed and
-            # TestCase.longMessage (https://docs.python.org/3/library/unittest.html#unittest.TestCase.longMessage)
-            # is True (default)
-            msg=(lambda generated_msg: f"{generated_msg}\n{msg}") if isinstance(msg, str) and self.longMessage else msg,
         )
 
+        if error_metas:
+            # TODO: compose all metas into one AssertionError
+            raise error_metas[0].to_error(
+                # This emulates unittest.TestCase's behavior if a custom message passed and
+                # TestCase.longMessage (https://docs.python.org/3/library/unittest.html#unittest.TestCase.longMessage)
+                # is True (default)
+                (lambda generated_msg: f"{generated_msg}\n{msg}") if isinstance(msg, str) and self.longMessage else msg
+            )
+
     def assertNotEqual(self, x, y, msg: Optional[str] = None, *,                                       # type: ignore[override]
                        atol: Optional[float] = None, rtol: Optional[float] = None, **kwargs) -> None:
         with self.assertRaises(AssertionError, msg=msg):
@@ -3612,8 +3621,8 @@ def random_sparse_pd_matrix(matrix_size, density=0.01, **kwargs):
     torch = kwargs.get('torch', globals()['torch'])
     dtype = kwargs.get('dtype', torch.double)
     device = kwargs.get('device', 'cpu')
-    data = dict([((i, i), float(i + 1) / matrix_size)
-                 for i in range(matrix_size)])
+    data = {(i, i): float(i + 1) / matrix_size
+            for i in range(matrix_size)}
 
 
     def multiply(data, N, i, j, cs, sn, left=True):
diff --git a/torch/testing/_internal/composite_compliance.py b/torch/testing/_internal/composite_compliance.py
index 5d7de4e2328a..26f2984ec1ac 100644
--- a/torch/testing/_internal/composite_compliance.py
+++ b/torch/testing/_internal/composite_compliance.py
@@ -220,7 +220,7 @@ def wrap(e):
                     # 4. we set the storage (and sizes/strides/offset) of the wrapper
                     #    tensor results to be that of the tensors that alias the input
                     result = func(*args, **kwargs)
-                    if isinstance(result, tuple) or isinstance(result, list):
+                    if isinstance(result, (tuple, list)):
                         for a, b in zip(rs, result):
                             a.set_(b)
                     else:
@@ -507,7 +507,7 @@ def maybe_tangent(t):
         if isinstance(t, torch.Tensor) and t.requires_grad:
             return torch.randn_like(t)
         elif is_tensorlist(t):
-            return list(torch.randn_like(e) if e.requires_grad else None for e in t)
+            return [torch.randn_like(e) if e.requires_grad else None for e in t]
         return None
 
     tangent_args = tuple(maybe_tangent(arg) for arg in args)
diff --git a/torch/testing/_internal/distributed/_shard/sharded_tensor/_test_st_common.py b/torch/testing/_internal/distributed/_shard/sharded_tensor/_test_st_common.py
index 4352817476f6..58ce3c996fa0 100644
--- a/torch/testing/_internal/distributed/_shard/sharded_tensor/_test_st_common.py
+++ b/torch/testing/_internal/distributed/_shard/sharded_tensor/_test_st_common.py
@@ -36,7 +36,7 @@ def __init__(
         group=None,
         init_rrefs=True
     ) -> None:
-        super(MyShardedModel2, self).__init__()
+        super().__init__()
         if spec is not None:
             self.sharded_tensor2 = sharded_tensor.rand(
                 spec, 10, 20, process_group=group, init_rrefs=init_rrefs
@@ -53,7 +53,7 @@ def __init__(
         group=None,
         init_rrefs=True
     ) -> None:
-        super(MyShardedModel1, self).__init__()
+        super().__init__()
         if spec is not None:
             self.sharded_tensor1 = sharded_tensor.rand(
                 spec, 10, 20, process_group=group, init_rrefs=init_rrefs
diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
index 3ebffbaa0324..bbed3e70f1e3 100644
--- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py
+++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
@@ -12,7 +12,6 @@
     Iterator,
     Tuple,
     Dict,
-    Optional,
     List,
     Sequence,
     TypeVar,
@@ -40,7 +39,7 @@
 from torch.distributed._tensor.api import DTensor
 from torch.distributed._tensor.placement_types import Placement
 
-DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"
+DEVICE_TYPE = "cuda" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else "cpu"
 NUM_DEVICES = 4
 
 # We use this as a proxy for "multiple GPUs exist"
@@ -145,15 +144,10 @@ def _test_op(self, mesh: DeviceMesh, op_call, *args, **kwargs) -> None:
                 )
 
 
+TestFunc = Callable[[object], object]
+
 # wrapper to initialize comms (processgroup)
-def with_comms(
-    func: Optional[  # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
-        Callable
-    ] = None,
-    backend: Optional[str] = None,
-) -> Optional[  # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
-    Callable
-]:
+def with_comms(func: TestFunc) -> TestFunc:
     assert func is not None
 
     @wraps(func)  # pyre-ignore[6]
@@ -161,13 +155,17 @@ def wrapper(
         self, *args: Tuple[object], **kwargs: Dict[str, Any]  # type: ignore[misc]
     ) -> None:
         # if backend not specified, and cuda available, then use nccl, else gloo
+        if torch.cuda.is_available() and torch.cuda.device_count() >= self.world_size:
+            self.device_type = "cuda"
+        else:
+            self.device_type = "cpu"
+
         pg_backend = (
-            "nccl" if backend is None and torch.cuda.is_available() else "gloo"
+            "nccl" if self.device_type == "cuda" else "gloo"
         )
         if pg_backend == "nccl" and torch.cuda.device_count() < self.world_size:
             sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
 
-        self.device_type = "cuda" if pg_backend == "nccl" else "cpu"
         self.init_pg(backend=pg_backend)
         func(self)  # type: ignore[misc]
         self.destroy_pg()
@@ -193,7 +191,7 @@ def setUp(self) -> None:
 
 
 # This is a class for converting args/kwargs of an op into distributed args/kwargs
-class DTensorConverter(object):
+class DTensorConverter:
     def __init__(
         self,
         mesh: DeviceMesh,
@@ -328,7 +326,9 @@ def to_dist_tensor(
                         mesh,
                         placements,
                         size=t.size(),
+                        dtype=torch.bool,
                         requires_grad=t.requires_grad,
+                        stride=t.stride()
                     )
                 else:
                     r = distribute_tensor(t, mesh, placements)
diff --git a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py
index fb12f2e23283..7674be33a3a6 100644
--- a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py
+++ b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py
@@ -92,7 +92,7 @@ def _remote_method_async(method, rref, *args, **kwargs):
 class RemoteEM(nn.Module):
     def __init__(self, num_embeddings: int, embedding_dim: int):
         gLogger.info(f"Initing RemoteEM with {num_embeddings} {embedding_dim}")
-        super(RemoteEM, self).__init__()
+        super().__init__()
         init_em = [0.5] * embedding_dim
         self.em = nn.EmbeddingBag(
             num_embeddings,
@@ -118,7 +118,7 @@ def getLinear(d_in, d_out):
 class RemoteNet(nn.Module):
     def __init__(self, d_in: int, d_out: int):
         gLogger.info(f"Initing RemoteNet with {d_in} {d_out}")
-        super(RemoteNet, self).__init__()
+        super().__init__()
         self.fc = getLinear(d_in, d_out)
         self.relu = nn.ReLU()
 
@@ -134,7 +134,7 @@ def __init__(
         remote_net_rref: rpc.RRef,
         process_group_for_ddp: dist.ProcessGroup = None,
     ):
-        super(HybridModel, self).__init__()
+        super().__init__()
         self.remote_em_rref = remote_em_rref
         self.remote_net_rref = remote_net_rref
         self.fc1 = getLinear(D_DENSE, D_DENSE)
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 8a420db32b84..98e6f15ff7ca 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -224,7 +224,7 @@ class DDPUnevenTestInput(NamedTuple):
 
 class _FC2(nn.Module):
     def __init__(self):
-        super(_FC2, self).__init__()
+        super().__init__()
         self.fc = nn.Linear(10, 50, bias=True)
         self.fc.bias.requires_grad = False
 
@@ -235,7 +235,7 @@ def forward(self, x):
 
 class Net(nn.Module):
     def __init__(self):
-        super(Net, self).__init__()
+        super().__init__()
         self.fc1 = nn.Linear(2, 10, bias=False)
         self.fc2 = _FC2()
         self.fc3 = nn.Linear(50, 4, bias=False)
@@ -253,7 +253,7 @@ def forward(self, x):
 
 class LargeNet(nn.Module):
     def __init__(self):
-        super(LargeNet, self).__init__()
+        super().__init__()
         self.fc1 = nn.Linear(1000, 2000, bias=False)
         self.fc2 = nn.Linear(2000, 500, bias=False)
 
@@ -274,7 +274,7 @@ def forward(self, x):
 
 class BatchNormNet(nn.Module):
     def __init__(self, affine=True):
-        super(BatchNormNet, self).__init__()
+        super().__init__()
         self.fc1 = nn.Linear(2, 40, bias=False)
         self.bn = nn.BatchNorm1d(4, affine=affine)
         self.fc2 = nn.Linear(40, 4, bias=False)
@@ -346,7 +346,7 @@ def forward(self, x):
 
 class ControlFlowToyModel(nn.Module):
     def __init__(self):
-        super(ControlFlowToyModel, self).__init__()
+        super().__init__()
         self.lin1 = nn.Linear(10, 10, bias=False)
         self.lin2 = nn.Linear(10, 10, bias=False)
 
@@ -481,7 +481,7 @@ def _create_torch_profiler():
 
 
 
-class Barrier(object):
+class Barrier:
     barrier_id = 0
 
     @classmethod
@@ -547,10 +547,6 @@ def init_method(self):
 
     @classmethod
     def _run(cls, rank, test_name, file_name, pipe):
-        # Enable DDP + ReplicatedTensor
-        from torch.nn.parallel._replicated_tensor_ddp_utils import _set_ddp_with_replicated_tensor
-        _set_ddp_with_replicated_tensor(True)
-
         if BACKEND == "nccl" and not torch.cuda.is_available():
             sys.exit(TEST_SKIPS["no_cuda"].exit_code)
         self = cls(test_name)
@@ -4225,7 +4221,7 @@ def test_DistributedDataParallel_requires_grad(self):
         def test_ddp_zero_output_features(self):
             class ToyModel(nn.Module):
                 def __init__(self):
-                    super(ToyModel, self).__init__()
+                    super().__init__()
                     self.net1 = nn.Linear(10, 10)
                     self.relu = nn.ReLU()
                     self.net2 = nn.Linear(10, 0)
@@ -5822,12 +5818,7 @@ def parse_env(var):
             params = list(model_DDP.parameters())
             num_params = 0
             param_size = 0
-            params = list(
-                parameter
-                for parameter in filter(
-                    lambda parameter: parameter.requires_grad, params
-                )
-            )
+            params = list(filter(lambda parameter: parameter.requires_grad, params))
             for p in params:
                 num_params += 1
                 param_size += p.numel() * p.element_size()
@@ -6665,7 +6656,7 @@ def _run_uneven_inputs_test(
                 dist.all_gather(tensor_list, final_rank_tensor)
                 max_rank = dist.get_world_size() - 1
                 self.assertSetEqual(
-                    {max_rank}, set(tensor.item() for tensor in tensor_list)
+                    {max_rank}, {tensor.item() for tensor in tensor_list}
                 )
                 # Ensure that all models are the same across ranks after all have joined.
                 self.validate_net_equivalence(net)
@@ -7094,7 +7085,7 @@ def _test_ddp_ignore_params_arg(self, static_graph=False):
             class TestModel(nn.Module):
                 def __init__(self, rank):
                     self.rank = rank
-                    super(TestModel, self).__init__()
+                    super().__init__()
                     self.fc1 = nn.Linear(1, 1, bias=False)
                     # Proxy that will be materialized to another architecture later.
                     # (after wrapping model with DDP)
@@ -7149,8 +7140,6 @@ def forward(self, x):
                 # Materialize new params. These are not registered in DDP and thus
                 # don't have autograd hooks installed on them.
                 ddp.module.fc2 = nn.Linear(1, 1, bias=False).to(device_id)
-                # Rebuild replicated_module to pick up the changes.
-                ddp._build_replicated_tensor_module()
 
                 # local model with the new materialized parameters.
                 local_model = copy.deepcopy(ddp.module).cuda(self.rank)
@@ -7195,7 +7184,7 @@ def test_ddp_ignore_params_arg(self):
         def test_ddp_unused_params_rebuild_buckets_exception(self):
             class ToyModel(nn.Module):
                 def __init__(self):
-                    super(ToyModel, self).__init__()
+                    super().__init__()
                     self.net1 = nn.Linear(10, 10, bias=False)
                     self.net2 = nn.Linear(10, 10, bias=False)
 
@@ -7250,7 +7239,7 @@ def test_ddp_shared_grad_acc_unused_params(self):
             # even if they share gradient accumulators.
             class ToyModel(nn.Module):
                 def __init__(self):
-                    super(ToyModel, self).__init__()
+                    super().__init__()
                     # net1, bias, and net1.bias are all unused params.
                     self.net1 = nn.Linear(10, 5, bias=False)
                     self.bias = nn.Parameter(torch.zeros(5))
@@ -7298,7 +7287,7 @@ def __init__(self, t):
 
             def tuple_and_list_validator(x):
                 self.assertTrue(len(x), expected_len)
-                self.assertEqual(1, len(set(t.device for t in x)))
+                self.assertEqual(1, len({t.device for t in x}))
                 self.assertEqual(x[0].device.index, self.rank)
                 return x[0] + x[1]
 
@@ -7317,7 +7306,7 @@ def custom_type_validator(x):
             def dict_validator(x):
                 self.assertTrue(EXPECTED_FIELDS[0] in x.keys())
                 self.assertTrue(EXPECTED_FIELDS[1] in x.keys())
-                self.assertEqual(1, len(set(t.device for t in x.values())))
+                self.assertEqual(1, len({t.device for t in x.values()}))
                 self.assertEqual(x[EXPECTED_FIELDS[0]].device.index, self.rank)
                 return x[EXPECTED_FIELDS[0]] + x[EXPECTED_FIELDS[1]]
 
@@ -7564,7 +7553,7 @@ def test_ddp_control_flow_different_across_ranks(self):
 
             class ToyModel(nn.Module):
                 def __init__(self, rank):
-                    super(ToyModel, self).__init__()
+                    super().__init__()
                     self.lin1 = nn.Linear(10, 10, bias=False)
                     self.lin2 = nn.Linear(10, 10, bias=False)
                     self.rank = rank
@@ -8070,7 +8059,7 @@ def _test_different_graph_across_ranks(
         ):
             class ToyModel(nn.Module):
                 def __init__(self, rank):
-                    super(ToyModel, self).__init__()
+                    super().__init__()
                     self.lin1 = nn.Linear(10, 10, bias=False)
                     self.lin2 = nn.Linear(10, 10, bias=False)
                     self.rank = rank
@@ -8183,14 +8172,14 @@ def test_monitored_barrier_gloo_subgroup(self):
         def _test_monitored_barrier_allreduce_hang(self, wait_all_ranks):
             # tests expected behavior when nonzero rank hangs.
             nccl_pg = dist.new_group(
-                ranks=list(i for i in range(int(self.world_size))),
+                ranks=list(range(int(self.world_size))),
                 # provide sufficient timeout so communicators
                 # can be initialized in ctor.
                 timeout=timedelta(seconds=15),
                 backend=dist.Backend.NCCL,
             )
             gloo_pg = dist.new_group(
-                ranks=list(i for i in range(int(self.world_size))),
+                ranks=list(range(int(self.world_size))),
                 backend=dist.Backend.GLOO,
             )
             tensors = [torch.ones(10, device=self.rank) * self.rank]
@@ -8256,7 +8245,7 @@ def test_monitored_barrier_allreduce_hang_wait_all_ranks(self):
         def test_monitored_barrier_gloo_rank_0_timeout(self):
             # tests error when rank 0 exhausts its given timeout.
             process_group = dist.new_group(
-                ranks=list(i for i in range(int(self.world_size)))
+                ranks=list(range(int(self.world_size)))
             )
             timeout = timedelta(seconds=0)
             if self.rank == 0:
@@ -8692,7 +8681,7 @@ def get_loss(model_output):
                 elif isinstance(model_output, dict):
                     for value in model_output.values():
                         loss += get_loss(value)
-                elif isinstance(model_output, tuple) or isinstance(model_output, list):
+                elif isinstance(model_output, (tuple, list)):
                     for x in model_output:
                         loss += get_loss(x)
                 else:
@@ -8778,7 +8767,7 @@ def forward(self, x):
         def test_detect_ddp_is_actually_static(self):
             class ToyModel(nn.Module):
                 def __init__(self):
-                    super(ToyModel, self).__init__()
+                    super().__init__()
                     self.net1 = nn.Linear(10, 10, bias=False)
                     self.net2 = nn.Linear(10, 10)
 
@@ -9108,15 +9097,10 @@ def forward(self, x):
 
             device = self.rank
             module = MockModule().to(device)
-            # Disable DDP + ReplicatedTensor since stateless looks for 'module'
-            # whereas with ReplicatedTensor, we run '_replicated_tensor_module'
-            # in the forward pass.
-            from torch.nn.parallel._replicated_tensor_ddp_utils import _ddp_replicated_tensor
-            with _ddp_replicated_tensor(False):
-                module = torch.nn.parallel.DistributedDataParallel(
-                    module,
-                    device_ids=[device]
-                )
+            module = torch.nn.parallel.DistributedDataParallel(
+                module,
+                device_ids=[device]
+            )
             x = torch.rand((1, 1)).to(device)
             weight = torch.tensor([[1.0]], device=device, requires_grad=True)
             bias = torch.tensor([0.0], device=device, requires_grad=True)
@@ -9151,7 +9135,7 @@ def forward(self, x):
         def test_ddp_forward_backward_hook(self):
             class DummyTestModel(nn.Module):
                 def __init__(self):
-                    super(DummyTestModel, self).__init__()
+                    super().__init__()
                     torch.manual_seed(0)
                     self.fc = nn.Linear(2, 2)
 
diff --git a/torch/testing/_internal/distributed/distributed_utils.py b/torch/testing/_internal/distributed/distributed_utils.py
index 8473077c3c7f..f76533c39e6f 100644
--- a/torch/testing/_internal/distributed/distributed_utils.py
+++ b/torch/testing/_internal/distributed/distributed_utils.py
@@ -11,7 +11,7 @@
 class MockProcessGroup(dist.ProcessGroup):
 
     def __init__(self, rank, world):
-        super(MockProcessGroup, self).__init__(rank, world)
+        super().__init__(rank, world)
 
     def getBackendName(self):
         return "mock_process_group"
diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py
index c0bd6aeca056..c0891034934b 100644
--- a/torch/testing/_internal/distributed/multi_threaded_pg.py
+++ b/torch/testing/_internal/distributed/multi_threaded_pg.py
@@ -1,7 +1,7 @@
 import sys
 import threading
 from dataclasses import dataclass
-from typing import Dict, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import torch
 import torch.distributed as dist
@@ -260,7 +260,7 @@ def reduce_scatter(self, output_tensor, scatter_list, opts=ReduceScatterOptions(
         return res
 
     def __init__(self, rank, world_size):
-        super(ProcessLocalGroup, self).__init__(rank, world_size)
+        super().__init__(rank, world_size)
         self._rank = rank
         self._world_size = world_size
         ProcessLocalGroup._register(self)
@@ -295,15 +295,17 @@ class WorldData:
     pg_map: Dict[dist.ProcessGroup, Tuple[str, Optional[Store]]]
     pg_names: Dict[dist.ProcessGroup, str]
     pg_group_ranks: Dict[dist.ProcessGroup, Dict[int, int]]
+    pg_backend_config: Dict[dist.ProcessGroup, str]
     group_count: int
-
+    tags_to_pg: Dict[str, List[dist.ProcessGroup]]
+    pg_to_tag: Dict[dist.ProcessGroup, str]
 
 class ThreadLocalWorld:
     _world = threading.local()
 
     def _get_world(self) -> WorldData:
         if not hasattr(ThreadLocalWorld._world, "world"):
-            ThreadLocalWorld._world.world = WorldData(None, {}, {}, {}, 0)
+            ThreadLocalWorld._world.world = WorldData(None, {}, {}, {}, {}, 0, {}, {})
         return ThreadLocalWorld._world.world
 
     @property
@@ -326,6 +328,10 @@ def pg_names(self):
     def pg_group_ranks(self):
         return self._get_world().pg_group_ranks
 
+    @property
+    def pg_backend_config(self):
+        return self._get_world().pg_backend_config
+
     @property
     def group_count(self) -> int:
         return self._get_world().group_count
@@ -334,6 +340,14 @@ def group_count(self) -> int:
     def group_count(self, value):
         self._get_world().group_count = value
 
+    @property
+    def tags_to_pg(self):
+        return self._get_world().tags_to_pg
+
+    @property
+    def pg_to_tag(self):
+        return self._get_world().pg_to_tag
+
 
 _old_pg_world = None
 
diff --git a/torch/testing/_internal/distributed/nn/api/remote_module_test.py b/torch/testing/_internal/distributed/nn/api/remote_module_test.py
index 997006353bfb..83736b33b316 100644
--- a/torch/testing/_internal/distributed/nn/api/remote_module_test.py
+++ b/torch/testing/_internal/distributed/nn/api/remote_module_test.py
@@ -604,78 +604,78 @@ def test_invalid_devices(self):
             RuntimeError,
             r"Expected one of .+ device type at start of device string",
         ):
-            list(
+            [
                 m.forward()
                 for m in self._create_remote_module_iter(
                     "{}/foo".format(dst_worker_name),
                     modes=[ModuleCreationMode.MODULE_CTOR],
                 )
-            )
+            ]
 
         with self.assertRaisesRegex(
             RuntimeError, r"CUDA error: invalid device ordinal"
         ):
-            list(
+            [
                 m.forward()
                 for m in self._create_remote_module_iter(
                     "{}/cuda:100".format(dst_worker_name),
                     modes=[ModuleCreationMode.MODULE_CTOR],
                 )
-            )
+            ]
 
         with self.assertRaisesRegex(RuntimeError, r"Invalid device string: 'cpu2'"):
-            list(
+            [
                 m.forward()
                 for m in self._create_remote_module_iter(
                     "{}/cpu2".format(dst_worker_name),
                     modes=[ModuleCreationMode.MODULE_CTOR],
                 )
-            )
+            ]
 
         with self.assertRaisesRegex(RuntimeError, r"Device string must not be empty"):
-            list(
+            [
                 m.forward()
                 for m in self._create_remote_module_iter(
                     "{}/".format(dst_worker_name),
                     modes=[ModuleCreationMode.MODULE_CTOR],
                 )
-            )
+            ]
 
         with self.assertRaisesRegex(
             ValueError,
             r"Could not parse remote_device: worker1/cuda:0/cuda:1. The valid format is '<workername>/<device>'",
         ):
-            list(
+            [
                 m.forward()
                 for m in self._create_remote_module_iter(
                     "{}/cuda:0/cuda:1".format(dst_worker_name),
                     modes=[ModuleCreationMode.MODULE_CTOR],
                 )
-            )
+            ]
 
         with self.assertRaisesRegex(
             ValueError,
             r"Could not parse remote_device: /. The valid format is '<workername>/<device>'",
         ):
-            list(
+            [
                 m.forward()
                 for m in self._create_remote_module_iter(
                     "/",
                     modes=[ModuleCreationMode.MODULE_CTOR],
                 )
-            )
+            ]
 
         with self.assertRaisesRegex(
             ValueError,
             r"Could not parse remote_device: /cuda:0. The valid format is '<workername>/<device>'",
         ):
-            list(
+            [
                 m.forward()
                 for m in self._create_remote_module_iter(
                     "/cuda:0",
                     modes=[ModuleCreationMode.MODULE_CTOR],
                 )
-            )
+            ]
 
     @skip_if_lt_x_gpu(1)
     @dist_utils.dist_init
diff --git a/torch/testing/_internal/distributed/pipe_with_ddp_test.py b/torch/testing/_internal/distributed/pipe_with_ddp_test.py
index d49798cd9d8f..ab782479fb19 100644
--- a/torch/testing/_internal/distributed/pipe_with_ddp_test.py
+++ b/torch/testing/_internal/distributed/pipe_with_ddp_test.py
@@ -90,7 +90,7 @@ def _run_basic_test(self, backend, checkpoint, find_unused_parameters=False, sta
 
         class MyModule(nn.Module):
             def __init__(self, device):
-                super(MyModule, self).__init__()
+                super().__init__()
                 self.fc2 = nn.Linear(8, 4, bias=False).cuda(device)
                 self.fc3 = nn.Linear(4, 2, bias=False).cuda(device)
 
diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
index 5d7831659fc1..1f1b4db5676b 100644
--- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
+++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
@@ -81,7 +81,9 @@ def create_tensor():
 def build_sparse_tensor(coalesce=False, requires_grad=True, dtype=torch.float32):
     i = [[0, 1, 1], [2, 0, 2]]
     v = [3.2, 4.1, 5.3]
-    tensor = torch.sparse_coo_tensor(i, v, (3, 3), requires_grad=requires_grad, dtype=dtype)
+    tensor = torch.sparse_coo_tensor(
+        i, v, (3, 3), requires_grad=requires_grad, dtype=dtype
+    )
     if coalesce:
         tensor = tensor.coalesce()
     return tensor
@@ -2375,7 +2377,7 @@ def backward(ctx, grad):
                 i = torch.ones(1, 1, dtype=torch.long)
                 nv = v.expand(8, 3)
                 ni = i.expand(1, 8)
-                ngrad = torch.sparse.FloatTensor(ni, nv, torch.Size([10, 3]))
+                ngrad = torch.sparse_coo_tensor(ni, nv, (10, 3), dtype=torch.float32)
                 NonContGradFunc.static_grad_ptr = ngrad._values().data_ptr()
                 return ngrad, ngrad
 
@@ -2680,9 +2682,6 @@ def test_device_maps_backward_pass(self):
         rpc.shutdown()
 
     class MyRemoteCompute(torch.nn.Module):
-        def __init__(self):
-            super().__init__()
-
         def forward(self, input):
             input = input * 2.0
             return input
diff --git a/torch/testing/_internal/distributed/rpc/examples/parameter_server_test.py b/torch/testing/_internal/distributed/rpc/examples/parameter_server_test.py
index 414e079b86d3..4de9ef0c261f 100644
--- a/torch/testing/_internal/distributed/rpc/examples/parameter_server_test.py
+++ b/torch/testing/_internal/distributed/rpc/examples/parameter_server_test.py
@@ -27,7 +27,7 @@ def timed_log(text):
     print(f"{datetime.now().strftime('%H:%M:%S')} {text}")
 
 
-class BatchUpdateParameterServer(object):
+class BatchUpdateParameterServer:
 
     def __init__(self, batch_update_size):
         self.model = nn.Linear(in_features, out_features)
@@ -47,7 +47,10 @@ def get_model(self):
     def update_and_fetch_model(ps_rref, grads):
         self = ps_rref.local_value()
         for p, g in zip(self.model.parameters(), grads):
-            p.grad += g
+            if p.grad is None:
+                p.grad = g
+            else:
+                p.grad += g
         with self.lock:
             timed_log(f"PS got {self.curr_update_size}/{self.batch_update_size} updates")
             self.curr_update_size += 1
@@ -66,7 +69,7 @@ def update_and_fetch_model(ps_rref, grads):
         return fut
 
 
-class Trainer(object):
+class Trainer:
 
     def __init__(self, ps_rref):
         self.ps_rref = ps_rref
diff --git a/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py b/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py
index fff6e5865f77..13d755d39a49 100644
--- a/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/examples/reinforcement_learning_rpc_test.py
@@ -43,7 +43,7 @@ class Policy(nn.Module):
     See https://github.com/pytorch/examples/tree/master/reinforcement_learning
     """
     def __init__(self):
-        super(Policy, self).__init__()
+        super().__init__()
         self.affine1 = nn.Linear(4, 128)
         self.dropout = nn.Dropout(p=0.6)
         self.affine2 = nn.Linear(128, 2)
diff --git a/torch/testing/_internal/distributed/rpc/faulty_agent_rpc_test.py b/torch/testing/_internal/distributed/rpc/faulty_agent_rpc_test.py
index 6586b7824bb3..d050a2138b79 100644
--- a/torch/testing/_internal/distributed/rpc/faulty_agent_rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/faulty_agent_rpc_test.py
@@ -54,7 +54,7 @@ def test_verify_backend_options(self):
     @dist_init(faulty_messages=["RREF_FORK_REQUEST", "RREF_CHILD_ACCEPT"])
     def test_custom_faulty_messages(self):
         self.assertEqual(
-            set(["RREF_FORK_REQUEST", "RREF_CHILD_ACCEPT"]),
+            {"RREF_FORK_REQUEST", "RREF_CHILD_ACCEPT"},
             set(self.rpc_backend_options.messages_to_fail),
         )
 
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index 4c0239ac653e..d85066930cf1 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -1808,7 +1808,7 @@ def test_profiler_rpc_memory(self):
             res = fut.wait()
 
         function_events = p.function_events
-        event_cpu_mem_usages = set(event.cpu_memory_usage for event in function_events)
+        event_cpu_mem_usages = {event.cpu_memory_usage for event in function_events}
         # if cpu_memory_usage was not propagated over the wire, this set would
         # only contain 0 (indicates no memory being profiled)
         self.assertNotEqual({0}, event_cpu_mem_usages)
@@ -1818,7 +1818,7 @@ def test_profiler_rpc_memory(self):
             res = fut.wait()
 
         function_events = p.function_events
-        event_cpu_mem_usages = set(event.cpu_memory_usage for event in function_events)
+        event_cpu_mem_usages = {event.cpu_memory_usage for event in function_events}
         self.assertEqual({0}, event_cpu_mem_usages)
 
     @dist_init
diff --git a/torch/testing/_internal/jit_metaprogramming_utils.py b/torch/testing/_internal/jit_metaprogramming_utils.py
index 6d649684896a..ec82aa2f70e9 100644
--- a/torch/testing/_internal/jit_metaprogramming_utils.py
+++ b/torch/testing/_internal/jit_metaprogramming_utils.py
@@ -15,7 +15,7 @@
 import math  # noqa: F401
 
 # Testing utils
-from torch._six import inf
+from torch import inf
 
 # TODO: include files like this should not set the default dtype
 torch.set_default_dtype(torch.double)
@@ -50,7 +50,7 @@ def maybe_non_contig(tensor):
         def conjugate(tensor):
             return tensor.conj()
 
-        if isinstance(arg, torch.Size) or isinstance(arg, dont_convert):
+        if isinstance(arg, (torch.Size, dont_convert)):
             return arg
         elif isinstance(arg, tuple) and len(arg) == 0:
             var = conjugate(torch.randn((), dtype=dtype, device=device))
@@ -605,7 +605,7 @@ class TheModule(torch.jit.ScriptModule):
             __constants__ = submodule_constants
 
             def __init__(self):
-                super(TheModule, self).__init__()
+                super().__init__()
                 self.submodule = nn_module(*constructor_args)
 
         def make_module(script):
diff --git a/torch/testing/_internal/jit_utils.py b/torch/testing/_internal/jit_utils.py
index 707529181b63..1146f98f777c 100644
--- a/torch/testing/_internal/jit_utils.py
+++ b/torch/testing/_internal/jit_utils.py
@@ -69,7 +69,7 @@ def get_execution_plan(graph_executor_state):
                            'only have one execution plan, got: {}'.format(num_plans))
     return execution_plans[0]
 
-class _AssertRaisesRegexWithHighlightContext(object):
+class _AssertRaisesRegexWithHighlightContext:
     """
     A context manager that is useful for checking that error messages highlight
     the correct part of the source code.
@@ -645,7 +645,7 @@ def checkModule(self, nn_module, args):
 
         return sm
 
-class NoTracerWarnContextManager(object):
+class NoTracerWarnContextManager:
     def __enter__(self):
         self.prev = torch._C._jit_get_tracer_state_warn()
         torch._C._jit_set_tracer_state_warn(False)
diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py
index 313c54acb8d9..665379f8cb14 100644
--- a/torch/testing/_internal/opinfo/core.py
+++ b/torch/testing/_internal/opinfo/core.py
@@ -56,7 +56,7 @@ def _getattr_qual(obj, name, default=_NOTHING):
             raise
 
 
-class DecorateInfo(object):
+class DecorateInfo:
     """Describes which test, or type of tests, should be wrapped in the given
     decorators when testing an operator. Any test that matches all provided
     arguments will be decorated. The decorators will only be applied if the
@@ -117,7 +117,7 @@ def is_active(self, cls_name, test_name, device_type, dtype, param_kwargs):
 # Note: historically the 'input' kwarg had to be a Tensor or TensorList, but we are trying
 #   to support scalar inputs, too. Some tests still depend on 'input' being a Tensor
 #   or TensorList, however.
-class SampleInput(object):
+class SampleInput:
     """Represents sample inputs to a function."""
 
     __slots__ = [
@@ -309,7 +309,7 @@ def to_noncontiguous(t):
 NumericsFilter = collections.namedtuple("NumericsFilter", ["condition", "safe_val"])
 
 
-class ErrorInput(object):
+class ErrorInput:
     """
     A SampleInput that will cause the operation to throw an error plus information
     about the resulting error.
@@ -323,7 +323,7 @@ def __init__(self, sample_input, *, error_type=RuntimeError, error_regex):
         self.error_regex = error_regex
 
 
-class AliasInfo(object):
+class AliasInfo:
     """Class holds alias information. For example, torch.abs ->
     torch.absolute, torch.Tensor.absolute, torch.Tensor.absolute_
     """
@@ -617,7 +617,7 @@ def __call__(self, *args, **kwargs):
 
 # Classes and methods for the operator database
 @dataclass
-class OpInfo(object):
+class OpInfo:
     """Operator information and helper functions for acquiring it."""
 
     # the string name of the function
@@ -1167,6 +1167,17 @@ def error_inputs(self, device, **kwargs):
         """
         return self.error_inputs_func(self, device, **kwargs)
 
+    def sample_inputs_sparse(
+        self, layout, device, dtype, requires_grad=False, **kwargs
+    ):
+        """Returns an iterable of SampleInputs that contain inputs with a
+        specified sparse layout.
+        """
+        sample_inputs_mth = getattr(
+            self, "sample_inputs_" + str(layout).split(".", 1)[-1]
+        )
+        return sample_inputs_mth(device, dtype, requires_grad=requires_grad, **kwargs)
+
     def sample_inputs_sparse_coo(self, device, dtype, requires_grad=False, **kwargs):
         """Returns an iterable of SampleInputs that contain inputs with sparse
         coo layout.
@@ -2000,7 +2011,7 @@ def __init__(
             ),
         )
         kwargs["skips"] = kwargs.get("skips", tuple()) + common_skips
-        super(BinaryUfuncInfo, self).__init__(
+        super().__init__(
             name,
             sample_inputs_func=sample_inputs_func,
             reference_inputs_func=reference_inputs_func,
@@ -2519,7 +2530,7 @@ def __init__(
         sample_inputs_func=None,
         **kwargs,
     ):
-        super(ShapeFuncInfo, self).__init__(
+        super().__init__(
             name,
             dtypes=dtypes,
             dtypesIfCUDA=dtypesIfCUDA,
@@ -2572,6 +2583,7 @@ def __init__(
         supports_alpha_param=False,
         sample_inputs_func=sample_inputs_foreach,
         supports_autograd=False,
+        supports_scalar_self_arg=False,
         **kwargs,
     ):
         super().__init__(
@@ -2583,6 +2595,7 @@ def __init__(
             supports_autograd=supports_autograd,
             **kwargs,
         )
+        self.supports_scalar_self_arg = supports_scalar_self_arg
 
         (
             foreach_method,
@@ -2700,5 +2713,5 @@ def clone_tensor(t):
     return SampleInput(
         clone_tensor(sample.input),
         args=tuple(map(clone_tensor, sample.args)),
-        kwargs=dict(((k, clone_tensor(v)) for k, v in sample_kwargs.items())),
+        kwargs={k: clone_tensor(v) for k, v in sample_kwargs.items()},
     )
diff --git a/torch/testing/_internal/opinfo/definitions/linalg.py b/torch/testing/_internal/opinfo/definitions/linalg.py
index e0d60c08022f..616c8cf42f4b 100644
--- a/torch/testing/_internal/opinfo/definitions/linalg.py
+++ b/torch/testing/_internal/opinfo/definitions/linalg.py
@@ -13,7 +13,6 @@
 from torch.testing._internal.common_cuda import (
     _get_magma_version,
     _get_torch_cuda_version,
-    CUDA11OrLater,
     with_tf32_off,
 )
 from torch.testing._internal.common_device_type import (
@@ -39,7 +38,6 @@
     make_fullrank_matrices_with_distinct_singular_values,
     skipIfSlowGradcheckEnv,
     slowTest,
-    TEST_WITH_ROCM,
 )
 from torch.testing._internal.opinfo.core import (
     clone_sample,
@@ -1203,9 +1201,7 @@ def make_input():
         aten_name="linalg_vecdot",
         ref=lambda x, y, *, dim=-1: (x.conj() * y).sum(dim),
         dtypes=floating_and_complex_types_and(torch.bfloat16),
-        dtypesIfCUDA=floating_and_complex_types_and(
-            torch.half, *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []
-        ),
+        dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16),
         sample_inputs_func=sample_inputs_linalg_vecdot,
         check_batched_forward_grad=False,
         supports_forward_ad=True,
@@ -1538,9 +1534,7 @@ def make_input():
         # Need this lambda because gradcheck does not work with TensorList inputs
         aten_name="linalg_multi_dot",
         dtypes=all_types_and_complex_and(torch.bfloat16),
-        dtypesIfCUDA=floating_and_complex_types_and(
-            torch.half, *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []
-        ),
+        dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16),
         supports_inplace_autograd=False,
         # Batched grad checks fail for empty input tensors (see https://github.com/pytorch/pytorch/issues/53407)
         check_batched_grad=False,
diff --git a/torch/testing/_internal/opinfo/refs.py b/torch/testing/_internal/opinfo/refs.py
index 500c93998e21..c3e6015c9588 100644
--- a/torch/testing/_internal/opinfo/refs.py
+++ b/torch/testing/_internal/opinfo/refs.py
@@ -115,7 +115,7 @@ def __init__(
 
         inherited = self.torch_opinfo._original_opinfo_args
         ukwargs = _inherit_constructor_args(name, op, inherited, kwargs)
-        super(PythonRefInfo, self).__init__(**ukwargs)
+        super().__init__(**ukwargs)
 
 
 class ReductionPythonRefInfo(ReductionOpInfo):
@@ -182,7 +182,7 @@ def __init__(
         inherited = self.torch_opinfo._original_unary_ufunc_args
         ukwargs = _inherit_constructor_args(name, op, inherited, kwargs)
 
-        super(ElementwiseUnaryPythonRefInfo, self).__init__(**ukwargs)
+        super().__init__(**ukwargs)
 
 
 class ElementwiseBinaryPythonRefInfo(BinaryUfuncInfo):
@@ -213,4 +213,4 @@ def __init__(
         inherited = self.torch_opinfo._original_binary_ufunc_args
         ukwargs = _inherit_constructor_args(name, op, inherited, kwargs)
 
-        super(ElementwiseBinaryPythonRefInfo, self).__init__(**ukwargs)
+        super().__init__(**ukwargs)
diff --git a/torch/testing/_internal/test_module/future_div.py b/torch/testing/_internal/test_module/future_div.py
index 3f042188490c..525c12af82b8 100644
--- a/torch/testing/_internal/test_module/future_div.py
+++ b/torch/testing/_internal/test_module/future_div.py
@@ -1,4 +1,3 @@
-from __future__ import division
 
 
 def div_int_future():
diff --git a/torch/types.py b/torch/types.py
index 0f62ca9561d5..bb973a3862fd 100644
--- a/torch/types.py
+++ b/torch/types.py
@@ -36,7 +36,7 @@ class SymInt:
 
 # Storage protocol implemented by ${Type}StorageBase classes
 
-class Storage(object):
+class Storage:
     _cdata: int
     device: torch.device
     dtype: torch.dtype
diff --git a/torch/utils/_cpp_extension_versioner.py b/torch/utils/_cpp_extension_versioner.py
index 958d34ecc71a..0c09a82413fe 100644
--- a/torch/utils/_cpp_extension_versioner.py
+++ b/torch/utils/_cpp_extension_versioner.py
@@ -25,7 +25,7 @@ def hash_build_arguments(hash_value, build_arguments):
     return hash_value
 
 
-class ExtensionVersioner(object):
+class ExtensionVersioner:
     def __init__(self):
         self.entries = {}
 
diff --git a/torch/utils/_cuda_trace.py b/torch/utils/_cuda_trace.py
index bc62145d683d..6de1c4d4d09d 100644
--- a/torch/utils/_cuda_trace.py
+++ b/torch/utils/_cuda_trace.py
@@ -1,7 +1,7 @@
 import logging
 from typing import Callable, Generic, List
 
-from typing_extensions import ParamSpec
+from typing_extensions import ParamSpec  # Python 3.10+
 
 logger = logging.getLogger(__name__)
 P = ParamSpec("P")
diff --git a/torch/utils/_device.py b/torch/utils/_device.py
index 54fb15df9ab1..12e9da716eec 100644
--- a/torch/utils/_device.py
+++ b/torch/utils/_device.py
@@ -8,6 +8,7 @@ def _device_constructors():
     return {
         # standard ones
         torch.empty,
+        torch.empty_permuted,
         torch.empty_strided,
         torch.empty_quantized,
         torch.ones,
diff --git a/torch/utils/_foreach_utils.py b/torch/utils/_foreach_utils.py
index 367974fb2caf..fd7af0f8abff 100644
--- a/torch/utils/_foreach_utils.py
+++ b/torch/utils/_foreach_utils.py
@@ -35,3 +35,8 @@ def _group_tensors_by_device_and_dtype(tensorlistlist: List[List[Tensor]],
             # tack on previous index
             per_device_and_dtype_tensors[key][j + 1].append(i)
     return per_device_and_dtype_tensors
+
+def _has_foreach_support(tensors: List[Tensor], device: torch.device) -> bool:
+    if device.type not in ['cpu', 'cuda'] or torch.jit.is_scripting():
+        return False
+    return all([t is None or type(t) == torch.Tensor for t in tensors])
diff --git a/torch/utils/_freeze.py b/torch/utils/_freeze.py
index 6104801edb33..9ba1502c25ee 100644
--- a/torch/utils/_freeze.py
+++ b/torch/utils/_freeze.py
@@ -253,9 +253,10 @@ def compile_file(self, path: Path, top_package_path: Path):
     parser = argparse.ArgumentParser(description="Compile py source")
     parser.add_argument("paths", nargs="*", help="Paths to freeze.")
     parser.add_argument("--verbose", action="store_true", help="Print debug logs")
-    parser.add_argument("--install_dir", help="Root directory for all output files")
+    parser.add_argument("--install-dir", "--install_dir", help="Root directory for all output files")
     parser.add_argument("--oss", action="store_true", help="If it's OSS build, add a fake _PyImport_FrozenModules")
     parser.add_argument(
+        "--symbol-name",
         "--symbol_name",
         help="The name of the frozen module array symbol to generate",
         default="_PyImport_FrozenModules_torch",
diff --git a/torch/utils/_stats.py b/torch/utils/_stats.py
new file mode 100644
index 000000000000..5b33f7b8cb02
--- /dev/null
+++ b/torch/utils/_stats.py
@@ -0,0 +1,21 @@
+# NOTE! PLEASE KEEP THIS FILE *FREE* OF TORCH DEPS! IT SHOULD BE IMPORTABLE ANYWHERE.
+# IF YOU FEEL AN OVERWHELMING URGE TO ADD A TORCH DEP, MAKE A TRAMPOLINE FILE A LA torch._dynamo.utils
+# AND SCRUB AWAY TORCH NOTIONS THERE.
+import collections
+import functools
+from typing import OrderedDict
+
+simple_call_counter: OrderedDict[str, int] = collections.OrderedDict()
+
+def count_label(label):
+    prev = simple_call_counter.setdefault(label, 0)
+    simple_call_counter[label] = prev + 1
+
+def count(fn):
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        if fn.__qualname__ not in simple_call_counter:
+            simple_call_counter[fn.__qualname__] = 0
+        simple_call_counter[fn.__qualname__] = simple_call_counter[fn.__qualname__] + 1
+        return fn(*args, **kwargs)
+    return wrapper
diff --git a/torch/utils/_sympy/__init__.py b/torch/utils/_sympy/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/torch/utils/_sympy/interp.py b/torch/utils/_sympy/interp.py
new file mode 100644
index 000000000000..b2561d416893
--- /dev/null
+++ b/torch/utils/_sympy/interp.py
@@ -0,0 +1,90 @@
+"""
+This is a simple interpreter for Sympy expressions that dispatches to
+classes following the torch._inductor.virtualized calling convention.
+For directness, the interpreter takes the handler directly rather than
+consulting the TLS.  It does not use most of the methods on the full
+handler; only those with corresponding Sympy expressions.  To see an example
+of a full handler, see torch.utils._sympy.value_ranges.ValueRangeAnalysis.
+"""
+
+import functools
+from typing import Any, Dict, Union
+
+import sympy
+from sympy.logic.boolalg import Boolean as SympyBoolean, BooleanAtom
+
+import torch
+
+
+# TODO: Dedupe this with SYMPY_INTERP
+
+
+@functools.lru_cache(None)
+def handlers():
+    from torch.fx.experimental.symbolic_shapes import FloorDiv, Pow, TrueDiv
+
+    HANDLERS = {
+        sympy.Or: "or_",
+        sympy.And: "and_",
+        sympy.Eq: "eq",
+        sympy.Ne: "ne",
+        sympy.Lt: "lt",
+        sympy.Gt: "gt",
+        sympy.Le: "le",
+        sympy.Ge: "ge",
+        sympy.Not: "not_",
+        TrueDiv: "truediv",
+        FloorDiv: "div",
+        sympy.Add: "add",
+        sympy.Mul: "mul",
+        Pow: "pow",
+        sympy.Pow: "pow",
+        sympy.Mod: "mod",
+        sympy.Abs: "abs",
+        sympy.log: "log",
+        sympy.exp: "exp",
+        sympy.floor: "floor",
+        sympy.ceiling: "ceil",
+        sympy.Min: "minimum",
+        sympy.Max: "maximum",
+    }
+    return HANDLERS
+
+
+ASSOCIATIVE_OPS = {"minimum", "maximum", "mul", "add", "and_", "or_"}
+
+
+def sympy_interp(
+    analysis, env: Dict[sympy.Symbol, Any], expr: Union[sympy.Expr, SympyBoolean]
+):
+    # Handle base cases
+    # TODO: not really sure if I'm passing the right dtype here
+    # TODO: wouldn't it be better to pass the sympy expression through
+    # sometimes?
+    if isinstance(expr, sympy.Integer):
+        return analysis.constant(int(expr), torch.int64)
+    elif isinstance(expr, sympy.Number):
+        return analysis.constant(float(expr), torch.double)
+    elif isinstance(expr, BooleanAtom):
+        return analysis.constant(bool(expr), torch.bool)
+    elif isinstance(expr, sympy.Symbol):
+        return env[expr]
+
+    # Special cases
+    if isinstance(expr, sympy.Pow) and isinstance(
+        expr.args[1], sympy.core.numbers.Half
+    ):
+        return analysis.sqrt(sympy_interp(analysis, env, expr.args[0]))
+
+    # Recursive case
+    args = [sympy_interp(analysis, env, arg) for arg in expr.args]  # type: ignore[arg-type]
+    handler_name = handlers()[expr.func]
+    handler = getattr(analysis, handler_name)
+    if handler_name in ASSOCIATIVE_OPS:
+        assert len(args) > 1
+        acc = handler(args[0], args[1])
+        for i in range(2, len(args)):
+            acc = handler(acc, args[i])
+        return acc
+    else:
+        return handler(*args)
diff --git a/torch/utils/_sympy/reference.py b/torch/utils/_sympy/reference.py
new file mode 100644
index 000000000000..5d9edc40ac4b
--- /dev/null
+++ b/torch/utils/_sympy/reference.py
@@ -0,0 +1,122 @@
+import sympy
+
+# The normal Python interpretation of the operators
+# NB: For magic methods this needs to use normal magic methods
+# so that test_magic_methods works
+class ReferenceAnalysis:
+    @staticmethod
+    def constant(c, dtype):
+        return sympy.sympify(c)
+
+    @staticmethod
+    def or_(a, b):
+        assert not isinstance(a, bool) and not isinstance(b, bool)
+        return a | b
+
+    @staticmethod
+    def and_(a, b):
+        assert not isinstance(a, bool) and not isinstance(b, bool)
+        return a & b
+
+    @staticmethod
+    def eq(a, b):
+        if isinstance(a, sympy.Expr) or isinstance(b, sympy.Expr):
+            return sympy.Eq(a, b)
+        return a == b
+
+    @classmethod
+    def ne(cls, a, b):
+        return cls.not_(cls.eq(a, b))
+
+    @staticmethod
+    def lt(a, b):
+        return a < b
+
+    @staticmethod
+    def gt(a, b):
+        return a > b
+
+    @staticmethod
+    def le(a, b):
+        return a <= b
+
+    @staticmethod
+    def ge(a, b):
+        return a >= b
+
+    @staticmethod
+    def not_(a):
+        assert not isinstance(a, bool)
+        return ~a
+
+    @staticmethod
+    def reciprocal(x):
+        return 1 / x
+
+    @staticmethod
+    def square(x):
+        return x * x
+
+    @staticmethod
+    def mod(x, y):
+        return x % y
+
+    @staticmethod
+    def abs(x):
+        return abs(x)
+
+    @staticmethod
+    def neg(x):
+        return -x
+
+    @staticmethod
+    def truediv(a, b):
+        return a / b
+
+    @staticmethod
+    def div(a, b):
+        return a // b
+
+    @staticmethod
+    def add(a, b):
+        return a + b
+
+    @staticmethod
+    def mul(a, b):
+        return a * b
+
+    @staticmethod
+    def sub(a, b):
+        return a - b
+
+    @staticmethod
+    def exp(x):
+        return sympy.exp(x)
+
+    @staticmethod
+    def log(x):
+        return sympy.log(x)
+
+    @staticmethod
+    def sqrt(x):
+        return sympy.sqrt(x)
+
+    @staticmethod
+    def pow(a, b):
+        return a**b
+
+    @staticmethod
+    def minimum(a, b):
+        return sympy.Min(a, b)
+
+    @staticmethod
+    def maximum(a, b):
+        return sympy.Max(a, b)
+
+    @staticmethod
+    def floor(x):
+        return sympy.floor(x)
+
+    @staticmethod
+    def ceil(x):
+        return sympy.ceiling(x)
diff --git a/torch/utils/_sympy/value_ranges.py b/torch/utils/_sympy/value_ranges.py
new file mode 100644
index 000000000000..900fbd1ea7b1
--- /dev/null
+++ b/torch/utils/_sympy/value_ranges.py
@@ -0,0 +1,429 @@
+import dataclasses
+import itertools
+import sympy
+from sympy.logic.boolalg import BooleanAtom, Boolean as SympyBoolean
+import operator
+import math
+import logging
+import torch
+from typing import Union
+
+log = logging.getLogger(__name__)
+
+__all__ = ["ValueRanges", "ValueRangeAnalysis"]
+
+
+# Like sympify, but supports less stuff, and also ensures that direct
+# sympy expressions don't have free variables
+def simple_sympify(e):
+    if isinstance(e, int):
+        return sympy.Integer(e)
+    elif isinstance(e, float):
+        # infinity is special; we use it to bracket integers as well
+        if math.isinf(e):
+            return sympy.oo if e > 0 else -sympy.oo
+        return sympy.Float(e)
+    elif isinstance(e, bool):
+        return sympy.true if e else sympy.false
+    elif isinstance(e, sympy.Expr):
+        # TODO: Eventually, we will want to do indexing calculations with
+        # respect to symbols, so we can generate a dynamic kernel which will
+        # use 32-bit indexing so long as the dynamic dim isn't too big.  To do
+        # that, we will need to be able to do ValueRanges
+        assert not e.free_symbols, f"free variables NYI: {e}"
+        # NaNs can occur when doing things like 0 * sympy.oo, but it is better
+        # if the operator notices this and takes care of it, because sometimes
+        # the NaN is inappropriate (for example, for ints, the [-oo, oo] range
+        # should go to zero when multiplied with [0, 0])
+        assert e != sympy.nan
+        return e
+    elif isinstance(e, BooleanAtom):
+        return e
+    else:
+        raise AssertionError(f"not simple sympy type {type(e)}: {e}")
+
+
+# Sympy atomics only. Unlike <=, it also works on Sympy bools.
+def sympy_generic_le(lower, upper):
+    if isinstance(lower, sympy.Expr):
+        assert isinstance(upper, sympy.Expr)
+        return lower <= upper
+    else:
+        # only negative condition is True > False
+        assert isinstance(lower, SympyBoolean) and isinstance(upper, SympyBoolean)
+        return not (lower is sympy.true and upper is sympy.false)
+
+
+@dataclasses.dataclass(frozen=True)
+class ValueRanges:
+    # Although the type signature here suggests you can pass any
+    # sympy expression, in practice the analysis here only works
+    # with sympy expressions with no free variables
+    lower: Union[sympy.Expr, SympyBoolean]
+    upper: Union[sympy.Expr, SympyBoolean]
+
+    def __init__(self, lower, upper):
+        lower = simple_sympify(lower)
+        upper = simple_sympify(upper)
+        # We don't support point-ranges on floating point inf
+        assert lower != sympy.oo
+        assert upper != -sympy.oo
+        # TODO: when the bounds have free variables, this may be
+        # nontrivial to actually verify
+        assert sympy_generic_le(lower, upper)
+        # Because this is a frozen class
+        object.__setattr__(self, "lower", lower)
+        object.__setattr__(self, "upper", upper)
+        object.__setattr__(self, "is_bool", isinstance(lower, SympyBoolean))
+
+    def __contains__(self, x):
+        x = simple_sympify(x)
+        return sympy_generic_le(self.lower, x) and sympy_generic_le(x, self.upper)
+
+    def is_singleton(self) -> bool:
+        return self.lower == self.upper
+
+    # TODO: this doesn't work with bools but arguably it should
+    @classmethod
+    def unknown(cls):
+        return cls(-sympy.oo, sympy.oo)
+
+    @classmethod
+    def wrap(cls, arg):
+        if isinstance(arg, ValueRanges):
+            return arg
+        return ValueRanges(arg, arg)
+
+    @classmethod
+    def increasing_map(cls, x, fn):
+        """map lower and upper bound with fn"""
+        x = cls.wrap(x)
+        return ValueRanges(fn(x.lower), fn(x.upper))
+
+    @classmethod
+    def decreasing_map(cls, x, fn):
+        """map lower bound to upper bound and upper bound to lower bound"""
+        x = cls.wrap(x)
+        return ValueRanges(fn(x.upper), fn(x.lower))
+
+    @classmethod
+    def monotone_map(cls, x, fn):
+        """check the max and min of computed upper and lower bound for the output"""
+        x = cls.wrap(x)
+        l = fn(x.lower)
+        u = fn(x.upper)
+        return ValueRanges(min(l, u), max(l, u))
+
+    @classmethod
+    def convex_min_zero_map(cls, x, fn):
+        """the max is at one of the ends"""
+        x = ValueRanges.wrap(x)
+        if 0 in x:
+            return ValueRanges(0, max(fn(x.lower), fn(x.upper)))
+        else:
+            return cls.monotone_map(x, fn)
+
+    @classmethod
+    def coordinatewise_increasing_map(cls, x, y, fn):
+        """map upper and lower bounds accessing corresponding values of inputs"""
+        x, y = cls.wrap(x), cls.wrap(y)
+        return ValueRanges(
+            fn(x.lower, y.lower),
+            fn(x.upper, y.upper),
+        )
+
+    @classmethod
+    def coordinatewise_monotone_map(cls, x, y, fn):
+        """compute the product of all lower and upper bounds and take min and max"""
+        x, y = cls.wrap(x), cls.wrap(y)
+        products = [
+            fn(a, b)
+            for a, b in itertools.product([x.lower, x.upper], [y.lower, y.upper])
+        ]
+        return ValueRanges(min(products), max(products))
+
+
+class ValueRangeAnalysis:
+    def __init__(self):
+        self.name = "ValueRangeAnalysis"
+        boolean_operators = (
+            "xor",
+            "logical_and",
+            "logical_or",
+            "logical_not",
+        )
+        for op in boolean_operators:
+            setattr(self, op, self.bool_handler)
+
+    @staticmethod
+    def bool_handler(*args, **kwargs):
+        # just assuming bools can have both values
+        return ValueRanges(sympy.false, sympy.true)  # type: ignore[arg-type]
+
+    @staticmethod
+    def default_handler(*args, **kwargs):
+        # many ops are unlikely to show up in optimizable indexing compute,
+        # so we dont have full coverage
+        return ValueRanges.unknown()
+
+    def load(self, name: str, index: sympy.Expr):
+        return ValueRanges.unknown()
+
+    def store(self, name, index, value, mode=None):
+        return
+
+    def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
+        return ValueRanges.unknown()
+
+    def index_expr(self, index, dtype):
+        assert isinstance(index, ValueRanges)
+        return index
+
+    @staticmethod
+    def or_(a, b):
+        a = ValueRanges.wrap(a)
+        b = ValueRanges.wrap(b)
+        assert a.is_bool and b.is_bool
+        if a.lower or b.lower:
+            return ValueRanges.wrap(sympy.true)
+        elif a.is_singleton() and b.is_singleton():
+            return ValueRanges.wrap(sympy.Or(a.lower, b.lower))
+        else:
+            return ValueRanges(sympy.false, sympy.true)
+
+    @staticmethod
+    def and_(a, b):
+        a = ValueRanges.wrap(a)
+        b = ValueRanges.wrap(b)
+        assert a.is_bool and b.is_bool
+        if not a.upper or not b.upper:
+            return ValueRanges.wrap(sympy.false)
+        elif a.is_singleton() and b.is_singleton():
+            return ValueRanges.wrap(sympy.And(a.lower, b.lower))
+        else:
+            return ValueRanges(sympy.false, sympy.true)
+
+    @staticmethod
+    def eq(a, b):
+        a = ValueRanges.wrap(a)
+        b = ValueRanges.wrap(b)
+        if a.is_singleton() and b.is_singleton() and a.lower == b.lower:
+            return ValueRanges.wrap(sympy.true)
+        elif a.lower > b.upper or b.lower > a.upper:  # ranges disjoint
+            return ValueRanges.wrap(sympy.false)
+        return ValueRanges(sympy.false, sympy.true)
+
+    @classmethod
+    def ne(cls, a, b):
+        return cls.not_(cls.eq(a, b))
+
+    @staticmethod
+    def lt(a, b):
+        a = ValueRanges.wrap(a)
+        b = ValueRanges.wrap(b)
+        if a.upper < b.lower:
+            return ValueRanges.wrap(sympy.true)
+        elif a.lower >= b.upper:
+            return ValueRanges.wrap(sympy.false)
+        return ValueRanges(sympy.false, sympy.true)
+
+    @classmethod
+    def gt(cls, a, b):
+        a = ValueRanges.wrap(a)
+        b = ValueRanges.wrap(b)
+        if a.lower > b.upper:
+            return ValueRanges.wrap(sympy.true)
+        elif a.upper <= b.lower:
+            return ValueRanges.wrap(sympy.false)
+        return ValueRanges(sympy.false, sympy.true)
+
+    @classmethod
+    def le(cls, a, b):
+        return cls.not_(cls.gt(a, b))
+
+    @classmethod
+    def ge(cls, a, b):
+        return cls.not_(cls.lt(a, b))
+
+    @staticmethod
+    def not_(a):
+        a = ValueRanges.wrap(a)
+        assert a.is_bool
+        if a.is_singleton():
+            return ValueRanges.wrap(sympy.Not(a.lower))
+        return ValueRanges(sympy.false, sympy.true)
+
+    @staticmethod
+    def to_dtype(x, dtype: torch.dtype):
+        def is_bool(val):
+            return isinstance(val, bool) or (
+                hasattr(val, "is_Boolean") and val.is_Boolean
+            )
+
+        x = ValueRanges.wrap(x)
+        low, up = x.lower, x.upper
+        if is_bool(low):
+            assert is_bool(up)
+            if dtype.is_floating_point:
+                return ValueRanges(0.0, 1.0)
+            else:
+                return ValueRanges(0, 1)
+        return ValueRanges.wrap(x)
+
+    @staticmethod
+    def constant(value, dtype):
+        # NB: value is NOT a sympy expression, it's a constant!
+        assert isinstance(value, (int, float, bool))
+        # using nan makes subsequent computation throw, and for the purposes of optimization
+        # returning -math.inf - math.inf is equivalent to giving up
+        if math.isnan(value):
+            return ValueRanges.unknown()
+        return ValueRanges.wrap(value)
+
+    @staticmethod
+    def reciprocal(x):
+        x = ValueRanges.wrap(x)
+        if 0 in x:
+            return ValueRanges.unknown()
+        else:
+            return ValueRanges.decreasing_map(x, lambda y: 1 / y)
+
+    @staticmethod
+    def square(x):
+        return ValueRanges.convex_min_zero_map(x, lambda y: y * y)
+
+    @staticmethod
+    def abs(x):
+        return ValueRanges.convex_min_zero_map(x, abs)
+
+    @staticmethod
+    def neg(x):
+        return ValueRanges.decreasing_map(x, operator.neg)
+
+    @staticmethod
+    def truediv(a, b):
+        b = ValueRanges.wrap(b)
+        if 0 in b:
+            return ValueRanges.unknown()
+        else:
+            return ValueRangeAnalysis.mul(a, ValueRanges(1 / b.upper, 1 / b.lower))
+
+    @staticmethod
+    def div(a, b):
+        # We think of this as floor(a / b)
+        out = ValueRangeAnalysis.truediv(a, b)
+        return ValueRangeAnalysis.floor(out)
+
+    @staticmethod
+    def add(a, b):
+        return ValueRanges.coordinatewise_increasing_map(a, b, operator.add)
+
+    @staticmethod
+    def mul(a, b):
+        def safe_mul(a, b):
+            if a == 0:
+                return 0
+            elif b == 0:
+                return 0
+            return a * b
+
+        return ValueRanges.coordinatewise_monotone_map(a, b, safe_mul)
+
+    @staticmethod
+    def sub(a, b):
+        b = ValueRanges.wrap(b)
+        return ValueRangeAnalysis.add(a, ValueRanges(-b.upper, -b.lower))
+
+    @staticmethod
+    def exp(x):
+        return ValueRanges.increasing_map(x, sympy.functions.elementary.exponential.exp)
+
+    @staticmethod
+    def log(x):
+        if x.lower <= 0:
+            return ValueRanges.unknown()
+        return ValueRanges.increasing_map(x, sympy.log)
+
+    @staticmethod
+    def mod(x, y):
+        if x.is_singleton() and y.is_singleton() and y.lower != 0:
+            return ValueRanges.wrap(x.lower % y.lower)
+        if y.lower <= 0:
+            return ValueRanges.unknown()
+        return ValueRanges(0, y.upper)
+
+    @staticmethod
+    def sqrt(x):
+        if x.lower < 0:
+            return ValueRanges.unknown()
+        return ValueRanges.increasing_map(x, sympy.sqrt)
+
+    @classmethod
+    def pow(cls, a, b):
+        def is_integer(val):
+            return isinstance(val, int) or (
+                hasattr(val, "is_integer") and val.is_integer
+            )
+
+        a = ValueRanges.wrap(a)
+        b = ValueRanges.wrap(b)
+        if a.is_singleton() and b.is_singleton():
+            r = a.lower**b.lower
+            if r == sympy.zoo:
+                return ValueRanges.unknown()
+            return ValueRanges.wrap(r)
+        elif b.is_singleton() and is_integer(b.lower) and b.lower >= 0:
+            i = ValueRanges.wrap(1)
+            for _ in range(b.lower):
+                i = cls.mul(i, a)
+            return i
+        else:
+            # This is fairly difficult to analyze, so give up for anything
+            # complicated
+            return ValueRanges.unknown()
+
+    @staticmethod
+    def minimum(a, b):
+        return ValueRanges.coordinatewise_increasing_map(a, b, min)
+
+    @staticmethod
+    def maximum(a, b):
+        return ValueRanges.coordinatewise_increasing_map(a, b, max)
+
+    @staticmethod
+    def where(a, b, c):
+        b = ValueRanges.wrap(b)
+        c = ValueRanges.wrap(c)
+        return ValueRanges(min(b.lower, c.lower), max(b.upper, c.upper))
+
+    @staticmethod
+    def floor(x):
+        return ValueRangeAnalysis.floor_ceil(
+            x, sympy.functions.elementary.integers.floor
+        )
+
+    @staticmethod
+    def ceil(x):
+        return ValueRangeAnalysis.floor_ceil(
+            x, sympy.functions.elementary.integers.ceiling
+        )
+
+    @staticmethod
+    def floor_ceil(x, fn_int):
+        def is_integer(val):
+            return isinstance(val, int) or (
+                hasattr(val, "is_integer") and val.is_integer
+            )
+
+        if is_integer(x):
+            fn = fn_int
+        else:
+
+            def fn(x):
+                return sympy.Float(fn_int(x))
+
+        return ValueRanges.increasing_map(x, fn)
+
+    def __getattr__(self, name):
+        log.warning(f"unhandled ValueRange op {name}")
+        return self.default_handler
diff --git a/torch/utils/_zip.py b/torch/utils/_zip.py
index 26a1fa37667f..6295f5c194d4 100644
--- a/torch/utils/_zip.py
+++ b/torch/utils/_zip.py
@@ -40,10 +40,12 @@ def write_to_zip(file_path, strip_file_path, zf, prepend_str=""):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Zip py source")
     parser.add_argument("paths", nargs="*", help="Paths to zip.")
-    parser.add_argument("--install_dir", help="Root directory for all output files")
-    parser.add_argument("--strip_dir", help="The absolute directory we want to remove from zip")
-    parser.add_argument("--prepend_str", help="A string to prepend onto all paths of a file in the zip", default="")
-    parser.add_argument("--zip_name", help="Output zip name")
+    parser.add_argument("--install-dir", "--install_dir", help="Root directory for all output files")
+    parser.add_argument("--strip-dir", "--strip_dir", help="The absolute directory we want to remove from zip")
+    parser.add_argument(
+        "--prepend-str", "--prepend_str", help="A string to prepend onto all paths of a file in the zip", default=""
+    )
+    parser.add_argument("--zip-name", "--zip_name", help="Output zip name")
 
     args = parser.parse_args()
 
diff --git a/torch/utils/backcompat/__init__.py b/torch/utils/backcompat/__init__.py
index a8e179e0f3f0..fdd16eec5aca 100644
--- a/torch/utils/backcompat/__init__.py
+++ b/torch/utils/backcompat/__init__.py
@@ -4,7 +4,7 @@
 from torch._C import _get_backcompat_keepdim_warn
 
 
-class Warning(object):
+class Warning:
     def __init__(self, setter, getter):
         self.setter = setter
         self.getter = getter
diff --git a/torch/utils/benchmark/examples/blas_compare.py b/torch/utils/benchmark/examples/blas_compare.py
index 910dd30cbe13..805633a185e2 100644
--- a/torch/utils/benchmark/examples/blas_compare.py
+++ b/torch/utils/benchmark/examples/blas_compare.py
@@ -123,12 +123,12 @@ def run_subprocess(args):
             f"source activate {env} && "
             f"taskset --cpu-list {core_str} "
             f"python {os.path.abspath(__file__)} "
-            "--DETAIL_in_subprocess "
-            f"--DETAIL_seed {seed} "
-            f"--DETAIL_num_threads {num_threads} "
-            f"--DETAIL_sub_label '{sub_label}' "
-            f"--DETAIL_result_file {result_file} "
-            f"--DETAIL_env {env}",
+            "--DETAIL-in-subprocess "
+            f"--DETAIL-seed {seed} "
+            f"--DETAIL-num-threads {num_threads} "
+            f"--DETAIL-sub-label '{sub_label}' "
+            f"--DETAIL-result-file {result_file} "
+            f"--DETAIL-env {env}",
             env=env_vars,
             stdout=subprocess.PIPE,
             shell=True
@@ -197,7 +197,7 @@ def main():
     subprocess.run(
         f"source activate {env_path} && "
         f"python {os.path.abspath(__file__)} "
-        "--DETAIL_in_compare",
+        "--DETAIL-in-compare",
         shell=True
     )
 
@@ -205,13 +205,13 @@ def main():
 if __name__ == "__main__":
     # These flags are for subprocess control, not controlling the main loop.
     parser = argparse.ArgumentParser()
-    parser.add_argument("--DETAIL_in_subprocess", action="store_true")
-    parser.add_argument("--DETAIL_in_compare", action="store_true")
-    parser.add_argument("--DETAIL_seed", type=int, default=None)
-    parser.add_argument("--DETAIL_num_threads", type=int, default=None)
-    parser.add_argument("--DETAIL_sub_label", type=str, default="N/A")
-    parser.add_argument("--DETAIL_result_file", type=str, default=None)
-    parser.add_argument("--DETAIL_env", type=str, default=None)
+    parser.add_argument("--DETAIL-in-subprocess", "--DETAIL_in_subprocess", action="store_true")
+    parser.add_argument("--DETAIL-in-compare", "--DETAIL_in_compare", action="store_true")
+    parser.add_argument("--DETAIL-seed", "--DETAIL_seed", type=int, default=None)
+    parser.add_argument("--DETAIL-num-threads", "--DETAIL_num_threads", type=int, default=None)
+    parser.add_argument("--DETAIL-sub-label", "--DETAIL_sub_label", type=str, default="N/A")
+    parser.add_argument("--DETAIL-result-file", "--DETAIL_result_file", type=str, default=None)
+    parser.add_argument("--DETAIL-env", "--DETAIL_env", type=str, default=None)
     args = parser.parse_args()
 
     if args.DETAIL_in_subprocess:
diff --git a/torch/utils/benchmark/examples/blas_compare_setup.py b/torch/utils/benchmark/examples/blas_compare_setup.py
index eba387aa7c6d..13d798a71018 100644
--- a/torch/utils/benchmark/examples/blas_compare_setup.py
+++ b/torch/utils/benchmark/examples/blas_compare_setup.py
@@ -113,8 +113,7 @@ def main():
         base_source = subprocess.run(
             f"source activate {env_path}",
             shell=True,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            capture_output=True,
         )
         if base_source.returncode:
             raise OSError(
@@ -147,8 +146,7 @@ def main():
                 f"source activate {env_path} && "
                 f"conda env config vars set {' '.join(env_spec.environment_variables)}",
                 shell=True,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
+                capture_output=True,
             )
             if env_set.returncode:
                 raise OSError(
@@ -161,8 +159,7 @@ def main():
             actual_env_vars = subprocess.run(
                 f"source activate {env_path} && env",
                 shell=True,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
+                capture_output=True,
             ).stdout.decode("utf-8").strip().splitlines()
             for e in env_spec.environment_variables:
                 assert e in actual_env_vars, f"{e} not in envs"
@@ -175,8 +172,7 @@ def main():
             f"cd {git_root} && "
             "python setup.py install --cmake",
             shell=True,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            capture_output=True,
         )
 
         print("Checking configuration:")
@@ -192,8 +188,7 @@ def main():
             "stats = counts.as_standardized().stats(inclusive=True);"
             "print(stats.filter(lambda l: 'blas' in l.lower()))\"",
             shell=True,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            capture_output=True,
         )
         if check_run.returncode:
             raise OSError(
diff --git a/torch/utils/benchmark/examples/compare.py b/torch/utils/benchmark/examples/compare.py
index f1688976af37..6f99d9d06ad5 100644
--- a/torch/utils/benchmark/examples/compare.py
+++ b/torch/utils/benchmark/examples/compare.py
@@ -12,7 +12,7 @@
 import torch.utils.benchmark as benchmark_utils
 
 
-class FauxTorch(object):
+class FauxTorch:
     """Emulate different versions of pytorch.
 
     In normal circumstances this would be done with multiple processes
diff --git a/torch/utils/benchmark/examples/end_to_end.py b/torch/utils/benchmark/examples/end_to_end.py
index 524795188a91..5e0f42712d7c 100644
--- a/torch/utils/benchmark/examples/end_to_end.py
+++ b/torch/utils/benchmark/examples/end_to_end.py
@@ -82,15 +82,15 @@
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--pr", type=str, default=_PR_LIST[0], choices=_PR_LIST)
-    parser.add_argument("--num_gpus", type=int, default=None)
-    parser.add_argument("--test_variance", action="store_true")
+    parser.add_argument("--num-gpus", "--num_gpus", type=int, default=None)
+    parser.add_argument("--test-variance", "--test_variance", action="store_true")
 
     # (Implementation details)
-    parser.add_argument("--DETAIL_context", type=str, choices=(_MAIN, _SUBPROCESS), default=_MAIN)
-    parser.add_argument("--DETAIL_device", type=str, choices=(_CPU, _GPU), default=None)
-    parser.add_argument("--DETAIL_env", type=str, default=None)
-    parser.add_argument("--DETAIL_result_file", type=str, default=None)
-    parser.add_argument("--DETAIL_seed", type=int, default=None)
+    parser.add_argument("--DETAIL-context", "--DETAIL_context", type=str, choices=(_MAIN, _SUBPROCESS), default=_MAIN)
+    parser.add_argument("--DETAIL-device", "--DETAIL_device", type=str, choices=(_CPU, _GPU), default=None)
+    parser.add_argument("--DETAIL-env", "--DETAIL_env", type=str, default=None)
+    parser.add_argument("--DETAIL-result-file", "--DETAIL_result_file", type=str, default=None)
+    parser.add_argument("--DETAIL-seed", "--DETAIL_seed", type=int, default=None)
 
     args = parser.parse_args()
     if args.num_gpus is None:
@@ -101,11 +101,11 @@ def parse_args():
 _SUBPROCESS_CMD_TEMPLATE = (
     "source activate {source_env} && python -m examples.end_to_end "
     "--pr {pr} "
-    "--DETAIL_context subprocess "
-    "--DETAIL_device {device} "
-    "--DETAIL_env {env} "
-    "--DETAIL_result_file {result_file} "
-    "--DETAIL_seed {seed}"
+    "--DETAIL-context subprocess "
+    "--DETAIL-device {device} "
+    "--DETAIL-env {env} "
+    "--DETAIL-result-file {result_file} "
+    "--DETAIL-seed {seed}"
 )
 
 
diff --git a/torch/utils/benchmark/examples/sparse/compare.py b/torch/utils/benchmark/examples/sparse/compare.py
index 0dd96e77c4da..4adbd6d2b35e 100644
--- a/torch/utils/benchmark/examples/sparse/compare.py
+++ b/torch/utils/benchmark/examples/sparse/compare.py
@@ -11,7 +11,7 @@
 import torch.utils.benchmark as benchmark_utils
 
 
-class FauxTorch(object):
+class FauxTorch:
     """Emulate different versions of pytorch.
 
     In normal circumstances this would be done with multiple processes
diff --git a/torch/utils/benchmark/examples/spectral_ops_fuzz_test.py b/torch/utils/benchmark/examples/spectral_ops_fuzz_test.py
index 40baf061f8b5..d8284ee4187c 100644
--- a/torch/utils/benchmark/examples/spectral_ops_fuzz_test.py
+++ b/torch/utils/benchmark/examples/spectral_ops_fuzz_test.py
@@ -87,7 +87,7 @@ def _output_csv(file, results):
     parser.add_argument('--bench', type=str, choices=BENCHMARK_NAMES, nargs='+', default=BENCHMARK_NAMES)
     parser.add_argument('--seed', type=int, default=0)
     parser.add_argument('--samples', type=int, default=10)
-    parser.add_argument('--probability_regular', type=float, default=1.0)
+    parser.add_argument('--probability-regular', '--probability_regular', type=float, default=1.0)
     parser.add_argument('-o', '--output', type=str)
     args = parser.parse_args()
 
diff --git a/torch/utils/benchmark/utils/_stubs.py b/torch/utils/benchmark/utils/_stubs.py
index 0b80a08e16c2..13fdd22e2727 100644
--- a/torch/utils/benchmark/utils/_stubs.py
+++ b/torch/utils/benchmark/utils/_stubs.py
@@ -1,11 +1,4 @@
-import sys
-from typing import Any, Callable, Dict, TYPE_CHECKING
-
-
-if TYPE_CHECKING or sys.version_info >= (3, 8):
-    from typing import runtime_checkable, Protocol
-else:
-    from typing_extensions import runtime_checkable, Protocol
+from typing import Any, Callable, Dict, Protocol, runtime_checkable
 
 
 class TimerClass(Protocol):
diff --git a/torch/utils/benchmark/utils/compare.py b/torch/utils/benchmark/utils/compare.py
index d3713fd708cf..9c7863e6a740 100644
--- a/torch/utils/benchmark/utils/compare.py
+++ b/torch/utils/benchmark/utils/compare.py
@@ -24,7 +24,7 @@ class Colorize(enum.Enum):
 
 
 # Classes to separate internal bookkeeping from what is rendered.
-class _Column(object):
+class _Column:
     def __init__(
         self,
         grouped_results: List[Tuple[Optional[common.Measurement], ...]],
@@ -75,10 +75,10 @@ def optional_min(seq):
     return None if len(l) == 0 else min(l)
 
 
-class _Row(object):
+class _Row:
     def __init__(self, results, row_group, render_env, env_str_len,
                  row_name_str_len, time_scale, colorize, num_threads=None):
-        super(_Row, self).__init__()
+        super().__init__()
         self._results = results
         self._row_group = row_group
         self._render_env = render_env
@@ -147,7 +147,7 @@ def finalize_column_strings(self, column_strings, col_widths):
         return row_contents
 
 
-class Table(object):
+class Table:
     def __init__(
             self,
             results: List[common.Measurement],
@@ -155,7 +155,7 @@ def __init__(
             trim_significant_figures: bool,
             highlight_warnings: bool
     ):
-        assert len(set(r.label for r in results)) == 1
+        assert len({r.label for r in results}) == 1
 
         self.results = results
         self._colorize = colorize
@@ -265,7 +265,7 @@ def render(self) -> str:
 {'(! XX%) Measurement has high variance, where XX is the IQR / median * 100.' + newline if has_warnings else ""}"""[1:]
 
 
-class Compare(object):
+class Compare:
     def __init__(self, results: List[common.Measurement]):
         self._results: List[common.Measurement] = []
         self.extend_results(results)
diff --git a/torch/utils/benchmark/utils/fuzzer.py b/torch/utils/benchmark/utils/fuzzer.py
index ac813bb42393..11e1c0482db2 100644
--- a/torch/utils/benchmark/utils/fuzzer.py
+++ b/torch/utils/benchmark/utils/fuzzer.py
@@ -19,7 +19,7 @@
 )
 
 
-class FuzzedParameter(object):
+class FuzzedParameter:
     """Specification for a parameter to be generated during fuzzing."""
     def __init__(
         self,
@@ -126,7 +126,7 @@ def _custom_distribution(self, state):
         return list(self._distribution.keys())[index]
 
 
-class ParameterAlias(object):
+class ParameterAlias:
     """Indicates that a parameter should alias the value of another parameter.
 
     When used in conjunction with a custom distribution, this allows fuzzed
@@ -176,7 +176,7 @@ def prod(values, base=1):
     return functools.reduce(lambda x, y: int(x) * int(y), values, base)
 
 
-class FuzzedTensor(object):
+class FuzzedTensor:
     def __init__(
         self,
         name: str,
@@ -340,7 +340,7 @@ def nullable_greater(left, right):
         ))
 
 
-class Fuzzer(object):
+class Fuzzer:
     def __init__(
         self,
         parameters: List[Union[FuzzedParameter, List[FuzzedParameter]]],
diff --git a/torch/utils/benchmark/utils/timer.py b/torch/utils/benchmark/utils/timer.py
index 61b05e144924..c745601699b7 100644
--- a/torch/utils/benchmark/utils/timer.py
+++ b/torch/utils/benchmark/utils/timer.py
@@ -64,7 +64,7 @@ def timeit(self, number: int) -> float:
         return self._timeit_module.timeit(number)
 
 
-class Timer(object):
+class Timer:
     """Helper class for measuring execution time of PyTorch statements.
 
     For a full tutorial on how to use this class, see:
diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp b/torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp
index bf97cf4c04bc..587685c7df74 100644
--- a/torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp
+++ b/torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp
@@ -28,13 +28,17 @@ int main(int argc, char* argv[]) {
   TORCH_CHECK(std::string(argv[1]) == "--number");
   auto number = std::stoi(argv[2]);
 
-  TORCH_CHECK(std::string(argv[3]) == "--number_warmup");
+  TORCH_CHECK(
+      std::string(argv[3]) == "--number-warmup" ||
+      std::string(argv[3]) == "--number_warmup");
   auto number_warmup = std::stoi(argv[4]);
 
   TORCH_CHECK(std::string(argv[5]) == "--repeats");
   auto repeats = std::stoi(argv[6]);
 
-  TORCH_CHECK(std::string(argv[7]) == "--number_threads");
+  TORCH_CHECK(
+      std::string(argv[7]) == "--number-threads" ||
+      std::string(argv[7]) == "--number_threads");
   auto number_threads = std::stoi(argv[8]);
   torch::set_num_threads(number_threads);
 
diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
index 378dd27c65ba..71753bd59548 100644
--- a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
+++ b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
@@ -32,7 +32,7 @@
 
 
 @dataclasses.dataclass(repr=False, eq=False, frozen=True)
-class FunctionCounts(object):
+class FunctionCounts:
     """Container for manipulating Callgrind results.
 
     It supports:
@@ -52,8 +52,7 @@ class FunctionCounts(object):
     _linewidth: Optional[int] = None
 
     def __iter__(self) -> Generator[FunctionCount, None, None]:
-        for i in self._data:
-            yield i
+        yield from self._data
 
     def __len__(self) -> int:
         return len(self._data)
@@ -157,7 +156,7 @@ def _from_dict(counts: Dict[str, int], inclusive: bool) -> "FunctionCounts":
 
 
 @dataclasses.dataclass(repr=False, eq=False, frozen=True)
-class CallgrindStats(object):
+class CallgrindStats:
     """Top level container for Callgrind results collected by Timer.
 
     Manipulation is generally done using the FunctionCounts class, which is
@@ -471,7 +470,7 @@ def construct(self) -> str:
         return "\n".join(load_lines)
 
 
-class _ValgrindWrapper(object):
+class _ValgrindWrapper:
     def __init__(self) -> None:
         self._bindings_module: Optional[CallgrindModuleType] = None
         valgrind_symbols = (
@@ -494,8 +493,7 @@ def __init__(self) -> None:
             for cmd in ("valgrind", "callgrind_control", "callgrind_annotate"):
                 self._commands_available[cmd] = not subprocess.run(
                     ["which", cmd],
-                    stdout=subprocess.PIPE,
-                    stderr=subprocess.PIPE,
+                    capture_output=True,
                 ).returncode
 
         self._build_type: Optional[str] = None
@@ -636,9 +634,9 @@ def run(args: List[str], **kwargs: Any) -> Tuple[CompletedProcessType, str]:
                 run_loop_cmd = [
                     run_loop_exec,
                     "--number", str(number),
-                    "--number_warmup", str(min(number, 10)),
+                    "--number-warmup", str(min(number, 10)),
                     "--repeats", str(repeats),
-                    "--number_threads", str(task_spec.num_threads),
+                    "--number-threads", str(task_spec.num_threads),
                 ]
 
             valgrind_invocation, valgrind_invocation_output = run([
diff --git a/torch/utils/checkpoint.py b/torch/utils/checkpoint.py
index 9483a742eddd..733d5b1a4f2f 100644
--- a/torch/utils/checkpoint.py
+++ b/torch/utils/checkpoint.py
@@ -41,8 +41,8 @@ def check_backward_validity(inputs: Iterable[Any]) -> None:
 def get_device_states(*args) -> Tuple[List[int], List[torch.Tensor]]:
     # This will not error out if "arg" is a CPU tensor or a non-tensor type because
     # the conditionals short-circuit.
-    fwd_gpu_devices = list(set(arg.get_device() for arg in args
-                               if isinstance(arg, torch.Tensor) and arg.is_cuda))
+    fwd_gpu_devices = list({arg.get_device() for arg in args
+                            if isinstance(arg, torch.Tensor) and arg.is_cuda})
 
     fwd_gpu_states = []
     for device in fwd_gpu_devices:
diff --git a/torch/utils/collect_env.py b/torch/utils/collect_env.py
index b658666dd2af..a97cb318d104 100644
--- a/torch/utils/collect_env.py
+++ b/torch/utils/collect_env.py
@@ -1,4 +1,3 @@
-from __future__ import print_function
 
 # Unlike the rest of the PyTorch this file must be python2 compliant.
 # This script outputs relevant system environment info
@@ -44,6 +43,7 @@
     'miopen_runtime_version',
     'caching_allocator_config',
     'is_xnnpack_available',
+    'cpu_info',
 ])
 
 
@@ -180,7 +180,7 @@ def get_cudnn_version(run_lambda):
     if not files_set:
         return None
     # Alphabetize the result because the order is non-deterministic otherwise
-    files = list(sorted(files_set))
+    files = sorted(files_set)
     if len(files) == 1:
         return files[0]
     result = '\n'.join(files)
@@ -203,6 +203,98 @@ def get_nvidia_smi():
     return smi
 
 
+# example outputs of CPU infos
+#  * linux
+#    Architecture:            x86_64
+#      CPU op-mode(s):        32-bit, 64-bit
+#      Address sizes:         46 bits physical, 48 bits virtual
+#      Byte Order:            Little Endian
+#    CPU(s):                  128
+#      On-line CPU(s) list:   0-127
+#    Vendor ID:               GenuineIntel
+#      Model name:            Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
+#        CPU family:          6
+#        Model:               106
+#        Thread(s) per core:  2
+#        Core(s) per socket:  32
+#        Socket(s):           2
+#        Stepping:            6
+#        BogoMIPS:            5799.78
+#        Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr
+#                             sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl
+#                             xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16
+#                             pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand
+#                             hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced
+#                             fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap
+#                             avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1
+#                             xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq
+#                             avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities
+#    Virtualization features:
+#      Hypervisor vendor:     KVM
+#      Virtualization type:   full
+#    Caches (sum of all):
+#      L1d:                   3 MiB (64 instances)
+#      L1i:                   2 MiB (64 instances)
+#      L2:                    80 MiB (64 instances)
+#      L3:                    108 MiB (2 instances)
+#    NUMA:
+#      NUMA node(s):          2
+#      NUMA node0 CPU(s):     0-31,64-95
+#      NUMA node1 CPU(s):     32-63,96-127
+#    Vulnerabilities:
+#      Itlb multihit:         Not affected
+#      L1tf:                  Not affected
+#      Mds:                   Not affected
+#      Meltdown:              Not affected
+#      Mmio stale data:       Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
+#      Retbleed:              Not affected
+#      Spec store bypass:     Mitigation; Speculative Store Bypass disabled via prctl and seccomp
+#      Spectre v1:            Mitigation; usercopy/swapgs barriers and __user pointer sanitization
+#      Spectre v2:            Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence
+#      Srbds:                 Not affected
+#      Tsx async abort:       Not affected
+#  * win32
+#    Architecture=9
+#    CurrentClockSpeed=2900
+#    DeviceID=CPU0
+#    Family=179
+#    L2CacheSize=40960
+#    L2CacheSpeed=
+#    Manufacturer=GenuineIntel
+#    MaxClockSpeed=2900
+#    Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
+#    ProcessorType=3
+#    Revision=27142
+#
+#    Architecture=9
+#    CurrentClockSpeed=2900
+#    DeviceID=CPU1
+#    Family=179
+#    L2CacheSize=40960
+#    L2CacheSpeed=
+#    Manufacturer=GenuineIntel
+#    MaxClockSpeed=2900
+#    Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
+#    ProcessorType=3
+#    Revision=27142
+
+def get_cpu_info(run_lambda):
+    rc, out, err = 0, '', ''
+    if get_platform() == 'linux':
+        rc, out, err = run_lambda('lscpu')
+    elif get_platform() == 'win32':
+        rc, out, err = run_lambda('wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID,\
+        CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE')
+    elif get_platform() == 'darwin':
+        rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
+    cpu_info = 'None'
+    if rc == 0:
+        cpu_info = out
+    else:
+        cpu_info = err
+    return cpu_info
+
+
 def get_platform():
     if sys.platform.startswith('linux'):
         return 'linux'
@@ -373,6 +465,7 @@ def get_env_info():
         cmake_version=get_cmake_version(run_lambda),
         caching_allocator_config=get_cachingallocator_config(),
         is_xnnpack_available=is_xnnpack_available(),
+        cpu_info=get_cpu_info(run_lambda),
     )
 
 env_info_fmt = """
@@ -399,6 +492,9 @@ def get_env_info():
 MIOpen runtime version: {miopen_runtime_version}
 Is XNNPACK available: {is_xnnpack_available}
 
+CPU:
+{cpu_info}
+
 Versions of relevant libraries:
 {pip_packages}
 {conda_packages}
@@ -476,6 +572,7 @@ def maybe_start_on_next_line(string):
     if mutable_dict['conda_packages']:
         mutable_dict['conda_packages'] = prepend(mutable_dict['conda_packages'],
                                                  '[conda] ')
+    mutable_dict['cpu_info'] = envinfo.cpu_info
     return env_info_fmt.format(**mutable_dict)
 
 
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 9d451b1846f7..cb5cbf0f02ab 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -424,7 +424,7 @@ def _check_cuda_version(compiler_name: str, compiler_version: TorchVersion) -> N
 # https://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj-when
 
 
-class BuildExtension(build_ext, object):
+class BuildExtension(build_ext):
     r'''
     A custom :mod:`setuptools` build extension .
 
@@ -464,7 +464,7 @@ def __init__(self, *args, **kwargs):
         return cls_with_options
 
     def __init__(self, *args, **kwargs) -> None:
-        super(BuildExtension, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
         self.no_python_abi_suffix = kwargs.get("no_python_abi_suffix", False)
 
         self.use_ninja = kwargs.get('use_ninja', True)
@@ -846,7 +846,7 @@ def get_ext_filename(self, ext_name):
         # Get the original shared library name. For Python 3, this name will be
         # suffixed with "<SOABI>.so", where <SOABI> will be something like
         # cpython-37m-x86_64-linux-gnu.
-        ext_filename = super(BuildExtension, self).get_ext_filename(ext_name)
+        ext_filename = super().get_ext_filename(ext_name)
         # If `no_python_abi_suffix` is `True`, we omit the Python 3 ABI
         # component. This makes building shared libraries with setuptools that
         # aren't Python modules nicer.
@@ -1401,7 +1401,7 @@ def load_inline(name,
             functions = [functions]
         if isinstance(functions, list):
             # Make the function docstring the same as the function name.
-            functions = dict((f, f) for f in functions)
+            functions = {f: f for f in functions}
         elif not isinstance(functions, dict):
             raise ValueError(f"Expected 'functions' to be a list or dict, but was {type(functions)}")
         for function_name, docstring in functions.items():
@@ -1736,13 +1736,13 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
         ('Pascal', '6.0;6.1+PTX'),
         ('Volta', '7.0+PTX'),
         ('Turing', '7.5+PTX'),
-        ('Ampere', '8.0;8.6+PTX'),
+        ('Ampere', '8.0;8.6+PTX;8.7+PTX'),
         ('Ada', '8.9+PTX'),
         ('Hopper', '9.0+PTX'),
     ])
 
     supported_arches = ['3.5', '3.7', '5.0', '5.2', '5.3', '6.0', '6.1', '6.2',
-                        '7.0', '7.2', '7.5', '8.0', '8.6', '8.9', '9.0']
+                        '7.0', '7.2', '7.5', '8.0', '8.6', '8.7', '8.9', '9.0']
     valid_arch_strings = supported_arches + [s + "+PTX" for s in supported_arches]
 
     # The default is sm_30 for CUDA 9.x and 10.x
@@ -1790,7 +1790,7 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
             if arch.endswith('+PTX'):
                 flags.append(f'-gencode=arch=compute_{num},code=compute_{num}')
 
-    return sorted(list(set(flags)))
+    return sorted(set(flags))
 
 
 def _get_rocm_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
diff --git a/torch/utils/data/_utils/collate.py b/torch/utils/data/_utils/collate.py
index 72479e0ee935..839cbbea2c79 100644
--- a/torch/utils/data/_utils/collate.py
+++ b/torch/utils/data/_utils/collate.py
@@ -13,7 +13,6 @@
 import torch
 
 from typing import Callable, Dict, Optional, Tuple, Type, Union
-from torch._six import string_classes
 
 np_str_obj_array_pattern = re.compile(r'[SaUO]')
 
@@ -70,7 +69,7 @@ def default_convert(data):
         return elem_type(*(default_convert(d) for d in data))
     elif isinstance(data, tuple):
         return [default_convert(d) for d in data]  # Backwards compatibility.
-    elif isinstance(data, collections.abc.Sequence) and not isinstance(data, string_classes):
+    elif isinstance(data, collections.abc.Sequence) and not isinstance(data, str):
         try:
             return elem_type([default_convert(d) for d in data])
         except TypeError:
@@ -198,7 +197,7 @@ def collate_str_fn(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[Typ
     default_collate_fn_map[(np.bool_, np.number, np.object_)] = collate_numpy_scalar_fn
 default_collate_fn_map[float] = collate_float_fn
 default_collate_fn_map[int] = collate_int_fn
-default_collate_fn_map[string_classes] = collate_str_fn
+default_collate_fn_map[str] = collate_str_fn
 
 
 def default_collate(batch):
diff --git a/torch/utils/data/_utils/fetch.py b/torch/utils/data/_utils/fetch.py
index 0262c078ca98..4a9782f06a85 100644
--- a/torch/utils/data/_utils/fetch.py
+++ b/torch/utils/data/_utils/fetch.py
@@ -4,7 +4,7 @@
 """
 
 
-class _BaseDatasetFetcher(object):
+class _BaseDatasetFetcher:
     def __init__(self, dataset, auto_collation, collate_fn, drop_last):
         self.dataset = dataset
         self.auto_collation = auto_collation
@@ -17,9 +17,7 @@ def fetch(self, possibly_batched_index):
 
 class _IterableDatasetFetcher(_BaseDatasetFetcher):
     def __init__(self, dataset, auto_collation, collate_fn, drop_last):
-        super(_IterableDatasetFetcher, self).__init__(
-            dataset, auto_collation, collate_fn, drop_last
-        )
+        super().__init__(dataset, auto_collation, collate_fn, drop_last)
         self.dataset_iter = iter(dataset)
         self.ended = False
 
@@ -45,11 +43,6 @@ def fetch(self, possibly_batched_index):
 
 
 class _MapDatasetFetcher(_BaseDatasetFetcher):
-    def __init__(self, dataset, auto_collation, collate_fn, drop_last):
-        super(_MapDatasetFetcher, self).__init__(
-            dataset, auto_collation, collate_fn, drop_last
-        )
-
     def fetch(self, possibly_batched_index):
         if self.auto_collation:
             if hasattr(self.dataset, "__getitems__") and self.dataset.__getitems__:
diff --git a/torch/utils/data/_utils/pin_memory.py b/torch/utils/data/_utils/pin_memory.py
index 466cf0c70e2a..7d2b7457f04e 100644
--- a/torch/utils/data/_utils/pin_memory.py
+++ b/torch/utils/data/_utils/pin_memory.py
@@ -9,7 +9,6 @@
 import queue
 
 import torch
-from torch._six import string_classes
 from . import MP_STATUS_CHECK_INTERVAL
 from torch._utils import ExceptionWrapper
 
@@ -54,7 +53,7 @@ def do_one_step():
 def pin_memory(data, device=None):
     if isinstance(data, torch.Tensor):
         return data.pin_memory(device)
-    elif isinstance(data, string_classes):
+    elif isinstance(data, str):
         return data
     elif isinstance(data, collections.abc.Mapping):
         try:
diff --git a/torch/utils/data/_utils/worker.py b/torch/utils/data/_utils/worker.py
index a12e4ea127b7..b4fc8e0748f0 100644
--- a/torch/utils/data/_utils/worker.py
+++ b/torch/utils/data/_utils/worker.py
@@ -22,7 +22,7 @@
     # On Windows, the parent ID of the worker process remains unchanged when the manager process
     # is gone, and the only way to check it through OS is to let the worker have a process handle
     # of the manager and ask if the process status has changed.
-    class ManagerWatchdog(object):
+    class ManagerWatchdog:
         def __init__(self):
             self.manager_pid = os.getppid()
 
@@ -48,7 +48,7 @@ def is_alive(self):
                 self.manager_dead = self.kernel32.WaitForSingleObject(self.manager_handle, 0) == 0
             return not self.manager_dead
 else:
-    class ManagerWatchdog(object):  # type: ignore[no-redef]
+    class ManagerWatchdog:  # type: ignore[no-redef]
         def __init__(self):
             self.manager_pid = os.getppid()
             self.manager_dead = False
@@ -61,7 +61,7 @@ def is_alive(self):
 _worker_info = None
 
 
-class WorkerInfo(object):
+class WorkerInfo:
     id: int
     num_workers: int
     seed: int
@@ -77,7 +77,7 @@ def __init__(self, **kwargs):
     def __setattr__(self, key, val):
         if self.__initialized:
             raise RuntimeError("Cannot assign attributes to {} objects".format(self.__class__.__name__))
-        return super(WorkerInfo, self).__setattr__(key, val)
+        return super().__setattr__(key, val)
 
     def __repr__(self):
         items = []
@@ -117,12 +117,12 @@ def get_worker_info() -> Optional[WorkerInfo]:
 
 r"""Dummy class used to signal the end of an IterableDataset"""
 @dataclass(frozen=True)
-class _IterableDatasetStopIteration(object):
+class _IterableDatasetStopIteration:
     worker_id: int
 
 r"""Dummy class used to resume the fetching when worker reuse is enabled"""
 @dataclass(frozen=True)
-class _ResumeIteration(object):
+class _ResumeIteration:
     seed: Optional[int] = None
 
 # The function `_generate_state` is adapted from `numpy.random.SeedSequence`
diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py
index c86ac8813f9d..e914ec3f6321 100644
--- a/torch/utils/data/dataloader.py
+++ b/torch/utils/data/dataloader.py
@@ -22,7 +22,6 @@
 import torch.utils.data.graph_settings
 
 from torch._utils import ExceptionWrapper
-from torch._six import string_classes
 
 from . import (
     IterDataPipe,
@@ -35,7 +34,7 @@
     Dataset,)
 
 from torch.utils.data.datapipes.datapipe import _IterDataPipeSerializationWrapper, _MapDataPipeSerializationWrapper
-from torch.utils.data.datapipes.iter.grouping import SHARDING_PRIORITIES
+from torch.utils.data.datapipes.iter.sharding import SHARDING_PRIORITIES
 
 from . import _utils
 
@@ -69,7 +68,7 @@
 logger = logging.getLogger(__name__)
 
 
-class _DatasetKind(object):
+class _DatasetKind:
     Map = 0
     Iterable = 1
 
@@ -90,7 +89,7 @@ class _InfiniteConstantSampler(Sampler):
     """
 
     def __init__(self):
-        super(_InfiniteConstantSampler, self).__init__(None)
+        super().__init__(None)
 
     def __iter__(self):
         while True:
@@ -181,8 +180,8 @@ class DataLoader(Generic[T_co]):
         persistent_workers (bool, optional): If ``True``, the data loader will not shutdown
             the worker processes after a dataset has been consumed once. This allows to
             maintain the workers `Dataset` instances alive. (default: ``False``)
-        pin_memory_device (str, optional): the data loader will copy Tensors
-            into device pinned memory before returning them if pin_memory is set to true.
+        pin_memory_device (str, optional): the device to pin memory to if ``pin_memory`` is
+            ``True``.
 
 
     .. warning:: If the ``spawn`` start method is used, :attr:`worker_init_fn`
@@ -396,7 +395,7 @@ def multiprocessing_context(self):
     def multiprocessing_context(self, multiprocessing_context):
         if multiprocessing_context is not None:
             if self.num_workers > 0:
-                if isinstance(multiprocessing_context, string_classes):
+                if isinstance(multiprocessing_context, str):
                     valid_start_methods = multiprocessing.get_all_start_methods()
                     if multiprocessing_context not in valid_start_methods:
                         raise ValueError(
@@ -423,7 +422,7 @@ def __setattr__(self, attr, val):
             raise ValueError('{} attribute should not be set after {} is '
                              'initialized'.format(attr, self.__class__.__name__))
 
-        super(DataLoader, self).__setattr__(attr, val)
+        super().__setattr__(attr, val)
 
     # We quote '_BaseDataLoaderIter' since it isn't defined yet and the definition can't be moved up
     # since '_BaseDataLoaderIter' references 'DataLoader'.
@@ -565,7 +564,7 @@ def _create_warning_msg(num_worker_suggest, num_worker_created, cpuset_checked):
                 cpuset_checked))
 
 
-class _BaseDataLoaderIter(object):
+class _BaseDataLoaderIter:
     def __init__(self, loader: DataLoader) -> None:
         self._dataset = loader.dataset
         self._shared_seed = None
@@ -661,7 +660,7 @@ def __getstate__(self):
 
 class _SingleProcessDataLoaderIter(_BaseDataLoaderIter):
     def __init__(self, loader):
-        super(_SingleProcessDataLoaderIter, self).__init__(loader)
+        super().__init__(loader)
         assert self._timeout == 0
         assert self._num_workers == 0
 
@@ -993,7 +992,7 @@ class _MultiProcessingDataLoaderIter(_BaseDataLoaderIter):
     #     down.
 
     def __init__(self, loader):
-        super(_MultiProcessingDataLoaderIter, self).__init__(loader)
+        super().__init__(loader)
 
         self._prefetch_factor = loader.prefetch_factor
 
diff --git a/torch/utils/data/datapipes/_decorator.py b/torch/utils/data/datapipes/_decorator.py
index e466de512523..e4cc9e4e5936 100644
--- a/torch/utils/data/datapipes/_decorator.py
+++ b/torch/utils/data/datapipes/_decorator.py
@@ -8,7 +8,7 @@
 ######################################################
 # Functional API
 ######################################################
-class functional_datapipe(object):
+class functional_datapipe:
     name: str
 
     def __init__(self, name: str, enable_df_api_tracing=False) -> None:
@@ -44,7 +44,7 @@ def __call__(self, cls):
 _determinism: bool = False
 
 
-class guaranteed_datapipes_determinism(object):
+class guaranteed_datapipes_determinism:
     prev: bool
 
     def __init__(self) -> None:
@@ -60,7 +60,7 @@ def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
         _determinism = self.prev
 
 
-class non_deterministic(object):
+class non_deterministic:
     cls: Optional[Type[IterDataPipe]] = None
     # TODO: Lambda for picking
     deterministic_fn: Callable[[], bool]
@@ -145,7 +145,7 @@ def wrapper(*args, **kwargs):
 _runtime_validation_enabled: bool = True
 
 
-class runtime_validation_disabled(object):
+class runtime_validation_disabled:
     prev: bool
 
     def __init__(self) -> None:
diff --git a/torch/utils/data/datapipes/_typing.py b/torch/utils/data/datapipes/_typing.py
index ab5e3fb33b60..a7cd07179d92 100644
--- a/torch/utils/data/datapipes/_typing.py
+++ b/torch/utils/data/datapipes/_typing.py
@@ -101,7 +101,7 @@ def _decompose_type(t, to_list=True):
             return None
         ts = [t]
     # Ignored: Generator has incompatible item type "object"; expected "Type[Any]"
-    ts = list(TYPE2ABC.get(_t, _t) for _t in ts)  # type: ignore[misc]
+    ts = [TYPE2ABC.get(_t, _t) for _t in ts]  # type: ignore[misc]
     return ts
 
 
diff --git a/torch/utils/data/datapipes/dataframe/dataframe_wrapper.py b/torch/utils/data/datapipes/dataframe/dataframe_wrapper.py
index 540adc3777eb..d3d31ded8474 100644
--- a/torch/utils/data/datapipes/dataframe/dataframe_wrapper.py
+++ b/torch/utils/data/datapipes/dataframe/dataframe_wrapper.py
@@ -43,8 +43,7 @@ def is_column(cls, data):
     def iterate(cls, data):
         if not _with_pandas():
             raise Exception("DataFrames prototype requires pandas to function")
-        for d in data.itertuples(index=False):
-            yield d
+        yield from data.itertuples(index=False)
 
     @classmethod
     def concat(cls, buffer):
diff --git a/torch/utils/data/datapipes/dataframe/dataframes.py b/torch/utils/data/datapipes/dataframe/dataframes.py
index 3a7cbb44feaf..dfb1bc94df36 100644
--- a/torch/utils/data/datapipes/dataframe/dataframes.py
+++ b/torch/utils/data/datapipes/dataframe/dataframes.py
@@ -57,7 +57,7 @@ def __iter__(self):
 UNIMPLEMENTED_ATTR = ['__deepcopy__', '__setstate__', 'is_shardable', 'apply_sharding']
 
 
-class Capture(object):
+class Capture:
     # TODO: All operations are shared across entire InitialCapture, need to figure out what if we join two captures
 
     def __init__(self, schema_df=None):
diff --git a/torch/utils/data/datapipes/datapipe.py b/torch/utils/data/datapipes/datapipe.py
index 4463b0221b43..534962298141 100644
--- a/torch/utils/data/datapipes/datapipe.py
+++ b/torch/utils/data/datapipes/datapipe.py
@@ -384,9 +384,7 @@ def as_str(self, indent=''):
         return res
 
     def __iter__(self) -> Iterator[T]:
-        for i in super().__iter__():
-            yield i
+        yield from super().__iter__()
 
     def raw_iterator(self) -> T:  # type: ignore[misc]
-        for i in self.items:
-            yield i
+        yield from self.items
diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py
index b3007799e29b..a775f0be8753 100644
--- a/torch/utils/data/datapipes/iter/__init__.py
+++ b/torch/utils/data/datapipes/iter/__init__.py
@@ -25,9 +25,11 @@
 from torch.utils.data.datapipes.iter.grouping import (
     BatcherIterDataPipe as Batcher,
     GrouperIterDataPipe as Grouper,
-    ShardingFilterIterDataPipe as ShardingFilter,
     UnBatcherIterDataPipe as UnBatcher,
 )
+from torch.utils.data.datapipes.iter.sharding import (
+    ShardingFilterIterDataPipe as ShardingFilter,
+)
 from torch.utils.data.datapipes.iter.routeddecoder import (
     RoutedDecoderIterDataPipe as RoutedDecoder,
 )
diff --git a/torch/utils/data/datapipes/iter/combinatorics.py b/torch/utils/data/datapipes/iter/combinatorics.py
index 9776bdb5d04d..efcc7d91b6fb 100644
--- a/torch/utils/data/datapipes/iter/combinatorics.py
+++ b/torch/utils/data/datapipes/iter/combinatorics.py
@@ -46,7 +46,7 @@ def __iter__(self) -> Iterator[T_co]:
 
     def __len__(self) -> int:
         # Dataset has been tested as `Sized`
-        if isinstance(self.sampler, Sized) and len(self.sampler) >= 0:
+        if isinstance(self.sampler, Sized):
             return len(self.sampler)
         raise TypeError("{} instance doesn't have valid length".format(type(self).__name__))
 
diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py
index 178f0430c5b5..131f92440b2a 100644
--- a/torch/utils/data/datapipes/iter/combining.py
+++ b/torch/utils/data/datapipes/iter/combining.py
@@ -586,8 +586,7 @@ def __init__(self, *datapipes: IterDataPipe):
 
     def __iter__(self) -> Iterator[Tuple[T_co]]:
         iterators = [iter(datapipe) for datapipe in self.datapipes]
-        for data in zip(*iterators):
-            yield data
+        yield from zip(*iterators)
 
     def __len__(self) -> int:
         if all(isinstance(dp, Sized) for dp in self.datapipes):
diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py
index c47f0e9d4c0d..5b7837b8e738 100644
--- a/torch/utils/data/datapipes/iter/grouping.py
+++ b/torch/utils/data/datapipes/iter/grouping.py
@@ -1,88 +1,30 @@
+import warnings
 from collections import defaultdict
-from enum import IntEnum
+from typing import Any, Callable, DefaultDict, Iterator, List, Optional, Sized, TypeVar
+
+import torch.utils.data.datapipes.iter.sharding
 
 from torch.utils.data.datapipes._decorator import functional_datapipe
-from torch.utils.data.datapipes.datapipe import IterDataPipe, DataChunk
+from torch.utils.data.datapipes.datapipe import DataChunk, IterDataPipe
 from torch.utils.data.datapipes.utils.common import _check_unpickable_fn
-from typing import Any, Callable, DefaultDict, Dict, Iterator, List, Optional, Sized, Tuple, TypeVar
 
 __all__ = [
     "BatcherIterDataPipe",
     "GrouperIterDataPipe",
-    "ShardingFilterIterDataPipe",
-    "SHARDING_PRIORITIES",
     "UnBatcherIterDataPipe",
 ]
 
-T_co = TypeVar('T_co', covariant=True)
-
-
-class SHARDING_PRIORITIES(IntEnum):
-    DEFAULT = 1
-    DISTRIBUTED = 2
-    MULTIPROCESSING = 3
-
-
-@functional_datapipe('sharding_filter')
-class ShardingFilterIterDataPipe(IterDataPipe):
-    r"""
-    Wrapper that allows DataPipe to be sharded (functional name: ``sharding_filter``). After ``apply_sharding`` is
-    called, each instance of the DataPipe (on different workers) will have every `n`-th element of the
-    original DataPipe, where `n` equals to the number of instances.
-
-    Args:
-        source_datapipe: Iterable DataPipe that will be sharded
-    """
-
-    def __init__(self, source_datapipe: IterDataPipe, sharding_group_filter=None):
-        self.source_datapipe = source_datapipe
-        self.sharding_group_filter = sharding_group_filter
-        self.groups: Dict[int, Tuple[int, int]] = {}
-        self.num_of_instances = 1
-        self.instance_id = 0
-        self._update_num_of_instances()
-
-    def is_shardable(self):
-        return True
-
-    def apply_sharding(self, num_of_instances, instance_id, sharding_group=SHARDING_PRIORITIES.DEFAULT):
-        if instance_id >= num_of_instances:
-            raise ValueError(f"instance_id({instance_id}) should be smaller than num_of_instances({num_of_instances})")
-        if sharding_group == SHARDING_PRIORITIES.DEFAULT:
-            if len(self.groups) and SHARDING_PRIORITIES.DEFAULT not in self.groups:
-                raise Exception('ShardingFilter cannot mix DEFAULT and non DEFAULT groups')
-        else:
-            if SHARDING_PRIORITIES.DEFAULT in self.groups:
-                raise Exception('ShardingFilter cannot mix DEFAULT and non DEFAULT groups')
-        self.groups[sharding_group] = (num_of_instances, instance_id)
-        self._update_num_of_instances()
-
-    def _update_num_of_instances(self):
-        sorted_sharding_groups = []
-        for key in sorted(self.groups.keys()):
-            if self.sharding_group_filter is None or key == self.sharding_group_filter:
-                sorted_sharding_groups.append(self.groups[key])
+T_co = TypeVar("T_co", covariant=True)
 
-        sorted_sharding_groups.reverse()
-
-        self.num_of_instances = 1
-        self.instance_id = 0
-
-        for group_num_of_instances, group_instance_id in sorted_sharding_groups:
-            self.instance_id += self.num_of_instances * group_instance_id
-            self.num_of_instances *= group_num_of_instances
-
-    def __iter__(self):
-        for i, item in enumerate(self.source_datapipe):
-            if i % self.num_of_instances == self.instance_id:
-                yield item
+def __getattr__(name: str):
+    if name in ["SHARDING_PRIORITIES", "ShardingFilterIterDataPipe"]:
+        warnings.warn(f"`{name}` from `torch.utils.data.datapipes.iter.grouping` is going to be removed in PyTorch 2.1"
+                      f"Please use `{name}` from the `torch.utils.data.datapipes.iter.sharding`",
+                      category=FutureWarning, stacklevel=2)
 
-    def __len__(self):
-        if isinstance(self.source_datapipe, Sized):
-            return len(self.source_datapipe) // self.num_of_instances +\
-                (1 if (self.instance_id < len(self.source_datapipe) % self.num_of_instances) else 0)
-        raise TypeError("{} instance doesn't have valid length".format(type(self).__name__))
+        return getattr(torch.utils.data.datapipes.iter.sharding, name)
 
+    raise AttributeError(f"module {__name__} has no attribute {name}")
 
 @functional_datapipe('batch')
 class BatcherIterDataPipe(IterDataPipe[DataChunk]):
@@ -182,7 +124,7 @@ def _dive(self, element, unbatch_level):
         if unbatch_level < -1:
             raise ValueError("unbatch_level must be -1 or >= 0")
         if unbatch_level == -1:
-            if isinstance(element, list) or isinstance(element, DataChunk):
+            if isinstance(element, (list, DataChunk)):
                 for item in element:
                     for i in self._dive(item, unbatch_level=-1):
                         yield i
@@ -191,7 +133,7 @@ def _dive(self, element, unbatch_level):
         elif unbatch_level == 0:
             yield element
         else:
-            if isinstance(element, list) or isinstance(element, DataChunk):
+            if isinstance(element, (list, DataChunk)):
                 for item in element:
                     for i in self._dive(item, unbatch_level=unbatch_level - 1):
                         yield i
@@ -216,6 +158,8 @@ class GrouperIterDataPipe(IterDataPipe[DataChunk]):
     Args:
         datapipe: Iterable datapipe to be grouped
         group_key_fn: Function used to generate group key from the data of the source datapipe
+        keep_key: Option to yield the matching key along with the items in a tuple,
+            resulting in `(key, [items])` otherwise returning [items]
         buffer_size: The size of buffer for ungrouped data
         group_size: The max size of each group, a batch is yielded as soon as it reaches this size
         guaranteed_group_size: The guaranteed minimum group size to be yielded in case the buffer is full
@@ -243,8 +187,9 @@ class GrouperIterDataPipe(IterDataPipe[DataChunk]):
     """
     def __init__(self,
                  datapipe: IterDataPipe[T_co],
-                 group_key_fn: Callable,
+                 group_key_fn: Callable[[T_co], Any],
                  *,
+                 keep_key: bool = False,
                  buffer_size: int = 10000,
                  group_size: Optional[int] = None,
                  guaranteed_group_size: Optional[int] = None,
@@ -253,6 +198,7 @@ def __init__(self,
         self.datapipe = datapipe
         self.group_key_fn = group_key_fn
 
+        self.keep_key = keep_key
         self.max_buffer_size = buffer_size
         self.buffer_elements: DefaultDict[Any, List] = defaultdict(list)
         self.curr_buffer_size = 0
@@ -295,19 +241,21 @@ def __iter__(self):
             self.curr_buffer_size += 1
 
             if self.group_size is not None and self.group_size == len(self.buffer_elements[key]):
-                yield self.wrapper_class(self.buffer_elements[key])
+                result: DataChunk[Any] = self.wrapper_class(self.buffer_elements[key])
+                yield (key, result) if self.keep_key else result
                 self.curr_buffer_size -= len(self.buffer_elements[key])
                 del self.buffer_elements[key]
 
             if self.curr_buffer_size == self.max_buffer_size:
                 result_to_yield = self._remove_biggest_key()
                 if result_to_yield is not None:
-                    yield self.wrapper_class(result_to_yield)
+                    result = self.wrapper_class(result_to_yield)
+                    yield (key, result) if self.keep_key else result
 
         for key in tuple(self.buffer_elements.keys()):
-            res = self.buffer_elements.pop(key)
-            self.curr_buffer_size -= len(res)
-            yield self.wrapper_class(res)
+            result = self.wrapper_class(self.buffer_elements.pop(key))
+            self.curr_buffer_size -= len(result)
+            yield (key, result) if self.keep_key else result
 
     def reset(self) -> None:
         self.curr_buffer_size = 0
@@ -317,6 +265,7 @@ def __getstate__(self):
         state = (
             self.datapipe,
             self.group_key_fn,
+            self.keep_key,
             self.max_buffer_size,
             self.group_size,
             self.guaranteed_group_size,
@@ -333,6 +282,7 @@ def __setstate__(self, state):
         (
             self.datapipe,
             self.group_key_fn,
+            self.keep_key,
             self.max_buffer_size,
             self.group_size,
             self.guaranteed_group_size,
diff --git a/torch/utils/data/datapipes/iter/sharding.py b/torch/utils/data/datapipes/iter/sharding.py
new file mode 100644
index 000000000000..83185f44139a
--- /dev/null
+++ b/torch/utils/data/datapipes/iter/sharding.py
@@ -0,0 +1,80 @@
+from typing import (
+    Dict,
+    Sized,
+    Tuple,
+)
+
+from torch.utils.data.datapipes._decorator import functional_datapipe
+from torch.utils.data.datapipes.datapipe import IterDataPipe
+from enum import IntEnum
+
+__all__ = [
+    "SHARDING_PRIORITIES",
+    "ShardingFilterIterDataPipe",
+]
+
+class SHARDING_PRIORITIES(IntEnum):
+    DEFAULT = 1
+    DISTRIBUTED = 2
+    MULTIPROCESSING = 3
+
+class _ShardingIterDataPipe(IterDataPipe):
+    def apply_sharding(self, num_of_instances, instance_id, sharding_group):
+        raise NotImplementedError
+
+@functional_datapipe('sharding_filter')
+class ShardingFilterIterDataPipe(_ShardingIterDataPipe):
+    r"""
+    Wrapper that allows DataPipe to be sharded (functional name: ``sharding_filter``). After ``apply_sharding`` is
+    called, each instance of the DataPipe (on different workers) will have every `n`-th element of the
+    original DataPipe, where `n` equals to the number of instances.
+
+    Args:
+        source_datapipe: Iterable DataPipe that will be sharded
+    """
+
+    def __init__(self, source_datapipe: IterDataPipe, sharding_group_filter=None):
+        self.source_datapipe = source_datapipe
+        self.sharding_group_filter = sharding_group_filter
+        self.groups: Dict[int, Tuple[int, int]] = {}
+        self.num_of_instances = 1
+        self.instance_id = 0
+        self._update_num_of_instances()
+
+    def apply_sharding(self, num_of_instances, instance_id, sharding_group=SHARDING_PRIORITIES.DEFAULT):
+        if instance_id >= num_of_instances:
+            raise ValueError(f"instance_id({instance_id}) should be smaller than num_of_instances({num_of_instances})")
+        if sharding_group == SHARDING_PRIORITIES.DEFAULT:
+            if len(self.groups) and SHARDING_PRIORITIES.DEFAULT not in self.groups:
+                raise Exception('ShardingFilter cannot mix DEFAULT and non DEFAULT groups')
+        else:
+            if SHARDING_PRIORITIES.DEFAULT in self.groups:
+                raise Exception('ShardingFilter cannot mix DEFAULT and non DEFAULT groups')
+        self.groups[sharding_group] = (num_of_instances, instance_id)
+        self._update_num_of_instances()
+
+    def _update_num_of_instances(self):
+        sorted_sharding_groups = []
+        for key in sorted(self.groups.keys()):
+            if self.sharding_group_filter is None or key == self.sharding_group_filter:
+                sorted_sharding_groups.append(self.groups[key])
+
+        sorted_sharding_groups.reverse()
+
+        self.num_of_instances = 1
+        self.instance_id = 0
+
+        for group_num_of_instances, group_instance_id in sorted_sharding_groups:
+            self.instance_id += self.num_of_instances * group_instance_id
+            self.num_of_instances *= group_num_of_instances
+
+    def __iter__(self):
+        for i, item in enumerate(self.source_datapipe):
+            if i % self.num_of_instances == self.instance_id:
+                yield item
+
+    def __len__(self):
+        if isinstance(self.source_datapipe, Sized):
+            return len(self.source_datapipe) // self.num_of_instances +\
+                (1 if (self.instance_id < len(self.source_datapipe) % self.num_of_instances) else 0)
+        raise TypeError("{} instance doesn't have valid length".format(type(self).__name__))
diff --git a/torch/utils/data/datapipes/iter/streamreader.py b/torch/utils/data/datapipes/iter/streamreader.py
index 4f113577494e..2a28fa596967 100644
--- a/torch/utils/data/datapipes/iter/streamreader.py
+++ b/torch/utils/data/datapipes/iter/streamreader.py
@@ -14,7 +14,7 @@ class StreamReaderIterDataPipe(IterDataPipe[Tuple[str, bytes]]):
     Args:
         datapipe: Iterable DataPipe provides label/URL and byte stream
         chunk: Number of bytes to be read from stream per iteration.
-            If ``None``, all bytes will be read util the EOF.
+            If ``None``, all bytes will be read until the EOF.
 
     Example:
         >>> # xdoctest: +SKIP
diff --git a/torch/utils/data/datapipes/iter/utils.py b/torch/utils/data/datapipes/iter/utils.py
index 415190f3e279..f7f25cbc71ae 100644
--- a/torch/utils/data/datapipes/iter/utils.py
+++ b/torch/utils/data/datapipes/iter/utils.py
@@ -44,8 +44,7 @@ def __iter__(self):
                     "The input iterable can not be deepcopied, "
                     "please be aware of in-place modification would affect source data."
                 )
-        for data in source_data:
-            yield data
+        yield from source_data
 
     def __len__(self):
         return len(self.iterable)
diff --git a/torch/utils/data/datapipes/utils/common.py b/torch/utils/data/datapipes/utils/common.py
index b739a6020af8..311392721e75 100644
--- a/torch/utils/data/datapipes/utils/common.py
+++ b/torch/utils/data/datapipes/utils/common.py
@@ -321,7 +321,7 @@ def close_streams(cls, v, depth=0):
             if isinstance(v, dict):
                 for kk, vv in v.items():
                     cls.close_streams(vv, depth=depth + 1)
-            elif isinstance(v, list) or isinstance(v, tuple):
+            elif isinstance(v, (list, tuple)):
                 for vv in v:
                     cls.close_streams(vv, depth=depth + 1)
 
@@ -356,15 +356,14 @@ def autoclose(self):
     def __dir__(self):
         attrs = list(self.__dict__.keys()) + list(StreamWrapper.__dict__.keys())
         attrs += dir(self.file_obj)
-        return list(set(list(attrs)))
+        return list(set(attrs))
 
     def __del__(self):
         if not self.closed:
             self.close()
 
     def __iter__(self):
-        for line in self.file_obj:
-            yield line
+        yield from self.file_obj
 
     def __next__(self):
         return next(self.file_obj)
diff --git a/torch/utils/data/datapipes/utils/decoder.py b/torch/utils/data/datapipes/utils/decoder.py
index fe3f4b8502d0..4da810c32766 100644
--- a/torch/utils/data/datapipes/utils/decoder.py
+++ b/torch/utils/data/datapipes/utils/decoder.py
@@ -287,7 +287,7 @@ def add_handler(self, *handler):
     @staticmethod
     def _is_stream_handle(data):
         obj_to_check = data.file_obj if isinstance(data, StreamWrapper) else data
-        return isinstance(obj_to_check, io.BufferedIOBase) or isinstance(obj_to_check, io.RawIOBase)
+        return isinstance(obj_to_check, (io.BufferedIOBase, io.RawIOBase))
 
     def decode1(self, key, data):
         if not data:
diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py
index 828639432bcd..299e1c9fdf5e 100644
--- a/torch/utils/data/dataset.py
+++ b/torch/utils/data/dataset.py
@@ -220,7 +220,7 @@ def cumsum(sequence):
         return r
 
     def __init__(self, datasets: Iterable[Dataset]) -> None:
-        super(ConcatDataset, self).__init__()
+        super().__init__()
         self.datasets = list(datasets)
         assert len(self.datasets) > 0, 'datasets should not be an empty iterable'  # type: ignore[arg-type]
         for d in self.datasets:
@@ -260,7 +260,7 @@ class ChainDataset(IterableDataset):
         datasets (iterable of IterableDataset): datasets to be chained together
     """
     def __init__(self, datasets: Iterable[Dataset]) -> None:
-        super(ChainDataset, self).__init__()
+        super().__init__()
         self.datasets = datasets
 
     def __iter__(self):
diff --git a/torch/utils/data/graph_settings.py b/torch/utils/data/graph_settings.py
index f3e1a18f3f61..37cbdc901739 100644
--- a/torch/utils/data/graph_settings.py
+++ b/torch/utils/data/graph_settings.py
@@ -5,8 +5,11 @@
 
 import torch
 
+from torch.utils.data.datapipes.iter.sharding import (
+    _ShardingIterDataPipe,
+    SHARDING_PRIORITIES,
+)
 from torch.utils.data.graph import DataPipe, DataPipeGraph, traverse_dps
-from torch.utils.data.datapipes.iter.grouping import SHARDING_PRIORITIES
 
 __all__ = [
     "apply_random_seed",
@@ -45,13 +48,12 @@ def apply_sharding(datapipe: DataPipe,
     def _helper(graph, prev_applied=None):
         for _, (dp, sub_graph) in graph.items():
             applied = None
-            if hasattr(dp, 'is_shardable') and dp.is_shardable():
-                if hasattr(dp, 'apply_sharding'):
-                    if prev_applied is not None:
-                        raise RuntimeError("Sharding twice on a single pipeline is likely unintended and will cause data loss. "
-                                           f"Sharding already applied to {prev_applied} while trying to apply to {dp}")
-                    dp.apply_sharding(num_of_instances, instance_id, sharding_group=sharding_group)
-                    applied = dp
+            if isinstance(dp, _ShardingIterDataPipe):
+                if prev_applied is not None:
+                    raise RuntimeError("Sharding twice on a single pipeline is likely unintended and will cause data loss. "
+                                       f"Sharding already applied to {prev_applied} while trying to apply to {dp}")
+                dp.apply_sharding(num_of_instances, instance_id, sharding_group=sharding_group)
+                applied = dp
             if applied is None:
                 applied = prev_applied
             _helper(sub_graph, applied)
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index e2a1ea8bc389..45e3cb69af8a 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -563,6 +563,8 @@
         ("curandStateXORWOW_t", ("hiprandStateXORWOW_t", CONV_TYPE, API_RAND)),
         ("curandState_t", ("hiprandState_t", CONV_TYPE, API_RAND)),
         ("curandState", ("hiprandState_t", CONV_TYPE, API_RAND)),
+        ("cudaGraph_t", ("hipGraph_t", CONV_TYPE, API_RAND)),
+        ("cudaGraphExec_t", ("hipGraphExec_t", CONV_TYPE, API_RAND)),
     ]
 )
 
@@ -4131,6 +4133,22 @@
         ("cudaCpuDeviceId", ("hipCpuDeviceId", CONV_TYPE, API_RUNTIME)),
         ("cudaStreamDefault", ("hipStreamDefault", CONV_TYPE, API_RUNTIME)),
         ("cudaStreamNonBlocking", ("hipStreamNonBlocking", CONV_TYPE, API_RUNTIME)),
+        ("cudaStreamGetCaptureInfo", ("hipStreamGetCaptureInfo", CONV_TYPE, API_RUNTIME)),
+        ("cudaStreamCaptureStatus", ("hipStreamCaptureStatus", CONV_TYPE, API_RUNTIME)),
+        ("cudaStreamCaptureStatusActive", ("hipStreamCaptureStatusActive", CONV_TYPE, API_RUNTIME)),
+        ("cudaStreamCaptureMode", ("hipStreamCaptureMode", CONV_TYPE, API_RUNTIME)),
+        ("cudaStreamCaptureModeGlobal", ("hipStreamCaptureModeGlobal", CONV_TYPE, API_RUNTIME)),
+        ("cudaStreamCaptureModeRelaxed", ("hipStreamCaptureModeRelaxed", CONV_TYPE, API_RUNTIME)),
+        ("cudaStreamBeginCapture", ("hipStreamBeginCapture", CONV_TYPE, API_RUNTIME)),
+        ("cudaStreamEndCapture", ("hipStreamEndCapture", CONV_TYPE, API_RUNTIME)),
+        ("cudaGraphInstantiate", ("hipGraphInstantiate", CONV_TYPE, API_RUNTIME)),
+        ("cudaGraphDestroy", ("hipGraphDestroy", CONV_TYPE, API_RUNTIME)),
+        ("cudaGraphExecDestroy", ("hipGraphExecDestroy", CONV_TYPE, API_RUNTIME)),
+        ("cudaGraphLaunch", ("hipGraphLaunch", CONV_TYPE, API_RUNTIME)),
+        ("cudaGraphGetNodes", ("hipGraphGetNodes", CONV_TYPE, API_RUNTIME)),
+        ("cudaGraphDebugDotPrint", ("hipGraphDebugDotPrint", CONV_TYPE, API_RUNTIME)),
+        ("cudaThreadExchangeStreamCaptureMode", ("hipThreadExchangeStreamCaptureMode", CONV_TYPE, API_RUNTIME)),
+        ("cudaStreamIsCapturing", ("hipStreamIsCapturing", CONV_TYPE, API_RUNTIME)),
         ("cudaDeviceSynchronize", ("hipDeviceSynchronize", CONV_DEVICE, API_RUNTIME)),
         ("cudaDeviceReset", ("hipDeviceReset", CONV_DEVICE, API_RUNTIME)),
         ("cudaSetDevice", ("hipSetDevice", CONV_DEVICE, API_RUNTIME)),
diff --git a/torch/utils/hipify/hipify_python.py b/torch/utils/hipify/hipify_python.py
index a1432ad041cc..164cd53dafab 100755
--- a/torch/utils/hipify/hipify_python.py
+++ b/torch/utils/hipify/hipify_python.py
@@ -58,7 +58,7 @@ class InputError(Exception):
     # Exception raised for errors in the input.
 
     def __init__(self, message):
-        super(InputError, self).__init__(message)
+        super().__init__(message)
         self.message = message
 
     def __str__(self):
@@ -156,6 +156,7 @@ def matched_files_iter(
                 dirs.remove("build")
             if "third_party" in dirs:
                 dirs.remove("third_party")
+                dirs.append("third_party/nvfuser")
         for filename in filenames:
             filepath = os.path.join(abs_dirpath, filename)
             rel_filepath = os.path.join(rel_dirpath, filename)
@@ -595,6 +596,8 @@ def is_out_of_place(rel_filepath):
     assert not os.path.isabs(rel_filepath)
     if rel_filepath.startswith("torch/"):
         return False
+    if rel_filepath.startswith("third_party/nvfuser/"):
+        return False
     if rel_filepath.startswith("tools/autograd/templates/"):
         return False
     return True
@@ -609,6 +612,8 @@ def is_pytorch_file(rel_filepath):
         return True
     if rel_filepath.startswith("torch/"):
         return True
+    if rel_filepath.startswith("third_party/nvfuser/"):
+        return True
     if rel_filepath.startswith("tools/autograd/templates/"):
         return True
     return False
diff --git a/torch/utils/hooks.py b/torch/utils/hooks.py
index be9a4c1f0a65..6d5a97d4288e 100644
--- a/torch/utils/hooks.py
+++ b/torch/utils/hooks.py
@@ -6,7 +6,7 @@
 
 __all__ = ["RemovableHandle", "unserializable_hook", "warn_if_has_hooks", "BackwardHook"]
 
-class RemovableHandle(object):
+class RemovableHandle:
     r"""
     A handle which provides the capability to remove a hook.
 
@@ -89,7 +89,7 @@ def warn_if_has_hooks(tensor):
                               "decorate the function with @torch.utils.hooks.unserializable_hook "
                               "to suppress this warning".format(repr(hook)))
 
-class BackwardHook(object):
+class BackwardHook:
     """
     A wrapper class to implement nn.Module backward hooks.
     It handles:
diff --git a/torch/utils/mkldnn.py b/torch/utils/mkldnn.py
index 6c105d0b123c..2f52abe22998 100644
--- a/torch/utils/mkldnn.py
+++ b/torch/utils/mkldnn.py
@@ -3,7 +3,7 @@
 
 class MkldnnLinear(torch.jit.ScriptModule):
     def __init__(self, dense_module, dtype):
-        super(MkldnnLinear, self).__init__()
+        super().__init__()
         self.register_buffer('weight', dense_module.weight.to_mkldnn(dtype))
         if dense_module.bias is not None:
             # Bias can be fp32 or bf16 for OneDNN bf16 path, but for good accuracy,
@@ -38,7 +38,7 @@ class _MkldnnConvNd(torch.jit.ScriptModule):
     __constants__ = ['stride', 'padding', 'dilation', 'groups']
 
     def __init__(self, dense_module):
-        super(_MkldnnConvNd, self).__init__()
+        super().__init__()
 
         self.stride = dense_module.stride
         self.padding = dense_module.padding
@@ -73,7 +73,7 @@ def forward(self, x):
 
 class MkldnnConv1d(_MkldnnConvNd):
     def __init__(self, dense_module, dtype):
-        super(MkldnnConv1d, self).__init__(dense_module)
+        super().__init__(dense_module)
 
         self.register_buffer('weight', dense_module.weight.to_mkldnn(dtype))
 
@@ -86,7 +86,7 @@ def __setstate__(self, state):
 
 class MkldnnConv2d(_MkldnnConvNd):
     def __init__(self, dense_module, dtype):
-        super(MkldnnConv2d, self).__init__(dense_module)
+        super().__init__(dense_module)
 
         self.register_buffer('weight', torch._C._nn.mkldnn_reorder_conv2d_weight(
             dense_module.weight.to_mkldnn(dtype),
@@ -108,7 +108,7 @@ def __setstate__(self, state):
 
 class MkldnnConv3d(_MkldnnConvNd):
     def __init__(self, dense_module, dtype):
-        super(MkldnnConv3d, self).__init__(dense_module)
+        super().__init__(dense_module)
 
         self.register_buffer('weight', torch._C._nn.mkldnn_reorder_conv3d_weight(
             dense_module.weight.to_mkldnn(dtype),
@@ -133,7 +133,7 @@ class MkldnnBatchNorm(torch.jit.ScriptModule):
     __constants__ = ['exponential_average_factor', 'eps']
 
     def __init__(self, dense_module):
-        super(MkldnnBatchNorm, self).__init__()
+        super().__init__()
 
         assert(not dense_module.training)
         assert(dense_module.track_running_stats)
@@ -182,7 +182,7 @@ def forward(self, x):
 
 class MkldnnPrelu(torch.jit.ScriptModule):
     def __init__(self, dense_module, dtype):
-        super(MkldnnPrelu, self).__init__()
+        super().__init__()
         self.register_buffer('weight', dense_module.weight.to_mkldnn(dtype))
 
     @torch.jit.script_method
@@ -214,7 +214,7 @@ def m_fn(m, d):
             return MkldnnConv2d(m, d)
         elif isinstance(m, torch.nn.Conv3d):
             return MkldnnConv3d(m, d)
-        elif isinstance(m, torch.nn.BatchNorm2d) or isinstance(m, torch.nn.BatchNorm3d):
+        elif isinstance(m, (torch.nn.BatchNorm2d, torch.nn.BatchNorm3d)):
             # For batchnorm bf16 path, OneDNN requires weight and bias need fp32 dtype.
             # so it doesn't need dtype argument.
             return MkldnnBatchNorm(m)
diff --git a/torch/utils/model_dump/__init__.py b/torch/utils/model_dump/__init__.py
index bbb456a6f14b..c80f1dbfe7ea 100644
--- a/torch/utils/model_dump/__init__.py
+++ b/torch/utils/model_dump/__init__.py
@@ -360,9 +360,6 @@ def get_inline_skeleton():
     It can load model_info.json over HTTP, or be passed to burn_in_info.
     """
 
-    if sys.version_info < (3, 7):
-        raise Exception("get_inline_skeleton requires Python 3.7")
-
     import importlib.resources
 
     skeleton = importlib.resources.read_text(__package__, "skeleton.html")
diff --git a/torch/utils/show_pickle.py b/torch/utils/show_pickle.py
index 6ccda4cdde2f..e83bed48e666 100644
--- a/torch/utils/show_pickle.py
+++ b/torch/utils/show_pickle.py
@@ -9,7 +9,7 @@
 
 __all__ = ["FakeObject", "FakeClass", "DumpUnpickler", "main"]
 
-class FakeObject(object):
+class FakeObject:
     def __init__(self, module, name, args):
         self.module = module
         self.name = name
@@ -43,7 +43,7 @@ def pp_format(printer, obj, stream, indent, allowance, context, level):
         raise Exception("Need to implement")
 
 
-class FakeClass(object):
+class FakeClass:
     def __init__(self, module, name):
         self.module = module
         self.name = name
diff --git a/torch/utils/tensorboard/__init__.py b/torch/utils/tensorboard/__init__.py
index b6c437e90a4f..39ac89116569 100644
--- a/torch/utils/tensorboard/__init__.py
+++ b/torch/utils/tensorboard/__init__.py
@@ -1,12 +1,12 @@
 import tensorboard
-from distutils.version import LooseVersion
+from packaging.version import Version
 
-if not hasattr(tensorboard, "__version__") or LooseVersion(
+if not hasattr(tensorboard, "__version__") or Version(
     tensorboard.__version__
-) < LooseVersion("1.15"):
+) < Version("1.15"):
     raise ImportError("TensorBoard logging requires TensorBoard version 1.15 or above")
 
-del LooseVersion
+del Version
 del tensorboard
 
 from .writer import FileWriter, SummaryWriter  # noqa: F401
diff --git a/torch/utils/tensorboard/_pytorch_graph.py b/torch/utils/tensorboard/_pytorch_graph.py
index c35cf88213be..f03812b603e1 100644
--- a/torch/utils/tensorboard/_pytorch_graph.py
+++ b/torch/utils/tensorboard/_pytorch_graph.py
@@ -32,7 +32,7 @@
 CLASSTYPE_KIND = "ClassType"
 
 
-class NodeBase(object):
+class NodeBase:
     def __init__(
         self,
         debugName=None,
@@ -64,7 +64,7 @@ def __repr__(self):
 
 class NodePy(NodeBase):
     def __init__(self, node_cpp, valid_methods):
-        super(NodePy, self).__init__(node_cpp)
+        super().__init__(node_cpp)
         valid_methods = valid_methods[:]
         self.inputs = []
 
@@ -89,7 +89,7 @@ def __init__(self, node_cpp, valid_methods):
 
 class NodePyIO(NodePy):
     def __init__(self, node_cpp, input_or_output=None):
-        super(NodePyIO, self).__init__(node_cpp, methods_IO)
+        super().__init__(node_cpp, methods_IO)
         try:
             tensor_size = node_cpp.type().sizes()
         except RuntimeError:
@@ -109,7 +109,7 @@ def __init__(self, node_cpp, input_or_output=None):
 
 class NodePyOP(NodePy):
     def __init__(self, node_cpp):
-        super(NodePyOP, self).__init__(node_cpp, methods_OP)
+        super().__init__(node_cpp, methods_OP)
         # Replace single quote which causes strange behavior in TensorBoard
         # TODO: See if we can remove this in the future
         self.attributes = str(
@@ -118,7 +118,7 @@ def __init__(self, node_cpp):
         self.kind = node_cpp.kind()
 
 
-class GraphPy(object):
+class GraphPy:
     """Helper class to convert torch.nn.Module to GraphDef proto and visualization
     with TensorBoard.
 
diff --git a/torch/utils/tensorboard/summary.py b/torch/utils/tensorboard/summary.py
index 1ddf603d4f74..08e42e01c784 100644
--- a/torch/utils/tensorboard/summary.py
+++ b/torch/utils/tensorboard/summary.py
@@ -6,8 +6,6 @@
 import numpy as np
 from google.protobuf import struct_pb2
 
-# pylint: disable=unused-import
-from six.moves import range
 from tensorboard.compat.proto.summary_pb2 import HistogramProto
 from tensorboard.compat.proto.summary_pb2 import Summary
 from tensorboard.compat.proto.summary_pb2 import SummaryMetadata
@@ -94,7 +92,6 @@ def hparams(hparam_dict=None, metric_dict=None, hparam_domain_discrete=None):
         SessionEndInfo
     """
     import torch
-    from six import string_types
     from tensorboard.plugins.hparams.api_pb2 import (
         Experiment,
         HParamInfo,
@@ -157,7 +154,7 @@ def hparams(hparam_dict=None, metric_dict=None, hparam_domain_discrete=None):
     for k, v in hparam_dict.items():
         if v is None:
             continue
-        if isinstance(v, int) or isinstance(v, float):
+        if isinstance(v, (int, float)):
             ssi.hparams[k].number_value = v
 
             if k in hparam_domain_discrete:
@@ -179,7 +176,7 @@ def hparams(hparam_dict=None, metric_dict=None, hparam_domain_discrete=None):
             )
             continue
 
-        if isinstance(v, string_types):
+        if isinstance(v, str):
             ssi.hparams[k].string_value = v
 
             if k in hparam_domain_discrete:
@@ -383,7 +380,8 @@ def make_histogram(values, bins, max_bins=None):
         limits = new_limits
 
     # Find the first and the last bin defining the support of the histogram:
-    cum_counts = np.cumsum(np.greater(counts, 0, dtype=np.int32))
+
+    cum_counts = np.cumsum(np.greater(counts, 0))
     start, end = np.searchsorted(cum_counts, [0, cum_counts[-1] - 1], side="right")
     start = int(start)
     end = int(end) + 1
diff --git a/torch/utils/tensorboard/writer.py b/torch/utils/tensorboard/writer.py
index 83bd0a25d103..893ddd7082bd 100644
--- a/torch/utils/tensorboard/writer.py
+++ b/torch/utils/tensorboard/writer.py
@@ -41,7 +41,7 @@
 
 __all__ = ['FileWriter', 'SummaryWriter']
 
-class FileWriter(object):
+class FileWriter:
     """Writes protocol buffers to event files to be consumed by TensorBoard.
 
     The `FileWriter` class provides a mechanism to create an event file in a
@@ -164,7 +164,7 @@ def reopen(self):
         self.event_writer.reopen()
 
 
-class SummaryWriter(object):
+class SummaryWriter:
     """Writes entries directly to event files in the log_dir to be
     consumed by TensorBoard.
 
diff --git a/torch/utils/throughput_benchmark.py b/torch/utils/throughput_benchmark.py
index 1dae4b937783..8b2fd1a76ca8 100644
--- a/torch/utils/throughput_benchmark.py
+++ b/torch/utils/throughput_benchmark.py
@@ -24,7 +24,7 @@ def format_time(time_us=None, time_ms=None, time_s=None):
     return '{:.3f}us'.format(time_us)
 
 
-class ExecutionStats(object):
+class ExecutionStats:
     def __init__(self, c_stats, benchmark_config):
         self._c_stats = c_stats
         self.benchmark_config = benchmark_config
@@ -58,7 +58,7 @@ def __str__(self):
         ])
 
 
-class ThroughputBenchmark(object):
+class ThroughputBenchmark:
     '''
     This class is a wrapper around a c++ component throughput_benchmark::ThroughputBenchmark
     responsible for executing a PyTorch module (nn.Module or ScriptModule)
diff --git a/torchgen/api/autograd.py b/torchgen/api/autograd.py
index bb3998f39efc..5ff9e1ad7a55 100644
--- a/torchgen/api/autograd.py
+++ b/torchgen/api/autograd.py
@@ -1,12 +1,14 @@
-import copy
 import re
 from dataclasses import dataclass
-from typing import Dict, List, Match, Optional, Sequence, Set, Tuple
+from typing import cast, Dict, List, Match, Optional, Sequence, Set, Tuple
+
+from torchgen import local
 
 from torchgen.api import cpp
 from torchgen.api.types import BaseCType, Binding, NamedCType, tensorListT
 from torchgen.model import (
     FunctionSchema,
+    ListType,
     NativeFunction,
     NativeFunctionsViewGroup,
     SchemaKind,
@@ -323,6 +325,31 @@ def match_differentiability_info(
         if schema.kind() != SchemaKind.functional
     }
 
+    def is_foreach_func(f: NativeFunction) -> bool:
+        base_op_name = f.func.name.name
+        return base_op_name.base.startswith("_foreach_") and not base_op_name.inplace
+
+    def is_reference_for_foreach(
+        f: NativeFunction,
+        function_schema: FunctionSchema,
+    ) -> bool:
+        return (
+            f.func.name.name.base.split("_foreach_")[-1]
+            == function_schema.name.name.base
+            and not function_schema.name.name.inplace
+            and (
+                True
+                if len(f.func.arguments.post_self_positional) == 0
+                else all(
+                    ref_arg.type in (arg.type, getattr(arg.type, "elem", None))
+                    for arg, ref_arg in zip(
+                        f.func.arguments.flat_non_out,
+                        function_schema.arguments.flat_non_out,
+                    )
+                )
+            )
+        )
+
     def find_info(
         f: NativeFunction,
     ) -> Tuple[Optional[Dict[str, DifferentiabilityInfo]], bool]:
@@ -358,93 +385,137 @@ def find_info(
  this is not currently supported (we'd need to fix up the formula in the codegen)."""
             return info_dict, False
 
-        # (4) Generate derivative information of unary foreach functions if none is defined in `derivatives.yaml`
+        # (4) Generate derivative information of foreach functions if none is defined in `derivatives.yaml`
         base_op_name = f.func.name.name
-        if (
-            base_op_name.base.startswith("_foreach")
-            and not base_op_name.inplace
-            and len(f.func.arguments.post_self_positional) == 0
-        ):
-            ref_native_op_name = base_op_name.base.split("_foreach_")[-1]
+        if is_foreach_func(f):
             for function_schema in functional_info_by_signature:
-                if (
-                    function_schema.name.name.base == ref_native_op_name
-                    and not function_schema.name.name.inplace
+                if not is_reference_for_foreach(f, function_schema):
+                    continue
+                if function_schema in differentiability_infos:
+                    ref_diff_info = differentiability_infos[function_schema]["Default"]
+                elif (
+                    function_schema.signature(strip_default=True)
+                    in functional_info_by_signature
                 ):
-                    all_saved_inputs = []
-                    all_saved_outputs = []
-                    diff_info_dict = copy.deepcopy(
-                        differentiability_infos[function_schema]
+                    ref_diff_info = functional_info_by_signature[
+                        function_schema.signature(strip_default=True)
+                    ]["Default"]
+                else:
+                    raise RuntimeError(
+                        f"Reference `DifferentiabilityInfo` for {f.func} not found: query: {function_schema}"
                     )
-                    diff_info = diff_info_dict["Default"]
-                    modified_derivative_formulas = []
-                    for derivative in diff_info.derivatives:
-                        saved_inputs = []
-                        saved_outputs = []
-                        modified_formula = (
-                            derivative.formula.replace("grad", "grads[i]")
-                            .replace("self", "self[i]")
-                            .replace("result", "result[i]")
-                        )
-                        if "self" in modified_formula:
+
+                map_refarg2foreacharg = {}
+                map_name2arg = {}
+                for arg, ref_arg in zip(
+                    f.func.arguments.flat_non_out,
+                    function_schema.arguments.flat_non_out,
+                ):
+                    map_refarg2foreacharg[ref_arg.name] = arg.name
+                    map_name2arg[arg.name] = arg
+
+                all_saved_inputs: List[SavedAttribute] = []
+                all_saved_outputs: List[SavedAttribute] = []
+                modified_derivative_formulas: List[Derivative] = []
+                all_var_names: List[str] = []
+                for derivative in ref_diff_info.derivatives:
+                    # note(crcrpar): Assumption: `grads` and `result` always are a sequence of Tensors.
+                    modified_formula = derivative.formula.replace(
+                        "grad", "grads[i]"
+                    ).replace("result", "result[i]")
+
+                    saved_inputs, saved_outputs = [], []
+                    with local.parametrize(
+                        use_const_ref_for_mutable_tensors=f.use_const_ref_for_mutable_tensors,
+                        use_ilistref_for_tensor_lists=f.part_of_structured_group,
+                    ):
+                        for ref_input in derivative.saved_inputs:
+                            ref_input_jit_name = ref_input.expr.split(".")[0]
+                            mapped_name = map_refarg2foreacharg[ref_input_jit_name]
+                            if isinstance(map_name2arg[mapped_name].type, ListType):
+                                mapped_expr = mapped_name + "[i]"
+                            else:
+                                mapped_expr = mapped_name
+                            new_expr = ref_input.expr.replace(
+                                ref_input_jit_name, mapped_expr
+                            )
+                            modified_formula = modified_formula.replace(
+                                cast(str, ref_input.nctype.name), new_expr
+                            )
+
+                            nctype = cpp.argument_type(
+                                map_name2arg[mapped_name], binds=mapped_name
+                            )
+                            canonical_nctype = NamedCType(
+                                nctype.name, nctype.type.remove_const_ref()
+                            )
                             saved_inputs.append(
                                 SavedAttribute(
-                                    nctype=NamedCType(
-                                        name="self", type=BaseCType(tensorListT)
-                                    ),
-                                    expr="self",
+                                    nctype=canonical_nctype, expr=mapped_name
                                 )
                             )
-                            all_saved_inputs.append(saved_inputs[-1])
-                        if "result" in modified_formula:
-                            saved_outputs.append(
-                                SavedAttribute(
-                                    nctype=NamedCType(
-                                        name="result", type=BaseCType(tensorListT)
-                                    ),
-                                    expr="result",
+                        for ref_output in derivative.saved_outputs:
+                            if ref_output.nctype.name == "result":
+                                saved_outputs.append(
+                                    SavedAttribute(
+                                        nctype=NamedCType(
+                                            name="result", type=BaseCType(tensorListT)
+                                        ),
+                                        expr="result",
+                                    )
                                 )
-                            )
-                            all_saved_outputs.append(saved_outputs[-1])
-                        modified_derivative = Derivative(
-                            formula=modified_formula,
-                            original_formula=derivative.original_formula,
-                            var_names=("self",),
-                            saved_inputs=tuple(saved_inputs),
-                            saved_outputs=tuple(saved_outputs),
-                            named_gradients=set(),
-                        )
-                        modified_derivative_formulas.append(modified_derivative)
-                    assert f.func.arguments.self_arg is not None
-                    diff_info = DifferentiabilityInfo(
-                        name=base_op_name.base,
-                        func=f,
-                        op=f"Foreach{diff_info.op}",
-                        derivatives=modified_derivative_formulas,
-                        forward_derivatives=[],
-                        all_saved_inputs=tuple(set(all_saved_inputs)),
-                        all_saved_outputs=tuple(set(all_saved_outputs)),
-                        available_named_gradients=(),
-                        used_named_gradients=set(),
-                        args_with_derivatives=[
-                            Binding(
-                                name="self",
-                                nctype=NamedCType(
-                                    name="self", type=BaseCType(tensorListT)
-                                ),
-                                argument=f.func.arguments.self_arg.argument,
-                                default=None,
-                            )
-                        ],
-                        non_differentiable_arg_names=[],
-                        output_differentiability=None,
-                        output_differentiability_conditions=None,
+                            else:
+                                raise RuntimeError(
+                                    f"Counterpart of {ref_output} not found"
+                                )
+                    var_names = [
+                        map_refarg2foreacharg[var] for var in derivative.var_names
+                    ]
+                    all_var_names.extend(var_names)
+                    all_saved_inputs.extend(saved_inputs)
+                    all_saved_outputs.extend(saved_outputs)
+                    modified_derivative = Derivative(
+                        formula=modified_formula,
+                        original_formula=derivative.formula,
+                        var_names=tuple(var_names),
+                        saved_inputs=tuple(saved_inputs),
+                        saved_outputs=tuple(saved_outputs),
+                        named_gradients=set(),
                     )
-                    diff_info_dict["Default"] = diff_info
-                    if f.func not in differentiability_infos:
-                        differentiability_infos[f.func] = diff_info_dict
-                        functional_info_by_signature[f.func] = diff_info_dict
-                    return diff_info_dict, True
+                    modified_derivative_formulas.append(modified_derivative)
+                with local.parametrize(
+                    use_const_ref_for_mutable_tensors=f.use_const_ref_for_mutable_tensors,
+                    use_ilistref_for_tensor_lists=f.part_of_structured_group,
+                ):
+                    args_with_derivatives = [
+                        Binding(
+                            name=var,
+                            nctype=cpp.argument_type(map_name2arg[var], binds=var),
+                            argument=map_name2arg[var],
+                            default=None,
+                        )
+                        for var in all_var_names
+                    ]
+                diff_info = DifferentiabilityInfo(
+                    name=base_op_name.base,
+                    func=f,
+                    op=f"Foreach{ref_diff_info.op}{f.func.name.overload_name}",
+                    derivatives=modified_derivative_formulas,
+                    forward_derivatives=[],
+                    all_saved_inputs=tuple(set(all_saved_inputs)),
+                    all_saved_outputs=tuple(set(all_saved_outputs)),
+                    available_named_gradients=(),
+                    used_named_gradients=set(),
+                    args_with_derivatives=args_with_derivatives,
+                    non_differentiable_arg_names=[],
+                    output_differentiability=None,
+                    output_differentiability_conditions=None,
+                )
+                diff_info_dict = {"Default": diff_info}
+                if f.func not in differentiability_infos:
+                    differentiability_infos[f.func] = diff_info_dict
+                    functional_info_by_signature[f.func] = diff_info_dict
+                return diff_info_dict, True
 
         return None, False
 
diff --git a/torchgen/api/python.py b/torchgen/api/python.py
index bfb7a1435bee..8f1ecf9e9dab 100644
--- a/torchgen/api/python.py
+++ b/torchgen/api/python.py
@@ -756,9 +756,9 @@ def signature_from_schema(
     args.extend(func.arguments.post_tensor_options_kwarg_only)
     args.extend(func.arguments.out)
 
-    input_arg_set = set(a.name for a in func.arguments.flat_positional)
-    kwarg_only_set = set(a.name for a in func.arguments.flat_kwarg_only)
-    out_arg_set = set(a.name for a in func.arguments.out)
+    input_arg_set = {a.name for a in func.arguments.flat_positional}
+    kwarg_only_set = {a.name for a in func.arguments.flat_kwarg_only}
+    out_arg_set = {a.name for a in func.arguments.out}
 
     input_args = tuple(map(argument, filter(lambda a: a.name in input_arg_set, args)))
     input_kwargs = tuple(
@@ -1072,7 +1072,7 @@ def dispatch_lambda_args(
         method=False,
         cpp_no_default_args=f.cpp_no_default_args,
     )
-    out_args: Set[str] = set(a.name for a in schema.arguments.out)
+    out_args: Set[str] = {a.name for a in schema.arguments.out}
 
     # Convert from cpp argument to lambda argument
     def dispatch_lambda_arg(cpp_arg: Binding) -> DispatchLambdaArgument:
@@ -1109,6 +1109,7 @@ def dispatch_lambda_arg(cpp_arg: Binding) -> DispatchLambdaArgument:
     "::std::tuple<at::Tensor,at::Tensor,at::Tensor>",
     "::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor>",
     "::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor>",
+    "::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor>",
     "::std::tuple<at::Tensor,at::Tensor,at::Tensor,int64_t>",
     "::std::tuple<at::Tensor,at::Tensor,double,int64_t>",
     "::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,int64_t>",
@@ -1116,6 +1117,8 @@ def dispatch_lambda_arg(cpp_arg: Binding) -> DispatchLambdaArgument:
     "::std::tuple<double,int64_t>",
     "::std::tuple<at::Tensor,::std::vector<at::Tensor>>",
     "::std::vector<at::Tensor>",
+    # Needed for flash attention forw/backward
+    "::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,int64_t,int64_t,int64_t,int64_t,at::Tensor>",
     "at::Scalar",
     "bool",
     "int64_t",
diff --git a/torchgen/dest/lazy_ts_lowering.py b/torchgen/dest/lazy_ts_lowering.py
index bb1d69ee393a..70161216d8e7 100644
--- a/torchgen/dest/lazy_ts_lowering.py
+++ b/torchgen/dest/lazy_ts_lowering.py
@@ -1,4 +1,4 @@
-from torchgen.api.lazy import LazyIrSchema
+from torchgen.api.lazy import LazyArgument, LazyIrSchema
 from torchgen.api.types import OptionalCType
 
 
@@ -6,14 +6,15 @@ def ts_lowering_body(schema: LazyIrSchema) -> str:
     # for now, we just want one IR class decl and soon after also the method defs
     # and we use the functional version not out/inplace.
     emplace_arguments = []
+
+    def get_value(arg: LazyArgument) -> str:
+        if isinstance(arg.lazy_type, OptionalCType):
+            return f"has_{arg.name} ? loctx->GetOutputOp(operand(i++)) : nullptr"
+        return "loctx->GetOutputOp(operand(i++))"
+
     for arg in schema.positional_args:
         if arg.is_lazy_value:
-            if isinstance(arg.lazy_type, OptionalCType):
-                emplace_arguments.append(
-                    f"has_{arg.name} ? loctx->GetOutputOp(operand(i++)) : nullptr"
-                )
-                continue
-            emplace_arguments.append("loctx->GetOutputOp(operand(i++))")
+            emplace_arguments.append(get_value(arg))
             continue
         emplace_arguments.append(f'"{arg.name}", {arg.name}')
 
@@ -21,8 +22,7 @@ def ts_lowering_body(schema: LazyIrSchema) -> str:
         [f"arguments.emplace_back({a});" for a in emplace_arguments]
     )
     emplace_kwarg_values = [
-        f'"{arg.name}", loctx->GetOutputOp(operand(i++))'
-        for arg in schema.keyword_values
+        f'"{arg.name}", {get_value(arg)}' for arg in schema.keyword_values
     ]
     emplace_kwarg_scalars = [
         f'"{arg.name}", {arg.name}' for arg in schema.keyword_scalars
diff --git a/torchgen/dest/register_dispatch_key.py b/torchgen/dest/register_dispatch_key.py
index 871d227eba8f..8f28be67274e 100644
--- a/torchgen/dest/register_dispatch_key.py
+++ b/torchgen/dest/register_dispatch_key.py
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 from typing import List, Optional, Tuple, Union
 
-from typing_extensions import Literal
+from typing_extensions import Literal  # Python 3.8+
 
 import torchgen.api.cpp as cpp
 import torchgen.api.meta as meta
diff --git a/torchgen/executorch/api/types/signatures.py b/torchgen/executorch/api/types/signatures.py
index 10f2c9d36a5d..d79a4521644a 100644
--- a/torchgen/executorch/api/types/signatures.py
+++ b/torchgen/executorch/api/types/signatures.py
@@ -6,12 +6,15 @@
 from torchgen.api.types import Binding, CType
 from torchgen.model import FunctionSchema, NativeFunction
 
+from .types import contextArg
+
 
 @dataclass(frozen=True)
 class ExecutorchCppSignature:
     """
-    This signature is merely a CppSignature with Executorch types. The inline definition
-    of CppSignature is generated in Functions.h and it's used by unboxing functions.
+    This signature is merely a CppSignature with Executorch types (optionally contains
+    RuntimeContext as well). The inline definition of CppSignature is generated in Functions.h
+    and it's used by unboxing functions.
     """
 
     # The schema this signature is derived from
@@ -25,8 +28,8 @@ class ExecutorchCppSignature:
     # and need to avoid naming collisions.
     prefix: str = ""
 
-    def arguments(self) -> List[Binding]:
-        return et_cpp.arguments(
+    def arguments(self, *, include_context: bool = True) -> List[Binding]:
+        return ([contextArg] if include_context else []) + et_cpp.arguments(
             self.func.arguments,
             faithful=True,  # always faithful, out argument at the end
             method=False,  # method not supported
@@ -39,8 +42,10 @@ def name(self) -> str:
             faithful_name_for_out_overloads=True,
         )
 
-    def decl(self, name: Optional[str] = None) -> str:
-        args_str = ", ".join(a.decl() for a in self.arguments())
+    def decl(self, name: Optional[str] = None, *, include_context: bool = True) -> str:
+        args_str = ", ".join(
+            a.decl() for a in self.arguments(include_context=include_context)
+        )
         if name is None:
             name = self.name()
         return f"{self.returns_type().cpp_type()} {name}({args_str})"
diff --git a/torchgen/executorch/api/types/types.py b/torchgen/executorch/api/types/types.py
index d4217c0b9457..f6775ca61b65 100644
--- a/torchgen/executorch/api/types/types.py
+++ b/torchgen/executorch/api/types/types.py
@@ -1,7 +1,18 @@
 from dataclasses import dataclass
 from typing import Dict
 
-from torchgen.api.types import BaseCppType, boolT, CType, doubleT, longT
+from torchgen.api.types import (
+    BaseCppType,
+    BaseCType,
+    Binding,
+    boolT,
+    CType,
+    doubleT,
+    Expr,
+    longT,
+    MutRefCType,
+    NamedCType,
+)
 from torchgen.model import BaseTy
 
 halfT = BaseCppType("torch::executor", "Half")
@@ -14,6 +25,19 @@
 memoryFormatT = BaseCppType("torch::executor", "MemoryFormat")
 intArrayRefT = BaseCppType("torch::executor", "IntArrayRef")
 optionalT = BaseCppType("torch::executor", "optional")
+contextT = BaseCppType("torch::executor", "RuntimeContext")
+
+contextExpr = Expr(
+    expr="context",
+    type=NamedCType(name="context", type=MutRefCType(BaseCType(contextT))),
+)
+
+contextArg = Binding(
+    name="context",
+    nctype=contextExpr.type,
+    argument=None,  # type: ignore[arg-type]
+    default=None,
+)
 
 BaseTypeToCppMapping: Dict[BaseTy, BaseCppType] = {
     BaseTy.int: longT,
diff --git a/torchgen/gen.py b/torchgen/gen.py
index 4076e4293108..0df9e3e81fcc 100644
--- a/torchgen/gen.py
+++ b/torchgen/gen.py
@@ -19,7 +19,7 @@
 )
 
 import yaml
-from typing_extensions import Literal
+from typing_extensions import Literal  # Python 3.8+
 
 import torchgen.api.dispatcher as dispatcher
 import torchgen.api.meta as meta
@@ -549,7 +549,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]:
 # and (2) don't want to worry about method-only operators.
 @dataclass(frozen=True)
 class ComputeOperators:
-    target: Union[Literal[Target.DECLARATION], Literal[Target.DEFINITION]]
+    target: Literal[Target.DECLARATION, Target.DEFINITION]
     static_dispatch_backend_indices: List[BackendIndex]
 
     @method_with_native_function
@@ -694,7 +694,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]:
 # public C++ API, and the scaffolding to call into the dispatcher from these functions.
 @dataclass(frozen=True)
 class ComputeTensorMethod:
-    target: Union[Literal[Target.DECLARATION], Literal[Target.DEFINITION]]
+    target: Literal[Target.DECLARATION, Target.DEFINITION]
     static_dispatch_backend_indices: List[BackendIndex]
 
     @method_with_native_function
@@ -913,7 +913,7 @@ def needs_backend_select(f: NativeFunction, selector: SelectiveBuilder) -> bool:
 # be easily done automatically using templating.
 @dataclass(frozen=True)
 class ComputeBackendSelect:
-    target: Union[Literal[Target.DEFINITION], Literal[Target.REGISTRATION]]
+    target: Literal[Target.DEFINITION, Target.REGISTRATION]
 
     # Selector object to determine which operators to generate
     # registration code for.
@@ -1188,8 +1188,8 @@ def compute_declaration_yaml(f: NativeFunction) -> object:
 
     # These sets are used to conveniently test if an argument is a
     # kwarg-only or out argument
-    kwarg_only_set = set(a.name for a in f.func.arguments.flat_kwarg_only)
-    out_arg_set = set(a.name for a in f.func.arguments.out)
+    kwarg_only_set = {a.name for a in f.func.arguments.flat_kwarg_only}
+    out_arg_set = {a.name for a in f.func.arguments.out}
 
     sig_group = CppSignatureGroup.from_native_function(
         f, method=False, fallback_binding=False
@@ -2099,21 +2099,19 @@ def gen_aten_interned_strings() -> Dict[str, str]:
 
         # These are keywords in C++, so aren't valid symbol names
         # https://en.cppreference.com/w/cpp/language/operator_alternative
-        names -= set(
-            [
-                "and",
-                "and_eq",
-                "bitand",
-                "bitor",
-                "compl",
-                "not",
-                "not_eq",
-                "or",
-                "or_eq",
-                "xor",
-                "xor_eq",
-            ]
-        )
+        names -= {
+            "and",
+            "and_eq",
+            "bitand",
+            "bitor",
+            "compl",
+            "not",
+            "not_eq",
+            "or",
+            "or_eq",
+            "xor",
+            "xor_eq",
+        }
 
         return {
             "aten_symbols": " \\\n".join(
@@ -2611,7 +2609,11 @@ def main() -> None:
         help="generate separate headers per operator in ATen/ops",
     )
     parser.add_argument(
-        "-d", "--install_dir", help="output directory", default="build/aten/src/ATen"
+        "-d",
+        "--install-dir",
+        "--install_dir",
+        help="output directory",
+        default="build/aten/src/ATen",
     )
     parser.add_argument(
         "--rocm",
@@ -2623,10 +2625,11 @@ def main() -> None:
         action="store_true",
         help="Generate MPS registration code when set",
     )
-    # TODO: --op_registration_whitelist will be removed when all call-sites
+    # TODO: --op-registration-whitelist will be removed when all call-sites
     # for gen.py are moved over to using the operator YAML file for mobile
     # custom build.
     parser.add_argument(
+        "--op-registration-whitelist",
         "--op_registration_whitelist",
         nargs="*",
         help="filter op registrations by the whitelist (if set); "
@@ -2634,6 +2637,7 @@ def main() -> None:
         "e.g.: aten::empty aten::conv2d ...",
     )
     parser.add_argument(
+        "--op-selection-yaml-path",
         "--op_selection_yaml_path",
         help="Provide a path to the operator selection (for custom build) YAML "
         "that contains the information about the set of selected operators "
@@ -2642,26 +2646,30 @@ def main() -> None:
         "The operator names also contain the namespace prefix (e.g. aten::)",
     )
     parser.add_argument(
+        "--backend-whitelist",
         "--backend_whitelist",
         nargs="*",
         help="filter dispatch backend by the whitelist (if set), "
         "e.g.: CPU CUDA QuantizedCPU ...",
     )
     parser.add_argument(
+        "--static-dispatch-backend",
         "--static_dispatch_backend",
         nargs="*",
         help="generate static dispatch code for the specific backend (if set)",
     )
     parser.add_argument(
+        "--skip-dispatcher-op-registration",
         "--skip_dispatcher_op_registration",
         action="store_true",
         help="Avoid registering operators into the dispatcher.",
     )
     parser.add_argument(
+        "--force-schema-registration",
         "--force_schema_registration",
         action="store_true",
         help="force it to generate schema-only registrations for all ops, including"
-        "those that are not listed on --op_registration_whitelist",
+        "those that are not listed on --op-registration-whitelist",
     )
     parser.add_argument(
         "--generate",
diff --git a/torchgen/gen_backend_stubs.py b/torchgen/gen_backend_stubs.py
index b04b3bd83c29..a8dc476254cf 100644
--- a/torchgen/gen_backend_stubs.py
+++ b/torchgen/gen_backend_stubs.py
@@ -339,12 +339,16 @@ def main() -> None:
     parser = argparse.ArgumentParser(description="Generate backend stub files")
     parser.add_argument(
         "-s",
+        "--source-yaml",
         "--source_yaml",
         help="path to source yaml file containing operator external definitions",
     )
-    parser.add_argument("-o", "--output_dir", help="output directory")
-    parser.add_argument("--dry_run", type=bool, default=False, help="output directory")
+    parser.add_argument("-o", "--output-dir", "--output_dir", help="output directory")
     parser.add_argument(
+        "--dry-run", "--dry_run", type=bool, default=False, help="output directory"
+    )
+    parser.add_argument(
+        "--impl-path",
         "--impl_path",
         type=str,
         default=None,
@@ -373,29 +377,25 @@ def gen_dispatchkey_nativefunc_headers(
     # Convert to a set first to remove duplicate kernel names.
     # Backends are allowed to repeat kernel names; only generate the declaration once!
     # Sort for deterministic output.
-    backend_declarations = list(
-        sorted(
-            set(
-                concatMap(
-                    lambda f: dest.compute_native_function_declaration(
-                        f, backend_indices[backend_dispatch_key]
-                    ),
-                    grouped_native_functions,
-                )
+    backend_declarations = sorted(
+        set(
+            concatMap(
+                lambda f: dest.compute_native_function_declaration(
+                    f, backend_indices[backend_dispatch_key]
+                ),
+                grouped_native_functions,
             )
         )
     )
-    autograd_declarations = list(
-        sorted(
-            set(
-                concatMap(
-                    lambda f: []
-                    if autograd_dispatch_key is None
-                    else dest.compute_native_function_declaration(
-                        f, backend_indices[autograd_dispatch_key]
-                    ),
-                    grouped_native_functions,
-                )
+    autograd_declarations = sorted(
+        set(
+            concatMap(
+                lambda f: []
+                if autograd_dispatch_key is None
+                else dest.compute_native_function_declaration(
+                    f, backend_indices[autograd_dispatch_key]
+                ),
+                grouped_native_functions,
             )
         )
     )
diff --git a/torchgen/gen_executorch.py b/torchgen/gen_executorch.py
index 47a7fb89ee59..e10b07742dbb 100644
--- a/torchgen/gen_executorch.py
+++ b/torchgen/gen_executorch.py
@@ -17,7 +17,7 @@
     ComputeNativeFunctionStub,
     gen_custom_ops_registration,
 )
-from torchgen.executorch.api.types import ExecutorchCppSignature
+from torchgen.executorch.api.types import contextArg, ExecutorchCppSignature
 from torchgen.executorch.api.unboxing import Unboxing
 from torchgen.gen import (
     get_custom_build_selector,
@@ -29,10 +29,13 @@
 )
 from torchgen.model import (
     BackendIndex,
+    BackendMetadata,
     DispatchKey,
+    is_cuda_dispatch_key,
     Location,
     NativeFunction,
     NativeFunctionsGroup,
+    OperatorName,
     Variant,
 )
 from torchgen.selective_build.selector import SelectiveBuilder
@@ -93,6 +96,8 @@ class ComputeFunction:
 
     use_aten_lib: bool
 
+    is_custom_op: Callable[[NativeFunction], bool]
+
     @method_with_native_function
     def __call__(self, f: NativeFunction) -> Optional[str]:
         if not self.selector.is_root_operator(f"{f.namespace}::{f.func.name}"):
@@ -106,7 +111,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]:
             if self.use_aten_lib
             else ExecutorchCppSignature.from_native_function(f)
         )
-        if self.use_aten_lib and f.namespace == "aten":
+        if self.use_aten_lib and not self.is_custom_op(f):
             comma = ", "
 
             return f"""
@@ -144,14 +149,16 @@ def __call__(self, f: NativeFunction) -> str:
             ).most_faithful_signature()
             argument_type_gen = aten_cpp.argumenttype_type
             return_type_gen = aten_cpp.returns_type
+            arguments = sig.arguments()
         else:
             sig = ExecutorchCppSignature.from_native_function(f)
             argument_type_gen = et_cpp.argumenttype_type
             return_type_gen = et_cpp.returns_type
+            arguments = sig.arguments(include_context=False)
         # parse arguments into C++ code
         binding_list, code_list = Unboxing(
             argument_type_gen=argument_type_gen
-        ).convert_arguments(sig.arguments())
+        ).convert_arguments(arguments)
 
         # for each C++ argument, generate the conversion code
         code_connector = "\n\t"
@@ -180,11 +187,12 @@ def __call__(self, f: NativeFunction) -> str:
         return f"""
 Operator(
     "{f.namespace}::{f.func.name}",
-    [](EValue** stack) {{
+    []({contextArg.defn()}, EValue** stack) {{
+        {"(void)context;" if self.use_aten_lib else ""}
         {code_connector.join(code_list)}
 
         EXECUTORCH_SCOPE_PROF("native_call_{f.func.name}");
-        {ret_prefix}torch::executor::{f.namespace}::{sig.name()}({args_str});
+        {ret_prefix}torch::executor::{f.namespace}::{sig.name()}({"" if self.use_aten_lib else "context, "}{args_str});
 
         {return_assignment}
     }}
@@ -224,7 +232,12 @@ def compute_native_function_declaration(
     if metadata is None:
         return []
     prefix = "static" if backend_index.external else "TORCH_API"
-    return [f"{prefix} {sig.decl(name=metadata.kernel)};"]
+    # for kernels in lean mode, we declare two versions, one with context and one without.
+    # In the end we will cleanup the unused one.
+    return [
+        f"{prefix} {sig.decl(name=metadata.kernel)};",
+        f"{prefix} {sig.decl(name=metadata.kernel, include_context=False)};",
+    ]
 
 
 def gen_functions_declarations(
@@ -233,6 +246,7 @@ def gen_functions_declarations(
     static_dispatch_idx: List[BackendIndex],
     selector: SelectiveBuilder,
     use_aten_lib: bool,
+    custom_ops_native_functions: Optional[Sequence[NativeFunction]] = None,
 ) -> str:
     """
     Generates namespace separated C++ function API inline declaration/definitions.
@@ -260,6 +274,8 @@ def gen_functions_declarations(
                     static_dispatch_backend_indices=static_dispatch_idx,
                     selector=selector,
                     use_aten_lib=use_aten_lib,
+                    is_custom_op=lambda f: custom_ops_native_functions is not None
+                    and f in custom_ops_native_functions,
                 ),
                 ns_grouped_functions[namespace],
             )
@@ -275,19 +291,44 @@ def gen_functions_declarations(
 def gen_headers(
     *,
     native_functions: Sequence[NativeFunction],
+    gen_custom_ops_header: bool,
+    custom_ops_native_functions: Sequence[NativeFunction],
     static_dispatch_idx: List[BackendIndex],
     selector: SelectiveBuilder,
     backend_indices: Dict[DispatchKey, BackendIndex],
     cpu_fm: FileManager,
     use_aten_lib: bool,
 ) -> None:
+    """Generate headers.
+
+    Args:
+        native_functions (Sequence[NativeFunction]): a collection of NativeFunction for ATen ops.
+        gen_custom_ops_header (bool): whether we should generate CustomOpsNativeFunctions.h
+        custom_ops_native_functions (Sequence[NativeFunction]): a collection of NativeFunction for custom ops.
+        static_dispatch_idx (List[BackendIndex]): kernel collection
+        selector (SelectiveBuilder): for selective build
+        backend_indices (Dict[DispatchKey, BackendIndex]): kernel collection TODO (larryliu): merge with static_dispatch_idx
+        cpu_fm (FileManager): file manager manages output stream
+        use_aten_lib (bool): whether we are generating for PyTorch types or Executorch types.
+    """
+    aten_headers = ["#include <ATen/Functions.h>"]
+    if gen_custom_ops_header:
+        cpu_fm.write_with_template(
+            "CustomOpsNativeFunctions.h",
+            "NativeFunctions.h",
+            lambda: {
+                "nativeFunctions_declarations": get_native_function_declarations(
+                    grouped_native_functions=custom_ops_native_functions,
+                    backend_indices=backend_indices,
+                    native_function_decl_gen=dest.compute_native_function_declaration,
+                ),
+            },
+        )
+        aten_headers.append('#include "CustomOpsNativeFunctions.h"')
     cpu_fm.write(
         "Functions.h",
         lambda: {
-            "static_dispatch_extra_headers": [
-                '#include "CustomOpsNativeFunctions.h"',
-                "#include <ATen/Functions.h>",
-            ]
+            "static_dispatch_extra_headers": aten_headers
             if use_aten_lib
             else ['#include "NativeFunctions.h"'],
             "Functions_declarations": gen_functions_declarations(
@@ -295,6 +336,7 @@ def gen_headers(
                 static_dispatch_idx=static_dispatch_idx,
                 selector=selector,
                 use_aten_lib=use_aten_lib,
+                custom_ops_native_functions=custom_ops_native_functions,
             ),
         },
     )
@@ -332,17 +374,6 @@ def gen_custom_ops(
         backend_index=backend_index,
         rocm=rocm,
     )
-    cpu_fm.write_with_template(
-        "CustomOpsNativeFunctions.h",
-        "NativeFunctions.h",
-        lambda: {
-            "nativeFunctions_declarations": get_native_function_declarations(
-                grouped_native_functions=native_functions,
-                backend_indices=backend_indices,
-                native_function_decl_gen=dest.compute_native_function_declaration,
-            ),
-        },
-    )
     cpu_fm.write_with_template(
         f"Register{dispatch_key}CustomOps.cpp",
         "RegisterDispatchKeyCustomOps.cpp",
@@ -389,7 +420,7 @@ def gen_custom_ops(
 def translate_native_yaml(
     tags_yaml_path: str,
     aten_yaml_path: str,
-    native_yaml_path: str,
+    native_yaml_path: Optional[str],
     use_aten_lib: bool,
     out_file: TextIO,
 ) -> None:
@@ -442,9 +473,16 @@ def translate_native_yaml(
     schema_dict = {
         f"{f.namespace}::{f.func.name}": str(f.func) for f in aten_native_functions
     }
-
+    if (
+        not native_yaml_path
+        or not os.path.exists(native_yaml_path)
+        or os.stat(native_yaml_path).st_size == 0
+    ):
+        return
     with open(native_yaml_path, "r") as native_yaml:
         native_es = yaml.load(native_yaml, Loader=LineLoader)
+        if not native_es:
+            return
         for e in native_es:
             assert isinstance(e.get("__line__"), int), e
             loc = Location(native_yaml_path, e.pop("__line__"))
@@ -462,11 +500,67 @@ def translate_native_yaml(
         yaml.dump(native_es, out_file, width=1000)
 
 
+def convert_backend_indices(
+    bs: Dict[DispatchKey, Dict[OperatorName, BackendMetadata]]
+) -> Dict[DispatchKey, BackendIndex]:
+    indices: Dict[DispatchKey, BackendIndex] = defaultdict(
+        lambda: BackendIndex(
+            dispatch_key=DispatchKey.Undefined,
+            use_out_as_primary=True,
+            external=False,
+            device_guard=False,
+            index={},
+        )
+    )
+    for k, v in bs.items():
+        indices[k] = BackendIndex(
+            dispatch_key=k,
+            use_out_as_primary=True,
+            external=False,
+            # Only cuda-like devices in tree require device guards
+            device_guard=is_cuda_dispatch_key(k),
+            index=v,
+        )
+    return indices
+
+
+def parse_yaml(
+    path: Optional[str],
+    tags_yaml_path: str,
+    function_filter: Callable[[NativeFunction], bool],
+    skip_native_fns_gen: bool = False,
+) -> Tuple[
+    List[NativeFunction], Dict[DispatchKey, Dict[OperatorName, BackendMetadata]]
+]:
+    if path and os.path.exists(path) and os.stat(path).st_size > 0:
+        parsed_yaml = parse_native_yaml(
+            path,
+            tags_yaml_path,
+            None,
+            skip_native_fns_gen=skip_native_fns_gen,
+        )
+        native_functions = list(filter(function_filter, parsed_yaml.native_functions))
+        op_names = [f.func.name for f in native_functions]
+
+        def map_index(
+            m: Dict[OperatorName, BackendMetadata]
+        ) -> Dict[OperatorName, BackendMetadata]:
+            return {op: m[op] for op in m if op in op_names}
+
+        backend_indices = {
+            k: map_index(b.index) for (k, b) in parsed_yaml.backend_indices.items()
+        }
+        return native_functions, backend_indices
+    else:
+        return [], {}
+
+
 def parse_yaml_files(
     tags_yaml_path: str,
     aten_yaml_path: str,
     native_yaml_path: Optional[str],
     custom_ops_yaml_path: Optional[str],
+    selector: SelectiveBuilder,
     use_aten_lib: bool,
 ) -> Tuple[ParsedYaml, Optional[ParsedYaml]]:
     """Parses functions.yaml and custom_ops.yaml files.
@@ -481,6 +575,7 @@ def parse_yaml_files(
             file are appended to the yaml input to be parsed.
         custom_ops_yaml_path: Path to a custom_ops.yaml file to parse. If
             the path does not exist in the filesystem, it is ignored.
+        selector: For selective build.
         use_aten_lib: We use this flag to determine if we want to generate native
             functions. In ATen mode we should generate out= variants.
     Returns:
@@ -492,14 +587,11 @@ def parse_yaml_files(
     """
     import tempfile
 
-    gen_native_fns = use_aten_lib and native_yaml_path
+    # only include selected ops, this is because we want to avoid
+    def function_filter(f: NativeFunction) -> bool:
+        return selector.is_native_function_selected(f)
+
     with tempfile.TemporaryDirectory() as tmpdirname:
-        # If native_yaml_path doesn't exist, point to an empty file.
-        if not native_yaml_path or not os.path.exists(native_yaml_path):
-            native_yaml_path = os.path.join(tmpdirname, "functions.yaml")
-            with open(native_yaml_path, "w"):
-                pass
-        # Translate native_yaml_path to the same format of native_functions.yaml
         translated_yaml_path = os.path.join(tmpdirname, "translated.yaml")
         with open(translated_yaml_path, "w") as translated:
             translate_native_yaml(
@@ -509,31 +601,35 @@ def parse_yaml_files(
                 use_aten_lib,
                 translated,
             )
-        # If custom_ops_yaml_path doesn't exist, point to an empty file.
-        if not custom_ops_yaml_path or not os.path.exists(custom_ops_yaml_path):
-            custom_ops_yaml_path = os.path.join(tmpdirname, "custom_ops.yaml")
-            with open(custom_ops_yaml_path, "w"):
-                pass
-        combined_yaml_path = os.path.join(tmpdirname, "combined.yaml")
-        with open(combined_yaml_path, "w") as tmp, open(
-            translated_yaml_path, "r"
-        ) as native, open(custom_ops_yaml_path, "r") as custom:
-            for line in native.readlines():
-                tmp.write(line)
-            for line in custom.readlines():
-                tmp.write(line)
-        custom_ops_parsed_yaml = parse_native_yaml(
-            custom_ops_yaml_path, tags_yaml_path, None, skip_native_fns_gen=True
+        translated_functions, translated_backend_indices = parse_yaml(
+            translated_yaml_path, tags_yaml_path, function_filter, not use_aten_lib
         )
-
-        parsed_yaml = parse_native_yaml(
-            combined_yaml_path,
-            tags_yaml_path,
-            None,
-            skip_native_fns_gen=(not gen_native_fns),
+        custom_ops_functions, custom_ops_backend_indices = parse_yaml(
+            custom_ops_yaml_path, tags_yaml_path, function_filter, True
         )
 
-    return parsed_yaml, custom_ops_parsed_yaml
+        combined_functions = translated_functions + custom_ops_functions
+        combined_backend_indices: Dict[
+            DispatchKey, Dict[OperatorName, BackendMetadata]
+        ] = defaultdict(dict)
+        combined_backend_indices.update(translated_backend_indices)
+
+        for dk in custom_ops_backend_indices:
+            if dk not in combined_backend_indices:
+                combined_backend_indices.update({dk: custom_ops_backend_indices[dk]})
+            else:
+                combined_backend_indices[dk] = {
+                    **combined_backend_indices[dk],
+                    **custom_ops_backend_indices[dk],
+                }
+
+        combined_yaml = ParsedYaml(
+            combined_functions, convert_backend_indices(combined_backend_indices)
+        )
+        custom_ops_parsed_yaml = ParsedYaml(
+            custom_ops_functions, convert_backend_indices(custom_ops_backend_indices)
+        )
+    return combined_yaml, custom_ops_parsed_yaml
 
 
 def main() -> None:
@@ -547,24 +643,31 @@ def main() -> None:
         help="path to source directory for kernel templates",
     )
     parser.add_argument(
+        "--functions-yaml-path",
         "--functions_yaml_path",
         help="path to the functions.yaml file to use. Optional, but at least "
-        "one of --functions_yaml_path and --custom_ops_yaml_path must be "
+        "one of --functions-yaml-path and --custom-ops-yaml-path must be "
         "specified.",
     )
     parser.add_argument(
+        "--custom-ops-yaml-path",
         "--custom_ops_yaml_path",
         help="path to the custom_ops.yaml file to use. Optional, but at least "
-        "one of --functions_yaml_path and --custom_ops_yaml_path must be "
+        "one of --functions-yaml-path and --custom-ops-yaml-path must be "
         "specified.",
     )
     parser.add_argument(
+        "--aten-yaml-path",
         "--aten_yaml_path",
         help="path to native_functions.yaml file.",
     )
     # Note that make_file_manager() also looks at --install-dir.
     parser.add_argument(
-        "-d", "--install_dir", help="output directory", default="build/generated"
+        "-d",
+        "--install-dir",
+        "--install_dir",
+        help="output directory",
+        default="build/generated",
     )
     parser.add_argument(
         "-o",
@@ -579,11 +682,13 @@ def main() -> None:
         help="run without writing any files (still updates outputs)",
     )
     parser.add_argument(
+        "--static-dispatch-backend",
         "--static_dispatch_backend",
         nargs="*",
         help="generate static dispatch code for the specific backend (if set)",
     )
     parser.add_argument(
+        "--op-registration-whitelist",
         "--op_registration_whitelist",
         nargs="*",
         help="filter op registrations by the whitelist (if set); "
@@ -591,6 +696,7 @@ def main() -> None:
         "e.g.: aten::empty aten::conv2d ...",
     )
     parser.add_argument(
+        "--op-selection-yaml-path",
         "--op_selection_yaml_path",
         help="Provide a path to the operator selection (for custom build) YAML "
         "that contains the information about the set of selected operators "
@@ -608,6 +714,7 @@ def main() -> None:
         help="reinterpret CUDA as ROCm/HIP and adjust filepaths accordingly",
     )
     parser.add_argument(
+        "--use-aten-lib",
         "--use_aten_lib",
         action="store_true",
         help="a boolean flag to indicate whether we use ATen kernels or not, in the future this flag will be per "
@@ -623,11 +730,18 @@ def main() -> None:
     )
     options = parser.parse_args()
     assert options.tags_path, "tags.yaml is required by codegen yaml parsing."
+
+    selector = get_custom_build_selector(
+        options.op_registration_whitelist,
+        options.op_selection_yaml_path,
+    )
+
     parsed_yaml, custom_ops_parsed_yaml = parse_yaml_files(
         aten_yaml_path=options.aten_yaml_path,
         tags_yaml_path=options.tags_path,
         native_yaml_path=options.functions_yaml_path,
         custom_ops_yaml_path=options.custom_ops_yaml_path,
+        selector=selector,
         use_aten_lib=options.use_aten_lib,
     )
     native_functions, backend_indices = (
@@ -635,21 +749,19 @@ def main() -> None:
         parsed_yaml.backend_indices,
     )
     custom_ops_native_functions = (
-        custom_ops_parsed_yaml.native_functions if custom_ops_parsed_yaml else None
+        custom_ops_parsed_yaml.native_functions if custom_ops_parsed_yaml else []
     )
 
     cpu_fm = make_file_manager(options=options)
 
-    selector = get_custom_build_selector(
-        options.op_registration_whitelist,
-        options.op_selection_yaml_path,
-    )
-
     static_dispatch_idx: List[BackendIndex] = [backend_indices[DispatchKey.CPU]]
 
     if "headers" in options.generate:
+        # generate CustomOpsNativeFunctions.h when custom_ops.yaml is present, to match the build system.
         gen_headers(
             native_functions=native_functions,
+            gen_custom_ops_header=options.custom_ops_yaml_path,
+            custom_ops_native_functions=custom_ops_native_functions,
             static_dispatch_idx=static_dispatch_idx,
             selector=selector,
             backend_indices=backend_indices,
diff --git a/torchgen/gen_lazy_tensor.py b/torchgen/gen_lazy_tensor.py
index d7361ad7435c..90b057890715 100644
--- a/torchgen/gen_lazy_tensor.py
+++ b/torchgen/gen_lazy_tensor.py
@@ -210,53 +210,64 @@ def main() -> None:
     parser = argparse.ArgumentParser(description="Generate Lazy Tensor backend files")
     parser.add_argument(
         "-s",
+        "--source-yaml",
         "--source_yaml",
         help="path to source yaml file containing operator external definitions",
     )
-    parser.add_argument("-o", "--output_dir", help="output directory")
-    parser.add_argument("--dry_run", type=bool, default=False, help="output directory")
+    parser.add_argument("-o", "--output-dir", "--output_dir", help="output directory")
     parser.add_argument(
+        "--dry-run", "--dry_run", type=bool, default=False, help="output directory"
+    )
+    parser.add_argument(
+        "--impl-path",
         "--impl_path",
         type=str,
         default=None,
         help="path to the source C++ file containing kernel definitions",
     )
     parser.add_argument(
+        "--gen-ts-lowerings",
         "--gen_ts_lowerings",
         action="store_true",
         help="Generate TorchScript lowerings in addition to Lazy IR and NativeFunctions",
     )
     parser.add_argument(
+        "--node-base",
         "--node_base",
         type=str,
         default=default_args.node_base,
         help="Name of backend specific custom Lazy IR Node base class",
     )
     parser.add_argument(
+        "--node-base-hdr",
         "--node_base_hdr",
         type=str,
         default=default_args.node_base_hdr,
         help="Path to header file defining custom Lazy IR Node base class",
     )
     parser.add_argument(
+        "--shape-inference-hdr",
         "--shape_inference_hdr",
         type=str,
         default=default_args.shape_inference_hdr,
         help="Path to header file defining custom Lazy shape inference functions",
     )
     parser.add_argument(
+        "--tensor-class",
         "--tensor_class",
         type=str,
         default=default_args.tensor_class,
         help="Name of backend specific custom Lazy Tensor class",
     )
     parser.add_argument(
+        "--tensor-class-hdr",
         "--tensor_class_hdr",
         type=str,
         default=default_args.tensor_class_hdr,
         help="Path to header file defining custom Lazy Tensor class",
     )
     parser.add_argument(
+        "--backend-name",
         "--backend_name",
         type=str,
         default=default_args.backend_name,
diff --git a/torchgen/model.py b/torchgen/model.py
index 6e34f85b679f..75f2b0892322 100644
--- a/torchgen/model.py
+++ b/torchgen/model.py
@@ -638,6 +638,7 @@ def from_yaml(
         raw_dispatch = e.pop("dispatch", None)
         assert raw_dispatch is None or isinstance(raw_dispatch, dict), e
         dispatch: Dict[DispatchKey, BackendMetadata] = {}
+        num_dispatch_keys: int = 0
         if raw_dispatch is not None:
             assert not manual_kernel_registration, (
                 "cannot specify both manual_kernel_registration and dispatch; with "
@@ -650,16 +651,18 @@ def from_yaml(
                 assert isinstance(ks, str), e
                 for k in ks.split(","):
                     dispatch_key = DispatchKey.parse(k.strip())
+                    num_dispatch_keys += 1
+
                     if ignore_keys and dispatch_key in ignore_keys:
                         continue
                     assert dispatch_key in dispatch_keys, (
                         f"Dispatch key {dispatch_key} of kernel {v} "
                         "is not a supported dispatch key."
                     )
-                    # We only allow at most 2 levels of namespace for kernels.
+                    # We only allow at most 3 levels of namespace for kernels.
                     # We will append "native" to a custom kernel namespace.
                     namespace_helper = NamespaceHelper.from_namespaced_entity(
-                        v, max_level=2
+                        v, max_level=3
                     )
                     kernel_namespace = namespace_helper.get_cpp_namespace(default="at")
                     # Why is 'structured' included? External backends (e.g.
@@ -677,7 +680,12 @@ def from_yaml(
                     ):
                         redundant_composite_implicit_autograd = True
 
-            assert not (len(dispatch) == 1 and redundant_composite_implicit_autograd), (
+            # We count the number of dispatch keys which have not been ignored to prevent a dispatch table
+            # in which all backend keys are ignored but necessarily kept, remaining compositeimplicit,
+            # from being treated as redundant.
+            assert not (
+                num_dispatch_keys == 1 and redundant_composite_implicit_autograd
+            ), (
                 "unnecessary dispatch table for this function; just delete the dispatch "
                 "key entirely"
             )
@@ -687,6 +695,7 @@ def from_yaml(
                 structured_delegate
                 or dispatch.keys() != {DispatchKey.CompositeImplicitAutograd}
                 or dispatch[DispatchKey.CompositeImplicitAutograd].supports_symint()
+                or num_dispatch_keys != 1
             ), (
                 f"unexpected name for singleton CompositeImplicitAutograd dispatch entry: expected {cpp.name(func)} "
                 f"but got {dispatch[DispatchKey.CompositeImplicitAutograd]}.  Rename your implementation to the expected "
@@ -1058,7 +1067,7 @@ def __post_init__(self) -> None:
         for f in self.functions():
             expected_generated_fns.update(str(op) for op in f.autogen)
         expected_generated_fns_str = ", ".join(
-            str(x) for x in sorted(list(expected_generated_fns))
+            str(x) for x in sorted(expected_generated_fns)
         )
         if len(expected_generated_fns) == 0 and len(generated_fns) > 0:
             raise RuntimeError(
diff --git a/torchgen/native_function_generation.py b/torchgen/native_function_generation.py
index f1ba555be62e..ee8fc0312f87 100644
--- a/torchgen/native_function_generation.py
+++ b/torchgen/native_function_generation.py
@@ -319,13 +319,14 @@ def generate_function(
             )
         }
     }
+    tags = {"generated"} | set(f.tags & {"nondeterministic_seeded", "view_copy"})
 
     return (
         NativeFunction(
             func=func,
             use_const_ref_for_mutable_tensors=f.use_const_ref_for_mutable_tensors,
             # These generated fn's aren't meant to be user friendly- don't generate methods.
-            variants=set([Variant.function]),
+            variants={Variant.function},
             structured=False,
             structured_delegate=None,
             structured_inherits=None,
@@ -347,7 +348,7 @@ def generate_function(
             has_composite_explicit_autograd_non_functional_kernel=False,
             # Every generated NativeFunction gets a "generated" tag, so it's easy to tell
             # which NativeFunction objects did not come directly from native_functions.yaml.
-            tags=set(["generated"]) | (f.tags & {"nondeterministic_seeded"}),
+            tags=tags,
             namespace=f.namespace,
         ),
         backend_metadata,
diff --git a/torchgen/operator_versions/gen_mobile_upgraders.py b/torchgen/operator_versions/gen_mobile_upgraders.py
index 5006f4f6d89a..e5287cffc568 100644
--- a/torchgen/operator_versions/gen_mobile_upgraders.py
+++ b/torchgen/operator_versions/gen_mobile_upgraders.py
@@ -384,7 +384,7 @@ def main() -> None:
     for up in sorted_upgrader_list:
         print("after sort upgrader : ", next(iter(up)))
 
-    pytorch_dir = Path(__file__).resolve().parents[3]
+    pytorch_dir = Path(__file__).resolve().parents[2]
     upgrader_path = pytorch_dir / "torch" / "csrc" / "jit" / "mobile"
     write_cpp(str(upgrader_path), sorted_upgrader_list)
 
diff --git a/torchgen/selective_build/selector.py b/torchgen/selective_build/selector.py
index 32f0f9e219ca..03e638c179f5 100644
--- a/torchgen/selective_build/selector.py
+++ b/torchgen/selective_build/selector.py
@@ -231,7 +231,7 @@ def to_dict(self) -> Dict[str, object]:
             ret["debug_info"] = sorted(self._debug_info)
 
         ret["kernel_metadata"] = {
-            k: sorted(list(v)) for (k, v) in self.kernel_metadata.items()
+            k: sorted(v) for (k, v) in self.kernel_metadata.items()
         }
 
         ret["custom_classes"] = sorted(self.custom_classes)
diff --git a/torchgen/static_runtime/generator.py b/torchgen/static_runtime/generator.py
index f0645c8251be..a2e2938a7f38 100644
--- a/torchgen/static_runtime/generator.py
+++ b/torchgen/static_runtime/generator.py
@@ -98,7 +98,6 @@ def has_alias(
         "median",
         "nanmedian",
         "_sparse_sparse_matmul",
-        "_sparse_mask_helper",
         "batch_norm_backward_elemt",
         "_euclidean_dist",
         "pixel_shuffle",
@@ -170,7 +169,7 @@ def has_alias(
         "_test_warn_in_autograd",
         "_test_autograd_multiple_dispatch_view",
         "_test_autograd_multiple_dispatch_view_copy",
-        "segment_reduce",
+        "_segment_reduce",
         "_segment_reduce_backward",
         "_fw_primal_copy",
         "_make_dual_copy",
diff --git a/torchgen/utils.py b/torchgen/utils.py
index e9746e941c8d..bb72134247c8 100644
--- a/torchgen/utils.py
+++ b/torchgen/utils.py
@@ -25,7 +25,7 @@
     Union,
 )
 
-from typing_extensions import Literal
+from typing_extensions import Literal  # Python 3.8+
 
 from torchgen.code_template import CodeTemplate
 
@@ -62,8 +62,8 @@ def construct_mapping(self, node, deep=False):  # type: ignore[no-untyped-def]
 # code we want.
 #
 # This is an OPEN enum (we may add more cases to it in the future), so be sure
-# to explicitly specify with Union[Literal[Target.XXX]] what targets are valid
-# for your use.
+# to explicitly specify with Literal[Target.XXX] or Literal[Target.XXX, Target.YYY]
+# what targets are valid for your use.
 class Target(Enum):
     # top level namespace (not including at)
     DEFINITION = auto()